/* $NetBSD: secmodel_suser.c,v 1.58 2024/03/01 22:01:03 andvar Exp $ */
/*-
* Copyright (c) 2006 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This file contains kauth(9) listeners needed to implement the traditional
* NetBSD superuser access restrictions.
*
* There are two main resources a request can be issued to: user-owned and
 * system-owned. For the former, traditional Unix access checks are done, as
* well as superuser checks. If needed, the request context is examined before
* a decision is made. For the latter, usually only superuser checks are done
* as normal users are not allowed to access system resources.
*/
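/*
 * Illustrative sketch (not part of this file): kernel code does not call
 * these listeners directly.  It issues a request on a kauth(9) scope and
 * every listener registered on that scope, including the ones below, gets
 * to vote.  For example, the reboot path asks the system scope roughly
 * like this:
 *
 *	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_REBOOT,
 *	    0, NULL, NULL, NULL);
 *	if (error)
 *		return error;
 *
 * With this model loaded, secmodel_suser_system_cb() below answers
 * KAUTH_RESULT_ALLOW for the superuser and defers otherwise.
 */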
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: secmodel_suser.c,v 1.58 2024/03/01 22:01:03 andvar Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/mount.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <secmodel/secmodel.h>
#include <secmodel/suser/suser.h>
MODULE(MODULE_CLASS_SECMODEL, suser, NULL);
static kauth_listener_t l_generic, l_system, l_process, l_network, l_machdep,
l_device, l_vnode;
static secmodel_t suser_sm;
SYSCTL_SETUP(sysctl_security_suser_setup, "secmodel_suser sysctl")
{
const struct sysctlnode *rnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "models", NULL,
NULL, 0, NULL, 0,
CTL_SECURITY, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "suser", NULL,
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "name", NULL,
NULL, 0, __UNCONST(SECMODEL_SUSER_NAME), 0,
CTL_CREATE, CTL_EOL);
}
void
secmodel_suser_init(void)
{
}
void
secmodel_suser_start(void)
{
l_generic = kauth_listen_scope(KAUTH_SCOPE_GENERIC,
secmodel_suser_generic_cb, NULL);
l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
secmodel_suser_system_cb, NULL);
l_process = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
secmodel_suser_process_cb, NULL);
l_network = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
secmodel_suser_network_cb, NULL);
l_machdep = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
secmodel_suser_machdep_cb, NULL);
l_device = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
secmodel_suser_device_cb, NULL);
l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE,
secmodel_suser_vnode_cb, NULL);
}
void
secmodel_suser_stop(void)
{
kauth_unlisten_scope(l_generic);
kauth_unlisten_scope(l_system);
kauth_unlisten_scope(l_process);
kauth_unlisten_scope(l_network);
kauth_unlisten_scope(l_machdep);
kauth_unlisten_scope(l_device);
kauth_unlisten_scope(l_vnode);
}
static bool
suser_isroot(kauth_cred_t cred)
{
return kauth_cred_geteuid(cred) == 0;
}
static int
suser_eval(const char *what, void *arg, void *ret)
{
int error = 0;
if (strcasecmp(what, "is-root") == 0) {
kauth_cred_t cred = arg;
bool *bp = ret;
*bp = suser_isroot(cred);
} else {
error = ENOENT;
}
return error;
}
static int
suser_modcmd(modcmd_t cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MODULE_CMD_INIT:
error = secmodel_register(&suser_sm,
SECMODEL_SUSER_ID, SECMODEL_SUSER_NAME,
NULL, suser_eval, NULL);
if (error != 0)
printf("suser_modcmd::init: secmodel_register "
"returned %d\n", error);
secmodel_suser_init();
secmodel_suser_start();
break;
case MODULE_CMD_FINI:
secmodel_suser_stop();
error = secmodel_deregister(suser_sm);
if (error != 0)
printf("suser_modcmd::fini: secmodel_deregister "
"returned %d\n", error);
break;
case MODULE_CMD_AUTOUNLOAD:
error = EPERM;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Generic
* Responsibility: Superuser access
*/
int
secmodel_suser_generic_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_GENERIC_ISSUSER:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: System
* Responsibility: Superuser access
*/
int
secmodel_suser_system_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
enum kauth_system_req req;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
switch (action) {
case KAUTH_SYSTEM_CPU:
switch (req) {
case KAUTH_REQ_SYSTEM_CPU_SETSTATE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_DEVMAPPER:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_FS_QUOTA:
switch (req) {
case KAUTH_REQ_SYSTEM_FS_QUOTA_GET:
case KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF:
case KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE:
case KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SYSVIPC:
switch (req) {
case KAUTH_REQ_SYSTEM_SYSVIPC_BYPASS:
case KAUTH_REQ_SYSTEM_SYSVIPC_SHM_LOCK:
case KAUTH_REQ_SYSTEM_SYSVIPC_SHM_UNLOCK:
case KAUTH_REQ_SYSTEM_SYSVIPC_MSGQ_OVERSIZE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_MOUNT:
switch (req) {
case KAUTH_REQ_SYSTEM_MOUNT_DEVICE:
case KAUTH_REQ_SYSTEM_MOUNT_GET:
case KAUTH_REQ_SYSTEM_MOUNT_NEW:
case KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT:
case KAUTH_REQ_SYSTEM_MOUNT_UPDATE:
case KAUTH_REQ_SYSTEM_MOUNT_UMAP:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
default:
break;
}
break;
case KAUTH_SYSTEM_MQUEUE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_PSET:
switch (req) {
case KAUTH_REQ_SYSTEM_PSET_ASSIGN:
case KAUTH_REQ_SYSTEM_PSET_BIND:
case KAUTH_REQ_SYSTEM_PSET_CREATE:
case KAUTH_REQ_SYSTEM_PSET_DESTROY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_TIME:
switch (req) {
case KAUTH_REQ_SYSTEM_TIME_ADJTIME:
case KAUTH_REQ_SYSTEM_TIME_NTPADJTIME:
case KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS:
case KAUTH_REQ_SYSTEM_TIME_SYSTEM:
case KAUTH_REQ_SYSTEM_TIME_RTCOFFSET:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SEMAPHORE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_SYSCTL:
switch (req) {
case KAUTH_REQ_SYSTEM_SYSCTL_ADD:
case KAUTH_REQ_SYSTEM_SYSCTL_DELETE:
case KAUTH_REQ_SYSTEM_SYSCTL_DESC:
case KAUTH_REQ_SYSTEM_SYSCTL_MODIFY:
case KAUTH_REQ_SYSTEM_SYSCTL_PRVT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SWAPCTL:
case KAUTH_SYSTEM_ACCOUNTING:
case KAUTH_SYSTEM_REBOOT:
case KAUTH_SYSTEM_CHROOT:
case KAUTH_SYSTEM_FILEHANDLE:
case KAUTH_SYSTEM_MKNOD:
case KAUTH_SYSTEM_SETIDCORE:
case KAUTH_SYSTEM_MODULE:
case KAUTH_SYSTEM_FS_RESERVEDSPACE:
case KAUTH_SYSTEM_MAP_VA_ZERO:
case KAUTH_SYSTEM_FS_EXTATTR:
case KAUTH_SYSTEM_FS_SNAPSHOT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_DEBUG:
break;
case KAUTH_SYSTEM_CHSYSFLAGS:
/* Deprecated. */
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_VERIEXEC:
switch (req) {
case KAUTH_REQ_SYSTEM_VERIEXEC_ACCESS:
case KAUTH_REQ_SYSTEM_VERIEXEC_MODIFY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_LFS:
switch (req) {
case KAUTH_REQ_SYSTEM_LFS_MARKV:
case KAUTH_REQ_SYSTEM_LFS_BMAPV:
case KAUTH_REQ_SYSTEM_LFS_SEGCLEAN:
case KAUTH_REQ_SYSTEM_LFS_SEGWAIT:
case KAUTH_REQ_SYSTEM_LFS_FCNTL:
if (isroot)
				result = KAUTH_RESULT_ALLOW;
			break;
		default:
break;
}
break;
case KAUTH_SYSTEM_INTR:
switch (req) {
case KAUTH_REQ_SYSTEM_INTR_AFFINITY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_KERNADDR:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Process
* Responsibility: Superuser access
*/
int
secmodel_suser_process_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_PROCESS_SIGNAL:
case KAUTH_PROCESS_KTRACE:
case KAUTH_PROCESS_PROCFS:
case KAUTH_PROCESS_PTRACE:
case KAUTH_PROCESS_SCHEDULER_GETPARAM:
case KAUTH_PROCESS_SCHEDULER_SETPARAM:
case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
case KAUTH_PROCESS_SETID:
case KAUTH_PROCESS_KEVENT_FILTER:
case KAUTH_PROCESS_NICE:
case KAUTH_PROCESS_FORK:
case KAUTH_PROCESS_CORENAME:
case KAUTH_PROCESS_STOPFLAG:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_PROCESS_CANSEE: {
unsigned long req;
req = (unsigned long)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_CANSEE_ARGS:
case KAUTH_REQ_PROCESS_CANSEE_ENTRY:
case KAUTH_REQ_PROCESS_CANSEE_OPENFILES:
case KAUTH_REQ_PROCESS_CANSEE_EPROC:
case KAUTH_REQ_PROCESS_CANSEE_KPTR:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
case KAUTH_REQ_PROCESS_CANSEE_ENV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_PROCESS_RLIMIT: {
enum kauth_process_req req;
req = (enum kauth_process_req)(uintptr_t)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_RLIMIT_SET:
case KAUTH_REQ_PROCESS_RLIMIT_GET:
case KAUTH_REQ_PROCESS_RLIMIT_BYPASS:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Network
* Responsibility: Superuser access
*/
int
secmodel_suser_network_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
enum kauth_network_req req;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg0;
switch (action) {
case KAUTH_NETWORK_ALTQ:
switch (req) {
case KAUTH_REQ_NETWORK_ALTQ_AFMAP:
case KAUTH_REQ_NETWORK_ALTQ_BLUE:
case KAUTH_REQ_NETWORK_ALTQ_CBQ:
case KAUTH_REQ_NETWORK_ALTQ_CDNR:
case KAUTH_REQ_NETWORK_ALTQ_CONF:
case KAUTH_REQ_NETWORK_ALTQ_FIFOQ:
case KAUTH_REQ_NETWORK_ALTQ_HFSC:
case KAUTH_REQ_NETWORK_ALTQ_JOBS:
case KAUTH_REQ_NETWORK_ALTQ_PRIQ:
case KAUTH_REQ_NETWORK_ALTQ_RED:
case KAUTH_REQ_NETWORK_ALTQ_RIO:
case KAUTH_REQ_NETWORK_ALTQ_WFQ:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_BIND:
switch (req) {
case KAUTH_REQ_NETWORK_BIND_PORT:
case KAUTH_REQ_NETWORK_BIND_PRIVPORT:
case KAUTH_REQ_NETWORK_BIND_ANYADDR:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_FIREWALL:
switch (req) {
case KAUTH_REQ_NETWORK_FIREWALL_FW:
case KAUTH_REQ_NETWORK_FIREWALL_NAT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_FORWSRCRT:
case KAUTH_NETWORK_ROUTE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_NETWORK_INTERFACE:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_GET:
case KAUTH_REQ_NETWORK_INTERFACE_SET:
case KAUTH_REQ_NETWORK_INTERFACE_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_SETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_FIRMWARE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_BRIDGE:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_BRIDGE_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_BRIDGE_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_PPP:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_PPP_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_PVC:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_PVC_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_SLIP:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_SLIP_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_TUN:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_TUN_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_IPV6:
switch (req) {
case KAUTH_REQ_NETWORK_IPV6_HOPBYHOP:
case KAUTH_REQ_NETWORK_IPV6_JOIN_MULTICAST:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_NFS:
switch (req) {
case KAUTH_REQ_NETWORK_NFS_EXPORT:
case KAUTH_REQ_NETWORK_NFS_SVC:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_SMB:
switch (req) {
case KAUTH_REQ_NETWORK_SMB_SHARE_ACCESS:
case KAUTH_REQ_NETWORK_SMB_SHARE_CREATE:
case KAUTH_REQ_NETWORK_SMB_VC_ACCESS:
case KAUTH_REQ_NETWORK_SMB_VC_CREATE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_WG:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_WG_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_WG_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_SOCKET:
switch (req) {
case KAUTH_REQ_NETWORK_SOCKET_DROP:
case KAUTH_REQ_NETWORK_SOCKET_OPEN:
case KAUTH_REQ_NETWORK_SOCKET_RAWSOCK:
case KAUTH_REQ_NETWORK_SOCKET_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
default:
break;
}
break;
case KAUTH_NETWORK_IPSEC:
switch (req) {
case KAUTH_REQ_NETWORK_IPSEC_BYPASS:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Machdep
* Responsibility: Superuser access
*/
int
secmodel_suser_machdep_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_MACHDEP_CPU_UCODE_APPLY:
case KAUTH_MACHDEP_IOPERM_GET:
case KAUTH_MACHDEP_LDT_GET:
case KAUTH_MACHDEP_LDT_SET:
case KAUTH_MACHDEP_MTRR_GET:
case KAUTH_MACHDEP_CACHEFLUSH:
case KAUTH_MACHDEP_IOPERM_SET:
case KAUTH_MACHDEP_IOPL:
case KAUTH_MACHDEP_MTRR_SET:
case KAUTH_MACHDEP_NVRAM:
case KAUTH_MACHDEP_UNMANAGEDMEM:
case KAUTH_MACHDEP_PXG:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_MACHDEP_SVS_DISABLE:
/* Deprecated. */
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Device
* Responsibility: Superuser access
*/
int
secmodel_suser_device_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_DEVICE_BLUETOOTH_SETPRIV:
case KAUTH_DEVICE_BLUETOOTH_SEND:
case KAUTH_DEVICE_BLUETOOTH_RECV:
case KAUTH_DEVICE_TTY_OPEN:
case KAUTH_DEVICE_TTY_PRIVSET:
case KAUTH_DEVICE_TTY_STI:
case KAUTH_DEVICE_TTY_VIRTUAL:
case KAUTH_DEVICE_RND_ADDDATA:
case KAUTH_DEVICE_RND_ADDDATA_ESTIMATE:
case KAUTH_DEVICE_RND_GETPRIV:
case KAUTH_DEVICE_RND_SETPRIV:
case KAUTH_DEVICE_WSCONS_KEYBOARD_BELL:
case KAUTH_DEVICE_WSCONS_KEYBOARD_KEYREPEAT:
case KAUTH_DEVICE_NVMM_CTL:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_DEVICE_BLUETOOTH_BCSP:
case KAUTH_DEVICE_BLUETOOTH_BTUART: {
enum kauth_device_req req;
req = (enum kauth_device_req)(uintptr_t)arg0;
switch (req) {
case KAUTH_REQ_DEVICE_BLUETOOTH_BCSP_ADD:
case KAUTH_REQ_DEVICE_BLUETOOTH_BTUART_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_DEVICE_GPIO_PINSET:
/*
* root can access gpio pins, secmodel_securelevel can veto
* this decision.
*/
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
int
secmodel_suser_vnode_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
if (isroot) {
/* Superuser can execute only if the file's executable. */
if ((action & KAUTH_VNODE_EXECUTE) == 0 ||
(action & KAUTH_VNODE_IS_EXEC))
result = KAUTH_RESULT_ALLOW;
}
return (result);
}
/* $NetBSD: subr_pcq.c,v 1.20 2023/02/24 11:02:27 riastradh Exp $ */
/*-
* Copyright (c) 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Lockless producer/consumer queue.
*
* Summary of the producer algorithm in pcq_put (may run many in
* parallel with each other and with a consumer):
*
* P1. initialize an item
*
* P2. atomic_cas(&pcq->pcq_pc) loop to advance the producer
* pointer, reserving a space at c (fails if not enough space)
*
* P3. atomic_store_release(&pcq->pcq_items[c], item) to publish
* the item in the space it reserved
*
* Summary of the consumer algorithm in pcq_get (must be serialized by
* caller with other consumers, may run in parallel with any number of
* producers):
*
* C1. atomic_load_relaxed(&pcq->pcq_pc) to get the consumer
* pointer and a snapshot of the producer pointer, which may
* point to null items or point to initialized items (fails if
* no space reserved for published items yet)
*
* C2. atomic_load_consume(&pcq->pcq_items[c]) to get the next
* unconsumed but potentially published item (fails if item
* not published yet)
*
* C3. pcq->pcq_items[c] = NULL to consume the next unconsumed but
* published item
*
* C4. membar_producer
*
* C5. atomic_cas(&pcq->pcq_pc) loop to advance the consumer
* pointer
*
* C6. use the item
*
* Note that there is a weird bare membar_producer which is not matched
* by membar_consumer. This is one of the rare cases of a memory
* barrier on one side that is not matched by a memory barrier on
* another side, but the ordering works out, with a somewhat more
* involved proof.
*
* Some properties that need to be proved:
*
* Theorem 1. For pcq_put call that leads into pcq_get:
* Initializing item at P1 is dependency-ordered before usage of
* item at C6, so items placed by pcq_put can be safely used by
* the caller of pcq_get.
*
* Proof sketch.
*
* Assume load/store P2 synchronizes with load/store C1
* (if not, pcq_get fails in `if (p == c) return NULL').
*
* Assume store-release P3 synchronizes with load-consume
* C2 (if not, pcq_get fails in `if (item == NULL) return
* NULL').
*
* Then:
*
* - P1 is sequenced before store-release P3
* - store-release P3 synchronizes with load-consume C2
* - load-consume C2 is dependency-ordered before C6
*
* Hence transitively, P1 is dependency-ordered before C6,
* QED.
*
* Theorem 2. For pcq_get call followed by pcq_put: Nulling out
* location at store C3 happens before placing a new item in the
* same location at store P3, so items are not lost.
*
* Proof sketch.
*
* Assume load/store C5 synchronizes with load/store P2
 * (otherwise pcq_put starts over its CAS loop or fails).
*
* Then:
*
* - store C3 is sequenced before membar_producer C4
* - membar_producer C4 is sequenced before load/store C5
* - load/store C5 synchronizes with load/store P2 at &pcq->pcq_pc
* - P2 is sequenced before store-release P3
*
* Hence transitively, store C3 happens before
* store-release P3, QED.
*/
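/*
 * Illustrative usage sketch (assumed names, not part of this file): any
 * number of producers may call pcq_put() concurrently, but all pcq_get()
 * and pcq_peek() calls must be serialized by the caller, typically by
 * consuming from a single softint or thread.  Because pcq_get() may
 * return NULL while an item is still being published (step C2 above),
 * producers must also post a notification:
 *
 *	// producer, any context
 *	if (pcq_put(sc->sc_queue, item))
 *		softint_schedule(sc->sc_si);	// wake the lone consumer
 *	else
 *		drop(item);			// queue full
 *
 *	// consumer, inside the softint handler (naturally serialized)
 *	while ((item = pcq_get(sc->sc_queue)) != NULL)
 *		process(item);
 *
 * sc_queue, sc_si, drop() and process() are placeholders.
 */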
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pcq.c,v 1.20 2023/02/24 11:02:27 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/pcq.h>
/*
* Internal producer-consumer queue structure. Note: providing a separate
* cache-line both for pcq_t::pcq_pc and pcq_t::pcq_items.
*/
struct pcq {
u_int pcq_nitems;
uint8_t pcq_pad1[COHERENCY_UNIT - sizeof(u_int)];
volatile uint32_t pcq_pc;
uint8_t pcq_pad2[COHERENCY_UNIT - sizeof(uint32_t)];
void * volatile pcq_items[];
};
/*
* Producer (p) - stored in the lower 16 bits of pcq_t::pcq_pc.
* Consumer (c) - in the higher 16 bits.
*
* We have a limitation of 16 bits i.e. 0xffff items in the queue.
* The PCQ_MAXLEN constant is set accordingly.
*/
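/*
 * Worked example (illustrative): with pcq_pc == 0x00070003, pcq_split()
 * yields p == 3 and c == 7, and pcq_combine(3, 7) rebuilds 0x00070003.
 * Both indices wrap back to 0 in pcq_advance() once they reach
 * pcq_nitems.
 */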
static inline void
pcq_split(uint32_t v, u_int *p, u_int *c)
{
*p = v & 0xffff;
*c = v >> 16;
}
static inline uint32_t
pcq_combine(u_int p, u_int c)
{
return p | (c << 16);
}
static inline u_int
pcq_advance(pcq_t *pcq, u_int pc)
{
if (__predict_false(++pc == pcq->pcq_nitems)) {
return 0;
}
return pc;
}
/*
* pcq_put: place an item at the end of the queue.
*/
bool
pcq_put(pcq_t *pcq, void *item)
{
uint32_t v, nv;
u_int op, p, c;
KASSERT(item != NULL);
do {
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &op, &c);
p = pcq_advance(pcq, op);
if (p == c) {
/* Queue is full. */
return false;
}
nv = pcq_combine(p, c);
} while (atomic_cas_32(&pcq->pcq_pc, v, nv) != v);
/*
* Ensure that the update to pcq_pc is globally visible before the
* data item. See pcq_get(). This also ensures that any changes
* that the caller made to the data item are globally visible
* before we put it onto the list.
*/
atomic_store_release(&pcq->pcq_items[op], item);
/*
* Synchronization activity to wake up the consumer will ensure
* that the update to pcq_items[] is visible before the wakeup
* arrives. So, we do not need an additional memory barrier here.
*/
return true;
}
/*
* pcq_peek: return the next item from the queue without removal.
*/
void *
pcq_peek(pcq_t *pcq)
{
const uint32_t v = atomic_load_relaxed(&pcq->pcq_pc);
u_int p, c;
pcq_split(v, &p, &c);
/* See comment on race below in pcq_get(). */
return (p == c) ? NULL : atomic_load_consume(&pcq->pcq_items[c]);
}
/*
* pcq_get: remove and return the next item for consumption or NULL if empty.
*
* => The caller must prevent concurrent gets from occurring.
*/
void *
pcq_get(pcq_t *pcq)
{
uint32_t v, nv;
u_int p, c;
void *item;
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &p, &c);
if (p == c) {
/* Queue is empty: nothing to return. */
return NULL;
}
item = atomic_load_consume(&pcq->pcq_items[c]);
if (item == NULL) {
/*
* Raced with sender: we rely on a notification (e.g. softint
* or wakeup) being generated after the producer's pcq_put(),
* causing us to retry pcq_get() later.
*/
return NULL;
}
/*
* We have exclusive access to this slot, so no need for
* atomic_store_*.
*/
pcq->pcq_items[c] = NULL;
c = pcq_advance(pcq, c);
nv = pcq_combine(p, c);
/*
* Ensure that update to pcq_items[c] becomes globally visible
* before the update to pcq_pc. If it were reordered to occur
* after it, we could in theory wipe out a modification made
* to pcq_items[c] by pcq_put().
*
* No need for load-before-store ordering of membar_release
* because the only load we need to ensure happens first is the
* load of pcq->pcq_items[c], but that necessarily happens
* before the store to pcq->pcq_items[c] to null it out because
* it is at the same memory location. Yes, this is a bare
* membar_producer with no matching membar_consumer.
*/
membar_producer();
while (__predict_false(atomic_cas_32(&pcq->pcq_pc, v, nv) != v)) {
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &p, &c);
c = pcq_advance(pcq, c);
nv = pcq_combine(p, c);
}
return item;
}
pcq_t *
pcq_create(size_t nitems, km_flag_t kmflags)
{
pcq_t *pcq;
KASSERT(nitems > 0);
KASSERT(nitems <= PCQ_MAXLEN);
pcq = kmem_zalloc(offsetof(pcq_t, pcq_items[nitems]), kmflags);
if (pcq != NULL) {
pcq->pcq_nitems = nitems;
}
return pcq;
}
void
pcq_destroy(pcq_t *pcq)
{
kmem_free(pcq, offsetof(pcq_t, pcq_items[pcq->pcq_nitems]));
}
size_t
pcq_maxitems(pcq_t *pcq)
{
return pcq->pcq_nitems;
}
/* $NetBSD: syscall.c,v 1.22 2023/10/05 19:41:06 ad Exp $ */
/*-
* Copyright (c) 1998, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: syscall.c,v 1.22 2023/10/05 19:41:06 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/signal.h>
#include <sys/ktrace.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscall_stats.h>
#include <uvm/uvm_extern.h>
#include <machine/cpu.h>
#include <machine/psl.h>
#include <machine/userret.h>
#include "opt_dtrace.h"
#ifndef __x86_64__
int x86_copyargs(void *, void *, size_t);
#endif
void syscall_intern(struct proc *);
static void syscall(struct trapframe *);
void
md_child_return(struct lwp *l)
{
struct trapframe *tf = l->l_md.md_regs;
X86_TF_RAX(tf) = 0;
X86_TF_RFLAGS(tf) &= ~PSL_C;
userret(l);
}
/*
* Process the tail end of a posix_spawn() for the child.
*/
void
cpu_spawn_return(struct lwp *l)
{
userret(l);
}
/*
* syscall(frame):
* System call request from POSIX system call gate interface to kernel.
* Like trap(), argument is call by reference.
*/
#ifdef KDTRACE_HOOKS
void syscall(struct trapframe *);
#else
static
#endif
void
syscall(struct trapframe *frame)
{
const struct sysent *callp;
struct proc *p;
struct lwp *l;
int error;
register_t code, rval[2];
#ifdef __x86_64__
/* Verify that the syscall args will fit in the trapframe space */
CTASSERT(offsetof(struct trapframe, tf_arg9) >=
sizeof(register_t) * (2 + SYS_MAXSYSARGS - 1));
#define args (&frame->tf_rdi)
#else
register_t args[2 + SYS_MAXSYSARGS];
#endif
l = curlwp;
p = l->l_proc;
code = X86_TF_RAX(frame) & (SYS_NSYSENT - 1);
callp = p->p_emul->e_sysent + code;
SYSCALL_COUNT(syscall_counts, code);
SYSCALL_TIME_SYS_ENTRY(l, syscall_times, code);
#ifdef __x86_64__
/*
* The first 6 syscall args are passed in rdi, rsi, rdx, r10, r8 and r9
* (rcx gets copied to r10 in the libc stub because the syscall
* instruction overwrites %cx) and are together in the trap frame
* with space following for 4 more entries.
*/
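	/*
	 * Worked example (descriptive only): for a syscall taking eight
	 * register_t arguments, args 0..5 arrive in the registers saved
	 * in the trap frame above, while args 6 and 7 sit on the user
	 * stack just above the return address.  That is why the copyin
	 * below starts at tf_rsp + 1 and copies sy_argsize - 6 * 8 bytes
	 * into tf_arg6.
	 */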
if (__predict_false(callp->sy_argsize > 6 * 8)) {
error = copyin((register_t *)frame->tf_rsp + 1,
&frame->tf_arg6, callp->sy_argsize - 6 * 8);
if (error != 0)
goto bad;
}
#else
if (callp->sy_argsize) {
error = x86_copyargs((char *)frame->tf_esp + sizeof(int), args,
callp->sy_argsize);
if (__predict_false(error != 0))
goto bad;
}
#endif
	error = sy_invoke(callp, l, args, rval, code);

	if (__predict_true(error == 0)) {
		X86_TF_RAX(frame) = rval[0];
X86_TF_RDX(frame) = rval[1];
X86_TF_RFLAGS(frame) &= ~PSL_C; /* carry bit */
} else {
switch (error) {
case ERESTART:
/*
* The offset to adjust the PC by depends on whether we
* entered the kernel through the trap or call gate.
* We saved the instruction size in tf_err on entry.
*/
X86_TF_RIP(frame) -= frame->tf_err;
break;
case EJUSTRETURN:
/* nothing to do */
break;
default:
bad:
X86_TF_RAX(frame) = error;
X86_TF_RFLAGS(frame) |= PSL_C; /* carry bit */
break;
}
}
SYSCALL_TIME_SYS_EXIT(l);
userret(l);
}
void
syscall_intern(struct proc *p)
{
p->p_md.md_syscall = syscall;
}
/* $NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $ */
/* $KAME: in6_pcb.c,v 1.84 2001/02/08 18:02:08 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/domain.h>
#include <sys/once.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet/portalgo.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/scope6_var.h>
#include "faith.h"
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet/tcp_vtw.h>
const struct in6_addr zeroin6_addr;
#define IN6PCBHASH_PORT(table, lport) \
&(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash]
#define IN6PCBHASH_BIND(table, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
(((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \
(laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + ntohs(lport)) & \
(table)->inpt_bindhash]
#define IN6PCBHASH_CONNECT(table, faddr, fport, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
((((faddr)->s6_addr32[0] ^ (faddr)->s6_addr32[1] ^ \
(faddr)->s6_addr32[2] ^ (faddr)->s6_addr32[3]) + ntohs(fport)) + \
(((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \
(laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + \
ntohs(lport))) & (table)->inpt_bindhash]
int ip6_anonportmin = IPV6PORT_ANONMIN;
int ip6_anonportmax = IPV6PORT_ANONMAX;
int ip6_lowportmin = IPV6PORT_RESERVEDMIN;
int ip6_lowportmax = IPV6PORT_RESERVEDMAX;
void
in6pcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize)
{
inpcb_init(table, bindhashsize, connecthashsize);
table->inpt_lastport = (in_port_t)ip6_anonportmax;
}
/*
* Bind address from sin6 to inp.
*/
static int
in6pcb_bind_addr(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l)
{
int error;
int s;
/*
* We should check the family, but old programs
* incorrectly fail to initialize it.
*/
if (sin6->sin6_family != AF_INET6)
return EAFNOSUPPORT;
#ifndef INET
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
return EADDRNOTAVAIL;
#endif
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
return error;
s = pserialize_read_enter();
	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
error = EINVAL;
goto out;
}
if (sin6->sin6_addr.s6_addr32[3]) {
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
bcopy(&sin6->sin6_addr.s6_addr32[3],
&sin.sin_addr, sizeof(sin.sin_addr));
if (!IN_MULTICAST(sin.sin_addr.s_addr)) {
struct ifaddr *ifa;
ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
if (ifa == NULL &&
(inp->inp_flags & IN6P_BINDANY) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
}
}
} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
// succeed
} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct ifaddr *ifa = NULL;
if ((inp->inp_flags & IN6P_FAITH) == 0) {
ifa = ifa_ifwithaddr(sin6tosa(sin6));
if (ifa == NULL &&
(inp->inp_flags & IN6P_BINDANY) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
}
/*
 * Binding to an anycast address might accidentally
 * cause a packet to be sent with an anycast source
 * address, so we forbid it.
 *
 * We should allow binding to a deprecated address,
 * since the application dares to use it.
 * But can we assume that applications are careful enough
 * to check whether the address is deprecated or not?
 * Maybe, as a safeguard, we should have a setsockopt
 * flag to control the bind(2) behavior against
 * deprecated addresses (default: forbid bind(2)).
*/
if (ifa &&
ifatoia6(ifa)->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) {
error = EADDRNOTAVAIL;
goto out;
}
}
in6p_laddr(inp) = sin6->sin6_addr;
error = 0;
out:
pserialize_read_exit(s);
return error;
}
/*
* Bind port from sin6 to inp.
*/
static int
in6pcb_bind_port(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
int error;
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
	    (so->so_options & SO_ACCEPTCONN) == 0))
wild = 1;
if (sin6->sin6_port != 0) {
enum kauth_network_req req;
#ifndef IPNOPRIVPORTS
if (ntohs(sin6->sin6_port) < IPV6PORT_RESERVED)
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
else
#endif /* IPNOPRIVPORTS */
req = KAUTH_REQ_NETWORK_BIND_PORT;
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND,
req, so, sin6, NULL);
if (error)
return EACCES;
}
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 * allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT))
reuseport = SO_REUSEADDR|SO_REUSEPORT;
}
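	/*
	 * Illustrative consequence (sketch, userland view): with the
	 * widening above, two sockets that each set SO_REUSEADDR, e.g.
	 *
	 *	int on = 1;
	 *	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
	 *
	 * may both bind(2) the same multicast group and port, which the
	 * conflict checks below would otherwise reject with EADDRINUSE.
	 */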
	if (sin6->sin6_port != 0) {
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
#ifdef INET
struct inpcb *t;
struct vestigial_inpcb vestige;
t = inpcb_lookup_local(table,
*(struct in_addr *)&sin6->sin6_addr.s6_addr32[3],
sin6->sin6_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
#else
return EADDRNOTAVAIL;
#endif
}
{
struct inpcb *t;
struct vestigial_inpcb vestige;
t = in6pcb_lookup_local(table, &sin6->sin6_addr,
sin6->sin6_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
}
}
if (sin6->sin6_port == 0) {
int e;
e = in6pcb_set_port(sin6, inp, l);
if (e != 0)
return e;
} else {
inp->inp_lport = sin6->sin6_port;
inpcb_set_state(inp, INP_BOUND);
}
	LIST_REMOVE(inp, inp_lhash);
	LIST_INSERT_HEAD(IN6PCBHASH_PORT(table, inp->inp_lport),
inp, inp_lhash);
return 0;
}
int
in6pcb_bind(void *v, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcb *inp = v;
struct sockaddr_in6 lsin6;
int error;
if (inp->inp_af != AF_INET6)
return EINVAL;
/*
 * If we already have a local port or a local address, it means we're
 * already bound.
 */
	if (inp->inp_lport || !(IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ||
	    (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
	    in6p_laddr(inp).s6_addr32[3] == 0)))
return EINVAL;
if (NULL != sin6) {
/* We were provided a sockaddr_in6 to use. */
if (sin6->sin6_len != sizeof(*sin6))
return EINVAL;
} else {
/* We always bind to *something*, even if it's "anything". */
lsin6 = *((const struct sockaddr_in6 *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
sin6 = &lsin6;
}
/* Bind address. */
error = in6pcb_bind_addr(inp, sin6, l);
if (error)
return error;
/* Bind port. */
error = in6pcb_bind_port(inp, sin6, l);
if (error) {
/*
* Reset the address here to "any" so we don't "leak" the
* inpcb.
*/
in6p_laddr(inp) = in6addr_any;
return error;
}
#if 0
in6p_flowinfo(inp) = 0; /* XXX */
#endif
return 0;
}
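/*
 * Illustrative caller sketch (assumption, simplified): a transport
 * protocol's bind hook typically hands the user-supplied name straight
 * to in6pcb_bind() with the socket already locked by its caller:
 *
 *	struct inpcb *inp = sotoinpcb(so);
 *
 *	error = in6pcb_bind(inp, (struct sockaddr_in6 *)nam, l);
 *
 * Passing a NULL sockaddr binds to the wildcard address and an
 * anonymous port, as the late-bind path above does.
 */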
/*
* Connect from a socket to a specified address.
* Both address and port must be specified in argument sin6.
 * If we don't have a local address for this socket yet,
* then pick one.
*/
int
in6pcb_connect(void *v, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcb *inp = v;
struct in6_addr *in6a = NULL;
struct in6_addr ia6;
struct ifnet *ifp = NULL; /* outgoing interface */
int error = 0;
int scope_ambiguous = 0;
#ifdef INET
struct in6_addr mapped;
#endif
struct sockaddr_in6 tmp;
struct vestigial_inpcb vestige;
struct psref psref;
int bound;
(void)&in6a; /* XXX fool gcc */
if (inp->inp_af != AF_INET6)
return EINVAL;
if (sin6->sin6_len != sizeof(*sin6))
return EINVAL;
if (sin6->sin6_family != AF_INET6)
return EAFNOSUPPORT;
if (sin6->sin6_port == 0)
return EADDRNOTAVAIL;
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) &&
inp->inp_socket->so_type == SOCK_STREAM)
return EADDRNOTAVAIL;
if (sin6->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
return error;
/* sanity check for mapped address case */
	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
			return EINVAL;
		if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)))
			in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
		if (!IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)))
			return EINVAL;
} else
{
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)))
return EINVAL;
}
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
bound = curlwp_bind();
/* Source address selection. */
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
in6p_laddr(inp).s6_addr32[3] == 0) {
#ifdef INET
struct sockaddr_in sin;
struct in_ifaddr *ia4;
struct psref _psref;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
memcpy(&sin.sin_addr, &sin6->sin6_addr.s6_addr32[3],
sizeof(sin.sin_addr));
ia4 = in_selectsrc(&sin, &inp->inp_route,
inp->inp_socket->so_options, NULL, &error, &_psref);
if (ia4 == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
curlwp_bindx(bound);
return error;
}
memset(&mapped, 0, sizeof(mapped));
mapped.s6_addr16[5] = htons(0xffff);
memcpy(&mapped.s6_addr32[3], &IA_SIN(ia4)->sin_addr,
sizeof(IA_SIN(ia4)->sin_addr));
ia4_release(ia4, &_psref);
in6a = &mapped;
#else
curlwp_bindx(bound);
return EADDRNOTAVAIL;
#endif
} else {
/*
* XXX: in6_selectsrc might replace the bound local address
* with the address specified by setsockopt(IPV6_PKTINFO).
* Is it the intended behavior?
*/
error = in6_selectsrc(sin6, in6p_outputopts(inp),
in6p_moptions(inp), &inp->inp_route, &in6p_laddr(inp),
&ifp, &psref, &ia6);
if (error == 0)
in6a = &ia6;
if (ifp && scope_ambiguous &&
(error = in6_setscope(&sin6->sin6_addr, ifp, NULL)) != 0) {
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
if (in6a == NULL) {
if_put(ifp, &psref);
			curlwp_bindx(bound);
			if (error == 0)
error = EADDRNOTAVAIL;
return error;
}
}
if (ifp != NULL) {
in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim(inp, ifp);
if_put(ifp, &psref);
} else
		in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim_rt(inp);

	curlwp_bindx(bound);

	if (in6pcb_lookup(inp->inp_table, &sin6->sin6_addr,
sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ? in6a : &in6p_laddr(inp),
inp->inp_lport, 0, &vestige)
|| vestige.valid)
return EADDRINUSE;
	if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ||
	    (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
in6p_laddr(inp).s6_addr32[3] == 0))
{
if (inp->inp_lport == 0) {
error = in6pcb_bind(inp, NULL, l);
if (error != 0)
return error;
}
in6p_laddr(inp) = *in6a;
}
in6p_faddr(inp) = sin6->sin6_addr;
inp->inp_fport = sin6->sin6_port;
/* Late bind, if needed */
if (inp->inp_bindportonsend) {
struct sockaddr_in6 lsin = *((const struct sockaddr_in6 *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin.sin6_addr = in6p_laddr(inp);
lsin.sin6_port = 0;
if ((error = in6pcb_bind_port(inp, &lsin, l)) != 0)
return error;
}
inpcb_set_state(inp, INP_CONNECTED);
in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK;
if (ip6_auto_flowlabel)
in6p_flowinfo(inp) |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
#if defined(IPSEC)
	if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM)
		ipsec_pcbconn(inp->inp_sp);
#endif
return 0;
}
void
in6pcb_disconnect(struct inpcb *inp)
{
memset((void *)&in6p_faddr(inp), 0, sizeof(in6p_faddr(inp)));
inp->inp_fport = 0;
inpcb_set_state(inp, INP_BOUND);
in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK;
#if defined(IPSEC)
	if (ipsec_enabled)
		ipsec_pcbdisconn(inp->inp_sp);
#endif
	if (inp->inp_socket->so_state & SS_NOFDREF)
		inpcb_destroy(inp);
}
void
in6pcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in6 *sin6)
{
if (inp->inp_af != AF_INET6)
return;
sockaddr_in6_init(sin6, &in6p_laddr(inp), inp->inp_lport, 0, 0);
(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
}
void
in6pcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in6 *sin6)
{
if (inp->inp_af != AF_INET6)
return;
sockaddr_in6_init(sin6, &in6p_faddr(inp), inp->inp_fport, 0, 0);
(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
}
/*
* Pass some notification to all connections of a protocol
* associated with address dst. The local address and/or port numbers
* may be specified to limit the search. The "usual action" will be
* taken, depending on the ctlinput cmd. The caller must filter any
* cmds that are uninteresting (e.g., no error in the map).
* Call the protocol specific routine (if any) to report
* any errors for each matching socket.
*
* Must be called at splsoftnet.
*
* Note: src (4th arg) carries the flowlabel value on the original IPv6
* header, in sin6_flowinfo member.
*/
int
in6pcb_notify(struct inpcbtable *table, const struct sockaddr *dst,
u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd,
void *cmdarg, void (*notify)(struct inpcb *, int))
{
struct inpcb *inp;
struct sockaddr_in6 sa6_src;
const struct sockaddr_in6 *sa6_dst;
in_port_t fport = fport_arg, lport = lport_arg;
int errno;
int nmatch = 0;
u_int32_t flowinfo;
if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6)
return 0;
sa6_dst = (const struct sockaddr_in6 *)dst;
if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr))
return 0;
/*
 * note that src can be NULL when we get notified by local fragmentation.
*/
sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src;
flowinfo = sa6_src.sin6_flowinfo;
/*
* Redirects go to all references to the destination,
* and use in6pcb_rtchange to invalidate the route cache.
* Dead host indications: also use in6pcb_rtchange to invalidate
* the cache, and deliver the error to all the sockets.
* Otherwise, if we have knowledge of the local port and address,
* deliver only to that socket.
*/
if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) {
fport = 0;
lport = 0;
memset((void *)&sa6_src.sin6_addr, 0, sizeof(sa6_src.sin6_addr));
if (cmd != PRC_HOSTDEAD)
notify = in6pcb_rtchange;
}
errno = inet6ctlerrmap[cmd];
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
struct rtentry *rt = NULL;
if (inp->inp_af != AF_INET6)
continue;
/*
* Under the following condition, notify of redirects
* to the pcb, without making address matches against inpcb.
 * - a redirect notification has arrived.
* - the inpcb is unconnected.
* - the inpcb is caching !RTF_HOST routing entry.
* - the ICMPv6 notification is from the gateway cached in the
* inpcb. i.e. ICMPv6 notification is from nexthop gateway
* the inpcb used very recently.
*
* This is to improve interaction between netbsd/openbsd
* redirect handling code, and inpcb route cache code.
* without the clause, !RTF_HOST routing entry (which carries
* gateway used by inpcb right before the ICMPv6 redirect)
* will be cached forever in unconnected inpcb.
*
 * There still is a question regarding what is TRT:
* - On bsdi/freebsd, RTF_HOST (cloned) routing entry will be
* generated on packet output. inpcb will always cache
* RTF_HOST routing entry so there's no need for the clause
* (ICMPv6 redirect will update RTF_HOST routing entry,
* and inpcb is caching it already).
* However, bsdi/freebsd are vulnerable to local DoS attacks
* due to the cloned routing entries.
* - Specwise, "destination cache" is mentioned in RFC2461.
* Jinmei says that it implies bsdi/freebsd behavior, itojun
* is not really convinced.
* - Having hiwat/lowat on # of cloned host route (redirect/
* pmtud) may be a good idea. netbsd/openbsd has it. see
* icmp6_mtudisc_update().
*/
if ((PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) &&
IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
(rt = rtcache_validate(&inp->inp_route)) != NULL &&
!(rt->rt_flags & RTF_HOST)) {
const struct sockaddr_in6 *dst6;
dst6 = (const struct sockaddr_in6 *)
rtcache_getdst(&inp->inp_route);
if (dst6 == NULL)
;
else if (IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr,
&sa6_dst->sin6_addr)) {
rtcache_unref(rt, &inp->inp_route);
goto do_notify;
}
}
rtcache_unref(rt, &inp->inp_route);
/*
* If the error designates a new path MTU for a destination
* and the application (associated with this socket) wanted to
* know the value, notify. Note that we notify for all
* disconnected sockets if the corresponding application
 * wanted it.  This is because some UDP applications keep their
 * sending sockets disconnected.
 * XXX: should we avoid notifying the value to TCP sockets?
*/
if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 &&
(IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) ||
IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &sa6_dst->sin6_addr))) {
ip6_notify_pmtu(inp, (const struct sockaddr_in6 *)dst,
(u_int32_t *)cmdarg);
}
/*
* Detect if we should notify the error. If no source and
* destination ports are specified, but non-zero flowinfo and
* local address match, notify the error. This is the case
* when the error is delivered with an encrypted buffer
* by ESP. Otherwise, just compare addresses and ports
* as usual.
*/
if (lport == 0 && fport == 0 && flowinfo &&
inp->inp_socket != NULL &&
flowinfo == (in6p_flowinfo(inp) & IPV6_FLOWLABEL_MASK) &&
IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &sa6_src.sin6_addr))
goto do_notify;
else if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp),
&sa6_dst->sin6_addr) ||
inp->inp_socket == NULL ||
(lport && inp->inp_lport != lport) ||
(!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
&sa6_src.sin6_addr)) ||
(fport && inp->inp_fport != fport))
continue;
do_notify:
if (notify)
(*notify)(inp, errno);
nmatch++;
}
return nmatch;
}
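/*
 * Illustrative caller sketch (assumed names): a transport ctlinput
 * routine reacting to an ICMPv6 error typically funnels it through
 * in6pcb_notify(), roughly:
 *
 *	nmatch = in6pcb_notify(table, dst, fport, src, lport, cmd,
 *	    cmdarg, notifyfn);
 *
 * where dst/src are the sockaddr pair recovered from the offending
 * packet, cmd is the PRC_* code, and notifyfn reports the error to each
 * matching socket (overridden with in6pcb_rtchange for redirects, as
 * arranged above).
 */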
void
in6pcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp)
{
struct inpcb *inp;
struct ip6_moptions *im6o;
struct in6_multi_mship *imm, *nimm;
KASSERT(ifp != NULL);
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
bool need_unlock = false;
if (inp->inp_af != AF_INET6)
continue;
/* The caller holds either one of inps' lock */
if (!inp_locked(inp)) {
inp_lock(inp);
need_unlock = true;
}
im6o = in6p_moptions(inp);
if (im6o) {
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (im6o->im6o_multicast_if_index == ifp->if_index)
im6o->im6o_multicast_if_index = 0;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
* XXX controversial - is it really legal for kernel
* to force this?
*/
LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships,
i6mm_chain, nimm) {
if (imm->i6mm_maddr->in6m_ifp == ifp) {
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
}
}
in_purgeifmcast(inp->inp_moptions, ifp);
if (need_unlock)
inp_unlock(inp);
}
}
void
in6pcb_purgeif(struct inpcbtable *table, struct ifnet *ifp)
{
struct rtentry *rt;
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if ((rt = rtcache_validate(&inp->inp_route)) != NULL &&
rt->rt_ifp == ifp) {
rtcache_unref(rt, &inp->inp_route);
in6pcb_rtchange(inp, 0);
} else
rtcache_unref(rt, &inp->inp_route);
}
}
/*
* After a routing change, flush old routing. A new route can be
* allocated the next time output is attempted.
*/
void
in6pcb_rtchange(struct inpcb *inp, int errno)
{
if (inp->inp_af != AF_INET6)
return;
rtcache_free(&inp->inp_route);
/*
* A new route can be allocated the next time
* output is attempted.
*/
}
struct inpcb *
in6pcb_lookup_local(struct inpcbtable *table, struct in6_addr *laddr6,
u_int lport_arg, int lookup_wildcard, struct vestigial_inpcb *vp)
{
struct inpcbhead *head;
struct inpcb *inp, *match = NULL;
int matchwild = 3, wildcard;
in_port_t lport = lport_arg;
	if (vp)
		vp->valid = 0;
head = IN6PCBHASH_PORT(table, lport);
	LIST_FOREACH(inp, head, inp_lhash) {
		if (inp->inp_af != AF_INET6)
continue;
if (inp->inp_lport != lport)
continue;
wildcard = 0;
		if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
wildcard++;
		if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) {
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (!IN6_IS_ADDR_V4MAPPED(laddr6))
continue;
/* duplicate of IPv4 logic */
wildcard = 0;
if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)) && in6p_faddr(inp).s6_addr32[3])
wildcard++;
if (!in6p_laddr(inp).s6_addr32[3]) {
if (laddr6->s6_addr32[3])
wildcard++;
} else {
if (!laddr6->s6_addr32[3])
wildcard++;
else {
if (in6p_laddr(inp).s6_addr32[3] !=
laddr6->s6_addr32[3])
continue;
}
}
		} else if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
			if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(laddr6))
wildcard++;
} else {
			if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
			if (IN6_IS_ADDR_UNSPECIFIED(laddr6))
				wildcard++;
else {
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
laddr6))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (match && matchwild == 0)
return match;
if (vp && table->vestige && table->vestige->init_ports6) {
struct vestigial_inpcb better;
bool has_better = false;
void *state;
state = (*table->vestige->init_ports6)(laddr6,
lport_arg,
lookup_wildcard);
		while (table->vestige &&
		    (*table->vestige->next_port6)(state, vp)) {
			if (vp->lport != lport)
continue;
wildcard = 0;
if (!IN6_IS_ADDR_UNSPECIFIED(&vp->faddr.v6))
wildcard++;
			if (IN6_IS_ADDR_UNSPECIFIED(&vp->laddr.v6)) {
				if (!IN6_IS_ADDR_UNSPECIFIED(laddr6))
wildcard++;
} else {
				if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
					if (vp->v6only)
continue;
}
				if (IN6_IS_ADDR_UNSPECIFIED(laddr6))
					wildcard++;
else {
if (!IN6_ARE_ADDR_EQUAL(&vp->laddr.v6, laddr6))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
better = *vp;
has_better = true;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (has_better) {
*vp = better;
return 0;
}
}
return match;
}
/*
* WARNING: return value (rtentry) could be IPv4 one if inpcb is connected to
* IPv4 mapped address.
*/
struct rtentry *
in6pcb_rtentry(struct inpcb *inp)
{
struct rtentry *rt;
struct route *ro;
union {
const struct sockaddr *sa;
const struct sockaddr_in6 *sa6;
#ifdef INET
const struct sockaddr_in *sa4;
#endif
} cdst;
ro = &inp->inp_route;
if (inp->inp_af != AF_INET6)
return NULL;
	cdst.sa = rtcache_getdst(ro);
	if (cdst.sa == NULL)
;
#ifdef INET
else if (cdst.sa->sa_family == AF_INET) {
		KASSERT(IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)));
		if (cdst.sa4->sin_addr.s_addr != in6p_faddr(inp).s6_addr32[3])
rtcache_free(ro);
}
#endif
else {
if (!IN6_ARE_ADDR_EQUAL(&cdst.sa6->sin6_addr,
&in6p_faddr(inp)))
rtcache_free(ro);
}
if ((rt = rtcache_validate(ro)) == NULL)
rt = rtcache_update(ro, 1);
#ifdef INET
if (rt == NULL && IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
struct in_addr addr;
addr.s_addr = in6p_faddr(inp).s6_addr32[3];
sockaddr_in_init(&u.dst4, &addr, 0);
if (rtcache_setdst(ro, &u.dst) != 0)
return NULL;
rt = rtcache_init(ro);
} else
#endif
if (rt == NULL && !IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0);
if (rtcache_setdst(ro, &u.dst) != 0)
return NULL;
rt = rtcache_init(ro);
}
return rt;
}
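/*
* Release the rtentry reference taken by in6pcb_rtentry() against the
* PCB's cached route.
*/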
void
in6pcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp)
{
rtcache_unref(rt, &inp->inp_route);
}
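/*
* Look up a connected PCB by an exact match on foreign and local address
* and port. If no live PCB matches and a vestigial_inpcb was supplied,
* the table's vestigial entries are consulted and a hit is reported
* through *vp.
*/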
struct inpcb *
in6pcb_lookup(struct inpcbtable *table, const struct in6_addr *faddr6,
u_int fport_arg, const struct in6_addr *laddr6, u_int lport_arg,
int faith,
struct vestigial_inpcb *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
if (vp)
vp->valid = 0;
head = IN6PCBHASH_CONNECT(table, faddr6, fport, laddr6, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
/* find exact match on both source and dest */
if (inp->inp_fport != fport)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
continue;
if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), faddr6))
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)))
continue;
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6))
continue;
if ((IN6_IS_ADDR_V4MAPPED(laddr6) || IN6_IS_ADDR_V4MAPPED(faddr6)) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
return inp;
}
if (vp && table->vestige) {
if ((*table->vestige->lookup6)(faddr6, fport_arg,
laddr6, lport_arg, vp))
return NULL;
}
return NULL;
}
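/*
* Look up a PCB bound to the given local address and port: first an exact
* address match, then (for an IPv4-mapped address) the mapped wildcard,
* and finally the unspecified address. A match is moved to the head of
* its hash chain to speed up subsequent lookups.
*/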
struct inpcb *
in6pcb_lookup_bound(struct inpcbtable *table, const struct in6_addr *laddr6,
u_int lport_arg, int faith)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t lport = lport_arg;
#ifdef INET
struct in6_addr zero_mapped;
#endif
head = IN6PCBHASH_BIND(table, laddr6, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_V4MAPPED(laddr6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6))
goto out;
}
#ifdef INET
if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
memset(&zero_mapped, 0, sizeof(zero_mapped));
zero_mapped.s6_addr16[5] = 0xffff;
head = IN6PCBHASH_BIND(table, &zero_mapped, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zero_mapped))
goto out;
}
}
#endif
head = IN6PCBHASH_BIND(table, &zeroin6_addr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_V4MAPPED(laddr6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zeroin6_addr))
goto out;
}
return NULL;
out:
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
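/*
* Rehash the PCB according to its new state: remove it from its current
* hash chain (if any) and insert it on the bind or connect chain before
* recording the new state.
*/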
void
in6pcb_set_state(struct inpcb *inp, int state)
{
if (inp->inp_af != AF_INET6)
return;
if (inp->inp_state > INP_ATTACHED)
LIST_REMOVE(inp, inp_hash);
switch (state) {
case INP_BOUND:
LIST_INSERT_HEAD(IN6PCBHASH_BIND(inp->inp_table,
&in6p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
case INP_CONNECTED:
LIST_INSERT_HEAD(IN6PCBHASH_CONNECT(inp->inp_table,
&in6p_faddr(inp), inp->inp_fport,
&in6p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
}
inp->inp_state = state;
}
/* $NetBSD: secmodel_extensions_vfs.c,v 1.1 2023/04/22 13:54:19 riastradh Exp $ */
/*-
* Copyright (c) 2011 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: secmodel_extensions_vfs.c,v 1.1 2023/04/22 13:54:19 riastradh Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <secmodel/secmodel.h>
#include <secmodel/extensions/extensions.h>
#include <secmodel/extensions/extensions_impl.h>
static int dovfsusermount;
static int hardlink_check_uid;
static int hardlink_check_gid;
static kauth_listener_t l_system, l_vnode;
static int secmodel_extensions_system_cb(kauth_cred_t, kauth_action_t,
void *, void *, void *, void *, void *);
static int secmodel_extensions_vnode_cb(kauth_cred_t, kauth_action_t,
void *, void *, void *, void *, void *);
void
secmodel_extensions_vfs_start(void)
{
l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
secmodel_extensions_system_cb, NULL);
l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE,
secmodel_extensions_vnode_cb, NULL);
}
void
secmodel_extensions_vfs_stop(void)
{
kauth_unlisten_scope(l_system);
kauth_unlisten_scope(l_vnode);
}
void
secmodel_extensions_vfs_sysctl(struct sysctllog **clog,
const struct sysctlnode *rnode)
{
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "usermount",
SYSCTL_DESCR("Whether unprivileged users may mount "
"filesystems"),
sysctl_extensions_user_handler, 0, &dovfsusermount, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hardlink_check_uid",
SYSCTL_DESCR("Whether unprivileged users can hardlink "\
"to files they don't own"),
sysctl_extensions_user_handler, 0,
&hardlink_check_uid, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hardlink_check_gid",
SYSCTL_DESCR("Whether unprivileged users can hardlink "\
"to files that are not in their " \
"group membership"),
sysctl_extensions_user_handler, 0,
&hardlink_check_gid, 0,
CTL_CREATE, CTL_EOL);
/* Compatibility: vfs.generic.usermount */
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "generic",
SYSCTL_DESCR("Non-specific vfs related information"),
NULL, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "usermount",
SYSCTL_DESCR("Whether unprivileged users may mount "
"filesystems"),
sysctl_extensions_user_handler, 0, &dovfsusermount, 0,
CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
}
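/*
* kauth(9) system scope listener: implements the "usermount" extension by
* allowing unprivileged mount, unmount and update requests when usermount
* is enabled and the caller owns the mount point (for new mounts) or the
* mount itself (for unmount/update), subject to the common mount policy.
*/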
static int
secmodel_extensions_system_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
vnode_t *vp;
struct vattr va;
struct mount *mp;
u_long flags;
int result;
enum kauth_system_req req;
int error;
req = (enum kauth_system_req)(uintptr_t)arg0;
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_SYSTEM_MOUNT:
if (dovfsusermount == 0)
break;
switch (req) {
case KAUTH_REQ_SYSTEM_MOUNT_NEW:
vp = (vnode_t *)arg1;
mp = vp->v_mount;
flags = (u_long)arg2;
/*
* Ensure that the user owns the directory onto which
* the mount is attempted.
*/
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, cred);
VOP_UNLOCK(vp);
if (error)
break;
if (va.va_uid != kauth_cred_geteuid(cred))
break;
error = usermount_common_policy(mp, flags);
if (error)
break;
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT:
mp = arg1;
/* Must own the mount. */
if (mp->mnt_stat.f_owner == kauth_cred_geteuid(cred))
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_SYSTEM_MOUNT_UPDATE:
mp = arg1;
flags = (u_long)arg2;
/* Must own the mount. */
if (mp->mnt_stat.f_owner == kauth_cred_geteuid(cred) &&
usermount_common_policy(mp, flags) == 0)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
default:
break;
}
return (result);
}
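/*
* kauth(9) vnode scope listener: enforces the hardlink_check_uid and
* hardlink_check_gid restrictions on KAUTH_VNODE_ADD_LINK; a failed check
* is denied unless the credential is root according to the suser secmodel.
*/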
static int
secmodel_extensions_vnode_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
int error;
bool isroot;
struct vattr va;
if ((action & KAUTH_VNODE_ADD_LINK) == 0)
return KAUTH_RESULT_DEFER;
error = VOP_GETATTR((vnode_t *)arg0, &va, cred);
if (error)
goto checkroot;
if (hardlink_check_uid && kauth_cred_geteuid(cred) != va.va_uid)
goto checkroot;
if (hardlink_check_gid && kauth_cred_groupmember(cred, va.va_gid) != 0)
goto checkroot;
return KAUTH_RESULT_DEFER;
checkroot:
error = secmodel_eval("org.netbsd.secmodel.suser", "is-root",
cred, &isroot);
if (error || !isroot)
return KAUTH_RESULT_DENY;
return KAUTH_RESULT_DEFER;
}
/* $NetBSD: uvm_page.h,v 1.109 2020/12/20 16:38:26 skrll Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.h 7.3 (Berkeley) 4/21/91
* from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _UVM_UVM_PAGE_H_
#define _UVM_UVM_PAGE_H_
#ifdef _KERNEL_OPT
#include "opt_uvm_page_trkown.h"
#endif
#include <sys/rwlock.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_pglist.h>
/*
* Management of resident (logical) pages.
*
* Each resident page has a vm_page structure, indexed by page number.
* There are several lists in the structure:
*
* - A red-black tree rooted with the containing object is used to
* quickly perform object+offset lookups.
* - A list of all pages for a given object, so they can be quickly
* deactivated at deallocation time.
* - An ordered list of pages due for pageout.
*
* In addition, the structure contains the object and offset to which
* this page belongs (for pageout) and sundry status bits.
*
* Note that the page structure has no lock of its own. The page is
* generally protected by its owner's lock (UVM object or amap/anon).
* It should be noted that UVM has to serialize pmap(9) operations on
* the managed pages, e.g. for pmap_enter() calls. Hence, the lock
* order is as follows:
*
* [vmpage-owner-lock] ->
* any pmap locks (e.g. PV hash lock)
*
* Since the kernel is always self-consistent, no serialization is
* required for unmanaged mappings, e.g. for pmap_kenter_pa() calls.
*
* Field markings and the corresponding locks:
*
* f: free page queue lock, uvm_fpageqlock
* o: page owner (uvm_object::vmobjlock, vm_amap::am_lock, vm_anon::an_lock)
* i: vm_page::interlock
* => flags set and cleared only with o&i held can
* safely be tested for with only o held.
* o,i: o|i for read, o&i for write (depends on context - if could be loaned)
* => see uvm_loan.c
* w: wired page queue or uvm_pglistalloc:
* => wired page queue: o&i to change, stable from wire to unwire
* XXX What about concurrent or nested wire?
* => uvm_pglistalloc: owned by caller
* ?: locked by pmap or assumed page owner's lock
* p: locked by pagedaemon policy module (pdpolicy)
* c: cpu private
* s: stable, does not change
*
* UVM and pmap(9) may use uvm_page_owner_locked_p() to assert whether the
* page owner's lock is acquired.
*
* A page can have one of four identities:
*
* o free
* => pageq.list is entry on global free page queue
* => uanon is unused (or (void *)0xdeadbeef for DEBUG)
* => uobject is unused (or (void *)0xdeadbeef for DEBUG)
* => PG_FREE is set in flags
* o owned by a uvm_object
* => pageq.queue is entry on wired page queue, if any
* => uanon is NULL or the vm_anon to which it has been O->A loaned
* => uobject is owner
* o owned by a vm_anon
* => pageq is unused (XXX correct?)
* => uanon is owner
* => uobject is NULL
* => PG_ANON is set in flags
* o allocated by uvm_pglistalloc
* => pageq.queue is entry on resulting pglist, owned by caller
* => uanon is unused
* => uobject is unused
*
* The following transitions are allowed:
*
* - uvm_pagealloc: free -> owned by a uvm_object/vm_anon
* - uvm_pagefree: owned by a uvm_object/vm_anon -> free
* - uvm_pglistalloc: free -> allocated by uvm_pglistalloc
* - uvm_pglistfree: allocated by uvm_pglistalloc -> free
*
* On the ordering of fields:
*
* The fields most heavily used during fault processing are clustered
* together at the start of the structure to reduce cache misses.
* XXX This entire thing should be shrunk to fit in one cache line.
*/
struct vm_page {
/* _LP64: first cache line */
union {
TAILQ_ENTRY(vm_page) queue; /* w: wired page queue
* or uvm_pglistalloc output */
LIST_ENTRY(vm_page) list; /* f: global free page queue */
} pageq;
uint32_t pqflags; /* i: pagedaemon flags */
uint32_t flags; /* o: object flags */
paddr_t phys_addr; /* o: physical address of pg */
uint32_t loan_count; /* o,i: num. active loans */
uint32_t wire_count; /* o,i: wired down map refs */
struct vm_anon *uanon; /* o,i: anon */
struct uvm_object *uobject; /* o,i: object */
voff_t offset; /* o: offset into object */
/* _LP64: second cache line */
kmutex_t interlock; /* s: lock on identity */
TAILQ_ENTRY(vm_page) pdqueue; /* p: pagedaemon queue */
#ifdef __HAVE_VM_PAGE_MD
struct vm_page_md mdpage; /* ?: pmap-specific data */
#endif
#if defined(UVM_PAGE_TRKOWN)
/* debugging fields to track page ownership */
pid_t owner; /* proc that set PG_BUSY */
lwpid_t lowner; /* lwp that set PG_BUSY */
const char *owner_tag; /* why it was set busy */
#endif
};
/*
* Overview of UVM page flags, stored in pg->flags.
*
* Locking notes:
*
* PG_, struct vm_page::flags => locked by owner
* PG_AOBJ => additionally locked by vm_page::interlock
* PG_ANON => additionally locked by vm_page::interlock
* PG_FREE => additionally locked by uvm_fpageqlock
* for uvm_pglistalloc()
*
* Flag descriptions:
*
* PG_CLEAN:
* Page is known clean.
* The contents of the page are consistent with its backing store.
*
* PG_DIRTY:
* Page is known dirty.
* To avoid losing data, the contents of the page should be written
* back to the backing store before freeing the page.
*
* PG_BUSY:
* Page is long-term locked, usually because I/O (a transfer from the
* page memory to the backing store) is in progress. An LWP attempting
* to access the page shall set PQ_WANTED and wait. PG_BUSY may only
* be set with a write lock held on the object.
*
* PG_PAGEOUT:
* Indicates that the page is being paged-out in preparation for
* being freed.
*
* PG_RELEASED:
* Indicates that the page, which is currently PG_BUSY, should be freed
* after the long-term lock is released. It is the responsibility of the
* owning LWP (i.e. the one which set PG_BUSY) to do so.
*
* PG_FAKE:
* Page has been allocated, but not yet initialised. The flag is used
* to avoid overwriting valid data, e.g. to prevent a read from the
* backing store when the in-core data is newer.
*
* PG_RDONLY:
* Indicates that the page must be mapped read-only.
*
* PG_MARKER:
* Dummy marker page, generally used for list traversal.
*/
/*
* if you want to renumber PG_CLEAN and PG_DIRTY, check __CTASSERTs in
* uvm_page_status.c first.
*/
#define PG_CLEAN 0x00000001 /* page is known clean */
#define PG_DIRTY 0x00000002 /* page is known dirty */
#define PG_BUSY 0x00000004 /* page is locked */
#define PG_PAGEOUT 0x00000010 /* page to be freed for pagedaemon */
#define PG_RELEASED 0x00000020 /* page to be freed when unbusied */
#define PG_FAKE 0x00000040 /* page is not yet initialized */
#define PG_RDONLY 0x00000080 /* page must be mapped read-only */
#define PG_TABLED 0x00000200 /* page is tabled in object */
#define PG_AOBJ 0x00000400 /* page is part of an anonymous
uvm_object */
#define PG_ANON 0x00000800 /* page is part of an anon, rather
than an uvm_object */
#define PG_FILE 0x00001000 /* file backed (non-anonymous) */
#define PG_READAHEAD 0x00002000 /* read-ahead but not "hit" yet */
#define PG_FREE 0x00004000 /* page is on free list */
#define PG_MARKER 0x00008000 /* dummy marker page */
#define PG_PAGER1 0x00010000 /* pager-specific flag */
#define PG_PGLCA 0x00020000 /* allocated by uvm_pglistalloc_contig */
#define PG_STAT (PG_ANON|PG_AOBJ|PG_FILE)
#define PG_SWAPBACKED (PG_ANON|PG_AOBJ)
#define UVM_PGFLAGBITS \
"\20\1CLEAN\2DIRTY\3BUSY" \
"\5PAGEOUT\6RELEASED\7FAKE\10RDONLY" \
"\11ZERO\12TABLED\13AOBJ\14ANON" \
"\15FILE\16READAHEAD\17FREE\20MARKER" \
"\21PAGER1\22PGLCA"
/*
* Flags stored in pg->pqflags, which is protected by pg->interlock.
*
* PQ_PRIVATE:
* ... is for uvmpdpol to do whatever it wants with.
*
* PQ_INTENT_SET:
* Indicates that the intent set on the page has not yet been realized.
*
* PQ_INTENT_QUEUED:
* Indicates that the page is, or will soon be, on a per-CPU queue for
* the intent to be realized.
*
* PQ_WANTED:
* Indicates that the page, which is currently PG_BUSY, is wanted by
* some other LWP. The page owner (i.e. the LWP which set PG_BUSY) is
* responsible for clearing both flags and waking up any waiters once it has
* released the long-term lock (PG_BUSY).
*/
#define PQ_INTENT_A 0x00000000 /* intend activation */
#define PQ_INTENT_I 0x00000001 /* intend deactivation */
#define PQ_INTENT_E 0x00000002 /* intend enqueue */
#define PQ_INTENT_D 0x00000003 /* intend dequeue */
#define PQ_INTENT_MASK 0x00000003 /* mask of intended state */
#define PQ_INTENT_SET 0x00000004 /* not realized yet */
#define PQ_INTENT_QUEUED 0x00000008 /* queued for processing */
#define PQ_PRIVATE 0x00000ff0 /* private for pdpolicy */
#define PQ_WANTED 0x00001000 /* someone is waiting for page */
#define UVM_PQFLAGBITS \
"\20\1INTENT_0\2INTENT_1\3INTENT_SET\4INTENT_QUEUED" \
"\5PRIVATE1\6PRIVATE2\7PRIVATE3\10PRIVATE4" \
"\11PRIVATE5\12PRIVATE6\13PRIVATE7\14PRIVATE8" \
"\15WANTED"
/*
* physical memory layout structure
*
* MD vmparam.h must #define:
* VM_PHYSSEG_MAX = max number of physical memory segments we support
* (if this is "1" then we revert to a "contig" case)
* VM_PHYSSEG_STRAT: memory sort/search options (for VM_PHYSSEG_MAX > 1)
* - VM_PSTRAT_RANDOM: linear search (random order)
* - VM_PSTRAT_BSEARCH: binary search (sorted by address)
* - VM_PSTRAT_BIGFIRST: linear search (sorted by largest segment first)
* - others?
* XXXCDC: eventually we should purge all left-over global variables...
*/
#define VM_PSTRAT_RANDOM 1
#define VM_PSTRAT_BSEARCH 2
#define VM_PSTRAT_BIGFIRST 3
#ifdef _KERNEL
/*
* prototypes: the following prototypes define the interface to pages
*/
void uvm_page_init(vaddr_t *, vaddr_t *);
void uvm_pglistalloc_init(void);
#if defined(UVM_PAGE_TRKOWN)
void uvm_page_own(struct vm_page *, const char *);
#endif
#if !defined(PMAP_STEAL_MEMORY)
bool uvm_page_physget(paddr_t *);
#endif
void uvm_page_recolor(int);
void uvm_page_rebucket(void);
void uvm_pageactivate(struct vm_page *);
vaddr_t uvm_pageboot_alloc(vsize_t);
void uvm_pagecopy(struct vm_page *, struct vm_page *);
void uvm_pagedeactivate(struct vm_page *);
void uvm_pagedequeue(struct vm_page *);
void uvm_pageenqueue(struct vm_page *);
void uvm_pagefree(struct vm_page *);
void uvm_pagelock(struct vm_page *);
void uvm_pagelock2(struct vm_page *, struct vm_page *);
void uvm_pageunlock(struct vm_page *);
void uvm_pageunlock2(struct vm_page *, struct vm_page *);
void uvm_page_unbusy(struct vm_page **, int);
struct vm_page *uvm_pagelookup(struct uvm_object *, voff_t);
void uvm_pageunwire(struct vm_page *);
void uvm_pagewire(struct vm_page *);
void uvm_pagezero(struct vm_page *);
bool uvm_pageismanaged(paddr_t);
bool uvm_page_owner_locked_p(struct vm_page *, bool);
void uvm_pgfl_lock(void);
void uvm_pgfl_unlock(void);
unsigned int uvm_pagegetdirty(struct vm_page *);
void uvm_pagemarkdirty(struct vm_page *, unsigned int);
bool uvm_pagecheckdirty(struct vm_page *, bool);
bool uvm_pagereadonly_p(struct vm_page *);
bool uvm_page_locked_p(struct vm_page *);
void uvm_pagewakeup(struct vm_page *);
bool uvm_pagewanted_p(struct vm_page *);
void uvm_pagewait(struct vm_page *, krwlock_t *, const char *);
int uvm_page_lookup_freelist(struct vm_page *);
struct vm_page *uvm_phys_to_vm_page(paddr_t);
paddr_t uvm_vm_page_to_phys(const struct vm_page *);
#if defined(PMAP_DIRECT)
extern bool ubc_direct;
int uvm_direct_process(struct vm_page **, u_int, voff_t, vsize_t,
int (*)(void *, size_t, void *), void *);
#endif
/*
* page dirtiness status for uvm_pagegetdirty and uvm_pagemarkdirty
*
* UNKNOWN means that we need to consult pmap to know if the page is
* dirty or not.
* basically, UVM_PAGE_STATUS_CLEAN implies that the page has no writable
* mapping.
*
* if you want to renumber these, check __CTASSERTs in
* uvm_page_status.c first.
*/
#define UVM_PAGE_STATUS_UNKNOWN 0
#define UVM_PAGE_STATUS_CLEAN 1
#define UVM_PAGE_STATUS_DIRTY 2
#define UVM_PAGE_NUM_STATUS 3
/*
* macros
*/
#define VM_PAGE_TO_PHYS(entry) uvm_vm_page_to_phys(entry)
#ifdef __HAVE_VM_PAGE_MD
#define VM_PAGE_TO_MD(pg) (&(pg)->mdpage)
#define VM_MD_TO_PAGE(md) (container_of((md), struct vm_page, mdpage))
#endif
/*
* Compute the page color for a given page.
*/
#define VM_PGCOLOR(pg) \
(atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask)
#define PHYS_TO_VM_PAGE(pa) uvm_phys_to_vm_page(pa)
/*
* VM_PAGE_IS_FREE() can't tell if the page is on the global free list or in
* a per-CPU cache. If you need to be certain, pause caching.
*/
#define VM_PAGE_IS_FREE(entry) ((entry)->flags & PG_FREE)
/*
* Use the lower 10 bits of pg->phys_addr to cache some locators for
* the page. This implies that the smallest possible page size is 1kB, and
* that nobody should use pg->phys_addr directly (use VM_PAGE_TO_PHYS()).
*
* - 5 bits for the freelist index, because uvm_page_lookup_freelist()
* traverses an rbtree and therefore features prominently in traces
* captured during performance tests. It would probably be more useful to
* cache physseg index here because freelist can be inferred from physseg,
* but it requires changes to allocation for UVM_HOTPLUG, so for now we'll
* go with freelist.
*
* - 5 bits for "bucket", a way for us to categorise pages further as
* needed (e.g. NUMA node).
*
* None of this is set in stone; it can be adjusted as needed.
*/
#define UVM_PHYSADDR_FREELIST __BITS(0,4)
#define UVM_PHYSADDR_BUCKET __BITS(5,9)
static inline unsigned
uvm_page_get_freelist(struct vm_page *pg)
{
unsigned fl = __SHIFTOUT(pg->phys_addr, UVM_PHYSADDR_FREELIST);
KASSERT(fl == (unsigned)uvm_page_lookup_freelist(pg));
return fl;
}
static inline unsigned
uvm_page_get_bucket(struct vm_page *pg)
{
return __SHIFTOUT(pg->phys_addr, UVM_PHYSADDR_BUCKET);
}
static inline void
uvm_page_set_freelist(struct vm_page *pg, unsigned fl)
{
KASSERT(fl < 32);
pg->phys_addr &= ~UVM_PHYSADDR_FREELIST;
pg->phys_addr |= __SHIFTIN(fl, UVM_PHYSADDR_FREELIST);
}
static inline void
uvm_page_set_bucket(struct vm_page *pg, unsigned b)
{
KASSERT(b < 32);
pg->phys_addr &= ~UVM_PHYSADDR_BUCKET;
pg->phys_addr |= __SHIFTIN(b, UVM_PHYSADDR_BUCKET);
}
#endif /* _KERNEL */
#endif /* _UVM_UVM_PAGE_H_ */
/* $NetBSD: kern_sig.c,v 1.409 2024/02/10 09:24:18 andvar Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.14 (Berkeley) 5/14/95
*/
/*
* Signal subsystem.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sig.c,v 1.409 2024/02/10 09:24:18 andvar Exp $");
#include "opt_execfmt.h"
#include "opt_ptrace.h"
#include "opt_dtrace.h"
#include "opt_compat_sunos.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_netbsd32.h"
#include "opt_pax.h"
#define SIGPROP /* include signal properties table */
#include <sys/param.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/systm.h>
#include <sys/wait.h>
#include <sys/ktrace.h>
#include <sys/syslog.h>
#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/pool.h>
#include <sys/ucontext.h>
#include <sys/exec.h>
#include <sys/kauth.h>
#include <sys/acct.h>
#include <sys/callout.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/exec_elf.h>
#include <sys/compat_stub.h>
#ifdef PAX_SEGVGUARD
#include <sys/pax.h>
#endif /* PAX_SEGVGUARD */
#include <uvm/uvm_extern.h>
/* Many hard-coded assumptions that there are <= 4 x 32bit signal mask bits */
__CTASSERT(NSIG <= 128);
#define SIGQUEUE_MAX 32
static pool_cache_t sigacts_cache __read_mostly;
static pool_cache_t ksiginfo_cache __read_mostly;
static callout_t proc_stop_ch __cacheline_aligned;
sigset_t contsigmask __cacheline_aligned;
sigset_t stopsigmask __cacheline_aligned;
static sigset_t vforksigmask __cacheline_aligned;
sigset_t sigcantmask __cacheline_aligned;
static void ksiginfo_exechook(struct proc *, void *);
static void proc_stop(struct proc *, int);
static void proc_stop_done(struct proc *, int);
static void proc_stop_callout(void *);
static int sigchecktrace(void);
static int sigpost(struct lwp *, sig_t, int, int);
static int sigput(sigpend_t *, struct proc *, ksiginfo_t *);
static int sigunwait(struct proc *, const ksiginfo_t *);
static void sigswitch(int, int, bool);
static void sigswitch_unlock_and_switch_away(struct lwp *);
static void sigacts_poolpage_free(struct pool *, void *);
static void *sigacts_poolpage_alloc(struct pool *, int);
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE3(proc, kernel, , signal__send,
"struct lwp *", /* target thread */
"struct proc *", /* target process */
"int"); /* signal */
SDT_PROBE_DEFINE3(proc, kernel, , signal__discard,
"struct lwp *", /* target thread */
"struct proc *", /* target process */
"int"); /* signal */
SDT_PROBE_DEFINE3(proc, kernel, , signal__handle,
"int", /* signal */
"ksiginfo_t *", /* signal info */
"void (*)(void)"); /* handler address */
static struct pool_allocator sigactspool_allocator = {
.pa_alloc = sigacts_poolpage_alloc,
.pa_free = sigacts_poolpage_free
};
#ifdef DEBUG
int kern_logsigexit = 1;
#else
int kern_logsigexit = 0;
#endif
static const char logcoredump[] =
"pid %d (%s), uid %d: exited on signal %d (core dumped)\n";
static const char lognocoredump[] =
"pid %d (%s), uid %d: exited on signal %d (core not dumped, err = %d)\n";
static kauth_listener_t signal_listener;
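/*
* kauth(9) process scope listener for KAUTH_PROCESS_SIGNAL: allow the
* signal if the credentials match the target process, or if it is SIGCONT
* being sent within the sender's own session.
*/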
static int
signal_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result, signum;
result = KAUTH_RESULT_DEFER;
p = arg0;
signum = (int)(unsigned long)arg1;
if (action != KAUTH_PROCESS_SIGNAL)
return result;
if (kauth_cred_uidmatch(cred, p->p_cred) ||
(signum == SIGCONT && (curproc->p_session == p->p_session)))
result = KAUTH_RESULT_ALLOW;
return result;
}
static int
sigacts_ctor(void *arg __unused, void *obj, int flags __unused)
{
memset(obj, 0, sizeof(struct sigacts));
return 0;
}
/*
* signal_init:
*
* Initialize global signal-related data structures.
*/
void
signal_init(void)
{
sigactspool_allocator.pa_pagesz = (PAGE_SIZE)*2;
sigacts_cache = pool_cache_init(sizeof(struct sigacts), 0, 0, 0,
"sigacts", sizeof(struct sigacts) > PAGE_SIZE ?
&sigactspool_allocator : NULL, IPL_NONE, sigacts_ctor, NULL, NULL);
ksiginfo_cache = pool_cache_init(sizeof(ksiginfo_t), 0, 0, 0,
"ksiginfo", NULL, IPL_VM, NULL, NULL, NULL);
exechook_establish(ksiginfo_exechook, NULL);
callout_init(&proc_stop_ch, CALLOUT_MPSAFE);
callout_setfunc(&proc_stop_ch, proc_stop_callout, NULL);
signal_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
signal_listener_cb, NULL);
}
/*
* sigacts_poolpage_alloc:
*
* Allocate a page for the sigacts memory pool.
*/
static void *
sigacts_poolpage_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(kernel_map,
PAGE_SIZE * 2, PAGE_SIZE * 2,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
| UVM_KMF_WIRED);
}
/*
* sigacts_poolpage_free:
*
* Free a page on behalf of the sigacts memory pool.
*/
static void
sigacts_poolpage_free(struct pool *pp, void *v)
{
uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * 2, UVM_KMF_WIRED);
}
/*
* sigactsinit:
*
* Create an initial sigacts structure, using the same signal state
* as the specified process. If 'share' is set, share the sigacts by
* holding a reference; otherwise just copy it from the parent.
*/
struct sigacts *
sigactsinit(struct proc *pp, int share)
{
struct sigacts *ps = pp->p_sigacts, *ps2;
if (__predict_false(share)) {
atomic_inc_uint(&ps->sa_refcnt);
return ps;
}
ps2 = pool_cache_get(sigacts_cache, PR_WAITOK);
mutex_init(&ps2->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
ps2->sa_refcnt = 1;
mutex_enter(&ps->sa_mutex);
memcpy(ps2->sa_sigdesc, ps->sa_sigdesc, sizeof(ps2->sa_sigdesc));
mutex_exit(&ps->sa_mutex);
return ps2;
}
/*
* sigactsunshare:
*
* Make this process not share its sigacts, maintaining all signal state.
*/
void
sigactsunshare(struct proc *p)
{
struct sigacts *ps, *oldps = p->p_sigacts;
if (__predict_true(oldps->sa_refcnt == 1))
return;
ps = pool_cache_get(sigacts_cache, PR_WAITOK);
mutex_init(&ps->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
memcpy(ps->sa_sigdesc, oldps->sa_sigdesc, sizeof(ps->sa_sigdesc));
ps->sa_refcnt = 1;
p->p_sigacts = ps;
sigactsfree(oldps);
}
/*
* sigactsfree:
*
* Release a sigacts structure.
*/
void
sigactsfree(struct sigacts *ps)
{
membar_release();
if (atomic_dec_uint_nv(&ps->sa_refcnt) == 0) {
membar_acquire();
mutex_destroy(&ps->sa_mutex);
pool_cache_put(sigacts_cache, ps);
}
}
/*
* siginit:
*
* Initialize signal state for process 0; set to ignore signals that
* are ignored by default and disable the signal stack. Locking not
* required as the system is still cold.
*/
void
siginit(struct proc *p)
{
struct lwp *l;
struct sigacts *ps;
int signo, prop;
ps = p->p_sigacts;
sigemptyset(&contsigmask);
sigemptyset(&stopsigmask);
sigemptyset(&vforksigmask);
sigemptyset(&sigcantmask);
for (signo = 1; signo < NSIG; signo++) {
prop = sigprop[signo];
if (prop & SA_CONT)
sigaddset(&contsigmask, signo);
if (prop & SA_STOP)
sigaddset(&stopsigmask, signo);
if (prop & SA_STOP && signo != SIGSTOP)
sigaddset(&vforksigmask, signo);
if (prop & SA_CANTMASK)
sigaddset(&sigcantmask, signo);
if (prop & SA_IGNORE && signo != SIGCONT)
sigaddset(&p->p_sigctx.ps_sigignore, signo);
sigemptyset(&SIGACTION_PS(ps, signo).sa_mask);
SIGACTION_PS(ps, signo).sa_flags = SA_RESTART;
}
sigemptyset(&p->p_sigctx.ps_sigcatch);
p->p_sflag &= ~PS_NOCLDSTOP;
ksiginfo_queue_init(&p->p_sigpend.sp_info);
sigemptyset(&p->p_sigpend.sp_set);
/*
* Reset per LWP state.
*/
l = LIST_FIRST(&p->p_lwps);
l->l_sigwaited = NULL;
l->l_sigstk = SS_INIT;
ksiginfo_queue_init(&l->l_sigpend.sp_info);
sigemptyset(&l->l_sigpend.sp_set);
/* One reference. */
ps->sa_refcnt = 1;
}
/*
* execsigs:
*
* Reset signals for an exec of the specified process.
*/
void
execsigs(struct proc *p)
{
struct sigacts *ps;
struct lwp *l;
int signo, prop;
sigset_t tset;
ksiginfoq_t kq;
KASSERT(p->p_nlwps == 1);
sigactsunshare(p);
ps = p->p_sigacts;
/*
* Reset caught signals. Held signals remain held through
* l->l_sigmask (unless they were caught, and are now ignored
* by default).
*
* No need to lock yet, the process has only one LWP and
* at this point the sigacts are private to the process.
*/
sigemptyset(&tset);
for (signo = 1; signo < NSIG; signo++) {
if (sigismember(&p->p_sigctx.ps_sigcatch, signo)) {
prop = sigprop[signo];
if (prop & SA_IGNORE) {
if ((prop & SA_CONT) == 0)
sigaddset(&p->p_sigctx.ps_sigignore,
signo);
sigaddset(&tset, signo);
}
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
}
sigemptyset(&SIGACTION_PS(ps, signo).sa_mask);
SIGACTION_PS(ps, signo).sa_flags = SA_RESTART;
}
ksiginfo_queue_init(&kq);
mutex_enter(p->p_lock);
sigclearall(p, &tset, &kq);
sigemptyset(&p->p_sigctx.ps_sigcatch);
/*
* Reset the "no zombies if child dies" flag, as Solaris does.
*/
p->p_flag &= ~(PK_NOCLDWAIT | PK_CLDSIGIGN);
if (SIGACTION_PS(ps, SIGCHLD).sa_handler == SIG_IGN)
SIGACTION_PS(ps, SIGCHLD).sa_handler = SIG_DFL;
/*
* Reset per-LWP state.
*/
l = LIST_FIRST(&p->p_lwps);
l->l_sigwaited = NULL;
l->l_sigstk = SS_INIT;
ksiginfo_queue_init(&l->l_sigpend.sp_info);
sigemptyset(&l->l_sigpend.sp_set);
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
}
/*
* ksiginfo_exechook:
*
* Free all pending ksiginfo entries from a process on exec.
* Additionally, drain any unused ksiginfo structures in the
* system back to the pool.
*
* XXX This should not be a hook, every process has signals.
*/
static void
ksiginfo_exechook(struct proc *p, void *v)
{
ksiginfoq_t kq;
ksiginfo_queue_init(&kq);
mutex_enter(p->p_lock);
sigclearall(p, NULL, &kq);
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
}
/*
* ksiginfo_alloc:
*
* Allocate a new ksiginfo structure from the pool, and optionally copy
* an existing one. If the existing ksiginfo_t is from the pool, and
* has not been queued somewhere, then just return it. Additionally,
* if the existing ksiginfo_t does not contain any information beyond
* the signal number, then just return it.
*/
ksiginfo_t *
ksiginfo_alloc(struct proc *p, ksiginfo_t *ok, int flags)
{
ksiginfo_t *kp;
if (ok != NULL) {
if ((ok->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) ==
KSI_FROMPOOL)
return ok;
if (KSI_EMPTY_P(ok))
return ok;
}
kp = pool_cache_get(ksiginfo_cache, flags);
if (kp == NULL) {
#ifdef DIAGNOSTIC
printf("Out of memory allocating ksiginfo for pid %d\n",
p->p_pid);
#endif
return NULL;
}
if (ok != NULL) {
memcpy(kp, ok, sizeof(*kp));
kp->ksi_flags &= ~KSI_QUEUED;
} else
KSI_INIT_EMPTY(kp);
kp->ksi_flags |= KSI_FROMPOOL;
return kp;
}
/*
* ksiginfo_free:
*
* If the given ksiginfo_t is from the pool and has not been queued,
* then free it.
*/
void
ksiginfo_free(ksiginfo_t *kp)
{
if ((kp->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) != KSI_FROMPOOL)
return;
pool_cache_put(ksiginfo_cache, kp);
}
/*
* ksiginfo_queue_drain:
*
* Drain a non-empty ksiginfo_t queue.
*/
void
ksiginfo_queue_drain0(ksiginfoq_t *kq)
{
ksiginfo_t *ksi;
KASSERT(!TAILQ_EMPTY(kq));
while (!TAILQ_EMPTY(kq)) {
ksi = TAILQ_FIRST(kq);
TAILQ_REMOVE(kq, ksi, ksi_list);
pool_cache_put(ksiginfo_cache, ksi);
}
}
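/*
* siggetinfo:
*
* Dequeue and copy out the first pending ksiginfo for the given signal,
* returning the number of matching entries found. If none is found, a
* minimal siginfo is manufactured for the caller.
*/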
static int
siggetinfo(sigpend_t *sp, ksiginfo_t *out, int signo)
{
ksiginfo_t *ksi, *nksi;
if (sp == NULL)
goto out;
/* Find siginfo and copy it out. */
int count = 0;
TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, nksi) {
if (ksi->ksi_signo != signo)
continue;
if (count++ > 0) /* Only remove the first, count all of them */
continue;
TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list);
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0);
ksi->ksi_flags &= ~KSI_QUEUED;
if (out != NULL) {
memcpy(out, ksi, sizeof(*out));
out->ksi_flags &= ~(KSI_FROMPOOL | KSI_QUEUED);
}
ksiginfo_free(ksi);
}
if (count)
return count;
out:
/* If there is no siginfo, then manufacture it. */
if (out != NULL) {
KSI_INIT(out);
out->ksi_info._signo = signo;
out->ksi_info._code = SI_NOINFO;
}
return 0;
}
/*
* sigget:
*
* Fetch the first pending signal from a set. Optionally, also fetch
* or manufacture a ksiginfo element. Returns the number of the first
* pending signal, or zero.
*/
int
sigget(sigpend_t *sp, ksiginfo_t *out, int signo, const sigset_t *mask)
{
sigset_t tset;
int count;
/* If there's no pending set, the signal is from the debugger. */
if (sp == NULL)
goto out;
/* Construct mask from signo, and 'mask'. */
if (signo == 0) {
if (mask != NULL) {
tset = *mask;
__sigandset(&sp->sp_set, &tset);
} else
tset = sp->sp_set;
/* If there are no signals pending - return. */
if ((signo = firstsig(&tset)) == 0)
goto out;
} else {
KASSERT(sigismember(&sp->sp_set, signo));
}
sigdelset(&sp->sp_set, signo);
out:
count = siggetinfo(sp, out, signo);
if (count > 1)
sigaddset(&sp->sp_set, signo);
return signo;
}
/*
* sigput:
*
* Append a new ksiginfo element to the list of pending ksiginfo's.
*/
static int
sigput(sigpend_t *sp, struct proc *p, ksiginfo_t *ksi)
{
ksiginfo_t *kp;
KASSERT(mutex_owned(p->p_lock));
KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0);
sigaddset(&sp->sp_set, ksi->ksi_signo);
/*
* If there is no siginfo, we are done.
*/
if (KSI_EMPTY_P(ksi))
return 0;
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
size_t count = 0;
TAILQ_FOREACH(kp, &sp->sp_info, ksi_list) {
count++;
if (ksi->ksi_signo >= SIGRTMIN && ksi->ksi_signo <= SIGRTMAX)
continue;
if (kp->ksi_signo == ksi->ksi_signo) {
KSI_COPY(ksi, kp);
kp->ksi_flags |= KSI_QUEUED;
return 0;
}
}
if (count >= SIGQUEUE_MAX) {
#ifdef DIAGNOSTIC
printf("%s(%d): Signal queue is full signal=%d\n",
p->p_comm, p->p_pid, ksi->ksi_signo);
#endif
return EAGAIN;
}
ksi->ksi_flags |= KSI_QUEUED;
TAILQ_INSERT_TAIL(&sp->sp_info, ksi, ksi_list);
return 0;
}
/*
* sigclear:
*
* Clear all pending signals in the specified set.
*/
void
sigclear(sigpend_t *sp, const sigset_t *mask, ksiginfoq_t *kq)
{
ksiginfo_t *ksi, *next;
if (mask == NULL)
sigemptyset(&sp->sp_set);
else
sigminusset(mask, &sp->sp_set);
TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, next) {
if (mask == NULL || sigismember(mask, ksi->ksi_signo)) {
TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list);
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0);
TAILQ_INSERT_TAIL(kq, ksi, ksi_list);
}
}
}
/*
* sigclearall:
*
* Clear all pending signals in the specified set from a process and
* its LWPs.
*/
void
sigclearall(struct proc *p, const sigset_t *mask, ksiginfoq_t *kq)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
sigclear(&p->p_sigpend, mask, kq);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
sigclear(&l->l_sigpend, mask, kq);
}
}
/*
* sigispending:
*
* Return the first signal number if there are pending signals for the
* current LWP. May be called unlocked provided that LW_PENDSIG is set,
* and that the signal has been posted to the appropriate queue before
* LW_PENDSIG is set.
*
* This should only ever be called with (l == curlwp), unless the
* result does not matter (procfs, sysctl).
*/
int
sigispending(struct lwp *l, int signo)
{
struct proc *p = l->l_proc;
sigset_t tset;
membar_consumer();
tset = l->l_sigpend.sp_set;
sigplusset(&p->p_sigpend.sp_set, &tset);
sigminusset(&p->p_sigctx.ps_sigignore, &tset);
sigminusset(&l->l_sigmask, &tset);
if (signo == 0) {
return firstsig(&tset);
}
return sigismember(&tset, signo) ? signo : 0;
}
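/*
* getucontext:
*
* Fill in a ucontext_t with the LWP's current signal mask, stack
* information and machine context. Called with p_lock held; the lock
* is dropped around cpu_getmcontext().
*/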
void
getucontext(struct lwp *l, ucontext_t *ucp)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
ucp->uc_flags = 0;
ucp->uc_link = l->l_ctxlink;
ucp->uc_sigmask = l->l_sigmask;
ucp->uc_flags |= _UC_SIGMASK;
/*
* The (unsupplied) definition of the `current execution stack'
* in the System V Interface Definition appears to allow returning
* the main context stack.
*/
if ((l->l_sigstk.ss_flags & SS_ONSTACK) == 0) {
ucp->uc_stack.ss_sp = (void *)l->l_proc->p_stackbase;
ucp->uc_stack.ss_size = ctob(l->l_proc->p_vmspace->vm_ssize);
ucp->uc_stack.ss_flags = 0; /* XXX, def. is Very Fishy */
} else {
/* Simply copy alternate signal execution stack. */
ucp->uc_stack = l->l_sigstk;
}
ucp->uc_flags |= _UC_STACK;
mutex_exit(p->p_lock);
cpu_getmcontext(l, &ucp->uc_mcontext, &ucp->uc_flags);
mutex_enter(p->p_lock);
}
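/*
* setucontext:
*
* Install the supplied ucontext_t: restore the signal mask and machine
* context, update the context link and the alternate signal stack state.
* Called with p_lock held; the lock is dropped around cpu_setmcontext().
*/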
int
setucontext(struct lwp *l, const ucontext_t *ucp)
{
struct proc *p = l->l_proc;
int error;
KASSERT(mutex_owned(p->p_lock));
if ((ucp->uc_flags & _UC_SIGMASK) != 0) {
error = sigprocmask1(l, SIG_SETMASK, &ucp->uc_sigmask, NULL);
if (error != 0)
return error;
}
mutex_exit(p->p_lock);
error = cpu_setmcontext(l, &ucp->uc_mcontext, ucp->uc_flags);
mutex_enter(p->p_lock);
if (error != 0)
return (error);
l->l_ctxlink = ucp->uc_link;
/*
* If there was stack information, update whether or not we are
* still running on an alternate signal stack.
*/
if ((ucp->uc_flags & _UC_STACK) != 0) {
if (ucp->uc_stack.ss_flags & SS_ONSTACK)
l->l_sigstk.ss_flags |= SS_ONSTACK;
else
l->l_sigstk.ss_flags &= ~SS_ONSTACK;
}
return 0;
}
/*
* killpg1: common code for kill process group/broadcast kill.
*/
int
killpg1(struct lwp *l, ksiginfo_t *ksi, int pgid, int all)
{
struct proc *p, *cp;
kauth_cred_t pc;
struct pgrp *pgrp;
int nfound;
int signo = ksi->ksi_signo;
cp = l->l_proc;
pc = l->l_cred;
nfound = 0;
mutex_enter(&proc_lock);
if (all) {
/*
* Broadcast.
*/
PROCLIST_FOREACH(p, &allproc) {
if (p->p_pid <= 1 || p == cp ||
(p->p_flag & PK_SYSTEM) != 0)
continue;
mutex_enter(p->p_lock);
if (kauth_authorize_process(pc,
KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(signo), NULL,
NULL) == 0) {
nfound++;
if (signo)
kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
}
} else {
if (pgid == 0)
/* Zero pgid means send to my process group. */
pgrp = cp->p_pgrp;
else {
pgrp = pgrp_find(pgid);
if (pgrp == NULL)
goto out;
}
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
if (p->p_pid <= 1 || p->p_flag & PK_SYSTEM)
continue;
mutex_enter(p->p_lock);
if (kauth_authorize_process(pc, KAUTH_PROCESS_SIGNAL,
p, KAUTH_ARG(signo), NULL, NULL) == 0) {
nfound++;
if (signo && P_ZOMBIE(p) == 0)
kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
}
}
out:
mutex_exit(&proc_lock);
return nfound ? 0 : ESRCH;
}
/*
* Send a signal to a process group. If checkctty is set, limit to members
* which have a controlling terminal.
*/
void
pgsignal(struct pgrp *pgrp, int sig, int checkctty)
{
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KSI_INIT_EMPTY(&ksi);
ksi.ksi_signo = sig;
kpgsignal(pgrp, &ksi, NULL, checkctty);
}
void
kpgsignal(struct pgrp *pgrp, ksiginfo_t *ksi, void *data, int checkctty)
{
struct proc *p;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KASSERT(pgrp != NULL);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist)
if (checkctty == 0 || p->p_lflag & PL_CONTROLT)
kpsignal(p, ksi, data);
}
/*
* Send a signal caused by a trap to the current LWP. If it will be caught
* immediately, deliver it with correct code. Otherwise, post it normally.
*/
void
trapsignal(struct lwp *l, ksiginfo_t *ksi)
{
struct proc *p;
struct sigacts *ps;
int signo = ksi->ksi_signo;
sigset_t *mask;
sig_t action;
KASSERT(KSI_TRAP_P(ksi));
ksi->ksi_lid = l->l_lid;
p = l->l_proc;
KASSERT(!cpu_intr_p());
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
repeat:
/*
* If we are exiting, demise now.
*
* This avoids notifying tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
lwp_exit(l);
panic("trapsignal");
/* NOTREACHED */
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
mutex_exit(&proc_lock);
sigswitch_unlock_and_switch_away(l);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
goto repeat;
}
mask = &l->l_sigmask;
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
if (ISSET(p->p_slflag, PSL_TRACED) &&
!(p->p_pptr == p->p_opptr && ISSET(p->p_lflag, PL_PPWAIT)) &&
p->p_xsig != SIGKILL &&
!sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
p->p_xsig = signo;
p->p_sigctx.ps_faked = true;
p->p_sigctx.ps_lwp = ksi->ksi_lid;
p->p_sigctx.ps_info = ksi->ksi_info;
sigswitch(0, signo, true);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, ksi);
else
ktrpsig(signo, action, mask, ksi);
}
return;
}
const bool caught = sigismember(&p->p_sigctx.ps_sigcatch, signo);
const bool masked = sigismember(mask, signo);
if (caught && !masked) {
mutex_exit(&proc_lock);
l->l_ru.ru_nsignals++;
kpsendsig(l, ksi, mask);
mutex_exit(p->p_lock);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, ksi);
else
ktrpsig(signo, action, mask, ksi);
}
return;
}
/*
* If the signal is masked or ignored, then unmask it and
* reset it to the default action so that the process or
* its tracer will be notified.
*/
const bool ignored = action == SIG_IGN;
if (masked || ignored) {
mutex_enter(&ps->sa_mutex);
sigdelset(mask, signo);
sigdelset(&p->p_sigctx.ps_sigcatch, signo);
sigdelset(&p->p_sigctx.ps_sigignore, signo);
sigdelset(&SIGACTION_PS(ps, signo).sa_mask, signo);
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
mutex_exit(&ps->sa_mutex);
}
kpsignal2(p, ksi);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
}
/*
* Fill in signal information and signal the parent for a child status change.
*/
void
child_psignal(struct proc *p, int mask)
{
ksiginfo_t ksi;
struct proc *q;
int xsig;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
xsig = p->p_xsig;
KSI_INIT(&ksi);
ksi.ksi_signo = SIGCHLD;
ksi.ksi_code = (xsig == SIGCONT ? CLD_CONTINUED : CLD_STOPPED);
ksi.ksi_pid = p->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(p->p_cred);
ksi.ksi_status = xsig;
ksi.ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec;
ksi.ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec;
q = p->p_pptr;
mutex_exit(p->p_lock);
mutex_enter(q->p_lock);
if ((q->p_sflag & mask) == 0)
kpsignal2(q, &ksi);
mutex_exit(q->p_lock);
mutex_enter(p->p_lock);
}
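/*
* psignal:
*
* Send a signal to a process, with empty siginfo.
*/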
void
psignal(struct proc *p, int signo)
{
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KSI_INIT_EMPTY(&ksi);
ksi.ksi_signo = signo;
mutex_enter(p->p_lock);
kpsignal2(p, &ksi);
mutex_exit(p->p_lock);
}
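/*
* kpsignal:
*
* Send a signal to a process. If 'data' is supplied, record the file
* descriptor referencing it (e.g. a socket) in ksi_fd before delivery.
*/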
void
kpsignal(struct proc *p, ksiginfo_t *ksi, void *data)
{
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
if ((p->p_sflag & PS_WEXIT) == 0 && data) {
size_t fd;
filedesc_t *fdp = p->p_fd;
/* XXXSMP locking */
ksi->ksi_fd = -1;
dt = atomic_load_consume(&fdp->fd_dt);
for (fd = 0; fd < dt->dt_nfiles; fd++) {
if ((ff = dt->dt_ff[fd]) == NULL)
continue;
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL)
continue;
if (fp->f_data == data) {
ksi->ksi_fd = fd;
break;
}
}
}
mutex_enter(p->p_lock);
kpsignal2(p, ksi);
mutex_exit(p->p_lock);
}
/*
* sigismasked:
*
* Returns true if signal is ignored or masked for the specified LWP.
*/
int
sigismasked(struct lwp *l, int sig)
{
struct proc *p = l->l_proc;
return sigismember(&p->p_sigctx.ps_sigignore, sig) ||
sigismember(&l->l_sigmask, sig);
}
/*
* sigpost:
*
* Post a pending signal to an LWP. Returns non-zero if the LWP may
* be able to take the signal.
*/
static int
sigpost(struct lwp *l, sig_t action, int prop, int sig)
{
int rv, masked;
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
/*
* If the LWP is on the way out, sigclear() will be busy draining all
* pending signals. Don't give it more.
*/
if (l->l_stat == LSZOMB)
return 0;
SDT_PROBE(proc, kernel, , signal__send, l, p, sig, 0, 0);
lwp_lock(l);
if (__predict_false((l->l_flag & LW_DBGSUSPEND) != 0)) {
if ((prop & SA_KILL) != 0)
l->l_flag &= ~LW_DBGSUSPEND;
else {
lwp_unlock(l);
return 0;
}
}
/*
* Have the LWP check for signals. This ensures that even if no LWP
* is found to take the signal immediately, it should be taken soon.
*/
signotify(l);
/*
* SIGCONT can be masked, but if LWP is stopped, it needs restart.
* Note: SIGKILL and SIGSTOP cannot be masked.
*/
masked = sigismember(&l->l_sigmask, sig);
if (masked && ((prop & SA_CONT) == 0 || l->l_stat != LSSTOP)) {
lwp_unlock(l);
return 0;
}
/*
* If killing the process, make it run fast.
*/
if (__predict_false((prop & SA_KILL) != 0) &&
action == SIG_DFL && l->l_priority < MAXPRI_USER) {
KASSERT(l->l_class == SCHED_OTHER);
lwp_changepri(l, MAXPRI_USER);
}
/*
* If the LWP is running or on a run queue, then we win. If it's
* sleeping interruptibly, wake it and make it take the signal. If
* the sleep isn't interruptible, then the chances are it will get
* to see the signal soon anyhow. If suspended, it can't take the
* signal right now. If it's LWP private or for all LWPs, save it
* for later; otherwise punt.
*/
rv = 0;
switch (l->l_stat) {
case LSRUN:
case LSONPROC:
rv = 1;
break;
case LSSLEEP:
if ((l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
}
break;
case LSSUSPENDED:
if ((prop & SA_KILL) != 0 && (l->l_flag & LW_WCORE) != 0) {
/* lwp_continue() will release the lock. */
lwp_continue(l);
return 1;
}
break;
case LSSTOP:
if ((prop & SA_STOP) != 0)
break;
/*
* If the LWP is stopped and we are sending a continue
* signal, then start it again.
*/
if ((prop & SA_CONT) != 0) {
if (l->l_wchan != NULL) {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
rv = 1;
break;
}
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
} else if (l->l_wchan == NULL || (l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
}
break;
default:
break;
}
lwp_unlock(l);
return rv;
}
/*
* Notify an LWP that it has a pending signal.
*/
void
signotify(struct lwp *l)
{
KASSERT(lwp_locked(l, NULL));
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
}
/*
* Find an LWP within process p that is waiting on signal ksi, and hand
* it on.
*/
static int
sigunwait(struct proc *p, const ksiginfo_t *ksi)
{
struct lwp *l;
int signo;
KASSERT(mutex_owned(p->p_lock));
signo = ksi->ksi_signo;
if (ksi->ksi_lid != 0) {
/*
* Signal came via _lwp_kill(). Find the LWP and see if
* it's interested.
*/
if ((l = lwp_find(p, ksi->ksi_lid)) == NULL)
return 0;
if (l->l_sigwaited == NULL ||
!sigismember(&l->l_sigwaitset, signo))
return 0;
} else {
/*
* Look for any LWP that may be interested.
*/
LIST_FOREACH(l, &p->p_sigwaiters, l_sigwaiter) {
KASSERT(l->l_sigwaited != NULL);
if (sigismember(&l->l_sigwaitset, signo))
break;
}
}
if (l != NULL) {
l->l_sigwaited->ksi_info = ksi->ksi_info;
l->l_sigwaited = NULL;
LIST_REMOVE(l, l_sigwaiter);
cv_signal(&l->l_sigcv);
return 1;
}
return 0;
}
/*
* Send the signal to the process. If the signal has an action, the action
* is usually performed by the target process rather than the caller; we add
* the signal to the set of pending signals for the process.
*
* Exceptions:
* o When a stop signal is sent to a sleeping process that takes the
* default action, the process is stopped without awakening it.
* o SIGCONT restarts stopped processes (or puts them back to sleep)
* regardless of the signal action (eg, blocked or ignored).
*
* Other ignored signals are discarded immediately.
*/
int
kpsignal2(struct proc *p, ksiginfo_t *ksi)
{
int prop, signo = ksi->ksi_signo;
struct lwp *l = NULL;
ksiginfo_t *kp;
lwpid_t lid;
sig_t action;
bool toall;
bool traced;
int error = 0;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0);
KASSERT(signo > 0);
KASSERT(signo < NSIG);
/*
* If the process is being created by fork, is a zombie or is
* exiting, then just drop the signal here and bail out.
*/
if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
return 0;
/*
* Notify any interested parties of the signal.
*/
KNOTE(&p->p_klist, NOTE_SIGNAL | signo);
/*
* Some signals including SIGKILL must act on the entire process.
*/
kp = NULL;
prop = sigprop[signo];
toall = ((prop & SA_TOALL) != 0);
lid = toall ? 0 : ksi->ksi_lid;
traced = ISSET(p->p_slflag, PSL_TRACED) &&
!sigismember(&p->p_sigctx.ps_sigpass, signo);
/*
* If proc is traced, always give parent a chance.
*/
if (traced) {
action = SIG_DFL;
if (lid == 0) {
/*
* If the process is being traced and the signal
* is being caught, make sure to save any ksiginfo.
*/
if ((kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL)
goto discard;
if ((error = sigput(&p->p_sigpend, p, kp)) != 0)
goto out;
}
} else {
/*
* If the signal is being ignored, then drop it. Note: we
* don't set SIGCONT in ps_sigignore, and if it is set to
* SIG_IGN, action will be SIG_DFL here.
*/
if (sigismember(&p->p_sigctx.ps_sigignore, signo))
goto discard;
else if (sigismember(&p->p_sigctx.ps_sigcatch, signo))
action = SIG_CATCH;
else {
action = SIG_DFL;
/*
* If sending a tty stop signal to a member of an
* orphaned process group, discard the signal here if
* the action is default; don't stop the process below
* if sleeping, and don't clear any pending SIGCONT.
*/
if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0)
goto discard;
if (prop & SA_KILL && p->p_nice > NZERO)
p->p_nice = NZERO;
}
}
/*
* If stopping or continuing a process, discard any pending
* signals that would do the inverse.
*/
if ((prop & (SA_CONT | SA_STOP)) != 0) {
ksiginfoq_t kq;
ksiginfo_queue_init(&kq);
if ((prop & SA_CONT) != 0)
sigclear(&p->p_sigpend, &stopsigmask, &kq);
if ((prop & SA_STOP) != 0)
sigclear(&p->p_sigpend, &contsigmask, &kq);
ksiginfo_queue_drain(&kq); /* XXXSMP */
}
/*
* If the signal doesn't have SA_CANTMASK (no override for SIGKILL,
* please!), check if any LWPs are waiting on it. If yes, pass on
* the signal info. The signal won't be processed further here.
*/
if ((prop & SA_CANTMASK) == 0 && !LIST_EMPTY(&p->p_sigwaiters) &&
p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0 &&
sigunwait(p, ksi))
goto discard;
/*
* XXXSMP Should be allocated by the caller, we're holding locks
* here.
*/
if (kp == NULL && (kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL)
goto discard;
/*
* LWP private signals are easy - just find the LWP and post
* the signal to it.
*/
if (lid != 0) {
l = lwp_find(p, lid);
if (l != NULL) {
if ((error = sigput(&l->l_sigpend, p, kp)) != 0)
goto out;
membar_producer();
if (sigpost(l, action, prop, kp->ksi_signo) != 0)
signo = -1;
}
goto out;
}
/*
* Some signals go to all LWPs, even if posted with _lwp_kill()
* or for an SA process.
*/
if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) {
if (traced)
goto deliver;
/*
* If SIGCONT is default (or ignored) and process is
* asleep, we are finished; the process should not
* be awakened.
*/
if ((prop & SA_CONT) != 0 && action == SIG_DFL)
goto out;
} else {
/*
* Process is stopped or stopping.
* - If traced, then no action is needed, unless killing.
* - Run the process only if sending SIGCONT or SIGKILL.
*/
if (traced && signo != SIGKILL) {
goto out;
}
if ((prop & SA_CONT) != 0 || signo == SIGKILL) {
/*
* Re-adjust p_nstopchild if the process was
* stopped but not yet collected by its parent.
*/
if (p->p_stat == SSTOP && !p->p_waited)
p->p_pptr->p_nstopchild--;
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
if (traced) {
KASSERT(signo == SIGKILL);
goto deliver;
}
/*
* Do not make signal pending if SIGCONT is default.
*
* If the process catches SIGCONT, let it handle the
* signal itself (if waiting on event - process runs,
* otherwise continues sleeping).
*/
if ((prop & SA_CONT) != 0) {
p->p_xsig = SIGCONT;
p->p_sflag |= PS_CONTINUED;
child_psignal(p, 0);
if (action == SIG_DFL) {
KASSERT(signo != SIGKILL);
goto deliver;
}
}
} else if ((prop & SA_STOP) != 0) {
/*
* Already stopped, don't need to stop again.
* (If we did the shell could get confused.)
*/
goto out;
}
}
/*
* Make signal pending.
*/
KASSERT(!traced);
if ((error = sigput(&p->p_sigpend, p, kp)) != 0)
goto out;
deliver:
/*
* Before we set LW_PENDSIG on any LWP, ensure that the signal is
* visible on the per process list (for sigispending()). This
* is unlikely to be needed in practice, but...
*/
membar_producer();
/*
* Try to find an LWP that can take the signal.
*/
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (sigpost(l, action, prop, kp->ksi_signo) && !toall)
break;
}
signo = -1;
out:
/*
* If the ksiginfo wasn't used, then bin it. XXXSMP freeing memory
* with locks held. The caller should take care of this.
*/
ksiginfo_free(kp);
if (signo == -1)
return error;
discard:
SDT_PROBE(proc, kernel, , signal__discard, l, p, signo, 0, 0);
return error;
}
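/*
 * Deliver a signal to an LWP via the emulation specific sendsig method.
 * Called with p_lock held.
 */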
void
kpsendsig(struct lwp *l, const ksiginfo_t *ksi, const sigset_t *mask)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
(*p->p_emul->e_sendsig)(ksi, mask);
}
/*
 * Stop any LWPs sleeping interruptibly.
*/
static void
proc_stop_lwps(struct proc *p)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
KASSERT((p->p_sflag & PS_STOPPING) != 0);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
if (l->l_stat == LSSLEEP && (l->l_flag & LW_SINTR) != 0) {
l->l_stat = LSSTOP;
p->p_nrlwps--;
}
lwp_unlock(l);
}
}
/*
* Finish stopping of a process. Mark it stopped and notify the parent.
*
 * Note that child_psignal() drops p_lock briefly.
*/
static void
proc_stop_done(struct proc *p, int ppmask)
{
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT((p->p_sflag & PS_STOPPING) != 0);
KASSERT(p->p_nrlwps == 0 || p->p_nrlwps == 1);
KASSERT(p->p_nrlwps == 0 || p == curproc);
p->p_sflag &= ~PS_STOPPING;
p->p_stat = SSTOP;
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
/* child_psignal drops p_lock briefly. */
child_psignal(p, ppmask);
cv_broadcast(&p->p_pptr->p_waitcv);
}
/*
* Stop the current process and switch away to the debugger notifying
* an event specific to a traced process only.
*/
void
eventswitch(int code, int pe_report_event, int entity)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
struct sigacts *ps;
sigset_t *mask;
sig_t action;
ksiginfo_t ksi;
const int signo = SIGTRAP;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT(p->p_pptr != initproc);
KASSERT(l->l_stat == LSONPROC);
KASSERT(ISSET(p->p_slflag, PSL_TRACED));
KASSERT(!ISSET(l->l_flag, LW_SYSTEM));
KASSERT(p->p_nrlwps > 0);
KASSERT((code == TRAP_CHLD) || (code == TRAP_LWP) ||
(code == TRAP_EXEC));
KASSERT((code != TRAP_CHLD) || (entity > 1)); /* prevent pid1 */
KASSERT((code != TRAP_LWP) || (entity > 0));
repeat:
/*
* If we are exiting, demise now.
*
 * This avoids notifying the tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
if (pe_report_event == PTRACE_LWP_EXIT) {
/* Avoid double lwp_exit() and panic. */
return;
}
lwp_exit(l);
panic("eventswitch");
/* NOTREACHED */
}
/*
* If we are no longer traced, abandon this event signal.
*
* This avoids killing a process after detaching the debugger.
*/
if (__predict_false(!ISSET(p->p_slflag, PSL_TRACED))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
/*
 * If there's a pending SIGKILL, process it immediately.
*/
if (p->p_xsig == SIGKILL ||
sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
mutex_exit(&proc_lock);
sigswitch_unlock_and_switch_away(l);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
goto repeat;
}
KSI_INIT_TRAP(&ksi);
ksi.ksi_lid = l->l_lid;
ksi.ksi_signo = signo;
ksi.ksi_code = code;
ksi.ksi_pe_report_event = pe_report_event;
CTASSERT(sizeof(ksi.ksi_pe_other_pid) == sizeof(ksi.ksi_pe_lwp));
ksi.ksi_pe_other_pid = entity;
/* Needed for ktrace */
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
mask = &l->l_sigmask;
p->p_xsig = signo;
p->p_sigctx.ps_faked = true;
p->p_sigctx.ps_lwp = ksi.ksi_lid;
p->p_sigctx.ps_info = ksi.ksi_info;
sigswitch(0, signo, true);
if (code == TRAP_CHLD) {
mutex_enter(&proc_lock);
while (l->l_vforkwaiting)
cv_wait(&l->l_waitcv, &proc_lock);
mutex_exit(&proc_lock);
}
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, &ksi);
else
ktrpsig(signo, action, mask, &ksi);
}
}
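/*
 * As eventswitch(), but used to report TRAP_CHLD events on behalf of a
 * traced child: the event is dropped unless the process has both
 * PSL_TRACED and PSL_TRACEDCHILD set, and the original parent
 * (p_oppid) is passed as the entity.
 */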
void
eventswitchchild(struct proc *p, int code, int pe_report_event)
{
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) !=
(PSL_TRACED|PSL_TRACEDCHILD)) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
eventswitch(code, pe_report_event, p->p_oppid);
}
/*
* Stop the current process and switch away when being stopped or traced.
*/
static void
sigswitch(int ppmask, int signo, bool proc_lock_held)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_stat == LSONPROC);
KASSERT(p->p_nrlwps > 0);
if (proc_lock_held) {
KASSERT(mutex_owned(&proc_lock));
} else {
KASSERT(!mutex_owned(&proc_lock));
}
/*
* On entry we know that the process needs to stop. If it's
* the result of a 'sideways' stop signal that has been sourced
* through issignal(), then stop other LWPs in the process too.
*/
if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) {
KASSERT(signo != 0);
proc_stop(p, signo);
KASSERT(p->p_nrlwps > 0);
}
/*
* If we are the last live LWP, and the stop was a result of
* a new signal, then signal the parent.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
if (!proc_lock_held && !mutex_tryenter(&proc_lock)) {
mutex_exit(p->p_lock);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
}
if (p->p_nrlwps == 1 && (p->p_sflag & PS_STOPPING) != 0) {
/*
* Note that proc_stop_done() can drop
* p->p_lock briefly.
*/
proc_stop_done(p, ppmask);
}
mutex_exit(&proc_lock);
}
sigswitch_unlock_and_switch_away(l);
}
/*
* Unlock and switch away.
*/
static void
sigswitch_unlock_and_switch_away(struct lwp *l)
{
struct proc *p;
p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(!mutex_owned(&proc_lock));
KASSERT(l->l_stat == LSONPROC);
KASSERT(p->p_nrlwps > 0);
KASSERT(l->l_blcnt == 0);
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
p->p_nrlwps--;
lwp_lock(l);
KASSERT(l->l_stat == LSONPROC || l->l_stat == LSSLEEP);
l->l_stat = LSSTOP;
lwp_unlock(l);
}
mutex_exit(p->p_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
}
/*
* Check for a signal from the debugger.
*/
static int
sigchecktrace(void)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int signo;
KASSERT(mutex_owned(p->p_lock));
/* If there's a pending SIGKILL, process it immediately. */
if (sigismember(&p->p_sigpend.sp_set, SIGKILL))
return 0;
/*
* If we are no longer being traced, or the parent didn't
* give us a signal, or we're stopping, look for more signals.
*/
if ((p->p_slflag & PSL_TRACED) == 0 || p->p_xsig == 0 ||
(p->p_sflag & PS_STOPPING) != 0)
return 0;
/*
* If the new signal is being masked, look for other signals.
* `p->p_sigctx.ps_siglist |= mask' is done in setrunnable().
*/
signo = p->p_xsig;
p->p_xsig = 0;
if (sigismember(&l->l_sigmask, signo)) {
signo = 0;
}
return signo;
}
/*
 * If the current process has received a signal (one that should be caught
 * or should cause termination, and that should interrupt the current
 * syscall), return the signal number.
*
* Stop signals with default action are processed immediately, then cleared;
* they aren't returned. This is checked after each entry to the system for
* a syscall or trap.
*
* We will also return -1 if the process is exiting and the current LWP must
* follow suit.
*/
int
issignal(struct lwp *l)
{
struct proc *p;
int siglwp, signo, prop;
sigpend_t *sp;
sigset_t ss;
bool traced;
p = l->l_proc;
sp = NULL;
signo = 0;
KASSERT(p == curproc);
KASSERT(mutex_owned(p->p_lock));
for (;;) {
/* Discard any signals that we have decided not to take. */
if (signo != 0) {
(void)sigget(sp, NULL, signo, NULL);
}
/*
* If the process is stopped/stopping, then stop ourselves
* now that we're on the kernel/userspace boundary. When
* we awaken, check for a signal from the debugger.
*/
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
sigswitch_unlock_and_switch_away(l);
mutex_enter(p->p_lock);
continue;
} else if (p->p_stat == SACTIVE)
signo = sigchecktrace();
else
signo = 0;
/* Signals from the debugger are "out of band". */
sp = NULL;
/*
* If the debugger didn't provide a signal, find a pending
* signal from our set. Check per-LWP signals first, and
* then per-process.
*/
if (signo == 0) {
sp = &l->l_sigpend;
ss = sp->sp_set;
siglwp = l->l_lid;
if ((p->p_lflag & PL_PPWAIT) != 0)
sigminusset(&vforksigmask, &ss);
sigminusset(&l->l_sigmask, &ss);
if ((signo = firstsig(&ss)) == 0) {
sp = &p->p_sigpend;
ss = sp->sp_set;
siglwp = 0;
if ((p->p_lflag & PL_PPWAIT) != 0)
sigminusset(&vforksigmask, &ss);
sigminusset(&l->l_sigmask, &ss);
if ((signo = firstsig(&ss)) == 0) {
/*
* No signal pending - clear the
* indicator and bail out.
*/
lwp_lock(l);
l->l_flag &= ~LW_PENDSIG;
lwp_unlock(l);
sp = NULL;
break;
}
}
}
traced = ISSET(p->p_slflag, PSL_TRACED) &&
!sigismember(&p->p_sigctx.ps_sigpass, signo);
if (sp) {
			/*
			 * Overwrite the process' signal context to correspond
			 * to the currently reported LWP.  This is necessary
			 * for PT_GET_SIGINFO to report the correct signal when
			 * multiple LWPs have pending signals.  We do this only
			 * when the signal comes from the queue; for signals
			 * created by the debugger we assume it set the correct
			 * siginfo.
			 */
ksiginfo_t *ksi = TAILQ_FIRST(&sp->sp_info);
if (ksi) {
p->p_sigctx.ps_lwp = ksi->ksi_lid;
p->p_sigctx.ps_info = ksi->ksi_info;
} else {
p->p_sigctx.ps_lwp = siglwp;
memset(&p->p_sigctx.ps_info, 0,
sizeof(p->p_sigctx.ps_info));
p->p_sigctx.ps_info._signo = signo;
p->p_sigctx.ps_info._code = SI_NOINFO;
}
}
/*
* We should see pending but ignored signals only if
* we are being traced.
*/
if (sigismember(&p->p_sigctx.ps_sigignore, signo) &&
!traced) {
/* Discard the signal. */
continue;
}
/*
* If traced, always stop, and stay stopped until released
 * by the debugger.  If our parent is our debugger waiting
* for us and we vforked, don't hang as we could deadlock.
*/
if (traced && signo != SIGKILL &&
!(ISSET(p->p_lflag, PL_PPWAIT) &&
(p->p_pptr == p->p_opptr))) {
/*
* Take the signal, but don't remove it from the
* siginfo queue, because the debugger can send
* it later.
*/
if (sp)
sigdelset(&sp->sp_set, signo);
p->p_xsig = signo;
/* Handling of signal trace */
sigswitch(0, signo, false);
mutex_enter(p->p_lock);
/* Check for a signal from the debugger. */
if ((signo = sigchecktrace()) == 0)
continue;
/* Signals from the debugger are "out of band". */
sp = NULL;
}
prop = sigprop[signo];
/*
* Decide whether the signal should be returned.
*/
switch ((long)SIGACTION(p, signo).sa_handler) {
case (long)SIG_DFL:
/*
* Don't take default actions on system processes.
*/
if (p->p_pid <= 1) {
#ifdef DIAGNOSTIC
/*
* Are you sure you want to ignore SIGSEGV
* in init? XXX
*/
printf_nolog("Process (pid %d) got sig %d\n",
p->p_pid, signo);
#endif
continue;
}
/*
* If there is a pending stop signal to process with
* default action, stop here, then clear the signal.
 * However, if the process is a member of an orphaned
* process group, ignore tty stop signals.
*/
if (prop & SA_STOP) {
/*
* XXX Don't hold proc_lock for p_lflag,
* but it's not a big deal.
*/
if ((traced &&
!(ISSET(p->p_lflag, PL_PPWAIT) &&
(p->p_pptr == p->p_opptr))) ||
((p->p_lflag & PL_ORPHANPG) != 0 &&
prop & SA_TTYSTOP)) {
/* Ignore the signal. */
continue;
}
/* Take the signal. */
(void)sigget(sp, NULL, signo, NULL);
p->p_xsig = signo;
p->p_sflag &= ~PS_CONTINUED;
signo = 0;
sigswitch(PS_NOCLDSTOP, p->p_xsig, false);
mutex_enter(p->p_lock);
} else if (prop & SA_IGNORE) {
/*
* Except for SIGCONT, shouldn't get here.
* Default action is to ignore; drop it.
*/
continue;
}
break;
case (long)SIG_IGN:
#ifdef DEBUG_ISSIGNAL
/*
 * Masking above should prevent us from ever trying
 * to take action on an ignored signal other
 * than SIGCONT, unless the process is traced.
*/
if ((prop & SA_CONT) == 0 && !traced)
printf_nolog("issignal\n");
#endif
continue;
default:
/*
* This signal has an action, let postsig() process
* it.
*/
break;
}
break;
}
l->l_sigpendset = sp;
return signo;
}
/*
* Take the action for the specified signal
* from the current set of pending signals.
*/
void
postsig(int signo)
{
struct lwp *l;
struct proc *p;
struct sigacts *ps;
sig_t action;
sigset_t *returnmask;
ksiginfo_t ksi;
l = curlwp;
p = l->l_proc;
ps = p->p_sigacts;
KASSERT(mutex_owned(p->p_lock));
KASSERT(signo > 0);
/*
* Set the new mask value and also defer further occurrences of this
* signal.
*
* Special case: user has done a sigsuspend. Here the current mask is
* not of interest, but rather the mask from before the sigsuspend is
* what we want restored after the signal processing is completed.
*/
if (l->l_sigrestore) {
returnmask = &l->l_sigoldmask;
l->l_sigrestore = 0;
} else
returnmask = &l->l_sigmask;
/*
* Commit to taking the signal before releasing the mutex.
*/
action = SIGACTION_PS(ps, signo).sa_handler;
l->l_ru.ru_nsignals++;
if (l->l_sigpendset == NULL) {
/* From the debugger */
if (p->p_sigctx.ps_faked &&
signo == p->p_sigctx.ps_info._signo) {
KSI_INIT(&ksi);
ksi.ksi_info = p->p_sigctx.ps_info;
ksi.ksi_lid = p->p_sigctx.ps_lwp;
p->p_sigctx.ps_faked = false;
} else {
if (!siggetinfo(&l->l_sigpend, &ksi, signo))
(void)siggetinfo(&p->p_sigpend, &ksi, signo);
}
} else
sigget(l->l_sigpendset, &ksi, signo, NULL);
if (ktrpoint(KTR_PSIG)) {
mutex_exit(p->p_lock);
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action,
returnmask, &ksi);
else
ktrpsig(signo, action, returnmask, &ksi);
mutex_enter(p->p_lock);
}
SDT_PROBE(proc, kernel, , signal__handle, signo, &ksi, action, 0, 0);
if (action == SIG_DFL) {
/*
* Default action, where the default is to kill
* the process. (Other cases were ignored above.)
*/
sigexit(l, signo);
return;
}
/*
* If we get here, the signal must be caught.
*/
#ifdef DIAGNOSTIC
if (action == SIG_IGN || sigismember(&l->l_sigmask, signo))
panic("postsig action");
#endif
kpsendsig(l, &ksi, returnmask);
}
/*
* sendsig:
*
* Default signal delivery method for NetBSD.
*/
void
sendsig(const struct ksiginfo *ksi, const sigset_t *mask)
{
struct sigacts *sa;
int sig;
sig = ksi->ksi_signo;
sa = curproc->p_sigacts;
switch (sa->sa_sigdesc[sig].sd_vers) {
case __SIGTRAMP_SIGCODE_VERSION:
#ifdef __HAVE_STRUCT_SIGCONTEXT
case __SIGTRAMP_SIGCONTEXT_VERSION_MIN ...
__SIGTRAMP_SIGCONTEXT_VERSION_MAX:
/* Compat for 1.6 and earlier. */
MODULE_HOOK_CALL_VOID(sendsig_sigcontext_16_hook, (ksi, mask),
break);
return;
#endif /* __HAVE_STRUCT_SIGCONTEXT */
case __SIGTRAMP_SIGINFO_VERSION_MIN ...
__SIGTRAMP_SIGINFO_VERSION_MAX:
sendsig_siginfo(ksi, mask);
return;
default:
break;
}
printf("sendsig: bad version %d\n", sa->sa_sigdesc[sig].sd_vers);
sigexit(curlwp, SIGILL);
}
/*
* sendsig_reset:
*
* Reset the signal action. Called from emulation specific sendsig()
* before unlocking to deliver the signal.
*/
void
sendsig_reset(struct lwp *l, int signo)
{
struct proc *p = l->l_proc;
struct sigacts *ps = p->p_sigacts;
KASSERT(mutex_owned(p->p_lock));
p->p_sigctx.ps_lwp = 0;
memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info));
mutex_enter(&ps->sa_mutex);
sigplusset(&SIGACTION_PS(ps, signo).sa_mask, &l->l_sigmask);
if (SIGACTION_PS(ps, signo).sa_flags & SA_RESETHAND) {
sigdelset(&p->p_sigctx.ps_sigcatch, signo);
if (signo != SIGCONT && sigprop[signo] & SA_IGNORE)
sigaddset(&p->p_sigctx.ps_sigignore, signo);
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
}
mutex_exit(&ps->sa_mutex);
}
/*
 * Kill process p for the stated reason.
*/
void
killproc(struct proc *p, const char *why)
{
KASSERT(mutex_owned(&proc_lock));
log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why);
uprintf_locked("sorry, pid %d was killed: %s\n", p->p_pid, why);
psignal(p, SIGKILL);
}
/*
* Force the current process to exit with the specified signal, dumping core
* if appropriate. We bypass the normal tests for masked and caught
* signals, allowing unrecoverable failures to terminate the process without
* changing signal state. Mark the accounting record with the signal
* termination. If dumping core, save the signal number for the debugger.
* Calls exit and does not return.
*/
void
sigexit(struct lwp *l, int signo)
{
int exitsig, error, docore;
struct proc *p;
struct lwp *t;
p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_blcnt == 0);
/*
* Don't permit coredump() multiple times in the same process.
* Call back into sigexit, where we will be suspended until
* the deed is done. Note that this is a recursive call, but
* LW_WCORE will prevent us from coming back this way.
*/
if ((p->p_sflag & PS_WCORE) != 0) {
lwp_lock(l);
l->l_flag |= (LW_WCORE | LW_WEXIT | LW_WSUSPEND);
lwp_need_userret(l);
lwp_unlock(l);
mutex_exit(p->p_lock);
lwp_userret(l);
panic("sigexit 1");
/* NOTREACHED */
}
/* If process is already on the way out, then bail now. */
if ((p->p_sflag & PS_WEXIT) != 0) {
mutex_exit(p->p_lock);
lwp_exit(l);
panic("sigexit 2");
/* NOTREACHED */
}
/*
* Prepare all other LWPs for exit. If dumping core, suspend them
* so that their registers are available long enough to be dumped.
*/
if ((docore = (sigprop[signo] & SA_CORE)) != 0) {
p->p_sflag |= PS_WCORE;
for (;;) {
LIST_FOREACH(t, &p->p_lwps, l_sibling) {
lwp_lock(t);
if (t == l) {
t->l_flag &=
~(LW_WSUSPEND | LW_DBGSUSPEND);
lwp_unlock(t);
continue;
}
t->l_flag |= (LW_WCORE | LW_WEXIT);
lwp_need_userret(t);
lwp_suspend(l, t);
}
if (p->p_nrlwps == 1)
break;
/*
* Kick any LWPs sitting in lwp_wait1(), and wait
* for everyone else to stop before proceeding.
*/
p->p_nlwpwait++;
cv_broadcast(&p->p_lwpcv);
cv_wait(&p->p_lwpcv, p->p_lock);
p->p_nlwpwait--;
}
}
exitsig = signo;
p->p_acflag |= AXSIG;
memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info));
p->p_sigctx.ps_info._signo = signo;
p->p_sigctx.ps_info._code = SI_NOINFO;
if (docore) {
mutex_exit(p->p_lock);
MODULE_HOOK_CALL(coredump_hook, (l, NULL), enosys(), error);
if (kern_logsigexit) {
int uid = l->l_cred ?
(int)kauth_cred_geteuid(l->l_cred) : -1;
if (error)
log(LOG_INFO, lognocoredump, p->p_pid,
p->p_comm, uid, signo, error);
else
log(LOG_INFO, logcoredump, p->p_pid,
p->p_comm, uid, signo);
}
#ifdef PAX_SEGVGUARD
rw_enter(&exec_lock, RW_WRITER);
pax_segvguard(l, p->p_textvp, p->p_comm, true);
rw_exit(&exec_lock);
#endif /* PAX_SEGVGUARD */
/* Acquire the sched state mutex. exit1() will release it. */
mutex_enter(p->p_lock);
if (error == 0)
p->p_sflag |= PS_COREDUMP;
}
/* No longer dumping core. */
p->p_sflag &= ~PS_WCORE;
exit1(l, 0, exitsig);
/* NOTREACHED */
}
/*
 * Since the "real" code may (or may not) be present in a loadable module,
 * we provide routines here which call the module hooks.
*/
int
coredump_netbsd(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_netbsd_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_netbsd32(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_netbsd32_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_elf32(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_elf32_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_elf64(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_elf64_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
/*
 * Put process 'p' into the stopped state and optionally notify the parent.
*/
void
proc_stop(struct proc *p, int signo)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
/*
* First off, set the stopping indicator and bring all sleeping
* LWPs to a halt so they are included in p->p_nrlwps. We mustn't
* unlock between here and the p->p_nrlwps check below.
*/
p->p_sflag |= PS_STOPPING;
membar_producer();
proc_stop_lwps(p);
/*
* If there are no LWPs available to take the signal, then we
* signal the parent process immediately. Otherwise, the last
* LWP to stop will take care of it.
*/
if (p->p_nrlwps == 0) {
proc_stop_done(p, PS_NOCLDSTOP);
} else {
/*
* Have the remaining LWPs come to a halt, and trigger
* proc_stop_callout() to ensure that they do.
*/
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
sigpost(l, SIG_DFL, SA_STOP, signo);
}
callout_schedule(&proc_stop_ch, 1);
}
}
/*
* When stopping a process, we do not immediately set sleeping LWPs stopped,
* but wait for them to come to a halt at the kernel-user boundary. This is
* to allow LWPs to release any locks that they may hold before stopping.
*
 * Non-interruptible sleeps can be long, and there is the potential for an
 * LWP to begin sleeping interruptibly soon after the process has been set
* stopping (PS_STOPPING). These LWPs will not notice that the process is
* stopping, and so complete halt of the process and the return of status
* information to the parent could be delayed indefinitely.
*
* To handle this race, proc_stop_callout() runs once per tick while there
* are stopping processes in the system. It sets LWPs that are sleeping
 * interruptibly into the LSSTOP state.
*
* Note that we are not concerned about keeping all LWPs stopped while the
* process is stopped: stopped LWPs can awaken briefly to handle signals.
* What we do need to ensure is that all LWPs in a stopping process have
* stopped at least once, so that notification can be sent to the parent
* process.
*/
static void
proc_stop_callout(void *cookie)
{
bool more, restart;
struct proc *p;
(void)cookie;
do {
restart = false;
more = false;
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
mutex_enter(p->p_lock);
if ((p->p_sflag & PS_STOPPING) == 0) {
mutex_exit(p->p_lock);
continue;
}
			/* Stop any LWPs sleeping interruptibly. */
proc_stop_lwps(p);
if (p->p_nrlwps == 0) {
/*
* We brought the process to a halt.
* Mark it as stopped and notify the
* parent.
*
* Note that proc_stop_done() will
* drop p->p_lock briefly.
* Arrange to restart and check
* all processes again.
*/
restart = true;
proc_stop_done(p, PS_NOCLDSTOP);
} else
more = true;
mutex_exit(p->p_lock);
if (restart)
break;
}
mutex_exit(&proc_lock);
} while (restart);
/*
* If we noted processes that are stopping but still have
* running LWPs, then arrange to check again in 1 tick.
*/
if (more)
callout_schedule(&proc_stop_ch, 1);
}
/*
* Given a process in state SSTOP, set the state back to SACTIVE and
* move LSSTOP'd LWPs to LSSLEEP or make them runnable.
*/
void
proc_unstop(struct proc *p)
{
struct lwp *l;
int sig;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
sig = p->p_xsig;
if (!p->p_waited)
p->p_pptr->p_nstopchild--;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
if (l->l_stat != LSSTOP || (l->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(l);
continue;
}
if (l->l_wchan == NULL) {
setrunnable(l);
continue;
}
if (sig && (l->l_flag & LW_SINTR) != 0) {
setrunnable(l);
sig = 0;
} else {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
lwp_unlock(l);
}
}
}
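/*
 * Stop the current LWP and report a syscall entry/exit event
 * (TRAP_SCE/TRAP_SCX) to the debugger with SIGTRAP.
 */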
void
proc_stoptrace(int trapno, int sysnum, const register_t args[],
const register_t *ret, int error)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
struct sigacts *ps;
sigset_t *mask;
sig_t action;
ksiginfo_t ksi;
size_t i, sy_narg;
const int signo = SIGTRAP;
KASSERT((trapno == TRAP_SCE) || (trapno == TRAP_SCX));
KASSERT(p->p_pptr != initproc);
KASSERT(ISSET(p->p_slflag, PSL_TRACED));
KASSERT(ISSET(p->p_slflag, PSL_SYSCALL));
sy_narg = p->p_emul->e_sysent[sysnum].sy_narg;
KSI_INIT_TRAP(&ksi);
ksi.ksi_lid = l->l_lid;
ksi.ksi_signo = signo;
ksi.ksi_code = trapno;
ksi.ksi_sysnum = sysnum;
if (trapno == TRAP_SCE) {
ksi.ksi_retval[0] = 0;
ksi.ksi_retval[1] = 0;
ksi.ksi_error = 0;
} else {
ksi.ksi_retval[0] = ret[0];
ksi.ksi_retval[1] = ret[1];
ksi.ksi_error = error;
}
memset(ksi.ksi_args, 0, sizeof(ksi.ksi_args));
for (i = 0; i < sy_narg; i++)
ksi.ksi_args[i] = args[i];
mutex_enter(p->p_lock);
repeat:
/*
* If we are exiting, demise now.
*
	 * This avoids notifying the tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
lwp_exit(l);
panic("proc_stoptrace");
/* NOTREACHED */
}
/*
	 * If there's a pending SIGKILL, process it immediately.
*/
if (p->p_xsig == SIGKILL ||
sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
mutex_exit(p->p_lock);
return;
}
/*
* If we are no longer traced, abandon this event signal.
*
* This avoids killing a process after detaching the debugger.
*/
if (__predict_false(!ISSET(p->p_slflag, PSL_TRACED))) {
mutex_exit(p->p_lock);
return;
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
sigswitch_unlock_and_switch_away(l);
mutex_enter(p->p_lock);
goto repeat;
}
/* Needed for ktrace */
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
mask = &l->l_sigmask;
p->p_xsig = signo;
p->p_sigctx.ps_lwp = ksi.ksi_lid;
p->p_sigctx.ps_info = ksi.ksi_info;
sigswitch(0, signo, false);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, &ksi);
else
ktrpsig(signo, action, mask, &ksi);
}
}
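/*
 * Attach a signal knote to the current process' klist.
 */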
static int
filt_sigattach(struct knote *kn)
{
struct proc *p = curproc;
kn->kn_obj = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
mutex_enter(p->p_lock);
klist_insert(&p->p_klist, kn);
mutex_exit(p->p_lock);
return 0;
}
static void
filt_sigdetach(struct knote *kn)
{
struct proc *p = kn->kn_obj;
mutex_enter(p->p_lock);
klist_remove(&p->p_klist, kn);
mutex_exit(p->p_lock);
}
/*
* Signal knotes are shared with proc knotes, so we apply a mask to
* the hint in order to differentiate them from process hints. This
* could be avoided by using a signal-specific knote list, but probably
* isn't worth the trouble.
*/
static int
filt_signal(struct knote *kn, long hint)
{
if (hint & NOTE_SIGNAL) {
hint &= ~NOTE_SIGNAL;
if (kn->kn_id == hint)
kn->kn_data++;
}
return (kn->kn_data != 0);
}
const struct filterops sig_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_sigattach,
.f_detach = filt_sigdetach,
.f_event = filt_signal,
};
/* $NetBSD: uvm_physseg.c,v 1.20 2024/01/13 09:44:42 tnn Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.h 7.3 (Berkeley) 4/21/91
* from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Consolidated API from uvm_page.c and others.
* Consolidated and designed by Cherry G. Mathew <cherry@zyx.in>
* rbtree(3) backing implementation by:
* Santhosh N. Raju <santhosh.raju@gmail.com>
*/
#ifdef _KERNEL_OPT
#include "opt_uvm.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/extent.h>
#include <sys/kmem.h>
#include <uvm/uvm.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_param.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_physseg.h>
/*
* uvm_physseg: describes one segment of physical memory
*/
struct uvm_physseg {
/* used during RB tree lookup for PHYS_TO_VM_PAGE(). */
#if defined(UVM_HOTPLUG)
struct rb_node rb_node; /* tree information */
#endif
paddr_t start; /* PF# of first page in segment */
paddr_t end; /* (PF# of last page in segment) + 1 */
struct vm_page *pgs; /* vm_page structures (from start) */
/* less performance sensitive fields. */
paddr_t avail_start; /* PF# of first free page in segment */
paddr_t avail_end; /* (PF# of last free page in segment) +1 */
struct extent *ext; /* extent(9) structure to manage pgs[] */
int free_list; /* which free list they belong on */
u_long start_hint; /* start looking for free pages here */
#ifdef __HAVE_PMAP_PHYSSEG
struct pmap_physseg pmseg; /* pmap specific (MD) data */
#endif
};
/*
* These functions are reserved for uvm(9) internal use and are not
* exported in the header file uvm_physseg.h
*
* Thus they are redefined here.
*/
void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
/* returns a pgs array */
struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
#if defined(UVM_HOTPLUG) /* rbtree implementation */
#define HANDLE_TO_PHYSSEG_NODE(h) ((struct uvm_physseg *)(h))
#define PHYSSEG_NODE_TO_HANDLE(u) ((uvm_physseg_t)(u))
struct uvm_physseg_graph {
struct rb_tree rb_tree; /* Tree for entries */
int nentries; /* Number of entries */
} __aligned(COHERENCY_UNIT);
static struct uvm_physseg_graph uvm_physseg_graph __read_mostly;
/*
* Note on kmem(9) allocator usage:
* We take the conservative approach that plug/unplug are allowed to
* fail in high memory stress situations.
*
* We want to avoid re-entrant situations in which one plug/unplug
* operation is waiting on a previous one to complete, since this
* makes the design more complicated than necessary.
*
* We may review this and change its behaviour, once the use cases
* become more obvious.
*/
/*
* Special alloc()/free() functions for boot time support:
 * We assume that alloc() at boot time is only for new 'vm_physseg's.
* This allows us to use a static array for memory allocation at boot
* time. Thus we avoid using kmem(9) which is not ready at this point
* in boot.
*
* After kmem(9) is ready, we use it. We currently discard any free()s
* to this static array, since the size is small enough to be a
* trivial waste on all architectures we run on.
*/
static size_t nseg = 0;
static struct uvm_physseg uvm_physseg[VM_PHYSSEG_MAX];
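/*
 * Allocate backing store for uvm_physseg entries: from the static
 * boot-time array before uvm_page_init(), from kmem(9) afterwards.
 */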
static void *
uvm_physseg_alloc(size_t sz)
{
/*
* During boot time, we only support allocating vm_physseg
* entries from the static array.
* We need to assert for this.
*/
if (__predict_false(uvm.page_init_done == false)) {
if (sz % sizeof(struct uvm_physseg))
panic("%s: tried to alloc size other than multiple"
" of struct uvm_physseg at boot\n", __func__);
size_t n = sz / sizeof(struct uvm_physseg);
nseg += n;
KASSERT(nseg > 0);
KASSERT(nseg <= VM_PHYSSEG_MAX);
return &uvm_physseg[nseg - n];
}
return kmem_zalloc(sz, KM_NOSLEEP);
}
static void
uvm_physseg_free(void *p, size_t sz)
{
/*
	 * This is a bit tricky.  We do allow simulation of free()
	 * during boot (e.g. when MD code is "steal"ing memory and the
	 * segment has been exhausted, and thus needs to be free()-ed).
	 * free() also complicates things because we leak the
	 * free()-ed entries; calling code therefore can't assume that
	 * free()-ed memory is available for alloc() again at boot time.
	 *
	 * Thus we can't explicitly disallow free()s during
	 * boot time.  However, the same restriction as for alloc()
	 * applies to free(): we only allow uvm_physseg related free()s
	 * via this function during boot time.
*/
if (__predict_false(uvm.page_init_done == false)) {
if (sz % sizeof(struct uvm_physseg))
panic("%s: tried to free size other than struct uvm_physseg"
" at boot\n", __func__);
}
/*
* Could have been in a single if(){} block - split for
* clarity
*/
if ((struct uvm_physseg *)p >= uvm_physseg &&
(struct uvm_physseg *)p < (uvm_physseg + VM_PHYSSEG_MAX)) {
if (sz % sizeof(struct uvm_physseg))
panic("%s: tried to free() other than struct uvm_physseg"
" from static array\n", __func__);
if ((sz / sizeof(struct uvm_physseg)) >= VM_PHYSSEG_MAX)
panic("%s: tried to free() the entire static array!", __func__);
return; /* Nothing to free */
}
kmem_free(p, sz);
}
/* XXX: Multi page size */
bool
uvm_physseg_plug(paddr_t pfn, size_t pages, uvm_physseg_t *psp)
{
int preload;
size_t slabpages;
struct uvm_physseg *ps, *current_ps = NULL;
struct vm_page *slab = NULL, *pgs = NULL;
#ifdef DEBUG
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pfn, &off);
ps = HANDLE_TO_PHYSSEG_NODE(upm);
if (ps != NULL) /* XXX; do we allow "update" plugs ? */
return false;
#endif
/*
* do we have room?
*/
ps = uvm_physseg_alloc(sizeof (struct uvm_physseg));
if (ps == NULL) {
printf("uvm_page_physload: unable to load physical memory "
"segment\n");
printf("\t%d segments allocated, ignoring 0x%"PRIxPADDR" -> 0x%"PRIxPADDR"\n",
VM_PHYSSEG_MAX, pfn, pfn + pages + 1);
printf("\tincrease VM_PHYSSEG_MAX\n");
return false;
}
/* span init */
ps->start = pfn;
ps->end = pfn + pages;
/*
* XXX: Ugly hack because uvmexp.npages accounts for only
* those pages in the segment included below as well - this
* should be legacy and removed.
*/
ps->avail_start = ps->start;
ps->avail_end = ps->end;
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
preload = 1; /* We are going to assume it is a preload */
RB_TREE_FOREACH(current_ps, &(uvm_physseg_graph.rb_tree)) {
		/* If there are non-NULL pages then we are not in a preload */
if (current_ps->pgs != NULL) {
preload = 0;
/* Try to scavenge from earlier unplug()s. */
pgs = uvm_physseg_seg_alloc_from_slab(current_ps, pages);
if (pgs != NULL) {
break;
}
}
}
/*
* if VM is already running, attempt to kmem_alloc vm_page structures
*/
if (!preload) {
if (pgs == NULL) { /* Brand new */
/* Iteratively try alloc down from uvmexp.npages */
for (slabpages = (size_t) uvmexp.npages; slabpages >= pages; slabpages--) {
slab = kmem_zalloc(sizeof *pgs * (long unsigned int)slabpages, KM_NOSLEEP);
if (slab != NULL)
break;
}
if (slab == NULL) {
uvm_physseg_free(ps, sizeof(struct uvm_physseg));
return false;
}
uvm_physseg_seg_chomp_slab(ps, slab, (size_t) slabpages);
/* We allocate enough for this plug */
pgs = uvm_physseg_seg_alloc_from_slab(ps, pages);
if (pgs == NULL) {
printf("unable to uvm_physseg_seg_alloc_from_slab() from backend\n");
return false;
}
} else {
/* Reuse scavenged extent */
ps->ext = current_ps->ext;
}
physmem += pages;
uvmpdpol_reinit();
} else { /* Boot time - see uvm_page.c:uvm_page_init() */
pgs = NULL;
ps->pgs = pgs;
}
/*
* now insert us in the proper place in uvm_physseg_graph.rb_tree
*/
current_ps = rb_tree_insert_node(&(uvm_physseg_graph.rb_tree), ps);
if (current_ps != ps) {
panic("uvm_page_physload: Duplicate address range detected!");
}
uvm_physseg_graph.nentries++;
/*
* uvm_pagefree() requires the PHYS_TO_VM_PAGE(pgs[i]) on the
* newly allocated pgs[] to return the correct value. This is
* a bit of a chicken and egg problem, since it needs
* uvm_physseg_find() to succeed. For this, the node needs to
* be inserted *before* uvm_physseg_init_seg() happens.
*
* During boot, this happens anyway, since
* uvm_physseg_init_seg() is called later on and separately
* from uvm_page.c:uvm_page_init().
* In the case of hotplug we need to ensure this.
*/
if (__predict_true(!preload))
uvm_physseg_init_seg(ps, pgs);
if (psp != NULL)
*psp = ps;
return true;
}
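/*
 * rb_tree(3) comparison callbacks: segments are ordered by start
 * address, and key lookups match the segment containing a given
 * page frame.
 */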
static int
uvm_physseg_compare_nodes(void *ctx, const void *nnode1, const void *nnode2)
{
const struct uvm_physseg *enode1 = nnode1;
const struct uvm_physseg *enode2 = nnode2;
KASSERT(enode1->start < enode2->start || enode1->start >= enode2->end);
KASSERT(enode2->start < enode1->start || enode2->start >= enode1->end);
if (enode1->start < enode2->start)
return -1;
if (enode1->start >= enode2->end)
return 1;
return 0;
}
static int
uvm_physseg_compare_key(void *ctx, const void *nnode, const void *pkey)
{
const struct uvm_physseg *enode = nnode;
const paddr_t pa = *(const paddr_t *) pkey;
	if (enode->start <= pa && pa < enode->end)
return 0;
if (enode->start < pa)
return -1;
if (enode->end > pa)
return 1;
return 0;
}
static const rb_tree_ops_t uvm_physseg_tree_ops = {
.rbto_compare_nodes = uvm_physseg_compare_nodes,
.rbto_compare_key = uvm_physseg_compare_key,
.rbto_node_offset = offsetof(struct uvm_physseg, rb_node),
.rbto_context = NULL
};
/*
* uvm_physseg_init: init the physmem
*
* => physmem unit should not be in use at this point
*/
void
uvm_physseg_init(void)
{
rb_tree_init(&(uvm_physseg_graph.rb_tree), &uvm_physseg_tree_ops);
uvm_physseg_graph.nentries = 0;
}
uvm_physseg_t
uvm_physseg_get_next(uvm_physseg_t upm)
{
/* next of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(upm) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (uvm_physseg_t) rb_tree_iterate(&(uvm_physseg_graph.rb_tree), upm,
RB_DIR_RIGHT);
}
uvm_physseg_t
uvm_physseg_get_prev(uvm_physseg_t upm)
{
/* prev of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(upm) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (uvm_physseg_t) rb_tree_iterate(&(uvm_physseg_graph.rb_tree), upm,
RB_DIR_LEFT);
}
uvm_physseg_t
uvm_physseg_get_last(void)
{
return (uvm_physseg_t) RB_TREE_MAX(&(uvm_physseg_graph.rb_tree));
}
uvm_physseg_t
uvm_physseg_get_first(void)
{
return (uvm_physseg_t) RB_TREE_MIN(&(uvm_physseg_graph.rb_tree));
}
paddr_t
uvm_physseg_get_highest_frame(void)
{
struct uvm_physseg *ps =
(uvm_physseg_t) RB_TREE_MAX(&(uvm_physseg_graph.rb_tree));
return ps->end - 1;
}
/*
* uvm_page_physunload: unload physical memory and return it to
* caller.
*/
bool
uvm_page_physunload(uvm_physseg_t upm, int freelist, paddr_t *paddrp)
{
struct uvm_physseg *seg;
if (__predict_true(uvm.page_init_done == true))
panic("%s: unload attempted after uvm_page_init()\n", __func__);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (seg->free_list != freelist) {
return false;
}
/*
* During cold boot, what we're about to unplug hasn't been
* put on the uvm freelist, nor has uvmexp.npages been
* updated. (This happens in uvm_page.c:uvm_page_init())
*
* For hotplug, we assume here that the pages being unloaded
* here are completely out of sight of uvm (ie; not on any uvm
* lists), and that uvmexp.npages has been suitably
* decremented before we're called.
*
* XXX: will avail_end == start if avail_start < avail_end?
*/
/* try from front */
if (seg->avail_start == seg->start &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_start);
return uvm_physseg_unplug(seg->avail_start, 1);
}
/* try from rear */
if (seg->avail_end == seg->end &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_end - 1);
return uvm_physseg_unplug(seg->avail_end - 1, 1);
}
return false;
}
bool
uvm_page_physunload_force(uvm_physseg_t upm, int freelist, paddr_t *paddrp)
{
struct uvm_physseg *seg;
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (__predict_true(uvm.page_init_done == true))
panic("%s: unload attempted after uvm_page_init()\n", __func__);
/* any room in this bank? */
if (seg->avail_start >= seg->avail_end) {
return false; /* nope */
}
*paddrp = ctob(seg->avail_start);
/* Always unplug from front */
return uvm_physseg_unplug(seg->avail_start, 1);
}
/*
* vm_physseg_find: find vm_physseg structure that belongs to a PA
*/
uvm_physseg_t
uvm_physseg_find(paddr_t pframe, psize_t *offp)
{
struct uvm_physseg * ps = NULL;
ps = rb_tree_find_node(&(uvm_physseg_graph.rb_tree), &pframe);
	if (ps != NULL && offp != NULL)
*offp = pframe - ps->start;
return ps;
}
#else /* UVM_HOTPLUG */
/*
* physical memory config is stored in vm_physmem.
*/
#define VM_PHYSMEM_PTR(i) (&vm_physmem[i])
#if VM_PHYSSEG_MAX == 1
#define VM_PHYSMEM_PTR_SWAP(i, j) /* impossible */
#else
#define VM_PHYSMEM_PTR_SWAP(i, j) \
do { vm_physmem[(i)] = vm_physmem[(j)]; } while (0)
#endif
#define HANDLE_TO_PHYSSEG_NODE(h) (VM_PHYSMEM_PTR((int)h))
#define PHYSSEG_NODE_TO_HANDLE(u) ((int)((vsize_t) (u - vm_physmem) / sizeof(struct uvm_physseg)))
/* XXXCDC: uvm.physmem */
static struct uvm_physseg vm_physmem[VM_PHYSSEG_MAX] __read_mostly;
/* XXXCDC: uvm.nphysseg */
static int vm_nphysseg __read_mostly = 0;
#define vm_nphysmem vm_nphysseg
void
uvm_physseg_init(void)
{
/* XXX: Provisioning for rb_tree related init(s) */
return;
}
int
uvm_physseg_get_next(uvm_physseg_t lcv)
{
/* next of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(lcv) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (lcv + 1);
}
int
uvm_physseg_get_prev(uvm_physseg_t lcv)
{
/* prev of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(lcv) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (lcv - 1);
}
int
uvm_physseg_get_last(void)
{
return (vm_nphysseg - 1);
}
int
uvm_physseg_get_first(void)
{
return 0;
}
paddr_t
uvm_physseg_get_highest_frame(void)
{
int lcv;
paddr_t last = 0;
struct uvm_physseg *ps;
for (lcv = 0; lcv < vm_nphysseg; lcv++) {
ps = VM_PHYSMEM_PTR(lcv);
if (last < ps->end)
last = ps->end;
}
return last;
}
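/*
 * Verify that we are still in the "preload" phase, i.e. that no
 * segment has had its vm_page array populated yet; panic otherwise.
 * Always returns NULL.
 */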
static struct vm_page *
uvm_post_preload_check(void)
{
int preload, lcv;
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++) {
if (VM_PHYSMEM_PTR(lcv)->pgs)
break;
}
preload = (lcv == vm_nphysmem);
/*
* if VM is already running, attempt to kmem_alloc vm_page structures
*/
if (!preload) {
panic("Tried to add RAM after uvm_page_init");
}
return NULL;
}
/*
* uvm_page_physunload: unload physical memory and return it to
* caller.
*/
bool
uvm_page_physunload(uvm_physseg_t psi, int freelist, paddr_t *paddrp)
{
int x;
struct uvm_physseg *seg;
uvm_post_preload_check();
seg = VM_PHYSMEM_PTR(psi);
if (seg->free_list != freelist) {
return false;
}
/* try from front */
if (seg->avail_start == seg->start &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_start);
seg->avail_start++;
seg->start++;
/* nothing left? nuke it */
if (seg->avail_start == seg->end) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
/* try from rear */
if (seg->avail_end == seg->end &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_end - 1);
seg->avail_end--;
seg->end--;
/* nothing left? nuke it */
if (seg->avail_end == seg->start) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
return false;
}
bool
uvm_page_physunload_force(uvm_physseg_t psi, int freelist, paddr_t *paddrp)
{
int x;
struct uvm_physseg *seg;
uvm_post_preload_check();
seg = VM_PHYSMEM_PTR(psi);
/* any room in this bank? */
if (seg->avail_start >= seg->avail_end) {
return false; /* nope */
}
*paddrp = ctob(seg->avail_start);
seg->avail_start++;
/* truncate! */
seg->start = seg->avail_start;
/* nothing left? nuke it */
if (seg->avail_start == seg->end) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
bool
uvm_physseg_plug(paddr_t pfn, size_t pages, uvm_physseg_t *psp)
{
int lcv;
struct vm_page *pgs;
struct uvm_physseg *ps;
#ifdef DEBUG
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pfn, &off);
if (uvm_physseg_valid_p(upm)) /* XXX; do we allow "update" plugs ? */
return false;
#endif
paddr_t start = pfn;
paddr_t end = pfn + pages;
paddr_t avail_start = start;
paddr_t avail_end = end;
if (uvmexp.pagesize == 0)
panic("uvm_page_physload: page size not set!");
/*
* do we have room?
*/
if (vm_nphysmem == VM_PHYSSEG_MAX) {
printf("uvm_page_physload: unable to load physical memory "
"segment\n");
printf("\t%d segments allocated, ignoring 0x%llx -> 0x%llx\n",
VM_PHYSSEG_MAX, (long long)start, (long long)end);
printf("\tincrease VM_PHYSSEG_MAX\n");
if (psp != NULL)
*psp = UVM_PHYSSEG_TYPE_INVALID_OVERFLOW;
return false;
}
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
pgs = uvm_post_preload_check();
/*
* now insert us in the proper place in vm_physmem[]
*/
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_RANDOM)
/* random: put it at the end (easy!) */
ps = VM_PHYSMEM_PTR(vm_nphysmem);
lcv = vm_nphysmem;
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
{
int x;
/* sort by address for binary search */
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++)
if (start < VM_PHYSMEM_PTR(lcv)->start)
break;
ps = VM_PHYSMEM_PTR(lcv);
/* move back other entries, if necessary ... */
for (x = vm_nphysmem ; x > lcv ; x--)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x - 1);
}
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
{
int x;
/* sort by largest segment first */
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++)
if ((end - start) >
(VM_PHYSMEM_PTR(lcv)->end - VM_PHYSMEM_PTR(lcv)->start))
break;
ps = VM_PHYSMEM_PTR(lcv);
/* move back other entries, if necessary ... */
for (x = vm_nphysmem ; x > lcv ; x--)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x - 1);
}
#else
panic("uvm_page_physload: unknown physseg strategy selected!");
#endif
ps->start = start;
ps->end = end;
ps->avail_start = avail_start;
ps->avail_end = avail_end;
ps->pgs = pgs;
vm_nphysmem++;
if (psp != NULL)
*psp = lcv;
return true;
}
/*
* when VM_PHYSSEG_MAX is 1, we can simplify these functions
*/
#if VM_PHYSSEG_MAX == 1
static inline int vm_physseg_find_contig(struct uvm_physseg *, int, paddr_t, psize_t *);
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
static inline int vm_physseg_find_bsearch(struct uvm_physseg *, int, paddr_t, psize_t *);
#else
static inline int vm_physseg_find_linear(struct uvm_physseg *, int, paddr_t, psize_t *);
#endif
/*
* vm_physseg_find: find vm_physseg structure that belongs to a PA
*/
inline int
uvm_physseg_find(paddr_t pframe, psize_t *offp)
{
#if VM_PHYSSEG_MAX == 1
return vm_physseg_find_contig(vm_physmem, vm_nphysseg, pframe, offp);
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
return vm_physseg_find_bsearch(vm_physmem, vm_nphysseg, pframe, offp);
#else
return vm_physseg_find_linear(vm_physmem, vm_nphysseg, pframe, offp);
#endif
}
#if VM_PHYSSEG_MAX == 1
static inline int
vm_physseg_find_contig(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* 'contig' case */
if (pframe >= segs[0].start && pframe < segs[0].end) {
if (offp)
*offp = pframe - segs[0].start;
return(0);
}
return(-1);
}
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
static inline int
vm_physseg_find_bsearch(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* binary search for it */
int start, len, guess;
/*
* if try is too large (thus target is less than try) we reduce
* the length to trunc(len/2) [i.e. everything smaller than "try"]
*
* if the try is too small (thus target is greater than try) then
* we set the new start to be (try + 1). this means we need to
* reduce the length to (round(len/2) - 1).
*
* note "adjust" below which takes advantage of the fact that
* (round(len/2) - 1) == trunc((len - 1) / 2)
* for any value of len we may have
*/
for (start = 0, len = nsegs ; len != 0 ; len = len / 2) {
guess = start + (len / 2); /* try in the middle */
/* start past our try? */
if (pframe >= segs[guess].start) {
/* was try correct? */
if (pframe < segs[guess].end) {
if (offp)
*offp = pframe - segs[guess].start;
return guess; /* got it */
}
start = guess + 1; /* next time, start here */
len--; /* "adjust" */
} else {
/*
* pframe before try, just reduce length of
* region, done in "for" loop
*/
}
}
return(-1);
}
#else
static inline int
vm_physseg_find_linear(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* linear search for it */
int lcv;
	for (lcv = 0; lcv < nsegs; lcv++) {
		if (pframe >= segs[lcv].start &&
		    pframe < segs[lcv].end) {
			if (offp)
				*offp = pframe - segs[lcv].start;
			return(lcv);	/* got it */
		}
	}
return(-1);
}
#endif
#endif /* UVM_HOTPLUG */
/*
* PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
* back from an I/O mapping (ugh!). used in some MD code as well. it can
* be prominent in flamegraphs, so optimise it and try to make it easy for
* the compiler by including next to the inline lookup routines.
*/
struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{
#if VM_PHYSSEG_STRAT != VM_PSTRAT_BSEARCH
/* 'contig' and linear cases */
KASSERT(vm_nphysseg > 0);
struct uvm_physseg *ps = &vm_physmem[0];
struct uvm_physseg *end = &vm_physmem[vm_nphysseg];
paddr_t pframe = atop(pa);
do {
		if (pframe >= ps->start && pframe < ps->end) {
			return &ps->pgs[pframe - ps->start];
		}
} while (VM_PHYSSEG_MAX > 1 && __predict_false(++ps < end));
return NULL;
#else
/* binary search for it */
paddr_t pf = atop(pa);
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pf, &off);
if (upm != UVM_PHYSSEG_TYPE_INVALID)
return uvm_physseg_get_pg(upm, off);
return(NULL);
#endif
}
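/*
 * uvm_physseg_valid_p: check whether a segment handle refers to a
 * valid segment.  After uvm_page_init() the segment must also have
 * its vm_page array attached.
 */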
bool
uvm_physseg_valid_p(uvm_physseg_t upm)
{
struct uvm_physseg *ps;
if (upm == UVM_PHYSSEG_TYPE_INVALID ||
upm == UVM_PHYSSEG_TYPE_INVALID_EMPTY ||
upm == UVM_PHYSSEG_TYPE_INVALID_OVERFLOW)
return false;
	/*
	 * This is the delicate init dance - validity checks have to
	 * work both before and after uvm_page_init().
	 */
if (uvm.page_init_done != true)
return true;
ps = HANDLE_TO_PHYSSEG_NODE(upm);
/* Extra checks needed only post uvm_page_init() */
if (ps->pgs == NULL)
return false;
/* XXX: etc. */
return true;
}
/*
* Boot protocol dictates that these must be able to return partially
* initialised segments.
*/
paddr_t
uvm_physseg_get_start(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->start;
}
paddr_t
uvm_physseg_get_end(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->end;
}
paddr_t
uvm_physseg_get_avail_start(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->avail_start;
}
#if defined(UVM_PHYSSEG_LEGACY)
void
uvm_physseg_set_avail_start(uvm_physseg_t upm, paddr_t avail_start)
{
struct uvm_physseg *ps = HANDLE_TO_PHYSSEG_NODE(upm);
#if defined(DIAGNOSTIC)
paddr_t avail_end;
avail_end = uvm_physseg_get_avail_end(upm);
KASSERT(uvm_physseg_valid_p(upm));
KASSERT(avail_start < avail_end);
KASSERT(avail_start >= ps->start);
#endif
ps->avail_start = avail_start;
}
void
uvm_physseg_set_avail_end(uvm_physseg_t upm, paddr_t avail_end)
{
struct uvm_physseg *ps = HANDLE_TO_PHYSSEG_NODE(upm);
#if defined(DIAGNOSTIC)
paddr_t avail_start;
avail_start = uvm_physseg_get_avail_start(upm);
KASSERT(uvm_physseg_valid_p(upm));
KASSERT(avail_end > avail_start);
KASSERT(avail_end <= ps->end);
#endif
ps->avail_end = avail_end;
}
#endif /* UVM_PHYSSEG_LEGACY */
paddr_t
uvm_physseg_get_avail_end(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->avail_end;
}
inline struct vm_page *
uvm_physseg_get_pg(uvm_physseg_t upm, paddr_t idx)
{
KASSERT(uvm_physseg_valid_p(upm));
return &HANDLE_TO_PHYSSEG_NODE(upm)->pgs[idx];
}
#ifdef __HAVE_PMAP_PHYSSEG
struct pmap_physseg *
uvm_physseg_get_pmseg(uvm_physseg_t upm)
{
KASSERT(uvm_physseg_valid_p(upm));
return &(HANDLE_TO_PHYSSEG_NODE(upm)->pmseg);
}
#endif
int
uvm_physseg_get_free_list(uvm_physseg_t upm)
{
	KASSERT(uvm_physseg_valid_p(upm));
return HANDLE_TO_PHYSSEG_NODE(upm)->free_list;
}
u_long
uvm_physseg_get_start_hint(uvm_physseg_t upm)
{
KASSERT(uvm_physseg_valid_p(upm));
return HANDLE_TO_PHYSSEG_NODE(upm)->start_hint;
}
bool
uvm_physseg_set_start_hint(uvm_physseg_t upm, u_long start_hint)
{
if (uvm_physseg_valid_p(upm) == false)
return false;
HANDLE_TO_PHYSSEG_NODE(upm)->start_hint = start_hint;
return true;
}
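/*
 * uvm_physseg_init_seg: attach the vm_page array 'pgs' to the segment
 * and initialise each page, freeing those within the available range
 * into the page pool.
 */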
void
uvm_physseg_init_seg(uvm_physseg_t upm, struct vm_page *pgs)
{
psize_t i;
psize_t n;
paddr_t paddr;
struct uvm_physseg *seg;
struct vm_page *pg;
KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
KASSERT(pgs != NULL);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
KASSERT(seg != NULL);
KASSERT(seg->pgs == NULL);
n = seg->end - seg->start;
seg->pgs = pgs;
/* init and free vm_pages (we've already zeroed them) */
paddr = ctob(seg->start);
for (i = 0 ; i < n ; i++, paddr += PAGE_SIZE) {
pg = &seg->pgs[i];
pg->phys_addr = paddr;
#ifdef __HAVE_VM_PAGE_MD
VM_MDPAGE_INIT(pg);
#endif
if (atop(paddr) >= seg->avail_start &&
atop(paddr) < seg->avail_end) {
uvmexp.npages++;
/* add page to free pool */
uvm_page_set_freelist(pg,
uvm_page_lookup_freelist(pg));
/* Disable LOCKDEBUG: too many and too early. */
mutex_init(&pg->interlock, MUTEX_NODEBUG, IPL_NONE);
uvm_pagefree(pg);
}
}
}
void
uvm_physseg_seg_chomp_slab(uvm_physseg_t upm, struct vm_page *pgs, size_t n)
{
struct uvm_physseg *seg = HANDLE_TO_PHYSSEG_NODE(upm);
/* max number of pre-boot unplug()s allowed */
#define UVM_PHYSSEG_BOOT_UNPLUG_MAX VM_PHYSSEG_MAX
static char btslab_ex_storage[EXTENT_FIXED_STORAGE_SIZE(UVM_PHYSSEG_BOOT_UNPLUG_MAX)];
if (__predict_false(uvm.page_init_done == false)) {
seg->ext = extent_create("Boot time slab", (u_long) pgs, (u_long) (pgs + n),
(void *)btslab_ex_storage, sizeof(btslab_ex_storage), 0);
} else {
seg->ext = extent_create("Hotplug slab", (u_long) pgs, (u_long) (pgs + n), NULL, 0, 0);
}
KASSERT(seg->ext != NULL);
}
struct vm_page *
uvm_physseg_seg_alloc_from_slab(uvm_physseg_t upm, size_t pages)
{
int err;
struct uvm_physseg *seg;
struct vm_page *pgs = NULL;
KASSERT(pages > 0);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (__predict_false(seg->ext == NULL)) {
/*
* This is a situation unique to boot time.
* It shouldn't happen at any point other than from
* the first uvm_page.c:uvm_page_init() call
* Since we're in a loop, we can get away with the
* below.
*/
KASSERT(uvm.page_init_done != true);
uvm_physseg_t upmp = uvm_physseg_get_prev(upm);
KASSERT(upmp != UVM_PHYSSEG_TYPE_INVALID);
seg->ext = HANDLE_TO_PHYSSEG_NODE(upmp)->ext;
KASSERT(seg->ext != NULL);
}
/* We allocate enough for this segment */
err = extent_alloc(seg->ext, sizeof(*pgs) * pages, 1, 0, EX_BOUNDZERO, (u_long *)&pgs);
if (err != 0) {
#ifdef DEBUG
printf("%s: extent_alloc failed with error: %d \n",
__func__, err);
#endif
}
return pgs;
}
/*
* uvm_page_physload: load physical memory into VM system
*
* => all args are PFs
* => all pages in start/end get vm_page structures
* => areas marked by avail_start/avail_end get added to the free page pool
* => we are limited to VM_PHYSSEG_MAX physical memory segments
*/
uvm_physseg_t
uvm_page_physload(paddr_t start, paddr_t end, paddr_t avail_start,
paddr_t avail_end, int free_list)
{
struct uvm_physseg *ps;
uvm_physseg_t upm;
if (__predict_true(uvm.page_init_done == true))
panic("%s: unload attempted after uvm_page_init()\n", __func__);
if (uvmexp.pagesize == 0)
panic("uvm_page_physload: page size not set!");
if (free_list >= VM_NFREELIST || free_list < VM_FREELIST_DEFAULT)
panic("uvm_page_physload: bad free list %d", free_list);
if (start >= end)
panic("uvm_page_physload: start[%" PRIxPADDR "] >= end[%"
PRIxPADDR "]", start, end);
if (uvm_physseg_plug(start, end - start, &upm) == false) {
panic("uvm_physseg_plug() failed at boot.");
/* NOTREACHED */
return UVM_PHYSSEG_TYPE_INVALID; /* XXX: correct type */
}
ps = HANDLE_TO_PHYSSEG_NODE(upm);
/* Legacy */
ps->avail_start = avail_start;
ps->avail_end = avail_end;
ps->free_list = free_list; /* XXX: */
return upm;
}
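/*
 * Illustrative sketch (not part of the original source): a typical boot-time
 * call from machine-dependent code, loading one RAM segment.  All arguments
 * are page frame numbers, as the comment above states; start_pfn and end_pfn
 * are hypothetical locals, with end_pfn exclusive.
 */
#if 0
	uvm_page_physload(start_pfn, end_pfn, start_pfn, end_pfn,
	    VM_FREELIST_DEFAULT);
#endif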
bool
uvm_physseg_unplug(paddr_t pfn, size_t pages)
{
uvm_physseg_t upm;
paddr_t off = 0, start __diagused, end;
struct uvm_physseg *seg;
upm = uvm_physseg_find(pfn, &off);
if (!uvm_physseg_valid_p(upm)) {
printf("%s: Tried to unplug from unknown offset\n", __func__);
return false;
}
seg = HANDLE_TO_PHYSSEG_NODE(upm);
start = uvm_physseg_get_start(upm);
end = uvm_physseg_get_end(upm);
if (end < (pfn + pages)) {
printf("%s: Tried to unplug oversized span \n", __func__);
return false;
}
KASSERT(pfn == start + off); /* sanity */
if (__predict_true(uvm.page_init_done == true)) {
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
if (extent_free(seg->ext, (u_long)(seg->pgs + off), sizeof(struct vm_page) * pages, EX_MALLOCOK | EX_NOWAIT) != 0)
return false;
}
if (off == 0 && (pfn + pages) == end) {
#if defined(UVM_HOTPLUG) /* rbtree implementation */
int segcount = 0;
struct uvm_physseg *current_ps;
/* Complete segment */
if (uvm_physseg_graph.nentries == 1)
panic("%s: out of memory!", __func__);
if (__predict_true(uvm.page_init_done == true)) {
RB_TREE_FOREACH(current_ps, &(uvm_physseg_graph.rb_tree)) {
if (seg->ext == current_ps->ext)
segcount++;
}
KASSERT(segcount > 0);
if (segcount == 1) {
extent_destroy(seg->ext);
}
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
rb_tree_remove_node(&(uvm_physseg_graph.rb_tree), upm);
memset(seg, 0, sizeof(struct uvm_physseg));
uvm_physseg_free(seg, sizeof(struct uvm_physseg));
uvm_physseg_graph.nentries--;
#else /* UVM_HOTPLUG */
int x;
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = upm ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
#endif /* UVM_HOTPLUG */
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
return true;
}
if (off > 0 &&
(pfn + pages) < end) {
#if defined(UVM_HOTPLUG) /* rbtree implementation */
/* middle chunk - need a new segment */
struct uvm_physseg *ps, *current_ps;
ps = uvm_physseg_alloc(sizeof (struct uvm_physseg));
if (ps == NULL) {
printf("%s: Unable to allocated new fragment vm_physseg \n",
__func__);
return false;
}
/* Remove middle chunk */
if (__predict_true(uvm.page_init_done == true)) {
KASSERT(seg->ext != NULL);
ps->ext = seg->ext;
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
ps->start = pfn + pages;
ps->avail_start = ps->start; /* XXX: Legacy */
ps->end = seg->end;
ps->avail_end = ps->end; /* XXX: Legacy */
seg->end = pfn;
seg->avail_end = seg->end; /* XXX: Legacy */
/*
* The new pgs array points to the beginning of the
* tail fragment.
*/
if (__predict_true(uvm.page_init_done == true))
ps->pgs = seg->pgs + off + pages;
current_ps = rb_tree_insert_node(&(uvm_physseg_graph.rb_tree), ps);
if (current_ps != ps) {
panic("uvm_page_physload: Duplicate address range detected!");
}
uvm_physseg_graph.nentries++;
#else /* UVM_HOTPLUG */
panic("%s: can't unplug() from the middle of a segment without"
" UVM_HOTPLUG\n", __func__);
/* NOTREACHED */
#endif /* UVM_HOTPLUG */
return true;
}
if (off == 0 && (pfn + pages) < end) {
/* Remove front chunk */
if (__predict_true(uvm.page_init_done == true)) {
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
/* Truncate */
seg->start = pfn + pages;
seg->avail_start = seg->start; /* XXX: Legacy */
/*
* Move the pgs array start to the beginning of the
* tail end.
*/
if (__predict_true(uvm.page_init_done == true))
seg->pgs += pages;
return true;
}
if (off > 0 && (pfn + pages) == end) {
/* back chunk */
/* Truncate! */
seg->end = pfn;
seg->avail_end = seg->end; /* XXX: Legacy */
uvmexp.npages -= (int) pages;
return true;
}
printf("%s: Tried to unplug unknown range \n", __func__);
return false;
}
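/*
 * Illustrative sketch (not part of the original source): unplugging a
 * physical range given a (hypothetical) base address "pa" and byte length
 * "sz".  The arguments are a page frame number and a page count, so the
 * caller converts with atop().
 */
#if 0
	if (uvm_physseg_unplug(atop(pa), atop(sz)) == false)
		printf("unplug of range at 0x%" PRIxPADDR " failed\n", pa);
#endif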
/* $NetBSD: ufs_inode.c,v 1.112 2020/09/05 16:30:13 riastradh Exp $ */
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.112 2020/09/05 16:30:13 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_wapbl.h"
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/wapbl.h>
#include <sys/kmem.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_wapbl.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#ifdef UFS_EXTATTR
#include <ufs/ufs/extattr.h>
#endif
#ifdef UVMHIST
#include <uvm/uvm.h>
#endif
#include <uvm/uvm_page.h>
#include <uvm/uvm_stat.h>
/*
* Last reference to an inode. If necessary, write or delete it.
*/
int
ufs_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
	bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
struct mount *mp = vp->v_mount;
mode_t mode;
int allerror = 0, error;
bool wapbl_locked = false;
UFS_WAPBL_JUNLOCK_ASSERT(mp);
/*
* Ignore inodes related to stale file handles.
*/
if (ip->i_mode == 0)
goto out;
if (ip->i_nlink <= 0 && (mp->mnt_flag & MNT_RDONLY) == 0) {
#ifdef UFS_EXTATTR
ufs_extattr_vnode_inactive(vp, curlwp);
#endif
/*
* All file blocks must be freed before we can let the vnode
 * be reclaimed, so we cannot postpone the full truncation any further.
*/
ufs_truncate_all(vp);
#if defined(QUOTA) || defined(QUOTA2)
error = UFS_WAPBL_BEGIN(mp);
if (error) {
allerror = error;
} else {
wapbl_locked = true;
(void)chkiq(ip, -1, NOCRED, 0);
}
#endif
DIP_ASSIGN(ip, rdev, 0);
mode = ip->i_mode;
ip->i_mode = 0;
ip->i_omode = mode;
DIP_ASSIGN(ip, mode, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Defer final inode free and update to ufs_reclaim().
*/
}
if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) {
		if (! wapbl_locked) {
			error = UFS_WAPBL_BEGIN(mp);
if (error) {
allerror = error;
goto out;
}
wapbl_locked = true;
}
UFS_UPDATE(vp, NULL, NULL, 0);
}
out:
	if (wapbl_locked)
		UFS_WAPBL_END(mp);
/*
* If we are done with the inode, reclaim it
* so that it can be reused immediately.
*/
*ap->a_recycle = (ip->i_mode == 0);
if (ip->i_mode == 0 && (DIP(ip, size) != 0 || DIP(ip, blocks) != 0)) {
printf("%s: unlinked ino %" PRId64 " on \"%s\" has"
" non zero size %" PRIx64 " or blocks %" PRIx64
" with allerror %d\n",
__func__, ip->i_number, mp->mnt_stat.f_mntonname,
DIP(ip, size), DIP(ip, blocks), allerror);
panic("%s: dirty filesystem?", __func__);
}
return (allerror);
}
/*
* Reclaim an inode so that it can be used for other purposes.
*/
int
ufs_reclaim(struct vnode *vp)
{
struct inode *ip = VTOI(vp);
	if (!UFS_WAPBL_BEGIN(vp->v_mount)) {
		UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
		UFS_WAPBL_END(vp->v_mount);
}
UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
	if (ip->i_devvp) {
		vrele(ip->i_devvp);
ip->i_devvp = 0;
}
#if defined(QUOTA) || defined(QUOTA2)
ufsquota_free(ip);
#endif
#ifdef UFS_DIRHASH
	if (ip->i_dirhash != NULL)
		ufsdirhash_free(ip);
#endif
return (0);
}
/*
* allocate a range of blocks in a file.
* after this function returns, any page entirely contained within the range
* will map to invalid data and thus must be overwritten before it is made
* accessible to others.
*/
int
ufs_balloc_range(struct vnode *vp, off_t off, off_t len, kauth_cred_t cred,
int flags)
{
off_t neweof; /* file size after the operation */
off_t neweob; /* offset next to the last block after the operation */
off_t pagestart; /* starting offset of range covered by pgs */
off_t eob; /* offset next to allocated blocks */
struct uvm_object *uobj;
int i, delta, error, npages;
int bshift = vp->v_mount->mnt_fs_bshift;
int bsize = 1 << bshift;
int ppb = MAX(bsize >> PAGE_SHIFT, 1);
struct vm_page **pgs;
size_t pgssize;
UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx len 0x%jx u_size 0x%jx",
(uintptr_t)vp, off, len, vp->v_size);
neweof = MAX(vp->v_size, off + len);
GOP_SIZE(vp, neweof, &neweob, 0);
error = 0;
uobj = &vp->v_uobj;
/*
* read or create pages covering the range of the allocation and
* keep them locked until the new block is allocated, so there
* will be no window where the old contents of the new block are
* visible to racing threads.
*/
pagestart = trunc_page(off) & ~(bsize - 1);
npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT);
pgssize = npages * sizeof(struct vm_page *);
pgs = kmem_zalloc(pgssize, KM_SLEEP);
/*
* adjust off to be block-aligned.
*/
delta = off & (bsize - 1);
off -= delta;
len += delta;
genfs_node_wrlock(vp);
rw_enter(uobj->vmobjlock, RW_WRITER);
error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0,
VM_PROT_WRITE, 0, PGO_SYNCIO | PGO_PASTEOF | PGO_NOBLOCKALLOC |
PGO_NOTIMESTAMP | PGO_GLOCKHELD);
if (error) {
genfs_node_unlock(vp);
goto out;
}
/*
* now allocate the range.
*/
error = GOP_ALLOC(vp, off, len, flags, cred);
genfs_node_unlock(vp);
/*
* if the allocation succeeded, mark all the pages dirty
* and clear PG_RDONLY on any pages that are now fully backed
* by disk blocks. if the allocation failed, we do not invalidate
* the pages since they might have already existed and been dirty,
* in which case we need to keep them around. if we created the pages,
* they will be clean and read-only, and leaving such pages
* in the cache won't cause any problems.
*/
GOP_SIZE(vp, off + len, &eob, 0);
rw_enter(uobj->vmobjlock, RW_WRITER);
	for (i = 0; i < npages; i++) {
		KASSERT((pgs[i]->flags & PG_RELEASED) == 0);
		if (!error) {
			if (off <= pagestart + (i << PAGE_SHIFT) &&
pagestart + ((i + 1) << PAGE_SHIFT) <= eob) {
pgs[i]->flags &= ~PG_RDONLY;
}
uvm_pagemarkdirty(pgs[i], UVM_PAGE_STATUS_DIRTY);
}
uvm_pagelock(pgs[i]);
uvm_pageactivate(pgs[i]);
uvm_pageunlock(pgs[i]);
}
uvm_page_unbusy(pgs, npages);
rw_exit(uobj->vmobjlock);
out:
kmem_free(pgs, pgssize);
return error;
}
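/*
 * Illustrative sketch (not part of the original source): how a write path
 * might call ufs_balloc_range() before copying data into a region that may
 * not yet have backing blocks.  "uio", "bytes", "cred" and "ioflag" are
 * hypothetical locals of the caller, and the B_SYNC/IO_SYNC mapping is an
 * assumption about the flags the caller wants.
 */
#if 0
	error = ufs_balloc_range(vp, uio->uio_offset, bytes, cred,
	    (ioflag & IO_SYNC) ? B_SYNC : 0);
	if (error)
		return error;
	/* the blocks now exist; overwrite them before exposing the data */
#endif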
int
ufs_truncate_retry(struct vnode *vp, int ioflag, uint64_t newsize,
kauth_cred_t cred)
{
struct inode *ip = VTOI(vp);
struct mount *mp = vp->v_mount;
int error = 0;
UFS_WAPBL_JUNLOCK_ASSERT(mp);
/*
* Truncate might temporarily fail, loop until done.
*/
do {
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto out;
error = UFS_TRUNCATE(vp, newsize, ioflag, cred);
		UFS_WAPBL_END(mp);
		if (error != 0 && error != EAGAIN)
goto out;
} while (ip->i_size != newsize);
out:
return error;
}
/* truncate all the data of the inode including extended attributes */
int
ufs_truncate_all(struct vnode *vp)
{
struct inode *ip = VTOI(vp);
off_t isize = ip->i_size;
	if (ip->i_ump->um_fstype == UFS2)
		isize += ip->i_ffs2_extsize;
	if (isize == 0)
return 0;
return ufs_truncate_retry(vp, IO_NORMAL | IO_EXT, 0, NOCRED);
}
/* $NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Shared support code for kernels built with the DEBUG option.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $");
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <uvm/uvm_extern.h>
#include <machine/lock.h>
/*
* Allocation/free validation by pointer address. Introduces
* significant overhead and is not enabled by default. Patch
* `debug_freecheck' to 1 at boot time to enable.
*/
#define FREECHECK_BYTES (8*1024*1024)
typedef struct fcitem {
void *i_addr;
struct fcitem *i_next;
} fcitem_t;
fcitem_t *freecheck_free;
__cpu_simple_lock_t freecheck_lock;
u_int debug_freecheck;
void
debug_init(void)
{
size_t cnt;
fcitem_t *i;
__cpu_simple_lock_init(&freecheck_lock);
if (debug_freecheck) {
i = (fcitem_t *)uvm_km_alloc(kernel_map, FREECHECK_BYTES, 0,
UVM_KMF_WIRED);
if (i == NULL) {
printf("freecheck_init: unable to allocate memory");
return;
}
for (cnt = FREECHECK_BYTES / sizeof(*i); cnt != 0; cnt--) {
i->i_next = freecheck_free;
freecheck_free = i++;
}
}
}
void
freecheck_out(void **head, void *addr)
{
fcitem_t *i;
int s;
if (!debug_freecheck)
return;
s = splvm();
__cpu_simple_lock(&freecheck_lock);
for (i = *head; i != NULL; i = i->i_next) {
if (i->i_addr != addr)
continue;
__cpu_simple_unlock(&freecheck_lock);
splx(s);
panic("freecheck_out: %p already out", addr);
}
if ((i = freecheck_free) != NULL) {
freecheck_free = i->i_next;
i->i_addr = addr;
i->i_next = *head;
*head = i;
}
__cpu_simple_unlock(&freecheck_lock);
splx(s);
if (i == NULL) {
		if (atomic_swap_uint(&debug_freecheck, 1) == 0)
			printf("freecheck_out: no more slots\n");
}
}
void
freecheck_in(void **head, void *addr)
{
fcitem_t *i;
void *pp;
int s;
if (!debug_freecheck)
return;
s = splvm();
__cpu_simple_lock(&freecheck_lock);
	for (i = *head, pp = head; i != NULL; pp = &i->i_next, i = i->i_next) {
		if (i->i_addr == addr) {
*(fcitem_t **)pp = i->i_next;
i->i_next = freecheck_free;
freecheck_free = i;
break;
}
}
__cpu_simple_unlock(&freecheck_lock);
splx(s);
if (i != NULL)
return;
#ifdef DDB
printf("freecheck_in: %p not out\n", addr);
Debugger();
#else
panic("freecheck_in: %p not out", addr);
#endif
}
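/*
 * Illustrative sketch (not part of the original source): how an allocator
 * might use the pair above to catch double frees and stray pointers.
 * "my_inuse_list" is a hypothetical per-allocator list head; freecheck_out()
 * records an object as outstanding at allocation time and freecheck_in()
 * checks it back in, panicking on a pointer that was never handed out.
 */
#if 0
static void *my_inuse_list;

void *
my_alloc(size_t size)
{
	void *p = kmem_alloc(size, KM_SLEEP);

	freecheck_out(&my_inuse_list, p);
	return p;
}

void
my_free(void *p, size_t size)
{
	freecheck_in(&my_inuse_list, p);
	kmem_free(p, size);
}
#endif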
/* $NetBSD: socketvar.h,v 1.168 2024/02/03 19:05:14 jdolecek Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)socketvar.h 8.3 (Berkeley) 2/19/95
*/
#ifndef _SYS_SOCKETVAR_H_
#define _SYS_SOCKETVAR_H_
#include <sys/select.h>
#include <sys/selinfo.h> /* for struct selinfo */
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#if !defined(_KERNEL)
struct uio;
struct lwp;
struct uidinfo;
#else
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#endif
TAILQ_HEAD(soqhead, socket);
/*
* Variables for socket buffering.
*/
struct sockbuf {
struct selinfo sb_sel; /* process selecting read/write */
struct mowner *sb_mowner; /* who owns data for this sockbuf */
struct socket *sb_so; /* back pointer to socket */
kcondvar_t sb_cv; /* notifier */
/* When re-zeroing this struct, we zero from sb_startzero to the end */
#define sb_startzero sb_cc
u_long sb_cc; /* actual chars in buffer */
u_long sb_hiwat; /* max actual char count */
u_long sb_mbcnt; /* chars of mbufs used */
u_long sb_mbmax; /* max chars of mbufs to use */
u_long sb_lowat; /* low water mark */
struct mbuf *sb_mb; /* the mbuf chain */
struct mbuf *sb_mbtail; /* the last mbuf in the chain */
struct mbuf *sb_lastrecord; /* first mbuf of last record in
socket buffer */
int sb_flags; /* flags, see below */
int sb_timeo; /* timeout for read/write */
u_long sb_overflowed; /* # of drops due to full buffer */
};
#ifndef SB_MAX
#define SB_MAX (256*1024) /* default for max chars in sockbuf */
#endif
#define SB_LOCK 0x01 /* lock on data queue */
#define SB_NOTIFY 0x04 /* someone is waiting for data/space */
#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */
#define SB_UPCALL 0x20 /* someone wants an upcall */
#define SB_NOINTR 0x40 /* operations not interruptible */
#define SB_KNOTE 0x100 /* kernel note attached */
#define SB_AUTOSIZE 0x800 /* automatically size socket buffer */
/*
* Kernel structure per socket.
* Contains send and receive buffer queues,
* handle on protocol and pointer to protocol
* private data and error information.
*/
struct so_accf {
struct accept_filter *so_accept_filter;
void *so_accept_filter_arg; /* saved filter args */
char *so_accept_filter_str; /* saved user args */
};
struct sockaddr;
struct socket {
kmutex_t * volatile so_lock; /* pointer to lock on structure */
kcondvar_t so_cv; /* notifier */
short so_type; /* generic type, see socket.h */
short so_options; /* from socket call, see socket.h */
u_short so_linger; /* time to linger while closing */
short so_state; /* internal state flags SS_*, below */
int so_unused; /* used to be so_nbio */
void *so_pcb; /* protocol control block */
const struct protosw *so_proto; /* protocol handle */
/*
* Variables for connection queueing.
* Socket where accepts occur is so_head in all subsidiary sockets.
* If so_head is 0, socket is not related to an accept.
* For head socket so_q0 queues partially completed connections,
* while so_q is a queue of connections ready to be accepted.
* If a connection is aborted and it has so_head set, then
* it has to be pulled out of either so_q0 or so_q.
* We allow connections to queue up based on current queue lengths
* and limit on number of queued connections for this socket.
*/
struct socket *so_head; /* back pointer to accept socket */
struct soqhead *so_onq; /* queue (q or q0) that we're on */
struct soqhead so_q0; /* queue of partial connections */
struct soqhead so_q; /* queue of incoming connections */
TAILQ_ENTRY(socket) so_qe; /* our queue entry (q or q0) */
short so_q0len; /* partials on so_q0 */
short so_qlen; /* number of connections on so_q */
short so_qlimit; /* max number queued connections */
short so_timeo; /* connection timeout */
u_short so_error; /* error affecting connection */
u_short so_rerror; /* error affecting receiving */
u_short so_aborting; /* references from soabort() */
pid_t so_pgid; /* pgid for signals */
u_long so_oobmark; /* chars to oob mark */
struct sockbuf so_snd; /* send buffer */
struct sockbuf so_rcv; /* receive buffer */
void *so_internal; /* Space for svr4 stream data */
void (*so_upcall) (struct socket *, void *, int, int);
void * so_upcallarg; /* Arg for above */
int (*so_send) (struct socket *, struct sockaddr *,
struct uio *, struct mbuf *,
struct mbuf *, int, struct lwp *);
int (*so_receive) (struct socket *,
struct mbuf **,
struct uio *, struct mbuf **,
struct mbuf **, int *);
struct mowner *so_mowner; /* who owns mbufs for this socket */
struct uidinfo *so_uidinfo; /* who opened the socket */
gid_t so_egid; /* creator effective gid */
pid_t so_cpid; /* creator pid */
struct so_accf *so_accf;
kauth_cred_t so_cred; /* socket credentials */
};
/*
* Socket state bits.
*/
#define SS_NOFDREF 0x001 /* no file table ref any more */
#define SS_ISCONNECTED 0x002 /* socket connected to a peer */
#define SS_ISCONNECTING 0x004 /* in process of connecting to peer */
#define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */
#define SS_CANTSENDMORE 0x010 /* can't send more data to peer */
#define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */
#define SS_RCVATMARK 0x040 /* at mark on input */
#define SS_ISABORTING 0x080 /* aborting fd references - close() */
#define SS_RESTARTSYS 0x100 /* restart blocked system calls */
#define SS_POLLRDBAND 0x200 /* poll should return POLLRDBAND */
#define SS_MORETOCOME 0x400 /*
* hint from sosend to lower layer;
* more data coming
*/
#define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */
#define SS_ISAPIPE 0x1000 /* socket is implementing a pipe */
#define SS_NBIO 0x2000 /* socket is in non blocking I/O */
#ifdef _KERNEL
struct accept_filter {
char accf_name[16];
void (*accf_callback)
(struct socket *, void *, int, int);
void * (*accf_create)
(struct socket *, char *);
void (*accf_destroy)
(struct socket *);
LIST_ENTRY(accept_filter) accf_next;
u_int accf_refcnt;
};
struct sockopt {
int sopt_level; /* option level */
int sopt_name; /* option name */
size_t sopt_size; /* data length */
size_t sopt_retsize; /* returned data length */
void * sopt_data; /* data pointer */
uint8_t sopt_buf[sizeof(int)]; /* internal storage */
};
#define SB_EMPTY_FIXUP(sb) \
do { \
KASSERT(solocked((sb)->sb_so)); \
if ((sb)->sb_mb == NULL) { \
(sb)->sb_mbtail = NULL; \
(sb)->sb_lastrecord = NULL; \
} \
} while (/*CONSTCOND*/0)
extern u_long sb_max;
extern int somaxkva;
extern int sock_loan_thresh;
extern kmutex_t *softnet_lock;
struct mbuf;
struct lwp;
struct msghdr;
struct stat;
struct knote;
struct sockaddr_big;
enum uio_seg;
/* 0x400 is SO_OTIMESTAMP */
#define SOOPT_TIMESTAMP(o) ((o) & (SO_TIMESTAMP | 0x400))
/*
* File operations on sockets.
*/
int soo_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
int soo_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
int soo_fcntl(file_t *, u_int cmd, void *);
int soo_ioctl(file_t *, u_long cmd, void *);
int soo_poll(file_t *, int);
int soo_kqfilter(file_t *, struct knote *);
int soo_close(file_t *);
int soo_stat(file_t *, struct stat *);
void soo_restart(file_t *);
void sbappend(struct sockbuf *, struct mbuf *);
void sbappendstream(struct sockbuf *, struct mbuf *);
int sbappendaddr(struct sockbuf *, const struct sockaddr *, struct mbuf *,
struct mbuf *);
int sbappendaddrchain(struct sockbuf *, const struct sockaddr *,
struct mbuf *, int);
int sbappendcontrol(struct sockbuf *, struct mbuf *, struct mbuf *);
void sbappendrecord(struct sockbuf *, struct mbuf *);
void sbcheck(struct sockbuf *);
void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
struct mbuf *
sbcreatecontrol(void *, int, int, int);
struct mbuf *
sbcreatecontrol1(void **, int, int, int, int);
struct mbuf **
sbsavetimestamp(int, struct mbuf **);
void sbdrop(struct sockbuf *, int);
void sbdroprecord(struct sockbuf *);
void sbflush(struct sockbuf *);
void sbinsertoob(struct sockbuf *, struct mbuf *);
void sbrelease(struct sockbuf *, struct socket *);
int sbreserve(struct sockbuf *, u_long, struct socket *);
int sbwait(struct sockbuf *);
int sb_max_set(u_long);
void soinit(void);
void soinit1(void);
void soinit2(void);
int soabort(struct socket *);
int soaccept(struct socket *, struct sockaddr *);
int sofamily(const struct socket *);
int sobind(struct socket *, struct sockaddr *, struct lwp *);
void socantrcvmore(struct socket *);
void socantsendmore(struct socket *);
void soroverflow(struct socket *);
int soclose(struct socket *);
int soconnect(struct socket *, struct sockaddr *, struct lwp *);
int soconnect2(struct socket *, struct socket *);
int socreate(int, struct socket **, int, int, struct lwp *,
struct socket *);
int fsocreate(int, struct socket **, int, int, int *, file_t **,
struct socket *);
int sodisconnect(struct socket *);
void sofree(struct socket *);
int sogetopt(struct socket *, struct sockopt *);
void sohasoutofband(struct socket *);
void soisconnected(struct socket *);
void soisconnecting(struct socket *);
void soisdisconnected(struct socket *);
void soisdisconnecting(struct socket *);
int solisten(struct socket *, int, struct lwp *);
struct socket *
sonewconn(struct socket *, bool);
void soqinsque(struct socket *, struct socket *, int);
bool soqremque(struct socket *, int);
int soreceive(struct socket *, struct mbuf **, struct uio *,
struct mbuf **, struct mbuf **, int *);
int soreserve(struct socket *, u_long, u_long);
void sorflush(struct socket *);
int sosend(struct socket *, struct sockaddr *, struct uio *,
struct mbuf *, struct mbuf *, int, struct lwp *);
int sosetopt(struct socket *, struct sockopt *);
int so_setsockopt(struct lwp *, struct socket *, int, int, const void *, size_t);
int soshutdown(struct socket *, int);
void sorestart(struct socket *);
void sowakeup(struct socket *, struct sockbuf *, int);
int sockargs(struct mbuf **, const void *, size_t, enum uio_seg, int);
int sopoll(struct socket *, int);
struct socket *soget(bool);
void soput(struct socket *);
bool solocked(const struct socket *);
bool solocked2(const struct socket *, const struct socket *);
int sblock(struct sockbuf *, int);
void sbunlock(struct sockbuf *);
int sowait(struct socket *, bool, int);
void solockretry(struct socket *, kmutex_t *);
void sosetlock(struct socket *);
void solockreset(struct socket *, kmutex_t *);
void sockopt_init(struct sockopt *, int, int, size_t);
void sockopt_destroy(struct sockopt *);
int sockopt_set(struct sockopt *, const void *, size_t);
int sockopt_setint(struct sockopt *, int);
int sockopt_get(const struct sockopt *, void *, size_t);
int sockopt_getint(const struct sockopt *, int *);
int sockopt_setmbuf(struct sockopt *, struct mbuf *);
struct mbuf *sockopt_getmbuf(const struct sockopt *);
int copyout_sockname(struct sockaddr *, unsigned int *, int, struct mbuf *);
int copyout_sockname_sb(struct sockaddr *, unsigned int *,
int , struct sockaddr_big *);
int copyout_msg_control(struct lwp *, struct msghdr *, struct mbuf *);
void free_control_mbuf(struct lwp *, struct mbuf *, struct mbuf *);
int do_sys_getpeername(int, struct sockaddr *);
int do_sys_getsockname(int, struct sockaddr *);
int do_sys_sendmsg(struct lwp *, int, struct msghdr *, int, register_t *);
int do_sys_sendmsg_so(struct lwp *, int, struct socket *, file_t *,
struct msghdr *, int, register_t *);
int do_sys_recvmsg(struct lwp *, int, struct msghdr *,
struct mbuf **, struct mbuf **, register_t *);
int do_sys_recvmsg_so(struct lwp *, int, struct socket *,
struct msghdr *mp, struct mbuf **, struct mbuf **, register_t *);
int do_sys_bind(struct lwp *, int, struct sockaddr *);
int do_sys_connect(struct lwp *, int, struct sockaddr *);
int do_sys_accept(struct lwp *, int, struct sockaddr *, register_t *,
const sigset_t *, int, int);
int do_sys_peeloff(struct socket *, void *);
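/*
 * Illustrative sketch (not part of the original source): setting a socket
 * option from inside the kernel with the sockopt helpers declared above.
 * SOL_SOCKET/SO_KEEPALIVE come from <sys/socket.h>; the option chosen is
 * only an example.
 */
#if 0
static int
example_set_keepalive(struct socket *so)
{
	struct sockopt sopt;
	int error;

	sockopt_init(&sopt, SOL_SOCKET, SO_KEEPALIVE, sizeof(int));
	error = sockopt_setint(&sopt, 1);
	if (error == 0)
		error = sosetopt(so, &sopt);
	sockopt_destroy(&sopt);
	return error;
}
#endif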
/*
* Inline functions for sockets and socket buffering.
*/
#include <sys/protosw.h>
#include <sys/mbuf.h>
/*
* Do we need to notify the other side when I/O is possible?
*/
static __inline int
sb_notify(struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
return sb->sb_flags & (SB_NOTIFY | SB_ASYNC | SB_UPCALL | SB_KNOTE);
}
/*
* How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
* Since the fields are unsigned, detect overflow and return 0.
*/
static __inline u_long
sbspace(const struct sockbuf *sb)
{
	KASSERT(solocked(sb->sb_so));
	if (sb->sb_hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt)
return 0;
return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
}
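/*
 * Illustrative sketch (not part of the original source): checking for room
 * before queueing an mbuf chain on the send buffer.  "so" and "m" are
 * hypothetical, and the real sosend() path does considerably more
 * bookkeeping than this.
 */
#if 0
	KASSERT(solocked(so));
	if (sbspace(&so->so_snd) >= (u_long)m->m_pkthdr.len)
		sbappend(&so->so_snd, m);
	else
		m_freem(m);	/* no room; a real caller might block instead */
#endif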
static __inline u_long
sbspace_oob(const struct sockbuf *sb)
{
u_long hiwat = sb->sb_hiwat;
if (hiwat < ULONG_MAX - 1024)
hiwat += 1024;
	KASSERT(solocked(sb->sb_so));
	if (hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt)
return 0;
return lmin(hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
}
/*
* How much socket buffer space has been used?
*/
static __inline u_long
sbused(const struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
return sb->sb_cc;
}
/* do we have to send all at once on a socket? */
static __inline int
sosendallatonce(const struct socket *so)
{
return so->so_proto->pr_flags & PR_ATOMIC;
}
/* can we read something from so? */
static __inline int
soreadable(const struct socket *so)
{
	KASSERT(solocked(so));
	return so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) != 0 ||
	    so->so_qlen != 0 || so->so_error != 0 || so->so_rerror != 0;
}
/* can we write something to so? */
static __inline int
sowritable(const struct socket *so)
{
	KASSERT(solocked(so));
	return (sbspace(&so->so_snd) >= so->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) != 0 ||
	    (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) != 0 ||
so->so_error != 0;
}
/* adjust counters in sb reflecting allocation of m */
static __inline void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
sb->sb_cc += m->m_len;
sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}
/* adjust counters in sb reflecting freeing of m */
static __inline void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
sb->sb_cc -= m->m_len;
sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
static __inline void
sorwakeup(struct socket *so)
{
	KASSERT(solocked(so));
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv, POLL_IN);
}
static __inline void
sowwakeup(struct socket *so)
{
	KASSERT(solocked(so));
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd, POLL_OUT);
}
static __inline void
solock(struct socket *so)
{
kmutex_t *lock;
lock = atomic_load_consume(&so->so_lock);
mutex_enter(lock);
	if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
		solockretry(so, lock);
}
static __inline void
sounlock(struct socket *so)
{
mutex_exit(so->so_lock);
}
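/*
 * Illustrative sketch (not part of the original source): the usual pattern
 * for inspecting socket state under the socket lock with the inline helpers
 * above; "so" is a hypothetical socket pointer.
 */
#if 0
	solock(so);
	if (soreadable(so)) {
		/* data, an error, or an EOF condition is available */
	}
	sounlock(so);
#endif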
#ifdef SOCKBUF_DEBUG
/*
* SBLASTRECORDCHK: check sb->sb_lastrecord is maintained correctly.
* SBLASTMBUFCHK: check sb->sb_mbtail is maintained correctly.
*
* => panic if the socket buffer is inconsistent.
* => 'where' is used for a panic message.
*/
void sblastrecordchk(struct sockbuf *, const char *);
#define SBLASTRECORDCHK(sb, where) sblastrecordchk((sb), (where))
void sblastmbufchk(struct sockbuf *, const char *);
#define SBLASTMBUFCHK(sb, where) sblastmbufchk((sb), (where))
#define SBCHECK(sb) sbcheck(sb)
#else
#define SBLASTRECORDCHK(sb, where) /* nothing */
#define SBLASTMBUFCHK(sb, where) /* nothing */
#define SBCHECK(sb) /* nothing */
#endif /* SOCKBUF_DEBUG */
/* sosend loan */
vaddr_t sokvaalloc(vaddr_t, vsize_t, struct socket *);
void sokvafree(vaddr_t, vsize_t);
void soloanfree(struct mbuf *, void *, size_t, void *);
/*
* Values for socket-buffer-append priority argument to sbappendaddrchain().
* The following flags are reserved for future implementation:
*
* SB_PRIO_NONE: honour normal socket-buffer limits.
*
* SB_PRIO_ONESHOT_OVERFLOW: if the socket has any space,
* deliver the entire chain. Intended for large requests
* that should be delivered in their entirety, or not at all.
*
* SB_PRIO_OVERDRAFT: allow a small (2*MLEN) overflow, over and
 * above normal socket limits. Intended for messages indicating
 * buffer overflow in earlier normal/lower-priority messages.
*
* SB_PRIO_BESTEFFORT: Ignore limits entirely. Intended only for
 * kernel-generated messages to specially-marked sockets which
 * require "reliable" delivery, and where the source socket/protocol
 * message generator enforces some hard limit (but possibly well
* above kern.sbmax). It is entirely up to the in-kernel source to
* avoid complete mbuf exhaustion or DoS scenarios.
*/
#define SB_PRIO_NONE 0
#define SB_PRIO_ONESHOT_OVERFLOW 1
#define SB_PRIO_OVERDRAFT 2
#define SB_PRIO_BESTEFFORT 3
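/*
 * Illustrative sketch (not part of the original source): appending a chain
 * of records to a receive buffer with normal limits.  "asa" and "m0" are
 * hypothetical (a source address and an mbuf chain); per the comment above,
 * only SB_PRIO_NONE behaviour should be relied upon today.
 */
#if 0
	if (sbappendaddrchain(&so->so_rcv, asa, m0, SB_PRIO_NONE) == 0) {
		/* no buffer space: the chain was not queued; handle m0 here */
	}
#endif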
/*
* Accept filter functions (duh).
*/
int accept_filt_getopt(struct socket *, struct sockopt *);
int accept_filt_setopt(struct socket *, const struct sockopt *);
int accept_filt_clear(struct socket *);
int accept_filt_add(struct accept_filter *);
int accept_filt_del(struct accept_filter *);
struct accept_filter *accept_filt_get(char *);
#ifdef ACCEPT_FILTER_MOD
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_accf);
#endif
void accept_filter_init(void);
#endif
#ifdef DDB
int sofindproc(struct socket *so, int all, void (*pr)(const char *, ...));
void socket_print(const char *modif, void (*pr)(const char *, ...));
#endif
#endif /* _KERNEL */
#endif /* !_SYS_SOCKETVAR_H_ */
/* $NetBSD: ffs_subr.c,v 1.54 2023/01/07 19:41:30 chs Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_subr.c,v 1.54 2023/01/07 19:41:30 chs Exp $");
#include <sys/param.h>
/* in ffs_tables.c */
extern const int inside[], around[];
extern const u_char * const fragtbl[];
#ifndef _KERNEL
#define FFS_EI /* always include byteswapped filesystems support */
#endif
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#ifndef _KERNEL
#include <ufs/ufs/dinode.h>
void panic(const char *, ...)
__attribute__((__noreturn__,__format__(__printf__,1,2)));
#else /* _KERNEL */
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/buf.h>
#include <sys/inttypes.h>
#include <sys/pool.h>
#include <sys/fstrans.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
/*
* Load up the contents of an inode and copy the appropriate pieces
* to the incore copy.
*/
void
ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino)
{
struct ufs1_dinode *dp1;
struct ufs2_dinode *dp2;
if (ip->i_ump->um_fstype == UFS1) {
dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode1_swap(dp1, ip->i_din.ffs1_din);
else
#endif
*ip->i_din.ffs1_din = *dp1;
ip->i_mode = ip->i_ffs1_mode;
ip->i_nlink = ip->i_ffs1_nlink;
ip->i_size = ip->i_ffs1_size;
ip->i_flags = ip->i_ffs1_flags;
ip->i_gen = ip->i_ffs1_gen;
ip->i_uid = ip->i_ffs1_uid;
ip->i_gid = ip->i_ffs1_gid;
} else {
dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode2_swap(dp2, ip->i_din.ffs2_din);
else
#endif
*ip->i_din.ffs2_din = *dp2;
ip->i_mode = ip->i_ffs2_mode;
ip->i_nlink = ip->i_ffs2_nlink;
ip->i_size = ip->i_ffs2_size;
ip->i_flags = ip->i_ffs2_flags;
ip->i_gen = ip->i_ffs2_gen;
ip->i_uid = ip->i_ffs2_uid;
ip->i_gid = ip->i_ffs2_gid;
}
}
int
ffs_getblk(struct vnode *vp, daddr_t lblkno, daddr_t blkno, int size,
bool clearbuf, buf_t **bpp)
{
int error = 0;
	KASSERT(blkno >= 0 || blkno == FFS_NOBLK);
	if ((*bpp = getblk(vp, lblkno, size, 0, 0)) == NULL)
return ENOMEM;
	if (blkno != FFS_NOBLK)
		(*bpp)->b_blkno = blkno;
	if (clearbuf)
		clrbuf(*bpp);
	if ((*bpp)->b_blkno >= 0 && (error = fscow_run(*bpp, false)) != 0) {
		brelse(*bpp, BC_INVAL);
*bpp = NULL;
}
return error;
}
#endif /* _KERNEL */
/*
* Update the frsum fields to reflect addition or deletion
* of some frags.
*/
void
ffs_fragacct(struct fs *fs, int fragmap, uint32_t fraglist[], int cnt,
int needswap)
{
int inblk;
int field, subfield;
int siz, pos;
inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
fragmap <<= 1;
	for (siz = 1; siz < fs->fs_frag; siz++) {
		if ((inblk & (1 << (siz + (fs->fs_frag & (NBBY - 1))))) == 0)
continue;
field = around[siz];
subfield = inside[siz];
		for (pos = siz; pos <= fs->fs_frag; pos++) {
			if ((fragmap & field) == subfield) {
fraglist[siz] = ufs_rw32(
ufs_rw32(fraglist[siz], needswap) + cnt,
needswap);
pos += siz;
field <<= siz;
subfield <<= siz;
}
field <<= 1;
subfield <<= 1;
}
}
}
/*
* block operations
*
* check if a block is available
* returns true if all the corresponding bits in the free map are 1
* returns false if any corresponding bit in the free map is 0
*/
int
ffs_isblock(struct fs *fs, u_char *cp, int32_t h)
{
u_char mask;
switch ((int)fs->fs_fragshift) {
case 3:
return (cp[h] == 0xff);
case 2:
mask = 0x0f << ((h & 0x1) << 2);
return ((cp[h >> 1] & mask) == mask);
case 1:
mask = 0x03 << ((h & 0x3) << 1);
return ((cp[h >> 2] & mask) == mask);
case 0:
mask = 0x01 << (h & 0x7);
return ((cp[h >> 3] & mask) == mask);
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* check if a block is completely allocated
* returns true if all the corresponding bits in the free map are 0
* returns false if any corresponding bit in the free map is 1
*/
int
ffs_isfreeblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
return (cp[h] == 0);
case 2:
return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
case 1:
return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
case 0:
return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* take a block out of the map
*/
void
ffs_clrblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
cp[h] = 0;
return;
case 2:
cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
return;
case 1:
cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
return;
case 0:
cp[h >> 3] &= ~(0x01 << (h & 0x7));
return;
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* put a block into the map
*/
void
ffs_setblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
cp[h] = 0xff;
return;
case 2:
cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
return;
case 1:
cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
return;
case 0:
cp[h >> 3] |= (0x01 << (h & 0x7));
return;
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
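/*
 * Illustrative sketch (not part of the original source): with fs_fragshift
 * == 1 there are two fragments per block, so block h occupies two bits of
 * the free map starting at bit (h & 0x3) << 1 of byte h >> 2.  A cylinder
 * group scan might use the helpers above like this; cg_blksfree() is
 * assumed to be the fs.h accessor for the fragment free map, and "cgp",
 * "needswap" and "h" are hypothetical locals.
 */
#if 0
	u_char *blksfree = cg_blksfree(cgp, needswap);

	if (ffs_isblock(fs, blksfree, h))
		ffs_clrblock(fs, blksfree, h);	/* take the whole block */
#endif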
/*
* Update the cluster map because of an allocation or free.
*
* Cnt == 1 means free; cnt == -1 means allocating.
*/
void
ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt)
{
int32_t *sump;
int32_t *lp;
u_char *freemapp, *mapp;
int i, start, end, forw, back, map;
unsigned int bit;
const int needswap = UFS_FSNEEDSWAP(fs);
/* KASSERT(mutex_owned(&ump->um_lock)); */
if (fs->fs_contigsumsize <= 0)
return;
freemapp = cg_clustersfree(cgp, needswap);
sump = cg_clustersum(cgp, needswap);
/*
* Allocate or clear the actual block.
*/
if (cnt > 0)
setbit(freemapp, blkno);
else
clrbit(freemapp, blkno);
/*
* Find the size of the cluster going forward.
*/
start = blkno + 1;
end = start + fs->fs_contigsumsize;
	if ((uint32_t)end >= ufs_rw32(cgp->cg_nclusterblks, needswap))
		end = ufs_rw32(cgp->cg_nclusterblks, needswap);
mapp = &freemapp[start / NBBY];
map = *mapp++;
bit = 1U << ((unsigned int)start % NBBY);
for (i = start; i < end; i++) {
if ((map & bit) == 0)
break;
if ((i & (NBBY - 1)) != (NBBY - 1)) {
bit <<= 1;
} else {
map = *mapp++;
bit = 1;
}
}
forw = i - start;
/*
* Find the size of the cluster going backward.
*/
start = blkno - 1;
end = start - fs->fs_contigsumsize;
if (end < 0)
end = -1;
mapp = &freemapp[start / NBBY];
map = *mapp--;
bit = 1U << ((unsigned int)start % NBBY);
for (i = start; i > end; i--) {
if ((map & bit) == 0)
break;
if ((i & (NBBY - 1)) != 0) {
bit >>= 1;
} else {
map = *mapp--;
bit = 1U << (NBBY - 1);
}
}
back = start - i;
/*
* Account for old cluster and the possibly new forward and
* back clusters.
*/
i = back + forw + 1;
if (i > fs->fs_contigsumsize)
i = fs->fs_contigsumsize;
ufs_add32(sump[i], cnt, needswap);
if (back > 0)
ufs_add32(sump[back], -cnt, needswap);
	if (forw > 0)
		ufs_add32(sump[forw], -cnt, needswap);
/*
* Update cluster summary information.
*/
lp = &sump[fs->fs_contigsumsize];
	for (i = fs->fs_contigsumsize; i > 0; i--)
		if (ufs_rw32(*lp--, needswap) > 0)
break;
#if defined(_KERNEL)
fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i;
#endif
}
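/*
 * Illustrative sketch (not part of the original source): keeping the
 * cluster summary in step with the block map.  "blkno" is the block number
 * within the cylinder group's cluster map (a hypothetical local).
 */
#if 0
	ffs_clusteracct(fs, cgp, blkno, 1);	/* a full block was freed */
	ffs_clusteracct(fs, cgp, blkno, -1);	/* a full block was allocated */
#endif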
/* $NetBSD: scsipi_ioctl.c,v 1.73 2019/12/27 09:41:51 msaitoh Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Contributed by HD Associates (hd@world.std.com).
* Copyright (c) 1992, 1993 HD Associates
*
* Berkeley style copyright.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scsipi_ioctl.c,v 1.73 2019/12/27 09:41:51 msaitoh Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_freebsd.h"
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/device.h>
#include <sys/fcntl.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsipiconf.h>
#include <dev/scsipi/scsipi_base.h>
#include <dev/scsipi/scsiconf.h>
#include <sys/scsiio.h>
#include "scsibus.h"
#include "atapibus.h"
struct scsi_ioctl {
LIST_ENTRY(scsi_ioctl) si_list;
struct buf si_bp;
struct uio si_uio;
struct iovec si_iov;
scsireq_t si_screq;
struct scsipi_periph *si_periph;
};
static LIST_HEAD(, scsi_ioctl) si_head;
static kmutex_t si_lock;
void
scsipi_ioctl_init(void)
{
mutex_init(&si_lock, MUTEX_DEFAULT, IPL_BIO);
}
static struct scsi_ioctl *
si_get(void)
{
struct scsi_ioctl *si;
si = malloc(sizeof(struct scsi_ioctl), M_TEMP, M_WAITOK|M_ZERO);
buf_init(&si->si_bp);
mutex_enter(&si_lock);
LIST_INSERT_HEAD(&si_head, si, si_list);
mutex_exit(&si_lock);
return (si);
}
static void
si_free(struct scsi_ioctl *si)
{
mutex_enter(&si_lock);
LIST_REMOVE(si, si_list);
mutex_exit(&si_lock);
buf_destroy(&si->si_bp);
free(si, M_TEMP);
}
static struct scsi_ioctl *
si_find(struct buf *bp)
{
struct scsi_ioctl *si;
mutex_enter(&si_lock);
for (si = si_head.lh_first; si != 0; si = si->si_list.le_next)
if (bp == &si->si_bp)
break;
mutex_exit(&si_lock);
return (si);
}
/*
 * We let the user interpret their own sense data in the generic SCSI world.
* This routine is called at interrupt time if the XS_CTL_USERCMD bit was set
* in the flags passed to scsi_scsipi_cmd(). No other completion processing
* takes place, even if we are running over another device driver.
* The lower level routines that call us here, will free the xs and restart
* the device's queue if such exists.
*/
void
scsipi_user_done(struct scsipi_xfer *xs)
{
struct buf *bp;
struct scsi_ioctl *si;
scsireq_t *screq;
struct scsipi_periph *periph = xs->xs_periph;
bp = xs->bp;
#ifdef DIAGNOSTIC
if (bp == NULL) {
scsipi_printaddr(periph);
printf("user command with no buf\n");
panic("scsipi_user_done");
}
#endif
si = si_find(bp);
#ifdef DIAGNOSTIC
if (si == NULL) {
scsipi_printaddr(periph);
printf("user command with no ioctl\n");
panic("scsipi_user_done");
}
#endif
screq = &si->si_screq;
SC_DEBUG(xs->xs_periph, SCSIPI_DB2, ("user-done\n"));
screq->retsts = 0;
screq->status = xs->status;
switch (xs->error) {
case XS_NOERROR:
SC_DEBUG(periph, SCSIPI_DB3, ("no error\n"));
screq->datalen_used =
xs->datalen - xs->resid; /* probably rubbish */
screq->retsts = SCCMD_OK;
break;
case XS_SENSE:
SC_DEBUG(periph, SCSIPI_DB3, ("have sense\n"));
screq->senselen_used = uimin(sizeof(xs->sense.scsi_sense),
SENSEBUFLEN);
memcpy(screq->sense, &xs->sense.scsi_sense,
screq->senselen_used);
screq->retsts = SCCMD_SENSE;
break;
case XS_SHORTSENSE:
SC_DEBUG(periph, SCSIPI_DB3, ("have short sense\n"));
screq->senselen_used = uimin(sizeof(xs->sense.atapi_sense),
SENSEBUFLEN);
memcpy(screq->sense, &xs->sense.atapi_sense,
screq->senselen_used);
screq->retsts = SCCMD_UNKNOWN; /* XXX need a shortsense here */
break;
case XS_DRIVER_STUFFUP:
scsipi_printaddr(periph);
printf("passthrough: adapter inconsistency\n");
screq->retsts = SCCMD_UNKNOWN;
break;
case XS_SELTIMEOUT:
SC_DEBUG(periph, SCSIPI_DB3, ("seltimeout\n"));
screq->retsts = SCCMD_TIMEOUT;
break;
case XS_TIMEOUT:
SC_DEBUG(periph, SCSIPI_DB3, ("timeout\n"));
screq->retsts = SCCMD_TIMEOUT;
break;
case XS_BUSY:
SC_DEBUG(periph, SCSIPI_DB3, ("busy\n"));
screq->retsts = SCCMD_BUSY;
break;
default:
scsipi_printaddr(periph);
printf("unknown error category %d from adapter\n",
xs->error);
screq->retsts = SCCMD_UNKNOWN;
break;
}
if (xs->xs_control & XS_CTL_ASYNC) {
mutex_enter(chan_mtx(periph->periph_channel));
scsipi_put_xs(xs);
mutex_exit(chan_mtx(periph->periph_channel));
}
}
/* Pseudo strategy function
* Called by scsipi_do_ioctl() via physio/physstrat if there is to
* be data transferred, and directly if there is no data transfer.
*
* Should I reorganize this so it returns to physio instead
* of sleeping in scsiio_scsipi_cmd? Is there any advantage, other
* than avoiding the probable duplicate wakeup in iodone? [PD]
*
* No, seems ok to me... [JRE]
* (I don't see any duplicate wakeups)
*
* Can't be used with block devices or raw_read/raw_write directly
* from the cdevsw/bdevsw tables because they couldn't have added
* the screq structure. [JRE]
*/
static void
scsistrategy(struct buf *bp)
{
struct scsi_ioctl *si;
scsireq_t *screq;
struct scsipi_periph *periph;
int error;
int flags = 0;
si = si_find(bp);
if (si == NULL) {
printf("scsistrategy: "
"No matching ioctl request found in queue\n");
error = EINVAL;
goto done;
}
screq = &si->si_screq;
periph = si->si_periph;
SC_DEBUG(periph, SCSIPI_DB2, ("user_strategy\n"));
/*
* We're in trouble if physio tried to break up the transfer.
*/
if (bp->b_bcount != screq->datalen) {
scsipi_printaddr(periph);
printf("physio split the request.. cannot proceed\n");
error = EIO;
goto done;
}
if (screq->timeout == 0) {
error = EINVAL;
goto done;
}
if (screq->cmdlen > sizeof(struct scsipi_generic)) {
scsipi_printaddr(periph);
printf("cmdlen too big\n");
error = EFAULT;
goto done;
}
if ((screq->flags & SCCMD_READ) && screq->datalen > 0)
flags |= XS_CTL_DATA_IN;
if ((screq->flags & SCCMD_WRITE) && screq->datalen > 0)
flags |= XS_CTL_DATA_OUT;
if (screq->flags & SCCMD_TARGET)
flags |= XS_CTL_TARGET;
if (screq->flags & SCCMD_ESCAPE)
flags |= XS_CTL_ESCAPE;
error = scsipi_command(periph, (void *)screq->cmd, screq->cmdlen,
(void *)bp->b_data, screq->datalen,
0, /* user must do the retries *//* ignored */
screq->timeout, bp, flags | XS_CTL_USERCMD);
done:
if (error)
bp->b_resid = bp->b_bcount;
bp->b_error = error;
biodone(bp);
return;
}
/*
* Something (e.g. another driver) has called us
* with a periph and a scsi-specific ioctl to perform,
* better try. If user-level type command, we must
* still be running in the context of the calling process
*/
int
scsipi_do_ioctl(struct scsipi_periph *periph, dev_t dev, u_long cmd,
void *addr, int flag, struct lwp *l)
{
int error;
SC_DEBUG(periph, SCSIPI_DB2, ("scsipi_do_ioctl(0x%lx)\n", cmd));
if (addr == NULL)
return EINVAL;
/* Check for the safe-ness of this request. */
switch (cmd) {
case OSCIOCIDENTIFY:
case SCIOCIDENTIFY:
break;
case SCIOCCOMMAND:
if ((((scsireq_t *)addr)->flags & SCCMD_READ) == 0 &&
(flag & FWRITE) == 0)
return (EBADF);
break;
default:
if ((flag & FWRITE) == 0)
return (EBADF);
}
switch (cmd) {
case SCIOCCOMMAND: {
scsireq_t *screq = (scsireq_t *)addr;
struct scsi_ioctl *si;
int len;
len = screq->datalen;
/*
* If there is data, there must be a data buffer and a direction specified
*/
if (len > 0 && (screq->databuf == NULL ||
(screq->flags & (SCCMD_READ|SCCMD_WRITE)) == 0))
return (EINVAL);
si = si_get();
si->si_screq = *screq;
si->si_periph = periph;
if (len) {
si->si_iov.iov_base = screq->databuf;
si->si_iov.iov_len = len;
si->si_uio.uio_iov = &si->si_iov;
si->si_uio.uio_iovcnt = 1;
si->si_uio.uio_resid = len;
si->si_uio.uio_offset = 0;
si->si_uio.uio_rw =
(screq->flags & SCCMD_READ) ? UIO_READ : UIO_WRITE;
if ((flag & FKIOCTL) == 0) {
si->si_uio.uio_vmspace = l->l_proc->p_vmspace;
} else {
UIO_SETUP_SYSSPACE(&si->si_uio);
}
error = physio(scsistrategy, &si->si_bp, dev,
(screq->flags & SCCMD_READ) ? B_READ : B_WRITE,
periph->periph_channel->chan_adapter->adapt_minphys,
&si->si_uio);
} else {
/* if no data, no need to translate it.. */
si->si_bp.b_flags = 0;
si->si_bp.b_data = 0;
si->si_bp.b_bcount = 0;
si->si_bp.b_dev = dev;
si->si_bp.b_proc = l->l_proc;
scsistrategy(&si->si_bp);
error = si->si_bp.b_error;
}
*screq = si->si_screq;
si_free(si);
return (error);
}
case SCIOCDEBUG: {
int level = *((int *)addr);
SC_DEBUG(periph, SCSIPI_DB3, ("debug set to %d\n", level));
periph->periph_dbflags = 0;
if (level & 1)
periph->periph_dbflags |= SCSIPI_DB1;
if (level & 2)
periph->periph_dbflags |= SCSIPI_DB2;
if (level & 4)
periph->periph_dbflags |= SCSIPI_DB3;
if (level & 8)
periph->periph_dbflags |= SCSIPI_DB4;
return (0);
}
case SCIOCRECONFIG:
case SCIOCDECONFIG:
return (EINVAL);
case SCIOCIDENTIFY: {
struct scsi_addr *sca = (struct scsi_addr *)addr;
switch (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(periph))) {
case SCSIPI_BUSTYPE_SCSI:
sca->type = TYPE_SCSI;
sca->addr.scsi.scbus =
device_unit(device_parent(periph->periph_dev));
sca->addr.scsi.target = periph->periph_target;
sca->addr.scsi.lun = periph->periph_lun;
return (0);
case SCSIPI_BUSTYPE_ATAPI:
sca->type = TYPE_ATAPI;
sca->addr.atapi.atbus =
device_unit(device_parent(periph->periph_dev));
sca->addr.atapi.drive = periph->periph_target;
return (0);
}
return (ENXIO);
}
#if defined(COMPAT_12) || defined(COMPAT_FREEBSD)
/* SCIOCIDENTIFY from before the ATAPI stuff was merged */
case OSCIOCIDENTIFY: {
struct oscsi_addr *sca = (struct oscsi_addr *)addr;
switch (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(periph))) {
case SCSIPI_BUSTYPE_SCSI:
sca->scbus =
device_unit(device_parent(periph->periph_dev));
sca->target = periph->periph_target;
sca->lun = periph->periph_lun;
return (0);
}
return (ENODEV);
}
#endif
default:
return (ENOTTY);
}
#ifdef DIAGNOSTIC
panic("scsipi_do_ioctl: impossible");
#endif
}
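/*
* A minimal userland sketch (kept under #if 0) of driving the
* SCIOCCOMMAND path above: issue a SCSI INQUIRY through the
* passthrough interface. An already-open device file descriptor, the
* <sys/scsiio.h> header name and the INQUIRY CDB layout are
* assumptions of this example; only the scsireq_t fields used by the
* code above are touched.
*/
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/scsiio.h>
#include <string.h>

static int
do_inquiry(int fd, void *buf, size_t len)
{
	scsireq_t req;

	memset(&req, 0, sizeof(req));
	req.cmd[0] = 0x12;		/* INQUIRY, 6-byte CDB */
	req.cmd[4] = (u_char)len;	/* allocation length, <= 255 */
	req.cmdlen = 6;
	req.databuf = buf;
	req.datalen = len;
	req.flags = SCCMD_READ;		/* data moves device -> host */
	req.timeout = 10000;		/* milliseconds */

	if (ioctl(fd, SCIOCCOMMAND, &req) == -1)
		return -1;
	/* req.retsts reports the command status (e.g. SCCMD_TIMEOUT). */
	return 0;
}
#endif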
/* $NetBSD: lfs_accessors.h,v 1.51 2022/04/24 20:32:44 rillig Exp $ */
/* from NetBSD: lfs.h,v 1.165 2015/07/24 06:59:32 dholland Exp */
/* from NetBSD: dinode.h,v 1.25 2016/01/22 23:06:10 dholland Exp */
/* from NetBSD: dir.h,v 1.25 2015/09/01 06:16:03 dholland Exp */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Konrad E. Schroder <perseant@hhhh.org>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)lfs.h 8.9 (Berkeley) 5/8/95
*/
/*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Copyright (c) 1982, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)dinode.h 8.9 (Berkeley) 3/29/95
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)dir.h 8.5 (Berkeley) 4/27/95
*/
#ifndef _UFS_LFS_LFS_ACCESSORS_H_
#define _UFS_LFS_LFS_ACCESSORS_H_
#if defined(_KERNEL_OPT)
#include "opt_lfs.h"
#endif
#include <sys/bswap.h>
#include <ufs/lfs/lfs.h>
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#define KASSERT assert
#else
#include <sys/systm.h>
#endif
/*
* STRUCT_LFS is used by the libsa code to get accessors that work
* with struct salfs instead of struct lfs, and by the cleaner to
* get accessors that work with struct clfs.
*/
#ifndef STRUCT_LFS
#define STRUCT_LFS struct lfs
#endif
/*
* byte order
*/
/*
* For now at least, the bootblocks shall not be endian-independent.
* We can see later if it fits in the size budget. Also disable the
* byteswapping if LFS_EI is off.
*
* Caution: these functions "know" that bswap16/32/64 are unsigned,
* and will likely break silently if that changes.
*/
#if defined(_STANDALONE) || (defined(_KERNEL) && !defined(LFS_EI))
#define LFS_SWAP_int16_t(fs, val) (val)
#define LFS_SWAP_int32_t(fs, val) (val)
#define LFS_SWAP_int64_t(fs, val) (val)
#define LFS_SWAP_uint16_t(fs, val) (val)
#define LFS_SWAP_uint32_t(fs, val) (val)
#define LFS_SWAP_uint64_t(fs, val) (val)
#else
#define LFS_SWAP_int16_t(fs, val) \
((fs)->lfs_dobyteswap ? (int16_t)bswap16(val) : (val))
#define LFS_SWAP_int32_t(fs, val) \
((fs)->lfs_dobyteswap ? (int32_t)bswap32(val) : (val))
#define LFS_SWAP_int64_t(fs, val) \
((fs)->lfs_dobyteswap ? (int64_t)bswap64(val) : (val))
#define LFS_SWAP_uint16_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap16(val) : (val))
#define LFS_SWAP_uint32_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap32(val) : (val))
#define LFS_SWAP_uint64_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap64(val) : (val))
#endif
/*
* For handling directories we will need to know if the volume is
* little-endian.
*/
#if BYTE_ORDER == LITTLE_ENDIAN
#define LFS_LITTLE_ENDIAN_ONDISK(fs) (!(fs)->lfs_dobyteswap)
#else
#define LFS_LITTLE_ENDIAN_ONDISK(fs) ((fs)->lfs_dobyteswap)
#endif
/*
* Suppress spurious warnings -- we use
*
* type *foo = &obj->member;
*
* in macros to verify that obj->member has the right type. When the
* object is a packed structure with misaligned members, this causes
* some compilers to squeal that taking the address might lead to
* undefined behaviour later on -- which is helpful in general, but not
* relevant in this case, because we don't do anything with foo
* afterward; we only declare it to get a type check and then we
* discard it.
*/
#ifdef __GNUC__
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Waddress-of-packed-member"
#elif __GNUC_PREREQ__(9,0)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
#endif
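/*
* As an illustration of that pattern, the setter generated further
* below by LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, uid) contains
* roughly:
*
*	uint32_t *p = &dip->u_32.di_uid;	(type check only)
*	(void)p;
*	dip->u_32.di_uid = LFS_SWAP_uint32_t(fs, val);
*
* With a packed dinode structure, the first line is what provokes
* -Waddress-of-packed-member, hence the pragmas above.
*/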
/*
* directories
*/
#define LFS_DIRHEADERSIZE(fs) \
((fs)->lfs_is64 ? sizeof(struct lfs_dirheader64) : sizeof(struct lfs_dirheader32))
/*
* The LFS_DIRSIZ macro gives the minimum record length which will hold
* the directory entry. This requires the amount of space in struct lfs_direct
* without the d_name field, plus enough space for the name with a terminating
* null byte (dp->d_namlen+1), rounded up to a 4 byte boundary.
*/
#define LFS_DIRECTSIZ(fs, namlen) \
(LFS_DIRHEADERSIZE(fs) + (((namlen)+1 + 3) &~ 3))
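/*
* For illustration: on a 32-bit (lfs32) volume, an entry whose name is
* 11 bytes long needs sizeof(struct lfs_dirheader32) plus
* roundup(11 + 1, 4) = 12 bytes. The entry's actual record length
* (dh_reclen) may be larger, since an entry also absorbs any free
* space that follows it within its directory block.
*/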
/*
* The size of the largest possible directory entry. This is
* used by ulfs_dirhash to figure the size of an array, so we
* need a single constant value true for both lfs32 and lfs64.
*/
#define LFS_MAXDIRENTRYSIZE \
(sizeof(struct lfs_dirheader64) + (((LFS_MAXNAMLEN+1)+1 + 3) & ~3))
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define LFS_OLDDIRSIZ(oldfmt, dp, needswap) \
(((oldfmt) && !(needswap)) ? \
LFS_DIRECTSIZ((dp)->d_type) : LFS_DIRECTSIZ((dp)->d_namlen))
#else
#define LFS_OLDDIRSIZ(oldfmt, dp, needswap) \
(((oldfmt) && (needswap)) ? \
LFS_DIRECTSIZ((dp)->d_type) : LFS_DIRECTSIZ((dp)->d_namlen))
#endif
#define LFS_DIRSIZ(fs, dp) LFS_DIRECTSIZ(fs, lfs_dir_getnamlen(fs, dp))
/* Constants for the first argument of LFS_OLDDIRSIZ */
#define LFS_OLDDIRFMT 1
#define LFS_NEWDIRFMT 0
#define LFS_NEXTDIR(fs, dp) \
((LFS_DIRHEADER *)((char *)(dp) + lfs_dir_getreclen(fs, dp)))
static __inline char *
lfs_dir_nameptr(const STRUCT_LFS *fs, LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return (char *)(&dh->u_64 + 1);
} else {
return (char *)(&dh->u_32 + 1);
}
}
static __inline uint64_t
lfs_dir_getino(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return LFS_SWAP_uint64_t(fs, dh->u_64.dh_ino);
} else {
return LFS_SWAP_uint32_t(fs, dh->u_32.dh_ino);
}
}
static __inline uint16_t
lfs_dir_getreclen(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return LFS_SWAP_uint16_t(fs, dh->u_64.dh_reclen);
} else {
return LFS_SWAP_uint16_t(fs, dh->u_32.dh_reclen);
}
}
static __inline uint8_t
lfs_dir_gettype(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
return dh->u_64.dh_type;
} else if (fs->lfs_hasolddirfmt) {
return LFS_DT_UNKNOWN;
} else {
return dh->u_32.dh_type;
}
}
static __inline uint8_t
lfs_dir_getnamlen(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
return dh->u_64.dh_namlen;
} else if (fs->lfs_hasolddirfmt && LFS_LITTLE_ENDIAN_ONDISK(fs)) {
/* low-order byte of old 16-bit namlen field */
return dh->u_32.dh_type;
} else {
return dh->u_32.dh_namlen;
}
}
static __inline void
lfs_dir_setino(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint64_t ino)
{
if (fs->lfs_is64) {
dh->u_64.dh_ino = LFS_SWAP_uint64_t(fs, ino);
} else {
dh->u_32.dh_ino = LFS_SWAP_uint32_t(fs, ino);
}
}
static __inline void
lfs_dir_setreclen(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint16_t reclen)
{
if (fs->lfs_is64) {
dh->u_64.dh_reclen = LFS_SWAP_uint16_t(fs, reclen);
} else {
dh->u_32.dh_reclen = LFS_SWAP_uint16_t(fs, reclen);
}
}
static __inline void
lfs_dir_settype(const STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint8_t type)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
dh->u_64.dh_type = type;
} else if (fs->lfs_hasolddirfmt) {
/* do nothing */
return;
} else {
dh->u_32.dh_type = type;
}
}
static __inline void
lfs_dir_setnamlen(const STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint8_t namlen)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
dh->u_64.dh_namlen = namlen;
} else if (fs->lfs_hasolddirfmt && LFS_LITTLE_ENDIAN_ONDISK(fs)) {
/* low-order byte of old 16-bit namlen field */
dh->u_32.dh_type = namlen;
} else {
dh->u_32.dh_namlen = namlen;
}
}
static __inline void
lfs_copydirname(STRUCT_LFS *fs, char *dest, const char *src,
unsigned namlen, unsigned reclen)
{
unsigned spacelen;
KASSERT(reclen > LFS_DIRHEADERSIZE(fs));
spacelen = reclen - LFS_DIRHEADERSIZE(fs);
/* there must always be at least one spare byte for the NUL terminator */
KASSERT(spacelen > namlen);
memcpy(dest, src, namlen);
memset(dest + namlen, '\0', spacelen - namlen);
}
static __inline LFS_DIRHEADER *
lfs_dirtemplate_dotdot(STRUCT_LFS *fs, union lfs_dirtemplate *dt)
{
/* XXX blah, be nice to have a way to do this w/o casts */
if (fs->lfs_is64) {
return (LFS_DIRHEADER *)&dt->u_64.dotdot_header;
} else {
return (LFS_DIRHEADER *)&dt->u_32.dotdot_header;
}
}
static __inline char *
lfs_dirtemplate_dotdotname(STRUCT_LFS *fs, union lfs_dirtemplate *dt)
{
if (fs->lfs_is64) {
return dt->u_64.dotdot_name;
} else {
return dt->u_32.dotdot_name;
}
}
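/*
* A minimal sketch (kept under #if 0) of walking the entries in one
* directory block with the accessors above. "dirbuf" and "dirblksiz"
* are assumed to be supplied by the caller; names of live entries are
* NUL-terminated because lfs_copydirname() always pads with zeros.
*/
#if 0
static void
lfs_dir_walk_example(STRUCT_LFS *fs, void *dirbuf, unsigned dirblksiz)
{
	LFS_DIRHEADER *dh = dirbuf;
	char *end = (char *)dirbuf + dirblksiz;

	while ((char *)dh < end) {
		if (lfs_dir_getino(fs, dh) != 0)
			printf("ino %ju name %s\n",
			    (uintmax_t)lfs_dir_getino(fs, dh),
			    lfs_dir_nameptr(fs, dh));
		if (lfs_dir_getreclen(fs, dh) == 0)
			break;		/* corrupt block; don't loop forever */
		dh = LFS_NEXTDIR(fs, dh);
	}
}
#endif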
/*
* dinodes
*/
/*
* Maximum length of a symlink that can be stored within the inode.
*/
#define LFS32_MAXSYMLINKLEN ((ULFS_NDADDR + ULFS_NIADDR) * sizeof(int32_t))
#define LFS64_MAXSYMLINKLEN ((ULFS_NDADDR + ULFS_NIADDR) * sizeof(int64_t))
#define LFS_MAXSYMLINKLEN(fs) \
((fs)->lfs_is64 ? LFS64_MAXSYMLINKLEN : LFS32_MAXSYMLINKLEN)
#define DINOSIZE(fs) ((fs)->lfs_is64 ? sizeof(struct lfs64_dinode) : sizeof(struct lfs32_dinode))
#define DINO_IN_BLOCK(fs, base, ix) \
((union lfs_dinode *)((char *)(base) + DINOSIZE(fs) * (ix)))
static __inline void
lfs_copy_dinode(STRUCT_LFS *fs,
union lfs_dinode *dst, const union lfs_dinode *src)
{
/*
* We can do structure assignment of the structs, but not of
* the whole union, as the union is the size of the (larger)
* 64-bit struct and on a 32-bit fs the upper half of it might
* be off the end of a buffer or otherwise invalid.
*/
if (fs->lfs_is64) {
dst->u_64 = src->u_64;
} else {
dst->u_32 = src->u_32;
}
}
#define LFS_DEF_DINO_ACCESSOR(type, type32, field) \
static __inline type \
lfs_dino_get##field(STRUCT_LFS *fs, union lfs_dinode *dip) \
{ \
if (fs->lfs_is64) { \
return LFS_SWAP_##type(fs, dip->u_64.di_##field); \
} else { \
return LFS_SWAP_##type32(fs, dip->u_32.di_##field); \
} \
} \
static __inline void \
lfs_dino_set##field(STRUCT_LFS *fs, union lfs_dinode *dip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &dip->u_64.di_##field; \
(void)p; \
dip->u_64.di_##field = LFS_SWAP_##type(fs, val); \
} else { \
type32 *p = &dip->u_32.di_##field; \
(void)p; \
dip->u_32.di_##field = LFS_SWAP_##type32(fs, val); \
} \
}
LFS_DEF_DINO_ACCESSOR(uint16_t, uint16_t, mode)
LFS_DEF_DINO_ACCESSOR(int16_t, int16_t, nlink)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint32_t, inumber)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint64_t, size)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, atime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, atimensec)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, mtime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, mtimensec)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, ctime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, ctimensec)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, flags)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint32_t, blocks)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, gen)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, uid)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, gid)
/* XXX this should be done differently (it's a fake field) */
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, rdev)
static __inline daddr_t
lfs_dino_getdb(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix)
{
KASSERT(ix < ULFS_NDADDR);
if (fs->lfs_is64) {
return LFS_SWAP_int64_t(fs, dip->u_64.di_db[ix]);
} else {
/* note: this must sign-extend or UNWRITTEN gets trashed */
return (int32_t)LFS_SWAP_int32_t(fs, dip->u_32.di_db[ix]);
}
}
static __inline daddr_t
lfs_dino_getib(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix)
{
KASSERT(ix < ULFS_NIADDR);
if (fs->lfs_is64) {
return LFS_SWAP_int64_t(fs, dip->u_64.di_ib[ix]);
} else {
/* note: this must sign-extend or UNWRITTEN gets trashed */
return (int32_t)LFS_SWAP_int32_t(fs, dip->u_32.di_ib[ix]);
}
}
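/*
* The sign extension in the 32-bit cases above matters because block
* addresses such as UNWRITTEN are negative sentinel values: read back
* as unsigned 32-bit quantities they would turn into huge positive
* daddr_t values instead of the same negative numbers.
*/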
static __inline void
lfs_dino_setdb(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix, daddr_t val)
{
KASSERT(ix < ULFS_NDADDR);
if (fs->lfs_is64) {
dip->u_64.di_db[ix] = LFS_SWAP_int64_t(fs, val);
} else {
dip->u_32.di_db[ix] = LFS_SWAP_uint32_t(fs, val);
}
}
static __inline void
lfs_dino_setib(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix, daddr_t val)
{
KASSERT(ix < ULFS_NIADDR);
if (fs->lfs_is64) {
dip->u_64.di_ib[ix] = LFS_SWAP_int64_t(fs, val);
} else {
dip->u_32.di_ib[ix] = LFS_SWAP_uint32_t(fs, val);
}
}
/* birthtime is present only in the 64-bit inode */
static __inline void
lfs_dino_setbirthtime(STRUCT_LFS *fs, union lfs_dinode *dip,
const struct timespec *ts)
{
if (fs->lfs_is64) {
dip->u_64.di_birthtime = ts->tv_sec;
dip->u_64.di_birthnsec = ts->tv_nsec;
} else {
/* drop it on the floor */
}
}
/*
* indirect blocks
*/
static __inline daddr_t
lfs_iblock_get(STRUCT_LFS *fs, void *block, unsigned ix)
{
if (fs->lfs_is64) {
// XXX re-enable these asserts after reorging this file
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int64_t));
return (daddr_t)(((int64_t *)block)[ix]);
} else {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int32_t));
/* must sign-extend or UNWRITTEN gets trashed */
return (daddr_t)(int64_t)(((int32_t *)block)[ix]);
}
}
static __inline void
lfs_iblock_set(STRUCT_LFS *fs, void *block, unsigned ix, daddr_t val)
{
if (fs->lfs_is64) {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int64_t));
((int64_t *)block)[ix] = val;
} else {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int32_t));
((int32_t *)block)[ix] = val;
}
}
/*
* "struct buf" associated definitions
*/
# define LFS_LOCK_BUF(bp) do { \
if (((bp)->b_flags & B_LOCKED) == 0 && bp->b_iodone == NULL) { \
mutex_enter(&lfs_lock); \
++locked_queue_count; \
locked_queue_bytes += bp->b_bufsize; \
mutex_exit(&lfs_lock); \
} \
(bp)->b_flags |= B_LOCKED; \
} while (0)
# define LFS_UNLOCK_BUF(bp) do { \
if (((bp)->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) { \
mutex_enter(&lfs_lock); \
--locked_queue_count; \
locked_queue_bytes -= bp->b_bufsize; \
if (locked_queue_count < LFS_WAIT_BUFS && \
locked_queue_bytes < LFS_WAIT_BYTES) \
cv_broadcast(&locked_queue_cv); \
mutex_exit(&lfs_lock); \
} \
(bp)->b_flags &= ~B_LOCKED; \
} while (0)
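/*
* In rough terms these are used in pairs over the life of a dirty LFS
* buffer: LFS_LOCK_BUF when the buffer is queued for the log (it is
* pinned and charged to locked_queue_{count,bytes}), and
* LFS_UNLOCK_BUF once it has been written, at which point the
* accounting is dropped and writers waiting for the locked queue to
* drain may be woken.
*/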
/*
* "struct inode" associated definitions
*/
#define LFS_SET_UINO(ip, states) do { \
if (((states) & IN_ACCESSED) && !((ip)->i_state & IN_ACCESSED)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
if (((states) & IN_CLEANING) && !((ip)->i_state & IN_CLEANING)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
if (((states) & IN_MODIFIED) && !((ip)->i_state & IN_MODIFIED)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
(ip)->i_state |= (states); \
} while (0)
#define LFS_CLR_UINO(ip, states) do { \
if (((states) & IN_ACCESSED) && ((ip)->i_state & IN_ACCESSED)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
if (((states) & IN_CLEANING) && ((ip)->i_state & IN_CLEANING)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
if (((states) & IN_MODIFIED) && ((ip)->i_state & IN_MODIFIED)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
(ip)->i_state &= ~(states); \
if (lfs_sb_getuinodes((ip)->i_lfs) < 0) { \
panic("lfs_uinodes < 0"); \
} \
} while (0)
#define LFS_ITIMES(ip, acc, mod, cre) \
while ((ip)->i_state & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY)) \
lfs_itimes(ip, acc, mod, cre)
/*
* On-disk and in-memory checkpoint segment usage structure.
*/
#define SEGUPB(fs) (lfs_sb_getsepb(fs))
#define SEGTABSIZE_SU(fs) \
((lfs_sb_getnseg(fs) + SEGUPB(fs) - 1) / lfs_sb_getsepb(fs))
#ifdef _KERNEL
# define SHARE_IFLOCK(F) \
do { \
rw_enter(&(F)->lfs_iflock, RW_READER); \
} while(0)
# define UNSHARE_IFLOCK(F) \
do { \
rw_exit(&(F)->lfs_iflock); \
} while(0)
#else /* ! _KERNEL */
# define SHARE_IFLOCK(F)
# define UNSHARE_IFLOCK(F)
#endif /* ! _KERNEL */
/* Read in the block with a specific segment usage entry from the ifile. */
#define LFS_SEGENTRY(SP, F, IN, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
if ((_e = bread((F)->lfs_ivnode, \
((IN) / lfs_sb_getsepb(F)) + lfs_sb_getcleansz(F), \
lfs_sb_getbsize(F), 0, &(BP))) != 0) \
panic("lfs: ifile read: segentry %llu: error %d\n", \
(unsigned long long)(IN), _e); \
if (lfs_sb_getversion(F) == 1) \
(SP) = (SEGUSE *)((SEGUSE_V1 *)(BP)->b_data + \
((IN) & (lfs_sb_getsepb(F) - 1))); \
else \
(SP) = (SEGUSE *)(BP)->b_data + ((IN) % lfs_sb_getsepb(F)); \
UNSHARE_IFLOCK(F); \
} while (0)
#define LFS_WRITESEGENTRY(SP, F, IN, BP) do { \
if ((SP)->su_nbytes == 0) \
(SP)->su_flags |= SEGUSE_EMPTY; \
else \
(SP)->su_flags &= ~SEGUSE_EMPTY; \
(F)->lfs_suflags[(F)->lfs_activesb][(IN)] = (SP)->su_flags; \
LFS_BWRITE_LOG(BP); \
} while (0)
/*
* FINFO (file info) entries.
*/
/* Size of an on-disk block pointer, e.g. in an indirect block. */
/* XXX: move to a more suitable location in this file */
#define LFS_BLKPTRSIZE(fs) ((fs)->lfs_is64 ? sizeof(int64_t) : sizeof(int32_t))
/* Size of an on-disk inode number. */
/* XXX: move to a more suitable location in this file */
#define LFS_INUMSIZE(fs) ((fs)->lfs_is64 ? sizeof(int64_t) : sizeof(int32_t))
/* size of a FINFO, without the block pointers */
#define FINFOSIZE(fs) ((fs)->lfs_is64 ? sizeof(FINFO64) : sizeof(FINFO32))
/* Full size of the provided FINFO record, including its block pointers. */
#define FINFO_FULLSIZE(fs, fip) \
(FINFOSIZE(fs) + lfs_fi_getnblocks(fs, fip) * LFS_BLKPTRSIZE(fs))
#define NEXT_FINFO(fs, fip) \
((FINFO *)((char *)(fip) + FINFO_FULLSIZE(fs, fip)))
#define LFS_DEF_FI_ACCESSOR(type, type32, field) \
static __inline type \
lfs_fi_get##field(STRUCT_LFS *fs, FINFO *fip) \
{ \
if (fs->lfs_is64) { \
return fip->u_64.fi_##field; \
} else { \
return fip->u_32.fi_##field; \
} \
} \
static __inline void \
lfs_fi_set##field(STRUCT_LFS *fs, FINFO *fip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &fip->u_64.fi_##field; \
(void)p; \
fip->u_64.fi_##field = val; \
} else { \
type32 *p = &fip->u_32.fi_##field; \
(void)p; \
fip->u_32.fi_##field = val; \
} \
}
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, nblocks)
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, version)
LFS_DEF_FI_ACCESSOR(uint64_t, uint32_t, ino)
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, lastlength)
static __inline daddr_t
lfs_fi_getblock(STRUCT_LFS *fs, FINFO *fip, unsigned idx)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
KASSERT(idx < lfs_fi_getnblocks(fs, fip));
if (fs->lfs_is64) {
return ((int64_t *)firstblock)[idx];
} else {
return ((int32_t *)firstblock)[idx];
}
}
static __inline void
lfs_fi_setblock(STRUCT_LFS *fs, FINFO *fip, unsigned idx, daddr_t blk)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
KASSERT(idx < lfs_fi_getnblocks(fs, fip));
if (fs->lfs_is64) {
((int64_t *)firstblock)[idx] = blk;
} else {
((int32_t *)firstblock)[idx] = blk;
}
}
/*
* inode info entries (in the segment summary)
*/
#define IINFOSIZE(fs) ((fs)->lfs_is64 ? sizeof(IINFO64) : sizeof(IINFO32))
/* iinfos scroll backward from the end of the segment summary block */
#define SEGSUM_IINFOSTART(fs, buf) \
((IINFO *)((char *)buf + lfs_sb_getsumsize(fs) - IINFOSIZE(fs)))
#define NEXTLOWER_IINFO(fs, iip) \
((IINFO *)((char *)(iip) - IINFOSIZE(fs)))
#define NTH_IINFO(fs, buf, n) \
((IINFO *)((char *)SEGSUM_IINFOSTART(fs, buf) - (n)*IINFOSIZE(fs)))
static __inline uint64_t
lfs_ii_getblock(STRUCT_LFS *fs, IINFO *iip)
{
if (fs->lfs_is64) {
return iip->u_64.ii_block;
} else {
return iip->u_32.ii_block;
}
}
static __inline void
lfs_ii_setblock(STRUCT_LFS *fs, IINFO *iip, uint64_t block)
{
if (fs->lfs_is64) {
iip->u_64.ii_block = block;
} else {
iip->u_32.ii_block = block;
}
}
/*
* Index file inode entries.
*/
#define IFILE_ENTRYSIZE(fs) \
((fs)->lfs_is64 ? sizeof(IFILE64) : sizeof(IFILE32))
/*
* LFSv1 compatibility code is not allowed to touch if_atime, since it
* may not be mapped!
*/
/* Read in the block with a specific inode from the ifile. */
#define LFS_IENTRY(IP, F, IN, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
if ((_e = bread((F)->lfs_ivnode, \
(IN) / lfs_sb_getifpb(F) + lfs_sb_getcleansz(F) + lfs_sb_getsegtabsz(F), \
lfs_sb_getbsize(F), 0, &(BP))) != 0) \
panic("lfs: ifile ino %d read %d", (int)(IN), _e); \
if ((F)->lfs_is64) { \
(IP) = (IFILE *)((IFILE64 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} else if (lfs_sb_getversion(F) > 1) { \
(IP) = (IFILE *)((IFILE32 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} else { \
(IP) = (IFILE *)((IFILE_V1 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} \
UNSHARE_IFLOCK(F); \
} while (0)
#define LFS_IENTRY_NEXT(IP, F) do { \
if ((F)->lfs_is64) { \
(IP) = (IFILE *)((IFILE64 *)(IP) + 1); \
} else if (lfs_sb_getversion(F) > 1) { \
(IP) = (IFILE *)((IFILE32 *)(IP) + 1); \
} else { \
(IP) = (IFILE *)((IFILE_V1 *)(IP) + 1); \
} \
} while (0)
#define LFS_DEF_IF_ACCESSOR(type, type32, field) \
static __inline type \
lfs_if_get##field(STRUCT_LFS *fs, IFILE *ifp) \
{ \
if (fs->lfs_is64) { \
return ifp->u_64.if_##field; \
} else { \
return ifp->u_32.if_##field; \
} \
} \
static __inline void \
lfs_if_set##field(STRUCT_LFS *fs, IFILE *ifp, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &ifp->u_64.if_##field; \
(void)p; \
ifp->u_64.if_##field = val; \
} else { \
type32 *p = &ifp->u_32.if_##field; \
(void)p; \
ifp->u_32.if_##field = val; \
} \
}
LFS_DEF_IF_ACCESSOR(uint32_t, uint32_t, version)
LFS_DEF_IF_ACCESSOR(int64_t, int32_t, daddr)
LFS_DEF_IF_ACCESSOR(uint64_t, uint32_t, nextfree)
LFS_DEF_IF_ACCESSOR(uint64_t, uint32_t, atime_sec)
LFS_DEF_IF_ACCESSOR(uint32_t, uint32_t, atime_nsec)
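/*
* A minimal sketch (kept under #if 0) of the usual ifile lookup
* pattern: map an inode number to its current on-disk address with
* LFS_IENTRY and the accessors above. "fs" and "ino" are assumed to
* be supplied by the caller; error handling is omitted.
*/
#if 0
static daddr_t
lfs_ifile_lookup_example(STRUCT_LFS *fs, uint64_t ino)
{
	IFILE *ifp;
	struct buf *bp;
	daddr_t daddr;

	LFS_IENTRY(ifp, fs, ino, bp);
	daddr = lfs_if_getdaddr(fs, ifp);
	brelse(bp, 0);
	return daddr;
}
#endif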
/*
* Cleaner information structure. This resides in the ifile and is used
* to pass information from the kernel to the cleaner.
*/
#define CLEANSIZE_SU(fs) \
((((fs)->lfs_is64 ? sizeof(CLEANERINFO64) : sizeof(CLEANERINFO32)) + \
lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs))
#define LFS_DEF_CI_ACCESSOR(type, type32, field) \
static __inline type \
lfs_ci_get##field(STRUCT_LFS *fs, CLEANERINFO *cip) \
{ \
if (fs->lfs_is64) { \
return cip->u_64.field; \
} else { \
return cip->u_32.field; \
} \
} \
static __inline void \
lfs_ci_set##field(STRUCT_LFS *fs, CLEANERINFO *cip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &cip->u_64.field; \
(void)p; \
cip->u_64.field = val; \
} else { \
type32 *p = &cip->u_32.field; \
(void)p; \
cip->u_32.field = val; \
} \
}
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, clean)
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, dirty)
LFS_DEF_CI_ACCESSOR(int64_t, int32_t, bfree)
LFS_DEF_CI_ACCESSOR(int64_t, int32_t, avail)
LFS_DEF_CI_ACCESSOR(uint64_t, uint32_t, free_head)
LFS_DEF_CI_ACCESSOR(uint64_t, uint32_t, free_tail)
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, flags)
static __inline void
lfs_ci_shiftcleantodirty(STRUCT_LFS *fs, CLEANERINFO *cip, unsigned num)
{
lfs_ci_setclean(fs, cip, lfs_ci_getclean(fs, cip) - num);
lfs_ci_setdirty(fs, cip, lfs_ci_getdirty(fs, cip) + num);
}
static __inline void
lfs_ci_shiftdirtytoclean(STRUCT_LFS *fs, CLEANERINFO *cip, unsigned num)
{
lfs_ci_setdirty(fs, cip, lfs_ci_getdirty(fs, cip) - num);
lfs_ci_setclean(fs, cip, lfs_ci_getclean(fs, cip) + num);
}
/* Read in the block with the cleaner info from the ifile. */
#define LFS_CLEANERINFO(CP, F, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
_e = bread((F)->lfs_ivnode, \
(daddr_t)0, lfs_sb_getbsize(F), 0, &(BP)); \
if (_e) \
panic("lfs: ifile read: cleanerinfo: error %d\n", _e); \
(CP) = (CLEANERINFO *)(BP)->b_data; \
UNSHARE_IFLOCK(F); \
} while (0)
/*
* Synchronize the Ifile cleaner info with current avail and bfree.
*/
#define LFS_SYNC_CLEANERINFO(cip, fs, bp, w) do { \
mutex_enter(&lfs_lock); \
if ((w) || lfs_ci_getbfree(fs, cip) != lfs_sb_getbfree(fs) || \
lfs_ci_getavail(fs, cip) != lfs_sb_getavail(fs) - fs->lfs_ravail - \
fs->lfs_favail) { \
lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs)); \
lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs) - fs->lfs_ravail - \
fs->lfs_favail); \
if (((bp)->b_flags & B_GATHERED) == 0) { \
fs->lfs_flags |= LFS_IFDIRTY; \
} \
mutex_exit(&lfs_lock); \
(void) LFS_BWRITE_LOG(bp); /* Ifile */ \
} else { \
mutex_exit(&lfs_lock); \
brelse(bp, 0); \
} \
} while (0)
/*
* Get the head of the inode free list.
* Always called with the segment lock held.
*/
#define LFS_GET_HEADFREE(FS, CIP, BP, FREEP) do { \
if (lfs_sb_getversion(FS) > 1) { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_sb_setfreehd(FS, lfs_ci_getfree_head(FS, CIP)); \
brelse(BP, 0); \
} \
*(FREEP) = lfs_sb_getfreehd(FS); \
} while (0)
#define LFS_PUT_HEADFREE(FS, CIP, BP, VAL) do { \
lfs_sb_setfreehd(FS, VAL); \
if (lfs_sb_getversion(FS) > 1) { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_ci_setfree_head(FS, CIP, VAL); \
LFS_BWRITE_LOG(BP); \
mutex_enter(&lfs_lock); \
(FS)->lfs_flags |= LFS_IFDIRTY; \
mutex_exit(&lfs_lock); \
} \
} while (0)
#define LFS_GET_TAILFREE(FS, CIP, BP, FREEP) do { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
*(FREEP) = lfs_ci_getfree_tail(FS, CIP); \
brelse(BP, 0); \
} while (0)
#define LFS_PUT_TAILFREE(FS, CIP, BP, VAL) do { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_ci_setfree_tail(FS, CIP, VAL); \
LFS_BWRITE_LOG(BP); \
mutex_enter(&lfs_lock); \
(FS)->lfs_flags |= LFS_IFDIRTY; \
mutex_exit(&lfs_lock); \
} while (0)
/*
* On-disk segment summary information
*/
#define SEGSUM_SIZE(fs) \
(fs->lfs_is64 ? sizeof(SEGSUM64) : \
lfs_sb_getversion(fs) > 1 ? sizeof(SEGSUM32) : sizeof(SEGSUM_V1))
/*
* The SEGSUM structure is followed by FINFO structures. Get the pointer
* to the first FINFO.
*
* XXX this can't be an inline function yet: SEGSUM_SIZE uses
* lfs_sb_getversion, which isn't defined until later in this file,
* so the file needs to be resorted first.
*/
#if 0
static __inline FINFO *
segsum_finfobase(STRUCT_LFS *fs, SEGSUM *ssp)
{
return (FINFO *)((char *)ssp + SEGSUM_SIZE(fs));
}
#else
#define SEGSUM_FINFOBASE(fs, ssp) \
((FINFO *)((char *)(ssp) + SEGSUM_SIZE(fs)));
#endif
#define LFS_DEF_SS_ACCESSOR(type, type32, field) \
static __inline type \
lfs_ss_get##field(STRUCT_LFS *fs, SEGSUM *ssp) \
{ \
if (fs->lfs_is64) { \
return ssp->u_64.ss_##field; \
} else { \
return ssp->u_32.ss_##field; \
} \
} \
static __inline void \
lfs_ss_set##field(STRUCT_LFS *fs, SEGSUM *ssp, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &ssp->u_64.ss_##field; \
(void)p; \
ssp->u_64.ss_##field = val; \
} else { \
type32 *p = &ssp->u_32.ss_##field; \
(void)p; \
ssp->u_32.ss_##field = val; \
} \
}
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, sumsum)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, datasum)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, magic)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, ident)
LFS_DEF_SS_ACCESSOR(int64_t, int32_t, next)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, nfinfo)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, ninos)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, flags)
LFS_DEF_SS_ACCESSOR(uint64_t, uint32_t, reclino)
LFS_DEF_SS_ACCESSOR(uint64_t, uint64_t, serial)
LFS_DEF_SS_ACCESSOR(uint64_t, uint64_t, create)
static __inline size_t
lfs_ss_getsumstart(STRUCT_LFS *fs)
{
/* These are actually all the same. */
if (fs->lfs_is64) {
return offsetof(SEGSUM64, ss_datasum);
} else /* if (lfs_sb_getversion(fs) > 1) */ {
return offsetof(SEGSUM32, ss_datasum);
} /* else {
return offsetof(SEGSUM_V1, ss_datasum);
} */
/*
* XXX ^^^ until this file is resorted lfs_sb_getversion isn't
* defined yet.
*/
}
static __inline uint32_t
lfs_ss_getocreate(STRUCT_LFS *fs, SEGSUM *ssp)
{
KASSERT(fs->lfs_is64 == 0);
/* XXX need to resort this file before we can do this */
//KASSERT(lfs_sb_getversion(fs) == 1);
return ssp->u_v1.ss_create;
}
static __inline void
lfs_ss_setocreate(STRUCT_LFS *fs, SEGSUM *ssp, uint32_t val)
{
KASSERT(fs->lfs_is64 == 0);
/* XXX need to resort this file before we can do this */
//KASSERT(lfs_sb_getversion(fs) == 1);
ssp->u_v1.ss_create = val;
}
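/*
* A minimal sketch (kept under #if 0) of walking the FINFO records
* that follow a segment summary, using SEGSUM_FINFOBASE/NEXT_FINFO and
* the accessors above. "ssp" is assumed to point at a valid SEGSUM.
*/
#if 0
static void
lfs_finfo_walk_example(STRUCT_LFS *fs, SEGSUM *ssp)
{
	FINFO *fip;
	unsigned i, j;

	fip = SEGSUM_FINFOBASE(fs, ssp);
	for (i = 0; i < lfs_ss_getnfinfo(fs, ssp); i++) {
		for (j = 0; j < lfs_fi_getnblocks(fs, fip); j++)
			printf("ino %ju lbn %jd\n",
			    (uintmax_t)lfs_fi_getino(fs, fip),
			    (intmax_t)lfs_fi_getblock(fs, fip, j));
		fip = NEXT_FINFO(fs, fip);
	}
}
#endif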
/*
* Super block.
*/
/*
* Generate accessors for the on-disk superblock fields with cpp.
*/
#define LFS_DEF_SB_ACCESSOR_FULL(type, type32, field) \
static __inline type \
lfs_sb_get##field(STRUCT_LFS *fs) \
{ \
if (fs->lfs_is64) { \
return fs->lfs_dlfs_u.u_64.dlfs_##field; \
} else { \
return fs->lfs_dlfs_u.u_32.dlfs_##field; \
} \
} \
static __inline void \
lfs_sb_set##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
fs->lfs_dlfs_u.u_64.dlfs_##field = val; \
} else { \
fs->lfs_dlfs_u.u_32.dlfs_##field = val; \
} \
} \
static __inline void \
lfs_sb_add##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
type *p64 = &fs->lfs_dlfs_u.u_64.dlfs_##field; \
*p64 += val; \
} else { \
type32 *p32 = &fs->lfs_dlfs_u.u_32.dlfs_##field; \
*p32 += val; \
} \
} \
static __inline void \
lfs_sb_sub##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
type *p64 = &fs->lfs_dlfs_u.u_64.dlfs_##field; \
*p64 -= val; \
} else { \
type32 *p32 = &fs->lfs_dlfs_u.u_32.dlfs_##field; \
*p32 -= val; \
} \
}
#define LFS_DEF_SB_ACCESSOR(t, f) LFS_DEF_SB_ACCESSOR_FULL(t, t, f)
#define LFS_DEF_SB_ACCESSOR_32ONLY(type, field, val64) \
static __inline type \
lfs_sb_get##field(STRUCT_LFS *fs) \
{ \
if (fs->lfs_is64) { \
return val64; \
} else { \
return fs->lfs_dlfs_u.u_32.dlfs_##field; \
} \
}
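/*
* For example, LFS_DEF_SB_ACCESSOR(uint32_t, version) below generates
* lfs_sb_getversion(), lfs_sb_setversion(), lfs_sb_addversion() and
* lfs_sb_subversion(), each operating on dlfs_version in either the
* 64-bit or the 32-bit on-disk superblock image, as selected by
* fs->lfs_is64.
*/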
LFS_DEF_SB_ACCESSOR(uint32_t, version)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, size)
LFS_DEF_SB_ACCESSOR(uint32_t, ssize)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, dsize)
LFS_DEF_SB_ACCESSOR(uint32_t, bsize)
LFS_DEF_SB_ACCESSOR(uint32_t, fsize)
LFS_DEF_SB_ACCESSOR(uint32_t, frag)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, freehd)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, bfree)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, nfiles)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, avail)
LFS_DEF_SB_ACCESSOR(int32_t, uinodes)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, idaddr)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, ifile, LFS_IFILE_INUM)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, lastseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, nextseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, curseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, offset)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, lastpseg)
LFS_DEF_SB_ACCESSOR(uint32_t, inopf)
LFS_DEF_SB_ACCESSOR(uint32_t, minfree)
LFS_DEF_SB_ACCESSOR(uint64_t, maxfilesize)
LFS_DEF_SB_ACCESSOR(uint32_t, fsbpseg)
LFS_DEF_SB_ACCESSOR(uint32_t, inopb)
LFS_DEF_SB_ACCESSOR(uint32_t, ifpb)
LFS_DEF_SB_ACCESSOR(uint32_t, sepb)
LFS_DEF_SB_ACCESSOR(uint32_t, nindir)
LFS_DEF_SB_ACCESSOR(uint32_t, nseg)
LFS_DEF_SB_ACCESSOR(uint32_t, nspf)
LFS_DEF_SB_ACCESSOR(uint32_t, cleansz)
LFS_DEF_SB_ACCESSOR(uint32_t, segtabsz)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, segmask, 0)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, segshift, 0)
LFS_DEF_SB_ACCESSOR(uint64_t, bmask)
LFS_DEF_SB_ACCESSOR(uint32_t, bshift)
LFS_DEF_SB_ACCESSOR(uint64_t, ffmask)
LFS_DEF_SB_ACCESSOR(uint32_t, ffshift)
LFS_DEF_SB_ACCESSOR(uint64_t, fbmask)
LFS_DEF_SB_ACCESSOR(uint32_t, fbshift)
LFS_DEF_SB_ACCESSOR(uint32_t, blktodb)
LFS_DEF_SB_ACCESSOR(uint32_t, fsbtodb)
LFS_DEF_SB_ACCESSOR(uint32_t, sushift)
LFS_DEF_SB_ACCESSOR(int32_t, maxsymlinklen)
LFS_DEF_SB_ACCESSOR(uint32_t, cksum)
LFS_DEF_SB_ACCESSOR(uint16_t, pflags)
LFS_DEF_SB_ACCESSOR(uint32_t, nclean)
LFS_DEF_SB_ACCESSOR(int32_t, dmeta)
LFS_DEF_SB_ACCESSOR(uint32_t, minfreeseg)
LFS_DEF_SB_ACCESSOR(uint32_t, sumsize)
LFS_DEF_SB_ACCESSOR(uint64_t, serial)
LFS_DEF_SB_ACCESSOR(uint32_t, ibsize)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, s0addr)
LFS_DEF_SB_ACCESSOR(uint64_t, tstamp)
LFS_DEF_SB_ACCESSOR(uint32_t, inodefmt)
LFS_DEF_SB_ACCESSOR(uint32_t, interleave)
LFS_DEF_SB_ACCESSOR(uint32_t, ident)
LFS_DEF_SB_ACCESSOR(uint32_t, resvseg)
/* special-case accessors */
/*
* the v1 otstamp field lives in what's now dlfs_inopf
*/
#define lfs_sb_getotstamp(fs) lfs_sb_getinopf(fs)
#define lfs_sb_setotstamp(fs, val) lfs_sb_setinopf(fs, val)
/*
* lfs_sboffs is an array
*/
static __inline int32_t
lfs_sb_getsboff(STRUCT_LFS *fs, unsigned n)
{
#ifdef KASSERT /* ugh */
KASSERT(n < LFS_MAXNUMSB);
#endif
if (fs->lfs_is64) {
return fs->lfs_dlfs_u.u_64.dlfs_sboffs[n];
} else {
return fs->lfs_dlfs_u.u_32.dlfs_sboffs[n];
}
}
static __inline void
lfs_sb_setsboff(STRUCT_LFS *fs, unsigned n, int32_t val)
{
#ifdef KASSERT /* ugh */
KASSERT(n < LFS_MAXNUMSB);
#endif
if (fs->lfs_is64) {
fs->lfs_dlfs_u.u_64.dlfs_sboffs[n] = val;
} else {
fs->lfs_dlfs_u.u_32.dlfs_sboffs[n] = val;
}
}
/*
* lfs_fsmnt is a string
*/
static __inline const char *
lfs_sb_getfsmnt(STRUCT_LFS *fs)
{
if (fs->lfs_is64) {
return (const char *)fs->lfs_dlfs_u.u_64.dlfs_fsmnt;
} else {
return (const char *)fs->lfs_dlfs_u.u_32.dlfs_fsmnt;
}
}
static __inline void
lfs_sb_setfsmnt(STRUCT_LFS *fs, const char *str)
{
if (fs->lfs_is64) {
(void)strncpy((char *)fs->lfs_dlfs_u.u_64.dlfs_fsmnt, str,
sizeof(fs->lfs_dlfs_u.u_64.dlfs_fsmnt));
} else {
(void)strncpy((char *)fs->lfs_dlfs_u.u_32.dlfs_fsmnt, str,
sizeof(fs->lfs_dlfs_u.u_32.dlfs_fsmnt));
}
}
/* Highest addressable fsb */
#define LFS_MAX_DADDR(fs) \
((fs)->lfs_is64 ? 0x7fffffffffffffff : 0x7fffffff)
/* LFS_NINDIR is the number of indirects in a file system block. */
#define LFS_NINDIR(fs) (lfs_sb_getnindir(fs))
/* LFS_INOPB is the number of inodes in a secondary storage block. */
#define LFS_INOPB(fs) (lfs_sb_getinopb(fs))
/* LFS_INOPF is the number of inodes in a fragment. */
#define LFS_INOPF(fs) (lfs_sb_getinopf(fs))
#define lfs_blkoff(fs, loc) ((int)((loc) & lfs_sb_getbmask(fs)))
#define lfs_fragoff(fs, loc) /* calculates (loc % fs->lfs_fsize) */ \
((int)((loc) & lfs_sb_getffmask(fs)))
/* XXX: lowercase these as they're no longer macros */
/* Frags to diskblocks */
static __inline uint64_t
LFS_FSBTODB(STRUCT_LFS *fs, uint64_t b)
{
#if defined(_KERNEL)
return b << (lfs_sb_getffshift(fs) - DEV_BSHIFT);
#else
return b << lfs_sb_getfsbtodb(fs);
#endif
}
/* Diskblocks to frags */
static __inline uint64_t
LFS_DBTOFSB(STRUCT_LFS *fs, uint64_t b)
{
#if defined(_KERNEL)
return b >> (lfs_sb_getffshift(fs) - DEV_BSHIFT);
#else
return b >> lfs_sb_getfsbtodb(fs);
#endif
}
#define lfs_lblkno(fs, loc) ((loc) >> lfs_sb_getbshift(fs))
#define lfs_lblktosize(fs, blk) ((blk) << lfs_sb_getbshift(fs))
/* Frags to bytes */
static __inline uint64_t
lfs_fsbtob(STRUCT_LFS *fs, uint64_t b)
{
return b << lfs_sb_getffshift(fs);
}
/* Bytes to frags */
static __inline uint64_t
lfs_btofsb(STRUCT_LFS *fs, uint64_t b)
{
return b >> lfs_sb_getffshift(fs);
}
#define lfs_numfrags(fs, loc) /* calculates (loc / fs->lfs_fsize) */ \
((loc) >> lfs_sb_getffshift(fs))
#define lfs_blkroundup(fs, size)/* calculates roundup(size, lfs_sb_getbsize(fs)) */ \
((off_t)(((size) + lfs_sb_getbmask(fs)) & (~lfs_sb_getbmask(fs))))
#define lfs_fragroundup(fs, size)/* calculates roundup(size, fs->lfs_fsize) */ \
((off_t)(((size) + lfs_sb_getffmask(fs)) & (~lfs_sb_getffmask(fs))))
#define lfs_fragstoblks(fs, frags)/* calculates (frags / fs->fs_frag) */ \
((frags) >> lfs_sb_getfbshift(fs))
#define lfs_blkstofrags(fs, blks)/* calculates (blks * fs->fs_frag) */ \
((blks) << lfs_sb_getfbshift(fs))
#define lfs_fragnum(fs, fsb) /* calculates (fsb % fs->lfs_frag) */ \
((fsb) & ((fs)->lfs_frag - 1))
#define lfs_blknum(fs, fsb) /* calculates rounddown(fsb, fs->lfs_frag) */ \
((fsb) &~ ((fs)->lfs_frag - 1))
#define lfs_dblksize(fs, dp, lbn) \
(((lbn) >= ULFS_NDADDR || lfs_dino_getsize(fs, dp) >= ((lbn) + 1) << lfs_sb_getbshift(fs)) \
? lfs_sb_getbsize(fs) \
: (lfs_fragroundup(fs, lfs_blkoff(fs, lfs_dino_getsize(fs, dp)))))
#define lfs_segsize(fs) (lfs_sb_getversion(fs) == 1 ? \
lfs_lblktosize((fs), lfs_sb_getssize(fs)) : \
lfs_sb_getssize(fs))
/* XXX segtod produces a result in frags despite the 'd' */
#define lfs_segtod(fs, seg) (lfs_btofsb(fs, lfs_segsize(fs)) * (seg))
#define lfs_dtosn(fs, daddr) /* block address to segment number */ \
((uint32_t)(((daddr) - lfs_sb_gets0addr(fs)) / lfs_segtod((fs), 1)))
#define lfs_sntod(fs, sn) /* segment number to disk address */ \
((daddr_t)(lfs_segtod((fs), (sn)) + lfs_sb_gets0addr(fs)))
/* XXX, blah. make this appear only if struct inode is defined */
#ifdef _UFS_LFS_LFS_INODE_H_
static __inline uint32_t
lfs_blksize(STRUCT_LFS *fs, struct inode *ip, uint64_t lbn)
{
if (lbn >= ULFS_NDADDR || lfs_dino_getsize(fs, ip->i_din) >= (lbn + 1) << lfs_sb_getbshift(fs)) {
return lfs_sb_getbsize(fs);
} else {
return lfs_fragroundup(fs, lfs_blkoff(fs, lfs_dino_getsize(fs, ip->i_din)));
}
}
#endif
/*
* union lfs_blocks
*/
static __inline void
lfs_blocks_fromvoid(STRUCT_LFS *fs, union lfs_blocks *bp, void *p)
{
if (fs->lfs_is64) {
bp->b64 = p;
} else {
bp->b32 = p;
}
}
static __inline void
lfs_blocks_fromfinfo(STRUCT_LFS *fs, union lfs_blocks *bp, FINFO *fip)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
if (fs->lfs_is64) {
bp->b64 = (int64_t *)firstblock;
} else {
bp->b32 = (int32_t *)firstblock;
}
}
static __inline daddr_t
lfs_blocks_get(STRUCT_LFS *fs, union lfs_blocks *bp, unsigned idx)
{
if (fs->lfs_is64) {
return bp->b64[idx];
} else {
return bp->b32[idx];
}
}
static __inline void
lfs_blocks_set(STRUCT_LFS *fs, union lfs_blocks *bp, unsigned idx, daddr_t val)
{
if (fs->lfs_is64) {
bp->b64[idx] = val;
} else {
bp->b32[idx] = val;
}
}
static __inline void
lfs_blocks_inc(STRUCT_LFS *fs, union lfs_blocks *bp)
{
if (fs->lfs_is64) {
bp->b64++;
} else {
bp->b32++;
}
}
static __inline int
lfs_blocks_eq(STRUCT_LFS *fs, union lfs_blocks *bp1, union lfs_blocks *bp2)
{
if (fs->lfs_is64) {
return bp1->b64 == bp2->b64;
} else {
return bp1->b32 == bp2->b32;
}
}
static __inline int
lfs_blocks_sub(STRUCT_LFS *fs, union lfs_blocks *bp1, union lfs_blocks *bp2)
{
/* (remember that the pointers are typed) */
if (fs->lfs_is64) {
return bp1->b64 - bp2->b64;
} else {
return bp1->b32 - bp2->b32;
}
}
/*
* struct segment
*/
/*
* Macros for determining free space on the disk, with the variable metadata
* of segment summaries and inode blocks taken into account.
*/
/*
* Estimate number of clean blocks not available for writing because
* they will contain metadata or overhead. This is calculated as
*
* E = ((C * M / D) * D + (0) * (T - D)) / T
* or more simply
* E = (C * M) / T
*
* where
* C is the clean space,
* D is the dirty space,
* M is the dirty metadata, and
* T = C + D is the total space on disk.
*
* This approximates the old formula of E = C * M / D when D is close to T,
* but avoids falsely reporting "disk full" when the sample size (D) is small.
*/
#define LFS_EST_CMETA(F) (( \
(lfs_sb_getdmeta(F) * (int64_t)lfs_sb_getnclean(F)) / \
(lfs_sb_getnseg(F))))
/* Estimate total size of the disk not including metadata */
#define LFS_EST_NONMETA(F) (lfs_sb_getdsize(F) - lfs_sb_getdmeta(F) - LFS_EST_CMETA(F))
/* Estimate number of blocks actually available for writing */
#define LFS_EST_BFREE(F) (lfs_sb_getbfree(F) > LFS_EST_CMETA(F) ? \
lfs_sb_getbfree(F) - LFS_EST_CMETA(F) : 0)
/* Amount of non-meta space not available to mortal man */
#define LFS_EST_RSVD(F) ((LFS_EST_NONMETA(F) * \
(uint64_t)lfs_sb_getminfree(F)) / \
100)
/* Can credential C write BB blocks? XXX: kauth_cred_geteuid is abusive */
#define ISSPACE(F, BB, C) \
((((C) == NOCRED || kauth_cred_geteuid(C) == 0) && \
LFS_EST_BFREE(F) >= (BB)) || \
(kauth_cred_geteuid(C) != 0 && IS_FREESPACE(F, BB)))
/* Can an ordinary user write BB blocks */
#define IS_FREESPACE(F, BB) \
(LFS_EST_BFREE(F) >= (BB) + LFS_EST_RSVD(F))
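/*
* Worked example with made-up numbers: on a volume with nseg = 100
* segments, of which nclean = 40 are clean, and dmeta = 10 blocks of
* dirty metadata, LFS_EST_CMETA comes to (10 * 40) / 100 = 4 blocks
* expected to go to metadata overhead, so LFS_EST_BFREE counts only
* bfree - 4 blocks as actually available for writing.
*/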
/*
* The minimum number of blocks to create a new inode. This is:
* directory direct block (1) + ULFS_NIADDR indirect blocks + inode block (1) +
* ifile direct block (1) + ULFS_NIADDR indirect blocks = 3 + 2 * ULFS_NIADDR blocks.
*/
#define LFS_NRESERVE(F) (lfs_btofsb((F), (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(F)))
/*
* Suppress spurious warnings: pop the diagnostic pragmas pushed above.
*/
#ifdef __GNUC__
#if defined(__clang__)
#pragma clang diagnostic pop
#elif __GNUC_PREREQ__(9,0)
#pragma GCC diagnostic pop
#endif
#endif
#endif /* _UFS_LFS_LFS_ACCESSORS_H_ */
/* $NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $ */
/*
* Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
*
* Further information about snapshots can be obtained from:
*
* Marshall Kirk McKusick http://www.mckusick.com/softdep/
* 1614 Oxford Street mckusick@mckusick.com
* Berkeley, CA 94709-1608 +1-510-843-9542
* USA
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
*
* from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/wapbl.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <uvm/uvm.h>
TAILQ_HEAD(inodelst, inode); /* List of active snapshots */
struct snap_info {
kmutex_t si_lock; /* Lock this snapinfo */
kmutex_t si_snaplock; /* Snapshot vnode common lock */
lwp_t *si_owner; /* Snaplock owner */
struct inodelst si_snapshots; /* List of active snapshots */
daddr_t *si_snapblklist; /* Snapshot block hints list */
uint32_t si_gen; /* Incremented on change */
};
#if !defined(FFS_NO_SNAPSHOT)
typedef int (*acctfunc_t)
(struct vnode *, void *, int, int, struct fs *, daddr_t, int);
static int snapshot_setup(struct mount *, struct vnode *);
static int snapshot_copyfs(struct mount *, struct vnode *, void **);
static int snapshot_expunge(struct mount *, struct vnode *,
struct fs *, daddr_t *, daddr_t **);
static int snapshot_expunge_snap(struct mount *, struct vnode *,
struct fs *, daddr_t);
static int snapshot_writefs(struct mount *, struct vnode *, void *);
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */
static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
#if !defined(FFS_NO_SNAPSHOT)
static int blocks_in_journal(struct fs *);
#endif
static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);
#ifdef DEBUG
static int snapdebug = 0;
#endif
int
ffs_snapshot_init(struct ufsmount *ump)
{
struct snap_info *si;
si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
TAILQ_INIT(&si->si_snapshots);
mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
si->si_owner = NULL;
si->si_gen = 0;
si->si_snapblklist = NULL;
return 0;
}
void
ffs_snapshot_fini(struct ufsmount *ump)
{
struct snap_info *si;
si = ump->um_snapinfo;
ump->um_snapinfo = NULL;
KASSERT(TAILQ_EMPTY(&si->si_snapshots));
mutex_destroy(&si->si_lock);
mutex_destroy(&si->si_snaplock);
KASSERT(si->si_snapblklist == NULL);
kmem_free(si, sizeof(*si));
}
/*
* Create a snapshot file and initialize it for the filesystem.
* Vnode is locked on entry and return.
*/
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
bool suspended = false;
int error, redo = 0, snaploc;
void *sbbuf = NULL;
daddr_t *snaplist = NULL, snaplistsize = 0;
struct buf *bp, *nbp;
struct fs *copy_fs = NULL;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
struct timespec ts;
struct timeval starttime;
#ifdef DEBUG
struct timeval endtime;
#endif
struct vnode *devvp = ip->i_devvp;
/*
* If the vnode already is a snapshot, return.
*/
if ((ip->i_flags & SF_SNAPSHOT)) {
if ((ip->i_flags & SF_SNAPINVAL))
return EINVAL;
if (ctime) {
ctime->tv_sec = DIP(ip, mtime);
ctime->tv_nsec = DIP(ip, mtimensec);
}
return 0;
}
/*
* Check for free snapshot slot in the superblock.
*/
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
if (fs->fs_snapinum[snaploc] == 0)
break;
if (snaploc == FSMAXSNAP)
return (ENOSPC);
/*
* Prepare the vnode to become a snapshot.
*/
error = snapshot_setup(mp, vp);
if (error)
goto out;
/*
* Copy all the cylinder group maps. Although the
* filesystem is still active, we hope that only a few
* cylinder groups will change between now and when we
* suspend operations. Thus, we will be able to quickly
* touch up the few cylinder groups that changed during
* the suspension period.
*/
error = cgaccount(vp, 1, NULL);
if (error)
goto out;
/*
* snapshot is now valid
*/
ip->i_flags &= ~SF_SNAPINVAL;
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Ensure that the snapshot is completely on disk.
* Since we have marked it as a snapshot it is safe to
* unlock it as no process will be allowed to write to it.
*/
error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
if (error)
goto out;
VOP_UNLOCK(vp);
/*
* All allocations are done, so we can now suspend the filesystem.
*/
error = vfs_suspend(vp->v_mount, 0);
if (error == 0) {
suspended = true;
vrele_flush(vp->v_mount);
error = VFS_SYNC(vp->v_mount, MNT_WAIT, curlwp->l_cred);
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (error)
goto out;
getmicrotime(&starttime);
/*
* First, copy all the cylinder group maps that have changed.
*/
error = cgaccount(vp, 2, &redo);
if (error)
goto out;
/*
* Create a copy of the superblock and its summary information.
*/
error = snapshot_copyfs(mp, vp, &sbbuf);
if (error)
goto out;
copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
/*
* Expunge unlinked files from our view.
*/
error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
if (error)
goto out;
/*
* Record snapshot inode. Since this is the newest snapshot,
* it must be placed at the end of the list.
*/
if (ip->i_nlink > 0)
fs->fs_snapinum[snaploc] = ip->i_number;
mutex_enter(&si->si_lock);
if (is_active_snapshot(si, ip))
panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
if (TAILQ_FIRST(&si->si_snapshots) == ip) {
/*
* If this is the first snapshot on this filesystem, put the
* preliminary list in place and establish the cow handler.
*/
si->si_snapblklist = snaplist;
fscow_establish(mp, ffs_copyonwrite, devvp);
}
si->si_gen++;
mutex_exit(&si->si_lock);
vp->v_vflag |= VV_SYSTEM;
/*
* Set the mtime to the time the snapshot has been taken.
*/
TIMEVAL_TO_TIMESPEC(&starttime, &ts);
if (ctime)
*ctime = ts;
DIP_ASSIGN(ip, mtime, ts.tv_sec);
DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Copy allocation information from all snapshots and then
* expunge them from our view.
*/
error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
if (error)
goto out;
/*
* Write the superblock and its summary information to the snapshot.
*/
error = snapshot_writefs(mp, vp, sbbuf);
if (error)
goto out;
/*
* We're nearly done, ensure that the snapshot is completely on disk.
*/
error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
if (error)
goto out;
/*
* Invalidate and free all pages on the snapshot vnode.
* We will read and write through the buffercache.
*/
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, 0, 0,
PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
if (error)
goto out;
/*
* Invalidate short ( < fs_bsize ) buffers. We will always read
* full size buffers later.
*/
mutex_enter(&bufcache_lock);
KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_bcount == fs->fs_bsize)
continue;
error = bbusy(bp, false, 0, NULL);
if (error != 0) {
if (error == EPASSTHROUGH) {
nbp = LIST_FIRST(&vp->v_cleanblkhd);
continue;
}
break;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
mutex_exit(&bufcache_lock);
out:
if (sbbuf != NULL) {
free(copy_fs->fs_csp, M_UFSMNT);
free(sbbuf, M_UFSMNT);
}
if (fs->fs_active != NULL) {
free(fs->fs_active, M_DEVBUF);
fs->fs_active = NULL;
}
mutex_enter(&si->si_lock);
if (snaplist != NULL) {
if (si->si_snapblklist == snaplist)
si->si_snapblklist = NULL;
free(snaplist, M_UFSMNT);
}
if (error) {
fs->fs_snapinum[snaploc] = 0;
} else {
/*
* As this is the newest list, it is the most inclusive, so
* should replace the previous list.
*/
si->si_snapblklist = ip->i_snapblklist;
}
si->si_gen++;
mutex_exit(&si->si_lock);
if (suspended) {
VOP_UNLOCK(vp);
vfs_resume(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
getmicrotime(&endtime);
timersub(&endtime, &starttime, &endtime);
printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
}
if (error) {
if (UFS_WAPBL_BEGIN(mp) == 0) {
/*
* We depend on ffs_truncate() to call ffs_snapremove()
* before it may return an error. On failed
* ffs_truncate() we have normal file with leaked
* (meta-) data, but no snapshot to use.
*/
(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
UFS_WAPBL_END(mp);
}
} else if (ip->i_nlink > 0)
vref(vp);
return (error);
}
/*
* Prepare vnode to become a snapshot.
*/
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
int error, n, len, loc, cg;
daddr_t blkno, numblks;
struct buf *ibp, *nbp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct lwp *l = curlwp;
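/*
* Descriptive note (derived from the loops below): the preallocation loops
* close and reopen the WAPBL transaction every wbreak allocations so that a
* single transaction stays at roughly 1/8 of the journal or less.
*/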
const int wbreak = blocks_in_journal(fs)/8;
struct inode *ip = VTOI(vp);
/*
* Check mount, readonly reference and owner.
*/
if (vp->v_mount != mp)
return EXDEV;
if (vp->v_writecount != 0)
return EBUSY;
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
0, mp, vp, NULL);
if (error)
return EACCES;
/*
* Must completely truncate the file here. Allocated
* blocks on a snapshot mean that block has been copied
* on write, see ffs_copyonwrite() testing "blkno != 0"
*/
error = ufs_truncate_all(vp);
if (error)
return error;
/* Change inode to snapshot type file. */
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
#if defined(QUOTA) || defined(QUOTA2)
/* snapshot inodes are not accounted in quotas */
chkiq(ip, -1, l->l_cred, 0);
#endif
ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
ffs_update(vp, NULL, NULL, UPDATE_WAIT);
UFS_WAPBL_END(mp);
KASSERT(ip->i_flags & SF_SNAPSHOT);
/*
* Write an empty list of preallocated blocks to the end of
* the snapshot to set size to at least that of the filesystem.
*/
numblks = howmany(fs->fs_size, fs->fs_frag);
blkno = 1;
blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
error = vn_rdwr(UIO_WRITE, vp,
(void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
if (error)
return error;
/*
* Preallocate critical data structures so that we can copy
* them in without further allocation after we suspend all
* operations on the filesystem. We would like to just release
* the allocated buffers without writing them since they will
* be filled in below once we are ready to go, but this upsets
* the soft update code, so we go ahead and write the new buffers.
*
* Allocate all indirect blocks and mark all of them as not
* needing to be copied.
*/
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
goto out;
brelse(ibp, 0);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
/*
* Allocate copies for the superblock and its summary information.
*/
error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
0, &nbp);
if (error)
goto out;
bawrite(nbp);
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
len = howmany(fs->fs_cssize, fs->fs_bsize);
for (loc = 0; loc < len; loc++) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
fs->fs_bsize, l->l_cred, 0, &nbp);
if (error)
goto out;
bawrite(nbp);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
/*
* Allocate all cylinder group blocks.
*/
for (cg = 0; cg < fs->fs_ncg; cg++) {
error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
fs->fs_bsize, l->l_cred, 0, &nbp);
if (error)
goto out;
bawrite(nbp);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
out:
UFS_WAPBL_END(mp);
return error;
}
/*
* Create a copy of the superblock and its summary information.
* It is up to the caller to free copyfs and copy_fs->fs_csp.
*/
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
int error, i, len, loc, size;
void *space;
int32_t *lp;
struct buf *bp;
struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
struct vnode *devvp = VTOI(vp)->i_devvp;
/*
* Grab a copy of the superblock and its summary information.
* We delay writing it until the suspension is released below.
*/
*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
loc = ffs_blkoff(fs, fs->fs_sblockloc);
if (loc > 0)
memset(*sbbuf, 0, loc);
copyfs = (struct fs *)((char *)(*sbbuf) + loc);
memcpy(copyfs, fs, fs->fs_sbsize);
size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
if (fs->fs_sbsize < size)
memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
size - fs->fs_sbsize);
size = ffs_blkroundup(fs, fs->fs_cssize);
if (fs->fs_contigsumsize > 0)
size += fs->fs_ncg * sizeof(int32_t);
space = malloc(size, M_UFSMNT, M_WAITOK);
copyfs->fs_csp = space;
memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
space = (char *)space + fs->fs_cssize;
loc = howmany(fs->fs_cssize, fs->fs_fsize);
i = fs->fs_frag - loc % fs->fs_frag;
len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
if (len > 0) {
if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
len, 0, &bp)) != 0) {
free(copyfs->fs_csp, M_UFSMNT);
free(*sbbuf, M_UFSMNT);
*sbbuf = NULL;
return error;
}
memcpy(space, bp->b_data, (u_int)len);
space = (char *)space + len;
brelse(bp, BC_INVAL | BC_NOCACHE);
}
if (fs->fs_contigsumsize > 0) {
copyfs->fs_maxcluster = lp = space;
for (i = 0; i < fs->fs_ncg; i++)
*lp++ = fs->fs_contigsumsize;
}
if (mp->mnt_wapbl)
copyfs->fs_flags &= ~FS_DOWAPBL;
return 0;
}
struct snapshot_expunge_ctx {
struct vnode *logvp;
struct vnode *vp;
struct fs *copy_fs;
};
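/*
* Decide whether snapshot_expunge() must process a vnode: the in-filesystem
* journal vnode and any unlinked inode whose blocks are not already marked
* free in the superblock copy qualify; snapshot inodes are skipped here.
*/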
static bool
snapshot_expunge_selector(void *cl, struct vnode *xvp)
{
struct snapshot_expunge_ctx *c = cl;
struct inode *xp;
KASSERT(mutex_owned(xvp->v_interlock));
xp = VTOI(xvp);
if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
(xp->i_flags & SF_SNAPSHOT))
return false;
#ifdef DEBUG
if (snapdebug)
vprint("ffs_snapshot: busy vnode", xvp);
#endif
if (xvp == c->logvp)
return true;
if (xp->i_nlink > 0)
return false;
if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
return false;
return true;
}
/*
* We must check for active files that have been unlinked (i.e., with a zero
* link count). We have to expunge all trace of these files from the snapshot
* so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
* Note that we skip unlinked snapshot files as they will be handled separately.
* Calculate the snapshot list size and create a preliminary list.
*/
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
daddr_t *snaplistsize, daddr_t **snaplist)
{
int cg, error = 0, len, loc;
daddr_t blkno, *blkp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *xp;
struct vnode *logvp = NULL, *xvp;
struct vnode_iterator *marker;
struct snapshot_expunge_ctx ctx;
*snaplist = NULL;
/*
* Get the log inode if any.
*/
if ((fs->fs_flags & FS_DOWAPBL) &&
fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
error = VFS_VGET(mp, fs->fs_journallocs[UFS_WAPBL_INFS_INO],
LK_EXCLUSIVE, &logvp);
if (error)
goto out;
}
/*
* We also calculate the needed size for the snapshot list.
*/
*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
vfs_vnode_iterator_init(mp, &marker);
ctx.logvp = logvp;
ctx.vp = vp;
ctx.copy_fs = copy_fs;
while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
&ctx)))
{
/*
* If there is a fragment, clear it here.
*/
xp = VTOI(xvp);
blkno = 0;
loc = howmany(xp->i_size, fs->fs_bsize) - 1;
if (loc < UFS_NDADDR) {
len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
if (len > 0 && len < fs->fs_bsize) {
error = UFS_WAPBL_BEGIN(mp);
if (error) {
vrele(xvp);
vfs_vnode_iterator_destroy(marker);
goto out;
}
ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
len, xp->i_number);
blkno = db_get(xp, loc);
db_assign(xp, loc, 0);
UFS_WAPBL_END(mp);
}
}
*snaplistsize += 1;
error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
if (blkno)
db_assign(xp, loc, blkno);
if (!error) {
error = UFS_WAPBL_BEGIN(mp);
if (!error) {
error = ffs_freefile_snap(copy_fs, vp,
xp->i_number, xp->i_mode);
UFS_WAPBL_END(mp);
}
}
vrele(xvp);
if (error) {
vfs_vnode_iterator_destroy(marker);
goto out;
}
}
vfs_vnode_iterator_destroy(marker);
/*
* Create a preliminary list of preallocated snapshot blocks.
*/
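/*
* Layout note: slot 0 is reserved for the entry count (stored once the list
* is complete); the data entries start at slot 1 with the superblock and
* continue with the cylinder group maps and cylinder summary blocks, all as
* ascending logical block numbers.
*/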
*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
blkp = &(*snaplist)[1];
*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
for (cg = 0; cg < fs->fs_ncg; cg++) {
if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
break;
*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
}
len = howmany(fs->fs_cssize, fs->fs_bsize);
for (loc = 0; loc < len; loc++)
*blkp++ = blkno + loc;
for (; cg < fs->fs_ncg; cg++)
*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
(*snaplist)[0] = blkp - &(*snaplist)[0];
out:
if (logvp != NULL)
vput(logvp);
if (error && *snaplist != NULL) {
free(*snaplist, M_UFSMNT);
*snaplist = NULL;
}
return error;
}
/*
* Copy allocation information from all the other snapshots on this
* filesystem into this snapshot and then expunge them from its view.
* Also, collect the list of allocated
* blocks in i_snapblklist.
*/
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
struct fs *copy_fs, daddr_t snaplistsize)
{
int error = 0, i;
daddr_t numblks, *snaplist = NULL;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp), *xp;
struct lwp *l = curlwp;
struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
if (xp != ip) {
error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
if (error)
break;
}
if (xp->i_nlink != 0)
continue;
error = UFS_WAPBL_BEGIN(mp);
if (error)
break;
error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
UFS_WAPBL_END(mp);
if (error)
break;
}
if (error)
goto out;
/*
* Allocate space for the full list of preallocated snapshot blocks.
*/
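/*
* Slot 0 is reserved for the final entry count; mapacct() will advance
* i_snapblklist from &snaplist[1] as it records each allocated block.
*/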
snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
ip->i_snapblklist = &snaplist[1];
/*
* Expunge the blocks used by the snapshots from the set of
* blocks marked as used in the snapshot bitmaps. Also, collect
* the list of allocated blocks in i_snapblklist.
*/
error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
if (error)
goto out;
if (snaplistsize < ip->i_snapblklist - snaplist)
panic("ffs_snapshot: list too small");
snaplistsize = ip->i_snapblklist - snaplist;
snaplist[0] = snaplistsize;
ip->i_snapblklist = &snaplist[0];
/*
* Write out the list of allocated blocks to the end of the snapshot.
*/
numblks = howmany(fs->fs_size, fs->fs_frag);
for (i = 0; i < snaplistsize; i++)
snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
for (i = 0; i < snaplistsize; i++)
snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
if (error && snaplist != NULL) {
free(snaplist, M_UFSMNT);
ip->i_snapblklist = NULL;
}
return error;
}
/*
* Write the superblock and its summary information to the snapshot.
* Make sure the first UFS_NDADDR blocks get copied to the snapshot.
*/
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
int error, len, loc;
void *space;
daddr_t blkno;
struct buf *bp;
struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
/*
* Write the superblock and its summary information
* to the snapshot.
*/
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
len = howmany(fs->fs_cssize, fs->fs_bsize);
space = copyfs->fs_csp;
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs)) {
ffs_sb_swap(copyfs, copyfs);
ffs_csum_swap(space, space, fs->fs_cssize);
}
#endif
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for (loc = 0; loc < len; loc++) {
error = bread(vp, blkno + loc, fs->fs_bsize,
B_MODIFY, &bp);
if (error) {
break;
}
memcpy(bp->b_data, space, fs->fs_bsize);
space = (char *)space + fs->fs_bsize;
bawrite(bp);
}
if (error)
goto out;
error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
fs->fs_bsize, B_MODIFY, &bp);
if (error) {
goto out;
} else {
memcpy(bp->b_data, sbbuf, fs->fs_bsize);
bawrite(bp);
}
/*
* Copy the first UFS_NDADDR blocks to the snapshot so
* ffs_copyonwrite() and ffs_snapblkfree() will always work on
* indirect blocks.
*/
for (loc = 0; loc < UFS_NDADDR; loc++) {
if (db_get(ip, loc) != 0)
continue;
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
fs->fs_bsize, l->l_cred, 0, &bp);
if (error)
break;
error = rwfsblk(vp, B_READ, bp->b_data, loc);
if (error) {
brelse(bp, 0);
break;
}
bawrite(bp);
}
out:
UFS_WAPBL_END(mp);
return error;
}
/*
* Copy all cylinder group maps.
*/
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
int cg, error = 0;
struct buf *nbp;
struct fs *fs = VTOI(vp)->i_fs;
if (redo != NULL)
*redo = 0;
if (passno == 1)
fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
M_DEVBUF, M_WAITOK | M_ZERO);
for (cg = 0; cg < fs->fs_ncg; cg++) {
if (passno == 2 && ACTIVECG_ISSET(fs, cg))
continue;
if (redo != NULL)
*redo += 1;
error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error)
return error;
error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
fs->fs_bsize, curlwp->l_cred, 0, &nbp);
if (error) {
UFS_WAPBL_END(vp->v_mount);
break;
}
error = cgaccount1(cg, vp, nbp->b_data, passno);
bawrite(nbp);
UFS_WAPBL_END(vp->v_mount);
if (error)
break;
}
return error;
}
/*
* Copy a cylinder group map. All the unallocated blocks are marked
* BLK_NOCOPY so that the snapshot knows that it need not copy them
* if they are later written. If passno is one, then this is a first
* pass, so only setting needs to be done. If passno is 2, then this
* is a revision to a previous pass which must be undone as the
* replacement pass is done.
*/
static int
cgaccount1(int cg, struct vnode *vp, void *data, int passno)
{
struct buf *bp, *ibp;
struct inode *ip;
struct cg *cgp;
struct fs *fs;
struct lwp *l = curlwp;
daddr_t base, numblks;
int error, len, loc, ns __unused, indiroff;
ip = VTOI(vp);
fs = ip->i_fs;
ns = UFS_FSNEEDSWAP(fs);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, 0, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, ns)) {
brelse(bp, 0);
return (EIO);
}
ACTIVECG_SET(fs, cg);
memcpy(data, bp->b_data, fs->fs_cgsize);
brelse(bp, 0);
if (fs->fs_cgsize < fs->fs_bsize)
memset((char *)data + fs->fs_cgsize, 0,
fs->fs_bsize - fs->fs_cgsize);
numblks = howmany(fs->fs_size, fs->fs_frag);
len = howmany(fs->fs_fpg, fs->fs_frag);
base = cgbase(fs, cg) / fs->fs_frag;
if (base + len >= numblks)
len = numblks - base - 1;
loc = 0;
if (base < UFS_NDADDR) {
for ( ; loc < UFS_NDADDR; loc++) {
if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
db_assign(ip, loc, BLK_NOCOPY);
else if (db_get(ip, loc) == BLK_NOCOPY) {
if (passno == 2)
db_assign(ip, loc, 0);
else if (passno == 1)
panic("ffs_snapshot: lost direct block");
}
}
}
if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
return (error);
indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
for ( ; loc < len; loc++, indiroff++) {
if (indiroff >= FFS_NINDIR(fs)) {
bawrite(ibp);
if ((error = ffs_balloc(vp,
ffs_lblktosize(fs, (off_t)(base + loc)),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
return (error);
indiroff = 0;
}
if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
if (passno == 2)
idb_assign(ip, ibp->b_data, indiroff, 0);
else if (passno == 1)
panic("ffs_snapshot: lost indirect block");
}
}
bdwrite(ibp);
return (0);
}
/*
* Before expunging a snapshot inode, note all the
* blocks that it claims with BLK_SNAP so that fsck will
* be able to account for those blocks properly and so
* that this snapshot knows that it need not copy them
* if the other snapshot holding them is freed.
*/
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
acctfunc_t acctfunc, int expungetype)
{
int i, error, ns __unused;
daddr_t lbn, rlbn;
daddr_t len, blkno, numblks, blksperindir;
struct ufs1_dinode *dip1;
struct ufs2_dinode *dip2;
struct lwp *l = curlwp;
void *bap;
struct buf *bp;
struct mount *mp;
ns = UFS_FSNEEDSWAP(fs);
mp = snapvp->v_mount;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
/*
* Prepare to expunge the inode. If its inode block has not
* yet been copied, then allocate and fill the copy.
*/
lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
error = snapblkaddr(snapvp, lbn, &blkno);
if (error)
return error;
if (blkno != 0) {
error = bread(snapvp, lbn, fs->fs_bsize,
B_MODIFY, &bp);
} else {
error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, l->l_cred, 0, &bp);
if (! error)
error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
}
if (error) {
UFS_WAPBL_END(mp);
return error;
}
/*
* Set a snapshot inode to be a zero-length file; set regular files
* or unlinked snapshots to be completely unallocated.
*/
if (fs->fs_magic == FS_UFS1_MAGIC) {
dip1 = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
if (cancelip->i_flags & SF_SNAPSHOT) {
dip1->di_flags =
ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
SF_SNAPINVAL, ns);
}
if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
dip1->di_mode = 0;
dip1->di_size = 0;
dip1->di_blocks = 0;
memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
} else {
dip2 = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
if (cancelip->i_flags & SF_SNAPSHOT) {
dip2->di_flags =
ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
SF_SNAPINVAL, ns);
}
if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
dip2->di_mode = 0;
dip2->di_size = 0;
dip2->di_blocks = 0;
memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
}
bdwrite(bp);
UFS_WAPBL_END(mp);
/*
* Now go through and expunge all the blocks in the file
* using the function requested.
*/
numblks = howmany(cancelip->i_size, fs->fs_bsize);
if (fs->fs_magic == FS_UFS1_MAGIC)
bap = &cancelip->i_ffs1_db[0];
else
bap = &cancelip->i_ffs2_db[0];
error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
if (error)
return (error);
if (fs->fs_magic == FS_UFS1_MAGIC)
bap = &cancelip->i_ffs1_ib[0];
else
bap = &cancelip->i_ffs2_ib[0];
error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
if (error)
return (error);
blksperindir = 1;
lbn = -UFS_NDADDR;
len = numblks - UFS_NDADDR;
rlbn = UFS_NDADDR;
for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
error = indiracct(snapvp, ITOV(cancelip), i,
ib_get(cancelip, i), lbn, rlbn, len,
blksperindir, fs, acctfunc, expungetype);
if (error)
return (error);
blksperindir *= FFS_NINDIR(fs);
lbn -= blksperindir + 1;
len -= blksperindir;
rlbn += blksperindir;
}
return (0);
}
/*
* Descend an indirect block chain for vnode cancelvp accounting for all
* its indirect blocks in snapvp.
*/
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
int error, num, i;
daddr_t subblksperindir;
struct indir indirs[UFS_NIADDR + 2];
daddr_t last;
void *bap;
struct buf *bp;
if (blkno == 0) {
if (expungetype == BLK_NOCOPY)
return (0);
panic("indiracct: missing indir");
}
if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
return (error);
if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
panic("indiracct: botched params");
/*
* We have to expand bread here since it will deadlock looking
* up the block number for any blocks that are not in the cache.
*/
error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
false, &bp);
if (error)
return error;
if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
brelse(bp, 0);
return (error);
}
/*
* Account for the block pointers in this indirect block.
*/
last = howmany(remblks, blksperindir);
if (last > FFS_NINDIR(fs))
last = FFS_NINDIR(fs);
bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
memcpy((void *)bap, bp->b_data, fs->fs_bsize);
brelse(bp, 0);
error = (*acctfunc)(snapvp, bap, 0, last,
fs, level == 0 ? rlbn : -1, expungetype);
if (error || level == 0)
goto out;
/*
* Account for the block pointers in each of the indirect blocks
* in the levels below us.
*/
subblksperindir = blksperindir / FFS_NINDIR(fs);
for (lbn++, level--, i = 0; i < last; i++) {
error = indiracct(snapvp, cancelvp, level,
idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
subblksperindir, fs, acctfunc, expungetype);
if (error)
goto out;
rlbn += blksperindir;
lbn -= blksperindir;
remblks -= blksperindir;
}
out:
free(bap, M_DEVBUF);
return (error);
}
/*
* Do both snap accounting and map accounting.
*/
static int
fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno,
int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
int error;
if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
return (error);
return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
}
/*
* Identify a set of blocks allocated in a snapshot inode.
*/
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno,
int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
struct mount *mp = vp->v_mount;
daddr_t blkno;
daddr_t lbn;
struct buf *ibp;
int error, n;
const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for ( n = 0; oldblkp < lastblkp; oldblkp++) {
blkno = idb_get(ip, bap, oldblkp);
if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
continue;
lbn = ffs_fragstoblks(fs, blkno);
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
} else {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
break;
blkno = idb_get(ip, ibp->b_data,
(lbn - UFS_NDADDR) % FFS_NINDIR(fs));
}
/*
* If we are expunging a snapshot vnode and we
* find a block marked BLK_NOCOPY, then it is
* one that has been allocated to this snapshot after
* we took our current snapshot and can be ignored.
*/
if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
} else {
if (blkno != 0)
panic("snapacct: bad block");
if (lbn < UFS_NDADDR)
db_assign(ip, lbn, expungetype);
else {
idb_assign(ip, ibp->b_data,
(lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
bdwrite(ibp);
}
}
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
UFS_WAPBL_END(mp);
return error;
}
/*
* Account for a set of blocks allocated in a snapshot inode.
*/
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno, int expungetype)
{
daddr_t blkno;
struct inode *ip;
struct mount *mp = vp->v_mount;
ino_t inum;
int acctit, error, n;
const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
ip = VTOI(vp);
inum = ip->i_number;
if (lblkno == -1)
acctit = 0;
else
acctit = 1;
for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
blkno = idb_get(ip, bap, oldblkp);
if (blkno == 0 || blkno == BLK_NOCOPY)
continue;
if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = ffs_blkstofrags(fs, lblkno);
ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
UFS_WAPBL_END(mp);
return (0);
}
/*
* Number of blocks that fit into the journal or zero if not logging.
*/
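/*
* Worked example with hypothetical numbers: an in-filesystem journal of
* 1024 blocks of 16 KiB gives bpj = 16 MiB, and with an fs_bsize of 16 KiB
* this function returns 1024.
*/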
static int
blocks_in_journal(struct fs *fs)
{
off_t bpj;
if ((fs->fs_flags & FS_DOWAPBL) == 0)
return 0;
bpj = 1;
if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
switch (fs->fs_journal_location) {
case UFS_WAPBL_JOURNALLOC_END_PARTITION:
bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
break;
case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
break;
}
}
bpj /= fs->fs_bsize;
return (bpj > 0 ? bpj : 1);
}
#endif /* defined(FFS_NO_SNAPSHOT) */
/*
* Decrement extra reference on snapshot when last name is removed.
* It will not be freed until the last open reference goes away.
*/
void
ffs_snapgone(struct vnode *vp)
{
struct inode *xp, *ip = VTOI(vp);
struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
struct fs *fs;
struct snap_info *si;
int snaploc;
si = VFSTOUFS(mp)->um_snapinfo;
/*
* Find snapshot in incore list.
*/
mutex_enter(&si->si_lock);
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
if (xp == ip)
break;
mutex_exit(&si->si_lock);
if (xp != NULL)
vrele(ITOV(ip));
#ifdef DEBUG
else if (snapdebug)
printf("ffs_snapgone: lost snapshot vnode %llu\n",
(unsigned long long)ip->i_number);
#endif
/*
* Delete snapshot inode from superblock. Keep list dense.
*/
mutex_enter(&si->si_lock);
fs = ip->i_fs;
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
if (fs->fs_snapinum[snaploc] == ip->i_number)
break;
if (snaploc < FSMAXSNAP) {
for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
if (fs->fs_snapinum[snaploc] == 0)
break;
fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
}
fs->fs_snapinum[snaploc - 1] = 0;
}
si->si_gen++;
mutex_exit(&si->si_lock);
}
/*
* Prepare a snapshot file for being removed.
*/
void
ffs_snapremove(struct vnode *vp)
{
struct inode *ip = VTOI(vp), *xp;
struct vnode *devvp = ip->i_devvp;
struct fs *fs = ip->i_fs;
struct mount *mp = spec_node_getmountedfs(devvp);
struct buf *ibp;
struct snap_info *si;
struct lwp *l = curlwp;
daddr_t numblks, blkno, dblk;
int error, loc, last;
si = VFSTOUFS(mp)->um_snapinfo;
/*
* If active, delete from incore list (this snapshot may
* already have been in the process of being deleted, so
* would not have been active).
*
* Clear copy-on-write flag if last snapshot.
*/
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
if (is_active_snapshot(si, ip)) {
TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
if (TAILQ_FIRST(&si->si_snapshots) != 0) {
/* Roll back the list of preallocated blocks. */
xp = TAILQ_LAST(&si->si_snapshots, inodelst);
si->si_snapblklist = xp->i_snapblklist;
si->si_gen++;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
} else {
si->si_snapblklist = 0;
si->si_gen++;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
fscow_disestablish(mp, ffs_copyonwrite, devvp);
}
if (ip->i_snapblklist != NULL) {
free(ip->i_snapblklist, M_UFSMNT);
ip->i_snapblklist = NULL;
}
} else {
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
}
/*
* Clear all BLK_NOCOPY fields. Pass any block claims to other
* snapshots that want them (see ffs_snapblkfree below).
*/
for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
dblk = db_get(ip, blkno);
if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
db_assign(ip, blkno, 0);
else if ((dblk == ffs_blkstofrags(fs, blkno) &&
ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
ip->i_number))) {
DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
db_assign(ip, blkno, 0);
}
}
numblks = howmany(ip->i_size, fs->fs_bsize);
for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
continue;
if (fs->fs_size - blkno > FFS_NINDIR(fs))
last = FFS_NINDIR(fs);
else
last = fs->fs_size - blkno;
for (loc = 0; loc < last; loc++) {
dblk = idb_get(ip, ibp->b_data, loc);
if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
idb_assign(ip, ibp->b_data, loc, 0);
else if (dblk == ffs_blkstofrags(fs, blkno) &&
ffs_snapblkfree(fs, ip->i_devvp, dblk,
fs->fs_bsize, ip->i_number)) {
DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
idb_assign(ip, ibp->b_data, loc, 0);
}
}
bawrite(ibp);
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
KASSERT(error == 0);
}
/*
* Clear snapshot flag and drop reference.
*/
ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
chkiq(ip, 1, l->l_cred, FORCE);
#endif
}
/*
* Notification that a block is being freed. Return zero if the free
* should be allowed to proceed. Return non-zero if the snapshot file
* wants to claim the block. The block will be claimed if it is an
* uncopied part of one of the snapshots. It will be freed if it is
* either a BLK_NOCOPY or has already been copied in all of the snapshots.
* If a fragment is being freed, then all snapshots that care about
* it must make a copy since a snapshot file can only claim full sized
* blocks. Note that if more than one snapshot file maps the block,
* we can pick one at random to claim it. Since none of the snapshots
* can change, we are assured that they will all see the same unmodified
* image. When deleting a snapshot file (see ffs_snapremove above), we
* must push any of these claimed blocks to one of the other snapshots
* that maps it. These claimed blocks are easily identified as they will
* have a block number equal to their logical block number within the
* snapshot. A copied block can never have this property because they
* must always have been allocated from a BLK_NOCOPY location.
*/
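/*
* Illustrative example (hypothetical numbers): if logical block 100 of a
* snapshot maps to exactly ffs_blkstofrags(fs, 100), the block was claimed
* on a free rather than copied; ffs_snapremove() above relies on exactly
* this equality to tell claimed blocks from copied ones.
*/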
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
long size, ino_t inum)
{
struct mount *mp = spec_node_getmountedfs(devvp);
struct buf *ibp;
struct inode *ip;
struct vnode *vp = NULL;
struct snap_info *si;
void *saved_data = NULL;
daddr_t lbn;
daddr_t blkno;
uint32_t gen;
int indiroff = 0, error = 0, claimedblk = 0;
si = VFSTOUFS(mp)->um_snapinfo;
lbn = ffs_fragstoblks(fs, bno);
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
si->si_owner = curlwp;
retry:
gen = si->si_gen;
TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
vp = ITOV(ip);
/*
* Lookup block being written.
*/
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
} else {
mutex_exit(&si->si_lock);
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
if (error) {
mutex_enter(&si->si_lock);
break;
}
indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
blkno = idb_get(ip, ibp->b_data, indiroff);
mutex_enter(&si->si_lock);
if (gen != si->si_gen) {
brelse(ibp, 0);
goto retry;
}
}
/*
* Check to see if block needs to be copied.
*/
if (blkno == 0) {
/*
* A block that we map is being freed. If it has not
* been claimed yet, we will claim or copy it (below).
*/
claimedblk = 1;
} else if (blkno == BLK_SNAP) {
/*
* No previous snapshot claimed the block,
* so it will be freed and become a BLK_NOCOPY
* (don't care) for us.
*/
if (claimedblk)
panic("snapblkfree: inconsistent block type");
if (lbn < UFS_NDADDR) {
db_assign(ip, lbn, BLK_NOCOPY);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
} else {
idb_assign(ip, ibp->b_data, indiroff,
BLK_NOCOPY);
mutex_exit(&si->si_lock);
if (ip->i_nlink > 0)
bwrite(ibp);
else
bdwrite(ibp);
mutex_enter(&si->si_lock);
if (gen != si->si_gen)
goto retry;
}
continue;
} else /* BLK_NOCOPY or default */ {
/*
* If the snapshot has already copied the block
* (default), or does not care about the block,
* it is not needed.
*/
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
continue;
}
/*
* If this is a full size block, we will just grab it
* and assign it to the snapshot inode. Otherwise we
* will proceed to copy it. See explanation for this
* routine as to why only a single snapshot needs to
* claim this block.
*/
if (size == fs->fs_bsize) {
#ifdef DEBUG
if (snapdebug)
printf("%s %llu lbn %" PRId64
"from inum %llu\n",
"Grabonremove: snapino",
(unsigned long long)ip->i_number,
lbn, (unsigned long long)inum);
#endif
mutex_exit(&si->si_lock);
if (lbn < UFS_NDADDR) {
db_assign(ip, lbn, bno);
} else {
idb_assign(ip, ibp->b_data, indiroff, bno);
if (ip->i_nlink > 0)
bwrite(ibp);
else
bdwrite(ibp);
}
DIP_ADD(ip, blocks, btodb(size));
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
else
error = 0;
mutex_enter(&si->si_lock);
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
return (error == 0);
}
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
#ifdef DEBUG
if (snapdebug)
printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
"Copyonremove: snapino ",
(unsigned long long)ip->i_number,
lbn, "for inum", (unsigned long long)inum, size);
#endif
/*
* If we have already read the old block contents, then
* simply copy them to the new block. Note that we need
* to synchronously write snapshots that have not been
* unlinked, and hence will be visible after a crash,
* to ensure their integrity.
*/
mutex_exit(&si->si_lock);
if (saved_data == NULL) {
saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
error = rwfsblk(vp, B_READ, saved_data, lbn);
if (error) {
free(saved_data, M_UFSMNT);
saved_data = NULL;
mutex_enter(&si->si_lock);
break;
}
}
error = wrsnapblk(vp, saved_data, lbn);
if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
mutex_enter(&si->si_lock);
if (error)
break;
if (gen != si->si_gen)
goto retry;
}
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
if (saved_data)
free(saved_data, M_UFSMNT);
/*
* If we have been unable to allocate a block in which to do
* the copy, then return non-zero so that the fragment will
* not be freed. Although space will be lost, the snapshot
* will stay consistent.
*/
return (error);
}
/*
* Associate snapshot files when mounting.
*/
void
ffs_snapshot_mount(struct mount *mp)
{
struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct lwp *l = curlwp;
struct vnode *vp;
struct inode *ip, *xp;
struct snap_info *si;
daddr_t snaplistsize, *snapblklist;
int i, error, ns __unused, snaploc, loc;
/*
* No persistent snapshots on apple ufs file systems.
*/
if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
return;
si = VFSTOUFS(mp)->um_snapinfo;
ns = UFS_FSNEEDSWAP(fs);
/*
* XXX The following needs to be set before ffs_truncate or
* VOP_READ can be called.
*/
mp->mnt_stat.f_iosize = fs->fs_bsize;
/*
* Process each snapshot listed in the superblock.
*/
vp = NULL;
mutex_enter(&si->si_lock);
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
if (fs->fs_snapinum[snaploc] == 0)
break;
if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
LK_EXCLUSIVE, &vp)) != 0) {
printf("ffs_snapshot_mount: vget failed %d\n", error);
continue;
}
ip = VTOI(vp);
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
SF_SNAPSHOT) {
printf("ffs_snapshot_mount: non-snapshot inode %d\n",
fs->fs_snapinum[snaploc]);
vput(vp);
vp = NULL;
for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
if (fs->fs_snapinum[loc] == 0)
break;
fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
}
fs->fs_snapinum[loc - 1] = 0;
snaploc--;
continue;
}
/*
* Read the block hints list. Use an empty list on
* read errors.
*/
error = vn_rdwr(UIO_READ, vp,
(void *)&snaplistsize, sizeof(snaplistsize),
ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
l->l_cred, NULL, NULL);
if (error) {
printf("ffs_snapshot_mount: read_1 failed %d\n", error);
snaplistsize = 1;
} else
snaplistsize = ufs_rw64(snaplistsize, ns);
snapblklist = malloc(
snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
if (error)
snapblklist[0] = 1;
else {
error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
snaplistsize * sizeof(daddr_t),
ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
l->l_cred, NULL, NULL);
for (i = 0; i < snaplistsize; i++)
snapblklist[i] = ufs_rw64(snapblklist[i], ns);
if (error) {
printf("ffs_snapshot_mount: read_2 failed %d\n",
error);
snapblklist[0] = 1;
}
}
ip->i_snapblklist = &snapblklist[0];
/*
* Link it onto the active snapshot list.
*/
if (is_active_snapshot(si, ip))
panic("ffs_snapshot_mount: %"PRIu64" already on list",
ip->i_number);
else
TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
vp->v_vflag |= VV_SYSTEM;
VOP_UNLOCK(vp);
}
/*
* No usable snapshots found.
*/
if (vp == NULL) {
mutex_exit(&si->si_lock);
return;
}
/*
* Attach the block hints list. We always want to
* use the list from the newest snapshot.
*/
xp = TAILQ_LAST(&si->si_snapshots, inodelst);
si->si_snapblklist = xp->i_snapblklist;
fscow_establish(mp, ffs_copyonwrite, devvp);
si->si_gen++;
mutex_exit(&si->si_lock);
}
/*
* Disassociate snapshot files when unmounting.
*/
void
ffs_snapshot_unmount(struct mount *mp)
{
struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
struct inode *xp;
struct vnode *vp = NULL;
struct snap_info *si;
si = VFSTOUFS(mp)->um_snapinfo;
mutex_enter(&si->si_lock);
while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
vp = ITOV(xp);
TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
if (xp->i_snapblklist == si->si_snapblklist)
si->si_snapblklist = NULL;
free(xp->i_snapblklist, M_UFSMNT);
if (xp->i_nlink > 0) {
si->si_gen++;
mutex_exit(&si->si_lock);
vrele(vp);
mutex_enter(&si->si_lock);
}
}
si->si_gen++;
mutex_exit(&si->si_lock);
if (vp)
fscow_disestablish(mp, ffs_copyonwrite, devvp);
}
/*
* Check for need to copy block that is about to be written,
* copying the block if necessary.
*/
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
struct fs *fs;
struct inode *ip;
struct vnode *devvp = v, *vp = NULL;
struct mount *mp = spec_node_getmountedfs(devvp);
struct snap_info *si;
void *saved_data = NULL;
daddr_t lbn, blkno, *snapblklist;
uint32_t gen;
int lower, upper, mid, snapshot_locked = 0, error = 0;
/*
* Check for valid snapshots.
*/
si = VFSTOUFS(mp)->um_snapinfo;
mutex_enter(&si->si_lock);
ip = TAILQ_FIRST(&si->si_snapshots);
if (ip == NULL) {
mutex_exit(&si->si_lock);
return 0;
}
/*
* First check to see if it is after the file system,
* in the journal or in the preallocated list.
* By doing these checks we avoid several potential deadlocks.
*/
fs = ip->i_fs;
lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
mutex_exit(&si->si_lock);
return 0;
}
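/*
* Writes that land inside the in-filesystem WAPBL journal can be skipped
* early: the journal inode was expunged from every snapshot's view when the
* snapshot was taken (see snapshot_expunge()), so no copy is needed and the
* deadlock-prone lookup below is avoided.
*/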
if ((fs->fs_flags & FS_DOWAPBL) &&
fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
off_t blk_off, log_start, log_end;
log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
blk_off = dbtob(bp->b_blkno);
if (blk_off >= log_start && blk_off < log_end) {
mutex_exit(&si->si_lock);
return 0;
}
}
snapblklist = si->si_snapblklist;
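/*
* si_snapblklist, as built by snapshot_expunge()/snapshot_expunge_snap(),
* keeps the entry count in element 0 and sorted logical block numbers in
* elements 1..snapblklist[0]-1, hence the search bounds below.
*/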
upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
lower = 1;
while (lower <= upper) {
mid = (lower + upper) / 2;
if (snapblklist[mid] == lbn)
break;
if (snapblklist[mid] < lbn)
lower = mid + 1;
else
upper = mid - 1;
}
if (lower <= upper) {
mutex_exit(&si->si_lock);
return 0;
}
/*
* Not in the precomputed list, so check the snapshots.
*/
if (si->si_owner != curlwp) {
if (!mutex_tryenter(&si->si_snaplock)) {
mutex_exit(&si->si_lock);
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
}
si->si_owner = curlwp;
snapshot_locked = 1;
}
if (data_valid && bp->b_bcount == fs->fs_bsize)
saved_data = bp->b_data;
retry:
gen = si->si_gen;
TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
vp = ITOV(ip);
/*
* We ensure that everything of our own that needs to be
* copied will be done at the time that ffs_snapshot is
* called. Thus we can skip the check here which can
* deadlock in doing the lookup in ffs_balloc.
*/
if (bp->b_vp == vp)
continue;
/*
* Check to see if block needs to be copied.
*/
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
} else {
mutex_exit(&si->si_lock);
blkno = 0; /* XXX: GCC */
if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
mutex_enter(&si->si_lock);
break;
}
mutex_enter(&si->si_lock);
if (gen != si->si_gen)
goto retry;
}
KASSERTMSG((blkno != BLK_SNAP || bp->b_lblkno < 0),
"ffs_copyonwrite: bad copy block: blkno %jd, lblkno %jd",
(intmax_t)blkno, (intmax_t)bp->b_lblkno);
if (blkno != 0)
continue;
if (curlwp == uvm.pagedaemon_lwp) {
error = ENOMEM;
break;
}
/* Only one level of recursion allowed. */
KASSERT(snapshot_locked);
/*
* Allocate the block into which to do the copy. Since
* multiple processes may all try to copy the same block,
* we have to recheck our need to do a copy if we sleep
* waiting for the lock.
*
* Because all snapshots on a filesystem share a single
* lock, we ensure that we will never be in competition
* with another process to allocate a block.
*/
#ifdef DEBUG
if (snapdebug) {
printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
(unsigned long long)ip->i_number, lbn);
if (bp->b_vp == devvp)
printf("fs metadata");
else
printf("inum %llu", (unsigned long long)
VTOI(bp->b_vp)->i_number);
printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
}
#endif
/*
* If we have already read the old block contents, then
* simply copy them to the new block. Note that we need
* to synchronously write snapshots that have not been
* unlinked, and hence will be visible after a crash,
* to ensure their integrity.
*/
mutex_exit(&si->si_lock);
if (saved_data == NULL) {
saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
error = rwfsblk(vp, B_READ, saved_data, lbn);
if (error) {
free(saved_data, M_UFSMNT);
saved_data = NULL;
mutex_enter(&si->si_lock);
break;
}
}
error = wrsnapblk(vp, saved_data, lbn);
if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
mutex_enter(&si->si_lock);
if (error)
break;
if (gen != si->si_gen)
goto retry;
}
/*
* Note that we need to synchronously write snapshots that
* have not been unlinked, and hence will be visible after
* a crash, to ensure their integrity.
*/
if (snapshot_locked) {
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
} else
mutex_exit(&si->si_lock);
if (saved_data && saved_data != bp->b_data)
free(saved_data, M_UFSMNT);
return error;
}
/*
* Read from a snapshot.
*/
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
struct buf *bp;
daddr_t lbn, nextlbn;
off_t fsbytes, bytesinfile;
long size, xfersize, blkoffset;
int error;
mutex_enter(&si->si_snaplock);
if (ioflag & IO_ALTSEMANTICS)
fsbytes = ip->i_size;
else
fsbytes = ffs_lfragtosize(fs, fs->fs_size);
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
bytesinfile = fsbytes - uio->uio_offset;
if (bytesinfile <= 0)
break;
lbn = ffs_lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
size = fs->fs_bsize;
blkoffset = ffs_blkoff(fs, uio->uio_offset);
xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
bytesinfile);
if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
if (ffs_lblktosize(fs, lbn) + size > fsbytes)
size = ffs_fragroundup(fs,
fsbytes - ffs_lblktosize(fs, lbn));
error = bread(vp, lbn, size, 0, &bp);
} else {
int nextsize = fs->fs_bsize;
error = breadn(vp, lbn,
size, &nextlbn, &nextsize, 1, 0, &bp);
}
if (error)
break;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < blkoffset + xfersize) {
xfersize = size - blkoffset;
if (xfersize <= 0)
break;
}
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp, BC_AGE);
}
if (bp != NULL)
brelse(bp, BC_AGE);
mutex_exit(&si->si_snaplock);
return error;
}
/*
* Lookup a snapshots data block address.
* Simpler than UFS_BALLOC() as we know all metadata is already allocated
* and safe even for the pagedaemon where we cannot bread().
*/
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
struct indir indirs[UFS_NIADDR + 2];
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *bp;
int error, num;
KASSERT(lbn >= 0);
if (lbn < UFS_NDADDR) {
*res = db_get(ip, lbn);
return 0;
}
if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
return error;
if (curlwp == uvm.pagedaemon_lwp) {
mutex_enter(&bufcache_lock);
bp = incore(vp, indirs[num-1].in_lbn);
if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
error = 0;
} else
error = ENOMEM;
mutex_exit(&bufcache_lock);
return error;
}
error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp);
if (error == 0) {
*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
brelse(bp, 0);
}
return error;
}
/*
* Read or write the specified block of the filesystem vp resides on
* from or to the disk bypassing the buffer cache.
*/
static int
rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
{
int error;
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *nbp;
nbp = getiobuf(NULL, true);
nbp->b_flags = flags;
nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
nbp->b_error = 0;
nbp->b_data = data;
nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
nbp->b_proc = NULL;
nbp->b_dev = ip->i_devvp->v_rdev;
SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
bdev_strategy(nbp);
error = biowait(nbp);
putiobuf(nbp);
return error;
}
/*
* Write all dirty buffers to disk and invalidate them.
*/
static int
syncsnap(struct vnode *vp)
{
int error;
buf_t *bp;
struct fs *fs = VTOI(vp)->i_fs;
mutex_enter(&bufcache_lock);
while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
error = bbusy(bp, false, 0, NULL);
if (error == EPASSTHROUGH)
continue;
else if (error != 0) {
mutex_exit(&bufcache_lock);
return error;
}
KASSERT(bp->b_bcount == fs->fs_bsize);
mutex_exit(&bufcache_lock);
error = rwfsblk(vp, B_WRITE, bp->b_data,
ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
brelse(bp, BC_INVAL | BC_VFLUSH);
if (error)
return error;
mutex_enter(&bufcache_lock);
}
mutex_exit(&bufcache_lock);
return 0;
}
/*
* Write the specified block to a snapshot.
*/
static int
wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
{
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *bp;
int error;
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
if (error)
return error;
memcpy(bp->b_data, data, fs->fs_bsize);
if (ip->i_nlink > 0)
error = bwrite(bp);
else
bawrite(bp);
return error;
}
/*
* Check if this inode is present on the active snapshot list.
* Must be called with snapinfo locked.
*/
static inline bool
is_active_snapshot(struct snap_info *si, struct inode *ip)
{
struct inode *xp;
KASSERT(mutex_owned(&si->si_lock));
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
if (xp == ip)
return true;
return false;
}
/*
* Get/Put direct block from inode or buffer containing disk addresses. Take
* care for fs type (UFS1/UFS2) and byte swapping. These functions should go
* into a global include.
*/
static inline daddr_t
db_get(struct inode *ip, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}
static inline void
db_assign(struct inode *ip, int loc, daddr_t val)
{
if (ip->i_ump->um_fstype == UFS1)
ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
else
ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}
__unused static inline daddr_t
ib_get(struct inode *ip, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}
static inline daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
}
static inline void
idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
{
if (ip->i_ump->um_fstype == UFS1)
((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
else
((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}
/* $NetBSD: ufs_wapbl.h,v 1.19 2020/04/11 17:43:54 jdolecek Exp $ */
/*-
* Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _UFS_UFS_UFS_WAPBL_H_
#define _UFS_UFS_UFS_WAPBL_H_
#if defined(_KERNEL_OPT)
#include "opt_wapbl.h"
#endif
/*
* Information for the journal location stored in the superblock.
* We store the journal version, some flags, the journal location
* type, and some location specific "locators" that identify where
* the log itself is located.
*/
/* fs->fs_journal_version */
#define UFS_WAPBL_VERSION 1
/* fs->fs_journal_location */
#define UFS_WAPBL_JOURNALLOC_NONE 0
#define UFS_WAPBL_JOURNALLOC_END_PARTITION 1
#define UFS_WAPBL_EPART_ADDR 0 /* locator slots */
#define UFS_WAPBL_EPART_COUNT 1
#define UFS_WAPBL_EPART_BLKSZ 2
#define UFS_WAPBL_EPART_UNUSED 3
#define UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM 2
#define UFS_WAPBL_INFS_ADDR 0 /* locator slots */
#define UFS_WAPBL_INFS_COUNT 1
#define UFS_WAPBL_INFS_BLKSZ 2
#define UFS_WAPBL_INFS_INO 3
/* fs->fs_journal_flags */
#define UFS_WAPBL_FLAGS_CREATE_LOG 0x1
#define UFS_WAPBL_FLAGS_CLEAR_LOG 0x2
/*
* The journal size is limited to between 1MB and 64MB.
* The default journal size is the filesystem size divided by
* the scale factor - this is 1M of journal per 1GB of filesystem
* space.
*
* XXX: Is 64MB too limiting? If user explicitly asks for more, allow it?
*/
#define UFS_WAPBL_JOURNAL_SCALE 1024
#define UFS_WAPBL_MIN_JOURNAL_SIZE (1024 * 1024)
#define UFS_WAPBL_MAX_JOURNAL_SIZE (64 * 1024 * 1024)
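/*
* Example with these constants: a 16 GB filesystem gets 16 GB / 1024 = 16 MB
* of journal, well inside the [1 MB, 64 MB] clamp; a 512 MB filesystem would
* compute 512 KB and be raised to the 1 MB minimum.
*/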
#if defined(WAPBL)
static __inline int
ufs_wapbl_begin(struct mount *mp, const char *file, int line)
{
if (mp->mnt_wapbl) {
int error;
error = wapbl_begin(mp->mnt_wapbl, file, line);
if (error)
return error;
}
return 0;
}
static __inline void
ufs_wapbl_end(struct mount *mp)
{
if (mp->mnt_wapbl) {
wapbl_end(mp->mnt_wapbl);
}
}
#define UFS_WAPBL_BEGIN(mp) \
ufs_wapbl_begin(mp, __func__, __LINE__)
#define UFS_WAPBL_END(mp) ufs_wapbl_end(mp)
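/*
* Typical usage, mirroring the callers in ffs_snapshot.c above:
*
*	error = UFS_WAPBL_BEGIN(mp);
*	if (error)
*		return error;
*	... journalled metadata updates ...
*	UFS_WAPBL_END(mp);
*/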
#define UFS_WAPBL_UPDATE(vp, access, modify, flags) \
if ((vp)->v_mount->mnt_wapbl) { \
UFS_UPDATE(vp, access, modify, flags); \
}
#ifdef DIAGNOSTIC
#define UFS_WAPBL_JLOCK_ASSERT(mp) \
if (mp->mnt_wapbl) wapbl_jlock_assert(mp->mnt_wapbl)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp) \
if (mp->mnt_wapbl) wapbl_junlock_assert(mp->mnt_wapbl)
#else
#define UFS_WAPBL_JLOCK_ASSERT(mp)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp)
#endif
#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) \
if (mp->mnt_wapbl) wapbl_register_inode(mp->mnt_wapbl, ino, mode)
#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) \
if (mp->mnt_wapbl) wapbl_unregister_inode(mp->mnt_wapbl, ino, mode)
#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len, cookiep) \
( \
(mp->mnt_wapbl) \
? wapbl_register_deallocation(mp->mnt_wapbl, blk, len, \
false, cookiep) \
: 0 \
)
#define UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(mp, blk, len) \
( \
(mp->mnt_wapbl) \
? wapbl_register_deallocation(mp->mnt_wapbl, blk, len, \
true, NULL) \
: 0 \
)
#define UFS_WAPBL_UNREGISTER_DEALLOCATION(mp, cookie) \
if (mp->mnt_wapbl) wapbl_unregister_deallocation(mp->mnt_wapbl, cookie)
#else /* ! WAPBL */
#define UFS_WAPBL_BEGIN(mp) (__USE(mp), 0)
#define UFS_WAPBL_END(mp) do { } while (0)
#define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0)
#define UFS_WAPBL_JLOCK_ASSERT(mp)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp)
#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) do { } while (0)
#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) do { } while (0)
#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len, cookiep) 0
#define UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(mp, blk, len) 0
#define UFS_WAPBL_UNREGISTER_DEALLOCATION(mp, cookie) do { } while (0)
#endif
#endif /* !_UFS_UFS_UFS_WAPBL_H_ */
/* $NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $ */
/*-
* Copyright (c) 1997-2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $");
#include "veriexec.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/namei.h>
#include <sys/extattr.h>
#include <sys/verified_exec.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vfs_syscalls.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm_swap.h>
enum mountlist_type {
ME_MOUNT,
ME_MARKER
};
struct mountlist_entry {
TAILQ_ENTRY(mountlist_entry) me_list; /* Mount list. */
struct mount *me_mount; /* Actual mount if ME_MOUNT,
otherwise the iterator's current mount. */
enum mountlist_type me_type; /* Mount or marker. */
};
struct mount_iterator {
struct mountlist_entry mi_entry;
};
static struct vnode *vfs_vnode_iterator_next1(struct vnode_iterator *,
bool (*)(void *, struct vnode *), void *, bool);
/* Root filesystem. */
vnode_t * rootvnode;
/* Mounted filesystem list. */
static TAILQ_HEAD(mountlist, mountlist_entry) mountlist;
static kmutex_t mountlist_lock __cacheline_aligned;
int vnode_offset_next_by_lru /* XXX: ugly hack for pstat.c */
= offsetof(vnode_impl_t, vi_lrulist.tqe_next);
kmutex_t vfs_list_lock __cacheline_aligned;
static specificdata_domain_t mount_specificdata_domain;
static kmutex_t mntid_lock;
static kmutex_t mountgen_lock __cacheline_aligned;
static uint64_t mountgen;
void
vfs_mount_sysinit(void)
{
TAILQ_INIT(&mountlist);
mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
mount_specificdata_domain = specificdata_domain_create();
mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
mountgen = 0;
}
struct mount *
vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
{
struct mount *mp;
int error __diagused;
mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
mp->mnt_op = vfsops;
mp->mnt_refcnt = 1;
TAILQ_INIT(&mp->mnt_vnodelist);
mp->mnt_renamelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_vnodelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_updating = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_vnodecovered = vp;
mount_initspecific(mp);
error = fstrans_mount(mp);
KASSERT(error == 0);
mutex_enter(&mountgen_lock);
mp->mnt_gen = mountgen++;
mutex_exit(&mountgen_lock);
return mp;
}
/*
* vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
* initialize a mount structure for it.
*
* Devname is usually updated by mount(8) after booting.
*/
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
struct mount **mpp)
{
struct vfsops *vfsp = NULL;
struct mount *mp;
int error __diagused;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(vfsp, &vfs_list, vfs_list)
if (!strncmp(vfsp->vfs_name, fstypename,
sizeof(mp->mnt_stat.f_fstypename)))
break;
if (vfsp == NULL) {
mutex_exit(&vfs_list_lock);
return (ENODEV);
}
vfsp->vfs_refcount++;
mutex_exit(&vfs_list_lock);
if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
return ENOMEM;
error = vfs_busy(mp);
KASSERT(error == 0);
mp->mnt_flag = MNT_RDONLY;
(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
sizeof(mp->mnt_stat.f_fstypename));
mp->mnt_stat.f_mntonname[0] = '/';
mp->mnt_stat.f_mntonname[1] = '\0';
mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
'\0';
(void)copystr(devname, mp->mnt_stat.f_mntfromname,
sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
*mpp = mp;
return 0;
}
/*
* vfs_getnewfsid: get a new unique fsid.
*/
void
vfs_getnewfsid(struct mount *mp)
{
static u_short xxxfs_mntid;
struct mountlist_entry *me;
fsid_t tfsid;
int mtype;
mutex_enter(&mntid_lock);
if (xxxfs_mntid == 0)
++xxxfs_mntid;
mtype = makefstype(mp->mnt_op->vfs_name);
tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
tfsid.__fsid_val[1] = mtype;
/* Always increment to not return the same fsid to parallel mounts. */
xxxfs_mntid++;
/*
* Directly walk mountlist to prevent deadlock through
* mountlist_iterator_next() -> vfs_busy().
*/
mutex_enter(&mountlist_lock);
for (me = TAILQ_FIRST(&mountlist); me != TAILQ_END(&mountlist); ) {
if (me->me_type == ME_MOUNT &&
me->me_mount->mnt_stat.f_fsidx.__fsid_val[0] ==
tfsid.__fsid_val[0] &&
me->me_mount->mnt_stat.f_fsidx.__fsid_val[1] ==
tfsid.__fsid_val[1]) {
tfsid.__fsid_val[0]++;
xxxfs_mntid++;
me = TAILQ_FIRST(&mountlist);
} else {
me = TAILQ_NEXT(me, me_list);
}
}
mutex_exit(&mountlist_lock);
mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1];
mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
mutex_exit(&mntid_lock);
}
/*
* Lookup a mount point by filesystem identifier.
*
* XXX Needs to add a reference to the mount point.
*/
struct mount *
vfs_getvfs(fsid_t *fsid)
{
mount_iterator_t *iter;
struct mount *mp;
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_next(iter)) != NULL) {
if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
mountlist_iterator_destroy(iter);
return mp;
}
}
mountlist_iterator_destroy(iter);
return NULL;
}
/*
* Take a reference to a mount structure.
*/
void
vfs_ref(struct mount *mp)
{
KASSERT(mp->mnt_refcnt > 0 || mutex_owned(&mountlist_lock));
atomic_inc_uint(&mp->mnt_refcnt);
}
/*
* Drop a reference to a mount structure, freeing if the last reference.
*/
void
vfs_rele(struct mount *mp)
{
membar_release();
if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
return;
}
membar_acquire();
/*
* Nothing else has visibility of the mount: we can now
* free the data structures.
*/
KASSERT(mp->mnt_refcnt == 0);
specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
mutex_obj_free(mp->mnt_updating);
mutex_obj_free(mp->mnt_renamelock);
mutex_obj_free(mp->mnt_vnodelock);
if (mp->mnt_op != NULL) {
vfs_delref(mp->mnt_op);
}
fstrans_unmount(mp);
/*
* The final free of mp is done from fstrans_mount_dtor().
*
* This prevents the memory from being reused as a mount before
* fstrans releases all references to it.
*/
}
/*
* Mark a mount point as busy, and gain a new reference to it. Used to
* prevent the file system from being unmounted during critical sections.
*
* vfs_busy can be called multiple times and by multiple threads
* and must be accompanied by the same number of vfs_unbusy calls.
*
* => The caller must hold a pre-existing reference to the mount.
* => Will fail if the file system is being unmounted, or is unmounted.
*/
static inline int
_vfs_busy(struct mount *mp, bool wait)
{
KASSERT(mp->mnt_refcnt > 0);
if (wait) {
fstrans_start(mp);
} else {
if (fstrans_start_nowait(mp))
return EBUSY;
}
if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
fstrans_done(mp);
return ENOENT;
}
vfs_ref(mp);
return 0;
}
int
vfs_busy(struct mount *mp)
{
return _vfs_busy(mp, true);
}
int
vfs_trybusy(struct mount *mp)
{
return _vfs_busy(mp, false);
}
/*
* Unbusy a busy filesystem.
*
* Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
*/
void
vfs_unbusy(struct mount *mp)
{
KASSERT(mp->mnt_refcnt > 0);
fstrans_done(mp);
vfs_rele(mp);
}
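/*
* Usage sketch of the pairing described above, assuming the caller
* already holds a reference (for example from vfs_ref() or a mountlist
* iterator):
*
*	vfs_ref(mp);
*	if (vfs_busy(mp) == 0) {
*		// ... the file system cannot be unmounted here ...
*		vfs_unbusy(mp);
*	}
*	vfs_rele(mp);
*/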
/*
* Change a file system's lower mount.
* Both the current and the new lower mount may be NULL. The caller
* guarantees exclusive access to the mount and holds a pre-existing
* reference to the new lower mount.
*/
int
vfs_set_lowermount(struct mount *mp, struct mount *lowermp)
{
struct mount *oldlowermp;
int error;
#ifdef DEBUG
/*
* Limit the depth of file system stack so kernel sanitizers
* may stress mount/unmount without exhausting the kernel stack.
*/
int depth;
struct mount *mp2;
for (depth = 0, mp2 = lowermp; mp2; depth++, mp2 = mp2->mnt_lower) {
if (depth == 23)
return EINVAL;
}
#endif
if (lowermp) {
if (lowermp == dead_rootmount)
return ENOENT;
error = vfs_busy(lowermp);
if (error)
return error;
vfs_ref(lowermp);
}
oldlowermp = mp->mnt_lower;
mp->mnt_lower = lowermp;
if (lowermp)
vfs_unbusy(lowermp);
if (oldlowermp)
vfs_rele(oldlowermp);
return 0;
}
struct vnode_iterator {
vnode_impl_t vi_vnode;
};
void
vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vnip)
{
vnode_t *vp;
vnode_impl_t *vip;
vp = vnalloc_marker(mp);
vip = VNODE_TO_VIMPL(vp);
mutex_enter(mp->mnt_vnodelock);
TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vip, vi_mntvnodes);
vp->v_usecount = 1;
mutex_exit(mp->mnt_vnodelock);
*vnip = (struct vnode_iterator *)vip;
}
void
vfs_vnode_iterator_destroy(struct vnode_iterator *vni)
{
vnode_impl_t *mvip = &vni->vi_vnode;
vnode_t *mvp = VIMPL_TO_VNODE(mvip);
kmutex_t *lock;
KASSERT(vnis_marker(mvp));
if (vrefcnt(mvp) != 0) {
lock = mvp->v_mount->mnt_vnodelock;
mutex_enter(lock);
TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvip, vi_mntvnodes);
mvp->v_usecount = 0;
mutex_exit(lock);
}
vnfree_marker(mvp);
}
static struct vnode *
vfs_vnode_iterator_next1(struct vnode_iterator *vni,
bool (*f)(void *, struct vnode *), void *cl, bool do_wait)
{
vnode_impl_t *mvip = &vni->vi_vnode;
struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount;
vnode_t *vp;
vnode_impl_t *vip;
kmutex_t *lock;
int error;
KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip)));
lock = mp->mnt_vnodelock;
do {
mutex_enter(lock);
vip = TAILQ_NEXT(mvip, vi_mntvnodes);
TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes);
VIMPL_TO_VNODE(mvip)->v_usecount = 0;
again:
if (vip == NULL) {
mutex_exit(lock);
return NULL;
}
vp = VIMPL_TO_VNODE(vip);
KASSERT(vp != NULL);
mutex_enter(vp->v_interlock);
if (vnis_marker(vp) ||
vdead_check(vp, (do_wait ? 0 : VDEAD_NOWAIT)) ||
(f && !(*f)(cl, vp))) {
mutex_exit(vp->v_interlock);
vip = TAILQ_NEXT(vip, vi_mntvnodes);
goto again;
}
TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip, vi_mntvnodes);
VIMPL_TO_VNODE(mvip)->v_usecount = 1;
mutex_exit(lock);
error = vcache_vget(vp);
KASSERT(error == 0 || error == ENOENT);
} while (error != 0);
return vp;
}
struct vnode *
vfs_vnode_iterator_next(struct vnode_iterator *vni,
bool (*f)(void *, struct vnode *), void *cl)
{
return vfs_vnode_iterator_next1(vni, f, cl, false);
}
/*
* Move a vnode from one mount queue to another.
*/
void
vfs_insmntque(vnode_t *vp, struct mount *mp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
struct mount *omp;
kmutex_t *lock;
KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
vp->v_tag == VT_VFS);
/*
* Delete from old mount point vnode list, if on one.
*/
if ((omp = vp->v_mount) != NULL) {
lock = omp->mnt_vnodelock;
mutex_enter(lock);
TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vip, vi_mntvnodes);
mutex_exit(lock);
}
/*
* Insert into list of vnodes for the new mount point, if
* available. The caller must take a reference on the mount
* structure and donate it to the vnode.
*/
if ((vp->v_mount = mp) != NULL) {
lock = mp->mnt_vnodelock;
mutex_enter(lock);
TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vip, vi_mntvnodes);
mutex_exit(lock);
}
if (omp != NULL) {
/* Release reference to old mount. */
vfs_rele(omp);
}
}
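/*
* Sketch of the calling convention described above when moving a vnode
* onto a mount: the caller takes a reference on the mount and donates it
* to the vnode, while the reference on the old mount is dropped by
* vfs_insmntque() itself:
*
*	vfs_ref(mp);
*	vfs_insmntque(vp, mp);
*/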
/*
* Remove any vnodes in the vnode table belonging to mount point mp.
*
* If FORCECLOSE is not specified, there should not be any active vnodes;
* return an error if any are found (nb: this is a user error, not a
* return error if any are found (nb: this is a user error, not a
* system error). If FORCECLOSE is specified, detach any active vnodes
* that are found.
*
* If WRITECLOSE is set, only flush out regular file vnodes open for
* writing.
*
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
*/
#ifdef DEBUG
int busyprt = 0; /* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
static vnode_t *
vflushnext(struct vnode_iterator *marker, int *when)
{
if (getticks() > *when) {
yield();
*when = getticks() + hz / 10;
}
preempt_point();
return vfs_vnode_iterator_next1(marker, NULL, NULL, true);
}
/*
* Flush one vnode. Referenced on entry, unreferenced on return.
*/
static int
vflush_one(vnode_t *vp, vnode_t *skipvp, int flags)
{
int error;
struct vattr vattr;
if (vp == skipvp ||
((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))) {
vrele(vp);
return 0;
}
/*
* If WRITECLOSE is set, only flush out regular file
* vnodes open for writing or open and unlinked.
*/
if ((flags & WRITECLOSE)) {
if (vp->v_type != VREG) {
vrele(vp);
return 0;
}
error = vn_lock(vp, LK_EXCLUSIVE);
if (error) {
KASSERT(error == ENOENT);
vrele(vp);
return 0;
}
error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_WAIT, 0, 0);
if (error == 0)
error = VOP_GETATTR(vp, &vattr, curlwp->l_cred);
VOP_UNLOCK(vp);
if (error) {
vrele(vp);
return error;
}
if (vp->v_writecount == 0 && vattr.va_nlink > 0) {
vrele(vp);
return 0;
}
}
/*
* First try to recycle the vnode.
*/
if (vrecycle(vp))
return 0;
/*
* If FORCECLOSE is set, forcibly close the vnode.
* For block or character devices, revert to an
* anonymous device. For all other files, just
* kill them.
*/
if (flags & FORCECLOSE) {
if (vrefcnt(vp) > 1 &&
(vp->v_type == VBLK || vp->v_type == VCHR))
vcache_make_anon(vp);
else
vgone(vp);
return 0;
}
vrele(vp);
return EBUSY;
}
int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
vnode_t *vp;
struct vnode_iterator *marker;
int busy, error, when, retries = 2;
do {
busy = error = when = 0;
/*
* First, flush out any vnode references from the
* deferred vrele list.
*/
vrele_flush(mp);
vfs_vnode_iterator_init(mp, &marker);
while ((vp = vflushnext(marker, &when)) != NULL) {
error = vflush_one(vp, skipvp, flags);
if (error == EBUSY) {
error = 0;
busy++;
#ifdef DEBUG
if (busyprt && retries == 0)
vprint("vflush: busy vnode", vp);
#endif
} else if (error != 0) {
break;
}
}
vfs_vnode_iterator_destroy(marker);
} while (error == 0 && busy > 0 && retries-- > 0);
if (error)
return error;
if (busy)
return EBUSY;
return 0;
}
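/*
* Sketch of a hypothetical caller: a file system's unmount path would
* typically flush its vnodes along these lines, requesting FORCECLOSE
* only for forced unmounts (names and error handling are illustrative):
*
*	int flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
*
*	error = vflush(mp, NULLVP, flags);
*	if (error)
*		return error;
*/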
/*
* Mount a file system.
*/
/*
* Scan all active processes to see if any of them have a current or root
* directory onto which the new filesystem has just been mounted. If so,
* replace them with the new mount point.
*/
static void
mount_checkdirs(vnode_t *olddp)
{
vnode_t *newdp, *rele1, *rele2;
struct cwdinfo *cwdi;
struct proc *p;
bool retry;
if (vrefcnt(olddp) == 1) {
return;
}
if (VFS_ROOT(olddp->v_mountedhere, LK_EXCLUSIVE, &newdp))
panic("mount: lost mount");
do {
retry = false;
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
if ((cwdi = p->p_cwdi) == NULL)
continue;
/*
* Cannot change to the old directory any more,
* so even if we see a stale value it is not a
* problem.
*/
if (cwdi->cwdi_cdir != olddp &&
cwdi->cwdi_rdir != olddp)
continue;
retry = true;
rele1 = NULL;
rele2 = NULL;
atomic_inc_uint(&cwdi->cwdi_refcnt);
mutex_exit(&proc_lock);
rw_enter(&cwdi->cwdi_lock, RW_WRITER);
if (cwdi->cwdi_cdir == olddp) {
rele1 = cwdi->cwdi_cdir;
vref(newdp);
cwdi->cwdi_cdir = newdp;
}
if (cwdi->cwdi_rdir == olddp) {
rele2 = cwdi->cwdi_rdir;
vref(newdp);
cwdi->cwdi_rdir = newdp;
}
rw_exit(&cwdi->cwdi_lock);
cwdfree(cwdi);
if (rele1 != NULL)
vrele(rele1);
if (rele2 != NULL)
vrele(rele2);
mutex_enter(&proc_lock);
break;
}
mutex_exit(&proc_lock);
} while (retry);
if (rootvnode == olddp) {
vrele(rootvnode);
vref(newdp);
rootvnode = newdp;
}
vput(newdp);
}
/*
* Start extended attributes
*/
static int
start_extattr(struct mount *mp)
{
int error;
error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
if (error)
printf("%s: failed to start extattr: error = %d\n",
mp->mnt_stat.f_mntonname, error);
return error;
}
int
mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
const char *path, int flags, void *data, size_t *data_len)
{
vnode_t *vp = *vpp;
struct mount *mp;
struct pathbuf *pb;
struct nameidata nd;
int error, error2;
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
if (error) {
vfs_delref(vfsops);
return error;
}
/* Cannot make a non-dir a mount-point (from here anyway). */
if (vp->v_type != VDIR) {
vfs_delref(vfsops);
return ENOTDIR;
}
if (flags & MNT_EXPORTED) {
vfs_delref(vfsops);
return EINVAL;
}
if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
vfs_delref(vfsops);
return ENOMEM;
}
mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
/*
* The underlying file system may refuse the mount for
* various reasons. Allow the user to force it to happen.
*
* Set the mount level flags.
*/
mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);
error = VFS_MOUNT(mp, path, data, data_len);
mp->mnt_flag &= ~MNT_OP_FLAGS;
if (error != 0) {
vfs_rele(mp);
return error;
}
/* Suspend new file system before taking mnt_updating. */
do {
error2 = vfs_suspend(mp, 0);
} while (error2 == EINTR || error2 == ERESTART);
KASSERT(error2 == 0 || error2 == EOPNOTSUPP);
mutex_enter(mp->mnt_updating);
/*
* Validate and prepare the mount point.
*/
error = pathbuf_copyin(path, &pb);
if (error != 0) {
goto err_mounted;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
error = namei(&nd);
pathbuf_destroy(pb);
if (error != 0) {
goto err_mounted;
}
if (nd.ni_vp != vp) {
vput(nd.ni_vp);
error = EINVAL;
goto err_mounted;
}
if (vp->v_mountedhere != NULL) {
vput(nd.ni_vp);
error = EBUSY;
goto err_mounted;
}
error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
if (error != 0) {
vput(nd.ni_vp);
goto err_mounted;
}
/*
* Put the new filesystem on the mount list after root.
*/
cache_purge(vp);
mp->mnt_iflag &= ~IMNT_WANTRDWR;
mountlist_append(mp);
if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
vfs_syncer_add_to_worklist(mp);
vp->v_mountedhere = mp;
vput(nd.ni_vp);
mount_checkdirs(vp);
mutex_exit(mp->mnt_updating);
if (error2 == 0)
vfs_resume(mp);
/* Hold an additional reference to the mount across VFS_START(). */
vfs_ref(mp);
(void) VFS_STATVFS(mp, &mp->mnt_stat);
error = VFS_START(mp, 0);
if (error) {
vrele(vp);
} else if (flags & MNT_EXTATTR) {
if (start_extattr(mp) != 0)
mp->mnt_flag &= ~MNT_EXTATTR;
}
/* Drop reference held for VFS_START(). */
vfs_rele(mp);
*vpp = NULL;
return error;
err_mounted:
if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
panic("Unmounting fresh file system failed");
mutex_exit(mp->mnt_updating);
if (error2 == 0)
vfs_resume(mp);
vfs_set_lowermount(mp, NULL);
vfs_rele(mp);
return error;
}
/*
* Do the actual file system unmount. File system is assumed to have
* been locked by the caller.
*
* => The caller holds a reference to the mount, explicitly for dounmount().
*/
int
dounmount(struct mount *mp, int flags, struct lwp *l)
{
struct vnode *coveredvp, *vp;
struct vnode_impl *vip;
int error, async, used_syncer, used_extattr;
const bool was_suspended = fstrans_is_owner(mp);
#if NVERIEXEC > 0
error = veriexec_unmountchk(mp);
if (error)
return (error);
#endif /* NVERIEXEC > 0 */
if (!was_suspended) {
error = vfs_suspend(mp, 0);
if (error) {
return error;
}
}
KASSERT((mp->mnt_iflag & IMNT_GONE) == 0);
used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
used_extattr = mp->mnt_flag & MNT_EXTATTR;
mp->mnt_iflag |= IMNT_UNMOUNT;
mutex_enter(mp->mnt_updating);
async = mp->mnt_flag & MNT_ASYNC;
mp->mnt_flag &= ~MNT_ASYNC;
cache_purgevfs(mp); /* remove cache entries for this file sys */
if (used_syncer)
vfs_syncer_remove_from_worklist(mp);
error = 0;
if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
}
if (error == 0 || (flags & MNT_FORCE)) {
error = VFS_UNMOUNT(mp, flags);
}
if (error) {
mp->mnt_iflag &= ~IMNT_UNMOUNT;
if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
vfs_syncer_add_to_worklist(mp);
mp->mnt_flag |= async;
mutex_exit(mp->mnt_updating);
if (!was_suspended)
vfs_resume(mp);
if (used_extattr) {
if (start_extattr(mp) != 0)
mp->mnt_flag &= ~MNT_EXTATTR;
else
mp->mnt_flag |= MNT_EXTATTR;
}
return (error);
}
mutex_exit(mp->mnt_updating);
/*
* Mark the file system as gone to prevent further unmount attempts
* once the mnt_umounting lock is released; this also prevents
* vfs_busy() from succeeding.
*/
mp->mnt_iflag |= IMNT_GONE;
if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
coveredvp->v_mountedhere = NULL;
}
if (!was_suspended)
vfs_resume(mp);
mountlist_remove(mp);
if ((vip = TAILQ_FIRST(&mp->mnt_vnodelist)) != NULL) {
vp = VIMPL_TO_VNODE(vip);
vprint("dangling", vp);
panic("unmount: dangling vnode");
}
vfs_hooks_unmount(mp);
vfs_set_lowermount(mp, NULL);
vfs_rele(mp); /* reference from mount() */
if (coveredvp != NULLVP) {
vrele(coveredvp);
}
return (0);
}
/*
* Unmount all file systems.
* We traverse the list in reverse order under the assumption that doing so
* will avoid needing to worry about dependencies.
*/
bool
vfs_unmountall(struct lwp *l)
{
printf("unmounting file systems...\n");
return vfs_unmountall1(l, true, true);
}
static void
vfs_unmount_print(struct mount *mp, const char *pfx)
{
aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_fstypename);
}
/*
* Return the mount with the highest generation less than "gen".
*/
static struct mount *
vfs_unmount_next(uint64_t gen)
{
mount_iterator_t *iter;
struct mount *mp, *nmp;
nmp = NULL;
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_next(iter)) != NULL) {
if ((nmp == NULL || mp->mnt_gen > nmp->mnt_gen) &&
mp->mnt_gen < gen) {
if (nmp != NULL)
vfs_rele(nmp);
nmp = mp;
vfs_ref(nmp);
}
}
mountlist_iterator_destroy(iter);
return nmp;
}
bool
vfs_unmount_forceone(struct lwp *l)
{
struct mount *mp;
int error;
mp = vfs_unmount_next(mountgen);
if (mp == NULL) {
return false;
}
#ifdef DEBUG
printf("forcefully unmounting %s (%s)...\n",
mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
if ((error = dounmount(mp, MNT_FORCE, l)) == 0) {
vfs_unmount_print(mp, "forcefully ");
return true;
} else {
vfs_rele(mp);
}
#ifdef DEBUG
printf("forceful unmount of %s failed with error %d\n",
mp->mnt_stat.f_mntonname, error);
#endif
return false;
}
bool
vfs_unmountall1(struct lwp *l, bool force, bool verbose)
{
struct mount *mp;
mount_iterator_t *iter;
bool any_error = false, progress = false;
uint64_t gen;
int error;
gen = mountgen;
for (;;) {
mp = vfs_unmount_next(gen);
if (mp == NULL)
break;
gen = mp->mnt_gen;
#ifdef DEBUG
printf("unmounting %p %s (%s)...\n",
(void *)mp, mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_mntfromname);
#endif
if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
vfs_unmount_print(mp, "");
progress = true;
} else {
vfs_rele(mp);
if (verbose) {
printf("unmount of %s failed with error %d\n",
mp->mnt_stat.f_mntonname, error);
}
any_error = true;
}
}
if (verbose) {
printf("unmounting done\n");
}
if (any_error && verbose) {
printf("WARNING: some file systems would not unmount\n");
}
/* If the mountlist is empty it is time to remove swap. */
mountlist_iterator_init(&iter);
if (mountlist_iterator_next(iter) == NULL) {
uvm_swap_shutdown(l);
}
mountlist_iterator_destroy(iter);
return progress;
}
void
vfs_sync_all(struct lwp *l)
{
printf("syncing disks... ");
/* remove user processes from run queue */
suspendsched();
(void)spl0();
/* avoid coming back this way again if we panic. */
doing_shutdown = 1;
do_sys_sync(l);
/* Wait for sync to finish. */
if (vfs_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
Debugger();
#endif
printf("giving up\n");
return;
} else
printf("done\n");
}
/*
* Sync and unmount file systems before shutting down.
*/
void
vfs_shutdown(void)
{
lwp_t *l = curlwp;
vfs_sync_all(l);
/*
* If we have panicked - do not make the situation potentially
* worse by unmounting the file systems.
*/
if (panicstr != NULL) {
return;
}
/* Unmount file systems. */
vfs_unmountall(l);
}
/*
* Print a list of supported file system types (used by vfs_mountroot)
*/
static void
vfs_print_fstypes(void)
{
struct vfsops *v;
int cnt = 0;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list)
++cnt;
mutex_exit(&vfs_list_lock);
if (cnt == 0) {
printf("WARNING: No file system modules have been loaded.\n");
return;
}
printf("Supported file systems:");
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
printf(" %s", v->vfs_name);
}
mutex_exit(&vfs_list_lock);
printf("\n");
}
/*
* Mount the root file system. If the operator didn't specify a
* file system to use, try all possible file systems until one
* succeeds.
*/
int
vfs_mountroot(void)
{
struct vfsops *v;
int error = ENODEV;
if (root_device == NULL)
panic("vfs_mountroot: root device unknown");
switch (device_class(root_device)) {
case DV_IFNET:
if (rootdev != NODEV)
panic("vfs_mountroot: rootdev set for DV_IFNET "
"(0x%llx -> %llu,%llu)",
(unsigned long long)rootdev,
(unsigned long long)major(rootdev),
(unsigned long long)minor(rootdev));
break;
case DV_DISK:
if (rootdev == NODEV)
panic("vfs_mountroot: rootdev not set for DV_DISK");
if (bdevvp(rootdev, &rootvp))
panic("vfs_mountroot: can't get vnode for rootdev");
vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(rootvp, FREAD, FSCRED);
VOP_UNLOCK(rootvp);
if (error) {
printf("vfs_mountroot: can't open root device\n");
return (error);
}
break;
case DV_VIRTUAL:
break;
default:
printf("%s: inappropriate for root file system\n",
device_xname(root_device));
return (ENODEV);
}
/*
* If user specified a root fs type, use it. Make sure the
* specified type exists and has a mount_root()
*/
if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
v = vfs_getopsbyname(rootfstype);
error = EFTYPE;
if (v != NULL) {
if (v->vfs_mountroot != NULL) {
error = (v->vfs_mountroot)();
}
v->vfs_refcount--;
}
goto done;
}
/*
* Try each file system currently configured into the kernel.
*/
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (v->vfs_mountroot == NULL)
continue;
#ifdef DEBUG
aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
error = (*v->vfs_mountroot)();
mutex_enter(&vfs_list_lock);
v->vfs_refcount--;
if (!error) {
aprint_normal("root file system type: %s\n",
v->vfs_name);
break;
}
}
mutex_exit(&vfs_list_lock);
if (v == NULL) {
vfs_print_fstypes();
printf("no file system for %s", device_xname(root_device));
if (device_class(root_device) == DV_DISK)
printf(" (dev 0x%llx)", (unsigned long long)rootdev);
printf("\n");
error = EFTYPE;
}
done:
if (error && device_class(root_device) == DV_DISK) {
vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(rootvp, FREAD, FSCRED);
VOP_UNLOCK(rootvp);
vrele(rootvp);
}
if (error == 0) {
mount_iterator_t *iter;
struct mount *mp;
mountlist_iterator_init(&iter);
mp = mountlist_iterator_next(iter);
KASSERT(mp != NULL);
mountlist_iterator_destroy(iter);
mp->mnt_flag |= MNT_ROOTFS;
mp->mnt_op->vfs_refcount++;
/*
* Get the vnode for '/'. Set cwdi0.cwdi_cdir to
* reference it, and donate it the reference grabbed
* with VFS_ROOT().
*/
error = VFS_ROOT(mp, LK_NONE, &rootvnode);
if (error)
panic("cannot find root vnode, error=%d", error);
cwdi0.cwdi_cdir = rootvnode;
cwdi0.cwdi_rdir = NULL;
/*
* Now that root is mounted, we can fixup initproc's CWD
* info. All other processes are kthreads, which merely
* share proc0's CWD info.
*/
initproc->p_cwdi->cwdi_cdir = rootvnode;
vref(initproc->p_cwdi->cwdi_cdir);
initproc->p_cwdi->cwdi_rdir = NULL;
/*
* Enable loading of modules from the filesystem
*/
module_load_vfs_init();
}
return (error);
}
/*
* mount_specific_key_create --
* Create a key for subsystem mount-specific data.
*/
int
mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
}
/*
* mount_specific_key_delete --
* Delete a key for subsystem mount-specific data.
*/
void
mount_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(mount_specificdata_domain, key);
}
/*
* mount_initspecific --
* Initialize a mount's specificdata container.
*/
void
mount_initspecific(struct mount *mp)
{
int error __diagused;
error = specificdata_init(mount_specificdata_domain,
&mp->mnt_specdataref);
KASSERT(error == 0);
}
/*
* mount_finispecific --
* Finalize a mount's specificdata container.
*/
void
mount_finispecific(struct mount *mp)
{
specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
}
/*
* mount_getspecific --
* Return mount-specific data corresponding to the specified key.
*/
void *
mount_getspecific(struct mount *mp, specificdata_key_t key)
{
return specificdata_getspecific(mount_specificdata_domain,
&mp->mnt_specdataref, key);
}
/*
* mount_setspecific --
* Set mount-specific data corresponding to the specified key.
*/
void
mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
{
specificdata_setspecific(mount_specificdata_domain,
&mp->mnt_specdataref, key, data);
}
/*
* Check to see if a filesystem is mounted on a block device.
*/
int
vfs_mountedon(vnode_t *vp)
{
vnode_t *vq;
int error = 0;
if (vp->v_type != VBLK)
return ENOTBLK;
if (spec_node_getmountedfs(vp) != NULL)
return EBUSY;
if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, VDEAD_NOWAIT, &vq)
== 0) {
if (spec_node_getmountedfs(vq) != NULL)
error = EBUSY;
vrele(vq);
}
return error;
}
/*
* Check if a device pointed to by vp is mounted.
*
* Returns:
* EINVAL if it's not a disk
* EBUSY if it's a disk and mounted
* 0 if it's a disk and not mounted
*/
int
rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
{
vnode_t *bvp;
dev_t dev;
int d_type;
bvp = NULL;
d_type = D_OTHER;
if (iskmemvp(vp))
return EINVAL;
switch (vp->v_type) {
case VCHR: {
const struct cdevsw *cdev;
dev = vp->v_rdev;
cdev = cdevsw_lookup(dev);
if (cdev != NULL) {
dev_t blkdev;
blkdev = devsw_chr2blk(dev);
if (blkdev != NODEV) {
if (vfinddev(blkdev, VBLK, &bvp) != 0) {
d_type = (cdev->d_flag & D_TYPEMASK);
/* XXX: what if bvp disappears? */
vrele(bvp);
}
}
}
break;
}
case VBLK: {
const struct bdevsw *bdev;
dev = vp->v_rdev;
bdev = bdevsw_lookup(dev);
if (bdev != NULL)
d_type = (bdev->d_flag & D_TYPEMASK);
bvp = vp;
break;
}
default:
break;
}
if (d_type != D_DISK)
return EINVAL;
if (bvpp != NULL)
*bvpp = bvp;
/*
* XXX: This is bogus. We should be failing the request
* XXX: not only if this specific slice is mounted, but
* XXX: if it's on a disk with any other mounted slice.
*/
if (vfs_mountedon(bvp))
return EBUSY;
return 0;
}
/*
* Make a 'unique' number from a mount type name.
*/
long
makefstype(const char *type)
{
long rv;
for (rv = 0; *type; type++) {
rv <<= 2;
rv ^= *type;
}
return rv;
}
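/*
* Worked example: each character shifts the accumulator left by two bits
* and is XORed in, so for e.g. the type name "ffs" ('f' == 0x66,
* 's' == 0x73) the result is (((0x66 << 2) ^ 0x66) << 2) ^ 0x73 == 0x78b.
* Distinct names are not guaranteed to map to distinct values.
*/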
static struct mountlist_entry *
mountlist_alloc(enum mountlist_type type, struct mount *mp)
{
struct mountlist_entry *me;
me = kmem_zalloc(sizeof(*me), KM_SLEEP);
me->me_mount = mp;
me->me_type = type;
return me;
}
static void
mountlist_free(struct mountlist_entry *me)
{
kmem_free(me, sizeof(*me));
}
void
mountlist_iterator_init(mount_iterator_t **mip)
{
struct mountlist_entry *me;
me = mountlist_alloc(ME_MARKER, NULL);
mutex_enter(&mountlist_lock);
TAILQ_INSERT_HEAD(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
*mip = (mount_iterator_t *)me;
}
void
mountlist_iterator_destroy(mount_iterator_t *mi)
{
struct mountlist_entry *marker = &mi->mi_entry;
if (marker->me_mount != NULL)
vfs_unbusy(marker->me_mount);
mutex_enter(&mountlist_lock);
TAILQ_REMOVE(&mountlist, marker, me_list);
mutex_exit(&mountlist_lock);
mountlist_free(marker);
}
/*
* Return the next mount or NULL for this iterator.
* Mark it busy on success.
*/
static inline struct mount *
_mountlist_iterator_next(mount_iterator_t *mi, bool wait)
{
struct mountlist_entry *me, *marker = &mi->mi_entry;
struct mount *mp;
int error;
if (marker->me_mount != NULL) {
vfs_unbusy(marker->me_mount);
marker->me_mount = NULL;
}
mutex_enter(&mountlist_lock);
for (;;) {
KASSERT(marker->me_type == ME_MARKER);
me = TAILQ_NEXT(marker, me_list);
if (me == NULL) {
/* End of list: keep marker and return. */
mutex_exit(&mountlist_lock);
return NULL;
}
TAILQ_REMOVE(&mountlist, marker, me_list);
TAILQ_INSERT_AFTER(&mountlist, me, marker, me_list);
/* Skip other markers. */
if (me->me_type != ME_MOUNT)
continue;
/* Take an initial reference for vfs_busy() below. */
mp = me->me_mount;
KASSERT(mp != NULL);
vfs_ref(mp);
mutex_exit(&mountlist_lock);
/* Try to mark this mount busy and return on success. */
if (wait)
error = vfs_busy(mp);
else
error = vfs_trybusy(mp);
if (error == 0) {
vfs_rele(mp);
marker->me_mount = mp;
return mp;
}
vfs_rele(mp);
mutex_enter(&mountlist_lock);
}
}
struct mount *
mountlist_iterator_next(mount_iterator_t *mi)
{
return _mountlist_iterator_next(mi, true);
}
struct mount *
mountlist_iterator_trynext(mount_iterator_t *mi)
{
return _mountlist_iterator_next(mi, false);
}
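/*
* Sketch of the iterator protocol, mirroring its use in vfs_getvfs()
* above: every mount returned is already marked busy and is unbusied
* when the next mount is fetched or the iterator is destroyed.
*
*	mount_iterator_t *iter;
*	struct mount *mp;
*
*	mountlist_iterator_init(&iter);
*	while ((mp = mountlist_iterator_next(iter)) != NULL) {
*		// ... examine mp; it cannot be unmounted here ...
*	}
*	mountlist_iterator_destroy(iter);
*/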
/*
* Attach new mount to the end of the mount list.
*/
void
mountlist_append(struct mount *mp)
{
struct mountlist_entry *me;
me = mountlist_alloc(ME_MOUNT, mp);
mutex_enter(&mountlist_lock);
TAILQ_INSERT_TAIL(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
}
/*
* Remove mount from mount list.
*/
void
mountlist_remove(struct mount *mp)
{
struct mountlist_entry *me;
mutex_enter(&mountlist_lock);
TAILQ_FOREACH(me, &mountlist, me_list)
if (me->me_type == ME_MOUNT && me->me_mount == mp)
break;
KASSERT(me != NULL);
TAILQ_REMOVE(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
mountlist_free(me);
}
/*
* Unlocked variant to traverse the mountlist.
* To be used from DDB only.
*/
struct mount *
_mountlist_next(struct mount *mp)
{
struct mountlist_entry *me;
if (mp == NULL) {
me = TAILQ_FIRST(&mountlist);
} else {
TAILQ_FOREACH(me, &mountlist, me_list)
if (me->me_type == ME_MOUNT && me->me_mount == mp)
break;
if (me != NULL)
me = TAILQ_NEXT(me, me_list);
}
while (me != NULL && me->me_type != ME_MOUNT)
me = TAILQ_NEXT(me, me_list);
return (me ? me->me_mount : NULL);
}
/* $NetBSD: wsmux.c,v 1.66 2022/03/28 12:38:58 riastradh Exp $ */
/*
* Copyright (c) 1998, 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Author: Lennart Augustsson <lennart@augustsson.net>
* Carlstedt Research & Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* wscons mux device.
*
* The mux device is a collection of real mice and keyboards and acts as
* a merge point for all the events from the different real devices.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: wsmux.c,v 1.66 2022/03/28 12:38:58 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_modular.h"
#endif
#include "wsdisplay.h"
#include "wsmux.h"
#include "wskbd.h"
#include "wsmouse.h"
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/device.h>
#include <sys/device_impl.h> /* XXX autoconf abuse */
#include "opt_wsdisplay_compat.h"
#include <dev/wscons/wsconsio.h>
#include <dev/wscons/wsksymdef.h>
#include <dev/wscons/wseventvar.h>
#include <dev/wscons/wscons_callbacks.h>
#include <dev/wscons/wsmuxvar.h>
#include "ioconf.h"
#ifdef WSMUX_DEBUG
#define DPRINTF(x) if (wsmuxdebug) printf x
#define DPRINTFN(n,x) if (wsmuxdebug > (n)) printf x
int wsmuxdebug = 0;
#else
#define DPRINTF(x)
#define DPRINTFN(n,x)
#endif
/*
* The wsmux pseudo device is used to multiplex events from several wsmouse,
* wskbd, and/or wsmux devices together.
* The devices connected together form a tree with muxes in the interior
* and real devices (mouse and kbd) at the leaves. The special case of
* a tree with one node (mux or other) is supported as well.
* Only the device at the root of the tree can be opened (if a non-root
* device is opened the subtree rooted at that point is severed from the
* containing tree). When the root is opened it allocates a wseventvar
* struct to which all the nodes in the tree send their events.
* An ioctl() performed on the root is propagated to all the nodes.
* There are also ioctl() operations to add and remove nodes from a tree.
*/
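/*
* Sketch of how such a tree is built from userland (the device path and
* error handling are illustrative assumptions): the root mux is opened
* and children are attached by type and unit with WSMUXIO_ADD_DEVICE:
*
*	struct wsmux_device d = { .type = WSMUX_KBD, .idx = 0 };
*	int fd = open("/dev/wsmux0", O_RDWR);
*	if (fd != -1 && ioctl(fd, WSMUXIO_ADD_DEVICE, &d) == -1)
*		warn("WSMUXIO_ADD_DEVICE");
*
* WSMUXIO_REMOVE_DEVICE takes the same struct wsmux_device argument;
* WSMUXIO_LIST_DEVICES fills in a struct wsmux_device_list instead (see
* wsmux_do_ioctl() below).
*/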
static int wsmux_mux_open(struct wsevsrc *, struct wseventvar *);
static int wsmux_mux_close(struct wsevsrc *);
static void wsmux_do_open(struct wsmux_softc *, struct wseventvar *);
static void wsmux_do_close(struct wsmux_softc *);
#if NWSDISPLAY > 0
static int wsmux_evsrc_set_display(device_t, struct wsevsrc *);
#else
#define wsmux_evsrc_set_display NULL
#endif
static int wsmux_do_displayioctl(device_t dev, u_long cmd,
void *data, int flag, struct lwp *l);
static int wsmux_do_ioctl(device_t, u_long, void *,int,struct lwp *);
static int wsmux_add_mux(int, struct wsmux_softc *);
#define WSMUXDEV(n) ((n) & 0x7f)
#define WSMUXCTL(n) ((n) & 0x80)
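/*
* Example: the low seven bits of the minor number select the mux unit
* and bit 0x80 selects the control device, so a minor of 0x85 refers to
* unit 5 opened through its control node.
*/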
dev_type_open(wsmuxopen);
dev_type_close(wsmuxclose);
dev_type_read(wsmuxread);
dev_type_ioctl(wsmuxioctl);
dev_type_poll(wsmuxpoll);
dev_type_kqfilter(wsmuxkqfilter);
const struct cdevsw wsmux_cdevsw = {
.d_open = wsmuxopen,
.d_close = wsmuxclose,
.d_read = wsmuxread,
.d_write = nowrite,
.d_ioctl = wsmuxioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = wsmuxpoll,
.d_mmap = nommap,
.d_kqfilter = wsmuxkqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER
};
struct wssrcops wsmux_srcops = {
WSMUX_MUX,
wsmux_mux_open, wsmux_mux_close, wsmux_do_ioctl, wsmux_do_displayioctl,
wsmux_evsrc_set_display
};
/* From upper level */
void
wsmuxattach(int n)
{
}
/* Keep track of all muxes that have been allocated */
static struct wsmux_softc **wsmuxdevs = NULL;
static int nwsmux = 0;
/* Return mux n, create if necessary */
struct wsmux_softc *
wsmux_getmux(int n)
{
struct wsmux_softc *sc;
n = WSMUXDEV(n); /* limit range */
/* Make sure there is room for mux n in the table */
if (n >= nwsmux) {
void *new;
new = realloc(wsmuxdevs, (n + 1) * sizeof(*wsmuxdevs),
M_DEVBUF, M_ZERO | M_WAITOK);
wsmuxdevs = new;
nwsmux = n + 1;
}
sc = wsmuxdevs[n];
if (sc == NULL) {
sc = wsmux_create("wsmux", n);
wsmuxdevs[n] = sc;
}
return (sc);
}
/*
* open() of the pseudo device from device table.
*/
int
wsmuxopen(dev_t dev, int flags, int mode, struct lwp *l)
{
struct wsmux_softc *sc;
struct wseventvar *evar;
int minr, unit;
minr = minor(dev);
unit = WSMUXDEV(minr);
sc = wsmux_getmux(unit);
if (sc == NULL)
return (ENXIO);
DPRINTF(("wsmuxopen: %s: sc=%p l=%p\n",
device_xname(sc->sc_base.me_dv), sc, l));
if (WSMUXCTL(minr)) {
/* This is the control device which does not allow reads. */
if (flags & FREAD)
return (EINVAL);
return (0);
}
if ((flags & (FREAD | FWRITE)) == FWRITE)
/* Allow write only open */
return (0);
if (sc->sc_base.me_parent != NULL) {
/* Grab the mux out of the greedy hands of the parent mux. */
DPRINTF(("wsmuxopen: detach\n"));
wsmux_detach_sc(&sc->sc_base);
}
if (sc->sc_base.me_evp != NULL)
/* Already open. */
return (EBUSY);
evar = &sc->sc_base.me_evar;
wsevent_init(evar, l->l_proc);
#ifdef WSDISPLAY_COMPAT_RAWKBD
sc->sc_rawkbd = 0;
#endif
wsmux_do_open(sc, evar);
return (0);
}
/*
* Open of a mux via the parent mux.
*/
int
wsmux_mux_open(struct wsevsrc *me, struct wseventvar *evar)
{
struct wsmux_softc *sc = (struct wsmux_softc *)me;
#ifdef DIAGNOSTIC
if (sc->sc_base.me_evp != NULL) {
printf("wsmux_mux_open: busy\n");
return (EBUSY);
}
if (sc->sc_base.me_parent == NULL) {
printf("wsmux_mux_open: no parent\n");
return (EINVAL);
}
#endif
wsmux_do_open(sc, evar);
return (0);
}
/* Common part of opening a mux. */
void
wsmux_do_open(struct wsmux_softc *sc, struct wseventvar *evar)
{
struct wsevsrc *me;
sc->sc_base.me_evp = evar; /* remember event variable, mark as open */
/* Open all children. */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmuxopen: %s: m=%p dev=%s\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv)));
#ifdef DIAGNOSTIC
if (me->me_evp != NULL) {
printf("wsmuxopen: dev already in use\n");
continue;
}
if (me->me_parent != sc) {
printf("wsmux_do_open: bad child=%p\n", me);
continue;
}
{
int error = wsevsrc_open(me, evar);
if (error) {
DPRINTF(("wsmuxopen: open failed %d\n", error));
}
}
#else
/* ignore errors, failing children will not be marked open */
(void)wsevsrc_open(me, evar);
#endif
}
}
/*
* close() of the pseudo device from device table.
*/
int
wsmuxclose(dev_t dev, int flags, int mode,
struct lwp *l)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
struct wseventvar *evar = sc->sc_base.me_evp;
if (WSMUXCTL(minr))
/* control device */
return (0);
if (evar == NULL)
/* Not open for read */
return (0);
wsmux_do_close(sc);
sc->sc_base.me_evp = NULL;
wsevent_fini(evar);
return (0);
}
/*
* Close of a mux via the parent mux.
*/
int
wsmux_mux_close(struct wsevsrc *me)
{
me->me_evp = NULL;
wsmux_do_close((struct wsmux_softc *)me);
return (0);
}
/* Common part of closing a mux. */
void
wsmux_do_close(struct wsmux_softc *sc)
{
struct wsevsrc *me;
DPRINTF(("wsmuxclose: %s: sc=%p\n",
device_xname(sc->sc_base.me_dv), sc));
/* Close all the children. */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmuxclose %s: m=%p dev=%s\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv)));
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmuxclose: bad child=%p\n", me);
continue;
}
#endif
(void)wsevsrc_close(me);
me->me_evp = NULL;
}
}
/*
* read() of the pseudo device from device table.
*/
int
wsmuxread(dev_t dev, struct uio *uio, int flags)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
struct wseventvar *evar;
int error;
if (WSMUXCTL(minr)) {
/* control device */
return (EINVAL);
}
evar = sc->sc_base.me_evp;
if (evar == NULL) {
#ifdef DIAGNOSTIC
/* XXX can we get here? */
printf("wsmuxread: not open\n");
#endif
return (EINVAL);
}
DPRINTFN(5,("wsmuxread: %s event read evar=%p\n",
device_xname(sc->sc_base.me_dv), evar));
error = wsevent_read(evar, uio, flags);
DPRINTFN(5,("wsmuxread: %s event read ==> error=%d\n",
device_xname(sc->sc_base.me_dv), error));
return (error);
}
/*
* ioctl of the pseudo device from device table.
*/
int
wsmuxioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
int u = WSMUXDEV(minor(dev));
return wsmux_do_ioctl(wsmuxdevs[u]->sc_base.me_dv, cmd, data, flag, l);
}
/*
* ioctl of a mux via the parent mux, continuation of wsmuxioctl().
*/
int
wsmux_do_ioctl(device_t dv, u_long cmd, void *data, int flag,
struct lwp *lwp)
{
struct wsmux_softc *sc = device_private(dv);
struct wsevsrc *me;
int error, ok;
int s, n;
struct wseventvar *evar;
struct wscons_event event;
struct wsmux_device_list *l;
DPRINTF(("wsmux_do_ioctl: %s: enter sc=%p, cmd=%08lx\n",
device_xname(sc->sc_base.me_dv), sc, cmd));
switch (cmd) {
#if defined(COMPAT_50) || defined(MODULAR)
case WSMUXIO_OINJECTEVENT:
#endif /* defined(COMPAT_50) || defined(MODULAR) */
case WSMUXIO_INJECTEVENT:
/* Inject an event, e.g., from moused. */
DPRINTF(("%s: inject\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL) {
/* No event sink, so ignore it. */
DPRINTF(("wsmux_do_ioctl: event ignored\n"));
return (0);
}
s = spltty();
event.type = ((struct wscons_event *)data)->type;
event.value = ((struct wscons_event *)data)->value;
error = wsevent_inject(evar, &event, 1);
splx(s);
return error;
case WSMUXIO_ADD_DEVICE:
#define d ((struct wsmux_device *)data)
DPRINTF(("%s: add type=%d, no=%d\n",
device_xname(sc->sc_base.me_dv), d->type, d->idx));
switch (d->type) {
#if NWSMOUSE > 0
case WSMUX_MOUSE:
return (wsmouse_add_mux(d->idx, sc));
#endif
#if NWSKBD > 0
case WSMUX_KBD:
return (wskbd_add_mux(d->idx, sc));
#endif
case WSMUX_MUX:
return (wsmux_add_mux(d->idx, sc));
case WSMUX_BELL:
return (wsbell_add_mux(d->idx, sc));
default:
return (EINVAL);
}
case WSMUXIO_REMOVE_DEVICE:
DPRINTF(("%s: rem type=%d, no=%d\n",
device_xname(sc->sc_base.me_dv), d->type, d->idx));
/* Locate the device */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
if (me->me_ops->type == d->type &&
device_unit(me->me_dv) == d->idx) {
DPRINTF(("wsmux_do_ioctl: detach\n"));
wsmux_detach_sc(me);
return (0);
}
}
return (EINVAL);
#undef d
case WSMUXIO_LIST_DEVICES:
DPRINTF(("%s: list\n", device_xname(sc->sc_base.me_dv)));
l = (struct wsmux_device_list *)data;
n = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
if (n >= WSMUX_MAXDEV)
break;
l->devices[n].type = me->me_ops->type;
l->devices[n].idx = device_unit(me->me_dv);
n++;
}
l->ndevices = n;
return (0);
#ifdef WSDISPLAY_COMPAT_RAWKBD
case WSKBDIO_SETMODE:
sc->sc_rawkbd = *(int *)data;
DPRINTF(("wsmux_do_ioctl: save rawkbd = %d\n", sc->sc_rawkbd));
break;
#endif
case WSKBDIO_SETVERSION:
case WSMOUSEIO_SETVERSION:
case WSDISPLAYIO_SETVERSION:
DPRINTF(("%s: WSxxxIO_SETVERSION\n",
device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
return wsevent_setversion(evar, *(int *)data);
case FIONBIO:
DPRINTF(("%s: FIONBIO\n", device_xname(sc->sc_base.me_dv)));
return (0);
case FIOASYNC:
DPRINTF(("%s: FIOASYNC\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
evar->async = *(int *)data != 0;
return (0);
case FIOSETOWN:
DPRINTF(("%s: FIOSETOWN\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
if (-*(int *)data != evar->io->p_pgid
&& *(int *)data != evar->io->p_pid)
return (EPERM);
return (0);
case TIOCSPGRP:
DPRINTF(("%s: TIOCSPGRP\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
if (*(int *)data != evar->io->p_pgid)
return (EPERM);
return (0);
default:
DPRINTF(("%s: unknown\n", device_xname(sc->sc_base.me_dv)));
break;
}
if (sc->sc_base.me_evp == NULL
#if NWSDISPLAY > 0
&& sc->sc_base.me_dispdv == NULL
#endif
)
return (EACCES);
/* Return 0 if any of the ioctl() succeeds, otherwise the last error */
error = 0;
ok = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
#ifdef DIAGNOSTIC
/* XXX check evp? */
if (me->me_parent != sc) {
printf("wsmux_do_ioctl: bad child %p\n", me);
continue;
}
#endif
error = wsevsrc_ioctl(me, cmd, data, flag, lwp);
DPRINTF(("wsmux_do_ioctl: %s: me=%p dev=%s ==> %d\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv), error));
if (!error)
ok = 1;
}
if (ok) {
error = 0;
if (cmd == WSKBDIO_SETENCODING) {
sc->sc_kbd_layout = *((kbd_t *)data);
}
}
return (error);
}
/*
* poll() of the pseudo device from device table.
*/
int
wsmuxpoll(dev_t dev, int events, struct lwp *l)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
if (WSMUXCTL(minr)) {
/* control device */
return (0);
}
if (sc->sc_base.me_evp == NULL) {
#ifdef DIAGNOSTIC
printf("wsmuxpoll: not open\n");
#endif
return (POLLHUP);
}
return (wsevent_poll(sc->sc_base.me_evp, events, l));
}
/*
* kqfilter() of the pseudo device from device table.
*/
int
wsmuxkqfilter(dev_t dev, struct knote *kn)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
if (WSMUXCTL(minr)) {
/* control device */
return (1);
}
if (sc->sc_base.me_evp == NULL) {
#ifdef DIAGNOSTIC
printf("wsmuxkqfilter: not open\n");
#endif
return (1);
}
return (wsevent_kqfilter(sc->sc_base.me_evp, kn));
}
/*
* Add mux unit as a child to muxsc.
*/
int
wsmux_add_mux(int unit, struct wsmux_softc *muxsc)
{
struct wsmux_softc *sc, *m;
sc = wsmux_getmux(unit);
if (sc == NULL)
return (ENXIO);
DPRINTF(("wsmux_add_mux: %s(%p) to %s(%p)\n",
device_xname(sc->sc_base.me_dv), sc,
device_xname(muxsc->sc_base.me_dv), muxsc));
if (sc->sc_base.me_parent != NULL || sc->sc_base.me_evp != NULL)
return (EBUSY);
/* The mux we are adding must not be an ancestor of itself. */
for (m = muxsc; m != NULL ; m = m->sc_base.me_parent)
if (m == sc)
return (EINVAL);
return (wsmux_attach_sc(muxsc, &sc->sc_base));
}
/* Create a new mux softc. */
struct wsmux_softc *
wsmux_create(const char *name, int unit)
{
struct wsmux_softc *sc;
/* XXX This is wrong -- should use autoconfiguration framework */
DPRINTF(("wsmux_create: allocating\n"));
sc = malloc(sizeof *sc, M_DEVBUF, M_WAITOK|M_ZERO);
sc->sc_base.me_dv = malloc(sizeof(struct device), M_DEVBUF,
M_WAITOK|M_ZERO);
TAILQ_INIT(&sc->sc_cld);
snprintf(sc->sc_base.me_dv->dv_xname,
sizeof sc->sc_base.me_dv->dv_xname, "%s%d", name, unit);
sc->sc_base.me_dv->dv_private = sc;
sc->sc_base.me_dv->dv_unit = unit;
sc->sc_base.me_ops = &wsmux_srcops;
sc->sc_kbd_layout = KB_NONE;
return (sc);
}
/* Attach me as a child to sc. */
int
wsmux_attach_sc(struct wsmux_softc *sc, struct wsevsrc *me)
{
int error;
if (sc == NULL)
return (EINVAL);
DPRINTF(("wsmux_attach_sc: %s(%p): type=%d\n",
device_xname(sc->sc_base.me_dv), sc, me->me_ops->type));
#ifdef DIAGNOSTIC
if (me->me_parent != NULL) {
printf("wsmux_attach_sc: busy\n");
return (EBUSY);
}
#endif
me->me_parent = sc;
TAILQ_INSERT_TAIL(&sc->sc_cld, me, me_next);
error = 0;
#if NWSDISPLAY > 0
if (sc->sc_base.me_dispdv != NULL) {
/* This is a display mux, so attach the new device to it. */
DPRINTF(("wsmux_attach_sc: %s: set display %p\n",
device_xname(sc->sc_base.me_dv),
sc->sc_base.me_dispdv));
if (me->me_ops->dsetdisplay != NULL) {
error = wsevsrc_set_display(me, &sc->sc_base);
/* Ignore that the console already has a display. */
if (error == EBUSY)
error = 0;
if (!error) {
#ifdef WSDISPLAY_COMPAT_RAWKBD
DPRINTF(("wsmux_attach_sc: %s set rawkbd=%d\n",
device_xname(me->me_dv),
sc->sc_rawkbd));
(void)wsevsrc_ioctl(me, WSKBDIO_SETMODE,
&sc->sc_rawkbd, 0, 0);
#endif
if (sc->sc_kbd_layout != KB_NONE)
(void)wsevsrc_ioctl(me,
WSKBDIO_SETENCODING,
&sc->sc_kbd_layout, FWRITE, 0);
}
}
}
#endif
if (sc->sc_base.me_evp != NULL) {
/* Mux is open, so open the new subdevice */
DPRINTF(("wsmux_attach_sc: %s: calling open of %s\n",
device_xname(sc->sc_base.me_dv),
device_xname(me->me_dv)));
error = wsevsrc_open(me, sc->sc_base.me_evp);
} else {
DPRINTF(("wsmux_attach_sc: %s not open\n",
device_xname(sc->sc_base.me_dv)));
}
if (error) {
me->me_parent = NULL;
TAILQ_REMOVE(&sc->sc_cld, me, me_next);
}
DPRINTF(("wsmux_attach_sc: %s(%p) done, error=%d\n",
device_xname(sc->sc_base.me_dv), sc, error));
return (error);
}
/* Remove me from the parent. */
void
wsmux_detach_sc(struct wsevsrc *me)
{
struct wsmux_softc *sc = me->me_parent;
DPRINTF(("wsmux_detach_sc: %s(%p) parent=%p\n",
device_xname(me->me_dv), me, sc));
#ifdef DIAGNOSTIC
if (sc == NULL) {
printf("wsmux_detach_sc: %s has no parent\n",
device_xname(me->me_dv));
return;
}
#endif
#if NWSDISPLAY > 0
if (sc->sc_base.me_dispdv != NULL) {
if (me->me_ops->dsetdisplay != NULL)
/* ignore error, there's nothing we can do */
(void)wsevsrc_set_display(me, NULL);
} else
#endif
if (me->me_evp != NULL) {
DPRINTF(("wsmux_detach_sc: close\n"));
/* mux device is open, so close multiplexee */
(void)wsevsrc_close(me);
}
TAILQ_REMOVE(&sc->sc_cld, me, me_next);
me->me_parent = NULL;
DPRINTF(("wsmux_detach_sc: done sc=%p\n", sc));
}
/*
* Display ioctl() of a mux via the parent mux.
*/
int
wsmux_do_displayioctl(device_t dv, u_long cmd, void *data, int flag,
struct lwp *l)
{
struct wsmux_softc *sc = device_private(dv);
struct wsevsrc *me;
int error, ok;
DPRINTF(("wsmux_displayioctl: %s: sc=%p, cmd=%08lx\n",
device_xname(sc->sc_base.me_dv), sc, cmd));
#ifdef WSDISPLAY_COMPAT_RAWKBD
if (cmd == WSKBDIO_SETMODE) {
sc->sc_rawkbd = *(int *)data;
DPRINTF(("wsmux_displayioctl: rawkbd = %d\n", sc->sc_rawkbd));
}
#endif
/*
* Return 0 if any of the ioctl() succeeds, otherwise the last error.
* Return EPASSTHROUGH if no mux component accepts the ioctl.
*/
error = EPASSTHROUGH;
ok = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmux_displayioctl: me=%p\n", me));
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmux_displayioctl: bad child %p\n", me);
continue;
}
#endif
if (me->me_ops->ddispioctl != NULL) {
error = wsevsrc_display_ioctl(me, cmd, data, flag, l);
DPRINTF(("wsmux_displayioctl: me=%p dev=%s ==> %d\n",
me, device_xname(me->me_dv), error));
if (!error)
ok = 1;
}
}
if (ok)
error = 0;
return (error);
}
#if NWSDISPLAY > 0
/*
* Set display of a mux via the parent mux.
*/
int
wsmux_evsrc_set_display(device_t dv, struct wsevsrc *ame)
{
struct wsmux_softc *muxsc = (struct wsmux_softc *)ame;
struct wsmux_softc *sc = device_private(dv);
device_t displaydv = muxsc ? muxsc->sc_base.me_dispdv : NULL;
DPRINTF(("wsmux_set_display: %s: displaydv=%p\n",
device_xname(sc->sc_base.me_dv), displaydv));
if (displaydv != NULL) {
if (sc->sc_base.me_dispdv != NULL)
return (EBUSY);
} else {
if (sc->sc_base.me_dispdv == NULL)
return (ENXIO);
}
return wsmux_set_display(sc, displaydv);
}
int
wsmux_set_display(struct wsmux_softc *sc, device_t displaydv)
{
device_t odisplaydv;
struct wsevsrc *me;
struct wsmux_softc *nsc = displaydv ? sc : NULL;
int error, ok;
odisplaydv = sc->sc_base.me_dispdv;
sc->sc_base.me_dispdv = displaydv;
if (displaydv)
aprint_verbose_dev(sc->sc_base.me_dv, "connecting to %s\n",
device_xname(displaydv));
ok = 0;
error = 0;
TAILQ_FOREACH(me, &sc->sc_cld,me_next) {
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmux_set_display: bad child parent %p\n", me);
continue;
}
#endif
if (me->me_ops->dsetdisplay != NULL) {
error = wsevsrc_set_display(me, &nsc->sc_base);
DPRINTF(("wsmux_set_display: m=%p dev=%s error=%d\n",
me, device_xname(me->me_dv), error));
if (!error) {
ok = 1;
#ifdef WSDISPLAY_COMPAT_RAWKBD
DPRINTF(("wsmux_set_display: %s set rawkbd=%d\n",
device_xname(me->me_dv), sc->sc_rawkbd));
(void)wsevsrc_ioctl(me, WSKBDIO_SETMODE,
&sc->sc_rawkbd, 0, 0);
#endif
}
}
}
if (ok)
error = 0;
if (displaydv == NULL)
aprint_verbose("%s: disconnecting from %s\n",
device_xname(sc->sc_base.me_dv),
device_xname(odisplaydv));
return (error);
}
#endif /* NWSDISPLAY > 0 */
/* $NetBSD: tcp_var.h,v 1.198 2022/10/28 05:18:39 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_var.h 8.4 (Berkeley) 5/24/95
*/
#ifndef _NETINET_TCP_VAR_H_
#define _NETINET_TCP_VAR_H_
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_mbuftrace.h"
#endif
/*
* TCP kernel structures and variables.
*/
#include <sys/callout.h>
#ifdef TCP_SIGNATURE
/*
* Defines which are needed by the xform_tcp module and tcp_[in|out]put
* for SADB verification and lookup.
*/
#define TCP_SIGLEN 16 /* length of computed digest in bytes */
#define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */
#define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */
/*
* Only a single SA per host may be specified at this time. An SPI is
* needed in order for the KEY_LOOKUP_SA() lookup to work.
*/
#define TCP_SIG_SPI 0x1000
#endif /* TCP_SIGNATURE */
/*
* Tcp+ip header, after ip options removed.
*/
struct tcpiphdr {
struct ipovly ti_i; /* overlaid ip structure */
struct tcphdr ti_t; /* tcp header */
};
#ifdef CTASSERT
CTASSERT(sizeof(struct tcpiphdr) == 40);
#endif
#define ti_x1 ti_i.ih_x1
#define ti_pr ti_i.ih_pr
#define ti_len ti_i.ih_len
#define ti_src ti_i.ih_src
#define ti_dst ti_i.ih_dst
#define ti_sport ti_t.th_sport
#define ti_dport ti_t.th_dport
#define ti_seq ti_t.th_seq
#define ti_ack ti_t.th_ack
#define ti_x2 ti_t.th_x2
#define ti_off ti_t.th_off
#define ti_flags ti_t.th_flags
#define ti_win ti_t.th_win
#define ti_sum ti_t.th_sum
#define ti_urp ti_t.th_urp
/*
* SACK option block.
*/
struct sackblk {
tcp_seq left; /* Left edge of sack block. */
tcp_seq right; /* Right edge of sack block. */
};
TAILQ_HEAD(sackhead, sackhole);
struct sackhole {
tcp_seq start;
tcp_seq end;
tcp_seq rxmit;
TAILQ_ENTRY(sackhole) sackhole_q;
};
struct syn_cache;
/*
* Tcp control block, one per tcp; fields:
*/
struct tcpcb {
int t_family; /* address family on the wire */
struct ipqehead segq; /* sequencing queue */
int t_segqlen; /* length of the above */
callout_t t_timer[TCPT_NTIMERS];/* tcp timers */
short t_state; /* state of this connection */
short t_rxtshift; /* log(2) of rexmt exp. backoff */
uint32_t t_rxtcur; /* current retransmit value */
short t_dupacks; /* consecutive dup acks recd */
/*
* t_partialacks:
* <0 not in fast recovery.
* ==0 in fast recovery. has not received partial acks
* >0 in fast recovery. has received partial acks
*/
short t_partialacks; /* partial acks during fast rexmit */
u_short t_peermss; /* peer's maximum segment size */
u_short t_ourmss; /* our maximum segment size */
u_short t_segsz; /* current segment size in use */
char t_force; /* 1 if forcing out a byte */
u_int t_flags;
#define TF_ACKNOW 0x0001 /* ack peer immediately */
#define TF_DELACK 0x0002 /* ack, but try to delay it */
#define TF_NODELAY 0x0004 /* don't delay packets to coalesce */
#define TF_NOOPT 0x0008 /* don't use tcp options */
#define TF_REQ_SCALE 0x0020 /* have/will request window scaling */
#define TF_RCVD_SCALE 0x0040 /* other side has requested scaling */
#define TF_REQ_TSTMP 0x0080 /* have/will request timestamps */
#define TF_RCVD_TSTMP 0x0100 /* a timestamp was received in SYN */
#define TF_SACK_PERMIT 0x0200 /* other side said I could SACK */
#define TF_SYN_REXMT 0x0400 /* rexmit timer fired on SYN */
#define TF_WILL_SACK 0x0800 /* try to use SACK */
#define TF_REASSEMBLING 0x1000 /* we're busy reassembling */
#define TF_DEAD 0x2000 /* dead and to-be-released */
#define TF_PMTUD_PEND 0x4000 /* Path MTU Discovery pending */
#define TF_ECN_PERMIT 0x10000 /* other side said is ECN-ready */
#define TF_ECN_SND_CWR 0x20000 /* ECN CWR in queue */
#define TF_ECN_SND_ECE 0x40000 /* ECN ECE in queue */
#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */
struct mbuf *t_template; /* skeletal packet for transmit */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
callout_t t_delack_ch; /* delayed ACK callout */
/*
* The following fields are used as in the protocol specification.
* See RFC793, Dec. 1981, page 21.
*/
/* send sequence variables */
tcp_seq snd_una; /* send unacknowledged */
tcp_seq snd_nxt; /* send next */
tcp_seq snd_up; /* send urgent pointer */
tcp_seq snd_wl1; /* window update seg seq number */
tcp_seq snd_wl2; /* window update seg ack number */
tcp_seq iss; /* initial send sequence number */
u_long snd_wnd; /* send window */
/*
* snd_recover
* it's basically the same as the "recover" variable in RFC 2582 (NewReno).
* when entering fast retransmit, it's set to snd_max.
* newreno uses this to detect partial ack.
* snd_high
* it's basically the same as the "send_high" variable in RFC 2582 (NewReno).
* on each RTO, it's set to snd_max.
* newreno uses this to avoid false fast retransmits.
*/
tcp_seq snd_recover;
tcp_seq snd_high;
/* receive sequence variables */
u_long rcv_wnd; /* receive window */
tcp_seq rcv_nxt; /* receive next */
tcp_seq rcv_up; /* receive urgent pointer */
tcp_seq irs; /* initial receive sequence number */
/*
* Additional variables for this implementation.
*/
/* receive variables */
tcp_seq rcv_adv; /* advertised window */
/*
* retransmit variables
*
* snd_max
* the highest sequence number we've ever sent.
* used to recognize retransmits.
*/
tcp_seq snd_max;
/* congestion control (for slow start, source quench, retransmit after loss) */
u_long snd_cwnd; /* congestion-controlled window */
u_long snd_ssthresh; /* snd_cwnd size threshold for
* slow start exponential to
* linear switch
*/
/* auto-sizing variables */
u_int rfbuf_cnt; /* recv buffer autoscaling byte count */
uint32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
/*
* transmit timing stuff. See below for scale of srtt and rttvar.
* "Variance" is actually smoothed difference.
*/
uint32_t t_rcvtime; /* time last segment received */
uint32_t t_rtttime; /* time we started measuring rtt */
tcp_seq t_rtseq; /* sequence number being timed */
int32_t t_srtt; /* smoothed round-trip time */
int32_t t_rttvar; /* variance in round-trip time */
uint32_t t_rttmin; /* minimum rtt allowed */
u_long max_sndwnd; /* largest window peer has offered */
/* out-of-band data */
char t_oobflags; /* have some */
char t_iobc; /* input character */
#define TCPOOB_HAVEDATA 0x01
#define TCPOOB_HADDATA 0x02
short t_softerror; /* possible error not yet reported */
/* RFC 1323 variables */
u_char snd_scale; /* window scaling for send window */
u_char rcv_scale; /* window scaling for recv window */
u_char request_r_scale; /* pending window scaling */
u_char requested_s_scale;
u_int32_t ts_recent; /* timestamp echo data */
u_int32_t ts_recent_age; /* when last updated */
u_int32_t ts_timebase; /* our timebase */
tcp_seq last_ack_sent;
/* RFC 3465 variables */
u_long t_bytes_acked; /* ABC "bytes_acked" parameter */
/* SACK stuff */
#define TCP_SACK_MAX 3
#define TCPSACK_NONE 0
#define TCPSACK_HAVED 1
u_char rcv_sack_flags; /* SACK flags. */
struct sackblk rcv_dsack_block; /* RX D-SACK block. */
struct ipqehead timeq; /* time sequenced queue. */
struct sackhead snd_holes; /* TX SACK holes. */
int snd_numholes; /* Number of TX SACK holes. */
tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
tcp_seq sack_newdata; /* New data xmitted in this recovery
episode starts at this seq number*/
tcp_seq snd_fack; /* FACK TCP. Forward-most data held by
peer. */
/* CUBIC variables */
ulong snd_cubic_wmax; /* W_max */
ulong snd_cubic_wmax_last; /* Used for fast convergence */
ulong snd_cubic_ctime; /* Last congestion time */
/* pointer for syn cache entries*/
LIST_HEAD(, syn_cache) t_sc; /* list of entries by this tcb */
/* prediction of next mbuf when using large window sizes */
struct mbuf *t_lastm; /* last mbuf that data was sent from */
int t_inoff; /* data offset in previous mbuf */
int t_lastoff; /* last data address in mbuf chain */
int t_lastlen; /* last length read from mbuf chain */
/* Path-MTU discovery blackhole detection */
int t_mtudisc; /* perform mtudisc for this tcb */
/* Path-MTU Discovery Information */
u_int t_pmtud_mss_acked; /* MSS acked, lower bound for MTU */
u_int t_pmtud_mtu_sent; /* MTU used, upper bound for MTU */
tcp_seq t_pmtud_th_seq; /* TCP SEQ from ICMP payload */
u_int t_pmtud_nextmtu; /* Advertised Next-Hop MTU from ICMP */
u_short t_pmtud_ip_len; /* IP length from ICMP payload */
u_short t_pmtud_ip_hl; /* IP header length from ICMP payload */
uint8_t t_ecn_retries; /* # of ECN setup retries */
const struct tcp_congctl *t_congctl; /* per TCB congctl algorithm */
/* Keepalive per socket */
u_int t_keepinit;
u_int t_keepidle;
u_int t_keepintvl;
u_int t_keepcnt;
u_int t_maxidle; /* t_keepcnt * t_keepintvl */
u_int t_msl; /* MSL to use for this connexion */
/* maintain a few stats per connection: */
uint32_t t_rcvoopack; /* out-of-order packets received */
uint32_t t_sndrexmitpack; /* retransmit packets sent */
uint32_t t_sndzerowin; /* zero-window updates sent */
};
/*
* Macros to aid ECN TCP.
*/
#define TCP_ECN_ALLOWED(tp) (tp->t_flags & TF_ECN_PERMIT)
/*
* Macros to aid SACK/FACK TCP.
*/
#define TCP_SACK_ENABLED(tp) (tp->t_flags & TF_WILL_SACK)
#define TCP_FACK_FASTRECOV(tp) \
(TCP_SACK_ENABLED(tp) && \
(SEQ_GT(tp->snd_fack, tp->snd_una + tcprexmtthresh * tp->t_segsz)))
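/*
 * That is, fast recovery is entered once the forward-most SACKed data
 * (snd_fack) lies more than tcprexmtthresh segments beyond snd_una,
 * which is the usual duplicate-ACK threshold expressed in sequence
 * space rather than in ACK counts.
 */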
#ifdef _KERNEL
/*
* TCP reassembly queue locks.
*/
static __inline int tcp_reass_lock_try (struct tcpcb *)
__unused;
static __inline void tcp_reass_unlock (struct tcpcb *)
__unused;
static __inline int
tcp_reass_lock_try(struct tcpcb *tp)
{
int s;
/*
* Use splvm() -- we're blocking things that would cause
* mbuf allocation.
*/
s = splvm();
if (tp->t_flags & TF_REASSEMBLING) {
splx(s);
return (0);
}
tp->t_flags |= TF_REASSEMBLING;
splx(s);
return (1);
}
static __inline void
tcp_reass_unlock(struct tcpcb *tp)
{
int s;
s = splvm();
KASSERT((tp->t_flags & TF_REASSEMBLING) != 0);
tp->t_flags &= ~TF_REASSEMBLING;
splx(s);
}
#ifdef DIAGNOSTIC
#define TCP_REASS_LOCK(tp) \
do { \
if (tcp_reass_lock_try(tp) == 0) { \
printf("%s:%d: tcpcb %p reass already locked\n", \
__FILE__, __LINE__, tp); \
panic("tcp_reass_lock"); \
} \
} while (/*CONSTCOND*/ 0)
#define TCP_REASS_LOCK_CHECK(tp) \
do { \
if (((tp)->t_flags & TF_REASSEMBLING) == 0) { \
printf("%s:%d: tcpcb %p reass lock not held\n", \
__FILE__, __LINE__, tp); \
panic("tcp reass lock check"); \
} \
} while (/*CONSTCOND*/ 0)
#else
#define TCP_REASS_LOCK(tp) (void) tcp_reass_lock_try((tp))
#define TCP_REASS_LOCK_CHECK(tp) /* nothing */
#endif
#define TCP_REASS_UNLOCK(tp) tcp_reass_unlock((tp))
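/*
 * A minimal sketch of the intended locking pattern around reassembly
 * (the function name below is hypothetical; only the macros above are
 * taken from this header):
 */
#if 0 /* example only */
static void
example_reass(struct tcpcb *tp)
{
TCP_REASS_LOCK(tp); /* panics under DIAGNOSTIC if already locked */
/* ... manipulate tp->segq / tp->t_segqlen here ... */
TCP_REASS_LOCK_CHECK(tp); /* asserts TF_REASSEMBLING is still set */
TCP_REASS_UNLOCK(tp);
}
#endif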
#endif /* _KERNEL */
/*
* Queue for delayed ACK processing.
*/
#ifdef _KERNEL
extern int tcp_delack_ticks;
void tcp_delack(void *);
#define TCP_RESTART_DELACK(tp) \
callout_reset(&(tp)->t_delack_ch, tcp_delack_ticks, \
tcp_delack, tp)
#define TCP_SET_DELACK(tp) \
do { \
if (((tp)->t_flags & TF_DELACK) == 0) { \
(tp)->t_flags |= TF_DELACK; \
TCP_RESTART_DELACK(tp); \
} \
} while (/*CONSTCOND*/0)
#define TCP_CLEAR_DELACK(tp) \
do { \
if ((tp)->t_flags & TF_DELACK) { \
(tp)->t_flags &= ~TF_DELACK; \
callout_stop(&(tp)->t_delack_ch); \
} \
} while (/*CONSTCOND*/0)
#endif /* _KERNEL */
/*
* Compute the current timestamp for a connection.
*/
#define TCP_TIMESTAMP(tp) (tcp_now - (tp)->ts_timebase)
/*
* Handy way of passing around TCP option info.
*/
struct tcp_opt_info {
int ts_present;
u_int32_t ts_val;
u_int32_t ts_ecr;
u_int16_t maxseg;
};
#define TOF_SIGNATURE 0x0040 /* signature option present */
#define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
#define sototcpcb(so) (intotcpcb(sotoinpcb(so)))
/*
* See RFC2988 for a discussion of RTO calculation; comments assume
* familiarity with that document.
*
* The smoothed round-trip time and estimated variance are stored as
* fixed point numbers. Historically, srtt was scaled by
* TCP_RTT_SHIFT bits, and rttvar by TCP_RTTVAR_SHIFT bits. Because
* the values coincide with the alpha and beta parameters suggested
* for RTO calculation (1/8 for srtt, 1/4 for rttvar), the combination
* of computing 1/8 of the new value and transforming it to the
* fixed-point representation required zero instructions. However,
* the storage representations no longer coincide with the alpha/beta
* shifts; instead, more fractional bits are present.
*
* The storage representation of srtt is 1/32 slow ticks, or 1/64 s.
* (The assumption that a slow tick is 500 ms should not be present in
* the code.)
*
* The storage representation of rttvar is 1/16 slow ticks, or 1/32 s.
* There may be some confusion about this in the code.
*
* For historical reasons, these scales are also used in smoothing the
* average (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
* This results in alpha of 0.125 and beta of 0.25, following RFC2988
* section 2.3
*
* XXX Change SHIFT values to LGWEIGHT and REP_SHIFT, and adjust
* the code to use the correct ones.
*/
#define TCP_RTT_SHIFT 3 /* shift for srtt; 3 bits frac. */
#define TCP_RTTVAR_SHIFT 2 /* multiplier for rttvar; 2 bits */
/*
* Compute TCP retransmission timer, following RFC2988.
* This macro returns a value in slow timeout ticks.
*
* Section 2.2 requires that the RTO value be
* srtt + max(G, 4*RTTVAR)
* where G is the clock granularity.
*
* This comment has not necessarily been updated for the new storage
* representation:
*
* Because of the way we do the smoothing, srtt and rttvar
* will each average +1/2 tick of bias. When we compute
* the retransmit timer, we want 1/2 tick of rounding and
* 1 extra tick because of +-1/2 tick uncertainty in the
* firing of the timer. The bias will give us exactly the
* 1.5 tick we need. But, because the bias is
* statistical, we have to test that we don't drop below
* the minimum feasible timer (which is 2 ticks).
* This macro assumes that the value of 1<<TCP_RTTVAR_SHIFT
* is the same as the multiplier for rttvar.
*
* This macro appears to be wrong; it should be checking rttvar*4 in
* ticks and making sure we use 1 instead if rttvar*4 rounds to 0. It
* appears to be treating srtt as being in the old storage
* representation, resulting in a factor of 4 extra.
*/
#define TCP_REXMTVAL(tp) \
((((tp)->t_srtt >> TCP_RTT_SHIFT) + (tp)->t_rttvar) >> 2)
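/*
 * Worked example of the scaling above, assuming the storage representation
 * described earlier (srtt in 1/32 slow ticks, rttvar in 1/16 slow ticks):
 * with t_srtt == 64 (2 slow ticks) and t_rttvar == 8 (1/2 slow tick),
 * TCP_REXMTVAL yields ((64 >> 3) + 8) >> 2 == 4 slow timeout ticks.
 */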
/*
* Compute the initial window for slow start.
*/
#define TCP_INITIAL_WINDOW(iw, segsz) \
uimin((iw) * (segsz), uimax(2 * (segsz), tcp_init_win_max[(iw)]))
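/*
 * That is, an initial window of "iw" segments, capped at
 * uimax(2 * segsz, tcp_init_win_max[iw]) bytes; the cap itself therefore
 * never drops below two segments.
 */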
/*
* TCP statistics.
* Each counter is an unsigned 64-bit value.
*
* Many of these should be kept per connection, but that's inconvenient
* at the moment.
*/
#define TCP_STAT_CONNATTEMPT 0 /* connections initiated */
#define TCP_STAT_ACCEPTS 1 /* connections accepted */
#define TCP_STAT_CONNECTS 2 /* connections established */
#define TCP_STAT_DROPS 3 /* connections dropped */
#define TCP_STAT_CONNDROPS 4 /* embryonic connections dropped */
#define TCP_STAT_CLOSED 5 /* conn. closed (includes drops) */
#define TCP_STAT_SEGSTIMED 6 /* segs where we tried to get rtt */
#define TCP_STAT_RTTUPDATED 7 /* times we succeeded */
#define TCP_STAT_DELACK 8 /* delayed ACKs sent */
#define TCP_STAT_TIMEOUTDROP 9 /* conn. dropped in rxmt timeout */
#define TCP_STAT_REXMTTIMEO 10 /* retransmit timeouts */
#define TCP_STAT_PERSISTTIMEO 11 /* persist timeouts */
#define TCP_STAT_KEEPTIMEO 12 /* keepalive timeouts */
#define TCP_STAT_KEEPPROBE 13 /* keepalive probes sent */
#define TCP_STAT_KEEPDROPS 14 /* connections dropped in keepalive */
#define TCP_STAT_PERSISTDROPS 15 /* connections dropped in persist */
#define TCP_STAT_CONNSDRAINED 16 /* connections drained due to memory
shortage */
#define TCP_STAT_PMTUBLACKHOLE 17 /* PMTUD blackhole detected */
#define TCP_STAT_SNDTOTAL 18 /* total packets sent */
#define TCP_STAT_SNDPACK 19 /* data packets sent */
#define TCP_STAT_SNDBYTE 20 /* data bytes sent */
#define TCP_STAT_SNDREXMITPACK 21 /* data packets retransmitted */
#define TCP_STAT_SNDREXMITBYTE 22 /* data bytes retransmitted */
#define TCP_STAT_SNDACKS 23 /* ACK-only packets sent */
#define TCP_STAT_SNDPROBE 24 /* window probes sent */
#define TCP_STAT_SNDURG 25 /* packets sent with URG only */
#define TCP_STAT_SNDWINUP 26 /* window update-only packets sent */
#define TCP_STAT_SNDCTRL 27 /* control (SYN|FIN|RST) packets sent */
#define TCP_STAT_RCVTOTAL 28 /* total packets received */
#define TCP_STAT_RCVPACK 29 /* packets received in sequence */
#define TCP_STAT_RCVBYTE 30 /* bytes received in sequence */
#define TCP_STAT_RCVBADSUM 31 /* packets received with cksum errs */
#define TCP_STAT_RCVBADOFF 32 /* packets received with bad offset */
#define TCP_STAT_RCVMEMDROP 33 /* packets dropped for lack of memory */
#define TCP_STAT_RCVSHORT 34 /* packets received too short */
#define TCP_STAT_RCVDUPPACK 35 /* duplicate-only packets received */
#define TCP_STAT_RCVDUPBYTE 36 /* duplicate-only bytes received */
#define TCP_STAT_RCVPARTDUPPACK 37 /* packets with some duplicate data */
#define TCP_STAT_RCVPARTDUPBYTE 38 /* dup. bytes in part-dup. packets */
#define TCP_STAT_RCVOOPACK 39 /* out-of-order packets received */
#define TCP_STAT_RCVOOBYTE 40 /* out-of-order bytes received */
#define TCP_STAT_RCVPACKAFTERWIN 41 /* packets with data after window */
#define TCP_STAT_RCVBYTEAFTERWIN 42 /* bytes received after window */
#define TCP_STAT_RCVAFTERCLOSE 43 /* packets received after "close" */
#define TCP_STAT_RCVWINPROBE 44 /* rcvd window probe packets */
#define TCP_STAT_RCVDUPACK 45 /* rcvd duplicate ACKs */
#define TCP_STAT_RCVACKTOOMUCH 46 /* rcvd ACKs for unsent data */
#define TCP_STAT_RCVACKPACK 47 /* rcvd ACK packets */
#define TCP_STAT_RCVACKBYTE 48 /* bytes ACKed by rcvd ACKs */
#define TCP_STAT_RCVWINUPD 49 /* rcvd window update packets */
#define TCP_STAT_PAWSDROP 50 /* segments dropped due to PAWS */
#define TCP_STAT_PREDACK 51 /* times hdr predict OK for ACKs */
#define TCP_STAT_PREDDAT 52 /* times hdr predict OK for data pkts */
#define TCP_STAT_PCBHASHMISS 53 /* input packets missing PCB hash */
#define TCP_STAT_NOPORT 54 /* no socket on port */
#define TCP_STAT_BADSYN 55 /* received ACK for which we have
no SYN in compressed state */
#define TCP_STAT_DELAYED_FREE 56 /* delayed pool_put() of tcpcb */
#define TCP_STAT_SC_ADDED 57 /* # of sc entries added */
#define TCP_STAT_SC_COMPLETED 58 /* # of sc connections completed */
#define TCP_STAT_SC_TIMED_OUT 59 /* # of sc entries timed out */
#define TCP_STAT_SC_OVERFLOWED 60 /* # of sc drops due to overflow */
#define TCP_STAT_SC_RESET 61 /* # of sc drops due to RST */
#define TCP_STAT_SC_UNREACH 62 /* # of sc drops due to ICMP unreach */
#define TCP_STAT_SC_BUCKETOVERFLOW 63 /* # of sc drops due to bucket ovflow */
#define TCP_STAT_SC_ABORTED 64 /* # of sc entries aborted (no mem) */
#define TCP_STAT_SC_DUPESYN 65 /* # of duplicate SYNs received */
#define TCP_STAT_SC_DROPPED 66 /* # of SYNs dropped (no route/mem) */
#define TCP_STAT_SC_COLLISIONS 67 /* # of sc hash collisions */
#define TCP_STAT_SC_RETRANSMITTED 68 /* # of sc retransmissions */
#define TCP_STAT_SC_DELAYED_FREE 69 /* # of delayed pool_put()s */
#define TCP_STAT_SELFQUENCH 70 /* # of ENOBUFS we get on output */
#define TCP_STAT_BADSIG 71 /* # of drops due to bad signature */
#define TCP_STAT_GOODSIG 72 /* # of packets with good signature */
#define TCP_STAT_ECN_SHS 73 /* # of successful ECN handshakes */
#define TCP_STAT_ECN_CE 74 /* # of packets with CE bit */
#define TCP_STAT_ECN_ECT 75 /* # of packets with ECT(0) bit */
#define TCP_NSTATS 76
/*
* Names for TCP sysctl objects.
*/
#define TCPCTL_RFC1323 1 /* RFC1323 timestamps/scaling */
#define TCPCTL_SENDSPACE 2 /* default send buffer */
#define TCPCTL_RECVSPACE 3 /* default recv buffer */
#define TCPCTL_MSSDFLT 4 /* default seg size */
#define TCPCTL_SYN_CACHE_LIMIT 5 /* max size of comp. state engine */
#define TCPCTL_SYN_BUCKET_LIMIT 6 /* max size of hash bucket */
#if 0 /*obsoleted*/
#define TCPCTL_SYN_CACHE_INTER 7 /* interval of comp. state timer */
#endif
#define TCPCTL_INIT_WIN 8 /* initial window */
#define TCPCTL_MSS_IFMTU 9 /* mss from interface, not in_maxmtu */
#define TCPCTL_SACK 10 /* RFC2018 selective acknowledgement */
#define TCPCTL_WSCALE 11 /* RFC1323 window scaling */
#define TCPCTL_TSTAMP 12 /* RFC1323 timestamps */
#if 0 /*obsoleted*/
#define TCPCTL_COMPAT_42 13 /* 4.2BSD TCP bug work-arounds */
#endif
#define TCPCTL_CWM 14 /* Congestion Window Monitoring */
#define TCPCTL_CWM_BURSTSIZE 15 /* burst size allowed by CWM */
#define TCPCTL_ACK_ON_PUSH 16 /* ACK immediately on PUSH */
#define TCPCTL_KEEPIDLE 17 /* keepalive idle time */
#define TCPCTL_KEEPINTVL 18 /* keepalive probe interval */
#define TCPCTL_KEEPCNT 19 /* keepalive count */
#define TCPCTL_SLOWHZ 20 /* PR_SLOWHZ (read-only) */
#define TCPCTL_NEWRENO 21 /* NewReno Congestion Control */
#define TCPCTL_LOG_REFUSED 22 /* Log refused connections */
#if 0 /*obsoleted*/
#define TCPCTL_RSTRATELIMIT 23 /* RST rate limit */
#endif
#define TCPCTL_RSTPPSLIMIT 24 /* RST pps limit */
#define TCPCTL_DELACK_TICKS 25 /* # ticks to delay ACK */
#define TCPCTL_INIT_WIN_LOCAL 26 /* initial window for local nets */
#define TCPCTL_IDENT 27 /* rfc 931 identd */
#define TCPCTL_ACKDROPRATELIMIT 28 /* SYN/RST -> ACK rate limit */
#define TCPCTL_LOOPBACKCKSUM 29 /* do TCP checksum on loopback */
#define TCPCTL_STATS 30 /* TCP statistics */
#define TCPCTL_DEBUG 31 /* TCP debug sockets */
#define TCPCTL_DEBX 32 /* # of tcp debug sockets */
#define TCPCTL_DROP 33 /* drop tcp connection */
#define TCPCTL_MSL 34 /* Max Segment Life */
#ifdef _KERNEL
extern struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
extern const struct pr_usrreqs tcp_usrreqs;
extern u_int32_t tcp_now; /* for RFC 1323 timestamps */
extern int tcp_do_rfc1323; /* enabled/disabled? */
extern int tcp_do_sack; /* SACK enabled/disabled? */
extern int tcp_do_win_scale; /* RFC1323 window scaling enabled/disabled? */
extern int tcp_do_timestamps; /* RFC1323 timestamps enabled/disabled? */
extern int tcp_mssdflt; /* default seg size */
extern int tcp_minmss; /* minimal seg size */
extern int tcp_msl; /* max segment life */
extern int tcp_init_win; /* initial window */
extern int tcp_init_win_local; /* initial window for local nets */
extern int tcp_init_win_max[11];/* max sizes for values of tcp_init_win_* */
extern int tcp_mss_ifmtu; /* take MSS from interface, not in_maxmtu */
extern int tcp_cwm; /* enable Congestion Window Monitoring */
extern int tcp_cwm_burstsize; /* burst size allowed by CWM */
extern int tcp_ack_on_push; /* ACK immediately on PUSH */
extern int tcp_log_refused; /* log refused connections */
extern int tcp_do_ecn; /* TCP ECN enabled/disabled? */
extern int tcp_ecn_maxretries; /* Max ECN setup retries */
extern int tcp_do_rfc1948; /* ISS by cryptographic hash */
extern int tcp_sack_tp_maxholes; /* Max holes per connection. */
extern int tcp_sack_globalmaxholes; /* Max holes per system. */
extern int tcp_sack_globalholes; /* Number of holes present. */
extern int tcp_do_abc; /* RFC3465 ABC enabled/disabled? */
extern int tcp_abc_aggressive; /* 1: L=2*SMSS 0: L=1*SMSS */
extern int tcp_msl_enable; /* enable TIME_WAIT truncation */
extern int tcp_msl_loop; /* MSL for loopback */
extern int tcp_msl_local; /* MSL for 'local' */
extern int tcp_msl_remote; /* MSL otherwise */
extern int tcp_msl_remote_threshold; /* RTT threshold */
extern int tcp_rttlocal; /* Use RTT to decide who's 'local' */
extern int tcp4_vtw_enable;
extern int tcp6_vtw_enable;
extern int tcp_vtw_was_enabled;
extern int tcp_vtw_entries;
extern int tcp_rst_ppslim;
extern int tcp_ackdrop_ppslim;
#ifdef MBUFTRACE
extern struct mowner tcp_rx_mowner;
extern struct mowner tcp_tx_mowner;
extern struct mowner tcp_reass_mowner;
extern struct mowner tcp_sock_mowner;
extern struct mowner tcp_sock_rx_mowner;
extern struct mowner tcp_sock_tx_mowner;
extern struct mowner tcp_mowner;
#endif
extern int tcp_do_autorcvbuf;
extern int tcp_autorcvbuf_inc;
extern int tcp_autorcvbuf_max;
extern int tcp_do_autosndbuf;
extern int tcp_autosndbuf_inc;
extern int tcp_autosndbuf_max;
struct secasvar;
void tcp_canceltimers(struct tcpcb *);
struct tcpcb *
tcp_close(struct tcpcb *);
int tcp_isdead(struct tcpcb *);
#ifdef INET6
void *tcp6_ctlinput(int, const struct sockaddr *, void *);
#endif
void *tcp_ctlinput(int, const struct sockaddr *, void *);
int tcp_ctloutput(int, struct socket *, struct sockopt *);
struct tcpcb *
tcp_disconnect1(struct tcpcb *);
struct tcpcb *
tcp_drop(struct tcpcb *, int);
#ifdef TCP_SIGNATURE
int tcp_signature_apply(void *, void *, u_int);
struct secasvar *tcp_signature_getsav(struct mbuf *);
int tcp_signature(struct mbuf *, struct tcphdr *, int, struct secasvar *,
char *);
#endif
void tcp_drain(void);
void tcp_drainstub(void);
void tcp_established(struct tcpcb *);
void tcp_init(void);
void tcp_init_common(unsigned);
#ifdef INET6
int tcp6_input(struct mbuf **, int *, int);
#endif
void tcp_input(struct mbuf *, int, int);
u_int tcp_hdrsz(struct tcpcb *);
u_long tcp_mss_to_advertise(const struct ifnet *, int);
void tcp_mss_from_peer(struct tcpcb *, int);
void tcp_tcpcb_template(void);
struct tcpcb *
tcp_newtcpcb(int, struct inpcb *);
void tcp_notify(struct inpcb *, int);
u_int tcp_optlen(struct tcpcb *);
int tcp_output(struct tcpcb *);
void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
void tcp_quench(struct inpcb *);
void tcp_mtudisc(struct inpcb *, int);
#ifdef INET6
void tcp6_mtudisc_callback(struct in6_addr *);
#endif
void tcpipqent_init(void);
struct ipqent *tcpipqent_alloc(void);
void tcpipqent_free(struct ipqent *);
int tcp_respond(struct tcpcb *, struct mbuf *, struct mbuf *,
struct tcphdr *, tcp_seq, tcp_seq, int);
void tcp_rmx_rtt(struct tcpcb *);
void tcp_setpersist(struct tcpcb *);
#ifdef TCP_SIGNATURE
int tcp_signature_compute(struct mbuf *, struct tcphdr *, int, int,
int, u_char *, u_int);
#endif
void tcp_fasttimo(void);
struct mbuf *
tcp_template(struct tcpcb *);
void tcp_trace(short, short, struct tcpcb *, struct mbuf *, int);
struct tcpcb *
tcp_usrclosed(struct tcpcb *);
void tcp_usrreq_init(void);
void tcp_xmit_timer(struct tcpcb *, uint32_t);
tcp_seq tcp_new_iss(struct tcpcb *);
tcp_seq tcp_new_iss1(void *, void *, u_int16_t, u_int16_t, size_t);
void tcp_sack_init(void);
void tcp_new_dsack(struct tcpcb *, tcp_seq, u_int32_t);
void tcp_sack_option(struct tcpcb *, const struct tcphdr *,
const u_char *, int);
void tcp_del_sackholes(struct tcpcb *, const struct tcphdr *);
void tcp_free_sackholes(struct tcpcb *);
void tcp_sack_adjust(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
int tcp_sack_numblks(const struct tcpcb *);
#define TCP_SACK_OPTLEN(nblks) ((nblks) * 8 + 2 + 2)
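/*
 * 8 bytes per SACK block, plus 2 for the kind/length octets and 2 more
 * commonly spent on NOP padding for option alignment.
 */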
void tcp_statinc(u_int);
void tcp_statadd(u_int, uint64_t);
int tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
int);
int tcp_dooptions(struct tcpcb *, const u_char *, int,
struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);
#endif
#endif /* !_NETINET_TCP_VAR_H_ */
/* $NetBSD: uvm_readahead.c,v 1.16 2023/09/23 18:21:12 ad Exp $ */
/*-
* Copyright (c)2003, 2005, 2009 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* uvm_object read-ahead
*
* TODO:
* - tune.
* - handle multiple streams.
* - find a better way to deal with PGO_LOCKED pager requests.
* (currently just ignored)
* - consider the amount of memory in the system.
* - consider the speed of the underlying device.
* - consider filesystem block size / block layout.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_readahead.c,v 1.16 2023/09/23 18:21:12 ad Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#if defined(READAHEAD_DEBUG)
#define DPRINTF(a) printf a
#else /* defined(READAHEAD_DEBUG) */
#define DPRINTF(a) /* nothing */
#endif /* defined(READAHEAD_DEBUG) */
/*
* uvm_ractx: read-ahead context.
*/
struct uvm_ractx {
int ra_flags;
#define RA_VALID 1
off_t ra_winstart; /* window start offset */
size_t ra_winsize; /* window size */
off_t ra_next; /* next offset to read-ahead */
};
#if defined(sun2) || defined(sun3)
/* XXX: on sun2 and sun3 MAXPHYS is 0xe000 */
#undef MAXPHYS
#define MAXPHYS 0x8000 /* XXX */
#endif
#define RA_WINSIZE_INIT MAXPHYS /* initial window size */
#define RA_WINSIZE_MAX (MAXPHYS * 16) /* max window size */
#define RA_WINSIZE_SEQENTIAL RA_WINSIZE_MAX /* fixed window size used for
SEQUENTIAL hint */
#define RA_MINSIZE (MAXPHYS * 2) /* min size to start i/o */
#define RA_IOCHUNK MAXPHYS /* read-ahead i/o chunk size */
static off_t ra_startio(struct uvm_object *, off_t, size_t);
static struct uvm_ractx *ra_allocctx(void);
static void ra_freectx(struct uvm_ractx *);
/*
* uvm_ra_init: initialize readahead module.
*/
void
uvm_ra_init(void)
{
}
static struct uvm_ractx *
ra_allocctx(void)
{
return kmem_alloc(sizeof(struct uvm_ractx), KM_NOSLEEP);
}
static void
ra_freectx(struct uvm_ractx *ra)
{
kmem_free(ra, sizeof(struct uvm_ractx));
}
/*
* ra_startio: start i/o for read-ahead.
*
* => start i/o for each RA_IOCHUNK sized chunk.
* => return offset to which we started i/o.
*/
static off_t
ra_startio(struct uvm_object *uobj, off_t off, size_t sz)
{
const off_t endoff = off + sz;
DPRINTF(("%s: uobj=%p, off=%" PRIu64 ", endoff=%" PRIu64 "\n",
__func__, uobj, off, endoff));
KASSERT(rw_write_held(uobj->vmobjlock));
/*
* Don't issue read-ahead if the last page of the range is already cached.
* The assumption is that since the access is sequential, the intermediate
* pages would have similar LRU stats, and hence are likely to still be in cache
* too. This speeds up I/O using cache, since it avoids lookups and temporary
* allocations done by full pgo_get.
*/
struct vm_page *pg = uvm_pagelookup(uobj, trunc_page(endoff - 1));
if (pg != NULL) {
DPRINTF(("%s: off=%" PRIu64 ", sz=%zu already cached\n",
__func__, off, sz));
return endoff;
}
off = trunc_page(off);
while (off < endoff) {
const size_t chunksize = RA_IOCHUNK;
int error;
size_t donebytes;
int npages;
int orignpages;
size_t bytelen;
KASSERT((chunksize & (chunksize - 1)) == 0);
KASSERT((off & PAGE_MASK) == 0);
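/*
 * Trim the first chunk so that i/o ends on a chunksize boundary;
 * e.g. with chunksize 0x10000 and off 0x13000 this gives
 * bytelen = 0x20000 - 0x13000 = 0xd000. Later iterations start
 * chunk-aligned and read exactly chunksize bytes.
 */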
bytelen = ((off + chunksize) & -(off_t)chunksize) - off;
KASSERT((bytelen & PAGE_MASK) == 0);
npages = orignpages = bytelen >> PAGE_SHIFT;
KASSERT(npages != 0);
/*
* use UVM_ADV_RANDOM to avoid recursion.
*/
error = (*uobj->pgops->pgo_get)(uobj, off, NULL,
&npages, 0, VM_PROT_READ, UVM_ADV_RANDOM, PGO_NOTIMESTAMP);
rw_enter(uobj->vmobjlock, RW_WRITER);
DPRINTF(("%s: off=%" PRIu64 ", bytelen=%zu -> %d\n",
__func__, off, bytelen, error));
if (error != 0 && error != EBUSY) {
if (error != EINVAL) { /* maybe past EOF */
DPRINTF(("%s: error=%d\n", __func__, error));
}
break;
}
KASSERT(orignpages == npages);
donebytes = orignpages << PAGE_SHIFT;
off += donebytes;
}
return off;
}
/* ------------------------------------------------------------ */
/*
* uvm_ra_allocctx: allocate a context.
*/
struct uvm_ractx *
uvm_ra_allocctx(void)
{
struct uvm_ractx *ra;
ra = ra_allocctx();
if (ra != NULL) {
ra->ra_flags = 0;
}
return ra;
}
/*
* uvm_ra_freectx: free a context.
*/
void
uvm_ra_freectx(struct uvm_ractx *ra)
{
KASSERT(ra != NULL);
ra_freectx(ra);
}
/*
* uvm_ra_request: update a read-ahead context and start i/o if appropriate.
*
* => called when [reqoff, reqoff+reqsize) is requested.
* => object must be locked by caller, will return locked.
*/
void
uvm_ra_request(struct uvm_ractx *ra, int advice, struct uvm_object *uobj,
off_t reqoff, size_t reqsize)
{
KASSERT(rw_write_held(uobj->vmobjlock));
if (ra == NULL || advice == UVM_ADV_RANDOM) {
return;
}
if (advice == UVM_ADV_SEQUENTIAL) {
/*
* always do read-ahead with a large window.
*/
if ((ra->ra_flags & RA_VALID) == 0) {
ra->ra_winstart = ra->ra_next = 0;
ra->ra_flags |= RA_VALID;
}
if (reqoff < ra->ra_winstart) {
ra->ra_next = reqoff;
}
ra->ra_winsize = RA_WINSIZE_SEQENTIAL;
goto do_readahead;
}
/*
* a request with UVM_ADV_NORMAL hint. (ie. no hint)
*
* we keep a sliding window in order to determine:
* - if the previous read-ahead was successful or not.
* - how many bytes to read-ahead.
*/
/*
* if it's the first request for this context,
* initialize context and return.
*/
if ((ra->ra_flags & RA_VALID) == 0) {
initialize:
ra->ra_winstart = ra->ra_next = reqoff + reqsize;
ra->ra_winsize = RA_WINSIZE_INIT;
ra->ra_flags |= RA_VALID;
goto done;
}
/*
* if it isn't in our window,
* initialize context and return.
* (read-ahead miss)
*/
if (reqoff < ra->ra_winstart ||
ra->ra_winstart + ra->ra_winsize < reqoff) {
/*
* ... unless we seem to be reading the same chunk repeatedly.
*
* XXX should have some margin?
*/
if (reqoff + reqsize == ra->ra_winstart) {
DPRINTF(("%s: %p: same block: off=%" PRIu64
", size=%zd, winstart=%" PRIu64 "\n",
__func__, ra, reqoff, reqsize, ra->ra_winstart));
goto done;
}
goto initialize;
}
/*
* it's in our window. (read-ahead hit)
* - start read-ahead i/o if appropriate.
* - advance and enlarge window.
*/
do_readahead:
/*
* don't bother to read-ahead behind current request.
*/
if (reqoff > ra->ra_next) {
ra->ra_next = reqoff;
}
/*
* try to make [reqoff, reqoff+ra_winsize) in-core.
* note that [reqoff, ra_next) is considered already done.
*/
if (reqoff + ra->ra_winsize > ra->ra_next) {
off_t raoff = MAX(reqoff, ra->ra_next);
size_t rasize = reqoff + ra->ra_winsize - ra->ra_next;
#if defined(DIAGNOSTIC)
if (rasize > RA_WINSIZE_MAX) {
printf("%s: corrupted context", __func__);
rasize = RA_WINSIZE_MAX;
}
#endif /* defined(DIAGNOSTIC) */
/*
* issue read-ahead only if we can start big enough i/o.
* otherwise we end up with a stream of small i/o.
*/
if (rasize >= RA_MINSIZE) {
off_t next;
next = ra_startio(uobj, raoff, rasize);
ra->ra_next = next;
}
}
/*
* update window.
*
* enlarge window by reqsize, so that it grows in a predictable manner
* regardless of the size of each read(2).
*/
ra->ra_winstart = reqoff + reqsize;
ra->ra_winsize = MIN(RA_WINSIZE_MAX, ra->ra_winsize + reqsize);
done:;
}
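/*
 * A minimal sketch of the intended calling pattern (the function name is
 * hypothetical; only the uvm_ra_* calls, the advice values and the
 * vmobjlock protocol are taken from the code above):
 */
#if 0 /* example only */
static void
example_read(struct uvm_object *uobj, struct uvm_ractx *ra,
off_t off, size_t len)
{
/* ra would have been allocated once with uvm_ra_allocctx() */
rw_enter(uobj->vmobjlock, RW_WRITER);
/* UVM_ADV_RANDOM disables read-ahead; UVM_ADV_SEQUENTIAL forces a large window */
uvm_ra_request(ra, UVM_ADV_NORMAL, uobj, off, len);
/* ... fetch [off, off + len) via pgo_get as usual ... */
rw_exit(uobj->vmobjlock);
/* uvm_ra_freectx(ra) is called once the consumer is done with the context */
}
#endif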
int
uvm_readahead(struct uvm_object *uobj, off_t off, off_t size)
{
/*
* don't allow too much read-ahead.
*/
if (size > RA_WINSIZE_MAX) {
size = RA_WINSIZE_MAX;
}
rw_enter(uobj->vmobjlock, RW_WRITER);
ra_startio(uobj, off, size);
rw_exit(uobj->vmobjlock);
return 0;
}
/* $NetBSD: exec_elf32.c,v 1.143 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_elf32.c,v 1.143 2019/11/20 19:37:53 pgoyette Exp $");
#define ELFSIZE 32
#include "exec_elf.c"
#include <sys/module.h>
#define ELF32_AUXSIZE (ELF_AUX_ENTRIES * sizeof(Aux32Info) \
+ MAXPATHLEN + ALIGN(1))
MODULE(MODULE_CLASS_EXEC, exec_elf32, NULL);
static struct execsw exec_elf32_execsw[] = {
{
.es_hdrsz = sizeof (Elf32_Ehdr),
.es_makecmds = exec_elf32_makecmds,
.u = {
.elf_probe_func = netbsd_elf32_probe,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_FIRST,
.es_arglen = ELF32_AUXSIZE,
.es_copyargs = elf32_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf32,
.es_setup_stack = exec_setup_stack,
},
#if EXEC_ELF_NOTELESS
{
.es_hdrsz = sizeof (Elf32_Ehdr),
.es_makecmds = exec_elf32_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_LAST,
.es_arglen = ELF32_AUXSIZE,
.es_copyargs = elf32_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf32,
.es_setup_stack = exec_setup_stack,
},
#endif
};
static int
exec_elf32_modcmd(modcmd_t cmd, void *arg)
{
#if ARCH_ELFSIZE == 64
/*
* If we are on a 64bit system, we don't want the 32bit execsw[] to be
* added in the global array, because the exec_elf32 module only works
* on 32bit systems.
*
* However, we need the exec_elf32 module, because it will make the 32bit
* functions available for netbsd32 and linux32.
*
* Therefore, allow this module on 64bit systems, but make it dormant.
*/
(void)exec_elf32_execsw; /* unused */
switch (cmd) {
case MODULE_CMD_INIT:
case MODULE_CMD_FINI:
return 0;
default:
return ENOTTY;
}
#else /* ARCH_ELFSIZE == 64 */
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(exec_elf32_execsw,
__arraycount(exec_elf32_execsw));
case MODULE_CMD_FINI:
return exec_remove(exec_elf32_execsw,
__arraycount(exec_elf32_execsw));
default:
return ENOTTY;
}
#endif /* ARCH_ELFSIZE == 64 */
}
/* $NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $ */
/* $KAME: in6_src.c,v 1.159 2005/10/19 01:40:32 t-momose Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/nd6.h>
#include <netinet6/scope6_var.h>
#ifdef MIP6
#include <netinet6/mip6.h>
#include <netinet6/mip6_var.h>
#include "mip.h"
#if NMIP > 0
#include <net/if_mip.h>
#endif /* NMIP > 0 */
#endif /* MIP6 */
#include <netinet/tcp_vtw.h>
#define ADDR_LABEL_NOTAPP (-1)
struct in6_addrpolicy defaultaddrpolicy;
int ip6_prefer_tempaddr = 0;
static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route *, struct ifnet **, struct psref *);
static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
static void init_policy_queue(void);
static int add_addrsel_policyent(struct in6_addrpolicy *);
static int delete_addrsel_policyent(struct in6_addrpolicy *);
static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
void *);
static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
#define IFA6_IS_VALIDATED(ia) \
(((ia)->ia6_flags & (IN6_IFF_TENTATIVE | IN6_IFF_DETACHED)) == 0)
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and user specified options.
* If necessary, this function looks up the routing table and returns
* an entry to the caller for later use.
*/
#if 0 /* disabled ad-hoc */
#define REPLACE(r) do {\
char _buf1[INET6_ADDRSTRLEN], _buf2[INET6_ADDRSTRLEN]; \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
printf("%s: replace %s with %s by %d\n", __func__, ia_best ? \
IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \
IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \
goto replace; \
} while(/*CONSTCOND*/0)
#define NEXT(r) do {\
char _buf1[INET6_ADDRSTRLEN], _buf2[INET6_ADDRSTRLEN]; \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
printf("%s: keep %s against %s by %d\n", __func__, ia_best ? \
IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \
IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \
goto next; /* XXX: we can't use 'continue' here */ \
} while(/*CONSTCOND*/0)
#define BREAK(r) do { \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
goto out; /* XXX: we can't use 'break' here */ \
} while(/*CONSTCOND*/0)
#else
#define REPLACE(r) goto replace
#define NEXT(r) goto next
#define BREAK(r) goto out
#endif
/*
* Called inside pserialize critical section. Don't sleep/block.
*/
static struct in6_ifaddr *
in6_select_best_ia(struct sockaddr_in6 *dstsock, struct in6_addr *dst,
const struct ifnet *ifp, const struct ip6_pktopts *opts,
const u_int32_t odstzone)
{
struct in6_ifaddr *ia, *ia_best = NULL;
int dst_scope = -1, best_scope = -1, best_matchlen = -1;
struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
IN6_ADDRLIST_READER_FOREACH(ia) {
int new_scope = -1, new_matchlen = -1;
struct in6_addrpolicy *new_policy = NULL;
u_int32_t srczone, osrczone, dstzone;
struct in6_addr src;
struct ifnet *ifp1 = ia->ia_ifp;
int prefer_tempaddr;
/*
* We'll never take an address that breaks the scope zone
* of the destination. We also skip an address if its zone
* does not contain the outgoing interface.
* XXX: we should probably use sin6_scope_id here.
*/
if (in6_setscope(dst, ifp1, &dstzone) ||
odstzone != dstzone) {
continue;
}
src = ia->ia_addr.sin6_addr;
/* Skip the scope test in impossible cases */
if (!(ifp->if_flags & IFF_LOOPBACK) &&
IN6_IS_ADDR_LOOPBACK(&src))
continue;
if (in6_setscope(&src, ifp, &osrczone) ||
in6_setscope(&src, ifp1, &srczone) ||
osrczone != srczone) {
continue;
}
/* avoid unusable addresses */
if ((ia->ia6_flags & (IN6_IFF_DUPLICATED | IN6_IFF_ANYCAST)))
continue;
if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
continue;
#if defined(MIP6) && NMIP > 0
/* avoid unusable home addresses. */
if ((ia->ia6_flags & IN6_IFF_HOME) &&
!mip6_ifa6_is_addr_valid_hoa(ia))
continue;
#endif /* MIP6 && NMIP > 0 */
/* Rule 1: Prefer same address */
if (IN6_ARE_ADDR_EQUAL(dst, &ia->ia_addr.sin6_addr)) {
ia_best = ia;
BREAK(1); /* there should be no better candidate */
}
if (ia_best == NULL)
REPLACE(1);
/* Rule 2: Prefer appropriate scope */
if (dst_scope < 0)
dst_scope = in6_addrscope(dst);
new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
REPLACE(2);
NEXT(2);
} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
NEXT(2);
REPLACE(2);
}
/*
* Rule 3: Avoid deprecated addresses. Note that the case of
* !ip6_use_deprecated is already rejected above.
* Treat unvalidated addresses as deprecated here.
*/
if (IFA6_IS_VALIDATED(ia_best) && !IFA6_IS_VALIDATED(ia))
NEXT(3);
if (!IFA6_IS_VALIDATED(ia_best) && IFA6_IS_VALIDATED(ia))
REPLACE(3);
if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
NEXT(3);
if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
REPLACE(3);
/* Rule 4: Prefer home addresses */
#if defined(MIP6) && NMIP > 0
if (!MIP6_IS_MN)
goto skip_rule4;
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
/* neither address is a home address. */
goto skip_rule4;
}
/*
* If SA is simultaneously a home address and care-of
* address and SB is not, then prefer SA. Similarly,
* if SB is simultaneously a home address and care-of
* address and SA is not, then prefer SB.
*/
if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
ia_best->ia_ifp->if_type != IFT_MIP)
&&
((ia->ia6_flags & IN6_IFF_HOME) != 0 &&
ia->ia_ifp->if_type == IFT_MIP))
NEXT(4);
if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
ia_best->ia_ifp->if_type == IFT_MIP)
&&
((ia->ia6_flags & IN6_IFF_HOME) != 0 &&
ia->ia_ifp->if_type != IFT_MIP))
REPLACE(4);
if (ip6po_usecoa == 0) {
/*
* If SA is just a home address and SB is just
* a care-of address, then prefer
* SA. Similarly, if SB is just a home address
* and SA is just a care-of address, then
* prefer SB.
*/
if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
NEXT(4);
}
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) != 0) {
REPLACE(4);
}
} else {
/*
* the sender doesn't want to use a home address
* because:
*
* 1) we cannot use it (e.g. NS or NA to global
* addresses), or
*
* 2) the user asked not to use it
* (e.g. mip6control -u).
*/
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) != 0) {
/* XXX breaks stat */
NEXT(0);
}
if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
/* XXX breaks stat */
REPLACE(0);
}
}
skip_rule4:
#endif /* MIP6 && NMIP > 0 */
/* Rule 5: Prefer outgoing interface */
if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
NEXT(5);
if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
REPLACE(5);
/*
* Rule 6: Prefer matching label
* Note that best_policy should be non-NULL here.
*/
if (dst_policy == NULL)
dst_policy = lookup_addrsel_policy(dstsock);
if (dst_policy->label != ADDR_LABEL_NOTAPP) {
new_policy = lookup_addrsel_policy(&ia->ia_addr);
if (dst_policy->label == best_policy->label &&
dst_policy->label != new_policy->label)
NEXT(6);
if (dst_policy->label != best_policy->label &&
dst_policy->label == new_policy->label)
REPLACE(6);
}
/*
* Rule 7: Prefer public addresses.
* We allow users to reverse the logic by configuring
* a sysctl variable, so that privacy conscious users can
* always prefer temporary addresses.
*/
if (opts == NULL ||
opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
prefer_tempaddr = ip6_prefer_tempaddr;
} else if (opts->ip6po_prefer_tempaddr ==
IP6PO_TEMPADDR_NOTPREFER) {
prefer_tempaddr = 0;
} else
prefer_tempaddr = 1;
if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
REPLACE(7);
else
NEXT(7);
}
if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
!(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
NEXT(7);
else
REPLACE(7);
}
/*
* Rule 8: prefer addresses on alive interfaces.
* This is a KAME specific rule.
*/
if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
!(ia->ia_ifp->if_flags & IFF_UP))
NEXT(8);
if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
(ia->ia_ifp->if_flags & IFF_UP))
REPLACE(8);
/*
* Rule 9: prefer addresses on "preferred" interfaces.
* This is a KAME specific rule.
*/
#ifdef notyet /* until introducing address selection */
#define NDI_BEST ND_IFINFO(ia_best->ia_ifp)
#define NDI_NEW ND_IFINFO(ia->ia_ifp)
if ((NDI_BEST->flags & ND6_IFF_PREFER_SOURCE) &&
!(NDI_NEW->flags & ND6_IFF_PREFER_SOURCE))
NEXT(9);
if (!(NDI_BEST->flags & ND6_IFF_PREFER_SOURCE) &&
(NDI_NEW->flags & ND6_IFF_PREFER_SOURCE))
REPLACE(9);
#undef NDI_BEST
#undef NDI_NEW
#endif
/*
* Rule 14: Use longest matching prefix.
* Note: in the address selection draft, this rule is
* documented as "Rule 8". However, since it is also
* documented that this rule can be overridden, we assign
* a large number so that it is easy to assign smaller numbers
* to more preferred rules.
*/
new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, dst);
if (best_matchlen < new_matchlen)
REPLACE(14);
if (new_matchlen < best_matchlen)
NEXT(14);
/* Rule 15 is reserved. */
/*
* Last resort: just keep the current candidate.
* Or, do we need more rules?
*/
continue;
replace:
ia_best = ia;
best_scope = (new_scope >= 0 ? new_scope :
in6_addrscope(&ia_best->ia_addr.sin6_addr));
best_policy = (new_policy ? new_policy :
lookup_addrsel_policy(&ia_best->ia_addr));
best_matchlen = (new_matchlen >= 0 ? new_matchlen :
in6_matchlen(&ia_best->ia_addr.sin6_addr,
dst));
next:
continue;
out:
break;
}
return ia_best;
}
#undef REPLACE
#undef BREAK
#undef NEXT
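/*
 * Source address selection for an IPv6 destination: a source supplied
 * via the IPV6_PKTINFO option and an already-bound local address take
 * precedence; otherwise the best address on the outgoing interface is
 * chosen by in6_select_best_ia().  The result is stored in *ret_ia6;
 * if ifpp is non-NULL, the outgoing interface is returned there with a
 * reference held via *psref.
 */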
int
in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route *ro, struct in6_addr *laddr,
struct ifnet **ifpp, struct psref *psref, struct in6_addr *ret_ia6)
{
struct in6_addr dst;
struct ifnet *ifp = NULL;
struct in6_ifaddr *ia = NULL;
struct in6_pktinfo *pi = NULL;
u_int32_t odstzone;
int error = 0, iferror;
#if defined(MIP6) && NMIP > 0
u_int8_t ip6po_usecoa = 0;
#endif /* MIP6 && NMIP > 0 */
struct psref local_psref;
int bound = curlwp_bind();
#define PSREF (psref == NULL) ? &local_psref : psref
int s;
KASSERT((ifpp != NULL && psref != NULL) ||
(ifpp == NULL && psref == NULL));
dst = dstsock->sin6_addr; /* make a copy for local operation */
if (ifpp)
*ifpp = NULL;
/*
* Try to determine the outgoing interface for the given destination.
* We do this regardless of whether the socket is bound, since the
* caller may need this information as a side effect of the call
* to this function (e.g., for identifying the appropriate scope zone
* ID).
*/
iferror = in6_selectif(dstsock, opts, mopts, ro, &ifp, PSREF);
if (ifpp != NULL)
*ifpp = ifp;
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
* assigned to the node, and can be used as the packet's source
* address. If everything is okay, use the address as source.
*/
if (opts && (pi = opts->ip6po_pktinfo) &&
!IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
struct sockaddr_in6 srcsock;
struct in6_ifaddr *ia6;
int _s;
struct ifaddr *ifa;
/*
* Determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
* If the specified address is ambiguous wrt the scope zone,
* the interface must be specified; otherwise, ifa_ifwithaddr()
* will fail matching the address.
*/
memset(&srcsock, 0, sizeof(srcsock));
srcsock.sin6_family = AF_INET6;
srcsock.sin6_len = sizeof(srcsock);
srcsock.sin6_addr = pi->ipi6_addr;
if (ifp) {
error = in6_setscope(&srcsock.sin6_addr, ifp, NULL);
if (error != 0)
goto exit;
}
_s = pserialize_read_enter();
ifa = ifa_ifwithaddr(sin6tosa(&srcsock));
if ((ia6 = ifatoia6(ifa)) == NULL ||
ia6->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) {
pserialize_read_exit(_s);
error = EADDRNOTAVAIL;
goto exit;
}
pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */
if (ifpp)
*ifpp = ifp;
*ret_ia6 = ia6->ia_addr.sin6_addr;
pserialize_read_exit(_s);
goto exit;
}
/*
* If the socket has already bound the source, just use it. We don't
* care at the moment whether in6_selectif() succeeded above, even
* though it would eventually cause an error.
*/
if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) {
*ret_ia6 = *laddr;
goto exit;
}
/*
* The outgoing interface is crucial in the general selection procedure
* below. If it is not known at this point, we fail.
*/
if (ifp == NULL) {
error = iferror;
goto exit;
}
/*
* If the address is not yet determined, choose the best one based on
* the outgoing interface and the destination address.
*/
#if defined(MIP6) && NMIP > 0
/*
* A caller can specify IP6PO_USECOA to avoid using a home
* address, for example when performing neighbour
* unreachability detection toward a global address.
*/
if (opts != NULL &&
(opts->ip6po_flags & IP6PO_USECOA) != 0) {
ip6po_usecoa = 1;
}
#endif /* MIP6 && NMIP > 0 */
error = in6_setscope(&dst, ifp, &odstzone);
if (error != 0)
goto exit;
s = pserialize_read_enter();
ia = in6_select_best_ia(dstsock, &dst, ifp, opts, odstzone);
if (ia == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
goto exit;
}
*ret_ia6 = ia->ia_addr.sin6_addr;
pserialize_read_exit(s);
exit:
if (ifpp == NULL)
if_put(ifp, PSREF);
curlwp_bindx(bound);
return error;
#undef PSREF
}
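/*
 * Route selection for an IPv6 destination: if the caller specified a
 * next hop (IP6PO_NEXTHOP), it must be an on-link IPv6 neighbor and its
 * cached route is used; otherwise the route cache in *ro is consulted
 * (using the embedded IPv4 destination for v4-mapped addresses).  On
 * success the route is returned in *retrt; EHOSTUNREACH failures bump
 * the ip6 "no route" statistic, and discarded packets are counted per
 * interface when count_discard is set.
 */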
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct route **ro, struct rtentry **retrt, bool count_discard)
{
int error = 0;
struct rtentry *rt = NULL;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
struct sockaddr_in6 dst6;
} u;
KASSERT(ro != NULL);
KASSERT(*ro != NULL);
KASSERT(retrt != NULL);
#if 0
if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
dstsock->sin6_addr.s6_addr32[1] == 0 &&
!IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
char ip6buf[INET6_ADDRSTRLEN];
printf("%s: strange destination %s\n", __func__,
IN6_PRINT(ip6buf, &dstsock->sin6_addr));
} else {
char ip6buf[INET6_ADDRSTRLEN];
printf("%s: destination = %s%%%d\n", __func__,
IN6_PRINT(ip6buf, &dstsock->sin6_addr),
dstsock->sin6_scope_id); /* for debug */
}
#endif
/*
* If the next hop address for the packet is specified by the caller,
* use it as the gateway.
*/
if (opts && opts->ip6po_nexthop) {
struct route *ron;
struct sockaddr_in6 *sin6_next;
sin6_next = satosin6(opts->ip6po_nexthop);
/* at this moment, we only support AF_INET6 next hops */
if (sin6_next->sin6_family != AF_INET6) {
IP6_STATINC(IP6_STAT_ODROPPED);
error = EAFNOSUPPORT; /* or should we proceed? */
goto done;
}
/*
* If the next hop is an IPv6 address, then the node identified
* by that address must be a neighbor of the sending host.
*/
ron = &opts->ip6po_nextroute;
rt = rtcache_lookup(ron, sin6tosa(sin6_next));
if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) != 0 ||
!nd6_is_addr_neighbor(sin6_next, rt->rt_ifp)) {
if (rt != NULL) {
if (count_discard)
in6_ifstat_inc(rt->rt_ifp,
ifs6_out_discard);
rtcache_unref(rt, ron);
rt = NULL;
}
rtcache_free(ron);
error = EHOSTUNREACH;
goto done;
}
*ro = ron;
goto done;
}
/*
* Use a cached route if it exists and is valid, else try to allocate
* a new one. Note that we should check the address family of the
* cached destination, in case of sharing the cache with IPv4.
*
* for V4 mapped addresses we want to pick up the v4 route
* see PR kern/56348
*/
if (IN6_IS_ADDR_V4MAPPED(&dstsock->sin6_addr)) {
in6_sin6_2_sin(&u.dst4, dstsock);
} else {
u.dst6 = *dstsock;
u.dst6.sin6_scope_id = 0;
}
rt = rtcache_lookup1(*ro, &u.dst, 1);
if (rt == NULL)
error = EHOSTUNREACH;
/*
* Check if the outgoing interface conflicts with
* the interface specified by ipi6_ifindex (if specified).
* Note that loopback interface is always okay.
* (this may happen when we are sending a packet to one of
* our own addresses.)
*/
if (opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) {
if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
rt->rt_ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) {
if (count_discard)
in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard);
error = EHOSTUNREACH;
rtcache_unref(rt, *ro);
rt = NULL;
}
}
done:
if (error == EHOSTUNREACH)
IP6_STATINC(IP6_STAT_NOROUTE);
*retrt = rt;
return error;
}
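/*
 * Determine the outgoing interface for a destination, in order of
 * preference: the interface named by IPV6_PKTINFO, the multicast
 * interface from the socket's multicast options, and finally the
 * interface of the selected route.  The interface is returned in
 * *retifp with a psref reference held.
 */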
static int
in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route *ro, struct ifnet **retifp,
struct psref *psref)
{
int error = 0;
struct rtentry *rt = NULL;
struct in6_addr *dst;
struct in6_pktinfo *pi = NULL;
KASSERT(retifp != NULL);
*retifp = NULL;
dst = &dstsock->sin6_addr;
/* If the caller specifies the outgoing interface explicitly, use it. */
if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
/* XXX boundary check is assumed to be already done. */
*retifp = if_get_byindex(pi->ipi6_ifindex, psref);
if (*retifp != NULL)
return 0;
goto getroute;
}
/*
* If the destination address is a multicast address and the outgoing
* interface for the address is specified by the caller, use it.
*/
if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL) {
*retifp = if_get_byindex(mopts->im6o_multicast_if_index, psref);
if (*retifp != NULL)
return 0; /* we do not need a route for multicast. */
}
getroute:
error = in6_selectroute(dstsock, opts, &ro, &rt, false);
if (error != 0)
return error;
*retifp = if_get_byindex(rt->rt_ifp->if_index, psref);
/*
* do not use a rejected or black hole route.
* XXX: this check should be done in the L2 output routine.
* However, if we skipped this check here, we'd see the following
* scenario:
* - install a rejected route for a scoped address prefix
* (like fe80::/10)
* - send a packet to a destination that matches the scoped prefix,
* with ambiguity about the scope zone.
* - pick the outgoing interface from the route, and disambiguate the
* scope zone with the interface.
* - ip6_output() would try to get another route with the "new"
* destination, which may be valid.
* - we'd see no error on output.
* Although this may not be very harmful, it can still be confusing.
* We thus reject the case here.
*/
if ((rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
error = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
/* XXX: ifp can be returned with psref even if error */
goto out;
}
/*
* Adjust the "outgoing" interface. If we're going to loop the packet
* back to ourselves, the ifp would be the loopback interface.
* However, we'd rather know the interface associated to the
* destination address (which should probably be one of our own
* addresses.)
*/
if (rt->rt_ifa->ifa_ifp != *retifp &&
!if_is_deactivated(rt->rt_ifa->ifa_ifp)) {
if_put(*retifp, psref);
*retifp = rt->rt_ifa->ifa_ifp;
if_acquire(*retifp, psref);
}
out:
rtcache_unref(rt, ro);
return error;
}
/*
* Default hop limit selection. The precedence is as follows:
* 1. Hoplimit value specified via ioctl.
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
*/
int
in6pcb_selecthlim(struct inpcb *inp, struct ifnet *ifp)
{
if (inp && in6p_hops6(inp) >= 0)
return in6p_hops6(inp);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
else
return (ip6_defhlim);
}
int
in6pcb_selecthlim_rt(struct inpcb *inp)
{
struct rtentry *rt;
if (inp == NULL)
return in6pcb_selecthlim(inp, NULL);
rt = rtcache_validate(&inp->inp_route);
if (rt != NULL) {
int ret = in6pcb_selecthlim(inp, rt->rt_ifp);
rtcache_unref(rt, &inp->inp_route);
return ret;
} else
return in6pcb_selecthlim(inp, NULL);
}
/*
* Find an empty port and set it to the specified PCB.
*/
int
in6pcb_set_port(struct sockaddr_in6 *sin6, struct inpcb *inp, struct lwp *l)
{
struct socket *so = inp->inp_socket;
struct inpcbtable *table = inp->inp_table;
u_int16_t lport, *lastport;
enum kauth_network_req req;
int error = 0;
if (inp->inp_flags & IN6P_LOWPORT) {
#ifndef IPNOPRIVPORTS
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
#else
req = KAUTH_REQ_NETWORK_BIND_PORT;
#endif
lastport = &table->inpt_lastlow;
} else {
req = KAUTH_REQ_NETWORK_BIND_PORT;
lastport = &table->inpt_lastport;
}
/* XXX-kauth: KAUTH_REQ_NETWORK_BIND_AUTOASSIGN_{,PRIV}PORT */
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND, req, so,
sin6, NULL);
if (error)
return (EACCES);
/*
* Use RFC6056 randomized port selection
*/
error = portalgo_randport(&lport, inp, l->l_cred);
if (error)
return error;
inp->inp_flags |= IN6P_ANONPORT;
*lastport = lport;
inp->inp_lport = htons(lport);
in6pcb_set_state(inp, INP_BOUND);
return (0); /* success */
}
void
addrsel_policy_init(void)
{
init_policy_queue();
/* initialize the "last resort" policy */
memset(&defaultaddrpolicy, 0, sizeof(defaultaddrpolicy));
defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
}
/*
* XXX: NOMPSAFE if a policy is set
*/
static struct in6_addrpolicy *
lookup_addrsel_policy(struct sockaddr_in6 *key)
{
struct in6_addrpolicy *match = NULL;
match = match_addrsel_policy(key);
if (match == NULL)
match = &defaultaddrpolicy;
else
match->use++;
return (match);
}
/*
* Subroutines to manage the address selection policy table via sysctl.
*/
struct sel_walkarg {
size_t w_total;
size_t w_given;
void * w_where;
void *w_limit;
};
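/*
 * The walk argument records how much buffer space the caller supplied
 * (w_given/w_where/w_limit) and how much a full dump would need
 * (w_total).  Following the usual sysctl convention, a read with
 * oldp == NULL only reports the required size; a subsequent read with
 * a buffer that turns out to be too small gets ENOMEM.
 */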
int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS);
int
sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS)
{
int error = 0;
int s;
s = splsoftnet();
if (newp) {
error = EPERM;
goto end;
}
if (oldp && oldlenp == NULL) {
error = EINVAL;
goto end;
}
if (oldp || oldlenp) {
struct sel_walkarg w;
size_t oldlen = *oldlenp;
memset(&w, 0, sizeof(w));
w.w_given = oldlen;
w.w_where = oldp;
if (oldp)
w.w_limit = (char *)oldp + oldlen;
error = walk_addrsel_policy(dump_addrsel_policyent, &w);
*oldlenp = w.w_total;
if (oldp && w.w_total > oldlen && error == 0)
error = ENOMEM;
}
end:
splx(s);
return (error);
}
int
in6_src_ioctl(u_long cmd, void *data)
{
int i;
struct in6_addrpolicy ent0;
if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
return (EOPNOTSUPP); /* check for safety */
ent0 = *(struct in6_addrpolicy *)data;
if (ent0.label == ADDR_LABEL_NOTAPP)
return (EINVAL);
/* check if the prefix mask is consecutive. */
if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
return (EINVAL);
/* clear trailing garbage (if any) in the prefix address. */
for (i = 0; i < 4; i++) {
ent0.addr.sin6_addr.s6_addr32[i] &=
ent0.addrmask.sin6_addr.s6_addr32[i];
}
ent0.use = 0;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
return (add_addrsel_policyent(&ent0));
case SIOCDADDRCTL_POLICY:
return (delete_addrsel_policyent(&ent0));
}
return (0); /* XXX: quiet compiler warnings */
}
/*
* The following is an implementation of the policy table using a
* simple tail queue.
* XXX such details should be hidden.
* XXX an implementation using a binary tree would be more efficient.
*/
struct addrsel_policyent {
TAILQ_ENTRY(addrsel_policyent) ape_entry;
struct in6_addrpolicy ape_policy;
};
TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
struct addrsel_policyhead addrsel_policytab;
static void
init_policy_queue(void)
{
TAILQ_INIT(&addrsel_policytab);
}
static int
add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
{
struct addrsel_policyent *newpol, *pol;
/* duplication check */
TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
return (EEXIST); /* or override it? */
}
}
newpol = malloc(sizeof(*newpol), M_IFADDR, M_WAITOK|M_ZERO);
/* XXX: should validate entry */
newpol->ape_policy = *newpolicy;
TAILQ_INSERT_TAIL(&addrsel_policytab, newpol, ape_entry);
return (0);
}
static int
delete_addrsel_policyent(struct in6_addrpolicy *key)
{
struct addrsel_policyent *pol;
/* search for the entry in the table */
for (pol = TAILQ_FIRST(&addrsel_policytab); pol;
pol = TAILQ_NEXT(pol, ape_entry)) {
if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
break;
}
}
if (pol == NULL) {
return (ESRCH);
}
TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry);
return (0);
}
static int
walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w)
{
struct addrsel_policyent *pol;
int error = 0;
TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
if ((error = (*callback)(&pol->ape_policy, w)) != 0)
return error;
}
return error;
}
static int
dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
{
int error = 0;
struct sel_walkarg *w = arg;
if (w->w_where && (char *)w->w_where + sizeof(*pol) <= (char *)w->w_limit) {
if ((error = copyout(pol, w->w_where, sizeof(*pol))) != 0)
return error;
w->w_where = (char *)w->w_where + sizeof(*pol);
}
w->w_total += sizeof(*pol);
return error;
}
static struct in6_addrpolicy *
match_addrsel_policy(struct sockaddr_in6 *key)
{
struct addrsel_policyent *pent;
struct in6_addrpolicy *bestpol = NULL, *pol;
int matchlen, bestmatchlen = -1;
u_char *mp, *ep, *k, *p, m;
for (pent = TAILQ_FIRST(&addrsel_policytab); pent;
pent = TAILQ_NEXT(pent, ape_entry)) {
matchlen = 0;
pol = &pent->ape_policy;
mp = (u_char *)&pol->addrmask.sin6_addr;
ep = mp + 16; /* XXX: scope field? */
k = (u_char *)&key->sin6_addr;
p = (u_char *)&pol->addr.sin6_addr;
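/*
 * Compare the key against the policy prefix byte by byte under the
 * mask, counting the leading one bits of the mask as the match
 * length for the longest-prefix selection below.
 */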
for (; mp < ep && *mp; mp++, k++, p++) {
m = *mp;
if ((*k & m) != *p)
goto next; /* no match */
if (m == 0xff) /* short cut for a typical case */
matchlen += 8;
else {
while (m >= 0x80) {
matchlen++;
m <<= 1;
}
}
}
/* matched. check if this is better than the current best. */
if (bestpol == NULL ||
matchlen > bestmatchlen) {
bestpol = pol;
bestmatchlen = matchlen;
}
next:
continue;
}
return (bestpol);
}
/* $NetBSD: dtrace_bsd.h,v 1.9 2018/04/19 21:19:07 christos Exp $ */
/*-
* Copyright (c) 2007-2008 John Birrell (jb@freebsd.org)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: src/sys/sys/dtrace_bsd.h,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $
*
* This file contains BSD shims for Sun's DTrace code.
*/
#ifndef _SYS_DTRACE_BSD_H
#define _SYS_DTRACE_BSD_H
#if defined(_KERNEL_OPT)
#include "opt_dtrace.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/proc.h>
/* Forward definitions: */
struct mbuf;
struct trapframe;
struct lwp;
struct vattr;
struct vnode;
struct ucred;
/*
* Cyclic clock function type definition used to hook the cyclic
* subsystem into the appropriate timer interrupt.
*/
typedef void (*cyclic_clock_func_t)(struct clockframe *);
extern cyclic_clock_func_t cyclic_clock_func[];
/*
* The dtrace module handles traps that occur during a DTrace probe.
* This type definition is used in the trap handler to provide a
* hook for the dtrace module to register its handler with.
*/
typedef int (*dtrace_trap_func_t)(struct trapframe *, u_int);
int dtrace_trap(struct trapframe *, u_int);
extern dtrace_trap_func_t dtrace_trap_func;
/* Used by the machine dependent trap() code. */
typedef int (*dtrace_invop_func_t)(uintptr_t, uintptr_t *, uintptr_t);
typedef void (*dtrace_doubletrap_func_t)(void);
/* Global variables in trap.c */
extern dtrace_invop_func_t dtrace_invop_func;
extern dtrace_doubletrap_func_t dtrace_doubletrap_func;
/* Virtual time hook function type. */
typedef void (*dtrace_vtime_switch_func_t)(struct lwp *);
extern int dtrace_vtime_active;
extern dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
/* The fasttrap module hooks into the fork, exec and exit routines. */
typedef void (*dtrace_fork_func_t)(struct proc *, struct proc *);
typedef void (*dtrace_execexit_func_t)(struct proc *);
/* Global variable in kern_fork.c */
extern dtrace_fork_func_t dtrace_fasttrap_fork;
/* Global variable in kern_exec.c */
extern dtrace_execexit_func_t dtrace_fasttrap_exec;
/* Global variable in kern_exit.c */
extern dtrace_execexit_func_t dtrace_fasttrap_exit;
/* The dtmalloc provider hooks into malloc. */
typedef void (*dtrace_malloc_probe_func_t)(u_int32_t, uintptr_t arg0,
uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4);
extern dtrace_malloc_probe_func_t dtrace_malloc_probe;
/* dtnfsclient NFSv3 access cache provider hooks. */
typedef void (*dtrace_nfsclient_accesscache_flush_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_accesscache_flush_probe_func_t
dtrace_nfsclient_accesscache_flush_done_probe;
typedef void (*dtrace_nfsclient_accesscache_get_probe_func_t)(uint32_t,
struct vnode *, uid_t, uint32_t);
extern dtrace_nfsclient_accesscache_get_probe_func_t
dtrace_nfsclient_accesscache_get_hit_probe,
dtrace_nfsclient_accesscache_get_miss_probe;
typedef void (*dtrace_nfsclient_accesscache_load_probe_func_t)(uint32_t,
struct vnode *, uid_t, uint32_t, int);
extern dtrace_nfsclient_accesscache_load_probe_func_t
dtrace_nfsclient_accesscache_load_done_probe;
/* dtnfsclient NFSv[23] attribute cache provider hooks. */
typedef void (*dtrace_nfsclient_attrcache_flush_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_attrcache_flush_probe_func_t
dtrace_nfsclient_attrcache_flush_done_probe;
typedef void (*dtrace_nfsclient_attrcache_get_hit_probe_func_t)(uint32_t,
struct vnode *, struct vattr *);
extern dtrace_nfsclient_attrcache_get_hit_probe_func_t
dtrace_nfsclient_attrcache_get_hit_probe;
typedef void (*dtrace_nfsclient_attrcache_get_miss_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_attrcache_get_miss_probe_func_t
dtrace_nfsclient_attrcache_get_miss_probe;
typedef void (*dtrace_nfsclient_attrcache_load_probe_func_t)(uint32_t,
struct vnode *, struct vattr *, int);
extern dtrace_nfsclient_attrcache_load_probe_func_t
dtrace_nfsclient_attrcache_load_done_probe;
/* dtnfsclient NFSv[23] RPC provider hooks. */
typedef void (*dtrace_nfsclient_nfs23_start_probe_func_t)(uint32_t,
struct vnode *, struct mbuf *, struct ucred *, int);
extern dtrace_nfsclient_nfs23_start_probe_func_t
dtrace_nfsclient_nfs23_start_probe;
typedef void (*dtrace_nfsclient_nfs23_done_probe_func_t)(uint32_t,
struct vnode *, struct mbuf *, struct ucred *, int, int);
extern dtrace_nfsclient_nfs23_done_probe_func_t
dtrace_nfsclient_nfs23_done_probe;
/*
* OpenSolaris compatible time functions returning nanoseconds.
* On OpenSolaris these return hrtime_t which we define as uint64_t.
*/
uint64_t dtrace_gethrtime(void);
uint64_t dtrace_gethrestime(void);
/* sizes based on DTrace structure requirements */
#define KDTRACE_PROC_SIZE 64
#define KDTRACE_PROC_ZERO 8
#define KDTRACE_THREAD_SIZE 256
#define KDTRACE_THREAD_ZERO 64
/*
* Functions for managing the opaque DTrace memory areas for
* processes and lwps.
*/
static __inline size_t kdtrace_proc_size(void);
static __inline void kdtrace_proc_ctor(void *, struct proc *);
static __inline void kdtrace_proc_dtor(void *, struct proc *);
static __inline size_t kdtrace_thread_size(void);
static __inline void kdtrace_thread_ctor(void *, struct lwp *);
static __inline void kdtrace_thread_dtor(void *, struct lwp *);
/* Return the DTrace process data size compiled in the kernel hooks. */
static __inline size_t
kdtrace_proc_size(void)
{
return KDTRACE_PROC_SIZE;
}
/* Return the DTrace thread data size compiled in the kernel hooks. */
static __inline size_t
kdtrace_thread_size(void)
{
return KDTRACE_THREAD_SIZE;
}
static __inline void
kdtrace_proc_ctor(void *arg, struct proc *p)
{
#ifdef KDTRACE_HOOKS
p->p_dtrace = kmem_zalloc(KDTRACE_PROC_SIZE, KM_SLEEP);
#endif
}
static __inline void
kdtrace_proc_dtor(void *arg, struct proc *p)
{
#ifdef KDTRACE_HOOKS
if (p->p_dtrace != NULL) {
kmem_free(p->p_dtrace, KDTRACE_PROC_SIZE);
p->p_dtrace = NULL;
}
#endif
}
static __inline void
kdtrace_thread_ctor(void *arg, struct lwp *l)
{
#ifdef KDTRACE_HOOKS
l->l_dtrace = kmem_zalloc(KDTRACE_THREAD_SIZE, KM_SLEEP);
#endif
}
static __inline void
kdtrace_thread_dtor(void *arg, struct lwp *l)
{
#ifdef KDTRACE_HOOKS
if (l->l_dtrace != NULL) {
kmem_free(l->l_dtrace, KDTRACE_THREAD_SIZE);
l->l_dtrace = NULL;
}
#endif
}
#endif /* _SYS_DTRACE_BSD_H */
/* $NetBSD: raw_cb.c,v 1.24 2017/09/25 01:56:22 ozaki-r Exp $ */
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_cb.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_cb.c,v 1.24 2017/09/25 01:56:22 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <netinet/in.h>
/*
* Routines to manage the raw protocol control blocks.
*
* TODO:
* hash lookups by protocol family/protocol + address family
* take care of unique address problems per AF?
* redo address binding to allow wildcards
*/
static u_long raw_sendspace = RAWSNDQ;
static u_long raw_recvspace = RAWRCVQ;
/*
* Allocate a nominal amount of buffer space for the socket.
*/
int
raw_attach(struct socket *so, int proto, struct rawcbhead *rawcbhead)
{
struct rawcb *rp;
int error;
/*
* It is assumed that raw_attach() is called after space has been
* allocated for the rawcb; consumer protocols may simply allocate
* type struct rawcb, or a wrapper data structure that begins with a
* struct rawcb.
*/
rp = sotorawcb(so);
KASSERT(rp != NULL);
sosetlock(so);
if ((error = soreserve(so, raw_sendspace, raw_recvspace)) != 0) {
return error;
}
rp->rcb_socket = so;
rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family;
rp->rcb_proto.sp_protocol = proto;
LIST_INSERT_HEAD(rawcbhead, rp, rcb_list);
KASSERT(solocked(so));
return 0;
}
/*
* Detach the raw connection block and discard socket resources.
*/
void
raw_detach(struct socket *so)
{
struct rawcb *rp = sotorawcb(so);
const size_t rcb_len = rp->rcb_len;
KASSERT(rp != NULL);
KASSERT(solocked(so));
/* Remove the last reference. */
LIST_REMOVE(rp, rcb_list);
so->so_pcb = NULL;
/* Note: sofree() drops the socket's lock. */
sofree(so);
kmem_free(rp, rcb_len);
if (so->so_lock != softnet_lock) {
so->so_lock = softnet_lock;
mutex_obj_hold(softnet_lock);
}
mutex_enter(softnet_lock);
}
/*
* Disconnect and possibly release resources.
*/
void
raw_disconnect(struct rawcb *rp)
{
struct socket *so = rp->rcb_socket;
if (so->so_state & SS_NOFDREF) {
raw_detach(so);
}
}
/* $NetBSD: ipi.c,v 1.30 2019/12/01 15:34:46 ad Exp $ */
/*-
* Copyright (c) 2000, 2008, 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by RedBack Networks Inc.
*
* Author: Bill Sommerfeld
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ipi.c,v 1.30 2019/12/01 15:34:46 ad Exp $");
#include "opt_mtrr.h"
#include <sys/param.h>
#include <sys/device.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/intr.h>
#include <sys/ipi.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#ifdef MULTIPROCESSOR
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/i82093var.h>
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#include <machine/mtrr.h>
#include <machine/gdt.h>
#include "acpica.h"
#include <x86/fpu.h>
static void x86_ipi_ast(struct cpu_info *);
static void x86_ipi_halt(struct cpu_info *);
static void x86_ipi_kpreempt(struct cpu_info *);
static void x86_ipi_xcall(struct cpu_info *);
static void x86_ipi_generic(struct cpu_info *);
#ifdef MTRR
static void x86_ipi_reload_mtrr(struct cpu_info *);
#else
#define x86_ipi_reload_mtrr NULL
#endif
#if NACPICA > 0
void acpi_cpu_sleep(struct cpu_info *);
#else
#define acpi_cpu_sleep NULL
#endif
static void x86_ipi_synch_fpu(struct cpu_info *);
void (* const ipifunc[X86_NIPI])(struct cpu_info *) =
{
x86_ipi_halt, /* X86_IPI_HALT */
x86_ipi_ast, /* X86_IPI_AST */
x86_ipi_generic, /* X86_IPI_GENERIC */
x86_ipi_synch_fpu, /* X86_IPI_SYNCH_FPU */
x86_ipi_reload_mtrr, /* X86_IPI_MTRR */
NULL, /* X86_IPI_GDT */
x86_ipi_xcall, /* X86_IPI_XCALL */
acpi_cpu_sleep, /* X86_IPI_ACPI_CPU_SLEEP */
x86_ipi_kpreempt /* X86_IPI_KPREEMPT */
};
/*
* x86 IPI interface.
*/
int
x86_send_ipi(struct cpu_info *ci, int ipimask)
{
uint32_t o, n;
int ret = 0;
/* Don't send IPI to CPU which isn't (yet) running. */
if (__predict_false((ci->ci_flags & CPUF_RUNNING) == 0))
return ENOENT;
/* Set in new IPI bit, and capture previous state. */
for (o = 0;; o = n) {
n = atomic_cas_32(&ci->ci_ipis, o, o | ipimask);
if (__predict_true(o == n)) {
break;
}
}
/* If no IPI already pending, send one. */
if (o == 0) {
ret = x86_ipi(LAPIC_IPI_VECTOR, ci->ci_cpuid, LAPIC_DLMODE_FIXED);
if (ret != 0) {
printf("ipi of %x from %s to %s failed\n",
ipimask,
device_xname(curcpu()->ci_dev),
device_xname(ci->ci_dev));
}
}
return ret;
}
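/*
 * Post the IPI bits on every running CPU except the caller, then issue
 * a single broadcast IPI if at least one target was found.
 */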
void
x86_broadcast_ipi(int ipimask)
{
struct cpu_info *ci, *self = curcpu();
int count = 0;
CPU_INFO_ITERATOR cii;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci == self)
continue;
if ((ci->ci_flags & CPUF_RUNNING) == 0)
continue;
atomic_or_32(&ci->ci_ipis, ipimask);
count++;
}
if (!count)
return;
x86_ipi(LAPIC_IPI_VECTOR, LAPIC_DEST_ALLEXCL, LAPIC_DLMODE_FIXED);
}
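/*
 * IPI dispatch: atomically fetch and clear the pending bits for this
 * CPU, then run the handler for each bit that was set.
 */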
void
x86_ipi_handler(void)
{
struct cpu_info *ci = curcpu();
uint32_t pending;
int bit;
pending = atomic_swap_32(&ci->ci_ipis, 0);
KDASSERT((pending >> X86_NIPI) == 0);
while ((bit = ffs(pending)) != 0) {
bit--;
pending &= ~(1 << bit);
ci->ci_ipi_events[bit].ev_count++;
(*ipifunc[bit])(ci);
}
}
/*
* Common x86 IPI handlers.
*/
static void
x86_ipi_halt(struct cpu_info *ci)
{
x86_disable_intr();
atomic_and_32(&ci->ci_flags, ~CPUF_RUNNING);
for (;;) {
x86_hlt();
}
}
static void
x86_ipi_synch_fpu(struct cpu_info *ci)
{
panic("%s: impossible", __func__);
}
#ifdef MTRR
static void
x86_ipi_reload_mtrr(struct cpu_info *ci)
{
if (mtrr_funcs != NULL) {
/*
* mtrr_reload_cpu() is a macro in mtrr.h which picks
* the appropriate function to use.
*/
mtrr_reload_cpu(ci);
}
}
#endif
static void
x86_ipi_kpreempt(struct cpu_info *ci)
{
softint_trigger(1 << SIR_PREEMPT);
}
static void
x86_ipi_ast(struct cpu_info *ci)
{
aston(ci->ci_onproc);
}
/*
* MD support for xcall(9) interface.
*/
static void
x86_ipi_xcall(struct cpu_info *ci)
{
xc_ipi_handler();
}
static void
x86_ipi_generic(struct cpu_info *ci)
{
ipi_cpu_handler();
}
void
xc_send_ipi(struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
if (ci) {
/* Unicast: remote CPU. */
x86_send_ipi(ci, X86_IPI_XCALL);
} else {
/* Broadcast: all, but local CPU (caller will handle it). */
x86_broadcast_ipi(X86_IPI_XCALL);
}
}
void
cpu_ipi(struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
if (ci) {
/* Unicast: remote CPU. */
x86_send_ipi(ci, X86_IPI_GENERIC);
} else {
/* Broadcast: all, but local CPU (caller will handle it). */
x86_broadcast_ipi(X86_IPI_GENERIC);
}
}
#else
int
x86_send_ipi(struct cpu_info *ci, int ipimask)
{
return 0;
}
void
x86_broadcast_ipi(int ipimask)
{
}
void
cpu_ipi(struct cpu_info *ci)
{
}
#endif
/* $NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $ */
/*-
* Copyright (c)2007,2008 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* per-cpu storage.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>
#define PERCPU_QUANTUM_SIZE (ALIGNBYTES + 1)
#define PERCPU_QCACHE_MAX 0
#define PERCPU_IMPORT_SIZE 2048
struct percpu {
unsigned pc_offset;
size_t pc_size;
percpu_callback_t pc_ctor;
percpu_callback_t pc_dtor;
void *pc_cookie;
LIST_ENTRY(percpu) pc_list;
};
static krwlock_t percpu_swap_lock __cacheline_aligned;
static vmem_t * percpu_offset_arena __read_mostly;
static struct {
kmutex_t lock;
unsigned int nextoff;
LIST_HEAD(, percpu) ctor_list;
struct lwp *busy;
kcondvar_t cv;
} percpu_allocation __cacheline_aligned;
static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{
return &ci->ci_data.cpu_percpu;
}
static unsigned int
percpu_offset(percpu_t *pc)
{
const unsigned int off = pc->pc_offset;
KASSERT(off < percpu_allocation.nextoff);
return off;
}
/*
* percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
*/
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
struct cpu_info * const ci = p1;
percpu_cpu_t * const newpcc = p2;
percpu_cpu_t * const pcc = cpu_percpu(ci);
KASSERT(ci == curcpu() || !mp_online);
/*
* swap *pcc and *newpcc unless someone has beaten us to it.
*/
rw_enter(&percpu_swap_lock, RW_WRITER);
if (newpcc->pcc_size > pcc->pcc_size) {
percpu_cpu_t tmp;
int s;
tmp = *pcc;
/*
* block interrupts so that we don't lose their modifications.
*/
s = splhigh();
/*
* copy data to new storage.
*/
memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);
/*
* this assignment needs to be atomic for percpu_getptr_remote.
*/
pcc->pcc_data = newpcc->pcc_data;
splx(s);
pcc->pcc_size = newpcc->pcc_size;
*newpcc = tmp;
}
rw_exit(&percpu_swap_lock);
}
/*
* percpu_cpu_enlarge: ensure that the percpu_cpu_t of each cpu has enough space
*/
static void
percpu_cpu_enlarge(size_t size)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
percpu_cpu_t pcc;
pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
pcc.pcc_size = size;
if (!mp_online) {
percpu_cpu_swap(ci, &pcc);
} else {
uint64_t where;
where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
xc_wait(where);
}
KASSERT(pcc.pcc_size <= size);
if (pcc.pcc_data != NULL) {
kmem_free(pcc.pcc_data, pcc.pcc_size);
}
}
}
/*
* percpu_backend_alloc: vmem import callback for percpu_offset_arena
*/
static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
vm_flag_t vmflags, vmem_addr_t *addrp)
{
unsigned int offset;
unsigned int nextoff;
ASSERT_SLEEPABLE();
KASSERT(dummy == NULL);
if ((vmflags & VM_NOSLEEP) != 0)
return ENOMEM;
size = roundup(size, PERCPU_IMPORT_SIZE);
mutex_enter(&percpu_allocation.lock);
offset = percpu_allocation.nextoff;
percpu_allocation.nextoff = nextoff = percpu_allocation.nextoff + size;
mutex_exit(&percpu_allocation.lock);
percpu_cpu_enlarge(nextoff);
*resultsize = size;
*addrp = (vmem_addr_t)offset;
return 0;
}
static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
size_t sz = (uintptr_t)vp2;
memset(vp, 0, sz);
}
/*
* percpu_zero: initialize percpu storage with zero.
*/
static void
percpu_zero(percpu_t *pc, size_t sz)
{
percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}
/*
* percpu_init: subsystem initialization
*/
void
percpu_init(void)
{
ASSERT_SLEEPABLE();
rw_init(&percpu_swap_lock);
mutex_init(&percpu_allocation.lock, MUTEX_DEFAULT, IPL_NONE);
percpu_allocation.nextoff = PERCPU_QUANTUM_SIZE;
LIST_INIT(&percpu_allocation.ctor_list);
percpu_allocation.busy = NULL;
cv_init(&percpu_allocation.cv, "percpu");
percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
IPL_NONE);
}
/*
* percpu_init_cpu: cpu initialization
*
* => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
* => may be called for static CPUs afterward (typically just primary CPU)
*/
void
percpu_init_cpu(struct cpu_info *ci)
{
percpu_cpu_t * const pcc = cpu_percpu(ci);
struct percpu *pc;
size_t size = percpu_allocation.nextoff; /* XXX racy */
ASSERT_SLEEPABLE();
/*
* For the primary CPU, prior percpu_create may have already
* triggered allocation, so there's nothing more for us to do
* here.
*/
if (pcc->pcc_size)
return;
KASSERT(pcc->pcc_data == NULL);
/*
* Otherwise, allocate storage and, while the constructor list
* is locked, run constructors for all percpus on this CPU.
*/
pcc->pcc_size = size;
if (size) {
pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
percpu_allocation.busy = curlwp;
LIST_FOREACH(pc, &percpu_allocation.ctor_list, pc_list) {
KASSERT(pc->pc_ctor);
mutex_exit(&percpu_allocation.lock);
(*pc->pc_ctor)((char *)pcc->pcc_data + pc->pc_offset,
pc->pc_cookie, ci);
mutex_enter(&percpu_allocation.lock);
}
KASSERT(percpu_allocation.busy == curlwp);
percpu_allocation.busy = NULL;
cv_broadcast(&percpu_allocation.cv);
mutex_exit(&percpu_allocation.lock);
}
}
/*
* percpu_alloc: allocate percpu storage
*
* => called in thread context.
* => considered as an expensive and rare operation.
* => allocated storage is initialized with zeros.
*/
percpu_t *
percpu_alloc(size_t size)
{
return percpu_create(size, NULL, NULL, NULL);
}
/*
* percpu_create: allocate percpu storage and associate ctor/dtor with it
*
* => called in thread context.
* => considered as an expensive and rare operation.
* => allocated storage is initialized by ctor, or zeros if ctor is null
* => percpu_free will call dtor first, if dtor is nonnull
* => ctor or dtor may sleep, even on allocation
*/
percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
void *cookie)
{
vmem_addr_t offset;
percpu_t *pc;
ASSERT_SLEEPABLE();
(void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
&offset);
pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
pc->pc_offset = offset;
pc->pc_size = size;
pc->pc_ctor = ctor;
pc->pc_dtor = dtor;
pc->pc_cookie = cookie;
if (ctor) {
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
void *buf;
/*
* Wait until nobody is using the list of percpus with
* constructors.
*/
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
percpu_allocation.busy = curlwp;
mutex_exit(&percpu_allocation.lock);
/*
* Run the constructor for all CPUs. We use a
* temporary buffer so that we need not hold the
* percpu_swap_lock while running the constructor.
*/
buf = kmem_alloc(size, KM_SLEEP);
for (CPU_INFO_FOREACH(cii, ci)) {
memset(buf, 0, size);
(*ctor)(buf, cookie, ci);
percpu_traverse_enter();
memcpy(percpu_getptr_remote(pc, ci), buf, size);
percpu_traverse_exit();
}
explicit_memset(buf, 0, size);
kmem_free(buf, size);
/*
* Insert the percpu into the list of percpus with
* constructors. We are now done using the list, so it
* is safe for concurrent percpu_create or concurrent
* percpu_init_cpu to run.
*/
mutex_enter(&percpu_allocation.lock);
KASSERT(percpu_allocation.busy == curlwp);
percpu_allocation.busy = NULL;
cv_broadcast(&percpu_allocation.cv);
LIST_INSERT_HEAD(&percpu_allocation.ctor_list, pc, pc_list);
mutex_exit(&percpu_allocation.lock);
} else {
percpu_zero(pc, size);
}
return pc;
}
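/*
 * Illustrative usage sketch of percpu_create and friends (the counter
 * type and callback name below are examples only, not part of this
 * file):
 *
 * static void
 * counter_init(void *p, void *cookie, struct cpu_info *ci)
 * {
 * *(uint64_t *)p = 0;
 * }
 *
 * percpu_t *pc = percpu_create(sizeof(uint64_t), counter_init,
 * NULL, NULL);
 * uint64_t *cnt = percpu_getref(pc);
 * (*cnt)++;
 * percpu_putref(pc);
 * ...
 * percpu_free(pc, sizeof(uint64_t));
 */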
/*
* percpu_free: free percpu storage
*
* => called in thread context.
* => considered as an expensive and rare operation.
*/
void
percpu_free(percpu_t *pc, size_t size)
{
ASSERT_SLEEPABLE();
KASSERT(size == pc->pc_size);
/*
* If there's a constructor, take the percpu off the list of
* percpus with constructors, but first wait until nobody is
* using the list.
*/
if (pc->pc_ctor) {
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
LIST_REMOVE(pc, pc_list);
mutex_exit(&percpu_allocation.lock);
}
/* If there's a destructor, run it now for all CPUs. */
if (pc->pc_dtor) {
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
void *buf;
buf = kmem_alloc(size, KM_SLEEP);
for (CPU_INFO_FOREACH(cii, ci)) {
percpu_traverse_enter();
memcpy(buf, percpu_getptr_remote(pc, ci), size);
explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
percpu_traverse_exit();
(*pc->pc_dtor)(buf, pc->pc_cookie, ci);
}
explicit_memset(buf, 0, size);
kmem_free(buf, size);
}
vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
kmem_free(pc, sizeof(*pc));
}
/*
* percpu_getref:
*
* => safe to be used in either thread or interrupt context
* => disables preemption; must be bracketed with a percpu_putref()
*/
void *
percpu_getref(percpu_t *pc)
{
kpreempt_disable();
return percpu_getptr_remote(pc, curcpu());
}
/*
* percpu_putref:
*
* => drops the preemption-disabled count after caller is done with per-cpu
* data
*/
void
percpu_putref(percpu_t *pc)
{
kpreempt_enable();
}
/*
* percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
* helpers to access remote cpu's percpu data.
*
* => called in thread context.
* => percpu_traverse_enter can block low-priority xcalls.
* => typical usage would be:
*
* sum = 0;
* percpu_traverse_enter();
* for (CPU_INFO_FOREACH(cii, ci)) {
* unsigned int *p = percpu_getptr_remote(pc, ci);
* sum += *p;
* }
* percpu_traverse_exit();
*/
void
percpu_traverse_enter(void)
{
ASSERT_SLEEPABLE();
rw_enter(&percpu_swap_lock, RW_READER);
}
void
percpu_traverse_exit(void)
{
rw_exit(&percpu_swap_lock);
}
void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{
return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}
/*
* percpu_foreach: call the specified callback function for each cpus.
*
* => must be called from thread context.
* => callback executes on **current** CPU (or, really, arbitrary CPU,
* in case of preemption)
* => caller should not rely on the cpu iteration order.
* => the callback function should be minimal because it is executed while
* holding a global lock, which can block low-priority xcalls.
* e.g. it's illegal for a callback function to sleep for memory allocation.
*/
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
percpu_traverse_enter();
for (CPU_INFO_FOREACH(cii, ci)) {
(*cb)(percpu_getptr_remote(pc, ci), arg, ci);
}
percpu_traverse_exit();
}
struct percpu_xcall_ctx {
percpu_callback_t ctx_cb;
void *ctx_arg;
};
static void
percpu_xcfunc(void * const v1, void * const v2)
{
percpu_t * const pc = v1;
struct percpu_xcall_ctx * const ctx = v2;
(*ctx->ctx_cb)(percpu_getref(pc), ctx->ctx_arg, curcpu());
percpu_putref(pc);
}
/*
* percpu_foreach_xcall: call the specified callback function for each
* cpu. This version uses an xcall to run the callback on each cpu.
*
* => must be called from thread context.
* => callback executes on **remote** CPU in soft-interrupt context
* (at the specified soft interrupt priority).
* => caller should not rely on the cpu iteration order.
* => the callback function should be minimal because it may be
* executed in soft-interrupt context. e.g. it's illegal for
* a callback function to sleep for memory allocation.
*/
void
percpu_foreach_xcall(percpu_t *pc, u_int xcflags, percpu_callback_t cb,
void *arg)
{
struct percpu_xcall_ctx ctx = {
.ctx_cb = cb,
.ctx_arg = arg,
};
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
xc_wait(xc_unicast(xcflags, percpu_xcfunc, pc, &ctx, ci));
}
}
/* $NetBSD: subr_time.c,v 1.38 2023/07/08 20:02:10 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
* @(#)kern_time.c 8.4 (Berkeley) 5/26/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_time.c,v 1.38 2023/07/08 20:02:10 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/lwp.h>
#include <sys/timex.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/intr.h>
/*
* Compute number of hz until specified time. Used to compute second
* argument to callout_reset() from an absolute time.
*/
int
tvhzto(const struct timeval *tvp)
{
struct timeval now, tv;
tv = *tvp; /* Don't modify original tvp. */
getmicrotime(&now);
timersub(&tv, &now, &tv);
return tvtohz(&tv);
}
/*
* Compute number of ticks in the specified amount of time.
*/
int
tvtohz(const struct timeval *tv)
{
unsigned long ticks;
long sec, usec;
/*
* If the number of usecs in the whole seconds part of the time
* difference fits in a long, then the total number of usecs will
* fit in an unsigned long. Compute the total and convert it to
* ticks, rounding up and adding 1 to allow for the current tick
* to expire. Rounding also depends on unsigned long arithmetic
* to avoid overflow.
*
* Otherwise, if the number of ticks in the whole seconds part of
* the time difference fits in a long, then convert the parts to
* ticks separately and add, using similar rounding methods and
* overflow avoidance. This method would work in the previous
* case, but it is slightly slower and assumes that hz is integral.
*
* Otherwise, round the time difference down to the maximum
* representable value.
*
* If ints are 32-bit, then the maximum value for any timeout in
* 10ms ticks is 248 days.
*/
sec = tv->tv_sec;
usec = tv->tv_usec;
KASSERT(usec >= 0);
KASSERT(usec < 1000000);
/* catch overflows in conversion time_t->int */
if (tv->tv_sec > INT_MAX)
return INT_MAX;
if (tv->tv_sec < 0)
return 0;
if (sec < 0 || (sec == 0 && usec == 0)) {
/*
* Would expire now or in the past. Return 0 ticks.
* This is different from the legacy tvhzto() interface,
* and callers need to check for it.
*/
ticks = 0;
} else if (sec <= (LONG_MAX / 1000000))
ticks = (((sec * 1000000) + (unsigned long)usec + (tick - 1))
/ tick) + 1;
else if (sec <= (LONG_MAX / hz))
ticks = (sec * hz) +
(((unsigned long)usec + (tick - 1)) / tick) + 1;
else
ticks = LONG_MAX;
if (ticks > INT_MAX)
ticks = INT_MAX;
return ((int)ticks);
}
int
tshzto(const struct timespec *tsp)
{
struct timespec now, ts;
ts = *tsp; /* Don't modify original tsp. */
getnanotime(&now);
timespecsub(&ts, &now, &ts);
return tstohz(&ts);
}
int
tshztoup(const struct timespec *tsp)
{
struct timespec now, ts;
ts = *tsp; /* Don't modify original tsp. */
getnanouptime(&now);
timespecsub(&ts, &now, &ts);
return tstohz(&ts);
}
/*
* Compute number of ticks in the specified amount of time.
*/
int
tstohz(const struct timespec *ts)
{
struct timeval tv;
/*
* usec has great enough resolution for hz, so convert to a
* timeval and use tvtohz() above.
*/
TIMESPEC_TO_TIMEVAL(&tv, ts);
return tvtohz(&tv);
}
/*
* Check that a proposed value to load into the .it_value or
* .it_interval part of an interval timer is acceptable, and
* fix it to have at least minimal value (i.e. if it is less
* than the resolution of the clock, round it up.). We don't
* timeout the 0,0 value because this means to disable the
* timer or the interval.
*/
int
itimerfix(struct timeval *tv)
{
if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
return EINVAL;
if (tv->tv_sec < 0)
return ETIMEDOUT;
if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
tv->tv_usec = tick;
return 0;
}
int
itimespecfix(struct timespec *ts)
{
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return EINVAL;
if (ts->tv_sec < 0)
return ETIMEDOUT;
if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
ts->tv_nsec = tick * 1000;
return 0;
}
int
inittimeleft(struct timespec *ts, struct timespec *sleepts)
{
if (itimespecfix(ts)) {
return -1;
}
KASSERT(ts->tv_sec >= 0);
getnanouptime(sleepts);
return 0;
}
int
gettimeleft(struct timespec *ts, struct timespec *sleepts)
{
struct timespec now, sleptts;
KASSERT(ts->tv_sec >= 0);
/*
* Reduce ts by elapsed time based on monotonic time scale.
*/
getnanouptime(&now);
KASSERT(timespeccmp(sleepts, &now, <=));
timespecsub(&now, sleepts, &sleptts);
*sleepts = now;
if (timespeccmp(ts, &sleptts, <=)) { /* timed out */
timespecclear(ts);
return 0;
}
timespecsub(ts, &sleptts, ts);
return tstohz(ts);
}
void
clock_timeleft(clockid_t clockid, struct timespec *ts, struct timespec *sleepts)
{
struct timespec sleptts;
clock_gettime1(clockid, &sleptts);
timespecadd(ts, sleepts, ts);
timespecsub(ts, &sleptts, ts);
*sleepts = sleptts;
}
int
clock_gettime1(clockid_t clock_id, struct timespec *ts)
{
int error;
struct proc *p;
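/*
 * Per-process and per-thread CPU-time clock ids carry the target pid
 * or lwp id in the bits below the type flags; CPUCLOCK_ID_MASK strips
 * the flags to recover it (0 means the caller itself).
 */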
#define CPUCLOCK_ID_MASK (~(CLOCK_THREAD_CPUTIME_ID|CLOCK_PROCESS_CPUTIME_ID))
if (clock_id & CLOCK_PROCESS_CPUTIME_ID) {
pid_t pid = clock_id & CPUCLOCK_ID_MASK;
struct timeval cputime;
mutex_enter(&proc_lock);
p = pid == 0 ? curproc : proc_find(pid);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
mutex_enter(p->p_lock);
calcru(p, /*usertime*/NULL, /*systime*/NULL, /*intrtime*/NULL,
&cputime);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
// XXX: Perhaps create a special kauth type
error = kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_PTRACE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
if (error)
return error;
TIMEVAL_TO_TIMESPEC(&cputime, ts);
return 0;
} else if (clock_id & CLOCK_THREAD_CPUTIME_ID) {
struct lwp *l;
lwpid_t lid = clock_id & CPUCLOCK_ID_MASK;
struct bintime tm = {0, 0};
p = curproc;
mutex_enter(p->p_lock);
l = lid == 0 ? curlwp : lwp_find(p, lid);
if (l == NULL) {
mutex_exit(p->p_lock);
return ESRCH;
}
addrulwp(l, &tm);
mutex_exit(p->p_lock);
bintime2timespec(&tm, ts);
return 0;
}
switch (clock_id) {
case CLOCK_REALTIME:
nanotime(ts);
break;
case CLOCK_MONOTONIC:
nanouptime(ts);
break;
default:
return EINVAL;
}
return 0;
}
/*
* Calculate delta and convert from struct timespec to the ticks.
*/
int
ts2timo(clockid_t clock_id, int flags, struct timespec *ts,
int *timo, struct timespec *start)
{
int error;
struct timespec tsd;
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000L)
return EINVAL;
if ((flags & TIMER_ABSTIME) != 0 || start != NULL) {
error = clock_gettime1(clock_id, &tsd);
if (error != 0)
return error;
if (start != NULL)
*start = tsd;
}
if ((flags & TIMER_ABSTIME) != 0) {
if (!timespecsubok(ts, &tsd))
return EINVAL;
timespecsub(ts, &tsd, ts);
}
error = itimespecfix(ts);
if (error != 0)
return error;
if (ts->tv_sec == 0 && ts->tv_nsec == 0)
return ETIMEDOUT;
*timo = tstohz(ts);
KASSERT(*timo > 0);
return 0;
}
bool
timespecaddok(const struct timespec *tsp, const struct timespec *usp)
{
enum { TIME_MIN = __type_min(time_t), TIME_MAX = __type_max(time_t) };
time_t a = tsp->tv_sec;
time_t b = usp->tv_sec;
bool carry;
/*
* Caller is responsible for guaranteeing valid timespec
* inputs. Any user-controlled inputs must be validated or
* adjusted.
*/
KASSERT(tsp->tv_nsec >= 0);
KASSERT(usp->tv_nsec >= 0);
KASSERT(tsp->tv_nsec < 1000000000L);
KASSERT(usp->tv_nsec < 1000000000L);
CTASSERT(1000000000L <= __type_max(long) - 1000000000L);
/*
* Fail if a + b + carry overflows TIME_MAX, or if a + b
* overflows TIME_MIN because timespecadd adds the carry after
* computing a + b.
*
* Break it into two mutually exclusive and exhaustive cases:
* I. a >= 0
* II. a < 0
*/
carry = (tsp->tv_nsec + usp->tv_nsec >= 1000000000L);
if (a >= 0) {
/*
* Case I: a >= 0. If b < 0, then b + 1 <= 0, so
*
* a + b + 1 <= a + 0 <= TIME_MAX,
*
* and
*
* a + b >= 0 + b = b >= TIME_MIN,
*
* so this can't overflow.
*
* If b >= 0, then a + b + carry >= a + b >= 0, so
* negative results and thus results below TIME_MIN are
* impossible; we need only avoid
*
* a + b + carry > TIME_MAX,
*
* which we will do by rejecting if
*
* b > TIME_MAX - a - carry,
*
* which in turn is incidentally always false if b < 0
* so we don't need extra logic to discriminate on the
* b >= 0 and b < 0 cases.
*
* Since 0 <= a <= TIME_MAX, we know
*
* 0 <= TIME_MAX - a <= TIME_MAX,
*
* and hence
*
* -1 <= TIME_MAX - a - 1 < TIME_MAX.
*
* So we can compute TIME_MAX - a - carry (i.e., either
* TIME_MAX - a or TIME_MAX - a - 1) safely without
* overflow.
*/
if (b > TIME_MAX - a - carry)
return false;
} else {
/*
* Case II: a < 0. If b >= 0, then since a + 1 <= 0,
* we have
*
* a + b + 1 <= b <= TIME_MAX,
*
* and
*
* a + b >= a >= TIME_MIN,
*
* so this can't overflow.
*
* If b < 0, then the intermediate a + b is negative
* and the outcome a + b + 1 is nonpositive, so we need
* only avoid
*
* a + b < TIME_MIN,
*
* which we will do by rejecting if
*
* a < TIME_MIN - b.
*
* (Reminder: The carry is added afterward in
* timespecadd, so to avoid overflow it is not enough
* to merely reject a + b + carry < TIME_MIN.)
*
* It is safe to compute the difference TIME_MIN - b
* because b is negative, so the result lies in
* (TIME_MIN, 0].
*/
if (b < 0 && a < TIME_MIN - b)
return false;
}
return true;
}
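/*
 * Return true if tsp - usp can be computed by timespecsub() without
 * overflowing time_t, false otherwise.
 */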
bool
timespecsubok(const struct timespec *tsp, const struct timespec *usp)
{
enum { TIME_MIN = __type_min(time_t), TIME_MAX = __type_max(time_t) };
time_t a = tsp->tv_sec, b = usp->tv_sec;
bool borrow;
/*
* Caller is responsible for guaranteeing valid timespec
* inputs. Any user-controlled inputs must be validated or
* adjusted.
*/
	KASSERT(tsp->tv_nsec >= 0);
	KASSERT(usp->tv_nsec >= 0);
	KASSERT(tsp->tv_nsec < 1000000000L);
	KASSERT(usp->tv_nsec < 1000000000L);
CTASSERT(1000000000L <= __type_max(long) - 1000000000L);
/*
* Fail if a - b - borrow overflows TIME_MIN, or if a - b
* overflows TIME_MAX because timespecsub subtracts the borrow
* after computing a - b.
*
* Break it into two mutually exclusive and exhaustive cases:
* I. a < 0
* II. a >= 0
*/
borrow = (tsp->tv_nsec - usp->tv_nsec < 0);
if (a < 0) {
/*
* Case I: a < 0. If b < 0, then -b - 1 >= 0, so
*
* a - b - 1 >= a + 0 >= TIME_MIN,
*
* and, since a <= -1, provided that TIME_MIN <=
* -TIME_MAX - 1 so that TIME_MAX <= -TIME_MIN - 1 (in
* fact, equality holds, under the assumption of
* two's-complement arithmetic),
*
* a - b <= -1 - b = -b - 1 <= TIME_MAX,
*
* so this can't overflow.
*/
CTASSERT(TIME_MIN <= -TIME_MAX - 1);
/*
* If b >= 0, then a - b - borrow <= a - b < 0, so
* positive results and thus results above TIME_MAX are
* impossible; we need only avoid
*
* a - b - borrow < TIME_MIN,
*
* which we will do by rejecting if
*
* a < TIME_MIN + b + borrow.
*
* The right-hand side is safe to evaluate for any
* values of b and borrow as long as TIME_MIN +
* TIME_MAX + 1 <= TIME_MAX, i.e., TIME_MIN <= -1.
* (Note: If time_t were unsigned, this would fail!)
*
* Note: Unlike Case I in timespecaddok, this criterion
* does not work for b < 0, nor can the roles of a and
* b in the inequality be reversed (e.g., -b < TIME_MIN
* - a + borrow) without extra cases like checking for
* b = TIME_MIN.
*/
CTASSERT(TIME_MIN < -1);
if (b >= 0 && a < TIME_MIN + b + borrow)
return false;
} else {
/*
* Case II: a >= 0. If b >= 0, then
*
* a - b <= a <= TIME_MAX,
*
* and, provided TIME_MIN <= -TIME_MAX - 1 (in fact,
* equality holds, under the assumption of
* two's-complement arithmetic)
*
* a - b - 1 >= -b - 1 >= -TIME_MAX - 1 >= TIME_MIN,
*
* so this can't overflow.
*/
CTASSERT(TIME_MIN <= -TIME_MAX - 1);
/*
* If b < 0, then a - b >= a >= 0, so negative results
* and thus results below TIME_MIN are impossible; we
* need only avoid
*
* a - b > TIME_MAX,
*
* which we will do by rejecting if
*
* a > TIME_MAX + b.
*
* (Reminder: The borrow is subtracted afterward in
* timespecsub, so to avoid overflow it is not enough
* to merely reject a - b - borrow > TIME_MAX.)
*
* It is safe to compute the sum TIME_MAX + b because b
* is negative, so the result lies in [0, TIME_MAX).
*/
if (b < 0 && a > TIME_MAX + b)
return false;
}
return true;
}
/* $NetBSD: ufs_extattr.c,v 1.55 2024/02/10 18:43:53 andvar Exp $ */
/*-
* Copyright (c) 1999-2002 Robert N. M. Watson
* Copyright (c) 2002-2003 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed by Robert Watson for the TrustedBSD Project.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* Support for file system extended attributes on the UFS1 file system.
*
* Extended attributes are defined in the form name=value, where name is
* a nul-terminated string in the style of a file name, and value is a
* binary blob of zero or more bytes. The UFS1 extended attribute service
* layers support for extended attributes onto a backing file, in the style
* of the quota implementation, meaning that it requires no underlying format
* changes to the file system. This design choice trades performance for
* simplicity, usability, and easy deployment.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_extattr.c,v 1.55 2024/02/10 18:43:53 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ffs.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <sys/fcntl.h>
#include <sys/lwp.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/lock.h>
#include <sys/dirent.h>
#include <sys/extattr.h>
#include <sys/sysctl.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
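/*
 * ufs_extattr_sync: perform attribute writes synchronously (IO_SYNC).
 * ufs_extattr_autocreate: per-attribute maximum data size, in bytes,
 * used when a backing file is autocreated on first set.
 */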
int ufs_extattr_sync = 1;
int ufs_extattr_autocreate = 1024;
static int ufs_extattr_valid_attrname(int attrnamespace,
const char *attrname);
static int ufs_extattr_enable_with_open(struct ufsmount *ump,
struct vnode *vp, int attrnamespace, const char *attrname,
struct lwp *l);
static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct vnode *backing_vnode,
struct lwp *l);
static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct lwp *l);
static int ufs_extattr_get(struct vnode *vp, int attrnamespace,
const char *name, struct uio *uio, size_t *size,
kauth_cred_t cred, struct lwp *l);
static int ufs_extattr_list(struct vnode *vp, int attrnamespace,
struct uio *uio, size_t *size, int flag,
kauth_cred_t cred, struct lwp *l);
static int ufs_extattr_set(struct vnode *vp, int attrnamespace,
const char *name, struct uio *uio, kauth_cred_t cred,
struct lwp *l);
static int ufs_extattr_rm(struct vnode *vp, int attrnamespace,
const char *name, kauth_cred_t cred, struct lwp *l);
static struct ufs_extattr_list_entry *ufs_extattr_find_attr(struct ufsmount *,
int, const char *);
static int ufs_extattr_get_header(struct vnode *,
struct ufs_extattr_list_entry *,
struct ufs_extattr_header *, off_t *);
/*
* Per-FS attribute lock protecting attribute operations.
* XXX Right now there is a lot of lock contention due to having a single
* lock per-FS; really, this should be far more fine-grained.
*/
static void
ufs_extattr_uepm_lock(struct ufsmount *ump)
{
/*
* XXX This needs to be recursive for the following reasons:
* - it is taken in ufs_extattr_vnode_inactive
* - which is called from VOP_INACTIVE
* - which can be triggered by any vrele, vput, or vn_close
* - several of these can happen while it's held
*/
if (mutex_owned(&ump->um_extattr.uepm_lock)) {
ump->um_extattr.uepm_lockcnt++;
return;
}
mutex_enter(&ump->um_extattr.uepm_lock);
}
static void
ufs_extattr_uepm_unlock(struct ufsmount *ump)
{
if (ump->um_extattr.uepm_lockcnt != 0) {
KASSERT(mutex_owned(&ump->um_extattr.uepm_lock));
ump->um_extattr.uepm_lockcnt--;
return;
}
mutex_exit(&ump->um_extattr.uepm_lock);
}
/*-
* Determine whether the name passed is a valid name for an actual
* attribute.
*
* Invalid currently consists of:
* NULL pointer for attrname
* zero-length attrname (used to retrieve application attribute list)
*/
static int
ufs_extattr_valid_attrname(int attrnamespace, const char *attrname)
{
if (attrname == NULL)
return 0;
if (strlen(attrname) == 0)
return 0;
return 1;
}
/*
 * Autocreate the backing storage for an attribute.
 */
static int
ufs_extattr_autocreate_attr(struct vnode *vp, int attrnamespace,
const char *attrname, struct lwp *l, struct ufs_extattr_list_entry **uelep)
{
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct vnode *backing_vp;
struct pathbuf *pb;
char *path;
struct ufs_extattr_fileheader uef;
struct ufs_extattr_list_entry *uele;
int error;
path = PNBUF_GET();
/*
* We only support system and user namespace autocreation
*/
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
(void)snprintf(path, PATH_MAX, "%s/%s/%s/%s",
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR,
UFS_EXTATTR_SUBDIR_SYSTEM, attrname);
break;
case EXTATTR_NAMESPACE_USER:
(void)snprintf(path, PATH_MAX, "%s/%s/%s/%s",
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR,
UFS_EXTATTR_SUBDIR_USER, attrname);
break;
default:
PNBUF_PUT(path);
*uelep = NULL;
return EINVAL;
break;
}
/*
* Release extended attribute mount lock, otherwise
* we can deadlock with another thread that would lock
* vp after we unlock it below, and call
* ufs_extattr_uepm_lock(ump), for instance
* in ufs_getextattr().
*/
ufs_extattr_uepm_unlock(ump);
/*
* XXX unlock/lock should only be done when setting extattr
* on backing store or one of its parent directory
* including root, but we always do it for now.
*/
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
VOP_UNLOCK(vp);
pb = pathbuf_create(path);
/*
* Since we do not hold ufs_extattr_uepm_lock anymore,
* another thread may race with us for backend creation,
* but only one can succeed here thanks to O_EXCL.
*
* backing_vp is the backing store.
*/
error = vn_open(NULL, pb, 0, O_CREAT|O_EXCL|O_RDWR, 0600,
&backing_vp, NULL, NULL);
/*
* Reacquire the lock on the vnode
*/
KASSERT(VOP_ISLOCKED(vp) == 0);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
ufs_extattr_uepm_lock(ump);
if (error != 0) {
pathbuf_destroy(pb);
PNBUF_PUT(path);
*uelep = NULL;
return error;
}
KASSERT(backing_vp != NULL);
KASSERT(VOP_ISLOCKED(backing_vp) == LK_EXCLUSIVE);
pathbuf_destroy(pb);
PNBUF_PUT(path);
uef.uef_magic = UFS_EXTATTR_MAGIC;
uef.uef_version = UFS_EXTATTR_VERSION;
uef.uef_size = ufs_extattr_autocreate;
error = vn_rdwr(UIO_WRITE, backing_vp, &uef, sizeof(uef), 0,
UIO_SYSSPACE, IO_NODELOCKED|IO_APPEND,
l->l_cred, NULL, l);
VOP_UNLOCK(backing_vp);
if (error != 0) {
printf("%s: write uef header failed for `%s' (%d)\n",
__func__, attrname, error);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return error;
}
/*
* Now enable attribute.
*/
	error = ufs_extattr_enable(ump, attrnamespace, attrname, backing_vp, l);
KASSERT(VOP_ISLOCKED(backing_vp) == 0);
if (error != 0) {
printf("%s: enable `%s' failed (%d)\n",
__func__, attrname, error);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return error;
}
uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
if (uele == NULL) {
printf("%s: attribute `%s' created but not found!\n",
__func__, attrname);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return ESRCH; /* really internal error */
}
printf("%s: EA backing store autocreated for %s\n",
mp->mnt_stat.f_mntonname, attrname);
*uelep = uele;
return 0;
}
/*
* Locate an attribute given a name and mountpoint.
* Must be holding uepm lock for the mount point.
*/
static struct ufs_extattr_list_entry *
ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace,
const char *attrname)
{
struct ufs_extattr_list_entry *search_attribute;
for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list);
search_attribute != NULL;
search_attribute = LIST_NEXT(search_attribute, uele_entries)) {
if (!(strncmp(attrname, search_attribute->uele_attrname,
UFS_EXTATTR_MAXEXTATTRNAME)) &&
(attrnamespace == search_attribute->uele_attrnamespace)) {
return search_attribute;
}
}
	return NULL;
}
/*
* Initialize per-FS structures supporting extended attributes. Do not
* start extended attributes yet.
*/
void
ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm)
{
uepm->uepm_flags = 0;
uepm->uepm_lockcnt = 0;
LIST_INIT(&uepm->uepm_list);
mutex_init(&uepm->uepm_lock, MUTEX_DEFAULT, IPL_NONE);
uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED;
}
/*
* Destroy per-FS structures supporting extended attributes. Assumes
* that EAs have already been stopped, and will panic if not.
*/
void
ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm)
{
if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
panic("ufs_extattr_uepm_destroy: not initialized");
if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED))
panic("ufs_extattr_uepm_destroy: called while still started");
/*
* It's not clear that either order for the next three lines is
* ideal, and it should never be a problem if this is only called
* during unmount, and with vfs_busy().
*/
uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED;
mutex_destroy(&uepm->uepm_lock);
}
/*
* Start extended attribute support on an FS.
*/
int
ufs_extattr_start(struct mount *mp, struct lwp *l)
{
struct ufsmount *ump;
int error = 0;
ump = VFSTOUFS(mp);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
ufs_extattr_uepm_init(&ump->um_extattr);
ufs_extattr_uepm_lock(ump);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) {
error = EOPNOTSUPP;
goto unlock;
}
if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) {
error = EBUSY;
goto unlock;
}
ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED;
ump->um_extattr.uepm_ucred = l->l_cred;
kauth_cred_hold(ump->um_extattr.uepm_ucred);
unlock:
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Helper routine: given a locked parent directory and filename, return
* the locked vnode of the inode associated with the name. Will not
* follow symlinks, may return any type of vnode. Lock on parent will
* be released even in the event of a failure. In the event that the
* target is the parent (i.e., "."), there will be two references and
* one lock, requiring the caller to possibly special-case.
*/
static int
ufs_extattr_lookup(struct vnode *start_dvp, int lockparent,
const char *dirname,
struct vnode **vp, struct lwp *l)
{
struct vop_lookup_v2_args vargs;
struct componentname cnp;
struct vnode *target_vp;
char *pnbuf;
int error;
KASSERT(VOP_ISLOCKED(start_dvp) == LK_EXCLUSIVE);
pnbuf = PNBUF_GET();
memset(&cnp, 0, sizeof(cnp));
cnp.cn_nameiop = LOOKUP;
cnp.cn_flags = ISLASTCN | lockparent;
cnp.cn_cred = l->l_cred;
cnp.cn_nameptr = pnbuf;
error = copystr(dirname, pnbuf, MAXPATHLEN, &cnp.cn_namelen);
if (error) {
if (lockparent == 0) {
VOP_UNLOCK(start_dvp);
}
PNBUF_PUT(pnbuf);
printf("%s: copystr failed (%d)\n", __func__, error);
return error;
}
cnp.cn_namelen--; /* trim nul termination */
vargs.a_desc = NULL;
vargs.a_dvp = start_dvp;
vargs.a_vpp = &target_vp;
vargs.a_cnp = &cnp;
error = ufs_lookup(&vargs);
PNBUF_PUT(pnbuf);
if (error) {
if (lockparent == 0) {
VOP_UNLOCK(start_dvp);
}
return error;
}
#if 0
if (target_vp == start_dvp)
panic("%s: target_vp == start_dvp", __func__);
#endif
if (target_vp != start_dvp) {
error = vn_lock(target_vp, LK_EXCLUSIVE);
if (lockparent == 0)
VOP_UNLOCK(start_dvp);
if (error) {
vrele(target_vp);
return error;
}
}
KASSERT(VOP_ISLOCKED(target_vp) == LK_EXCLUSIVE);
*vp = target_vp;
return 0;
}
/*
* Enable an EA using the passed filesystem, backing vnode, attribute name,
* namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp
* to be locked when passed in. The vnode will be returned unlocked,
* regardless of success/failure of the function. As a result, the caller
* will always need to vrele(), but not vput().
*/
static int
ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp,
int attrnamespace, const char *attrname, struct lwp *l)
{
int error;
error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred);
if (error) {
printf("%s: VOP_OPEN(): failed (%d)\n", __func__, error);
VOP_UNLOCK(vp);
return error;
}
mutex_enter(vp->v_interlock);
vp->v_writecount++;
mutex_exit(vp->v_interlock);
vref(vp);
VOP_UNLOCK(vp);
error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, l);
if (error != 0)
vn_close(vp, FREAD|FWRITE, l->l_cred);
return error;
}
/*
* Given a locked directory vnode, iterate over the names in the directory
* and use ufs_extattr_lookup() to retrieve locked vnodes of potential
* attribute files. Then invoke ufs_extattr_enable_with_open() on each
* to attempt to start the attribute. Leaves the directory locked on
* exit.
*/
static int
ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp,
int attrnamespace, struct lwp *l)
{
struct vop_readdir_args vargs;
struct statvfs *sbp = &ump->um_mountp->mnt_stat;
struct dirent *dp, *edp;
struct vnode *attr_vp;
struct uio auio;
struct iovec aiov;
char *dirbuf;
int error, eofflag = 0;
if (dvp->v_type != VDIR)
return ENOTDIR;
dirbuf = kmem_alloc(UFS_DIRBLKSIZ, KM_SLEEP);
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_offset = 0;
UIO_SETUP_SYSSPACE(&auio);
vargs.a_desc = NULL;
vargs.a_vp = dvp;
vargs.a_uio = &auio;
vargs.a_cred = l->l_cred;
vargs.a_eofflag = &eofflag;
vargs.a_ncookies = NULL;
vargs.a_cookies = NULL;
while (!eofflag) {
auio.uio_resid = UFS_DIRBLKSIZ;
aiov.iov_base = dirbuf;
aiov.iov_len = UFS_DIRBLKSIZ;
error = ufs_readdir(&vargs);
		if (error) {
			printf("%s: ufs_readdir (%d)\n", __func__, error);
			kmem_free(dirbuf, UFS_DIRBLKSIZ);
			return error;
		}
/*
* XXXRW: While in UFS, we always get UFS_DIRBLKSIZ returns from
* the directory code on success, on other file systems this
* may not be the case. For portability, we should check the
* read length on return from ufs_readdir().
*/
edp = (struct dirent *)&dirbuf[UFS_DIRBLKSIZ];
for (dp = (struct dirent *)dirbuf; dp < edp; ) {
if (dp->d_reclen == 0)
break;
/* Skip "." and ".." */
if (dp->d_name[0] == '.' &&
(dp->d_name[1] == '\0' ||
(dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
goto next;
error = ufs_extattr_lookup(dvp, LOCKPARENT,
dp->d_name, &attr_vp, l);
if (error == ENOENT) {
goto next; /* keep silent */
} else if (error) {
printf("%s: lookup `%s' (%d)\n", __func__,
dp->d_name, error);
} else if (attr_vp == dvp) {
vrele(attr_vp);
} else if (attr_vp->v_type != VREG) {
vput(attr_vp);
} else {
error = ufs_extattr_enable_with_open(ump,
attr_vp, attrnamespace, dp->d_name, l);
vrele(attr_vp);
if (error) {
printf("%s: enable `%s' (%d)\n",
__func__, dp->d_name, error);
} else if (bootverbose) {
printf("%s: EA %s loaded\n",
sbp->f_mntonname, dp->d_name);
}
}
next:
dp = (struct dirent *) ((char *)dp + dp->d_reclen);
if (dp >= edp)
break;
}
}
kmem_free(dirbuf, UFS_DIRBLKSIZ);
return 0;
}
static int
ufs_extattr_subdir(struct lwp *l, struct mount *mp, struct vnode *attr_dvp,
const char *subdir, int namespace)
{
int error;
struct vnode *attr_sub;
error = ufs_extattr_lookup(attr_dvp, LOCKPARENT, subdir, &attr_sub, l);
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
if (error) {
printf("%s: Can't find `%s/%s/%s' (%d)\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR, subdir, error);
return error;
}
KASSERT(VOP_ISLOCKED(attr_sub) == LK_EXCLUSIVE);
error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
attr_sub, namespace, l);
if (error) {
printf("%s: ufs_extattr_iterate_directory `%s/%s/%s' (%d)\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR, subdir, error);
}
KASSERT(VOP_ISLOCKED(attr_sub) == LK_EXCLUSIVE);
vput(attr_sub);
return error;
}
/*
* Auto-start of extended attributes, to be executed (optionally) at
* mount-time.
*/
int
ufs_extattr_autostart(struct mount *mp, struct lwp *l)
{
struct vnode *rvp, *attr_dvp;
int error;
/*
* Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root?
* If so, automatically start EA's.
*/
error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp);
if (error) {
printf("%s: VFS_ROOT() (%d)\n", __func__, error);
return error;
}
KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
error = ufs_extattr_lookup(rvp, 0,
UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, l);
if (error) {
/* rvp ref'd but now unlocked */
KASSERT(VOP_ISLOCKED(rvp) == 0);
vrele(rvp);
printf("%s: lookup `%s/%s' (%d)\n", __func__,
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, error);
return error;
}
if (rvp == attr_dvp) {
/* Should never happen. */
KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
vrele(attr_dvp);
vput(rvp);
printf("%s: `/' == `%s/%s' (%d)\n", __func__,
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, EINVAL);
return EINVAL;
}
KASSERT(VOP_ISLOCKED(rvp) == 0);
vrele(rvp);
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
if (attr_dvp->v_type != VDIR) {
printf("%s: `%s/%s' is not a directory\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR);
goto return_vput_attr_dvp;
}
error = ufs_extattr_start(mp, l);
if (error) {
printf("%s: ufs_extattr_start failed (%d)\n", __func__,
error);
goto return_vput_attr_dvp;
}
/*
* Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM,
* UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory,
* and start with appropriate type. Failures in either don't
* result in an over-all failure. attr_dvp is left locked to
* be cleaned up on exit.
*/
error = ufs_extattr_subdir(l, mp, attr_dvp, UFS_EXTATTR_SUBDIR_SYSTEM,
EXTATTR_NAMESPACE_SYSTEM);
error = ufs_extattr_subdir(l, mp, attr_dvp, UFS_EXTATTR_SUBDIR_USER,
EXTATTR_NAMESPACE_USER);
/* Mask startup failures in sub-directories. */
error = 0;
return_vput_attr_dvp:
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
vput(attr_dvp);
return error;
}
/*
* Stop extended attribute support on an FS.
*/
void
ufs_extattr_stop(struct mount *mp, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct ufsmount *ump = VFSTOUFS(mp);
ufs_extattr_uepm_lock(ump);
/*
* If we haven't been started, no big deal. Just short-circuit
* the processing work.
*/
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
goto unlock;
}
while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) {
uele = LIST_FIRST(&ump->um_extattr.uepm_list);
ufs_extattr_disable(ump, uele->uele_attrnamespace,
uele->uele_attrname, l);
}
ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
kauth_cred_free(ump->um_extattr.uepm_ucred);
ump->um_extattr.uepm_ucred = NULL;
unlock:
ufs_extattr_uepm_unlock(ump);
}
/*
* Enable a named attribute on the specified filesystem; provide an
* unlocked backing vnode to hold the attribute data.
*/
static int
ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct vnode *backing_vnode, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct iovec aiov;
struct uio auio;
int error = 0;
if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
return EINVAL;
if (backing_vnode->v_type != VREG)
return EINVAL;
attribute = kmem_zalloc(sizeof(*attribute), KM_SLEEP);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
error = EOPNOTSUPP;
goto free_exit;
}
if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) {
error = EEXIST;
goto free_exit;
}
strncpy(attribute->uele_attrname, attrname,
UFS_EXTATTR_MAXEXTATTRNAME);
attribute->uele_attrnamespace = attrnamespace;
memset(&attribute->uele_fileheader, 0,
sizeof(struct ufs_extattr_fileheader));
attribute->uele_backing_vnode = backing_vnode;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
aiov.iov_base = (void *) &attribute->uele_fileheader;
aiov.iov_len = sizeof(struct ufs_extattr_fileheader);
auio.uio_resid = sizeof(struct ufs_extattr_fileheader);
auio.uio_offset = (off_t) 0;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
vn_lock(backing_vnode, LK_SHARED | LK_RETRY);
error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED,
ump->um_extattr.uepm_ucred);
if (error)
goto unlock_free_exit;
if (auio.uio_resid != 0) {
printf("%s: malformed attribute header\n", __func__);
error = EINVAL;
goto unlock_free_exit;
}
/*
* Try to determine the byte order of the attribute file.
*/
if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
attribute->uele_flags |= UELE_F_NEEDSWAP;
attribute->uele_fileheader.uef_magic =
ufs_rw32(attribute->uele_fileheader.uef_magic,
UELE_NEEDSWAP(attribute));
if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
printf("%s: invalid attribute header magic\n",
__func__);
error = EINVAL;
goto unlock_free_exit;
}
}
attribute->uele_fileheader.uef_version =
ufs_rw32(attribute->uele_fileheader.uef_version,
UELE_NEEDSWAP(attribute));
attribute->uele_fileheader.uef_size =
ufs_rw32(attribute->uele_fileheader.uef_size,
UELE_NEEDSWAP(attribute));
if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) {
printf("%s: incorrect attribute header version %d != %d\n",
__func__, attribute->uele_fileheader.uef_version,
UFS_EXTATTR_VERSION);
error = EINVAL;
goto unlock_free_exit;
}
LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, uele_entries);
VOP_UNLOCK(backing_vnode);
return 0;
unlock_free_exit:
VOP_UNLOCK(backing_vnode);
free_exit:
kmem_free(attribute, sizeof(*attribute));
return error;
}
/*
* Disable extended attribute support on an FS.
*/
static int
ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
int error = 0;
if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
return EINVAL;
uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
if (!uele)
return ENODATA;
LIST_REMOVE(uele, uele_entries);
error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, l->l_cred);
kmem_free(uele, sizeof(*uele));
return error;
}
/*
* VFS call to manage extended attributes in UFS. If filename_vp is
* non-NULL, it must be passed in locked, and regardless of errors in
* processing, will be unlocked.
*/
int
ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
int attrnamespace, const char *attrname)
{
struct lwp *l = curlwp;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
/*
* Only privileged processes can configure extended attributes.
*/
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_EXTATTR,
0, mp, NULL, NULL);
if (error) {
if (filename_vp != NULL)
VOP_UNLOCK(filename_vp);
return error;
}
switch(cmd) {
case UFS_EXTATTR_CMD_START:
case UFS_EXTATTR_CMD_STOP:
case UFS_EXTATTR_CMD_ENABLE:
case UFS_EXTATTR_CMD_DISABLE:
if (filename_vp != NULL) {
VOP_UNLOCK(filename_vp);
return EINVAL;
}
if (attrname != NULL)
return EINVAL;
break;
default:
return EINVAL;
}
switch(cmd) {
case UFS_EXTATTR_CMD_START:
error = ufs_extattr_autostart(mp, l);
return error;
case UFS_EXTATTR_CMD_STOP:
ufs_extattr_stop(mp, l);
return 0;
case UFS_EXTATTR_CMD_ENABLE:
/*
* ufs_extattr_enable_with_open() will always unlock the
* vnode, regardless of failure.
*/
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_enable_with_open(ump, filename_vp,
attrnamespace, attrname, l);
ufs_extattr_uepm_unlock(ump);
return error;
case UFS_EXTATTR_CMD_DISABLE:
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_disable(ump, attrnamespace, attrname, l);
ufs_extattr_uepm_unlock(ump);
return error;
default:
return EINVAL;
}
}
/*
* Read extended attribute header for a given vnode and attribute.
* Backing vnode should be locked and unlocked by caller.
*/
static int
ufs_extattr_get_header(struct vnode *vp, struct ufs_extattr_list_entry *uele,
struct ufs_extattr_header *ueh, off_t *bap)
{
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct inode *ip = VTOI(vp);
off_t base_offset;
struct iovec aiov;
struct uio aio;
int error;
/*
* Find base offset of header in file based on file header size, and
* data header size + maximum data size, indexed by inode number.
*/
base_offset = sizeof(struct ufs_extattr_fileheader) +
ip->i_number * (sizeof(struct ufs_extattr_header) +
uele->uele_fileheader.uef_size);
/*
* Read in the data header to see if the data is defined, and if so
* how much.
*/
memset(ueh, 0, sizeof(struct ufs_extattr_header));
aiov.iov_base = ueh;
aiov.iov_len = sizeof(struct ufs_extattr_header);
aio.uio_iov = &aiov;
aio.uio_iovcnt = 1;
aio.uio_rw = UIO_READ;
aio.uio_offset = base_offset;
aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&aio);
error = VOP_READ(uele->uele_backing_vnode, &aio,
IO_NODELOCKED, ump->um_extattr.uepm_ucred);
if (error)
return error;
/*
* Attribute headers are kept in file system byte order.
* XXX What about the blob of data?
*/
ueh->ueh_flags = ufs_rw32(ueh->ueh_flags, UELE_NEEDSWAP(uele));
ueh->ueh_len = ufs_rw32(ueh->ueh_len, UELE_NEEDSWAP(uele));
ueh->ueh_i_gen = ufs_rw32(ueh->ueh_i_gen, UELE_NEEDSWAP(uele));
/* Defined? */
if ((ueh->ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0)
return ENODATA;
/* Valid for the current inode generation? */
if (ueh->ueh_i_gen != ip->i_gen) {
/*
* The inode itself has a different generation number
* than the uele data. For now, the best solution
* is to coerce this to undefined, and let it get cleaned
* up by the next write or extattrctl clean.
*/
printf("%s: %s: inode gen inconsistency (%u, %jd)\n",
__func__, mp->mnt_stat.f_mntonname, ueh->ueh_i_gen,
(intmax_t)ip->i_gen);
return ENODATA;
}
/* Local size consistency check. */
if (ueh->ueh_len > uele->uele_fileheader.uef_size)
return ENXIO;
/* Return base offset */
if (bap != NULL)
*bap = base_offset;
return 0;
}
/*
* Vnode operation to retrieve a named extended attribute.
*/
int
ufs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_uio, ap->a_size, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with retrieving a named attribute--assumes that
* the attribute lock has already been grabbed.
*/
static int
ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name,
struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
off_t base_offset;
size_t len, old_len;
int error = 0;
if (strlen(name) == 0)
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VREAD);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute)
return ENODATA;
/*
* Allow only offsets of zero to encourage the read/replace
* extended attribute semantic. Otherwise we can't guarantee
* atomicity, as we don't provide locks for extended attributes.
*/
if (uio != NULL && uio->uio_offset != 0)
return ENXIO;
/*
* Don't need to get a lock on the backing file if the getattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY);
error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
if (error)
goto vopunlock_exit;
/* Return full data size if caller requested it. */
if (size != NULL)
*size = ueh.ueh_len;
/* Return data if the caller requested it. */
if (uio != NULL) {
/* Allow for offset into the attribute data. */
uio->uio_offset = base_offset + sizeof(struct
ufs_extattr_header);
/*
* Figure out maximum to transfer -- use buffer size and
* local data limit.
*/
len = MIN(uio->uio_resid, ueh.ueh_len);
old_len = uio->uio_resid;
uio->uio_resid = len;
error = VOP_READ(attribute->uele_backing_vnode, uio,
IO_NODELOCKED, ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
uio->uio_resid = old_len - (len - uio->uio_resid);
}
vopunlock_exit:
if (uio != NULL)
uio->uio_offset = 0;
if (attribute->uele_backing_vnode != vp)
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Vnode operation to list extended attribute for a vnode
*/
int
ufs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN int flag;
IN kauth_cred_t a_cred;
struct proc *a_p;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_list(ap->a_vp, ap->a_attrnamespace,
ap->a_uio, ap->a_size, ap->a_flag, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with retrieving list of attributes--assumes that
* the attribute lock has already been grabbed.
*/
static int
ufs_extattr_list(struct vnode *vp, int attrnamespace,
struct uio *uio, size_t *size, int flag,
kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
size_t listsize = 0;
int error = 0;
/*
* XXX: We can move this inside the loop and iterate on individual
* attributes.
*/
error = extattr_check_cred(vp, attrnamespace, cred, VREAD);
if (error)
return error;
LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) {
unsigned char attrnamelen;
if (uele->uele_attrnamespace != attrnamespace)
continue;
error = ufs_extattr_get_header(vp, uele, &ueh, NULL);
if (error == ENODATA)
continue;
if (error != 0)
return error;
/*
* Don't need to get a lock on the backing file if
* the listattr is being applied to the backing file,
* as the lock is already held.
*/
if (uele->uele_backing_vnode != vp)
vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY);
/*
* +1 for trailing NUL (listxattr flavor)
* or leading name length (extattr_list_file flavor)
*/
attrnamelen = strlen(uele->uele_attrname);
listsize += attrnamelen + 1;
/* Return data if the caller requested it. */
if (uio != NULL) {
/*
* We support two flavors. Either NUL-terminated
* strings (a la listxattr), or non NUL-terminated,
* one byte length prefixed strings (for
* extattr_list_file). EXTATTR_LIST_LENPREFIX switches
* that second behavior.
*/
if (flag & EXTATTR_LIST_LENPREFIX) {
uint8_t len = (uint8_t)attrnamelen;
/* Copy leading name length */
error = uiomove(&len, sizeof(len), uio);
if (error != 0)
break;
} else {
/* Include trailing NULL */
attrnamelen++;
}
error = uiomove(uele->uele_attrname,
(size_t)attrnamelen, uio);
if (error != 0)
break;
}
if (uele->uele_backing_vnode != vp)
VOP_UNLOCK(uele->uele_backing_vnode);
if (error != 0)
return error;
}
if (uio != NULL)
uio->uio_offset = 0;
/* Return full data size if caller requested it. */
if (size != NULL)
*size = listsize;
return 0;
}
/*
* Vnode operation to remove a named attribute.
*/
int
ufs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Vnode operation to set a named attribute.
*/
int
ufs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
/*
* XXX: No longer a supported way to delete extended attributes.
*/
if (ap->a_uio == NULL) {
ufs_extattr_uepm_unlock(ump);
return EINVAL;
}
error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_uio, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with setting a vnode's extended attributes;
* assumes that the attribute lock has already been grabbed.
*/
static int
ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name,
struct uio *uio, kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct iovec local_aiov;
struct uio local_aio;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct inode *ip = VTOI(vp);
off_t base_offset;
int error = 0, ioflag;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
if (!ufs_extattr_valid_attrname(attrnamespace, name))
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VWRITE);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute) {
error = ufs_extattr_autocreate_attr(vp, attrnamespace,
name, l, &attribute);
if (error == EEXIST) {
/* Another thread raced us for backend creation */
error = 0;
attribute =
ufs_extattr_find_attr(ump, attrnamespace, name);
}
if (error || !attribute)
return ENODATA;
}
/*
* Early rejection of invalid offsets/length.
* Reject: any offset but 0 (replace)
* Any size greater than attribute size limit
*/
if (uio->uio_offset != 0 ||
uio->uio_resid > attribute->uele_fileheader.uef_size)
return ENXIO;
/*
* Find base offset of header in file based on file header size, and
* data header size + maximum data size, indexed by inode number.
*/
base_offset = sizeof(struct ufs_extattr_fileheader) +
ip->i_number * (sizeof(struct ufs_extattr_header) +
attribute->uele_fileheader.uef_size);
/*
* Write out a data header for the data.
*/
ueh.ueh_len = ufs_rw32((uint32_t) uio->uio_resid,
UELE_NEEDSWAP(attribute));
ueh.ueh_flags = ufs_rw32(UFS_EXTATTR_ATTR_FLAG_INUSE,
UELE_NEEDSWAP(attribute));
ueh.ueh_i_gen = ufs_rw32(ip->i_gen, UELE_NEEDSWAP(attribute));
local_aiov.iov_base = &ueh;
local_aiov.iov_len = sizeof(struct ufs_extattr_header);
local_aio.uio_iov = &local_aiov;
local_aio.uio_iovcnt = 1;
local_aio.uio_rw = UIO_WRITE;
local_aio.uio_offset = base_offset;
local_aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&local_aio);
/*
* Don't need to get a lock on the backing file if the setattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode,
LK_EXCLUSIVE | LK_RETRY);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
if (local_aio.uio_resid != 0) {
error = ENXIO;
goto vopunlock_exit;
}
/*
* Write out user data.
* XXX NOT ATOMIC WITH RESPECT TO THE HEADER.
*/
uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag,
ump->um_extattr.uepm_ucred);
vopunlock_exit:
uio->uio_offset = 0;
if (attribute->uele_backing_vnode != vp)
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Real work associated with removing an extended attribute from a vnode.
* Assumes the attribute lock has already been grabbed.
*/
static int
ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name,
kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct iovec local_aiov;
struct uio local_aio;
off_t base_offset;
int error = 0, ioflag;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
if (!ufs_extattr_valid_attrname(attrnamespace, name))
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VWRITE);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute)
return ENODATA;
/*
* Don't need to get a lock on the backing file if the getattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY);
error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
if (error)
goto vopunlock_exit;
/* Flag it as not in use. */
ueh.ueh_flags = 0; /* No need to byte swap 0 */
ueh.ueh_len = 0; /* ...ditto... */
local_aiov.iov_base = &ueh;
local_aiov.iov_len = sizeof(struct ufs_extattr_header);
local_aio.uio_iov = &local_aiov;
local_aio.uio_iovcnt = 1;
local_aio.uio_rw = UIO_WRITE;
local_aio.uio_offset = base_offset;
local_aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&local_aio);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
if (local_aio.uio_resid != 0)
error = ENXIO;
vopunlock_exit:
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Called by UFS when an inode is no longer active and should have its
* attributes stripped.
*/
void
ufs_extattr_vnode_inactive(struct vnode *vp, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
	/*
	 * If the per-mount EA state is not yet initialized, we cannot take
	 * the lock.  We should not have any active vnodes on the fs if it
	 * is not yet initialized but is going to be, so these checks can
	 * go unlocked.
	 */
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
return;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return;
	ufs_extattr_uepm_lock(ump);
	LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries)
ufs_extattr_rm(vp, uele->uele_attrnamespace,
uele->uele_attrname, lwp0.l_cred, l);
ufs_extattr_uepm_unlock(ump);
}
void
ufs_extattr_init(void)
{
}
void
ufs_extattr_done(void)
{
}
/* $NetBSD: clockctl.c,v 1.39 2022/03/28 12:33:20 riastradh Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Emmanuel Dreyfus.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clockctl.c,v 1.39 2022/03/28 12:33:20 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/time.h>
#include <sys/conf.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <sys/clockctl.h>
#include <compat/sys/clockctl.h>
#include <compat/sys/time_types.h>
kmutex_t clockctl_mtx;
int clockctl_refcnt;
#include "ioconf.h"
dev_type_ioctl(clockctlioctl);
const struct cdevsw clockctl_cdevsw = {
.d_open = clockctlopen,
.d_close = clockctlclose,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = clockctlioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER,
};
static kauth_listener_t clockctl_listener;
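/*
 * kauth(9) listener: allow KAUTH_REQ_SYSTEM_TIME_SYSTEM requests that
 * arrive through the clockctl device (arg3 != NULL), since access to
 * the device node is already governed by its file permissions; defer
 * all other requests to other listeners.
 */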
static int
clockctl_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_system_req req;
bool device_context;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
if ((action != KAUTH_SYSTEM_TIME) ||
(req != KAUTH_REQ_SYSTEM_TIME_SYSTEM))
return result;
device_context = arg3 != NULL;
/* Device is controlled by permissions, so allow. */
if (device_context)
result = KAUTH_RESULT_ALLOW;
return result;
}
/*ARGSUSED*/
void
clockctlattach(int num)
{
/*
* Don't initialize the listener here - it will get handled as part
* of module initialization.
*/
#if 0
clockctl_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
clockctl_listener_cb, NULL);
#endif
}
/*
* Maintain a refcount for each open/close, so we know when it is
* safe to call devsw_detach()
*/
int
clockctlopen(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_enter(&clockctl_mtx);
clockctl_refcnt++;
mutex_exit(&clockctl_mtx);
return 0;
}
int
clockctlclose(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_enter(&clockctl_mtx);
clockctl_refcnt--;
mutex_exit(&clockctl_mtx);
return 0;
}
MODULE(MODULE_CLASS_DRIVER, clockctl, NULL);
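/*
 * Module control: on initialization, register the kauth listener and
 * (when built as a module) attach the device switch; refuse to unload
 * while the device is still open.
 */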
int
clockctl_modcmd(modcmd_t cmd, void *data)
{
int error;
#ifdef _MODULE
int bmajor, cmajor;
#endif
error = 0;
switch (cmd) {
case MODULE_CMD_INIT:
mutex_init(&clockctl_mtx, MUTEX_DEFAULT, IPL_NONE);
clockctl_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
clockctl_listener_cb, NULL);
#ifdef _MODULE
bmajor = cmajor = -1;
error = devsw_attach("clockctl", NULL, &bmajor,
&clockctl_cdevsw, &cmajor);
if (error != 0)
kauth_unlisten_scope(clockctl_listener);
#endif
break;
case MODULE_CMD_FINI:
mutex_enter(&clockctl_mtx);
if (clockctl_refcnt != 0) {
mutex_exit(&clockctl_mtx);
return EBUSY;
}
#ifdef _MODULE
devsw_detach(NULL, &clockctl_cdevsw);
#endif
mutex_exit(&clockctl_mtx);
kauth_unlisten_scope(clockctl_listener);
mutex_destroy(&clockctl_mtx);
break;
default:
error = ENOTTY;
break;
}
return error;
}
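/*
 * clockctl ioctl handler: dispatch each command to the same kernel
 * back end as the corresponding system call (settimeofday1(),
 * adjtime1(), clock_settime1(), and the NTP adjtime hooks); unknown
 * commands are passed to the clockctl_ioctl_50 compat hook.
 */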
int
clockctlioctl(
dev_t dev,
u_long cmd,
void *data,
int flags,
struct lwp *l)
{
int error = 0;
switch (cmd) {
case CLOCKCTL_SETTIMEOFDAY: {
struct clockctl_settimeofday *args = data;
error = settimeofday1(args->tv, true, args->tzp, l, false);
break;
}
case CLOCKCTL_ADJTIME: {
struct timeval atv, oldatv;
struct clockctl_adjtime *args = data;
if (args->delta) {
error = copyin(args->delta, &atv, sizeof(atv));
if (error)
return (error);
}
adjtime1(args->delta ? &atv : NULL,
args->olddelta ? &oldatv : NULL, l->l_proc);
		if (args->olddelta)
			error = copyout(&oldatv, args->olddelta,
			    sizeof(oldatv));
break;
}
case CLOCKCTL_CLOCK_SETTIME: {
struct clockctl_clock_settime *args = data;
struct timespec ts;
error = copyin(args->tp, &ts, sizeof ts);
if (error)
return (error);
error = clock_settime1(l->l_proc, args->clock_id, &ts, false);
break;
}
case CLOCKCTL_NTP_ADJTIME: {
struct clockctl_ntp_adjtime *args = data;
struct timex ntv;
if (vec_ntp_timestatus == NULL) {
error = ENOTTY;
break;
}
error = copyin(args->tp, &ntv, sizeof(ntv));
if (error)
return (error);
(*vec_ntp_adjtime1)(&ntv);
error = copyout(&ntv, args->tp, sizeof(ntv));
		if (error == 0)
			args->retval = (*vec_ntp_timestatus)();
break;
}
default:
MODULE_HOOK_CALL(clockctl_ioctl_50_hook,
(dev, cmd, data, flags, l), enosys(), error);
if (error == ENOSYS)
error = ENOTTY;
}
return (error);
}
/* $NetBSD: lfs_vfsops.c,v 1.382 2022/03/19 13:53:33 hannken Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Konrad E. Schroder <perseant@hhhh.org>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1989, 1991, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.382 2022/03/19 13:53:33 hannken Exp $");
#if defined(_KERNEL_OPT)
#include "opt_lfs.h"
#include "opt_quota.h"
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kthread.h>
#include <sys/buf.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/disklabel.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/syscallvar.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/lfs/ulfs_quotacommon.h>
#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_bswap.h>
#include <ufs/lfs/ulfs_extern.h>
#ifdef UVMHIST
#include <uvm/uvm.h>
#endif
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_stat.h>
#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
MODULE(MODULE_CLASS_VFS, lfs, NULL);
static int lfs_gop_write(struct vnode *, struct vm_page **, int, int);
static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *);
static int lfs_flushfiles(struct mount *, int);
extern const struct vnodeopv_desc lfs_vnodeop_opv_desc;
extern const struct vnodeopv_desc lfs_specop_opv_desc;
extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
struct lwp * lfs_writer_daemon = NULL;
kcondvar_t lfs_writerd_cv;
int lfs_do_flush = 0;
#ifdef LFS_KERNEL_RFW
int lfs_do_rfw = 0;
#endif
const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = {
&lfs_vnodeop_opv_desc,
&lfs_specop_opv_desc,
&lfs_fifoop_opv_desc,
NULL,
};
struct vfsops lfs_vfsops = {
.vfs_name = MOUNT_LFS,
.vfs_min_mount_data = sizeof (struct ulfs_args),
.vfs_mount = lfs_mount,
.vfs_start = ulfs_start,
.vfs_unmount = lfs_unmount,
.vfs_root = ulfs_root,
.vfs_quotactl = ulfs_quotactl,
.vfs_statvfs = lfs_statvfs,
.vfs_sync = lfs_sync,
.vfs_vget = lfs_vget,
.vfs_loadvnode = lfs_loadvnode,
.vfs_newvnode = lfs_newvnode,
.vfs_fhtovp = lfs_fhtovp,
.vfs_vptofh = lfs_vptofh,
.vfs_init = lfs_init,
.vfs_reinit = lfs_reinit,
.vfs_done = lfs_done,
.vfs_mountroot = lfs_mountroot,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = lfs_extattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = lfs_vnodeopv_descs
};
const struct genfs_ops lfs_genfsops = {
.gop_size = lfs_gop_size,
.gop_alloc = ulfs_gop_alloc,
.gop_write = lfs_gop_write,
.gop_markupdate = ulfs_gop_markupdate,
.gop_putrange = genfs_gop_putrange,
};
struct shortlong {
const char *sname;
const char *lname;
};
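/*
 * Sysctl helper for the LFS "dostats" knob: when statistics collection
 * is switched off, also clear the accumulated counters.
 */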
static int
sysctl_lfs_dostats(SYSCTLFN_ARGS)
{
extern struct lfs_stats lfs_stats;
extern int lfs_dostats;
int error;
error = sysctl_lookup(SYSCTLFN_CALL(rnode));
if (error || newp == NULL)
return (error);
if (lfs_dostats == 0)
memset(&lfs_stats, 0, sizeof(lfs_stats));
return (0);
}
SYSCTL_SETUP(lfs_sysctl_setup, "lfs sysctl")
{
int i;
extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead,
lfs_fs_pagetrip, lfs_ignore_lazy_sync;
#ifdef DEBUG
extern int lfs_debug_log_subsys[DLOG_MAX];
struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! */
{ "rollforward", "Debug roll-forward code" },
{ "alloc", "Debug inode allocation and free list" },
{ "avail", "Debug space-available-now accounting" },
{ "flush", "Debug flush triggers" },
{ "lockedlist", "Debug locked list accounting" },
{ "vnode_verbose", "Verbose per-vnode-written debugging" },
{ "vnode", "Debug vnode use during segment write" },
{ "segment", "Debug segment writing" },
{ "seguse", "Debug segment used-bytes accounting" },
{ "cleaner", "Debug cleaning routines" },
{ "mount", "Debug mount/unmount routines" },
{ "pagecache", "Debug UBC interactions" },
{ "dirop", "Debug directory-operation accounting" },
{ "malloc", "Debug private malloc accounting" },
};
#endif /* DEBUG */
struct shortlong stat_names[] = { /* Must match lfs.h! */
{ "segsused", "Number of new segments allocated" },
{ "psegwrites", "Number of partial-segment writes" },
{ "psyncwrites", "Number of synchronous partial-segment"
" writes" },
{ "pcleanwrites", "Number of partial-segment writes by the"
" cleaner" },
{ "blocktot", "Number of blocks written" },
{ "cleanblocks", "Number of blocks written by the cleaner" },
{ "ncheckpoints", "Number of checkpoints made" },
{ "nwrites", "Number of whole writes" },
{ "nsync_writes", "Number of synchronous writes" },
{ "wait_exceeded", "Number of times writer waited for"
" cleaner" },
{ "write_exceeded", "Number of times writer invoked flush" },
{ "flush_invoked", "Number of times flush was invoked" },
{ "vflush_invoked", "Number of time vflush was called" },
{ "clean_inlocked", "Number of vnodes skipped for being dead" },
{ "clean_vnlocked", "Number of vnodes skipped for vget failure" },
{ "segs_reclaimed", "Number of segments reclaimed" },
};
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "lfs",
SYSCTL_DESCR("Log-structured file system"),
NULL, 0, NULL, 0,
CTL_VFS, 5, CTL_EOL);
/*
* XXX the "5" above could be dynamic, thereby eliminating one
* more instance of the "number to vfs" mapping problem, but
* "5" is the order as taken from sys/mount.h
*/
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "flushindir", NULL,
NULL, 0, &lfs_writeindir, 0,
CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "clean_vnhead", NULL,
NULL, 0, &lfs_clean_vnhead, 0,
CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "dostats",
SYSCTL_DESCR("Maintain statistics on LFS operations"),
sysctl_lfs_dostats, 0, &lfs_dostats, 0,
CTL_VFS, 5, LFS_DOSTATS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "pagetrip",
SYSCTL_DESCR("How many dirty pages in fs triggers"
" a flush"),
NULL, 0, &lfs_fs_pagetrip, 0,
CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "ignore_lazy_sync",
SYSCTL_DESCR("Lazy Sync is ignored entirely"),
NULL, 0, &lfs_ignore_lazy_sync, 0,
CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL);
#ifdef LFS_KERNEL_RFW
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "rfw",
SYSCTL_DESCR("Use in-kernel roll-forward on mount"),
NULL, 0, &lfs_do_rfw, 0,
CTL_VFS, 5, LFS_DO_RFW, CTL_EOL);
#endif
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "stats",
SYSCTL_DESCR("Debugging options"),
NULL, 0, NULL, 0,
CTL_VFS, 5, LFS_STATS, CTL_EOL);
for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, stat_names[i].sname,
SYSCTL_DESCR(stat_names[i].lname),
NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]),
0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL);
}
#ifdef DEBUG
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "debug",
SYSCTL_DESCR("Debugging options"),
NULL, 0, NULL, 0,
CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL);
for (i = 0; i < DLOG_MAX; i++) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, dlog_names[i].sname,
SYSCTL_DESCR(dlog_names[i].lname),
NULL, 0, &(lfs_debug_log_subsys[i]), 0,
CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL);
}
#endif
}
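/*
 * Illustrative sketch only (not compiled and not part of this module):
 * the nodes created above appear under vfs.lfs.*, so a hypothetical
 * userland program could toggle statistics collection and read one of
 * the integer knobs with sysctlbyname(3), roughly as follows.
 */
#if 0
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
int dostats = 1, flushindir;
size_t len = sizeof(flushindir);

/* Enable statistics collection, i.e. "sysctl -w vfs.lfs.dostats=1". */
if (sysctlbyname("vfs.lfs.dostats", NULL, NULL,
    &dostats, sizeof(dostats)) == -1)
err(1, "set vfs.lfs.dostats");

/* Read back the "flushindir" knob created above. */
if (sysctlbyname("vfs.lfs.flushindir", &flushindir, &len,
    NULL, 0) == -1)
err(1, "get vfs.lfs.flushindir");
printf("vfs.lfs.flushindir = %d\n", flushindir);
return 0;
}
#endif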
/* Old cleaner syscall interface; see VOP_FCNTL(). */
static const struct syscall_package lfs_syscalls[] = {
{ SYS_lfs_bmapv, 0, (sy_call_t *)sys_lfs_bmapv },
{ SYS_lfs_markv, 0, (sy_call_t *)sys_lfs_markv },
{ SYS___lfs_segwait50, 0, (sy_call_t *)sys___lfs_segwait50 },
{ SYS_lfs_segclean, 0, (sy_call_t *)sys_lfs_segclean },
{ 0, 0, NULL },
};
static int
lfs_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = syscall_establish(NULL, lfs_syscalls);
if (error)
return error;
error = vfs_attach(&lfs_vfsops);
if (error != 0) {
syscall_disestablish(NULL, lfs_syscalls);
break;
}
cv_init(&lfs_allclean_wakeup, "segment");
break;
case MODULE_CMD_FINI:
error = vfs_detach(&lfs_vfsops);
if (error != 0)
break;
syscall_disestablish(NULL, lfs_syscalls);
cv_destroy(&lfs_allclean_wakeup);
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*
* XXX Same structure as FFS inodes? Should we share a common pool?
*/
struct pool lfs_inode_pool;
struct pool lfs_dinode_pool;
struct pool lfs_inoext_pool;
struct pool lfs_lbnentry_pool;
/*
* The writer daemon. UVM keeps track of how many dirty pages we are holding
* in lfs_subsys_pages; the daemon flushes the filesystem when this value
* crosses the (user-defined) threshold LFS_MAX_PAGES.
*/
static void
lfs_writerd(void *arg)
{
mount_iterator_t *iter;
struct mount *mp;
struct lfs *fs;
struct vfsops *vfs = NULL;
int fsflags;
int lfsc;
int wrote_something = 0;
mutex_enter(&lfs_lock);
KASSERTMSG(lfs_writer_daemon == NULL, "more than one LFS writer daemon");
lfs_writer_daemon = curlwp;
mutex_exit(&lfs_lock);
/* Take an extra reference to the LFS vfsops. */
vfs = vfs_getopsbyname(MOUNT_LFS);
mutex_enter(&lfs_lock);
for (;;) {
KASSERT(mutex_owned(&lfs_lock));
if (wrote_something == 0)
cv_timedwait(&lfs_writerd_cv, &lfs_lock, hz/10 + 1);
KASSERT(mutex_owned(&lfs_lock));
wrote_something = 0;
/*
* If global state wants a flush, flush everything.
*/
if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
locked_queue_bytes > LFS_MAX_BYTES ||
lfs_subsys_pages > LFS_MAX_PAGES) {
if (lfs_do_flush) {
DLOG((DLOG_FLUSH, "lfs_writerd: lfs_do_flush\n"));
}
if (locked_queue_count > LFS_MAX_BUFS) {
DLOG((DLOG_FLUSH, "lfs_writerd: lqc = %d, max %d\n",
locked_queue_count, LFS_MAX_BUFS));
}
if (locked_queue_bytes > LFS_MAX_BYTES) {
DLOG((DLOG_FLUSH, "lfs_writerd: lqb = %ld, max %ld\n",
locked_queue_bytes, LFS_MAX_BYTES));
}
if (lfs_subsys_pages > LFS_MAX_PAGES) {
DLOG((DLOG_FLUSH, "lfs_writerd: lssp = %d, max %d\n",
lfs_subsys_pages, LFS_MAX_PAGES));
}
lfs_flush(NULL, SEGM_WRITERD, 0);
lfs_do_flush = 0;
KASSERT(mutex_owned(&lfs_lock));
continue;
}
KASSERT(mutex_owned(&lfs_lock));
mutex_exit(&lfs_lock);
/*
* Look through the list of LFSs to see if any of them
* have requested pageouts.
*/
mountlist_iterator_init(&iter);
lfsc = 0;
while ((mp = mountlist_iterator_next(iter)) != NULL) {
KASSERT(!mutex_owned(&lfs_lock));
if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
sizeof(mp->mnt_stat.f_fstypename)) == 0) {
++lfsc;
fs = VFSTOULFS(mp)->um_lfs;
daddr_t ooffset = 0;
fsflags = SEGM_SINGLE;
mutex_enter(&lfs_lock);
ooffset = lfs_sb_getoffset(fs);
if (lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs) && fs->lfs_nowrap) {
/* Don't try to write if we're suspended */
mutex_exit(&lfs_lock);
continue;
}
if (LFS_STARVED_FOR_SEGS(fs)) {
mutex_exit(&lfs_lock);
DLOG((DLOG_FLUSH, "lfs_writerd: need cleaning before writing possible\n"));
lfs_wakeup_cleaner(fs);
continue;
}
if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP) &&
fs->lfs_dirops == 0) {
fsflags &= ~SEGM_SINGLE;
fsflags |= SEGM_CKP;
DLOG((DLOG_FLUSH, "lfs_writerd: checkpoint\n"));
lfs_flush_fs(fs, fsflags);
} else if (fs->lfs_pdflush) {
DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
lfs_flush_fs(fs, fsflags);
} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "wrdirop");
lfs_flush_pchain(fs);
lfs_writer_leave(fs);
mutex_enter(&lfs_lock);
}
if (lfs_sb_getoffset(fs) != ooffset)
++wrote_something;
mutex_exit(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
}
if (lfsc == 0) {
mutex_enter(&lfs_lock);
lfs_writer_daemon = NULL;
mutex_exit(&lfs_lock);
mountlist_iterator_destroy(iter);
break;
}
mountlist_iterator_destroy(iter);
mutex_enter(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
/* Give up our extra reference so the module can be unloaded. */
mutex_enter(&vfs_list_lock);
if (vfs != NULL)
vfs->vfs_refcount--;
mutex_exit(&vfs_list_lock);
/* Done! */
kthread_exit(0);
}
/*
* Initialize the filesystem, most work done by ulfs_init.
*/
void
lfs_init(void)
{
/*
* XXX: should we use separate pools for 32-bit and 64-bit
* dinodes?
*/
malloc_type_attach(M_SEGMENT);
pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0,
"lfsinopl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_dinode_pool, sizeof(union lfs_dinode), 0, 0, 0,
"lfsdinopl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0,
"lfsinoextpl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0,
"lfslbnpool", &pool_allocator_nointr, IPL_NONE);
ulfs_init();
#ifdef DEBUG
memset(lfs_log, 0, sizeof(lfs_log));
#endif
mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&lfs_writerd_cv, "lfswrite");
cv_init(&locked_queue_cv, "lfsbuf");
cv_init(&lfs_writing_cv, "lfsflush");
}
void
lfs_reinit(void)
{
ulfs_reinit();
}
void
lfs_done(void)
{
ulfs_done();
mutex_destroy(&lfs_lock);
cv_destroy(&lfs_writerd_cv);
cv_destroy(&locked_queue_cv);
cv_destroy(&lfs_writing_cv);
pool_destroy(&lfs_inode_pool);
pool_destroy(&lfs_dinode_pool);
pool_destroy(&lfs_inoext_pool);
pool_destroy(&lfs_lbnentry_pool);
malloc_type_detach(M_SEGMENT);
}
/*
* Called by main() when ulfs is going to be mounted as root.
*/
int
lfs_mountroot(void)
{
extern struct vnode *rootvp;
struct lfs *fs = NULL; /* LFS */
struct mount *mp;
struct lwp *l = curlwp;
struct ulfsmount *ump;
int error;
if (device_class(root_device) != DV_DISK)
return (ENODEV);
if (rootdev == NODEV)
return (ENODEV);
if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) {
vrele(rootvp);
return (error);
}
if ((error = lfs_mountfs(rootvp, mp, l))) {
vfs_unbusy(mp);
vfs_rele(mp);
return (error);
}
mountlist_append(mp);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname);
(void)lfs_statvfs(mp, &mp->mnt_stat);
vfs_unbusy(mp);
setrootfstime((time_t)lfs_sb_gettstamp(VFSTOULFS(mp)->um_lfs));
return (0);
}
/*
* VFS Operations.
*
* mount system call
*/
int
lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
struct vnode *devvp;
struct ulfs_args *args = data;
struct ulfsmount *ump = NULL;
struct lfs *fs = NULL; /* LFS */
int error = 0, update;
mode_t accessmode;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
ump = VFSTOULFS(mp);
if (ump == NULL)
return EIO;
args->fspec = NULL;
*data_len = sizeof *args;
return 0;
}
update = mp->mnt_flag & MNT_UPDATE;
/* Check arguments */
if (args->fspec != NULL) {
/*
* Look up the name and verify that it's sane.
*/
error = namei_simple_user(args->fspec,
NSM_FOLLOW_NOEMULROOT, &devvp);
if (error != 0)
return (error);
if (!update) {
/*
* Be sure this is a valid block device
*/
if (devvp->v_type != VBLK)
error = ENOTBLK;
else if (bdevsw_lookup(devvp->v_rdev) == NULL)
error = ENXIO;
} else {
/*
* Be sure we're still naming the same device
* used for our initial mount
*
* XXX dholland 20151010: if namei gives us a
* different vnode for the same device,
* wouldn't it be better to use it going
* forward rather than ignore it in favor of
* the old one?
*/
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
if (devvp != fs->lfs_devvp) {
if (devvp->v_rdev != fs->lfs_devvp->v_rdev)
error = EINVAL;
else {
vrele(devvp);
devvp = fs->lfs_devvp;
vref(devvp);
}
}
}
} else {
if (!update) {
/* New mounts must have a filename for the device */
return (EINVAL);
} else {
/* Use the extant mount */
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
devvp = fs->lfs_devvp;
vref(devvp);
}
}
/*
* If mount by non-root, then verify that user has necessary
* permissions on the device.
*/
if (error == 0) {
accessmode = VREAD;
if (update ?
(mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
(mp->mnt_flag & MNT_RDONLY) == 0)
accessmode |= VWRITE;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp,
KAUTH_ARG(accessmode));
VOP_UNLOCK(devvp);
}
if (error) {
vrele(devvp);
return (error);
}
if (!update) {
int flags;
if (mp->mnt_flag & MNT_RDONLY)
flags = FREAD;
else
flags = FREAD|FWRITE;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(devvp, flags, FSCRED);
VOP_UNLOCK(devvp);
if (error)
goto fail;
error = lfs_mountfs(devvp, mp, l); /* LFS */
if (error) {
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
(void)VOP_CLOSE(devvp, flags, NOCRED);
VOP_UNLOCK(devvp);
goto fail;
}
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
} else {
/*
* Update the mount.
*/
/*
* The initial mount got a reference on this
* device, so drop the one obtained via
* namei(), above.
*/
vrele(devvp);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
if (!fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDONLY)) {
/*
* Changing from read/write to read-only.
*/
int flags = WRITECLOSE;
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
error = lfs_flushfiles(mp, flags);
if (error)
return error;
fs->lfs_ronly = 1;
} else if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
/*
* Changing from read-only to read/write.
* Note in the superblocks that we're writing.
*/
/* XXX: quotas should have been on even if readonly */
if (fs->lfs_use_quota2) {
#ifdef LFS_QUOTA2
error = lfs_quota2_mount(mp);
#else
uprintf("%s: no kernel support for this "
"filesystem's quotas\n",
mp->mnt_stat.f_mntonname);
if (mp->mnt_flag & MNT_FORCE) {
uprintf("%s: mounting anyway; "
"fsck afterwards\n",
mp->mnt_stat.f_mntonname);
} else {
error = EINVAL;
}
#endif
if (error) {
return error;
}
}
fs->lfs_ronly = 0;
if (lfs_sb_getpflags(fs) & LFS_PF_CLEAN) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN);
lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
}
if (args->fspec == NULL)
return 0;
}
error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error == 0)
lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname);
return error;
fail:
vrele(devvp);
return (error);
}
/*
* Helper for mountfs. Note that the fs pointer may be a dummy one
* pointing into a superblock buffer. (Which is gross; see below.)
*/
static int
lfs_checkmagic(struct lfs *fs)
{
switch (fs->lfs_dlfs_u.u_32.dlfs_magic) {
case LFS_MAGIC:
fs->lfs_is64 = false;
fs->lfs_dobyteswap = false;
break;
case LFS64_MAGIC:
fs->lfs_is64 = true;
fs->lfs_dobyteswap = false;
break;
#ifdef LFS_EI
case LFS_MAGIC_SWAPPED:
fs->lfs_is64 = false;
fs->lfs_dobyteswap = true;
break;
case LFS64_MAGIC_SWAPPED:
fs->lfs_is64 = true;
fs->lfs_dobyteswap = true;
break;
#endif
default:
/* XXX needs translation */
return EINVAL;
}
return 0;
}
/*
* Common code for mount and mountroot
* LFS specific
*/
int
lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
{
struct lfs *primarysb, *altsb, *thesb;
struct buf *primarybuf, *altbuf;
struct lfs *fs;
struct ulfsmount *ump;
struct vnode *vp;
dev_t dev;
int error, i, ronly, fsbsize;
kauth_cred_t cred;
CLEANERINFO *cip;
SEGUSE *sup;
daddr_t sb_addr;
ino_t *orphan;
size_t norphan;
cred = l ? l->l_cred : NOCRED;
/* The superblock is supposed to be 512 bytes. */
__CTASSERT(sizeof(struct dlfs) == DEV_BSIZE);
/*
* Flush out any old buffers remaining from a previous use.
*/
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
VOP_UNLOCK(devvp);
if (error)
return (error);
ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
/* Don't free random space on error. */
primarybuf = NULL;
altbuf = NULL;
ump = NULL;
sb_addr = LFS_LABELPAD / DEV_BSIZE;
while (1) {
/*
* Read in the superblock.
*
* Note that because LFS_SBPAD is substantially larger
* (8K) than the actual on-disk superblock (512 bytes)
* the buffer contains enough space to be used as a
* whole struct lfs (in-memory superblock) - we do this
* only so we can set and use the is64 and dobyteswap
* members. XXX this is gross and the logic here should
* be reworked.
*/
error = bread(devvp, sb_addr, LFS_SBPAD, 0, &primarybuf);
if (error)
goto out;
primarysb = (struct lfs *)primarybuf->b_data;
/* Check the basics. */
error = lfs_checkmagic(primarysb);
if (error) {
DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock wrong magic\n"));
goto out;
}
if (lfs_sb_getbsize(primarysb) > MAXBSIZE ||
lfs_sb_getversion(primarysb) > LFS_VERSION ||
lfs_sb_getbsize(primarysb) < sizeof(struct dlfs)) {
DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n"));
/* XXX needs translation */
error = EINVAL;
goto out;
}
if (lfs_sb_getinodefmt(primarysb) > LFS_MAXINODEFMT) {
DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n",
lfs_sb_getinodefmt(primarysb)));
error = EINVAL;
goto out;
}
if (lfs_sb_getversion(primarysb) == 1)
fsbsize = DEV_BSIZE;
else {
fsbsize = 1 << lfs_sb_getffshift(primarysb);
/*
* Could be, if the frag size is large enough, that we
* don't have the "real" primary superblock. If that's
* the case, get the real one, and try again.
*/
if (sb_addr != (lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT))) {
DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr"
" 0x%llx is not right, trying 0x%llx\n",
(long long)sb_addr,
(long long)(lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT))));
sb_addr = lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT);
brelse(primarybuf, BC_INVAL);
continue;
}
}
break;
}
/*
* Check the second superblock to see which is newer; then mount
* using the older of the two. This is necessary to ensure that
* the filesystem is valid if it was not unmounted cleanly.
*/
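/*
 * Worked example (illustrative numbers, not taken from any on-disk
 * data): for a v2 filesystem, if the primary superblock carries
 * serial 42 and the alternate carries serial 40, the comparison below
 * selects the alternate (serial 40).  The newer serial may describe a
 * checkpoint whose data never completely reached the disk, so the
 * older superblock is the conservative choice.
 */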
if (lfs_sb_getsboff(primarysb, 1) &&
lfs_sb_getsboff(primarysb, 1) - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize)
{
error = bread(devvp, lfs_sb_getsboff(primarysb, 1) * (fsbsize / DEV_BSIZE),
LFS_SBPAD, 0, &altbuf);
if (error)
goto out;
altsb = (struct lfs *)altbuf->b_data;
/*
* Note: this used to do the sanity check only if the
* timestamp/serial comparison required use of altsb;
* this way is less tolerant, but if altsb is corrupted
* enough that the magic number, version, and blocksize
* are bogus, why would the timestamp or serial fields
* mean anything either? If this kind of thing happens,
* you need to fsck anyway.
*/
error = lfs_checkmagic(altsb);
if (error)
goto out;
/* Check the basics. */
if (lfs_sb_getbsize(altsb) > MAXBSIZE ||
lfs_sb_getversion(altsb) > LFS_VERSION ||
lfs_sb_getbsize(altsb) < sizeof(struct dlfs)) {
DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock"
" sanity failed\n"));
error = EINVAL; /* XXX needs translation */
goto out;
}
if (lfs_sb_getversion(primarysb) == 1) {
/* 1s resolution comparison */
if (lfs_sb_gettstamp(altsb) < lfs_sb_gettstamp(primarysb))
thesb = altsb;
else
thesb = primarysb;
} else {
/* monotonic infinite-resolution comparison */
if (lfs_sb_getserial(altsb) < lfs_sb_getserial(primarysb))
thesb = altsb;
else
thesb = primarysb;
}
} else {
DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock location"
" daddr=0x%x\n", lfs_sb_getsboff(primarysb, 1)));
error = EINVAL;
goto out;
}
/*
* Allocate the mount structure, copy the superblock into it.
* Note that the 32-bit and 64-bit superblocks are the same size.
*/
fs = kmem_zalloc(sizeof(struct lfs), KM_SLEEP);
memcpy(&fs->lfs_dlfs_u.u_32, &thesb->lfs_dlfs_u.u_32,
sizeof(struct dlfs));
fs->lfs_is64 = thesb->lfs_is64;
fs->lfs_dobyteswap = thesb->lfs_dobyteswap;
fs->lfs_hasolddirfmt = false; /* set for real below */
/* Compatibility */
if (lfs_sb_getversion(fs) < 2) {
lfs_sb_setsumsize(fs, LFS_V1_SUMMARY_SIZE);
lfs_sb_setibsize(fs, lfs_sb_getbsize(fs));
lfs_sb_sets0addr(fs, lfs_sb_getsboff(fs, 0));
lfs_sb_settstamp(fs, lfs_sb_getotstamp(fs));
lfs_sb_setfsbtodb(fs, 0);
}
if (lfs_sb_getresvseg(fs) == 0)
lfs_sb_setresvseg(fs, MIN(lfs_sb_getminfreeseg(fs) - 1,
MAX(MIN_RESV_SEGS, lfs_sb_getminfreeseg(fs) / 2 + 1)));
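/*
 * Worked example (hypothetical numbers): with minfreeseg = 20 and
 * MIN_RESV_SEGS no larger than 11, the default above becomes
 * MIN(19, MAX(MIN_RESV_SEGS, 11)) = 11, i.e. roughly half of the
 * minfree segments are reserved, but never all of them.
 */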
/*
* If we aren't going to be able to write meaningfully to this
* filesystem, and were not mounted readonly, bomb out now.
*/
if (lfs_fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) {
DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write,"
" we need BUFPAGES >= %lld\n",
(long long)((bufmem_hiwater / bufmem_lowater) *
LFS_INVERSE_MAX_BYTES(
lfs_fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT)));
kmem_free(fs, sizeof(struct lfs));
error = EFBIG; /* XXX needs translation */
goto out;
}
/* Before rolling forward, lock so vget will sleep for other procs */
if (l != NULL) {
fs->lfs_flags = LFS_NOTYET;
fs->lfs_rfpid = l->l_proc->p_pid;
}
ump = kmem_zalloc(sizeof(*ump), KM_SLEEP);
ump->um_lfs = fs;
ump->um_fstype = fs->lfs_is64 ? ULFS2 : ULFS1;
/* ump->um_cleaner_thread = NULL; */
brelse(primarybuf, BC_INVAL);
brelse(altbuf, BC_INVAL);
primarybuf = NULL;
altbuf = NULL;
/* Set up the I/O information */
fs->lfs_devbsize = DEV_BSIZE;
fs->lfs_iocount = 0;
fs->lfs_diropwait = 0;
fs->lfs_activesb = 0;
lfs_sb_setuinodes(fs, 0);
fs->lfs_ravail = 0;
fs->lfs_favail = 0;
fs->lfs_sbactive = 0;
/* Set up the ifile and lock aflags */
fs->lfs_doifile = 0;
fs->lfs_writer = 0;
fs->lfs_dirops = 0;
fs->lfs_nadirop = 0;
fs->lfs_seglock = 0;
fs->lfs_pdflush = 0;
fs->lfs_sleepers = 0;
fs->lfs_pages = 0;
rw_init(&fs->lfs_fraglock);
rw_init(&fs->lfs_iflock);
cv_init(&fs->lfs_sleeperscv, "lfs_slp");
cv_init(&fs->lfs_diropscv, "lfs_dirop");
cv_init(&fs->lfs_stopcv, "lfsstop");
cv_init(&fs->lfs_nextsegsleep, "segment");
/* Set the file system readonly/modify bits. */
fs->lfs_ronly = ronly;
if (ronly == 0)
fs->lfs_fmod = 1;
/* Device we're using */
dev = devvp->v_rdev;
fs->lfs_dev = dev;
fs->lfs_devvp = devvp;
/* ulfs-level information */
fs->um_flags = 0;
fs->um_bptrtodb = lfs_sb_getffshift(fs) - DEV_BSHIFT;
fs->um_seqinc = lfs_sb_getfrag(fs);
fs->um_nindir = lfs_sb_getnindir(fs);
fs->um_lognindir = ffs(lfs_sb_getnindir(fs)) - 1;
fs->um_maxsymlinklen = lfs_sb_getmaxsymlinklen(fs);
fs->um_dirblksiz = LFS_DIRBLKSIZ;
fs->um_maxfilesize = lfs_sb_getmaxfilesize(fs);
/* quota stuff */
/* XXX: these need to come from the on-disk superblock to be used */
fs->lfs_use_quota2 = 0;
fs->lfs_quota_magic = 0;
fs->lfs_quota_flags = 0;
fs->lfs_quotaino[0] = 0;
fs->lfs_quotaino[1] = 0;
/* Initialize the mount structure. */
mp->mnt_data = ump;
mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS);
mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
mp->mnt_stat.f_namemax = LFS_MAXNAMLEN;
mp->mnt_stat.f_iosize = lfs_sb_getbsize(fs);
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_iflag |= IMNT_SHRLOOKUP;
mp->mnt_fs_bshift = lfs_sb_getbshift(fs);
mp->mnt_iflag |= IMNT_CAN_RWTORO;
if (fs->um_maxsymlinklen > 0)
mp->mnt_iflag |= IMNT_DTYPE;
else
fs->lfs_hasolddirfmt = true;
ump->um_mountp = mp;
for (i = 0; i < ULFS_MAXQUOTAS; i++)
ump->um_quotas[i] = NULLVP;
spec_node_setmountedfs(devvp, mp);
/* Set up reserved memory for pageout */
lfs_setup_resblks(fs);
/* Set up vdirop tailq */
TAILQ_INIT(&fs->lfs_dchainhd);
/* and paging tailq */
TAILQ_INIT(&fs->lfs_pchainhd);
/* and delayed segment accounting for truncation list */
LIST_INIT(&fs->lfs_segdhd);
/*
* We use the ifile vnode for almost every operation. Instead of
* retrieving it from the hash table each time we retrieve it here,
* artificially increment the reference count and keep a pointer
* to it in the incore copy of the superblock.
*/
if ((error = VFS_VGET(mp, LFS_IFILE_INUM, LK_EXCLUSIVE, &vp)) != 0) {
DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error));
goto out;
}
fs->lfs_ivnode = vp;
vref(vp);
/* Set up inode bitmap, order free list, and gather orphans. */
lfs_order_freelist(fs, &orphan, &norphan);
/* Set up segment usage flags for the autocleaner. */
fs->lfs_nactive = 0;
fs->lfs_suflags = malloc(2 * sizeof(u_int32_t *),
M_SEGMENT, M_WAITOK);
fs->lfs_suflags[0] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t),
M_SEGMENT, M_WAITOK);
fs->lfs_suflags[1] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t),
M_SEGMENT, M_WAITOK);
memset(fs->lfs_suflags[1], 0, lfs_sb_getnseg(fs) * sizeof(u_int32_t));
for (i = 0; i < lfs_sb_getnseg(fs); i++) {
int changed;
struct buf *bp;
LFS_SEGENTRY(sup, fs, i, bp);
changed = 0;
if (!ronly) {
if (sup->su_nbytes == 0 &&
!(sup->su_flags & SEGUSE_EMPTY)) {
sup->su_flags |= SEGUSE_EMPTY;
++changed;
} else if (sup->su_nbytes != 0 &&
(sup->su_flags & SEGUSE_EMPTY)) {
sup->su_flags &= ~SEGUSE_EMPTY;
++changed;
}
if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) {
sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL);
++changed;
}
}
fs->lfs_suflags[0][i] = sup->su_flags;
if (changed)
LFS_WRITESEGENTRY(sup, fs, i, bp);
else
brelse(bp, 0);
}
/* Free the orphans we discovered while ordering the freelist. */
lfs_free_orphans(fs, orphan, norphan);
/*
* XXX: if the fs has quotas, quotas should be on even if
* readonly. Otherwise you can't query the quota info!
* However, that's not how the quota2 code got written and I
* don't know if it'll behave itself if enabled while
* readonly, so for now use the same enable logic as ffs.
*
* XXX: also, if you use the -f behavior allowed here (and
* equivalently above for remount) it will corrupt the fs. It
* ought not to allow that. It should allow mounting readonly
* if there are quotas and the kernel doesn't have the quota
* code, but only readonly.
*
* XXX: and if you use the -f behavior allowed here it will
* likely crash at unmount time (or remount time) because we
* think quotas are active.
*
* Although none of this applies until there's a way to set
* lfs_use_quota2 and have quotas in the fs at all.
*/
if (!ronly && fs->lfs_use_quota2) {
#ifdef LFS_QUOTA2
error = lfs_quota2_mount(mp);
#else
uprintf("%s: no kernel support for this filesystem's quotas\n",
mp->mnt_stat.f_mntonname);
if (mp->mnt_flag & MNT_FORCE) {
uprintf("%s: mounting anyway; fsck afterwards\n",
mp->mnt_stat.f_mntonname);
} else {
error = EINVAL;
}
#endif
if (error) {
/* XXX XXX must clean up the stuff immediately above */
printf("lfs_mountfs: sorry, leaking some memory\n");
goto out;
}
}
#ifdef LFS_KERNEL_RFW
lfs_roll_forward(fs, mp, l);
#endif
/* If writing, sb is not clean; record in case of immediate crash */
if (!fs->lfs_ronly) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN);
lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
/* Allow vget now that roll-forward is complete */
fs->lfs_flags &= ~(LFS_NOTYET);
wakeup(&fs->lfs_flags);
/*
* Initialize the ifile cleaner info with information from
* the superblock.
*/
{
struct buf *bp;
LFS_CLEANERINFO(cip, fs, bp);
lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs));
lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs));
lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs));
lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs));
(void) LFS_BWRITE_LOG(bp); /* Ifile */
}
/*
* Mark the current segment as ACTIVE, since we're going to
* be writing to it.
*/
{
struct buf *bp;
LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp);
sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
fs->lfs_nactive++;
LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp); /* Ifile */
}
/* Now that roll-forward is done, unlock the Ifile */
vput(vp);
/* Start the pagedaemon-anticipating daemon */
mutex_enter(&lfs_lock);
if (lfs_writer_daemon == NULL &&
kthread_create(PRI_BIO, 0, NULL,
lfs_writerd, NULL, NULL, "lfs_writer") != 0)
panic("fork lfs_writer");
mutex_exit(&lfs_lock);
printf("WARNING: the log-structured file system is experimental\n"
"WARNING: it may cause system crashes and/or corrupt data\n");
return (0);
out:
if (primarybuf)
brelse(primarybuf, BC_INVAL);
if (altbuf)
brelse(altbuf, BC_INVAL);
if (ump) {
kmem_free(ump->um_lfs, sizeof(struct lfs));
kmem_free(ump, sizeof(*ump));
mp->mnt_data = NULL;
}
return (error);
}
/*
* unmount system call
*/
int
lfs_unmount(struct mount *mp, int mntflags)
{
struct ulfsmount *ump;
struct lfs *fs;
int error, ronly;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
error = lfs_flushfiles(mp, mntflags & MNT_FORCE ? FORCECLOSE : 0);
if (error)
return error;
/* Finish with the Ifile, now that we're done with it */
vgone(fs->lfs_ivnode);
ronly = !fs->lfs_ronly;
if (fs->lfs_devvp->v_type != VBAD)
spec_node_setmountedfs(fs->lfs_devvp, NULL);
vn_lock(fs->lfs_devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_CLOSE(fs->lfs_devvp,
ronly ? FREAD : FREAD|FWRITE, NOCRED);
vput(fs->lfs_devvp);
/* Complain about page leakage */
if (fs->lfs_pages > 0)
printf("lfs_unmount: still claim %d pages (%d in subsystem)\n",
fs->lfs_pages, lfs_subsys_pages);
/* Free per-mount data structures */
free(fs->lfs_ino_bitmap, M_SEGMENT);
free(fs->lfs_suflags[0], M_SEGMENT);
free(fs->lfs_suflags[1], M_SEGMENT);
free(fs->lfs_suflags, M_SEGMENT);
lfs_free_resblks(fs);
cv_destroy(&fs->lfs_sleeperscv);
cv_destroy(&fs->lfs_diropscv);
cv_destroy(&fs->lfs_stopcv);
cv_destroy(&fs->lfs_nextsegsleep);
rw_destroy(&fs->lfs_fraglock);
rw_destroy(&fs->lfs_iflock);
kmem_free(fs, sizeof(struct lfs));
kmem_free(ump, sizeof(*ump));
mp->mnt_data = NULL;
mp->mnt_flag &= ~MNT_LOCAL;
return (error);
}
static int
lfs_flushfiles(struct mount *mp, int flags)
{
struct lwp *l = curlwp;
struct ulfsmount *ump;
struct lfs *fs;
struct vnode *vp;
int error;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
/* Two checkpoints */
if (!fs->lfs_ronly) {
lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
}
/* wake up the cleaner so it can die */
/* XXX: shouldn't this be *after* the error cases below? */
lfs_wakeup_cleaner(fs);
mutex_enter(&lfs_lock);
while (fs->lfs_sleepers)
cv_wait(&fs->lfs_sleeperscv, &lfs_lock);
mutex_exit(&lfs_lock);
#ifdef LFS_EXTATTR
if (ump->um_fstype == ULFS1) {
if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_STARTED) {
ulfs_extattr_stop(mp, curlwp);
}
if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_INITIALIZED) {
ulfs_extattr_uepm_destroy(&ump->um_extattr);
mp->mnt_flag &= ~MNT_EXTATTR;
}
}
#endif
#ifdef LFS_QUOTA
if ((error = lfsquota1_umount(mp, flags)) != 0)
return (error);
#endif
#ifdef LFS_QUOTA2
if ((error = lfsquota2_umount(mp, flags)) != 0)
return (error);
#endif
if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0)
return (error);
if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0)
return (error);
vp = fs->lfs_ivnode;
mutex_enter(vp->v_interlock);
if (LIST_FIRST(&vp->v_dirtyblkhd))
panic("lfs_unmount: still dirty blocks on ifile vnode");
mutex_exit(vp->v_interlock);
/* Explicitly write the superblock, to update serial and pflags */
if (!fs->lfs_ronly) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) | LFS_PF_CLEAN);
lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
mutex_enter(&lfs_lock);
while (fs->lfs_iocount)
mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0,
&lfs_lock);
mutex_exit(&lfs_lock);
return 0;
}
/*
* Get file system statistics.
*
* NB: We don't lock to access the superblock here, because it's not
* really that important if we get it wrong.
*/
int
lfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
struct lfs *fs;
struct ulfsmount *ump;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
sbp->f_bsize = lfs_sb_getbsize(fs);
sbp->f_frsize = lfs_sb_getfsize(fs);
sbp->f_iosize = lfs_sb_getbsize(fs);
sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks;
sbp->f_bfree = LFS_EST_BFREE(fs);
/*
* XXX this should be lfs_sb_getsize (measured in frags)
* rather than dsize (measured in diskblocks). However,
* getsize needs a format version check (for version 1 it
* needs to be blockstofrags'd) so for the moment I'm going to
* leave this... it won't fire wrongly as frags are at least
* as big as diskblocks.
*/
KASSERT(sbp->f_bfree <= lfs_sb_getdsize(fs));
#if 0
if (sbp->f_bfree < 0)
sbp->f_bfree = 0;
#endif
sbp->f_bresvd = LFS_EST_RSVD(fs);
if (sbp->f_bfree > sbp->f_bresvd)
sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
else
sbp->f_bavail = 0;
/* XXX: huh? - dholland 20150728 */
sbp->f_files = lfs_sb_getbfree(fs) / lfs_btofsb(fs, lfs_sb_getibsize(fs))
* LFS_INOPB(fs);
sbp->f_ffree = sbp->f_files - lfs_sb_getnfiles(fs);
sbp->f_favail = sbp->f_ffree;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, mp);
return (0);
}
/*
* Go through the disk queues to initiate sandbagged IO;
* go through the inodes to write those that have been modified;
* initiate the writing of the super block if it has been modified.
*
* Note: we are always called with the filesystem marked `MPBUSY'.
*/
int
lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
{
int error;
struct lfs *fs;
fs = VFSTOULFS(mp)->um_lfs;
if (fs->lfs_ronly)
return 0;
/* Snapshots should not hose the syncer */
/*
* XXX Sync can block here anyway, since we don't have a very
* XXX good idea of how much data is pending. If it's more
* XXX than a segment and lfs_nextseg is close to the end of
* XXX the log, we'll likely block.
*/
mutex_enter(&lfs_lock);
if (fs->lfs_nowrap && lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs)) {
mutex_exit(&lfs_lock);
return 0;
}
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "lfs_dirops");
/* All syncs must be checkpoints until roll-forward is implemented. */
DLOG((DLOG_FLUSH, "lfs_sync at 0x%jx\n",
(uintmax_t)lfs_sb_getoffset(fs)));
error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0));
lfs_writer_leave(fs);
#ifdef LFS_QUOTA
lfs_qsync(mp);
#endif
return (error);
}
/*
* Look up an LFS dinode number to find its incore vnode. If not already
* in core, read it in from the specified device. Return the inode locked.
* Detection and handling of mount points must be done by the calling routine.
*/
int
lfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
int error;
error = vcache_get(mp, &ino, sizeof(ino), vpp);
if (error)
return error;
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
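/*
 * Hypothetical usage sketch (not referenced anywhere in this file):
 * how a caller might fetch an inode by number through lfs_vget() and
 * release it again.  The inode number and the shared lock request are
 * arbitrary choices for illustration.
 */
#if 0
static int
example_vget_ino(struct mount *mp, ino_t ino)
{
struct vnode *vp;
int error;

error = lfs_vget(mp, ino, LK_SHARED, &vp);
if (error)
return error;
/* ... inspect VTOI(vp) here ... */
vput(vp); /* drops both the vnode lock and the reference */
return 0;
}
#endif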
/*
* Create a new vnode/inode pair and initialize what fields we can.
*/
static void
lfs_init_vnode(struct ulfsmount *ump, ino_t ino, struct vnode *vp)
{
struct lfs *fs = ump->um_lfs;
struct inode *ip;
union lfs_dinode *dp;
ASSERT_NO_SEGLOCK(fs);
/* Initialize the inode. */
ip = pool_get(&lfs_inode_pool, PR_WAITOK);
memset(ip, 0, sizeof(*ip));
dp = pool_get(&lfs_dinode_pool, PR_WAITOK);
memset(dp, 0, sizeof(*dp));
ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK);
memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs));
ip->i_din = dp;
ip->i_ump = ump;
ip->i_vnode = vp;
ip->i_dev = fs->lfs_dev;
lfs_dino_setinumber(fs, dp, ino);
ip->i_number = ino;
ip->i_lfs = fs;
ip->i_lfs_effnblks = 0;
SPLAY_INIT(&ip->i_lfs_lbtree);
ip->i_lfs_nbtree = 0;
LIST_INIT(&ip->i_lfs_segdhd);
vp->v_tag = VT_LFS;
vp->v_op = lfs_vnodeop_p;
vp->v_data = ip;
}
/*
* Undo lfs_init_vnode().
*/
static void
lfs_deinit_vnode(struct ulfsmount *ump, struct vnode *vp)
{
struct inode *ip = VTOI(vp);
pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
pool_put(&lfs_dinode_pool, ip->i_din);
pool_put(&lfs_inode_pool, ip);
vp->v_data = NULL;
}
/*
* Read an inode from disk and initialize this vnode / inode pair.
* Caller assures no other thread will try to load this inode.
*/
int
lfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct lfs *fs;
union lfs_dinode *dip;
struct inode *ip;
struct buf *bp;
IFILE *ifp;
struct ulfsmount *ump;
ino_t ino;
daddr_t daddr;
int error, retries;
struct timespec ts;
KASSERT(key_len == sizeof(ino));
memcpy(&ino, key, key_len);
memset(&ts, 0, sizeof ts); /* XXX gcc */
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
/*
* If the filesystem is not completely mounted yet, suspend
* any access requests (wait for roll-forward to complete).
*/
mutex_enter(&lfs_lock);
while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid)
mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0,
&lfs_lock);
mutex_exit(&lfs_lock);
/* Translate the inode number to a disk address. */
if (ino == LFS_IFILE_INUM)
daddr = lfs_sb_getidaddr(fs);
else {
/* XXX bounds-check this too */
LFS_IENTRY(ifp, fs, ino, bp);
daddr = lfs_if_getdaddr(fs, ifp);
if (lfs_sb_getversion(fs) > 1) {
ts.tv_sec = lfs_if_getatime_sec(fs, ifp);
ts.tv_nsec = lfs_if_getatime_nsec(fs, ifp);
}
brelse(bp, 0);
if (daddr == LFS_UNUSED_DADDR)
return (ENOENT);
}
/* Allocate/init new vnode/inode. */
lfs_init_vnode(ump, ino, vp);
ip = VTOI(vp);
/* If the cleaner supplied the inode, use it. */
if (curlwp == fs->lfs_cleaner_thread && fs->lfs_cleaner_hint != NULL &&
fs->lfs_cleaner_hint->bi_lbn == LFS_UNUSED_LBN) {
dip = fs->lfs_cleaner_hint->bi_bp;
if (fs->lfs_is64) {
error = copyin(dip, &ip->i_din->u_64,
sizeof(struct lfs64_dinode));
} else {
error = copyin(dip, &ip->i_din->u_32,
sizeof(struct lfs32_dinode));
}
if (error) {
lfs_deinit_vnode(ump, vp);
return error;
}
KASSERT(ip->i_number == ino);
goto out;
}
/* Read in the disk contents for the inode, copy into the inode. */
retries = 0;
again:
error = bread(fs->lfs_devvp, LFS_FSBTODB(fs, daddr),
(lfs_sb_getversion(fs) == 1 ? lfs_sb_getbsize(fs) : lfs_sb_getibsize(fs)),
0, &bp);
if (error) {
lfs_deinit_vnode(ump, vp);
return error;
}
dip = lfs_ifind(fs, ino, bp);
if (dip == NULL) {
/* Assume write has not completed yet; try again */
brelse(bp, BC_INVAL);
++retries;
if (retries <= LFS_IFIND_RETRIES) {
mutex_enter(&lfs_lock);
if (fs->lfs_iocount) {
DLOG((DLOG_VNODE,
"%s: dinode %d not found, retrying...\n",
__func__, ino));
(void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
"lfs ifind", 1, &lfs_lock);
} else
retries = LFS_IFIND_RETRIES;
mutex_exit(&lfs_lock);
goto again;
}
#ifdef DEBUG
/* If the seglock is held, look at the bpp to see
what is there anyway. */
mutex_enter(&lfs_lock);
if (fs->lfs_seglock > 0) {
struct buf **bpp;
union lfs_dinode *dp;
int i;
for (bpp = fs->lfs_sp->bpp;
bpp != fs->lfs_sp->cbpp; ++bpp) {
if ((*bpp)->b_vp == fs->lfs_ivnode &&
bpp != fs->lfs_sp->bpp) {
/* Inode block */
printf("%s: block 0x%" PRIx64 ": ",
__func__, (*bpp)->b_blkno);
for (i = 0; i < LFS_INOPB(fs); i++) {
dp = DINO_IN_BLOCK(fs,
(*bpp)->b_data, i);
if (lfs_dino_getinumber(fs, dp))
printf("%ju ",
(uintmax_t)lfs_dino_getinumber(fs, dp));
}
printf("\n");
}
}
}
mutex_exit(&lfs_lock);
#endif /* DEBUG */
panic("lfs_loadvnode: dinode not found");
}
lfs_copy_dinode(fs, ip->i_din, dip);
brelse(bp, 0);
out:
if (lfs_sb_getversion(fs) > 1) {
lfs_dino_setatime(fs, ip->i_din, ts.tv_sec);
lfs_dino_setatimensec(fs, ip->i_din, ts.tv_nsec);
}
lfs_vinit(mp, &vp);
*new_key = &ip->i_number;
return 0;
}
/*
* Create a new inode and initialize this vnode / inode pair.
*/
int
lfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
struct vattr *vap, kauth_cred_t cred, void *extra,
size_t *key_len, const void **new_key)
{
ino_t ino;
struct inode *ip;
struct ulfsmount *ump;
struct lfs *fs;
int error, mode, gen;
KASSERT(dvp != NULL || vap->va_fileid > 0);
KASSERT(dvp != NULL && dvp->v_mount == mp);
KASSERT(vap->va_type != VNON);
*key_len = sizeof(ino);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
mode = MAKEIMODE(vap->va_type, vap->va_mode);
/*
* Allocate fresh inode. With "dvp == NULL" take the inode number
* and version from "vap".
*/
if (dvp == NULL) {
ino = vap->va_fileid;
gen = vap->va_gen;
error = lfs_valloc_fixed(fs, ino, gen);
} else {
error = lfs_valloc(dvp, mode, cred, &ino, &gen);
}
if (error)
return error;
/* Attach inode to vnode. */
lfs_init_vnode(ump, ino, vp);
ip = VTOI(vp);
mutex_enter(&lfs_lock);
LFS_SET_UINO(ip, IN_CHANGE);
mutex_exit(&lfs_lock);
/* Note no blocks yet */
ip->i_lfs_hiblk = -1;
/* Set a new generation number for this inode. */
ip->i_gen = gen;
lfs_dino_setgen(fs, ip->i_din, gen);
memset(ip->i_lfs_fragsize, 0,
ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize));
/* Set uid / gid. */
if (cred == NOCRED || cred == FSCRED) {
ip->i_gid = 0;
ip->i_uid = 0;
} else {
ip->i_gid = VTOI(dvp)->i_gid;
ip->i_uid = kauth_cred_geteuid(cred);
}
DIP_ASSIGN(ip, gid, ip->i_gid);
DIP_ASSIGN(ip, uid, ip->i_uid);
#if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
error = lfs_chkiq(ip, 1, cred, 0);
if (error) {
lfs_vfree(dvp, ino, mode);
lfs_deinit_vnode(ump, vp);
return error;
}
#endif
/* Set type and finalize. */
ip->i_flags = 0;
DIP_ASSIGN(ip, flags, 0);
ip->i_mode = mode;
DIP_ASSIGN(ip, mode, mode);
if (vap->va_rdev != VNOVAL) {
/*
* Want to be able to use this to make badblock
* inodes, so don't truncate the dev number.
*/
// XXX clean this up
if (ump->um_fstype == ULFS1)
ip->i_din->u_32.di_rdev = ulfs_rw32(vap->va_rdev,
ULFS_MPNEEDSWAP(fs));
else
ip->i_din->u_64.di_rdev = ulfs_rw64(vap->va_rdev,
ULFS_MPNEEDSWAP(fs));
}
lfs_vinit(mp, &vp);
*new_key = &ip->i_number;
return 0;
}
/*
* File handle to vnode
*/
int
lfs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp)
{
struct lfid lfh;
struct lfs *fs;
if (fhp->fid_len != sizeof(struct lfid))
return EINVAL;
memcpy(&lfh, fhp, sizeof(lfh));
if (lfh.lfid_ino < LFS_IFILE_INUM)
return ESTALE;
fs = VFSTOULFS(mp)->um_lfs;
if (lfh.lfid_ident != lfs_sb_getident(fs))
return ESTALE;
if (lfh.lfid_ino >
((lfs_dino_getsize(fs, VTOI(fs->lfs_ivnode)->i_din) >> lfs_sb_getbshift(fs)) -
lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs))
return ESTALE;
return (ulfs_fhtovp(mp, &lfh.lfid_ufid, lktype, vpp));
}
/*
* Vnode pointer to File handle
*/
/* ARGSUSED */
int
lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
{
struct inode *ip;
struct lfid lfh;
if (*fh_size < sizeof(struct lfid)) {
*fh_size = sizeof(struct lfid);
return E2BIG;
}
*fh_size = sizeof(struct lfid);
ip = VTOI(vp);
memset(&lfh, 0, sizeof(lfh));
lfh.lfid_len = sizeof(struct lfid);
lfh.lfid_ino = ip->i_number;
lfh.lfid_gen = ip->i_gen;
lfh.lfid_ident = lfs_sb_getident(ip->i_lfs);
memcpy(fhp, &lfh, sizeof(lfh));
return (0);
}
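/*
 * Hypothetical round-trip sketch (not used by this module): a handle
 * produced by lfs_vptofh() can later be resolved back to a locked,
 * referenced vnode with lfs_fhtovp(), which is how NFS export support
 * would typically use these two routines.  The local buffer and the
 * cast are illustrative only.
 */
#if 0
static int
example_fh_roundtrip(struct mount *mp, struct vnode *vp)
{
struct lfid lfh;
size_t fh_size = sizeof(lfh);
struct vnode *nvp;
int error;

error = lfs_vptofh(vp, (struct fid *)&lfh, &fh_size);
if (error)
return error;
error = lfs_fhtovp(mp, (struct fid *)&lfh, LK_SHARED, &nvp);
if (error)
return error;
vput(nvp);
return 0;
}
#endif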
/*
* ulfs_bmaparray callback function for writing.
*
* Since blocks will be written to the new segment anyway,
* we don't care about current daddr of them.
*/
static bool
lfs_issequential_hole(const struct lfs *fs,
daddr_t daddr0, daddr_t daddr1)
{
(void)fs; /* only used by KASSERT, which may be compiled out */
KASSERT(daddr0 == UNWRITTEN ||
(0 <= daddr0 && daddr0 <= LFS_MAX_DADDR(fs)));
KASSERT(daddr1 == UNWRITTEN ||
(0 <= daddr1 && daddr1 <= LFS_MAX_DADDR(fs)));
/* NOTE: all we want to know here is 'hole or not'. */
/* NOTE: UNASSIGNED is converted to 0 by ulfs_bmaparray. */
/*
* treat UNWRITTENs and all resident blocks as 'contiguous'
*/
if (daddr0 != 0 && daddr1 != 0)
return true;
/*
* both are in hole?
*/
if (daddr0 == 0 && daddr1 == 0)
return true; /* all holes are 'contiguous' for us. */
return false;
}
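/*
 * Illustrative summary of the callback above (example daddr values):
 *
 *	daddr0		daddr1		result
 *	5		6		true  (both resident)
 *	UNWRITTEN	7		true  (UNWRITTEN counts as resident)
 *	0		0		true  (both in a hole)
 *	5		0		false (resident block next to a hole)
 */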
/*
* lfs_gop_write functions exactly like genfs_gop_write, except that
* (1) it requires the seglock to be held by its caller, and sp->fip
* to be properly initialized (it will return without re-initializing
* sp->fip, and without calling lfs_writeseg).
* (2) it uses the remaining space in the segment, rather than VOP_BMAP,
* to determine how large a block it can write at once (though it does
* still use VOP_BMAP to find holes in the file);
* (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks
* (leaving lfs_writeseg to deal with the cluster blocks, so we might
* now have clusters of clusters, ick.)
*/
static int
lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
int i, error, run, haveeof = 0;
int fs_bshift;
vaddr_t kva;
off_t eof, offset, startoffset = 0;
size_t bytes, iobytes, skipbytes;
bool async = (flags & PGO_SYNCIO) == 0;
daddr_t lbn, blkno;
struct vm_page *pg;
struct buf *mbp, *bp;
struct vnode *devvp = VTOI(vp)->i_devvp;
struct inode *ip = VTOI(vp);
struct lfs *fs = ip->i_lfs;
struct segment *sp = fs->lfs_sp;
SEGSUM *ssp;
UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
const char * failreason = NULL;
ASSERT_SEGLOCK(fs);
/* The Ifile lives in the buffer cache */
KASSERT(vp != fs->lfs_ivnode);
/*
* We don't want to fill the disk before the cleaner has a chance
* to make room for us. If we're in danger of doing that, fail
* with EAGAIN. The caller will have to notice this, unlock
* so the cleaner can run, relock and try again.
*
* We must write everything, however, if our vnode is being
* reclaimed.
*/
mutex_enter(vp->v_interlock);
if (LFS_STARVED_FOR_SEGS(fs) && vdead_check(vp, VDEAD_NOWAIT) == 0) {
mutex_exit(vp->v_interlock);
failreason = "Starved for segs and not flushing vp";
goto tryagain;
}
mutex_exit(vp->v_interlock);
/*
* Sometimes things slip past the filters in lfs_putpages,
* and the pagedaemon tries to write pages---problem is
* that the pagedaemon never acquires the segment lock.
*
* Alternatively, pages that were clean when we called
* genfs_putpages may have become dirty in the meantime. In this
* case the segment header is not properly set up for blocks
* to be added to it.
*
* Unbusy and unclean the pages, and put them on the ACTIVE
* queue under the hypothesis that they couldn't have got here
* unless they were modified *quite* recently.
*
* XXXUBC that last statement is an oversimplification of course.
*/
if (!LFS_SEGLOCK_HELD(fs)) {
failreason = "Seglock not held";
goto tryagain;
}
if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
failreason = "Inode with no_gop_write";
goto tryagain;
}
if ((pgs[0]->offset & lfs_sb_getbmask(fs)) != 0) {
failreason = "Bad page offset";
goto tryagain;
}
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
GOP_SIZE(vp, vp->v_size, &eof, 0);
haveeof = 1;
if (vp->v_type == VREG)
fs_bshift = vp->v_mount->mnt_fs_bshift;
else
fs_bshift = DEV_BSHIFT;
error = 0;
pg = pgs[0];
startoffset = pg->offset;
KASSERT(eof >= 0);
if (startoffset >= eof) {
failreason = "Offset beyond EOF";
goto tryagain;
} else
bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
skipbytes = 0;
KASSERT(bytes != 0);
/* Swap PG_DELWRI for PG_PAGEOUT */
for (i = 0; i < npages; i++) {
if (pgs[i]->flags & PG_DELWRI) {
KASSERT(!(pgs[i]->flags & PG_PAGEOUT));
pgs[i]->flags &= ~PG_DELWRI;
pgs[i]->flags |= PG_PAGEOUT;
uvm_pageout_start(1);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
uvm_pagelock(pgs[i]);
uvm_pageunwire(pgs[i]);
uvm_pageunlock(pgs[i]);
rw_exit(vp->v_uobj.vmobjlock);
}
}
/*
* Check to make sure we're starting on a block boundary.
* We'll check later to make sure we always write entire
* blocks (or fragments).
*/
if (startoffset & lfs_sb_getbmask(fs))
printf("%" PRId64 " & %" PRIu64 " = %" PRId64 "\n",
startoffset, lfs_sb_getbmask(fs),
startoffset & lfs_sb_getbmask(fs));
KASSERT((startoffset & lfs_sb_getbmask(fs)) == 0);
if (bytes & lfs_sb_getffmask(fs)) {
printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes);
panic("lfs_gop_write: non-integer blocks");
}
/*
* We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK.
* If we would, write what we have and try again. If we don't
* have anything to write, we'll have to sleep.
*/
ssp = (SEGSUM *)sp->segsum;
if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
(lfs_ss_getnfinfo(fs, ssp) < 1 ?
UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) {
DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n"));
#if 0
" with nfinfo=%d at offset 0x%jx\n",
(int)lfs_ss_getnfinfo(fs, ssp),
(uintmax_t)lfs_sb_getoffset(fs)));
#endif
lfs_updatemeta(sp);
lfs_release_finfo(fs);
(void) lfs_writeseg(fs, sp);
lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
/*
* Having given up all of the pager_map we were holding,
* we can now wait for aiodoned to reclaim it for us
* without fear of deadlock.
*/
kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
UVMPAGER_MAPIN_WAITOK);
}
mbp = getiobuf(NULL, true);
UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
(uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
mbp->b_bufsize = npages << PAGE_SHIFT;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY|BC_AGE;
mbp->b_iodone = uvm_aio_aiodone;
bp = NULL;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
lbn = offset >> fs_bshift;
error = ulfs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run,
lfs_issequential_hole);
if (error) {
UVMHIST_LOG(ubchist, "ulfs_bmaparray() -> %jd",
error,0,0,0);
skipbytes += bytes;
bytes = 0;
break;
}
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
if (blkno == (daddr_t)-1) {
skipbytes += iobytes;
continue;
}
/*
* Discover how much we can really pack into this buffer.
*/
/* If no room in the current segment, finish it up */
if (sp->sum_bytes_left < sizeof(int32_t) ||
sp->seg_bytes_left < (1 << lfs_sb_getbshift(fs))) {
int vers;
lfs_updatemeta(sp);
vers = lfs_fi_getversion(fs, sp->fip);
lfs_release_finfo(fs);
(void) lfs_writeseg(fs, sp);
lfs_acquire_finfo(fs, ip->i_number, vers);
}
/* Check both for space in segment and space in segsum */
iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift)
<< fs_bshift);
iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t))
<< fs_bshift);
KASSERT(iobytes > 0);
/* if it's really one i/o, don't make a second buf */
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
/*
* All the LFS output is done by the segwriter. It
* will increment numoutput by one for all the bufs it
* receives. However this buffer needs one extra to
* account for aiodone.
*/
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
} else {
bp = getiobuf(NULL, true);
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
/*
* LFS doesn't like async I/O here, dies with
* an assert in lfs_bwrite(). Is that assert
* valid? I retained non-async behaviour when I
* converted this to use nestiobuf --pooka
*/
bp->b_flags &= ~B_ASYNC;
}
/* XXX This is silly ... is this necessary? */
mutex_enter(&bufcache_lock);
mutex_enter(vp->v_interlock);
bgetvp(vp, bp);
mutex_exit(vp->v_interlock);
mutex_exit(&bufcache_lock);
bp->b_lblkno = lfs_lblkno(fs, offset);
bp->b_private = mbp;
if (devvp->v_type == VBLK) {
bp->b_dev = devvp->v_rdev;
}
VOP_BWRITE(bp->b_vp, bp);
while (lfs_gatherblock(sp, bp, NULL))
continue;
}
nestiobuf_done(mbp, skipbytes, error);
if (skipbytes) {
UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
}
UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0);
if (!async) {
/* Start a segment write. */
UVMHIST_LOG(ubchist, "flushing", 0,0,0,0);
mutex_enter(&lfs_lock);
lfs_flush(fs, 0, 1);
mutex_exit(&lfs_lock);
}
if ((sp->seg_flags & SEGM_SINGLE) && lfs_sb_getcurseg(fs) != fs->lfs_startseg)
return EAGAIN;
return (0);
tryagain:
/*
* We can't write the pages, for whatever reason.
* Clean up after ourselves, and make the caller try again.
*/
mutex_enter(vp->v_interlock);
/* Tell why we're here, if we know */
if (failreason != NULL) {
DLOG((DLOG_PAGE, "lfs_gop_write: %s\n", failreason));
}
if (haveeof && startoffset >= eof) {
DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
" eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
pgs[0]->offset, eof, npages));
}
for (i = 0; i < npages; i++) {
pg = pgs[i];
if (pg->flags & PG_PAGEOUT)
uvm_pageout_done(1);
uvm_pagelock(pg);
if (pg->flags & PG_DELWRI) {
uvm_pageunwire(pg);
}
uvm_pageactivate(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg,
vp, pg->offset));
DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags));
DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags));
DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon));
DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject));
DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i,
pg->wire_count));
DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i,
pg->loan_count));
}
uvm_page_unbusy(pgs, npages);
mutex_exit(vp->v_interlock);
return EAGAIN;
}
/*
* finish vnode/inode initialization.
* used by lfs_vget.
*/
void
lfs_vinit(struct mount *mp, struct vnode **vpp)
{
struct vnode *vp = *vpp;
struct inode *ip = VTOI(vp);
struct ulfsmount *ump = VFSTOULFS(mp);
struct lfs *fs = ump->um_lfs;
int i;
ip->i_mode = lfs_dino_getmode(fs, ip->i_din);
ip->i_nlink = lfs_dino_getnlink(fs, ip->i_din);
ip->i_lfs_osize = ip->i_size = lfs_dino_getsize(fs, ip->i_din);
ip->i_flags = lfs_dino_getflags(fs, ip->i_din);
ip->i_gen = lfs_dino_getgen(fs, ip->i_din);
ip->i_uid = lfs_dino_getuid(fs, ip->i_din);
ip->i_gid = lfs_dino_getgid(fs, ip->i_din);
ip->i_lfs_effnblks = lfs_dino_getblocks(fs, ip->i_din);
ip->i_lfs_odnlink = lfs_dino_getnlink(fs, ip->i_din);
/*
* Initialize the vnode from the inode, check for aliases. In all
* cases re-init ip, the underlying vnode/inode may have changed.
*/
ulfs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp);
ip = VTOI(vp);
memset(ip->i_lfs_fragsize, 0, ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize));
if (vp->v_type != VLNK || ip->i_size >= ip->i_lfs->um_maxsymlinklen) {
#ifdef DEBUG
for (i = (ip->i_size + lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs);
i < ULFS_NDADDR; i++) {
if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
i == 0)
continue;
if (lfs_dino_getdb(fs, ip->i_din, i) != 0) {
lfs_dump_dinode(fs, ip->i_din);
panic("inconsistent inode (direct)");
}
}
for ( ; i < ULFS_NDADDR + ULFS_NIADDR; i++) {
if (lfs_dino_getib(fs, ip->i_din, i - ULFS_NDADDR) != 0) {
lfs_dump_dinode(fs, ip->i_din);
panic("inconsistent inode (indirect)");
}
}
#endif /* DEBUG */
for (i = 0; i < ULFS_NDADDR; i++)
if (lfs_dino_getdb(fs, ip->i_din, i) != 0)
ip->i_lfs_fragsize[i] = lfs_blksize(fs, ip, i);
}
KASSERTMSG((vp->v_type != VNON),
"lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n",
(unsigned long long)ip->i_number,
(ip->i_mode & LFS_IFMT) >> 12);
/*
* Finish inode initialization now that aliasing has been resolved.
*/
ip->i_devvp = fs->lfs_devvp;
vref(ip->i_devvp);
#if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
ulfsquota_init(ip);
#endif
genfs_node_init(vp, &lfs_genfsops);
uvm_vnp_setsize(vp, ip->i_size);
/* Initialize hiblk from file size */
ip->i_lfs_hiblk = lfs_lblkno(ip->i_lfs, ip->i_size + lfs_sb_getbsize(ip->i_lfs) - 1) - 1;
*vpp = vp;
}
/*
* Resize the filesystem to contain the specified number of segments.
*/
int
lfs_resize_fs(struct lfs *fs, int newnsegs)
{
SEGUSE *sup;
CLEANERINFO *cip;
struct buf *bp, *obp;
daddr_t olast, nlast, ilast, noff, start, end;
struct vnode *ivp;
struct inode *ip;
int error, badnews, inc, oldnsegs;
int sbbytes, csbbytes, gain, cgain;
int i;
/* Only support v2 and up */
if (lfs_sb_getversion(fs) < 2)
return EOPNOTSUPP;
/* If we're doing nothing, do it fast */
oldnsegs = lfs_sb_getnseg(fs);
if (newnsegs == oldnsegs)
return 0;
/* We always have to have two superblocks */
if (newnsegs <= lfs_dtosn(fs, lfs_sb_getsboff(fs, 1)))
/* XXX this error code makes little sense */
return EFBIG;
ivp = fs->lfs_ivnode;
ip = VTOI(ivp);
error = 0;
/* Take the segment lock so no one else calls lfs_newseg() */
lfs_seglock(fs, SEGM_PROT);
/*
* Make sure the segments we're going to be losing, if any,
* are in fact empty. We hold the seglock, so their status
* cannot change underneath us. Count the superblocks we lose,
* while we're at it.
*/
sbbytes = csbbytes = 0;
cgain = 0;
for (i = newnsegs; i < oldnsegs; i++) {
LFS_SEGENTRY(sup, fs, i, bp);
badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL);
if (sup->su_flags & SEGUSE_SUPERBLOCK)
sbbytes += LFS_SBPAD;
if (!(sup->su_flags & SEGUSE_DIRTY)) {
++cgain;
if (sup->su_flags & SEGUSE_SUPERBLOCK)
csbbytes += LFS_SBPAD;
}
brelse(bp, 0);
if (badnews) {
error = EBUSY;
goto out;
}
}
/* Note old and new segment table endpoints, and old ifile size */
olast = lfs_sb_getcleansz(fs) + lfs_sb_getsegtabsz(fs);
nlast = howmany(newnsegs, lfs_sb_getsepb(fs)) + lfs_sb_getcleansz(fs);
ilast = ivp->v_size >> lfs_sb_getbshift(fs);
noff = nlast - olast;
/*
* Make sure no one can use the Ifile while we change it around.
* Even after taking the iflock we need to make sure no one still
* is holding Ifile buffers, so we get each one, to drain them.
* (XXX this could be done better.)
*/
rw_enter(&fs->lfs_iflock, RW_WRITER);
for (i = 0; i < ilast; i++) {
/* XXX what to do if bread fails? */
bread(ivp, i, lfs_sb_getbsize(fs), 0, &bp);
brelse(bp, 0);
}
/* Allocate new Ifile blocks */
for (i = ilast; i < ilast + noff; i++) {
if (lfs_balloc(ivp, i * lfs_sb_getbsize(fs), lfs_sb_getbsize(fs), NOCRED, 0,
&bp) != 0)
panic("balloc extending ifile");
memset(bp->b_data, 0, lfs_sb_getbsize(fs));
VOP_BWRITE(bp->b_vp, bp);
}
/* Register new ifile size */
ip->i_size += noff * lfs_sb_getbsize(fs);
lfs_dino_setsize(fs, ip->i_din, ip->i_size);
uvm_vnp_setsize(ivp, ip->i_size);
/* Copy the inode table to its new position */
if (noff != 0) {
if (noff < 0) {
start = nlast;
end = ilast + noff;
inc = 1;
} else {
start = ilast + noff - 1;
end = nlast - 1;
inc = -1;
}
for (i = start; i != end; i += inc) {
if (bread(ivp, i, lfs_sb_getbsize(fs),
B_MODIFY, &bp) != 0)
panic("resize: bread dst blk failed");
if (bread(ivp, i - noff, lfs_sb_getbsize(fs),
0, &obp))
panic("resize: bread src blk failed");
memcpy(bp->b_data, obp->b_data, lfs_sb_getbsize(fs));
VOP_BWRITE(bp->b_vp, bp);
brelse(obp, 0);
}
}
/* If we are expanding, write the new empty SEGUSE entries */
if (newnsegs > oldnsegs) {
for (i = oldnsegs; i < newnsegs; i++) {
if ((error = bread(ivp, i / lfs_sb_getsepb(fs) +
lfs_sb_getcleansz(fs), lfs_sb_getbsize(fs),
B_MODIFY, &bp)) != 0)
panic("lfs: ifile read: %d", error);
while ((i + 1) % lfs_sb_getsepb(fs) && i < newnsegs) {
sup = &((SEGUSE *)bp->b_data)[i % lfs_sb_getsepb(fs)];
memset(sup, 0, sizeof(*sup));
i++;
}
VOP_BWRITE(bp->b_vp, bp);
}
}
/* Zero out unused superblock offsets */
for (i = 2; i < LFS_MAXNUMSB; i++)
if (lfs_dtosn(fs, lfs_sb_getsboff(fs, i)) >= newnsegs)
lfs_sb_setsboff(fs, i, 0x0);
/*
* Correct superblock entries that depend on fs size.
* The computations of these are as follows:
*
* size = lfs_segtod(fs, nseg)
* dsize = lfs_segtod(fs, nseg - minfreeseg) - lfs_btofsb(#super * LFS_SBPAD)
* bfree = dsize - lfs_btofsb(fs, bsize * nseg / 2) - blocks_actually_used
* avail = lfs_segtod(fs, nclean) - lfs_btofsb(#clean_super * LFS_SBPAD)
* + (lfs_segtod(fs, 1) - (offset - curseg))
* - lfs_segtod(fs, minfreeseg - (minfreeseg / 2))
*
* XXX - we should probably adjust minfreeseg as well.
*/
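/*
 * Illustrative sketch (not part of the original source): a worked
 * example of the adjustments below, using hypothetical numbers.
 * Suppose we grow by 10 segments (gain = 10), a segment converts to
 * 128 frags (lfs_btofsb(fs, ssize) == 128) and half a block is 4
 * frags (lfs_btofsb(fs, bsize / 2) == 4). No superblocks are lost
 * when growing, so sbbytes == csbbytes == 0, and we would add:
 *
 *	size   += 10 * 128              = 1280 frags
 *	dsize  += 10 * 128 - 0          = 1280 frags
 *	bfree  += 10 * 128 - 0 - 10 * 4 = 1240 frags
 *	nclean += 10, avail += 1280 frags
 *
 * When shrinking, gain is negative and the else branch instead
 * subtracts the clean segments (cgain) actually removed.
 */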
gain = (newnsegs - oldnsegs);
lfs_sb_setnseg(fs, newnsegs);
lfs_sb_setsegtabsz(fs, nlast - lfs_sb_getcleansz(fs));
lfs_sb_addsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)));
lfs_sb_adddsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes));
lfs_sb_addbfree(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes)
- gain * lfs_btofsb(fs, lfs_sb_getbsize(fs) / 2));
if (gain > 0) {
lfs_sb_addnclean(fs, gain);
lfs_sb_addavail(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)));
} else {
lfs_sb_subnclean(fs, cgain);
lfs_sb_subavail(fs, cgain * lfs_btofsb(fs, lfs_sb_getssize(fs)) -
lfs_btofsb(fs, csbbytes));
}
/* Resize segment flag cache */
fs->lfs_suflags[0] = realloc(fs->lfs_suflags[0],
lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK);
fs->lfs_suflags[1] = realloc(fs->lfs_suflags[1],
lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK);
for (i = oldnsegs; i < newnsegs; i++)
fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0;
/* Truncate Ifile if necessary */
if (noff < 0)
lfs_truncate(ivp, ivp->v_size + (noff << lfs_sb_getbshift(fs)), 0,
NOCRED);
/* Update cleaner info so the cleaner can die */
/* XXX what to do if bread fails? */
bread(ivp, 0, lfs_sb_getbsize(fs), B_MODIFY, &bp);
cip = bp->b_data;
lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs));
lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs));
VOP_BWRITE(bp->b_vp, bp);
/* Let Ifile accesses proceed */
rw_exit(&fs->lfs_iflock);
out:
lfs_segunlock(fs);
return error;
}
/*
* Extended attribute dispatch
*/
int
lfs_extattrctl(struct mount *mp, int cmd, struct vnode *vp,
int attrnamespace, const char *attrname)
{
#ifdef LFS_EXTATTR
struct ulfsmount *ump;
ump = VFSTOULFS(mp);
if (ump->um_fstype == ULFS1) {
return ulfs_extattrctl(mp, cmd, vp, attrnamespace, attrname);
}
#endif
return vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname);
}
/* $NetBSD: subr_log.c,v 1.63 2022/10/26 23:28:30 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_log.c 8.3 (Berkeley) 2/14/95
*/
/*
* Error log buffer for kernel printf's.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_log.c,v 1.63 2022/10/26 23:28:30 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/ioctl.h>
#include <sys/msgbuf.h>
#include <sys/file.h>
#include <sys/syslog.h>
#include <sys/conf.h>
#include <sys/select.h>
#include <sys/poll.h>
#include <sys/intr.h>
#include <sys/sysctl.h>
#include <sys/ktrace.h>
static int sysctl_msgbuf(SYSCTLFN_PROTO);
static void logsoftintr(void *);
static bool log_async;
static struct selinfo log_selp; /* process waiting on select call */
static pid_t log_pgid; /* process/group for async I/O */
static kcondvar_t log_cv;
static void *log_sih;
static kmutex_t log_lock;
int log_open; /* also used in log() */
int msgbufmapped; /* is the message buffer mapped */
int msgbufenabled; /* is logging to the buffer enabled */
struct kern_msgbuf *msgbufp; /* the mapped buffer, itself. */
void
initmsgbuf(void *bf, size_t bufsize)
{
struct kern_msgbuf *mbp;
long new_bufs;
/* Sanity-check the given size. */
if (bufsize < sizeof(struct kern_msgbuf))
return;
mbp = msgbufp = (struct kern_msgbuf *)bf;
new_bufs = bufsize - offsetof(struct kern_msgbuf, msg_bufc);
if ((mbp->msg_magic != MSG_MAGIC) || (mbp->msg_bufs != new_bufs) ||
(mbp->msg_bufr < 0) || (mbp->msg_bufr >= mbp->msg_bufs) ||
(mbp->msg_bufx < 0) || (mbp->msg_bufx >= mbp->msg_bufs)) {
/*
* If the buffer magic number is wrong, has changed
* size (which shouldn't happen often), or is
* internally inconsistent, initialize it.
*/
memset(bf, 0, bufsize);
mbp->msg_magic = MSG_MAGIC;
mbp->msg_bufs = new_bufs;
}
/* mark it as ready for use. */
msgbufmapped = msgbufenabled = 1;
}
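/*
 * Illustrative sketch (not part of the original source), using
 * hypothetical numbers: if the MD code hands us bufsize = 16384
 * bytes and offsetof(struct kern_msgbuf, msg_bufc) happens to be
 * 20, then new_bufs = 16364.  A previously written buffer (for
 * example, one preserved across a warm reboot) is kept only if its
 * msg_magic, msg_bufs and both ring indices (msg_bufr, msg_bufx)
 * are consistent with that size; otherwise it is zeroed and
 * reinitialized above.
 */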
void
loginit(void)
{
mutex_init(&log_lock, MUTEX_DEFAULT, IPL_VM);
selinit(&log_selp);
cv_init(&log_cv, "klog");
log_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
logsoftintr, NULL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "msgbufsize",
SYSCTL_DESCR("Size of the kernel message buffer"),
sysctl_msgbuf, 0, NULL, 0,
CTL_KERN, KERN_MSGBUFSIZE, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "msgbuf",
SYSCTL_DESCR("Kernel message buffer"),
sysctl_msgbuf, 0, NULL, 0,
CTL_KERN, KERN_MSGBUF, CTL_EOL);
}
/*ARGSUSED*/
static int
logopen(dev_t dev, int flags, int mode, struct lwp *l)
{
struct kern_msgbuf *mbp = msgbufp;
int error = 0;
mutex_spin_enter(&log_lock);
if (log_open) {
error = EBUSY;
} else {
log_open = 1;
log_pgid = l->l_proc->p_pid; /* signal process only */
/*
* The message buffer is initialized during system
* configuration. If it's been clobbered, note that
* and return an error. (This allows a user to read
* the buffer via /dev/kmem, and try to figure out
* what clobbered it.)
*/
if (mbp->msg_magic != MSG_MAGIC) {
msgbufenabled = 0;
error = ENXIO;
}
}
mutex_spin_exit(&log_lock);
return error;
}
/*ARGSUSED*/
static int
logclose(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_spin_enter(&log_lock);
log_pgid = 0;
log_open = 0;
log_async = 0;
mutex_spin_exit(&log_lock);
return 0;
}
/*ARGSUSED*/
static int
logread(dev_t dev, struct uio *uio, int flag)
{
struct kern_msgbuf *mbp = msgbufp;
long l;
int error = 0;
mutex_spin_enter(&log_lock);
while (mbp->msg_bufr == mbp->msg_bufx) {
if (flag & IO_NDELAY) {
mutex_spin_exit(&log_lock);
return EWOULDBLOCK;
}
error = cv_wait_sig(&log_cv, &log_lock);
if (error) {
mutex_spin_exit(&log_lock);
return error;
}
}
while (uio->uio_resid > 0) {
l = mbp->msg_bufx - mbp->msg_bufr;
if (l < 0)
l = mbp->msg_bufs - mbp->msg_bufr;
l = uimin(l, uio->uio_resid);
if (l == 0)
break;
mutex_spin_exit(&log_lock);
error = uiomove(&mbp->msg_bufc[mbp->msg_bufr], (int)l, uio);
mutex_spin_enter(&log_lock);
if (error)
break;
mbp->msg_bufr += l;
if (mbp->msg_bufr < 0 || mbp->msg_bufr >= mbp->msg_bufs)
mbp->msg_bufr = 0;
}
mutex_spin_exit(&log_lock);
return error;
}
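/*
 * Illustrative sketch (not part of the original source): the ring
 * arithmetic above, with hypothetical indices and assuming the
 * caller asked for at least 20 bytes.  With msg_bufs = 100,
 * msg_bufr = 90 and msg_bufx = 10, the first pass computes
 * l = 10 - 90 < 0, so it reads the 10 characters from offset 90 to
 * the end of the buffer; msg_bufr then wraps to 0 and the second
 * pass reads the remaining 10 characters up to msg_bufx.
 */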
/*ARGSUSED*/
static int
logpoll(dev_t dev, int events, struct lwp *l)
{
int revents = 0;
if (events & (POLLIN | POLLRDNORM)) {
mutex_spin_enter(&log_lock);
if (msgbufp->msg_bufr != msgbufp->msg_bufx)
revents |= events & (POLLIN | POLLRDNORM);
else
selrecord(l, &log_selp);
mutex_spin_exit(&log_lock);
}
return revents;
}
static void
filt_logrdetach(struct knote *kn)
{
mutex_spin_enter(&log_lock);
selremove_knote(&log_selp, kn);
mutex_spin_exit(&log_lock);
}
static int
filt_logread(struct knote *kn, long hint)
{
int rv;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&log_lock);
if (msgbufp->msg_bufr == msgbufp->msg_bufx) {
rv = 0;
} else if (msgbufp->msg_bufr < msgbufp->msg_bufx) {
kn->kn_data = msgbufp->msg_bufx - msgbufp->msg_bufr;
rv = 1;
} else {
kn->kn_data = (msgbufp->msg_bufs - msgbufp->msg_bufr) +
msgbufp->msg_bufx;
rv = 1;
}
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&log_lock);
return rv;
}
static const struct filterops logread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_logrdetach,
.f_event = filt_logread,
};
static int
logkqfilter(dev_t dev, struct knote *kn)
{
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &logread_filtops;
mutex_spin_enter(&log_lock);
selrecord_knote(&log_selp, kn);
mutex_spin_exit(&log_lock);
break;
default:
return (EINVAL);
}
return (0);
}
void
logwakeup(void)
{
if (!cold && log_open) {
mutex_spin_enter(&log_lock);
selnotify(&log_selp, 0, NOTE_SUBMIT);
if (log_async)
softint_schedule(log_sih);
cv_broadcast(&log_cv);
mutex_spin_exit(&log_lock);
}
}
static void
logsoftintr(void *cookie)
{
pid_t pid;
if ((pid = log_pgid) != 0)
fownsignal(pid, SIGIO, 0, 0, NULL);
}
/*ARGSUSED*/
static int
logioctl(dev_t dev, u_long com, void *data, int flag, struct lwp *lwp)
{
long l;
switch (com) {
/* return number of characters immediately available */
case FIONREAD:
mutex_spin_enter(&log_lock);
l = msgbufp->msg_bufx - msgbufp->msg_bufr;
if (l < 0)
l += msgbufp->msg_bufs;
mutex_spin_exit(&log_lock);
*(int *)data = l;
break;
case FIONBIO:
break;
case FIOASYNC:
/* No locking needed, 'thread private'. */
log_async = (*((int *)data) != 0);
break;
case TIOCSPGRP:
case FIOSETOWN:
return fsetown(&log_pgid, com, data);
case TIOCGPGRP:
case FIOGETOWN:
return fgetown(log_pgid, com, data);
default:
return (EPASSTHROUGH);
}
return (0);
}
static void
logskip(struct kern_msgbuf *mbp)
{
/*
* Move the read pointer forward to the next line in the
* buffer. Note that the buffer is a ring buffer, so we
* must reset msg_bufr to 0 when it exceeds msg_bufs.
*
* To avoid looping forever, give up if we cannot find a
* newline within mbp->msg_bufs characters (the maximum
* size of the buffer).
*/
for (int i = 0; i < mbp->msg_bufs; i++) {
char c0 = mbp->msg_bufc[mbp->msg_bufr];
if (++mbp->msg_bufr >= mbp->msg_bufs)
mbp->msg_bufr = 0;
if (c0 == '\n')
break;
}
}
static void
logaddchar(struct kern_msgbuf *mbp, int c)
{
mbp->msg_bufc[mbp->msg_bufx++] = c;
if (mbp->msg_bufx < 0 || mbp->msg_bufx >= mbp->msg_bufs)
mbp->msg_bufx = 0;
/* If the buffer is full, keep the most recent data. */
if (mbp->msg_bufr == mbp->msg_bufx)
logskip(mbp);
}
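/*
 * Illustrative sketch (not part of the original source): with a
 * hypothetical 8-character buffer containing "ab\ncd\nef" and the
 * write pointer catching up to the read pointer after a wrap,
 * logskip() advances msg_bufr past the next '\n', discarding the
 * oldest line ("ab\n") so that new characters overwrite whole lines
 * rather than leaving a partial one at the read pointer.
 */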
void
logputchar(int c)
{
struct kern_msgbuf *mbp;
if (!cold)
mutex_spin_enter(&log_lock);
if (!msgbufenabled)
goto out;
mbp = msgbufp;
if (mbp->msg_magic != MSG_MAGIC) {
/*
* Arguably should panic or somehow notify the
* user... but how? Panic may be too drastic,
* and would obliterate the message being kicked
* out (maybe a panic itself), and printf
* would invoke us recursively. Silently punt
* for now. If syslog is running, it should
* notice.
*/
msgbufenabled = 0;
goto out;
}
logaddchar(mbp, c);
out:
if (!cold)
mutex_spin_exit(&log_lock);
}
/*
* sysctl helper routine for kern.msgbufsize and kern.msgbuf. For the
* former it merely checks that the message buffer is set up. For the latter,
* it also copies out the data if necessary.
*/
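/*
 * Illustrative sketch (not part of the original source), with
 * hypothetical indices: for msg_bufs = 100 and msg_bufx = 30, the
 * loop below first copies bytes 30..99 (the oldest data, from the
 * write pointer to the end of the ring), then wraps and copies
 * bytes 0..29, so userland sees the buffer contents in
 * chronological order, clamped to the caller's *oldlenp.
 */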
static int
sysctl_msgbuf(SYSCTLFN_ARGS)
{
char *where = oldp;
size_t len, maxlen;
long beg, end;
int error;
if (!logenabled(msgbufp)) {
msgbufenabled = 0;
return (ENXIO);
}
switch (rnode->sysctl_num) {
case KERN_MSGBUFSIZE: {
struct sysctlnode node = *rnode;
int msg_bufs = (int)msgbufp->msg_bufs;
node.sysctl_data = &msg_bufs;
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
case KERN_MSGBUF:
break;
default:
return (EOPNOTSUPP);
}
if (newp != NULL)
return (EPERM);
if (oldp == NULL) {
/* always return full buffer size */
*oldlenp = msgbufp->msg_bufs;
return (0);
}
sysctl_unlock();
/*
* First, copy from the write pointer to the end of
* message buffer.
*/
error = 0;
mutex_spin_enter(&log_lock);
maxlen = MIN(msgbufp->msg_bufs, *oldlenp);
beg = msgbufp->msg_bufx;
end = msgbufp->msg_bufs;
mutex_spin_exit(&log_lock);
while (maxlen > 0) {
len = MIN(end - beg, maxlen);
if (len == 0)
break;
/* XXX unlocked, but hardly matters. */
error = copyout(&msgbufp->msg_bufc[beg], where, len);
ktrmibio(-1, UIO_READ, where, len, error);
if (error)
break;
where += len;
maxlen -= len;
/*
* ... then, copy from the beginning of message buffer to
* the write pointer.
*/
beg = 0;
end = msgbufp->msg_bufx;
}
sysctl_relock();
return (error);
}
const struct cdevsw log_cdevsw = {
.d_open = logopen,
.d_close = logclose,
.d_read = logread,
.d_write = nowrite,
.d_ioctl = logioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = logpoll,
.d_mmap = nommap,
.d_kqfilter = logkqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
/* $NetBSD: kern_exec.c,v 1.521 2023/10/08 12:38:58 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
* Copyright (C) 1992 Wolfgang Solfrank.
* Copyright (C) 1992 TooLs GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by TooLs GmbH.
* 4. The name of TooLs GmbH may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.521 2023/10/08 12:38:58 ad Exp $");
#include "opt_exec.h"
#include "opt_execfmt.h"
#include "opt_ktrace.h"
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include "veriexec.h"
#include "opt_pax.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/mount.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/acct.h>
#include <sys/atomic.h>
#include <sys/exec.h>
#include <sys/futex.h>
#include <sys/ktrace.h>
#include <sys/uidinfo.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/ras.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/kauth.h>
#include <sys/lwpctl.h>
#include <sys/pax.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/vfs_syscalls.h>
#if NVERIEXEC > 0
#include <sys/verified_exec.h>
#endif /* NVERIEXEC > 0 */
#include <sys/sdt.h>
#include <sys/spawn.h>
#include <sys/prot.h>
#include <sys/cprng.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
#include <compat/common/compat_util.h>
#ifndef MD_TOPDOWN_INIT
#ifdef __USE_TOPDOWN_VM
#define MD_TOPDOWN_INIT(epp) (epp)->ep_flags |= EXEC_TOPDOWN_VM
#else
#define MD_TOPDOWN_INIT(epp)
#endif
#endif
struct execve_data;
extern int user_va0_disable;
static size_t calcargs(struct execve_data * restrict, const size_t);
static size_t calcstack(struct execve_data * restrict, const size_t);
static int copyoutargs(struct execve_data * restrict, struct lwp *,
char * const);
static int copyoutpsstrs(struct execve_data * restrict, struct proc *);
static int copyinargs(struct execve_data * restrict, char * const *,
char * const *, execve_fetch_element_t, char **);
static int copyinargstrs(struct execve_data * restrict, char * const *,
execve_fetch_element_t, char **, size_t *, void (*)(const void *, size_t));
static int exec_sigcode_map(struct proc *, const struct emul *);
#if defined(DEBUG) && !defined(DEBUG_EXEC)
#define DEBUG_EXEC
#endif
#ifdef DEBUG_EXEC
#define DPRINTF(a) printf a
#define COPYPRINTF(s, a, b) printf("%s, %d: copyout%s @%p %zu\n", __func__, \
__LINE__, (s), (a), (b))
static void dump_vmcmds(const struct exec_package * const, size_t, int);
#define DUMPVMCMDS(p, x, e) do { dump_vmcmds((p), (x), (e)); } while (0)
#else
#define DPRINTF(a)
#define COPYPRINTF(s, a, b)
#define DUMPVMCMDS(p, x, e) do {} while (0)
#endif /* DEBUG_EXEC */
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *");
SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *");
SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int");
/*
* Exec function switch:
*
* Note that each makecmds function is responsible for loading the
* exec package with the necessary functions for any exec-type-specific
* handling.
*
* Functions for specific exec types should be defined in their own
* header file.
*/
static const struct execsw **execsw = NULL;
static int nexecs;
u_int exec_maxhdrsz; /* must not be static - used by netbsd32 */
/* list of dynamically loaded execsw entries */
static LIST_HEAD(execlist_head, exec_entry) ex_head =
LIST_HEAD_INITIALIZER(ex_head);
struct exec_entry {
LIST_ENTRY(exec_entry) ex_list;
SLIST_ENTRY(exec_entry) ex_slist;
const struct execsw *ex_sw;
};
#ifndef __HAVE_SYSCALL_INTERN
void syscall(void);
#endif
/* NetBSD autoloadable syscalls */
#ifdef MODULAR
#include <kern/syscalls_autoload.c>
#endif
/* NetBSD emul struct */
struct emul emul_netbsd = {
.e_name = "netbsd",
#ifdef EMUL_NATIVEROOT
.e_path = EMUL_NATIVEROOT,
#else
.e_path = NULL,
#endif
#ifndef __HAVE_MINIMAL_EMUL
.e_flags = EMUL_HAS_SYS___syscall,
.e_errno = NULL,
.e_nosys = SYS_syscall,
.e_nsysent = SYS_NSYSENT,
#endif
#ifdef MODULAR
.e_sc_autoload = netbsd_syscalls_autoload,
#endif
.e_sysent = sysent,
.e_nomodbits = sysent_nomodbits,
#ifdef SYSCALL_DEBUG
.e_syscallnames = syscallnames,
#else
.e_syscallnames = NULL,
#endif
.e_sendsig = sendsig,
.e_trapsignal = trapsignal,
.e_sigcode = NULL,
.e_esigcode = NULL,
.e_sigobject = NULL,
.e_setregs = setregs,
.e_proc_exec = NULL,
.e_proc_fork = NULL,
.e_proc_exit = NULL,
.e_lwp_fork = NULL,
.e_lwp_exit = NULL,
#ifdef __HAVE_SYSCALL_INTERN
.e_syscall_intern = syscall_intern,
#else
.e_syscall = syscall,
#endif
.e_sysctlovly = NULL,
.e_vm_default_addr = uvm_default_mapaddr,
.e_usertrap = NULL,
.e_ucsize = sizeof(ucontext_t),
.e_startlwp = startlwp
};
/*
* Exec lock. Used to control access to execsw[] structures.
* This must not be static so that netbsd32 can access it, too.
*/
krwlock_t exec_lock __cacheline_aligned;
/*
* Data used between a loadvm and execve part of an "exec" operation
*/
struct execve_data {
struct exec_package ed_pack;
struct pathbuf *ed_pathbuf;
struct vattr ed_attr;
struct ps_strings ed_arginfo;
char *ed_argp;
const char *ed_pathstring;
char *ed_resolvedname;
size_t ed_ps_strings_sz;
int ed_szsigcode;
size_t ed_argslen;
long ed_argc;
long ed_envc;
};
/*
* data passed from parent lwp to child during a posix_spawn()
*/
struct spawn_exec_data {
struct execve_data sed_exec;
struct posix_spawn_file_actions
*sed_actions;
struct posix_spawnattr *sed_attrs;
struct proc *sed_parent;
kcondvar_t sed_cv_child_ready;
kmutex_t sed_mtx_child;
int sed_error;
volatile uint32_t sed_refcnt;
};
static struct vm_map *exec_map;
static struct pool exec_pool;
static void *
exec_pool_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(exec_map, NCARGS, 0,
UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
}
static void
exec_pool_free(struct pool *pp, void *addr)
{
uvm_km_free(exec_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
}
static struct pool_allocator exec_palloc = {
.pa_alloc = exec_pool_alloc,
.pa_free = exec_pool_free,
.pa_pagesz = NCARGS
};
static void
exec_path_free(struct execve_data *data)
{
pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
pathbuf_destroy(data->ed_pathbuf);
if (data->ed_resolvedname)
PNBUF_PUT(data->ed_resolvedname);
}
static int
exec_resolvename(struct lwp *l, struct exec_package *epp, struct vnode *vp,
char **rpath)
{
int error;
char *p;
KASSERT(rpath != NULL);
*rpath = PNBUF_GET();
error = vnode_to_path(*rpath, MAXPATHLEN, vp, l, l->l_proc);
if (error) {
DPRINTF(("%s: can't resolve name for %s, error %d\n",
__func__, epp->ep_kname, error));
PNBUF_PUT(*rpath);
*rpath = NULL;
return error;
}
epp->ep_resolvedname = *rpath;
if ((p = strrchr(*rpath, '/')) != NULL)
epp->ep_kname = p + 1;
return 0;
}
/*
* check exec:
* given an "executable" described in the exec package's namei info,
* see what we can do with it.
*
* ON ENTRY:
* exec package with appropriate namei info
* lwp pointer of exec'ing lwp
* NO SELF-LOCKED VNODES
*
* ON EXIT:
* error: nothing held, etc. exec header still allocated.
* ok: filled exec package, executable's vnode (unlocked).
*
* EXEC SWITCH ENTRY:
* Locked vnode to check, exec package, proc.
*
* EXEC SWITCH EXIT:
* ok: return 0, filled exec package, executable's vnode (unlocked).
* error: destructive:
* everything deallocated except the exec header.
* non-destructive:
* error code, executable's vnode (unlocked),
* exec header unmodified.
*/
int
/*ARGSUSED*/
check_exec(struct lwp *l, struct exec_package *epp, struct pathbuf *pb,
char **rpath)
{
int error, i;
struct vnode *vp;
size_t resid;
if (epp->ep_resolvedname) {
struct nameidata nd;
// grab the absolute pathbuf here before namei() trashes it.
pathbuf_copystring(pb, epp->ep_resolvedname, PATH_MAX);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
/* first get the vnode */
if ((error = namei(&nd)) != 0)
return error;
epp->ep_vp = vp = nd.ni_vp;
#ifdef DIAGNOSTIC
/* paranoia (take this out once namei stuff stabilizes) */
memset(nd.ni_pnbuf, '~', PATH_MAX);
#endif
} else {
struct file *fp;
if ((error = fd_getvnode(epp->ep_xfd, &fp)) != 0)
return error;
epp->ep_vp = vp = fp->f_vnode;
vref(vp);
fd_putfile(epp->ep_xfd);
if ((error = exec_resolvename(l, epp, vp, rpath)) != 0)
return error;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
/* check access and type */
if (vp->v_type != VREG) {
error = EACCES;
goto bad1;
}
if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
goto bad1;
/* get attributes */
/* XXX VOP_GETATTR is the only thing that needs LK_EXCLUSIVE here */
if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0)
goto bad1;
/* Check mount point */
if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
error = EACCES;
goto bad1;
}
if (vp->v_mount->mnt_flag & MNT_NOSUID)
epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
/* try to open it */
if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0)
goto bad1;
/* now we have the file, get the exec header */
error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
UIO_SYSSPACE, IO_NODELOCKED, l->l_cred, &resid, NULL);
if (error)
goto bad1;
/* unlock vp, since we need it unlocked from here on out. */
VOP_UNLOCK(vp);
#if NVERIEXEC > 0
error = veriexec_verify(l, vp,
epp->ep_resolvedname ? epp->ep_resolvedname : epp->ep_kname,
epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
NULL);
if (error)
goto bad2;
#endif /* NVERIEXEC > 0 */
#ifdef PAX_SEGVGUARD
error = pax_segvguard(l, vp, epp->ep_resolvedname, false);
if (error)
goto bad2;
#endif /* PAX_SEGVGUARD */
epp->ep_hdrvalid = epp->ep_hdrlen - resid;
/*
* Set up default address space limits. Can be overridden
* by individual exec packages.
*/
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
/*
* set up the vmcmds for creation of the process
* address space
*/
error = ENOEXEC;
for (i = 0; i < nexecs; i++) {
int newerror;
epp->ep_esch = execsw[i];
newerror = (*execsw[i]->es_makecmds)(l, epp);
if (!newerror) {
/* Seems ok: check that entry point is not too high */
if (epp->ep_entry >= epp->ep_vm_maxaddr) {
#ifdef DIAGNOSTIC
printf("%s: rejecting %p due to "
"too high entry address (>= %p)\n",
__func__, (void *)epp->ep_entry,
(void *)epp->ep_vm_maxaddr);
#endif
error = ENOEXEC;
break;
}
/* Seems ok: check that entry point is not too low */
if (epp->ep_entry < epp->ep_vm_minaddr) {
#ifdef DIAGNOSTIC
printf("%s: rejecting %p due to "
"too low entry address (< %p)\n",
__func__, (void *)epp->ep_entry,
(void *)epp->ep_vm_minaddr);
#endif
error = ENOEXEC;
break;
}
/* check limits */
#ifdef DIAGNOSTIC
#define LMSG "%s: rejecting due to %s limit (%ju > %ju)\n"
#endif
#ifdef MAXTSIZ
if (epp->ep_tsize > MAXTSIZ) {
#ifdef DIAGNOSTIC
printf(LMSG, __func__, "text",
(uintmax_t)epp->ep_tsize,
(uintmax_t)MAXTSIZ);
#endif
error = ENOMEM;
break;
}
#endif
vsize_t dlimit =
(vsize_t)l->l_proc->p_rlimit[RLIMIT_DATA].rlim_cur;
if (epp->ep_dsize > dlimit) {
#ifdef DIAGNOSTIC
printf(LMSG, __func__, "data",
(uintmax_t)epp->ep_dsize,
(uintmax_t)dlimit);
#endif
error = ENOMEM;
break;
}
return 0;
}
/*
* Reset all the fields that may have been modified by the
* loader.
*/
KASSERT(epp->ep_emul_arg == NULL);
if (epp->ep_emul_root != NULL) {
vrele(epp->ep_emul_root);
epp->ep_emul_root = NULL;
}
if (epp->ep_interp != NULL) {
vrele(epp->ep_interp);
epp->ep_interp = NULL;
}
epp->ep_pax_flags = 0;
/* make sure the first "interesting" error code is saved. */
if (error == ENOEXEC)
error = newerror;
if (epp->ep_flags & EXEC_DESTR)
/* Error from "#!" code, tidied up by recursive call */
return error;
}
/* not found, error */
/*
* free any vmspace-creation commands,
* and release their references
*/
kill_vmcmds(&epp->ep_vmcmds);
#if NVERIEXEC > 0 || defined(PAX_SEGVGUARD)
bad2:
#endif
/*
* close and release the vnode, restore the old one, free the
* pathname buf, and punt.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(vp, FREAD, l->l_cred);
vput(vp);
return error;
bad1:
/*
* free the namei pathname buffer, and put the vnode
* (which we don't yet have open).
*/
vput(vp); /* was still locked */
return error;
}
#ifdef __MACHINE_STACK_GROWS_UP
#define STACK_PTHREADSPACE NBPG
#else
#define STACK_PTHREADSPACE 0
#endif
static int
execve_fetch_element(char * const *array, size_t index, char **value)
{
return copyin(array + index, value, sizeof(*value));
}
/*
* exec system call
*/
int
sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(char * const *) argp;
syscallarg(char * const *) envp;
} */
return execve1(l, true, SCARG(uap, path), -1, SCARG(uap, argp),
SCARG(uap, envp), execve_fetch_element);
}
int
sys_fexecve(struct lwp *l, const struct sys_fexecve_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(char * const *) argp;
syscallarg(char * const *) envp;
} */
return execve1(l, false, NULL, SCARG(uap, fd), SCARG(uap, argp),
SCARG(uap, envp), execve_fetch_element);
}
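/*
 * Illustrative sketch (not part of the original source): from
 * userland, both entry points end up in execve1(), e.g.
 *
 *	char *argv[] = { "ls", "-l", NULL };
 *	char *envp[] = { "PATH=/bin:/usr/bin", NULL };
 *	execve("/bin/ls", argv, envp);	-> sys_execve, has_path = true
 *	fexecve(fd, argv, envp);	-> sys_fexecve, name resolved
 *					   later from the vnode
 *
 * The argv/envp values above are hypothetical; execve_fetch_element
 * copies each pointer in from user space one element at a time.
 */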
/*
* Load modules to try and execute an image that we do not understand.
* If no execsw entries are present, we load those likely to be needed
* in order to run native images only. Otherwise, we autoload all
* possible modules that could let us run the binary. XXX lame
*/
static void
exec_autoload(void)
{
#ifdef MODULAR
static const char * const native[] = {
"exec_elf32",
"exec_elf64",
"exec_script",
NULL
};
static const char * const compat[] = {
"exec_elf32",
"exec_elf64",
"exec_script",
"exec_aout",
"exec_coff",
"exec_ecoff",
"compat_aoutm68k",
"compat_netbsd32",
#if 0
"compat_linux",
"compat_linux32",
#endif
"compat_sunos",
"compat_sunos32",
"compat_ultrix",
NULL
};
char const * const *list;
int i;
list = nexecs == 0 ? native : compat;
for (i = 0; list[i] != NULL; i++) {
if (module_autoload(list[i], MODULE_CLASS_EXEC) != 0) {
continue;
}
yield();
}
#endif
}
/*
* Copy the user or kernel supplied upath to the allocated pathbuffer pbp
* making it absolute in the process, by prepending the current working
* directory if it is not. If offs is supplied it will contain the offset
* where the original supplied copy of upath starts.
*/
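/*
 * Illustrative sketch (not part of the original source), with a
 * hypothetical working directory: for upath "obj/prog" and a
 * current directory of "/usr/src", the code below builds
 * "/usr/src/obj/prog" in the pathbuf and, if offs is supplied,
 * sets *offs to 9 so that the pathbuf string plus *offs still
 * points at the caller's original "obj/prog".
 */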
int
exec_makepathbuf(struct lwp *l, const char *upath, enum uio_seg seg,
struct pathbuf **pbp, size_t *offs)
{
char *path, *bp;
size_t len, tlen;
int error;
struct cwdinfo *cwdi;
path = PNBUF_GET();
if (seg == UIO_SYSSPACE) {
error = copystr(upath, path, MAXPATHLEN, &len);
} else {
error = copyinstr(upath, path, MAXPATHLEN, &len);
}
if (error)
goto err;
if (path[0] == '/') {
if (offs)
*offs = 0;
goto out;
}
len++;
if (len + 1 >= MAXPATHLEN) {
error = ENAMETOOLONG;
goto err;
}
bp = path + MAXPATHLEN - len;
memmove(bp, path, len);
*(--bp) = '/';
cwdi = l->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path, MAXPATHLEN / 2,
GETCWD_CHECK_ACCESS, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto err;
tlen = path + MAXPATHLEN - bp;
memmove(path, bp, tlen);
path[tlen - 1] = '\0';
if (offs) *offs = tlen - len;
out:
*pbp = pathbuf_assimilate(path);
return 0;
err:
PNBUF_PUT(path);
return error;
}
vaddr_t
exec_vm_minaddr(vaddr_t va_min)
{
/*
* Increase va_min if we don't want NULL to be mappable by the
* process.
*/
#define VM_MIN_GUARD PAGE_SIZE
if (user_va0_disable && (va_min < VM_MIN_GUARD))
return VM_MIN_GUARD;
return va_min;
}
static int
execve_loadvm(struct lwp *l, bool has_path, const char *path, int fd,
char * const *args, char * const *envs,
execve_fetch_element_t fetch_element,
struct execve_data * restrict data)
{
struct exec_package * const epp = &data->ed_pack;
int error;
struct proc *p;
char *dp;
u_int modgen;
KASSERT(data != NULL);
p = l->l_proc;
modgen = 0;
SDT_PROBE(proc, kernel, , exec, path, 0, 0, 0, 0);
/*
* Check if we have exceeded our number of processes limit.
* This is so that we handle the case where a root daemon
* forked, ran setuid to become the desired user and is trying
* to exec. The obvious place to do the reference counting check
* is setuid(), but we don't do the reference counting check there
* like other OS's do because then all the programs that use setuid()
* must be modified to check the return code of setuid() and exit().
* It is dangerous to make setuid() fail, because it fails open and
* the program will continue to run as root. If we make it succeed
* and return an error code, again we are not enforcing the limit.
* The best place to enforce the limit is here, when the process tries
* to execute a new image, because eventually the process will need
* to call exec in order to do something useful.
*/
retry:
if (p->p_flag & PK_SUGID) {
if (kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p->p_rlimit[RLIMIT_NPROC],
KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
chgproccnt(kauth_cred_getuid(l->l_cred), 0) >
p->p_rlimit[RLIMIT_NPROC].rlim_cur)
return EAGAIN;
}
/*
* Drain existing references and forbid new ones. The process
* should be left alone until we're done here. This is necessary
* to avoid race conditions - e.g. in ptrace() - that might allow
* a local user to illicitly obtain elevated privileges.
*/
rw_enter(&p->p_reflock, RW_WRITER);
if (has_path) {
size_t offs;
/*
* Init the namei data to point at the user's program name.
* This is done here rather than in check_exec(), so that it's
* possible to override these settings if any of the makecmd/probe
* functions call check_exec() recursively - for example,
* see exec_script_makecmds().
*/
if ((error = exec_makepathbuf(l, path, UIO_USERSPACE,
&data->ed_pathbuf, &offs)) != 0)
goto clrflg;
data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
epp->ep_kname = data->ed_pathstring + offs;
data->ed_resolvedname = PNBUF_GET();
epp->ep_resolvedname = data->ed_resolvedname;
epp->ep_xfd = -1;
} else {
data->ed_pathbuf = pathbuf_assimilate(strcpy(PNBUF_GET(), "/"));
data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
epp->ep_kname = "*fexecve*";
data->ed_resolvedname = NULL;
epp->ep_resolvedname = NULL;
epp->ep_xfd = fd;
}
/*
* initialize the fields of the exec package.
*/
epp->ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
epp->ep_hdrlen = exec_maxhdrsz;
epp->ep_hdrvalid = 0;
epp->ep_emul_arg = NULL;
epp->ep_emul_arg_free = NULL;
memset(&epp->ep_vmcmds, 0, sizeof(epp->ep_vmcmds));
epp->ep_vap = &data->ed_attr;
epp->ep_flags = (p->p_flag & PK_32) ? EXEC_FROM32 : 0;
MD_TOPDOWN_INIT(epp);
epp->ep_emul_root = NULL;
epp->ep_interp = NULL;
epp->ep_esch = NULL;
epp->ep_pax_flags = 0;
memset(epp->ep_machine_arch, 0, sizeof(epp->ep_machine_arch));
rw_enter(&exec_lock, RW_READER);
/* see if we can run it. */
if ((error = check_exec(l, epp, data->ed_pathbuf,
&data->ed_resolvedname)) != 0) {
if (error != ENOENT && error != EACCES && error != ENOEXEC) {
DPRINTF(("%s: check exec failed for %s, error %d\n",
__func__, epp->ep_kname, error));
}
goto freehdr;
}
/* allocate an argument buffer */
data->ed_argp = pool_get(&exec_pool, PR_WAITOK);
KASSERT(data->ed_argp != NULL);
dp = data->ed_argp;
if ((error = copyinargs(data, args, envs, fetch_element, &dp)) != 0) {
goto bad;
}
/*
* Calculate the new stack size.
*/
#ifdef __MACHINE_STACK_GROWS_UP
/*
* copyargs() fills argc/argv/envp from the lower address even on
* __MACHINE_STACK_GROWS_UP machines. Reserve a few words just below the SP
* so that _rtld() use it.
*/
#define RTLD_GAP 32
#else
#define RTLD_GAP 0
#endif
const size_t argenvstrlen = (char *)ALIGN(dp) - data->ed_argp;
data->ed_argslen = calcargs(data, argenvstrlen);
const size_t len = calcstack(data, pax_aslr_stack_gap(epp) + RTLD_GAP);
if (len > epp->ep_ssize) {
/* in effect, compare to initial limit */
DPRINTF(("%s: stack limit exceeded %zu\n", __func__, len));
error = ENOMEM;
goto bad;
}
/* adjust "active stack depth" for process VSZ */
epp->ep_ssize = len;
return 0;
bad:
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
/* kill any opened file descriptor, if necessary */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
}
/* close and put the exec'd file */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
vput(epp->ep_vp);
pool_put(&exec_pool, data->ed_argp);
freehdr:
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
rw_exit(&exec_lock);
exec_path_free(data);
clrflg:
rw_exit(&p->p_reflock);
if (modgen != module_gen && error == ENOEXEC) {
modgen = module_gen;
exec_autoload();
goto retry;
}
SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
return error;
}
static int
execve_dovmcmds(struct lwp *l, struct execve_data * restrict data)
{
struct exec_package * const epp = &data->ed_pack;
struct proc *p = l->l_proc;
struct exec_vmcmd *base_vcp;
int error = 0;
size_t i;
/* record proc's vnode, for use by procfs and others */
if (p->p_textvp)
vrele(p->p_textvp);
vref(epp->ep_vp);
p->p_textvp = epp->ep_vp;
/* create the new process's VM space by running the vmcmds */
KASSERTMSG(epp->ep_vmcmds.evs_used != 0, "%s: no vmcmds", __func__);
#ifdef TRACE_EXEC
DUMPVMCMDS(epp, 0, 0);
#endif
base_vcp = NULL;
for (i = 0; i < epp->ep_vmcmds.evs_used && !error; i++) {
struct exec_vmcmd *vcp;
vcp = &epp->ep_vmcmds.evs_cmds[i];
if (vcp->ev_flags & VMCMD_RELATIVE) {
KASSERTMSG(base_vcp != NULL,
"%s: relative vmcmd with no base", __func__);
KASSERTMSG((vcp->ev_flags & VMCMD_BASE) == 0,
"%s: illegal base & relative vmcmd", __func__);
vcp->ev_addr += base_vcp->ev_addr;
}
error = (*vcp->ev_proc)(l, vcp);
if (error)
DUMPVMCMDS(epp, i, error);
if (vcp->ev_flags & VMCMD_BASE)
base_vcp = vcp;
}
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
vput(epp->ep_vp);
/* if an error happened, deallocate and punt */
if (error != 0) {
DPRINTF(("%s: vmcmd %zu failed: %d\n", __func__, i - 1, error));
}
return error;
}
static void
execve_free_data(struct execve_data *data)
{
struct exec_package * const epp = &data->ed_pack;
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
/* kill any opened file descriptor, if necessary */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
}
/* close and put the exec'd file */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, curlwp->l_cred);
vput(epp->ep_vp);
pool_put(&exec_pool, data->ed_argp);
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
exec_path_free(data);
}
static void
pathexec(struct proc *p, const char *resolvedname)
{
/* set command name & other accounting info */
const char *cmdname;
if (resolvedname == NULL) {
cmdname = "*fexecve*";
resolvedname = "/";
} else {
cmdname = strrchr(resolvedname, '/') + 1;
}
KASSERTMSG(resolvedname[0] == '/', "bad resolvedname `%s'",
resolvedname);
strlcpy(p->p_comm, cmdname, sizeof(p->p_comm));
kmem_strfree(p->p_path);
p->p_path = kmem_strdupsize(resolvedname, NULL, KM_SLEEP);
}
/* XXX elsewhere */
static int
credexec(struct lwp *l, struct execve_data *data)
{
struct proc *p = l->l_proc;
struct vattr *attr = &data->ed_attr;
int error;
/*
* Deal with set[ug]id. MNT_NOSUID has already been used to disable
* s[ug]id. It's OK to check for PSL_TRACED here as we have blocked
* out additional references on the process for the moment.
*/
if ((p->p_slflag & PSL_TRACED) == 0 &&
(((attr->va_mode & S_ISUID) != 0 &&
kauth_cred_geteuid(l->l_cred) != attr->va_uid) ||
((attr->va_mode & S_ISGID) != 0 &&
kauth_cred_getegid(l->l_cred) != attr->va_gid))) {
/*
* Mark the process as SUGID before we do
* anything that might block.
*/
proc_crmod_enter();
proc_crmod_leave(NULL, NULL, true);
if (data->ed_argc == 0) {
DPRINTF((
"%s: not executing set[ug]id binary with no args\n",
__func__));
return EINVAL;
}
/* Make sure file descriptors 0..2 are in use. */
if ((error = fd_checkstd()) != 0) {
DPRINTF(("%s: fdcheckstd failed %d\n",
__func__, error));
return error;
}
/*
* Copy the credential so other references don't see our
* changes.
*/
l->l_cred = kauth_cred_copy(l->l_cred);
#ifdef KTRACE
/*
* If the persistent trace flag isn't set, turn off.
*/
if (p->p_tracep) {
mutex_enter(&ktrace_lock);
if (!(p->p_traceflag & KTRFAC_PERSISTENT))
ktrderef(p);
mutex_exit(&ktrace_lock);
}
#endif
if (attr->va_mode & S_ISUID)
kauth_cred_seteuid(l->l_cred, attr->va_uid);
if (attr->va_mode & S_ISGID)
kauth_cred_setegid(l->l_cred, attr->va_gid);
} else {
if (kauth_cred_geteuid(l->l_cred) ==
kauth_cred_getuid(l->l_cred) &&
kauth_cred_getegid(l->l_cred) ==
kauth_cred_getgid(l->l_cred))
p->p_flag &= ~PK_SUGID;
}
/*
* Copy the credential so other references don't see our changes.
* Test to see if this is necessary first, since in the common case
* we won't need a private reference.
*/
if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) ||
kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
l->l_cred = kauth_cred_copy(l->l_cred);
kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
}
/* Update the master credentials. */
if (l->l_cred != p->p_cred) {
kauth_cred_t ocred;
mutex_enter(p->p_lock);
ocred = p->p_cred;
p->p_cred = kauth_cred_hold(l->l_cred);
mutex_exit(p->p_lock);
kauth_cred_free(ocred);
}
return 0;
}
static void
emulexec(struct lwp *l, struct exec_package *epp)
{
struct proc *p = l->l_proc;
/* The emulation root will usually have been found when we looked
* for the elf interpreter (or similar); if not, look now. */
if (epp->ep_esch->es_emul->e_path != NULL &&
epp->ep_emul_root == NULL)
emul_find_root(l, epp);
/* Any old emulation root got removed by fdcloseexec */
rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
p->p_cwdi->cwdi_edir = epp->ep_emul_root;
rw_exit(&p->p_cwdi->cwdi_lock);
epp->ep_emul_root = NULL;
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
/*
* Call emulation specific exec hook. This can setup per-process
* p->p_emuldata or do any other per-process stuff an emulation needs.
*
* If we are executing a process of a different emulation than the
* original forked process, call e_proc_exit() of the old emulation
* first, then e_proc_exec() of the new emulation. If the emulation
* is the same, the exec hook code should deallocate any old
* emulation resources held previously by this process.
*/
if (p->p_emul && p->p_emul->e_proc_exit
&& p->p_emul != epp->ep_esch->es_emul)
(*p->p_emul->e_proc_exit)(p);
/*
* Call exec hook. Emulation code may NOT store reference to anything
* from &pack.
*/
if (epp->ep_esch->es_emul->e_proc_exec)
(*epp->ep_esch->es_emul->e_proc_exec)(p, epp);
/* update p_emul, the old value is no longer needed */
p->p_emul = epp->ep_esch->es_emul;
/* ...and the same for p_execsw */
p->p_execsw = epp->ep_esch;
#ifdef __HAVE_SYSCALL_INTERN
(*p->p_emul->e_syscall_intern)(p);
#endif
ktremul();
}
static int
execve_runproc(struct lwp *l, struct execve_data * restrict data,
bool no_local_exec_lock, bool is_spawn)
{
struct exec_package * const epp = &data->ed_pack;
int error = 0;
struct proc *p;
struct vmspace *vm;
/*
* In case of a posix_spawn operation, the child doing the exec
* might not hold the reader lock on exec_lock, but the parent
* will do this instead.
*/
KASSERT(no_local_exec_lock || rw_lock_held(&exec_lock));
KASSERT(!no_local_exec_lock || is_spawn);
KASSERT(data != NULL);
p = l->l_proc;
/* Get rid of other LWPs. */
if (p->p_nlwps > 1) {
mutex_enter(p->p_lock);
exit_lwps(l);
mutex_exit(p->p_lock);
}
KDASSERT(p->p_nlwps == 1);
/*
* All of the other LWPs got rid of their robust futexes
* when they exited above, but we might still have some
* to dispose of. Do that now.
*/
if (__predict_false(l->l_robust_head != 0)) {
futex_release_all_lwp(l);
/*
* Since this LWP will live on with a different
* program image, we need to clear the robust
* futex list pointer here.
*/
l->l_robust_head = 0;
}
/* Destroy any lwpctl info. */
if (p->p_lwpctl != NULL)
lwp_ctl_exit();
/* Remove POSIX timers */
ptimers_free(p, TIMERS_POSIX);
/* Set the PaX flags. */
pax_set_flags(epp, p);
/*
* Do whatever is necessary to prepare the address space
* for remapping. Note that this might replace the current
* vmspace with another!
*
* vfork(): do not touch any user space data in the new child
* until we have awoken the parent below, or it will defeat
* lazy pmap switching (on x86).
*/
if (is_spawn)
uvmspace_spawn(l, epp->ep_vm_minaddr,
epp->ep_vm_maxaddr,
epp->ep_flags & EXEC_TOPDOWN_VM);
else
uvmspace_exec(l, epp->ep_vm_minaddr,
epp->ep_vm_maxaddr,
epp->ep_flags & EXEC_TOPDOWN_VM);
vm = p->p_vmspace;
vm->vm_taddr = (void *)epp->ep_taddr;
vm->vm_tsize = btoc(epp->ep_tsize);
vm->vm_daddr = (void*)epp->ep_daddr;
vm->vm_dsize = btoc(epp->ep_dsize);
vm->vm_ssize = btoc(epp->ep_ssize);
vm->vm_issize = 0;
vm->vm_maxsaddr = (void *)epp->ep_maxsaddr;
vm->vm_minsaddr = (void *)epp->ep_minsaddr;
pax_aslr_init_vm(l, vm, epp);
cwdexec(p);
fd_closeexec(); /* handle close on exec */
if (__predict_false(ktrace_on))
fd_ktrexecfd();
execsigs(p); /* reset caught signals */
mutex_enter(p->p_lock);
l->l_ctxlink = NULL; /* reset ucontext link */
p->p_acflag &= ~AFORK;
p->p_flag |= PK_EXEC;
mutex_exit(p->p_lock);
error = credexec(l, data);
if (error)
goto exec_abort;
#if defined(__HAVE_RAS)
/*
* Remove all RASs from the address space.
*/
ras_purgeall();
#endif
/*
* Stop profiling.
*/
if ((p->p_stflag & PST_PROFIL) != 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
/*
* It's OK to test PL_PPWAIT unlocked here, as other LWPs have
* exited and exec()/exit() are the only places it will be cleared.
*
* Once the parent has been awoken, curlwp may teleport to a new CPU
* in sched_vforkexec(), and it's then OK to start messing with user
* data. See comment above.
*/
if ((p->p_lflag & PL_PPWAIT) != 0) {
bool samecpu;
lwp_t *lp;
mutex_enter(&proc_lock);
lp = p->p_vforklwp;
p->p_vforklwp = NULL;
l->l_lwpctl = NULL; /* was on loan from blocked parent */
/* Clear flags after cv_broadcast() (scheduler needs them). */
p->p_lflag &= ~PL_PPWAIT;
lp->l_vforkwaiting = false;
/* If parent is still on same CPU, teleport curlwp elsewhere. */
samecpu = (lp->l_cpu == curlwp->l_cpu);
cv_broadcast(&lp->l_waitcv);
mutex_exit(&proc_lock);
/* Give the parent its CPU back - find a new home. */
KASSERT(!is_spawn);
sched_vforkexec(l, samecpu);
}
/* Now map address space. */
error = execve_dovmcmds(l, data);
if (error != 0)
goto exec_abort;
pathexec(p, epp->ep_resolvedname);
char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize);
error = copyoutargs(data, l, newstack);
if (error != 0)
goto exec_abort;
doexechooks(p);
/*
* Set initial SP at the top of the stack.
*
* Note that on machines where stack grows up (e.g. hppa), SP points to
* the end of arg/env strings. Userland guesses the address of argc
* via ps_strings::ps_argvstr.
*/
/* Setup new registers and do misc. setup. */
(*epp->ep_esch->es_emul->e_setregs)(l, epp, (vaddr_t)newstack);
if (epp->ep_esch->es_setregs)
(*epp->ep_esch->es_setregs)(l, epp, (vaddr_t)newstack);
/* Provide a consistent LWP private setting */
(void)lwp_setprivate(l, NULL);
/* Discard all PCU state; need to start fresh */
pcu_discard_all(l);
/* map the process's signal trampoline code */
if ((error = exec_sigcode_map(p, epp->ep_esch->es_emul)) != 0) {
DPRINTF(("%s: map sigcode failed %d\n", __func__, error));
goto exec_abort;
}
pool_put(&exec_pool, data->ed_argp);
/*
* Notify anyone who might care that we've exec'd.
*
* This is slightly racy; someone could sneak in and
* attach a knote after we've decided not to notify,
* or vice-versa, but that's not particularly bothersome.
* knote_proc_exec() will acquire p->p_lock as needed.
*/
if (!SLIST_EMPTY(&p->p_klist)) {
knote_proc_exec(p);
}
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
SDT_PROBE(proc, kernel, , exec__success, epp->ep_kname, 0, 0, 0, 0);
emulexec(l, epp);
/* Allow new references from the debugger/procfs. */
rw_exit(&p->p_reflock);
if (!no_local_exec_lock)
rw_exit(&exec_lock);
mutex_enter(&proc_lock);
/* posix_spawn(3) reports a single event with implied exec(3) */
if ((p->p_slflag & PSL_TRACED) && !is_spawn) {
mutex_enter(p->p_lock);
eventswitch(TRAP_EXEC, 0, 0);
mutex_enter(&proc_lock);
}
if (p->p_sflag & PS_STOPEXEC) {
ksiginfoq_t kq;
KASSERT(l->l_blcnt == 0);
p->p_pptr->p_nstopchild++;
p->p_waited = 0;
mutex_enter(p->p_lock);
ksiginfo_queue_init(&kq);
sigclearall(p, &contsigmask, &kq);
lwp_lock(l);
l->l_stat = LSSTOP;
p->p_stat = SSTOP;
p->p_nrlwps--;
lwp_unlock(l);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
ksiginfo_queue_drain(&kq);
} else {
mutex_exit(&proc_lock);
}
exec_path_free(data);
#ifdef TRACE_EXEC
DPRINTF(("%s finished\n", __func__));
#endif
return EJUSTRETURN;
exec_abort:
SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
rw_exit(&p->p_reflock);
if (!no_local_exec_lock)
rw_exit(&exec_lock);
exec_path_free(data);
/*
* the old process doesn't exist anymore. exit gracefully.
* get rid of the (new) address space we have created, if any, get rid
* of our namei data and vnode, and exit noting failure
*/
if (vm != NULL) {
uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
}
exec_free_emul_arg(epp);
pool_put(&exec_pool, data->ed_argp);
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
/* Acquire the sched-state mutex (exit1() will release it). */
if (!is_spawn) {
mutex_enter(p->p_lock);
exit1(l, error, SIGABRT);
}
return error;
}
int
execve1(struct lwp *l, bool has_path, const char *path, int fd,
char * const *args, char * const *envs,
execve_fetch_element_t fetch_element)
{
struct execve_data data;
int error;
error = execve_loadvm(l, has_path, path, fd, args, envs, fetch_element,
&data);
if (error)
return error;
error = execve_runproc(l, &data, false, false);
return error;
}
static size_t
fromptrsz(const struct exec_package *epp)
{
return (epp->ep_flags & EXEC_FROM32) ? sizeof(int) : sizeof(char *);
}
static size_t
ptrsz(const struct exec_package *epp)
{
return (epp->ep_flags & EXEC_32) ? sizeof(int) : sizeof(char *);
}
static size_t
calcargs(struct execve_data * restrict data, const size_t argenvstrlen)
{
struct exec_package * const epp = &data->ed_pack;
const size_t nargenvptrs =
1 + /* long argc */
data->ed_argc + /* char *argv[] */
1 + /* \0 */
data->ed_envc + /* char *env[] */
1; /* \0 */
return (nargenvptrs * ptrsz(epp)) /* pointers */
+ argenvstrlen /* strings */
+ epp->ep_esch->es_arglen; /* auxinfo */
}
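/*
 * Illustrative sketch (not part of the original source), with
 * hypothetical counts: for a native LP64 exec with argc == 2 and
 * envc == 3, nargenvptrs = 1 + 2 + 1 + 3 + 1 = 8 slots of 8 bytes
 * each (64 bytes of pointers), to which calcargs() adds the aligned
 * length of the argument/environment strings themselves and
 * es_arglen bytes reserved for auxinfo.
 */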
static size_t
calcstack(struct execve_data * restrict data, const size_t gaplen)
{
struct exec_package * const epp = &data->ed_pack;
data->ed_szsigcode = epp->ep_esch->es_emul->e_esigcode -
epp->ep_esch->es_emul->e_sigcode;
data->ed_ps_strings_sz = (epp->ep_flags & EXEC_32) ?
sizeof(struct ps_strings32) : sizeof(struct ps_strings);
const size_t sigcode_psstr_sz =
data->ed_szsigcode + /* sigcode */
data->ed_ps_strings_sz + /* ps_strings */
STACK_PTHREADSPACE; /* pthread space */
const size_t stacklen =
data->ed_argslen +
gaplen +
sigcode_psstr_sz;
/* make the stack "safely" aligned */
return STACK_LEN_ALIGN(stacklen, STACK_ALIGNBYTES);
}
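/*
 * Illustrative sketch (not part of the original source): the total
 * reserved at the top of the new stack is therefore
 *
 *	argslen (from calcargs above)
 *	+ gaplen (ASLR stack gap plus RTLD_GAP, if any)
 *	+ signal trampoline size (e_esigcode - e_sigcode)
 *	+ sizeof ps_strings (32- or 64-bit layout)
 *	+ STACK_PTHREADSPACE
 *
 * rounded up to STACK_ALIGNBYTES; execve_loadvm() rejects the exec
 * with ENOMEM if this exceeds the initial epp->ep_ssize limit.
 */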
static int
copyoutargs(struct execve_data * restrict data, struct lwp *l,
char * const newstack)
{
struct exec_package * const epp = &data->ed_pack;
struct proc *p = l->l_proc;
int error;
memset(&data->ed_arginfo, 0, sizeof(data->ed_arginfo));
/* remember information about the process */
data->ed_arginfo.ps_nargvstr = data->ed_argc;
data->ed_arginfo.ps_nenvstr = data->ed_envc;
/*
* Allocate the stack address passed to the newly execve()'ed process.
*
* The new stack address will be set to the SP (stack pointer) register
* in setregs().
*/
char *newargs = STACK_ALLOC(
STACK_SHRINK(newstack, data->ed_argslen), data->ed_argslen);
error = (*epp->ep_esch->es_copyargs)(l, epp,
&data->ed_arginfo, &newargs, data->ed_argp);
if (error) {
DPRINTF(("%s: copyargs failed %d\n", __func__, error));
return error;
}
error = copyoutpsstrs(data, p);
if (error != 0)
return error;
return 0;
}
static int
copyoutpsstrs(struct execve_data * restrict data, struct proc *p)
{
struct exec_package * const epp = &data->ed_pack;
struct ps_strings32 arginfo32;
void *aip;
int error;
/* fill process ps_strings info */
p->p_psstrp = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
STACK_PTHREADSPACE), data->ed_ps_strings_sz);
if (epp->ep_flags & EXEC_32) {
aip = &arginfo32;
arginfo32.ps_argvstr = (vaddr_t)data->ed_arginfo.ps_argvstr;
arginfo32.ps_nargvstr = data->ed_arginfo.ps_nargvstr;
arginfo32.ps_envstr = (vaddr_t)data->ed_arginfo.ps_envstr;
arginfo32.ps_nenvstr = data->ed_arginfo.ps_nenvstr;
} else
aip = &data->ed_arginfo;
/* copy out the process's ps_strings structure */
if ((error = copyout(aip, (void *)p->p_psstrp, data->ed_ps_strings_sz))
!= 0) {
DPRINTF(("%s: ps_strings copyout %p->%p size %zu failed\n",
__func__, aip, (void *)p->p_psstrp, data->ed_ps_strings_sz));
return error;
}
return 0;
}
static int
copyinargs(struct execve_data * restrict data, char * const *args,
char * const *envs, execve_fetch_element_t fetch_element, char **dpp)
{
struct exec_package * const epp = &data->ed_pack;
char *dp;
size_t i;
int error;
dp = *dpp;
data->ed_argc = 0;
/* copy the fake args list, if there's one, freeing it as we go */
if (epp->ep_flags & EXEC_HASARGL) {
struct exec_fakearg *fa = epp->ep_fa;
while (fa->fa_arg != NULL) {
const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
size_t len;
len = strlcpy(dp, fa->fa_arg, maxlen);
/* Count NUL into len. */
if (len < maxlen)
len++;
else {
while (fa->fa_arg != NULL) {
kmem_free(fa->fa_arg, fa->fa_len);
fa++;
}
kmem_free(epp->ep_fa, epp->ep_fa_len);
epp->ep_flags &= ~EXEC_HASARGL;
return E2BIG;
}
ktrexecarg(fa->fa_arg, len - 1);
dp += len;
kmem_free(fa->fa_arg, fa->fa_len);
fa++;
data->ed_argc++;
}
kmem_free(epp->ep_fa, epp->ep_fa_len);
epp->ep_flags &= ~EXEC_HASARGL;
}
/*
* Read and count argument strings from user.
*/
if (args == NULL) {
DPRINTF(("%s: null args\n", __func__));
return EINVAL;
}
if (epp->ep_flags & EXEC_SKIPARG)
args = (const void *)((const char *)args + fromptrsz(epp));
i = 0;
error = copyinargstrs(data, args, fetch_element, &dp, &i, ktr_execarg);
if (error != 0) {
DPRINTF(("%s: copyin arg %d\n", __func__, error));
return error;
}
data->ed_argc += i;
/*
* Read and count environment strings from user.
*/
data->ed_envc = 0;
/* environment need not be there */
if (envs == NULL)
goto done;
i = 0;
error = copyinargstrs(data, envs, fetch_element, &dp, &i, ktr_execenv);
if (error != 0) {
DPRINTF(("%s: copyin env %d\n", __func__, error));
return error;
}
data->ed_envc += i;
done:
*dpp = dp;
return 0;
}
static int
copyinargstrs(struct execve_data * restrict data, char * const *strs,
execve_fetch_element_t fetch_element, char **dpp, size_t *ip,
void (*ktr)(const void *, size_t))
{
char *dp, *sp;
size_t i;
int error;
dp = *dpp;
i = 0;
while (1) {
const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
size_t len;
if ((error = (*fetch_element)(strs, i, &sp)) != 0) {
return error;
}
if (!sp)
break;
if ((error = copyinstr(sp, dp, maxlen, &len)) != 0) {
if (error == ENAMETOOLONG)
error = E2BIG;
return error;
}
if (__predict_false(ktrace_on))
(*ktr)(dp, len - 1);
dp += len;
i++;
}
*dpp = dp;
*ip = i;
return 0;
}
/*
* Copy argv and env strings from kernel buffer (argp) to the new stack.
* Those strings are located just after auxinfo.
*/
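/*
 * Editor's sketch (illustrative, not from the original source): after
 * copyargs() the area at *stackp, from lower to higher addresses,
 * looks like
 *
 * long argc
 * char *argv[argc]   (pointers into the string area below)
 * NULL
 * char *envp[envc]   (pointers into the string area below)
 * NULL
 * auxinfo            (es_arglen bytes, reserved here and typically
 *                     filled in by the emulation's copyargs wrapper)
 * argv/envp strings  (copied from the kernel buffer argp)
 *
 * which matches the space accounted for by calcargs() above.
 */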
int
copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo,
char **stackp, void *argp)
{
char **cpp, *dp, *sp;
size_t len;
void *nullp;
long argc, envc;
int error;
cpp = (char **)*stackp;
nullp = NULL;
argc = arginfo->ps_nargvstr;
envc = arginfo->ps_nenvstr;
/* argc on stack is long */
CTASSERT(sizeof(*cpp) == sizeof(argc));
dp = (char *)(cpp +
1 + /* long argc */
argc + /* char *argv[] */
1 + /* \0 */
envc + /* char *env[] */
1) + /* \0 */
pack->ep_esch->es_arglen; /* auxinfo */
sp = argp;
if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(argc));
return error;
}
/* XXX don't copy them out, remap them! */
arginfo->ps_argvstr = cpp; /* remember location of argv for later */
for (; --argc >= 0; sp += len, dp += len) {
if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(dp));
return error;
}
if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
COPYPRINTF("str", dp, (size_t)ARG_MAX);
return error;
}
}
if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(nullp));
return error;
}
arginfo->ps_envstr = cpp; /* remember location of envp for later */
for (; --envc >= 0; sp += len, dp += len) {
if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(dp));
return error;
}
if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
COPYPRINTF("str", dp, (size_t)ARG_MAX);
return error;
}
}
if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(nullp));
return error;
}
*stackp = (char *)cpp;
return 0;
}
/*
* Add execsw[] entries.
*/
int
exec_add(struct execsw *esp, int count)
{
struct exec_entry *it;
int i, error = 0;
if (count == 0) {
return 0;
}
/* Check for duplicates. */
rw_enter(&exec_lock, RW_WRITER);
for (i = 0; i < count; i++) {
LIST_FOREACH(it, &ex_head, ex_list) {
/* assume unique (makecmds, probe_func, emulation) */
if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
it->ex_sw->u.elf_probe_func ==
esp[i].u.elf_probe_func &&
it->ex_sw->es_emul == esp[i].es_emul) {
rw_exit(&exec_lock);
return EEXIST;
}
}
}
/* Allocate new entries. */
for (i = 0; i < count; i++) {
it = kmem_alloc(sizeof(*it), KM_SLEEP);
it->ex_sw = &esp[i];
error = exec_sigcode_alloc(it->ex_sw->es_emul);
if (error != 0) {
kmem_free(it, sizeof(*it));
break;
}
LIST_INSERT_HEAD(&ex_head, it, ex_list);
}
/* If any allocation failed, unwind the entries added so far. */
if (error != 0) {
for (i--; i >= 0; i--) {
it = LIST_FIRST(&ex_head);
LIST_REMOVE(it, ex_list);
exec_sigcode_free(it->ex_sw->es_emul);
kmem_free(it, sizeof(*it));
}
return error;
}
/* update execsw[] */
exec_init(0);
rw_exit(&exec_lock);
return 0;
}
/*
* Remove execsw[] entry.
*/
int
exec_remove(struct execsw *esp, int count)
{
struct exec_entry *it, *next;
int i;
const struct proclist_desc *pd;
proc_t *p;
if (count == 0) {
return 0;
}
/* Abort if any are busy. */
rw_enter(&exec_lock, RW_WRITER);
for (i = 0; i < count; i++) {
mutex_enter(&proc_lock);
for (pd = proclists; pd->pd_list != NULL; pd++) {
PROCLIST_FOREACH(p, pd->pd_list) {
if (p->p_execsw == &esp[i]) {
mutex_exit(&proc_lock);
rw_exit(&exec_lock);
return EBUSY;
}
}
}
mutex_exit(&proc_lock);
}
/* None are busy, so remove them all. */
for (i = 0; i < count; i++) {
for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
next = LIST_NEXT(it, ex_list);
if (it->ex_sw == &esp[i]) {
LIST_REMOVE(it, ex_list);
exec_sigcode_free(it->ex_sw->es_emul);
kmem_free(it, sizeof(*it));
break;
}
}
}
/* update execsw[] */
exec_init(0);
rw_exit(&exec_lock);
return 0;
}
/*
* Initialize exec structures. If init_boot is true, also does necessary
* one-time initialization (it's called from main() that way).
* Once the system is multiuser, this should be called with exec_lock held,
* i.e. via exec_{add|remove}().
*/
int
exec_init(int init_boot)
{
const struct execsw **sw;
struct exec_entry *ex;
SLIST_HEAD(,exec_entry) first;
SLIST_HEAD(,exec_entry) any;
SLIST_HEAD(,exec_entry) last;
int i, sz;
if (init_boot) {
/* do one-time initializations */
vaddr_t vmin = 0, vmax;
rw_init(&exec_lock);
exec_map = uvm_km_suballoc(kernel_map, &vmin, &vmax,
maxexec*NCARGS, VM_MAP_PAGEABLE, false, NULL);
pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH,
"execargs", &exec_palloc, IPL_NONE);
pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0);
} else {
KASSERT(rw_write_held(&exec_lock));
}
/* Sort each entry onto the appropriate queue. */
SLIST_INIT(&first);
SLIST_INIT(&any);
SLIST_INIT(&last);
sz = 0;
LIST_FOREACH(ex, &ex_head, ex_list) {
switch(ex->ex_sw->es_prio) {
case EXECSW_PRIO_FIRST:
SLIST_INSERT_HEAD(&first, ex, ex_slist);
break;
case EXECSW_PRIO_ANY:
SLIST_INSERT_HEAD(&any, ex, ex_slist);
break;
case EXECSW_PRIO_LAST:
SLIST_INSERT_HEAD(&last, ex, ex_slist);
break;
default:
panic("%s", __func__);
break;
}
sz++;
}
/*
* Create new execsw[]. Ensure we do not try a zero-sized
* allocation.
*/
sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP);
i = 0;
SLIST_FOREACH(ex, &first, ex_slist) {
sw[i++] = ex->ex_sw;
}
SLIST_FOREACH(ex, &any, ex_slist) {
sw[i++] = ex->ex_sw;
}
SLIST_FOREACH(ex, &last, ex_slist) {
sw[i++] = ex->ex_sw;
}
/* Replace old execsw[] and free used memory. */
if (execsw != NULL) {
kmem_free(__UNCONST(execsw),
nexecs * sizeof(struct execsw *) + 1);
}
execsw = sw;
nexecs = sz;
/* Figure out the maximum size of an exec header. */
exec_maxhdrsz = sizeof(int);
for (i = 0; i < nexecs; i++) {
if (execsw[i]->es_hdrsz > exec_maxhdrsz)
exec_maxhdrsz = execsw[i]->es_hdrsz;
}
return 0;
}
int
exec_sigcode_alloc(const struct emul *e)
{
vaddr_t va;
vsize_t sz;
int error;
struct uvm_object *uobj;
KASSERT(rw_lock_held(&exec_lock));
if (e == NULL || e->e_sigobject == NULL)
return 0;
sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
if (sz == 0)
return 0;
/*
* Create a sigobject for this emulation.
*
* sigobject is an anonymous memory object (just like SYSV shared
* memory) that we keep a permanent reference to and that we map
* in all processes that need this sigcode. The creation is simple:
* we create an object, add a permanent reference to it, map it into
* kernel space, copy the sigcode into it and unmap it.
* We map it with PROT_READ|PROT_EXEC into the process just
* the way sys_mmap() would map it.
*/
if (*e->e_sigobject == NULL) {
uobj = uao_create(sz, 0);
(*uobj->pgops->pgo_reference)(uobj);
va = vm_map_min(kernel_map);
if ((error = uvm_map(kernel_map, &va, round_page(sz),
uobj, 0, 0,
UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) {
printf("sigcode kernel mapping failed %d\n", error);
(*uobj->pgops->pgo_detach)(uobj);
return error;
}
memcpy((void *)va, e->e_sigcode, sz);
#ifdef PMAP_NEED_PROCWR
pmap_procwr(&proc0, va, sz);
#endif
uvm_unmap(kernel_map, va, va + round_page(sz));
*e->e_sigobject = uobj;
KASSERT(uobj->uo_refs == 1);
} else {
/* if already created, reference++ */
uobj = *e->e_sigobject;
(*uobj->pgops->pgo_reference)(uobj);
}
return 0;
}
void
exec_sigcode_free(const struct emul *e)
{
struct uvm_object *uobj;
KASSERT(rw_lock_held(&exec_lock));
if (e == NULL || e->e_sigobject == NULL)
return;
uobj = *e->e_sigobject;
if (uobj == NULL)
return;
if (uobj->uo_refs == 1)
*e->e_sigobject = NULL; /* I'm the last person to reference. */
(*uobj->pgops->pgo_detach)(uobj);
}
static int
exec_sigcode_map(struct proc *p, const struct emul *e)
{
vaddr_t va;
vsize_t sz;
int error;
struct uvm_object *uobj;
sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
if (e->e_sigobject == NULL || sz == 0)
return 0;
uobj = *e->e_sigobject;
if (uobj == NULL)
return 0;
/* Just a hint to uvm_map where to put it. */
va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
round_page(sz), p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
#ifdef __alpha__
/*
* Tru64 puts /sbin/loader at the end of user virtual memory,
* which causes the above calculation to put the sigcode at
* an invalid address. Put it just below the text instead.
*/
if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
}
#endif
(*uobj->pgops->pgo_reference)(uobj);
error = uvm_map(&p->p_vmspace->vm_map, &va, round_page(sz),
uobj, 0, 0,
UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, UVM_INH_SHARE,
UVM_ADV_RANDOM, 0));
if (error) {
DPRINTF(("%s, %d: map %p "
"uvm_map %#"PRIxVSIZE"@%#"PRIxVADDR" failed %d\n",
__func__, __LINE__, &p->p_vmspace->vm_map, round_page(sz),
va, error));
(*uobj->pgops->pgo_detach)(uobj);
return error;
}
p->p_sigctx.ps_sigcode = (void *)va;
return 0;
}
/*
* Release a refcount on spawn_exec_data and destroy memory, if this
* was the last one.
*/
static void
spawn_exec_data_release(struct spawn_exec_data *data)
{
membar_release();
if (atomic_dec_32_nv(&data->sed_refcnt) != 0)
return;
membar_acquire();
cv_destroy(&data->sed_cv_child_ready);
mutex_destroy(&data->sed_mtx_child);
if (data->sed_actions)
posix_spawn_fa_free(data->sed_actions,
data->sed_actions->len);
if (data->sed_attrs)
kmem_free(data->sed_attrs,
sizeof(*data->sed_attrs));
kmem_free(data, sizeof(*data));
}
static int
handle_posix_spawn_file_actions(struct posix_spawn_file_actions *actions)
{
struct lwp *l = curlwp;
register_t retval;
int error, newfd;
if (actions == NULL)
return 0;
for (size_t i = 0; i < actions->len; i++) {
const struct posix_spawn_file_actions_entry *fae =
&actions->fae[i];
switch (fae->fae_action) {
case FAE_OPEN:
if (fd_getfile(fae->fae_fildes) != NULL) {
error = fd_close(fae->fae_fildes);
if (error)
return error;
}
error = fd_open(fae->fae_path, fae->fae_oflag,
fae->fae_mode, &newfd);
if (error)
return error;
if (newfd != fae->fae_fildes) {
error = dodup(l, newfd,
fae->fae_fildes, 0, &retval);
if (fd_getfile(newfd) != NULL)
fd_close(newfd);
}
break;
case FAE_DUP2:
error = dodup(l, fae->fae_fildes,
fae->fae_newfildes, 0, &retval);
break;
case FAE_CLOSE:
if (fd_getfile(fae->fae_fildes) == NULL) {
return EBADF;
}
error = fd_close(fae->fae_fildes);
break;
case FAE_CHDIR:
error = do_sys_chdir(l, fae->fae_chdir_path,
UIO_SYSSPACE, &retval);
break;
case FAE_FCHDIR:
error = do_sys_fchdir(l, fae->fae_fildes, &retval);
break;
}
if (error)
return error;
}
return 0;
}
static int
handle_posix_spawn_attrs(struct posix_spawnattr *attrs, struct proc *parent)
{
struct sigaction sigact;
int error = 0;
struct proc *p = curproc;
struct lwp *l = curlwp;
if (attrs == NULL)
return 0;
memset(&sigact, 0, sizeof(sigact));
sigact._sa_u._sa_handler = SIG_DFL;
sigact.sa_flags = 0;
/*
* set state to SSTOP so that this proc can be found by pid.
* see proc_enterpgrp, do_sched_setparam below
*/
mutex_enter(&proc_lock);
/*
* p_stat should be SACTIVE, so we need to adjust the
* parent's p_nstopchild here. For safety, just make sure
* we're on the good side of SDEAD before we adjust.
*/
int ostat = p->p_stat;
KASSERT(ostat < SSTOP);
p->p_stat = SSTOP;
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
mutex_exit(&proc_lock);
/* Set process group */
if (attrs->sa_flags & POSIX_SPAWN_SETPGROUP) {
pid_t mypid = p->p_pid;
pid_t pgrp = attrs->sa_pgroup;
if (pgrp == 0)
pgrp = mypid;
error = proc_enterpgrp(parent, mypid, pgrp, false);
if (error)
goto out;
}
/* Set scheduler policy */
if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDULER)
error = do_sched_setparam(p->p_pid, 0, attrs->sa_schedpolicy,
&attrs->sa_schedparam);
else if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDPARAM) {
error = do_sched_setparam(parent->p_pid, 0,
SCHED_NONE, &attrs->sa_schedparam);
}
if (error)
goto out;
/* Reset user ID's */
if (attrs->sa_flags & POSIX_SPAWN_RESETIDS) {
error = do_setresgid(l, -1, kauth_cred_getgid(l->l_cred), -1,
ID_E_EQ_R | ID_E_EQ_S);
if (error)
goto out;
error = do_setresuid(l, -1, kauth_cred_getuid(l->l_cred), -1,
ID_E_EQ_R | ID_E_EQ_S);
if (error)
goto out;
}
/* Set signal masks/defaults */
if (attrs->sa_flags & POSIX_SPAWN_SETSIGMASK) {
mutex_enter(p->p_lock);
error = sigprocmask1(l, SIG_SETMASK, &attrs->sa_sigmask, NULL);
mutex_exit(p->p_lock);
if (error)
goto out;
}
if (attrs->sa_flags & POSIX_SPAWN_SETSIGDEF) {
/*
* The following sigaction call is using a sigaction
* version 0 trampoline which is in the compatibility
* code only. This is not a problem because for SIG_DFL
* and SIG_IGN, the trampolines are now ignored. If they
* were not, this would be a problem because we are
* holding the exec_lock, and the compat code needs
* to do the same in order to replace the trampoline
* code of the process.
*/
for (int i = 1; i <= NSIG; i++) {
if (sigismember(&attrs->sa_sigdefault, i))
sigaction1(l, i, &sigact, NULL, NULL, 0);
}
}
error = 0;
out:
mutex_enter(&proc_lock);
p->p_stat = ostat;
p->p_pptr->p_nstopchild--;
mutex_exit(&proc_lock);
return error;
}
/*
* A child lwp of a posix_spawn operation starts here and ends up in
* cpu_spawn_return, dealing with all filedescriptor and scheduler
* manipulations in between.
* The parent waits for the child, as it is not clear whether the child
* will be able to acquire its own exec_lock. If it can, the parent can
* be released early and continue running in parallel. If not (or if the
* magic debug flag is passed in the scheduler attribute struct), the
* child rides on the parent's exec lock until it is ready to return
* to userland - and only then releases the parent. This method loses
* concurrency, but improves error reporting.
*/
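/*
 * Editor's sketch of the handshake described above (illustrative only;
 * see the code below for the authoritative ordering):
 *
 * parent: do_posix_spawn()          child: spawn_return()
 *   cv_wait(sed_cv_child_ready)       may cv_signal() early if it got
 *      ...                            its own exec_lock; otherwise it
 *      ...                            signals only after the attrs,
 *      ...                            file actions and execve_runproc()
 *   read sed_error, drop refcount     cpu_spawn_return() or exit1(127)
 */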
static void
spawn_return(void *arg)
{
struct spawn_exec_data *spawn_data = arg;
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int error;
bool have_reflock;
bool parent_is_waiting = true;
/*
* Check if we can release parent early.
* We either need to have no sed_attrs, or sed_attrs does not
* have POSIX_SPAWN_RETURNERROR or one of the flags that require
* safe access to the parent proc (passed in sed_parent).
* We then try to get the exec_lock, and only if that works, we can
* release the parent here already.
*/
struct posix_spawnattr *attrs = spawn_data->sed_attrs;
if ((!attrs || (attrs->sa_flags
& (POSIX_SPAWN_RETURNERROR|POSIX_SPAWN_SETPGROUP)) == 0)
&& rw_tryenter(&exec_lock, RW_READER)) {
parent_is_waiting = false;
mutex_enter(&spawn_data->sed_mtx_child);
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
}
/* don't allow debugger access yet */
rw_enter(&p->p_reflock, RW_WRITER);
have_reflock = true;
/* handle posix_spawnattr */
error = handle_posix_spawn_attrs(attrs, spawn_data->sed_parent);
if (error)
goto report_error;
/* handle posix_spawn_file_actions */
error = handle_posix_spawn_file_actions(spawn_data->sed_actions);
if (error)
goto report_error;
/* now do the real exec */
error = execve_runproc(l, &spawn_data->sed_exec, parent_is_waiting,
true);
have_reflock = false;
if (error == EJUSTRETURN)
error = 0;
else if (error)
goto report_error;
if (parent_is_waiting) {
mutex_enter(&spawn_data->sed_mtx_child);
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
}
/* release our refcount on the data */
spawn_exec_data_release(spawn_data);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) ==
(PSL_TRACED|PSL_TRACEDCHILD)) {
eventswitchchild(p, TRAP_CHLD, PTRACE_POSIX_SPAWN);
}
/* and finally: leave to userland for the first time */
cpu_spawn_return(l);
/* NOTREACHED */
return;
report_error:
if (have_reflock) {
/*
* We have not passed through execve_runproc(),
* which would have released the p_reflock and also
* taken ownership of the sed_exec part of spawn_data,
* so release/free both here.
*/
rw_exit(&p->p_reflock);
execve_free_data(&spawn_data->sed_exec);
}
if (parent_is_waiting) {
/* pass error to parent */
mutex_enter(&spawn_data->sed_mtx_child);
spawn_data->sed_error = error;
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
} else {
rw_exit(&exec_lock);
}
/* release our refcount on the data */
spawn_exec_data_release(spawn_data);
/* done, exit */
mutex_enter(p->p_lock);
/*
* Posix explicitly asks for an exit code of 127 if we report
* errors from the child process - so, unfortunately, there
* is no way to report a more exact error code.
* A NetBSD-specific workaround is POSIX_SPAWN_RETURNERROR as a
* flag bit in the attrp argument to posix_spawn(2), see above.
*/
exit1(l, 127, 0);
}
static __inline char **
posix_spawn_fae_path(struct posix_spawn_file_actions_entry *fae)
{
switch (fae->fae_action) {
case FAE_OPEN:
return &fae->fae_path;
case FAE_CHDIR:
return &fae->fae_chdir_path;
default:
return NULL;
}
}
void
posix_spawn_fa_free(struct posix_spawn_file_actions *fa, size_t len)
{
for (size_t i = 0; i < len; i++) {
char **pathp = posix_spawn_fae_path(&fa->fae[i]);
if (pathp)
kmem_strfree(*pathp);
}
if (fa->len > 0)
kmem_free(fa->fae, sizeof(*fa->fae) * fa->len);
kmem_free(fa, sizeof(*fa));
}
static int
posix_spawn_fa_alloc(struct posix_spawn_file_actions **fap,
const struct posix_spawn_file_actions *ufa, rlim_t lim)
{
struct posix_spawn_file_actions *fa;
struct posix_spawn_file_actions_entry *fae;
char *pbuf = NULL;
int error;
size_t i = 0;
fa = kmem_alloc(sizeof(*fa), KM_SLEEP);
error = copyin(ufa, fa, sizeof(*fa));
if (error || fa->len == 0) {
kmem_free(fa, sizeof(*fa));
return error; /* 0 if not an error, and len == 0 */
}
if (fa->len > lim) {
kmem_free(fa, sizeof(*fa));
return EINVAL;
}
fa->size = fa->len;
size_t fal = fa->len * sizeof(*fae);
fae = fa->fae;
fa->fae = kmem_alloc(fal, KM_SLEEP);
error = copyin(fae, fa->fae, fal);
if (error)
goto out;
pbuf = PNBUF_GET();
for (; i < fa->len; i++) {
char **pathp = posix_spawn_fae_path(&fa->fae[i]);
if (pathp == NULL)
continue;
error = copyinstr(*pathp, pbuf, MAXPATHLEN, &fal);
if (error)
goto out;
*pathp = kmem_alloc(fal, KM_SLEEP);
memcpy(*pathp, pbuf, fal);
}
PNBUF_PUT(pbuf);
*fap = fa;
return 0;
out:
if (pbuf)
PNBUF_PUT(pbuf);
posix_spawn_fa_free(fa, i);
return error;
}
/*
* N.B. increments nprocs upon success. Callers need to drop nprocs if
* they fail for some other reason.
*/
int
check_posix_spawn(struct lwp *l1)
{
int error, tnprocs, count;
uid_t uid;
struct proc *p1;
p1 = l1->l_proc;
uid = kauth_cred_getuid(l1->l_cred);
tnprocs = atomic_inc_uint_nv(&nprocs);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create.
*/
if (__predict_false(tnprocs >= maxproc))
error = -1;
else
error = kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
if (error) {
atomic_dec_uint(&nprocs);
return EAGAIN;
}
/*
* Enforce limits.
*/
count = chgproccnt(uid, 1);
if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
return EAGAIN;
}
return 0;
}
int
do_posix_spawn(struct lwp *l1, pid_t *pid_res, bool *child_ok, const char *path,
struct posix_spawn_file_actions *fa,
struct posix_spawnattr *sa,
char *const *argv, char *const *envp,
execve_fetch_element_t fetch)
{
struct proc *p1, *p2;
struct lwp *l2;
int error;
struct spawn_exec_data *spawn_data;
vaddr_t uaddr = 0;
pid_t pid;
bool have_exec_lock = false;
p1 = l1->l_proc;
/* Allocate and init spawn_data */
spawn_data = kmem_zalloc(sizeof(*spawn_data), KM_SLEEP);
spawn_data->sed_refcnt = 1; /* only parent so far */
cv_init(&spawn_data->sed_cv_child_ready, "pspawn");
mutex_init(&spawn_data->sed_mtx_child, MUTEX_DEFAULT, IPL_NONE);
mutex_enter(&spawn_data->sed_mtx_child);
/*
* Do the first part of the exec now, collect state
* in spawn_data.
*/
error = execve_loadvm(l1, true, path, -1, argv,
envp, fetch, &spawn_data->sed_exec);
if (error == EJUSTRETURN)
error = 0;
else if (error)
goto error_exit;
have_exec_lock = true;
/*
* Allocate virtual address space for the U-area now, while it
* is still easy to abort the fork operation if we're out of
* kernel virtual address space.
*/
uaddr = uvm_uarea_alloc();
if (__predict_false(uaddr == 0)) {
error = ENOMEM;
goto error_exit;
}
/*
* Allocate the new proc. Borrow proc0's vmspace for it; we will
* replace it with the child's own before returning to userland
* in the child.
*/
p2 = proc_alloc();
if (p2 == NULL) {
/* We were unable to allocate a process ID. */
error = EAGAIN;
goto error_exit;
}
/*
* This is a point of no return, we will have to go through
* the child proc to properly clean it up past this point.
*/
pid = p2->p_pid;
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
memset(&p2->p_startzero, 0,
(unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
memcpy(&p2->p_startcopy, &p1->p_startcopy,
(unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
p2->p_vmspace = proc0.p_vmspace;
TAILQ_INIT(&p2->p_sigpend.sp_info);
LIST_INIT(&p2->p_lwps);
LIST_INIT(&p2->p_sigwaiters);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
* Inherit flags we want to keep. The flags related to SIGCHLD
* handling are important in order to keep a consistent behaviour
* for the child after the fork. If we are a 32-bit process, the
* child will be too.
*/
p2->p_flag =
p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
p2->p_emul = p1->p_emul;
p2->p_execsw = p1->p_execsw;
mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&p2->p_reflock);
cv_init(&p2->p_waitcv, "wait");
cv_init(&p2->p_lwpcv, "lwpwait");
p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
kauth_proc_fork(p1, p2);
p2->p_raslist = NULL;
p2->p_fd = fd_copy();
/* XXX racy */
p2->p_mqueue_cnt = p1->p_mqueue_cnt;
p2->p_cwdi = cwdinit();
/*
* Note: p_limit (rlimit stuff) is copy-on-write, so normally
* we just need to increase pl_refcnt.
*/
if (!p1->p_limit->pl_writeable) {
lim_addref(p1->p_limit);
p2->p_limit = p1->p_limit;
} else {
p2->p_limit = lim_copy(p1->p_limit);
}
p2->p_lflag = 0;
l1->l_vforkwaiting = false;
p2->p_sflag = 0;
p2->p_slflag = 0;
p2->p_pptr = p1;
p2->p_ppid = p1->p_pid;
LIST_INIT(&p2->p_children);
p2->p_aio = NULL;
#ifdef KTRACE
/*
* Copy traceflag and tracefile if enabled.
* If not inherited, these were zeroed above.
*/
if (p1->p_traceflag & KTRFAC_INHERIT) {
mutex_enter(&ktrace_lock);
p2->p_traceflag = p1->p_traceflag;
if ((p2->p_tracep = p1->p_tracep) != NULL)
ktradref(p2);
mutex_exit(&ktrace_lock);
}
#endif
/*
* Create signal actions for the child process.
*/
p2->p_sigacts = sigactsinit(p1, 0);
mutex_enter(p1->p_lock);
p2->p_sflag |=
(p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
sched_proc_fork(p1, p2);
mutex_exit(p1->p_lock);
p2->p_stflag = p1->p_stflag;
/*
* p_stats.
* Copy parts of p_stats, and zero out the rest.
*/
p2->p_stats = pstatscopy(p1->p_stats);
/* copy over machdep flags to the new proc */
cpu_proc_fork(p1, p2);
/*
* Prepare remaining parts of spawn data
*/
spawn_data->sed_actions = fa;
spawn_data->sed_attrs = sa;
spawn_data->sed_parent = p1;
/* create LWP */
lwp_create(l1, p2, uaddr, 0, NULL, 0, spawn_return, spawn_data,
&l2, l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
l2->l_ctxlink = NULL; /* reset ucontext link */
/*
* Copy the credential so other references don't see our changes.
* Test to see if this is necessary first, since in the common case
* we won't need a private reference.
*/
if (kauth_cred_geteuid(l2->l_cred) != kauth_cred_getsvuid(l2->l_cred) ||
kauth_cred_getegid(l2->l_cred) != kauth_cred_getsvgid(l2->l_cred)) {
l2->l_cred = kauth_cred_copy(l2->l_cred);
kauth_cred_setsvuid(l2->l_cred, kauth_cred_geteuid(l2->l_cred));
kauth_cred_setsvgid(l2->l_cred, kauth_cred_getegid(l2->l_cred));
}
/* Update the master credentials. */
if (l2->l_cred != p2->p_cred) {
kauth_cred_t ocred;
mutex_enter(p2->p_lock);
ocred = p2->p_cred;
p2->p_cred = kauth_cred_hold(l2->l_cred);
mutex_exit(p2->p_lock);
kauth_cred_free(ocred);
}
*child_ok = true;
spawn_data->sed_refcnt = 2; /* child gets it as well */
#if 0
l2->l_nopreempt = 1; /* start it non-preemptable */
#endif
/*
* It's now safe for the scheduler and other processes to see the
* child process.
*/
mutex_enter(&proc_lock);
if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
p2->p_lflag |= PL_CONTROLT;
LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
p2->p_exitsig = SIGCHLD; /* signal for parent on exit */
if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) ==
(PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
proc_changeparent(p2, p1->p_pptr);
SET(p2->p_slflag, PSL_TRACEDCHILD);
}
p2->p_oppid = p1->p_pid; /* Remember the original parent id. */
LIST_INSERT_AFTER(p1, p2, p_pglist);
LIST_INSERT_HEAD(&allproc, p2, p_list);
p2->p_trace_enabled = trace_is_enabled(p2);
#ifdef __HAVE_SYSCALL_INTERN
(*p2->p_emul->e_syscall_intern)(p2);
#endif
/*
* Make child runnable, set start time, and add to run queue except
* if the parent requested the child to start in SSTOP state.
*/
mutex_enter(p2->p_lock);
getmicrotime(&p2->p_stats->p_start);
lwp_lock(l2);
KASSERT(p2->p_nrlwps == 1);
KASSERT(l2->l_stat == LSIDL);
p2->p_nrlwps = 1;
p2->p_stat = SACTIVE;
setrunnable(l2);
/* LWP now unlocked */
mutex_exit(p2->p_lock);
mutex_exit(&proc_lock);
cv_wait(&spawn_data->sed_cv_child_ready, &spawn_data->sed_mtx_child);
error = spawn_data->sed_error;
mutex_exit(&spawn_data->sed_mtx_child);
spawn_exec_data_release(spawn_data);
rw_exit(&p1->p_reflock);
rw_exit(&exec_lock);
have_exec_lock = false;
*pid_res = pid;
if (error)
return error;
if (p1->p_slflag & PSL_TRACED) {
/* Paranoid check */
mutex_enter(&proc_lock);
if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) !=
(PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
mutex_exit(&proc_lock);
return 0;
}
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD, PTRACE_POSIX_SPAWN, pid);
}
return 0;
error_exit:
if (have_exec_lock) {
execve_free_data(&spawn_data->sed_exec);
rw_exit(&p1->p_reflock);
rw_exit(&exec_lock);
}
mutex_exit(&spawn_data->sed_mtx_child);
spawn_exec_data_release(spawn_data);
if (uaddr != 0)
uvm_uarea_free(uaddr);
return error;
}
int
sys_posix_spawn(struct lwp *l1, const struct sys_posix_spawn_args *uap,
register_t *retval)
{
/* {
syscallarg(pid_t *) pid;
syscallarg(const char *) path;
syscallarg(const struct posix_spawn_file_actions *) file_actions;
syscallarg(const struct posix_spawnattr *) attrp;
syscallarg(char *const *) argv;
syscallarg(char *const *) envp;
} */
int error;
struct posix_spawn_file_actions *fa = NULL;
struct posix_spawnattr *sa = NULL;
pid_t pid;
bool child_ok = false;
rlim_t max_fileactions;
proc_t *p = l1->l_proc;
/* check_posix_spawn() increments nprocs for us. */
error = check_posix_spawn(l1);
if (error) {
*retval = error;
return 0;
}
/* copy in file_actions struct */
if (SCARG(uap, file_actions) != NULL) {
max_fileactions = 2 * uimin(p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
maxfiles);
error = posix_spawn_fa_alloc(&fa, SCARG(uap, file_actions),
max_fileactions);
if (error)
goto error_exit;
}
/* copyin posix_spawnattr struct */
if (SCARG(uap, attrp) != NULL) {
sa = kmem_alloc(sizeof(*sa), KM_SLEEP);
error = copyin(SCARG(uap, attrp), sa, sizeof(*sa));
if (error)
goto error_exit;
}
/*
* Do the spawn
*/
error = do_posix_spawn(l1, &pid, &child_ok, SCARG(uap, path), fa, sa,
SCARG(uap, argv), SCARG(uap, envp), execve_fetch_element);
if (error)
goto error_exit;
if (error == 0 && SCARG(uap, pid) != NULL)
error = copyout(&pid, SCARG(uap, pid), sizeof(pid));
*retval = error;
return 0;
error_exit:
if (!child_ok) {
(void)chgproccnt(kauth_cred_getuid(l1->l_cred), -1);
atomic_dec_uint(&nprocs);
if (sa)
kmem_free(sa, sizeof(*sa));
if (fa)
posix_spawn_fa_free(fa, fa->len);
}
*retval = error;
return 0;
}
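/*
 * Editor's sketch: a minimal userland caller of the syscall above via
 * the standard posix_spawn(3) interface (hypothetical example, not part
 * of this file):
 *
 * #include <spawn.h>
 * #include <err.h>
 * #include <string.h>
 *
 * extern char **environ;
 *
 * pid_t pid;
 * char *argv[] = { "ls", "-l", NULL };
 * int error = posix_spawn(&pid, "/bin/ls", NULL, NULL, argv, environ);
 * if (error != 0)
 *         errx(1, "posix_spawn: %s", strerror(error));
 */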
void
exec_free_emul_arg(struct exec_package *epp)
{
if (epp->ep_emul_arg_free != NULL) {
KASSERT(epp->ep_emul_arg != NULL);
(*epp->ep_emul_arg_free)(epp->ep_emul_arg);
epp->ep_emul_arg_free = NULL;
epp->ep_emul_arg = NULL;
} else {
KASSERT(epp->ep_emul_arg == NULL);
}
}
#ifdef DEBUG_EXEC
static void
dump_vmcmds(const struct exec_package * const epp, size_t x, int error)
{
struct exec_vmcmd *vp = &epp->ep_vmcmds.evs_cmds[0];
size_t j;
if (error == 0)
DPRINTF(("vmcmds %u\n", epp->ep_vmcmds.evs_used));
else
DPRINTF(("vmcmds %zu/%u, error %d\n", x,
epp->ep_vmcmds.evs_used, error));
for (j = 0; j < epp->ep_vmcmds.evs_used; j++) {
DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#"
PRIxVADDR"/%#"PRIxVSIZE" fd@%#"
PRIxVSIZE" prot=0%o flags=%d\n", j,
vp[j].ev_proc == vmcmd_map_pagedvn ?
"pagedvn" :
vp[j].ev_proc == vmcmd_map_readvn ?
"readvn" :
vp[j].ev_proc == vmcmd_map_zero ?
"zero" : "*unknown*",
vp[j].ev_addr, vp[j].ev_len,
vp[j].ev_offset, vp[j].ev_prot,
vp[j].ev_flags));
if (error != 0 && j == x)
DPRINTF((" ^--- failed\n"));
}
}
#endif
/* $NetBSD: strncpy.c,v 1.4 2018/02/04 01:13:45 mrg Exp $ */
/*-
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Chris Torek.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)strncpy.c 8.1 (Berkeley) 6/4/93";
#else
__RCSID("$NetBSD: strncpy.c,v 1.4 2018/02/04 01:13:45 mrg Exp $");
#endif
#endif /* LIBC_SCCS and not lint */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
#ifdef _FORTIFY_SOURCE
#undef strncpy
#endif
/*
* Copy src to dst, truncating or null-padding to always copy n bytes.
* Return dst.
*/
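/*
 * For example (editor's note), with char buf[4]:
 * strncpy(buf, "ab", 4) yields { 'a', 'b', '\0', '\0' }, while
 * strncpy(buf, "abcdef", 4) yields { 'a', 'b', 'c', 'd' } with no
 * terminating NUL.
 */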
char *
strncpy(char *dst, const char *src, size_t n)
{
if (n != 0) {
char *d = dst;
const char *s = src;
do {
if ((*d++ = *s++) == 0) {
/* NUL pad the remaining n-1 bytes */
while (--n != 0)
*d++ = 0;
break;
}
} while (--n != 0);
}
return (dst);
}
/* $NetBSD: procfs.h,v 1.84 2024/01/17 10:20:12 hannken Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs.h 8.9 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs.h 8.9 (Berkeley) 5/14/95
*/
/* This also pulls in __HAVE_PROCFS_MACHDEP */
#include <sys/ptrace.h>
#ifdef _KERNEL
#include <sys/proc.h>
/*
* The different types of node in a procfs filesystem
*/
typedef enum {
PFSauxv, /* ELF Auxiliary Vector */
PFSchroot, /* the process's current root directory */
PFScmdline, /* process command line args */
PFScpuinfo, /* CPU info (if -o linux) */
PFScpustat, /* status info (if -o linux) */
PFScurproc, /* symbolic link for curproc */
PFScwd, /* the process's current working directory */
PFSdevices, /* major/device name mappings (if -o linux) */
PFSemul, /* the process's emulation */
PFSenviron, /* process environment */
PFSexe, /* symlink to the executable file */
PFSfd, /* a directory containing the process's open fd's */
PFSfile, /* the executable file */
PFSfpregs, /* the process's FP register set */
PFSloadavg, /* load average (if -o linux) */
PFSlimit, /* resource limits */
PFSmap, /* memory map */
PFSmaps, /* memory map, Linux style (if -o linux) */
PFSmem, /* the process's memory image */
PFSmeminfo, /* system memory info (if -o linux) */
PFSmounts, /* mounted filesystems (if -o linux) */
PFSnote, /* process notifier */
PFSnotepg, /* process group notifier */
PFSproc, /* a process-specific sub-directory */
PFSregs, /* the process's register set */
PFSroot, /* the filesystem root */
PFSself, /* like curproc, but this is the Linux name */
PFSstat, /* process status (if -o linux) */
PFSstatm, /* process memory info (if -o linux) */
PFSstatus, /* process status */
PFStask, /* task subdirectory (if -o linux) */
PFSuptime, /* elapsed time since boot (if -o linux) */
PFSversion, /* kernel version (if -o linux) */
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODE_TYPES
#endif
PFSlast, /* track number of types */
} pfstype;
/*
* control data for the proc file system.
*/
struct pfskey {
pfstype pk_type; /* type of procfs node */
pid_t pk_pid; /* associated process */
int pk_fd; /* associated fd if not -1 */
};
struct pfsnode {
LIST_ENTRY(pfsnode) pfs_hash; /* per pid hash list */
struct vnode *pfs_vnode; /* vnode associated with this pfsnode */
struct mount *pfs_mount; /* mount associated with this pfsnode */
struct pfskey pfs_key;
#define pfs_type pfs_key.pk_type
#define pfs_pid pfs_key.pk_pid
#define pfs_fd pfs_key.pk_fd
mode_t pfs_mode; /* mode bits for stat() */
u_long pfs_flags; /* open flags */
uint64_t pfs_fileno; /* unique file id */
};
#define PROCFS_NOTELEN 64 /* max length of a note (/proc/$pid/note) */
#define PROCFS_MAXNAMLEN 255
#endif /* _KERNEL */
struct procfs_args {
int version;
int flags;
};
#define PROCFS_ARGSVERSION 1
#define PROCFSMNT_LINUXCOMPAT 0x01
#define PROCFSMNT_BITS "\177\20" \
"b\00linuxcompat\0"
/*
* Kernel stuff follows
*/
#ifdef _KERNEL
#define CNEQ(cnp, s, len) \
((cnp)->cn_namelen == (len) && \
(memcmp((s), (cnp)->cn_nameptr, (len)) == 0))
#define UIO_MX 32
static __inline ino_t
procfs_fileno(pid_t _pid, pfstype _type, int _fd)
{
ino_t _ino;
switch (_type) {
case PFSroot:
return 2;
case PFScurproc:
return 3;
case PFSself:
return 4;
default:
_ino = _pid + 1;
if (_fd != -1)
_ino = _ino << 32 | _fd;
return _ino * PFSlast + _type;
}
}
#define PROCFS_FILENO(pid, type, fd) procfs_fileno(pid, type, fd)
#define PROCFS_TYPE(type) ((type) % PFSlast)
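/*
 * Editor's note: procfs_fileno() gives the root, curproc and self nodes
 * fixed inode numbers (2, 3 and 4) and derives every other inode from
 * the pid (folding in the fd for per-fd nodes), scaled by PFSlast so
 * that different node types for the same process never collide; for
 * those derived inodes, PROCFS_TYPE() recovers the node type from the
 * low bits.
 */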
struct procfsmount {
int pmnt_flags;
};
#define VFSTOPROC(mp) ((struct procfsmount *)(mp)->mnt_data)
/*
* Convert between pfsnode vnode
*/
#define VTOPFS(vp) ((struct pfsnode *)(vp)->v_data)
#define PFSTOV(pfs) ((pfs)->pfs_vnode)
typedef struct vfs_namemap vfs_namemap_t;
struct vfs_namemap {
const char *nm_name;
int nm_val;
};
int vfs_getuserstr(struct uio *, char *, int *);
const vfs_namemap_t *vfs_findname(const vfs_namemap_t *, const char *, int);
struct mount;
struct proc *procfs_proc_find(struct mount *, pid_t);
bool procfs_use_linux_compat(struct mount *);
static inline bool
procfs_proc_is_linux_compat(void)
{
const char *emulname = curlwp->l_proc->p_emul->e_name;
return (strncmp(emulname, "linux", 5) == 0);
}
int procfs_proc_lock(struct mount *, int, struct proc **, int);
void procfs_proc_unlock(struct proc *);
int procfs_allocvp(struct mount *, struct vnode **, pid_t, pfstype, int);
int procfs_donote(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doregs(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dofpregs(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_domem(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_do_pid_stat(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dostatus(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_domap(struct lwp *, struct proc *, struct pfsnode *,
struct uio *, int);
int procfs_doprocargs(struct lwp *, struct proc *, struct pfsnode *,
struct uio *, int);
int procfs_domeminfo(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_dodevices(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_docpuinfo(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_docpustat(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doloadavg(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_do_pid_statm(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dofd(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_douptime(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_domounts(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doemul(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doversion(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doauxv(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_dolimit(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
void procfs_hashrem(struct pfsnode *);
int procfs_getfp(struct pfsnode *, struct proc *, struct file **);
/* functions to check whether or not files should be displayed */
int procfs_validauxv(struct lwp *, struct mount *);
int procfs_validfile(struct lwp *, struct mount *);
int procfs_validfpregs(struct lwp *, struct mount *);
int procfs_validregs(struct lwp *, struct mount *);
int procfs_validmap(struct lwp *, struct mount *);
int procfs_rw(void *);
int procfs_getcpuinfstr(char *, size_t *);
#define PROCFS_LOCKED 0x01
#define PROCFS_WANT 0x02
extern int (**procfs_vnodeop_p)(void *);
extern struct vfsops procfs_vfsops;
int procfs_root(struct mount *, int, struct vnode **);
#ifdef __HAVE_PROCFS_MACHDEP
struct vattr;
void procfs_machdep_allocvp(struct vnode *);
int procfs_machdep_rw(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_machdep_getattr(struct vnode *, struct vattr *, struct proc *);
#endif
#endif /* _KERNEL */
/* $NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $ */
/*-
* Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 2006 YAMAMOTO Takashi.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/specificdata.h>
#include <sys/queue.h>
#include <sys/mutex.h>
/*
* Locking notes:
*
* The specdataref_container pointer in the specificdata_reference
* is volatile. To read it, you must hold EITHER the domain lock
* or the ref lock. To write it, you must hold BOTH the domain lock
* and the ref lock. The locks must be acquired in the following
* order:
* domain -> ref
*/
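/*
 * Editor's note: in practice a writer follows the pattern used by the
 * slow path of specificdata_setspecific() below:
 *
 * mutex_enter(&sd->sd_lock);
 * mutex_enter(&ref->specdataref_lock);
 * ref->specdataref_container = ...;
 * mutex_exit(&ref->specdataref_lock);
 * mutex_exit(&sd->sd_lock);
 *
 * while a reader may take either lock alone.
 */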
typedef struct {
specificdata_dtor_t ski_dtor;
} specificdata_key_impl;
struct specificdata_container {
size_t sc_nkey;
LIST_ENTRY(specificdata_container) sc_list;
void * sc_data[]; /* variable length */
};
#define SPECIFICDATA_CONTAINER_BYTESIZE(n) \
(sizeof(struct specificdata_container) + ((n) * sizeof(void *)))
struct specificdata_domain {
kmutex_t sd_lock;
unsigned int sd_nkey;
LIST_HEAD(, specificdata_container) sd_list;
specificdata_key_impl *sd_keys;
};
static void
specificdata_container_link(specificdata_domain_t sd,
specificdata_container_t sc)
{
LIST_INSERT_HEAD(&sd->sd_list, sc, sc_list);
}
static void
specificdata_container_unlink(specificdata_domain_t sd,
specificdata_container_t sc)
{
LIST_REMOVE(sc, sc_list);
}
static void
specificdata_destroy_datum(specificdata_domain_t sd,
specificdata_container_t sc, specificdata_key_t key)
{
specificdata_dtor_t dtor;
void *data;
if (key >= sc->sc_nkey)
return;
KASSERT(key < sd->sd_nkey);
data = sc->sc_data[key];
dtor = sd->sd_keys[key].ski_dtor;
if (dtor != NULL) {
if (data != NULL) {
sc->sc_data[key] = NULL;
(*dtor)(data);
}
} else {
KASSERT(data == NULL);
}
}
static void
specificdata_noop_dtor(void *data)
{
/* nothing */
}
/*
* specificdata_domain_create --
* Create a specificdata domain.
*/
specificdata_domain_t
specificdata_domain_create(void)
{
specificdata_domain_t sd;
sd = kmem_zalloc(sizeof(*sd), KM_SLEEP);
mutex_init(&sd->sd_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&sd->sd_list);
return (sd);
}
/*
* specificdata_domain_delete --
* Destroy a specificdata domain.
*/
void
specificdata_domain_delete(specificdata_domain_t sd)
{
panic("specificdata_domain_delete: not implemented");
}
/*
* specificdata_key_create --
* Create a specificdata key for a domain.
*
* Note: This is a rare operation.
*/
int
specificdata_key_create(specificdata_domain_t sd, specificdata_key_t *keyp,
specificdata_dtor_t dtor)
{
specificdata_key_impl *newkeys;
specificdata_key_t key = 0;
size_t nsz;
ASSERT_SLEEPABLE();
if (dtor == NULL)
dtor = specificdata_noop_dtor;
mutex_enter(&sd->sd_lock);
if (sd->sd_keys == NULL)
goto needalloc;
for (; key < sd->sd_nkey; key++) {
if (sd->sd_keys[key].ski_dtor == NULL)
goto gotit;
}
needalloc:
nsz = (sd->sd_nkey + 1) * sizeof(*newkeys);
/* XXXSMP allocating memory while holding a lock. */
newkeys = kmem_zalloc(nsz, KM_SLEEP);
if (sd->sd_keys != NULL) {
size_t osz = sd->sd_nkey * sizeof(*newkeys);
memcpy(newkeys, sd->sd_keys, osz);
kmem_free(sd->sd_keys, osz);
}
sd->sd_keys = newkeys;
sd->sd_nkey++;
gotit:
sd->sd_keys[key].ski_dtor = dtor;
mutex_exit(&sd->sd_lock);
*keyp = key;
return (0);
}
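/*
 * Editor's sketch of typical use of this API (hypothetical consumer,
 * not part of this file):
 *
 * static specificdata_domain_t my_domain;
 * static specificdata_key_t my_key;
 *
 * my_domain = specificdata_domain_create();
 * specificdata_key_create(my_domain, &my_key, my_dtor);
 *
 * specificdata_init(my_domain, &obj->ref);
 * specificdata_setspecific(my_domain, &obj->ref, my_key, datum);
 * datum = specificdata_getspecific(my_domain, &obj->ref, my_key);
 * specificdata_fini(my_domain, &obj->ref);
 */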
/*
* specificdata_key_delete --
* Destroy a specificdata key for a domain.
*
* Note: This is a rare operation.
*/
void
specificdata_key_delete(specificdata_domain_t sd, specificdata_key_t key)
{
specificdata_container_t sc;
mutex_enter(&sd->sd_lock);
if (key >= sd->sd_nkey)
goto out;
/*
* Traverse all of the specificdata containers in the domain
* and destroy the datum for the dying key.
*/
LIST_FOREACH(sc, &sd->sd_list, sc_list) {
specificdata_destroy_datum(sd, sc, key);
}
sd->sd_keys[key].ski_dtor = NULL;
out:
mutex_exit(&sd->sd_lock);
}
/*
* specificdata_init --
* Initialize a specificdata container for operation in the
* specified domain.
*/
int
specificdata_init(specificdata_domain_t sd, specificdata_reference *ref)
{
/*
* Just NULL-out the container pointer; we'll allocate the
* container the first time specificdata is put into it.
*/
ref->specdataref_container = NULL;
mutex_init(&ref->specdataref_lock, MUTEX_DEFAULT, IPL_NONE);
return (0);
}
/*
* specificdata_fini --
* Destroy a specificdata container. We destroy all of the datums
* stuffed into the container just as if the key were destroyed.
*/
void
specificdata_fini(specificdata_domain_t sd, specificdata_reference *ref)
{
specificdata_container_t sc;
specificdata_key_t key;
ASSERT_SLEEPABLE();
mutex_destroy(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc == NULL)
return;
ref->specdataref_container = NULL;
mutex_enter(&sd->sd_lock);
specificdata_container_unlink(sd, sc);
for (key = 0; key < sc->sc_nkey; key++) {
specificdata_destroy_datum(sd, sc, key);
}
mutex_exit(&sd->sd_lock);
kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey));
}
/*
* specificdata_getspecific --
* Get a datum from a container.
*/
void *
specificdata_getspecific(specificdata_domain_t sd, specificdata_reference *ref,
specificdata_key_t key)
{
specificdata_container_t sc;
void *data = NULL;
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc != NULL && key < sc->sc_nkey)
data = sc->sc_data[key];
mutex_exit(&ref->specdataref_lock);
return (data);
}
/*
* specificdata_getspecific_unlocked --
* Get a datum from a container in a lockless fashion.
*
* Note: When using this routine, care must be taken to ensure
* that no other thread could cause the specificdata_reference
* to become invalid (i.e. point at the wrong container) by
* issuing a setspecific call or destroying the container.
*/
void *
specificdata_getspecific_unlocked(specificdata_domain_t sd,
specificdata_reference *ref,
specificdata_key_t key)
{
specificdata_container_t sc;
sc = ref->specdataref_container;
if (sc != NULL && key < sc->sc_nkey)
return (sc->sc_data[key]);
return (NULL);
}
/*
* specificdata_setspecific --
* Put a datum into a container.
*/
void
specificdata_setspecific(specificdata_domain_t sd,
specificdata_reference *ref,
specificdata_key_t key, void *data)
{
specificdata_container_t sc, newsc;
size_t newnkey, sz;
ASSERT_SLEEPABLE();
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (__predict_true(sc != NULL && key < sc->sc_nkey)) {
sc->sc_data[key] = data;
mutex_exit(&ref->specdataref_lock);
return;
}
mutex_exit(&ref->specdataref_lock);
/*
* Slow path: need to resize.
*/
mutex_enter(&sd->sd_lock);
newnkey = sd->sd_nkey;
if (key >= newnkey) {
mutex_exit(&sd->sd_lock);
panic("specificdata_setspecific");
}
sz = SPECIFICDATA_CONTAINER_BYTESIZE(newnkey);
newsc = kmem_zalloc(sz, KM_SLEEP);
newsc->sc_nkey = newnkey;
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc != NULL) {
if (key < sc->sc_nkey) {
/*
* Someone beat us to the punch. Unwind and put
* the object into the now large enough container.
*/
sc->sc_data[key] = data;
mutex_exit(&ref->specdataref_lock);
mutex_exit(&sd->sd_lock);
kmem_free(newsc, sz);
return;
}
specificdata_container_unlink(sd, sc);
memcpy(newsc->sc_data, sc->sc_data,
sc->sc_nkey * sizeof(void *));
}
newsc->sc_data[key] = data;
specificdata_container_link(sd, newsc);
ref->specdataref_container = newsc;
mutex_exit(&ref->specdataref_lock);
mutex_exit(&sd->sd_lock);
if (sc != NULL)
kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey));
}
/* $NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $ */
/*-
* Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
/*
* The vnode cache subsystem.
*
* Life-cycle
*
* Normally, there are two points where new vnodes are created:
* VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
* starts in one of the following ways:
*
* - Allocation, via vcache_get(9) or vcache_new(9).
* - Reclamation of inactive vnode, via vcache_vget(9).
*
* Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
* was another, traditional way. Currently, only the draining thread
* recycles vnodes. This behaviour might be revisited.
*
* The life-cycle ends when the last reference is dropped, usually
* in VOP_REMOVE(9). In that case, VOP_INACTIVE(9) is called to inform
* the file system that the vnode is inactive. Via this call, the file
* system indicates whether the vnode can be recycled (usually, it checks
* its own references, e.g. the link count, or whether the file was
* removed).
*
* Depending on that indication, the vnode is either put onto a free
* list (the cache), or cleaned via vcache_reclaim, which calls
* VOP_RECLAIM(9) to disassociate the underlying file system from the
* vnode, and finally destroyed.
*
* Vnode state
*
* A vnode is always in one of six states:
* - MARKER This is a marker vnode to help list traversal. It
* will never change its state.
* - LOADING Vnode is associating with the underlying file system
* and is not yet ready to use.
* - LOADED Vnode has an associated underlying file system and is
* ready to use.
* - BLOCKED Vnode is active but cannot get new references.
* - RECLAIMING Vnode is disassociating from the underlying file
* system.
* - RECLAIMED Vnode has disassociated from the underlying file
* system and is dead.
*
* Valid state changes are:
* LOADING -> LOADED
* Vnode has been initialised in vcache_get() or
* vcache_new() and is ready to use.
* BLOCKED -> RECLAIMING
* Vnode starts disassociation from underlying file
* system in vcache_reclaim().
* RECLAIMING -> RECLAIMED
* Vnode finished disassociation from underlying file
* system in vcache_reclaim().
* LOADED -> BLOCKED
* Either vcache_rekey*() is changing the vnode key or
* vrelel() is about to call VOP_INACTIVE().
* BLOCKED -> LOADED
* The block condition is over.
* LOADING -> RECLAIMED
* Either vcache_get() or vcache_new() failed to
* associate the underlying file system or vcache_rekey*()
* drops a vnode used as placeholder.
*
* Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
* and it is possible to wait for a state change.
*
* State is protected by v_interlock, with one exception: since changing
* the state away from LOADING requires both v_interlock and vcache_lock,
* it is possible to check "state == LOADING" while holding only
* vcache_lock. See vcache_get() for details.
*
* Reference counting
*
* A vnode is considered active if its reference count
* (vnode_t::v_usecount) is non-zero. The count is maintained using
* the vref(9), vrele(9) and vput(9) routines. Typical holders of
* references are e.g. open files, current working directories and
* mount points.
*
* v_usecount is adjusted with atomic operations; however, to change it
* from a non-zero value to zero, the interlock must also be held.
*/
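/*
* Illustrative sketch (not compiled): how a typical consumer drives this
* life-cycle. The inode-number key and the sequence of VOP calls are
* hypothetical; vcache_get(), vn_lock(), VOP_UNLOCK() and vrele() are the
* real interfaces, see vcache_get() below and vnode(9).
*/
#if 0
ino_t ino = 2; /* hypothetical key identifying the fs node */
struct vnode *vp;
int error;
/* LOADING -> LOADED happens inside vcache_get() on first use. */
error = vcache_get(mp, &ino, sizeof(ino), &vp);
if (error != 0)
return error;
/* Use the vnode; take the vnode lock around VOP_*() calls. */
vn_lock(vp, LK_SHARED | LK_RETRY);
/* ... VOP_GETATTR(), VOP_READ(), ... */
VOP_UNLOCK(vp);
/* Drop the reference; the last vrele() may deactivate and recycle. */
vrele(vp);
#endif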
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $");
#ifdef _KERNEL_OPT
#include "opt_pax.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pax.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/threadpool.h>
#include <sys/vnode_impl.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_stat.h>
/* Flags to vrelel. */
#define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
#define LRU_VRELE 0
#define LRU_FREE 1
#define LRU_HOLD 2
#define LRU_COUNT 3
/*
* There are three lru lists: one holds vnodes waiting for async release,
* one is for vnodes which have no buffer/page references and one for those
* which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
* private cache line as vnodes migrate between them while under the same
* lock (vdrain_lock).
*/
typedef struct {
vnode_impl_t *li_marker;
} lru_iter_t;
u_int numvnodes __cacheline_aligned;
static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
static struct threadpool *threadpool;
static struct threadpool_job vdrain_job;
static struct threadpool_job vrele_job;
static kmutex_t vdrain_lock __cacheline_aligned;
SLIST_HEAD(hashhead, vnode_impl);
static kmutex_t vcache_lock __cacheline_aligned;
static kcondvar_t vcache_cv;
static u_int vcache_hashsize;
static u_long vcache_hashmask;
static struct hashhead *vcache_hashtab;
static pool_cache_t vcache_pool;
static void lru_requeue(vnode_t *, vnodelst_t *);
static vnodelst_t * lru_which(vnode_t *);
static vnode_impl_t * lru_iter_first(int, lru_iter_t *);
static vnode_impl_t * lru_iter_next(lru_iter_t *);
static void lru_iter_release(lru_iter_t *);
static vnode_impl_t * vcache_alloc(void);
static void vcache_dealloc(vnode_impl_t *);
static void vcache_free(vnode_impl_t *);
static void vcache_init(void);
static void vcache_reinit(void);
static void vcache_reclaim(vnode_t *);
static void vrele_deferred(vnode_impl_t *);
static void vrelel(vnode_t *, int, int);
static void vnpanic(vnode_t *, const char *, ...)
__printflike(2, 3);
static bool vdrain_one(u_int);
static void vdrain_task(struct threadpool_job *);
static void vrele_task(struct threadpool_job *);
/* Routines having to do with the management of the vnode table. */
/*
* The high bit of v_usecount is a gate for vcache_tryvget(). It's set
* only when the vnode state is LOADED.
* The next bit of v_usecount is a flag for vrelel(). It's set
* from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
*/
#define VUSECOUNT_MASK 0x3fffffff
#define VUSECOUNT_GATE 0x80000000
#define VUSECOUNT_VGET 0x40000000
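/*
* For example, a v_usecount value of 0x80000002 has VUSECOUNT_GATE set
* (state LOADED, so vcache_tryvget() may succeed), VUSECOUNT_VGET clear,
* and a reference count of 2; vrefcnt() below returns 2 for it.
*/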
/*
* Return the current usecount of a vnode.
*/
inline int
vrefcnt(struct vnode *vp)
{
return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
}
/* Vnode state operations and diagnostics. */
#if defined(DIAGNOSTIC)
#define VSTATE_VALID(state) \
((state) != VS_ACTIVE && (state) != VS_MARKER)
#define VSTATE_GET(vp) \
vstate_assert_get((vp), __func__, __LINE__)
#define VSTATE_CHANGE(vp, from, to) \
vstate_assert_change((vp), (from), (to), __func__, __LINE__)
#define VSTATE_WAIT_STABLE(vp) \
vstate_assert_wait_stable((vp), __func__, __LINE__)
void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
bool has_lock)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int refcnt = vrefcnt(vp);
if (!has_lock) {
enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state);
if (state == VS_ACTIVE && refcnt > 0 &&
(vstate == VS_LOADED || vstate == VS_BLOCKED))
return;
if (vstate == state)
return;
mutex_enter((vp)->v_interlock);
}
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if ((state == VS_ACTIVE && refcnt > 0 &&
(vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
vip->vi_state == state) {
if (!has_lock)
mutex_exit((vp)->v_interlock);
return;
}
vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
vstate_name(vip->vi_state), refcnt,
vstate_name(state), func, line);
}
static enum vnode_state
vstate_assert_get(vnode_t *vp, const char *func, int line)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
return vip->vi_state;
}
static void
vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
cv_wait(&vp->v_cv, vp->v_interlock);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
}
static void
vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
const char *func, int line)
{
bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if (from == VS_LOADING)
KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
if (! VSTATE_VALID(from))
vnpanic(vp, "from is %s at %s:%d",
vstate_name(from), func, line);
if (! VSTATE_VALID(to))
vnpanic(vp, "to is %s at %s:%d",
vstate_name(to), func, line);
if (vip->vi_state != from)
vnpanic(vp, "from is %s, expected %s at %s:%d\n",
vstate_name(vip->vi_state), vstate_name(from), func, line);
if ((from == VS_LOADED) != gated)
vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
vstate_name(vip->vi_state), gated, func, line);
/* Open/close the gate for vcache_tryvget(). */
if (to == VS_LOADED) {
membar_release();
atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
} else {
atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
}
atomic_store_relaxed(&vip->vi_state, to);
if (from == VS_LOADING)
cv_broadcast(&vcache_cv);
if (to == VS_LOADED || to == VS_RECLAIMED)
cv_broadcast(&vp->v_cv);
}
#else /* defined(DIAGNOSTIC) */
#define VSTATE_GET(vp) \
(VNODE_TO_VIMPL((vp))->vi_state)
#define VSTATE_CHANGE(vp, from, to) \
vstate_change((vp), (from), (to))
#define VSTATE_WAIT_STABLE(vp) \
vstate_wait_stable((vp))
void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
bool has_lock)
{
}
static void
vstate_wait_stable(vnode_t *vp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
cv_wait(&vp->v_cv, vp->v_interlock);
}
static void
vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
/* Open/close the gate for vcache_tryvget(). */
if (to == VS_LOADED) {
membar_release();
atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
} else {
atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
}
atomic_store_relaxed(&vip->vi_state, to);
if (from == VS_LOADING)
cv_broadcast(&vcache_cv);
if (to == VS_LOADED || to == VS_RECLAIMED)
cv_broadcast(&vp->v_cv);
}
#endif /* defined(DIAGNOSTIC) */
void
vfs_vnode_sysinit(void)
{
int error __diagused, i;
dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
KASSERT(dead_rootmount != NULL);
dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
for (i = 0; i < LRU_COUNT; i++) {
TAILQ_INIT(&lru_list[i]);
}
vcache_init();
error = threadpool_get(&threadpool, PRI_NONE);
KASSERTMSG((error == 0), "threadpool_get failed: %d", error);
threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain");
threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele");
}
/*
* Allocate a new marker vnode.
*/
vnode_t *
vnalloc_marker(struct mount *mp)
{
vnode_impl_t *vip;
vnode_t *vp;
vip = pool_cache_get(vcache_pool, PR_WAITOK);
memset(vip, 0, sizeof(*vip));
vp = VIMPL_TO_VNODE(vip);
uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
vp->v_mount = mp;
vp->v_type = VBAD;
vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
klist_init(&vip->vi_klist.vk_klist);
vp->v_klist = &vip->vi_klist;
vip->vi_state = VS_MARKER;
return vp;
}
/*
* Free a marker vnode.
*/
void
vnfree_marker(vnode_t *vp)
{
vnode_impl_t *vip;
vip = VNODE_TO_VIMPL(vp);
KASSERT(vip->vi_state == VS_MARKER);
mutex_obj_free(vp->v_interlock);
uvm_obj_destroy(&vp->v_uobj, true);
klist_fini(&vip->vi_klist.vk_klist);
pool_cache_put(vcache_pool, vip);
}
/*
* Test a vnode for being a marker vnode.
*/
bool
vnis_marker(vnode_t *vp)
{
return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
}
/*
* Return the lru list this node should be on.
*/
static vnodelst_t *
lru_which(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt > 0)
return &lru_list[LRU_HOLD];
else
return &lru_list[LRU_FREE];
}
/*
* Put vnode to end of given list.
* Both the current and the new list may be NULL, used on vnode alloc/free.
* Adjust numvnodes and signal vdrain thread if there is work.
*/
static void
lru_requeue(vnode_t *vp, vnodelst_t *listhd)
{
vnode_impl_t *vip;
int d;
/*
* If the vnode is on the correct list, and was put there recently,
* then leave it be, thus avoiding huge cache and lock contention.
*/
vip = VNODE_TO_VIMPL(vp);
if (listhd == vip->vi_lrulisthd &&
(getticks() - vip->vi_lrulisttm) < hz) {
return;
}
mutex_enter(&vdrain_lock);
d = 0;
if (vip->vi_lrulisthd != NULL)
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
else
d++;
vip->vi_lrulisthd = listhd;
vip->vi_lrulisttm = getticks();
if (vip->vi_lrulisthd != NULL)
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
else
d--;
if (d != 0) {
/*
* Looks strange? This is not a bug. Don't store
* numvnodes unless there is a change - avoid false
* sharing on MP.
*/
numvnodes += d;
}
if (listhd == &lru_list[LRU_VRELE])
threadpool_schedule_job(threadpool, &vrele_job);
if (d > 0 && numvnodes > desiredvnodes)
threadpool_schedule_job(threadpool, &vdrain_job);
if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16)
kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock);
mutex_exit(&vdrain_lock);
}
/*
* LRU list iterator.
* Caller holds vdrain_lock.
*/
static vnode_impl_t *
lru_iter_first(int idx, lru_iter_t *iterp)
{
vnode_impl_t *marker;
KASSERT(mutex_owned(&vdrain_lock));
mutex_exit(&vdrain_lock);
marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
mutex_enter(&vdrain_lock);
marker->vi_lrulisthd = &lru_list[idx];
iterp->li_marker = marker;
TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist);
return lru_iter_next(iterp);
}
static vnode_impl_t *
lru_iter_next(lru_iter_t *iter)
{
vnode_impl_t *vip, *marker;
vnodelst_t *listhd;
KASSERT(mutex_owned(&vdrain_lock));
marker = iter->li_marker;
listhd = marker->vi_lrulisthd;
while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
TAILQ_REMOVE(listhd, marker, vi_lrulist);
TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist);
if (!vnis_marker(VIMPL_TO_VNODE(vip)))
break;
}
return vip;
}
static void
lru_iter_release(lru_iter_t *iter)
{
vnode_impl_t *marker;
KASSERT(mutex_owned(&vdrain_lock));
marker = iter->li_marker;
TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist);
mutex_exit(&vdrain_lock);
vnfree_marker(VIMPL_TO_VNODE(marker));
mutex_enter(&vdrain_lock);
}
/*
* Release deferred vrele vnodes for this mount.
* Called with file system suspended.
*/
void
vrele_flush(struct mount *mp)
{
lru_iter_t iter;
vnode_impl_t *vip;
KASSERT(fstrans_is_owner(mp));
mutex_enter(&vdrain_lock);
for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL;
vip = lru_iter_next(&iter)) {
if (VIMPL_TO_VNODE(vip)->v_mount != mp)
continue;
vrele_deferred(vip);
}
lru_iter_release(&iter);
mutex_exit(&vdrain_lock);
}
/*
* One pass through the LRU lists to keep the number of allocated
* vnodes below target. Returns true if target met.
*/
static bool
vdrain_one(u_int target)
{
int ix, lists[] = { LRU_FREE, LRU_HOLD };
lru_iter_t iter;
vnode_impl_t *vip;
vnode_t *vp;
struct mount *mp;
KASSERT(mutex_owned(&vdrain_lock));
for (ix = 0; ix < __arraycount(lists); ix++) {
for (vip = lru_iter_first(lists[ix], &iter); vip != NULL;
vip = lru_iter_next(&iter)) {
if (numvnodes < target) {
lru_iter_release(&iter);
return true;
}
vp = VIMPL_TO_VNODE(vip);
/* Probe usecount (unlocked). */
if (vrefcnt(vp) > 0)
continue;
/* Try v_interlock -- we lock the wrong direction! */
if (!mutex_tryenter(vp->v_interlock))
continue;
/* Probe usecount and state. */
if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
mutex_exit(vp->v_interlock);
continue;
}
mutex_exit(&vdrain_lock);
mp = vp->v_mount;
if (fstrans_start_nowait(mp) != 0) {
mutex_exit(vp->v_interlock);
mutex_enter(&vdrain_lock);
continue;
}
if (vcache_vget(vp) == 0) {
if (!vrecycle(vp)) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_EXCLUSIVE);
}
}
fstrans_done(mp);
mutex_enter(&vdrain_lock);
}
lru_iter_release(&iter);
}
return false;
}
/*
* threadpool task to keep the number of vnodes below desiredvnodes.
*/
static void
vdrain_task(struct threadpool_job *job)
{
u_int target;
target = desiredvnodes - desiredvnodes / 16;
mutex_enter(&vdrain_lock);
while (!vdrain_one(target))
kpause("vdrain", false, 1, &vdrain_lock);
threadpool_job_done(job);
mutex_exit(&vdrain_lock);
}
/*
* threadpool task to process asynchronous vrele.
*/
static void
vrele_task(struct threadpool_job *job)
{
int skipped;
lru_iter_t iter;
vnode_impl_t *vip;
struct mount *mp;
mutex_enter(&vdrain_lock);
while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) {
for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) {
mp = VIMPL_TO_VNODE(vip)->v_mount;
if (fstrans_start_nowait(mp) == 0) {
vrele_deferred(vip);
fstrans_done(mp);
} else {
skipped++;
}
}
lru_iter_release(&iter);
if (skipped)
kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock);
}
threadpool_job_done(job);
lru_iter_release(&iter);
mutex_exit(&vdrain_lock);
}
/*
* Try to drop a reference on a vnode. Abort if we are releasing the
* last reference. Note: this _must_ succeed if not the last reference.
*/
static bool
vtryrele(vnode_t *vp)
{
u_int use, next;
membar_release();
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
return false;
}
KASSERT((use & VUSECOUNT_MASK) > 1);
next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
if (__predict_true(next == use)) {
return true;
}
}
}
/*
* vput: unlock and release the reference.
*/
void
vput(vnode_t *vp)
{
int lktype;
/*
* Do an unlocked check of the usecount. If it looks like we're not
* about to drop the last reference, then unlock the vnode and try
* to drop the reference. If it ends up being the last reference
* after all, vrelel() can fix it all up. Most of the time this
* will all go to plan.
*/
if (vrefcnt(vp) > 1) {
VOP_UNLOCK(vp);
if (vtryrele(vp)) {
return;
}
lktype = LK_NONE;
} else {
lktype = VOP_ISLOCKED(vp);
KASSERT(lktype != LK_NONE);
}
mutex_enter(vp->v_interlock);
vrelel(vp, 0, lktype);
}
/*
* Release a vnode from the deferred list.
*/
static void
vrele_deferred(vnode_impl_t *vip)
{
vnode_t *vp;
KASSERT(mutex_owned(&vdrain_lock));
KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
vp = VIMPL_TO_VNODE(vip);
/*
* First remove the vnode from the vrele list.
* Put it on the last lru list, the last vrele()
* will put it back onto the right list before
* its usecount reaches zero.
*/
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
vip->vi_lrulisthd = &lru_list[LRU_HOLD];
vip->vi_lrulisttm = getticks();
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
mutex_exit(&vdrain_lock);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_EXCLUSIVE);
mutex_enter(&vdrain_lock);
}
/*
* Vnode release. If reference count drops to zero, call inactive
* routine and either return to freelist or free to the pool.
*/
static void
vrelel(vnode_t *vp, int flags, int lktype)
{
const bool async = ((flags & VRELEL_ASYNC) != 0);
bool recycle, defer, objlock_held;
u_int use, next;
int error;
objlock_held = false;
retry:
KASSERT(mutex_owned(vp->v_interlock));
if (__predict_false(vp->v_op == dead_vnodeop_p &&
VSTATE_GET(vp) != VS_RECLAIMED)) {
vnpanic(vp, "dead but not clean");
}
/*
* If not the last reference, just unlock and drop the reference count.
*
* Otherwise make sure we pass a point in time where we hold the
* last reference with VGET flag unset.
*/
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
lktype = LK_NONE;
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
if (vtryrele(vp)) {
mutex_exit(vp->v_interlock);
return;
}
next = atomic_load_relaxed(&vp->v_usecount);
continue;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
next = use & ~VUSECOUNT_VGET;
if (next != use) {
next = atomic_cas_uint(&vp->v_usecount, use, next);
}
if (__predict_true(next == use)) {
break;
}
}
membar_acquire();
if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
vnpanic(vp, "%s: bad ref count", __func__);
}
#ifdef DIAGNOSTIC
if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
vprint("vrelel: missing VOP_CLOSE()", vp);
}
#endif
/*
* If already clean there is no need to lock, defer or
* deactivate this node.
*/
if (VSTATE_GET(vp) == VS_RECLAIMED) {
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
lktype = LK_NONE;
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
goto out;
}
/*
* First try to get the vnode locked for VOP_INACTIVE().
* Defer vnode release to the vrele task if the caller requests
* it explicitly, if the caller is the pagedaemon, or if taking
* the lock failed.
*/
defer = false;
if ((curlwp == uvm.pagedaemon_lwp) || async) {
defer = true;
} else if (lktype == LK_SHARED) {
/* Excellent chance of getting, if the last ref. */
error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
if (error != 0) {
defer = true;
} else {
lktype = LK_EXCLUSIVE;
}
} else if (lktype == LK_NONE) {
/* Excellent chance of getting, if the last ref. */
error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
if (error != 0) {
defer = true;
} else {
lktype = LK_EXCLUSIVE;
}
}
KASSERT(mutex_owned(vp->v_interlock));
if (defer) {
/*
* Defer reclaim to the vrele task; it's not safe to
* clean it here. We donate it our last reference.
*/
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
lru_requeue(vp, &lru_list[LRU_VRELE]);
mutex_exit(vp->v_interlock);
return;
}
KASSERT(lktype == LK_EXCLUSIVE);
/* If the node gained another reference, retry. */
use = atomic_load_relaxed(&vp->v_usecount);
if ((use & VUSECOUNT_VGET) != 0) {
goto retry;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
(vp->v_vflag & VV_MAPPED) != 0) {
/* Take care of space accounting. */
if (!objlock_held) {
objlock_held = true;
if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
mutex_exit(vp->v_interlock);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
mutex_enter(vp->v_interlock);
goto retry;
}
}
if ((vp->v_iflag & VI_EXECMAP) != 0) {
cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
}
vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
vp->v_vflag &= ~VV_MAPPED;
}
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
/*
* Deactivate the vnode, but preserve our reference across
* the call to VOP_INACTIVE().
*
* If VOP_INACTIVE() indicates that the file has been
* deleted, then recycle the vnode.
*
* Note that VOP_INACTIVE() will not drop the vnode lock.
*/
mutex_exit(vp->v_interlock);
recycle = false;
VOP_INACTIVE(vp, &recycle);
if (!recycle) {
lktype = LK_NONE;
VOP_UNLOCK(vp);
}
mutex_enter(vp->v_interlock);
/*
* Block new references then check again to see if a
* new reference was acquired in the meantime. If
* it was, restore the vnode state and try again.
*/
if (recycle) {
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
use = atomic_load_relaxed(&vp->v_usecount);
if ((use & VUSECOUNT_VGET) != 0) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
goto retry;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
}
/*
* Recycle the vnode if the file is now unused (unlinked).
*/
if (recycle) {
VSTATE_ASSERT(vp, VS_BLOCKED);
KASSERT(lktype == LK_EXCLUSIVE);
/* vcache_reclaim drops the lock. */
lktype = LK_NONE;
vcache_reclaim(vp);
}
KASSERT(vrefcnt(vp) > 0);
KASSERT(lktype == LK_NONE);
out:
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
(use & VUSECOUNT_MASK) == 1)) {
/* Gained and released another reference, retry. */
goto retry;
}
next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
if (__predict_true(next == use)) {
if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
/* Gained another reference. */
mutex_exit(vp->v_interlock);
return;
}
break;
}
}
membar_acquire();
if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
/*
* It's clean so destroy it. It isn't referenced
* anywhere since it has been reclaimed.
*/
vcache_free(VNODE_TO_VIMPL(vp));
} else {
/*
* Otherwise, put it back onto the freelist. It
* can't be destroyed while still associated with
* a file system.
*/
lru_requeue(vp, lru_which(vp));
mutex_exit(vp->v_interlock);
}
}
void
vrele(vnode_t *vp)
{
if (vtryrele(vp)) {
return;
}
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_NONE);
}
/*
* Asynchronous vnode release: the vnode is released in a different context.
*/
void
vrele_async(vnode_t *vp)
{
if (vtryrele(vp)) {
return;
}
mutex_enter(vp->v_interlock);
vrelel(vp, VRELEL_ASYNC, LK_NONE);
}
/*
* Vnode reference, where a reference is already held by some other
* object (for example, a file structure).
*
* NB: lockless code sequences may rely on this not blocking.
*/
void
vref(vnode_t *vp)
{
KASSERT(vrefcnt(vp) > 0);
atomic_inc_uint(&vp->v_usecount);
}
/*
* Page or buffer structure gets a reference.
* Called with v_interlock held.
*/
void
vholdl(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
lru_requeue(vp, lru_which(vp));
}
/*
* Page or buffer structure gets a reference.
*/
void
vhold(vnode_t *vp)
{
mutex_enter(vp->v_interlock);
vholdl(vp);
mutex_exit(vp->v_interlock);
}
/*
* Page or buffer structure frees a reference.
* Called with v_interlock held.
*/
void
holdrelel(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt <= 0) {
vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
}
vp->v_holdcnt--;
if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
lru_requeue(vp, lru_which(vp));
}
/*
* Page or buffer structure frees a reference.
*/
void
holdrele(vnode_t *vp)
{
mutex_enter(vp->v_interlock);
holdrelel(vp);
mutex_exit(vp->v_interlock);
}
/*
* Recycle an unused vnode if caller holds the last reference.
*/
bool
vrecycle(vnode_t *vp)
{
int error __diagused;
mutex_enter(vp->v_interlock);
/* If the vnode is already clean we're done. */
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) != VS_LOADED) {
VSTATE_ASSERT(vp, VS_RECLAIMED);
vrelel(vp, 0, LK_NONE);
return true;
}
/* Prevent further references until the vnode is locked. */
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
/* Make sure we hold the last reference. */
if (vrefcnt(vp) != 1) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
mutex_exit(vp->v_interlock);
return false;
}
mutex_exit(vp->v_interlock);
/*
* On a leaf file system this lock will always succeed as we hold
* the last reference and prevent further references.
* On layered file systems waiting for the lock would open a can of
* deadlocks as the lower vnodes may have other active references.
*/
error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
mutex_enter(vp->v_interlock);
if (error) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
mutex_exit(vp->v_interlock);
return false;
}
KASSERT(vrefcnt(vp) == 1);
vcache_reclaim(vp);
vrelel(vp, 0, LK_NONE);
return true;
}
/*
* Helper for vrevoke() to propagate suspension from lastmp
* to thismp. Both args may be NULL.
* Returns the currently suspended file system or NULL.
*/
static struct mount *
vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
{
int error;
if (lastmp == thismp)
return thismp;
if (lastmp != NULL)
vfs_resume(lastmp);
if (thismp == NULL)
return NULL;
do {
error = vfs_suspend(thismp, 0);
} while (error == EINTR || error == ERESTART);
if (error == 0)
return thismp;
KASSERT(error == EOPNOTSUPP || error == ENOENT);
return NULL;
}
/*
* Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
void
vrevoke(vnode_t *vp)
{
struct mount *mp;
vnode_t *vq;
enum vtype type;
dev_t dev;
KASSERT(vrefcnt(vp) > 0);
mp = vrevoke_suspend_next(NULL, vp->v_mount);
mutex_enter(vp->v_interlock);
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_RECLAIMED) {
mutex_exit(vp->v_interlock);
} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
atomic_inc_uint(&vp->v_usecount);
mutex_exit(vp->v_interlock);
vgone(vp);
} else {
dev = vp->v_rdev;
type = vp->v_type;
mutex_exit(vp->v_interlock);
while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
== 0) {
mp = vrevoke_suspend_next(mp, vq->v_mount);
vgone(vq);
}
}
vrevoke_suspend_next(mp, NULL);
}
/*
* Eliminate all activity associated with a vnode in preparation for
* reuse. Drops a reference from the vnode.
*/
void
vgone(vnode_t *vp)
{
int lktype;
KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
lktype = LK_EXCLUSIVE;
mutex_enter(vp->v_interlock);
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_LOADED) {
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
vcache_reclaim(vp);
lktype = LK_NONE;
}
VSTATE_ASSERT(vp, VS_RECLAIMED);
vrelel(vp, 0, lktype);
}
static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
uint32_t hash = HASH32_BUF_INIT;
KASSERT(key->vk_key_len > 0);
hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
return hash;
}
static int
vcache_stats(struct hashstat_sysctl *hs, bool fill)
{
vnode_impl_t *vip;
uint64_t chain;
strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = vcache_hashmask + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
mutex_enter(&vcache_lock);
SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
chain++;
}
mutex_exit(&vcache_lock);
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
preempt_point();
}
return 0;
}
static void
vcache_init(void)
{
vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
KASSERT(vcache_pool != NULL);
mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&vcache_cv, "vcache");
vcache_hashsize = desiredvnodes;
vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
&vcache_hashmask);
hashstat_register("vcache", vcache_stats);
}
static void
vcache_reinit(void)
{
int i;
uint32_t hash;
u_long oldmask, newmask;
struct hashhead *oldtab, *newtab;
vnode_impl_t *vip;
newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
mutex_enter(&vcache_lock);
oldtab = vcache_hashtab;
oldmask = vcache_hashmask;
vcache_hashsize = desiredvnodes;
vcache_hashtab = newtab;
vcache_hashmask = newmask;
for (i = 0; i <= oldmask; i++) {
while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
hash = vcache_hash(&vip->vi_key);
SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
vip, vi_hash);
}
}
mutex_exit(&vcache_lock);
hashdone(oldtab, HASH_SLIST, oldmask);
}
static inline vnode_impl_t *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
struct hashhead *hashp;
vnode_impl_t *vip;
KASSERT(mutex_owned(&vcache_lock));
hashp = &vcache_hashtab[hash & vcache_hashmask];
SLIST_FOREACH(vip, hashp, vi_hash) {
if (key->vk_mount != vip->vi_key.vk_mount)
continue;
if (key->vk_key_len != vip->vi_key.vk_key_len)
continue;
if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
continue;
return vip;
}
return NULL;
}
/*
* Allocate a new, uninitialized vcache node.
*/
static vnode_impl_t *
vcache_alloc(void)
{
vnode_impl_t *vip;
vnode_t *vp;
vip = pool_cache_get(vcache_pool, PR_WAITOK);
vp = VIMPL_TO_VNODE(vip);
memset(vip, 0, sizeof(*vip));
rw_init(&vip->vi_lock);
vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
klist_init(&vip->vi_klist.vk_klist);
vp->v_klist = &vip->vi_klist;
cv_init(&vp->v_cv, "vnode");
cache_vnode_init(vp);
vp->v_usecount = 1;
vp->v_type = VNON;
vp->v_size = vp->v_writesize = VSIZENOTSET;
vip->vi_state = VS_LOADING;
lru_requeue(vp, &lru_list[LRU_FREE]);
return vip;
}
/*
* Deallocate a vcache node in state VS_LOADING.
*
* vcache_lock held on entry and released on return.
*/
static void
vcache_dealloc(vnode_impl_t *vip)
{
vnode_t *vp;
KASSERT(mutex_owned(&vcache_lock));
vp = VIMPL_TO_VNODE(vip);
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
mutex_enter(vp->v_interlock);
vp->v_op = dead_vnodeop_p;
VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
mutex_exit(&vcache_lock);
vrelel(vp, 0, LK_NONE);
}
/*
* Free an unused, unreferenced vcache node.
* v_interlock locked on entry.
*/
static void
vcache_free(vnode_impl_t *vip)
{
vnode_t *vp;
vp = VIMPL_TO_VNODE(vip);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(vrefcnt(vp) == 0);
KASSERT(vp->v_holdcnt == 0);
KASSERT(vp->v_writecount == 0);
lru_requeue(vp, NULL);
mutex_exit(vp->v_interlock);
vfs_insmntque(vp, NULL);
if (vp->v_type == VBLK || vp->v_type == VCHR)
spec_node_destroy(vp);
mutex_obj_free(vp->v_interlock);
rw_destroy(&vip->vi_lock);
uvm_obj_destroy(&vp->v_uobj, true);
KASSERT(vp->v_klist == &vip->vi_klist);
klist_fini(&vip->vi_klist.vk_klist);
cv_destroy(&vp->v_cv);
cache_vnode_fini(vp);
pool_cache_put(vcache_pool, vip);
}
/*
* Try to get an initial reference on this cached vnode.
* Returns zero on success or EBUSY if the vnode state is not LOADED.
*
* NB: lockless code sequences may rely on this not blocking.
*/
int
vcache_tryvget(vnode_t *vp)
{
u_int use, next;
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
return EBUSY;
}
next = atomic_cas_uint(&vp->v_usecount,
use, (use + 1) | VUSECOUNT_VGET);
if (__predict_true(next == use)) {
membar_acquire();
return 0;
}
}
}
/*
* Try to get an initial reference on this cached vnode.
* Returns zero on success and ENOENT if the vnode has been reclaimed.
* Will wait for the vnode state to be stable.
*
* v_interlock locked on entry and unlocked on exit.
*/
int
vcache_vget(vnode_t *vp)
{
int error;
KASSERT(mutex_owned(vp->v_interlock));
/* Increment hold count to prevent vnode from disappearing. */
vp->v_holdcnt++;
VSTATE_WAIT_STABLE(vp);
vp->v_holdcnt--;
/* If this was the last reference to a reclaimed vnode free it now. */
if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
vcache_free(VNODE_TO_VIMPL(vp));
else
mutex_exit(vp->v_interlock);
return ENOENT;
}
VSTATE_ASSERT(vp, VS_LOADED);
error = vcache_tryvget(vp);
KASSERT(error == 0);
mutex_exit(vp->v_interlock);
return 0;
}
/*
* Get a vnode / fs node pair by key and return it referenced through vpp.
*/
int
vcache_get(struct mount *mp, const void *key, size_t key_len,
struct vnode **vpp)
{
int error;
uint32_t hash;
const void *new_key;
struct vnode *vp;
struct vcache_key vcache_key;
vnode_impl_t *vip, *new_vip;
new_key = NULL;
*vpp = NULL;
vcache_key.vk_mount = mp;
vcache_key.vk_key = key;
vcache_key.vk_key_len = key_len;
hash = vcache_hash(&vcache_key);
again:
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&vcache_key, hash);
/* If found, take a reference or retry. */
if (__predict_true(vip != NULL)) {
/*
* If the vnode is loading we cannot take the v_interlock
* here as it might change during load (see uvm_obj_setlock()).
* As changing state from VS_LOADING requires both vcache_lock
* and v_interlock it is safe to test with vcache_lock held.
*
* Wait for vnodes changing state from VS_LOADING and retry.
*/
if (__predict_false(vip->vi_state == VS_LOADING)) {
cv_wait(&vcache_cv, &vcache_lock);
mutex_exit(&vcache_lock);
goto again;
}
vp = VIMPL_TO_VNODE(vip);
mutex_enter(vp->v_interlock);
mutex_exit(&vcache_lock);
error = vcache_vget(vp);
if (error == ENOENT)
goto again;
if (error == 0)
*vpp = vp;
KASSERT((error != 0) == (*vpp == NULL));
return error;
}
mutex_exit(&vcache_lock);
/* Allocate and initialize a new vcache / vnode pair. */
error = vfs_busy(mp);
if (error)
return error;
new_vip = vcache_alloc();
new_vip->vi_key = vcache_key;
vp = VIMPL_TO_VNODE(new_vip);
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&vcache_key, hash);
if (vip == NULL) {
SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
new_vip, vi_hash);
vip = new_vip;
}
/* If another thread beat us inserting this node, retry. */
if (vip != new_vip) {
vcache_dealloc(new_vip);
vfs_unbusy(mp);
goto again;
}
mutex_exit(&vcache_lock);
/* Load the fs node. Exclusive as new_node is VS_LOADING. */
error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
if (error) {
mutex_enter(&vcache_lock);
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
new_vip, vnode_impl, vi_hash);
vcache_dealloc(new_vip);
vfs_unbusy(mp);
KASSERT(*vpp == NULL);
return error;
}
KASSERT(new_key != NULL);
KASSERT(memcmp(key, new_key, key_len) == 0);
KASSERT(vp->v_op != NULL);
vfs_insmntque(vp, mp);
if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
vp->v_vflag |= VV_MPSAFE;
vfs_ref(mp);
vfs_unbusy(mp);
/* Finished loading, finalize node. */
mutex_enter(&vcache_lock);
new_vip->vi_key.vk_key = new_key;
mutex_enter(vp->v_interlock);
VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
mutex_exit(vp->v_interlock);
mutex_exit(&vcache_lock);
*vpp = vp;
return 0;
}
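/*
* Sketch (not compiled) of the contract a file system's VFS_LOADVNODE()
* implementation has to honour for vcache_get() above: associate the fs
* node, set v_op/v_data/v_type, and return a key pointer that stays valid
* for the life of the vnode (typically embedded in the fs node). All
* names starting with "myfs_" and the node layout are hypothetical.
*/
#if 0
static int
myfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct myfs_node *np;
ino_t ino;
KASSERT(key_len == sizeof(ino));
memcpy(&ino, key, key_len);
np = myfs_read_inode(mp, ino); /* hypothetical on-disk read */
if (np == NULL)
return ENOENT;
np->n_ino = ino;
vp->v_op = myfs_vnodeop_p; /* hypothetical operations vector */
vp->v_data = np;
vp->v_type = VREG; /* from the on-disk metadata */
uvm_vnp_setsize(vp, np->n_size);
/* Key storage must outlive the vnode; point at the fs node. */
*new_key = &np->n_ino;
return 0;
}
#endif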
/*
* Create a new vnode / fs node pair and return it referenced through vpp.
*/
int
vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
kauth_cred_t cred, void *extra, struct vnode **vpp)
{
int error;
uint32_t hash;
struct vnode *vp, *ovp;
vnode_impl_t *vip, *ovip;
*vpp = NULL;
/* Allocate and initialize a new vcache / vnode pair. */
error = vfs_busy(mp);
if (error)
return error;
vip = vcache_alloc();
vip->vi_key.vk_mount = mp;
vp = VIMPL_TO_VNODE(vip);
/* Create and load the fs node. */
error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
&vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
if (error) {
mutex_enter(&vcache_lock);
vcache_dealloc(vip);
vfs_unbusy(mp);
KASSERT(*vpp == NULL);
return error;
}
KASSERT(vp->v_op != NULL);
KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
if (vip->vi_key.vk_key_len > 0) {
KASSERT(vip->vi_key.vk_key != NULL);
hash = vcache_hash(&vip->vi_key);
/*
* Wait for previous instance to be reclaimed,
* then insert new node.
*/
mutex_enter(&vcache_lock);
while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
ovp = VIMPL_TO_VNODE(ovip);
mutex_enter(ovp->v_interlock);
mutex_exit(&vcache_lock);
error = vcache_vget(ovp);
KASSERT(error == ENOENT);
mutex_enter(&vcache_lock);
}
SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
vip, vi_hash);
mutex_exit(&vcache_lock);
}
vfs_insmntque(vp, mp);
if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
vp->v_vflag |= VV_MPSAFE;
vfs_ref(mp);
vfs_unbusy(mp);
/* Finished loading, finalize node. */
mutex_enter(&vcache_lock);
mutex_enter(vp->v_interlock);
VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
mutex_exit(&vcache_lock);
mutex_exit(vp->v_interlock);
*vpp = vp;
return 0;
}
/*
* Prepare key change: update old cache nodes key and lock new cache node.
* Return an error if the new node already exists.
*/
int
vcache_rekey_enter(struct mount *mp, struct vnode *vp,
const void *old_key, size_t old_key_len,
const void *new_key, size_t new_key_len)
{
uint32_t old_hash, new_hash;
struct vcache_key old_vcache_key, new_vcache_key;
vnode_impl_t *vip, *new_vip;
old_vcache_key.vk_mount = mp;
old_vcache_key.vk_key = old_key;
old_vcache_key.vk_key_len = old_key_len;
old_hash = vcache_hash(&old_vcache_key);
new_vcache_key.vk_mount = mp;
new_vcache_key.vk_key = new_key;
new_vcache_key.vk_key_len = new_key_len;
new_hash = vcache_hash(&new_vcache_key);
new_vip = vcache_alloc();
new_vip->vi_key = new_vcache_key;
/* Insert locked new node used as placeholder. */
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&new_vcache_key, new_hash);
if (vip != NULL) {
vcache_dealloc(new_vip);
return EEXIST;
}
SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
new_vip, vi_hash);
/* Replace old nodes key with the temporary copy. */
vip = vcache_hash_lookup(&old_vcache_key, old_hash);
KASSERT(vip != NULL);
KASSERT(VIMPL_TO_VNODE(vip) == vp);
KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
vip->vi_key = old_vcache_key;
mutex_exit(&vcache_lock);
return 0;
}
/*
* Key change complete: update old node and remove placeholder.
*/
void
vcache_rekey_exit(struct mount *mp, struct vnode *vp,
const void *old_key, size_t old_key_len,
const void *new_key, size_t new_key_len)
{
uint32_t old_hash, new_hash;
struct vcache_key old_vcache_key, new_vcache_key;
vnode_impl_t *vip, *new_vip;
struct vnode *new_vp;
old_vcache_key.vk_mount = mp;
old_vcache_key.vk_key = old_key;
old_vcache_key.vk_key_len = old_key_len;
old_hash = vcache_hash(&old_vcache_key);
new_vcache_key.vk_mount = mp;
new_vcache_key.vk_key = new_key;
new_vcache_key.vk_key_len = new_key_len;
new_hash = vcache_hash(&new_vcache_key);
mutex_enter(&vcache_lock);
/* Lookup old and new node. */
vip = vcache_hash_lookup(&old_vcache_key, old_hash);
KASSERT(vip != NULL);
KASSERT(VIMPL_TO_VNODE(vip) == vp);
new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
KASSERT(new_vip != NULL);
KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
new_vp = VIMPL_TO_VNODE(new_vip);
mutex_enter(new_vp->v_interlock);
VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
mutex_exit(new_vp->v_interlock);
/* Rekey old node and put it onto its new hashlist. */
vip->vi_key = new_vcache_key;
if (old_hash != new_hash) {
SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
vip, vi_hash);
}
/* Remove new node used as placeholder. */
SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
new_vip, vnode_impl, vi_hash);
vcache_dealloc(new_vip);
}
/*
* Disassociate the underlying file system from a vnode.
*
* Must be called with vnode locked and will return unlocked.
* Must be called with the interlock held, and will return with it held.
*/
static void
vcache_reclaim(vnode_t *vp)
{
lwp_t *l = curlwp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
struct mount *mp = vp->v_mount;
uint32_t hash;
uint8_t temp_buf[64], *temp_key;
size_t temp_key_len;
bool recycle;
int error;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(vrefcnt(vp) != 0);
temp_key_len = vip->vi_key.vk_key_len;
/*
* Prevent the vnode from being recycled or brought into use
* while we clean it out.
*/
VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
/*
* Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
* because VOP_RECLAIM() could cause vp->v_klist to
* become invalid. Don't check for interest in NOTE_REVOKE
* here; it's always posted because it sets EV_EOF.
*
* Once it's been posted, reset vp->v_klist to point to
* our own local storage, in case we were sharing with
* someone else.
*/
KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
vp->v_klist = &vip->vi_klist;
mutex_exit(vp->v_interlock);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
mutex_enter(vp->v_interlock);
if ((vp->v_iflag & VI_EXECMAP) != 0) {
cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
}
vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
mutex_exit(vp->v_interlock);
rw_exit(vp->v_uobj.vmobjlock);
/*
* With vnode state set to reclaiming, purge name cache immediately
* to prevent new handles on vnode, and wait for existing threads
* trying to get a handle to notice VS_RECLAIMED status and abort.
*/
cache_purge(vp);
/* Replace the vnode key with a temporary copy. */
if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
} else {
temp_key = temp_buf;
}
if (vip->vi_key.vk_key_len > 0) {
mutex_enter(&vcache_lock);
memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
vip->vi_key.vk_key = temp_key;
mutex_exit(&vcache_lock);
}
fstrans_start(mp);
/*
* Clean out any cached data associated with the vnode.
*/
error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
if (error != 0) {
if (wapbl_vphaswapbl(vp))
WAPBL_DISCARD(wapbl_vptomp(vp));
error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
}
KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
if (vp->v_type == VBLK || vp->v_type == VCHR) {
spec_node_revoke(vp);
}
/*
* Disassociate the underlying file system from the vnode.
* VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
* the vnode, and may destroy the vnode so that VOP_UNLOCK
* would no longer function.
*/
VOP_INACTIVE(vp, &recycle);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
if (VOP_RECLAIM(vp)) {
vnpanic(vp, "%s: cannot reclaim", __func__);
}
KASSERT(vp->v_data == NULL);
KASSERT((vp->v_iflag & VI_PAGES) == 0);
if (vp->v_type == VREG && vp->v_ractx != NULL) {
uvm_ra_freectx(vp->v_ractx);
vp->v_ractx = NULL;
}
if (vip->vi_key.vk_key_len > 0) {
/* Remove from vnode cache. */
hash = vcache_hash(&vip->vi_key);
mutex_enter(&vcache_lock);
KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
mutex_exit(&vcache_lock);
}
if (temp_key != temp_buf)
kmem_free(temp_key, temp_key_len);
/* Done with purge, notify sleepers of the grim news. */
mutex_enter(vp->v_interlock);
vp->v_op = dead_vnodeop_p;
VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
vp->v_tag = VT_NON;
mutex_exit(vp->v_interlock);
/*
* Move to dead mount. Must be after changing the operations
* vector as vnode operations enter the mount before using the
* operations vector. See sys/kern/vnode_if.c.
*/
vp->v_vflag &= ~VV_ROOT;
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
#ifdef PAX_SEGVGUARD
pax_segvguard_cleanup(vp);
#endif /* PAX_SEGVGUARD */
mutex_enter(vp->v_interlock);
fstrans_done(mp);
KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}
/*
* Disassociate the underlying file system from an open device vnode
* and make it anonymous.
*
* Vnode unlocked on entry, drops a reference to the vnode.
*/
void
vcache_make_anon(vnode_t *vp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
uint32_t hash;
bool recycle;
KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
/* Remove from vnode cache. */
hash = vcache_hash(&vip->vi_key);
mutex_enter(&vcache_lock);
KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
vip->vi_key.vk_mount = dead_rootmount;
vip->vi_key.vk_key_len = 0;
vip->vi_key.vk_key = NULL;
mutex_exit(&vcache_lock);
/*
* Disassociate the underlying file system from the vnode.
* VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
* the vnode, and may destroy the vnode so that VOP_UNLOCK
* would no longer function.
*/
if (vn_lock(vp, LK_EXCLUSIVE)) {
vnpanic(vp, "%s: cannot lock", __func__);
}
VOP_INACTIVE(vp, &recycle);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
if (VOP_RECLAIM(vp)) {
vnpanic(vp, "%s: cannot reclaim", __func__);
}
/* Purge name cache. */
cache_purge(vp);
/* Done with purge, change operations vector. */
mutex_enter(vp->v_interlock);
vp->v_op = spec_vnodeop_p;
vp->v_vflag |= VV_MPSAFE;
mutex_exit(vp->v_interlock);
/*
* Move to dead mount. Must be after changing the operations
* vector as vnode operations enter the mount before using the
* operations vector. See sys/kern/vnode_if.c.
*/
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
vrele(vp);
}
/*
* Update outstanding I/O count and do wakeup if requested.
*/
void
vwakeup(struct buf *bp)
{
vnode_t *vp;
if ((vp = bp->b_vp) == NULL)
return;
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(bp->b_objlock));
if (--vp->v_numoutput < 0)
vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
if (vp->v_numoutput == 0)
cv_broadcast(&vp->v_cv);
}
/*
* Test a vnode for being or becoming dead. Returns one of:
* EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
* ENOENT: vnode is dead.
* 0: otherwise.
*
* Whenever this function returns a non-zero value all future
* calls will also return a non-zero value.
*/
int
vdead_check(struct vnode *vp, int flags)
{
KASSERT(mutex_owned(vp->v_interlock));
if (! ISSET(flags, VDEAD_NOWAIT))
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_RECLAIMING) {
KASSERT(ISSET(flags, VDEAD_NOWAIT));
return EBUSY;
} else if (VSTATE_GET(vp) == VS_RECLAIMED) {
return ENOENT;
}
return 0;
}
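/*
* Illustrative caller pattern (not compiled): probe a possibly dying
* vnode without blocking. v_interlock must be held in either case;
* without VDEAD_NOWAIT the check may also wait for a stable state.
*/
#if 0
mutex_enter(vp->v_interlock);
error = vdead_check(vp, VDEAD_NOWAIT);
mutex_exit(vp->v_interlock);
if (error != 0)
return error; /* EBUSY: becoming dead; ENOENT: dead */
#endif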
int
vfs_drainvnodes(void)
{
mutex_enter(&vdrain_lock);
if (!vdrain_one(desiredvnodes)) {
mutex_exit(&vdrain_lock);
return EBUSY;
}
mutex_exit(&vdrain_lock);
if (vcache_hashsize != desiredvnodes)
vcache_reinit();
return 0;
}
void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
va_list ap;
#ifdef DIAGNOSTIC
vprint(NULL, vp);
#endif
va_start(ap, fmt);
vpanic(fmt, ap);
va_end(ap);
}
void
vshareilock(vnode_t *tvp, vnode_t *fvp)
{
kmutex_t *oldlock;
oldlock = tvp->v_interlock;
mutex_obj_hold(fvp->v_interlock);
tvp->v_interlock = fvp->v_interlock;
mutex_obj_free(oldlock);
}
void
vshareklist(vnode_t *tvp, vnode_t *fvp)
{
/*
* If two vnodes share klist state, they must also share
* an interlock.
*/
KASSERT(tvp->v_interlock == fvp->v_interlock);
/*
* We make the following assumptions:
*
* ==> Some other synchronization is happening outside of
* our view to make this safe.
*
* ==> That the "to" vnode will have the necessary references
* on the "from" vnode so that the storage for the klist
* won't be yanked out from beneath us (the vnode_impl).
*
* ==> If "from" is also sharing, we then assume that "from"
* has the necessary references, and so on.
*/
tvp->v_klist = fvp->v_klist;
}
/* $NetBSD: vfs_syscalls_40.c,v 1.5 2019/01/27 02:08:39 pgoyette Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_40.c,v 1.5 2019/01/27 02:08:39 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/common/compat_mod.h>
static const struct syscall_package vfs_syscalls_40_syscalls[] = {
{ SYS_compat_40_mount, 0, (sy_call_t *)compat_40_sys_mount },
{ 0, 0, NULL },
};
int
compat_40_sys_mount(struct lwp *l, const struct compat_40_sys_mount_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) type;
syscallarg(const char *) path;
syscallarg(int) flags;
syscallarg(void *) data;
} */
register_t dummy;
return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, 0, &dummy);
}
int
vfs_syscalls_40_init(void)
{
return syscall_establish(NULL, vfs_syscalls_40_syscalls);
}
int
vfs_syscalls_40_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_40_syscalls);
}
/* $NetBSD: subr_psref.c,v 1.18 2022/02/12 16:31:06 macallan Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Passive references
*
* Passive references are references to objects that guarantee the
* object will not be destroyed until the reference is released.
*
* Passive references require no interprocessor synchronization to
* acquire or release. However, destroying the target of passive
* references requires expensive interprocessor synchronization --
* xcalls to determine on which CPUs the object is still in use.
*
* Passive references may be held only on a single CPU and by a
* single LWP. They require the caller to allocate a little stack
* space, a struct psref object. Sleeping while a passive
* reference is held is allowed, provided that the owner's LWP is
* bound to a CPU -- e.g., the owner is a softint or a bound
* kthread. However, sleeping should be kept to a short duration,
* e.g. sleeping on an adaptive lock.
*
* Passive references serve as an intermediate stage between
* reference counting and passive serialization (pserialize(9)):
*
* - If you need references to transfer from CPU to CPU or LWP to
* LWP, or if you need long-term references, you must use
* reference counting, e.g. with atomic operations or locks,
* which incurs interprocessor synchronization for every use --
* cheaper than an xcall, but not scalable.
*
* - If all users *guarantee* that they will not sleep, then it is
* not necessary to use passive references: you may as well just
* use the even cheaper pserialize(9), because you have
* satisfied the requirements of a pserialize read section.
*/
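/*
* Illustrative usage sketch (not compiled). struct frob, frob_lookup()
* and the f_target member are hypothetical; the psref(9), pserialize(9)
* and curlwp_bind()/curlwp_bindx() calls are the real API. The target is
* typically found under a pserialize read section and pinned with a
* psref before anything that might sleep.
*/
#if 0
static struct psref_class *frob_psref_class; /* hypothetical class */
/* Once, at subsystem initialization: */
frob_psref_class = psref_class_create("frob", IPL_SOFTNET);
/* Per lookup, on a CPU-bound LWP: */
struct psref psref;
struct frob *f;
int bound, s;
bound = curlwp_bind(); /* bind this LWP to its CPU */
s = pserialize_read_enter();
f = frob_lookup(key); /* hypothetical lockless lookup */
if (f != NULL)
psref_acquire(&psref, &f->f_target, frob_psref_class);
pserialize_read_exit(s);
if (f != NULL) {
/* ... may sleep briefly while the reference is held ... */
psref_release(&psref, &f->f_target, frob_psref_class);
}
curlwp_bindx(bound);
#endif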
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_psref.c,v 1.18 2022/02/12 16:31:06 macallan Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/psref.h>
#include <sys/queue.h>
#include <sys/xcall.h>
#include <sys/lwp.h>
SLIST_HEAD(psref_head, psref);
static bool _psref_held(const struct psref_target *, struct psref_class *,
bool);
/*
* struct psref_class
*
* Private global state for a class of passive reference targets.
* Opaque to callers.
*/
struct psref_class {
kmutex_t prc_lock;
kcondvar_t prc_cv;
struct percpu *prc_percpu; /* struct psref_cpu */
ipl_cookie_t prc_iplcookie;
unsigned int prc_xc_flags;
};
/*
* struct psref_cpu
*
* Private per-CPU state for a class of passive reference targets.
* Not exposed by the API.
*/
struct psref_cpu {
struct psref_head pcpu_head;
};
/*
* Data structures and functions for debugging.
*/
#ifndef PSREF_DEBUG_NITEMS
#define PSREF_DEBUG_NITEMS 16
#endif
struct psref_debug_item {
void *prdi_caller;
struct psref *prdi_psref;
};
struct psref_debug {
int prd_refs_peek;
struct psref_debug_item prd_items[PSREF_DEBUG_NITEMS];
};
#ifdef PSREF_DEBUG
static void psref_debug_acquire(struct psref *);
static void psref_debug_release(struct psref *);
static void psref_debug_lwp_free(void *);
static specificdata_key_t psref_debug_lwp_key;
#endif
/*
* psref_init()
*/
void
psref_init(void)
{
#ifdef PSREF_DEBUG
lwp_specific_key_create(&psref_debug_lwp_key, psref_debug_lwp_free);
#endif
}
/*
* psref_class_create(name, ipl)
*
* Create a new passive reference class, with the given wchan name
* and ipl.
*/
struct psref_class *
psref_class_create(const char *name, int ipl)
{
struct psref_class *class;
ASSERT_SLEEPABLE();
class = kmem_alloc(sizeof(*class), KM_SLEEP);
class->prc_percpu = percpu_alloc(sizeof(struct psref_cpu));
mutex_init(&class->prc_lock, MUTEX_DEFAULT, ipl);
cv_init(&class->prc_cv, name);
class->prc_iplcookie = makeiplcookie(ipl);
class->prc_xc_flags = XC_HIGHPRI_IPL(ipl);
return class;
}
static void __diagused
psref_cpu_drained_p(void *p, void *cookie, struct cpu_info *ci __unused)
{
const struct psref_cpu *pcpu = p;
bool *retp = cookie;
if (!SLIST_EMPTY(&pcpu->pcpu_head))
*retp = false;
}
static bool __diagused
psref_class_drained_p(const struct psref_class *prc)
{
bool ret = true;
percpu_foreach(prc->prc_percpu, &psref_cpu_drained_p, &ret);
return ret;
}
/*
* psref_class_destroy(class)
*
* Destroy a passive reference class and free memory associated
* with it. All targets in this class must have been drained and
* destroyed already.
*/
void
psref_class_destroy(struct psref_class *class)
{
KASSERT(psref_class_drained_p(class));
cv_destroy(&class->prc_cv);
mutex_destroy(&class->prc_lock);
percpu_free(class->prc_percpu, sizeof(struct psref_cpu));
kmem_free(class, sizeof(*class));
}
/*
* psref_target_init(target, class)
*
* Initialize a passive reference target in the specified class.
* The caller is responsible for issuing a membar_producer after
* psref_target_init and before exposing a pointer to the target
* to other CPUs.
*/
void
psref_target_init(struct psref_target *target,
struct psref_class *class)
{
target->prt_class = class;
target->prt_draining = false;
}
#ifdef DEBUG
static bool
psref_exist(struct psref_cpu *pcpu, struct psref *psref)
{
struct psref *_psref;
SLIST_FOREACH(_psref, &pcpu->pcpu_head, psref_entry) {
if (_psref == psref)
return true;
}
return false;
}
static void
psref_check_duplication(struct psref_cpu *pcpu, struct psref *psref,
const struct psref_target *target)
{
bool found = false;
found = psref_exist(pcpu, psref);
if (found) {
panic("The psref is already in the list (acquiring twice?): "
"psref=%p target=%p", psref, target);
}
}
static void
psref_check_existence(struct psref_cpu *pcpu, struct psref *psref,
const struct psref_target *target)
{
bool found = false;
found = psref_exist(pcpu, psref);
if (!found) {
panic("The psref isn't in the list (releasing unused psref?): "
"psref=%p target=%p", psref, target);
}
}
#endif /* DEBUG */
/*
* psref_acquire(psref, target, class)
*
* Acquire a passive reference to the specified target, which must
* be in the specified class.
*
* The caller must guarantee that the target will not be destroyed
* before psref_acquire returns.
*
* The caller must additionally guarantee that it will not switch
* CPUs before releasing the passive reference, either by
* disabling kpreemption and avoiding sleeps, or by being in a
* softint or in an LWP bound to a CPU.
*/
void
psref_acquire(struct psref *psref, const struct psref_target *target,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p",
target);
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
#ifdef DEBUG
/* Sanity-check that this psref does not already hold the target. */
psref_check_duplication(pcpu, psref, target);
#endif
/* Record our reference. */
SLIST_INSERT_HEAD(&pcpu->pcpu_head, psref, psref_entry);
psref->psref_target = target;
psref->psref_lwp = curlwp;
psref->psref_cpu = curcpu();
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
curlwp->l_psrefs++;
#endif
#ifdef PSREF_DEBUG
psref_debug_acquire(psref);
#endif
}
/*
* psref_release(psref, target, class)
*
* Release a passive reference to the specified target, which must
* be in the specified class.
*
* The caller must not have switched CPUs or LWPs since acquiring
* the passive reference.
*/
void
psref_release(struct psref *psref, const struct psref_target *target,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Make sure the psref looks sensible. */
KASSERTMSG((psref->psref_target == target),
"passive reference target mismatch: %p (ref) != %p (expected)",
psref->psref_target, target);
KASSERTMSG((psref->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
psref->psref_lwp, curlwp);
KASSERTMSG((psref->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(psref->psref_cpu), cpu_index(curcpu()));
/*
* Block interrupts and remove the psref from the current CPU's
* list. No need to percpu_getref or get the head of the list,
* and the caller guarantees that we are bound to a CPU anyway
* (as does blocking interrupts).
*/
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
#ifdef DEBUG
/* Sanity-check that this psref was actually acquired earlier. */
psref_check_existence(pcpu, psref, target);
#endif
SLIST_REMOVE(&pcpu->pcpu_head, psref, psref, psref_entry);
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
KASSERT(curlwp->l_psrefs > 0);
curlwp->l_psrefs--;
#endif
#ifdef PSREF_DEBUG
psref_debug_release(psref);
#endif
/* If someone is waiting for users to drain, notify 'em. */
if (__predict_false(target->prt_draining))
cv_broadcast(&class->prc_cv);
}
/*
* psref_copy(pto, pfrom, class)
*
* Copy a passive reference from pfrom, which must be in the
* specified class, to pto. Both pfrom and pto must later be
* released with psref_release.
*
* The caller must not have switched CPUs or LWPs since acquiring
* pfrom, and must not switch CPUs or LWPs before releasing both
* pfrom and pto.
*/
void
psref_copy(struct psref *pto, const struct psref *pfrom,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((pto != pfrom),
"can't copy passive reference to itself: %p",
pto);
/* Make sure the pfrom reference looks sensible. */
KASSERTMSG((pfrom->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
pfrom->psref_lwp, curlwp);
KASSERTMSG((pfrom->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(pfrom->psref_cpu), cpu_index(curcpu()));
KASSERTMSG((pfrom->psref_target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
pfrom->psref_target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
/* Record the new reference. */
SLIST_INSERT_HEAD(&pcpu->pcpu_head, pto, psref_entry);
pto->psref_target = pfrom->psref_target;
pto->psref_lwp = curlwp;
pto->psref_cpu = curcpu();
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
curlwp->l_psrefs++;
#endif
}
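/*
 * Copy sketch (hypothetical names, continuing the frobbex example
 * above): duplicate a held reference so two code paths on this bound
 * LWP can release it independently:
 *
 *	struct psref second;
 *
 *	psref_copy(&second, &psref, frobbex_psref_class);
 *	... both &psref and &second must later be passed to
 *	    psref_release by this LWP, on this CPU ...
 */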
/*
* struct psreffed
*
* Global state for draining a psref target.
*/
struct psreffed {
struct psref_class *class;
struct psref_target *target;
bool ret;
};
static void
psreffed_p_xc(void *cookie0, void *cookie1 __unused)
{
struct psreffed *P = cookie0;
/*
* If we hold a psref to the target, then answer true.
*
* This is the only dynamic decision that may be made with
* psref_held.
*
* No need to lock anything here: every write transitions from
* false to true, so there can be no conflicting writes. No
* need for a memory barrier here because P->ret is read only
* after xc_wait, which has already issued any necessary memory
* barriers.
*/
if (_psref_held(P->target, P->class, true))
P->ret = true;
}
static bool
psreffed_p(struct psref_target *target, struct psref_class *class)
{
struct psreffed P = {
.class = class,
.target = target,
.ret = false,
};
if (__predict_true(mp_online)) {
/*
* Ask all CPUs to say whether they hold a psref to the
* target.
*/
xc_wait(xc_broadcast(class->prc_xc_flags, &psreffed_p_xc, &P,
NULL));
} else
psreffed_p_xc(&P, NULL);
return P.ret;
}
/*
* psref_target_destroy(target, class)
*
* Destroy a passive reference target. Waits for all existing
* references to drain. Caller must guarantee no new references
* will be acquired once it calls psref_target_destroy, e.g. by
* removing the target from a global list first. May sleep.
*/
void
psref_target_destroy(struct psref_target *target, struct psref_class *class)
{
ASSERT_SLEEPABLE();
KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p",
target);
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Request psref_release to notify us when done. */
target->prt_draining = true;
/* Wait until there are no more references on any CPU. */
while (psreffed_p(target, class)) {
/*
* This enter/wait/exit business looks wrong, but it is
* both necessary, because psreffed_p performs a
* low-priority xcall and hence cannot run while a
* mutex is locked, and OK, because the wait is timed
* -- explicit wakeups are only an optimization.
*/
mutex_enter(&class->prc_lock);
(void)cv_timedwait(&class->prc_cv, &class->prc_lock, 1);
mutex_exit(&class->prc_lock);
}
/* No more references. Cause subsequent psref_acquire to kassert. */
target->prt_class = NULL;
}
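/*
 * Teardown sketch for the rule above (the "frobbex" target is
 * hypothetical): first make sure no new lookup can reach fx, e.g. by
 * unlinking it from its lookup structure and waiting out readers with
 * pserialize(9), then drain the passive references and free it:
 *
 *	psref_target_destroy(&fx->fx_psref, frobbex_psref_class);
 *	kmem_free(fx, sizeof(*fx));
 */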
static bool
_psref_held(const struct psref_target *target, struct psref_class *class,
bool lwp_mismatch_ok)
{
const struct psref_cpu *pcpu;
const struct psref *psref;
int s;
bool held = false;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
/* Search through all the references on this CPU. */
SLIST_FOREACH(psref, &pcpu->pcpu_head, psref_entry) {
/* Sanity-check the reference's CPU. */
KASSERTMSG((psref->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(psref->psref_cpu), cpu_index(curcpu()));
/* If it doesn't match, skip it and move on. */
if (psref->psref_target != target)
continue;
/*
* Sanity-check the reference's LWP if we are asserting
* via psref_held that this LWP holds it, but not if we
* are testing in psref_target_destroy whether any LWP
* still holds it.
*/
KASSERTMSG((lwp_mismatch_ok || psref->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
psref->psref_lwp, curlwp);
/* Stop here and report that we found it. */
held = true;
break;
}
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
return held;
}
/*
* psref_held(target, class)
*
* True if the current CPU holds a passive reference to target,
* false otherwise. May be used only inside assertions.
*/
bool
psref_held(const struct psref_target *target, struct psref_class *class)
{
return _psref_held(target, class, false);
}
#ifdef PSREF_DEBUG
void
psref_debug_init_lwp(struct lwp *l)
{
struct psref_debug *prd;
prd = kmem_zalloc(sizeof(*prd), KM_SLEEP);
lwp_setspecific_by_lwp(l, psref_debug_lwp_key, prd);
}
static void
psref_debug_lwp_free(void *arg)
{
struct psref_debug *prd = arg;
kmem_free(prd, sizeof(*prd));
}
static void
psref_debug_acquire(struct psref *psref)
{
struct psref_debug *prd;
struct lwp *l = curlwp;
int s, i;
prd = lwp_getspecific(psref_debug_lwp_key);
if (__predict_false(prd == NULL)) {
psref->psref_debug = NULL;
return;
}
s = splserial();
if (l->l_psrefs > prd->prd_refs_peek) {
prd->prd_refs_peek = l->l_psrefs;
if (__predict_false(prd->prd_refs_peek > PSREF_DEBUG_NITEMS))
panic("exceeded PSREF_DEBUG_NITEMS");
}
for (i = 0; i < prd->prd_refs_peek; i++) {
struct psref_debug_item *prdi = &prd->prd_items[i];
if (prdi->prdi_psref != NULL)
continue;
prdi->prdi_caller = psref->psref_debug;
prdi->prdi_psref = psref;
psref->psref_debug = prdi;
break;
}
if (__predict_false(i == prd->prd_refs_peek))
panic("out of range: %d", i);
splx(s);
}
static void
psref_debug_release(struct psref *psref)
{
int s;
s = splserial();
if (__predict_true(psref->psref_debug != NULL)) {
struct psref_debug_item *prdi = psref->psref_debug;
prdi->prdi_psref = NULL;
}
splx(s);
}
void
psref_debug_barrier(void)
{
struct psref_debug *prd;
struct lwp *l = curlwp;
int s, i;
prd = lwp_getspecific(psref_debug_lwp_key);
if (__predict_false(prd == NULL))
return;
s = splserial();
for (i = 0; i < prd->prd_refs_peek; i++) {
struct psref_debug_item *prdi = &prd->prd_items[i];
if (__predict_true(prdi->prdi_psref == NULL))
continue;
panic("psref leaked: lwp(%p) acquired at %p", l, prdi->prdi_caller);
}
prd->prd_refs_peek = 0; /* Reset the counter */
splx(s);
}
#endif /* PSREF_DEBUG */
/* $NetBSD: kern_50.c,v 1.3 2020/01/29 15:47:51 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_50.c,v 1.3 2020/01/29 15:47:51 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/sys/resource.h>
#include <compat/sys/time.h>
#include <compat/common/compat_mod.h>
static const struct syscall_package kern_50_syscalls[] = {
{ SYS_compat_50__lwp_park, 0, (sy_call_t *)compat_50_sys__lwp_park },
{ SYS_compat_50___sigtimedwait, 0,
(sy_call_t *)compat_50_sys___sigtimedwait },
{ SYS_compat_50_wait4, 0, (sy_call_t *)compat_50_sys_wait4 },
{ 0, 0, NULL }
};
int
compat_50_sys__lwp_park(struct lwp *l,
const struct compat_50_sys__lwp_park_args *uap, register_t *retval)
{
/* {
syscallarg(const struct timespec50 *) ts;
syscallarg(lwpid_t) unpark;
syscallarg(const void *) hint;
syscallarg(const void *) unparkhint;
} */
struct timespec ts, *tsp;
struct timespec50 ts50;
int error;
if (SCARG(uap, ts) == NULL)
tsp = NULL;
else {
error = copyin(SCARG(uap, ts), &ts50, sizeof(ts50));
if (error != 0)
return error;
timespec50_to_timespec(&ts50, &ts);
tsp = &ts;
}
if (SCARG(uap, unpark) != 0) {
error = lwp_unpark(&SCARG(uap, unpark), 1);
if (error != 0)
return error;
}
return lwp_park(CLOCK_REALTIME, TIMER_ABSTIME, tsp);
}
static int
tscopyin(const void *u, void *s, size_t len)
{
struct timespec50 ts50;
int error;
KASSERT(len == sizeof(struct timespec));
error = copyin(u, &ts50, sizeof(ts50));
if (error)
return error;
timespec50_to_timespec(&ts50, s);
return 0;
}
static int
tscopyout(const void *s, void *u, size_t len)
{
struct timespec50 ts50;
KASSERT(len == sizeof(struct timespec));
timespec_to_timespec50(s, &ts50);
return copyout(&ts50, u, sizeof(ts50));
}
int
compat_50_sys___sigtimedwait(struct lwp *l,
const struct compat_50_sys___sigtimedwait_args *uap, register_t *retval)
{
int res;
res = sigtimedwait1(l,
(const struct sys_____sigtimedwait50_args *)uap, retval, copyin,
copyout, tscopyin, tscopyout);
if (!res)
*retval = 0; /* XXX NetBSD<=5 was not POSIX compliant */
return res;
}
int
compat_50_sys_wait4(struct lwp *l, const struct compat_50_sys_wait4_args *uap,
register_t *retval)
{
/* {
syscallarg(int) pid;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct rusage50 *) rusage;
} */
int status, error, pid = SCARG(uap, pid);
struct rusage50 ru50;
struct rusage ru;
error = do_sys_wait(&pid, &status, SCARG(uap, options),
SCARG(uap, rusage) != NULL ? &ru : NULL);
retval[0] = pid;
if (pid == 0)
return error;
if (SCARG(uap, rusage)) {
rusage_to_rusage50(&ru, &ru50);
error = copyout(&ru50, SCARG(uap, rusage), sizeof(ru50));
}
if (error == 0 && SCARG(uap, status))
error = copyout(&status, SCARG(uap, status), sizeof(status));
return error;
}
int
kern_50_init(void)
{
return syscall_establish(NULL, kern_50_syscalls);
}
int
kern_50_fini(void)
{
return syscall_disestablish(NULL, kern_50_syscalls);
}
/* $NetBSD: subr_disk.c,v 1.137 2023/05/09 12:04:04 riastradh Exp $ */
/*-
* Copyright (c) 1996, 1997, 1999, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.137 2023/05/09 12:04:04 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/sysctl.h>
#include <lib/libkern/libkern.h>
/*
* Disk error is the preface to plaintive error messages
* about failing disk transfers. It prints messages of the form
hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
* if the offset of the error in the transfer and a disk label
* are both available. blkdone should be -1 if the position of the error
* is unknown; the disklabel pointer may be null from drivers that have not
* been converted to use them. The message is printed with printf
* if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
* The message should be completed (with at least a newline) with printf
* or addlog, respectively. There is no trailing space.
*/
#ifndef PRIdaddr
#define PRIdaddr PRId64
#endif
void
diskerr(const struct buf *bp, const char *dname, const char *what, int pri,
int blkdone, const struct disklabel *lp)
{
int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
void (*pr)(const char *, ...) __printflike(1, 2);
char partname = 'a' + part;
daddr_t sn;
if (/*CONSTCOND*/0)
/* The compiler will flag an error here if the format is wrong... */
printf("%" PRIdaddr, bp->b_blkno);
if (pri != LOG_PRINTF) {
static const char fmt[] = "";
log(pri, fmt);
pr = addlog;
} else
pr = printf;
(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
bp->b_flags & B_READ ? "read" : "writ");
sn = bp->b_blkno;
if (bp->b_bcount <= DEV_BSIZE)
(*pr)("%" PRIdaddr, sn);
else {
if (blkdone >= 0) {
sn += blkdone;
(*pr)("%" PRIdaddr " of ", sn);
}
(*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno,
bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
}
if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
sn += lp->d_partitions[part].p_offset;
(*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "",
dname, unit, sn, sn / lp->d_secpercyl);
sn %= lp->d_secpercyl;
(*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")",
sn / lp->d_nsectors, sn % lp->d_nsectors);
}
}
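/*
 * Calling sketch for the convention described above diskerr; the "xd"
 * driver and its softc "sc" are hypothetical, and -1 says the failing
 * block within the transfer is unknown.  The caller finishes the line:
 *
 *	diskerr(bp, "xd", "hard error", LOG_PRINTF, -1, sc->sc_dk.dk_label);
 *	printf("\n");
 */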
/*
* Searches the iostatlist for the disk corresponding to the
* name provided.
*/
struct disk *
disk_find(const char *name)
{
struct io_stats *stat;
stat = iostat_find(name);
if ((stat != NULL) && (stat->io_type == IOSTAT_DISK))
return stat->io_parent;
return (NULL);
}
void
disk_init(struct disk *diskp, const char *name, const struct dkdriver *driver)
{
u_int blocksize = DEV_BSIZE;
/*
* Initialize the wedge-related locks and other fields.
*/
mutex_init(&diskp->dk_rawlock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&diskp->dk_openlock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&diskp->dk_wedges);
diskp->dk_nwedges = 0;
diskp->dk_labelsector = LABELSECTOR;
diskp->dk_blkshift = DK_BSIZE2BLKSHIFT(blocksize);
diskp->dk_byteshift = DK_BSIZE2BYTESHIFT(blocksize);
diskp->dk_name = name;
diskp->dk_driver = driver;
}
/*
* Rename a disk.
*/
void
disk_rename(struct disk *diskp, const char *name)
{
diskp->dk_name = name;
iostat_rename(diskp->dk_stats, diskp->dk_name);
}
/*
* Attach a disk.
*/
void
disk_attach(struct disk *diskp)
{
/*
* Allocate and initialize the disklabel structures.
*/
diskp->dk_label = kmem_zalloc(sizeof(struct disklabel), KM_SLEEP);
diskp->dk_cpulabel = kmem_zalloc(sizeof(struct cpu_disklabel),
KM_SLEEP);
/*
* Set up the stats collection.
*/
diskp->dk_stats = iostat_alloc(IOSTAT_DISK, diskp, diskp->dk_name);
}
int
disk_begindetach(struct disk *dk, int (*lastclose)(device_t),
device_t self, int flags)
{
int rc;
rc = 0;
mutex_enter(&dk->dk_openlock);
if (dk->dk_openmask == 0)
; /* nothing to do */
else if ((flags & DETACH_FORCE) == 0)
rc = EBUSY;
else if (lastclose != NULL)
rc = (*lastclose)(self);
mutex_exit(&dk->dk_openlock);
return rc;
}
/*
* Detach a disk.
*/
void
disk_detach(struct disk *diskp)
{
/*
* Remove from the drivelist.
*/
iostat_free(diskp->dk_stats);
/*
* Release the disk-info dictionary.
*/
if (diskp->dk_info) {
prop_object_release(diskp->dk_info);
diskp->dk_info = NULL;
}
/*
* Free the space used by the disklabel structures.
*/
kmem_free(diskp->dk_label, sizeof(*diskp->dk_label));
kmem_free(diskp->dk_cpulabel, sizeof(*diskp->dk_cpulabel));
}
void
disk_destroy(struct disk *diskp)
{
mutex_destroy(&diskp->dk_openlock);
mutex_destroy(&diskp->dk_rawlock);
}
/*
* Mark the disk as having work queued for metrics collection.
*/
void
disk_wait(struct disk *diskp)
{
iostat_wait(diskp->dk_stats);
}
/*
* Mark the disk as busy for metrics collection.
*/
void
disk_busy(struct disk *diskp)
{
iostat_busy(diskp->dk_stats);
}
/*
* Finished disk operations, gather metrics.
*/
void
disk_unbusy(struct disk *diskp, long bcount, int read)
{
iostat_unbusy(diskp->dk_stats, bcount, read);
}
/*
* Return true if disk has an I/O operation in flight.
*/
bool
disk_isbusy(struct disk *diskp)
{
return iostat_isbusy(diskp->dk_stats);
}
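/*
 * Accounting sketch: a driver brackets each transfer with disk_busy()
 * and disk_unbusy() so iostat(8) sees it.  The softc "sc" with an
 * embedded struct disk sc_dk is hypothetical:
 *
 *	disk_busy(&sc->sc_dk);			// transfer started
 *	...
 *	disk_unbusy(&sc->sc_dk, bp->b_bcount - bp->b_resid,
 *	    (bp->b_flags & B_READ) != 0);	// completion path
 *	biodone(bp);
 */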
/*
* Bounds checking against the media size, used for the raw partition.
* secsize, mediasize and b_blkno must all be in the same units.
* Possibly this has to be DEV_BSIZE (512).
*/
int
bounds_check_with_mediasize(struct buf *bp, int secsize, uint64_t mediasize)
{
int64_t sz;
if (bp->b_blkno < 0) {
/* Reject negative offsets immediately. */
bp->b_error = EINVAL;
return 0;
}
sz = howmany((int64_t)bp->b_bcount, secsize);
/*
* bp->b_bcount is a 32-bit value, and we rejected a negative
* bp->b_blkno already, so "bp->b_blkno + sz" cannot overflow.
*/
if (bp->b_blkno + sz > mediasize) {
sz = mediasize - bp->b_blkno;
if (sz == 0) {
/* If exactly at end of disk, return EOF. */
bp->b_resid = bp->b_bcount;
return 0;
}
if (sz < 0) {
/* If past end of disk, return EINVAL. */
bp->b_error = EINVAL;
return 0;
}
/* Otherwise, truncate request. */
bp->b_bcount = sz * secsize;
}
return 1;
}
/*
* Determine the size of the transfer, and make sure it is
* within the boundaries of the partition. Adjust transfer
* if needed, and signal errors or early completion.
*/
int
bounds_check_with_label(struct disk *dk, struct buf *bp, int wlabel)
{
struct disklabel *lp = dk->dk_label;
struct partition *p = lp->d_partitions + DISKPART(bp->b_dev);
uint64_t p_size, p_offset, labelsector;
int64_t sz;
if (bp->b_blkno < 0) {
/* Reject negative offsets immediately. */
bp->b_error = EINVAL;
return -1;
}
/* Protect against division by zero. XXX: Should never happen?!?! */
if ((lp->d_secsize / DEV_BSIZE) == 0 || lp->d_secpercyl == 0) {
bp->b_error = EINVAL;
return -1;
}
p_size = (uint64_t)p->p_size << dk->dk_blkshift;
p_offset = (uint64_t)p->p_offset << dk->dk_blkshift;
#if RAW_PART == 3
labelsector = lp->d_partitions[2].p_offset;
#else
labelsector = lp->d_partitions[RAW_PART].p_offset;
#endif
labelsector = (labelsector + dk->dk_labelsector) << dk->dk_blkshift;
sz = howmany((int64_t)bp->b_bcount, DEV_BSIZE);
/*
* bp->b_bcount is a 32-bit value, and we rejected a negative
* bp->b_blkno already, so "bp->b_blkno + sz" cannot overflow.
*/
if (bp->b_blkno + sz > p_size) {
sz = p_size - bp->b_blkno;
if (sz == 0) {
/* If exactly at end of disk, return EOF. */
bp->b_resid = bp->b_bcount;
return 0;
}
if (sz < 0) {
/* If past end of disk, return EINVAL. */
bp->b_error = EINVAL;
return -1;
}
/* Otherwise, truncate request. */
bp->b_bcount = sz << DEV_BSHIFT;
}
/* Overwriting disk label? */
if (bp->b_blkno + p_offset <= labelsector &&
bp->b_blkno + p_offset + sz > labelsector &&
(bp->b_flags & B_READ) == 0 && !wlabel) {
bp->b_error = EROFS;
return -1;
}
/* calculate cylinder for disksort to order transfers with */
bp->b_cylinder = (bp->b_blkno + p->p_offset) /
(lp->d_secsize / DEV_BSIZE) / lp->d_secpercyl;
return 1;
}
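/*
 * Usage sketch from a hypothetical driver's strategy routine: a return
 * value <= 0 means the buffer was handled here (error or EOF) and must
 * only be completed, not queued:
 *
 *	if (bounds_check_with_label(&sc->sc_dk, bp, sc->sc_wlabel) <= 0) {
 *		biodone(bp);
 *		return;
 *	}
 *	bufq_put(sc->sc_bufq, bp);
 */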
int
disk_read_sectors(void (*strat)(struct buf *), const struct disklabel *lp,
struct buf *bp, unsigned int sector, int count)
{
if ((lp->d_secsize / DEV_BSIZE) == 0 || lp->d_secpercyl == 0)
return EINVAL;
bp->b_blkno = btodb((off_t)sector * lp->d_secsize);
bp->b_bcount = count * lp->d_secsize;
bp->b_flags = (bp->b_flags & ~B_WRITE) | B_READ;
bp->b_oflags &= ~BO_DONE;
bp->b_cylinder = sector / lp->d_secpercyl;
(*strat)(bp);
return biowait(bp);
}
const char *
convertdisklabel(struct disklabel *lp, void (*strat)(struct buf *),
struct buf *bp, uint32_t secperunit)
{
struct partition rp, *altp, *p;
int geom_ok;
const char *str;
memset(&rp, 0, sizeof(rp));
rp.p_size = secperunit;
rp.p_fstype = FS_UNUSED;
/* If we can seek to d_secperunit - 1, believe the disk geometry. */
if (secperunit != 0 &&
disk_read_sectors(strat, lp, bp, secperunit - 1, 1) == 0)
geom_ok = 1;
else
geom_ok = 0;
#if 0
printf("%s: secperunit (%" PRIu32 ") %s\n", __func__,
secperunit, geom_ok ? "ok" : "not ok");
#endif
p = &lp->d_partitions[RAW_PART];
if (RAW_PART == 'c' - 'a')
altp = &lp->d_partitions['d' - 'a'];
else
altp = &lp->d_partitions['c' - 'a'];
if (lp->d_npartitions > RAW_PART && p->p_offset == 0 && p->p_size != 0)
return NULL; /* already a raw partition */
else if (lp->d_npartitions > MAX('c', 'd') - 'a' &&
altp->p_offset == 0 && altp->p_size != 0) {
/* alternate partition ('c' or 'd') is suitable for raw slot,
* swap with 'd' or 'c'.
*/
rp = *p;
*p = *altp;
*altp = rp;
return NULL;
} else if (lp->d_npartitions <= RAW_PART &&
lp->d_npartitions > 'c' - 'a') {
/* No raw partition is present, but the alternate is present.
* Copy alternate to raw partition.
*/
lp->d_npartitions = RAW_PART + 1;
*p = *altp;
return NULL;
} else if (!geom_ok)
str = "no raw partition and disk reports bad geometry";
else if (lp->d_npartitions <= RAW_PART) {
memset(&lp->d_partitions[lp->d_npartitions], 0,
sizeof(struct partition) * (RAW_PART - lp->d_npartitions));
*p = rp;
lp->d_npartitions = RAW_PART + 1;
return NULL;
} else if (lp->d_npartitions < MAXPARTITIONS) {
memmove(p + 1, p,
sizeof(struct partition) * (lp->d_npartitions - RAW_PART));
*p = rp;
lp->d_npartitions++;
return NULL;
} else
str = "no raw partition and partition table is full";
#ifdef DIAGNOSTIC
printf("Bad partition: %s\n", str);
printf("type = %u, subtype = %u, typename = %s\n",
lp->d_type, lp->d_subtype, lp->d_typename);
printf("secsize = %u, nsectors = %u, ntracks = %u\n",
lp->d_secsize, lp->d_nsectors, lp->d_ntracks);
printf("ncylinders = %u, secpercyl = %u, secperunit = %u\n",
lp->d_ncylinders, lp->d_secpercyl, lp->d_secperunit);
printf("npartitions = %u\n", lp->d_npartitions);
for (size_t i = 0; i < MIN(lp->d_npartitions, MAXPARTITIONS); i++) {
p = &lp->d_partitions[i];
printf("\t%c: offset = %u size = %u fstype = %u\n",
(char)(i + 'a'), p->p_offset, p->p_size, p->p_fstype);
}
#endif
return str;
}
/*
* disk_ioctl --
* Generic disk ioctl handling.
*/
int
disk_ioctl(struct disk *dk, dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
struct dkwedge_info *dkw;
struct partinfo *pi;
struct partition *dp;
#ifdef __HAVE_OLD_DISKLABEL
struct disklabel newlabel;
#endif
switch (cmd) {
case DIOCGDISKINFO: {
prop_dictionary_t disk_info;
int error;
mutex_enter(&dk->dk_openlock);
if ((disk_info = dk->dk_info) == NULL) {
error = ENOTSUP;
} else {
prop_object_retain(disk_info);
error = 0;
}
mutex_exit(&dk->dk_openlock);
if (error)
return error;
error = prop_dictionary_copyout_ioctl(data, cmd, disk_info);
prop_object_release(disk_info);
return error;
}
case DIOCGSECTORSIZE:
*(u_int *)data = dk->dk_geom.dg_secsize;
return 0;
case DIOCGMEDIASIZE:
*(off_t *)data = (off_t)dk->dk_geom.dg_secsize *
dk->dk_geom.dg_secperunit;
return 0;
default:
break;
}
if (dev == NODEV)
return EPASSTHROUGH;
/* The following should be moved to dk_ioctl */
switch (cmd) {
case DIOCGDINFO:
if (dk->dk_label == NULL)
return EBUSY;
memcpy(data, dk->dk_label, sizeof (*dk->dk_label));
return 0;
#ifdef __HAVE_OLD_DISKLABEL
case ODIOCGDINFO:
if (dk->dk_label == NULL)
return EBUSY;
memcpy(&newlabel, dk->dk_label, sizeof(newlabel));
if (newlabel.d_npartitions > OLDMAXPARTITIONS)
return ENOTTY;
memcpy(data, &newlabel, sizeof(struct olddisklabel));
return 0;
#endif
case DIOCGPARTINFO:
pi = data;
memset(pi, 0, sizeof(*pi));
pi->pi_secsize = dk->dk_geom.dg_secsize;
pi->pi_bsize = MAX(BLKDEV_IOSIZE, pi->pi_secsize);
if (DISKPART(dev) == RAW_PART) {
pi->pi_size = dk->dk_geom.dg_secperunit;
return 0;
}
if (dk->dk_label == NULL)
return EBUSY;
dp = &dk->dk_label->d_partitions[DISKPART(dev)];
pi->pi_offset = dp->p_offset;
pi->pi_size = dp->p_size;
pi->pi_fstype = dp->p_fstype;
pi->pi_frag = dp->p_frag;
pi->pi_fsize = dp->p_fsize;
pi->pi_cpg = dp->p_cpg;
/*
* dholland 20130616: XXX this logic should not be
* here. It is here because the old buffer cache
* demands that all accesses to the same blocks need
* to be the same size; but it only works for FFS and
* nowadays I think it'll fail silently if the size
* info in the disklabel is wrong. (Or missing.) The
* buffer cache needs to be smarter; or failing that
* we need a reliable way here to get the right block
* size; or a reliable way to guarantee that (a) the
* fs is not mounted when we get here and (b) any
* buffers generated here will get purged when the fs
* does get mounted.
*/
if (dp->p_fstype == FS_BSDFFS &&
dp->p_frag != 0 && dp->p_fsize != 0)
pi->pi_bsize = dp->p_frag * dp->p_fsize;
return 0;
case DIOCAWEDGE:
if ((flag & FWRITE) == 0)
return EBADF;
dkw = data;
strlcpy(dkw->dkw_parent, dk->dk_name, sizeof(dkw->dkw_parent));
return dkwedge_add(dkw);
case DIOCDWEDGE:
if ((flag & FWRITE) == 0)
return EBADF;
dkw = data;
strlcpy(dkw->dkw_parent, dk->dk_name, sizeof(dkw->dkw_parent));
return dkwedge_del(dkw);
case DIOCLWEDGES:
return dkwedge_list(dk, data, l);
case DIOCMWEDGES:
if ((flag & FWRITE) == 0)
return EBADF;
dkwedge_discover(dk);
return 0;
case DIOCRMWEDGES:
if ((flag & FWRITE) == 0)
return EBADF;
dkwedge_delidle(dk);
return 0;
default:
return EPASSTHROUGH;
}
}
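/*
 * Delegation sketch: a driver's ioctl entry point typically tries this
 * generic handler first and falls back to its own commands when it
 * returns EPASSTHROUGH (the softc "sc" is hypothetical):
 *
 *	error = disk_ioctl(&sc->sc_dk, dev, cmd, data, flag, l);
 *	if (error != EPASSTHROUGH)
 *		return error;
 *	switch (cmd) {
 *	... driver-specific ioctls ...
 *	}
 */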
/*
* disk_set_info --
* Canonicalize dk->dk_geom and set some parameters.
*
* If disk_set_info can happen concurrently with disk_ioctl in a
* driver, the driver must serialize calls to disk_set_info with
* dk_openlock.
*/
void
disk_set_info(device_t dev, struct disk *dk, const char *type)
{
struct disk_geom *dg = &dk->dk_geom;
if (dg->dg_secsize == 0) {
#ifdef DIAGNOSTIC
printf("%s: fixing 0 sector size\n", dk->dk_name);
#endif
dg->dg_secsize = DEV_BSIZE;
}
dk->dk_blkshift = DK_BSIZE2BLKSHIFT(dg->dg_secsize);
dk->dk_byteshift = DK_BSIZE2BYTESHIFT(dg->dg_secsize);
if (dg->dg_secperunit == 0) {
#ifdef DIAGNOSTIC
if (dg->dg_ncylinders == 0) {
printf("%s: secperunit and ncylinders are zero\n",
dk->dk_name);
}
if (dg->dg_nsectors == 0 || dg->dg_ntracks == 0) {
printf("%s: secperunit and (sectors or tracks) "
"are zero\n", dk->dk_name);
}
#endif
dg->dg_secperunit = (int64_t) dg->dg_nsectors *
dg->dg_ntracks * dg->dg_ncylinders;
}
if (dg->dg_ncylinders == 0) {
if (dg->dg_ntracks && dg->dg_nsectors)
dg->dg_ncylinders = dg->dg_secperunit /
(dg->dg_ntracks * dg->dg_nsectors);
}
prop_dictionary_t disk_info, odisk_info, geom;
disk_info = prop_dictionary_create();
geom = prop_dictionary_create();
prop_dictionary_set_uint64(geom, "sectors-per-unit",
dg->dg_secperunit);
prop_dictionary_set_uint32(geom, "sector-size", dg->dg_secsize);
if (dg->dg_nsectors)
prop_dictionary_set_uint16(geom, "sectors-per-track",
dg->dg_nsectors);
if (dg->dg_ntracks)
prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
dg->dg_ntracks);
if (dg->dg_ncylinders)
prop_dictionary_set_uint64(geom, "cylinders-per-unit",
dg->dg_ncylinders);
prop_dictionary_set(disk_info, "geometry", geom);
if (type)
prop_dictionary_set_string_nocopy(disk_info, "type", type);
prop_object_release(geom);
odisk_info = dk->dk_info;
dk->dk_info = disk_info;
if (dev)
prop_dictionary_set(device_properties(dev), "disk-info",
disk_info);
/*
* Don't release disk_info here; we keep a reference to it.
* disk_detach() will release it when we go away.
*/
if (odisk_info)
prop_object_release(odisk_info);
}
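/*
 * Serialization sketch for the rule noted above disk_set_info, for a
 * hypothetical driver that can re-read its geometry while the device
 * is open and being ioctl'd:
 *
 *	mutex_enter(&sc->sc_dk.dk_openlock);
 *	disk_set_info(sc->sc_dev, &sc->sc_dk, "ESDI");
 *	mutex_exit(&sc->sc_dk.dk_openlock);
 */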
int
disklabel_dev_unit(dev_t dev)
{
return DISKUNIT(dev);
}
/* $NetBSD: tmpfs_fifoops.c,v 1.15 2021/07/19 01:30:25 dholland Exp $ */
/*
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* tmpfs vnode interface for named pipes.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_fifoops.c,v 1.15 2021/07/19 01:30:25 dholland Exp $");
#include <sys/param.h>
#include <sys/vnode.h>
#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_fifoops.h>
/*
* vnode operations vector used for fifos stored in a tmpfs file system.
*/
int (**tmpfs_fifoop_p)(void *);
const struct vnodeopv_entry_desc tmpfs_fifoop_entries[] = {
{ &vop_default_desc, vn_default_error },
GENFS_FIFOOP_ENTRIES,
{ &vop_close_desc, tmpfs_fifo_close },
{ &vop_access_desc, tmpfs_access },
{ &vop_accessx_desc, genfs_accessx },
{ &vop_getattr_desc, tmpfs_getattr },
{ &vop_setattr_desc, tmpfs_setattr },
{ &vop_read_desc, tmpfs_fifo_read },
{ &vop_write_desc, tmpfs_fifo_write },
{ &vop_fcntl_desc, genfs_fcntl },
{ &vop_fsync_desc, vn_fifo_bypass },
{ &vop_inactive_desc, tmpfs_inactive },
{ &vop_reclaim_desc, tmpfs_reclaim },
{ &vop_lock_desc, genfs_lock },
{ &vop_unlock_desc, genfs_unlock },
{ &vop_strategy_desc, vn_fifo_bypass },
{ &vop_print_desc, tmpfs_print },
{ &vop_islocked_desc, genfs_islocked },
{ &vop_bwrite_desc, genfs_nullop },
{ NULL, NULL }
};
const struct vnodeopv_desc tmpfs_fifoop_opv_desc = {
&tmpfs_fifoop_p, tmpfs_fifoop_entries
};
int
tmpfs_fifo_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap __unused = v;
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), v);
}
int
tmpfs_fifo_read(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
tmpfs_update(vp, TMPFS_UPDATE_ATIME);
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), v);
}
int
tmpfs_fifo_write(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
tmpfs_update(vp, TMPFS_UPDATE_MTIME);
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), v);
}
/* $NetBSD: kern_cpu.c,v 1.97 2023/09/02 17:44:59 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c)2007 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* CPU related routines not shared with rump.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.97 2023/09/02 17:44:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_cpu_ucode.h"
#include "opt_heartbeat.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/idle.h>
#include <sys/sched.h>
#include <sys/intr.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/cpuio.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/xcall.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/select.h>
#include <sys/namei.h>
#include <sys/callout.h>
#include <sys/pcu.h>
#include <sys/heartbeat.h>
#include <uvm/uvm_extern.h>
#include "ioconf.h"
/*
* If the port has stated that cpu_data is the first thing in cpu_info,
* verify that the claim is true. This will prevent them from getting out
* of sync.
*/
#ifdef __HAVE_CPU_DATA_FIRST
CTASSERT(offsetof(struct cpu_info, ci_data) == 0);
#else
CTASSERT(offsetof(struct cpu_info, ci_data) != 0);
#endif
int (*compat_cpuctl_ioctl)(struct lwp *, u_long, void *) = (void *)enosys;
static void cpu_xc_online(struct cpu_info *, void *);
static void cpu_xc_offline(struct cpu_info *, void *);
dev_type_ioctl(cpuctl_ioctl);
const struct cdevsw cpuctl_cdevsw = {
.d_open = nullopen,
.d_close = nullclose,
.d_read = nullread,
.d_write = nullwrite,
.d_ioctl = cpuctl_ioctl,
.d_stop = nullstop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
int
mi_cpu_attach(struct cpu_info *ci)
{
int error;
KASSERT(maxcpus > 0);
if ((ci->ci_index = ncpu) >= maxcpus)
panic("Too many CPUs. Increase MAXCPUS?");
kcpuset_set(kcpuset_attached, cpu_index(ci));
/*
* Create a convenience cpuset of just ourselves.
*/
kcpuset_create(&ci->ci_kcpuset, true);
kcpuset_set(ci->ci_kcpuset, cpu_index(ci));
TAILQ_INIT(&ci->ci_data.cpu_ld_locks);
__cpu_simple_lock_init(&ci->ci_data.cpu_ld_lock);
/* This is useful for, e.g., per-CPU evcnt. */
snprintf(ci->ci_data.cpu_name, sizeof(ci->ci_data.cpu_name), "cpu%d",
cpu_index(ci));
if (__predict_false(cpu_infos == NULL)) {
size_t ci_bufsize = (maxcpus + 1) * sizeof(struct cpu_info *);
cpu_infos = kmem_zalloc(ci_bufsize, KM_SLEEP);
}
cpu_infos[cpu_index(ci)] = ci;
sched_cpuattach(ci);
error = create_idle_lwp(ci);
if (error != 0) {
/* XXX revert sched_cpuattach */
return error;
}
if (ci == curcpu())
ci->ci_onproc = curlwp;
else
ci->ci_onproc = ci->ci_data.cpu_idlelwp;
percpu_init_cpu(ci);
softint_init(ci);
callout_init_cpu(ci);
xc_init_cpu(ci);
pool_cache_cpu_init(ci);
selsysinit(ci);
cache_cpu_init(ci);
TAILQ_INIT(&ci->ci_data.cpu_biodone);
ncpu++;
ncpuonline++;
return 0;
}
void
cpuctlattach(int dummy __unused)
{
KASSERT(cpu_infos != NULL);
}
int
cpuctl_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l)
{
CPU_INFO_ITERATOR cii;
cpustate_t *cs;
struct cpu_info *ci;
int error, i;
u_int id;
error = 0;
mutex_enter(&cpu_lock);
switch (cmd) {
case IOC_CPU_SETSTATE:
cs = data;
error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_CPU, KAUTH_REQ_SYSTEM_CPU_SETSTATE, cs, NULL,
NULL);
if (error != 0)
break;
if (cs->cs_id >= maxcpus ||
(ci = cpu_lookup(cs->cs_id)) == NULL) {
error = ESRCH;
break;
}
cpu_setintr(ci, cs->cs_intr); /* XXX neglect errors */
error = cpu_setstate(ci, cs->cs_online);
break;
case IOC_CPU_GETSTATE:
cs = data;
id = cs->cs_id;
memset(cs, 0, sizeof(*cs));
cs->cs_id = id;
if (cs->cs_id >= maxcpus ||
(ci = cpu_lookup(id)) == NULL) {
error = ESRCH;
break;
}
if ((ci->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0)
cs->cs_online = false;
else
cs->cs_online = true;
if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
cs->cs_intr = false;
else
cs->cs_intr = true;
cs->cs_lastmod = (int32_t)ci->ci_schedstate.spc_lastmod;
cs->cs_lastmodhi = (int32_t)
(ci->ci_schedstate.spc_lastmod >> 32);
cs->cs_intrcnt = cpu_intr_count(ci) + 1;
cs->cs_hwid = ci->ci_cpuid;
break;
case IOC_CPU_MAPID:
i = 0;
for (CPU_INFO_FOREACH(cii, ci)) {
if (i++ == *(int *)data)
break;
}
if (ci == NULL)
error = ESRCH;
else
*(int *)data = cpu_index(ci);
break;
case IOC_CPU_GETCOUNT:
*(int *)data = ncpu;
break;
#ifdef CPU_UCODE
case IOC_CPU_UCODE_GET_VERSION:
error = cpu_ucode_get_version((struct cpu_ucode_version *)data);
break;
case IOC_CPU_UCODE_APPLY:
error = kauth_authorize_machdep(l->l_cred,
KAUTH_MACHDEP_CPU_UCODE_APPLY,
NULL, NULL, NULL, NULL);
if (error != 0)
break;
error = cpu_ucode_apply((const struct cpu_ucode *)data);
break;
#endif
default:
error = (*compat_cpuctl_ioctl)(l, cmd, data);
break;
}
mutex_exit(&cpu_lock);
return error;
}
struct cpu_info *
cpu_lookup(u_int idx)
{
struct cpu_info *ci;
/*
* cpu_infos is a NULL terminated array of MAXCPUS + 1 entries,
* so an index of MAXCPUS here is ok. See mi_cpu_attach.
*/
KASSERT(idx <= maxcpus);
if (__predict_false(cpu_infos == NULL)) {
KASSERT(idx == 0);
return curcpu();
}
ci = cpu_infos[idx];
KASSERT(ci == NULL || cpu_index(ci) == idx);
KASSERTMSG(idx < maxcpus || ci == NULL, "idx %d ci %p", idx, ci);
return ci;
}
static void
cpu_xc_offline(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc, *mspc = NULL;
struct cpu_info *target_ci;
struct lwp *l;
CPU_INFO_ITERATOR cii;
int s;
/*
* Thread that made the cross call (separate context) holds
* cpu_lock on our behalf.
*/
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags |= SPCF_OFFLINE;
splx(s);
/* Take the first available CPU for the migration. */
for (CPU_INFO_FOREACH(cii, target_ci)) {
mspc = &target_ci->ci_schedstate;
if ((mspc->spc_flags & SPCF_OFFLINE) == 0)
break;
}
KASSERT(target_ci != NULL);
/*
* Migrate all non-bound threads to the other CPU. Note that this
* runs from the xcall thread, thus handling of LSONPROC is not needed.
*/
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
struct cpu_info *mci;
lwp_lock(l);
if (l->l_cpu != ci || (l->l_pflag & (LP_BOUND | LP_INTR))) {
lwp_unlock(l);
continue;
}
/* Regular case - no affinity. */
if (l->l_affinity == NULL) {
lwp_migrate(l, target_ci);
continue;
}
/* Affinity is set, find an online CPU in the set. */
for (CPU_INFO_FOREACH(cii, mci)) {
mspc = &mci->ci_schedstate;
if ((mspc->spc_flags & SPCF_OFFLINE) == 0 &&
kcpuset_isset(l->l_affinity, cpu_index(mci)))
break;
}
if (mci == NULL) {
lwp_unlock(l);
mutex_exit(&proc_lock);
goto fail;
}
lwp_migrate(l, mci);
}
mutex_exit(&proc_lock);
#if PCU_UNIT_COUNT > 0
pcu_save_all_on_cpu();
#endif
heartbeat_suspend();
#ifdef __HAVE_MD_CPU_OFFLINE
cpu_offline_md();
#endif
return;
fail:
/* Just unset the SPCF_OFFLINE flag, caller will check */
s = splsched();
spc->spc_flags &= ~SPCF_OFFLINE;
splx(s);
}
static void
cpu_xc_online(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
heartbeat_resume();
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags &= ~SPCF_OFFLINE;
splx(s);
}
int
cpu_setstate(struct cpu_info *ci, bool online)
{
struct schedstate_percpu *spc;
CPU_INFO_ITERATOR cii;
struct cpu_info *ci2;
uint64_t where;
xcfunc_t func;
int nonline;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(&cpu_lock));
if (online) {
if ((spc->spc_flags & SPCF_OFFLINE) == 0)
return 0;
func = (xcfunc_t)cpu_xc_online;
} else {
if ((spc->spc_flags & SPCF_OFFLINE) != 0)
return 0;
nonline = 0;
/*
* Ensure that at least one CPU within the processor set
* stays online. Revisit this later.
*/
for (CPU_INFO_FOREACH(cii, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0)
continue;
if (ci2->ci_schedstate.spc_psid != spc->spc_psid)
continue;
nonline++;
}
if (nonline == 1)
return EBUSY;
func = (xcfunc_t)cpu_xc_offline;
}
where = xc_unicast(0, func, ci, NULL, ci);
xc_wait(where);
if (online) {
KASSERT((spc->spc_flags & SPCF_OFFLINE) == 0);
ncpuonline++;
} else {
if ((spc->spc_flags & SPCF_OFFLINE) == 0) {
/* If it was not set offline, then it is busy. */
return EBUSY;
}
ncpuonline--;
}
spc->spc_lastmod = time_second;
return 0;
}
#if defined(__HAVE_INTR_CONTROL)
static void
cpu_xc_intr(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags &= ~SPCF_NOINTR;
splx(s);
}
static void
cpu_xc_nointr(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags |= SPCF_NOINTR;
splx(s);
}
int
cpu_setintr(struct cpu_info *ci, bool intr)
{
struct schedstate_percpu *spc;
CPU_INFO_ITERATOR cii;
struct cpu_info *ci2;
uint64_t where;
xcfunc_t func;
int nintr;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(&cpu_lock));
if (intr) {
if ((spc->spc_flags & SPCF_NOINTR) == 0)
return 0;
func = (xcfunc_t)cpu_xc_intr;
} else {
if (CPU_IS_PRIMARY(ci)) /* XXX kern/45117 */
return EINVAL;
if ((spc->spc_flags & SPCF_NOINTR) != 0)
return 0;
/*
* Ensure that at least one CPU within the system
* is handling device interrupts.
*/
nintr = 0;
for (CPU_INFO_FOREACH(cii, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
continue;
if (ci2 == ci)
continue;
nintr++;
}
if (nintr == 0)
return EBUSY;
func = (xcfunc_t)cpu_xc_nointr;
}
where = xc_unicast(0, func, ci, NULL, ci);
xc_wait(where);
if (intr) {
KASSERT((spc->spc_flags & SPCF_NOINTR) == 0);
} else if ((spc->spc_flags & SPCF_NOINTR) == 0) {
/* If interrupts were not shut off, then it is busy. */
return EBUSY;
}
/* Direct interrupts away from the CPU and record the change. */
cpu_intr_redistribute();
spc->spc_lastmod = time_second;
return 0;
}
#else /* __HAVE_INTR_CONTROL */
int
cpu_setintr(struct cpu_info *ci, bool intr)
{
return EOPNOTSUPP;
}
u_int
cpu_intr_count(struct cpu_info *ci)
{
return 0; /* 0 == "don't know" */
}
#endif /* __HAVE_INTR_CONTROL */
#ifdef CPU_UCODE
int
cpu_ucode_load(struct cpu_ucode_softc *sc, const char *fwname)
{
firmware_handle_t fwh;
int error;
if (sc->sc_blob != NULL) {
firmware_free(sc->sc_blob, sc->sc_blobsize);
sc->sc_blob = NULL;
sc->sc_blobsize = 0;
}
error = cpu_ucode_md_open(&fwh, sc->loader_version, fwname);
if (error != 0) {
#ifdef DEBUG
printf("ucode: firmware_open(%s) failed: %i\n", fwname, error);
#endif
goto err0;
}
sc->sc_blobsize = firmware_get_size(fwh);
if (sc->sc_blobsize == 0) {
error = EFTYPE;
firmware_close(fwh);
goto err0;
}
sc->sc_blob = firmware_malloc(sc->sc_blobsize);
if (sc->sc_blob == NULL) {
error = ENOMEM;
firmware_close(fwh);
goto err0;
}
error = firmware_read(fwh, 0, sc->sc_blob, sc->sc_blobsize);
firmware_close(fwh);
if (error != 0)
goto err1;
return 0;
err1:
firmware_free(sc->sc_blob, sc->sc_blobsize);
sc->sc_blob = NULL;
sc->sc_blobsize = 0;
err0:
return error;
}
#endif
/* $NetBSD: st.c,v 1.243 2022/02/23 21:54:41 andvar Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Originally written by Julian Elischer (julian@tfs.com)
* for TRW Financial Systems for use under the MACH(2.5) operating system.
*
* TRW Financial Systems, in accordance with their agreement with Carnegie
* Mellon University, makes this software available to CMU to distribute
* or use in any manner that they see fit as long as this message is kept with
* the software. For this reason TFS also grants any other persons or
* organisations permission to use or modify this software.
*
* TFS supplies this software to be publicly redistributed
* on the understanding that TFS is not responsible for the correct
* functioning of this software in any circumstances.
*
* Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992
* major changes by Julian Elischer (julian@jules.dialix.oz.au) May 1993
*
* A lot of rewhacking done by mjacob (mjacob@nas.nasa.gov).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: st.c,v 1.243 2022/02/23 21:54:41 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_scsi.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/proc.h>
#include <sys/mtio.h>
#include <sys/device.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/iostat.h>
#include <sys/sysctl.h>
#include <dev/scsipi/scsi_spc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_tape.h>
#include <dev/scsipi/scsipiconf.h>
#include <dev/scsipi/scsipi_base.h>
#include <dev/scsipi/stvar.h>
/* Defines for device specific stuff */
#define DEF_FIXED_BSIZE 512
#define STMODE(z) ( minor(z) & 0x03)
#define STDSTY(z) ((minor(z) >> 2) & 0x03)
#define STUNIT(z) ((minor(z) >> 4) )
#define STNMINOR 16
#define NORMAL_MODE 0
#define NOREW_MODE 1
#define EJECT_MODE 2
#define CTRL_MODE 3
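/*
* Illustrative example (not from the original source): a hypothetical
* minor number 0x17 (binary 1 0111) decodes with the macros above as
* STMODE = 3 (CTRL_MODE), STDSTY = 1 and STUNIT = 1, i.e. the
* control-mode node for density slot 1 of unit 1.
*/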
#ifndef ST_MOUNT_DELAY
#define ST_MOUNT_DELAY 0
#endif
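/*
* Note: ST_MOUNT_DELAY defaults to 0 (no retries).  It is meant to be
* overridden at kernel configuration time, e.g. (assuming the usual
* options syntax) "options ST_MOUNT_DELAY=5" to retry TEST UNIT READY
* at open time up to 5 times, about one second apart.
*/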
static dev_type_open(stopen);
static dev_type_close(stclose);
static dev_type_read(stread);
static dev_type_write(stwrite);
static dev_type_ioctl(stioctl);
static dev_type_strategy(ststrategy);
static dev_type_dump(stdump);
const struct bdevsw st_bdevsw = {
.d_open = stopen,
.d_close = stclose,
.d_strategy = ststrategy,
.d_ioctl = stioctl,
.d_dump = stdump,
.d_psize = nosize,
.d_discard = nodiscard,
.d_flag = D_TAPE | D_MPSAFE
};
const struct cdevsw st_cdevsw = {
.d_open = stopen,
.d_close = stclose,
.d_read = stread,
.d_write = stwrite,
.d_ioctl = stioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_TAPE | D_MPSAFE
};
/*
* Define various devices that we know mis-behave in some way,
* and note how they are bad, so we can correct for them
*/
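/*
* Illustrative note (layout inferred from st_identify_drive() and
* st_loadquirks() below): each entry pairs a SCSI inquiry pattern with
* drive-wide quirks, a vendor-unique mode page 0 size, and four
* per-density override slots, one per group of minors (0-3, 4-7, 8-11,
* 12-15).  Each slot gives {quirks, forced block size, density code};
* zero means "no override".
*/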
static const struct st_quirk_inquiry_pattern st_quirk_patterns[] = {
{{T_SEQUENTIAL, T_REMOV,
" ", " ", " "}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_24}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 0, HALFINCH_1600}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 0, HALFINCH_6250} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 3600 ", ""}, {0, 12, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 3800 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " SLR5 4/8GB ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 1024, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
/*
* Lacking a manual for the 4200, it's not clear what the
* specific density codes should be; the device is a 2.5GB-
* capable QIC drive, and those density codes aren't readily
* available. The 'default' will just have to do.
*/
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 4200 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
/*
* At least -005 and -007 need this. I'll assume they all do unless I
* hear otherwise. - mycroft, 31MAR1994
*/
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 2525 25462", ""}, {0, 0, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{ST_Q_SENSE_HELP, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
/*
* One user reports that this works for his tape drive. It probably
* needs more work. - mycroft, 09APR1994
*/
{{T_SEQUENTIAL, T_REMOV,
"SANKYO ", "CP525 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ANRITSU ", "DMT780 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 150 21247", ""}, {ST_Q_ERASE_NOIMM, 12, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{0, 0, QIC_150}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 150 21531", ""}, {ST_Q_ERASE_NOIMM, 12, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{0, 0, QIC_150}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5099ES SCSI", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_11}, /* minor 4-7 */
{0, 0, QIC_24}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5150ES SCSI", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_24}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_150} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5525ES SCSI REV7", ""}, {0, 0, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_BLKSIZE, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WangDAT ", "Model 1300 ", ""}, {0, 0, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, DDS}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 1024, DDS}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 0, DDS} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"EXABYTE ", "EXB-8200 ", "263H"}, {0, 5, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"STK", "9490", ""},
{ST_Q_FORCE_BLKSIZE, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"STK", "SD-3", ""},
{ST_Q_FORCE_BLKSIZE, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"IBM", "03590", ""}, {ST_Q_IGNORE_LOADS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"HP ", "T4000s ", ""}, {ST_Q_UNIMODAL, 0, {
{0, 0, QIC_3095}, /* minor 0-3 */
{0, 0, QIC_3095}, /* minor 4-7 */
{0, 0, QIC_3095}, /* minor 8-11 */
{0, 0, QIC_3095}, /* minor 12-15 */
}}},
#if 0
{{T_SEQUENTIAL, T_REMOV,
"EXABYTE ", "EXB-8200 ", ""}, {0, 12, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
#endif
{{T_SEQUENTIAL, T_REMOV,
"TEAC ", "MT-2ST/N50 ", ""}, {ST_Q_IGNORE_LOADS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"OnStream", "ADR50 Drive", ""}, {ST_Q_UNIMODAL, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"OnStream DI-30", "", "1.0"}, {ST_Q_NOFILEMARKS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"NCR H621", "0-STD-03-46F880 ", ""}, {ST_Q_NOPREVENT, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"Seagate STT3401A", "hp0atxa", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 1024, 0}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 512, 0} /* minor 12-15 */
}}},
};
#define NOEJECT 0
#define EJECT 1
static void st_identify_drive(struct st_softc *,
struct scsipi_inquiry_pattern *);
static void st_loadquirks(struct st_softc *);
static int st_mount_tape(dev_t, int);
static void st_unmount(struct st_softc *, boolean);
static int st_decide_mode(struct st_softc *, boolean);
static void ststart(struct scsipi_periph *);
static int ststart1(struct scsipi_periph *, struct buf *, int *);
static void strestart(void *);
static void stdone(struct scsipi_xfer *, int);
static int st_read(struct st_softc *, char *, int, int);
static int st_space(struct st_softc *, int, u_int, int);
static int st_write_filemarks(struct st_softc *, int, int);
static int st_check_eod(struct st_softc *, boolean, int *, int);
static int st_load(struct st_softc *, u_int, int);
static int st_rewind(struct st_softc *, u_int, int);
static int st_interpret_sense(struct scsipi_xfer *);
static int st_touch_tape(struct st_softc *);
static int st_erase(struct st_softc *, int full, int flags);
static void st_updatefilepos(struct st_softc *);
static int st_rdpos(struct st_softc *, int, uint32_t *);
static int st_setpos(struct st_softc *, int, uint32_t *);
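/*
* Periph switch hooks consumed by scsipi(9); in order (an assumption
* about the field order of struct scsipi_periphsw): sense interpretation,
* queue (re)start, async event notification (unused here) and transfer
* completion.
*/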
static const struct scsipi_periphsw st_switch = {
st_interpret_sense,
ststart,
NULL,
stdone
};
#if defined(ST_ENABLE_EARLYWARN)
#define ST_INIT_FLAGS ST_EARLYWARN
#else
#define ST_INIT_FLAGS 0
#endif
/*
* The routine called by the low level scsi routine when it discovers
* a device suitable for this driver.
*/
void
stattach(device_t parent, device_t self, void *aux)
{
struct st_softc *st = device_private(self);
struct scsipibus_attach_args *sa = aux;
struct scsipi_periph *periph = sa->sa_periph;
SC_DEBUG(periph, SCSIPI_DB2, ("stattach: "));
st->sc_dev = self;
/* Store information needed to contact our base driver */
st->sc_periph = periph;
periph->periph_dev = st->sc_dev;
periph->periph_switch = &st_switch;
/* Set initial flags */
st->flags = ST_INIT_FLAGS;
/* Set up the buf queues for this device */
bufq_alloc(&st->buf_queue, "fcfs", 0);
bufq_alloc(&st->buf_defer, "fcfs", 0);
callout_init(&st->sc_callout, 0);
mutex_init(&st->sc_iolock, MUTEX_DEFAULT, IPL_VM);
/*
* Check if the drive is a known criminal and take
* any steps needed to bring it into line.
*/
st_identify_drive(st, &sa->sa_inqbuf);
aprint_naive("\n");
aprint_normal("\n");
/* Use the subdriver to request information regarding the drive. */
aprint_normal_dev(self, "%s", st->quirkdata ? "quirks apply, " : "");
if (scsipi_test_unit_ready(periph,
XS_CTL_DISCOVERY | XS_CTL_SILENT | XS_CTL_IGNORE_MEDIA_CHANGE) ||
st->ops(st, ST_OPS_MODESENSE,
XS_CTL_DISCOVERY | XS_CTL_SILENT | XS_CTL_IGNORE_MEDIA_CHANGE))
aprint_normal("drive empty\n");
else {
aprint_normal("density code %d, ", st->media_density);
if (st->media_blksize > 0)
aprint_normal("%d-byte", st->media_blksize);
else
aprint_normal("variable");
aprint_normal(" blocks, write-%s\n",
(st->flags & ST_READONLY) ? "protected" : "enabled");
}
st->stats = iostat_alloc(IOSTAT_TAPE, parent,
device_xname(st->sc_dev));
rnd_attach_source(&st->rnd_source, device_xname(st->sc_dev),
RND_TYPE_TAPE, RND_FLAG_DEFAULT);
}
int
stdetach(device_t self, int flags)
{
struct st_softc *st = device_private(self);
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_channel *chan = periph->periph_channel;
int bmaj, cmaj, mn;
/* locate the major number */
bmaj = bdevsw_lookup_major(&st_bdevsw);
cmaj = cdevsw_lookup_major(&st_cdevsw);
/* kill any pending restart */
callout_halt(&st->sc_callout, NULL);
mutex_enter(chan_mtx(chan));
/* Kill off any queued buffers. */
bufq_drain(st->buf_defer);
bufq_drain(st->buf_queue);
/* Kill off any pending commands. */
scsipi_kill_pending(st->sc_periph);
mutex_exit(chan_mtx(chan));
bufq_free(st->buf_defer);
bufq_free(st->buf_queue);
mutex_destroy(&st->sc_iolock);
/* Nuke the vnodes for any open instances */
mn = STUNIT(device_unit(self));
vdevgone(bmaj, mn, mn+STNMINOR-1, VBLK);
vdevgone(cmaj, mn, mn+STNMINOR-1, VCHR);
iostat_free(st->stats);
/* Unhook the entropy source. */
rnd_detach_source(&st->rnd_source);
return 0;
}
/*
* Use the inquiry routine in 'scsi_base' to get drive info so we can
* further tailor our behaviour.
*/
static void
st_identify_drive(struct st_softc *st, struct scsipi_inquiry_pattern *inqbuf)
{
const struct st_quirk_inquiry_pattern *finger;
int priority;
finger = scsipi_inqmatch(inqbuf,
st_quirk_patterns,
sizeof(st_quirk_patterns) / sizeof(st_quirk_patterns[0]),
sizeof(st_quirk_patterns[0]), &priority);
if (priority != 0) {
st->quirkdata = &finger->quirkdata;
st->drive_quirks = finger->quirkdata.quirks;
st->quirks = finger->quirkdata.quirks; /* start value */
st->page_0_size = finger->quirkdata.page_0_size;
KASSERT(st->page_0_size <= MAX_PAGE_0_SIZE);
st_loadquirks(st);
}
}
/*
* initialise the subdevices to the default (QUIRK) state.
* this will remove any setting made by the system operator or previous
* operations.
*/
static void
st_loadquirks(struct st_softc *st)
{
const struct modes *mode;
struct modes *mode2;
int i;
mode = st->quirkdata->modes;
mode2 = st->modes;
for (i = 0; i < 4; i++) {
memset(mode2, 0, sizeof(struct modes));
st->modeflags[i] &= ~(BLKSIZE_SET_BY_QUIRK |
DENSITY_SET_BY_QUIRK | BLKSIZE_SET_BY_USER |
DENSITY_SET_BY_USER);
if ((mode->quirks | st->drive_quirks) & ST_Q_FORCE_BLKSIZE) {
mode2->blksize = mode->blksize;
st->modeflags[i] |= BLKSIZE_SET_BY_QUIRK;
}
if (mode->density) {
mode2->density = mode->density;
st->modeflags[i] |= DENSITY_SET_BY_QUIRK;
}
mode2->quirks |= mode->quirks;
mode++;
mode2++;
}
}
/* open the device. */
static int
stopen(dev_t dev, int flags, int mode, struct lwp *l)
{
u_int stmode, dsty;
int error, sflags, unit, tries, ntries;
struct st_softc *st;
struct scsipi_periph *periph;
struct scsipi_adapter *adapt;
unit = STUNIT(dev);
st = device_lookup_private(&st_cd, unit);
if (st == NULL)
return ENXIO;
stmode = STMODE(dev);
dsty = STDSTY(dev);
periph = st->sc_periph;
adapt = periph->periph_channel->chan_adapter;
SC_DEBUG(periph, SCSIPI_DB1,
("open: dev=0x%"PRIx64" (unit %d (of %d))\n", dev, unit,
st_cd.cd_ndevs));
/* Only allow one at a time */
if (periph->periph_flags & PERIPH_OPEN) {
aprint_error_dev(st->sc_dev, "already open\n");
return EBUSY;
}
if ((error = scsipi_adapter_addref(adapt)) != 0)
return error;
/* clear any latched errors. */
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
/*
* Catch any unit attention errors. Be silent about this
* unless we're already mounted. We ignore media change
* if we're in control mode or not mounted yet.
*/
if ((st->flags & ST_MOUNTED) == 0 || stmode == CTRL_MODE) {
#ifdef SCSIDEBUG
sflags = XS_CTL_IGNORE_MEDIA_CHANGE;
#else
sflags = XS_CTL_SILENT|XS_CTL_IGNORE_MEDIA_CHANGE;
#endif
} else
sflags = 0;
/*
* If we're already mounted or we aren't configured for
* a mount delay, only try a test unit ready once. Otherwise,
* try up to ST_MOUNT_DELAY times with a rest interval of
* one second between each try.
*/
if ((st->flags & ST_MOUNTED) || ST_MOUNT_DELAY == 0)
ntries = 1;
else
ntries = ST_MOUNT_DELAY;
for (error = tries = 0; tries < ntries; tries++) {
int slpintr, oflags;
/*
* If we had no error, or we're opening the control mode
* device, we jump out right away.
*/
error = scsipi_test_unit_ready(periph, sflags);
if (error == 0 || stmode == CTRL_MODE)
break;
/*
* We had an error.
*
* If we're already mounted or we aren't configured for
* a mount delay, or the error isn't a NOT READY error,
* skip to the error exit now.
*/
if ((st->flags & ST_MOUNTED) || ST_MOUNT_DELAY == 0 ||
(st->mt_key != SKEY_NOT_READY)) {
device_printf(st->sc_dev,
"mount error (sense key=%d) - "
"terminating mount session\n",
st->mt_key);
/*
* the following should not trigger unless
* something serious happened while the device
* was open (PREVENT MEDIUM REMOVAL in effect)
*/
if (st->flags & ST_WRITTEN &&
st->mt_key == SKEY_UNIT_ATTENTION) {
/*
* device / media state may have changed
* refrain from writing missing file marks
* onto potentially newly inserted/formatted
* media (e. g. emergency EJECT/RESET/etc.)
*/
st->flags &= ~(ST_WRITTEN|ST_FM_WRITTEN);
device_printf(st->sc_dev,
"CAUTION: file marks/data may be missing"
" - ASC = 0x%02x, ASCQ = 0x%02x\n",
st->asc, st->ascq);
}
goto bad;
}
/* clear any latched errors. */
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
/*
* Fake that we have the device open so
* we block other apps from getting in.
*/
oflags = periph->periph_flags;
periph->periph_flags |= PERIPH_OPEN;
slpintr = kpause("stload", true, hz, NULL);
periph->periph_flags = oflags; /* restore flags */
if (slpintr != 0 && slpintr != EWOULDBLOCK) {
device_printf(st->sc_dev, "load interrupted\n");
goto bad;
}
}
/*
* If the mode is 3 (e.g. minor = 3,7,11,15) then the device has
* been opened to set defaults and perform other, usually non-I/O
* related, operations. In this case, do a quick check to see
* whether the unit actually had a tape loaded (this is known from
* whether or not we got a NOT READY for the above
* unit attention). If a tape is there, go do a mount sequence.
*/
if (stmode == CTRL_MODE && st->mt_key != SKEY_NO_SENSE &&
st->mt_key != SKEY_UNIT_ATTENTION) {
periph->periph_flags |= PERIPH_OPEN;
return 0;
}
/*
* If we get this far and had an error set, that means we failed
* to pass the 'test unit ready' test for the non-controlmode device,
* so we bounce the open.
*/
if (error)
return error;
/* Else, we're now committed to saying we're open. */
periph->periph_flags |= PERIPH_OPEN; /* unit attn are now errors */
/*
* If it's a different mode, or if the media has been
* invalidated, unmount the tape from the previous
* session but continue with open processing
*/
if (st->last_dsty != dsty ||
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)
st_unmount(st, NOEJECT);
/*
* If we are not mounted, then we should start a new
* mount session.
*/
if (!(st->flags & ST_MOUNTED)) {
if ((error = st_mount_tape(dev, flags)) != 0)
goto bad;
st->last_dsty = dsty;
}
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_PREVENT_DT,
XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY);
}
SC_DEBUG(periph, SCSIPI_DB2, ("open complete\n"));
return 0;
bad:
st_unmount(st, NOEJECT);
scsipi_adapter_delref(adapt);
periph->periph_flags &= ~PERIPH_OPEN;
return error;
}
static int
stclose(dev_t dev, int flags, int mode, struct lwp *l)
{
int stxx, error = 0;
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter;
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("closing\n"));
/*
* Make sure that a tape opened in write-only mode will have
* file marks written on it when closed, even if not written to.
*
* This is for SUN compatibility. Actually, the Sun way of
* things is to:
*
* only write filemarks if there are fmks to be written and
* - open for write (possibly read/write)
* - the last operation was a write
* or:
* - opened for wronly
* - no data was written (including filemarks)
*/
stxx = st->flags & (ST_WRITTEN | ST_FM_WRITTEN);
if ((flags & FWRITE) != 0) {
int nm = 0;
#ifdef ST_SUNCOMPAT
/*
* on request only
* original compat code has not been working
* since ~1998
*/
if ((flags & O_ACCMODE) == FWRITE && (stxx == 0)) {
st->flags |= ST_WRITTEN;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("SUN compatibility: write FM(s) at close\n"));
}
#endif
error = st_check_eod(st, FALSE, &nm, 0);
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("wrote %d FM(s) at close error=%d\n", nm, error));
}
/* Allow robots to eject tape if needed. */
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_ALLOW,
XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY);
}
switch (STMODE(dev)) {
case NORMAL_MODE:
st_unmount(st, NOEJECT);
break;
case NOREW_MODE:
case CTRL_MODE:
/*
* Leave mounted unless media seems to have been removed.
*
* Otherwise, if we're to terminate a tape with more than one
* filemark [ and because we're not rewinding here ], backspace
* one filemark so that later appends will see an unbroken
* sequence of:
*
* file - FMK - file - FMK ... file - FMK FMK (EOM)
*/
if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) {
st_unmount(st, NOEJECT);
} else if (error == 0) {
/*
* ST_WRITTEN was preserved from above.
*
* All we need to know here is:
*
* Were we writing this tape and was the last
* operation a write?
*
* Are there supposed to be 2FM at EOD?
*
* If both statements are true, then we backspace
* one filemark.
*/
stxx &= ~ST_FM_WRITTEN;
stxx |= (st->flags & ST_2FM_AT_EOD);
if ((flags & FWRITE) != 0 &&
(stxx == (ST_2FM_AT_EOD|ST_WRITTEN))) {
error = st_space(st, -1, SP_FILEMARKS, 0);
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("st_space(-1) error=%d\n", error));
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("no backspacing - flags = 0x%x, stxx=0x%x, st->flags=0x%x\n", flags, stxx, st->flags));
}
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("error %d from st_check_eod\n", error));
}
break;
case EJECT_MODE:
st_unmount(st, EJECT);
break;
}
KASSERTMSG((st->flags & ST_WRITTEN) == 0,
"pending ST_WRITTEN flag NOT cleared (flags=0x%x)", st->flags);
scsipi_wait_drain(periph);
scsipi_adapter_delref(adapt);
periph->periph_flags &= ~PERIPH_OPEN;
return error;
}
/*
* Start a new mount session.
* Copy in all the default parameters from the selected device mode.
* and try to guess any that seem to be defaulted.
*/
static int
st_mount_tape(dev_t dev, int flags)
{
int unit;
u_int dsty;
struct st_softc *st;
struct scsipi_periph *periph;
int error = 0;
unit = STUNIT(dev);
dsty = STDSTY(dev);
st = device_lookup_private(&st_cd, unit);
periph = st->sc_periph;
if (st->flags & ST_MOUNTED)
return 0;
SC_DEBUG(periph, SCSIPI_DB1, ("mounting\n "));
st->flags |= ST_NEW_MOUNT;
st->quirks = st->drive_quirks | st->modes[dsty].quirks;
/*
* If the media is new, then make sure we give it a chance
* to do a 'load' instruction. (We assume it is new.)
*/
if ((error = st_load(st, LD_LOAD, XS_CTL_SILENT)) != 0)
return error;
/*
* Throw another dummy instruction to catch
* 'Unit attention' errors. Many drives give
* these after doing a Load instruction (with
* the MEDIUM MAY HAVE CHANGED asc/ascq).
*/
scsipi_test_unit_ready(periph, XS_CTL_SILENT); /* XXX */
/*
* Some devices can't tell you much until they have been
* asked to look at the media. This quirk does this.
*/
if (st->quirks & ST_Q_SENSE_HELP)
if ((error = st_touch_tape(st)) != 0)
return error;
/*
* Load the physical device parameters
* loads: blkmin, blkmax
*/
if ((error = st->ops(st, ST_OPS_RBL, 0)) != 0)
return error;
/*
* Load the media dependent parameters
* includes: media_blksize, media_density, numblks
* As we have a tape in, it should be reflected here.
* If not, you may need the "quirk" above.
*/
if ((error = st->ops(st, ST_OPS_MODESENSE, 0)) != 0)
return error;
/*
* If we have gained a permanent density from somewhere,
* then use it in preference to the one supplied by
* default by the driver.
*/
if (st->modeflags[dsty] & (DENSITY_SET_BY_QUIRK | DENSITY_SET_BY_USER))
st->density = st->modes[dsty].density;
else
st->density = st->media_density;
/*
* If we have gained a permanent blocksize
* then use it in preference to the one supplied by
* default by the driver.
*/
st->flags &= ~ST_FIXEDBLOCKS;
if (st->modeflags[dsty] &
(BLKSIZE_SET_BY_QUIRK | BLKSIZE_SET_BY_USER)) {
st->blksize = st->modes[dsty].blksize;
if (st->blksize)
st->flags |= ST_FIXEDBLOCKS;
} else {
if ((error = st_decide_mode(st, FALSE)) != 0)
return error;
}
if ((error = st->ops(st, ST_OPS_MODESELECT, 0)) != 0) {
/* ATAPI will return ENODEV for this, and this may be OK */
if (error != ENODEV) {
aprint_error_dev(st->sc_dev,
"cannot set selected mode\n");
return error;
}
}
st->flags &= ~ST_NEW_MOUNT;
st->flags |= ST_MOUNTED;
periph->periph_flags |= PERIPH_MEDIA_LOADED; /* move earlier? */
st->blkno = st->fileno = (daddr_t) 0;
return 0;
}
/*
* End the present mount session.
* Rewind, and optionally eject the tape.
* Reset various flags to indicate that all new
* operations require another mount operation
*/
static void
st_unmount(struct st_softc *st, boolean eject)
{
struct scsipi_periph *periph = st->sc_periph;
int nmarks;
if ((st->flags & ST_MOUNTED) == 0)
return;
SC_DEBUG(periph, SCSIPI_DB1, ("unmounting\n"));
st_check_eod(st, FALSE, &nmarks, XS_CTL_IGNORE_NOT_READY);
st_rewind(st, 0, XS_CTL_IGNORE_NOT_READY);
/*
* Section 9.3.3 of the SCSI specs states that a device shall return
* the density value specified in the last successful MODE SELECT
* after an unload operation, in case it is not able to
* automatically determine the density of the new medium.
*
* So we instruct the device to use the default density, which will
* prevent the use of stale density values (in particular,
* in st_touch_tape()).
*/
st->density = 0;
if (st->ops(st, ST_OPS_MODESELECT, 0) != 0) {
aprint_error_dev(st->sc_dev,
"WARNING: cannot revert to default density\n");
}
if (eject) {
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_ALLOW,
XS_CTL_IGNORE_ILLEGAL_REQUEST |
XS_CTL_IGNORE_NOT_READY);
}
st_load(st, LD_UNLOAD, XS_CTL_IGNORE_NOT_READY);
st->blkno = st->fileno = (daddr_t) -1;
} else {
st->blkno = st->fileno = (daddr_t) 0;
}
st->flags &= ~(ST_MOUNTED | ST_NEW_MOUNT);
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
}
/*
* Given all we know about the device, media, mode, 'quirks' and
* initial operation, make a decision as to how we should be set
* to run (regarding blocking and EOD marks)
*/
int
st_decide_mode(struct st_softc *st, boolean first_read)
{
SC_DEBUG(st->sc_periph, SCSIPI_DB2, ("starting block mode decision\n"));
/*
* If the drive can only handle fixed-length blocks and only at
* one size, perhaps we should just do that.
*/
if (st->blkmin && (st->blkmin == st->blkmax)) {
st->flags |= ST_FIXEDBLOCKS;
st->blksize = st->blkmin;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("blkmin == blkmax of %d\n", st->blkmin));
goto done;
}
/*
* If the tape density mandates (or even suggests) use of fixed
* or variable-length blocks, comply.
*/
switch (st->density) {
case HALFINCH_800:
case HALFINCH_1600:
case HALFINCH_6250:
case DDS:
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("density specified variable\n"));
goto done;
case QIC_11:
case QIC_24:
case QIC_120:
case QIC_150:
case QIC_525:
case QIC_1320:
case QIC_3095:
case QIC_3220:
st->flags |= ST_FIXEDBLOCKS;
if (st->media_blksize > 0)
st->blksize = st->media_blksize;
else
st->blksize = DEF_FIXED_BSIZE;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("density specified fixed\n"));
goto done;
}
/*
* If we're about to read the tape, perhaps we should choose
* fixed or variable-length blocks and block size according to
* what the drive found on the tape.
*/
if (first_read &&
(!(st->quirks & ST_Q_BLKSIZE) || (st->media_blksize == 0) ||
(st->media_blksize == DEF_FIXED_BSIZE) ||
(st->media_blksize == 1024))) {
if (st->media_blksize > 0)
st->flags |= ST_FIXEDBLOCKS;
else
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = st->media_blksize;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("Used media_blksize of %d\n", st->media_blksize));
goto done;
}
/*
* We're getting no hints from any direction. Choose variable-
* length blocks arbitrarily.
*/
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("Give up and default to variable mode\n"));
done:
/*
* Decide whether or not to write two file marks to signify end-
* of-data. Make the decision as a function of density. If
* the decision is not to use a second file mark, the SCSI BLANK
* CHECK condition code will be recognized as end-of-data when
* first read.
* (I think this should be a by-product of fixed/variable..julian)
*/
switch (st->density) {
/* case 8 mm: What is the SCSI density code for 8 mm, anyway? */
case QIC_11:
case QIC_24:
case QIC_120:
case QIC_150:
case QIC_525:
case QIC_1320:
case QIC_3095:
case QIC_3220:
st->flags &= ~ST_2FM_AT_EOD;
break;
default:
st->flags |= ST_2FM_AT_EOD;
}
return 0;
}
/*
* Actually translate the requested transfer into
* one that the physical driver can understand.
* The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
ststrategy(struct buf *bp)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(bp->b_dev));
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_channel *chan = periph->periph_channel;
SC_DEBUG(periph, SCSIPI_DB1,
("ststrategy %d bytes @ blk %" PRId64 "\n", bp->b_bcount,
bp->b_blkno));
/* If it's a null transfer, return immediately */
if (bp->b_bcount == 0)
goto abort;
/* If offset is negative, error */
if (bp->b_blkno < 0) {
SC_DEBUG(periph, SCSIPI_DB3,
("EINVAL: ststrategy negative blockcount %" PRId64 "\n", bp->b_blkno));
bp->b_error = EINVAL;
goto abort;
}
/* Odd-sized requests on fixed drives are verboten */
if (st->flags & ST_FIXEDBLOCKS) {
if (bp->b_bcount % st->blksize) {
aprint_error_dev(st->sc_dev, "bad request, must be multiple of %d\n",
st->blksize);
bp->b_error = EIO;
goto abort;
}
}
/* as are out-of-range requests on variable drives. */
else if (bp->b_bcount < st->blkmin ||
(st->blkmax && bp->b_bcount > st->blkmax)) {
aprint_error_dev(st->sc_dev, "bad request, must be between %d and %d\n",
st->blkmin, st->blkmax);
bp->b_error = EIO;
goto abort;
}
mutex_enter(chan_mtx(chan));
/*
* Place it in the queue of activities for this tape
* at the end (a bit silly because we only have one user..
* (but it could fork()))
*/
bufq_put(st->buf_queue, bp);
/*
* Tell the device to get going on the transfer if it's
* not doing anything, otherwise just wait for completion
* (All a bit silly if we're only allowing 1 open but..)
*/
ststart(periph);
mutex_exit(chan_mtx(chan));
return;
abort:
/*
* Reset the residue because we didn't do anything,
* and send the buffer back as done.
*/
bp->b_resid = bp->b_bcount;
biodone(bp);
return;
}
/*
* ststart looks to see if there is a buf waiting for the device
* and that the device is not already busy. If the device is busy,
* the request is deferred and retried on the next attempt.
* If both are true, ststart creates a scsi command to perform
* the transfer required.
*
* The transfer request will call scsipi_done on completion,
* which will in turn call this routine again so that the next
* queued transfer is performed. The bufs are queued by the
* strategy routine (ststrategy)
*
* This routine is also called after other non-queued requests
* have been made of the scsi driver, to ensure that the queue
* continues to be drained.
* ststart() is called with channel lock held
*/
static int
ststart1(struct scsipi_periph *periph, struct buf *bp, int *errnop)
{
struct st_softc *st = device_private(periph->periph_dev);
struct scsipi_channel *chan = periph->periph_channel;
struct scsi_rw_tape cmd;
struct scsipi_xfer *xs;
int flags, error, complete = 1;
SC_DEBUG(periph, SCSIPI_DB2, ("ststart1 "));
mutex_enter(chan_mtx(chan));
if (periph->periph_active >= periph->periph_openings) {
error = EAGAIN;
goto out;
}
/* if a special awaits, let it proceed first */
if (periph->periph_flags & PERIPH_WAITING) {
periph->periph_flags &= ~PERIPH_WAITING;
cv_broadcast(periph_cv_periph(periph));
error = EAGAIN;
goto out;
}
/*
* If the device has been unmounted by the user
* then throw away all requests until done.
*/
if (__predict_false((st->flags & ST_MOUNTED) == 0 ||
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)) {
error = EIO;
goto out;
}
/*
* only FIXEDBLOCK devices have pending I/O or space operations.
*/
if (st->flags & ST_FIXEDBLOCKS) {
/*
* If we are at a filemark but have not reported it yet
* then we should report it now
*/
if (st->flags & ST_AT_FILEMARK) {
if ((bp->b_flags & B_READ) == B_WRITE) {
/*
* Handling of ST_AT_FILEMARK in
* st_space will fill in the right file
* mark count.
* Back up over filemark
*/
if (st_space(st, 0, SP_FILEMARKS, 0)) {
error = EIO;
goto out;
}
} else {
error = 0;
st->flags &= ~ST_AT_FILEMARK;
goto out;
}
}
}
/*
* If we are at EOM but have not reported it
* yet then we should report it now.
*/
if (st->flags & (ST_EOM_PENDING|ST_EIO_PENDING)) {
error = 0;
if (st->flags & ST_EIO_PENDING)
error = EIO;
st->flags &= ~(ST_EOM_PENDING|ST_EIO_PENDING);
goto out;
}
/* Fill out the scsi command */
memset(&cmd, 0, sizeof(cmd));
flags = XS_CTL_NOSLEEP | XS_CTL_ASYNC;
if ((bp->b_flags & B_READ) == B_WRITE) {
cmd.opcode = WRITE;
st->flags &= ~ST_FM_WRITTEN;
flags |= XS_CTL_DATA_OUT;
} else {
cmd.opcode = READ;
flags |= XS_CTL_DATA_IN;
}
/*
* Handle "fixed-block-mode" tape drives by using the
* block count instead of the length.
*/
if (st->flags & ST_FIXEDBLOCKS) {
cmd.byte2 |= SRW_FIXED;
_lto3b(bp->b_bcount / st->blksize, cmd.len);
} else
_lto3b(bp->b_bcount, cmd.len);
/* Clear 'position updated' indicator */
st->flags &= ~ST_POSUPDATED;
/* go ask the adapter to do all this for us */
xs = scsipi_make_xs_locked(periph,
(struct scsipi_generic *)&cmd, sizeof(cmd),
(u_char *)bp->b_data, bp->b_bcount,
0, ST_IO_TIME, bp, flags);
if (__predict_false(xs == NULL)) {
/*
* out of memory. Keep this buffer in the queue, and
* retry later.
*/
callout_reset(&st->sc_callout, hz / 2, strestart,
periph);
error = EAGAIN;
goto out;
}
error = scsipi_execute_xs(xs);
/* with a scsipi_xfer preallocated, scsipi_command can't fail */
KASSERT(error == 0);
if (error == 0)
complete = 0;
out:
mutex_exit(chan_mtx(chan));
*errnop = error;
return complete;
}
static void
ststart(struct scsipi_periph *periph)
{
struct st_softc *st = device_private(periph->periph_dev);
struct scsipi_channel *chan = periph->periph_channel;
struct buf *bp;
int error, complete;
SC_DEBUG(periph, SCSIPI_DB2, ("ststart "));
mutex_exit(chan_mtx(chan));
mutex_enter(&st->sc_iolock);
while ((bp = bufq_get(st->buf_defer)) != NULL
|| (bp = bufq_get(st->buf_queue)) != NULL) {
iostat_busy(st->stats);
mutex_exit(&st->sc_iolock);
complete = ststart1(periph, bp, &error);
mutex_enter(&st->sc_iolock);
if (complete) {
iostat_unbusy(st->stats, 0,
((bp->b_flags & B_READ) == B_READ));
if (error == EAGAIN) {
bufq_put(st->buf_defer, bp);
break;
}
}
mutex_exit(&st->sc_iolock);
if (complete) {
bp->b_error = error;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
mutex_enter(&st->sc_iolock);
}
mutex_exit(&st->sc_iolock);
mutex_enter(chan_mtx(chan));
}
static void
strestart(void *v)
{
struct scsipi_periph *periph = (struct scsipi_periph *)v;
struct scsipi_channel *chan = periph->periph_channel;
mutex_enter(chan_mtx(chan));
ststart((struct scsipi_periph *)v);
mutex_exit(chan_mtx(chan));
}
static void
stdone(struct scsipi_xfer *xs, int error)
{
struct st_softc *st = device_private(xs->xs_periph->periph_dev);
struct buf *bp = xs->bp;
if (bp) {
bp->b_error = error;
bp->b_resid = xs->resid;
/*
* Buggy device? An SDLT320 can report an info
* field of 0x3de8000 on a Media Error/Write Error
* for this CDB: 0x0a 00 00 80 00 00
*/
if (bp->b_resid > bp->b_bcount || bp->b_resid < 0)
bp->b_resid = bp->b_bcount;
mutex_enter(&st->sc_iolock);
if ((bp->b_flags & B_READ) == B_WRITE)
st->flags |= ST_WRITTEN;
else
st->flags &= ~ST_WRITTEN;
iostat_unbusy(st->stats, bp->b_bcount,
((bp->b_flags & B_READ) == B_READ));
if ((st->flags & ST_POSUPDATED) == 0) {
if (error) {
st->fileno = st->blkno = -1;
} else if (st->blkno != -1) {
if (st->flags & ST_FIXEDBLOCKS)
st->blkno +=
(bp->b_bcount / st->blksize);
else
st->blkno++;
}
}
mutex_exit(&st->sc_iolock);
rnd_add_uint32(&st->rnd_source, bp->b_blkno);
biodone(bp);
}
}
static int
stread(dev_t dev, struct uio *uio, int iomode)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
int r = physio(ststrategy, NULL, dev, B_READ,
st->sc_periph->periph_channel->chan_adapter->adapt_minphys, uio);
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[stread: result=%d]\n", r));
return r;
}
static int
stwrite(dev_t dev, struct uio *uio, int iomode)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
int r = physio(ststrategy, NULL, dev, B_WRITE,
st->sc_periph->periph_channel->chan_adapter->adapt_minphys, uio);
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[stwrite: result=%d]\n", r));
return r;
}
/*
* Perform special action on behalf of the user;
* knows about the internals of this device
*/
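/*
* Illustrative example (not from the original source): userland normally
* gets here through mt(1) or a direct ioctl such as
*
*	struct mtop op = { .mt_op = MTREW, .mt_count = 1 };
*	ioctl(fd, MTIOCTOP, &op);
*
* which lands in the MTIOCTOP/MTREW case below.
*/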
static int
stioctl(dev_t dev, u_long cmd, void *arg, int flag, struct lwp *l)
{
int error = 0;
int unit;
int number, nmarks, dsty;
int flags;
struct st_softc *st;
int hold_blksize;
uint8_t hold_density;
struct mtop *mt = (struct mtop *) arg;
/* Find the device that the user is talking about */
flags = 0; /* give error messages, act on errors etc. */
unit = STUNIT(dev);
dsty = STDSTY(dev);
st = device_lookup_private(&st_cd, unit);
hold_blksize = st->blksize;
hold_density = st->density;
switch ((u_int)cmd) {
case MTIOCGET: {
struct mtget *g = (struct mtget *) arg;
/*
* (to get the current state of READONLY)
*/
error = st->ops(st, ST_OPS_MODESENSE, XS_CTL_SILENT);
if (error) {
/*
* Ignore the error if in control mode;
* this is mandated by st(4).
*/
if (STMODE(dev) != CTRL_MODE)
break;
error = 0;
}
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[ioctl: get status]\n"));
memset(g, 0, sizeof(struct mtget));
g->mt_type = MT_ISAR; /* Ultrix compat *//*? */
g->mt_blksiz = st->blksize;
g->mt_density = st->density;
g->mt_mblksiz[0] = st->modes[0].blksize;
g->mt_mblksiz[1] = st->modes[1].blksize;
g->mt_mblksiz[2] = st->modes[2].blksize;
g->mt_mblksiz[3] = st->modes[3].blksize;
g->mt_mdensity[0] = st->modes[0].density;
g->mt_mdensity[1] = st->modes[1].density;
g->mt_mdensity[2] = st->modes[2].density;
g->mt_mdensity[3] = st->modes[3].density;
g->mt_fileno = st->fileno;
g->mt_blkno = st->blkno;
if (st->flags & ST_READONLY)
g->mt_dsreg |= MT_DS_RDONLY;
if (st->flags & ST_MOUNTED)
g->mt_dsreg |= MT_DS_MOUNTED;
g->mt_resid = st->mt_resid;
g->mt_erreg = st->mt_erreg;
/*
* clear latched errors.
*/
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
break;
}
case MTIOCTOP: {
SC_DEBUG(st->sc_periph, SCSIPI_DB1,
("[ioctl: op=0x%x count=0x%x]\n", mt->mt_op,
mt->mt_count));
/* compat: in U*x it is a short */
number = mt->mt_count;
switch ((short) (mt->mt_op)) {
case MTWEOF: /* write an end-of-file record */
error = st_write_filemarks(st, number, flags);
break;
case MTBSF: /* backward space file */
number = -number;
/* FALLTHROUGH */
case MTFSF: /* forward space file */
error = st_check_eod(st, FALSE, &nmarks, flags);
if (!error)
error = st_space(st, number - nmarks,
SP_FILEMARKS, flags);
break;
case MTBSR: /* backward space record */
number = -number;
/* FALLTHROUGH */
case MTFSR: /* forward space record */
error = st_check_eod(st, true, &nmarks, flags);
if (!error)
error = st_space(st, number, SP_BLKS, flags);
break;
case MTREW: /* rewind */
error = st_rewind(st, 0, flags);
break;
case MTOFFL: /* rewind and put the drive offline */
st_unmount(st, EJECT);
break;
case MTNOP: /* no operation, sets status only */
break;
case MTRETEN: /* retension the tape */
error = st_load(st, LD_RETENSION, flags);
if (!error)
error = st_load(st, LD_LOAD, flags);
break;
case MTEOM: /* forward space to end of media */
error = st_check_eod(st, FALSE, &nmarks, flags);
if (!error)
error = st_space(st, 1, SP_EOM, flags);
break;
case MTCACHE: /* enable controller cache */
st->flags &= ~ST_DONTBUFFER;
goto try_new_value;
case MTNOCACHE: /* disable controller cache */
st->flags |= ST_DONTBUFFER;
goto try_new_value;
case MTERASE: /* erase volume */
error = st_erase(st, number, flags);
break;
case MTSETBSIZ: /* Set block size for device */
#ifdef NOTYET
if (!(st->flags & ST_NEW_MOUNT)) {
uprintf("re-mount tape before changing "
"blocksize");
error = EINVAL;
break;
}
#endif
if (number == 0)
st->flags &= ~ST_FIXEDBLOCKS;
else {
if ((st->blkmin || st->blkmax) &&
(number < st->blkmin ||
number > st->blkmax)) {
error = EINVAL;
break;
}
st->flags |= ST_FIXEDBLOCKS;
}
st->blksize = number;
st->flags |= ST_BLOCK_SET; /*XXX */
goto try_new_value;
case MTSETDNSTY: /* Set density for device and mode */
/*
* Any number >= 0 and <= 0xff is legal. Numbers
* above 0x80 are 'vendor unique'.
*/
if (number < 0 || number > 255) {
error = EINVAL;
break;
} else
st->density = number;
goto try_new_value;
case MTCMPRESS:
error = st->ops(st, (number == 0) ?
ST_OPS_CMPRSS_OFF : ST_OPS_CMPRSS_ON,
XS_CTL_SILENT);
break;
case MTEWARN:
if (number)
st->flags |= ST_EARLYWARN;
else
st->flags &= ~ST_EARLYWARN;
break;
default:
error = EINVAL;
}
break;
}
case MTIOCIEOT:
case MTIOCEEOT:
break;
case MTIOCRDSPOS:
error = st_rdpos(st, 0, (uint32_t *)arg);
break;
case MTIOCRDHPOS:
error = st_rdpos(st, 1, (uint32_t *)arg);
break;
case MTIOCSLOCATE:
error = st_setpos(st, 0, (uint32_t *)arg);
break;
case MTIOCHLOCATE:
error = st_setpos(st, 1, (uint32_t *)arg);
break;
default:
error = scsipi_do_ioctl(st->sc_periph, dev, cmd, arg, flag, l);
break;
}
return error;
try_new_value:
/*
* Check that the mode being asked for is agreeable to the
* drive. If not, put it back the way it was.
*
* If in control mode, we can make (persistent) mode changes
* even if no medium is loaded (see st(4)).
*/
if ((STMODE(dev) != CTRL_MODE || (st->flags & ST_MOUNTED) != 0) &&
(error = st->ops(st, ST_OPS_MODESELECT, 0)) != 0) {
/* put it back as it was */
aprint_error_dev(st->sc_dev, "cannot set selected mode\n");
st->density = hold_density;
st->blksize = hold_blksize;
if (st->blksize)
st->flags |= ST_FIXEDBLOCKS;
else
st->flags &= ~ST_FIXEDBLOCKS;
return error;
}
/*
* As the drive liked it, if we are setting a new default,
* set it into the structures as such.
*
* The means for deciding this are not finalised yet, but
* if the device was opened in Control Mode, the values
* are persistent now across mounts.
*/
if (STMODE(dev) == CTRL_MODE) {
switch ((short) (mt->mt_op)) {
case MTSETBSIZ:
st->modes[dsty].blksize = st->blksize;
st->modeflags[dsty] |= BLKSIZE_SET_BY_USER;
break;
case MTSETDNSTY:
st->modes[dsty].density = st->density;
st->modeflags[dsty] |= DENSITY_SET_BY_USER;
break;
}
}
return 0;
}
/* Do a synchronous read. */
static int
st_read(struct st_softc *st, char *bf, int size, int flags)
{
struct scsi_rw_tape cmd;
/* If it's a null transfer, return immediately */
if (size == 0)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = READ;
if (st->flags & ST_FIXEDBLOCKS) {
cmd.byte2 |= SRW_FIXED;
_lto3b(size / (st->blksize ? st->blksize : DEF_FIXED_BSIZE),
cmd.len);
} else
_lto3b(size, cmd.len);
return scsipi_command(st->sc_periph,
(void *)&cmd, sizeof(cmd), (void *)bf, size, 0, ST_IO_TIME, NULL,
flags | XS_CTL_DATA_IN);
}
/* issue an erase command */
static int
st_erase(struct st_softc *st, int full, int flags)
{
int tmo;
struct scsi_erase cmd;
/*
* Full erase means set LONG bit in erase command, which asks
* the drive to erase the entire unit. Without this bit, we're
* asking the drive to write an erase gap.
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ERASE;
if (full) {
cmd.byte2 = SE_LONG;
tmo = ST_SPC_TIME;
} else
tmo = ST_IO_TIME;
/*
* XXX We always do this asynchronously, for now, unless the device
* has the ST_Q_ERASE_NOIMM quirk. How long should we wait if we
* want to (eventually) do it synchronously?
*/
if ((st->quirks & ST_Q_ERASE_NOIMM) == 0)
cmd.byte2 |= SE_IMMED;
return scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, tmo, NULL, flags);
}
/* skip N blocks/filemarks/seq filemarks/eom */
static int
st_space(struct st_softc *st, int number, u_int what, int flags)
{
struct scsi_space cmd;
int error;
switch (what) {
case SP_BLKS:
if (st->flags & ST_PER_ACTION) {
if (number > 0) {
st->flags &= ~ST_PER_ACTION;
return EIO;
} else if (number < 0) {
if (st->flags & ST_AT_FILEMARK) {
/*
* Handling of ST_AT_FILEMARK
* in st_space will fill in the
* right file mark count.
*/
error = st_space(st, 0, SP_FILEMARKS,
flags);
if (error)
return error;
}
if (st->flags & ST_BLANK_READ) {
st->flags &= ~ST_BLANK_READ;
return EIO;
}
st->flags &= ~(ST_EIO_PENDING|ST_EOM_PENDING);
}
}
break;
case SP_FILEMARKS:
if (st->flags & ST_EIO_PENDING) {
if (number > 0) {
/* pretend we just discovered the error */
st->flags &= ~ST_EIO_PENDING;
return EIO;
} else if (number < 0) {
/* back away from the error */
st->flags &= ~ST_EIO_PENDING;
}
}
if (st->flags & ST_AT_FILEMARK) {
st->flags &= ~ST_AT_FILEMARK;
number--;
}
if ((st->flags & ST_BLANK_READ) && (number < 0)) {
/* back away from unwritten tape */
st->flags &= ~ST_BLANK_READ;
number++; /* XXX dubious */
}
break;
case SP_EOM:
if (st->flags & ST_EOM_PENDING) {
/* we're already there */
st->flags &= ~ST_EOM_PENDING;
return 0;
}
if (st->flags & ST_EIO_PENDING) {
/* pretend we just discovered the error */
st->flags &= ~ST_EIO_PENDING;
return EIO;
}
if (st->flags & ST_AT_FILEMARK)
st->flags &= ~ST_AT_FILEMARK;
break;
}
if (number == 0)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SPACE;
cmd.byte2 = what;
_lto3b(number, cmd.number);
st->flags &= ~ST_POSUPDATED;
st->last_ctl_resid = 0;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
0, ST_SPC_TIME, NULL, flags);
if (error == 0 && (st->flags & ST_POSUPDATED) == 0) {
number = number - st->last_ctl_resid;
if (what == SP_BLKS) {
if (st->blkno != -1)
st->blkno += number;
} else if (what == SP_FILEMARKS) {
if (st->fileno != -1) {
st->fileno += number;
if (number > 0)
st->blkno = 0;
else if (number < 0)
st->blkno = -1;
}
} else if (what == SP_EOM) {
st_updatefilepos(st);
}
}
return error;
}
/*
* write N filemarks
*/
static int
st_write_filemarks(struct st_softc *st, int number, int flags)
{
int error;
struct scsi_write_filemarks cmd;
/*
* It's hard to write a negative number of file marks.
* Don't try.
*/
if (number < 0) {
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("EINVAL: st_write_filemarks not writing %d file marks\n", number));
return EINVAL;
}
switch (number) {
case 0: /* really a command to sync the drive's buffers */
break;
case 1:
if (st->flags & ST_FM_WRITTEN) /* already have one down */
st->flags &= ~ST_WRITTEN;
else
st->flags |= ST_FM_WRITTEN;
st->flags &= ~ST_PER_ACTION;
break;
default:
st->flags &= ~(ST_PER_ACTION | ST_WRITTEN);
}
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = WRITE_FILEMARKS;
if (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(st->sc_periph)) ==
SCSIPI_BUSTYPE_ATAPI)
cmd.byte2 = SR_IMMED;
/*
* The ATAPI Onstream DI-30 doesn't support writing filemarks, but
* WRITE_FILEMARKS is still used to flush the buffer
*/
if ((st->quirks & ST_Q_NOFILEMARKS) == 0)
_lto3b(number, cmd.number);
/* XXX WE NEED TO BE ABLE TO GET A RESIDUAL XXX */
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
0, ST_IO_TIME * 4, NULL, flags);
if (error == 0 && st->fileno != -1)
st->fileno += number;
return error;
}
/*
* Make sure the right number of file marks is on tape if the
* tape has been written. If the position argument is true,
* leave the tape positioned where it was originally.
*
* nmarks returns the number of marks to skip (or, if position is
* true, which were skipped) to get back to the original position.
*/
static int
st_check_eod(struct st_softc *st, boolean position, int *nmarks, int flags)
{
int error;
switch (st->flags & (ST_WRITTEN | ST_FM_WRITTEN | ST_2FM_AT_EOD)) {
default:
*nmarks = 0;
return 0;
case ST_WRITTEN:
case ST_WRITTEN | ST_FM_WRITTEN | ST_2FM_AT_EOD:
*nmarks = 1;
break;
case ST_WRITTEN | ST_2FM_AT_EOD:
*nmarks = 2;
}
error = st_write_filemarks(st, *nmarks, flags);
if (position && !error)
error = st_space(st, -*nmarks, SP_FILEMARKS, flags);
return error;
}
/* load/unload/retension */
static int
st_load(struct st_softc *st, u_int type, int flags)
{
int error;
struct scsi_load cmd;
if (type != LD_LOAD) {
int nmarks;
error = st_check_eod(st, FALSE, &nmarks, flags);
if (error) {
aprint_error_dev(st->sc_dev,
"failed to write closing filemarks at "
"unload, errno=%d\n", error);
return error;
}
}
if (st->quirks & ST_Q_IGNORE_LOADS) {
if (type == LD_LOAD)
/*
* If we ignore loads, at least we should try a rewind.
*/
return st_rewind(st, 0, flags);
/* otherwise, we should do what's asked of us */
}
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = LOAD;
if (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(st->sc_periph)) ==
SCSIPI_BUSTYPE_ATAPI)
cmd.byte2 = SR_IMMED;
cmd.how = type;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, ST_SPC_TIME, NULL, flags);
if (error) {
aprint_error_dev(st->sc_dev, "error %d in st_load (op %d)\n",
error, type);
}
return error;
}
/* Rewind the device */
static int
st_rewind(struct st_softc *st, u_int immediate, int flags)
{
struct scsi_rewind cmd;
int error;
int nmarks;
int timeout;
error = st_check_eod(st, FALSE, &nmarks, flags);
if (error) {
aprint_error_dev(st->sc_dev,
"failed to write closing filemarks at "
"rewind, errno=%d\n", error);
return error;
}
st->flags &= ~ST_PER_ACTION;
/* If requestor asked for immediate response, set a short timeout */
timeout = immediate ? ST_CTL_TIME : ST_SPC_TIME;
/* ATAPI tapes always need immediate to be set */
if (scsipi_periph_bustype(st->sc_periph) == SCSIPI_BUSTYPE_ATAPI)
immediate = SR_IMMED;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = REWIND;
cmd.byte2 = immediate;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, timeout, NULL, flags);
if (error) {
aprint_error_dev(st->sc_dev, "error %d trying to rewind\n",
error);
/* lost position */
st->fileno = st->blkno = -1;
} else
st->fileno = st->blkno = 0;
return error;
}
static void
st_updatefilepos(struct st_softc *st)
{
int error;
uint8_t posdata[32];
struct scsi_tape_read_position cmd;
memset(&cmd, 0, sizeof(cmd));
memset(&posdata, 0, sizeof(posdata));
cmd.opcode = READ_POSITION;
cmd.byte1 = 6; /* service action: LONG FORM */
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd),
(void *)&posdata, sizeof(posdata), ST_RETRIES, ST_CTL_TIME, NULL,
XS_CTL_SILENT | XS_CTL_DATA_IN);
if (error == 0) {
#ifdef SCSIPI_DEBUG
if (st->sc_periph->periph_dbflags & SCSIPI_DB3) {
int hard;
printf("posdata: ");
for (hard = 0; hard < sizeof(posdata); hard++)
printf("%02x ", posdata[hard] & 0xff);
printf("\n");
}
#endif
if (posdata[0] & 0xC) { /* Block|Mark Position Unknown */
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos block/mark position unknown (0x%02x)\n",
posdata[0]));
} else {
st->fileno = _8btol(&posdata[16]);
st->blkno = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos file position %"PRId64"\n",
st->fileno));
return;
}
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos READ POSITION(LONG_FORM) failed (error=%d)\n",
error));
}
st->fileno = -1;
st->blkno = -1;
}
static int
st_rdpos(struct st_softc *st, int hard, uint32_t *blkptr)
{
int error;
uint8_t posdata[20];
struct scsi_tape_read_position cmd;
/*
* We try and flush any buffered writes here if we were writing
* and we're trying to get hardware block position. It eats
* up performance substantially, but I'm wary of drive firmware.
*
* I think that *logical* block position is probably okay,
* but hardware block position might have to wait for data
* to hit media to be valid. Caveat Emptor.
*/
if (hard && (st->flags & ST_WRITTEN)) {
/* First flush any pending writes... */
error = st_write_filemarks(st, 0, XS_CTL_SILENT);
/*
* The latter case is for 'write protected' tapes
* which are too stupid to recognize a zero count
* for writing filemarks as a no-op.
*/
if (error != 0 && error != EACCES && error != EROFS)
return error;
}
memset(&cmd, 0, sizeof(cmd));
memset(&posdata, 0, sizeof(posdata));
cmd.opcode = READ_POSITION;
if (hard)
cmd.byte1 = 1;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd),
(void *)&posdata, sizeof(posdata), ST_RETRIES, ST_CTL_TIME, NULL,
XS_CTL_SILENT | XS_CTL_DATA_IN);
if (error == 0) {
#if 0
printf("posdata:");
for (hard = 0; hard < sizeof(posdata); hard++)
printf("%02x ", posdata[hard] & 0xff);
printf("\n");
#endif
if (posdata[0] & 0x4) { /* Block Position Unknown */
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("EINVAL: strdpos block position unknown\n"));
error = EINVAL;
}
else
*blkptr = _4btol(&posdata[4]);
}
return error;
}
static int
st_setpos(struct st_softc *st, int hard, uint32_t *blkptr)
{
int error;
struct scsi_tape_locate cmd;
/*
* We used to try and flush any buffered writes here.
* Now we push this onto user applications to either
* flush the pending writes themselves (via a zero count
* WRITE FILEMARKS command) or they can trust their tape
* drive to do this correctly for them.
*
* There are very ugly performance limitations otherwise.
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = LOCATE;
if (hard)
cmd.byte2 = 1 << 2;
_lto4b(*blkptr, cmd.blkaddr);
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, ST_SPC_TIME, NULL, 0);
/*
* Note file && block number position now unknown (if
* these things ever start being maintained in this driver)
*/
st->fileno = st->blkno = -1;
return error;
}
/*
* Look at the returned sense, act on the error and determine
* the unix error number to pass back: 0 (report no error),
* ERESTART (retry the operation), EJUSTRETURN (continue with
* generic sense processing), or an errno value.
*/
static int
st_interpret_sense(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsi_sense_data *sense = &xs->sense.scsi_sense;
struct buf *bp = xs->bp;
struct st_softc *st = device_private(periph->periph_dev);
int retval = EJUSTRETURN;
int doprint = ((xs->xs_control & XS_CTL_SILENT) == 0);
uint8_t key;
int32_t info;
/*
* If it isn't an extended or extended/deferred error, let
* the generic code handle it.
*/
if (SSD_RCODE(sense->response_code) != SSD_RCODE_CURRENT &&
SSD_RCODE(sense->response_code) != SSD_RCODE_DEFERRED)
return retval;
if (sense->response_code & SSD_RCODE_VALID)
info = _4btol(sense->info);
else
info = (st->flags & ST_FIXEDBLOCKS) ?
xs->datalen / st->blksize : xs->datalen;
key = SSD_SENSE_KEY(sense->flags);
st->mt_erreg = key;
st->asc = sense->asc;
st->ascq = sense->ascq;
st->mt_resid = (short) info;
if (key == SKEY_NOT_READY && st->asc == 0x4 && st->ascq == 0x1) {
/* Not Ready, Logical Unit Is in Process Of Becoming Ready */
if (!callout_pending(&periph->periph_callout))
scsipi_periph_freeze(periph, 1);
callout_reset(&periph->periph_callout,
hz, scsipi_periph_timed_thaw, periph);
return ERESTART;
}
/* If the device is not open yet, let generic handle */
if ((periph->periph_flags & PERIPH_OPEN) == 0)
return retval;
xs->resid = info;
if (st->flags & ST_FIXEDBLOCKS) {
if (bp) {
xs->resid *= st->blksize;
st->last_io_resid = xs->resid;
} else
st->last_ctl_resid = xs->resid;
if (key == SKEY_VOLUME_OVERFLOW) {
st->flags |= ST_EIO_PENDING;
if (bp)
bp->b_resid = xs->resid;
} else if (sense->flags & SSD_EOM) {
if ((st->flags & ST_EARLYWARN) == 0)
st->flags |= ST_EIO_PENDING;
st->flags |= ST_EOM_PENDING;
if (bp) {
#if 0
bp->b_resid = xs->resid;
#else
/*
* Grotesque as it seems, the few times
* I've actually seen a non-zero resid,
* the tape drive actually lied and had
* written all the data!
*/
bp->b_resid = 0;
#endif
}
}
if (sense->flags & SSD_FILEMARK) {
st->flags |= ST_AT_FILEMARK;
if (bp)
bp->b_resid = xs->resid;
if (st->fileno != (daddr_t) -1) {
st->fileno++;
st->blkno = 0;
st->flags |= ST_POSUPDATED;
}
}
if (sense->flags & SSD_ILI) {
st->flags |= ST_EIO_PENDING;
if (bp)
bp->b_resid = xs->resid;
if (sense->response_code & SSD_RCODE_VALID &&
(xs->xs_control & XS_CTL_SILENT) == 0)
aprint_error_dev(st->sc_dev,
"block wrong size, %d blocks residual\n",
info);
/*
* This quirk code helps the drive read
* the first tape block, regardless of
* format. That is required for these
* drives to return proper MODE SENSE
* information.
*/
if ((st->quirks & ST_Q_SENSE_HELP) &&
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)
st->blksize -= 512;
else if ((st->flags & ST_POSUPDATED) == 0) {
if (st->blkno != (daddr_t) -1) {
st->blkno +=
(xs->datalen / st->blksize);
st->flags |= ST_POSUPDATED;
}
}
}
/*
* If data wanted and no data was transferred, do it immediately
*/
if (xs->datalen && xs->resid >= xs->datalen) {
if (st->flags & ST_EIO_PENDING)
return EIO;
if (st->flags & ST_AT_FILEMARK) {
if (bp)
bp->b_resid = xs->resid;
return 0;
}
}
} else { /* must be variable mode */
if (bp)
st->last_io_resid = xs->resid;
else
st->last_ctl_resid = xs->resid;
if (sense->flags & SSD_EOM) {
/*
* The current semantics of this
* driver requires EOM detection
* to return EIO unless early
* warning detection is enabled
* for variable mode (this is always
* on for fixed block mode).
*/
if (st->flags & ST_EARLYWARN) {
st->flags |= ST_EOM_PENDING;
retval = 0;
} else {
retval = EIO;
/*
* If we return an error we can't claim to
* have transferred all data.
*/
if (xs->resid == 0)
xs->resid = xs->datalen;
}
/*
* If it's an unadorned EOM detection,
* suppress printing an error.
*/
if (key == SKEY_NO_SENSE) {
doprint = 0;
}
} else if (sense->flags & SSD_FILEMARK) {
retval = 0;
if (st->fileno != (daddr_t) -1) {
st->fileno++;
st->blkno = 0;
st->flags |= ST_POSUPDATED;
}
} else if (sense->flags & SSD_ILI) {
if (info < 0) {
/*
* The tape record was bigger than the read
* we issued.
*/
if ((xs->xs_control & XS_CTL_SILENT) == 0) {
aprint_error_dev(st->sc_dev,
"%d-byte tape record too big"
" for %d-byte user buffer\n",
xs->datalen - info, xs->datalen);
}
retval = EIO;
} else {
retval = 0;
if (st->blkno != (daddr_t) -1) {
st->blkno++;
st->flags |= ST_POSUPDATED;
}
}
}
if (bp)
bp->b_resid = xs->resid;
}
#ifndef SCSIPI_DEBUG
if (retval == 0 && key == SKEY_NO_SENSE)
doprint = 0;
#endif
if (key == SKEY_BLANK_CHECK) {
/*
* This quirk code helps the drive read the
* first tape block, regardless of format. That
* is required for these drives to return proper
* MODE SENSE information.
*/
if ((st->quirks & ST_Q_SENSE_HELP) &&
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) {
/* still starting */
st->blksize -= 512;
} else if (!(st->flags & (ST_2FM_AT_EOD | ST_BLANK_READ))) {
st->flags |= ST_BLANK_READ;
xs->resid = xs->datalen;
if (bp) {
bp->b_resid = xs->resid;
/* return an EOF */
}
retval = 0;
/* lost position */
st->fileno = st->blkno = -1;
}
}
/*
* If generic sense processing will continue, we should not
* print sense info here.
*/
if (retval == EJUSTRETURN)
doprint = 0;
if (doprint) {
/* Print verbose sense info if possible */
if (scsipi_print_sense(xs, 0) != 0)
return retval;
/* Print less-verbose sense info */
scsipi_printaddr(periph);
printf("Sense Key 0x%02x", key);
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
switch (key) {
case SKEY_NOT_READY:
case SKEY_ILLEGAL_REQUEST:
case SKEY_UNIT_ATTENTION:
case SKEY_DATA_PROTECT:
break;
case SKEY_VOLUME_OVERFLOW:
case SKEY_BLANK_CHECK:
printf(", requested size: %d (decimal)", info);
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries)
printf(", retrying");
printf(", cmd 0x%x, info 0x%x",
xs->cmd->opcode, info);
break;
default:
printf(", info = %d (decimal)", info);
}
}
if (sense->extra_len != 0) {
int n;
printf(", data =");
for (n = 0; n < sense->extra_len; n++)
printf(" %02x", sense->csi[n]);
}
printf("\n");
}
return retval;
}
/*
* The quirk here is that the drive returns some value to st_mode_sense
* incorrectly until the tape has actually passed by the head.
*
* The method is to set the drive to large fixed-block state (user-specified
* density and 1024-byte blocks), then read and rewind to get it to sense the
* tape. If that doesn't work, try 512-byte fixed blocks. If that doesn't
* work, as a last resort, try variable-length blocks. The result will be
* the ability to do an accurate st_mode_sense.
*
* We know we can do a rewind because we just did a load, which implies rewind.
* Rewind seems preferable to space backward if we have a virgin tape.
*
* The rest of the code for this quirk is in ILI processing and BLANK CHECK
* error processing, both part of st_interpret_sense.
*/
static int
st_touch_tape(struct st_softc *st)
{
char *bf;
int readsize;
int error;
bf = malloc(1024, M_TEMP, M_WAITOK);
if ((error = st->ops(st, ST_OPS_MODESENSE, 0)) != 0)
goto bad;
/*
* If the block size is already known from the
* sense data, use it. Else start probing at 1024.
*/
if (st->media_blksize > 0)
st->blksize = st->media_blksize;
else
st->blksize = 1024;
do {
switch (st->blksize) {
case 512:
case 1024:
readsize = st->blksize;
st->flags |= ST_FIXEDBLOCKS;
break;
default:
readsize = 1;
st->flags &= ~ST_FIXEDBLOCKS;
}
if ((error = st->ops(st, ST_OPS_MODESELECT, XS_CTL_SILENT))
!= 0) {
/*
* The device did not agree with the proposed
* block size. If we exhausted our options,
* return failure, else try another.
*/
if (readsize == 1)
goto bad;
st->blksize -= 512;
continue;
}
st_read(st, bf, readsize, XS_CTL_SILENT); /* XXX */
if ((error = st_rewind(st, 0, 0)) != 0) {
bad:
free(bf, M_TEMP);
return error;
}
} while (readsize != 1 && readsize > st->blksize);
free(bf, M_TEMP);
return 0;
}
static int
stdump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
/* Not implemented. */
return ENXIO;
}
/*
* Send a filled out parameter structure to the drive to
* set it into the desired modes, etc.
*/
int
st_mode_select(struct st_softc *st, int flags)
{
u_int select_len;
struct select {
struct scsi_mode_parameter_header_6 header;
struct scsi_general_block_descriptor blk_desc;
u_char sense_data[MAX_PAGE_0_SIZE];
} select;
struct scsipi_periph *periph = st->sc_periph;
select_len = sizeof(select.header) + sizeof(select.blk_desc) +
st->page_0_size;
/*
* This quirk deals with drives that have only one valid mode
* and think this gives them license to reject all mode selects,
* even if the selected mode is the one that is supported.
*/
if (st->quirks & ST_Q_UNIMODAL) {
SC_DEBUG(periph, SCSIPI_DB3,
("not setting density 0x%x blksize 0x%x\n",
st->density, st->blksize));
return 0;
}
/* Set up for a mode select */
memset(&select, 0, sizeof(select));
select.header.blk_desc_len = sizeof(struct
scsi_general_block_descriptor);
select.header.dev_spec &= ~SMH_DSP_BUFF_MODE;
select.blk_desc.density = st->density;
if (st->flags & ST_DONTBUFFER)
select.header.dev_spec |= SMH_DSP_BUFF_MODE_OFF;
else
select.header.dev_spec |= SMH_DSP_BUFF_MODE_ON;
if (st->flags & ST_FIXEDBLOCKS)
_lto3b(st->blksize, select.blk_desc.blklen);
if (st->page_0_size)
memcpy(select.sense_data, st->sense_data, st->page_0_size);
/* do the command */
return scsipi_mode_select(periph, 0, &select.header, select_len,
flags, ST_RETRIES, ST_CTL_TIME);
}
/* $NetBSD: exec_subr.c,v 1.88 2023/11/21 14:35:36 riastradh Exp $ */
/*
* Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_subr.c,v 1.88 2023/11/21 14:35:36 riastradh Exp $");
#include "opt_pax.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/filedesc.h>
#include <sys/exec.h>
#include <sys/mman.h>
#include <sys/resourcevar.h>
#include <sys/device.h>
#include <sys/pax.h>
#include <uvm/uvm_extern.h>
#define VMCMD_EVCNT_DECL(name) \
static struct evcnt vmcmd_ev_##name = \
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "vmcmd", #name); \
EVCNT_ATTACH_STATIC(vmcmd_ev_##name)
#define VMCMD_EVCNT_INCR(name) \
vmcmd_ev_##name.ev_count++
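/*
 * The event counters declared below are attached statically and can be
 * inspected at run time (e.g. with vmstat -e).
 */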
VMCMD_EVCNT_DECL(calls);
VMCMD_EVCNT_DECL(extends);
VMCMD_EVCNT_DECL(kills);
#ifdef DEBUG_STACK
#define DPRINTF(a) uprintf a
#else
#define DPRINTF(a)
#endif
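/*
 * Sizes, in bytes, of the inaccessible guard regions placed beyond the
 * main process stack and beyond per-thread stacks, respectively.
 */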
unsigned int user_stack_guard_size = 1024 * 1024;
unsigned int user_thread_stack_guard_size = 64 * 1024;
/*
* new_vmcmd():
* create a new vmcmd structure and fill in its fields based
* on function call arguments. make sure objects ref'd by
* the vmcmd are 'held'.
*/
void
new_vmcmd(struct exec_vmcmd_set *evsp,
int (*proc)(struct lwp * l, struct exec_vmcmd *),
vsize_t len, vaddr_t addr, struct vnode *vp, u_long offset,
u_int prot, int flags)
{
struct exec_vmcmd *vcp;
VMCMD_EVCNT_INCR(calls);
KASSERT(proc != vmcmd_map_pagedvn || (vp->v_iflag & VI_TEXT));
KASSERT(vp == NULL || vrefcnt(vp) > 0);
if (evsp->evs_used >= evsp->evs_cnt)
vmcmdset_extend(evsp);
vcp = &evsp->evs_cmds[evsp->evs_used++];
vcp->ev_proc = proc;
vcp->ev_len = len;
vcp->ev_addr = addr;
if ((vcp->ev_vp = vp) != NULL)
vref(vp);
vcp->ev_offset = offset;
vcp->ev_prot = prot;
vcp->ev_flags = flags;
}
void
vmcmdset_extend(struct exec_vmcmd_set *evsp)
{
struct exec_vmcmd *nvcp;
u_int ocnt;
#ifdef DIAGNOSTIC
if (evsp->evs_used < evsp->evs_cnt)
panic("vmcmdset_extend: not necessary");
#endif
/* figure out number of entries in new set */
if ((ocnt = evsp->evs_cnt) != 0) {
evsp->evs_cnt += ocnt;
VMCMD_EVCNT_INCR(extends);
} else
evsp->evs_cnt = EXEC_DEFAULT_VMCMD_SETSIZE;
/* allocate it */
nvcp = kmem_alloc(evsp->evs_cnt * sizeof(struct exec_vmcmd), KM_SLEEP);
/* free the old struct, if there was one, and record the new one */
if (ocnt) {
memcpy(nvcp, evsp->evs_cmds,
(ocnt * sizeof(struct exec_vmcmd)));
kmem_free(evsp->evs_cmds, ocnt * sizeof(struct exec_vmcmd));
}
evsp->evs_cmds = nvcp;
}
void
kill_vmcmds(struct exec_vmcmd_set *evsp)
{
struct exec_vmcmd *vcp;
u_int i;
VMCMD_EVCNT_INCR(kills);
if (evsp->evs_cnt == 0)
return;
for (i = 0; i < evsp->evs_used; i++) {
vcp = &evsp->evs_cmds[i];
if (vcp->ev_vp != NULL)
vrele(vcp->ev_vp);
}
kmem_free(evsp->evs_cmds, evsp->evs_cnt * sizeof(struct exec_vmcmd));
evsp->evs_used = evsp->evs_cnt = 0;
}
/*
* vmcmd_map_pagedvn():
* handle vmcmd which specifies that a vnode should be mmap'd.
* appropriate for handling demand-paged text and data segments.
*/
static int
vmcmd_get_prot(struct lwp *l, const struct exec_vmcmd *cmd, vm_prot_t *prot,
vm_prot_t *maxprot)
{
vm_prot_t extraprot = PROT_MPROTECT_EXTRACT(cmd->ev_prot);
*prot = cmd->ev_prot & UVM_PROT_ALL;
*maxprot = PAX_MPROTECT_MAXPROTECT(l, *prot, extraprot, UVM_PROT_ALL);
if ((*prot & *maxprot) != *prot)
return EACCES;
return PAX_MPROTECT_VALIDATE(l, *prot);
}
int
vmcmd_map_pagedvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct uvm_object *uobj;
struct vnode *vp = cmd->ev_vp;
struct proc *p = l->l_proc;
int error;
vm_prot_t prot, maxprot;
KASSERT(vp->v_iflag & VI_TEXT);
/*
* map the vnode in using uvm_map.
*/
if (cmd->ev_len == 0)
return 0;
if (cmd->ev_offset & PAGE_MASK)
return EINVAL;
if (cmd->ev_addr & PAGE_MASK)
return EINVAL;
if (cmd->ev_len & PAGE_MASK)
return EINVAL;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
/*
* check the file system's opinion about mmapping the file
*/
error = VOP_MMAP(vp, prot, l->l_cred);
if (error)
return error;
if ((vp->v_vflag & VV_MAPPED) == 0) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_MAPPED;
VOP_UNLOCK(vp);
}
/*
* do the map, reference the object for this map entry
*/
uobj = &vp->v_uobj;
vref(vp);
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr, cmd->ev_len,
uobj, cmd->ev_offset, 0,
UVM_MAPFLAG(prot, maxprot, UVM_INH_COPY,
UVM_ADV_NORMAL, UVM_FLAG_COPYONW|UVM_FLAG_FIXED));
if (error) {
uobj->pgops->pgo_detach(uobj);
}
return error;
}
/*
* vmcmd_map_readvn():
* handle vmcmd which specifies that a vnode should be read from.
* appropriate for non-demand-paged text/data segments, i.e. impure
* objects (a la OMAGIC and NMAGIC).
*/
int
vmcmd_map_readvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
long diff;
if (cmd->ev_len == 0)
return 0;
diff = cmd->ev_addr - trunc_page(cmd->ev_addr);
cmd->ev_addr -= diff; /* required by uvm_map */
cmd->ev_offset -= diff;
cmd->ev_len += diff;
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr,
round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_COPY,
UVM_ADV_NORMAL,
UVM_FLAG_FIXED|UVM_FLAG_OVERLAY|UVM_FLAG_COPYONW));
if (error)
return error;
return vmcmd_readvn(l, cmd);
}
int
vmcmd_readvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
vm_prot_t prot, maxprot;
error = vn_rdwr(UIO_READ, cmd->ev_vp, (void *)cmd->ev_addr,
cmd->ev_len, cmd->ev_offset, UIO_USERSPACE, IO_UNIT,
l->l_cred, NULL, l);
if (error)
return error;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
#ifdef PMAP_NEED_PROCWR
/*
* we had to write the process, make sure the pages are synched
* with the instruction cache.
*/
if (prot & VM_PROT_EXECUTE)
pmap_procwr(p, cmd->ev_addr, cmd->ev_len);
#endif
/*
* we had to map in the area at PROT_ALL so that vn_rdwr()
* could write to it. however, the caller seems to want
* it mapped read-only, so now we are going to have to call
* uvm_map_protect() to fix up the protection. ICK.
*/
if (maxprot != VM_PROT_ALL) {
error = uvm_map_protect(&p->p_vmspace->vm_map,
trunc_page(cmd->ev_addr),
round_page(cmd->ev_addr + cmd->ev_len),
maxprot, true);
if (error)
return error;
}
if (prot != maxprot) {
error = uvm_map_protect(&p->p_vmspace->vm_map,
trunc_page(cmd->ev_addr),
round_page(cmd->ev_addr + cmd->ev_len),
prot, false);
if (error)
return error;
}
return 0;
}
/*
* vmcmd_map_zero():
* handle vmcmd which specifies a zero-filled address space region. The
* address range must be first allocated, then protected appropriately.
*/
int
vmcmd_map_zero(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
long diff;
vm_prot_t prot, maxprot;
diff = cmd->ev_addr - trunc_page(cmd->ev_addr);
cmd->ev_addr -= diff; /* required by uvm_map */
cmd->ev_len += diff;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr,
round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(prot, maxprot, UVM_INH_COPY,
UVM_ADV_NORMAL,
UVM_FLAG_FIXED|UVM_FLAG_COPYONW));
if (cmd->ev_flags & VMCMD_STACK)
curproc->p_vmspace->vm_issize += atop(round_page(cmd->ev_len));
return error;
}
/*
* exec_read():
*
* Read from vnode into buffer at offset.
*/
int
exec_read(struct lwp *l, struct vnode *vp, u_long off, void *bf, size_t size,
int ioflg)
{
int error;
size_t resid;
KASSERT((ioflg & IO_NODELOCKED) == 0 || VOP_ISLOCKED(vp) != LK_NONE);
if ((error = vn_rdwr(UIO_READ, vp, bf, size, off, UIO_SYSSPACE,
ioflg, l->l_cred, &resid, NULL)) != 0)
return error;
/*
* See if we got all of it
*/
if (resid != 0)
return ENOEXEC;
return 0;
}
/*
* exec_setup_stack(): Set up the stack segment for an elf
* executable.
*
* Note that the ep_ssize parameter must be set to be the current stack
* limit; this is adjusted in the body of execve() to yield the
* appropriate stack segment usage once the argument length is
* calculated.
*
* This function returns an int for uniformity with other (future) formats'
* stack setup functions. They might have errors to return.
*/
int
exec_setup_stack(struct lwp *l, struct exec_package *epp)
{
vsize_t max_stack_size;
vaddr_t access_linear_min;
vsize_t access_size;
vaddr_t noaccess_linear_min;
vsize_t noaccess_size;
#ifndef USRSTACK32
#define USRSTACK32 (0x00000000ffffffffL&~PGOFSET)
#endif
#ifndef MAXSSIZ32
#define MAXSSIZ32 (MAXSSIZ >> 2)
#endif
if (epp->ep_flags & EXEC_32) {
epp->ep_minsaddr = USRSTACK32;
max_stack_size = MAXSSIZ32;
} else {
epp->ep_minsaddr = USRSTACK;
max_stack_size = MAXSSIZ;
}
DPRINTF(("ep_minsaddr=%#jx max_stack_size=%#jx\n",
(uintmax_t)epp->ep_minsaddr, (uintmax_t)max_stack_size));
pax_aslr_stack(epp, &max_stack_size);
DPRINTF(("[RLIMIT_STACK].lim_cur=%#jx max_stack_size=%#jx\n",
(uintmax_t)l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur,
(uintmax_t)max_stack_size));
epp->ep_ssize = MIN(l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur,
max_stack_size);
l->l_proc->p_stackbase = epp->ep_minsaddr;
epp->ep_maxsaddr = (vaddr_t)STACK_GROW(epp->ep_minsaddr,
max_stack_size);
DPRINTF(("ep_ssize=%#jx ep_minsaddr=%#jx ep_maxsaddr=%#jx\n",
(uintmax_t)epp->ep_ssize, (uintmax_t)epp->ep_minsaddr,
(uintmax_t)epp->ep_maxsaddr));
/*
* set up commands for stack. note that this takes *two*, one to
* map the part of the stack which we can access, and one to map
* the part which we can't.
*
* arguably, it could be made into one, but that would require the
* addition of another mapping proc, which is unnecessary
*/
access_size = epp->ep_ssize;
access_linear_min = (vaddr_t)STACK_ALLOC(epp->ep_minsaddr, access_size);
noaccess_size = max_stack_size - access_size;
noaccess_linear_min = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
access_size), noaccess_size);
DPRINTF(("access_size=%#jx, access_linear_min=%#jx, "
"noaccess_size=%#jx, noaccess_linear_min=%#jx\n",
(uintmax_t)access_size, (uintmax_t)access_linear_min,
(uintmax_t)noaccess_size, (uintmax_t)noaccess_linear_min));
if (user_stack_guard_size > 0) {
#ifdef __MACHINE_STACK_GROWS_UP
vsize_t guard_size = MIN(VM_MAXUSER_ADDRESS - epp->ep_maxsaddr, user_stack_guard_size);
if (guard_size > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, guard_size,
epp->ep_maxsaddr, NULL, 0, VM_PROT_NONE);
#else
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, user_stack_guard_size,
epp->ep_maxsaddr - user_stack_guard_size, NULL, 0, VM_PROT_NONE);
#endif
}
if (noaccess_size > 0 && noaccess_size <= MAXSSIZ) {
NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, noaccess_size,
noaccess_linear_min, NULL, 0,
VM_PROT_NONE | PROT_MPROTECT(VM_PROT_READ | VM_PROT_WRITE),
VMCMD_STACK);
}
KASSERT(access_size > 0);
KASSERT(access_size <= MAXSSIZ);
NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, access_size,
access_linear_min, NULL, 0, VM_PROT_READ | VM_PROT_WRITE,
VMCMD_STACK);
return 0;
}
/* $NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Turnstiles are described in detail in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*
* Turnstiles are kept in a hash table. There are likely to be many more
* synchronisation objects than there are threads. Since a thread can block
* on only one lock at a time, we only need one turnstile per thread, and
* so they are allocated at thread creation time.
*
* When a thread decides it needs to block on a lock, it looks up the
* active turnstile for that lock. If no active turnstile exists, then
* the process lends its turnstile to the lock. If there is already an
* active turnstile for the lock, the thread places its turnstile on a
* list of free turnstiles, and references the active one instead.
*
* The act of looking up the turnstile acquires an interlock on the sleep
* queue. If a thread decides it doesn't need to block after all, then this
* interlock must be released by explicitly aborting the turnstile
* operation.
*
* When a thread is awakened, it needs to get its turnstile back. If there
* are still other threads waiting in the active turnstile, the thread
* grabs a free turnstile off the free list. Otherwise, it can take back
* the active turnstile from the lock (thus deactivating the turnstile).
*
* Turnstiles are where we do priority inheritance.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $");
#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/sleeptab.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* Shift of 6 aligns to typical cache line size of 64 bytes; there's no
* point having two turnstile locks to back two lock objects that share one
* cache line.
*/
#define TS_HASH_SIZE 128
#define TS_HASH_MASK (TS_HASH_SIZE - 1)
#define TS_HASH(obj) (((uintptr_t)(obj) >> 6) & TS_HASH_MASK)
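/*
 * Illustrative example: with the shift of 6 above, two lock objects in
 * the same 64-byte cache line (say, addresses ending in 0x40 and 0x78)
 * hash to the same bucket and therefore share one turnstile chain lock.
 */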
static tschain_t turnstile_chains[TS_HASH_SIZE] __cacheline_aligned;
static union {
kmutex_t lock;
uint8_t pad[COHERENCY_UNIT];
} turnstile_locks[TS_HASH_SIZE] __cacheline_aligned;
/*
* turnstile_init:
*
* Initialize the turnstile mechanism.
*/
void
turnstile_init(void)
{
int i;
for (i = 0; i < TS_HASH_SIZE; i++) {
LIST_INIT(&turnstile_chains[i]);
mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED);
}
turnstile_ctor(&turnstile0);
}
/*
* turnstile_ctor:
*
* Constructor for turnstiles.
*/
void
turnstile_ctor(turnstile_t *ts)
{
memset(ts, 0, sizeof(*ts));
sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
}
/*
* turnstile_remove:
*
* Remove an LWP from a turnstile sleep queue and wake it.
*/
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
turnstile_t *nts;
KASSERT(l->l_ts == ts);
/*
* This process is no longer using the active turnstile.
* Find an inactive one on the free list to give to it.
*/
if ((nts = ts->ts_free) != NULL) {
KASSERT(TS_ALL_WAITERS(ts) > 1);
l->l_ts = nts;
ts->ts_free = nts->ts_free;
nts->ts_free = NULL;
} else {
/*
* If the free list is empty, this is the last
* waiter.
*/
KASSERT(TS_ALL_WAITERS(ts) == 1);
LIST_REMOVE(ts, ts_chain);
}
ts->ts_waiters[q]--;
sleepq_remove(&ts->ts_sleepq[q], l, true);
}
/*
* turnstile_lookup:
*
* Look up the turnstile for the specified lock. This acquires and
* holds the turnstile chain lock (sleep queue interlock).
*/
turnstile_t *
turnstile_lookup(wchan_t obj)
{
turnstile_t *ts;
tschain_t *tc;
u_int hash;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
mutex_spin_enter(&turnstile_locks[hash].lock);
LIST_FOREACH(ts, tc, ts_chain)
if (ts->ts_obj == obj)
return (ts);
/*
* No turnstile yet for this lock. No problem, turnstile_block()
* handles this by fetching the turnstile from the blocking thread.
*/
return (NULL);
}
/*
* turnstile_exit:
*
* Abort a turnstile operation.
*/
void
turnstile_exit(wchan_t obj)
{
mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock);
}
/*
* turnstile_lendpri:
*
* Lend our priority to lwps on the blocking chain.
*
* If the current owner of the lock (l->l_wchan, set by sleepq_enqueue)
* has a priority lower than ours (lwp_eprio(l)), lend our priority to
* it to avoid priority inversion.
*/
static void
turnstile_lendpri(lwp_t *cur)
{
lwp_t * l = cur;
pri_t prio;
/*
* NOTE: if you get a panic in this code block, it is likely that
* a lock has been destroyed or corrupted while still in use. Try
* compiling a kernel with LOCKDEBUG to pinpoint the problem.
*/
LOCKDEBUG_BARRIER(l->l_mutex, 1);
KASSERT(l == curlwp);
prio = lwp_eprio(l);
for (;;) {
lwp_t *owner;
turnstile_t *ts;
bool dolock;
if (l->l_wchan == NULL)
break;
/*
* Ask the syncobj for the owner of the lock.
*/
owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
if (owner == NULL)
break;
/*
* The owner may have changed as we have dropped the tc lock.
*/
if (cur == owner) {
/*
* We own the lock: stop here, sleepq_block()
* should wake up immediately.
*/
break;
}
/*
* Acquire owner->l_mutex if we don't have it yet.
* Because we already have another LWP lock (l->l_mutex) held,
* we need to play a try lock dance to avoid deadlock.
*/
dolock = l->l_mutex != atomic_load_relaxed(&owner->l_mutex);
if (l == owner || (dolock && !lwp_trylock(owner))) {
/*
* The owner was changed behind us or trylock failed.
* Restart from curlwp.
*
* Note that there may be a livelock here:
* the owner may try grabbing cur's lock (which is the
* tc lock) while we're trying to grab the owner's lock.
*/
lwp_unlock(l);
l = cur;
lwp_lock(l);
prio = lwp_eprio(l);
continue;
}
/*
* If the owner's priority is already higher than ours,
* there's nothing to do anymore.
*/
if (prio <= lwp_eprio(owner)) {
if (dolock)
lwp_unlock(owner);
break;
}
/*
* Lend our priority to the 'owner' LWP.
*
* Update lenders info for turnstile_unlendpri.
*/
ts = l->l_ts;
KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
if (ts->ts_inheritor == NULL) {
ts->ts_inheritor = owner;
ts->ts_eprio = prio;
SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
lwp_lendpri(owner, prio);
} else if (prio > ts->ts_eprio) {
ts->ts_eprio = prio;
lwp_lendpri(owner, prio);
}
if (dolock)
lwp_unlock(l);
LOCKDEBUG_BARRIER(owner->l_mutex, 1);
l = owner;
}
LOCKDEBUG_BARRIER(l->l_mutex, 1);
if (cur->l_mutex != atomic_load_relaxed(&l->l_mutex)) {
lwp_unlock(l);
lwp_lock(cur);
}
LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}
/*
* turnstile_unlendpri: undo turnstile_lendpri
*/
static void
turnstile_unlendpri(turnstile_t *ts)
{
lwp_t * const l = curlwp;
turnstile_t *iter;
turnstile_t *next;
turnstile_t *prev = NULL;
pri_t prio;
bool dolock;
KASSERT(ts->ts_inheritor != NULL);
ts->ts_inheritor = NULL;
dolock = (atomic_load_relaxed(&l->l_mutex) ==
l->l_cpu->ci_schedstate.spc_lwplock);
if (dolock) {
lwp_lock(l);
}
/*
* the following loop does two things.
*
* - remove ts from the list.
*
* - from the rest of the list, find the highest priority.
*/
prio = -1;
KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
for (iter = SLIST_FIRST(&l->l_pi_lenders);
iter != NULL; iter = next) {
KASSERT(lwp_eprio(l) >= ts->ts_eprio);
next = SLIST_NEXT(iter, ts_pichain);
if (iter == ts) {
if (prev == NULL) {
SLIST_REMOVE_HEAD(&l->l_pi_lenders,
ts_pichain);
} else {
SLIST_REMOVE_AFTER(prev, ts_pichain);
}
} else if (prio < iter->ts_eprio) {
prio = iter->ts_eprio;
}
prev = iter;
}
lwp_lendpri(l, prio);
if (dolock) {
lwp_unlock(l);
}
}
/*
* turnstile_block:
*
* Enter an object into the turnstile chain and prepare the current
* LWP for sleep.
*/
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
lwp_t * const l = curlwp; /* cached curlwp */
turnstile_t *ots;
tschain_t *tc;
kmutex_t *lock;
sleepq_t *sq;
u_int hash;
int nlocks;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
lock = &turnstile_locks[hash].lock;
KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
KASSERT(mutex_owned(lock));
KASSERT(l != NULL);
KASSERT(l->l_ts != NULL);
if (ts == NULL) {
/*
* We are the first thread to wait for this object;
* lend our turnstile to it.
*/
ts = l->l_ts;
KASSERT(TS_ALL_WAITERS(ts) == 0);
KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]));
KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
ts->ts_obj = obj;
ts->ts_inheritor = NULL;
LIST_INSERT_HEAD(tc, ts, ts_chain);
} else {
/*
* Object already has a turnstile. Put our turnstile
* onto the free list, and reference the existing
* turnstile instead.
*/
ots = l->l_ts;
KASSERT(ots->ts_free == NULL);
ots->ts_free = ts->ts_free;
ts->ts_free = ots;
l->l_ts = ts;
KASSERT(ts->ts_obj == obj);
KASSERT(TS_ALL_WAITERS(ts) != 0);
KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
!LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
}
sq = &ts->ts_sleepq[q];
ts->ts_waiters[q]++;
nlocks = sleepq_enter(sq, l, lock);
LOCKDEBUG_BARRIER(lock, 1);
sleepq_enqueue(sq, obj, sobj->sobj_name, sobj, false);
/*
* Disable preemption across this entire block, as we may drop
* scheduler locks (allowing preemption), and would prefer not
* to be interrupted while in a state of flux.
*/
KPREEMPT_DISABLE(l);
KASSERT(lock == l->l_mutex);
turnstile_lendpri(l);
sleepq_block(0, false, sobj, nlocks);
KPREEMPT_ENABLE(l);
}
/*
* turnstile_wakeup:
*
* Wake up the specified number of threads that are blocked
* in a turnstile.
*/
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
sleepq_t *sq;
kmutex_t *lock;
u_int hash;
lwp_t *l;
hash = TS_HASH(ts->ts_obj);
lock = &turnstile_locks[hash].lock;
sq = &ts->ts_sleepq[q];
KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
KASSERT(count > 0);
KASSERT(count <= TS_WAITERS(ts, q));
KASSERT(mutex_owned(lock));
KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);
/*
* restore inherited priority if necessary.
*/
if (ts->ts_inheritor != NULL) {
turnstile_unlendpri(ts);
}
if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
LIST_FOREACH(l, sq, l_sleepchain) {
if (l == nl)
break;
}
if (l == NULL)
panic("turnstile_wakeup: nl not on sleepq");
#endif
turnstile_remove(ts, nl, q);
} else {
while (count-- > 0) {
l = LIST_FIRST(sq);
KASSERT(l != NULL);
turnstile_remove(ts, l, q);
}
}
mutex_spin_exit(lock);
}
/*
* turnstile_unsleep:
*
* Remove an LWP from the turnstile. This is called when the LWP has
* not been awoken normally but instead interrupted: for example, if it
* has received a signal. It's not a valid action for turnstiles,
* since LWPs blocking on a turnstile are not interruptible.
*/
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{
lwp_unlock(l);
panic("turnstile_unsleep");
}
/*
* turnstile_changepri:
*
* Adjust the priority of an LWP residing on a turnstile.
*/
void
turnstile_changepri(lwp_t *l, pri_t pri)
{
/* XXX priority inheritance */
sleepq_changepri(l, pri);
}
#if defined(LOCKDEBUG)
/*
* turnstile_print:
*
* Given the address of a lock object, print the contents of a
* turnstile.
*/
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
turnstile_t *ts;
tschain_t *tc;
sleepq_t *rsq, *wsq;
u_int hash;
lwp_t *l;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
LIST_FOREACH(ts, tc, ts_chain)
if (ts->ts_obj == obj)
break;
if (ts == NULL) {
(*pr)("Turnstile: no active turnstile for this lock.\n");
return;
}
rsq = &ts->ts_sleepq[TS_READER_Q];
wsq = &ts->ts_sleepq[TS_WRITER_Q];
(*pr)("Turnstile:\n");
(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
LIST_FOREACH(l, rsq, l_sleepchain) {
(*pr)(" %p", l);
}
(*pr)("\n");
(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
LIST_FOREACH(l, wsq, l_sleepchain) {
(*pr)(" %p", l);
}
(*pr)("\n");
}
#endif /* LOCKDEBUG */
/* $NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/buf.h>
#include <sys/atomic.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pager.h>
#include <uvm/uvm_page_array.h>
static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *,
off_t, enum uio_rw);
static void genfs_dio_iodone(struct buf *);
static int genfs_getpages_read(struct vnode *, struct vm_page **, int, off_t,
off_t, bool, bool, bool, bool);
static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw,
void (*)(struct buf *));
static void genfs_rel_pages(struct vm_page **, unsigned int);
int genfs_maxdio = MAXPHYS;
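/* Upper bound, in bytes, on a single direct I/O transfer. */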
static void
genfs_rel_pages(struct vm_page **pgs, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL || pg == PGO_DONTCARE)
continue;
KASSERT(uvm_page_owner_locked_p(pg, true));
if (pg->flags & PG_FAKE) {
pg->flags |= PG_RELEASED;
}
}
uvm_page_unbusy(pgs, npages);
}
/*
* generic VM getpages routine.
* Return PG_BUSY pages for the given range,
* reading from backing store if necessary.
*/
int
genfs_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ * const ap = v;
off_t diskeof, memeof;
int i, error, npages, iflag;
const int flags = ap->a_flags;
struct vnode * const vp = ap->a_vp;
struct uvm_object * const uobj = &vp->v_uobj;
const bool async = (flags & PGO_SYNCIO) == 0;
const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
const bool overwrite = (flags & PGO_OVERWRITE) != 0;
const bool blockalloc = memwrite && (flags & PGO_NOBLOCKALLOC) == 0;
const bool need_wapbl = (vp->v_mount->mnt_wapbl &&
(flags & PGO_JOURNALLOCKED) == 0);
const bool glocked = (flags & PGO_GLOCKHELD) != 0;
bool holds_wapbl = false;
struct mount *trans_mount = NULL;
UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx/%jx count %jd",
(uintptr_t)vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);
KASSERT(memwrite >= overwrite);
KASSERT(vp->v_type == VREG || vp->v_type == VDIR ||
vp->v_type == VLNK || vp->v_type == VBLK);
/*
* the object must be locked. it can only be a read lock when
* processing a read fault with PGO_LOCKED.
*/
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT(rw_write_held(uobj->vmobjlock) ||
((flags & PGO_LOCKED) != 0 && !memwrite));
#ifdef DIAGNOSTIC
if ((flags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
/*
* check for reclaimed vnode. v_interlock is not held here, but
* VI_DEADCHECK is set with vmobjlock held.
*/
iflag = atomic_load_relaxed(&vp->v_iflag);
if (__predict_false((iflag & VI_DEADCHECK) != 0)) {
mutex_enter(vp->v_interlock);
error = vdead_check(vp, VDEAD_NOWAIT);
mutex_exit(vp->v_interlock);
if (error) {
if ((flags & PGO_LOCKED) == 0)
rw_exit(uobj->vmobjlock);
return error;
}
}
startover:
error = 0;
const voff_t origvsize = vp->v_size;
const off_t origoffset = ap->a_offset;
const int orignpages = *ap->a_count;
GOP_SIZE(vp, origvsize, &diskeof, 0);
if (flags & PGO_PASTEOF) {
off_t newsize;
#if defined(DIAGNOSTIC)
off_t writeeof;
#endif /* defined(DIAGNOSTIC) */
newsize = MAX(origvsize,
origoffset + (orignpages << PAGE_SHIFT));
GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM);
#if defined(DIAGNOSTIC)
GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM);
if (newsize > round_page(writeeof)) {
panic("%s: past eof: %" PRId64 " vs. %" PRId64,
__func__, newsize, round_page(writeeof));
}
#endif /* defined(DIAGNOSTIC) */
} else {
GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM);
}
KASSERT(ap->a_centeridx >= 0 || ap->a_centeridx <= orignpages);
KASSERT((origoffset & (PAGE_SIZE - 1)) == 0);
KASSERT(origoffset >= 0);
KASSERT(orignpages > 0);
/*
* Bounds-check the request.
*/
if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
if ((flags & PGO_LOCKED) == 0) {
rw_exit(uobj->vmobjlock);
}
UVMHIST_LOG(ubchist, "off 0x%jx count %jd goes past EOF 0x%jx",
origoffset, *ap->a_count, memeof,0);
error = EINVAL;
goto out_err;
}
/* uobj is locked */
if ((flags & PGO_NOTIMESTAMP) == 0 && (vp->v_type != VBLK ||
(vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
int updflags = 0;
if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
updflags = GOP_UPDATE_ACCESSED;
}
if (memwrite) {
updflags |= GOP_UPDATE_MODIFIED;
}
if (updflags != 0) {
GOP_MARKUPDATE(vp, updflags);
}
}
/*
* For PGO_LOCKED requests, just return whatever's in memory.
*/
if (flags & PGO_LOCKED) {
int nfound;
struct vm_page *pg;
KASSERT(!glocked);
npages = *ap->a_count;
#if defined(DEBUG)
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
KASSERT(pg == NULL || pg == PGO_DONTCARE);
}
#endif /* defined(DEBUG) */
nfound = uvn_findpages(uobj, origoffset, &npages,
ap->a_m, NULL,
UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY |
(memwrite ? UFP_NORDONLY : 0));
KASSERT(npages == *ap->a_count);
if (nfound == 0) {
error = EBUSY;
goto out_err;
}
/*
* lock and unlock g_glock to ensure that no one is truncating
* the file behind us.
*/
if (!genfs_node_rdtrylock(vp)) {
/*
* restore the array.
*/
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
if (pg != NULL && pg != PGO_DONTCARE) {
ap->a_m[i] = NULL;
}
KASSERT(ap->a_m[i] == NULL ||
ap->a_m[i] == PGO_DONTCARE);
}
} else {
genfs_node_unlock(vp);
}
error = (ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0);
if (error == 0 && memwrite) {
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
if (pg == NULL || pg == PGO_DONTCARE) {
continue;
}
if (uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN) {
uvm_pagemarkdirty(pg,
UVM_PAGE_STATUS_UNKNOWN);
}
}
}
goto out_err;
}
rw_exit(uobj->vmobjlock);
/*
* find the requested pages and make some simple checks.
* leave space in the page array for a whole block.
*/
const int fs_bshift = (vp->v_type != VBLK) ? vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
const int fs_bsize = 1 << fs_bshift;
#define blk_mask (fs_bsize - 1)
#define trunc_blk(x) ((x) & ~blk_mask)
#define round_blk(x) (((x) + blk_mask) & ~blk_mask)
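/*
 * Example: with fs_bsize == 8192, trunc_blk(0x2345) == 0x2000 and
 * round_blk(0x2345) == 0x4000.
 */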
const int orignmempages = MIN(orignpages,
round_page(memeof - origoffset) >> PAGE_SHIFT);
npages = orignmempages;
const off_t startoffset = trunc_blk(origoffset);
const off_t endoffset = MIN(
round_page(round_blk(origoffset + (npages << PAGE_SHIFT))),
round_page(memeof));
const int ridx = (origoffset - startoffset) >> PAGE_SHIFT;
const int pgs_size = sizeof(struct vm_page *) *
((endoffset - startoffset) >> PAGE_SHIFT);
struct vm_page **pgs, *pgs_onstack[UBC_MAX_PAGES];
if (pgs_size > sizeof(pgs_onstack)) {
pgs = kmem_zalloc(pgs_size, async ? KM_NOSLEEP : KM_SLEEP);
if (pgs == NULL) {
pgs = pgs_onstack;
error = ENOMEM;
goto out_err;
}
} else {
pgs = pgs_onstack;
(void)memset(pgs, 0, pgs_size);
}
UVMHIST_LOG(ubchist, "ridx %jd npages %jd startoff %#jx endoff %#jx",
ridx, npages, startoffset, endoffset);
if (trans_mount == NULL) {
trans_mount = vp->v_mount;
fstrans_start(trans_mount);
/*
* check if this vnode is still valid.
*/
mutex_enter(vp->v_interlock);
error = vdead_check(vp, 0);
mutex_exit(vp->v_interlock);
if (error)
goto out_err_free;
/*
* XXX: This assumes that we come here only via
* the mmio path
*/
if (blockalloc && need_wapbl) {
error = WAPBL_BEGIN(trans_mount);
if (error)
goto out_err_free;
holds_wapbl = true;
}
}
/*
* hold g_glock to prevent a race with truncate.
*
* check if our idea of v_size is still valid.
*/
KASSERT(!glocked || genfs_node_wrlocked(vp));
if (!glocked) {
if (blockalloc) {
genfs_node_wrlock(vp);
} else {
genfs_node_rdlock(vp);
}
}
rw_enter(uobj->vmobjlock, RW_WRITER);
if (vp->v_size < origvsize) {
if (!glocked) {
genfs_node_unlock(vp);
}
if (pgs != pgs_onstack)
kmem_free(pgs, pgs_size);
goto startover;
}
if (uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], NULL,
async ? UFP_NOWAIT : UFP_ALL) != orignmempages) {
if (!glocked) {
genfs_node_unlock(vp);
}
KASSERT(async != 0);
genfs_rel_pages(&pgs[ridx], orignmempages);
rw_exit(uobj->vmobjlock);
error = EBUSY;
goto out_err_free;
}
/*
* if PGO_OVERWRITE is set, don't bother reading the pages.
*/
if (overwrite) {
if (!glocked) {
genfs_node_unlock(vp);
}
UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[ridx + i];
/*
* it's caller's responsibility to allocate blocks
* beforehand for the overwrite case.
*/
KASSERT((pg->flags & PG_RDONLY) == 0 || !blockalloc);
pg->flags &= ~PG_RDONLY;
/*
* mark the page DIRTY.
* otherwise another thread can do putpages and pull
* our vnode from syncer's queue before our caller does
* ubc_release. note that putpages won't see CLEAN
* pages even if they are BUSY.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
}
npages += ridx;
goto out;
}
/*
* if the pages are already resident, just return them.
*/
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[ridx + i];
if ((pg->flags & PG_FAKE) || (blockalloc && (pg->flags & PG_RDONLY) != 0)) {
break;
}
}
if (i == npages) {
if (!glocked) {
genfs_node_unlock(vp);
}
UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
npages += ridx;
goto out;
}
/*
* the page wasn't resident and we're not overwriting,
* so we're going to have to do some i/o.
* find any additional pages needed to cover the expanded range.
*/
npages = (endoffset - startoffset) >> PAGE_SHIFT;
if (startoffset != origoffset || npages != orignmempages) {
int npgs;
/*
* we need to avoid deadlocks caused by locking
* additional pages at lower offsets than pages we
* already have locked. unlock them all and start over.
*/
genfs_rel_pages(&pgs[ridx], orignmempages);
memset(pgs, 0, pgs_size);
UVMHIST_LOG(ubchist, "reset npages start 0x%jx end 0x%jx",
startoffset, endoffset, 0,0);
npgs = npages;
if (uvn_findpages(uobj, startoffset, &npgs, pgs, NULL,
async ? UFP_NOWAIT : UFP_ALL) != npages) {
if (!glocked) {
genfs_node_unlock(vp);
}
KASSERT(async != 0);
genfs_rel_pages(pgs, npages);
rw_exit(uobj->vmobjlock);
error = EBUSY;
goto out_err_free;
}
}
rw_exit(uobj->vmobjlock);
error = genfs_getpages_read(vp, pgs, npages, startoffset, diskeof,
async, memwrite, blockalloc, glocked);
if (!glocked) {
genfs_node_unlock(vp);
}
if (error == 0 && async)
goto out_err_free;
rw_enter(uobj->vmobjlock, RW_WRITER);
/*
* we're almost done! release the pages...
* for errors, we free the pages.
* otherwise we activate them and mark them as valid and clean.
* also, unbusy pages that were not actually requested.
*/
if (error) {
genfs_rel_pages(pgs, npages);
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(ubchist, "returning error %jd", error,0,0,0);
goto out_err_free;
}
out:
UVMHIST_LOG(ubchist, "succeeding, npages %jd", npages,0,0,0);
error = 0;
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL) {
continue;
}
UVMHIST_LOG(ubchist, "examining pg %#jx flags 0x%jx",
(uintptr_t)pg, pg->flags, 0,0);
if (pg->flags & PG_FAKE && !overwrite) {
/*
* we've read page's contents from the backing storage.
*
* for a read fault, we keep them CLEAN; if we
* encountered a hole while reading, the pages may
* already have been dirtied with zeros.
*/
KASSERTMSG(blockalloc || uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg);
pg->flags &= ~PG_FAKE;
}
KASSERT(!memwrite || !blockalloc || (pg->flags & PG_RDONLY) == 0);
if (i < ridx || i >= ridx + orignmempages || async) {
UVMHIST_LOG(ubchist, "unbusy pg %#jx offset 0x%jx",
(uintptr_t)pg, pg->offset,0,0);
if (pg->flags & PG_FAKE) {
KASSERT(overwrite);
uvm_pagezero(pg);
}
if (pg->flags & PG_RELEASED) {
uvm_pagefree(pg);
continue;
}
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
} else if (memwrite && !overwrite &&
uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
/*
* for a write fault, start dirtiness tracking of
* requested pages.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
}
}
rw_exit(uobj->vmobjlock);
if (ap->a_m != NULL) {
memcpy(ap->a_m, &pgs[ridx],
orignmempages * sizeof(struct vm_page *));
}
out_err_free:
if (pgs != NULL && pgs != pgs_onstack)
kmem_free(pgs, pgs_size);
out_err:
if (trans_mount != NULL) {
if (holds_wapbl)
WAPBL_END(trans_mount);
fstrans_done(trans_mount);
}
return error;
}
/*
* genfs_getpages_read: Read the pages in with VOP_BMAP/VOP_STRATEGY.
*
* "glocked" (which is currently not actually used) tells us not whether
* the genfs_node is locked on entry (it always is) but whether it was
* locked on entry to genfs_getpages.
*/
static int
genfs_getpages_read(struct vnode *vp, struct vm_page **pgs, int npages,
off_t startoffset, off_t diskeof,
bool async, bool memwrite, bool blockalloc, bool glocked)
{
struct uvm_object * const uobj = &vp->v_uobj;
const int fs_bshift = (vp->v_type != VBLK) ? vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
const int dev_bshift = (vp->v_type != VBLK) ?
vp->v_mount->mnt_dev_bshift : DEV_BSHIFT;
kauth_cred_t const cred = curlwp->l_cred; /* XXXUBC curlwp */
size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes;
vaddr_t kva;
struct buf *bp, *mbp;
bool sawhole = false;
int i;
int error = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
/*
* read the desired page(s).
*/
totalbytes = npages << PAGE_SHIFT;
bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
tailbytes = totalbytes - bytes;
skipbytes = 0;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | (async ? 0 : UVMPAGER_MAPIN_WAITOK));
if (kva == 0)
return EBUSY;
mbp = getiobuf(vp, true);
mbp->b_bufsize = totalbytes;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY;
if (async) {
mbp->b_flags = B_READ | B_ASYNC;
mbp->b_iodone = uvm_aio_aiodone;
} else {
mbp->b_flags = B_READ;
mbp->b_iodone = NULL;
}
if (async)
BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
else
BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
/*
* if EOF is in the middle of the range, zero the part past EOF.
* skip over pages which are not PG_FAKE since in that case they have
* valid data that we need to preserve.
*/
tailstart = bytes;
while (tailbytes > 0) {
const int len = PAGE_SIZE - (tailstart & PAGE_MASK);
KASSERT(len <= tailbytes);
if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) {
memset((void *)(kva + tailstart), 0, len);
UVMHIST_LOG(ubchist, "tailbytes %#jx 0x%jx 0x%jx",
(uintptr_t)kva, tailstart, len, 0);
}
tailstart += len;
tailbytes -= len;
}
/*
* now loop over the pages, reading as needed.
*/
bp = NULL;
off_t offset;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
int run;
daddr_t lbn, blkno;
int pidx;
struct vnode *devvp;
/*
* skip pages which don't need to be read.
*/
pidx = (offset - startoffset) >> PAGE_SHIFT;
while ((pgs[pidx]->flags & PG_FAKE) == 0) {
size_t b;
KASSERT((offset & (PAGE_SIZE - 1)) == 0);
if ((pgs[pidx]->flags & PG_RDONLY)) {
sawhole = true;
}
b = MIN(PAGE_SIZE, bytes);
offset += b;
bytes -= b;
skipbytes += b;
pidx++;
UVMHIST_LOG(ubchist, "skipping, new offset 0x%jx",
offset, 0,0,0);
if (bytes == 0) {
goto loopdone;
}
}
/*
* bmap the file to find out the blkno to read from and
* how much we can read in one i/o. if bmap returns an error,
* skip the rest of the top-level i/o.
*/
lbn = offset >> fs_bshift;
error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
if (error) {
UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
lbn,error,0,0);
skipbytes += bytes;
bytes = 0;
goto loopdone;
}
/*
* see how many pages can be read with this i/o.
* reduce the i/o size if necessary to avoid
* overwriting pages with valid data.
*/
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
if (offset + iobytes > round_page(offset)) {
int pcount;
pcount = 1;
while (pidx + pcount < npages &&
pgs[pidx + pcount]->flags & PG_FAKE) {
pcount++;
}
iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
(offset - trunc_page(offset)));
}
/*
* if this block isn't allocated, zero it instead of
* reading it. unless we are going to allocate blocks,
* mark the pages we zeroed PG_RDONLY.
*/
if (blkno == (daddr_t)-1) {
int holepages = (round_page(offset + iobytes) -
trunc_page(offset)) >> PAGE_SHIFT;
UVMHIST_LOG(ubchist, "lbn 0x%jx -> HOLE", lbn,0,0,0);
sawhole = true;
memset((char *)kva + (offset - startoffset), 0,
iobytes);
skipbytes += iobytes;
if (!blockalloc) {
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < holepages; i++) {
pgs[pidx + i]->flags |= PG_RDONLY;
}
rw_exit(uobj->vmobjlock);
}
continue;
}
/*
* allocate a sub-buf for this piece of the i/o
* (or just use mbp if there's only 1 piece),
* and start it going.
*/
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
} else {
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
bp = getiobuf(vp, true);
nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
}
bp->b_lblkno = 0;
/* adjust physical blkno for partial blocks */
bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
dev_bshift);
UVMHIST_LOG(ubchist,
"bp %#jx offset 0x%x bcount 0x%x blkno 0x%x",
(uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
VOP_STRATEGY(devvp, bp);
}
loopdone:
nestiobuf_done(mbp, skipbytes, error);
if (async) {
UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
return 0;
}
if (bp != NULL) {
error = biowait(mbp);
}
/* Remove the mapping (make KVA available as soon as possible) */
uvm_pagermapout(kva, npages);
/*
* if we encountered a hole then we have to do a little more work.
* for read faults, we marked the page PG_RDONLY so that future
* write accesses to the page will fault again.
* for write faults, we must make sure that the backing store for
* the page is completely allocated while the pages are locked.
*/
if (!error && sawhole && blockalloc) {
error = GOP_ALLOC(vp, startoffset,
npages << PAGE_SHIFT, 0, cred);
UVMHIST_LOG(ubchist, "gop_alloc off 0x%jx/0x%jx -> %jd",
startoffset, npages << PAGE_SHIFT, error,0);
if (!error) {
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL) {
continue;
}
pg->flags &= ~PG_RDONLY;
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
UVMHIST_LOG(ubchist, "mark dirty pg %#jx",
(uintptr_t)pg, 0, 0, 0);
}
rw_exit(uobj->vmobjlock);
}
}
putiobuf(mbp);
return error;
}
/*
* generic VM putpages routine.
* Write the given range of pages to backing store.
*
* => "offhi == 0" means flush all pages at or after "offlo".
* => object should be locked by caller. we return with the
* object unlocked.
* => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
* thus, a caller might want to unlock higher level resources
* (e.g. vm_map) before calling flush.
* => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block
* => if PGO_ALLPAGES is set, then all pages in the object will be processed.
*
* note on "cleaning" object and PG_BUSY pages:
* this routine is holding the lock on the object. the only time
* that it can run into a PG_BUSY page that it does not own is if
* some other process has started I/O on the page (e.g. either
* a pagein, or a pageout). if the PG_BUSY page is being paged
* in, then it can not be dirty (!UVM_PAGE_STATUS_CLEAN) because no
* one has had a chance to modify it yet. if the PG_BUSY page is
* being paged out then it means that someone else has already started
* cleaning the page for us (how nice!). in this case, if we
* have syncio specified, then after we make our pass through the
* object we need to wait for the other PG_BUSY pages to clear
* off (i.e. we need to do an iosync). also note that once a
* page is PG_BUSY it must stay in its object until it is un-busyed.
*/
int
genfs_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ * const ap = v;
return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi,
ap->a_flags, NULL);
}
int
genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff,
int origflags, struct vm_page **busypg)
{
struct uvm_object * const uobj = &vp->v_uobj;
krwlock_t * const slock = uobj->vmobjlock;
off_t nextoff;
int i, error, npages, nback;
int freeflag;
/*
* This array is larger than it should be so that its size is constant.
* The right size is MAXPAGES.
*/
struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE];
#define MAXPAGES (MAXPHYS / PAGE_SIZE)
struct vm_page *pg, *tpg;
struct uvm_page_array a;
bool wasclean, needs_clean;
bool async = (origflags & PGO_SYNCIO) == 0;
bool pagedaemon = curlwp == uvm.pagedaemon_lwp;
struct mount *trans_mp;
int flags;
bool modified; /* if we write out any pages */
bool holds_wapbl;
bool cleanall; /* try to pull off from the syncer's list */
bool onworklst;
bool nodirty;
const bool dirtyonly = (origflags & (PGO_DEACTIVATE|PGO_FREE)) == 0;
UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
KASSERT(origflags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
KASSERT((startoff & PAGE_MASK) == 0);
KASSERT((endoff & PAGE_MASK) == 0);
KASSERT(startoff < endoff || endoff == 0);
KASSERT(rw_write_held(slock));
UVMHIST_LOG(ubchist, "vp %#jx pages %jd off 0x%jx len 0x%jx",
(uintptr_t)vp, uobj->uo_npages, startoff, endoff - startoff);
#ifdef DIAGNOSTIC
if ((origflags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
trans_mp = NULL;
holds_wapbl = false;
retry:
modified = false;
flags = origflags;
/*
* shortcut if we have no pages to process.
*/
nodirty = uvm_obj_clean_p(uobj);
#ifdef DIAGNOSTIC
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_ONWORKLST) != 0 || nodirty);
mutex_exit(vp->v_interlock);
#endif
if (uobj->uo_npages == 0 || (dirtyonly && nodirty)) {
mutex_enter(vp->v_interlock);
if (vp->v_iflag & VI_ONWORKLST &&
LIST_EMPTY(&vp->v_dirtyblkhd)) {
vn_syncer_remove_from_worklist(vp);
}
mutex_exit(vp->v_interlock);
if (trans_mp) {
if (holds_wapbl)
WAPBL_END(trans_mp);
fstrans_done(trans_mp);
}
rw_exit(slock);
return (0);
}
/*
* the vnode has pages, set up to process the request.
*/
if (trans_mp == NULL && (flags & PGO_CLEANIT) != 0) {
if (pagedaemon) {
/* Pagedaemon must not sleep here. */
trans_mp = vp->v_mount;
error = fstrans_start_nowait(trans_mp);
if (error) {
rw_exit(slock);
return error;
}
} else {
/*
* Cannot use vdeadcheck() here as this operation
* usually gets used from VOP_RECLAIM(). Test for
* change of v_mount instead and retry on change.
*/
rw_exit(slock);
trans_mp = vp->v_mount;
fstrans_start(trans_mp);
if (vp->v_mount != trans_mp) {
fstrans_done(trans_mp);
trans_mp = NULL;
} else {
holds_wapbl = (trans_mp->mnt_wapbl &&
(origflags & PGO_JOURNALLOCKED) == 0);
if (holds_wapbl) {
error = WAPBL_BEGIN(trans_mp);
if (error) {
fstrans_done(trans_mp);
return error;
}
}
}
rw_enter(slock, RW_WRITER);
goto retry;
}
}
error = 0;
wasclean = uvm_obj_nowriteback_p(uobj);
nextoff = startoff;
if (endoff == 0 || flags & PGO_ALLPAGES) {
endoff = trunc_page(LLONG_MAX);
}
/*
* if this vnode is known not to have dirty pages,
* don't bother to clean it out.
*/
if (nodirty) {
/* We handled the dirtyonly && nodirty case above. */
KASSERT(!dirtyonly);
flags &= ~PGO_CLEANIT;
}
/*
* start the loop to scan pages.
*/
cleanall = true;
freeflag = pagedaemon ? PG_PAGEOUT : PG_RELEASED;
uvm_page_array_init(&a, uobj, dirtyonly ? (UVM_PAGE_ARRAY_FILL_DIRTY |
(!async ? UVM_PAGE_ARRAY_FILL_WRITEBACK : 0)) : 0);
for (;;) {
bool pgprotected;
/*
* if !dirtyonly, iterate over all resident pages in the range.
*
* if dirtyonly, only possibly dirty pages are interesting.
* however, if we are asked to sync for integrity, we should
* wait on pages being written back by other threads as well.
*/
pg = uvm_page_array_fill_and_peek(&a, nextoff, 0);
if (pg == NULL) {
break;
}
KASSERT(pg->uobject == uobj);
KASSERT((pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
(pg->flags & (PG_BUSY)) != 0);
KASSERT(pg->offset >= startoff);
KASSERT(pg->offset >= nextoff);
KASSERT(!dirtyonly ||
uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN ||
uvm_obj_page_writeback_p(pg));
if (pg->offset >= endoff) {
break;
}
/*
* a preempt point.
*/
if (preempt_needed()) {
nextoff = pg->offset; /* visit this page again */
rw_exit(slock);
preempt();
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
continue;
}
/*
* if the current page is busy, wait for it to become unbusy.
*/
if ((pg->flags & PG_BUSY) != 0) {
UVMHIST_LOG(ubchist, "busy %#jx", (uintptr_t)pg,
0, 0, 0);
if ((pg->flags & (PG_RELEASED|PG_PAGEOUT)) != 0
&& (flags & PGO_BUSYFAIL) != 0) {
UVMHIST_LOG(ubchist, "busyfail %#jx",
(uintptr_t)pg, 0, 0, 0);
error = EDEADLK;
if (busypg != NULL)
*busypg = pg;
break;
}
if (pagedaemon) {
/*
* someone has taken the page while we
* dropped the lock for fstrans_start.
*/
break;
}
/*
* don't bother to wait on other's activities
* unless we are asked to sync for integrity.
*/
if (!async && (flags & PGO_RECLAIM) == 0) {
wasclean = false;
nextoff = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
continue;
}
nextoff = pg->offset; /* visit this page again */
uvm_pagewait(pg, slock, "genput");
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
continue;
}
nextoff = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
/*
* if we're freeing, remove all mappings of the page now.
* if we're cleaning, check if the page needs to be cleaned.
*/
pgprotected = false;
if (flags & PGO_FREE) {
pmap_page_protect(pg, VM_PROT_NONE);
pgprotected = true;
} else if (flags & PGO_CLEANIT) {
/*
* if we still have some hope to pull this vnode off
* from the syncer queue, write-protect the page.
*/
if (cleanall && wasclean) {
/*
* uobj pages get wired only by uvm_fault
* where uobj is locked.
*/
if (pg->wire_count == 0) {
pmap_page_protect(pg,
VM_PROT_READ|VM_PROT_EXECUTE);
pgprotected = true;
} else {
cleanall = false;
}
}
}
if (flags & PGO_CLEANIT) {
needs_clean = uvm_pagecheckdirty(pg, pgprotected);
} else {
needs_clean = false;
}
/*
* if we're cleaning, build a cluster.
* the cluster will consist of pages which are currently dirty.
* if not cleaning, just operate on the one page.
*/
if (needs_clean) {
wasclean = false;
memset(pgs, 0, sizeof(pgs));
pg->flags |= PG_BUSY;
UVM_PAGE_OWN(pg, "genfs_putpages");
/*
* let the fs constrain the offset range of the cluster.
* we additionally constrain the range here such that
* it fits in the "pgs" pages array.
*/
off_t fslo, fshi, genlo, lo, off = pg->offset;
GOP_PUTRANGE(vp, off, &fslo, &fshi);
KASSERT(fslo == trunc_page(fslo));
KASSERT(fslo <= off);
KASSERT(fshi == trunc_page(fshi));
KASSERT(fshi == 0 || off < fshi);
if (off > MAXPHYS / 2)
genlo = trunc_page(off - (MAXPHYS / 2));
else
genlo = 0;
lo = MAX(fslo, genlo);
/*
* first look backward.
*/
npages = (off - lo) >> PAGE_SHIFT;
nback = npages;
uvn_findpages(uobj, off - PAGE_SIZE, &nback,
&pgs[0], NULL,
UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
if (nback) {
memmove(&pgs[0], &pgs[npages - nback],
nback * sizeof(pgs[0]));
if (npages - nback < nback)
memset(&pgs[nback], 0,
(npages - nback) * sizeof(pgs[0]));
else
memset(&pgs[npages - nback], 0,
nback * sizeof(pgs[0]));
}
/*
* then plug in our page of interest.
*/
pgs[nback] = pg;
/*
* then look forward to fill in the remaining space in
* the array of pages.
*
* pass our cached array of pages so that hopefully
* uvn_findpages can find some good pages in it.
* the array a was filled above with one of the
* following sets of flags:
* 0
* UVM_PAGE_ARRAY_FILL_DIRTY
* UVM_PAGE_ARRAY_FILL_DIRTY|WRITEBACK
*
* XXX this is fragile but it'll work: the array
* was earlier filled sparsely, but UFP_DIRTYONLY
* implies dense. see corresponding comment in
* uvn_findpages().
*/
npages = MAXPAGES - nback - 1;
if (fshi)
npages = MIN(npages,
(fshi - off - 1) >> PAGE_SHIFT);
uvn_findpages(uobj, off + PAGE_SIZE, &npages,
&pgs[nback + 1], &a,
UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
npages += nback + 1;
} else {
pgs[0] = pg;
npages = 1;
nback = 0;
}
/*
* apply FREE or DEACTIVATE options if requested.
*/
for (i = 0; i < npages; i++) {
tpg = pgs[i];
KASSERT(tpg->uobject == uobj);
KASSERT(i == 0 ||
pgs[i-1]->offset + PAGE_SIZE == tpg->offset);
KASSERT(!needs_clean || uvm_pagegetdirty(pgs[i]) !=
UVM_PAGE_STATUS_DIRTY);
if (needs_clean) {
/*
* mark pages as WRITEBACK so that concurrent
* fsync can find and wait for our activities.
*/
uvm_obj_page_set_writeback(pgs[i]);
}
if (tpg->offset < startoff || tpg->offset >= endoff)
continue;
if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) {
uvm_pagelock(tpg);
uvm_pagedeactivate(tpg);
uvm_pageunlock(tpg);
} else if (flags & PGO_FREE) {
pmap_page_protect(tpg, VM_PROT_NONE);
if (tpg->flags & PG_BUSY) {
tpg->flags |= freeflag;
if (pagedaemon) {
uvm_pageout_start(1);
uvm_pagelock(tpg);
uvm_pagedequeue(tpg);
uvm_pageunlock(tpg);
}
} else {
/*
* ``page is not busy''
* implies that npages is 1
* and needs_clean is false.
*/
KASSERT(npages == 1);
KASSERT(!needs_clean);
KASSERT(pg == tpg);
KASSERT(nextoff ==
tpg->offset + PAGE_SIZE);
uvm_pagefree(tpg);
if (pagedaemon)
uvmexp.pdfreed++;
}
}
}
if (needs_clean) {
modified = true;
KASSERT(nextoff == pg->offset + PAGE_SIZE);
KASSERT(nback < npages);
nextoff = pg->offset + ((npages - nback) << PAGE_SHIFT);
KASSERT(pgs[nback] == pg);
KASSERT(nextoff == pgs[npages - 1]->offset + PAGE_SIZE);
/*
* start the i/o.
*/
rw_exit(slock);
error = GOP_WRITE(vp, pgs, npages, flags);
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
if (error) {
break;
}
}
}
uvm_page_array_fini(&a);
/*
* update ctime/mtime if the modification we started writing out might
* be from mmap'ed write.
*
* this is necessary when an application keeps a file mmaped and
* repeatedly modifies it via the window. note that, because we
* don't always write-protect pages when cleaning, such modifications
* might not involve any page faults.
*/
mutex_enter(vp->v_interlock);
if (modified && (vp->v_iflag & VI_WRMAP) != 0 && (vp->v_type != VBLK ||
(vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
}
/*
* if we no longer have any possibly dirty pages, take us off the
* syncer list.
*/
if ((vp->v_iflag & VI_ONWORKLST) != 0 && uvm_obj_clean_p(uobj) &&
LIST_EMPTY(&vp->v_dirtyblkhd)) {
vn_syncer_remove_from_worklist(vp);
}
/* Wait for output to complete. */
rw_exit(slock);
if (!wasclean && !async && vp->v_numoutput != 0) {
while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, vp->v_interlock);
}
onworklst = (vp->v_iflag & VI_ONWORKLST) != 0;
mutex_exit(vp->v_interlock);
if ((flags & PGO_RECLAIM) != 0 && onworklst) {
/*
* in the case of PGO_RECLAIM, make sure the vnode ends up clean.
* retrying is not a big deal because, in many cases,
* uobj->uo_npages is already 0 here.
*/
rw_enter(slock, RW_WRITER);
goto retry;
}
if (trans_mp) {
if (holds_wapbl)
WAPBL_END(trans_mp);
fstrans_done(trans_mp);
}
return (error);
}
/*
* Default putrange method for file systems that do not care
* how many pages are given to one GOP_WRITE() call.
*/
void
genfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
{
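/*
 * Leaving both bounds at zero imposes no file-system-specific limit:
 * genfs_do_putpages() treats a zero upper bound as unbounded, so only
 * its own MAXPHYS-sized clustering constrains a single GOP_WRITE().
 */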
*lop = 0;
*hip = 0;
}
int
genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
{
off_t off;
vaddr_t kva;
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
len = npages << PAGE_SHIFT;
error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
uvm_aio_aiodone);
return error;
}
/*
* genfs_gop_write_rwmap:
*
* a variant of genfs_gop_write. it's used by UDF for its directory buffers.
* this maps pages with PROT_WRITE so that VOP_STRATEGY can modify
* the contents before writing them out to the underlying storage.
*/
int
genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
off_t off;
vaddr_t kva;
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
len = npages << PAGE_SHIFT;
error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
uvm_aio_aiodone);
return error;
}
/*
* Backend routine for doing I/O to vnode pages. Pages are already locked
* and mapped into kernel memory. Here we just look up the underlying
* device block addresses and call the strategy routine.
*/
static int
genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags,
enum uio_rw rw, void (*iodone)(struct buf *))
{
int s, error;
int fs_bshift, dev_bshift;
off_t eof, offset, startoffset;
size_t bytes, iobytes, skipbytes;
struct buf *mbp, *bp;
const bool async = (flags & PGO_SYNCIO) == 0;
const bool lazy = (flags & PGO_LAZY) == 0;
const bool iowrite = rw == UIO_WRITE;
const int brw = iowrite ? B_WRITE : B_READ;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx kva %#jx len 0x%jx flags 0x%jx",
(uintptr_t)vp, (uintptr_t)kva, len, flags);
KASSERT(vp->v_size != VSIZENOTSET);
KASSERT(vp->v_writesize != VSIZENOTSET);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize);
GOP_SIZE(vp, vp->v_writesize, &eof, 0);
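/*
 * Pick the block-size shifts: mounted file systems supply their own
 * fs and device block sizes, while raw block devices use DEV_BSHIFT.
 */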
if (vp->v_type != VBLK) {
fs_bshift = vp->v_mount->mnt_fs_bshift;
dev_bshift = vp->v_mount->mnt_dev_bshift;
} else {
fs_bshift = DEV_BSHIFT;
dev_bshift = DEV_BSHIFT;
}
error = 0;
startoffset = off;
bytes = MIN(len, eof - startoffset);
skipbytes = 0;
KASSERT(bytes != 0);
if (iowrite) {
/*
* why += 2?
* 1 for biodone, 1 for uvm_aio_aiodone.
*/
mutex_enter(vp->v_interlock);
vp->v_numoutput += 2;
mutex_exit(vp->v_interlock);
}
mbp = getiobuf(vp, true);
UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
(uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
mbp->b_bufsize = len;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY | BC_AGE;
if (async) {
mbp->b_flags = brw | B_ASYNC;
mbp->b_iodone = iodone;
} else {
mbp->b_flags = brw;
mbp->b_iodone = NULL;
}
if (curlwp == uvm.pagedaemon_lwp)
BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
else if (async || lazy)
BIO_SETPRIO(mbp, BPRIO_TIMENONCRITICAL);
else
BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
bp = NULL;
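/*
 * Walk the range, issuing one (possibly nested) buffer per chunk that
 * VOP_BMAP reports as contiguous on the underlying device.  Holes are
 * skipped and, on reads, zero-filled.
 */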
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
int run;
daddr_t lbn, blkno;
struct vnode *devvp;
/*
* bmap the file to find out the blkno to read from and
* how much we can read in one i/o. if bmap returns an error,
* skip the rest of the top-level i/o.
*/
lbn = offset >> fs_bshift;
error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
if (error) {
UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
lbn, error, 0, 0);
skipbytes += bytes;
bytes = 0;
goto loopdone;
}
/*
* see how many pages can be read with this i/o.
* reduce the i/o size if necessary to avoid
* overwriting pages with valid data.
*/
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
/*
* if this block isn't allocated, zero it instead of
* reading it. unless we are going to allocate blocks,
* mark the pages we zeroed PG_RDONLY.
*/
if (blkno == (daddr_t)-1) {
if (!iowrite) {
memset((char *)kva + (offset - startoffset), 0,
iobytes);
}
skipbytes += iobytes;
continue;
}
/*
* allocate a sub-buf for this piece of the i/o
* (or just use mbp if there's only 1 piece),
* and start it going.
*/
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
} else {
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
bp = getiobuf(vp, true);
nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
}
bp->b_lblkno = 0;
/* adjust physical blkno for partial blocks */
bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
dev_bshift);
UVMHIST_LOG(ubchist,
"bp %#jx offset 0x%jx bcount 0x%jx blkno 0x%jx",
(uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
VOP_STRATEGY(devvp, bp);
}
loopdone:
if (skipbytes) {
UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
}
nestiobuf_done(mbp, skipbytes, error);
if (async) {
UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
return (0);
}
UVMHIST_LOG(ubchist, "waiting for mbp %#jx", (uintptr_t)mbp, 0, 0, 0);
error = biowait(mbp);
s = splbio();
(*iodone)(mbp);
splx(s);
UVMHIST_LOG(ubchist, "returning, error %jd", error, 0, 0, 0);
return (error);
}
int
genfs_compat_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
off_t origoffset;
struct vnode *vp = ap->a_vp;
struct uvm_object *uobj = &vp->v_uobj;
struct vm_page *pg, **pgs;
vaddr_t kva;
int i, error, orignpages, npages;
struct iovec iov;
struct uio uio;
kauth_cred_t cred = curlwp->l_cred;
const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
error = 0;
origoffset = ap->a_offset;
orignpages = *ap->a_count;
pgs = ap->a_m;
if (ap->a_flags & PGO_LOCKED) {
uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, NULL,
UFP_NOWAIT|UFP_NOALLOC| (memwrite ? UFP_NORDONLY : 0));
error = ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
return error;
}
if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) {
rw_exit(uobj->vmobjlock);
return EINVAL;
}
if ((ap->a_flags & PGO_SYNCIO) == 0) {
rw_exit(uobj->vmobjlock);
return 0;
}
npages = orignpages;
uvn_findpages(uobj, origoffset, &npages, pgs, NULL, UFP_ALL);
rw_exit(uobj->vmobjlock);
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
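/*
 * Fill in each page that does not yet hold valid data (PG_FAKE) by
 * reading it through the pager mapping with VOP_READ.
 */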
for (i = 0; i < npages; i++) {
pg = pgs[i];
if ((pg->flags & PG_FAKE) == 0) {
continue;
}
iov.iov_base = (char *)kva + (i << PAGE_SHIFT);
iov.iov_len = PAGE_SIZE;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = origoffset + (i << PAGE_SHIFT);
uio.uio_rw = UIO_READ;
uio.uio_resid = PAGE_SIZE;
UIO_SETUP_SYSSPACE(&uio);
/* XXX vn_lock */
error = VOP_READ(vp, &uio, 0, cred);
if (error) {
break;
}
if (uio.uio_resid) {
memset(iov.iov_base, 0, uio.uio_resid);
}
}
uvm_pagermapout(kva, npages);
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
pg = pgs[i];
if (error && (pg->flags & PG_FAKE) != 0) {
pg->flags |= PG_RELEASED;
} else {
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
if (error) {
uvm_page_unbusy(pgs, npages);
}
rw_exit(uobj->vmobjlock);
return error;
}
int
genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
off_t offset;
struct iovec iov;
struct uio uio;
kauth_cred_t cred = curlwp->l_cred;
struct buf *bp;
vaddr_t kva;
int error;
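/*
 * Write the pages via VOP_WRITE through a temporary pager mapping,
 * then fake up a buf and hand it to uvm_aio_aiodone() so the pages
 * are cleaned up through the normal pager completion path.
 */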
offset = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
iov.iov_base = (void *)kva;
iov.iov_len = npages << PAGE_SHIFT;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = offset;
uio.uio_rw = UIO_WRITE;
uio.uio_resid = npages << PAGE_SHIFT;
UIO_SETUP_SYSSPACE(&uio);
/* XXX vn_lock */
error = VOP_WRITE(vp, &uio, 0, cred);
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
bp = getiobuf(vp, true);
bp->b_cflags |= BC_BUSY | BC_AGE;
bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift;
bp->b_data = (char *)kva;
bp->b_bcount = npages << PAGE_SHIFT;
bp->b_bufsize = npages << PAGE_SHIFT;
bp->b_resid = 0;
bp->b_error = error;
uvm_aio_aiodone(bp);
return (error);
}
/*
* Process a uio using direct I/O. If we reach a part of the request
* which cannot be processed in this fashion for some reason, just return.
* The caller must handle some additional part of the request using
* buffered I/O before trying direct I/O again.
*/
void
genfs_directio(struct vnode *vp, struct uio *uio, int ioflag)
{
struct vmspace *vs;
struct iovec *iov;
vaddr_t va;
size_t len;
const int mask = DEV_BSIZE - 1;
int error;
bool need_wapbl = (vp->v_mount && vp->v_mount->mnt_wapbl &&
(ioflag & IO_JOURNALLOCKED) == 0);
#ifdef DIAGNOSTIC
if ((ioflag & IO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
/*
* We only support direct I/O to user space for now.
*/
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
return;
}
/*
* If the vnode is mapped, we would need to get the getpages lock
* to stabilize the bmap, but then we would get into trouble while
* locking the pages if the pages belong to this same vnode (or a
* multi-vnode cascade to the same effect). Just fall back to
* buffered I/O if the vnode is mapped to avoid this mess.
*/
if (vp->v_vflag & VV_MAPPED) {
return;
}
if (need_wapbl) {
error = WAPBL_BEGIN(vp->v_mount);
if (error)
return;
}
/*
* Do as much of the uio as possible with direct I/O.
*/
vs = uio->uio_vmspace;
while (uio->uio_resid) {
iov = uio->uio_iov;
if (iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
va = (vaddr_t)iov->iov_base;
len = MIN(iov->iov_len, genfs_maxdio);
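/* Round the chunk down to a multiple of DEV_BSIZE. */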
len &= ~mask;
/*
* If the next chunk is smaller than DEV_BSIZE or extends past
* the current EOF, then fall back to buffered I/O.
*/
if (len == 0 || uio->uio_offset + len > vp->v_size) {
break;
}
/*
* Check alignment. The file offset must be at least
* sector-aligned. The exact constraint on memory alignment
* is very hardware-dependent, but requiring sector-aligned
* addresses there too is safe.
*/
if (uio->uio_offset & mask || va & mask) {
break;
}
error = genfs_do_directio(vs, va, len, vp, uio->uio_offset,
uio->uio_rw);
if (error) {
break;
}
iov->iov_base = (char *)iov->iov_base + len;
iov->iov_len -= len;
uio->uio_offset += len;
uio->uio_resid -= len;
}
if (need_wapbl)
WAPBL_END(vp->v_mount);
}
/*
* Iodone routine for direct I/O. We don't do much here since the request is
* always synchronous, so the caller will do most of the work after biowait().
*/
static void
genfs_dio_iodone(struct buf *bp)
{
KASSERT((bp->b_flags & B_ASYNC) == 0);
if ((bp->b_flags & B_READ) == 0 && (bp->b_cflags & BC_AGE) != 0) {
mutex_enter(bp->b_objlock);
vwakeup(bp);
mutex_exit(bp->b_objlock);
}
putiobuf(bp);
}
/*
* Process one chunk of a direct I/O request.
*/
static int
genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp,
off_t off, enum uio_rw rw)
{
struct vm_map *map;
struct pmap *upm, *kpm __unused;
size_t klen = round_page(uva + len) - trunc_page(uva);
off_t spoff, epoff;
vaddr_t kva, puva;
paddr_t pa;
vm_prot_t prot;
int error, rv __diagused, poff, koff;
const int pgoflags = PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED |
(rw == UIO_WRITE ? PGO_FREE : 0);
/*
* For writes, verify that this range of the file already has fully
* allocated backing store. If there are any holes, just punt and
* make the caller take the buffered write path.
*/
if (rw == UIO_WRITE) {
daddr_t lbn, elbn, blkno;
int bsize, bshift, run;
bshift = vp->v_mount->mnt_fs_bshift;
bsize = 1 << bshift;
lbn = off >> bshift;
elbn = (off + len + bsize - 1) >> bshift;
while (lbn < elbn) {
error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
if (error) {
return error;
}
if (blkno == (daddr_t)-1) {
return ENOSPC;
}
lbn += 1 + run;
}
}
/*
* Flush any cached pages for parts of the file that we're about to
* access. If we're writing, invalidate pages as well.
*/
spoff = trunc_page(off);
epoff = round_page(off + len);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags);
if (error) {
return error;
}
/*
* Wire the user pages and remap them into kernel memory.
*/
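/*
 * A file read stores into the user buffer, so its pages must be
 * mapped writable; a file write only reads from them.
 */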
prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ;
error = uvm_vslock(vs, (void *)uva, len, prot);
if (error) {
return error;
}
map = &vs->vm_map;
upm = vm_map_pmap(map);
kpm = vm_map_pmap(kernel_map);
puva = trunc_page(uva);
kva = uvm_km_alloc(kernel_map, klen, atop(puva) & uvmexp.colormask,
UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH);
for (poff = 0; poff < klen; poff += PAGE_SIZE) {
rv = pmap_extract(upm, puva + poff, &pa);
KASSERT(rv);
pmap_kenter_pa(kva + poff, pa, prot, PMAP_WIRED);
}
pmap_update(kpm);
/*
* Do the I/O.
*/
koff = uva - trunc_page(uva);
error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw,
genfs_dio_iodone);
/*
* Tear down the kernel mapping.
*/
pmap_kremove(kva, klen);
pmap_update(kpm);
uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY);
/*
* Unwire the user pages.
*/
uvm_vsunlock(vs, (void *)uva, len);
return error;
}
/* $NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1997, 1998, 2000, 2001, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_mbuftrace.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/once.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/md5.h>
#include <sys/cprng.h>
#include <net/route.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6protosw.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_vtw.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_congctl.h>
#include <netinet/tcp_syncache.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#endif
struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
u_int32_t tcp_now; /* slow ticks, for RFC 1323 timestamps */
percpu_t *tcpstat_percpu;
/* patchable/settable parameters for tcp */
int tcp_mssdflt = TCP_MSS;
int tcp_minmss = TCP_MINMSS;
int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
int tcp_do_rfc1323 = 1; /* window scaling / timestamps (obsolete) */
int tcp_do_rfc1948 = 0; /* ISS by cryptographic hash */
int tcp_do_sack = 1; /* selective acknowledgement */
int tcp_do_win_scale = 1; /* RFC1323 window scaling */
int tcp_do_timestamps = 1; /* RFC1323 timestamps */
int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
int tcp_do_ecn = 0; /* Explicit Congestion Notification */
#ifndef TCP_INIT_WIN
#define TCP_INIT_WIN 4 /* initial slow start window */
#endif
#ifndef TCP_INIT_WIN_LOCAL
#define TCP_INIT_WIN_LOCAL 4 /* initial slow start window for local nets */
#endif
/*
* Up to 5 we scale linearly, to reach 3 * 1460; then (iw) * 1460.
* This is to simulate current behavior for iw == 4
*/
int tcp_init_win_max[] = {
1 * 1460,
1 * 1460,
2 * 1460,
2 * 1460,
3 * 1460,
5 * 1460,
6 * 1460,
7 * 1460,
8 * 1460,
9 * 1460,
10 * 1460
};
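/*
 * For example, with the default TCP_INIT_WIN of 4, tcp_init_win_max[4]
 * caps the initial window at 3 * 1460 bytes, i.e. roughly three
 * full-sized segments.
 */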
int tcp_init_win = TCP_INIT_WIN;
int tcp_init_win_local = TCP_INIT_WIN_LOCAL;
int tcp_mss_ifmtu = 0;
int tcp_rst_ppslim = 100; /* 100pps */
int tcp_ackdrop_ppslim = 100; /* 100pps */
int tcp_do_loopback_cksum = 0;
int tcp_do_abc = 1; /* RFC3465 Appropriate byte counting. */
int tcp_abc_aggressive = 1; /* 1: L=2*SMSS 0: L=1*SMSS */
int tcp_sack_tp_maxholes = 32;
int tcp_sack_globalmaxholes = 1024;
int tcp_sack_globalholes = 0;
int tcp_ecn_maxretries = 1;
int tcp_msl_enable = 1; /* enable TIME_WAIT truncation */
int tcp_msl_loop = PR_SLOWHZ; /* MSL for loopback */
int tcp_msl_local = 5 * PR_SLOWHZ; /* MSL for 'local' */
int tcp_msl_remote = TCPTV_MSL; /* MSL otherwise */
int tcp_msl_remote_threshold = TCPTV_SRTTDFLT; /* RTT threshold */
int tcp_rttlocal = 0; /* Use RTT to decide who's 'local' */
int tcp4_vtw_enable = 0; /* 1 to enable */
int tcp6_vtw_enable = 0; /* 1 to enable */
int tcp_vtw_was_enabled = 0;
int tcp_vtw_entries = 1 << 4; /* 16 vestigial TIME_WAIT entries */
/* tcb hash */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 128
#endif
int tcbhashsize = TCBHASHSIZE;
int tcp_freeq(struct tcpcb *);
static int tcp_iss_secret_init(void);
static void tcp_mtudisc_callback(struct in_addr);
#ifdef INET6
static void tcp6_mtudisc(struct inpcb *, int);
#endif
static struct pool tcpcb_pool;
static int tcp_drainwanted;
#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>
struct evcnt tcp_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum bad");
struct evcnt tcp_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum ok");
struct evcnt tcp_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum data");
struct evcnt tcp_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "swcsum");
EVCNT_ATTACH_STATIC(tcp_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp_swcsum);
#if defined(INET6)
struct evcnt tcp6_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum bad");
struct evcnt tcp6_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum ok");
struct evcnt tcp6_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum data");
struct evcnt tcp6_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "swcsum");
EVCNT_ATTACH_STATIC(tcp6_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp6_swcsum);
#endif /* defined(INET6) */
#endif /* TCP_CSUM_COUNTERS */
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>
struct evcnt tcp_output_bigheader = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output big header");
struct evcnt tcp_output_predict_hit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output predict hit");
struct evcnt tcp_output_predict_miss = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output predict miss");
struct evcnt tcp_output_copysmall = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output copy small");
struct evcnt tcp_output_copybig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output copy big");
struct evcnt tcp_output_refbig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output reference big");
EVCNT_ATTACH_STATIC(tcp_output_bigheader);
EVCNT_ATTACH_STATIC(tcp_output_predict_hit);
EVCNT_ATTACH_STATIC(tcp_output_predict_miss);
EVCNT_ATTACH_STATIC(tcp_output_copysmall);
EVCNT_ATTACH_STATIC(tcp_output_copybig);
EVCNT_ATTACH_STATIC(tcp_output_refbig);
#endif /* TCP_OUTPUT_COUNTERS */
#ifdef TCP_REASS_COUNTERS
#include <sys/device.h>
struct evcnt tcp_reass_ = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp_reass", "calls");
struct evcnt tcp_reass_empty = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert into empty queue");
struct evcnt tcp_reass_iteration[8] = {
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", ">7 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "1 iteration"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "2 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "3 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "4 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "5 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "6 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "7 iterations"),
};
struct evcnt tcp_reass_prependfirst = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "prepend to first");
struct evcnt tcp_reass_prepend = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "prepend");
struct evcnt tcp_reass_insert = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert");
struct evcnt tcp_reass_inserttail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert at tail");
struct evcnt tcp_reass_append = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "append");
struct evcnt tcp_reass_appendtail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "append to tail fragment");
struct evcnt tcp_reass_overlaptail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "overlap at end");
struct evcnt tcp_reass_overlapfront = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "overlap at start");
struct evcnt tcp_reass_segdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "duplicate segment");
struct evcnt tcp_reass_fragdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "duplicate fragment");
EVCNT_ATTACH_STATIC(tcp_reass_);
EVCNT_ATTACH_STATIC(tcp_reass_empty);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 0);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 1);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 2);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 3);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 4);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 5);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 6);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 7);
EVCNT_ATTACH_STATIC(tcp_reass_prependfirst);
EVCNT_ATTACH_STATIC(tcp_reass_prepend);
EVCNT_ATTACH_STATIC(tcp_reass_insert);
EVCNT_ATTACH_STATIC(tcp_reass_inserttail);
EVCNT_ATTACH_STATIC(tcp_reass_append);
EVCNT_ATTACH_STATIC(tcp_reass_appendtail);
EVCNT_ATTACH_STATIC(tcp_reass_overlaptail);
EVCNT_ATTACH_STATIC(tcp_reass_overlapfront);
EVCNT_ATTACH_STATIC(tcp_reass_segdup);
EVCNT_ATTACH_STATIC(tcp_reass_fragdup);
#endif /* TCP_REASS_COUNTERS */
#ifdef MBUFTRACE
struct mowner tcp_mowner = MOWNER_INIT("tcp", "");
struct mowner tcp_rx_mowner = MOWNER_INIT("tcp", "rx");
struct mowner tcp_tx_mowner = MOWNER_INIT("tcp", "tx");
struct mowner tcp_sock_mowner = MOWNER_INIT("tcp", "sock");
struct mowner tcp_sock_rx_mowner = MOWNER_INIT("tcp", "sock rx");
struct mowner tcp_sock_tx_mowner = MOWNER_INIT("tcp", "sock tx");
#endif
static int
do_tcpinit(void)
{
inpcb_init(&tcbtable, tcbhashsize, tcbhashsize);
pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl",
NULL, IPL_SOFTNET);
tcp_usrreq_init();
/* Initialize timer state. */
tcp_timer_init();
/* Initialize the compressed state engine. */
syn_cache_init();
/* Initialize the congestion control algorithms. */
tcp_congctl_init();
/* Initialize the TCPCB template. */
tcp_tcpcb_template();
/* Initialize reassembly queue */
tcpipqent_init();
/* SACK */
tcp_sack_init();
MOWNER_ATTACH(&tcp_tx_mowner);
MOWNER_ATTACH(&tcp_rx_mowner);
MOWNER_ATTACH(&tcp_reass_mowner);
MOWNER_ATTACH(&tcp_sock_mowner);
MOWNER_ATTACH(&tcp_sock_tx_mowner);
MOWNER_ATTACH(&tcp_sock_rx_mowner);
MOWNER_ATTACH(&tcp_mowner);
tcpstat_percpu = percpu_alloc(sizeof(uint64_t) * TCP_NSTATS);
vtw_earlyinit();
tcp_slowtimo_init();
return 0;
}
void
tcp_init_common(unsigned basehlen)
{
static ONCE_DECL(dotcpinit);
unsigned hlen = basehlen + sizeof(struct tcphdr);
unsigned oldhlen;
if (max_linkhdr + hlen > MHLEN)
panic("tcp_init");
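/*
 * Raise max_protohdr to cover our header if it is currently smaller;
 * the CAS loop avoids losing a concurrent update.
 */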
while ((oldhlen = max_protohdr) < hlen)
atomic_cas_uint(&max_protohdr, oldhlen, hlen);
RUN_ONCE(&dotcpinit, do_tcpinit);
}
/*
* Tcp initialization
*/
void
tcp_init(void)
{
icmp_mtudisc_callback_register(tcp_mtudisc_callback);
tcp_init_common(sizeof(struct ip));
}
/*
* Create template to be used to send tcp packets on a connection.
* Call after host entry created, allocates an mbuf and fills
* in a skeletal tcp/ip header, minimizing the amount of work
* necessary when the connection is used.
*/
struct mbuf *
tcp_template(struct tcpcb *tp)
{
struct inpcb *inp = tp->t_inpcb;
struct tcphdr *n;
struct mbuf *m;
int hlen;
switch (tp->t_family) {
case AF_INET:
hlen = sizeof(struct ip);
if (inp->inp_af == AF_INET)
break;
#ifdef INET6
if (inp->inp_af == AF_INET6) {
/* mapped addr case */
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)))
break;
}
#endif
return NULL; /*EINVAL*/
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
if (inp != NULL) {
/* more sanity checks? */
break;
}
return NULL; /*EINVAL*/
#endif
default:
return NULL; /*EAFNOSUPPORT*/
}
KASSERT(hlen + sizeof(struct tcphdr) <= MCLBYTES);
m = tp->t_template;
if (m && m->m_len == hlen + sizeof(struct tcphdr)) {
;
} else {
if (m)
m_freem(m);
m = tp->t_template = NULL;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m && hlen + sizeof(struct tcphdr) > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return NULL;
MCLAIM(m, &tcp_mowner);
m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr);
}
memset(mtod(m, void *), 0, m->m_len);
n = (struct tcphdr *)(mtod(m, char *) + hlen);
switch (tp->t_family) {
case AF_INET:
{
struct ipovly *ipov;
mtod(m, struct ip *)->ip_v = 4;
mtod(m, struct ip *)->ip_hl = hlen >> 2;
ipov = mtod(m, struct ipovly *);
ipov->ih_pr = IPPROTO_TCP;
ipov->ih_len = htons(sizeof(struct tcphdr));
if (inp->inp_af == AF_INET) {
ipov->ih_src = in4p_laddr(inp);
ipov->ih_dst = in4p_faddr(inp);
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
/* mapped addr case */
bcopy(&in6p_laddr(inp).s6_addr32[3], &ipov->ih_src,
sizeof(ipov->ih_src));
bcopy(&in6p_faddr(inp).s6_addr32[3], &ipov->ih_dst,
sizeof(ipov->ih_dst));
}
#endif
/*
* Compute the pseudo-header portion of the checksum
* now. We incrementally add in the TCP option and
* payload lengths later, and then compute the TCP
* checksum right before the packet is sent off onto
* the wire.
*/
n->th_sum = in_cksum_phdr(ipov->ih_src.s_addr,
ipov->ih_dst.s_addr,
htons(sizeof(struct tcphdr) + IPPROTO_TCP));
break;
}
#ifdef INET6
case AF_INET6:
{
struct ip6_hdr *ip6;
mtod(m, struct ip *)->ip_v = 6;
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_plen = htons(sizeof(struct tcphdr));
ip6->ip6_src = in6p_laddr(inp);
ip6->ip6_dst = in6p_faddr(inp);
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
if (ip6_auto_flowlabel) {
ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
ip6->ip6_flow |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
}
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/*
* Compute the pseudo-header portion of the checksum
* now. We incrementally add in the TCP option and
* payload lengths later, and then compute the TCP
* checksum right before the packet is sent off onto
* the wire.
*/
n->th_sum = in6_cksum_phdr(&in6p_laddr(inp),
&in6p_faddr(inp), htonl(sizeof(struct tcphdr)),
htonl(IPPROTO_TCP));
break;
}
#endif
}
n->th_sport = inp->inp_lport;
n->th_dport = inp->inp_fport;
n->th_seq = 0;
n->th_ack = 0;
n->th_x2 = 0;
n->th_off = 5;
n->th_flags = 0;
n->th_win = 0;
n->th_urp = 0;
return m;
}
/*
* Send a single message to the TCP at address specified by
* the given TCP/IP header. If m == 0, then we make a copy
* of the tcpiphdr at ti and send directly to the addressed host.
* This is used to force keep alive messages out using the TCP
* template for a connection tp->t_template. If flags are given
* then we send a message back to the TCP which originated the
* segment ti, and discard the mbuf containing it and any other
* attached mbufs.
*
* In any case the ack and sequence number of the transmitted
* segment are as specified by the parameters.
*/
int
tcp_respond(struct tcpcb *tp, struct mbuf *mtemplate, struct mbuf *m,
struct tcphdr *th0, tcp_seq ack, tcp_seq seq, int flags)
{
struct route *ro;
int error, tlen, win = 0;
int hlen;
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
int family; /* family on packet, not inpcb! */
struct tcphdr *th;
if (tp != NULL && (flags & TH_RST) == 0) {
KASSERT(tp->t_inpcb != NULL);
win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
}
th = NULL; /* Quell uninitialized warning */
ip = NULL;
#ifdef INET6
ip6 = NULL;
#endif
if (m == NULL) {
if (!mtemplate)
return EINVAL;
/* get family information from template */
switch (mtod(mtemplate, struct ip *)->ip_v) {
case 4:
family = AF_INET;
hlen = sizeof(struct ip);
break;
#ifdef INET6
case 6:
family = AF_INET6;
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
return EAFNOSUPPORT;
}
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m) {
MCLAIM(m, &tcp_tx_mowner);
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return ENOBUFS;
tlen = 0;
m->m_data += max_linkhdr;
bcopy(mtod(mtemplate, void *), mtod(m, void *),
mtemplate->m_len);
switch (family) {
case AF_INET:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
break;
#endif
}
flags = TH_ACK;
} else {
if ((m->m_flags & M_PKTHDR) == 0) {
m_freem(m);
return EINVAL;
}
KASSERT(th0 != NULL);
/* get family information from m */
switch (mtod(m, struct ip *)->ip_v) {
case 4:
family = AF_INET;
hlen = sizeof(struct ip);
ip = mtod(m, struct ip *);
break;
#ifdef INET6
case 6:
family = AF_INET6;
hlen = sizeof(struct ip6_hdr);
ip6 = mtod(m, struct ip6_hdr *);
break;
#endif
default:
m_freem(m);
return EAFNOSUPPORT;
}
/* clear h/w csum flags inherited from rx packet */
m->m_pkthdr.csum_flags = 0;
if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2))
tlen = sizeof(*th0);
else
tlen = th0->th_off << 2;
if (m->m_len > hlen + tlen && (m->m_flags & M_EXT) == 0 &&
mtod(m, char *) + hlen == (char *)th0) {
m->m_len = hlen + tlen;
m_freem(m->m_next);
m->m_next = NULL;
} else {
struct mbuf *n;
KASSERT(max_linkhdr + hlen + tlen <= MCLBYTES);
MGETHDR(n, M_DONTWAIT, MT_HEADER);
if (n && max_linkhdr + hlen + tlen > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n) {
m_freem(m);
return ENOBUFS;
}
MCLAIM(n, &tcp_tx_mowner);
n->m_data += max_linkhdr;
n->m_len = hlen + tlen;
m_copyback(n, 0, hlen, mtod(m, void *));
m_copyback(n, hlen, tlen, (void *)th0);
m_freem(m);
m = n;
n = NULL;
}
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
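/*
 * Swap the source and destination addresses and ports so the reply is
 * directed back at the originator of the segment we are answering.
 */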
switch (family) {
case AF_INET:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
ip->ip_p = IPPROTO_TCP;
xchg(ip->ip_dst, ip->ip_src, struct in_addr);
ip->ip_p = IPPROTO_TCP;
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
ip6->ip6_nxt = IPPROTO_TCP;
xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
ip6->ip6_nxt = IPPROTO_TCP;
break;
#endif
}
xchg(th->th_dport, th->th_sport, u_int16_t);
#undef xchg
tlen = 0; /*be friendly with the following code*/
}
th->th_seq = htonl(seq);
th->th_ack = htonl(ack);
th->th_x2 = 0;
if ((flags & TH_SYN) == 0) {
if (tp)
win >>= tp->rcv_scale;
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
th->th_win = htons((u_int16_t)win);
th->th_off = sizeof (struct tcphdr) >> 2;
tlen += sizeof(*th);
} else {
tlen += th->th_off << 2;
}
m->m_len = hlen + tlen;
m->m_pkthdr.len = hlen + tlen;
m_reset_rcvif(m);
th->th_flags = flags;
th->th_urp = 0;
switch (family) {
case AF_INET:
{
struct ipovly *ipov = (struct ipovly *)ip;
memset(ipov->ih_x1, 0, sizeof ipov->ih_x1);
ipov->ih_len = htons((u_int16_t)tlen);
th->th_sum = 0;
th->th_sum = in_cksum(m, hlen + tlen);
ip->ip_len = htons(hlen + tlen);
ip->ip_ttl = ip_defttl;
break;
}
#ifdef INET6
case AF_INET6:
{
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
tlen);
ip6->ip6_plen = htons(tlen);
if (tp && tp->t_inpcb->inp_af == AF_INET6)
ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb);
else
ip6->ip6_hlim = ip6_defhlim;
ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
if (ip6_auto_flowlabel) {
ip6->ip6_flow |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
}
break;
}
#endif
}
if (tp != NULL && tp->t_inpcb->inp_af == AF_INET) {
ro = &tp->t_inpcb->inp_route;
KASSERT(family == AF_INET);
KASSERT(in_hosteq(ip->ip_dst, in4p_faddr(tp->t_inpcb)));
}
#ifdef INET6
else if (tp != NULL && tp->t_inpcb->inp_af == AF_INET6) {
ro = (struct route *)&tp->t_inpcb->inp_route;
#ifdef DIAGNOSTIC
if (family == AF_INET) {
if (!IN6_IS_ADDR_V4MAPPED(&in6p_faddr(tp->t_inpcb)))
panic("tcp_respond: not mapped addr");
if (memcmp(&ip->ip_dst,
&in6p_faddr(tp->t_inpcb).s6_addr32[3],
sizeof(ip->ip_dst)) != 0) {
panic("tcp_respond: ip_dst != in6p_faddr");
}
} else if (family == AF_INET6) {
if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&in6p_faddr(tp->t_inpcb)))
panic("tcp_respond: ip6_dst != in6p_faddr");
} else
panic("tcp_respond: address family mismatch");
#endif
}
#endif
else
ro = NULL;
switch (family) {
case AF_INET:
error = ip_output(m, NULL, ro,
(tp && tp->t_mtudisc ? IP_MTUDISC : 0), NULL,
tp ? tp->t_inpcb : NULL);
break;
#ifdef INET6
case AF_INET6:
error = ip6_output(m, NULL, ro, 0, NULL,
tp ? tp->t_inpcb : NULL, NULL);
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
return error;
}
/*
* Template TCPCB. Rather than zeroing a new TCPCB and initializing
* a bunch of members individually, we maintain this template for the
* static and mostly-static components of the TCPCB, and copy it into
* the new TCPCB instead.
*/
static struct tcpcb tcpcb_template = {
.t_srtt = TCPTV_SRTTBASE,
.t_rttmin = TCPTV_MIN,
.snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT,
.snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT,
.snd_numholes = 0,
.snd_cubic_wmax = 0,
.snd_cubic_wmax_last = 0,
.snd_cubic_ctime = 0,
.t_partialacks = -1,
.t_bytes_acked = 0,
.t_sndrexmitpack = 0,
.t_rcvoopack = 0,
.t_sndzerowin = 0,
};
/*
* Updates the TCPCB template whenever a parameter that would affect
* the template is changed.
*/
void
tcp_tcpcb_template(void)
{
struct tcpcb *tp = &tcpcb_template;
int flags;
tp->t_peermss = tcp_mssdflt;
tp->t_ourmss = tcp_mssdflt;
tp->t_segsz = tcp_mssdflt;
flags = 0;
if (tcp_do_rfc1323 && tcp_do_win_scale)
flags |= TF_REQ_SCALE;
if (tcp_do_rfc1323 && tcp_do_timestamps)
flags |= TF_REQ_TSTMP;
tp->t_flags = flags;
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
* reasonable initial retransmit time.
*/
tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
TCPTV_MIN, TCPTV_REXMTMAX);
/* Keep Alive */
tp->t_keepinit = MIN(tcp_keepinit, TCP_TIMER_MAXTICKS);
tp->t_keepidle = MIN(tcp_keepidle, TCP_TIMER_MAXTICKS);
tp->t_keepintvl = MIN(tcp_keepintvl, TCP_TIMER_MAXTICKS);
tp->t_keepcnt = MAX(1, MIN(tcp_keepcnt, TCP_TIMER_MAXTICKS));
tp->t_maxidle = tp->t_keepcnt * MIN(tp->t_keepintvl,
TCP_TIMER_MAXTICKS/tp->t_keepcnt);
/* MSL */
tp->t_msl = TCPTV_MSL;
}
/*
* Create a new TCP control block, making an
* empty reassembly queue and hooking it to the argument
* protocol control block.
*/
struct tcpcb *
tcp_newtcpcb(int family, struct inpcb *inp)
{
struct tcpcb *tp;
int i;
/* XXX Consider using a pool_cache for speed. */
tp = pool_get(&tcpcb_pool, PR_NOWAIT); /* splsoftnet via tcp_usrreq */
if (tp == NULL)
return NULL;
memcpy(tp, &tcpcb_template, sizeof(*tp));
TAILQ_INIT(&tp->segq);
TAILQ_INIT(&tp->timeq);
tp->t_family = family; /* may be overridden later on */
TAILQ_INIT(&tp->snd_holes);
LIST_INIT(&tp->t_sc); /* XXX can template this */
/* Don't sweat this loop; hopefully the compiler will unroll it. */
for (i = 0; i < TCPT_NTIMERS; i++) {
callout_init(&tp->t_timer[i], CALLOUT_MPSAFE);
TCP_TIMER_INIT(tp, i);
}
callout_init(&tp->t_delack_ch, CALLOUT_MPSAFE);
switch (family) {
case AF_INET:
in4p_ip(inp).ip_ttl = ip_defttl;
inp->inp_ppcb = (void *)tp;
tp->t_inpcb = inp;
tp->t_mtudisc = ip_mtudisc;
break;
#ifdef INET6
case AF_INET6:
in6p_ip6(inp).ip6_hlim = in6pcb_selecthlim_rt(inp);
inp->inp_ppcb = (void *)tp;
tp->t_inpcb = inp;
/* for IPv6, always try to run path MTU discovery */
tp->t_mtudisc = 1;
break;
#endif /* INET6 */
default:
for (i = 0; i < TCPT_NTIMERS; i++)
callout_destroy(&tp->t_timer[i]);
callout_destroy(&tp->t_delack_ch);
pool_put(&tcpcb_pool, tp); /* splsoftnet via tcp_usrreq */
return NULL;
}
/*
* Initialize our timebase. When we send timestamps, we take
* the delta from tcp_now -- this means each connection always
* gets a timebase of 1, which makes it, among other things,
* more difficult to determine how long a system has been up,
* and thus how many TCP sequence increments have occurred.
*
* We start with 1, because 0 doesn't work with linux, which
* considers timestamp 0 in a SYN packet as a bug and disables
* timestamps.
*/
tp->ts_timebase = tcp_now - 1;
tcp_congctl_select(tp, tcp_congctl_global_name);
return tp;
}
/*
* Drop a TCP connection, reporting
* the specified error. If connection is synchronized,
* then send a RST to peer.
*/
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
struct socket *so;
KASSERT(tp->t_inpcb != NULL);
so = tp->t_inpcb->inp_socket;
if (so == NULL)
return NULL;
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_state = TCPS_CLOSED;
(void) tcp_output(tp);
TCP_STATINC(TCP_STAT_DROPS);
} else
TCP_STATINC(TCP_STAT_CONNDROPS);
if (errno == ETIMEDOUT && tp->t_softerror)
errno = tp->t_softerror;
so->so_error = errno;
return (tcp_close(tp));
}
/*
* Close a TCP control block:
* discard all space held by the tcp
* discard internet protocol block
* wake up any sleepers
*/
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
struct inpcb *inp;
struct socket *so;
#ifdef RTV_RTT
struct rtentry *rt = NULL;
#endif
struct route *ro;
int j;
inp = tp->t_inpcb;
so = inp->inp_socket;
ro = &inp->inp_route;
#ifdef RTV_RTT
/*
* If we sent enough data to get some meaningful characteristics,
* save them in the routing entry. 'Enough' is arbitrarily
* defined as the sendpipesize (default 4K) * 16. This would
* give us 16 rtt samples assuming we only get one sample per
* window (the usual case on a long haul net). 16 samples is
* enough for the srtt filter to converge to within 5% of the correct
* value; fewer samples and we could save a very bogus rtt.
*
* Don't update the default route's characteristics and don't
* update anything that the user "locked".
*/
if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
ro && (rt = rtcache_validate(ro)) != NULL &&
!in_nullhost(satocsin(rt_getkey(rt))->sin_addr)) {
u_long i = 0;
if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
i = tp->t_srtt *
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
if (rt->rt_rmx.rmx_rtt && i)
/*
* filter this update to half the old & half
* the new values, converting scale.
* See route.h and tcp_var.h for a
* description of the scaling constants.
*/
rt->rt_rmx.rmx_rtt =
(rt->rt_rmx.rmx_rtt + i) / 2;
else
rt->rt_rmx.rmx_rtt = i;
}
if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
i = tp->t_rttvar *
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
if (rt->rt_rmx.rmx_rttvar && i)
rt->rt_rmx.rmx_rttvar =
(rt->rt_rmx.rmx_rttvar + i) / 2;
else
rt->rt_rmx.rmx_rttvar = i;
}
/*
* update the pipelimit (ssthresh) if it has been updated
* already or if a pipesize was specified & the threshold
* got below half the pipesize. I.e., wait for bad news
* before we start updating, then update on both good
* and bad news.
*/
if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
(i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
i < (rt->rt_rmx.rmx_sendpipe / 2)) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
i = (i + tp->t_segsz / 2) / tp->t_segsz;
if (i < 2)
i = 2;
i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr));
if (rt->rt_rmx.rmx_ssthresh)
rt->rt_rmx.rmx_ssthresh =
(rt->rt_rmx.rmx_ssthresh + i) / 2;
else
rt->rt_rmx.rmx_ssthresh = i;
}
}
rtcache_unref(rt, ro);
#endif /* RTV_RTT */
/* free the reassembly queue, if any */
TCP_REASS_LOCK(tp);
(void) tcp_freeq(tp);
TCP_REASS_UNLOCK(tp);
/* free the SACK holes list. */
tcp_free_sackholes(tp);
tcp_congctl_release(tp);
syn_cache_cleanup(tp);
if (tp->t_template) {
m_free(tp->t_template);
tp->t_template = NULL;
}
/*
* Detaching the pcb will unlock the socket/tcpcb, and stopping
* the timers can also drop the lock. We need to prevent access
* to the tcpcb as it's half torn down. Flag the pcb as dead
* (prevents access by timers) and only then detach it.
*/
tp->t_flags |= TF_DEAD;
inp->inp_ppcb = NULL;
soisdisconnected(so);
inpcb_destroy(inp);
/*
* pcb is no longer visible elsewhere, so we can safely release
* the lock in callout_halt() if needed.
*/
TCP_STATINC(TCP_STAT_CLOSED);
for (j = 0; j < TCPT_NTIMERS; j++) {
callout_halt(&tp->t_timer[j], softnet_lock);
callout_destroy(&tp->t_timer[j]);
}
callout_halt(&tp->t_delack_ch, softnet_lock);
callout_destroy(&tp->t_delack_ch);
pool_put(&tcpcb_pool, tp);
return NULL;
}
int
tcp_freeq(struct tcpcb *tp)
{
struct ipqent *qe;
int rv = 0;
TCP_REASS_LOCK_CHECK(tp);
while ((qe = TAILQ_FIRST(&tp->segq)) != NULL) {
TAILQ_REMOVE(&tp->segq, qe, ipqe_q);
TAILQ_REMOVE(&tp->timeq, qe, ipqe_timeq);
m_freem(qe->ipqe_m);
tcpipqent_free(qe);
rv = 1;
}
tp->t_segqlen = 0;
KASSERT(TAILQ_EMPTY(&tp->timeq));
return (rv);
}
void
tcp_fasttimo(void)
{
if (tcp_drainwanted) {
tcp_drain();
tcp_drainwanted = 0;
}
}
void
tcp_drainstub(void)
{
tcp_drainwanted = 1;
}
/*
* Protocol drain routine. Called when memory is in short supply.
* Called from pr_fasttimo thus a callout context.
*/
void
tcp_drain(void)
{
struct inpcb *inp;
struct tcpcb *tp;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
/*
* Free the sequence queue of all TCP connections.
*/
TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) {
tp = intotcpcb(inp);
if (tp != NULL) {
/*
* If the tcpcb is already busy,
* just bail out now.
*/
if (tcp_reass_lock_try(tp) == 0)
continue;
if (tcp_freeq(tp))
TCP_STATINC(TCP_STAT_CONNSDRAINED);
TCP_REASS_UNLOCK(tp);
}
}
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Notify a tcp user of an asynchronous error;
* store error as soft error, but wake up user
* (for now, won't do anything until can select for soft error).
*/
void
tcp_notify(struct inpcb *inp, int error)
{
struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
struct socket *so = inp->inp_socket;
/*
* Ignore some errors if we are hooked up.
* If connection hasn't completed, has retransmitted several times,
* and receives a second error, give up now. This is better
* than waiting a long time to establish a connection that
* can never complete.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
(error == EHOSTUNREACH || error == ENETUNREACH ||
error == EHOSTDOWN)) {
return;
} else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
tp->t_rxtshift > 3 && tp->t_softerror)
so->so_error = error;
else
tp->t_softerror = error;
cv_broadcast(&so->so_cv);
sorwakeup(so);
sowwakeup(so);
}
#ifdef INET6
void *
tcp6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct tcphdr th;
void (*notify)(struct inpcb *, int) = tcp_notify;
int nmatch;
struct ip6_hdr *ip6;
const struct sockaddr_in6 *sa6_src = NULL;
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
struct mbuf *m;
int off;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
else if (cmd == PRC_QUENCH) {
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return NULL;
} else if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
sa6_src = &sa6_any;
off = 0;
}
if (ip6) {
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(th)) {
if (cmd == PRC_MSGSIZE)
icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
return NULL;
}
memset(&th, 0, sizeof(th));
m_copydata(m, off, sizeof(th), (void *)&th);
if (cmd == PRC_MSGSIZE) {
int valid = 0;
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMPv6 message
* payload.
*/
if (in6pcb_lookup(&tcbtable, &sa6->sin6_addr,
th.th_dport,
(const struct in6_addr *)&sa6_src->sin6_addr,
th.th_sport, 0, 0))
valid++;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* no need to call in6pcb_notify, it should have been
* called via callback if necessary
*/
return NULL;
}
nmatch = in6pcb_notify(&tcbtable, sa, th.th_dport,
(const struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify);
if (nmatch == 0 && syn_cache_count &&
(inet6ctlerrmap[cmd] == EHOSTUNREACH ||
inet6ctlerrmap[cmd] == ENETUNREACH ||
inet6ctlerrmap[cmd] == EHOSTDOWN))
syn_cache_unreach((const struct sockaddr *)sa6_src,
sa, &th);
} else {
(void) in6pcb_notify(&tcbtable, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
}
return NULL;
}
#endif
/* assumes that ip header and tcp header are contiguous on mbuf */
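/*
 * Handle ICMP control input for TCP: map the PRC_* command to an errno,
 * special-case source quench, redirects, host-dead and needs-frag
 * (PMTUD) indications, and notify the matching PCBs and SYN cache
 * entries.
 */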
void *
tcp_ctlinput(int cmd, const struct sockaddr *sa, void *v)
{
struct ip *ip = v;
struct tcphdr *th;
struct icmp *icp;
extern const int inetctlerrmap[];
void (*notify)(struct inpcb *, int) = tcp_notify;
int errno;
int nmatch;
struct tcpcb *tp;
u_int mtu;
tcp_seq seq;
struct inpcb *inp;
#ifdef INET6
struct in6_addr src6, dst6;
#endif
if (sa->sa_family != AF_INET ||
sa->sa_len != sizeof(struct sockaddr_in))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
errno = inetctlerrmap[cmd];
if (cmd == PRC_QUENCH)
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return NULL;
else if (PRC_IS_REDIRECT(cmd))
notify = inpcb_rtchange, ip = 0;
else if (cmd == PRC_MSGSIZE && ip && ip->ip_v == 4) {
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMP message
* payload.
*
* Boundary check is made in icmp_input(), with ICMP_ADVLENMIN.
*/
th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
#ifdef INET6
in6_in_2_v4mapin6(&ip->ip_src, &src6);
in6_in_2_v4mapin6(&ip->ip_dst, &dst6);
#endif
if ((inp = inpcb_lookup(&tcbtable, ip->ip_dst,
th->th_dport, ip->ip_src, th->th_sport, 0)) != NULL)
;
#ifdef INET6
else if ((inp = in6pcb_lookup(&tcbtable, &dst6,
th->th_dport, &src6, th->th_sport, 0, 0)) != NULL)
;
#endif
else
return NULL;
/*
* Now that we've validated that we are actually communicating
* with the host indicated in the ICMP message, locate the
* ICMP header, recalculate the new MTU, and create the
* corresponding routing entry.
*/
icp = (struct icmp *)((char *)ip -
offsetof(struct icmp, icmp_ip));
tp = intotcpcb(inp);
if (tp == NULL)
return NULL;
seq = ntohl(th->th_seq);
if (SEQ_LT(seq, tp->snd_una) || SEQ_GT(seq, tp->snd_max))
return NULL;
/*
* If the ICMP message advertises a Next-Hop MTU
* equal or larger than the maximum packet size we have
* ever sent, drop the message.
*/
mtu = (u_int)ntohs(icp->icmp_nextmtu);
if (mtu >= tp->t_pmtud_mtu_sent)
return NULL;
if (mtu >= tcp_hdrsz(tp) + tp->t_pmtud_mss_acked) {
/*
* Calculate new MTU, and create corresponding
* route (traditional PMTUD).
*/
tp->t_flags &= ~TF_PMTUD_PEND;
icmp_mtudisc(icp, ip->ip_dst);
} else {
/*
* Record the information got in the ICMP
* message; act on it later.
* If we had already recorded an ICMP message,
* replace the old one only if the new message
* refers to an older TCP segment
*/
if (tp->t_flags & TF_PMTUD_PEND) {
if (SEQ_LT(tp->t_pmtud_th_seq, seq))
return NULL;
} else
tp->t_flags |= TF_PMTUD_PEND;
tp->t_pmtud_th_seq = seq;
tp->t_pmtud_nextmtu = icp->icmp_nextmtu;
tp->t_pmtud_ip_len = icp->icmp_ip.ip_len;
tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl;
}
return NULL;
} else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if (errno == 0)
return NULL;
if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) {
th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
nmatch = inpcb_notify(&tcbtable, satocsin(sa)->sin_addr,
th->th_dport, ip->ip_src, th->th_sport, errno, notify);
if (nmatch == 0 && syn_cache_count &&
(inetctlerrmap[cmd] == EHOSTUNREACH ||
inetctlerrmap[cmd] == ENETUNREACH ||
inetctlerrmap[cmd] == EHOSTDOWN)) {
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_port = th->th_sport;
sin.sin_addr = ip->ip_src;
syn_cache_unreach((struct sockaddr *)&sin, sa, th);
}
/* XXX mapped address case */
} else
inpcb_notifyall(&tcbtable, satocsin(sa)->sin_addr, errno,
notify);
return NULL;
}
/*
* When a source quench is received, we are being notified of congestion.
* Close the congestion window down to the Loss Window (one segment).
* We will gradually open it again as we proceed.
*/
void
tcp_quench(struct inpcb *inp)
{
struct tcpcb *tp = intotcpcb(inp);
if (tp) {
tp->snd_cwnd = tp->t_segsz;
tp->t_bytes_acked = 0;
}
}
/*
* Path MTU Discovery handlers.
*/
void
tcp_mtudisc_callback(struct in_addr faddr)
{
#ifdef INET6
struct in6_addr in6;
#endif
inpcb_notifyall(&tcbtable, faddr, EMSGSIZE, tcp_mtudisc);
#ifdef INET6
in6_in_2_v4mapin6(&faddr, &in6);
tcp6_mtudisc_callback(&in6);
#endif
}
/*
* On receipt of path MTU corrections, flush old route and replace it
* with the new one. Retransmit all unacknowledged packets, to ensure
* that all packets will be received.
*/
void
tcp_mtudisc(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
if (tp == NULL)
return;
rt = inpcb_rtentry(inp);
if (rt != NULL) {
/*
* If this was not a host route, remove and realloc.
*/
if ((rt->rt_flags & RTF_HOST) == 0) {
inpcb_rtentry_unref(rt, inp);
inpcb_rtchange(inp, errno);
if ((rt = inpcb_rtentry(inp)) == NULL)
return;
}
/*
* Slow start out of the error condition. We
* use the MTU because we know it's smaller
* than the previously transmitted segment.
*
* Note: This is more conservative than the
* suggestion in draft-floyd-incr-init-win-03.
*/
if (rt->rt_rmx.rmx_mtu != 0)
tp->snd_cwnd =
TCP_INITIAL_WINDOW(tcp_init_win,
rt->rt_rmx.rmx_mtu);
inpcb_rtentry_unref(rt, inp);
}
/*
* Resend unacknowledged packets.
*/
tp->snd_nxt = tp->sack_newdata = tp->snd_una;
tcp_output(tp);
}
#ifdef INET6
/*
* Path MTU Discovery handlers.
*/
void
tcp6_mtudisc_callback(struct in6_addr *faddr)
{
struct sockaddr_in6 sin6;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_addr = *faddr;
(void) in6pcb_notify(&tcbtable, (struct sockaddr *)&sin6, 0,
(const struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp6_mtudisc);
}
void
tcp6_mtudisc(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
if (tp == NULL)
return;
rt = in6pcb_rtentry(inp);
if (rt != NULL) {
/*
* If this was not a host route, remove and realloc.
*/
if ((rt->rt_flags & RTF_HOST) == 0) {
in6pcb_rtentry_unref(rt, inp);
in6pcb_rtchange(inp, errno);
rt = in6pcb_rtentry(inp);
if (rt == NULL)
return;
}
/*
* Slow start out of the error condition. We
* use the MTU because we know it's smaller
* than the previously transmitted segment.
*
* Note: This is more conservative than the
* suggestion in draft-floyd-incr-init-win-03.
*/
if (rt->rt_rmx.rmx_mtu != 0) {
tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win,
rt->rt_rmx.rmx_mtu);
}
in6pcb_rtentry_unref(rt, inp);
}
/*
* Resend unacknowledged packets.
*/
tp->snd_nxt = tp->sack_newdata = tp->snd_una;
tcp_output(tp);
}
#endif /* INET6 */
/*
* Compute the MSS to advertise to the peer. Called only during
* the 3-way handshake. If we are the server (peer initiated
* connection), we are called with a pointer to the interface
* on which the SYN packet arrived. If we are the client (we
* initiated connection), we are called with a pointer to the
* interface out which this connection should go.
*
* NOTE: Do not subtract IP option/extension header size nor IPsec
* header size from MSS advertisement. MSS option must hold the maximum
* segment size we can accept, so it must always be:
* max(if mtu) - ip header - tcp header
*/
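/*
 * For example, with a standard 1500-byte Ethernet MTU and no options,
 * the advertised MSS works out to 1500 - 20 (IPv4 header) - 20 (TCP
 * header) = 1460 bytes, or 1500 - 40 - 20 = 1440 bytes over IPv6.
 */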
u_long
tcp_mss_to_advertise(const struct ifnet *ifp, int af)
{
extern u_long in_maxmtu;
u_long mss = 0;
u_long hdrsiz;
/*
* In order to avoid defeating path MTU discovery on the peer,
* we advertise the max MTU of all attached networks as our MSS,
* per RFC 1191, section 3.1.
*
* We provide the option to advertise just the MTU of
* the interface on which we hope this connection will
* be receiving. If we are responding to a SYN, we
* will have a pretty good idea about this, but when
* initiating a connection there is a bit more doubt.
*
* We also need to ensure that loopback has a large enough
* MSS, as the loopback MTU is never included in in_maxmtu.
*/
if (ifp != NULL) switch (af) {
#ifdef INET6
case AF_INET6: /* FALLTHROUGH */
#endif
case AF_INET:
mss = ifp->if_mtu;
break;
}
if (tcp_mss_ifmtu == 0) switch (af) {
#ifdef INET6
case AF_INET6: /* FALLTHROUGH */
#endif
case AF_INET:
mss = uimax(in_maxmtu, mss);
break;
}
switch (af) {
case AF_INET:
hdrsiz = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
hdrsiz = sizeof(struct ip6_hdr);
break;
#endif
default:
hdrsiz = 0;
break;
}
hdrsiz += sizeof(struct tcphdr);
if (mss > hdrsiz)
mss -= hdrsiz;
mss = uimax(tcp_mssdflt, mss);
return (mss);
}
/*
* Set connection variables based on the peer's advertised MSS.
* We are passed the TCPCB for the actual connection. If we
* are the server, we are called by the compressed state engine
* when the 3-way handshake is complete. If we are the client,
* we are called when we receive the SYN,ACK from the server.
*
* NOTE: Our advertised MSS value must be initialized in the TCPCB
* before this routine is called!
*/
void
tcp_mss_from_peer(struct tcpcb *tp, int offer)
{
struct socket *so;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
struct rtentry *rt;
#endif
u_long bufsize;
int mss;
KASSERT(tp->t_inpcb != NULL);
so = NULL;
rt = NULL;
so = tp->t_inpcb->inp_socket;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
rt = inpcb_rtentry(tp->t_inpcb);
#endif
/*
* As per RFC1122, use the default MSS value, unless they
* sent us an offer. Do not accept offers less than 256 bytes.
*/
mss = tcp_mssdflt;
if (offer)
mss = offer;
mss = uimax(mss, 256); /* sanity */
tp->t_peermss = mss;
mss -= tcp_optlen(tp);
if (tp->t_inpcb->inp_af == AF_INET)
mss -= ip_optlen(tp->t_inpcb);
#ifdef INET6
if (tp->t_inpcb->inp_af == AF_INET6)
mss -= ip6_optlen(tp->t_inpcb);
#endif
/*
* XXX XXX What if mss goes negative or zero? This can happen if a
* socket has large IPv6 options. We crash below.
*/
/*
* If there's a pipesize, change the socket buffer to that size.
* Make the socket buffer an integral number of MSS units. If
* the MSS is larger than the socket buffer, artificially decrease
* the MSS.
*/
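/*
 * For example, a 32768-byte send buffer with a 1460-byte MSS is rounded
 * up to 23 * 1460 = 33580 bytes (subject to the sb_max cap below).
 */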
#ifdef RTV_SPIPE
if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0)
bufsize = rt->rt_rmx.rmx_sendpipe;
else
#endif
{
KASSERT(so != NULL);
bufsize = so->so_snd.sb_hiwat;
}
if (bufsize < mss)
mss = bufsize;
else {
bufsize = roundup(bufsize, mss);
if (bufsize > sb_max)
bufsize = sb_max;
(void) sbreserve(&so->so_snd, bufsize, so);
}
tp->t_segsz = mss;
#ifdef RTV_SSTHRESH
if (rt != NULL && rt->rt_rmx.rmx_ssthresh) {
/*
* There's some sort of gateway or interface buffer
* limit on the path. Use this to set the slow
* start threshold, but set the threshold to no less
* than 2 * MSS.
*/
tp->snd_ssthresh = uimax(2 * mss, rt->rt_rmx.rmx_ssthresh);
}
#endif
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
/*
* Processing necessary when a TCP connection is established.
*/
void
tcp_established(struct tcpcb *tp)
{
struct socket *so;
#ifdef RTV_RPIPE
struct rtentry *rt;
#endif
u_long bufsize;
KASSERT(tp->t_inpcb != NULL);
so = NULL;
rt = NULL;
/* This is a while() to reduce the dreadful stairstepping below */
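/*
 * MSL selection policy: connections to the loopback address use
 * tcp_msl_loop (default TCPTV_MSL/4), connections to addresses on a
 * local network use tcp_msl_local (default TCPTV_MSL/2), and all
 * other connections use tcp_msl_remote (default TCPTV_MSL), provided
 * tcp_msl_enable is set.
 */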
while (tp->t_inpcb->inp_af == AF_INET) {
so = tp->t_inpcb->inp_socket;
#if defined(RTV_RPIPE)
rt = inpcb_rtentry(tp->t_inpcb);
#endif
if (__predict_true(tcp_msl_enable)) {
if (in4p_laddr(tp->t_inpcb).s_addr == INADDR_LOOPBACK) {
tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
break;
}
if (__predict_false(tcp_rttlocal)) {
/* This may be adjusted by tcp_input */
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
if (in_localaddr(in4p_faddr(tp->t_inpcb))) {
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
}
tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
break;
}
/* Clamp to a reasonable range. */
tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
#ifdef INET6
while (tp->t_inpcb->inp_af == AF_INET6) {
so = tp->t_inpcb->inp_socket;
#if defined(RTV_RPIPE)
rt = in6pcb_rtentry(tp->t_inpcb);
#endif
if (__predict_true(tcp_msl_enable)) {
extern const struct in6_addr in6addr_loopback;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(tp->t_inpcb),
&in6addr_loopback)) {
tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
break;
}
if (__predict_false(tcp_rttlocal)) {
/* This may be adjusted by tcp_input */
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
if (in6_localaddr(&in6p_faddr(tp->t_inpcb))) {
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
}
tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
break;
}
/* Clamp to a reasonable range. */
tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
#endif
tp->t_state = TCPS_ESTABLISHED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
#ifdef RTV_RPIPE
if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
bufsize = rt->rt_rmx.rmx_recvpipe;
else
#endif
{
KASSERT(so != NULL);
bufsize = so->so_rcv.sb_hiwat;
}
if (bufsize > tp->t_ourmss) {
bufsize = roundup(bufsize, tp->t_ourmss);
if (bufsize > sb_max)
bufsize = sb_max;
(void) sbreserve(&so->so_rcv, bufsize, so);
}
#ifdef RTV_RPIPE
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
/*
* Check if there's an initial rtt or rttvar. Convert from the
* route-table units to scaled multiples of the slow timeout timer.
* Called only during the 3-way handshake.
*/
void
tcp_rmx_rtt(struct tcpcb *tp)
{
#ifdef RTV_RTT
struct rtentry *rt = NULL;
int rtt;
KASSERT(tp->t_inpcb != NULL);
rt = inpcb_rtentry(tp->t_inpcb);
if (rt == NULL)
return;
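/*
 * rmx_rtt and rmx_rttvar are expressed in units of 1/RTM_RTTUNIT of a
 * second; the divisions below rescale them into the kernel's scaled
 * fixed-point srtt/rttvar representation based on PR_SLOWHZ ticks.
 */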
if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
/*
* XXX The lock bit for MTU indicates that the value
* is also a minimum value; this is subject to time.
*/
if (rt->rt_rmx.rmx_locks & RTV_RTT)
TCPT_RANGESET(tp->t_rttmin,
rtt / (RTM_RTTUNIT / PR_SLOWHZ),
TCPTV_MIN, TCPTV_REXMTMAX);
tp->t_srtt = rtt /
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
if (rt->rt_rmx.rmx_rttvar) {
tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
((RTM_RTTUNIT / PR_SLOWHZ) >>
(TCP_RTTVAR_SHIFT + 2));
} else {
/* Default variation is +- 1 rtt */
tp->t_rttvar =
tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT);
}
TCPT_RANGESET(tp->t_rxtcur,
((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2),
tp->t_rttmin, TCPTV_REXMTMAX);
}
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
tcp_seq tcp_iss_seq = 0; /* tcp initial seq # */
/*
* Get a new sequence value given a tcp control block
*/
tcp_seq
tcp_new_iss(struct tcpcb *tp)
{
if (tp->t_inpcb->inp_af == AF_INET) {
return tcp_new_iss1(&in4p_laddr(tp->t_inpcb),
&in4p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport,
tp->t_inpcb->inp_fport, sizeof(in4p_laddr(tp->t_inpcb)));
}
#ifdef INET6
if (tp->t_inpcb->inp_af == AF_INET6) {
return tcp_new_iss1(&in6p_laddr(tp->t_inpcb),
&in6p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport,
tp->t_inpcb->inp_fport, sizeof(in6p_laddr(tp->t_inpcb)));
}
#endif
panic("tcp_new_iss: unreachable");
}
static u_int8_t tcp_iss_secret[16]; /* 128 bits; should be plenty */
/*
* Initialize RFC 1948 ISS Secret
*/
static int
tcp_iss_secret_init(void)
{
cprng_strong(kern_cprng,
tcp_iss_secret, sizeof(tcp_iss_secret), 0);
return 0;
}
/*
* This routine actually generates a new TCP initial sequence number.
*/
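/*
 * With RFC 1948 enabled, the ISS is derived from
 * MD5(laddr, lport, faddr, fport, secret); otherwise a random value
 * masked with TCP_ISS_RANDOM_MASK is used. In both cases the global
 * offset tcp_iss_seq is added to the result.
 */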
tcp_seq
tcp_new_iss1(void *laddr, void *faddr, u_int16_t lport, u_int16_t fport,
size_t addrsz)
{
tcp_seq tcp_iss;
if (tcp_do_rfc1948) {
MD5_CTX ctx;
u_int8_t hash[16]; /* XXX MD5 knowledge */
static ONCE_DECL(tcp_iss_secret_control);
/*
* If we haven't been here before, initialize our cryptographic
* hash secret.
*/
RUN_ONCE(&tcp_iss_secret_control, tcp_iss_secret_init);
/*
* Compute the base value of the ISS. It is a hash
* of (saddr, sport, daddr, dport, secret).
*/
MD5Init(&ctx);
MD5Update(&ctx, (u_char *) laddr, addrsz);
MD5Update(&ctx, (u_char *) &lport, sizeof(lport));
MD5Update(&ctx, (u_char *) faddr, addrsz);
MD5Update(&ctx, (u_char *) &fport, sizeof(fport));
MD5Update(&ctx, tcp_iss_secret, sizeof(tcp_iss_secret));
MD5Final(hash, &ctx);
memcpy(&tcp_iss, hash, sizeof(tcp_iss));
#ifdef TCPISS_DEBUG
printf("ISS hash 0x%08x, ", tcp_iss);
#endif
} else {
/*
* Randomize.
*/
tcp_iss = cprng_fast32() & TCP_ISS_RANDOM_MASK;
#ifdef TCPISS_DEBUG
printf("ISS random 0x%08x, ", tcp_iss);
#endif
}
/*
* Add the offset in to the computed value.
*/
tcp_iss += tcp_iss_seq;
#ifdef TCPISS_DEBUG
printf("ISS %08x\n", tcp_iss);
#endif
return tcp_iss;
}
#if defined(IPSEC)
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec4_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
size_t hdrsiz;
/* XXX mapped addr case (tp->t_inpcb) */
if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
return 0;
switch (tp->t_family) {
case AF_INET:
/* XXX: should use correct direction. */
hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
break;
default:
hdrsiz = 0;
break;
}
return hdrsiz;
}
#ifdef INET6
size_t
ipsec6_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
size_t hdrsiz;
if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
return 0;
switch (tp->t_family) {
case AF_INET6:
/* XXX: should use correct direction. */
hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
break;
case AF_INET:
/* mapped address case - tricky */
default:
hdrsiz = 0;
break;
}
return hdrsiz;
}
#endif
#endif /*IPSEC*/
/*
* Determine the length of the TCP options for this connection.
*
* XXX: What do we do for SACK, when we add that? Just reserve
* all of the space? Otherwise we can't exactly be incrementing
* cwnd by an amount that varies depending on the amount we last
* had to SACK!
*/
u_int
tcp_optlen(struct tcpcb *tp)
{
u_int optlen;
optlen = 0;
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
(TF_REQ_TSTMP | TF_RCVD_TSTMP))
optlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
optlen += TCPOLEN_SIGLEN;
#endif
return optlen;
}
u_int
tcp_hdrsz(struct tcpcb *tp)
{
u_int hlen;
switch (tp->t_family) {
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
case AF_INET:
hlen = sizeof(struct ip);
break;
default:
hlen = 0;
break;
}
hlen += sizeof(struct tcphdr);
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
hlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
hlen += TCPOLEN_SIGLEN;
#endif
return hlen;
}
void
tcp_statinc(u_int stat)
{
KASSERT(stat < TCP_NSTATS);
TCP_STATINC(stat);
}
void
tcp_statadd(u_int stat, uint64_t val)
{
KASSERT(stat < TCP_NSTATS);
TCP_STATADD(stat, val);
}
/* $NetBSD: in6.c,v 1.292 2024/03/01 23:50:27 riastradh Exp $ */
/* $KAME: in6.c,v 1.198 2001/07/18 09:12:38 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.2 (Berkeley) 11/15/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6.c,v 1.292 2024/03/01 23:50:27 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/cprng.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_llatbl.h>
#include <net/if_ether.h>
#include <net/if_dl.h>
#include <net/pfil.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/scope6_var.h>
#include <compat/netinet6/in6_var.h>
#include <compat/netinet6/nd6.h>
MALLOC_DEFINE(M_IP6OPT, "ip6_options", "IPv6 options");
/* enable backward compatibility code for obsoleted ioctls */
#define COMPAT_IN6IFIOCTL
#ifdef IN6_DEBUG
#define IN6_DPRINTF(__fmt, ...) printf(__fmt, __VA_ARGS__)
#else
#define IN6_DPRINTF(__fmt, ...) do { } while (/*CONSTCOND*/0)
#endif /* IN6_DEBUG */
/*
* Definitions of some constant IP6 addresses.
*/
const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
const struct in6_addr in6addr_nodelocal_allnodes =
IN6ADDR_NODELOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allnodes =
IN6ADDR_LINKLOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allrouters =
IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
const struct in6_addr in6mask0 = IN6MASK0;
const struct in6_addr in6mask32 = IN6MASK32;
const struct in6_addr in6mask64 = IN6MASK64;
const struct in6_addr in6mask96 = IN6MASK96;
const struct in6_addr in6mask128 = IN6MASK128;
const struct sockaddr_in6 sa6_any = {sizeof(sa6_any), AF_INET6,
0, 0, IN6ADDR_ANY_INIT, 0};
struct pslist_head in6_ifaddr_list;
kmutex_t in6_ifaddr_lock;
static int in6_lifaddr_ioctl(struct socket *, u_long, void *,
struct ifnet *);
static int in6_ifaddprefix(struct in6_ifaddr *);
static int in6_ifremprefix(struct in6_ifaddr *);
static int in6_ifinit(struct ifnet *, struct in6_ifaddr *,
const struct sockaddr_in6 *, int);
static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *);
static int in6_update_ifa1(struct ifnet *, struct in6_aliasreq *,
struct in6_ifaddr **, struct psref *, int);
void
in6_init(void)
{
PSLIST_INIT(&in6_ifaddr_list);
mutex_init(&in6_ifaddr_lock, MUTEX_DEFAULT, IPL_NONE);
in6_sysctl_multicast_setup(NULL);
}
/*
* Add ownaddr as loopback rtentry. We previously add the route only if
* necessary (ex. on a p2p link). However, since we now manage addresses
* separately from prefixes, we should always add the route. We can't
* rely on the cloning mechanism from the corresponding interface route
* any more.
*/
void
in6_ifaddlocal(struct ifaddr *ifa)
{
if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &in6addr_any) ||
(ifa->ifa_ifp->if_flags & IFF_POINTOPOINT &&
IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), IFA_DSTIN6(ifa))))
{
rt_addrmsg(RTM_NEWADDR, ifa);
return;
}
rt_ifa_addlocal(ifa);
}
/*
* Remove loopback rtentry of ownaddr generated by in6_ifaddlocal(),
* if it exists.
*/
void
in6_ifremlocal(struct ifaddr *ifa)
{
struct in6_ifaddr *ia;
struct ifaddr *alt_ifa = NULL;
int ia_count = 0;
struct psref psref;
int s;
/*
* Some BSD variants do not remove the routes cloned from an
* interface direct route when that direct route is removed
* (see comments in net/net_osdep.h). Even for variants that do remove
* cloned routes, they could fail to remove the cloned routes when
* we handle multiple addresses that share a common prefix.
* So, we should remove the route corresponding to the deleted address.
*/
/*
* Delete the entry only if exactly one ifaddr matches the
* address, ifa->ifa_addr.
*
* If more than one ifaddr matches, replace the ifaddr in
* the routing table, rt_ifa, with a different ifaddr than
* the one we are purging, ifa. It is important to do
* this, or else the routing table can accumulate dangling
* pointers rt->rt_ifa->ifa_ifp to destroyed interfaces,
* which will lead to crashes, later. (More than one ifaddr
* can match if we assign the same address to multiple---probably
* p2p---interfaces.)
*
* XXX An old comment at this place said, "we should avoid
* XXX such a configuration [i.e., interfaces with the same
* XXX address assigned --ed.] in IPv6...". I do not
* XXX agree, especially now that I have fixed the dangling
* XXX ifp-pointers bug.
*/
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (!IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr))
continue;
if (ia->ia_ifp != ifa->ifa_ifp)
alt_ifa = &ia->ia_ifa;
if (++ia_count > 1 && alt_ifa != NULL)
break;
}
if (ia_count > 1 && alt_ifa != NULL)
ifa_acquire(alt_ifa, &psref);
pserialize_read_exit(s);
if (ia_count == 0)
return;
rt_ifa_remlocal(ifa, ia_count == 1 ? NULL : alt_ifa);
if (ia_count > 1 && alt_ifa != NULL)
ifa_release(alt_ifa, &psref);
}
/* Add prefix route for the network. */
static int
in6_ifaddprefix(struct in6_ifaddr *ia)
{
int error, flags = 0;
if (in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) == 128) {
if (ia->ia_dstaddr.sin6_family != AF_INET6)
/* We don't need to install a host route. */
return 0;
flags |= RTF_HOST;
}
/* Is this a connected route for neighbour discovery? */
if (nd6_need_cache(ia->ia_ifp))
flags |= RTF_CONNECTED;
if ((error = rtinit(&ia->ia_ifa, RTM_ADD, RTF_UP | flags)) == 0)
ia->ia_flags |= IFA_ROUTE;
else if (error == EEXIST)
/* Existence of the route is not an error. */
error = 0;
return error;
}
static int
in6_rt_ifa_matcher(struct rtentry *rt, void *v)
{
struct ifaddr *ifa = v;
if (rt->rt_ifa == ifa)
return 1;
else
return 0;
}
/* Delete network prefix route if present.
* Re-add it to another address if the prefix matches. */
static int
in6_ifremprefix(struct in6_ifaddr *target)
{
int error, s;
struct in6_ifaddr *ia;
if ((target->ia_flags & IFA_ROUTE) == 0)
return 0;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (target->ia_dstaddr.sin6_len) {
if (ia->ia_dstaddr.sin6_len == 0 ||
!IN6_ARE_ADDR_EQUAL(&ia->ia_dstaddr.sin6_addr,
&target->ia_dstaddr.sin6_addr))
continue;
} else {
if (!IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
&target->ia_addr.sin6_addr,
&target->ia_prefixmask.sin6_addr))
continue;
}
/*
* If we find a matching prefix route, move IFA_ROUTE to it.
*/
if ((ia->ia_flags & IFA_ROUTE) == 0) {
struct psref psref;
int bound = curlwp_bind();
ia6_acquire(ia, &psref);
pserialize_read_exit(s);
rtinit(&target->ia_ifa, RTM_DELETE, 0);
target->ia_flags &= ~IFA_ROUTE;
error = in6_ifaddprefix(ia);
if (!ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING))
goto skip;
/*
* Replace rt_ifa of routes that have the removing address
* with the new address.
*/
rt_replace_ifa_matched_entries(AF_INET6,
in6_rt_ifa_matcher, &target->ia_ifa, &ia->ia_ifa);
skip:
ia6_release(ia, &psref);
curlwp_bindx(bound);
return error;
}
}
pserialize_read_exit(s);
/*
* No one else holds the prefix route; remove it.
*/
rtinit(&target->ia_ifa, RTM_DELETE, 0);
target->ia_flags &= ~IFA_ROUTE;
if (ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING)) {
/* Remove routes that have the removing address as rt_ifa. */
rt_delete_matched_entries(AF_INET6, in6_rt_ifa_matcher,
&target->ia_ifa, true);
}
return 0;
}
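/*
 * Convert a contiguous IPv6 netmask into a prefix length. When a limit
 * pointer is given, a stricter check is done and -1 is returned for a
 * non-contiguous mask.
 */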
int
in6_mask2len(struct in6_addr *mask, u_char *lim0)
{
int x = 0, y;
u_char *lim = lim0, *p;
/* ignore the scope_id part */
if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask))
lim = (u_char *)mask + sizeof(*mask);
for (p = (u_char *)mask; p < lim; x++, p++) {
if (*p != 0xff)
break;
}
y = 0;
if (p < lim) {
for (y = 0; y < NBBY; y++) {
if ((*p & (0x80 >> y)) == 0)
break;
}
}
/*
* when the limit pointer is given, do a stricter check on the
* remaining bits.
*/
if (p < lim) {
if (y != 0 && (*p & (0x00ff >> y)) != 0)
return -1;
for (p = p + 1; p < lim; p++)
if (*p != 0)
return -1;
}
return x * NBBY + y;
}
#define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa))
#define ia62ifa(ia6) (&((ia6)->ia_ifa))
static int
in6_control1(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
struct in6_ifreq *ifr = (struct in6_ifreq *)data;
struct in6_ifaddr *ia = NULL;
struct in6_aliasreq *ifra = (struct in6_aliasreq *)data;
struct sockaddr_in6 *sa6;
int error, bound;
struct psref psref;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
case SIOCDADDRCTL_POLICY:
/* Privileged. */
return in6_src_ioctl(cmd, data);
/*
* XXX: Fix me, once we fix SIOCSIFADDR, SIOCIFDSTADDR, etc.
*/
case SIOCSIFADDR:
case SIOCSIFDSTADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
return EOPNOTSUPP;
case SIOCGETSGCNT_IN6:
case SIOCGETMIFCNT_IN6:
return mrt6_ioctl(cmd, data);
case SIOCGIFADDRPREF:
case SIOCSIFADDRPREF:
if (ifp == NULL)
return EINVAL;
return ifaddrpref_ioctl(so, cmd, data, ifp);
}
if (ifp == NULL)
return EOPNOTSUPP;
switch (cmd) {
#ifdef OSIOCSIFINFO_IN6_90
case OSIOCSIFINFO_FLAGS_90:
case OSIOCSIFINFO_IN6_90:
case OSIOCSDEFIFACE_IN6:
case OSIOCSNDFLUSH_IN6:
case OSIOCSPFXFLUSH_IN6:
case OSIOCSRTRFLUSH_IN6:
#endif
case SIOCSIFINFO_FLAGS:
case SIOCSIFINFO_IN6:
/* Privileged. */
/* FALLTHROUGH */
#ifdef OSIOCGIFINFO_IN6
case OSIOCGIFINFO_IN6:
#endif
#ifdef OSIOCGIFINFO_IN6_90
case OSIOCGDRLST_IN6:
case OSIOCGPRLST_IN6:
case OSIOCGIFINFO_IN6_90:
case OSIOCGDEFIFACE_IN6:
#endif
case SIOCGIFINFO_IN6:
case SIOCGNBRINFO_IN6:
return nd6_ioctl(cmd, data, ifp);
}
switch (cmd) {
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* Privileged. */
/* FALLTHROUGH */
case SIOCGLIFADDR:
return in6_lifaddr_ioctl(so, cmd, data, ifp);
}
/*
* Find address for this interface, if it exists.
*
* In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation
* only, and used the first interface address as the target of other
* operations (without checking ifra_addr). This was because netinet
* code/API assumed at most 1 interface address per interface.
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, we almost always look for and check the
* presence of ifra_addr, and reject invalid ones here.
* It also decreases duplicated code among SIOC*_IN6 operations.
*/
switch (cmd) {
case SIOCAIFADDR_IN6:
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
#endif
#ifdef OSIOCSIFPHYADDR_IN6
case OSIOCSIFPHYADDR_IN6:
#endif
case SIOCSIFPHYADDR_IN6:
sa6 = &ifra->ifra_addr;
break;
case SIOCSIFADDR_IN6:
case SIOCGIFADDR_IN6:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFNETMASK_IN6:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCDIFADDR_IN6:
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCGIFALIFETIME_IN6:
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
case SIOCGIFSTAT_IN6:
case SIOCGIFSTAT_ICMP6:
sa6 = &ifr->ifr_addr;
break;
default:
sa6 = NULL;
break;
}
error = 0;
bound = curlwp_bind();
if (sa6 && sa6->sin6_family == AF_INET6) {
if (sa6->sin6_scope_id != 0)
error = sa6_embedscope(sa6, 0);
else
error = in6_setscope(&sa6->sin6_addr, ifp, NULL);
if (error != 0)
goto out;
ia = in6ifa_ifpwithaddr_psref(ifp, &sa6->sin6_addr, &psref);
} else
ia = NULL;
switch (cmd) {
case SIOCSIFADDR_IN6:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFNETMASK_IN6:
/*
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, SIOCSIFxxx ioctls are deprecated.
*/
error = EINVAL;
goto release;
case SIOCDIFADDR_IN6:
/*
* for IPv4, we look for existing in_ifaddr here to allow
* "ifconfig if0 delete" to remove the first IPv4 address on
* the interface. For IPv6, since the spec has allowed multiple
* interface addresses from day one, we consider "remove the
* first one" semantics undesirable.
*/
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
#ifdef OSIOCAIFADDR_IN6
/* FALLTHROUGH */
case OSIOCAIFADDR_IN6:
#endif
/* FALLTHROUGH */
case SIOCAIFADDR_IN6:
/*
* We always require users to specify a valid IPv6 address for
* the corresponding operation.
*/
if (ifra->ifra_addr.sin6_family != AF_INET6 ||
ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) {
error = EAFNOSUPPORT;
goto release;
}
/* Privileged. */
break;
case SIOCGIFADDR_IN6:
/* This interface is basically deprecated. use SIOCGIFCONF. */
/* FALLTHROUGH */
case SIOCGIFAFLAG_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFALIFETIME_IN6:
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
/* must think again about its semantics */
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
break;
}
switch (cmd) {
case SIOCGIFADDR_IN6:
ifr->ifr_addr = ia->ia_addr;
error = sa6_recoverscope(&ifr->ifr_addr);
break;
case SIOCGIFDSTADDR_IN6:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
break;
}
/*
* XXX: should we check if ifa_dstaddr is NULL and return
* an error?
*/
ifr->ifr_dstaddr = ia->ia_dstaddr;
error = sa6_recoverscope(&ifr->ifr_dstaddr);
break;
case SIOCGIFNETMASK_IN6:
ifr->ifr_addr = ia->ia_prefixmask;
break;
case SIOCGIFAFLAG_IN6:
ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags;
break;
case SIOCGIFSTAT_IN6:
if (ifp == NULL) {
error = EINVAL;
break;
}
memset(&ifr->ifr_ifru.ifru_stat, 0,
sizeof(ifr->ifr_ifru.ifru_stat));
ifr->ifr_ifru.ifru_stat =
*((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->in6_ifstat;
break;
case SIOCGIFSTAT_ICMP6:
if (ifp == NULL) {
error = EINVAL;
break;
}
memset(&ifr->ifr_ifru.ifru_icmp6stat, 0,
sizeof(ifr->ifr_ifru.ifru_icmp6stat));
ifr->ifr_ifru.ifru_icmp6stat =
*((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->icmp6_ifstat;
break;
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
case SIOCGIFALIFETIME_IN6:
ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime;
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
time_t maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire = ((time_t)~0) &
(time_t)~(1ULL << ((sizeof(maxexpire) * NBBY) - 1));
if (ia->ia6_lifetime.ia6t_vltime <
maxexpire - ia->ia6_updatetime) {
retlt->ia6t_expire = ia->ia6_updatetime +
ia->ia6_lifetime.ia6t_vltime;
retlt->ia6t_expire = retlt->ia6t_expire ?
time_mono_to_wall(retlt->ia6t_expire) :
0;
} else
retlt->ia6t_expire = maxexpire;
}
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
time_t maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire = ((time_t)~0) &
(time_t)~(1ULL << ((sizeof(maxexpire) * NBBY) - 1));
if (ia->ia6_lifetime.ia6t_pltime <
maxexpire - ia->ia6_updatetime) {
retlt->ia6t_preferred = ia->ia6_updatetime +
ia->ia6_lifetime.ia6t_pltime;
retlt->ia6t_preferred = retlt->ia6t_preferred ?
time_mono_to_wall(retlt->ia6t_preferred) :
0;
} else
retlt->ia6t_preferred = maxexpire;
}
#ifdef OSIOCFIFALIFETIME_IN6
if (cmd == OSIOCFIFALIFETIME_IN6)
in6_addrlifetime_to_in6_addrlifetime50(
&ifr->ifru.ifru_lifetime);
#endif
break;
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
in6_aliasreq50_to_in6_aliasreq(ifra);
#endif
/*FALLTHROUGH*/
case SIOCAIFADDR_IN6:
{
struct in6_addrlifetime *lt;
/* reject read-only flags */
if ((ifra->ifra_flags & IN6_IFF_DUPLICATED) != 0 ||
(ifra->ifra_flags & IN6_IFF_DETACHED) != 0 ||
(ifra->ifra_flags & IN6_IFF_TENTATIVE) != 0 ||
(ifra->ifra_flags & IN6_IFF_NODAD) != 0) {
error = EINVAL;
break;
}
/*
* ia6t_expire and ia6t_preferred won't be used for now, but
* convert them just in case.
*/
lt = &ifra->ifra_lifetime;
if (lt->ia6t_expire != 0)
lt->ia6t_expire = time_wall_to_mono(lt->ia6t_expire);
if (lt->ia6t_preferred != 0)
lt->ia6t_preferred =
time_wall_to_mono(lt->ia6t_preferred);
/*
* make (ia == NULL) or update (ia != NULL) the interface
* address structure, and link it to the list.
*/
int s = splsoftnet();
error = in6_update_ifa1(ifp, ifra, &ia, &psref, 0);
splx(s);
/*
* in6_update_ifa1 doesn't create the address if its
* valid lifetime (vltime) is zero, since we would just
* delete the address immediately in that case anyway.
* So it may succeed but return a NULL ia. In that case,
* there is nothing left to do.
*/
if (error || ia == NULL)
break;
pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa);
break;
}
case SIOCDIFADDR_IN6:
ia6_release(ia, &psref);
ifaref(&ia->ia_ifa);
in6_purgeaddr(&ia->ia_ifa);
pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa);
ifafree(&ia->ia_ifa);
ia = NULL;
break;
default:
error = ENOTTY;
}
release:
ia6_release(ia, &psref);
out:
curlwp_bindx(bound);
return error;
}
int
in6_control(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
int error, s;
switch (cmd) {
#ifdef OSIOCSIFINFO_IN6_90
case OSIOCSIFINFO_FLAGS_90:
case OSIOCSIFINFO_IN6_90:
case OSIOCSDEFIFACE_IN6:
case OSIOCSNDFLUSH_IN6:
case OSIOCSPFXFLUSH_IN6:
case OSIOCSRTRFLUSH_IN6:
#endif
case SIOCSIFINFO_FLAGS:
case SIOCSIFINFO_IN6:
case SIOCALIFADDR:
case SIOCDLIFADDR:
case SIOCDIFADDR_IN6:
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
#endif
case SIOCAIFADDR_IN6:
case SIOCAADDRCTL_POLICY:
case SIOCDADDRCTL_POLICY:
if (kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_SOCKET,
KAUTH_REQ_NETWORK_SOCKET_SETPRIV,
so, NULL, NULL))
return EPERM;
break;
}
s = splsoftnet();
#ifndef NET_MPSAFE
KASSERT(KERNEL_LOCKED_P());
#endif
error = in6_control1(so, cmd, data, ifp);
splx(s);
return error;
}
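/*
 * Construct the solicited-node multicast address (ff02::1:ffXX:XXXX)
 * corresponding to ip6 and set its scope zone for ifp.
 */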
static int
in6_get_llsol_addr(struct in6_addr *llsol, struct ifnet *ifp,
struct in6_addr *ip6)
{
int error;
memset(llsol, 0, sizeof(struct in6_addr));
llsol->s6_addr16[0] = htons(0xff02);
llsol->s6_addr32[1] = 0;
llsol->s6_addr32[2] = htonl(1);
llsol->s6_addr32[3] = ip6->s6_addr32[3];
llsol->s6_addr8[12] = 0xff;
error = in6_setscope(llsol, ifp, NULL);
if (error != 0) {
/* XXX: should not happen */
log(LOG_ERR, "%s: in6_setscope failed\n", __func__);
}
return error;
}
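/*
 * Join the multicast groups needed for a newly configured address:
 * the solicited-node group, the link-local and interface-local
 * all-nodes groups and the node information group, installing
 * connected routes for the all-nodes prefixes where necessary.
 */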
static int
in6_join_mcastgroups(struct in6_aliasreq *ifra, struct in6_ifaddr *ia,
struct ifnet *ifp, int flags)
{
int error;
struct sockaddr_in6 mltaddr, mltmask;
struct in6_multi_mship *imm;
struct in6_addr llsol;
struct rtentry *rt;
int dad_delay;
char ip6buf[INET6_ADDRSTRLEN];
/* join solicited multicast addr for new host id */
error = in6_get_llsol_addr(&llsol, ifp, &ifra->ifra_addr.sin6_addr);
if (error != 0)
goto out;
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* We need a random delay for DAD on the address
* being configured. It also means delaying
* transmission of the corresponding MLD report to
* avoid report collision.
* [draft-ietf-ipv6-rfc2462bis-02.txt]
*/
dad_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz);
}
#define MLTMASK_LEN 4 /* mltmask's masklen (=32bit=4octet) */
/* join solicited multicast addr for new host id */
imm = in6_joingroup(ifp, &llsol, &error, dad_delay);
if (!imm) {
nd6log(LOG_ERR,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &llsol), if_name(ifp), error);
goto out;
}
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
sockaddr_in6_init(&mltmask, &in6mask32, 0, 0, 0);
/*
* join link-local all-nodes address
*/
sockaddr_in6_init(&mltaddr, &in6addr_linklocal_allnodes,
0, 0, 0);
if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != 0)
goto out; /* XXX: should not fail */
/*
* XXX: do we really need these automatic routes?
* We should probably reconsider this stuff. Most applications
* actually do not need the routes, since they usually specify
* the outgoing interface.
*/
rt = rtalloc1(sin6tosa(&mltaddr), 0);
if (rt) {
if (memcmp(&mltaddr.sin6_addr,
&satocsin6(rt_getkey(rt))->sin6_addr,
MLTMASK_LEN)) {
rt_unref(rt);
rt = NULL;
} else if (rt->rt_ifp != ifp) {
IN6_DPRINTF("%s: rt_ifp %p -> %p (%s) "
"network %04x:%04x::/32 = %04x:%04x::/32\n",
__func__, rt->rt_ifp, ifp, ifp->if_xname,
ntohs(mltaddr.sin6_addr.s6_addr16[0]),
ntohs(mltaddr.sin6_addr.s6_addr16[1]),
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[0],
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[1]);
#ifdef NET_MPSAFE
error = rt_update_prepare(rt);
if (error == 0) {
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
rt_update_finish(rt);
} else {
/*
* If error != 0, the rtentry is being
* destroyed, so doing nothing doesn't
* matter.
*/
}
#else
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
#endif
}
}
if (!rt) {
struct rt_addrinfo info;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = sin6tosa(&mltaddr);
info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia->ia_addr);
info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask);
info.rti_info[RTAX_IFA] = sin6tosa(&ia->ia_addr);
/* XXX: we need RTF_CONNECTED to fake nd6_rtrequest */
info.rti_flags = RTF_UP | RTF_CONNECTED;
error = rtrequest1(RTM_ADD, &info, NULL);
if (error)
goto out;
} else {
rt_unref(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
if (!imm) {
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
goto out;
}
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
/*
* join node information group address
*/
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* The spec doesn't say anything about delay for this
* group, but the same logic should apply.
*/
dad_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz);
}
if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr) != 0)
;
else if ((imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error,
dad_delay)) == NULL) { /* XXX jinmei */
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
/* XXX not very fatal, go on... */
} else {
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
}
/*
* join interface-local all-nodes address.
* (ff01::1%ifN, and ff01::%ifN/32)
*/
mltaddr.sin6_addr = in6addr_nodelocal_allnodes;
if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != 0)
goto out; /* XXX: should not fail */
/* XXX: again, do we really need the route? */
rt = rtalloc1(sin6tosa(&mltaddr), 0);
if (rt) {
/* 32bit came from "mltmask" */
if (memcmp(&mltaddr.sin6_addr,
&satocsin6(rt_getkey(rt))->sin6_addr,
32 / NBBY)) {
rt_unref(rt);
rt = NULL;
} else if (rt->rt_ifp != ifp) {
IN6_DPRINTF("%s: rt_ifp %p -> %p (%s) "
"network %04x:%04x::/32 = %04x:%04x::/32\n",
__func__, rt->rt_ifp, ifp, ifp->if_xname,
ntohs(mltaddr.sin6_addr.s6_addr16[0]),
ntohs(mltaddr.sin6_addr.s6_addr16[1]),
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[0],
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[1]);
#ifdef NET_MPSAFE
error = rt_update_prepare(rt);
if (error == 0) {
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
rt_update_finish(rt);
} else {
/*
* If error != 0, the rtentry is being
* destroyed, so doing nothing doesn't
* matter.
*/
}
#else
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
#endif
}
}
if (!rt) {
struct rt_addrinfo info;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = sin6tosa(&mltaddr);
info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia->ia_addr);
info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask);
info.rti_info[RTAX_IFA] = sin6tosa(&ia->ia_addr);
info.rti_flags = RTF_UP | RTF_CONNECTED;
error = rtrequest1(RTM_ADD, &info, NULL);
if (error)
goto out;
#undef MLTMASK_LEN
} else {
rt_unref(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
if (!imm) {
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
goto out;
} else {
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
}
return 0;
out:
KASSERT(error != 0);
return error;
}
/*
* Update parameters of an IPv6 interface address.
* If necessary, a new entry is created and linked into address chains.
* This function is separated from in6_control().
* XXX: should this be performed under splsoftnet()?
*/
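/*
 * Roughly: validate the request, allocate a new in6_ifaddr if the
 * address is new, set the prefix mask, destination, lifetimes and
 * flags, initialize the interface/route state via in6_ifinit(), link
 * the address into the global and per-interface lists, join the
 * required multicast groups, and finally start DAD if applicable.
 */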
static int
in6_update_ifa1(struct ifnet *ifp, struct in6_aliasreq *ifra,
struct in6_ifaddr **iap, struct psref *psref, int flags)
{
int error = 0, hostIsNew = 0, plen = -1;
struct sockaddr_in6 dst6;
struct in6_addrlifetime *lt;
int dad_delay, was_tentative;
struct in6_ifaddr *ia = iap ? *iap : NULL;
char ip6buf[INET6_ADDRSTRLEN];
bool addrmaskNotChanged = false;
bool send_rtm_newaddr = (ip6_param_rt_msg == 1);
int saved_flags = 0;
KASSERT((iap == NULL && psref == NULL) ||
(iap != NULL && psref != NULL));
/* Validate parameters */
if (ifp == NULL || ifra == NULL) /* this maybe redundant */
return EINVAL;
/*
* The destination address for a p2p link must have a family
* of AF_UNSPEC or AF_INET6.
*/
if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
ifra->ifra_dstaddr.sin6_family != AF_INET6 &&
ifra->ifra_dstaddr.sin6_family != AF_UNSPEC)
return EAFNOSUPPORT;
/*
* validate ifra_prefixmask. don't check sin6_family, netmask
* does not carry fields other than sin6_len.
*/
if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6))
return EINVAL;
/*
* Because the IPv6 address architecture is classless, we require
* users to specify a (non 0) prefix length (mask) for a new address.
* We also require that the prefix mask (when specified) be valid, and
* thus reject a non-contiguous mask.
*/
if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0)
return EINVAL;
if (ifra->ifra_prefixmask.sin6_len != 0) {
plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
(u_char *)&ifra->ifra_prefixmask +
ifra->ifra_prefixmask.sin6_len);
if (plen <= 0)
return EINVAL;
} else {
/*
* In this case, ia must not be NULL. We just use its prefix
* length.
*/
plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
}
/*
* If the destination address on a p2p interface is specified,
* and the address is a scoped one, validate/set the scope
* zone identifier.
*/
dst6 = ifra->ifra_dstaddr;
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 &&
(dst6.sin6_family == AF_INET6)) {
struct in6_addr in6_tmp;
u_int32_t zoneid;
in6_tmp = dst6.sin6_addr;
if (in6_setscope(&in6_tmp, ifp, &zoneid))
return EINVAL; /* XXX: should be impossible */
if (dst6.sin6_scope_id != 0) {
if (dst6.sin6_scope_id != zoneid)
return EINVAL;
} else /* the user omitted the ID. */
dst6.sin6_scope_id = zoneid;
/* convert into the internal form */
if (sa6_embedscope(&dst6, 0))
return EINVAL; /* XXX: should be impossible */
}
/*
* The destination address can be specified only for a p2p or a
* loopback interface. If specified, the corresponding prefix length
* must be 128.
*/
if (ifra->ifra_dstaddr.sin6_family == AF_INET6) {
#ifdef FORCE_P2PPLEN
int i;
#endif
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) {
/* XXX: noisy message */
nd6log(LOG_INFO, "a destination can "
"be specified for a p2p or a loopback IF only\n");
return EINVAL;
}
if (plen != 128) {
nd6log(LOG_INFO, "prefixlen should "
"be 128 when dstaddr is specified\n");
#ifdef FORCE_P2PPLEN
/*
* To be compatible with old configurations,
* such as ifconfig gif0 inet6 2001::1 2001::2
* prefixlen 126, we override the specified
* prefixmask as if the prefix length was 128.
*/
ifra->ifra_prefixmask.sin6_len =
sizeof(struct sockaddr_in6);
for (i = 0; i < 4; i++)
ifra->ifra_prefixmask.sin6_addr.s6_addr32[i] =
0xffffffff;
plen = 128;
#else
return EINVAL;
#endif
}
}
/* lifetime consistency check */
lt = &ifra->ifra_lifetime;
if (lt->ia6t_pltime > lt->ia6t_vltime)
return EINVAL;
if (lt->ia6t_vltime == 0) {
/*
* the following log might be noisy, but this is a typical
* configuration mistake or a tool's bug.
*/
nd6log(LOG_INFO, "valid lifetime is 0 for %s\n",
IN6_PRINT(ip6buf, &ifra->ifra_addr.sin6_addr));
if (ia == NULL)
return 0; /* there's nothing to do */
}
#define sin6eq(a, b) \
((a)->sin6_len == sizeof(struct sockaddr_in6) && \
(b)->sin6_len == sizeof(struct sockaddr_in6) && \
IN6_ARE_ADDR_EQUAL(&(a)->sin6_addr, &(b)->sin6_addr))
if (!send_rtm_newaddr) {
if (ia != NULL &&
sin6eq(&ifra->ifra_addr, &ia->ia_addr) &&
sin6eq(&ifra->ifra_prefixmask, &ia->ia_prefixmask)) {
addrmaskNotChanged = true;
saved_flags = ia->ia6_flags; /* check it later */
}
}
#undef sin6eq
/*
* If this is a new address, allocate a new ifaddr and link it
* into chains.
*/
if (ia == NULL) {
hostIsNew = 1;
/*
* When in6_update_ifa() is called while processing a received
* RA, it runs in interrupt context, so we must call malloc
* with M_NOWAIT.
*/
ia = malloc(sizeof(*ia), M_IFADDR, M_NOWAIT|M_ZERO);
if (ia == NULL)
return ENOBUFS;
LIST_INIT(&ia->ia6_memberships);
/* Initialize the address and masks, and put time stamp */
ia->ia_ifa.ifa_addr = sin6tosa(&ia->ia_addr);
ia->ia_addr.sin6_family = AF_INET6;
ia->ia_addr.sin6_len = sizeof(ia->ia_addr);
ia->ia6_createtime = time_uptime;
if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
/*
* XXX: some functions expect that ifa_dstaddr is not
* NULL for p2p interfaces.
*/
ia->ia_ifa.ifa_dstaddr = sin6tosa(&ia->ia_dstaddr);
} else {
ia->ia_ifa.ifa_dstaddr = NULL;
}
ia->ia_ifa.ifa_netmask = sin6tosa(&ia->ia_prefixmask);
ia->ia_ifp = ifp;
IN6_ADDRLIST_ENTRY_INIT(ia);
ifa_psref_init(&ia->ia_ifa);
}
/* update timestamp */
ia->ia6_updatetime = time_uptime;
/* set prefix mask */
if (ifra->ifra_prefixmask.sin6_len) {
if (ia->ia_prefixmask.sin6_len) {
if (!IN6_ARE_ADDR_EQUAL(&ia->ia_prefixmask.sin6_addr,
&ifra->ifra_prefixmask.sin6_addr))
in6_ifremprefix(ia);
}
ia->ia_prefixmask = ifra->ifra_prefixmask;
}
/* Set destination address. */
if (dst6.sin6_family == AF_INET6) {
if (!IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr,
&ia->ia_dstaddr.sin6_addr))
in6_ifremprefix(ia);
ia->ia_dstaddr = dst6;
}
/*
* Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred
* to see if the address is deprecated or invalidated, but initialize
* these members for applications.
*/
ia->ia6_lifetime = ifra->ifra_lifetime;
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_expire =
time_uptime + ia->ia6_lifetime.ia6t_vltime;
} else
ia->ia6_lifetime.ia6t_expire = 0;
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_preferred =
time_uptime + ia->ia6_lifetime.ia6t_pltime;
} else
ia->ia6_lifetime.ia6t_preferred = 0;
/*
* configure address flags.
* We need to preserve tentative state so DAD works if
* something adds the same address before DAD finishes.
*/
was_tentative = ia->ia6_flags & (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED);
ia->ia6_flags = ifra->ifra_flags;
/*
* Make the address tentative before joining multicast addresses,
* so that corresponding MLD responses would not have a tentative
* source address.
*/
ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */
if (ifp->if_link_state == LINK_STATE_DOWN) {
ia->ia6_flags |= IN6_IFF_DETACHED;
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
} else if ((hostIsNew || was_tentative) && if_do_dad(ifp) &&
ip6_dad_enabled()) {
ia->ia6_flags |= IN6_IFF_TENTATIVE;
}
/*
* backward compatibility - if IN6_IFF_DEPRECATED is set from the
* userland, make it deprecated.
*/
if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
ia->ia6_lifetime.ia6t_pltime = 0;
ia->ia6_lifetime.ia6t_preferred = time_uptime;
}
if (!send_rtm_newaddr) {
/*
* We will not send RTM_NEWADDR if the only difference between
* ia and ifra is preferred/valid lifetimes, because it is not
* very useful for userland programs to be notified of such
* changes.
*/
if (addrmaskNotChanged && ia->ia6_flags == saved_flags)
return 0;
}
if (hostIsNew) {
/*
* We need a reference to ia before calling in6_ifinit.
* Otherwise ia can be freed in in6_ifinit accidentally.
*/
ifaref(&ia->ia_ifa);
}
/* Must execute in6_ifinit and ifa_insert atomically */
mutex_enter(&in6_ifaddr_lock);
/* reset the interface and routing table appropriately. */
error = in6_ifinit(ifp, ia, &ifra->ifra_addr, hostIsNew);
if (error != 0) {
if (hostIsNew)
free(ia, M_IFADDR);
mutex_exit(&in6_ifaddr_lock);
return error;
}
/*
* We are done if we have simply modified an existing address.
*/
if (!hostIsNew) {
mutex_exit(&in6_ifaddr_lock);
return error;
}
/*
* Insert ia to the global list and ifa to the interface's list.
* A reference to it is already gained above.
*/
IN6_ADDRLIST_WRITER_INSERT_TAIL(ia);
ifa_insert(ifp, &ia->ia_ifa);
mutex_exit(&in6_ifaddr_lock);
/*
* Beyond this point, we should call in6_purgeaddr upon an error,
* not just go to unlink.
*/
/* join necessary multicast groups */
if ((ifp->if_flags & IFF_MULTICAST) != 0) {
error = in6_join_mcastgroups(ifra, ia, ifp, flags);
if (error != 0)
goto cleanup;
}
if (nd6_need_cache(ifp)) {
/* XXX maybe unnecessary */
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
ia->ia_ifa.ifa_flags |= RTF_CONNECTED;
}
/*
* Perform DAD, if needed.
* XXX It may be of use, if we can administratively
* disable DAD.
*/
if (hostIsNew && if_do_dad(ifp) &&
((ifra->ifra_flags & IN6_IFF_NODAD) == 0) &&
(ia->ia6_flags & IN6_IFF_TENTATIVE))
{
int mindelay, maxdelay;
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
struct in6_addr llsol;
struct in6_multi *in6m_sol = NULL;
/*
* We need to impose a delay before sending an NS
* for DAD. Check if we also needed a delay for the
* corresponding MLD message. If we did, the delay
* should be larger than the MLD delay (this could be
* relaxed a bit, but this simple logic is at least
* safe).
*/
mindelay = 0;
error = in6_get_llsol_addr(&llsol, ifp,
&ifra->ifra_addr.sin6_addr);
in6_multi_lock(RW_READER);
if (error == 0)
in6m_sol = in6_lookup_multi(&llsol, ifp);
if (in6m_sol != NULL &&
in6m_sol->in6m_state == MLD_REPORTPENDING) {
mindelay = in6m_sol->in6m_timer;
}
in6_multi_unlock();
maxdelay = MAX_RTR_SOLICITATION_DELAY * hz;
if (maxdelay - mindelay == 0)
dad_delay = 0;
else {
dad_delay =
(cprng_fast32() % (maxdelay - mindelay)) +
mindelay;
}
}
/* +1 ensures callout is always used */
nd6_dad_start(&ia->ia_ifa, dad_delay + 1);
}
if (iap != NULL) {
*iap = ia;
if (hostIsNew)
ia6_acquire(ia, psref);
}
return 0;
cleanup:
in6_purgeaddr(&ia->ia_ifa);
return error;
}
int
in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags)
{
int rc, s;
s = splsoftnet();
rc = in6_update_ifa1(ifp, ifra, NULL, NULL, flags);
splx(s);
return rc;
}
void
in6_purgeaddr(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa;
struct in6_multi_mship *imm;
/* KASSERT(!ifa_held(ifa)); XXX need ifa_not_held (psref_not_held) */
KASSERT(IFNET_LOCKED(ifp));
ifa->ifa_flags |= IFA_DESTROYING;
/* stop DAD processing */
nd6_dad_stop(ifa);
/* Delete any network route. */
in6_ifremprefix(ia);
/* Remove ownaddr's loopback rtentry, if it exists. */
in6_ifremlocal(&(ia->ia_ifa));
/*
* Leave the multicast groups we have joined on this interface.
*/
again:
mutex_enter(&in6_ifaddr_lock);
while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) {
struct in6_multi *in6m __diagused = imm->i6mm_maddr;
KASSERTMSG(in6m == NULL || in6m->in6m_ifp == ifp,
"in6m_ifp=%s ifp=%s", in6m ? in6m->in6m_ifp->if_xname : NULL,
ifp->if_xname);
LIST_REMOVE(imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
in6_leavegroup(imm);
goto again;
}
mutex_exit(&in6_ifaddr_lock);
in6_unlink_ifa(ia, ifp);
}
static void
in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
{
int s = splsoftnet();
mutex_enter(&in6_ifaddr_lock);
IN6_ADDRLIST_WRITER_REMOVE(ia);
ifa_remove(ifp, &ia->ia_ifa);
/* Assume ifa_remove called pserialize_perform and psref_destroy */
mutex_exit(&in6_ifaddr_lock);
IN6_ADDRLIST_ENTRY_DESTROY(ia);
/*
* release another refcnt for the link from in6_ifaddr.
* Note that we should decrement the refcnt at least once for all *BSD.
*/
ifafree(&ia->ia_ifa);
splx(s);
}
void
in6_purgeif(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
in6_ifdetach(ifp);
IFNET_UNLOCK(ifp);
}
void
in6_purge_mcast_references(struct in6_multi *in6m)
{
struct in6_ifaddr *ia;
KASSERT(in6_multi_locked(RW_WRITER));
mutex_enter(&in6_ifaddr_lock);
IN6_ADDRLIST_WRITER_FOREACH(ia) {
struct in6_multi_mship *imm;
LIST_FOREACH(imm, &ia->ia6_memberships, i6mm_chain) {
if (imm->i6mm_maddr == in6m)
imm->i6mm_maddr = NULL;
}
}
mutex_exit(&in6_ifaddr_lock);
}
/*
* SIOC[GAD]LIFADDR.
* SIOCGLIFADDR: get first address. (?)
* SIOCGLIFADDR with IFLR_PREFIX:
* get first address that matches the specified prefix.
* SIOCALIFADDR: add the specified address.
* SIOCALIFADDR with IFLR_PREFIX:
* add the specified prefix, filling hostid part from
* the first link-local address. prefixlen must be <= 64.
* SIOCDLIFADDR: delete the specified address.
* SIOCDLIFADDR with IFLR_PREFIX:
* delete the first address that matches the specified prefix.
* return values:
* EINVAL on invalid parameters
* EADDRNOTAVAIL on prefix match failed/specified address not found
* other values may be returned from in6_ioctl()
*
* NOTE: SIOCALIFADDR (with IFLR_PREFIX set) allows a prefixlen less
* than 64, to accommodate address naming schemes other than RFC 2374
* in the future.
* RFC 2373 defines the interface id to be 64 bits, but it allows
* non-RFC 2374 address encoding schemes (see the figure on page 8).
*/
static int
in6_lifaddr_ioctl(struct socket *so, u_long cmd, void *data,
struct ifnet *ifp)
{
struct in6_ifaddr *ia = NULL; /* XXX gcc 4.8 maybe-uninitialized */
struct if_laddrreq *iflr = (struct if_laddrreq *)data;
struct ifaddr *ifa;
struct sockaddr *sa;
/* sanity checks */
if (!data || !ifp) {
panic("invalid argument to in6_lifaddr_ioctl");
/* NOTREACHED */
}
switch (cmd) {
case SIOCGLIFADDR:
/* address must be specified on GET with IFLR_PREFIX */
if ((iflr->flags & IFLR_PREFIX) == 0)
break;
/* FALLTHROUGH */
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* address must be specified on ADD and DELETE */
sa = (struct sockaddr *)&iflr->addr;
if (sa->sa_family != AF_INET6)
return EINVAL;
if (sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
/* XXX need improvement */
sa = (struct sockaddr *)&iflr->dstaddr;
if (sa->sa_family && sa->sa_family != AF_INET6)
return EINVAL;
if (sa->sa_len && sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
break;
default: /* shouldn't happen */
#if 0
panic("invalid cmd to in6_lifaddr_ioctl");
/* NOTREACHED */
#else
return EOPNOTSUPP;
#endif
}
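/* The prefix length may not exceed the 128 bits of an IPv6 address. */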
if (sizeof(struct in6_addr) * NBBY < iflr->prefixlen)
return EINVAL;
switch (cmd) {
case SIOCALIFADDR:
{
struct in6_aliasreq ifra;
struct in6_addr *xhostid = NULL;
int prefixlen;
int bound = curlwp_bind();
struct psref psref;
if ((iflr->flags & IFLR_PREFIX) != 0) {
struct sockaddr_in6 *sin6;
/*
* xhostid is to fill in the hostid part of the
* address. xhostid points to the first link-local
* address attached to the interface.
*/
ia = in6ifa_ifpforlinklocal_psref(ifp, 0, &psref);
if (ia == NULL) {
curlwp_bindx(bound);
return EADDRNOTAVAIL;
}
xhostid = IFA_IN6(&ia->ia_ifa);
/* prefixlen must be <= 64. */
if (64 < iflr->prefixlen) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return EINVAL;
}
prefixlen = iflr->prefixlen;
/* hostid part must be zero. */
sin6 = (struct sockaddr_in6 *)&iflr->addr;
if (sin6->sin6_addr.s6_addr32[2] != 0
|| sin6->sin6_addr.s6_addr32[3] != 0) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return EINVAL;
}
} else
prefixlen = iflr->prefixlen;
/* copy args to in6_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
memset(&ifra, 0, sizeof(ifra));
memcpy(ifra.ifra_name, iflr->iflr_name, sizeof(ifra.ifra_name));
memcpy(&ifra.ifra_addr, &iflr->addr,
((struct sockaddr *)&iflr->addr)->sa_len);
if (xhostid) {
/* fill in hostid part */
ifra.ifra_addr.sin6_addr.s6_addr32[2] =
xhostid->s6_addr32[2];
ifra.ifra_addr.sin6_addr.s6_addr32[3] =
xhostid->s6_addr32[3];
}
if (((struct sockaddr *)&iflr->dstaddr)->sa_family) { /* XXX */
memcpy(&ifra.ifra_dstaddr, &iflr->dstaddr,
((struct sockaddr *)&iflr->dstaddr)->sa_len);
if (xhostid) {
ifra.ifra_dstaddr.sin6_addr.s6_addr32[2] =
xhostid->s6_addr32[2];
ifra.ifra_dstaddr.sin6_addr.s6_addr32[3] =
xhostid->s6_addr32[3];
}
}
if (xhostid) {
ia6_release(ia, &psref);
ia = NULL;
}
curlwp_bindx(bound);
ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
in6_prefixlen2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen);
ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
ifra.ifra_flags = iflr->flags & ~IFLR_PREFIX;
return in6_control(so, SIOCAIFADDR_IN6, &ifra, ifp);
}
case SIOCGLIFADDR:
case SIOCDLIFADDR:
{
struct in6_addr mask, candidate, match;
struct sockaddr_in6 *sin6;
int cmp;
int error, s;
memset(&mask, 0, sizeof(mask));
if (iflr->flags & IFLR_PREFIX) {
/* lookup a prefix rather than address. */
in6_prefixlen2mask(&mask, iflr->prefixlen);
sin6 = (struct sockaddr_in6 *)&iflr->addr;
memcpy(&match, &sin6->sin6_addr, sizeof(match));
match.s6_addr32[0] &= mask.s6_addr32[0];
match.s6_addr32[1] &= mask.s6_addr32[1];
match.s6_addr32[2] &= mask.s6_addr32[2];
match.s6_addr32[3] &= mask.s6_addr32[3];
/* if you set extra bits, that's wrong */
if (memcmp(&match, &sin6->sin6_addr, sizeof(match)))
return EINVAL;
cmp = 1;
} else {
if (cmd == SIOCGLIFADDR) {
/* on getting an address, take the 1st match */
cmp = 0; /* XXX */
} else {
/* on deleting an address, do exact match */
in6_prefixlen2mask(&mask, 128);
sin6 = (struct sockaddr_in6 *)&iflr->addr;
memcpy(&match, &sin6->sin6_addr, sizeof(match));
cmp = 1;
}
}
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!cmp)
break;
/*
* XXX: this is ad hoc, but is necessary to allow
* a user to specify fe80::/64 (not /10) for a
* link-local address.
*/
memcpy(&candidate, IFA_IN6(ifa), sizeof(candidate));
in6_clearscope(&candidate);
candidate.s6_addr32[0] &= mask.s6_addr32[0];
candidate.s6_addr32[1] &= mask.s6_addr32[1];
candidate.s6_addr32[2] &= mask.s6_addr32[2];
candidate.s6_addr32[3] &= mask.s6_addr32[3];
if (IN6_ARE_ADDR_EQUAL(&candidate, &match))
break;
}
if (!ifa) {
error = EADDRNOTAVAIL;
goto error;
}
ia = ifa2ia6(ifa);
if (cmd == SIOCGLIFADDR) {
/* fill in the if_laddrreq structure */
memcpy(&iflr->addr, &ia->ia_addr, ia->ia_addr.sin6_len);
error = sa6_recoverscope(
(struct sockaddr_in6 *)&iflr->addr);
if (error != 0)
goto error;
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
memcpy(&iflr->dstaddr, &ia->ia_dstaddr,
ia->ia_dstaddr.sin6_len);
error = sa6_recoverscope(
(struct sockaddr_in6 *)&iflr->dstaddr);
if (error != 0)
goto error;
} else
memset(&iflr->dstaddr, 0, sizeof(iflr->dstaddr));
iflr->prefixlen =
in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
iflr->flags = ia->ia6_flags; /* XXX */
error = 0;
} else {
struct in6_aliasreq ifra;
/* fill in6_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
memset(&ifra, 0, sizeof(ifra));
memcpy(ifra.ifra_name, iflr->iflr_name,
sizeof(ifra.ifra_name));
memcpy(&ifra.ifra_addr, &ia->ia_addr,
ia->ia_addr.sin6_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
memcpy(&ifra.ifra_dstaddr, &ia->ia_dstaddr,
ia->ia_dstaddr.sin6_len);
} else {
memset(&ifra.ifra_dstaddr, 0,
sizeof(ifra.ifra_dstaddr));
}
memcpy(&ifra.ifra_dstaddr, &ia->ia_prefixmask,
ia->ia_prefixmask.sin6_len);
ifra.ifra_flags = ia->ia6_flags;
pserialize_read_exit(s);
return in6_control(so, SIOCDIFADDR_IN6, &ifra, ifp);
}
error:
pserialize_read_exit(s);
return error;
}
}
return EOPNOTSUPP; /* just for safety */
}
/*
* Initialize an interface's internet6 address
* and routing table entry.
*/
static int
in6_ifinit(struct ifnet *ifp, struct in6_ifaddr *ia,
const struct sockaddr_in6 *sin6, int newhost)
{
int error = 0, ifacount = 0;
int s;
struct ifaddr *ifa;
KASSERT(mutex_owned(&in6_ifaddr_lock));
/*
* Give the interface a chance to initialize
* if this is its first address,
* and to validate the address if necessary.
*/
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifacount++;
}
pserialize_read_exit(s);
ia->ia_addr = *sin6;
if (ifacount == 0 &&
(error = if_addr_init(ifp, &ia->ia_ifa, true)) != 0) {
return error;
}
ia->ia_ifa.ifa_metric = ifp->if_metric;
/* we could do in(6)_socktrim here, but omit it for now. */
/* Add ownaddr as loopback rtentry, if necessary (ex. on p2p link). */
if (newhost) {
/* set the rtrequest function to create llinfo */
if (ifp->if_flags & IFF_POINTOPOINT)
ia->ia_ifa.ifa_rtrequest = p2p_rtrequest;
else if ((ifp->if_flags & IFF_LOOPBACK) == 0)
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
in6_ifaddlocal(&ia->ia_ifa);
} else {
/* Inform the routing socket of new flags/timings */
rt_addrmsg(RTM_NEWADDR, &ia->ia_ifa);
}
/* Add the network prefix route. */
if ((error = in6_ifaddprefix(ia)) != 0) {
if (newhost)
in6_ifremlocal(&ia->ia_ifa);
return error;
}
return error;
}
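/*
* Of two interface addresses, return the one with the higher
* ifa_preference (a NULL best_ifa always loses).
*/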
static struct ifaddr *
bestifa(struct ifaddr *best_ifa, struct ifaddr *ifa)
{
if (best_ifa == NULL || best_ifa->ifa_preference < ifa->ifa_preference)
return ifa;
return best_ifa;
}
/*
* Find an IPv6 interface link-local address specific to an interface.
*/
struct in6_ifaddr *
in6ifa_ifpforlinklocal(const struct ifnet *ifp, const int ignoreflags)
{
struct ifaddr *best_ifa = NULL, *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa)))
continue;
if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0)
continue;
best_ifa = bestifa(best_ifa, ifa);
}
return (struct in6_ifaddr *)best_ifa;
}
struct in6_ifaddr *
in6ifa_ifpforlinklocal_psref(const struct ifnet *ifp, const int ignoreflags,
struct psref *psref)
{
struct in6_ifaddr *ia;
int s = pserialize_read_enter();
ia = in6ifa_ifpforlinklocal(ifp, ignoreflags);
if (ia != NULL)
ia6_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
/*
* find the internet address corresponding to a given address.
* ifaddr is returned referenced.
*/
struct in6_ifaddr *
in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid)
{
struct in6_ifaddr *ia;
int s;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) {
if (zoneid != 0 &&
zoneid != ia->ia_addr.sin6_scope_id)
continue;
ifaref(&ia->ia_ifa);
break;
}
}
pserialize_read_exit(s);
return ia;
}
/*
* find the internet address corresponding to a given interface and address.
*/
struct in6_ifaddr *
in6ifa_ifpwithaddr(const struct ifnet *ifp, const struct in6_addr *addr)
{
struct ifaddr *best_ifa = NULL, *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa)))
continue;
best_ifa = bestifa(best_ifa, ifa);
}
return (struct in6_ifaddr *)best_ifa;
}
struct in6_ifaddr *
in6ifa_ifpwithaddr_psref(const struct ifnet *ifp, const struct in6_addr *addr,
struct psref *psref)
{
struct in6_ifaddr *ia;
int s = pserialize_read_enter();
ia = in6ifa_ifpwithaddr(ifp, addr);
if (ia != NULL)
ia6_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
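/* As bestifa(), but for in6_ifaddrs: prefer the higher ifa_preference. */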
static struct in6_ifaddr *
bestia(struct in6_ifaddr *best_ia, struct in6_ifaddr *ia)
{
if (best_ia == NULL || best_ia->ia_ifa.ifa_preference < ia->ia_ifa.ifa_preference)
return ia;
return best_ia;
}
/*
* Determine if an address is on a local network.
*/
int
in6_localaddr(const struct in6_addr *in6)
{
struct in6_ifaddr *ia;
int s;
if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6))
return 1;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr,
&ia->ia_prefixmask.sin6_addr)) {
pserialize_read_exit(s);
return 1;
}
}
pserialize_read_exit(s);
return 0;
}
int
in6_is_addr_deprecated(struct sockaddr_in6 *sa6)
{
struct in6_ifaddr *ia;
int s;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
&sa6->sin6_addr) &&
#ifdef SCOPEDROUTING
ia->ia_addr.sin6_scope_id == sa6->sin6_scope_id &&
#endif
(ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) {
pserialize_read_exit(s);
return 1; /* true */
}
/* XXX: do we still have to go thru the rest of the list? */
}
pserialize_read_exit(s);
return 0; /* false */
}
/*
* return length of part which dst and src are equal
* hard coding...
*/
int
in6_matchlen(struct in6_addr *src, struct in6_addr *dst)
{
int match = 0;
u_char *s = (u_char *)src, *d = (u_char *)dst;
u_char *lim = s + 16, r;
while (s < lim)
if ((r = (*d++ ^ *s++)) != 0) {
while (r < 128) {
match++;
r <<= 1;
}
break;
} else
match += NBBY;
return match;
}
void
in6_prefixlen2mask(struct in6_addr *maskp, int len)
{
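/* maskarray[n] has the (n + 1) most significant bits of a byte set. */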
static const u_char maskarray[NBBY] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
int bytelen, bitlen, i;
/* sanity check */
if (len < 0 || len > 128) {
log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n",
len);
return;
}
memset(maskp, 0, sizeof(*maskp));
bytelen = len / NBBY;
bitlen = len % NBBY;
for (i = 0; i < bytelen; i++)
maskp->s6_addr[i] = 0xff;
if (bitlen)
maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
}
/*
* return the best address out of the same scope. if no address was
* found, return the first valid address from the designated interface.
*/
struct in6_ifaddr *
in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
{
int dst_scope = in6_addrscope(dst), blen = -1, tlen;
struct ifaddr *ifa;
struct in6_ifaddr *best_ia = NULL, *ia;
struct in6_ifaddr *dep[2]; /* last-resort: deprecated */
dep[0] = dep[1] = NULL;
/*
* We first look for addresses in the same scope.
* If there is one, return it.
* If two or more, return one which matches the dst longest.
* If none, return one of global addresses assigned other ifs.
*/
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (ia->ia6_flags & IN6_IFF_ANYCAST)
continue; /* XXX: is there any case to allow anycast? */
if (ia->ia6_flags & IN6_IFF_NOTREADY)
continue; /* don't use this interface */
if (ia->ia6_flags & IN6_IFF_DETACHED)
continue;
if (ia->ia6_flags & IN6_IFF_DEPRECATED) {
if (ip6_use_deprecated)
dep[0] = ia;
continue;
}
if (dst_scope != in6_addrscope(IFA_IN6(ifa)))
continue;
/*
* call in6_matchlen() as few times as possible
*/
if (best_ia == NULL) {
best_ia = ia;
continue;
}
if (blen == -1)
blen = in6_matchlen(&best_ia->ia_addr.sin6_addr, dst);
tlen = in6_matchlen(IFA_IN6(ifa), dst);
if (tlen > blen) {
blen = tlen;
best_ia = ia;
} else if (tlen == blen)
best_ia = bestia(best_ia, ia);
}
if (best_ia != NULL)
return best_ia;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (ia->ia6_flags & IN6_IFF_ANYCAST)
continue; /* XXX: is there any case to allow anycast? */
if (ia->ia6_flags & IN6_IFF_NOTREADY)
continue; /* don't use this interface */
if (ia->ia6_flags & IN6_IFF_DETACHED)
continue;
if (ia->ia6_flags & IN6_IFF_DEPRECATED) {
if (ip6_use_deprecated)
dep[1] = (struct in6_ifaddr *)ifa;
continue;
}
best_ia = bestia(best_ia, ia);
}
if (best_ia != NULL)
return best_ia;
/* use the last-resort values, that is, the deprecated addresses */
if (dep[0])
return dep[0];
if (dep[1])
return dep[1];
return NULL;
}
/*
* perform DAD when interface becomes IFF_UP.
*/
void
in6_if_link_up(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
int s, bound;
char ip6buf[INET6_ADDRSTRLEN];
/* Ensure it's sane to run DAD */
if (ifp->if_link_state == LINK_STATE_DOWN)
return;
if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING))
return;
bound = curlwp_bind();
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
ia = (struct in6_ifaddr *)ifa;
/* If detached then mark as tentative */
if (ia->ia6_flags & IN6_IFF_DETACHED) {
ia->ia6_flags &= ~IN6_IFF_DETACHED;
if (ip6_dad_enabled() && if_do_dad(ifp)) {
ia->ia6_flags |= IN6_IFF_TENTATIVE;
nd6log(LOG_ERR, "%s marked tentative\n",
IN6_PRINT(ip6buf,
&ia->ia_addr.sin6_addr));
} else if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0)
rt_addrmsg(RTM_NEWADDR, ifa);
}
if (ia->ia6_flags & IN6_IFF_TENTATIVE) {
int rand_delay;
/* Clear the duplicated flag as we're starting DAD. */
ia->ia6_flags &= ~IN6_IFF_DUPLICATED;
/*
* The TENTATIVE flag was likely set by hand
* beforehand, implicitly indicating the need for DAD.
* We may be able to skip the random delay in this
* case, but we impose delays just in case.
*/
rand_delay = cprng_fast32() %
(MAX_RTR_SOLICITATION_DELAY * hz);
/* +1 ensures callout is always used */
nd6_dad_start(ifa, rand_delay + 1);
}
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
}
void
in6_if_up(struct ifnet *ifp)
{
/*
* special cases, like 6to4, are handled in in6_ifattach
*/
in6_ifattach(ifp, NULL);
/* interface may not support link state, so bring it up also */
in6_if_link_up(ifp);
}
/*
* Mark all addresses as detached.
*/
void
in6_if_link_down(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
int s, bound;
char ip6buf[INET6_ADDRSTRLEN];
bound = curlwp_bind();
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
ia = (struct in6_ifaddr *)ifa;
/* Stop DAD processing */
nd6_dad_stop(ifa);
/*
* Mark the address as detached.
* This satisfies RFC4862 Section 5.3, but we should apply
* this logic to all addresses to be a good citizen and
* avoid potential duplicated addresses.
* When the interface comes up again, detached addresses
* are marked tentative and DAD commences.
*/
if (!(ia->ia6_flags & IN6_IFF_DETACHED)) {
nd6log(LOG_DEBUG, "%s marked detached\n",
IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr));
ia->ia6_flags |= IN6_IFF_DETACHED;
ia->ia6_flags &=
~(IN6_IFF_TENTATIVE | IN6_IFF_DUPLICATED);
rt_addrmsg(RTM_NEWADDR, ifa);
}
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
/* Clear ND6_IFF_IFDISABLED to allow DAD again on link-up. */
if (ifp->if_afdata[AF_INET6] != NULL)
ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
}
void
in6_if_down(struct ifnet *ifp)
{
in6_if_link_down(ifp);
lltable_purge_entries(LLTABLE6(ifp));
}
void
in6_if_link_state_change(struct ifnet *ifp, int link_state)
{
/*
* Treat LINK_STATE_UNKNOWN as UP.
* LINK_STATE_UNKNOWN transitions to LINK_STATE_DOWN when
* if_link_state_change() transitions to LINK_STATE_UP.
*/
if (link_state == LINK_STATE_DOWN)
in6_if_link_down(ifp);
else
in6_if_link_up(ifp);
}
int
in6_tunnel_validate(const struct ip6_hdr *ip6, const struct in6_addr *src,
const struct in6_addr *dst)
{
/* check for address match */
if (!IN6_ARE_ADDR_EQUAL(src, &ip6->ip6_dst) ||
!IN6_ARE_ADDR_EQUAL(dst, &ip6->ip6_src))
return 0;
/* martian filters on outer source - done in ip6_input */
/* NOTE: the packet may be dropped by uRPF. */
/* return valid bytes length */
return sizeof(*src) + sizeof(*dst);
}
#define IN6_LLTBL_DEFAULT_HSIZE 32
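/*
* Fold the bytes of a 32-bit key together with xor and mask the result
* into the table; the hash size is assumed to be a power of two.
*/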
#define IN6_LLTBL_HASH(k, h) \
(((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1))
/*
* Do actual deallocation of @lle.
* Called by LLE_FREE_LOCKED when number of references
* drops to zero.
*/
static void
in6_lltable_destroy_lle(struct llentry *lle)
{
KASSERTMSG(lle->la_numheld == 0, "la_numheld=%d", lle->la_numheld);
LLE_WUNLOCK(lle);
LLE_LOCK_DESTROY(lle);
llentry_pool_put(lle);
}
static struct llentry *
in6_lltable_new(const struct in6_addr *addr6, u_int flags)
{
struct llentry *lle;
lle = llentry_pool_get(PR_NOWAIT);
if (lle == NULL) /* NB: caller generates msg */
return NULL;
lle->r_l3addr.addr6 = *addr6;
lle->lle_refcnt = 1;
lle->lle_free = in6_lltable_destroy_lle;
LLE_LOCK_INIT(lle);
callout_init(&lle->lle_timer, CALLOUT_MPSAFE);
return lle;
}
static int
in6_lltable_match_prefix(const struct sockaddr *prefix,
const struct sockaddr *mask, u_int flags, struct llentry *lle)
{
const struct sockaddr_in6 *pfx = (const struct sockaddr_in6 *)prefix;
const struct sockaddr_in6 *msk = (const struct sockaddr_in6 *)mask;
if (IN6_ARE_MASKED_ADDR_EQUAL(&lle->r_l3addr.addr6,
&pfx->sin6_addr, &msk->sin6_addr) &&
((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)))
return 1;
return 0;
}
static void
in6_lltable_free_entry(struct lltable *llt, struct llentry *lle)
{
LLE_WLOCK_ASSERT(lle);
(void) llentry_free(lle);
}
static int
in6_lltable_rtcheck(struct ifnet *ifp, u_int flags,
const struct sockaddr *l3addr, const struct rtentry *rt)
{
char ip6buf[INET6_ADDRSTRLEN];
if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) || rt->rt_ifp != ifp) {
int s;
struct ifaddr *ifa;
/*
* Create an ND6 cache for an IPv6 neighbor
* that is not covered by our own prefix.
*/
/* XXX ifaof_ifpforaddr should take a const param */
s = pserialize_read_enter();
ifa = ifaof_ifpforaddr(l3addr, ifp);
if (ifa != NULL) {
pserialize_read_exit(s);
return 0;
}
pserialize_read_exit(s);
log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n",
IN6_PRINT(ip6buf,
&((const struct sockaddr_in6 *)l3addr)->sin6_addr));
return EINVAL;
}
return 0;
}
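/* Only the low-order 32 bits of the address feed the hash. */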
static inline uint32_t
in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize)
{
return IN6_LLTBL_HASH(dst->s6_addr32[3], hsize);
}
static uint32_t
in6_lltable_hash(const struct llentry *lle, uint32_t hsize)
{
return in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize);
}
static void
in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
{
struct sockaddr_in6 *sin6;
sin6 = (struct sockaddr_in6 *)sa;
bzero(sin6, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_addr = lle->r_l3addr.addr6;
}
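/* Find a live (not LLE_DELETED) entry for dst in the hash table. */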
static inline struct llentry *
in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst)
{
struct llentry *lle;
struct llentries *lleh;
u_int hashidx;
hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize);
lleh = &llt->lle_head[hashidx];
LIST_FOREACH(lle, lleh, lle_next) {
if (lle->la_flags & LLE_DELETED)
continue;
if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst))
break;
}
return lle;
}
static int
in6_lltable_delete(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct llentry *lle;
IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle == NULL) {
#ifdef LLTABLE_DEBUG
char buf[64];
sockaddr_format(l3addr, buf, sizeof(buf));
log(LOG_INFO, "%s: cache for %s is not found\n",
__func__, buf);
#endif
return ENOENT;
}
LLE_WLOCK(lle);
#ifdef LLTABLE_DEBUG
{
char buf[64];
sockaddr_format(l3addr, buf, sizeof(buf));
log(LOG_INFO, "%s: cache for %s (%p) is deleted\n",
__func__, buf, lle);
}
#endif
llentry_free(lle);
return 0;
}
static struct llentry *
in6_lltable_create(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr, const struct rtentry *rt)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct ifnet *ifp = llt->llt_ifp;
struct llentry *lle;
IF_AFDATA_WLOCK_ASSERT(ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle != NULL) {
LLE_WLOCK(lle);
return lle;
}
/*
* A route that covers the given address must have been
* installed first because we are doing a resolution;
* verify this.
*/
if (!(flags & LLE_IFADDR) &&
in6_lltable_rtcheck(ifp, flags, l3addr, rt) != 0)
return NULL;
lle = in6_lltable_new(&sin6->sin6_addr, flags);
if (lle == NULL) {
log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
return NULL;
}
lle->la_flags = flags;
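/*
* An entry for one of our own addresses resolves to the
* interface's link-layer address and is immediately valid.
*/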
if ((flags & LLE_IFADDR) == LLE_IFADDR) {
memcpy(&lle->ll_addr, CLLADDR(ifp->if_sadl), ifp->if_addrlen);
lle->la_flags |= LLE_VALID;
}
lltable_link_entry(llt, lle);
LLE_WLOCK(lle);
return lle;
}
static struct llentry *
in6_lltable_lookup(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct llentry *lle;
IF_AFDATA_LOCK_ASSERT(llt->llt_ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle == NULL)
return NULL;
if (flags & LLE_EXCLUSIVE)
LLE_WLOCK(lle);
else
LLE_RLOCK(lle);
return lle;
}
static int
in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
struct rt_walkarg *w)
{
struct sockaddr_in6 sin6;
LLTABLE_LOCK_ASSERT();
/* skip deleted entries */
if (lle->la_flags & LLE_DELETED)
return 0;
sockaddr_in6_init(&sin6, &lle->r_l3addr.addr6, 0, 0, 0);
return lltable_dump_entry(llt, lle, w, sin6tosa(&sin6));
}
static struct lltable *
in6_lltattach(struct ifnet *ifp)
{
struct lltable *llt;
llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE);
llt->llt_af = AF_INET6;
llt->llt_ifp = ifp;
llt->llt_lookup = in6_lltable_lookup;
llt->llt_create = in6_lltable_create;
llt->llt_delete = in6_lltable_delete;
llt->llt_dump_entry = in6_lltable_dump_entry;
llt->llt_hash = in6_lltable_hash;
llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry;
llt->llt_free_entry = in6_lltable_free_entry;
llt->llt_match_prefix = in6_lltable_match_prefix;
lltable_link(llt);
return llt;
}
void *
in6_domifattach(struct ifnet *ifp)
{
struct in6_ifextra *ext;
ext = malloc(sizeof(*ext), M_IFADDR, M_WAITOK|M_ZERO);
ext->in6_ifstat = malloc(sizeof(struct in6_ifstat),
M_IFADDR, M_WAITOK|M_ZERO);
ext->icmp6_ifstat = malloc(sizeof(struct icmp6_ifstat),
M_IFADDR, M_WAITOK|M_ZERO);
ext->nd_ifinfo = nd6_ifattach(ifp);
ext->scope6_id = scope6_ifattach(ifp);
ext->lltable = in6_lltattach(ifp);
return ext;
}
void
in6_domifdetach(struct ifnet *ifp, void *aux)
{
struct in6_ifextra *ext = (struct in6_ifextra *)aux;
lltable_free(ext->lltable);
ext->lltable = NULL;
SOFTNET_LOCK_UNLESS_NET_MPSAFE();
nd6_ifdetach(ifp, ext);
SOFTNET_UNLOCK_UNLESS_NET_MPSAFE();
free(ext->in6_ifstat, M_IFADDR);
free(ext->icmp6_ifstat, M_IFADDR);
scope6_ifdetach(ext->scope6_id);
free(ext, M_IFADDR);
}
/*
* Convert IPv4 address stored in struct in_addr to IPv4-Mapped IPv6 address
* stored in struct in6_addr as defined in RFC 4291 section 2.5.5.2.
*/
void
in6_in_2_v4mapin6(const struct in_addr *in, struct in6_addr *in6)
{
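/* Build the IPv4-mapped prefix ::ffff:0:0/96 around the IPv4 address. */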
in6->s6_addr32[0] = 0;
in6->s6_addr32[1] = 0;
in6->s6_addr32[2] = IPV6_ADDR_INT32_SMP;
in6->s6_addr32[3] = in->s_addr;
}
/*
* Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be
* v4 mapped addr or v4 compat addr
*/
void
in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
memset(sin, 0, sizeof(*sin));
sin->sin_len = sizeof(struct sockaddr_in);
sin->sin_family = AF_INET;
sin->sin_port = sin6->sin6_port;
sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3];
}
/* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */
void
in6_sin_2_v4mapsin6(const struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
memset(sin6, 0, sizeof(*sin6));
sin6->sin6_len = sizeof(struct sockaddr_in6);
sin6->sin6_family = AF_INET6;
sin6->sin6_port = sin->sin_port;
in6_in_2_v4mapin6(&sin->sin_addr, &sin6->sin6_addr);
}
/* Convert sockaddr_in6 into sockaddr_in. */
void
in6_sin6_2_sin_in_sock(struct sockaddr *nam)
{
struct sockaddr_in *sin_p;
struct sockaddr_in6 sin6;
/*
* Save original sockaddr_in6 addr and convert it
* to sockaddr_in.
*/
sin6 = *(struct sockaddr_in6 *)nam;
sin_p = (struct sockaddr_in *)nam;
in6_sin6_2_sin(sin_p, &sin6);
}
/* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */
void
in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam)
{
struct sockaddr_in *sin_p;
struct sockaddr_in6 *sin6_p;
sin6_p = malloc(sizeof(*sin6_p), M_SONAME, M_WAITOK);
sin_p = (struct sockaddr_in *)*nam;
in6_sin_2_v4mapsin6(sin_p, sin6_p);
free(*nam, M_SONAME);
*nam = sin6tosa(sin6_p);
}
/* $NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $ */
/*
* Copyright (c) 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_subr.c 8.20 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1994 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_subr.c 8.20 (Berkeley) 5/20/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <uvm/uvm_extern.h>
#include <fs/union/union.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
static LIST_HEAD(uhashhead, union_node) *uhashtbl;
static u_long uhash_mask; /* size of hash table - 1 */
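/*
* Hash a union node on the addresses of its upper and lower vnodes;
* uhash_mask comes from hashinit() and is the table size minus one.
*/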
#define UNION_HASH(u, l) \
((((u_long) (u) + (u_long) (l)) >> 8) & uhash_mask)
#define NOHASH ((u_long)-1)
static kmutex_t uhash_lock;
static void union_newupper(struct union_node *, struct vnode *);
static void union_newlower(struct union_node *, struct vnode *);
static void union_ref(struct union_node *);
static void union_rele(struct union_node *);
static int union_do_lookup(struct vnode *, struct componentname *, kauth_cred_t, const char *);
int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *);
static void union_dircache_r(struct vnode *, struct vnode ***, int *);
struct vnode *union_dircache(struct vnode *, struct lwp *);
void
union_init(void)
{
mutex_init(&uhash_lock, MUTEX_DEFAULT, IPL_NONE);
uhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &uhash_mask);
}
void
union_reinit(void)
{
struct union_node *un;
struct uhashhead *oldhash, *hash;
u_long oldmask, mask, val;
int i;
hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
mutex_enter(&uhash_lock);
oldhash = uhashtbl;
oldmask = uhash_mask;
uhashtbl = hash;
uhash_mask = mask;
for (i = 0; i <= oldmask; i++) {
while ((un = LIST_FIRST(&oldhash[i])) != NULL) {
LIST_REMOVE(un, un_cache);
val = UNION_HASH(un->un_uppervp, un->un_lowervp);
LIST_INSERT_HEAD(&hash[val], un, un_cache);
}
}
mutex_exit(&uhash_lock);
hashdone(oldhash, HASH_LIST, oldmask);
}
/*
* Free global unionfs resources.
*/
void
union_done(void)
{
hashdone(uhashtbl, HASH_LIST, uhash_mask);
mutex_destroy(&uhash_lock);
/* Make sure to unset the readdir hook. */
vn_union_readdir_hook = NULL;
}
void
union_newlower(struct union_node *un, struct vnode *lowervp)
{
int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
int nhash = UNION_HASH(un->un_uppervp, lowervp);
if (un->un_lowervp == lowervp)
return;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
KASSERT(un->un_lowervp == NULL);
mutex_enter(&uhash_lock);
if (ohash != nhash && (un->un_cflags & UN_CACHED)) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_enter(&un->un_lock);
un->un_lowervp = lowervp;
un->un_lowersz = VNOVAL;
mutex_exit(&un->un_lock);
if (ohash != nhash) {
LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
un->un_cflags |= UN_CACHED;
}
mutex_exit(&uhash_lock);
}
void
union_newupper(struct union_node *un, struct vnode *uppervp)
{
int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
int nhash = UNION_HASH(uppervp, un->un_lowervp);
struct vop_lock_args lock_ap;
struct vop_unlock_args unlock_ap;
int error __diagused;
if (un->un_uppervp == uppervp)
return;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
KASSERT(un->un_uppervp == NULL);
/*
* We have to transfer the vnode lock from the union vnode to
* the upper vnode. Lock the upper vnode first. We cannot use
* VOP_LOCK() here as it would break the fstrans state.
*/
lock_ap.a_desc = VDESC(vop_lock);
lock_ap.a_vp = uppervp;
lock_ap.a_flags = LK_EXCLUSIVE;
error = VCALL(lock_ap.a_vp, VOFFSET(vop_lock), &lock_ap);
KASSERT(error == 0);
mutex_enter(&uhash_lock);
if (ohash != nhash && (un->un_cflags & UN_CACHED)) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_enter(&un->un_lock);
un->un_uppervp = uppervp;
un->un_uppersz = VNOVAL;
/*
* With the upper vnode in place unlock the union vnode to
* finalize the lock transfer.
*/
unlock_ap.a_desc = VDESC(vop_unlock);
unlock_ap.a_vp = UNIONTOV(un);
genfs_unlock(&unlock_ap);
/* Update union vnode interlock, vmobjlock, & klist. */
vshareilock(UNIONTOV(un), uppervp);
rw_obj_hold(uppervp->v_uobj.vmobjlock);
uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock);
vshareklist(UNIONTOV(un), uppervp);
mutex_exit(&un->un_lock);
if (ohash != nhash) {
LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
un->un_cflags |= UN_CACHED;
}
mutex_exit(&uhash_lock);
}
/*
* Keep track of size changes in the underlying vnodes.
* If the size changes, then call back to the vm layer
* giving priority to the upper layer size.
*
* Mutex un_lock is held on entry and released on return.
*/
void
union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz)
{
struct union_node *un = VTOUNION(vp);
off_t sz;
KASSERT(mutex_owned(&un->un_lock));
/* only interested in regular files */
if (vp->v_type != VREG) {
mutex_exit(&un->un_lock);
uvm_vnp_setsize(vp, 0);
return;
}
sz = VNOVAL;
if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
un->un_uppersz = uppersz;
if (sz == VNOVAL)
sz = un->un_uppersz;
}
if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
un->un_lowersz = lowersz;
if (sz == VNOVAL)
sz = un->un_lowersz;
}
mutex_exit(&un->un_lock);
if (sz != VNOVAL) {
#ifdef UNION_DIAGNOSTIC
printf("union: %s size now %qd\n",
uppersz != VNOVAL ? "upper" : "lower", sz);
#endif
uvm_vnp_setsize(vp, sz);
}
}
static void
union_ref(struct union_node *un)
{
KASSERT(mutex_owned(&uhash_lock));
un->un_refs++;
}
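/*
* Drop a reference to a union node; when the last reference goes away
* the component vnodes are released and the node is freed.
*/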
static void
union_rele(struct union_node *un)
{
mutex_enter(&uhash_lock);
un->un_refs--;
if (un->un_refs > 0) {
mutex_exit(&uhash_lock);
return;
}
if (un->un_cflags & UN_CACHED) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_exit(&uhash_lock);
if (un->un_pvp != NULLVP)
vrele(un->un_pvp);
if (un->un_uppervp != NULLVP)
vrele(un->un_uppervp);
if (un->un_lowervp != NULLVP)
vrele(un->un_lowervp);
if (un->un_dirvp != NULLVP)
vrele(un->un_dirvp);
if (un->un_path)
free(un->un_path, M_TEMP);
mutex_destroy(&un->un_lock);
free(un, M_TEMP);
}
/*
* allocate a union_node/vnode pair. the vnode is
* referenced and unlocked. the new vnode is returned
* via (vpp). (mp) is the mountpoint of the union filesystem,
* (dvp) is the parent directory where the upper layer object
* should exist (but doesn't) and (cnp) is the componentname
* information which is partially copied to allow the upper
* layer object to be created at a later time. (uppervp)
* and (lowervp) reference the upper and lower layer objects
* being mapped. either, but not both, can be nil.
* both, if supplied, are unlocked.
* the reference is either maintained in the new union_node
* object which is allocated, or they are vrele'd.
*
* all union_nodes are maintained on a hash
* list. new nodes are only allocated when they cannot
* be found on this list. entries on the list are
* removed when the vfs reclaim entry is called.
*
* the vnode gets attached or referenced with vcache_get().
*/
int
union_allocvp(
struct vnode **vpp,
struct mount *mp,
struct vnode *undvp, /* parent union vnode */
struct vnode *dvp, /* may be null */
struct componentname *cnp, /* may be null */
struct vnode *uppervp, /* may be null */
struct vnode *lowervp, /* may be null */
int docache)
{
int error;
struct union_node *un = NULL, *un1;
struct vnode *vp, *xlowervp = NULLVP;
u_long hash[3];
int try;
bool is_dotdot;
is_dotdot = (dvp != NULL && cnp != NULL && (cnp->cn_flags & ISDOTDOT));
if (uppervp == NULLVP && lowervp == NULLVP)
panic("union: unidentifiable allocation"); if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
xlowervp = lowervp;
lowervp = NULLVP;
}
/*
* If both uppervp and lowervp are not NULL we have to
* search union nodes with one vnode as NULL too.
*/
hash[0] = UNION_HASH(uppervp, lowervp);
if (uppervp == NULL || lowervp == NULL) {
hash[1] = hash[2] = NOHASH;
} else {
hash[1] = UNION_HASH(uppervp, NULLVP);
hash[2] = UNION_HASH(NULLVP, lowervp);
}
if (!docache) {
un = NULL;
goto found;
}
loop:
mutex_enter(&uhash_lock);
for (try = 0; try < 3; try++) {
if (hash[try] == NOHASH)
continue;
LIST_FOREACH(un, &uhashtbl[hash[try]], un_cache) {
if ((un->un_lowervp && un->un_lowervp != lowervp) ||
(un->un_uppervp && un->un_uppervp != uppervp) ||
un->un_mount != mp)
continue;
union_ref(un);
mutex_exit(&uhash_lock);
error = vcache_get(mp, &un, sizeof(un), &vp);
KASSERT(error != 0 || UNIONTOV(un) == vp);
union_rele(un);
if (error == ENOENT)
goto loop;
else if (error)
goto out;
goto found;
}
}
mutex_exit(&uhash_lock);
found:
if (un) {
if (uppervp != dvp) {
if (is_dotdot)
VOP_UNLOCK(dvp);
vn_lock(UNIONTOV(un), LK_EXCLUSIVE | LK_RETRY);
if (is_dotdot)
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
}
/*
* Save information about the upper layer.
*/
if (uppervp != un->un_uppervp) {
union_newupper(un, uppervp);
} else if (uppervp) {
vrele(uppervp);
}
/*
* Save information about the lower layer.
* This needs to keep track of pathname
* and directory information which union_vn_create
* might need.
*/
if (lowervp != un->un_lowervp) {
union_newlower(un, lowervp);
if (cnp && (lowervp != NULLVP)) {
un->un_path = malloc(cnp->cn_namelen+1,
M_TEMP, M_WAITOK);
memcpy(un->un_path, cnp->cn_nameptr,
cnp->cn_namelen);
un->un_path[cnp->cn_namelen] = '\0';
vref(dvp);
un->un_dirvp = dvp;
}
} else if (lowervp) {
vrele(lowervp);
}
*vpp = UNIONTOV(un);
if (uppervp != dvp)
VOP_UNLOCK(*vpp);
error = 0;
goto out;
}
un = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK);
mutex_init(&un->un_lock, MUTEX_DEFAULT, IPL_NONE);
un->un_refs = 1;
un->un_mount = mp;
un->un_vnode = NULL;
un->un_uppervp = uppervp;
un->un_lowervp = lowervp;
un->un_pvp = undvp;
if (undvp != NULLVP)
vref(undvp);
un->un_dircache = 0;
un->un_openl = 0;
un->un_cflags = 0;
un->un_hooknode = false;
un->un_uppersz = VNOVAL;
un->un_lowersz = VNOVAL;
if (dvp && cnp && (lowervp != NULLVP)) {
un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen);
un->un_path[cnp->cn_namelen] = '\0';
vref(dvp);
un->un_dirvp = dvp;
} else {
un->un_path = 0;
un->un_dirvp = 0;
}
if (docache) {
mutex_enter(&uhash_lock);
LIST_FOREACH(un1, &uhashtbl[hash[0]], un_cache) {
if (un1->un_lowervp == lowervp && un1->un_uppervp == uppervp &&
un1->un_mount == mp) {
/*
* Another thread beat us, push back freshly
* allocated node and retry.
*/
mutex_exit(&uhash_lock);
union_rele(un);
goto loop;
}
}
LIST_INSERT_HEAD(&uhashtbl[hash[0]], un, un_cache);
un->un_cflags |= UN_CACHED;
mutex_exit(&uhash_lock);
}
error = vcache_get(mp, &un, sizeof(un), vpp);
KASSERT(error != 0 || UNIONTOV(un) == *vpp);
union_rele(un);
if (error == ENOENT)
goto loop;
out:
if (xlowervp)
vrele(xlowervp);
return error;
}
int
union_freevp(struct vnode *vp)
{
struct union_node *un = VTOUNION(vp);
/* Detach vnode from union node. */
un->un_vnode = NULL;
un->un_uppersz = VNOVAL;
un->un_lowersz = VNOVAL;
/* Detach union node from vnode. */
mutex_enter(vp->v_interlock);
vp->v_data = NULL;
mutex_exit(vp->v_interlock);
union_rele(un);
return 0;
}
int
union_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct vattr va;
struct vnode *svp;
struct union_node *un;
struct union_mount *um;
voff_t uppersz, lowersz;
KASSERT(key_len == sizeof(un));
memcpy(&un, key, key_len);
um = MOUNTTOUNIONMOUNT(mp);
svp = (un->un_uppervp != NULLVP) ? un->un_uppervp : un->un_lowervp;
vp->v_tag = VT_UNION;
vp->v_op = union_vnodeop_p;
vp->v_data = un;
un->un_vnode = vp;
vp->v_type = svp->v_type;
if (svp->v_type == VCHR || svp->v_type == VBLK)
spec_node_init(vp, svp->v_rdev);
vshareilock(vp, svp);
rw_obj_hold(svp->v_uobj.vmobjlock);
uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock);
vshareklist(vp, svp);
/* detect the root vnode (and aliases) */
if ((un->un_uppervp == um->um_uppervp) &&
((un->un_lowervp == NULLVP) || un->un_lowervp == um->um_lowervp)) {
if (un->un_lowervp == NULLVP) {
un->un_lowervp = um->um_lowervp;
if (un->un_lowervp != NULLVP)
vref(un->un_lowervp);
}
vp->v_vflag |= VV_ROOT;
}
uppersz = lowersz = VNOVAL;
if (un->un_uppervp != NULLVP) {
if (vn_lock(un->un_uppervp, LK_SHARED) == 0) {
if (VOP_GETATTR(un->un_uppervp, &va, FSCRED) == 0)
uppersz = va.va_size;
VOP_UNLOCK(un->un_uppervp);
}
}
if (un->un_lowervp != NULLVP) {
if (vn_lock(un->un_lowervp, LK_SHARED) == 0) {
if (VOP_GETATTR(un->un_lowervp, &va, FSCRED) == 0)
lowersz = va.va_size;
VOP_UNLOCK(un->un_lowervp);
}
}
mutex_enter(&un->un_lock);
union_newsize(vp, uppersz, lowersz);
mutex_enter(&uhash_lock);
union_ref(un);
mutex_exit(&uhash_lock);
*new_key = &vp->v_data;
return 0;
}
/*
* copyfile. copy the vnode (fvp) to the vnode (tvp)
* using a sequence of reads and writes. both (fvp)
* and (tvp) are locked on entry and exit.
*/
int
union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred,
struct lwp *l)
{
char *tbuf;
struct uio uio;
struct iovec iov;
int error = 0;
/*
* strategy:
* allocate a buffer of size MAXBSIZE.
* loop doing reads and writes, keeping track
* of the current uio offset.
* give up at the first sign of trouble.
*/
uio.uio_offset = 0;
UIO_SETUP_SYSSPACE(&uio);
tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
/* ugly loop follows... */
do {
off_t offset = uio.uio_offset;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
iov.iov_base = tbuf;
iov.iov_len = MAXBSIZE;
uio.uio_resid = iov.iov_len;
uio.uio_rw = UIO_READ;
error = VOP_READ(fvp, &uio, 0, cred);
if (error == 0) {
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
iov.iov_base = tbuf;
iov.iov_len = MAXBSIZE - uio.uio_resid;
uio.uio_offset = offset;
uio.uio_rw = UIO_WRITE;
uio.uio_resid = iov.iov_len;
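/* a zero-length read means we reached end of file */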
if (uio.uio_resid == 0)
break;
do {
error = VOP_WRITE(tvp, &uio, 0, cred);
} while ((uio.uio_resid > 0) && (error == 0));
}
} while (error == 0);
free(tbuf, M_TEMP);
return (error);
}
/*
* (un) is assumed to be locked on entry and remains
* locked on exit.
*/
int
union_copyup(struct union_node *un, int docopy, kauth_cred_t cred,
struct lwp *l)
{
int error;
struct vnode *lvp, *uvp;
struct vattr lvattr, uvattr;
error = union_vn_create(&uvp, un, l);
if (error)
return (error);
union_newupper(un, uvp);
lvp = un->un_lowervp;
if (docopy) {
/*
* XXX - should not ignore errors
* from VOP_CLOSE
*/
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_GETATTR(lvp, &lvattr, cred);
if (error == 0)
error = VOP_OPEN(lvp, FREAD, cred);
if (error == 0) {
error = union_copyfile(lvp, uvp, cred, l);
(void) VOP_CLOSE(lvp, FREAD, cred);
}
if (error == 0) {
/* Copy permissions up too */
vattr_null(&uvattr);
uvattr.va_mode = lvattr.va_mode;
uvattr.va_flags = lvattr.va_flags;
error = VOP_SETATTR(uvp, &uvattr, cred);
}
VOP_UNLOCK(lvp);
#ifdef UNION_DIAGNOSTIC
if (error == 0)
uprintf("union: copied up %s\n", un->un_path);
#endif
}
union_vn_close(uvp, FWRITE, cred, l);
/*
* Subsequent IOs will go to the top layer, so
* call close on the lower vnode and open on the
* upper vnode to ensure that the filesystem keeps
* its reference counts right. This doesn't do
* the right thing with (cred) and (FREAD) though.
* Ignoring error returns is not right, either.
*/
if (error == 0) {
int i;
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
for (i = 0; i < un->un_openl; i++) {
(void) VOP_CLOSE(lvp, FREAD, cred);
(void) VOP_OPEN(uvp, FREAD, cred);
}
un->un_openl = 0;
VOP_UNLOCK(lvp);
}
return (error);
}
/*
* Prepare the creation of a new node in the upper layer.
*
* (dvp) is the directory in which to create the new node.
* it is locked on entry and exit.
* (cnp) is the componentname to be created.
* (cred, path) are the credentials and path used to fill (cnp).
*/
static int
union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred,
const char *path)
{
int error;
struct vnode *vp;
cnp->cn_nameiop = CREATE;
cnp->cn_flags = LOCKPARENT | ISLASTCN;
cnp->cn_cred = cred;
cnp->cn_nameptr = path;
cnp->cn_namelen = strlen(path);
error = VOP_LOOKUP(dvp, &vp, cnp);
if (error == 0) {
KASSERT(vp != NULL);
VOP_ABORTOP(dvp, cnp);
vrele(vp);
error = EEXIST;
} else if (error == EJUSTRETURN) {
error = 0;
}
return error;
}
/*
* Create a shadow directory in the upper layer.
* The new vnode is returned locked.
*
* (um) points to the union mount structure for access to
* the mounting process's credentials.
* (dvp) is the directory in which to create the shadow directory.
* it is unlocked on entry and exit.
* (cnp) is the componentname to be created.
* (vpp) is the returned newly created shadow directory, which
* is returned locked.
*
* N.B. We still attempt to create shadow directories even if the union
* is mounted read-only, which is a little nonintuitive.
*/
int
union_mkshadow(struct union_mount *um, struct vnode *dvp,
struct componentname *cnp, struct vnode **vpp)
{
int error;
struct vattr va;
struct componentname cn;
char *pnbuf;
if (cnp->cn_namelen + 1 > MAXPATHLEN)
return ENAMETOOLONG;
pnbuf = PNBUF_GET();
memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen);
pnbuf[cnp->cn_namelen] = '\0';
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
error = union_do_lookup(dvp, &cn,
(um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf);
if (error) {
VOP_UNLOCK(dvp);
PNBUF_PUT(pnbuf);
return error;
}
/*
* policy: when creating the shadow directory in the
* upper layer, create it owned by the user who did
* the mount, group from parent directory, and mode
* 777 modified by umask (ie mostly identical to the
* mkdir syscall). (jsp, kb)
*/
vattr_null(&va);
va.va_type = VDIR;
va.va_mode = um->um_cmode;
KASSERT(*vpp == NULL);
error = VOP_MKDIR(dvp, vpp, &cn, &va);
VOP_UNLOCK(dvp);
PNBUF_PUT(pnbuf);
return error;
}
/*
* Create a whiteout entry in the upper layer.
*
* (um) points to the union mount structure for access to
* the mounting process's credentials.
* (dvp) is the directory in which to create the whiteout.
* it is locked on entry and exit.
* (cnp) is the componentname to be created.
* (un) holds the path to be created.
*/
int
union_mkwhiteout(struct union_mount *um, struct vnode *dvp,
struct componentname *cnp, struct union_node *un)
{
int error;
struct componentname cn;
error = union_do_lookup(dvp, &cn,
(um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred),
un->un_path);
if (error)
return error;
error = VOP_WHITEOUT(dvp, &cn, CREATE);
return error;
}
/*
* union_vn_create: creates and opens a new shadow file
* on the upper union layer. this function is similar
* in spirit to calling vn_open but it avoids calling namei().
* the problem with calling namei is that a) it locks too many
* things, and b) it doesn't start at the "right" directory,
* whereas union_do_lookup is told where to start.
*/
int
union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l)
{
struct vnode *vp;
kauth_cred_t cred = l->l_cred;
struct vattr vat;
struct vattr *vap = &vat;
int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
int error;
int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask;
struct componentname cn;
*vpp = NULLVP;
vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY);
error = union_do_lookup(un->un_dirvp, &cn, l->l_cred,
un->un_path);
if (error) {
VOP_UNLOCK(un->un_dirvp);
return error;
}
/*
* Good - there was no race to create the file
* so go ahead and create it. The permissions
* on the file will be 0666 modified by the
* current user's umask. Access to the file, while
* it is unioned, will require access to the top *and*
* bottom files. Access when not unioned will simply
* require access to the top-level file.
* TODO: confirm choice of access permissions.
*/
vattr_null(vap);
vap->va_type = VREG;
vap->va_mode = cmode;
vp = NULL;
error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
if (error) {
VOP_UNLOCK(un->un_dirvp);
return error;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VOP_UNLOCK(un->un_dirvp);
error = VOP_OPEN(vp, fmode, cred);
if (error) {
vput(vp);
return error;
}
vp->v_writecount++;
VOP_UNLOCK(vp);
*vpp = vp;
return 0;
}
int
union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l)
{
if (fmode & FWRITE)
--vp->v_writecount;
return (VOP_CLOSE(vp, fmode, cred));
}
void
union_removed_upper(struct union_node *un)
{
struct vnode *vp = UNIONTOV(un);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#if 1
/*
* We do not set the uppervp to NULLVP here, because lowervp
* may also be NULLVP, so this routine would end up creating
* a bogus union node with no upper or lower VP (that causes
* pain in many places that assume at least one VP exists).
* Since we've removed this node from the cache hash chains,
* it won't be found again. When all current holders
* release it, union_inactive() will vgone() it.
*/
union_diruncache(un);
#else
union_newupper(un, NULLVP);
#endif
VOP_UNLOCK(vp);
mutex_enter(&uhash_lock);
if (un->un_cflags & UN_CACHED) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_exit(&uhash_lock);
}
#if 0
struct vnode *
union_lowervp(struct vnode *vp)
{
struct union_node *un = VTOUNION(vp);
if ((un->un_lowervp != NULLVP) &&
(vp->v_type == un->un_lowervp->v_type)) {
if (vget(un->un_lowervp, 0, true /* wait */) == 0)
return (un->un_lowervp);
}
return (NULLVP);
}
#endif
/*
* determine whether a whiteout is needed
* during a remove/rmdir operation.
*/
int
union_dowhiteout(struct union_node *un, kauth_cred_t cred)
{
struct vattr va;
if (un->un_lowervp != NULLVP)
return (1);
if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 &&
(va.va_flags & OPAQUE))
return (1);
return (0);
}
static void
union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp)
{
struct union_node *un;
if (vp->v_op != union_vnodeop_p) {
if (vppp) {
vref(vp);
*(*vppp)++ = vp;
if (--(*cntp) == 0)
panic("union: dircache table too small");
} else {
(*cntp)++;
}
return;
}
un = VTOUNION(vp);
if (un->un_uppervp != NULLVP)
union_dircache_r(un->un_uppervp, vppp, cntp);
if (un->un_lowervp != NULLVP)
union_dircache_r(un->un_lowervp, vppp, cntp);
}
struct vnode *
union_dircache(struct vnode *vp, struct lwp *l)
{
int cnt;
struct vnode *nvp = NULLVP;
struct vnode **vpp;
struct vnode **dircache;
int error;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
dircache = VTOUNION(vp)->un_dircache;
nvp = NULLVP;
if (dircache == 0) {
cnt = 0;
union_dircache_r(vp, 0, &cnt);
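/* reserve an extra slot for the terminating NULLVP */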
cnt++;
dircache = (struct vnode **)
malloc(cnt * sizeof(struct vnode *),
M_TEMP, M_WAITOK);
vpp = dircache;
union_dircache_r(vp, &vpp, &cnt);
VTOUNION(vp)->un_dircache = dircache;
*vpp = NULLVP;
vpp = dircache + 1;
} else {
vpp = dircache;
do {
if (*vpp++ == VTOUNION(vp)->un_lowervp)
break;
} while (*vpp != NULLVP);
}
if (*vpp == NULLVP)
goto out;
vref(*vpp);
error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0,
NULLVP, *vpp, 0);
if (!error) {
vn_lock(nvp, LK_EXCLUSIVE | LK_RETRY);
VTOUNION(vp)->un_dircache = 0;
VTOUNION(nvp)->un_hooknode = true;
VTOUNION(nvp)->un_dircache = dircache;
}
out:
VOP_UNLOCK(vp);
return (nvp);
}
void
union_diruncache(struct union_node *un)
{
struct vnode **vpp;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
if (un->un_dircache != 0) {
for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
vrele(*vpp);
free(un->un_dircache, M_TEMP);
un->un_dircache = 0;
}
}
/*
* Check whether the node can be removed with rmdir (i.e. check that it is empty).
*/
int
union_check_rmdir(struct union_node *un, kauth_cred_t cred)
{
int dirlen, eofflag, error;
char *dirbuf;
struct vattr va;
struct vnode *tvp;
struct dirent *dp, *edp;
struct componentname cn;
struct iovec aiov;
struct uio auio;
KASSERT(un->un_uppervp != NULL);
/* Check upper for being opaque. */
KASSERT(VOP_ISLOCKED(un->un_uppervp));
error = VOP_GETATTR(un->un_uppervp, &va, cred);
if (error || (va.va_flags & OPAQUE))
return error;
if (un->un_lowervp == NULL)
return 0;
/* Check lower for being empty. */
vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(un->un_lowervp, &va, cred);
if (error) {
VOP_UNLOCK(un->un_lowervp);
return error;
}
dirlen = va.va_blocksize;
dirbuf = kmem_alloc(dirlen, KM_SLEEP);
/* error = 0; */
eofflag = 0;
auio.uio_offset = 0;
do {
aiov.iov_len = dirlen;
aiov.iov_base = dirbuf;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = aiov.iov_len;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag,
NULL, NULL);
if (error)
break;
edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid];
for (dp = (struct dirent *)dirbuf;
error == 0 && dp < edp;
dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
if (dp->d_reclen == 0) {
error = ENOTEMPTY;
break;
}
if (dp->d_type == DT_WHT ||
(dp->d_namlen == 1 && dp->d_name[0] == '.') ||
(dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2)))
continue;
/* Check for presence in the upper layer. */
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN | RDONLY;
cn.cn_cred = cred;
cn.cn_nameptr = dp->d_name;
cn.cn_namelen = dp->d_namlen;
error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn);
if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) {
error = 0;
continue;
}
if (error == 0)
vrele(tvp);
error = ENOTEMPTY;
}
} while (error == 0 && !eofflag);
kmem_free(dirbuf, dirlen);
VOP_UNLOCK(un->un_lowervp);
return error;
}
/*
* This hook is called from vn_readdir() to switch to the lower
* directory after the upper directory has been read.
*/
int
union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l)
{
struct vnode *vp = *vpp, *lvp;
struct vattr va;
int error;
if (vp->v_op != union_vnodeop_p)
return (0);
/*
* If the directory is opaque,
* then don't show lower entries
*/
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, fp->f_cred);
VOP_UNLOCK(vp);
if (error || (va.va_flags & OPAQUE))
return error;
if ((lvp = union_dircache(vp, l)) == NULLVP)
return (0);
error = VOP_OPEN(lvp, FREAD, fp->f_cred);
if (error) {
vput(lvp);
return (error);
}
VOP_UNLOCK(lvp);
fp->f_vnode = lvp;
fp->f_offset = 0;
error = vn_close(vp, FREAD, fp->f_cred);
if (error)
return (error);
*vpp = lvp;
return (0);
}
/* $NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2020, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94
*/
/*
* Name caching:
*
* Names found by directory scans are retained in a cache for future
* reference. It is managed LRU, so frequently used names will hang
* around. The cache is indexed by hash value obtained from the name.
*
* The name cache is the brainchild of Robert Elz and was introduced in
* 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk
* McKusick, May 21 1984.
*
* Data structures:
*
* Most Unix namecaches very sensibly use a global hash table to index
* names. The global hash table works well, but can cause concurrency
* headaches for the kernel hacker. In the NetBSD 10.0 implementation
* we are not sensible, and use a per-directory data structure to index
* names, but the cache otherwise functions the same.
*
* The index is a red-black tree. It should not be difficult to
* experiment with other types of index, however note that a tree
* can trivially be made to support lockless lookup.
*
* Each cached name is stored in a struct namecache, along with a
* pointer to the associated vnode (nc_vp). Names longer than a
* maximum length of NCHNAMLEN are allocated with kmem_alloc(); they
* occur infrequently, and names shorter than this are stored directly
* in struct namecache. If it is a "negative" entry (i.e. for a name
* that is known NOT to exist), the vnode pointer will be NULL.
*
* In practice this implementation is not any slower than the hash
* table that preceded it and in some cases it significantly
* outperforms the hash table. Some reasons why this might be:
*
* - natural partitioning provided by the file system structure, which
* the prior implementation discarded (global hash table).
* - worst case tree traversal of O(log n), the hash table could have
* many collisions.
* - minimized cache misses & total L2/L3 CPU cache footprint; struct
* namecache and vnode_impl_t are laid out to keep cache footprint
* minimal in the lookup path; no hash table buckets to cache.
* - minimized number of conditionals & string comparisons.
*
* For a directory with 3 cached names for 3 distinct vnodes, the
* various vnodes and namecache structs would be connected like this
* (the root is at the bottom of the diagram):
*
* ...
* ^
* |- vi_nc_tree
* |
* +----o----+ +---------+ +---------+
* | VDIR | | VCHR | | VREG |
* | vnode o-----+ | vnode o-----+ | vnode o------+
* +---------+ | +---------+ | +---------+ |
* ^ | ^ | ^ |
* |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp |
* | | | | | |
* +----o----+ | +----o----+ | +----o----+ |
* +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+
* | +---------+ | +---------+ | +---------+
* | ^ | ^ | ^
* | | | | | |
* | | +----------------------+ | |
* |-nc_dvp | +-------------------------------------------------+
* | |/- vi_nc_tree | |
* | | |- nc_dvp |- nc_dvp
* | +----o----+ | |
* +-->| VDIR |<----------+ |
* | vnode |<------------------------------------+
* +---------+
*
* START HERE
*
* Replacement:
*
* As the cache becomes full, old and unused entries are purged as new
* entries are added. The synchronization overhead in maintaining a
* strict ordering would be prohibitive, so the VM system's "clock" or
* "second chance" page replacement algorithm is aped here. New
* entries go to the tail of the active list. After they age out and
* reach the head of the list, they are moved to the tail of the
* inactive list. Any use of the deactivated cache entry reactivates
* it, saving it from impending doom; if not reactivated, the entry
* eventually reaches the head of the inactive list and is purged.
*
* Concurrency:
*
* From a performance perspective, cache_lookup(nameiop == LOOKUP) is
* what really matters; insertion of new entries with cache_enter() is
* comparatively infrequent, and overshadowed by the cost of expensive
* file system metadata operations (which may involve disk I/O). We
* therefore want to make everything as simple as possible in the lookup path.
*
* struct namecache is mostly stable except for list and tree related
* entries, changes to which don't affect the cached name or vnode.
* For changes to name+vnode, entries are purged in preference to
* modifying them.
*
* Read access to namecache entries is made via tree, list, or LRU
* list. A lock corresponding to the direction of access should be
* held. See definition of "struct namecache" in src/sys/namei.src,
* and the definition of "struct vnode" for the particulars.
*
* Per-CPU statistics and LRU list totals are read unlocked, since an
* approximate value is OK. We maintain 32-bit sized per-CPU counters
* and 64-bit global counters since 32-bit sized counters can be
* observed locklessly while the global counters are protected by a
* mutex.
*
* The lock order is:
*
* 1) vi->vi_nc_lock (tree or parent -> child direction,
* used during forward lookup)
*
* 2) vi->vi_nc_listlock (list or child -> parent direction,
* used during reverse lookup)
*
* 3) cache_lru_lock (LRU list direction, used during reclaim)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $");
#define __NAMECACHE_PRIVATE
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_dtrace.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/errno.h>
#include <sys/evcnt.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode_impl.h>
#include <miscfs/genfs/genfs.h>
/*
* Assert that data structure layout hasn't changed unintentionally.
*/
#ifdef _LP64
CTASSERT(sizeof(struct namecache) == 128);
#else
CTASSERT(sizeof(struct namecache) == 64);
#endif
CTASSERT(NC_NLEN_MASK >= MAXPATHLEN);
static void cache_activate(struct namecache *);
static void cache_update_stats(void *);
static int cache_compare_nodes(void *, const void *, const void *);
static void cache_deactivate(void);
static void cache_reclaim(void);
static int cache_stat_sysctl(SYSCTLFN_ARGS);
/*
* Global pool cache.
*/
static pool_cache_t cache_pool __read_mostly;
/*
* LRU replacement.
*/
enum cache_lru_id {
LRU_ACTIVE,
LRU_INACTIVE,
LRU_COUNT
};
static struct {
TAILQ_HEAD(, namecache) list[LRU_COUNT];
u_int count[LRU_COUNT];
} cache_lru __cacheline_aligned;
static kmutex_t cache_lru_lock __cacheline_aligned;
/*
* Cache effectiveness statistics. nchstats holds system-wide total.
*/
struct nchstats nchstats;
struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
struct nchcpu {
struct nchstats_percpu cur;
struct nchstats_percpu last;
};
static callout_t cache_stat_callout;
static kmutex_t cache_stat_lock __cacheline_aligned;
#define COUNT(f) do { \
lwp_t *l = curlwp; \
KPREEMPT_DISABLE(l); \
struct nchcpu *nchcpu = curcpu()->ci_data.cpu_nch; \
nchcpu->cur.f++; \
KPREEMPT_ENABLE(l); \
} while (/* CONSTCOND */ 0);
#define UPDATE(nchcpu, f) do { \
uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \
nchstats.f += (uint32_t)(cur - nchcpu->last.f); \
nchcpu->last.f = cur; \
} while (/* CONSTCOND */ 0)
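/*
 * Illustrative sketch (not part of the original source): UPDATE() relies on
 * unsigned 32-bit subtraction being modular, so the delta it folds into the
 * 64-bit global stays correct even if the per-CPU counter has wrapped since
 * the last update.  A hypothetical standalone version of that step:
 */
#if 0
static uint64_t example_total;		/* plays the role of nchstats.f */

static void
example_fold(uint32_t cur, uint32_t *lastp)
{
	/* Correct even if 'cur' wrapped past UINT32_MAX since *lastp. */
	example_total += (uint32_t)(cur - *lastp);
	*lastp = cur;
}
#endif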
/*
* Tunables. cache_maxlen replaces the historical doingcache:
* set it zero to disable caching for debugging purposes.
*/
int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */
int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */
int cache_maxlen __read_mostly = NC_NLEN_MASK; /* max name length to cache */
int cache_stat_interval __read_mostly = 300; /* in seconds */
/*
* sysctl stuff.
*/
static struct sysctllog *cache_sysctllog;
/*
* This is a dummy name that cannot normally occur anywhere in the cache
* or the file system. It's used when caching the root vnode of mounted file
* systems. The name is attached to the directory that the file system is
* mounted on.
*/
static const char cache_mp_name[] = "";
static const int cache_mp_nlen = sizeof(cache_mp_name) - 1;
/*
* Red-black tree stuff.
*/
static const rb_tree_ops_t cache_rbtree_ops = {
.rbto_compare_nodes = cache_compare_nodes,
.rbto_compare_key = cache_compare_nodes,
.rbto_node_offset = offsetof(struct namecache, nc_tree),
.rbto_context = NULL
};
/*
* dtrace probes.
*/
SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
"int");
SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
"char *", "size_t");
/*
* rbtree: compare two nodes.
*/
static int
cache_compare_nodes(void *context, const void *n1, const void *n2)
{
const struct namecache *nc1 = n1;
const struct namecache *nc2 = n2;
if (nc1->nc_key < nc2->nc_key) {
return -1;
}
if (nc1->nc_key > nc2->nc_key) {
return 1;
}
KASSERT(NC_NLEN(nc1) == NC_NLEN(nc2));
return memcmp(nc1->nc_name, nc2->nc_name, NC_NLEN(nc1));
}
/*
* Compute a key value for the given name. The name length is encoded in
* the key value to try and improve uniqueness, and so that length doesn't
* need to be compared separately for string comparisons.
*/
static uintptr_t
cache_key(const char *name, size_t nlen)
{
uintptr_t key;
KASSERT((nlen & ~NC_NLEN_MASK) == 0);
key = hash32_buf(name, nlen, HASH32_STR_INIT);
return (key << NC_NLEN_BITS) | (uintptr_t)nlen;
}
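/*
 * Illustrative sketch (not part of the original source): because the name
 * length occupies the low NC_NLEN_BITS of the key, names of different
 * lengths can never produce equal keys, and NC_NLEN() can recover the
 * length without a separate comparison.  A hypothetical check:
 */
#if 0
static void
example_key_layout(void)
{
	uintptr_t k1 = cache_key("a", 1);
	uintptr_t k2 = cache_key("ab", 2);

	KASSERT((k1 & NC_NLEN_MASK) == 1);
	KASSERT((k2 & NC_NLEN_MASK) == 2);
	KASSERT(k1 != k2);	/* lengths differ, so the keys must differ */
}
#endif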
/*
* Remove an entry from the cache. vi_nc_lock must be held, and if dir2node
* is true, then we're locking in the conventional direction and the list
* lock will be acquired when removing the entry from the vnode list.
*/
static void
cache_remove(struct namecache *ncp, const bool dir2node)
{
struct vnode *vp, *dvp = ncp->nc_dvp;
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
size_t namelen = NC_NLEN(ncp);
KASSERT(rw_write_held(&dvi->vi_nc_lock));
KASSERT(cache_key(ncp->nc_name, namelen) == ncp->nc_key);
KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp);
SDT_PROBE(vfs, namecache, invalidate, done, ncp, 0, 0, 0, 0);
/*
* Remove from the vnode's list. This excludes cache_revlookup(),
* and then it's safe to remove from the LRU lists.
*/
if ((vp = ncp->nc_vp) != NULL) {
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
if (__predict_true(dir2node)) {
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
rw_exit(&vi->vi_nc_listlock);
} else {
TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
}
}
/* Remove from the directory's rbtree. */
rb_tree_remove_node(&dvi->vi_nc_tree, ncp);
/* Remove from the LRU lists. */
mutex_enter(&cache_lru_lock);
TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
cache_lru.count[ncp->nc_lrulist]--;
mutex_exit(&cache_lru_lock);
/* Finally, free it. */
if (namelen > NCHNAMLEN) {
size_t sz = offsetof(struct namecache, nc_name[namelen]);
kmem_free(ncp, sz);
} else {
pool_cache_put(cache_pool, ncp);
}
}
/*
* Find a single cache entry and return it. vi_nc_lock must be held.
*/
static struct namecache * __noinline
cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
uintptr_t key)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct rb_node *node = dvi->vi_nc_tree.rbt_root;
struct namecache *ncp;
enum cache_lru_id lrulist;
int diff;
KASSERT(namelen <= MAXPATHLEN);
KASSERT(rw_lock_held(&dvi->vi_nc_lock));
/*
* Search the RB tree for the key. This is an inlined lookup
* tailored for exactly what's needed here that turns out to be
* quite a bit faster than using rb_tree_find_node().
*
* For a matching key memcmp() needs to be called once to confirm
* that the correct name has been found. Very rarely there will be
* a key value collision and the search will continue.
*/
for (;;) {
if (__predict_false(RB_SENTINEL_P(node))) {
return NULL;
}
ncp = (struct namecache *)node;
KASSERT((void *)&ncp->nc_tree == (void *)ncp);
KASSERT(ncp->nc_dvp == dvp);
if (ncp->nc_key == key) {
KASSERT(NC_NLEN(ncp) == namelen);
diff = memcmp(ncp->nc_name, name, namelen);
if (__predict_true(diff == 0)) {
break;
}
node = node->rb_nodes[diff < 0];
} else {
node = node->rb_nodes[ncp->nc_key < key];
}
}
/*
* If the entry is on the wrong LRU list, requeue it. This is an
* unlocked check, but it will rarely be wrong and even then there
* will be no harm caused.
*/
lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
if (__predict_false(lrulist != LRU_ACTIVE)) {
cache_activate(ncp);
}
return ncp;
}
/*
* Look for the name in the cache. We don't do this
* if the segment name is long, simply so the cache can avoid
* holding long names (which would either waste space, or
* add greatly to the complexity).
*
* Lookup is called with DVP pointing to the directory to search,
* and CNP providing the name of the entry being sought: cn_nameptr
* is the name, cn_namelen is its length, and cn_flags is the flags
* word from the namei operation.
*
* DVP must be locked.
*
* There are three possible non-error return states:
* 1. Nothing was found in the cache. Nothing is known about
* the requested name.
* 2. A negative entry was found in the cache, meaning that the
* requested name definitely does not exist.
* 3. A positive entry was found in the cache, meaning that the
* requested name does exist and that we are providing the
* vnode.
* In these cases the results are:
* 1. 0 returned; VN is set to NULL.
* 2. 1 returned; VN is set to NULL.
* 3. 1 returned; VN is set to the vnode found.
*
* The additional result argument ISWHT is set to zero, unless a
* negative entry is found that was entered as a whiteout, in which
* case ISWHT is set to one.
*
* The ISWHT_RET argument pointer may be null. In this case an
* assertion is made that the whiteout flag is not set. File systems
* that do not support whiteouts can/should do this.
*
* Filesystems that do support whiteouts should add ISWHITEOUT to
* cnp->cn_flags if ISWHT comes back nonzero.
*
* When a vnode is returned, it is locked, as per the vnode lookup
* locking protocol.
*
* There is no way for this function to fail, in the sense of
* generating an error that requires aborting the namei operation.
*
* (Prior to October 2012, this function returned an integer status,
* and a vnode, and mucked with the flags word in CNP for whiteouts.
* The integer status was -1 for "nothing found", ENOENT for "a
* negative entry found", 0 for "a positive entry found", and possibly
* other errors, and the value of VN might or might not have been set
* depending on what error occurred.)
*/
bool
cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
uint32_t nameiop, uint32_t cnflags,
int *iswht_ret, struct vnode **vn_ret)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
struct vnode *vp;
uintptr_t key;
int error;
bool hit;
krw_t op;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* Establish default result values */
if (iswht_ret != NULL) {
*iswht_ret = 0;
}
*vn_ret = NULL;
if (__predict_false(namelen > cache_maxlen)) {
SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
name, namelen, 0, 0);
COUNT(ncs_long);
return false;
}
/* Compute the key up front - don't need the lock. */
key = cache_key(name, namelen);
/* Could the entry be purged below? */
if ((cnflags & ISLASTCN) != 0 && ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) {
op = RW_WRITER;
} else {
op = RW_READER;
}
/* Now look for the name. */
rw_enter(&dvi->vi_nc_lock, op);
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (__predict_false(ncp == NULL)) {
rw_exit(&dvi->vi_nc_lock);
COUNT(ncs_miss);
SDT_PROBE(vfs, namecache, lookup, miss, dvp,
name, namelen, 0, 0);
return false;
}
if (__predict_false((cnflags & MAKEENTRY) == 0)) {
/*
* Last component and we are renaming or deleting,
* the cache entry is invalid, or otherwise don't
* want cache entry to exist.
*/
KASSERT((cnflags & ISLASTCN) != 0);
cache_remove(ncp, true);
rw_exit(&dvi->vi_nc_lock);
COUNT(ncs_badhits);
return false;
}
if ((vp = ncp->nc_vp) == NULL) {
if (iswht_ret != NULL) {
/*
* Restore the ISWHITEOUT flag saved earlier.
*/
*iswht_ret = ncp->nc_whiteout;
} else {
KASSERT(!ncp->nc_whiteout);
}
if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) {
/*
* Last component and we are preparing to create
* the named object, so flush the negative cache
* entry.
*/
COUNT(ncs_badhits);
cache_remove(ncp, true);
hit = false;
} else {
COUNT(ncs_neghits);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name,
namelen, 0, 0);
/* found neg entry; vn is already null from above */
hit = true;
}
rw_exit(&dvi->vi_nc_lock);
return hit;
}
error = vcache_tryvget(vp);
rw_exit(&dvi->vi_nc_lock);
if (error) {
KASSERT(error == EBUSY);
/*
* This vnode is being cleaned out.
* XXX badhits?
*/
COUNT(ncs_falsehits);
return false;
}
COUNT(ncs_goodhits);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
/* found it */
*vn_ret = vp;
return true;
}
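/*
 * Illustrative sketch (not part of the original source): how a file
 * system's lookup routine might consume the three result states described
 * above.  The function and its fallback are hypothetical; only the
 * cache_lookup() contract is taken from the comment.
 */
#if 0
static int
example_fs_lookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	int iswht = 0;

	*vpp = NULL;
	if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
	    cnp->cn_nameiop, cnp->cn_flags, &iswht, vpp)) {
		if (iswht)
			cnp->cn_flags |= ISWHITEOUT;
		/* Negative entry -> ENOENT, positive entry -> success. */
		return *vpp == NULL ? ENOENT : 0;
	}
	/* Nothing known: fall back to scanning the directory itself. */
	return EOPNOTSUPP;	/* placeholder for the real scan */
}
#endif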
/*
* Version of the above without the nameiop argument, for NFS.
*/
bool
cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
uint32_t cnflags,
int *iswht_ret, struct vnode **vn_ret)
{
return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY,
iswht_ret, vn_ret);
}
/*
* Used by namei() to walk down a path, component by component by looking up
* names in the cache. The node locks are chained along the way: a parent's
* lock is not dropped until the child's is acquired.
*/
bool
cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
struct vnode **vn_ret, krwlock_t **plock,
kauth_cred_t cred)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
krwlock_t *oldlock, *newlock;
struct vnode *vp;
uintptr_t key;
int error;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* If disabled, or file system doesn't support this, bail out. */
if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) {
return false;
}
if (__predict_false(namelen > cache_maxlen)) {
COUNT(ncs_long);
return false;
}
/* Compute the key up front - don't need the lock. */
key = cache_key(name, namelen);
/*
* Acquire the directory lock. Once we have that, we can drop the
* previous one (if any).
*
* The two lock holds mean that the directory can't go away while
* here: the directory must be purged with cache_purge() before
* being freed, and both parent & child's vi_nc_lock must be taken
* before that point is passed.
*
* However if there's no previous lock, like at the root of the
* chain, then "dvp" must be referenced to prevent dvp going away
* before we get its lock.
*
* Note that the two locks can be the same if looking up a dot, for
* example: /usr/bin/. If looking up the parent (..) we can't wait
* on the lock as child -> parent is the wrong direction.
*/
if (*plock != &dvi->vi_nc_lock) {
oldlock = *plock;
newlock = &dvi->vi_nc_lock;
if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
return false;
}
} else {
oldlock = NULL;
newlock = NULL;
if (*plock == NULL) {
KASSERT(vrefcnt(dvp) > 0);
}
}
/*
* First up check if the user is allowed to look up files in this
* directory.
*/
if (cred != FSCRED) {
if (dvi->vi_nc_mode == VNOVAL) {
if (newlock != NULL) {
rw_exit(newlock);
}
return false;
}
KASSERT(dvi->vi_nc_uid != VNOVAL);
KASSERT(dvi->vi_nc_gid != VNOVAL);
error = kauth_authorize_vnode(cred,
KAUTH_ACCESS_ACTION(VEXEC,
dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL,
genfs_can_access(dvp, cred, dvi->vi_nc_uid, dvi->vi_nc_gid,
dvi->vi_nc_mode & ALLPERMS, NULL, VEXEC));
if (error != 0) {
if (newlock != NULL) {
rw_exit(newlock);
}
COUNT(ncs_denied);
return false;
}
}
/*
* Now look for a matching cache entry.
*/
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (__predict_false(ncp == NULL)) {
if (newlock != NULL) {
rw_exit(newlock);
}
COUNT(ncs_miss);
SDT_PROBE(vfs, namecache, lookup, miss, dvp,
name, namelen, 0, 0);
return false;
}
if ((vp = ncp->nc_vp) == NULL) {
/* found negative entry; vn is already null from above */
KASSERT(namelen != cache_mp_nlen);
KASSERT(name != cache_mp_name);
COUNT(ncs_neghits);
} else {
COUNT(ncs_goodhits); /* XXX can be "badhits" */
}
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
/*
* Return with the directory lock still held. It will either be
* returned to us with another call to cache_lookup_linked() when
* looking up the next component, or the caller will release it
* manually when finished.
*/
if (oldlock) {
rw_exit(oldlock);
}
if (newlock) {
*plock = newlock;
}
*vn_ret = vp;
return true;
}
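/*
 * Illustrative sketch (not part of the original source): the lock-chaining
 * pattern described above, seen from the caller's side.  The walk below is
 * hypothetical; it assumes the caller already holds a reference on dvp and
 * releases whichever vi_nc_lock is left when the walk stops.
 */
#if 0
static void
example_walk(struct vnode *dvp, kauth_cred_t cred)
{
	krwlock_t *plock = NULL;
	struct vnode *vp;

	if (cache_lookup_linked(dvp, "usr", 3, &vp, &plock, cred) &&
	    vp != NULL &&
	    cache_lookup_linked(vp, "bin", 3, &vp, &plock, cred) &&
	    vp != NULL) {
		/* vp names "usr/bin" under dvp, valid while plock is held. */
	}
	if (plock != NULL)
		rw_exit(plock);
}
#endif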
/*
* Scan cache looking for name of directory entry pointing at vp.
* Will not search for "." or "..".
*
* If the lookup succeeds the vnode is referenced and stored in dvpp.
*
* If bufp is non-NULL, also place the name in the buffer which starts
* at bufp, immediately before *bpp, and move bpp backwards to point
* at the start of it. (Yes, this is a little baroque, but it's done
* this way to cater to the whims of getcwd).
*
* Returns 0 on success, -1 on cache miss, positive errno on failure.
*/
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
bool checkaccess, accmode_t accmode)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
struct namecache *ncp;
enum cache_lru_id lrulist;
struct vnode *dvp;
int error, nlen;
char *bp;
KASSERT(vp != NULL);
if (cache_maxlen == 0)
goto out;
rw_enter(&vi->vi_nc_listlock, RW_READER);
if (checkaccess) {
/*
* Check if the user is allowed to see. NOTE: this is
* checking for access on the "wrong" directory. getcwd()
* wants to see that there is access on every component
* along the way, not that there is access to any individual
* component. Don't use this to check you can look in vp.
*
* I don't like it, I didn't come up with it, don't blame me!
*/
if (vi->vi_nc_mode == VNOVAL) {
rw_exit(&vi->vi_nc_listlock);
return -1;
}
KASSERT(vi->vi_nc_uid != VNOVAL);
KASSERT(vi->vi_nc_gid != VNOVAL);
error = kauth_authorize_vnode(kauth_cred_get(),
KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode &
ALLPERMS), vp, NULL, genfs_can_access(vp, curlwp->l_cred,
vi->vi_nc_uid, vi->vi_nc_gid, vi->vi_nc_mode & ALLPERMS,
NULL, accmode));
if (error != 0) {
rw_exit(&vi->vi_nc_listlock);
COUNT(ncs_denied);
return EACCES;
}
}
TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) {
KASSERT(ncp->nc_vp == vp);
KASSERT(ncp->nc_dvp != NULL);
nlen = NC_NLEN(ncp);
/*
* Ignore mountpoint entries.
*/
if (nlen == cache_mp_nlen) {
continue;
}
/*
* The queue is partially sorted. Once we hit dots, nothing
* else remains but dots and dotdots, so bail out.
*/
if (ncp->nc_name[0] == '.') {
if (nlen == 1 ||
(nlen == 2 && ncp->nc_name[1] == '.')) {
break;
}
}
/*
* Record a hit on the entry. This is an unlocked read but
* even if wrong it doesn't matter too much.
*/
lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
if (lrulist != LRU_ACTIVE) {
cache_activate(ncp);
}
if (bufp) {
bp = *bpp;
bp -= nlen;
if (bp <= bufp) {
*dvpp = NULL;
rw_exit(&vi->vi_nc_listlock);
SDT_PROBE(vfs, namecache, revlookup,
fail, vp, ERANGE, 0, 0, 0);
return (ERANGE);
}
memcpy(bp, ncp->nc_name, nlen);
*bpp = bp;
}
dvp = ncp->nc_dvp;
error = vcache_tryvget(dvp);
rw_exit(&vi->vi_nc_listlock);
if (error) {
KASSERT(error == EBUSY);
if (bufp)
(*bpp) += nlen;
*dvpp = NULL;
SDT_PROBE(vfs, namecache, revlookup, fail, vp,
error, 0, 0, 0);
return -1;
}
*dvpp = dvp;
SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
0, 0, 0);
COUNT(ncs_revhits);
return (0);
}
rw_exit(&vi->vi_nc_listlock);
COUNT(ncs_revmiss);
out:
*dvpp = NULL;
return (-1);
}
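/*
 * Illustrative sketch (not part of the original source): the backwards
 * buffer convention described above, as a getcwd-style caller might use it.
 * The wrapper and its separator handling are hypothetical.
 */
#if 0
static int
example_prepend_name(struct vnode *vp, char *buf, char **bpp,
    struct vnode **dvpp)
{
	int error;

	/* On entry *bpp points just past where the name should end. */
	error = cache_revlookup(vp, dvpp, bpp, buf, false, 0);
	if (error == 0 && *bpp > buf)
		*--(*bpp) = '/';	/* prepend a separator */
	return error;
}
#endif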
/*
* Add an entry to the cache.
*/
void
cache_enter(struct vnode *dvp, struct vnode *vp,
const char *name, size_t namelen, uint32_t cnflags)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp, *oncp;
int total;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* First, check whether we can/should add a cache entry. */
if ((cnflags & MAKEENTRY) == 0 ||
__predict_false(namelen > cache_maxlen)) {
SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
0, 0);
return;
}
SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);
/*
* Reclaim some entries if over budget. This is an unlocked check,
* but that's fine; we just need to catch up with things eventually,
* and going over budget temporarily does no harm.
*/
total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]);
total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]);
if (__predict_false(total > desiredvnodes)) {
cache_reclaim();
}
/* Now allocate a fresh entry. */
if (__predict_true(namelen <= NCHNAMLEN)) {
ncp = pool_cache_get(cache_pool, PR_WAITOK);
} else {
size_t sz = offsetof(struct namecache, nc_name[namelen]);
ncp = kmem_alloc(sz, KM_SLEEP);
}
/*
* Fill in cache info. For negative hits, save the ISWHITEOUT flag
* so we can restore it later when the cache entry is used again.
*/
ncp->nc_vp = vp;
ncp->nc_dvp = dvp;
ncp->nc_key = cache_key(name, namelen);
ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
memcpy(ncp->nc_name, name, namelen);
/*
* Insert to the directory. Concurrent lookups may race for a cache
* entry. If there's an entry there already, purge it.
*/
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
if (oncp != ncp) {
KASSERT(oncp->nc_key == ncp->nc_key);
KASSERT(NC_NLEN(oncp) == NC_NLEN(ncp));
KASSERT(memcmp(oncp->nc_name, name, namelen) == 0);
cache_remove(oncp, true);
oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
KASSERT(oncp == ncp);
}
/*
* With the directory lock still held, insert to the tail of the
* ACTIVE LRU list (new) and take the opportunity to incrementally
* balance the lists.
*/
mutex_enter(&cache_lru_lock);
ncp->nc_lrulist = LRU_ACTIVE;
cache_lru.count[LRU_ACTIVE]++;
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
cache_deactivate();
mutex_exit(&cache_lru_lock);
/*
* Finally, insert to the vnode and unlock. With everything set up
* it's safe to let cache_revlookup() see the entry. Partially sort
* the per-vnode list: dots go to back so cache_revlookup() doesn't
* have to consider them.
*/
if (vp != NULL) {
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
if ((namelen == 1 && name[0] == '.') ||
(namelen == 2 && name[0] == '.' && name[1] == '.')) {
TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list);
} else {
TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list);
}
rw_exit(&vi->vi_nc_listlock);
}
rw_exit(&dvi->vi_nc_lock);
}
/*
* Set identity info in cache for a vnode. We only care about directories
* so ignore other updates. The cached info may be marked invalid if the
* inode has an ACL.
*/
void
cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid, bool valid)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);

if (vp->v_type == VDIR) {
/* Grab both locks, for forward & reverse lookup. */
rw_enter(&vi->vi_nc_lock, RW_WRITER);
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
if (valid) {
vi->vi_nc_mode = mode;
vi->vi_nc_uid = uid;
vi->vi_nc_gid = gid;
} else {
vi->vi_nc_mode = VNOVAL;
vi->vi_nc_uid = VNOVAL;
vi->vi_nc_gid = VNOVAL;
}
rw_exit(&vi->vi_nc_listlock);
rw_exit(&vi->vi_nc_lock);
}
}
/*
* Return true if we have identity for the given vnode, and use this as an
* opportunity to confirm that everything squares up.
*
* Because of shared code, some file systems could provide partial
* information, missing some updates, so check the mount flag too.
*/
bool
cache_have_id(struct vnode *vp)
{
if (vp->v_type == VDIR &&
(vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0 &&
atomic_load_relaxed(&VNODE_TO_VIMPL(vp)->vi_nc_mode) != VNOVAL) {
return true;
} else {
return false;
}
}
/*
* Enter a mount point. cvp is the covered vnode, and rvp is the root of
* the mounted file system.
*/
void
cache_enter_mount(struct vnode *cvp, struct vnode *rvp)
{
KASSERT(vrefcnt(cvp) > 0);
KASSERT(vrefcnt(rvp) > 0);
KASSERT(cvp->v_type == VDIR);
KASSERT((rvp->v_vflag & VV_ROOT) != 0);
if (rvp->v_type == VDIR) {
cache_enter(cvp, rvp, cache_mp_name, cache_mp_nlen, MAKEENTRY);
}
}
/*
* Look up a cached mount point. Used in the strongly locked path.
*/
bool
cache_lookup_mount(struct vnode *dvp, struct vnode **vn_ret)
{
bool ret;
ret = cache_lookup(dvp, cache_mp_name, cache_mp_nlen, LOOKUP,
MAKEENTRY, NULL, vn_ret);
KASSERT((*vn_ret != NULL) == ret);
return ret;
}
/*
* Try to cross a mount point. For use with cache_lookup_linked().
*/
bool
cache_cross_mount(struct vnode **dvp, krwlock_t **plock)
{
return cache_lookup_linked(*dvp, cache_mp_name, cache_mp_nlen,
dvp, plock, FSCRED);
}
/*
* Name cache initialization, from vfs_init() when the system is booting.
*/
void
nchinit(void)
{
cache_pool = pool_cache_init(sizeof(struct namecache),
coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL,
NULL, NULL);
KASSERT(cache_pool != NULL);
mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]);
TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]);
mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE);
callout_init(&cache_stat_callout, CALLOUT_MPSAFE);
callout_setfunc(&cache_stat_callout, cache_update_stats, NULL);
callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
KASSERT(cache_sysctllog == NULL);
sysctl_createv(&cache_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "namecache_stats",
SYSCTL_DESCR("namecache statistics"),
cache_stat_sysctl, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
}
/*
* Called once for each CPU in the system as attached.
*/
void
cache_cpu_init(struct cpu_info *ci)
{
size_t sz;
sz = roundup2(sizeof(struct nchcpu), coherency_unit);
ci->ci_data.cpu_nch = kmem_zalloc(sz, KM_SLEEP);
KASSERT(((uintptr_t)ci->ci_data.cpu_nch & (coherency_unit - 1)) == 0);
}
/*
* A vnode is being allocated: set up cache structures.
*/
void
cache_vnode_init(struct vnode *vp)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
rw_init(&vi->vi_nc_lock);
rw_init(&vi->vi_nc_listlock);
rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops);
TAILQ_INIT(&vi->vi_nc_list);
vi->vi_nc_mode = VNOVAL;
vi->vi_nc_uid = VNOVAL;
vi->vi_nc_gid = VNOVAL;
}
/*
* A vnode is being freed: finish cache structures.
*/
void
cache_vnode_fini(struct vnode *vp)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL);
KASSERT(TAILQ_EMPTY(&vi->vi_nc_list));
rw_destroy(&vi->vi_nc_lock);
rw_destroy(&vi->vi_nc_listlock);
}
/*
* Helper for cache_purge1(): purge cache entries for the given vnode from
* all directories that the vnode is cached in.
*/
static void
cache_purge_parents(struct vnode *vp)
{
vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp);
struct vnode *dvp, *blocked;
struct namecache *ncp;
SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
blocked = NULL;
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) {
/*
* Locking in the wrong direction. Try for a hold on the
* directory node's lock, and if we get it then all good,
* nuke the entry and move on to the next.
*/
dvp = ncp->nc_dvp;
dvi = VNODE_TO_VIMPL(dvp);
if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
cache_remove(ncp, false);
rw_exit(&dvi->vi_nc_lock);
blocked = NULL;
continue;
}
/*
* We can't wait on the directory node's lock with our list
* lock held or the system could deadlock.
*
* Take a hold on the directory vnode to prevent it from
* being freed (taking the vnode & lock with it). Then
* wait for the lock to become available with no other locks
* held, and retry.
*
* If this happens twice in a row, give the other side a
* breather; we can do nothing until it lets go.
*/
vhold(dvp);
rw_exit(&vi->vi_nc_listlock);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
/* Do nothing. */
rw_exit(&dvi->vi_nc_lock);
holdrele(dvp);
if (blocked == dvp) {
kpause("ncpurge", false, 1, NULL);
}
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
blocked = dvp;
}
rw_exit(&vi->vi_nc_listlock);
}
/*
* Helper for cache_purge1(): purge all cache entries hanging off the given
* directory vnode.
*/
static void
cache_purge_children(struct vnode *dvp)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) {
cache_remove(ncp, true);
}
rw_exit(&dvi->vi_nc_lock);
}
/*
* Helper for cache_purge1(): purge cache entry from the given vnode,
* finding it by name.
*/
static void
cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
uintptr_t key;
SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
key = cache_key(name, namelen);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (ncp) {
cache_remove(ncp, true);
}
rw_exit(&dvi->vi_nc_lock);
}
/*
* Cache flush, a particular vnode; called when a vnode is renamed to
* hide entries that would now be invalid.
*/
void
cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
{
if (flags & PURGE_PARENTS) {
cache_purge_parents(vp);
}
if (flags & PURGE_CHILDREN) {
cache_purge_children(vp);
}
if (name != NULL) {
cache_purge_name(vp, name, namelen);
}
}
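/*
 * Illustrative sketch (not part of the original source): the common "forget
 * everything about this vnode" case built from the flags above; the tree's
 * cache_purge() convenience wrapper in sys/namei.h is along these lines.
 */
#if 0
static void
example_purge_all(struct vnode *vp)
{
	cache_purge1(vp, NULL, 0, PURGE_PARENTS | PURGE_CHILDREN);
}
#endif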
/*
* vnode filter for cache_purgevfs().
*/
static bool
cache_vdir_filter(void *cookie, vnode_t *vp)
{
return vp->v_type == VDIR;
}
/*
* Cache flush, a whole filesystem; called when filesys is umounted to
* remove entries that would now be invalid.
*/
void
cache_purgevfs(struct mount *mp)
{
struct vnode_iterator *iter;
vnode_t *dvp;
vfs_vnode_iterator_init(mp, &iter);
for (;;) {
dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL);
if (dvp == NULL) {
break;
}
cache_purge_children(dvp);
vrele(dvp);
}
vfs_vnode_iterator_destroy(iter);
}
/*
* Re-queue an entry onto the tail of the active LRU list, after it has
* scored a hit.
*/
static void
cache_activate(struct namecache *ncp)
{
mutex_enter(&cache_lru_lock);
TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
cache_lru.count[ncp->nc_lrulist]--;
cache_lru.count[LRU_ACTIVE]++;
ncp->nc_lrulist = LRU_ACTIVE;
mutex_exit(&cache_lru_lock);
}
/*
* Try to balance the LRU lists. Pick some victim entries, and re-queue
* them from the head of the active list to the tail of the inactive list.
*/
static void
cache_deactivate(void)
{
struct namecache *ncp;
int total, i;
KASSERT(mutex_owned(&cache_lru_lock));
/* If we're nowhere near budget yet, don't bother. */
total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
if (total < (desiredvnodes >> 1)) {
return;
}
/*
* Aim for a 1:1 ratio of active to inactive. This is to allow each
* potential victim a reasonable amount of time to cycle through the
* inactive list in order to score a hit and be reactivated, while
* trying not to cause reactivations too frequently.
*/
if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
return;
}
/* Move only a few at a time; will catch up eventually. */
for (i = 0; i < cache_lru_maxdeact; i++) {
ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
if (ncp == NULL) {
break;
}
KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
ncp->nc_lrulist = LRU_INACTIVE;
TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
cache_lru.count[LRU_ACTIVE]--;
cache_lru.count[LRU_INACTIVE]++;
}
}
/*
* Free some entries from the cache, when we have gone over budget.
*
* We don't want to cause too much work for any individual caller, and it
* doesn't matter if we temporarily go over budget. This is also "just a
* cache" so it's not a big deal if we screw up and throw out something we
* shouldn't. So we take a relaxed attitude to this process to reduce its
* impact.
*/
static void
cache_reclaim(void)
{
struct namecache *ncp;
vnode_impl_t *dvi;
int toscan;
/*
* Scan up to a preset maximum number of entries, but no more than
* 0.8% of the total at once (to allow for very small systems).
*
* On bigger systems, do a larger chunk of work to reduce the number
* of times that cache_lru_lock is held for any length of time.
*/
mutex_enter(&cache_lru_lock);
toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7);
toscan = MAX(toscan, 1);
SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] +
cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0);
while (toscan-- != 0) {
/* First try to balance the lists. */
cache_deactivate();
/* Now look for a victim on head of inactive list (old). */
ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
if (ncp == NULL) {
break;
}
dvi = VNODE_TO_VIMPL(ncp->nc_dvp);
KASSERT(ncp->nc_lrulist == LRU_INACTIVE);
KASSERT(dvi != NULL);
/*
* Locking in the wrong direction. If we can't get the
* lock, the directory is actively busy, and it could also
* cause problems for the next guy in here, so send the
* entry to the back of the list.
*/
if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE],
ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE],
ncp, nc_lru);
continue;
}
/*
* Now have the victim entry locked. Drop the LRU list
* lock, purge the entry, and start over. The hold on
* vi_nc_lock will prevent the vnode from vanishing until
* finished (cache_purge() will be called on dvp before it
* disappears, and that will wait on vi_nc_lock).
*/
mutex_exit(&cache_lru_lock);
cache_remove(ncp, true);
rw_exit(&dvi->vi_nc_lock);
mutex_enter(&cache_lru_lock);
}
mutex_exit(&cache_lru_lock);
}
/*
* For file system code: count a lookup that required a full re-scan of
* directory metadata.
*/
void
namecache_count_pass2(void)
{
COUNT(ncs_pass2);
}
/*
* For file system code: count a lookup that scored a hit in the directory
* metadata near the location of the last lookup.
*/
void
namecache_count_2passes(void)
{
COUNT(ncs_2passes);
}
/*
* Sum the stats from all CPUs into nchstats. This needs to run at least
* once within every window where a 32-bit counter could roll over. It's
* called regularly by timer to ensure this.
*/
static void
cache_update_stats(void *cookie)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
mutex_enter(&cache_stat_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
struct nchcpu *nchcpu = ci->ci_data.cpu_nch;
UPDATE(nchcpu, ncs_goodhits);
UPDATE(nchcpu, ncs_neghits);
UPDATE(nchcpu, ncs_badhits);
UPDATE(nchcpu, ncs_falsehits);
UPDATE(nchcpu, ncs_miss);
UPDATE(nchcpu, ncs_long);
UPDATE(nchcpu, ncs_pass2);
UPDATE(nchcpu, ncs_2passes);
UPDATE(nchcpu, ncs_revhits);
UPDATE(nchcpu, ncs_revmiss);
UPDATE(nchcpu, ncs_denied);
}
if (cookie != NULL) {
memcpy(cookie, &nchstats, sizeof(nchstats));
}
/* Reset the timer; arrive back here in N minutes at latest. */
callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
mutex_exit(&cache_stat_lock);
}
/*
* Fetch the current values of the stats for sysctl.
*/
static int
cache_stat_sysctl(SYSCTLFN_ARGS)
{
struct nchstats stats;
if (oldp == NULL) {
*oldlenp = sizeof(nchstats);
return 0;
}
if (*oldlenp <= 0) {
*oldlenp = 0;
return 0;
}
/* Refresh the global stats. */
sysctl_unlock();
cache_update_stats(&stats);
sysctl_relock();
*oldlenp = MIN(sizeof(stats), *oldlenp);
return sysctl_copyout(l, &stats, oldp, *oldlenp);
}
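/*
 * Illustrative sketch (not part of the original source): the node created
 * in nchinit() is assumed to surface as "vfs.namecache_stats", so a
 * userland program could read the exported struct roughly like this
 * (hypothetical userland example, not kernel code):
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>

static void
example_print_nchstats(void)
{
	struct nchstats ns;
	size_t len = sizeof(ns);

	if (sysctlbyname("vfs.namecache_stats", &ns, &len, NULL, 0) == 0)
		printf("good hits: %llu\n",
		    (unsigned long long)ns.ncs_goodhits);
}
#endif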
/*
* For the debugger, given the address of a vnode, print all associated
* names in the cache.
*/
#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
struct vnode *dvp = NULL;
struct namecache *ncp;
enum cache_lru_id id;
for (id = 0; id < LRU_COUNT; id++) {
TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
if (ncp->nc_vp == vp) {
(*pr)("name %.*s\n", NC_NLEN(ncp),
ncp->nc_name);
dvp = ncp->nc_dvp;
}
}
}
if (dvp == NULL) {
(*pr)("name not found\n");
return;
}
for (id = 0; id < LRU_COUNT; id++) {
TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
if (ncp->nc_vp == dvp) {
(*pr)("parent %.*s\n", NC_NLEN(ncp),
ncp->nc_name);
}
}
}
}
#endif
/* $NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Matt Thomas <matt@3am-software.com>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <sys/types.h>
#include <stddef.h>
#include <assert.h>
#include <stdbool.h>
#ifdef RBDEBUG
#define KASSERT(s) assert(s)
#define __rbt_unused
#else
#define KASSERT(s) do { } while (/*CONSTCOND*/ 0)
#define __rbt_unused __unused
#endif
__RCSID("$NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $");
#else
#include <lib/libkern/libkern.h>
__KERNEL_RCSID(0, "$NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $");
#ifndef DIAGNOSTIC
#define __rbt_unused __unused
#else
#define __rbt_unused
#endif
#endif
#ifdef _LIBC
__weak_alias(rb_tree_init, _rb_tree_init)
__weak_alias(rb_tree_find_node, _rb_tree_find_node)
__weak_alias(rb_tree_find_node_geq, _rb_tree_find_node_geq)
__weak_alias(rb_tree_find_node_leq, _rb_tree_find_node_leq)
__weak_alias(rb_tree_insert_node, _rb_tree_insert_node)
__weak_alias(rb_tree_remove_node, _rb_tree_remove_node)
__weak_alias(rb_tree_iterate, _rb_tree_iterate)
#ifdef RBDEBUG
__weak_alias(rb_tree_check, _rb_tree_check)
__weak_alias(rb_tree_depths, _rb_tree_depths)
#endif
#include "namespace.h"
#endif
#ifdef RBTEST
#include "rbtree.h"
#else
#include <sys/rbtree.h>
#endif
static void rb_tree_insert_rebalance(struct rb_tree *, struct rb_node *);
static void rb_tree_removal_rebalance(struct rb_tree *, struct rb_node *,
unsigned int);
#ifdef RBDEBUG
static const struct rb_node *rb_tree_iterate_const(const struct rb_tree *,
const struct rb_node *, const unsigned int);
static bool rb_tree_check_node(const struct rb_tree *, const struct rb_node *,
const struct rb_node *, bool);
#else
#define rb_tree_check_node(a, b, c, d) true
#endif
#define RB_NODETOITEM(rbto, rbn) \
((void *)((uintptr_t)(rbn) - (rbto)->rbto_node_offset))
#define RB_ITEMTONODE(rbto, rbn) \
((rb_node_t *)((uintptr_t)(rbn) + (rbto)->rbto_node_offset))
#define RB_SENTINEL_NODE NULL
void
rb_tree_init(struct rb_tree *rbt, const rb_tree_ops_t *ops)
{
rbt->rbt_ops = ops;
rbt->rbt_root = RB_SENTINEL_NODE;
RB_TAILQ_INIT(&rbt->rbt_nodes);
#ifndef RBSMALL
rbt->rbt_minmax[RB_DIR_LEFT] = rbt->rbt_root; /* minimum node */
rbt->rbt_minmax[RB_DIR_RIGHT] = rbt->rbt_root; /* maximum node */
#endif
#ifdef RBSTATS
rbt->rbt_count = 0;
rbt->rbt_insertions = 0;
rbt->rbt_removals = 0;
rbt->rbt_insertion_rebalance_calls = 0;
rbt->rbt_insertion_rebalance_passes = 0;
rbt->rbt_removal_rebalance_calls = 0;
rbt->rbt_removal_rebalance_passes = 0;
#endif
}
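/*
 * Illustrative sketch (not part of the original source): what a client of
 * this API supplies, mirroring the cache_rbtree_ops example in vfs_cache.c
 * above.  The struct and comparators are hypothetical; the key point is
 * that rbto_node_offset lets RB_NODETOITEM()/RB_ITEMTONODE() convert
 * between a user object and its embedded rb_node_t.
 */
#if 0
struct example_item {
	int		ei_key;
	rb_node_t	ei_node;	/* embedded tree linkage */
};

static signed int
example_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct example_item *i1 = n1, *i2 = n2;

	return (i1->ei_key > i2->ei_key) - (i1->ei_key < i2->ei_key);
}

static signed int
example_compare_key(void *context, const void *n, const void *keyp)
{
	const struct example_item *i = n;
	const int key = *(const int *)keyp;

	return (i->ei_key > key) - (i->ei_key < key);
}

static const rb_tree_ops_t example_ops = {
	.rbto_compare_nodes = example_compare_nodes,
	.rbto_compare_key = example_compare_key,
	.rbto_node_offset = offsetof(struct example_item, ei_node),
	.rbto_context = NULL
};
#endif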
void *
rb_tree_find_node(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
parent = parent->rb_nodes[diff < 0];
}
return NULL;
}
void *
rb_tree_find_node_geq(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root, *last = NULL;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
if (diff > 0)
last = parent;
parent = parent->rb_nodes[diff < 0];
}
return last == NULL ? NULL : RB_NODETOITEM(rbto, last);
}
void *
rb_tree_find_node_leq(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root, *last = NULL;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
if (diff < 0)
last = parent;
parent = parent->rb_nodes[diff < 0];
}
return last == NULL ? NULL : RB_NODETOITEM(rbto, last);
}
void *
rb_tree_insert_node(struct rb_tree *rbt, void *object)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_nodes_fn compare_nodes = rbto->rbto_compare_nodes;
struct rb_node *parent, *tmp, *self = RB_ITEMTONODE(rbto, object);
unsigned int position;
bool rebalance;
RBSTAT_INC(rbt->rbt_insertions);
tmp = rbt->rbt_root;
/*
* This is a hack. Because rbt->rbt_root is just a struct rb_node *,
* just like rb_node->rb_nodes[RB_DIR_LEFT], we can use this fact to
* avoid a lot of tests for root and know that even at root,
* updating RB_FATHER(rb_node)->rb_nodes[RB_POSITION(rb_node)] will
* update rbt->rbt_root.
*/
parent = (struct rb_node *)(void *)&rbt->rbt_root;
position = RB_DIR_LEFT;
/*
* Find out where to place this new leaf.
*/
while (!RB_SENTINEL_P(tmp)) {
void *tobj = RB_NODETOITEM(rbto, tmp);
const signed int diff = (*compare_nodes)(rbto->rbto_context,
tobj, object);
if (__predict_false(diff == 0)) {
/*
* Node already exists; return it.
*/
return tobj;
}
parent = tmp;
position = (diff < 0);
tmp = parent->rb_nodes[position];
}
#ifdef RBDEBUG
{
struct rb_node *prev = NULL, *next = NULL;
if (position == RB_DIR_RIGHT)
prev = parent;
else if (tmp != rbt->rbt_root)
next = parent;
/*
* Verify our sequential position
*/
KASSERT(prev == NULL || !RB_SENTINEL_P(prev));
KASSERT(next == NULL || !RB_SENTINEL_P(next));
if (prev != NULL && next == NULL)
next = TAILQ_NEXT(prev, rb_link);
if (prev == NULL && next != NULL)
prev = TAILQ_PREV(next, rb_node_qh, rb_link);
KASSERT(prev == NULL || !RB_SENTINEL_P(prev));
KASSERT(next == NULL || !RB_SENTINEL_P(next));
KASSERT(prev == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, prev), RB_NODETOITEM(rbto, self)) < 0);
KASSERT(next == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self), RB_NODETOITEM(rbto, next)) < 0);
}
#endif
/*
* Initialize the node and insert as a leaf into the tree.
*/
RB_SET_FATHER(self, parent);
RB_SET_POSITION(self, position);
if (__predict_false(parent == (struct rb_node *)(void *)&rbt->rbt_root)) {
RB_MARK_BLACK(self); /* root is always black */
#ifndef RBSMALL
rbt->rbt_minmax[RB_DIR_LEFT] = self;
rbt->rbt_minmax[RB_DIR_RIGHT] = self;
#endif
rebalance = false;
} else {
KASSERT(position == RB_DIR_LEFT || position == RB_DIR_RIGHT);
#ifndef RBSMALL
/*
* Keep track of the minimum and maximum nodes. If our
* parent is a minmax node and we are on their min/max side,
* we must be the new min/max node.
*/
if (parent == rbt->rbt_minmax[position])
rbt->rbt_minmax[position] = self;
#endif /* !RBSMALL */
/*
* All new nodes are colored red. We only need to rebalance
* if our parent is also red.
*/
RB_MARK_RED(self);
rebalance = RB_RED_P(parent);
}
KASSERT(RB_SENTINEL_P(parent->rb_nodes[position]));
self->rb_left = parent->rb_nodes[position];
self->rb_right = parent->rb_nodes[position];
parent->rb_nodes[position] = self;
KASSERT(RB_CHILDLESS_P(self));
/*
* Insert the new node into a sorted list for easy sequential access
*/
RBSTAT_INC(rbt->rbt_count);
#ifdef RBDEBUG
if (RB_ROOT_P(rbt, self)) {
RB_TAILQ_INSERT_HEAD(&rbt->rbt_nodes, self, rb_link);
} else if (position == RB_DIR_LEFT) {
KASSERT((*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self),
RB_NODETOITEM(rbto, RB_FATHER(self))) < 0);
RB_TAILQ_INSERT_BEFORE(RB_FATHER(self), self, rb_link);
} else {
KASSERT((*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, RB_FATHER(self)),
RB_NODETOITEM(rbto, self)) < 0);
RB_TAILQ_INSERT_AFTER(&rbt->rbt_nodes, RB_FATHER(self),
self, rb_link);
}
#endif
KASSERT(rb_tree_check_node(rbt, self, NULL, !rebalance));
/*
* Rebalance tree after insertion
*/
if (rebalance) {
rb_tree_insert_rebalance(rbt, self);
KASSERT(rb_tree_check_node(rbt, self, NULL, true));
}
/* Successfully inserted, return our node pointer. */
return object;
}
/*
* Swap the location and colors of 'self' and its child @ which. The child
* cannot be a sentinel node. This is our rotation function. However,
* since it preserves coloring, it greatly simplifies both insertion and
* removal since rotation almost always involves the exchanging of colors
* as a separate step.
*/
static void
rb_tree_reparent_nodes(__rbt_unused struct rb_tree *rbt,
struct rb_node *old_father, const unsigned int which)
{
const unsigned int other = which ^ RB_DIR_OTHER;
struct rb_node * const grandpa = RB_FATHER(old_father);
struct rb_node * const old_child = old_father->rb_nodes[which];
struct rb_node * const new_father = old_child;
struct rb_node * const new_child = old_father;
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
KASSERT(!RB_SENTINEL_P(old_child));
KASSERT(RB_FATHER(old_child) == old_father);
KASSERT(rb_tree_check_node(rbt, old_father, NULL, false));
KASSERT(rb_tree_check_node(rbt, old_child, NULL, false));
KASSERT(RB_ROOT_P(rbt, old_father) ||
rb_tree_check_node(rbt, grandpa, NULL, false));
/*
* Exchange descendant linkages.
*/
grandpa->rb_nodes[RB_POSITION(old_father)] = new_father;
new_child->rb_nodes[which] = old_child->rb_nodes[other];
new_father->rb_nodes[other] = new_child;
/*
* Update ancestor linkages
*/
RB_SET_FATHER(new_father, grandpa);
RB_SET_FATHER(new_child, new_father);
/*
* Exchange properties between new_father and new_child. The only
* change is that new_child's position is now on the other side.
*/
#if 0
{
struct rb_node tmp;
tmp.rb_info = 0;
RB_COPY_PROPERTIES(&tmp, old_child);
RB_COPY_PROPERTIES(new_father, old_father);
RB_COPY_PROPERTIES(new_child, &tmp);
}
#else
RB_SWAP_PROPERTIES(new_father, new_child);
#endif
RB_SET_POSITION(new_child, other);
/*
* Make sure to reparent the new child to ourself.
*/
if (!RB_SENTINEL_P(new_child->rb_nodes[which])) {
RB_SET_FATHER(new_child->rb_nodes[which], new_child);
RB_SET_POSITION(new_child->rb_nodes[which], which);
}
KASSERT(rb_tree_check_node(rbt, new_father, NULL, false));
KASSERT(rb_tree_check_node(rbt, new_child, NULL, false));
KASSERT(RB_ROOT_P(rbt, new_father) ||
rb_tree_check_node(rbt, grandpa, NULL, false));
}
static void
rb_tree_insert_rebalance(struct rb_tree *rbt, struct rb_node *self)
{
struct rb_node * father = RB_FATHER(self);
struct rb_node * grandpa = RB_FATHER(father);
struct rb_node * uncle;
unsigned int which;
unsigned int other;
KASSERT(!RB_ROOT_P(rbt, self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
RBSTAT_INC(rbt->rbt_insertion_rebalance_calls);
for (;;) {
KASSERT(!RB_SENTINEL_P(self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
/*
* We are red and our parent is red, therefore we must have a
* grandfather and he must be black.
*/
grandpa = RB_FATHER(father);
KASSERT(RB_BLACK_P(grandpa));
KASSERT(RB_DIR_RIGHT == 1 && RB_DIR_LEFT == 0);
which = (father == grandpa->rb_right);
other = which ^ RB_DIR_OTHER;
uncle = grandpa->rb_nodes[other];
if (RB_BLACK_P(uncle))
break;
RBSTAT_INC(rbt->rbt_insertion_rebalance_passes);
/*
* Case 1: our uncle is red
* Simply invert the colors of our parent and
* uncle and make our grandparent red. And
* then solve the problem up at his level.
*/
RB_MARK_BLACK(uncle);
RB_MARK_BLACK(father);
if (__predict_false(RB_ROOT_P(rbt, grandpa))) {
/*
* If our grandpa is root, don't bother
* setting him to red, just return.
*/
KASSERT(RB_BLACK_P(grandpa));
return;
}
RB_MARK_RED(grandpa);
self = grandpa;
father = RB_FATHER(self);
KASSERT(RB_RED_P(self));
if (RB_BLACK_P(father)) {
/*
* If our greatgrandpa is black, we're done.
*/
KASSERT(RB_BLACK_P(rbt->rbt_root));
return;
}
}
KASSERT(!RB_ROOT_P(rbt, self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
KASSERT(RB_BLACK_P(uncle));
KASSERT(RB_BLACK_P(grandpa));
/*
* Case 2&3: our uncle is black.
*/
if (self == father->rb_nodes[other]) {
/*
* Case 2: we are on the same side as our uncle
* Swap ourselves with our parent so this case
* becomes case 3. Basically our parent becomes our
* child.
*/
rb_tree_reparent_nodes(rbt, father, other);
KASSERT(RB_FATHER(father) == self);
KASSERT(self->rb_nodes[which] == father);
KASSERT(RB_FATHER(self) == grandpa);
self = father;
father = RB_FATHER(self);
}
KASSERT(RB_RED_P(self) && RB_RED_P(father));
KASSERT(grandpa->rb_nodes[which] == father);
/*
* Case 3: we are opposite a child of a black uncle.
* Swap our parent and grandparent. Since our grandfather
* is black, our father will become black and our new sibling
* (former grandparent) will become red.
*/
rb_tree_reparent_nodes(rbt, grandpa, which);
KASSERT(RB_FATHER(self) == father);
KASSERT(RB_FATHER(self)->rb_nodes[RB_POSITION(self) ^ RB_DIR_OTHER] == grandpa);
KASSERT(RB_RED_P(self));
KASSERT(RB_BLACK_P(father));
KASSERT(RB_RED_P(grandpa));
/*
* Final step: Set the root to black.
*/
RB_MARK_BLACK(rbt->rbt_root);
}
static void
rb_tree_prune_node(struct rb_tree *rbt, struct rb_node *self, bool rebalance)
{
const unsigned int which = RB_POSITION(self);
struct rb_node *father = RB_FATHER(self);
#ifndef RBSMALL
const bool was_root = RB_ROOT_P(rbt, self);
#endif
KASSERT(rebalance || (RB_ROOT_P(rbt, self) || RB_RED_P(self)));
KASSERT(!rebalance || RB_BLACK_P(self));
KASSERT(RB_CHILDLESS_P(self));
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
/*
* Since we are childless, we know that self->rb_left is pointing
* to the sentinel node.
*/
father->rb_nodes[which] = self->rb_left;
/*
* Remove ourselves from the node list, decrement the count,
* and update min/max.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(rbt->rbt_minmax[RB_POSITION(self)] == self)) {
rbt->rbt_minmax[RB_POSITION(self)] = father;
/*
* When removing the root, rbt->rbt_minmax[RB_DIR_LEFT] is
* updated automatically, but we also need to update
* rbt->rbt_minmax[RB_DIR_RIGHT];
*/
if (__predict_false(was_root)) {
rbt->rbt_minmax[RB_DIR_RIGHT] = father;
}
}
RB_SET_FATHER(self, NULL);
#endif
/*
* Rebalance if requested.
*/
if (rebalance)
rb_tree_removal_rebalance(rbt, father, which);
KASSERT(was_root || rb_tree_check_node(rbt, father, NULL, true));
}
/*
* When deleting an interior node
*/
static void
rb_tree_swap_prune_and_rebalance(struct rb_tree *rbt, struct rb_node *self,
struct rb_node *standin)
{
const unsigned int standin_which = RB_POSITION(standin);
unsigned int standin_other = standin_which ^ RB_DIR_OTHER;
struct rb_node *standin_son;
struct rb_node *standin_father = RB_FATHER(standin);
bool rebalance = RB_BLACK_P(standin);
if (standin_father == self) {
/*
* As a child of self, any children would be opposite of
* our parent.
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_other]));
standin_son = standin->rb_nodes[standin_which];
} else {
/*
* Since we aren't a child of self, any children would be
* on the same side as our parent.
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_which]));
standin_son = standin->rb_nodes[standin_other];
}
/*
* the node we are removing must have two children.
*/
KASSERT(RB_TWOCHILDREN_P(self));
/*
* If standin has a child, it must be red.
*/
KASSERT(RB_SENTINEL_P(standin_son) || RB_RED_P(standin_son));
/*
* Verify things are sane.
*/
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
KASSERT(rb_tree_check_node(rbt, standin, NULL, false));
if (__predict_false(RB_RED_P(standin_son))) {
/*
* We know we have a red child so if we flip it to black
* we don't have to rebalance.
*/
KASSERT(rb_tree_check_node(rbt, standin_son, NULL, true));
RB_MARK_BLACK(standin_son);
rebalance = false;
if (standin_father == self) {
KASSERT(RB_POSITION(standin_son) == standin_which);
} else {
KASSERT(RB_POSITION(standin_son) == standin_other);
/*
* Change the son's parentage to point to his grandpa.
*/
RB_SET_FATHER(standin_son, standin_father);
RB_SET_POSITION(standin_son, standin_which);
}
}
if (standin_father == self) {
/*
* If we are about to delete the standin's father, then when
* we call rebalance, we need to use ourselves as our father.
* Otherwise remember our original father. Also, since we are
* our standin's father we only need to reparent the standin's
* brother.
*
* | R --> S |
* | Q S --> Q T |
* | t --> |
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_other]));
KASSERT(!RB_SENTINEL_P(self->rb_nodes[standin_other]));
KASSERT(self->rb_nodes[standin_which] == standin);
/*
* Have our son/standin adopt his brother as his new son.
*/
standin_father = standin;
} else {
/*
* | R --> S . |
* | / \ | T --> / \ | / |
* | ..... | S --> ..... | T |
*
* Sever standin's connection to his father.
*/
standin_father->rb_nodes[standin_which] = standin_son;
/*
* Adopt the far son.
*/
standin->rb_nodes[standin_other] = self->rb_nodes[standin_other];
RB_SET_FATHER(standin->rb_nodes[standin_other], standin);
KASSERT(RB_POSITION(self->rb_nodes[standin_other]) == standin_other);
/*
* Use standin_other because we need to preserve standin_which
* for the removal_rebalance.
*/
standin_other = standin_which;
}
/*
* Move the only remaining son to our standin. If our standin is our
* son, this will be the only son needed to be moved.
*/
KASSERT(standin->rb_nodes[standin_other] != self->rb_nodes[standin_other]);
standin->rb_nodes[standin_other] = self->rb_nodes[standin_other];
RB_SET_FATHER(standin->rb_nodes[standin_other], standin);
/*
* Now copy the result of self to standin and then replace
* self with standin in the tree.
*/
RB_COPY_PROPERTIES(standin, self);
RB_SET_FATHER(standin, RB_FATHER(self));
RB_FATHER(standin)->rb_nodes[RB_POSITION(standin)] = standin;
/*
* Remove ourselves from the node list, decrement the count,
* and update min/max.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(rbt->rbt_minmax[RB_POSITION(self)] == self))
rbt->rbt_minmax[RB_POSITION(self)] = RB_FATHER(self);
RB_SET_FATHER(self, NULL);
#endif
KASSERT(rb_tree_check_node(rbt, standin, NULL, false));
KASSERT(RB_FATHER_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin_father, NULL, false));
KASSERT(RB_LEFT_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin->rb_left, NULL, false));
KASSERT(RB_RIGHT_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin->rb_right, NULL, false));
if (!rebalance)
return;
rb_tree_removal_rebalance(rbt, standin_father, standin_which);
KASSERT(rb_tree_check_node(rbt, standin, NULL, true));
}
/*
* We could do this by doing
* rb_tree_node_swap(rbt, self, which);
* rb_tree_prune_node(rbt, self, false);
*
* But it's more efficient to just evaluate and recolor the child.
*/
static void
rb_tree_prune_blackred_branch(struct rb_tree *rbt, struct rb_node *self,
unsigned int which)
{
struct rb_node *father = RB_FATHER(self);
struct rb_node *son = self->rb_nodes[which];
#ifndef RBSMALL
const bool was_root = RB_ROOT_P(rbt, self);
#endif
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
KASSERT(RB_BLACK_P(self) && RB_RED_P(son));
KASSERT(!RB_TWOCHILDREN_P(son));
KASSERT(RB_CHILDLESS_P(son));
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
KASSERT(rb_tree_check_node(rbt, son, NULL, false));
/*
* Remove ourselves from the tree and give our former child our
* properties (position, color, root).
*/
RB_COPY_PROPERTIES(son, self);
father->rb_nodes[RB_POSITION(son)] = son;
RB_SET_FATHER(son, father);
/*
* Remove ourselves from the node list, decrement the count,
* and update minmax.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(was_root)) {
KASSERT(rbt->rbt_minmax[which] == son);
rbt->rbt_minmax[which ^ RB_DIR_OTHER] = son;
} else if (rbt->rbt_minmax[RB_POSITION(self)] == self) {
rbt->rbt_minmax[RB_POSITION(self)] = son;
}
RB_SET_FATHER(self, NULL);
#endif
KASSERT(was_root || rb_tree_check_node(rbt, father, NULL, true));
KASSERT(rb_tree_check_node(rbt, son, NULL, true));
}
void
rb_tree_remove_node(struct rb_tree *rbt, void *object)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
struct rb_node *standin, *self = RB_ITEMTONODE(rbto, object);
unsigned int which;
KASSERT(!RB_SENTINEL_P(self));
RBSTAT_INC(rbt->rbt_removals);
/*
* In the following diagrams, we (the node to be removed) are S. Red
* nodes are lowercase. T could be either red or black.
*
* Remember the major axiom of the red-black tree: the number of
* black nodes from the root to each leaf is constant across all
* leaves, only the number of red nodes varies.
*
* Thus removing a red leaf doesn't require any other changes to a
* red-black tree. So if we must remove a node, attempt to rearrange
* the tree so we can remove a red node.
*
* The simplest case is a childless red node or a childless root node:
*
* | T --> T | or | R --> * |
* | s --> * |
*/
if (RB_CHILDLESS_P(self)) {
const bool rebalance = RB_BLACK_P(self) && !RB_ROOT_P(rbt, self);
rb_tree_prune_node(rbt, self, rebalance);
return;
}
KASSERT(!RB_CHILDLESS_P(self));
if (!RB_TWOCHILDREN_P(self)) {
/*
* The next simplest case is when the node we are deleting is
* black and has one red child.
*
* | T --> T --> T |
* | S --> R --> R |
* | r --> s --> * |
*/
which = RB_LEFT_SENTINEL_P(self) ? RB_DIR_RIGHT : RB_DIR_LEFT;
KASSERT(RB_BLACK_P(self));
KASSERT(RB_RED_P(self->rb_nodes[which]));
KASSERT(RB_CHILDLESS_P(self->rb_nodes[which]));
rb_tree_prune_blackred_branch(rbt, self, which);
return;
}
KASSERT(RB_TWOCHILDREN_P(self));
/*
* We invert these because we prefer to remove from the inside of
* the tree.
*/
which = RB_POSITION(self) ^ RB_DIR_OTHER;
/*
* Let's find the node closest to us opposite of our parent.
* Now swap it with ourself, "prune" it, and rebalance, if needed.
*/
standin = RB_ITEMTONODE(rbto, rb_tree_iterate(rbt, object, which));
rb_tree_swap_prune_and_rebalance(rbt, self, standin);
}
static void
rb_tree_removal_rebalance(struct rb_tree *rbt, struct rb_node *parent,
unsigned int which)
{
KASSERT(!RB_SENTINEL_P(parent));
KASSERT(RB_SENTINEL_P(parent->rb_nodes[which]));
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
RBSTAT_INC(rbt->rbt_removal_rebalance_calls);
while (RB_BLACK_P(parent->rb_nodes[which])) {
unsigned int other = which ^ RB_DIR_OTHER;
struct rb_node *brother = parent->rb_nodes[other];
RBSTAT_INC(rbt->rbt_removal_rebalance_passes);
KASSERT(!RB_SENTINEL_P(brother));
/*
* For cases 1, 2a, and 2b, our brother's children must
* be black and our father must be black
*/
if (RB_BLACK_P(parent) && RB_BLACK_P(brother->rb_left) && RB_BLACK_P(brother->rb_right)) {
if (RB_RED_P(brother)) {
/*
* Case 1: Our brother is red, swap its
* position (and colors) with our parent.
* This should now be case 2b (unless C or E
* has a red child which is case 3; thus no
* explicit branch to case 2b).
*
* B -> D
* A d -> b E
* C E -> A C
*/
KASSERT(RB_BLACK_P(parent));
rb_tree_reparent_nodes(rbt, parent, other);
brother = parent->rb_nodes[other];
KASSERT(!RB_SENTINEL_P(brother));
KASSERT(RB_RED_P(parent));
KASSERT(RB_BLACK_P(brother));
KASSERT(rb_tree_check_node(rbt, brother, NULL, false));
KASSERT(rb_tree_check_node(rbt, parent, NULL, false));
} else {
/*
* Both our parent and brother are black.
* Change our brother to red, advance up rank
* and go through the loop again.
*
* B -> *B
* *A D -> A d
* C E -> C E
*/
RB_MARK_RED(brother);
KASSERT(RB_BLACK_P(brother->rb_left));
KASSERT(RB_BLACK_P(brother->rb_right));
if (RB_ROOT_P(rbt, parent))
return; /* root == parent == black */
KASSERT(rb_tree_check_node(rbt, brother, NULL, false));
KASSERT(rb_tree_check_node(rbt, parent, NULL, false));
which = RB_POSITION(parent);
parent = RB_FATHER(parent);
continue;
}
}
/*
* Avoid an else here so that case 2a above can hit either
* case 2b, 3, or 4.
*/
if (RB_RED_P(parent) && RB_BLACK_P(brother) && RB_BLACK_P(brother->rb_left) && RB_BLACK_P(brother->rb_right)) {
KASSERT(RB_RED_P(parent));
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_BLACK_P(brother->rb_left));
KASSERT(RB_BLACK_P(brother->rb_right));
/*
* We are black, our father is red, our brother and
* both nephews are black. Simply invert/exchange the
* colors of our father and brother (to black and red
* respectively).
*
* | f --> F |
* | * B --> * b |
* | N N --> N N |
*/
RB_MARK_BLACK(parent);
RB_MARK_RED(brother);
KASSERT(rb_tree_check_node(rbt, brother, NULL, true));
break; /* We're done! */
} else {
/*
* Our brother must be black and have at least one
* red child (it may have two).
*/
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_RED_P(brother->rb_nodes[which]) ||
RB_RED_P(brother->rb_nodes[other]));
if (RB_BLACK_P(brother->rb_nodes[other])) {
/*
* Case 3: our brother is black, our near
* nephew is red, and our far nephew is black.
* Swap our brother with our near nephew.
* This results in a tree that matches case 4.
* (Our father could be red or black).
*
* | F --> F |
* | x B --> x B |
* | n --> n |
*/
KASSERT(RB_RED_P(brother->rb_nodes[which]));
rb_tree_reparent_nodes(rbt, brother, which);
KASSERT(RB_FATHER(brother) == parent->rb_nodes[other]);
brother = parent->rb_nodes[other];
KASSERT(RB_RED_P(brother->rb_nodes[other]));
}
/*
* Case 4: our brother is black and our far nephew
* is red. Swap our father and brother locations and
* change our far nephew to black. (these can be
* done in either order so we change the color first).
* The result is a valid red-black tree and is a
* terminal case. (again we don't care about the
* father's color)
*
* If the father is red, we will get a red-black-black
* tree:
* | f -> f --> b |
* | B -> B --> F N |
* | n -> N --> |
*
* If the father is black, we will get an all black
* tree:
* | F -> F --> B |
* | B -> B --> F N |
* | n -> N --> |
*
* If we had two red nephews, then after the swap,
* our former father would have a red grandson.
*/
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_RED_P(brother->rb_nodes[other]));
RB_MARK_BLACK(brother->rb_nodes[other]);
rb_tree_reparent_nodes(rbt, parent, other);
break; /* We're done! */
}
}
KASSERT(rb_tree_check_node(rbt, parent, NULL, true));
}
void *
rb_tree_iterate(struct rb_tree *rbt, void *object, const unsigned int direction)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
const unsigned int other = direction ^ RB_DIR_OTHER;
struct rb_node *self;
KASSERT(direction == RB_DIR_LEFT || direction == RB_DIR_RIGHT);
if (object == NULL) {
#ifndef RBSMALL
if (RB_SENTINEL_P(rbt->rbt_root))
return NULL;
return RB_NODETOITEM(rbto, rbt->rbt_minmax[direction]);
#else
self = rbt->rbt_root;
if (RB_SENTINEL_P(self))
return NULL;
while (!RB_SENTINEL_P(self->rb_nodes[direction]))
self = self->rb_nodes[direction];
return RB_NODETOITEM(rbto, self);
#endif /* !RBSMALL */
}
self = RB_ITEMTONODE(rbto, object);
KASSERT(!RB_SENTINEL_P(self));
/*
* We can't go any further in this direction. We proceed up in the
* opposite direction until our parent is in the direction we want to go.
*/
if (RB_SENTINEL_P(self->rb_nodes[direction])) {
while (!RB_ROOT_P(rbt, self)) {
if (other == RB_POSITION(self))
return RB_NODETOITEM(rbto, RB_FATHER(self));
self = RB_FATHER(self);
}
return NULL;
}
/*
* Advance down one in current direction and go down as far as possible
* in the opposite direction.
*/
self = self->rb_nodes[direction];
KASSERT(!RB_SENTINEL_P(self));
while (!RB_SENTINEL_P(self->rb_nodes[other]))
self = self->rb_nodes[other];
return RB_NODETOITEM(rbto, self);
}
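/*
 * Illustrative only: a minimal sketch, kept out of the build with #if 0,
 * of walking a tree in ascending key order with rb_tree_iterate().  It
 * assumes the usual rbtree(9) setup (an embedded struct rb_node and an
 * rb_tree_ops_t registered via rb_tree_init()); the "struct example"
 * container, its "ex_key" member, and example_walk() are hypothetical
 * names, not part of this file.
 */
#if 0
struct example {
	struct rb_node ex_node;		/* embedded tree linkage */
	int ex_key;			/* sort key */
};

static void
example_walk(struct rb_tree *rbt)
{
	struct example *ex;

	/*
	 * Passing NULL returns the leftmost (minimum) node; each further
	 * call with RB_DIR_RIGHT returns the in-order successor.
	 */
	for (ex = rb_tree_iterate(rbt, NULL, RB_DIR_LEFT);
	    ex != NULL;
	    ex = rb_tree_iterate(rbt, ex, RB_DIR_RIGHT))
		printf("key %d\n", ex->ex_key);
}
#endif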
#ifdef RBDEBUG
static const struct rb_node *
rb_tree_iterate_const(const struct rb_tree *rbt, const struct rb_node *self,
const unsigned int direction)
{
const unsigned int other = direction ^ RB_DIR_OTHER;
KASSERT(direction == RB_DIR_LEFT || direction == RB_DIR_RIGHT);
if (self == NULL) {
#ifndef RBSMALL
if (RB_SENTINEL_P(rbt->rbt_root))
return NULL;
return rbt->rbt_minmax[direction];
#else
self = rbt->rbt_root;
if (RB_SENTINEL_P(self))
return NULL;
while (!RB_SENTINEL_P(self->rb_nodes[direction]))
self = self->rb_nodes[direction];
return self;
#endif /* !RBSMALL */
}
KASSERT(!RB_SENTINEL_P(self));
/*
* We can't go any further in this direction. We proceed up in the
* opposite direction until our parent is in the direction we want to go.
*/
if (RB_SENTINEL_P(self->rb_nodes[direction])) {
while (!RB_ROOT_P(rbt, self)) {
if (other == RB_POSITION(self))
return RB_FATHER(self);
self = RB_FATHER(self);
}
return NULL;
}
/*
* Advance down one in current direction and go down as far as possible
* in the opposite direction.
*/
self = self->rb_nodes[direction];
KASSERT(!RB_SENTINEL_P(self));
while (!RB_SENTINEL_P(self->rb_nodes[other]))
self = self->rb_nodes[other];
return self;
}
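/*
 * Verify that the left and right subtrees of "self" carry the same
 * number of black nodes, and return that black height (counting
 * "self" itself if it is black).
 */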
static unsigned int
rb_tree_count_black(const struct rb_node *self)
{
unsigned int left, right;
if (RB_SENTINEL_P(self))
return 0;
left = rb_tree_count_black(self->rb_left);
right = rb_tree_count_black(self->rb_right);
KASSERT(left == right);
return left + RB_BLACK_P(self);
}
static bool
rb_tree_check_node(const struct rb_tree *rbt, const struct rb_node *self,
const struct rb_node *prev, bool red_check)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_nodes_fn compare_nodes = rbto->rbto_compare_nodes;
KASSERT(!RB_SENTINEL_P(self));
KASSERT(prev == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, prev), RB_NODETOITEM(rbto, self)) < 0);
/*
* Verify our relationship to our parent.
*/
if (RB_ROOT_P(rbt, self)) {
KASSERT(self == rbt->rbt_root);
KASSERT(RB_POSITION(self) == RB_DIR_LEFT);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_LEFT] == self);
KASSERT(RB_FATHER(self) == (const struct rb_node *) &rbt->rbt_root);
} else {
int diff = (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self),
RB_NODETOITEM(rbto, RB_FATHER(self)));
KASSERT(self != rbt->rbt_root);
KASSERT(!RB_FATHER_SENTINEL_P(self));
if (RB_POSITION(self) == RB_DIR_LEFT) {
KASSERT(diff < 0);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_LEFT] == self);
} else {
KASSERT(diff > 0);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_RIGHT] == self);
}
}
/*
* Verify our position in the linked list against the tree itself.
*/
{
const struct rb_node *prev0 = rb_tree_iterate_const(rbt, self, RB_DIR_LEFT);
const struct rb_node *next0 = rb_tree_iterate_const(rbt, self, RB_DIR_RIGHT);
KASSERT(prev0 == TAILQ_PREV(self, rb_node_qh, rb_link));
KASSERT(next0 == TAILQ_NEXT(self, rb_link));
#ifndef RBSMALL
KASSERT(prev0 != NULL || self == rbt->rbt_minmax[RB_DIR_LEFT]);
KASSERT(next0 != NULL || self == rbt->rbt_minmax[RB_DIR_RIGHT]);
#endif
}
/*
* The root must be black.
* There can never be two adjacent red nodes.
*/
if (red_check) {
KASSERT(!RB_ROOT_P(rbt, self) || RB_BLACK_P(self));
(void) rb_tree_count_black(self);
if (RB_RED_P(self)) {
const struct rb_node *brother;
KASSERT(!RB_ROOT_P(rbt, self));
brother = RB_FATHER(self)->rb_nodes[RB_POSITION(self) ^ RB_DIR_OTHER];
KASSERT(RB_BLACK_P(RB_FATHER(self)));
/*
* If I'm red and have no children, then I must either
* have no brother, or my brother must also be red and
* also have no children. (black count == 0)
*/
KASSERT(!RB_CHILDLESS_P(self)
|| RB_SENTINEL_P(brother)
|| RB_RED_P(brother)
|| RB_CHILDLESS_P(brother));
/*
* If I'm not childless, I must have two children
* and they must both be black.
*/
KASSERT(RB_CHILDLESS_P(self)
|| (RB_TWOCHILDREN_P(self)
&& RB_BLACK_P(self->rb_left)
&& RB_BLACK_P(self->rb_right)));
/*
* If I'm not childless (and thus have black children),
* then my brother must either be black or have two
* black children.
*/
KASSERT(RB_CHILDLESS_P(self)
|| RB_BLACK_P(brother)
|| (RB_TWOCHILDREN_P(brother)
&& RB_BLACK_P(brother->rb_left)
&& RB_BLACK_P(brother->rb_right)));
} else {
/*
* If I'm black and have one child, that child must
* be red and childless.
*/
KASSERT(RB_CHILDLESS_P(self)
|| RB_TWOCHILDREN_P(self)
|| (!RB_LEFT_SENTINEL_P(self)
&& RB_RIGHT_SENTINEL_P(self)
&& RB_RED_P(self->rb_left)
&& RB_CHILDLESS_P(self->rb_left))
|| (!RB_RIGHT_SENTINEL_P(self)
&& RB_LEFT_SENTINEL_P(self)
&& RB_RED_P(self->rb_right)
&& RB_CHILDLESS_P(self->rb_right)));
/*
* If I'm a childless black node and my parent is
* black, my 2nd closest relative away from my parent
* is either red or has a red parent or red children.
*/
if (!RB_ROOT_P(rbt, self)
&& RB_CHILDLESS_P(self)
&& RB_BLACK_P(RB_FATHER(self))) {
const unsigned int which = RB_POSITION(self);
const unsigned int other = which ^ RB_DIR_OTHER;
const struct rb_node *relative0, *relative;
relative0 = rb_tree_iterate_const(rbt,
self, other);
KASSERT(relative0 != NULL);
relative = rb_tree_iterate_const(rbt,
relative0, other);
KASSERT(relative != NULL);
KASSERT(RB_SENTINEL_P(relative->rb_nodes[which]));
#if 0
KASSERT(RB_RED_P(relative)
|| RB_RED_P(relative->rb_left)
|| RB_RED_P(relative->rb_right)
|| RB_RED_P(RB_FATHER(relative)));
#endif
}
}
/*
* A grandparent's children must be real nodes and not
* sentinels. First check our grandparent.
*/
KASSERT(RB_ROOT_P(rbt, self)
|| RB_ROOT_P(rbt, RB_FATHER(self))
|| RB_TWOCHILDREN_P(RB_FATHER(RB_FATHER(self))));
/*
* If we have grandchildren on our left, then
* we must have a child on our right.
*/
KASSERT(RB_LEFT_SENTINEL_P(self)
|| RB_CHILDLESS_P(self->rb_left)
|| !RB_RIGHT_SENTINEL_P(self));
/*
* If we have grandchildren on our right, then
* we must have a child on our left.
*/
KASSERT(RB_RIGHT_SENTINEL_P(self)
|| RB_CHILDLESS_P(self->rb_right)
|| !RB_LEFT_SENTINEL_P(self));
/*
* If we have a child on the left and it doesn't have two
* children make sure we don't have great-great-grandchildren on
* the right.
*/
KASSERT(RB_TWOCHILDREN_P(self->rb_left)
|| RB_CHILDLESS_P(self->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_right->rb_right));
/*
* If we have a child on the right and it doesn't have two
* children make sure we don't have great-great-grandchildren on
* the left.
*/
KASSERT(RB_TWOCHILDREN_P(self->rb_right)
|| RB_CHILDLESS_P(self->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_left->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_right->rb_right));
/*
* If we are a fully interior node, then our predecessors and
* successors must have no children in our direction.
*/
if (RB_TWOCHILDREN_P(self)) {
const struct rb_node *prev0;
const struct rb_node *next0;
prev0 = rb_tree_iterate_const(rbt, self, RB_DIR_LEFT);
KASSERT(prev0 != NULL);
KASSERT(RB_RIGHT_SENTINEL_P(prev0));
next0 = rb_tree_iterate_const(rbt, self, RB_DIR_RIGHT);
KASSERT(next0 != NULL);
KASSERT(RB_LEFT_SENTINEL_P(next0));
}
}
return true;
}
void
rb_tree_check(const struct rb_tree *rbt, bool red_check)
{
const struct rb_node *self;
const struct rb_node *prev;
#ifdef RBSTATS
unsigned int count = 0;
#endif
KASSERT(rbt->rbt_root != NULL);
KASSERT(RB_LEFT_P(rbt->rbt_root));
#if defined(RBSTATS) && !defined(RBSMALL)
KASSERT(rbt->rbt_count > 1
|| rbt->rbt_minmax[RB_DIR_LEFT] == rbt->rbt_minmax[RB_DIR_RIGHT]);
#endif
prev = NULL;
TAILQ_FOREACH(self, &rbt->rbt_nodes, rb_link) {
rb_tree_check_node(rbt, self, prev, false);
#ifdef RBSTATS
count++;
#endif
}
#ifdef RBSTATS
KASSERT(rbt->rbt_count == count);
#endif
if (red_check) {
KASSERT(RB_BLACK_P(rbt->rbt_root));
KASSERT(RB_SENTINEL_P(rbt->rbt_root)
|| rb_tree_count_black(rbt->rbt_root));
/*
* The root must be black.
* There can never be two adjacent red nodes.
*/
TAILQ_FOREACH(self, &rbt->rbt_nodes, rb_link) {
rb_tree_check_node(rbt, self, NULL, true);
}
}
}
#endif /* RBDEBUG */
#ifdef RBSTATS
static void
rb_tree_mark_depth(const struct rb_tree *rbt, const struct rb_node *self,
size_t *depths, size_t depth)
{
if (RB_SENTINEL_P(self))
return;
if (RB_TWOCHILDREN_P(self)) {
rb_tree_mark_depth(rbt, self->rb_left, depths, depth + 1);
rb_tree_mark_depth(rbt, self->rb_right, depths, depth + 1);
return;
}
depths[depth]++;
if (!RB_LEFT_SENTINEL_P(self)) {
rb_tree_mark_depth(rbt, self->rb_left, depths, depth + 1);
}
if (!RB_RIGHT_SENTINEL_P(self)) {
rb_tree_mark_depth(rbt, self->rb_right, depths, depth + 1);
}
}
void
rb_tree_depths(const struct rb_tree *rbt, size_t *depths)
{
rb_tree_mark_depth(rbt, rbt->rbt_root, depths, 1);
}
#endif /* RBSTATS */
/* $NetBSD: bufq_fcfs.c,v 1.13 2017/05/04 11:03:27 kamil Exp $ */
/* NetBSD: subr_disk.c,v 1.61 2004/09/25 03:30:44 thorpej Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_fcfs.c,v 1.13 2017/05/04 11:03:27 kamil Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/module.h>
/*
* First-come first-served sort for disks.
*
* Requests are appended to the queue without any reordering.
*/
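/*
 * Illustrative only: a minimal sketch, kept out of the build with #if 0,
 * of how a disk driver might select and drive this strategy through the
 * bufq(9) interface.  The softc member "sc_bufq" and the example_*()
 * helpers are hypothetical; only bufq_alloc()/bufq_put()/bufq_get() are
 * assumed from the real interface.
 */
#if 0
static struct bufq_state *sc_bufq;

static void
example_attach(void)
{
	/* Select the "fcfs" strategy by name. */
	(void)bufq_alloc(&sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
}

static void
example_strategy(struct buf *bp)
{
	bufq_put(sc_bufq, bp);		/* append; FCFS never reorders */
}

static void
example_drain(void)
{
	struct buf *bp;

	while ((bp = bufq_get(sc_bufq)) != NULL)
		biodone(bp);		/* complete in arrival order */
}
#endif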
struct bufq_fcfs {
TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */
};
static void bufq_fcfs_init(struct bufq_state *);
static void bufq_fcfs_put(struct bufq_state *, struct buf *);
static struct buf *bufq_fcfs_get(struct bufq_state *, int);
BUFQ_DEFINE(fcfs, 10, bufq_fcfs_init);
static void
bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq);
}
static struct buf *
bufq_fcfs_get(struct bufq_state *bufq, int remove)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
struct buf *bp;
bp = TAILQ_FIRST(&fcfs->bq_head);
if (bp != NULL && remove)
TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);
return (bp);
}
static struct buf *
bufq_fcfs_cancel(struct bufq_state *bufq, struct buf *buf)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
struct buf *bp;
TAILQ_FOREACH(bp, &fcfs->bq_head, b_actq) {
if (bp == buf) {
TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);
return buf;
}
}
return NULL;
}
static void
bufq_fcfs_fini(struct bufq_state *bufq)
{
KASSERT(bufq->bq_private != NULL);
kmem_free(bufq->bq_private, sizeof(struct bufq_fcfs));
}
static void
bufq_fcfs_init(struct bufq_state *bufq)
{
struct bufq_fcfs *fcfs;
bufq->bq_get = bufq_fcfs_get;
bufq->bq_put = bufq_fcfs_put;
bufq->bq_cancel = bufq_fcfs_cancel;
bufq->bq_fini = bufq_fcfs_fini;
bufq->bq_private = kmem_zalloc(sizeof(struct bufq_fcfs), KM_SLEEP);
fcfs = (struct bufq_fcfs *)bufq->bq_private;
TAILQ_INIT(&fcfs->bq_head);
}
MODULE(MODULE_CLASS_BUFQ, bufq_fcfs, NULL);
static int
bufq_fcfs_modcmd(modcmd_t cmd, void *opaque)
{
switch (cmd) {
case MODULE_CMD_INIT:
return bufq_register(&bufq_strat_fcfs);
case MODULE_CMD_FINI:
return bufq_unregister(&bufq_strat_fcfs);
default:
return ENOTTY;
}
}
/* $NetBSD: ip6_input.c,v 1.227 2022/10/28 05:18:39 ozaki-r Exp $ */
/* $KAME: ip6_input.c,v 1.188 2001/03/29 05:34:31 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_input.c,v 1.227 2022/10/28 05:18:39 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_gateway.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/cprng.h>
#include <sys/percpu.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/pktqueue.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#ifdef INET
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#endif /* INET */
#include <netinet/ip6.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet/icmp6.h>
#include <netinet6/scope6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet6/ip6protosw.h>
#include "faith.h"
extern struct domain inet6domain;
u_char ip6_protox[IPPROTO_MAX];
pktqueue_t *ip6_pktq __read_mostly;
pfil_head_t *inet6_pfil_hook;
percpu_t *ip6stat_percpu;
percpu_t *ip6_forward_rt_percpu __cacheline_aligned;
static void ip6intr(void *);
static void ip6_input(struct mbuf *, struct ifnet *);
static bool ip6_badaddr(struct ip6_hdr *);
static struct m_tag *ip6_setdstifaddr(struct mbuf *, const struct in6_ifaddr *);
static struct m_tag *ip6_addaux(struct mbuf *);
static struct m_tag *ip6_findaux(struct mbuf *);
static void ip6_delaux(struct mbuf *);
static int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *,
u_int32_t *);
static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
static void sysctl_net_inet6_ip6_setup(struct sysctllog **);
#ifdef NET_MPSAFE
#define SOFTNET_LOCK() mutex_enter(softnet_lock)
#define SOFTNET_UNLOCK() mutex_exit(softnet_lock)
#else
#define SOFTNET_LOCK() KASSERT(mutex_owned(softnet_lock))
#define SOFTNET_UNLOCK() KASSERT(mutex_owned(softnet_lock))
#endif
/* Ensure that non packed structures are the desired size. */
__CTASSERT(sizeof(struct ip6_hdr) == 40);
__CTASSERT(sizeof(struct ip6_ext) == 2);
__CTASSERT(sizeof(struct ip6_hbh) == 2);
__CTASSERT(sizeof(struct ip6_dest) == 2);
__CTASSERT(sizeof(struct ip6_opt) == 2);
__CTASSERT(sizeof(struct ip6_opt_jumbo) == 6);
__CTASSERT(sizeof(struct ip6_opt_nsap) == 4);
__CTASSERT(sizeof(struct ip6_opt_tunnel) == 3);
__CTASSERT(sizeof(struct ip6_opt_router) == 4);
__CTASSERT(sizeof(struct ip6_rthdr) == 4);
__CTASSERT(sizeof(struct ip6_rthdr0) == 8);
__CTASSERT(sizeof(struct ip6_frag) == 8);
/*
* IP6 initialization: fill in IP6 protocol switch table.
* All protocols not implemented in kernel go to raw IP6 protocol handler.
*/
void
ip6_init(void)
{
const struct ip6protosw *pr;
int i;
in6_init();
ip6_pktq = pktq_create(IFQ_MAXLEN, ip6intr, NULL);
KASSERT(ip6_pktq != NULL);
sysctl_net_inet6_ip6_setup(NULL);
pr = (const struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
if (pr == 0)
panic("ip6_init");
for (i = 0; i < IPPROTO_MAX; i++)
ip6_protox[i] = pr - inet6sw;
for (pr = (const struct ip6protosw *)inet6domain.dom_protosw;
pr < (const struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++)
if (pr->pr_domain->dom_family == PF_INET6 &&
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
ip6_protox[pr->pr_protocol] = pr - inet6sw;
scope6_init();
addrsel_policy_init();
nd6_init();
frag6_init();
#ifdef GATEWAY
ip6flow_init(ip6_hashsize);
#endif
/* Register our Packet Filter hook. */
inet6_pfil_hook = pfil_head_create(PFIL_TYPE_AF, (void *)AF_INET6);
KASSERT(inet6_pfil_hook != NULL);
ip6stat_percpu = percpu_alloc(sizeof(uint64_t) * IP6_NSTATS);
ip6_forward_rt_percpu = rtcache_percpu_alloc();
}
/*
* IP6 input interrupt handling. Just pass the packet to ip6_input.
*/
static void
ip6intr(void *arg __unused)
{
struct mbuf *m;
SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
while ((m = pktq_dequeue(ip6_pktq)) != NULL) {
struct psref psref;
struct ifnet *rcvif = m_get_rcvif_psref(m, &psref);
if (rcvif == NULL) {
IP6_STATINC(IP6_STAT_IFDROP);
m_freem(m);
continue;
}
/*
* Drop the packet if IPv6 is disabled on the interface.
*/
if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED)) {
m_put_rcvif_psref(rcvif, &psref);
IP6_STATINC(IP6_STAT_IFDROP);
m_freem(m);
continue;
}
ip6_input(m, rcvif);
m_put_rcvif_psref(rcvif, &psref);
}
SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
static void
ip6_input(struct mbuf *m, struct ifnet *rcvif)
{
struct ip6_hdr *ip6;
int hit, off = sizeof(struct ip6_hdr), nest;
u_int32_t plen;
u_int32_t rtalert = ~0;
int nxt, ours = 0, rh_present = 0, frg_present;
struct ifnet *deliverifp = NULL;
int srcrt = 0;
struct rtentry *rt = NULL;
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
struct route *ro;
KASSERT(rcvif != NULL);
/*
* make sure we don't have onion peering information in m_tag.
*/
ip6_delaux(m);
/*
* mbuf statistics
*/
if (m->m_flags & M_EXT) {
if (m->m_next)
IP6_STATINC(IP6_STAT_MEXT2M);
else
IP6_STATINC(IP6_STAT_MEXT1);
} else {
#define M2MMAX 32
if (m->m_next) {
if (m->m_flags & M_LOOP)
/*XXX*/ IP6_STATINC(IP6_STAT_M2M + lo0ifp->if_index);
else if (rcvif->if_index < M2MMAX)
IP6_STATINC(IP6_STAT_M2M + rcvif->if_index);
else
IP6_STATINC(IP6_STAT_M2M);
} else
IP6_STATINC(IP6_STAT_M1);
#undef M2MMAX
}
in6_ifstat_inc(rcvif, ifs6_in_receive);
IP6_STATINC(IP6_STAT_TOTAL);
/*
* If the IPv6 header is not aligned, slurp it up into a new
* mbuf with space for link headers, in the event we forward
* it. Otherwise, if it is aligned, make sure the entire base
* IPv6 header is in the first mbuf of the chain.
*/
if (M_GET_ALIGNED_HDR(&m, struct ip6_hdr, true) != 0) {
/* XXXJRT new stat, please */
IP6_STATINC(IP6_STAT_TOOSMALL);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
return;
}
ip6 = mtod(m, struct ip6_hdr *);
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
IP6_STATINC(IP6_STAT_BADVERS);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
goto bad;
}
if (ip6_badaddr(ip6)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(rcvif, ifs6_in_addrerr);
goto bad;
}
/*
* Assume that we can create a fast-forward IP flow entry
* based on this packet.
*/
m->m_flags |= M_CANFASTFWD;
/*
* Run through list of hooks for input packets. If there are any
* filters which require that additional packets in the flow are
* not fast-forwarded, they must clear the M_CANFASTFWD flag.
* Note that filters must _never_ set this flag, as another filter
* in the list may have previously cleared it.
*
* Don't call hooks if the packet has already been processed by
* IPsec (encapsulated, tunnel mode).
*/
#if defined(IPSEC)
if (!ipsec_used || !ipsec_skip_pfil(m))
#else
if (1)
#endif
{
struct in6_addr odst;
int error;
odst = ip6->ip6_dst;
error = pfil_run_hooks(inet6_pfil_hook, &m, rcvif, PFIL_IN);
if (error != 0 || m == NULL) {
IP6_STATINC(IP6_STAT_PFILDROP_IN);
return;
}
if (m->m_len < sizeof(struct ip6_hdr)) {
if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
IP6_STATINC(IP6_STAT_TOOSMALL);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
return;
}
}
ip6 = mtod(m, struct ip6_hdr *);
srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
}
IP6_STATINC(IP6_STAT_NXTHIST + ip6->ip6_nxt);
#ifdef ALTQ
if (altq_input != NULL) {
SOFTNET_LOCK();
if ((*altq_input)(m, AF_INET6) == 0) {
SOFTNET_UNLOCK();
/* packet is dropped by traffic conditioner */
return;
}
SOFTNET_UNLOCK();
}
#endif
/*
* Disambiguate address scope zones (if there is ambiguity).
* We first make sure that the original source or destination address
* is not in our internal form for scoped addresses. Such addresses
* are not necessarily invalid spec-wise, but we cannot accept them due
* to the usage conflict.
* in6_setscope() then also checks and rejects the cases where src or
* dst are the loopback address and the receiving interface
* is not loopback.
*/
if (__predict_false(
m_makewritable(&m, 0, sizeof(struct ip6_hdr), M_DONTWAIT))) {
IP6_STATINC(IP6_STAT_IDROPPED);
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) {
IP6_STATINC(IP6_STAT_BADSCOPE); /* XXX */
goto bad;
}
if (in6_setscope(&ip6->ip6_src, rcvif, NULL) ||
in6_setscope(&ip6->ip6_dst, rcvif, NULL)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
ro = rtcache_percpu_getref(ip6_forward_rt_percpu);
/*
* Multicast check
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
bool ingroup;
in6_ifstat_inc(rcvif, ifs6_in_mcast);
/*
* See if we belong to the destination multicast group on the
* arrival interface.
*/
ingroup = in6_multi_group(&ip6->ip6_dst, rcvif);
if (ingroup) {
ours = 1;
} else if (!ip6_mrouter) {
uint64_t *ip6s = IP6_STAT_GETREF();
ip6s[IP6_STAT_NOTMEMBER]++;
ip6s[IP6_STAT_CANTFORWARD]++;
IP6_STAT_PUTREF();
in6_ifstat_inc(rcvif, ifs6_in_discard);
goto bad_unref;
}
deliverifp = rcvif;
goto hbhcheck;
}
sockaddr_in6_init(&u.dst6, &ip6->ip6_dst, 0, 0, 0);
/*
* Unicast check
*/
rt = rtcache_lookup2(ro, &u.dst, 1, &hit);
if (hit)
IP6_STATINC(IP6_STAT_FORWARD_CACHEHIT);
else
IP6_STATINC(IP6_STAT_FORWARD_CACHEMISS);
/*
* Accept the packet if the forwarding interface to the destination
* (according to the routing table) is the loopback interface,
* unless the associated route has a gateway.
*
* We don't explicitly match ip6_dst against an interface here. It
* is already done in rtcache_lookup2: rt->rt_ifp->if_type will be
* IFT_LOOP if the packet is for us.
*
* Note that this approach causes us to accept a packet if there is a
* route to the loopback interface for the destination of the packet.
* But we think it's even useful in some situations, e.g. when using
* a special daemon which wants to intercept the packet.
*/
if (rt != NULL &&
(rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST &&
rt->rt_ifp->if_type == IFT_LOOP) {
struct in6_ifaddr *ia6 = (struct in6_ifaddr *)rt->rt_ifa;
int addrok;
if (ia6->ia6_flags & IN6_IFF_ANYCAST)
m->m_flags |= M_ANYCAST6;
/*
* packets to a tentative, duplicated, or somehow invalid
* address must not be accepted.
*/
if (ia6->ia6_flags & IN6_IFF_NOTREADY)
addrok = 0;
else if (ia6->ia6_flags & IN6_IFF_DETACHED &&
!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src))
{
/* Allow internal traffic to DETACHED addresses */
struct sockaddr_in6 sin6;
int s;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = ip6->ip6_src;
s = pserialize_read_enter();
addrok = (ifa_ifwithaddr(sin6tosa(&sin6)) != NULL);
pserialize_read_exit(s);
} else
addrok = 1;
if (addrok) {
/* this address is ready */
ours = 1;
deliverifp = ia6->ia_ifp; /* correct? */
goto hbhcheck;
} else {
/* address is not ready, so discard the packet. */
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
nd6log(LOG_INFO, "packet to an unready address %s->%s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
IP6_STATINC(IP6_STAT_IDROPPED);
goto bad_unref;
}
}
/*
* FAITH (Firewall Aided Internet Translator)
*/
#if defined(NFAITH) && 0 < NFAITH
if (ip6_keepfaith) {
if (rt != NULL && rt->rt_ifp != NULL &&
rt->rt_ifp->if_type == IFT_FAITH) {
/* XXX do we need more sanity checks? */
ours = 1;
deliverifp = rt->rt_ifp; /* faith */
goto hbhcheck;
}
}
#endif
/*
* Now there is no reason to process the packet if it's not our own
* and we're not a router.
*/
if (!ip6_forwarding) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
in6_ifstat_inc(rcvif, ifs6_in_discard);
goto bad_unref;
}
hbhcheck:
/*
* Record address information into m_tag, if we don't have one yet.
* Note that we are unable to record it, if the address is not listed
* as our interface address (e.g. multicast addresses, addresses
* within FAITH prefixes and such).
*/
if (deliverifp && ip6_getdstifaddr(m) == NULL) {
struct in6_ifaddr *ia6;
int s = pserialize_read_enter();
ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst);
/* Depends on ip6_setdstifaddr never sleeping */
if (ia6 != NULL && ip6_setdstifaddr(m, ia6) == NULL) {
/*
* XXX maybe we should drop the packet here,
* as we could not provide enough information
* to the upper layers.
*/
}
pserialize_read_exit(s);
}
/*
* Process Hop-by-Hop options header if it's contained.
* m may be modified in ip6_hopopts_input().
* If a JumboPayload option is included, plen will also be modified.
*/
plen = (u_int32_t)ntohs(ip6->ip6_plen);
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) {
/* m already freed */
in6_ifstat_inc(rcvif, ifs6_in_discard);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/*
* if the payload length field is 0 and the next header field
* indicates Hop-by-Hop Options header, then a Jumbo Payload
* option MUST be included.
*/
if (ip6->ip6_plen == 0 && plen == 0) {
/*
* Note that if a valid jumbo payload option is
* contained, ip6_hopopts_input() must set a valid
* (non-zero) payload length to the variable plen.
*/
IP6_STATINC(IP6_STAT_BADOPTIONS);
in6_ifstat_inc(rcvif, ifs6_in_discard);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
(char *)&ip6->ip6_plen - (char *)ip6);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
sizeof(struct ip6_hbh));
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
KASSERT(ACCESSIBLE_POINTER(hbh, struct ip6_hdr));
nxt = hbh->ip6h_nxt;
/*
* accept the packet if a router alert option is included
* and we act as an IPv6 router.
*/
if (rtalert != ~0 && ip6_forwarding)
ours = 1;
} else
nxt = ip6->ip6_nxt;
/*
* Check that the amount of data in the buffers is at least as much as
* the IPv6 header would have us expect. Trim mbufs if longer than we
* expect. Drop packet if shorter than we expect.
*/
if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
IP6_STATINC(IP6_STAT_TOOSHORT);
in6_ifstat_inc(rcvif, ifs6_in_truncated);
goto bad_unref;
}
if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
if (m->m_len == m->m_pkthdr.len) {
m->m_len = sizeof(struct ip6_hdr) + plen;
m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
} else
m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len);
}
/*
* Forward if desirable.
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/*
* If we are acting as a multicast router, all
* incoming multicast packets are passed to the
* kernel-level multicast forwarding function.
* The packet is returned (relatively) intact; if
* ip6_mforward() returns a non-zero value, the packet
* must be discarded, else it may be accepted below.
*/
if (ip6_mrouter != NULL) {
int error;
SOFTNET_LOCK();
error = ip6_mforward(ip6, rcvif, m);
SOFTNET_UNLOCK();
if (error != 0) {
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
IP6_STATINC(IP6_STAT_CANTFORWARD);
goto bad;
}
}
if (!ours) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
goto bad_unref;
}
} else if (!ours) {
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
ip6_forward(m, srcrt, rcvif);
return;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Malicious party may be able to use IPv4 mapped addr to confuse
* tcp/udp stack and bypass security checks (act as if it was from
* 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* For SIIT end node behavior, you may want to disable the check.
* However, you will become vulnerable to attacks using IPv4 mapped
* source.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(rcvif, ifs6_in_addrerr);
goto bad_unref;
}
#ifdef IFA_STATS
if (deliverifp != NULL) {
struct in6_ifaddr *ia6;
int s = pserialize_read_enter();
ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst);
if (ia6)
ia6->ia_ifa.ifa_data.ifad_inbytes += m->m_pkthdr.len;
pserialize_read_exit(s);
}
#endif
IP6_STATINC(IP6_STAT_DELIVERED);
in6_ifstat_inc(deliverifp, ifs6_in_deliver);
nest = 0;
if (rt != NULL) {
rtcache_unref(rt, ro);
rt = NULL;
}
rtcache_percpu_putref(ip6_forward_rt_percpu);
rh_present = 0;
frg_present = 0;
while (nxt != IPPROTO_DONE) {
if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) {
IP6_STATINC(IP6_STAT_TOOMANYHDR);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
goto bad;
}
M_VERIFY_PACKET(m);
/*
* protection against faulty packet - there should be
* more sanity checks in header chain processing.
*/
if (m->m_pkthdr.len < off) {
IP6_STATINC(IP6_STAT_TOOSHORT);
in6_ifstat_inc(rcvif, ifs6_in_truncated);
goto bad;
}
if (nxt == IPPROTO_ROUTING) {
if (rh_present++) {
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
IP6_STATINC(IP6_STAT_BADOPTIONS);
goto bad;
}
} else if (nxt == IPPROTO_FRAGMENT) {
if (frg_present++) {
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
IP6_STATINC(IP6_STAT_BADOPTIONS);
goto bad;
}
}
#ifdef IPSEC
if (ipsec_used) {
/*
* Enforce IPsec policy checking if we are seeing last
* header. Note that we do not visit this with
* protocols with pcb layer code - like udp/tcp/raw ip.
*/
if ((inet6sw[ip6_protox[nxt]].pr_flags
& PR_LASTHDR) != 0) {
int error;
error = ipsec_ip_input_checkpolicy(m, false);
if (error) {
IP6_STATINC(IP6_STAT_IPSECDROP_IN);
goto bad;
}
}
}
#endif
nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
}
return;
bad_unref:
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
bad:
m_freem(m);
return;
}
static bool
ip6_badaddr(struct ip6_hdr *ip6)
{
/* Check against address spoofing/corruption. */
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
return true;
}
/*
* The following check is not documented in specs. A malicious
* party may be able to use IPv4 mapped addr to confuse tcp/udp stack
* and bypass security checks (act as if it was from 127.0.0.1 by using
* IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* This check chokes if we are in an SIIT cloud. As none of the BSDs
* support IPv4-less kernel compilation, we cannot support SIIT
* environment at all. So, it makes more sense for us to reject any
* malicious packets for non-SIIT environment, than try to do a
* partial support for SIIT environment.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
return true;
}
/*
* Reject packets with IPv4-compatible IPv6 addresses (RFC4291).
*/
if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
return true;
}
return false;
}
/*
* set/grab in6_ifaddr correspond to IPv6 destination address.
*/
static struct m_tag *
ip6_setdstifaddr(struct mbuf *m, const struct in6_ifaddr *ia)
{
struct m_tag *mtag;
struct ip6aux *ip6a;
mtag = ip6_addaux(m);
if (mtag == NULL)
return NULL;
ip6a = (struct ip6aux *)(mtag + 1);
if (in6_setscope(&ip6a->ip6a_src, ia->ia_ifp, &ip6a->ip6a_scope_id)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
return NULL;
}
ip6a->ip6a_src = ia->ia_addr.sin6_addr;
ip6a->ip6a_flags = ia->ia6_flags;
return mtag;
}
const struct ip6aux *
ip6_getdstifaddr(struct mbuf *m)
{
struct m_tag *mtag;
mtag = ip6_findaux(m);
if (mtag != NULL)
return (struct ip6aux *)(mtag + 1);
else
return NULL;
}
/*
* Hop-by-Hop options header processing. If a valid jumbo payload option is
* included, the real payload length will be stored in plenp.
*
* rtalertp - XXX: should be stored in a smarter way
*/
int
ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
struct mbuf **mp, int *offp)
{
struct mbuf *m = *mp;
int off = *offp, hbhlen;
struct ip6_hbh *hbh;
/* validation of the length of the header */
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m,
sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return -1;
}
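/*
 * ip6h_len counts the header length in 8-octet units, not including
 * the first 8 octets, so the full Hop-by-Hop header occupies
 * (ip6h_len + 1) * 8 bytes.
 */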
hbhlen = (hbh->ip6h_len + 1) << 3;
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
hbhlen);
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return -1;
}
KASSERT(ACCESSIBLE_POINTER(hbh, struct ip6_hdr));
off += hbhlen;
hbhlen -= sizeof(struct ip6_hbh);
if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh),
hbhlen, rtalertp, plenp) < 0)
return -1;
*offp = off;
*mp = m;
return 0;
}
/*
* Search header for all Hop-by-hop options and process each option.
* This function is separate from ip6_hopopts_input() in order to
* handle a case where the sending node itself processes its hop-by-hop
* options header. In such a case, the function is called from ip6_output().
*
* The function assumes that the hbh header is located right after the IPv6
* header (RFC2460 p7), that opthead is a pointer into the data content of m,
* and that the region from opthead to opthead + hbhlen is in contiguous memory.
*/
static int
ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen,
u_int32_t *rtalertp, u_int32_t *plenp)
{
struct ip6_hdr *ip6;
int optlen = 0;
u_int8_t *opt = opthead;
u_int16_t rtalert_val;
u_int32_t jumboplen;
const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
switch (*opt) {
case IP6OPT_PAD1:
optlen = 1;
break;
case IP6OPT_PADN:
if (hbhlen < IP6OPT_MINLEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
optlen = *(opt + 1) + 2;
break;
case IP6OPT_RTALERT:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_RTALERT_LEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_RTALERT_LEN;
memcpy((void *)&rtalert_val, (void *)(opt + 2), 2);
*rtalertp = ntohs(rtalert_val);
break;
case IP6OPT_JUMBO:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_JUMBO_LEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_JUMBO_LEN;
/*
* IPv6 packets that have non 0 payload length
* must not contain a jumbo payload option.
*/
ip6 = mtod(m, struct ip6_hdr *);
if (ip6->ip6_plen) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt - opthead);
return (-1);
}
/*
* We may see jumbolen in unaligned location, so
* we'd need to perform memcpy().
*/
memcpy(&jumboplen, opt + 2, sizeof(jumboplen));
jumboplen = (u_int32_t)htonl(jumboplen);
#if 1
/*
* if there are multiple jumbo payload options,
* *plenp will be non-zero and the packet will be
* rejected.
* The behavior may need some debate in ipngwg -
* multiple options do not make sense; however,
* there's no explicit mention in the specification.
*/
if (*plenp != 0) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
#endif
/*
* jumbo payload length must be larger than 65535.
*/
if (jumboplen <= IPV6_MAXPACKET) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
*plenp = jumboplen;
break;
default: /* unknown option */
if (hbhlen < IP6OPT_MINLEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
optlen = ip6_unknown_opt(opt, m,
erroff + opt - opthead);
if (optlen == -1)
return (-1);
optlen += 2;
break;
}
}
return (0);
bad:
m_freem(m);
return (-1);
}
/*
* Unknown option processing.
* The third argument `off' is the offset from the IPv6 header to the option,
* which is necessary to return an ICMPv6 error when the IPv6 header and
* the option header are not contiguous.
*/
int
ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
{
struct ip6_hdr *ip6;
switch (IP6OPT_TYPE(*optp)) {
case IP6OPT_TYPE_SKIP: /* ignore the option */
return ((int)*(optp + 1));
case IP6OPT_TYPE_DISCARD: /* silently discard */
m_freem(m);
return (-1);
case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
return (-1);
case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
IP6_STATINC(IP6_STAT_BADOPTIONS);
ip6 = mtod(m, struct ip6_hdr *);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
(m->m_flags & (M_BCAST|M_MCAST)))
m_freem(m);
else
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_OPTION, off);
return (-1);
}
m_freem(m); /* XXX: NOTREACHED */
return (-1);
}
void
ip6_savecontrol(struct inpcb *inp, struct mbuf **mp,
struct ip6_hdr *ip6, struct mbuf *m)
{
struct socket *so = inp->inp_socket;
#ifdef RFC2292
#define IS2292(x, y) ((inp->inp_flags & IN6P_RFC2292) ? (x) : (y))
#else
#define IS2292(x, y) (y)
#endif
KASSERT(m->m_flags & M_PKTHDR);
if (SOOPT_TIMESTAMP(so->so_options))
mp = sbsavetimestamp(so->so_options, mp);
/* some OSes call this logic with an IPv4 packet, for SO_TIMESTAMP */
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
return;
/* RFC 2292 sec. 5 */
if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
struct in6_pktinfo pi6;
memcpy(&pi6.ipi6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
in6_clearscope(&pi6.ipi6_addr); /* XXX */
pi6.ipi6_ifindex = m->m_pkthdr.rcvif_index;
*mp = sbcreatecontrol(&pi6, sizeof(pi6),
IS2292(IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_flags & IN6P_HOPLIMIT) {
int hlim = ip6->ip6_hlim & 0xff;
*mp = sbcreatecontrol(&hlim, sizeof(hlim),
IS2292(IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if ((inp->inp_flags & IN6P_TCLASS) != 0) {
u_int32_t flowinfo;
int tclass;
flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
flowinfo >>= 20;
tclass = flowinfo & 0xff;
*mp = sbcreatecontrol(&tclass, sizeof(tclass),
IPV6_TCLASS, IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
/*
* IPV6_HOPOPTS socket option. Recall that we required super-user
* privilege for the option (see ip6_ctloutput), but it might be too
* strict, since there might be some hop-by-hop options which could
* safely be returned to a normal user.
* See also RFC3542 section 8 (or RFC2292 section 6).
*/
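/*
* A minimal userland sketch of the RFC3542 consumer side (hedged;
* error handling omitted): the application enables delivery with
*
* int on = 1;
* setsockopt(s, IPPROTO_IPV6, IPV6_RECVHOPOPTS, &on, sizeof(on));
*
* and the hop-by-hop header stored below is then returned by recvmsg()
* as a control message with cmsg_level IPPROTO_IPV6 and cmsg_type
* IPV6_HOPOPTS.
*/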
if ((inp->inp_flags & IN6P_HOPOPTS) != 0) {
/*
* Check if a hop-by-hop options header is contained in the
* received packet, and if so, store the options as ancillary
* data. Note that a hop-by-hop options header must be
* just after the IPv6 header, which is ensured by the
* IPv6 input processing.
*/
struct ip6_hdr *xip6 = mtod(m, struct ip6_hdr *);
if (xip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
int hbhlen;
struct mbuf *ext;
ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
xip6->ip6_nxt);
if (ext == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
hbh = mtod(ext, struct ip6_hbh *);
hbhlen = (hbh->ip6h_len + 1) << 3;
if (hbhlen != ext->m_len) {
m_freem(ext);
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
/*
* XXX: We copy the whole header even if a jumbo
* payload option is included, although RFC 2292 says
* that option should be removed before returning.
* Note: this constraint was removed in RFC3542.
*/
*mp = sbcreatecontrol(hbh, hbhlen,
IS2292(IPV6_2292HOPOPTS, IPV6_HOPOPTS),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
m_freem(ext);
}
}
/* IPV6_DSTOPTS and IPV6_RTHDR socket options */
if (inp->inp_flags & (IN6P_DSTOPTS | IN6P_RTHDR)) {
struct ip6_hdr *xip6 = mtod(m, struct ip6_hdr *);
int nxt = xip6->ip6_nxt, off = sizeof(struct ip6_hdr);
/*
* Search for destination options headers or routing
* header(s) through the header chain, and store each
* header as ancillary data.
* Note that the order of the headers is preserved in
* the chain of ancillary data.
*/
for (;;) { /* is explicit loop prevention necessary? */
struct ip6_ext *ip6e = NULL;
int elen;
struct mbuf *ext = NULL;
/*
* if it is not an extension header, don't try to
* pull it from the chain.
*/
switch (nxt) {
case IPPROTO_DSTOPTS:
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
goto loopend;
}
ext = ip6_pullexthdr(m, off, nxt);
if (ext == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
ip6e = mtod(ext, struct ip6_ext *);
if (nxt == IPPROTO_AH)
elen = (ip6e->ip6e_len + 2) << 2;
else
elen = (ip6e->ip6e_len + 1) << 3;
if (elen != ext->m_len) {
m_freem(ext);
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
KASSERT(ACCESSIBLE_POINTER(ip6e, struct ip6_hdr));
switch (nxt) {
case IPPROTO_DSTOPTS:
if (!(inp->inp_flags & IN6P_DSTOPTS))
break;
*mp = sbcreatecontrol(ip6e, elen,
IS2292(IPV6_2292DSTOPTS, IPV6_DSTOPTS),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_ROUTING:
if (!(inp->inp_flags & IN6P_RTHDR))
break;
*mp = sbcreatecontrol(ip6e, elen,
IS2292(IPV6_2292RTHDR, IPV6_RTHDR),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
/*
* Other cases have been filtered out above, so none
* should reach this point. The code is supplied
* just in case (e.g. nxt gets overwritten or some
* other unexpected condition).
*/
m_freem(ext);
goto loopend;
}
/* proceed with the next header. */
off += elen;
nxt = ip6e->ip6e_nxt;
ip6e = NULL;
m_freem(ext);
ext = NULL;
}
loopend:
;
}
}
#undef IS2292
void
ip6_notify_pmtu(struct inpcb *inp, const struct sockaddr_in6 *dst,
uint32_t *mtu)
{
struct socket *so;
struct mbuf *m_mtu;
struct ip6_mtuinfo mtuctl;
so = inp->inp_socket;
if (mtu == NULL)
return;
KASSERT(so != NULL);
memset(&mtuctl, 0, sizeof(mtuctl)); /* zero-clear for safety */
mtuctl.ip6m_mtu = *mtu;
mtuctl.ip6m_addr = *dst;
if (sa6_recoverscope(&mtuctl.ip6m_addr))
return;
if ((m_mtu = sbcreatecontrol(&mtuctl, sizeof(mtuctl),
IPV6_PATHMTU, IPPROTO_IPV6)) == NULL)
return;
if (sbappendaddr(&so->so_rcv, (const struct sockaddr *)dst, NULL, m_mtu)
== 0) {
soroverflow(so);
m_freem(m_mtu);
} else
sorwakeup(so);
return;
}
/*
* Pull a single extension header from the mbuf chain. Returns a single mbuf
* that contains the result, or NULL on error.
*/
static struct mbuf *
ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
{
struct ip6_ext ip6e;
size_t elen;
struct mbuf *n;
if (off + sizeof(ip6e) > m->m_pkthdr.len)
return NULL;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxt == IPPROTO_AH)
elen = (ip6e.ip6e_len + 2) << 2;
else
elen = (ip6e.ip6e_len + 1) << 3;
if (off + elen > m->m_pkthdr.len)
return NULL;
MGET(n, M_DONTWAIT, MT_DATA);
if (n && elen >= MLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (!n)
return NULL;
n->m_len = 0;
if (elen >= M_TRAILINGSPACE(n)) {
m_free(n);
return NULL;
}
m_copydata(m, off, elen, mtod(n, void *));
n->m_len = elen;
return n;
}
/*
* Get the offset of the header that immediately precedes the header
* currently being processed.
*/
int
ip6_get_prevhdr(struct mbuf *m, int off)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
if (off == sizeof(struct ip6_hdr)) {
return offsetof(struct ip6_hdr, ip6_nxt);
} else if (off < sizeof(struct ip6_hdr)) {
panic("%s: off < sizeof(struct ip6_hdr)", __func__);
} else {
int len, nlen, nxt;
struct ip6_ext ip6e;
nxt = ip6->ip6_nxt;
len = sizeof(struct ip6_hdr);
nlen = 0;
while (len < off) {
m_copydata(m, len, sizeof(ip6e), &ip6e);
switch (nxt) {
case IPPROTO_FRAGMENT:
nlen = sizeof(struct ip6_frag);
break;
case IPPROTO_AH:
nlen = (ip6e.ip6e_len + 2) << 2;
break;
default:
nlen = (ip6e.ip6e_len + 1) << 3;
break;
}
len += nlen;
nxt = ip6e.ip6e_nxt;
}
return (len - nlen);
}
}
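/*
* Worked example (hedged): for a packet consisting of the IPv6 header,
* an 8-byte hop-by-hop header and a fragment header, calling this with
* off == sizeof(struct ip6_hdr) + 8 steps len from 40 to 48 and returns
* 40, i.e. the offset of the hop-by-hop header that immediately precedes
* the fragment header.
*/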
/*
* get next header offset. m will be retained.
*/
int
ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
struct ip6_hdr ip6;
struct ip6_ext ip6e;
struct ip6_frag fh;
/* just in case */
if (m == NULL)
panic("%s: m == NULL", __func__); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
return -1;
switch (proto) {
case IPPROTO_IPV6:
/* do not chase beyond intermediate IPv6 headers */
if (off != 0)
return -1;
if (m->m_pkthdr.len < off + sizeof(ip6))
return -1;
m_copydata(m, off, sizeof(ip6), (void *)&ip6);
if (nxtp) *nxtp = ip6.ip6_nxt;
off += sizeof(ip6);
return off;
case IPPROTO_FRAGMENT:
/*
* terminate parsing if it is not the first fragment;
* it does not make sense to parse through it.
*/
if (m->m_pkthdr.len < off + sizeof(fh))
return -1;
m_copydata(m, off, sizeof(fh), (void *)&fh);
if ((fh.ip6f_offlg & IP6F_OFF_MASK) != 0)
return -1;
if (nxtp) *nxtp = fh.ip6f_nxt;
off += sizeof(struct ip6_frag);
return off;
case IPPROTO_AH:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxtp) *nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 2) << 2;
if (m->m_pkthdr.len < off)
return -1;
return off;
case IPPROTO_HOPOPTS:
case IPPROTO_ROUTING:
case IPPROTO_DSTOPTS:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxtp) *nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 1) << 3;
if (m->m_pkthdr.len < off)
return -1;
return off;
case IPPROTO_NONE:
case IPPROTO_ESP:
case IPPROTO_IPCOMP:
/* give up */
return -1;
default:
return -1;
}
}
/*
* get offset for the last header in the chain. m will be kept untainted.
*/
int
ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
int newoff;
int nxt;
if (!nxtp) {
nxt = -1;
nxtp = &nxt;
}
for (;;) {
newoff = ip6_nexthdr(m, off, proto, nxtp);
if (newoff < 0)
return off;
else if (newoff < off)
return -1; /* invalid */
else if (newoff == off)
return newoff;
off = newoff;
proto = *nxtp;
}
}
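/*
* A minimal usage sketch (hedged, assuming a packet header mbuf that
* starts with the IPv6 header):
*
* int nxt;
* int off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
* if (off >= 0 && nxt == IPPROTO_TCP)
* ... the TCP header starts at offset off ...
*
* Since ip6_lasthdr() iterates ip6_nexthdr() until it stops making
* progress, off ends up at the first header that is not an extension
* header known to the parser, and nxt holds its protocol number.
*/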
static struct m_tag *
ip6_addaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
if (!mtag) {
mtag = m_tag_get(PACKET_TAG_INET6, sizeof(struct ip6aux),
M_NOWAIT);
if (mtag) {
m_tag_prepend(m, mtag);
memset(mtag + 1, 0, sizeof(struct ip6aux));
}
}
return mtag;
}
static struct m_tag *
ip6_findaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
return mtag;
}
static void
ip6_delaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
if (mtag)
m_tag_delete(m, mtag);
}
/*
* System control for IP6
*/
const u_char inet6ctlerrmap[PRC_NCMDS] = {
0, 0, 0, 0,
0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
EMSGSIZE, EHOSTUNREACH, 0, 0,
0, 0, 0, 0,
ENOPROTOOPT
};
extern int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS);
static int
sysctl_net_inet6_ip6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(ip6stat_percpu, IP6_NSTATS));
}
static void
sysctl_net_inet6_ip6_setup(struct sysctllog **clog)
{
const struct sysctlnode *ip6_node;
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6",
SYSCTL_DESCR("PF_INET6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, &ip6_node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ip6",
SYSCTL_DESCR("IPv6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "forwarding",
SYSCTL_DESCR("Enable forwarding of INET6 datagrams"),
NULL, 0, &ip6_forwarding, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_FORWARDING, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "redirect",
SYSCTL_DESCR("Enable sending of ICMPv6 redirect messages"),
NULL, 0, &ip6_sendredirects, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_SENDREDIRECTS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hlim",
SYSCTL_DESCR("Hop limit for an INET6 datagram"),
NULL, 0, &ip6_defhlim, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DEFHLIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfragpackets",
SYSCTL_DESCR("Maximum number of fragments to buffer "
"for reassembly"),
NULL, 0, &ip6_maxfragpackets, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_MAXFRAGPACKETS, CTL_EOL);
pktq_sysctl_setup(ip6_pktq, clog, ip6_node, IPV6CTL_IFQ);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "keepfaith",
SYSCTL_DESCR("Activate faith interface"),
NULL, 0, &ip6_keepfaith, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_KEEPFAITH, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "log_interval",
SYSCTL_DESCR("Minimum interval between logging "
"unroutable packets"),
NULL, 0, &ip6_log_interval, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOG_INTERVAL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hdrnestlimit",
SYSCTL_DESCR("Maximum number of nested IPv6 headers"),
NULL, 0, &ip6_hdrnestlimit, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_HDRNESTLIMIT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "dad_count",
SYSCTL_DESCR("Number of Duplicate Address Detection "
"probes to send"),
NULL, 0, &ip6_dad_count, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DAD_COUNT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "auto_flowlabel",
SYSCTL_DESCR("Assign random IPv6 flow labels"),
NULL, 0, &ip6_auto_flowlabel, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_AUTO_FLOWLABEL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "defmcasthlim",
SYSCTL_DESCR("Default multicast hop limit"),
NULL, 0, &ip6_defmcasthlim, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DEFMCASTHLIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "kame_version",
SYSCTL_DESCR("KAME Version"),
NULL, 0, __UNCONST(__KAME_VERSION), 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_KAME_VERSION, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "use_deprecated",
SYSCTL_DESCR("Allow use of deprecated addresses as "
"source addresses"),
NULL, 0, &ip6_use_deprecated, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_USE_DEPRECATED, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT
#ifndef INET6_BINDV6ONLY
|CTLFLAG_READWRITE,
#endif
CTLTYPE_INT, "v6only",
SYSCTL_DESCR("Disallow PF_INET6 sockets from connecting "
"to PF_INET sockets"),
NULL, 0, &ip6_v6only, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_V6ONLY, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "anonportmin",
SYSCTL_DESCR("Lowest ephemeral port number to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_anonportmin, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ANONPORTMIN, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "anonportmax",
SYSCTL_DESCR("Highest ephemeral port number to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_anonportmax, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ANONPORTMAX, CTL_EOL);
#ifndef IPNOPRIVPORTS
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "lowportmin",
SYSCTL_DESCR("Lowest privileged ephemeral port number "
"to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_lowportmin, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOWPORTMIN, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "lowportmax",
SYSCTL_DESCR("Highest privileged ephemeral port number "
"to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_lowportmax, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOWPORTMAX, CTL_EOL);
#endif /* IPNOPRIVPORTS */
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "auto_linklocal",
SYSCTL_DESCR("Default value of per-interface flag for "
"adding an IPv6 link-local address to "
"interfaces when attached"),
NULL, 0, &ip6_auto_linklocal, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_AUTO_LINKLOCAL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_STRUCT, "addctlpolicy",
SYSCTL_DESCR("Return the current address control"
" policy"),
sysctl_net_inet6_addrctlpolicy, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ADDRCTLPOLICY, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "prefer_tempaddr",
SYSCTL_DESCR("Prefer temporary address as source "
"address"),
NULL, 0, &ip6_prefer_tempaddr, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfrags",
SYSCTL_DESCR("Maximum fragments in reassembly queue"),
NULL, 0, &ip6_maxfrags, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_MAXFRAGS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("IPv6 statistics"),
sysctl_net_inet6_ip6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_STATS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "use_defaultzone",
SYSCTL_DESCR("Whether to use the default scope zones"),
NULL, 0, &ip6_use_defzone, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_USE_DEFAULTZONE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "mcast_pmtu",
SYSCTL_DESCR("Enable pMTU discovery for multicast packet"),
NULL, 0, &ip6_mcast_pmtu, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
/* anonportalgo RFC6056 subtree */
const struct sysctlnode *portalgo_node;
sysctl_createv(clog, 0, NULL, &portalgo_node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "anonportalgo",
SYSCTL_DESCR("Anonymous port algorithm selection (RFC 6056)"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "available",
SYSCTL_DESCR("available algorithms"),
sysctl_portalgo_available, 0, NULL, PORTALGO_MAXLEN,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRING, "selected",
SYSCTL_DESCR("selected algorithm"),
sysctl_portalgo_selected6, 0, NULL, PORTALGO_MAXLEN,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "reserve",
SYSCTL_DESCR("bitmap of reserved ports"),
sysctl_portalgo_reserve6, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "neighborgcthresh",
SYSCTL_DESCR("Maximum number of entries in neighbor"
" cache"),
NULL, 1, &ip6_neighborgcthresh, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxdynroutes",
SYSCTL_DESCR("Maximum number of routes created via"
" redirect"),
NULL, 1, &ip6_maxdynroutes, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "param_rt_msg",
SYSCTL_DESCR("How to send parameter changing"
" routing message"),
NULL, 0, &ip6_param_rt_msg, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
}
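/*
* The nodes created above live under net.inet6.ip6; a hedged example of
* reaching them from userland with sysctl(8):
*
* sysctl net.inet6.ip6.forwarding
* sysctl -w net.inet6.ip6.hlim=64 (read-write nodes only)
*/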
void
ip6_statinc(u_int stat)
{
KASSERT(stat < IP6_NSTATS);
IP6_STATINC(stat);
}
/* $NetBSD: ptyfs_subr.c,v 1.34 2020/11/27 14:43:57 christos Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ptyfs_subr.c 8.6 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ptyfs_subr.c,v 1.34 2020/11/27 14:43:57 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/select.h>
#include <sys/tty.h>
#include <sys/pty.h>
#include <sys/kauth.h>
#include <sys/lwp.h>
#include <fs/ptyfs/ptyfs.h>
static kmutex_t ptyfs_hashlock;
static SLIST_HEAD(ptyfs_hashhead, ptyfsnode) *ptyfs_node_tbl;
static u_long ptyfs_node_mask; /* size of hash table - 1 */
/*
* allocate a ptyfsnode/vnode pair. the vnode is referenced.
*
* the pty, ptyfs_type, and mount point uniquely
* identify a ptyfsnode. the mount point is needed
* because someone might mount this filesystem
* twice.
*/
int
ptyfs_allocvp(struct mount *mp, struct vnode **vpp, ptyfstype type, int pty)
{
struct ptyfskey key;
memset(&key, 0, sizeof(key));
key.ptk_pty = pty;
key.ptk_type = type;
return vcache_get(mp, &key, sizeof(key), vpp);
}
/*
* Initialize ptyfsnode hash table.
*/
void
ptyfs_hashinit(void)
{
ptyfs_node_tbl = hashinit(16, HASH_SLIST, true, &ptyfs_node_mask);
mutex_init(&ptyfs_hashlock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Free ptyfsnode hash table.
*/
void
ptyfs_hashdone(void)
{
mutex_destroy(&ptyfs_hashlock);
hashdone(ptyfs_node_tbl, HASH_SLIST, ptyfs_node_mask);
}
/*
* Get a ptyfsnode from the hash table, or allocate one.
*/
struct ptyfsnode *
ptyfs_get_node(ptyfstype type, int pty)
{
struct ptyfs_hashhead *ppp;
struct ptyfsnode *pp;
ppp = &ptyfs_node_tbl[PTYFS_FILENO(type, pty) & ptyfs_node_mask];
mutex_enter(&ptyfs_hashlock);
SLIST_FOREACH(pp, ppp, ptyfs_hash) {
if (pty == pp->ptyfs_pty && pp->ptyfs_type == type) {
mutex_exit(&ptyfs_hashlock);
return pp;
}
}
mutex_exit(&ptyfs_hashlock);
pp = malloc(sizeof(struct ptyfsnode), M_TEMP, M_WAITOK);
pp->ptyfs_pty = pty;
pp->ptyfs_type = type;
pp->ptyfs_fileno = PTYFS_FILENO(type, pty);
if (pp->ptyfs_type == PTYFSroot)
pp->ptyfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|
S_IROTH|S_IXOTH;
else
pp->ptyfs_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|
S_IROTH|S_IWOTH;
pp->ptyfs_uid = pp->ptyfs_gid = 0;
pp->ptyfs_status = PTYFS_CHANGE;
PTYFS_ITIMES(pp, NULL, NULL, NULL);
pp->ptyfs_birthtime = pp->ptyfs_mtime =
pp->ptyfs_atime = pp->ptyfs_ctime;
pp->ptyfs_flags = 0;
mutex_enter(&ptyfs_hashlock);
/*
* XXX There is a small race condition when the master side is opened
* for the first time, if other threads, through other mount points,
* try to open the same device. As a result there is a small chance
* of ending up with unused list entries.
*/
SLIST_INSERT_HEAD(ppp, pp, ptyfs_hash);
mutex_exit(&ptyfs_hashlock);
return pp;
}
/*
* Mark this controlling pty as active.
*/
void
ptyfs_set_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
/* Reallocate map if needed. */
if (pty >= pmnt->pmnt_bitmap_size * NBBY) {
int osize, nsize;
uint8_t *obitmap, *nbitmap;
nsize = roundup(howmany(pty + 1, NBBY), 64);
nbitmap = kmem_alloc(nsize, KM_SLEEP);
mutex_enter(&pmnt->pmnt_lock);
if (pty < pmnt->pmnt_bitmap_size * NBBY) {
mutex_exit(&pmnt->pmnt_lock);
kmem_free(nbitmap, nsize);
} else {
osize = pmnt->pmnt_bitmap_size;
obitmap = pmnt->pmnt_bitmap;
pmnt->pmnt_bitmap_size = nsize;
pmnt->pmnt_bitmap = nbitmap;
if (osize > 0)
memcpy(pmnt->pmnt_bitmap, obitmap, osize);
memset(pmnt->pmnt_bitmap + osize, 0, nsize - osize);
mutex_exit(&pmnt->pmnt_lock);
if (osize > 0)
kmem_free(obitmap, osize);
}
}
mutex_enter(&pmnt->pmnt_lock);
setbit(pmnt->pmnt_bitmap, pty);
mutex_exit(&pmnt->pmnt_lock);
}
/*
* Mark this controlling pty as inactive.
*/
void
ptyfs_clr_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
mutex_enter(&pmnt->pmnt_lock);
if (pty >= 0 && pty < pmnt->pmnt_bitmap_size * NBBY)
clrbit(pmnt->pmnt_bitmap, pty);
mutex_exit(&pmnt->pmnt_lock);
}
/*
* Look up the next active controlling pty greater than or equal to "pty".
* Return -1 if none is found.
*/
int
ptyfs_next_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
mutex_enter(&pmnt->pmnt_lock);
while (pty < pmnt->pmnt_bitmap_size * NBBY) {
if (isset(pmnt->pmnt_bitmap, pty)) {
mutex_exit(&pmnt->pmnt_lock);
return pty;
}
pty++;
}
mutex_exit(&pmnt->pmnt_lock);
return -1;
}
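/*
* A hedged usage sketch: a readdir-style walk over all active ptys on a
* mount would look like
*
* for (pty = ptyfs_next_active(mp, 0); pty != -1;
* pty = ptyfs_next_active(mp, pty + 1))
* ... emit an entry for pty ...
*/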
/* $NetBSD: procfs_subr.c,v 1.117 2024/01/17 10:20:12 hannken Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_subr.c,v 1.117 2024/01/17 10:20:12 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fstrans.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <miscfs/procfs/procfs.h>
/*
* Allocate a pfsnode/vnode pair. The vnode is referenced.
* The pid, type, and file descriptor uniquely identify a pfsnode.
*/
int
procfs_allocvp(struct mount *mp, struct vnode **vpp, pid_t pid,
pfstype type, int fd)
{
struct pfskey key;
memset(&key, 0, sizeof(key));
key.pk_type = type;
key.pk_pid = pid;
key.pk_fd = fd;
return vcache_get(mp, &key, sizeof(key), vpp);
}
int
procfs_rw(void *v)
{
struct vop_read_args *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *curl;
struct lwp *l;
struct pfsnode *pfs = VTOPFS(vp);
struct proc *p;
int error;
if (uio->uio_offset < 0)
return EINVAL;
if ((error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH)) != 0)
return error;
curl = curlwp;
/*
* Do not allow init to be modified while in secure mode; it
* could be duped into changing the security level.
*/
#define M2K(m) ((m) == UIO_READ ? KAUTH_REQ_PROCESS_PROCFS_READ : \
KAUTH_REQ_PROCESS_PROCFS_WRITE)
mutex_enter(p->p_lock);
error = kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_PROCFS,
p, pfs, KAUTH_ARG(M2K(uio->uio_rw)), NULL);
mutex_exit(p->p_lock);
if (error) {
procfs_proc_unlock(p);
return (error);
}
#undef M2K
mutex_enter(p->p_lock);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (l->l_stat != LSZOMB)
break;
}
/* Process is exiting if there are no LWPs or all LWPs are LSZOMB */
if (l == NULL) {
mutex_exit(p->p_lock);
procfs_proc_unlock(p);
return ESRCH;
}
lwp_addref(l);
mutex_exit(p->p_lock);
switch (pfs->pfs_type) {
case PFSnote:
case PFSnotepg:
error = procfs_donote(curl, p, pfs, uio);
break;
case PFSregs:
error = procfs_doregs(curl, l, pfs, uio);
break;
case PFSfpregs:
error = procfs_dofpregs(curl, l, pfs, uio);
break;
case PFSstatus:
error = procfs_dostatus(curl, l, pfs, uio);
break;
case PFSstat:
error = procfs_do_pid_stat(curl, l, pfs, uio);
break;
case PFSlimit:
error = procfs_dolimit(curl, p, pfs, uio);
break;
case PFSmap:
error = procfs_domap(curl, p, pfs, uio, 0);
break;
case PFSmaps:
error = procfs_domap(curl, p, pfs, uio, 1);
break;
case PFSmem:
error = procfs_domem(curl, l, pfs, uio);
break;
case PFScmdline:
error = procfs_doprocargs(curl, p, pfs, uio, KERN_PROC_ARGV);
break;
case PFSenviron:
error = procfs_doprocargs(curl, p, pfs, uio, KERN_PROC_ENV);
break;
case PFSmeminfo:
error = procfs_domeminfo(curl, p, pfs, uio);
break;
case PFSdevices:
error = procfs_dodevices(curl, p, pfs, uio);
break;
case PFScpuinfo:
error = procfs_docpuinfo(curl, p, pfs, uio);
break;
case PFScpustat:
error = procfs_docpustat(curl, p, pfs, uio);
break;
case PFSloadavg:
error = procfs_doloadavg(curl, p, pfs, uio);
break;
case PFSstatm:
error = procfs_do_pid_statm(curl, l, pfs, uio);
break;
case PFSfd:
error = procfs_dofd(curl, p, pfs, uio);
break;
case PFSuptime:
error = procfs_douptime(curl, p, pfs, uio);
break;
case PFSmounts:
error = procfs_domounts(curl, p, pfs, uio);
break;
case PFSemul:
error = procfs_doemul(curl, p, pfs, uio);
break;
case PFSversion:
error = procfs_doversion(curl, p, pfs, uio);
break;
case PFSauxv:
error = procfs_doauxv(curl, p, pfs, uio);
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
error = procfs_machdep_rw(curl, l, pfs, uio);
break;
#endif
default:
error = EOPNOTSUPP;
break;
}
/*
* Release the references that we acquired earlier.
*/
lwp_delref(l);
procfs_proc_unlock(p);
return (error);
}
/*
* Get a string from userland into (bf). Strip a trailing
* nl character (to allow easy access from the shell).
* The buffer should be *buflenp + 1 chars long. vfs_getuserstr
* will automatically add a nul char at the end.
*
* Returns 0 on success, or one of the following errors:
*
* EINVAL: the file offset is non-zero.
* EMSGSIZE: the message is longer than the kernel buffer.
* EFAULT: the user i/o buffer is not addressable.
*/
int
vfs_getuserstr(struct uio *uio, char *bf, int *buflenp)
{
size_t xlen;
int error;
if (uio->uio_offset != 0)
return (EINVAL);
xlen = *buflenp;
/* must be able to read the whole string in one go */
if (xlen < uio->uio_resid)
return (EMSGSIZE);
xlen = uio->uio_resid;
if ((error = uiomove(bf, xlen, uio)) != 0)
return (error);
/* allow multiple writes without seeks */
uio->uio_offset = 0;
/* cleanup string and remove trailing newline */
bf[xlen] = '\0';
xlen = strlen(bf);
if (xlen > 0 && bf[xlen-1] == '\n')
bf[--xlen] = '\0';
*buflenp = xlen;
return (0);
}
const vfs_namemap_t *
vfs_findname(const vfs_namemap_t *nm, const char *bf, int buflen)
{
for (; nm->nm_name; nm++)
if (memcmp(bf, nm->nm_name, buflen+1) == 0)
return (nm);
return (0);
}
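/*
* A minimal sketch of how the two helpers above are meant to be combined
* (hedged; namemap, msg and its size are placeholders supplied by the
* caller):
*
* char msg[64];
* int xlen = sizeof(msg) - 1;
* const vfs_namemap_t *nm;
*
* if ((error = vfs_getuserstr(uio, msg, &xlen)) != 0)
* return error;
* if ((nm = vfs_findname(namemap, msg, xlen)) == NULL)
* return EOPNOTSUPP;
* ... act on the matched entry nm ...
*/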
bool
procfs_use_linux_compat(struct mount *mp)
{
const int flags = VFSTOPROC(mp)->pmnt_flags;
return (flags & PROCFSMNT_LINUXCOMPAT) ? true : false;
}
struct proc *
procfs_proc_find(struct mount *mp, pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
return procfs_use_linux_compat(mp) ? proc_find_lwpid(pid) : proc_find(pid);
}
int
procfs_proc_lock(struct mount *mp, int pid, struct proc **bunghole,
int notfound)
{
struct proc *tp;
int error = 0;
mutex_enter(&proc_lock);
if (pid == 0)
tp = &proc0;
else if ((tp = procfs_proc_find(mp, pid)) == NULL)
error = notfound;
if (tp != NULL && !rw_tryenter(&tp->p_reflock, RW_READER))
error = EBUSY;
mutex_exit(&proc_lock);
*bunghole = tp;
return error;
}
void
procfs_proc_unlock(struct proc *p)
{
rw_exit(&p->p_reflock);
}
int
procfs_doemul(struct lwp *curl, struct proc *p,
struct pfsnode *pfs, struct uio *uio)
{
const char *ename = p->p_emul->e_name;
return uiomove_frombuf(__UNCONST(ename), strlen(ename), uio);
}
/* $NetBSD: cpu.h,v 1.72 2023/09/04 20:58:52 mrg Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)cpu.h 5.4 (Berkeley) 5/9/91
*/
#ifndef _AMD64_CPU_H_
#define _AMD64_CPU_H_
#ifdef __x86_64__
#include <x86/cpu.h>
#ifdef _KERNEL
#if defined(__GNUC__) && !defined(_MODULE)
static struct cpu_info *x86_curcpu(void);
static lwp_t *x86_curlwp(void);
/*
* XXXGCC12 has:
* ./machine/cpu.h:57:9: error: array subscript 0 is outside array bounds of 'struct cpu_info * const[0]' [-Werror=array-bounds]
* 56 | __asm("movq %%gs:%1, %0" :
*/
#pragma GCC push_options
#pragma GCC diagnostic ignored "-Warray-bounds"
__inline __always_inline static struct cpu_info * __unused __nomsan
x86_curcpu(void)
{
struct cpu_info *ci;
__asm("movq %%gs:%1, %0" :
"=r" (ci) :
"m"
(*(struct cpu_info * const *)offsetof(struct cpu_info, ci_self)));
return ci;
}
__inline static lwp_t * __unused __nomsan __attribute__ ((const))
x86_curlwp(void)
{
lwp_t *l;
__asm("movq %%gs:%1, %0" :
"=r" (l) :
"m"
(*(struct cpu_info * const *)offsetof(struct cpu_info, ci_curlwp)));
return l;
}
#pragma GCC pop_options
#endif /* __GNUC__ && !_MODULE */
#ifdef XENPV
#define CLKF_USERMODE(frame) (curcpu()->ci_xen_clockf_usermode)
#define CLKF_PC(frame) (curcpu()->ci_xen_clockf_pc)
#else /* XENPV */
#define CLKF_USERMODE(frame) USERMODE((frame)->cf_if.if_tf.tf_cs)
#define CLKF_PC(frame) ((frame)->cf_if.if_tf.tf_rip)
#endif /* XENPV */
#define CLKF_INTR(frame) (curcpu()->ci_idepth > 0)
#define LWP_PC(l) ((l)->l_md.md_regs->tf_rip)
void *cpu_uarea_alloc(bool);
bool cpu_uarea_free(void *);
#endif /* _KERNEL */
#else /* __x86_64__ */
#include <i386/cpu.h>
#endif /* __x86_64__ */
#endif /* !_AMD64_CPU_H_ */
/* $NetBSD: fpu.c,v 1.87 2023/07/18 12:34:25 riastradh Exp $ */
/*
* Copyright (c) 2008, 2019 The NetBSD Foundation, Inc. All
* rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran and Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1991 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)npx.c 7.2 (Berkeley) 5/12/91
*/
/*
* Copyright (c) 1994, 1995, 1998 Charles M. Hannum. All rights reserved.
* Copyright (c) 1990 William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)npx.c 7.2 (Berkeley) 5/12/91
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.87 2023/07/18 12:34:25 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_multiprocessor.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/xcall.h>
#include <machine/cpu.h>
#include <machine/cpuvar.h>
#include <machine/cputypes.h>
#include <machine/intr.h>
#include <machine/cpufunc.h>
#include <machine/pcb.h>
#include <machine/trap.h>
#include <machine/specialreg.h>
#include <x86/cpu.h>
#include <x86/fpu.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef XENPV
#define clts() HYPERVISOR_fpu_taskswitch(0)
#define stts() HYPERVISOR_fpu_taskswitch(1)
#endif
void fpu_handle_deferred(void);
void fpu_switch(struct lwp *, struct lwp *);
uint32_t x86_fpu_mxcsr_mask __read_mostly = 0;
static inline union savefpu *
fpu_lwp_area(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
union savefpu *area = &pcb->pcb_savefpu;
KASSERT((l->l_flag & LW_SYSTEM) == 0);
if (l == curlwp) {
fpu_save();
}
KASSERT(!(l->l_md.md_flags & MDL_FPU_IN_CPU));
return area;
}
static inline void
fpu_save_lwp(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
union savefpu *area = &pcb->pcb_savefpu;
int s;
s = splvm();
if (l->l_md.md_flags & MDL_FPU_IN_CPU) {
KASSERT((l->l_flag & LW_SYSTEM) == 0);
fpu_area_save(area, x86_xsave_features,
!(l->l_proc->p_flag & PK_32));
l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
splx(s);
}
/*
* Bring curlwp's FPU state in memory. It will get installed back in the CPU
* when returning to userland.
*/
void
fpu_save(void)
{
fpu_save_lwp(curlwp);
}
void
fpuinit(struct cpu_info *ci)
{
/*
* This might not be strictly necessary since it will be initialized
* for each process. However it does no harm.
*/
clts();
fninit();
stts();
}
void
fpuinit_mxcsr_mask(void)
{
#ifndef XENPV
union savefpu fpusave __aligned(64);
u_long psl;
memset(&fpusave, 0, sizeof(fpusave));
/* Disable interrupts, and enable FPU */
psl = x86_read_psl();
x86_disable_intr();
clts();
/* Fill in the FPU area */
fxsave(&fpusave);
/* Restore previous state */
stts();
x86_write_psl(psl);
if (fpusave.sv_xmm.fx_mxcsr_mask == 0) {
x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
} else {
x86_fpu_mxcsr_mask = fpusave.sv_xmm.fx_mxcsr_mask;
}
#else
/*
* XXX XXX XXX: On Xen the FXSAVE above faults. That's because
* &fpusave is not 16-byte aligned. Stack alignment problem
* somewhere, it seems.
*/
x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
#endif
}
static inline void
fpu_errata_amd(void)
{
uint16_t sw;
/*
* AMD FPUs do not restore FIP, FDP, and FOP on fxrstor and xrstor
* when FSW.ES=0, leaking other threads' execution history.
*
* Clear them manually by loading a zero (fldummy). We do this
* unconditionally, regardless of FSW.ES.
*
* Before that, clear the ES bit in the x87 status word if it is
* currently set, in order to avoid causing a fault in the
* upcoming load.
*
* Newer generations of AMD CPUs have CPUID_Fn80000008_EBX[2],
* which indicates that FIP/FDP/FOP are restored (same behavior
* as Intel). We're not using it though.
*/
fnstsw(&sw);
if (sw & 0x80) fnclex();
fldummy();
}
#ifdef __x86_64__
#define XS64(x) (is_64bit ? x##64 : x)
#else
#define XS64(x) x
#endif
void
fpu_area_save(void *area, uint64_t xsave_features, bool is_64bit)
{
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
fnsave(area);
break;
case FPU_SAVE_FXSAVE:
XS64(fxsave)(area);
break;
case FPU_SAVE_XSAVE:
XS64(xsave)(area, xsave_features);
break;
case FPU_SAVE_XSAVEOPT:
XS64(xsaveopt)(area, xsave_features);
break;
}
stts();
}
void
fpu_area_restore(const void *area, uint64_t xsave_features, bool is_64bit)
{
clts();
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
frstor(area);
break;
case FPU_SAVE_FXSAVE:
if (cpu_vendor == CPUVENDOR_AMD) fpu_errata_amd();
XS64(fxrstor)(area);
break;
case FPU_SAVE_XSAVE:
case FPU_SAVE_XSAVEOPT:
if (cpu_vendor == CPUVENDOR_AMD) fpu_errata_amd();
XS64(xrstor)(area, xsave_features);
break;
}
}
void
fpu_handle_deferred(void)
{
struct pcb *pcb = lwp_getpcb(curlwp);
fpu_area_restore(&pcb->pcb_savefpu, x86_xsave_features,
!(curlwp->l_proc->p_flag & PK_32));
}
void
fpu_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
struct cpu_info *ci __diagused = curcpu();
struct pcb *pcb;
KASSERTMSG(ci->ci_ilevel >= IPL_SCHED, "cpu%d ilevel=%d",
cpu_index(ci), ci->ci_ilevel);
if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) {
KASSERT(!(oldlwp->l_flag & LW_SYSTEM));
pcb = lwp_getpcb(oldlwp);
fpu_area_save(&pcb->pcb_savefpu, x86_xsave_features,
!(oldlwp->l_proc->p_flag & PK_32));
oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
KASSERT(!(newlwp->l_md.md_flags & MDL_FPU_IN_CPU));
}
void
fpu_lwp_fork(struct lwp *l1, struct lwp *l2)
{
struct pcb *pcb2 = lwp_getpcb(l2);
union savefpu *fpu_save;
/* Kernel threads have no FPU. */
if (__predict_false(l2->l_flag & LW_SYSTEM)) {
return;
}
/* For init(8). */
if (__predict_false(l1->l_flag & LW_SYSTEM)) {
memset(&pcb2->pcb_savefpu, 0, x86_fpu_save_size);
return;
}
fpu_save = fpu_lwp_area(l1);
memcpy(&pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
l2->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
void
fpu_lwp_abandon(struct lwp *l)
{
int s;
KASSERT(l == curlwp);
s = splvm();
l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
stts();
splx(s);
}
/* -------------------------------------------------------------------------- */
/*
* fpu_kern_enter()
*
* Begin using the FPU. Raises to splvm, disabling most
* interrupts and rendering the thread non-preemptible; caller
* should not use this for long periods of time, and must call
* fpu_kern_leave() afterward. Non-recursive -- you cannot call
* fpu_kern_enter() again without calling fpu_kern_leave() first.
*
* Must be used only at IPL_VM or below -- never in IPL_SCHED or
* IPL_HIGH interrupt handlers.
*/
void
fpu_kern_enter(void)
{
static const union savefpu safe_fpu __aligned(64) = {
.sv_xmm = {
.fx_mxcsr = __SAFE_MXCSR__,
},
};
struct lwp *l = curlwp;
struct cpu_info *ci;
int s;
s = splvm();
ci = curcpu();
#if 0
/*
* Can't assert this because if the caller holds a spin lock at
* IPL_VM, and previously held and released a spin lock at
* higher IPL, the IPL remains raised above IPL_VM.
*/
KASSERTMSG(ci->ci_ilevel <= IPL_VM || cold, "ilevel=%d",
ci->ci_ilevel);
#endif
KASSERT(ci->ci_kfpu_spl == -1);
ci->ci_kfpu_spl = s;
/*
* If we are in a softint and have a pinned lwp, the fpu state is that
* of the pinned lwp, so save it there.
*/
while ((l->l_pflag & LP_INTR) && (l->l_switchto != NULL))
l = l->l_switchto;
fpu_save_lwp(l);
/*
* Clear CR0_TS, which fpu_save_lwp set if it saved anything --
* otherwise the CPU will trap if we try to use the FPU under
* the false impression that there has been a task switch since
* the last FPU usage requiring that we save the FPU state.
*/
clts();
/*
* Zero the FPU registers and install safe control words.
*/
fpu_area_restore(&safe_fpu, x86_xsave_features, /*is_64bit*/false);
}
/*
* fpu_kern_leave()
*
* End using the FPU after fpu_kern_enter().
*/
void
fpu_kern_leave(void)
{
static const union savefpu zero_fpu __aligned(64);
struct cpu_info *ci = curcpu();
int s;
#if 0
/*
* Can't assert this because if the caller holds a spin lock at
* IPL_VM, and previously held and released a spin lock at
* higher IPL, the IPL remains raised above IPL_VM.
*/
KASSERT(ci->ci_ilevel == IPL_VM || cold);
#endif
KASSERT(ci->ci_kfpu_spl != -1);
/*
* Zero the fpu registers; otherwise we might leak secrets
* through Spectre-class attacks to userland, even if there are
* no bugs in fpu state management.
*/
fpu_area_restore(&zero_fpu, x86_xsave_features, /*is_64bit*/false);
/*
* Set CR0_TS again so that the kernel can't accidentally use
* the FPU.
*/
stts();
s = ci->ci_kfpu_spl;
ci->ci_kfpu_spl = -1;
splx(s);
}
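/*
* A hedged usage sketch of the pair above: kernel code that needs a
* short burst of FPU/SIMD work brackets it like this, and must not
* sleep in between:
*
* fpu_kern_enter();
* ... use FPU/SIMD registers ...
* fpu_kern_leave();
*/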
/* -------------------------------------------------------------------------- */
/*
* The following table is used to ensure that the FPE_... value
* that is passed as a trapcode to the signal handler of the user
* process does not have more than one bit set.
*
* Multiple bits may be set if SSE simd instructions generate errors
* on more than one value, or if the user process modifies the control
* word while a status word bit is already set (which is a sign
* of bad coding).
* We have no choice but to narrow them down to one bit, since we must
* not send a trapcode that is not exactly one of the FPE_ macros.
*
* The mechanism has a static table with 128 entries. Each combination
* of the 7 FPU status word exception bits directly translates to a
* position in this table, where a single FPE_... value is stored.
* This FPE_... value stored there is considered the "most important"
* of the exception bits and will be sent as the signal code. The
* precedence of the bits is based upon Intel Document "Numerical
* Applications", Chapter "Special Computational Situations".
*
* The code to choose one of these values does these steps:
* 1) Throw away status word bits that cannot be masked.
* 2) Throw away the bits currently masked in the control word,
* assuming the user isn't interested in them anymore.
* 3) Reinsert status word bit 7 (stack fault) if it is set, which
* cannot be masked but must be preserved.
* 'Stack fault' is a sub-class of 'invalid operation'.
* 4) Use the remaining bits to point into the trapcode table.
*
* The 6 maskable bits in order of their preference, as stated in the
* above referenced Intel manual:
* 1 Invalid operation (FP_X_INV)
* 1a Stack underflow
* 1b Stack overflow
* 1c Operand of unsupported format
* 1d SNaN operand.
* 2 QNaN operand (not an exception, irrelevant here)
* 3 Any other invalid-operation not mentioned above or zero divide
* (FP_X_INV, FP_X_DZ)
* 4 Denormal operand (FP_X_DNML)
* 5 Numeric over/underflow (FP_X_OFL, FP_X_UFL)
* 6 Inexact result (FP_X_IMP)
*
* NB: the above seems to mix up the mxcsr error bits and the x87 ones.
* They are in the same order, but there is no EN_SW_STACK_FAULT in the mmx
* status.
*
* The table is nearly, but not quite, in bit order (ZERODIV and DENORM
* are swapped).
*
* This table assumes that any stack fault is cleared - so that an INVOP
* fault will only be reported as FLTSUB once.
* This might not happen if the mask is being changed.
*/
#define FPE_xxx1(f) (f & EN_SW_INVOP \
? (f & EN_SW_STACK_FAULT ? FPE_FLTSUB : FPE_FLTINV) \
: f & EN_SW_ZERODIV ? FPE_FLTDIV \
: f & EN_SW_DENORM ? FPE_FLTUND \
: f & EN_SW_OVERFLOW ? FPE_FLTOVF \
: f & EN_SW_UNDERFLOW ? FPE_FLTUND \
: f & EN_SW_PRECLOSS ? FPE_FLTRES \
: f & EN_SW_STACK_FAULT ? FPE_FLTSUB : 0)
#define FPE_xxx2(f) FPE_xxx1(f), FPE_xxx1((f + 1))
#define FPE_xxx4(f) FPE_xxx2(f), FPE_xxx2((f + 2))
#define FPE_xxx8(f) FPE_xxx4(f), FPE_xxx4((f + 4))
#define FPE_xxx16(f) FPE_xxx8(f), FPE_xxx8((f + 8))
#define FPE_xxx32(f) FPE_xxx16(f), FPE_xxx16((f + 16))
static const uint8_t fpetable[128] = {
FPE_xxx32(0), FPE_xxx32(32), FPE_xxx32(64), FPE_xxx32(96)
};
#undef FPE_xxx1
#undef FPE_xxx2
#undef FPE_xxx4
#undef FPE_xxx8
#undef FPE_xxx16
#undef FPE_xxx32
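/*
 * Illustrative lookups into the table above (assuming the conventional
 * x87 status-word bit assignments, i.e. that the EN_SW_* macros encode
 * IE=0x01, DE=0x02, ZE=0x04, OE=0x08, UE=0x10, PE=0x20, SF=0x40):
 *
 *	fpetable[0x04] == FPE_FLTDIV	(zero divide only)
 *	fpetable[0x05] == FPE_FLTINV	(invalid op wins over zero divide)
 *	fpetable[0x41] == FPE_FLTSUB	(invalid op with stack fault)
 */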
/*
* This is a synchronous trap on either an x87 instruction (due to an unmasked
* error on the previous x87 instruction) or on an SSE/SSE2/etc instruction due
* to an error on the instruction itself.
*
* If the trap actually generates a signal, then the fpu state is saved and
* copied onto the lwp's user-stack, and then recovered from there when the
* signal returns.
*
* All this code needs to do is save the reason for the trap. For x87 traps the
* status word bits need clearing to stop the trap re-occurring. For SSE traps
* the mxcsr bits are 'sticky' and need clearing to not confuse a later trap.
*
* We come here with interrupts disabled.
*/
void
fputrap(struct trapframe *frame)
{
uint32_t statbits;
ksiginfo_t ksi;
if (__predict_false(!USERMODE(frame->tf_cs))) {
register_t ip = X86_TF_RIP(frame);
char where[128];
#ifdef DDB
db_symstr(where, sizeof(where), (db_expr_t)ip, DB_STGY_PROC);
#else
snprintf(where, sizeof(where), "%p", (void *)ip);
#endif
panic("fpu trap from kernel at %s, trapframe %p\n", where,
frame);
}
KASSERT(curlwp->l_md.md_flags & MDL_FPU_IN_CPU);
if (frame->tf_trapno == T_XMM) {
uint32_t mxcsr;
x86_stmxcsr(&mxcsr);
statbits = mxcsr;
/* Clear the sticky status bits */
mxcsr &= ~0x3f;
x86_ldmxcsr(&mxcsr);
/* Remove masked interrupts and non-status bits */
statbits &= ~(statbits >> 7) & 0x3f;
/* Mark this as an XMM status */
statbits |= 0x10000;
} else {
uint16_t cw, sw;
/* Get current control and status words */
fnstcw(&cw);
fnstsw(&sw);
/* Clear any pending exceptions from status word */
fnclex();
/* Remove masked interrupts */
statbits = sw & ~(cw & 0x3f);
}
/* Doesn't matter now if we get pre-empted */
x86_enable_intr();
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGFPE;
ksi.ksi_addr = (void *)X86_TF_RIP(frame);
ksi.ksi_code = fpetable[statbits & 0x7f];
ksi.ksi_trap = statbits;
(*curlwp->l_proc->p_emul->e_trapsignal)(curlwp, &ksi);
}
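/*
 * Worked example for the x87 branch above (a sketch; the concrete numbers
 * assume the usual x87 control/status word layout, with the low 6 bits
 * being the exception masks/flags): if the control word masks only the
 * precision exception (cw & 0x3f == 0x20) and the status word reports
 * both zero-divide and precision (sw == 0x24), then
 * statbits = 0x24 & ~0x20 = 0x04, and fpetable[0x04] yields FPE_FLTDIV
 * for ksi_code.
 */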
void
fpudna(struct trapframe *frame)
{
panic("fpudna from %s, ip %p, trapframe %p",
USERMODE(frame->tf_cs) ? "userland" : "kernel",
(void *)X86_TF_RIP(frame), frame);
}
/* -------------------------------------------------------------------------- */
static inline void
fpu_xstate_reload(union savefpu *fpu_save, uint64_t xstate)
{
/*
* Force a reload of the given xstate during the next XRSTOR.
*/
if (x86_fpu_save >= FPU_SAVE_XSAVE) {
fpu_save->sv_xsave_hdr.xsh_xstate_bv |= xstate;
}
}
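/*
 * Note: XRSTOR initializes any component whose bit is clear in the save
 * area's XSTATE_BV and loads it from memory when the bit is set, so
 * setting the bit here is what forces the (possibly modified) in-memory
 * copy to be reloaded on the next restore.
 */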
void
fpu_set_default_cw(struct lwp *l, unsigned int x87_cw)
{
union savefpu *fpu_save = fpu_lwp_area(l);
struct pcb *pcb = lwp_getpcb(l);
if (i386_use_fxsave) {
fpu_save->sv_xmm.fx_cw = x87_cw;
if (x87_cw != __INITIAL_NPXCW__) {
fpu_xstate_reload(fpu_save, XCR0_X87);
}
} else {
fpu_save->sv_87.s87_cw = x87_cw;
}
pcb->pcb_fpu_dflt_cw = x87_cw;
}
void
fpu_clear(struct lwp *l, unsigned int x87_cw)
{
union savefpu *fpu_save;
struct pcb *pcb;
KASSERT(l == curlwp);
fpu_save = fpu_lwp_area(l);
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
memset(&fpu_save->sv_87, 0, x86_fpu_save_size);
fpu_save->sv_87.s87_tw = 0xffff;
fpu_save->sv_87.s87_cw = x87_cw;
break;
case FPU_SAVE_FXSAVE:
memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_cw = x87_cw;
break;
case FPU_SAVE_XSAVE:
case FPU_SAVE_XSAVEOPT:
memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_cw = x87_cw;
if (__predict_false(x87_cw != __INITIAL_NPXCW__)) {
fpu_xstate_reload(fpu_save, XCR0_X87);
}
break;
}
pcb = lwp_getpcb(l);
pcb->pcb_fpu_dflt_cw = x87_cw;
}
void
fpu_sigreset(struct lwp *l)
{
union savefpu *fpu_save = fpu_lwp_area(l);
struct pcb *pcb = lwp_getpcb(l);
/*
* For signal handlers the register values don't matter. Just reset
* a few fields.
*/
if (i386_use_fxsave) {
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_tw = 0;
fpu_save->sv_xmm.fx_cw = pcb->pcb_fpu_dflt_cw;
} else {
fpu_save->sv_87.s87_tw = 0xffff;
fpu_save->sv_87.s87_cw = pcb->pcb_fpu_dflt_cw;
}
}
void
process_write_fpregs_xmm(struct lwp *l, const struct fxsave *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memcpy(&fpu_save->sv_xmm, fpregs, sizeof(fpu_save->sv_xmm));
/*
* Invalid bits in mxcsr or mxcsr_mask will cause faults.
*/
fpu_save->sv_xmm.fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_mxcsr &= fpu_save->sv_xmm.fx_mxcsr_mask;
fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
} else {
process_xmm_to_s87(fpregs, &fpu_save->sv_87);
}
}
void
process_write_fpregs_s87(struct lwp *l, const struct save87 *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
process_s87_to_xmm(fpregs, &fpu_save->sv_xmm);
fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
} else {
memcpy(&fpu_save->sv_87, fpregs, sizeof(fpu_save->sv_87));
}
}
void
process_read_fpregs_xmm(struct lwp *l, struct fxsave *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memcpy(fpregs, &fpu_save->sv_xmm, sizeof(fpu_save->sv_xmm));
} else {
memset(fpregs, 0, sizeof(*fpregs));
process_s87_to_xmm(&fpu_save->sv_87, fpregs);
}
}
void
process_read_fpregs_s87(struct lwp *l, struct save87 *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memset(fpregs, 0, sizeof(*fpregs));
process_xmm_to_s87(&fpu_save->sv_xmm, fpregs);
} else {
memcpy(fpregs, &fpu_save->sv_87, sizeof(fpu_save->sv_87));
}
}
int
process_read_xstate(struct lwp *l, struct xstate *xstate)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (x86_fpu_save == FPU_SAVE_FSAVE) {
/* Convert from legacy FSAVE format. */
memset(&xstate->xs_fxsave, 0, sizeof(xstate->xs_fxsave));
process_s87_to_xmm(&fpu_save->sv_87, &xstate->xs_fxsave);
/* We only got x87 data. */
xstate->xs_rfbm = XCR0_X87;
xstate->xs_xstate_bv = XCR0_X87;
return 0;
}
/* Copy the legacy area. */
memcpy(&xstate->xs_fxsave, fpu_save->sv_xsave_hdr.xsh_fxsave,
sizeof(xstate->xs_fxsave));
if (x86_fpu_save == FPU_SAVE_FXSAVE) {
/* FXSAVE means we've got x87 + SSE data. */
xstate->xs_rfbm = XCR0_X87 | XCR0_SSE;
xstate->xs_xstate_bv = XCR0_X87 | XCR0_SSE;
return 0;
}
/* Copy the bitmap indicating which states are available. */
xstate->xs_rfbm = x86_xsave_features & XCR0_FPU;
xstate->xs_xstate_bv = fpu_save->sv_xsave_hdr.xsh_xstate_bv;
KASSERT(!(xstate->xs_xstate_bv & ~xstate->xs_rfbm));
#define COPY_COMPONENT(xcr0_val, xsave_val, field) \
if (xstate->xs_xstate_bv & xcr0_val) { \
KASSERT(x86_xsave_offsets[xsave_val] \
>= sizeof(struct xsave_header)); \
KASSERT(x86_xsave_sizes[xsave_val] \
>= sizeof(xstate->field)); \
memcpy(&xstate->field, \
(char*)fpu_save + x86_xsave_offsets[xsave_val], \
sizeof(xstate->field)); \
}
COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
#undef COPY_COMPONENT
return 0;
}
int
process_verify_xstate(const struct xstate *xstate)
{
/* xstate_bv must be a subset of RFBM */
if (xstate->xs_xstate_bv & ~xstate->xs_rfbm)
return EINVAL;
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
if ((xstate->xs_rfbm & ~XCR0_X87))
return EINVAL;
break;
case FPU_SAVE_FXSAVE:
if ((xstate->xs_rfbm & ~(XCR0_X87 | XCR0_SSE)))
return EINVAL;
break;
default:
/* Verify that no unsupported features are enabled */
if ((xstate->xs_rfbm & ~(x86_xsave_features & XCR0_FPU)) != 0)
return EINVAL;
}
return 0;
}
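/*
 * A typical caller (illustrative only; the real callers live in the
 * ptrace/machdep code) is expected to validate before writing:
 *
 *	if ((error = process_verify_xstate(&xstate)) == 0)
 *		error = process_write_xstate(l, &xstate);
 */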
int
process_write_xstate(struct lwp *l, const struct xstate *xstate)
{
union savefpu *fpu_save = fpu_lwp_area(l);
/* Convert data into legacy FSAVE format. */
if (x86_fpu_save == FPU_SAVE_FSAVE) {
if (xstate->xs_xstate_bv & XCR0_X87)
process_xmm_to_s87(&xstate->xs_fxsave, &fpu_save->sv_87);
return 0;
}
/* If XSAVE is supported, make sure that xstate_bv is set correctly. */
if (x86_fpu_save >= FPU_SAVE_XSAVE) {
/*
* Bit-wise "xstate->xs_rfbm ? xstate->xs_xstate_bv :
* fpu_save->sv_xsave_hdr.xsh_xstate_bv"
*/
fpu_save->sv_xsave_hdr.xsh_xstate_bv =
(fpu_save->sv_xsave_hdr.xsh_xstate_bv & ~xstate->xs_rfbm) |
xstate->xs_xstate_bv;
}
if (xstate->xs_xstate_bv & XCR0_X87) {
/*
* X87 state is split into two areas, interspersed with SSE
* data.
*/
memcpy(&fpu_save->sv_xmm, &xstate->xs_fxsave, 24);
memcpy(fpu_save->sv_xmm.fx_87_ac, xstate->xs_fxsave.fx_87_ac,
sizeof(xstate->xs_fxsave.fx_87_ac));
}
/*
* Copy MXCSR if either SSE or AVX state is requested, to match the
* XSAVE behavior for those flags.
*/
if (xstate->xs_xstate_bv & (XCR0_SSE|XCR0_YMM_Hi128)) {
/*
* Invalid bits in mxcsr or mxcsr_mask will cause faults.
*/
fpu_save->sv_xmm.fx_mxcsr_mask = xstate->xs_fxsave.fx_mxcsr_mask
& x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_mxcsr = xstate->xs_fxsave.fx_mxcsr &
fpu_save->sv_xmm.fx_mxcsr_mask;
}
if (xstate->xs_xstate_bv & XCR0_SSE) {
memcpy(&fpu_save->sv_xsave_hdr.xsh_fxsave[160],
xstate->xs_fxsave.fx_xmm, sizeof(xstate->xs_fxsave.fx_xmm));
}
#define COPY_COMPONENT(xcr0_val, xsave_val, field) \
if (xstate->xs_xstate_bv & xcr0_val) { \
KASSERT(x86_xsave_offsets[xsave_val] \
>= sizeof(struct xsave_header)); \
KASSERT(x86_xsave_sizes[xsave_val] \
>= sizeof(xstate->field)); \
memcpy((char *)fpu_save + x86_xsave_offsets[xsave_val], \
&xstate->field, sizeof(xstate->field)); \
}
COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
#undef COPY_COMPONENT
return 0;
}
/* $NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $ */
/*
* Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran, and by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2007 Manuel Bouyer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Copyright 2001 (c) Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $");
#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
#include "opt_svs.h"
#include "opt_kaslr.h"
#include "opt_efi.h"
#define __MUTEX_PRIVATE /* for assertions */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/xcall.h>
#include <sys/kcore.h>
#include <sys/kmem.h>
#include <sys/asan.h>
#include <sys/msan.h>
#include <sys/entropy.h>
#include <uvm/uvm.h>
#include <uvm/pmap/pmap_pvt.h>
#include <dev/isa/isareg.h>
#include <machine/specialreg.h>
#include <machine/gdt.h>
#include <machine/isa_machdep.h>
#include <machine/cpuvar.h>
#include <machine/cputypes.h>
#include <machine/pmap_private.h>
#include <x86/bootspace.h>
#include <x86/pat.h>
#include <x86/pmap_pv.h>
#include <x86/i82489reg.h>
#include <x86/i82489var.h>
#ifdef XEN
#include <xen/include/public/xen.h>
#include <xen/hypervisor.h>
#include <xen/xenpmap.h>
#endif
#ifdef __HAVE_DIRECT_MAP
#include <crypto/nist_hash_drbg/nist_hash_drbg.h>
#endif
/*
* general info:
*
* - for an explanation of how the x86 MMU hardware works see
* the comments in <machine/pte.h>.
*
* - for an explanation of the general memory structure used by
* this pmap (including the recursive mapping), see the comments
* in <machine/pmap.h>.
*
* this file contains the code for the "pmap module." the module's
* job is to manage the hardware's virtual to physical address mappings.
* note that there are two levels of mapping in the VM system:
*
* [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
* to map ranges of virtual address space to objects/files. for
* example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
* to the file /bin/ls starting at offset zero." note that
* the upper layer mapping is not concerned with how individual
* vm_pages are mapped.
*
* [2] the lower layer of the VM system (the pmap) maintains the mappings
* from virtual addresses. it is concerned with which vm_page is
* mapped where. for example, when you run /bin/ls and start
* at page 0x1000 the fault routine may lookup the correct page
* of the /bin/ls file and then ask the pmap layer to establish
* a mapping for it.
*
* note that information in the lower layer of the VM system can be
* thrown away since it can easily be reconstructed from the info
* in the upper layer.
*
* data structures we use include:
*
* - struct pmap: describes the address space of one thread
* - struct pmap_page: describes one pv-tracked page, without
* necessarily a corresponding vm_page
* - struct pv_entry: describes one <PMAP,VA> mapping of a PA
* - pmap_page::pp_pvlist: there is one list per pv-tracked page of
* physical memory. the pp_pvlist points to a list of pv_entry
* structures which describe all the <PMAP,VA> pairs that this
* page is mapped in. this is critical for page based operations
* such as pmap_page_protect() [change protection on _all_ mappings
* of a page]
*/
/*
* Locking
*
* We have the following locks that we must deal with, listed in the order
* that they are acquired:
*
* pg->uobject->vmobjlock, pg->uanon->an_lock
*
* For managed pages, these per-object locks are taken by the VM system
* before calling into the pmap module - either a read or write hold.
* The lock hold prevents pages from changing identity while the pmap is
* operating on them. For example, the same lock is held across a call
* to pmap_remove() and the following call to pmap_update(), so that a
* page does not gain a new identity while its TLB visibility is stale.
*
* pmap->pm_lock
*
* This lock protects the fields in the pmap structure including the
* non-kernel PDEs in the PDP, the PTEs, the PTPs, and connected data
* structures. For modifying unmanaged kernel PTEs it is not needed as
* kernel PDEs are never freed, and the kernel is expected to be self
* consistent (and the lock can't be taken for unmanaged kernel PTEs,
* because they can be modified from interrupt context).
*
* pmaps_lock
*
* This lock protects the list of active pmaps (headed by "pmaps").
* It's acquired when adding or removing pmaps or adjusting kernel PDEs.
*
* pp_lock
*
* This per-page lock protects PV entry lists and the embedded PV entry
* in each vm_page, allowing for concurrent operation on pages by
* different pmaps. This is a spin mutex at IPL_VM, because at the
* points it is taken context switching is usually not tolerable, and
* spin mutexes must block out interrupts that could take kernel_lock.
*/
/* uvm_object is abused here to index pmap_pages; make assertions happy. */
#ifdef DIAGNOSTIC
#define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
#define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock)
#else
#define PMAP_DUMMY_LOCK(pm)
#define PMAP_DUMMY_UNLOCK(pm)
#endif
static const struct uvm_pagerops pmap_pager = {
/* nothing */
};
/*
* pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
*/
#define pl_i(va, lvl) \
(((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
#define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
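/*
 * Example (amd64 naming assumed): pl_i(va, 1) is the L1 (PTE) index of
 * va, pl_i(va, 4) == pl4_i(va) is its L4 slot, and pl_i_roundup(va, 2)
 * indexes the L2 entry for va rounded up to the next NBPD_L2 boundary.
 */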
/*
* PTP macros:
* a PTP's index is the PD index of the PDE that points to it
* a PTP's offset is the byte-offset in the PTE space that this PTP is at
* a PTP's VA is the first VA mapped by that PTP
*/
#define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE)
const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
const long nbpd[] = NBPD_INITIALIZER;
#ifdef i386
pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
#else
pd_entry_t *normal_pdes[3];
#endif
long nkptp[] = NKPTP_INITIALIZER;
struct pmap_head pmaps;
kmutex_t pmaps_lock __cacheline_aligned;
struct pcpu_area *pcpuarea __read_mostly;
static vaddr_t pmap_maxkvaddr;
/*
* Misc. event counters.
*/
struct evcnt pmap_iobmp_evcnt;
struct evcnt pmap_ldt_evcnt;
/*
* PAT
*/
static bool cpu_pat_enabled __read_mostly = false;
/*
* Global data structures
*/
static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
static rb_tree_t pmap_kernel_rb __cacheline_aligned;
struct bootspace bootspace __read_mostly;
struct slotspace slotspace __read_mostly;
/* Set to PTE_NX if supported. */
pd_entry_t pmap_pg_nx __read_mostly = 0;
/* Set to PTE_G if supported. */
pd_entry_t pmap_pg_g __read_mostly = 0;
/* Set to true if large pages are supported. */
int pmap_largepages __read_mostly = 0;
paddr_t lowmem_rsvd __read_mostly;
paddr_t avail_start __read_mostly; /* PA of first available physical page */
paddr_t avail_end __read_mostly; /* PA of last available physical page */
#ifdef XENPV
paddr_t pmap_pa_start; /* PA of first physical page for this domain */
paddr_t pmap_pa_end; /* PA of last physical page for this domain */
#endif
#define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp)
#define PMAP_CHECK_PP(pp) \
KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
#define PAGE_ALIGNED(pp) \
__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
/*
* Other data structures
*/
static pt_entry_t protection_codes[8] __read_mostly;
static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
/*
* The following two vaddr_t's are used during system startup to keep track of
* how much of the kernel's VM space we have used. Once the system is started,
* the management of the remaining kernel VM space is turned over to the
* kernel_map vm_map.
*/
static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */
static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */
#ifndef XENPV
/*
* LAPIC virtual address, and fake physical address.
*/
volatile vaddr_t local_apic_va __read_mostly;
paddr_t local_apic_pa __read_mostly;
#endif
/*
* pool that pmap structures are allocated from
*/
struct pool_cache pmap_cache;
static int pmap_ctor(void *, void *, int);
static void pmap_dtor(void *, void *);
/*
* pv_page cache
*/
static struct pool_cache pmap_pvp_cache;
#ifdef __HAVE_DIRECT_MAP
vaddr_t pmap_direct_base __read_mostly;
vaddr_t pmap_direct_end __read_mostly;
#endif
#ifndef __HAVE_DIRECT_MAP
/*
* Special VAs and the PTEs that map them
*/
static pt_entry_t *early_zero_pte;
static void pmap_vpage_cpualloc(struct cpu_info *);
#ifdef XENPV
char *early_zerop; /* also referenced from xen_locore() */
#else
static char *early_zerop;
#endif
#endif
int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
/* PDP pool and its callbacks */
static struct pool pmap_pdp_pool;
static void pmap_pdp_init(pd_entry_t *);
static void pmap_pdp_fini(pd_entry_t *);
#ifdef PAE
/* need to allocate items of 4 pages */
static void *pmap_pdp_alloc(struct pool *, int);
static void pmap_pdp_free(struct pool *, void *);
static struct pool_allocator pmap_pdp_allocator = {
.pa_alloc = pmap_pdp_alloc,
.pa_free = pmap_pdp_free,
.pa_pagesz = PAGE_SIZE * PDP_SIZE,
};
#endif
extern vaddr_t idt_vaddr;
extern paddr_t idt_paddr;
extern vaddr_t gdt_vaddr;
extern paddr_t gdt_paddr;
extern vaddr_t ldt_vaddr;
extern paddr_t ldt_paddr;
#ifdef i386
/* stuff to fix the pentium f00f bug */
extern vaddr_t pentium_idt_vaddr;
#endif
/* Array of freshly allocated PTPs, for pmap_get_ptp(). */
struct pmap_ptparray {
struct vm_page *pg[PTP_LEVELS + 1];
bool alloced[PTP_LEVELS + 1];
};
/*
* PV entries are allocated in page-sized chunks and cached per-pmap to
* avoid intense pressure on memory allocators.
*/
struct pv_page {
LIST_HEAD(, pv_entry) pvp_pves;
LIST_ENTRY(pv_page) pvp_list;
long pvp_nfree;
struct pmap *pvp_pmap;
};
#define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
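/*
 * Sizing example (hypothetical numbers): with 4 KiB pages and a 64-byte
 * struct pv_entry, PAGE_SIZE / sizeof(struct pv_entry) == 64, so
 * PVE_PER_PVP == 63; the "- 1" leaves room in the page for the
 * struct pv_page header itself.
 */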
/*
* PV tree prototypes
*/
static int pmap_compare_key(void *, const void *, const void *);
static int pmap_compare_nodes(void *, const void *, const void *);
/* Red-black tree */
static const rb_tree_ops_t pmap_rbtree_ops = {
.rbto_compare_nodes = pmap_compare_nodes,
.rbto_compare_key = pmap_compare_key,
.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
.rbto_context = NULL
};
/*
* Local prototypes
*/
#ifdef __HAVE_PCPU_AREA
static void pmap_init_pcpu(void);
#endif
#ifdef __HAVE_DIRECT_MAP
static void pmap_init_directmap(struct pmap *);
#endif
#if !defined(XENPV)
static void pmap_remap_global(void);
#endif
#ifndef XENPV
static void pmap_init_lapic(void);
static void pmap_remap_largepages(void);
#endif
static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
struct vm_page **);
static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
pd_entry_t * const *);
static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
static void pmap_freepage(struct pmap *, struct vm_page *, int);
static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
pt_entry_t *, pd_entry_t * const *);
static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
vaddr_t);
static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
vaddr_t);
static int pmap_pvp_ctor(void *, void *, int);
static void pmap_pvp_dtor(void *, void *);
static struct pv_entry *pmap_alloc_pv(struct pmap *);
static void pmap_free_pv(struct pmap *, struct pv_entry *);
static void pmap_drain_pv(struct pmap *);
static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
static void pmap_reactivate(struct pmap *);
long
pmap_resident_count(struct pmap *pmap)
{
return pmap->pm_stats.resident_count;
}
long
pmap_wired_count(struct pmap *pmap)
{
return pmap->pm_stats.wired_count;
}
/*
* p m a p h e l p e r f u n c t i o n s
*/
static inline void
pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
{
KASSERT(cold || mutex_owned(&pmap->pm_lock));
pmap->pm_stats.resident_count += resid_diff;
pmap->pm_stats.wired_count += wired_diff;
}
static inline void
pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
pmap_stats_update(pmap, resid_diff, wired_diff);
}
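/*
 * Example: installing a present, wired PTE where the old entry was not
 * present gives resid_diff == +1 and wired_diff == +1; downgrading a
 * present, wired PTE to not-present gives -1/-1. The KASSERTs enforce
 * that a wired entry is always also present.
 */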
/*
* ptp_to_pmap: lookup pmap by ptp
*/
static inline struct pmap *
ptp_to_pmap(struct vm_page *ptp)
{
struct pmap *pmap;
if (ptp == NULL) {
return pmap_kernel();
}
pmap = (struct pmap *)ptp->uobject;
KASSERT(pmap != NULL);
KASSERT(&pmap->pm_obj[0] == ptp->uobject);
return pmap;
}
static inline struct pv_pte *
pve_to_pvpte(struct pv_entry *pve)
{
if (pve == NULL)
return NULL;
KASSERT((void *)&pve->pve_pte == (void *)pve);
return &pve->pve_pte;
}
static inline struct pv_entry *
pvpte_to_pve(struct pv_pte *pvpte)
{
struct pv_entry *pve = (void *)pvpte;
KASSERT(pve_to_pvpte(pve) == pvpte);
return pve;
}
/*
* Return true if the pmap page has an embedded PV entry.
*/
static inline bool
pv_pte_embedded(struct pmap_page *pp)
{
KASSERT(mutex_owned(&pp->pp_lock));
return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
}
/*
* pv_pte_first, pv_pte_next: PV list iterator.
*/
static inline struct pv_pte *
pv_pte_first(struct pmap_page *pp)
{
KASSERT(mutex_owned(&pp->pp_lock));
if (pv_pte_embedded(pp)) {
return &pp->pp_pte;
}
return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
}
static inline struct pv_pte *
pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
{
KASSERT(mutex_owned(&pp->pp_lock));
KASSERT(pvpte != NULL);
if (pvpte == &pp->pp_pte) {
return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
}
return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
}
static inline uint8_t
pmap_pte_to_pp_attrs(pt_entry_t pte)
{
uint8_t ret = 0;
if (pte & PTE_D)
ret |= PP_ATTRS_D;
if (pte & PTE_A)
ret |= PP_ATTRS_A;
if (pte & PTE_W)
ret |= PP_ATTRS_W;
return ret;
}
static inline pt_entry_t
pmap_pp_attrs_to_pte(uint8_t attrs)
{
pt_entry_t pte = 0;
if (attrs & PP_ATTRS_D)
pte |= PTE_D;
if (attrs & PP_ATTRS_A)
pte |= PTE_A;
if (attrs & PP_ATTRS_W)
pte |= PTE_W;
return pte;
}
/*
* pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
* of course the kernel is always loaded
*/
bool
pmap_is_curpmap(struct pmap *pmap)
{
return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
}
inline void
pmap_reference(struct pmap *pmap)
{
atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
}
/*
* rbtree: compare two nodes.
*/
static int
pmap_compare_nodes(void *context, const void *n1, const void *n2)
{
const struct pv_entry *pve1 = n1;
const struct pv_entry *pve2 = n2;
KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
return -1;
}
if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
return 1;
}
return 0;
}
/*
* rbtree: compare a node and a key.
*/
static int
pmap_compare_key(void *context, const void *n, const void *k)
{
const struct pv_entry *pve = n;
const vaddr_t key = (vaddr_t)k;
if (pve->pve_pte.pte_va < key) {
return -1;
}
if (pve->pve_pte.pte_va > key) {
return 1;
}
return 0;
}
/*
* pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
*/
static inline void
pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
{
vaddr_t *min = (vaddr_t *)&ptp->uanon;
if (va < *min) {
*min = va;
}
}
/*
* pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
*/
static inline void
pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
{
vaddr_t sclip;
if (ptp == NULL) {
return;
}
sclip = (vaddr_t)ptp->uanon;
sclip = (*startva < sclip ? sclip : *startva);
*pte += (sclip - *startva) / PAGE_SIZE;
*startva = sclip;
}
/*
* pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
*
* there are several pmaps involved. some or all of them might be the same.
*
* - the pmap given by the first argument
* our caller wants to access this pmap's PTEs.
*
* - pmap_kernel()
* the kernel pmap. note that it only contains the kernel part
* of the address space which is shared by all pmaps, i.e. any
* pmap can be used instead of pmap_kernel() for our purpose.
*
* - ci->ci_pmap
* pmap currently loaded on the cpu.
*
* - vm_map_pmap(&curproc->p_vmspace->vm_map)
* current process' pmap.
*
* => caller must lock pmap first (if not the kernel pmap)
* => must be undone with pmap_unmap_ptes before returning
* => disables kernel preemption
*/
void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
pd_entry_t * const **pdeppp)
{
struct pmap *curpmap;
struct cpu_info *ci;
lwp_t *l;
kpreempt_disable();
/* The kernel's pmap is always accessible. */
if (pmap == pmap_kernel()) {
*pmap2 = NULL;
*ptepp = PTE_BASE;
*pdeppp = normal_pdes;
return;
}
KASSERT(mutex_owned(&pmap->pm_lock));
l = curlwp;
ci = l->l_cpu;
curpmap = ci->ci_pmap;
if (pmap == curpmap) {
/*
* Already on the CPU: make it valid. This is very
* often the case during exit(), when we have switched
* to the kernel pmap in order to destroy a user pmap.
*/
if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
pmap_reactivate(pmap);
}
*pmap2 = NULL;
} else {
/*
* Toss current pmap from CPU and install new pmap, but keep
* a reference to the old one. Dropping the reference can
* block as it needs to take locks, so defer that to
* pmap_unmap_ptes().
*/
pmap_reference(pmap);
pmap_load1(l, pmap, curpmap);
*pmap2 = curpmap;
}
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
#ifdef DIAGNOSTIC
pmap->pm_pctr = lwp_pctr();
#endif
*ptepp = PTE_BASE;
#if defined(XENPV) && defined(__x86_64__)
KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
*pdeppp = ci->ci_normal_pdes;
#else
*pdeppp = normal_pdes;
#endif
}
/*
* pmap_unmap_ptes: unlock the PTE mapping of "pmap"
*
* => we cannot tolerate context switches while mapped in: assert this.
* => reenables kernel preemption.
* => does not unlock pmap.
*/
void
pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
{
struct cpu_info *ci;
struct pmap *mypmap;
struct lwp *l;
KASSERT(kpreempt_disabled());
/* The kernel's pmap is always accessible. */
if (pmap == pmap_kernel()) {
kpreempt_enable();
return;
}
l = curlwp;
ci = l->l_cpu;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(pmap->pm_pctr == lwp_pctr());
#if defined(XENPV) && defined(__x86_64__)
KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
#endif
/* If not our own pmap, mark whatever's on the CPU now as lazy. */
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
if (ci->ci_pmap == mypmap) {
ci->ci_want_pmapload = 0;
} else {
ci->ci_want_pmapload = (mypmap != pmap_kernel());
ci->ci_tlbstate = TLBSTATE_LAZY;
}
/* Now safe to re-enable preemption. */
kpreempt_enable();
/* Toss reference to other pmap taken earlier. */
if (pmap2 != NULL) {
pmap_destroy(pmap2);
}
}
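/*
 * Illustrative usage of the pair above for a user pmap "pm" (a sketch;
 * the actual callers are elsewhere in this file):
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pm->pm_lock);
 *	pmap_map_ptes(pm, &pmap2, &ptes, &pdes);
 *	... examine or update ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pm, pmap2);
 *	mutex_exit(&pm->pm_lock);
 */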
inline static void
pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
{
#if !defined(__x86_64__)
if (curproc == NULL || curproc->p_vmspace == NULL ||
pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
return;
if ((opte ^ npte) & PTE_X)
pmap_update_pg(va);
/*
* Executability was removed on the last executable change.
* Reset the code segment to something conservative and
* let the trap handler deal with setting the right limit.
* We can't do that because of locking constraints on the vm map.
*/
if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
struct trapframe *tf = curlwp->l_md.md_regs;
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
pm->pm_hiexec = I386_MAX_EXE_ADDR;
}
#endif /* !defined(__x86_64__) */
}
#if !defined(__x86_64__)
/*
* Fixup the code segment to cover all potential executable mappings.
* returns 0 if no changes to the code segment were made.
*/
int
pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
{
struct vm_map_entry *ent;
struct pmap *pm = vm_map_pmap(map);
vaddr_t va = 0;
vm_map_lock_read(map);
for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
/*
* This entry has greater va than the entries before.
* We need to make it point to the last page, not past it.
*/
if (ent->protection & VM_PROT_EXECUTE)
va = trunc_page(ent->end) - PAGE_SIZE;
}
vm_map_unlock_read(map);
if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
return 0;
pm->pm_hiexec = va;
if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
} else {
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
return 0;
}
return 1;
}
#endif /* !defined(__x86_64__) */
void
pat_init(struct cpu_info *ci)
{
#ifndef XENPV
uint64_t pat;
if (!(ci->ci_feat_val[0] & CPUID_PAT))
return;
/* We change WT to WC. Leave all other entries the default values. */
pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
wrmsr(MSR_CR_PAT, pat);
cpu_pat_enabled = true;
#endif
}
static pt_entry_t
pmap_pat_flags(u_int flags)
{
u_int cacheflags = (flags & PMAP_CACHE_MASK);
if (!cpu_pat_enabled) {
switch (cacheflags) {
case PMAP_NOCACHE:
case PMAP_NOCACHE_OVR:
/*
* Results in PGC_UCMINUS on CPUs which report
* PAT in cpuid but have PAT "disabled".
*/
return PTE_PCD;
default:
return 0;
}
}
switch (cacheflags) {
case PMAP_NOCACHE:
return PGC_UC;
case PMAP_WRITE_COMBINE:
return PGC_WC;
case PMAP_WRITE_BACK:
return PGC_WB;
case PMAP_NOCACHE_OVR:
return PGC_UCMINUS;
}
return 0;
}
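/*
 * Example (illustrative): a caller mapping a frame buffer passes
 * PMAP_WRITE_COMBINE, which on a CPU where pat_init() ran selects PGC_WC
 * and hence the WC entries programmed into the PAT above; without PAT
 * support the request falls through to 0, i.e. default write-back
 * caching.
 */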
/*
* p m a p k e n t e r f u n c t i o n s
*
* functions to quickly enter/remove pages from the kernel address
* space. pmap_kremove is exported to MI kernel. we make use of
* the recursive PTE mappings.
*/
/*
* pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
*
* => no need to lock anything, assume va is already allocated
* => should be faster than normal pmap enter function
*/
void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
pt_entry_t *pte, opte, npte;
KASSERT(!(prot & ~VM_PROT_ALL));
if (va < VM_MIN_KERNEL_ADDRESS)
pte = vtopte(va);
else
pte = kvtopte(va);
#if defined(XENPV) && defined(DOM0OPS)
if (pa < pmap_pa_start || pa >= pmap_pa_end) {
#ifdef DEBUG
printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
" outside range\n", __func__, pa, va);
#endif /* DEBUG */
npte = pa;
} else
#endif /* XENPV && DOM0OPS */
npte = pmap_pa2pte(pa);
npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
npte |= pmap_pat_flags(flags);
opte = pmap_pte_testset(pte, npte); /* zap! */
/*
* XXX: make sure we are not dealing with a large page, since the only
* large pages created are for the kernel image, and they should never
* be kentered.
*/
KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
/* This should not happen. */
printf_nolog("%s: mapping already present\n", __func__);
kpreempt_disable();
pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
kpreempt_enable();
}
}
__strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
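/*
 * Illustrative use of pmap_kenter_pa() (a sketch, not code from this
 * file): mapping one physical page uncached at a kernel va the caller
 * already owns, then flushing the deferred state:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, PMAP_NOCACHE);
 *	pmap_update(pmap_kernel());
 */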
#if defined(__x86_64__)
/*
* Change protection for a virtual address. Local for a CPU only, don't
* care about TLB shootdowns.
*
* => must be called with preemption disabled
*/
void
pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
{
pt_entry_t *pte, opte, npte;
KASSERT(kpreempt_disabled());
if (va < VM_MIN_KERNEL_ADDRESS)
pte = vtopte(va);
else
pte = kvtopte(va);
npte = opte = *pte;
if ((prot & VM_PROT_WRITE) != 0)
npte |= PTE_W;
else
npte &= ~(PTE_W|PTE_D);
if (opte != npte) {
pmap_pte_set(pte, npte);
pmap_pte_flush();
invlpg(va);
}
}
#endif /* defined(__x86_64__) */
/*
* pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
*
* => no need to lock anything
* => caller must dispose of any vm_page mapped in the va range
* => note: not an inline function
* => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
* => we assume kernel only unmaps valid addresses and thus don't bother
* checking the valid bit before doing TLB flushing
* => must be followed by call to pmap_update() before reuse of page
*/
static void
pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
{
pt_entry_t *pte, opte;
vaddr_t va, eva;
eva = sva + len;
kpreempt_disable();
for (va = sva; va < eva; va += PAGE_SIZE) {
pte = kvtopte(va);
opte = pmap_pte_testset(pte, 0); /* zap! */
if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
pmap_tlb_shootdown(pmap_kernel(), va, opte,
TLBSHOOT_KREMOVE);
}
KASSERTMSG((opte & PTE_PS) == 0,
"va %#" PRIxVADDR " is a large page", va);
KASSERTMSG((opte & PTE_PVLIST) == 0,
"va %#" PRIxVADDR " is a pv tracked page", va);
}
if (localonly) {
tlbflushg();
}
kpreempt_enable();
}
void
pmap_kremove(vaddr_t sva, vsize_t len)
{
pmap_kremove1(sva, len, false);
}
/*
* pmap_kremove_local: like pmap_kremove(), but only worry about
* TLB invalidations on the current CPU. this is only intended
* for use while writing kernel crash dumps, either after panic
* or via reboot -d.
*/
void
pmap_kremove_local(vaddr_t sva, vsize_t len)
{
pmap_kremove1(sva, len, true);
}
/*
* p m a p i n i t f u n c t i o n s
*
* pmap_bootstrap and pmap_init are called during system startup
* to init the pmap module. pmap_bootstrap() does a low level
* init just to get things rolling. pmap_init() finishes the job.
*/
/*
* pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
* This function is to be used before any VM system has been set up.
*
* The va is taken from virtual_avail.
*/
static vaddr_t
pmap_bootstrap_valloc(size_t npages)
{
vaddr_t va = virtual_avail;
virtual_avail += npages * PAGE_SIZE;
return va;
}
/*
* pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
* This function is to be used before any VM system has been set up.
*
* The pa is taken from avail_start.
*/
static paddr_t
pmap_bootstrap_palloc(size_t npages)
{
paddr_t pa = avail_start;
avail_start += npages * PAGE_SIZE;
return pa;
}
/*
* pmap_bootstrap: get the system in a state where it can run with VM properly
* enabled (called before main()). The VM system is fully init'd later.
*
* => on i386, locore.S has already enabled the MMU by allocating a PDP for the
* kernel, and nkpde PTP's for the kernel.
* => kva_start is the first free virtual address in kernel space.
*/
void
pmap_bootstrap(vaddr_t kva_start)
{
struct pmap *kpm;
int i;
vaddr_t kva;
pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
/*
* Set up our local static global vars that keep track of the usage of
* KVM before kernel_map is set up.
*/
virtual_avail = kva_start; /* first free KVA */
virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
/*
* Set up protection_codes: we need to be able to convert from a MI
* protection code (some combo of VM_PROT...) to something we can jam
* into a x86 PTE.
*/
protection_codes[VM_PROT_NONE] = pmap_pg_nx;
protection_codes[VM_PROT_EXECUTE] = PTE_X;
protection_codes[VM_PROT_READ] = pmap_pg_nx;
protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
/*
* Now we init the kernel's pmap.
*
* The kernel pmap's pm_obj is not used for much. However, in user pmaps
* the pm_obj contains the list of active PTPs.
*/
kpm = pmap_kernel();
mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&kpm->pm_dummy_lock);
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
kpm->pm_ptphint[i] = NULL;
}
memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
for (i = 0; i < PDP_SIZE; i++)
kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
kcpuset_create(&kpm->pm_cpus, true);
kcpuset_create(&kpm->pm_kernel_cpus, true);
kpm->pm_ldt = NULL;
kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
/*
* the above is just a rough estimate and not critical to the proper
* operation of the system.
*/
#if !defined(XENPV)
/*
* Begin to enable global TLB entries if they are supported: add PTE_G
* attribute to already mapped kernel pages. Do that only if SVS is
* disabled.
*
* The G bit has no effect until the CR4_PGE bit is set in CR4, which
* happens later in cpu_init().
*/
#ifdef SVS
if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
#else
if (cpu_feature[0] & CPUID_PGE) {
#endif
pmap_pg_g = PTE_G;
pmap_remap_global();
}
#endif
#ifndef XENPV
/*
* Enable large pages if they are supported.
*/
if (cpu_feature[0] & CPUID_PSE) {
lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
pmap_largepages = 1; /* enable software */
/*
* The TLB must be flushed after enabling large pages on Pentium
* CPUs, according to section 3.6.2.2 of "Intel Architecture
* Software Developer's Manual, Volume 3: System Programming".
*/
tlbflushg();
/* Remap the kernel. */
pmap_remap_largepages();
}
pmap_init_lapic();
#endif /* !XENPV */
#ifdef __HAVE_PCPU_AREA
pmap_init_pcpu();
#endif
#ifdef __HAVE_DIRECT_MAP
pmap_init_directmap(kpm);
#else
pmap_vpage_cpualloc(&cpu_info_primary);
if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
} else { /* amd64 */
/*
* zero_pte is stuck at the end of mapped space for the kernel
* image (disjunct from kva space). This is done so that it
* can safely be used in pmap_growkernel (pmap_get_physpage),
* when it's called for the first time.
* XXXfvdl fix this for MULTIPROCESSOR later.
*/
#ifdef XENPV
/* early_zerop initialized in xen_locore() */
#else
early_zerop = (void *)bootspace.spareva;
#endif
early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
}
#endif
#if defined(XENPV) && defined(__x86_64__)
extern vaddr_t xen_dummy_page;
paddr_t xen_dummy_user_pgd;
/*
* We want a dummy page directory for Xen: when deactivating a pmap,
* Xen will still consider it active. So we set user PGD to this one
* to lift all protection on the now inactive page tables set.
*/
xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
/* Zero fill it, the less checks in Xen it requires the better */
memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
/* Mark read-only */
HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
UVMF_INVLPG);
/* Pin as L4 */
xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
#endif
/*
* Allocate space for the Interrupt Descriptor Table (IDT),
* Global Descriptor Table (GDT), and Local Descriptor Table
* (LDT).
*
* Currently there is an initial temporary GDT allocated on the
* stack by the caller of init386/init_x86_64, which is (among
* other things) needed on i386 for %fs-relative addressing for
* CPU-local data (CPUVAR(...), curcpu(), curlwp). This
* initial temporary GDT will be popped off the stack before we
* can enter main, so we need to make sure there is space for a
* second temporary GDT to continue existing when we enter main
* before we allocate space for the permanent GDT with
* uvm_km(9) in gdt_init via cpu_startup and switch to that.
*/
idt_vaddr = pmap_bootstrap_valloc(1);
idt_paddr = pmap_bootstrap_palloc(1);
gdt_vaddr = pmap_bootstrap_valloc(1);
gdt_paddr = pmap_bootstrap_palloc(1);
#ifdef __HAVE_PCPU_AREA
ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
#else
ldt_vaddr = pmap_bootstrap_valloc(1);
#endif
ldt_paddr = pmap_bootstrap_palloc(1);
#if !defined(__x86_64__)
/* pentium f00f bug stuff */
pentium_idt_vaddr = pmap_bootstrap_valloc(1);
#endif
#if defined(XENPVHVM)
/* XXX: move to hypervisor.c with appropriate API adjustments */
extern paddr_t HYPERVISOR_shared_info_pa;
extern volatile struct xencons_interface *xencons_interface; /* XXX */
extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
if (vm_guest != VM_GUEST_XENPVH) {
HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
}
xencons_interface = (void *) pmap_bootstrap_valloc(1);
xenstore_interface = (void *) pmap_bootstrap_valloc(1);
#endif
/*
* Now we reserve some VM for mapping pages when doing a crash dump.
*/
virtual_avail = reserve_dumppages(virtual_avail);
/*
* Init the global lock and global list.
*/
mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&pmaps);
/*
* Ensure the TLB is sync'd with reality by flushing it...
*/
tlbflushg();
/*
* Calculate pmap_maxkvaddr from nkptp[].
*/
kva = VM_MIN_KERNEL_ADDRESS;
for (i = PTP_LEVELS - 1; i >= 1; i--) {
kva += nkptp[i] * nbpd[i];
}
pmap_maxkvaddr = kva;
}
#ifndef XENPV
static void
pmap_init_lapic(void)
{
/*
* On CPUs that have no LAPIC, local_apic_va is never kentered. But our
* x86 implementation relies a lot on this address to be valid; so just
* allocate a fake physical page that will be kentered into
* local_apic_va by machdep.
*
* If the LAPIC is present, the va will be remapped somewhere else
* later in lapic_map.
*/
local_apic_va = pmap_bootstrap_valloc(1);
local_apic_pa = pmap_bootstrap_palloc(1);
}
#endif
#ifdef __x86_64__
static size_t
pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
{
size_t npages;
npages = (roundup(endva, pgsz) / pgsz) -
(rounddown(startva, pgsz) / pgsz);
return npages;
}
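/*
 * Example: a range that starts just below and ends just above a single
 * pgsz boundary is rounded into adjacent slots and the function returns
 * 2; a range contained entirely within one slot returns 1.
 */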
#endif
#if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
static inline void
slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
{
size_t sslot = slotspace.area[type].sslot;
size_t nslot = slotspace.area[type].nslot;
memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
}
#endif
#ifdef __x86_64__
/*
* Randomize the location of an area. We count the holes in the VM space. We
* randomly select one hole, and then randomly select an area within that hole.
* Finally we update the associated entry in the slotspace structure.
*/
vaddr_t
slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
vaddr_t randva)
{
struct {
int start;
int end;
} holes[SLSPACE_NAREAS+1];
size_t i, nholes, hole;
size_t startsl, endsl, nslots, winsize;
vaddr_t startva, va;
sz = roundup(sz, align);
/*
* Take one more slot with +NBPD_L4, because we may end up choosing
* an area that crosses slots:
* +------+------+------+
* | Slot | Slot | Slot |
* +------+------+------+
* [Chosen Area]
* And in that case we must take into account the additional slot
* consumed.
*/
nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
/* Get the holes. */
nholes = 0;
size_t curslot = 0 + 256; /* end of SLAREA_USER */
while (1) {
/*
* Find the first occupied slot after the current one.
* The area between the two is a hole.
*/
size_t minsslot = 512;
size_t minnslot = 0;
for (i = 0; i < SLSPACE_NAREAS; i++) {
if (!slotspace.area[i].active)
continue;
if (slotspace.area[i].sslot >= curslot &&
slotspace.area[i].sslot < minsslot) {
minsslot = slotspace.area[i].sslot;
minnslot = slotspace.area[i].nslot;
}
}
/* No hole anymore, stop here. */
if (minsslot == 512) {
break;
}
/* Register the hole. */
if (minsslot - curslot >= nslots) {
holes[nholes].start = curslot;
holes[nholes].end = minsslot;
nholes++;
}
/* Skip that hole, and iterate again. */
curslot = minsslot + minnslot;
}
if (nholes == 0) {
panic("%s: impossible", __func__);
}
/* Select a hole. */
hole = randhole;
#ifdef NO_X86_ASLR
hole = 0;
#endif
hole %= nholes;
startsl = holes[hole].start;
endsl = holes[hole].end;
startva = VA_SIGN_NEG(startsl * NBPD_L4);
/* Select an area within the hole. */
va = randva;
#ifdef NO_X86_ASLR
va = 0;
#endif
winsize = ((endsl - startsl) * NBPD_L4) - sz;
va %= winsize;
va = rounddown(va, align);
va += startva;
/* Update the entry. */
slotspace.area[type].sslot = pl4_i(va);
slotspace.area[type].nslot =
pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
slotspace.area[type].active = true;
return va;
}
#endif
#ifdef __HAVE_PCPU_AREA
static void
pmap_init_pcpu(void)
{
const vaddr_t startva = PMAP_PCPU_BASE;
size_t nL4e, nL3e, nL2e, nL1e;
size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
paddr_t pa;
vaddr_t endva;
vaddr_t tmpva;
pt_entry_t *pte;
size_t size;
int i;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
size = sizeof(struct pcpu_area);
endva = startva + size;
/* We will use this temporary va. */
tmpva = bootspace.spareva;
pte = PTE_BASE + pl1_i(tmpva);
/* Build L4 */
L4e_idx = pl4_i(startva);
nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
KASSERT(nL4e == 1);
for (i = 0; i < nL4e; i++) {
KASSERT(L4_BASE[L4e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L3 */
L3e_idx = pl3_i(startva);
nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
for (i = 0; i < nL3e; i++) {
KASSERT(L3_BASE[L3e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L2 */
L2e_idx = pl2_i(startva);
nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
for (i = 0; i < nL2e; i++) {
KASSERT(L2_BASE[L2e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L1 */
L1e_idx = pl1_i(startva);
nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
for (i = 0; i < nL1e; i++) {
/*
* Nothing to do, the PTEs will be entered via
* pmap_kenter_pa.
*/
KASSERT(L1_BASE[L1e_idx+i] == 0);
}
*pte = 0;
pmap_update_pg(tmpva);
pcpuarea = (struct pcpu_area *)startva;
tlbflush();
}
#endif
#ifdef __HAVE_DIRECT_MAP
static void
randomize_hole(size_t *randholep, vaddr_t *randvap)
{
struct nist_hash_drbg drbg;
uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
const char p[] = "x86/directmap";
int error;
entropy_extract(seed, sizeof(seed), 0);
error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
/*nonce*/NULL, 0,
/*personalization*/p, strlen(p));
KASSERTMSG(error == 0, "error=%d", error);
error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
/*additional*/NULL, 0);
KASSERTMSG(error == 0, "error=%d", error);
error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
/*additional*/NULL, 0);
KASSERTMSG(error == 0, "error=%d", error);
explicit_memset(seed, 0, sizeof(seed));
explicit_memset(&drbg, 0, sizeof(drbg));
}
/*
* Create the amd64 direct map. Called only once at boot time. We map all of
* the physical memory contiguously using 2MB large pages, with RW permissions.
* However there is a hole: the kernel is mapped with RO permissions.
*/
static void
pmap_init_directmap(struct pmap *kpm)
{
extern phys_ram_seg_t mem_clusters[];
extern int mem_cluster_cnt;
vaddr_t startva;
size_t nL4e, nL3e, nL2e;
size_t L4e_idx, L3e_idx, L2e_idx;
size_t spahole, epahole;
paddr_t lastpa, pa;
vaddr_t endva;
vaddr_t tmpva;
pt_entry_t *pte;
phys_ram_seg_t *mc;
int i;
size_t randhole;
vaddr_t randva;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
spahole = roundup(bootspace.head.pa, NBPD_L2);
epahole = rounddown(bootspace.boot.pa, NBPD_L2);
/* Get the last physical address available */
lastpa = 0;
for (i = 0; i < mem_cluster_cnt; i++) {
mc = &mem_clusters[i];
lastpa = MAX(lastpa, mc->start + mc->size);
}
/*
* x86_add_cluster should have truncated the memory to MAXPHYSMEM.
*/
if (lastpa > MAXPHYSMEM) {
panic("pmap_init_directmap: lastpa incorrect");
}
randomize_hole(&randhole, &randva);
startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
randhole, randva);
endva = startva + lastpa;
/* We will use this temporary va. */
tmpva = bootspace.spareva;
pte = PTE_BASE + pl1_i(tmpva);
/* Build L4 */
L4e_idx = pl4_i(startva);
nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
KASSERT(nL4e <= NL4_SLOT_DIRECT);
for (i = 0; i < nL4e; i++) {
KASSERT(L4_BASE[L4e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L3 */
L3e_idx = pl3_i(startva);
nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
for (i = 0; i < nL3e; i++) {
KASSERT(L3_BASE[L3e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L2 */
L2e_idx = pl2_i(startva);
nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
for (i = 0; i < nL2e; i++) {
KASSERT(L2_BASE[L2e_idx+i] == 0);
pa = (paddr_t)(i * NBPD_L2);
if (spahole <= pa && pa < epahole) {
L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
PTE_PS | pmap_pg_g;
} else {
L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
PTE_PS | pmap_pg_g;
}
}
*pte = 0;
pmap_update_pg(tmpva);
pmap_direct_base = startva;
pmap_direct_end = endva;
tlbflush();
}
#endif /* __HAVE_DIRECT_MAP */
#if !defined(XENPV)
/*
* Remap all of the virtual pages created so far with the PTE_G bit.
*/
static void
pmap_remap_global(void)
{
vaddr_t kva, kva_end;
unsigned long p1i;
size_t i;
/* head */
kva = bootspace.head.va;
kva_end = kva + bootspace.head.sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
/* kernel segments */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type == BTSEG_NONE) {
continue;
}
kva = bootspace.segs[i].va;
kva_end = kva + bootspace.segs[i].sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
}
/* boot space */
kva = bootspace.boot.va;
kva_end = kva + bootspace.boot.sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
}
#endif
#ifndef XENPV
/*
* Remap several kernel segments with large pages. We cover as many pages as we
* can. Called only once at boot time, if the CPU supports large pages.
*/
static void
pmap_remap_largepages(void)
{
pd_entry_t *pde;
vaddr_t kva, kva_end;
paddr_t pa;
size_t i;
/* Remap the kernel text using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_TEXT) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
tlbflushg();
}
}
/* Remap the kernel rodata using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_RODATA) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
tlbflushg();
}
}
/* Remap the kernel data+bss using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_DATA) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
tlbflushg();
}
}
}
#endif /* !XENPV */
/*
* pmap_init: called from uvm_init, our job is to get the pmap system ready
* to manage mappings.
*/
void
pmap_init(void)
{
int flags;
/*
* initialize caches.
*/
pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
#ifdef XENPV
/*
* pool_cache(9) should not touch cached objects, since they
* are pinned on xen and R/O for the domU
*/
flags = PR_NOTOUCH;
#else
flags = 0;
#endif
#ifdef PAE
pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
"pdppl", &pmap_pdp_allocator, IPL_NONE);
#else
pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
"pdppl", NULL, IPL_NONE);
#endif
pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
0, 0, "pvpage", &pool_allocator_kmem,
IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
pmap_tlb_init();
/* XXX: Since cpu_hatch() is only for secondary CPUs. */
pmap_tlb_cpu_init(curcpu());
evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
NULL, "x86", "io bitmap copy");
evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
NULL, "x86", "ldt sync");
/*
* The kernel doesn't keep track of PTPs, so there's nowhere handy
* to hang a tree of pv_entry records. Dynamically allocated
* pv_entry lists are not heavily used in the kernel's pmap (the
* usual case is embedded), so cop out and use a single RB tree
* to cover them.
*/
rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
/*
* done: pmap module is up (and ready for business)
*/
pmap_initialized = true;
}
#ifndef XENPV
/*
* pmap_cpu_init_late: perform late per-CPU initialization.
*/
void
pmap_cpu_init_late(struct cpu_info *ci)
{
/*
* The BP has already its own PD page allocated during early
* MD startup.
*/
if (ci == &cpu_info_primary)
return;
#ifdef PAE
cpu_alloc_l3_page(ci);
#endif
}
#endif
#ifndef __HAVE_DIRECT_MAP
CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
static void
pmap_vpage_cpualloc(struct cpu_info *ci)
{
bool primary = (ci == &cpu_info_primary);
size_t i, npages;
vaddr_t vabase;
vsize_t vrange;
npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
KASSERT(npages >= VPAGE_MAX);
vrange = npages * PAGE_SIZE;
if (primary) {
while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
/* Waste some pages to align properly */
}
/* The base is aligned, allocate the rest (contiguous) */
pmap_bootstrap_valloc(npages - 1);
} else {
vabase = uvm_km_alloc(kernel_map, vrange, vrange,
UVM_KMF_VAONLY);
if (vabase == 0) {
panic("%s: failed to allocate tmp VA for CPU %d\n",
__func__, cpu_index(ci));
}
}
KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
for (i = 0; i < VPAGE_MAX; i++) {
ci->vpage[i] = vabase + i * PAGE_SIZE;
ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
}
}
void
pmap_vpage_cpu_init(struct cpu_info *ci)
{
if (ci == &cpu_info_primary) {
/* cpu0 already taken care of in pmap_bootstrap */
return;
}
pmap_vpage_cpualloc(ci);
}
#endif
/*
* p v _ e n t r y f u n c t i o n s
*/
/*
* pmap_pvp_ctor: pool_cache constructor for PV pages.
*/
static int
pmap_pvp_ctor(void *arg, void *obj, int flags)
{
struct pv_page *pvp = (struct pv_page *)obj;
struct pv_entry *pve = (struct pv_entry *)obj + 1;
struct pv_entry *maxpve = pve + PVE_PER_PVP;
KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
LIST_INIT(&pvp->pvp_pves);
pvp->pvp_nfree = PVE_PER_PVP;
pvp->pvp_pmap = NULL;
for (; pve < maxpve; pve++) {
LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
}
return 0;
}
/*
* pmap_pvp_dtor: pool_cache destructor for PV pages.
*/
static void
pmap_pvp_dtor(void *arg, void *obj)
{
struct pv_page *pvp __diagused = obj;
KASSERT(pvp->pvp_pmap == NULL);
KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
}
/*
* pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
*/
static struct pv_entry *
pmap_alloc_pv(struct pmap *pmap)
{
struct pv_entry *pve;
struct pv_page *pvp;
KASSERT(mutex_owned(&pmap->pm_lock));
if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
LIST_REMOVE(pvp, pvp_list);
} else {
pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
}
if (__predict_false(pvp == NULL)) {
return NULL;
}
/* full -> part */
LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
pvp->pvp_pmap = pmap;
}
	KASSERT(pvp->pvp_pmap == pmap);
	KASSERT(pvp->pvp_nfree > 0);
pve = LIST_FIRST(&pvp->pvp_pves);
LIST_REMOVE(pve, pve_list);
pvp->pvp_nfree--;
if (__predict_false(pvp->pvp_nfree == 0)) {
/* part -> empty */
		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
		LIST_REMOVE(pvp, pvp_list);
		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
} else {
KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
}
return pve;
}
/*
* pmap_free_pv: delayed free of a PV entry.
*/
static void
pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
{
struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(pvp->pvp_pmap == pmap);
KASSERT(pvp->pvp_nfree >= 0);
LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
pvp->pvp_nfree++;
if (__predict_false(pvp->pvp_nfree == 1)) {
/* empty -> part */
LIST_REMOVE(pvp, pvp_list);
LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
/* part -> full */
LIST_REMOVE(pvp, pvp_list);
LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
}
}
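/*
 * Taken together, pmap_alloc_pv() and pmap_free_pv() move PV pages
 * between the per-pmap lists according to how many entries are still
 * free on the page (a restatement of the transitions coded above):
 *
 *	pm_pvp_full	all PVE_PER_PVP entries free (drained to the cache)
 *	pm_pvp_part	some entries free (allocations come from here first)
 *	pm_pvp_empty	no entries free
 *
 *	alloc: full/new -> part, then part -> empty when the last entry goes
 *	free:  empty -> part, then part -> full when all entries are back
 */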
/*
* pmap_drain_pv: free full PV pages.
*/
static void
pmap_drain_pv(struct pmap *pmap)
{
struct pv_page *pvp;
	KASSERT(mutex_owned(&pmap->pm_lock));
	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
		LIST_REMOVE(pvp, pvp_list);
		KASSERT(pvp->pvp_pmap == pmap);
		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
pvp->pvp_pmap = NULL;
pool_cache_put(&pmap_pvp_cache, pvp);
}
}
/*
* pmap_check_pv: verify that the {VA, PTP} pair is, or is not, tracked by
* the page, as the caller expects
*/
static void
pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
vaddr_t va, bool tracked)
{
#ifdef DEBUG
struct pv_pte *pvpte;
PMAP_CHECK_PP(pp);
mutex_spin_enter(&pp->pp_lock);
	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
break;
}
}
mutex_spin_exit(&pp->pp_lock);
	if (pvpte && !tracked) {
		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
	} else if (!pvpte && tracked) {
		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
}
#endif
}
/*
* pmap_treelookup_pv: search the PV tree for a dynamic entry
*
* => pmap must be locked
*/
static struct pv_entry *
pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
const rb_tree_t *tree, const vaddr_t va)
{
struct pv_entry *pve;
rb_node_t *node;
/*
* Inlined lookup tailored for exactly what's needed here that is
* quite a bit faster than using rb_tree_find_node().
*/
for (node = tree->rbt_root;;) {
if (__predict_false(RB_SENTINEL_P(node))) {
return NULL;
}
pve = (struct pv_entry *)
((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
if (pve->pve_pte.pte_va == va) {
KASSERT(pve->pve_pte.pte_ptp == ptp);
return pve;
}
node = node->rb_nodes[pve->pve_pte.pte_va < va];
}
}
/*
* pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
*
* => a PV entry must be known present (doesn't check for existence)
* => pmap must be locked
*/
static struct pv_entry *
pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
const struct pmap_page * const old_pp, const vaddr_t va)
{
struct pv_entry *pve;
const rb_tree_t *tree;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp != NULL || pmap == pmap_kernel());
/*
* [This mostly deals with the case of process-private pages, i.e.
* anonymous memory allocations or COW.]
*
* If the page is tracked with an embedded entry then the tree
* lookup can be avoided. It's safe to check for this specific
* set of values without pp_lock because both will only ever be
* set together for this pmap.
*
*/
	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
return NULL;
}
/*
* [This mostly deals with shared mappings, for example shared libs
* and executables.]
*
* Optimise for pmap_remove_ptes() which works by ascending scan:
* look at the lowest numbered node in the tree first. The tree is
* known non-empty because of the check above. For short lived
* processes where pmap_remove() isn't used much this gets close to
* a 100% hit rate.
*/
tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
KASSERT(!RB_SENTINEL_P(tree->rbt_root));
pve = (struct pv_entry *)
((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
offsetof(struct pv_entry, pve_rb));
if (__predict_true(pve->pve_pte.pte_va == va)) {
KASSERT(pve->pve_pte.pte_ptp == ptp);
return pve;
}
/* Search the RB tree for the key (uncommon). */
return pmap_treelookup_pv(pmap, ptp, tree, va);
}
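/*
 * To summarize the two tracking schemes used here: the first mapping of a
 * page can be recorded directly in the page's pmap_page ("embedded",
 * pp_pte), which costs nothing to allocate; any further mappings of the
 * same page need a dynamically allocated pv_entry, linked on the page's
 * pp_pvlist and keyed by {PTP, VA} in the owning PTP's rb tree (or in
 * pmap_kernel_rb for kernel mappings, which have no PTP).
 */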
/*
* pmap_enter_pv: enter a mapping onto a pmap_page list
*
* => pmap must be locked
* => does NOT insert dynamic entries to tree (pmap_enter() does later)
*/
static int
pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
bool *samepage, bool *new_embedded, rb_tree_t *tree)
{
struct pv_entry *pve;
int error;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp_to_pmap(ptp) == pmap);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	PMAP_CHECK_PP(pp);
/*
* If entering the same page and it's already tracked with an
* embedded entry, we can avoid the expense below. It's safe
* to check for this very specific set of values without a lock
* because both will only ever be set together for this pmap.
*/
	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
*samepage = true;
pmap_check_pv(pmap, ptp, pp, va, true);
return 0;
}
/*
* Check for an existing dynamic mapping at this address. If it's
* for the same page, then it will be reused and nothing needs to be
* changed.
*/
	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
*samepage = true;
pmap_check_pv(pmap, ptp, pp, va, true);
return 0;
}
/*
* Need to put a new mapping in place. Grab a spare pv_entry in
* case it's needed; won't know for sure until the lock is taken.
*/
	if (pmap->pm_pve == NULL) {
		pmap->pm_pve = pmap_alloc_pv(pmap);
}
error = 0;
pmap_check_pv(pmap, ptp, pp, va, false);
mutex_spin_enter(&pp->pp_lock);
if (!pv_pte_embedded(pp)) {
/*
* Embedded PV tracking available - easy.
*/
pp->pp_pte.pte_ptp = ptp;
pp->pp_pte.pte_va = va;
*new_embedded = true;
} else if (__predict_false(pmap->pm_pve == NULL)) {
/*
* No memory.
*/
error = ENOMEM;
} else {
/*
* Install new pv_entry on the page.
*/
pve = pmap->pm_pve;
pmap->pm_pve = NULL;
*new_pve = pve;
pve->pve_pte.pte_ptp = ptp;
pve->pve_pte.pte_va = va;
pve->pve_pp = pp;
LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
}
mutex_spin_exit(&pp->pp_lock);
if (error == 0) {
pmap_check_pv(pmap, ptp, pp, va, true);
}
return error;
}
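/*
 * A typical caller uses the above roughly as sketched here; the precise
 * sequence lives in pmap_enter(), which inserts any new dynamic entry
 * into the rb tree only once the PTE has actually been written:
 *
 *	error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, &old_pve,
 *	    &samepage, &new_embedded, tree);
 *	if (error != 0)
 *		back out and fail (or wait and retry);
 *	install the PTE;
 *	if (new_pve != NULL)
 *		rb_tree_insert_node(tree, new_pve);
 */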
/*
* pmap_remove_pv: try to remove a mapping from a pv_list
*
* => pmap must be locked
* => removes dynamic entries from tree and frees them
* => caller should adjust ptp's wire_count and free PTP if needed
*/
static void
pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
{
rb_tree_t *tree = (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp_to_pmap(ptp) == pmap);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	KASSERT(ptp != NULL || pmap == pmap_kernel());
pmap_check_pv(pmap, ptp, pp, va, true);
if (pve == NULL) {
mutex_spin_enter(&pp->pp_lock);
		KASSERT(pp->pp_pte.pte_ptp == ptp);
		KASSERT(pp->pp_pte.pte_va == va);
pp->pp_attrs |= oattrs;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
mutex_spin_exit(&pp->pp_lock);
} else {
mutex_spin_enter(&pp->pp_lock);
KASSERT(pp->pp_pte.pte_ptp != ptp ||
pp->pp_pte.pte_va != va);
		KASSERT(pve->pve_pte.pte_ptp == ptp);
		KASSERT(pve->pve_pte.pte_va == va);
		KASSERT(pve->pve_pp == pp);
pp->pp_attrs |= oattrs;
LIST_REMOVE(pve, pve_list);
mutex_spin_exit(&pp->pp_lock);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
rb_tree_remove_node(tree, pve);
#ifdef DIAGNOSTIC
memset(pve, 0, sizeof(*pve));
#endif
pmap_free_pv(pmap, pve);
}
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
pmap_check_pv(pmap, ptp, pp, va, false);
}
/*
* p t p f u n c t i o n s
*/
static struct vm_page *
pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
{
int lidx = level - 1;
off_t off = ptp_va2o(va, level);
struct vm_page *pg;
	KASSERT(mutex_owned(&pmap->pm_lock));
	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
pg = pmap->pm_ptphint[lidx];
PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
return pg;
}
PMAP_DUMMY_LOCK(pmap);
pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
PMAP_DUMMY_UNLOCK(pmap);
if (pg != NULL && __predict_false(pg->wire_count == 0)) {
/* This page is queued to be freed - ignore. */
pg = NULL;
}
if (pg != NULL) {
PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
}
pmap->pm_ptphint[lidx] = pg;
return pg;
}
static inline void
pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
{
int lidx;
	KASSERT(ptp->wire_count <= 1);
	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
lidx = level - 1;
	pmap_stats_update(pmap, -ptp->wire_count, 0);
	if (pmap->pm_ptphint[lidx] == ptp)
		pmap->pm_ptphint[lidx] = NULL;
ptp->wire_count = 0;
ptp->uanon = NULL;
KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
/*
* Enqueue the PTP to be freed by pmap_update(). We can't remove
* the page from the uvm_object, as that can take further locks
* (intolerable right now because the PTEs are likely mapped in).
* Instead mark the PTP as free and if we bump into it again, we'll
* either ignore or reuse (depending on what's useful at the time).
*/
LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
}
static void
pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
pt_entry_t *ptes, pd_entry_t * const *pdes)
{
unsigned long index;
int level;
vaddr_t invaladdr;
pd_entry_t opde;
	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
level = 1;
do {
index = pl_i(va, level + 1);
opde = pmap_pte_testset(&pdes[level - 1][index], 0);
/*
* On Xen-amd64 or SVS, we need to sync the top level page
* directory on each CPU.
*/
#if defined(XENPV) && defined(__x86_64__)
if (level == PTP_LEVELS - 1) {
xen_kpm_sync(pmap, index);
}
#elif defined(SVS)
if (svs_enabled && level == PTP_LEVELS - 1 &&
pmap_is_user(pmap)) {
svs_pmap_sync(pmap, index);
}
#endif
invaladdr = level == 1 ? (vaddr_t)ptes :
(vaddr_t)pdes[level - 2];
pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
opde, TLBSHOOT_FREE_PTP);
#if defined(XENPV)
pmap_tlb_shootnow();
#endif
pmap_freepage(pmap, ptp, level);
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, level + 1);
ptp->wire_count--;
if (ptp->wire_count > 1)
break;
}
} while (++level < PTP_LEVELS);
pmap_pte_flush();
}
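/*
 * Note the upward walk above: each PTP freed has its PDE cleared in the
 * parent and the parent's wire count dropped; the loop keeps climbing and
 * freeing parents for as long as a parent ends up mapping nothing but the
 * child just freed (its wire count falls back to 1).
 */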
/*
* pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
* => we are not touching any PTEs yet, so they need not be mapped in
*/
static int
pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
int flags, struct vm_page **resultp)
{
struct vm_page *ptp;
int i, aflags;
struct uvm_object *obj;
voff_t off;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
/*
* Loop through all page table levels allocating a page
* for any level where we don't already have one.
*/
memset(pt, 0, sizeof(*pt));
aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
UVM_PGA_ZERO;
for (i = PTP_LEVELS; i > 1; i--) {
obj = &pmap->pm_obj[i - 2];
off = ptp_va2o(va, i - 1);
PMAP_DUMMY_LOCK(pmap);
pt->pg[i] = uvm_pagelookup(obj, off);
if (pt->pg[i] == NULL) {
pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
pt->alloced[i] = (pt->pg[i] != NULL);
} else if (pt->pg[i]->wire_count == 0) {
/* This page was queued to be freed; dequeue it. */
LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
pt->alloced[i] = true;
}
PMAP_DUMMY_UNLOCK(pmap);
if (pt->pg[i] == NULL) {
pmap_unget_ptp(pmap, pt);
return ENOMEM;
} else if (pt->alloced[i]) {
pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
&pmap_rbtree_ops);
PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
}
}
ptp = pt->pg[2];
KASSERT(ptp != NULL);
*resultp = ptp;
pmap->pm_ptphint[0] = ptp;
return 0;
}
/*
* pmap_install_ptp: install any freshly allocated PTPs
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
* => PTEs must be mapped
* => preemption must be disabled
*/
static void
pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
pd_entry_t * const *pdes)
{
struct vm_page *ptp;
unsigned long index;
pd_entry_t *pva;
paddr_t pa;
int i;
	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
/*
* Now that we have all the pages looked up or allocated,
* loop through again installing any new ones into the tree.
*/
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_i(va, i);
pva = pdes[i - 2];
if (pmap_valid_entry(pva[index])) {
KASSERT(!pt->alloced[i]);
continue;
}
ptp = pt->pg[i];
ptp->flags &= ~PG_BUSY; /* never busy */
ptp->wire_count = 1;
pmap->pm_ptphint[i - 2] = ptp;
pa = VM_PAGE_TO_PHYS(ptp);
pmap_pte_set(&pva[index], (pd_entry_t)
(pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
/*
* On Xen-amd64 or SVS, we need to sync the top level page
* directory on each CPU.
*/
#if defined(XENPV) && defined(__x86_64__)
if (i == PTP_LEVELS) {
xen_kpm_sync(pmap, index);
}
#elif defined(SVS)
if (svs_enabled && i == PTP_LEVELS &&
pmap_is_user(pmap)) {
svs_pmap_sync(pmap, index);
}
#endif
pmap_pte_flush();
pmap_stats_update(pmap, 1, 0);
/*
* If we're not in the top level, increase the
* wire count of the parent page.
*/
		if (i < PTP_LEVELS) {
			pt->pg[i + 1]->wire_count++;
}
}
}
/*
* pmap_unget_ptp: free unused PTPs
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
*/
static void
pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
{
int i;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
for (i = PTP_LEVELS; i > 1; i--) {
if (!pt->alloced[i]) {
continue;
}
KASSERT(pt->pg[i]->wire_count == 0);
PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
pmap_freepage(pmap, pt->pg[i], i - 1);
}
}
/*
* p m a p l i f e c y c l e f u n c t i o n s
*/
/*
* pmap_pdp_init: construct a new PDP.
*/
static void
pmap_pdp_init(pd_entry_t *pdir)
{
paddr_t pdirpa = 0;
vaddr_t object;
int i;
#if !defined(XENPV) || !defined(__x86_64__)
int npde;
#endif
#ifdef XENPV
int s;
#endif
memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
/*
* NOTE: This is all done unlocked, but we will check afterwards
* if we have raced with pmap_growkernel().
*/
#if defined(XENPV) && defined(__x86_64__)
/* Fetch the physical address of the page directory */
(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
/*
* This pdir will NEVER be active in kernel mode, so mark
* recursive entry invalid.
*/
pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
/*
* PDP constructed this way won't be for the kernel, hence we
* don't put kernel mappings on Xen.
*
* But we need to make pmap_create() happy, so put a dummy
* (without PTE_P) value at the right place.
*/
pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
(pd_entry_t)-1 & PTE_FRAME;
#else /* XENPV && __x86_64__*/
object = (vaddr_t)pdir;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* Fetch the physical address of the page directory */
(void)pmap_extract(pmap_kernel(), object, &pdirpa);
/* Put in recursive PDE to map the PTEs */
pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
pmap_pg_nx;
#ifndef XENPV
pdir[PDIR_SLOT_PTE + i] |= PTE_W;
#endif
}
/* Copy the kernel's top level PDE */
npde = nkptp[PTP_LEVELS - 1];
memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
npde * sizeof(pd_entry_t));
if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
int idx = pl_i(KERNBASE, PTP_LEVELS);
pdir[idx] = PDP_BASE[idx];
}
#ifdef __HAVE_PCPU_AREA
pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
#endif
#ifdef __HAVE_DIRECT_MAP
slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
#endif
#ifdef KASAN
slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
#endif
#ifdef KMSAN
slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
#endif
#endif /* XENPV && __x86_64__*/
#ifdef XENPV
s = splvm();
object = (vaddr_t)pdir;
pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
VM_PROT_READ);
pmap_update(pmap_kernel());
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/*
* pin as L2/L4 page, we have to do the page with the
* PDIR_SLOT_PTE entries last
*/
#ifdef PAE
if (i == l2tol3(PDIR_SLOT_PTE))
continue;
#endif
(void) pmap_extract(pmap_kernel(), object, &pdirpa);
#ifdef __x86_64__
xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
#else
xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
#endif
}
#ifdef PAE
object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
(void)pmap_extract(pmap_kernel(), object, &pdirpa);
xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
#endif
splx(s);
#endif /* XENPV */
}
/*
* pmap_pdp_fini: destructor for the PDPs.
*/
static void
pmap_pdp_fini(pd_entry_t *pdir)
{
#ifdef XENPV
paddr_t pdirpa = 0; /* XXX: GCC */
vaddr_t object = (vaddr_t)pdir;
int i;
int s = splvm();
pt_entry_t *pte;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* fetch the physical address of the page directory. */
(void) pmap_extract(pmap_kernel(), object, &pdirpa);
/* unpin page table */
xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
}
object = (vaddr_t)pdir;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* Set page RW again */
pte = kvtopte(object);
pmap_pte_set(pte, *pte | PTE_W);
xen_bcast_invlpg((vaddr_t)object);
}
splx(s);
#endif /* XENPV */
}
#ifdef PAE
static void *
pmap_pdp_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(kernel_map,
PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
UVM_KMF_WIRED);
}
static void
pmap_pdp_free(struct pool *pp, void *v)
{
uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
UVM_KMF_WIRED);
}
#endif /* PAE */
/*
* pmap_ctor: constructor for the pmap cache.
*/
static int
pmap_ctor(void *arg, void *obj, int flags)
{
struct pmap *pmap = obj;
pt_entry_t p;
int i;
KASSERT((flags & PR_WAITOK) != 0);
mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&pmap->pm_dummy_lock);
kcpuset_create(&pmap->pm_cpus, true);
kcpuset_create(&pmap->pm_kernel_cpus, true);
#ifdef XENPV
kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
#endif
LIST_INIT(&pmap->pm_gc_ptp);
pmap->pm_pve = NULL;
LIST_INIT(&pmap->pm_pvp_full);
LIST_INIT(&pmap->pm_pvp_part);
LIST_INIT(&pmap->pm_pvp_empty);
/* allocate and init PDP */
pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
for (;;) {
pmap_pdp_init(pmap->pm_pdir);
mutex_enter(&pmaps_lock);
p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
if (__predict_true(p != 0)) {
break;
}
mutex_exit(&pmaps_lock);
}
for (i = 0; i < PDP_SIZE; i++)
pmap->pm_pdirpa[i] =
pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
mutex_exit(&pmaps_lock);
return 0;
}
/*
* pmap_dtor: destructor for the pmap cache.
*/
static void
pmap_dtor(void *arg, void *obj)
{
struct pmap *pmap = obj;
mutex_enter(&pmaps_lock);
LIST_REMOVE(pmap, pm_list);
mutex_exit(&pmaps_lock);
pmap_pdp_fini(pmap->pm_pdir);
pool_put(&pmap_pdp_pool, pmap->pm_pdir);
mutex_destroy(&pmap->pm_lock);
rw_destroy(&pmap->pm_dummy_lock);
kcpuset_destroy(pmap->pm_cpus);
kcpuset_destroy(pmap->pm_kernel_cpus);
#ifdef XENPV
kcpuset_destroy(pmap->pm_xen_ptp_cpus);
#endif
}
/*
* pmap_create: create a pmap object.
*/
struct pmap *
pmap_create(void)
{
struct pmap *pmap;
int i;
pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
/* init uvm_object */
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
pmap->pm_ptphint[i] = NULL;
}
pmap->pm_stats.wired_count = 0;
/* count the PDP alloc'd below */
pmap->pm_stats.resident_count = PDP_SIZE;
#if !defined(__x86_64__)
pmap->pm_hiexec = 0;
#endif
/* Used by NVMM and Xen */
pmap->pm_enter = NULL;
pmap->pm_extract = NULL;
pmap->pm_remove = NULL;
pmap->pm_sync_pv = NULL;
pmap->pm_pp_remove_ent = NULL;
pmap->pm_write_protect = NULL;
pmap->pm_unwire = NULL;
pmap->pm_tlb_flush = NULL;
pmap->pm_data = NULL;
/* init the LDT */
pmap->pm_ldt = NULL;
pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
return pmap;
}
/*
* pmap_check_ptps: verify that none of the pmap's page table objects
* have any pages allocated to them.
*/
static void
pmap_check_ptps(struct pmap *pmap)
{
int i;
for (i = 0; i < PTP_LEVELS - 1; i++) {
KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
"pmap %p level %d still has %d pages",
pmap, i, (int)pmap->pm_obj[i].uo_npages);
}
}
static void
pmap_check_inuse(struct pmap *pmap)
{
#ifdef DEBUG
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_pmap == pmap)
panic("destroying pmap being used");
#if defined(XENPV) && defined(__x86_64__)
for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
if (pmap->pm_pdir[i] != 0 &&
ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
printf("pmap_destroy(%p) pmap_kernel %p "
"curcpu %d cpu %d ci_pmap %p "
"ci->ci_kpm_pdir[%d]=%" PRIx64
" pmap->pm_pdir[%d]=%" PRIx64 "\n",
pmap, pmap_kernel(), curcpu()->ci_index,
ci->ci_index, ci->ci_pmap,
i, ci->ci_kpm_pdir[i],
i, pmap->pm_pdir[i]);
panic("%s: used pmap", __func__);
}
}
#endif
}
#endif /* DEBUG */
}
/*
* pmap_destroy: drop reference count on pmap. free pmap if reference
* count goes to zero.
*
* => we can be called from pmap_unmap_ptes() with a different, unrelated
* pmap's lock held. be careful!
*/
void
pmap_destroy(struct pmap *pmap)
{
int i;
/*
* drop reference count and verify not in use.
*/
if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
return;
}
pmap_check_inuse(pmap);
/*
* handle any deferred frees.
*/
mutex_enter(&pmap->pm_lock);
	if (pmap->pm_pve != NULL) {
		pmap_free_pv(pmap, pmap->pm_pve);
pmap->pm_pve = NULL;
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
pmap_update(pmap);
/*
* Reference count is zero, free pmap resources and then free pmap.
*/
	pmap_check_ptps(pmap);
	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
#ifdef USER_LDT
if (pmap->pm_ldt != NULL) {
/*
* No need to switch the LDT; this address space is gone,
* nothing is using it.
*
* No need to lock the pmap for ldt_free (or anything else),
* we're the last one to use it.
*/
/* XXXAD can't take cpu_lock here - fix soon. */
mutex_enter(&cpu_lock);
ldt_free(pmap->pm_ldt_sel);
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
}
#endif
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_destroy(&pmap->pm_obj[i], false);
}
kcpuset_zero(pmap->pm_cpus);
kcpuset_zero(pmap->pm_kernel_cpus);
#ifdef XENPV
kcpuset_zero(pmap->pm_xen_ptp_cpus);
#endif
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
	pmap_check_ptps(pmap);
if (__predict_false(pmap->pm_enter != NULL)) {
/* XXX make this a different cache */
pool_cache_destruct_object(&pmap_cache, pmap);
} else {
pool_cache_put(&pmap_cache, pmap);
}
}
/*
* pmap_zap_ptp: clear out an entire PTP without modifying PTEs
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => must be called with kernel preemption disabled
* => does as little work as possible
*/
static void
pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t startva, vaddr_t blkendva)
{
#ifndef XENPV
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
rb_tree_t *tree;
vaddr_t va;
int wired;
uint8_t oattrs;
u_int cnt;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(ptp->wire_count > 1);
KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
/*
* Start at the lowest entered VA, and scan until there are no more
* PTEs in the PTPs.
*/
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
pve = RB_TREE_MIN(tree);
wired = 0;
va = (vaddr_t)ptp->uanon;
pte += ((va - startva) >> PAGE_SHIFT);
for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
/*
* No need for an atomic to clear the PTE. Nothing else can
* see the address space any more and speculative access (if
* possible) won't modify. Therefore there's no need to
* track the accessed/dirty bits.
*/
opte = *pte;
if (!pmap_valid_entry(opte)) {
continue;
}
/*
* Count the PTE. If it's not for a managed mapping
		 * there's nothing more to do.
*/
cnt--;
wired -= (opte & PTE_WIRED);
if ((opte & PTE_PVLIST) == 0) {
#ifndef DOM0OPS
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without PTE_PVLIST for %#"
PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without PTE_PVLIST for %#"
PRIxVADDR, va);
#endif
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
va) == NULL);
continue;
}
/*
* "pve" now points to the lowest (by VA) dynamic PV entry
* in the PTP. If it's for this VA, take advantage of it to
* avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB
* tree by skipping to the next VA in the tree whenever
* there is a match here. The tree will be cleared out in
* one pass before return to pmap_remove_all().
*/
oattrs = pmap_pte_to_pp_attrs(opte);
if (pve != NULL && pve->pve_pte.pte_va == va) {
pp = pve->pve_pp;
KASSERT(pve->pve_pte.pte_ptp == ptp);
KASSERT(pp->pp_pte.pte_ptp != ptp ||
pp->pp_pte.pte_va != va);
mutex_spin_enter(&pp->pp_lock);
pp->pp_attrs |= oattrs;
LIST_REMOVE(pve, pve_list);
mutex_spin_exit(&pp->pp_lock);
/*
* pve won't be touched again until pmap_drain_pv(),
* so it's still safe to traverse the tree.
*/
pmap_free_pv(pmap, pve);
pve = RB_TREE_NEXT(tree, pve);
continue;
}
/*
* No entry in the tree so it must be embedded. Look up the
* page and cancel the embedded entry.
*/
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
pp = VM_PAGE_TO_PP(pg);
} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR
"(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
}
mutex_spin_enter(&pp->pp_lock);
KASSERT(pp->pp_pte.pte_ptp == ptp);
KASSERT(pp->pp_pte.pte_va == va);
pp->pp_attrs |= oattrs;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
mutex_spin_exit(&pp->pp_lock);
}
/* PTP now empty - adjust the tree & stats to match. */
pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
ptp->wire_count = 1;
#ifdef DIAGNOSTIC
rb_tree_init(tree, &pmap_rbtree_ops);
#endif
#else /* !XENPV */
/*
* XXXAD For XEN, it's not clear to me that we can do this, because
* I guess the hypervisor keeps track of PTEs too.
*/
pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
#endif /* !XENPV */
}
/*
* pmap_remove_all: remove all mappings from pmap in bulk.
*
* Ordinarily when removing mappings it's important to hold the UVM object's
* lock, so that pages do not gain a new identity while retaining stale TLB
* entries (the same lock hold covers both pmap_remove() and pmap_update()).
* Here it's known that the address space is no longer visible to any user
* process, so we don't need to worry about that.
*/
bool
pmap_remove_all(struct pmap *pmap)
{
struct vm_page *ptps[32];
vaddr_t va, blkendva;
struct pmap *pmap2;
pt_entry_t *ptes;
pd_entry_t pde __diagused;
pd_entry_t * const *pdes;
int lvl __diagused, i, n;
/* XXX Can't handle EPT just yet. */
if (pmap->pm_remove != NULL) {
return false;
}
for (;;) {
/* Fetch a block of PTPs from tree. */
mutex_enter(&pmap->pm_lock);
n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
(void **)ptps, __arraycount(ptps), false);
if (n == 0) {
mutex_exit(&pmap->pm_lock);
break;
}
/* Remove all mappings in the set of PTPs. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
for (i = 0; i < n; i++) {
if (ptps[i]->wire_count == 0) {
/* It's dead: pmap_update() will expunge. */
continue;
}
/* Determine range of block. */
va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
blkendva = x86_round_pdr(va + 1);
/* Make sure everything squares up... */
KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
KASSERT(lvl == 1);
KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
/* Zap! */
pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
blkendva);
/* PTP should now be unused - free it. */
KASSERT(ptps[i]->wire_count == 1);
pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
}
pmap_unmap_ptes(pmap, pmap2);
pmap_drain_pv(pmap);
pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
mutex_exit(&pmap->pm_lock);
/* Process deferred frees. */
pmap_update(pmap);
/* A breathing point. */
preempt_point();
}
/* Verify that the pmap is now completely empty. */
pmap_check_ptps(pmap);
KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
"pmap %p not empty", pmap);
return true;
}
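/*
 * The return value indicates whether the bulk removal actually happened:
 * when a pm_remove override is installed (e.g. for EPT-style pmaps) this
 * function bails out early and returns false, and the caller presumably
 * falls back to ordinary per-range pmap_remove() calls.
 */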
#if defined(PMAP_FORK)
/*
* pmap_fork: perform any necessary data structure manipulation when
* a VM space is forked.
*/
void
pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
{
#ifdef USER_LDT
union descriptor *new_ldt;
int sel;
if (__predict_true(pmap1->pm_ldt == NULL)) {
return;
}
/*
* Copy the LDT into the new process.
*
* Read pmap1's ldt pointer unlocked; if it changes behind our back
* we'll retry. This will starve if there's a stream of LDT changes
* in another thread but that should not happen.
*/
retry:
if (pmap1->pm_ldt != NULL) {
/* Allocate space for the new process's LDT */
new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
if (new_ldt == NULL) {
printf("WARNING: %s: unable to allocate LDT space\n",
__func__);
return;
}
mutex_enter(&cpu_lock);
/* Get a GDT slot for it */
sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
if (sel == -1) {
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)new_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
printf("WARNING: %s: unable to allocate LDT selector\n",
__func__);
return;
}
} else {
/* Wasn't anything there after all. */
new_ldt = NULL;
sel = -1;
mutex_enter(&cpu_lock);
}
/*
* Now that we have cpu_lock, ensure the LDT status is the same.
*/
if (pmap1->pm_ldt != NULL) {
if (new_ldt == NULL) {
/* A wild LDT just appeared. */
mutex_exit(&cpu_lock);
goto retry;
}
/* Copy the LDT data and install it in pmap2 */
memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
pmap2->pm_ldt = new_ldt;
pmap2->pm_ldt_sel = sel;
mutex_exit(&cpu_lock);
} else {
if (new_ldt != NULL) {
/* The LDT disappeared, drop what we did. */
ldt_free(sel);
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)new_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
return;
}
/* We're good, just leave. */
mutex_exit(&cpu_lock);
}
#endif /* USER_LDT */
}
#endif /* PMAP_FORK */
#ifdef USER_LDT
/*
* pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap
* is active, reload LDTR.
*/
static void
pmap_ldt_xcall(void *arg1, void *arg2)
{
struct pmap *pm;
kpreempt_disable();
pm = arg1;
if (curcpu()->ci_pmap == pm) {
#if defined(SVS)
if (svs_enabled) {
svs_ldt_sync(pm);
} else
#endif
lldt(pm->pm_ldt_sel);
}
kpreempt_enable();
}
/*
* pmap_ldt_sync: LDT selector for the named pmap is changing. swap
* in the new selector on all CPUs.
*/
void
pmap_ldt_sync(struct pmap *pm)
{
uint64_t where;
KASSERT(mutex_owned(&cpu_lock));
pmap_ldt_evcnt.ev_count++;
where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
xc_wait(where);
}
/*
* pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
* restore the default.
*/
void
pmap_ldt_cleanup(struct lwp *l)
{
pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
union descriptor *ldt;
int sel;
if (__predict_true(pmap->pm_ldt == NULL)) {
return;
}
mutex_enter(&cpu_lock);
if (pmap->pm_ldt != NULL) {
sel = pmap->pm_ldt_sel;
ldt = pmap->pm_ldt;
pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
pmap->pm_ldt = NULL;
pmap_ldt_sync(pmap);
ldt_free(sel);
uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
UVM_KMF_WIRED);
}
mutex_exit(&cpu_lock);
}
#endif /* USER_LDT */
/*
* pmap_activate: activate a process' pmap
*
* => must be called with kernel preemption disabled
* => if lwp is the curlwp, then set ci_want_pmapload so that
* actual MMU context switch will be done by pmap_load() later
*/
void
pmap_activate(struct lwp *l)
{
struct cpu_info *ci;
struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
KASSERT(kpreempt_disabled());
ci = curcpu();
if (l != ci->ci_curlwp)
return;
	KASSERT(ci->ci_want_pmapload == 0);
	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
/*
* no need to switch to kernel vmspace because
* it's a subset of any vmspace.
*/
if (pmap == pmap_kernel()) {
ci->ci_want_pmapload = 0;
return;
}
ci->ci_want_pmapload = 1;
}
#if defined(XENPV) && defined(__x86_64__)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
pmap == pmap_kernel())
#elif defined(PAE)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
#elif !defined(XENPV)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
#else
#define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */
#endif
/*
* pmap_reactivate: try to regain reference to the pmap.
*
* => Must be called with kernel preemption disabled.
*/
static void
pmap_reactivate(struct pmap *pmap)
{
struct cpu_info * const ci = curcpu();
const cpuid_t cid = cpu_index(ci);
	KASSERT(kpreempt_disabled());
	KASSERT_PDIRPA(pmap);
/*
* If we still have a lazy reference to this pmap, we can assume
* that there was no TLB shootdown for this pmap in the meantime.
*
* The order of events here is important as we must synchronize
* with TLB shootdown interrupts. Declare interest in invalidations
* (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
* change only when the state is TLBSTATE_LAZY.
*/
ci->ci_tlbstate = TLBSTATE_VALID;
	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
/* We have the reference, state is valid. */
} else {
/*
		 * Must reload the TLB: the pmap was changed while we
		 * were deactivated.
*/
kcpuset_atomic_set(pmap->pm_cpus, cid);
tlbflush();
}
}
/*
* pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
* and relevant LDT info.
*
* Ensures that the current process' pmap is loaded on the current CPU's
* MMU and that there are no stale TLB entries.
*
* => The caller should disable kernel preemption or do check-and-retry
* to prevent a preemption from undoing our efforts.
* => This function may block.
*/
void
pmap_load(void)
{
struct cpu_info *ci;
struct pmap *pmap, *oldpmap;
struct lwp *l;
uint64_t pctr;
int ilevel __diagused;
u_long psl __diagused;
kpreempt_disable();
retry:
ci = curcpu();
if (!ci->ci_want_pmapload) {
kpreempt_enable();
return;
}
l = ci->ci_curlwp;
pctr = lwp_pctr();
__insn_barrier();
/* should be able to take ipis. */
KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel);
#ifdef XENPV
	/* Check to see if interrupts are enabled (i.e., no events are masked) */
KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl);
#else
KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl);
#endif
KASSERT(l != NULL);
pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
KASSERT(pmap != pmap_kernel());
oldpmap = ci->ci_pmap;
if (pmap == oldpmap) {
pmap_reactivate(pmap);
ci->ci_want_pmapload = 0;
kpreempt_enable();
return;
}
/*
* Acquire a reference to the new pmap and perform the switch.
*/
pmap_reference(pmap);
pmap_load1(l, pmap, oldpmap);
ci->ci_want_pmapload = 0;
/*
* we're now running with the new pmap. drop the reference
* to the old pmap. if we block, we need to go around again.
*/
pmap_destroy(oldpmap);
__insn_barrier();
if (lwp_pctr() != pctr) {
goto retry;
}
kpreempt_enable();
}
/*
* pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
* pmap_load(). It's critically important that this function does not
* block.
*/
static void
pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
{
struct cpu_info *ci;
struct pcb *pcb;
cpuid_t cid;
KASSERT(kpreempt_disabled());
pcb = lwp_getpcb(l);
ci = l->l_cpu;
cid = cpu_index(ci);
kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
	KASSERT_PDIRPA(oldpmap);
	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
/*
* Mark the pmap in use by this CPU. Again, we must synchronize
* with TLB shootdown interrupts, so set the state VALID first,
* then register us for shootdown events on this pmap.
*/
ci->ci_tlbstate = TLBSTATE_VALID;
kcpuset_atomic_set(pmap->pm_cpus, cid);
kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
ci->ci_pmap = pmap;
/*
* update tss. now that we have registered for invalidations
* from other CPUs, we're good to load the page tables.
*/
#ifdef PAE
pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
#else
pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
#endif
#ifdef i386
#ifndef XENPV
ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
#endif
#endif
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(pmap);
} else
#endif
lldt(pmap->pm_ldt_sel);
cpu_load_pmap(pmap, oldpmap);
}
/*
* pmap_deactivate: deactivate a process' pmap.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
*/
void
pmap_deactivate(struct lwp *l)
{
struct pmap *pmap;
struct cpu_info *ci;
	KASSERT(kpreempt_disabled());
	if (l != curlwp) {
return;
}
/*
* Wait for pending TLB shootdowns to complete. Necessary because
* TLB shootdown state is per-CPU, and the LWP may be coming off
* the CPU before it has a chance to call pmap_update(), e.g. due
* to kernel preemption or blocking routine in between.
*/
pmap_tlb_shootnow();
ci = curcpu();
if (ci->ci_want_pmapload) {
/*
* ci_want_pmapload means that our pmap is not loaded on
* the CPU or TLB might be stale. note that pmap_kernel()
* is always considered loaded.
*/
KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
!= pmap_kernel());
KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
!= ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
/*
* userspace has not been touched.
* nothing to do here.
*/
ci->ci_want_pmapload = 0;
return;
}
pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
if (pmap == pmap_kernel()) {
return;
}
	KASSERT_PDIRPA(pmap);
	KASSERT(ci->ci_pmap == pmap);
/*
* we aren't interested in TLB invalidations for this pmap,
* at least for the time being.
*/
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
ci->ci_tlbstate = TLBSTATE_LAZY;
}
#ifdef EFI_RUNTIME
extern struct pmap *efi_runtime_pmap;
/*
* pmap_is_user: true if pmap, which must not be the kernel pmap, is
* for an unprivileged user process
*/
bool
pmap_is_user(struct pmap *pmap)
{
KASSERT(pmap != pmap_kernel());
return (pmap != efi_runtime_pmap);
}
/*
* pmap_activate_sync: synchronously activate specified pmap.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
* => Must not sleep before pmap_deactivate_sync.
*/
void *
pmap_activate_sync(struct pmap *pmap)
{
struct cpu_info *ci = curcpu();
struct pmap *oldpmap = ci->ci_pmap;
unsigned cid = cpu_index(ci);
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
if (oldpmap) {
KASSERT_PDIRPA(oldpmap);
kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
}
ci->ci_tlbstate = TLBSTATE_VALID;
kcpuset_atomic_set(pmap->pm_cpus, cid);
kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
ci->ci_pmap = pmap;
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(pmap);
} else
#endif
lldt(pmap->pm_ldt_sel);
cpu_load_pmap(pmap, oldpmap);
return oldpmap;
}
/*
* pmap_deactivate_sync: synchronously deactivate specified pmap and
* restore whatever was active before pmap_activate_sync.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
* => Must not have slept since pmap_activate_sync.
*/
void
pmap_deactivate_sync(struct pmap *pmap, void *cookie)
{
struct cpu_info *ci = curcpu();
struct pmap *oldpmap = cookie;
unsigned cid = cpu_index(ci);
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(ci->ci_pmap == pmap);
KASSERT_PDIRPA(pmap);
KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
pmap_tlb_shootnow();
kcpuset_atomic_clear(pmap->pm_cpus, cid);
kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);
ci->ci_tlbstate = TLBSTATE_VALID;
ci->ci_pmap = oldpmap;
if (oldpmap) {
kcpuset_atomic_set(oldpmap->pm_cpus, cid);
kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(oldpmap);
} else
#endif
lldt(oldpmap->pm_ldt_sel);
cpu_load_pmap(oldpmap, pmap);
} else {
lcr3(pmap_pdirpa(pmap_kernel(), 0));
}
}
#endif /* EFI_RUNTIME */
/*
* some misc. functions
*/
bool
pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
int *lastlvl)
{
unsigned long index;
pd_entry_t pde;
int i;
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_i(va, i);
pde = pdes[i - 2][index];
if ((pde & PTE_P) == 0) {
*lastlvl = i;
return false;
}
if (pde & PTE_PS)
break;
}
if (lastpde != NULL)
*lastpde = pde;
*lastlvl = i;
return true;
}
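/*
 * For reference, on amd64 the loop above walks the per-level indices
 * that pl_i() extracts from the VA; a sketch, assuming the usual 4-level,
 * 9-bits-per-level layout:
 *
 *	pl_i(va, 4) == (va >> 39) & 511		L4 (PML4) index
 *	pl_i(va, 3) == (va >> 30) & 511		L3 (PDPT) index
 *	pl_i(va, 2) == (va >> 21) & 511		L2 (PD)   index
 *
 * stopping early when a large-page (PTE_PS) entry is found.
 */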
/*
* pmap_extract: extract a PA for the given VA
*/
bool
pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
pt_entry_t *ptes, pte;
pd_entry_t pde;
pd_entry_t * const *pdes;
struct pmap *pmap2;
paddr_t pa;
bool rv;
int lvl;
if (__predict_false(pmap->pm_extract != NULL)) {
return (*pmap->pm_extract)(pmap, va, pap);
}
#ifdef __HAVE_DIRECT_MAP
if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
if (pap != NULL) {
*pap = PMAP_DIRECT_UNMAP(va);
}
return true;
}
#endif
rv = false;
pa = 0;
	if (pmap != pmap_kernel()) {
		mutex_enter(&pmap->pm_lock);
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
if (lvl == 2) {
pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
rv = true;
} else {
KASSERT(lvl == 1);
pte = ptes[pl1_i(va)];
			if (__predict_true((pte & PTE_P) != 0)) {
				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
rv = true;
}
}
}
pmap_unmap_ptes(pmap, pmap2);
	if (pmap != pmap_kernel()) {
		mutex_exit(&pmap->pm_lock);
}
	if (pap != NULL) {
		*pap = pa;
}
return rv;
}
/*
* vtophys: virtual address to physical address. For use by
* machine-dependent code only.
*/
paddr_t
vtophys(vaddr_t va)
{
paddr_t pa;
if (pmap_extract(pmap_kernel(), va, &pa) == true)
return pa;
return 0;
}
__strict_weak_alias(pmap_extract_ma, pmap_extract);
#ifdef XENPV
/*
* vtomach: virtual address to machine address. For use by
* machine-dependent code only.
*/
paddr_t
vtomach(vaddr_t va)
{
paddr_t pa;
if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
return pa;
return 0;
}
#endif
/*
* pmap_virtual_space: used during bootup [pmap_steal_memory] to
* determine the bounds of the kernel virtual address space.
*/
void
pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
{
*startp = virtual_avail;
*endp = virtual_end;
}
void
pmap_zero_page(paddr_t pa)
{
#if defined(__HAVE_DIRECT_MAP)
memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_pagezero(pa);
return;
}
#endif
struct cpu_info *ci;
pt_entry_t *zpte;
vaddr_t zerova;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
kpreempt_disable();
ci = curcpu();
zerova = ci->vpage[VPAGE_ZER];
zpte = ci->vpage_pte[VPAGE_ZER];
KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
pmap_pte_flush();
pmap_update_pg(zerova); /* flush TLB */
memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(zpte, 0); /* zap ! */
pmap_pte_flush();
#endif
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
}
void
pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
{
#if defined(__HAVE_DIRECT_MAP)
vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_copy_page(srcpa, dstpa);
return;
}
#endif
struct cpu_info *ci;
pt_entry_t *srcpte, *dstpte;
vaddr_t srcva, dstva;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
kpreempt_disable();
ci = curcpu();
srcva = ci->vpage[VPAGE_SRC];
dstva = ci->vpage[VPAGE_DST];
srcpte = ci->vpage_pte[VPAGE_SRC];
dstpte = ci->vpage_pte[VPAGE_DST];
KASSERT(*srcpte == 0 && *dstpte == 0);
pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
pmap_pte_flush();
pmap_update_pg(srcva);
pmap_update_pg(dstva);
memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(srcpte, 0);
pmap_pte_set(dstpte, 0);
pmap_pte_flush();
#endif
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
}
static pt_entry_t *
pmap_map_ptp(struct vm_page *ptp)
{
#ifdef __HAVE_DIRECT_MAP
return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
#else
struct cpu_info *ci;
pt_entry_t *ptppte;
vaddr_t ptpva;
KASSERT(kpreempt_disabled());
#ifndef XENPV
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
#else
const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
#endif
ci = curcpu();
ptpva = ci->vpage[VPAGE_PTP];
ptppte = ci->vpage_pte[VPAGE_PTP];
pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
pmap_pte_flush();
pmap_update_pg(ptpva);
return (pt_entry_t *)ptpva;
#endif
}
static void
pmap_unmap_ptp(void)
{
#ifndef __HAVE_DIRECT_MAP
#if defined(DIAGNOSTIC) || defined(XENPV)
struct cpu_info *ci;
pt_entry_t *pte;
KASSERT(kpreempt_disabled());
ci = curcpu();
pte = ci->vpage_pte[VPAGE_PTP];
	if (*pte != 0) {
		pmap_pte_set(pte, 0);
pmap_pte_flush();
}
#endif
#endif
}
static pt_entry_t *
pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
{
	KASSERT(kpreempt_disabled());
	if (pmap_is_curpmap(pmap)) {
return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
}
	KASSERT(ptp != NULL);
	return pmap_map_ptp(ptp) + pl1_pi(va);
}
static void
pmap_unmap_pte(void)
{
	KASSERT(kpreempt_disabled());
	pmap_unmap_ptp();
}
/*
* p m a p r e m o v e f u n c t i o n s
*
* functions that remove mappings
*/
/*
* pmap_remove_ptes: remove PTEs from a PTP
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => PTP should be null if pmap == pmap_kernel()
* => must be called with kernel preemption disabled
* => issues TLB shootdowns for the removed PTEs as needed
*/
static void
pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
vaddr_t startva, vaddr_t endva)
{
pt_entry_t *pte = (pt_entry_t *)ptpva;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
/*
* mappings are very often sparse, so clip the given range to the
* range of PTEs that are known present in the PTP.
*/
pmap_ptp_range_clip(ptp, &startva, &pte);
/*
* note that ptpva points to the PTE that maps startva. this may
* or may not be the first PTE in the PTP.
*
* we loop through the PTP while there are still PTEs to look at
* and the wire_count is greater than 1 (because we use the wire_count
* to keep track of the number of real PTEs in the PTP).
*/
while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
(void)pmap_remove_pte(pmap, ptp, pte, startva);
startva += PAGE_SIZE;
pte++;
}
}
/*
* pmap_remove_pte: remove a single PTE from a PTP.
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => PTP should be null if pmap == pmap_kernel()
* => returns true if we removed a mapping
* => must be called with kernel preemption disabled
*/
static bool
pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t va)
{
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
	if (!pmap_valid_entry(*pte)) {
/* VA not mapped. */
return false;
}
/* Atomically save the old PTE and zap it. */
opte = pmap_pte_testset(pte, 0);
if (!pmap_valid_entry(opte)) {
return false;
}
pmap_exec_account(pmap, va, opte, 0);
	pmap_stats_update_bypte(pmap, 0, opte);
	if (ptp) {
/*
* Dropping a PTE. Make sure that the PDE is flushed.
*/
ptp->wire_count--;
if (ptp->wire_count <= 1) {
opte |= PTE_A;
}
}
	if ((opte & PTE_A) != 0) {
		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
}
/*
* If we are not on a pv list - we are done.
*/
if ((opte & PTE_PVLIST) == 0) {
#ifndef DOM0OPS
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without PTE_PVLIST for %#"PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
#endif
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
return true;
}
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
		pp = VM_PAGE_TO_PP(pg);
	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
__func__, va, pa, atop(pa));
}
/* Sync R/M bits. */
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
return true;
}
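/*
 * pmap_remove_locked: remove a range of mappings, with the pmap's lock
 * already held by the caller; does the real work for pmap_remove()
 */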
static void
pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes;
pd_entry_t pde;
pd_entry_t * const *pdes;
bool result;
vaddr_t blkendva, va = sva;
struct vm_page *ptp;
struct pmap *pmap2;
int lvl;
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/*
* removing one page? take shortcut function.
*/
if (va + PAGE_SIZE == eva) {
if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL,
"%s: unmanaged PTP detected", __func__);
} else {
/* Never free kernel PTPs. */
ptp = NULL;
}
result = pmap_remove_pte(pmap, ptp,
&ptes[pl1_i(va)], va);
/*
* if mapping removed and the PTP is no longer
* being used, free it!
*/
if (result && ptp && ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
} else for (/* null */ ; va < eva ; va = blkendva) {
/* determine range of block */
blkendva = x86_round_pdr(va+1);
if (blkendva > eva)
blkendva = eva;
if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
/* Skip a range corresponding to an invalid pde. */
blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
continue;
}
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
__func__);
} else {
/* Never free kernel PTPs. */
ptp = NULL;
}
pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
blkendva);
/* If PTP is no longer being used, free it. */
if (ptp && ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
}
pmap_unmap_ptes(pmap, pmap2);
pmap_drain_pv(pmap);
}
/*
* pmap_remove: mapping removal function.
*
* => caller should not be holding any pmap locks
*/
void
pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
if (__predict_false(pmap->pm_remove != NULL)) {
(*pmap->pm_remove)(pmap, sva, eva);
return;
}
mutex_enter(&pmap->pm_lock);
pmap_remove_locked(pmap, sva, eva);
mutex_exit(&pmap->pm_lock);
}
/*
* pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
*
* => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
* => Caller should disable kernel preemption.
* => issues tlb shootdowns if necessary.
*/
static int
pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
pt_entry_t *optep)
{
struct pmap *pmap;
struct vm_page *ptp;
vaddr_t va;
pt_entry_t *ptep;
pt_entry_t opte;
pt_entry_t npte;
pt_entry_t expect;
bool need_shootdown;
ptp = pvpte->pte_ptp;
va = pvpte->pte_va;
KASSERT(ptp == NULL || ptp->uobject != NULL);
KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
pmap = ptp_to_pmap(ptp);
KASSERT(kpreempt_disabled());
if (__predict_false(pmap->pm_sync_pv != NULL)) {
return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
optep);
}
expect = pmap_pa2pte(pa) | PTE_P;
if (clearbits != ~0) {
KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
clearbits = pmap_pp_attrs_to_pte(clearbits);
}
ptep = pmap_map_pte(pmap, ptp, va);
do {
opte = *ptep;
KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
KASSERT(opte == 0 || (opte & PTE_P) != 0);
if ((opte & (PTE_FRAME | PTE_P)) != expect) {
/*
* We lost a race with a V->P operation like
* pmap_remove(). Wait for the competitor
* reflecting pte bits into mp_attrs.
*/
pmap_unmap_pte();
return EAGAIN;
}
/*
* Check if there's anything to do on this PTE.
*/
if ((opte & clearbits) == 0) {
need_shootdown = false;
break;
}
/*
* We need a shootdown if the PTE is cached (PTE_A) ...
* ... Unless we are clearing only the PTE_W bit and
* it isn't cached as RW (PTE_D).
*/
need_shootdown = (opte & PTE_A) != 0 && !(clearbits == PTE_W && (opte & PTE_D) == 0);
npte = opte & ~clearbits;
/*
* If we need a shootdown anyway, clear PTE_A and PTE_D.
*/
if (need_shootdown) {
npte &= ~(PTE_A | PTE_D);
}
KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
KASSERT(npte == 0 || (opte & PTE_P) != 0);
} while (pmap_pte_cas(ptep, opte, npte) != opte);
if (need_shootdown) {
pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
}
pmap_unmap_pte();
*oattrs = pmap_pte_to_pp_attrs(opte);
if (optep != NULL) *optep = opte;
return 0;
}
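/*
 * pmap_pp_remove_ent: update stats and the PTP's wire count after a
 * mapping has been zapped by pmap_pp_remove(); frees the PTP once it is
 * no longer in use
 *
 * => caller must hold pmap's lock
 */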
static void
pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
vaddr_t va)
{
struct pmap *pmap2;
pt_entry_t *ptes;
pd_entry_t * const *pdes;
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
pmap_stats_update_bypte(pmap, 0, opte);
ptp->wire_count--;
if (ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
pmap_unmap_ptes(pmap, pmap2);
}
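/*
 * pmap_pp_remove: remove all mappings of the page described by 'pp',
 * which may be a managed page or an unmanaged pv-tracked page; common
 * code for pmap_page_remove() and pmap_pv_remove()
 *
 * => caller should not be holding any pmap locks
 */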
static void
pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
{
struct pv_pte *pvpte;
struct vm_page *ptp;
uintptr_t sum;
uint8_t oattrs;
bool locked;
/*
* Do an unlocked check to see if the page has no mappings, eg when
* pmap_remove_all() was called before amap_wipeout() for a process
* private amap - common. The page being removed must be on the way
* out, so we don't have to worry about concurrent attempts to enter
* it (otherwise the caller either doesn't care or has screwed up).
*/
sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
if (sum == 0) {
return;
}
kpreempt_disable();
for (;;) {
struct pmap *pmap;
struct pv_entry *pve;
pt_entry_t opte;
vaddr_t va;
mutex_spin_enter(&pp->pp_lock);
if ((pvpte = pv_pte_first(pp)) == NULL) {
mutex_spin_exit(&pp->pp_lock);
break;
}
/*
* Add a reference to the pmap before clearing the pte.
* Otherwise the pmap can disappear behind us.
*/
ptp = pvpte->pte_ptp;
pmap = ptp_to_pmap(ptp);
KASSERT(pmap->pm_obj[0].uo_refs > 0);
if (ptp != NULL) {
pmap_reference(pmap);
}
/*
* Now try to lock it. We need a direct handoff between
* pp_lock and pm_lock to know the pv_entry is kept intact
* and kept associated with this pmap. If that can't be
* had, wait for the pmap's lock to become free and then
* retry.
*/
locked = mutex_tryenter(&pmap->pm_lock);
mutex_spin_exit(&pp->pp_lock);
if (!locked) {
mutex_enter(&pmap->pm_lock);
/* nothing, just wait for it */
mutex_exit(&pmap->pm_lock);
if (ptp != NULL) {
pmap_destroy(pmap);
}
continue;
}
va = pvpte->pte_va;
KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
"va %lx pmap %p ptp %p is empty", va, pmap, ptp);
KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
"va %lx pmap %p ptp %p is free", va, pmap, ptp);
KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
"va %lx pmap %p ptp %p is empty", va, pmap, ptp);
#ifdef DEBUG
pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
rb_tree_t *tree = (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
pve = pmap_treelookup_pv(pmap, ptp, tree, va);
if (pve == NULL) {
KASSERTMSG(&pp->pp_pte == pvpte,
"va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
va, pmap, ptp, pvpte, pve);
} else {
KASSERTMSG(&pve->pve_pte == pvpte,
"va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
va, pmap, ptp, pvpte, pve);
}
#endif
if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
panic("pmap_pp_remove: mapping not present");
}
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
/* Update the PTP reference count. Free if last reference. */
if (ptp != NULL) {
KASSERT(pmap != pmap_kernel());
pmap_tlb_shootnow();
if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
} else {
pmap_pp_remove_ent(pmap, ptp, opte, va);
}
} else {
KASSERT(pmap == pmap_kernel());
pmap_stats_update_bypte(pmap, 0, opte);
}
pmap_tlb_shootnow();
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
if (ptp != NULL) {
pmap_destroy(pmap);
}
}
kpreempt_enable();
}
/*
* pmap_page_remove: remove a managed vm_page from all pmaps that map it
*
* => R/M bits are sync'd back to attrs
*/
void
pmap_page_remove(struct vm_page *pg)
{
struct pmap_page *pp;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
pa = VM_PAGE_TO_PHYS(pg);
pmap_pp_remove(pp, pa);
}
/*
* pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
* that map it
*/
void
pmap_pv_remove(paddr_t pa)
{
struct pmap_page *pp;
pp = pmap_pv_tracked(pa);
if (pp == NULL)
panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
pmap_pp_remove(pp, pa);
}
/*
* p m a p a t t r i b u t e f u n c t i o n s
* functions that test/change managed page's attributes
* since a page can be mapped multiple times we must check each PTE that
* maps it by going down the pv lists.
*/
/*
* pmap_test_attrs: test a page's attributes
*/
bool
pmap_test_attrs(struct vm_page *pg, unsigned testbits)
{
struct pmap_page *pp;
struct pv_pte *pvpte;
struct pmap *pmap;
uint8_t oattrs;
u_int result;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
if ((pp->pp_attrs & testbits) != 0) {
return true;
}
pa = VM_PAGE_TO_PHYS(pg);
startover:
mutex_spin_enter(&pp->pp_lock);
for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
if ((pp->pp_attrs & testbits) != 0) {
break;
}
if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
/*
* raced with a V->P operation. wait for the other
* side to finish by acquiring pmap's lock. if no
* wait, updates to pp_attrs by the other side may
* go unseen.
*/
pmap = ptp_to_pmap(pvpte->pte_ptp);
pmap_reference(pmap);
mutex_spin_exit(&pp->pp_lock);
mutex_enter(&pmap->pm_lock);
/* nothing. */
mutex_exit(&pmap->pm_lock);
pmap_destroy(pmap);
goto startover;
}
pp->pp_attrs |= oattrs;
}
result = pp->pp_attrs & testbits;
mutex_spin_exit(&pp->pp_lock);
/*
* note that we will exit the for loop with a non-NULL pvpte if
* we have found the bits we are testing for.
*/
return result != 0;
}
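/*
 * pmap_pp_clear_attrs: clear the specified attributes from all mappings
 * of a page; common code for pmap_clear_attrs() and pmap_pv_clear_attrs()
 *
 * => returns true if any of the requested bits were found set
 */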
static bool
pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
{
struct pv_pte *pvpte;
struct pmap *pmap;
uint8_t oattrs;
u_int result;
startover:
mutex_spin_enter(&pp->pp_lock);
for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
/*
* raced with a V->P operation. wait for the other
* side to finish by acquiring pmap's lock. it is
* probably unmapping the page, and it will be gone
* when the loop is restarted.
*/
pmap = ptp_to_pmap(pvpte->pte_ptp);
pmap_reference(pmap);
mutex_spin_exit(&pp->pp_lock);
mutex_enter(&pmap->pm_lock);
/* nothing. */
mutex_exit(&pmap->pm_lock);
pmap_destroy(pmap);
goto startover;
}
pp->pp_attrs |= oattrs;
}
result = pp->pp_attrs & clearbits;
pp->pp_attrs &= ~clearbits;
pmap_tlb_shootnow();
mutex_spin_exit(&pp->pp_lock);
return result != 0;
}
/*
* pmap_clear_attrs: clear the specified attribute for a page.
*
* => we return true if we cleared one of the bits we were asked to
*/
bool
pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
{
struct pmap_page *pp;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
pa = VM_PAGE_TO_PHYS(pg);
/*
* If this is a new page, assert it has no mappings and simply zap
* the stored attributes without taking any locks.
*/
if ((pg->flags & PG_FAKE) != 0) {
KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
atomic_store_relaxed(&pp->pp_attrs, 0);
return false;
} else {
return pmap_pp_clear_attrs(pp, pa, clearbits);
}
}
/*
* pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
* pv-tracked page.
*/
bool
pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
{
struct pmap_page *pp;
pp = pmap_pv_tracked(pa);
if (pp == NULL)
panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
return pmap_pp_clear_attrs(pp, pa, clearbits);
}
/*
* p m a p p r o t e c t i o n f u n c t i o n s
*/
/*
* pmap_page_protect: change the protection of all recorded mappings
* of a managed page
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_pv_protect: change the protection of all recorded mappings
* of an unmanaged pv-tracked page
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_protect: set the protection of the pages in a pmap
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_write_protect: write-protect pages in a pmap.
*
* Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
* don't need to remove this bit when re-entering the PTEs here: Xen tracks the
* kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
* present the page will still be considered as a kernel page, and the privilege
* separation will be enforced correctly.
*/
void
pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
pt_entry_t bit_rem, bit_put;
pt_entry_t *ptes;
pd_entry_t * const *pdes;
struct pmap *pmap2;
vaddr_t blockend, va;
int lvl, i;
if (__predict_false(pmap->pm_write_protect != NULL)) {
(*pmap->pm_write_protect)(pmap, sva, eva, prot);
return;
}
bit_rem = 0;
if (!(prot & VM_PROT_WRITE))
bit_rem = PTE_W;
bit_put = 0;
if (!(prot & VM_PROT_EXECUTE))
bit_put = pmap_pg_nx;
sva &= ~PAGE_MASK;
eva &= ~PAGE_MASK;
/*
* Acquire pmap. No need to lock the kernel pmap as we won't
* be touching PV entries nor stats and kernel PDEs aren't
* freed.
*/
if (pmap != pmap_kernel()) {
mutex_enter(&pmap->pm_lock);
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
for (va = sva ; va < eva; va = blockend) {
pt_entry_t *spte, *epte;
blockend = x86_round_pdr(va + 1);
if (blockend > eva)
blockend = eva;
/* Is it a valid block? */
if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
continue;
}
KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
KASSERT(lvl == 1);
spte = &ptes[pl1_i(va)];
epte = &ptes[pl1_i(blockend)];
for (i = 0; spte < epte; spte++, i++) {
pt_entry_t opte, npte;
do {
opte = *spte;
if (!pmap_valid_entry(opte)) {
goto next;
}
npte = (opte & ~bit_rem) | bit_put;
} while (pmap_pte_cas(spte, opte, npte) != opte);
if ((opte & PTE_D) != 0) {
vaddr_t tva = va + x86_ptob(i);
pmap_tlb_shootdown(pmap, tva, opte,
TLBSHOOT_WRITE_PROTECT);
}
next:;
}
}
/* Release pmap. */
pmap_unmap_ptes(pmap, pmap2);
if (pmap != pmap_kernel()) {
mutex_exit(&pmap->pm_lock);
}
}
/*
* pmap_unwire: clear the wired bit in the PTE.
*
* => Mapping should already be present.
*/
void
pmap_unwire(struct pmap *pmap, vaddr_t va)
{
pt_entry_t *ptes, *ptep, opte;
pd_entry_t * const *pdes;
struct pmap *pmap2;
int lvl;
if (__predict_false(pmap->pm_unwire != NULL)) {
(*pmap->pm_unwire)(pmap, va);
return;
}
/*
* Acquire pmap. Need to lock the kernel pmap only to protect the
* statistics.
*/
mutex_enter(&pmap->pm_lock);
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
}
KASSERT(lvl == 1);
ptep = &ptes[pl1_i(va)];
opte = *ptep;
KASSERT(pmap_valid_entry(opte));
if (opte & PTE_WIRED) {
pt_entry_t npte = opte & ~PTE_WIRED;
opte = pmap_pte_testset(ptep, npte);
pmap_stats_update_bypte(pmap, npte, opte);
} else {
printf("%s: wiring for pmap %p va %#" PRIxVADDR
" did not change!\n", __func__, pmap, va);
}
/* Release pmap. */
pmap_unmap_ptes(pmap, pmap2);
mutex_exit(&pmap->pm_lock);
}
/*
* pmap_copy: copy mappings from one pmap to another
*
* => optional function
* void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
*/
/*
* defined as macro in pmap.h
*/
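/*
 * pmap_enter_default: default pmap_enter() implementation; dispatches to
 * the pmap-specific pm_enter callback if one is installed, otherwise
 * calls pmap_enter_ma() with ma == pa
 */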
__strict_weak_alias(pmap_enter, pmap_enter_default);
int
pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
u_int flags)
{
if (__predict_false(pmap->pm_enter != NULL)) {
return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
}
return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
}
/*
* pmap_enter: enter a mapping into a pmap
*
* => must be done "now" ... no lazy-evaluation
*/
int
pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
vm_prot_t prot, u_int flags, int domid)
{
pt_entry_t *ptes, opte, npte;
pt_entry_t *ptep;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct vm_page *new_pg, *old_pg;
struct pmap_page *new_pp, *old_pp;
struct pv_entry *old_pve, *new_pve;
bool wired = (flags & PMAP_WIRED) != 0;
struct pmap *pmap2;
struct pmap_ptparray pt;
int error;
bool getptp, samepage, new_embedded;
rb_tree_t *tree;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAX_KERNEL_ADDRESS);
KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
PRIxVADDR " over PDP!", __func__, va);
KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
"%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
#ifdef XENPV
KASSERT(domid == DOMID_SELF || pa == 0);
#endif
npte = ma | protection_codes[prot] | PTE_P;
npte |= pmap_pat_flags(flags);
if (wired)
npte |= PTE_WIRED;
if (va < VM_MAXUSER_ADDRESS) {
KASSERTMSG(pmap != pmap_kernel(),
"entering user va %#"PRIxVADDR" into kernel pmap",
va);
if (pmap_is_user(pmap))
npte |= PTE_U;
}
if (pmap == pmap_kernel())
npte |= pmap_pg_g;
if (flags & VM_PROT_ALL) {
npte |= PTE_A;
if (flags & VM_PROT_WRITE) {
KASSERT((npte & PTE_W) != 0);
npte |= PTE_D;
}
}
#ifdef XENPV
if (domid != DOMID_SELF)
new_pg = NULL;
else
#endif
new_pg = PHYS_TO_VM_PAGE(pa);
if (new_pg != NULL) {
/* This is a managed page */
npte |= PTE_PVLIST;
new_pp = VM_PAGE_TO_PP(new_pg);
PMAP_CHECK_PP(new_pp);
} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
/* This is an unmanaged pv-tracked page */
npte |= PTE_PVLIST;
PMAP_CHECK_PP(new_pp);
} else {
new_pp = NULL;
}
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
if (error != 0) {
if (flags & PMAP_CANFAIL) {
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: get ptp failed, error=%d", __func__,
error);
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
} else {
/* Embedded PV entries rely on this. */
KASSERT(va != 0);
tree = &pmap_kernel_rb;
}
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
new_pve = NULL;
old_pve = NULL;
samepage = false;
new_embedded = false;
if (new_pp != NULL) {
error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
&old_pve, &samepage, &new_embedded, tree);
/*
* If a new pv_entry was needed and none was available, we
* can go no further.
*/
if (error != 0) {
if (flags & PMAP_CANFAIL) {
if (getptp) {
pmap_unget_ptp(pmap, &pt);
}
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: alloc pve failed", __func__);
}
} else {
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
}
/* Map PTEs into address space. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_install_ptp(pmap, &pt, va, pdes);
}
/* Check if there is an existing mapping. */
ptep = &ptes[pl1_i(va)];
opte = *ptep;
bool have_oldpa = pmap_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
do {
opte = *ptep;
/*
* if the same page, inherit PTE_A and PTE_D.
*/
if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
npte |= opte & (PTE_A | PTE_D);
}
#if defined(XENPV)
if (domid != DOMID_SELF) {
/* pmap_pte_cas with error handling */
int s = splvm();
if (opte != *ptep) {
splx(s);
continue;
}
error = xpq_update_foreign(
vtomach((vaddr_t)ptep), npte, domid, flags);
splx(s);
if (error) {
/* Undo pv_entry tracking - oof. */
if (new_pp != NULL) {
mutex_spin_enter(&new_pp->pp_lock);
if (new_pve != NULL) {
LIST_REMOVE(new_pve, pve_list);
KASSERT(pmap->pm_pve == NULL);
pmap->pm_pve = new_pve;
} else if (new_embedded) {
new_pp->pp_pte.pte_ptp = NULL;
new_pp->pp_pte.pte_va = 0;
}
mutex_spin_exit(&new_pp->pp_lock);
}
pmap_unmap_ptes(pmap, pmap2);
/* Free new PTP. */
if (ptp != NULL && ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes,
pdes);
}
mutex_exit(&pmap->pm_lock);
return error;
}
break;
}
#endif /* defined(XENPV) */
} while (pmap_pte_cas(ptep, opte, npte) != opte);
/*
* Done with the PTEs: they can now be unmapped.
*/
pmap_unmap_ptes(pmap, pmap2);
/*
* Update statistics and PTP's reference count.
*/
pmap_stats_update_bypte(pmap, npte, opte);
if (ptp != NULL) {
if (!have_oldpa) {
ptp->wire_count++;
}
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
KASSERT(ptp == NULL || ptp->wire_count > 1);
/*
* If the same page, we can skip pv_entry handling.
*/
if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
if ((npte & PTE_PVLIST) != 0) {
KASSERT(samepage);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
goto same_pa;
} else if ((npte & PTE_PVLIST) != 0) {
KASSERT(!samepage);
}
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR
" pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
__func__, va, oldpa, atop(pa));
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_pte_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
/*
* If new page is dynamically PV tracked, insert to tree.
*/
if (new_pve != NULL) {
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
old_pve = rb_tree_insert_node(tree, new_pve);
KASSERT(old_pve == new_pve);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
same_pa:
/*
* shootdown tlb if necessary.
*/
if ((~opte & (PTE_P | PTE_A)) == 0 &&
((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return 0;
}
#if defined(XEN) && defined(DOM0OPS)
struct pmap_data_gnt {
SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
vaddr_t pd_gnt_sva;
vaddr_t pd_gnt_eva; /* range covered by this gnt */
int pd_gnt_refs; /* ref counter */
struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
};
SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
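/*
 * pmap_find_gnt: look up the grant descriptor covering [sva, eva), or
 * return NULL if there is none
 *
 * => caller must hold pmap's lock
 */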
static struct pmap_data_gnt *
pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
struct pmap_data_gnt_head *headp;
struct pmap_data_gnt *pgnt;
KASSERT(mutex_owned(&pmap->pm_lock));
headp = pmap->pm_data;
KASSERT(headp != NULL);
SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
return pgnt;
/* check that we're not overlapping part of a region */
KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
}
return NULL;
}
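/*
 * pmap_alloc_gnt: allocate and record a grant descriptor for 'nentries'
 * pages starting at sva, copying the supplied grant table ops; on first
 * use, hooks pmap_remove_gnt() in as the pmap's remove callback
 *
 * => caller must hold pmap's lock
 */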
static void
pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
const struct gnttab_map_grant_ref *ops)
{
struct pmap_data_gnt_head *headp;
struct pmap_data_gnt *pgnt;
vaddr_t eva = sva + nentries * PAGE_SIZE;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(nentries >= 1);
if (pmap->pm_remove == NULL) {
pmap->pm_remove = pmap_remove_gnt;
KASSERT(pmap->pm_data == NULL);
headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
SLIST_INIT(headp);
pmap->pm_data = headp;
} else {
KASSERT(pmap->pm_remove == pmap_remove_gnt);
KASSERT(pmap->pm_data != NULL);
headp = pmap->pm_data;
}
pgnt = pmap_find_gnt(pmap, sva, eva);
if (pgnt != NULL) {
KASSERT(pgnt->pd_gnt_sva == sva);
KASSERT(pgnt->pd_gnt_eva == eva);
return;
}
/* new entry */
pgnt = kmem_alloc(sizeof(*pgnt) +
(nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
pgnt->pd_gnt_sva = sva;
pgnt->pd_gnt_eva = eva;
pgnt->pd_gnt_refs = 0;
memcpy(pgnt->pd_gnt_ops, ops,
sizeof(struct gnttab_map_grant_ref) * nentries);
SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
}
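/*
 * pmap_free_gnt: release a grant descriptor whose reference count has
 * dropped to zero; tears down the per-pmap grant state when the list
 * becomes empty
 *
 * => caller must hold pmap's lock
 */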
static void
pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
{
struct pmap_data_gnt_head *headp = pmap->pm_data;
int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
KASSERT(nentries >= 1);
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(pgnt->pd_gnt_refs == 0);
SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
kmem_free(pgnt, sizeof(*pgnt) +
(nentries - 1) * sizeof(struct gnttab_map_grant_ref));
if (SLIST_EMPTY(headp)) {
kmem_free(headp, sizeof(*headp));
pmap->pm_data = NULL;
pmap->pm_remove = NULL;
}
}
/*
* pmap_enter_gnt: enter a grant entry into a pmap
*
* => must be done "now" ... no lazy-evaluation
*/
int
pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
const struct gnttab_map_grant_ref *oops)
{
struct pmap_data_gnt *pgnt;
pt_entry_t *ptes, opte;
#ifndef XENPV
pt_entry_t npte;
#endif
pt_entry_t *ptep;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct vm_page *old_pg;
struct pmap_page *old_pp;
struct pv_entry *old_pve;
struct pmap *pmap2;
struct pmap_ptparray pt;
int error;
bool getptp;
rb_tree_t *tree;
struct gnttab_map_grant_ref *op;
int ret;
int idx;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAX_KERNEL_ADDRESS);
KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
PRIxVADDR " over PDP!", __func__, va);
KASSERT(pmap != pmap_kernel());
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
pmap_alloc_gnt(pmap, sva, nentries, oops);
pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
KASSERT(pgnt != NULL);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
if (error != 0) {
mutex_exit(&pmap->pm_lock);
return error;
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
old_pve = NULL;
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
/* Map PTEs into address space. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_install_ptp(pmap, &pt, va, pdes);
}
/* Check if there is an existing mapping. */
ptep = &ptes[pl1_i(va)];
opte = *ptep;
bool have_oldpa = pmap_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
op = &pgnt->pd_gnt_ops[idx];
#ifdef XENPV
KASSERT(op->flags & GNTMAP_contains_pte);
op->host_addr = xpmap_ptetomach(ptep);
#else
KASSERT((op->flags & GNTMAP_contains_pte) == 0);
KASSERT(op->flags != 0);
KASSERT(op->host_addr != 0);
#endif
op->dev_bus_addr = 0;
op->status = GNTST_general_error;
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
if (__predict_false(ret)) {
printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
__func__, ret);
op->status = GNTST_general_error;
}
for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
kpause("gntmap", false, mstohz(1), NULL);
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
if (__predict_false(ret)) {
printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
__func__, ret);
op->status = GNTST_general_error;
}
}
if (__predict_false(op->status != GNTST_okay)) {
printf("%s: GNTTABOP_map_grant_ref status: %d\n",
__func__, op->status);
if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/
ptp->wire_count--;
}
} else {
#ifndef XENPV
npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
if ((op->flags & GNTMAP_readonly) == 0)
npte |= PTE_W;
do {
opte = *ptep;
} while (pmap_pte_cas(ptep, opte, npte) != opte);
#endif
pgnt->pd_gnt_refs++;
if (!have_oldpa) {
ptp->wire_count++;
}
KASSERT(ptp->wire_count > 1);
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
if (ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
/*
* Done with the PTEs: they can now be unmapped.
*/
pmap_unmap_ptes(pmap, pmap2);
/*
* Update statistics and PTP's reference count.
*/
pmap_stats_update_bypte(pmap, 0, opte);
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
__func__, va, oldpa);
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_pte_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return op->status;
}
/*
* pmap_remove_gnt: grant mapping removal function.
*
* => caller should not be holding any pmap locks
*/
static void
pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
struct pmap_data_gnt *pgnt;
pt_entry_t *ptes;
pd_entry_t pde;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct pmap *pmap2;
vaddr_t va;
int lvl;
int idx;
struct gnttab_map_grant_ref *op;
struct gnttab_unmap_grant_ref unmap_op;
int ret;
KASSERT(pmap != pmap_kernel());
KASSERT(pmap->pm_remove == pmap_remove_gnt);
mutex_enter(&pmap->pm_lock);
for (va = sva; va < eva; va += PAGE_SIZE) {
pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
if (pgnt == NULL) {
pmap_remove_locked(pmap, sva, eva);
continue;
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
panic("pmap_remove_gnt pdes not valid");
}
idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
op = &pgnt->pd_gnt_ops[idx];
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL,
"%s: unmanaged PTP detected", __func__);
if (op->status == GNTST_okay) {
KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
#ifdef XENPV
unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
#else
unmap_op.host_addr = op->host_addr;
pmap_pte_testset(&ptes[pl1_i(va)], 0);
#endif
unmap_op.handle = op->handle;
unmap_op.dev_bus_addr = 0;
ret = HYPERVISOR_grant_table_op(
GNTTABOP_unmap_grant_ref, &unmap_op, 1);
if (ret) {
printf("%s: GNTTABOP_unmap_grant_ref "
"failed: %d\n", __func__, ret);
}
ptp->wire_count--;
pgnt->pd_gnt_refs--;
}
if (pgnt->pd_gnt_refs == 0) {
pmap_free_gnt(pmap, pgnt);
}
/*
* if mapping removed and the PTP is no longer
* being used, free it!
*/
if (ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
pmap_unmap_ptes(pmap, pmap2);
}
mutex_exit(&pmap->pm_lock);
}
#endif /* XEN && DOM0OPS */
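/*
 * pmap_get_physpage: allocate a zeroed physical page for use as a PTP
 * when growing the kernel page tree
 *
 * => before uvm is fully initialized, the page is taken directly from
 *    the physical memory freelist and zeroed by hand
 */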
paddr_t
pmap_get_physpage(void)
{
struct vm_page *ptp;
struct pmap *kpm = pmap_kernel();
paddr_t pa;
if (!uvm.page_init_done) {
/*
* We're growing the kernel pmap early (from
* uvm_pageboot_alloc()). This case must be
* handled a little differently.
*/
if (!uvm_page_physget(&pa))
panic("%s: out of memory", __func__);
#if defined(__HAVE_DIRECT_MAP)
memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_pagezero(pa);
return pa;
}
#endif
kpreempt_disable();
pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
PTE_W | pmap_pg_nx);
pmap_pte_flush();
pmap_update_pg((vaddr_t)early_zerop);
memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(early_zero_pte, 0);
pmap_pte_flush();
#endif /* defined(DIAGNOSTIC) || defined(XENPV) */
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
} else {
/* XXX */
ptp = uvm_pagealloc(NULL, 0, NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
panic("%s: out of memory", __func__);
ptp->flags &= ~PG_BUSY;
ptp->wire_count = 1;
pa = VM_PAGE_TO_PHYS(ptp);
}
pmap_stats_update(kpm, 1, 0);
return pa;
}
/*
* Expand the page tree with the specified amount of PTPs, mapping virtual
* addresses starting at kva. We populate all the levels but the last one
* (L1). The nodes of the tree are created as RW, but the pages covered
* will be kentered in L1, with proper permissions.
*
* Used only by pmap_growkernel.
*/
static void
pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
{
unsigned long i;
paddr_t pa;
unsigned long index, endindex;
int level;
pd_entry_t *pdep;
#ifdef XENPV
int s = splvm(); /* protect xpq_* */
#endif
for (level = PTP_LEVELS; level > 1; level--) {
if (level == PTP_LEVELS)
pdep = cpm->pm_pdir;
else
pdep = normal_pdes[level - 2];
index = pl_i_roundup(kva, level);
endindex = index + needed_ptps[level - 1] - 1;
for (i = index; i <= endindex; i++) {
pt_entry_t pte;
KASSERT(!pmap_valid_entry(pdep[i]));
pa = pmap_get_physpage();
pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
#ifdef __x86_64__
pte |= pmap_pg_nx;
#endif
pmap_pte_set(&pdep[i], pte);
#ifdef XENPV
if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
if (__predict_true(
cpu_info_primary.ci_flags & CPUF_PRESENT)) {
/* update per-cpu PMDs on all cpus */
xen_kpm_sync(pmap_kernel(), i);
} else {
/*
* too early; update primary CPU
* PMD only (without locks)
*/
#ifdef __x86_64__
pd_entry_t *cpu_pdep =
&cpu_info_primary.ci_kpm_pdir[i];
#else
pd_entry_t *cpu_pdep =
&cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
#endif
pmap_pte_set(cpu_pdep, pte);
}
}
#endif
KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
nkptp[level - 1]++;
}
pmap_pte_flush();
}
#ifdef XENPV
splx(s);
#endif
}
/*
* pmap_growkernel: increase usage of KVM space.
*
* => we allocate new PTPs for the kernel and install them in all
* the pmaps on the system.
*/
vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
struct pmap *kpm = pmap_kernel();
struct pmap *cpm;
#if !defined(XENPV) || !defined(__x86_64__)
struct pmap *pm;
long old;
#endif
int s, i;
long needed_kptp[PTP_LEVELS], target_nptp;
bool invalidate = false;
s = splvm(); /* to be safe */
mutex_enter(&kpm->pm_lock);
if (maxkvaddr <= pmap_maxkvaddr) {
mutex_exit(&kpm->pm_lock);
splx(s);
return pmap_maxkvaddr;
}
maxkvaddr = x86_round_pdr(maxkvaddr);
#if !defined(XENPV) || !defined(__x86_64__)
old = nkptp[PTP_LEVELS - 1];
#endif
/* Initialize needed_kptp. */
for (i = PTP_LEVELS - 1; i >= 1; i--) {
target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
if (target_nptp > nkptpmax[i])
panic("out of KVA space");
KASSERT(target_nptp >= nkptp[i]);
needed_kptp[i] = target_nptp - nkptp[i];
}
#ifdef XENPV
/* only pmap_kernel(), or the per-cpu map, has kernel entries */
cpm = kpm;
#else
/* Get the current pmap */
if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
cpm = curcpu()->ci_pmap;
} else {
cpm = kpm;
}
#endif
kasan_shadow_map((void *)pmap_maxkvaddr,
(size_t)(maxkvaddr - pmap_maxkvaddr));
kmsan_shadow_map((void *)pmap_maxkvaddr,
(size_t)(maxkvaddr - pmap_maxkvaddr));
pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
/*
* If the number of top level entries changed, update all pmaps.
*/
if (needed_kptp[PTP_LEVELS - 1] != 0) {
#ifdef XENPV
#ifdef __x86_64__
/* nothing, kernel entries are never entered in user pmap */
#else
int pdkidx;
mutex_enter(&pmaps_lock);
LIST_FOREACH(pm, &pmaps, pm_list) {
for (pdkidx = PDIR_SLOT_KERN + old;
pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
pdkidx++) {
pmap_pte_set(&pm->pm_pdir[pdkidx],
kpm->pm_pdir[pdkidx]);
}
pmap_pte_flush();
}
mutex_exit(&pmaps_lock);
#endif /* __x86_64__ */
#else /* XENPV */
size_t newpdes;
newpdes = nkptp[PTP_LEVELS - 1] - old;
if (cpm != kpm) {
memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
&cpm->pm_pdir[PDIR_SLOT_KERN + old],
newpdes * sizeof(pd_entry_t));
}
mutex_enter(&pmaps_lock);
LIST_FOREACH(pm, &pmaps, pm_list) {
if (__predict_false(pm->pm_enter != NULL)) {
/*
* Not a native pmap, the kernel is not mapped,
* so nothing to synchronize.
*/
continue;
}
memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
&kpm->pm_pdir[PDIR_SLOT_KERN + old],
newpdes * sizeof(pd_entry_t));
}
mutex_exit(&pmaps_lock);
#endif
invalidate = true;
}
pmap_maxkvaddr = maxkvaddr;
mutex_exit(&kpm->pm_lock);
splx(s);
if (invalidate && pmap_initialized) {
/* Invalidate the pmap cache. */
pool_cache_invalidate(&pmap_cache);
}
return maxkvaddr;
}
#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
/*
* pmap_dump: dump all the mappings from a pmap
*
* => caller should not be holding any pmap locks
*/
void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes, *pte;
pd_entry_t * const *pdes;
struct pmap *pmap2;
vaddr_t blkendva;
int lvl;
/*
* if end is out of range, truncate it.
* if end <= start, update end to the max.
*/
if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
eva = VM_MAXUSER_ADDRESS;
mutex_enter(&pmap->pm_lock);
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/*
* dumping a range of pages: we dump in PTP sized blocks (4MB)
*/
for (/* null */ ; sva < eva ; sva = blkendva) {
/* determine range of block */
blkendva = x86_round_pdr(sva+1);
if (blkendva > eva)
blkendva = eva;
/* valid block? */
if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
continue;
KASSERT(lvl == 1);
pte = &ptes[pl1_i(sva)];
for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
if (!pmap_valid_entry(*pte))
continue;
printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
" (pte=%#" PRIxPADDR ")\n",
sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
}
}
pmap_unmap_ptes(pmap, pmap2);
mutex_exit(&pmap->pm_lock);
}
#endif
/*
* pmap_update: process deferred invalidations and frees.
*/
void
pmap_update(struct pmap *pmap)
{
struct pmap_page *pp;
struct vm_page *ptp;
/*
* Initiate any pending TLB shootdowns. Wait for them to
* complete before returning control to the caller.
*/
kpreempt_disable();
pmap_tlb_shootnow();
kpreempt_enable();
/*
* Now that shootdowns are complete, process deferred frees. This
* is an unlocked check, but is safe as we're only interested in
* work done in this LWP - we won't get a false negative.
*/
if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
return;
}
mutex_enter(&pmap->pm_lock);
while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
KASSERT(ptp->wire_count == 0);
KASSERT(ptp->uanon == NULL);
LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
pp = VM_PAGE_TO_PP(ptp);
LIST_INIT(&pp->pp_pvlist);
pp->pp_attrs = 0;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
/*
* XXX Hack to avoid extra locking, and lock
* assertions in uvm_pagefree(). Despite uobject
* being set, this isn't a managed page.
*/
PMAP_DUMMY_LOCK(pmap);
uvm_pagerealloc(ptp, NULL, 0);
PMAP_DUMMY_UNLOCK(pmap);
uvm_pagefree(ptp);
}
mutex_exit(&pmap->pm_lock);
}
#if PTP_LEVELS > 4
#error "Unsupported number of page table mappings"
#endif
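/*
 * pmap_init_tmp_pgtbl: build a small temporary page table hierarchy at
 * fixed low physical addresses (the "real mode PML"), mapping the single
 * page 'pg' and copying the kernel's top-level entries; returns the
 * physical address of the resulting top-level table (on PAE, of the L3
 * entries embedded at the end of the L2 page)
 */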
paddr_t
pmap_init_tmp_pgtbl(paddr_t pg)
{
static bool maps_loaded;
static const paddr_t x86_tmp_pml_paddr[] = {
4 * PAGE_SIZE, /* L1 */
5 * PAGE_SIZE, /* L2 */
6 * PAGE_SIZE, /* L3 */
7 * PAGE_SIZE /* L4 */
};
static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
pd_entry_t *tmp_pml, *kernel_pml;
int level;
if (!maps_loaded) {
for (level = 0; level < PTP_LEVELS; ++level) {
x86_tmp_pml_vaddr[level] =
uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
UVM_KMF_VAONLY);
if (x86_tmp_pml_vaddr[level] == 0)
panic("mapping of real mode PML failed\n");
pmap_kenter_pa(x86_tmp_pml_vaddr[level],
x86_tmp_pml_paddr[level],
VM_PROT_READ | VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
maps_loaded = true;
}
/* Zero levels 1-3 */
for (level = 0; level < PTP_LEVELS - 1; ++level) {
tmp_pml = (void *)x86_tmp_pml_vaddr[level];
memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
}
/* Copy PML4 */
kernel_pml = pmap_kernel()->pm_pdir;
tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
#ifdef PAE
/*
* Use the last 4 entries of the L2 page as L3 PD entries. These
* last entries are unlikely to be used for temporary mappings.
* 508: maps 0->1GB (userland)
* 509: unused
* 510: unused
* 511: maps 3->4GB (kernel)
*/
tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
tmp_pml[509] = 0;
tmp_pml[510] = 0;
tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
#endif
for (level = PTP_LEVELS - 1; level > 0; --level) {
tmp_pml = (void *)x86_tmp_pml_vaddr[level];
tmp_pml[pl_i(pg, level + 1)] =
(x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
}
tmp_pml = (void *)x86_tmp_pml_vaddr[0];
tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
#ifdef PAE
/* Return the PA of the L3 page (entry 508 of the L2 page) */
return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
#endif
return x86_tmp_pml_paddr[PTP_LEVELS - 1];
}
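/*
 * x86_mmap_flags: translate device mmap flags encoded in the page number
 * into pmap flags; currently only X86_MMAP_FLAG_PREFETCH, which maps to
 * PMAP_WRITE_COMBINE
 */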
u_int
x86_mmap_flags(paddr_t mdpgno)
{
u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
u_int pflag = 0;
if (nflag & X86_MMAP_FLAG_PREFETCH)
pflag |= PMAP_WRITE_COMBINE;
return pflag;
}
#if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
/*
* -----------------------------------------------------------------------------
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* -----------------------------------------------------------------------------
*
* These functions are invoked as callbacks from the code above. Contrary to
* native, EPT does not have a recursive slot; therefore, it is not possible
* to call pmap_map_ptes(). Instead, we use the direct map and walk down the
* tree manually.
*
* Apart from that, the logic is mostly the same as native. Once a pmap has
* been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
* After that we're good, and the callbacks will handle the translations
* for us.
*
* -----------------------------------------------------------------------------
*/
/* Hardware bits. */
#define EPT_R __BIT(0) /* read */
#define EPT_W __BIT(1) /* write */
#define EPT_X __BIT(2) /* execute */
#define EPT_T __BITS(5,3) /* type */
#define TYPE_UC 0
#define TYPE_WC 1
#define TYPE_WT 4
#define TYPE_WP 5
#define TYPE_WB 6
#define EPT_NOPAT __BIT(6)
#define EPT_L __BIT(7) /* large */
#define EPT_A __BIT(8) /* accessed */
#define EPT_D __BIT(9) /* dirty */
/* Software bits. */
#define EPT_PVLIST __BIT(60)
#define EPT_WIRED __BIT(61)
#define pmap_ept_valid_entry(pte) (pte & EPT_R)
bool pmap_ept_has_ad __read_mostly;
static inline void
pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
pmap_stats_update(pmap, resid_diff, wired_diff);
}
static pt_entry_t
pmap_ept_type(u_int flags)
{
u_int cacheflags = (flags & PMAP_CACHE_MASK);
pt_entry_t ret;
switch (cacheflags) {
case PMAP_NOCACHE:
case PMAP_NOCACHE_OVR:
ret = __SHIFTIN(TYPE_UC, EPT_T);
break;
case PMAP_WRITE_COMBINE:
ret = __SHIFTIN(TYPE_WC, EPT_T);
break;
case PMAP_WRITE_BACK:
default:
ret = __SHIFTIN(TYPE_WB, EPT_T);
break;
}
ret |= EPT_NOPAT;
return ret;
}
static inline pt_entry_t
pmap_ept_prot(vm_prot_t prot)
{
pt_entry_t res = 0;
if (prot & VM_PROT_READ)
res |= EPT_R;
if (prot & VM_PROT_WRITE)
res |= EPT_W;
if (prot & VM_PROT_EXECUTE)
res |= EPT_X;
return res;
}
static inline uint8_t
pmap_ept_to_pp_attrs(pt_entry_t ept)
{
uint8_t ret = 0;
if (pmap_ept_has_ad) {
if (ept & EPT_D)
ret |= PP_ATTRS_D;
if (ept & EPT_A)
ret |= PP_ATTRS_A;
} else {
ret |= (PP_ATTRS_D|PP_ATTRS_A);
}
if (ept & EPT_W)
ret |= PP_ATTRS_W;
return ret;
}
static inline pt_entry_t
pmap_pp_attrs_to_ept(uint8_t attrs)
{
pt_entry_t ept = 0;
if (attrs & PP_ATTRS_D)
ept |= EPT_D;
if (attrs & PP_ATTRS_A)
ept |= EPT_A;
if (attrs & PP_ATTRS_W)
ept |= EPT_W;
return ept;
}
/*
* Helper for pmap_ept_free_ptp.
* tree[0] = &L2[L2idx]
* tree[1] = &L3[L3idx]
* tree[2] = &L4[L4idx]
*/
static void
pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
{
pt_entry_t *pteva;
paddr_t ptepa;
int i, index;
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_pi(va, i);
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
KASSERT(pmap_ept_valid_entry(pteva[index]));
tree[i - 2] = &pteva[index];
ptepa = pmap_pte2pa(pteva[index]);
}
}
static void
pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
{
pd_entry_t *tree[3];
int level;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
pmap_ept_get_tree(pmap, va, tree);
level = 1;
do {
(void)pmap_pte_testset(tree[level - 1], 0);
pmap_freepage(pmap, ptp, level);
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, level + 1);
ptp->wire_count--;
if (ptp->wire_count > 1)
break;
}
} while (++level < PTP_LEVELS);
pmap_pte_flush();
}
/* Allocate L4->L3->L2. Return L2. */
static void
pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
{
struct vm_page *ptp;
unsigned long index;
pd_entry_t *pteva;
paddr_t ptepa;
int i;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
/*
* Now that we have all the pages looked up or allocated,
* loop through again installing any new ones into the tree.
*/
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_pi(va, i);
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
if (pmap_ept_valid_entry(pteva[index])) {
KASSERT(!pt->alloced[i]);
ptepa = pmap_pte2pa(pteva[index]);
continue;
}
ptp = pt->pg[i];
ptp->flags &= ~PG_BUSY; /* never busy */
ptp->wire_count = 1;
pmap->pm_ptphint[i - 2] = ptp;
ptepa = VM_PAGE_TO_PHYS(ptp);
pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
pmap_pte_flush();
pmap_stats_update(pmap, 1, 0);
/*
* If we're not in the top level, increase the
* wire count of the parent page.
*/
if (i < PTP_LEVELS) {
pt->pg[i + 1]->wire_count++;
}
}
}
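/*
 * pmap_ept_enter: EPT flavour of pmap_enter_ma(); walks the page tree
 * through the direct map instead of a recursive slot
 */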
static int
pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
u_int flags)
{
pt_entry_t *ptes, opte, npte;
pt_entry_t *ptep;
struct vm_page *ptp;
struct vm_page *new_pg, *old_pg;
struct pmap_page *new_pp, *old_pp;
struct pv_entry *old_pve, *new_pve;
bool wired = (flags & PMAP_WIRED) != 0;
bool accessed;
struct pmap_ptparray pt;
int error;
bool getptp, samepage, new_embedded;
rb_tree_t *tree;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAXUSER_ADDRESS);
npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
if (wired)
npte |= EPT_WIRED;
if (flags & VM_PROT_ALL) {
npte |= EPT_A;
if (flags & VM_PROT_WRITE) {
KASSERT((npte & EPT_W) != 0);
npte |= EPT_D;
}
}
new_pg = PHYS_TO_VM_PAGE(pa);
if (new_pg != NULL) {
/* This is a managed page */
npte |= EPT_PVLIST;
new_pp = VM_PAGE_TO_PP(new_pg);
} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
/* This is an unmanaged pv-tracked page */
npte |= EPT_PVLIST;
} else {
new_pp = NULL;
}
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
if (error != 0) {
if (flags & PMAP_CANFAIL) {
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: get ptp failed, error=%d", __func__,
error);
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
} else {
/* Embedded PV entries rely on this. */
KASSERT(va != 0);
tree = &pmap_kernel_rb;
}
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
new_pve = NULL;
old_pve = NULL;
samepage = false;
new_embedded = false;
if (new_pp != NULL) {
error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
&old_pve, &samepage, &new_embedded, tree);
/*
* If a new pv_entry was needed and none was available, we
* can go no further.
*/
if (error != 0) {
if (flags & PMAP_CANFAIL) {
if (getptp) {
pmap_unget_ptp(pmap, &pt);
}
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: alloc pve failed", __func__);
}
} else {
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
}
/* Map PTEs into address space. */
kpreempt_disable();
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_ept_install_ptp(pmap, &pt, va);
}
/* Check if there is an existing mapping. */
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
ptep = &ptes[pl1_pi(va)];
opte = *ptep;
bool have_oldpa = pmap_ept_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
do {
opte = *ptep;
/*
* if the same page, inherit PTE_A and PTE_D.
*/
if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
npte |= opte & (EPT_A | EPT_D);
}
} while (pmap_pte_cas(ptep, opte, npte) != opte);
/*
* Done with the PTEs: they can now be unmapped.
*/
kpreempt_enable();
/*
* Update statistics and PTP's reference count.
*/
pmap_ept_stats_update_bypte(pmap, npte, opte);
if (ptp != NULL) {
if (!have_oldpa) {
ptp->wire_count++;
}
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
KASSERT(ptp == NULL || ptp->wire_count > 1);
/*
* If the same page, we can skip pv_entry handling.
*/
if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
if ((npte & EPT_PVLIST) != 0) {
KASSERT(samepage);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
goto same_pa;
} else if ((npte & EPT_PVLIST) != 0) {
KASSERT(!samepage);
}
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: EPT_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR
" pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
__func__, va, oldpa, atop(pa));
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_ept_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
/*
* If new page is dynamically PV tracked, insert to tree.
*/
if (new_pve != NULL) {
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
old_pve = rb_tree_insert_node(tree, new_pve);
KASSERT(old_pve == new_pve);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
same_pa:
/*
* shootdown tlb if necessary.
*/
if (pmap_ept_has_ad) {
accessed = (~opte & (EPT_R | EPT_A)) == 0;
} else {
accessed = (opte & EPT_R) != 0;
}
if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return 0;
}
/* Pay close attention, this returns L2. */
static int
pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
{
pt_entry_t *pteva;
paddr_t ptepa;
int i, index;
KASSERT(mutex_owned(&pmap->pm_lock));
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
index = pl_pi(va, i);
if (!pmap_ept_valid_entry(pteva[index]))
return i;
ptepa = pmap_pte2pa(pteva[index]);
}
if (lastpde != NULL) {
*lastpde = pteva[index];
}
return 0;
}
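/*
 * pmap_ept_extract: EPT flavour of pmap_extract()
 */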
static bool
pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
pt_entry_t *ptes, pte;
pd_entry_t pde;
paddr_t ptppa, pa;
bool rv;
#ifdef __HAVE_DIRECT_MAP
if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
if (pap != NULL) {
*pap = PMAP_DIRECT_UNMAP(va);
}
return true;
}
#endif
rv = false;
pa = 0;
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
pte = ptes[pl1_pi(va)];
if (__predict_true((pte & EPT_R) != 0)) {
pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
rv = true;
}
}
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
if (pap != NULL) {
*pap = pa;
}
return rv;
}
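/*
 * pmap_ept_remove_pte: EPT flavour of pmap_remove_pte(); returns true if
 * a mapping was removed
 */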
static bool
pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t va)
{
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
bool accessed;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
if (!pmap_ept_valid_entry(*pte)) {
/* VA not mapped. */
return false;
}
/* Atomically save the old PTE and zap it. */
opte = pmap_pte_testset(pte, 0);
if (!pmap_ept_valid_entry(opte)) {
return false;
}
pmap_ept_stats_update_bypte(pmap, 0, opte);
if (ptp) {
/*
* Dropping a PTE. Make sure that the PDE is flushed.
*/
ptp->wire_count--;
if (ptp->wire_count <= 1) {
opte |= EPT_A;
}
}
if (pmap_ept_has_ad) {
accessed = (opte & EPT_A) != 0;
} else {
accessed = true;
}
if (accessed) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
}
/*
* If we are not on a pv list - we are done.
*/
if ((opte & EPT_PVLIST) == 0) {
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without EPT_PVLIST for %#"PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
return true;
}
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
pp = VM_PAGE_TO_PP(pg);
} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: EPT_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
__func__, va, pa, atop(pa));
}
/* Sync R/M bits. */
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
return true;
}
static void
pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
vaddr_t startva, vaddr_t endva)
{
pt_entry_t *pte = (pt_entry_t *)ptpva;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
/*
* mappings are very often sparse, so clip the given range to the
* range of PTEs that are known present in the PTP.
*/
pmap_ptp_range_clip(ptp, &startva, &pte);
/*
* note that ptpva points to the PTE that maps startva. this may
* or may not be the first PTE in the PTP.
*
* we loop through the PTP while there are still PTEs to look at
* and the wire_count is greater than 1 (because we use the wire_count
* to keep track of the number of real PTEs in the PTP).
*/
while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
startva += PAGE_SIZE;
pte++;
}
}
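/*
 * pmap_ept_remove: EPT flavour of pmap_remove()
 */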
static void
pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes;
pd_entry_t pde;
paddr_t ptppa;
vaddr_t blkendva, va = sva;
struct vm_page *ptp;
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
for (/* null */ ; va < eva ; va = blkendva) {
int lvl;
/* determine range of block */
blkendva = x86_round_pdr(va+1);
if (blkendva > eva)
blkendva = eva;
lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
if (lvl != 0) {
/* Skip a range corresponding to an invalid pde. */
blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
continue;
}
/* PA of the PTP */
ptppa = pmap_pte2pa(pde);
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
__func__);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
blkendva);
/* If PTP is no longer being used, free it. */
if (ptp && ptp->wire_count <= 1) {
pmap_ept_free_ptp(pmap, ptp, va);
}
}
kpreempt_enable();
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
}
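/*
 * pmap_ept_sync_pv: EPT flavour of pmap_sync_pv(); clears the requested
 * pte bits and returns the old attributes via 'oattrs'/'optep'
 */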
static int
pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
uint8_t *oattrs, pt_entry_t *optep)
{
struct pmap *pmap;
pt_entry_t *ptep;
pt_entry_t opte;
pt_entry_t npte;
pt_entry_t expect;
bool need_shootdown;
expect = pmap_pa2pte(pa) | EPT_R;
pmap = ptp_to_pmap(ptp);
if (clearbits != ~0) {
KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
clearbits = pmap_pp_attrs_to_ept(clearbits);
}
ptep = pmap_map_pte(pmap, ptp, va);
do {
opte = *ptep;
KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
KASSERT(opte == 0 || (opte & EPT_R) != 0);
if ((opte & (PTE_FRAME | EPT_R)) != expect) {
/*
* We lost a race with a V->P operation like
* pmap_remove(). Wait for the competitor
* reflecting pte bits into mp_attrs.
*/
pmap_unmap_pte();
return EAGAIN;
}
/*
* Check if there's anything to do on this PTE.
*/
if ((opte & clearbits) == 0) {
need_shootdown = false;
break;
}
/*
* We need a shootdown if the PTE is cached (EPT_A) ...
* ... Unless we are clearing only the EPT_W bit and
* it isn't cached as RW (EPT_D).
*/
if (pmap_ept_has_ad) {
need_shootdown = (opte & EPT_A) != 0 &&
!(clearbits == EPT_W && (opte & EPT_D) == 0);
} else {
need_shootdown = true;
}
npte = opte & ~clearbits;
/*
* If we need a shootdown anyway, clear EPT_A and EPT_D.
*/
if (need_shootdown) {
npte &= ~(EPT_A | EPT_D);
}
KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
KASSERT(npte == 0 || (opte & EPT_R) != 0);
} while (pmap_pte_cas(ptep, opte, npte) != opte);
if (need_shootdown) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
}
pmap_unmap_pte();
*oattrs = pmap_ept_to_pp_attrs(opte);
if (optep != NULL)
*optep = opte;
return 0;
}
static void
pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
vaddr_t va)
{
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_ept_stats_update_bypte(pmap, 0, opte);
ptp->wire_count--;
if (ptp->wire_count <= 1) {
pmap_ept_free_ptp(pmap, ptp, va);
}
}
static void
pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
pt_entry_t bit_rem;
pt_entry_t *ptes, *spte;
pt_entry_t opte, npte;
pd_entry_t pde;
paddr_t ptppa;
vaddr_t va;
bool modified;
bit_rem = 0;
if (!(prot & VM_PROT_WRITE))
bit_rem = EPT_W;
sva &= PTE_FRAME;
eva &= PTE_FRAME;
/* Acquire pmap. */
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
for (va = sva; va < eva; va += PAGE_SIZE) {
if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
continue;
}
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
spte = &ptes[pl1_pi(va)];
do {
opte = *spte;
if (!pmap_ept_valid_entry(opte)) {
goto next;
}
npte = (opte & ~bit_rem);
} while (pmap_pte_cas(spte, opte, npte) != opte);
if (pmap_ept_has_ad) {
modified = (opte & EPT_D) != 0;
} else {
modified = true;
}
if (modified) {
vaddr_t tva = x86_ptob(spte - ptes);
pmap_tlb_shootdown(pmap, tva, 0,
TLBSHOOT_WRITE_PROTECT);
}
next:;
}
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
}
static void
pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
{
pt_entry_t *ptes, *ptep, opte;
pd_entry_t pde;
paddr_t ptppa;
/* Acquire pmap. */
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
}
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
ptep = &ptes[pl1_pi(va)];
opte = *ptep;
KASSERT(pmap_ept_valid_entry(opte));
if (opte & EPT_WIRED) {
pt_entry_t npte = opte & ~EPT_WIRED;
opte = pmap_pte_testset(ptep, npte);
pmap_ept_stats_update_bypte(pmap, npte, opte);
} else {
printf("%s: wiring for pmap %p va %#" PRIxVADDR
"did not change!\n", __func__, pmap, va);
}
/* Release pmap. */
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
}
/* -------------------------------------------------------------------------- */
void
pmap_ept_transform(struct pmap *pmap)
{
pmap->pm_enter = pmap_ept_enter;
pmap->pm_extract = pmap_ept_extract;
pmap->pm_remove = pmap_ept_remove;
pmap->pm_sync_pv = pmap_ept_sync_pv;
pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
pmap->pm_write_protect = pmap_ept_write_protect;
pmap->pm_unwire = pmap_ept_unwire;
memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
}
#endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
/* $NetBSD: proc.h,v 1.373 2023/10/04 20:52:07 ad Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2020, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)proc.h 8.15 (Berkeley) 5/19/95
*/
#ifndef _SYS_PROC_H_
#define _SYS_PROC_H_
#include <sys/lwp.h>
#if defined(_KMEMUSER) || defined(_KERNEL)
#if defined(_KERNEL_OPT)
#include "opt_multiprocessor.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#endif
#include <machine/proc.h> /* Machine-dependent proc substruct */
#include <machine/pcb.h>
#include <sys/aio.h>
#include <sys/idtype.h>
#include <sys/rwlock.h>
#include <sys/mqueue.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/queue.h>
#include <sys/radixtree.h>
#include <sys/signalvar.h>
#include <sys/siginfo.h>
#include <sys/event.h>
#include <sys/specificdata.h>
#ifdef _KERNEL
#include <sys/resourcevar.h>
#else
#include <sys/time.h>
#include <sys/resource.h>
#endif
/*
* One structure allocated per session.
*/
struct session {
int s_count; /* Ref cnt; pgrps in session */
u_int s_flags;
#define S_LOGIN_SET 1 /* s_login set in this session */
struct proc *s_leader; /* Session leader */
struct vnode *s_ttyvp; /* Vnode of controlling terminal */
struct tty *s_ttyp; /* Controlling terminal */
char s_login[MAXLOGNAME]; /* Setlogin() name */
pid_t s_sid; /* Session ID (pid of leader) */
};
/*
* One structure allocated per process group.
*/
struct pgrp {
LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members */
struct session *pg_session; /* Pointer to session */
pid_t pg_id; /* Pgrp id */
int pg_jobc; /*
* Number of processes qualifying
* pgrp for job control
*/
};
/*
* Autoloadable syscall definition
*/
struct sc_autoload {
u_int al_code;
const char *al_module;
};
/*
* One structure allocated per emulation.
*/
struct exec_package;
struct ras;
struct kauth_cred;
struct emul {
const char *e_name; /* Symbolic name */
const char *e_path; /* Extra emulation path (NULL if none)*/
#ifndef __HAVE_MINIMAL_EMUL
int e_flags; /* Miscellaneous flags, see above */
/* Syscall handling function */
const int *e_errno; /* Errno array */
int e_nosys; /* Offset of the nosys() syscall */
int e_nsysent; /* Number of system call entries */
#endif
struct sysent *e_sysent; /* System call array */
const uint32_t *e_nomodbits; /* sys_nosys/sys_nomodule flags
* for syscall_disestablish() */
const char * const *e_syscallnames; /* System call name array */
struct sc_autoload *e_sc_autoload; /* List of autoloadable syscalls */
/* Signal sending function */
void (*e_sendsig)(const struct ksiginfo *,
const sigset_t *);
void (*e_trapsignal)(struct lwp *, struct ksiginfo *);
char *e_sigcode; /* Start of sigcode */
char *e_esigcode; /* End of sigcode */
/* Set registers before execution */
struct uvm_object **e_sigobject;/* shared sigcode object */
void (*e_setregs)(struct lwp *, struct exec_package *,
vaddr_t);
/* Per-process hooks */
void (*e_proc_exec)(struct proc *, struct exec_package *);
void (*e_proc_fork)(struct proc *, struct lwp *, int);
void (*e_proc_exit)(struct proc *);
void (*e_lwp_fork)(struct lwp *, struct lwp *);
void (*e_lwp_exit)(struct lwp *);
#ifdef __HAVE_SYSCALL_INTERN
void (*e_syscall_intern)(struct proc *);
#else
void (*e_syscall)(void);
#endif
/* Emulation specific sysctl data */
struct sysctlnode *e_sysctlovly;
vaddr_t (*e_vm_default_addr)(struct proc *, vaddr_t, vsize_t,
int);
/* Emulation-specific hook for userspace page faults */
int (*e_usertrap)(struct lwp *, vaddr_t, void *);
size_t e_ucsize; /* size of ucontext_t */
void (*e_startlwp)(void *);
/* Dtrace syscall probe */
void (*e_dtrace_syscall)(uint32_t, register_t,
const struct sysent *, const void *,
const register_t *, int);
/* Emulation specific support for ktracing signal posts */
void (*e_ktrpsig)(int, sig_t, const sigset_t *,
const struct ksiginfo *);
};
/*
* Emulation miscellaneous flags
*/
#define EMUL_HAS_SYS___syscall 0x001 /* Has SYS___syscall */
/*
* Description of a process.
*
* This structure contains the information needed to manage a thread of
* control, known in UN*X as a process; it has references to substructures
* containing descriptions of things that the process uses, but may share
* with related processes. The process structure and the substructures
* are always addressable except for those marked "(PROC ONLY)" below,
* which might be addressable only on a processor on which the process
* is running.
*
* Field markings and the corresponding locks:
*
* a: p_auxlock
* k: ktrace_mutex
* l: proc_lock
* t: p_stmutex
* p: p_lock
* (: updated atomically
* :: unlocked, stable
*/
struct vmspace;
struct proc {
LIST_ENTRY(proc) p_list; /* l: List of all processes */
kmutex_t *p_lock; /* :: general mutex */
kcondvar_t p_waitcv; /* p: wait, stop CV on children */
kcondvar_t p_lwpcv; /* p: wait, stop CV on LWPs */
/* Substructures: */
struct kauth_cred *p_cred; /* p: Master copy of credentials */
struct filedesc *p_fd; /* :: Ptr to open files structure */
struct cwdinfo *p_cwdi; /* :: cdir/rdir/cmask info */
struct pstats *p_stats; /* :: Accounting/stats (PROC ONLY) */
struct plimit *p_limit; /* :: Process limits */
struct vmspace *p_vmspace; /* :: Address space */
struct sigacts *p_sigacts; /* :: Process sigactions */
struct aioproc *p_aio; /* p: Asynchronous I/O data */
u_int p_mqueue_cnt; /* (: Count of open message queues */
specificdata_reference
p_specdataref; /* subsystem proc-specific data */
int p_exitsig; /* l: signal to send to parent on exit */
int p_flag; /* p: PK_* flags */
int p_sflag; /* p: PS_* flags */
int p_stflag; /* t: PST_* flags */
short p_slflag; /* l, p: PSL_* flags */
char p_stat; /* l: S* process status. */
char p_lflag; /* l: PL_* flags */
char p_trace_enabled;/* p: cached by syscall_intern() */
char p_pad1[3]; /* unused */
pid_t p_pid; /* :: Process identifier. */
LIST_ENTRY(proc) p_pglist; /* l: List of processes in pgrp. */
struct proc *p_pptr; /* l: Pointer to parent process. */
LIST_ENTRY(proc) p_sibling; /* l: List of sibling processes. */
LIST_HEAD(, proc) p_children; /* l: List of children. */
LIST_HEAD(, lwp) p_lwps; /* p: List of LWPs. */
struct ras *p_raslist; /* a: List of RAS entries */
/* The following fields are all zeroed upon creation in fork. */
#define p_startzero p_nlwps
int p_nlwps; /* p: Number of LWPs */
int p_nzlwps; /* p: Number of zombie LWPs */
int p_nrlwps; /* p: Number running/sleeping LWPs */
int p_nlwpwait; /* p: Number of LWPs in lwp_wait1() */
int p_ndlwps; /* p: Number of detached LWPs */
u_int p_nstopchild; /* l: Count of stopped/dead children */
u_int p_waited; /* l: parent has waited on child */
struct lwp *p_zomblwp; /* p: detached LWP to be reaped */
struct lwp *p_vforklwp; /* p: parent LWP waiting at vfork() */
/* scheduling */
void *p_sched_info; /* p: Scheduler-specific structure */
fixpt_t p_estcpu; /* p: Time avg. value of p_cpticks */
fixpt_t p_estcpu_inherited; /* p: cpu inherited from children */
unsigned int p_forktime;
fixpt_t p_pctcpu; /* p: %cpu from dead LWPs */
struct proc *p_opptr; /* l: save parent during ptrace. */
struct ptimers *p_timers; /* Timers: real, virtual, profiling */
struct bintime p_rtime; /* p: real time */
u_quad_t p_uticks; /* t: Statclock hits in user mode */
u_quad_t p_sticks; /* t: Statclock hits in system mode */
u_quad_t p_iticks; /* t: Statclock hits processing intr */
uint64_t p_xutime; /* p: utime exposed to userspace */
uint64_t p_xstime; /* p: stime exposed to userspace */
int p_traceflag; /* k: Kernel trace points */
void *p_tracep; /* k: Trace private data */
struct vnode *p_textvp; /* :: Vnode of executable */
struct emul *p_emul; /* :: emulation information */
void *p_emuldata; /* :: per-proc emul data, or NULL */
const struct execsw *p_execsw; /* :: exec package information */
struct klist p_klist; /* p: knotes attached to proc */
LIST_HEAD(, lwp) p_sigwaiters; /* p: LWPs waiting for signals */
sigpend_t p_sigpend; /* p: pending signals */
struct lcproc *p_lwpctl; /* p, a: _lwp_ctl() information */
pid_t p_ppid; /* :: cached parent pid */
pid_t p_oppid; /* :: cached original parent pid */
char *p_path; /* :: full pathname of executable */
/*
* End area that is zeroed on creation
*/
#define p_endzero p_startcopy
/*
* The following fields are all copied upon creation in fork.
*/
#define p_startcopy p_sigctx
struct sigctx p_sigctx; /* p: Shared signal state */
u_char p_nice; /* p: Process "nice" value */
char p_comm[MAXCOMLEN+1];
/* p: basename of last exec file */
struct pgrp *p_pgrp; /* l: Pointer to process group */
vaddr_t p_psstrp; /* :: address of process's ps_strings */
u_int p_pax; /* :: PAX flags */
int p_xexit; /* p: exit code */
/*
* End area that is copied on creation
*/
#define p_endcopy p_xsig
u_short p_xsig; /* p: stop signal */
u_short p_acflag; /* p: Acc. flags; see struct lwp also */
struct mdproc p_md; /* p: Any machine-dependent fields */
vaddr_t p_stackbase; /* :: ASLR randomized stack base */
struct kdtrace_proc *p_dtrace; /* :: DTrace-specific data. */
/*
* Locks in their own cache line towards the end.
*/
kmutex_t p_auxlock /* :: secondary, longer term lock */
__aligned(COHERENCY_UNIT);
kmutex_t p_stmutex; /* :: mutex on profiling state */
krwlock_t p_reflock; /* :: lock for debugger, procfs */
};
#define p_rlimit p_limit->pl_rlimit
#define p_session p_pgrp->pg_session
#define p_pgid p_pgrp->pg_id
#endif /* _KMEMUSER || _KERNEL */
/*
* Status values.
*/
#define SIDL 1 /* Process being created by fork */
#define SACTIVE 2 /* Process is not stopped */
#define SDYING 3 /* About to die */
#define SSTOP 4 /* Process debugging or suspension */
#define SZOMB 5 /* Awaiting collection by parent */
#define SDEAD 6 /* Almost a zombie */
#define P_ZOMBIE(p) \
((p)->p_stat == SZOMB || (p)->p_stat == SDYING || (p)->p_stat == SDEAD)
/*
* These flags are kept in p_flag and are protected by p_lock. Access from
* process context only.
*/
#define PK_ADVLOCK 0x00000001 /* Process may hold a POSIX advisory lock */
#define PK_SYSTEM 0x00000002 /* System process (kthread) */
#define PK_SYSVSEM 0x00000004 /* Used SysV semaphores */
#define PK_SUGID 0x00000100 /* Had set id privileges since last exec */
#define PK_KMEM 0x00000200 /* Has kmem access */
#define PK_EXEC 0x00004000 /* Process called exec */
#define PK_NOCLDWAIT 0x00020000 /* No zombies if child dies */
#define PK_32 0x00040000 /* 32-bit process (used on 64-bit kernels) */
#define PK_CLDSIGIGN 0x00080000 /* Process is ignoring SIGCHLD */
#define PK_MARKER 0x80000000 /* Is a dummy marker process */
/*
* These flags are kept in p_sflag and are protected by p_lock. Access from
* process context only.
*/
#define PS_NOCLDSTOP 0x00000008 /* No SIGCHLD when children stop */
#define PS_RUMP_LWPEXIT 0x00000400 /* LWPs in RUMP kernel should exit for GC */
#define PS_WCORE 0x00001000 /* Process needs to dump core */
#define PS_WEXIT 0x00002000 /* Working on exiting */
#define PS_STOPFORK 0x00800000 /* Child will be stopped on fork(2) */
#define PS_STOPEXEC 0x01000000 /* Will be stopped on exec(2) */
#define PS_STOPEXIT 0x02000000 /* Will be stopped at process exit */
#define PS_COREDUMP 0x20000000 /* Process core-dumped */
#define PS_CONTINUED 0x40000000 /* Process is continued */
#define PS_STOPPING 0x80000000 /* Transitioning SACTIVE -> SSTOP */
/*
* These flags are kept in p_slflag and are protected by the proc_lock
* and p_lock. Access from process context only.
*/
#define PSL_TRACEFORK 0x00000001 /* traced process wants fork events */
#define PSL_TRACEVFORK 0x00000002 /* traced process wants vfork events */
#define PSL_TRACEVFORK_DONE \
0x00000004 /* traced process wants vfork done events */
#define PSL_TRACELWP_CREATE \
0x00000008 /* traced process wants LWP create events */
#define PSL_TRACELWP_EXIT \
0x00000010 /* traced process wants LWP exit events */
#define PSL_TRACEPOSIX_SPAWN \
0x00000020 /* traced process wants posix_spawn events */
#define PSL_TRACED 0x00000040 /* Debugged process being traced */
#define PSL_TRACEDCHILD 0x00000080 /* Report process birth */
#define PSL_CHTRACED 0x00000100 /* Child has been traced & reparented */
#define PSL_SYSCALL 0x00000200 /* process has PT_SYSCALL enabled */
#define PSL_SYSCALLEMU 0x00000400 /* cancel in-progress syscall */
/*
* Kept in p_stflag and protected by p_stmutex.
*/
#define PST_PROFIL 0x00000020 /* Has started profiling */
/*
* Kept in p_lflag and protected by the proc_lock. Access
* from process context only.
*/
#define PL_CONTROLT 0x00000001 /* Has a controlling terminal */
#define PL_PPWAIT 0x00000002 /* Parent is waiting for child exec/exit */
#define PL_SIGCOMPAT 0x00000004 /* Has used compat signal trampoline */
#define PL_ORPHANPG 0x00000008 /* Member of an orphaned pgrp */
#if defined(_KMEMUSER) || defined(_KERNEL)
/*
* Macro to compute the exit signal to be delivered.
*/
#define P_EXITSIG(p) \
(((p)->p_slflag & PSL_TRACED) ? SIGCHLD : (p)->p_exitsig)
/*
* Compute a wait(2) 16 bit exit status code
*/
#define P_WAITSTATUS(p) W_EXITCODE((p)->p_xexit, ((p)->p_xsig | \
(((p)->p_sflag & PS_COREDUMP) ? WCOREFLAG : 0)))
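/*
 * For example, a process terminated by SIGSEGV that dumped core has
 * p_xexit == 0, p_xsig == SIGSEGV and PS_COREDUMP set, so the macro
 * evaluates to W_EXITCODE(0, SIGSEGV | WCOREFLAG): the signal number,
 * with the core-dump flag OR'd in, ends up in the status word.
 */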
LIST_HEAD(proclist, proc); /* A list of processes */
/*
* This structure associates a proclist with its lock.
*/
struct proclist_desc {
struct proclist *pd_list; /* The list */
/*
* XXX Add a pointer to the proclist's lock eventually.
*/
};
#ifdef _KERNEL
/*
* We use process IDs <= PID_MAX until there are > 16k processes.
* NO_PGID is used to represent "no process group" for a tty.
*/
#define PID_MAX 30000
#define NO_PGID ((pid_t)-1)
#define SESS_LEADER(p) ((p)->p_session->s_leader == (p))
/*
* Flags passed to fork1().
*/
#define FORK_PPWAIT 0x0001 /* Block parent until child exit */
#define FORK_SHAREVM 0x0002 /* Share vmspace with parent */
#define FORK_SHARECWD 0x0004 /* Share cdir/rdir/cmask */
#define FORK_SHAREFILES 0x0008 /* Share file descriptors */
#define FORK_SHARESIGS 0x0010 /* Share signal actions */
#define FORK_NOWAIT 0x0020 /* Make init the parent of the child */
#define FORK_CLEANFILES 0x0040 /* Start with a clean descriptor set */
#define FORK_SYSTEM 0x0080 /* Fork a kernel thread */
extern struct proc proc0; /* Process slot for swapper */
extern u_int nprocs; /* Current number of procs */
extern int maxproc; /* Max number of procs */
#define vmspace_kernel() (proc0.p_vmspace)
extern kmutex_t proc_lock;
extern struct proclist allproc; /* List of all processes */
extern struct proclist zombproc; /* List of zombie processes */
extern struct proc *initproc; /* Process slots for init, pager */
extern const struct proclist_desc proclists[];
int proc_find_locked(struct lwp *, struct proc **, pid_t);
proc_t * proc_find_raw(pid_t);
proc_t * proc_find(pid_t); /* Find process by ID */
proc_t * proc_find_lwpid(pid_t); /* Find process by LWP ID */
struct lwp * proc_find_lwp(proc_t *, pid_t); /* Find LWP in proc by ID */
struct lwp * proc_find_lwp_unlocked(proc_t *, pid_t);
/* Find LWP, acquire proc */
struct lwp * proc_find_lwp_acquire_proc(pid_t, proc_t **);
struct pgrp * pgrp_find(pid_t); /* Find process group by ID */
void procinit(void);
void procinit_sysctl(void);
int proc_enterpgrp(struct proc *, pid_t, pid_t, bool);
void proc_leavepgrp(struct proc *);
void proc_sesshold(struct session *);
void proc_sessrele(struct session *);
void fixjobc(struct proc *, struct pgrp *, int);
int tsleep(wchan_t, pri_t, const char *, int);
int mtsleep(wchan_t, pri_t, const char *, int, kmutex_t *);
void wakeup(wchan_t);
int kpause(const char *, bool, int, kmutex_t *);
void exit1(struct lwp *, int, int) __dead;
int kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval);
int do_sys_wait(int *, int *, int, struct rusage *);
int do_sys_waitid(idtype_t, id_t, int *, int *, int, struct wrusage *,
siginfo_t *);
struct proc *proc_alloc(void);
void proc0_init(void);
pid_t proc_alloc_pid(struct proc *);
void proc_free_pid(pid_t);
pid_t proc_alloc_lwpid(struct proc *, struct lwp *);
void proc_free_lwpid(struct proc *, pid_t);
void proc_free_mem(struct proc *);
void exit_lwps(struct lwp *l);
int fork1(struct lwp *, int, int, void *, size_t,
void (*)(void *), void *, register_t *);
int pgid_in_session(struct proc *, pid_t);
void cpu_lwp_fork(struct lwp *, struct lwp *, void *, size_t,
void (*)(void *), void *);
void cpu_lwp_free(struct lwp *, int);
void cpu_lwp_free2(struct lwp *);
void cpu_spawn_return(struct lwp*);
#ifdef __HAVE_SYSCALL_INTERN
void syscall_intern(struct proc *);
#endif
void md_child_return(struct lwp *);
void child_return(void *);
int proc_isunder(struct proc *, struct lwp *);
int proc_uidmatch(kauth_cred_t, kauth_cred_t);
int proc_vmspace_getref(struct proc *, struct vmspace **);
void proc_crmod_leave(kauth_cred_t, kauth_cred_t, bool);
void proc_crmod_enter(void);
int proc_getauxv(struct proc *, void **, size_t *);
int proc_specific_key_create(specificdata_key_t *, specificdata_dtor_t);
void proc_specific_key_delete(specificdata_key_t);
void proc_initspecific(struct proc *);
void proc_finispecific(struct proc *);
void * proc_getspecific(struct proc *, specificdata_key_t);
void proc_setspecific(struct proc *, specificdata_key_t, void *);
int proc_compare(const struct proc *, const struct lwp *,
const struct proc *, const struct lwp *);
/*
* Special handlers for delivering EVFILT_PROC notifications. These
* exist to handle some of the special locking considerations around
* processes.
*/
void knote_proc_exec(struct proc *);
void knote_proc_fork(struct proc *, struct proc *);
void knote_proc_exit(struct proc *);
int proclist_foreach_call(struct proclist *,
int (*)(struct proc *, void *arg), void *);
static __inline struct proc *
_proclist_skipmarker(struct proc *p0)
{
struct proc *p = p0;
while (p != NULL && p->p_flag & PK_MARKER)
p = LIST_NEXT(p, p_list);
return p;
}
#define PROC_PTRSZ(p) (((p)->p_flag & PK_32) ? sizeof(int) : sizeof(void *))
#define PROC_REGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_reg32) : sizeof(struct reg))
#define PROC_FPREGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_fpreg32) : sizeof(struct fpreg))
#define PROC_DBREGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_dbreg32) : sizeof(struct dbreg))
#ifndef PROC_MACHINE_ARCH
#define PROC_MACHINE_ARCH(p) machine_arch
#endif
/*
* PROCLIST_FOREACH: iterate on the given proclist, skipping PK_MARKER ones.
*/
#define PROCLIST_FOREACH(var, head) \
for ((var) = LIST_FIRST(head); \
((var) = _proclist_skipmarker(var)) != NULL; \
(var) = LIST_NEXT(var, p_list))
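/*
 * Example usage (illustrative sketch only, not compiled;
 * count_user_procs() is a hypothetical helper, not part of this
 * header): walk every process on the allproc list with proc_lock
 * held, letting PROCLIST_FOREACH skip the PK_MARKER placeholders.
 */
#if 0
static int
count_user_procs(void)
{
        struct proc *p;
        int n = 0;

        mutex_enter(&proc_lock);
        PROCLIST_FOREACH(p, &allproc) {
                if ((p->p_flag & PK_SYSTEM) == 0)
                        n++;
        }
        mutex_exit(&proc_lock);
        return n;
}
#endif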
#ifdef KSTACK_CHECK_MAGIC
void kstack_setup_magic(const struct lwp *);
void kstack_check_magic(const struct lwp *);
#else
#define kstack_setup_magic(x)
#define kstack_check_magic(x)
#endif
extern struct emul emul_netbsd;
#endif /* _KERNEL */
/*
* Kernel stack parameters.
*
* KSTACK_LOWEST_ADDR: return the lowest address of the LWP's kernel stack,
* excluding red-zone.
*
* KSTACK_SIZE: the size of the kernel stack for an LWP, excluding red-zone.
*
* if <machine/proc.h> provides the MD definition, it will be used.
*/
#ifndef KSTACK_LOWEST_ADDR
#define KSTACK_LOWEST_ADDR(l) ((void *)ALIGN((struct pcb *)((l)->l_addr) + 1))
#endif
#ifndef KSTACK_SIZE
#define KSTACK_SIZE (USPACE - ALIGN(sizeof(struct pcb)))
#endif
#endif /* _KMEMUSER || _KERNEL */
#endif /* !_SYS_PROC_H_ */
/* $NetBSD: uvm_amap.c,v 1.129 2023/09/10 14:54:34 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_amap.c: amap operations
*/
/*
* this file contains functions that perform operations on amaps. see
* uvm_amap.h for a brief explanation of the role of amaps in uvm.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_amap.c,v 1.129 2023/09/10 14:54:34 ad Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>
/*
* cache for allocation of vm_amap structures. note that in order to
* avoid an endless loop, the amap cache's allocator cannot allocate
* memory from an amap (it currently goes through the kernel uobj, so
* we are ok).
*/
static struct pool_cache uvm_amap_cache;
static kmutex_t amap_list_lock __cacheline_aligned;
static LIST_HEAD(, vm_amap) amap_list;
/*
* local functions
*/
static int
amap_roundup_slots(int slots)
{
return kmem_roundup_size(slots * sizeof(int)) / sizeof(int);
}
#ifdef UVM_AMAP_PPREF
/*
* what is ppref? ppref is an _optional_ amap feature which is used
* to keep track of reference counts on a per-page basis. it is enabled
* when UVM_AMAP_PPREF is defined.
*
* when enabled, an array of ints is allocated for the pprefs. this
* array is allocated only when a partial reference is added to the
* map (either by unmapping part of the amap, or gaining a reference
* to only a part of an amap). if the allocation of the array fails
* (KM_NOSLEEP), then we set the array pointer to PPREF_NONE to indicate
* that we tried to do ppref's but couldn't alloc the array so just
* give up (after all, this is an optional feature!).
*
* the array is divided into page sized "chunks." for chunks of length 1,
* the chunk reference count plus one is stored in that chunk's slot.
* for chunks of length > 1 the first slot contains (the reference count
* plus one) * -1. [the negative value indicates that the length is
* greater than one.] the second slot of the chunk contains the length
* of the chunk. here is an example:
*
* actual REFS: 2 2 2 2 3 1 1 0 0 0 4 4 0 1 1 1
* ppref: -3 4 x x 4 -2 2 -1 3 x -5 2 1 -2 3 x
* <----------><-><----><-------><----><-><------->
* (x = don't care)
*
* this allows one int to contain the ref count for the whole
* chunk. note that the "plus one" part is needed because a reference
* count of zero is neither positive nor negative (need a way to tell
* if we've got one zero or a bunch of them).
*
* here are some in-line functions to help us.
*/
/*
* pp_getreflen: get the reference and length for a specific offset
*
* => ppref's amap must be locked
*/
static inline void
pp_getreflen(int *ppref, int offset, int *refp, int *lenp)
{
if (ppref[offset] > 0) { /* chunk size must be 1 */
*refp = ppref[offset] - 1; /* don't forget to adjust */
*lenp = 1;
} else {
*refp = (ppref[offset] * -1) - 1;
*lenp = ppref[offset+1];
}
}
/*
* pp_setreflen: set the reference and length for a specific offset
*
* => ppref's amap must be locked
*/
static inline void
pp_setreflen(int *ppref, int offset, int ref, int len)
{
if (len == 0)
return;
if (len == 1) {
ppref[offset] = ref + 1;
} else {
ppref[offset] = (ref + 1) * -1;
ppref[offset+1] = len;
}
}
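/*
 * Illustrative sketch (not compiled; ppref_decode_example() is a
 * hypothetical helper): decoding the sample ppref array from the
 * comment above with pp_getreflen(). The first chunk stores -3/4,
 * i.e. a reference count of 2 spanning 4 slots; the single-slot
 * chunk at offset 4 stores 4, i.e. a reference count of 3.
 */
#if 0
static void
ppref_decode_example(void)
{
        /* encodes "actual REFS: 2 2 2 2 3" from the example above */
        int ppref[] = { -3, 4, 0, 0, 4 };
        int offset = 0, ref, len;

        while (offset < 5) {
                pp_getreflen(ppref, offset, &ref, &len);
                printf("slots %d..%d: ref %d\n", offset,
                    offset + len - 1, ref);
                offset += len;
        }
}
#endif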
#endif /* UVM_AMAP_PPREF */
/*
* amap_alloc1: allocate an amap, but do not initialise the overlay.
*
* => Note: lock is not set.
*/
static struct vm_amap *
amap_alloc1(int slots, int padslots, int flags)
{
const bool nowait = (flags & UVM_FLAG_NOWAIT) != 0;
const km_flag_t kmflags = nowait ? KM_NOSLEEP : KM_SLEEP;
struct vm_amap *amap;
krwlock_t *newlock, *oldlock;
int totalslots;
amap = pool_cache_get(&uvm_amap_cache, nowait ? PR_NOWAIT : PR_WAITOK);
if (amap == NULL) {
return NULL;
}
KASSERT(amap->am_lock != NULL);
KASSERT(amap->am_nused == 0);
/* Try to privatize the lock if currently shared. */
if (rw_obj_refcnt(amap->am_lock) > 1) {
newlock = rw_obj_tryalloc();
if (newlock != NULL) {
oldlock = amap->am_lock;
mutex_enter(&amap_list_lock);
amap->am_lock = newlock;
mutex_exit(&amap_list_lock);
rw_obj_free(oldlock);
}
}
totalslots = amap_roundup_slots(slots + padslots);
amap->am_ref = 1;
amap->am_flags = 0;
#ifdef UVM_AMAP_PPREF
amap->am_ppref = NULL;
#endif
amap->am_maxslot = totalslots;
amap->am_nslot = slots;
/*
* Note: since allocations are likely big, we expect to reduce the
* memory fragmentation by allocating them in separate blocks.
*/
amap->am_slots = kmem_alloc(totalslots * sizeof(int), kmflags);
if (amap->am_slots == NULL)
goto fail1;
amap->am_bckptr = kmem_alloc(totalslots * sizeof(int), kmflags);
if (amap->am_bckptr == NULL)
goto fail2;
amap->am_anon = kmem_alloc(totalslots * sizeof(struct vm_anon *),
kmflags);
if (amap->am_anon == NULL)
goto fail3;
return amap;
fail3:
kmem_free(amap->am_bckptr, totalslots * sizeof(int));
fail2:
kmem_free(amap->am_slots, totalslots * sizeof(int));
fail1:
pool_cache_put(&uvm_amap_cache, amap);
/*
* XXX hack to tell the pagedaemon how many pages we need,
* since we can need more than it would normally free.
*/
if (nowait) {
extern u_int uvm_extrapages;
atomic_add_int(&uvm_extrapages,
((sizeof(int) * 2 + sizeof(struct vm_anon *)) *
totalslots) >> PAGE_SHIFT);
}
return NULL;
}
/*
* amap_alloc: allocate an amap to manage "sz" bytes of anonymous VM
*
* => caller should ensure sz is a multiple of PAGE_SIZE
* => reference count to new amap is set to one
* => new amap is returned unlocked
*/
struct vm_amap *
amap_alloc(vaddr_t sz, vaddr_t padsz, int waitf)
{
struct vm_amap *amap;
int slots, padslots;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
AMAP_B2SLOT(slots, sz);
AMAP_B2SLOT(padslots, padsz);
amap = amap_alloc1(slots, padslots, waitf);
if (amap) {
memset(amap->am_anon, 0,
amap->am_maxslot * sizeof(struct vm_anon *));
}
UVMHIST_LOG(maphist,"<- done, amap = %#jx, sz=%jd", (uintptr_t)amap,
sz, 0, 0);
return(amap);
}
/*
* amap_ctor: pool_cache constructor for new amaps
*
* => carefully synchronize with amap_swap_off()
*/
static int
amap_ctor(void *arg, void *obj, int flags)
{
struct vm_amap *amap = obj;
if ((flags & PR_NOWAIT) != 0) {
amap->am_lock = rw_obj_tryalloc();
if (amap->am_lock == NULL) {
return ENOMEM;
}
} else {
amap->am_lock = rw_obj_alloc();
}
amap->am_nused = 0;
amap->am_flags = 0;
mutex_enter(&amap_list_lock);
LIST_INSERT_HEAD(&amap_list, amap, am_list);
mutex_exit(&amap_list_lock);
return 0;
}
/*
* amap_dtor: pool_cache destructor for amaps
*
* => carefully synchronize with amap_swap_off()
*/
static void
amap_dtor(void *arg, void *obj)
{
struct vm_amap *amap = obj;
KASSERT(amap->am_nused == 0);
mutex_enter(&amap_list_lock);
LIST_REMOVE(amap, am_list);
mutex_exit(&amap_list_lock);
rw_obj_free(amap->am_lock);
}
/*
* uvm_amap_init: initialize the amap system.
*/
void
uvm_amap_init(void)
{
mutex_init(&amap_list_lock, MUTEX_DEFAULT, IPL_NONE);
pool_cache_bootstrap(&uvm_amap_cache, sizeof(struct vm_amap),
COHERENCY_UNIT, 0, 0, "amappl", NULL, IPL_NONE,
amap_ctor, amap_dtor, NULL);
}
/*
* amap_free: free an amap
*
* => the amap must be unlocked
* => the amap should have a zero reference count and be empty
*/
void
amap_free(struct vm_amap *amap)
{
int slots;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(amap->am_ref == 0);
KASSERT(amap->am_nused == 0);
KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
slots = amap->am_maxslot;
kmem_free(amap->am_slots, slots * sizeof(*amap->am_slots));
kmem_free(amap->am_bckptr, slots * sizeof(*amap->am_bckptr));
kmem_free(amap->am_anon, slots * sizeof(*amap->am_anon));
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
kmem_free(amap->am_ppref, slots * sizeof(*amap->am_ppref));
#endif
pool_cache_put(&uvm_amap_cache, amap);
UVMHIST_LOG(maphist,"<- done, freed amap = %#jx", (uintptr_t)amap,
0, 0, 0);
}
/*
* amap_extend: extend the size of an amap (if needed)
*
* => called from uvm_map when we want to extend an amap to cover
* a new mapping (rather than allocate a new one)
* => amap should be unlocked (we will lock it)
* => to safely extend an amap it should have a reference count of
* one (thus it can't be shared)
*/
int
amap_extend(struct vm_map_entry *entry, vsize_t addsize, int flags)
{
struct vm_amap *amap = entry->aref.ar_amap;
int slotoff = entry->aref.ar_pageoff;
int slotmapped, slotadd, slotneed, slotadded, slotalloc;
int slotadj, slotarea, slotendoff;
int oldnslots;
#ifdef UVM_AMAP_PPREF
int *newppref, *oldppref;
#endif
int i, *newsl, *newbck, *oldsl, *oldbck;
struct vm_anon **newover, **oldover;
const km_flag_t kmflags =
(flags & AMAP_EXTEND_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (entry=%#jx, addsize=%#jx, flags=%#jx)",
(uintptr_t)entry, addsize, flags, 0);
/*
* first, determine how many slots we need in the amap. don't
* forget that ar_pageoff could be non-zero: this means that
* there are some unused slots before us in the amap.
*/
amap_lock(amap, RW_WRITER);
KASSERT(amap_refs(amap) == 1); /* amap can't be shared */
AMAP_B2SLOT(slotmapped, entry->end - entry->start); /* slots mapped */
AMAP_B2SLOT(slotadd, addsize); /* slots to add */
if (flags & AMAP_EXTEND_FORWARDS) {
slotneed = slotoff + slotmapped + slotadd;
slotadj = 0;
slotarea = 0;
} else {
slotneed = slotadd + slotmapped;
slotadj = slotadd - slotoff;
slotarea = amap->am_maxslot - slotmapped;
}
/*
* Because this amap only has 1 ref, we know that there is
* only one vm_map_entry pointing to it, and the one entry is
* using slots between slotoff and slotoff + slotmapped. If
* we have been using ppref then we know that only slots in
* the one map entry's range can have anons, since ppref
* allowed us to free any anons outside that range as other map
* entries which used this amap were removed. But without ppref,
* we couldn't know which slots were still needed by other map
* entries, so we couldn't free any anons as we removed map
* entries, and so any slot from 0 to am_nslot can have an
* anon. But now that we know there is only one map entry
* left and we know its range, we can free up any anons
* outside that range. This is necessary because the rest of
* this function assumes that there are no anons in the amap
* outside of the one map entry's range.
*/
slotendoff = slotoff + slotmapped;
if (amap->am_ppref == PPREF_NONE) {
amap_wiperange(amap, 0, slotoff);
amap_wiperange(amap, slotendoff, amap->am_nslot - slotendoff);
}
for (i = 0; i < slotoff; i++) {
KASSERT(amap->am_anon[i] == NULL);
}
for (i = slotendoff; i < amap->am_nslot - slotendoff; i++) {
KASSERT(amap->am_anon[i] == NULL);
}
/*
* case 1: we already have enough slots in the map and thus
* only need to bump the reference counts on the slots we are
* adding.
*/
if (flags & AMAP_EXTEND_FORWARDS) {
if (amap->am_nslot >= slotneed) {
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
amap_pp_adjref(amap, slotoff + slotmapped,
slotadd, 1);
}
#endif
amap_unlock(amap);
UVMHIST_LOG(maphist,
"<- done (case 1f), amap = %#jx, sltneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
} else {
if (slotadj <= 0) {
slotoff -= slotadd;
entry->aref.ar_pageoff = slotoff;
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
amap_pp_adjref(amap, slotoff, slotadd, 1);
}
#endif
amap_unlock(amap);
UVMHIST_LOG(maphist,
"<- done (case 1b), amap = %#jx, sltneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
}
/*
* case 2: we pre-allocated slots for use and we just need to
* bump nslot up to account for these slots.
*/
if (amap->am_maxslot >= slotneed) {
if (flags & AMAP_EXTEND_FORWARDS) {
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
if ((slotoff + slotmapped) < amap->am_nslot)
amap_pp_adjref(amap,
slotoff + slotmapped,
(amap->am_nslot -
(slotoff + slotmapped)), 1);
pp_setreflen(amap->am_ppref, amap->am_nslot, 1,
slotneed - amap->am_nslot);
}
#endif
amap->am_nslot = slotneed;
amap_unlock(amap);
/*
* no need to zero am_anon since that was done at
* alloc time and we never shrink an allocation.
*/
UVMHIST_LOG(maphist,"<- done (case 2f), amap = %#jx, "
"slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0);
return 0;
} else {
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
/*
* Slide up the ref counts on the pages that
* are actually in use.
*/
memmove(amap->am_ppref + slotarea,
amap->am_ppref + slotoff,
slotmapped * sizeof(int));
/*
* Mark the (adjusted) gap at the front as
* referenced/not referenced.
*/
pp_setreflen(amap->am_ppref,
0, 0, slotarea - slotadd);
pp_setreflen(amap->am_ppref,
slotarea - slotadd, 1, slotadd);
}
#endif
/*
* Slide the anon pointers up and clear out
* the space we just made.
*/
memmove(amap->am_anon + slotarea,
amap->am_anon + slotoff,
slotmapped * sizeof(struct vm_anon*));
memset(amap->am_anon + slotoff, 0,
(slotarea - slotoff) * sizeof(struct vm_anon *));
/*
* Slide the backpointers up, but don't bother
* wiping out the old slots.
*/
memmove(amap->am_bckptr + slotarea,
amap->am_bckptr + slotoff,
slotmapped * sizeof(int));
/*
* Adjust all the useful active slot numbers.
*/
for (i = 0; i < amap->am_nused; i++)
amap->am_slots[i] += (slotarea - slotoff);
/*
* We just filled all the empty space in the
* front of the amap by activating a few new
* slots.
*/
amap->am_nslot = amap->am_maxslot;
entry->aref.ar_pageoff = slotarea - slotadd;
amap_unlock(amap);
UVMHIST_LOG(maphist,"<- done (case 2b), amap = %#jx, "
"slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0);
return 0;
}
}
/*
* Case 3: we need to allocate a new amap and copy all the amap
* data over from old amap to the new one. Drop the lock before
* performing allocation.
*
* Note: since allocations are likely big, we expect to reduce the
* memory fragmentation by allocating them in separate blocks.
*/
amap_unlock(amap);
if (slotneed >= UVM_AMAP_LARGE) {
return E2BIG;
}
slotalloc = amap_roundup_slots(slotneed);
#ifdef UVM_AMAP_PPREF
newppref = NULL;
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
/* Will be handled later if fails. */
newppref = kmem_alloc(slotalloc * sizeof(*newppref), kmflags);
}
#endif
newsl = kmem_alloc(slotalloc * sizeof(*newsl), kmflags);
newbck = kmem_alloc(slotalloc * sizeof(*newbck), kmflags);
newover = kmem_alloc(slotalloc * sizeof(*newover), kmflags);
if (newsl == NULL || newbck == NULL || newover == NULL) {
#ifdef UVM_AMAP_PPREF
if (newppref != NULL) {
kmem_free(newppref, slotalloc * sizeof(*newppref));
}
#endif
if (newsl != NULL) {
kmem_free(newsl, slotalloc * sizeof(*newsl));
}
if (newbck != NULL) {
kmem_free(newbck, slotalloc * sizeof(*newbck));
}
if (newover != NULL) {
kmem_free(newover, slotalloc * sizeof(*newover));
}
return ENOMEM;
}
amap_lock(amap, RW_WRITER);
KASSERT(amap->am_maxslot < slotneed);
/*
* Copy everything over to new allocated areas.
*/
slotadded = slotalloc - amap->am_nslot;
if (!(flags & AMAP_EXTEND_FORWARDS))
slotarea = slotalloc - slotmapped;
/* do am_slots */
oldsl = amap->am_slots;
if (flags & AMAP_EXTEND_FORWARDS)
memcpy(newsl, oldsl, sizeof(int) * amap->am_nused);
else
for (i = 0; i < amap->am_nused; i++)
newsl[i] = oldsl[i] + slotarea - slotoff;
amap->am_slots = newsl;
/* do am_anon */
oldover = amap->am_anon;
if (flags & AMAP_EXTEND_FORWARDS) {
memcpy(newover, oldover,
sizeof(struct vm_anon *) * amap->am_nslot);
memset(newover + amap->am_nslot, 0,
sizeof(struct vm_anon *) * slotadded);
} else {
memcpy(newover + slotarea, oldover + slotoff,
sizeof(struct vm_anon *) * slotmapped);
memset(newover, 0,
sizeof(struct vm_anon *) * slotarea);
}
amap->am_anon = newover;
/* do am_bckptr */
oldbck = amap->am_bckptr;
if (flags & AMAP_EXTEND_FORWARDS)
memcpy(newbck, oldbck, sizeof(int) * amap->am_nslot);
else
memcpy(newbck + slotarea, oldbck + slotoff,
sizeof(int) * slotmapped);
amap->am_bckptr = newbck;
#ifdef UVM_AMAP_PPREF
/* do ppref */
oldppref = amap->am_ppref;
if (newppref) {
if (flags & AMAP_EXTEND_FORWARDS) {
memcpy(newppref, oldppref,
sizeof(int) * amap->am_nslot);
memset(newppref + amap->am_nslot, 0,
sizeof(int) * slotadded);
} else {
memcpy(newppref + slotarea, oldppref + slotoff,
sizeof(int) * slotmapped);
}
amap->am_ppref = newppref;
if ((flags & AMAP_EXTEND_FORWARDS) &&
(slotoff + slotmapped) < amap->am_nslot)
amap_pp_adjref(amap, slotoff + slotmapped,
(amap->am_nslot - (slotoff + slotmapped)), 1);
if (flags & AMAP_EXTEND_FORWARDS)
pp_setreflen(newppref, amap->am_nslot, 1,
slotneed - amap->am_nslot);
else {
pp_setreflen(newppref, 0, 0,
slotalloc - slotneed);
pp_setreflen(newppref, slotalloc - slotneed, 1,
slotneed - slotmapped);
}
} else {
if (amap->am_ppref)
amap->am_ppref = PPREF_NONE;
}
#endif
/* update master values */
if (flags & AMAP_EXTEND_FORWARDS)
amap->am_nslot = slotneed;
else {
entry->aref.ar_pageoff = slotarea - slotadd;
amap->am_nslot = slotalloc;
}
oldnslots = amap->am_maxslot;
amap->am_maxslot = slotalloc;
amap_unlock(amap);
kmem_free(oldsl, oldnslots * sizeof(*oldsl));
kmem_free(oldbck, oldnslots * sizeof(*oldbck));
kmem_free(oldover, oldnslots * sizeof(*oldover));
#ifdef UVM_AMAP_PPREF
if (oldppref && oldppref != PPREF_NONE)
kmem_free(oldppref, oldnslots * sizeof(*oldppref));
#endif
UVMHIST_LOG(maphist,"<- done (case 3), amap = %#jx, slotneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
/*
* amap_share_protect: change protection of anons in a shared amap
*
* for shared amaps, given the current data structure layout, it is
* not possible for us to directly locate all maps referencing the
* shared anon (to change the protection). in order to protect data
* in shared maps we use pmap_page_protect(). [this is useful for IPC
* mechanisms like map entry passing that may want to write-protect
* all mappings of a shared amap.] we traverse am_anon or am_slots
* depending on the current state of the amap.
*
* => entry's map and amap must be locked by the caller
*/
void
amap_share_protect(struct vm_map_entry *entry, vm_prot_t prot)
{
struct vm_amap *amap = entry->aref.ar_amap;
u_int slots, lcv, slot, stop;
struct vm_anon *anon;
KASSERT(rw_write_held(amap->am_lock));
AMAP_B2SLOT(slots, (entry->end - entry->start));
stop = entry->aref.ar_pageoff + slots;
if (slots < amap->am_nused) {
/*
* Cheaper to traverse am_anon.
*/
for (lcv = entry->aref.ar_pageoff ; lcv < stop ; lcv++) {
anon = amap->am_anon[lcv];
if (anon == NULL) {
continue;
}
if (anon->an_page) {
pmap_page_protect(anon->an_page, prot);
}
}
return;
}
/*
* Cheaper to traverse am_slots.
*/
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
slot = amap->am_slots[lcv];
if (slot < entry->aref.ar_pageoff || slot >= stop) {
continue;
}
anon = amap->am_anon[slot];
if (anon->an_page) {
pmap_page_protect(anon->an_page, prot);
}
}
}
/*
* amap_wipeout: wipeout all anon's in an amap; then free the amap!
*
* => Called from amap_unref(), when reference count drops to zero.
* => amap must be locked.
*/
void
amap_wipeout(struct vm_amap *amap)
{
u_int lcv;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(amap=%#jx)", (uintptr_t)amap, 0,0,0);
KASSERT(rw_write_held(amap->am_lock));
KASSERT(amap->am_ref == 0);
if (__predict_false(amap->am_flags & AMAP_SWAPOFF)) {
/*
* Note: amap_swap_off() will call us again.
*/
amap_unlock(amap);
return;
}
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
struct vm_anon *anon;
u_int slot;
slot = amap->am_slots[lcv];
anon = amap->am_anon[slot];
KASSERT(anon != NULL);
KASSERT(anon->an_ref != 0);
KASSERT(anon->an_lock == amap->am_lock);
UVMHIST_LOG(maphist," processing anon %#jx, ref=%jd",
(uintptr_t)anon, anon->an_ref, 0, 0);
/*
* Drop the reference.
*/
if (__predict_true(--anon->an_ref == 0)) {
uvm_anfree(anon);
}
if (__predict_false((lcv & 31) == 31)) {
preempt_point();
}
}
/*
* Finally, destroy the amap.
*/
amap->am_nused = 0;
amap_unlock(amap);
amap_free(amap);
UVMHIST_LOG(maphist,"<- done!", 0,0,0,0);
}
/*
* amap_copy: ensure that a map entry's "needs_copy" flag is false
* by copying the amap if necessary.
*
* => an entry with a null amap pointer will get a new (blank) one.
* => the map that the map entry belongs to must be locked by caller.
* => the amap currently attached to "entry" (if any) must be unlocked.
* => if canchunk is true, then we may clip the entry into a chunk
* => "startva" and "endva" are used only if canchunk is true. they are
* used to limit chunking (e.g. if you have a large space that you
* know you are going to need to allocate amaps for, there is no point
* in allowing that to be chunked)
*/
void
amap_copy(struct vm_map *map, struct vm_map_entry *entry, int flags,
vaddr_t startva, vaddr_t endva)
{
const int waitf = (flags & AMAP_COPY_NOWAIT) ? UVM_FLAG_NOWAIT : 0;
struct vm_amap *amap, *srcamap;
u_int slots, lcv;
krwlock_t *oldlock;
vsize_t len;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (map=%#jx, entry=%#jx, flags=%#jx)",
(uintptr_t)map, (uintptr_t)entry, flags, -2);
KASSERT(map != kernel_map); /* we use nointr pool */
srcamap = entry->aref.ar_amap;
len = entry->end - entry->start;
/*
* Is there an amap to copy? If not, create one.
*/
if (srcamap == NULL) {
const bool canchunk = (flags & AMAP_COPY_NOCHUNK) == 0;
/*
* Check to see if we have a large amap that we can
* chunk. We align startva/endva to chunk-sized
* boundaries and then clip to them.
*/
if (canchunk && atop(len) >= UVM_AMAP_LARGE) {
vsize_t chunksize;
/* Convert slots to bytes. */
chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT;
startva = (startva / chunksize) * chunksize;
endva = roundup(endva, chunksize);
UVMHIST_LOG(maphist,
" chunk amap ==> clip %#jx->%#jx to %#jx->%#jx",
entry->start, entry->end, startva, endva);
UVM_MAP_CLIP_START(map, entry, startva);
/* Watch out for endva wrap-around! */
if (endva >= startva) {
UVM_MAP_CLIP_END(map, entry, endva);
}
}
if ((flags & AMAP_COPY_NOMERGE) == 0 &&
uvm_mapent_trymerge(map, entry, UVM_MERGE_COPYING)) {
return;
}
UVMHIST_LOG(maphist, "<- done [creating new amap %#jx->%#jx]",
entry->start, entry->end, 0, 0);
/*
* Allocate an initialised amap and install it.
* Note: we must update the length after clipping.
*/
len = entry->end - entry->start;
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap_alloc(len, 0, waitf);
if (entry->aref.ar_amap != NULL) {
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
return;
}
/*
* First check and see if we are the only map entry referencing
* the amap we currently have. If so, then just take it over instead
* of copying it. Note that we are reading am_ref without lock held
* as the value can only be one if we have the only reference
* to the amap (via our locked map). If the value is greater than
* one, then allocate amap and re-check the value.
*/
if (srcamap->am_ref == 1) {
entry->etype &= ~UVM_ET_NEEDSCOPY;
UVMHIST_LOG(maphist, "<- done [ref cnt = 1, took it over]",
0, 0, 0, 0);
return;
}
UVMHIST_LOG(maphist," amap=%#jx, ref=%jd, must copy it",
(uintptr_t)srcamap, srcamap->am_ref, 0, 0);
/*
* Allocate a new amap (note: not initialised, etc).
*/
AMAP_B2SLOT(slots, len);
amap = amap_alloc1(slots, 0, waitf);
if (amap == NULL) {
UVMHIST_LOG(maphist, " amap_alloc1 failed", 0,0,0,0);
return;
}
/*
* Make the new amap share the source amap's lock, and then lock
* both. We must do this before we set am_nused != 0, otherwise
* amap_swap_off() can become interested in the amap.
*/
oldlock = amap->am_lock;
mutex_enter(&amap_list_lock);
amap->am_lock = srcamap->am_lock;
mutex_exit(&amap_list_lock);
rw_obj_hold(amap->am_lock);
rw_obj_free(oldlock);
amap_lock(srcamap, RW_WRITER);
/*
* Re-check the reference count with the lock held. If it has
* dropped to one - we can take over the existing map.
*/
if (srcamap->am_ref == 1) {
/* Just take over the existing amap. */
entry->etype &= ~UVM_ET_NEEDSCOPY;
amap_unlock(srcamap);
/* Destroy the new (unused) amap. */
amap->am_ref--;
amap_free(amap);
return;
}
/*
* Copy the slots. Zero the padded part.
*/
UVMHIST_LOG(maphist, " copying amap now",0, 0, 0, 0);
for (lcv = 0 ; lcv < slots; lcv++) {
amap->am_anon[lcv] =
srcamap->am_anon[entry->aref.ar_pageoff + lcv];
if (amap->am_anon[lcv] == NULL)
continue;
KASSERT(amap->am_anon[lcv]->an_lock == srcamap->am_lock);
KASSERT(amap->am_anon[lcv]->an_ref > 0);
KASSERT(amap->am_nused < amap->am_maxslot);
amap->am_anon[lcv]->an_ref++;
amap->am_bckptr[lcv] = amap->am_nused;
amap->am_slots[amap->am_nused] = lcv;
amap->am_nused++;
}
memset(&amap->am_anon[lcv], 0,
(amap->am_maxslot - lcv) * sizeof(struct vm_anon *));
/*
* Drop our reference to the old amap (srcamap) and unlock.
* Since the reference count on srcamap is greater than one,
* (we checked above), it cannot drop to zero while it is locked.
*/
srcamap->am_ref--;
KASSERT(srcamap->am_ref > 0);
if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0) {
srcamap->am_flags &= ~AMAP_SHARED;
}
#ifdef UVM_AMAP_PPREF
if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) {
amap_pp_adjref(srcamap, entry->aref.ar_pageoff,
len >> PAGE_SHIFT, -1);
}
#endif
amap_unlock(srcamap);
/*
* Install new amap.
*/
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap;
entry->etype &= ~UVM_ET_NEEDSCOPY;
UVMHIST_LOG(maphist, "<- done",0, 0, 0, 0);
}
/*
* amap_cow_now: resolve all copy-on-write faults in an amap now for fork(2)
*
* called during fork(2) when the parent process has a wired map
* entry. in that case we want to avoid write-protecting pages
* in the parent's map (e.g. like what you'd do for a COW page)
* so we resolve the COW here.
*
* => assume parent's entry was wired, thus all pages are resident.
* => assume pages that are loaned out (loan_count) are already mapped
* read-only in all maps, and thus no need for us to worry about them
* => assume both parent and child vm_map's are locked
* => caller passes child's map/entry in to us
* => if we run out of memory we will unlock the amap and sleep _with_ the
* parent and child vm_map's locked(!). we have to do this since
* we are in the middle of a fork(2) and we can't let the parent
* map change until we are done copying all the map entries.
* => XXXCDC: out of memory should cause fork to fail, but there is
* currently no easy way to do this (needs fix)
*/
void
amap_cow_now(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_amap *amap = entry->aref.ar_amap;
struct vm_anon *anon, *nanon;
struct vm_page *pg, *npg;
u_int lcv, slot;
/*
* note that if we unlock the amap then we must ReStart the "lcv" for
* loop because some other process could reorder the anons in the
* am_anon[] array on us while the lock is dropped.
*/
ReStart:
amap_lock(amap, RW_WRITER);
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
slot = amap->am_slots[lcv];
anon = amap->am_anon[slot];
KASSERT(anon->an_lock == amap->am_lock);
/*
* If anon has only one reference - we must have already
* copied it. This can happen if we needed to sleep waiting
* for memory in a previous run through this loop. The new
* page might even have been paged out, since it is not wired.
*/
if (anon->an_ref == 1) {
KASSERT(anon->an_page != NULL || anon->an_swslot != 0);
continue;
}
/*
* The old page must be resident since the parent is wired.
*/
pg = anon->an_page;
KASSERT(pg != NULL);
KASSERT(pg->wire_count > 0);
/*
* If the page is loaned then it must already be mapped
* read-only and we don't need to copy it.
*/
if (pg->loan_count != 0) {
continue;
}
KASSERT(pg->uanon == anon);
KASSERT(pg->uobject == NULL);
/*
* If the page is busy, then we have to unlock, wait for
* it and then restart.
*/
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, amap->am_lock, "cownow");
goto ReStart;
}
/*
* Perform a copy-on-write.
* First - get a new anon and a page.
*/
nanon = uvm_analloc();
if (nanon) {
nanon->an_lock = amap->am_lock;
npg = uvm_pagealloc(NULL, 0, nanon, 0);
} else {
npg = NULL;
}
if (nanon == NULL || npg == NULL) {
amap_unlock(amap);
if (nanon) {
nanon->an_lock = NULL;
nanon->an_ref--;
KASSERT(nanon->an_ref == 0);
uvm_anfree(nanon);
}
uvm_wait("cownowpage");
goto ReStart;
}
/*
* Copy the data and replace anon with the new one.
* Also, set up its lock (shared with the amap's lock).
*/
uvm_pagecopy(pg, npg);
anon->an_ref--;
KASSERT(anon->an_ref > 0);
amap->am_anon[slot] = nanon;
/*
* Drop PG_BUSY on new page. Since its owner was write
* locked all this time - it cannot be PG_RELEASED or
* waited on.
*/
uvm_pagelock(npg);
uvm_pageactivate(npg);
uvm_pageunlock(npg);
npg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(npg, NULL);
}
amap_unlock(amap);
}
/*
* amap_splitref: split a single reference into two separate references
*
* => called from uvm_map's clip routines
* => origref's map should be locked
* => origref->ar_amap should be unlocked (we will lock)
*/
void
amap_splitref(struct vm_aref *origref, struct vm_aref *splitref, vaddr_t offset)
{
struct vm_amap *amap = origref->ar_amap;
u_int leftslots;
KASSERT(splitref->ar_amap == origref->ar_amap);
AMAP_B2SLOT(leftslots, offset);
KASSERT(leftslots != 0);
amap_lock(amap, RW_WRITER);
KASSERT(amap->am_nslot - origref->ar_pageoff - leftslots > 0);
#ifdef UVM_AMAP_PPREF
/* Establish ppref before we add a duplicate reference to the amap. */
if (amap->am_ppref == NULL) {
amap_pp_establish(amap, origref->ar_pageoff);
}
#endif
/* Note: not a share reference. */
amap->am_ref++;
splitref->ar_pageoff = origref->ar_pageoff + leftslots;
amap_unlock(amap);
}
#ifdef UVM_AMAP_PPREF
/*
* amap_pp_establish: add a ppref array to an amap, if possible.
*
* => amap should be locked by caller.
*/
void
amap_pp_establish(struct vm_amap *amap, vaddr_t offset)
{
const size_t sz = amap->am_maxslot * sizeof(*amap->am_ppref);
KASSERT(rw_write_held(amap->am_lock));
amap->am_ppref = kmem_zalloc(sz, KM_NOSLEEP);
if (amap->am_ppref == NULL) {
/* Failure - just do not use ppref. */
amap->am_ppref = PPREF_NONE;
return;
}
pp_setreflen(amap->am_ppref, 0, 0, offset);
pp_setreflen(amap->am_ppref, offset, amap->am_ref,
amap->am_nslot - offset);
}
/*
* amap_pp_adjref: adjust reference count to a part of an amap using the
* per-page reference count array.
*
* => caller must check that ppref != PPREF_NONE before calling.
* => map and amap must be locked.
*/
void
amap_pp_adjref(struct vm_amap *amap, int curslot, vsize_t slotlen, int adjval)
{
int stopslot, *ppref, lcv, prevlcv;
int ref, len, prevref, prevlen;
KASSERT(rw_write_held(amap->am_lock));
stopslot = curslot + slotlen;
ppref = amap->am_ppref;
prevlcv = 0;
/*
* Advance to the correct place in the array, fragment if needed.
*/
for (lcv = 0 ; lcv < curslot ; lcv += len) {
pp_getreflen(ppref, lcv, &ref, &len);
if (lcv + len > curslot) { /* goes past start? */
pp_setreflen(ppref, lcv, ref, curslot - lcv);
pp_setreflen(ppref, curslot, ref, len - (curslot - lcv));
len = curslot - lcv; /* new length of entry @ lcv */
}
prevlcv = lcv;
}
if (lcv == 0) {
/*
* Ensure that the "prevref == ref" test below always
* fails, since we are starting from the beginning of
* the ppref array; that is, there is no previous chunk.
*/
prevref = -1;
prevlen = 0;
} else {
pp_getreflen(ppref, prevlcv, &prevref, &prevlen);
}
/*
* Now adjust reference counts in range. Merge the first
* changed entry with the last unchanged entry if possible.
*/
KASSERT(lcv == curslot);
for (/* lcv already set */; lcv < stopslot ; lcv += len) {
pp_getreflen(ppref, lcv, &ref, &len);
if (lcv + len > stopslot) { /* goes past end? */
pp_setreflen(ppref, lcv, ref, stopslot - lcv);
pp_setreflen(ppref, stopslot, ref,
len - (stopslot - lcv));
len = stopslot - lcv;
}
ref += adjval;
KASSERT(ref >= 0);
KASSERT(ref <= amap->am_ref);
if (lcv == prevlcv + prevlen && ref == prevref) {
pp_setreflen(ppref, prevlcv, ref, prevlen + len);
} else {
pp_setreflen(ppref, lcv, ref, len);
}
if (ref == 0) {
amap_wiperange(amap, lcv, len);
}
}
}
/*
* amap_wiperange: wipe out a range of an amap.
* Note: different from amap_wipeout because the amap is kept intact.
*
* => Both map and amap must be locked by caller.
*/
void
amap_wiperange(struct vm_amap *amap, int slotoff, int slots)
{
u_int lcv, stop, slotend;
bool byanon;
KASSERT(rw_write_held(amap->am_lock));
/*
* We can either traverse the amap by am_anon or by am_slots.
* Determine which way is less expensive.
*/
if (slots < amap->am_nused) {
byanon = true;
lcv = slotoff;
stop = slotoff + slots;
slotend = 0;
} else {
byanon = false;
lcv = 0;
stop = amap->am_nused;
slotend = slotoff + slots;
}
while (lcv < stop) {
struct vm_anon *anon;
u_int curslot, ptr, last;
if (byanon) {
curslot = lcv++; /* lcv advances here */
if (amap->am_anon[curslot] == NULL)
continue;
} else {
curslot = amap->am_slots[lcv];
if (curslot < slotoff || curslot >= slotend) {
lcv++; /* lcv advances here */
continue;
}
stop--; /* drop stop, since anon will be removed */
}
anon = amap->am_anon[curslot];
KASSERT(anon->an_lock == amap->am_lock);
/*
* Remove anon from the amap.
*/
amap->am_anon[curslot] = NULL;
ptr = amap->am_bckptr[curslot];
last = amap->am_nused - 1;
if (ptr != last) {
amap->am_slots[ptr] = amap->am_slots[last];
amap->am_bckptr[amap->am_slots[ptr]] = ptr;
}
amap->am_nused--;
/*
* Drop its reference count.
*/
KASSERT(anon->an_lock == amap->am_lock);
if (--anon->an_ref == 0) {
uvm_anfree(anon);
}
}
}
#endif
#if defined(VMSWAP)
/*
* amap_swap_off: pagein anonymous pages in amaps and drop swap slots.
*
* => called with swap_syscall_lock held.
* => note that we don't always traverse all anons.
* eg. amaps being wiped out, released anons.
* => return true if failed.
*/
bool
amap_swap_off(int startslot, int endslot)
{
struct vm_amap *am;
struct vm_amap *am_next;
struct vm_amap marker_prev;
struct vm_amap marker_next;
bool rv = false;
#if defined(DIAGNOSTIC)
memset(&marker_prev, 0, sizeof(marker_prev));
memset(&marker_next, 0, sizeof(marker_next));
#endif /* defined(DIAGNOSTIC) */
mutex_enter(&amap_list_lock);
for (am = LIST_FIRST(&amap_list); am != NULL && !rv; am = am_next) {
int i;
LIST_INSERT_BEFORE(am, &marker_prev, am_list);
LIST_INSERT_AFTER(am, &marker_next, am_list);
/* amap_list_lock prevents the lock pointer from changing. */
if (!amap_lock_try(am, RW_WRITER)) {
(void)kpause("amapswpo", false, 1, &amap_list_lock);
am_next = LIST_NEXT(&marker_prev, am_list);
if (am_next == &marker_next) {
am_next = LIST_NEXT(am_next, am_list);
} else {
KASSERT(LIST_NEXT(am_next, am_list) ==
&marker_next);
}
LIST_REMOVE(&marker_prev, am_list);
LIST_REMOVE(&marker_next, am_list);
continue;
}
mutex_exit(&amap_list_lock);
/* If am_nused == 0, the amap could be free - careful. */
for (i = 0; i < am->am_nused; i++) {
int slot;
int swslot;
struct vm_anon *anon;
slot = am->am_slots[i];
anon = am->am_anon[slot];
KASSERT(anon->an_lock == am->am_lock);
swslot = anon->an_swslot;
if (swslot < startslot || endslot <= swslot) {
continue;
}
am->am_flags |= AMAP_SWAPOFF;
rv = uvm_anon_pagein(am, anon);
amap_lock(am, RW_WRITER);
am->am_flags &= ~AMAP_SWAPOFF;
if (amap_refs(am) == 0) {
amap_wipeout(am);
am = NULL;
break;
}
if (rv) {
break;
}
i = 0;
}
if (am) {
amap_unlock(am);
}
mutex_enter(&amap_list_lock);
KASSERT(LIST_NEXT(&marker_prev, am_list) == &marker_next ||
LIST_NEXT(LIST_NEXT(&marker_prev, am_list), am_list) ==
&marker_next);
am_next = LIST_NEXT(&marker_next, am_list);
LIST_REMOVE(&marker_prev, am_list);
LIST_REMOVE(&marker_next, am_list);
}
mutex_exit(&amap_list_lock);
return rv;
}
#endif /* defined(VMSWAP) */
/*
* amap_lookup: look up a page in an amap.
*
* => amap should be locked by caller.
*/
struct vm_anon *
amap_lookup(struct vm_aref *aref, vaddr_t offset)
{
struct vm_amap *amap = aref->ar_amap;
struct vm_anon *an;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_lock_held(amap->am_lock));
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
an = amap->am_anon[slot];
UVMHIST_LOG(maphist,
"<- done (amap=%#jx, offset=%#jx, result=%#jx)",
(uintptr_t)amap, offset, (uintptr_t)an, 0);
KASSERT(slot < amap->am_nslot);
KASSERT(an == NULL || an->an_ref != 0);
KASSERT(an == NULL || an->an_lock == amap->am_lock);
return an;
}
/*
* amap_lookups: look up a range of pages in an amap.
*
* => amap should be locked by caller.
*/
void
amap_lookups(struct vm_aref *aref, vaddr_t offset, struct vm_anon **anons,
int npages)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_lock_held(amap->am_lock));
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
UVMHIST_LOG(maphist, " slot=%u, npages=%d, nslot=%d",
slot, npages, amap->am_nslot, 0);
KASSERT((slot + (npages - 1)) < amap->am_nslot);
memcpy(anons, &amap->am_anon[slot], npages * sizeof(struct vm_anon *));
#if defined(DIAGNOSTIC)
for (int i = 0; i < npages; i++) {
struct vm_anon * const an = anons[i];
if (an == NULL) {
continue;
}
KASSERT(an->an_ref != 0);
KASSERT(an->an_lock == amap->am_lock);
}
#endif
UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0);
}
/*
* amap_add: add (or replace) a page to an amap.
*
* => amap should be locked by caller.
* => anon must have the lock associated with this amap.
*/
void
amap_add(struct vm_aref *aref, vaddr_t offset, struct vm_anon *anon,
bool replace)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_write_held(amap->am_lock));
KASSERT(anon->an_lock == amap->am_lock);
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
if (replace) {
struct vm_anon *oanon = amap->am_anon[slot];
KASSERT(oanon != NULL);
if (oanon->an_page && (amap->am_flags & AMAP_SHARED) != 0) {
pmap_page_protect(oanon->an_page, VM_PROT_NONE);
/*
* XXX: suppose page is supposed to be wired somewhere?
*/
}
} else {
KASSERT(amap->am_anon[slot] == NULL);
KASSERT(amap->am_nused < amap->am_maxslot);
amap->am_bckptr[slot] = amap->am_nused;
amap->am_slots[amap->am_nused] = slot;
amap->am_nused++;
}
amap->am_anon[slot] = anon;
UVMHIST_LOG(maphist,
"<- done (amap=%#jx, offset=%#x, anon=%#jx, rep=%d)",
(uintptr_t)amap, offset, (uintptr_t)anon, replace);
}
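/*
 * Illustrative usage sketch: installing an anon into an amap. The amap's
 * rwlock must be write-held and the anon must already share that lock;
 * the helper name below is hypothetical.
 */
#if 0
static void
example_amap_add(struct vm_aref *aref, vaddr_t off, struct vm_anon *anon)
{
struct vm_amap *amap = aref->ar_amap;
amap_lock(amap, RW_WRITER);
KASSERT(anon->an_lock == amap->am_lock);
amap_add(aref, off, anon, false); /* false: add, do not replace */
amap_unlock(amap);
}
#endif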
/*
* amap_unadd: remove a page from an amap.
*
* => amap should be locked by caller.
*/
void
amap_unadd(struct vm_aref *aref, vaddr_t offset)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot, ptr, last;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_write_held(amap->am_lock));
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
KASSERT(amap->am_anon[slot] != NULL);
KASSERT(amap->am_anon[slot]->an_lock == amap->am_lock);
amap->am_anon[slot] = NULL;
ptr = amap->am_bckptr[slot];
last = amap->am_nused - 1;
if (ptr != last) {
/* Move the last entry to keep the slots contiguous. */
amap->am_slots[ptr] = amap->am_slots[last];
amap->am_bckptr[amap->am_slots[ptr]] = ptr;
}
amap->am_nused--;
UVMHIST_LOG(maphist, "<- done (amap=%#jx, slot=%#jx)",
(uintptr_t)amap, slot,0, 0);
}
/*
* amap_adjref_anons: adjust the reference count(s) on amap and its anons.
*/
static void
amap_adjref_anons(struct vm_amap *amap, vaddr_t offset, vsize_t len,
int refv, bool all)
{
#ifdef UVM_AMAP_PPREF
KASSERT(rw_write_held(amap->am_lock));
/*
* We must establish the ppref array before changing am_ref
* so that the ppref values match the current amap refcount.
*/
if (amap->am_ppref == NULL) {
amap_pp_establish(amap, offset);
}
#endif
amap->am_ref += refv;
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
amap_pp_adjref(amap, offset, len, refv);
}
#endif
amap_unlock(amap);
}
/*
* amap_ref: gain a reference to an amap.
*
* => amap must not be locked (we will lock).
* => "offset" and "len" are in units of pages.
* => Called at fork time to gain the child's reference.
*/
void
amap_ref(struct vm_amap *amap, vaddr_t offset, vsize_t len, int flags)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
amap_lock(amap, RW_WRITER);
if (flags & AMAP_SHARED) {
amap->am_flags |= AMAP_SHARED;
}
amap_adjref_anons(amap, offset, len, 1, (flags & AMAP_REFALL) != 0);
UVMHIST_LOG(maphist,"<- done! amap=%#jx", (uintptr_t)amap, 0, 0, 0);
}
/*
* amap_unref: remove a reference to an amap.
*
* => All pmap-level references to this amap must be already removed.
* => Called from uvm_unmap_detach(); entry is already removed from the map.
* => We will lock amap, so it must be unlocked.
*/
void
amap_unref(struct vm_amap *amap, vaddr_t offset, vsize_t len, bool all)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
amap_lock(amap, RW_WRITER);
UVMHIST_LOG(maphist," amap=%#jx refs=%d, nused=%d",
(uintptr_t)amap, amap->am_ref, amap->am_nused, 0);
KASSERT(amap->am_ref > 0);
if (amap->am_ref == 1) {
/*
* If the last reference - wipeout and destroy the amap.
*/
amap->am_ref--;
amap_wipeout(amap);
UVMHIST_LOG(maphist,"<- done (was last ref)!", 0, 0, 0, 0);
return;
}
/*
* Otherwise, drop the reference count(s) on anons.
*/
if (amap->am_ref == 2 && (amap->am_flags & AMAP_SHARED) != 0) {
amap->am_flags &= ~AMAP_SHARED;
}
amap_adjref_anons(amap, offset, len, -1, all);
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
}
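/*
 * Illustrative sketch: gaining and later dropping a reference on an amap
 * over the same page range. The offset/len values are placeholders; the
 * "all" argument of amap_unref() mirrors the AMAP_REFALL behaviour of
 * amap_ref().
 */
#if 0
static void
example_amap_ref_cycle(struct vm_amap *amap, vaddr_t off, vsize_t len)
{
amap_ref(amap, off, len, AMAP_SHARED | AMAP_REFALL);
/* ... the new mapping is in use ... */
amap_unref(amap, off, len, true);
}
#endif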
/* $NetBSD: subr_xcall.c,v 1.38 2024/03/01 04:32:38 mrg Exp $ */
/*-
* Copyright (c) 2007-2010, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran and Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Cross call support
*
* Background
*
* Sometimes it is necessary to modify hardware state that is tied
* directly to individual CPUs (such as a CPU's local timer), and
* these updates can not be done remotely by another CPU. The LWP
* requesting the update may be unable to guarantee that it will be
* running on the CPU where the update must occur, when the update
* occurs.
*
* Additionally, it's sometimes necessary to modify per-CPU software
* state from a remote CPU. Where these update operations are so
* rare or the access to the per-CPU data so frequent that the cost
* of using locking or atomic operations to provide coherency is
* prohibitive, another way must be found.
*
* Cross calls help to solve these types of problem by allowing
* any LWP in the system to request that an arbitrary function be
* executed on a specific CPU.
*
* Implementation
*
* A slow mechanism for making low priority cross calls is
* provided. The function to be executed runs on the remote CPU
* within a bound kthread. No queueing is provided, and the
* implementation uses global state. The function being called may
* block briefly on locks, but in doing so must be careful to not
* interfere with other cross calls in the system. The function is
* called with thread context and not from a soft interrupt, so it
* can ensure that it is not interrupting other code running on the
* CPU, and so has exclusive access to the CPU. Since this facility
* is heavyweight, it's expected that it will not be used often.
*
* Cross calls must not allocate memory, as the pagedaemon uses cross
* calls (and memory allocation may need to wait on the pagedaemon).
*
* A low-overhead mechanism for high priority calls (XC_HIGHPRI) is
* also provided. The function to be executed runs in software
* interrupt context at IPL_SOFTSERIAL level, and is expected to
* be very lightweight, e.g. avoid blocking.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_xcall.c,v 1.38 2024/03/01 04:32:38 mrg Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/xcall.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/evcnt.h>
#include <sys/kthread.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#ifdef _RUMPKERNEL
#include "rump_private.h"
#endif
/* Cross-call state box. */
typedef struct {
kmutex_t xc_lock;
kcondvar_t xc_busy;
xcfunc_t xc_func;
void * xc_arg1;
void * xc_arg2;
uint64_t xc_headp;
uint64_t xc_donep;
unsigned int xc_ipl;
} xc_state_t;
/* Bit indicating high (1) or low (0) priority. */
#define XC_PRI_BIT (1ULL << 63)
/* Low priority xcall structures. */
static xc_state_t xc_low_pri __cacheline_aligned;
/* High priority xcall structures. */
static xc_state_t xc_high_pri __cacheline_aligned;
static void * xc_sihs[4] __cacheline_aligned;
/* Event counters. */
static struct evcnt xc_unicast_ev __cacheline_aligned;
static struct evcnt xc_broadcast_ev __cacheline_aligned;
static void xc_init(void);
static void xc_thread(void *);
static inline uint64_t xc_highpri(xcfunc_t, void *, void *, struct cpu_info *,
unsigned int);
static inline uint64_t xc_lowpri(xcfunc_t, void *, void *, struct cpu_info *);
/* The internal form of IPL */
#define XC_IPL_MASK 0xff00
/*
* Assign 0 to XC_IPL_SOFTSERIAL to treat IPL_SOFTSERIAL as the default value
* (just XC_HIGHPRI).
*/
#define XC_IPL_SOFTSERIAL 0
#define XC_IPL_SOFTNET 1
#define XC_IPL_SOFTBIO 2
#define XC_IPL_SOFTCLOCK 3
#define XC_IPL_MAX XC_IPL_SOFTCLOCK
CTASSERT(XC_IPL_MAX <= __arraycount(xc_sihs));
/*
* xc_init:
*
* Initialize low and high priority cross-call structures.
*/
static void
xc_init(void)
{
xc_state_t *xclo = &xc_low_pri, *xchi = &xc_high_pri;
memset(xclo, 0, sizeof(xc_state_t));
mutex_init(&xclo->xc_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&xclo->xc_busy, "xclow");
memset(xchi, 0, sizeof(xc_state_t));
mutex_init(&xchi->xc_lock, MUTEX_DEFAULT, IPL_SOFTSERIAL);
cv_init(&xchi->xc_busy, "xchigh");
/* Set up a softint for each IPL_SOFT*. */
#define SETUP_SOFTINT(xipl, sipl) do { \
xc_sihs[(xipl)] = softint_establish( (sipl) | SOFTINT_MPSAFE,\
xc__highpri_intr, NULL); \
KASSERT(xc_sihs[(xipl)] != NULL); \
} while (0)
SETUP_SOFTINT(XC_IPL_SOFTSERIAL, SOFTINT_SERIAL);
/*
* If an IPL_SOFTXXX has the same value as the previous one, we don't use
* the IPL (see xc_encode_ipl), so we don't need to allocate a softint
* for it.
*/
#if IPL_SOFTNET != IPL_SOFTSERIAL
SETUP_SOFTINT(XC_IPL_SOFTNET, SOFTINT_NET);
#endif
#if IPL_SOFTBIO != IPL_SOFTNET
SETUP_SOFTINT(XC_IPL_SOFTBIO, SOFTINT_BIO);
#endif
#if IPL_SOFTCLOCK != IPL_SOFTBIO
SETUP_SOFTINT(XC_IPL_SOFTCLOCK, SOFTINT_CLOCK);
#endif
#undef SETUP_SOFTINT
evcnt_attach_dynamic(&xc_unicast_ev, EVCNT_TYPE_MISC, NULL,
"crosscall", "unicast");
evcnt_attach_dynamic(&xc_broadcast_ev, EVCNT_TYPE_MISC, NULL,
"crosscall", "broadcast");
}
/*
* Encode an IPL to a form that can be embedded into flags of xc_broadcast
* or xc_unicast.
*/
unsigned int
xc_encode_ipl(int ipl)
{
switch (ipl) {
case IPL_SOFTSERIAL:
return __SHIFTIN(XC_IPL_SOFTSERIAL, XC_IPL_MASK);
/* IPL_SOFT* can be the same value (e.g., on sparc or mips). */
#if IPL_SOFTNET != IPL_SOFTSERIAL
case IPL_SOFTNET:
return __SHIFTIN(XC_IPL_SOFTNET, XC_IPL_MASK);
#endif
#if IPL_SOFTBIO != IPL_SOFTNET
case IPL_SOFTBIO:
return __SHIFTIN(XC_IPL_SOFTBIO, XC_IPL_MASK);
#endif
#if IPL_SOFTCLOCK != IPL_SOFTBIO
case IPL_SOFTCLOCK:
return __SHIFTIN(XC_IPL_SOFTCLOCK, XC_IPL_MASK);
#endif
}
panic("Invalid IPL: %d", ipl);
}
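/*
 * Illustrative example: a high priority cross call that wants its softint
 * to run at IPL_SOFTNET would embed the encoded IPL in the flags word,
 * e.g. xc_broadcast(XC_HIGHPRI | xc_encode_ipl(IPL_SOFTNET), func, a1, a2).
 * xc_extract_ipl() below recovers the XC_IPL_* value from those flags.
 */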
/*
* Extract an XC_IPL from flags of xc_broadcast or xc_unicast.
*/
static inline unsigned int
xc_extract_ipl(unsigned int flags)
{
return __SHIFTOUT(flags, XC_IPL_MASK);
}
/*
* xc_init_cpu:
*
* Initialize the cross-call subsystem. Called once for each CPU
* in the system as they are attached.
*/
void
xc_init_cpu(struct cpu_info *ci)
{
static bool again = false;
int error __diagused;
if (!again) {
/* Autoconfiguration will prevent re-entry. */
xc_init();
again = true;
}
cv_init(&ci->ci_data.cpu_xcall, "xcall");
error = kthread_create(PRI_XCALL, KTHREAD_MPSAFE, ci, xc_thread,
NULL, NULL, "xcall/%u", ci->ci_index);
KASSERT(error == 0);
}
/*
* xc_broadcast:
*
* Trigger a call on all CPUs in the system.
*/
uint64_t
xc_broadcast(unsigned int flags, xcfunc_t func, void *arg1, void *arg2)
{
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
int s, bound;
if (flags & XC_HIGHPRI)
s = splsoftserial();
else
bound = curlwp_bind();
(*func)(arg1, arg2);
if (flags & XC_HIGHPRI)
splx(s);
else
curlwp_bindx(bound);
return 0;
}
if ((flags & XC_HIGHPRI) != 0) {
int ipl = xc_extract_ipl(flags);
return xc_highpri(func, arg1, arg2, NULL, ipl);
} else {
return xc_lowpri(func, arg1, arg2, NULL);
}
}
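/*
 * Illustrative usage sketch: run a function once on every CPU and wait for
 * completion. The example_* names are hypothetical.
 */
#if 0
static void
example_func(void *arg1, void *arg2)
{
/* Runs on each CPU; for a low priority call, in thread context. */
}

static void
example_run_everywhere(void)
{
uint64_t ticket;
ticket = xc_broadcast(0, example_func, NULL, NULL);
xc_wait(ticket); /* block until every CPU has run example_func */
}
#endif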
static void
xc_nop(void *arg1, void *arg2)
{
return;
}
/*
* xc_barrier:
*
* Broadcast a nop to all CPUs in the system.
*/
void
xc_barrier(unsigned int flags)
{
uint64_t where;
where = xc_broadcast(flags, xc_nop, NULL, NULL);
xc_wait(where);
}
/*
* xc_unicast:
*
* Trigger a call on one CPU.
*/
uint64_t
xc_unicast(unsigned int flags, xcfunc_t func, void *arg1, void *arg2,
struct cpu_info *ci)
{
KASSERT(ci != NULL);
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
int s, bound;
KASSERT(ci == curcpu());
if (flags & XC_HIGHPRI)
s = splsoftserial();
else
bound = curlwp_bind();
(*func)(arg1, arg2);
if (flags & XC_HIGHPRI)
splx(s);
else
curlwp_bindx(bound);
return 0;
}
if ((flags & XC_HIGHPRI) != 0) {
int ipl = xc_extract_ipl(flags);
return xc_highpri(func, arg1, arg2, ci, ipl);
} else {
return xc_lowpri(func, arg1, arg2, ci);
}
}
/*
* xc_wait:
*
* Wait for a cross call to complete.
*/
void
xc_wait(uint64_t where)
{
xc_state_t *xc;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
return;
}
/* Determine whether it is high or low priority cross-call. */
if ((where & XC_PRI_BIT) != 0) {
xc = &xc_high_pri;
where &= ~XC_PRI_BIT;
} else {
xc = &xc_low_pri;
}
#ifdef __HAVE_ATOMIC64_LOADSTORE
/* Fast path, if already done. */
if (atomic_load_acquire(&xc->xc_donep) >= where) {
return;
}
#endif
/* Slow path: block until awoken. */
mutex_enter(&xc->xc_lock);
while (xc->xc_donep < where) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
mutex_exit(&xc->xc_lock);
}
/*
* xc_lowpri:
*
* Trigger a low priority call on one or more CPUs.
*/
static inline uint64_t
xc_lowpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci)
{
xc_state_t *xc = &xc_low_pri;
CPU_INFO_ITERATOR cii;
uint64_t where;
mutex_enter(&xc->xc_lock);
while (xc->xc_headp != xc->xc_donep) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
xc->xc_arg1 = arg1;
xc->xc_arg2 = arg2;
xc->xc_func = func;
if (ci == NULL) {
xc_broadcast_ev.ev_count++;
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0)
continue;
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
} else {
xc_unicast_ev.ev_count++;
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
KASSERT(xc->xc_donep < xc->xc_headp);
where = xc->xc_headp;
mutex_exit(&xc->xc_lock);
/* Return a low priority ticket. */
KASSERT((where & XC_PRI_BIT) == 0);
return where;
}
/*
* xc_thread:
*
* One thread per-CPU to dispatch low priority calls.
*/
static void
xc_thread(void *cookie)
{
struct cpu_info *ci = curcpu();
xc_state_t *xc = &xc_low_pri;
void *arg1, *arg2;
xcfunc_t func;
struct lwp *l = curlwp;
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d",
l, l->l_nopreempt);
mutex_enter(&xc->xc_lock);
for (;;) {
while (!ci->ci_data.cpu_xcall_pending) {
if (xc->xc_headp == xc->xc_donep) {
cv_broadcast(&xc->xc_busy);
}
cv_wait(&ci->ci_data.cpu_xcall, &xc->xc_lock);
KASSERT(ci == curcpu());
}
ci->ci_data.cpu_xcall_pending = false;
func = xc->xc_func;
arg1 = xc->xc_arg1;
arg2 = xc->xc_arg2;
mutex_exit(&xc->xc_lock);
KASSERT(func != NULL);
(*func)(arg1, arg2);
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d func %p",
l, l->l_nopreempt, func);
mutex_enter(&xc->xc_lock);
#ifdef __HAVE_ATOMIC64_LOADSTORE
atomic_store_release(&xc->xc_donep, xc->xc_donep + 1);
#else
xc->xc_donep++;
#endif
}
/* NOTREACHED */
}
/*
* xc_ipi_handler:
*
* Handler of cross-call IPI.
*/
void
xc_ipi_handler(void)
{
xc_state_t *xc = &xc_high_pri;
KASSERT(xc->xc_ipl < __arraycount(xc_sihs));
KASSERT(xc_sihs[xc->xc_ipl] != NULL);
/* Executes xc__highpri_intr() via software interrupt. */
softint_schedule(xc_sihs[xc->xc_ipl]);
}
/*
* xc__highpri_intr:
*
* A software interrupt handler for high priority calls.
*/
void
xc__highpri_intr(void *dummy)
{
xc_state_t *xc = &xc_high_pri;
void *arg1, *arg2;
xcfunc_t func;
KASSERTMSG(!cpu_intr_p(), "high priority xcall for function %p",
xc->xc_func);
/*
* Lock-less fetch of function and its arguments.
* Safe since it cannot change at this point.
*/
func = xc->xc_func;
arg1 = xc->xc_arg1;
arg2 = xc->xc_arg2;
KASSERT(func != NULL);
(*func)(arg1, arg2);
/*
* Note the request as done, and if we have reached the head,
* cross-call has been processed - notify waiters, if any.
*/
mutex_enter(&xc->xc_lock);
KASSERT(xc->xc_donep < xc->xc_headp);
#ifdef __HAVE_ATOMIC64_LOADSTORE
atomic_store_release(&xc->xc_donep, xc->xc_donep + 1);
#else
xc->xc_donep++;
#endif
if (xc->xc_donep == xc->xc_headp) {
cv_broadcast(&xc->xc_busy);
}
mutex_exit(&xc->xc_lock);
}
/*
* xc_highpri:
*
* Trigger a high priority call on one or more CPUs.
*/
static inline uint64_t
xc_highpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci,
unsigned int ipl)
{
xc_state_t *xc = &xc_high_pri;
uint64_t where;
mutex_enter(&xc->xc_lock);
while (xc->xc_headp != xc->xc_donep) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
xc->xc_func = func;
xc->xc_arg1 = arg1;
xc->xc_arg2 = arg2;
xc->xc_headp += (ci ? 1 : ncpu);
xc->xc_ipl = ipl;
where = xc->xc_headp;
mutex_exit(&xc->xc_lock);
/*
* Send the IPI once lock is released.
* Note: it will handle the local CPU case.
*/
#ifdef _RUMPKERNEL
rump_xc_highpri(ci);
#else
#ifdef MULTIPROCESSOR
kpreempt_disable();
if (curcpu() == ci) {
/* Unicast: local CPU. */
xc_ipi_handler();
} else if (ci) {
/* Unicast: remote CPU. */
xc_send_ipi(ci);
} else {
/* Broadcast: all, including local. */
xc_send_ipi(NULL);
xc_ipi_handler();
}
kpreempt_enable();
#else
KASSERT(ci == NULL || curcpu() == ci);
xc_ipi_handler();
#endif
#endif
/* Indicate a high priority ticket. */
return (where | XC_PRI_BIT);
}
/* $NetBSD: vfs_hooks.c,v 1.6 2009/03/15 17:14:40 cegger Exp $ */
/*-
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* VFS hooks.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_hooks.c,v 1.6 2009/03/15 17:14:40 cegger Exp $");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/mutex.h>
LIST_HEAD(vfs_hooks_head, vfs_hooks) vfs_hooks_head =
LIST_HEAD_INITIALIZER(vfs_hooks_head);
kmutex_t vfs_hooks_lock;
void
vfs_hooks_init(void)
{
mutex_init(&vfs_hooks_lock, MUTEX_DEFAULT, IPL_NONE);
}
int
vfs_hooks_attach(struct vfs_hooks *vfs_hooks)
{
mutex_enter(&vfs_hooks_lock);
LIST_INSERT_HEAD(&vfs_hooks_head, vfs_hooks, vfs_hooks_list);
mutex_exit(&vfs_hooks_lock);
return (0);
}
int
vfs_hooks_detach(struct vfs_hooks *vfs_hooks)
{
struct vfs_hooks *hp;
int ret = 0;
mutex_enter(&vfs_hooks_lock);
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) {
if (hp == vfs_hooks) {
LIST_REMOVE(hp, vfs_hooks_list);
break;
}
}
if (hp == NULL)
ret = ESRCH;
mutex_exit(&vfs_hooks_lock);
return (ret);
}
/*
* Macro to be used in one of the vfs_hooks_* function for hooks that
* return an error code. Calls will stop as soon as one of the hooks
* fails.
*/
#define VFS_HOOKS_W_ERROR(func, fargs, hook, hargs) \
int \
func fargs \
{ \
int error; \
struct vfs_hooks *hp; \
\
error = EJUSTRETURN; \
\
mutex_enter(&vfs_hooks_lock); \
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { \
if (hp-> hook != NULL) { \
error = hp-> hook hargs; \
if (error != 0) \
break; \
} \
} \
mutex_exit(&vfs_hooks_lock); \
\
return error; \
}
/*
* Macro to be used in one of the vfs_hooks_* function for hooks that
* do not return any error code. All hooks will be executed
* unconditionally.
*/
#define VFS_HOOKS_WO_ERROR(func, fargs, hook, hargs) \
void \
func fargs \
{ \
struct vfs_hooks *hp; \
\
mutex_enter(&vfs_hooks_lock); \
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { \
if (hp-> hook != NULL) \
hp-> hook hargs; \
} \
mutex_exit(&vfs_hooks_lock); \
}
/*
* Routines to iterate over VFS hooks lists and execute them.
*/
VFS_HOOKS_WO_ERROR(vfs_hooks_unmount, (struct mount *mp), vh_unmount, (mp));
VFS_HOOKS_W_ERROR(vfs_hooks_reexport, (struct mount *mp, const char *path, void *data), vh_reexport, (mp, path, data));
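/*
 * Illustrative usage sketch: a file system registering a hook set at load
 * time and removing it again later. The example_* names are hypothetical.
 */
#if 0
static void
example_unmount_hook(struct mount *mp)
{
/* Runs for every registered hook set when a mount goes away. */
}

static struct vfs_hooks example_hooks = {
.vh_unmount = example_unmount_hook,
};

static void
example_register(void)
{
vfs_hooks_attach(&example_hooks);
/* ... */
vfs_hooks_detach(&example_hooks);
}
#endif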
/* $NetBSD: mld6.c,v 1.101 2019/09/25 09:53:38 ozaki-r Exp $ */
/* $KAME: mld6.c,v 1.25 2001/01/16 14:14:18 itojun Exp $ */
/*
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
/*
* Copyright (c) 1988 Stephen Deering.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: mld6.c,v 1.101 2019/09/25 09:53:38 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/cprng.h>
#include <sys/rwlock.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/icmp6_private.h>
#include <netinet6/mld6_var.h>
static krwlock_t in6_multilock __cacheline_aligned;
/*
* Protocol constants
*/
/*
* time between repetitions of a node's initial report of interest in a
* multicast address(in seconds)
*/
#define MLD_UNSOLICITED_REPORT_INTERVAL 10
static struct ip6_pktopts ip6_opts;
static void mld_start_listening(struct in6_multi *);
static void mld_stop_listening(struct in6_multi *);
static struct mld_hdr *mld_allocbuf(struct mbuf **, struct in6_multi *, int);
static void mld_sendpkt(struct in6_multi *, int, const struct in6_addr *);
static void mld_starttimer(struct in6_multi *);
static void mld_stoptimer(struct in6_multi *);
static u_long mld_timerresid(struct in6_multi *);
static void in6m_ref(struct in6_multi *);
static void in6m_unref(struct in6_multi *);
static void in6m_destroy(struct in6_multi *);
void
mld_init(void)
{
static u_int8_t hbh_buf[8];
struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf;
u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD);
/* ip6h_nxt will be filled in later */
hbh->ip6h_len = 0; /* (8 >> 3) - 1 */
/* XXX: grotty hard coding... */
hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */
hbh_buf[3] = 0;
hbh_buf[4] = IP6OPT_RTALERT;
hbh_buf[5] = IP6OPT_RTALERT_LEN - 2;
memcpy(&hbh_buf[6], (void *)&rtalert_code, sizeof(u_int16_t));
ip6_opts.ip6po_hbh = hbh;
/* We will specify the hoplimit by a multicast option. */
ip6_opts.ip6po_hlim = -1;
ip6_opts.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
rw_init(&in6_multilock);
}
static void
mld_starttimer(struct in6_multi *in6m)
{
struct timeval now;
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_timer != IN6M_TIMER_UNDEF,
"in6m_timer=%d", in6m->in6m_timer);
microtime(&now);
in6m->in6m_timer_expire.tv_sec = now.tv_sec + in6m->in6m_timer / hz;
in6m->in6m_timer_expire.tv_usec = now.tv_usec +
(in6m->in6m_timer % hz) * (1000000 / hz);
if (in6m->in6m_timer_expire.tv_usec > 1000000) {
in6m->in6m_timer_expire.tv_sec++;
in6m->in6m_timer_expire.tv_usec -= 1000000;
}
/* start or restart the timer */
callout_schedule(&in6m->in6m_timer_ch, in6m->in6m_timer);
}
/*
* mld_stoptimer releases in6_multilock when calling callout_halt.
* The caller must ensure in6m won't be freed while releasing the lock.
*/
static void
mld_stoptimer(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
if (in6m->in6m_timer == IN6M_TIMER_UNDEF)
return;
rw_exit(&in6_multilock);
callout_halt(&in6m->in6m_timer_ch, NULL);
rw_enter(&in6_multilock, RW_WRITER);
in6m->in6m_timer = IN6M_TIMER_UNDEF;
}
static void
mld_timeo(void *arg)
{
struct in6_multi *in6m = arg;
KASSERTMSG(in6m->in6m_refcount > 0, "in6m_refcount=%d",
in6m->in6m_refcount);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
rw_enter(&in6_multilock, RW_WRITER);
if (in6m->in6m_timer == IN6M_TIMER_UNDEF)
goto out;
in6m->in6m_timer = IN6M_TIMER_UNDEF;
switch (in6m->in6m_state) {
case MLD_REPORTPENDING:
mld_start_listening(in6m);
break;
default:
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
break;
}
out:
rw_exit(&in6_multilock);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
static u_long
mld_timerresid(struct in6_multi *in6m)
{
struct timeval now, diff;
microtime(&now);
if (now.tv_sec > in6m->in6m_timer_expire.tv_sec ||
(now.tv_sec == in6m->in6m_timer_expire.tv_sec &&
now.tv_usec > in6m->in6m_timer_expire.tv_usec)) {
return (0);
}
diff = in6m->in6m_timer_expire;
diff.tv_sec -= now.tv_sec;
diff.tv_usec -= now.tv_usec;
if (diff.tv_usec < 0) {
diff.tv_sec--;
diff.tv_usec += 1000000;
}
/* return the remaining time in milliseconds */
return diff.tv_sec * 1000 + diff.tv_usec / 1000;
}
static void
mld_start_listening(struct in6_multi *in6m)
{
struct in6_addr all_in6;
KASSERT(rw_write_held(&in6_multilock));
/*
* RFC2710 page 10:
* The node never sends a Report or Done for the link-scope all-nodes
* address.
* MLD messages are never sent for multicast addresses whose scope is 0
* (reserved) or 1 (node-local).
*/
all_in6 = in6addr_linklocal_allnodes;
if (in6_setscope(&all_in6, in6m->in6m_ifp, NULL)) {
/* XXX: this should not happen! */
in6m->in6m_timer = 0;
in6m->in6m_state = MLD_OTHERLISTENER;
}
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) ||
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) {
in6m->in6m_timer = IN6M_TIMER_UNDEF;
in6m->in6m_state = MLD_OTHERLISTENER;
} else {
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_timer = cprng_fast32() %
(MLD_UNSOLICITED_REPORT_INTERVAL * hz);
in6m->in6m_state = MLD_IREPORTEDLAST;
mld_starttimer(in6m);
}
}
static void
mld_stop_listening(struct in6_multi *in6m)
{
struct in6_addr allnode, allrouter;
KASSERT(rw_lock_held(&in6_multilock));
allnode = in6addr_linklocal_allnodes;
if (in6_setscope(&allnode, in6m->in6m_ifp, NULL)) {
/* XXX: this should not happen! */
return;
}
allrouter = in6addr_linklocal_allrouters;
if (in6_setscope(&allrouter, in6m->in6m_ifp, NULL)) {
/* XXX impossible */
return;
}
if (in6m->in6m_state == MLD_IREPORTEDLAST &&
(!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &allnode)) &&
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) >
IPV6_ADDR_SCOPE_INTFACELOCAL) {
mld_sendpkt(in6m, MLD_LISTENER_DONE, &allrouter);
}
}
void
mld_input(struct mbuf *m, int off)
{
struct ip6_hdr *ip6;
struct mld_hdr *mldh;
struct ifnet *ifp;
struct in6_multi *in6m = NULL;
struct in6_addr mld_addr, all_in6;
u_long timer = 0; /* timer value in the MLD query header */
struct psref psref;
ifp = m_get_rcvif_psref(m, &psref);
if (__predict_false(ifp == NULL))
goto out;
IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh));
if (mldh == NULL) {
ICMP6_STATINC(ICMP6_STAT_TOOSHORT);
goto out_nodrop;
}
ip6 = mtod(m, struct ip6_hdr *);
/* source address validation */
if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) {
/*
* RFC3590 allows the IPv6 unspecified address as the source
* address of MLD report and done messages. However, as this
* same document says, this special rule is for snooping
* switches and the RFC requires routers to discard MLD packets
* with the unspecified source address. The RFC only talks
* about hosts receiving an MLD query or report in Security
* Considerations, but this is probably the correct intention.
* RFC3590 does not talk about other cases than link-local and
* the unspecified source addresses, but we believe the same
* rule should be applied.
* As a result, we only allow link-local addresses as the
* source address; otherwise, simply discard the packet.
*/
#if 0
/*
* XXX: do not log in an input path to avoid log flooding,
* though RFC3590 says "SHOULD log" if the source of a query
* is the unspecified address.
*/
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufm[INET6_ADDRSTRLEN];
log(LOG_INFO,
"mld_input: src %s is not link-local (grp=%s)\n",
IN6_PRINT(ip6bufs,&ip6->ip6_src),
IN6_PRINT(ip6bufm, &mldh->mld_addr));
#endif
goto out;
}
/*
* make a copy for local work (in6_setscope() may modify the 1st arg)
*/
mld_addr = mldh->mld_addr;
if (in6_setscope(&mld_addr, ifp, NULL)) {
/* XXX: this should not happen! */
goto out;
}
/*
* In the MLD specification, there are 3 states and a flag.
*
* In Non-Listener state, we simply don't have a membership record.
* In Delaying Listener state, our timer is running (in6m->in6m_timer)
* In Idle Listener state, our timer is not running
* (in6m->in6m_timer==IN6M_TIMER_UNDEF)
*
* The flag is in6m->in6m_state; it is set to MLD_OTHERLISTENER if
* we have heard a report from another member, or MLD_IREPORTEDLAST
* if we sent the last report.
*/
switch (mldh->mld_type) {
case MLD_LISTENER_QUERY: {
struct in6_multi *next;
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) &&
!IN6_IS_ADDR_MULTICAST(&mld_addr))
break; /* print error or log stat? */
all_in6 = in6addr_linklocal_allnodes;
if (in6_setscope(&all_in6, ifp, NULL)) {
/* XXX: this should not happen! */
break;
}
/*
* - Start the timers in all of our membership records
* that the query applies to for the interface on
* which the query arrived excl. those that belong
* to the "all-nodes" group (ff02::1).
* - Restart any timer that is already running but has
* a value longer than the requested timeout.
* - Use the value specified in the query message as
* the maximum timeout.
*/
timer = ntohs(mldh->mld_maxdelay);
rw_enter(&in6_multilock, RW_WRITER);
/*
* mld_stoptimer and mld_sendpkt release in6_multilock
* temporarily, so we have to prevent in6m from being freed
* while releasing the lock by having an extra reference to it.
*
* Also in6_purge_multi might remove items from the list of the
* ifp while releasing the lock. Fortunately in6_purge_multi is
* never executed as long as we have a psref of the ifp.
*/
LIST_FOREACH_SAFE(in6m, &ifp->if_multiaddrs, in6m_entry, next) {
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) ||
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
IPV6_ADDR_SCOPE_LINKLOCAL)
continue;
if (in6m->in6m_state == MLD_REPORTPENDING)
continue; /* we are not yet ready */
if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) &&
!IN6_ARE_ADDR_EQUAL(&mld_addr, &in6m->in6m_addr))
continue;
if (timer == 0) {
in6m_ref(in6m);
/* send a report immediately */
mld_stoptimer(in6m);
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_state = MLD_IREPORTEDLAST;
in6m_unref(in6m); /* May free in6m */
} else if (in6m->in6m_timer == IN6M_TIMER_UNDEF ||
mld_timerresid(in6m) > timer) {
in6m->in6m_timer =
1 + (cprng_fast32() % timer) * hz / 1000;
mld_starttimer(in6m);
}
}
rw_exit(&in6_multilock);
break;
}
case MLD_LISTENER_REPORT:
/*
* For fast leave to work, we have to know that we are the
* last person to send a report for this group. Reports
* can potentially get looped back if we are a multicast
* router, so discard reports sourced by me.
* Note that it is impossible to check IFF_LOOPBACK flag of
* ifp for this purpose, since ip6_mloopback passes the physical
* interface to looutput.
*/
if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */
break;
if (!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr))
break;
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
rw_enter(&in6_multilock, RW_WRITER);
in6m = in6_lookup_multi(&mld_addr, ifp);
if (in6m) {
in6m_ref(in6m);
mld_stoptimer(in6m); /* transit to idle state */
in6m->in6m_state = MLD_OTHERLISTENER; /* clear flag */
in6m_unref(in6m);
in6m = NULL; /* in6m might be freed */
}
rw_exit(&in6_multilock);
break;
default: /* this is impossible */
#if 0
/*
* this case should be impossible because of filtering in
* icmp6_input(). But we explicitly disabled this part
* just in case.
*/
log(LOG_ERR, "mld_input: illegal type(%d)", mldh->mld_type);
#endif
break;
}
out:
m_freem(m);
out_nodrop:
m_put_rcvif_psref(ifp, &psref);
}
/*
* XXX mld_sendpkt must be called with in6_multilock held and
* will release in6_multilock before calling ip6_output and
* returning to avoid locking against myself in ip6_output.
*/
static void
mld_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst)
{
struct mbuf *mh;
struct mld_hdr *mldh;
struct ip6_hdr *ip6 = NULL;
struct ip6_moptions im6o;
struct in6_ifaddr *ia = NULL;
struct ifnet *ifp = in6m->in6m_ifp;
int ignflags;
struct psref psref;
int bound;
KASSERT(rw_write_held(&in6_multilock));
/*
* At first, find a link local address on the outgoing interface
* to use as the source address of the MLD packet.
* We do not reject tentative addresses for MLD report to deal with
* the case where we first join a link-local address.
*/
ignflags = (IN6_IFF_NOTREADY|IN6_IFF_ANYCAST) & ~IN6_IFF_TENTATIVE;
bound = curlwp_bind();
ia = in6ifa_ifpforlinklocal_psref(ifp, ignflags, &psref);
if (ia == NULL) {
curlwp_bindx(bound);
return;
}
if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) {
ia6_release(ia, &psref);
ia = NULL;
}
/* Allocate two mbufs to store IPv6 header and MLD header */
mldh = mld_allocbuf(&mh, in6m, type);
if (mldh == NULL) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return;
}
/* fill src/dst here */
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
ip6->ip6_dst = dst ? *dst : in6m->in6m_addr;
ia6_release(ia, &psref);
curlwp_bindx(bound);
mldh->mld_addr = in6m->in6m_addr;
in6_clearscope(&mldh->mld_addr); /* XXX */
mldh->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr),
sizeof(struct mld_hdr));
/* construct multicast option */
memset(&im6o, 0, sizeof(im6o));
im6o.im6o_multicast_if_index = if_get_index(ifp);
im6o.im6o_multicast_hlim = 1;
/*
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
im6o.im6o_multicast_loop = (ip6_mrouter != NULL);
/* increment output statistics */
ICMP6_STATINC(ICMP6_STAT_OUTHIST + type);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
switch (type) {
case MLD_LISTENER_QUERY:
icmp6_ifstat_inc(ifp, ifs6_out_mldquery);
break;
case MLD_LISTENER_REPORT:
icmp6_ifstat_inc(ifp, ifs6_out_mldreport);
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(ifp, ifs6_out_mlddone);
break;
}
/* XXX we cannot call ip6_output with holding in6_multilock */
rw_exit(&in6_multilock);
ip6_output(mh, &ip6_opts, NULL, ia ? 0 : IPV6_UNSPECSRC,
&im6o, NULL, NULL);
rw_enter(&in6_multilock, RW_WRITER);
}
static struct mld_hdr *
mld_allocbuf(struct mbuf **mh, struct in6_multi *in6m, int type)
{
struct mbuf *md;
struct mld_hdr *mldh;
struct ip6_hdr *ip6;
/*
* Allocate mbufs to store ip6 header and MLD header.
* We allocate 2 mbufs and make chain in advance because
* it is more convenient when inserting the hop-by-hop option later.
*/
MGETHDR(*mh, M_DONTWAIT, MT_HEADER);
if (*mh == NULL)
return NULL;
MGET(md, M_DONTWAIT, MT_DATA);
if (md == NULL) {
m_free(*mh);
*mh = NULL;
return NULL;
}
(*mh)->m_next = md;
md->m_next = NULL;
m_reset_rcvif((*mh));
(*mh)->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
(*mh)->m_len = sizeof(struct ip6_hdr);
m_align(*mh, sizeof(struct ip6_hdr));
/* fill in the ip6 header */
ip6 = mtod(*mh, struct ip6_hdr *);
memset(ip6, 0, sizeof(*ip6));
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
/* ip6_hlim will be set by im6o.im6o_multicast_hlim */
/* ip6_src/dst will be set by mld_sendpkt() or mld_sendbuf() */
/* fill in the MLD header as much as possible */
md->m_len = sizeof(struct mld_hdr);
mldh = mtod(md, struct mld_hdr *);
memset(mldh, 0, sizeof(struct mld_hdr));
mldh->mld_type = type;
return mldh;
}
static void
in6m_ref(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
in6m->in6m_refcount++;
}
static void
in6m_unref(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
if (--in6m->in6m_refcount == 0)
in6m_destroy(in6m);
}
/*
* Add an address to the list of IP6 multicast addresses for a given interface.
*/
struct in6_multi *
in6_addmulti(struct in6_addr *maddr6, struct ifnet *ifp, int *errorp,
int timer)
{
struct sockaddr_in6 sin6;
struct in6_multi *in6m;
*errorp = 0;
rw_enter(&in6_multilock, RW_WRITER);
/*
* See if address already in list.
*/
in6m = in6_lookup_multi(maddr6, ifp);
if (in6m != NULL) {
/*
* Found it; just increment the reference count.
*/
in6m->in6m_refcount++;
} else {
/*
* New address; allocate a new multicast record
* and link it into the interface's multicast list.
*/
in6m = malloc(sizeof(*in6m), M_IPMADDR, M_NOWAIT|M_ZERO);
if (in6m == NULL) {
*errorp = ENOBUFS;
goto out;
}
in6m->in6m_addr = *maddr6;
in6m->in6m_ifp = ifp;
in6m->in6m_refcount = 1;
in6m->in6m_timer = IN6M_TIMER_UNDEF;
callout_init(&in6m->in6m_timer_ch, CALLOUT_MPSAFE);
callout_setfunc(&in6m->in6m_timer_ch, mld_timeo, in6m);
LIST_INSERT_HEAD(&ifp->if_multiaddrs, in6m, in6m_entry);
/*
* Ask the network driver to update its multicast reception
* filter appropriately for the new address.
*/
sockaddr_in6_init(&sin6, maddr6, 0, 0, 0);
*errorp = if_mcast_op(ifp, SIOCADDMULTI, sin6tosa(&sin6));
if (*errorp) {
callout_destroy(&in6m->in6m_timer_ch);
LIST_REMOVE(in6m, in6m_entry);
free(in6m, M_IPMADDR);
in6m = NULL;
goto out;
}
in6m->in6m_timer = timer;
if (in6m->in6m_timer > 0) {
in6m->in6m_state = MLD_REPORTPENDING;
mld_starttimer(in6m);
goto out;
}
/*
* Let MLD6 know that we have joined a new IP6 multicast
* group.
*/
mld_start_listening(in6m);
}
out:
rw_exit(&in6_multilock);
return in6m;
}
static void
in6m_destroy(struct in6_multi *in6m)
{
struct sockaddr_in6 sin6;
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_refcount == 0, "in6m_refcount=%d",
in6m->in6m_refcount);
/*
* Unlink from list if it's listed. This must be done before
* mld_stop_listening because it releases in6_multilock and that allows
* someone to look up the in6m being removed from the list and add a
* reference to the entry unexpectedly.
*/
if (in6_lookup_multi(&in6m->in6m_addr, in6m->in6m_ifp) != NULL)
LIST_REMOVE(in6m, in6m_entry);
/*
* No remaining claims to this record; let MLD6 know
* that we are leaving the multicast group.
*/
mld_stop_listening(in6m);
/*
* Delete all references of this multicasting group from
* the membership arrays
*/
in6_purge_mcast_references(in6m);
/*
* Notify the network driver to update its multicast
* reception filter.
*/
sockaddr_in6_init(&sin6, &in6m->in6m_addr, 0, 0, 0);
if_mcast_op(in6m->in6m_ifp, SIOCDELMULTI, sin6tosa(&sin6));
/* Tell mld_timeo we're halting the timer */
in6m->in6m_timer = IN6M_TIMER_UNDEF;
rw_exit(&in6_multilock);
callout_halt(&in6m->in6m_timer_ch, NULL);
callout_destroy(&in6m->in6m_timer_ch);
free(in6m, M_IPMADDR);
rw_enter(&in6_multilock, RW_WRITER);
}
/*
* Delete a multicast address record.
*/
void
in6_delmulti_locked(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_refcount > 0, "in6m_refcount=%d",
in6m->in6m_refcount);
/*
* The caller should have a reference to in6m, so we don't need to worry
* about mld_stoptimer releasing the lock.
*/
mld_stoptimer(in6m);
if (--in6m->in6m_refcount == 0)
in6m_destroy(in6m);
}
void
in6_delmulti(struct in6_multi *in6m)
{
rw_enter(&in6_multilock, RW_WRITER);
in6_delmulti_locked(in6m);
rw_exit(&in6_multilock);
}
/*
* Look up the in6_multi record for a given IP6 multicast address
* on a given interface. If no matching record is found, "in6m"
* returns NULL.
*/
struct in6_multi *
in6_lookup_multi(const struct in6_addr *addr, const struct ifnet *ifp)
{
struct in6_multi *in6m;
KASSERT(rw_lock_held(&in6_multilock));
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, addr))
break;
}
return in6m;
}
void
in6_lookup_and_delete_multi(const struct in6_addr *addr,
const struct ifnet *ifp)
{
struct in6_multi *in6m;
rw_enter(&in6_multilock, RW_WRITER);
in6m = in6_lookup_multi(addr, ifp);
if (in6m != NULL)
in6_delmulti_locked(in6m);
rw_exit(&in6_multilock);
}
bool
in6_multi_group(const struct in6_addr *addr, const struct ifnet *ifp)
{
bool ingroup;
rw_enter(&in6_multilock, RW_READER);
ingroup = in6_lookup_multi(addr, ifp) != NULL;
rw_exit(&in6_multilock);
return ingroup;
}
/*
* Purge in6_multi records associated to the interface.
*/
void
in6_purge_multi(struct ifnet *ifp)
{
struct in6_multi *in6m, *next;
rw_enter(&in6_multilock, RW_WRITER);
LIST_FOREACH_SAFE(in6m, &ifp->if_multiaddrs, in6m_entry, next) {
LIST_REMOVE(in6m, in6m_entry);
/*
* Normally multicast addresses are already purged at this
* point. Remaining references aren't accessible via ifp,
* so what we can do here is to prevent ifp from being
* accessed via in6m by removing it from the list of ifp.
*/
mld_stoptimer(in6m);
}
rw_exit(&in6_multilock);
}
void
in6_multi_lock(int op)
{
rw_enter(&in6_multilock, op);
}
void
in6_multi_unlock(void)
{
rw_exit(&in6_multilock);
}
bool
in6_multi_locked(int op)
{
switch (op) {
case RW_READER:
return rw_read_held(&in6_multilock);
case RW_WRITER:
return rw_write_held(&in6_multilock);
default:
return rw_lock_held(&in6_multilock);
}
}
struct in6_multi_mship *
in6_joingroup(struct ifnet *ifp, struct in6_addr *addr, int *errorp, int timer)
{
struct in6_multi_mship *imm;
imm = malloc(sizeof(*imm), M_IPMADDR, M_NOWAIT|M_ZERO);
if (imm == NULL) {
*errorp = ENOBUFS;
return NULL;
}
imm->i6mm_maddr = in6_addmulti(addr, ifp, errorp, timer);
if (!imm->i6mm_maddr) {
/* *errorp is already set */
free(imm, M_IPMADDR);
return NULL;
}
return imm;
}
int
in6_leavegroup(struct in6_multi_mship *imm)
{
struct in6_multi *in6m;
rw_enter(&in6_multilock, RW_WRITER);
in6m = imm->i6mm_maddr;
imm->i6mm_maddr = NULL;
if (in6m != NULL) {
in6_delmulti_locked(in6m);
}
rw_exit(&in6_multilock);
free(imm, M_IPMADDR);
return 0;
}
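/*
 * Illustrative usage sketch: joining an IPv6 multicast group on an
 * interface and leaving it again through the membership handle. The
 * example_* name is hypothetical; a timer of 0 starts listening
 * immediately.
 */
#if 0
static int
example_join_leave(struct ifnet *ifp, struct in6_addr *group)
{
struct in6_multi_mship *imm;
int error;
imm = in6_joingroup(ifp, group, &error, 0);
if (imm == NULL)
return error;
/* ... traffic for the group is now received ... */
return in6_leavegroup(imm);
}
#endif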
/*
* DEPRECATED: keep it just to avoid breaking old sysctl users.
*/
static int
in6_mkludge_sysctl(SYSCTLFN_ARGS)
{
if (namelen != 1)
return EINVAL;
*oldlenp = 0;
return 0;
}
static int
in6_multicast_sysctl(SYSCTLFN_ARGS)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct in6_ifaddr *ia6;
struct in6_multi *in6m;
uint32_t tmp;
int error;
size_t written;
struct psref psref, psref_ia;
int bound, s;
if (namelen != 1)
return EINVAL;
rw_enter(&in6_multilock, RW_READER);
bound = curlwp_bind();
ifp = if_get_byindex(name[0], &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
rw_exit(&in6_multilock);
return ENODEV;
}
if (oldp == NULL) {
*oldlenp = 0;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
*oldlenp += 2 * sizeof(struct in6_addr) +
sizeof(uint32_t);
}
}
pserialize_read_exit(s);
if_put(ifp, &psref);
curlwp_bindx(bound);
rw_exit(&in6_multilock);
return 0;
}
error = 0;
written = 0;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref_ia);
pserialize_read_exit(s);
ia6 = ifatoia6(ifa);
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
if (written + 2 * sizeof(struct in6_addr) +
sizeof(uint32_t) > *oldlenp)
goto done;
/*
* XXX return the first IPv6 address to keep backward
* compatibility; however, multicast addresses no longer
* belong to any particular IPv6 address, so this should be
* unnecessary.
*/
error = sysctl_copyout(l, &ia6->ia_addr.sin6_addr,
oldp, sizeof(struct in6_addr));
if (error)
goto done;
oldp = (char *)oldp + sizeof(struct in6_addr);
written += sizeof(struct in6_addr);
error = sysctl_copyout(l, &in6m->in6m_addr,
oldp, sizeof(struct in6_addr));
if (error)
goto done;
oldp = (char *)oldp + sizeof(struct in6_addr);
written += sizeof(struct in6_addr);
tmp = in6m->in6m_refcount;
error = sysctl_copyout(l, &tmp, oldp, sizeof(tmp));
if (error)
goto done;
oldp = (char *)oldp + sizeof(tmp);
written += sizeof(tmp);
}
s = pserialize_read_enter();
break;
}
pserialize_read_exit(s);
done:
ifa_release(ifa, &psref_ia);
if_put(ifp, &psref);
curlwp_bindx(bound);
rw_exit(&in6_multilock);
*oldlenp = written;
return error;
}
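/*
 * Note on the export format (derived from the copyout sequence above):
 * each record written by in6_multicast_sysctl() is two struct in6_addr
 * values followed by a uint32_t, i.e. the interface's first IPv6 address,
 * the multicast group address, and the group's reference count.
 */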
void
in6_sysctl_multicast_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "multicast",
SYSCTL_DESCR("Multicast information"),
in6_multicast_sysctl, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "multicast_kludge",
SYSCTL_DESCR("multicast kludge information"),
in6_mkludge_sysctl, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_CREATE, CTL_EOL);
}
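/*
 * Illustrative userland sketch, compiled out and not part of this file:
 * the "multicast" node above is created dynamically (CTL_CREATE), so a
 * consumer would resolve it by name and append the interface index as
 * the final name component.  Each record written by in6_multicast_sysctl()
 * is two struct in6_addr (interface address kept for compatibility, then
 * the group) followed by a uint32_t reference count.  The helper name is
 * an assumption made only for the example.
 */
#if 0
#include <sys/sysctl.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
print_in6_multi(const char *ifname)
{
	int mib[CTL_MAXNAME];
	size_t miblen = CTL_MAXNAME, len;
	char *buf, *p;

	if (sysctlnametomib("net.inet6.multicast", mib, &miblen) == -1)
		return;
	mib[miblen] = (int)if_nametoindex(ifname);

	/* First pass with oldp == NULL just reports the needed size. */
	if (sysctl(mib, miblen + 1, NULL, &len, NULL, 0) == -1 || len == 0)
		return;
	if ((buf = malloc(len)) == NULL)
		return;
	if (sysctl(mib, miblen + 1, buf, &len, NULL, 0) == -1) {
		free(buf);
		return;
	}

	for (p = buf; (size_t)(p - buf) + 2 * sizeof(struct in6_addr) +
	    sizeof(uint32_t) <= len;) {
		struct in6_addr group;
		uint32_t refs;
		char gstr[INET6_ADDRSTRLEN];

		/* Skip the interface address kept for compatibility. */
		p += sizeof(struct in6_addr);
		memcpy(&group, p, sizeof(group));
		p += sizeof(group);
		memcpy(&refs, p, sizeof(refs));
		p += sizeof(refs);
		if (inet_ntop(AF_INET6, &group, gstr, sizeof(gstr)) != NULL)
			printf("%s refcount %u\n", gstr, refs);
	}
	free(buf);
}
#endif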
/* $NetBSD: joy.c,v 1.21 2017/10/28 04:53:55 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1995 Jean-Marc Zucconi
* All rights reserved.
*
* Ported to NetBSD by Matthieu Herrb <matthieu@laas.fr>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: joy.c,v 1.21 2017/10/28 04:53:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/device.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/vnode.h>
#include <sys/bus.h>
#include <sys/joystick.h>
#include <dev/ic/joyvar.h>
#include "ioconf.h"
/*
* The game port can manage 4 buttons and 4 variable resistors (usually 2
* joysticks, each with 2 buttons and 2 pots.) via the port at address 0x201.
* Getting the state of the buttons is done by reading the game port;
* buttons 1-4 correspond to bits 4-7 and resistors 1-4 (X1, Y1, X2, Y2)
* to bits 0-3. If button 1 (resp. 2, 3, 4) is pressed, bit 4 (resp. 5,
* 6, 7) is set to 0. To get the value of a resistor, write the value 0xff
* to the port and wait until the corresponding bit returns to 0.
*/
#define JOYPART(d) (minor(d) & 1)
#define JOYUNIT(d) (minor(d) >> 1)
#ifndef JOY_TIMEOUT
#define JOY_TIMEOUT 2000 /* 2 milliseconds */
#endif
static dev_type_open(joyopen);
static dev_type_close(joyclose);
static dev_type_read(joyread);
static dev_type_ioctl(joyioctl);
const struct cdevsw joy_cdevsw = {
.d_open = joyopen,
.d_close = joyclose,
.d_read = joyread,
.d_write = nowrite,
.d_ioctl = joyioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
void
joyattach(struct joy_softc *sc)
{
if (sc->sc_lock == NULL) {
panic("joyattach: no lock");
}
sc->timeout[0] = 0;
sc->timeout[1] = 0;
mutex_enter(sc->sc_lock);
bus_space_write_1(sc->sc_iot, sc->sc_ioh, 0, 0xff);
DELAY(10000); /* 10 ms delay */
aprint_normal_dev(sc->sc_dev, "joystick %sconnected\n",
(bus_space_read_1(sc->sc_iot, sc->sc_ioh, 0) & 0x0f) == 0x0f ?
"not " : "");
mutex_exit(sc->sc_lock);
}
int
joydetach(struct joy_softc *sc, int flags)
{
int maj, mn;
maj = cdevsw_lookup_major(&joy_cdevsw);
mn = device_unit(sc->sc_dev) << 1;
vdevgone(maj, mn, mn, VCHR);
vdevgone(maj, mn + 1, mn + 1, VCHR);
return 0;
}
static int
joyopen(dev_t dev, int flag, int mode, struct lwp *l)
{
int unit = JOYUNIT(dev);
int i = JOYPART(dev);
struct joy_softc *sc;
sc = device_lookup_private(&joy_cd, unit);
if (sc == NULL)
return ENXIO;
mutex_enter(sc->sc_lock);
if (sc->timeout[i]) {
mutex_exit(sc->sc_lock);
return EBUSY;
}
sc->x_off[i] = sc->y_off[i] = 0;
sc->timeout[i] = JOY_TIMEOUT;
mutex_exit(sc->sc_lock);
return 0;
}
static int
joyclose(dev_t dev, int flag, int mode, struct lwp *l)
{
int unit = JOYUNIT(dev);
int i = JOYPART(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
mutex_enter(sc->sc_lock);
sc->timeout[i] = 0;
mutex_exit(sc->sc_lock);
return 0;
}
static int
joyread(dev_t dev, struct uio *uio, int flag)
{
int unit = JOYUNIT(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
bus_space_tag_t iot = sc->sc_iot;
bus_space_handle_t ioh = sc->sc_ioh;
struct joystick c;
struct timeval start, now, diff;
int state = 0, x = 0, y = 0, i;
mutex_enter(sc->sc_lock);
bus_space_write_1(iot, ioh, 0, 0xff);
microtime(&start);
now = start; /* structure assignment */
i = sc->timeout[JOYPART(dev)];
for (;;) {
timersub(&now, &start, &diff);
if (diff.tv_sec > 0 || diff.tv_usec > i)
break;
state = bus_space_read_1(iot, ioh, 0);
if (JOYPART(dev) == 1)
state >>= 2;
if (!x && !(state & 0x01))
x = diff.tv_usec;
if (!y && !(state & 0x02))
y = diff.tv_usec;
if (x && y)
break;
microtime(&now);
}
mutex_exit(sc->sc_lock);
c.x = x ? sc->x_off[JOYPART(dev)] + x : 0x80000000;
c.y = y ? sc->y_off[JOYPART(dev)] + y : 0x80000000;
state >>= 4;
c.b1 = ~state & 1;
c.b2 = ~(state >> 1) & 1;
return uiomove(&c, sizeof(struct joystick), uio);
}
static int
joyioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
int unit = JOYUNIT(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
int i = JOYPART(dev), x, error;
mutex_enter(sc->sc_lock);
error = 0;
switch (cmd) {
case JOY_SETTIMEOUT:
x = *(int *)data;
if (x < 1 || x > 10000) { /* 10ms maximum! */
error = EINVAL;
break;
}
sc->timeout[i] = x;
break;
case JOY_GETTIMEOUT:
*(int *)data = sc->timeout[i];
break;
case JOY_SET_X_OFFSET:
sc->x_off[i] = *(int *)data;
break;
case JOY_SET_Y_OFFSET:
sc->y_off[i] = *(int *)data;
break;
case JOY_GET_X_OFFSET:
*(int *)data = sc->x_off[i];
break;
case JOY_GET_Y_OFFSET:
*(int *)data = sc->y_off[i];
break;
default:
error = ENXIO;
break;
}
mutex_exit(sc->sc_lock);
return error;
}
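/*
 * Illustrative userland sketch, compiled out and not part of the driver:
 * open the character device, optionally shorten the sampling timeout
 * (microseconds, 1..10000), and read one struct joystick sample.  The
 * device path is an assumption made for the example.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/joystick.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
sample_joystick(void)
{
	struct joystick js;
	int fd, timeout = 5000;

	if ((fd = open("/dev/joy0", O_RDONLY)) == -1)
		return -1;
	if (ioctl(fd, JOY_SETTIMEOUT, &timeout) == -1 ||
	    read(fd, &js, sizeof(js)) != (ssize_t)sizeof(js)) {
		close(fd);
		return -1;
	}
	printf("x=%d y=%d b1=%d b2=%d\n", js.x, js.y, js.b1, js.b2);
	close(fd);
	return 0;
}
#endif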
/* $NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel reader/writer lock implementation, modeled after those
* found in Solaris, a description of which can be found in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*
* The NetBSD implementation differs from that described in the book, in
* that the locks are partially adaptive. Lock waiters spin wait while a
* lock is write held and the holder is still running on a CPU. The method
* of choosing which threads to awaken when a lock is released also differs,
* mainly to take account of the partially adaptive behaviour.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $");
#include "opt_lockdebug.h"
#define __RWLOCK_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
#include <dev/lockstat.h>
#include <machine/rwlock.h>
/*
* LOCKDEBUG
*/
#define RW_DEBUG_P(rw) (((rw)->rw_owner & RW_NODEBUG) == 0)
#define RW_WANTLOCK(rw, op) \
LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
#define RW_LOCKED(rw, op) \
LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
#define RW_UNLOCKED(rw, op) \
LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
/*
* DIAGNOSTIC
*/
#if defined(DIAGNOSTIC)
#define RW_ASSERT(rw, cond) \
do { \
if (__predict_false(!(cond))) \
rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
} while (/* CONSTCOND */ 0)
#else
#define RW_ASSERT(rw, cond) /* nothing */
#endif /* DIAGNOSTIC */
/*
* For platforms that do not provide stubs, or for the LOCKDEBUG case.
*/
#ifdef LOCKDEBUG
#undef __HAVE_RW_STUBS
#endif
#ifndef __HAVE_RW_STUBS
__strong_alias(rw_enter,rw_vector_enter);
__strong_alias(rw_exit,rw_vector_exit);
__strong_alias(rw_tryenter,rw_vector_tryenter);
#endif
static void rw_abort(const char *, size_t, krwlock_t *, const char *);
static void rw_dump(const volatile void *, lockop_printer_t);
static lwp_t *rw_owner(wchan_t);
lockops_t rwlock_lockops = {
.lo_name = "Reader / writer lock",
.lo_type = LOCKOPS_SLEEP,
.lo_dump = rw_dump,
};
/*
* Give rwlock holders an extra-high priority boost when blocking, due to
* direct handoff. XXX To be revisited.
*/
syncobj_t rw_syncobj = {
.sobj_name = "rwlock",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KTHREAD,
.sobj_unsleep = turnstile_unsleep,
.sobj_changepri = turnstile_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = rw_owner,
};
/*
* rw_cas:
*
* Do an atomic compare-and-swap on the lock word.
*/
static inline uintptr_t
rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
{
return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
(void *)o, (void *)n);
}
/*
* rw_swap:
*
* Do an atomic swap of the lock word. This is used only when it's
* known that the lock word is set up such that it can't be changed
* behind us (assert this), so there's no point considering the result.
*/
static inline void
rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
{
n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
(void *)n);
RW_ASSERT(rw, n == o);
RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0);
}
/*
* rw_dump:
*
* Dump the contents of a rwlock structure.
*/
static void
rw_dump(const volatile void *cookie, lockop_printer_t pr)
{
const volatile krwlock_t *rw = cookie;
pr("owner/count : %#018lx flags : %#018x\n",
(long)RW_OWNER(rw), (int)RW_FLAGS(rw));
}
/*
* rw_abort:
*
* Dump information about an error and panic the system. This
* generates a lot of machine code in the DIAGNOSTIC case, so
* we ask the compiler to not inline it.
*/
static void __noinline
rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
{
if (__predict_false(panicstr != NULL))
return;
LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
}
/*
* rw_init:
*
* Initialize a rwlock for use.
*/
void
_rw_init(krwlock_t *rw, uintptr_t return_address)
{
#ifdef LOCKDEBUG
/* XXX only because the assembly stubs can't handle RW_NODEBUG */
if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
rw->rw_owner = 0;
else
rw->rw_owner = RW_NODEBUG;
#else
rw->rw_owner = 0;
#endif
}
void
rw_init(krwlock_t *rw)
{
_rw_init(rw, (uintptr_t)__builtin_return_address(0));
}
/*
* rw_destroy:
*
* Tear down a rwlock.
*/
void
rw_destroy(krwlock_t *rw)
{
RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0);
LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
}
/*
* rw_oncpu:
*
* Return true if an rwlock owner is running on a CPU in the system.
* If the target is waiting on the kernel big lock, then we must
* release it. This is necessary to avoid deadlock.
*/
static bool
rw_oncpu(uintptr_t owner)
{
#ifdef MULTIPROCESSOR
struct cpu_info *ci;
lwp_t *l;
KASSERT(kpreempt_disabled());
if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED) {
return false;
}
/*
* See lwp_dtor() for why dereferencing the LWP pointer is safe.
* We must have kernel preemption disabled for that.
*/
l = (lwp_t *)(owner & RW_THREAD);
ci = l->l_cpu;
if (ci && ci->ci_curlwp == l) {
/* Target is running; do we need to block? */
return (ci->ci_biglock_wanted != l);
}
#endif
/* Not running. It may be safe to block now. */
return false;
}
/*
* rw_vector_enter:
*
* Acquire a rwlock.
*/
void
rw_vector_enter(krwlock_t *rw, const krw_t op)
{
uintptr_t owner, incr, need_wait, set_wait, curthread, next;
turnstile_t *ts;
int queue;
lwp_t *l;
LOCKSTAT_TIMER(slptime);
LOCKSTAT_TIMER(slpcnt);
LOCKSTAT_TIMER(spintime);
LOCKSTAT_COUNTER(spincnt);
LOCKSTAT_FLAG(lsflag);
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, !cpu_intr_p());
RW_ASSERT(rw, curthread != 0);
RW_WANTLOCK(rw, op);
if (__predict_true(panicstr == NULL)) {
KDASSERT(pserialize_not_in_read_section());
LOCKDEBUG_BARRIER(&kernel_lock, 1);
}
/*
* We play a slight trick here. If we're a reader, we want to
* increment the read count. If we're a writer, we want to
* set the owner field and the WRITE_LOCKED bit.
*
* In the latter case, we expect those bits to be zero,
* therefore we can use an add operation to set them, which
* means an add operation for both cases.
*/
if (__predict_true(op == RW_READER)) {
incr = RW_READ_INCR;
set_wait = RW_HAS_WAITERS;
need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
queue = TS_READER_Q;
} else {
RW_ASSERT(rw, op == RW_WRITER);
incr = curthread | RW_WRITE_LOCKED;
set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
need_wait = RW_WRITE_LOCKED | RW_THREAD;
queue = TS_WRITER_Q;
}
LOCKSTAT_ENTER(lsflag);
KPREEMPT_DISABLE(curlwp);
for (owner = rw->rw_owner;;) {
/*
* Read the lock owner field. If the need-to-wait
* indicator is clear, then try to acquire the lock.
*/
if ((owner & need_wait) == 0) {
next = rw_cas(rw, owner, (owner + incr) &
~RW_WRITE_WANTED);
if (__predict_true(next == owner)) {
/* Got it! */
membar_acquire();
break;
}
/*
* Didn't get it -- spin around again (we'll
* probably sleep on the next iteration).
*/
owner = next;
continue;
}
if (__predict_false(RW_OWNER(rw) == curthread)) {
rw_abort(__func__, __LINE__, rw,
"locking against myself");
}
/*
* If the lock owner is running on another CPU, and
* there are no existing waiters, then spin.
*/
if (rw_oncpu(owner)) {
LOCKSTAT_START_TIMER(lsflag, spintime);
u_int count = SPINLOCK_BACKOFF_MIN;
do {
KPREEMPT_ENABLE(curlwp);
SPINLOCK_BACKOFF(count);
KPREEMPT_DISABLE(curlwp);
owner = rw->rw_owner;
} while (rw_oncpu(owner));
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_COUNT(spincnt, 1);
if ((owner & need_wait) == 0)
continue;
}
/*
* Grab the turnstile chain lock. Once we have that, we
* can adjust the waiter bits and sleep queue.
*/
ts = turnstile_lookup(rw);
/*
* Mark the rwlock as having waiters. If the set fails,
* then we may not need to sleep and should spin again.
* Reload rw_owner because turnstile_lookup() may have
* spun on the turnstile chain lock.
*/
owner = rw->rw_owner;
if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
turnstile_exit(rw);
continue;
}
next = rw_cas(rw, owner, owner | set_wait);
/* XXX membar? */
if (__predict_false(next != owner)) {
turnstile_exit(rw);
owner = next;
continue;
}
LOCKSTAT_START_TIMER(lsflag, slptime);
turnstile_block(ts, queue, rw, &rw_syncobj);
LOCKSTAT_STOP_TIMER(lsflag, slptime);
LOCKSTAT_COUNT(slpcnt, 1);
/*
* No need for a memory barrier because of context switch.
* If not handed the lock, then spin again.
*/
if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
break;
owner = rw->rw_owner;
}
KPREEMPT_ENABLE(curlwp);
LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
(op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
(l->l_rwcallsite != 0 ? l->l_rwcallsite :
(uintptr_t)__builtin_return_address(0)));
LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
(l->l_rwcallsite != 0 ? l->l_rwcallsite :
(uintptr_t)__builtin_return_address(0)));
LOCKSTAT_EXIT(lsflag);
RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
(op == RW_READER && RW_COUNT(rw) != 0));
RW_LOCKED(rw, op);
}
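/*
 * Illustrative sketch, compiled out and not part of the original source:
 * the basic consumer pattern for the interface implemented above.  The
 * structure and field names are assumptions made only for the example.
 */
#if 0
struct example_counter {
	krwlock_t	ec_lock;
	int		ec_value;
};

static void
example_counter_init(struct example_counter *ec)
{
	rw_init(&ec->ec_lock);
	ec->ec_value = 0;
}

static int
example_counter_read(struct example_counter *ec)
{
	int value;

	rw_enter(&ec->ec_lock, RW_READER);	/* shared hold */
	value = ec->ec_value;
	rw_exit(&ec->ec_lock);
	return value;
}

static void
example_counter_bump(struct example_counter *ec)
{
	rw_enter(&ec->ec_lock, RW_WRITER);	/* exclusive hold */
	ec->ec_value++;
	rw_exit(&ec->ec_lock);
}

static void
example_counter_destroy(struct example_counter *ec)
{
	rw_destroy(&ec->ec_lock);
}
#endif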
/*
* rw_vector_exit:
*
* Release a rwlock.
*/
void
rw_vector_exit(krwlock_t *rw)
{
uintptr_t curthread, owner, decr, newown, next;
turnstile_t *ts;
int rcnt, wcnt;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
/*
* Again, we use a trick. Since we used an add operation to
* set the required lock bits, we can use a subtract to clear
* them, which makes the read-release and write-release path
* the same.
*/
owner = rw->rw_owner;
if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
RW_UNLOCKED(rw, RW_WRITER);
RW_ASSERT(rw, RW_OWNER(rw) == curthread);
decr = curthread | RW_WRITE_LOCKED;
} else {
RW_UNLOCKED(rw, RW_READER);
RW_ASSERT(rw, RW_COUNT(rw) != 0);
decr = RW_READ_INCR;
}
/*
* Compute what we expect the new value of the lock to be. Only
* proceed to do direct handoff if there are waiters, and if the
* lock would become unowned.
*/
membar_release();
for (;;) {
newown = (owner - decr);
if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
break;
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner))
return;
owner = next;
}
/*
* Grab the turnstile chain lock. This gets the interlock
* on the sleep queue. Once we have that, we can adjust the
* waiter bits.
*/
ts = turnstile_lookup(rw);
owner = rw->rw_owner;
RW_ASSERT(rw, ts != NULL);
RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0);
wcnt = TS_WAITERS(ts, TS_WRITER_Q);
rcnt = TS_WAITERS(ts, TS_READER_Q);
/*
* Give the lock away.
*
* If we are releasing a write lock, then prefer to wake all
* outstanding readers. Otherwise, wake one writer if there
* are outstanding readers, or all writers if there are no
* pending readers. If waking one specific writer, the writer
* is handed the lock here. If waking multiple writers, we
* set WRITE_WANTED to block out new readers, and let them
* do the work of acquiring the lock in rw_vector_enter().
*/
if (rcnt == 0 || decr == RW_READ_INCR) {
RW_ASSERT(rw, wcnt != 0);
RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0);
if (rcnt != 0) {
/* Give the lock to the longest waiting writer. */
l = TS_FIRST(ts, TS_WRITER_Q);
newown = (uintptr_t)l | (owner & RW_NODEBUG);
newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS;
if (wcnt > 1)
newown |= RW_WRITE_WANTED;
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
} else {
/* Wake all writers and let them fight it out. */
newown = owner & RW_NODEBUG;
newown |= RW_WRITE_WANTED;
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
}
} else {
RW_ASSERT(rw, rcnt != 0);
/*
* Give the lock to all blocked readers. If there
* is a writer waiting, new readers that arrive
* after the release will be blocked out.
*/
newown = owner & RW_NODEBUG;
newown += rcnt << RW_READ_COUNT_SHIFT;
if (wcnt != 0)
newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
/* Wake up all sleeping readers. */
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
}
}
/*
* rw_vector_tryenter:
*
* Try to acquire a rwlock.
*/
int
rw_vector_tryenter(krwlock_t *rw, const krw_t op)
{
uintptr_t curthread, owner, incr, need_wait, next;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
if (op == RW_READER) {
incr = RW_READ_INCR;
need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
} else {
RW_ASSERT(rw, op == RW_WRITER);
incr = curthread | RW_WRITE_LOCKED;
need_wait = RW_WRITE_LOCKED | RW_THREAD;
}
for (owner = rw->rw_owner;; owner = next) {
if (__predict_false((owner & need_wait) != 0))
return 0;
next = rw_cas(rw, owner, owner + incr);
if (__predict_true(next == owner)) {
/* Got it! */
break;
}
}
RW_WANTLOCK(rw, op);
RW_LOCKED(rw, op);
RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
(op == RW_READER && RW_COUNT(rw) != 0));
membar_acquire();
return 1;
}
/*
* rw_downgrade:
*
* Downgrade a write lock to a read lock.
*/
void
rw_downgrade(krwlock_t *rw)
{
uintptr_t owner, newown, next, curthread __diagused;
turnstile_t *ts;
int rcnt, wcnt;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
RW_ASSERT(rw, RW_OWNER(rw) == curthread);
RW_UNLOCKED(rw, RW_WRITER);
membar_release();
for (owner = rw->rw_owner;; owner = next) {
/*
* If there are no waiters we can do this the easy way. Try
* swapping us down to one read hold. If it fails, the lock
* condition has changed and we most likely now have
* waiters.
*/
if ((owner & RW_HAS_WAITERS) == 0) {
newown = (owner & RW_NODEBUG);
next = rw_cas(rw, owner, newown + RW_READ_INCR);
if (__predict_true(next == owner)) {
RW_LOCKED(rw, RW_READER);
RW_ASSERT(rw,
(rw->rw_owner & RW_WRITE_LOCKED) == 0);
RW_ASSERT(rw, RW_COUNT(rw) != 0);
return;
}
continue;
}
/*
* Grab the turnstile chain lock. This gets the interlock
* on the sleep queue. Once we have that, we can adjust the
* waiter bits.
*/
ts = turnstile_lookup(rw);
RW_ASSERT(rw, ts != NULL);
rcnt = TS_WAITERS(ts, TS_READER_Q);
wcnt = TS_WAITERS(ts, TS_WRITER_Q);
if (rcnt == 0) {
/*
* If there are no readers, just preserve the
* waiters bits, swap us down to one read hold and
* return.
*/
RW_ASSERT(rw, wcnt != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);
newown = owner & RW_NODEBUG;
newown |= RW_READ_INCR | RW_HAS_WAITERS |
RW_WRITE_WANTED;
next = rw_cas(rw, owner, newown);
turnstile_exit(rw);
if (__predict_true(next == owner))
break;
} else {
/*
* Give the lock to all blocked readers. We may
* retain one read hold if downgrading. If there is
* a writer waiting, new readers will be blocked
* out.
*/
newown = owner & RW_NODEBUG;
newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
if (wcnt != 0)
newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner)) {
/* Wake up all sleeping readers. */
turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
break;
}
turnstile_exit(rw);
}
}
RW_WANTLOCK(rw, RW_READER);
RW_LOCKED(rw, RW_READER);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
RW_ASSERT(rw, RW_COUNT(rw) != 0);
}
/*
* rw_tryupgrade:
*
* Try to upgrade a read lock to a write lock. We must be the only
* reader.
*/
int
rw_tryupgrade(krwlock_t *rw)
{
uintptr_t owner, curthread, newown, next;
struct lwp *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
RW_ASSERT(rw, rw_read_held(rw));
for (owner = RW_READ_INCR;; owner = next) {
newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner)) {
membar_acquire();
break;
}
RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0);
if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) {
RW_ASSERT(rw, (next & RW_THREAD) != 0);
return 0;
}
}
RW_UNLOCKED(rw, RW_READER);
RW_WANTLOCK(rw, RW_WRITER);
RW_LOCKED(rw, RW_WRITER);
RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
RW_ASSERT(rw, RW_OWNER(rw) == curthread);
return 1;
}
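/*
 * Illustrative sketch, compiled out and not part of the original source:
 * the usual pattern around rw_tryupgrade() and rw_downgrade().  If the
 * upgrade fails because other readers hold the lock, the caller must
 * drop the read hold, re-acquire as a writer, and re-validate anything
 * learned under the read hold.
 */
#if 0
static void
example_upgrade_downgrade(krwlock_t *lock)
{
	rw_enter(lock, RW_READER);
	/* ... inspect shared state ... */
	if (!rw_tryupgrade(lock)) {
		rw_exit(lock);
		rw_enter(lock, RW_WRITER);
		/* ... the state may have changed; re-check it here ... */
	}
	/* ... modify state under the write hold ... */
	rw_downgrade(lock);	/* keep a read hold, let other readers in */
	/* ... continue reading ... */
	rw_exit(lock);
}
#endif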
/*
* rw_read_held:
*
* Returns true if the rwlock is held for reading. Must only be
* used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_read_held(krwlock_t *rw)
{
uintptr_t owner;
if (rw == NULL)
return 0;
owner = rw->rw_owner;
return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
}
/*
* rw_write_held:
*
* Returns true if the rwlock is held for writing. Must only be
* used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_write_held(krwlock_t *rw)
{
if (rw == NULL)
return 0;
return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
(RW_WRITE_LOCKED | (uintptr_t)curlwp);
}
/*
* rw_lock_held:
*
* Returns true if the rwlock is held for reading or writing. Must
* only be used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_lock_held(krwlock_t *rw)
{
if (rw == NULL)
return 0;
return (rw->rw_owner & RW_THREAD) != 0;
}
/*
* rw_lock_op:
*
* For a rwlock that is known to be held by the caller, return
* RW_READER or RW_WRITER to describe the hold type.
*/
krw_t
rw_lock_op(krwlock_t *rw)
{
RW_ASSERT(rw, rw_lock_held(rw));
return (rw->rw_owner & RW_WRITE_LOCKED) != 0 ? RW_WRITER : RW_READER;
}
/*
* rw_owner:
*
* Return the current owner of an RW lock, but only if it is write
* held. Used for priority inheritance.
*/
static lwp_t *
rw_owner(wchan_t obj)
{
krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
uintptr_t owner = rw->rw_owner;
if ((owner & RW_WRITE_LOCKED) == 0)
return NULL;
return (void *)(owner & RW_THREAD);
}
/* $NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran, and by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
*/
/*-
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
*/
/*
* The buffer cache subsystem.
*
* Some references:
* Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
* Leffler, et al.: The Design and Implementation of the 4.3BSD
* UNIX Operating System (Addison-Wesley, 1989)
*
* Locking
*
* There are three locks:
* - bufcache_lock: protects global buffer cache state.
* - BC_BUSY: a long term per-buffer lock.
* - buf_t::b_objlock: lock on completion (biowait vs biodone).
*
* For buffers associated with vnodes (the most common case), b_objlock points
* to the vnode_t::v_interlock. Otherwise, it points to the generic buffer_lock.
*
* Lock order:
* bufcache_lock ->
* buf_t::b_objlock
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_bufcache.h"
#include "opt_dtrace.h"
#include "opt_biohist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/wapbl.h>
#include <sys/bitops.h>
#include <sys/cprng.h>
#include <sys/sdt.h>
#include <uvm/uvm.h> /* extern struct uvm uvm */
#include <miscfs/specfs/specdev.h>
SDT_PROVIDER_DEFINE(io);
SDT_PROBE_DEFINE4(io, kernel, , bbusy__start,
"struct buf *"/*bp*/,
"bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/);
SDT_PROBE_DEFINE5(io, kernel, , bbusy__done,
"struct buf *"/*bp*/,
"bool"/*intr*/,
"int"/*timo*/,
"kmutex_t *"/*interlock*/,
"int"/*error*/);
SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start);
SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE3(io, kernel, , getblk__start,
"struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/);
SDT_PROBE_DEFINE4(io, kernel, , getblk__done,
"struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/,
"struct buf *"/*bp*/);
SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/);
#ifndef BUFPAGES
# define BUFPAGES 0
#endif
#ifdef BUFCACHE
# if (BUFCACHE < 5) || (BUFCACHE > 95)
# error BUFCACHE is not between 5 and 95
# endif
#else
# define BUFCACHE 15
#endif
u_int nbuf; /* desired number of buffer headers */
u_int bufpages = BUFPAGES; /* optional hardwired count */
u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */
/*
* Definitions for the buffer free lists.
*/
#define BQUEUES 3 /* number of free buffer queues */
#define BQ_LOCKED 0 /* super-blocks &c */
#define BQ_LRU 1 /* lru, useful buffers */
#define BQ_AGE 2 /* rubbish */
struct bqueue {
TAILQ_HEAD(, buf) bq_queue;
uint64_t bq_bytes;
buf_t *bq_marker;
};
static struct bqueue bufqueues[BQUEUES] __cacheline_aligned;
/* Function prototypes */
static void buf_setwm(void);
static int buf_trim(void);
static void *bufpool_page_alloc(struct pool *, int);
static void bufpool_page_free(struct pool *, void *);
static buf_t *bio_doread(struct vnode *, daddr_t, int, int);
static buf_t *getnewbuf(int, int, int);
static int buf_lotsfree(void);
static int buf_canrelease(void);
static u_long buf_mempoolidx(u_long);
static u_long buf_roundsize(u_long);
static void *buf_alloc(size_t);
static void buf_mrelease(void *, size_t);
static void binsheadfree(buf_t *, struct bqueue *);
static void binstailfree(buf_t *, struct bqueue *);
#ifdef DEBUG
static int checkfreelist(buf_t *, struct bqueue *, int);
#endif
static void biointr(void *);
static void biodone2(buf_t *);
static void sysctl_kern_buf_setup(void);
static void sysctl_vm_buf_setup(void);
/* Initialization for biohist */
#include <sys/biohist.h>
BIOHIST_DEFINE(biohist);
void
biohist_init(void)
{
BIOHIST_INIT(biohist, BIOHIST_SIZE);
}
/*
* Definitions for the buffer hash lists.
*/
#define BUFHASH(dvp, lbn) \
(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long bufhash;
static int bufhash_stats(struct hashstat_sysctl *, bool);
static kcondvar_t needbuffer_cv;
/*
* Buffer queue lock.
*/
kmutex_t bufcache_lock __cacheline_aligned;
kmutex_t buffer_lock __cacheline_aligned;
/* Software ISR for completed transfers. */
static void *biodone_sih;
/* Buffer pool for I/O buffers. */
static pool_cache_t buf_cache;
static pool_cache_t bufio_cache;
#define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */
#define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1)
__CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE);
/* Buffer memory pools */
static struct pool bmempools[NMEMPOOLS];
static struct vm_map *buf_map;
/*
* Buffer memory pool allocator.
*/
static void *
bufpool_page_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(buf_map,
MAXBSIZE, MAXBSIZE,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK)
| UVM_KMF_WIRED);
}
static void
bufpool_page_free(struct pool *pp, void *v)
{
uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
}
static struct pool_allocator bufmempool_allocator = {
.pa_alloc = bufpool_page_alloc,
.pa_free = bufpool_page_free,
.pa_pagesz = MAXBSIZE,
};
/* Buffer memory management variables */
u_long bufmem_valimit;
u_long bufmem_hiwater;
u_long bufmem_lowater;
u_long bufmem;
/*
* MD code can call this to set a hard limit on the amount
* of virtual memory used by the buffer cache.
*/
int
buf_setvalimit(vsize_t sz)
{
/* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
if (sz < NMEMPOOLS * MAXBSIZE)
return EINVAL;
bufmem_valimit = sz;
return 0;
}
static void
buf_setwm(void)
{
bufmem_hiwater = buf_memcalc();
/* lowater is approx. 2% of memory (with bufcache = 15) */
#define BUFMEM_WMSHIFT 3
#define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
if (bufmem_hiwater < BUFMEM_HIWMMIN)
/* Ensure a reasonable minimum value */
bufmem_hiwater = BUFMEM_HIWMMIN;
bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
}
#ifdef DEBUG
int debug_verify_freelist = 0;
static int
checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
{
buf_t *b;
if (!debug_verify_freelist)
return 1;
TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
if (b == bp)
return ison ? 1 : 0;
}
return ison ? 0 : 1;
}
#endif
/*
* Insq/Remq for the buffer hash lists.
* Call with buffer queue locked.
*/
static void
binsheadfree(buf_t *bp, struct bqueue *dp)
{
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bp->b_freelistindex == -1);
TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes += bp->b_bufsize;
bp->b_freelistindex = dp - bufqueues;
}
static void
binstailfree(buf_t *bp, struct bqueue *dp)
{
KASSERT(mutex_owned(&bufcache_lock));
KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? "
"bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex);
TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes += bp->b_bufsize;
bp->b_freelistindex = dp - bufqueues;
}
void
bremfree(buf_t *bp)
{
struct bqueue *dp;
int bqidx = bp->b_freelistindex;
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bqidx != -1);
dp = &bufqueues[bqidx];
KDASSERT(checkfreelist(bp, dp, 1));
KASSERT(dp->bq_bytes >= bp->b_bufsize);
TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes -= bp->b_bufsize;
/* For the sysctl helper. */
if (bp == dp->bq_marker)
dp->bq_marker = NULL;
#if defined(DIAGNOSTIC)
bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
}
/*
* note that for some ports this is used by pmap bootstrap code to
* determine kva size.
*/
u_long
buf_memcalc(void)
{
u_long n;
vsize_t mapsz = 0;
/*
* Determine the upper bound of memory to use for buffers.
*
* - If bufpages is specified, use that as the number
* of pages.
*
* - Otherwise, use bufcache as the percentage of
* physical memory.
*/
if (bufpages != 0) {
n = bufpages;
} else {
if (bufcache < 5) {
printf("forcing bufcache %d -> 5", bufcache);
bufcache = 5;
}
if (bufcache > 95) {
printf("forcing bufcache %d -> 95", bufcache);
bufcache = 95;
}
if (buf_map != NULL)
mapsz = vm_map_max(buf_map) - vm_map_min(buf_map);
n = calc_cache_size(mapsz, bufcache,
(buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
/ PAGE_SIZE;
}
n <<= PAGE_SHIFT;
if (bufmem_valimit != 0 && n > bufmem_valimit)
n = bufmem_valimit;
return (n);
}
/*
* Initialize buffers and hash links for buffers.
*/
void
bufinit(void)
{
struct bqueue *dp;
int use_std;
u_int i;
biodone_vfs = biodone;
mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&needbuffer_cv, "needbuf");
if (bufmem_valimit != 0) {
vaddr_t minaddr = 0, maxaddr;
buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
bufmem_valimit, 0, false, 0);
if (buf_map == NULL)
panic("bufinit: cannot allocate submap");
} else
buf_map = kernel_map;
/*
* Initialize buffer cache memory parameters.
*/
bufmem = 0;
buf_setwm();
/* On "small" machines use small pool page sizes where possible */
use_std = (physmem < atop(16*1024*1024));
/*
* Also use them on systems that can map the pool pages using
* a direct-mapped segment.
*/
#ifdef PMAP_MAP_POOLPAGE
use_std = 1;
#endif
buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
"bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
"biopl", NULL, IPL_BIO, NULL, NULL, NULL);
for (i = 0; i < NMEMPOOLS; i++) {
struct pool_allocator *pa;
struct pool *pp = &bmempools[i];
u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
if (__predict_false(size >= 1048576))
(void)snprintf(name, 8, "buf%um", size / 1048576);
else if (__predict_true(size >= 1024))
(void)snprintf(name, 8, "buf%uk", size / 1024);
else
(void)snprintf(name, 8, "buf%ub", size);
pa = (size <= PAGE_SIZE && use_std)
? &pool_allocator_nointr
: &bufmempool_allocator;
pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE);
pool_setlowat(pp, 1);
pool_sethiwat(pp, 1);
}
/* Initialize the buffer queues */
for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
TAILQ_INIT(&dp->bq_queue);
dp->bq_bytes = 0;
}
/*
* Estimate hash table size based on the amount of memory we
* intend to use for the buffer cache. The average buffer
* size is dependent on our clients (i.e. filesystems).
*
* For now, use an empirical 3K per buffer.
*/
nbuf = (bufmem_hiwater / 1024) / 3;
bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
sysctl_kern_buf_setup();
sysctl_vm_buf_setup();
hashstat_register("bufhash", bufhash_stats);
}
void
bufinit2(void)
{
biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
NULL);
if (biodone_sih == NULL)
panic("bufinit2: can't establish soft interrupt");
}
static int
buf_lotsfree(void)
{
u_long guess;
/* Always allocate if less than the low water mark. */
if (bufmem < bufmem_lowater)
return 1;
/* Never allocate if greater than the high water mark. */
if (bufmem > bufmem_hiwater)
return 0;
/* If there's anything on the AGE list, it should be eaten. */
if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
return 0;
/*
* The probability of getting a new allocation falls off roughly
* linearly as the current size of the cache grows above the low
* water mark toward the high water mark. Divide the total first
* to avoid overflows in the product.
*/
guess = cprng_fast32() % 16;
if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >=
(bufmem - bufmem_lowater))
return 1;
/* Otherwise don't allocate. */
return 0;
}
/*
* Return estimate of bytes we think need to be
* released to help resolve low memory conditions.
*
* => called with bufcache_lock held.
*/
static int
buf_canrelease(void)
{
int pagedemand, ninvalid = 0;
KASSERT(mutex_owned(&bufcache_lock));
if (bufmem < bufmem_lowater)
return 0;
if (bufmem > bufmem_hiwater)
return bufmem - bufmem_hiwater;
ninvalid += bufqueues[BQ_AGE].bq_bytes;
pagedemand = uvmexp.freetarg - uvm_availmem(false);
if (pagedemand < 0)
return ninvalid;
return MAX(ninvalid, MIN(2 * MAXBSIZE,
MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
}
/*
* Buffer memory allocation helper functions
*/
static u_long
buf_mempoolidx(u_long size)
{
u_int n = 0;
size -= 1;
size >>= MEMPOOL_INDEX_OFFSET;
while (size) {
size >>= 1;
n += 1;
}
if (n >= NMEMPOOLS)
panic("buf mem pool index %d", n);
return n;
}
static u_long
buf_roundsize(u_long size)
{
/* Round up to nearest power of 2 */
return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
}
static void *
buf_alloc(size_t size)
{
u_int n = buf_mempoolidx(size);
void *addr;
while (1) {
addr = pool_get(&bmempools[n], PR_NOWAIT);
if (addr != NULL)
break;
/* No memory, see if we can free some. If so, try again */
mutex_enter(&bufcache_lock);
if (buf_drain(1) > 0) {
mutex_exit(&bufcache_lock);
continue;
}
if (curlwp == uvm.pagedaemon_lwp) {
mutex_exit(&bufcache_lock);
return NULL;
}
/* Wait for buffers to arrive on the LRU queue */
cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
mutex_exit(&bufcache_lock);
}
return addr;
}
static void
buf_mrelease(void *addr, size_t size)
{
pool_put(&bmempools[buf_mempoolidx(size)], addr);
}
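/*
 * Illustrative sketch, compiled out and not part of the original source:
 * how the pool index and rounding above work out for concrete request
 * sizes, assuming DEV_BSIZE == 512 so that the smallest pool is 512 bytes.
 */
#if 0
static void
example_buf_rounding(void)
{
	KASSERT(buf_mempoolidx(512) == 0);
	KASSERT(buf_roundsize(512) == 512);
	KASSERT(buf_mempoolidx(3000) == 3);	/* 512 << 3 == 4096 */
	KASSERT(buf_roundsize(3000) == 4096);
}
#endif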
/*
* bread()/breadn() helper.
*/
static buf_t *
bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
{
buf_t *bp;
struct mount *mp;
bp = getblk(vp, blkno, size, 0, 0);
/*
* getblk() may return NULL if we are the pagedaemon.
*/
if (bp == NULL) {
KASSERT(curlwp == uvm.pagedaemon_lwp);
return NULL;
}
/*
* If the buffer does not have valid data, start a read.
* Note that if the buffer is BC_INVAL, getblk() won't return it.
* Therefore, it's valid if its I/O has completed or been delayed.
*/
if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
/* Start I/O for the buffer. */
SET(bp->b_flags, B_READ | async);
if (async)
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
else
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
/* Pay for the read. */
curlwp->l_ru.ru_inblock++;
} else if (async)
brelse(bp, 0);
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
/*
* Collect statistics on synchronous and asynchronous reads.
* Reads from block devices are charged to their associated
* filesystem (if any).
*/
if (mp != NULL) {
if (async == 0)
mp->mnt_stat.f_syncreads++;
else
mp->mnt_stat.f_asyncreads++;
}
return (bp);
}
/*
* Read a disk block.
* This algorithm described in Bach (p.54).
*/
int
bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp)
{
buf_t *bp;
int error;
BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
/* Get buffer for block. */
bp = *bpp = bio_doread(vp, blkno, size, 0);
if (bp == NULL)
return ENOMEM;
/* Wait for the read to complete, and return result. */
error = biowait(bp);
if (error == 0 && (flags & B_MODIFY) != 0)
error = fscow_run(bp, true);
if (error) {
brelse(bp, 0);
*bpp = NULL;
}
return error;
}
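/*
 * Illustrative sketch, compiled out and not part of the original source:
 * the usual consumer pattern for bread().  The vnode, block number and
 * size are assumed to come from the caller.
 */
#if 0
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size)
{
	buf_t *bp;
	int error;

	error = bread(vp, blkno, size, 0, &bp);
	if (error)
		return error;	/* on error bread() already released bp */

	/* ... consume the "size" valid bytes at bp->b_data ... */

	brelse(bp, 0);		/* hand the buffer back to the cache */
	return 0;
}
#endif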
/*
* Read-ahead multiple disk blocks. The first is sync, the rest async.
* Trivial modification to the breada algorithm presented in Bach (p.55).
*/
int
breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
int *rasizes, int nrablks, int flags, buf_t **bpp)
{
buf_t *bp;
int error, i;
BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
bp = *bpp = bio_doread(vp, blkno, size, 0);
if (bp == NULL)
return ENOMEM;
/*
* For each of the read-ahead blocks, start a read, if necessary.
*/
mutex_enter(&bufcache_lock);
for (i = 0; i < nrablks; i++) {
/* If it's in the cache, just go on to next one. */
if (incore(vp, rablks[i]))
continue;
/* Get a buffer for the read-ahead block */
mutex_exit(&bufcache_lock);
(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
mutex_enter(&bufcache_lock);
}
mutex_exit(&bufcache_lock);
/* Otherwise, we had to start a read for it; wait until it's valid. */
error = biowait(bp);
if (error == 0 && (flags & B_MODIFY) != 0)
error = fscow_run(bp, true);
if (error) {
brelse(bp, 0);
*bpp = NULL;
}
return error;
}
/*
* Block write. Described in Bach (p.56)
*/
int
bwrite(buf_t *bp)
{
int rv, sync, wasdelayed;
struct vnode *vp;
struct mount *mp;
BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
(uintptr_t)bp, 0, 0, 0);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
vp = bp->b_vp;
/*
* dholland 20160728 AFAICT vp==NULL must be impossible as it
* will crash upon reaching VOP_STRATEGY below... see further
* analysis on tech-kern.
*/
KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode");
if (vp != NULL) {
KASSERT(bp->b_objlock == vp->v_interlock);
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
} else {
mp = NULL;
}
if (mp && mp->mnt_wapbl) {
if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
bdwrite(bp);
return 0;
}
}
/*
* Remember buffer type, to switch on it later. If the write was
* synchronous, but the file system was mounted with MNT_ASYNC,
* convert it to a delayed write.
* XXX note that this relies on delayed tape writes being converted
* to async, not sync writes (which is safe, but ugly).
*/
sync = !ISSET(bp->b_flags, B_ASYNC);
if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
bdwrite(bp);
return (0);
}
/*
* Collect statistics on synchronous and asynchronous writes.
* Writes to block devices are charged to their associated
* filesystem (if any).
*/
if (mp != NULL) {
if (sync)
mp->mnt_stat.f_syncwrites++;
else
mp->mnt_stat.f_asyncwrites++;
}
/*
* Pay for the I/O operation and make sure the buf is on the correct
* vnode queue.
*/
bp->b_error = 0;
wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
CLR(bp->b_flags, B_READ);
if (wasdelayed) {
mutex_enter(&bufcache_lock);
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
reassignbuf(bp, bp->b_vp);
/* Wake anyone trying to busy the buffer via vnode's lists. */
cv_broadcast(&bp->b_busy);
mutex_exit(&bufcache_lock);
} else {
curlwp->l_ru.ru_oublock++;
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
}
if (vp != NULL)
vp->v_numoutput++;
mutex_exit(bp->b_objlock);
/* Initiate disk write. */
if (sync)
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
else
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
VOP_STRATEGY(vp, bp);
if (sync) {
/* If I/O was synchronous, wait for it to complete. */
rv = biowait(bp);
/* Release the buffer. */
brelse(bp, 0);
return (rv);
} else {
return (0);
}
}
int
vn_bwrite(void *v)
{
struct vop_bwrite_args *ap = v;
return (bwrite(ap->a_bp));
}
/*
* Delayed write.
*
* The buffer is marked dirty, but is not queued for I/O.
* This routine should be used when the buffer is expected
* to be modified again soon, typically a small write that
* partially fills a buffer.
*
* NB: magnetic tapes cannot be delayed; they must be
* written in the order that the writes are requested.
*
* Described in Leffler, et al. (pp. 208-213).
*/
void
bdwrite(buf_t *bp)
{
BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
(uintptr_t)bp, 0, 0, 0);
KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
/* If this is a tape block, write the block now. */
if (bdev_type(bp->b_dev) == D_TAPE) {
bawrite(bp);
return;
}
if (wapbl_vphaswapbl(bp->b_vp)) {
struct mount *mp = wapbl_vptomp(bp->b_vp);
if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
WAPBL_ADD_BUF(mp, bp);
}
}
/*
* If the block hasn't been seen before:
* (1) Mark it as having been seen,
* (2) Charge for the write,
* (3) Make sure it's on its vnode's correct block list.
*/
KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock);
if (!ISSET(bp->b_oflags, BO_DELWRI)) {
mutex_enter(&bufcache_lock);
mutex_enter(bp->b_objlock);
SET(bp->b_oflags, BO_DELWRI);
curlwp->l_ru.ru_oublock++;
reassignbuf(bp, bp->b_vp);
/* Wake anyone trying to busy the buffer via vnode's lists. */
cv_broadcast(&bp->b_busy);
mutex_exit(&bufcache_lock);
} else {
mutex_enter(bp->b_objlock);
}
/* Otherwise, the "write" is done, so mark and release the buffer. */
CLR(bp->b_oflags, BO_DONE);
mutex_exit(bp->b_objlock);
brelse(bp, 0);
}
/*
* Asynchronous block write; just an asynchronous bwrite().
*/
void
bawrite(buf_t *bp)
{
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(bp->b_vp != NULL);
SET(bp->b_flags, B_ASYNC);
VOP_BWRITE(bp->b_vp, bp);
}
/*
* Release a buffer on to the free lists.
* Described in Bach (p. 46).
*/
void
brelsel(buf_t *bp, int set)
{
struct bqueue *bufq;
struct vnode *vp;
SDT_PROBE2(io, kernel, , brelse, bp, set);
KASSERT(bp != NULL);
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(!cv_has_waiters(&bp->b_done));
SET(bp->b_cflags, set);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(bp->b_iodone == NULL);
/* Wake up any processes waiting for any buffer to become free. */
cv_signal(&needbuffer_cv);
/* Wake up any processes waiting for _this_ buffer to become free. */
if (ISSET(bp->b_cflags, BC_WANTED))
CLR(bp->b_cflags, BC_WANTED|BC_AGE);
/* If it's clean clear the copy-on-write flag. */
if (ISSET(bp->b_flags, B_COWDONE)) {
mutex_enter(bp->b_objlock);
if (!ISSET(bp->b_oflags, BO_DELWRI))
CLR(bp->b_flags, B_COWDONE);
mutex_exit(bp->b_objlock);
}
/*
* Determine which queue the buffer should be on, then put it there.
*/
/* If it's locked, don't report an error; try again later. */
if (ISSET(bp->b_flags, B_LOCKED))
bp->b_error = 0;
/* If it's not cacheable, or an error, mark it invalid. */
if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
SET(bp->b_cflags, BC_INVAL);
if (ISSET(bp->b_cflags, BC_VFLUSH)) {
/*
* This is a delayed write buffer that was just flushed to
* disk. It is still on the LRU queue. If it's become
* invalid, then we need to move it to a different queue;
* otherwise leave it in its current position.
*/
CLR(bp->b_cflags, BC_VFLUSH);
if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
!ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
goto already_queued;
} else {
bremfree(bp);
}
}
KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));
if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
/*
* If it's invalid or empty, dissociate it from its vnode
* and put on the head of the appropriate queue.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
if (wapbl_vphaswapbl(vp = bp->b_vp)) {
struct mount *mp = wapbl_vptomp(vp);
KASSERT(bp->b_iodone
!= mp->mnt_wapbl_op->wo_wapbl_biodone);
WAPBL_REMOVE_BUF(mp, bp);
}
}
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
if ((vp = bp->b_vp) != NULL) {
KASSERT(bp->b_objlock == vp->v_interlock);
reassignbuf(bp, bp->b_vp);
brelvp(bp);
mutex_exit(vp->v_interlock);
} else {
KASSERT(bp->b_objlock == &buffer_lock);
mutex_exit(bp->b_objlock);
}
/* We want to dispose of the buffer, so wake everybody. */
cv_broadcast(&bp->b_busy);
if (bp->b_bufsize <= 0)
/* no data */
goto already_queued;
else
/* invalid data */
bufq = &bufqueues[BQ_AGE];
binsheadfree(bp, bufq);
} else {
/*
* It has valid data. Put it on the end of the appropriate
* queue, so that it'll stick around for as long as possible.
* If the buf is AGE but has dependencies, it must be put on the last
* bufqueue to be scanned, i.e. LRU. This protects against the
* livelock where BQ_AGE only has buffers with dependencies,
* and we thus never get to the dependent buffers in BQ_LRU.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
/* locked in core */
bufq = &bufqueues[BQ_LOCKED];
} else if (!ISSET(bp->b_cflags, BC_AGE)) {
/* valid data */
bufq = &bufqueues[BQ_LRU];
} else {
/* stale but valid data */
bufq = &bufqueues[BQ_AGE];
}
binstailfree(bp, bufq);
}
already_queued:
/* Unlock the buffer. */
CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
CLR(bp->b_flags, B_ASYNC);
/*
* Wake only the highest priority waiter on the lock, in order to
* prevent a thundering herd: many LWPs simultaneously awakening and
* competing for the buffer's lock. Testing in 2019 revealed this
* to reduce contention on bufcache_lock tenfold during a kernel
* compile. Here and elsewhere, when the buffer is changing
* identity, being disposed of, or moving from one list to another,
* we wake all lock requestors.
*/
if (bp->b_bufsize <= 0) {
cv_broadcast(&bp->b_busy);
buf_destroy(bp);
#ifdef DEBUG
memset((char *)bp, 0, sizeof(*bp));
#endif
pool_cache_put(buf_cache, bp);
} else
cv_signal(&bp->b_busy);
}
void
brelse(buf_t *bp, int set)
{
mutex_enter(&bufcache_lock);
brelsel(bp, set);
mutex_exit(&bufcache_lock);
}
/*
* Determine if a block is in the cache.
* Just look on what would be its hash chain. If it's there, return
* a pointer to it, unless it's marked invalid. If it's marked invalid,
* we normally don't return the buffer, unless the caller explicitly
* wants us to.
*/
buf_t *
incore(struct vnode *vp, daddr_t blkno)
{
buf_t *bp;
KASSERT(mutex_owned(&bufcache_lock));
/* Search hash chain */
LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
if (bp->b_lblkno == blkno && bp->b_vp == vp &&
!ISSET(bp->b_cflags, BC_INVAL)) {
KASSERT(bp->b_objlock == vp->v_interlock);
return (bp);
}
}
return (NULL);
}
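/*
 * Illustrative sketch (not part of the original source): a caller that
 * only wants to peek at the cache might use incore() like this; the
 * helper name example_cached() is a hypothetical name used for
 * illustration only.
 *
 *	static bool
 *	example_cached(struct vnode *vp, daddr_t blkno)
 *	{
 *		buf_t *bp;
 *
 *		mutex_enter(&bufcache_lock);
 *		bp = incore(vp, blkno);
 *		mutex_exit(&bufcache_lock);
 *		return bp != NULL;
 *	}
 */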
/*
* Get a block of requested size that is associated with
* a given vnode and block offset. If it is found in the
* block cache, mark it as having been found, make it busy
* and return it. Otherwise, return an empty block of the
* correct size. It is up to the caller to ensure that the
* cached blocks are of the correct size.
*/
buf_t *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
int err, preserve;
buf_t *bp;
mutex_enter(&bufcache_lock);
SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size);
loop:
bp = incore(vp, blkno);
if (bp != NULL) {
err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
if (err != 0) {
if (err == EPASSTHROUGH)
goto loop;
mutex_exit(&bufcache_lock);
SDT_PROBE4(io, kernel, , getblk__done,
vp, blkno, size, NULL);
return (NULL);
}
KASSERT(!cv_has_waiters(&bp->b_done));
#ifdef DIAGNOSTIC
if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
bp->b_bcount < size && vp->v_type != VBLK)
panic("getblk: block size invariant failed");
#endif
bremfree(bp);
preserve = 1;
} else {
if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
goto loop;
if (incore(vp, blkno) != NULL) {
/* The block has come into memory in the meantime. */
brelsel(bp, 0);
goto loop;
}
LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
mutex_enter(vp->v_interlock);
bgetvp(vp, bp);
mutex_exit(vp->v_interlock);
preserve = 0;
}
mutex_exit(&bufcache_lock);
/*
* LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
* if we re-size buffers here.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
KASSERT(bp->b_bufsize >= size);
} else {
if (allocbuf(bp, size, preserve)) {
mutex_enter(&bufcache_lock);
LIST_REMOVE(bp, b_hash);
brelsel(bp, BC_INVAL);
mutex_exit(&bufcache_lock);
SDT_PROBE4(io, kernel, , getblk__done,
vp, blkno, size, NULL);
return NULL;
}
}
BIO_SETPRIO(bp, BPRIO_DEFAULT);
SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp);
return (bp);
}
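/*
 * Illustrative sketch (assumption, not part of the original source):
 * a minimal consumer of getblk()/bwrite() that obtains a buffer for a
 * block it intends to overwrite completely; example_overwrite() is a
 * hypothetical name and the error handling is simplified.
 *
 *	static int
 *	example_overwrite(struct vnode *vp, daddr_t blkno, int size,
 *	    const void *src)
 *	{
 *		buf_t *bp;
 *
 *		bp = getblk(vp, blkno, size, 0, 0);
 *		if (bp == NULL)
 *			return ENOMEM;
 *		memcpy(bp->b_data, src, size);
 *		return bwrite(bp);
 *	}
 */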
/*
* Get an empty, disassociated buffer of given size.
*/
buf_t *
geteblk(int size)
{
buf_t *bp;
int error __diagused;
mutex_enter(&bufcache_lock);
while ((bp = getnewbuf(0, 0, 0)) == NULL)
;
SET(bp->b_cflags, BC_INVAL);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
mutex_exit(&bufcache_lock);
BIO_SETPRIO(bp, BPRIO_DEFAULT);
error = allocbuf(bp, size, 0);
KASSERT(error == 0);
return (bp);
}
/*
* Expand or contract the actual memory allocated to a buffer.
*
* If the buffer shrinks, data is lost, so it's up to the
* caller to have written it out *first*; this routine will not
* start a write. If the buffer grows, it's the caller's
* responsibility to fill out the buffer's additional contents.
*/
int
allocbuf(buf_t *bp, int size, int preserve)
{
void *addr;
vsize_t oldsize, desired_size;
int oldcount;
int delta;
desired_size = buf_roundsize(size);
if (desired_size > MAXBSIZE)
printf("allocbuf: buffer larger than MAXBSIZE requested");
oldcount = bp->b_bcount;
bp->b_bcount = size;
oldsize = bp->b_bufsize;
if (oldsize == desired_size) {
/*
* Do not short cut the WAPBL resize, as the buffer length
* could still have changed and this would corrupt the
* tracking of the transaction length.
*/
goto out;
}
/*
* If we want a buffer of a different size, re-allocate the
* buffer's memory; copy old content only if needed.
*/
addr = buf_alloc(desired_size);
if (addr == NULL)
return ENOMEM;
if (preserve)
memcpy(addr, bp->b_data, MIN(oldsize, desired_size));
if (bp->b_data != NULL)
buf_mrelease(bp->b_data, oldsize);
bp->b_data = addr;
bp->b_bufsize = desired_size;
/*
* Update overall buffer memory counter (protected by bufcache_lock)
*/
delta = (long)desired_size - (long)oldsize;
mutex_enter(&bufcache_lock);
if ((bufmem += delta) > bufmem_hiwater) {
/*
* Need to trim overall memory usage.
*/
while (buf_canrelease()) {
if (preempt_needed()) {
mutex_exit(&bufcache_lock);
preempt();
mutex_enter(&bufcache_lock);
}
if (buf_trim() == 0)
break;
}
}
mutex_exit(&bufcache_lock);
out:
if (wapbl_vphaswapbl(bp->b_vp))
WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
return 0;
}
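/*
 * Illustrative sketch (assumption): growing a cached buffer in place
 * with allocbuf() while preserving its existing contents; the caller
 * is assumed to already hold the buffer busy (e.g. via getblk()).
 *
 *	error = allocbuf(bp, newsize, 1);	// preserve = copy old data
 *	if (error != 0) {
 *		brelse(bp, BC_INVAL);		// give up on the buffer
 *		return error;
 *	}
 */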
/*
* Find a buffer which is available for use.
* Select something from a free list.
* Preference is to AGE list, then LRU list.
*
* Called with the buffer queues locked.
* Return buffer locked.
*/
static buf_t *
getnewbuf(int slpflag, int slptimeo, int from_bufq)
{
buf_t *bp;
struct vnode *vp;
struct mount *transmp = NULL;
SDT_PROBE0(io, kernel, , getnewbuf__start);
start:
KASSERT(mutex_owned(&bufcache_lock));
/*
* Get a new buffer from the pool.
*/
if (!from_bufq && buf_lotsfree()) {
mutex_exit(&bufcache_lock);
bp = pool_cache_get(buf_cache, PR_NOWAIT);
if (bp != NULL) {
memset((char *)bp, 0, sizeof(*bp));
buf_init(bp);
SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */
mutex_enter(&bufcache_lock);
#if defined(DIAGNOSTIC)
bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
return (bp);
}
mutex_enter(&bufcache_lock);
}
KASSERT(mutex_owned(&bufcache_lock));
if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) {
KASSERT(!ISSET(bp->b_oflags, BO_DELWRI));
} else {
TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) {
if (ISSET(bp->b_cflags, BC_VFLUSH) ||
!ISSET(bp->b_oflags, BO_DELWRI))
break;
if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) {
KASSERT(transmp == NULL);
transmp = bp->b_vp->v_mount;
break;
}
}
}
if (bp != NULL) {
KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
bremfree(bp);
/* Buffer is no longer on free lists. */
SET(bp->b_cflags, BC_BUSY);
/* Wake anyone trying to lock the old identity. */
cv_broadcast(&bp->b_busy);
} else {
/*
* XXX: !from_bufq should be removed.
*/
if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
/* wait for a free buffer of any kind */
if ((slpflag & PCATCH) != 0)
(void)cv_timedwait_sig(&needbuffer_cv,
&bufcache_lock, slptimeo);
else
(void)cv_timedwait(&needbuffer_cv,
&bufcache_lock, slptimeo);
}
SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
return (NULL);
}
#ifdef DIAGNOSTIC
if (bp->b_bufsize <= 0)
panic("buffer %p: on queue but empty", bp);
#endif
if (ISSET(bp->b_cflags, BC_VFLUSH)) {
/*
* This is a delayed write buffer being flushed to disk. Make
* sure it gets aged out of the queue when it's finished, and
* leave it off the LRU queue.
*/
CLR(bp->b_cflags, BC_VFLUSH);
SET(bp->b_cflags, BC_AGE);
goto start;
}
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
/*
* If buffer was a delayed write, start it and return NULL
* (since we might sleep while starting the write).
*/
if (ISSET(bp->b_oflags, BO_DELWRI)) {
/*
* This buffer has gone through the LRU, so make sure it gets
* reused ASAP.
*/
SET(bp->b_cflags, BC_AGE);
mutex_exit(&bufcache_lock);
bawrite(bp);
KASSERT(transmp != NULL);
fstrans_done(transmp);
mutex_enter(&bufcache_lock);
SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
return (NULL);
}
KASSERT(transmp == NULL);
vp = bp->b_vp;
/* clear out various other fields */
bp->b_cflags = BC_BUSY;
bp->b_oflags = 0;
bp->b_flags = 0;
bp->b_dev = NODEV;
bp->b_blkno = 0;
bp->b_lblkno = 0;
bp->b_rawblkno = 0;
bp->b_iodone = 0;
bp->b_error = 0;
bp->b_resid = 0;
bp->b_bcount = 0;
LIST_REMOVE(bp, b_hash);
/* Disassociate us from our vnode, if we had one... */
if (vp != NULL) {
mutex_enter(vp->v_interlock);
brelvp(bp);
mutex_exit(vp->v_interlock);
}
SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
return (bp);
}
/*
* Invalidate the specified buffer if it exists.
*/
void
binvalbuf(struct vnode *vp, daddr_t blkno)
{
buf_t *bp;
int err;
mutex_enter(&bufcache_lock);
loop:
bp = incore(vp, blkno);
if (bp != NULL) {
err = bbusy(bp, 0, 0, NULL);
if (err == EPASSTHROUGH)
goto loop;
bremfree(bp);
if (ISSET(bp->b_oflags, BO_DELWRI)) {
SET(bp->b_cflags, BC_NOCACHE);
mutex_exit(&bufcache_lock);
bwrite(bp);
} else {
brelsel(bp, BC_INVAL);
mutex_exit(&bufcache_lock);
}
} else
mutex_exit(&bufcache_lock);
}
/*
* Attempt to free an aged buffer off the queues.
* Called with queue lock held.
* Returns the amount of buffer memory freed.
*/
static int
buf_trim(void)
{
buf_t *bp;
long size;
KASSERT(mutex_owned(&bufcache_lock));
/* Instruct getnewbuf() to get buffers off the queues */
if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
return 0;
KASSERT((bp->b_cflags & BC_WANTED) == 0);
size = bp->b_bufsize;
bufmem -= size;
if (size > 0) {
buf_mrelease(bp->b_data, size);
bp->b_bcount = bp->b_bufsize = 0;
}
/* brelse() will return the buffer to the global buffer pool */
brelsel(bp, 0);
return size;
}
int
buf_drain(int n)
{
int size = 0, sz;
KASSERT(mutex_owned(&bufcache_lock));
while (size < n && bufmem > bufmem_lowater) {
sz = buf_trim();
if (sz <= 0)
break;
size += sz;
}
return size;
}
/*
* Wait for operations on the buffer to complete.
* When they do, extract and return the I/O's error value.
*/
int
biowait(buf_t *bp)
{
BIOHIST_FUNC(__func__);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
SDT_PROBE1(io, kernel, , wait__start, bp);
mutex_enter(bp->b_objlock);
BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx",
(uintptr_t)bp, bp->b_oflags,
(uintptr_t)__builtin_return_address(0), 0);
while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) {
BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0);
cv_wait(&bp->b_done, bp->b_objlock);
}
mutex_exit(bp->b_objlock);
SDT_PROBE1(io, kernel, , wait__done, bp);
BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0);
return bp->b_error;
}
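/*
 * Illustrative sketch (assumption): the common synchronous read shape,
 * starting I/O with VOP_STRATEGY() and collecting the result with
 * biowait(); error handling and buffer setup are simplified here.
 *
 *	bp->b_flags |= B_READ;
 *	VOP_STRATEGY(vp, bp);
 *	error = biowait(bp);
 */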
/*
* Mark I/O complete on a buffer.
*
* If a callback has been requested, e.g. the pageout
* daemon, do so. Otherwise, awaken waiting processes.
*
* [ Leffler, et al., says on p.247:
* "This routine wakes up the blocked process, frees the buffer
* for an asynchronous write, or, for a request by the pagedaemon
* process, invokes a procedure specified in the buffer structure" ]
*
* In real life, the pagedaemon (or other system processes) wants
* to do async stuff too, and doesn't want the buffer brelse()'d.
* (for swap pager, that puts swap buffers on the free lists (!!!),
* for the vn device, that puts allocated buffers on the free lists!)
*/
void
biodone(buf_t *bp)
{
int s;
BIOHIST_FUNC(__func__);
KASSERT(!ISSET(bp->b_oflags, BO_DONE));
if (cpu_intr_p()) {
/* From interrupt mode: defer to a soft interrupt. */
s = splvm();
TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled",
(uintptr_t)bp, 0, 0, 0);
softint_schedule(biodone_sih);
splx(s);
} else {
/* Process now - the buffer may be freed soon. */
biodone2(bp);
}
}
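/*
 * Illustrative sketch (assumption): a driver completing a transfer
 * fills in b_resid and b_error before handing the buffer back through
 * biodone(); the waiter in biowait() (or the b_iodone callback) then
 * observes the result.
 *
 *	bp->b_resid = 0;		// everything transferred
 *	bp->b_error = 0;
 *	biodone(bp);
 */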
SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/);
static void
biodone2(buf_t *bp)
{
void (*callout)(buf_t *);
SDT_PROBE1(io, kernel, ,done, bp);
BIOHIST_FUNC(__func__);
BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
mutex_enter(bp->b_objlock);
/* Note that the transfer is done. */
if (ISSET(bp->b_oflags, BO_DONE))
panic("biodone2 already");
CLR(bp->b_flags, B_COWDONE);
SET(bp->b_oflags, BO_DONE);
BIO_SETPRIO(bp, BPRIO_DEFAULT);
/* Wake up waiting writers. */
if (!ISSET(bp->b_flags, B_READ))
vwakeup(bp);
if ((callout = bp->b_iodone) != NULL) {
BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout,
0, 0, 0);
/* Note callout done, then call out. */
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_iodone = NULL;
mutex_exit(bp->b_objlock);
(*callout)(bp);
} else if (ISSET(bp->b_flags, B_ASYNC)) {
/* If async, release. */
BIOHIST_LOG(biohist, "async", 0, 0, 0, 0);
KASSERT(!cv_has_waiters(&bp->b_done));
mutex_exit(bp->b_objlock);
brelse(bp, 0);
} else {
/* Otherwise just wake up waiters in biowait(). */
BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0);
cv_broadcast(&bp->b_done);
mutex_exit(bp->b_objlock);
}
}
static void
biointr(void *cookie)
{
struct cpu_info *ci;
buf_t *bp;
int s;
BIOHIST_FUNC(__func__);
BIOHIST_CALLED(biohist);
ci = curcpu();
s = splvm();
while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
KASSERT(curcpu() == ci);
bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
splx(s);
BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
biodone2(bp);
s = splvm();
}
splx(s);
}
static void
sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o)
{
const bool allowaddr = get_expose_address(curproc);
memset(o, 0, sizeof(*o));
o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
o->b_error = i->b_error;
o->b_prio = i->b_prio;
o->b_dev = i->b_dev;
o->b_bufsize = i->b_bufsize;
o->b_bcount = i->b_bcount;
o->b_resid = i->b_resid;
COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr);
o->b_blkno = i->b_blkno;
o->b_rawblkno = i->b_rawblkno;
COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr);
COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr);
COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr);
COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr);
o->b_lblkno = i->b_lblkno;
}
static int
sysctl_dobuf(SYSCTLFN_ARGS)
{
buf_t *bp;
struct buf_sysctl bs;
struct bqueue *bq;
char *dp;
u_int i, op, arg;
size_t len, needed, elem_size, out_size;
int error, elem_count, retries;
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (namelen != 4)
return (EINVAL);
retries = 100;
retry:
dp = oldp;
len = (oldp != NULL) ? *oldlenp : 0;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
out_size = MIN(sizeof(bs), elem_size);
/*
* at the moment, these are just "placeholders" to make the
* API for retrieving kern.buf data more extensible in the
* future.
*
* XXX kern.buf currently has "netbsd32" issues. hopefully
* these will be resolved at a later point.
*/
if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
elem_size < 1 || elem_count < 0)
return (EINVAL);
if (oldp == NULL) {
/* count only, don't run through the buffer queues */
needed = pool_cache_nget(buf_cache) - pool_cache_nput(buf_cache);
*oldlenp = (needed + KERN_BUFSLOP) * elem_size;
return 0;
}
error = 0;
needed = 0;
sysctl_unlock();
mutex_enter(&bufcache_lock);
for (i = 0; i < BQUEUES; i++) {
bq = &bufqueues[i];
TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
bq->bq_marker = bp;
if (len >= elem_size && elem_count > 0) {
sysctl_fillbuf(bp, &bs);
mutex_exit(&bufcache_lock);
error = copyout(&bs, dp, out_size);
mutex_enter(&bufcache_lock);
if (error)
break;
if (bq->bq_marker != bp) {
/*
* This sysctl node is only for
* statistics. Retry; if the
* queue keeps changing, then
* bail out.
*/
if (retries-- == 0) {
error = EAGAIN;
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
goto retry;
}
dp += elem_size;
len -= elem_size;
}
needed += elem_size;
if (elem_count > 0 && elem_count != INT_MAX)
elem_count--;
}
if (error != 0)
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
*oldlenp = needed;
return (error);
}
static int
sysctl_bufvm_update(SYSCTLFN_ARGS)
{
int error, rv;
struct sysctlnode node;
unsigned int temp_bufcache;
unsigned long temp_water;
/* Take a copy of the supplied node and its data */
node = *rnode;
if (node.sysctl_data == &bufcache) {
node.sysctl_data = &temp_bufcache;
temp_bufcache = *(unsigned int *)rnode->sysctl_data;
} else {
node.sysctl_data = &temp_water;
temp_water = *(unsigned long *)rnode->sysctl_data;
}
/* Update the copy */
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return (error);
if (rnode->sysctl_data == &bufcache) {
if (temp_bufcache > 100)
return (EINVAL);
bufcache = temp_bufcache;
buf_setwm();
} else if (rnode->sysctl_data == &bufmem_lowater) {
if (bufmem_hiwater - temp_water < 16)
return (EINVAL);
bufmem_lowater = temp_water;
} else if (rnode->sysctl_data == &bufmem_hiwater) {
if (temp_water - bufmem_lowater < 16)
return (EINVAL);
bufmem_hiwater = temp_water;
} else
return (EINVAL);
/* Drain until below new high water mark */
sysctl_unlock();
mutex_enter(&bufcache_lock);
while (bufmem > bufmem_hiwater) {
rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024));
if (rv <= 0)
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
return 0;
}
static struct sysctllog *vfsbio_sysctllog;
static void
sysctl_kern_buf_setup(void)
{
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "buf",
SYSCTL_DESCR("Kernel buffer cache information"),
sysctl_dobuf, 0, NULL, 0,
CTL_KERN, KERN_BUF, CTL_EOL);
}
static void
sysctl_vm_buf_setup(void)
{
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "bufcache",
SYSCTL_DESCR("Percentage of physical memory to use for "
"buffer cache"),
sysctl_bufvm_update, 0, &bufcache, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_LONG, "bufmem",
SYSCTL_DESCR("Amount of kernel memory used by buffer "
"cache"),
NULL, 0, &bufmem, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "bufmem_lowater",
SYSCTL_DESCR("Minimum amount of kernel memory to "
"reserve for buffer cache"),
sysctl_bufvm_update, 0, &bufmem_lowater, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "bufmem_hiwater",
SYSCTL_DESCR("Maximum amount of kernel memory to use "
"for buffer cache"),
sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
}
static int
bufhash_stats(struct hashstat_sysctl *hs, bool fill)
{
buf_t *bp;
uint64_t chain;
strlcpy(hs->hash_name, "bufhash", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "buffer hash", sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = bufhash + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
mutex_enter(&bufcache_lock);
LIST_FOREACH(bp, &bufhashtbl[i], b_hash) {
chain++;
}
mutex_exit(&bufcache_lock);
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
preempt_point();
}
return 0;
}
#ifdef DEBUG
/*
* Print out statistics on the current allocation of the buffer pool.
* Can be enabled to print out on every ``sync'' by setting "syncprt"
* in vfs_syscalls.c using sysctl.
*/
void
vfs_bufstats(void)
{
int i, j, count;
buf_t *bp;
struct bqueue *dp;
int counts[MAXBSIZE / MIN_PAGE_SIZE + 1];
static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
count = 0;
memset(counts, 0, sizeof(counts));
TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
counts[bp->b_bufsize / PAGE_SIZE]++;
count++;
}
printf("%s: total-%d", bname[i], count);
for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++)
if (counts[j] != 0)
printf(", %d-%d", j * PAGE_SIZE, counts[j]);
printf("\n");
}
}
#endif /* DEBUG */
/* ------------------------------ */
buf_t *
getiobuf(struct vnode *vp, bool waitok)
{
buf_t *bp;
bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
if (bp == NULL)
return bp;
buf_init(bp);
if ((bp->b_vp = vp) != NULL) {
bp->b_objlock = vp->v_interlock;
} else {
KASSERT(bp->b_objlock == &buffer_lock);
}
return bp;
}
void
putiobuf(buf_t *bp)
{
buf_destroy(bp);
pool_cache_put(bufio_cache, bp);
}
/*
* nestiobuf_iodone: b_iodone callback for nested buffers.
*/
void
nestiobuf_iodone(buf_t *bp)
{
buf_t *mbp = bp->b_private;
int error;
int donebytes;
KASSERT(bp->b_bcount <= bp->b_bufsize);
KASSERT(mbp != bp);
error = bp->b_error;
if (bp->b_error == 0 &&
(bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
/*
* Not all got transferred, raise an error. We have no way to
* propagate these conditions to mbp.
*/
error = EIO;
}
donebytes = bp->b_bufsize;
putiobuf(bp);
nestiobuf_done(mbp, donebytes, error);
}
/*
* nestiobuf_setup: setup a "nested" buffer.
*
* => 'mbp' is a "master" buffer which is being divided into sub pieces.
* => 'bp' should be a buffer allocated by getiobuf.
* => 'offset' is a byte offset in the master buffer.
* => 'size' is a size in bytes of this nested buffer.
*/
void
nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
{
const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS);
struct vnode *vp = mbp->b_vp;
KASSERT(mbp->b_bcount >= offset + size);
bp->b_vp = vp;
bp->b_dev = mbp->b_dev;
bp->b_objlock = mbp->b_objlock;
bp->b_cflags = BC_BUSY;
bp->b_flags = B_ASYNC | b_pass;
bp->b_iodone = nestiobuf_iodone;
bp->b_data = (char *)mbp->b_data + offset;
bp->b_resid = bp->b_bcount = size;
bp->b_bufsize = bp->b_bcount;
bp->b_private = mbp;
BIO_COPYPRIO(bp, mbp);
if (BUF_ISWRITE(bp) && vp != NULL) {
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
}
}
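/*
 * Illustrative sketch (assumption): splitting a master buffer into two
 * halves with getiobuf()/nestiobuf_setup(); each nested buffer is then
 * issued separately, and completion is propagated back to the master
 * by nestiobuf_iodone().
 *
 *	half = mbp->b_bcount / 2;
 *	bp1 = getiobuf(mbp->b_vp, true);
 *	bp2 = getiobuf(mbp->b_vp, true);
 *	nestiobuf_setup(mbp, bp1, 0, half);
 *	nestiobuf_setup(mbp, bp2, half, mbp->b_bcount - half);
 */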
/*
* nestiobuf_done: propagate completion to the master buffer.
*
* => 'donebytes' specifies how many bytes of 'mbp' have been completed.
* => 'error' is the errno(2) value with which those bytes completed.
*/
void
nestiobuf_done(buf_t *mbp, int donebytes, int error)
{
if (donebytes == 0) {
return;
}
mutex_enter(mbp->b_objlock);
KASSERT(mbp->b_resid >= donebytes);
mbp->b_resid -= donebytes;
if (error)
mbp->b_error = error;
if (mbp->b_resid == 0) {
if (mbp->b_error)
mbp->b_resid = mbp->b_bcount;
mutex_exit(mbp->b_objlock);
biodone(mbp);
} else
mutex_exit(mbp->b_objlock);
}
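/*
 * Illustrative sketch (assumption): a caller that decides not to issue
 * part of a master buffer can still account for it by calling
 * nestiobuf_done() directly with the skipped byte count and an error,
 * so that the master completes once every piece is accounted for.
 *
 *	nestiobuf_done(mbp, skipped_bytes, EIO);
 */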
void
buf_init(buf_t *bp)
{
cv_init(&bp->b_busy, "biolock");
cv_init(&bp->b_done, "biowait");
bp->b_dev = NODEV;
bp->b_error = 0;
bp->b_flags = 0;
bp->b_cflags = 0;
bp->b_oflags = 0;
bp->b_objlock = &buffer_lock;
bp->b_iodone = NULL;
bp->b_dev = NODEV;
bp->b_vnbufs.le_next = NOLIST;
BIO_SETPRIO(bp, BPRIO_DEFAULT);
}
void
buf_destroy(buf_t *bp)
{
cv_destroy(&bp->b_done);
cv_destroy(&bp->b_busy);
}
int
bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
{
int error;
KASSERT(mutex_owned(&bufcache_lock));
SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock);
if ((bp->b_cflags & BC_BUSY) != 0) {
if (curlwp == uvm.pagedaemon_lwp) {
error = EDEADLK;
goto out;
}
bp->b_cflags |= BC_WANTED;
if (interlock != NULL)
mutex_exit(interlock);
if (intr) {
error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
timo);
} else {
error = cv_timedwait(&bp->b_busy, &bufcache_lock,
timo);
}
/*
* At this point the buffer may be gone: don't touch it
* again. The caller needs to find it again and retry.
*/
if (interlock != NULL)
mutex_enter(interlock);
if (error == 0)
error = EPASSTHROUGH;
} else {
bp->b_cflags |= BC_BUSY;
error = 0;
}
out:
SDT_PROBE5(io, kernel, , bbusy__done,
bp, intr, timo, interlock, error);
return error;
}
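/*
 * Illustrative sketch (assumption): the usual bbusy() retry pattern, as
 * used by getblk() and binvalbuf() above; EPASSTHROUGH means the buffer
 * may have changed identity while we slept, so it must be looked up
 * again under bufcache_lock.
 *
 *	mutex_enter(&bufcache_lock);
 * loop:
 *	bp = incore(vp, blkno);
 *	if (bp != NULL) {
 *		error = bbusy(bp, false, 0, NULL);
 *		if (error == EPASSTHROUGH)
 *			goto loop;
 *	}
 */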
/*
* Nothing outside this file should really need to know about nbuf,
* but a few things still want to read it, so give them a way to do that.
*/
u_int
buf_nbuf(void)
{
return nbuf;
}
/* $NetBSD: subr_localcount.c,v 1.7 2017/11/17 09:26:36 ozaki-r Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* CPU-local reference counts
*
* localcount(9) is a reference-counting scheme that involves no
* interprocessor synchronization most of the time, at the cost of
* eight bytes of memory per CPU per object and at the cost of
* expensive interprocessor synchronization to drain references.
*
* localcount(9) references may be held across sleeps, may be
* transferred from CPU to CPU or thread to thread: they behave
* semantically like typical reference counts, with different
* pragmatic performance characteristics.
*/
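/*
 * Illustrative sketch (assumption, not part of the original source):
 * the typical lifecycle of a localcount-protected object; the names
 * example_obj, example_lock and example_cv are hypothetical.
 *
 *	localcount_init(&example_obj->lc);
 *	...publish example_obj...
 *
 *	// readers, possibly under pserialize(9):
 *	localcount_acquire(&example_obj->lc);
 *	...use example_obj, may sleep...
 *	localcount_release(&example_obj->lc, &example_cv, &example_lock);
 *
 *	// teardown, after unpublishing example_obj:
 *	mutex_enter(&example_lock);
 *	localcount_drain(&example_obj->lc, &example_cv, &example_lock);
 *	mutex_exit(&example_lock);
 *	localcount_fini(&example_obj->lc);
 */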
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_localcount.c,v 1.7 2017/11/17 09:26:36 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/localcount.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/errno.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/xcall.h>
#if defined(DEBUG) && defined(LOCKDEBUG)
#include <sys/atomic.h>
#endif
static void localcount_xc(void *, void *);
/*
* localcount_init(lc)
*
* Initialize a localcount object; the per-CPU counters are
* allocated with percpu(9).
*
* The caller must call localcount_drain and then localcount_fini
* when done with lc.
*/
void
localcount_init(struct localcount *lc)
{
lc->lc_totalp = NULL;
lc->lc_percpu = percpu_alloc(sizeof(int64_t));
}
/*
* localcount_drain(lc, cv, interlock)
*
* Wait for all acquired references to lc to drain. Caller must
* hold interlock; localcount_drain releases it during cross-calls
* and waits on cv. The cv and interlock passed here must be the
* same as are passed to localcount_release for this lc.
*
* Caller must guarantee that no new references can be acquired
* with localcount_acquire before calling localcount_drain. For
* example, any object that may be found in a list and acquired
* must be removed from the list before localcount_drain.
*
* The localcount object lc may be used only with localcount_fini
* after this, unless reinitialized after localcount_fini with
* localcount_init.
*/
void
localcount_drain(struct localcount *lc, kcondvar_t *cv, kmutex_t *interlock)
{
int64_t total = 0;
KASSERT(mutex_owned(interlock));
KASSERT(lc->lc_totalp == NULL);
/* Mark it draining. */
lc->lc_totalp = &total;
/*
* Count up all references on all CPUs.
*
* This serves as a global memory barrier: after xc_wait, all
* CPUs will have witnessed the nonnull value of lc->lc_totalp,
* so that it is safe to wait on the cv for them.
*/
mutex_exit(interlock);
xc_wait(xc_broadcast(0, &localcount_xc, lc, interlock));
mutex_enter(interlock);
/* Wait for remaining references to drain. */
while (total != 0) {
/*
* At this point, now that we have added up all
* references on all CPUs, the total had better be
* nonnegative.
*/
KASSERTMSG((0 < total),
"negatively referenced localcount: %p, %"PRId64,
lc, total);
cv_wait(cv, interlock);
}
/* Paranoia: Cause any further use of lc->lc_totalp to crash. */
lc->lc_totalp = (void *)(uintptr_t)1;
}
/*
* localcount_fini(lc)
*
* Finalize a localcount object, releasing any memory allocated
* for it. The localcount object must already have been drained.
*/
void
localcount_fini(struct localcount *lc)
{
KASSERT(lc->lc_totalp == (void *)(uintptr_t)1);
percpu_free(lc->lc_percpu, sizeof(int64_t));
}
/*
* localcount_xc(cookie0, cookie1)
*
* Accumulate and transfer the per-CPU reference counts to a
* global total, resetting the per-CPU counter to zero. Once
* localcount_drain() has started, we only maintain the total
* count in localcount_release().
*/
static void
localcount_xc(void *cookie0, void *cookie1)
{
struct localcount *lc = cookie0;
kmutex_t *interlock = cookie1;
int64_t *localp;
mutex_enter(interlock);
localp = percpu_getref(lc->lc_percpu);
*lc->lc_totalp += *localp;
*localp -= *localp; /* ie, *localp = 0; */
percpu_putref(lc->lc_percpu);
mutex_exit(interlock);
}
/*
* localcount_adjust(lc, delta)
*
* Add delta -- positive or negative -- to the local CPU's count
* for lc.
*/
static void
localcount_adjust(struct localcount *lc, int delta)
{
int64_t *localp;
localp = percpu_getref(lc->lc_percpu);
*localp += delta;
percpu_putref(lc->lc_percpu);
}
/*
* localcount_acquire(lc)
*
* Acquire a reference to lc.
*
* The reference may be held across sleeps and may be migrated
* from CPU to CPU, or even thread to thread -- it is only
* counted, not associated with a particular concrete owner.
*
* Involves no interprocessor synchronization. May be used in any
* context: while a lock is held, within a pserialize(9) read
* section, in hard interrupt context (provided other users block
* hard interrupts), in soft interrupt context, in thread context,
* &c.
*
* Caller must guarantee that there is no concurrent
* localcount_drain. For example, any object that may be found in
* a list and acquired must be removed from the list before
* localcount_drain.
*/
void
localcount_acquire(struct localcount *lc)
{
KASSERT(lc->lc_totalp == NULL);
localcount_adjust(lc, +1);
#if defined(DEBUG) && defined(LOCKDEBUG)
if (atomic_inc_32_nv(&lc->lc_refcnt) == 0)
panic("counter overflow");
#endif
}
/*
* localcount_release(lc, cv, interlock)
*
* Release a reference to lc. If there is a concurrent
* localcount_drain and this may be the last reference, notify
* localcount_drain by acquiring interlock, waking cv, and
* releasing interlock. The cv and interlock passed here must be
* the same as are passed to localcount_drain for this lc.
*
* Involves no interprocessor synchronization unless there is a
* concurrent localcount_drain in progress.
*/
void
localcount_release(struct localcount *lc, kcondvar_t *cv, kmutex_t *interlock)
{
/*
* Block xcall so that if someone begins draining after we see
* lc->lc_totalp as null, then they won't start cv_wait until
* after they have counted this CPU's contributions.
*
* Otherwise, localcount_drain may notice an extant reference
* from this CPU and cv_wait for it, but having seen
* lc->lc_totalp as null, this CPU will not wake
* localcount_drain.
*/
kpreempt_disable();
KDASSERT(mutex_ownable(interlock));
if (__predict_false(lc->lc_totalp != NULL)) {
/*
* Slow path -- wake localcount_drain in case this is
* the last reference.
*/
mutex_enter(interlock);
if (--*lc->lc_totalp == 0)
cv_broadcast(cv);
mutex_exit(interlock);
goto out;
}
localcount_adjust(lc, -1);
#if defined(DEBUG) && defined(LOCKDEBUG)
if (atomic_dec_32_nv(&lc->lc_refcnt) == UINT_MAX)
panic("counter underflow");
#endif
out:
kpreempt_enable();
}
/*
* localcount_debug_refcnt(lc)
*
* Return the total reference count of lc. The value is meaningful
* only if DEBUG and LOCKDEBUG are enabled; otherwise it is always 0.
*/
uint32_t
localcount_debug_refcnt(const struct localcount *lc)
{
#if defined(DEBUG) && defined(LOCKDEBUG)
return lc->lc_refcnt;
#else
return 0;
#endif
}
/* $NetBSD: kern_exit.c,v 1.298 2023/10/08 12:38:58 ad Exp $ */
/*-
* Copyright (c) 1998, 1999, 2006, 2007, 2008, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_exit.c 8.10 (Berkeley) 2/23/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_exit.c,v 1.298 2023/10/08 12:38:58 ad Exp $");
#include "opt_ktrace.h"
#include "opt_dtrace.h"
#include "opt_sysv.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <sys/fstrans.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>
#include <sys/ptrace.h>
#include <sys/acct.h>
#include <sys/filedesc.h>
#include <sys/ras.h>
#include <sys/signalvar.h>
#include <sys/sched.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/ktrace.h>
#include <sys/cpu.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/sdt.h>
#include <sys/psref.h>
#include <uvm/uvm_extern.h>
#ifdef DEBUG_EXIT
int debug_exit = 0;
#define DPRINTF(x) if (debug_exit) printf x
#else
#define DPRINTF(x)
#endif
static int find_stopped_child(struct proc *, idtype_t, id_t, int,
struct proc **, struct wrusage *, siginfo_t *);
static void proc_free(struct proc *, struct wrusage *);
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , exit, "int");
/*
* Fill in the appropriate signal information, and signal the parent.
*/
/* XXX noclone works around a gcc 4.5 bug on arm */
static void __noclone
exit_psignal(struct proc *p, struct proc *pp, ksiginfo_t *ksi)
{
KSI_INIT(ksi);
if ((ksi->ksi_signo = P_EXITSIG(p)) == SIGCHLD) {
if (p->p_xsig) {
if (p->p_sflag & PS_COREDUMP)
ksi->ksi_code = CLD_DUMPED;
else
ksi->ksi_code = CLD_KILLED;
ksi->ksi_status = p->p_xsig;
} else {
ksi->ksi_code = CLD_EXITED;
ksi->ksi_status = p->p_xexit;
}
} else {
ksi->ksi_code = SI_USER;
ksi->ksi_status = p->p_xsig;
}
/*
* We fill those in, even for non-SIGCHLD.
* It's safe to access p->p_cred unlocked here.
*/
ksi->ksi_pid = p->p_pid;
ksi->ksi_uid = kauth_cred_geteuid(p->p_cred);
/* XXX: is this still valid? */
ksi->ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec;
ksi->ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec;
}
/*
* exit --
* Death of process.
*/
int
sys_exit(struct lwp *l, const struct sys_exit_args *uap, register_t *retval)
{
/* {
syscallarg(int) rval;
} */
struct proc *p = l->l_proc;
/* Don't call exit1() multiple times in the same process. */
mutex_enter(p->p_lock);
if (p->p_sflag & PS_WEXIT) {
mutex_exit(p->p_lock);
lwp_exit(l);
}
/* exit1() will release the mutex. */
exit1(l, SCARG(uap, rval), 0);
/* NOTREACHED */
return (0);
}
/*
* Exit: deallocate address space and other resources, change proc state
* to zombie, and unlink proc from allproc and parent's lists. Save exit
* status and rusage for wait(). Check for child processes and orphan them.
*
* Must be called with p->p_lock held. Does not return.
*/
void
exit1(struct lwp *l, int exitcode, int signo)
{
struct proc *p, *child, *next_child, *old_parent, *new_parent;
struct pgrp *pgrp;
ksiginfo_t ksi;
ksiginfoq_t kq;
int wakeinit;
p = l->l_proc;
/* Verify that we hold no locks other than p->p_lock. */
LOCKDEBUG_BARRIER(p->p_lock, 0);
/* XXX Temporary: something is leaking kernel_lock. */
KERNEL_UNLOCK_ALL(l, NULL);
KASSERT(mutex_owned(p->p_lock));
KASSERT(p->p_vmspace != NULL);
if (__predict_false(p == initproc)) {
panic("init died (signal %d, exit %d)", signo, exitcode);
}
p->p_sflag |= PS_WEXIT;
/*
* Force all other LWPs to exit before we do. Only then can we
* begin to tear down the rest of the process state.
*/
if (p->p_nlwps > 1) {
exit_lwps(l);
}
ksiginfo_queue_init(&kq);
/*
* If we have been asked to stop on exit, do so now.
*/
if (__predict_false(p->p_sflag & PS_STOPEXIT)) {
KASSERT(l->l_blcnt == 0);
sigclearall(p, &contsigmask, &kq);
if (!mutex_tryenter(&proc_lock)) {
mutex_exit(p->p_lock);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
}
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
p->p_stat = SSTOP;
mutex_exit(&proc_lock);
lwp_lock(l);
p->p_nrlwps--;
l->l_stat = LSSTOP;
lwp_unlock(l);
mutex_exit(p->p_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
mutex_enter(p->p_lock);
}
/*
* Bin any remaining signals and mark the process as dying so it will
* not be found for, e.g. signals.
*/
sigfillset(&p->p_sigctx.ps_sigignore);
sigclearall(p, NULL, &kq);
p->p_stat = SDYING;
/*
* Perform any required thread cleanup. Do this early so
* anyone wanting to look us up by our global thread ID
* will fail to find us.
*
* N.B. this will unlock p->p_lock on our behalf.
*/
lwp_thread_cleanup(l);
ksiginfo_queue_drain(&kq);
/* Destroy any lwpctl info. */
if (p->p_lwpctl != NULL)
lwp_ctl_exit();
/*
* Drain all remaining references that procfs, ptrace and others may
* have on the process.
*/
rw_enter(&p->p_reflock, RW_WRITER);
DPRINTF(("%s: %d.%d exiting.\n", __func__, p->p_pid, l->l_lid));
ptimers_free(p, TIMERS_ALL);
#if defined(__HAVE_RAS)
ras_purgeall();
#endif
/*
* Close open files, release open-file table and free signal
* actions. This may block!
*/
fd_free();
cwdfree(p->p_cwdi);
p->p_cwdi = NULL;
doexithooks(p);
sigactsfree(p->p_sigacts);
/*
* Write out accounting data.
*/
(void)acct_process(l);
#ifdef KTRACE
/*
* Release trace file.
*/
if (p->p_tracep != NULL) {
mutex_enter(&ktrace_lock);
ktrderef(p);
mutex_exit(&ktrace_lock);
}
#endif
p->p_xexit = exitcode;
p->p_xsig = signo;
/*
* If emulation has process exit hook, call it now.
* Set the exit status now so that the exit hook has
* an opportunity to tweak it (COMPAT_LINUX requires
* this for thread group emulation)
*/
if (p->p_emul->e_proc_exit)
(*p->p_emul->e_proc_exit)(p);
/*
* Free the VM resources we're still holding on to.
* We must do this from a valid thread because doing
* so may block. This frees vmspace, which we don't
* need anymore. The only remaining lwp is the one
* we run at this moment, nothing runs in userland
* anymore.
*/
ruspace(p); /* Update our vm resource use */
uvm_proc_exit(p);
/*
* Stop profiling.
*/
if (__predict_false((p->p_stflag & PST_PROFIL) != 0)) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
/*
* If parent is waiting for us to exit or exec, PL_PPWAIT is set; we
* wake up the parent early to avoid deadlock. We can do this once
* the VM resources are released.
*/
mutex_enter(&proc_lock);
if (p->p_lflag & PL_PPWAIT) {
lwp_t *lp;
l->l_lwpctl = NULL; /* was on loan from blocked parent */
p->p_lflag &= ~PL_PPWAIT;
lp = p->p_vforklwp;
p->p_vforklwp = NULL;
lp->l_vforkwaiting = false;
cv_broadcast(&lp->l_waitcv);
}
if (SESS_LEADER(p)) {
struct vnode *vprele = NULL, *vprevoke = NULL;
struct session *sp = p->p_session;
struct tty *tp;
if (sp->s_ttyvp) {
/*
* Controlling process.
* Signal foreground pgrp,
* drain controlling terminal
* and revoke access to controlling terminal.
*/
tp = sp->s_ttyp;
mutex_spin_enter(&tty_lock);
if (tp->t_session == sp) {
/* we can't guarantee the revoke will do this */
pgrp = tp->t_pgrp;
tp->t_pgrp = NULL;
tp->t_session = NULL;
mutex_spin_exit(&tty_lock);
if (pgrp != NULL) {
pgsignal(pgrp, SIGHUP, 1);
}
mutex_exit(&proc_lock);
(void) ttywait(tp);
mutex_enter(&proc_lock);
/* The tty could have been revoked. */
vprevoke = sp->s_ttyvp;
} else
mutex_spin_exit(&tty_lock);
vprele = sp->s_ttyvp;
sp->s_ttyvp = NULL;
/*
* s_ttyp is not zero'd; we use this to indicate
* that the session once had a controlling terminal.
* (for logging and informational purposes)
*/
}
sp->s_leader = NULL;
if (vprevoke != NULL || vprele != NULL) {
if (vprevoke != NULL) {
/* Releases proc_lock. */
proc_sessrele(sp);
VOP_REVOKE(vprevoke, REVOKEALL);
} else
mutex_exit(&proc_lock);
if (vprele != NULL)
vrele(vprele);
mutex_enter(&proc_lock);
}
}
fixjobc(p, p->p_pgrp, 0);
/* Release fstrans private data. */
fstrans_lwp_dtor(l);
/*
* Finalize the last LWP's specificdata, as well as the
* specificdata for the proc itself.
*/
lwp_finispecific(l);
proc_finispecific(p);
/*
* Reset p_opptr pointer of all former children which got
* traced by another process and were reparented. We reset
* it to NULL here; the trace detach code then reparents
* the child to initproc. We only check allproc list, since
* eventual former children on zombproc list won't reference
* p_opptr anymore.
*/
if (__predict_false(p->p_slflag & PSL_CHTRACED)) {
struct proc *q;
PROCLIST_FOREACH(q, &allproc) {
if (q->p_opptr == p)
q->p_opptr = NULL;
}
PROCLIST_FOREACH(q, &zombproc) {
if (q->p_opptr == p)
q->p_opptr = NULL;
}
}
/*
* Give orphaned children to init(8).
*/
child = LIST_FIRST(&p->p_children);
wakeinit = (child != NULL);
for (; child != NULL; child = next_child) {
next_child = LIST_NEXT(child, p_sibling);
/*
* Traced processes are killed since their existence
* means someone is screwing up. Since we reset the
* trace flags, the logic in sys_wait4() would not be
* triggered to reparent the process to its
* original parent, so we must do this here.
*/
if (__predict_false(child->p_slflag & PSL_TRACED)) {
mutex_enter(p->p_lock);
child->p_slflag &=
~(PSL_TRACED|PSL_SYSCALL);
mutex_exit(p->p_lock);
if (child->p_opptr != child->p_pptr) {
struct proc *t = child->p_opptr;
proc_reparent(child, t ? t : initproc);
child->p_opptr = NULL;
} else
proc_reparent(child, initproc);
killproc(child, "orphaned traced process");
} else
proc_reparent(child, initproc);
}
/*
* Move proc from allproc to zombproc, it's now nearly ready to be
* collected by parent.
*/
LIST_REMOVE(l, l_list);
LIST_REMOVE(p, p_list);
LIST_INSERT_HEAD(&zombproc, p, p_list);
/*
* Mark the process as dead. We must do this before we signal
* the parent.
*/
p->p_stat = SDEAD;
/*
* Let anyone watching this DTrace probe know what we're
* on our way out.
*/
SDT_PROBE(proc, kernel, , exit,
((p->p_sflag & PS_COREDUMP) ? CLD_DUMPED :
(p->p_xsig ? CLD_KILLED : CLD_EXITED)),
0,0,0,0);
/* Put in front of parent's sibling list for parent to collect it */
old_parent = p->p_pptr;
old_parent->p_nstopchild++;
if (LIST_FIRST(&old_parent->p_children) != p) {
/* Put child where it can be found quickly */
LIST_REMOVE(p, p_sibling);
LIST_INSERT_HEAD(&old_parent->p_children, p, p_sibling);
}
/*
* Notify parent that we're gone. If the parent has PK_NOCLDWAIT
* or PK_CLDSIGIGN set, notify init instead (and hope it will handle
* this situation).
*/
if (old_parent->p_flag & (PK_NOCLDWAIT|PK_CLDSIGIGN)) {
proc_reparent(p, initproc);
wakeinit = 1;
/*
* If this was the last child of our parent, notify
* parent, so in case he was wait(2)ing, he will
* continue.
*/
if (LIST_FIRST(&old_parent->p_children) == NULL)
cv_broadcast(&old_parent->p_waitcv);
}
/* Reload parent pointer, since p may have been reparented above */
new_parent = p->p_pptr;
if (__predict_false(p->p_exitsig != 0)) {
exit_psignal(p, new_parent, &ksi);
kpsignal(new_parent, &ksi, NULL);
}
/* Calculate the final rusage info. */
calcru(p, &p->p_stats->p_ru.ru_utime, &p->p_stats->p_ru.ru_stime,
NULL, NULL);
callout_destroy(&l->l_timeout_ch);
/*
* Release any PCU resources before becoming a zombie.
*/
pcu_discard_all(l);
/*
* Notify other processes tracking us with a knote that
* we're exiting.
*
* N.B. we do this here because the process is now SDEAD,
* and thus cannot have any more knotes attached. Also,
* knote_proc_exit() expects that p->p_lock is already
* held (and will assert so).
*/
mutex_enter(p->p_lock);
if (!SLIST_EMPTY(&p->p_klist)) {
knote_proc_exit(p);
}
/* Free the LWP ID */
proc_free_lwpid(p, l->l_lid);
lwp_drainrefs(l);
lwp_lock(l);
l->l_prflag &= ~LPR_DETACHED;
l->l_stat = LSZOMB;
lwp_unlock(l);
KASSERT(curlwp == l);
KASSERT(p->p_nrlwps == 1);
KASSERT(p->p_nlwps == 1);
p->p_stat = SZOMB;
p->p_nrlwps--;
p->p_nzlwps++;
p->p_ndlwps = 0;
mutex_exit(p->p_lock);
/*
* Signal the parent to collect us, and drop the proclist lock.
* Drop debugger/procfs lock; no new references can be gained.
*/
rw_exit(&p->p_reflock);
cv_broadcast(&p->p_pptr->p_waitcv);
mutex_exit(&proc_lock);
if (wakeinit)
cv_broadcast(&initproc->p_waitcv);
/*
* NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
*/
/*
* Give machine-dependent code a chance to free any MD LWP
* resources. This must be done before uvm_lwp_exit(), in
* case these resources are in the PCB.
*/
cpu_lwp_free(l, 1);
/* Switch away into oblivion. */
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
panic("exit1");
}
void
exit_lwps(struct lwp *l)
{
proc_t *p = l->l_proc;
lwp_t *l2;
retry:
KASSERT(mutex_owned(p->p_lock));
/*
* Interrupt LWPs in interruptible sleep, unsuspend suspended
* LWPs and then wait for everyone else to finish.
*/
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
if (l2 == l)
continue;
lwp_lock(l2);
l2->l_flag |= LW_WEXIT;
lwp_need_userret(l2);
if ((l2->l_stat == LSSLEEP && (l2->l_flag & LW_SINTR)) ||
l2->l_stat == LSSUSPENDED || l2->l_stat == LSSTOP) {
l2->l_flag &= ~LW_DBGSUSPEND;
/* setrunnable() will release the lock. */
setrunnable(l2);
continue;
}
lwp_unlock(l2);
}
/*
* Wait for every LWP to exit. Note: LWPs can get suspended/slept
* behind us or there may even be new LWPs created. Therefore, a
* full retry is required on error.
*/
while (p->p_nlwps > 1) {
if (lwp_wait(l, 0, NULL, true)) {
goto retry;
}
}
KASSERT(p->p_nlwps == 1);
}
int
do_sys_waitid(idtype_t idtype, id_t id, int *pid, int *status, int options,
struct wrusage *wru, siginfo_t *si)
{
proc_t *child;
int error;
if (wru != NULL)
memset(wru, 0, sizeof(*wru));
if (si != NULL)
memset(si, 0, sizeof(*si));
mutex_enter(&proc_lock);
error = find_stopped_child(curproc, idtype, id, options, &child,
wru, si);
if (child == NULL) {
mutex_exit(&proc_lock);
*pid = 0;
*status = 0;
return error;
}
*pid = child->p_pid;
if (child->p_stat == SZOMB) {
/* Child is exiting */
*status = P_WAITSTATUS(child);
/* proc_free() will release the proc_lock. */
if (options & WNOWAIT) {
mutex_exit(&proc_lock);
} else {
proc_free(child, wru);
}
} else {
/* Don't mark SIGCONT if we are being stopped */
*status = (child->p_xsig == SIGCONT && child->p_stat != SSTOP) ?
W_CONTCODE() : W_STOPCODE(child->p_xsig);
mutex_exit(&proc_lock);
}
return 0;
}
int
do_sys_wait(int *pid, int *status, int options, struct rusage *ru)
{
idtype_t idtype;
id_t id;
int ret;
struct wrusage wru;
/*
* Translate the special pid values into the (idtype, pid)
* pair for wait6. The WAIT_MYPGRP case is handled by
* find_stopped_child() on its own.
*/
if (*pid == WAIT_ANY) {
idtype = P_ALL;
id = 0;
} else if (*pid < 0) {
idtype = P_PGID;
id = (id_t)-*pid;
} else {
idtype = P_PID;
id = (id_t)*pid;
}
options |= WEXITED | WTRAPPED;
ret = do_sys_waitid(idtype, id, pid, status, options, ru ? &wru : NULL,
NULL);
if (ru)
*ru = wru.wru_self;
return ret;
}
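/*
 * Illustrative examples (assumption) of the pid -> (idtype, id)
 * translation performed above:
 *
 *	waitpid(-1, &status, 0)		->	idtype = P_ALL,  id = 0
 *	waitpid(123, &status, 0)	->	idtype = P_PID,  id = 123
 *	waitpid(-123, &status, 0)	->	idtype = P_PGID, id = 123
 */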
int
sys___wait450(struct lwp *l, const struct sys___wait450_args *uap,
register_t *retval)
{
/* {
syscallarg(int) pid;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct rusage *) rusage;
} */
int error, status, pid = SCARG(uap, pid);
struct rusage ru;
error = do_sys_wait(&pid, &status, SCARG(uap, options),
SCARG(uap, rusage) != NULL ? &ru : NULL);
retval[0] = pid;
if (pid == 0) {
return error;
}
if (SCARG(uap, status)) {
error = copyout(&status, SCARG(uap, status), sizeof(status));
}
if (SCARG(uap, rusage) && error == 0) {
error = copyout(&ru, SCARG(uap, rusage), sizeof(ru));
}
return error;
}
int
sys_wait6(struct lwp *l, const struct sys_wait6_args *uap, register_t *retval)
{
/* {
syscallarg(idtype_t) idtype;
syscallarg(id_t) id;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct wrusage *) wru;
syscallarg(siginfo_t *) info;
} */
struct wrusage wru, *wrup;
siginfo_t si, *sip;
idtype_t idtype;
int pid;
id_t id;
int error, status;
idtype = SCARG(uap, idtype);
id = SCARG(uap, id);
if (SCARG(uap, wru) != NULL)
wrup = &wru;
else
wrup = NULL;
if (SCARG(uap, info) != NULL)
sip = &si;
else
sip = NULL;
/*
* We expect all callers of wait6() to know about WEXITED and
* WTRAPPED.
*/
error = do_sys_waitid(idtype, id, &pid, &status, SCARG(uap, options),
wrup, sip);
retval[0] = pid; /* tell userland who it was */
#if 0
/*
* should we copyout if there was no process, hence no useful data?
* We don't for an old style wait4() (etc) but I believe
* FreeBSD does for wait6(), so a tossup... Go with FreeBSD for now.
*/
if (pid == 0)
return error;
#endif
if (SCARG(uap, status) != NULL && error == 0)
error = copyout(&status, SCARG(uap, status), sizeof(status));
if (SCARG(uap, wru) != NULL && error == 0)
error = copyout(&wru, SCARG(uap, wru), sizeof(wru));
if (SCARG(uap, info) != NULL && error == 0)
error = copyout(&si, SCARG(uap, info), sizeof(si));
return error;
}
/*
* Find a process that matches the provided criteria, and fill siginfo
* and resources if found.
* Returns:
* -1: Not found, abort early
* 0: Not matched
* 1: Matched, there might be more matches
* 2: This is the only match
*/
static int
match_process(const struct proc *pp, struct proc **q, idtype_t idtype, id_t id,
int options, struct wrusage *wrusage, siginfo_t *siginfo)
{
struct rusage *rup;
struct proc *p = *q;
int rv = 1;
switch (idtype) {
case P_ALL:
mutex_enter(p->p_lock);
break;
case P_PID:
if (p->p_pid != (pid_t)id) {
p = *q = proc_find_raw((pid_t)id);
if (p == NULL || p->p_stat == SIDL || p->p_pptr != pp) {
*q = NULL;
return -1;
}
}
mutex_enter(p->p_lock);
rv++;
break;
case P_PGID:
if (p->p_pgid != (pid_t)id)
return 0;
mutex_enter(p->p_lock);
break;
case P_SID:
if (p->p_session->s_sid != (pid_t)id)
return 0;
mutex_enter(p->p_lock);
break;
case P_UID:
mutex_enter(p->p_lock);
if (kauth_cred_geteuid(p->p_cred) != (uid_t)id) {
mutex_exit(p->p_lock);
return 0;
}
break;
case P_GID:
mutex_enter(p->p_lock);
if (kauth_cred_getegid(p->p_cred) != (gid_t)id) {
mutex_exit(p->p_lock);
return 0;
}
break;
case P_CID:
case P_PSETID:
case P_CPUID:
/* XXX: Implement me */
default:
return 0;
}
if ((options & WEXITED) == 0 && p->p_stat == SZOMB) {
mutex_exit(p->p_lock);
return 0;
}
if (siginfo != NULL) {
siginfo->si_errno = 0;
/*
* SUSv4 requires that the si_signo value is always
* SIGCHLD. Obey it even though the rfork(2) interface
* allows requesting a different signal for child exit
* notification.
*/
siginfo->si_signo = SIGCHLD;
/*
* This is still a rough estimate. We will fix the
* cases TRAPPED, STOPPED, and CONTINUED later.
*/
if (p->p_sflag & PS_COREDUMP) {
siginfo->si_code = CLD_DUMPED;
siginfo->si_status = p->p_xsig;
} else if (p->p_xsig) {
siginfo->si_code = CLD_KILLED;
siginfo->si_status = p->p_xsig;
} else {
siginfo->si_code = CLD_EXITED;
siginfo->si_status = p->p_xexit;
}
siginfo->si_pid = p->p_pid;
siginfo->si_uid = kauth_cred_geteuid(p->p_cred);
siginfo->si_utime = p->p_stats->p_ru.ru_utime.tv_sec;
siginfo->si_stime = p->p_stats->p_ru.ru_stime.tv_sec;
}
/*
* There should be no reason to limit resources usage info to
* exited processes only. A snapshot about any resources used
* by a stopped process may be exactly what is needed.
*/
if (wrusage != NULL) {
rup = &wrusage->wru_self;
*rup = p->p_stats->p_ru;
calcru(p, &rup->ru_utime, &rup->ru_stime, NULL, NULL);
rup = &wrusage->wru_children;
*rup = p->p_stats->p_cru;
calcru(p, &rup->ru_utime, &rup->ru_stime, NULL, NULL);
}
mutex_exit(p->p_lock);
return rv;
}
/*
* Determine if there are existing processes being debugged
* that used to be (and sometime later will be again) children
* of a specific parent (while matching wait criteria)
*/
static bool
debugged_child_exists(idtype_t idtype, id_t id, int options, siginfo_t *si,
const struct proc *parent)
{
struct proc *pp;
/*
* If we are searching for a specific pid, we can optimise a little
*/
if (idtype == P_PID) {
/*
* Check the specific process to see if its real parent is us
*/
pp = proc_find_raw((pid_t)id);
if (pp != NULL && pp->p_stat != SIDL && pp->p_opptr == parent) {
/*
* using P_ALL here avoids match_process() doing the
* same work that we just did, but incorrectly for
* this scenario.
*/
if (match_process(parent, &pp, P_ALL, id, options,
NULL, si))
return true;
}
return false;
}
/*
* For the hard cases, just look everywhere to see if some
* stolen (reparented) process is really our lost child.
* Then check if that process could satisfy the wait conditions.
*/
/*
* XXX inefficient, but hopefully fairly rare.
* XXX should really use a list of reparented processes.
*/
PROCLIST_FOREACH(pp, &allproc) {
if (pp->p_stat == SIDL) /* XXX impossible ?? */
continue;
if (pp->p_opptr == parent &&
match_process(parent, &pp, idtype, id, options, NULL, si))
return true;
}
PROCLIST_FOREACH(pp, &zombproc) {
if (pp->p_stat == SIDL) /* XXX impossible ?? */
continue;
if (pp->p_opptr == parent &&
match_process(parent, &pp, idtype, id, options, NULL, si))
return true;
}
return false;
}
/*
* Scan list of child processes for a child process that has stopped or
* exited. Used by sys_wait4 and 'compat' equivalents.
*
* Must be called with the proc_lock held, and may release while waiting.
*/
static int
find_stopped_child(struct proc *parent, idtype_t idtype, id_t id, int options,
struct proc **child_p, struct wrusage *wru, siginfo_t *si)
{
struct proc *child, *dead;
int error;
KASSERT(mutex_owned(&proc_lock));
if (options & ~WALLOPTS) {
*child_p = NULL;
return EINVAL;
}
if ((options & WSELECTOPTS) == 0) {
/*
* We will be unable to find any matching processes,
* because there are no known events to look for.
* Prefer to return error instead of blocking
* indefinitely.
*/
*child_p = NULL;
return EINVAL;
}
if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) {
id = (id_t)parent->p_pgid;
idtype = P_PGID;
}
for (;;) {
error = ECHILD;
dead = NULL;
LIST_FOREACH(child, &parent->p_children, p_sibling) {
int rv = match_process(parent, &child, idtype, id,
options, wru, si);
if (rv == -1)
break;
if (rv == 0)
continue;
/*
* Wait for processes with p_exitsig != SIGCHLD
* processes only if WALTSIG is set; wait for
* processes with p_exitsig == SIGCHLD only
* if WALTSIG is clear.
*/
if (((options & WALLSIG) == 0) &&
(options & WALTSIG ? child->p_exitsig == SIGCHLD
: P_EXITSIG(child) != SIGCHLD)) {
if (rv == 2) {
child = NULL;
break;
}
continue;
}
error = 0;
if ((options & WNOZOMBIE) == 0) {
if (child->p_stat == SZOMB)
break;
if (child->p_stat == SDEAD) {
/*
* We may occasionally arrive here
* after receiving a signal, but
* immediately before the child
* process is zombified. The wait
* will be short, so avoid returning
* to userspace.
*/
dead = child;
}
}
if ((options & WCONTINUED) != 0 &&
child->p_xsig == SIGCONT &&
(child->p_sflag & PS_CONTINUED)) {
if ((options & WNOWAIT) == 0) {
child->p_sflag &= ~PS_CONTINUED;
child->p_waited = 1;
parent->p_nstopchild--;
}
if (si) {
si->si_status = child->p_xsig;
si->si_code = CLD_CONTINUED;
}
break;
}
if ((options & (WTRAPPED|WSTOPPED)) != 0 &&
child->p_stat == SSTOP && child->p_waited == 0 &&
((child->p_slflag & PSL_TRACED) ||
options & (WUNTRACED|WSTOPPED))) {
if ((options & WNOWAIT) == 0) {
child->p_waited = 1;
parent->p_nstopchild--;
}
if (si) {
si->si_status = child->p_xsig;
si->si_code =
(child->p_slflag & PSL_TRACED) ?
CLD_TRAPPED : CLD_STOPPED;
}
break;
}
if (parent->p_nstopchild == 0 || rv == 2) {
child = NULL;
break;
}
}
/*
* If we found nothing, but we are the bereaved parent
* of a stolen child, look and see if that child (or
* one of them) meets our search criteria. If so, then
* we cannot succeed, but we can hang (wait...),
* or if WNOHANG, return 0 instead of ECHILD
*/
if (child == NULL && error == ECHILD &&
(parent->p_slflag & PSL_CHTRACED) &&
debugged_child_exists(idtype, id, options, si, parent))
error = 0;
if (child != NULL || error != 0 ||
((options & WNOHANG) != 0 && dead == NULL)) {
*child_p = child;
return error;
}
/*
* Wait for another child process to stop.
*/
error = cv_wait_sig(&parent->p_waitcv, &proc_lock);
if (error != 0) {
*child_p = NULL;
return error;
}
}
}
/*
* Free a process after parent has taken all the state info. Must be called
* with the proclist lock held, and will release before returning.
*
* *ru is returned to the caller, and must be freed by the caller.
*/
static void
proc_free(struct proc *p, struct wrusage *wru)
{
struct proc *parent = p->p_pptr;
struct lwp *l;
ksiginfo_t ksi;
kauth_cred_t cred1, cred2;
uid_t uid;
KASSERT(mutex_owned(&proc_lock));
KASSERT(p->p_nlwps == 1);
KASSERT(p->p_nzlwps == 1);
KASSERT(p->p_nrlwps == 0);
KASSERT(p->p_stat == SZOMB);
/*
* If we got the child via ptrace(2) or procfs, and
* the parent is different (meaning the process was
* attached, rather than run as a child), then we need
* to give it back to the old parent, and send the
* parent the exit signal. The rest of the cleanup
* will be done when the old parent waits on the child.
*/
if ((p->p_slflag & PSL_TRACED) != 0 && p->p_opptr != parent) {
mutex_enter(p->p_lock);
p->p_slflag &= ~(PSL_TRACED|PSL_SYSCALL);
mutex_exit(p->p_lock);
parent = (p->p_opptr == NULL) ? initproc : p->p_opptr;
proc_reparent(p, parent);
p->p_opptr = NULL;
if (p->p_exitsig != 0) {
exit_psignal(p, parent, &ksi);
kpsignal(parent, &ksi, NULL);
}
cv_broadcast(&parent->p_waitcv);
mutex_exit(&proc_lock);
return;
}
sched_proc_exit(parent, p);
/*
* Add child times of exiting process onto its own times.
* This cannot be done any earlier else it might get done twice.
*/
l = LIST_FIRST(&p->p_lwps);
ruadd(&p->p_stats->p_ru, &l->l_ru);
ruadd(&p->p_stats->p_ru, &p->p_stats->p_cru);
ruadd(&parent->p_stats->p_cru, &p->p_stats->p_ru);
if (wru != NULL) {
wru->wru_self = p->p_stats->p_ru;
wru->wru_children = p->p_stats->p_cru;
}
p->p_xsig = 0;
p->p_xexit = 0;
/*
* At this point we are going to start freeing the final resources.
* If anyone tries to access the proc structure after here they will
* get a shock - bits are missing. Attempt to make it hard! We
* don't bother with any further locking past this point.
*/
p->p_stat = SIDL; /* not even a zombie any more */
LIST_REMOVE(p, p_list); /* off zombproc */
parent->p_nstopchild--;
LIST_REMOVE(p, p_sibling);
/*
* Let pid be reallocated.
*/
proc_free_pid(p->p_pid);
atomic_dec_uint(&nprocs);
/*
* Unlink process from its process group.
* Releases the proc_lock.
*/
proc_leavepgrp(p);
/*
* Delay release until after lwp_free.
*/
cred2 = l->l_cred;
/*
* Free the last LWP's resources.
*
* lwp_free ensures the LWP is no longer running on another CPU.
*/
lwp_free(l, false, true);
/*
* Now no one except us can reach the process p.
*/
/*
* Decrement the count of procs running with this uid.
*/
cred1 = p->p_cred;
uid = kauth_cred_getuid(cred1);
(void)chgproccnt(uid, -1);
/*
* Release substructures.
*/
lim_free(p->p_limit);
pstatsfree(p->p_stats);
kauth_cred_free(cred1);
kauth_cred_free(cred2);
/*
* Release reference to text vnode
*/
if (p->p_textvp)
vrele(p->p_textvp);
kmem_strfree(p->p_path);
mutex_destroy(&p->p_auxlock);
mutex_obj_free(p->p_lock);
mutex_destroy(&p->p_stmutex);
cv_destroy(&p->p_waitcv);
cv_destroy(&p->p_lwpcv);
rw_destroy(&p->p_reflock);
proc_free_mem(p);
}
/*
* Change the parent of a process for tracing purposes.
*/
void
proc_changeparent(struct proc *t, struct proc *p)
{
SET(t->p_slflag, PSL_TRACED);
t->p_opptr = t->p_pptr;
if (t->p_pptr == p)
return;
struct proc *parent = t->p_pptr;
if (parent->p_lock < t->p_lock) {
if (!mutex_tryenter(parent->p_lock)) {
mutex_exit(t->p_lock);
mutex_enter(parent->p_lock);
mutex_enter(t->p_lock);
}
} else if (parent->p_lock > t->p_lock) {
mutex_enter(parent->p_lock);
}
parent->p_slflag |= PSL_CHTRACED;
proc_reparent(t, p);
if (parent->p_lock != t->p_lock)
mutex_exit(parent->p_lock);
}
/*
* make process 'parent' the new parent of process 'child'.
*
* Must be called with proc_lock held.
*/
void
proc_reparent(struct proc *child, struct proc *parent)
{
KASSERT(mutex_owned(&proc_lock));
if (child->p_pptr == parent)
return;
if (child->p_stat == SZOMB || child->p_stat == SDEAD ||
(child->p_stat == SSTOP && !child->p_waited)) {
child->p_pptr->p_nstopchild--;
parent->p_nstopchild++;
}
if (parent == initproc) {
child->p_exitsig = SIGCHLD;
child->p_ppid = parent->p_pid;
}
LIST_REMOVE(child, p_sibling);
LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
}
/* $NetBSD: umap_vnops.c,v 1.62 2021/10/20 03:08:18 thorpej Exp $ */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)umap_vnops.c 8.6 (Berkeley) 5/22/95
*/
/*
* Umap Layer
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_vnops.c,v 1.62 2021/10/20 03:08:18 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/buf.h>
#include <sys/kauth.h>
#include <miscfs/umapfs/umap.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/layer_extern.h>
/*
* Note: If the LAYERFS_MBYPASSDEBUG flag is set, it is possible
* that the debug printing will bomb out, because kauth routines
* do not handle NOCRED or FSCRED like other credentials and end
* up dereferencing an inappropriate pointer.
*
* That should be fixed in kauth rather than here.
*/
int umap_lookup(void *);
int umap_getattr(void *);
int umap_print(void *);
int umap_rename(void *);
/*
* Global vfs data structures
*/
/*
* XXX - strategy, bwrite are hand coded currently. They should
* go away with a merged buffer/block cache.
*
*/
int (**umap_vnodeop_p)(void *);
const struct vnodeopv_entry_desc umap_vnodeop_entries[] = {
{ &vop_default_desc, umap_bypass },
{ &vop_lookup_desc, umap_lookup },
{ &vop_getattr_desc, umap_getattr },
{ &vop_print_desc, umap_print },
{ &vop_rename_desc, umap_rename },
{ &vop_fsync_desc, layer_fsync },
{ &vop_inactive_desc, layer_inactive },
{ &vop_reclaim_desc, layer_reclaim },
{ &vop_open_desc, layer_open },
{ &vop_close_desc, layer_close },
{ &vop_setattr_desc, layer_setattr },
{ &vop_access_desc, layer_access },
{ &vop_accessx_desc, genfs_accessx },
{ &vop_remove_desc, layer_remove },
{ &vop_revoke_desc, layer_revoke },
{ &vop_rmdir_desc, layer_rmdir },
{ &vop_bmap_desc, layer_bmap },
{ &vop_getpages_desc, layer_getpages },
{ &vop_putpages_desc, layer_putpages },
{ NULL, NULL }
};
const struct vnodeopv_desc umapfs_vnodeop_opv_desc =
{ &umap_vnodeop_p, umap_vnodeop_entries };
/*
* This is the 08-June-1999 bypass routine.
* See layer_vnops.c:layer_bypass for more details.
*/
int
umap_bypass(void *v)
{
struct vop_generic_args /* {
struct vnodeop_desc *a_desc;
<other random data follows, presumably>
} */ *ap = v;
int (**our_vnodeop_p)(void *);
kauth_cred_t *credpp = NULL, credp = 0;
kauth_cred_t savecredp = 0, savecompcredp = 0;
kauth_cred_t compcredp = 0;
struct vnode **this_vp_p;
int error;
struct vnode *old_vps[VDESC_MAX_VPS], *vp0;
struct vnode **vps_p[VDESC_MAX_VPS];
struct vnode ***vppp;
struct vnodeop_desc *descp = ap->a_desc;
int reles, i, flags;
struct componentname **compnamepp = 0;
#ifdef DIAGNOSTIC
/*
* We require at least one vp.
*/
if (descp->vdesc_vp_offsets == NULL ||
descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
panic("%s: no vp's in map.\n", __func__);
#endif
vps_p[0] =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
vp0 = *vps_p[0];
flags = MOUNTTOUMAPMOUNT(vp0->v_mount)->umapm_flags;
our_vnodeop_p = vp0->v_op;
if (flags & LAYERFS_MBYPASSDEBUG) printf("%s: %s\n", __func__, descp->vdesc_name);
/*
* Map the vnodes going in.
* Later, we'll invoke the operation based on
* the first mapped vnode's operation vector.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
vps_p[i] = this_vp_p =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i],
ap);
/*
* We're not guaranteed that any but the first vnode
* are of our type. Check for and don't map any
* that aren't. (We must always map first vp or vclean fails.)
*/
if (i && (*this_vp_p == NULL ||
(*this_vp_p)->v_op != our_vnodeop_p)) {
old_vps[i] = NULL;
} else {
old_vps[i] = *this_vp_p;
*(vps_p[i]) = UMAPVPTOLOWERVP(*this_vp_p);
/*
* XXX - Several operations have the side effect
* of vrele'ing their vp's. We must account for
* that. (This should go away in the future.)
*/
if (reles & VDESC_VP0_WILLRELE)
vref(*this_vp_p);
}
}
/*
* Fix the credentials. (That's the purpose of this layer.)
*/
if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) {
credpp = VOPARG_OFFSETTO(kauth_cred_t*,
descp->vdesc_cred_offset, ap);
/* Save old values */
savecredp = *credpp;
if (savecredp != NOCRED && savecredp != FSCRED)
*credpp = kauth_cred_dup(savecredp);
credp = *credpp;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: user was %d, group %d\n",
kauth_cred_geteuid(credp), kauth_cred_getegid(credp));
/* Map all ids in the credential structure. */
umap_mapids(vp0->v_mount, credp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: user now %d, group %d\n",
kauth_cred_geteuid(credp), kauth_cred_getegid(credp));
}
/* BSD often keeps a credential in the componentname structure
* for speed. If there is one, it better get mapped, too.
*/
if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) {
compnamepp = VOPARG_OFFSETTO(struct componentname**,
descp->vdesc_componentname_offset, ap);
savecompcredp = (*compnamepp)->cn_cred;
if (savecompcredp != NOCRED && savecompcredp != FSCRED)
(*compnamepp)->cn_cred = kauth_cred_dup(savecompcredp);
compcredp = (*compnamepp)->cn_cred;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(vp0->v_mount, compcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
}
/*
* Call the operation on the lower layer
* with the modified argument structure.
*/
error = VCALL(*vps_p[0], descp->vdesc_offset, ap);
/*
* Maintain the illusion of call-by-value
* by restoring vnodes in the argument structure
* to their original value.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
if (old_vps[i]) {
*(vps_p[i]) = old_vps[i];
if (reles & VDESC_VP0_WILLRELE)
vrele(*(vps_p[i]));
}
}
/*
* Map the possible out-going vpp
* (Assumes that the lower layer always returns
* a VREF'ed vpp unless it gets an error.)
*/
if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !error) {
vppp = VOPARG_OFFSETTO(struct vnode***,
descp->vdesc_vpp_offset, ap);
/*
* Only vop_lookup, vop_create, vop_makedir, vop_mknod
* and vop_symlink return vpp's. vop_lookup doesn't call bypass
* as a lookup on "." would generate a locking error.
* So all the calls which get us here have an unlocked vpp. :-)
*/
error = layer_node_create(old_vps[0]->v_mount, **vppp, *vppp);
if (error) {
vrele(**vppp);
**vppp = NULL;
}
}
/*
* Free duplicate cred structure and restore old one.
*/
if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) {
if ((flags & LAYERFS_MBYPASSDEBUG) && credp &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: returning-user was %d\n",
kauth_cred_geteuid(credp));
if (savecredp != NOCRED && savecredp != FSCRED && credpp) {
kauth_cred_free(credp);
*credpp = savecredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && credpp &&
kauth_cred_geteuid(*credpp) != 0)
printf("umap_bypass: returning-user now %d\n\n",
kauth_cred_geteuid(savecredp));
}
}
if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) {
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: returning-component-user was %d\n",
kauth_cred_geteuid(compcredp));
if (savecompcredp != NOCRED && savecompcredp != FSCRED) {
kauth_cred_free(compcredp);
(*compnamepp)->cn_cred = savecompcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && savecompcredp &&
kauth_cred_geteuid(savecompcredp) != 0)
printf("umap_bypass: returning-component-user now %d\n",
kauth_cred_geteuid(savecompcredp));
}
}
return (error);
}
/*
* This is based on the 08-June-1999 bypass routine.
* See layer_vnops.c:layer_bypass for more details.
*/
int
umap_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnodeop_desc *a_desc;
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t savecompcredp = NULL;
kauth_cred_t compcredp = NULL;
struct vnode *dvp, *vp, *ldvp;
struct mount *mp;
int error;
int flags, cnf = cnp->cn_flags;
dvp = ap->a_dvp;
mp = dvp->v_mount;
if ((cnf & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
return (EROFS);
flags = MOUNTTOUMAPMOUNT(mp)->umapm_flags;
ldvp = UMAPVPTOLOWERVP(dvp);
if (flags & LAYERFS_MBYPASSDEBUG) printf("umap_lookup\n");
/*
* Fix the credentials. (That's the purpose of this layer.)
*
* BSD often keeps a credential in the componentname structure
* for speed. If there is one, it better get mapped, too.
*/
if ((savecompcredp = cnp->cn_cred)) {
compcredp = kauth_cred_dup(savecompcredp);
cnp->cn_cred = compcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(mp, compcredp);
}
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
ap->a_dvp = ldvp;
error = VCALL(ldvp, ap->a_desc->vdesc_offset, ap);
vp = *ap->a_vpp;
*ap->a_vpp = NULL;
if (error == EJUSTRETURN && (cnf & ISLASTCN) &&
(dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
error = EROFS;
/* Do locking fixup as appropriate. See layer_lookup() for info */
if (ldvp == vp) {
*ap->a_vpp = dvp;
vref(dvp);
vrele(vp);
} else if (vp != NULL) {
error = layer_node_create(mp, vp, ap->a_vpp);
if (error) {
vrele(vp);
}
}
/*
* Free duplicate cred structure and restore old one.
*/
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: returning-component-user was %d\n",
kauth_cred_geteuid(compcredp));
if (savecompcredp != NOCRED && savecompcredp != FSCRED) {
if (compcredp)
kauth_cred_free(compcredp);
cnp->cn_cred = savecompcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && savecompcredp &&
kauth_cred_geteuid(savecompcredp) != 0)
printf("umap_lookup: returning-component-user now %d\n",
kauth_cred_geteuid(savecompcredp));
}
return (error);
}
/*
* We handle getattr to change the fsid.
*/
int
umap_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
uid_t uid;
gid_t gid;
int error, tmpid, nentries, gnentries, flags;
u_long (*mapdata)[2];
u_long (*gmapdata)[2];
struct vnode **vp1p;
const struct vnodeop_desc *descp = ap->a_desc;
if ((error = umap_bypass(ap)) != 0)
return (error);
/* Requires that arguments be restored. */
ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
flags = MOUNTTOUMAPMOUNT(ap->a_vp->v_mount)->umapm_flags;
/*
* Umap needs to map the uid and gid returned by a stat
* into the proper values for this site. This involves
* finding the returned uid in the mapping information,
* translating it into the uid on the other end,
* and filling in the proper field in the vattr
* structure pointed to by ap->a_vap. The group
* is easier, since currently all groups will be
* translated to the NULLGROUP.
*/
/* Find entry in map */
uid = ap->a_vap->va_uid;
gid = ap->a_vap->va_gid;
if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: mapped uid = %d, mapped gid = %d\n", uid,
gid);
vp1p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
nentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_nentries;
mapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_mapdata);
gnentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gnentries;
gmapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gmapdata);
/* Reverse map the uid for the vnode. Since it's a reverse
map, we can't use umap_mapids() to do it. */
tmpid = umap_reverse_findid(uid, mapdata, nentries);
if (tmpid != -1) {
ap->a_vap->va_uid = (uid_t) tmpid;
if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: original uid = %d\n", uid);
} else
ap->a_vap->va_uid = (uid_t) NOBODY;
/* Reverse map the gid for the vnode. */
tmpid = umap_reverse_findid(gid, gmapdata, gnentries);
if (tmpid != -1) {
ap->a_vap->va_gid = (gid_t) tmpid;
if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: original gid = %d\n", gid);
} else
ap->a_vap->va_gid = (gid_t) NULLGROUP;
return (0);
}
int
umap_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
printf("\ttag VT_UMAPFS, vp=%p, lowervp=%p\n", vp,
UMAPVPTOLOWERVP(vp));
return (0);
}
int
umap_rename(void *v)
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
int error, flags;
struct componentname *compnamep;
kauth_cred_t compcredp, savecompcredp;
struct vnode *vp;
struct vnode *tvp;
/*
* Rename is irregular, having two componentname structures.
* We need to map the cred in the second structure,
* and then bypass takes care of the rest.
*/
vp = ap->a_fdvp;
flags = MOUNTTOUMAPMOUNT(vp->v_mount)->umapm_flags;
compnamep = ap->a_tcnp;
compcredp = compnamep->cn_cred;
savecompcredp = compcredp;
compcredp = compnamep->cn_cred = kauth_cred_dup(savecompcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_rename: rename component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(vp->v_mount, compcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_rename: rename component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
tvp = ap->a_tvp;
if (tvp) {
if (tvp->v_mount != vp->v_mount)
tvp = NULL;
else
vref(tvp);
}
error = umap_bypass(ap);
if (tvp) {
if (error == 0)
VTOLAYER(tvp)->layer_flags |= LAYERFS_REMOVED;
vrele(tvp);
}
/* Restore the additional mapped componentname cred structure. */
kauth_cred_free(compcredp);
compnamep->cn_cred = savecompcredp;
return error;
}
/* $NetBSD: vnode.h,v 1.304 2022/10/26 23:40:30 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vnode.h 8.17 (Berkeley) 5/20/95
*/
#ifndef _SYS_VNODE_H_
#define _SYS_VNODE_H_
#include <sys/event.h>
#include <sys/queue.h>
#include <sys/condvar.h>
#include <sys/rwlock.h>
#include <sys/mutex.h>
#include <sys/time.h>
#include <sys/acl.h>
/* XXX: clean up includes later */
#include <uvm/uvm_param.h> /* XXX */
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <uvm/uvm_pglist.h> /* XXX */
#include <uvm/uvm_object.h> /* XXX */
#include <uvm/uvm_extern.h> /* XXX */
struct uvm_ractx;
#endif
/*
* The vnode is the focus of all file activity in UNIX. There is a
* unique vnode allocated for each active file, each current directory,
* each mounted-on file, text file, and the root.
*/
/*
* Vnode types. VNON means no type.
*/
enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD };
#define VNODE_TYPES \
"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"
/*
* Vnode tag types.
* These are for the benefit of external programs only (e.g., pstat)
* and should NEVER be inspected by the kernel.
*/
enum vtagtype {
VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_MSDOSFS, VT_LFS, VT_LOFS,
VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS,
VT_AFS, VT_ISOFS, VT_UNION, VT_ADOSFS, VT_EXT2FS, VT_CODA,
VT_FILECORE, VT_NTFS, VT_VFS, VT_OVERLAY, VT_SMBFS, VT_PTYFS,
VT_TMPFS, VT_UDF, VT_SYSVBFS, VT_PUFFS, VT_HFS, VT_EFS, VT_ZFS,
VT_RUMP, VT_NILFS, VT_V7FS, VT_CHFS, VT_AUTOFS
};
#define VNODE_TAGS \
"VT_NON", "VT_UFS", "VT_NFS", "VT_MFS", "VT_MSDOSFS", "VT_LFS", "VT_LOFS", \
"VT_FDESC", "VT_PORTAL", "VT_NULL", "VT_UMAP", "VT_KERNFS", "VT_PROCFS", \
"VT_AFS", "VT_ISOFS", "VT_UNION", "VT_ADOSFS", "VT_EXT2FS", "VT_CODA", \
"VT_FILECORE", "VT_NTFS", "VT_VFS", "VT_OVERLAY", "VT_SMBFS", "VT_PTYFS", \
"VT_TMPFS", "VT_UDF", "VT_SYSVBFS", "VT_PUFFS", "VT_HFS", "VT_EFS", \
"VT_ZFS", "VT_RUMP", "VT_NILFS", "VT_V7FS", "VT_CHFS", "VT_AUTOFS"
#if defined(_KERNEL) || defined(_KMEMUSER)
struct vnode;
struct buf;
LIST_HEAD(buflists, buf);
/*
* Reading or writing any of these items requires holding the appropriate
* lock. Field markings and the corresponding locks:
*
* - stable, reference to the vnode is required
* b bufcache_lock
* e exec_lock
* f vnode_free_list_lock, or vrele_lock for vrele_list
* i v_interlock
* i+b v_interlock + bufcache_lock to modify, either to inspect
* i+u v_interlock + v_uobj.vmobjlock to modify, either to inspect
* k locked by underlying filesystem (maybe kernel_lock)
* u v_uobj.vmobjlock
* v vnode lock
*
* Each underlying filesystem allocates its own private area and hangs
* it from v_data.
*/
struct vnode {
/*
* VM system related items.
*/
struct uvm_object v_uobj; /* u the VM object */
voff_t v_size; /* i+u size of file */
voff_t v_writesize; /* i+u new size after write */
/*
* Unstable items get their own cache line.
* On _LP64 this fills the space nicely.
*/
kcondvar_t v_cv /* i synchronization */
__aligned(COHERENCY_UNIT);
int v_iflag; /* i+u VI_* flags */
int v_uflag; /* k VU_* flags */
int v_usecount; /* i reference count */
int v_numoutput; /* i # of pending writes */
int v_writecount; /* i ref count of writers */
int v_holdcnt; /* i page & buffer refs */
struct buflists v_cleanblkhd; /* i+b clean blocklist head */
struct buflists v_dirtyblkhd; /* i+b dirty blocklist head */
/*
* The remaining items are largely stable.
*/
int v_vflag /* v VV_* flags */
__aligned(COHERENCY_UNIT);
kmutex_t *v_interlock; /* - vnode interlock */
struct mount *v_mount; /* v ptr to vfs we are in */
int (**v_op)(void *); /* : vnode operations vector */
union {
struct mount *vu_mountedhere;/* v ptr to vfs (VDIR) */
struct socket *vu_socket; /* v unix ipc (VSOCK) */
struct specnode *vu_specnode; /* v device (VCHR, VBLK) */
struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */
struct uvm_ractx *vu_ractx; /* u read-ahead ctx (VREG) */
} v_un;
enum vtype v_type; /* - vnode type */
enum vtagtype v_tag; /* - type of underlying data */
void *v_data; /* - private data for fs */
struct vnode_klist *v_klist; /* i kevent / knote info */
void *v_segvguard; /* e for PAX_SEGVGUARD */
};
#define v_mountedhere v_un.vu_mountedhere
#define v_socket v_un.vu_socket
#define v_specnode v_un.vu_specnode
#define v_fifoinfo v_un.vu_fifoinfo
#define v_ractx v_un.vu_ractx
typedef struct vnode vnode_t;
/*
* Structure that encompasses the kevent state for a vnode. This is
* carved out as a separate structure because some vnodes may share
* this state with one another.
*
* N.B. if two vnodes share a vnode_klist, then they must also share
* v_interlock.
*/
struct vnode_klist {
struct klist vk_klist; /* i notes attached to vnode */
long vk_interest; /* i what the notes are interested in */
};
#endif
/*
* Vnode flags. The first set are locked by vnode lock or are stable.
* VSYSTEM is only used to skip vflush()ing quota files. VISTTY is used
* when reading dead vnodes.
*/
#define VV_ROOT 0x00000001 /* root of its file system */
#define VV_SYSTEM 0x00000002 /* vnode being used by kernel */
#define VV_ISTTY 0x00000004 /* vnode represents a tty */
#define VV_MAPPED 0x00000008 /* vnode might have user mappings */
#define VV_MPSAFE 0x00000010 /* file system code is MP safe */
/*
* The second set are locked by vp->v_interlock. VI_TEXT and VI_EXECMAP are
* typically updated with vp->v_uobj.vmobjlock also held as the VM system
* uses them for accounting purposes.
*/
#define VI_TEXT 0x00000100 /* vnode is a pure text prototype */
#define VI_EXECMAP 0x00000200 /* might have PROT_EXEC mappings */
#define VI_WRMAP 0x00000400 /* might have PROT_WRITE u. mappings */
#define VI_PAGES 0x00000800 /* UVM object has >0 pages */
#define VI_ONWORKLST 0x00004000 /* On syncer work-list */
#define VI_DEADCHECK 0x00008000 /* UVM: need to call vdead_check() */
/*
* The third set are locked by the underlying file system.
*/
#define VU_DIROP 0x01000000 /* LFS: involved in a directory op */
#define VNODE_FLAGBITS \
"\20\1ROOT\2SYSTEM\3ISTTY\4MAPPED\5MPSAFE\11TEXT\12EXECMAP" \
"\13WRMAP\14PAGES\17ONWORKLST\20DEADCHECK\31DIROP"
#define VSIZENOTSET ((voff_t)-1)
/*
* vnode lock flags
*/
#define LK_NONE 0x00000000 /* no lock - for VOP_ISLOCKED() */
#define LK_SHARED 0x00000001 /* shared lock */
#define LK_EXCLUSIVE 0x00000002 /* exclusive lock */
#define LK_UPGRADE 0x00000010 /* upgrade shared -> exclusive */
#define LK_DOWNGRADE 0x00000020 /* downgrade exclusive -> shared */
#define LK_NOWAIT 0x00000100 /* do not sleep to await lock */
#define LK_RETRY 0x00000200 /* vn_lock: retry until locked */
/*
* Vnode attributes. A field value of VNOVAL represents a field whose value
* is unavailable (getattr) or which is not to be changed (setattr).
*/
struct vattr {
enum vtype va_type; /* vnode type (for create) */
mode_t va_mode; /* file's access mode and type */
nlink_t va_nlink; /* number of references to file */
uid_t va_uid; /* owner user id */
gid_t va_gid; /* owner group id */
dev_t va_fsid; /* file system id (dev for now) */
ino_t va_fileid; /* file id */
u_quad_t va_size; /* file size in bytes */
long va_blocksize; /* blocksize preferred for i/o */
struct timespec va_atime; /* time of last access */
struct timespec va_mtime; /* time of last modification */
struct timespec va_ctime; /* time file changed */
struct timespec va_birthtime; /* time file created */
u_long va_gen; /* generation number of file */
u_long va_flags; /* flags defined for file */
dev_t va_rdev; /* device the special file represents */
u_quad_t va_bytes; /* bytes of disk space held by file */
u_quad_t va_filerev; /* file modification number */
unsigned int va_vaflags; /* operations flags, see below */
long va_spare; /* remain quad aligned */
};
/*
* Flags for va_vaflags.
*/
#define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */
#define VA_EXCLUSIVE 0x02 /* exclusive create request */
#ifdef _KERNEL
/*
* Flags for ioflag.
*/
#define IO_UNIT 0x00010 /* do I/O as atomic unit */
#define IO_APPEND 0x00020 /* append write to end */
#define IO_SYNC (0x40|IO_DSYNC) /* sync I/O file integrity completion */
#define IO_NODELOCKED 0x00080 /* underlying node already locked */
#define IO_NDELAY 0x00100 /* FNDELAY flag set in file table */
#define IO_DSYNC 0x00200 /* sync I/O data integrity completion */
#define IO_ALTSEMANTICS 0x00400 /* use alternate i/o semantics */
#define IO_NORMAL 0x00800 /* operate on regular data */
#define IO_EXT 0x01000 /* operate on extended attributes */
#define IO_DIRECT 0x02000 /* direct I/O hint */
#define IO_JOURNALLOCKED 0x04000 /* journal is already locked */
#define IO_ADV_MASK 0x00003 /* access pattern hint */
#define IO_ADV_SHIFT 0
#define IO_ADV_ENCODE(adv) (((adv) << IO_ADV_SHIFT) & IO_ADV_MASK)
#define IO_ADV_DECODE(ioflag) (((ioflag) & IO_ADV_MASK) >> IO_ADV_SHIFT)
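/*
 * Illustrative example (not part of the original header): an access
 * pattern hint (e.g. a POSIX_FADV_*-style advice constant, assumed to
 * be defined elsewhere) is packed into the low bits of the ioflag word
 * and recovered again with
 *
 *	ioflag = IO_UNIT | IO_ADV_ENCODE(advice);
 *	advice = IO_ADV_DECODE(ioflag);
 */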
/*
* Flags for accmode_t.
*/
#define VEXEC 000000000100 /* execute/search permission */
#define VWRITE 000000000200 /* write permission */
#define VREAD 000000000400 /* read permission */
#define VADMIN 000000010000 /* being the file owner */
#define VAPPEND 000000040000 /* permission to write/append */
/*
* VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
* if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
* and 0 otherwise. This never happens with ordinary unix access rights
* or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with
* some other V* constant.
*/
#define VEXPLICIT_DENY 000000100000
#define VREAD_NAMED_ATTRS 000000200000 /* not used */
#define VWRITE_NAMED_ATTRS 000000400000 /* not used */
#define VDELETE_CHILD 000001000000
#define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */
#define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */
#define VDELETE 000010000000
#define VREAD_ACL 000020000000 /* read ACL and file mode */
#define VWRITE_ACL 000040000000 /* change ACL and/or file mode */
#define VWRITE_OWNER 000100000000 /* change file owner */
#define VSYNCHRONIZE 000200000000 /* not used */
#define VCREAT 000400000000 /* creating new file */
#define VVERIFY 001000000000 /* verification required */
#define __VNODE_PERM_BITS \
"\10" \
"\07VEXEC" \
"\10VWRITE" \
"\11VREAD" \
"\15VADMIN" \
"\17VAPPEND" \
"\20VEXPLICIT_DENY" \
"\21VREAD_NAMED_ATTRS" \
"\22VWRITE_NAMED_ATTRS" \
"\23VDELETE_CHILD" \
"\24VREAD_ATTRIBUTES" \
"\25VWRITE_ATTRIBUTES" \
"\26VDELETE" \
"\27VREAD_ACL" \
"\30VWRITE_ACL" \
"\31VWRITE_OWNER" \
"\32VSYNCHRONIZE" \
"\33VCREAT" \
"\34VVERIFY"
/*
* Permissions that were traditionally granted only to the file owner.
*/
#define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
VWRITE_OWNER)
/*
* Permissions that were traditionally granted to everyone.
*/
#define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL)
/*
* Permissions that allow to change the state of the file in any way.
*/
#define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
VDELETE)
/*
* Token indicating no attribute value yet assigned.
*/
#define VNOVAL (-1)
#define VNOVALSIZE ((u_quad_t)-1)
#define VNOVALFLAGS ((u_long)-1)
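/*
 * Illustrative sketch (not part of the original header): a typical
 * setattr-style caller first marks every attribute "unchanged" and then
 * fills in only the fields it wants to modify, e.g.
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);			(sets every field to VNOVAL)
 *	va.va_mode = 0644;			(change only the mode)
 *	error = VOP_SETATTR(vp, &va, cred);	(vp locked by the caller)
 */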
/*
* Convert between vnode types and inode formats (since POSIX.1
* defines mode word of stat structure in terms of inode formats).
*/
extern const enum vtype iftovt_tab[];
extern const int vttoif_tab[];
#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12])
#define VTTOIF(indx) (vttoif_tab[(int)(indx)])
#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode))
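/*
 * Illustrative examples (not part of the original header), assuming the
 * usual <sys/stat.h> S_IF* constants:
 *
 *	IFTOVT(S_IFDIR) == VDIR
 *	VTTOIF(VREG) == S_IFREG
 *	MAKEIMODE(VREG, 0644) == (S_IFREG | 0644)
 */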
/*
* Flags to various vnode functions.
*/
#define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */
#define FORCECLOSE 0x0002 /* vflush: force file closure */
#define WRITECLOSE 0x0004 /* vflush: only close writable files */
#define V_SAVE 0x0001 /* vinvalbuf: sync file first */
/*
* Flags to various vnode operations.
*/
#define REVOKEALL 0x0001 /* revoke: revoke all aliases */
#define FSYNC_WAIT 0x0001 /* fsync: wait for completion */
#define FSYNC_DATAONLY 0x0002 /* fsync: hint: sync file data only */
#define FSYNC_RECLAIM 0x0004 /* fsync: hint: vnode is being reclaimed */
#define FSYNC_LAZY 0x0008 /* fsync: lazy sync (trickle) */
#define FSYNC_NOLOG 0x0010 /* fsync: do not flush the log */
#define FSYNC_CACHE 0x0100 /* fsync: flush disk caches too */
#define UPDATE_WAIT 0x0001 /* update: wait for completion */
#define UPDATE_DIROP 0x0002 /* update: hint to fs to wait or not */
#define UPDATE_CLOSE 0x0004 /* update: clean up on close */
#define VDEAD_NOWAIT 0x0001 /* vdead_check: do not sleep */
void holdrelel(struct vnode *);
void holdrele(struct vnode *);
void vholdl(struct vnode *);
void vhold(struct vnode *);
void vref(struct vnode *);
#define NULLVP ((struct vnode *)NULL)
/*
* Macro to determine kevent interest on a vnode.
*/
#define _VN_KEVENT_INTEREST(vp, n) \
(((vp)->v_klist->vk_interest & (n)) != 0)
static inline bool
VN_KEVENT_INTEREST(struct vnode *vp, long hint)
{
mutex_enter(vp->v_interlock);
bool rv = _VN_KEVENT_INTEREST(vp, hint);
mutex_exit(vp->v_interlock);
return rv;
}
static inline void
VN_KNOTE(struct vnode *vp, long hint)
{
mutex_enter(vp->v_interlock);
if (__predict_false(_VN_KEVENT_INTEREST(vp, hint))) {
knote(&vp->v_klist->vk_klist, hint);
}
mutex_exit(vp->v_interlock);
}
void vn_knote_attach(struct vnode *, struct knote *);
void vn_knote_detach(struct vnode *, struct knote *);
/*
* Global vnode data.
*/
extern struct vnode *rootvnode; /* root (i.e. "/") vnode */
extern int desiredvnodes; /* number of vnodes desired */
extern unsigned int numvnodes; /* current number of vnodes */
#endif /* _KERNEL */
/*
* Mods for extensibility.
*/
/*
* Flags for vdesc_flags:
*/
#define VDESC_MAX_VPS 8
/* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
#define VDESC_VP0_WILLRELE 0x00000001
#define VDESC_VP1_WILLRELE 0x00000002
#define VDESC_VP2_WILLRELE 0x00000004
#define VDESC_VP3_WILLRELE 0x00000008
#define VDESC_VP0_WILLPUT 0x00000101
#define VDESC_VP1_WILLPUT 0x00000202
#define VDESC_VP2_WILLPUT 0x00000404
#define VDESC_VP3_WILLPUT 0x00000808
/*
* VDESC_NO_OFFSET is used to identify the end of the offset list
* and in places where no such field exists.
*/
#define VDESC_NO_OFFSET -1
/*
* This structure describes the vnode operation taking place.
*/
struct vnodeop_desc {
int vdesc_offset; /* offset in vector--first for speed */
const char *vdesc_name; /* a readable name for debugging */
int vdesc_flags; /* VDESC_* flags */
/*
* These ops are used by bypass routines to map and locate arguments.
* Creds and procs are not needed in bypass routines, but sometimes
* they are useful to (for example) transport layers.
* Nameidata is useful because it has a cred in it.
*/
const int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */
int vdesc_vpp_offset; /* return vpp location */
int vdesc_cred_offset; /* cred location, if any */
int vdesc_componentname_offset; /* if any */
};
#ifdef _KERNEL
extern const struct vnodeop_desc * const vfs_op_descs[];
/*
* Union filesystem hook for vn_readdir().
*/
extern int (*vn_union_readdir_hook) (struct vnode **, struct file *, struct lwp *);
/*
* Macros for offsets in the vdesc struct.
*/
#define VOPARG_OFFSETOF(type, member) offsetof(type, member)
#define VOPARG_OFFSETTO(type,offset,sp) ((type)(((char *)(sp)) + (offset)))
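/*
 * Illustrative sketch (not part of the original header): a bypass
 * routine can locate the first vnode argument of a generic argument
 * structure through its descriptor, e.g.
 *
 *	struct vnode **vpp = VOPARG_OFFSETTO(struct vnode **,
 *	    descp->vdesc_vp_offsets[0], ap);
 *
 * i.e. the entries of vdesc_vp_offsets are byte offsets into *ap.
 */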
/*
* This structure is used to configure the new vnodeops vector.
*/
struct vnodeopv_entry_desc {
const struct vnodeop_desc *opve_op; /* which operation this is */
int (*opve_impl)(void *); /* code implementing this operation */
};
struct vnodeopv_desc {
/* ptr to the ptr to the vector where op should go */
int (***opv_desc_vector_p)(void *);
const struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */
};
/*
* A default routine which just returns an error.
*/
int vn_default_error(void *);
/*
* A generic structure.
* This can be used by bypass routines to identify generic arguments.
*/
struct vop_generic_args {
struct vnodeop_desc *a_desc;
/* other random data follows, presumably */
};
/*
* VOCALL calls an op given an ops vector. We break it out because BSD's
* vclean changes the ops vector and then wants to call ops with the old
* vector.
*/
/*
* actually, vclean doesn't use it anymore, but nfs does,
* for device specials and fifos.
*/
#define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP))
/*
* This call works for vnodes in the kernel.
*/
#define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP))
#define VDESC(OP) (& __CONCAT(OP,_desc))
#define VOFFSET(OP) (VDESC(OP)->vdesc_offset)
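/*
 * Illustrative sketch (not part of the original header): a layered file
 * system's bypass routine typically forwards an operation to the lower
 * vnode with the caller's own argument structure, e.g.
 *
 *	error = VCALL(lowervp, ap->a_desc->vdesc_offset, ap);
 *
 * which looks up slot vdesc_offset in lowervp->v_op and invokes it with
 * ap (see umap_bypass() above for a full example).
 */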
/* XXX This include should go away */
#include <sys/mount.h>
/*
* Finally, include the default set of vnode operations.
*/
#include <sys/vnode_if.h>
/*
* Public vnode manipulation functions.
*/
struct file;
struct filedesc;
struct nameidata;
struct pathbuf;
struct proc;
struct stat;
struct uio;
struct vattr;
struct vnode;
/* see vnode(9) */
void vfs_vnode_sysinit(void);
int bdevvp(dev_t, struct vnode **);
int cdevvp(dev_t, struct vnode **);
void vattr_null(struct vattr *);
void vdevgone(int, int, int, enum vtype);
int vfinddev(dev_t, enum vtype, struct vnode **);
int vflush(struct mount *, struct vnode *, int);
int vflushbuf(struct vnode *, int);
void vgone(struct vnode *);
int vinvalbuf(struct vnode *, int, kauth_cred_t, struct lwp *, bool, int);
void vprint(const char *, struct vnode *);
void vput(struct vnode *);
bool vrecycle(struct vnode *);
void vrele(struct vnode *);
void vrele_async(struct vnode *);
void vrele_flush(struct mount *);
int vtruncbuf(struct vnode *, daddr_t, bool, int);
void vwakeup(struct buf *);
int vdead_check(struct vnode *, int);
void vrevoke(struct vnode *);
void vremfree(struct vnode *);
void vshareilock(struct vnode *, struct vnode *);
void vshareklist(struct vnode *, struct vnode *);
int vrefcnt(struct vnode *);
int vcache_get(struct mount *, const void *, size_t, struct vnode **);
int vcache_new(struct mount *, struct vnode *,
struct vattr *, kauth_cred_t, void *, struct vnode **);
int vcache_rekey_enter(struct mount *, struct vnode *,
const void *, size_t, const void *, size_t);
void vcache_rekey_exit(struct mount *, struct vnode *,
const void *, size_t, const void *, size_t);
/* see vnsubr(9) */
int vn_bwrite(void *);
int vn_close(struct vnode *, int, kauth_cred_t);
int vn_isunder(struct vnode *, struct vnode *, struct lwp *);
int vn_lock(struct vnode *, int);
void vn_markexec(struct vnode *);
int vn_marktext(struct vnode *);
int vn_open(struct vnode *, struct pathbuf *, int, int, int,
struct vnode **, bool *, int *);
int vn_rdwr(enum uio_rw, struct vnode *, void *, int, off_t, enum uio_seg,
int, kauth_cred_t, size_t *, struct lwp *);
int vn_readdir(struct file *, char *, int, unsigned int, int *,
struct lwp *, off_t **, int *);
int vn_stat(struct vnode *, struct stat *);
int vn_kqfilter(struct file *, struct knote *);
int vn_writechk(struct vnode *);
int vn_openchk(struct vnode *, kauth_cred_t, int);
int vn_extattr_get(struct vnode *, int, int, const char *, size_t *,
void *, struct lwp *);
int vn_extattr_set(struct vnode *, int, int, const char *, size_t,
const void *, struct lwp *);
int vn_extattr_rm(struct vnode *, int, int, const char *, struct lwp *);
int vn_fifo_bypass(void *);
int vn_bdev_open(dev_t, struct vnode **, struct lwp *);
int vn_bdev_openpath(struct pathbuf *pb, struct vnode **, struct lwp *);
/* initialise global vnode management */
void vntblinit(void);
/* misc stuff */
void sched_sync(void *);
void vn_syncer_add_to_worklist(struct vnode *, int);
void vn_syncer_remove_from_worklist(struct vnode *);
int dorevoke(struct vnode *, kauth_cred_t);
int rawdev_mounted(struct vnode *, struct vnode **);
uint8_t vtype2dt(enum vtype);
/* see vfssubr(9) */
int vfs_unixify_accmode(accmode_t *);
void vfs_getnewfsid(struct mount *);
void vfs_timestamp(struct timespec *);
#if defined(DDB) || defined(DEBUGPRINT)
void vfs_vnode_print(struct vnode *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_vnode_lock_print(void *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_mount_print(struct mount *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_mount_print_all(int, void (*)(const char *, ...)
__printflike(1, 2));
#endif /* DDB */
#endif /* _KERNEL */
#endif /* !_SYS_VNODE_H_ */
/* $NetBSD: if.h,v 1.305 2023/10/09 11:55:34 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by William Studenmund and Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.h 8.3 (Berkeley) 2/9/95
*/
#ifndef _NET_IF_H_
#define _NET_IF_H_
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <stdbool.h>
#endif
#include <sys/featuretest.h>
/*
* Length of interface external name, including terminating '\0'.
* Note: this is the same size as a generic device's external name.
*/
#define IF_NAMESIZE 16
/*
* Length of interface description, including terminating '\0'.
*/
#define IFDESCRSIZE 64
#if defined(_NETBSD_SOURCE)
#include <sys/socket.h>
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/hook.h>
#include <net/dlt.h>
#include <net/pfil.h>
#ifdef _KERNEL
#include <net/pktqueue.h>
#include <sys/pslist.h>
#include <sys/pserialize.h>
#include <sys/psref.h>
#include <sys/module_hook.h>
#endif
/*
* Always include ALTQ glue here -- we use the ALTQ interface queue
* structure even when ALTQ is not configured into the kernel so that
* the size of struct ifnet does not change based on the option. The
* ALTQ queue structure is API-compatible with the legacy ifqueue.
*/
#include <altq/if_altq.h>
/*
* Structures defining a network interface, providing a packet
* transport mechanism (ala level 0 of the PUP protocols).
*
* Each interface accepts output datagrams of a specified maximum
* length, and provides higher level routines with input datagrams
* received from its medium.
*
* Output occurs when the routine if_output is called, with four parameters:
* (*ifp->if_output)(ifp, m, dst, rt)
* Here m is the mbuf chain to be sent and dst is the destination address.
* The output routine encapsulates the supplied datagram if necessary,
* and then transmits it on its medium.
*
* On input, each interface unwraps the data received by it, and either
* places it on the input queue of an internetwork datagram routine
* and posts the associated software interrupt, or passes the datagram to a raw
* packet input routine.
*
* Routines exist for locating interfaces by their addresses
* or for locating an interface on a certain network, as well as more general
* routing and gateway routines maintaining information used to locate
* interfaces. These routines live in the files if.c and route.c
*/
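/*
 * Illustrative sketch (not part of the original header): a protocol
 * layer hands a datagram to an interface roughly as
 *
 *	error = (*ifp->if_output)(ifp, m, dst, rt);
 *
 * where m is the mbuf chain to send, dst the destination address and
 * rt the (possibly NULL) route used to reach it.
 */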
#include <sys/time.h>
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#include "opt_gateway.h"
#endif
struct mbuf;
struct proc;
struct rtentry;
struct socket;
struct ether_header;
struct ifaddr;
struct ifnet;
struct rt_addrinfo;
#define IFNAMSIZ IF_NAMESIZE
/*
* Structure describing a `cloning' interface.
*/
struct if_clone {
LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */
const char *ifc_name; /* name of device, e.g. `gif' */
size_t ifc_namelen; /* length of name */
int (*ifc_create)(struct if_clone *, int);
int (*ifc_destroy)(struct ifnet *);
};
#define IF_CLONE_INITIALIZER(name, create, destroy) \
{ { NULL, NULL }, name, sizeof(name) - 1, create, destroy }
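/*
 * Illustrative sketch (not part of the original header; the "foo" names
 * are hypothetical): a cloning driver typically declares
 *
 *	static struct if_clone foo_cloner =
 *	    IF_CLONE_INITIALIZER("foo", foo_clone_create, foo_clone_destroy);
 *
 * and registers it with if_clone_attach(&foo_cloner) at attach time.
 */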
/*
* Structure used to query names of interface cloners.
*/
struct if_clonereq {
int ifcr_total; /* total cloners (out) */
int ifcr_count; /* room for this many in user buffer */
char *ifcr_buffer; /* buffer for cloner names */
};
/*
* Structure defining statistics and other data kept regarding a network
* interface.
*
* Only used for exporting data from the interface.
*/
struct if_data {
/* generic interface information */
u_char ifi_type; /* ethernet, tokenring, etc. */
u_char ifi_addrlen; /* media address length */
u_char ifi_hdrlen; /* media header length */
int ifi_link_state; /* current link state */
uint64_t ifi_mtu; /* maximum transmission unit */
uint64_t ifi_metric; /* routing metric (external only) */
uint64_t ifi_baudrate; /* linespeed */
/* volatile statistics */
uint64_t ifi_ipackets; /* packets received on interface */
uint64_t ifi_ierrors; /* input errors on interface */
uint64_t ifi_opackets; /* packets sent on interface */
uint64_t ifi_oerrors; /* output errors on interface */
uint64_t ifi_collisions; /* collisions on csma interfaces */
uint64_t ifi_ibytes; /* total number of octets received */
uint64_t ifi_obytes; /* total number of octets sent */
uint64_t ifi_imcasts; /* packets received via multicast */
uint64_t ifi_omcasts; /* packets sent via multicast */
uint64_t ifi_iqdrops; /* dropped on input, this interface */
uint64_t ifi_noproto; /* destined for unsupported protocol */
struct timespec ifi_lastchange;/* last operational state change */
};
/*
* Values for if_link_state.
*/
#define LINK_STATE_UNKNOWN 0 /* link invalid/unknown */
#define LINK_STATE_DOWN 1 /* link is down */
#define LINK_STATE_UP 2 /* link is up */
/*
* Status bit descriptions for the various interface types.
*/
struct if_status_description {
unsigned char ifs_type;
unsigned char ifs_state;
const char *ifs_string;
};
#define LINK_STATE_DESC_MATCH(_ifs, _t, _s) \
(((_ifs)->ifs_type == (_t) || (_ifs)->ifs_type == 0) && \
(_ifs)->ifs_state == (_s))
#define LINK_STATE_DESCRIPTIONS { \
{ IFT_ETHER, LINK_STATE_DOWN, "no carrier" }, \
{ IFT_IEEE80211, LINK_STATE_DOWN, "no network" }, \
{ IFT_PPP, LINK_STATE_DOWN, "no carrier" }, \
{ IFT_CARP, LINK_STATE_DOWN, "backup" }, \
{ IFT_CARP, LINK_STATE_UP, "master" }, \
{ 0, LINK_STATE_UP, "active" }, \
{ 0, LINK_STATE_UNKNOWN, "unknown" }, \
{ 0, LINK_STATE_DOWN, "down" }, \
{ 0, 0, NULL } \
}
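/*
 * Illustrative sketch (not part of the original header): to translate
 * an (interface type, link state) pair into a human-readable string,
 * scan the table until LINK_STATE_DESC_MATCH() succeeds, e.g.
 *
 *	const struct if_status_description descrs[] =
 *	    LINK_STATE_DESCRIPTIONS;
 *	const struct if_status_description *p;
 *
 *	for (p = descrs; p->ifs_string != NULL; p++)
 *		if (LINK_STATE_DESC_MATCH(p, ift, state))
 *			break;
 *
 * where p->ifs_string then names the state ("down", "no carrier", ...)
 * or is NULL if nothing matched.
 */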
/*
* Structure defining a queue for a network interface.
*/
struct ifqueue {
struct mbuf *ifq_head;
struct mbuf *ifq_tail;
int ifq_len;
int ifq_maxlen;
uint64_t ifq_drops;
kmutex_t *ifq_lock;
};
#ifdef _KERNEL
#include <sys/percpu.h>
#include <sys/callout.h>
#include <sys/rwlock.h>
#include <sys/workqueue.h>
#endif /* _KERNEL */
/*
* Structure defining a queue for a network interface.
*
* (Would like to call this struct ``if'', but C isn't PL/1.)
*/
TAILQ_HEAD(ifnet_head, ifnet); /* the actual queue head */
struct bridge_softc;
struct bridge_iflist;
struct callout;
struct krwlock;
struct if_percpuq;
struct if_deferred_start;
struct in6_multi;
typedef unsigned short if_index_t;
/*
* Interface. Field markings and the corresponding locks:
*
* i: IFNET_LOCK (a.k.a., if_ioctl_lock)
* q: ifq_lock (struct ifaltq)
* a: if_afdata_lock
* 6: in6_multilock (global lock)
* :: unlocked, stable
* ?: unknown, maybe unsafe
*
* Lock order: IFNET_LOCK => in6_multilock => if_afdata_lock => ifq_lock
* Note that currently if_afdata_lock and ifq_lock aren't held
* at the same time, but define the order anyway.
*
* Lock order of IFNET_LOCK with other locks:
* softnet_lock => solock => IFNET_LOCK => ND6_LOCK, in_multilock
*/
typedef struct ifnet {
void *if_softc; /* :: lower-level data for this if */
/* DEPRECATED. Keep it to avoid breaking kvm(3) users */
TAILQ_ENTRY(ifnet)
if_list; /* i: all struct ifnets are chained */
TAILQ_HEAD(, ifaddr)
if_addrlist; /* i: linked list of addresses per if */
char if_xname[IFNAMSIZ];
/* :: external name (name + unit) */
int if_pcount; /* i: number of promiscuous listeners */
struct bpf_if *if_bpf; /* :: packet filter structure */
if_index_t if_index; /* :: numeric abbreviation for this if */
short if_timer; /* ?: time 'til if_slowtimo called */
unsigned short if_flags; /* i: up/down, broadcast, etc. */
short if_extflags; /* :: if_output MP-safe, etc. */
u_char if_type; /* :: ethernet, tokenring, etc. */
u_char if_addrlen; /* :: media address length */
u_char if_hdrlen; /* :: media header length */
/* XXX audit :? fields here. */
int if_link_state; /* :? current link state */
uint64_t if_mtu; /* :? maximum transmission unit */
uint64_t if_metric; /* :? routing metric (external only) */
uint64_t if_baudrate; /* :? linespeed */
struct timespec if_lastchange; /* :? last operational state change */
#ifdef _KERNEL
percpu_t *if_stats; /* :: statistics */
#else
void *if_stats; /* opaque to user-space */
#endif /* _KERNEL */
/*
* Procedure handles. If you add more of these, don't forget the
* corresponding NULL stub in if.c.
*/
int (*if_output) /* :: output routine (enqueue) */
(struct ifnet *, struct mbuf *, const struct sockaddr *,
const struct rtentry *);
void (*_if_input) /* :: input routine (from h/w driver) */
(struct ifnet *, struct mbuf *);
void (*if_start) /* :: initiate output routine */
(struct ifnet *);
int (*if_transmit) /* :: output routine, must be MP-safe */
(struct ifnet *, struct mbuf *);
int (*if_ioctl) /* :: ioctl routine */
(struct ifnet *, u_long, void *);
int (*if_init) /* :: init routine */
(struct ifnet *);
void (*if_stop) /* :: stop routine */
(struct ifnet *, int);
void (*if_slowtimo) /* :: timer routine */
(struct ifnet *);
#define if_watchdog if_slowtimo
void (*if_drain) /* :: routine to release resources */
(struct ifnet *);
void (*if_bpf_mtap) /* :: bpf routine */
(struct bpf_if *, struct mbuf *, u_int);
struct ifaltq if_snd; /* q: output queue (includes altq) */
struct ifaddr *if_dl; /* i: identity of this interface. */
const struct sockaddr_dl
*if_sadl; /* i: pointer to sockaddr_dl of if_dl */
/*
* May be NULL. If not NULL, it is the address assigned
* to the interface by the manufacturer, so it is very likely
* to be unique. It MUST NOT be deleted. It is highly
* suitable for deriving the EUI64 for the interface.
*/
struct ifaddr *if_hwdl; /* i: h/w identity */
const uint8_t *if_broadcastaddr;
/* :: linklevel broadcast bytestring */
struct bridge_softc
*if_bridge; /* i: bridge glue */
struct bridge_iflist
*if_bridgeif; /* i: shortcut to interface list entry */
int if_dlt; /* :: data link type (<net/dlt.h>) */
pfil_head_t * if_pfil; /* :: filtering point */
uint64_t if_capabilities;
/* i: interface capabilities */
uint64_t if_capenable; /* i: capabilities enabled */
union {
void * carp_s; /* carp structure (used by !carp ifs) */
struct ifnet *carp_d;/* ptr to carpdev (used by carp ifs) */
} if_carp_ptr; /* ?: */
#define if_carp if_carp_ptr.carp_s
#define if_carpdev if_carp_ptr.carp_d
/*
* These are pre-computed based on an interface's enabled
* capabilities, for speed elsewhere.
*/
int if_csum_flags_tx;
/* i: M_CSUM_* flags for Tx */
int if_csum_flags_rx;
/* i: M_CSUM_* flags for Rx */
void *if_afdata[AF_MAX];
/* a: */
struct mowner *if_mowner; /* ?: who owns mbufs for this interface */
void *if_lagg; /* :: lagg or agr structure */
void *if_npf_private;/* ?: associated NPF context */
/*
* pf specific data, used only when #if NPF > 0.
*/
void *if_pf_kif; /* ?: pf interface abstraction */
void *if_pf_groups; /* ?: pf interface groups */
/*
* During an ifnet's lifetime, it has only one if_index, but
* an if_index is not sufficient to identify an ifnet
* because during the lifetime of the system, many ifnets may occupy a
* given if_index. Let us tell different ifnets at the same
* if_index apart by their if_index_gen, a unique number that each ifnet
* is assigned when it if_attach()s. Now, the kernel can use the
* pair (if_index, if_index_gen) as a weak reference to an ifnet.
*/
uint64_t if_index_gen; /* :: generation number for the ifnet
* at if_index: if two ifnets' index
* and generation number are both the
* same, they are the same ifnet.
*/
struct sysctllog
*if_sysctl_log; /* :: */
int (*if_initaddr) /* :: */
(struct ifnet *, struct ifaddr *, bool);
int (*if_setflags) /* :: */
(struct ifnet *, const u_short);
kmutex_t *if_ioctl_lock; /* :: */
char *if_description; /* i: interface description */
#ifdef _KERNEL /* XXX kvm(3) */
struct if_slowtimo_data *if_slowtimo_data; /* :: */
struct krwlock *if_afdata_lock;/* :: */
struct if_percpuq
*if_percpuq; /* :: we should remove it in the future */
struct work if_link_work; /* q: linkage on link state work queue */
uint16_t if_link_queue; /* q: masked link state change queue */
/* q: is link state work scheduled? */
bool if_link_scheduled;
struct pslist_entry
if_pslist_entry;/* i: */
struct psref_target
if_psref; /* :: */
struct pslist_head
if_addr_pslist; /* i: */
struct if_deferred_start
*if_deferred_start;
/* :: */
/* XXX should be protocol independent */
LIST_HEAD(, in6_multi)
if_multiaddrs; /* 6: */
khook_list_t *if_linkstate_hooks; /* :: */
#endif
} ifnet_t;
#include <net/if_stats.h>
#define if_name(ifp) ((ifp)->if_xname)
#define IFF_UP 0x0001 /* interface is up */
#define IFF_BROADCAST 0x0002 /* broadcast address valid */
#define IFF_DEBUG 0x0004 /* turn on debugging */
#define IFF_LOOPBACK 0x0008 /* is a loopback net */
#define IFF_POINTOPOINT 0x0010 /* interface is point-to-point link */
#if 0
/* 0x0020 was IFF_NOTRAILERS */
#else
/*
* sys/compat/svr4 was removed on 19 Dec 2018.
* IFF_NOTRAILERS itself was then removed by if.h:r1.268 on 5 Feb 2019.
*/
#define IFF_UNNUMBERED 0x0020 /* explicit unnumbered */
#endif
#define IFF_RUNNING 0x0040 /* resources allocated */
#define IFF_NOARP 0x0080 /* no address resolution protocol */
#define IFF_PROMISC 0x0100 /* receive all packets */
#define IFF_ALLMULTI 0x0200 /* OBSOLETE -- DO NOT USE */
/*
* IFF_ALLMULTI obsoleted on 2019-05-15 -- existing non-MP-safe drivers
* can use it for themselves under IFNET_LOCK, but they should be
* converted to use ETHER_F_ALLMULTI under ETHER_LOCK instead. For
* compatibility with existing drivers, if_ethersubr and if_arcsubr
* will set IFF_ALLMULTI according to other flags, but you should not
* rely on this.
*/
#define IFF_OACTIVE 0x0400 /* transmission in progress */
#define IFF_SIMPLEX 0x0800 /* can't hear own transmissions */
#define IFF_LINK0 0x1000 /* per link layer defined bit */
#define IFF_LINK1 0x2000 /* per link layer defined bit */
#define IFF_LINK2 0x4000 /* per link layer defined bit */
#define IFF_MULTICAST 0x8000 /* supports multicast */
#define IFEF_MPSAFE __BIT(0) /* handlers can run in parallel (see below) */
/*
* The guidelines for converting an interface to IFEF_MPSAFE are as follows:
*
* Enabling IFEF_MPSAFE on an interface suppresses taking KERNEL_LOCK when
* calling the following handlers:
* - if_start
* - Note that if_transmit is always called without KERNEL_LOCK
* - if_output
* - if_ioctl
* - if_init
* - if_stop
*
* This means that an interface with IFEF_MPSAFE must make the above handlers
* MP-safe or take KERNEL_LOCK by itself inside handlers that aren't MP-safe
* yet.
*
* There are some additional restrictions to access member variables of struct
* ifnet:
* - if_flags
* - Must be updated while holding IFNET_LOCK
* - You cannot use the flag in Tx/Rx paths anymore because there is no
* synchronization on the flag except for IFNET_LOCK
* - Note that IFNET_LOCK can't be taken in softint because it's known
* that it causes a deadlock
* - Some synchronization mechanisms such as pserialize_perform are called
* with IFNET_LOCK held and also require context switches on every CPU,
* which implies that all softints have finished; trying to take
* IFNET_LOCK in softint may therefore block on IFNET_LOCK and prevent
* such synchronization mechanisms from completing
* - Currently the deadlock occurs only if NET_MPSAFE is enabled; however,
* we should honor the restriction because NET_MPSAFE will be enabled
* by default in the future
* - if_watchdog and if_timer
* - The watchdog framework works only for non-IFEF_MPSAFE interfaces
* that rely on KERNEL_LOCK
* - Interfaces with IFEF_MPSAFE have to provide their own watchdog mechanism
* if needed
* - Keep if_watchdog NULL when calling if_attach
*/
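/*
 * Illustrative sketch only (not taken from any particular driver): a
 * hypothetical foo(4) attach routine opting in to IFEF_MPSAFE.  Once the
 * flag is set, the handlers listed above are called without KERNEL_LOCK,
 * so they must be MP-safe or take KERNEL_LOCK themselves.
 *
 *	ifp->if_softc = sc;
 *	ifp->if_extflags = IFEF_MPSAFE;
 *	ifp->if_ioctl = foo_ioctl;	(must now be MP-safe)
 *	ifp->if_start = foo_start;	(must now be MP-safe)
 *	ifp->if_watchdog = NULL;	(provide a private watchdog if needed)
 *	if_initialize(ifp);
 *	...
 *	if_register(ifp);
 */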
#ifdef _KERNEL
static __inline bool
if_is_mpsafe(struct ifnet *ifp)
{
return ((ifp->if_extflags & IFEF_MPSAFE) != 0);
}
static __inline int
if_output_lock(struct ifnet *cifp, struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, const struct rtentry *rt)
{
if (if_is_mpsafe(cifp)) {
return (*cifp->if_output)(ifp, m, dst, rt);
} else {
int ret;
KERNEL_LOCK(1, NULL);
ret = (*cifp->if_output)(ifp, m, dst, rt);
KERNEL_UNLOCK_ONE(NULL);
return ret;
}
}
static __inline void
if_start_lock(struct ifnet *ifp)
{
if (if_is_mpsafe(ifp)) {
(*ifp->if_start)(ifp);
} else {
KERNEL_LOCK(1, NULL);
(*ifp->if_start)(ifp);
KERNEL_UNLOCK_ONE(NULL);
}
}
#define KERNEL_LOCK_IF_IFP_MPSAFE(ifp) \
do { if (if_is_mpsafe(ifp)) { KERNEL_LOCK(1, NULL); } } while (0)
#define KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp) \
do { if (if_is_mpsafe(ifp)) { KERNEL_UNLOCK_ONE(NULL); } } while (0)
#define KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp) \
do { if (!if_is_mpsafe(ifp)) { KERNEL_LOCK(1, NULL); } } while (0)
#define KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp) \
do { if (!if_is_mpsafe(ifp)) { KERNEL_UNLOCK_ONE(NULL); } } while (0)
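/*
 * Minimal usage sketch (illustrative, assuming "ifp" is a valid ifnet):
 * framework code that must call a possibly non-MP-safe handler brackets
 * the call the same way the inline helpers above do, e.g.:
 *
 *	KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp);
 *	error = (*ifp->if_ioctl)(ifp, cmd, data);
 *	KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp);
 */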
#ifdef _KERNEL_OPT
#include "opt_net_mpsafe.h"
#endif
/* XXX explore a better place to define */
#ifdef NET_MPSAFE
#define KERNEL_LOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define KERNEL_UNLOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_LOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_UNLOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_LOCK_IF_NET_MPSAFE() \
do { mutex_enter(softnet_lock); } while (0)
#define SOFTNET_UNLOCK_IF_NET_MPSAFE() \
do { mutex_exit(softnet_lock); } while (0)
#else /* NET_MPSAFE */
#define KERNEL_LOCK_UNLESS_NET_MPSAFE() \
do { KERNEL_LOCK(1, NULL); } while (0)
#define KERNEL_UNLOCK_UNLESS_NET_MPSAFE() \
do { KERNEL_UNLOCK_ONE(NULL); } while (0)
#define SOFTNET_LOCK_UNLESS_NET_MPSAFE() \
do { mutex_enter(softnet_lock); } while (0)
#define SOFTNET_UNLOCK_UNLESS_NET_MPSAFE() \
do { mutex_exit(softnet_lock); } while (0)
#define SOFTNET_LOCK_IF_NET_MPSAFE() do { } while (0)
#define SOFTNET_UNLOCK_IF_NET_MPSAFE() do { } while (0)
#endif /* NET_MPSAFE */
#define SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE() \
do { \
SOFTNET_LOCK_UNLESS_NET_MPSAFE(); \
KERNEL_LOCK_UNLESS_NET_MPSAFE(); \
} while (0)
#define SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE() \
do { \
KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); \
SOFTNET_UNLOCK_UNLESS_NET_MPSAFE(); \
} while (0)
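/*
 * Typical use of the combined macros (sketch only): a protocol entry point
 * that is not yet MP-safe takes both softnet_lock and KERNEL_LOCK unless the
 * kernel is built with NET_MPSAFE:
 *
 *	SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
 *	... protocol processing ...
 *	SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
 */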
#endif /* _KERNEL */
#define IFFBITS \
"\020\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5POINTOPOINT\6UNNUMBERED" \
"\7RUNNING\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX" \
"\15LINK0\16LINK1\17LINK2\20MULTICAST"
/* flags set internally only: */
#define IFF_CANTCHANGE \
(IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\
IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC)
/*
* Some convenience macros used for setting ifi_baudrate.
*/
#define IF_Kbps(x) ((x) * 1000ULL) /* kilobits/sec. */
#define IF_Mbps(x) (IF_Kbps((x) * 1000ULL)) /* megabits/sec. */
#define IF_Gbps(x) (IF_Mbps((x) * 1000ULL)) /* gigabits/sec. */
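/*
 * Example (illustrative): a driver for 1 Gb/s hardware would typically set
 *
 *	ifp->if_baudrate = IF_Gbps(1);
 *
 * in its attach routine.
 */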
/* Capabilities that interfaces can advertise. */
/* 0x01 .. 0x40 were previously used */
#define IFCAP_TSOv4 0x00080 /* can do TCPv4 segmentation offload */
#define IFCAP_CSUM_IPv4_Rx 0x00100 /* can do IPv4 header checksums (Rx) */
#define IFCAP_CSUM_IPv4_Tx 0x00200 /* can do IPv4 header checksums (Tx) */
#define IFCAP_CSUM_TCPv4_Rx 0x00400 /* can do IPv4/TCP checksums (Rx) */
#define IFCAP_CSUM_TCPv4_Tx 0x00800 /* can do IPv4/TCP checksums (Tx) */
#define IFCAP_CSUM_UDPv4_Rx 0x01000 /* can do IPv4/UDP checksums (Rx) */
#define IFCAP_CSUM_UDPv4_Tx 0x02000 /* can do IPv4/UDP checksums (Tx) */
#define IFCAP_CSUM_TCPv6_Rx 0x04000 /* can do IPv6/TCP checksums (Rx) */
#define IFCAP_CSUM_TCPv6_Tx 0x08000 /* can do IPv6/TCP checksums (Tx) */
#define IFCAP_CSUM_UDPv6_Rx 0x10000 /* can do IPv6/UDP checksums (Rx) */
#define IFCAP_CSUM_UDPv6_Tx 0x20000 /* can do IPv6/UDP checksums (Tx) */
#define IFCAP_TSOv6 0x40000 /* can do TCPv6 segmentation offload */
#define IFCAP_LRO 0x80000 /* can do Large Receive Offload */
#define IFCAP_MASK 0xfff80 /* currently valid capabilities */
#define IFCAPBITS \
"\020" \
"\10TSO4" \
"\11IP4CSUM_Rx" \
"\12IP4CSUM_Tx" \
"\13TCP4CSUM_Rx" \
"\14TCP4CSUM_Tx" \
"\15UDP4CSUM_Rx" \
"\16UDP4CSUM_Tx" \
"\17TCP6CSUM_Rx" \
"\20TCP6CSUM_Tx" \
"\21UDP6CSUM_Rx" \
"\22UDP6CSUM_Tx" \
"\23TSO6" \
"\24LRO" \
#define IF_AFDATA_LOCK_INIT(ifp) \
do {(ifp)->if_afdata_lock = rw_obj_alloc();} while (0)
#define IF_AFDATA_LOCK_DESTROY(ifp) rw_obj_free((ifp)->if_afdata_lock)
#define IF_AFDATA_WLOCK(ifp) rw_enter((ifp)->if_afdata_lock, RW_WRITER)
#define IF_AFDATA_RLOCK(ifp) rw_enter((ifp)->if_afdata_lock, RW_READER)
#define IF_AFDATA_WUNLOCK(ifp) rw_exit((ifp)->if_afdata_lock)
#define IF_AFDATA_RUNLOCK(ifp) rw_exit((ifp)->if_afdata_lock)
#define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp)
#define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp)
#define IF_AFDATA_TRYLOCK(ifp) rw_tryenter((ifp)->if_afdata_lock, RW_WRITER)
#define IF_AFDATA_LOCK_ASSERT(ifp) \
KASSERT(rw_lock_held((ifp)->if_afdata_lock))
#define IF_AFDATA_RLOCK_ASSERT(ifp) \
KASSERT(rw_read_held((ifp)->if_afdata_lock))
#define IF_AFDATA_WLOCK_ASSERT(ifp) \
KASSERT(rw_write_held((ifp)->if_afdata_lock))
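/*
 * Usage sketch (illustrative): per-AF data hanging off if_afdata[] is read
 * under the reader lock and installed or replaced under the writer lock:
 *
 *	IF_AFDATA_RLOCK(ifp);
 *	data = ifp->if_afdata[AF_INET6];	(read-only use of data)
 *	IF_AFDATA_RUNLOCK(ifp);
 */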
/*
* Output queues (ifp->if_snd) and internetwork datagram level (pup level 1)
* input routines have queues of messages stored on ifqueue structures
* (defined above). Entries are added to and deleted from these structures
* by these macros, which should be called with ipl raised to splnet().
*/
#define IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen)
#define IF_DROP(ifq) ((ifq)->ifq_drops++)
#define IF_ENQUEUE(ifq, m) do { \
(m)->m_nextpkt = 0; \
if ((ifq)->ifq_tail == 0) \
(ifq)->ifq_head = m; \
else \
(ifq)->ifq_tail->m_nextpkt = m; \
(ifq)->ifq_tail = m; \
(ifq)->ifq_len++; \
} while (/*CONSTCOND*/0)
#define IF_PREPEND(ifq, m) do { \
(m)->m_nextpkt = (ifq)->ifq_head; \
if ((ifq)->ifq_tail == 0) \
(ifq)->ifq_tail = (m); \
(ifq)->ifq_head = (m); \
(ifq)->ifq_len++; \
} while (/*CONSTCOND*/0)
#define IF_DEQUEUE(ifq, m) do { \
(m) = (ifq)->ifq_head; \
if (m) { \
if (((ifq)->ifq_head = (m)->m_nextpkt) == 0) \
(ifq)->ifq_tail = 0; \
(m)->m_nextpkt = 0; \
(ifq)->ifq_len--; \
} \
} while (/*CONSTCOND*/0)
#define IF_POLL(ifq, m) ((m) = (ifq)->ifq_head)
#define IF_PURGE(ifq) \
do { \
struct mbuf *__m0; \
\
for (;;) { \
IF_DEQUEUE((ifq), __m0); \
if (__m0 == NULL) \
break; \
else \
m_freem(__m0); \
} \
} while (/*CONSTCOND*/ 0)
#define IF_IS_EMPTY(ifq) ((ifq)->ifq_len == 0)
#ifndef IFQ_MAXLEN
#define IFQ_MAXLEN 256
#endif
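/*
 * Classic producer-side use of the macros above (sketch only; "sc_intrq" is
 * a hypothetical software queue in a driver softc), with the ipl raised as
 * required by the comment above:
 *
 *	int s = splnet();
 *	if (IF_QFULL(&sc->sc_intrq)) {
 *		IF_DROP(&sc->sc_intrq);
 *		m_freem(m);
 *	} else
 *		IF_ENQUEUE(&sc->sc_intrq, m);
 *	splx(s);
 */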
#define IFNET_SLOWHZ 1 /* granularity is 1 second */
/*
* Structure defining statistics and other data kept regarding an address
* on a network interface.
*/
struct ifaddr_data {
int64_t ifad_inbytes;
int64_t ifad_outbytes;
};
/*
* The ifaddr structure contains information about one address
* of an interface. They are maintained by the different address families,
* are allocated and attached when an address is set, and are linked
* together so all addresses for an interface can be located.
*/
struct ifaddr {
struct sockaddr *ifa_addr; /* address of interface */
struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */
#define ifa_broadaddr ifa_dstaddr /* broadcast address interface */
struct sockaddr *ifa_netmask; /* used to determine subnet */
struct ifnet *ifa_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifaddr) ifa_list; /* list of addresses for interface */
struct ifaddr_data ifa_data; /* statistics on the address */
void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */
(int, struct rtentry *, const struct rt_addrinfo *);
u_int ifa_flags; /* mostly rt_flags for cloning */
int ifa_refcnt; /* count of references */
int ifa_metric; /* cost of going out this interface */
struct ifaddr *(*ifa_getifa)(struct ifaddr *,
const struct sockaddr *);
uint32_t *ifa_seqno;
int16_t ifa_preference; /* preference level for this address */
#ifdef _KERNEL
struct pslist_entry ifa_pslist_entry;
struct psref_target ifa_psref;
#endif
};
#define IFA_ROUTE RTF_UP /* (0x01) route installed */
#define IFA_DESTROYING 0x2
/*
* Message format for use in obtaining information about interfaces from
* sysctl and the routing socket. We need to force 64-bit alignment if we
* aren't using compatibility definitions.
*/
#if !defined(_KERNEL) || !defined(COMPAT_RTSOCK)
#define __align64 __aligned(sizeof(uint64_t))
#else
#define __align64
#endif
struct if_msghdr {
u_short ifm_msglen __align64;
/* to skip over non-understood messages */
u_char ifm_version; /* future binary compatibility */
u_char ifm_type; /* message type */
int ifm_addrs; /* like rtm_addrs */
int ifm_flags; /* value of if_flags */
u_short ifm_index; /* index for associated ifp */
struct if_data ifm_data __align64;
/* statistics and other data about if */
};
/*
* Message format for use in obtaining information about interface addresses
* from sysctl and the routing socket.
*/
struct ifa_msghdr {
u_short ifam_msglen __align64;
/* to skip over non-understood messages */
u_char ifam_version; /* future binary compatibility */
u_char ifam_type; /* message type */
u_short ifam_index; /* index for associated ifp */
int ifam_flags; /* value of ifa_flags */
int ifam_addrs; /* like rtm_addrs */
pid_t ifam_pid; /* identify sender */
int ifam_addrflags; /* family specific address flags */
int ifam_metric; /* value of ifa_metric */
};
/*
* Message format announcing the arrival or departure of a network interface.
*/
struct if_announcemsghdr {
u_short ifan_msglen __align64;
/* to skip over non-understood messages */
u_char ifan_version; /* future binary compatibility */
u_char ifan_type; /* message type */
u_short ifan_index; /* index for associated ifp */
char ifan_name[IFNAMSIZ]; /* if name, e.g. "en0" */
u_short ifan_what; /* what type of announcement */
};
#define IFAN_ARRIVAL 0 /* interface arrival */
#define IFAN_DEPARTURE 1 /* interface departure */
#undef __align64
/*
* Interface request structure used for socket
* ioctl's. All interface ioctl's must have parameter
* definitions which begin with ifr_name. The
* remainder may be interface specific.
*/
struct ifreq {
char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
union {
struct sockaddr ifru_addr;
struct sockaddr ifru_dstaddr;
struct sockaddr ifru_broadaddr;
struct sockaddr_storage ifru_space;
short ifru_flags;
int ifru_addrflags;
int ifru_metric;
int ifru_mtu;
int ifru_dlt;
u_int ifru_value;
void * ifru_data;
struct {
uint32_t b_buflen;
void *b_buf;
} ifru_b;
} ifr_ifru;
#define ifr_addr ifr_ifru.ifru_addr /* address */
#define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */
#define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */
#define ifr_space ifr_ifru.ifru_space /* sockaddr_storage */
#define ifr_flags ifr_ifru.ifru_flags /* flags */
#define ifr_addrflags ifr_ifru.ifru_addrflags /* addr flags */
#define ifr_metric ifr_ifru.ifru_metric /* metric */
#define ifr_mtu ifr_ifru.ifru_mtu /* mtu */
#define ifr_dlt ifr_ifru.ifru_dlt /* data link type (DLT_*) */
#define ifr_value ifr_ifru.ifru_value /* generic value */
#define ifr_media ifr_ifru.ifru_metric /* media options (overload) */
#define ifr_data ifr_ifru.ifru_data /* for use by interface
* XXX deprecated
*/
#define ifr_buf ifr_ifru.ifru_b.b_buf /* new interface ioctls */
#define ifr_buflen ifr_ifru.ifru_b.b_buflen
#define ifr_index ifr_ifru.ifru_value /* interface index, BSD */
#define ifr_ifindex ifr_index /* interface index, linux */
};
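/*
 * Userland sketch (illustrative): struct ifreq is the argument of most
 * per-interface ioctls, e.g. querying the flags of a hypothetical "wm0":
 *
 *	struct ifreq ifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "wm0", sizeof(ifr.ifr_name));
 *	if (ioctl(s, SIOCGIFFLAGS, &ifr) == 0 &&
 *	    (ifr.ifr_flags & IFF_UP) != 0)
 *		... the interface is administratively up ...
 */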
#ifdef _KERNEL
#define ifreq_setdstaddr ifreq_setaddr
#define ifreq_setbroadaddr ifreq_setaddr
#define ifreq_getdstaddr ifreq_getaddr
#define ifreq_getbroadaddr ifreq_getaddr
static __inline const struct sockaddr *
/*ARGSUSED*/
ifreq_getaddr(u_long cmd, const struct ifreq *ifr)
{
return &ifr->ifr_addr;
}
#endif /* _KERNEL */
struct ifcapreq {
char ifcr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
uint64_t ifcr_capabilities; /* supported capabilities */
uint64_t ifcr_capenable; /* capabilities enabled */
};
struct ifaliasreq {
char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct sockaddr ifra_addr;
struct sockaddr ifra_dstaddr;
#define ifra_broadaddr ifra_dstaddr
struct sockaddr ifra_mask;
};
struct ifdatareq {
char ifdr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct if_data ifdr_data;
};
struct ifmediareq {
char ifm_name[IFNAMSIZ]; /* if name, e.g. "en0" */
int ifm_current; /* IFMWD: current media options */
int ifm_mask; /* IFMWD: don't care mask */
int ifm_status; /* media status */
int ifm_active; /* IFMWD: active options */
int ifm_count; /* # entries in ifm_ulist
array */
int *ifm_ulist; /* array of ifmedia word */
};
struct ifdrv {
char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */
unsigned long ifd_cmd;
size_t ifd_len;
void *ifd_data;
};
#define IFLINKSTR_QUERYLEN 0x01
#define IFLINKSTR_UNSET 0x02
/*
* Structure used in SIOCGIFCONF request.
* Used to retrieve interface configuration
* for machine (useful for programs which
* must know all networks accessible).
*/
struct ifconf {
int ifc_len; /* size of associated buffer */
union {
void * ifcu_buf;
struct ifreq *ifcu_req;
} ifc_ifcu;
#define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */
#define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */
};
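/*
 * Userland sketch (illustrative): SIOCGIFCONF fills the caller-supplied
 * buffer with variable-length struct ifreq records (variable because of the
 * embedded sockaddrs) and sets ifc_len to the number of bytes used:
 *
 *	char buf[8192];
 *	struct ifconf ifc;
 *
 *	ifc.ifc_len = sizeof(buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, &ifc) == 0)
 *		... walk ifc.ifc_len bytes of records starting at buf ...
 */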
/*
* Structure for SIOC[AGD]LIFADDR
*/
struct if_laddrreq {
char iflr_name[IFNAMSIZ];
unsigned int flags;
#define IFLR_PREFIX 0x8000 /* in: prefix given out: kernel fills id */
#define IFLR_ACTIVE 0x4000 /* in/out: link-layer address activation */
#define IFLR_FACTORY 0x2000 /* in/out: factory link-layer address */
unsigned int prefixlen; /* in/out */
struct sockaddr_storage addr; /* in/out */
struct sockaddr_storage dstaddr; /* out */
};
/*
* Structure for SIOC[SG]IFADDRPREF
*/
struct if_addrprefreq {
char ifap_name[IFNAMSIZ];
int16_t ifap_preference; /* in/out */
struct sockaddr_storage ifap_addr; /* in/out */
};
#include <net/if_arp.h>
#endif /* _NETBSD_SOURCE */
#ifdef _KERNEL
#ifdef ALTQ
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_ENQUEUE((ifq), (m), (err)); \
else { \
if (IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
IF_ENQUEUE((ifq), (m)); \
(err) = 0; \
} \
} \
if ((err)) \
(ifq)->ifq_drops++; \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue((ifq), ALTDQ_REMOVE); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_DEQUEUE((ifq), (m)); \
else \
IF_DEQUEUE((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_POLL(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue((ifq), ALTDQ_POLL); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_POLL((ifq), (m)); \
else \
IF_POLL((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_PURGE(ifq) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_PURGE(ifq); \
else \
IF_PURGE(ifq); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_SET_READY(ifq) \
do { \
(ifq)->altq_flags |= ALTQF_READY; \
} while (/*CONSTCOND*/ 0)
#define IFQ_CLASSIFY(ifq, m, af) \
do { \
KASSERT(((m)->m_flags & M_PKTHDR) != 0); \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) { \
if (ALTQ_NEEDS_CLASSIFY(ifq)) \
(m)->m_pkthdr.pattr_class = (*(ifq)->altq_classify) \
((ifq)->altq_clfier, (m), (af)); \
(m)->m_pkthdr.pattr_af = (af); \
(m)->m_pkthdr.pattr_hdr = mtod((m), void *); \
} \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#else /* ! ALTQ */
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
IF_ENQUEUE((ifq), (m)); \
(err) = 0; \
} \
if (err) \
(ifq)->ifq_drops++; \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_DEQUEUE((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_POLL(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_POLL((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_PURGE(ifq) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_PURGE(ifq); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_SET_READY(ifq) /* nothing */
#define IFQ_CLASSIFY(ifq, m, af) /* nothing */
#endif /* ALTQ */
#define IFQ_LOCK_INIT(ifq) (ifq)->ifq_lock = \
mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET)
#define IFQ_LOCK_DESTROY(ifq) mutex_obj_free((ifq)->ifq_lock)
#define IFQ_LOCK(ifq) mutex_enter((ifq)->ifq_lock)
#define IFQ_UNLOCK(ifq) mutex_exit((ifq)->ifq_lock)
#define IFQ_IS_EMPTY(ifq) IF_IS_EMPTY(ifq)
#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++)
#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len)
#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++)
#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len))
#include <sys/mallocvar.h>
MALLOC_DECLARE(M_IFADDR);
MALLOC_DECLARE(M_IFMADDR);
int ifreq_setaddr(u_long, struct ifreq *, const struct sockaddr *);
struct ifnet *if_alloc(u_char);
void if_free(struct ifnet *);
void if_initname(struct ifnet *, const char *, int);
struct ifaddr *if_dl_create(const struct ifnet *, const struct sockaddr_dl **);
void if_activate_sadl(struct ifnet *, struct ifaddr *,
const struct sockaddr_dl *);
void if_set_sadl(struct ifnet *, const void *, u_char, bool);
void if_alloc_sadl(struct ifnet *);
void if_free_sadl(struct ifnet *, int);
void if_initialize(struct ifnet *);
void if_register(struct ifnet *);
void if_attach(struct ifnet *); /* Deprecated. Use if_initialize and if_register */
void if_attachdomain(void);
void if_deactivate(struct ifnet *);
bool if_is_deactivated(const struct ifnet *);
void if_export_if_data(struct ifnet *, struct if_data *, bool);
void if_purgeaddrs(struct ifnet *, int, void (*)(struct ifaddr *));
void if_detach(struct ifnet *);
void if_down(struct ifnet *);
void if_down_locked(struct ifnet *);
void if_link_state_change(struct ifnet *, int);
void if_domain_link_state_change(struct ifnet *, int);
void if_up(struct ifnet *);
void ifinit(void);
void ifinit1(void);
void ifinit_post(void);
int ifaddrpref_ioctl(struct socket *, u_long, void *, struct ifnet *);
extern int (*ifioctl)(struct socket *, u_long, void *, struct lwp *);
int ifioctl_common(struct ifnet *, u_long, void *);
int ifpromisc(struct ifnet *, int);
int ifpromisc_locked(struct ifnet *, int);
int if_addr_init(ifnet_t *, struct ifaddr *, bool);
int if_do_dad(struct ifnet *);
int if_mcast_op(ifnet_t *, const unsigned long, const struct sockaddr *);
int if_flags_set(struct ifnet *, const u_short);
int if_clone_list(int, char *, int *);
int if_ioctl(struct ifnet *, u_long, void *);
int if_init(struct ifnet *);
void if_stop(struct ifnet *, int);
struct ifnet *ifunit(const char *);
struct ifnet *if_get(const char *, struct psref *);
ifnet_t *if_byindex(u_int);
ifnet_t *_if_byindex(u_int);
ifnet_t *if_get_byindex(u_int, struct psref *);
ifnet_t *if_get_bylla(const void *, unsigned char, struct psref *);
void if_put(const struct ifnet *, struct psref *);
void if_acquire(struct ifnet *, struct psref *);
#define if_release if_put
int if_tunnel_check_nesting(struct ifnet *, struct mbuf *, int);
percpu_t *if_tunnel_alloc_ro_percpu(void);
void if_tunnel_free_ro_percpu(percpu_t *);
void if_tunnel_ro_percpu_rtcache_free(percpu_t *);
struct tunnel_ro {
struct route *tr_ro;
kmutex_t *tr_lock;
};
static inline void
if_tunnel_get_ro(percpu_t *ro_percpu, struct route **ro, kmutex_t **lock)
{
struct tunnel_ro *tro;
tro = percpu_getref(ro_percpu);
*ro = tro->tr_ro;
*lock = tro->tr_lock;
mutex_enter(*lock);
}
static inline void
if_tunnel_put_ro(percpu_t *ro_percpu, kmutex_t *lock)
{
mutex_exit(lock);
percpu_putref(ro_percpu);
}
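/*
 * Usage sketch (illustrative; "sc_ro_percpu" is a hypothetical member of a
 * tunnel driver softc): the per-CPU cached route is only used under the lock
 * handed back by if_tunnel_get_ro() and released with if_tunnel_put_ro():
 *
 *	struct route *ro;
 *	kmutex_t *lock;
 *
 *	if_tunnel_get_ro(sc->sc_ro_percpu, &ro, &lock);
 *	rt = rtcache_lookup(ro, dst);
 *	... transmit via rt ...
 *	if_tunnel_put_ro(sc->sc_ro_percpu, lock);
 */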
static __inline if_index_t
if_get_index(const struct ifnet *ifp)
{
return ifp != NULL ? ifp->if_index : 0;
}
bool if_held(struct ifnet *);
void if_input(struct ifnet *, struct mbuf *);
struct if_percpuq *
if_percpuq_create(struct ifnet *);
void if_percpuq_destroy(struct if_percpuq *);
void
if_percpuq_enqueue(struct if_percpuq *, struct mbuf *);
void if_deferred_start_init(struct ifnet *, void (*)(struct ifnet *));
void if_schedule_deferred_start(struct ifnet *);
void ifa_insert(struct ifnet *, struct ifaddr *);
void ifa_remove(struct ifnet *, struct ifaddr *);
void ifa_psref_init(struct ifaddr *);
void ifa_acquire(struct ifaddr *, struct psref *);
void ifa_release(struct ifaddr *, struct psref *);
bool ifa_held(struct ifaddr *);
bool ifa_is_destroying(struct ifaddr *);
void ifaref(struct ifaddr *);
void ifafree(struct ifaddr *);
struct ifaddr *ifa_ifwithaddr(const struct sockaddr *);
struct ifaddr *ifa_ifwithaddr_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifa_ifwithaf(int);
struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *);
struct ifaddr *ifa_ifwithdstaddr_psref(const struct sockaddr *,
struct psref *);
struct ifaddr *ifa_ifwithnet(const struct sockaddr *);
struct ifaddr *ifa_ifwithnet_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifa_ifwithladdr(const struct sockaddr *);
struct ifaddr *ifa_ifwithladdr_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *);
struct ifaddr *ifaof_ifpforaddr_psref(const struct sockaddr *, struct ifnet *,
struct psref *);
void link_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
void p2p_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
void if_clone_attach(struct if_clone *);
void if_clone_detach(struct if_clone *);
int if_transmit_lock(struct ifnet *, struct mbuf *);
int ifq_enqueue(struct ifnet *, struct mbuf *);
int ifq_enqueue2(struct ifnet *, struct ifqueue *, struct mbuf *);
int loioctl(struct ifnet *, u_long, void *);
void loopattach(int);
void loopinit(void);
int looutput(struct ifnet *,
struct mbuf *, const struct sockaddr *, const struct rtentry *);
void * if_linkstate_change_establish(struct ifnet *,
void (*)(void *), void *);
void if_linkstate_change_disestablish(struct ifnet *,
void *, kmutex_t *);
/*
* These are exported because they're an easy way to tell if
* an interface is going away without having to burn a flag.
*/
int if_nulloutput(struct ifnet *, struct mbuf *,
const struct sockaddr *, const struct rtentry *);
void if_nullinput(struct ifnet *, struct mbuf *);
void if_nullstart(struct ifnet *);
int if_nulltransmit(struct ifnet *, struct mbuf *);
int if_nullioctl(struct ifnet *, u_long, void *);
int if_nullinit(struct ifnet *);
void if_nullstop(struct ifnet *, int);
void if_nullslowtimo(struct ifnet *);
#define if_nullwatchdog if_nullslowtimo
void if_nulldrain(struct ifnet *);
#else
struct if_nameindex {
unsigned int if_index; /* 1, 2, ... */
char *if_name; /* null terminated name: "le0", ... */
};
#include <sys/cdefs.h>
__BEGIN_DECLS
unsigned int if_nametoindex(const char *);
char * if_indextoname(unsigned int, char *);
struct if_nameindex * if_nameindex(void);
void if_freenameindex(struct if_nameindex *);
__END_DECLS
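/*
 * Userland sketch (illustrative) of the name/index functions declared above:
 *
 *	unsigned int idx = if_nametoindex("wm0");
 *	if (idx == 0)
 *		err(EXIT_FAILURE, "if_nametoindex");
 */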
#endif /* _KERNEL */ /* XXX really ALTQ? */
#ifdef _KERNEL
#define IFADDR_FIRST(__ifp) TAILQ_FIRST(&(__ifp)->if_addrlist)
#define IFADDR_NEXT(__ifa) TAILQ_NEXT((__ifa), ifa_list)
#define IFADDR_FOREACH(__ifa, __ifp) TAILQ_FOREACH(__ifa, \
&(__ifp)->if_addrlist, ifa_list)
#define IFADDR_FOREACH_SAFE(__ifa, __ifp, __nifa) \
TAILQ_FOREACH_SAFE(__ifa, \
&(__ifp)->if_addrlist, ifa_list, __nifa)
#define IFADDR_EMPTY(__ifp) TAILQ_EMPTY(&(__ifp)->if_addrlist)
#define IFADDR_ENTRY_INIT(__ifa) \
PSLIST_ENTRY_INIT((__ifa), ifa_pslist_entry)
#define IFADDR_ENTRY_DESTROY(__ifa) \
PSLIST_ENTRY_DESTROY((__ifa), ifa_pslist_entry)
#define IFADDR_READER_EMPTY(__ifp) \
(PSLIST_READER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry) == NULL)
#define IFADDR_READER_FIRST(__ifp) \
PSLIST_READER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry)
#define IFADDR_READER_NEXT(__ifa) \
PSLIST_READER_NEXT((__ifa), struct ifaddr, ifa_pslist_entry)
#define IFADDR_READER_FOREACH(__ifa, __ifp) \
PSLIST_READER_FOREACH((__ifa), &(__ifp)->if_addr_pslist, struct ifaddr,\
ifa_pslist_entry)
#define IFADDR_WRITER_INSERT_HEAD(__ifp, __ifa) \
PSLIST_WRITER_INSERT_HEAD(&(__ifp)->if_addr_pslist, (__ifa), \
ifa_pslist_entry)
#define IFADDR_WRITER_REMOVE(__ifa) \
PSLIST_WRITER_REMOVE((__ifa), ifa_pslist_entry)
#define IFADDR_WRITER_FOREACH(__ifa, __ifp) \
PSLIST_WRITER_FOREACH((__ifa), &(__ifp)->if_addr_pslist, struct ifaddr,\
ifa_pslist_entry)
#define IFADDR_WRITER_NEXT(__ifp) \
PSLIST_WRITER_NEXT((__ifp), struct ifaddr, ifa_pslist_entry)
#define IFADDR_WRITER_INSERT_AFTER(__ifp, __new) \
PSLIST_WRITER_INSERT_AFTER((__ifp), (__new), ifa_pslist_entry)
#define IFADDR_WRITER_EMPTY(__ifp) \
(PSLIST_WRITER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry) == NULL)
#define IFADDR_WRITER_INSERT_TAIL(__ifp, __new) \
do { \
if (IFADDR_WRITER_EMPTY(__ifp)) { \
IFADDR_WRITER_INSERT_HEAD((__ifp), (__new)); \
} else { \
struct ifaddr *__ifa; \
IFADDR_WRITER_FOREACH(__ifa, (__ifp)) { \
if (IFADDR_WRITER_NEXT(__ifa) == NULL) {\
IFADDR_WRITER_INSERT_AFTER(__ifa,\
(__new)); \
break; \
} \
} \
} \
} while (0)
#define IFNET_GLOBAL_LOCK() mutex_enter(&ifnet_mtx)
#define IFNET_GLOBAL_UNLOCK() mutex_exit(&ifnet_mtx)
#define IFNET_GLOBAL_LOCKED() mutex_owned(&ifnet_mtx)
#define IFNET_READER_EMPTY() \
(PSLIST_READER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) == NULL)
#define IFNET_READER_FIRST() \
PSLIST_READER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry)
#define IFNET_READER_NEXT(__ifp) \
PSLIST_READER_NEXT((__ifp), struct ifnet, if_pslist_entry)
#define IFNET_READER_FOREACH(__ifp) \
PSLIST_READER_FOREACH((__ifp), &ifnet_pslist, struct ifnet, \
if_pslist_entry)
#define IFNET_WRITER_INSERT_HEAD(__ifp) \
PSLIST_WRITER_INSERT_HEAD(&ifnet_pslist, (__ifp), if_pslist_entry)
#define IFNET_WRITER_REMOVE(__ifp) \
PSLIST_WRITER_REMOVE((__ifp), if_pslist_entry)
#define IFNET_WRITER_FOREACH(__ifp) \
PSLIST_WRITER_FOREACH((__ifp), &ifnet_pslist, struct ifnet, \
if_pslist_entry)
#define IFNET_WRITER_NEXT(__ifp) \
PSLIST_WRITER_NEXT((__ifp), struct ifnet, if_pslist_entry)
#define IFNET_WRITER_INSERT_AFTER(__ifp, __new) \
PSLIST_WRITER_INSERT_AFTER((__ifp), (__new), if_pslist_entry)
#define IFNET_WRITER_EMPTY() \
(PSLIST_WRITER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) == NULL)
#define IFNET_WRITER_INSERT_TAIL(__new) \
do { \
if (IFNET_WRITER_EMPTY()) { \
IFNET_WRITER_INSERT_HEAD(__new); \
} else { \
struct ifnet *__ifp; \
IFNET_WRITER_FOREACH(__ifp) { \
if (IFNET_WRITER_NEXT(__ifp) == NULL) { \
IFNET_WRITER_INSERT_AFTER(__ifp,\
(__new)); \
break; \
} \
} \
} \
} while (0)
#define IFNET_LOCK(ifp) mutex_enter((ifp)->if_ioctl_lock)
#define IFNET_UNLOCK(ifp) mutex_exit((ifp)->if_ioctl_lock)
#define IFNET_LOCKED(ifp) mutex_owned((ifp)->if_ioctl_lock)
#define IFNET_ASSERT_UNLOCKED(ifp) \
KDASSERT(mutex_ownable((ifp)->if_ioctl_lock))
extern struct pslist_head ifnet_pslist;
extern kmutex_t ifnet_mtx;
extern struct ifnet *lo0ifp;
/*
* ifq sysctl support
*/
int sysctl_ifq(int *name, u_int namelen, void *oldp,
size_t *oldlenp, void *newp, size_t newlen,
struct ifqueue *ifq);
/* symbolic names for terminal (per-protocol) CTL_IFQ_ nodes */
#define IFQCTL_LEN 1
#define IFQCTL_MAXLEN 2
#define IFQCTL_PEAK 3
#define IFQCTL_DROPS 4
/*
* Hook for if_vlan - needed by if_agr
*/
MODULE_HOOK(if_vlan_vlan_input_hook,
struct mbuf *, (struct ifnet *, struct mbuf *));
#endif /* _KERNEL */
#endif /* !_NET_IF_H_ */
/* $NetBSD: uvm_fault.c,v 1.237 2024/03/15 07:09:37 andvar Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp
*/
/*
* uvm_fault.c: fault handler
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_fault.c,v 1.237 2024/03/15 07:09:37 andvar Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_rndsource.h>
/*
*
* a word on page faults:
*
* types of page faults we handle:
*
* CASE 1: upper layer faults CASE 2: lower layer faults
*
* CASE 1A CASE 1B CASE 2A CASE 2B
* read/write1 write>1 read/write +-cow_write/zero
* | | | |
* +--|--+ +--|--+ +-----+ + | + | +-----+
* amap | V | | ---------> new | | | | ^ |
* +-----+ +-----+ +-----+ + | + | +--|--+
* | | |
* +-----+ +-----+ +--|--+ | +--|--+
* uobj | d/c | | d/c | | V | +----+ |
* +-----+ +-----+ +-----+ +-----+
*
* d/c = don't care
*
* case [0]: layerless fault
* no amap or uobj is present. this is an error.
*
* case [1]: upper layer fault [anon active]
* 1A: [read] or [write with anon->an_ref == 1]
* I/O takes place in upper level anon and uobj is not touched.
* 1B: [write with anon->an_ref > 1]
* new anon is alloc'd and data is copied off ["COW"]
*
* case [2]: lower layer fault [uobj]
* 2A: [read on non-NULL uobj] or [write to non-copy_on_write area]
* I/O takes place directly in object.
* 2B: [write to copy_on_write] or [read on NULL uobj]
* data is "promoted" from uobj to a new anon.
* if uobj is null, then we zero fill.
*
* we follow the standard UVM locking protocol ordering:
*
* MAPS => AMAP => UOBJ => ANON => PAGE QUEUES (PQ)
* we hold a PG_BUSY page if we unlock for I/O
*
*
* the code is structured as follows:
*
* - init the "IN" params in the ufi structure
* ReFault: (ERESTART returned to the loop in uvm_fault_internal)
* - do lookups [locks maps], check protection, handle needs_copy
* - check for case 0 fault (error)
* - establish "range" of fault
* - if we have an amap lock it and extract the anons
* - if sequential advice deactivate pages behind us
* - at the same time check pmap for unmapped areas and anon for pages
* that we could map in (and do map it if found)
* - check object for resident pages that we could map in
* - if (case 2) goto Case2
* - >>> handle case 1
* - ensure source anon is resident in RAM
* - if case 1B alloc new anon and copy from source
* - map the correct page in
* Case2:
* - >>> handle case 2
* - ensure source page is resident (if uobj)
* - if case 2B alloc new anon and copy from source (could be zero
* fill if uobj == NULL)
* - map the correct page in
* - done!
*
* note on paging:
* if we have to do I/O we place a PG_BUSY page in the correct object,
* unlock everything, and do the I/O. when I/O is done we must reverify
* the state of the world before assuming that our data structures are
* valid. [because mappings could change while the map is unlocked]
*
* alternative 1: unbusy the page in question and restart the page fault
* from the top (ReFault). this is easy but does not take advantage
* of the information that we already have from our previous lookup,
* although it is possible that the "hints" in the vm_map will help here.
*
* alternative 2: the system already keeps track of a "version" number of
* a map. [i.e. every time you write-lock a map (e.g. to change a
* mapping) you bump the version number up by one...] so, we can save
* the version number of the map before we release the lock and start I/O.
* then when I/O is done we can relock and check the version numbers
* to see if anything changed. this might save us something over
* alternative 1 because we don't have to unbusy the page and there
* may be fewer compares(?).
*
* alternative 3: put in backpointers or a way to "hold" part of a map
* in place while I/O is in progress. this could be complex to
* implement (especially with structures like amap that can be referenced
* by multiple map entries, and figuring out what should wait could be
* complex as well...).
*
* we use alternative 2. given that we are multi-threaded now we may want
* to reconsider the choice.
*/
/*
* local data structures
*/
struct uvm_advice {
int advice;
int nback;
int nforw;
};
/*
* page range array:
* note: index in array must match "advice" value
* XXX: borrowed numbers from freebsd. do they work well for us?
*/
static const struct uvm_advice uvmadvice[] = {
{ UVM_ADV_NORMAL, 3, 4 },
{ UVM_ADV_RANDOM, 0, 0 },
{ UVM_ADV_SEQUENTIAL, 8, 7},
};
#define UVM_MAXRANGE 16 /* must be MAX() of nback+nforw+1 */
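/*
 * Illustrative addition (not in the original source): a compile-time check
 * that UVM_MAXRANGE covers the largest nback + nforw + 1 in uvmadvice[],
 * i.e. 8 + 7 + 1 for UVM_ADV_SEQUENTIAL, could be written as
 *
 *	__CTASSERT(UVM_MAXRANGE >= 8 + 7 + 1);
 */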
/*
* private prototypes
*/
/*
* inline functions
*/
/*
* uvmfault_anonflush: try and deactivate pages in specified anons
*
* => does not have to deactivate page if it is busy
*/
static inline void
uvmfault_anonflush(struct vm_anon **anons, int n)
{
int lcv;
struct vm_page *pg;
for (lcv = 0; lcv < n; lcv++) {
if (anons[lcv] == NULL)
continue;
KASSERT(rw_lock_held(anons[lcv]->an_lock));
pg = anons[lcv]->an_page;
if (pg && (pg->flags & PG_BUSY) == 0) {
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
}
}
}
/*
* normal functions
*/
/*
* uvmfault_amapcopy: clear "needs_copy" in a map.
*
* => called with VM data structures unlocked (usually, see below)
* => we get a write lock on the maps and clear needs_copy for a VA
* => if we are out of RAM we sleep (waiting for more)
*/
static void
uvmfault_amapcopy(struct uvm_faultinfo *ufi)
{
for (;;) {
/*
* no mapping? give up.
*/
if (uvmfault_lookup(ufi, true) == false)
return;
/*
* copy if needed.
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry))
amap_copy(ufi->map, ufi->entry, AMAP_COPY_NOWAIT,
ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
/*
* didn't work? must be out of RAM. unlock and sleep.
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
uvmfault_unlockmaps(ufi, true);
uvm_wait("fltamapcopy");
continue;
}
/*
* got it! unlock and return.
*/
uvmfault_unlockmaps(ufi, true);
return;
}
/*NOTREACHED*/
}
/*
* uvmfault_anonget: get data in an anon into a non-busy, non-released
* page in that anon.
*
* => Map, amap and thus anon should be locked by caller.
* => If we fail, we unlock everything and error is returned.
* => If we are successful, return with everything still locked.
* => We do not move the page on the queues [gets moved later]. If we
* allocate a new page [we_own], it gets put on the queues. Either way,
* the result is that the page is on the queues at return time
* => For pages which are on loan from a uvm_object (and thus are not owned
* by the anon): if successful, return with the owning object locked.
* The caller must unlock this object when it unlocks everything else.
*/
int
uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
struct vm_anon *anon)
{
struct vm_page *pg;
krw_t lock_type;
int error __unused; /* used for VMSWAP */
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_lock_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);
/* Increment the counters.*/
cpu_count(CPU_COUNT_FLTANGET, 1);
if (anon->an_page) {
curlwp->l_ru.ru_minflt++;
} else {
curlwp->l_ru.ru_majflt++;
}
error = 0;
/*
* Loop until we get the anon data, or fail.
*/
for (;;) {
bool we_own, locked;
/*
* Note: 'we_own' will become true if we set PG_BUSY on a page.
*/
we_own = false;
pg = anon->an_page;
/*
* If there is a resident page and it is loaned, then anon
* may not own it. Call out to uvm_anon_lockloanpg() to
* identify and lock the real owner of the page.
*/
if (pg && pg->loan_count)
pg = uvm_anon_lockloanpg(anon);
/*
* Is page resident? Make sure it is not busy/released.
*/
lock_type = rw_lock_op(anon->an_lock);
if (pg) {
/*
* at this point, if the page has a uobject [meaning
* we have it on loan], then that uobject is locked
* by us! if the page is busy, we drop all the
* locks (including uobject) and try again.
*/
if ((pg->flags & PG_BUSY) == 0) {
UVMHIST_LOG(maphist, "<- OK",0,0,0,0);
return 0;
}
cpu_count(CPU_COUNT_FLTPGWAIT, 1);
/*
* The last unlock must be an atomic unlock and wait
* on the owner of page.
*/
if (pg->uobject) {
/* Owner of page is UVM object. */
uvmfault_unlockall(ufi, amap, NULL);
UVMHIST_LOG(maphist, " unlock+wait on uobj",0,
0,0,0);
uvm_pagewait(pg, pg->uobject->vmobjlock, "anonget1");
} else {
/* Owner of page is anon. */
uvmfault_unlockall(ufi, NULL, NULL);
UVMHIST_LOG(maphist, " unlock+wait on anon",0,
0,0,0);
uvm_pagewait(pg, anon->an_lock, "anonget2");
}
} else {
#if defined(VMSWAP)
/*
* No page, therefore allocate one. A write lock is
* required for this. If the caller didn't supply
* one, fail now and have them retry.
*/
if (lock_type == RW_READER) {
return ENOLCK;
}
pg = uvm_pagealloc(NULL,
ufi != NULL ? ufi->orig_rvaddr : 0,
anon, ufi != NULL ? UVM_FLAG_COLORMATCH : 0);
if (pg == NULL) {
/* Out of memory. Wait a little. */
uvmfault_unlockall(ufi, amap, NULL);
cpu_count(CPU_COUNT_FLTNORAM, 1);
UVMHIST_LOG(maphist, " noram -- UVM_WAIT",0,
0,0,0);
if (!uvm_reclaimable()) {
return ENOMEM;
}
uvm_wait("flt_noram1");
} else {
/* PG_BUSY bit is set. */
we_own = true;
uvmfault_unlockall(ufi, amap, NULL);
/*
* Pass a PG_BUSY+PG_FAKE clean page into
* the uvm_swap_get() function with all data
* structures unlocked. Note that it is OK
* to read an_swslot here, because we hold
* PG_BUSY on the page.
*/
cpu_count(CPU_COUNT_PAGEINS, 1);
error = uvm_swap_get(pg, anon->an_swslot,
PGO_SYNCIO);
/*
* We clean up after the I/O below in the
* 'we_own' case.
*/
}
#else
panic("%s: no page", __func__);
#endif /* defined(VMSWAP) */
}
/*
* Re-lock the map and anon.
*/
locked = uvmfault_relock(ufi);
if (locked || we_own) {
rw_enter(anon->an_lock, lock_type);
}
/*
* If we own the page (i.e. we set PG_BUSY), then we need
* to clean up after the I/O. There are three cases to
* consider:
*
* 1) Page was released during I/O: free anon and ReFault.
* 2) I/O not OK. Free the page and cause the fault to fail.
* 3) I/O OK! Activate the page and sync with the non-we_own
* case (i.e. drop anon lock if not locked).
*/
if (we_own) {
KASSERT(lock_type == RW_WRITER);
#if defined(VMSWAP)
if (error) {
/*
* Remove the swap slot from the anon and
* mark the anon as having no real slot.
* Do not free the swap slot, thus preventing
* it from being used again.
*/
if (anon->an_swslot > 0) {
uvm_swap_markbad(anon->an_swslot, 1);
}
anon->an_swslot = SWSLOT_BAD;
if ((pg->flags & PG_RELEASED) != 0) {
goto released;
}
/*
* Note: page was never !PG_BUSY, so it
* cannot be mapped and thus no need to
* pmap_page_protect() it.
*/
uvm_pagefree(pg);
if (locked) {
uvmfault_unlockall(ufi, NULL, NULL);
}
rw_exit(anon->an_lock);
UVMHIST_LOG(maphist, "<- ERROR", 0,0,0,0);
return error;
}
if ((pg->flags & PG_RELEASED) != 0) {
released:
KASSERT(anon->an_ref == 0);
/*
* Released while we had unlocked amap.
*/
if (locked) {
uvmfault_unlockall(ufi, NULL, NULL);
}
uvm_anon_release(anon);
if (error) {
UVMHIST_LOG(maphist,
"<- ERROR/RELEASED", 0,0,0,0);
return error;
}
UVMHIST_LOG(maphist, "<- RELEASED", 0,0,0,0);
return ERESTART;
}
/*
* We have successfully read the page, activate it.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
UVM_PAGE_OWN(pg, NULL);
#else
panic("%s: we_own", __func__);
#endif /* defined(VMSWAP) */
}
/*
* We were not able to re-lock the map - restart the fault.
*/
if (!locked) {
if (we_own) {
rw_exit(anon->an_lock);
}
UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0);
return ERESTART;
}
/*
* Verify that no one has touched the amap and moved
* the anon on us.
*/
if (ufi != NULL && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start) != anon) {
uvmfault_unlockall(ufi, amap, NULL);
UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0);
return ERESTART;
}
/*
* Retry..
*/
cpu_count(CPU_COUNT_FLTANRETRY, 1);
continue;
}
/*NOTREACHED*/
}
/*
* uvmfault_promote: promote data to a new anon. used for 1B and 2B.
*
* 1. allocate an anon and a page.
* 2. fill its contents.
* 3. put it into amap.
*
* => if we fail (result != 0) we unlock everything.
* => on success, return a new locked anon via 'nanon'.
* (*nanon)->an_page will be a resident, locked, dirty page.
* => it is the caller's responsibility to put the promoted nanon->an_page on the
* page queue.
*/
static int
uvmfault_promote(struct uvm_faultinfo *ufi,
struct vm_anon *oanon,
struct vm_page *uobjpage,
struct vm_anon **nanon, /* OUT: allocated anon */
struct vm_anon **spare)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj;
struct vm_anon *anon;
struct vm_page *pg;
struct vm_page *opg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (oanon) {
/* anon COW */
opg = oanon->an_page;
KASSERT(opg != NULL);
KASSERT(opg->uobject == NULL || opg->loan_count > 0);
} else if (uobjpage != PGO_DONTCARE) {
/* object-backed COW */
opg = uobjpage;
KASSERT(rw_lock_held(opg->uobject->vmobjlock));
} else {
/* ZFOD */
opg = NULL;
}
if (opg != NULL) {
uobj = opg->uobject;
} else {
uobj = NULL;
}
KASSERT(amap != NULL);
KASSERT(uobjpage != NULL);
KASSERT(rw_write_held(amap->am_lock));
KASSERT(oanon == NULL || amap->am_lock == oanon->an_lock);
KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
if (*spare != NULL) {
anon = *spare;
*spare = NULL;
} else {
anon = uvm_analloc();
}
if (anon) {
/*
* The new anon is locked.
*
* if opg == NULL, we want a zero'd, dirty page,
* so have uvm_pagealloc() do that for us.
*/
KASSERT(anon->an_lock == NULL);
anon->an_lock = amap->am_lock;
pg = uvm_pagealloc(NULL, ufi->orig_rvaddr, anon,
UVM_FLAG_COLORMATCH | (opg == NULL ? UVM_PGA_ZERO : 0));
if (pg == NULL) {
anon->an_lock = NULL;
}
} else {
pg = NULL;
}
/*
* out of memory resources?
*/
if (pg == NULL) {
/* save anon for the next try. */
if (anon != NULL) {
*spare = anon;
}
/* unlock and fail ... */
uvmfault_unlockall(ufi, amap, uobj);
if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist, "out of VM", 0,0,0,0);
cpu_count(CPU_COUNT_FLTNOANON, 1);
error = ENOMEM;
goto done;
}
UVMHIST_LOG(maphist, "out of RAM, waiting for more", 0,0,0,0);
cpu_count(CPU_COUNT_FLTNORAM, 1);
uvm_wait("flt_noram5");
error = ERESTART;
goto done;
}
/*
* copy the page [pg now dirty]
*
* Remove the pmap entry now for the old page at this address
* so that no thread can modify the new page while any thread
* might still see the old page.
*/
if (opg) {
pmap_remove(vm_map_pmap(ufi->orig_map), ufi->orig_rvaddr,
ufi->orig_rvaddr + PAGE_SIZE);
pmap_update(vm_map_pmap(ufi->orig_map));
uvm_pagecopy(opg, pg);
}
KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY);
amap_add(&ufi->entry->aref, ufi->orig_rvaddr - ufi->entry->start, anon,
oanon != NULL);
/*
* from this point on am_lock won't be dropped until the page is
* entered, so it's safe to unbusy the page up front.
*
* uvm_fault_{upper,lower}_done will activate or enqueue the page.
*/
pg = anon->an_page;
pg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
*nanon = anon;
error = 0;
done:
return error;
}
/*
* Update statistics after fault resolution.
* - maxrss
*/
void
uvmfault_update_stats(struct uvm_faultinfo *ufi)
{
struct vm_map *map;
struct vmspace *vm;
struct proc *p;
vsize_t res;
map = ufi->orig_map;
p = curproc;
KASSERT(p != NULL);
vm = p->p_vmspace;
if (&vm->vm_map != map)
return;
res = pmap_resident_count(map->pmap);
if (vm->vm_rssmax < res)
vm->vm_rssmax = res;
}
/*
* F A U L T - m a i n e n t r y p o i n t
*/
/*
* uvm_fault: page fault handler
*
* => called from MD code to resolve a page fault
* => VM data structures usually should be unlocked. however, it is
* possible to call here with the main map locked if the caller
* gets a write lock, sets it recursive, and then calls us (c.f.
* uvm_map_pageable). this should be avoided because it keeps
* the map locked off during I/O.
* => MUST NEVER BE CALLED IN INTERRUPT CONTEXT
*/
#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
~VM_PROT_WRITE : VM_PROT_ALL)
/* fault_flag values passed from uvm_fault_wire to uvm_fault_internal */
#define UVM_FAULT_WIRE (1 << 0)
#define UVM_FAULT_MAXPROT (1 << 1)
struct uvm_faultctx {
/*
* the following members are set up by uvm_fault_check() and
* read-only after that.
*
* note that narrow is used by uvm_fault_check() to change
* the behaviour after ERESTART.
*
* most of them might change after ERESTART if the underlying
* map entry has been changed behind us. an exception is
* wire_paging, which never changes.
*/
vm_prot_t access_type;
vaddr_t startva;
int npages;
int centeridx;
bool narrow; /* work on a single requested page only */
bool wire_mapping; /* request a PMAP_WIRED mapping
(UVM_FAULT_WIRE or VM_MAPENT_ISWIRED) */
bool wire_paging; /* request uvm_pagewire
(true for UVM_FAULT_WIRE) */
bool cow_now; /* VM_PROT_WRITE is actually requested
(ie. should break COW and page loaning) */
/*
* enter_prot is set up by uvm_fault_check() and clamped
* (ie. drop the VM_PROT_WRITE bit) in various places in case
* of !cow_now.
*/
vm_prot_t enter_prot; /* prot at which we want to enter pages in */
/*
* the following member is for uvmfault_promote() and ERESTART.
*/
struct vm_anon *anon_spare;
/*
* the following is actually a uvm_fault_lower() internal.
* it's here merely for debugging.
* (or due to the mechanical separation of the function?)
*/
bool promote;
/*
* type of lock to acquire on objects in both layers.
*/
krw_t lower_lock_type;
krw_t upper_lock_type;
};
static inline int uvm_fault_check(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon ***, bool);
static int uvm_fault_upper(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon **);
static inline int uvm_fault_upper_lookup(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_anon **, struct vm_page **);
static inline void uvm_fault_upper_neighbor(
struct uvm_faultinfo *, const struct uvm_faultctx *,
vaddr_t, struct vm_page *, bool);
static inline int uvm_fault_upper_loan(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon *, struct uvm_object **);
static inline int uvm_fault_upper_promote(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *);
static inline int uvm_fault_upper_direct(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *);
static int uvm_fault_upper_enter(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *,
struct vm_page *, struct vm_anon *);
static inline void uvm_fault_upper_done(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_anon *, struct vm_page *);
static int uvm_fault_lower(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_page **);
static inline void uvm_fault_lower_lookup(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_page **);
static inline void uvm_fault_lower_neighbor(
struct uvm_faultinfo *, const struct uvm_faultctx *,
vaddr_t, struct vm_page *);
static inline int uvm_fault_lower_io(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object **, struct vm_page **);
static inline int uvm_fault_lower_direct(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
static inline int uvm_fault_lower_direct_loan(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page **,
struct vm_page **);
static inline int uvm_fault_lower_promote(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
static int uvm_fault_lower_enter(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *,
struct vm_anon *, struct vm_page *);
static inline void uvm_fault_lower_done(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
int
uvm_fault_internal(struct vm_map *orig_map, vaddr_t vaddr,
vm_prot_t access_type, int fault_flag)
{
struct uvm_faultinfo ufi;
struct uvm_faultctx flt = {
.access_type = access_type,
/* don't look for neighborhood pages on "wire" fault */
.narrow = (fault_flag & UVM_FAULT_WIRE) != 0,
/* "wire" fault causes wiring of both mapping and paging */
.wire_mapping = (fault_flag & UVM_FAULT_WIRE) != 0,
.wire_paging = (fault_flag & UVM_FAULT_WIRE) != 0,
/*
* default lock type to acquire on upper & lower layer
* objects: reader. this can be upgraded at any point
* during the fault from read -> write and uvm_faultctx
* changed to match, but is never downgraded write -> read.
*/
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
.upper_lock_type = RW_WRITER,
.lower_lock_type = RW_WRITER,
#else
.upper_lock_type = RW_READER,
.lower_lock_type = RW_READER,
#endif
};
const bool maxprot = (fault_flag & UVM_FAULT_MAXPROT) != 0;
struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
struct vm_page *pages_store[UVM_MAXRANGE], **pages;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, vaddr=%#jx, at=%jd, ff=%jd)",
(uintptr_t)orig_map, vaddr, access_type, fault_flag);
/* Don't count anything until user interaction is possible */
kpreempt_disable();
if (__predict_true(start_init_exec)) {
struct cpu_info *ci = curcpu();
CPU_COUNT(CPU_COUNT_NFAULT, 1);
/* Don't flood RNG subsystem with samples. */
if (++(ci->ci_faultrng) == 503) {
ci->ci_faultrng = 0;
rnd_add_uint32(&uvm_fault_rndsource,
sizeof(vaddr_t) == sizeof(uint32_t) ?
(uint32_t)vaddr : sizeof(vaddr_t) ==
sizeof(uint64_t) ?
(uint32_t)vaddr :
(uint32_t)ci->ci_counts[CPU_COUNT_NFAULT]);
}
}
kpreempt_enable();
/*
* init the IN parameters in the ufi
*/
ufi.orig_map = orig_map;
ufi.orig_rvaddr = trunc_page(vaddr);
ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */
error = ERESTART;
while (error == ERESTART) { /* ReFault: */
anons = anons_store;
pages = pages_store;
error = uvm_fault_check(&ufi, &flt, &anons, maxprot);
if (error != 0)
continue;
error = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
if (error != 0)
continue;
if (pages[flt.centeridx] == PGO_DONTCARE)
error = uvm_fault_upper(&ufi, &flt, anons);
else {
struct uvm_object * const uobj =
ufi.entry->object.uvm_obj;
if (uobj && uobj->pgops->pgo_fault != NULL) {
/*
* invoke "special" fault routine.
*/
rw_enter(uobj->vmobjlock, RW_WRITER);
/* locked: maps(read), amap(if there), uobj */
error = uobj->pgops->pgo_fault(&ufi,
flt.startva, pages, flt.npages,
flt.centeridx, flt.access_type,
PGO_LOCKED|PGO_SYNCIO);
/*
* locked: nothing, pgo_fault has unlocked
* everything
*/
/*
* object fault routine responsible for
* pmap_update().
*/
/*
* Wake up the pagedaemon if the fault method
* failed for lack of memory but some can be
* reclaimed.
*/
if (error == ENOMEM && uvm_reclaimable()) {
uvm_wait("pgo_fault");
error = ERESTART;
}
} else {
error = uvm_fault_lower(&ufi, &flt, pages);
}
}
}
if (flt.anon_spare != NULL) {
flt.anon_spare->an_ref--;
KASSERT(flt.anon_spare->an_ref == 0);
KASSERT(flt.anon_spare->an_lock == NULL);
uvm_anfree(flt.anon_spare);
}
return error;
}
/*
* uvm_fault_check: check prot, handle needs-copy, etc.
*
* 1. lookup entry.
* 2. check protection.
* 3. adjust fault condition (mainly for simulated fault).
* 4. handle needs-copy (lazy amap copy).
* 5. establish range of interest for neighbor fault (aka pre-fault).
* 6. look up anons (if amap exists).
* 7. flush pages (if MADV_SEQUENTIAL)
*
* => called with nothing locked.
* => if we fail (result != 0) we unlock everything.
* => initialize/adjust many members of flt.
*/
static int
uvm_fault_check(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon ***ranons, bool maxprot)
{
struct vm_amap *amap;
struct uvm_object *uobj;
vm_prot_t check_prot;
int nback, nforw;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* lookup and lock the maps
*/
if (uvmfault_lookup(ufi, false) == false) {
UVMHIST_LOG(maphist, "<- no mapping @ %#jx", ufi->orig_rvaddr,
0,0,0);
return EFAULT;
}
/* locked: maps(read) */
#ifdef DIAGNOSTIC
if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0) {
printf("Page fault on non-pageable map:\n");
printf("ufi->map = %p\n", ufi->map);
printf("ufi->orig_map = %p\n", ufi->orig_map);
printf("ufi->orig_rvaddr = %#lx\n", (u_long) ufi->orig_rvaddr);
panic("uvm_fault: (ufi->map->flags & VM_MAP_PAGEABLE) == 0");
}
#endif
/*
* check protection
*/
check_prot = maxprot ?
ufi->entry->max_protection : ufi->entry->protection;
if ((check_prot & flt->access_type) != flt->access_type) {
UVMHIST_LOG(maphist,
"<- protection failure (prot=%#jx, access=%#jx)",
ufi->entry->protection, flt->access_type, 0, 0);
uvmfault_unlockmaps(ufi, false);
return EFAULT;
}
/*
* "enter_prot" is the protection we want to enter the page in at.
* for certain pages (e.g. copy-on-write pages) this protection can
* be more strict than ufi->entry->protection. "wired" means either
* the entry is wired or we are fault-wiring the pg.
*/
flt->enter_prot = ufi->entry->protection;
if (VM_MAPENT_ISWIRED(ufi->entry)) {
flt->wire_mapping = true;
flt->wire_paging = true;
flt->narrow = true;
}
if (flt->wire_mapping) {
flt->access_type = flt->enter_prot; /* full access for wired */
flt->cow_now = (check_prot & VM_PROT_WRITE) != 0;
} else {
flt->cow_now = (flt->access_type & VM_PROT_WRITE) != 0;
}
if (flt->wire_paging) {
/* wiring pages requires a write lock. */
flt->upper_lock_type = RW_WRITER;
flt->lower_lock_type = RW_WRITER;
}
flt->promote = false;
/*
* handle "needs_copy" case. if we need to copy the amap we will
* have to drop our readlock and relock it with a write lock. (we
* need a write lock to change anything in a map entry [e.g.
* needs_copy]).
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
if (flt->cow_now || (ufi->entry->object.uvm_obj == NULL)) {
KASSERT(!maxprot);
/* need to clear */
UVMHIST_LOG(maphist,
" need to clear needs_copy and refault",0,0,0,0);
uvmfault_unlockmaps(ufi, false);
uvmfault_amapcopy(ufi);
cpu_count(CPU_COUNT_FLTAMCOPY, 1);
return ERESTART;
} else {
/*
* ensure that we pmap_enter page R/O since
* needs_copy is still true
*/
flt->enter_prot &= ~VM_PROT_WRITE;
}
}
/*
* identify the players
*/
amap = ufi->entry->aref.ar_amap; /* upper layer */
uobj = ufi->entry->object.uvm_obj; /* lower layer */
/*
* check for a case 0 fault. if nothing backing the entry then
* error now.
*/
if (amap == NULL && uobj == NULL) {
uvmfault_unlockmaps(ufi, false);
UVMHIST_LOG(maphist,"<- no backing store, no overlay",0,0,0,0);
return EFAULT;
}
/*
* for a case 2B fault waste no time on adjacent pages because
* they are likely already entered.
*/
if (uobj != NULL && amap != NULL &&
(flt->access_type & VM_PROT_WRITE) != 0) {
/* wide fault (!narrow) */
flt->narrow = true;
}
/*
* establish range of interest based on advice from mapper
* and then clip to fit map entry. note that we only want
* to do this the first time through the fault. if we
* ReFault we will disable this by setting "narrow" to true.
*/
if (flt->narrow == false) {
/* wide fault (!narrow) */
KASSERT(uvmadvice[ufi->entry->advice].advice ==
ufi->entry->advice);
nback = MIN(uvmadvice[ufi->entry->advice].nback,
(ufi->orig_rvaddr - ufi->entry->start) >> PAGE_SHIFT);
flt->startva = ufi->orig_rvaddr - (nback << PAGE_SHIFT);
/*
* note: "-1" because we don't want to count the
* faulting page as forw
*/
nforw = MIN(uvmadvice[ufi->entry->advice].nforw,
((ufi->entry->end - ufi->orig_rvaddr) >>
PAGE_SHIFT) - 1);
flt->npages = nback + nforw + 1;
flt->centeridx = nback;
flt->narrow = true; /* ensure only once per-fault */
} else {
/* narrow fault! */
nback = nforw = 0;
flt->startva = ufi->orig_rvaddr;
flt->npages = 1;
flt->centeridx = 0;
}
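/*
 * either way the range of interest now spans npages = nback +
 * nforw + 1 pages starting at startva, with the faulting page at
 * index centeridx (== nback, or 0 for a narrow fault).
 */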
/* offset from entry's start to pgs' start */
const voff_t eoff = flt->startva - ufi->entry->start;
/* locked: maps(read) */
UVMHIST_LOG(maphist, " narrow=%jd, back=%jd, forw=%jd, startva=%#jx",
flt->narrow, nback, nforw, flt->startva);
UVMHIST_LOG(maphist, " entry=%#jx, amap=%#jx, obj=%#jx",
(uintptr_t)ufi->entry, (uintptr_t)amap, (uintptr_t)uobj, 0);
/*
* guess at the most suitable lock types to acquire.
* if we've got an amap then lock it and extract current anons.
*/
if (amap) {
if ((amap_flags(amap) & AMAP_SHARED) == 0) {
/*
* the amap isn't shared. get a writer lock to
* avoid the cost of upgrading the lock later if
* needed.
*
* XXX nice for PostgreSQL, but consider threads.
*/
flt->upper_lock_type = RW_WRITER;
} else if ((flt->access_type & VM_PROT_WRITE) != 0) {
/*
* assume we're about to COW.
*/
flt->upper_lock_type = RW_WRITER;
}
amap_lock(amap, flt->upper_lock_type);
amap_lookups(&ufi->entry->aref, eoff, *ranons, flt->npages);
} else {
if ((flt->access_type & VM_PROT_WRITE) != 0) {
/*
* we are about to dirty the object and that
* requires a write lock.
*/
flt->lower_lock_type = RW_WRITER;
}
*ranons = NULL; /* to be safe */
}
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/*
* for MADV_SEQUENTIAL mappings we want to deactivate the back pages
* now and then forget about them (for the rest of the fault).
*/
if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) {
UVMHIST_LOG(maphist, " MADV_SEQUENTIAL: flushing backpages",
0,0,0,0);
/* flush back-page anons? */
if (amap)
uvmfault_anonflush(*ranons, nback);
/*
* flush object? change lock type to RW_WRITER, to avoid
* excessive competition between read/write locks if many
* threads doing "sequential access".
*/
if (uobj) {
voff_t uoff;
flt->lower_lock_type = RW_WRITER;
uoff = ufi->entry->offset + eoff;
rw_enter(uobj->vmobjlock, RW_WRITER);
(void) (uobj->pgops->pgo_put)(uobj, uoff, uoff +
(nback << PAGE_SHIFT), PGO_DEACTIVATE);
}
/* now forget about the backpages */
if (amap)
*ranons += nback;
flt->startva += (nback << PAGE_SHIFT);
flt->npages -= nback;
flt->centeridx = 0;
}
/*
* => startva is fixed
* => npages is fixed
*/
KASSERT(flt->startva <= ufi->orig_rvaddr);
KASSERT(ufi->orig_rvaddr + ufi->orig_size <=
flt->startva + (flt->npages << PAGE_SHIFT));
return 0;
}
/*
* uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
*/
static inline int
uvm_fault_upper_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_amap *amap, struct uvm_object *uobj)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(amap != NULL);
KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock));
/*
* fast path.
*/
if (__predict_true(flt->upper_lock_type == RW_WRITER)) {
return 0;
}
/*
* otherwise try for the upgrade. if we don't get it, unlock
* everything, restart the fault and next time around get a writer
* lock.
*/
flt->upper_lock_type = RW_WRITER;
if (__predict_false(!rw_tryupgrade(amap->am_lock))) {
uvmfault_unlockall(ufi, amap, uobj);
cpu_count(CPU_COUNT_FLTNOUP, 1);
UVMHIST_LOG(maphist, " !upgrade upper", 0, 0,0,0);
return ERESTART;
}
cpu_count(CPU_COUNT_FLTUP, 1);
KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock));
return 0;
}
/*
* uvm_fault_upper_lookup: look up existing h/w mapping and amap.
*
* iterate range of interest:
* 1. check if h/w mapping exists. if yes, we don't care
* 2. check if anon exists. if not, page is lower.
* 3. if anon exists, enter h/w mapping for neighbors.
*
* => called with amap locked (if exists).
*/
static int
uvm_fault_upper_lookup(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_anon **anons, struct vm_page **pages)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
int lcv;
vaddr_t currva;
bool shadowed __unused;
bool entered;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/*
* map in the backpages and frontpages we found in the amap in hopes
* of preventing future faults. we also init the pages[] array as
* we go.
*/
currva = flt->startva;
shadowed = false;
entered = false;
for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
/*
* unmapped or center page. check if any anon at this level.
*/
if (amap == NULL || anons[lcv] == NULL) {
pages[lcv] = NULL;
continue;
}
/*
* check for present page and map if possible.
*/
pages[lcv] = PGO_DONTCARE;
if (lcv == flt->centeridx) { /* save center for later! */
shadowed = true;
continue;
}
struct vm_anon *anon = anons[lcv];
struct vm_page *pg = anon->an_page;
KASSERT(anon->an_lock == amap->am_lock);
/*
* ignore loaned and busy pages.
* don't play with VAs that are already mapped.
*/
if (pg && pg->loan_count == 0 && (pg->flags & PG_BUSY) == 0 &&
!pmap_extract(ufi->orig_map->pmap, currva, NULL)) {
uvm_fault_upper_neighbor(ufi, flt, currva,
pg, anon->an_ref > 1);
entered = true;
}
}
if (entered) {
pmap_update(ufi->orig_map->pmap);
}
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/* (shadowed == true) if there is an anon at the faulting address */
UVMHIST_LOG(maphist, " shadowed=%jd, will_get=%jd", shadowed,
(ufi->entry->object.uvm_obj && shadowed != false),0,0);
return 0;
}
/*
* uvm_fault_upper_neighbor: enter single upper neighbor page.
*
* => called with amap and anon locked.
*/
static void
uvm_fault_upper_neighbor(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
vaddr_t currva, struct vm_page *pg, bool readonly)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: amap, anon */
KASSERT(pg->uobject == NULL);
KASSERT(pg->uanon != NULL);
KASSERT(rw_lock_op(pg->uanon->an_lock) == flt->upper_lock_type);
KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN);
/*
* there wasn't a direct fault on the page, so avoid the cost of
* activating it.
*/
if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) {
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
}
UVMHIST_LOG(maphist,
" MAPPING: n anon: pm=%#jx, va=%#jx, pg=%#jx",
(uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
cpu_count(CPU_COUNT_FLTNAMAP, 1);
/*
* Since this page isn't the page that's actually faulting,
* ignore pmap_enter() failures; it's not critical that we
* enter these right now.
*/
(void) pmap_enter(ufi->orig_map->pmap, currva,
VM_PAGE_TO_PHYS(pg),
readonly ? (flt->enter_prot & ~VM_PROT_WRITE) :
flt->enter_prot,
PMAP_CANFAIL | (flt->wire_mapping ? PMAP_WIRED : 0));
}
/*
* uvm_fault_upper: handle upper fault.
*
* 1. acquire anon lock.
* 2. get anon. let uvmfault_anonget do the dirty work.
* 3. handle loan.
* 4. dispatch direct or promote handlers.
*/
static int
uvm_fault_upper(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon **anons)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon * const anon = anons[flt->centeridx];
struct uvm_object *uobj;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap, anon */
KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
/*
* handle case 1: fault on an anon in our amap
*/
UVMHIST_LOG(maphist, " case 1 fault: anon=%#jx",
(uintptr_t)anon, 0, 0, 0);
/*
* no matter if we have case 1A or case 1B we are going to need to
* have the anon's memory resident. ensure that now.
*/
/*
* let uvmfault_anonget do the dirty work.
* if it fails (!OK) it will unlock everything for us.
* if it succeeds, locks are still valid and locked.
* also, if it is OK, then the anon's page is on the queues.
* if the page is on loan from a uvm_object, then anonget will
* lock that object for us if it does not fail.
*/
retry:
error = uvmfault_anonget(ufi, amap, anon);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
case EAGAIN:
kpause("fltagain1", false, hz/2, NULL);
return ERESTART;
case ENOLCK:
/* it needs a write lock: retry */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
goto retry;
default:
return error;
}
/*
* uobj is non null if the page is on loan from an object (i.e. uobj)
*/
uobj = anon->an_page->uobject; /* locked by anonget if !NULL */
/* locked: maps(read), amap, anon, uobj(if one) */
KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* special handling for loaned pages
*/
if (anon->an_page->loan_count) {
error = uvm_fault_upper_loan(ufi, flt, anon, &uobj);
if (error != 0)
return error;
}
/*
* if we are case 1B then we will need to allocate a new blank
* anon to transfer the data into. note that we have a lock
* on anon, so no one can busy or release the page until we are done.
* also note that the ref count can't drop to zero here because
* it is > 1 and we are only dropping one ref.
*
* in the (hopefully very rare) case that we are out of RAM we
* will unlock, wait for more RAM, and refault.
*
* if we are out of anon VM we kill the process (XXX: could wait?).
*/
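/*
 * case 1B: cow'ing while the anon is shared (ref > 1), so promote
 * into a fresh private anon.
 * case 1A: the anon is exclusively ours, so map it in directly.
 */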
if (flt->cow_now && anon->an_ref > 1) {
flt->promote = true;
error = uvm_fault_upper_promote(ufi, flt, uobj, anon);
} else {
error = uvm_fault_upper_direct(ufi, flt, uobj, anon);
}
return error;
}
/*
* uvm_fault_upper_loan: handle loaned upper page.
*
* 1. if not cow'ing now, simply adjust flt->enter_prot.
* 2. if cow'ing now, and if ref count is 1, break loan.
*/
static int
uvm_fault_upper_loan(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon *anon, struct uvm_object **ruobj)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
int error = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (!flt->cow_now) {
/*
* for read faults on loaned pages we just cap the
* protection at read-only.
*/
flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
} else {
/*
* note that we can't allow writes into a loaned page!
*
* if we have a write fault on a loaned page in an
* anon then we need to look at the anon's ref count.
* if it is greater than one then we are going to do
* a normal copy-on-write fault into a new anon (this
* is not a problem). however, if the reference count
* is one (a case where we would normally allow a
* write directly to the page) then we need to kill
* the loan before we continue.
*/
/* >1 case is already ok */
if (anon->an_ref == 1) {
/* breaking loan requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
error = uvm_loanbreak_anon(anon, *ruobj);
if (error != 0) {
uvmfault_unlockall(ufi, amap, *ruobj);
uvm_wait("flt_noram2");
return ERESTART;
}
/* if we were a loan receiver uobj is gone */
if (*ruobj)
*ruobj = NULL;
}
}
return error;
}
/*
* uvm_fault_upper_promote: promote upper page.
*
* 1. call uvmfault_promote.
* 2. enqueue page.
* 3. deref.
* 4. pass page to uvm_fault_upper_enter.
*/
static int
uvm_fault_upper_promote(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon * const oanon = anon;
struct vm_page *pg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist, " case 1B: COW fault",0,0,0,0);
/* promoting requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
cpu_count(CPU_COUNT_FLT_ACOW, 1);
error = uvmfault_promote(ufi, oanon, PGO_DONTCARE, &anon,
&flt->anon_spare);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
default:
return error;
}
pg = anon->an_page;
KASSERT(anon->an_lock == oanon->an_lock);
KASSERT((pg->flags & (PG_BUSY | PG_FAKE)) == 0);
/* deref: can not drop to zero here by defn! */
KASSERT(oanon->an_ref > 1);
oanon->an_ref--;
/*
* note: oanon is still locked, as is the new anon. we
* need to check for this later when we unlock oanon; if
* oanon != anon, we'll have to unlock anon, too.
*/
return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon);
}
/*
* uvm_fault_upper_direct: handle direct fault.
*/
static int
uvm_fault_upper_direct(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon)
{
struct vm_anon * const oanon = anon;
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
cpu_count(CPU_COUNT_FLT_ANON, 1);
pg = anon->an_page;
if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */
flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon);
}
/*
* uvm_fault_upper_enter: enter h/w mapping of upper page.
*/
static int
uvm_fault_upper_enter(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon, struct vm_page *pg,
struct vm_anon *oanon)
{
struct pmap *pmap = ufi->orig_map->pmap;
vaddr_t va = ufi->orig_rvaddr;
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap, oanon, anon(if different from oanon) */
KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
KASSERT(oanon->an_lock == amap->am_lock);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN);
/*
* now map the page in.
*/
UVMHIST_LOG(maphist,
" MAPPING: anon: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd",
(uintptr_t)pmap, va, (uintptr_t)pg, flt->promote);
if (pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
flt->enter_prot, flt->access_type | PMAP_CANFAIL |
(flt->wire_mapping ? PMAP_WIRED : 0)) != 0) {
/*
* If pmap_enter() fails, it must not leave behind an existing
* pmap entry. In particular, a now-stale entry for a different
* page would leave the pmap inconsistent with the vm_map.
* This is not to imply that pmap_enter() should remove an
* existing mapping in such a situation (since that could create
* different problems, eg. if the existing mapping is wired),
* but rather that the pmap should be designed such that it
* never needs to fail when the new mapping is replacing an
* existing mapping and the new page has no existing mappings.
*
* XXX This can't be asserted safely any more because many
* LWPs and/or many processes could simultaneously fault on
* the same VA and some might succeed.
*/
/* KASSERT(!pmap_extract(pmap, va, NULL)); */
/*
* ensure that the page is queued in the case that
* we just promoted.
*/
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
*
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/
uvmfault_unlockall(ufi, amap, uobj);
if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
/* XXX instrumentation */
return ENOMEM;
}
/* XXX instrumentation */
uvm_wait("flt_pmfail1");
return ERESTART;
}
uvm_fault_upper_done(ufi, flt, anon, pg);
/*
* done case 1! finish up by unlocking everything and returning success
*/
pmap_update(pmap);
uvmfault_unlockall(ufi, amap, uobj);
return 0;
}
/*
* uvm_fault_upper_done: queue upper center page.
*/
static void
uvm_fault_upper_done(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_anon *anon, struct vm_page *pg)
{
const bool wire_paging = flt->wire_paging;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* ... update the page queues.
*/
if (wire_paging) {
uvm_pagelock(pg);
uvm_pagewire(pg);
uvm_pageunlock(pg);
/*
* since the now-wired page cannot be paged out,
* release its swap resources for others to use.
* and since an anon with no swap cannot be clean,
* mark it dirty now.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
uvm_anon_dropswap(anon);
} else if (uvmpdpol_pageactivate_p(pg)) {
/*
* avoid re-activating the page unless needed,
* to avoid false sharing on multiprocessor.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
/*
* uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
*/
static inline int
uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_amap *amap, struct uvm_object *uobj, struct vm_page *uobjpage)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(uobj != NULL);
KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock));
/*
* fast path.
*/
if (__predict_true(flt->lower_lock_type == RW_WRITER)) {
return 0;
}
/*
* otherwise try for the upgrade. if we don't get it, unlock
* everything, restart the fault and next time around get a writer
* lock.
*/
flt->lower_lock_type = RW_WRITER;
if (__predict_false(!rw_tryupgrade(uobj->vmobjlock))) {
uvmfault_unlockall(ufi, amap, uobj);
cpu_count(CPU_COUNT_FLTNOUP, 1);
UVMHIST_LOG(maphist, " !upgrade lower", 0, 0,0,0);
return ERESTART;
}
cpu_count(CPU_COUNT_FLTUP, 1);
KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock));
return 0;
}
/*
* uvm_fault_lower: handle lower fault.
*
* 1. check uobj
* 1.1. if null, ZFOD.
* 1.2. if not null, look up unmapped neighbor pages.
* 2. for center page, check if promote.
* 2.1. ZFOD always needs promotion.
* 2.2. other uobjs, when entry is marked COW (usually MAP_PRIVATE vnode).
* 3. if uobj is not ZFOD and page is not found, do i/o.
* 4. dispatch either direct / promote fault.
*/
static int
uvm_fault_lower(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_page **pages)
{
struct vm_amap *amap __diagused = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
struct vm_page *uobjpage;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* now, if the desired page is not shadowed by the amap and we have
* a backing object that does not have a special fault routine, then
* we ask (with pgo_get) the object for resident pages that we care
* about and attempt to map them in. we do not let pgo_get block
* (PGO_LOCKED).
*/
if (uobj == NULL) {
/* zero fill; don't care neighbor pages */
uobjpage = NULL;
} else {
uvm_fault_lower_lookup(ufi, flt, pages);
uobjpage = pages[flt->centeridx];
}
/*
* note that at this point we are done with any front or back pages.
* we are now going to focus on the center page (i.e. the one we've
* faulted on). if we have faulted on the upper (anon) layer
* [i.e. case 1], then the anon we want is anons[centeridx] (we have
* not touched it yet). if we have faulted on the bottom (uobj)
* layer [i.e. case 2] and the page was both present and available,
* then we've got a pointer to it as "uobjpage" and we've already
* made it BUSY.
*/
/*
* locked:
* maps(read), amap(if there), uobj(if !null), uobjpage(if !null)
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* note that uobjpage can not be PGO_DONTCARE at this point. we now
* set uobjpage to PGO_DONTCARE if we are doing a zero fill. if we
* have a backing object, check and see if we are going to promote
* the data up to an anon during the fault.
*/
if (uobj == NULL) {
uobjpage = PGO_DONTCARE;
flt->promote = true; /* always need anon here */
} else {
KASSERT(uobjpage != PGO_DONTCARE);
flt->promote = flt->cow_now && UVM_ET_ISCOPYONWRITE(ufi->entry);
}
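/*
 * so: promote either for a zero-fill fault (no uobj), or when we
 * are writing (cow_now) to a copy-on-write mapping and therefore
 * need a private anon copy of the object's page.
 */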
UVMHIST_LOG(maphist, " case 2 fault: promote=%jd, zfill=%jd",
flt->promote, (uobj == NULL), 0,0);
/*
* if uobjpage is not null then we do not need to do I/O to get the
* uobjpage.
*
* if uobjpage is null, then we need to unlock and ask the pager to
* get the data for us. once we have the data, we need to reverify
* the state of the world. we are currently not holding any resources.
*/
if (uobjpage) {
/* update rusage counters */
curlwp->l_ru.ru_minflt++;
} else {
error = uvm_fault_lower_io(ufi, flt, &uobj, &uobjpage);
if (error != 0)
return error;
}
/*
* locked:
* maps(read), amap(if !null), uobj(if !null), uobjpage(if uobj)
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* notes:
* - at this point uobjpage can not be NULL
* - at this point uobjpage can not be PG_RELEASED (since we checked
* for it above)
* - at this point uobjpage could be waited on (handle later)
* - uobjpage can be from a different object if tmpfs (vnode vs UAO)
*/
KASSERT(uobjpage != NULL);
KASSERT(uobj == NULL ||
uobjpage->uobject->vmobjlock == uobj->vmobjlock);
KASSERT(uobj == NULL || !UVM_OBJ_IS_CLEAN(uobjpage->uobject) ||
uvm_pagegetdirty(uobjpage) == UVM_PAGE_STATUS_CLEAN);
if (!flt->promote) {
error = uvm_fault_lower_direct(ufi, flt, uobj, uobjpage);
} else {
error = uvm_fault_lower_promote(ufi, flt, uobj, uobjpage);
}
return error;
}
/*
* uvm_fault_lower_lookup: look up on-memory uobj pages.
*
* 1. get on-memory pages.
* 2. if failed, give up (get only center page later).
* 3. if succeeded, enter h/w mapping of neighbor pages.
*/
static void
uvm_fault_lower_lookup(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_page **pages)
{
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
int lcv, gotpages;
vaddr_t currva;
bool entered;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
rw_enter(uobj->vmobjlock, flt->lower_lock_type);
/*
* Locked: maps(read), amap(if there), uobj
*/
cpu_count(CPU_COUNT_FLTLGET, 1);
gotpages = flt->npages;
(void) uobj->pgops->pgo_get(uobj,
ufi->entry->offset + flt->startva - ufi->entry->start,
pages, &gotpages, flt->centeridx,
flt->access_type & MASK(ufi->entry), ufi->entry->advice,
PGO_LOCKED);
KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* check for pages to map, if we got any
*/
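/*
 * if the locked pgo_get found nothing resident, leave the center
 * page NULL; uvm_fault_lower() will then fetch it synchronously
 * via uvm_fault_lower_io().
 */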
if (gotpages == 0) {
pages[flt->centeridx] = NULL;
return;
}
entered = false;
currva = flt->startva;
for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
struct vm_page *curpg;
curpg = pages[lcv];
if (curpg == NULL || curpg == PGO_DONTCARE) {
continue;
}
/*
* in the case of tmpfs, the pages might be from a different
* uvm_object. just make sure that they have the same lock.
*/
KASSERT(curpg->uobject->vmobjlock == uobj->vmobjlock);
KASSERT((curpg->flags & PG_BUSY) == 0);
/*
* leave the centre page for later. don't screw with
* existing mappings (needless & expensive).
*/
if (lcv == flt->centeridx) {
UVMHIST_LOG(maphist, " got uobjpage (%#jx) "
"with locked get", (uintptr_t)curpg, 0, 0, 0);
} else if (!pmap_extract(ufi->orig_map->pmap, currva, NULL)) {
uvm_fault_lower_neighbor(ufi, flt, currva, curpg);
entered = true;
}
}
if (entered) {
pmap_update(ufi->orig_map->pmap);
}
}
/*
* uvm_fault_lower_neighbor: enter h/w mapping of lower neighbor page.
*/
static void
uvm_fault_lower_neighbor(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
vaddr_t currva, struct vm_page *pg)
{
const bool readonly = uvm_pagereadonly_p(pg) || pg->loan_count > 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap(if there), uobj */
/*
* calling pgo_get with PGO_LOCKED returns us pages which
* are neither busy nor released, so we don't need to check
* for this. we can just directly enter the pages.
*
* there wasn't a direct fault on the page, so avoid the cost of
* activating it.
*/
if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) {
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
}
UVMHIST_LOG(maphist,
" MAPPING: n obj: pm=%#jx, va=%#jx, pg=%#jx",
(uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
cpu_count(CPU_COUNT_FLTNOMAP, 1);
/*
* Since this page isn't the page that's actually faulting,
* ignore pmap_enter() failures; it's not critical that we
* enter these right now.
* NOTE: page can't be waited on or PG_RELEASED because we've
* held the lock the whole time we've had the handle.
*/
KASSERT((pg->flags & PG_PAGEOUT) == 0);
KASSERT((pg->flags & PG_RELEASED) == 0);
KASSERT(!UVM_OBJ_IS_CLEAN(pg->uobject) ||
uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN);
KASSERT((pg->flags & PG_BUSY) == 0);
KASSERT(rw_lock_op(pg->uobject->vmobjlock) == flt->lower_lock_type);
const vm_prot_t mapprot =
readonly ? (flt->enter_prot & ~VM_PROT_WRITE) : flt->enter_prot & MASK(ufi->entry);
const u_int mapflags =
PMAP_CANFAIL | (flt->wire_mapping ? (mapprot | PMAP_WIRED) : 0);
(void) pmap_enter(ufi->orig_map->pmap, currva,
VM_PAGE_TO_PHYS(pg), mapprot, mapflags);
}
/*
* uvm_fault_lower_io: get lower page from backing store.
*
* 1. unlock everything, because i/o will block.
* 2. call pgo_get.
* 3. if failed, recover.
* 4. if succeeded, relock everything and verify things.
*/
static int
uvm_fault_lower_io(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object **ruobj, struct vm_page **ruobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = *ruobj;
struct vm_page *pg;
bool locked;
int gotpages;
int error;
voff_t uoff;
vm_prot_t access_type;
int advice;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* grab everything we need from the entry before we unlock */
uoff = (ufi->orig_rvaddr - ufi->entry->start) + ufi->entry->offset;
access_type = flt->access_type & MASK(ufi->entry);
advice = ufi->entry->advice;
/* Locked: maps(read), amap(if there), uobj */
KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/* Upgrade to a write lock if needed. */
error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, NULL);
if (error != 0) {
return error;
}
uvmfault_unlockall(ufi, amap, NULL);
/* update rusage counters */
curlwp->l_ru.ru_majflt++;
/* Locked: uobj(write) */
KASSERT(rw_write_held(uobj->vmobjlock));
cpu_count(CPU_COUNT_FLTGET, 1);
gotpages = 1;
pg = NULL;
error = uobj->pgops->pgo_get(uobj, uoff, &pg, &gotpages,
0, access_type, advice, PGO_SYNCIO);
/* locked: pg(if no error) */
/*
* recover from I/O
*/
if (error) {
if (error == EAGAIN) {
UVMHIST_LOG(maphist,
" pgo_get says TRY AGAIN!",0,0,0,0);
kpause("fltagain2", false, hz/2, NULL);
return ERESTART;
}
#if 0
KASSERT(error != ERESTART);
#else
/* XXXUEBS don't re-fault? */
if (error == ERESTART)
error = EIO;
#endif
UVMHIST_LOG(maphist, "<- pgo_get failed (code %jd)",
error, 0,0,0);
return error;
}
/*
* re-verify the state of the world by first trying to relock
* the maps. always relock the object.
*/
locked = uvmfault_relock(ufi);
if (locked && amap)
amap_lock(amap, flt->upper_lock_type);
/* might be changed */
uobj = pg->uobject;
rw_enter(uobj->vmobjlock, flt->lower_lock_type);
KASSERT((pg->flags & PG_BUSY) != 0);
KASSERT(flt->lower_lock_type == RW_WRITER);
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
/* locked(locked): maps(read), amap(if !null), uobj, pg */
/* locked(!locked): uobj, pg */
/*
* verify that the page has not been released and re-verify
* that amap slot is still free. if there is a problem,
* we unlock and clean up.
*/
if ((pg->flags & PG_RELEASED) != 0 ||
(locked && amap && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start))) {
if (locked)
uvmfault_unlockall(ufi, amap, NULL);
locked = false;
}
/*
* unbusy/release the page.
*/
if ((pg->flags & PG_RELEASED) == 0) {
pg->flags &= ~PG_BUSY;
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
UVM_PAGE_OWN(pg, NULL);
} else {
cpu_count(CPU_COUNT_FLTPGRELE, 1);
uvm_pagefree(pg);
}
/*
* didn't get the lock? retry.
*/
if (locked == false) {
UVMHIST_LOG(maphist,
" wasn't able to relock after fault: retry",
0,0,0,0);
rw_exit(uobj->vmobjlock);
return ERESTART;
}
/*
* we have the data in pg. we are holding object lock (so the page
* can't be released on us).
*/
/* locked: maps(read), amap(if !null), uobj */
*ruobj = uobj;
*ruobjpage = pg;
return 0;
}
/*
* uvm_fault_lower_direct: fault lower center page
*
* 1. adjust flt->enter_prot.
* 2. if page is loaned, resolve.
*/
int
uvm_fault_lower_direct(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *uobjpage)
{
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* we are not promoting. if the mapping is COW ensure that we
* don't give more access than we should (e.g. when doing a read
* fault on a COPYONWRITE mapping we want to map the COW page in
* R/O even though the entry protection could be R/W).
*
* set "pg" to the page we want to map in (uobjpage, usually)
*/
cpu_count(CPU_COUNT_FLT_OBJ, 1);
if (UVM_ET_ISCOPYONWRITE(ufi->entry) || UVM_OBJ_NEEDS_WRITEFAULT(uobjpage->uobject))
flt->enter_prot &= ~VM_PROT_WRITE;
pg = uobjpage; /* map in the actual object */
KASSERT(uobjpage != PGO_DONTCARE);
/*
* we are faulting directly on the page. be careful
* about writing to loaned pages...
*/
if (uobjpage->loan_count) {
uvm_fault_lower_direct_loan(ufi, flt, uobj, &pg, &uobjpage);
}
KASSERT(pg == uobjpage);
KASSERT((pg->flags & PG_BUSY) == 0);
return uvm_fault_lower_enter(ufi, flt, uobj, NULL, pg);
}
/*
* uvm_fault_lower_direct_loan: resolve loaned page.
*
* 1. if not cow'ing, adjust flt->enter_prot.
* 2. if cow'ing, break loan.
*/
static int
uvm_fault_lower_direct_loan(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page **rpg,
struct vm_page **ruobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_page *pg;
struct vm_page *uobjpage = *ruobjpage;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (!flt->cow_now) {
/* read fault: cap the protection at readonly */
/* cap! */
flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
} else {
/*
* write fault: must break the loan here. to do this
* we need a write lock on the object.
*/
error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, uobjpage);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(uobj->vmobjlock));
pg = uvm_loanbreak(uobjpage);
if (pg == NULL) {
uvmfault_unlockall(ufi, amap, uobj);
UVMHIST_LOG(maphist,
" out of RAM breaking loan, waiting",
0,0,0,0);
cpu_count(CPU_COUNT_FLTNORAM, 1);
uvm_wait("flt_noram4");
return ERESTART;
}
*rpg = pg;
*ruobjpage = pg;
/*
* drop ownership of page while still holding object lock,
* which won't be dropped until the page is entered.
*/
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~PG_BUSY;
UVM_PAGE_OWN(pg, NULL);
}
return 0;
}
/*
* uvm_fault_lower_promote: promote lower page.
*
* 1. call uvmfault_promote.
* 2. fill in data.
* 3. if not ZFOD, dispose old page.
*/
int
uvm_fault_lower_promote(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *uobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon *anon;
struct vm_page *pg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(amap != NULL);
/* promoting requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, uobj);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* If we are going to promote the data to an anon we
* allocate a blank anon here and plug it into our amap.
*/
error = uvmfault_promote(ufi, NULL, uobjpage, &anon, &flt->anon_spare);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
default:
return error;
}
pg = anon->an_page;
/*
* Fill in the data.
*/
if (uobjpage != PGO_DONTCARE) {
cpu_count(CPU_COUNT_FLT_PRCOPY, 1);
/*
* promote to shared amap? make sure all sharing
* procs see it
*/
if ((amap_flags(amap) & AMAP_SHARED) != 0) {
pmap_page_protect(uobjpage, VM_PROT_NONE);
/*
* XXX: PAGE MIGHT BE WIRED!
*/
}
UVMHIST_LOG(maphist,
" promote uobjpage %#jx to anon/page %#jx/%#jx",
(uintptr_t)uobjpage, (uintptr_t)anon, (uintptr_t)pg, 0);
} else {
cpu_count(CPU_COUNT_FLT_PRZERO, 1);
/*
* Page is zero'd and marked dirty by
* uvmfault_promote().
*/
UVMHIST_LOG(maphist," zero fill anon/page %#jx/%#jx",
(uintptr_t)anon, (uintptr_t)pg, 0, 0);
}
return uvm_fault_lower_enter(ufi, flt, uobj, anon, pg);
}
/*
* uvm_fault_lower_enter: enter h/w mapping of lower page or anon page promoted
* from the lower page.
*/
int
uvm_fault_lower_enter(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj,
struct vm_anon *anon, struct vm_page *pg)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
const bool readonly = uvm_pagereadonly_p(pg);
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* Locked:
*
* maps(read), amap(if !null), uobj(if !null),
* anon(if !null), pg(if anon), unlock_uobj(if !null)
*
* anon must be write locked (promotion). uobj can be either.
*
* Note: pg is either the uobjpage or the new page in the new anon.
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
/*
* note that pg can't be PG_RELEASED or PG_BUSY since we did
* not drop the object lock since the last time we checked.
*/
KASSERT((pg->flags & PG_RELEASED) == 0);
KASSERT((pg->flags & PG_BUSY) == 0);
/*
* all resources are present. we can now map it in and free our
* resources.
*/
UVMHIST_LOG(maphist,
" MAPPING: case2: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd",
(uintptr_t)ufi->orig_map->pmap, ufi->orig_rvaddr,
(uintptr_t)pg, flt->promote);
KASSERTMSG((flt->access_type & VM_PROT_WRITE) == 0 || !readonly,
"promote=%u cow_now=%u access_type=%x enter_prot=%x cow=%u "
"entry=%p map=%p orig_rvaddr=%p pg=%p",
flt->promote, flt->cow_now, flt->access_type, flt->enter_prot,
UVM_ET_ISCOPYONWRITE(ufi->entry), ufi->entry, ufi->orig_map,
(void *)ufi->orig_rvaddr, pg);
KASSERT((flt->access_type & VM_PROT_WRITE) == 0 || !readonly);
if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
VM_PAGE_TO_PHYS(pg),
readonly ? flt->enter_prot & ~VM_PROT_WRITE : flt->enter_prot,
flt->access_type | PMAP_CANFAIL |
(flt->wire_mapping ? PMAP_WIRED : 0)) != 0) {
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
*
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/
/*
* ensure that the page is queued in the case that
* we just promoted the page.
*/
if (anon != NULL) {
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
}
uvmfault_unlockall(ufi, amap, uobj);
if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
/* XXX instrumentation */
error = ENOMEM;
return error;
}
/* XXX instrumentation */
uvm_wait("flt_pmfail2");
return ERESTART;
}
uvm_fault_lower_done(ufi, flt, uobj, pg);
pmap_update(ufi->orig_map->pmap);
uvmfault_unlockall(ufi, amap, uobj);
UVMHIST_LOG(maphist, "<- done (SUCCESS!)",0,0,0,0);
return 0;
}
/*
* uvm_fault_lower_done: queue lower center page.
*/
void
uvm_fault_lower_done(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *pg)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (flt->wire_paging) {
uvm_pagelock(pg);
uvm_pagewire(pg);
uvm_pageunlock(pg);
if (pg->flags & PG_AOBJ) {
/*
* since the now-wired page cannot be paged out,
* release its swap resources for others to use.
* since an aobj page with no swap cannot be clean,
* mark it dirty now.
*
* use pg->uobject here. if the page is from a
* tmpfs vnode, the pages are backed by its UAO and
* not the vnode.
*/
KASSERT(uobj != NULL);
KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
uao_dropswap(pg->uobject, pg->offset >> PAGE_SHIFT);
}
} else if (uvmpdpol_pageactivate_p(pg)) {
/*
* avoid re-activating the page unless needed,
* to avoid false sharing on multiprocessor.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
/*
* uvm_fault_wire: wire down a range of virtual addresses in a map.
*
* => map may be read-locked by caller, but MUST NOT be write-locked.
* => if map is read-locked, any operations which may cause map to
* be write-locked in uvm_fault() must be taken care of by
* the caller. See uvm_map_pageable().
*/
int
uvm_fault_wire(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t access_type, int maxprot)
{
vaddr_t va;
int error;
/*
* now fault it in a page at a time. if the fault fails then we have
* to undo what we have done. the faults are issued with UVM_FAULT_WIRE
* and, when maxprot is set, UVM_FAULT_MAXPROT so that the entry's
* maximum protection is used.
*/
/*
* XXX work around overflowing a vaddr_t. this prevents us from
* wiring the last page in the address space, though.
*/
if (start > end) {
return EFAULT;
}
for (va = start; va < end; va += PAGE_SIZE) {
error = uvm_fault_internal(map, va, access_type,
(maxprot ? UVM_FAULT_MAXPROT : 0) | UVM_FAULT_WIRE);
if (error) {
if (va != start) {
uvm_fault_unwire(map, start, va);
}
return error;
}
}
return 0;
}
/*
* uvm_fault_unwire(): unwire range of virtual space.
*/
void
uvm_fault_unwire(struct vm_map *map, vaddr_t start, vaddr_t end)
{
vm_map_lock_read(map);
uvm_fault_unwire_locked(map, start, end);
vm_map_unlock_read(map);
}
/*
* uvm_fault_unwire_locked(): the guts of uvm_fault_unwire().
*
* => map must be at least read-locked.
*/
void
uvm_fault_unwire_locked(struct vm_map *map, vaddr_t start, vaddr_t end)
{
struct vm_map_entry *entry, *oentry;
pmap_t pmap = vm_map_pmap(map);
vaddr_t va;
paddr_t pa;
struct vm_page *pg;
/*
* we assume that the area we are unwiring has actually been wired
* in the first place. this means that we should be able to extract
* the PAs from the pmap. we also lock out the page daemon so that
* we can call uvm_pageunwire.
*/
/*
* find the beginning map entry for the region.
*/
KASSERT(start >= vm_map_min(map));
KASSERT(end <= vm_map_max(map));
if (uvm_map_lookup_entry(map, start, &entry) == false)
panic("uvm_fault_unwire_locked: address not in map");
oentry = NULL;
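/*
 * walk the range a page at a time, advancing to the next map
 * entry as needed and holding the per-entry lock only while we
 * are working within that entry.
 */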
for (va = start; va < end; va += PAGE_SIZE) {
/*
* find the map entry for the current address.
*/
KASSERT(va >= entry->start);
while (va >= entry->end) {
KASSERT(entry->next != &map->header);
KASSERT(entry->next->start <= entry->end);
entry = entry->next;
}
/*
* lock it.
*/
if (entry != oentry) {
if (oentry != NULL) {
uvm_map_unlock_entry(oentry);
}
uvm_map_lock_entry(entry, RW_WRITER);
oentry = entry;
}
/*
* if the entry is no longer wired, tell the pmap.
*/
if (!pmap_extract(pmap, va, &pa))
continue;
if (VM_MAPENT_ISWIRED(entry) == 0)
pmap_unwire(pmap, va);
pg = PHYS_TO_VM_PAGE(pa);
if (pg) {
uvm_pagelock(pg);
uvm_pageunwire(pg);
uvm_pageunlock(pg);
}
}
if (oentry != NULL) {
uvm_map_unlock_entry(entry);
}
}
/* $NetBSD: rtsock.c,v 1.256 2022/08/27 08:36:41 skrll Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtsock.c,v 1.256 2022/08/27 08:36:41 skrll Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/intr.h>
#include <sys/condvar.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <netinet/in_var.h>
#include <netinet/if_inarp.h>
#include <netmpls/mpls.h>
#include <compat/net/if.h>
#include <compat/net/route.h>
#ifdef COMPAT_RTSOCK
#undef COMPAT_RTSOCK
#endif
static int if_addrflags(struct ifaddr *);
#include <net/rtsock_shared.c>
/*
* XXX avoid using void * once msghdr compat disappears.
*/
void
rt_setmetrics(void *in, struct rtentry *out)
{
const struct rt_xmsghdr *rtm = in;
_rt_setmetrics(rtm->rtm_inits, rtm, out);
}
int
rt_msg3(int type, struct rt_addrinfo *rtinfo, void *cpv, struct rt_walkarg *w,
int *lenp)
{
return rt_msg2(type, rtinfo, cpv, w, lenp);
}
static int
if_addrflags(struct ifaddr *ifa)
{
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
return ifatoia(ifa)->ia4_flags;
#endif
#ifdef INET6
case AF_INET6:
return ifatoia6(ifa)->ia6_flags;
#endif
default:
return 0;
}
}
/*
* Send a routing message mimicking the addition of a cloned route.
*/
void
rt_clonedmsg(int type, const struct sockaddr *src, const struct sockaddr *dst,
const uint8_t *lladdr, const struct ifnet *ifp)
{
struct rt_addrinfo info;
/* Mimic flags exactly */
#define RTF_LLINFO 0x400
#define RTF_CLONED 0x2000
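/*
 * RTF_LLINFO and RTF_CLONED are no longer defined in <net/route.h>;
 * the old bit values are defined locally here, presumably so that
 * the generated messages keep looking the same to userland
 * consumers that still expect these bits (see "Mimic flags exactly"
 * above).
 */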
int flags = RTF_DONE;
union {
struct sockaddr sa;
struct sockaddr_storage ss;
struct sockaddr_dl sdl;
} u;
if (type != RTM_MISS)
flags |= RTF_HOST | RTF_CLONED | RTF_LLINFO;
if (type == RTM_ADD || type == RTM_CHANGE)
flags |= RTF_UP;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_AUTHOR] = src;
info.rti_info[RTAX_DST] = dst;
sockaddr_dl_init(&u.sdl, sizeof(u.ss), ifp->if_index, ifp->if_type,
NULL, 0, lladdr, ifp->if_addrlen);
info.rti_info[RTAX_GATEWAY] = &u.sa;
rt_missmsg(type, &info, flags, 0);
#undef RTF_LLINFO
#undef RTF_CLONED
}
/*
* The remaining code implements the routing-table sysctl node. It is
* compiled only for the non-COMPAT case.
*/
/*
* This is used in dumping the kernel table via sysctl().
*/
static int
sysctl_dumpentry(struct rtentry *rt, void *v)
{
struct rt_walkarg *w = v;
int error = 0, size;
struct rt_addrinfo info;
if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
return 0;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = rt_getkey(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_TAG] = rt_gettag(rt);
if (rt->rt_ifp) {
const struct ifaddr *rtifa;
info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr;
/* rtifa used to be simply rt->rt_ifa. If rt->rt_ifa != NULL,
* then rt_get_ifa() != NULL. So this ought to still be safe.
* --dyoung
*/
rtifa = rt_get_ifa(rt);
info.rti_info[RTAX_IFA] = rtifa->ifa_addr;
if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] = rtifa->ifa_dstaddr;
}
if ((error = rt_msg2(RTM_GET, &info, 0, w, &size)))
return error;
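/*
 * rt_msg2() has computed the message size (and, when the caller's
 * budget allows, formatted the message into w_tmem); only copy the
 * record out if a destination was supplied and we are still within
 * its buffer (w_needed <= 0).
 */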
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
struct rt_xmsghdr *rtm = (struct rt_xmsghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_use = rt->rt_use;
rtm_setmetrics(rt, rtm);
KASSERT(rt->rt_ifp != NULL);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;
if ((error = copyout(rtm, w->w_where, size)) != 0)
w->w_where = NULL;
else
w->w_where = (char *)w->w_where + size;
}
return error;
}
static int
sysctl_iflist_if(struct ifnet *ifp, struct rt_walkarg *w,
struct rt_addrinfo *info, size_t len)
{
struct if_xmsghdr *ifm;
int error;
ifm = (struct if_xmsghdr *)w->w_tmem;
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags;
if_export_if_data(ifp, &ifm->ifm_data, false);
ifm->ifm_addrs = info->rti_addrs;
if ((error = copyout(ifm, w->w_where, len)) == 0)
w->w_where = (char *)w->w_where + len;
return error;
}
static int
sysctl_iflist_addr(struct rt_walkarg *w, struct ifaddr *ifa,
struct rt_addrinfo *info)
{
int len, error;
if ((error = rt_msg2(RTM_XNEWADDR, info, 0, w, &len)))
return error;
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
struct ifa_xmsghdr *ifam;
ifam = (struct ifa_xmsghdr *)w->w_tmem;
ifam->ifam_index = ifa->ifa_ifp->if_index;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_addrs = info->rti_addrs;
ifam->ifam_pid = 0;
ifam->ifam_addrflags = if_addrflags(ifa);
if ((error = copyout(w->w_tmem, w->w_where, len)) == 0)
w->w_where = (char *)w->w_where + len;
}
return error;
}
static int
sysctl_iflist(int af, struct rt_walkarg *w, int type)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct rt_addrinfo info;
int cmd, len, error = 0;
int s;
struct psref psref;
int bound;
switch (type) {
case NET_RT_IFLIST:
cmd = RTM_IFINFO;
break;
case NET_RT_OOOIFLIST:
cmd = RTM_OOIFINFO;
break;
case NET_RT_OOIFLIST:
cmd = RTM_OIFINFO;
break;
case NET_RT_OIFLIST:
cmd = RTM_IFINFO;
break;
default:
#ifdef RTSOCK_DEBUG
printf("%s: unsupported IFLIST type %d\n", __func__, type);
#endif
return EINVAL;
}
memset(&info, 0, sizeof(info));
bound = curlwp_bind();
s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
int _s;
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
if (IFADDR_READER_EMPTY(ifp))
continue;
if_acquire(ifp, &psref);
pserialize_read_exit(s);
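/*
 * drop out of the pserialize read section while we build and copy
 * out this interface's records, since copyout may sleep; the psref
 * reference keeps ifp alive until we re-enter the section below.
 */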
info.rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr;
if ((error = rt_msg2(cmd, &info, NULL, w, &len)) != 0)
goto release_exit;
info.rti_info[RTAX_IFP] = NULL;
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
switch (type) {
case NET_RT_OIFLIST: /* old _70 */
if (!rtsock_iflist_70_hook.hooked) {
error = EINVAL;
break;
}
/* FALLTHROUGH */
case NET_RT_IFLIST: /* current */
error = sysctl_iflist_if(ifp, w, &info, len);
break;
case NET_RT_OOIFLIST: /* old _50 */
MODULE_HOOK_CALL(rtsock_iflist_50_hook,
(ifp, w, &info, len), enosys(), error);
break;
case NET_RT_OOOIFLIST: /* old _14 */
MODULE_HOOK_CALL(rtsock_iflist_14_hook,
(ifp, w, &info, len), enosys(), error);
break;
default:
error = EINVAL;
}
if (error != 0) {
if (error == ENOSYS)
error = EINVAL;
goto release_exit;
}
}
_s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref _psref;
if (af && af != ifa->ifa_addr->sa_family)
continue;
ifa_acquire(ifa, &_psref);
pserialize_read_exit(_s);
info.rti_info[RTAX_IFA] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
switch (type) {
case NET_RT_IFLIST:
error = sysctl_iflist_addr(w, ifa, &info);
break;
case NET_RT_OIFLIST:
case NET_RT_OOIFLIST:
case NET_RT_OOOIFLIST:
MODULE_HOOK_CALL(rtsock_iflist_70_hook,
(w, ifa, &info), enosys(), error);
break;
default:
error = EINVAL;
}
_s = pserialize_read_enter();
ifa_release(ifa, &_psref);
if (error != 0) {
pserialize_read_exit(_s);
goto release_exit;
}
}
pserialize_read_exit(_s);
info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
info.rti_info[RTAX_BRD] = NULL;
s = pserialize_read_enter();
if_release(ifp, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
return 0;
release_exit:
if_release(ifp, &psref);
curlwp_bindx(bound);
return error;
}
static int
sysctl_rtable(SYSCTLFN_ARGS)
{
void *where = oldp;
size_t *given = oldlenp;
int i, error = EINVAL;
u_char af;
struct rt_walkarg w;
if (namelen == 1 && name[0] == CTL_QUERY)
return sysctl_query(SYSCTLFN_CALL(rnode));
if (newp)
return EPERM;
if (namelen != 3)
return EINVAL;
af = name[0];
w.w_tmemneeded = 0;
w.w_tmemsize = 0;
w.w_tmem = NULL;
again:
/* we may return here if a later [re]alloc of the t_mem buffer fails */
if (w.w_tmemneeded) {
w.w_tmem = kmem_zalloc(w.w_tmemneeded, KM_SLEEP);
w.w_tmemsize = w.w_tmemneeded;
w.w_tmemneeded = 0;
}
w.w_op = name[1];
w.w_arg = name[2];
w.w_given = *given;
w.w_needed = 0 - w.w_given;
w.w_where = where;
KERNEL_LOCK_UNLESS_NET_MPSAFE();
const int s = splsoftnet();
switch (w.w_op) {
case NET_RT_DUMP:
case NET_RT_FLAGS:
#if defined(INET) || defined(INET6)
/*
* take care of llinfo entries, the caller must
* specify an AF
*/
if (w.w_op == NET_RT_FLAGS &&
(w.w_arg == 0 || w.w_arg & RTF_LLDATA)) {
if (af != 0)
error = lltable_sysctl_dump(af, &w);
else
error = EINVAL;
break;
}
#endif
for (i = 1; i <= AF_MAX; i++) {
if (af == 0 || af == i) {
error = rt_walktree(i, sysctl_dumpentry, &w);
if (error != 0)
break;
#if defined(INET) || defined(INET6)
/*
* Return ARP/NDP entries too for
* backward compatibility.
*/
error = lltable_sysctl_dump(i, &w);
if (error != 0)
break;
#endif
}
}
break;
case NET_RT_OOOIFLIST: /* compat_14 */
case NET_RT_OOIFLIST: /* compat_50 */
case NET_RT_OIFLIST: /* compat_70 */
case NET_RT_IFLIST: /* current */
error = sysctl_iflist(af, &w, w.w_op);
break;
}
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
/* check to see if we couldn't allocate memory with NOWAIT */
if (error == ENOBUFS && w.w_tmem == 0 && w.w_tmemneeded)
goto again;
if (w.w_tmem)
kmem_free(w.w_tmem, w.w_tmemsize);
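/*
 * w_needed was seeded with -w_given and grew by each record's
 * length during the walk, so adding w_given back yields the total
 * space a complete dump requires.
 */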
w.w_needed += w.w_given;
if (where) {
*given = (char *)w.w_where - (char *)where;
if (*given < w.w_needed)
return ENOMEM;
} else {
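/*
 * sizing-only call (no destination buffer): report the needed
 * size padded by roughly 10% so that a follow-up call with a real
 * buffer is unlikely to come up short if the tables grow in the
 * meantime.
 */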
*given = (11 * w.w_needed) / 10;
}
return error;
}
void
sysctl_net_route_setup(struct sysctllog **clog, int pf, const char *name)
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, name,
SYSCTL_DESCR("PF_ROUTE information"),
NULL, 0, NULL, 0,
CTL_NET, pf, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "rtable",
SYSCTL_DESCR("Routing table information"),
sysctl_rtable, 0, NULL, 0,
CTL_NET, pf, 0 /* any protocol */, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("Routing statistics"),
NULL, 0, &rtstat, sizeof(rtstat),
CTL_CREATE, CTL_EOL);
}
/* $NetBSD: exec_elf.c,v 1.105 2023/08/17 06:58:26 rin Exp $ */
/*-
* Copyright (c) 1994, 2000, 2005, 2015, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas and Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: exec_elf.c,v 1.105 2023/08/17 06:58:26 rin Exp $");
#ifdef _KERNEL_OPT
#include "opt_pax.h"
#endif /* _KERNEL_OPT */
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_elf.h>
#include <sys/syscall.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <machine/reg.h>
#include <compat/common/compat_util.h>
#include <sys/pax.h>
#include <uvm/uvm_param.h>
#define elf_check_header ELFNAME(check_header)
#define elf_copyargs ELFNAME(copyargs)
#define elf_populate_auxv ELFNAME(populate_auxv)
#define elf_load_interp ELFNAME(load_interp)
#define elf_load_psection ELFNAME(load_psection)
#define exec_elf_makecmds ELFNAME2(exec,makecmds)
#define netbsd_elf_signature ELFNAME2(netbsd,signature)
#define netbsd_elf_note ELFNAME2(netbsd,note)
#define netbsd_elf_probe ELFNAME2(netbsd,probe)
#define coredump ELFNAMEEND(coredump)
#define elf_free_emul_arg ELFNAME(free_emul_arg)
static int
elf_load_interp(struct lwp *, struct exec_package *, char *,
struct exec_vmcmd_set *, u_long *, Elf_Addr *);
static int
elf_load_psection(struct exec_vmcmd_set *, struct vnode *, const Elf_Phdr *,
Elf_Addr *, u_long *, int);
int netbsd_elf_signature(struct lwp *, struct exec_package *, Elf_Ehdr *);
int netbsd_elf_note(struct exec_package *, const Elf_Nhdr *, const char *,
const char *);
int netbsd_elf_probe(struct lwp *, struct exec_package *, void *, char *,
vaddr_t *);
static void elf_free_emul_arg(void *);
#ifdef DEBUG_ELF
#define DPRINTF(a, ...) printf("%s: " a "\n", __func__, ##__VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
/* round up and down to page boundaries. */
#define ELF_ROUND(a, b) (((a) + (b) - 1) & ~((b) - 1))
#define ELF_TRUNC(a, b) ((a) & ~((b) - 1))
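/*
 * Worked example (assuming a 4 KiB boundary, b = 0x1000, a power of two):
 * ELF_ROUND(0x1234, 0x1000) == 0x2000 and ELF_TRUNC(0x1234, 0x1000) == 0x1000.
 * Both macros assume b is a power of two.
 */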
static int
elf_placedynexec(struct exec_package *epp, Elf_Ehdr *eh, Elf_Phdr *ph)
{
Elf_Addr align, offset;
int i;
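/*
 * Pick a load offset for a dynamic (PIE) executable: find the largest
 * alignment requested by any PT_LOAD header, ask the PaX ASLR code for
 * an offset honouring that alignment, then shift every program header
 * and the entry point by that offset.
 */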
for (align = 1, i = 0; i < eh->e_phnum; i++)
if (ph[i].p_type == PT_LOAD && ph[i].p_align > align)
align = ph[i].p_align;
offset = (Elf_Addr)pax_aslr_exec_offset(epp, align);
if (offset < epp->ep_vm_minaddr)
offset = roundup(epp->ep_vm_minaddr, align);
if ((offset & (align - 1)) != 0) {
DPRINTF("bad offset=%#jx align=%#jx",
(uintmax_t)offset, (uintmax_t)align);
return EINVAL;
}
for (i = 0; i < eh->e_phnum; i++)
ph[i].p_vaddr += offset;
epp->ep_entryoffset = offset;
eh->e_entry += offset;
return 0;
}
int
elf_populate_auxv(struct lwp *l, struct exec_package *pack, char **stackp)
{
size_t len, vlen;
AuxInfo ai[ELF_AUX_ENTRIES], *a, *execname;
struct elf_args *ap;
char *path = l->l_proc->p_path;
int error;
execname = NULL;
a = ai;
memset(ai, 0, sizeof(ai));
/*
* Push extra arguments on the stack needed by dynamically
* linked binaries
*/
if ((ap = (struct elf_args *)pack->ep_emul_arg)) {
struct vattr *vap = pack->ep_vap;
a->a_type = AT_PHDR;
a->a_v = ap->arg_phaddr;
a++;
a->a_type = AT_PHENT;
a->a_v = ap->arg_phentsize;
a++;
a->a_type = AT_PHNUM;
a->a_v = ap->arg_phnum;
a++;
a->a_type = AT_PAGESZ;
a->a_v = PAGE_SIZE;
a++;
a->a_type = AT_BASE;
a->a_v = ap->arg_interp;
a++;
a->a_type = AT_FLAGS;
a->a_v = 0;
a++;
a->a_type = AT_ENTRY;
a->a_v = ap->arg_entry;
a++;
a->a_type = AT_STACKBASE;
a->a_v = l->l_proc->p_stackbase;
a++;
a->a_type = AT_EUID;
if (vap->va_mode & S_ISUID)
a->a_v = vap->va_uid;
else
a->a_v = kauth_cred_geteuid(l->l_cred);
a++;
a->a_type = AT_RUID;
a->a_v = kauth_cred_getuid(l->l_cred);
a++;
a->a_type = AT_EGID;
if (vap->va_mode & S_ISGID)
a->a_v = vap->va_gid;
else
a->a_v = kauth_cred_getegid(l->l_cred);
a++;
a->a_type = AT_RGID;
a->a_v = kauth_cred_getgid(l->l_cred);
a++;
/* "/" means fexecve(2) could not resolve the pathname */
if (path[0] == '/' && path[1] != '\0') {
execname = a;
a->a_type = AT_SUN_EXECNAME;
a++;
}
exec_free_emul_arg(pack);
}
a->a_type = AT_NULL;
a->a_v = 0;
a++;
vlen = (a - ai) * sizeof(ai[0]);
KASSERT(vlen <= sizeof(ai));
if (execname) {
execname->a_v = (uintptr_t)(*stackp + vlen);
len = strlen(path) + 1;
if ((error = copyout(path, (*stackp + vlen), len)) != 0)
return error;
len = ALIGN(len);
} else {
len = 0;
}
if ((error = copyout(ai, *stackp, vlen)) != 0)
return error;
*stackp += vlen + len;
return 0;
}
/*
* Copy arguments onto the stack in the normal way, but add some
* extra information in case of dynamic binding.
*/
int
elf_copyargs(struct lwp *l, struct exec_package *pack,
struct ps_strings *arginfo, char **stackp, void *argp)
{
int error;
if ((error = copyargs(l, pack, arginfo, stackp, argp)) != 0)
return error;
return elf_populate_auxv(l, pack, stackp);
}
/*
* elf_check_header():
*
* Check header for validity; return 0 if ok, ENOEXEC if error
*/
int
elf_check_header(Elf_Ehdr *eh)
{
if (memcmp(eh->e_ident, ELFMAG, SELFMAG) != 0 ||
eh->e_ident[EI_CLASS] != ELFCLASS) {
DPRINTF("bad magic e_ident[EI_MAG0,EI_MAG3] %#x%x%x%x, "
"e_ident[EI_CLASS] %#x", eh->e_ident[EI_MAG0],
eh->e_ident[EI_MAG1], eh->e_ident[EI_MAG2],
eh->e_ident[EI_MAG3], eh->e_ident[EI_CLASS]);
return ENOEXEC;
}
switch (eh->e_machine) {
ELFDEFNNAME(MACHDEP_ID_CASES)
default:
DPRINTF("bad machine %#x", eh->e_machine);
return ENOEXEC;
}
if (ELF_EHDR_FLAGS_OK(eh) == 0) {
DPRINTF("bad flags %#x", eh->e_flags);
return ENOEXEC;
}
if (eh->e_shnum > ELF_MAXSHNUM || eh->e_phnum > ELF_MAXPHNUM) {
DPRINTF("bad shnum/phnum %#x/%#x", eh->e_shnum, eh->e_phnum);
return ENOEXEC;
}
return 0;
}
/*
* elf_load_psection():
*
* Load a psection at the appropriate address
*/
static int
elf_load_psection(struct exec_vmcmd_set *vcset, struct vnode *vp,
const Elf_Phdr *ph, Elf_Addr *addr, u_long *size, int flags)
{
u_long msize, psize, rm, rf;
long diff, offset;
int vmprot = 0;
KASSERT(VOP_ISLOCKED(vp) != LK_NONE);
/*
* If the caller did not specify an address, use the psection's own
* virtual address; otherwise we load at the requested address.
*/
if (*addr == ELFDEFNNAME(NO_ADDR))
*addr = ph->p_vaddr;
if (ph->p_align > 1) {
/*
* Make sure we are virtually aligned as we are supposed to be.
*/
diff = ph->p_vaddr - ELF_TRUNC(ph->p_vaddr, ph->p_align);
if (*addr - diff != ELF_TRUNC(*addr, ph->p_align)) {
DPRINTF("bad alignment %#jx != %#jx\n",
(uintptr_t)(*addr - diff),
(uintptr_t)ELF_TRUNC(*addr, ph->p_align));
return EINVAL;
}
/*
* But make sure to not map any pages before the start of the
* psection by limiting the difference to within a page.
*/
diff &= PAGE_MASK;
} else
diff = 0;
vmprot |= (ph->p_flags & PF_R) ? VM_PROT_READ : 0;
vmprot |= (ph->p_flags & PF_W) ? VM_PROT_WRITE : 0;
vmprot |= (ph->p_flags & PF_X) ? VM_PROT_EXECUTE : 0;
/*
* Adjust everything so it all starts on a page boundary.
*/
*addr -= diff;
offset = ph->p_offset - diff;
*size = ph->p_filesz + diff;
msize = ph->p_memsz + diff;
if (ph->p_align >= PAGE_SIZE) {
if ((ph->p_flags & PF_W) != 0) {
/*
* Because the pagedvn pager can't handle zero fill
* of the last data page if it's not page aligned,
* we map the last page readvn.
*/
psize = trunc_page(*size);
} else {
psize = round_page(*size);
}
} else {
psize = *size;
}
if (psize > 0) {
NEW_VMCMD2(vcset, ph->p_align < PAGE_SIZE ?
vmcmd_map_readvn : vmcmd_map_pagedvn, psize, *addr, vp,
offset, vmprot, flags);
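/*
 * After the first command only VMCMD_RELATIVE may carry over;
 * any base/fixed placement flags apply to the initial mapping only.
 */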
flags &= VMCMD_RELATIVE;
}
if (psize < *size) {
NEW_VMCMD2(vcset, vmcmd_map_readvn, *size - psize,
*addr + psize, vp, offset + psize, vmprot, flags);
}
/*
* Check if we need to extend the size of the segment (does
* bss extend past the next page boundary)?
*/
rm = round_page(*addr + msize);
rf = round_page(*addr + *size);
if (rm != rf) {
NEW_VMCMD2(vcset, vmcmd_map_zero, rm - rf, rf, NULLVP,
0, vmprot, flags & VMCMD_RELATIVE);
*size = msize;
}
return 0;
}
/*
* elf_load_interp():
*
* Load an interpreter pointed to by path.
*/
static int
elf_load_interp(struct lwp *l, struct exec_package *epp, char *path,
struct exec_vmcmd_set *vcset, u_long *entryoff, Elf_Addr *last)
{
int error, i;
struct vnode *vp;
Elf_Ehdr eh;
Elf_Phdr *ph = NULL;
const Elf_Phdr *base_ph;
const Elf_Phdr *last_ph;
u_long phsize;
Elf_Addr addr = *last;
struct proc *p;
bool use_topdown;
p = l->l_proc;
KASSERT(p->p_vmspace);
KASSERT(p->p_vmspace != proc0.p_vmspace);
#ifdef __USE_TOPDOWN_VM
use_topdown = epp->ep_flags & EXEC_TOPDOWN_VM;
#else
use_topdown = false;
#endif
/*
* 1. open file
* 2. read filehdr
* 3. map text, data, and bss out of it using VM_*
*/
vp = epp->ep_interp;
if (vp == NULL) {
error = emul_find_interp(l, epp, path);
if (error != 0)
return error;
vp = epp->ep_interp;
}
/* We'll tidy this ourselves - otherwise we have locking issues */
epp->ep_interp = NULL;
vn_lock(vp, LK_SHARED | LK_RETRY);
/*
* Similarly, if it's not marked as executable, or it's not a regular
* file, we don't allow it to be used.
*/
if (vp->v_type != VREG) {
error = EACCES;
goto bad;
}
if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
goto bad;
/*
* Check mount point. Though we're not trying to exec this binary,
* we will be executing code from it, so if the mount point
* disallows execution or set-id-ness, we punt or kill the set-id.
*/
if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
error = EACCES;
goto bad;
}
if (vp->v_mount->mnt_flag & MNT_NOSUID)
epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
error = vn_marktext(vp);
if (error)
goto bad;
error = exec_read(l, vp, 0, &eh, sizeof(eh), IO_NODELOCKED);
if (error != 0)
goto bad;
if ((error = elf_check_header(&eh)) != 0)
goto bad;
if (eh.e_type != ET_DYN || eh.e_phnum == 0) {
DPRINTF("bad interpreter type %#x", eh.e_type);
error = ENOEXEC;
goto bad;
}
phsize = eh.e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, vp, eh.e_phoff, ph, phsize, IO_NODELOCKED);
if (error != 0)
goto bad;
#ifdef ELF_INTERP_NON_RELOCATABLE
/*
* Evil hack: Only MIPS should be non-relocatable, and the
* psections should have a high address (typically 0x5ffe0000).
* If it's now relocatable, it should be linked at 0 and the
* psections should have zeros in the upper part of the address.
* Otherwise, force the load at the linked address.
*/
if (*last == ELF_LINK_ADDR && (ph->p_vaddr & 0xffff0000) == 0)
*last = ELFDEFNNAME(NO_ADDR);
#endif
/*
* If no position to load the interpreter was set by a probe
* function, pick the same address that a non-fixed mmap(0, ..)
* would (i.e. something safely out of the way).
*/
if (*last == ELFDEFNNAME(NO_ADDR)) {
u_long limit = 0;
/*
* Find the start and ending addresses of the psections to
* be loaded. This will give us the size.
*/
for (i = 0, base_ph = NULL; i < eh.e_phnum; i++) {
if (ph[i].p_type == PT_LOAD) {
u_long psize = ph[i].p_vaddr + ph[i].p_memsz;
if (base_ph == NULL)
base_ph = &ph[i];
if (psize > limit)
limit = psize;
}
}
if (base_ph == NULL) {
DPRINTF("no interpreter loadable sections");
error = ENOEXEC;
goto bad;
}
/*
* Now compute the size and load address.
*/
addr = (*epp->ep_esch->es_emul->e_vm_default_addr)(p,
epp->ep_daddr,
round_page(limit) - trunc_page(base_ph->p_vaddr),
use_topdown);
addr += (Elf_Addr)pax_aslr_rtld_offset(epp, base_ph->p_align,
use_topdown);
} else {
addr = *last; /* may be ELF_LINK_ADDR */
}
/*
* Load all the necessary sections
*/
for (i = 0, base_ph = NULL, last_ph = NULL; i < eh.e_phnum; i++) {
switch (ph[i].p_type) {
case PT_LOAD: {
u_long size;
int flags;
if (base_ph == NULL) {
/*
* First encountered psection is always the
* base psection. Make sure it's aligned
* properly (align down for topdown and align
* upwards for not topdown).
*/
base_ph = &ph[i];
flags = VMCMD_BASE;
if (addr == ELF_LINK_ADDR)
addr = ph[i].p_vaddr;
if (use_topdown)
addr = ELF_TRUNC(addr, ph[i].p_align);
else
addr = ELF_ROUND(addr, ph[i].p_align);
} else {
u_long limit = round_page(last_ph->p_vaddr
+ last_ph->p_memsz);
u_long base = trunc_page(ph[i].p_vaddr);
/*
* If there is a gap in between the psections,
* map it as inaccessible so nothing else
* mmap'ed will be placed there.
*/
if (limit != base) {
NEW_VMCMD2(vcset, vmcmd_map_zero,
base - limit,
limit - base_ph->p_vaddr, NULLVP,
0, VM_PROT_NONE, VMCMD_RELATIVE);
}
addr = ph[i].p_vaddr - base_ph->p_vaddr;
flags = VMCMD_RELATIVE;
}
last_ph = &ph[i];
if ((error = elf_load_psection(vcset, vp, &ph[i], &addr,
&size, flags)) != 0)
goto bad;
/*
* If entry is within this psection then this
* must contain the .text section. *entryoff is
* relative to the base psection.
*/
if (eh.e_entry >= ph[i].p_vaddr &&
eh.e_entry < (ph[i].p_vaddr + size)) {
*entryoff = eh.e_entry - base_ph->p_vaddr;
}
addr += size;
break;
}
default:
break;
}
}
kmem_free(ph, phsize);
/*
* This value is ignored if TOPDOWN.
*/
*last = addr;
vput(vp);
return 0;
bad:
if (ph != NULL)
kmem_free(ph, phsize);
vput(vp);
return error;
}
/*
* exec_elf_makecmds(): Prepare an Elf binary's exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
int
exec_elf_makecmds(struct lwp *l, struct exec_package *epp)
{
Elf_Ehdr *eh = epp->ep_hdr;
Elf_Phdr *ph, *pp;
Elf_Addr phdr = 0, computed_phdr = 0, pos = 0, end_text = 0;
int error, i;
char *interp = NULL;
u_long phsize;
struct elf_args *ap;
bool is_dyn = false;
if (epp->ep_hdrvalid < sizeof(Elf_Ehdr)) {
DPRINTF("small header %#x", epp->ep_hdrvalid);
return ENOEXEC;
}
if ((error = elf_check_header(eh)) != 0)
return error;
if (eh->e_type == ET_DYN)
/* PIE, and some libs have an entry point */
is_dyn = true;
else if (eh->e_type != ET_EXEC) {
DPRINTF("bad type %#x", eh->e_type);
return ENOEXEC;
}
if (eh->e_phnum == 0) {
DPRINTF("no program headers");
return ENOEXEC;
}
/* XXX only LK_EXCLUSIVE to match all others - allow spinning */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
error = vn_marktext(epp->ep_vp);
if (error) {
VOP_UNLOCK(epp->ep_vp);
return error;
}
/*
* Allocate space to hold all the program headers, and read them
* from the file
*/
phsize = eh->e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, epp->ep_vp, eh->e_phoff, ph, phsize,
IO_NODELOCKED);
if (error != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
epp->ep_taddr = epp->ep_tsize = ELFDEFNNAME(NO_ADDR);
epp->ep_daddr = epp->ep_dsize = ELFDEFNNAME(NO_ADDR);
for (i = 0; i < eh->e_phnum; i++) {
pp = &ph[i];
if (pp->p_type == PT_INTERP) {
if (pp->p_filesz < 2 || pp->p_filesz > MAXPATHLEN) {
DPRINTF("bad interpreter namelen %#jx",
(uintmax_t)pp->p_filesz);
error = ENOEXEC;
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
interp = PNBUF_GET();
error = exec_read(l, epp->ep_vp, pp->p_offset, interp,
pp->p_filesz, IO_NODELOCKED);
if (error != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/* Ensure interp is NUL-terminated and of the expected length */
if (strnlen(interp, pp->p_filesz) != pp->p_filesz - 1) {
DPRINTF("bad interpreter name");
error = ENOEXEC;
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
break;
}
}
/*
* On the same architecture, we may be emulating different systems.
* See which one will accept this executable.
*
* Probe functions would normally see if the interpreter (if any)
* exists. Emulation packages may possibly replace the interpreter in
* interp with a changed path (/emul/xxx/<path>).
*/
pos = ELFDEFNNAME(NO_ADDR);
if (epp->ep_esch->u.elf_probe_func) {
vaddr_t startp = (vaddr_t)pos;
error = (*epp->ep_esch->u.elf_probe_func)(l, epp, eh, interp,
&startp);
if (error) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
pos = (Elf_Addr)startp;
}
if (is_dyn && (error = elf_placedynexec(epp, eh, ph)) != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/*
* Load all the necessary sections
*/
for (i = 0; i < eh->e_phnum; i++) {
Elf_Addr addr = ELFDEFNNAME(NO_ADDR);
u_long size = 0;
switch (ph[i].p_type) {
case PT_LOAD:
if ((error = elf_load_psection(&epp->ep_vmcmds,
epp->ep_vp, &ph[i], &addr, &size, VMCMD_FIXED))
!= 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/*
* Consider this as text segment, if it is executable.
* If there is more than one text segment, pick the
* largest.
*/
if (ph[i].p_flags & PF_X) {
if (epp->ep_taddr == ELFDEFNNAME(NO_ADDR) ||
size > epp->ep_tsize) {
epp->ep_taddr = addr;
epp->ep_tsize = size;
}
end_text = addr + size;
} else {
epp->ep_daddr = addr;
epp->ep_dsize = size;
}
if (ph[i].p_offset == 0) {
computed_phdr = ph[i].p_vaddr + eh->e_phoff;
}
break;
case PT_SHLIB:
/* SCO has these sections. */
case PT_INTERP:
/* Already did this one. */
case PT_DYNAMIC:
case PT_NOTE:
break;
case PT_PHDR:
/* Note address of program headers (in text segment) */
phdr = ph[i].p_vaddr;
break;
default:
/*
* Not fatal; we don't need to understand everything.
*/
break;
}
}
/* Now done with the vnode. */
VOP_UNLOCK(epp->ep_vp);
if (epp->ep_vmcmds.evs_used == 0) {
/* No VMCMD; there was no PT_LOAD section, or those
* sections were empty */
DPRINTF("no vmcommands");
error = ENOEXEC;
goto bad;
}
if (epp->ep_daddr == ELFDEFNNAME(NO_ADDR)) {
epp->ep_daddr = round_page(end_text);
epp->ep_dsize = 0;
}
/*
* Check if we found a dynamically linked binary and arrange to load
* its interpreter
*/
if (interp) {
u_int nused = epp->ep_vmcmds.evs_used;
u_long interp_offset = 0;
if ((error = elf_load_interp(l, epp, interp,
&epp->ep_vmcmds, &interp_offset, &pos)) != 0) {
goto bad;
}
if (epp->ep_vmcmds.evs_used == nused) {
/* elf_load_interp() has not set up any new VMCMD */
DPRINTF("no vmcommands for interpreter");
error = ENOEXEC;
goto bad;
}
ap = kmem_alloc(sizeof(*ap), KM_SLEEP);
ap->arg_interp = epp->ep_vmcmds.evs_cmds[nused].ev_addr;
epp->ep_entryoffset = interp_offset;
epp->ep_entry = ap->arg_interp + interp_offset;
PNBUF_PUT(interp);
interp = NULL;
} else {
epp->ep_entry = eh->e_entry;
if (epp->ep_flags & EXEC_FORCEAUX) {
ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
ap->arg_interp = (vaddr_t)NULL;
} else {
ap = NULL;
}
}
if (ap) {
ap->arg_phaddr = phdr ? phdr : computed_phdr;
ap->arg_phentsize = eh->e_phentsize;
ap->arg_phnum = eh->e_phnum;
ap->arg_entry = eh->e_entry;
epp->ep_emul_arg = ap;
epp->ep_emul_arg_free = elf_free_emul_arg;
}
#ifdef ELF_MAP_PAGE_ZERO
/* Dell SVR4 maps page zero, yeuch! */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, PAGE_SIZE, 0,
epp->ep_vp, 0, VM_PROT_READ);
#endif
error = (*epp->ep_esch->es_setup_stack)(l, epp);
if (error)
goto bad;
kmem_free(ph, phsize);
return 0;
bad:
if (interp)
PNBUF_PUT(interp);
exec_free_emul_arg(epp);
kmem_free(ph, phsize);
kill_vmcmds(&epp->ep_vmcmds);
return error;
}
int
netbsd_elf_signature(struct lwp *l, struct exec_package *epp,
Elf_Ehdr *eh)
{
size_t i;
Elf_Phdr *ph;
size_t phsize;
char *nbuf;
int error;
int isnetbsd = 0;
epp->ep_pax_flags = 0;
if (eh->e_phnum > ELF_MAXPHNUM || eh->e_phnum == 0) {
DPRINTF("no signature %#x", eh->e_phnum);
return ENOEXEC;
}
phsize = eh->e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, epp->ep_vp, eh->e_phoff, ph, phsize,
IO_NODELOCKED);
if (error)
goto out;
nbuf = kmem_alloc(ELF_MAXNOTESIZE, KM_SLEEP);
for (i = 0; i < eh->e_phnum; i++) {
const char *nptr;
size_t nlen;
if (ph[i].p_type != PT_NOTE ||
ph[i].p_filesz > ELF_MAXNOTESIZE)
continue;
nlen = ph[i].p_filesz;
error = exec_read(l, epp->ep_vp, ph[i].p_offset, nbuf, nlen,
IO_NODELOCKED);
if (error)
continue;
nptr = nbuf;
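/*
 * Each ELF note record is an Elf_Nhdr followed by the name and the
 * descriptor, each padded to a 4-byte boundary; walk the records
 * until the buffer is exhausted or a malformed entry is found.
 */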
while (nlen > 0) {
const Elf_Nhdr *np;
const char *ndata, *ndesc;
/* note header */
np = (const Elf_Nhdr *)nptr;
if (nlen < sizeof(*np)) {
break;
}
nptr += sizeof(*np);
nlen -= sizeof(*np);
/* note name */
ndata = nptr;
if (nlen < roundup(np->n_namesz, 4)) {
break;
}
nptr += roundup(np->n_namesz, 4);
nlen -= roundup(np->n_namesz, 4);
/* note description */
ndesc = nptr;
if (nlen < roundup(np->n_descsz, 4)) {
break;
}
nptr += roundup(np->n_descsz, 4);
nlen -= roundup(np->n_descsz, 4);
isnetbsd |= netbsd_elf_note(epp, np, ndata, ndesc);
}
}
kmem_free(nbuf, ELF_MAXNOTESIZE);
error = isnetbsd ? 0 : ENOEXEC;
#ifdef DEBUG_ELF
if (error)
DPRINTF("not netbsd");
#endif
out:
kmem_free(ph, phsize);
return error;
}
int
netbsd_elf_note(struct exec_package *epp,
const Elf_Nhdr *np, const char *ndata, const char *ndesc)
{
int isnetbsd = 0;
#ifdef DIAGNOSTIC
const char *badnote;
#define BADNOTE(n) badnote = (n)
#else
#define BADNOTE(n)
#endif
switch (np->n_type) {
case ELF_NOTE_TYPE_NETBSD_TAG:
/* It is us */
if (np->n_namesz == ELF_NOTE_NETBSD_NAMESZ &&
np->n_descsz == ELF_NOTE_NETBSD_DESCSZ &&
memcmp(ndata, ELF_NOTE_NETBSD_NAME,
ELF_NOTE_NETBSD_NAMESZ) == 0) {
memcpy(&epp->ep_osversion, ndesc,
ELF_NOTE_NETBSD_DESCSZ);
isnetbsd = 1;
break;
}
/*
* Ignore SuSE tags; SuSE's n_type is the same as the
* NetBSD one.
*/
if (np->n_namesz == ELF_NOTE_SUSE_NAMESZ &&
memcmp(ndata, ELF_NOTE_SUSE_NAME,
ELF_NOTE_SUSE_NAMESZ) == 0)
break;
/*
* Ignore old GCC
*/
if (np->n_namesz == ELF_NOTE_OGCC_NAMESZ &&
memcmp(ndata, ELF_NOTE_OGCC_NAME,
ELF_NOTE_OGCC_NAMESZ) == 0)
break;
BADNOTE("NetBSD tag");
goto bad;
case ELF_NOTE_TYPE_PAX_TAG:
if (np->n_namesz == ELF_NOTE_PAX_NAMESZ &&
np->n_descsz == ELF_NOTE_PAX_DESCSZ &&
memcmp(ndata, ELF_NOTE_PAX_NAME,
ELF_NOTE_PAX_NAMESZ) == 0) {
uint32_t flags;
memcpy(&flags, ndesc, sizeof(flags));
/* Convert the flags and insert them into
* the exec package. */
pax_setup_elf_flags(epp, flags);
break;
}
BADNOTE("PaX tag");
goto bad;
case ELF_NOTE_TYPE_MARCH_TAG:
/* Copy the machine arch into the package. */
if (np->n_namesz == ELF_NOTE_MARCH_NAMESZ
&& memcmp(ndata, ELF_NOTE_MARCH_NAME,
ELF_NOTE_MARCH_NAMESZ) == 0) {
/* Do not truncate the buffer */
if (np->n_descsz > sizeof(epp->ep_machine_arch)) {
BADNOTE("description size limit");
goto bad;
}
/*
* Ensure ndesc is NUL-terminated and of the
* expected length.
*/
if (strnlen(ndesc, np->n_descsz) + 1 !=
np->n_descsz) {
BADNOTE("description size");
goto bad;
}
strlcpy(epp->ep_machine_arch, ndesc,
sizeof(epp->ep_machine_arch));
break;
}
BADNOTE("march tag");
goto bad;
case ELF_NOTE_TYPE_MCMODEL_TAG:
/* arch specific check for code model */
#ifdef ELF_MD_MCMODEL_CHECK
if (np->n_namesz == ELF_NOTE_MCMODEL_NAMESZ
&& memcmp(ndata, ELF_NOTE_MCMODEL_NAME,
ELF_NOTE_MCMODEL_NAMESZ) == 0) {
ELF_MD_MCMODEL_CHECK(epp, ndesc, np->n_descsz);
break;
}
BADNOTE("mcmodel tag");
goto bad;
#endif
break;
case ELF_NOTE_TYPE_SUSE_VERSION_TAG:
break;
case ELF_NOTE_TYPE_GO_BUILDID_TAG:
break;
case ELF_NOTE_TYPE_FDO_PACKAGING_METADATA:
break;
case ELF_NOTE_TYPE_NETBSD_EMUL_TAG:
/* Ancient NetBSD version tag */
break;
default:
BADNOTE("unknown tag");
bad:
#ifdef DIAGNOSTIC
/* Ignore GNU tags */
if (np->n_namesz == ELF_NOTE_GNU_NAMESZ &&
memcmp(ndata, ELF_NOTE_GNU_NAME,
ELF_NOTE_GNU_NAMESZ) == 0)
break;
int ns = (int)np->n_namesz;
printf("%s: Unknown elf note type %d (%s): "
"[namesz=%d, descsz=%d name=%-*.*s]\n",
epp->ep_kname, np->n_type, badnote, np->n_namesz,
np->n_descsz, ns, ns, ndata);
#endif
break;
}
return isnetbsd;
}
int
netbsd_elf_probe(struct lwp *l, struct exec_package *epp, void *eh, char *itp,
vaddr_t *pos)
{
int error;
if ((error = netbsd_elf_signature(l, epp, eh)) != 0)
return error;
#ifdef ELF_MD_PROBE_FUNC
if ((error = ELF_MD_PROBE_FUNC(l, epp, eh, itp, pos)) != 0)
return error;
#elif defined(ELF_INTERP_NON_RELOCATABLE)
*pos = ELF_LINK_ADDR;
#endif
epp->ep_flags |= EXEC_FORCEAUX;
return 0;
}
void
elf_free_emul_arg(void *arg)
{
struct elf_args *ap = arg;
KASSERT(ap != NULL);
kmem_free(ap, sizeof(*ap));
}
/* $NetBSD: if_loop.c,v 1.118 2022/09/04 23:34:51 thorpej Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_loop.c 8.2 (Berkeley) 1/9/95
*/
/*
* Loopback interface driver for protocol testing and timing.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_loop.c,v 1.118 2022/09/04 23:34:51 thorpej Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_atalk.h"
#include "opt_mbuftrace.h"
#include "opt_mpls.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/device.h>
#include <sys/module.h>
#include <sys/cpu.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#endif
#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet6/in6_var.h>
#include <netinet6/in6_offload.h>
#include <netinet/ip6.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#include <netmpls/mpls_var.h>
#endif
#ifdef NETATALK
#include <netatalk/at.h>
#include <netatalk/at_var.h>
#endif
#include <net/bpf.h>
#if defined(LARGE_LOMTU)
#define LOMTU (131072 + MHLEN + MLEN)
#define LOMTU_MAX LOMTU
#else
#define LOMTU (32768 + MHLEN + MLEN)
#define LOMTU_MAX (65536 + MHLEN + MLEN)
#endif
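/*
 * Illustrative sizes (assuming the usual machine-dependent MHLEN/MLEN
 * values, which add up to a few hundred bytes): the default LOMTU is
 * a bit over 32 KiB and LOMTU_MAX a bit over 64 KiB; with LARGE_LOMTU
 * both are a bit over 128 KiB.
 */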
#ifdef ALTQ
static void lostart(struct ifnet *);
#endif
static int loop_clone_create(struct if_clone *, int);
static int loop_clone_destroy(struct ifnet *);
static void loop_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
static struct if_clone loop_cloner =
IF_CLONE_INITIALIZER("lo", loop_clone_create, loop_clone_destroy);
void
loopattach(int n)
{
#ifndef _MODULE
loop_clone_create(&loop_cloner, 0); /* lo0 always exists */
#endif
}
void
loopinit(void)
{
if (lo0ifp != NULL) /* can happen in rump kernel */
return;
#ifdef _MODULE
loop_clone_create(&loop_cloner, 0); /* lo0 always exists */
#endif
if_clone_attach(&loop_cloner);
}
static int
loopdetach(void)
{
/* no detach for now; we don't allow lo0 to be deleted */
return EBUSY;
}
static int
loop_clone_create(struct if_clone *ifc, int unit)
{
struct ifnet *ifp;
ifp = if_alloc(IFT_LOOP);
if_initname(ifp, ifc->ifc_name, unit);
ifp->if_mtu = LOMTU;
ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
#ifdef NET_MPSAFE
ifp->if_extflags = IFEF_MPSAFE;
#endif
ifp->if_ioctl = loioctl;
ifp->if_output = looutput;
#ifdef ALTQ
ifp->if_start = lostart;
#endif
ifp->if_type = IFT_LOOP;
ifp->if_hdrlen = 0;
ifp->if_addrlen = 0;
ifp->if_dlt = DLT_NULL;
IFQ_SET_READY(&ifp->if_snd);
if (unit == 0)
lo0ifp = ifp;
if_initialize(ifp);
ifp->if_link_state = LINK_STATE_UP;
if_alloc_sadl(ifp);
bpf_attach(ifp, DLT_NULL, sizeof(u_int));
#ifdef MBUFTRACE
ifp->if_mowner = malloc(sizeof(struct mowner), M_DEVBUF,
M_WAITOK | M_ZERO);
strlcpy(ifp->if_mowner->mo_name, ifp->if_xname,
sizeof(ifp->if_mowner->mo_name));
MOWNER_ATTACH(ifp->if_mowner);
#endif
ifp->if_flags |= IFF_RUNNING;
if_register(ifp);
return (0);
}
static int
loop_clone_destroy(struct ifnet *ifp)
{
if (ifp == lo0ifp)
return (EPERM);
ifp->if_flags &= ~IFF_RUNNING;
#ifdef MBUFTRACE
MOWNER_DETACH(ifp->if_mowner);
free(ifp->if_mowner, M_DEVBUF);
#endif
bpf_detach(ifp);
if_detach(ifp);
if_free(ifp);
return (0);
}
int
looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
const struct rtentry *rt)
{
pktqueue_t *pktq = NULL;
int s;
int csum_flags;
int error = 0;
size_t pktlen;
MCLAIM(m, ifp->if_mowner);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
if ((m->m_flags & M_PKTHDR) == 0)
panic("looutput: no header mbuf"); if (ifp->if_flags & IFF_LOOPBACK) bpf_mtap_af(ifp, dst->sa_family, m, BPF_D_OUT); m_set_rcvif(m, ifp); if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m);
error = (rt->rt_flags & RTF_BLACKHOLE ? 0 :
rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
goto out;
}
pktlen = m->m_pkthdr.len;
if_statadd2(ifp, if_opackets, 1, if_obytes, pktlen);
#ifdef ALTQ
/*
* ALTQ on the loopback interface is just for debugging. It's
* used only for loopback interfaces, not for a simplex interface.
*/
if ((ALTQ_IS_ENABLED(&ifp->if_snd) || TBR_IS_ENABLED(&ifp->if_snd)) &&
ifp->if_start == lostart) {
/*
* If the queueing discipline needs packet classification,
* do it before prepending the link headers.
*/
IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family);
M_PREPEND(m, sizeof(uint32_t), M_DONTWAIT);
if (m == NULL) {
if_statinc(ifp, if_oerrors);
error = ENOBUFS;
goto out;
}
*(mtod(m, uint32_t *)) = dst->sa_family;
error = if_transmit_lock(ifp, m);
goto out;
}
#endif /* ALTQ */
m_tag_delete_chain(m);
#ifdef MPLS
bool is_mpls = false;
if (rt != NULL && rt_gettag(rt) != NULL &&
rt_gettag(rt)->sa_family == AF_MPLS &&
(m->m_flags & (M_MCAST | M_BCAST)) == 0) {
union mpls_shim msh;
msh.s_addr = MPLS_GETSADDR(rt);
if (msh.shim.label != MPLS_LABEL_IMPLNULL) {
is_mpls = true;
pktq = mpls_pktq;
}
}
if (!is_mpls)
#endif
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
csum_flags = m->m_pkthdr.csum_flags;
KASSERT((csum_flags & ~(M_CSUM_IPv4|M_CSUM_UDPv4)) == 0);
if (csum_flags != 0 && IN_LOOPBACK_NEED_CHECKSUM(csum_flags)) {
in_undefer_cksum(m, 0, csum_flags);
m->m_pkthdr.csum_flags = 0;
} else {
/*
* Do nothing. Leave M_CSUM_IPv4 and M_CSUM_UDPv4 set
* to indicate the checksums were already computed and are good.
*/
}
pktq = ip_pktq;
break;
#endif
#ifdef INET6
case AF_INET6:
csum_flags = m->m_pkthdr.csum_flags;
KASSERT((csum_flags & ~M_CSUM_UDPv6) == 0);
if (csum_flags != 0 && IN6_LOOPBACK_NEED_CHECKSUM(csum_flags)) {
in6_undefer_cksum(m, 0, csum_flags);
m->m_pkthdr.csum_flags = 0;
} else {
/*
* Do nothing. Leave M_CSUM_UDPv6 set to indicate
* the checksum was already computed and is good.
*/
}
m->m_flags |= M_LOOP;
pktq = ip6_pktq;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
pktq = at_pktq2;
break;
#endif
default:
printf("%s: can't handle af%d\n", ifp->if_xname,
dst->sa_family);
m_freem(m);
error = EAFNOSUPPORT;
goto out;
}
KASSERT(pktq != NULL);
error = 0;
s = splnet();
if (__predict_true(pktq_enqueue(pktq, m, 0))) {
if_statadd2(ifp, if_ipackets, 1, if_ibytes, pktlen);
} else {
m_freem(m);
if_statinc(ifp, if_oerrors);
error = ENOBUFS;
}
splx(s);
out:
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
return error;
}
#ifdef ALTQ
static void
lostart(struct ifnet *ifp)
{
for (;;) {
pktqueue_t *pktq = NULL;
struct mbuf *m;
size_t pktlen;
uint32_t af;
int s;
IFQ_DEQUEUE(&ifp->if_snd, m);
if (m == NULL)
return;
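/*
 * looutput() prepended the destination address family as a 32-bit
 * header before handing the packet to ALTQ; recover it here and
 * strip it before enqueueing to the protocol input queue.
 */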
af = *(mtod(m, uint32_t *));
m_adj(m, sizeof(uint32_t));
switch (af) {
#ifdef INET
case AF_INET:
pktq = ip_pktq;
break;
#endif
#ifdef INET6
case AF_INET6:
m->m_flags |= M_LOOP;
pktq = ip6_pktq;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
pktq = at_pktq2;
break;
#endif
default:
printf("%s: can't handle af%d\n", ifp->if_xname, af);
m_freem(m);
return;
}
pktlen = m->m_pkthdr.len;
KASSERT(pktq != NULL);
s = splnet();
if (__predict_false(pktq_enqueue(pktq, m, 0))) {
m_freem(m);
splx(s);
return;
}
if_statadd2(ifp, if_ipackets, 1, if_ibytes, pktlen);
splx(s);
}
}
#endif /* ALTQ */
/* ARGSUSED */
static void
loop_rtrequest(int cmd, struct rtentry *rt,
const struct rt_addrinfo *info)
{
if (rt)
rt->rt_rmx.rmx_mtu = lo0ifp->if_mtu;
}
/*
* Process an ioctl request.
*/
/* ARGSUSED */
int
loioctl(struct ifnet *ifp, u_long cmd, void *data)
{
struct ifaddr *ifa;
struct ifreq *ifr = data;
int error = 0;
switch (cmd) {
case SIOCINITIFADDR:
ifp->if_flags |= IFF_UP;
ifa = (struct ifaddr *)data;
if (ifa != NULL)
ifa->ifa_rtrequest = loop_rtrequest;
/*
* Everything else is done at a higher level.
*/
break;
case SIOCSIFMTU:
if ((unsigned)ifr->ifr_mtu > LOMTU_MAX)
error = EINVAL;
else if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET){
error = 0;
}
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (ifr == NULL) {
error = EAFNOSUPPORT; /* XXX */
break;
}
switch (ifreq_getaddr(cmd, ifr)->sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
#ifdef INET6
case AF_INET6:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
default:
error = ifioctl_common(ifp, cmd, data);
}
return (error);
}
/*
* Module infrastructure
*/
#include "if_module.h"
IF_MODULE(MODULE_CLASS_DRIVER, loop, NULL)
/* $NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $ */
/* NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_pageout.c 8.5 (Berkeley) 2/14/94
* from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#if defined(PDSIM)
#include "pdsim.h"
#else /* defined(PDSIM) */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pdpolicy_impl.h>
#include <uvm/uvm_stat.h>
#endif /* defined(PDSIM) */
/*
* per-CPU queue of pending page status changes. 128 entries makes for a
* 1kB queue on _LP64 and has been found to be a reasonable compromise that
* keeps lock contention events and wait times low, while not using too much
* memory nor allowing global state to fall too far behind.
*/
#if !defined(CLOCK_PDQ_SIZE)
#define CLOCK_PDQ_SIZE 128
#endif /* !defined(CLOCK_PDQ_SIZE) */
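/*
 * For example, on _LP64 the queue is CLOCK_PDQ_SIZE (128) pointers of
 * 8 bytes each, i.e. 1024 bytes per CPU, matching the sizing comment
 * above.
 */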
#define PQ_INACTIVE 0x00000010 /* page is in inactive list */
#define PQ_ACTIVE 0x00000020 /* page is in active list */
#if !defined(CLOCK_INACTIVEPCT)
#define CLOCK_INACTIVEPCT 33
#endif /* !defined(CLOCK_INACTIVEPCT) */
struct uvmpdpol_globalstate {
kmutex_t lock; /* lock on state */
/* <= compiler pads here */
struct pglist s_activeq /* allocated pages, in use */
__aligned(COHERENCY_UNIT);
struct pglist s_inactiveq; /* pages between the clock hands */
int s_active;
int s_inactive;
int s_inactarg;
struct uvm_pctparam s_anonmin;
struct uvm_pctparam s_filemin;
struct uvm_pctparam s_execmin;
struct uvm_pctparam s_anonmax;
struct uvm_pctparam s_filemax;
struct uvm_pctparam s_execmax;
struct uvm_pctparam s_inactivepct;
};
struct uvmpdpol_scanstate {
bool ss_anonreact, ss_filereact, ss_execreact;
struct vm_page ss_marker;
};
static void uvmpdpol_pageactivate_locked(struct vm_page *);
static void uvmpdpol_pagedeactivate_locked(struct vm_page *);
static void uvmpdpol_pagedequeue_locked(struct vm_page *);
static bool uvmpdpol_pagerealize_locked(struct vm_page *);
static struct uvm_cpu *uvmpdpol_flush(void);
static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
static struct uvmpdpol_scanstate pdpol_scanstate;
PDPOL_EVCNT_DEFINE(reactexec)
PDPOL_EVCNT_DEFINE(reactfile)
PDPOL_EVCNT_DEFINE(reactanon)
static void
clock_tune(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
s->s_active + s->s_inactive);
if (s->s_inactarg <= uvmexp.freetarg) {
s->s_inactarg = uvmexp.freetarg + 1;
}
}
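/*
 * Worked example of clock_tune() (assuming the default CLOCK_INACTIVEPCT
 * of 33): with 3000 pages on the active + inactive queues, s_inactarg
 * becomes roughly 990 pages, bumped to uvmexp.freetarg + 1 if that is
 * larger.
 */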
void
uvmpdpol_scaninit(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
int t;
bool anonunder, fileunder, execunder;
bool anonover, fileover, execover;
bool anonreact, filereact, execreact;
int64_t freepg, anonpg, filepg, execpg;
/*
* decide which types of pages we want to reactivate instead of freeing
* to keep usage within the minimum and maximum usage limits.
* uvm_availmem() will sync the counters.
*/
freepg = uvm_availmem(false);
anonpg = cpu_count_get(CPU_COUNT_ANONCLEAN) +
cpu_count_get(CPU_COUNT_ANONDIRTY) +
cpu_count_get(CPU_COUNT_ANONUNKNOWN);
execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
filepg = cpu_count_get(CPU_COUNT_FILECLEAN) +
cpu_count_get(CPU_COUNT_FILEDIRTY) +
cpu_count_get(CPU_COUNT_FILEUNKNOWN) -
execpg;
mutex_enter(&s->lock);
t = s->s_active + s->s_inactive + freepg;
anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
anonreact = anonunder || (!anonover && (fileover || execover));
filereact = fileunder || (!fileover && (anonover || execover));
execreact = execunder || (!execover && (anonover || fileover));
if (filereact && execreact && (anonreact || uvm_swapisfull())) {
anonreact = filereact = execreact = false;
}
ss->ss_anonreact = anonreact;
ss->ss_filereact = filereact;
ss->ss_execreact = execreact;
memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
ss->ss_marker.flags = PG_MARKER;
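/*
 * The marker is a dummy page queued at the head of the inactive list;
 * the scan advances it past each page it visits, so the scan position
 * survives dropping s->lock while an object lock is taken.
 */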
TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
mutex_exit(&s->lock);
}
void
uvmpdpol_scanfini(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
mutex_enter(&s->lock);
TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
mutex_exit(&s->lock);
}
struct vm_page *
uvmpdpol_selectvictim(krwlock_t **plock)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
struct vm_page *pg;
krwlock_t *lock;
mutex_enter(&s->lock);
while (/* CONSTCOND */ 1) {
struct vm_anon *anon;
struct uvm_object *uobj;
pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
if (pg == NULL) {
break;
}
KASSERT((pg->flags & PG_MARKER) == 0);
uvmexp.pdscans++;
/*
* acquire interlock to stabilize page identity.
* if we have caught the page in a state of flux
* deal with it and retry.
*/
mutex_enter(&pg->interlock);
if (uvmpdpol_pagerealize_locked(pg)) {
mutex_exit(&pg->interlock);
continue;
}
/*
* now prepare to move on to the next page.
*/
TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
pdqueue);
TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
&ss->ss_marker, pdqueue);
/*
* enforce the minimum thresholds on different
* types of memory usage. if reusing the current
* page would reduce that type of usage below its
* minimum, reactivate the page instead and move
* on to the next page.
*/
anon = pg->uanon;
uobj = pg->uobject;
if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactexec);
continue;
}
if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
!UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactfile);
continue;
}
if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactanon);
continue;
}
/*
* try to lock the object that owns the page.
*
* with the page interlock held, we can drop s->lock, which
* could otherwise serve as a barrier to us getting the
* object locked, because the owner of the object's lock may
* be blocked on s->lock (i.e. a deadlock).
*
* whatever happens, uvmpd_trylockowner() will release the
* interlock. with the interlock dropped we can then
* re-acquire our own lock. the order is:
*
* object -> pdpol -> interlock.
*/
mutex_exit(&s->lock);
lock = uvmpd_trylockowner(pg);
/* pg->interlock now released */
mutex_enter(&s->lock);
if (lock == NULL) {
/* didn't get it - try the next page. */
continue;
}
/*
* move referenced pages back to active queue and skip to
* next page.
*/
if (pmap_is_referenced(pg)) {
mutex_enter(&pg->interlock);
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
uvmexp.pdreact++;
rw_exit(lock);
continue;
}
/* we have a potential victim. */
*plock = lock;
break;
}
mutex_exit(&s->lock);
return pg;
}
void
uvmpdpol_balancequeue(int swap_shortage)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
int inactive_shortage;
struct vm_page *p, marker;
krwlock_t *lock;
/*
* we have done the scan to get free pages. now we work on meeting
* our inactive target.
*/
memset(&marker, 0, sizeof(marker));
marker.flags = PG_MARKER;
mutex_enter(&s->lock);
TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
for (;;) {
inactive_shortage =
pdpol_state.s_inactarg - pdpol_state.s_inactive;
if (inactive_shortage <= 0 && swap_shortage <= 0) {
break;
}
p = TAILQ_NEXT(&marker, pdqueue);
if (p == NULL) {
break;
}
KASSERT((p->flags & PG_MARKER) == 0);
/*
* acquire interlock to stabilize page identity.
* if we have caught the page in a state of flux
* deal with it and retry.
*/
mutex_enter(&p->interlock);
if (uvmpdpol_pagerealize_locked(p)) {
mutex_exit(&p->interlock);
continue;
}
/*
* now prepare to move on to the next page.
*/
TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
pdqueue);
/*
* try to lock the object that owns the page. see comments
* in uvmpdpol_selectvictim().
*/
mutex_exit(&s->lock);
lock = uvmpd_trylockowner(p);
/* p->interlock now released */
mutex_enter(&s->lock);
if (lock == NULL) {
/* didn't get it - try the next page. */
continue;
}
/*
* if there's a shortage of swap slots, try to free it.
*/
if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
(p->flags & PG_BUSY) == 0) {
if (uvmpd_dropswap(p)) {
swap_shortage--;
}
}
/*
* if there's a shortage of inactive pages, deactivate.
*/
if (inactive_shortage > 0) {
pmap_clear_reference(p);
mutex_enter(&p->interlock);
uvmpdpol_pagedeactivate_locked(p);
mutex_exit(&p->interlock);
uvmexp.pddeact++;
inactive_shortage--;
}
rw_exit(lock);
}
TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
mutex_exit(&s->lock);
}
static void
uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
(PQ_INTENT_D | PQ_INTENT_SET));
if (pg->pqflags & PQ_ACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
KASSERT(pdpol_state.s_active > 0);
pdpol_state.s_active--;
}
if ((pg->pqflags & PQ_INACTIVE) == 0) {
KASSERT(pg->wire_count == 0);
TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
pdpol_state.s_inactive++;
}
pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET);
pg->pqflags |= PQ_INACTIVE;
}
void
uvmpdpol_pagedeactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
/*
* we have to clear the reference bit now, as when it comes time to
* realize the intent we won't have the object locked any more.
*/
pmap_clear_reference(pg);
uvmpdpol_set_intent(pg, PQ_INTENT_I);
}
static void
uvmpdpol_pageactivate_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
(PQ_INTENT_D | PQ_INTENT_SET));
uvmpdpol_pagedequeue_locked(pg);
TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
pdpol_state.s_active++;
pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET);
pg->pqflags |= PQ_ACTIVE;
}
void
uvmpdpol_pageactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_A);
}
static void
uvmpdpol_pagedequeue_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
if (pg->pqflags & PQ_ACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
KASSERT(pdpol_state.s_active > 0);
pdpol_state.s_active--;
} else if (pg->pqflags & PQ_INACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
KASSERT(pdpol_state.s_inactive > 0);
pdpol_state.s_inactive--;
}
pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET);
}
void
uvmpdpol_pagedequeue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_D);
}
void
uvmpdpol_pageenqueue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_E);
}
void
uvmpdpol_anfree(struct vm_anon *an)
{
}
bool
uvmpdpol_pageisqueued_p(struct vm_page *pg)
{
uint32_t pqflags;
/*
* if there's an intent set, we have to consider it. otherwise,
* return the actual state. we may be called unlocked for the
* purpose of assertions, which is safe due to the page lifecycle.
*/
pqflags = atomic_load_relaxed(&pg->pqflags);
if ((pqflags & PQ_INTENT_SET) != 0) {
return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
} else {
return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
}
}
bool
uvmpdpol_pageactivate_p(struct vm_page *pg)
{
uint32_t pqflags;
/* consider intent in preference to actual state. */
pqflags = atomic_load_relaxed(&pg->pqflags);
if ((pqflags & PQ_INTENT_SET) != 0) {
pqflags &= PQ_INTENT_MASK;
return pqflags != PQ_INTENT_A && pqflags != PQ_INTENT_E;
} else {
/*
* TODO: Enabling this may be too much of a big hammer,
* since we do get useful information from activations.
* Think about it more and maybe come up with a heuristic
* or something.
*
* return (pqflags & PQ_ACTIVE) == 0;
*/
return true;
}
}
void
uvmpdpol_estimatepageable(int *active, int *inactive)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
/*
* Don't take any locks here. This can be called from DDB, and in
* any case the numbers are stale the instant the lock is dropped,
* so it just doesn't matter.
*/
if (active) {
*active = s->s_active;
}
if (inactive) {
*inactive = s->s_inactive;
}
}
#if !defined(PDSIM)
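/*
 * Sysctl check for the three per-type minimums: reject a new value if
 * anonmin + filemin + execmin (with the candidate substituted for the
 * parameter being changed) would exceed 95% of memory.
 */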
static int
min_check(struct uvm_pctparam *pct, int t)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
int total = t;
if (pct != &s->s_anonmin) {
total += uvm_pctparam_get(&s->s_anonmin);
}
if (pct != &s->s_filemin) {
total += uvm_pctparam_get(&s->s_filemin);
}
if (pct != &s->s_execmin) {
total += uvm_pctparam_get(&s->s_execmin);
}
if (total > 95) {
return EINVAL;
}
return 0;
}
#endif /* !defined(PDSIM) */
void
uvmpdpol_init(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&s->s_activeq);
TAILQ_INIT(&s->s_inactiveq);
uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
uvm_pctparam_init(&s->s_anonmin, 10, min_check);
uvm_pctparam_init(&s->s_filemin, 10, min_check);
uvm_pctparam_init(&s->s_execmin, 5, min_check);
uvm_pctparam_init(&s->s_anonmax, 80, NULL);
uvm_pctparam_init(&s->s_filemax, 50, NULL);
uvm_pctparam_init(&s->s_execmax, 30, NULL);
}
void
uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
{
ucpu->pdq =
kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
ucpu->pdqhead = CLOCK_PDQ_SIZE;
ucpu->pdqtail = CLOCK_PDQ_SIZE;
}
void
uvmpdpol_reinit(void)
{
}
bool
uvmpdpol_needsscan_p(void)
{
/*
* this must be an unlocked check: can be called from interrupt.
*/
return pdpol_state.s_inactive < pdpol_state.s_inactarg;
}
void
uvmpdpol_tune(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
mutex_enter(&s->lock);
clock_tune();
mutex_exit(&s->lock);
}
/*
* uvmpdpol_pagerealize_locked: take the intended state set on a page and
* make it real. return true if any work was done.
*/
static bool
uvmpdpol_pagerealize_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
case PQ_INTENT_A | PQ_INTENT_SET:
case PQ_INTENT_E | PQ_INTENT_SET:
uvmpdpol_pageactivate_locked(pg);
return true;
case PQ_INTENT_I | PQ_INTENT_SET:
uvmpdpol_pagedeactivate_locked(pg);
return true;
case PQ_INTENT_D | PQ_INTENT_SET:
uvmpdpol_pagedequeue_locked(pg);
return true;
default:
return false;
}
}
/*
* uvmpdpol_flush: return the current uvm_cpu with all of its pending
* updates flushed to the global queues. this routine may block, and
* so can switch cpu. the idea is to empty the queue on whatever cpu
* we finally end up on.
*/
static struct uvm_cpu *
uvmpdpol_flush(void)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
struct uvm_cpu *ucpu;
struct vm_page *pg;
KASSERT(kpreempt_disabled());
mutex_enter(&s->lock);
for (;;) {
/*
* prefer scanning forwards (even though mutex_enter() is
* serializing) so as to not defeat any prefetch logic in
* the CPU. that means elsewhere enqueuing backwards, like
* a stack, but not so important there as pages are being
* added singly.
*
* prefetch the next "struct vm_page" while working on the
* current one. this has a measurable and very positive
* effect in reducing the amount of time spent here under
* the global lock.
*/
ucpu = curcpu()->ci_data.cpu_uvm;
KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
break;
}
pg = ucpu->pdq[ucpu->pdqhead++];
if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
}
mutex_enter(&pg->interlock);
pg->pqflags &= ~PQ_INTENT_QUEUED;
(void)uvmpdpol_pagerealize_locked(pg);
mutex_exit(&pg->interlock);
}
mutex_exit(&s->lock);
return ucpu;
}
/*
* uvmpdpol_pagerealize: realize any intent set on the page. in this
* implementation, that means putting the page on a per-CPU queue to be
* dealt with later.
*/
void
uvmpdpol_pagerealize(struct vm_page *pg)
{
struct uvm_cpu *ucpu;
/*
* drain the per-CPU queue if full, then enqueue the page.
*/
kpreempt_disable();
ucpu = curcpu()->ci_data.cpu_uvm;
if (__predict_false(ucpu->pdqhead == 0)) {
ucpu = uvmpdpol_flush();
}
ucpu->pdq[--(ucpu->pdqhead)] = pg;
kpreempt_enable();
}
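/*
 * The per-CPU queue is a small stack that grows downwards: pdqhead ==
 * pdqtail (== CLOCK_PDQ_SIZE) means empty, pdqhead == 0 means full.
 * For example, after three calls to uvmpdpol_pagerealize() on an
 * otherwise idle CPU:
 *
 *	pdqhead == CLOCK_PDQ_SIZE - 3, pdqtail == CLOCK_PDQ_SIZE
 *	pdq[pdqhead] .. pdq[pdqtail - 1] hold the three pending pages
 *
 * uvmpdpol_flush() and uvmpdpol_idle() then walk pdqhead forward
 * towards pdqtail, realizing each page's intent under the global lock.
 */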
/*
* uvmpdpol_idle: called from the system idle loop. periodically purge any
* pending updates back to the global queues.
*/
void
uvmpdpol_idle(struct uvm_cpu *ucpu)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct vm_page *pg;
KASSERT(kpreempt_disabled());
/*
* if no pages in the queue, we have nothing to do.
*/
if (ucpu->pdqhead == ucpu->pdqtail) {
ucpu->pdqtime = getticks();
return;
}
/*
* don't do this more than ~8 times a second as it would needlessly
* exert pressure.
*/
if (getticks() - ucpu->pdqtime < (hz >> 3)) {
return;
}
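/*
 * For example, at hz = 100 the threshold is 100 >> 3 = 12 ticks,
 * i.e. roughly 120ms between purges, or about 8 per second.
 */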
/*
* the idle LWP can't block, so we have to try for the lock. if we
* get it, purge the per-CPU pending update queue. continually
* check for a pending resched: in that case exit immediately.
*/
if (mutex_tryenter(&s->lock)) {
while (ucpu->pdqhead != ucpu->pdqtail) {
pg = ucpu->pdq[ucpu->pdqhead];
if (!mutex_tryenter(&pg->interlock)) {
break;
}
ucpu->pdqhead++;
pg->pqflags &= ~PQ_INTENT_QUEUED;
(void)uvmpdpol_pagerealize_locked(pg);
mutex_exit(&pg->interlock);
if (curcpu()->ci_want_resched) {
break;
}
}
if (ucpu->pdqhead == ucpu->pdqtail) {
ucpu->pdqtime = getticks();
}
mutex_exit(&s->lock);
}
}
#if !defined(PDSIM)
#include <sys/sysctl.h> /* XXX SYSCTL_DESCR */
void
uvmpdpol_sysctlsetup(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for anonymous application data"));
uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for cached file data"));
uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for cached executable data"));
uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for "
"anonymous application data"));
uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for cached "
"file data"));
uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for cached "
"executable data"));
uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
SYSCTL_DESCR("Percentage of inactive queue of "
"the entire (active + inactive) queue"));
}
#endif /* !defined(PDSIM) */
#if defined(PDSIM)
void
pdsim_dump(const char *id)
{
#if defined(DEBUG)
/* XXX */
#endif /* defined(DEBUG) */
}
#endif /* defined(PDSIM) */
/* $NetBSD: coda_vfsops.c,v 1.90 2022/03/28 12:37:46 riastradh Exp $ */
/*
*
* Coda: an Experimental Distributed File System
* Release 3.1
*
* Copyright (c) 1987-1998 Carnegie Mellon University
* All Rights Reserved
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation, and
* that credit is given to Carnegie Mellon University in all documents
* and publicity pertaining to direct or indirect use of this code or its
* derivatives.
*
* CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS,
* SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS
* FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON
* DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
* RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF
* ANY DERIVATIVE WORK.
*
* Carnegie Mellon encourages users of this software to return any
* improvements or extensions that they make, and to grant Carnegie
* Mellon the rights to redistribute these changes without encumbrance.
*
* @(#) cfs/coda_vfsops.c,v 1.1.1.1 1998/08/29 21:26:45 rvb Exp $
*/
/*
* Mach Operating System
* Copyright (c) 1989 Carnegie-Mellon University
* All rights reserved. The CMU software License Agreement specifies
* the terms and conditions for use and redistribution.
*/
/*
* This code was written for the Coda file system at Carnegie Mellon
* University. Contributors include David Steere, James Kistler, and
* M. Satyanarayanan.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: coda_vfsops.c,v 1.90 2022/03/28 12:37:46 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <coda/coda.h>
#include <coda/cnode.h>
#include <coda/coda_vfsops.h>
#include <coda/coda_venus.h>
#include <coda/coda_subr.h>
#include <coda/coda_opstats.h>
/* for VN_RDEV */
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
MODULE(MODULE_CLASS_VFS, coda, "vcoda");
#define ENTRY if(coda_vfsop_print_entry) myprintf(("Entered %s\n",__func__))
extern struct vnode *coda_ctlvp;
extern struct coda_mntinfo coda_mnttbl[NVCODA]; /* indexed by minor device number */
/* structure to keep statistics of internally generated/satisfied calls */
struct coda_op_stats coda_vfsopstats[CODA_VFSOPS_SIZE];
#define MARK_ENTRY(op) (coda_vfsopstats[op].entries++)
#define MARK_INT_SAT(op) (coda_vfsopstats[op].sat_intrn++)
#define MARK_INT_FAIL(op) (coda_vfsopstats[op].unsat_intrn++)
#define MRAK_INT_GEN(op) (coda_vfsopstats[op].gen_intrn++)
extern const struct cdevsw vcoda_cdevsw;
extern const struct vnodeopv_desc coda_vnodeop_opv_desc;
const struct vnodeopv_desc * const coda_vnodeopv_descs[] = {
&coda_vnodeop_opv_desc,
NULL,
};
struct vfsops coda_vfsops = {
.vfs_name = MOUNT_CODA,
.vfs_min_mount_data = 256,
/* This is the pathname, unlike every other fs */
.vfs_mount = coda_mount,
.vfs_start = coda_start,
.vfs_unmount = coda_unmount,
.vfs_root = coda_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = coda_nb_statvfs,
.vfs_sync = coda_sync,
.vfs_vget = coda_vget,
.vfs_loadvnode = coda_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = coda_init,
.vfs_done = coda_done,
.vfs_mountroot = (void *)eopnotsupp,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = coda_vnodeopv_descs
};
static int
coda_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return vfs_attach(&coda_vfsops);
case MODULE_CMD_FINI:
return vfs_detach(&coda_vfsops);
default:
return ENOTTY;
}
}
int
coda_vfsopstats_init(void)
{
int i;
for (i=0;i<CODA_VFSOPS_SIZE;i++) {
coda_vfsopstats[i].opcode = i;
coda_vfsopstats[i].entries = 0;
coda_vfsopstats[i].sat_intrn = 0;
coda_vfsopstats[i].unsat_intrn = 0;
coda_vfsopstats[i].gen_intrn = 0;
}
return 0;
}
/*
* cfs mount vfsop
* Set up mount info record and attach it to vfs struct.
*/
/*ARGSUSED*/
int
coda_mount(struct mount *vfsp, /* Allocated and initialized by mount(2) */
const char *path, /* path covered: ignored by the fs-layer */
void *data, /* Need to define a data type for this in netbsd? */
size_t *data_len)
{
struct lwp *l = curlwp;
struct vnode *dvp;
struct cnode *cp;
dev_t dev;
struct coda_mntinfo *mi;
struct vnode *rtvp;
const struct cdevsw *cdev;
CodaFid rootfid = INVAL_FID;
CodaFid ctlfid = CTL_FID;
int error;
if (data == NULL)
return EINVAL;
if (vfsp->mnt_flag & MNT_GETARGS)
return EINVAL;
ENTRY;
coda_vfsopstats_init();
coda_vnodeopstats_init();
MARK_ENTRY(CODA_MOUNT_STATS);
if (CODA_MOUNTED(vfsp)) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(EBUSY);
}
/* Validate mount device. Similar to getmdev(). */
/*
* XXX: coda passes the mount device as the entire mount args;
* all other fs pass a structure containing a pointer.
* In order to get sys_mount() to do the copyin() we've set a
* fixed default size for the filename buffer.
*/
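/*
 * In other words, the mount data here is just a NUL-terminated device
 * pathname, e.g. something like "/dev/cfs0" (name illustrative only),
 * rather than a pointer to an args structure as with other file
 * systems.
 */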
/* Ensure that namei() doesn't run off the filename buffer */
if (*data_len < 1 || *data_len > PATH_MAX ||
strnlen(data, *data_len) >= *data_len) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return EINVAL;
}
error = namei_simple_kernel((char *)data, NSM_FOLLOW_NOEMULROOT,
&dvp);
if (error) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return (error);
}
if (dvp->v_type != VCHR) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
vrele(dvp);
return(ENXIO);
}
dev = dvp->v_rdev;
vrele(dvp);
cdev = cdevsw_lookup(dev);
if (cdev == NULL) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
/*
* See if the device table matches our expectations.
*/
if (cdev != &vcoda_cdevsw)
{
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
if (minor(dev) >= NVCODA) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
/*
* Initialize the mount record and link it to the vfs struct
*/
mi = &coda_mnttbl[minor(dev)];
if (!VC_OPEN(&mi->mi_vcomm)) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENODEV);
}
/* No initialization (here) of mi_vcomm! */
vfsp->mnt_data = mi;
vfsp->mnt_stat.f_fsidx.__fsid_val[0] = 0;
vfsp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CODA);
vfsp->mnt_stat.f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0];
vfsp->mnt_stat.f_namemax = CODA_MAXNAMLEN;
mi->mi_vfsp = vfsp;
/*
* Make a root vnode to placate the Vnode interface, but don't
* actually make the CODA_ROOT call to venus until the first call
* to coda_root in case a server is down while venus is starting.
*/
cp = make_coda_node(&rootfid, vfsp, VDIR);
rtvp = CTOV(cp);
rtvp->v_vflag |= VV_ROOT;
cp = make_coda_node(&ctlfid, vfsp, VCHR);
coda_ctlvp = CTOV(cp);
/* Add vfs and rootvp to chain of vfs hanging off mntinfo */
mi->mi_vfsp = vfsp;
mi->mi_rootvp = rtvp;
/* set filesystem block size */
vfsp->mnt_stat.f_bsize = 8192; /* XXX -JJK */
vfsp->mnt_stat.f_frsize = 8192; /* XXX -JJK */
/* error is currently guaranteed to be zero, but in case some
code changes... */
CODADEBUG(1,
myprintf(("coda_mount returned %d\n",error)););
if (error)
MARK_INT_FAIL(CODA_MOUNT_STATS);
else
MARK_INT_SAT(CODA_MOUNT_STATS);
return set_statvfs_info("/coda", UIO_SYSSPACE, "CODA", UIO_SYSSPACE,
vfsp->mnt_op->vfs_name, vfsp, l);
}
int
coda_start(struct mount *vfsp, int flags)
{
ENTRY;
vftomi(vfsp)->mi_started = 1;
return (0);
}
int
coda_unmount(struct mount *vfsp, int mntflags)
{
struct coda_mntinfo *mi = vftomi(vfsp);
int active, error = 0;
ENTRY;
MARK_ENTRY(CODA_UMOUNT_STATS);
if (!CODA_MOUNTED(vfsp)) {
MARK_INT_FAIL(CODA_UMOUNT_STATS);
return(EINVAL);
}
if (mi->mi_vfsp == vfsp) { /* We found the victim */
if (!IS_UNMOUNTING(VTOC(mi->mi_rootvp)))
return (EBUSY); /* Venus is still running */
#ifdef DEBUG
printf("coda_unmount: ROOT: vp %p, cp %p\n", mi->mi_rootvp, VTOC(mi->mi_rootvp));
#endif
mi->mi_started = 0;
vrele(mi->mi_rootvp);
vrele(coda_ctlvp);
active = coda_kill(vfsp, NOT_DOWNCALL);
mi->mi_rootvp->v_vflag &= ~VV_ROOT;
error = vflush(mi->mi_vfsp, NULLVP, FORCECLOSE);
printf("coda_unmount: active = %d, vflush active %d\n", active, error);
error = 0;
/* I'm going to take this out to allow lookups to go through. I'm
* not sure it's important anyway. -- DCS 2/2/94
*/
/* vfsp->VFS_DATA = NULL; */
/* No more vfsp's to hold onto */
mi->mi_vfsp = NULL;
mi->mi_rootvp = NULL;
if (error)
MARK_INT_FAIL(CODA_UMOUNT_STATS);
else
MARK_INT_SAT(CODA_UMOUNT_STATS);
return(error);
}
return (EINVAL);
}
/*
* find root of cfs
*/
int
coda_root(struct mount *vfsp, int lktype, struct vnode **vpp)
{
struct coda_mntinfo *mi = vftomi(vfsp);
int error;
struct lwp *l = curlwp; /* XXX - bnoble */
CodaFid VFid;
static const CodaFid invalfid = INVAL_FID;
ENTRY;
MARK_ENTRY(CODA_ROOT_STATS);
if (vfsp == mi->mi_vfsp) {
if (memcmp(&VTOC(mi->mi_rootvp)->c_fid, &invalfid, sizeof(CodaFid)))
{ /* Found valid root. */
*vpp = mi->mi_rootvp;
/* On Mach, this is vref. On NetBSD, VOP_LOCK */
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_SAT(CODA_ROOT_STATS);
return(0);
}
}
error = venus_root(vftomi(vfsp), l->l_cred, l->l_proc, &VFid);
if (!error) {
struct cnode *cp = VTOC(mi->mi_rootvp);
/*
* Save the new rootfid in the cnode, and rekey the cnode
* with the new fid key.
*/
error = vcache_rekey_enter(vfsp, mi->mi_rootvp,
&invalfid, sizeof(CodaFid), &VFid, sizeof(CodaFid));
if (error)
goto exit;
cp->c_fid = VFid;
vcache_rekey_exit(vfsp, mi->mi_rootvp,
&invalfid, sizeof(CodaFid), &cp->c_fid, sizeof(CodaFid));
*vpp = mi->mi_rootvp;
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_SAT(CODA_ROOT_STATS);
goto exit;
} else if (error == ENODEV || error == EINTR) {
/* Gross hack here! */
/*
* If Venus fails to respond to the CODA_ROOT call, coda_call returns
* ENODEV. Return the uninitialized root vnode to allow vfs
* operations such as unmount to continue. Without this hack,
* there is no way to do an unmount if Venus dies before a
* successful CODA_ROOT call is done. All vnode operations
* will fail.
*/
*vpp = mi->mi_rootvp;
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_FAIL(CODA_ROOT_STATS);
error = 0;
goto exit;
} else {
CODADEBUG( CODA_ROOT, myprintf(("error %d in CODA_ROOT\n", error)); );
MARK_INT_FAIL(CODA_ROOT_STATS);
goto exit;
}
exit:
return(error);
}
/*
* Get file system statistics.
*/
int
coda_nb_statvfs(struct mount *vfsp, struct statvfs *sbp)
{
struct lwp *l = curlwp;
struct coda_statfs fsstat;
int error;
ENTRY;
MARK_ENTRY(CODA_STATFS_STATS);
if (!CODA_MOUNTED(vfsp)) {
/* MARK_INT_FAIL(CODA_STATFS_STATS); */
return(EINVAL);
}
/* XXX - what to do about f_flags, others? --bnoble */
/* Below is what AFS does:
#define NB_SFS_SIZ 0x895440
*/
/* Note: Normal fs's have a bsize of 0x400 == 1024 */
error = venus_statfs(vftomi(vfsp), l->l_cred, l, &fsstat);
if (!error) {
sbp->f_bsize = 8192; /* XXX */
sbp->f_frsize = 8192; /* XXX */
sbp->f_iosize = 8192; /* XXX */
sbp->f_blocks = fsstat.f_blocks;
sbp->f_bfree = fsstat.f_bfree;
sbp->f_bavail = fsstat.f_bavail;
sbp->f_bresvd = 0;
sbp->f_files = fsstat.f_files;
sbp->f_ffree = fsstat.f_ffree;
sbp->f_favail = fsstat.f_ffree;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, vfsp);
}
MARK_INT_SAT(CODA_STATFS_STATS);
return(error);
}
/*
* Flush any pending I/O.
*/
int
coda_sync(struct mount *vfsp, int waitfor,
kauth_cred_t cred)
{
ENTRY;
MARK_ENTRY(CODA_SYNC_STATS);
MARK_INT_SAT(CODA_SYNC_STATS);
return(0);
}
int
coda_vget(struct mount *vfsp, ino_t ino, int lktype,
struct vnode **vpp)
{
ENTRY;
return (EOPNOTSUPP);
}
int
coda_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
CodaFid fid;
struct cnode *cp;
extern int (**coda_vnodeop_p)(void *);
KASSERT(key_len == sizeof(CodaFid));
memcpy(&fid, key, key_len);
cp = kmem_zalloc(sizeof(*cp), KM_SLEEP);
mutex_init(&cp->c_lock, MUTEX_DEFAULT, IPL_NONE);
cp->c_fid = fid;
cp->c_vnode = vp;
vp->v_op = coda_vnodeop_p;
vp->v_tag = VT_CODA;
vp->v_type = VNON;
vp->v_data = cp;
*new_key = &cp->c_fid;
return 0;
}
/*
* fhtovp is now what vget used to be in 4.3-derived systems. For
* some silly reason, vget is now keyed by a 32 bit ino_t, rather than
* a type-specific fid.
*/
int
coda_fhtovp(struct mount *vfsp, struct fid *fhp, struct mbuf *nam,
struct vnode **vpp, int *exflagsp,
kauth_cred_t *creadanonp, int lktype)
{
struct cfid *cfid = (struct cfid *)fhp;
struct cnode *cp = 0;
int error;
struct lwp *l = curlwp; /* XXX -mach */
CodaFid VFid;
int vtype;
ENTRY;
MARK_ENTRY(CODA_VGET_STATS);
/* Check for vget of control object. */
if (IS_CTL_FID(&cfid->cfid_fid)) {
*vpp = coda_ctlvp;
vref(coda_ctlvp);
MARK_INT_SAT(CODA_VGET_STATS);
return(0);
}
error = venus_fhtovp(vftomi(vfsp), &cfid->cfid_fid, l->l_cred, l->l_proc, &VFid, &vtype);
if (error) {
CODADEBUG(CODA_VGET, myprintf(("vget error %d\n",error));)
*vpp = (struct vnode *)0;
} else {
CODADEBUG(CODA_VGET,
myprintf(("vget: %s type %d result %d\n",
coda_f2s(&VFid), vtype, error)); )
cp = make_coda_node(&VFid, vfsp, vtype);
*vpp = CTOV(cp);
}
return(error);
}
int
coda_vptofh(struct vnode *vnp, struct fid *fidp)
{
ENTRY;
return (EOPNOTSUPP);
}
void
coda_init(void)
{
ENTRY;
}
void
coda_done(void)
{
ENTRY;
}
SYSCTL_SETUP(sysctl_vfs_coda_setup, "sysctl vfs.coda subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "coda",
SYSCTL_DESCR("code vfs options"),
NULL, 0, NULL, 0,
CTL_VFS, 18, CTL_EOL);
/*
* XXX the "18" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "18" is the order as taken from sys/mount.h
*/
/*
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "clusterread",
SYSCTL_DESCR( anyone? ),
NULL, 0, &doclusterread, 0,
CTL_VFS, 18, FFS_CLUSTERREAD, CTL_EOL);
*/
}
/*
* To allow for greater ease of use, some vnodes may be orphaned when
* Venus dies. Certain operations should still be allowed to go
* through, but without propagating orphan-ness. So this function will
* get a new vnode for the file from the current run of Venus.
*/
int
getNewVnode(struct vnode **vpp)
{
struct cfid cfid;
struct coda_mntinfo *mi = vftomi((*vpp)->v_mount);
ENTRY;
cfid.cfid_len = (short)sizeof(CodaFid);
cfid.cfid_fid = VTOC(*vpp)->c_fid; /* Structure assignment. */
/* XXX ? */
/* We're guessing that if set, the 1st element on the list is a
* valid vnode to use. If not, return ENODEV as venus is dead.
*/
if (mi->mi_vfsp == NULL)
return ENODEV;
return coda_fhtovp(mi->mi_vfsp, (struct fid*)&cfid, NULL, vpp,
NULL, NULL, LK_EXCLUSIVE);
}
/* Get the mount structure corresponding to a given device.
* Return NULL if no device is found or the device is not mounted.
*/
struct mount *devtomp(dev_t dev)
{
struct mount *mp;
struct vnode *vp;
if (spec_node_lookup_by_dev(VBLK, dev, VDEAD_NOWAIT, &vp) == 0) {
mp = spec_node_getmountedfs(vp);
vrele(vp);
} else {
mp = NULL;
}
return mp;
}
/* $NetBSD: kern_tc.c,v 1.76 2023/07/30 12:39:18 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ---------------------------------------------------------------------------
*/
/*
* https://papers.freebsd.org/2002/phk-timecounters.files/timecounter.pdf
*/
#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.76 2023/07/30 12:39:18 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/evcnt.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/reboot.h> /* XXX just to get AB_VERBOSE */
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/xcall.h>
/*
* A large step happens on boot. This constant detects such steps.
* It is relatively small so that ntp_update_second gets called enough
* in the typical 'missed a couple of seconds' case, but doesn't loop
* forever when the time step is large.
*/
#define LARGE_STEP 200
/*
* Implement a dummy timecounter which we can use until we get a real one
* in the air. This allows the console and other early stuff to use
* time services.
*/
static u_int
dummy_get_timecount(struct timecounter *tc)
{
static u_int now;
return ++now;
}
static struct timecounter dummy_timecounter = {
.tc_get_timecount = dummy_get_timecount,
.tc_counter_mask = ~0u,
.tc_frequency = 1000000,
.tc_name = "dummy",
.tc_quality = -1000000,
.tc_priv = NULL,
};
struct timehands {
/* These fields must be initialized by the driver. */
struct timecounter *th_counter; /* active timecounter */
int64_t th_adjustment; /* frequency adjustment */
/* (NTP/adjtime) */
uint64_t th_scale; /* scale factor (counter */
/* tick->time) */
uint64_t th_offset_count; /* offset at last time */
/* update (tc_windup()) */
struct bintime th_offset; /* bin (up)time at windup */
struct timeval th_microtime; /* cached microtime */
struct timespec th_nanotime; /* cached nanotime */
/* Fields not to be copied in tc_windup start with th_generation. */
volatile u_int th_generation; /* current generation */
struct timehands *th_next; /* next timehand */
};
static struct timehands th0;
static struct timehands th9 = { .th_next = &th0, };
static struct timehands th8 = { .th_next = &th9, };
static struct timehands th7 = { .th_next = &th8, };
static struct timehands th6 = { .th_next = &th7, };
static struct timehands th5 = { .th_next = &th6, };
static struct timehands th4 = { .th_next = &th5, };
static struct timehands th3 = { .th_next = &th4, };
static struct timehands th2 = { .th_next = &th3, };
static struct timehands th1 = { .th_next = &th2, };
static struct timehands th0 = {
.th_counter = &dummy_timecounter,
.th_scale = (uint64_t)-1 / 1000000,
.th_offset = { .sec = 1, .frac = 0 },
.th_generation = 1,
.th_next = &th1,
};
static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;
/* used by savecore(8) */
time_t time_second_legacy asm("time_second");
#ifdef __HAVE_ATOMIC64_LOADSTORE
volatile time_t time__second __cacheline_aligned = 1;
volatile time_t time__uptime __cacheline_aligned = 1;
#else
static volatile struct {
uint32_t lo, hi;
} time__uptime32 __cacheline_aligned = {
.lo = 1,
}, time__second32 __cacheline_aligned = {
.lo = 1,
};
#endif
static struct {
struct bintime bin;
volatile unsigned gen; /* even when stable, odd when changing */
} timebase __cacheline_aligned;
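/*
 * Writers (tc_setclock() and tc_windup() below) update it as:
 *
 *	timebase.gen |= 1;		change in progress, gen now odd
 *	membar_producer();
 *	timebase.bin = <new base>;
 *	membar_producer();
 *	timebase.gen++;			commit, gen even again
 *
 * while readers (getbinboottime()) spin while gen is odd and retry if
 * gen changed across the read.
 */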
static int timestepwarnings;
kmutex_t timecounter_lock;
static u_int timecounter_mods;
static volatile int timecounter_removals = 1;
static u_int timecounter_bad;
#ifdef __HAVE_ATOMIC64_LOADSTORE
static inline void
setrealuptime(time_t second, time_t uptime)
{
time_second_legacy = second;
atomic_store_relaxed(&time__second, second);
atomic_store_relaxed(&time__uptime, uptime);
}
#else
static inline void
setrealuptime(time_t second, time_t uptime)
{
uint32_t seclo = second & 0xffffffff, sechi = second >> 32;
uint32_t uplo = uptime & 0xffffffff, uphi = uptime >> 32;
KDASSERT(mutex_owned(&timecounter_lock));
time_second_legacy = second;
/*
* Fast path -- no wraparound, just updating the low bits, so
* no need for seqlocked access.
*/
if (__predict_true(sechi == time__second32.hi) &&
__predict_true(uphi == time__uptime32.hi)) {
atomic_store_relaxed(&time__second32.lo, seclo);
atomic_store_relaxed(&time__uptime32.lo, uplo);
return;
}
atomic_store_relaxed(&time__second32.hi, 0xffffffff);
atomic_store_relaxed(&time__uptime32.hi, 0xffffffff);
membar_producer();
atomic_store_relaxed(&time__second32.lo, seclo);
atomic_store_relaxed(&time__uptime32.lo, uplo);
membar_producer();
atomic_store_relaxed(&time__second32.hi, sechi);
atomic_store_relaxed(&time__uptime32.hi, uphi);
}
time_t
getrealtime(void)
{
uint32_t lo, hi;
do {
for (;;) {
hi = atomic_load_relaxed(&time__second32.hi);
if (__predict_true(hi != 0xffffffff))
break;
SPINLOCK_BACKOFF_HOOK;
}
membar_consumer();
lo = atomic_load_relaxed(&time__second32.lo);
membar_consumer();
} while (hi != atomic_load_relaxed(&time__second32.hi));
return ((time_t)hi << 32) | lo;
}
time_t
getuptime(void)
{
uint32_t lo, hi;
do {
for (;;) {
hi = atomic_load_relaxed(&time__uptime32.hi);
if (__predict_true(hi != 0xffffffff))
break;
SPINLOCK_BACKOFF_HOOK;
}
membar_consumer();
lo = atomic_load_relaxed(&time__uptime32.lo);
membar_consumer();
} while (hi != atomic_load_relaxed(&time__uptime32.hi));
return ((time_t)hi << 32) | lo;
}
time_t
getboottime(void)
{
return getrealtime() - getuptime();
}
uint32_t
getuptime32(void)
{
return atomic_load_relaxed(&time__uptime32.lo);
}
#endif /* !defined(__HAVE_ATOMIC64_LOADSTORE) */
/*
* sysctl helper routine for kern.timecounter.hardware
*/
static int
sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int error;
char newname[MAX_TCNAMELEN];
struct timecounter *newtc, *tc;
tc = timecounter;
strlcpy(newname, tc->tc_name, sizeof(newname));
node = *rnode;
node.sysctl_data = newname;
node.sysctl_size = sizeof(newname);
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error ||
newp == NULL ||
strncmp(newname, tc->tc_name, sizeof(newname)) == 0)
return error;
if (l != NULL && (error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname,
NULL, NULL)) != 0)
return error;
if (!cold)
mutex_spin_enter(&timecounter_lock);
error = EINVAL;
for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
if (strcmp(newname, newtc->tc_name) != 0)
continue;
/* Warm up new timecounter. */
(void)newtc->tc_get_timecount(newtc);
(void)newtc->tc_get_timecount(newtc);
timecounter = newtc;
error = 0;
break;
}
if (!cold)
mutex_spin_exit(&timecounter_lock);
return error;
}
static int
sysctl_kern_timecounter_choice(SYSCTLFN_ARGS)
{
char buf[MAX_TCNAMELEN+48];
char *where;
const char *spc;
struct timecounter *tc;
size_t needed, left, slen;
int error, mods;
if (newp != NULL)
return EPERM;
if (namelen != 0)
return EINVAL;
mutex_spin_enter(&timecounter_lock);
retry:
spc = "";
error = 0;
needed = 0;
left = *oldlenp;
where = oldp;
for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
if (where == NULL) {
needed += sizeof(buf); /* be conservative */
} else {
slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64
" Hz)", spc, tc->tc_name, tc->tc_quality,
tc->tc_frequency);
if (left < slen + 1)
break;
mods = timecounter_mods;
mutex_spin_exit(&timecounter_lock);
error = copyout(buf, where, slen + 1);
mutex_spin_enter(&timecounter_lock);
if (mods != timecounter_mods) {
goto retry;
}
spc = " ";
where += slen;
needed += slen;
left -= slen;
}
}
mutex_spin_exit(&timecounter_lock);
*oldlenp = needed;
return error;
}
SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup")
{
const struct sysctlnode *node;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "timecounter",
SYSCTL_DESCR("time counter information"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node != NULL) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "choice",
SYSCTL_DESCR("available counters"),
sysctl_kern_timecounter_choice, 0, NULL, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRING, "hardware",
SYSCTL_DESCR("currently active time counter"),
sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "timestepwarnings",
SYSCTL_DESCR("log time steps"),
NULL, 0, &timestepwarnings, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
}
}
#ifdef TC_COUNTERS
#define TC_STATS(name) \
static struct evcnt n##name = \
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name); \
EVCNT_ATTACH_STATIC(n##name)
TC_STATS(binuptime); TC_STATS(nanouptime); TC_STATS(microuptime);
TC_STATS(bintime); TC_STATS(nanotime); TC_STATS(microtime);
TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime);
TC_STATS(getbintime); TC_STATS(getnanotime); TC_STATS(getmicrotime);
TC_STATS(setclock);
#define TC_COUNT(var) var.ev_count++
#undef TC_STATS
#else
#define TC_COUNT(var) /* nothing */
#endif /* TC_COUNTERS */
static void tc_windup(void);
/*
* Return the difference between the timehands' counter value now and what
* it was when we copied it to the timehands' offset_count.
*/
static inline u_int
tc_delta(struct timehands *th)
{
struct timecounter *tc;
tc = th->th_counter;
return (tc->tc_get_timecount(tc) -
th->th_offset_count) & tc->tc_counter_mask;
}
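/*
 * For example, with a 32 bit counter (tc_counter_mask == 0xffffffff)
 * whose offset count was 0xfffffff0 and which now reads 0x00000010,
 * the masked subtraction yields 0x20, i.e. 32 ticks: a single wrap of
 * the hardware counter is handled transparently.
 */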
/*
* Functions for reading the time. We have to loop until we are sure that
* the timehands that we operated on was not updated under our feet. See
* the comment in <sys/timevar.h> for a description of these 12 functions.
*/
void
binuptime(struct bintime *bt)
{
struct timehands *th;
lwp_t *l;
u_int lgen, gen;
TC_COUNT(nbinuptime);
/*
* Provide exclusion against tc_detach().
*
* We record the number of timecounter removals before accessing
* timecounter state. Note that the LWP can be using multiple
* "generations" at once, due to interrupts (interrupted while in
* this function). Hardware interrupts will borrow the interrupted
* LWP's l_tcgen value for this purpose, and can themselves be
* interrupted by higher priority interrupts. In this case we need
* to ensure that the oldest generation in use is recorded.
*
* splsched() is too expensive to use, so we take care to structure
* this code in such a way that it is not required. Likewise, we
* do not disable preemption.
*
* Memory barriers are also too expensive to use for such a
* performance critical function. The good news is that we do not
* need memory barriers for this type of exclusion, as the thread
* updating timecounter_removals will issue a broadcast cross call
* before inspecting our l_tcgen value (this elides memory ordering
* issues).
*
* XXX If the author of the above comment knows how to make it
* safe to avoid memory barriers around the access to
* th->th_generation, I'm all ears.
*/
l = curlwp;
lgen = l->l_tcgen;
if (__predict_true(lgen == 0)) {
l->l_tcgen = timecounter_removals;
}
__insn_barrier();
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
bintime_addx(bt, th->th_scale * tc_delta(th));
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
__insn_barrier();
l->l_tcgen = lgen;
}
void
nanouptime(struct timespec *tsp)
{
struct bintime bt;
TC_COUNT(nnanouptime);
binuptime(&bt);
bintime2timespec(&bt, tsp);
}
void
microuptime(struct timeval *tvp)
{
struct bintime bt;
TC_COUNT(nmicrouptime);
binuptime(&bt);
bintime2timeval(&bt, tvp);
}
void
bintime(struct bintime *bt)
{
struct bintime boottime;
TC_COUNT(nbintime);
binuptime(bt);
getbinboottime(&boottime);
bintime_add(bt, &boottime);
}
void
nanotime(struct timespec *tsp)
{
struct bintime bt;
TC_COUNT(nnanotime);
bintime(&bt);
bintime2timespec(&bt, tsp);
}
void
microtime(struct timeval *tvp)
{
struct bintime bt;
TC_COUNT(nmicrotime);
bintime(&bt);
bintime2timeval(&bt, tvp);
}
void
getbinuptime(struct bintime *bt)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetbinuptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanouptime(struct timespec *tsp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetnanouptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
bintime2timespec(&th->th_offset, tsp);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getmicrouptime(struct timeval *tvp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetmicrouptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
bintime2timeval(&th->th_offset, tvp);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getbintime(struct bintime *bt)
{
struct timehands *th;
struct bintime boottime;
u_int gen;
TC_COUNT(ngetbintime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
getbinboottime(&boottime);
bintime_add(bt, &boottime);
}
static inline void
dogetnanotime(struct timespec *tsp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetnanotime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*tsp = th->th_nanotime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanotime(struct timespec *tsp)
{
dogetnanotime(tsp);
}
void dtrace_getnanotime(struct timespec *tsp);
void
dtrace_getnanotime(struct timespec *tsp)
{
dogetnanotime(tsp);
}
void
getmicrotime(struct timeval *tvp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetmicrotime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*tvp = th->th_microtime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanoboottime(struct timespec *tsp)
{
struct bintime bt;
getbinboottime(&bt);
bintime2timespec(&bt, tsp);
}
void
getmicroboottime(struct timeval *tvp)
{
struct bintime bt;
getbinboottime(&bt);
bintime2timeval(&bt, tvp);
}
void
getbinboottime(struct bintime *basep)
{
struct bintime base;
unsigned gen;
do {
/* Spin until the timebase isn't changing. */
while ((gen = atomic_load_relaxed(&timebase.gen)) & 1)
SPINLOCK_BACKOFF_HOOK;
/* Read out a snapshot of the timebase. */
membar_consumer();
base = timebase.bin;
membar_consumer();
/* Restart if it changed while we were reading. */
} while (gen != atomic_load_relaxed(&timebase.gen));
*basep = base;
}
/*
* Initialize a new timecounter and possibly use it.
*/
void
tc_init(struct timecounter *tc)
{
u_int u;
KASSERTMSG(tc->tc_next == NULL, "timecounter %s already initialised",
tc->tc_name);
u = tc->tc_frequency / tc->tc_counter_mask;
/* XXX: We need some margin here, 10% is a guess */
u *= 11;
u /= 10;
if (u > hz && tc->tc_quality >= 0) {
tc->tc_quality = -2000;
aprint_verbose(
"timecounter: Timecounter \"%s\" frequency %ju Hz",
tc->tc_name, (uintmax_t)tc->tc_frequency);
aprint_verbose(" -- Insufficient hz, needs at least %u\n", u);
} else if (tc->tc_quality >= 0 || bootverbose) {
aprint_verbose(
"timecounter: Timecounter \"%s\" frequency %ju Hz "
"quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency,
tc->tc_quality);
}
mutex_spin_enter(&timecounter_lock);
tc->tc_next = timecounters;
timecounters = tc;
timecounter_mods++;
/*
* Never automatically use a timecounter with negative quality.
* Even though we run on the dummy counter, switching here may be
* worse since this timecounter may not be monotonic.
*/
if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality ||
(tc->tc_quality == timecounter->tc_quality &&
tc->tc_frequency > timecounter->tc_frequency))) {
(void)tc->tc_get_timecount(tc);
(void)tc->tc_get_timecount(tc);
timecounter = tc;
tc_windup();
}
mutex_spin_exit(&timecounter_lock);
}
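/*
 * A hardware driver would typically register its counter roughly like
 * this (the "foo" names and register layout are illustrative only):
 *
 *	static u_int
 *	foo_get_timecount(struct timecounter *tc)
 *	{
 *		return bus_space_read_4(foo_iot, foo_ioh, FOO_COUNT_REG);
 *	}
 *
 *	static struct timecounter foo_timecounter = {
 *		.tc_get_timecount = foo_get_timecount,
 *		.tc_counter_mask = 0xffffffff,
 *		.tc_frequency = 25000000,	(25 MHz, for example)
 *		.tc_name = "foo",
 *		.tc_quality = 100,
 *	};
 *
 *	tc_init(&foo_timecounter);
 *
 * tc_init() then decides, based on quality and frequency, whether to
 * switch to the new counter immediately.
 */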
/*
* Pick a new timecounter due to the existing counter going bad.
*/
static void
tc_pick(void)
{
struct timecounter *best, *tc;
KASSERT(mutex_owned(&timecounter_lock));
for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) {
if (tc->tc_quality > best->tc_quality)
best = tc;
else if (tc->tc_quality < best->tc_quality)
continue;
else if (tc->tc_frequency > best->tc_frequency)
best = tc;
}
(void)best->tc_get_timecount(best);
(void)best->tc_get_timecount(best);
timecounter = best;
}
/*
* A timecounter has gone bad, arrange to pick a new one at the next
* clock tick.
*/
void
tc_gonebad(struct timecounter *tc)
{
tc->tc_quality = -100;
membar_producer();
atomic_inc_uint(&timecounter_bad);
}
/*
* Stop using a timecounter and remove it from the timecounters list.
*/
int
tc_detach(struct timecounter *target)
{
struct timecounter *tc;
struct timecounter **tcp = NULL;
int removals;
lwp_t *l;
/* First, find the timecounter. */
mutex_spin_enter(&timecounter_lock);
for (tcp = &timecounters, tc = timecounters;
tc != NULL;
tcp = &tc->tc_next, tc = tc->tc_next) {
if (tc == target)
break;
}
if (tc == NULL) {
mutex_spin_exit(&timecounter_lock);
return ESRCH;
}
/* And now, remove it. */
*tcp = tc->tc_next;
if (timecounter == target) {
tc_pick();
tc_windup();
}
timecounter_mods++;
removals = timecounter_removals++;
mutex_spin_exit(&timecounter_lock);
/*
* We now have to determine if any threads in the system are still
* making use of this timecounter.
*
* We issue a broadcast cross call to elide memory ordering issues,
* then scan all LWPs in the system looking at each's timecounter
* generation number. We need to see a value of zero (not actively
* using a timecounter) or a value greater than our removal value.
*
* We may race with threads that read `timecounter_removals' and
* then get preempted before updating `l_tcgen'. This is not
* a problem, since it means that these threads have not yet started
* accessing timecounter state. All we do need is one clean
* snapshot of the system where every thread appears not to be using
* old timecounter state.
*/
for (;;) {
xc_barrier(0);
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
if (l->l_tcgen == 0 || l->l_tcgen > removals) {
/*
* Not using timecounter or old timecounter
* state at time of our xcall or later.
*/
continue;
}
break;
}
mutex_exit(&proc_lock);
/*
* If the timecounter is still in use, wait at least 10ms
* before retrying.
*/
if (l == NULL) {
break;
}
(void)kpause("tcdetach", false, mstohz(10), NULL);
}
tc->tc_next = NULL;
return 0;
}
/* Report the frequency of the current timecounter. */
uint64_t
tc_getfrequency(void)
{
return atomic_load_consume(&timehands)->th_counter->tc_frequency;
}
/*
* Step our concept of UTC. This is done by modifying our estimate of
* when we booted.
*/
void
tc_setclock(const struct timespec *ts)
{
struct timespec ts2;
struct bintime bt, bt2;
mutex_spin_enter(&timecounter_lock);
TC_COUNT(nsetclock);
binuptime(&bt2);
timespec2bintime(ts, &bt);
bintime_sub(&bt, &bt2);
bintime_add(&bt2, &timebase.bin);
timebase.gen |= 1; /* change in progress */
membar_producer();
timebase.bin = bt;
membar_producer();
timebase.gen++; /* commit change */
tc_windup();
mutex_spin_exit(&timecounter_lock);
if (timestepwarnings) {
bintime2timespec(&bt2, &ts2);
log(LOG_INFO,
"Time stepped from %lld.%09ld to %lld.%09ld\n",
(long long)ts2.tv_sec, ts2.tv_nsec,
(long long)ts->tv_sec, ts->tv_nsec);
}
}
/*
* Initialize the next struct timehands in the ring and make
* it the active timehands. Along the way we might switch to a different
* timecounter and/or do seconds processing in NTP. Slightly magic.
*/
static void
tc_windup(void)
{
struct bintime bt;
struct timehands *th, *tho;
uint64_t scale;
u_int delta, ncount, ogen;
int i, s_update;
time_t t;
KASSERT(mutex_owned(&timecounter_lock));
s_update = 0;
/*
* Make the next timehands a copy of the current one, but do not
* overwrite the generation or next pointer. While we update
* the contents, the generation must be zero. Ensure global
* visibility of the generation before proceeding.
*/
tho = timehands;
th = tho->th_next;
ogen = th->th_generation;
th->th_generation = 0;
membar_producer();
bcopy(tho, th, offsetof(struct timehands, th_generation));
/*
* Capture a timecounter delta on the current timecounter and if
* changing timecounters, a counter value from the new timecounter.
* Update the offset fields accordingly.
*/
delta = tc_delta(th);
if (th->th_counter != timecounter)
ncount = timecounter->tc_get_timecount(timecounter);
else
ncount = 0;
th->th_offset_count += delta;
bintime_addx(&th->th_offset, th->th_scale * delta);
/*
* Hardware latching timecounters may not generate interrupts on
* PPS events, so instead we poll them. There is a finite risk that
* the hardware might capture a count which is later than the one we
* got above, and therefore possibly in the next NTP second which might
* have a different rate than the current NTP second. It doesn't
* matter in practice.
*/
if (tho->th_counter->tc_poll_pps)
tho->th_counter->tc_poll_pps(tho->th_counter);
/*
* Deal with NTP second processing. The for loop normally
* iterates at most once, but in extreme situations it might
* keep NTP sane if timeouts are not run for several seconds.
* At boot, the time step can be large when the TOD hardware
* has been read, so on really large steps, we call
* ntp_update_second only twice. We need to call it twice in
* case we missed a leap second.
* If NTP is not compiled in ntp_update_second still calculates
* the adjustment resulting from adjtime() calls.
*/
bt = th->th_offset;
bintime_add(&bt, &timebase.bin);
i = bt.sec - tho->th_microtime.tv_sec;
if (i > LARGE_STEP)
i = 2;
for (; i > 0; i--) {
t = bt.sec;
ntp_update_second(&th->th_adjustment, &bt.sec);
s_update = 1;
if (bt.sec != t) {
timebase.gen |= 1; /* change in progress */
membar_producer();
timebase.bin.sec += bt.sec - t;
membar_producer();
timebase.gen++; /* commit change */
}
}
/* Update the UTC timestamps used by the get*() functions. */
/* XXX shouldn't do this here. Should force non-`get' versions. */
bintime2timeval(&bt, &th->th_microtime);
bintime2timespec(&bt, &th->th_nanotime);
/* Now is a good time to change timecounters. */
if (th->th_counter != timecounter) {
th->th_counter = timecounter;
th->th_offset_count = ncount;
s_update = 1;
}
/*-
* Recalculate the scaling factor. We want the number of 1/2^64
* fractions of a second per period of the hardware counter, taking
* into account the th_adjustment factor which the NTP PLL/adjtime(2)
* processing provides us with.
*
* The th_adjustment is nanoseconds per second with 32 bit binary
* fraction and we want 64 bit binary fraction of second:
*
* x = a * 2^32 / 10^9 = a * 4.294967296
*
* The range of th_adjustment is +/- 5000PPM so inside a 64bit int
* we can only multiply by about 850 without overflowing, but that
* leaves suitably precise fractions for multiply before divide.
*
* Divide before multiply with a fraction of 2199/512 results in a
* systematic undercompensation of 10PPM of th_adjustment. On a
* 5000PPM adjustment this is a 0.05PPM error. This is acceptable.
*
* We happily sacrifice the lowest of the 64 bits of our result
* to the goddess of code clarity.
*
*/
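/*
 * Numerically: 2^32 / 10^9 = 4.294967296 while 2199 / 512 =
 * 4.294921875, so the approximation is low by about 1.06e-5 of its
 * value, which is where the ~10PPM undercompensation figure above
 * comes from.
 */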
if (s_update) {
scale = (uint64_t)1 << 63;
scale += (th->th_adjustment / 1024) * 2199;
scale /= th->th_counter->tc_frequency;
th->th_scale = scale * 2;
}
/*
* Now that the struct timehands is again consistent, set the new
* generation number, making sure to not make it zero. Ensure
* changes are globally visible before changing.
*/
if (++ogen == 0)
ogen = 1;
membar_producer();
th->th_generation = ogen;
/*
* Go live with the new struct timehands. Ensure changes are
* globally visible before changing.
*/
setrealuptime(th->th_microtime.tv_sec, th->th_offset.sec);
atomic_store_release(&timehands, th);
/*
* Force users of the old timehand to move on. This is
* necessary for MP systems; we need to ensure that the
* consumers will move away from the old timehand before
* we begin updating it again when we eventually wrap
* around.
*/
if (++tho->th_generation == 0)
tho->th_generation = 1;
}
/*
* RFC 2783 PPS-API implementation.
*/
int
pps_ioctl(u_long cmd, void *data, struct pps_state *pps)
{
pps_params_t *app;
pps_info_t *pipi;
#ifdef PPS_SYNC
int *epi;
#endif
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
switch (cmd) {
case PPS_IOC_CREATE:
return 0;
case PPS_IOC_DESTROY:
return 0;
case PPS_IOC_SETPARAMS:
app = (pps_params_t *)data;
if (app->mode & ~pps->ppscap)
return EINVAL;
pps->ppsparam = *app;
return 0;
case PPS_IOC_GETPARAMS:
app = (pps_params_t *)data;
*app = pps->ppsparam;
app->api_version = PPS_API_VERS_1;
return 0;
case PPS_IOC_GETCAP:
*(int*)data = pps->ppscap;
return 0;
case PPS_IOC_FETCH:
pipi = (pps_info_t *)data;
pps->ppsinfo.current_mode = pps->ppsparam.mode;
*pipi = pps->ppsinfo;
return 0;
case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
epi = (int *)data;
/* XXX Only root should be able to do this */
if (*epi & ~pps->ppscap)
return EINVAL;
pps->kcmode = *epi;
return 0;
#else
return EOPNOTSUPP;
#endif
default:
return EPASSTHROUGH;
}
}
void
pps_init(struct pps_state *pps)
{
KASSERT(mutex_owned(&timecounter_lock));
pps->ppscap |= PPS_TSFMT_TSPEC;
if (pps->ppscap & PPS_CAPTUREASSERT)
pps->ppscap |= PPS_OFFSETASSERT;
if (pps->ppscap & PPS_CAPTURECLEAR)
pps->ppscap |= PPS_OFFSETCLEAR;
}
/*
* capture a timestamp in the pps structure
*/
void
pps_capture(struct pps_state *pps)
{
struct timehands *th;
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
th = timehands;
pps->capgen = th->th_generation;
pps->capth = th;
pps->capcount = (uint64_t)tc_delta(th) + th->th_offset_count;
if (pps->capgen != th->th_generation)
pps->capgen = 0;
}
#ifdef PPS_DEBUG
int ppsdebug = 0;
#endif
/*
* process a pps_capture()ed event
*/
void
pps_event(struct pps_state *pps, int event)
{
pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE);
}
/*
* extended pps api / kernel pll/fll entry point
*
* feed reference time stamps to PPS engine
*
* will simulate a PPS event and feed
* the NTP PLL/FLL if requested.
*
* the ref time stamps should arrive roughly once a
* second; they do not need to be exactly in phase with
* the UTC second, but should be close to it.  this
* relaxation of requirements allows callout driven
* timestamping mechanisms to feed the pps capture/kernel
* pll logic.
*
* calling pattern is:
* pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR})
* read timestamp from reference source
* pps_ref_event()
*
* supported refmodes:
* PPS_REFEVNT_CAPTURE
* use system timestamp of pps_capture()
* PPS_REFEVNT_CURRENT
* use system timestamp of this call
* PPS_REFEVNT_CAPCUR
* use average of read capture and current system time stamp
* PPS_REFEVNT_PPS
* assume timestamp on second mark - ref_ts is ignored
*
*/
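/*
 * Concretely, a driver feeding a once-per-second reference might do
 * roughly the following (with timecounter_lock held, and with its
 * reference timestamp already converted to a struct bintime ref_bt;
 * names illustrative):
 *
 *	pps_capture(pps);
 *	(read the timestamp from the reference source into ref_bt)
 *	pps_ref_event(pps, PPS_CAPTUREASSERT, &ref_bt, PPS_REFEVNT_CAPCUR);
 */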
void
pps_ref_event(struct pps_state *pps,
int event,
struct bintime *ref_ts,
int refmode
)
{
struct bintime bt; /* current time */
struct bintime btd; /* time difference */
struct bintime bt_ref; /* reference time */
struct timespec ts, *tsp, *osp;
struct timehands *th;
uint64_t tcount, acount, dcount, *pcount;
int foff, gen;
#ifdef PPS_SYNC
int fhard;
#endif
pps_seq_t *pseq;
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
/* pick up current time stamp if needed */
if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) {
/* pick up current time stamp */
th = timehands;
gen = th->th_generation;
tcount = (uint64_t)tc_delta(th) + th->th_offset_count;
if (gen != th->th_generation)
gen = 0;
/* If the timecounter was wound up underneath us, bail out. */
if (pps->capgen == 0 ||
pps->capgen != pps->capth->th_generation ||
gen == 0 ||
gen != pps->capgen) {
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n",
pps, event);
}
#endif
return;
}
} else {
tcount = 0; /* keep GCC happy */
}
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
struct timespec tmsp;
if (ref_ts == NULL) {
tmsp.tv_sec = 0;
tmsp.tv_nsec = 0;
} else {
bintime2timespec(ref_ts, &tmsp);
}
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64
".%09"PRIi32", refmode=0x%1x)\n",
pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode);
}
#endif
/* setup correct event references */
if (event == PPS_CAPTUREASSERT) {
tsp = &pps->ppsinfo.assert_timestamp;
osp = &pps->ppsparam.assert_offset;
foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
pcount = &pps->ppscount[0];
pseq = &pps->ppsinfo.assert_sequence;
} else {
tsp = &pps->ppsinfo.clear_timestamp;
osp = &pps->ppsparam.clear_offset;
foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
pcount = &pps->ppscount[1];
pseq = &pps->ppsinfo.clear_sequence;
}
/* determine system time stamp according to refmode */
dcount = 0; /* keep GCC happy */
switch (refmode & PPS_REFEVNT_RMASK) {
case PPS_REFEVNT_CAPTURE:
acount = pps->capcount; /* use capture timestamp */
break;
case PPS_REFEVNT_CURRENT:
acount = tcount; /* use current timestamp */
break;
case PPS_REFEVNT_CAPCUR:
/*
* calculate counter value between pps_capture() and
* pps_ref_event()
*/
dcount = tcount - pps->capcount;
acount = (dcount / 2) + pps->capcount;
break;
default: /* ignore call error silently */
return;
}
/*
* If the timecounter changed, we cannot compare the count values, so
* we have to drop the rest of the PPS-stuff until the next event.
*/
if (pps->ppstc != pps->capth->th_counter) {
pps->ppstc = pps->capth->th_counter;
pps->capcount = acount;
*pcount = acount;
pps->ppscount[2] = acount;
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n",
pps, event);
}
#endif
return;
}
pps->capcount = acount;
/* Convert the count to a bintime. */
bt = pps->capth->th_offset;
bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count));
bintime_add(&bt, &timebase.bin);
if ((refmode & PPS_REFEVNT_PPS) == 0) {
/* determine difference to reference time stamp */
bt_ref = *ref_ts;
btd = bt;
bintime_sub(&btd, &bt_ref);
/*
* simulate a PPS timestamp by dropping the fraction
* and applying the offset
*/
if (bt.frac >= (uint64_t)1<<63) /* skip to nearest second */
bt.sec++;
bt.frac = 0;
bintime_add(&bt, &btd);
} else {
/*
* create ref_ts from current time -
* we are supposed to be called on
* the second mark
*/
bt_ref = bt;
if (bt_ref.frac >= (uint64_t)1<<63) /* skip to nearest second */
bt_ref.sec++;
bt_ref.frac = 0;
}
/* convert bintime to timestamp */
bintime2timespec(&bt, &ts);
/* If the timecounter was wound up underneath us, bail out. */
if (pps->capgen != pps->capth->th_generation)
return;
/* store time stamp */
*pcount = pps->capcount;
(*pseq)++;
*tsp = ts;
/* add offset correction */
if (foff) {
timespecadd(tsp, osp, tsp);
if (tsp->tv_nsec < 0) {
tsp->tv_nsec += 1000000000;
tsp->tv_sec -= 1;
}
}
#ifdef PPS_DEBUG
if (ppsdebug & 0x2) {
struct timespec ts2;
struct timespec ts3;
bintime2timespec(&bt_ref, &ts2);
bt.sec = 0;
bt.frac = 0;
if (refmode & PPS_REFEVNT_CAPCUR) {
bintime_addx(&bt, pps->capth->th_scale * dcount);
}
bintime2timespec(&bt, &ts3);
log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32
", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n",
ts2.tv_sec, (int32_t)ts2.tv_nsec,
tsp->tv_sec, (int32_t)tsp->tv_nsec,
timespec2ns(&ts3));
}
#endif
#ifdef PPS_SYNC
if (fhard) {
uint64_t scale;
uint64_t div;
/*
* Feed the NTP PLL/FLL.
* The FLL wants to know how many (hardware) nanoseconds
* elapsed since the previous event (mod 1 second) thus
* we are actually looking at the frequency difference scaled
* in nsec.
* As the counter time stamps are not truly at 1Hz
* we need to scale the count by the elapsed
* reference time.
* valid sampling interval: [0.5..2[ sec
*/
/* calculate elapsed raw count */
tcount = pps->capcount - pps->ppscount[2];
pps->ppscount[2] = pps->capcount;
tcount &= pps->capth->th_counter->tc_counter_mask;
/* calculate elapsed ref time */
btd = bt_ref;
bintime_sub(&btd, &pps->ref_time);
pps->ref_time = bt_ref;
/* check that we stay below 2 sec */
if (btd.sec < 0 || btd.sec > 1)
return;
/* we want at least 0.5 sec between samples */
if (btd.sec == 0 && btd.frac < (uint64_t)1<<63)
return;
/*
* calculate cycles per period by multiplying
* the frequency with the elapsed period
* we pick a fraction of 30 bits
* ~1ns resolution for elapsed time
*/
div = (uint64_t)btd.sec << 30;
div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1);
div *= pps->capth->th_counter->tc_frequency;
div >>= 30;
if (div == 0) /* safeguard */
return;
scale = (uint64_t)1 << 63;
scale /= div;
scale *= 2;
bt.sec = 0;
bt.frac = 0;
bintime_addx(&bt, scale * tcount);
bintime2timespec(&bt, &ts);
#ifdef PPS_DEBUG
if (ppsdebug & 0x4) {
struct timespec ts2;
int64_t df;
bintime2timespec(&bt_ref, &ts2);
df = timespec2ns(&ts);
if (df > 500000000)
df -= 1000000000;
log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64
".%09"PRIi32", ts=%"PRIi64".%09"PRIi32
", freqdiff=%"PRIi64" ns/s\n",
ts2.tv_sec, (int32_t)ts2.tv_nsec,
tsp->tv_sec, (int32_t)tsp->tv_nsec,
df);
}
#endif
hardpps(tsp, timespec2ns(&ts));
}
#endif
}
/*
* Timecounters need to be updated every so often to prevent the hardware
* counter from overflowing. Updating also recalculates the cached values
* used by the get*() family of functions, so their precision depends on
* the update frequency.
*/
static int tc_tick;
void
tc_ticktock(void)
{
static int count;
if (++count < tc_tick)
return;
count = 0;
mutex_spin_enter(&timecounter_lock);
if (__predict_false(timecounter_bad != 0)) {
/* An existing timecounter has gone bad, pick a new one. */
(void)atomic_swap_uint(&timecounter_bad, 0);
if (timecounter->tc_quality < 0) {
tc_pick();
}
}
tc_windup();
mutex_spin_exit(&timecounter_lock);
}
void
inittimecounter(void)
{
u_int p;
mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH);
/*
* Set the initial timeout to
* max(1, <approx. number of hardclock ticks in a millisecond>).
* People should probably not use the sysctl to set the timeout
* to smaller than its initial value, since that value is the
* smallest reasonable one. If they want better timestamps they
* should use the non-"get"* functions.
*/
if (hz > 1000)
tc_tick = (hz + 500) / 1000;
else
tc_tick = 1;
p = (tc_tick * 1000000) / hz;
aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n",
p / 1000, p % 1000);
/* warm up new timecounter (again) and get rolling. */
(void)timecounter->tc_get_timecount(timecounter);
(void)timecounter->tc_get_timecount(timecounter);
}
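/*
 * For example, at hz = 100 this gives tc_tick = 1 and p = 10000, i.e.
 * "Timecounters tick every 10.000 msec"; at hz = 8000 it gives
 * tc_tick = (8000 + 500) / 1000 = 8 and p = 1000, i.e. a windup every
 * 1.000 msec.
 */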
/* $NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95
*/
/*
* procfs vnode interface
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/dirent.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/ptrace.h>
#include <sys/kauth.h>
#include <sys/exec.h>
#include <uvm/uvm_extern.h> /* for PAGE_SIZE */
#include <machine/reg.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/procfs/procfs.h>
/*
* Vnode Operations.
*
*/
static int procfs_validfile_linux(struct lwp *, struct mount *);
static int procfs_root_readdir_callback(struct proc *, void *);
static void procfs_dir(pfstype, struct lwp *, struct proc *, char **, char *,
size_t);
/*
* This is a list of the valid names in the
* process-specific sub-directories. It is
* used in procfs_lookup and procfs_readdir
*/
static const struct proc_target {
u_char pt_type;
u_char pt_namlen;
const char *pt_name;
pfstype pt_pfstype;
int (*pt_valid)(struct lwp *, struct mount *);
} proc_targets[] = {
#define N(s) sizeof(s)-1, s
/* name type validp */
{ DT_DIR, N("."), PFSproc, NULL },
{ DT_DIR, N(".."), PFSroot, NULL },
{ DT_DIR, N("fd"), PFSfd, NULL },
{ DT_DIR, N("task"), PFStask, procfs_validfile_linux },
{ DT_LNK, N("cwd"), PFScwd, NULL },
{ DT_REG, N("emul"), PFSemul, NULL },
{ DT_LNK, N("root"), PFSchroot, NULL },
{ DT_REG, N("auxv"), PFSauxv, procfs_validauxv },
{ DT_REG, N("cmdline"), PFScmdline, NULL },
{ DT_REG, N("environ"), PFSenviron, NULL },
{ DT_LNK, N("exe"), PFSexe, procfs_validfile },
{ DT_REG, N("file"), PFSfile, procfs_validfile },
{ DT_REG, N("fpregs"), PFSfpregs, procfs_validfpregs },
{ DT_REG, N("limit"), PFSlimit, NULL },
{ DT_REG, N("map"), PFSmap, procfs_validmap },
{ DT_REG, N("maps"), PFSmaps, procfs_validmap },
{ DT_REG, N("mem"), PFSmem, NULL },
{ DT_REG, N("note"), PFSnote, NULL },
{ DT_REG, N("notepg"), PFSnotepg, NULL },
{ DT_REG, N("regs"), PFSregs, procfs_validregs },
{ DT_REG, N("stat"), PFSstat, procfs_validfile_linux },
{ DT_REG, N("statm"), PFSstatm, procfs_validfile_linux },
{ DT_REG, N("status"), PFSstatus, NULL },
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_DEFNS
#endif
#undef N
};
static const int nproc_targets = sizeof(proc_targets) / sizeof(proc_targets[0]);
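/*
* Example (editor's note): the entry { DT_REG, N("status"), PFSstatus, NULL }
* above is what makes /proc/<pid>/status appear.  procfs_lookup() matches a
* component name against pt_name, procfs_readdir() emits one dirent of type
* pt_type per entry, and a NULL pt_valid means the node is always visible,
* while e.g. procfs_validfile_linux hides "task" unless the mount uses
* Linux compatibility mode.
*/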
/*
* List of files in the root directory. Note: the validate function will
* be called with p == NULL for these entries.
*/
static const struct proc_target proc_root_targets[] = {
#define N(s) sizeof(s)-1, s
/* name type validp */
{ DT_REG, N("meminfo"), PFSmeminfo, procfs_validfile_linux },
{ DT_REG, N("cpuinfo"), PFScpuinfo, procfs_validfile_linux },
{ DT_REG, N("uptime"), PFSuptime, procfs_validfile_linux },
{ DT_REG, N("mounts"), PFSmounts, procfs_validfile_linux },
{ DT_REG, N("devices"), PFSdevices, procfs_validfile_linux },
{ DT_REG, N("stat"), PFScpustat, procfs_validfile_linux },
{ DT_REG, N("loadavg"), PFSloadavg, procfs_validfile_linux },
{ DT_REG, N("version"), PFSversion, procfs_validfile_linux },
#undef N
};
static const int nproc_root_targets =
sizeof(proc_root_targets) / sizeof(proc_root_targets[0]);
int procfs_lookup(void *);
int procfs_open(void *);
int procfs_close(void *);
int procfs_access(void *);
int procfs_getattr(void *);
int procfs_setattr(void *);
int procfs_readdir(void *);
int procfs_readlink(void *);
int procfs_inactive(void *);
int procfs_reclaim(void *);
int procfs_print(void *);
int procfs_pathconf(void *);
int procfs_getpages(void *);
static uint8_t fttodt(file_t *);
static int atoi(const char *, size_t);
/*
* procfs vnode operations.
*/
int (**procfs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc procfs_vnodeop_entries[] = {
{ &vop_default_desc, vn_default_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, procfs_lookup }, /* lookup */
{ &vop_create_desc, genfs_eopnotsupp }, /* create */
{ &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */
{ &vop_open_desc, procfs_open }, /* open */
{ &vop_close_desc, procfs_close }, /* close */
{ &vop_access_desc, procfs_access }, /* access */
{ &vop_accessx_desc, genfs_accessx }, /* accessx */
{ &vop_getattr_desc, procfs_getattr }, /* getattr */
{ &vop_setattr_desc, procfs_setattr }, /* setattr */
{ &vop_read_desc, procfs_rw }, /* read */
{ &vop_write_desc, procfs_rw }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */
{ &vop_poll_desc, genfs_poll }, /* poll */
{ &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */
{ &vop_revoke_desc, genfs_revoke }, /* revoke */
{ &vop_fsync_desc, genfs_nullop }, /* fsync */
{ &vop_seek_desc, genfs_nullop }, /* seek */
{ &vop_remove_desc, genfs_eopnotsupp }, /* remove */
{ &vop_link_desc, genfs_erofs_link }, /* link */
{ &vop_rename_desc, genfs_eopnotsupp }, /* rename */
{ &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */
{ &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */
{ &vop_symlink_desc, genfs_erofs_symlink }, /* symlink */
{ &vop_readdir_desc, procfs_readdir }, /* readdir */
{ &vop_readlink_desc, procfs_readlink }, /* readlink */
{ &vop_abortop_desc, genfs_abortop }, /* abortop */
{ &vop_inactive_desc, procfs_inactive }, /* inactive */
{ &vop_reclaim_desc, procfs_reclaim }, /* reclaim */
{ &vop_lock_desc, genfs_lock }, /* lock */
{ &vop_unlock_desc, genfs_unlock }, /* unlock */
{ &vop_bmap_desc, genfs_eopnotsupp }, /* bmap */
{ &vop_strategy_desc, genfs_badop }, /* strategy */
{ &vop_print_desc, procfs_print }, /* print */
{ &vop_islocked_desc, genfs_islocked }, /* islocked */
{ &vop_pathconf_desc, procfs_pathconf }, /* pathconf */
{ &vop_advlock_desc, genfs_einval }, /* advlock */
{ &vop_getpages_desc, procfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_null_putpages }, /* putpages */
{ NULL, NULL }
};
const struct vnodeopv_desc procfs_vnodeop_opv_desc =
{ &procfs_vnodeop_p, procfs_vnodeop_entries };
/*
* set things up for doing i/o on
* the pfsnode (vp). (vp) is locked
* on entry, and should be left locked
* on exit.
*
* for procfs we don't need to do anything
* in particular for i/o. all that is done
* is to support exclusive open on process
* memory images.
*/
int
procfs_open(void *v)
{
struct vop_open_args /* {
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct lwp *l1;
struct proc *p2;
int error;
if ((error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p2, ENOENT)) != 0)
return error;
l1 = curlwp; /* tracer */
#define M2K(m) (((m) & FREAD) && ((m) & FWRITE) ? \
KAUTH_REQ_PROCESS_PROCFS_RW : \
(m) & FWRITE ? KAUTH_REQ_PROCESS_PROCFS_WRITE : \
KAUTH_REQ_PROCESS_PROCFS_READ)
mutex_enter(p2->p_lock);
error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_PROCFS,
p2, pfs, KAUTH_ARG(M2K(ap->a_mode)), NULL);
mutex_exit(p2->p_lock);
if (error) {
procfs_proc_unlock(p2);
return (error);
}
#undef M2K
switch (pfs->pfs_type) {
case PFSmem:
if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) ||
((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) {
error = EBUSY;
break;
}
if (!proc_isunder(p2, l1)) {
error = EPERM;
break;
}
if (ap->a_mode & FWRITE)
pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL);
break;
case PFSregs:
case PFSfpregs:
if (!proc_isunder(p2, l1)) {
error = EPERM;
break;
}
break;
default:
break;
}
procfs_proc_unlock(p2);
return (error);
}
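/*
* Illustration (editor's note): the M2K() macro above maps the open mode to
* a kauth(9) request, so FREAD|FWRITE is checked as
* KAUTH_REQ_PROCESS_PROCFS_RW, FWRITE alone as KAUTH_REQ_PROCESS_PROCFS_WRITE,
* and a plain FREAD as KAUTH_REQ_PROCESS_PROCFS_READ.
*/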
/*
* close the pfsnode (vp) after doing i/o.
* (vp) is not locked on entry or exit.
*
* nothing to do for procfs other than undo
* any exclusive open flag (see _open above).
*/
int
procfs_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct pfsnode *pfs = VTOPFS(ap->a_vp);
switch (pfs->pfs_type) {
case PFSmem:
if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL))
pfs->pfs_flags &= ~(FWRITE|O_EXCL);
break;
default:
break;
}
return (0);
}
/*
* _inactive is called when the pfsnode
* is vrele'd and the reference count goes
* to zero. (vp) will be on the vnode free
* list, so to get it back vget() must be
* used.
*
* (vp) is locked on entry, but must be unlocked on exit.
*/
int
procfs_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
mutex_enter(&proc_lock);
*ap->a_recycle = (procfs_proc_find(vp->v_mount, pfs->pfs_pid) == NULL);
mutex_exit(&proc_lock);
return (0);
}
/*
* _reclaim is called when getnewvnode()
* wants to make use of an entry on the vnode
* free list. at this time the filesystem needs
* to free any private data and remove the node
* from any private lists.
*/
int
procfs_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
VOP_UNLOCK(vp);
/*
* To interlock with procfs_revoke_vnodes().
*/
mutex_enter(vp->v_interlock);
vp->v_data = NULL;
mutex_exit(vp->v_interlock);
procfs_hashrem(pfs);
kmem_free(pfs, sizeof(*pfs));
return 0;
}
/*
* Return POSIX pathconf information applicable to special devices.
*/
int
procfs_pathconf(void *v)
{
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
register_t *a_retval;
} */ *ap = v;
switch (ap->a_name) {
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
return (0);
case _PC_MAX_CANON:
*ap->a_retval = MAX_CANON;
return (0);
case _PC_MAX_INPUT:
*ap->a_retval = MAX_INPUT;
return (0);
case _PC_PIPE_BUF:
*ap->a_retval = PIPE_BUF;
return (0);
case _PC_CHOWN_RESTRICTED:
*ap->a_retval = 1;
return (0);
case _PC_VDISABLE:
*ap->a_retval = _POSIX_VDISABLE;
return (0);
case _PC_SYNC_IO:
*ap->a_retval = 1;
return (0);
default:
return genfs_pathconf(ap);
}
/* NOTREACHED */
}
/*
* _print is used for debugging.
* just print a readable description
* of (vp).
*/
int
procfs_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct pfsnode *pfs = VTOPFS(ap->a_vp);
printf("tag VT_PROCFS, type %d, pid %d, mode %x, flags %lx\n",
pfs->pfs_type, pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags);
return 0;
}
/*
* Works out the path to the target process's current
* working directory or chroot. If the caller is in a chroot and
* can't "reach" the target's cwd or root (or some other error
* occurs), a "/" is returned for the path.
*/
static void
procfs_dir(pfstype t, struct lwp *caller, struct proc *target, char **bpp,
char *path, size_t len)
{
struct cwdinfo *cwdi;
struct vnode *vp, *rvp;
char *bp;
/*
* Lock target cwdi and take a reference to the vnode
* we are interested in to prevent it from disappearing
* before getcwd_common() below.
*/
rw_enter(&target->p_cwdi->cwdi_lock, RW_READER);
switch (t) {
case PFScwd:
vp = target->p_cwdi->cwdi_cdir;
break;
case PFSchroot:
vp = target->p_cwdi->cwdi_rdir;
break;
default:
rw_exit(&target->p_cwdi->cwdi_lock);
return;
}
if (vp != NULL)
vref(vp);
rw_exit(&target->p_cwdi->cwdi_lock);
cwdi = caller->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
rvp = cwdi->cwdi_rdir;
bp = bpp ? *bpp : NULL;
/*
* XXX: this horrible kludge avoids locking panics when
* attempting to lookup links that point to within procfs
*/
if (vp != NULL && vp->v_tag == VT_PROCFS) {
if (bpp) {
*--bp = '/';
*bpp = bp;
}
vrele(vp);
rw_exit(&cwdi->cwdi_lock);
return;
}
if (rvp == NULL)
rvp = rootvnode;
if (vp == NULL || getcwd_common(vp, rvp, bp ? &bp : NULL, path,
len / 2, 0, caller) != 0) {
if (bpp) {
bp = *bpp;
*--bp = '/';
}
}
if (bpp)
*bpp = bp;
if (vp != NULL)
vrele(vp);
rw_exit(&cwdi->cwdi_lock);
}
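/*
* Usage sketch (editor's illustration; it mirrors what the callers in
* procfs_getattr() and procfs_readlink() below actually do):
*
*	char *path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
*	char *bp = path + MAXPATHLEN;
*
*	*--bp = '\0';
*	procfs_dir(PFScwd, curlwp, procp, &bp, path, MAXPATHLEN);
*	free(path, M_TEMP);
*
* On return bp points at a NUL-terminated path built backwards into the
* buffer; a lone "/" is substituted when the cwd or root is unreachable.
*/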
/*
* Invent attributes for pfsnode (vp) and store
* them in (vap).
* Directory lengths are returned as zero since
* any real length would require the genuine size
* to be computed, and nothing cares anyway.
*
* this is relatively minimal for procfs.
*/
int
procfs_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct vattr *vap = ap->a_vap;
struct proc *procp;
char *path, *bp, bf[16];
int error;
/* first check the process still exists */
switch (pfs->pfs_type) {
case PFSroot:
case PFScurproc:
case PFSself:
procp = NULL;
break;
default:
error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &procp, ENOENT);
if (error != 0)
return (error);
break;
}
switch (pfs->pfs_type) {
case PFStask:
if (pfs->pfs_fd == -1) {
path = NULL;
break;
}
/*FALLTHROUGH*/
case PFScwd:
case PFSchroot:
path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
if (path == NULL && procp != NULL) {
procfs_proc_unlock(procp);
return (ENOMEM);
}
break;
default:
path = NULL;
break;
}
if (procp != NULL) {
mutex_enter(procp->p_lock);
error = kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, procp,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
mutex_exit(procp->p_lock);
if (error != 0) {
procfs_proc_unlock(procp);
if (path != NULL) free(path, M_TEMP);
return (ENOENT);
}
}
error = 0;
/* start by zeroing out the attributes */
vattr_null(vap);
/* next do all the common fields */
vap->va_type = ap->a_vp->v_type;
vap->va_mode = pfs->pfs_mode;
vap->va_fileid = pfs->pfs_fileno;
vap->va_flags = 0;
vap->va_blocksize = PAGE_SIZE;
/*
* Make all times be current TOD.
*
* It would be possible to get the process start
* time from the p_stats structure, but there's
* no "file creation" time stamp anyway, and the
* p_stats structure is not addressable if u. gets
* swapped out for that process.
*/
getnanotime(&vap->va_ctime);
vap->va_atime = vap->va_mtime = vap->va_ctime;
if (procp)
TIMEVAL_TO_TIMESPEC(&procp->p_stats->p_start,
&vap->va_birthtime);
else
getnanotime(&vap->va_birthtime);
switch (pfs->pfs_type) {
case PFSmem:
case PFSregs:
case PFSfpregs:
#if defined(__HAVE_PROCFS_MACHDEP) && defined(PROCFS_MACHDEP_PROTECT_CASES)
PROCFS_MACHDEP_PROTECT_CASES
#endif
/*
* If the process has exercised some setuid or setgid
* privilege, then rip away read/write permission so
* that only root can gain access.
*/
if (procp->p_flag & PK_SUGID)
vap->va_mode &= ~(S_IRUSR|S_IWUSR);
/* FALLTHROUGH */
case PFSstatus:
case PFSstat:
case PFSnote:
case PFSnotepg:
case PFScmdline:
case PFSenviron:
case PFSemul:
case PFSstatm:
case PFSmap:
case PFSmaps:
case PFSlimit:
case PFSauxv:
vap->va_nlink = 1;
vap->va_uid = kauth_cred_geteuid(procp->p_cred);
vap->va_gid = kauth_cred_getegid(procp->p_cred);
break;
case PFScwd:
case PFSchroot:
case PFSmeminfo:
case PFSdevices:
case PFScpuinfo:
case PFSuptime:
case PFSmounts:
case PFScpustat:
case PFSloadavg:
case PFSversion:
case PFSexe:
case PFSself:
case PFScurproc:
case PFSroot:
vap->va_nlink = 1;
vap->va_uid = vap->va_gid = 0;
break;
case PFSproc:
case PFStask:
case PFSfile:
case PFSfd:
break;
default:
panic("%s: %d/1", __func__, pfs->pfs_type);
}
/*
* now do the object specific fields
*
* The size could be set from struct reg, but it's hardly
* worth the trouble, and it puts some (potentially) machine
* dependent data into this machine-independent code. If it
* becomes important then this function should break out into
* a per-file stat function in the corresponding .c file.
*/
switch (pfs->pfs_type) {
case PFSroot:
vap->va_bytes = vap->va_size = DEV_BSIZE;
break;
case PFSself:
case PFScurproc:
vap->va_bytes = vap->va_size =
snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
break;
case PFStask:
if (pfs->pfs_fd != -1) {
vap->va_nlink = 1;
vap->va_uid = 0;
vap->va_gid = 0;
vap->va_bytes = vap->va_size =
snprintf(bf, sizeof(bf), "..");
break;
}
/*FALLTHROUGH*/
case PFSfd:
if (pfs->pfs_fd != -1) {
file_t *fp;
fp = fd_getfile2(procp, pfs->pfs_fd);
if (fp == NULL) {
error = EBADF;
break;
}
vap->va_nlink = 1;
vap->va_uid = kauth_cred_geteuid(fp->f_cred);
vap->va_gid = kauth_cred_getegid(fp->f_cred);
switch (fp->f_type) {
case DTYPE_VNODE:
vap->va_bytes = vap->va_size =
fp->f_vnode->v_size;
break;
default:
vap->va_bytes = vap->va_size = 0;
break;
}
closef(fp);
break;
}
/*FALLTHROUGH*/
case PFSproc:
vap->va_nlink = 2;
vap->va_uid = kauth_cred_geteuid(procp->p_cred);
vap->va_gid = kauth_cred_getegid(procp->p_cred);
vap->va_bytes = vap->va_size = DEV_BSIZE;
break;
case PFSfile:
error = EOPNOTSUPP;
break;
case PFSmem:
vap->va_bytes = vap->va_size =
ctob(procp->p_vmspace->vm_tsize +
procp->p_vmspace->vm_dsize +
procp->p_vmspace->vm_ssize);
break;
case PFSauxv:
vap->va_bytes = vap->va_size = procp->p_execsw->es_arglen;
break;
#if defined(PT_GETREGS) || defined(PT_SETREGS)
case PFSregs:
vap->va_bytes = vap->va_size = sizeof(struct reg);
break;
#endif
#if defined(PT_GETFPREGS) || defined(PT_SETFPREGS)
case PFSfpregs:
vap->va_bytes = vap->va_size = sizeof(struct fpreg);
break;
#endif
case PFSstatus:
case PFSstat:
case PFSnote:
case PFSnotepg:
case PFScmdline:
case PFSenviron:
case PFSmeminfo:
case PFSdevices:
case PFScpuinfo:
case PFSuptime:
case PFSmounts:
case PFScpustat:
case PFSloadavg:
case PFSstatm:
case PFSversion:
vap->va_bytes = vap->va_size = 0;
break;
case PFSlimit:
case PFSmap:
case PFSmaps:
/*
* Advise a larger blocksize for the map files, so that
* they may be read in one pass.
*/
vap->va_blocksize = 4 * PAGE_SIZE;
vap->va_bytes = vap->va_size = 0;
break;
case PFScwd:
case PFSchroot:
bp = path + MAXPATHLEN;
*--bp = '\0';
procfs_dir(pfs->pfs_type, curlwp, procp, &bp, path,
MAXPATHLEN);
vap->va_bytes = vap->va_size = strlen(bp);
break;
case PFSexe:
vap->va_bytes = vap->va_size = strlen(procp->p_path);
break;
case PFSemul:
vap->va_bytes = vap->va_size = strlen(procp->p_emul->e_name);
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
error = procfs_machdep_getattr(ap->a_vp, vap, procp);
break;
#endif
default:
panic("%s: %d/2", __func__, pfs->pfs_type);
}
if (procp != NULL)
procfs_proc_unlock(procp);
if (path != NULL)
free(path, M_TEMP);
return (error);
}
/*ARGSUSED*/
int
procfs_setattr(void *v)
{
/*
* just fake out attribute setting
* it's not good to generate an error
* return, otherwise things like creat()
* will fail when they try to set the
* file length to 0. worse, this means
* that echo $note > /proc/$pid/note will fail.
*/
return (0);
}
/*
* implement access checking.
*
* actually, the check for super-user is slightly
* broken since it will allow read access to write-only
* objects. this doesn't cause any particular trouble
* but does mean that the i/o entry points need to check
* that the operation really does make sense.
*/
int
procfs_access(void *v)
{
struct vop_access_args /* {
struct vnode *a_vp;
accmode_t a_accmode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vattr va;
int error;
if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0)
return (error);
return kauth_authorize_vnode(ap->a_cred,
KAUTH_ACCESS_ACTION(ap->a_accmode, ap->a_vp->v_type, va.va_mode),
ap->a_vp, NULL, genfs_can_access(ap->a_vp, ap->a_cred,
va.va_uid, va.va_gid, va.va_mode, NULL, ap->a_accmode));
}
/*
* lookup. this is incredibly complicated in the
* general case, however for most pseudo-filesystems
* very little needs to be done.
*
* Locking isn't hard here, just poorly documented.
*
* If we're looking up ".", just vref the parent & return it.
*
* If we're looking up "..", unlock the parent, and lock "..". If everything
* went ok, and we're on the last component and the caller requested the
* parent locked, try to re-lock the parent. We do this to prevent lock
* races.
*
* For anything else, get the needed node. Then unlock the parent if not
* the last component or not LOCKPARENT (i.e. if we wouldn't re-lock the
* parent in the .. case).
*
* We try to exit with the parent locked in error cases.
*/
int
procfs_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
struct vnode **vpp = ap->a_vpp;
struct vnode *dvp = ap->a_dvp;
const char *pname = cnp->cn_nameptr;
const struct proc_target *pt = NULL;
struct vnode *fvp;
pid_t pid, vnpid;
struct pfsnode *pfs;
struct proc *p = NULL;
struct lwp *plwp;
int i, error;
pfstype type;
*vpp = NULL;
if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
return (error);
if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
return (EROFS);
if (cnp->cn_namelen == 1 && *pname == '.') {
*vpp = dvp;
vref(dvp);
return (0);
}
pfs = VTOPFS(dvp);
switch (pfs->pfs_type) {
case PFSroot:
/*
* Shouldn't get here with .. in the root node.
*/
if (cnp->cn_flags & ISDOTDOT)
return (EIO);
for (i = 0; i < nproc_root_targets; i++) {
pt = &proc_root_targets[i];
/*
* check for node match. proc is always NULL here,
* so call pt_valid with constant NULL lwp.
*/
if (cnp->cn_namelen == pt->pt_namlen && memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
(pt->pt_valid == NULL ||
(*pt->pt_valid)(NULL, dvp->v_mount)))
break;
}
if (i != nproc_root_targets) {
error = procfs_allocvp(dvp->v_mount, vpp, 0,
pt->pt_pfstype, -1);
return (error);
}
if (CNEQ(cnp, "curproc", 7)) {
pid = curproc->p_pid;
vnpid = 0;
type = PFScurproc;
} else if (CNEQ(cnp, "self", 4)) {
pid = curproc->p_pid;
vnpid = 0;
type = PFSself;
} else {
pid = (pid_t)atoi(pname, cnp->cn_namelen);
vnpid = pid;
type = PFSproc;
}
if (procfs_proc_lock(dvp->v_mount, pid, &p, ESRCH) != 0)
break;
error = procfs_allocvp(dvp->v_mount, vpp, vnpid, type, -1);
procfs_proc_unlock(p);
return (error);
case PFSproc:
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, 0, PFSroot,
-1);
return (error);
}
if (procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ESRCH) != 0)
break;
mutex_enter(p->p_lock);
LIST_FOREACH(plwp, &p->p_lwps, l_sibling) {
if (plwp->l_stat != LSZOMB)
break;
}
/* Process is exiting if it has no LWPs or all LWPs are LSZOMB */
if (plwp == NULL) {
mutex_exit(p->p_lock);
procfs_proc_unlock(p);
return ESRCH;
}
lwp_addref(plwp);
mutex_exit(p->p_lock);
for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) {
int found;
found = cnp->cn_namelen == pt->pt_namlen && memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
(pt->pt_valid == NULL
|| (*pt->pt_valid)(plwp, dvp->v_mount));
if (found)
break;
}
lwp_delref(plwp);
if (i == nproc_targets) {
procfs_proc_unlock(p);
break;
}
if (pt->pt_pfstype == PFSfile) {
fvp = p->p_textvp;
/* We already checked that it exists. */
vref(fvp);
procfs_proc_unlock(p);
*vpp = fvp;
return (0);
}
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
pt->pt_pfstype, -1);
procfs_proc_unlock(p);
return (error);
case PFSfd: {
int fd;
file_t *fp;
if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ENOENT)) != 0)
return error;
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSproc, -1);
procfs_proc_unlock(p);
return (error);
}
fd = atoi(pname, cnp->cn_namelen);
fp = fd_getfile2(p, fd);
if (fp == NULL) {
procfs_proc_unlock(p);
return ENOENT;
}
fvp = fp->f_vnode;
/* Don't show directories */
if (fp->f_type == DTYPE_VNODE && fvp->v_type != VDIR &&
!procfs_proc_is_linux_compat()) {
vref(fvp);
closef(fp);
procfs_proc_unlock(p);
*vpp = fvp;
return 0;
}
closef(fp);
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSfd, fd);
procfs_proc_unlock(p);
return error;
}
case PFStask: {
int xpid;
if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ENOENT)) != 0)
return error;
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSproc, -1);
procfs_proc_unlock(p);
return (error);
}
xpid = atoi(pname, cnp->cn_namelen);
if (xpid != pfs->pfs_pid) {
procfs_proc_unlock(p);
return ENOENT;
}
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFStask, 0);
procfs_proc_unlock(p);
return error;
}
default:
return (ENOTDIR);
}
return (cnp->cn_nameiop == LOOKUP ? ENOENT : EROFS);
}
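/*
* Example (editor's walk-through of the lookup above): resolving
* "/proc/123/status" first hits the PFSroot case, where "123" is neither
* "curproc" nor "self", so atoi() turns it into pid 123 and a PFSproc vnode
* is allocated; the next lookup then runs the PFSproc case, matches "status"
* in proc_targets[] and allocates a PFSstatus vnode for the same pid.
*/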
int
procfs_validfile(struct lwp *l, struct mount *mp)
{
return l != NULL && l->l_proc != NULL && l->l_proc->p_textvp != NULL;
}
static int
procfs_validfile_linux(struct lwp *l, struct mount *mp)
{
return procfs_use_linux_compat(mp) &&
(l == NULL || l->l_proc == NULL || procfs_validfile(l, mp));
}
struct procfs_root_readdir_ctx {
struct uio *uiop;
off_t *cookies;
int ncookies;
off_t off;
off_t startoff;
int error;
};
static int
procfs_root_readdir_callback(struct proc *p, void *arg)
{
struct procfs_root_readdir_ctx *ctxp = arg;
struct dirent d;
struct uio *uiop;
int error;
uiop = ctxp->uiop;
if (uiop->uio_resid < UIO_MX)
return -1; /* no space */
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0)
return 0;
if (ctxp->off < ctxp->startoff) {
ctxp->off++;
return 0;
}
memset(&d, 0, UIO_MX);
d.d_reclen = UIO_MX;
d.d_fileno = PROCFS_FILENO(p->p_pid, PFSproc, -1);
d.d_namlen = snprintf(d.d_name,
UIO_MX - offsetof(struct dirent, d_name), "%ld", (long)p->p_pid);
d.d_type = DT_DIR;
mutex_exit(&proc_lock);
error = uiomove(&d, UIO_MX, uiop);
mutex_enter(&proc_lock);
if (error) {
ctxp->error = error;
return -1;
}
ctxp->ncookies++;
if (ctxp->cookies)
*(ctxp->cookies)++ = ctxp->off + 1;
ctxp->off++;
return 0;
}
/*
* readdir returns directory entries from pfsnode (vp).
*
* the strategy here with procfs is to generate a single
* directory entry at a time (struct dirent) and then
* copy that out to userland using uiomove. a more efficient,
* though more complex, implementation would try to minimize
* the number of calls to uiomove(). for procfs, this is
* hardly worth the added code complexity.
*
* this should just be done through read()
*/
int
procfs_readdir(void *v)
{
struct vop_readdir_args /* {
struct vnode *a_vp;
struct uio *a_uio;
kauth_cred_t a_cred;
int *a_eofflag;
off_t **a_cookies;
int *a_ncookies;
} */ *ap = v;
struct uio *uio = ap->a_uio;
struct dirent d;
struct pfsnode *pfs;
off_t i;
int error;
off_t *cookies = NULL;
int ncookies;
struct vnode *vp;
const struct proc_target *pt;
struct procfs_root_readdir_ctx ctx;
struct proc *p = NULL;
struct lwp *l;
int nfd;
int nc = 0;
vp = ap->a_vp;
pfs = VTOPFS(vp);
if (uio->uio_resid < UIO_MX)
return (EINVAL);
if (uio->uio_offset < 0)
return (EINVAL);
error = 0;
i = uio->uio_offset;
memset(&d, 0, UIO_MX);
d.d_reclen = UIO_MX;
ncookies = uio->uio_resid / UIO_MX;
switch (pfs->pfs_type) {
/*
* this is for the process-specific sub-directories.
* all that is needed is to copy out all the entries
* from the proc_targets[] table (top of this file).
*/
case PFSproc: {
if (i >= nproc_targets)
return 0;
if (procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH) != 0)
break;
if (ap->a_ncookies) {
ncookies = uimin(ncookies, (nproc_targets - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (pt = &proc_targets[i];
uio->uio_resid >= UIO_MX && i < nproc_targets; pt++, i++) {
if (pt->pt_valid) {
/* XXXSMP LWP can disappear */
mutex_enter(p->p_lock);
l = LIST_FIRST(&p->p_lwps);
KASSERT(l != NULL);
mutex_exit(p->p_lock);
if ((*pt->pt_valid)(l, vp->v_mount) == 0)
continue;
}
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
d.d_namlen = pt->pt_namlen;
memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
}
procfs_proc_unlock(p);
break;
}
case PFSfd: {
file_t *fp;
int lim;
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p,
ESRCH)) != 0)
return error;
/* XXX Should this be by file as well? */
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL,
NULL) != 0) {
procfs_proc_unlock(p);
return ESRCH;
}
nfd = atomic_load_consume(&p->p_fd->fd_dt)->dt_nfiles;
lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
if (i >= lim) {
procfs_proc_unlock(p);
return 0;
}
if (ap->a_ncookies) {
ncookies = uimin(ncookies, (nfd + 2 - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
pt = &proc_targets[i];
d.d_namlen = pt->pt_namlen;
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
(void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
nc++;
}
if (error)
goto out;
for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
/* check the descriptor exists */
if ((fp = fd_getfile2(p, i - 2)) == NULL)
continue;
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFSfd, i - 2);
d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
"%lld", (long long)(i - 2));
d.d_type = fttodt(fp);
closef(fp);	/* keep the reference until fttodt() has inspected fp */
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
nc++;
}
goto out;
}
case PFStask: {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p,
ESRCH)) != 0)
return error;
nfd = 3; /* ., .., pid */
if (ap->a_ncookies) {
ncookies = uimin(ncookies, (nfd + 2 - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
pt = &proc_targets[i];
d.d_namlen = pt->pt_namlen;
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
(void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
nc++;
}
if (error)
goto out;
for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
/* check the descriptor exists */
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFStask,
i - 2);
d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
"%ld", (long)pfs->pfs_pid);
d.d_type = DT_LNK;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
nc++;
}
goto out;
}
/*
* this is for the root of the procfs filesystem
* what is needed are special entries for "curproc"
* and "self" followed by an entry for each process
* on allproc.
*/
case PFSroot: {
if (ap->a_ncookies) {
/*
* XXX Potentially allocating too much space here,
* but I'm lazy. This loop needs some work.
*/
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
/* 0 ... 3 are static entries. */
for (; i <= 3 && uio->uio_resid >= UIO_MX; i++) {
switch (i) {
case 0: /* `.' */
case 1: /* `..' */
d.d_fileno = PROCFS_FILENO(0, PFSroot, -1);
d.d_namlen = i + 1;
memcpy(d.d_name, "..", d.d_namlen);
d.d_name[i + 1] = '\0';
d.d_type = DT_DIR;
break;
case 2:
d.d_fileno = PROCFS_FILENO(0, PFScurproc, -1);
d.d_namlen = sizeof("curproc") - 1;
memcpy(d.d_name, "curproc", sizeof("curproc"));
d.d_type = DT_LNK;
break;
case 3:
d.d_fileno = PROCFS_FILENO(0, PFSself, -1);
d.d_namlen = sizeof("self") - 1;
memcpy(d.d_name, "self", sizeof("self"));
d.d_type = DT_LNK;
break;
}
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
nc++;
if (cookies)
*cookies++ = i + 1;
}
if (error)
break;
/* 4 ... are process entries. */
ctx.uiop = uio;
ctx.error = 0;
ctx.off = 4;
ctx.startoff = i;
ctx.cookies = cookies;
ctx.ncookies = nc;
proclist_foreach_call(&allproc,
procfs_root_readdir_callback, &ctx);
cookies = ctx.cookies;
nc = ctx.ncookies;
error = ctx.error;
if (error)
break;
/* misc entries. */
if (i < ctx.off)
i = ctx.off;
if (i >= ctx.off + nproc_root_targets)
break;
error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH);
if (error)
break;
for (pt = &proc_root_targets[i - ctx.off];
uio->uio_resid >= UIO_MX &&
pt < &proc_root_targets[nproc_root_targets];
pt++, i++) {
if (pt->pt_valid &&
(*pt->pt_valid)(NULL, vp->v_mount) == 0)
continue;
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY),
NULL, NULL) != 0)
continue;
d.d_fileno = PROCFS_FILENO(0, pt->pt_pfstype, -1);
d.d_namlen = pt->pt_namlen;
memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
nc++;
if (cookies)
*cookies++ = i + 1;
}
out:
KASSERT(p != NULL);
ncookies = nc;
procfs_proc_unlock(p);
break;
}
default:
error = ENOTDIR;
break;
}
if (ap->a_ncookies) {
if (error) {
if (cookies)
free(*ap->a_cookies, M_TEMP);
*ap->a_ncookies = 0;
*ap->a_cookies = NULL;
} else
*ap->a_ncookies = ncookies;
}
uio->uio_offset = i;
return (error);
}
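/*
* From userland the dirent-at-a-time strategy above is invisible; a plain
* directory scan works as usual.  Minimal sketch (editor's illustration,
* not part of the original source):
*
*	#include <dirent.h>
*	#include <stdio.h>
*
*	int
*	main(void)
*	{
*		DIR *d = opendir("/proc");
*		struct dirent *de;
*
*		if (d == NULL)
*			return 1;
*		while ((de = readdir(d)) != NULL)
*			printf("%s (type %d)\n", de->d_name, de->d_type);
*		closedir(d);
*		return 0;
*	}
*/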
/*
* readlink reads the link of `curproc' and others
*/
int
procfs_readlink(void *v)
{
struct vop_readlink_args *ap = v;
char bf[16]; /* should be enough */
char *bp = bf;
char *path = NULL;
int len = 0;
int error = 0;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct proc *pown = NULL;
if (pfs->pfs_fileno == PROCFS_FILENO(0, PFScurproc, -1))
len = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
else if (pfs->pfs_fileno == PROCFS_FILENO(0, PFSself, -1))
len = snprintf(bf, sizeof(bf), "%s", "curproc");
else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFStask, 0))
len = snprintf(bf, sizeof(bf), "..");
else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSexe, -1)) {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
bp = pown->p_path;
len = strlen(bp);
} else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFScwd, -1) ||
pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSchroot, -1)) {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
if (path == NULL) {
procfs_proc_unlock(pown);
return (ENOMEM);
}
bp = path + MAXPATHLEN;
*--bp = '\0';
procfs_dir(PROCFS_TYPE(pfs->pfs_fileno), curlwp, pown,
&bp, path, MAXPATHLEN);
len = strlen(bp);
} else {
file_t *fp;
struct vnode *vxp;
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
fp = fd_getfile2(pown, pfs->pfs_fd);
if (fp == NULL) {
procfs_proc_unlock(pown);
return EBADF;
}
switch (fp->f_type) {
case DTYPE_VNODE:
vxp = fp->f_vnode;
if (vxp->v_type != VDIR &&
!procfs_proc_is_linux_compat()) {
error = EINVAL;
break;
}
if ((path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK))
== NULL) {
error = ENOMEM;
break;
}
bp = path + MAXPATHLEN;
*--bp = '\0';
/*
* XXX: kludge to avoid locking against ourselves
* in getcwd()
*/
if (vxp->v_tag == VT_PROCFS) {
*--bp = '/';
} else {
rw_enter(&curproc->p_cwdi->cwdi_lock,
RW_READER);
vp = curproc->p_cwdi->cwdi_rdir;
if (vp == NULL)
vp = rootvnode;
error = getcwd_common(vxp, vp, &bp, path,
MAXPATHLEN / 2, 0, curlwp);
rw_exit(&curproc->p_cwdi->cwdi_lock);
}
if (error)
break;
len = strlen(bp);
break;
case DTYPE_MISC:
len = snprintf(bf, sizeof(bf), "%s", "[misc]");
break;
case DTYPE_KQUEUE:
len = snprintf(bf, sizeof(bf), "%s", "[kqueue]");
break;
case DTYPE_SEM:
len = snprintf(bf, sizeof(bf), "%s", "[ksem]");
break;
default:
error = EINVAL;
break;
}
closef(fp);
}
if (error == 0)
error = uiomove(bp, len, ap->a_uio);
if (pown)
procfs_proc_unlock(pown);
if (path)
free(path, M_TEMP);
return error;
}
int
procfs_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
if ((ap->a_flags & PGO_LOCKED) == 0)
rw_exit(ap->a_vp->v_uobj.vmobjlock);
return (EFAULT);
}
/*
* convert decimal ascii to int
*/
static int
atoi(const char *b, size_t len)
{
int p = 0;
while (len--) {
char c = *b++;
if (c < '0' || c > '9')
return -1;
p = 10 * p + (c - '0');
}
return p;
}
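/*
* Editor's note on behaviour: atoi("123", 3) returns 123, and any non-digit
* makes the whole conversion fail, e.g. atoi("12x", 3) == -1.  An empty name
* (len == 0) yields 0, and overflow of very long digit strings is not
* detected; a bogus result simply makes the subsequent pid/fd lookup fail.
*/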
/**
* convert DTYPE_XXX to corresponding DT_XXX
* matching what procfs_loadvnode() does.
*/
static uint8_t
fttodt(file_t *fp)
{
switch (fp->f_type) {
case DTYPE_VNODE:
switch (fp->f_vnode->v_type) {
case VREG: return DT_REG;
case VDIR: return DT_LNK; /* symlink */
case VBLK: return DT_BLK;
case VCHR: return DT_CHR;
case VLNK: return DT_LNK;
case VSOCK: return DT_SOCK;
case VFIFO: return DT_FIFO;
default: return DT_UNKNOWN;
}
case DTYPE_PIPE: return DT_FIFO;
case DTYPE_SOCKET: return DT_SOCK;
case DTYPE_KQUEUE: /*FALLTHROUGH*/
case DTYPE_MISC: /*FALLTHROUGH*/
case DTYPE_SEM: return DT_LNK; /* symlinks */
default: return DT_UNKNOWN;
}
}
/* $NetBSD: pmap_private.h,v 1.5 2023/10/04 20:28:06 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _X86_PMAP_PRIVATE_H_
#define _X86_PMAP_PRIVATE_H_
#ifndef _MACHINE_PMAP_PRIVATE_H_X86
#error Include machine/pmap_private.h, not x86/pmap_private.h.
#endif
#ifdef _KERNEL_OPT
#include "opt_svs.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kcpuset.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <machine/cpufunc.h>
#include <machine/pte.h>
#include <machine/vmparam.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_pmap.h>
struct pmap;
#define SLAREA_USER 0
#define SLAREA_PTE 1
#define SLAREA_MAIN 2
#define SLAREA_PCPU 3
#define SLAREA_DMAP 4
#define SLAREA_HYPV 5
#define SLAREA_ASAN 6
#define SLAREA_MSAN 7
#define SLAREA_KERN 8
#define SLSPACE_NAREAS 9
struct slotspace {
struct {
size_t sslot; /* start slot */
size_t nslot; /* # of slots */
bool active; /* area is active */
} area[SLSPACE_NAREAS];
};
extern struct slotspace slotspace;
#include <x86/gdt.h>
struct pcpu_entry {
uint8_t gdt[MAXGDTSIZ];
uint8_t ldt[MAX_USERLDT_SIZE];
uint8_t idt[PAGE_SIZE];
uint8_t tss[PAGE_SIZE];
uint8_t ist0[PAGE_SIZE];
uint8_t ist1[PAGE_SIZE];
uint8_t ist2[PAGE_SIZE];
uint8_t ist3[PAGE_SIZE];
uint8_t rsp0[2 * PAGE_SIZE];
} __packed;
struct pcpu_area {
#ifdef SVS
uint8_t utls[PAGE_SIZE];
#endif
uint8_t ldt[PAGE_SIZE];
struct pcpu_entry ent[MAXCPUS];
} __packed;
extern struct pcpu_area *pcpuarea;
#define PMAP_PCID_KERN 0
#define PMAP_PCID_USER 1
/*
* pmap data structures: see pmap.c for details of locking.
*/
/*
* we maintain a list of all non-kernel pmaps
*/
LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */
/*
* linked list of all non-kernel pmaps
*/
extern struct pmap_head pmaps;
extern kmutex_t pmaps_lock; /* protects pmaps */
/*
* pool_cache(9) that pmaps are allocated from
*/
extern struct pool_cache pmap_cache;
/*
* the pmap structure
*
* note that the pm_obj contains the lock pointer, the reference count,
* page list, and number of PTPs within the pmap.
*
* pm_lock is the same as the lock for vm object 0. Changes to
* the other objects may only be made if that lock has been taken
* (the other object locks are only used when uvm_pagealloc is called)
*/
struct pv_page;
struct pmap {
struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1 */
LIST_ENTRY(pmap) pm_list; /* list of all pmaps */
pd_entry_t *pm_pdir; /* VA of PD */
paddr_t pm_pdirpa[PDP_SIZE]; /* PA of PDs (read-only after create) */
struct vm_page *pm_ptphint[PTP_LEVELS-1];
/* pointer to a PTP in our pmap */
struct pmap_statistics pm_stats; /* pmap stats */
struct pv_entry *pm_pve; /* spare pv_entry */
LIST_HEAD(, pv_page) pm_pvp_part;
LIST_HEAD(, pv_page) pm_pvp_empty;
LIST_HEAD(, pv_page) pm_pvp_full;
#if !defined(__x86_64__)
vaddr_t pm_hiexec; /* highest executable mapping */
#endif /* !defined(__x86_64__) */
union descriptor *pm_ldt; /* user-set LDT */
size_t pm_ldt_len; /* XXX unused, remove */
int pm_ldt_sel; /* LDT selector */
kcpuset_t *pm_cpus; /* mask of CPUs using pmap */
kcpuset_t *pm_kernel_cpus; /* mask of CPUs using kernel part
of pmap */
kcpuset_t *pm_xen_ptp_cpus; /* mask of CPUs which have this pmap's
ptp mapped */
long pm_pctr; /* for assertions */
LIST_HEAD(,vm_page) pm_gc_ptp; /* PTPs queued for free */
/* Used by NVMM and Xen */
int (*pm_enter)(struct pmap *, vaddr_t, paddr_t, vm_prot_t, u_int);
bool (*pm_extract)(struct pmap *, vaddr_t, paddr_t *);
void (*pm_remove)(struct pmap *, vaddr_t, vaddr_t);
int (*pm_sync_pv)(struct vm_page *, vaddr_t, paddr_t, int, uint8_t *,
pt_entry_t *);
void (*pm_pp_remove_ent)(struct pmap *, struct vm_page *, pt_entry_t,
vaddr_t);
void (*pm_write_protect)(struct pmap *, vaddr_t, vaddr_t, vm_prot_t);
void (*pm_unwire)(struct pmap *, vaddr_t);
void (*pm_tlb_flush)(struct pmap *);
void *pm_data;
kmutex_t pm_lock /* locks for pm_objs */
__aligned(64); /* give lock own cache line */
krwlock_t pm_dummy_lock; /* ugly hack for abusing uvm_object */
};
/* macro to access pm_pdirpa slots */
#ifdef PAE
#define pmap_pdirpa(pmap, index) \
((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t))
#else
#define pmap_pdirpa(pmap, index) \
((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t))
#endif
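/*
* Worked example (editor's note): without PAE, pmap_pdirpa(pm, 5) is simply
* pm->pm_pdirpa[0] + 5 * sizeof(pd_entry_t), i.e. the physical address of
* the sixth PDE.  With PAE the L2 directory spans several pages, so l2tol3()
* selects which pm_pdirpa[] page holds the slot and l2tol2() gives the
* offset within that page.
*/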
/*
* global kernel variables
*/
/*
* PDPpaddr is the physical address of the kernel's PDP.
* - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3
* value associated to the kernel process, proc0.
* - i386 PAE: it still represents the PA of the kernel's PDP (L2). Due to
* the L3 PD, it cannot be considered as the equivalent of a %cr3 any more.
* - Xen: it corresponds to the PFN of the kernel's PDP.
*/
extern u_long PDPpaddr;
extern pd_entry_t pmap_pg_g; /* do we support PTE_G? */
extern pd_entry_t pmap_pg_nx; /* do we support PTE_NX? */
extern int pmap_largepages;
extern long nkptp[PTP_LEVELS];
#define pmap_valid_entry(E) ((E) & PTE_P) /* is PDE or PTE valid? */
void pmap_map_ptes(struct pmap *, struct pmap **, pd_entry_t **,
pd_entry_t * const **);
void pmap_unmap_ptes(struct pmap *, struct pmap *);
bool pmap_pdes_valid(vaddr_t, pd_entry_t * const *, pd_entry_t *,
int *lastlvl);
bool pmap_is_curpmap(struct pmap *);
void pmap_ept_transform(struct pmap *);
#ifndef __HAVE_DIRECT_MAP
void pmap_vpage_cpu_init(struct cpu_info *);
#endif
vaddr_t slotspace_rand(int, size_t, size_t, size_t, vaddr_t);
vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
typedef enum tlbwhy {
TLBSHOOT_REMOVE_ALL,
TLBSHOOT_KENTER,
TLBSHOOT_KREMOVE,
TLBSHOOT_FREE_PTP,
TLBSHOOT_REMOVE_PTE,
TLBSHOOT_SYNC_PV,
TLBSHOOT_WRITE_PROTECT,
TLBSHOOT_ENTER,
TLBSHOOT_NVMM,
TLBSHOOT_BUS_DMA,
TLBSHOOT_BUS_SPACE,
TLBSHOOT__MAX,
} tlbwhy_t;
void pmap_tlb_init(void);
void pmap_tlb_cpu_init(struct cpu_info *);
void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, tlbwhy_t);
void pmap_tlb_shootnow(void);
void pmap_tlb_intr(void);
/*
* inline functions
*/
/*
* pmap_update_pg: flush one page from the TLB (or flush the whole thing
* if hardware doesn't support one-page flushing)
*/
__inline static void __unused
pmap_update_pg(vaddr_t va)
{
invlpg(va);
}
/*
* various address inlines
*
* vtopte: return a pointer to the PTE mapping a VA, works only for
* user and PT addresses
*
* kvtopte: return a pointer to the PTE mapping a kernel VA
*/
#include <lib/libkern/libkern.h>
static __inline pt_entry_t * __unused
vtopte(vaddr_t va)
{
KASSERT(va < VM_MIN_KERNEL_ADDRESS);
return (PTE_BASE + pl1_i(va));
}
static __inline pt_entry_t * __unused
kvtopte(vaddr_t va)
{
pd_entry_t *pde;
KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
pde = L2_BASE + pl2_i(va);
if (*pde & PTE_PS)
return ((pt_entry_t *)pde);
return (PTE_BASE + pl1_i(va));
}
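/*
* Usage sketch (editor's illustration, assuming an ordinary 4KB kernel
* mapping):
*
*	pt_entry_t pte = *kvtopte(va);
*
*	if (pmap_valid_entry(pte))
*		pa = (pte & PTE_FRAME) | (va & PAGE_MASK);
*
* For a superpage, kvtopte() returns the PDE itself (note the PTE_PS check
* above), so the frame bits then describe a large page, not a 4KB one.
*/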
#ifdef XENPV
#include <sys/bitops.h>
#define XPTE_MASK L1_FRAME
/* Selects the index of a PTE in (A)PTE_BASE */
#define XPTE_SHIFT (L1_SHIFT - ilog2(sizeof(pt_entry_t)))
/* PTE access inline functions */
/*
* Get the machine address of the pointed pte
* We use the hardware MMU to get the value, so this works only for levels 1-3
*/
static __inline paddr_t
xpmap_ptetomach(pt_entry_t *pte)
{
pt_entry_t *up_pte;
vaddr_t va = (vaddr_t) pte;
va = ((va & XPTE_MASK) >> XPTE_SHIFT) | (vaddr_t) PTE_BASE;
up_pte = (pt_entry_t *) va;
return (paddr_t) (((*up_pte) & PTE_FRAME) + (((vaddr_t) pte) & (~PTE_FRAME & ~VA_SIGN_MASK)));
}
/* Xen helpers to change bits of a pte */
#define XPMAP_UPDATE_DIRECT 1 /* Update direct map entry flags too */
paddr_t vtomach(vaddr_t);
#define vtomfn(va) (vtomach(va) >> PAGE_SHIFT)
#endif /* XENPV */
#ifdef __HAVE_PCPU_AREA
extern struct pcpu_area *pcpuarea;
#define PDIR_SLOT_PCPU 510
#define PMAP_PCPU_BASE (VA_SIGN_NEG((PDIR_SLOT_PCPU * NBPD_L4)))
#endif
void svs_quad_copy(void *, void *, long);
#ifdef _KERNEL_OPT
#include "opt_efi.h"
#endif
#ifdef EFI_RUNTIME
void * pmap_activate_sync(struct pmap *);
void pmap_deactivate_sync(struct pmap *, void *);
bool pmap_is_user(struct pmap *);
#else
static inline bool
pmap_is_user(struct pmap *pmap)
{
KASSERT(pmap != pmap_kernel());
return true;
}
#endif
#endif /* _X86_PMAP_PRIVATE_H_ */
/* $NetBSD: kern_mutex_obj.c,v 1.15 2023/10/02 21:03:55 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mutex_obj.c,v 1.15 2023/10/02 21:03:55 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/mutex.h>
#include <sys/kmem.h>
/* Mutex cache */
#define MUTEX_OBJ_MAGIC 0x5aa3c85d
struct kmutexobj {
kmutex_t mo_lock;
u_int mo_magic;
u_int mo_refcnt;
uint8_t mo_pad[COHERENCY_UNIT - sizeof(kmutex_t) -
sizeof(u_int) * 2];
};
/*
* mutex_obj_alloc:
*
* Allocate a single lock object, waiting for memory if needed.
*/
kmutex_t *
mutex_obj_alloc(kmutex_type_t type, int ipl)
{
struct kmutexobj *mo;
mo = kmem_intr_alloc(sizeof(*mo), KM_SLEEP);
KASSERT(ALIGNED_POINTER(mo, coherency_unit));
_mutex_init(&mo->mo_lock, type, ipl,
(uintptr_t)__builtin_return_address(0));
mo->mo_magic = MUTEX_OBJ_MAGIC;
mo->mo_refcnt = 1;
return (kmutex_t *)mo;
}
/*
* mutex_obj_tryalloc:
*
* Allocate a single lock object, failing if no memory available.
*/
kmutex_t *
mutex_obj_tryalloc(kmutex_type_t type, int ipl)
{
struct kmutexobj *mo;
mo = kmem_intr_alloc(sizeof(*mo), KM_NOSLEEP);
KASSERT(ALIGNED_POINTER(mo, coherency_unit));
if (__predict_true(mo != NULL)) {
_mutex_init(&mo->mo_lock, type, ipl,
(uintptr_t)__builtin_return_address(0));
mo->mo_magic = MUTEX_OBJ_MAGIC;
mo->mo_refcnt = 1;
}
return (kmutex_t *)mo;
}
/*
* mutex_obj_hold:
*
* Add a single reference to a lock object. A reference to the object
* must already be held, and must be held across this call.
*/
void
mutex_obj_hold(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC,
"%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)",
__func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC);
KASSERTMSG(mo->mo_refcnt > 0,
"%s: lock %p: mo->mo_refcnt (%#x) == 0",
__func__, mo, mo->mo_refcnt);
atomic_inc_uint(&mo->mo_refcnt);
}
/*
* mutex_obj_free:
*
* Drop a reference from a lock object. If the last reference is being
* dropped, free the object and return true. Otherwise, return false.
*/
bool
mutex_obj_free(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC,
"%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)",
__func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC);
KASSERTMSG(mo->mo_refcnt > 0,
"%s: lock %p: mo->mo_refcnt (%#x) == 0",
__func__, mo, mo->mo_refcnt);
membar_release();
if (atomic_dec_uint_nv(&mo->mo_refcnt) > 0) {
return false;
}
membar_acquire();
mutex_destroy(&mo->mo_lock);
kmem_intr_free(mo, sizeof(*mo));
return true;
}
/*
* mutex_obj_refcnt:
*
* Return the reference count on a lock object.
*/
u_int
mutex_obj_refcnt(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
return mo->mo_refcnt;
}
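/*
* Typical lifecycle (editor's sketch, not part of the original source):
*
*	kmutex_t *lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
*
*	mutex_obj_hold(lock);		(a second user takes its own reference)
*	mutex_enter(lock);		(used like any other kmutex_t)
*	mutex_exit(lock);
*	last = mutex_obj_free(lock);	(drops one reference)
*
* Each holder calls mutex_obj_free() exactly once; only the caller that gets
* "true" back knows the mutex has been destroyed and freed.
*/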
/* $NetBSD: if_media_80.c,v 1.5 2022/08/03 01:38:51 riastradh Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997
* Jonathan Stone and Jason R. Thorpe. All rights reserved.
*
* This software is derived from information provided by Matt Thomas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Jonathan Stone
* and Jason R. Thorpe for the NetBSD Project.
* 4. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_media_80.c,v 1.5 2022/08/03 01:38:51 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/syscallargs.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/if_media.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_mod.h>
static void
ifmword_n2o(int *oldwd, int *newwd)
{
if (IFM_SUBTYPE(*newwd) > IFM_OTHER)
*oldwd = (*newwd & ~(_IFM_ETH_XTMASK | IFM_TMASK)) | IFM_OTHER;
else
*oldwd = *newwd;
}
/*ARGSUSED*/
static int
compat_ifmediareq_pre(struct ifreq *ifr, u_long *cmd, bool *do_post)
{
struct ifmediareq *ifmr = (struct ifmediareq *)ifr;
switch (*cmd) {
case SIOCSIFMEDIA_80:
*cmd = SIOCSIFMEDIA; /* Convert to new one */
if ((IFM_TYPE(ifr->ifr_media) == IFM_ETHER) &&
IFM_SUBTYPE(ifr->ifr_media) > IFM_OTHER) {
/* Clear unused bits so we do not switch to the wrong media */
ifr->ifr_media &= ~_IFM_ETH_XTMASK;
}
return 0;
case SIOCGIFMEDIA_80:
*cmd = SIOCGIFMEDIA; /* Convert to new one */
if (ifmr->ifm_count != 0) {
/*
* Tell the upper layer to try to convert each ifmedia
* entry in the post process.
*/
*do_post = true;
}
return 0;
default:
return 0;
}
}
/*ARGSUSED*/
static int
compat_ifmediareq_post(struct ifreq *ifr, u_long cmd)
{
struct ifmediareq *ifmr = (struct ifmediareq *)ifr;
size_t minwords;
size_t count;
int error, *kptr;
switch (cmd) {
case SIOCSIFMEDIA:
return 0;
case SIOCGIFMEDIA:
if (ifmr->ifm_count < 0)
return EINVAL;
/*
* ifmr->ifm_count was already adjusted in ifmedia_ioctl(), so
* it is safe to trust ifm_count here.
*/
minwords = ifmr->ifm_count;
kptr = malloc(minwords * sizeof(*kptr), M_TEMP,
M_WAITOK|M_ZERO);
if (kptr == NULL)
return ENOMEM;
/*
* Convert ifm_current and ifm_active.
* It's not required to convert ifm_mask.
*/
ifmword_n2o(&ifmr->ifm_current, &ifmr->ifm_current);
ifmword_n2o(&ifmr->ifm_active, &ifmr->ifm_active);
/* Convert ifm_ulist array */
for (count = 0; count < minwords; count++) {
int oldmwd;
error = ufetch_int(&ifmr->ifm_ulist[count], &oldmwd);
if (error != 0)
goto out;
ifmword_n2o(&kptr[count], &oldmwd);
}
/* Copy to userland in old format */
error = copyout(kptr, ifmr->ifm_ulist,
minwords * sizeof(*kptr));
out:
free(kptr, M_TEMP);
return error;
default:
return 0;
}
}
void
ifmedia_80_init(void)
{
MODULE_HOOK_SET(ifmedia_80_pre_hook, compat_ifmediareq_pre);
MODULE_HOOK_SET(ifmedia_80_post_hook, compat_ifmediareq_post);
}
void
ifmedia_80_fini(void)
{
MODULE_HOOK_UNSET(ifmedia_80_post_hook);
MODULE_HOOK_UNSET(ifmedia_80_pre_hook);
}
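/*
 * Illustrative sketch (not compiled): the effect of ifmword_n2o() on a
 * single media word.  A new-style Ethernet subtype beyond IFM_OTHER has
 * no representation in the old ABI, so it is reported as IFM_OTHER with
 * the extended type bits stripped; any other word passes through
 * unchanged.  The value of "newwd" is hypothetical.
 */
#if 0
	int newwd = 0;	/* some new-style IFM_ETHER media word */
	int oldwd;

	ifmword_n2o(&oldwd, &newwd);
	/*
	 * If IFM_SUBTYPE(newwd) > IFM_OTHER, then
	 *   oldwd == ((newwd & ~(_IFM_ETH_XTMASK | IFM_TMASK)) | IFM_OTHER);
	 * otherwise oldwd == newwd.
	 */
#endif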
/* $NetBSD: ufs_rename.c,v 1.14 2021/10/20 03:08:19 thorpej Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* UFS Rename
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_rename.c,v 1.14 2021/10/20 03:08:19 thorpej Exp $");
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/vnode.h>
#include <sys/vnode_if.h>
#include <sys/wapbl.h>
#include <miscfs/genfs/genfs.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ufs/ufsmount.h>
/*
* Forward declarations
*/
static int ufs_sane_rename(struct vnode *, struct componentname *,
struct vnode *, struct componentname *,
kauth_cred_t, bool);
static bool ufs_rename_ulr_overlap_p(const struct ufs_lookup_results *,
const struct ufs_lookup_results *);
static int ufs_rename_recalculate_fulr(struct vnode *,
struct ufs_lookup_results *, const struct ufs_lookup_results *,
const struct componentname *);
static int ufs_direct_namlen(const struct direct *, const struct vnode *);
static int ufs_read_dotdot(struct vnode *, kauth_cred_t, ino_t *);
static int ufs_dirbuf_dotdot_namlen(const struct dirtemplate *,
const struct vnode *);
static const struct genfs_rename_ops ufs_genfs_rename_ops;
/*
* ufs_sane_rename: The hairiest vop, with the saner API.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tcnp (to component name),
* . cred (credentials structure), and
* . posixly_correct (flag for behaviour if target & source link same file).
*
* fdvp and tdvp may be the same, and must be referenced and unlocked.
*/
static int
ufs_sane_rename(
struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct)
{
struct ufs_lookup_results fulr, tulr;
return genfs_sane_rename(&ufs_genfs_rename_ops,
fdvp, fcnp, &fulr, tdvp, tcnp, &tulr,
cred, posixly_correct);
}
/*
* ufs_rename: The hairiest vop, with the insanest API. Defer to
* genfs_insane_rename immediately.
*/
int
ufs_rename(void *v)
{
return genfs_insane_rename(v, &ufs_sane_rename);
}
/*
* ufs_gro_directory_empty_p: Return true if the directory vp is
* empty. dvp is its parent.
*
* vp and dvp must be locked and referenced.
*/
bool
ufs_gro_directory_empty_p(struct mount *mp, kauth_cred_t cred,
struct vnode *vp, struct vnode *dvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != dvp);
KASSERT(vp->v_mount == mp);
KASSERT(dvp->v_mount == mp);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
return ufs_dirempty(VTOI(vp), VTOI(dvp)->i_number, cred);
}
/*
* ufs_gro_rename_check_possible: Check whether a rename is possible
* independent of credentials.
*/
int
ufs_gro_rename_check_possible(struct mount *mp,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
return genfs_ufslike_rename_check_possible(
VTOI(fdvp)->i_flags, VTOI(fvp)->i_flags,
VTOI(tdvp)->i_flags, (tvp? VTOI(tvp)->i_flags : 0),
(tvp != NULL),
IMMUTABLE, APPEND);
}
/*
* ufs_gro_rename_check_permitted: Check whether a rename is permitted
* given our credentials.
*/
int
ufs_gro_rename_check_permitted(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
return genfs_ufslike_rename_check_permitted(cred,
fdvp, VTOI(fdvp)->i_mode, VTOI(fdvp)->i_uid,
fvp, VTOI(fvp)->i_uid,
tdvp, VTOI(tdvp)->i_mode, VTOI(tdvp)->i_uid,
tvp, (tvp? VTOI(tvp)->i_uid : 0));
}
/*
* ufs_gro_remove_check_possible: Check whether a remove is possible
* independent of credentials.
*/
int
ufs_gro_remove_check_possible(struct mount *mp,
struct vnode *dvp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
return genfs_ufslike_remove_check_possible(
VTOI(dvp)->i_flags, VTOI(vp)->i_flags,
IMMUTABLE, APPEND);
}
/*
* ufs_gro_remove_check_permitted: Check whether a remove is permitted
* given our credentials.
*/
int
ufs_gro_remove_check_permitted(struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
return genfs_ufslike_remove_check_permitted(cred,
dvp, VTOI(dvp)->i_mode, VTOI(dvp)->i_uid, vp, VTOI(vp)->i_uid);
}
/*
* A virgin directory (no blushing please).
*
* XXX Copypasta from ufs_vnops.c. Kill!
*/
static const struct dirtemplate mastertemplate = {
0, 12, DT_DIR, 1, ".",
0, UFS_DIRBLKSIZ - 12, DT_DIR, 2, ".."
};
/*
* ufs_gro_rename: Actually perform the rename operation.
*/
int
ufs_gro_rename(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde, struct vnode *fvp,
struct vnode *tdvp, struct componentname *tcnp,
void *tde, struct vnode *tvp, nlink_t *tvp_nlinkp)
{
struct ufs_lookup_results *fulr = fde;
struct ufs_lookup_results *tulr = tde;
bool directory_p, reparent_p;
struct direct *newdir;
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fulr != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* We shall need to temporarily bump the link count, so make
* sure there is room to do so.
*/
if ((nlink_t)VTOI(fvp)->i_nlink >= LINK_MAX)
return EMLINK;
directory_p = (fvp->v_type == VDIR);
KASSERT(directory_p == ((VTOI(fvp)->i_mode & IFMT) == IFDIR));
KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR)));
KASSERT((tvp == NULL) ||
(directory_p == ((VTOI(tvp)->i_mode & IFMT) == IFDIR)));
reparent_p = (fdvp != tdvp);
KASSERT(reparent_p == (VTOI(fdvp)->i_number != VTOI(tdvp)->i_number));
/*
* Commence hacking of the data on disk.
*/
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto ihateyou;
/*
* 1) Bump link count while we're moving stuff
* around. If we crash somewhere before
* completing our work, the link count
* may be wrong, but correctable.
*/
KASSERT((nlink_t)VTOI(fvp)->i_nlink < LINK_MAX);
VTOI(fvp)->i_nlink++;
DIP_ASSIGN(VTOI(fvp), nlink, VTOI(fvp)->i_nlink);
VTOI(fvp)->i_flag |= IN_CHANGE;
error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP);
if (error)
goto whymustithurtsomuch;
/*
* 2) If target doesn't exist, link the target
* to the source and unlink the source.
* Otherwise, rewrite the target directory
* entry to reference the source inode and
* expunge the original entry's existence.
*/
if (tvp == NULL) {
/*
* Account for ".." in new directory.
* When source and destination have the same
* parent we don't fool with the link count.
*/
if (directory_p && reparent_p) {
if ((nlink_t)VTOI(tdvp)->i_nlink >= LINK_MAX) {
error = EMLINK;
goto whymustithurtsomuch;
}
KASSERT((nlink_t)VTOI(tdvp)->i_nlink < LINK_MAX);
VTOI(tdvp)->i_nlink++;
DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
error = UFS_UPDATE(tdvp, NULL, NULL, UPDATE_DIROP);
if (error) {
/*
* Link count update didn't take --
* back out the in-memory link count.
*/
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink,
VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
goto whymustithurtsomuch;
}
}
newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
ufs_makedirentry(VTOI(fvp), tcnp, newdir);
error = ufs_direnter(tdvp, tulr, NULL, newdir, tcnp, NULL);
pool_cache_put(ufs_direct_cache, newdir);
if (error) {
if (directory_p && reparent_p) {
/*
* Directory update didn't take, but
* the link count update did -- back
* out the in-memory link count and the
* on-disk link count.
*/
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink,
VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
(void)UFS_UPDATE(tdvp, NULL, NULL,
UPDATE_WAIT | UPDATE_DIROP);
}
goto whymustithurtsomuch;
}
} else {
if (directory_p)
/* XXX WTF? Why purge here? Why not purge others? */
cache_purge(tdvp);
/*
* Make the target directory's entry for tcnp point at
* the source node.
*
* XXX ufs_dirrewrite decrements tvp's link count, but
* doesn't touch the link count of the new inode. Go
* figure.
*/
error = ufs_dirrewrite(VTOI(tdvp), tulr->ulr_offset,
VTOI(tvp), VTOI(fvp)->i_number, IFTODT(VTOI(fvp)->i_mode),
((directory_p && reparent_p) ? reparent_p : directory_p),
IN_CHANGE | IN_UPDATE);
if (error)
goto whymustithurtsomuch;
/*
* If the source and target are directories, and the
* target is in the same directory as the source,
* decrement the link count of the common parent
* directory, since we are removing the target from
* that directory.
*/
if (directory_p && !reparent_p) {
KASSERT(fdvp == tdvp);
/* XXX check, don't kassert */
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0);
}
if (directory_p) {
/*
* XXX I don't understand the following comment
* from ufs_rename -- in particular, the part
* about `there may be other hard links'.
*
* Truncate inode. The only stuff left in the directory
* is "." and "..". The "." reference is inconsequential
* since we are quashing it. We have removed the "."
* reference and the reference in the parent directory,
* but there may be other hard links.
*
* XXX The ufs_dirempty call earlier does
* not guarantee anything about nlink.
*/
if (VTOI(tvp)->i_nlink != 1)
ufs_dirbad(VTOI(tvp), (doff_t)0,
"hard-linked directory");
VTOI(tvp)->i_nlink = 0;
DIP_ASSIGN(VTOI(tvp), nlink, 0);
(void) UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC, cred);
}
}
/*
* If the source is a directory with a new parent, the link
* count of the old parent directory must be decremented and
* ".." set to point to the new parent.
*
* XXX ufs_dirrewrite updates the link count of fdvp, but not
* the link count of fvp or the link count of tdvp. Go figure.
*/
if (directory_p && reparent_p) {
error = ufs_dirrewrite(VTOI(fvp), mastertemplate.dot_reclen,
VTOI(fdvp), VTOI(tdvp)->i_number, DT_DIR, 0, IN_CHANGE);
#if 0 /* XXX This branch was not in ufs_rename! */
if (error)
goto whymustithurtsomuch;
#endif
/* XXX WTF? Why purge here? Why not purge others? */
cache_purge(fdvp);
}
/*
* 3) Unlink the source.
*/
/*
* ufs_direnter may compact the directory in the process of
* inserting a new entry. That may invalidate fulr, which we
* need in order to remove the old entry. In that case, we
* need to recalculate what fulr should be.
*/
if (!reparent_p && (tvp == NULL) &&
ufs_rename_ulr_overlap_p(fulr, tulr)) {
error = ufs_rename_recalculate_fulr(fdvp, fulr, tulr, fcnp);
#if 0 /* XXX */
if (error) /* XXX Try to back out changes? */
goto whymustithurtsomuch;
#endif
}
/*
* XXX 0 means !isrmdir. But can't this be an rmdir?
* XXX Well, turns out that argument to ufs_dirremove is ignored...
* XXX And it turns out ufs_dirremove updates the link count of fvp.
* XXX But it doesn't update the link count of fdvp. Go figure.
* XXX fdvp's link count is updated in ufs_dirrewrite instead.
* XXX Actually, sometimes it doesn't update fvp's link count.
* XXX I hate the world.
*/
error = ufs_dirremove(fdvp, fulr, VTOI(fvp), fcnp->cn_flags, 0);
if (error)
#if 0 /* XXX */
goto whymustithurtsomuch;
#endif
goto arghmybrainhurts;
if (tvp != NULL) {
*tvp_nlinkp = VTOI(tvp)->i_nlink;
}
#if 0 /* XXX */
genfs_rename_cache_purge(fdvp, fvp, tdvp, tvp);
#endif
goto arghmybrainhurts;
whymustithurtsomuch:
KASSERT(0 < VTOI(fvp)->i_nlink);
VTOI(fvp)->i_nlink--;
DIP_ASSIGN(VTOI(fvp), nlink, VTOI(fvp)->i_nlink);
VTOI(fvp)->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0);
arghmybrainhurts:
UFS_WAPBL_END(mp);
ihateyou:
return error;
}
/*
* ufs_rename_ulr_overlap_p: True iff tulr overlaps with fulr so that
* entering a directory entry at tulr may move fulr.
*/
static bool
ufs_rename_ulr_overlap_p(const struct ufs_lookup_results *fulr,
const struct ufs_lookup_results *tulr)
{
doff_t from_prev_start, from_prev_end, to_start, to_end;
KASSERT(fulr != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
/*
* fulr is from a DELETE lookup, so fulr->ulr_count is the size
* of the preceding entry (d_reclen).
*/
from_prev_end = fulr->ulr_offset;
KASSERT(fulr->ulr_count <= from_prev_end);
from_prev_start = (from_prev_end - fulr->ulr_count);
/*
* tulr is from a RENAME lookup, so tulr->ulr_count is the size
* of the free space for an entry that we are about to fill.
*/
to_start = tulr->ulr_offset;
KASSERT(tulr->ulr_count < (UFS_MAXDIRSIZE - to_start));
to_end = (to_start + tulr->ulr_count);
return
(((to_start <= from_prev_start) && (from_prev_start < to_end)) ||
((to_start <= from_prev_end) && (from_prev_end < to_end)));
}
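/*
 * A concrete (hypothetical) example of the overlap test above: if the
 * entry to be removed is at fulr->ulr_offset == 512 and the entry
 * preceding it has d_reclen == 24, the preceding entry occupies
 * [488, 512).  A target slot with tulr->ulr_offset == 480 and
 * tulr->ulr_count == 64 covers [480, 544), which contains 488, so the
 * test returns true and fulr must be recalculated after ufs_direnter
 * compacts the block.  A target slot covering [1024, 1088) contains
 * neither 488 nor 512, so the test returns false.
 */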
/*
* ufs_rename_recalculate_fulr: If we have just entered a directory into
* dvp at tulr, and we were about to remove one at fulr for an entry
* named fcnp, fulr may be invalid. So, if necessary, recalculate it.
*/
static int
ufs_rename_recalculate_fulr(struct vnode *dvp,
struct ufs_lookup_results *fulr, const struct ufs_lookup_results *tulr,
const struct componentname *fcnp)
{
struct mount *mp;
struct ufsmount *ump;
int needswap;
/* XXX int is a silly type for this; blame ufsmount::um_dirblksiz. */
int dirblksiz;
doff_t search_start, search_end;
doff_t offset; /* Offset of entry we're examining. */
struct buf *bp; /* I/O block we're examining. */
char *dirbuf; /* Pointer into directory at search_start. */
struct direct *ep; /* Pointer to the entry we're examining. */
/* XXX direct::d_reclen is 16-bit;
* ufs_lookup_results::ulr_reclen is 32-bit. Blah. */
uint32_t reclen; /* Length of the entry we're examining. */
uint32_t prev_reclen; /* Length of the preceding entry. */
int error;
KASSERT(dvp != NULL);
KASSERT(dvp->v_mount != NULL);
KASSERT(VTOI(dvp) != NULL);
KASSERT(fulr != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
KASSERT(ufs_rename_ulr_overlap_p(fulr, tulr));
mp = dvp->v_mount;
ump = VFSTOUFS(mp);
KASSERT(ump != NULL);
KASSERT(ump == VTOI(dvp)->i_ump);
needswap = UFS_MPNEEDSWAP(ump);
dirblksiz = ump->um_dirblksiz;
KASSERT(0 < dirblksiz);
KASSERT((dirblksiz & (dirblksiz - 1)) == 0);
/* A directory block may not span across multiple I/O blocks. */
KASSERT(dirblksiz <= mp->mnt_stat.f_iosize);
/* Find the bounds of the search. */
search_start = tulr->ulr_offset;
KASSERT(fulr->ulr_reclen < (UFS_MAXDIRSIZE - fulr->ulr_offset));
search_end = (fulr->ulr_offset + fulr->ulr_reclen);
/* Compaction must happen only within a directory block. (*) */
KASSERT(search_start <= search_end);
KASSERT((search_end - (search_start &~ (dirblksiz - 1))) <= dirblksiz);
dirbuf = NULL;
bp = NULL;
error = ufs_blkatoff(dvp, (off_t)search_start, &dirbuf, &bp, false);
if (error)
return error;
KASSERT(dirbuf != NULL);
KASSERT(bp != NULL);
/*
* Guarantee we sha'n't go past the end of the buffer we got.
* dirbuf is bp->b_data + (search_start & (iosize - 1)), and
* the valid range is [bp->b_data, bp->b_data + bp->b_bcount).
*/
KASSERT((search_end - search_start) <=
(bp->b_bcount - (search_start & (mp->mnt_stat.f_iosize - 1))));
prev_reclen = fulr->ulr_count;
offset = search_start;
/*
* Search from search_start to search_end for the entry matching
* fcnp, which must be there because we found it before and, at
* most, it can only have moved to an earlier offset.
*/
for (;;) {
KASSERT(search_start <= offset);
KASSERT(offset < search_end);
/*
* Examine the directory entry at offset.
*/
ep = (struct direct *)(dirbuf + (offset - search_start));
reclen = ufs_rw16(ep->d_reclen, needswap);
if (ep->d_ino == 0)
goto next; /* Entry is unused. */
if (ufs_rw32(ep->d_ino, needswap) == UFS_WINO)
goto next; /* Entry is whiteout. */
if (fcnp->cn_namelen != ufs_direct_namlen(ep, dvp))
goto next; /* Wrong name length. */
if (memcmp(ep->d_name, fcnp->cn_nameptr, fcnp->cn_namelen))
goto next; /* Wrong name. */
/* Got it! */
break;
next:
if (! ((reclen < search_end) &&
(offset < (search_end - reclen)))) {
brelse(bp, 0);
return EIO; /* XXX Panic? What? */
}
/* We may not move past the search end. */
KASSERT(reclen < search_end);
KASSERT(offset < (search_end - reclen));
/*
* We may not move across a directory block boundary;
* see (*) above.
*/
KASSERT((offset &~ (dirblksiz - 1)) ==
((offset + reclen) &~ (dirblksiz - 1)));
prev_reclen = reclen;
offset += reclen;
}
/*
* Found the entry. Record where.
*/
fulr->ulr_offset = offset;
fulr->ulr_reclen = reclen;
/*
* Record the preceding record length, but not if we're at the
* start of a directory block.
*/
fulr->ulr_count = ((offset & (dirblksiz - 1))? prev_reclen : 0);
brelse(bp, 0);
return 0;
}
/*
* ufs_direct_namlen: Return the namlen of the directory entry ep from
* the directory vp.
*/
static int /* XXX int? uint8_t? */
ufs_direct_namlen(const struct direct *ep, const struct vnode *vp)
{
bool swap;
KASSERT(ep != NULL);
KASSERT(vp != NULL);
KASSERT(VTOI(vp) != NULL);
KASSERT(VTOI(vp)->i_ump != NULL);
#if (BYTE_ORDER == LITTLE_ENDIAN)
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) == 0);
#else
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) != 0);
#endif
return ((FSFMT(vp) && swap)? ep->d_type : ep->d_namlen);
}
/*
* ufs_gro_remove: Rename an object over another link to itself,
* effectively removing just the original link.
*/
int
ufs_gro_remove(struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp,
nlink_t *tvp_nlinkp)
{
struct ufs_lookup_results *ulr = de;
int error;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(ulr != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(cnp->cn_nameiop == DELETE);
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto out;
/* XXX ufs_dirremove decrements vp's link count for us. */
error = ufs_dirremove(dvp, ulr, VTOI(vp), cnp->cn_flags, 0);
UFS_WAPBL_END(mp);
*tvp_nlinkp = VTOI(vp)->i_nlink;
out:
return error;
}
/*
* ufs_gro_lookup: Look up and save the lookup results.
*/
int
ufs_gro_lookup(struct mount *mp, struct vnode *dvp,
struct componentname *cnp, void *de_ret, struct vnode **vp_ret)
{
struct ufs_lookup_results *ulr_ret = de_ret;
struct vnode *vp = NULL;
int error;
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(ulr_ret != NULL);
KASSERT(vp_ret != NULL);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
/* Kludge cargo-culted from dholland's ufs_rename. */
cnp->cn_flags &=~ MODMASK;
cnp->cn_flags |= (LOCKPARENT | LOCKLEAF);
error = relookup(dvp, &vp, cnp, 0 /* dummy */);
if ((error == 0) && (vp == NULL)) {
error = ENOENT;
goto out;
} else if (error) {
return error;
}
/*
* Thanks to VFS insanity, relookup locks vp, which screws us
* in various ways.
*/
KASSERT(vp != NULL);
VOP_UNLOCK(vp);
out: *ulr_ret = VTOI(dvp)->i_crap;
*vp_ret = vp;
return error;
}
/*
* ufs_rmdired_p: Check whether the directory vp has been rmdired.
*
* vp must be locked and referenced.
*/
static bool
ufs_rmdired_p(struct vnode *vp)
{
KASSERT(vp != NULL);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(vp->v_type == VDIR);
/* XXX Is this correct? */
return (VTOI(vp)->i_size == 0);
}
/*
* ufs_read_dotdot: Store in *ino_ret the inode number of the parent
* of the directory vp.
*/
static int
ufs_read_dotdot(struct vnode *vp, kauth_cred_t cred, ino_t *ino_ret)
{
struct dirtemplate dirbuf;
int error;
KASSERT(vp != NULL);
KASSERT(ino_ret != NULL);
KASSERT(vp->v_type == VDIR);
error = ufs_bufio(UIO_READ, vp, &dirbuf, sizeof dirbuf, (off_t)0,
IO_NODELOCKED, cred, NULL, NULL);
if (error)
return error;
if (ufs_dirbuf_dotdot_namlen(&dirbuf, vp) != 2 ||
dirbuf.dotdot_name[0] != '.' ||
dirbuf.dotdot_name[1] != '.')
/* XXX Panic? Print warning? */
return ENOTDIR;
*ino_ret = ufs_rw32(dirbuf.dotdot_ino,
UFS_MPNEEDSWAP(VTOI(vp)->i_ump));
return 0;
}
/*
* ufs_dirbuf_dotdot_namlen: Return the namlen of the directory buffer
* dirbuf that came from the directory vp. Swap byte order if
* necessary.
*/
static int /* XXX int? uint8_t? */
ufs_dirbuf_dotdot_namlen(const struct dirtemplate *dirbuf,
const struct vnode *vp)
{
bool swap;
KASSERT(dirbuf != NULL);
KASSERT(vp != NULL);
KASSERT(VTOI(vp) != NULL);
KASSERT(VTOI(vp)->i_ump != NULL);
#if (BYTE_ORDER == LITTLE_ENDIAN)
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) == 0);
#else
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) != 0);
#endif
return ((FSFMT(vp) && swap)?
dirbuf->dotdot_type : dirbuf->dotdot_namlen);
}
/*
* ufs_gro_genealogy: Analyze the genealogy of the source and target
* directories.
*/
int
ufs_gro_genealogy(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct vnode *tdvp,
struct vnode **intermediate_node_ret)
{
struct vnode *vp, *dvp;
ino_t dotdot_ino = 0; /* XXX: gcc */
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != tdvp);
KASSERT(intermediate_node_ret != NULL);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
/*
* We need to provisionally lock tdvp to keep rmdir from
* deleting it -- or any ancestor -- at an inopportune moment.
*/
error = ufs_gro_lock_directory(mp, tdvp);
if (error)
return error;
vp = tdvp;
vref(vp);
for (;;) {
KASSERT(vp != NULL);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(vp->v_mount == mp);
KASSERT(vp->v_type == VDIR);
KASSERT(!ufs_rmdired_p(vp));
/* Did we hit the root without finding fdvp? */
if (VTOI(vp)->i_number == UFS_ROOTINO) {
vput(vp);
*intermediate_node_ret = NULL;
return 0;
}
error = ufs_read_dotdot(vp, cred, &dotdot_ino);
if (error) {
vput(vp);
return error;
}
/* Did we find that fdvp is an ancestor of tdvp? */
if (VTOI(fdvp)->i_number == dotdot_ino) {
/* Unlock vp, but keep it referenced. */
VOP_UNLOCK(vp);
*intermediate_node_ret = vp;
return 0;
}
/* Neither -- keep ascending the family tree. */
error = vcache_get(mp, &dotdot_ino, sizeof(dotdot_ino), &dvp);
vput(vp);
if (error)
return error;
error = vn_lock(dvp, LK_EXCLUSIVE);
if (error) {
vrele(dvp);
return error;
}
KASSERT(dvp != NULL);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
vp = dvp;
if (vp->v_type != VDIR) {
/*
* XXX Panic? Print a warning? Can this
* happen if we lose the race I suspect to
* exist above, and the `..' inode number has
* been recycled?
*/
vput(vp);
return ENOTDIR;
}
if (ufs_rmdired_p(vp)) {
vput(vp);
return ENOENT;
}
}
}
/*
* ufs_gro_lock_directory: Lock the directory vp, but fail if it has
* been rmdir'd.
*/
int
ufs_gro_lock_directory(struct mount *mp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(vp != NULL);
KASSERT(vp->v_mount == mp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (ufs_rmdired_p(vp)) {
VOP_UNLOCK(vp);
return ENOENT;
}
return 0;
}
static const struct genfs_rename_ops ufs_genfs_rename_ops = {
.gro_directory_empty_p = ufs_gro_directory_empty_p,
.gro_rename_check_possible = ufs_gro_rename_check_possible,
.gro_rename_check_permitted = ufs_gro_rename_check_permitted,
.gro_remove_check_possible = ufs_gro_remove_check_possible,
.gro_remove_check_permitted = ufs_gro_remove_check_permitted,
.gro_rename = ufs_gro_rename,
.gro_remove = ufs_gro_remove,
.gro_lookup = ufs_gro_lookup,
.gro_genealogy = ufs_gro_genealogy,
.gro_lock_directory = ufs_gro_lock_directory,
};
/* $NetBSD: uvm_page_status.c,v 1.6 2020/08/14 09:06:15 chs Exp $ */
/*-
* Copyright (c)2011 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page_status.c,v 1.6 2020/08/14 09:06:15 chs Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
/*
* page dirtiness status tracking
*
* separated from uvm_page.c mainly for rump
*/
/*
* these constants are chosen to match so that we can convert between
* them quickly.
*/
__CTASSERT(UVM_PAGE_STATUS_UNKNOWN == 0);
__CTASSERT(UVM_PAGE_STATUS_DIRTY == PG_DIRTY);
__CTASSERT(UVM_PAGE_STATUS_CLEAN == PG_CLEAN);
/*
* uvm_pagegetdirty: return the dirtiness status (one of UVM_PAGE_STATUS_
* values) of the page.
*
* called with the owner locked.
*/
unsigned int
uvm_pagegetdirty(struct vm_page *pg)
{
struct uvm_object * const uobj __diagused = pg->uobject;
KASSERT((~pg->flags & (PG_CLEAN|PG_DIRTY)) != 0);
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(uobj == NULL ||
((pg->flags & PG_CLEAN) == 0) == uvm_obj_page_dirty_p(pg));
return pg->flags & (PG_CLEAN|PG_DIRTY);
}
/*
* uvm_pagemarkdirty: set the dirtiness status (one of UVM_PAGE_STATUS_ values)
* of the page.
*
* called with the owner locked.
*
* update the radix tree tag for object-owned page.
*
* if new status is UVM_PAGE_STATUS_UNKNOWN, clear pmap-level dirty bit
* so that later uvm_pagecheckdirty() can notice modifications on the page.
*/
void
uvm_pagemarkdirty(struct vm_page *pg, unsigned int newstatus)
{
struct uvm_object * const uobj = pg->uobject;
const unsigned int oldstatus = uvm_pagegetdirty(pg);
enum cpu_count base;
KASSERT((~newstatus & (PG_CLEAN|PG_DIRTY)) != 0);
KASSERT((newstatus & ~(PG_CLEAN|PG_DIRTY)) == 0);
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(uobj == NULL ||
((pg->flags & PG_CLEAN) == 0) == uvm_obj_page_dirty_p(pg));
if (oldstatus == newstatus) {
return;
}
/*
* set UVM_PAGE_DIRTY_TAG tag unless known CLEAN so that putpages can
* find possibly-dirty pages quickly.
*/
if (uobj != NULL) {
if (newstatus == UVM_PAGE_STATUS_CLEAN) {
uvm_obj_page_clear_dirty(pg);
} else if (oldstatus == UVM_PAGE_STATUS_CLEAN) {
/*
* on first dirty page, mark the object dirty.
* for vnodes this inserts to the syncer worklist.
*/
if (uvm_obj_clean_p(uobj) &&
uobj->pgops->pgo_markdirty != NULL) {
(*uobj->pgops->pgo_markdirty)(uobj);
}
uvm_obj_page_set_dirty(pg);
}
}
if (newstatus == UVM_PAGE_STATUS_UNKNOWN) {
/*
* start relying on pmap-level dirtiness tracking.
*/
pmap_clear_modify(pg);
}
pg->flags &= ~(PG_CLEAN|PG_DIRTY);
pg->flags |= newstatus;
KASSERT(uobj == NULL || ((pg->flags & PG_CLEAN) == 0) ==
uvm_obj_page_dirty_p(pg));
if ((pg->flags & PG_STAT) != 0) {
if ((pg->flags & PG_SWAPBACKED) != 0) {
base = CPU_COUNT_ANONUNKNOWN;
} else {
base = CPU_COUNT_FILEUNKNOWN;
}
kpreempt_disable();
CPU_COUNT(base + oldstatus, -1);
CPU_COUNT(base + newstatus, +1);
kpreempt_enable();
}
}
/*
* uvm_pagecheckdirty: check if page is dirty, and remove its dirty bit.
*
* called with the owner locked.
*
* returns whether the page was dirty.
*
* if protected is true, mark the page CLEAN. otherwise, mark the page UNKNOWN.
* ("mark" in the sense of uvm_pagemarkdirty().)
*/
bool
uvm_pagecheckdirty(struct vm_page *pg, bool pgprotected)
{
const unsigned int oldstatus = uvm_pagegetdirty(pg);
bool modified;
KASSERT(uvm_page_owner_locked_p(pg, true));
/*
* if pgprotected is true, mark the page CLEAN.
* otherwise mark the page UNKNOWN unless it's CLEAN.
*
* possible transitions:
*
* CLEAN -> CLEAN , modified = false
* UNKNOWN -> UNKNOWN, modified = true
* UNKNOWN -> UNKNOWN, modified = false
* UNKNOWN -> CLEAN , modified = true
* UNKNOWN -> CLEAN , modified = false
* DIRTY -> UNKNOWN, modified = true
* DIRTY -> CLEAN , modified = true
*
* pmap_clear_modify is necessary if either of
* oldstatus or newstatus is UVM_PAGE_STATUS_UNKNOWN.
*/
if (oldstatus == UVM_PAGE_STATUS_CLEAN) {
modified = false;
} else {
const unsigned int newstatus = pgprotected ?
UVM_PAGE_STATUS_CLEAN : UVM_PAGE_STATUS_UNKNOWN;
if (oldstatus == UVM_PAGE_STATUS_DIRTY) {
modified = true;
if (newstatus == UVM_PAGE_STATUS_UNKNOWN) {
pmap_clear_modify(pg);
}
} else {
KASSERT(oldstatus == UVM_PAGE_STATUS_UNKNOWN);
modified = pmap_clear_modify(pg);
}
uvm_pagemarkdirty(pg, newstatus);
}
return modified;
}
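/*
 * Illustrative sketch (not compiled): how a pageout path might use
 * uvm_pagecheckdirty().  The page is write-protected first, so passing
 * pgprotected == true allows the page to be marked CLEAN; a true return
 * means the page was modified and still needs to be written out.  The
 * surrounding locking and bookkeeping are omitted and hypothetical.
 */
#if 0
	bool need_write;

	pmap_page_protect(pg, VM_PROT_READ);	/* block further writes */
	need_write = uvm_pagecheckdirty(pg, true);
	if (need_write) {
		/* queue the page for I/O */
	}
#endif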
/* $NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
*/
/*
* System calls on descriptors.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_readahead.h>
/*
* Duplicate a file descriptor.
*/
int
sys_dup(struct lwp *l, const struct sys_dup_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
} */
int error, newfd, oldfd;
file_t *fp;
oldfd = SCARG(uap, fd);
if ((fp = fd_getfile(oldfd)) == NULL) {
return EBADF;
}
error = fd_dup(fp, 0, &newfd, false);
fd_putfile(oldfd);
*retval = newfd;
return error;
}
/*
* Duplicate a file descriptor to a particular value.
*/
int
dodup(struct lwp *l, int from, int to, int flags, register_t *retval)
{
int error;
file_t *fp;
if ((fp = fd_getfile(from)) == NULL)
return EBADF;
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
fd_putfile(from);
if ((u_int)to >= curproc->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
(u_int)to >= maxfiles)
error = EBADF;
else if (from == to)
error = 0;
else
error = fd_dup2(fp, to, flags);
closef(fp);
*retval = to;
return error;
}
int
sys_dup3(struct lwp *l, const struct sys_dup3_args *uap, register_t *retval)
{
/* {
syscallarg(int) from;
syscallarg(int) to;
syscallarg(int) flags;
} */
return dodup(l, SCARG(uap, from), SCARG(uap, to), SCARG(uap, flags),
retval);
}
int
sys_dup2(struct lwp *l, const struct sys_dup2_args *uap, register_t *retval)
{
/* {
syscallarg(int) from;
syscallarg(int) to;
} */
return dodup(l, SCARG(uap, from), SCARG(uap, to), 0, retval);
}
/*
* fcntl call which is being passed to the file's fs.
*/
static int
fcntl_forfs(int fd, file_t *fp, int cmd, void *arg)
{
int error;
u_int size;
void *data, *memp;
#define STK_PARAMS 128
char stkbuf[STK_PARAMS];
if ((fp->f_flag & (FREAD | FWRITE)) == 0)
return (EBADF);
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
size = (size_t)F_PARAM_LEN(cmd);
if (size > F_PARAM_MAX)
return (EINVAL);
memp = NULL;
if (size > sizeof(stkbuf)) {
memp = kmem_alloc(size, KM_SLEEP);
data = memp;
} else
data = stkbuf;
if (cmd & F_FSIN) {
if (size) {
error = copyin(arg, data, size);
if (error) {
if (memp)
kmem_free(memp, size);
return (error);
}
} else
*(void **)data = arg;
} else if ((cmd & F_FSOUT) != 0 && size != 0) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
memset(data, 0, size);
} else if (cmd & F_FSVOID)
*(void **)data = arg;
error = (*fp->f_ops->fo_fcntl)(fp, cmd, data);
/*
* Copy any data to user, size was
* already set and checked above.
*/
if (error == 0 && (cmd & F_FSOUT) && size)
error = copyout(data, arg, size);
if (memp)
kmem_free(memp, size);
return (error);
}
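/*
 * Illustrative sketch (not compiled): how fcntl_forfs() above decodes a
 * file-system-private command word.  The direction bits and the
 * parameter length are carried in the command itself; "cmd" is a
 * hypothetical F_FSCTL-style command.
 */
#if 0
	u_int size = F_PARAM_LEN(cmd);		/* bytes of argument data */
	bool copies_in = (cmd & F_FSIN) != 0;	/* copyin before fo_fcntl */
	bool copies_out = (cmd & F_FSOUT) != 0;	/* copyout after fo_fcntl */
	/* F_FSVOID commands pass the user pointer through unchanged. */
#endif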
int
do_fcntl_lock(int fd, int cmd, struct flock *fl)
{
struct file *fp = NULL;
proc_t *p;
int (*fo_advlock)(struct file *, void *, int, struct flock *, int);
int error, flg;
if ((fp = fd_getfile(fd)) == NULL) {
error = EBADF;
goto out;
}
if ((fo_advlock = fp->f_ops->fo_advlock) == NULL) {
error = EINVAL;
goto out;
}
flg = F_POSIX;
p = curproc;
switch (cmd) {
case F_SETLKW:
flg |= F_WAIT;
/* Fall into F_SETLK */
/* FALLTHROUGH */
case F_SETLK:
switch (fl->l_type) {
case F_RDLCK:
if ((fp->f_flag & FREAD) == 0) {
error = EBADF;
break;
}
if ((p->p_flag & PK_ADVLOCK) == 0) {
mutex_enter(p->p_lock);
p->p_flag |= PK_ADVLOCK;
mutex_exit(p->p_lock);
}
error = (*fo_advlock)(fp, p, F_SETLK, fl, flg);
break;
case F_WRLCK:
if ((fp->f_flag & FWRITE) == 0) {
error = EBADF;
break;
}
if ((p->p_flag & PK_ADVLOCK) == 0) {
mutex_enter(p->p_lock);
p->p_flag |= PK_ADVLOCK;
mutex_exit(p->p_lock);
}
error = (*fo_advlock)(fp, p, F_SETLK, fl, flg);
break;
case F_UNLCK:
error = (*fo_advlock)(fp, p, F_UNLCK, fl, F_POSIX);
break;
default:
error = EINVAL;
break;
}
break;
case F_GETLK:
if (fl->l_type != F_RDLCK &&
fl->l_type != F_WRLCK &&
fl->l_type != F_UNLCK) {
error = EINVAL;
break;
}
error = (*fo_advlock)(fp, p, F_GETLK, fl, F_POSIX);
break;
default:
error = EINVAL;
break;
}
out: if (fp)
fd_putfile(fd);
return error;
}
/*
* The file control system call.
*/
int
sys_fcntl(struct lwp *l, const struct sys_fcntl_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) cmd;
syscallarg(void *) arg;
} */
int fd, i, tmp, error, cmd, newmin;
filedesc_t *fdp;
fdtab_t *dt;
file_t *fp;
char *kpath;
struct flock fl;
bool cloexec = false;
fd = SCARG(uap, fd);
cmd = SCARG(uap, cmd);
fdp = l->l_fd;
error = 0;
switch (cmd) {
case F_CLOSEM:
if (fd < 0)
return EBADF;
while ((i = fdp->fd_lastfile) >= fd) {
if (fd_getfile(i) == NULL) {
/* Another thread has updated. */
continue;
}
fd_close(i);
}
return 0;
case F_MAXFD:
*retval = fdp->fd_lastfile;
return 0;
case F_SETLKW:
case F_SETLK:
case F_GETLK:
error = copyin(SCARG(uap, arg), &fl, sizeof(fl));
if (error)
return error;
error = do_fcntl_lock(fd, cmd, &fl);
if (cmd == F_GETLK && error == 0)
error = copyout(&fl, SCARG(uap, arg), sizeof(fl));
return error;
default:
/* Handled below */
break;
}
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((cmd & F_FSCTL)) {
error = fcntl_forfs(fd, fp, cmd, SCARG(uap, arg));
fd_putfile(fd);
return error;
}
switch (cmd) {
case F_DUPFD_CLOEXEC:
cloexec = true;
/*FALLTHROUGH*/
case F_DUPFD:
newmin = (long)SCARG(uap, arg);
if ((u_int)newmin >=
l->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
(u_int)newmin >= maxfiles) {
fd_putfile(fd);
return EINVAL;
}
error = fd_dup(fp, newmin, &i, cloexec);
*retval = i;
break;
case F_GETFD:
dt = atomic_load_consume(&fdp->fd_dt);
*retval = dt->dt_ff[fd]->ff_exclose;
break;
case F_SETFD:
fd_set_exclose(l, fd,
((long)SCARG(uap, arg) & FD_CLOEXEC) != 0);
break;
case F_GETNOSIGPIPE:
*retval = (fp->f_flag & FNOSIGPIPE) != 0;
break;
case F_SETNOSIGPIPE:
if (SCARG(uap, arg))
atomic_or_uint(&fp->f_flag, FNOSIGPIPE);
else
atomic_and_uint(&fp->f_flag, ~FNOSIGPIPE);
*retval = 0;
break;
case F_GETFL:
*retval = OFLAGS(fp->f_flag);
break;
case F_SETFL:
/* XXX not guaranteed to be atomic. */
tmp = FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS;
error = (*fp->f_ops->fo_fcntl)(fp, F_SETFL, &tmp);
if (error)
break;
i = tmp ^ fp->f_flag;
if (i & FNONBLOCK) {
int flgs = tmp & FNONBLOCK;
error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, &flgs);
if (error) {
(*fp->f_ops->fo_fcntl)(fp, F_SETFL,
&fp->f_flag);
break;
}
}
if (i & FASYNC) {
int flgs = tmp & FASYNC;
error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, &flgs);
if (error) {
if (i & FNONBLOCK) {
tmp = fp->f_flag & FNONBLOCK;
(void)(*fp->f_ops->fo_ioctl)(fp,
FIONBIO, &tmp);
}
(*fp->f_ops->fo_fcntl)(fp, F_SETFL,
&fp->f_flag);
break;
}
}
fp->f_flag = (fp->f_flag & ~FCNTLFLAGS) | tmp;
break;
case F_GETOWN:
error = (*fp->f_ops->fo_ioctl)(fp, FIOGETOWN, &tmp);
*retval = tmp;
break;
case F_SETOWN:
tmp = (int)(uintptr_t) SCARG(uap, arg);
error = (*fp->f_ops->fo_ioctl)(fp, FIOSETOWN, &tmp);
break;
case F_GETPATH:
kpath = PNBUF_GET();
/* vnodes need extra context, so are handled separately */
if (fp->f_type == DTYPE_VNODE)
error = vnode_to_path(kpath, MAXPATHLEN, fp->f_vnode,
l, l->l_proc);
else
error = (*fp->f_ops->fo_fcntl)(fp, F_GETPATH, kpath);
if (error == 0)
error = copyoutstr(kpath, SCARG(uap, arg), MAXPATHLEN,
NULL);
PNBUF_PUT(kpath);
break;
case F_ADD_SEALS:
tmp = (int)(uintptr_t) SCARG(uap, arg);
error = (*fp->f_ops->fo_fcntl)(fp, F_ADD_SEALS, &tmp);
break;
case F_GET_SEALS:
error = (*fp->f_ops->fo_fcntl)(fp, F_GET_SEALS, &tmp);
*retval = tmp;
break;
default:
error = EINVAL;
}
fd_putfile(fd);
return (error);
}
/*
* Close a file descriptor.
*/
int
sys_close(struct lwp *l, const struct sys_close_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
} */
int error;
int fd = SCARG(uap, fd);
if (fd_getfile(fd) == NULL) {
return EBADF;
}
error = fd_close(fd);
if (error == ERESTART) {
#ifdef DIAGNOSTIC
printf("%s[%d]: close(%d) returned ERESTART\n",
l->l_proc->p_comm, (int)l->l_proc->p_pid, fd);
#endif
error = EINTR;
}
return error;
}
/*
* Return status information about a file descriptor.
* Common function for compat code.
*/
int
do_sys_fstat(int fd, struct stat *sb)
{
file_t *fp;
int error;
if ((fp = fd_getfile(fd)) == NULL) {
return EBADF;
}
error = (*fp->f_ops->fo_stat)(fp, sb);
fd_putfile(fd);
return error;
}
/*
* Return status information about a file descriptor.
*/
int
sys___fstat50(struct lwp *l, const struct sys___fstat50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct stat *) sb;
} */
struct stat sb;
int error;
error = do_sys_fstat(SCARG(uap, fd), &sb);
if (error == 0) {
error = copyout(&sb, SCARG(uap, sb), sizeof(sb));
}
return error;
}
/*
* Return pathconf information about a file descriptor.
*/
int
sys_fpathconf(struct lwp *l, const struct sys_fpathconf_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) name;
} */
int fd, name, error;
file_t *fp;
fd = SCARG(uap, fd);
name = SCARG(uap, name);
error = 0;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if (fp->f_ops->fo_fpathconf == NULL)
error = EOPNOTSUPP;
else
error = (*fp->f_ops->fo_fpathconf)(fp, name, retval);
fd_putfile(fd);
return error;
}
/*
* Apply an advisory lock on a file descriptor.
*
* Just attempt to get a record lock of the requested type on
* the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
*/
/* ARGSUSED */
int
sys_flock(struct lwp *l, const struct sys_flock_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) how;
} */
int fd, how, error;
struct file *fp = NULL;
int (*fo_advlock)(struct file *, void *, int, struct flock *, int);
struct flock lf;
fd = SCARG(uap, fd);
how = SCARG(uap, how);
if ((fp = fd_getfile(fd)) == NULL) {
error = EBADF;
goto out;
}
if ((fo_advlock = fp->f_ops->fo_advlock) == NULL) {
KASSERT((atomic_load_relaxed(&fp->f_flag) & FHASLOCK) == 0);
error = EOPNOTSUPP;
goto out;
}
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
switch (how & ~LOCK_NB) {
case LOCK_UN:
lf.l_type = F_UNLCK;
atomic_and_uint(&fp->f_flag, ~FHASLOCK);
error = (*fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK);
goto out;
case LOCK_EX:
lf.l_type = F_WRLCK;
break;
case LOCK_SH:
lf.l_type = F_RDLCK;
break;
default:
error = EINVAL;
goto out;
}
atomic_or_uint(&fp->f_flag, FHASLOCK);
if (how & LOCK_NB) {
error = (*fo_advlock)(fp, fp, F_SETLK, &lf, F_FLOCK);
} else {
error = (*fo_advlock)(fp, fp, F_SETLK, &lf, F_FLOCK|F_WAIT);
}
out: if (fp)
fd_putfile(fd);
return error;
}
int
do_posix_fadvise(int fd, off_t offset, off_t len, int advice)
{
file_t *fp;
int error;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if (fp->f_ops->fo_posix_fadvise == NULL) {
error = EOPNOTSUPP;
} else {
error = (*fp->f_ops->fo_posix_fadvise)(fp, offset, len,
advice);
}
fd_putfile(fd);
return error;
}
int
sys___posix_fadvise50(struct lwp *l,
const struct sys___posix_fadvise50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) pad;
syscallarg(off_t) offset;
syscallarg(off_t) len;
syscallarg(int) advice;
} */
*retval = do_posix_fadvise(SCARG(uap, fd), SCARG(uap, offset),
SCARG(uap, len), SCARG(uap, advice));
return 0;
}
int
sys_pipe(struct lwp *l, const void *v, register_t *retval)
{
int fd[2], error;
if ((error = pipe1(l, fd, 0)) != 0)
return error;
retval[0] = fd[0];
retval[1] = fd[1];
return 0;
}
int
sys_pipe2(struct lwp *l, const struct sys_pipe2_args *uap, register_t *retval)
{
/* {
syscallarg(int[2]) fildes;
syscallarg(int) flags;
} */
int fd[2], error;
if ((error = pipe1(l, fd, SCARG(uap, flags))) != 0)
return error;
if ((error = copyout(fd, SCARG(uap, fildes), sizeof(fd))) != 0)
return error;
retval[0] = 0;
return 0;
}
/* $NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_subr.c,v 1.11 1992/05/30 10:05:43 jsp Exp
* @(#)null_subr.c 8.7 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
#ifdef LAYERFS_DIAGNOSTIC
int layerfs_debug = 1;
#endif
/*
* layer cache:
* Each cache entry holds a reference to the lower vnode
* along with a pointer to the alias vnode. When an
* entry is added the lower vnode is VREF'd. When the
* alias is removed the lower vnode is vrele'd.
*/
void
layerfs_init(void)
{
/* Nothing. */
}
void
layerfs_done(void)
{
/* Nothing. */
}
/*
* layer_node_create: try to find an existing layerfs vnode referring to it,
* otherwise make a new vnode which contains a reference to the lower vnode.
*/
int
layer_node_create(struct mount *mp, struct vnode *lowervp, struct vnode **nvpp)
{
int error;
struct vnode *aliasvp;
error = vcache_get(mp, &lowervp, sizeof(lowervp), &aliasvp);
if (error)
return error;
/*
* Now that we acquired a reference on the upper vnode, release one
* on the lower node. The existence of the layer_node retains one
* reference to the lower node.
*/
vrele(lowervp);
KASSERT(vrefcnt(lowervp) > 0);
#ifdef LAYERFS_DIAGNOSTIC
if (layerfs_debug)
vprint("layer_node_create: alias", aliasvp);
#endif
*nvpp = aliasvp;
return 0;
}
#ifdef LAYERFS_DIAGNOSTIC
struct vnode *
layer_checkvp(struct vnode *vp, const char *fil, int lno)
{
struct layer_node *a = VTOLAYER(vp);
#ifdef notyet
/*
* Can't do this check because vop_reclaim runs
* with a funny vop vector.
*
* WRS - no it doesn't...
*/
if (vp->v_op != layer_vnodeop_p) {
printf ("layer_checkvp: on non-layer-node\n");
#ifdef notyet
while (layer_checkvp_barrier) /*WAIT*/ ;
#endif
panic("layer_checkvp");
};
#endif
if (a->layer_lowervp == NULL) {
/* Should never happen */
int i; u_long *p;
printf("vp = %p, ZERO ptr\n", vp);
for (p = (u_long *) a, i = 0; i < 8; i++)
printf(" %lx", p[i]);
printf("\n");
/* wait for debugger */
panic("layer_checkvp");
}
if (vrefcnt(a->layer_lowervp) < 1) {
int i; u_long *p;
printf("vp = %p, unref'ed lowervp\n", vp);
for (p = (u_long *) a, i = 0; i < 8; i++)
printf(" %lx", p[i]);
printf("\n");
/* wait for debugger */
panic ("layer with unref'ed lowervp");
};
#ifdef notnow
printf("layer %p/%d -> %p/%d [%s, %d]\n",
LAYERTOV(a), vrefcnt(LAYERTOV(a)),
a->layer_lowervp, vrefcnt(a->layer_lowervp),
fil, lno);
#endif
return a->layer_lowervp;
}
#endif
/* $NetBSD: if_ether.h,v 1.91 2024/02/05 21:46:06 andvar Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ether.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NET_IF_ETHER_H_
#define _NET_IF_ETHER_H_
#ifdef _KERNEL
#ifdef _KERNEL_OPT
#include "opt_mbuftrace.h"
#endif
#include <sys/mbuf.h>
#endif
#ifndef _STANDALONE
#include <net/if.h>
#endif
/*
* Some basic Ethernet constants.
*/
#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */
#define ETHER_TYPE_LEN 2 /* length of the Ethernet type field */
#define ETHER_CRC_LEN 4 /* length of the Ethernet CRC */
#define ETHER_HDR_LEN ((ETHER_ADDR_LEN * 2) + ETHER_TYPE_LEN)
#define ETHER_MIN_LEN 64 /* minimum frame length, including CRC */
#define ETHER_MAX_LEN 1518 /* maximum frame length, including CRC */
#define ETHER_MAX_LEN_JUMBO 9018 /* maximum jumbo frame len, including CRC */
/*
* Some Ethernet extensions.
*/
#define ETHER_VLAN_ENCAP_LEN 4 /* length of 802.1Q VLAN encapsulation */
#define EVL_VLANOFTAG(tag) ((tag) & 4095) /* VLAN ID */
#define EVL_PRIOFTAG(tag) (((tag) >> 13) & 7) /* Priority */
#define EVL_CFIOFTAG(tag) (((tag) >> 12) & 1) /* CFI */
#define ETHER_PPPOE_ENCAP_LEN 8 /* length of PPPoE encapsulation */
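/*
 * Worked example (illustrative, not part of the original header): for a
 * tag control value of 0x6005, EVL_PRIOFTAG() yields priority 3 (bits
 * 15-13), EVL_CFIOFTAG() yields 0 (bit 12), and EVL_VLANOFTAG() yields
 * VLAN ID 5 (bits 11-0).
 */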
/*
* Mbuf adjust factor to force 32-bit alignment of IP header.
* Drivers should do m_adj(m, ETHER_ALIGN) when setting up a
* receive so the upper layers get the IP header properly aligned
* past the 14-byte Ethernet header.
*/
#define ETHER_ALIGN 2 /* driver adjust for IP hdr alignment */
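/*
 * Illustrative receive-setup sketch (assumed driver code, not part of this
 * header). Trimming ETHER_ALIGN bytes from the front of a fresh cluster
 * before programming it into the hardware leaves the 14-byte Ethernet
 * header 2-byte aligned, so the IP header that follows lands on a 32-bit
 * boundary:
 *
 *	m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		MCLGET(m, M_DONTWAIT);
 *		m_adj(m, ETHER_ALIGN);
 *	}
 */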
/*
* Ethernet address - 6 octets
* this is only used by the ethers(3) functions.
*/
struct ether_addr {
uint8_t ether_addr_octet[ETHER_ADDR_LEN];
};
/*
* Structure of a 10Mb/s Ethernet header.
*/
struct ether_header {
uint8_t ether_dhost[ETHER_ADDR_LEN];
uint8_t ether_shost[ETHER_ADDR_LEN];
uint16_t ether_type;
};
#include <net/ethertypes.h>
#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */
#define ETHER_IS_LOCAL(addr) (*(addr) & 0x02) /* is address local? */
#define ETHERMTU_JUMBO (ETHER_MAX_LEN_JUMBO - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define ETHERMTU (ETHER_MAX_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define ETHERMIN (ETHER_MIN_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN)
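/*
 * Worked values (not in the original header): with the constants above,
 * ETHERMTU = 1518 - 14 - 4 = 1500, ETHERMTU_JUMBO = 9018 - 14 - 4 = 9000,
 * and ETHERMIN = 64 - 14 - 4 = 46 bytes of payload.
 */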
/*
* Compute the maximum frame size based on ethertype (i.e. possible
* encapsulation) and whether or not an FCS is present.
*/
#define ETHER_MAX_FRAME(ifp, etype, hasfcs) \
((ifp)->if_mtu + ETHER_HDR_LEN + \
((hasfcs) ? ETHER_CRC_LEN : 0) + \
(((etype) == ETHERTYPE_VLAN) ? ETHER_VLAN_ENCAP_LEN : 0) + \
(((etype) == ETHERTYPE_PPPOE) ? ETHER_PPPOE_ENCAP_LEN : 0))
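/*
 * Worked example (illustrative, not from the original header): on an
 * interface with if_mtu 1500, ETHER_MAX_FRAME(ifp, ETHERTYPE_IP, 1) is
 * 1500 + 14 + 4 = 1518, while ETHER_MAX_FRAME(ifp, ETHERTYPE_VLAN, 1) is
 * 1518 + 4 = 1522 to leave room for the 802.1Q tag.
 */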
/*
* Ethernet CRC32 polynomials (big- and little-endian versions).
*/
#define ETHER_CRC_POLY_LE 0xedb88320
#define ETHER_CRC_POLY_BE 0x04c11db6
#ifndef _STANDALONE
/*
* Ethernet-specific mbuf flags.
*/
#define M_HASFCS M_LINK0 /* FCS included at end of frame */
#define M_PROMISC M_LINK1 /* this packet is not for us */
#ifdef _KERNEL
/*
* Macro to map an IP multicast address to an Ethernet multicast address.
* The high-order 25 bits of the Ethernet address are statically assigned,
* and the low-order 23 bits are taken from the low end of the IP address.
*/
#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \
/* const struct in_addr *ipaddr; */ \
/* uint8_t enaddr[ETHER_ADDR_LEN]; */ \
do { \
(enaddr)[0] = 0x01; \
(enaddr)[1] = 0x00; \
(enaddr)[2] = 0x5e; \
(enaddr)[3] = ((const uint8_t *)ipaddr)[1] & 0x7f; \
(enaddr)[4] = ((const uint8_t *)ipaddr)[2]; \
(enaddr)[5] = ((const uint8_t *)ipaddr)[3]; \
} while (/*CONSTCOND*/0)
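/*
 * Example mapping (illustrative, not from the original header): the IPv4
 * multicast group 224.0.0.251 (0xe00000fb) maps to the Ethernet address
 * 01:00:5e:00:00:fb, since only the low-order 23 bits of the IP address
 * are copied into the fixed 01:00:5e prefix.
 */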
/*
* Macro to map an IP6 multicast address to an Ethernet multicast address.
* The high-order 16 bits of the Ethernet address are statically assigned,
* and the low-order 32 bits are taken from the low end of the IP6 address.
*/
#define ETHER_MAP_IPV6_MULTICAST(ip6addr, enaddr) \
/* struct in6_addr *ip6addr; */ \
/* uint8_t enaddr[ETHER_ADDR_LEN]; */ \
{ \
(enaddr)[0] = 0x33; \
(enaddr)[1] = 0x33; \
(enaddr)[2] = ((const uint8_t *)ip6addr)[12]; \
(enaddr)[3] = ((const uint8_t *)ip6addr)[13]; \
(enaddr)[4] = ((const uint8_t *)ip6addr)[14]; \
(enaddr)[5] = ((const uint8_t *)ip6addr)[15]; \
}
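/*
 * Example mapping (illustrative, not from the original header): the IPv6
 * all-nodes group ff02::1 maps to 33:33:00:00:00:01, i.e. the last four
 * octets of the IPv6 address appended to the fixed 33:33 prefix.
 */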
#endif
struct mii_data;
struct ethercom;
typedef int (*ether_cb_t)(struct ethercom *);
typedef int (*ether_vlancb_t)(struct ethercom *, uint16_t, bool);
/*
* Structure shared between the ethernet driver modules and
* the multicast list code. For example, each ec_softc or il_softc
* begins with this structure.
*/
struct ethercom {
struct ifnet ec_if; /* network-visible interface */
LIST_HEAD(, ether_multi) ec_multiaddrs; /* list of ether multicast
addrs */
int ec_multicnt; /* length of ec_multiaddrs
list */
int ec_capabilities; /* capabilities, provided by
driver */
int ec_capenable; /* tells hardware which
capabilities to enable */
int ec_nvlans; /* # VLANs on this interface */
SIMPLEQ_HEAD(, vlanid_list) ec_vids; /* list of VLAN IDs */
/* The device handle for the MII bus child device. */
struct mii_data *ec_mii;
struct ifmedia *ec_ifmedia;
/*
* Called after a change to ec_if.if_flags. Returns
* ENETRESET if the device should be reinitialized with
* ec_if.if_init, 0 on success, not 0 on failure.
*/
ether_cb_t ec_ifflags_cb;
/*
* Called whenever a vlan interface is configured or unconfigured.
* Args include the vlan tag and a flag indicating whether the tag is
* being added or removed.
*/
ether_vlancb_t ec_vlan_cb;
/* Hooks called at the beginning of detach of this interface */
khook_list_t *ec_ifdetach_hooks;
kmutex_t *ec_lock;
/* Flags used only by the kernel */
int ec_flags;
#ifdef MBUFTRACE
struct mowner ec_rx_mowner; /* mbufs received */
struct mowner ec_tx_mowner; /* mbufs transmitted */
#endif
};
#define ETHERCAP_VLAN_MTU 0x00000001 /* VLAN-compatible MTU */
#define ETHERCAP_VLAN_HWTAGGING 0x00000002 /* hardware VLAN tag support */
#define ETHERCAP_JUMBO_MTU 0x00000004 /* 9000 byte MTU supported */
#define ETHERCAP_VLAN_HWFILTER 0x00000008 /* iface hw can filter vlan tag */
#define ETHERCAP_EEE 0x00000010 /* Energy Efficiency Ethernet */
#define ETHERCAP_MASK 0x0000001f
#define ECCAPBITS \
"\020" \
"\1VLAN_MTU" \
"\2VLAN_HWTAGGING" \
"\3JUMBO_MTU" \
"\4VLAN_HWFILTER" \
"\5EEE"
/* ioctl() for Ethernet capabilities */
struct eccapreq {
char eccr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
int eccr_capabilities; /* supported capabilities */
int eccr_capenable; /* capabilities enabled */
};
/* sysctl for Ethernet multicast addresses */
struct ether_multi_sysctl {
u_int enm_refcount;
uint8_t enm_addrlo[ETHER_ADDR_LEN];
uint8_t enm_addrhi[ETHER_ADDR_LEN];
};
#ifdef _KERNEL
/*
* Flags for ec_flags
*/
/* Store IFF_ALLMULTI in ec_flags instead of if_flags to avoid data races. */
#define ETHER_F_ALLMULTI __BIT(0)
extern const uint8_t etherbroadcastaddr[ETHER_ADDR_LEN];
extern const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN];
extern const uint8_t ether_ipmulticast_min[ETHER_ADDR_LEN];
extern const uint8_t ether_ipmulticast_max[ETHER_ADDR_LEN];
void ether_set_ifflags_cb(struct ethercom *, ether_cb_t);
void ether_set_vlan_cb(struct ethercom *, ether_vlancb_t);
int ether_ioctl(struct ifnet *, u_long, void *);
int ether_addmulti(const struct sockaddr *, struct ethercom *);
int ether_delmulti(const struct sockaddr *, struct ethercom *);
int ether_multiaddr(const struct sockaddr *, uint8_t[ETHER_ADDR_LEN],
uint8_t[ETHER_ADDR_LEN]);
void ether_input(struct ifnet *, struct mbuf *);
/*
* Ethernet multicast address structure. There is one of these for each
* multicast address or range of multicast addresses that we are supposed
* to listen to on a particular interface. They are kept in a linked list,
* rooted in the interface's ethercom structure.
*/
struct ether_multi {
uint8_t enm_addrlo[ETHER_ADDR_LEN]; /* low or only address of range */
uint8_t enm_addrhi[ETHER_ADDR_LEN]; /* high or only address of range */
u_int enm_refcount; /* no. claims to this addr/range */
LIST_ENTRY(ether_multi) enm_list;
};
/*
* Structure used by macros below to remember position when stepping through
* all of the ether_multi records.
*/
struct ether_multistep {
struct ether_multi *e_enm;
};
/*
* lookup the ether_multi record for a given range of Ethernet
* multicast addresses connected to a given ethercom structure.
* If no matching record is found, NULL is returned.
*/
static __inline struct ether_multi *
ether_lookup_multi(const uint8_t *addrlo, const uint8_t *addrhi,
const struct ethercom *ec)
{
struct ether_multi *enm;
LIST_FOREACH(enm, &ec->ec_multiaddrs, enm_list) {
if (memcmp(enm->enm_addrlo, addrlo, ETHER_ADDR_LEN) != 0)
continue;
if (memcmp(enm->enm_addrhi, addrhi, ETHER_ADDR_LEN) != 0)
continue;
break;
}
return enm;
}
/*
* step through all of the ether_multi records, one at a time.
* The current position is remembered in "step", which the caller must
* provide. ether_first_multi(), below, must be called to initialize "step"
* and get the first record. Both functions return a NULL when there
* are no remaining records.
*/
static __inline struct ether_multi *
ether_next_multi(struct ether_multistep *step)
{
struct ether_multi *enm;
enm = step->e_enm;
if (enm != NULL)
step->e_enm = LIST_NEXT(enm, enm_list);
return enm;
}
#define ETHER_NEXT_MULTI(step, enm) \
/* struct ether_multistep step; */ \
/* struct ether_multi *enm; */ \
(enm) = ether_next_multi(&(step))
static __inline struct ether_multi *
ether_first_multi(struct ether_multistep *step, const struct ethercom *ec)
{
step->e_enm = LIST_FIRST(&ec->ec_multiaddrs);
return ether_next_multi(step);
}
#define ETHER_FIRST_MULTI(step, ec, enm) \
/* struct ether_multistep step; */ \
/* struct ethercom *ec; */ \
/* struct ether_multi *enm; */ \
(enm) = ether_first_multi(&(step), (ec))
#define ETHER_LOCK(ec) mutex_enter((ec)->ec_lock)
#define ETHER_UNLOCK(ec) mutex_exit((ec)->ec_lock)
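/*
 * Illustrative walk of the multicast list (assumed driver code, not part
 * of this header); the list is protected by ec_lock:
 *
 *	struct ether_multi *enm;
 *	struct ether_multistep step;
 *
 *	ETHER_LOCK(ec);
 *	ETHER_FIRST_MULTI(step, ec, enm);
 *	while (enm != NULL) {
 *		program enm->enm_addrlo .. enm->enm_addrhi into the
 *		hardware filter here
 *		ETHER_NEXT_MULTI(step, enm);
 *	}
 *	ETHER_UNLOCK(ec);
 */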
/*
* Ethernet 802.1Q VLAN structures.
*/
/* for ethercom */
struct vlanid_list {
uint16_t vid;
SIMPLEQ_ENTRY(vlanid_list) vid_list;
};
/* add VLAN tag to input/received packet */
static __inline void
vlan_set_tag(struct mbuf *m, uint16_t vlantag)
{
/* VLAN tag contains priority, CFI and VLAN ID */
KASSERT((m->m_flags & M_PKTHDR) != 0);
m->m_pkthdr.ether_vtag = vlantag;
m->m_flags |= M_VLANTAG;
return;
}
/* extract VLAN ID value from a VLAN tag */
static __inline uint16_t
vlan_get_tag(struct mbuf *m)
{
KASSERT((m->m_flags & M_PKTHDR) != 0);
KASSERT(m->m_flags & M_VLANTAG);
return m->m_pkthdr.ether_vtag;
}
static __inline bool
vlan_has_tag(struct mbuf *m)
{
return (m->m_flags & M_VLANTAG) != 0;
}
static __inline bool
vlan_is_hwtag_enabled(struct ifnet *_ifp)
{
struct ethercom *ec = (void *)_ifp;
if (ec->ec_capenable & ETHERCAP_VLAN_HWTAGGING)
return true;
return false;
}
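/*
 * Illustrative receive-path sketch (assumed driver code, not part of this
 * header): a driver whose hardware strips the 802.1Q tag records it in the
 * packet header before handing the mbuf to ether_input():
 *
 *	if (rx_descriptor_carried_a_tag)
 *		vlan_set_tag(m, hardware_supplied_tag);
 *
 * vlan_has_tag()/vlan_get_tag() can then be used on the input path.
 */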
/* test if any VLAN is configured for this interface */
#define VLAN_ATTACHED(ec) ((ec)->ec_nvlans > 0)
void etherinit(void);
void ether_ifattach(struct ifnet *, const uint8_t *);
void ether_ifdetach(struct ifnet *);
int ether_mediachange(struct ifnet *);
void ether_mediastatus(struct ifnet *, struct ifmediareq *);
void * ether_ifdetachhook_establish(struct ifnet *,
void (*)(void *), void *arg);
void ether_ifdetachhook_disestablish(struct ifnet *,
void *, kmutex_t *);
char *ether_sprintf(const uint8_t *);
char *ether_snprintf(char *, size_t, const uint8_t *);
uint32_t ether_crc32_le(const uint8_t *, size_t);
uint32_t ether_crc32_be(const uint8_t *, size_t);
int ether_aton_r(u_char *, size_t, const char *);
int ether_enable_vlan_mtu(struct ifnet *);
int ether_disable_vlan_mtu(struct ifnet *);
int ether_add_vlantag(struct ifnet *, uint16_t, bool *);
int ether_del_vlantag(struct ifnet *, uint16_t);
int ether_inject_vlantag(struct mbuf **, uint16_t, uint16_t);
struct mbuf *
ether_strip_vlantag(struct mbuf *);
#else
/*
* Prototype ethers(3) functions.
*/
#include <sys/cdefs.h>
__BEGIN_DECLS
char * ether_ntoa(const struct ether_addr *);
struct ether_addr *
ether_aton(const char *);
int ether_ntohost(char *, const struct ether_addr *);
int ether_hostton(const char *, struct ether_addr *);
int ether_line(const char *, struct ether_addr *, char *);
__END_DECLS
#endif
#endif /* _STANDALONE */
#endif /* !_NET_IF_ETHER_H_ */
/* $NetBSD: uipc_socket.c,v 1.309 2024/02/11 13:01:29 jdolecek Exp $ */
/*
* Copyright (c) 2002, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2004 The FreeBSD Foundation
* Copyright (c) 2004 Robert Watson
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
*/
/*
* Socket operation routines.
*
* These routines are called by the routines in sys_socket.c or from a
* system process, and implement the semantics of socket operations by
* switching out to the protocol specific routines.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.309 2024/02/11 13:01:29 jdolecek Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"
#include "opt_multiprocessor.h" /* XXX */
#include "opt_sctp.h"
#include "opt_pipe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/uidinfo.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/kthread.h>
#include <sys/compat_stub.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_loan.h>
#include <uvm/uvm_page.h>
#ifdef SCTP
#include <netinet/sctp_route.h>
#endif
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
extern const struct fileops socketops;
static int sooptions;
extern int somaxconn; /* patchable (XXX sysctl) */
int somaxconn = SOMAXCONN;
kmutex_t *softnet_lock;
#ifdef SOSEND_COUNTERS
#include <sys/device.h>
static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "loan big");
static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "copy big");
static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "copy small");
static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "kva limit");
#define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++
EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else
#define SOSEND_COUNTER_INCR(ev) /* nothing */
#endif /* SOSEND_COUNTERS */
#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
int sock_loan_thresh = -1;
#else
int sock_loan_thresh = 4096;
#endif
static kmutex_t so_pendfree_lock;
static struct mbuf *so_pendfree = NULL;
#ifndef SOMAXKVA
#define SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
static int socurkva;
static kcondvar_t socurkva_cv;
#ifndef SOFIXEDBUF
#define SOFIXEDBUF true
#endif
bool sofixedbuf = SOFIXEDBUF;
static kauth_listener_t socket_listener;
#define SOCK_LOAN_CHUNK 65536
static void sopendfree_thread(void *);
static kcondvar_t pendfree_thread_cv;
static lwp_t *sopendfree_lwp;
static void sysctl_kern_socket_setup(void);
static struct sysctllog *socket_sysctllog;
static vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
int error;
mutex_enter(&so_pendfree_lock);
while (socurkva + len > somaxkva) {
SOSEND_COUNTER_INCR(&sosend_kvalimit);
error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
if (error) {
len = 0;
break;
}
}
socurkva += len;
mutex_exit(&so_pendfree_lock);
return len;
}
static void
sokvaunreserve(vsize_t len)
{
mutex_enter(&so_pendfree_lock);
socurkva -= len;
cv_broadcast(&socurkva_cv);
mutex_exit(&so_pendfree_lock);
}
/*
* sokvaalloc: allocate kva for loan.
*/
vaddr_t
sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so)
{
vaddr_t lva;
if (sokvareserve(so, len) == 0)
return 0;
lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask,
UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
if (lva == 0) {
sokvaunreserve(len);
return 0;
}
return lva;
}
/*
* sokvafree: free kva for loan.
*/
void
sokvafree(vaddr_t sva, vsize_t len)
{
uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);
sokvaunreserve(len);
}
static void
sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
{
vaddr_t sva, eva;
vsize_t len;
int npgs;
KASSERT(pgs != NULL);
eva = round_page((vaddr_t) buf + size);
sva = trunc_page((vaddr_t) buf);
len = eva - sva;
npgs = len >> PAGE_SHIFT;
pmap_kremove(sva, len);
pmap_update(pmap_kernel());
uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
sokvafree(sva, len);
}
/*
* sopendfree_thread: free mbufs on "pendfree" list. Unlock and relock
* so_pendfree_lock when freeing mbufs.
*/
static void
sopendfree_thread(void *v)
{
struct mbuf *m, *next;
size_t rv;
mutex_enter(&so_pendfree_lock);
for (;;) {
rv = 0;
while (so_pendfree != NULL) {
m = so_pendfree;
so_pendfree = NULL;
mutex_exit(&so_pendfree_lock);
for (; m != NULL; m = next) {
next = m->m_next;
KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) ==
0);
KASSERT(m->m_ext.ext_refcnt == 0);
rv += m->m_ext.ext_size;
sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
m->m_ext.ext_size);
pool_cache_put(mb_cache, m);
}
mutex_enter(&so_pendfree_lock);
}
if (rv)
cv_broadcast(&socurkva_cv);
cv_wait(&pendfree_thread_cv, &so_pendfree_lock);
}
panic("sopendfree_thread");
/* NOTREACHED */
}
void
soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
{
KASSERT(m != NULL);
/*
* postpone freeing mbuf.
*
* we can't do it in interrupt context
* because we need to put kva back to kernel_map.
*/
mutex_enter(&so_pendfree_lock);
m->m_next = so_pendfree;
so_pendfree = m;
cv_signal(&pendfree_thread_cv);
mutex_exit(&so_pendfree_lock);
}
static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
struct iovec *iov = uio->uio_iov;
vaddr_t sva, eva;
vsize_t len;
vaddr_t lva;
int npgs, error;
vaddr_t va;
int i;
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
return 0;
if (iov->iov_len < (size_t) space)
space = iov->iov_len;
if (space > SOCK_LOAN_CHUNK)
space = SOCK_LOAN_CHUNK;
eva = round_page((vaddr_t) iov->iov_base + space);
sva = trunc_page((vaddr_t) iov->iov_base);
len = eva - sva;
npgs = len >> PAGE_SHIFT;
KASSERT(npgs <= M_EXT_MAXPAGES);
lva = sokvaalloc(sva, len, so);
if (lva == 0)
return 0;
error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
if (error) {
sokvafree(lva, len);
return 0;
}
for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
VM_PROT_READ, 0);
pmap_update(pmap_kernel());
lva += (vaddr_t) iov->iov_base & PAGE_MASK;
MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
uio->uio_resid -= space;
/* uio_offset not updated, not set/used for write(2) */
uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
uio->uio_iov->iov_len -= space;
if (uio->uio_iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
}
return space;
}
static int
socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_network_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg0;
if ((action != KAUTH_NETWORK_SOCKET) &&
(action != KAUTH_NETWORK_BIND))
return result;
switch (req) {
case KAUTH_REQ_NETWORK_BIND_PORT:
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_NETWORK_SOCKET_DROP: {
/* Normal users can only drop their own connections. */
struct socket *so = (struct socket *)arg1;
if (so->so_cred && proc_uidmatch(cred, so->so_cred) == 0)
result = KAUTH_RESULT_ALLOW;
break;
}
case KAUTH_REQ_NETWORK_SOCKET_OPEN:
/* We allow "raw" routing/bluetooth sockets to anyone. */
switch ((u_long)arg1) {
case PF_ROUTE:
case PF_OROUTE:
case PF_BLUETOOTH:
case PF_CAN:
result = KAUTH_RESULT_ALLOW;
break;
default:
/* Privileged, let secmodel handle this. */
if ((u_long)arg2 == SOCK_RAW)
break;
result = KAUTH_RESULT_ALLOW;
break;
}
break;
case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return result;
}
void
soinit(void)
{
sysctl_kern_socket_setup();
#ifdef SCTP
/* Update the SCTP function hooks if necessary */
vec_sctp_add_ip_address = sctp_add_ip_address;
vec_sctp_delete_ip_address = sctp_delete_ip_address;
#endif
mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
cv_init(&socurkva_cv, "sokva");
cv_init(&pendfree_thread_cv, "sopendfr");
soinit2();
/* Set the initial adjusted socket buffer size. */
if (sb_max_set(sb_max))
panic("bad initial sb_max value: %lu", sb_max);
socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
socket_listener_cb, NULL);
}
void
soinit1(void)
{
int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree");
if (error)
panic("soinit1 %d", error);
}
/*
* socreate: create a new socket of the specified type and the protocol.
*
* => Caller may specify another socket for lock sharing (must not be held).
* => Returns the new socket without lock held.
*/
int
socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
struct socket *lockso)
{
const struct protosw *prp;
struct socket *so;
uid_t uid;
int error;
kmutex_t *lock;
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
KAUTH_ARG(proto));
if (error != 0)
return error;
if (proto)
prp = pffindproto(dom, proto, type);
else
prp = pffindtype(dom, type);
if (prp == NULL) {
/* no support for domain */
if (pffinddomain(dom) == 0)
return EAFNOSUPPORT;
/* no support for socket type */
if (proto == 0 && type != 0)
return EPROTOTYPE;
return EPROTONOSUPPORT;
}
if (prp->pr_usrreqs == NULL)
return EPROTONOSUPPORT;
if (prp->pr_type != type)
return EPROTOTYPE;
so = soget(true);
so->so_type = type;
so->so_proto = prp;
so->so_send = sosend;
so->so_receive = soreceive;
so->so_options = sooptions;
#ifdef MBUFTRACE
so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
uid = kauth_cred_geteuid(l->l_cred);
so->so_uidinfo = uid_find(uid);
so->so_egid = kauth_cred_getegid(l->l_cred);
so->so_cpid = l->l_proc->p_pid;
/*
* Lock assigned and taken during PCB attach, unless we share
* the lock with another socket, e.g. socketpair(2) case.
*/
if (lockso) {
/*
* lockso->so_lock should be stable at this point, so
* no need for atomic_load_*.
*/
lock = lockso->so_lock;
so->so_lock = lock;
mutex_obj_hold(lock);
mutex_enter(lock);
}
/* Attach the PCB (returns with the socket lock held). */
error = (*prp->pr_usrreqs->pr_attach)(so, proto);
KASSERT(solocked(so));
if (error) {
KASSERT(so->so_pcb == NULL);
so->so_state |= SS_NOFDREF;
sofree(so);
return error;
}
so->so_cred = kauth_cred_hold(l->l_cred);
sounlock(so);
*aso = so;
return 0;
}
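/*
 * Illustrative in-kernel use (a sketch under assumptions, not part of this
 * file): a kernel component creating a UDP socket with no lock sharing
 * might do
 *
 *	struct socket *so;
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, 0, curlwp, NULL);
 *
 * On success the socket is returned unlocked; soclose() releases it.
 */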
/*
* fsocreate: create a socket and a file descriptor associated with it.
* Returns the allocated file structure in *fpp, but the descriptor
* is not visible yet for the process.
* Caller is responsible for calling fd_affix() for the returned *fpp once
* its socket initialization is finished successfully, or fd_abort() if its
* initialization fails.
*
* => On success, write file descriptor to *fdout and *fpp and return zero.
* => On failure, return non-zero; *fdout and *fpp will be undefined.
*/
int
fsocreate(int domain, struct socket **sop, int type, int proto, int *fdout,
file_t **fpp, struct socket *lockso)
{
lwp_t *l = curlwp;
int error, fd, flags;
struct socket *so;
file_t *fp;
flags = type & SOCK_FLAGS_MASK;
type &= ~SOCK_FLAGS_MASK;
error = socreate(domain, &so, type, proto, l, lockso);
if (error) {
return error;
}
if ((error = fd_allocfile(&fp, &fd)) != 0) {
soclose(so);
return error;
}
fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)|
((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0);
fp->f_type = DTYPE_SOCKET;
fp->f_ops = &socketops;
if (flags & SOCK_NONBLOCK) {
so->so_state |= SS_NBIO;
}
fp->f_socket = so;
if (sop != NULL) {
*sop = so;
}
*fdout = fd;
*fpp = fp;
return error;
}
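/*
 * Illustrative caller pattern (a sketch, not part of this file): per the
 * comment above, the descriptor only becomes visible once the caller
 * commits it with fd_affix(), or is released with fd_abort() on failure:
 *
 *	error = fsocreate(domain, NULL, type, proto, &fd, &fp, NULL);
 *	if (error == 0) {
 *		... any further setup on fp and its socket ...
 *		fd_affix(curproc, fp, fd);
 *	}
 */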
int
sofamily(const struct socket *so)
{
const struct protosw *pr;
const struct domain *dom;
if ((pr = so->so_proto) == NULL)
return AF_UNSPEC;
if ((dom = pr->pr_domain) == NULL)
return AF_UNSPEC;
return dom->dom_family;
}
int
sobind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
int error;
solock(so);
if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
sounlock(so);
return EAFNOSUPPORT;
}
error = (*so->so_proto->pr_usrreqs->pr_bind)(so, nam, l);
sounlock(so);
return error;
}
int
solisten(struct socket *so, int backlog, struct lwp *l)
{
int error;
short oldopt, oldqlimit;
solock(so);
if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
SS_ISDISCONNECTING)) != 0) {
sounlock(so);
return EINVAL;
}
oldopt = so->so_options;
oldqlimit = so->so_qlimit;
if (TAILQ_EMPTY(&so->so_q)) so->so_options |= SO_ACCEPTCONN;
if (backlog < 0)
backlog = 0;
so->so_qlimit = uimin(backlog, somaxconn);
error = (*so->so_proto->pr_usrreqs->pr_listen)(so, l);
if (error != 0) {
so->so_options = oldopt;
so->so_qlimit = oldqlimit;
sounlock(so);
return error;
}
sounlock(so);
return 0;
}
void
sofree(struct socket *so)
{
u_int refs;
KASSERT(solocked(so));
if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
sounlock(so);
return;
}
if (so->so_head) {
/*
* We must not decommission a socket that's on the accept(2)
* queue. If we do, then accept(2) may hang after select(2)
* indicated that the listening socket was ready.
*/
if (!soqremque(so, 0)) {
sounlock(so);
return;
}
}
if (so->so_rcv.sb_hiwat)
(void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
RLIM_INFINITY);
if (so->so_snd.sb_hiwat)
(void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
RLIM_INFINITY);
sbrelease(&so->so_snd, so);
KASSERT(!cv_has_waiters(&so->so_cv));
KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
sorflush(so);
refs = so->so_aborting; /* XXX */
/* Remove accept filter if one is present. */
if (so->so_accf != NULL) (void)accept_filt_clear(so);
sounlock(so);
if (refs == 0) /* XXX */ soput(so);
}
/*
* soclose: close a socket on last file table reference removal.
* Initiate disconnect if connected. Free socket when disconnect complete.
*/
int
soclose(struct socket *so)
{
struct socket *so2;
int error = 0;
solock(so);
if (so->so_options & SO_ACCEPTCONN) {
for (;;) {
if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
KASSERT(solocked2(so, so2));
(void) soqremque(so2, 0);
/* soabort drops the lock. */
(void) soabort(so2);
solock(so);
continue;
}
if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
KASSERT(solocked2(so, so2));
(void) soqremque(so2, 1);
/* soabort drops the lock. */
(void) soabort(so2);
solock(so);
continue;
}
break;
}
}
if (so->so_pcb == NULL)
goto discard;
if (so->so_state & SS_ISCONNECTED) {
if ((so->so_state & SS_ISDISCONNECTING) == 0) {
error = sodisconnect(so);
if (error)
goto drop;
}
if (so->so_options & SO_LINGER) {
if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) ==
(SS_ISDISCONNECTING|SS_NBIO))
goto drop;
while (so->so_state & SS_ISCONNECTED) {
error = sowait(so, true, so->so_linger * hz);
if (error)
break;
}
}
}
drop:
if (so->so_pcb) {
KASSERT(solocked(so));
(*so->so_proto->pr_usrreqs->pr_detach)(so);
}
discard:
KASSERT((so->so_state & SS_NOFDREF) == 0);
kauth_cred_free(so->so_cred);
so->so_cred = NULL;
so->so_state |= SS_NOFDREF;
sofree(so);
return error;
}
/*
* Must be called with the socket locked. Will return with it unlocked.
*/
int
soabort(struct socket *so)
{
u_int refs;
int error;
KASSERT(solocked(so));
KASSERT(so->so_head == NULL);
so->so_aborting++; /* XXX */
error = (*so->so_proto->pr_usrreqs->pr_abort)(so);
refs = --so->so_aborting; /* XXX */
if (error || (refs == 0)) {
sofree(so);
} else {
sounlock(so);
}
return error;
}
int
soaccept(struct socket *so, struct sockaddr *nam)
{
int error;
KASSERT(solocked(so));
KASSERT((so->so_state & SS_NOFDREF) != 0);
so->so_state &= ~SS_NOFDREF;
if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
(so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
error = (*so->so_proto->pr_usrreqs->pr_accept)(so, nam);
else
error = ECONNABORTED;
return error;
}
int
soconnect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
int error;
KASSERT(solocked(so));
if (so->so_options & SO_ACCEPTCONN)
return EOPNOTSUPP;
/*
* If protocol is connection-based, can only connect once.
* Otherwise, if connected, try to disconnect first.
* This allows user to disconnect by connecting to, e.g.,
* a null address.
*/
if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
(error = sodisconnect(so)))) {
error = EISCONN;
} else {
if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
return EAFNOSUPPORT;
}
error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
}
return error;
}
int
soconnect2(struct socket *so1, struct socket *so2)
{
KASSERT(solocked2(so1, so2));
return (*so1->so_proto->pr_usrreqs->pr_connect2)(so1, so2);
}
int
sodisconnect(struct socket *so)
{
int error;
KASSERT(solocked(so));
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
} else if (so->so_state & SS_ISDISCONNECTING) {
error = EALREADY;
} else {
error = (*so->so_proto->pr_usrreqs->pr_disconnect)(so);
}
return error;
}
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
* Send on a socket.
* If send must go all at once and message is larger than
* send buffering, then hard error.
* Lock against other senders.
* If must go all at once and not enough room now, then
* inform user that this would block and do nothing.
* Otherwise, if nonblocking, send as much as possible.
* The data to be sent is described by "uio" if nonzero,
* otherwise by the mbuf chain "top" (which must be null
* if uio is not). Data provided in mbuf chain must be small
* enough to send all at once.
*
* Returns nonzero on error, timeout or signal; callers
* must check for short counts if EINTR/ERESTART are returned.
* Data and control buffers are freed on return.
*/
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct lwp *l)
{
struct mbuf **mp, *m;
long space, len, resid, clen, mlen;
int error, s, dontroute, atomic;
short wakeup_state = 0;
clen = 0;
/*
* solock() provides atomicity of access. splsoftnet() prevents
* protocol processing soft interrupts from interrupting us and
* blocking (expensive).
*/
s = splsoftnet();
solock(so);
atomic = sosendallatonce(so) || top;
if (uio)
resid = uio->uio_resid;
else
resid = top->m_pkthdr.len;
/*
* In theory resid should be unsigned.
* However, space must be signed, as it might be less than 0
* if we over-committed, and we must use a signed comparison
* of space and resid. On the other hand, a negative resid
* causes us to loop sending 0-length segments to the protocol.
*/
if (resid < 0) {
error = EINVAL;
goto out;
}
dontroute =
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
(so->so_proto->pr_flags & PR_ATOMIC);
l->l_ru.ru_msgsnd++;
if (control) clen = control->m_len;
restart:
if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
goto out;
do {
if (so->so_state & SS_CANTSENDMORE) {
error = EPIPE;
goto release;
}
if (so->so_error) {
error = so->so_error;
if ((flags & MSG_PEEK) == 0) so->so_error = 0;
goto release;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
if (resid || clen == 0) {
error = ENOTCONN;
goto release;
}
} else if (addr == NULL) {
error = EDESTADDRREQ;
goto release;
}
}
space = sbspace(&so->so_snd);
if (flags & MSG_OOB)
space += 1024;
if ((atomic && resid > so->so_snd.sb_hiwat) ||
clen > so->so_snd.sb_hiwat) {
error = EMSGSIZE;
goto release;
}
if (space < resid + clen &&
(atomic || space < so->so_snd.sb_lowat || space < clen)) {
if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
error = EWOULDBLOCK;
goto release;
}
sbunlock(&so->so_snd);
if (wakeup_state & SS_RESTARTSYS) {
error = ERESTART;
goto out;
}
error = sbwait(&so->so_snd);
if (error)
goto out;
wakeup_state = so->so_state;
goto restart;
}
wakeup_state = 0;
mp = &top;
space -= clen;
do {
if (uio == NULL) {
/*
* Data is prepackaged in "top".
*/
resid = 0;
if (flags & MSG_EOR)
top->m_flags |= M_EOR;
} else do {
sounlock(so);
splx(s);
if (top == NULL) {
m = m_gethdr(M_WAIT, MT_DATA);
mlen = MHLEN;
m->m_pkthdr.len = 0;
m_reset_rcvif(m);
} else {
m = m_get(M_WAIT, MT_DATA);
mlen = MLEN;
}
MCLAIM(m, so->so_snd.sb_mowner);
if (sock_loan_thresh >= 0 && uio->uio_iov->iov_len >= sock_loan_thresh &&
space >= sock_loan_thresh &&
(len = sosend_loan(so, uio, m,
space)) != 0) {
SOSEND_COUNTER_INCR(&sosend_loan_big);
space -= len;
goto have_data;
}
if (resid >= MINCLSIZE && space >= MCLBYTES) {
SOSEND_COUNTER_INCR(&sosend_copy_big);
m_clget(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0)
goto nopages;
mlen = MCLBYTES;
if (atomic && top == 0) {
len = lmin(MCLBYTES - max_hdr,
resid);
m->m_data += max_hdr;
} else
len = lmin(MCLBYTES, resid);
space -= len;
} else {
nopages:
SOSEND_COUNTER_INCR(&sosend_copy_small);
len = lmin(lmin(mlen, resid), space);
space -= len;
/*
* For datagram protocols, leave room
* for protocol headers in first mbuf.
*/
if (atomic && top == 0 && len < mlen) m_align(m, len);
}
error = uiomove(mtod(m, void *), (int)len, uio);
have_data:
resid = uio->uio_resid;
m->m_len = len;
*mp = m;
top->m_pkthdr.len += len;
s = splsoftnet();
solock(so);
if (error != 0)
goto release;
mp = &m->m_next;
if (resid <= 0) {
if (flags & MSG_EOR)
top->m_flags |= M_EOR;
break;
}
} while (space > 0 && atomic);
if (so->so_state & SS_CANTSENDMORE) {
error = EPIPE;
goto release;
}
if (dontroute)
so->so_options |= SO_DONTROUTE;
if (resid > 0)
so->so_state |= SS_MORETOCOME;
if (flags & MSG_OOB) {
error = (*so->so_proto->pr_usrreqs->pr_sendoob)(
so, top, control);
} else {
error = (*so->so_proto->pr_usrreqs->pr_send)(so,
top, addr, control, l);
}
if (dontroute)
so->so_options &= ~SO_DONTROUTE;
if (resid > 0)
so->so_state &= ~SS_MORETOCOME;
clen = 0;
control = NULL;
top = NULL;
mp = &top;
if (error != 0)
goto release;
} while (resid && space > 0);
} while (resid);
release:
sbunlock(&so->so_snd);
out:
sounlock(so);
splx(s);
if (top)
m_freem(top);
if (control)
m_freem(control);
return error;
}
/*
* Following replacement or removal of the first mbuf on the first
* mbuf chain of a socket buffer, push necessary state changes back
* into the socket buffer so that other consumers see the values
* consistently. 'nextrecord' is the caller's locally stored value of
* the original value of sb->sb_mb->m_nextpkt which must be restored
* when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
*/
static void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{
KASSERT(solocked(sb->sb_so));
/*
* First, update for the new value of nextrecord. If necessary,
* make it the first record.
*/
if (sb->sb_mb != NULL)
sb->sb_mb->m_nextpkt = nextrecord;
else
sb->sb_mb = nextrecord;
/*
* Now update any dependent socket buffer fields to reflect
* the new state. This is an inline of SB_EMPTY_FIXUP, with
* the addition of a second clause that takes care of the
* case where sb_mb has been updated, but remains the last
* record.
*/
if (sb->sb_mb == NULL) {
sb->sb_mbtail = NULL;
sb->sb_lastrecord = NULL;
} else if (sb->sb_mb->m_nextpkt == NULL)
sb->sb_lastrecord = sb->sb_mb;
}
/*
* Implement receive operations on a socket.
*
* We depend on the way that records are added to the sockbuf by sbappend*. In
* particular, each record (mbufs linked through m_next) must begin with an
* address if the protocol so specifies, followed by an optional mbuf or mbufs
* containing ancillary data, and then zero or more mbufs of data.
*
* In order to avoid blocking network interrupts for the entire time here, we
* splx() while doing the actual copy to user space. Although the sockbuf is
* locked, new data may still be appended, and thus we must maintain
* consistency of the sockbuf during that time.
*
* The caller may receive the data as a single mbuf chain by supplying an mbuf
* **mp0 for use in returning the chain. The uio is then used only for the
* count in uio_resid.
*/
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
struct lwp *l = curlwp;
struct mbuf *m, **mp, *mt;
size_t len, offset, moff, orig_resid;
int atomic, flags, error, s, type;
const struct protosw *pr;
struct mbuf *nextrecord;
int mbuf_removed = 0;
const struct domain *dom;
short wakeup_state = 0;
pr = so->so_proto;
atomic = pr->pr_flags & PR_ATOMIC;
dom = pr->pr_domain;
mp = mp0;
type = 0;
orig_resid = uio->uio_resid;
if (paddr != NULL)
*paddr = NULL;
if (controlp != NULL)
*controlp = NULL;
if (flagsp != NULL)
flags = *flagsp &~ MSG_EOR;
else
flags = 0;
if (flags & MSG_OOB) {
m = m_get(M_WAIT, MT_DATA);
solock(so);
error = (*pr->pr_usrreqs->pr_recvoob)(so, m, flags & MSG_PEEK);
sounlock(so);
if (error)
goto bad;
do {
error = uiomove(mtod(m, void *),
MIN(uio->uio_resid, m->m_len), uio);
m = m_free(m);
} while (uio->uio_resid > 0 && error == 0 && m);
bad:
if (m != NULL) m_freem(m);
return error;
}
if (mp != NULL) *mp = NULL;
/*
* solock() provides atomicity of access. splsoftnet() prevents
* protocol processing soft interrupts from interrupting us and
* blocking (expensive).
*/
s = splsoftnet();
solock(so);
restart:
if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
sounlock(so);
splx(s);
return error;
}
m = so->so_rcv.sb_mb;
/*
* If we have less data than requested, block awaiting more
* (subject to any timeout) if:
* 1. the current count is less than the low water mark,
* 2. MSG_WAITALL is set, and it is possible to do the entire
* receive operation at once if we block (resid <= hiwat), or
* 3. MSG_DONTWAIT is not set.
* If MSG_WAITALL is set but resid is larger than the receive buffer,
* we have to do the receive in sections, and thus risk returning
* a short count if a timeout or signal occurs after we start.
*/
if (m == NULL ||
((flags & MSG_DONTWAIT) == 0 &&
so->so_rcv.sb_cc < uio->uio_resid &&
(so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
((flags & MSG_WAITALL) &&
uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
m->m_nextpkt == NULL && !atomic)) {
#ifdef DIAGNOSTIC
if (m == NULL && so->so_rcv.sb_cc) panic("receive 1");
#endif
if (so->so_error || so->so_rerror) {
u_short *e;
if (m != NULL)
goto dontblock;
e = so->so_error ? &so->so_error : &so->so_rerror;
error = *e;
if ((flags & MSG_PEEK) == 0) *e = 0;
goto release;
}
if (so->so_state & SS_CANTRCVMORE) {
if (m != NULL)
goto dontblock;
else
goto release;
}
for (; m != NULL; m = m->m_next)
if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
m = so->so_rcv.sb_mb;
goto dontblock;
}
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
(so->so_proto->pr_flags & PR_CONNREQUIRED)) {
error = ENOTCONN;
goto release;
}
if (uio->uio_resid == 0)
goto release;
if ((so->so_state & SS_NBIO) ||
(flags & (MSG_DONTWAIT|MSG_NBIO))) {
error = EWOULDBLOCK;
goto release;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
sbunlock(&so->so_rcv);
if (wakeup_state & SS_RESTARTSYS)
error = ERESTART;
else
error = sbwait(&so->so_rcv);
if (error != 0) {
sounlock(so);
splx(s);
return error;
}
wakeup_state = so->so_state;
goto restart;
}
dontblock:
/*
* On entry here, m points to the first record of the socket buffer.
* From this point onward, we maintain 'nextrecord' as a cache of the
* pointer to the next record in the socket buffer. We must keep the
* various socket buffer pointers and local stack versions of the
* pointers in sync, pushing out modifications before dropping the
* socket lock, and re-reading them when picking it up.
*
* Otherwise, we will race with the network stack appending new data
* or records onto the socket buffer by using inconsistent/stale
* versions of the field, possibly resulting in socket buffer
* corruption.
*
* By holding the high-level sblock(), we prevent simultaneous
* readers from pulling off the front of the socket buffer.
*/
if (l != NULL)
l->l_ru.ru_msgrcv++;
KASSERT(m == so->so_rcv.sb_mb);
SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
nextrecord = m->m_nextpkt;
if (pr->pr_flags & PR_ADDR) {
KASSERT(m->m_type == MT_SONAME);
orig_resid = 0;
if (flags & MSG_PEEK) {
if (paddr) *paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
mbuf_removed = 1;
if (paddr != NULL) {
*paddr = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = NULL;
m = so->so_rcv.sb_mb;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
sbsync(&so->so_rcv, nextrecord);
}
}
if (pr->pr_flags & PR_ADDR_OPT) {
/*
* For SCTP we may be getting a whole message OR a partial
* delivery.
*/
if (m->m_type == MT_SONAME) {
orig_resid = 0;
if (flags & MSG_PEEK) {
if (paddr) *paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
mbuf_removed = 1;
if (paddr) {
*paddr = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = 0;
m = so->so_rcv.sb_mb;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
sbsync(&so->so_rcv, nextrecord);
}
}
}
/*
* Process one or more MT_CONTROL mbufs present before any data mbufs
* in the first mbuf chain on the socket buffer. If MSG_PEEK, we
* just copy the data; if !MSG_PEEK, we call into the protocol to
* perform externalization (or freeing if controlp == NULL).
*/
if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
struct mbuf *cm = NULL, *cmn;
struct mbuf **cme = &cm;
do {
if (flags & MSG_PEEK) {
if (controlp != NULL) {
*controlp = m_copym(m, 0, m->m_len, M_DONTWAIT);
controlp = (*controlp == NULL ? NULL :
&(*controlp)->m_next);
}
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
so->so_rcv.sb_mb = m->m_next;
m->m_next = NULL;
*cme = m;
cme = &(*cme)->m_next;
m = so->so_rcv.sb_mb;
}
} while (m != NULL && m->m_type == MT_CONTROL);
if ((flags & MSG_PEEK) == 0)
sbsync(&so->so_rcv, nextrecord);
for (; cm != NULL; cm = cmn) {
cmn = cm->m_next;
cm->m_next = NULL;
type = mtod(cm, struct cmsghdr *)->cmsg_type;
if (controlp != NULL) {
if (dom->dom_externalize != NULL &&
type == SCM_RIGHTS) {
sounlock(so);
splx(s);
error = (*dom->dom_externalize)(cm, l,
(flags & MSG_CMSG_CLOEXEC) ?
O_CLOEXEC : 0);
s = splsoftnet();
solock(so);
}
*controlp = cm;
while (*controlp != NULL) controlp = &(*controlp)->m_next;
} else {
/*
* Dispose of any SCM_RIGHTS message that went
* through the read path rather than recv.
*/
if (dom->dom_dispose != NULL &&
type == SCM_RIGHTS) {
sounlock(so);
(*dom->dom_dispose)(cm);
solock(so);
}
m_freem(cm);
}
}
if (m != NULL)
nextrecord = so->so_rcv.sb_mb->m_nextpkt;
else
nextrecord = so->so_rcv.sb_mb;
orig_resid = 0;
}
/* If m is non-NULL, we have some data to read. */
if (__predict_true(m != NULL)) {
type = m->m_type;
if (type == MT_OOBDATA)
flags |= MSG_OOB;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
moff = 0;
offset = 0;
while (m != NULL && uio->uio_resid > 0 && error == 0) {
/*
* If the type of mbuf has changed, end the receive
* operation and do a short read.
*/
if (m->m_type == MT_OOBDATA) {
if (type != MT_OOBDATA)
break;
} else if (type == MT_OOBDATA) {
break;
} else if (m->m_type == MT_CONTROL) {
break;
}
#ifdef DIAGNOSTIC
else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
panic("%s: m_type=%d", __func__, m->m_type);
}
#endif
so->so_state &= ~SS_RCVATMARK;
wakeup_state = 0;
len = uio->uio_resid;
if (so->so_oobmark && len > so->so_oobmark - offset)
len = so->so_oobmark - offset;
if (len > m->m_len - moff)
len = m->m_len - moff;
/*
* If mp is set, just pass back the mbufs.
* Otherwise copy them out via the uio, then free.
* Sockbuf must be consistent here (points to current mbuf,
* it points to next record) when we drop priority;
* we must note any additions to the sockbuf when we
* block interrupts again.
*/
if (mp == NULL) {
SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
sounlock(so);
splx(s);
error = uiomove(mtod(m, char *) + moff, len, uio);
s = splsoftnet();
solock(so);
if (error != 0) {
/*
* If any part of the record has been removed
* (such as the MT_SONAME mbuf, which will
* happen when PR_ADDR, and thus also
* PR_ATOMIC, is set), then drop the entire
* record to maintain the atomicity of the
* receive operation.
*
* This avoids a later panic("receive 1a")
* when compiled with DIAGNOSTIC.
*/
if (m && mbuf_removed && atomic) (void) sbdroprecord(&so->so_rcv);
goto release;
}
} else {
uio->uio_resid -= len;
}
if (len == m->m_len - moff) {
if (m->m_flags & M_EOR)
flags |= MSG_EOR;
#ifdef SCTP
if (m->m_flags & M_NOTIFICATION)
flags |= MSG_NOTIFICATION;
#endif
if (flags & MSG_PEEK) {
m = m->m_next;
moff = 0;
} else {
nextrecord = m->m_nextpkt;
sbfree(&so->so_rcv, m);
if (mp) {
*mp = m;
mp = &m->m_next;
so->so_rcv.sb_mb = m = m->m_next;
*mp = NULL;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
/*
* If m != NULL, we also know that
* so->so_rcv.sb_mb != NULL.
*/
KASSERT(so->so_rcv.sb_mb == m);
if (m) {
m->m_nextpkt = nextrecord;
if (nextrecord == NULL) so->so_rcv.sb_lastrecord = m;
} else {
so->so_rcv.sb_mb = nextrecord;
SB_EMPTY_FIXUP(&so->so_rcv);
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
}
} else if (flags & MSG_PEEK) {
moff += len;
} else {
if (mp != NULL) {
mt = m_copym(m, 0, len, M_NOWAIT);
if (__predict_false(mt == NULL)) {
sounlock(so);
mt = m_copym(m, 0, len, M_WAIT);
solock(so);
}
*mp = mt;
}
m->m_data += len;
m->m_len -= len;
so->so_rcv.sb_cc -= len;
}
if (so->so_oobmark) {
if ((flags & MSG_PEEK) == 0) {
so->so_oobmark -= len;
if (so->so_oobmark == 0) {
so->so_state |= SS_RCVATMARK;
break;
}
} else {
offset += len;
if (offset == so->so_oobmark)
break;
}
} else {
so->so_state &= ~SS_POLLRDBAND;
}
if (flags & MSG_EOR)
break;
/*
* If the MSG_WAITALL flag is set (for non-atomic socket),
* we must not quit until "uio->uio_resid == 0" or an error
* termination. If a signal/timeout occurs, return
* with a short count but without error.
* Keep sockbuf locked against other readers.
*/
while (flags & MSG_WAITALL && m == NULL &&
uio->uio_resid > 0 && !sosendallatonce(so) && !nextrecord) {
if (so->so_error || so->so_rerror ||
so->so_state & SS_CANTRCVMORE)
break;
/*
* If we are peeking and the socket receive buffer is
* full, stop since we can't get more data to peek at.
*/
if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
break;
/*
* If we've drained the socket buffer, tell the
* protocol in case it needs to do something to
* get it filled again.
*/
if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) (*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
if (wakeup_state & SS_RESTARTSYS)
error = ERESTART;
else
error = sbwait(&so->so_rcv);
if (error != 0) {
sbunlock(&so->so_rcv);
sounlock(so);
splx(s);
return 0;
}
if ((m = so->so_rcv.sb_mb) != NULL)
nextrecord = m->m_nextpkt;
wakeup_state = so->so_state;
}
}
if (m && atomic) {
flags |= MSG_TRUNC;
if ((flags & MSG_PEEK) == 0) (void) sbdroprecord(&so->so_rcv);
}
if ((flags & MSG_PEEK) == 0) {
if (m == NULL) {
/*
* First part is an inline SB_EMPTY_FIXUP(). Second
* part makes sure sb_lastrecord is up-to-date if
* there is still data in the socket buffer.
*/
so->so_rcv.sb_mb = nextrecord;
if (so->so_rcv.sb_mb == NULL) {
so->so_rcv.sb_mbtail = NULL;
so->so_rcv.sb_lastrecord = NULL;
} else if (nextrecord->m_nextpkt == NULL)
so->so_rcv.sb_lastrecord = nextrecord;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
}
if (orig_resid == uio->uio_resid && orig_resid &&
(flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
sbunlock(&so->so_rcv);
goto restart;
}
if (flagsp != NULL) *flagsp |= flags;
release:
sbunlock(&so->so_rcv);
sounlock(so);
splx(s);
return error;
}
int
soshutdown(struct socket *so, int how)
{
const struct protosw *pr;
int error;
KASSERT(solocked(so));
pr = so->so_proto;
if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
return EINVAL;
if (how == SHUT_RD || how == SHUT_RDWR) {
sorflush(so);
error = 0;
}
if (how == SHUT_WR || how == SHUT_RDWR)
error = (*pr->pr_usrreqs->pr_shutdown)(so);
return error;
}
void
sorestart(struct socket *so)
{
/*
* An application has called close() on an fd on which another
* of its threads has called a socket system call.
* Mark this and wake everyone up, and code that would block again
* instead returns ERESTART.
* On system call re-entry the fd is validated and EBADF returned.
* Any other fd will block again on the 2nd syscall.
*/
solock(so);
so->so_state |= SS_RESTARTSYS;
cv_broadcast(&so->so_cv);
cv_broadcast(&so->so_snd.sb_cv);
cv_broadcast(&so->so_rcv.sb_cv);
sounlock(so);
}
void
sorflush(struct socket *so)
{
struct sockbuf *sb, asb;
const struct protosw *pr;
KASSERT(solocked(so));
sb = &so->so_rcv;
pr = so->so_proto;
socantrcvmore(so);
sb->sb_flags |= SB_NOINTR;
(void )sblock(sb, M_WAITOK);
sbunlock(sb);
asb = *sb;
/*
* Clear most of the sockbuf structure, but leave some of the
* fields valid.
*/
memset(&sb->sb_startzero, 0,
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
sounlock(so);
(*pr->pr_domain->dom_dispose)(asb.sb_mb);
solock(so);
}
sbrelease(&asb, so);
}
/*
* internal set SOL_SOCKET options
*/
static int
sosetopt1(struct socket *so, const struct sockopt *sopt)
{
int error, opt;
int optval = 0; /* XXX: gcc */
struct linger l;
struct timeval tv;
opt = sopt->sopt_name;
switch (opt) {
case SO_ACCEPTFILTER:
error = accept_filt_setopt(so, sopt);
KASSERT(solocked(so));
break;
case SO_LINGER:
error = sockopt_get(sopt, &l, sizeof(l));
solock(so);
if (error)
break;
if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
l.l_linger > (INT_MAX / hz)) {
error = EDOM;
break;
}
so->so_linger = l.l_linger;
if (l.l_onoff)
so->so_options |= SO_LINGER;
else
so->so_options &= ~SO_LINGER;
break;
case SO_DEBUG:
case SO_KEEPALIVE:
case SO_DONTROUTE:
case SO_USELOOPBACK:
case SO_BROADCAST:
case SO_REUSEADDR:
case SO_REUSEPORT:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_NOSIGPIPE:
case SO_RERROR:
error = sockopt_getint(sopt, &optval);
solock(so);
if (error)
break;
if (optval)
so->so_options |= opt;
else
so->so_options &= ~opt;
break;
case SO_SNDBUF:
case SO_RCVBUF:
case SO_SNDLOWAT:
case SO_RCVLOWAT:
error = sockopt_getint(sopt, &optval);
solock(so);
if (error)
break;
/*
* Values < 1 make no sense for any of these
* options, so disallow them.
*/
if (optval < 1) {
error = EINVAL;
break;
}
switch (opt) {
case SO_SNDBUF:
if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
error = ENOBUFS;
break;
}
if (sofixedbuf) so->so_snd.sb_flags &= ~SB_AUTOSIZE;
break;
case SO_RCVBUF:
if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
error = ENOBUFS;
break;
}
if (sofixedbuf) so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
break;
/*
* Make sure the low-water is never greater than
* the high-water.
*/
case SO_SNDLOWAT:
if (optval > so->so_snd.sb_hiwat) optval = so->so_snd.sb_hiwat;
so->so_snd.sb_lowat = optval;
break;
case SO_RCVLOWAT:
if (optval > so->so_rcv.sb_hiwat) optval = so->so_rcv.sb_hiwat;
so->so_rcv.sb_lowat = optval;
break;
}
break;
case SO_SNDTIMEO:
case SO_RCVTIMEO:
solock(so);
error = sockopt_get(sopt, &tv, sizeof(tv));
if (error)
break;
if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
error = EDOM;
break;
}
if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
error = EDOM;
break;
}
optval = tv.tv_sec * hz + tv.tv_usec / tick;
if (optval == 0 && tv.tv_usec != 0)
optval = 1;
switch (opt) {
case SO_SNDTIMEO:
so->so_snd.sb_timeo = optval;
break;
case SO_RCVTIMEO:
so->so_rcv.sb_timeo = optval;
break;
}
break;
default:
MODULE_HOOK_CALL(uipc_socket_50_setopt1_hook,
(opt, so, sopt), enosys(), error);
if (error == ENOSYS || error == EPASSTHROUGH) {
solock(so);
error = ENOPROTOOPT;
}
break;
}
KASSERT(solocked(so));
return error;
}
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
int error, prerr;
if (sopt->sopt_level == SOL_SOCKET) {
error = sosetopt1(so, sopt);
KASSERT(solocked(so));
} else {
error = ENOPROTOOPT;
solock(so);
}
if ((error == 0 || error == ENOPROTOOPT) &&
so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
/* give the protocol stack a shot */
prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
if (prerr == 0)
error = 0;
else if (prerr != ENOPROTOOPT)
error = prerr;
}
sounlock(so);
return error;
}
/*
* so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
*/
int
so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
const void *val, size_t valsize)
{
struct sockopt sopt;
int error;
KASSERT(valsize == 0 || val != NULL);
sockopt_init(&sopt, level, name, valsize);
sockopt_set(&sopt, val, valsize);
error = sosetopt(so, &sopt);
sockopt_destroy(&sopt);
return error;
}
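/*
 * Illustrative sketch (not part of the original source): a kernel
 * caller that wants to enable SO_KEEPALIVE on a socket it already
 * holds could use the wrapper above roughly as follows; "l" and "so"
 * are assumed to come from the caller's context.
 *
 *	int one = 1;
 *	error = so_setsockopt(l, so, SOL_SOCKET, SO_KEEPALIVE,
 *	    &one, sizeof(one));
 *
 * so_setsockopt() builds the temporary struct sockopt, hands it to
 * sosetopt() (which takes and releases the socket lock), and then
 * destroys the sockopt storage.
 */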
/*
* internal get SOL_SOCKET options
*/
static int
sogetopt1(struct socket *so, struct sockopt *sopt)
{
int error, optval, opt;
struct linger l;
struct timeval tv;
switch ((opt = sopt->sopt_name)) {
case SO_ACCEPTFILTER:
error = accept_filt_getopt(so, sopt);
break;
case SO_LINGER:
l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
l.l_linger = so->so_linger;
error = sockopt_set(sopt, &l, sizeof(l));
break;
case SO_USELOOPBACK:
case SO_DONTROUTE:
case SO_DEBUG:
case SO_KEEPALIVE:
case SO_REUSEADDR:
case SO_REUSEPORT:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_NOSIGPIPE:
case SO_RERROR:
case SO_ACCEPTCONN:
error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
break;
case SO_TYPE:
error = sockopt_setint(sopt, so->so_type);
break;
case SO_ERROR:
if (so->so_error == 0) {
so->so_error = so->so_rerror;
so->so_rerror = 0;
}
error = sockopt_setint(sopt, so->so_error);
so->so_error = 0;
break;
case SO_SNDBUF:
error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
break;
case SO_RCVBUF:
error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
break;
case SO_SNDLOWAT:
error = sockopt_setint(sopt, so->so_snd.sb_lowat);
break;
case SO_RCVLOWAT:
error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
break;
case SO_SNDTIMEO:
case SO_RCVTIMEO:
optval = (opt == SO_SNDTIMEO ?
so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
memset(&tv, 0, sizeof(tv));
tv.tv_sec = optval / hz;
tv.tv_usec = (optval % hz) * tick;
error = sockopt_set(sopt, &tv, sizeof(tv));
break;
case SO_OVERFLOWED:
error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
break;
default:
MODULE_HOOK_CALL(uipc_socket_50_getopt1_hook,
(opt, so, sopt), enosys(), error);
if (error)
error = ENOPROTOOPT;
break;
}
return error;
}
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
int error;
solock(so);
if (sopt->sopt_level != SOL_SOCKET) {
if (so->so_proto && so->so_proto->pr_ctloutput) {
error = ((*so->so_proto->pr_ctloutput)
(PRCO_GETOPT, so, sopt));
} else
error = (ENOPROTOOPT);
} else {
error = sogetopt1(so, sopt);
}
sounlock(so);
return error;
}
/*
* alloc sockopt data buffer
* - will be released at destroy
*/
static int
sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
{
void *data;
KASSERT(sopt->sopt_size == 0);
if (len > sizeof(sopt->sopt_buf)) {
data = kmem_zalloc(len, kmflag);
if (data == NULL)
return ENOMEM;
sopt->sopt_data = data;
} else
sopt->sopt_data = sopt->sopt_buf;
sopt->sopt_size = len;
return 0;
}
/*
* initialise sockopt storage
* - MAY sleep during allocation
*/
void
sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
{
memset(sopt, 0, sizeof(*sopt));
sopt->sopt_level = level;
sopt->sopt_name = name;
(void)sockopt_alloc(sopt, size, KM_SLEEP);
}
/*
* destroy sockopt storage
* - will release any held memory references
*/
void
sockopt_destroy(struct sockopt *sopt)
{
if (sopt->sopt_data != sopt->sopt_buf)
kmem_free(sopt->sopt_data, sopt->sopt_size);
memset(sopt, 0, sizeof(*sopt));
}
/*
* set sockopt value
* - value is copied into sockopt
* - memory is allocated when necessary, will not sleep
*/
int
sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
{
int error;
if (sopt->sopt_size == 0) {
error = sockopt_alloc(sopt, len, KM_NOSLEEP);
if (error)
return error;
}
sopt->sopt_retsize = MIN(sopt->sopt_size, len);
if (sopt->sopt_retsize > 0) {
memcpy(sopt->sopt_data, buf, sopt->sopt_retsize);
}
return 0;
}
/*
* common case of set sockopt integer value
*/
int
sockopt_setint(struct sockopt *sopt, int val)
{
return sockopt_set(sopt, &val, sizeof(int));
}
/*
* get sockopt value
* - correct size must be given
*/
int
sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
{
if (sopt->sopt_size != len)
return EINVAL;
memcpy(buf, sopt->sopt_data, len);
return 0;
}
/*
* common case of get sockopt integer value
*/
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{
return sockopt_get(sopt, valp, sizeof(int));
}
/*
* set sockopt value from mbuf
* - ONLY for legacy code
* - mbuf is released by sockopt
* - will not sleep
*/
int
sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
{
size_t len;
int error;
len = m_length(m);
if (sopt->sopt_size == 0) {
error = sockopt_alloc(sopt, len, KM_NOSLEEP);
if (error)
return error;
}
sopt->sopt_retsize = MIN(sopt->sopt_size, len);
m_copydata(m, 0, sopt->sopt_retsize, sopt->sopt_data);
m_freem(m);
return 0;
}
/*
* get sockopt value into mbuf
* - ONLY for legacy code
* - mbuf to be released by the caller
* - will not sleep
*/
struct mbuf *
sockopt_getmbuf(const struct sockopt *sopt)
{
struct mbuf *m;
if (sopt->sopt_size > MCLBYTES)
return NULL;
m = m_get(M_DONTWAIT, MT_SOOPTS);
if (m == NULL)
return NULL;
if (sopt->sopt_size > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
m->m_len = sopt->sopt_size;
return m;
}
void
sohasoutofband(struct socket *so)
{
so->so_state |= SS_POLLRDBAND;
fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
}
static void
filt_sordetach(struct knote *kn)
{
struct socket *so;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
if (selremove_knote(&so->so_rcv.sb_sel, kn))
so->so_rcv.sb_flags &= ~SB_KNOTE;
sounlock(so);
}
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = so->so_rcv.sb_cc;
if (so->so_state & SS_CANTRCVMORE) {
knote_set_eof(kn, 0);
kn->kn_fflags = so->so_error;
rv = 1;
} else if (so->so_error || so->so_rerror)
rv = 1;
else if (kn->kn_sfflags & NOTE_LOWAT)
rv = (kn->kn_data >= kn->kn_sdata);
else
rv = (kn->kn_data >= so->so_rcv.sb_lowat);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static void
filt_sowdetach(struct knote *kn)
{
struct socket *so;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
if (selremove_knote(&so->so_snd.sb_sel, kn))
so->so_snd.sb_flags &= ~SB_KNOTE;
sounlock(so);
}
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = sbspace(&so->so_snd);
if (so->so_state & SS_CANTSENDMORE) {
knote_set_eof(kn, 0);
kn->kn_fflags = so->so_error;
rv = 1;
} else if (so->so_error)
rv = 1;
else if (((so->so_state & SS_ISCONNECTED) == 0) &&
(so->so_proto->pr_flags & PR_CONNREQUIRED))
rv = 0;
else if (kn->kn_sfflags & NOTE_LOWAT)
rv = (kn->kn_data >= kn->kn_sdata);
else
rv = (kn->kn_data >= so->so_snd.sb_lowat);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static int
filt_soempty(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
rv = (kn->kn_data = sbused(&so->so_snd)) == 0 ||
(so->so_options & SO_ACCEPTCONN) != 0;
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
/*
* Set kn_data to number of incoming connections, not
* counting partial (incomplete) connections.
*/
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = so->so_qlen;
rv = (kn->kn_data > 0);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static const struct filterops solisten_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sordetach,
.f_event = filt_solisten,
};
static const struct filterops soread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sordetach,
.f_event = filt_soread,
};
static const struct filterops sowrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sowdetach,
.f_event = filt_sowrite,
};
static const struct filterops soempty_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sowdetach,
.f_event = filt_soempty,
};
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
struct socket *so;
struct sockbuf *sb;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
switch (kn->kn_filter) {
case EVFILT_READ:
if (so->so_options & SO_ACCEPTCONN)
kn->kn_fop = &solisten_filtops;
else
kn->kn_fop = &soread_filtops;
sb = &so->so_rcv;
break;
case EVFILT_WRITE:
kn->kn_fop = &sowrite_filtops;
sb = &so->so_snd;
#ifdef PIPE_SOCKETPAIR
if (so->so_state & SS_ISAPIPE) {
/* Other end of pipe has been closed. */
if (so->so_state & SS_ISDISCONNECTED) {
sounlock(so);
return EBADF;
}
}
#endif
break;
case EVFILT_EMPTY:
kn->kn_fop = &soempty_filtops;
sb = &so->so_snd;
break;
default:
sounlock(so);
return EINVAL;
}
selrecord_knote(&sb->sb_sel, kn);
sb->sb_flags |= SB_KNOTE;
sounlock(so);
return 0;
}
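/*
 * Illustrative userland sketch (not part of the original source): the
 * filter operations above are what back an EVFILT_READ registration on
 * a socket descriptor, e.g. ("sock_fd" names the caller's socket):
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *	EV_SET(&ev, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * For a listening socket the same registration is routed to
 * solisten_filtops and fires once so_qlen becomes non-zero.
 */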
static int
sodopoll(struct socket *so, int events)
{
int revents;
revents = 0;
if (events & (POLLIN | POLLRDNORM))
if (soreadable(so))
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (sowritable(so))
revents |= events & (POLLOUT | POLLWRNORM);
if (events & (POLLPRI | POLLRDBAND))
if (so->so_state & SS_POLLRDBAND)
revents |= events & (POLLPRI | POLLRDBAND);
return revents;
}
int
sopoll(struct socket *so, int events)
{
int revents = 0;
#ifndef DIAGNOSTIC
/*
* Do a quick, unlocked check in expectation that the socket
* will be ready for I/O. Don't do this check if DIAGNOSTIC,
* as the solocked() assertions will fail.
*/
if ((revents = sodopoll(so, events)) != 0)
return revents;
#endif
solock(so);
if ((revents = sodopoll(so, events)) == 0) {
if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
selrecord(curlwp, &so->so_rcv.sb_sel);
so->so_rcv.sb_flags |= SB_NOTIFY;
}
if (events & (POLLOUT | POLLWRNORM)) {
selrecord(curlwp, &so->so_snd.sb_sel);
so->so_snd.sb_flags |= SB_NOTIFY;
}
}
sounlock(so);
return revents;
}
struct mbuf **
sbsavetimestamp(int opt, struct mbuf **mp)
{
struct timeval tv;
int error;
memset(&tv, 0, sizeof(tv));
microtime(&tv);
MODULE_HOOK_CALL(uipc_socket_50_sbts_hook, (opt, &mp), enosys(), error);
if (error == 0)
return mp;
if (opt & SO_TIMESTAMP) {
*mp = sbcreatecontrol(&tv, sizeof(tv),
SCM_TIMESTAMP, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
return mp;
}
#include <sys/sysctl.h>
static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
static int sysctl_kern_sbmax(SYSCTLFN_PROTO);
/*
* sysctl helper routine for kern.somaxkva. ensures that the given
* value is not too small.
* (XXX should we maybe make sure it's not too large as well?)
*/
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
int error, new_somaxkva;
struct sysctlnode node;
new_somaxkva = somaxkva;
node = *rnode;
node.sysctl_data = &new_somaxkva;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
return EINVAL;
mutex_enter(&so_pendfree_lock);
somaxkva = new_somaxkva;
cv_broadcast(&socurkva_cv);
mutex_exit(&so_pendfree_lock);
return error;
}
/*
* sysctl helper routine for kern.sbmax. Basically just ensures that
* any new value is not too small.
*/
static int
sysctl_kern_sbmax(SYSCTLFN_ARGS)
{
int error, new_sbmax;
struct sysctlnode node;
new_sbmax = sb_max;
node = *rnode;
node.sysctl_data = &new_sbmax;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
KERNEL_LOCK(1, NULL);
error = sb_max_set(new_sbmax);
KERNEL_UNLOCK_ONE(NULL);
return error;
}
/*
* sysctl helper routine for kern.sooptions. Ensures that only allowed
* options can be set.
*/
static int
sysctl_kern_sooptions(SYSCTLFN_ARGS)
{
int error, new_options;
struct sysctlnode node;
new_options = sooptions;
node = *rnode;
node.sysctl_data = &new_options;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (new_options & ~SO_DEFOPTS)
return EINVAL;
sooptions = new_options;
return 0;
}
static void
sysctl_kern_socket_setup(void)
{
KASSERT(socket_sysctllog == NULL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "somaxkva",
SYSCTL_DESCR("Maximum amount of kernel memory to be "
"used for socket buffers"),
sysctl_kern_somaxkva, 0, NULL, 0,
CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_BOOL, "sofixedbuf",
SYSCTL_DESCR("Prevent scaling of fixed socket buffers"),
NULL, 0, &sofixedbuf, 0,
CTL_KERN, KERN_SOFIXEDBUF, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sbmax",
SYSCTL_DESCR("Maximum socket buffer size"),
sysctl_kern_sbmax, 0, NULL, 0,
CTL_KERN, KERN_SBMAX, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sooptions",
SYSCTL_DESCR("Default socket options"),
sysctl_kern_sooptions, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
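/*
 * Illustrative sketch (not part of the original source): the nodes
 * created above live under kern.* and can be inspected or tuned with
 * sysctl(8), e.g.
 *
 *	# sysctl kern.somaxkva
 *	# sysctl -w kern.sbmax=1048576
 *
 * Writes are validated by the helper routines above: somaxkva must be
 * at least 16 MB, and sooptions may only contain bits from SO_DEFOPTS.
 */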
/* $NetBSD: ufs_lookup.c,v 1.158 2023/08/10 20:49:20 mrg Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_lookup.c 8.9 (Berkeley) 8/11/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.158 2023/08/10 20:49:20 mrg Exp $");
#ifdef _KERNEL_OPT
#include "opt_ffs.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/wapbl.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <miscfs/genfs/genfs.h>
#ifdef DIAGNOSTIC
int dirchk = 1;
#else
int dirchk = 0;
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
# define ENDIANSWAP(needswap) ((needswap) == 0)
#else
# define ENDIANSWAP(needswap) ((needswap) != 0)
#endif
#define NAMLEN(fsfmt, needswap, dp) \
((fsfmt) && ENDIANSWAP(needswap) ? (dp)->d_type : (dp)->d_namlen)
static void
ufs_dirswap(struct direct *dirp)
{
uint8_t tmp = dirp->d_namlen;
dirp->d_namlen = dirp->d_type;
dirp->d_type = tmp;
}
struct slotinfo {
enum {
NONE, /* need to search a slot for our new entry */
COMPACT, /* a compaction can make a slot in the current
DIRBLKSIZ block */
FOUND, /* found a slot (or no need to search) */
} status;
doff_t offset; /* offset of area with free space.
a special value -1 for invalid */
int size; /* size of area at slotoffset */
int freespace; /* accumulated amount of space free in
the current DIRBLKSIZ block */
int needed; /* size of the entry we're seeking */
};
static void
calc_count(struct ufs_lookup_results *results, int dirblksiz, doff_t prevoff)
{
if ((results->ulr_offset & (dirblksiz - 1)) == 0)
results->ulr_count = 0;
else
results->ulr_count = results->ulr_offset - prevoff;
}
static void
slot_init(struct slotinfo *slot)
{
slot->status = FOUND;
slot->offset = -1;
slot->freespace = slot->size = slot->needed = 0;
}
#ifdef UFS_DIRHASH
static doff_t
slot_findfree(struct slotinfo *slot, struct inode *dp)
{
if (slot->status == FOUND)
return dp->i_size;
slot->offset = ufsdirhash_findfree(dp, slot->needed, &slot->size);
if (slot->offset < 0)
return dp->i_size;
slot->status = COMPACT;
doff_t enduseful = ufsdirhash_enduseful(dp);
if (enduseful < 0)
return dp->i_size;
return enduseful;
}
#endif
static void
slot_white(struct slotinfo *slot, uint16_t reclen,
struct ufs_lookup_results *results)
{
slot->status = FOUND;
slot->offset = results->ulr_offset;
slot->size = reclen;
results->ulr_reclen = slot->size;
}
static void
slot_update(struct slotinfo *slot, int size, uint16_t reclen, doff_t offset)
{
if (size >= slot->needed) {
slot->status = FOUND;
slot->offset = offset;
slot->size = reclen;
} else if (slot->status == NONE) {
slot->freespace += size;
if (slot->offset == -1)
slot->offset = offset;
if (slot->freespace >= slot->needed) {
slot->status = COMPACT;
slot->size = offset + reclen - slot->offset;
}
}
}
/*
* Return an indication of where the new directory entry should be put.
* If we didn't find a slot, then set results->ulr_count to 0 indicating
* that the new slot belongs at the end of the directory. If we found a slot,
* then the new entry can be put in the range from results->ulr_offset to
* results->ulr_offset + results->ulr_count.
*/
static int
slot_estimate(const struct slotinfo *slot, int dirblksiz, int nameiop,
doff_t prevoff, doff_t enduseful, const struct inode *ip,
struct ufs_lookup_results *results)
{
if (slot->status == NONE) {
results->ulr_offset = roundup(ip->i_size, dirblksiz);
results->ulr_count = 0;
enduseful = results->ulr_offset;
} else if (nameiop == DELETE) {
results->ulr_offset = slot->offset;
calc_count(results, dirblksiz, prevoff);
} else {
results->ulr_offset = slot->offset;
results->ulr_count = slot->size;
if (enduseful < slot->offset + slot->size)
enduseful = slot->offset + slot->size;
}
results->ulr_endoff = roundup(enduseful, dirblksiz);
#if 0 /* commented out by dbj. none of the on disk fields changed */
ip->i_flag |= IN_CHANGE | IN_UPDATE;
#endif
return EJUSTRETURN;
}
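/*
 * Worked example (illustrative, not part of the original source):
 * suppose dirblksiz is 512 and the scan left slot.status == COMPACT
 * with slot.offset == 1024 and slot.size == 40.  For a CREATE,
 * slot_estimate() sets ulr_offset = 1024 and ulr_count = 40, telling
 * ufs_direnter() to compact the 40 bytes at offset 1024 and place the
 * new entry in the space that frees up; ulr_endoff is then rounded up
 * to the next 512-byte boundary past the last useful entry so the
 * directory can later be truncated to that point.
 */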
/*
* Check if we can delete inode tdp in directory vdp with inode ip and creds.
*/
static int
ufs_can_delete(struct vnode *tdp, struct vnode *vdp, struct inode *ip,
kauth_cred_t cred)
{
int error;
#ifdef UFS_ACL
/*
* NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
*
* 3.16.2.1. ACE4_DELETE vs. ACE4_DELETE_CHILD
*/
/*
* XXX: Is this check required?
*/
error = VOP_ACCESS(vdp, VEXEC, cred);
if (error)
goto out;
#if 0
/* Moved to ufs_remove, ufs_rmdir because they hold the lock */
error = VOP_ACCESSX(tdp, VDELETE, cred);
if (error == 0)
return (0);
#endif
error = VOP_ACCESSX(vdp, VDELETE_CHILD, cred);
if (error == 0)
return (0);
error = VOP_ACCESSX(vdp, VEXPLICIT_DENY | VDELETE_CHILD, cred);
if (error)
goto out;
#endif /* !UFS_ACL */
/*
* Write access to directory required to delete files.
*/
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
if (!(ip->i_mode & ISVTX))
return 0;
/*
* If directory is "sticky", then user must own
* the directory, or the file in it, else she
* may not delete it (unless she's root). This
* implements append-only directories.
*/
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, tdp, vdp,
genfs_can_sticky(vdp, cred, ip->i_uid, VTOI(tdp)->i_uid));
if (error) {
error = EPERM; // Why override?
goto out;
}
return 0;
out:
vrele(tdp);
return error;
}
static int
ufs_getino(struct vnode *vdp, struct inode *ip, ino_t foundino,
struct vnode **tdp, bool same)
{
if (ip->i_number == foundino) {
if (same)
return EISDIR;
vref(vdp);
*tdp = vdp;
return 0;
}
return vcache_get(vdp->v_mount, &foundino, sizeof(foundino), tdp);
}
/*
* Convert a component of a pathname into a pointer to a locked inode.
* This is a very central and rather complicated routine.
* If the file system is not maintained in a strict tree hierarchy,
* this can result in a deadlock situation (see comments in code below).
*
* The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
* on whether the name is to be looked up, created, renamed, or deleted.
* When CREATE, RENAME, or DELETE is specified, information usable in
* creating, renaming, or deleting a directory entry may be calculated.
* If flag has LOCKPARENT or'ed into it and the target of the pathname
* exists, lookup returns both the target and its parent directory locked.
* When creating or renaming and LOCKPARENT is specified, the target may
* not be ".". When deleting and LOCKPARENT is specified, the target may
* be "."., but the caller must check to ensure it does an vrele and vput
* instead of two vputs.
*
* Overall outline of ufs_lookup:
*
* check accessibility of directory
* look for name in cache, if found, then if at end of path
* and deleting or creating, drop it, else return name
* search for name in directory, to found or notfound
* notfound:
* if creating, return locked directory, leaving info on available slots
* else return error
* found:
* if at end of path and deleting, return information to allow delete
* if at end of path and rewriting (RENAME and LOCKPARENT), lock target
* inode and return info to allow rewrite
* if not at end, add name to cache; if at end and neither creating
* nor deleting, add name to cache
*/
int
ufs_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */
struct inode *dp = VTOI(vdp); /* inode for directory being searched */
struct buf *bp; /* a buffer of directory entries */
struct direct *ep; /* the current directory entry */
int entryoffsetinblock; /* offset of ep in bp's buffer */
struct slotinfo slot;
int numdirpasses; /* strategy for directory search */
doff_t endsearch; /* offset to end directory search */
doff_t prevoff; /* previous value of ulr_offset */
struct vnode *tdp; /* returned by vcache_get */
doff_t enduseful; /* pointer past last used dir slot.
used for directory truncation. */
u_long bmask; /* block offset mask */
int error;
struct vnode **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
int flags;
int nameiop = cnp->cn_nameiop;
struct ufsmount *ump = dp->i_ump;
const int needswap = UFS_MPNEEDSWAP(ump);
int dirblksiz = ump->um_dirblksiz;
ino_t foundino;
struct ufs_lookup_results *results;
int iswhiteout; /* temp result from cache_lookup() */
const int fsfmt = FSFMT(vdp);
uint16_t reclen;
flags = cnp->cn_flags;
bp = NULL;
*vpp = NULL;
endsearch = 0; /* silence compiler warning */
/*
* Check accessibility of directory.
*/
if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0)
return (error);
if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
(nameiop == DELETE || nameiop == RENAME))
return (EROFS);
/*
* We now have a segment name to search for, and a directory to search.
*
* Before tediously performing a linear scan of the directory,
* check the name cache to see if the directory/name pair
* we are looking for is known already.
*/
if (cache_lookup(vdp, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) {
if (iswhiteout) {
cnp->cn_flags |= ISWHITEOUT;
}
return *vpp == NULLVP ? ENOENT : 0;
}
/* May need to restart the lookup with an exclusive lock. */
if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) {
return ENOLCK;
}
/*
* Produce the auxiliary lookup results into i_crap. Increment
* its serial number so elsewhere we can tell if we're using
* stale results. This should not be done this way. XXX.
*/
results = &dp->i_crap;
dp->i_crapcounter++;
if (iswhiteout) {
/*
* The namecache set iswhiteout without finding a
* cache entry. As of this writing (20121014), this
* can happen if there was a whiteout entry that has
* been invalidated by the lookup. It is not clear if
* it is correct to set ISWHITEOUT in this case or
* not; however, doing so retains the prior behavior,
* so we'll go with that until some clearer answer
* appears. XXX
*/
cnp->cn_flags |= ISWHITEOUT;
}
/*
* Suppress search for slots unless creating
* file and at end of pathname, in which case
* we watch for a place to put the new file in
* case it doesn't already exist.
*/
slot_init(&slot);
if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) {
slot.status = NONE;
slot.needed = UFS_DIRECTSIZ(cnp->cn_namelen);
}
/*
* If there is cached information on a previous search of
* this directory, pick up where we last left off.
* We cache only lookups as these are the most common
* and have the greatest payoff. Caching CREATE has little
* benefit as it usually must search the entire directory
* to determine that the entry does not exist. Caching the
* location of the last DELETE or RENAME has not reduced
* profiling time and hence has been removed in the interest
* of simplicity.
*/
bmask = vdp->v_mount->mnt_stat.f_iosize - 1;
#ifdef UFS_DIRHASH
/*
* Use dirhash for fast operations on large directories. The logic
* to determine whether to hash the directory is contained within
* ufsdirhash_build(); a zero return means that it decided to hash
* this directory and it successfully built up the hash table.
*/
if (ufsdirhash_build(dp) == 0) {
/* Look for a free slot if needed. */
enduseful = slot_findfree(&slot, dp);
/* Look up the component. */
numdirpasses = 1;
entryoffsetinblock = 0; /* silence compiler warning */
switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
&results->ulr_offset, &bp,
nameiop == DELETE ? &prevoff : NULL)) {
case 0:
ep = (void *)((char *)bp->b_data +
(results->ulr_offset & bmask));
reclen = ufs_rw16(ep->d_reclen, needswap);
goto foundentry;
case ENOENT:
results->ulr_offset = roundup(dp->i_size, dirblksiz);
goto notfound;
default:
/* Something failed; just do a linear search. */
break;
}
}
#endif /* UFS_DIRHASH */
if (nameiop != LOOKUP || results->ulr_diroff == 0 ||
results->ulr_diroff >= dp->i_size) {
entryoffsetinblock = 0;
results->ulr_offset = 0;
numdirpasses = 1;
} else {
results->ulr_offset = results->ulr_diroff;
entryoffsetinblock = results->ulr_offset & bmask;
if (entryoffsetinblock != 0 &&
(error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
NULL, &bp, false)))
goto out;
numdirpasses = 2;
namecache_count_2passes();
}
prevoff = results->ulr_offset;
endsearch = roundup(dp->i_size, dirblksiz);
enduseful = 0;
searchloop:
while (results->ulr_offset < endsearch) {
preempt_point();
/*
* If necessary, get the next directory block.
*/
if ((results->ulr_offset & bmask) == 0) {
if (bp != NULL)
brelse(bp, 0);
error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
NULL, &bp, false);
if (error)
goto out;
entryoffsetinblock = 0;
}
/*
* If still looking for a slot, and at a DIRBLKSIZ
* boundary, have to start looking for free space again.
*/
if (slot.status == NONE &&
(entryoffsetinblock & (dirblksiz - 1)) == 0) {
slot.offset = -1;
slot.freespace = 0;
}
/*
* Get pointer to next entry.
* Full validation checks are slow, so we only check
* enough to ensure forward progress through the
* directory. Complete checks can be run by patching
* "dirchk" to be true.
*/
KASSERT(bp != NULL);
ep = (void *)((char *)bp->b_data + entryoffsetinblock);
const char *msg;
reclen = ufs_rw16(ep->d_reclen, needswap);
if ((reclen == 0 && (msg = "null entry")) || (dirchk &&
(msg = ufs_dirbadentry(vdp, ep, entryoffsetinblock)))) {
ufs_dirbad(dp, results->ulr_offset, msg);
reclen = dirblksiz -
(entryoffsetinblock & (dirblksiz - 1));
goto next;
}
/*
* If an appropriate sized slot has not yet been found,
* check to see if one is available. Also accumulate space
* in the current block so that we can determine if
* compaction is viable.
*/
if (slot.status != FOUND) {
int size = reclen;
if (ep->d_ino != 0)
size -= UFS_DIRSIZ(fsfmt, ep, needswap);
if (size > 0)
slot_update(&slot, size, reclen,
results->ulr_offset);
}
if (ep->d_ino == 0)
goto next;
/*
* Check for a name match.
*/
const uint16_t namlen = NAMLEN(fsfmt, needswap, ep);
if (namlen != cnp->cn_namelen ||
memcmp(cnp->cn_nameptr, ep->d_name, (size_t)namlen))
goto next;
#ifdef UFS_DIRHASH
foundentry:
#endif
/*
* Save directory entry's inode number and
* reclen, and release directory buffer.
*/
if (!fsfmt && ep->d_type == DT_WHT) {
slot_white(&slot, reclen, results);
/*
* This is used to set results->ulr_endoff, which may
* be used by ufs_direnter() as a length to truncate
* the directory to. Therefore, it must point past the
* end of the last non-empty directory entry. We don't
* know where that is in this case, so we effectively
* disable shrinking by using the existing size of the
* directory.
*
* Note that we wouldn't expect to shrink the
* directory while rewriting an existing entry anyway.
*/
enduseful = endsearch;
cnp->cn_flags |= ISWHITEOUT;
numdirpasses--;
goto notfound;
}
foundino = ufs_rw32(ep->d_ino, needswap);
results->ulr_reclen = reclen;
goto found;
next:
prevoff = results->ulr_offset;
results->ulr_offset += reclen;
entryoffsetinblock += reclen;
if (ep->d_ino)
enduseful = results->ulr_offset;
}
notfound:
/*
* If we started in the middle of the directory and failed
* to find our target, we must check the beginning as well.
*/
if (numdirpasses == 2) {
numdirpasses--;
results->ulr_offset = 0;
endsearch = results->ulr_diroff;
goto searchloop;
}
if (bp != NULL)
brelse(bp, 0);
/*
* If creating, and at end of pathname and current
* directory has not been removed, then can consider
* allowing file to be created.
*/
if ((nameiop == CREATE || nameiop == RENAME ||
(nameiop == DELETE && (cnp->cn_flags & DOWHITEOUT) &&
(cnp->cn_flags & ISWHITEOUT))) &&
(flags & ISLASTCN) && dp->i_nlink != 0) {
/*
* Access for write is interpreted as allowing
* creation of files in the directory.
*/
if (flags & WILLBEDIR)
error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred);
else
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
error = slot_estimate(&slot, dirblksiz, nameiop,
prevoff, enduseful, dp, results);
/*
* We return with the directory locked, so that
* the parameters we set up above will still be
* valid if we actually decide to do a direnter().
* We return ni_vp == NULL to indicate that the entry
* does not currently exist; we leave a pointer to
* the (locked) directory inode in ndp->ni_dvp.
*
* NB - if the directory is unlocked, then this
* information cannot be used.
*/
goto out;
}
/*
* Insert name into cache (as non-existent) if appropriate.
*/
if (nameiop != CREATE) {
cache_enter(vdp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_flags);
}
error = ENOENT;
goto out;
found:
if (numdirpasses == 2)
namecache_count_pass2();
/*
* Check that directory length properly reflects presence
* of this entry.
*/
const uint64_t newisize =
results->ulr_offset + UFS_DIRSIZ(fsfmt, ep, needswap);
if (newisize > dp->i_size) {
ufs_dirbad(dp, results->ulr_offset, "i_size too small");
dp->i_size = newisize;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
}
brelse(bp, 0);
/*
* Found component in pathname.
* If the final component of path name, save information
* in the cache as to where the entry was found.
*/
if ((flags & ISLASTCN) && nameiop == LOOKUP)
results->ulr_diroff = results->ulr_offset & ~(dirblksiz - 1);
/*
* If deleting, and at end of pathname, return
* parameters which can be used to remove file.
* Lock the inode, being careful with ".".
*/
if (nameiop == DELETE && (flags & ISLASTCN)) {
/*
* Return pointer to current entry in results->ulr_offset,
* and distance past previous entry (if there
* is a previous entry in this block) in results->ulr_count.
* Save directory inode pointer in ndp->ni_dvp for dirremove().
*/
calc_count(results, dirblksiz, prevoff);
if ((error = ufs_getino(vdp, dp, foundino, &tdp, false)) != 0)
goto out;
if ((error = ufs_can_delete(tdp, vdp, dp, cred)) != 0)
goto out;
*vpp = tdp;
goto out;
}
/*
* If rewriting (RENAME), return the inode and the
* information required to rewrite the present directory
* Must get inode of directory entry to verify it's a
* regular file, or empty directory.
*/
if (nameiop == RENAME && (flags & ISLASTCN)) {
if (flags & WILLBEDIR)
error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred);
else
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
/*
* Careful about locking second inode.
* This can only occur if the target is ".".
*/
if ((error = ufs_getino(vdp, dp, foundino, &tdp, true)) != 0)
goto out;
*vpp = tdp;
goto out;
}
if ((error = ufs_getino(vdp, dp, foundino, &tdp, false)) != 0)
goto out;
*vpp = tdp;
/*
* Insert name into cache if appropriate.
*/
cache_enter(vdp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
error = 0;
out:
return error;
}
void
ufs_dirbad(struct inode *ip, doff_t offset, const char *how)
{
struct mount *mp = ITOV(ip)->v_mount;
void (*p)(const char *, ...) __printflike(1, 2) =
(mp->mnt_flag & MNT_RDONLY) == 0 ? panic : printf;
(*p)("%s: bad dir ino %ju at offset %d: %s\n",
mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number,
offset, how);
}
/*
* Do consistency checking on a directory entry:
* record length must be multiple of 4
* entry must fit in rest of its DIRBLKSIZ block
* record must be large enough to contain entry
* name is not longer than FFS_MAXNAMLEN
* name must be as long as advertised, and null terminated
*/
const char *
ufs_dirbadentry(const struct vnode *dp, const struct direct *ep,
int entryoffsetinblock)
{
const struct ufsmount *ump = VFSTOUFS(dp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int dirblksiz = ump->um_dirblksiz;
const int maxsize = dirblksiz - (entryoffsetinblock & (dirblksiz - 1));
const int fsfmt = FSFMT(dp);
const uint8_t namlen = NAMLEN(fsfmt, needswap, ep);
const uint16_t reclen = ufs_rw16(ep->d_reclen, needswap);
const int dirsiz = (int)UFS_DIRSIZ(fsfmt, ep, needswap);
const char *name = ep->d_name;
const char *str;
#ifdef DIAGNOSTIC
static char buf[512];
#endif
if ((reclen & 0x3) != 0)
str = "not rounded";
else if (reclen > maxsize)
str = "too big";
else if (reclen < dirsiz)
str = "too small";
#if FFS_MAXNAMLEN < 255
else if (namlen > FFS_MAXNAMLEN)
str = "long name";
#endif
else
str = NULL;
if (str) {
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "Bad dir (%s), reclen=%#x, "
"namlen=%d, dirsiz=%d <= reclen=%d <= maxsize=%d, "
"flags=%#x, entryoffsetinblock=%d, dirblksiz=%d",
str, reclen, namlen, dirsiz, reclen, maxsize,
dp->v_mount->mnt_flag, entryoffsetinblock, dirblksiz);
str = buf;
#endif
return str;
}
if (ep->d_ino == 0)
return NULL;
for (uint8_t i = 0; i < namlen; i++)
if (name[i] == '\0') {
str = "NUL in name";
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "%s [%s] i=%d, namlen=%d",
str, name, i, namlen);
str = buf;
#endif
return str;
}
if (name[namlen]) {
str = "missing NUL in name";
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "%s [%*.*s] namlen=%d", str,
namlen, namlen, name, namlen);
str = buf;
#endif
return str;
}
return NULL;
}
/*
* Construct a new directory entry after a call to namei, using the
* name in the componentname argument cnp. The argument ip is the
* inode to which the new directory entry will refer.
*/
void
ufs_makedirentry(struct inode *ip, struct componentname *cnp,
struct direct *newdirp)
{
size_t namelen = cnp->cn_namelen;
newdirp->d_ino = ip->i_number;
newdirp->d_namlen = namelen;
memcpy(newdirp->d_name, cnp->cn_nameptr, namelen);
/* NUL terminate and zero out padding */
memset(&newdirp->d_name[namelen], 0, UFS_NAMEPAD(namelen));
if (FSFMT(ITOV(ip)))
newdirp->d_type = 0;
else
newdirp->d_type = IFTODT(ip->i_mode);
}
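/*
 * Illustrative sketch (not part of the original source): a typical
 * caller on the create path fills in a struct direct with
 * ufs_makedirentry() and hands it to ufs_direnter() together with the
 * lookup results saved by ufs_lookup(), roughly:
 *
 *	struct direct newdir;
 *	ufs_makedirentry(ip, cnp, &newdir);
 *	error = ufs_direnter(dvp, ulr, tvp, &newdir, cnp, NULL);
 *
 * where ip is the inode being linked, dvp the directory vnode and ulr
 * the ufs_lookup_results from the preceding lookup.
 */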
static int
ufs_dirgrow(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
const kauth_cred_t cr = cnp->cn_cred;
const struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int dirblksiz = ump->um_dirblksiz;
const int fsfmt = FSFMT(dvp);
const u_int newentrysize = UFS_DIRSIZ(0, dirp, 0);
struct inode *dp = VTOI(dvp);
int error, ret, blkoff;
struct timespec ts;
struct buf *bp;
/*
* If ulr_count is 0, then namei could find no
* space in the directory. Here, ulr_offset will
* be on a directory block boundary and we will write the
* new entry into a fresh block.
*/
if (ulr->ulr_offset & (dirblksiz - 1))
panic("%s: newblk", __func__); if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz,
cr, B_CLRBUF | B_SYNC, &bp)) != 0) {
return error;
}
dp->i_size = ulr->ulr_offset + dirblksiz;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
uvm_vnp_setsize(dvp, dp->i_size);
dirp->d_reclen = ufs_rw16(dirblksiz, needswap);
dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
if (fsfmt && ENDIANSWAP(needswap))
ufs_dirswap(dirp);
blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1);
memcpy((char *)bp->b_data + blkoff, dirp, newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
ufsdirhash_newblk(dp, ulr->ulr_offset);
ufsdirhash_add(dp, dirp, ulr->ulr_offset);
ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
ulr->ulr_offset);
}
#endif
error = VOP_BWRITE(bp->b_vp, bp);
vfs_timestamp(&ts);
ret = UFS_UPDATE(dvp, &ts, &ts, UPDATE_DIROP);
if (error == 0)
return ret;
return error;
}
static int
#if __GNUC_PREREQ__(5, 3)
/* This gets miscompiled by gcc 5.3 PR/51094 */
__attribute__((__optimize__("no-tree-vrp")))
#endif
ufs_dircompact(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
const struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int fsfmt = FSFMT(dvp);
const u_int newentrysize = UFS_DIRSIZ(0, dirp, 0);
struct inode *dp = VTOI(dvp);
struct buf *bp;
u_int dsize;
struct direct *ep, *nep;
int error, loc, spacefree;
char *dirbuf;
uint16_t reclen;
UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
/*
* If ulr_count is non-zero, then namei found space for the new
* entry in the range ulr_offset to ulr_offset + ulr_count
* in the directory. To use this space, we may have to compact
* the entries located there, by copying them together towards the
* beginning of the block, leaving the free space in one usable
* chunk at the end.
*/
/*
* Increase size of directory if entry eats into new space.
* This should never push the size past a new multiple of
* DIRBLKSIZ.
*
* N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
*/
if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) {
#ifdef DIAGNOSTIC
printf("%s: reached 4.2-only block, not supposed to happen\n",
__func__);
#endif
dp->i_size = ulr->ulr_offset + ulr->ulr_count;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
}
/*
* Get the block containing the space for the new directory entry.
*/
error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true);
if (error)
return error;
/*
* Find space for the new entry. In the simple case, the entry at
* offset base will have the space. If it does not, then namei
* arranged that compacting the region ulr_offset to
* ulr_offset + ulr_count would yield the space.
*/
ep = (void *)dirbuf;
dsize = (ep->d_ino != 0) ? UFS_DIRSIZ(fsfmt, ep, needswap) : 0;
reclen = ufs_rw16(ep->d_reclen, needswap);
spacefree = reclen - dsize;
for (loc = reclen; loc < ulr->ulr_count; ) {
nep = (void *)(dirbuf + loc);
/* Trim the existing slot (NB: dsize may be zero). */
ep->d_reclen = ufs_rw16(dsize, needswap);
ep = (void *)((char *)ep + dsize);
reclen = ufs_rw16(nep->d_reclen, needswap);
loc += reclen;
if (nep->d_ino == 0) {
/*
* A mid-block unused entry. Such entries are
* never created by the kernel, but fsck_ffs
* can create them (and it doesn't fix them).
*
* Add up the free space, and initialise the
* relocated entry since we don't memcpy it.
*/
spacefree += reclen;
ep->d_ino = 0;
dsize = 0;
continue;
}
dsize = UFS_DIRSIZ(fsfmt, nep, needswap);
spacefree += reclen - dsize;
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_move(dp, nep,
ulr->ulr_offset + ((char *)nep - dirbuf),
ulr->ulr_offset + ((char *)ep - dirbuf));
#endif
memcpy(ep, nep, dsize);
}
/*
* Here, `ep' points to a directory entry containing `dsize' in-use
* bytes followed by `spacefree' unused bytes. If ep->d_ino == 0,
* then the entry is completely unused (dsize == 0). The value
* of ep->d_reclen is always indeterminate.
*
* Update the pointer fields in the previous entry (if any),
* copy in the new entry, and write out the block.
*/
if (ep->d_ino == 0 || (ufs_rw32(ep->d_ino, needswap) == UFS_WINO &&
memcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
if (spacefree + dsize < newentrysize)
panic("%s: too big", __func__); dirp->d_reclen = spacefree + dsize;
} else {
if (spacefree < newentrysize)
panic("%s: nospace", __func__);
dirp->d_reclen = spacefree;
ep->d_reclen = ufs_rw16(dsize, needswap);
ep = (void *)((char *)ep + dsize);
}
dirp->d_reclen = ufs_rw16(dirp->d_reclen, needswap);
dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
if (fsfmt && ENDIANSWAP(needswap))
ufs_dirswap(dirp);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
dirp->d_reclen == spacefree))
ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf));
#endif
memcpy(ep, dirp, newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
const int dirblkmsk = ump->um_dirblksiz - 1;
ufsdirhash_checkblock(dp, dirbuf -
(ulr->ulr_offset & dirblkmsk),
ulr->ulr_offset & ~dirblkmsk);
}
#endif
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* If all went well, and the directory can be shortened, proceed
* with the truncation. Note that we have to unlock the inode for
* the entry that we just entered, as the truncation may need to
* lock other inodes which can lead to deadlock if we also hold a
* lock on the newly entered node.
*/
if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) {
const kauth_cred_t cr = cnp->cn_cred;
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_dirtrunc(dp, ulr->ulr_endoff);
#endif
(void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr);
}
UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
return error;
}
/*
* Write a directory entry after a call to namei, using the parameters
* that ufs_lookup left in nameidata and in the ufs_lookup_results.
*
* DVP is the directory to be updated. It must be locked.
* ULR is the ufs_lookup_results structure from the final lookup step.
* TVP is not used. (XXX: why is it here? remove it)
* DIRP is the new directory entry contents.
* CNP is the componentname from the final lookup step.
* NEWDIRBP is not used and (XXX) should be removed. The previous
* comment here said it was used by the now-removed softupdates code.
*
* The link count of the target inode is *not* incremented; the
* caller does that.
*
* If ulr->ulr_count is 0, ufs_lookup did not find space to insert the
* directory entry. ulr_offset, which is the place to put the entry,
* should be on a block boundary (and should be at the end of the
* directory AFAIK) and a fresh block is allocated to put the new
* directory entry in.
*
* If ulr->ulr_count is not zero, ufs_lookup found a slot to insert
* the entry into. This slot ranges from ulr_offset to ulr_offset +
* ulr_count. However, this slot may already be partially populated
* requiring compaction. See notes below.
*
* Furthermore, if ulr_count is not zero and ulr_endoff is not the
* same as i_size, the directory is truncated to size ulr_endoff.
*/
int
ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
if (ulr->ulr_count == 0)
return ufs_dirgrow(dvp, ulr, tvp, dirp, cnp, newdirbp);
else
return ufs_dircompact(dvp, ulr, tvp, dirp, cnp, newdirbp);
}
/*
* Remove a directory entry after a call to namei, using the
* parameters that ufs_lookup left in nameidata and in the
* ufs_lookup_results.
*
* DVP is the directory to be updated. It must be locked.
* ULR is the ufs_lookup_results structure from the final lookup step.
* IP, if not null, is the inode being unlinked.
* FLAGS may contain DOWHITEOUT.
* ISRMDIR is not used and (XXX) should be removed.
*
* If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout
* instead of being cleared.
*
* ulr->ulr_offset contains the position of the directory entry
* to be removed.
*
* ulr->ulr_reclen contains the size of the directory entry to be
* removed.
*
* ulr->ulr_count contains the size of the *previous* directory
* entry. This allows finding it, for free space management. If
* ulr_count is 0, the target entry is at the beginning of the
* directory. (Does this ever happen? The first entry should be ".",
* which should only be removed at rmdir time. Does rmdir come here
* to clear out the "." and ".." entries? Perhaps, but I doubt it.)
*
* The space is marked free by adding it to the record length (not
* name length) of the preceding entry. If the first entry becomes
* free, it is marked free by setting the inode number to 0.
*
* The link count of IP is decremented. Note that this is not the
* inverse behavior of ufs_direnter, which does not adjust link
* counts. Sigh.
*/
int
ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct inode *ip, int flags, int isrmdir)
{
struct inode *dp = VTOI(dvp);
struct direct *ep;
struct buf *bp;
int error;
const int needswap = UFS_MPNEEDSWAP(dp->i_ump);
uint16_t reclen;
UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
if (flags & DOWHITEOUT) {
/*
* Whiteout entry: set d_ino to UFS_WINO.
*/
error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &ep,
&bp, true);
if (error)
return (error);
ep->d_ino = ufs_rw32(UFS_WINO, needswap);
ep->d_type = DT_WHT;
goto out;
}
if ((error = ufs_blkatoff(dvp,
(off_t)(ulr->ulr_offset - ulr->ulr_count), &ep, &bp, true)) != 0)
return (error);
reclen = ufs_rw16(ep->d_reclen, needswap);
#ifdef UFS_DIRHASH
/*
* Remove the dirhash entry. This is complicated by the fact
* that `ep' is the previous entry when ulr_count != 0.
*/
if (dp->i_dirhash != NULL)
ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep :
(void *)((char *)ep + reclen), ulr->ulr_offset);
#endif
if (ulr->ulr_count == 0) {
/*
* First entry in block: set d_ino to zero.
*/
ep->d_ino = 0;
} else {
/*
* Collapse new free space into previous entry.
*/
ep->d_reclen = ufs_rw16(reclen + ulr->ulr_reclen, needswap);
}
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
int dirblksiz = ip->i_ump->um_dirblksiz;
ufsdirhash_checkblock(dp, (char *)ep -
((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)),
ulr->ulr_offset & ~(dirblksiz - 1));
}
#endif
out:
if (ip) {
ip->i_nlink--;
DIP_ASSIGN(ip, nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0);
}
/*
* XXX did it ever occur to anyone that it might be a good
* idea to restore ip->i_nlink if this fails? Or something?
* Currently on error return from this function the state of
* ip->i_nlink depends on what happened, and callers
* definitely do not take this into account.
*/
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* If the last named reference to a snapshot goes away,
* drop its snapshot reference so that it will be reclaimed
* when last open reference goes away.
*/
if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 &&
ip->i_nlink == 0)
UFS_SNAPGONE(ITOV(ip));
UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0);
return (error);
}
/*
* Rewrite an existing directory entry to point at the inode supplied.
*
* DP is the directory to update.
* OFFSET is the position of the entry in question. It may come
* from ulr_offset of a ufs_lookup_results.
* OIP is the old inode the directory previously pointed to.
* NEWINUM is the number of the new inode.
* NEWTYPE is the new value for the type field of the directory entry.
* (This is ignored if the fs doesn't support that.)
* ISRMDIR is not used and (XXX) should be removed.
* IFLAGS are added to DP's inode flags.
*
* The link count of OIP is decremented. Note that the link count of
* the new inode is *not* incremented. Yay for symmetry.
*/
int
ufs_dirrewrite(struct inode *dp, off_t offset,
struct inode *oip, ino_t newinum, int newtype,
int isrmdir, int iflags)
{
struct buf *bp;
struct direct *ep;
struct vnode *vdp = ITOV(dp);
int error;
error = ufs_blkatoff(vdp, offset, &ep, &bp, true);
if (error)
return (error);
ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump));
if (!FSFMT(vdp))
ep->d_type = newtype;
oip->i_nlink--;
DIP_ASSIGN(oip, nlink, oip->i_nlink);
oip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP);
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= iflags;
/*
* If the last named reference to a snapshot goes away,
* drop its snapshot reference so that it will be reclaimed
* when last open reference goes away.
*/
if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_nlink == 0)
UFS_SNAPGONE(ITOV(oip));
UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
return (error);
}
/*
* Check if a directory is empty or not.
* Inode supplied must be locked.
*
* Using a struct dirtemplate here is not precisely
* what we want, but better than using a struct direct.
*
* NB: does not handle corrupted directories.
*/
int
ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred)
{
doff_t off;
struct direct dbuf;
struct direct *dp = &dbuf;
int error;
size_t count;
const int needswap = UFS_IPNEEDSWAP(ip);
const int fsfmt = FSFMT(ITOV(ip));
#define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
for (off = 0; off < ip->i_size; off += ufs_rw16(dp->d_reclen, needswap)) {
error = ufs_bufio(UIO_READ, ITOV(ip), dp, MINDIRSIZ,
off, IO_NODELOCKED, cred, &count, NULL);
/*
* Since we read MINDIRSIZ, residual must
* be 0 unless we're at end of file.
*/
if (error || count != 0)
return (0);
/* avoid infinite loops */
if (dp->d_reclen == 0)
return (0);
/* skip empty entries */
ino_t ino = ufs_rw32(dp->d_ino, needswap);
if (ino == 0 || ino == UFS_WINO)
continue;
/* accept only "." and ".." */
const uint8_t namlen = NAMLEN(fsfmt, needswap, dp);
if (namlen > 2)
return (0);
if (dp->d_name[0] != '.')
return (0);
/*
* At this point namlen must be 1 or 2.
* 1 implies ".", 2 implies ".." if second
* char is also "."
*/
if (namlen == 1 && ino == ip->i_number)
continue;
if (dp->d_name[1] == '.' && ino == parentino)
continue;
return (0);
}
return (1);
}
#define UFS_DIRRABLKS 0
int ufs_dirrablks = UFS_DIRRABLKS;
/*
* ufs_blkatoff: Return buffer with the contents of block "offset" from
* the beginning of directory "vp". If "res" is non-NULL, fill it in with
* a pointer to the remaining space in the directory. If the caller intends
* to modify the buffer returned, "modify" must be true.
*/
int
ufs_blkatoff(struct vnode *vp, off_t offset, void *v, struct buf **bpp,
bool modify)
{
char **res = v;
struct inode *ip __diagused;
struct buf *bp;
daddr_t lbn;
const int dirrablks = ufs_dirrablks;
daddr_t *blks;
int *blksizes;
int run, error;
struct mount *mp = vp->v_mount;
const int bshift = mp->mnt_fs_bshift;
const int bsize = 1 << bshift;
off_t eof;
blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP);
blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP);
ip = VTOI(vp);
KASSERT(vp->v_size == ip->i_size);
GOP_SIZE(vp, vp->v_size, &eof, 0);
lbn = offset >> bshift;
for (run = 0; run <= dirrablks;) {
const off_t curoff = lbn << bshift;
const int size = MIN(eof - curoff, bsize);
if (size == 0) {
break;
}
KASSERT(curoff < eof);
blks[run] = lbn;
blksizes[run] = size;
lbn++;
run++;
if (size != bsize) {
break;
}
}
KASSERT(run >= 1);
error = breadn(vp, blks[0], blksizes[0], &blks[1], &blksizes[1],
run - 1, (modify ? B_MODIFY : 0), &bp);
if (error != 0) {
*bpp = NULL;
goto out;
}
if (res) {
*res = (char *)bp->b_data + (offset & (bsize - 1));
}
*bpp = bp;
out:
kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t));
kmem_free(blksizes, (1 + dirrablks) * sizeof(int));
return error;
}
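/*
 * Illustrative sketch (not part of the original source): callers in
 * this file use ufs_blkatoff() roughly as
 *
 *	struct direct *ep;
 *	struct buf *bp;
 *	error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &ep, &bp, true);
 *	if (error)
 *		return error;
 *	... modify *ep ...
 *	error = VOP_BWRITE(bp->b_vp, bp);
 *
 * i.e. the returned buffer is either written back with VOP_BWRITE()
 * after modification, or released with brelse() when only read.
 */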
/* $NetBSD: exec_elf64.c,v 1.8 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_elf64.c,v 1.8 2019/11/20 19:37:53 pgoyette Exp $");
#define ELFSIZE 64
#include "exec_elf.c"
#include <sys/module.h>
#define ELF64_AUXSIZE (ELF_AUX_ENTRIES * sizeof(Aux64Info) \
+ MAXPATHLEN + ALIGN(1))
MODULE(MODULE_CLASS_EXEC, exec_elf64, NULL);
static struct execsw exec_elf64_execsw[] = {
/* Native Elf64 */
{
.es_hdrsz = sizeof (Elf64_Ehdr),
.es_makecmds = exec_elf64_makecmds,
.u = {
.elf_probe_func = netbsd_elf64_probe,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_FIRST,
.es_arglen = ELF64_AUXSIZE,
.es_copyargs = elf64_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf64,
.es_setup_stack = exec_setup_stack,
},
#if EXEC_ELF_NOTELESS
/* Generic Elf64 -- run at NetBSD Elf64 */
{
.es_hdrsz = sizeof (Elf64_Ehdr),
.es_makecmds = exec_elf64_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = ELF64_AUXSIZE,
.es_copyargs = elf64_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf64,
.es_setup_stack = exec_setup_stack,
},
#endif
};
static int
exec_elf64_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(exec_elf64_execsw,
__arraycount(exec_elf64_execsw));
case MODULE_CMD_FINI:
return exec_remove(exec_elf64_execsw,
__arraycount(exec_elf64_execsw));
default:
return ENOTTY;
}
}
/* $NetBSD: syscallvar.h,v 1.12 2018/04/19 21:19:07 christos Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_SYSCALLVAR_H_
#define _SYS_SYSCALLVAR_H_
#ifndef _KERNEL
#error nothing of interest to userspace here
#endif
#if defined(_KERNEL) && defined(_KERNEL_OPT)
#include "opt_dtrace.h"
#endif
#include <sys/systm.h>
#include <sys/proc.h>
extern struct emul emul_netbsd;
struct syscall_package {
u_short sp_code;
u_short sp_flags;
sy_call_t *sp_call;
};
void syscall_init(void);
int syscall_establish(const struct emul *, const struct syscall_package *);
int syscall_disestablish(const struct emul *, const struct syscall_package *);
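/*
 * Illustrative sketch (not part of the original header): how a kernel
 * module might describe and register a system call through this
 * interface.  The syscall number, handler and array name below are
 * hypothetical; real handlers are generated from syscalls.master.
 */
#if 0
static int
example_sys_call(struct lwp *l, const void *uap, register_t *retval)
{
	*retval = 0;
	return 0;
}

static const struct syscall_package example_syscalls[] = {
	{ 500 /* hypothetical syscall number */, 0,
	  (sy_call_t *)example_sys_call },
	{ 0, 0, NULL },		/* terminator */
};

/* In the module's MODULE_CMD_INIT path: */
error = syscall_establish(&emul_netbsd, example_syscalls);
/* ...and on MODULE_CMD_FINI: */
error = syscall_disestablish(&emul_netbsd, example_syscalls);
#endif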
static __inline int
sy_call(const struct sysent *sy, struct lwp *l, const void *uap,
register_t *rval)
{
int error;
l->l_sysent = sy;
error = (*sy->sy_call)(l, uap, rval);
l->l_sysent = NULL;
return error;
}
static __inline int
sy_invoke(const struct sysent *sy, struct lwp *l, const void *uap,
register_t *rval, int code)
{
const bool do_trace = l->l_proc->p_trace_enabled &&
(sy->sy_flags & SYCALL_INDIRECT) == 0;
int error;
#ifdef KDTRACE_HOOKS
#define KDTRACE_ENTRY(a) (a)
#else
#define KDTRACE_ENTRY(a) (0)
#endif
if (__predict_true(!(do_trace || KDTRACE_ENTRY(sy->sy_entry))) ||
(error = trace_enter(code, sy, uap)) == 0) {
rval[0] = 0;
#if !defined(__mips__) && !defined(__m68k__)
/*
* Due to the mips userland code for SYS_break needing v1 to be
* preserved, we can't clear this on mips.
*/
rval[1] = 0;
#endif
error = sy_call(sy, l, uap, rval);
}
if (__predict_false(do_trace || KDTRACE_ENTRY(sy->sy_return))) {
trace_exit(code, sy, uap, rval, error);
}
return error;
}
/* inclusion in the kernel currently depends on SYSCALL_DEBUG */
extern const char * const syscallnames[];
extern const char * const altsyscallnames[];
#endif /* _SYS_SYSCALLVAR_H_ */
/* $NetBSD: kern_uipc_socket_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */
/*
* Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2004 The FreeBSD Foundation
* Copyright (c) 2004 Robert Watson
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
*/
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vn.c 1.13 94/04/02$
*
* @(#)vn.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_uipc_socket_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/compat_stub.h>
#include <sys/socketvar.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <compat/common/compat_mod.h>
static int
uipc_socket_50_getopt1(int opt, struct socket *so, struct sockopt *sopt)
{
int optval, error;
struct timeval50 otv;
switch (opt) {
case SO_OSNDTIMEO:
case SO_ORCVTIMEO:
optval = (opt == SO_OSNDTIMEO ?
so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
otv.tv_sec = optval / hz;
otv.tv_usec = (optval % hz) * tick;
error = sockopt_set(sopt, &otv, sizeof(otv));
break;
case SO_OTIMESTAMP:
error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
break;
default:
error = EPASSTHROUGH;
}
return error;
}
static int
uipc_socket_50_setopt1(int opt, struct socket *so, const struct sockopt *sopt)
{
int optval, error;
struct timeval50 otv;
struct timeval tv;
switch (opt) {
case SO_OSNDTIMEO:
case SO_ORCVTIMEO:
solock(so);
error = sockopt_get(sopt, &otv, sizeof(otv));
if (error)
break;
timeval50_to_timeval(&otv, &tv);
/* Code duplicated from sys/kern/uipc_socket.c */
if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
error = EDOM;
break;
}
if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
error = EDOM;
break;
}
optval = tv.tv_sec * hz + tv.tv_usec / tick;
if (optval == 0 && tv.tv_usec != 0)
optval = 1;
switch (opt) {
case SO_OSNDTIMEO:
so->so_snd.sb_timeo = optval;
break;
case SO_ORCVTIMEO:
so->so_rcv.sb_timeo = optval;
break;
}
break;
case SO_OTIMESTAMP:
error = sockopt_getint(sopt, &optval);
solock(so);
if (error)
break;
if (optval)
so->so_options |= opt;
else
so->so_options &= ~opt;
break;
default:
error = EPASSTHROUGH;
}
return error;
}
static int
uipc_socket_50_sbts(int opt, struct mbuf ***mp)
{
struct timeval50 tv50;
struct timeval tv;
microtime(&tv);
if (opt & SO_OTIMESTAMP) {
timeval_to_timeval50(&tv, &tv50);
**mp = sbcreatecontrol(&tv50, sizeof(tv50), SCM_OTIMESTAMP,
SOL_SOCKET);
if (**mp)
*mp = &(**mp)->m_next;
return 0;
} else
return EPASSTHROUGH;
}
void
kern_uipc_socket_50_init(void)
{
MODULE_HOOK_SET(uipc_socket_50_setopt1_hook, uipc_socket_50_setopt1);
MODULE_HOOK_SET(uipc_socket_50_getopt1_hook, uipc_socket_50_getopt1);
MODULE_HOOK_SET(uipc_socket_50_sbts_hook, uipc_socket_50_sbts);
}
void
kern_uipc_socket_50_fini(void)
{
MODULE_HOOK_UNSET(uipc_socket_50_setopt1_hook);
MODULE_HOOK_UNSET(uipc_socket_50_getopt1_hook);
MODULE_HOOK_UNSET(uipc_socket_50_sbts_hook);
}
/* $NetBSD: vfs_init.c,v 1.64 2023/09/23 18:21:11 ad Exp $ */
/*-
* Copyright (c) 1998, 2000, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed
* to Berkeley by John Heidemann of the UCLA Ficus project.
*
* Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_init.c 8.5 (Berkeley) 5/11/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_init.c,v 1.64 2023/09/23 18:21:11 ad Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/dirhash.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/ucred.h>
#include <sys/vnode.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/fifofs/fifo.h>
#include <miscfs/specfs/specdev.h>
/*
* Sigh, such primitive tools are these...
*/
#if 0
#define DODEBUG(A) A
#else
#define DODEBUG(A)
#endif
SDT_PROVIDER_DEFINE(vfs);
/*
* These vnodeopv_descs are listed here because they are not
* associated with any particular file system, and thus cannot
* be initialized by vfs_attach().
*/
const struct vnodeopv_desc * const vfs_special_vnodeopv_descs[] = {
&dead_vnodeop_opv_desc,
&fifo_vnodeop_opv_desc,
&spec_vnodeop_opv_desc,
NULL,
};
struct vfs_list_head vfs_list = /* vfs list */
LIST_HEAD_INITIALIZER(vfs_list);
static kauth_listener_t mount_listener;
/*
* This code doesn't work if the defn is **vnodeop_defns with cc.
* The problem is because of the compiler sometimes putting in an
* extra level of indirection for arrays. It's an interesting
* "feature" of C.
*/
typedef int (*PFI)(void *);
/*
* A miscellaneous routine.
* A generic "default" routine that just returns an error.
*/
/*ARGSUSED*/
int
vn_default_error(void *v)
{
return (EOPNOTSUPP);
}
static struct sysctllog *vfs_sysctllog;
/*
* Top level filesystem related information gathering.
*/
static void
sysctl_vfs_setup(void)
{
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "generic",
SYSCTL_DESCR("Non-specific vfs related information"),
NULL, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "fstypes",
SYSCTL_DESCR("List of file systems present"),
sysctl_vfs_generic_fstypes, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "magiclinks",
SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
NULL, 0, &vfs_magiclinks, 0,
CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "timestamp_precision",
SYSCTL_DESCR("File timestamp precision"),
NULL, 0, &vfs_timestamp_precision, 0,
CTL_VFS, VFS_GENERIC, VFS_TIMESTAMP_PRECISION,
CTL_EOL);
}
/*
* vfs_init.c
*
* Allocate and fill in operations vectors.
*
* An undocumented feature of this approach to defining operations is that
* there can be multiple entries in vfs_opv_descs for the same operations
* vector. This allows third parties to extend the set of operations
* supported by another layer in a binary compatible way. For example,
* assume that NFS needed to be modified to support Ficus. NFS has an entry
* (probably nfs_vnodeop_decls) declaring all the operations NFS supports by
* default. Ficus could add another entry (ficus_nfs_vnodeop_decl_extensions)
* listing those new operations Ficus adds to NFS, all without modifying the
* NFS code. (Of course, the OTW NFS protocol still needs to be munged, but
* that is a(whole)nother story.) This is a feature.
*/
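/*
 * Illustrative sketch (not from this file) of the structures this
 * machinery consumes, modeled on how existing file systems declare
 * their vnode operation tables; the "examplefs" names are hypothetical.
 */
#if 0
int (**examplefs_vnodeop_p)(void *);
int examplefs_lookup(void *);			/* hypothetical op */

const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },	/* required default */
	{ &vop_lookup_desc, examplefs_lookup },
	{ NULL, NULL }
};
const struct vnodeopv_desc examplefs_vnodeop_opv_desc =
	{ &examplefs_vnodeop_p, examplefs_vnodeop_entries };

/*
 * The desc would be listed in the file system's vfs_opv_descs[] array,
 * which vfs_attach() hands to vfs_opv_init() below.
 */
#endif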
/*
* Init the vector, if it needs it.
* Also handle backwards compatibility.
*/
static void
vfs_opv_init_explicit(const struct vnodeopv_desc *vfs_opv_desc)
{
int (**opv_desc_vector)(void *);
const struct vnodeopv_entry_desc *opve_descp;
opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p);
for (opve_descp = vfs_opv_desc->opv_desc_ops;
opve_descp->opve_op;
opve_descp++) {
/*
* Sanity check: is this operation listed
* in the list of operations? We check this
* by seeing if its offset is zero. Since
* the default routine should always be listed
* first, it should be the only one with a zero
* offset. Any other operation with a zero
* offset is probably not listed in
* vfs_op_descs, and so is probably an error.
*
* A panic here means the layer programmer
* has committed the all-too common bug
* of adding a new operation to the layer's
* list of vnode operations but
* not adding the operation to the system-wide
* list of supported operations.
*/
if (opve_descp->opve_op->vdesc_offset == 0 &&
opve_descp->opve_op->vdesc_offset != VOFFSET(vop_default)) {
printf("operation %s not listed in %s.\n",
opve_descp->opve_op->vdesc_name, "vfs_op_descs");
panic ("vfs_opv_init: bad operation");
}
/*
* Fill in this entry.
*/
opv_desc_vector[opve_descp->opve_op->vdesc_offset] =
opve_descp->opve_impl;
}
}
static void
vfs_opv_init_default(const struct vnodeopv_desc *vfs_opv_desc)
{
int j;
int (**opv_desc_vector)(void *);
opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p);
/*
* Force every operations vector to have a default routine.
*/
if (opv_desc_vector[VOFFSET(vop_default)] == NULL)
panic("vfs_opv_init: operation vector without default routine.");
for (j = 0; j < VNODE_OPS_COUNT; j++)
if (opv_desc_vector[j] == NULL)
opv_desc_vector[j] =
opv_desc_vector[VOFFSET(vop_default)];
}
void
vfs_opv_init(const struct vnodeopv_desc * const *vopvdpp)
{
int (**opv_desc_vector)(void *);
int i;
/*
* Allocate the vectors.
*/
for (i = 0; vopvdpp[i] != NULL; i++) {
opv_desc_vector =
kmem_alloc(VNODE_OPS_COUNT * sizeof(PFI), KM_SLEEP);
memset(opv_desc_vector, 0, VNODE_OPS_COUNT * sizeof(PFI));
*(vopvdpp[i]->opv_desc_vector_p) = opv_desc_vector;
DODEBUG(printf("vector at %p allocated\n",
opv_desc_vector));
}
/*
* ...and fill them in.
*/
for (i = 0; vopvdpp[i] != NULL; i++)
vfs_opv_init_explicit(vopvdpp[i]);
/*
* Finally, go back and replace unfilled routines
* with their default.
*/
for (i = 0; vopvdpp[i] != NULL; i++)
vfs_opv_init_default(vopvdpp[i]);
}
void
vfs_opv_free(const struct vnodeopv_desc * const *vopvdpp)
{
int i;
/*
* Free the vectors allocated in vfs_opv_init().
*/
for (i = 0; vopvdpp[i] != NULL; i++) {
kmem_free(*(vopvdpp[i]->opv_desc_vector_p),
VNODE_OPS_COUNT * sizeof(PFI));
*(vopvdpp[i]->opv_desc_vector_p) = NULL;
}
}
#ifdef DEBUG
static void
vfs_op_check(void)
{
int i;
DODEBUG(printf("Vnode_interface_init.\n"));
/*
* Check offset of each op.
*/
for (i = 0; vfs_op_descs[i]; i++) {
if (vfs_op_descs[i]->vdesc_offset != i)
panic("vfs_op_check: vfs_op_desc[] offset mismatch");
}
if (i != VNODE_OPS_COUNT) {
panic("vfs_op_check: vnode ops count mismatch (%d != %d)",
i, VNODE_OPS_COUNT);
}
DODEBUG(printf ("vfs_opv_numops=%d\n", VNODE_OPS_COUNT));
}
#endif /* DEBUG */
/*
* Common routine to check if an unprivileged mount is allowed.
*
* We export just this part (i.e., without the access control) so that if a
* secmodel wants to implement finer grained user mounts it can do so without
* copying too much code. More elaborate policies (i.e., specific users allowed
* to also create devices and/or introduce set-id binaries, or export
* file-systems) will require a different implementation.
*
* This routine is intended to be called from listener context, and as such
* does not take credentials as an argument.
*/
int
usermount_common_policy(struct mount *mp, u_long flags)
{
/* No exporting if unprivileged. */
if (flags & MNT_EXPORTED)
return EPERM;
/* Must have 'nosuid' and 'nodev'. */
if ((flags & MNT_NODEV) == 0 || (flags & MNT_NOSUID) == 0)
return EPERM;
/* Retain 'noexec'. */
if ((mp->mnt_flag & MNT_NOEXEC) && (flags & MNT_NOEXEC) == 0)
return EPERM;
return 0;
}
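/*
 * Illustrative sketch (not part of this file): a secmodel's
 * KAUTH_SCOPE_SYSTEM listener could defer to the helper above when
 * handling KAUTH_SYSTEM_MOUNT.  The argument layout shown here is an
 * assumption for illustration only.
 */
#if 0
/* inside a system-scope listener, action KAUTH_SYSTEM_MOUNT: */
if (req == KAUTH_REQ_SYSTEM_MOUNT_NEW) {
	struct mount *xmp = arg1;		/* assumed argument layout */
	u_long mntflags = (u_long)arg2;		/* assumed argument layout */

	if (usermount_common_policy(xmp, mntflags) == 0)
		result = KAUTH_RESULT_ALLOW;
}
#endif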
static int
mount_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_system_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
if (action != KAUTH_SYSTEM_MOUNT)
return result;
if (req == KAUTH_REQ_SYSTEM_MOUNT_GET)
result = KAUTH_RESULT_ALLOW;
else if (req == KAUTH_REQ_SYSTEM_MOUNT_DEVICE) {
vnode_t *devvp = arg2;
accmode_t accmode = (accmode_t)(unsigned long)arg3;
int error;
error = VOP_ACCESS(devvp, accmode, cred);
if (!error)
result = KAUTH_RESULT_ALLOW;
}
return result;
}
/*
* Initialize the vnode structures and initialize each file system type.
*/
void
vfsinit(void)
{
/*
* Attach sysctl nodes
*/
sysctl_vfs_setup();
/*
* Initialize the vnode table
*/
vntblinit();
/*
* Initialize the vnode name cache
*/
nchinit();
#ifdef DEBUG
/*
* Check the list of vnode operations.
*/
vfs_op_check();
#endif
/*
* Initialize the special vnode operations.
*/
vfs_opv_init(vfs_special_vnodeopv_descs);
/*
* Initialise generic dirhash.
*/
dirhash_init();
/*
* Initialise VFS hooks.
*/
vfs_hooks_init();
mount_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
mount_listener_cb, NULL);
/*
* Establish each file system which was statically
* included in the kernel.
*/
module_init_class(MODULE_CLASS_VFS);
/*
* Initialize EVFILT_FS for kqueue.
*/
vfs_evfilt_fs_init();
}
/*
* Drop a reference to a file system type.
*/
void
vfs_delref(struct vfsops *vfs)
{
mutex_enter(&vfs_list_lock);
vfs->vfs_refcount--;
mutex_exit(&vfs_list_lock);
}
/*
* Establish a file system and initialize it.
*/
int
vfs_attach(struct vfsops *vfs)
{
struct vfsops *v;
int error = 0;
mutex_enter(&vfs_list_lock);
/*
* Make sure this file system doesn't already exist.
*/
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
error = EEXIST;
goto out;
}
}
/*
* Initialize the vnode operations for this file system.
*/
vfs_opv_init(vfs->vfs_opv_descs);
/*
* Now initialize the file system itself.
*/
(*vfs->vfs_init)();
/*
* ...and link it into the kernel's list.
*/
LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);
/*
* Sanity: make sure the reference count is 0.
*/
vfs->vfs_refcount = 0;
out:
mutex_exit(&vfs_list_lock);
return (error);
}
/*
* Remove a file system from the kernel.
*/
int
vfs_detach(struct vfsops *vfs)
{
struct vfsops *v;
int error = 0;
mutex_enter(&vfs_list_lock);
/*
* Make sure no one is using the filesystem.
*/
if (vfs->vfs_refcount != 0) {
error = EBUSY;
goto out;
}
/*
* ...and remove it from the kernel's list.
*/
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (v == vfs) {
LIST_REMOVE(v, vfs_list);
break;
}
}
if (v == NULL) {
error = ESRCH;
goto out;
}
/*
* Now run the file system-specific cleanups.
*/
(*vfs->vfs_done)();
/*
* Free the vnode operations vector.
*/
vfs_opv_free(vfs->vfs_opv_descs);
out:
mutex_exit(&vfs_list_lock);
return (error);
}
void
vfs_reinit(void)
{
struct vfsops *vfs;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(vfs, &vfs_list, vfs_list) {
if (vfs->vfs_reinit) {
vfs->vfs_refcount++;
mutex_exit(&vfs_list_lock);
(*vfs->vfs_reinit)();
mutex_enter(&vfs_list_lock);
vfs->vfs_refcount--;
}
}
mutex_exit(&vfs_list_lock);
}
/* $NetBSD: sys_pipe.c,v 1.167 2024/02/10 09:21:54 andvar Exp $ */
/*-
* Copyright (c) 2003, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Paul Kranenburg, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 John S. Dyson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
* 4. Modifications may be freely made to this file if the above conditions
* are met.
*/
/*
* This file contains a high-performance replacement for the socket-based
* pipes scheme originally used. It does not support all features of
* sockets, but does do everything that pipes normally do.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.167 2024/02/10 09:21:54 andvar Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/select.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/pipe.h>
static int pipe_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int pipe_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int pipe_close(file_t *);
static int pipe_poll(file_t *, int);
static int pipe_kqfilter(file_t *, struct knote *);
static int pipe_stat(file_t *, struct stat *);
static int pipe_ioctl(file_t *, u_long, void *);
static void pipe_restart(file_t *);
static int pipe_fpathconf(file_t *, int, register_t *);
static int pipe_posix_fadvise(file_t *, off_t, off_t, int);
static const struct fileops pipeops = {
.fo_name = "pipe",
.fo_read = pipe_read,
.fo_write = pipe_write,
.fo_ioctl = pipe_ioctl,
.fo_fcntl = fnullop_fcntl,
.fo_poll = pipe_poll,
.fo_stat = pipe_stat,
.fo_close = pipe_close,
.fo_kqfilter = pipe_kqfilter,
.fo_restart = pipe_restart,
.fo_fpathconf = pipe_fpathconf,
.fo_posix_fadvise = pipe_posix_fadvise,
};
/*
* Default pipe buffer size(s), this can be kind-of large now because pipe
* space is pageable. The pipe code will try to maintain locality of
* reference for performance reasons, so small amounts of outstanding I/O
* will not wipe the cache.
*/
#define MINPIPESIZE (PIPE_SIZE / 3)
#define MAXPIPESIZE (2 * PIPE_SIZE / 3)
/*
* Limit the number of "big" pipes
*/
#define LIMITBIGPIPES 32
static u_int maxbigpipes __read_mostly = LIMITBIGPIPES;
static u_int nbigpipe = 0;
/*
* Amount of KVA consumed by pipe buffers.
*/
static u_int amountpipekva = 0;
static void pipeclose(struct pipe *);
static void pipe_free_kmem(struct pipe *);
static int pipe_create(struct pipe **, pool_cache_t, struct timespec *);
static int pipelock(struct pipe *, bool);
static inline void pipeunlock(struct pipe *);
static void pipeselwakeup(struct pipe *, struct pipe *, int);
static int pipespace(struct pipe *, int);
static int pipe_ctor(void *, void *, int);
static void pipe_dtor(void *, void *);
static pool_cache_t pipe_wr_cache;
static pool_cache_t pipe_rd_cache;
void
pipe_init(void)
{
/* Writer side is not automatically allocated KVA. */
pipe_wr_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "pipewr",
NULL, IPL_NONE, pipe_ctor, pipe_dtor, NULL);
KASSERT(pipe_wr_cache != NULL);
/* Reader side gets preallocated KVA. */
pipe_rd_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "piperd",
NULL, IPL_NONE, pipe_ctor, pipe_dtor, (void *)1);
KASSERT(pipe_rd_cache != NULL);
}
static int
pipe_ctor(void *arg, void *obj, int flags)
{
struct pipe *pipe;
vaddr_t va;
pipe = obj;
memset(pipe, 0, sizeof(struct pipe));
if (arg != NULL) {
/* Preallocate space. */
va = uvm_km_alloc(kernel_map, PIPE_SIZE, 0,
UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
KASSERT(va != 0);
pipe->pipe_kmem = va;
atomic_add_int(&amountpipekva, PIPE_SIZE);
}
cv_init(&pipe->pipe_rcv, "pipe_rd");
cv_init(&pipe->pipe_wcv, "pipe_wr");
cv_init(&pipe->pipe_draincv, "pipe_drn");
cv_init(&pipe->pipe_lkcv, "pipe_lk");
selinit(&pipe->pipe_sel);
pipe->pipe_state = PIPE_SIGNALR;
return 0;
}
static void
pipe_dtor(void *arg, void *obj)
{
struct pipe *pipe;
pipe = obj;
cv_destroy(&pipe->pipe_rcv);
cv_destroy(&pipe->pipe_wcv);
cv_destroy(&pipe->pipe_draincv);
cv_destroy(&pipe->pipe_lkcv);
seldestroy(&pipe->pipe_sel);
if (pipe->pipe_kmem != 0) {
uvm_km_free(kernel_map, pipe->pipe_kmem, PIPE_SIZE,
UVM_KMF_PAGEABLE);
atomic_add_int(&amountpipekva, -PIPE_SIZE);
}
}
/*
* The pipe system call for the DTYPE_PIPE type of pipes
*/
int
pipe1(struct lwp *l, int *fildes, int flags)
{
struct pipe *rpipe, *wpipe;
struct timespec nt;
file_t *rf, *wf;
int fd, error;
proc_t *p;
if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
return EINVAL;
p = curproc;
rpipe = wpipe = NULL;
getnanotime(&nt);
if ((error = pipe_create(&rpipe, pipe_rd_cache, &nt)) ||
(error = pipe_create(&wpipe, pipe_wr_cache, &nt))) {
goto free2;
}
rpipe->pipe_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
wpipe->pipe_lock = rpipe->pipe_lock;
mutex_obj_hold(wpipe->pipe_lock);
error = fd_allocfile(&rf, &fd);
if (error)
goto free2;
fildes[0] = fd;
error = fd_allocfile(&wf, &fd);
if (error)
goto free3;
fildes[1] = fd;
rf->f_flag = FREAD | flags;
rf->f_type = DTYPE_PIPE;
rf->f_pipe = rpipe;
rf->f_ops = &pipeops;
fd_set_exclose(l, fildes[0], (flags & O_CLOEXEC) != 0);
wf->f_flag = FWRITE | flags;
wf->f_type = DTYPE_PIPE;
wf->f_pipe = wpipe;
wf->f_ops = &pipeops;
fd_set_exclose(l, fildes[1], (flags & O_CLOEXEC) != 0);
rpipe->pipe_peer = wpipe;
wpipe->pipe_peer = rpipe;
fd_affix(p, rf, fildes[0]);
fd_affix(p, wf, fildes[1]);
return (0);
free3:
fd_abort(p, rf, fildes[0]);
free2:
pipeclose(wpipe);
pipeclose(rpipe);
return (error);
}
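/*
 * Illustrative userland counterpart (not part of the kernel source):
 * pipe2(2) is the interface whose flag bits end up in pipe1() above.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int fds[2];

/* O_CLOEXEC, O_NONBLOCK and O_NOSIGPIPE are the flags pipe1() accepts. */
if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
	err(1, "pipe2");
/* fds[0] is the read end, fds[1] the write end. */
#endif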
/*
* Allocate kva for the pipe circular buffer; the space is pageable.
* This routine will 'realloc' the size of a pipe safely: if it fails,
* it retains the old buffer and returns ENOMEM.
*/
static int
pipespace(struct pipe *pipe, int size)
{
void *buffer;
/*
* Allocate pageable virtual address space. Physical memory is
* allocated on demand.
*/
if (size == PIPE_SIZE && pipe->pipe_kmem != 0) {
buffer = (void *)pipe->pipe_kmem;
} else {
buffer = (void *)uvm_km_alloc(kernel_map, round_page(size),
0, UVM_KMF_PAGEABLE);
if (buffer == NULL)
return (ENOMEM);
atomic_add_int(&amountpipekva, size);
}
/* free old resources if we're resizing */
pipe_free_kmem(pipe);
pipe->pipe_buffer.buffer = buffer;
pipe->pipe_buffer.size = size;
pipe->pipe_buffer.in = 0;
pipe->pipe_buffer.out = 0;
pipe->pipe_buffer.cnt = 0;
return (0);
}
/*
* Initialize and allocate VM and memory for pipe.
*/
static int
pipe_create(struct pipe **pipep, pool_cache_t cache, struct timespec *nt)
{
struct pipe *pipe;
int error;
pipe = pool_cache_get(cache, PR_WAITOK);
KASSERT(pipe != NULL);
*pipep = pipe;
error = 0;
pipe->pipe_atime = pipe->pipe_mtime = pipe->pipe_btime = *nt;
pipe->pipe_lock = NULL;
if (cache == pipe_rd_cache) {
error = pipespace(pipe, PIPE_SIZE);
} else {
pipe->pipe_buffer.buffer = NULL;
pipe->pipe_buffer.size = 0;
pipe->pipe_buffer.in = 0;
pipe->pipe_buffer.out = 0;
pipe->pipe_buffer.cnt = 0;
}
return error;
}
/*
* Lock a pipe for I/O, blocking other access.
* Called with the pipe lock held.
*/
static int
pipelock(struct pipe *pipe, bool catch_p)
{
int error;
KASSERT(mutex_owned(pipe->pipe_lock));
while (pipe->pipe_state & PIPE_LOCKFL) {
if (catch_p) {
error = cv_wait_sig(&pipe->pipe_lkcv, pipe->pipe_lock);
if (error != 0) {
return error;
}
} else
cv_wait(&pipe->pipe_lkcv, pipe->pipe_lock);
}
pipe->pipe_state |= PIPE_LOCKFL;
return 0;
}
/*
* unlock a pipe I/O lock
*/
static inline void
pipeunlock(struct pipe *pipe)
{
KASSERT(pipe->pipe_state & PIPE_LOCKFL);
pipe->pipe_state &= ~PIPE_LOCKFL;
cv_signal(&pipe->pipe_lkcv);
}
/*
* Select/poll wakeup. This also sends SIGIO to peer connected to
* 'sigpipe' side of pipe.
*/
static void
pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code)
{
int band;
switch (code) {
case POLL_IN:
band = POLLIN|POLLRDNORM;
break;
case POLL_OUT:
band = POLLOUT|POLLWRNORM;
break;
case POLL_HUP:
band = POLLHUP;
break;
case POLL_ERR:
band = POLLERR;
break;
default:
band = 0;
#ifdef DIAGNOSTIC
printf("bad siginfo code %d in pipe notification.\n", code);
#endif
break;
}
selnotify(&selp->pipe_sel, band, NOTE_SUBMIT);
if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0)
return;
fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
}
static int
pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct pipe *rpipe = fp->f_pipe;
struct pipebuf *bp = &rpipe->pipe_buffer;
kmutex_t *lock = rpipe->pipe_lock;
int error;
size_t nread = 0;
size_t size;
size_t ocnt;
unsigned int wakeup_state = 0;
/*
* Try to avoid locking the pipe if we have nothing to do.
*
* There are programs which share one pipe amongst multiple processes
* and perform non-blocking reads in parallel, even if the pipe is
* empty. This in particular is the case with BSD make, which when
* spawned with a high -j number can find itself with over half of the
* calls failing to find anything.
*/
if ((fp->f_flag & FNONBLOCK) != 0) {
if (__predict_false(uio->uio_resid == 0))
return (0);
if (atomic_load_relaxed(&bp->cnt) == 0 &&
(atomic_load_relaxed(&rpipe->pipe_state) & PIPE_EOF) == 0)
return (EAGAIN);
}
mutex_enter(lock);
++rpipe->pipe_busy;
ocnt = bp->cnt;
again:
error = pipelock(rpipe, true);
if (error)
goto unlocked_error;
while (uio->uio_resid) {
/*
* Normal pipe buffer receive.
*/
if (bp->cnt > 0) {
size = bp->size - bp->out;
if (size > bp->cnt)
size = bp->cnt;
if (size > uio->uio_resid)
size = uio->uio_resid;
mutex_exit(lock);
error = uiomove((char *)bp->buffer + bp->out, size, uio);
mutex_enter(lock);
if (error)
break;
bp->out += size;
if (bp->out >= bp->size)
bp->out = 0;
bp->cnt -= size;
/*
* If there is no more to read in the pipe, reset
* its pointers to the beginning. This improves
* cache hit stats.
*/
if (bp->cnt == 0) {
bp->in = 0;
bp->out = 0;
}
nread += size;
continue;
}
/*
* Break if some data was read.
*/
if (nread > 0)
break;
/*
* Detect EOF condition.
* Read returns 0 on EOF, no need to set error.
*/
if (rpipe->pipe_state & PIPE_EOF)
break;
/*
* Don't block on non-blocking I/O.
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
break;
}
/*
* Unlock the pipe buffer for our remaining processing.
* We will either break out with an error or we will
* sleep and relock to loop.
*/
pipeunlock(rpipe);
#if 1 /* XXX (dsl) I'm sure these aren't needed here ... */
/*
* We want to read more, wake up select/poll.
*/
pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
/*
* If the "write-side" is blocked, wake it up now.
*/
cv_broadcast(&rpipe->pipe_wcv);
#endif
if (wakeup_state & PIPE_RESTART) {
error = ERESTART;
goto unlocked_error;
}
/* Now wait until the pipe is filled */
error = cv_wait_sig(&rpipe->pipe_rcv, lock);
if (error != 0)
goto unlocked_error;
wakeup_state = rpipe->pipe_state;
goto again;
}
if (error == 0)
getnanotime(&rpipe->pipe_atime);
pipeunlock(rpipe);
unlocked_error:
--rpipe->pipe_busy;
if (rpipe->pipe_busy == 0) {
rpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&rpipe->pipe_draincv);
}
if (bp->cnt < MINPIPESIZE) {
cv_broadcast(&rpipe->pipe_wcv);
}
/*
* If anything was read off the buffer, signal to the writer it's
* possible to write more data. Also send signal if we are here for the
* first time after last write.
*/
if ((bp->size - bp->cnt) >= PIPE_BUF
&& (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
rpipe->pipe_state &= ~PIPE_SIGNALR;
}
mutex_exit(lock);
return (error);
}
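/*
 * Illustrative userland sketch (not from this source): a non-blocking
 * read on an empty pipe takes the EAGAIN path above.  "fds" is the
 * hypothetical descriptor pair from the pipe2() sketch earlier.
 */
#if 0
#include <unistd.h>
#include <errno.h>

char buf[512];
ssize_t n = read(fds[0], buf, sizeof(buf));
if (n == -1 && errno == EAGAIN) {
	/* empty pipe on an O_NONBLOCK descriptor; retry later */
}
#endif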
static int
pipe_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct pipe *wpipe, *rpipe;
struct pipebuf *bp;
kmutex_t *lock;
int error;
unsigned int wakeup_state = 0;
/* We want to write to our peer */
rpipe = fp->f_pipe;
lock = rpipe->pipe_lock;
error = 0;
mutex_enter(lock);
wpipe = rpipe->pipe_peer;
/*
* Detect loss of pipe read side, issue SIGPIPE if lost.
*/
if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) != 0) {
mutex_exit(lock);
return EPIPE;
}
++wpipe->pipe_busy;
/* Acquire the long-term pipe lock */
if ((error = pipelock(wpipe, true)) != 0) {
--wpipe->pipe_busy;
if (wpipe->pipe_busy == 0) {
wpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&wpipe->pipe_draincv);
}
mutex_exit(lock);
return (error);
}
bp = &wpipe->pipe_buffer;
/*
* If it is advantageous to resize the pipe buffer, do so.
*/
if ((uio->uio_resid > PIPE_SIZE) && (nbigpipe < maxbigpipes) &&
(bp->size <= PIPE_SIZE) && (bp->cnt == 0)) {
if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
atomic_inc_uint(&nbigpipe);
}
while (uio->uio_resid) {
size_t space;
space = bp->size - bp->cnt;
/* Writes of size <= PIPE_BUF must be atomic. */
if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
space = 0;
if (space > 0) {
int size; /* Transfer size */
int segsize; /* first segment to transfer */
/*
* Transfer size is minimum of uio transfer
* and free space in pipe buffer.
*/
if (space > uio->uio_resid)
size = uio->uio_resid;
else
size = space;
/*
* First segment to transfer is minimum of
* transfer size and contiguous space in
* pipe buffer. If first segment to transfer
* is less than the transfer size, we've got
* a wraparound in the buffer.
*/
segsize = bp->size - bp->in;
if (segsize > size)
segsize = size;
/* Transfer first segment */
mutex_exit(lock);
error = uiomove((char *)bp->buffer + bp->in, segsize,
uio);
if (error == 0 && segsize < size) {
/*
* Transfer remaining part now, to
* support atomic writes. Wraparound
* happened.
*/
KASSERT(bp->in + segsize == bp->size);
error = uiomove(bp->buffer,
size - segsize, uio);
}
mutex_enter(lock);
if (error)
break;
bp->in += size;
if (bp->in >= bp->size) {
KASSERT(bp->in == size - segsize + bp->size);
bp->in = size - segsize;
}
bp->cnt += size;
KASSERT(bp->cnt <= bp->size);
wakeup_state = 0;
} else {
/*
* If the "read-side" has been blocked, wake it up now.
*/
cv_broadcast(&wpipe->pipe_rcv);
/*
* Don't block on non-blocking I/O.
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
break;
}
/*
* We have no more space and have something to offer,
* wake up select/poll.
*/
if (bp->cnt)
pipeselwakeup(wpipe, wpipe, POLL_IN);
if (wakeup_state & PIPE_RESTART) {
error = ERESTART;
break;
}
/*
* If read side wants to go away, we just issue a signal
* to ourselves.
*/
if (wpipe->pipe_state & PIPE_EOF) {
error = EPIPE;
break;
}
pipeunlock(wpipe);
error = cv_wait_sig(&wpipe->pipe_wcv, lock);
(void)pipelock(wpipe, false);
if (error != 0)
break;
wakeup_state = wpipe->pipe_state;
}
}
--wpipe->pipe_busy;
if (wpipe->pipe_busy == 0) {
wpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&wpipe->pipe_draincv);
}
if (bp->cnt > 0) {
cv_broadcast(&wpipe->pipe_rcv);
}
/*
* Don't return EPIPE if I/O was successful
*/
if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0)
error = 0;
if (error == 0)
getnanotime(&wpipe->pipe_mtime);
/*
* We have something to offer, wake up select/poll.
*/
if (bp->cnt)
pipeselwakeup(wpipe, wpipe, POLL_IN);
/*
* Arrange for next read(2) to do a signal.
*/
wpipe->pipe_state |= PIPE_SIGNALR;
pipeunlock(wpipe);
mutex_exit(lock);
return (error);
}
/*
* We implement a very minimal set of ioctls for compatibility with sockets.
*/
int
pipe_ioctl(file_t *fp, u_long cmd, void *data)
{
struct pipe *pipe = fp->f_pipe;
kmutex_t *lock = pipe->pipe_lock;
switch (cmd) {
case FIONBIO:
return (0);
case FIOASYNC:
mutex_enter(lock);
if (*(int *)data) {
pipe->pipe_state |= PIPE_ASYNC;
} else {
pipe->pipe_state &= ~PIPE_ASYNC;
}
mutex_exit(lock);
return (0);
case FIONREAD:
mutex_enter(lock);
*(int *)data = pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case FIONWRITE:
/* Look at other side */
mutex_enter(lock);
pipe = pipe->pipe_peer;
if (pipe == NULL)
*(int *)data = 0;
else
*(int *)data = pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case FIONSPACE:
/* Look at other side */
mutex_enter(lock);
pipe = pipe->pipe_peer;
if (pipe == NULL)
*(int *)data = 0;
else
*(int *)data = pipe->pipe_buffer.size -
pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case TIOCSPGRP:
case FIOSETOWN:
return fsetown(&pipe->pipe_pgid, cmd, data);
case TIOCGPGRP:
case FIOGETOWN:
return fgetown(pipe->pipe_pgid, cmd, data);
}
return (EPASSTHROUGH);
}
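/*
 * Illustrative userland use of one of these ioctls (sketch, not from
 * the source): query how many bytes are buffered on the read side.
 * "fds" is the hypothetical descriptor pair from the earlier sketch.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <stdio.h>

int nbytes;
if (ioctl(fds[0], FIONREAD, &nbytes) == 0)
	printf("%d bytes ready to read\n", nbytes);
#endif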
int
pipe_poll(file_t *fp, int events)
{
struct pipe *rpipe = fp->f_pipe;
struct pipe *wpipe;
int eof = 0;
int revents = 0;
mutex_enter(rpipe->pipe_lock);
wpipe = rpipe->pipe_peer;
if (events & (POLLIN | POLLRDNORM))
if ((rpipe->pipe_buffer.cnt > 0) ||
(rpipe->pipe_state & PIPE_EOF))
revents |= events & (POLLIN | POLLRDNORM);
eof |= (rpipe->pipe_state & PIPE_EOF);
if (wpipe == NULL)
revents |= events & (POLLOUT | POLLWRNORM);
else {
if (events & (POLLOUT | POLLWRNORM))
if ((wpipe->pipe_state & PIPE_EOF) || (
(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
revents |= events & (POLLOUT | POLLWRNORM);
eof |= (wpipe->pipe_state & PIPE_EOF);
}
if (wpipe == NULL || eof)
revents |= POLLHUP;
if (revents == 0) {
if (events & (POLLIN | POLLRDNORM))
selrecord(curlwp, &rpipe->pipe_sel);
if (events & (POLLOUT | POLLWRNORM))
selrecord(curlwp, &wpipe->pipe_sel);
}
mutex_exit(rpipe->pipe_lock);
return (revents);
}
static int
pipe_stat(file_t *fp, struct stat *ub)
{
struct pipe *pipe = fp->f_pipe;
mutex_enter(pipe->pipe_lock);
memset(ub, 0, sizeof(*ub));
ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
ub->st_blksize = pipe->pipe_buffer.size;
if (ub->st_blksize == 0 && pipe->pipe_peer)
ub->st_blksize = pipe->pipe_peer->pipe_buffer.size;
ub->st_size = pipe->pipe_buffer.cnt;
ub->st_blocks = (ub->st_size) ? 1 : 0;
ub->st_atimespec = pipe->pipe_atime;
ub->st_mtimespec = pipe->pipe_mtime;
ub->st_ctimespec = ub->st_birthtimespec = pipe->pipe_btime;
ub->st_uid = kauth_cred_geteuid(fp->f_cred);
ub->st_gid = kauth_cred_getegid(fp->f_cred);
/*
* Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
* XXX (st_dev, st_ino) should be unique.
*/
mutex_exit(pipe->pipe_lock);
return 0;
}
static int
pipe_close(file_t *fp)
{
struct pipe *pipe = fp->f_pipe;
fp->f_pipe = NULL;
pipeclose(pipe);
return (0);
}
static void
pipe_restart(file_t *fp)
{
struct pipe *pipe = fp->f_pipe;
/*
* Unblock blocked reads/writes in order to allow close() to complete.
* System calls return ERESTART so that the fd is revalidated.
* (Partial writes return the transfer length.)
*/
mutex_enter(pipe->pipe_lock);
pipe->pipe_state |= PIPE_RESTART;
/* Wakeup both cvs, maybe we only need one, but maybe there are some
* other paths where wakeup is needed, and it saves deciding which! */
cv_broadcast(&pipe->pipe_rcv);
cv_broadcast(&pipe->pipe_wcv);
mutex_exit(pipe->pipe_lock);
}
static int
pipe_fpathconf(struct file *fp, int name, register_t *retval)
{
switch (name) {
case _PC_PIPE_BUF:
*retval = PIPE_BUF;
return 0;
default:
return EINVAL;
}
}
static int
pipe_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
return ESPIPE;
}
static void
pipe_free_kmem(struct pipe *pipe)
{
if (pipe->pipe_buffer.buffer != NULL) {
if (pipe->pipe_buffer.size > PIPE_SIZE) {
atomic_dec_uint(&nbigpipe);
}
if (pipe->pipe_buffer.buffer != (void *)pipe->pipe_kmem) {
uvm_km_free(kernel_map,
(vaddr_t)pipe->pipe_buffer.buffer,
pipe->pipe_buffer.size, UVM_KMF_PAGEABLE);
atomic_add_int(&amountpipekva,
-pipe->pipe_buffer.size);
}
pipe->pipe_buffer.buffer = NULL;
}
}
/*
* Shutdown the pipe.
*/
static void
pipeclose(struct pipe *pipe)
{
kmutex_t *lock;
struct pipe *ppipe;
if (pipe == NULL)
return;
KASSERT(cv_is_valid(&pipe->pipe_rcv));
KASSERT(cv_is_valid(&pipe->pipe_wcv));
KASSERT(cv_is_valid(&pipe->pipe_draincv));
KASSERT(cv_is_valid(&pipe->pipe_lkcv));
lock = pipe->pipe_lock;
if (lock == NULL)
/* Must have failed during create */
goto free_resources;
mutex_enter(lock);
pipeselwakeup(pipe, pipe, POLL_HUP);
/*
* If the other side is blocked, wake it up saying that
* we want to close it down.
*/
pipe->pipe_state |= PIPE_EOF;
if (pipe->pipe_busy) {
while (pipe->pipe_busy) {
cv_broadcast(&pipe->pipe_wcv);
cv_wait_sig(&pipe->pipe_draincv, lock);
}
}
/*
* Disconnect from peer.
*/
if ((ppipe = pipe->pipe_peer) != NULL) {
pipeselwakeup(ppipe, ppipe, POLL_HUP);
ppipe->pipe_state |= PIPE_EOF;
cv_broadcast(&ppipe->pipe_rcv);
ppipe->pipe_peer = NULL;
}
/*
* Any knote objects still left in the list are
* the ones attached by the peer. Since no one will
* traverse this list, we just clear it.
*
* XXX Exposes select/kqueue internals.
*/
SLIST_INIT(&pipe->pipe_sel.sel_klist);
KASSERT((pipe->pipe_state & PIPE_LOCKFL) == 0);
mutex_exit(lock);
mutex_obj_free(lock);
/*
* Free resources.
*/
free_resources:
pipe->pipe_pgid = 0;
pipe->pipe_state = PIPE_SIGNALR;
pipe->pipe_peer = NULL;
pipe->pipe_lock = NULL;
pipe_free_kmem(pipe);
if (pipe->pipe_kmem != 0) {
pool_cache_put(pipe_rd_cache, pipe);
} else {
pool_cache_put(pipe_wr_cache, pipe);
}
}
static void
filt_pipedetach(struct knote *kn)
{
struct pipe *pipe;
kmutex_t *lock;
pipe = ((file_t *)kn->kn_obj)->f_pipe;
lock = pipe->pipe_lock;
mutex_enter(lock);
switch(kn->kn_filter) {
case EVFILT_WRITE:
/* Need the peer structure, not our own. */
pipe = pipe->pipe_peer;
/* If reader end already closed, just return. */
if (pipe == NULL) {
mutex_exit(lock);
return;
}
break;
default:
/* Nothing to do. */
break;
}
KASSERT(kn->kn_hook == pipe);
selremove_knote(&pipe->pipe_sel, kn);
mutex_exit(lock);
}
static int
filt_piperead(struct knote *kn, long hint)
{
struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe;
struct pipe *wpipe;
int rv;
if ((hint & NOTE_SUBMIT) == 0) {
mutex_enter(rpipe->pipe_lock);
}
wpipe = rpipe->pipe_peer;
kn->kn_data = rpipe->pipe_buffer.cnt;
if ((rpipe->pipe_state & PIPE_EOF) ||
(wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
knote_set_eof(kn, 0);
rv = 1;
} else {
rv = kn->kn_data > 0;
}
if ((hint & NOTE_SUBMIT) == 0) {
mutex_exit(rpipe->pipe_lock);
}
return rv;
}
static int
filt_pipewrite(struct knote *kn, long hint)
{
struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe;
struct pipe *wpipe;
int rv;
if ((hint & NOTE_SUBMIT) == 0) {
mutex_enter(rpipe->pipe_lock);
}
wpipe = rpipe->pipe_peer;
if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
kn->kn_data = 0;
knote_set_eof(kn, 0);
rv = 1;
} else {
kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
rv = kn->kn_data >= PIPE_BUF;
}
if ((hint & NOTE_SUBMIT) == 0) {
mutex_exit(rpipe->pipe_lock);
}
return rv;
}
static const struct filterops pipe_rfiltops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_pipedetach,
.f_event = filt_piperead,
};
static const struct filterops pipe_wfiltops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_pipedetach,
.f_event = filt_pipewrite,
};
static int
pipe_kqfilter(file_t *fp, struct knote *kn)
{
struct pipe *pipe;
kmutex_t *lock;
pipe = ((file_t *)kn->kn_obj)->f_pipe;
lock = pipe->pipe_lock;
mutex_enter(lock);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &pipe_rfiltops;
break;
case EVFILT_WRITE:
kn->kn_fop = &pipe_wfiltops;
pipe = pipe->pipe_peer;
if (pipe == NULL) {
/* Other end of pipe has been closed. */
mutex_exit(lock);
return (EBADF);
}
break;
default:
mutex_exit(lock);
return (EINVAL);
}
kn->kn_hook = pipe;
selrecord_knote(&pipe->pipe_sel, kn);
mutex_exit(lock);
return (0);
}
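/*
 * Illustrative userland sketch (not from this source): these filters
 * are reached through kqueue(2).  "rdfd" is a hypothetical read end of
 * a pipe.
 */
#if 0
#include <sys/event.h>
#include <err.h>

struct kevent kev, res;
int kq = kqueue();

if (kq == -1)
	err(1, "kqueue");
/* Arms filt_piperead on the pipe's read end. */
EV_SET(&kev, rdfd, EVFILT_READ, EV_ADD, 0, 0, 0);
if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
	err(1, "kevent");
/* Blocks until data arrives; res.data holds the byte count the filter set. */
(void)kevent(kq, NULL, 0, &res, 1, NULL);
#endif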
/*
* Handle pipe sysctls.
*/
SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "pipe",
SYSCTL_DESCR("Pipe settings"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PIPE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxbigpipes",
SYSCTL_DESCR("Maximum number of \"big\" pipes"),
NULL, 0, &maxbigpipes, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_MAXBIGPIPES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "nbigpipes",
SYSCTL_DESCR("Number of \"big\" pipes"),
NULL, 0, &nbigpipe, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_NBIGPIPES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "kvasize",
SYSCTL_DESCR("Amount of kernel memory consumed by pipe "
"buffers"),
NULL, 0, &amountpipekva, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_KVASIZE, CTL_EOL);
}
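/*
 * Illustrative userland sketch (ordinary sysctl(3) usage, not part of
 * this file): reading one of the nodes created above.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

int maxbig;
size_t len = sizeof(maxbig);

if (sysctlbyname("kern.pipe.maxbigpipes", &maxbig, &len, NULL, 0) == 0)
	printf("kern.pipe.maxbigpipes = %d\n", maxbig);
#endif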
/* $NetBSD: procfs_vfsops.c,v 1.114 2024/01/17 10:21:01 hannken Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95
*/
/*
* procfs VFS interface
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_vfsops.c,v 1.114 2024/01/17 10:21:01 hannken Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/procfs/procfs.h>
#include <uvm/uvm_extern.h> /* for PAGE_SIZE */
MODULE(MODULE_CLASS_VFS, procfs, "ptrace_common");
VFS_PROTOS(procfs);
#define PROCFS_HASHSIZE 256
#define PROCFS_EXEC_HOOK ((void *)1)
#define PROCFS_EXIT_HOOK ((void *)2)
static kauth_listener_t procfs_listener;
static void *procfs_exechook;
static void *procfs_exithook;
LIST_HEAD(hashhead, pfsnode);
static u_long procfs_hashmask;
static struct hashhead *procfs_hashtab;
static kmutex_t procfs_hashlock;
static struct hashhead *
procfs_hashhead(pid_t pid)
{
return &procfs_hashtab[pid & procfs_hashmask];
}
void
procfs_hashrem(struct pfsnode *pfs)
{
mutex_enter(&procfs_hashlock);
LIST_REMOVE(pfs, pfs_hash);
mutex_exit(&procfs_hashlock);
}
/*
* VFS Operations.
*
* mount system call
*/
/* ARGSUSED */
int
procfs_mount(
struct mount *mp,
const char *path,
void *data,
size_t *data_len)
{
struct lwp *l = curlwp;
struct procfsmount *pmnt;
struct procfs_args *args = data;
int error;
if (args == NULL)
return EINVAL;
if (UIO_MX & (UIO_MX-1)) {
log(LOG_ERR, "procfs: invalid directory entry size");
return (EINVAL);
}
if (mp->mnt_flag & MNT_GETARGS) {
if (*data_len < sizeof *args)
return EINVAL;
pmnt = VFSTOPROC(mp);
if (pmnt == NULL)
return EIO;
args->version = PROCFS_ARGSVERSION;
args->flags = pmnt->pmnt_flags;
*data_len = sizeof *args;
return 0;
}
if (mp->mnt_flag & MNT_UPDATE)
return (EOPNOTSUPP);
if (*data_len >= sizeof *args && args->version != PROCFS_ARGSVERSION)
return EINVAL;
pmnt = kmem_zalloc(sizeof(struct procfsmount), KM_SLEEP);
mp->mnt_stat.f_namemax = PROCFS_MAXNAMLEN;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_data = pmnt;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, "procfs", UIO_SYSSPACE,
mp->mnt_op->vfs_name, mp, l);
if (*data_len >= sizeof *args)
pmnt->pmnt_flags = args->flags;
else
pmnt->pmnt_flags = 0;
mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP;
return error;
}
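/*
 * Illustrative userland sketch (normally done by mount_procfs(8), not
 * part of this file) of the arguments procfs_mount() expects; treat the
 * headers and flag values shown as assumptions.
 */
#if 0
#include <sys/mount.h>
#include <miscfs/procfs/procfs.h>
#include <err.h>

struct procfs_args args = {
	.version = PROCFS_ARGSVERSION,
	.flags = 0,		/* e.g. a Linux-compat flag, if desired */
};

if (mount(MOUNT_PROCFS, "/proc", 0, &args, sizeof(args)) == -1)
	err(1, "mount procfs");
#endif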
/*
* unmount system call
*/
int
procfs_unmount(struct mount *mp, int mntflags)
{
int error;
int flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if ((error = vflush(mp, 0, flags)) != 0)
return (error);
kmem_free(mp->mnt_data, sizeof(struct procfsmount));
mp->mnt_data = NULL;
return 0;
}
int
procfs_root(struct mount *mp, int lktype, struct vnode **vpp)
{
int error;
error = procfs_allocvp(mp, vpp, 0, PFSroot, -1);
if (error == 0) {
error = vn_lock(*vpp, lktype);
if (error != 0) {
vrele(*vpp);
*vpp = NULL;
}
}
return error;
}
/* ARGSUSED */
int
procfs_start(struct mount *mp, int flags)
{
return (0);
}
/*
* Get file system statistics.
*/
int
procfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
genfs_statvfs(mp, sbp);
sbp->f_bsize = PAGE_SIZE;
sbp->f_frsize = PAGE_SIZE;
sbp->f_iosize = PAGE_SIZE;
sbp->f_blocks = 1;
sbp->f_files = maxproc; /* approx */
sbp->f_ffree = maxproc - atomic_load_relaxed(&nprocs); /* approx */
sbp->f_favail = maxproc - atomic_load_relaxed(&nprocs); /* approx */
return (0);
}
/*ARGSUSED*/
int
procfs_sync(
struct mount *mp,
int waitfor,
kauth_cred_t uc)
{
return (0);
}
/*ARGSUSED*/
int
procfs_vget(struct mount *mp, ino_t ino, int lktype,
struct vnode **vpp)
{
return (EOPNOTSUPP);
}
int
procfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
int error;
struct pfskey pfskey;
struct pfsnode *pfs;
KASSERT(key_len == sizeof(pfskey));
memcpy(&pfskey, key, key_len);
pfs = kmem_alloc(sizeof(*pfs), KM_SLEEP);
pfs->pfs_pid = pfskey.pk_pid;
pfs->pfs_type = pfskey.pk_type;
pfs->pfs_fd = pfskey.pk_fd;
pfs->pfs_vnode = vp;
pfs->pfs_mount = mp;
pfs->pfs_flags = 0;
pfs->pfs_fileno =
PROCFS_FILENO(pfs->pfs_pid, pfs->pfs_type, pfs->pfs_fd);
vp->v_tag = VT_PROCFS;
vp->v_op = procfs_vnodeop_p;
vp->v_data = pfs;
switch (pfs->pfs_type) {
case PFSroot: /* /proc = dr-xr-xr-x */
vp->v_vflag |= VV_ROOT;
/*FALLTHROUGH*/
case PFSproc: /* /proc/N = dr-xr-xr-x */
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VDIR;
break;
case PFStask: /* /proc/N/task = dr-xr-xr-x */
if (pfs->pfs_fd == -1) {
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|
S_IROTH|S_IXOTH;
vp->v_type = VDIR;
break;
}
/*FALLTHROUGH*/
case PFScurproc: /* /proc/curproc = lr-xr-xr-x */
case PFSself: /* /proc/self = lr-xr-xr-x */
case PFScwd: /* /proc/N/cwd = lr-xr-xr-x */
case PFSchroot: /* /proc/N/chroot = lr-xr-xr-x */
case PFSexe: /* /proc/N/exe = lr-xr-xr-x */
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VLNK;
break;
case PFSfd:
if (pfs->pfs_fd == -1) { /* /proc/N/fd = dr-x------ */
pfs->pfs_mode = S_IRUSR|S_IXUSR;
vp->v_type = VDIR;
} else { /* /proc/N/fd/M = [ps-]rw------- */
file_t *fp;
vnode_t *vxp;
struct proc *p;
mutex_enter(&proc_lock);
p = procfs_proc_find(mp, pfs->pfs_pid);
mutex_exit(&proc_lock);
if (p == NULL) {
error = ENOENT;
goto bad;
}
KASSERT(rw_read_held(&p->p_reflock));
if ((fp = fd_getfile2(p, pfs->pfs_fd)) == NULL) {
error = EBADF;
goto bad;
}
pfs->pfs_mode = S_IRUSR|S_IWUSR;
switch (fp->f_type) {
case DTYPE_VNODE:
vxp = fp->f_vnode;
/*
* We make symlinks for directories
* to avoid cycles.
*/
if (vxp->v_type == VDIR ||
procfs_proc_is_linux_compat())
goto symlink;
vp->v_type = vxp->v_type;
break;
case DTYPE_PIPE:
vp->v_type = VFIFO;
break;
case DTYPE_SOCKET:
vp->v_type = VSOCK;
break;
case DTYPE_KQUEUE:
case DTYPE_MISC:
case DTYPE_SEM:
symlink:
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|
S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VLNK;
break;
default:
error = EOPNOTSUPP;
closef(fp);
goto bad;
}
closef(fp);
}
break;
case PFSfile: /* /proc/N/file = -rw------- */
case PFSmem: /* /proc/N/mem = -rw------- */
case PFSregs: /* /proc/N/regs = -rw------- */
case PFSfpregs: /* /proc/N/fpregs = -rw------- */
pfs->pfs_mode = S_IRUSR|S_IWUSR;
vp->v_type = VREG;
break;
case PFSnote: /* /proc/N/note = --w------ */
case PFSnotepg: /* /proc/N/notepg = --w------ */
pfs->pfs_mode = S_IWUSR;
vp->v_type = VREG;
break;
case PFSmap: /* /proc/N/map = -r-------- */
case PFSmaps: /* /proc/N/maps = -r-------- */
case PFSauxv: /* /proc/N/auxv = -r-------- */
case PFSenviron: /* /proc/N/environ = -r-------- */
pfs->pfs_mode = S_IRUSR;
vp->v_type = VREG;
break;
case PFSstatus: /* /proc/N/status = -r--r--r-- */
case PFSstat: /* /proc/N/stat = -r--r--r-- */
case PFScmdline: /* /proc/N/cmdline = -r--r--r-- */
case PFSemul: /* /proc/N/emul = -r--r--r-- */
case PFSmeminfo: /* /proc/meminfo = -r--r--r-- */
case PFScpustat: /* /proc/stat = -r--r--r-- */
case PFSdevices: /* /proc/devices = -r--r--r-- */
case PFScpuinfo: /* /proc/cpuinfo = -r--r--r-- */
case PFSuptime: /* /proc/uptime = -r--r--r-- */
case PFSmounts: /* /proc/mounts = -r--r--r-- */
case PFSloadavg: /* /proc/loadavg = -r--r--r-- */
case PFSstatm: /* /proc/N/statm = -r--r--r-- */
case PFSversion: /* /proc/version = -r--r--r-- */
case PFSlimit: /* /proc/limit = -r--r--r-- */
pfs->pfs_mode = S_IRUSR|S_IRGRP|S_IROTH;
vp->v_type = VREG;
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
procfs_machdep_allocvp(vp);
break;
#endif
default:
panic("procfs_allocvp");
}
mutex_enter(&procfs_hashlock);
LIST_INSERT_HEAD(procfs_hashhead(pfs->pfs_pid), pfs, pfs_hash);
mutex_exit(&procfs_hashlock);
uvm_vnp_setsize(vp, 0);
*new_key = &pfs->pfs_key;
return 0;
bad:
vp->v_tag = VT_NON;
vp->v_type = VNON;
vp->v_op = NULL;
vp->v_data = NULL;
kmem_free(pfs, sizeof(*pfs));
return error;
}
void
procfs_init(void)
{
}
void
procfs_reinit(void)
{
}
void
procfs_done(void)
{
}
extern const struct vnodeopv_desc procfs_vnodeop_opv_desc;
const struct vnodeopv_desc * const procfs_vnodeopv_descs[] = {
&procfs_vnodeop_opv_desc,
NULL,
};
struct vfsops procfs_vfsops = {
.vfs_name = MOUNT_PROCFS,
.vfs_min_mount_data = sizeof (struct procfs_args),
.vfs_mount = procfs_mount,
.vfs_start = procfs_start,
.vfs_unmount = procfs_unmount,
.vfs_root = procfs_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = procfs_statvfs,
.vfs_sync = procfs_sync,
.vfs_vget = procfs_vget,
.vfs_loadvnode = procfs_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = procfs_init,
.vfs_reinit = procfs_reinit,
.vfs_done = procfs_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = procfs_vnodeopv_descs
};
static void
procfs_exechook_cb(struct proc *p, void *arg)
{
struct hashhead *head;
struct pfsnode *pfs;
struct mount *mp;
struct pfskey key;
struct vnode *vp;
int error;
if (arg == PROCFS_EXEC_HOOK && !(p->p_flag & PK_SUGID))
return;
head = procfs_hashhead(p->p_pid);
again:
mutex_enter(&procfs_hashlock);
LIST_FOREACH(pfs, head, pfs_hash) {
if (pfs->pfs_pid != p->p_pid)
continue;
mp = pfs->pfs_mount;
key = pfs->pfs_key;
vfs_ref(mp);
mutex_exit(&procfs_hashlock);
error = vcache_get(mp, &key, sizeof(key), &vp);
vfs_rele(mp);
if (error != 0)
goto again;
if (vrecycle(vp))
goto again;
do {
error = vfs_suspend(mp, 0);
} while (error == EINTR || error == ERESTART);
vgone(vp);
if (error == 0)
vfs_resume(mp);
goto again;
}
mutex_exit(&procfs_hashlock);
}
static int
procfs_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
struct pfsnode *pfs;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
pfs = arg1;
if (action != KAUTH_PROCESS_PROCFS)
return result;
switch (pfs->pfs_type) {
case PFSregs:
case PFSfpregs:
case PFSmem:
if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) ||
ISSET(p->p_flag, PK_SUGID))
break;
/*FALLTHROUGH*/
default:
result = KAUTH_RESULT_ALLOW;
break;
}
return result;
}
SYSCTL_SETUP(procfs_sysctl_setup, "procfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "procfs",
SYSCTL_DESCR("Process file system"),
NULL, 0, NULL, 0,
CTL_VFS, 12, CTL_EOL);
/*
* XXX the "12" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "12" is the order as taken from sys/mount.h
*/
}
static int
procfs_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&procfs_vfsops);
if (error != 0)
break;
procfs_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
procfs_listener_cb, NULL);
procfs_exechook = exechook_establish(procfs_exechook_cb,
PROCFS_EXEC_HOOK);
procfs_exithook = exithook_establish(procfs_exechook_cb,
PROCFS_EXIT_HOOK);
mutex_init(&procfs_hashlock, MUTEX_DEFAULT, IPL_NONE);
procfs_hashtab = hashinit(PROCFS_HASHSIZE, HASH_LIST, true,
&procfs_hashmask);
break;
case MODULE_CMD_FINI:
error = vfs_detach(&procfs_vfsops);
if (error != 0)
break;
kauth_unlisten_scope(procfs_listener);
exechook_disestablish(procfs_exechook);
exithook_disestablish(procfs_exithook);
mutex_destroy(&procfs_hashlock);
hashdone(procfs_hashtab, HASH_LIST, procfs_hashmask);
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: kern_uidinfo.c,v 1.13 2021/12/28 13:28:24 riastradh Exp $ */
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_uidinfo.c,v 1.13 2021/12/28 13:28:24 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/cpu.h>
static SLIST_HEAD(uihashhead, uidinfo) *uihashtbl;
static u_long uihash;
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
static int
sysctl_kern_uidinfo_cnt(SYSCTLFN_ARGS)
{
static const struct {
const char *name;
u_int value;
} nv[] = {
#define _MEM(n) { # n, offsetof(struct uidinfo, ui_ ## n) }
_MEM(proccnt),
_MEM(lwpcnt),
_MEM(lockcnt),
_MEM(semcnt),
_MEM(sbsize),
#undef _MEM
};
for (size_t i = 0; i < __arraycount(nv); i++)
if (strcmp(nv[i].name, rnode->sysctl_name) == 0) {
uint64_t cnt;
struct sysctlnode node = *rnode;
struct uidinfo *uip;
node.sysctl_data = &cnt;
uip = uid_find(kauth_cred_geteuid(l->l_cred));
*(uint64_t *)node.sysctl_data =
*(u_long *)((char *)uip + nv[i].value);
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
return EINVAL;
}
static struct sysctllog *kern_uidinfo_sysctllog;
static void
sysctl_kern_uidinfo_setup(void)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(&kern_uidinfo_sysctllog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "uidinfo",
SYSCTL_DESCR("Resource usage per uid"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "proccnt",
SYSCTL_DESCR("Number of processes for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "lwpcnt",
SYSCTL_DESCR("Number of lwps for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "lockcnt",
SYSCTL_DESCR("Number of locks for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "semcnt",
SYSCTL_DESCR("Number of semaphores used for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "sbsize",
SYSCTL_DESCR("Socket buffers used for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
}
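/*
 * Illustrative usage sketch (an assumption, not taken from this file):
 * the nodes created above are read with sysctl(8) and report the
 * invoking user's own usage, e.g.:
 *
 *	$ sysctl kern.uidinfo.proccnt
 *	kern.uidinfo.proccnt = 42
 *
 * The value shown is made up; the node names follow from the
 * sysctl_createv() calls above.
 */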
static int
uid_stats(struct hashstat_sysctl *hs, bool fill)
{
struct uidinfo *uip;
uint64_t chain;
strlcpy(hs->hash_name, "uihash", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "user info (uid->used proc) hash",
sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = uihash + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
SLIST_FOREACH(uip, &uihashtbl[i], ui_hash) {
membar_datadep_consumer();
chain++;
}
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
}
return 0;
}
void
uid_init(void)
{
/*
* On an MP system, SLIST_FOREACH would force a cache line
* write-back for every modified 'uidinfo', so we try to keep the
* lists short.
*/
const u_int uihash_sz = (maxcpus > 1 ? 1024 : 64);
uihashtbl = hashinit(uihash_sz, HASH_SLIST, true, &uihash);
/*
* Ensure that uid 0 is always in the user hash table, as
* sbreserve() expects it available from interrupt context.
*/
(void)uid_find(0);
sysctl_kern_uidinfo_setup();
hashstat_register("uihash", uid_stats);
}
struct uidinfo *
uid_find(uid_t uid)
{
struct uidinfo *uip, *uip_first, *newuip;
struct uihashhead *uipp;
uipp = UIHASH(uid);
newuip = NULL;
/*
* To make the insertion atomic, the SLIST abstraction is violated here.
*/
uip_first = uipp->slh_first;
again:
SLIST_FOREACH(uip, uipp, ui_hash) {
membar_datadep_consumer();
if (uip->ui_uid != uid)
continue;
if (newuip != NULL)
kmem_free(newuip, sizeof(*newuip));
return uip;
}
if (newuip == NULL)
newuip = kmem_zalloc(sizeof(*newuip), KM_SLEEP);
newuip->ui_uid = uid;
/*
* If the atomic insert is unsuccessful, another thread might have
* allocated this 'uid', so a full re-check is needed.
*/
newuip->ui_hash.sle_next = uip_first;
membar_producer();
uip = atomic_cas_ptr(&uipp->slh_first, uip_first, newuip);
if (uip != uip_first) {
uip_first = uip;
goto again;
}
return newuip;
}
/*
* Change the count associated with number of processes
* a given user is using.
*/
int
chgproccnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long proccnt;
uip = uid_find(uid);
proccnt = atomic_add_long_nv(&uip->ui_proccnt, diff);
KASSERTMSG(proccnt >= 0, "uid=%d diff=%d proccnt=%ld",
uid, diff, proccnt);
return proccnt;
}
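/*
 * Illustrative usage sketch (an assumption, not taken from this file):
 * fork-time accounting typically bumps the per-user process count and
 * backs the change out again if the caller's RLIMIT_NPROC limit would
 * be exceeded:
 *
 *	if (__predict_false(chgproccnt(uid, 1) > lim && uid != 0)) {
 *		(void)chgproccnt(uid, -1);
 *		return EAGAIN;
 *	}
 *
 * where 'lim' stands for the caller's RLIMIT_NPROC soft limit.
 */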
/*
* Change the count associated with number of lwps
* a given user is using.
*/
int
chglwpcnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long lwpcnt;
uip = uid_find(uid);
lwpcnt = atomic_add_long_nv(&uip->ui_lwpcnt, diff);
KASSERTMSG(lwpcnt >= 0, "uid=%d diff=%d lwpcnt=%ld",
uid, diff, lwpcnt);
return lwpcnt;
}
/*
* Change the count associated with number of semaphores
* a given user is using.
*/
int
chgsemcnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long semcnt;
uip = uid_find(uid);
semcnt = atomic_add_long_nv(&uip->ui_semcnt, diff);
KASSERTMSG(semcnt >= 0, "uid=%d diff=%d semcnt=%ld",
uid, diff, semcnt);
return semcnt;
}
int
chgsbsize(struct uidinfo *uip, u_long *hiwat, u_long to, rlim_t xmax)
{
rlim_t nsb;
const long diff = to - *hiwat;
nsb = (rlim_t)atomic_add_long_nv((long *)&uip->ui_sbsize, diff);
if (diff > 0 && nsb > xmax) {
atomic_add_long((long *)&uip->ui_sbsize, -diff);
return 0;
}
*hiwat = to;
return 1;
}
/* $NetBSD: userret.h,v 1.13 2018/07/26 09:29:08 maxv Exp $ */
/*
* XXXfvdl same as i386 counterpart, but should probably be independent.
*/
/*-
* Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/userret.h>
static __inline void userret(struct lwp *);
/*
* Define the code needed before returning to user mode, for
* trap and syscall.
*/
static __inline void
userret(struct lwp *l)
{
/* Invoke MI userret code */
mi_userret(l);
}
/* $NetBSD: tty_60.c,v 1.11 2021/07/21 06:35:44 skrll Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Alan Barrett
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_60.c,v 1.11 2021/07/21 06:35:44 skrll Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/compat_stub.h>
#include <sys/kmem.h>
#include <sys/tty.h>
#include <compat/common/compat_mod.h>
#include <compat/sys/ttycom.h>
/* convert struct ptmget to struct compat_60_ptmget */
static int
ptmget_to_ptmget60(struct ptmget *pg, struct compat_60_ptmget *pg60)
{
memset(pg60, 0, sizeof(*pg60));
pg60->cfd = pg->cfd;
pg60->sfd = pg->sfd;
strlcpy(pg60->cn, pg->cn, sizeof(pg60->cn));
strlcpy(pg60->sn, pg->sn, sizeof(pg60->sn));
if (strlen(pg->cn) >= sizeof(pg60->cn)
|| strlen(pg->sn) >= sizeof(pg60->sn))
return E2BIG;
return 0;
}
/* Helper for compat ioctls that use struct compat_60_ptmget. */
static int
compat_60_ptmget_ioctl(dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
int ret;
u_long newcmd;
struct ptmget *pg;
const struct cdevsw *cd = cdevsw_lookup(dev);
if (cd == NULL || cd->d_ioctl == NULL)
return ENXIO;
switch (cmd) {
case COMPAT_60_TIOCPTMGET: newcmd = TIOCPTMGET; break;
case COMPAT_60_TIOCPTSNAME: newcmd = TIOCPTSNAME; break;
default: return ENOTTY;
}
pg = kmem_alloc(sizeof(*pg), KM_SLEEP);
ret = (cd->d_ioctl)(dev, newcmd, pg, flag, l);
if (ret != 0)
goto out;
ret = ptmget_to_ptmget60(pg, data);
out:
kmem_free(pg, sizeof(*pg));
return ret;
}
/*
* COMPAT_60 versions of ttioctl and ptmioctl.
*/
int
compat_60_ttioctl(struct tty *tp, u_long cmd, void *data, int flag,
struct lwp *l)
{
switch (cmd) {
case COMPAT_60_TIOCPTMGET:
case COMPAT_60_TIOCPTSNAME:
return compat_60_ptmget_ioctl(tp->t_dev, cmd, data, flag, l);
default:
return EPASSTHROUGH;
}
}
int
compat_60_ptmioctl(dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
switch (cmd) {
case COMPAT_60_TIOCPTMGET:
return compat_60_ptmget_ioctl(dev, cmd, data, flag, l);
default:
return EPASSTHROUGH;
}
}
void
kern_tty_60_init(void)
{
MODULE_HOOK_SET(tty_ttioctl_60_hook, compat_60_ttioctl);
MODULE_HOOK_SET(tty_ptmioctl_60_hook, compat_60_ptmioctl);
}
void
kern_tty_60_fini(void)
{
MODULE_HOOK_UNSET(tty_ttioctl_60_hook);
MODULE_HOOK_UNSET(tty_ptmioctl_60_hook);
}
/* $NetBSD: lwp.h,v 1.231 2023/11/02 10:31:55 martin Exp $ */
/*
* Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Nathan J. Williams and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_LWP_H_
#define _SYS_LWP_H_
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <sys/param.h>
#include <sys/callout.h>
#include <sys/condvar.h>
#include <sys/kcpuset.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/specificdata.h>
#include <sys/time.h>
#include <sys/wchan.h>
#if defined(_KERNEL)
struct lwp;
/* forward declare this for <machine/cpu.h> so it can get l_cpu. */
static __inline struct cpu_info *lwp_getcpu(struct lwp *);
#include <machine/cpu.h> /* curcpu() and cpu_info */
#include <sys/atomic.h>
#ifdef _KERNEL_OPT
#include "opt_kcov.h"
#include "opt_kmsan.h"
#include "opt_maxlwp.h"
#endif
#endif
#include <machine/proc.h> /* Machine-dependent proc substruct. */
/*
* Lightweight process. Field markings and the corresponding locks:
*
* a: proc_lock
* c: condition variable interlock, passed to cv_wait()
* l: *l_mutex
* p: l_proc->p_lock
* s: spc_mutex, which may or may not be referenced by l_mutex
* S: l_selcluster->sc_lock
* (: unlocked, stable
* !: unlocked, may only be reliably accessed by the LWP itself
*
* Fields are clustered together by usage (to increase the likelihood
* of cache hits) and by size (to reduce dead space in the structure).
*/
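/*
 * Illustrative sketch (an assumption, not taken from this header): a
 * field marked 'l:' above may only be touched with the LWP locked,
 * e.g.:
 *
 *	lwp_lock(l);
 *	pri = l->l_priority;		(an 'l:' field, guarded by *l_mutex)
 *	lwp_unlock(l);
 */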
#include <sys/pcu.h>
struct lockdebug;
struct sysent;
struct lwp {
/* Must not be zeroed on free. */
struct cpu_info *volatile l_cpu;/* s: CPU we're on if LSONPROC */
kmutex_t * volatile l_mutex; /* l: ptr to mutex on sched state */
struct turnstile *l_ts; /* l: current turnstile */
int l_stat; /* l: overall LWP status */
int l__reserved; /* : padding - reuse as needed */
/* Scheduling and overall state. */
#define l_startzero l_runq
TAILQ_ENTRY(lwp) l_runq; /* s: run queue */
union {
void * info; /* s: scheduler-specific structure */
u_int timeslice; /* l: time-quantum for SCHED_M2 */
} l_sched;
void *l_addr; /* l: PCB address; use lwp_getpcb() */
struct mdlwp l_md; /* l: machine-dependent fields. */
struct bintime l_rtime; /* l: real time */
struct bintime l_stime; /* l: start time (while ONPROC) */
int l_flag; /* l: misc flag values */
u_int l_swtime; /* l: time swapped in or out */
u_int l_rticks; /* l: Saved start time of run */
u_int l_rticksum; /* l: Sum of ticks spent running */
u_int l_slpticks; /* l: Saved start time of sleep */
u_int l_slpticksum; /* l: Sum of ticks spent sleeping */
int l_class; /* l: scheduling class */
pri_t l_boostpri; /* l: boosted priority after blocking */
pri_t l_priority; /* l: scheduler priority */
pri_t l_inheritedprio;/* l: inherited priority */
pri_t l_protectprio; /* l: for PTHREAD_PRIO_PROTECT */
pri_t l_auxprio; /* l: max(inherit,protect) priority */
int l_protectdepth; /* l: for PTHREAD_PRIO_PROTECT */
u_int l_cpticks; /* (: Ticks of CPU time */
psetid_t l_psid; /* l: assigned processor-set ID */
fixpt_t l_pctcpu; /* p: %cpu during l_swtime */
fixpt_t l_estcpu; /* l: cpu time for SCHED_4BSD */
SLIST_HEAD(, turnstile) l_pi_lenders; /* l: ts lending us priority */
struct cpu_info *l_target_cpu; /* l: target CPU to migrate */
struct lwpctl *l_lwpctl; /* p: lwpctl block kernel address */
struct lcpage *l_lcpage; /* p: lwpctl containing page */
kcpuset_t *l_affinity; /* l: CPU set for affinity */
/* Synchronisation. */
const struct syncobj *l_syncobj;/* l: sync object operations set */
LIST_ENTRY(lwp) l_sleepchain; /* l: sleep queue */
wchan_t l_wchan; /* l: sleep address */
const char *l_wmesg; /* l: reason for sleep */
struct sleepq *l_sleepq; /* l: current sleep queue */
callout_t l_timeout_ch; /* !: callout for tsleep */
kcondvar_t l_waitcv; /* a: vfork() wait */
u_int l_slptime; /* l: time since last blocked */
bool l_vforkwaiting; /* a: vfork() waiting */
/* User-space synchronization. */
uintptr_t l_robust_head; /* !: list of robust futexes */
uint32_t l___rsvd1; /* reserved for future use */
#if PCU_UNIT_COUNT > 0
struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT];
uint32_t l_pcu_valid;
#endif
/* Process level and global state, misc. */
lwpid_t l_lid; /* (: LWP identifier; local to proc */
LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */
void *l_ctxlink; /* p: uc_link {get,set}context */
struct proc *l_proc; /* p: parent process */
LIST_ENTRY(lwp) l_sibling; /* p: entry on proc's list of LWPs */
char *l_name; /* (: name, optional */
lwpid_t l_waiter; /* p: first LWP waiting on us */
lwpid_t l_waitingfor; /* p: specific LWP we are waiting on */
int l_prflag; /* p: process level flags */
u_int l_refcnt; /* p: reference count on this LWP */
/* State of select() or poll(). */
int l_selflag; /* S: polling state flags */
int l_selret; /* S: return value of select/poll */
SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */
uintptr_t l_selrec; /* !: argument for selrecord() */
struct selcluster *l_selcluster;/* !: associated cluster data */
void * l_selbits; /* (: select() bit-field */
size_t l_selni; /* (: size of a single bit-field */
/* Signals. */
int l_sigrestore; /* p: need to restore old sig mask */
sigset_t l_sigwaitset; /* p: signals being waited for */
kcondvar_t l_sigcv; /* p: for sigsuspend() */
struct ksiginfo *l_sigwaited; /* p: delivered signals from set */
sigpend_t *l_sigpendset; /* p: XXX issignal()/postsig() baton */
LIST_ENTRY(lwp) l_sigwaiter; /* p: chain on list of waiting LWPs */
stack_t l_sigstk; /* p: sp & on stack state variable */
sigset_t l_sigmask; /* p: signal mask */
sigpend_t l_sigpend; /* p: signals to this LWP */
sigset_t l_sigoldmask; /* p: mask for sigpause */
/* Private data. */
specificdata_reference
l_specdataref; /* !: subsystem lwp-specific data */
struct timespec l_ktrcsw; /* !: for ktrace CSW trace XXX */
void *l_private; /* !: svr4-style lwp-private data */
struct lwp *l_switchto; /* !: mi_switch: switch to this LWP */
struct kauth_cred *l_cred; /* !: cached credentials */
struct filedesc *l_fd; /* !: cached copy of proc::p_fd */
void *l_emuldata; /* !: kernel lwp-private data */
struct fstrans_lwp_info *l_fstrans; /* (: fstrans private data */
u_short l_shlocks; /* !: lockdebug: shared locks held */
u_short l_exlocks; /* !: lockdebug: excl. locks held */
u_short l_psrefs; /* !: count of psref held */
u_short l_blcnt; /* !: count of kernel_lock held */
volatile int l_nopreempt; /* !: don't preempt me! */
volatile u_int l_dopreempt; /* s: kernel preemption pending */
int l_pflag; /* !: LWP private flags */
int l_dupfd; /* !: side return from cloning devs XXX */
const struct sysent * volatile l_sysent;/* !: currently active syscall */
struct rusage l_ru; /* !: accounting information */
uint64_t l_pfailtime; /* !: for kernel preemption */
uintptr_t l_pfailaddr; /* !: for kernel preemption */
uintptr_t l_pfaillock; /* !: for kernel preemption */
_TAILQ_HEAD(,struct lockdebug,volatile) l_ld_locks;/* !: locks held by LWP */
volatile void *l_ld_wanted; /* !: lock currently wanted by LWP */
uintptr_t l_rwcallsite; /* !: rwlock actual callsite */
int l_tcgen; /* !: for timecounter removal */
/* These are only used by 'options SYSCALL_TIMES'. */
uint32_t l_syscall_time; /* !: time epoch for current syscall */
uint64_t *l_syscall_counter; /* !: counter for current process */
struct kdtrace_thread *l_dtrace; /* (: DTrace-specific data. */
#ifdef KMSAN
void *l_kmsan; /* !: KMSAN private data. */
#endif
#ifdef KCOV
void *l_kcov; /* !: KCOV private data. */
#endif
};
/*
* UAREA_PCB_OFFSET: the offset of the PCB structure in the uarea. MD code may
* define it in <machine/proc.h>, to indicate a different uarea layout.
*/
#ifndef UAREA_PCB_OFFSET
#define UAREA_PCB_OFFSET 0
#endif
LIST_HEAD(lwplist, lwp); /* A list of LWPs. */
#ifdef _KERNEL
extern struct lwplist alllwp; /* List of all LWPs. */
extern lwp_t lwp0; /* LWP for proc0. */
extern int maxlwp __read_mostly; /* max number of lwps */
#ifndef MAXLWP
#define MAXLWP 4096 /* default max */
#endif
#ifndef MAXMAXLWP
#define MAXMAXLWP 65535 /* absolute max */
#endif
#endif
#endif /* _KERNEL || _KMEMUSER */
/*
* These flags are kept in l_flag, and they are modified only with the LWP
* locked.
*/
#define LW_IDLE 0x00000001 /* Idle lwp. */
#define LW_LWPCTL 0x00000002 /* Adjust lwpctl in userret */
#define LW_STIMO 0x00000040 /* Sleep timed out */
#define LW_SINTR 0x00000080 /* Sleep is interruptible. */
#define LW_CATCHINTR 0x00000100 /* LW_SINTR intent; see sleepq_block(). */
#define LW_SYSTEM 0x00000200 /* Kernel thread */
#define LW_SYSTEM_FPU 0x00000400 /* Kernel thread with vector/FP enabled */
#define LW_DBGSUSPEND 0x00010000 /* Suspend by debugger */
#define LW_WSUSPEND 0x00020000 /* Suspend before return to user */
#define LW_BATCH 0x00040000 /* LWP tends to hog CPU */
#define LW_WCORE 0x00080000 /* Stop for core dump on return to user */
#define LW_WEXIT 0x00100000 /* Exit before return to user */
#define LW_PENDSIG 0x01000000 /* Pending signal for us */
#define LW_CANCELLED 0x02000000 /* tsleep should not sleep */
#define LW_CACHECRED 0x04000000 /* Cache new process credential */
#define LW_WREBOOT 0x08000000 /* System is rebooting, please suspend */
#define LW_UNPARKED 0x10000000 /* Unpark op pending */
#define LW_RUMP_CLEAR 0x40000000 /* Clear curlwp in RUMP scheduler */
#define LW_RUMP_QEXIT 0x80000000 /* LWP should exit ASAP */
/*
* The second set of flags is kept in l_pflag, and they are modified only by
* the LWP itself, or modified when it's known the LWP cannot be running.
* LP_RUNNING is typically updated with the LWP locked, but not always in
* the case of soft interrupt handlers.
*/
#define LP_KTRACTIVE 0x00000001 /* Executing ktrace operation */
#define LP_KTRCSW 0x00000002 /* ktrace context switch marker */
#define LP_KTRCSWUSER 0x00000004 /* ktrace context switch marker */
/* 0x00000008 was LP_PIDLID */
#define LP_OWEUPC 0x00000010 /* Owe user profiling tick */
#define LP_MPSAFE 0x00000020 /* Starts life without kernel_lock */
#define LP_INTR 0x00000040 /* Soft interrupt handler */
#define LP_SYSCTLWRITE 0x00000080 /* sysctl write lock held */
#define LP_MUSTJOIN 0x00000100 /* Must join kthread on exit */
#define LP_SINGLESTEP 0x00000400 /* Single step thread in ptrace(2) */
#define LP_TIMEINTR 0x00010000 /* Time this soft interrupt */
#define LP_PREEMPTING 0x00020000 /* mi_switch called involuntarily */
#define LP_RUNNING 0x20000000 /* Active on a CPU */
#define LP_TELEPORT 0x40000000 /* Teleport to new CPU on preempt() */
#define LP_BOUND 0x80000000 /* Bound to a CPU */
/*
* The third set of flags is kept in l_prflag and they are modified only
* with p_lock held.
*/
#define LPR_DETACHED 0x00800000 /* Won't be waited for. */
#define LPR_DRAINING 0x80000000 /* Draining references before exiting */
/*
* Mask indicating that there is "exceptional" work to be done on return to
* user.
*/
#define LW_USERRET (LW_WEXIT | LW_PENDSIG | LW_WREBOOT | LW_WSUSPEND \
| LW_WCORE | LW_LWPCTL | LW_CACHECRED)
/*
* Status values.
*
* A note about LSRUN and LSONPROC: LSRUN indicates that a process is
* runnable but *not* yet running, i.e. is on a run queue. LSONPROC
* indicates that the process is actually executing on a CPU, i.e.
* it is no longer on a run queue.
*
* These values are set in stone and must not be reused with future changes.
*/
#define LSIDL 1 /* Process being created by fork. */
#define LSRUN 2 /* Currently runnable. */
#define LSSLEEP 3 /* Sleeping on an address. */
#define LSSTOP 4 /* Process debugging or suspension. */
#define LSZOMB 5 /* Awaiting collection by parent. */
/* define LSDEAD 6 Process is almost a zombie. (removed in 5.0) */
#define LSONPROC 7 /* Process is currently on a CPU. */
#define LSSUSPENDED 8 /* Not running, not signalable. */
#if defined(_KERNEL) || defined(_KMEMUSER)
static __inline void *
lwp_getpcb(struct lwp *l)
{
return l->l_addr;
}
#endif /* _KERNEL || _KMEMUSER */
#ifdef _KERNEL
void lwpinit(void);
void lwp0_init(void);
void lwp_startup(lwp_t *, lwp_t *);
void startlwp(void *);
void lwp_lock(lwp_t *);
void lwp_unlock(lwp_t *);
pri_t lwp_eprio(lwp_t *);
int lwp_locked(lwp_t *, kmutex_t *);
kmutex_t *lwp_setlock(lwp_t *, kmutex_t *);
void lwp_unlock_to(lwp_t *, kmutex_t *);
int lwp_trylock(lwp_t *);
void lwp_changepri(lwp_t *, pri_t);
void lwp_lendpri(lwp_t *, pri_t);
void lwp_addref(lwp_t *);
void lwp_delref(lwp_t *);
void lwp_delref2(lwp_t *);
bool lwp_drainrefs(lwp_t *);
bool lwp_alive(lwp_t *);
lwp_t *lwp_find_first(proc_t *);
int lwp_wait(lwp_t *, lwpid_t, lwpid_t *, bool);
void lwp_continue(lwp_t *);
void lwp_unsleep(lwp_t *, bool);
void lwp_unstop(lwp_t *);
void lwp_exit(lwp_t *);
int lwp_suspend(lwp_t *, lwp_t *);
int lwp_create1(lwp_t *, const void *, size_t, u_long, lwpid_t *);
void lwp_start(lwp_t *, int);
void lwp_migrate(lwp_t *, struct cpu_info *);
lwp_t * lwp_find2(pid_t, lwpid_t);
lwp_t * lwp_find(proc_t *, int);
void lwp_userret(lwp_t *);
void lwp_need_userret(lwp_t *);
void lwp_free(lwp_t *, bool, bool);
long lwp_pctr(void);
int lwp_setprivate(lwp_t *, void *);
int do_lwp_create(lwp_t *, void *, u_long, lwp_t **, const sigset_t *,
const stack_t *);
void lwp_thread_cleanup(lwp_t *);
void lwpinit_specificdata(void);
int lwp_specific_key_create(specificdata_key_t *, specificdata_dtor_t);
void lwp_specific_key_delete(specificdata_key_t);
void lwp_initspecific(lwp_t *);
void lwp_finispecific(lwp_t *);
void *lwp_getspecific(specificdata_key_t);
#if defined(_LWP_API_PRIVATE)
void *_lwp_getspecific_by_lwp(lwp_t *, specificdata_key_t);
#endif
void lwp_setspecific(specificdata_key_t, void *);
void lwp_setspecific_by_lwp(lwp_t *, specificdata_key_t, void *);
/* Syscalls. */
int lwp_park(clockid_t, int, struct timespec *);
int lwp_unpark(const lwpid_t *, const u_int);
/* DDB. */
void lwp_whatis(uintptr_t, void (*)(const char *, ...) __printflike(1, 2));
int lwp_create(lwp_t *, struct proc *, vaddr_t, int, void *, size_t,
void (*)(void *), void *, lwp_t **, int, const sigset_t *, const stack_t *);
/*
* XXX _MODULE
* We should provide real stubs for the below that modules can use.
*/
static __inline void
spc_lock(struct cpu_info *ci)
{
mutex_spin_enter(ci->ci_schedstate.spc_mutex);
}
static __inline void
spc_unlock(struct cpu_info *ci)
{
mutex_spin_exit(ci->ci_schedstate.spc_mutex);
}
static __inline void
spc_dlock(struct cpu_info *ci1, struct cpu_info *ci2)
{
struct schedstate_percpu *spc1 = &ci1->ci_schedstate;
struct schedstate_percpu *spc2 = &ci2->ci_schedstate;
KASSERT(ci1 != ci2);
if (ci1 < ci2) {
mutex_spin_enter(spc1->spc_mutex);
mutex_spin_enter(spc2->spc_mutex);
} else {
mutex_spin_enter(spc2->spc_mutex);
mutex_spin_enter(spc1->spc_mutex);
}
}
/*
* Allow machine-dependent code to override curlwp in <machine/cpu.h> for
* its own convenience. Otherwise, we declare it as appropriate.
*/
#if !defined(curlwp)
#if defined(MULTIPROCESSOR)
#define curlwp curcpu()->ci_curlwp /* Current running LWP */
#else
extern struct lwp *curlwp; /* Current running LWP */
#endif /* MULTIPROCESSOR */
#endif /* ! curlwp */
#define curproc (curlwp->l_proc)
/*
* This provides a way for <machine/cpu.h> to get l_cpu for curlwp before
* struct lwp is defined.
*/
static __inline struct cpu_info *
lwp_getcpu(struct lwp *l)
{
return l->l_cpu;
}
static __inline bool
CURCPU_IDLE_P(void)
{
struct cpu_info *ci = curcpu();
return ci->ci_onproc == ci->ci_data.cpu_idlelwp;
}
/*
* Disable and re-enable preemption. Only for low-level kernel
* use. Device drivers and anything that could potentially be
* compiled as a module should use kpreempt_disable() and
* kpreempt_enable().
*/
static __inline void
KPREEMPT_DISABLE(lwp_t *l)
{
struct lwp *l1 __diagused;
KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1);
l->l_nopreempt++;
__insn_barrier();
}
static __inline void
KPREEMPT_ENABLE(lwp_t *l)
{
struct lwp *l1 __diagused;
KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1);
KASSERT(l->l_nopreempt > 0);
__insn_barrier();
l->l_nopreempt--;
__insn_barrier();
if (__predict_false(l->l_dopreempt))
kpreempt(0);
}
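/*
 * Illustrative usage sketch (an assumption, not taken from this header):
 * drivers and modules use the kpreempt_disable()/kpreempt_enable()
 * wrappers rather than the macros above, e.g. to keep curcpu() stable:
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	... use ci without risk of migrating to another CPU ...
 *	kpreempt_enable();
 */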
/* For lwp::l_dopreempt */
#define DOPREEMPT_ACTIVE 0x01
#define DOPREEMPT_COUNTED 0x02
/*
* Prevent curlwp from migrating between CPUs between curlwp_bind and
* curlwp_bindx. One use case is psref(9) that has a contract that
* forbids migrations.
*/
static __inline int
curlwp_bind(void)
{
int bound;
bound = curlwp->l_pflag & LP_BOUND;
curlwp->l_pflag |= LP_BOUND;
__insn_barrier();
return bound;
}
static __inline void
curlwp_bindx(int bound)
{
KASSERT(curlwp->l_pflag & LP_BOUND);
__insn_barrier();
curlwp->l_pflag ^= bound ^ LP_BOUND;
}
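/*
 * Illustrative usage sketch (an assumption, not taken from this header):
 * a psref(9)-style critical section is bracketed with curlwp_bind() and
 * curlwp_bindx() so the LWP cannot migrate while the reference is held:
 *
 *	int bound = curlwp_bind();
 *	... acquire and use the passive reference ...
 *	curlwp_bindx(bound);
 */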
#endif /* _KERNEL */
/* Flags for _lwp_create(), as per Solaris. */
#define LWP_DETACHED 0x00000040
#define LWP_SUSPENDED 0x00000080
/* Kernel-internal flags for LWP creation. */
/* 0x40000000 was LWP_PIDLID */
#define LWP_VFORK 0x80000000
#endif /* !_SYS_LWP_H_ */
/* $NetBSD: bus_private.h,v 1.16 2022/01/22 15:10:32 skrll Exp $ */
/* NetBSD: bus.h,v 1.8 2005/03/09 19:04:46 matt Exp */
/*-
* Copyright (c) 1996, 1997, 1998, 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 Charles M. Hannum. All rights reserved.
* Copyright (c) 1996 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(_X86_BUS_PRIVATE_H_)
#define _X86_BUS_PRIVATE_H_
/*
* Cookie used for bounce buffers. A pointer to one of these is stashed in
* the DMA map.
*/
struct x86_bus_dma_cookie {
int id_flags; /* flags; see below */
/*
* Information about the original buffer used during
* DMA map syncs. Note that id_origbuflen is only used
* for X86_DMA_BUFTYPE_LINEAR.
*/
void *id_origbuf; /* pointer to orig buffer if
bouncing */
bus_size_t id_origbuflen; /* ...and size */
int id_buftype; /* type of buffer */
void *id_bouncebuf; /* pointer to the bounce buffer */
bus_size_t id_bouncebuflen; /* ...and size */
int id_nbouncesegs; /* number of valid bounce segs */
bus_dma_segment_t id_bouncesegs[0]; /* array of bounce buffer
physical memory segments */
};
/* id_flags */
#define X86_DMA_MIGHT_NEED_BOUNCE 0x01 /* may need bounce buffers */
#define X86_DMA_HAS_BOUNCE 0x02 /* has bounce buffers */
#define X86_DMA_IS_BOUNCING 0x04 /* is bouncing current xfer */
/* id_buftype */
#define X86_DMA_BUFTYPE_INVALID 0
#define X86_DMA_BUFTYPE_LINEAR 1
#define X86_DMA_BUFTYPE_MBUF 2
#define X86_DMA_BUFTYPE_UIO 3
#define X86_DMA_BUFTYPE_RAW 4
/*
* default address translation macros, which are appropriate where
* paddr_t == bus_addr_t.
*/
#if !defined(_BUS_PHYS_TO_BUS)
#define _BUS_PHYS_TO_BUS(pa) ((bus_addr_t)(pa))
#endif /* !defined(_BUS_PHYS_TO_BUS) */
#if !defined(_BUS_BUS_TO_PHYS)
#define _BUS_BUS_TO_PHYS(ba) ((paddr_t)(ba))
#endif /* !defined(_BUS_BUS_TO_PHYS) */
#if !defined(_BUS_VM_PAGE_TO_BUS)
#define _BUS_VM_PAGE_TO_BUS(pg) _BUS_PHYS_TO_BUS(VM_PAGE_TO_PHYS(pg))
#endif /* !defined(_BUS_VM_PAGE_TO_BUS) */
#if !defined(_BUS_BUS_TO_VM_PAGE)
#define _BUS_BUS_TO_VM_PAGE(ba) PHYS_TO_VM_PAGE(ba)
#endif /* !defined(_BUS_BUS_TO_VM_PAGE) */
#if !defined(_BUS_PMAP_ENTER)
#define _BUS_PMAP_ENTER(pmap, va, ba, prot, flags) \
pmap_enter(pmap, va, ba, prot, flags)
#endif /* _BUS_PMAP_ENTER */
#if !defined(_BUS_VIRT_TO_BUS)
#include <uvm/uvm_extern.h>
static __inline bus_addr_t _bus_virt_to_bus(struct pmap *, vaddr_t);
#define _BUS_VIRT_TO_BUS(pm, va) _bus_virt_to_bus((pm), (va))
static __inline bus_addr_t
_bus_virt_to_bus(struct pmap *pm, vaddr_t va)
{
paddr_t pa;
if (!pmap_extract(pm, va, &pa)) {
panic("_bus_virt_to_bus");
}
return _BUS_PHYS_TO_BUS(pa);
}
#endif /* !defined(_BUS_VIRT_TO_BUS) */
/*
* By default, the end address of RAM visible on the bus is the same as the
* largest physical address.
*/
#ifndef _BUS_AVAIL_END
#define _BUS_AVAIL_END (avail_end - 1)
#endif
struct x86_bus_dma_tag {
bus_dma_tag_t bdt_super;
/* bdt_present: bitmap indicating overrides present (1) in *this* tag,
* bdt_exists: bitmap indicating overrides present (1) in *this* tag
* or in an ancestor's tag (follow bdt_super to ancestors)
*/
uint64_t bdt_present;
uint64_t bdt_exists;
const struct bus_dma_overrides *bdt_ov;
void *bdt_ctx;
/*
* The `bounce threshold' is checked while we are loading
* the DMA map. If the physical address of the segment
* exceeds the threshold, an error will be returned. The
* caller can then take whatever action is necessary to
* bounce the transfer. If this value is 0, it will be
* ignored.
*/
int _tag_needs_free;
bus_addr_t _bounce_thresh;
bus_addr_t _bounce_alloc_lo;
bus_addr_t _bounce_alloc_hi;
int (*_may_bounce)(bus_dma_tag_t, bus_dmamap_t, int, int *);
};
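/*
 * Illustrative sketch (an assumption, not taken from this header): a tag
 * that must bounce transfers above a physical address limit would set
 * _bounce_thresh and a matching bounce allocation window, e.g. for an
 * ISA-style 16MB limit:
 *
 *	tag._bounce_thresh = 0x1000000;
 *	tag._bounce_alloc_lo = 0;
 *	tag._bounce_alloc_hi = 0x1000000;
 */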
#endif /* !defined(_X86_BUS_PRIVATE_H_) */
/* $NetBSD: subr_bufq.c,v 1.27 2019/02/17 23:17:41 bad Exp $ */
/* NetBSD: subr_disk.c,v 1.70 2005/08/20 12:00:01 yamt Exp $ */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_bufq.c,v 1.27 2019/02/17 23:17:41 bad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#define STRAT_MATCH(id, bs) (strcmp((id), (bs)->bs_name) == 0)
static void sysctl_kern_bufq_strategies_setup(struct sysctllog **);
static SLIST_HEAD(, bufq_strat) bufq_strat_list =
SLIST_HEAD_INITIALIZER(bufq_strat_list);
static kmutex_t bufq_mutex;
static struct sysctllog *sysctllog;
void
bufq_init(void)
{
mutex_init(&bufq_mutex, MUTEX_DEFAULT, IPL_NONE);
sysctl_kern_bufq_strategies_setup(&sysctllog);
}
int
bufq_register(struct bufq_strat *bs)
{
mutex_enter(&bufq_mutex);
SLIST_INSERT_HEAD(&bufq_strat_list, bs, bs_next);
bs->bs_refcnt = 0;
mutex_exit(&bufq_mutex);
return 0;
}
int
bufq_unregister(struct bufq_strat *bs)
{
mutex_enter(&bufq_mutex);
if (bs->bs_refcnt != 0) {
mutex_exit(&bufq_mutex);
return EBUSY;
}
SLIST_REMOVE(&bufq_strat_list, bs, bufq_strat, bs_next);
mutex_exit(&bufq_mutex);
return 0;
}
/*
* Create a device buffer queue.
*/
int
bufq_alloc(struct bufq_state **bufqp, const char *strategy, int flags)
{
struct bufq_strat *bsp, *it;
struct bufq_state *bufq;
int error = 0;
u_int gen;
bool found_exact;
char strategy_module_name[MAXPATHLEN];
KASSERT((flags & BUFQ_EXACT) == 0 || strategy != BUFQ_STRAT_ANY);
switch (flags & BUFQ_SORT_MASK) {
case BUFQ_SORT_RAWBLOCK:
case BUFQ_SORT_CYLINDER:
break;
case 0:
/*
* for strategies that do not care about block numbers,
* e.g. fcfs.
*/
flags |= BUFQ_SORT_RAWBLOCK;
break;
default:
panic("bufq_alloc: sort out of range");
}
/*
* Select a strategy.
* If the strategy requested by the caller is found, use it;
* otherwise, select the one with the largest bs_prio.
*/
mutex_enter(&bufq_mutex);
do {
gen = module_gen;
bsp = NULL;
found_exact = false;
SLIST_FOREACH(it, &bufq_strat_list, bs_next) {
if (strategy != BUFQ_STRAT_ANY &&
STRAT_MATCH(strategy, (it))) {
bsp = it;
found_exact = true;
break;
}
if (bsp == NULL || (it)->bs_prio > bsp->bs_prio)
bsp = it;
}
if (strategy == BUFQ_STRAT_ANY || found_exact)
break;
/* Try to autoload the bufq strategy module */
strlcpy(strategy_module_name, "bufq_",
sizeof(strategy_module_name));
strlcat(strategy_module_name, strategy,
sizeof(strategy_module_name));
mutex_exit(&bufq_mutex);
(void) module_autoload(strategy_module_name, MODULE_CLASS_BUFQ);
mutex_enter(&bufq_mutex);
} while (gen != module_gen);
if (bsp == NULL) {
panic("bufq_alloc: no strategy");
}
if (strategy != BUFQ_STRAT_ANY && !found_exact) {
if ((flags & BUFQ_EXACT)) {
error = ENOENT;
mutex_exit(&bufq_mutex);
goto out;
}
#if defined(DEBUG)
printf("bufq_alloc: '%s' is not available. using '%s'.\n",
strategy, bsp->bs_name);
#endif
}
#if defined(BUFQ_DEBUG)
/* XXX aprint? */
printf("bufq_alloc: using '%s'\n", bsp->bs_name);
#endif
bsp->bs_refcnt++;
mutex_exit(&bufq_mutex);
*bufqp = bufq = kmem_zalloc(sizeof(*bufq), KM_SLEEP);
bufq->bq_flags = flags;
bufq->bq_strat = bsp;
(*bsp->bs_initfn)(bufq);
out:
return error;
}
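/*
 * Illustrative usage sketch (an assumption, not taken from this file): a
 * disk driver typically allocates a queue sorted by raw block number and
 * names a preferred strategy without BUFQ_EXACT, so the highest-priority
 * registered strategy is used as a fallback:
 *
 *	struct bufq_state *bq;
 *	int error;
 *
 *	error = bufq_alloc(&bq, "fcfs", BUFQ_SORT_RAWBLOCK);
 *	if (error == 0) {
 *		bufq_put(bq, bp);
 *		bp = bufq_get(bq);
 *	}
 */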
void
bufq_put(struct bufq_state *bufq, struct buf *bp)
{
(*bufq->bq_put)(bufq, bp);
}
struct buf *
bufq_get(struct bufq_state *bufq)
{
return (*bufq->bq_get)(bufq, 1);
}
struct buf *
bufq_peek(struct bufq_state *bufq)
{
return (*bufq->bq_get)(bufq, 0);
}
struct buf *
bufq_cancel(struct bufq_state *bufq, struct buf *bp)
{
return (*bufq->bq_cancel)(bufq, bp);
}
/*
* Drain a device buffer queue.
*/
void
bufq_drain(struct bufq_state *bufq)
{
struct buf *bp;
while ((bp = bufq_get(bufq)) != NULL) {
bp->b_error = EIO;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
}
/*
* Destroy a device buffer queue.
*/
void
bufq_free(struct bufq_state *bufq)
{
KASSERT(bufq_peek(bufq) == NULL);
bufq->bq_fini(bufq);
mutex_enter(&bufq_mutex);
bufq->bq_strat->bs_refcnt--;
mutex_exit(&bufq_mutex);
kmem_free(bufq, sizeof(*bufq));
}
/*
* Get the strategy identifier of a buffer queue.
*/
const char *
bufq_getstrategyname(struct bufq_state *bufq)
{
return bufq->bq_strat->bs_name;
}
/*
* Move all requests from one buffer queue to another.
*/
void
bufq_move(struct bufq_state *dst, struct bufq_state *src)
{
struct buf *bp;
while ((bp = bufq_get(src)) != NULL) {
bufq_put(dst, bp);
}
}
static int
docopy(char *buf, size_t *bufoffp, size_t buflen,
const char *datap, size_t datalen)
{
int error = 0;
if (buf != NULL && datalen > 0) {
if (*bufoffp + datalen > buflen) {
goto out;
}
error = copyout(datap, buf + *bufoffp, datalen);
if (error) {
goto out;
}
}
out:
if (error == 0) {
*bufoffp += datalen;
}
return error;
}
static int
docopystr(char *buf, size_t *bufoffp, size_t buflen, const char *datap)
{
return docopy(buf, bufoffp, buflen, datap, strlen(datap));
}
static int
docopynul(char *buf, size_t *bufoffp, size_t buflen)
{
return docopy(buf, bufoffp, buflen, "", 1);
}
/*
* sysctl handler that reports the names of all bufq strategies
* currently available to the kernel.
*/
static int
sysctl_kern_bufq_strategies(SYSCTLFN_ARGS)
{
const struct bufq_strat *bq_strat;
const char *delim = "";
size_t off = 0;
size_t buflen = *oldlenp;
int error;
SLIST_FOREACH(bq_strat, &bufq_strat_list, bs_next) {
error = docopystr(oldp, &off, buflen, delim);
if (error) {
goto out;
}
error = docopystr(oldp, &off, buflen, (bq_strat)->bs_name);
if (error) {
goto out;
}
delim = " ";
}
/* In case there are no registered strategies ... */
if (off == 0) {
error = docopystr(oldp, &off, buflen, "NULL");
if (error) {
goto out;
}
}
/* NUL terminate */
error = docopynul(oldp, &off, buflen);
out:
*oldlenp = off;
return error;
}
static void
sysctl_kern_bufq_strategies_setup(struct sysctllog **clog)
{
const struct sysctlnode *node;
node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "bufq",
SYSCTL_DESCR("buffer queue subtree"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node != NULL) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "strategies",
SYSCTL_DESCR("List of bufq strategies present"),
sysctl_kern_bufq_strategies, 0, NULL, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
}
}
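/*
 * Illustrative userland sketch (not part of this file): reading the
 * kern.bufq.strategies node created above with sysctlbyname(3).
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	char buf[128];
 *	size_t len = sizeof(buf);
 *
 *	if (sysctlbyname("kern.bufq.strategies", buf, &len, NULL, 0) == 0)
 *		printf("available strategies: %s\n", buf);
 */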
/* $NetBSD: kern_time.c,v 1.221 2023/02/23 02:57:17 riastradh Exp $ */
/*-
* Copyright (c) 2000, 2004, 2005, 2007, 2008, 2009, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christopher G. Demetriou, by Andrew Doran, and by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_time.c 8.4 (Berkeley) 5/26/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_time.c,v 1.221 2023/02/23 02:57:17 riastradh Exp $");
#include <sys/param.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/timetc.h>
#include <sys/timevar.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
kmutex_t itimer_mutex __cacheline_aligned; /* XXX static */
static struct itlist itimer_realtime_changed_notify;
static void itimer_callout(void *);
static void ptimer_intr(void *);
static void *ptimer_sih __read_mostly;
static TAILQ_HEAD(, ptimer) ptimer_queue;
#define CLOCK_VIRTUAL_P(clockid) \
((clockid) == CLOCK_VIRTUAL || (clockid) == CLOCK_PROF)
CTASSERT(ITIMER_REAL == CLOCK_REALTIME);
CTASSERT(ITIMER_VIRTUAL == CLOCK_VIRTUAL);
CTASSERT(ITIMER_PROF == CLOCK_PROF);
CTASSERT(ITIMER_MONOTONIC == CLOCK_MONOTONIC);
#define DELAYTIMER_MAX 32
/*
* Initialize timekeeping.
*/
void
time_init(void)
{
mutex_init(&itimer_mutex, MUTEX_DEFAULT, IPL_SCHED);
LIST_INIT(&itimer_realtime_changed_notify);
TAILQ_INIT(&ptimer_queue);
ptimer_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
ptimer_intr, NULL);
}
/*
* Check if the time will wrap if set to ts.
*
* ts - timespec describing the new time
* delta - the delta between the current time and ts
*/
bool
time_wraps(struct timespec *ts, struct timespec *delta)
{
/*
* Don't allow the time to be set forward so far it
* will wrap and become negative, thus allowing an
* attacker to bypass the next check below. The
* cutoff is 1 year before rollover occurs, so even
* if the attacker uses adjtime(2) to move the time
* past the cutoff, it will take a very long time
* to get to the wrap point.
*/
if ((ts->tv_sec > LLONG_MAX - 365*24*60*60) ||
(delta->tv_sec < 0 || delta->tv_nsec < 0))
return true;
return false;
}
/*
* itimer_lock:
*
* Acquire the interval timer data lock.
*/
void
itimer_lock(void)
{
mutex_spin_enter(&itimer_mutex);
}
/*
* itimer_unlock:
*
* Release the interval timer data lock.
*/
void
itimer_unlock(void)
{
mutex_spin_exit(&itimer_mutex);
}
/*
* itimer_lock_held:
*
* Check that the interval timer lock is held for diagnostic
* assertions.
*/
inline bool __diagused
itimer_lock_held(void)
{
return mutex_owned(&itimer_mutex);
}
/*
* Time of day and interval timer support.
*
* These routines provide the kernel entry points to get and set
* the time-of-day and per-process interval timers. Subroutines
* here provide support for adding and subtracting timeval structures
* and decrementing interval timers, optionally reloading the interval
* timers when they expire.
*/
/* This function is used by clock_settime and settimeofday */
static int
settime1(struct proc *p, const struct timespec *ts, bool check_kauth)
{
struct timespec delta, now;
/*
* The time being set to an unreasonable value will cause
* unreasonable system behaviour.
*/
if (ts->tv_sec < 0 || ts->tv_sec > (1LL << 36))
return EINVAL;
nanotime(&now);
timespecsub(ts, &now, &delta);
if (check_kauth && kauth_authorize_system(kauth_cred_get(),
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_SYSTEM, __UNCONST(ts),
&delta, KAUTH_ARG(check_kauth ? false : true)) != 0) {
return EPERM;
}
#ifdef notyet
if ((delta.tv_sec < 86400) && securelevel > 0) { /* XXX elad - notyet */
return EPERM;
}
#endif
tc_setclock(ts);
resettodr();
/*
* Notify pending CLOCK_REALTIME timers about the real time change.
* There may be inactive timers on this list, but this happens
* comparatively less often than timers firing, and so it's better
* to put the extra checks here than to complicate the other code
* path.
*/
struct itimer *it;
itimer_lock();
LIST_FOREACH(it, &itimer_realtime_changed_notify, it_rtchgq) {
KASSERT(it->it_ops->ito_realtime_changed != NULL);
if (timespecisset(&it->it_time.it_value)) {
(*it->it_ops->ito_realtime_changed)(it);
}
}
itimer_unlock();
return 0;
}
int
settime(struct proc *p, struct timespec *ts)
{
return settime1(p, ts, true);
}
/* ARGSUSED */
int
sys___clock_gettime50(struct lwp *l,
const struct sys___clock_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
} */
int error;
struct timespec ats;
error = clock_gettime1(SCARG(uap, clock_id), &ats);
if (error != 0)
return error;
return copyout(&ats, SCARG(uap, tp), sizeof(ats));
}
/* ARGSUSED */
int
sys___clock_settime50(struct lwp *l,
const struct sys___clock_settime50_args *uap, register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(const struct timespec *) tp;
} */
int error;
struct timespec ats;
if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0)
return error;
return clock_settime1(l->l_proc, SCARG(uap, clock_id), &ats, true);
}
int
clock_settime1(struct proc *p, clockid_t clock_id, const struct timespec *tp,
bool check_kauth)
{
int error;
if (tp->tv_nsec < 0 || tp->tv_nsec >= 1000000000L)
return EINVAL;
switch (clock_id) {
case CLOCK_REALTIME:
if ((error = settime1(p, tp, check_kauth)) != 0)
return error;
break;
case CLOCK_MONOTONIC:
return EINVAL; /* read-only clock */
default:
return EINVAL;
}
return 0;
}
int
sys___clock_getres50(struct lwp *l, const struct sys___clock_getres50_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
} */
struct timespec ts;
int error;
if ((error = clock_getres1(SCARG(uap, clock_id), &ts)) != 0)
return error;
if (SCARG(uap, tp))
error = copyout(&ts, SCARG(uap, tp), sizeof(ts));
return error;
}
int
clock_getres1(clockid_t clock_id, struct timespec *ts)
{
switch (clock_id) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
ts->tv_sec = 0;
if (tc_getfrequency() > 1000000000)
ts->tv_nsec = 1;
else
ts->tv_nsec = 1000000000 / tc_getfrequency();
break;
default:
return EINVAL;
}
return 0;
}
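/*
 * Worked example (illustrative, not part of the original source): with a
 * hypothetical timecounter running at 100 MHz, tc_getfrequency() returns
 * 100000000, so the reported resolution is 1000000000 / 100000000 = 10 ns.
 * Frequencies above 1 GHz are clamped to a 1 ns resolution by the check
 * above.
 */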
/* ARGSUSED */
int
sys___nanosleep50(struct lwp *l, const struct sys___nanosleep50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct timespec *) rqtp;
syscallarg(struct timespec *) rmtp;
} */
struct timespec rmt, rqt;
int error, error1;
error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec));
if (error)
return error;
error = nanosleep1(l, CLOCK_MONOTONIC, 0, &rqt,
SCARG(uap, rmtp) ? &rmt : NULL);
if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR))
return error;
error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt));
return error1 ? error1 : error;
}
/* ARGSUSED */
int
sys_clock_nanosleep(struct lwp *l, const struct sys_clock_nanosleep_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(int) flags;
syscallarg(struct timespec *) rqtp;
syscallarg(struct timespec *) rmtp;
} */
struct timespec rmt, rqt;
int error, error1;
error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec));
if (error)
goto out;
error = nanosleep1(l, SCARG(uap, clock_id), SCARG(uap, flags), &rqt,
SCARG(uap, rmtp) ? &rmt : NULL);
if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR))
goto out;
if ((SCARG(uap, flags) & TIMER_ABSTIME) == 0 &&
(error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt))) != 0)
error = error1;
out:
*retval = error;
return 0;
}
int
nanosleep1(struct lwp *l, clockid_t clock_id, int flags, struct timespec *rqt,
struct timespec *rmt)
{
struct timespec rmtstart;
int error, timo;
if ((error = ts2timo(clock_id, flags, rqt, &timo, &rmtstart)) != 0) {
if (error == ETIMEDOUT) {
error = 0;
if (rmt != NULL)
rmt->tv_sec = rmt->tv_nsec = 0;
}
return error;
}
/*
* Avoid inadvertently sleeping forever
*/
if (timo == 0)
timo = 1;
again:
error = kpause("nanoslp", true, timo, NULL);
if (error == EWOULDBLOCK)
error = 0;
if (rmt != NULL || error == 0) {
struct timespec rmtend;
struct timespec t0;
struct timespec *t;
int err;
err = clock_gettime1(clock_id, &rmtend);
if (err != 0)
return err;
t = (rmt != NULL) ? rmt : &t0;
if (flags & TIMER_ABSTIME) {
timespecsub(rqt, &rmtend, t);
} else {
if (timespeccmp(&rmtend, &rmtstart, <))
timespecclear(t); /* clock wound back */
else
timespecsub(&rmtend, &rmtstart, t);
if (timespeccmp(rqt, t, <))
timespecclear(t);
else
timespecsub(rqt, t, t);
}
if (t->tv_sec < 0)
timespecclear(t);
if (error == 0) {
timo = tstohz(t);
if (timo > 0)
goto again;
}
}
if (error == ERESTART)
error = EINTR;
return error;
}
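/*
 * Illustrative userland sketch (not part of this file): sleeping until an
 * absolute CLOCK_MONOTONIC deadline, which exercises the TIMER_ABSTIME
 * branch of nanosleep1() above and avoids drift across repeated wakeups.
 *
 *	#include <time.h>
 *
 *	struct timespec deadline;
 *
 *	clock_gettime(CLOCK_MONOTONIC, &deadline);
 *	deadline.tv_sec += 1;			wake up 1 second from now
 *	while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
 *	    &deadline, NULL) == EINTR)
 *		continue;			retry if interrupted
 */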
int
sys_clock_getcpuclockid2(struct lwp *l,
const struct sys_clock_getcpuclockid2_args *uap,
register_t *retval)
{
/* {
syscallarg(idtype_t) idtype;
syscallarg(id_t) id;
syscallarg(clockid_t *) clock_id;
} */
pid_t pid;
lwpid_t lid;
clockid_t clock_id;
id_t id = SCARG(uap, id);
switch (SCARG(uap, idtype)) {
case P_PID:
pid = id == 0 ? l->l_proc->p_pid : id;
clock_id = CLOCK_PROCESS_CPUTIME_ID | pid;
break;
case P_LWPID:
lid = id == 0 ? l->l_lid : id;
clock_id = CLOCK_THREAD_CPUTIME_ID | lid;
break;
default:
return EINVAL;
}
return copyout(&clock_id, SCARG(uap, clock_id), sizeof(clock_id));
}
/* ARGSUSED */
int
sys___gettimeofday50(struct lwp *l, const struct sys___gettimeofday50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct timeval *) tp;
syscallarg(void *) tzp; really "struct timezone *";
} */
struct timeval atv;
int error = 0;
struct timezone tzfake;
if (SCARG(uap, tp)) {
memset(&atv, 0, sizeof(atv));
microtime(&atv);
error = copyout(&atv, SCARG(uap, tp), sizeof(atv));
if (error)
return error;
}
if (SCARG(uap, tzp)) {
/*
* NetBSD has no kernel notion of time zone, so we just
* fake up a timezone struct and return it if demanded.
*/
tzfake.tz_minuteswest = 0;
tzfake.tz_dsttime = 0;
error = copyout(&tzfake, SCARG(uap, tzp), sizeof(tzfake));
}
return error;
}
/* ARGSUSED */
int
sys___settimeofday50(struct lwp *l, const struct sys___settimeofday50_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct timeval *) tv;
syscallarg(const void *) tzp; really "const struct timezone *";
} */
return settimeofday1(SCARG(uap, tv), true, SCARG(uap, tzp), l, true);
}
int
settimeofday1(const struct timeval *utv, bool userspace,
const void *utzp, struct lwp *l, bool check_kauth)
{
struct timeval atv;
struct timespec ts;
int error;
/* Verify all parameters before changing time. */
/*
* NetBSD has no kernel notion of time zone, and only an
* obsolete program would try to set it, so we log a warning.
*/
if (utzp)
log(LOG_WARNING, "pid %d attempted to set the "
"(obsolete) kernel time zone\n", l->l_proc->p_pid); if (utv == NULL)
return 0;
if (userspace) {
if ((error = copyin(utv, &atv, sizeof(atv))) != 0)
return error;
utv = &atv;
}
if (utv->tv_usec < 0 || utv->tv_usec >= 1000000)
return EINVAL;
TIMEVAL_TO_TIMESPEC(utv, &ts);
return settime1(l->l_proc, &ts, check_kauth);
}
int time_adjusted; /* set if an adjustment is made */
/* ARGSUSED */
int
sys___adjtime50(struct lwp *l, const struct sys___adjtime50_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct timeval *) delta;
syscallarg(struct timeval *) olddelta;
} */
int error;
struct timeval atv, oldatv;
if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME,
KAUTH_REQ_SYSTEM_TIME_ADJTIME, NULL, NULL, NULL)) != 0)
return error;
if (SCARG(uap, delta)) {
error = copyin(SCARG(uap, delta), &atv,
sizeof(*SCARG(uap, delta)));
if (error)
return error;
}
adjtime1(SCARG(uap, delta) ? &atv : NULL,
SCARG(uap, olddelta) ? &oldatv : NULL, l->l_proc);
if (SCARG(uap, olddelta))
error = copyout(&oldatv, SCARG(uap, olddelta),
sizeof(*SCARG(uap, olddelta)));
return error;
}
void
adjtime1(const struct timeval *delta, struct timeval *olddelta, struct proc *p)
{
if (olddelta) {
memset(olddelta, 0, sizeof(*olddelta));
mutex_spin_enter(&timecounter_lock);
olddelta->tv_sec = time_adjtime / 1000000;
olddelta->tv_usec = time_adjtime % 1000000;
if (olddelta->tv_usec < 0) {
olddelta->tv_usec += 1000000;
olddelta->tv_sec--;
}
mutex_spin_exit(&timecounter_lock);
}
if (delta) {
mutex_spin_enter(&timecounter_lock);
/*
* XXX This should maybe just report failure to
* userland for nonsense deltas.
*/
if (delta->tv_sec > INT64_MAX/1000000 - 1) {
time_adjtime = INT64_MAX;
} else if (delta->tv_sec < INT64_MIN/1000000 + 1) {
time_adjtime = INT64_MIN;
} else {
time_adjtime = delta->tv_sec * 1000000
+ MAX(-999999, MIN(999999, delta->tv_usec));
}
if (time_adjtime) {
/* We need to save the system time during shutdown */
time_adjusted |= 1;
}
mutex_spin_exit(&timecounter_lock);
}
}
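/*
 * Worked example (illustrative, not part of the original source): a delta
 * of { .tv_sec = 2, .tv_usec = 500000 } is stored as time_adjtime =
 * 2 * 1000000 + 500000 = 2500000 microseconds; reading it back above
 * yields olddelta = { 2, 500000 } again (2500000 / 1000000 = 2 seconds,
 * 2500000 % 1000000 = 500000 microseconds).
 */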
/*
* Interval timer support.
*
* The itimer_*() routines provide generic support for interval timers,
* both real (CLOCK_REALTIME, CLOCK_MONOTONIC), and virtual (CLOCK_VIRTUAL,
* CLOCK_PROF).
*
* Real timers keep their deadline as an absolute time, and are fired
* by a callout. Virtual timers are kept as a linked-list of deltas,
* and are processed by hardclock().
*
* Because the real time timer callout may be delayed in real time due
* to interrupt processing on the system, it is possible for the real
* time timeout routine (itimer_callout()) to run well past its deadline.
* It does not suffice, therefore, to reload the real timer .it_value
* from the timer's .it_interval. Rather, we compute the next deadline
* in absolute time based on the current time and the .it_interval value,
* and report any overruns.
*
* Note that while the virtual timers are supported in a generic fashion
* here, they only (currently) make sense as per-process timers, and thus
* only really work for that case.
*/
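/*
 * Worked example of the virtual timer delta list (illustrative, not part
 * of the original source): three virtual timers due in 3, 5 and 9 ticks
 * are kept on the list as the deltas 3, 2 and 4.  Each hardclock() tick
 * decrements only the head entry; when it reaches zero that timer fires
 * and the next entry's delta becomes the new head count.
 */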
/*
* itimer_init:
*
* Initialize the common data for an interval timer.
*/
void
itimer_init(struct itimer * const it, const struct itimer_ops * const ops,
clockid_t const id, struct itlist * const itl)
{
KASSERT(itimer_lock_held());
KASSERT(ops != NULL);
timespecclear(&it->it_time.it_value);
it->it_ops = ops;
it->it_clockid = id;
it->it_overruns = 0;
it->it_dying = false;
if (!CLOCK_VIRTUAL_P(id)) {
KASSERT(itl == NULL);
callout_init(&it->it_ch, CALLOUT_MPSAFE);
callout_setfunc(&it->it_ch, itimer_callout, it);
if (id == CLOCK_REALTIME && ops->ito_realtime_changed != NULL) {
LIST_INSERT_HEAD(&itimer_realtime_changed_notify,
it, it_rtchgq);
}
} else {
KASSERT(itl != NULL);
it->it_vlist = itl;
it->it_active = false;
}
}
/*
* itimer_poison:
*
* Poison an interval timer, preventing it from being scheduled
* or processed, in preparation for freeing the timer.
*/
void
itimer_poison(struct itimer * const it)
{
KASSERT(itimer_lock_held());
it->it_dying = true;
/*
* For non-virtual timers, stop the callout, or wait for it to
* run if it has already fired. It cannot restart again after
* this point: the callout won't restart itself when dying, no
* other users holding the lock can restart it, and any other
* users waiting for callout_halt concurrently (itimer_settime)
* will restart from the top.
*/
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
callout_halt(&it->it_ch, &itimer_mutex);
if (it->it_clockid == CLOCK_REALTIME &&
it->it_ops->ito_realtime_changed != NULL) {
LIST_REMOVE(it, it_rtchgq);
}
}
}
/*
* itimer_fini:
*
* Release resources used by an interval timer.
*
* N.B. itimer_lock must be held on entry, and is released on exit.
*/
void
itimer_fini(struct itimer * const it)
{
KASSERT(itimer_lock_held());
/* All done with the global state. */
itimer_unlock();
/* Destroy the callout, if needed. */
if (!CLOCK_VIRTUAL_P(it->it_clockid))
callout_destroy(&it->it_ch);
}
/*
* itimer_decr:
*
* Decrement an interval timer by a specified number of nanoseconds,
* which must be less than a second, i.e. < 1000000000. If the timer
* expires, then reload it. In this case, carry over (nsec - old value)
* to reduce the value reloaded into the timer so that the timer does
* not drift. This routine assumes that it is called in a context where
* the timers on which it is operating cannot change in value.
*
* Returns true if the timer has expired.
*/
static bool
itimer_decr(struct itimer *it, int nsec)
{
struct itimerspec *itp;
int error __diagused;
KASSERT(itimer_lock_held());
KASSERT(CLOCK_VIRTUAL_P(it->it_clockid));
itp = &it->it_time;
if (itp->it_value.tv_nsec < nsec) {
if (itp->it_value.tv_sec == 0) {
/* expired, and already in next interval */
nsec -= itp->it_value.tv_nsec;
goto expire;
}
itp->it_value.tv_nsec += 1000000000;
itp->it_value.tv_sec--;
}
itp->it_value.tv_nsec -= nsec;
nsec = 0;
if (timespecisset(&itp->it_value))
return false;
/* expired, exactly at end of interval */
expire:
if (timespecisset(&itp->it_interval)) {
itp->it_value = itp->it_interval;
itp->it_value.tv_nsec -= nsec;
if (itp->it_value.tv_nsec < 0) {
itp->it_value.tv_nsec += 1000000000;
itp->it_value.tv_sec--;
}
error = itimer_settime(it);
KASSERT(error == 0); /* virtual, never fails */
} else
itp->it_value.tv_nsec = 0; /* sec is already 0 */
return true;
}
/*
* itimer_arm_real:
*
* Arm a non-virtual timer.
*/
static void
itimer_arm_real(struct itimer * const it)
{
KASSERT(!it->it_dying);
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
KASSERT(!callout_pending(&it->it_ch));
/*
* No need to check the tshzto() return value here;
* callout_schedule() does it for us.
*/
callout_schedule(&it->it_ch,
(it->it_clockid == CLOCK_MONOTONIC
? tshztoup(&it->it_time.it_value)
: tshzto(&it->it_time.it_value)));
}
/*
* itimer_callout:
*
* Callout to expire a non-virtual timer. Queue it up for processing,
* and then reload, if it is configured to do so.
*
* N.B. A delay in processing this callout causes multiple
* SIGALRM calls to be compressed into one.
*/
static void
itimer_callout(void *arg)
{
uint64_t last_val, next_val, interval, now_ns;
struct timespec now, next;
struct itimer * const it = arg;
int backwards;
itimer_lock();
(*it->it_ops->ito_fire)(it);
if (!timespecisset(&it->it_time.it_interval)) {
timespecclear(&it->it_time.it_value);
itimer_unlock();
return;
}
if (it->it_clockid == CLOCK_MONOTONIC) {
getnanouptime(&now);
} else {
getnanotime(&now);
}
backwards = (timespeccmp(&it->it_time.it_value, &now, >));
/* Nonnegative interval guaranteed by itimerfix. */
KASSERT(it->it_time.it_interval.tv_sec >= 0);
KASSERT(it->it_time.it_interval.tv_nsec >= 0);
/* Handle the easy case of non-overflown timers first. */
if (!backwards &&
timespecaddok(&it->it_time.it_value, &it->it_time.it_interval)) {
timespecadd(&it->it_time.it_value, &it->it_time.it_interval,
&next);
it->it_time.it_value = next;
} else {
now_ns = timespec2ns(&now);
last_val = timespec2ns(&it->it_time.it_value);
interval = timespec2ns(&it->it_time.it_interval);
next_val = now_ns +
(now_ns - last_val + interval - 1) % interval;
if (backwards)
next_val += interval;
else
it->it_overruns += (now_ns - last_val) / interval;
it->it_time.it_value.tv_sec = next_val / 1000000000;
it->it_time.it_value.tv_nsec = next_val % 1000000000;
}
/*
* Reset the callout, if it's not going away.
*/
if (!it->it_dying)
itimer_arm_real(it);
itimer_unlock();
}
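/*
 * Numerical illustration of the recomputation branch above (not part of
 * the original source): if that branch runs with backwards == 0, a 100 ms
 * interval, it_value = 1.0 s and now = 1.25 s, then it_overruns is
 * increased by (1.25 - 1.0) / 0.1 = 2 and the new deadline becomes the
 * next interval boundary at or after "now", roughly t = 1.3 s.
 */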
/*
* itimer_settime:
*
* Set up the given interval timer. The value in it->it_time.it_value
* is taken to be an absolute time for CLOCK_REALTIME/CLOCK_MONOTONIC
* timers and a relative time for CLOCK_VIRTUAL/CLOCK_PROF timers.
*
* If the callout had already fired but not yet run, fails with
* ERESTART -- caller must restart from the top to look up a timer.
*/
int
itimer_settime(struct itimer *it)
{
struct itimer *itn, *pitn;
struct itlist *itl;
KASSERT(itimer_lock_held());
KASSERT(!it->it_dying);
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
/*
* Try to stop the callout. However, if it had already
* fired, we have to drop the lock to wait for it, so
* the world may have changed and the timer may not be there
* any more. In that case, tell the caller to start
* over from the top.
*/
if (callout_halt(&it->it_ch, &itimer_mutex))
return ERESTART;
KASSERT(!it->it_dying);
/* Now we can touch it and start it up again. */
if (timespecisset(&it->it_time.it_value))
itimer_arm_real(it);
} else {
if (it->it_active) {
itn = LIST_NEXT(it, it_list);
LIST_REMOVE(it, it_list);
for ( ; itn; itn = LIST_NEXT(itn, it_list))
timespecadd(&it->it_time.it_value,
&itn->it_time.it_value,
&itn->it_time.it_value);
}
if (timespecisset(&it->it_time.it_value)) {
itl = it->it_vlist;
for (itn = LIST_FIRST(itl), pitn = NULL;
itn && timespeccmp(&it->it_time.it_value,
&itn->it_time.it_value, >);
pitn = itn, itn = LIST_NEXT(itn, it_list))
timespecsub(&it->it_time.it_value,
&itn->it_time.it_value,
&it->it_time.it_value);
if (pitn)
LIST_INSERT_AFTER(pitn, it, it_list);
else
LIST_INSERT_HEAD(itl, it, it_list);
for ( ; itn ; itn = LIST_NEXT(itn, it_list))
timespecsub(&itn->it_time.it_value,
&it->it_time.it_value,
&itn->it_time.it_value);
it->it_active = true;
} else {
it->it_active = false;
}
}
/* Success! */
return 0;
}
/*
* itimer_gettime:
*
* Return the remaining time of an interval timer.
*/
void
itimer_gettime(const struct itimer *it, struct itimerspec *aits)
{
struct timespec now;
struct itimer *itn;
KASSERT(itimer_lock_held());
KASSERT(!it->it_dying);
*aits = it->it_time;
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
/*
* Convert the .it_value of a real time timer from absolute
* to relative time.  If the timer's expiry time has already
* passed, return 0; otherwise return the difference between
* the current time and the time the timer is set to go off.
*/
if (timespecisset(&aits->it_value)) {
if (it->it_clockid == CLOCK_REALTIME) {
getnanotime(&now);
} else { /* CLOCK_MONOTONIC */
getnanouptime(&now);
}
if (timespeccmp(&aits->it_value, &now, <))
timespecclear(&aits->it_value);
else
timespecsub(&aits->it_value, &now,
&aits->it_value);
}
} else if (it->it_active) {
for (itn = LIST_FIRST(it->it_vlist); itn && itn != it;
itn = LIST_NEXT(itn, it_list))
timespecadd(&aits->it_value,
&itn->it_time.it_value, &aits->it_value);
KASSERT(itn != NULL); /* it should be findable on the list */
} else
timespecclear(&aits->it_value);
}
/*
* Per-process timer support.
*
* Both the BSD getitimer() family and the POSIX timer_*() family of
* routines are supported.
*
* All timers are kept in an array pointed to by p_timers, which is
* allocated on demand - many processes don't use timers at all. The
* first four elements in this array are reserved for the BSD timers:
* element 0 is ITIMER_REAL, element 1 is ITIMER_VIRTUAL, element
* 2 is ITIMER_PROF, and element 3 is ITIMER_MONOTONIC. The rest may be
* allocated by the timer_create() syscall.
*
* These timers are a "sub-class" of interval timer.
*/
/*
* ptimer_free:
*
* Free the per-process timer at the specified index.
*/
static void
ptimer_free(struct ptimers *pts, int index)
{
struct itimer *it;
struct ptimer *pt;
KASSERT(itimer_lock_held());
it = pts->pts_timers[index];
pt = container_of(it, struct ptimer, pt_itimer);
pts->pts_timers[index] = NULL;
itimer_poison(it);
/*
* Remove it from the queue to be signalled. Must be done
* after itimer is poisoned, because we may have had to wait
* for the callout to complete.
*/
if (pt->pt_queued) {
TAILQ_REMOVE(&ptimer_queue, pt, pt_chain);
pt->pt_queued = false;
}
itimer_fini(it); /* releases itimer_lock */
kmem_free(pt, sizeof(*pt));
}
/*
* ptimers_alloc:
*
* Allocate a ptimers for the specified process.
*/
static struct ptimers *
ptimers_alloc(struct proc *p)
{
struct ptimers *pts;
int i;
pts = kmem_alloc(sizeof(*pts), KM_SLEEP);
LIST_INIT(&pts->pts_virtual);
LIST_INIT(&pts->pts_prof);
for (i = 0; i < TIMER_MAX; i++)
pts->pts_timers[i] = NULL;
itimer_lock();
if (p->p_timers == NULL) {
p->p_timers = pts;
itimer_unlock();
return pts;
}
itimer_unlock();
kmem_free(pts, sizeof(*pts));
return p->p_timers;
}
/*
* ptimers_free:
*
* Clean up the per-process timers. If "which" is set to TIMERS_ALL,
* then clean up all timers and free all the data structures. If
* "which" is set to TIMERS_POSIX, only clean up the timers allocated
* by timer_create(), not the BSD setitimer() timers, and only free the
* structure if none of those remain.
*
* This function is exported because it is needed in the exec and
* exit code paths.
*/
void
ptimers_free(struct proc *p, int which)
{
struct ptimers *pts;
struct itimer *itn;
struct timespec ts;
int i;
if (p->p_timers == NULL)
return;
pts = p->p_timers;
itimer_lock();
if (which == TIMERS_ALL) {
p->p_timers = NULL;
i = 0;
} else {
timespecclear(&ts);
for (itn = LIST_FIRST(&pts->pts_virtual);
itn && itn != pts->pts_timers[ITIMER_VIRTUAL];
itn = LIST_NEXT(itn, it_list)) {
KASSERT(itn->it_clockid == CLOCK_VIRTUAL);
timespecadd(&ts, &itn->it_time.it_value, &ts);
}
LIST_FIRST(&pts->pts_virtual) = NULL;
if (itn) {
KASSERT(itn->it_clockid == CLOCK_VIRTUAL);
timespecadd(&ts, &itn->it_time.it_value,
&itn->it_time.it_value);
LIST_INSERT_HEAD(&pts->pts_virtual, itn, it_list);
}
timespecclear(&ts);
for (itn = LIST_FIRST(&pts->pts_prof);
itn && itn != pts->pts_timers[ITIMER_PROF];
itn = LIST_NEXT(itn, it_list)) {
KASSERT(itn->it_clockid == CLOCK_PROF);
timespecadd(&ts, &itn->it_time.it_value, &ts);
}
LIST_FIRST(&pts->pts_prof) = NULL;
if (itn) {
KASSERT(itn->it_clockid == CLOCK_PROF);
timespecadd(&ts, &itn->it_time.it_value,
&itn->it_time.it_value);
LIST_INSERT_HEAD(&pts->pts_prof, itn, it_list);
}
i = TIMER_MIN;
}
for ( ; i < TIMER_MAX; i++) {
if (pts->pts_timers[i] != NULL) {
/* Free the timer and release the lock. */
ptimer_free(pts, i);
/* Reacquire the lock for the next one. */
itimer_lock();
}
}
if (pts->pts_timers[0] == NULL && pts->pts_timers[1] == NULL &&
pts->pts_timers[2] == NULL && pts->pts_timers[3] == NULL) {
p->p_timers = NULL;
itimer_unlock();
kmem_free(pts, sizeof(*pts));
} else
itimer_unlock();
}
/*
* ptimer_fire:
*
* Fire a per-process timer.
*/
static void
ptimer_fire(struct itimer *it)
{
struct ptimer *pt = container_of(it, struct ptimer, pt_itimer);
KASSERT(itimer_lock_held());
/*
* XXX Can overrun, but we don't do signal queueing yet, anyway.
* XXX Relying on the clock interrupt is stupid.
*/
if (pt->pt_ev.sigev_notify != SIGEV_SIGNAL) {
return;
}
if (!pt->pt_queued) {
TAILQ_INSERT_TAIL(&ptimer_queue, pt, pt_chain);
pt->pt_queued = true;
softint_schedule(ptimer_sih);
}
}
/*
* Operations vector for per-process timers (BSD and POSIX).
*/
static const struct itimer_ops ptimer_itimer_ops = {
.ito_fire = ptimer_fire,
};
/*
* sys_timer_create:
*
* System call to create a POSIX timer.
*/
int
sys_timer_create(struct lwp *l, const struct sys_timer_create_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct sigevent *) evp;
syscallarg(timer_t *) timerid;
} */
return timer_create1(SCARG(uap, timerid), SCARG(uap, clock_id),
SCARG(uap, evp), copyin, l);
}
int
timer_create1(timer_t *tid, clockid_t id, struct sigevent *evp,
copyin_t fetch_event, struct lwp *l)
{
int error;
timer_t timerid;
struct itlist *itl;
struct ptimers *pts;
struct ptimer *pt;
struct proc *p;
p = l->l_proc;
if ((u_int)id > CLOCK_MONOTONIC)
return EINVAL;
if ((pts = p->p_timers) == NULL)
pts = ptimers_alloc(p);
pt = kmem_zalloc(sizeof(*pt), KM_SLEEP);
if (evp != NULL) {
if (((error =
(*fetch_event)(evp, &pt->pt_ev, sizeof(pt->pt_ev))) != 0) ||
((pt->pt_ev.sigev_notify < SIGEV_NONE) ||
(pt->pt_ev.sigev_notify > SIGEV_SA)) ||
(pt->pt_ev.sigev_notify == SIGEV_SIGNAL &&
(pt->pt_ev.sigev_signo <= 0 ||
pt->pt_ev.sigev_signo >= NSIG))) {
kmem_free(pt, sizeof(*pt));
return (error ? error : EINVAL);
}
}
/* Find a free timer slot, skipping those reserved for setitimer(). */
itimer_lock();
for (timerid = TIMER_MIN; timerid < TIMER_MAX; timerid++)
if (pts->pts_timers[timerid] == NULL)
break;
if (timerid == TIMER_MAX) {
itimer_unlock();
kmem_free(pt, sizeof(*pt));
return EAGAIN;
}
if (evp == NULL) {
pt->pt_ev.sigev_notify = SIGEV_SIGNAL;
switch (id) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
pt->pt_ev.sigev_signo = SIGALRM;
break;
case CLOCK_VIRTUAL:
pt->pt_ev.sigev_signo = SIGVTALRM;
break;
case CLOCK_PROF:
pt->pt_ev.sigev_signo = SIGPROF;
break;
}
pt->pt_ev.sigev_value.sival_int = timerid;
}
switch (id) {
case CLOCK_VIRTUAL:
itl = &pts->pts_virtual;
break;
case CLOCK_PROF:
itl = &pts->pts_prof;
break;
default:
itl = NULL;
}
itimer_init(&pt->pt_itimer, &ptimer_itimer_ops, id, itl);
pt->pt_proc = p;
pt->pt_poverruns = 0;
pt->pt_entry = timerid;
pt->pt_queued = false;
pts->pts_timers[timerid] = &pt->pt_itimer;
itimer_unlock();
return copyout(&timerid, tid, sizeof(timerid));
}
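/*
 * Illustrative userland sketch (not part of this file): creating and arming
 * a POSIX timer through the paths above so that SIGALRM is delivered every
 * 250 ms.  Error handling is omitted for brevity.
 *
 *	#include <signal.h>
 *	#include <string.h>
 *	#include <time.h>
 *
 *	timer_t tid;
 *	struct sigevent ev;
 *	struct itimerspec its;
 *
 *	memset(&ev, 0, sizeof(ev));
 *	ev.sigev_notify = SIGEV_SIGNAL;
 *	ev.sigev_signo = SIGALRM;
 *	timer_create(CLOCK_MONOTONIC, &ev, &tid);
 *
 *	memset(&its, 0, sizeof(its));
 *	its.it_value.tv_nsec = 250000000;
 *	its.it_interval.tv_nsec = 250000000;
 *	timer_settime(tid, 0, &its, NULL);
 */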
/*
* sys_timer_delete:
*
* System call to delete a POSIX timer.
*/
int
sys_timer_delete(struct lwp *l, const struct sys_timer_delete_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
} */
struct proc *p = l->l_proc;
timer_t timerid;
struct ptimers *pts;
struct itimer *it, *itn;
timerid = SCARG(uap, timerid);
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
if (CLOCK_VIRTUAL_P(it->it_clockid)) {
if (it->it_active) {
itn = LIST_NEXT(it, it_list);
LIST_REMOVE(it, it_list);
for ( ; itn; itn = LIST_NEXT(itn, it_list))
timespecadd(&it->it_time.it_value,
&itn->it_time.it_value,
&itn->it_time.it_value);
it->it_active = false;
}
}
/* Free the timer and release the lock. */
ptimer_free(pts, timerid);
return 0;
}
/*
* sys___timer_settime50:
*
* System call to set/arm a POSIX timer.
*/
int
sys___timer_settime50(struct lwp *l,
const struct sys___timer_settime50_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
syscallarg(int) flags;
syscallarg(const struct itimerspec *) value;
syscallarg(struct itimerspec *) ovalue;
} */
int error;
struct itimerspec value, ovalue, *ovp = NULL;
if ((error = copyin(SCARG(uap, value), &value,
sizeof(struct itimerspec))) != 0)
return error;
if (SCARG(uap, ovalue))
ovp = &ovalue;
if ((error = dotimer_settime(SCARG(uap, timerid), &value, ovp,
SCARG(uap, flags), l->l_proc)) != 0)
return error;
if (ovp)
return copyout(&ovalue, SCARG(uap, ovalue),
sizeof(struct itimerspec));
return 0;
}
int
dotimer_settime(int timerid, struct itimerspec *value,
struct itimerspec *ovalue, int flags, struct proc *p)
{
struct timespec now;
struct itimerspec val, oval;
struct ptimers *pts;
struct itimer *it;
int error;
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
val = *value;
if ((error = itimespecfix(&val.it_value)) != 0 ||
(error = itimespecfix(&val.it_interval)) != 0)
return error;
itimer_lock();
restart:
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
oval = it->it_time;
it->it_time = val;
/*
* If we've been passed a relative time for a realtime timer,
* convert it to absolute; if an absolute time for a virtual
* timer, convert it to relative and make sure we don't set it
* to zero, which would cancel the timer, or let it go
* negative, which would confuse the comparison tests.
*/
if (timespecisset(&it->it_time.it_value)) {
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
if ((flags & TIMER_ABSTIME) == 0) {
if (it->it_clockid == CLOCK_REALTIME) {
getnanotime(&now);
} else { /* CLOCK_MONOTONIC */
getnanouptime(&now);
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
}
} else {
if ((flags & TIMER_ABSTIME) != 0) {
getnanotime(&now);
timespecsub(&it->it_time.it_value, &now,
&it->it_time.it_value);
if (!timespecisset(&it->it_time.it_value) ||
it->it_time.it_value.tv_sec < 0) {
it->it_time.it_value.tv_sec = 0;
it->it_time.it_value.tv_nsec = 1;
}
}
}
}
error = itimer_settime(it);
if (error == ERESTART) {
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
goto restart;
}
KASSERT(error == 0);
itimer_unlock();
if (ovalue)
*ovalue = oval;
return 0;
}
/*
* sys___timer_gettime50:
*
* System call to return the time remaining until a POSIX timer fires.
*/
int
sys___timer_gettime50(struct lwp *l,
const struct sys___timer_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
syscallarg(struct itimerspec *) value;
} */
struct itimerspec its;
int error;
if ((error = dotimer_gettime(SCARG(uap, timerid), l->l_proc,
&its)) != 0)
return error;
return copyout(&its, SCARG(uap, value), sizeof(its));
}
int
dotimer_gettime(int timerid, struct proc *p, struct itimerspec *its)
{
struct itimer *it;
struct ptimers *pts;
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
itimer_gettime(it, its);
itimer_unlock();
return 0;
}
/*
* sys_timer_getoverrun:
*
* System call to return the number of times a POSIX timer has
* expired while a notification was already pending. The counter
* is reset when a timer expires and a notification can be posted.
*/
int
sys_timer_getoverrun(struct lwp *l, const struct sys_timer_getoverrun_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
} */
struct proc *p = l->l_proc;
struct ptimers *pts;
int timerid;
struct itimer *it;
struct ptimer *pt;
timerid = SCARG(uap, timerid);
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
pt = container_of(it, struct ptimer, pt_itimer);
*retval = pt->pt_poverruns;
if (*retval >= DELAYTIMER_MAX)
*retval = DELAYTIMER_MAX;
itimer_unlock();
return 0;
}
/*
* sys___getitimer50:
*
* System call to get the time remaining before a BSD timer fires.
*/
int
sys___getitimer50(struct lwp *l, const struct sys___getitimer50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */
struct proc *p = l->l_proc;
struct itimerval aitv;
int error;
memset(&aitv, 0, sizeof(aitv));
error = dogetitimer(p, SCARG(uap, which), &aitv);
if (error)
return error;
return copyout(&aitv, SCARG(uap, itv), sizeof(struct itimerval));
}
int
dogetitimer(struct proc *p, int which, struct itimerval *itvp)
{
struct ptimers *pts;
struct itimer *it;
struct itimerspec its;
if ((u_int)which > ITIMER_MONOTONIC)
return EINVAL;
itimer_lock();
pts = p->p_timers;
if (pts == NULL || (it = pts->pts_timers[which]) == NULL) {
timerclear(&itvp->it_value);
timerclear(&itvp->it_interval);
} else {
itimer_gettime(it, &its);
TIMESPEC_TO_TIMEVAL(&itvp->it_value, &its.it_value);
TIMESPEC_TO_TIMEVAL(&itvp->it_interval, &its.it_interval);
}
itimer_unlock();
return 0;
}
/*
* sys___setitimer50:
*
* System call to set/arm a BSD timer.
*/
int
sys___setitimer50(struct lwp *l, const struct sys___setitimer50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) which;
syscallarg(const struct itimerval *) itv;
syscallarg(struct itimerval *) oitv;
} */
struct proc *p = l->l_proc;
int which = SCARG(uap, which);
struct sys___getitimer50_args getargs;
const struct itimerval *itvp;
struct itimerval aitv;
int error;
itvp = SCARG(uap, itv);
if (itvp &&
(error = copyin(itvp, &aitv, sizeof(struct itimerval))) != 0)
return error;
if (SCARG(uap, oitv) != NULL) {
SCARG(&getargs, which) = which;
SCARG(&getargs, itv) = SCARG(uap, oitv);
if ((error = sys___getitimer50(l, &getargs, retval)) != 0)
return error;
}
if (itvp == 0)
return 0;
return dosetitimer(p, which, &aitv);
}
int
dosetitimer(struct proc *p, int which, struct itimerval *itvp)
{
struct timespec now;
struct ptimers *pts;
struct ptimer *spare;
struct itimer *it;
struct itlist *itl;
int error;
if ((u_int)which > ITIMER_MONOTONIC)
return EINVAL;
if (itimerfix(&itvp->it_value) || itimerfix(&itvp->it_interval))
return EINVAL;
/*
* Don't bother allocating data structures if the process just
* wants to clear the timer.
*/
spare = NULL;
pts = p->p_timers;
retry:
if (!timerisset(&itvp->it_value) && (pts == NULL ||
pts->pts_timers[which] == NULL))
return 0;
if (pts == NULL)
pts = ptimers_alloc(p);
itimer_lock();
restart:
it = pts->pts_timers[which];
if (it == NULL) {
struct ptimer *pt;
if (spare == NULL) {
itimer_unlock();
spare = kmem_zalloc(sizeof(*spare), KM_SLEEP);
goto retry;
}
pt = spare;
spare = NULL;
it = &pt->pt_itimer;
pt->pt_ev.sigev_notify = SIGEV_SIGNAL;
pt->pt_ev.sigev_value.sival_int = which;
switch (which) {
case ITIMER_REAL:
case ITIMER_MONOTONIC:
itl = NULL;
pt->pt_ev.sigev_signo = SIGALRM;
break;
case ITIMER_VIRTUAL:
itl = &pts->pts_virtual;
pt->pt_ev.sigev_signo = SIGVTALRM;
break;
case ITIMER_PROF:
itl = &pts->pts_prof;
pt->pt_ev.sigev_signo = SIGPROF;
break;
default:
panic("%s: can't happen %d", __func__, which);
}
itimer_init(it, &ptimer_itimer_ops, which, itl);
pt->pt_proc = p;
pt->pt_entry = which;
pts->pts_timers[which] = it;
}
TIMEVAL_TO_TIMESPEC(&itvp->it_value, &it->it_time.it_value);
TIMEVAL_TO_TIMESPEC(&itvp->it_interval, &it->it_time.it_interval);
error = 0;
if (timespecisset(&it->it_time.it_value)) {
/* Convert to absolute time */
/* XXX need to wrap in splclock for timecounters case? */
switch (which) {
case ITIMER_REAL:
getnanotime(&now);
if (!timespecaddok(&it->it_time.it_value, &now)) {
error = EINVAL;
goto out;
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
break;
case ITIMER_MONOTONIC:
getnanouptime(&now);
if (!timespecaddok(&it->it_time.it_value, &now)) {
error = EINVAL;
goto out;
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
break;
default:
break;
}
}
error = itimer_settime(it);
if (error == ERESTART) {
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
goto restart;
}
KASSERT(error == 0);
out:
itimer_unlock();
if (spare != NULL)
kmem_free(spare, sizeof(*spare));
return error;
}
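/*
 * Illustrative userland sketch (not part of this file): the classic BSD
 * interface handled by dosetitimer() above - a repeating one-second
 * ITIMER_REAL timer that delivers SIGALRM.  The signal handler is
 * user-supplied and hypothetical here.
 *
 *	#include <sys/time.h>
 *	#include <signal.h>
 *
 *	struct itimerval itv;
 *
 *	signal(SIGALRM, handler);
 *	itv.it_value.tv_sec = 1;
 *	itv.it_value.tv_usec = 0;
 *	itv.it_interval = itv.it_value;
 *	setitimer(ITIMER_REAL, &itv, NULL);
 */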
/*
* ptimer_tick:
*
* Called from hardclock() to decrement per-process virtual timers.
*/
void
ptimer_tick(lwp_t *l, bool user)
{
struct ptimers *pts;
struct itimer *it;
proc_t *p;
p = l->l_proc;
if (p->p_timers == NULL)
return;
itimer_lock();
if ((pts = l->l_proc->p_timers) != NULL) {
/*
* Run current process's virtual and profile time, as needed.
*/
if (user && (it = LIST_FIRST(&pts->pts_virtual)) != NULL)
if (itimer_decr(it, tick * 1000))
(*it->it_ops->ito_fire)(it);
if ((it = LIST_FIRST(&pts->pts_prof)) != NULL)
if (itimer_decr(it, tick * 1000))
(*it->it_ops->ito_fire)(it);
}
itimer_unlock();
}
/*
* ptimer_intr:
*
* Software interrupt handler for processing per-process
* timer expiration.
*/
static void
ptimer_intr(void *cookie)
{
ksiginfo_t ksi;
struct itimer *it;
struct ptimer *pt;
proc_t *p;
mutex_enter(&proc_lock);
itimer_lock();
while ((pt = TAILQ_FIRST(&ptimer_queue)) != NULL) {
it = &pt->pt_itimer;
TAILQ_REMOVE(&ptimer_queue, pt, pt_chain);
KASSERT(pt->pt_queued);
pt->pt_queued = false;
p = pt->pt_proc;
if (p->p_timers == NULL) {
/* Process is dying. */
continue;
}
if (pt->pt_ev.sigev_notify != SIGEV_SIGNAL) {
continue;
}
if (sigismember(&p->p_sigpend.sp_set, pt->pt_ev.sigev_signo)) {
it->it_overruns++;
continue;
}
KSI_INIT(&ksi);
ksi.ksi_signo = pt->pt_ev.sigev_signo;
ksi.ksi_code = SI_TIMER;
ksi.ksi_value = pt->pt_ev.sigev_value;
pt->pt_poverruns = it->it_overruns;
it->it_overruns = 0;
itimer_unlock();
kpsignal(p, &ksi, NULL);
itimer_lock();
}
itimer_unlock();
mutex_exit(&proc_lock);
}
/* $NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $ */
/*-
* Copyright (c) 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#define _LWP_API_PRIVATE
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/specificdata.h>
static specificdata_domain_t lwp_specificdata_domain;
void
lwpinit_specificdata(void)
{
lwp_specificdata_domain = specificdata_domain_create();
KASSERT(lwp_specificdata_domain != NULL);
}
/*
* lwp_specific_key_create --
* Create a key for subsystem lwp-specific data.
*/
int
lwp_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return (specificdata_key_create(lwp_specificdata_domain, keyp, dtor));
}
/*
* lwp_specific_key_delete --
* Delete a key for subsystem lwp-specific data.
*/
void
lwp_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(lwp_specificdata_domain, key);
}
/*
* lwp_initspecific --
* Initialize an LWP's specificdata container.
*/
void
lwp_initspecific(struct lwp *l)
{
int error __diagused;
error = specificdata_init(lwp_specificdata_domain, &l->l_specdataref);
KASSERT(error == 0);
}
/*
* lwp_finispecific --
* Finalize an LWP's specificdata container.
*/
void
lwp_finispecific(struct lwp *l)
{
specificdata_fini(lwp_specificdata_domain, &l->l_specdataref);
}
/*
* lwp_getspecific --
* Return lwp-specific data corresponding to the specified key.
*
* Note: LWP specific data is NOT INTERLOCKED. An LWP should access
* only its OWN SPECIFIC DATA. If it is necessary to access another
* LWP's specific data, care must be taken to ensure that doing so
* would not cause internal data structure inconsistency (i.e. caller
* can guarantee that the target LWP is not inside an lwp_getspecific()
* or lwp_setspecific() call).
*/
void *
lwp_getspecific(specificdata_key_t key)
{
return (specificdata_getspecific_unlocked(lwp_specificdata_domain,
&curlwp->l_specdataref, key));
}
void *
_lwp_getspecific_by_lwp(struct lwp *l, specificdata_key_t key)
{
return (specificdata_getspecific_unlocked(lwp_specificdata_domain,
&l->l_specdataref, key));
}
/*
* lwp_setspecific --
* Set lwp-specific data corresponding to the specified key.
*/
void
lwp_setspecific(specificdata_key_t key, void *data)
{
specificdata_setspecific(lwp_specificdata_domain,
&curlwp->l_specdataref, key, data);
}
void
lwp_setspecific_by_lwp(struct lwp *l, specificdata_key_t key, void *data)
{
specificdata_setspecific(lwp_specificdata_domain,
&l->l_specdataref, key, data);
}
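/*
 * Illustrative kernel sketch (not part of the original source): how a
 * subsystem would use the interface above.  The key variable and the data
 * pointer are hypothetical.
 *
 *	static specificdata_key_t example_key;
 *	void *data;
 *	int error;
 *
 *	error = lwp_specific_key_create(&example_key, NULL);
 *	...
 *	lwp_setspecific(example_key, data);	stash per-LWP data
 *	data = lwp_getspecific(example_key);	and get it back later
 *	...
 *	lwp_specific_key_delete(example_key);	when the subsystem detaches
 */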
/* $NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_pgflcache.c: page freelist cache.
*
* This implements a tiny per-CPU cache of pages that sits between the main
* page allocator and the freelists. By allocating and freeing pages in
* batch, it reduces freelist contention by an order of magnitude.
*
* The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
* uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
* world.  On systems with one CPU per physical package (e.g. a uniprocessor),
* the cache is not enabled.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $");
#include "opt_uvm.h"
#include "opt_multiprocessor.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pglist.h>
#include <uvm/uvm_pgflcache.h>
/* There is no point doing any of this on a uniprocessor. */
#ifdef MULTIPROCESSOR
/*
* MAXPGS - maximum pages per color, per bucket.
* FILLPGS - number of pages to allocate at once, per color, per bucket.
*
* Why the chosen values:
*
* (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
* colors. We make the assumption that most of the time allocation activity
* will be centered around one UVM freelist, so most of the time there will
* be no more than 224kB worth of cached pages per-CPU. That's tiny, but
* enough to hugely reduce contention on the freelist locks, and give us a
* small pool of pages which if we're very lucky may have some L1/L2 cache
* locality, and do so without subtracting too much from the L2/L3 cache
* benefits of having per-package free lists in the page allocator.
*
* (2) With the chosen values on _LP64, the data structure for each color
* takes up a single cache line (64 bytes), giving it very low overhead
* even in the "miss" case.
*
* (3) We don't want to cause too much pressure by hiding away memory that
* could otherwise be put to good use.
*/
#define MAXPGS 7
#define FILLPGS 6
/* Variable size, according to # colors. */
struct pgflcache {
struct pccolor {
intptr_t count;
struct vm_page *pages[MAXPGS];
} color[1];
};
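/*
 * Illustrative arithmetic (not part of the original source): on _LP64,
 * struct pccolor is sizeof(intptr_t) + MAXPGS * sizeof(struct vm_page *)
 * = 8 + 7 * 8 = 64 bytes, i.e. exactly one cache line per color.  With
 * 4 kB pages and 8 colors, a full cache holds at most MAXPGS * 8 * 4 kB
 * = 224 kB of pages per freelist, per CPU, matching the figures in the
 * comment above.
 */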
static kmutex_t uvm_pgflcache_lock;
static int uvm_pgflcache_sem;
/*
* uvm_pgflcache_fill: fill specified freelist/color from global list
*
* => must be called at IPL_VM
* => must be called with given bucket lock held
* => must only fill from the correct bucket for this CPU
*/
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
struct pgflbucket *pgb;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
struct vm_page *pg;
int count;
KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
KASSERT(ucpu->pgflbucket == b);
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return;
}
/* Fill only to the limit. */
pcc = &pc->color[c];
pgb = uvm.page_free[fl].pgfl_buckets[b];
head = &pgb->pgb_colors[c];
if (pcc->count >= FILLPGS) {
return;
}
/* Pull pages from the bucket until it's empty, or we are full. */
count = pcc->count;
pg = LIST_FIRST(head);
while (__predict_true(pg != NULL && count < FILLPGS)) {
KASSERT(pg->flags & PG_FREE);
KASSERT(uvm_page_get_bucket(pg) == b);
pcc->pages[count++] = pg;
pg = LIST_NEXT(pg, pageq.list);
}
/* Violate LIST abstraction to remove all pages at once. */
head->lh_first = pg;
if (__predict_true(pg != NULL)) {
pg->pageq.list.le_prev = &head->lh_first;
}
pgb->pgb_nfree -= (count - pcc->count);
CPU_COUNT(CPU_COUNT_FREEPAGES, -(count - pcc->count));
pcc->count = count;
}
/*
* uvm_pgflcache_spill: spill specified freelist/color to global list
*
* => must be called at IPL_VM
* => mark __noinline so we don't pull it into uvm_pgflcache_free()
*/
static void __noinline
uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflbucket *pgb;
struct pgfreelist *pgfl;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
kmutex_t *lock;
int b, adj;
pc = ucpu->pgflcache[fl];
pcc = &pc->color[c];
pgfl = &uvm.page_free[fl];
b = ucpu->pgflbucket;
pgb = pgfl->pgfl_buckets[b];
head = &pgb->pgb_colors[c];
lock = &uvm_freelist_locks[b].lock;
mutex_spin_enter(lock);
for (adj = pcc->count; pcc->count != 0;) {
pcc->count--;
KASSERT(pcc->pages[pcc->count] != NULL);
KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
}
pgb->pgb_nfree += adj;
CPU_COUNT(CPU_COUNT_FREEPAGES, adj);
mutex_spin_exit(lock);
}
/*
* uvm_pgflcache_alloc: try to allocate a cached page.
*
* => must be called at IPL_VM
* => allocate only from the given freelist and given page color
*/
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflcache *pc;
struct pccolor *pcc;
struct vm_page *pg;
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return NULL;
}
/* Very simple: if we have a page then return it. */
pcc = &pc->color[c];
if (__predict_false(pcc->count == 0)) {
return NULL;
}
pg = pcc->pages[--(pcc->count)];
KASSERT(pg != NULL);
KASSERT(pg->flags == PG_FREE);
KASSERT(uvm_page_get_freelist(pg) == fl);
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
return pg;
}
/*
* uvm_pgflcache_free: cache a page, if possible.
*
* => must be called at IPL_VM
* => must only send pages for the correct bucket for this CPU
*/
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
struct pgflcache *pc;
struct pccolor *pcc;
int fl, c;
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
/* If caching is off, then bail out. */
fl = uvm_page_get_freelist(pg);
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return false;
}
/* If the array is full spill it first, then add page to array. */
c = VM_PGCOLOR(pg);
pcc = &pc->color[c];
KASSERT((pg->flags & PG_FREE) == 0);
if (__predict_false(pcc->count == MAXPGS)) {
uvm_pgflcache_spill(ucpu, fl, c);
}
pg->flags = PG_FREE;
pcc->pages[pcc->count] = pg;
pcc->count++;
return true;
}
/*
* uvm_pgflcache_init: allocate and initialize per-CPU data structures for
* the free page cache. Don't set anything in motion - that's taken care
* of by uvm_pgflcache_resume().
*/
static void
uvm_pgflcache_init_cpu(struct cpu_info *ci)
{
struct uvm_cpu *ucpu;
size_t sz;
ucpu = ci->ci_data.cpu_uvm;
KASSERT(ucpu->pgflcachemem == NULL);
KASSERT(ucpu->pgflcache[0] == NULL);
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
ucpu->pgflcachememsz =
(roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
}
/*
* uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
* and shut down caching on the CPU. Called on each CPU in the system via
* xcall.
*/
static void
uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
{
struct uvm_cpu *ucpu;
int fl, color, s;
ucpu = curcpu()->ci_data.cpu_uvm;
for (fl = 0; fl < VM_NFREELIST; fl++) {
s = splvm();
for (color = 0; color < uvmexp.ncolors; color++) {
uvm_pgflcache_spill(ucpu, fl, color);
}
ucpu->pgflcache[fl] = NULL;
splx(s);
}
}
/*
* uvm_pgflcache_pause: pause operation of the caches
*/
void
uvm_pgflcache_pause(void)
{
uint64_t where;
/* First one in starts draining. Everyone else waits. */
mutex_enter(&uvm_pgflcache_lock);
if (uvm_pgflcache_sem++ == 0) {
where = xc_broadcast(XC_HIGHPRI, uvm_pgflcache_fini_cpu,
(void *)1, NULL);
xc_wait(where);
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_resume: resume operation of the caches
*/
void
uvm_pgflcache_resume(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct uvm_cpu *ucpu;
uintptr_t addr;
size_t sz;
int fl;
/* Last guy out takes care of business. */
mutex_enter(&uvm_pgflcache_lock);
KASSERT(uvm_pgflcache_sem > 0);
if (uvm_pgflcache_sem-- > 1) {
mutex_exit(&uvm_pgflcache_lock);
return;
}
/*
* Make sure dependent data structure updates are remotely visible.
* Essentially this functions as a global memory barrier.
*/
xc_barrier(XC_HIGHPRI);
/*
* Then set all of the pointers in place on each CPU. As soon as
* each pointer is set, caching is operational in that dimension.
*/
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
for (CPU_INFO_FOREACH(cii, ci)) {
ucpu = ci->ci_data.cpu_uvm;
addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
for (fl = 0; fl < VM_NFREELIST; fl++) {
ucpu->pgflcache[fl] = (struct pgflcache *)addr;
addr += sz;
}
}
mutex_exit(&uvm_pgflcache_lock);
}
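/*
 * Illustrative sketch (not part of the original source): callers that need
 * a stable view of the freelists (e.g. uvm_pglistalloc(), per the comment
 * at the top of this file) bracket their work with pause/resume:
 *
 *	uvm_pgflcache_pause();
 *	...scan or rearrange the global freelists...
 *	uvm_pgflcache_resume();
 */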
/*
* uvm_pgflcache_start: start operation of the cache.
*
* => called once only, when init(8) is about to be started
*/
void
uvm_pgflcache_start(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(uvm_pgflcache_sem > 0);
/*
* There's not much point doing this if every CPU has its own
* bucket (and that includes the uniprocessor case).
*/
if (ncpu == uvm.bucketcount) {
return;
}
/* Create data structures for each CPU. */
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_pgflcache_init_cpu(ci);
}
/* Kick it into action. */
uvm_pgflcache_resume();
}
/*
* uvm_pgflcache_init: set up data structures for the free page cache.
*/
void
uvm_pgflcache_init(void)
{
uvm_pgflcache_sem = 1;
mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
}
#else /* MULTIPROCESSOR */
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
return NULL;
}
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
return false;
}
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
}
void
uvm_pgflcache_pause(void)
{
}
void
uvm_pgflcache_resume(void)
{
}
void
uvm_pgflcache_start(void)
{
}
void
uvm_pgflcache_init(void)
{
}
#endif /* MULTIPROCESSOR */
/* $NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $ */
/*
* Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
* from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $");
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"
#include "opt_vmswap.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/cprng.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>
#include <uvm/uvm.h>
#include <miscfs/specfs/specdev.h>
#include <crypto/aes/aes.h>
#include <crypto/aes/aes_cbc.h>
/*
* uvm_swap.c: manage configuration and i/o to swap space.
*/
/*
* swap space is managed in the following way:
*
* each swap partition or file is described by a "swapdev" structure.
* each "swapdev" structure contains a "swapent" structure which contains
* information that is passed up to the user (via system calls).
*
* each swap partition is assigned a "priority" (int) which controls
* swap partition usage.
*
* the system maintains a global data structure describing all swap
* partitions/files. there is a sorted LIST of "swappri" structures
* which describe "swapdev"'s at that priority. this LIST is headed
* by the "swap_priority" global var. each "swappri" contains a
* TAILQ of "swapdev" structures at that priority.
*
* locking:
* - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
* system call and prevents the swap priority list from changing
* while we are in the middle of a system call (e.g. SWAP_STATS).
* - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
* structures including the priority list, the swapdev structures,
* and the swapmap arena.
*
* each swap device has the following info:
* - swap device in use (could be disabled, preventing future use)
* - swap enabled (allows new allocations on swap)
* - map info in /dev/drum
* - vnode pointer
* for swap files only:
* - block size
* - max byte count in buffer
* - buffer
*
* userland controls and configures swap with the swapctl(2) system call.
* the sys_swapctl performs the following operations:
* [1] SWAP_NSWAP: returns the number of swap devices currently configured
* [2] SWAP_STATS: given a pointer to an array of swapent structures
* (passed in via "arg") of a size passed in via "misc" ... we load
* the current swap config into the array. The actual work is done
* in the uvm_swap_stats() function.
* [3] SWAP_ON: given a pathname in arg (could be device or file) and a
* priority in "misc", start swapping on it.
* [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
* [5] SWAP_CTL: changes the priority of a swap device (new priority in
* "misc")
*/
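/*
* illustrative sketch (comment only, not compiled here): how a userland
* program might drive the operations listed above through swapctl(2).
* the device path and priorities below are made up for the example;
* swapctl(8) is the real consumer and adds option parsing and error
* reporting.
*
*	#include <unistd.h>
*	#include <sys/swap.h>
*
*	int n = swapctl(SWAP_NSWAP, NULL, 0);	[number of devices]
*	swapctl(SWAP_ON, "/dev/wd0b", 0);	[start swapping, priority 0]
*	swapctl(SWAP_CTL, "/dev/wd0b", 5);	[change priority to 5]
*	swapctl(SWAP_OFF, "/dev/wd0b", 0);	[stop swapping to it]
*/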
/*
* swapdev: describes a single swap partition/file
*
* note the following should be true:
* swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
* swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
*/
struct swapdev {
dev_t swd_dev; /* device id */
int swd_flags; /* flags:inuse/enable/fake */
int swd_priority; /* our priority */
int swd_nblks; /* blocks in this device */
char *swd_path; /* saved pathname of device */
int swd_pathlen; /* length of pathname */
int swd_npages; /* #pages we can use */
int swd_npginuse; /* #pages in use */
int swd_npgbad; /* #pages bad */
int swd_drumoffset; /* page0 offset in drum */
int swd_drumsize; /* #pages in drum */
blist_t swd_blist; /* blist for this swapdev */
struct vnode *swd_vp; /* backing vnode */
TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */
int swd_bsize; /* blocksize (bytes) */
int swd_maxactive; /* max active i/o reqs */
struct bufq_state *swd_tab; /* buffer list */
int swd_active; /* number of active buffers */
volatile uint32_t *swd_encmap; /* bitmap of encrypted slots */
struct aesenc swd_enckey; /* AES key expanded for enc */
struct aesdec swd_deckey; /* AES key expanded for dec */
bool swd_encinit; /* true if keys initialized */
};
/*
* swap device priority entry; the list is kept sorted on `spi_priority'.
*/
struct swappri {
int spi_priority; /* priority */
TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
/* tailq of swapdevs at this priority */
LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
};
/*
* The following two structures are used to keep track of data transfers
* on swap devices associated with regular files.
* NOTE: this code is more or less a copy of vnd.c; we use the same
* structure names here to ease porting.
*/
struct vndxfer {
struct buf *vx_bp; /* Pointer to parent buffer */
struct swapdev *vx_sdp;
int vx_error;
int vx_pending; /* # of pending aux buffers */
int vx_flags;
#define VX_BUSY 1
#define VX_DEAD 2
};
struct vndbuf {
struct buf vb_buf;
struct vndxfer *vb_xfer;
};
/*
* We keep a pool of vndbuf's and vndxfer structures.
*/
static struct pool vndxfer_pool, vndbuf_pool;
/*
* local variables
*/
static vmem_t *swapmap; /* controls the mapping of /dev/drum */
/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;
/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;
bool uvm_swap_init_done = false;
/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;
/* tuneables */
u_int uvm_swapisfull_factor = 99;
#if VMSWAP_DEFAULT_PLAINTEXT
bool uvm_swap_encrypt = false;
#else
bool uvm_swap_encrypt = true;
#endif
/*
* prototypes
*/
static struct swapdev *swapdrum_getsdp(int);
static struct swapdev *swaplist_find(struct vnode *, bool);
static void swaplist_insert(struct swapdev *,
struct swappri *, int);
static void swaplist_trim(void);
static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);
static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);
static int uvm_swap_io(struct vm_page **, int, int, int);
static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);
static size_t
encmap_size(size_t npages)
{
struct swapdev *sdp;
const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
const size_t bitsperword = NBBY * bytesperword;
const size_t nbits = npages; /* one bit for each page */
const size_t nwords = howmany(nbits, bitsperword);
const size_t nbytes = nwords * bytesperword;
return nbytes;
}
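/*
* worked example (illustrative): with NBBY == 8 and 32-bit map words,
* bitsperword is 32, so each word covers 32 pages. for npages == 100
* the function computes nwords = howmany(100, 32) = 4 and returns
* 4 * 4 = 16 bytes of bitmap.
*/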
/*
* uvm_swap_init: init the swap system data structures and locks
*
* => called at boot time from init_main.c after the filesystems
* are brought up (which happens after uvm_init())
*/
void
uvm_swap_init(void)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLED(pdhist);
/*
* first, init the swap list, its counter, and its lock.
* then get a handle on the vnode for /dev/drum by using
* its dev_t number ("swapdev", from MD conf.c).
*/
LIST_INIT(&swap_priority);
uvmexp.nswapdev = 0;
rw_init(&swap_syscall_lock);
mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
if (bdevvp(swapdev, &swapdev_vp))
panic("%s: can't get vnode for swap device", __func__);
if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
panic("%s: can't lock swap device", __func__);
if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
panic("%s: can't open swap device", __func__);
VOP_UNLOCK(swapdev_vp);
/*
* create swap block resource map to map /dev/drum. the range
* from 1 to INT_MAX allows 2 gigablocks of swap space. note
* that block 0 is reserved (used to indicate an allocation
* failure, or no allocation).
*/
swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
VM_NOSLEEP, IPL_NONE);
if (swapmap == 0) {
panic("%s: vmem_create failed", __func__);
}
pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
NULL, IPL_BIO);
pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
NULL, IPL_BIO);
uvm_swap_init_done = true;
UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}
/*
* swaplist functions: functions that operate on the list of swap
* devices on the system.
*/
/*
* swaplist_insert: insert swap device "sdp" into the global list
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
* => caller must provide a newly allocated swappri structure (we will
* FREE it if we don't need it... this is to prevent allocation
* blocking here while adding swap)
*/
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
struct swappri *spp, *pspp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
/*
* find entry at or after which to insert the new device.
*/
pspp = NULL;
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
if (priority <= spp->spi_priority)
break;
pspp = spp;
}
/*
* new priority?
*/
if (spp == NULL || spp->spi_priority != priority) {
spp = newspp; /* use newspp! */
UVMHIST_LOG(pdhist, "created new swappri = %jd",
priority, 0, 0, 0);
spp->spi_priority = priority;
TAILQ_INIT(&spp->spi_swapdev);
if (pspp)
LIST_INSERT_AFTER(pspp, spp, spi_swappri);
else
LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
} else {
/* we don't need a new priority structure, free it */
kmem_free(newspp, sizeof(*newspp));
}
/*
* priority found (or created). now insert on the priority's
* tailq list and bump the total number of swapdevs.
*/
sdp->swd_priority = priority;
TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev++;
}
/*
* swaplist_find: find and optionally remove a swap device from the
* global list.
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
* => we return the swapdev we found (and removed)
*/
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
struct swapdev *sdp;
struct swappri *spp;
KASSERT(rw_lock_held(&swap_syscall_lock));
KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
KASSERT(mutex_owned(&uvm_swap_data_lock));
/*
* search the lists for the requested vp
*/
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_vp == vp) {
if (remove) {
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev--;
}
return(sdp);
}
}
}
return (NULL);
}
/*
* swaplist_trim: scan priority list for empty priority entries and kill
* them.
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
*/
static void
swaplist_trim(void)
{
struct swappri *spp, *nextspp;
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
if (!TAILQ_EMPTY(&spp->spi_swapdev))
continue;
LIST_REMOVE(spp, spi_swappri);
kmem_free(spp, sizeof(*spp));
}
}
/*
* swapdrum_getsdp: given a page offset in /dev/drum, convert it back
* to the "swapdev" that maps that section of the drum.
*
* => each swapdev takes one big contig chunk of the drum
* => caller must hold uvm_swap_data_lock
*/
static struct swapdev *
swapdrum_getsdp(int pgno)
{
struct swapdev *sdp;
struct swappri *spp;
KASSERT(mutex_owned(&uvm_swap_data_lock));
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_flags & SWF_FAKE)
continue;
if (pgno >= sdp->swd_drumoffset &&
pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
return sdp;
}
}
}
return NULL;
}
/*
* swapdrum_sdp_is: true iff the swap device for pgno is sdp
*
* => for use in positive assertions only; result is not stable
*/
static bool __debugused
swapdrum_sdp_is(int pgno, struct swapdev *sdp)
{
bool result;
mutex_enter(&uvm_swap_data_lock);
result = swapdrum_getsdp(pgno) == sdp;
mutex_exit(&uvm_swap_data_lock);
return result;
}
void
swapsys_lock(krw_t op)
{
rw_enter(&swap_syscall_lock, op);
}
void
swapsys_unlock(void)
{
rw_exit(&swap_syscall_lock);
}
static void
swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
{
se->se_dev = sdp->swd_dev;
se->se_flags = sdp->swd_flags;
se->se_nblks = sdp->swd_nblks;
se->se_inuse = inuse;
se->se_priority = sdp->swd_priority;
KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
strcpy(se->se_path, sdp->swd_path);
}
int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
(void *)enosys;
int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
(void *)enosys;
/*
* sys_swapctl: main entry point for swapctl(2) system call
* [with two helper functions: swap_on and swap_off]
*/
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
/* {
syscallarg(int) cmd;
syscallarg(void *) arg;
syscallarg(int) misc;
} */
struct vnode *vp;
struct nameidata nd;
struct swappri *spp;
struct swapdev *sdp;
#define SWAP_PATH_MAX (PATH_MAX + 1)
char *userpath;
size_t len = 0;
int error;
int priority;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* we handle the non-priv NSWAP and STATS request first.
*
* SWAP_NSWAP: return number of config'd swap devices
* [can also be obtained with uvmexp sysctl]
*/
if (SCARG(uap, cmd) == SWAP_NSWAP) {
const int nswapdev = uvmexp.nswapdev;
UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
0, 0, 0);
*retval = nswapdev;
return 0;
}
userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
/*
* ensure serialized syscall access by grabbing the swap_syscall_lock
*/
rw_enter(&swap_syscall_lock, RW_WRITER);
/*
* SWAP_STATS: get stats on current # of configured swap devs
*
* note that the swap_priority list can't change as long
* as we are holding the swap_syscall_lock. we don't want
* to grab the uvm_swap_data_lock because we may fault&sleep during
* copyout() and we don't want to be holding that lock then!
*/
switch (SCARG(uap, cmd)) {
case SWAP_STATS13:
error = (*uvm_swap_stats13)(uap, retval);
goto out;
case SWAP_STATS50:
error = (*uvm_swap_stats50)(uap, retval);
goto out;
case SWAP_STATS:
error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
NULL, sizeof(struct swapent), retval);
UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
goto out;
case SWAP_GETDUMPDEV:
error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
goto out;
default:
break;
}
/*
* all other requests require superuser privs. verify.
*/
if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
0, NULL, NULL, NULL)))
goto out;
if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
/* drop the current dump device */
dumpdev = NODEV;
dumpcdev = NODEV;
cpu_dumpconf();
goto out;
}
/*
* at this point we expect a path name in arg. we will
* use namei() to gain a vnode reference (vref), and lock
* the vnode (VOP_LOCK).
*
* XXX: a NULL arg means use the root vnode pointer (e.g. for
* miniroot)
*/
if (SCARG(uap, arg) == NULL) {
vp = rootvp; /* miniroot */
vref(vp);
if (vn_lock(vp, LK_EXCLUSIVE)) {
vrele(vp);
error = EBUSY;
goto out;
}
if (SCARG(uap, cmd) == SWAP_ON &&
copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
panic("swapctl: miniroot copy failed");
} else {
struct pathbuf *pb;
/*
* This used to allow copying in one extra byte
* (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
* This was completely pointless because if anyone
* used that extra byte namei would fail with
* ENAMETOOLONG anyway, so I've removed the excess
* logic. - dholland 20100215
*/
error = pathbuf_copyin(SCARG(uap, arg), &pb);
if (error) {
goto out;
}
if (SCARG(uap, cmd) == SWAP_ON) {
/* get a copy of the string */
pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
len = strlen(userpath) + 1;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
if ((error = namei(&nd))) {
pathbuf_destroy(pb);
goto out;
}
vp = nd.ni_vp;
pathbuf_destroy(pb);
}
/* note: "vp" is referenced and locked */
error = 0; /* assume no error */
switch(SCARG(uap, cmd)) {
case SWAP_DUMPDEV:
if (vp->v_type != VBLK) {
error = ENOTBLK;
break;
}
if (bdevsw_lookup(vp->v_rdev)) {
dumpdev = vp->v_rdev;
dumpcdev = devsw_blk2chr(dumpdev);
} else
dumpdev = NODEV;
cpu_dumpconf();
break;
case SWAP_CTL:
/*
* get new priority, remove old entry (if any) and then
* reinsert it in the correct place. finally, prune out
* any empty priority structures.
*/
priority = SCARG(uap, misc);
spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
mutex_enter(&uvm_swap_data_lock);
if ((sdp = swaplist_find(vp, true)) == NULL) {
error = ENOENT;
} else {
swaplist_insert(sdp, spp, priority);
swaplist_trim();
}
mutex_exit(&uvm_swap_data_lock);
if (error)
kmem_free(spp, sizeof(*spp));
break;
case SWAP_ON:
/*
* check for duplicates. if none found, then insert a
* dummy entry on the list to prevent someone else from
* trying to enable this device while we are working on
* it.
*/
priority = SCARG(uap, misc);
sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
sdp->swd_flags = SWF_FAKE;
sdp->swd_vp = vp;
sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
mutex_enter(&uvm_swap_data_lock);
if (swaplist_find(vp, false) != NULL) {
error = EBUSY;
mutex_exit(&uvm_swap_data_lock);
bufq_free(sdp->swd_tab);
kmem_free(sdp, sizeof(*sdp));
kmem_free(spp, sizeof(*spp));
break;
}
swaplist_insert(sdp, spp, priority);
mutex_exit(&uvm_swap_data_lock);
KASSERT(len > 0);
sdp->swd_pathlen = len;
sdp->swd_path = kmem_alloc(len, KM_SLEEP);
if (copystr(userpath, sdp->swd_path, len, 0) != 0)
panic("swapctl: copystr");
/*
* we've now got a FAKE placeholder in the swap list.
* now attempt to enable swap on it. if we fail, undo
* what we've done and kill the fake entry we just inserted.
* if swap_on is a success, it will clear the SWF_FAKE flag
*/
if ((error = swap_on(l, sdp)) != 0) {
mutex_enter(&uvm_swap_data_lock);
(void) swaplist_find(vp, true); /* kill fake entry */
swaplist_trim();
mutex_exit(&uvm_swap_data_lock);
bufq_free(sdp->swd_tab);
kmem_free(sdp->swd_path, sdp->swd_pathlen);
kmem_free(sdp, sizeof(*sdp));
break;
}
break;
case SWAP_OFF:
mutex_enter(&uvm_swap_data_lock);
if ((sdp = swaplist_find(vp, false)) == NULL) {
mutex_exit(&uvm_swap_data_lock);
error = ENXIO;
break;
}
/*
* If a device isn't in use or enabled, we
* can't stop swapping from it (again).
*/
if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
mutex_exit(&uvm_swap_data_lock);
error = EBUSY;
break;
}
/*
* do the real work.
*/
error = swap_off(l, sdp);
break;
default:
error = EINVAL;
}
/*
* done! release the ref gained by namei() and unlock.
*/
vput(vp);
out:
rw_exit(&swap_syscall_lock);
kmem_free(userpath, SWAP_PATH_MAX);
UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0);
return (error);
}
/*
* uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
* away from sys_swapctl() in order to allow COMPAT_* swapctl()
* emulation to use it directly without going through sys_swapctl().
* The problem with using sys_swapctl() there is that it involves
* copying the swapent array to the stackgap, and this array's size
* is not known at build time. Hence it would not be possible to
* ensure it would fit in the stackgap in any case.
*/
int
uvm_swap_stats(char *ptr, int misc,
void (*f)(void *, const struct swapent *), size_t len,
register_t *retval)
{
struct swappri *spp;
struct swapdev *sdp;
struct swapent sep;
int count = 0;
int error;
KASSERT(len <= sizeof(sep));
if (len == 0)
return ENOSYS;
if (misc < 0)
return EINVAL;
if (misc == 0 || uvmexp.nswapdev == 0)
return 0;
/* Make sure userland cannot exhaust kernel memory */
if ((size_t)misc > (size_t)uvmexp.nswapdev)
misc = uvmexp.nswapdev;
KASSERT(rw_lock_held(&swap_syscall_lock));
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
int inuse;
if (misc-- <= 0)
break;
inuse = btodb((uint64_t)sdp->swd_npginuse <<
PAGE_SHIFT);
memset(&sep, 0, sizeof(sep));
swapent_cvt(&sep, sdp, inuse);
if (f)
(*f)(&sep, &sep);
if ((error = copyout(&sep, ptr, len)) != 0)
return error;
ptr += len;
count++;
}
}
*retval = count;
return 0;
}
/*
* swap_on: attempt to enable a swapdev for swapping. note that the
* swapdev is already on the global list, but disabled (marked
* SWF_FAKE).
*
* => we avoid the start of the disk (to protect disk labels)
* => we also avoid the miniroot, if we are swapping to root.
* => caller should leave uvm_swap_data_lock unlocked, we may lock it
* if needed.
*/
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
struct vnode *vp;
int error, npages, nblocks, size;
long addr;
vmem_addr_t result;
struct vattr va;
dev_t dev;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* we want to enable swapping on sdp. the swd_vp contains
* the vnode we want (locked and ref'd), and the swd_dev
* contains the dev_t of the file, if it is a block device.
*/
vp = sdp->swd_vp;
dev = sdp->swd_dev;
/*
* open the swap file (mostly useful for block device files to
* let device driver know what is up).
*
* we skip the open/close for root on swap because the root
* has already been opened when root was mounted (mountroot).
*/
if (vp != rootvp) {
if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
return (error);
}
/* XXX this only works for block devices */
UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);
/*
* we now need to determine the size of the swap area. for
* block specials we can call the d_psize function.
* for normal files, we must stat [get attrs].
*
* we put the result in nblks.
* for normal files, we also want the filesystem block size
* (which we get with statfs).
*/
switch (vp->v_type) {
case VBLK:
if ((nblocks = bdev_size(dev)) == -1) {
error = ENXIO;
goto bad;
}
break;
case VREG:
if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
goto bad;
nblocks = (int)btodb(va.va_size);
sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
/*
* limit the max # of outstanding I/O requests we issue
* at any one time. take it easy on NFS servers.
*/
if (vp->v_tag == VT_NFS)
sdp->swd_maxactive = 2; /* XXX */
else
sdp->swd_maxactive = 8; /* XXX */
break;
default:
error = ENXIO;
goto bad;
}
/*
* save nblocks in a safe place and convert to pages.
*/
sdp->swd_nblks = nblocks;
npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
/*
* for block special files, we want to make sure that we leave
* the disklabel and bootblocks alone, so we arrange to skip
* over them (arbitrarily choosing to skip PAGE_SIZE bytes).
* note that because of this the "size" can be less than the
* actual number of blocks on the device.
*/
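/*
* illustrative numbers, assuming 4 KiB pages: a 512 MiB block device
* gives npages = 131072; page 0 is left alone for the label/bootblocks,
* so addr = 1 and size = 131071 usable pages. a 512 MiB regular file
* would use all 131072 pages starting at addr = 0.
*/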
if (vp->v_type == VBLK) {
/* we use pages 1 to (size - 1) [inclusive] */
size = npages - 1;
addr = 1;
} else {
/* we use pages 0 to (size - 1) [inclusive] */
size = npages;
addr = 0;
}
/*
* make sure we have enough blocks for a reasonably sized swap
* area. we want at least one page.
*/
if (size < 1) {
UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
error = EINVAL;
goto bad;
}
UVMHIST_LOG(pdhist, " dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);
/*
* now we need to allocate an extent to manage this swap device
*/
sdp->swd_blist = blist_create(npages);
/* mark all except the `saved' region free. */
blist_free(sdp->swd_blist, addr, size);
/*
* allocate space for swap encryption state and mark the
* keys uninitialized so we generate them lazily
*/
sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
sdp->swd_encinit = false;
/*
* if the vnode we are swapping to is the root vnode
* (i.e. we are swapping to the miniroot) then we want
* to make sure we don't overwrite it. do a statfs to
* find its size and skip over it.
*/
if (vp == rootvp) {
struct mount *mp;
struct statvfs *sp;
int rootblocks, rootpages;
mp = rootvnode->v_mount;
sp = &mp->mnt_stat;
rootblocks = sp->f_blocks * btodb(sp->f_frsize);
/*
* XXX: sp->f_blocks isn't the total number of
* blocks in the filesystem, it's the number of
* data blocks. so, our rootblocks almost
* definitely underestimates the total size
* of the filesystem - how badly depends on the
* details of the filesystem type. there isn't
* an obvious way to deal with this cleanly
* and perfectly, so for now we just pad our
* rootblocks estimate with an extra 5 percent.
*/
rootblocks += (rootblocks >> 5) +
(rootblocks >> 6) +
(rootblocks >> 7);
rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
if (rootpages > size)
panic("swap_on: miniroot larger than swap?");
if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
panic("swap_on: unable to preserve miniroot");
}
size -= rootpages;
printf("Preserved %d pages of miniroot ", rootpages);
printf("leaving %d pages of swap\n", size);
}
/*
* add a ref to vp to reflect usage as a swap device.
*/
vref(vp);
/*
* now add the new swapdev to the drum and enable.
*/
error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
if (error != 0)
panic("swapdrum_add");
/*
* If this is the first regular swap create the workqueue.
* => Protected by swap_syscall_lock.
*/
if (vp->v_type != VBLK) {
if (sw_reg_count++ == 0) {
KASSERT(sw_reg_workqueue == NULL);
if (workqueue_create(&sw_reg_workqueue, "swapiod",
sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
panic("%s: workqueue_create failed", __func__);
}
}
sdp->swd_drumoffset = (int)result;
sdp->swd_drumsize = npages;
sdp->swd_npages = size;
mutex_enter(&uvm_swap_data_lock);
sdp->swd_flags &= ~SWF_FAKE; /* going live */
sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
uvmexp.swpages += size;
uvmexp.swpgavail += size;
mutex_exit(&uvm_swap_data_lock);
return (0);
/*
* failure: clean up and return error.
*/
bad:
if (sdp->swd_blist) {
blist_destroy(sdp->swd_blist);
}
if (vp != rootvp) {
(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
}
return (error);
}
/*
* swap_off: stop swapping on swapdev
*
* => swap data should be locked, we will unlock.
*/
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
int npages = sdp->swd_npages;
int error = 0;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
/* disable the swap area being removed */
sdp->swd_flags &= ~SWF_ENABLE;
uvmexp.swpgavail -= npages;
mutex_exit(&uvm_swap_data_lock);
/*
* the idea is to find all the pages that are paged out to this
* device, and page them all in. in uvm, swap-backed pageable
* memory can take two forms: aobjs and anons. call the
* swapoff hook for each subsystem to bring in pages.
*/
if (uao_swap_off(sdp->swd_drumoffset,
sdp->swd_drumoffset + sdp->swd_drumsize) ||
amap_swap_off(sdp->swd_drumoffset,
sdp->swd_drumoffset + sdp->swd_drumsize)) {
error = ENOMEM;
} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
error = EBUSY;
}
if (error) {
mutex_enter(&uvm_swap_data_lock);
sdp->swd_flags |= SWF_ENABLE;
uvmexp.swpgavail += npages;
mutex_exit(&uvm_swap_data_lock);
return error;
}
/*
* If this is the last regular swap destroy the workqueue.
* => Protected by swap_syscall_lock.
*/
if (sdp->swd_vp->v_type != VBLK) {
KASSERT(sw_reg_count > 0);
KASSERT(sw_reg_workqueue != NULL);
if (--sw_reg_count == 0) {
workqueue_destroy(sw_reg_workqueue);
sw_reg_workqueue = NULL;
}
}
/*
* done with the vnode.
* drop our ref on the vnode before calling VOP_CLOSE()
* so that spec_close() can tell if this is the last close.
*/
vrele(sdp->swd_vp);
if (sdp->swd_vp != rootvp) {
(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
}
mutex_enter(&uvm_swap_data_lock);
uvmexp.swpages -= npages;
uvmexp.swpginuse -= sdp->swd_npgbad;
if (swaplist_find(sdp->swd_vp, true) == NULL)
panic("%s: swapdev not in list", __func__);
swaplist_trim();
mutex_exit(&uvm_swap_data_lock);
/*
* free all resources!
*/
vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
blist_destroy(sdp->swd_blist);
bufq_free(sdp->swd_tab);
kmem_free(__UNVOLATILE(sdp->swd_encmap),
encmap_size(sdp->swd_drumsize));
explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
kmem_free(sdp, sizeof(*sdp));
return (0);
}
void
uvm_swap_shutdown(struct lwp *l)
{
struct swapdev *sdp;
struct swappri *spp;
struct vnode *vp;
int error;
if (!uvm_swap_init_done || uvmexp.nswapdev == 0)
return;
printf("turning off swap...");
rw_enter(&swap_syscall_lock, RW_WRITER);
mutex_enter(&uvm_swap_data_lock);
again:
LIST_FOREACH(spp, &swap_priority, spi_swappri)
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_flags & SWF_FAKE)
continue;
if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
continue;
#ifdef DEBUG
printf("\nturning off swap on %s...", sdp->swd_path);
#endif
/* Have to lock and reference vnode for swap_off(). */
vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY);
vref(vp);
error = swap_off(l, sdp);
vput(vp);
mutex_enter(&uvm_swap_data_lock);
if (error) {
printf("stopping swap on %s failed "
"with error %d\n", sdp->swd_path, error);
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev--;
swaplist_trim();
}
goto again;
}
printf(" done\n");
mutex_exit(&uvm_swap_data_lock);
rw_exit(&swap_syscall_lock);
}
/*
* /dev/drum interface and i/o functions
*/
/*
* swopen: allow the initial open from uvm_swap_init() and reject all others.
*/
static int
swopen(dev_t dev, int flag, int mode, struct lwp *l)
{
static bool inited = false;
if (!inited) {
inited = true;
return 0;
}
return ENODEV;
}
/*
* swstrategy: perform I/O on the drum
*
* => we must map the i/o request from the drum to the correct swapdev.
*/
static void
swstrategy(struct buf *bp)
{
struct swapdev *sdp;
struct vnode *vp;
int pageno, bn;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* convert block number to swapdev. note that swapdev can't
* be yanked out from under us because we are holding resources
* in it (i.e. the blocks we are doing I/O on).
*/
pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(pageno);
mutex_exit(&uvm_swap_data_lock);
if (sdp == NULL) {
bp->b_error = EINVAL;
bp->b_resid = bp->b_bcount;
biodone(bp);
UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
return;
}
/*
* convert drum page number to block number on this swapdev.
*/
pageno -= sdp->swd_drumoffset; /* page # on swapdev */
bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
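/*
* worked example (illustrative, assuming 512-byte disk blocks and
* 4 KiB pages): a request at drum b_blkno 2048 is drum page
* dbtob(2048) >> PAGE_SHIFT = 256; if this swapdev has swd_drumoffset
* 100, pageno becomes 156 and bn = btodb(156 << PAGE_SHIFT) = 1248 on
* the swapdev.
*/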
UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd",
((bp->b_flags & B_READ) == 0) ? 1 : 0,
sdp->swd_drumoffset, bn, bp->b_bcount);
/*
* for block devices we finish up here.
* for regular files we have to do more work which we delegate
* to sw_reg_strategy().
*/
vp = sdp->swd_vp; /* swapdev vnode pointer */
switch (vp->v_type) {
default:
panic("%s: vnode type 0x%x", __func__, vp->v_type);
case VBLK:
/*
* must convert "bp" from an I/O on /dev/drum to an I/O
* on the swapdev (sdp).
*/
bp->b_blkno = bn; /* swapdev block number */
bp->b_dev = sdp->swd_dev; /* swapdev dev_t */
/*
* if we are doing a write, we have to redirect the i/o on
* drum's v_numoutput counter to the swapdevs.
*/
if ((bp->b_flags & B_READ) == 0) {
mutex_enter(bp->b_objlock);
vwakeup(bp); /* kills one 'v_numoutput' on drum */
mutex_exit(bp->b_objlock);
mutex_enter(vp->v_interlock);
vp->v_numoutput++; /* put it on swapdev */
mutex_exit(vp->v_interlock);
}
/*
* finally plug in swapdev vnode and start I/O
*/
bp->b_vp = vp;
bp->b_objlock = vp->v_interlock;
VOP_STRATEGY(vp, bp);
return;
case VREG:
/*
* delegate to sw_reg_strategy function.
*/
sw_reg_strategy(sdp, bp, bn);
return;
}
/* NOTREACHED */
}
/*
* swread: the read function for the drum (just a call to physio)
*/
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}
/*
* swwrite: the write function for the drum (just a call to physio)
*/
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}
const struct bdevsw swap_bdevsw = {
.d_open = swopen,
.d_close = noclose,
.d_strategy = swstrategy,
.d_ioctl = noioctl,
.d_dump = nodump,
.d_psize = nosize,
.d_discard = nodiscard,
.d_flag = D_OTHER
};
const struct cdevsw swap_cdevsw = {
.d_open = nullopen,
.d_close = nullclose,
.d_read = swread,
.d_write = swwrite,
.d_ioctl = noioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER,
};
/*
* sw_reg_strategy: handle swap i/o to regular files
*/
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
struct vnode *vp;
struct vndxfer *vnx;
daddr_t nbn;
char *addr;
off_t byteoff;
int s, off, nra, error, sz, resid;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* allocate a vndxfer head for this transfer and point it to
* our buffer.
*/
vnx = pool_get(&vndxfer_pool, PR_WAITOK);
vnx->vx_flags = VX_BUSY;
vnx->vx_error = 0;
vnx->vx_pending = 0;
vnx->vx_bp = bp;
vnx->vx_sdp = sdp;
/*
* setup for main loop where we read filesystem blocks into
* our buffer.
*/
error = 0;
bp->b_resid = bp->b_bcount; /* nothing transferred yet! */
addr = bp->b_data; /* current position in buffer */
byteoff = dbtob((uint64_t)bn);
for (resid = bp->b_resid; resid; resid -= sz) {
struct vndbuf *nbp;
/*
* translate byteoffset into block number. return values:
* vp = vnode of underlying device
* nbn = new block number (on underlying vnode dev)
* nra = num blocks we can read-ahead (excludes requested
* block)
*/
nra = 0;
error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
&vp, &nbn, &nra);
if (error == 0 && nbn == (daddr_t)-1) {
/*
* this used to just set error, but that doesn't
* do the right thing. Instead, it causes random
* memory errors. The panic() should remain until
* this condition doesn't destabilize the system.
*/
#if 1
panic("%s: swap to sparse file", __func__);
#else
error = EIO; /* failure */
#endif
}
/*
* punt if there was an error or a hole in the file.
* we must wait for any i/o ops we have already started
* to finish before returning.
*
* XXX we could deal with holes here but it would be
* a hassle (in the write case).
*/
if (error) {
s = splbio();
vnx->vx_error = error; /* pass error up */
goto out;
}
/*
* compute the size ("sz") of this transfer (in bytes).
*/
off = byteoff % sdp->swd_bsize;
sz = (1 + nra) * sdp->swd_bsize - off;
if (sz > resid)
sz = resid;
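/*
* worked example (illustrative): with swd_bsize = 8192 and
* byteoff = 20480, off = 4096; if VOP_BMAP reported nra = 1, then
* sz = (1 + 1) * 8192 - 4096 = 12288 bytes, clamped just above to
* whatever remains of the request.
*/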
UVMHIST_LOG(pdhist, "sw_reg_strategy: "
"vp %#jx/%#jx offset %#jx/%#jx",
(uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
/*
* now get a buf structure. note that the vb_buf is
* at the front of the nbp structure so that you can
* cast pointers between the two structures easily.
*/
nbp = pool_get(&vndbuf_pool, PR_WAITOK);
buf_init(&nbp->vb_buf);
nbp->vb_buf.b_flags = bp->b_flags;
nbp->vb_buf.b_cflags = bp->b_cflags;
nbp->vb_buf.b_oflags = bp->b_oflags;
nbp->vb_buf.b_bcount = sz;
nbp->vb_buf.b_bufsize = sz;
nbp->vb_buf.b_error = 0;
nbp->vb_buf.b_data = addr;
nbp->vb_buf.b_lblkno = 0;
nbp->vb_buf.b_blkno = nbn + btodb(off);
nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
nbp->vb_buf.b_iodone = sw_reg_biodone;
nbp->vb_buf.b_vp = vp;
nbp->vb_buf.b_objlock = vp->v_interlock;
if (vp->v_type == VBLK) {
nbp->vb_buf.b_dev = vp->v_rdev;
}
nbp->vb_xfer = vnx; /* patch it back in to vnx */
/*
* Just sort by block number
*/
s = splbio();
if (vnx->vx_error != 0) {
buf_destroy(&nbp->vb_buf);
pool_put(&vndbuf_pool, nbp);
goto out;
}
vnx->vx_pending++;
/* sort it in and start I/O if we are not over our limit */
/* XXXAD locking */
bufq_put(sdp->swd_tab, &nbp->vb_buf);
sw_reg_start(sdp);
splx(s);
/*
* advance to the next I/O
*/
byteoff += sz;
addr += sz;
}
s = splbio();
out: /* Arrive here at splbio */
vnx->vx_flags &= ~VX_BUSY;
if (vnx->vx_pending == 0) {
error = vnx->vx_error;
pool_put(&vndxfer_pool, vnx);
bp->b_error = error;
biodone(bp);
}
splx(s);
}
/*
* sw_reg_start: start an I/O request on the requested swapdev
*
* => reqs are sorted by b_rawblkno (above)
*/
static void
sw_reg_start(struct swapdev *sdp)
{
struct buf *bp;
struct vnode *vp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/* recursion control */
if ((sdp->swd_flags & SWF_BUSY) != 0)
return;
sdp->swd_flags |= SWF_BUSY;
while (sdp->swd_active < sdp->swd_maxactive) {
bp = bufq_get(sdp->swd_tab);
if (bp == NULL)
break;
sdp->swd_active++;
UVMHIST_LOG(pdhist,
"sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx",
(uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
bp->b_bcount);
vp = bp->b_vp;
KASSERT(bp->b_objlock == vp->v_interlock);
if ((bp->b_flags & B_READ) == 0) {
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
}
VOP_STRATEGY(vp, bp);
}
sdp->swd_flags &= ~SWF_BUSY;
}
/*
* sw_reg_biodone: one of our i/o's has completed
*/
static void
sw_reg_biodone(struct buf *bp)
{
workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}
/*
* sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
*
* => note that we can recover the vndbuf struct by casting the buf ptr
*/
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
struct vndbuf *vbp = (void *)wk;
struct vndxfer *vnx = vbp->vb_xfer;
struct buf *pbp = vnx->vx_bp; /* parent buffer */
struct swapdev *sdp = vnx->vx_sdp;
int s, resid, error;
KASSERT(&vbp->vb_buf.b_work == wk);
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx",
(uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
(uintptr_t)vbp->vb_buf.b_data);
UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx",
vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
/*
* protect vbp at splbio and update.
*/
s = splbio();
resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
pbp->b_resid -= resid;
vnx->vx_pending--;
if (vbp->vb_buf.b_error != 0) {
/* pass error upward */
error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0);
vnx->vx_error = error;
}
/*
* kill vbp structure
*/
buf_destroy(&vbp->vb_buf);
pool_put(&vndbuf_pool, vbp);
/*
* wrap up this transaction if it has run to completion or, in
* case of an error, when all auxiliary buffers have returned.
*/
if (vnx->vx_error != 0) {
/* pass error upward */
error = vnx->vx_error;
if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
pbp->b_error = error;
biodone(pbp);
pool_put(&vndxfer_pool, vnx);
}
} else if (pbp->b_resid == 0) {
KASSERT(vnx->vx_pending == 0);
if ((vnx->vx_flags & VX_BUSY) == 0) {
UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !",
(uintptr_t)pbp, vnx->vx_error, 0, 0);
biodone(pbp);
pool_put(&vndxfer_pool, vnx);
}
}
/*
* done! start next swapdev I/O if one is pending
*/
sdp->swd_active--;
sw_reg_start(sdp);
splx(s);
}
/*
* uvm_swap_alloc: allocate space on swap
*
* => allocation is done "round robin" down the priority list, as we
* allocate in a priority we "rotate" the circle queue.
* => space can be freed with uvm_swap_free
* => we return the page slot number in /dev/drum (0 == invalid slot)
* => we lock uvm_swap_data_lock
* => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
*/
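/*
* illustrative walk-through of the rotation: if swapdevs A and B are
* both enabled at priority 0 with A at the head of the tailq, the first
* allocation is taken from A and A is moved to the tail, so the next
* attempt tries B first; devices at a worse priority are consulted only
* when every device at a better priority fails or is full.
*/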
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
struct swapdev *sdp;
struct swappri *spp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* no swap devices configured yet? definite failure.
*/
if (uvmexp.nswapdev < 1)
return 0;
/*
* XXXJAK: BEGIN HACK
*
* blist_alloc() in subr_blist.c will panic if we try to allocate
* too many slots.
*/
if (*nslots > BLIST_MAX_ALLOC) {
if (__predict_false(lessok == false))
return 0;
*nslots = BLIST_MAX_ALLOC;
}
/* XXXJAK: END HACK */
/*
* lock data lock, convert slots into blocks, and enter loop
*/
mutex_enter(&uvm_swap_data_lock);
ReTry: /* XXXMRG */
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
uint64_t result;
/* if it's not enabled, then we can't swap from it */
if ((sdp->swd_flags & SWF_ENABLE) == 0)
continue;
if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
continue;
result = blist_alloc(sdp->swd_blist, *nslots);
if (result == BLIST_NONE) {
continue;
}
KASSERT(result < sdp->swd_drumsize);
/*
* successful allocation! now rotate the tailq.
*/
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
sdp->swd_npginuse += *nslots;
uvmexp.swpginuse += *nslots;
mutex_exit(&uvm_swap_data_lock);
/* done! return drum slot number */
UVMHIST_LOG(pdhist,
"success! returning %jd slots starting at %jd",
*nslots, result + sdp->swd_drumoffset, 0, 0);
return (result + sdp->swd_drumoffset);
}
}
/* XXXMRG: BEGIN HACK */
if (*nslots > 1 && lessok) {
*nslots = 1;
/* XXXMRG: ugh! blist should support this for us */
goto ReTry;
}
/* XXXMRG: END HACK */
mutex_exit(&uvm_swap_data_lock);
return 0;
}
/*
* uvm_swapisfull: return true if most of available swap is allocated
* and in use. we don't count some small portion as it may be inaccessible
* to us at any given moment, for example if there is lock contention or if
* pages are busy.
*/
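/*
* worked example (illustrative): with the default uvm_swapisfull_factor
* of 99, the check below reports "full" once swpgonly * 100 / 99 reaches
* swpgavail; e.g. swpgavail = 1000 and swpgonly = 990 gives
* 990 * 100 / 99 = 1000 >= 1000, while swpgonly = 989 yields 998 and
* does not.
*/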
bool
uvm_swapisfull(void)
{
int swpgonly;
bool rv;
if (uvmexp.swpages == 0) {
return true;
}
mutex_enter(&uvm_swap_data_lock);
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
uvm_swapisfull_factor);
rv = (swpgonly >= uvmexp.swpgavail);
mutex_exit(&uvm_swap_data_lock);
return (rv);
}
/*
* uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
*
* => we lock uvm_swap_data_lock
*/
void
uvm_swap_markbad(int startslot, int nslots)
{
struct swapdev *sdp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
KASSERT(sdp != NULL);
/*
* we just keep track of how many pages have been marked bad
* in this device, to make everything add up in swap_off().
* we assume here that the range of slots will all be within
* one swap device.
*/
KASSERT(uvmexp.swpgonly >= nslots);
atomic_add_int(&uvmexp.swpgonly, -nslots);
sdp->swd_npgbad += nslots;
UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
mutex_exit(&uvm_swap_data_lock);
}
/*
* uvm_swap_free: free swap slots
*
* => this can be all or part of an allocation made by uvm_swap_alloc
* => we lock uvm_swap_data_lock
*/
void
uvm_swap_free(int startslot, int nslots)
{
struct swapdev *sdp;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
startslot, 0, 0);
/*
* ignore attempts to free the "bad" slot.
*/
if (startslot == SWSLOT_BAD) {
return;
}
/*
* convert drum slot offset back to sdp, free the blocks
* in the extent, and return. must hold pri lock to do
* lookup and access the extent.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
KASSERT(uvmexp.nswapdev >= 1);
KASSERT(sdp != NULL);
KASSERT(sdp->swd_npginuse >= nslots);
blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
sdp->swd_npginuse -= nslots;
uvmexp.swpginuse -= nslots;
mutex_exit(&uvm_swap_data_lock);
}
/*
* uvm_swap_put: put any number of pages into a contig place on swap
*
* => can be sync or async
*/
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
int error;
error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
return error;
}
/*
* uvm_swap_get: get a single page from swap
*
* => usually a sync op (from fault)
*/
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
int error;
atomic_inc_uint(&uvmexp.nswget);
KASSERT(flags & PGO_SYNCIO);
if (swslot == SWSLOT_BAD) {
return EIO;
}
error = uvm_swap_io(&page, swslot, 1, B_READ |
((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
if (error == 0) {
/*
* this page is no longer only in swap.
*/
KASSERT(uvmexp.swpgonly > 0);
atomic_dec_uint(&uvmexp.swpgonly);
}
return error;
}
/*
* uvm_swap_io: do an i/o operation to swap
*/
static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
daddr_t startblk;
struct buf *bp;
vaddr_t kva;
int error, mapinflags;
bool write, async, swap_encrypt;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx",
startslot, npages, flags, 0);
write = (flags & B_READ) == 0;
async = (flags & B_ASYNC) != 0;
swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);
/*
* allocate a buf for the i/o.
*/
KASSERT(curlwp != uvm.pagedaemon_lwp || write);
KASSERT(curlwp != uvm.pagedaemon_lwp || async);
bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
if (bp == NULL) {
uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
return ENOMEM;
}
/*
* convert starting drum slot to block number
*/
startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
/*
* first, map the pages into the kernel.
*/
mapinflags = !write ?
UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
if (write && swap_encrypt) /* need to encrypt in-place */
mapinflags |= UVMPAGER_MAPIN_READ;
kva = uvm_pagermapin(pps, npages, mapinflags);
/*
* encrypt writes in place if requested
*/
if (write) do {
struct swapdev *sdp;
int i;
/*
* Get the swapdev so we can discriminate on the
* encryption state. There may or may not be an
* encryption key generated; we may or may not be asked
* to encrypt swap.
*
* 1. NO KEY, NO ENCRYPTION: Nothing to do.
*
* 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
* and mark the slots encrypted.
*
* 3. KEY, BUT NO ENCRYPTION: The slots may already be
* marked encrypted from a past life. Mark them not
* encrypted.
*
* 4. KEY, ENCRYPTION: Encrypt and mark the slots
* encrypted.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
if (!sdp->swd_encinit) {
if (!swap_encrypt) {
mutex_exit(&uvm_swap_data_lock);
break;
}
uvm_swap_genkey(sdp);
}
KASSERT(sdp->swd_encinit);
mutex_exit(&uvm_swap_data_lock);
for (i = 0; i < npages; i++) {
int s = startslot + i;
KDASSERT(swapdrum_sdp_is(s, sdp));
KASSERT(s >= sdp->swd_drumoffset);
s -= sdp->swd_drumoffset;
KASSERT(s < sdp->swd_drumsize);
if (swap_encrypt) {
uvm_swap_encryptpage(sdp,
(void *)(kva + (vsize_t)i*PAGE_SIZE), s);
atomic_or_32(&sdp->swd_encmap[s/32],
__BIT(s%32));
} else {
atomic_and_32(&sdp->swd_encmap[s/32],
~__BIT(s%32));
}
}
} while (0);
/*
* fill in the bp/sbp. we currently route our i/o through
* /dev/drum's vnode [swapdev_vp].
*/
bp->b_cflags = BC_BUSY | BC_NOCACHE;
bp->b_flags = (flags & (B_READ|B_ASYNC));
bp->b_proc = &proc0; /* XXX */
bp->b_vnbufs.le_next = NOLIST;
bp->b_data = (void *)kva;
bp->b_blkno = startblk;
bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
/*
* bump v_numoutput (counter of number of active outputs).
*/
if (write) {
mutex_enter(swapdev_vp->v_interlock);
swapdev_vp->v_numoutput++;
mutex_exit(swapdev_vp->v_interlock);
}
/*
* for async ops we must set up the iodone handler.
*/
if (async) {
bp->b_iodone = uvm_aio_aiodone;
UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
if (curlwp == uvm.pagedaemon_lwp)
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
else
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
} else {
bp->b_iodone = NULL;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
}
UVMHIST_LOG(pdhist,
"about to start io: data = %#jx blkno = %#jx, bcount = %jd",
(uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
/*
* now we start the I/O, and if async, return.
*/
VOP_STRATEGY(swapdev_vp, bp);
if (async) {
/*
* Reads are always synchronous; if this changes, we
* need to add an asynchronous path for decryption.
*/
KASSERT(write);
return 0;
}
/*
* must be sync i/o. wait for it to finish
*/
error = biowait(bp);
if (error)
goto out;
/*
* decrypt reads in place if needed
*/
if (!write) do {
struct swapdev *sdp;
bool encinit;
int i;
/*
* Get the sdp. Everything about it except the encinit
* bit, saying whether the encryption key is
* initialized or not, and the encrypted bit for each
* page, is stable until all swap pages have been
* released and the device is removed.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
encinit = sdp->swd_encinit;
mutex_exit(&uvm_swap_data_lock);
if (!encinit)
/*
* If there's no encryption key, there's no way
* any of these slots can be encrypted, so
* nothing to do here.
*/
break;
for (i = 0; i < npages; i++) {
int s = startslot + i;
KDASSERT(swapdrum_sdp_is(s, sdp));
KASSERT(s >= sdp->swd_drumoffset);
s -= sdp->swd_drumoffset;
KASSERT(s < sdp->swd_drumsize);
if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
__BIT(s%32)) == 0)
continue;
uvm_swap_decryptpage(sdp,
(void *)(kva + (vsize_t)i*PAGE_SIZE), s);
}
} while (0);
out:
/*
* kill the pager mapping
*/
uvm_pagermapout(kva, npages);
/*
* now dispose of the buf and we're done.
*/
if (write) {
mutex_enter(swapdev_vp->v_interlock);
vwakeup(bp);
mutex_exit(swapdev_vp->v_interlock);
}
putiobuf(bp);
UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0);
return (error);
}
/*
* uvm_swap_genkey(sdp)
*
* Generate a key for swap encryption.
*/
static void
uvm_swap_genkey(struct swapdev *sdp)
{
uint8_t key[32];
KASSERT(!sdp->swd_encinit);
cprng_strong(kern_cprng, key, sizeof key, 0);
aes_setenckey256(&sdp->swd_enckey, key);
aes_setdeckey256(&sdp->swd_deckey, key);
explicit_memset(key, 0, sizeof key);
sdp->swd_encinit = true;
}
/*
* uvm_swap_encryptpage(sdp, kva, slot)
*
* Encrypt one page of data at kva for the specified slot number
* in the swap device.
*/
static void
uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
{
uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
/* iv := AES_k(le32enc(slot) || 0^96) */
le32enc(preiv, slot);
aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
/* *kva := AES-CBC_k(iv, *kva) */
aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
AES_256_NROUNDS);
explicit_memset(&iv, 0, sizeof iv);
}
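/*
* illustrative: for slot 5 the preiv above is 05 00 00 00 followed by
* twelve zero bytes; encrypting that block with the per-device key
* yields a slot-unique IV, so identical page contents written to
* different slots still produce different ciphertext.
*/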
/*
* uvm_swap_decryptpage(sdp, kva, slot)
*
* Decrypt one page of data at kva for the specified slot number
* in the swap device.
*/
static void
uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
{
uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
/* iv := AES_k(le32enc(slot) || 0^96) */
le32enc(preiv, slot);
aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
/* *kva := AES-CBC^{-1}_k(iv, *kva) */
aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
AES_256_NROUNDS);
explicit_memset(&iv, 0, sizeof iv);
}
SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
SYSCTL_DESCR("Encrypt data when swapped out to disk"),
NULL, 0, &uvm_swap_encrypt, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
}
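/*
* illustrative usage: the node created above appears as vm.swap_encrypt
* and can be flipped at run time, e.g. "sysctl -w vm.swap_encrypt=1";
* because the encrypted bit is tracked per slot, only pages written out
* after the change are affected.
*/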
/* $NetBSD: bufq_disksort.c,v 1.14 2017/05/04 11:03:27 kamil Exp $ */
/* NetBSD: subr_disk.c,v 1.61 2004/09/25 03:30:44 thorpej Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_disksort.c,v 1.14 2017/05/04 11:03:27 kamil Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/module.h>
/*
* Seek sort for disks.
*
* There are actually two queues, sorted in ascending order.  The first
* queue holds those requests which are positioned after the current block;
* the second holds requests which came in after their position was passed.
* Thus we implement a one-way scan, retracting after reaching the end of
* the drive to the first request on the second queue, at which time it
* becomes the first queue.
*
* A one-way scan is natural because of the way UNIX read-ahead blocks are
* allocated.
*/
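/*
 * Illustrative example (not from the original sources): with the head
 * positioned past block 100 and the queue holding blocks 120, 180, 40,
 * 70 in that order, blocks 120 and 180 form the first queue and the
 * inversion at 180 -> 40 marks the start of the second queue (40, 70).
 * A new request for block 150 sorts between 120 and 180; one for block
 * 60 sorts between 40 and 70 in the second queue.
 */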
struct bufq_disksort {
TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */
};
static void bufq_disksort_init(struct bufq_state *);
static void bufq_disksort_put(struct bufq_state *, struct buf *);
static struct buf *bufq_disksort_get(struct bufq_state *, int);
BUFQ_DEFINE(disksort, 20, bufq_disksort_init);
static void
bufq_disksort_put(struct bufq_state *bufq, struct buf *bp)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bq, *nbq;
int sortby;
sortby = bufq->bq_flags & BUFQ_SORT_MASK;
bq = TAILQ_FIRST(&disksort->bq_head);
/*
* If the queue is empty it's easy; we just go on the end.
*/
if (bq == NULL) {
TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq);
return;
}
/*
* If we lie before the currently active request, then we
* must locate the second request list and add ourselves to it.
*/
if (buf_inorder(bp, bq, sortby)) {
while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
/*
* Check for an ``inversion'' in the normally ascending
* block numbers, indicating the start of the second
* request list.
*/
if (buf_inorder(nbq, bq, sortby)) {
/*
* Search the second request list for the first
* request at a larger block number. We go
* after that; if there is no such request, we
* go at the end.
*/
do {
if (buf_inorder(bp, nbq, sortby))
goto insert;
bq = nbq;
} while ((nbq =
TAILQ_NEXT(bq, b_actq)) != NULL);
goto insert; /* after last */
}
bq = nbq;
}
/*
* No inversions... we will go after the last, and
* be the first request in the second request list.
*/
goto insert;
}
/*
* Request is at/after the current request...
* sort in the first request list.
*/
while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
/*
* We want to go after the current request if there is an
* inversion after it (i.e. it is the end of the first
* request list), or if the next request is a larger cylinder
* than our request.
*/
if (buf_inorder(nbq, bq, sortby) ||
buf_inorder(bp, nbq, sortby))
goto insert;
bq = nbq;
}
/*
* Neither a second list nor a larger request... we go at the end of
* the first list, which is the same as the end of the whole shebang.
*/
insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq);
}
static struct buf *
bufq_disksort_get(struct bufq_state *bufq, int remove)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bp;
bp = TAILQ_FIRST(&disksort->bq_head);
if (bp != NULL && remove)
	TAILQ_REMOVE(&disksort->bq_head, bp, b_actq);
return (bp);
}
static struct buf *
bufq_disksort_cancel(struct bufq_state *bufq, struct buf *buf)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bq;
TAILQ_FOREACH(bq, &disksort->bq_head, b_actq) {
if (bq == buf) {
TAILQ_REMOVE(&disksort->bq_head, bq, b_actq);
return buf;
}
}
return NULL;
}
static void
bufq_disksort_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
kmem_free(bufq->bq_private, sizeof(struct bufq_disksort));
}
static void
bufq_disksort_init(struct bufq_state *bufq)
{
struct bufq_disksort *disksort;
disksort = kmem_zalloc(sizeof(*disksort), KM_SLEEP);
bufq->bq_private = disksort;
bufq->bq_get = bufq_disksort_get;
bufq->bq_put = bufq_disksort_put;
bufq->bq_cancel = bufq_disksort_cancel;
bufq->bq_fini = bufq_disksort_fini;
TAILQ_INIT(&disksort->bq_head);
}
MODULE(MODULE_CLASS_BUFQ, bufq_disksort, NULL);
static int
bufq_disksort_modcmd(modcmd_t cmd, void *opaque)
{
switch (cmd) {
case MODULE_CMD_INIT:
return bufq_register(&bufq_strat_disksort);
case MODULE_CMD_FINI:
return bufq_unregister(&bufq_strat_disksort);
default:
return ENOTTY;
}
}
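#if 0
/*
 * Hedged usage sketch (not part of this file): how a disk driver might
 * select the "disksort" strategy through the bufq(9) interface that the
 * module above registers.  Driver glue is omitted and the whole block
 * is illustrative only.
 */
static int
example_disksort_usage(struct buf *bp)
{
	struct bufq_state *bufq;
	struct buf *next;
	int error;

	/* Create a queue sorted by raw block number, served by disksort. */
	error = bufq_alloc(&bufq, "disksort", BUFQ_SORT_RAWBLOCK);
	if (error != 0)
		return error;

	bufq_put(bufq, bp);	/* sorted insert via bufq_disksort_put() */
	next = bufq_get(bufq);	/* head of the one-way scan */
	(void)next;

	bufq_free(bufq);
	return 0;
}
#endif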
/* $NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $ */
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
*/
/*-
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/once.h>
#include <sys/workqueue.h>
#include <sys/kmem.h>
#include <uvm/uvm_extern.h>
ONCE_DECL(physio_initialized);
struct workqueue *physio_workqueue;
int physio_concurrency = 16;
/* #define PHYSIO_DEBUG */
#if defined(PHYSIO_DEBUG)
#define DPRINTF(a) printf a
#else /* defined(PHYSIO_DEBUG) */
#define DPRINTF(a) /* nothing */
#endif /* defined(PHYSIO_DEBUG) */
struct physio_stat {
int ps_running;
int ps_error;
int ps_failed;
off_t ps_endoffset;
size_t ps_resid;
buf_t *ps_orig_bp;
kmutex_t ps_lock;
kcondvar_t ps_cv;
};
static void
physio_done(struct work *wk, void *dummy)
{
struct buf *bp = (void *)wk;
size_t todo = bp->b_bufsize;
size_t done = bp->b_bcount - bp->b_resid;
struct physio_stat *ps = bp->b_private;
bool is_iobuf;
KASSERT(&bp->b_work == wk);
KASSERT(bp->b_bcount <= todo);
KASSERT(bp->b_resid <= bp->b_bcount);
KASSERT((bp->b_flags & B_PHYS) != 0);
KASSERT(dummy == NULL);
vunmapbuf(bp, todo);
uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
mutex_enter(&ps->ps_lock);
is_iobuf = (bp != ps->ps_orig_bp);
if (__predict_false(done != todo)) {
off_t endoffset = dbtob(bp->b_blkno) + done;
/*
* We got an error or hit EOM.
*
* We only care about the first one,
* i.e. the one at the lowest offset.
*/
KASSERT(ps->ps_endoffset != endoffset);
DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
__func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
bp->b_blkno, bp->b_bcount, bp->b_flags));
if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
" -> %" PRIu64 "\n",
__func__, ps,
ps->ps_error, bp->b_error,
ps->ps_endoffset, endoffset));
ps->ps_endoffset = endoffset;
ps->ps_error = bp->b_error;
}
ps->ps_failed++;
ps->ps_resid += todo - done;
} else {
KASSERT(bp->b_error == 0);
}
ps->ps_running--;
cv_signal(&ps->ps_cv);
mutex_exit(&ps->ps_lock);
if (is_iobuf)
putiobuf(bp);
}
static void
physio_biodone(struct buf *bp)
{
#if defined(DIAGNOSTIC)
struct physio_stat *ps = bp->b_private;
size_t todo = bp->b_bufsize;
size_t done = bp->b_bcount - bp->b_resid;
KASSERT(ps->ps_running > 0);
KASSERT(bp->b_bcount <= todo);
KASSERT(bp->b_resid <= bp->b_bcount);
if (done == todo)
	KASSERTMSG(bp->b_error == 0, "error=%d", bp->b_error);
#endif /* defined(DIAGNOSTIC) */
workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
}
static void
physio_wait(struct physio_stat *ps, int n)
{
KASSERT(mutex_owned(&ps->ps_lock));

while (ps->ps_running > n)
cv_wait(&ps->ps_cv, &ps->ps_lock);
}
static int
physio_init(void)
{
int error;
KASSERT(physio_workqueue == NULL);
error = workqueue_create(&physio_workqueue, "physiod",
physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);
return error;
}
/*
* Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly
* from the raw device to user buffers, and bypasses the buffer cache.
*/
int
physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
void (*min_phys)(struct buf *), struct uio *uio)
{
struct iovec *iovp;
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int i, error;
struct buf *bp = NULL;
struct physio_stat *ps;
int concurrency = physio_concurrency - 1;
int isdisk;
error = RUN_ONCE(&physio_initialized, physio_init);
if (__predict_false(error != 0)) {
return error;
}
DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
__func__, uio->uio_offset, uio->uio_resid));
flags &= B_READ | B_WRITE;
ps = kmem_zalloc(sizeof(*ps), KM_SLEEP);
/* ps->ps_running = 0; */
/* ps->ps_error = 0; */
/* ps->ps_failed = 0; */
ps->ps_orig_bp = obp;
ps->ps_endoffset = -1;
ps->ps_resid = 0;
mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ps->ps_cv, "physio");
/* Allow concurrent I/O only for disks */
isdisk = cdev_type(dev) == D_DISK;
if (!isdisk)
concurrency = 0;
/* Make sure we have a buffer, creating one if necessary. */
if (obp != NULL) {
mutex_enter(&bufcache_lock);
/* Mark it busy, so nobody else will use it. */
while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
;
mutex_exit(&bufcache_lock);
concurrency = 0; /* see "XXXkludge" comment below */
}
for (i = 0; i < uio->uio_iovcnt; i++) {
bool sync = true;
iovp = &uio->uio_iov[i];
while (iovp->iov_len > 0) {
size_t todo;
vaddr_t endp;
mutex_enter(&ps->ps_lock);
if (ps->ps_failed != 0) {
goto done_locked;
}
physio_wait(ps, sync ? 0 : concurrency);
mutex_exit(&ps->ps_lock);
if (obp != NULL) {
/*
* XXXkludge
* some drivers use "obp" as an identifier.
*/
bp = obp;
} else {
bp = getiobuf(NULL, true);
bp->b_cflags |= BC_BUSY;
}
bp->b_dev = dev;
bp->b_proc = p;
bp->b_private = ps;
/*
* Mark the buffer busy for physical I/O.  Also set
* B_PHYS because it's an I/O to user memory, and
* B_RAW because B_RAW is to be "set by physio for
* raw transfers".
*/
bp->b_oflags = 0;
bp->b_cflags |= BC_BUSY;
bp->b_flags = flags | B_PHYS | B_RAW;
bp->b_iodone = physio_biodone;
/* Set up the buffer for a maximum-sized transfer. */
bp->b_blkno = btodb(uio->uio_offset);
if (isdisk) {
/*
* For disks, check that offsets are at least block
* aligned; the block addresses are used to track
* errors of finished requests.
*/
if (uio->uio_offset & (DEV_BSIZE - 1)) {
error = EINVAL;
goto done;
}
/*
* Split request into MAXPHYS chunks
*/
bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
} else {
bp->b_bcount = MIN(INT_MAX, iovp->iov_len);
}
bp->b_data = iovp->iov_base;
/*
* Call minphys to bound the transfer size,
* and remember the amount of data to transfer,
* for later comparison.
*/
(*min_phys)(bp);
todo = bp->b_bufsize = bp->b_bcount;
#if defined(DIAGNOSTIC)
if (todo > MAXPHYS)
panic("todo(%zu) > MAXPHYS; minphys broken",
todo);
#endif /* defined(DIAGNOSTIC) */
sync = false;
endp = (vaddr_t)bp->b_data + todo;
if (trunc_page(endp) != endp) {
/*
* Following requests can overlap.
* Note that uvm_vslock() does round_page.
*/
sync = true;
}
/*
* Lock the part of the user address space involved
* in the transfer.
*/
error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
(flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
if (error) {
goto done;
}
/*
* Beware vmapbuf(); if successful it clobbers
* b_data and saves it in b_saveaddr.
* However, vunmapbuf() restores b_data.
*/
if ((error = vmapbuf(bp, todo)) != 0) {
uvm_vsunlock(p->p_vmspace, bp->b_data, todo);
goto done;
}
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
mutex_enter(&ps->ps_lock);
ps->ps_running++;
mutex_exit(&ps->ps_lock);
/* Call strategy to start the transfer. */
(*strategy)(bp);
bp = NULL;
iovp->iov_len -= todo;
iovp->iov_base = (char *)iovp->iov_base + todo;
uio->uio_offset += todo;
uio->uio_resid -= todo;
}
}
done:
mutex_enter(&ps->ps_lock);
done_locked:
physio_wait(ps, 0);
mutex_exit(&ps->ps_lock);
KASSERT(ps->ps_failed || ps->ps_endoffset == -1);
/*
* Compute residual, for disks adjust for the
* lowest numbered block that returned an error.
*/
if (isdisk) {
if (ps->ps_failed != 0) {
off_t delta;
delta = uio->uio_offset - ps->ps_endoffset;
KASSERT(delta > 0);
uio->uio_resid += delta;
/* uio->uio_offset = ps->ps_endoffset; */
}
} else {
uio->uio_resid += ps->ps_resid;
}
if (bp != NULL && bp != obp) {
	putiobuf(bp);
}
if (error == 0) {
	error = ps->ps_error;
}
mutex_destroy(&ps->ps_lock);
cv_destroy(&ps->ps_cv);
kmem_free(ps, sizeof(*ps));
/*
* Clean up the state of the buffer. Remember if somebody wants
* it, so we can wake them up below. Also, if we had to steal it,
* give it back.
*/
if (obp != NULL) {
	KASSERT((obp->b_cflags & BC_BUSY) != 0);
/*
* If another process is waiting for the raw I/O buffer,
* wake up processes waiting to do physical I/O.
*/
mutex_enter(&bufcache_lock);
obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
obp->b_flags &= ~(B_PHYS | B_RAW);
obp->b_iodone = NULL;
cv_broadcast(&obp->b_busy);
mutex_exit(&bufcache_lock);
}
DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
__func__, uio->uio_offset, uio->uio_resid));
return error;
}
/*
* A minphys() routine is called by physio() to adjust the size of each
* I/O transfer before the latter is passed to the strategy routine.
*
* This minphys() is a default that must be called to enforce limits
* that are applicable to all devices, because of limitations in the
* kernel or the hardware platform.
*/
void
minphys(struct buf *bp)
{

	if (bp->b_bcount > MAXPHYS)
		bp->b_bcount = MAXPHYS;
}
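#if 0
/*
 * Hedged usage sketch (not part of this file): the conventional way a
 * character-device read entry point hands raw I/O to physio(), letting
 * the default minphys() above bound each transfer.  "examplestrategy"
 * is a hypothetical driver strategy routine named only for illustration.
 */
static int
example_rawread(dev_t dev, struct uio *uio, int ioflag)
{

	return physio(examplestrategy, NULL, dev, B_READ, minphys, uio);
}
#endif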
/* $NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $ */
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_tcp_debug.h"
#include "opt_tcp_congctl.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_congctl.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
#endif
/*
* TODO:
* consider moving the actual implementations into a separate file.
*/
static void tcp_common_congestion_exp(struct tcpcb *, int, int);
static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);
static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);
static void tcp_congctl_fillnames(void);
extern int tcprexmtthresh;
MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
/* currently selected global congestion control */
char tcp_congctl_global_name[TCPCC_MAXLEN];
/* available global congestion control algorithms */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];
/*
* Used to list the available congestion control algorithms.
*/
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
TAILQ_HEAD_INITIALIZER(tcp_congctlhd);
static struct tcp_congctlent * tcp_congctl_global;
static kmutex_t tcp_congctl_mtx;
void
tcp_congctl_init(void)
{
int r __diagused;
mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);
/* Base algorithms. */
r = tcp_congctl_register("reno", &tcp_reno_ctl);
KASSERT(r == 0);
r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
KASSERT(r == 0);
r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
KASSERT(r == 0);
/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif
r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
KASSERT(r == 0);
}
/*
* Register a congestion algorithm and select it if we have none.
*/
int
tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
{
struct tcp_congctlent *ntcc, *tccp;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
if (!strcmp(name, tccp->congctl_name)) {
/* name already registered */
return EEXIST;
}
ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);
strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
ntcc->congctl_ctl = tcc;
TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
tcp_congctl_fillnames();
if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
tcp_congctl_select(NULL, name);
return 0;
}
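#if 0
/*
 * Hedged sketch (not part of this file): how an additional congestion
 * control algorithm could plug into the registry above.  "example_ctl"
 * is an assumption for illustration; it simply reuses the Reno handlers
 * declared earlier in this file.
 */
static const struct tcp_congctl example_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

static void
example_congctl_attach(void)
{
	int error;

	error = tcp_congctl_register("example", &example_ctl);
	if (error == EEXIST)
		printf("example: congestion control already registered\n");
}
#endif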
int
tcp_congctl_unregister(const char *name)
{
struct tcp_congctlent *tccp, *rtccp;
unsigned int size;
rtccp = NULL;
size = 0;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
if (!strcmp(name, tccp->congctl_name))
rtccp = tccp;
size++;
}
if (!rtccp)
return ENOENT;
if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
return EBUSY;
TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
free(rtccp, M_TCPCONGCTL);
tcp_congctl_fillnames();
return 0;
}
/*
* Select a congestion algorithm by name.
*/
int
tcp_congctl_select(struct tcpcb *tp, const char *name)
{
struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
bool old_found, new_found;
KASSERT(name);

old_found = (tp == NULL || tp->t_congctl == NULL);
old_tccp = NULL;
new_found = false;
new_tccp = NULL;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
	if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
old_tccp = tccp;
old_found = true;
}
if (!new_found && !strcmp(name, tccp->congctl_name)) {
new_tccp = tccp;
new_found = true;
}
if (new_found && old_found) {
if (tp) {
mutex_enter(&tcp_congctl_mtx);
if (old_tccp)
	old_tccp->congctl_refcnt--;
tp->t_congctl = new_tccp->congctl_ctl;
new_tccp->congctl_refcnt++;
mutex_exit(&tcp_congctl_mtx);
} else {
tcp_congctl_global = new_tccp;
strlcpy(tcp_congctl_global_name,
new_tccp->congctl_name,
sizeof(tcp_congctl_global_name) - 1);
}
return 0;
}
}
return EINVAL;
}
void
tcp_congctl_release(struct tcpcb *tp)
{
struct tcp_congctlent *tccp;
KASSERT(tp->t_congctl);

TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
if (tccp->congctl_ctl == tp->t_congctl) {
tccp->congctl_refcnt--;
return;
}
}
}
/*
* Returns the name of a congestion algorithm.
*/
const char *
tcp_congctl_bystruct(const struct tcp_congctl *tcc)
{
struct tcp_congctlent *tccp;
KASSERT(tcc);
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
if (tccp->congctl_ctl == tcc)
return tccp->congctl_name;
return NULL;
}
static void
tcp_congctl_fillnames(void)
{
struct tcp_congctlent *tccp;
const char *delim = " ";
tcp_congctl_avail[0] = '\0';
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
strlcat(tcp_congctl_avail, tccp->congctl_name,
sizeof(tcp_congctl_avail) - 1);
if (TAILQ_NEXT(tccp, congctl_ent))
strlcat(tcp_congctl_avail, delim,
sizeof(tcp_congctl_avail) - 1);
}
}
/* ------------------------------------------------------------------------ */
/*
* Common stuff
*/
/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Draft Rhee Section 4.1 */
#define CUBIC_CA 4
#define CUBIC_CB 10
static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
u_long win;
/*
* Reduce the congestion window and the slow start threshold.
*/
win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_segsz;
tp->snd_recover = tp->snd_max;
tp->snd_cwnd = tp->snd_ssthresh;
/*
* When using TCP ECN, notify the peer that
* we reduced the cwnd.
*/
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
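/*
 * Worked example (illustrative numbers, not from the original sources):
 * with snd_wnd = 65535, snd_cwnd = 29200 and t_segsz = 1460, a
 * Reno-style reduction (betaa/betab = 1/2) gives
 * win = 29200 * 1 / 2 / 1460 = 10, so snd_ssthresh and snd_cwnd both
 * become 10 * 1460 = 14600: half the old window, rounded down to a
 * whole number of segments.
 */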
/* ------------------------------------------------------------------------ */
/*
* TCP/Reno congestion control.
*/
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{
tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}
static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* Dup acks mean that packets have left the
* network (they're now cached at the receiver)
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
* network.
*
* If we are using TCP/SACK, then enter
* Fast Recovery if the receiver SACKs
* data that is tcprexmtthresh * MSS
* bytes past the last ACKed segment,
* irrespective of the number of DupAcks.
*/
tcp_seq onxt = tp->snd_nxt;
tp->t_partialacks = 0;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
if (TCP_SACK_ENABLED(tp)) {
tp->t_dupacks = tcprexmtthresh;
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd = tp->t_segsz;
(void) tcp_output(tp);
return 0;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_segsz;
(void) tcp_output(tp);
tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
return 0;
}
static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* We know we're losing at the current
* window size so do congestion avoidance
* (set ssthresh to half the current window
* and pull our congestion window back to
* the new ssthresh).
*/
tcp_reno_congestion_exp(tp);
return tcp_reno_do_fast_retransmit(tp, th);
}
static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
u_long win;
/*
* Close the congestion window down to one segment
* (we'll open it by one segment for each ack we get).
* Since we probably have a window's worth of unacked
* data accumulated, this "slow start" keeps us from
* dumping all that data as back-to-back packets (which
* might overwhelm an intermediate gateway).
*
* There are two phases to the opening: Initially we
* open by one mss on each ack. This makes the window
* size increase exponentially with time. If the
* window is larger than the path can handle, this
* exponential growth results in dropped packet(s)
* almost immediately. To get more time between
* drops but still "push" the network to take advantage
* of improving conditions, we switch from exponential
* to linear window opening at some threshold size.
* For a threshold, we use half the current window
* size, truncated to a multiple of the mss.
*
* (the minimum cwnd that will give us exponential
* growth is 2 mss. We don't allow the threshold
* to go below this.)
*/
win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
if (win < 2)
win = 2;
/* Loss Window MUST be one segment. */
tp->snd_cwnd = tp->t_segsz;
tp->snd_ssthresh = win * tp->t_segsz;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
const struct tcphdr *th)
{
if (tp->t_partialacks < 0) {
/*
* We were not in fast recovery. Reset the duplicate ack
* counter.
*/
tp->t_dupacks = 0;
} else {
/*
* Clamp the congestion window to the crossover point and
* exit fast recovery.
*/
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
tp->snd_fack = th->th_ack;
}
}
static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* When new data is acked, open the congestion window.
*/
u_int cw = tp->snd_cwnd;
u_int incr = tp->t_segsz;
if (tcp_do_abc) {
/*
* RFC 3465 Appropriate Byte Counting (ABC)
*/
int acked = th->th_ack - tp->snd_una;
if (cw >= tp->snd_ssthresh) {
tp->t_bytes_acked += acked;
if (tp->t_bytes_acked >= cw) {
/* Time to increase the window. */
tp->t_bytes_acked -= cw;
} else {
/* No need to increase yet. */
incr = 0;
}
} else {
/*
* use 2*SMSS or 1*SMSS for the "L" param,
* depending on sysctl setting.
*
* (See RFC 3465 2.3 Choosing the Limit)
*/
u_int abc_lim;
abc_lim = (tcp_abc_aggressive == 0 ||
tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
incr = uimin(acked, abc_lim);
}
} else {
/*
* If the window gives us less than ssthresh packets
* in flight, open exponentially (segsz per packet).
* Otherwise open linearly: segsz per window
* (segsz^2 / cwnd per packet).
*/
if (cw >= tp->snd_ssthresh) {
incr = incr * incr / cw;
}
}
tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
}
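/*
 * Worked example (illustrative numbers, not from the original sources):
 * in classic congestion avoidance with t_segsz = 1460 and
 * snd_cwnd = 14600, each new ack adds incr = 1460 * 1460 / 14600 = 146
 * bytes, so about one window's worth of acks grows cwnd by one segment.
 * With ABC enabled the same one-segment increase happens once
 * t_bytes_acked reaches the current window, which gives the same rate
 * per RTT while being robust against ack division.
 */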
const struct tcp_congctl tcp_reno_ctl = {
.fast_retransmit = tcp_reno_fast_retransmit,
.slow_retransmit = tcp_reno_slow_retransmit,
.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
.newack = tcp_reno_newack,
.cong_exp = tcp_reno_congestion_exp,
};
/*
* TCP/NewReno Congestion control.
*/
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
if (SEQ_LT(th->th_ack, tp->snd_high)) {
/*
* False fast retransmit after timeout.
* Do not enter fast recovery
*/
tp->t_dupacks = 0;
return 1;
}
/*
* Fast retransmit is same as reno.
*/
return tcp_reno_fast_retransmit(tp, th);
}
/*
* Implement the NewReno response to a new ack, checking for partial acks in
* fast recovery.
*/
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
if (tp->t_partialacks < 0) {
/*
* We were not in fast recovery. Reset the duplicate ack
* counter.
*/
tp->t_dupacks = 0;
} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
/*
* This is a partial ack. Retransmit the first unacknowledged
* segment and deflate the congestion window by the amount of
* acknowledged data. Do not exit fast recovery.
*/
tcp_seq onxt = tp->snd_nxt;
u_long ocwnd = tp->snd_cwnd;
int sack_num_segs = 1, sack_bytes_rxmt = 0;
/*
* snd_una has not yet been updated and the socket's send
* buffer has not yet drained off the ACK'd data, so we
* have to leave snd_una as it was to get the correct data
* offset in tcp_output().
*/
tp->t_partialacks++;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
if (TCP_SACK_ENABLED(tp)) {
/*
* Partial ack handling within a sack recovery episode.
* Keeping this very simple for now. When a partial ack
* is received, force snd_cwnd to a value that will
* allow the sender to transmit no more than 2 segments.
* If necessary, a fancier scheme can be adopted at a
* later point, but for now, the goal is to prevent the
* sender from bursting a large amount of data in the
* midst of sack recovery.
*/
/*
* Send one or two segments based on how much
* new data was acked.
*/
if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
sack_num_segs = 2;
(void)tcp_sack_output(tp, &sack_bytes_rxmt);
tp->snd_cwnd = sack_bytes_rxmt +
(tp->snd_nxt - tp->sack_newdata) +
sack_num_segs * tp->t_segsz;
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
} else {
tp->snd_nxt = th->th_ack;
/*
* Set snd_cwnd to one segment beyond ACK'd offset
* snd_una is not yet updated when we're called
*/
tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
(void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
/*
* Partial window deflation. Relies on the fact that
* tp->snd_una has not been updated yet.
*/
tp->snd_cwnd -= (th->th_ack - tp->snd_una -
tp->t_segsz);
}
} else {
/*
* Complete ack. Inflate the congestion window to ssthresh
* and exit fast recovery.
*
* Window inflation should have left us with approx.
* snd_ssthresh outstanding data. But in case we
* would be inclined to send a burst, better to do
* it via the slow start mechanism.
*/
if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
+ tp->t_segsz;
else
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
tp->snd_fack = th->th_ack;
}
}
static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* If we are still in fast recovery (meaning we are using
* NewReno and we have only received partial acks), do not
* inflate the window yet.
*/
if (tp->t_partialacks < 0)
tcp_reno_newack(tp, th);
}
const struct tcp_congctl tcp_newreno_ctl = {
.fast_retransmit = tcp_newreno_fast_retransmit,
.slow_retransmit = tcp_reno_slow_retransmit,
.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
.newack = tcp_newreno_newack,
.cong_exp = tcp_reno_congestion_exp,
};
/*
* CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
*/
/* Cubic prototypes */
static void tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t tcp_cubic_cbrt(uint32_t);
static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);
/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
* Set congestion timer to now
*/
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
struct timeval now_timeval;
getmicrouptime(&now_timeval);
tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
now_timeval.tv_usec / 1000;
}
/*
* Milliseconds since the last congestion event
*/
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
struct timeval now_timeval;
getmicrouptime(&now_timeval);
return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
tp->snd_cubic_ctime;
}
/*
* Approximate cubic root
*/
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
int i, rounds = CBRT_ROUNDS;
uint64_t x = v / 3;
/* The iteration does not produce correct results for small inputs */
if (v == 0)
return 0;
else if (v < 4)
return 1;
/*
* 2097151 is the largest x for which v + 2*x^3 still fits in 64 bits;
* above it, use the slower overflow-avoiding form at the cost of
* extra rounds.
*/
if (x > 2097151)
rounds += 10;
for (i = 0; i < rounds; i++)
if (rounds == CBRT_ROUNDS)
x = (v + 2 * x * x * x) / (3 * x * x);
else
/* Avoid overflow */
x = v / (3 * x * x) + 2 * x / 3;
return (uint32_t)x;
}
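/*
 * Worked example (illustrative numbers, not from the original sources):
 * for v = 1000 the loop starts at x = 333 and the Newton step
 * x = (v + 2*x^3) / (3*x^2) settles on the exact cube root 10 well
 * within CBRT_ROUNDS iterations; v = 3 takes the early return and
 * yields 1, which is why small inputs are special-cased.
 */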
/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
uint32_t K;
long tK3;
/* Section 3.1 Eq. 2 */
K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
CUBIC_CB / CUBIC_CA);
* (t-K)^3 - it is unclear why the unit of measure matters here
tK3 = (long)(ms_elapsed + rtt) - (long)K;
tK3 = tK3 * tK3 * tK3;
return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}
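/*
 * In real-valued terms (and setting aside the unit question noted
 * above), the fixed-point constants encode C = CUBIC_CA/CUBIC_CB = 0.4
 * and a decrease factor CUBIC_BETAA/CUBIC_BETAB = 0.8, so the code
 * computes K = cbrt(Wmax * (1 - 0.8) / 0.4) and
 * W(t) = 0.4 * (t - K)^3 + Wmax, i.e. Eqs. 1 and 2 of the draft.
 * (Illustrative restatement, not from the original sources.)
 */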
static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{
/*
* Congestion - Set WMax and shrink cwnd
*/
tcp_cubic_update_ctime(tp);
/* Section 3.6 - Fast Convergence */
if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
} else {
tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
tp->snd_cubic_wmax = tp->snd_cwnd;
}
tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);
/* Shrink CWND */
tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}
static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
if (SEQ_LT(th->th_ack, tp->snd_high)) {
/* See newreno */
tp->t_dupacks = 0;
return 1;
}
/*
* mark WMax
*/
tcp_cubic_congestion_exp(tp);
/* Do fast retransmit */
return tcp_reno_do_fast_retransmit(tp, th);
}
static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
uint32_t ms_elapsed, rtt;
u_long w_tcp;
/* Congestion avoidance and not in fast recovery and usable rtt */
if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
/*
* t_srtt is in units of 1/32 of a slow tick;
* converting it to ms gives
* (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
*/
(rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
ms_elapsed = tcp_cubic_diff_ctime(tp);
/* Compute W_tcp(t) */
w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
ms_elapsed / rtt / 3;
if (tp->snd_cwnd > w_tcp) {
/* Not in TCP friendly mode */
tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
tp->snd_cwnd) / tp->snd_cwnd;
} else {
/* friendly TCP mode */
tp->snd_cwnd = w_tcp;
}
/* Make sure we are within limits */
tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
} else {
/* Use New Reno */
tcp_newreno_newack(tp, th);
}
}
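/*
 * Worked example of the rtt conversion above (illustrative numbers,
 * not from the original sources): with PR_SLOWHZ = 2 slow ticks per
 * second, a smoothed RTT of 100 ms is stored as
 * t_srtt = 100 * PR_SLOWHZ * 32 / 1000 = 6 (truncated), and the
 * approximation recovers rtt = (6 << 5) / 2 = 96 ms, close to the
 * exact 6 / 32 * 500 = 93.75 ms.
 */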
static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{
/* Timeout - Mark new congestion */
tcp_cubic_congestion_exp(tp);
/* Loss Window MUST be one segment. */
tp->snd_cwnd = tp->t_segsz;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
const struct tcp_congctl tcp_cubic_ctl = {
.fast_retransmit = tcp_cubic_fast_retransmit,
.slow_retransmit = tcp_cubic_slow_retransmit,
.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
.newack = tcp_cubic_newack,
.cong_exp = tcp_cubic_congestion_exp,
};
/* $NetBSD: udp6_usrreq.c,v 1.154 2022/11/04 09:01:53 ozaki-r Exp $ */
/* $KAME: udp6_usrreq.c,v 1.86 2001/05/27 17:33:00 itojun Exp $ */
/* $KAME: udp6_output.c,v 1.43 2001/10/15 09:19:52 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)udp_var.h 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: udp6_usrreq.c,v 1.154 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/domain.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/in_offload.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/udp_private.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/udp6_var.h>
#include <netinet6/udp6_private.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/esp.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif
#include "faith.h"
#if defined(NFAITH) && NFAITH > 0
#include <net/if_faith.h>
#endif
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
*/
extern struct inpcbtable udbtable;
percpu_t *udp6stat_percpu;
/* UDP on IP6 parameters */
static int udp6_sendspace = 9216; /* really max datagram size */
static int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6));
/* 40 1K datagrams */
static void udp6_notify(struct inpcb *, int);
static void sysctl_net_inet6_udp6_setup(struct sysctllog **);
#ifdef IPSEC
static int udp6_espinudp(struct mbuf **, int);
#endif
#ifdef UDP_CSUM_COUNTERS
#include <sys/device.h>
struct evcnt udp6_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum bad");
struct evcnt udp6_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum ok");
struct evcnt udp6_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum data");
struct evcnt udp6_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "swcsum");
EVCNT_ATTACH_STATIC(udp6_hwcsum_bad);
EVCNT_ATTACH_STATIC(udp6_hwcsum_ok);
EVCNT_ATTACH_STATIC(udp6_hwcsum_data);
EVCNT_ATTACH_STATIC(udp6_swcsum);
#define UDP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++
#else
#define UDP_CSUM_COUNTER_INCR(ev) /* nothing */
#endif
void
udp6_init(void)
{
sysctl_net_inet6_udp6_setup(NULL);
udp6stat_percpu = percpu_alloc(sizeof(uint64_t) * UDP6_NSTATS);
udp_init_common();
}
/*
* Notify a UDP user of an asynchronous error;
* just wake up so that they can collect error status.
*/
static void
udp6_notify(struct inpcb *inp, int errno)
{
inp->inp_socket->so_error = errno;
sorwakeup(inp->inp_socket);
sowwakeup(inp->inp_socket);
}
void *
udp6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct udphdr uh;
struct ip6_hdr *ip6;
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
struct mbuf *m;
int off;
void *cmdarg;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void (*notify)(struct inpcb *, int) = udp6_notify;
struct udp_portonly {
u_int16_t uh_sport;
u_int16_t uh_dport;
} *uhp;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (cmd == PRC_MSGSIZE) {
/* special code is present, see below */
notify = in6pcb_rtchange;
}
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
off = 0;
}
if (ip6) {
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(*uhp)) {
if (cmd == PRC_MSGSIZE)
icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
return NULL;
}
memset(&uh, 0, sizeof(uh));
m_copydata(m, off, sizeof(*uhp), (void *)&uh);
if (cmd == PRC_MSGSIZE) {
int valid = 0;
/*
* Check to see if we have a valid UDP socket
* corresponding to the address in the ICMPv6 message
* payload.
*/
if (in6pcb_lookup(&udbtable, &sa6->sin6_addr,
uh.uh_dport, (const struct in6_addr *)&sa6_src->sin6_addr,
uh.uh_sport, 0, 0))
valid++;
#if 0
/*
* As the use of sendto(2) is fairly popular,
* we may want to allow non-connected pcb too.
* But it could be too weak against attacks...
* We should at least check if the local address (= s)
* is really ours.
*/
else if (in6pcb_lookup_bound(&udbtable, &sa6->sin6_addr,
uh.uh_dport, 0))
valid++;
#endif
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* Regardless of whether we called
* icmp6_mtudisc_update(), we need to call
* in6pcb_notify(), to notify path MTU change
* to the userland (RFC3542), because some
* unconnected sockets may share the same
* destination and want to know the path MTU.
*/
}
(void)in6pcb_notify(&udbtable, sa, uh.uh_dport,
sin6tocsa(sa6_src), uh.uh_sport, cmd, cmdarg,
notify);
} else {
(void)in6pcb_notify(&udbtable, sa, 0,
sin6tocsa(sa6_src), 0, cmd, cmdarg, notify);
}
return NULL;
}
int
udp6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int s;
int error = 0;
struct inpcb *inp;
int family;
int optval;
family = so->so_proto->pr_domain->dom_family;
s = splsoftnet();
switch (family) {
#ifdef INET
case PF_INET:
if (sopt->sopt_level != IPPROTO_UDP) {
	error = ip_ctloutput(op, so, sopt);
goto end;
}
break;
#endif
#ifdef INET6
case PF_INET6:
if (sopt->sopt_level != IPPROTO_UDP) {
	error = ip6_ctloutput(op, so, sopt);
goto end;
}
break;
#endif
default:
error = EAFNOSUPPORT;
goto end;
}
switch (op) {
case PRCO_SETOPT:
inp = sotoinpcb(so);
switch (sopt->sopt_name) {
case UDP_ENCAP:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch(optval) {
case 0:
inp->inp_flags &= ~IN6P_ESPINUDP;
break;
case UDP_ENCAP_ESPINUDP:
inp->inp_flags |= IN6P_ESPINUDP;
break;
default:
error = EINVAL;
break;
}
break;
default:
error = ENOPROTOOPT;
break;
}
break;
default:
error = EINVAL;
break;
}
end:
splx(s);
return error;
}
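#if 0
/*
 * Hedged userland sketch (not part of this kernel file): enabling
 * ESP-in-UDP decapsulation on a UDP socket through the UDP_ENCAP
 * option handled in udp6_ctloutput() above.  Illustration only.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>

static int
example_enable_espinudp(int s)
{
	int opt = UDP_ENCAP_ESPINUDP;

	/* Level is IPPROTO_UDP, matching the sopt_level check above. */
	return setsockopt(s, IPPROTO_UDP, UDP_ENCAP, &opt, sizeof(opt));
}
#endif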
static void
udp6_sendup(struct mbuf *m, int off /* offset of data portion */,
struct sockaddr *src, struct socket *so)
{
struct mbuf *opts = NULL;
struct mbuf *n;
struct inpcb *inp;
KASSERT(so != NULL);
KASSERT(so->so_proto->pr_domain->dom_family == AF_INET6);
inp = sotoinpcb(so);
KASSERT(inp != NULL);
#if defined(IPSEC)
if (ipsec_used && ipsec_in_reject(m, inp)) {
if ((n = m_copypacket(m, M_DONTWAIT)) != NULL)
icmp6_error(n, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADMIN, 0);
return;
}
#endif
if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) {
if (inp->inp_flags & IN6P_CONTROLOPTS ||
SOOPT_TIMESTAMP(inp->inp_socket->so_options)) {
struct ip6_hdr *ip6 = mtod(n, struct ip6_hdr *);
ip6_savecontrol(inp, &opts, ip6, n);
}
m_adj(n, off);
if (sbappendaddr(&so->so_rcv, src, n, opts) == 0) {
m_freem(n);
if (opts)
m_freem(opts);
UDP6_STATINC(UDP6_STAT_FULLSOCK);
soroverflow(so);
} else
sorwakeup(so);
}
}
int
udp6_realinput(int af, struct sockaddr_in6 *src, struct sockaddr_in6 *dst,
struct mbuf **mp, int off)
{
u_int16_t sport, dport;
int rcvcnt;
struct in6_addr src6, *dst6;
const struct in_addr *dst4;
struct inpcb *inp;
struct mbuf *m = *mp;
rcvcnt = 0;
off += sizeof(struct udphdr); /* now, offset of payload */
if (af != AF_INET && af != AF_INET6)
goto bad;
if (src->sin6_family != AF_INET6 || dst->sin6_family != AF_INET6)
goto bad;
src6 = src->sin6_addr;
if (sa6_recoverscope(src) != 0) {
/* XXX: should be impossible. */
goto bad;
}
sport = src->sin6_port;
dport = dst->sin6_port;
dst4 = (struct in_addr *)&dst->sin6_addr.s6_addr[12];
dst6 = &dst->sin6_addr;
if (IN6_IS_ADDR_MULTICAST(dst6) ||
(af == AF_INET && IN_MULTICAST(dst4->s_addr))) {
/*
* Deliver a multicast or broadcast datagram to *all* sockets
* for which the local and remote addresses and ports match
* those of the incoming datagram. This allows more than
* one process to receive multi/broadcasts on the same port.
* (This really ought to be done for unicast datagrams as
* well, but that would cause problems with existing
* applications that open both address-specific sockets and
* a wildcard socket listening to the same port -- they would
* end up receiving duplicates of every unicast datagram.
* Those applications open the multiple sockets to overcome an
* inadequacy of the UDP socket interface, but for backwards
* compatibility we avoid the problem here rather than
* fixing the interface. Maybe 4.5BSD will remedy this?)
*/
/*
* KAME note: traditionally we dropped udpiphdr from mbuf here.
* we need udpiphdr for IPsec processing so we do that later.
*/
/*
* Locate pcb(s) for datagram.
*/
TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if (inp->inp_lport != dport)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
dst6))
continue;
} else {
if (IN6_IS_ADDR_V4MAPPED(dst6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp),
&src6) || inp->inp_fport != sport)
continue;
} else {
if (IN6_IS_ADDR_V4MAPPED(&src6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
}
udp6_sendup(m, off, sin6tosa(src), inp->inp_socket);
rcvcnt++;
/*
* Don't look for additional matches if this one does
* not have either the SO_REUSEPORT or SO_REUSEADDR
* socket options set. This heuristic avoids searching
* through all pcbs in the common case of a non-shared
* port. It assumes that an application will never
* clear these options after setting them.
*/
if ((inp->inp_socket->so_options &
(SO_REUSEPORT|SO_REUSEADDR)) == 0)
break;
}
} else {
/*
* Locate pcb for datagram.
*/
inp = in6pcb_lookup(&udbtable, &src6, sport, dst6,
dport, 0, 0);
if (inp == NULL) {
UDP_STATINC(UDP_STAT_PCBHASHMISS);
inp = in6pcb_lookup_bound(&udbtable, dst6, dport, 0);
if (inp == NULL)
return rcvcnt;
}
#ifdef IPSEC
/* Handle ESP over UDP */
if (inp->inp_flags & IN6P_ESPINUDP) {
switch (udp6_espinudp(mp, off)) {
case -1: /* Error, m was freed */
rcvcnt = -1;
goto bad;
case 1: /* ESP over UDP */
rcvcnt++;
goto bad;
case 0: /* plain UDP */
default: /* Unexpected */
/*
* Normal UDP processing will take place,
* m may have changed.
*/
m = *mp;
break;
}
}
#endif
if (inp->inp_overudp_cb != NULL) {
int ret;
ret = inp->inp_overudp_cb(mp, off, inp->inp_socket,
sin6tosa(src), inp->inp_overudp_arg);
switch (ret) {
case -1: /* Error, m was freed */
rcvcnt = -1;
goto bad;
case 1: /* Foo over UDP */
KASSERT(*mp == NULL);
rcvcnt++;
goto bad;
case 0: /* plain UDP */
default: /* Unexpected */
/*
* Normal UDP processing will take place,
* m may have changed.
*/
break;
}
}
udp6_sendup(m, off, sin6tosa(src), inp->inp_socket);
rcvcnt++;
}
bad:
return rcvcnt;
}
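/*
 * Illustrative userland sketch (not part of the kernel sources): the
 * multicast branch above delivers a copy of the datagram to every PCB
 * bound to the destination port, but stops after the first match unless
 * that socket has SO_REUSEPORT or SO_REUSEADDR set.  The hypothetical
 * helper below shows the socket setup that lets two listeners share a
 * multicast port; EXAMPLE_PORT and the function name are assumptions,
 * not taken from the sources.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

#define EXAMPLE_PORT	9999	/* hypothetical port */

static int
open_shared_udp6_listener(void)
{
	struct sockaddr_in6 sin6;
	int s, on = 1;

	s = socket(AF_INET6, SOCK_DGRAM, 0);
	if (s == -1)
		return -1;
	/* Without SO_REUSEPORT the PCB walk above stops at the first match. */
	(void)setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	sin6.sin6_port = htons(EXAMPLE_PORT);
	sin6.sin6_addr = in6addr_any;
	if (bind(s, (struct sockaddr *)&sin6, sizeof(sin6)) == -1) {
		close(s);
		return -1;
	}
	/* Each listener would also join the group with IPV6_JOIN_GROUP. */
	return s;
}
#endif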
int
udp6_input_checksum(struct mbuf *m, const struct udphdr *uh, int off, int len)
{
/*
* XXX it's better to record and check if this mbuf is
* already checked.
*/
if (__predict_false((m->m_flags & M_LOOP) && !udp_do_loopback_cksum)) {
goto good;
}
if (uh->uh_sum == 0) {
UDP6_STATINC(UDP6_STAT_NOSUM);
goto bad;
}
switch (m->m_pkthdr.csum_flags &
((m_get_rcvif_NOMPSAFE(m)->if_csum_flags_rx & M_CSUM_UDPv6) |
M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
case M_CSUM_UDPv6|M_CSUM_TCP_UDP_BAD:
UDP_CSUM_COUNTER_INCR(&udp6_hwcsum_bad);
UDP6_STATINC(UDP6_STAT_BADSUM);
goto bad;
#if 0 /* notyet */
case M_CSUM_UDPv6|M_CSUM_DATA:
#endif
case M_CSUM_UDPv6:
/* Checksum was okay. */
UDP_CSUM_COUNTER_INCR(&udp6_hwcsum_ok);
break;
default:
/*
* Need to compute it ourselves. Maybe skip checksum
* on loopback interfaces.
*/
UDP_CSUM_COUNTER_INCR(&udp6_swcsum);
if (in6_cksum(m, IPPROTO_UDP, off, len) != 0) {
UDP6_STATINC(UDP6_STAT_BADSUM);
goto bad;
}
}
good:
return 0;
bad:
return -1;
}
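/*
 * Illustrative sketch (not from the sources): the switch above keys on
 * the packet's csum_flags masked with the receive interface's UDPv6
 * offload capability.  The hypothetical helper below restates the three
 * outcomes -- hardware flagged bad, hardware already verified, or fall
 * back to a software in6_cksum() -- for a given flag combination; the
 * enum and function names are assumptions made for illustration only.
 */
#if 0
enum example_csum_verdict {
	EXAMPLE_CSUM_HW_BAD,	/* hardware flagged a bad checksum */
	EXAMPLE_CSUM_HW_OK,	/* hardware already verified it */
	EXAMPLE_CSUM_SOFTWARE	/* must run in6_cksum() in software */
};

static enum example_csum_verdict
example_classify_csum(int pkt_csum_flags, int ifp_rx_csum_flags)
{
	int key = pkt_csum_flags &
	    ((ifp_rx_csum_flags & M_CSUM_UDPv6) |
	    M_CSUM_TCP_UDP_BAD | M_CSUM_DATA);

	if (key == (M_CSUM_UDPv6 | M_CSUM_TCP_UDP_BAD))
		return EXAMPLE_CSUM_HW_BAD;
	if (key == M_CSUM_UDPv6)
		return EXAMPLE_CSUM_HW_OK;
	return EXAMPLE_CSUM_SOFTWARE;
}
#endif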
int
udp6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
int off = *offp;
struct sockaddr_in6 src, dst;
struct ip6_hdr *ip6;
struct udphdr *uh;
u_int32_t plen, ulen;
ip6 = mtod(m, struct ip6_hdr *);
#if defined(NFAITH) && 0 < NFAITH
if (faithprefix(&ip6->ip6_dst)) {
/* send icmp6 host unreach? */
m_freem(m);
return IPPROTO_DONE;
}
#endif
UDP6_STATINC(UDP6_STAT_IPACKETS);
/* Check for jumbogram is done in ip6_input. We can trust pkthdr.len. */
plen = m->m_pkthdr.len - off;
IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(struct udphdr));
if (uh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return IPPROTO_DONE;
}
/*
* Enforce alignment requirements that are violated in
* some cases, see kern/50766 for details.
*/
if (ACCESSIBLE_POINTER(uh, struct udphdr) == 0) {
m = m_copyup(m, off + sizeof(struct udphdr), 0);
if (m == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return IPPROTO_DONE;
}
ip6 = mtod(m, struct ip6_hdr *);
uh = (struct udphdr *)(mtod(m, char *) + off);
}
KASSERT(ACCESSIBLE_POINTER(uh, struct udphdr));
ulen = ntohs((u_short)uh->uh_ulen);
/*
* RFC2675 section 4: jumbograms will have 0 in the UDP header field,
* iff payload length > 0xffff.
*/
if (ulen == 0 && plen > 0xffff)
ulen = plen;
if (plen != ulen) {
UDP6_STATINC(UDP6_STAT_BADLEN);
goto bad;
}
/* destination port of 0 is illegal, based on RFC768. */
if (uh->uh_dport == 0)
goto bad;
/*
* Checksum extended UDP header and data. Maybe skip checksum
* on loopback interfaces.
*/
if (udp6_input_checksum(m, uh, off, ulen))
goto bad;
/*
* Construct source and dst sockaddrs.
*/
memset(&src, 0, sizeof(src));
src.sin6_family = AF_INET6;
src.sin6_len = sizeof(struct sockaddr_in6);
src.sin6_addr = ip6->ip6_src;
src.sin6_port = uh->uh_sport;
memset(&dst, 0, sizeof(dst));
dst.sin6_family = AF_INET6;
dst.sin6_len = sizeof(struct sockaddr_in6);
dst.sin6_addr = ip6->ip6_dst;
dst.sin6_port = uh->uh_dport;
if (udp6_realinput(AF_INET6, &src, &dst, &m, off) == 0) {
if (m->m_flags & M_MCAST) {
UDP6_STATINC(UDP6_STAT_NOPORTMCAST);
goto bad;
}
UDP6_STATINC(UDP6_STAT_NOPORT);
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
m = NULL;
}
bad:
if (m)
m_freem(m);
return IPPROTO_DONE;
}
int
udp6_output(struct inpcb * const inp, struct mbuf *m,
struct sockaddr_in6 * const addr6, struct mbuf * const control,
struct lwp * const l)
{
u_int32_t ulen = m->m_pkthdr.len;
u_int32_t plen = sizeof(struct udphdr) + ulen;
struct ip6_hdr *ip6;
struct udphdr *udp6;
struct in6_addr _laddr, *laddr, *faddr;
struct in6_addr laddr_mapped; /* XXX ugly */
struct sockaddr_in6 *sin6 = NULL;
struct ifnet *oifp = NULL;
int scope_ambiguous = 0;
u_int16_t fport;
int error = 0;
struct ip6_pktopts *optp = NULL;
struct ip6_pktopts opt;
int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
#ifdef INET
struct ip *ip;
struct udpiphdr *ui;
int flags = 0;
#endif
struct sockaddr_in6 tmp;
if (addr6) {
sin6 = addr6;
if (sin6->sin6_len != sizeof(*sin6)) {
error = EINVAL;
goto release;
}
if (sin6->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
goto release;
}
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
/*
* Applications should provide a proper zone ID, or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined,
* we'll see if we can determine the outgoing interface. If we
* can, determine the zone ID based on the interface below.
*/
if (sin6->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
goto release;
}
if (control) {
if (__predict_false(l == NULL)) {
panic("%s: control but no lwp", __func__);
}
if ((error = ip6_setpktopts(control, &opt,
in6p_outputopts(inp), l->l_cred, IPPROTO_UDP)) != 0)
goto release;
optp = &opt;
} else
optp = in6p_outputopts(inp);
if (sin6) {
/*
* Slightly different than v4 version in that we call
* in6_selectsrc and in6pcb_set_port to fill in the local
* address and port rather than inpcb_connect. inpcb_connect
* sets inp_faddr which causes EISCONN below to be hit on
* subsequent sendto.
*/
if (sin6->sin6_port == 0) {
error = EADDRNOTAVAIL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
/* how about ::ffff:0.0.0.0 case? */
error = EISCONN;
goto release;
}
faddr = &sin6->sin6_addr;
fport = sin6->sin6_port; /* allow 0 port */
if (IN6_IS_ADDR_V4MAPPED(faddr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* I believe we should explicitly discard the
* packet when mapped addresses are disabled,
* rather than send the packet as an IPv6 one.
* If we chose the latter approach, the packet
* might be sent out on the wire based on the
* default route, the situation which we'd
* probably want to avoid.
* (20010421 jinmei@kame.net)
*/
error = EINVAL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
!IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) {
/*
* when remote addr is an IPv4-mapped address,
* local addr should not be an IPv6 address,
* since you cannot determine how to map IPv6
* source address to IPv4.
*/
error = EINVAL;
goto release;
}
af = AF_INET;
}
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
struct psref psref;
int bound = curlwp_bind();
error = in6_selectsrc(sin6, optp,
in6p_moptions(inp),
&inp->inp_route,
&in6p_laddr(inp), &oifp, &psref, &_laddr);
if (error)
laddr = NULL;
else
laddr = &_laddr;
if (oifp && scope_ambiguous &&
(error = in6_setscope(&sin6->sin6_addr,
oifp, NULL))) {
if_put(oifp, &psref);
curlwp_bindx(bound);
goto release;
}
if_put(oifp, &psref);
curlwp_bindx(bound);
} else {
/*
* XXX: freebsd[34] does not have in_selectsrc, but
* we can omit the whole part because freebsd4 calls
* udp_output() directly in this case, and thus we'll
* never see this path.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
struct sockaddr_in sin_dst;
struct in_addr ina;
struct in_ifaddr *ia4;
struct psref _psref;
int bound;
memcpy(&ina, &faddr->s6_addr[12], sizeof(ina));
sockaddr_in_init(&sin_dst, &ina, 0);
bound = curlwp_bind();
ia4 = in_selectsrc(&sin_dst, &inp->inp_route,
inp->inp_socket->so_options, NULL,
&error, &_psref);
if (ia4 == NULL) {
curlwp_bindx(bound);
if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
memset(&laddr_mapped, 0, sizeof(laddr_mapped));
laddr_mapped.s6_addr16[5] = 0xffff; /* ugly */
memcpy(&laddr_mapped.s6_addr[12],
&IA_SIN(ia4)->sin_addr,
sizeof(IA_SIN(ia4)->sin_addr));
ia4_release(ia4, &_psref);
curlwp_bindx(bound);
laddr = &laddr_mapped;
} else {
laddr = &in6p_laddr(inp); /* XXX */
}
}
if (laddr == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
if (inp->inp_lport == 0) {
/*
* Craft a sockaddr_in6 for the local endpoint. Use the
* "any" as a base, set the address, and recover the
* scope.
*/
struct sockaddr_in6 lsin6 =
*((const struct sockaddr_in6 *)inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin6.sin6_addr = *laddr;
error = sa6_recoverscope(&lsin6);
if (error)
goto release;
error = in6pcb_set_port(&lsin6, inp, l);
if (error) {
in6p_laddr(inp) = in6addr_any;
goto release;
}
}
} else {
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
error = ENOTCONN;
goto release;
}
if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* XXX: this case would happen when the
* application sets the V6ONLY flag after
* connecting the foreign address.
* Such applications should be fixed,
* so we bark here.
*/
log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
"option was set for a connected socket\n");
error = EINVAL;
goto release;
} else
af = AF_INET;
}
laddr = &in6p_laddr(inp);
faddr = &in6p_faddr(inp);
fport = inp->inp_fport;
}
if (af == AF_INET)
hlen = sizeof(struct ip);
/*
* Calculate data length and get a mbuf
* for UDP and IP6 headers.
*/
M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
/*
* Stuff checksum and output datagram.
*/
udp6 = (struct udphdr *)(mtod(m, char *) + hlen);
udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
udp6->uh_dport = fport;
if (plen <= 0xffff)
udp6->uh_ulen = htons((u_int16_t)plen);
else
udp6->uh_ulen = 0;
udp6->uh_sum = 0;
switch (af) {
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
#if 0 /* ip6_plen will be filled in ip6_output. */
ip6->ip6_plen = htons((u_int16_t)plen);
#endif
ip6->ip6_nxt = IPPROTO_UDP;
ip6->ip6_hlim = in6pcb_selecthlim_rt(inp);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
udp6->uh_sum = in6_cksum_phdr(laddr, faddr,
htonl(plen), htonl(IPPROTO_UDP));
m->m_pkthdr.csum_flags = M_CSUM_UDPv6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
UDP6_STATINC(UDP6_STAT_OPACKETS);
error = ip6_output(m, optp, &inp->inp_route, 0,
in6p_moptions(inp), inp, NULL);
break;
case AF_INET:
#ifdef INET
/* can't transmit jumbogram over IPv4 */
if (plen > 0xffff) {
error = EMSGSIZE;
goto release;
}
ip = mtod(m, struct ip *);
ui = (struct udpiphdr *)ip;
memset(ui->ui_x1, 0, sizeof(ui->ui_x1));
ui->ui_pr = IPPROTO_UDP;
ui->ui_len = htons(plen);
memcpy(&ui->ui_src, &laddr->s6_addr[12], sizeof(ui->ui_src));
ui->ui_ulen = ui->ui_len;
flags = (inp->inp_socket->so_options &
(SO_DONTROUTE | SO_BROADCAST));
memcpy(&ui->ui_dst, &faddr->s6_addr[12], sizeof(ui->ui_dst));
udp6->uh_sum = in_cksum(m, hlen + plen);
if (udp6->uh_sum == 0)
udp6->uh_sum = 0xffff;
ip->ip_len = htons(hlen + plen);
ip->ip_ttl = in6pcb_selecthlim(inp, NULL); /* XXX */
ip->ip_tos = 0; /* XXX */
UDP_STATINC(UDP_STAT_OPACKETS);
error = ip_output(m, NULL, &inp->inp_route, flags /* XXX */,
inp->inp_moptions, NULL);
break;
#else
error = EAFNOSUPPORT;
goto release;
#endif
}
goto releaseopt;
release:
m_freem(m);
releaseopt:
if (control) {
if (optp == &opt)
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return (error);
}
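/*
 * Illustrative userland sketch (not part of the sources): sending to a
 * v4-mapped destination such as ::ffff:192.0.2.1 on an AF_INET6 socket
 * with IPV6_V6ONLY disabled takes the AF_INET branch of udp6_output()
 * above, which emits a plain IPv4 UDP packet.  The address and port
 * below are documentation examples, not values from the sources.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

static int
send_to_v4mapped(int s)
{
	struct sockaddr_in6 dst;
	int off = 0;
	static const char msg[] = "hello";

	/* Allow v4-mapped use; with IPV6_V6ONLY set udp6_output() fails with EINVAL. */
	(void)setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off));
	memset(&dst, 0, sizeof(dst));
	dst.sin6_family = AF_INET6;
	dst.sin6_len = sizeof(dst);
	dst.sin6_port = htons(9999);		/* hypothetical port */
	inet_pton(AF_INET6, "::ffff:192.0.2.1", &dst.sin6_addr);
	return sendto(s, msg, sizeof(msg) - 1, 0,
	    (const struct sockaddr *)&dst, sizeof(dst)) == -1 ? -1 : 0;
}
#endif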
static int
udp6_attach(struct socket *so, int proto)
{
struct inpcb *inp;
int s, error;
KASSERT(sotoinpcb(so) == NULL);
sosetlock(so);
error = soreserve(so, udp6_sendspace, udp6_recvspace);
if (error) {
return error;
}
/*
* MAPPED_ADDR implementation spec:
* Always attach for IPv6, and only when necessary for IPv4.
*/
s = splsoftnet();
error = inpcb_create(so, &udbtable);
splx(s);
if (error) {
return error;
}
inp = sotoinpcb(so);
in6p_cksum(inp) = -1; /* just to be sure */
KASSERT(solocked(so));
return 0;
}
static void
udp6_detach(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
s = splsoftnet();
inpcb_destroy(inp);
splx(s);
}
static int
udp6_accept(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
int error = 0;
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
s = splsoftnet();
error = in6pcb_bind(inp, sin6, l);
splx(s);
return error;
}
static int
udp6_listen(struct socket *so, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
int error = 0;
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
return EISCONN;
s = splsoftnet();
error = in6pcb_connect(inp, (struct sockaddr_in6 *)nam, l);
splx(s);
if (error == 0)
soisconnected(so);
return error;
}
static int
udp6_connect2(struct socket *so, struct socket *so2)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_disconnect(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
return ENOTCONN;
s = splsoftnet();
in6pcb_disconnect(inp);
memset((void *)&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
splx(s);
so->so_state &= ~SS_ISCONNECTED; /* XXX */
in6pcb_set_state(inp, INP_BOUND); /* XXX */
return 0;
}
static int
udp6_shutdown(struct socket *so)
{
int s;
s = splsoftnet();
socantsendmore(so);
splx(s);
return 0;
}
static int
udp6_abort(struct socket *so)
{
int s;
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
s = splsoftnet();
soisdisconnected(so);
inpcb_destroy(sotoinpcb(so));
splx(s);
return 0;
}
static int
udp6_ioctl(struct socket *so, u_long cmd, void *addr6, struct ifnet *ifp)
{
/*
* MAPPED_ADDR implementation info:
* Mapped address support for PRU_CONTROL is not necessary,
* because the typical users of PRU_CONTROL (ifconfig and the
* like) do not associate any address with their socket. The
* socket family is then only a hint about the address family
* being controlled, especially when fetching addresses from
* the kernel. So an AF_INET socket must be used to control
* AF_INET addresses, and an AF_INET6 socket for AF_INET6 ones.
*/
return in6_control(so, cmd, addr6, ifp);
}
static int
udp6_stat(struct socket *so, struct stat *ub)
{
KASSERT(solocked(so));
/* stat: don't bother with a blocksize */
return 0;
}
static int
udp6_peeraddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
KASSERT(nam != NULL);
in6pcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
udp6_sockaddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
KASSERT(nam != NULL);
in6pcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
udp6_rcvd(struct socket *so, int flags, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
int error = 0;
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
KASSERT(m != NULL);
s = splsoftnet();
error = udp6_output(inp, m, (struct sockaddr_in6 *)nam, control, l);
splx(s);
return error;
}
static int
udp6_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
static int
udp6_purgeif(struct socket *so, struct ifnet *ifp)
{
mutex_enter(softnet_lock);
in6pcb_purgeif0(&udbtable, ifp);
#ifdef NET_MPSAFE
mutex_exit(softnet_lock);
#endif
in6_purgeif(ifp);
#ifdef NET_MPSAFE
mutex_enter(softnet_lock);
#endif
in6pcb_purgeif(&udbtable, ifp);
mutex_exit(softnet_lock);
return 0;
}
static int
sysctl_net_inet6_udp6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(udp6stat_percpu, UDP6_NSTATS));
}
static void
sysctl_net_inet6_udp6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "udp6",
SYSCTL_DESCR("UDPv6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sendspace",
SYSCTL_DESCR("Default UDP send buffer size"),
NULL, 0, &udp6_sendspace, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_SENDSPACE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "recvspace",
SYSCTL_DESCR("Default UDP receive buffer size"),
NULL, 0, &udp6_recvspace, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_RECVSPACE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "do_loopback_cksum",
SYSCTL_DESCR("Perform UDP checksum on loopback"),
NULL, 0, &udp_do_loopback_cksum, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_LOOPBACKCKSUM,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pcblist",
SYSCTL_DESCR("UDP protocol control block list"),
sysctl_inpcblist, 0, &udbtable, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, CTL_CREATE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("UDPv6 statistics"),
sysctl_net_inet6_udp6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_STATS,
CTL_EOL);
}
void
udp6_statinc(u_int stat)
{
KASSERT(stat < UDP6_NSTATS);
UDP6_STATINC(stat);
}
#ifdef IPSEC
/*
* Returns:
* 1 if the packet was processed
* 0 if normal UDP processing should take place
* -1 if an error occurred and m was freed
*/
static int
udp6_espinudp(struct mbuf **mp, int off)
{
const size_t skip = sizeof(struct udphdr);
size_t len;
void *data;
size_t minlen;
int ip6hdrlen;
struct ip6_hdr *ip6;
struct m_tag *tag;
struct udphdr *udphdr;
u_int16_t sport, dport;
struct mbuf *m = *mp;
uint32_t *marker;
/*
* Collapse the mbuf chain if the first mbuf is too short.
* The longest case is: UDP + non-ESP marker + ESP.
*/
minlen = off + sizeof(u_int64_t) + sizeof(struct esp);
if (minlen > m->m_pkthdr.len)
minlen = m->m_pkthdr.len;
if (m->m_len < minlen) {
if ((*mp = m_pullup(m, minlen)) == NULL) {
return -1;
}
m = *mp;
}
len = m->m_len - off;
data = mtod(m, char *) + off;
/* Ignore keepalive packets */
if ((len == 1) && (*(unsigned char *)data == 0xff)) {
m_freem(m);
*mp = NULL; /* avoid any further processing by caller ... */
return 1;
}
/* Handle Non-ESP marker (32bit). If zero, then IKE. */
marker = (uint32_t *)data;
if (len <= sizeof(uint32_t))
return 0;
if (marker[0] == 0)
return 0;
/*
* Get the UDP ports. They are handled in network
* order everywhere in IPSEC_NAT_T code.
*/
udphdr = (struct udphdr *)((char *)data - skip);
sport = udphdr->uh_sport;
dport = udphdr->uh_dport;
/*
* Remove the UDP header (and possibly the non ESP marker)
* IPv6 header length is ip6hdrlen
* Before:
* <---- off --->
* +-----+------+-----+
* | IP6 | UDP | ESP |
* +-----+------+-----+
* <-skip->
* After:
* +-----+-----+
* | IP6 | ESP |
* +-----+-----+
* <-skip->
*/
ip6hdrlen = off - sizeof(struct udphdr);
memmove(mtod(m, char *) + skip, mtod(m, void *), ip6hdrlen);
m_adj(m, skip);
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - skip);
ip6->ip6_nxt = IPPROTO_ESP;
/*
* We have modified the packet - it is now ESP, so we should not
* return to UDP processing ...
*
* Add a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember
* the source UDP port. This is required if we want
* to select the right SPD for multiple hosts behind
* the same NAT.
*/
if ((tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
sizeof(sport) + sizeof(dport), M_DONTWAIT)) == NULL) {
m_freem(m);
return -1;
}
((u_int16_t *)(tag + 1))[0] = sport;
((u_int16_t *)(tag + 1))[1] = dport;
m_tag_prepend(m, tag);
if (ipsec_used)
ipsec6_common_input(&m, &ip6hdrlen, IPPROTO_ESP);
else
m_freem(m);
/* We handled it, it shouldn't be handled by UDP */
*mp = NULL; /* avoid free by caller ... */
return 1;
}
#endif /* IPSEC */
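/*
 * Illustrative sketch (not from the sources): udp6_espinudp() above
 * distinguishes three payload types on a NAT-T socket -- a one-byte 0xff
 * keepalive, a packet carrying a zero non-ESP marker (treated as plain
 * UDP, i.e. IKE), and a real ESP packet.  The hypothetical classifier
 * below restates that decision for a plain byte buffer; all names here
 * are illustrative assumptions.
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <string.h>

enum example_natt_kind {
	EXAMPLE_NATT_KEEPALIVE,		/* one byte, 0xff: drop silently */
	EXAMPLE_NATT_PLAIN_UDP,		/* zero non-ESP marker or short: IKE */
	EXAMPLE_NATT_ESP		/* anything else: hand to ESP input */
};

static enum example_natt_kind
example_classify_natt(const uint8_t *payload, size_t len)
{
	uint32_t marker;

	if (len == 1 && payload[0] == 0xff)
		return EXAMPLE_NATT_KEEPALIVE;
	if (len <= sizeof(marker))
		return EXAMPLE_NATT_PLAIN_UDP;	/* too short for ESP */
	memcpy(&marker, payload, sizeof(marker));
	if (marker == 0)
		return EXAMPLE_NATT_PLAIN_UDP;	/* non-ESP marker present */
	return EXAMPLE_NATT_ESP;
}
#endif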
PR_WRAP_USRREQS(udp6)
#define udp6_attach udp6_attach_wrapper
#define udp6_detach udp6_detach_wrapper
#define udp6_accept udp6_accept_wrapper
#define udp6_bind udp6_bind_wrapper
#define udp6_listen udp6_listen_wrapper
#define udp6_connect udp6_connect_wrapper
#define udp6_connect2 udp6_connect2_wrapper
#define udp6_disconnect udp6_disconnect_wrapper
#define udp6_shutdown udp6_shutdown_wrapper
#define udp6_abort udp6_abort_wrapper
#define udp6_ioctl udp6_ioctl_wrapper
#define udp6_stat udp6_stat_wrapper
#define udp6_peeraddr udp6_peeraddr_wrapper
#define udp6_sockaddr udp6_sockaddr_wrapper
#define udp6_rcvd udp6_rcvd_wrapper
#define udp6_recvoob udp6_recvoob_wrapper
#define udp6_send udp6_send_wrapper
#define udp6_sendoob udp6_sendoob_wrapper
#define udp6_purgeif udp6_purgeif_wrapper
const struct pr_usrreqs udp6_usrreqs = {
.pr_attach = udp6_attach,
.pr_detach = udp6_detach,
.pr_accept = udp6_accept,
.pr_bind = udp6_bind,
.pr_listen = udp6_listen,
.pr_connect = udp6_connect,
.pr_connect2 = udp6_connect2,
.pr_disconnect = udp6_disconnect,
.pr_shutdown = udp6_shutdown,
.pr_abort = udp6_abort,
.pr_ioctl = udp6_ioctl,
.pr_stat = udp6_stat,
.pr_peeraddr = udp6_peeraddr,
.pr_sockaddr = udp6_sockaddr,
.pr_rcvd = udp6_rcvd,
.pr_recvoob = udp6_recvoob,
.pr_send = udp6_send,
.pr_sendoob = udp6_sendoob,
.pr_purgeif = udp6_purgeif,
};
/* $NetBSD: scope6.c,v 1.23 2020/06/16 17:12:18 maxv Exp $ */
/* $KAME$ */
/*
* Copyright (C) 2000 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scope6.c,v 1.23 2020/06/16 17:12:18 maxv Exp $");
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet6/scope6_var.h>
#ifdef ENABLE_DEFAULT_SCOPE
int ip6_use_defzone = 1;
#else
int ip6_use_defzone = 0;
#endif
static struct scope6_id sid_default;
#define SID(ifp) \
((ifp)->if_afdata[AF_INET6] == NULL ? NULL : \
((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id)
void
scope6_init(void)
{
memset(&sid_default, 0, sizeof(sid_default));
}
struct scope6_id *
scope6_ifattach(struct ifnet *ifp)
{
struct scope6_id *sid;
sid = malloc(sizeof(*sid), M_IFADDR, M_WAITOK | M_ZERO);
/*
* XXX: IPV6_ADDR_SCOPE_xxx macros are not standard.
* Should we rather hardcode here?
*/
sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index;
sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index;
#ifdef MULTI_SCOPE
/* by default, we don't care about scope boundary for these scopes. */
sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL] = 1;
sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL] = 1;
#endif
return sid;
}
void
scope6_ifdetach(struct scope6_id *sid)
{
free(sid, M_IFADDR);
}
/*
* Get a scope of the address. Interface-local, link-local, site-local
* or global.
*/
int
in6_addrscope(const struct in6_addr *addr)
{
int scope;
if (addr->s6_addr[0] == 0xfe) {
scope = addr->s6_addr[1] & 0xc0;
switch (scope) {
case 0x80:
return IPV6_ADDR_SCOPE_LINKLOCAL;
case 0xc0:
return IPV6_ADDR_SCOPE_SITELOCAL;
default:
return IPV6_ADDR_SCOPE_GLOBAL; /* just in case */
}
}
if (addr->s6_addr[0] == 0xff) {
scope = addr->s6_addr[1] & 0x0f;
/*
* Because of other scope values, such as reserved ones,
* simply returning the raw scope value doesn't work here.
*/
switch (scope) {
case IPV6_ADDR_SCOPE_INTFACELOCAL:
return IPV6_ADDR_SCOPE_INTFACELOCAL;
case IPV6_ADDR_SCOPE_LINKLOCAL:
return IPV6_ADDR_SCOPE_LINKLOCAL;
case IPV6_ADDR_SCOPE_SITELOCAL:
return IPV6_ADDR_SCOPE_SITELOCAL;
default:
return IPV6_ADDR_SCOPE_GLOBAL;
}
}
if (memcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) {
if (addr->s6_addr[15] == 1) /* loopback */
return IPV6_ADDR_SCOPE_LINKLOCAL;
if (addr->s6_addr[15] == 0) {
/*
* Regard the unspecified addresses as global,
* since it has no ambiguity.
* XXX: not sure if it's correct...
*/
return IPV6_ADDR_SCOPE_GLOBAL;
}
}
return IPV6_ADDR_SCOPE_GLOBAL;
}
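/*
 * Illustrative userland sketch (not from the sources): a minimal mirror
 * of the classification above, applied to textbook addresses.  fe80::1
 * is link-local, fec0::1 site-local, ff02::1 a link-local multicast
 * group, ::1 link-local (loopback) and 2001:db8::1 global.  The helper
 * name and sample addresses are assumptions for illustration only.
 */
#if 0
#include <netinet/in.h>
#include <arpa/inet.h>

static const char *
example_scope_name(const char *txt)
{
	struct in6_addr a;

	if (inet_pton(AF_INET6, txt, &a) != 1)
		return "invalid";
	if (a.s6_addr[0] == 0xfe && (a.s6_addr[1] & 0xc0) == 0x80)
		return "link-local";
	if (a.s6_addr[0] == 0xfe && (a.s6_addr[1] & 0xc0) == 0xc0)
		return "site-local";
	if (a.s6_addr[0] == 0xff)
		return (a.s6_addr[1] & 0x0f) == 0x02 ?
		    "link-local multicast" : "other multicast scope";
	if (IN6_IS_ADDR_LOOPBACK(&a))
		return "link-local (loopback)";
	return "global";
}
#endif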
uint32_t
scope6_addr2default(const struct in6_addr *addr)
{
uint32_t id;
/*
* special case: The loopback address should be considered as
* link-local, but there's no ambiguity in the syntax.
*/
if (IN6_IS_ADDR_LOOPBACK(addr))
return 0;
/*
* XXX: 32-bit read is atomic on all our platforms, is it OK
* not to lock here?
*/
id = sid_default.s6id_list[in6_addrscope(addr)];
return id;
}
/*
* Validate the specified scope zone ID in the sin6_scope_id field. If the ID
* is unspecified (=0), needs to be specified, and the default zone ID can be
* used, the default value will be used.
* This routine then generates the kernel-internal form: if the scope of the
* address is interface-local or link-local, embed the interface index in the
* address.
*/
int
sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok)
{
struct ifnet *ifp;
uint32_t zoneid;
if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok)
zoneid = scope6_addr2default(&sin6->sin6_addr);
if (zoneid != 0 && (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) {
int s;
/*
* At this moment, we only check interface-local and
* link-local scope IDs, and use interface indices as the
* zone IDs assuming a one-to-one mapping between interfaces
* and links.
*/
s = pserialize_read_enter();
ifp = if_byindex(zoneid);
if (ifp == NULL) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
/* XXX assignment to 16bit from 32bit variable */
sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff);
sin6->sin6_scope_id = 0;
}
return 0;
}
struct sockaddr *
sockaddr_in6_externalize(struct sockaddr *dst, socklen_t socklen,
const struct sockaddr *src)
{
struct sockaddr_in6 *sin6;
sin6 = satosin6(sockaddr_copy(dst, socklen, src));
if (sin6 == NULL || sa6_recoverscope(sin6) != 0)
return NULL;
return dst;
}
/*
* generate standard sockaddr_in6 from embedded form.
*/
int
sa6_recoverscope(struct sockaddr_in6 *sin6)
{
uint32_t zoneid;
char ip6buf[INET6_ADDRSTRLEN];
if (sin6->sin6_scope_id != 0) {
log(LOG_NOTICE,
"%s: assumption failure (non 0 ID): %s%%%d\n", __func__,
IN6_PRINT(ip6buf, &sin6->sin6_addr), sin6->sin6_scope_id);
/* XXX: proceed anyway... */
}
if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) {
/*
* KAME assumption: link id == interface id
*/
zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]);
if (zoneid) {
int s = pserialize_read_enter();
if (!if_byindex(zoneid)) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id = zoneid;
}
}
return 0;
}
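/*
 * Illustrative sketch (not from the sources): the kernel-internal form
 * embeds the zone ID of a link-local address in the third and fourth
 * address bytes and clears sin6_scope_id; sa6_recoverscope() above
 * reverses that.  The snippet below walks a hypothetical fe80::1%2
 * sockaddr through both directions by hand; the zone ID value 2 is just
 * an example interface index.
 */
#if 0
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static void
example_scope_roundtrip(void)
{
	struct sockaddr_in6 sin6;

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	inet_pton(AF_INET6, "fe80::1", &sin6.sin6_addr);
	sin6.sin6_scope_id = 2;			/* hypothetical if_index */

	/* Embed: what sa6_embedscope() does for a link-local scope. */
	sin6.sin6_addr.s6_addr[2] = (sin6.sin6_scope_id >> 8) & 0xff;
	sin6.sin6_addr.s6_addr[3] = sin6.sin6_scope_id & 0xff;
	sin6.sin6_scope_id = 0;

	/* Recover: what sa6_recoverscope() does. */
	sin6.sin6_scope_id = (sin6.sin6_addr.s6_addr[2] << 8) |
	    sin6.sin6_addr.s6_addr[3];
	sin6.sin6_addr.s6_addr[2] = 0;
	sin6.sin6_addr.s6_addr[3] = 0;
}
#endif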
int
in6_setzoneid(struct in6_addr *in6, uint32_t zoneid)
{
if (IN6_IS_SCOPE_EMBEDDABLE(in6))
in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */
return 0;
}
/*
* Determine the appropriate scope zone ID for in6 and ifp. If ret_id is
* non NULL, it is set to the zone ID. If the zone ID needs to be embedded
* in the in6_addr structure, in6 will be modified.
*/
int
in6_setscope(struct in6_addr *in6, const struct ifnet *ifp, uint32_t *ret_id)
{
int scope;
uint32_t zoneid = 0;
const struct scope6_id *sid = SID(ifp);
if (sid == NULL) {
log(LOG_NOTICE, "%s: no scope id for %s\n", __func__,
if_name(ifp));
return EINVAL;
}
/*
* special case: the loopback address can only belong to a loopback
* interface.
*/
if (IN6_IS_ADDR_LOOPBACK(in6)) {
if (!(ifp->if_flags & IFF_LOOPBACK)) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_NOTICE, "%s: can't set scope for not loopback "
"interface %s and loopback address %s\n",
__func__, if_name(ifp), IN6_PRINT(ip6buf, in6));
return EINVAL;
} else {
if (ret_id != NULL)
*ret_id = 0; /* there's no ambiguity */
return 0;
}
}
scope = in6_addrscope(in6);
switch (scope) {
case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL];
break;
case IPV6_ADDR_SCOPE_LINKLOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL];
break;
case IPV6_ADDR_SCOPE_SITELOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL];
break;
case IPV6_ADDR_SCOPE_ORGLOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL];
break;
default:
zoneid = 0; /* XXX: treat as global. */
break;
}
if (ret_id != NULL)
*ret_id = zoneid;
return in6_setzoneid(in6, zoneid);
}
const char *
in6_getscopename(const struct in6_addr *addr)
{
switch (in6_addrscope(addr)) {
case IPV6_ADDR_SCOPE_INTFACELOCAL:
return "interface";
#if IPV6_ADDR_SCOPE_INTFACELOCAL != IPV6_ADDR_SCOPE_NODELOCAL
case IPV6_ADDR_SCOPE_NODELOCAL:
return "node";
#endif
case IPV6_ADDR_SCOPE_LINKLOCAL:
return "link";
case IPV6_ADDR_SCOPE_SITELOCAL:
return "site";
case IPV6_ADDR_SCOPE_ORGLOCAL:
return "organization";
case IPV6_ADDR_SCOPE_GLOBAL:
return "global";
default:
return "unknown";
}
}
/*
* Just clear the embedded scope identifier. Return 0 if the original address
* is intact; return non-zero if the address is modified.
*/
int
in6_clearscope(struct in6_addr *in6)
{
int modified = 0;
if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) {
if (in6->s6_addr16[1] != 0)
modified = 1;
in6->s6_addr16[1] = 0;
}
return modified;
}
/* $NetBSD: kern_stub.c,v 1.50 2020/08/01 02:04:55 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_xxx.c 8.3 (Berkeley) 3/29/95
*/
/*
* Stubs for system calls and facilities not included in the system.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_stub.c,v 1.50 2020/08/01 02:04:55 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ktrace.h"
#include "opt_sysv.h"
#include "opt_modular.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fstypes.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/ktrace.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/userconf.h>
bool default_bus_space_is_equal(bus_space_tag_t, bus_space_tag_t);
bool default_bus_space_handle_is_equal(bus_space_tag_t, bus_space_handle_t,
bus_space_handle_t);
/*
* SYSV Semaphores, Shared Memory, Message Queues
*/
#ifndef MODULAR
#ifndef SYSVMSG
__strong_alias(msgctl1,enosys);
#endif
#ifndef SYSVSHM
__strong_alias(shmctl1,enosys);
#endif
#ifndef SYSVSEM
__strong_alias(semctl1,enosys);
#endif
#endif
/*
* ktrace stubs. ktruser() goes to enosys as we want to fail the syscall,
* but not kill the process: utrace() is a debugging feature.
*/
#ifndef KTRACE
__strong_alias(ktr_csw,nullop); /* Probes */
__strong_alias(ktr_emul,nullop);
__strong_alias(ktr_geniov,nullop);
__strong_alias(ktr_genio,nullop);
__strong_alias(ktr_mibio,nullop);
__strong_alias(ktr_namei,nullop);
__strong_alias(ktr_namei2,nullop);
__strong_alias(ktr_psig,nullop);
__strong_alias(ktr_syscall,nullop);
__strong_alias(ktr_sysret,nullop);
__strong_alias(ktr_kuser,nullop);
__strong_alias(ktr_mib,nullop);
__strong_alias(ktr_execarg,nullop);
__strong_alias(ktr_execenv,nullop);
__strong_alias(ktr_execfd,nullop);
__strong_alias(sys_fktrace,sys_nosys); /* Syscalls */
__strong_alias(sys_ktrace,sys_nosys);
__strong_alias(sys_utrace,sys_nosys);
int ktrace_on; /* Misc */
__strong_alias(ktruser,enosys);
__strong_alias(ktr_point,nullop);
#endif /* KTRACE */
__weak_alias(device_register, voidop);
__weak_alias(device_register_post_config, voidop);
__weak_alias(spldebug_start, voidop);
__weak_alias(spldebug_stop, voidop);
__weak_alias(machdep_init,nullop);
__weak_alias(pci_chipset_tag_create, eopnotsupp);
__weak_alias(pci_chipset_tag_destroy, voidop);
__weak_alias(bus_space_reserve, eopnotsupp);
__weak_alias(bus_space_reserve_subregion, eopnotsupp);
__weak_alias(bus_space_release, voidop);
__weak_alias(bus_space_reservation_map, eopnotsupp);
__weak_alias(bus_space_reservation_unmap, voidop);
__weak_alias(bus_dma_tag_create, eopnotsupp);
__weak_alias(bus_dma_tag_destroy, voidop);
__weak_alias(bus_space_tag_create, eopnotsupp);
__weak_alias(bus_space_tag_destroy, voidop);
__strict_weak_alias(bus_space_is_equal, default_bus_space_is_equal);
__strict_weak_alias(bus_space_handle_is_equal,
default_bus_space_handle_is_equal);
__weak_alias(userconf_bootinfo, voidop);
__weak_alias(userconf_init, voidop);
__weak_alias(userconf_prompt, voidop);
__weak_alias(kobj_renamespace, nullop);
__weak_alias(interrupt_get_count, nullop);
__weak_alias(interrupt_get_assigned, voidop);
__weak_alias(interrupt_get_available, voidop);
__weak_alias(interrupt_get_devname, voidop);
__weak_alias(interrupt_construct_intrids, nullret);
__weak_alias(interrupt_destruct_intrids, voidop);
__weak_alias(interrupt_distribute, eopnotsupp);
__weak_alias(interrupt_distribute_handler, eopnotsupp);
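/*
 * Illustrative sketch (not part of this file): the weak aliases above
 * only take effect when no other object defines the symbol.  A port or
 * driver overrides a stub simply by providing a strong definition, as
 * in the hypothetical machine-dependent device_register() below, which
 * replaces the weak voidop alias; device_t comes from <sys/device.h>.
 */
#if 0
/* In a port's machine-dependent code; overrides the weak alias. */
void
device_register(device_t dev, void *aux)
{
	/* e.g. remember the boot device so setroot() can find it later */
}
#endif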
/*
* Scheduler activations system calls. These need to remain until libc's
* major version is bumped.
*/
__strong_alias(sys_sa_register,sys_nosys);
__strong_alias(sys_sa_stacks,sys_nosys);
__strong_alias(sys_sa_enable,sys_nosys);
__strong_alias(sys_sa_setconcurrency,sys_nosys);
__strong_alias(sys_sa_yield,sys_nosys);
__strong_alias(sys_sa_preempt,sys_nosys);
__strong_alias(sys_sa_unblockyield,sys_nosys);
/*
* Stubs for compat_netbsd32.
*/
__strong_alias(dosa_register,sys_nosys);
__strong_alias(sa_stacks1,sys_nosys);
/*
* Stubs for drivers. See sys/conf.h.
*/
__strong_alias(devenodev,enodev);
__strong_alias(deveopnotsupp,eopnotsupp);
__strong_alias(devnullop,nullop);
__strong_alias(ttyenodev,enodev);
__strong_alias(ttyvenodev,voidop);
__strong_alias(ttyvnullop,nullop);
/*
* Stubs for architectures that do not support kernel preemption.
*/
#ifndef __HAVE_PREEMPTION
bool
cpu_kpreempt_enter(uintptr_t where, int s)
{
return false;
}
void
cpu_kpreempt_exit(uintptr_t where)
{
}
bool
cpu_kpreempt_disabled(void)
{
return true;
}
#else
# ifndef MULTIPROCESSOR
# error __HAVE_PREEMPTION requires MULTIPROCESSOR
# endif
#endif /* !__HAVE_PREEMPTION */
int
sys_nosys(struct lwp *l, const void *v, register_t *retval)
{
mutex_enter(&proc_lock);
psignal(l->l_proc, SIGSYS);
mutex_exit(&proc_lock);
return ENOSYS;
}
/*
* Unsupported device function (e.g. writing to read-only device).
*/
int
enodev(void)
{
return (ENODEV);
}
/*
* Unconfigured device function; driver not configured.
*/
int
enxio(void)
{
return (ENXIO);
}
/*
* Unsupported ioctl function.
*/
int
enoioctl(void)
{
return (ENOTTY);
}
/*
* Unsupported system function.
* This is used for an otherwise-reasonable operation
* that is not supported by the current system binary.
*/
int
enosys(void)
{
return (ENOSYS);
}
/*
* Return error for operation not supported
* on a specific object or file type.
*/
int
eopnotsupp(void)
{
return (EOPNOTSUPP);
}
/*
* Generic null operation, void return value.
*/
void
voidop(void)
{
}
/*
* Generic null operation, always returns success.
*/
int
nullop(void *v)
{
return (0);
}
/*
* Generic null operation, always returns null.
*/
void *
nullret(void)
{
return (NULL);
}
bool
default_bus_space_handle_is_equal(bus_space_tag_t t,
bus_space_handle_t h1, bus_space_handle_t h2)
{
return memcmp(&h1, &h2, sizeof(h1)) == 0;
}
bool
default_bus_space_is_equal(bus_space_tag_t t1, bus_space_tag_t t2)
{
return memcmp(&t1, &t2, sizeof(t1)) == 0;
}
/* Stubs for architectures with no kernel FPU access. */
__weak_alias(kthread_fpu_enter_md, voidop);
__weak_alias(kthread_fpu_exit_md, voidop);
/* $NetBSD: subr_autoconf.c,v 1.314 2023/07/18 11:57:37 riastradh Exp $ */
/*
* Copyright (c) 1996, 2000 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the
* NetBSD Project. See http://www.NetBSD.org/ for
* information about NetBSD.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* --(license Id: LICENSE.proto,v 1.1 2000/06/13 21:40:26 cgd Exp )--
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratories.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp (LBL)
*
* @(#)subr_autoconf.c 8.3 (Berkeley) 5/17/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.314 2023/07/18 11:57:37 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "drvctl.h"
#endif
#include <sys/param.h>
#include <sys/device.h>
#include <sys/device_impl.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/kthread.h>
#include <sys/buf.h>
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/unistd.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/callout.h>
#include <sys/devmon.h>
#include <sys/cpu.h>
#include <sys/sysctl.h>
#include <sys/stdarg.h>
#include <sys/localcount.h>
#include <sys/disk.h>
#include <sys/rndsource.h>
#include <machine/limits.h>
/*
* Autoconfiguration subroutines.
*/
/*
* Device autoconfiguration timings are mixed into the entropy pool.
*/
static krndsource_t rnd_autoconf_source;
/*
* ioconf.c exports exactly two names: cfdata and cfroots. All system
* devices and drivers are found via these tables.
*/
extern struct cfdata cfdata[];
extern const short cfroots[];
/*
* List of all cfdriver structures. We use this to detect duplicates
* when other cfdrivers are loaded.
*/
struct cfdriverlist allcfdrivers = LIST_HEAD_INITIALIZER(&allcfdrivers);
extern struct cfdriver * const cfdriver_list_initial[];
/*
* Initial list of cfattach's.
*/
extern const struct cfattachinit cfattachinit[];
/*
* List of cfdata tables. We always have one such list -- the one
* built statically when the kernel was configured.
*/
struct cftablelist allcftables = TAILQ_HEAD_INITIALIZER(allcftables);
static struct cftable initcftable;
#define ROOT ((device_t)NULL)
struct matchinfo {
cfsubmatch_t fn;
device_t parent;
const int *locs;
void *aux;
struct cfdata *match;
int pri;
};
struct alldevs_foray {
int af_s;
struct devicelist af_garbage;
};
/*
* Internal version of the cfargs structure; all versions are
* canonicalized to this.
*/
struct cfargs_internal {
union {
cfsubmatch_t submatch;/* submatch function (direct config) */
cfsearch_t search; /* search function (indirect config) */
};
const char * iattr; /* interface attribute */
const int * locators; /* locators array */
devhandle_t devhandle; /* devhandle_t (by value) */
};
static char *number(char *, int);
static void mapply(struct matchinfo *, cfdata_t);
static void config_devdelete(device_t);
static void config_devunlink(device_t, struct devicelist *);
static void config_makeroom(int, struct cfdriver *);
static void config_devlink(device_t);
static void config_alldevs_enter(struct alldevs_foray *);
static void config_alldevs_exit(struct alldevs_foray *);
static void config_add_attrib_dict(device_t);
static device_t config_attach_internal(device_t, cfdata_t, void *,
cfprint_t, const struct cfargs_internal *);
static void config_collect_garbage(struct devicelist *);
static void config_dump_garbage(struct devicelist *);
static void pmflock_debug(device_t, const char *, int);
static device_t deviter_next1(deviter_t *);
static void deviter_reinit(deviter_t *);
struct deferred_config {
TAILQ_ENTRY(deferred_config) dc_queue;
device_t dc_dev;
void (*dc_func)(device_t);
};
TAILQ_HEAD(deferred_config_head, deferred_config);
static struct deferred_config_head deferred_config_queue =
TAILQ_HEAD_INITIALIZER(deferred_config_queue);
static struct deferred_config_head interrupt_config_queue =
TAILQ_HEAD_INITIALIZER(interrupt_config_queue);
static int interrupt_config_threads = 8;
static struct deferred_config_head mountroot_config_queue =
TAILQ_HEAD_INITIALIZER(mountroot_config_queue);
static int mountroot_config_threads = 2;
static lwp_t **mountroot_config_lwpids;
static size_t mountroot_config_lwpids_size;
bool root_is_mounted = false;
static void config_process_deferred(struct deferred_config_head *, device_t);
/* Hooks to finalize configuration once all real devices have been found. */
struct finalize_hook {
TAILQ_ENTRY(finalize_hook) f_list;
int (*f_func)(device_t);
device_t f_dev;
};
static TAILQ_HEAD(, finalize_hook) config_finalize_list =
TAILQ_HEAD_INITIALIZER(config_finalize_list);
static int config_finalize_done;
/* list of all devices */
static struct devicelist alldevs = TAILQ_HEAD_INITIALIZER(alldevs);
static kmutex_t alldevs_lock __cacheline_aligned;
static devgen_t alldevs_gen = 1;
static int alldevs_nread = 0;
static int alldevs_nwrite = 0;
static bool alldevs_garbage = false;
static struct devicelist config_pending =
TAILQ_HEAD_INITIALIZER(config_pending);
static kmutex_t config_misc_lock;
static kcondvar_t config_misc_cv;
static bool detachall = false;
#define STREQ(s1, s2) \
(*(s1) == *(s2) && strcmp((s1), (s2)) == 0)
static bool config_initialized = false; /* config_init() has been called. */
static int config_do_twiddle;
static callout_t config_twiddle_ch;
static void sysctl_detach_setup(struct sysctllog **);
int no_devmon_insert(const char *, prop_dictionary_t);
int (*devmon_insert_vec)(const char *, prop_dictionary_t) = no_devmon_insert;
typedef int (*cfdriver_fn)(struct cfdriver *);
static int
frob_cfdrivervec(struct cfdriver * const *cfdriverv,
cfdriver_fn drv_do, cfdriver_fn drv_undo,
const char *style, bool dopanic)
{
void (*pr)(const char *, ...) __printflike(1, 2) =
dopanic ? panic : printf;
int i, error = 0, e2 __diagused;
for (i = 0; cfdriverv[i] != NULL; i++) {
if ((error = drv_do(cfdriverv[i])) != 0) {
pr("configure: `%s' driver %s failed: %d",
cfdriverv[i]->cd_name, style, error);
goto bad;
}
}
KASSERT(error == 0);
return 0;
bad:
printf("\n");
for (i--; i >= 0; i--) {
e2 = drv_undo(cfdriverv[i]);
KASSERT(e2 == 0);
}
return error;
}
typedef int (*cfattach_fn)(const char *, struct cfattach *);
static int
frob_cfattachvec(const struct cfattachinit *cfattachv,
cfattach_fn att_do, cfattach_fn att_undo,
const char *style, bool dopanic)
{
const struct cfattachinit *cfai = NULL;
void (*pr)(const char *, ...) __printflike(1, 2) =
dopanic ? panic : printf;
int j = 0, error = 0, e2 __diagused;
for (cfai = &cfattachv[0]; cfai->cfai_name != NULL; cfai++) {
for (j = 0; cfai->cfai_list[j] != NULL; j++) {
if ((error = att_do(cfai->cfai_name,
cfai->cfai_list[j])) != 0) {
pr("configure: attachment `%s' "
"of `%s' driver %s failed: %d",
cfai->cfai_list[j]->ca_name,
cfai->cfai_name, style, error);
goto bad;
}
}
}
KASSERT(error == 0);
return 0;
bad:
/*
* Roll back in reverse order. It is not clear that the ordering
* matters much, but do it anyway. The nested loops make the
* rollback look a little like integration by parts.
*/
printf("\n");
if (cfai) {
bool last;
for (last = false; last == false; ) {
if (cfai == &cfattachv[0])
last = true;
for (j--; j >= 0; j--) {
e2 = att_undo(cfai->cfai_name,
cfai->cfai_list[j]);
KASSERT(e2 == 0);
}
if (!last) {
cfai--;
for (j = 0; cfai->cfai_list[j] != NULL; j++)
;
}
}
}
return error;
}
/*
* Initialize the autoconfiguration data structures. Normally this
* is done by configure(), but some platforms need to do this very
* early (to e.g. initialize the console).
*/
void
config_init(void)
{
KASSERT(config_initialized == false);
mutex_init(&alldevs_lock, MUTEX_DEFAULT, IPL_VM);
mutex_init(&config_misc_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&config_misc_cv, "cfgmisc");
callout_init(&config_twiddle_ch, CALLOUT_MPSAFE);
frob_cfdrivervec(cfdriver_list_initial,
config_cfdriver_attach, NULL, "bootstrap", true);
frob_cfattachvec(cfattachinit,
config_cfattach_attach, NULL, "bootstrap", true);
initcftable.ct_cfdata = cfdata;
TAILQ_INSERT_TAIL(&allcftables, &initcftable, ct_list);
rnd_attach_source(&rnd_autoconf_source, "autoconf", RND_TYPE_UNKNOWN,
RND_FLAG_COLLECT_TIME);
config_initialized = true;
}
/*
* Init or fini drivers and attachments. Either all or none
* are processed (via rollback). It would be nice if this were
* atomic to outside consumers, but with the current state of
* locking ...
*/
int
config_init_component(struct cfdriver * const *cfdriverv,
const struct cfattachinit *cfattachv, struct cfdata *cfdatav)
{
int error;
KERNEL_LOCK(1, NULL);
if ((error = frob_cfdrivervec(cfdriverv,
config_cfdriver_attach, config_cfdriver_detach, "init", false))!= 0)
goto out;
if ((error = frob_cfattachvec(cfattachv,
config_cfattach_attach, config_cfattach_detach,
"init", false)) != 0) {
frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, NULL, "init rollback", true);
goto out;
}
if ((error = config_cfdata_attach(cfdatav, 1)) != 0) {
frob_cfattachvec(cfattachv,
config_cfattach_detach, NULL, "init rollback", true);
frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, NULL, "init rollback", true);
goto out;
}
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
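/*
 * Illustrative sketch (not from the sources): a modular driver typically
 * wires config_init_component()/config_fini_component() into its module
 * command handler so that its cfdriver, cfattach and cfdata tables are
 * attached and rolled back as a unit.  The mydrv_* array names below are
 * hypothetical; real modules usually pass tables generated from their
 * ioconf description.
 */
#if 0
static int
mydrv_modcmd(modcmd_t cmd, void *opaque)
{
	switch (cmd) {
	case MODULE_CMD_INIT:
		return config_init_component(mydrv_cfdrivers,
		    mydrv_cfattachinit, mydrv_cfdata);
	case MODULE_CMD_FINI:
		return config_fini_component(mydrv_cfdrivers,
		    mydrv_cfattachinit, mydrv_cfdata);
	default:
		return ENOTTY;
	}
}
#endif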
int
config_fini_component(struct cfdriver * const *cfdriverv,
const struct cfattachinit *cfattachv, struct cfdata *cfdatav)
{
int error;
KERNEL_LOCK(1, NULL);
if ((error = config_cfdata_detach(cfdatav)) != 0)
goto out;
if ((error = frob_cfattachvec(cfattachv,
config_cfattach_detach, config_cfattach_attach,
"fini", false)) != 0) {
if (config_cfdata_attach(cfdatav, 0) != 0)
panic("config_cfdata fini rollback failed");
goto out;
}
if ((error = frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, config_cfdriver_attach,
"fini", false)) != 0) {
frob_cfattachvec(cfattachv,
config_cfattach_attach, NULL, "fini rollback", true);
if (config_cfdata_attach(cfdatav, 0) != 0)
panic("config_cfdata fini rollback failed");
goto out;
}
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
void
config_init_mi(void)
{
if (!config_initialized)
config_init();
sysctl_detach_setup(NULL);
}
void
config_deferred(device_t dev)
{
KASSERT(KERNEL_LOCKED_P());
config_process_deferred(&deferred_config_queue, dev);
config_process_deferred(&interrupt_config_queue, dev);
config_process_deferred(&mountroot_config_queue, dev);
}
static void
config_interrupts_thread(void *cookie)
{
struct deferred_config *dc;
device_t dev;
mutex_enter(&config_misc_lock);
while ((dc = TAILQ_FIRST(&interrupt_config_queue)) != NULL) {
TAILQ_REMOVE(&interrupt_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
dev = dc->dc_dev;
(*dc->dc_func)(dev);
if (!device_pmf_is_registered(dev))
aprint_debug_dev(dev,
"WARNING: power management not supported\n");
config_pending_decr(dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
}
mutex_exit(&config_misc_lock);
kthread_exit(0);
}
void
config_create_interruptthreads(void)
{
int i;
for (i = 0; i < interrupt_config_threads; i++) {
(void)kthread_create(PRI_NONE, 0/*XXXSMP */, NULL,
config_interrupts_thread, NULL, NULL, "configintr");
}
}
static void
config_mountroot_thread(void *cookie)
{
struct deferred_config *dc;
mutex_enter(&config_misc_lock);
while ((dc = TAILQ_FIRST(&mountroot_config_queue)) != NULL) {
TAILQ_REMOVE(&mountroot_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
(*dc->dc_func)(dc->dc_dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
}
mutex_exit(&config_misc_lock);
kthread_exit(0);
}
void
config_create_mountrootthreads(void)
{
int i;
if (!root_is_mounted)
root_is_mounted = true;
mountroot_config_lwpids_size = sizeof(mountroot_config_lwpids) *
mountroot_config_threads;
mountroot_config_lwpids = kmem_alloc(mountroot_config_lwpids_size,
KM_NOSLEEP);
KASSERT(mountroot_config_lwpids);
for (i = 0; i < mountroot_config_threads; i++) {
mountroot_config_lwpids[i] = 0;
(void)kthread_create(PRI_NONE, KTHREAD_MUSTJOIN/* XXXSMP */,
NULL, config_mountroot_thread, NULL,
&mountroot_config_lwpids[i],
"configroot");
}
}
void
config_finalize_mountroot(void)
{
int i, error;
for (i = 0; i < mountroot_config_threads; i++) {
if (mountroot_config_lwpids[i] == 0)
continue;
error = kthread_join(mountroot_config_lwpids[i]);
if (error)
printf("%s: thread %x joined with error %d\n",
__func__, i, error);
}
kmem_free(mountroot_config_lwpids, mountroot_config_lwpids_size);
}
/*
* Announce device attach/detach to userland listeners.
*/
int
no_devmon_insert(const char *name, prop_dictionary_t p)
{
return ENODEV;
}
static void
devmon_report_device(device_t dev, bool isattach)
{
prop_dictionary_t ev, dict = device_properties(dev);
const char *parent;
const char *what;
const char *where;
device_t pdev = device_parent(dev);
/* If currently no drvctl device, just return */
if (devmon_insert_vec == no_devmon_insert)
return;
ev = prop_dictionary_create();
if (ev == NULL)
return;
what = (isattach ? "device-attach" : "device-detach");
parent = (pdev == NULL ? "root" : device_xname(pdev));
if (prop_dictionary_get_string(dict, "location", &where)) {
prop_dictionary_set_string(ev, "location", where);
aprint_debug("ev: %s %s at %s in [%s]\n",
what, device_xname(dev), parent, where);
}
if (!prop_dictionary_set_string(ev, "device", device_xname(dev)) ||
!prop_dictionary_set_string(ev, "parent", parent)) {
prop_object_release(ev);
return;
}
if ((*devmon_insert_vec)(what, ev) != 0)
prop_object_release(ev);
}
/*
* Add a cfdriver to the system.
*/
int
config_cfdriver_attach(struct cfdriver *cd)
{
struct cfdriver *lcd;
/* Make sure this driver isn't already in the system. */
LIST_FOREACH(lcd, &allcfdrivers, cd_list) {
if (STREQ(lcd->cd_name, cd->cd_name))
return EEXIST;
}
LIST_INIT(&cd->cd_attach);
LIST_INSERT_HEAD(&allcfdrivers, cd, cd_list);
return 0;
}
/*
* Remove a cfdriver from the system.
*/
int
config_cfdriver_detach(struct cfdriver *cd)
{
struct alldevs_foray af;
int i, rc = 0;
config_alldevs_enter(&af);
/* Make sure there are no active instances. */
for (i = 0; i < cd->cd_ndevs; i++) {
if (cd->cd_devs[i] != NULL) {
rc = EBUSY;
break;
}
}
config_alldevs_exit(&af);
if (rc != 0)
return rc;
/* ...and no attachments loaded. */
if (LIST_EMPTY(&cd->cd_attach) == 0)
return EBUSY;
LIST_REMOVE(cd, cd_list);
KASSERT(cd->cd_devs == NULL);
return 0;
}
/*
* Look up a cfdriver by name.
*/
struct cfdriver *
config_cfdriver_lookup(const char *name)
{
struct cfdriver *cd;
LIST_FOREACH(cd, &allcfdrivers, cd_list) {
if (STREQ(cd->cd_name, name))
return cd;
}
return NULL;
}
/*
* Add a cfattach to the specified driver.
*/
int
config_cfattach_attach(const char *driver, struct cfattach *ca)
{
struct cfattach *lca;
struct cfdriver *cd;
cd = config_cfdriver_lookup(driver);
if (cd == NULL)
return ESRCH;
/* Make sure this attachment isn't already on this driver. */
LIST_FOREACH(lca, &cd->cd_attach, ca_list) {
if (STREQ(lca->ca_name, ca->ca_name))
return EEXIST;
}
LIST_INSERT_HEAD(&cd->cd_attach, ca, ca_list);
return 0;
}
/*
* Remove a cfattach from the specified driver.
*/
int
config_cfattach_detach(const char *driver, struct cfattach *ca)
{
struct alldevs_foray af;
struct cfdriver *cd;
device_t dev;
int i, rc = 0;
cd = config_cfdriver_lookup(driver);
if (cd == NULL)
return ESRCH;
config_alldevs_enter(&af);
/* Make sure there are no active instances. */
for (i = 0; i < cd->cd_ndevs; i++) {
if ((dev = cd->cd_devs[i]) == NULL)
continue;
if (dev->dv_cfattach == ca) {
rc = EBUSY;
break;
}
}
config_alldevs_exit(&af);
if (rc != 0)
return rc;
LIST_REMOVE(ca, ca_list);
return 0;
}
/*
* Look up a cfattach by name.
*/
static struct cfattach *
config_cfattach_lookup_cd(struct cfdriver *cd, const char *atname)
{
struct cfattach *ca;
LIST_FOREACH(ca, &cd->cd_attach, ca_list) {
if (STREQ(ca->ca_name, atname))
return ca;
}
return NULL;
}
/*
* Look up a cfattach by driver/attachment name.
*/
struct cfattach *
config_cfattach_lookup(const char *name, const char *atname)
{
struct cfdriver *cd;
cd = config_cfdriver_lookup(name);
if (cd == NULL)
return NULL;
return config_cfattach_lookup_cd(cd, atname);
}
/*
* Apply the matching function and choose the best. This is used
* a few times and we want to keep the code small.
*/
static void
mapply(struct matchinfo *m, cfdata_t cf)
{
int pri;
if (m->fn != NULL) {
pri = (*m->fn)(m->parent, cf, m->locs, m->aux);
} else {
pri = config_match(m->parent, cf, m->aux);
}
if (pri > m->pri) {
m->match = cf;
m->pri = pri;
}
}
int
config_stdsubmatch(device_t parent, cfdata_t cf, const int *locs, void *aux)
{
const struct cfiattrdata *ci;
const struct cflocdesc *cl;
int nlocs, i;
ci = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver);
KASSERT(ci);
nlocs = ci->ci_loclen;
KASSERT(!nlocs || locs);
for (i = 0; i < nlocs; i++) {
cl = &ci->ci_locdesc[i];
if (cl->cld_defaultstr != NULL &&
cf->cf_loc[i] == cl->cld_default)
continue;
if (cf->cf_loc[i] == locs[i])
continue;
return 0;
}
return config_match(parent, cf, aux);
}
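/*
 * Illustrative sketch of how config_stdsubmatch() is typically wired up:
 * an indirect-config bus driver passes it as the submatch hook when it
 * announces a child, so the locators from config(5) are compared against
 * the ones the bus discovered.  The "mybus" names, attachment-args
 * structure and locator array below are hypothetical, not part of this
 * file:
 *
 *	struct mybus_attach_args maa = { ... };
 *	int locs[MYBUSCF_NLOCS] = { ... };
 *
 *	config_found(self, &maa, mybus_print,
 *	    CFARGS(.submatch = config_stdsubmatch,
 *		   .iattr = "mybus",
 *		   .locators = locs));
 */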
/*
* Helper function: check whether the driver supports the interface attribute
* and return its descriptor structure.
*/
static const struct cfiattrdata *
cfdriver_get_iattr(const struct cfdriver *cd, const char *ia)
{
const struct cfiattrdata * const *cpp;
if (cd->cd_attrs == NULL)
return 0;
for (cpp = cd->cd_attrs; *cpp; cpp++) {
if (STREQ((*cpp)->ci_name, ia)) {
/* Match. */
return *cpp;
}
}
return 0;
}
static int __diagused
cfdriver_iattr_count(const struct cfdriver *cd)
{
const struct cfiattrdata * const *cpp;
int i;
if (cd->cd_attrs == NULL)
return 0;
for (i = 0, cpp = cd->cd_attrs; *cpp; cpp++) {
i++;
}
return i;
}
/*
* Lookup an interface attribute description by name.
* If the driver is given, consider only its supported attributes.
*/
const struct cfiattrdata *
cfiattr_lookup(const char *name, const struct cfdriver *cd)
{
const struct cfdriver *d;
const struct cfiattrdata *ia;
if (cd)
return cfdriver_get_iattr(cd, name);
LIST_FOREACH(d, &allcfdrivers, cd_list) {
ia = cfdriver_get_iattr(d, name);
if (ia)
return ia;
}
return 0;
}
/*
* Determine if `parent' is a potential parent for a device spec based
* on `cfp'.
*/
static int
cfparent_match(const device_t parent, const struct cfparent *cfp)
{
struct cfdriver *pcd;
/* We don't match root nodes here. */
if (cfp == NULL)
return 0;
pcd = parent->dv_cfdriver;
KASSERT(pcd != NULL);
/*
* First, ensure this parent has the correct interface
* attribute.
*/
if (!cfdriver_get_iattr(pcd, cfp->cfp_iattr))
return 0;
/*
* If no specific parent device instance was specified (i.e.
* we're attaching to the attribute only), we're done!
*/
if (cfp->cfp_parent == NULL)
return 1;
/*
* Check the parent device's name.
*/
if (STREQ(pcd->cd_name, cfp->cfp_parent) == 0)
return 0; /* not the same parent */
/*
* Make sure the unit number matches.
*/
if (cfp->cfp_unit == DVUNIT_ANY || /* wildcard */
cfp->cfp_unit == parent->dv_unit)
return 1;
/* Unit numbers don't match. */
return 0;
}
/*
* Helper for config_cfdata_attach(): check all devices to see whether any of
* them could be a parent for an attachment in the config data table passed,
* and rescan.
*/
static void
rescan_with_cfdata(const struct cfdata *cf)
{
device_t d;
const struct cfdata *cf1;
deviter_t di;
KASSERT(KERNEL_LOCKED_P());
/*
* "alldevs" is likely longer than a modules's cfdata, so make it
* the outer loop.
*/
for (d = deviter_first(&di, 0); d != NULL; d = deviter_next(&di)) {
if (!(d->dv_cfattach->ca_rescan))
continue;
for (cf1 = cf; cf1->cf_name; cf1++) {
if (!cfparent_match(d, cf1->cf_pspec))
continue;
(*d->dv_cfattach->ca_rescan)(d,
cfdata_ifattr(cf1), cf1->cf_loc);
config_deferred(d);
}
}
deviter_release(&di);
}
/*
* Attach a supplemental config data table and rescan potential
* parent devices if required.
*/
int
config_cfdata_attach(cfdata_t cf, int scannow)
{
struct cftable *ct;
KERNEL_LOCK(1, NULL);
ct = kmem_alloc(sizeof(*ct), KM_SLEEP);
ct->ct_cfdata = cf;
TAILQ_INSERT_TAIL(&allcftables, ct, ct_list);
if (scannow)
rescan_with_cfdata(cf);
KERNEL_UNLOCK_ONE(NULL);
return 0;
}
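/*
 * Illustrative sketch of a caller: a loadable driver module may hand its
 * config data table to the kernel and ask for an immediate rescan so that
 * already-present parents get a chance to attach the new children.  The
 * table name below is hypothetical (such tables are normally generated
 * from the module's config(5) fragment, and many modules go through
 * config_init_component() instead of calling this directly):
 *
 *	extern struct cfdata mydrv_cfdata[];
 *
 *	error = config_cfdata_attach(mydrv_cfdata, 1);
 */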
/*
* Helper for config_cfdata_detach: check whether a device is
* found through any attachment in the config data table.
*/
static int
dev_in_cfdata(device_t d, cfdata_t cf)
{
const struct cfdata *cf1;
for (cf1 = cf; cf1->cf_name; cf1++)
if (d->dv_cfdata == cf1)
return 1;
return 0;
}
/*
* Detach a supplemental config data table. Before removing it, detach all
* devices found through that table (and which thus keep references to it).
*/
int
config_cfdata_detach(cfdata_t cf)
{
device_t d;
int error = 0;
struct cftable *ct;
deviter_t di;
KERNEL_LOCK(1, NULL);
for (d = deviter_first(&di, DEVITER_F_RW); d != NULL;
d = deviter_next(&di)) {
if (!dev_in_cfdata(d, cf))
continue;
if ((error = config_detach(d, 0)) != 0)
break;
}
deviter_release(&di);
if (error) {
aprint_error_dev(d, "unable to detach instance\n");
goto out;
}
TAILQ_FOREACH(ct, &allcftables, ct_list) {
if (ct->ct_cfdata == cf) {
TAILQ_REMOVE(&allcftables, ct, ct_list);
kmem_free(ct, sizeof(*ct));
error = 0;
goto out;
}
}
/* not found -- shouldn't happen */
error = EINVAL;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
/*
* Invoke the "match" routine for a cfdata entry on behalf of
* an external caller, usually a direct config "submatch" routine.
*/
int
config_match(device_t parent, cfdata_t cf, void *aux)
{
struct cfattach *ca;
KASSERT(KERNEL_LOCKED_P());
ca = config_cfattach_lookup(cf->cf_name, cf->cf_atname);
if (ca == NULL) {
/* No attachment for this entry, oh well. */
return 0;
}
return (*ca->ca_match)(parent, cf, aux);
}
/*
* Invoke the "probe" routine for a cfdata entry on behalf of
* an external caller, usually an indirect config "search" routine.
*/
int
config_probe(device_t parent, cfdata_t cf, void *aux)
{
/*
* This is currently a synonym for config_match(), but this
* is an implementation detail; "match" and "probe" routines
* have different behaviors.
*
* XXX config_probe() should return a bool, because there is
* XXX no match score for probe -- it's either there or it's
* XXX not, but some ports abuse the return value as a way
* XXX to attach "critical" devices before "non-critical"
* XXX devices.
*/
return config_match(parent, cf, aux);
}
static struct cfargs_internal *
cfargs_canonicalize(const struct cfargs * const cfargs,
struct cfargs_internal * const store)
{
struct cfargs_internal *args = store;
memset(args, 0, sizeof(*args));
/* If none specified, the all-NULL pointers are good. */
if (cfargs == NULL) {
return args;
}
/*
* Only one arguments version is recognized at this time.
*/
if (cfargs->cfargs_version != CFARGS_VERSION) {
panic("cfargs_canonicalize: unknown version %lu\n",
(unsigned long)cfargs->cfargs_version);
}
/*
* submatch and search are mutually-exclusive.
*/
if (cfargs->submatch != NULL && cfargs->search != NULL) {
panic("cfargs_canonicalize: submatch and search are "
"mutually-exclusive");
}
if (cfargs->submatch != NULL) {
args->submatch = cfargs->submatch;
} else if (cfargs->search != NULL) {
args->search = cfargs->search;
}
args->iattr = cfargs->iattr;
args->locators = cfargs->locators;
args->devhandle = cfargs->devhandle;
return args;
}
/*
* Iterate over all potential children of some device, calling the given
* function (default being the child's match function) for each one.
* Nonzero returns are matches; the highest value returned is considered
* the best match. Return the `found child' if we got a match, or NULL
* otherwise. The `aux' pointer is simply passed on through.
*
* Note that this function is designed so that it can be used to apply
* an arbitrary function to all potential children (its return value
* can be ignored).
*/
static cfdata_t
config_search_internal(device_t parent, void *aux,
const struct cfargs_internal * const args)
{
struct cftable *ct;
cfdata_t cf;
struct matchinfo m;
KASSERT(config_initialized);
KASSERTMSG((!args->iattr ||
cfdriver_get_iattr(parent->dv_cfdriver, args->iattr)),
"%s searched for child at interface attribute %s,"
" but device %s(4) has no such interface attribute in config(5)",
device_xname(parent), args->iattr,
parent->dv_cfdriver->cd_name);
KASSERTMSG((args->iattr ||
cfdriver_iattr_count(parent->dv_cfdriver) < 2),
"%s searched for child without interface attribute,"
" needed to disambiguate among the %d declared for in %s(4)"
" in config(5)",
device_xname(parent),
cfdriver_iattr_count(parent->dv_cfdriver),
parent->dv_cfdriver->cd_name);
m.fn = args->submatch; /* N.B. union */
m.parent = parent;
m.locs = args->locators;
m.aux = aux;
m.match = NULL;
m.pri = 0;
TAILQ_FOREACH(ct, &allcftables, ct_list) {
for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
/* We don't match root nodes here. */
if (!cf->cf_pspec)
continue;
/*
* Skip cf if no longer eligible, otherwise scan
* through parents for one matching `parent', and
* try match function.
*/
if (cf->cf_fstate == FSTATE_FOUND)
continue;
if (cf->cf_fstate == FSTATE_DNOTFOUND ||
cf->cf_fstate == FSTATE_DSTAR)
continue;
/*
* If an interface attribute was specified,
* consider only children which attach to
* that attribute.
*/
if (args->iattr != NULL &&
!STREQ(args->iattr, cfdata_ifattr(cf)))
continue;
if (cfparent_match(parent, cf->cf_pspec))
mapply(&m, cf);
}
}
rnd_add_uint32(&rnd_autoconf_source, 0);
return m.match;
}
cfdata_t
config_search(device_t parent, void *aux, const struct cfargs *cfargs)
{
cfdata_t cf;
struct cfargs_internal store;
cf = config_search_internal(parent, aux,
cfargs_canonicalize(cfargs, &store));
return cf;
}
/*
* Find the given root device.
* This is much like config_search, but there is no parent.
* Don't bother with multiple cfdata tables; the root node
* must always be in the initial table.
*/
cfdata_t
config_rootsearch(cfsubmatch_t fn, const char *rootname, void *aux)
{
cfdata_t cf;
const short *p;
struct matchinfo m;
m.fn = fn;
m.parent = ROOT;
m.aux = aux;
m.match = NULL;
m.pri = 0;
m.locs = 0;
/*
* Look at root entries for matching name. We do not bother
* with found-state here since only one root should ever be
* searched (and it must be done first).
*/
for (p = cfroots; *p >= 0; p++) {
cf = &cfdata[*p];
if (strcmp(cf->cf_name, rootname) == 0)
mapply(&m, cf);
}
return m.match;
}
static const char * const msgs[] = {
[QUIET] = "",
[UNCONF] = " not configured\n",
[UNSUPP] = " unsupported\n",
};
/*
* The given `aux' argument describes a device that has been found
* on the given parent, but not necessarily configured. Locate the
* configuration data for that device (using the submatch function
* provided, or using candidates' cd_match configuration driver
* functions) and attach it, and return its device_t. If the device was
* not configured, call the given `print' function and return NULL.
*/
device_t
config_found_acquire(device_t parent, void *aux, cfprint_t print,
const struct cfargs * const cfargs)
{
cfdata_t cf;
struct cfargs_internal store;
const struct cfargs_internal * const args =
cfargs_canonicalize(cfargs, &store);
device_t dev;
KERNEL_LOCK(1, NULL);
cf = config_search_internal(parent, aux, args);
if (cf != NULL) {
dev = config_attach_internal(parent, cf, aux, print, args);
goto out;
}
if (print) {
if (config_do_twiddle && cold)
twiddle();
const int pret = (*print)(aux, device_xname(parent));
KASSERT(pret >= 0);
KASSERT(pret < __arraycount(msgs));
KASSERT(msgs[pret] != NULL);
aprint_normal("%s", msgs[pret]);
}
dev = NULL;
out: KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_found(parent, aux, print, cfargs)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_found_acquire with a matching device_release once they
* have finished with the returned device_t.
*/
device_t
config_found(device_t parent, void *aux, cfprint_t print,
const struct cfargs * const cfargs)
{
device_t dev;
KASSERT(KERNEL_LOCKED_P());
dev = config_found_acquire(parent, aux, print, cfargs);
if (dev == NULL)
return NULL;
device_release(dev);
return dev;
}
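/*
 * Illustrative sketch of a direct-config caller: a bus driver's attach
 * routine fills in an attachment-args structure for each child it finds
 * and announces it here; autoconfiguration then matches and attaches the
 * child, or prints "not configured".  The "mybus" names below are
 * hypothetical:
 *
 *	struct mybus_attach_args maa;
 *
 *	maa.maa_addr = addr;
 *	(void)config_found(self, &maa, mybus_print,
 *	    CFARGS(.iattr = "mybus"));
 */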
/*
* As above, but for root devices.
*/
device_t
config_rootfound(const char *rootname, void *aux)
{
cfdata_t cf;
device_t dev = NULL;
KERNEL_LOCK(1, NULL);
if ((cf = config_rootsearch(NULL, rootname, aux)) != NULL)
dev = config_attach(ROOT, cf, aux, NULL, CFARGS_NONE);
else
aprint_error("root device %s not configured\n", rootname);
KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/* just like sprintf(buf, "%d") except that it works from the end */
static char *
number(char *ep, int n)
{
*--ep = 0;
while (n >= 10) {
*--ep = (n % 10) + '0';
n /= 10;
}
*--ep = n + '0';
return ep;
}
/*
* Expand the size of the cd_devs array if necessary.
*
* The caller must hold alldevs_lock. config_makeroom() may release and
* re-acquire alldevs_lock, so callers should re-check conditions such
* as alldevs_nwrite == 0 and alldevs_nread == 0 when config_makeroom()
* returns.
*/
static void
config_makeroom(int n, struct cfdriver *cd)
{
int ondevs, nndevs;
device_t *osp, *nsp;
KASSERT(mutex_owned(&alldevs_lock));
alldevs_nwrite++;
/* XXX arithmetic overflow */
for (nndevs = MAX(4, cd->cd_ndevs); nndevs <= n; nndevs += nndevs)
;
while (n >= cd->cd_ndevs) {
/*
* Need to expand the array.
*/
ondevs = cd->cd_ndevs;
osp = cd->cd_devs;
/*
* Release alldevs_lock around allocation, which may
* sleep.
*/
mutex_exit(&alldevs_lock);
nsp = kmem_alloc(sizeof(device_t) * nndevs, KM_SLEEP);
mutex_enter(&alldevs_lock);
/*
* If another thread moved the array while we did
* not hold alldevs_lock, try again.
*/
if (cd->cd_devs != osp || cd->cd_ndevs != ondevs) {
mutex_exit(&alldevs_lock);
kmem_free(nsp, sizeof(device_t) * nndevs);
mutex_enter(&alldevs_lock);
continue;
}
memset(nsp + ondevs, 0, sizeof(device_t) * (nndevs - ondevs));
if (ondevs != 0)
memcpy(nsp, cd->cd_devs, sizeof(device_t) * ondevs);
cd->cd_ndevs = nndevs;
cd->cd_devs = nsp;
if (ondevs != 0) {
mutex_exit(&alldevs_lock);
kmem_free(osp, sizeof(device_t) * ondevs);
mutex_enter(&alldevs_lock);
}
}
KASSERT(mutex_owned(&alldevs_lock));
alldevs_nwrite--;
}
/*
* Put dev into the devices list.
*/
static void
config_devlink(device_t dev)
{
mutex_enter(&alldevs_lock);
KASSERT(device_cfdriver(dev)->cd_devs[dev->dv_unit] == dev);
dev->dv_add_gen = alldevs_gen;
/* It is safe to add a device to the tail of the list while
* readers and writers are in the list.
*/
TAILQ_INSERT_TAIL(&alldevs, dev, dv_list);
mutex_exit(&alldevs_lock);
}
static void
config_devfree(device_t dev)
{
KASSERT(dev->dv_flags & DVF_PRIV_ALLOC);
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
if (dev->dv_cfattach->ca_devsize > 0)
kmem_free(dev->dv_private, dev->dv_cfattach->ca_devsize);
kmem_free(dev, sizeof(*dev));
}
/*
* Caller must hold alldevs_lock.
*/
static void
config_devunlink(device_t dev, struct devicelist *garbage)
{
struct device_garbage *dg = &dev->dv_garbage;
cfdriver_t cd = device_cfdriver(dev);
int i;
KASSERT(mutex_owned(&alldevs_lock));
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
/* Unlink from device list. Link to garbage list. */
TAILQ_REMOVE(&alldevs, dev, dv_list);
TAILQ_INSERT_TAIL(garbage, dev, dv_list);
/* Remove from cfdriver's array. */
cd->cd_devs[dev->dv_unit] = NULL;
/*
* If the device now has no units in use, unlink its softc array.
*/
for (i = 0; i < cd->cd_ndevs; i++) {
if (cd->cd_devs[i] != NULL)
break;
}
/* Nothing found. Unlink, now. Deallocate, later. */
if (i == cd->cd_ndevs) {
dg->dg_ndevs = cd->cd_ndevs;
dg->dg_devs = cd->cd_devs;
cd->cd_devs = NULL;
cd->cd_ndevs = 0;
}
}
static void
config_devdelete(device_t dev)
{
struct device_garbage *dg = &dev->dv_garbage;
device_lock_t dvl = device_getlock(dev);
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
if (dg->dg_devs != NULL)
kmem_free(dg->dg_devs, sizeof(device_t) * dg->dg_ndevs);
localcount_fini(dev->dv_localcount);
kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount));
cv_destroy(&dvl->dvl_cv);
mutex_destroy(&dvl->dvl_mtx);
KASSERT(dev->dv_properties != NULL);
prop_object_release(dev->dv_properties);
if (dev->dv_activity_handlers)
panic("%s with registered handlers", __func__); if (dev->dv_locators) { size_t amount = *--dev->dv_locators;
kmem_free(dev->dv_locators, amount);
}
config_devfree(dev);
}
static int
config_unit_nextfree(cfdriver_t cd, cfdata_t cf)
{
int unit = cf->cf_unit;
KASSERT(mutex_owned(&alldevs_lock));
if (unit < 0)
return -1;
if (cf->cf_fstate == FSTATE_STAR) {
for (; unit < cd->cd_ndevs; unit++)
if (cd->cd_devs[unit] == NULL)
break;
/*
* unit is now the unit of the first NULL device pointer,
* or max(cd->cd_ndevs,cf->cf_unit).
*/
} else {
if (unit < cd->cd_ndevs && cd->cd_devs[unit] != NULL)
unit = -1;
}
return unit;
}
static int
config_unit_alloc(device_t dev, cfdriver_t cd, cfdata_t cf)
{
struct alldevs_foray af;
int unit;
config_alldevs_enter(&af);
for (;;) {
unit = config_unit_nextfree(cd, cf);
if (unit == -1)
break;
if (unit < cd->cd_ndevs) {
cd->cd_devs[unit] = dev;
dev->dv_unit = unit;
break;
}
config_makeroom(unit, cd);
}
config_alldevs_exit(&af);
return unit;
}
static device_t
config_devalloc(const device_t parent, const cfdata_t cf,
const struct cfargs_internal * const args)
{
cfdriver_t cd;
cfattach_t ca;
size_t lname, lunit;
const char *xunit;
int myunit;
char num[10];
device_t dev;
void *dev_private;
const struct cfiattrdata *ia;
device_lock_t dvl;
cd = config_cfdriver_lookup(cf->cf_name);
if (cd == NULL)
return NULL;
ca = config_cfattach_lookup_cd(cd, cf->cf_atname);
if (ca == NULL)
return NULL;
/* get memory for all device vars */
KASSERT(ca->ca_flags & DVF_PRIV_ALLOC);
if (ca->ca_devsize > 0) {
dev_private = kmem_zalloc(ca->ca_devsize, KM_SLEEP);
} else {
dev_private = NULL;
}
dev = kmem_zalloc(sizeof(*dev), KM_SLEEP);
dev->dv_handle = args->devhandle;
dev->dv_class = cd->cd_class;
dev->dv_cfdata = cf;
dev->dv_cfdriver = cd;
dev->dv_cfattach = ca;
dev->dv_activity_count = 0;
dev->dv_activity_handlers = NULL;
dev->dv_private = dev_private;
dev->dv_flags = ca->ca_flags; /* inherit flags from class */
dev->dv_attaching = curlwp;
myunit = config_unit_alloc(dev, cd, cf);
if (myunit == -1) {
config_devfree(dev);
return NULL;
}
/* compute length of name and decimal expansion of unit number */
lname = strlen(cd->cd_name);
xunit = number(&num[sizeof(num)], myunit);
lunit = &num[sizeof(num)] - xunit;
if (lname + lunit > sizeof(dev->dv_xname))
panic("config_devalloc: device name too long");
dvl = device_getlock(dev);
mutex_init(&dvl->dvl_mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&dvl->dvl_cv, "pmfsusp");
memcpy(dev->dv_xname, cd->cd_name, lname);
memcpy(dev->dv_xname + lname, xunit, lunit);
dev->dv_parent = parent;
if (parent != NULL)
dev->dv_depth = parent->dv_depth + 1;
else
dev->dv_depth = 0;
dev->dv_flags |= DVF_ACTIVE; /* always initially active */
if (args->locators) {
KASSERT(parent);	/* no locators at root */
ia = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver);
dev->dv_locators =
kmem_alloc(sizeof(int) * (ia->ci_loclen + 1), KM_SLEEP);
*dev->dv_locators++ = sizeof(int) * (ia->ci_loclen + 1);
memcpy(dev->dv_locators, args->locators,
sizeof(int) * ia->ci_loclen);
}
dev->dv_properties = prop_dictionary_create();
KASSERT(dev->dv_properties != NULL);
prop_dictionary_set_string_nocopy(dev->dv_properties,
"device-driver", dev->dv_cfdriver->cd_name);
prop_dictionary_set_uint16(dev->dv_properties,
"device-unit", dev->dv_unit);
if (parent != NULL) {
prop_dictionary_set_string(dev->dv_properties,
"device-parent", device_xname(parent));
}
dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount),
KM_SLEEP);
localcount_init(dev->dv_localcount);
if (dev->dv_cfdriver->cd_attrs != NULL)
config_add_attrib_dict(dev);
return dev;
}
/*
* Create an array of device attach attributes and add it
* to the device's dv_properties dictionary.
*
* <key>interface-attributes</key>
* <array>
* <dict>
* <key>attribute-name</key>
* <string>foo</string>
* <key>locators</key>
* <array>
* <dict>
* <key>loc-name</key>
* <string>foo-loc1</string>
* </dict>
* <dict>
* <key>loc-name</key>
* <string>foo-loc2</string>
* <key>default</key>
* <string>foo-loc2-default</string>
* </dict>
* ...
* </array>
* </dict>
* ...
* </array>
*/
static void
config_add_attrib_dict(device_t dev)
{
int i, j;
const struct cfiattrdata *ci;
prop_dictionary_t attr_dict, loc_dict;
prop_array_t attr_array, loc_array;
if ((attr_array = prop_array_create()) == NULL)
return;
for (i = 0; ; i++) {
if ((ci = dev->dv_cfdriver->cd_attrs[i]) == NULL)
break;
if ((attr_dict = prop_dictionary_create()) == NULL)
break;
prop_dictionary_set_string_nocopy(attr_dict, "attribute-name",
ci->ci_name);
/* Create an array of the locator names and defaults */
if (ci->ci_loclen != 0 &&
(loc_array = prop_array_create()) != NULL) {
for (j = 0; j < ci->ci_loclen; j++) {
loc_dict = prop_dictionary_create();
if (loc_dict == NULL)
continue;
prop_dictionary_set_string_nocopy(loc_dict,
"loc-name", ci->ci_locdesc[j].cld_name);
if (ci->ci_locdesc[j].cld_defaultstr != NULL)
prop_dictionary_set_string_nocopy(
loc_dict, "default",
ci->ci_locdesc[j].cld_defaultstr);
prop_array_set(loc_array, j, loc_dict);
prop_object_release(loc_dict);
}
prop_dictionary_set_and_rel(attr_dict, "locators",
loc_array);
}
prop_array_add(attr_array, attr_dict);
prop_object_release(attr_dict);
}
if (i == 0)
prop_object_release(attr_array);
else
prop_dictionary_set_and_rel(dev->dv_properties,
"interface-attributes", attr_array);
return;
}
/*
* Attach a found device.
*
* Returns the device referenced, to be released with device_release.
*/
static device_t
config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs_internal * const args)
{
device_t dev;
struct cftable *ct;
const char *drvname;
bool deferred;
KASSERT(KERNEL_LOCKED_P());
dev = config_devalloc(parent, cf, args);
if (!dev)
panic("config_attach: allocation of device softc failed");
/* XXX redundant - see below? */
if (cf->cf_fstate != FSTATE_STAR) {
KASSERT(cf->cf_fstate == FSTATE_NOTFOUND);
cf->cf_fstate = FSTATE_FOUND;
}
config_devlink(dev);
if (config_do_twiddle && cold)
twiddle();
else
aprint_naive("Found ");
/*
* We want the next two printfs for normal, verbose, and quiet,
* but not silent (in which case, we're twiddling, instead).
*/
if (parent == ROOT) {
aprint_naive("%s (root)", device_xname(dev));
aprint_normal("%s (root)", device_xname(dev));
} else {
aprint_naive("%s at %s", device_xname(dev),
device_xname(parent));
aprint_normal("%s at %s", device_xname(dev),
device_xname(parent));
if (print)
(void) (*print)(aux, NULL);
}
/*
* Before attaching, clobber any unfound devices that are
* otherwise identical.
* XXX code above is redundant?
*/
drvname = dev->dv_cfdriver->cd_name;
TAILQ_FOREACH(ct, &allcftables, ct_list) {
for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
if (STREQ(cf->cf_name, drvname) &&
cf->cf_unit == dev->dv_unit) {
if (cf->cf_fstate == FSTATE_NOTFOUND)
cf->cf_fstate = FSTATE_FOUND;
}
}
}
device_register(dev, aux);
/* Let userland know */
devmon_report_device(dev, true);
/*
* Prevent detach until the driver's attach function, and all
* deferred actions, have finished.
*/
config_pending_incr(dev);
/*
* Prevent concurrent detach from destroying the device_t until
* the caller has released the device.
*/
device_acquire(dev);
/* Call the driver's attach function. */
(*dev->dv_cfattach->ca_attach)(parent, dev, aux);
/*
* Allow other threads to acquire references to the device now
* that the driver's attach function is done.
*/
mutex_enter(&config_misc_lock);
KASSERT(dev->dv_attaching == curlwp);
dev->dv_attaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
/*
* Synchronous parts of attach are done. Allow detach, unless
* the driver's attach function scheduled deferred actions.
*/
config_pending_decr(dev);
mutex_enter(&config_misc_lock);
deferred = (dev->dv_pending != 0);
mutex_exit(&config_misc_lock);
if (!deferred && !device_pmf_is_registered(dev))
aprint_debug_dev(dev,
"WARNING: power management not supported\n");
config_process_deferred(&deferred_config_queue, dev);
device_register_post_config(dev, aux);
rnd_add_uint32(&rnd_autoconf_source, 0);
return dev;
}
device_t
config_attach_acquire(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs *cfargs)
{
struct cfargs_internal store;
device_t dev;
KERNEL_LOCK(1, NULL);
dev = config_attach_internal(parent, cf, aux, print,
cfargs_canonicalize(cfargs, &store));
KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_attach(parent, cf, aux, print, cfargs)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_attach_acquire with a matching device_release once they
* have finished with the returned device_t.
*/
device_t
config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs *cfargs)
{
device_t dev;
KASSERT(KERNEL_LOCKED_P());
dev = config_attach_acquire(parent, cf, aux, print, cfargs);
if (dev == NULL)
return NULL;
device_release(dev);
return dev;
}
/*
* As above, but for pseudo-devices. Pseudo-devices attached in this
* way are silently inserted into the device tree, and their children
* attached.
*
* Note that because pseudo-devices are attached silently, any information
* the attach routine wishes to print should be prefixed with the device
* name by the attach routine.
*/
device_t
config_attach_pseudo_acquire(cfdata_t cf, void *aux)
{
device_t dev;
KERNEL_LOCK(1, NULL);
struct cfargs_internal args = { };
dev = config_devalloc(ROOT, cf, &args);
if (!dev)
goto out;
/* XXX mark busy in cfdata */
if (cf->cf_fstate != FSTATE_STAR) {
KASSERT(cf->cf_fstate == FSTATE_NOTFOUND);
cf->cf_fstate = FSTATE_FOUND;
}
config_devlink(dev);
#if 0 /* XXXJRT not yet */
device_register(dev, NULL); /* like a root node */
#endif
/* Let userland know */
devmon_report_device(dev, true);
/*
* Prevent detach until the driver's attach function, and all
* deferred actions, have finished.
*/
config_pending_incr(dev);
/*
* Prevent concurrent detach from destroying the device_t until
* the caller has released the device.
*/
device_acquire(dev);
/* Call the driver's attach function. */
(*dev->dv_cfattach->ca_attach)(ROOT, dev, aux);
/*
* Allow other threads to acquire references to the device now
* that the driver's attach function is done.
*/
mutex_enter(&config_misc_lock);
KASSERT(dev->dv_attaching == curlwp);
dev->dv_attaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
/*
* Synchronous parts of attach are done. Allow detach, unless
* the driver's attach function scheduled deferred actions.
*/
config_pending_decr(dev);
config_process_deferred(&deferred_config_queue, dev);
out: KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_attach_pseudo(cf)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_attach_pseudo_acquire with a matching device_release
* once they have finished with the returned device_t. As a
* bonus, config_attach_pseudo_acquire can pass a non-null aux
* argument into the driver's attach routine.
*/
device_t
config_attach_pseudo(cfdata_t cf)
{
device_t dev;
dev = config_attach_pseudo_acquire(cf, NULL);
if (dev == NULL)
return dev;
device_release(dev);
return dev;
}
/*
* Caller must hold alldevs_lock.
*/
static void
config_collect_garbage(struct devicelist *garbage)
{
device_t dv;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(mutex_owned(&alldevs_lock));
while (alldevs_nwrite == 0 && alldevs_nread == 0 && alldevs_garbage) {
TAILQ_FOREACH(dv, &alldevs, dv_list) {
if (dv->dv_del_gen != 0)
break;
}
if (dv == NULL) {
alldevs_garbage = false;
break;
}
config_devunlink(dv, garbage);
}
KASSERT(mutex_owned(&alldevs_lock));
}
static void
config_dump_garbage(struct devicelist *garbage)
{
device_t dv;
while ((dv = TAILQ_FIRST(garbage)) != NULL) {
TAILQ_REMOVE(garbage, dv, dv_list);
config_devdelete(dv);
}
}
static int
config_detach_enter(device_t dev)
{
struct lwp *l __diagused;
int error = 0;
mutex_enter(&config_misc_lock);
/*
* Wait until attach has fully completed, and until any
* concurrent detach (e.g., drvctl racing with USB event
* thread) has completed.
*
* Caller must hold alldevs_nread or alldevs_nwrite (e.g., via
* deviter) to ensure the winner of the race doesn't free the
* device leading the loser of the race into use-after-free.
*
* XXX Not all callers do this!
*/
while (dev->dv_pending || dev->dv_detaching) {
KASSERTMSG(dev->dv_detaching != curlwp,
"recursively detaching %s", device_xname(dev));
error = cv_wait_sig(&config_misc_cv, &config_misc_lock);
if (error)
goto out;
}
/*
* Attach has completed, and no other concurrent detach is
* running. Claim the device for detaching. This will cause
* all new attempts to acquire references to block.
*/
KASSERTMSG((l = dev->dv_attaching) == NULL,
"lwp %ld [%s] @ %p attaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == NULL,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detaching = curlwp;
out: mutex_exit(&config_misc_lock);
return error;
}
static void
config_detach_exit(device_t dev)
{
struct lwp *l __diagused;
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_detaching != NULL, "not detaching %s",
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == curlwp,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
}
/*
* Detach a device. Optionally forced (e.g. because of hardware
* removal) and quiet. Returns zero if successful, non-zero
* (an error code) otherwise.
*
* Note that this code wants to be run from a process context, so
* that the detach can sleep to allow processes which have a device
* open to run and unwind their stacks.
*
* Caller must hold a reference with device_acquire or
* device_lookup_acquire.
*/
int
config_detach_release(device_t dev, int flags)
{
struct alldevs_foray af;
struct cftable *ct;
cfdata_t cf;
const struct cfattach *ca;
struct cfdriver *cd;
device_t d __diagused;
int rv = 0;
KERNEL_LOCK(1, NULL);
cf = dev->dv_cfdata;
KASSERTMSG((cf == NULL || cf->cf_fstate == FSTATE_FOUND ||
cf->cf_fstate == FSTATE_STAR),
"config_detach: %s: bad device fstate: %d",
device_xname(dev), cf ? cf->cf_fstate : -1);
cd = dev->dv_cfdriver;
KASSERT(cd != NULL);
ca = dev->dv_cfattach;
KASSERT(ca != NULL);
/*
* Only one detach at a time, please -- and not until fully
* attached.
*/
rv = config_detach_enter(dev);
device_release(dev);
if (rv) {
KERNEL_UNLOCK_ONE(NULL);
return rv;
}
mutex_enter(&alldevs_lock);
if (dev->dv_del_gen != 0) {
mutex_exit(&alldevs_lock);
#ifdef DIAGNOSTIC
printf("%s: %s is already detached\n", __func__,
device_xname(dev));
#endif /* DIAGNOSTIC */
config_detach_exit(dev);
KERNEL_UNLOCK_ONE(NULL);
return ENOENT;
}
alldevs_nwrite++;
mutex_exit(&alldevs_lock);
/*
* Call the driver's .ca_detach function, unless it has none or
* we are skipping it because it's unforced shutdown time and
* the driver didn't ask to detach on shutdown.
*/
if (!detachall &&
(flags & (DETACH_SHUTDOWN|DETACH_FORCE)) == DETACH_SHUTDOWN &&
(dev->dv_flags & DVF_DETACH_SHUTDOWN) == 0) {
rv = EOPNOTSUPP;
} else if (ca->ca_detach != NULL) {
rv = (*ca->ca_detach)(dev, flags);
} else
rv = EOPNOTSUPP;
KASSERTMSG(!dev->dv_detach_done, "%s detached twice, error=%d",
device_xname(dev), rv);
/*
* If it was not possible to detach the device, then we either
* panic() (for the forced but failed case), or return an error.
*/
if (rv) {
/*
* Detach failed -- likely EOPNOTSUPP or EBUSY. Driver
* must not have called config_detach_commit.
*/
KASSERTMSG(!dev->dv_detach_committed,
"%s committed to detaching and then backed out, error=%d",
device_xname(dev), rv);
if (flags & DETACH_FORCE) {
panic("config_detach: forced detach of %s failed (%d)",
device_xname(dev), rv);
}
goto out;
}
/*
* The device has now been successfully detached.
*/
dev->dv_detach_done = true;
/*
* If .ca_detach didn't commit to detach, then do that for it.
* This wakes any pending device_lookup_acquire calls so they
* will fail.
*/
config_detach_commit(dev);
/*
* If it was possible to detach the device, ensure that the
* device is deactivated.
*/
dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */
/*
* Wait for all device_lookup_acquire references -- mostly, for
* all attempts to open the device -- to drain. It is the
* responsibility of .ca_detach to ensure anything with open
* references will be interrupted and release them promptly,
* not block indefinitely. All new attempts to acquire
* references will fail, as config_detach_commit has arranged
* by now.
*/
mutex_enter(&config_misc_lock);
localcount_drain(dev->dv_localcount,
&config_misc_cv, &config_misc_lock);
mutex_exit(&config_misc_lock);
/* Let userland know */
devmon_report_device(dev, false);
#ifdef DIAGNOSTIC
/*
* Sanity: If you're successfully detached, you should have no
* children. (Note that because children must be attached
* after parents, we only need to search the latter part of
* the list.)
*/
mutex_enter(&alldevs_lock);
for (d = TAILQ_NEXT(dev, dv_list); d != NULL;
d = TAILQ_NEXT(d, dv_list)) {
if (d->dv_parent == dev && d->dv_del_gen == 0) {
printf("config_detach: detached device %s"
" has children %s\n", device_xname(dev),
device_xname(d));
panic("config_detach");
}
}
mutex_exit(&alldevs_lock);
#endif
/* notify the parent that the child is gone */
if (dev->dv_parent) {
device_t p = dev->dv_parent;
if (p->dv_cfattach->ca_childdetached)
(*p->dv_cfattach->ca_childdetached)(p, dev);
}
/*
* Mark cfdata to show that the unit can be reused, if possible.
*/
TAILQ_FOREACH(ct, &allcftables, ct_list) {
for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
if (STREQ(cf->cf_name, cd->cd_name)) {
if (cf->cf_fstate == FSTATE_FOUND &&
cf->cf_unit == dev->dv_unit)
cf->cf_fstate = FSTATE_NOTFOUND;
}
}
}
if (dev->dv_cfdata != NULL && (flags & DETACH_QUIET) == 0)
aprint_normal_dev(dev, "detached\n");
out:
config_detach_exit(dev);
config_alldevs_enter(&af);
KASSERT(alldevs_nwrite != 0);
--alldevs_nwrite;
if (rv == 0 && dev->dv_del_gen == 0) {
if (alldevs_nwrite == 0 && alldevs_nread == 0)
config_devunlink(dev, &af.af_garbage);
else {
dev->dv_del_gen = alldevs_gen;
alldevs_garbage = true;
}
}
config_alldevs_exit(&af);
KERNEL_UNLOCK_ONE(NULL);
return rv;
}
/*
* config_detach(dev, flags)
*
* Legacy entry point for callers that have not acquired a
* reference to dev.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should be converted to use device_acquire under a lock
* taken also by .ca_childdetached to synchronize access to the
* device_t, and then config_detach_release outside the lock.
* Alternatively, most drivers detach children only in their own
* detach routines, which can be done with config_detach_children
* instead.
*/
int
config_detach(device_t dev, int flags)
{
device_acquire(dev);
return config_detach_release(dev, flags);
}
/*
* config_detach_commit(dev)
*
* Issued by a driver's .ca_detach routine to notify anyone
* waiting in device_lookup_acquire that the driver is committed
* to detaching the device, which allows device_lookup_acquire to
* wake up and fail immediately.
*
* Safe to call multiple times -- idempotent. Must be called
* during config_detach_enter/exit. Safe to use with
* device_lookup because the device is not actually removed from
* the table until after config_detach_exit.
*/
void
config_detach_commit(device_t dev)
{
struct lwp *l __diagused;
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_detaching != NULL, "not detaching %s",
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == curlwp,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detach_committed = true;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
}
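/*
 * Illustrative sketch of the intended call site: a driver's .ca_detach
 * routine calls this once it has passed the point of no return, before
 * revoking open instances, so that concurrent device_lookup_acquire
 * callers fail instead of waiting for the detach to finish.  The function
 * name below is hypothetical:
 *
 *	static int
 *	mydrv_detach(device_t self, int flags)
 *	{
 *		...decide that the detach will proceed...
 *		config_detach_commit(self);
 *		...revoke device nodes, e.g. with vdevgone(9)...
 *		return 0;
 *	}
 */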
int
config_detach_children(device_t parent, int flags)
{
device_t dv;
deviter_t di;
int error = 0;
KASSERT(KERNEL_LOCKED_P());
for (dv = deviter_first(&di, DEVITER_F_RW); dv != NULL;
dv = deviter_next(&di)) {
if (device_parent(dv) != parent)
continue;
if ((error = config_detach(dv, flags)) != 0)
break;
}
deviter_release(&di);
return error;
}
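/*
 * Illustrative sketch: most bus drivers detach their children from their
 * own detach routine before releasing bus resources.  The "mybus" name is
 * hypothetical:
 *
 *	static int
 *	mybus_detach(device_t self, int flags)
 *	{
 *		int error;
 *
 *		if ((error = config_detach_children(self, flags)) != 0)
 *			return error;
 *		...unmap registers, disestablish interrupts...
 *		return 0;
 *	}
 */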
device_t
shutdown_first(struct shutdown_state *s)
{
if (!s->initialized) {
deviter_init(&s->di, DEVITER_F_SHUTDOWN|DEVITER_F_LEAVES_FIRST);
s->initialized = true;
}
return shutdown_next(s);
}
device_t
shutdown_next(struct shutdown_state *s)
{
device_t dv;
while ((dv = deviter_next(&s->di)) != NULL && !device_is_active(dv))
;
if (dv == NULL)
s->initialized = false;
return dv;
}
bool
config_detach_all(int how)
{
static struct shutdown_state s;
device_t curdev;
bool progress = false;
int flags;
KERNEL_LOCK(1, NULL);
if ((how & (RB_NOSYNC|RB_DUMP)) != 0)
goto out;
if ((how & RB_POWERDOWN) == RB_POWERDOWN)
flags = DETACH_SHUTDOWN | DETACH_POWEROFF;
else
flags = DETACH_SHUTDOWN;
for (curdev = shutdown_first(&s); curdev != NULL;
curdev = shutdown_next(&s)) {
aprint_debug(" detaching %s, ", device_xname(curdev));
if (config_detach(curdev, flags) == 0) {
progress = true;
aprint_debug("success.");
} else
aprint_debug("failed.");
}
out: KERNEL_UNLOCK_ONE(NULL);
return progress;
}
static bool
device_is_ancestor_of(device_t ancestor, device_t descendant)
{
device_t dv;
for (dv = descendant; dv != NULL; dv = device_parent(dv)) {
if (device_parent(dv) == ancestor)
return true;
}
return false;
}
int
config_deactivate(device_t dev)
{
deviter_t di;
const struct cfattach *ca;
device_t descendant;
int s, rv = 0, oflags;
for (descendant = deviter_first(&di, DEVITER_F_ROOT_FIRST);
descendant != NULL;
descendant = deviter_next(&di)) {
if (dev != descendant &&
!device_is_ancestor_of(dev, descendant))
continue;
if ((descendant->dv_flags & DVF_ACTIVE) == 0)
continue;
ca = descendant->dv_cfattach;
oflags = descendant->dv_flags;
descendant->dv_flags &= ~DVF_ACTIVE;
if (ca->ca_activate == NULL)
continue;
s = splhigh();
rv = (*ca->ca_activate)(descendant, DVACT_DEACTIVATE);
splx(s);
if (rv != 0)
descendant->dv_flags = oflags;
}
deviter_release(&di);
return rv;
}
/*
* Defer the configuration of the specified device until all
* of its parent's devices have been attached.
*/
void
config_defer(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
if (dev->dv_parent == NULL)
panic("config_defer: can't defer config of a root device");
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
config_pending_incr(dev);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &deferred_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("config_defer: deferred twice");
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&deferred_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
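/*
 * Illustrative sketch: a child whose configuration depends on siblings its
 * parent has not attached yet defers the rest of its attach.  The "mydrv"
 * names are hypothetical:
 *
 *	void
 *	mydrv_attach(device_t parent, device_t self, void *aux)
 *	{
 *		...early, order-independent setup...
 *		config_defer(self, mydrv_attach_deferred);
 *	}
 */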
/*
* Defer some autoconfiguration for a device until after interrupts
* are enabled.
*/
void
config_interrupts(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
/*
* If interrupts are enabled, callback now.
*/
if (cold == 0) {
(*func)(dev);
return;
}
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
config_pending_incr(dev);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &interrupt_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("config_interrupts: deferred twice");
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&interrupt_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
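/*
 * Illustrative sketch: work that needs working interrupts (e.g. talking to
 * firmware) is pushed past cold boot.  The "mydrv" names are hypothetical:
 *
 *	void
 *	mydrv_attach(device_t parent, device_t self, void *aux)
 *	{
 *		...register access, softc setup...
 *		config_interrupts(self, mydrv_attach_intr);
 *	}
 */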
/*
* Defer some autoconfiguration for a device until after root file system
* is mounted (to load firmware etc).
*/
void
config_mountroot(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
/*
* If root file system is mounted, callback now.
*/
if (root_is_mounted) {
(*func)(dev);
return;
}
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &mountroot_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("%s: deferred twice", __func__);
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&mountroot_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
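/*
 * Illustrative sketch: drivers that must load firmware from the root file
 * system defer that step here.  The "mydrv" names are hypothetical:
 *
 *	void
 *	mydrv_attach(device_t parent, device_t self, void *aux)
 *	{
 *		...hardware-independent setup...
 *		config_mountroot(self, mydrv_load_firmware);
 *	}
 */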
/*
* Process a deferred configuration queue.
*/
static void
config_process_deferred(struct deferred_config_head *queue, device_t parent)
{
struct deferred_config *dc;
KASSERT(KERNEL_LOCKED_P());
mutex_enter(&config_misc_lock);
dc = TAILQ_FIRST(queue);
while (dc) {
if (parent == NULL || dc->dc_dev->dv_parent == parent) {
TAILQ_REMOVE(queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
(*dc->dc_func)(dc->dc_dev);
config_pending_decr(dc->dc_dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
/* Restart, queue might have changed */
dc = TAILQ_FIRST(queue);
} else {
dc = TAILQ_NEXT(dc, dc_queue);
}
}
mutex_exit(&config_misc_lock);
}
/*
* Manipulate the config_pending semaphore.
*/
void
config_pending_incr(device_t dev)
{
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_pending < INT_MAX,
"%s: excess config_pending_incr", device_xname(dev));
if (dev->dv_pending++ == 0)
TAILQ_INSERT_TAIL(&config_pending, dev, dv_pending_list);
#ifdef DEBUG_AUTOCONF
printf("%s: %s %d\n", __func__, device_xname(dev), dev->dv_pending);
#endif
mutex_exit(&config_misc_lock);
}
void
config_pending_decr(device_t dev)
{
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_pending > 0,
"%s: excess config_pending_decr", device_xname(dev));
if (--dev->dv_pending == 0) {
TAILQ_REMOVE(&config_pending, dev, dv_pending_list);
cv_broadcast(&config_misc_cv);
}
#ifdef DEBUG_AUTOCONF
printf("%s: %s %d\n", __func__, device_xname(dev), dev->dv_pending);
#endif
mutex_exit(&config_misc_lock);
}
/*
* Register a "finalization" routine. Finalization routines are
* called iteratively once all real devices have been found during
* autoconfiguration, for as long as any one finalizer has done
* any work.
*/
int
config_finalize_register(device_t dev, int (*fn)(device_t))
{
struct finalize_hook *f;
int error = 0;
KERNEL_LOCK(1, NULL);
/*
* If finalization has already been done, invoke the
* callback function now.
*/
if (config_finalize_done) {
while ((*fn)(dev) != 0)
/* loop */ ;
goto out;
}
/* Ensure this isn't already on the list. */
TAILQ_FOREACH(f, &config_finalize_list, f_list) {
if (f->f_func == fn && f->f_dev == dev) {
error = EEXIST;
goto out;
}
}
f = kmem_alloc(sizeof(*f), KM_SLEEP);
f->f_func = fn;
f->f_dev = dev;
TAILQ_INSERT_TAIL(&config_finalize_list, f, f_list);
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
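/*
 * Illustrative sketch of a finalizer: the callback returns nonzero for as
 * long as it still does work, and zero once it has nothing left to do.
 * The "mydrv" names are hypothetical:
 *
 *	static int
 *	mydrv_finalize(device_t self)
 *	{
 *		if (...nothing left to resolve...)
 *			return 0;
 *		...late binding work...
 *		return 1;
 *	}
 *
 *	config_finalize_register(self, mydrv_finalize);
 */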
void
config_finalize(void)
{
struct finalize_hook *f;
struct pdevinit *pdev;
extern struct pdevinit pdevinit[];
unsigned t0 = getticks();
int errcnt, rv;
/*
* Now that device driver threads have been created, wait for
* them to finish any deferred autoconfiguration.
*/
mutex_enter(&config_misc_lock);
while (!TAILQ_EMPTY(&config_pending)) {
const unsigned t1 = getticks();
if (t1 - t0 >= hz) {
void (*pr)(const char *, ...) __printflike(1,2);
device_t dev;
if (t1 - t0 >= 60*hz) {
pr = aprint_normal;
t0 = t1;
} else {
pr = aprint_debug;
}
(*pr)("waiting for devices:");
TAILQ_FOREACH(dev, &config_pending, dv_pending_list)
(*pr)(" %s", device_xname(dev));
(*pr)("\n");
}
(void)cv_timedwait(&config_misc_cv, &config_misc_lock,
mstohz(1000));
}
mutex_exit(&config_misc_lock);
KERNEL_LOCK(1, NULL);
/* Attach pseudo-devices. */
for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++)
(*pdev->pdev_attach)(pdev->pdev_count);
/* Run the hooks until none of them does any work. */
do {
rv = 0;
TAILQ_FOREACH(f, &config_finalize_list, f_list)
rv |= (*f->f_func)(f->f_dev);
} while (rv != 0);
config_finalize_done = 1;
/* Now free all the hooks. */
while ((f = TAILQ_FIRST(&config_finalize_list)) != NULL) {
TAILQ_REMOVE(&config_finalize_list, f, f_list);
kmem_free(f, sizeof(*f));
}
KERNEL_UNLOCK_ONE(NULL);
errcnt = aprint_get_error_count();
if ((boothowto & (AB_QUIET|AB_SILENT)) != 0 &&
(boothowto & AB_VERBOSE) == 0) {
mutex_enter(&config_misc_lock);
if (config_do_twiddle) {
config_do_twiddle = 0;
printf_nolog(" done.\n");
}
mutex_exit(&config_misc_lock);
}
if (errcnt != 0) {
printf("WARNING: %d error%s while detecting hardware; "
"check system log.\n", errcnt,
errcnt == 1 ? "" : "s");
}
}
void
config_twiddle_init(void)
{
if ((boothowto & (AB_SILENT|AB_VERBOSE)) == AB_SILENT) {
config_do_twiddle = 1;
}
callout_setfunc(&config_twiddle_ch, config_twiddle_fn, NULL);
}
void
config_twiddle_fn(void *cookie)
{
mutex_enter(&config_misc_lock);
if (config_do_twiddle) {
twiddle();
callout_schedule(&config_twiddle_ch, mstohz(100));
}
mutex_exit(&config_misc_lock);
}
static void
config_alldevs_enter(struct alldevs_foray *af)
{
TAILQ_INIT(&af->af_garbage);
mutex_enter(&alldevs_lock);
config_collect_garbage(&af->af_garbage);
}
static void
config_alldevs_exit(struct alldevs_foray *af)
{
mutex_exit(&alldevs_lock);
config_dump_garbage(&af->af_garbage);
}
/*
* device_lookup:
*
* Look up a device instance for a given driver.
*
* Caller is responsible for ensuring the device's state is
* stable, either by holding a reference already obtained with
* device_lookup_acquire or by otherwise ensuring the device is
* attached and can't be detached (e.g., holding an open device
* node and ensuring *_detach calls vdevgone).
*
* XXX Find a way to assert this.
*
* Safe for use up to and including interrupt context at IPL_VM.
* Never sleeps.
*/
device_t
device_lookup(cfdriver_t cd, int unit)
{
device_t dv;
mutex_enter(&alldevs_lock);
if (unit < 0 || unit >= cd->cd_ndevs)
dv = NULL;
else if ((dv = cd->cd_devs[unit]) != NULL && dv->dv_del_gen != 0)
dv = NULL;
mutex_exit(&alldevs_lock);
return dv;
}
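/*
 * Illustrative sketch: character-device entry points commonly map a minor
 * number to a device instance this way.  The "mydrv_cd" cfdriver name is
 * hypothetical (config(1) generates such externs as <driver>_cd):
 *
 *	extern struct cfdriver mydrv_cd;
 *
 *	device_t dv = device_lookup(&mydrv_cd, minor(devno));
 *	if (dv == NULL)
 *		return ENXIO;
 */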
/*
* device_lookup_private:
*
* Look up a softc instance for a given driver.
*/
void *
device_lookup_private(cfdriver_t cd, int unit)
{
return device_private(device_lookup(cd, unit));
}
/*
* device_lookup_acquire:
*
* Look up a device instance for a given driver, and return a
* reference to it that must be released by device_release.
*
* => If the device is still attaching, blocks until *_attach has
* returned.
*
* => If the device is detaching, blocks until *_detach has
* returned. May succeed or fail in that case, depending on
* whether *_detach has backed out (EBUSY) or committed to
* detaching.
*
* May sleep.
*/
device_t
device_lookup_acquire(cfdriver_t cd, int unit)
{
device_t dv;
ASSERT_SLEEPABLE();
/* XXX This should have a pserialized fast path -- TBD. */
mutex_enter(&config_misc_lock);
mutex_enter(&alldevs_lock);
retry: if (unit < 0 || unit >= cd->cd_ndevs ||
(dv = cd->cd_devs[unit]) == NULL ||
dv->dv_del_gen != 0 ||
dv->dv_detach_committed) {
dv = NULL;
} else {
/*
* Wait for the device to stabilize, if attaching or
* detaching. Either way we must wait for *_attach or
* *_detach to complete, and either way we must retry:
* even if detaching, *_detach might fail (EBUSY) so
* the device may still be there.
*/
if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) ||
dv->dv_detaching != NULL) {
mutex_exit(&alldevs_lock);
cv_wait(&config_misc_cv, &config_misc_lock);
mutex_enter(&alldevs_lock);
goto retry;
}
device_acquire(dv);
}
mutex_exit(&alldevs_lock);
mutex_exit(&config_misc_lock);
return dv;
}
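/*
 * Illustrative sketch: an open routine that needs the device_t to stay
 * valid across a sleep takes a reference here and drops it with
 * device_release() when done.  The "mydrv_cd" name is hypothetical:
 *
 *	device_t dv = device_lookup_acquire(&mydrv_cd, minor(devno));
 *	if (dv == NULL)
 *		return ENXIO;
 *	...use the device, possibly sleeping...
 *	device_release(dv);
 */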
/*
* device_acquire:
*
* Acquire a reference to a device. It is the caller's
* responsibility to ensure that the device's .ca_detach routine
* cannot return before calling this. Caller must release the
* reference with device_release or config_detach_release.
*/
void
device_acquire(device_t dv)
{
/*
* No lock because the caller has promised that this can't
* change concurrently with device_acquire.
*/
KASSERTMSG(!dv->dv_detach_done, "%s",
dv == NULL ? "(null)" : device_xname(dv));
localcount_acquire(dv->dv_localcount);
}
/*
* device_release:
*
* Release a reference to a device acquired with device_acquire or
* device_lookup_acquire.
*/
void
device_release(device_t dv)
{
localcount_release(dv->dv_localcount,
&config_misc_cv, &config_misc_lock);
}
/*
* device_find_by_xname:
*
* Returns the device of the given name or NULL if it doesn't exist.
*/
device_t
device_find_by_xname(const char *name)
{
device_t dv;
deviter_t di;
for (dv = deviter_first(&di, 0); dv != NULL; dv = deviter_next(&di)) {
if (strcmp(device_xname(dv), name) == 0)
break;
}
deviter_release(&di);
return dv;
}
/*
* device_find_by_driver_unit:
*
* Returns the device of the given driver name and unit or
* NULL if it doesn't exist.
*/
device_t
device_find_by_driver_unit(const char *name, int unit)
{
struct cfdriver *cd;
if ((cd = config_cfdriver_lookup(name)) == NULL)
return NULL;
return device_lookup(cd, unit);
}
static bool
match_strcmp(const char * const s1, const char * const s2)
{
return strcmp(s1, s2) == 0;
}
static bool
match_pmatch(const char * const s1, const char * const s2)
{
return pmatch(s1, s2, NULL) == 2;
}
static bool
strarray_match_internal(const char ** const strings,
unsigned int const nstrings, const char * const str,
unsigned int * const indexp,
bool (*match_fn)(const char *, const char *))
{
unsigned int i;
if (strings == NULL || nstrings == 0) {
return false;
}
for (i = 0; i < nstrings; i++) {
if ((*match_fn)(strings[i], str)) {
*indexp = i;
return true;
}
}
return false;
}
static int
strarray_match(const char ** const strings, unsigned int const nstrings,
const char * const str)
{
unsigned int idx;
if (strarray_match_internal(strings, nstrings, str, &idx,
match_strcmp)) {
return (int)(nstrings - idx);
}
return 0;
}
static int
strarray_pmatch(const char ** const strings, unsigned int const nstrings,
const char * const pattern)
{
unsigned int idx;
if (strarray_match_internal(strings, nstrings, pattern, &idx,
match_pmatch)) {
return (int)(nstrings - idx);
}
return 0;
}
static int
device_compatible_match_strarray_internal(
const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp,
int (*match_fn)(const char **, unsigned int, const char *))
{
const struct device_compatible_entry *dce = NULL;
int rv;
if (ndevice_compats == 0 || device_compats == NULL ||
driver_compats == NULL)
return 0;
for (dce = driver_compats; dce->compat != NULL; dce++) {
rv = (*match_fn)(device_compats, ndevice_compats, dce->compat);
if (rv != 0) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return rv;
}
}
return 0;
}
/*
* device_compatible_match:
*
* Match a driver's "compatible" data against a device's
* "compatible" strings. Returns resulted weighted by
* which device "compatible" string was matched.
*/
int
device_compatible_match(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, NULL, strarray_match);
}
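/*
 * Illustrative sketch: a driver keeps a table of the "compatible" strings
 * it services and scores a candidate device against it in its match
 * routine.  The strings below and the assumption that DEVICE_COMPAT_EOL
 * from <sys/device.h> terminates the table are illustrative:
 *
 *	static const struct device_compatible_entry compat_data[] = {
 *		{ .compat = "acme,widget-v2" },
 *		{ .compat = "acme,widget" },
 *		DEVICE_COMPAT_EOL
 *	};
 *
 *	return device_compatible_match(device_compats, ndevice_compats,
 *	    compat_data);
 */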
/*
* device_compatible_pmatch:
*
* Like device_compatible_match(), but uses pmatch(9) to compare
* the device "compatible" strings against patterns in the
* driver's "compatible" data.
*/
int
device_compatible_pmatch(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, NULL, strarray_pmatch);
}
static int
device_compatible_match_strlist_internal(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp,
int (*match_fn)(const char *, size_t, const char *))
{
const struct device_compatible_entry *dce = NULL;
int rv;
if (device_compats == NULL || device_compatsize == 0 ||
driver_compats == NULL)
return 0;
for (dce = driver_compats; dce->compat != NULL; dce++) {
rv = (*match_fn)(device_compats, device_compatsize,
dce->compat);
if (rv != 0) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return rv;
}
}
return 0;
}
/*
* device_compatible_match_strlist:
*
* Like device_compatible_match(), but take the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
int
device_compatible_match_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, NULL, strlist_match);
}
/*
* device_compatible_pmatch_strlist:
*
* Like device_compatible_pmatch(), but take the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
int
device_compatible_pmatch_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, NULL, strlist_pmatch);
}
static int
device_compatible_match_id_internal(
uintptr_t const id, uintptr_t const mask, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp)
{
const struct device_compatible_entry *dce = NULL;
if (mask == 0)
return 0;
for (dce = driver_compats; dce->id != sentinel_id; dce++) {
if ((id & mask) == dce->id) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return 1;
}
}
return 0;
}
/*
* device_compatible_match_id:
*
* Like device_compatible_match(), but takes a single
* unsigned integer device ID.
*/
int
device_compatible_match_id(
uintptr_t const id, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_id_internal(id, (uintptr_t)-1,
sentinel_id, driver_compats, NULL);
}
/*
* device_compatible_lookup:
*
* Look up and return the device_compatible_entry, using the
* same matching criteria used by device_compatible_match().
*/
const struct device_compatible_entry *
device_compatible_lookup(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, &dce, strarray_match)) {
return dce;
}
return NULL;
}
/*
* device_compatible_plookup:
*
* Look up and return the device_compatible_entry, using the
* same matching criteria used by device_compatible_pmatch().
*/
const struct device_compatible_entry *
device_compatible_plookup(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, &dce, strarray_pmatch)) {
return dce;
}
return NULL;
}
/*
* device_compatible_lookup_strlist:
*
* Like device_compatible_lookup(), but takes the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
const struct device_compatible_entry *
device_compatible_lookup_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, &dce, strlist_match)) {
return dce;
}
return NULL;
}
/*
* device_compatible_plookup_strlist:
*
* Like device_compatible_plookup(), but takes the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
const struct device_compatible_entry *
device_compatible_plookup_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, &dce, strlist_pmatch)) {
return dce;
}
return NULL;
}
/*
* device_compatible_lookup_id:
*
* Like device_compatible_lookup(), but takes a single
* unsigned integer device ID.
*/
const struct device_compatible_entry *
device_compatible_lookup_id(
uintptr_t const id, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_id_internal(id, (uintptr_t)-1,
sentinel_id, driver_compats, &dce)) {
return dce;
}
return NULL;
}
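/*
 * Example (illustrative sketch only): using device_compatible_lookup()
 * to fetch per-entry driver data once a match is known.  The "acme"
 * strings, the flag value and the mydrv_* names are hypothetical.
 */
#if 0
static const struct device_compatible_entry mydrv_lookup_data[] = {
	{ .compat = "acme,widget",	.value = 0 },
	{ .compat = "acme,widget-v2",	.value = 1 },	/* e.g. "has new FIFO" */
	DEVICE_COMPAT_EOL
};

static uintptr_t
mydrv_flags_example(const char **compats, int ncompats)
{
	const struct device_compatible_entry *dce;

	dce = device_compatible_lookup(compats, ncompats, mydrv_lookup_data);
	return dce != NULL ? dce->value : 0;
}
#endif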
/*
* Power management related functions.
*/
bool
device_pmf_is_registered(device_t dev)
{
return (dev->dv_flags & DVF_POWER_HANDLERS) != 0;
}
bool
device_pmf_driver_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0)
return true;
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER &&
dev->dv_driver_suspend != NULL &&
!(*dev->dv_driver_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_DRIVER_SUSPENDED;
return true;
}
bool
device_pmf_driver_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0)
return true;
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER &&
dev->dv_driver_resume != NULL &&
!(*dev->dv_driver_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_DRIVER_SUSPENDED;
return true;
}
bool
device_pmf_driver_shutdown(device_t dev, int how)
{
if (*dev->dv_driver_shutdown != NULL &&
!(*dev->dv_driver_shutdown)(dev, how))
return false;
return true;
}
void
device_pmf_driver_register(device_t dev,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
bool (*shutdown)(device_t, int))
{
dev->dv_driver_suspend = suspend;
dev->dv_driver_resume = resume;
dev->dv_driver_shutdown = shutdown;
dev->dv_flags |= DVF_POWER_HANDLERS;
}
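/*
 * Example (illustrative sketch only): drivers normally reach this
 * helper indirectly through pmf_device_register(9) from their attach
 * routine.  The mydrv_* functions below are hypothetical.
 */
#if 0
static bool
mydrv_suspend(device_t self, const pmf_qual_t *qual)
{
	/* Quiesce the hardware here; return false to veto the suspend. */
	return true;
}

static bool
mydrv_resume(device_t self, const pmf_qual_t *qual)
{
	/* Re-initialize the hardware here. */
	return true;
}

static void
mydrv_attach_pmf_example(device_t self)
{
	if (!pmf_device_register(self, mydrv_suspend, mydrv_resume))
		aprint_error_dev(self, "couldn't establish power handler\n");
}
#endif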
void
device_pmf_driver_deregister(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
dev->dv_driver_suspend = NULL;
dev->dv_driver_resume = NULL;
mutex_enter(&dvl->dvl_mtx);
dev->dv_flags &= ~DVF_POWER_HANDLERS;
while (dvl->dvl_nlock > 0 || dvl->dvl_nwait > 0) {
/* Wake a thread that waits for the lock. That
* thread will fail to acquire the lock, and then
* it will wake the next thread that waits for the
* lock, or else it will wake us.
*/
cv_signal(&dvl->dvl_cv);
pmflock_debug(dev, __func__, __LINE__);
cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx);
pmflock_debug(dev, __func__, __LINE__);
}
mutex_exit(&dvl->dvl_mtx);
}
void
device_pmf_driver_child_register(device_t dev)
{
device_t parent = device_parent(dev);
if (parent == NULL || parent->dv_driver_child_register == NULL)
return;
(*parent->dv_driver_child_register)(dev);
}
void
device_pmf_driver_set_child_register(device_t dev,
void (*child_register)(device_t))
{
dev->dv_driver_child_register = child_register;
}
static void
pmflock_debug(device_t dev, const char *func, int line)
{
#ifdef PMFLOCK_DEBUG
device_lock_t dvl = device_getlock(dev);
const char *curlwp_name;
if (curlwp->l_name != NULL)
curlwp_name = curlwp->l_name;
else
curlwp_name = curlwp->l_proc->p_comm;
aprint_debug_dev(dev,
"%s.%d, %s dvl_nlock %d dvl_nwait %d dv_flags %x\n", func, line,
curlwp_name, dvl->dvl_nlock, dvl->dvl_nwait, dev->dv_flags);
#endif /* PMFLOCK_DEBUG */
}
static bool
device_pmf_lock1(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
while (device_pmf_is_registered(dev) &&
dvl->dvl_nlock > 0 && dvl->dvl_holder != curlwp) {
dvl->dvl_nwait++;
pmflock_debug(dev, __func__, __LINE__);
cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx);
pmflock_debug(dev, __func__, __LINE__);
dvl->dvl_nwait--;
}
if (!device_pmf_is_registered(dev)) {
pmflock_debug(dev, __func__, __LINE__);
/* We could not acquire the lock, but some other thread may
* wait for it, also. Wake that thread.
*/
cv_signal(&dvl->dvl_cv);
return false;
}
dvl->dvl_nlock++;
dvl->dvl_holder = curlwp;
pmflock_debug(dev, __func__, __LINE__);
return true;
}
bool
device_pmf_lock(device_t dev)
{
bool rc;
device_lock_t dvl = device_getlock(dev);
mutex_enter(&dvl->dvl_mtx);
rc = device_pmf_lock1(dev);
mutex_exit(&dvl->dvl_mtx);
return rc;
}
void
device_pmf_unlock(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
KASSERT(dvl->dvl_nlock > 0);
mutex_enter(&dvl->dvl_mtx);
if (--dvl->dvl_nlock == 0)
dvl->dvl_holder = NULL;
cv_signal(&dvl->dvl_cv);
pmflock_debug(dev, __func__, __LINE__);
mutex_exit(&dvl->dvl_mtx);
}
device_lock_t
device_getlock(device_t dev)
{
return &dev->dv_lock;
}
void *
device_pmf_bus_private(device_t dev)
{
return dev->dv_bus_private;
}
bool
device_pmf_bus_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0)
return true;
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0 ||
(dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS &&
dev->dv_bus_suspend != NULL &&
!(*dev->dv_bus_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_BUS_SUSPENDED;
return true;
}
bool
device_pmf_bus_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_BUS_SUSPENDED) == 0)
return true;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS &&
dev->dv_bus_resume != NULL &&
!(*dev->dv_bus_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_BUS_SUSPENDED;
return true;
}
bool
device_pmf_bus_shutdown(device_t dev, int how)
{
if (*dev->dv_bus_shutdown != NULL &&
!(*dev->dv_bus_shutdown)(dev, how))
return false;
return true;
}
void
device_pmf_bus_register(device_t dev, void *priv,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
bool (*shutdown)(device_t, int), void (*deregister)(device_t))
{
dev->dv_bus_private = priv;
dev->dv_bus_resume = resume;
dev->dv_bus_suspend = suspend;
dev->dv_bus_shutdown = shutdown;
dev->dv_bus_deregister = deregister;
}
void
device_pmf_bus_deregister(device_t dev)
{
if (dev->dv_bus_deregister == NULL)
return;
(*dev->dv_bus_deregister)(dev);
dev->dv_bus_private = NULL;
dev->dv_bus_suspend = NULL;
dev->dv_bus_resume = NULL;
dev->dv_bus_deregister = NULL;
}
void *
device_pmf_class_private(device_t dev)
{
return dev->dv_class_private;
}
bool
device_pmf_class_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) != 0)
return true;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS &&
dev->dv_class_suspend != NULL &&
!(*dev->dv_class_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_CLASS_SUSPENDED;
return true;
}
bool
device_pmf_class_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0)
return true;
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0 ||
(dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS &&
dev->dv_class_resume != NULL &&
!(*dev->dv_class_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_CLASS_SUSPENDED;
return true;
}
void
device_pmf_class_register(device_t dev, void *priv,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
void (*deregister)(device_t))
{
dev->dv_class_private = priv;
dev->dv_class_suspend = suspend;
dev->dv_class_resume = resume;
dev->dv_class_deregister = deregister;
}
void
device_pmf_class_deregister(device_t dev)
{
if (dev->dv_class_deregister == NULL)
return;
(*dev->dv_class_deregister)(dev);
dev->dv_class_private = NULL;
dev->dv_class_suspend = NULL;
dev->dv_class_resume = NULL;
dev->dv_class_deregister = NULL;
}
bool
device_active(device_t dev, devactive_t type)
{
size_t i;
if (dev->dv_activity_count == 0)
return false;
for (i = 0; i < dev->dv_activity_count; ++i) {
if (dev->dv_activity_handlers[i] == NULL)
break;
(*dev->dv_activity_handlers[i])(dev, type);
}
return true;
}
bool
device_active_register(device_t dev, void (*handler)(device_t, devactive_t))
{
void (**new_handlers)(device_t, devactive_t);
void (**old_handlers)(device_t, devactive_t);
size_t i, old_size, new_size;
int s;
old_handlers = dev->dv_activity_handlers;
old_size = dev->dv_activity_count;
KASSERT(old_size == 0 || old_handlers != NULL);
for (i = 0; i < old_size; ++i) {
KASSERT(old_handlers[i] != handler);
if (old_handlers[i] == NULL) {
old_handlers[i] = handler;
return true;
}
}
new_size = old_size + 4;
new_handlers = kmem_alloc(sizeof(void *) * new_size, KM_SLEEP);
for (i = 0; i < old_size; ++i)
new_handlers[i] = old_handlers[i];
new_handlers[old_size] = handler;
for (i = old_size+1; i < new_size; ++i)
new_handlers[i] = NULL;
s = splhigh();
dev->dv_activity_count = new_size;
dev->dv_activity_handlers = new_handlers;
splx(s);
if (old_size > 0)
kmem_free(old_handlers, sizeof(void *) * old_size);
return true;
}
void
device_active_deregister(device_t dev, void (*handler)(device_t, devactive_t))
{
void (**old_handlers)(device_t, devactive_t);
size_t i, old_size;
int s;
old_handlers = dev->dv_activity_handlers;
old_size = dev->dv_activity_count;
for (i = 0; i < old_size; ++i) {
if (old_handlers[i] == handler)
break;
if (old_handlers[i] == NULL)
return; /* XXX panic? */
}
if (i == old_size)
return; /* XXX panic? */
for (; i < old_size - 1; ++i) {
if ((old_handlers[i] = old_handlers[i + 1]) != NULL)
continue;
if (i == 0) {
s = splhigh();
dev->dv_activity_count = 0;
dev->dv_activity_handlers = NULL;
splx(s);
kmem_free(old_handlers, sizeof(void *) * old_size);
}
return;
}
old_handlers[i] = NULL;
}
/* Return true iff the device_t `dv' exists at generation `gen'. */
static bool
device_exists_at(device_t dv, devgen_t gen)
{
return (dv->dv_del_gen == 0 || dv->dv_del_gen > gen) &&
dv->dv_add_gen <= gen;
}
static bool
deviter_visits(const deviter_t *di, device_t dv)
{
return device_exists_at(dv, di->di_gen);
}
/*
* Device Iteration
*
* deviter_t: a device iterator. Holds state for a "walk" visiting
* each device_t in the device tree.
*
* deviter_init(di, flags): initialize the device iterator `di'
* to "walk" the device tree. deviter_next(di) will return
* the first device_t in the device tree, or NULL if there are
* no devices.
*
* `flags' is one or more of DEVITER_F_RW, indicating that the
* caller intends to modify the device tree by calling
* config_detach(9) on devices in the order that the iterator
* returns them; DEVITER_F_ROOT_FIRST, asking for the devices
* nearest the "root" of the device tree to be returned, first;
* DEVITER_F_LEAVES_FIRST, asking for the devices furthest from
* the root of the device tree, first; and DEVITER_F_SHUTDOWN,
* indicating both that deviter_init() should not respect any
* locks on the device tree, and that deviter_next(di) may run
* in more than one LWP before the walk has finished.
*
* Only one DEVITER_F_RW iterator may be in the device tree at
* once.
*
* DEVITER_F_SHUTDOWN implies DEVITER_F_RW.
*
* Results are undefined if the flags DEVITER_F_ROOT_FIRST and
* DEVITER_F_LEAVES_FIRST are used in combination.
*
* deviter_first(di, flags): initialize the device iterator `di'
* and return the first device_t in the device tree, or NULL
* if there are no devices. The statement
*
* dv = deviter_first(di);
*
* is shorthand for
*
* deviter_init(di);
* dv = deviter_next(di);
*
* deviter_next(di): return the next device_t in the device tree,
* or NULL if there are no more devices. deviter_next(di)
* is undefined if `di' was not initialized with deviter_init() or
* deviter_first().
*
* deviter_release(di): stops iteration (subsequent calls to
* deviter_next() will return NULL), releases any locks and
* resources held by the device iterator.
*
* Device iteration does not return device_t's in any particular
* order. An iterator will never return the same device_t twice.
* Device iteration is guaranteed to complete---i.e., if deviter_next(di)
* is called repeatedly on the same `di', it will eventually return
* NULL. It is ok to attach/detach devices during device iteration.
*/
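/*
 * Example (illustrative sketch only) of the pattern described above:
 * walk every device from the root down and print its name.
 */
#if 0
static void
example_print_all_devices(void)
{
	deviter_t di;
	device_t dv;

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di))
		printf("%s\n", device_xname(dv));
	deviter_release(&di);
}
#endif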
void
deviter_init(deviter_t *di, deviter_flags_t flags)
{
device_t dv;
memset(di, 0, sizeof(*di));
if ((flags & DEVITER_F_SHUTDOWN) != 0)
flags |= DEVITER_F_RW;
mutex_enter(&alldevs_lock);
if ((flags & DEVITER_F_RW) != 0)
alldevs_nwrite++;
else
alldevs_nread++;
di->di_gen = alldevs_gen++;
di->di_flags = flags;
switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) {
case DEVITER_F_LEAVES_FIRST:
TAILQ_FOREACH(dv, &alldevs, dv_list) {
if (!deviter_visits(di, dv))
continue;
di->di_curdepth = MAX(di->di_curdepth, dv->dv_depth);
}
break;
case DEVITER_F_ROOT_FIRST:
TAILQ_FOREACH(dv, &alldevs, dv_list) {
if (!deviter_visits(di, dv))
continue;
di->di_maxdepth = MAX(di->di_maxdepth, dv->dv_depth);
}
break;
default:
break;
}
deviter_reinit(di);
mutex_exit(&alldevs_lock);
}
static void
deviter_reinit(deviter_t *di)
{
KASSERT(mutex_owned(&alldevs_lock));
if ((di->di_flags & DEVITER_F_RW) != 0)
di->di_prev = TAILQ_LAST(&alldevs, devicelist);
else
di->di_prev = TAILQ_FIRST(&alldevs);
}
device_t
deviter_first(deviter_t *di, deviter_flags_t flags)
{
deviter_init(di, flags);
return deviter_next(di);
}
static device_t
deviter_next2(deviter_t *di)
{
device_t dv;
KASSERT(mutex_owned(&alldevs_lock));
dv = di->di_prev;
if (dv == NULL)
return NULL;
if ((di->di_flags & DEVITER_F_RW) != 0)
di->di_prev = TAILQ_PREV(dv, devicelist, dv_list);
else
di->di_prev = TAILQ_NEXT(dv, dv_list);
return dv;
}
static device_t
deviter_next1(deviter_t *di)
{
device_t dv;
KASSERT(mutex_owned(&alldevs_lock));
do {
dv = deviter_next2(di);
} while (dv != NULL && !deviter_visits(di, dv));
return dv;
}
device_t
deviter_next(deviter_t *di)
{
device_t dv = NULL;
mutex_enter(&alldevs_lock);
switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) {
case 0:
dv = deviter_next1(di);
break;
case DEVITER_F_LEAVES_FIRST:
while (di->di_curdepth >= 0) {
if ((dv = deviter_next1(di)) == NULL) {
di->di_curdepth--;
deviter_reinit(di);
} else if (dv->dv_depth == di->di_curdepth)
break;
}
break;
case DEVITER_F_ROOT_FIRST:
while (di->di_curdepth <= di->di_maxdepth) {
if ((dv = deviter_next1(di)) == NULL) {
di->di_curdepth++;
deviter_reinit(di);
} else if (dv->dv_depth == di->di_curdepth)
break;
}
break;
default:
break;
}
mutex_exit(&alldevs_lock);
return dv;
}
void
deviter_release(deviter_t *di)
{
bool rw = (di->di_flags & DEVITER_F_RW) != 0;
mutex_enter(&alldevs_lock);
if (rw)
--alldevs_nwrite;
else
--alldevs_nread;
/* XXX wake a garbage-collection thread */
mutex_exit(&alldevs_lock);
}
const char *
cfdata_ifattr(const struct cfdata *cf)
{
return cf->cf_pspec->cfp_iattr;
}
bool
ifattr_match(const char *snull, const char *t)
{
return (snull == NULL) || strcmp(snull, t) == 0;
}
void
null_childdetached(device_t self, device_t child)
{
/* do nothing */
}
static void
sysctl_detach_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_BOOL, "detachall",
SYSCTL_DESCR("Detach all devices at shutdown"),
NULL, 0, &detachall, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: umap_subr.c,v 1.29 2014/11/09 18:08:07 maxv Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_subr.c, v 1.11 1992/05/30 10:05:43 jsp Exp
* @(#)umap_subr.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_subr.c,v 1.29 2014/11/09 18:08:07 maxv Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/umapfs/umap.h>
u_long umap_findid(u_long, u_long [][2], int);
int umap_node_alloc(struct mount *, struct vnode *,
struct vnode **);
/*
* umap_findid is called by various routines in umap_vnodeops.c to
* find a user or group id in a map.
*/
u_long
umap_findid(u_long id, u_long map[][2], int nentries)
{
int i;
/* Find uid entry in map */
i = 0;
while ((i<nentries) && ((map[i][0]) != id))
i++;
if (i < nentries)
return (map[i][1]);
else
return (-1);
}
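/*
 * Example (illustrative sketch only): a two-entry map as consumed by
 * umap_findid().  Column 0 is the id looked up, column 1 is the id
 * returned; the numbers are made up.
 */
#if 0
static u_long example_umap[][2] = {
	{ 1000, 2000 },
	{ 1001, 2001 },
};

/* umap_findid(1000, example_umap, 2) == 2000; unmapped ids return -1. */
#endif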
/*
* umap_reverse_findid is called by umap_getattr() in umap_vnodeops.c to
* find a user or group id in a map, in reverse.
*/
u_long
umap_reverse_findid(u_long id, u_long map[][2], int nentries)
{
int i;
/* Find uid entry in map */
i = 0;
while ((i<nentries) && ((map[i][1]) != id))
i++;
if (i < nentries)
return (map[i][0]);
else
return (-1);
}
/* umap_mapids maps all of the ids in a credential, both user and group. */
void
umap_mapids(struct mount *v_mount, kauth_cred_t credp)
{
int i, unentries, gnentries;
uid_t uid;
gid_t gid;
u_long (*usermap)[2], (*groupmap)[2];
gid_t groups[NGROUPS];
uint16_t ngroups;
if (credp == NOCRED || credp == FSCRED)
return;
unentries = MOUNTTOUMAPMOUNT(v_mount)->info_nentries;
usermap = MOUNTTOUMAPMOUNT(v_mount)->info_mapdata;
gnentries = MOUNTTOUMAPMOUNT(v_mount)->info_gnentries;
groupmap = MOUNTTOUMAPMOUNT(v_mount)->info_gmapdata;
/* Find uid entry in map */
uid = (uid_t) umap_findid(kauth_cred_geteuid(credp), usermap, unentries);
if (uid != -1)
kauth_cred_seteuid(credp, uid);
else
kauth_cred_seteuid(credp, (uid_t)NOBODY);
#if 1
/* cr_gid is the same as cr_groups[0] in 4BSD, but not in NetBSD */
/* Find gid entry in map */
gid = (gid_t) umap_findid(kauth_cred_getegid(credp), groupmap, gnentries);
if (gid != -1)
kauth_cred_setegid(credp, gid);
else
kauth_cred_setegid(credp, NULLGROUP);
#endif
/* Now we must map each of the groups in the cr_groups structure. */
ngroups = kauth_cred_ngroups(credp);
for (i = 0; i < ngroups; i++) {
/* XXX elad: can't we just skip cases where gid == -1? */
groups[i] = kauth_cred_group(credp, i);
gid = (gid_t) umap_findid(groups[i],
groupmap, gnentries);
if (gid != -1)
groups[i] = gid;
else
groups[i] = NULLGROUP;
}
kauth_cred_setgroups(credp, groups, ngroups, -1, UIO_SYSSPACE);
}
/* $NetBSD: scsipi_base.h,v 1.24 2017/02/26 10:58:47 maya Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _DEV_SCSIPI_SCSIPI_BASE_H_
#define _DEV_SCSIPI_SCSIPI_BASE_H_
struct scsipi_xfer *scsipi_get_xs(struct scsipi_periph *, int);
void scsipi_put_xs(struct scsipi_xfer *);
static __inline struct scsipi_xfer *scsipi_make_xs_internal(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
static __inline struct scsipi_xfer *scsipi_make_xs_unlocked(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
static __inline struct scsipi_xfer *scsipi_make_xs_locked(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
/*
* Make a scsipi_xfer, and return a pointer to it.
*/
static __inline struct scsipi_xfer *
scsipi_make_xs_internal(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
struct scsipi_xfer *xs;
if ((xs = scsipi_get_xs(periph, flags)) == NULL)
return (NULL);
/*
* Fill out the scsipi_xfer structure. We don't know whose context
* the cmd is in, so copy it.
*/
memcpy(&xs->cmdstore, cmd, cmdlen);
xs->cmd = &xs->cmdstore;
xs->cmdlen = cmdlen;
xs->data = data_addr;
xs->datalen = datalen;
xs->xs_retries = retries;
xs->timeout = timeout;
xs->bp = bp;
return (xs);
}
static __inline struct scsipi_xfer *
scsipi_make_xs_unlocked(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
return scsipi_make_xs_internal(periph, cmd, cmdlen, data_addr,
datalen, retries, timeout, bp, flags & ~XS_CTL_NOSLEEP);
}
static __inline struct scsipi_xfer *
scsipi_make_xs_locked(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
KDASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
return scsipi_make_xs_internal(periph, cmd, cmdlen, data_addr,
datalen, retries, timeout, bp, flags | XS_CTL_NOSLEEP);
}
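/*
 * Example (illustrative sketch only, not a real consumer): building a
 * simple TEST UNIT READY xfer with scsipi_make_xs_unlocked().  The
 * retry and timeout values are arbitrary.
 */
#if 0
static struct scsipi_xfer *
example_make_tur_xs(struct scsipi_periph *periph)
{
	struct scsi_test_unit_ready cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = SCSI_TEST_UNIT_READY;
	return scsipi_make_xs_unlocked(periph,
	    (struct scsipi_generic *)&cmd, sizeof(cmd),
	    NULL, 0, /*retries*/ 2, /*timeout*/ 10000, NULL, 0);
}
#endif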
#endif /* _DEV_SCSIPI_SCSIPI_BASE_H_ */
/* $NetBSD: subr_fault.c,v 1.2 2020/06/30 16:28:17 maxv Exp $ */
/*
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_fault.c,v 1.2 2020/06/30 16:28:17 maxv Exp $");
#include <sys/module.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/types.h>
#include <sys/specificdata.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/ioccom.h>
#include <sys/lwp.h>
#include <sys/fault.h>
typedef struct {
volatile bool enabled;
volatile bool oneshot;
volatile unsigned long nth;
volatile unsigned long cnt;
volatile unsigned long nfaults;
} fault_t;
static fault_t fault_global __cacheline_aligned = {
.enabled = false,
.oneshot = false,
.nth = FAULT_NTH_MIN,
.cnt = 0,
.nfaults = 0
};
static kmutex_t fault_global_lock __cacheline_aligned;
static specificdata_key_t fault_lwp_key;
/* -------------------------------------------------------------------------- */
bool
fault_inject(void)
{
volatile unsigned long cnt;
fault_t *f;
if (__predict_false(cold))
return false;
if (__predict_false(atomic_load_acquire(&fault_global.enabled))) {
f = &fault_global;
} else {
f = lwp_getspecific(fault_lwp_key);
if (__predict_true(f == NULL))
return false;
if (__predict_false(!f->enabled))
return false;
}
if (atomic_load_relaxed(&f->oneshot)) {
	if (__predict_true(atomic_load_relaxed(&f->nfaults) > 0))
return false;
}
cnt = atomic_inc_ulong_nv(&f->cnt);
if (__predict_false(cnt % atomic_load_relaxed(&f->nth) == 0)) {
	atomic_inc_ulong(&f->nfaults);
return true;
}
return false;
}
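/*
 * Example (illustrative sketch only): how a kernel code path might
 * consult fault_inject() to simulate a failure.  example_alloc() is
 * hypothetical.
 */
#if 0
static void *
example_alloc(size_t len)
{
	if (fault_inject())
		return NULL;	/* pretend the allocation failed */
	return kmem_alloc(len, KM_NOSLEEP);
}
#endif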
/* -------------------------------------------------------------------------- */
static int
fault_open(dev_t dev, int flag, int mode, struct lwp *l)
{
return 0;
}
static int
fault_close(dev_t dev, int flag, int mode, struct lwp *l)
{
return 0;
}
static int
fault_ioc_enable(struct fault_ioc_enable *args)
{
fault_t *f;
if (args->mode != FAULT_MODE_NTH_ONESHOT)
return EINVAL;
if (args->nth < FAULT_NTH_MIN)
return EINVAL;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
mutex_enter(&fault_global_lock);
if (fault_global.enabled) {
mutex_exit(&fault_global_lock);
return EEXIST;
}
fault_global.oneshot = true;
atomic_store_relaxed(&fault_global.nth, args->nth);
fault_global.cnt = 0;
fault_global.nfaults = 0;
atomic_store_release(&fault_global.enabled, true);
mutex_exit(&fault_global_lock);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f != NULL) {
if (f->enabled)
return EEXIST;
} else {
f = kmem_zalloc(sizeof(*f), KM_SLEEP);
lwp_setspecific(fault_lwp_key, f);
}
f->oneshot = true;
atomic_store_relaxed(&f->nth, args->nth);
f->cnt = 0;
f->nfaults = 0;
atomic_store_release(&f->enabled, true);
break;
default:
return EINVAL;
}
return 0;
}
static int
fault_ioc_disable(struct fault_ioc_disable *args)
{
fault_t *f;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
mutex_enter(&fault_global_lock);
if (!fault_global.enabled) {
mutex_exit(&fault_global_lock);
return ENOENT;
}
atomic_store_release(&fault_global.enabled, false);
mutex_exit(&fault_global_lock);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f == NULL)
return ENOENT;
if (!f->enabled)
return ENOENT;
atomic_store_release(&f->enabled, false);
break;
default:
return EINVAL;
}
return 0;
}
static int
fault_ioc_getinfo(struct fault_ioc_getinfo *args)
{
fault_t *f;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
args->nfaults = atomic_load_relaxed(&fault_global.nfaults);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f == NULL)
return ENOENT;
args->nfaults = atomic_load_relaxed(&f->nfaults);
break;
default:
return EINVAL;
}
return 0;
}
static int
fault_ioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l)
{
switch (cmd) {
case FAULT_IOC_ENABLE:
return fault_ioc_enable(addr);
case FAULT_IOC_DISABLE:
return fault_ioc_disable(addr);
case FAULT_IOC_GETINFO:
return fault_ioc_getinfo(addr);
default:
return EINVAL;
}
}
const struct cdevsw fault_cdevsw = {
.d_open = fault_open,
.d_close = fault_close,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = fault_ioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
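/*
 * Example (illustrative sketch only): enabling global fault injection
 * from userland through the ioctl interface above.  The device node
 * path ("/dev/fault") and the userland headers are assumptions.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/fault.h>
#include <fcntl.h>

static int
enable_global_faults(unsigned long nth)
{
	struct fault_ioc_enable args = {
		.scope = FAULT_SCOPE_GLOBAL,
		.mode = FAULT_MODE_NTH_ONESHOT,
		.nth = nth,
	};
	int fd = open("/dev/fault", O_RDWR);

	if (fd == -1)
		return -1;
	return ioctl(fd, FAULT_IOC_ENABLE, &args);
}
#endif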
/* -------------------------------------------------------------------------- */
MODULE(MODULE_CLASS_MISC, fault, NULL);
static void
fault_lwp_free(void *arg)
{
fault_t *f = (fault_t *)arg;
if (f == NULL) {
return;
}
kmem_free(f, sizeof(*f));
}
static void
fault_init(void)
{
mutex_init(&fault_global_lock, MUTEX_DEFAULT, IPL_NONE);
lwp_specific_key_create(&fault_lwp_key, fault_lwp_free);
}
static int
fault_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
fault_init();
return 0;
case MODULE_CMD_FINI:
return EINVAL;
default:
return ENOTTY;
}
}
/* $NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
* 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Coyote Point Systems, Inc.
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
*/
/*
* TODO list for SYN cache stuff:
*
* Find room for a "state" field, which is needed to keep a
* compressed state for TIME_WAIT TCBs. It's been noted already
* that this is fairly important for very high-volume web and
* mail servers, which use a large number of short-lived
* connections.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lwp.h> /* for lwp0 */
#include <sys/cprng.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_syncache.h>
#ifdef TCP_SIGNATURE
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif /* IPSEC*/
#endif
static void syn_cache_timer(void *);
static struct syn_cache *
syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
struct syn_cache_head **);
static int syn_cache_respond(struct syn_cache *);
/* syn hash parameters */
#define TCP_SYN_HASH_SIZE 293
#define TCP_SYN_BUCKET_SIZE 35
static int tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
static struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
/*
* TCP compressed state engine. Currently used to hold compressed
* state for SYN_RECEIVED.
*/
u_long syn_cache_count;
static u_int32_t syn_hash1, syn_hash2;
#define SYN_HASH(sa, sp, dp) \
((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define SYN_HASHALL(hash, src, dst) \
do { \
hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
((const struct sockaddr_in *)(src))->sin_port, \
((const struct sockaddr_in *)(dst))->sin_port); \
} while (/*CONSTCOND*/ 0)
#else
#define SYN_HASH6(sa, sp, dp) \
((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
& 0x7fffffff)
#define SYN_HASHALL(hash, src, dst) \
do { \
switch ((src)->sa_family) { \
case AF_INET: \
hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
((const struct sockaddr_in *)(src))->sin_port, \
((const struct sockaddr_in *)(dst))->sin_port); \
break; \
case AF_INET6: \
hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
((const struct sockaddr_in6 *)(src))->sin6_port, \
((const struct sockaddr_in6 *)(dst))->sin6_port); \
break; \
default: \
hash = 0; \
} \
} while (/*CONSTCOND*/0)
#endif /* INET6 */
static struct pool syn_cache_pool;
/*
* We don't estimate RTT with SYNs, so each packet starts with the default
* RTT and each timer step has a fixed timeout value.
*/
static inline void
syn_cache_timer_arm(struct syn_cache *sc)
{
TCPT_RANGESET(sc->sc_rxtcur,
TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
TCPTV_REXMTMAX);
callout_reset(&sc->sc_timer,
sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
}
#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
static inline void
syn_cache_rm(struct syn_cache *sc)
{
TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
sc, sc_bucketq);
sc->sc_tp = NULL;
LIST_REMOVE(sc, sc_tpq);
tcp_syn_cache[sc->sc_bucketidx].sch_length--;
callout_stop(&sc->sc_timer);
syn_cache_count--;
}
static inline void
syn_cache_put(struct syn_cache *sc)
{
if (sc->sc_ipopts)
	(void) m_free(sc->sc_ipopts);
rtcache_free(&sc->sc_route);
sc->sc_flags |= SCF_DEAD;
if (!callout_invoking(&sc->sc_timer))
	callout_schedule(&(sc)->sc_timer, 1);
}
void
syn_cache_init(void)
{
int i;
pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
"synpl", NULL, IPL_SOFTNET);
/* Initialize the hash buckets. */
for (i = 0; i < tcp_syn_cache_size; i++)
TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
}
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
struct syn_cache_head *scp;
struct syn_cache *sc2;
int s;
/*
* If there are no entries in the hash table, reinitialize
* the hash secrets.
*/
if (syn_cache_count == 0) {
syn_hash1 = cprng_fast32();
syn_hash2 = cprng_fast32();
}
SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
scp = &tcp_syn_cache[sc->sc_bucketidx];
/*
* Make sure that we don't overflow the per-bucket
* limit or the total cache size limit.
*/
s = splsoftnet();
if (scp->sch_length >= tcp_syn_bucket_limit) {
TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
/*
* The bucket is full. Toss the oldest element in the
* bucket. This will be the first entry in the bucket.
*/
sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
/*
* This should never happen; we should always find an
* entry in our bucket.
*/
if (sc2 == NULL)
panic("syn_cache_insert: bucketoverflow: impossible");
#endif
syn_cache_rm(sc2);
syn_cache_put(sc2); /* calls pool_put but see spl above */
} else if (syn_cache_count >= tcp_syn_cache_limit) {
struct syn_cache_head *scp2, *sce;
TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
/*
* The cache is full. Toss the oldest entry in the
* first non-empty bucket we can find.
*
* XXX We would really like to toss the oldest
* entry in the cache, but we hope that this
* condition doesn't happen very often.
*/
scp2 = scp;
if (TAILQ_EMPTY(&scp2->sch_bucket)) {
sce = &tcp_syn_cache[tcp_syn_cache_size];
for (++scp2; scp2 != scp; scp2++) {
if (scp2 >= sce)
scp2 = &tcp_syn_cache[0];
if (! TAILQ_EMPTY(&scp2->sch_bucket))
break;
}
#ifdef DIAGNOSTIC
/*
* This should never happen; we should always find a
* non-empty bucket.
*/
if (scp2 == scp)
panic("syn_cache_insert: cacheoverflow: "
"impossible");
#endif
}
sc2 = TAILQ_FIRST(&scp2->sch_bucket);
syn_cache_rm(sc2);
syn_cache_put(sc2); /* calls pool_put but see spl above */
}
/*
* Initialize the entry's timer.
*/
sc->sc_rxttot = 0;
sc->sc_rxtshift = 0;
syn_cache_timer_arm(sc);
/* Link it from tcpcb entry */
LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
/* Put it into the bucket. */
TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
scp->sch_length++;
syn_cache_count++;
TCP_STATINC(TCP_STAT_SC_ADDED);
splx(s);
}
/*
* Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
* If we have retransmitted an entry the maximum number of times, expire
* that entry.
*/
static void
syn_cache_timer(void *arg)
{
struct syn_cache *sc = arg;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
callout_ack(&sc->sc_timer);
if (__predict_false(sc->sc_flags & SCF_DEAD)) {
TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
goto free;
}
if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
/* Drop it -- too many retransmissions. */
goto dropit;
}
/*
* Compute the total amount of time this entry has
* been on a queue. If this entry has been on longer
* than the keep alive timer would allow, expire it.
*/
sc->sc_rxttot += sc->sc_rxtcur;
if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
goto dropit;
TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
(void)syn_cache_respond(sc);
/* Advance the timer back-off. */
sc->sc_rxtshift++;
syn_cache_timer_arm(sc);
goto out;
dropit:
TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
syn_cache_rm(sc);
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
rtcache_free(&sc->sc_route);
free:
callout_destroy(&sc->sc_timer);
pool_put(&syn_cache_pool, sc);
out:
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
 * Remove the syn cache entries created by the specified tcb entry,
 * since it makes no sense to keep them around
 * (if there's no tcb entry, the syn cache entries will never be used).
 */
void
syn_cache_cleanup(struct tcpcb *tp)
{
struct syn_cache *sc, *nsc;
int s;
s = splsoftnet();
for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
nsc = LIST_NEXT(sc, sc_tpq);
#ifdef DIAGNOSTIC
if (sc->sc_tp != tp)
panic("invalid sc_tp in syn_cache_cleanup");
#endif
syn_cache_rm(sc);
syn_cache_put(sc); /* calls pool_put but see spl above */
}
/* just for safety */
LIST_INIT(&tp->t_sc);
splx(s);
}
/*
* Find an entry in the syn cache.
*/
static struct syn_cache *
syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
struct syn_cache_head **headp)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
u_int32_t hash;
int s;
SYN_HASHALL(hash, src, dst);
scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
*headp = scp;
s = splsoftnet();
for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
sc = TAILQ_NEXT(sc, sc_bucketq)) {
if (sc->sc_hash != hash)
continue;
if (!memcmp(&sc->sc_src, src, src->sa_len) &&
!memcmp(&sc->sc_dst, dst, dst->sa_len)) {
splx(s);
return (sc);
}
}
splx(s);
return (NULL);
}
/*
* This function gets called when we receive an ACK for a socket in the
* LISTEN state. We look up the connection in the syn cache, and if it's
* there, we pull it out of the cache and turn it into a full-blown
* connection in the SYN-RECEIVED state.
*
* The return values may not be immediately obvious, and their effects
* can be subtle, so here they are:
*
* NULL SYN was not found in cache; caller should drop the
* packet and send an RST.
*
* -1 We were unable to create the new connection, and are
* aborting it. An ACK,RST is being sent to the peer
* (unless we got screwy sequence numbers; see below),
* because the 3-way handshake has been completed. Caller
* should not free the mbuf, since we may be using it. If
* we are not, we will free it.
*
* Otherwise, the return value is a pointer to the new socket
* associated with the connection.
*/
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
struct tcphdr *th, struct socket *so, struct mbuf *m)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
struct inpcb *inp = NULL;
struct tcpcb *tp;
int s;
struct socket *oso;
s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return NULL;
}
/*
* Verify the sequence and ack numbers. Try getting the correct
* response again.
*/
if ((th->th_ack != sc->sc_iss + 1) ||
SEQ_LEQ(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
m_freem(m);
(void)syn_cache_respond(sc);
splx(s);
return ((struct socket *)(-1));
}
/* Remove this cache entry */
syn_cache_rm(sc);
splx(s);
/*
* Ok, create the full blown connection, and set things up
* as they would have been set up if we had created the
* connection when the SYN arrived. If we can't create
* the connection, abort it.
*/
/*
* inp still has the OLD in_pcb stuff, set the
* v6-related flags on the new guy, too. This is
* done particularly for the case where an AF_INET6
* socket is bound only to a port, and a v4 connection
* comes in on that port.
* we also copy the flowinfo from the original pcb
* to the new one.
*/
oso = so;
so = sonewconn(so, true);
if (so == NULL)
goto resetandabort;
inp = sotoinpcb(so);
switch (src->sa_family) {
case AF_INET:
if (inp->inp_af == AF_INET) {
in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr;
inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
inp->inp_options = ip_srcroute(m);
inpcb_set_state(inp, INP_BOUND);
if (inp->inp_options == NULL) {
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
}
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
/* IPv4 packet to AF_INET6 socket */
memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
bcopy(&((struct sockaddr_in *)dst)->sin_addr,
&in6p_laddr(inp).s6_addr32[3],
sizeof(((struct sockaddr_in *)dst)->sin_addr));
inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
intotcpcb(inp)->t_family = AF_INET;
if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
else
inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
inpcb_set_state(inp, INP_BOUND);
}
#endif
break;
#ifdef INET6
case AF_INET6:
if (inp->inp_af == AF_INET6) {
in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr;
inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
inpcb_set_state(inp, INP_BOUND);
}
break;
#endif
}
#ifdef INET6
if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) {
struct inpcb *oinp = sotoinpcb(oso);
/* inherit socket options from the listening socket */
inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS);
if (inp->inp_flags & IN6P_CONTROLOPTS) {
m_freem(inp->inp_options);
inp->inp_options = NULL;
}
ip6_savecontrol(inp, &inp->inp_options,
mtod(m, struct ip6_hdr *), m);
}
#endif
/*
* Give the new socket our cached route reference.
*/
rtcache_copy(&inp->inp_route, &sc->sc_route);
rtcache_free(&sc->sc_route);
if (inp->inp_af == AF_INET) {
struct sockaddr_in sin;
memcpy(&sin, src, src->sa_len);
if (inpcb_connect(inp, &sin, &lwp0)) {
goto resetandabort;
}
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
struct sockaddr_in6 sin6;
memcpy(&sin6, src, src->sa_len);
if (src->sa_family == AF_INET) {
/* IPv4 packet to AF_INET6 socket */
in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
}
if (in6pcb_connect(inp, &sin6, NULL)) {
goto resetandabort;
}
}
#endif
else {
goto resetandabort;
}
tp = intotcpcb(inp);
tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
if (sc->sc_request_r_scale != 15) {
tp->requested_s_scale = sc->sc_requested_s_scale;
tp->request_r_scale = sc->sc_request_r_scale;
tp->snd_scale = sc->sc_requested_s_scale;
tp->rcv_scale = sc->sc_request_r_scale;
tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
}
if (sc->sc_flags & SCF_TIMESTAMP)
tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
tp->ts_timebase = sc->sc_timebase;
tp->t_template = tcp_template(tp);
if (tp->t_template == 0) {
tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
so = NULL;
m_freem(m);
goto abort;
}
tp->iss = sc->sc_iss;
tp->irs = sc->sc_irs;
tcp_sendseqinit(tp);
tcp_rcvseqinit(tp);
tp->t_state = TCPS_SYN_RECEIVED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
TCP_STATINC(TCP_STAT_ACCEPTS);
if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
tp->t_flags |= TF_WILL_SACK;
if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
tp->t_flags |= TF_ECN_PERMIT;
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
#endif
/* Initialize tp->t_ourmss before we deal with the peer's! */
tp->t_ourmss = sc->sc_ourmaxseg;
tcp_mss_from_peer(tp, sc->sc_peermaxseg);
/*
* Initialize the initial congestion window. If we
* had to retransmit the SYN,ACK, we must initialize cwnd
* to 1 segment (i.e. the Loss Window).
*/
if (sc->sc_rxtshift)
tp->snd_cwnd = tp->t_peermss;
else {
int ss = tcp_init_win;
if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
ss = tcp_init_win_local;
#ifdef INET6
else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp)))
ss = tcp_init_win_local;
#endif
tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
}
tcp_rmx_rtt(tp);
tp->snd_wl1 = sc->sc_irs;
tp->rcv_up = sc->sc_irs + 1;
/*
* This is what would have happened in tcp_output() when
* the SYN,ACK was sent.
*/
tp->snd_up = tp->snd_una;
tp->snd_max = tp->snd_nxt = tp->iss+1;
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
TCP_STATINC(TCP_STAT_SC_COMPLETED);
s = splsoftnet();
syn_cache_put(sc);
splx(s);
return so;
resetandabort:
(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
if (so != NULL) {
(void) soqremque(so, 1);
(void) soabort(so);
mutex_enter(softnet_lock);
}
s = splsoftnet();
syn_cache_put(sc);
splx(s);
TCP_STATINC(TCP_STAT_SC_ABORTED);
return ((struct socket *)(-1));
}
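/*
 * Example (illustrative sketch only): how a caller such as the
 * LISTEN-state ACK path in tcp_input() is expected to interpret the
 * return values documented above.  The helper name is hypothetical and
 * the real caller does considerably more work.
 */
#if 0
static struct socket *
example_complete_handshake(struct sockaddr *src, struct sockaddr *dst,
    struct tcphdr *th, struct socket *lso, struct mbuf *m)
{
	struct socket *newso;

	newso = syn_cache_get(src, dst, th, lso, m);
	if (newso == NULL) {
		/* Not in the cache: drop the packet and send an RST. */
		return NULL;
	}
	if (newso == (struct socket *)(-1)) {
		/*
		 * Found, but the new connection could not be created;
		 * syn_cache_get() may still own the mbuf, so it must
		 * not be freed here.
		 */
		return NULL;
	}
	return newso;	/* the new connection's socket */
}
#endif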
/*
* This function is called when we get a RST for a
* non-existent connection, so that we can see if the
* connection is in the syn cache. If it is, zap it.
*/
void
syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
int s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return;
}
if (SEQ_LT(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs+1)) {
splx(s);
return;
}
syn_cache_rm(sc);
TCP_STATINC(TCP_STAT_SC_RESET);
syn_cache_put(sc); /* calls pool_put but see spl above */
splx(s);
}
void
syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
struct tcphdr *th)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
int s;
s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return;
}
/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
if (ntohl(th->th_seq) != sc->sc_iss) {
splx(s);
return;
}
/*
* If we've retransmitted 3 times and this is our second error,
* we remove the entry. Otherwise, we allow it to continue on.
* This prevents us from incorrectly nuking an entry during a
* spurious network outage.
*
* See tcp_notify().
*/
if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
sc->sc_flags |= SCF_UNREACH;
splx(s);
return;
}
syn_cache_rm(sc);
TCP_STATINC(TCP_STAT_SC_UNREACH);
syn_cache_put(sc); /* calls pool_put but see spl above */
splx(s);
}
/*
* Given a LISTEN socket and an inbound SYN request, add this to the syn
* cache, and send back a segment:
* <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
* to the source.
*
* IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
* Doing so would require that we hold onto the data and deliver it
* to the application. However, if we are the target of a SYN-flood
* DoS attack, an attacker could send data which would eventually
* consume all available buffer space if it were ACKed. By not ACKing
* the data, we avoid this DoS scenario.
*/
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
int optlen, struct tcp_opt_info *oi)
{
struct tcpcb tb, *tp;
long win;
struct syn_cache *sc;
struct syn_cache_head *scp;
struct mbuf *ipopts;
int s;
tp = sototcpcb(so);
/*
* Initialize some local state.
*/
win = sbspace(&so->so_rcv);
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
#ifdef TCP_SIGNATURE
if (optp || (tp->t_flags & TF_SIGNATURE))
#else
if (optp)
#endif
{
tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
#endif
tb.t_state = TCPS_LISTEN;
if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
return 0;
} else
tb.t_flags = 0;
switch (src->sa_family) {
case AF_INET:
/* Remember the IP options, if any. */
ipopts = ip_srcroute(m);
break;
default:
ipopts = NULL;
}
/*
* See if we already have an entry for this connection.
* If we do, resend the SYN,ACK. We do not count this
* as a retransmission (XXX though maybe we should).
*/
if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
TCP_STATINC(TCP_STAT_SC_DUPESYN);
if (ipopts) {
/*
* If we were remembering a previous source route,
* forget it and use the new one we've been given.
*/
if (sc->sc_ipopts)
(void)m_free(sc->sc_ipopts);
sc->sc_ipopts = ipopts;
}
sc->sc_timestamp = tb.ts_recent;
m_freem(m);
if (syn_cache_respond(sc) == 0) {
uint64_t *tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDACKS]++;
tcps[TCP_STAT_SNDTOTAL]++;
TCP_STAT_PUTREF();
}
return 1;
}
s = splsoftnet();
sc = pool_get(&syn_cache_pool, PR_NOWAIT);
splx(s);
if (sc == NULL) {
if (ipopts)
(void)m_free(ipopts);
return 0;
}
/*
* Fill in the cache, and put the necessary IP and TCP
* options into the reply.
*/
memset(sc, 0, sizeof(struct syn_cache));
callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
memcpy(&sc->sc_src, src, src->sa_len);
memcpy(&sc->sc_dst, dst, dst->sa_len);
sc->sc_flags = 0;
sc->sc_ipopts = ipopts;
sc->sc_irs = th->th_seq;
switch (src->sa_family) {
case AF_INET:
{
struct sockaddr_in *srcin = (void *)src;
struct sockaddr_in *dstin = (void *)dst;
sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
&srcin->sin_addr, dstin->sin_port,
srcin->sin_port, sizeof(dstin->sin_addr));
break;
}
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *srcin6 = (void *)src;
struct sockaddr_in6 *dstin6 = (void *)dst;
sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
&srcin6->sin6_addr, dstin6->sin6_port,
srcin6->sin6_port, sizeof(dstin6->sin6_addr));
break;
}
#endif
}
sc->sc_peermaxseg = oi->maxseg;
sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
sc->sc_win = win;
sc->sc_timebase = tcp_now - 1; /* see tcp_newtcpcb() */
sc->sc_timestamp = tb.ts_recent;
if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP))
sc->sc_flags |= SCF_TIMESTAMP;
if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
sc->sc_requested_s_scale = tb.requested_s_scale;
sc->sc_request_r_scale = 0;
/*
* Pick the smallest possible scaling factor that
* will still allow us to scale up to sb_max.
*
* We do this because there are broken firewalls that
* will corrupt the window scale option, leading to
* the other endpoint believing that our advertised
* window is unscaled. At scale factors larger than
* 5 the unscaled window will drop below 1500 bytes,
* leading to serious problems when traversing these
* broken firewalls.
*
* With the default sbmax of 256K, a scale factor
* of 3 will be chosen by this algorithm. Those who
* choose a larger sbmax should watch out
* for the compatibility problems mentioned above.
*
* RFC1323: The Window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled.
*/
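/*
* Worked example (arithmetic only): with the default sb_max of
* 256 KiB (262144 bytes), 65535 << 2 = 262140 is still below
* sb_max but 65535 << 3 = 524280 is not, so the loop below
* settles on a request_r_scale of 3, matching the note above.
*/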
while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
(TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
sc->sc_request_r_scale++;
} else {
sc->sc_requested_s_scale = 15;
sc->sc_request_r_scale = 15;
}
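/*
* The value 15 assigned just above is outside the valid range
* (TCP_MAX_WINSHIFT is 14) and serves as a "window scaling not
* negotiated" sentinel; syn_cache_respond() omits the window
* scale option when sc_request_r_scale == 15.
*/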
if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
sc->sc_flags |= SCF_SACK_PERMIT;
/*
* ECN setup packet received.
*/
if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
sc->sc_flags |= SCF_ECN_PERMIT;
#ifdef TCP_SIGNATURE
if (tb.t_flags & TF_SIGNATURE)
sc->sc_flags |= SCF_SIGNATURE;
#endif
sc->sc_tp = tp;
m_freem(m);
if (syn_cache_respond(sc) == 0) {
uint64_t *tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDACKS]++;
tcps[TCP_STAT_SNDTOTAL]++;
TCP_STAT_PUTREF();
syn_cache_insert(sc, tp);
} else {
s = splsoftnet();
/*
* syn_cache_put() will try to schedule the timer, so
* we need to initialize it
*/
syn_cache_timer_arm(sc);
syn_cache_put(sc);
splx(s);
TCP_STATINC(TCP_STAT_SC_DROPPED);
}
return 1;
}
/*
* syn_cache_respond: (re)send SYN+ACK.
*
* Returns 0 on success.
*/
static int
syn_cache_respond(struct syn_cache *sc)
{
#ifdef INET6
struct rtentry *rt = NULL;
#endif
struct route *ro;
u_int8_t *optp;
int optlen, error;
u_int16_t tlen;
struct ip *ip = NULL;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
struct tcpcb *tp;
struct tcphdr *th;
struct mbuf *m;
u_int hlen;
#ifdef TCP_SIGNATURE
struct secasvar *sav = NULL;
u_int8_t *sigp = NULL;
#endif
ro = &sc->sc_route;
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
hlen = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
return EAFNOSUPPORT;
}
/* Worst case scenario, since we don't know the option size yet. */
tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
KASSERT(max_linkhdr + tlen <= MCLBYTES);
/*
* Create the IP+TCP header from scratch.
*/
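/*
* Grab an mbuf header; if the worst-case header size does not fit
* in MHLEN, attach a cluster. Failure in either step ends up
* returning ENOBUFS below.
*/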
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m && (max_linkhdr + tlen) > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL)
return ENOBUFS;
MCLAIM(m, &tcp_tx_mowner);
tp = sc->sc_tp;
/* Fixup the mbuf. */
m->m_data += max_linkhdr;
m_reset_rcvif(m);
memset(mtod(m, void *), 0, tlen);
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip = mtod(m, struct ip *);
ip->ip_v = 4;
ip->ip_dst = sc->sc_src.sin.sin_addr;
ip->ip_src = sc->sc_dst.sin.sin_addr;
ip->ip_p = IPPROTO_TCP;
th = (struct tcphdr *)(ip + 1);
th->th_dport = sc->sc_src.sin.sin_port;
th->th_sport = sc->sc_dst.sin.sin_port;
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_vfc = IPV6_VERSION;
ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
ip6->ip6_nxt = IPPROTO_TCP;
/* ip6_plen will be updated in ip6_output() */
th = (struct tcphdr *)(ip6 + 1);
th->th_dport = sc->sc_src.sin6.sin6_port;
th->th_sport = sc->sc_dst.sin6.sin6_port;
break;
#endif
default:
panic("%s: impossible (1)", __func__);
}
th->th_seq = htonl(sc->sc_iss);
th->th_ack = htonl(sc->sc_irs + 1);
th->th_flags = TH_SYN|TH_ACK;
th->th_win = htons(sc->sc_win);
/* th_x2, th_sum, th_urp already 0 from memset */
/* Tack on the TCP options. */
optp = (u_int8_t *)(th + 1);
optlen = 0;
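/*
* MSS option: kind (TCPOPT_MAXSEG = 2), length (TCPOLEN_MAXSEG = 4),
* then our advertised MSS as a 16-bit value in network byte order.
*/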
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
*optp++ = sc->sc_ourmaxseg & 0xff;
optlen += TCPOLEN_MAXSEG;
if (sc->sc_request_r_scale != 15) {
*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
sc->sc_request_r_scale);
optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
}
if (sc->sc_flags & SCF_SACK_PERMIT) {
/* Let the peer know that we will SACK. */
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
optlen += TCPOLEN_SACK_PERMITTED;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
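/*
* Pad with NOPs until optlen % 4 == 2, so that the two 32-bit
* timestamp values following the 2-byte kind/length header end
* up 4-byte aligned.
*/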
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
u_int32_t *lp = (u_int32_t *)(optp);
/* Form timestamp option as shown in appendix A of RFC 1323. */
*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
*lp = htonl(sc->sc_timestamp);
optp += TCPOLEN_TIMESTAMP - 2;
optlen += TCPOLEN_TIMESTAMP;
}
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE) {
sav = tcp_signature_getsav(m);
if (sav == NULL) {
m_freem(m);
return EPERM;
}
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
sigp = optp;
memset(optp, 0, TCP_SIGLEN);
optp += TCP_SIGLEN;
optlen += TCPOLEN_SIGNATURE;
}
#endif
/*
* Terminate and pad TCP options to a 4 byte boundary.
*
* According to RFC793: "The content of the header beyond the
* End-of-Option option must be header padding (i.e., zero)."
* And later: "The padding is composed of zeros."
*/
if (optlen % 4) {
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
while (optlen % 4) {
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
/* Compute the actual values now that we've added the options. */
tlen = hlen + sizeof(struct tcphdr) + optlen;
m->m_len = m->m_pkthdr.len = tlen;
th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
#ifdef TCP_SIGNATURE
if (sav) {
(void)tcp_signature(m, th, hlen, sav, sigp);
key_sa_recordxfer(sav, m);
KEY_SA_UNREF(&sav);
}
#endif
/*
* Send ECN SYN-ACK setup packet.
* Routes can be asymmetric, so, even if we receive a packet
* with ECE and CWR set, we must not assume no one will block
* the ECE packet we are about to send.
*/
if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
th->th_flags |= TH_ECE;
TCP_STATINC(TCP_STAT_ECN_SHS);
/*
* draft-ietf-tcpm-ecnsyn-00.txt
*
* "[...] a TCP node MAY respond to an ECN-setup
* SYN packet by setting ECT in the responding
* ECN-setup SYN/ACK packet, indicating to routers
* that the SYN/ACK packet is ECN-Capable.
* This allows a congested router along the path
* to mark the packet instead of dropping the
* packet as an indication of congestion."
*
* "[...] There can be a great benefit in setting
* an ECN-capable codepoint in SYN/ACK packets [...]
* Congestion is most likely to occur in
* the server-to-client direction. As a result,
* setting an ECN-capable codepoint in SYN/ACK
* packets can reduce the occurrence of three-second
* retransmit timeouts resulting from the drop
* of SYN/ACK packets."
*
* Page 4 and 6, January 2006.
*/
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip->ip_tos |= IPTOS_ECN_ECT0;
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
break;
#endif
}
TCP_STATINC(TCP_STAT_ECN_ECT);
}
/*
* Compute the packet's checksum.
*
* Fill in some straggling IP bits. Note the stack expects
* ip_len to be in host order, for convenience.
*/
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip->ip_len = htons(tlen - hlen);
th->th_sum = 0;
th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
ip->ip_len = htons(tlen);
ip->ip_ttl = ip_defttl;
/* XXX tos? */
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_plen = htons(tlen - hlen);
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_plen = htons(tlen - hlen);
/* ip6_hlim will be initialized afterwards */
/* XXX flowlabel? */
break;
#endif
}
/* XXX use IPsec policy on listening socket, on SYN ACK */
tp = sc->sc_tp;
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
error = ip_output(m, sc->sc_ipopts, ro,
(ip_mtudisc ? IP_MTUDISC : 0),
NULL, tp ? tp->t_inpcb : NULL);
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_hlim = in6pcb_selecthlim(NULL,
(rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
rtcache_unref(rt, ro);
error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
tp ? tp->t_inpcb : NULL, NULL);
break;
#endif
default:
panic("%s: impossible (2)", __func__);
}
return error;
}
/* $NetBSD: vm_machdep.c,v 1.46 2023/10/06 11:53:27 skrll Exp $ */
/*-
* Copyright (c) 1982, 1986 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
*/
/*-
* Copyright (c) 1995 Charles M. Hannum. All rights reserved.
* Copyright (c) 1989, 1990 William Jolitz
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
*/
/*
* Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.46 2023/10/06 11:53:27 skrll Exp $");
#include "opt_mtrr.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/core.h>
#include <sys/exec.h>
#include <sys/ptrace.h>
#include <uvm/uvm.h>
#include <machine/cpu.h>
#include <machine/gdt.h>
#include <machine/reg.h>
#include <machine/specialreg.h>
#ifdef MTRR
#include <machine/mtrr.h>
#endif
#include <x86/fpu.h>
#include <x86/dbregs.h>
extern struct pool x86_dbregspl;
void
cpu_proc_fork(struct proc *p1, struct proc *p2)
{
p2->p_md.md_flags = p1->p_md.md_flags;
}
/*
* cpu_lwp_fork: finish a new LWP (l2) operation.
*
* First LWP (l1) is the process being forked. If it is &lwp0, then we
* are creating a kthread, where return path and argument are specified
* with `func' and `arg'.
*
* If an alternate user-level stack is requested (with non-zero values
* in both the stack and stacksize arguments), then set up the user stack
* pointer accordingly.
*/
void
cpu_lwp_fork(struct lwp *l1, struct lwp *l2, void *stack, size_t stacksize,
void (*func)(void *), void *arg)
{
struct pcb *pcb1, *pcb2;
struct trapframe *tf;
struct switchframe *sf;
vaddr_t uv;
KASSERT(l1 == curlwp || l1 == &lwp0);
pcb1 = lwp_getpcb(l1);
pcb2 = lwp_getpcb(l2);
/* Copy the PCB from parent, except the FPU state. */
memcpy(pcb2, pcb1, offsetof(struct pcb, pcb_savefpu));
/* Fork the FPU state. */
fpu_lwp_fork(l1, l2);
/* Never inherit CPU Debug Registers */
pcb2->pcb_dbregs = NULL;
pcb2->pcb_flags &= ~PCB_DBREGS;
#if defined(XENPV)
pcb2->pcb_iopl = IOPL_KPL;
#endif
/*
* Set the kernel stack address (from the address to uarea) and
* trapframe address for child.
*
* Rig kernel stack so that it would start out in lwp_trampoline()
* and call child_return() with l2 as an argument. This causes the
* newly-created child process to go directly to user level with a
* parent return value of 0 from fork(), while the parent process
* returns normally.
*/
uv = uvm_lwp_getuarea(l2);
KASSERT(uv % PAGE_SIZE == 0);
#ifdef __x86_64__
#ifdef SVS
pcb2->pcb_rsp0 = (uv + USPACE - PAGE_SIZE +
sizeof(struct trapframe));
KASSERT((pcb2->pcb_rsp0 & 0xF) == 0);
#else
pcb2->pcb_rsp0 = (uv + USPACE - 16);
#endif
tf = (struct trapframe *)pcb2->pcb_rsp0 - 1;
#else
pcb2->pcb_esp0 = (uv + USPACE - 16);
tf = (struct trapframe *)pcb2->pcb_esp0 - 1;
pcb2->pcb_iomap = NULL;
#endif
l2->l_md.md_regs = tf;
/*
* Copy the trapframe from parent, so that return to userspace
* will be to right address, with correct registers.
*/
memcpy(tf, l1->l_md.md_regs, sizeof(struct trapframe));
/* Child LWP might get aston() before returning to userspace. */
tf->tf_trapno = T_ASTFLT;
/* If specified, set a different user stack for a child. */
if (stack != NULL) {
#ifdef __x86_64__
tf->tf_rsp = (uint64_t)stack + stacksize;
#else
tf->tf_esp = (uint32_t)stack + stacksize;
#endif
}
l2->l_md.md_flags = l1->l_md.md_flags;
KASSERT(l2->l_md.md_astpending == 0);
sf = (struct switchframe *)tf - 1;
#ifdef __x86_64__
sf->sf_r12 = (uint64_t)func;
sf->sf_r13 = (uint64_t)arg;
sf->sf_rip = (uint64_t)lwp_trampoline;
pcb2->pcb_rsp = (uint64_t)sf;
pcb2->pcb_rbp = (uint64_t)l2;
#else
/*
* XXX Is there a reason sf->sf_edi isn't initialized here?
* Could this leak potentially sensitive information to new
* userspace processes?
*/
sf->sf_esi = (int)func;
sf->sf_ebx = (int)arg;
sf->sf_eip = (int)lwp_trampoline;
pcb2->pcb_esp = (int)sf;
pcb2->pcb_ebp = (int)l2;
#endif
}
/*
* cpu_lwp_free is called from exit() to let machine-dependent
* code free machine-dependent resources. Note that this routine
* must not block. NB: this may be called with l != curlwp in
* error paths.
*/
void
cpu_lwp_free(struct lwp *l, int proc)
{
if (l != curlwp)
return;
/* Abandon the FPU state. */
fpu_lwp_abandon(l);
/* Abandon the dbregs state. */
x86_dbregs_abandon(l);
#ifdef MTRR
if (proc && (l->l_proc->p_md.md_flags & MDP_USEDMTRR))
mtrr_clean(l->l_proc);
#endif
}
/*
* cpu_lwp_free2 is called when an LWP is being reaped.
* This routine may block.
*/
void
cpu_lwp_free2(struct lwp *l)
{
struct pcb *pcb;
pcb = lwp_getpcb(l);
KASSERT((pcb->pcb_flags & PCB_DBREGS) == 0);
if (pcb->pcb_dbregs) {
pool_put(&x86_dbregspl, pcb->pcb_dbregs);
pcb->pcb_dbregs = NULL;
}
}
/*
* Convert kernel VA to physical address
*/
paddr_t
kvtop(void *addr)
{
paddr_t pa;
bool ret __diagused;
ret = pmap_extract(pmap_kernel(), (vaddr_t)addr, &pa);
KASSERT(ret == true);
return pa;
}
/*
* Map a user I/O request into kernel virtual address space.
* Note: the pages are already locked by uvm_vslock(), so we
* do not need to pass an access_type to pmap_enter().
*/
int
vmapbuf(struct buf *bp, vsize_t len)
{
vaddr_t faddr, taddr, off;
paddr_t fpa;
KASSERT((bp->b_flags & B_PHYS) != 0);
bp->b_saveaddr = bp->b_data;
faddr = trunc_page((vaddr_t)bp->b_data);
off = (vaddr_t)bp->b_data - faddr;
len = round_page(off + len);
taddr = uvm_km_alloc(phys_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA);
bp->b_data = (void *)(taddr + off);
/*
* The region is locked, so we expect that pmap_extract() will return
* true.
* XXX: unwise to expect this in a multithreaded environment.
* Anything can happen to a pmap between the time we lock a
* region, release the pmap lock, and then relock it for
* the pmap_extract().
*
* No need to flush the TLB since we expect nothing to be mapped
* where we just allocated (TLB will be flushed when our
* mapping is removed).
*/
while (len) {
(void) pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map),
faddr, &fpa);
pmap_kenter_pa(taddr, fpa, VM_PROT_READ|VM_PROT_WRITE, 0);
faddr += PAGE_SIZE;
taddr += PAGE_SIZE;
len -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
return 0;
}
/*
* Unmap a previously-mapped user I/O request.
*/
void
vunmapbuf(struct buf *bp, vsize_t len)
{
vaddr_t addr, off;
KASSERT((bp->b_flags & B_PHYS) != 0);
addr = trunc_page((vaddr_t)bp->b_data);
off = (vaddr_t)bp->b_data - addr;
len = round_page(off + len);
pmap_kremove(addr, len);
pmap_update(pmap_kernel());
uvm_km_free(phys_map, addr, len, UVM_KMF_VAONLY);
bp->b_data = bp->b_saveaddr;
bp->b_saveaddr = 0;
}
#ifdef __HAVE_CPU_UAREA_ROUTINES
/*
* Layout of the uarea:
* Page[0] = PCB
* Page[1] = RedZone
* Page[2] = Stack
* Page[...] = Stack
* Page[UPAGES-1] = Stack
* Page[UPAGES] = RedZone
* There is a redzone at the beginning of the stack, and another one at the
* end. The former is to protect against deep recursions that could corrupt
* the PCB, the latter to protect against severe stack overflows.
*/
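/*
* cpu_uarea_alloc() below implements this by allocating
* USPACE + PAGE_SIZE of wired kernel VA and then unmapping and
* freeing the physical pages behind Page[1] and Page[UPAGES], so
* that any access to either red zone faults immediately.
*/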
void *
cpu_uarea_alloc(bool system)
{
vaddr_t base, va;
paddr_t pa;
base = uvm_km_alloc(kernel_map, USPACE + PAGE_SIZE, 0,
UVM_KMF_WIRED|UVM_KMF_WAITVA);
/* Page[1] = RedZone */
va = base + PAGE_SIZE;
if (!pmap_extract(pmap_kernel(), va, &pa)) {
panic("%s: impossible, Page[1] unmapped", __func__);
}
pmap_kremove(va, PAGE_SIZE);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
/* Page[UPAGES] = RedZone */
va = base + USPACE;
if (!pmap_extract(pmap_kernel(), va, &pa)) {
panic("%s: impossible, Page[UPAGES] unmapped", __func__);
}
pmap_kremove(va, PAGE_SIZE);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
pmap_update(pmap_kernel());
return (void *)base;
}
bool
cpu_uarea_free(void *addr)
{
vaddr_t base = (vaddr_t)addr;
KASSERT(!pmap_extract(pmap_kernel(), base + PAGE_SIZE, NULL));
KASSERT(!pmap_extract(pmap_kernel(), base + USPACE, NULL));
uvm_km_free(kernel_map, base, USPACE + PAGE_SIZE, UVM_KMF_WIRED);
return true;
}
#endif /* __HAVE_CPU_UAREA_ROUTINES */
/* $NetBSD: secmodel.c,v 1.2 2014/11/04 16:01:58 maxv Exp $ */
/*-
* Copyright (c) 2011 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <secmodel/secmodel.h>
#include <prop/proplib.h>
/* List of secmodels, parameters, and lock. */
static LIST_HEAD(, secmodel_descr) secmodels =
LIST_HEAD_INITIALIZER(secmodels);
static unsigned int secmodel_copy_cred_on_fork = false;
static krwlock_t secmodels_lock;
static int nsecmodels = 0; /* number of registered secmodels */
static int secmodel_plug(secmodel_t);
static int secmodel_unplug(secmodel_t);
int
secmodel_nsecmodels(void)
{
return nsecmodels;
}
void
secmodel_init(void)
{
rw_init(&secmodels_lock);
secmodel_copy_cred_on_fork = false;
}
/*
* Register a new secmodel.
*/
int
secmodel_register(secmodel_t *secmodel, const char *id, const char *name,
prop_dictionary_t behavior,
secmodel_eval_t eval, secmodel_setinfo_t setinfo)
{
int err;
secmodel_t sm;
sm = kmem_alloc(sizeof(*sm), KM_SLEEP);
sm->sm_id = id;
sm->sm_name = name;
sm->sm_behavior = behavior;
sm->sm_eval = eval;
sm->sm_setinfo = setinfo;
err = secmodel_plug(sm);
if (err == 0) {
atomic_inc_uint(&nsecmodels);
} else {
kmem_free(sm, sizeof(*sm));
sm = NULL;
}
*secmodel = sm;
return err;
}
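/*
* Hypothetical caller sketch (identifiers are illustrative, not taken
* from this file): a security model registers itself once and keeps
* the returned handle for later deregistration.
*
*	static secmodel_t example_sm;
*
*	error = secmodel_register(&example_sm, "org.example.secmodel",
*	    "Example model", NULL, NULL, NULL);
*	...
*	error = secmodel_deregister(example_sm);
*/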
/*
* Deregister a secmodel.
*/
int
secmodel_deregister(secmodel_t sm)
{
int error;
error = secmodel_unplug(sm);
if (error == 0) {
atomic_dec_uint(&nsecmodels);
kmem_free(sm, sizeof(*sm));
}
return error;
}
/*
* Lookup a secmodel by its id.
*
* Requires "secmodels_lock" handling by the caller.
*/
static secmodel_t
secmodel_lookup(const char *id)
{
secmodel_t tsm;
KASSERT(rw_lock_held(&secmodels_lock));
LIST_FOREACH(tsm, &secmodels, sm_list) {
if (strcasecmp(tsm->sm_id, id) == 0) {
return tsm;
}
}
return NULL;
}
/*
* Adjust system-global secmodel behavior following the addition
* or removal of a secmodel.
*
* Requires "secmodels_lock" to be held by the caller.
*/
static void
secmodel_adjust_behavior(secmodel_t sm, bool added)
{
bool r, b;
KASSERT(rw_write_held(&secmodels_lock));
#define ADJUST_COUNTER(which, added) \
do { \
if (added) { \
(which)++; \
} else { \
if ((which) > 0) \
(which)--; \
} \
} while (/*CONSTCOND*/0)
/* Copy credentials on fork? */
r = prop_dictionary_get_bool(sm->sm_behavior, "copy-cred-on-fork", &b);
if (r) {
ADJUST_COUNTER(secmodel_copy_cred_on_fork, added);
}
#undef ADJUST_COUNTER
}
static int
secmodel_plug(secmodel_t sm)
{
secmodel_t tsm;
int error = 0;
if (sm == NULL)
return EFAULT;
/* Check if the secmodel is already present. */
rw_enter(&secmodels_lock, RW_WRITER);
tsm = secmodel_lookup(sm->sm_id);
if (tsm != NULL) {
error = EEXIST;
goto out;
}
/* Add the secmodel. */
LIST_INSERT_HEAD(&secmodels, sm, sm_list);
/* Adjust behavior. */
secmodel_adjust_behavior(sm, true);
out:
/* Unlock the secmodels list. */
rw_exit(&secmodels_lock);
return error;
}
static int
secmodel_unplug(secmodel_t sm)
{
secmodel_t tsm;
int error = 0;
if (sm == NULL)
return EFAULT;
/* Make sure the secmodel is present. */
rw_enter(&secmodels_lock, RW_WRITER);
tsm = secmodel_lookup(sm->sm_id);
if (tsm == NULL) {
error = ENOENT;
goto out;
}
/* Remove the secmodel. */
LIST_REMOVE(tsm, sm_list);
/* Adjust behavior. */
secmodel_adjust_behavior(tsm, false);
out:
/* Unlock the secmodels list. */
rw_exit(&secmodels_lock);
return error;
}
/* XXX TODO */
int
secmodel_setinfo(const char *id, void *v, int *err)
{
return EOPNOTSUPP;
}
int
secmodel_eval(const char *id, const char *what, void *arg, void *ret)
{
secmodel_t sm;
int error = 0;
rw_enter(&secmodels_lock, RW_READER);
sm = secmodel_lookup(id);
if (sm == NULL) {
error = EINVAL;
goto out;
}
if (sm->sm_eval == NULL) {
error = ENOENT;
goto out;
}
if (ret == NULL) {
error = EFAULT;
goto out;
}
error = sm->sm_eval(what, arg, ret);
/* pass error from a secmodel(9) callback as a negative value */
error = -error;
out:
rw_exit(&secmodels_lock);
return error;
}
/* $NetBSD: in6_cksum.c,v 1.28 2011/04/25 22:05:05 yamt Exp $ */
/*-
* Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_cksum.c,v 1.28 2011/04/25 22:05:05 yamt Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
/*
* Checksum of the IPv6 pseudo header.
*
* off is supposed to be the skipped IPv6 header, len is the payload size.
*/
int
in6_cksum(struct mbuf *m, u_int8_t nxt, uint32_t off, uint32_t len)
{
union {
uint16_t words[16];
struct {
struct in6_addr ip6_src;
struct in6_addr ip6_dst;
} addrs;
} u;
const struct in6_addr *in6_src;
const struct in6_addr *in6_dst;
const struct ip6_hdr *ip6;
uint32_t sum;
const uint16_t *w;
const char *cp;
if (nxt == 0)
return cpu_in_cksum(m, len, off, 0);
if (__predict_false(off < sizeof(struct ip6_hdr)))
panic("in6_cksum: offset too short for IPv6 header");
if (__predict_false(m->m_len < sizeof(struct ip6_hdr)))
panic("in6_cksum: mbuf too short for IPv6 header");
/*
* Compute the equivalent of:
* struct ip6_hdr_pseudo ip6;
*
* bzero(&ip6, sizeof(ip6));
* ip6.ip6ph_nxt = nxt;
* ip6.ip6ph_len = htonl(len);
* ip6.ip6ph_src = mtod(m, struct ip6_hdr *)->ip6_src;
* in6_clearscope(&ip6.ip6ph_src);
* ip6.ip6ph_dst = mtod(m, struct ip6_hdr *)->ip6_dst;
* in6_clearscope(&ip6.ip6ph_dst);
* sum = one_add(&ip6);
*/
#if BYTE_ORDER == LITTLE_ENDIAN
sum = ((len & 0xffff) + ((len >> 16) & 0xffff) + nxt) << 8;
#else
sum = (len & 0xffff) + ((len >> 16) & 0xffff) + nxt;
#endif
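/*
* On a little-endian host the 16-bit words are accumulated in
* byte-swapped form, and shifting the partial sum left by 8 is, in
* effect (after the final end-around-carry fold), a byte swap, so
* the length/next-header contribution above ends up in the same
* byte order as the address words summed below.
*/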
cp = mtod(m, const char *);
w = (const uint16_t *)(cp + offsetof(struct ip6_hdr, ip6_src));
ip6 = (const void *)cp;
if (__predict_true((uintptr_t)w % 2 == 0)) {
in6_src = &ip6->ip6_src;
in6_dst = &ip6->ip6_dst;
} else {
memcpy(&u, &ip6->ip6_src, 32);
w = u.words;
in6_src = &u.addrs.ip6_src;
in6_dst = &u.addrs.ip6_dst;
}
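/*
* The second 16-bit word of a scope-embeddable (e.g. link-local)
* address carries the embedded zone ID, which the pseudo header
* must treat as zero (cf. the in6_clearscope() calls in the sketch
* above), so w[1] is skipped for such addresses.
*/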
sum += w[0];
if (!IN6_IS_SCOPE_EMBEDDABLE(in6_src))
sum += w[1];
sum += w[2];
sum += w[3];
sum += w[4];
sum += w[5];
sum += w[6];
sum += w[7];
w += 8;
sum += w[0];
if (!IN6_IS_SCOPE_EMBEDDABLE(in6_dst))
sum += w[1];
sum += w[2];
sum += w[3];
sum += w[4];
sum += w[5];
sum += w[6];
sum += w[7];
return cpu_in_cksum(m, len, off, sum);
}
/* $NetBSD: sys_sig.c,v 1.57 2023/10/04 20:42:38 ad Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.14 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_sig.c,v 1.57 2023/10/04 20:42:38 ad Exp $");
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/wait.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/compat_stub.h>
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE2(proc, kernel, , signal__clear,
"int", /* signal */
"ksiginfo_t *"); /* signal-info */
int
sys___sigaction_sigtramp(struct lwp *l,
const struct sys___sigaction_sigtramp_args *uap, register_t *retval)
{
/* {
syscallarg(int) signum;
syscallarg(const struct sigaction *) nsa;
syscallarg(struct sigaction *) osa;
syscallarg(void *) tramp;
syscallarg(int) vers;
} */
struct sigaction nsa, osa;
int error;
if (SCARG(uap, nsa)) {
error = copyin(SCARG(uap, nsa), &nsa, sizeof(nsa));
if (error)
return (error);
}
error = sigaction1(l, SCARG(uap, signum),
SCARG(uap, nsa) ? &nsa : 0, SCARG(uap, osa) ? &osa : 0,
SCARG(uap, tramp), SCARG(uap, vers));
if (error)
return (error);
if (SCARG(uap, osa)) {
error = copyout(&osa, SCARG(uap, osa), sizeof(osa));
if (error)
return (error);
}
return 0;
}
/*
* Manipulate signal mask. Note that we receive new mask, not pointer, and
* return old mask as return value; the library stub does the rest.
*/
int
sys___sigprocmask14(struct lwp *l, const struct sys___sigprocmask14_args *uap,
register_t *retval)
{
/* {
syscallarg(int) how;
syscallarg(const sigset_t *) set;
syscallarg(sigset_t *) oset;
} */
struct proc *p = l->l_proc;
sigset_t nss, oss;
int error;
if (SCARG(uap, set)) {
error = copyin(SCARG(uap, set), &nss, sizeof(nss));
if (error)
return error;
}
mutex_enter(p->p_lock);
error = sigprocmask1(l, SCARG(uap, how),
SCARG(uap, set) ? &nss : 0, SCARG(uap, oset) ? &oss : 0);
mutex_exit(p->p_lock);
if (error)
return error;
if (SCARG(uap, oset)) {
error = copyout(&oss, SCARG(uap, oset), sizeof(oss));
if (error)
return error;
}
return 0;
}
int
sys___sigpending14(struct lwp *l, const struct sys___sigpending14_args *uap,
register_t *retval)
{
/* {
syscallarg(sigset_t *) set;
} */
sigset_t ss;
sigpending1(l, &ss);
return copyout(&ss, SCARG(uap, set), sizeof(ss));
}
/*
* Suspend process until signal, providing mask to be set in the meantime.
* Note nonstandard calling convention: libc stub passes mask, not pointer,
* to save a copyin.
*/
int
sys___sigsuspend14(struct lwp *l, const struct sys___sigsuspend14_args *uap,
register_t *retval)
{
/* {
syscallarg(const sigset_t *) set;
} */
sigset_t ss;
int error;
if (SCARG(uap, set)) {
error = copyin(SCARG(uap, set), &ss, sizeof(ss));
if (error)
return error;
}
return sigsuspend1(l, SCARG(uap, set) ? &ss : 0);
}
int
sys___sigaltstack14(struct lwp *l, const struct sys___sigaltstack14_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct sigaltstack *) nss;
syscallarg(struct sigaltstack *) oss;
} */
stack_t nss, oss;
int error;
if (SCARG(uap, nss)) {
error = copyin(SCARG(uap, nss), &nss, sizeof(nss));
if (error)
return error;
}
error = sigaltstack1(l,
SCARG(uap, nss) ? &nss : 0, SCARG(uap, oss) ? &oss : 0);
if (error)
return error;
if (SCARG(uap, oss)) {
error = copyout(&oss, SCARG(uap, oss), sizeof(oss));
if (error)
return error;
}
return 0;
}
int
kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval)
{
int error;
struct proc *p;
if ((u_int)ksi->ksi_signo >= NSIG)
return EINVAL;
if (pid != l->l_proc->p_pid) {
if (ksi->ksi_pid != l->l_proc->p_pid)
return EPERM;
if (ksi->ksi_uid != kauth_cred_geteuid(l->l_cred))
return EPERM;
switch (ksi->ksi_code) {
case SI_USER:
case SI_QUEUE:
break;
default:
return EPERM;
}
}
if (pid > 0) {
/* kill single process */
mutex_enter(&proc_lock);
p = proc_find_raw(pid);
if (p == NULL || (p->p_stat != SACTIVE && p->p_stat != SSTOP)) {
mutex_exit(&proc_lock);
/* IEEE Std 1003.1-2001: return success for zombies */
return p ? 0 : ESRCH;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(ksi->ksi_signo),
NULL, NULL);
if (!error && ksi->ksi_signo) {
error = kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return error;
}
switch (pid) {
case -1: /* broadcast signal */
return killpg1(l, ksi, 0, 1);
case 0: /* signal own process group */
return killpg1(l, ksi, 0, 0);
default: /* negative explicit process group */
return killpg1(l, ksi, -pid, 0);
}
/* NOTREACHED */
}
int
sys_sigqueueinfo(struct lwp *l, const struct sys_sigqueueinfo_args *uap,
register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(const siginfo_t *) info;
} */
ksiginfo_t ksi;
int error;
KSI_INIT(&ksi);
if ((error = copyin(&SCARG(uap, info)->_info, &ksi.ksi_info,
sizeof(ksi.ksi_info))) != 0)
return error;
return kill1(l, SCARG(uap, pid), &ksi, retval);
}
int
sys_kill(struct lwp *l, const struct sys_kill_args *uap, register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(int) signum;
} */
ksiginfo_t ksi;
KSI_INIT(&ksi);
ksi.ksi_signo = SCARG(uap, signum);
ksi.ksi_code = SI_USER;
ksi.ksi_pid = l->l_proc->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(l->l_cred);
return kill1(l, SCARG(uap, pid), &ksi, retval);
}
int
sys_getcontext(struct lwp *l, const struct sys_getcontext_args *uap,
register_t *retval)
{
/* {
syscallarg(struct __ucontext *) ucp;
} */
struct proc *p = l->l_proc;
ucontext_t uc;
memset(&uc, 0, sizeof(uc));
mutex_enter(p->p_lock);
getucontext(l, &uc);
mutex_exit(p->p_lock);
return copyout(&uc, SCARG(uap, ucp), sizeof (*SCARG(uap, ucp)));
}
int
sys_setcontext(struct lwp *l, const struct sys_setcontext_args *uap,
register_t *retval)
{
/* {
syscallarg(const ucontext_t *) ucp;
} */
struct proc *p = l->l_proc;
ucontext_t uc;
int error;
error = copyin(SCARG(uap, ucp), &uc, sizeof (uc));
if (error)
return error;
if ((uc.uc_flags & _UC_CPU) == 0)
return EINVAL;
mutex_enter(p->p_lock);
error = setucontext(l, &uc);
mutex_exit(p->p_lock);
if (error)
return error;
return EJUSTRETURN;
}
/*
* sigtimedwait(2) system call, used also for implementation
* of sigwaitinfo() and sigwait().
*
* This only handles a single LWP in signal wait. libpthread provides
* its own sigtimedwait() wrapper to do the right thing for individual
* threads.
*/
int
sys_____sigtimedwait50(struct lwp *l,
const struct sys_____sigtimedwait50_args *uap, register_t *retval)
{
return sigtimedwait1(l, uap, retval, copyin, copyout, copyin, copyout);
}
int
sigaction1(struct lwp *l, int signum, const struct sigaction *nsa,
struct sigaction *osa, const void *tramp, int vers)
{
struct proc *p;
struct sigacts *ps;
sigset_t tset;
int prop, error;
ksiginfoq_t kq;
static bool v0v1valid;
if (signum <= 0 || signum >= NSIG)
return EINVAL;
p = l->l_proc;
error = 0;
ksiginfo_queue_init(&kq);
/*
* Trampoline ABI version __SIGTRAMP_SIGCODE_VERSION (0) is reserved
* for the legacy kernel provided on-stack trampoline. Conversely,
* if we are using a non-0 ABI version, we must have a trampoline.
* Only validate the vers if a new sigaction was supplied and there
* was an actual handler specified (not SIG_IGN or SIG_DFL), which
* don't require a trampoline. Emulations use legacy kernel
* trampolines with version 0, so check for that case as well.
*
* If version < __SIGTRAMP_SIGINFO_VERSION_MIN (usually 2), we try
* to autoload the compat module. Note that we interlock with the
* unload check in compat_modcmd() using kernconfig_lock. If the
* autoload fails, we don't try it again for this process.
*/
if (nsa != NULL && nsa->sa_handler != SIG_IGN
&& nsa->sa_handler != SIG_DFL) {
if (__predict_false(vers < __SIGTRAMP_SIGINFO_VERSION_MIN)) {
if (vers == __SIGTRAMP_SIGCODE_VERSION &&
p->p_sigctx.ps_sigcode != NULL) {
/*
* if sigcode is used for this emulation,
* version 0 is allowed.
*/
}
#ifdef __HAVE_STRUCT_SIGCONTEXT
else if (p->p_flag & PK_32) {
/*
* The 32-bit compat module will have
* pre-validated this for us.
*/
v0v1valid = true;
} else if ((p->p_lflag & PL_SIGCOMPAT) == 0) {
kernconfig_lock();
(void)module_autoload("compat_16",
MODULE_CLASS_ANY);
if (sendsig_sigcontext_16_hook.hooked) {
/*
* We need to remember if the
* sigcontext method may be useable,
* because libc may use it even
* if siginfo is available.
*/
v0v1valid = true;
}
mutex_enter(&proc_lock);
/*
* Prevent unload of compat module while
* this process remains.
*/
p->p_lflag |= PL_SIGCOMPAT;
mutex_exit(&proc_lock);
kernconfig_unlock();
}
#endif /* __HAVE_STRUCT_SIGCONTEXT */
}
switch (vers) {
case __SIGTRAMP_SIGCODE_VERSION:
/* kernel supplied trampoline. */
if (tramp != NULL ||
(p->p_sigctx.ps_sigcode == NULL && !v0v1valid)) {
return EINVAL;
}
break;
#ifdef __HAVE_STRUCT_SIGCONTEXT
case __SIGTRAMP_SIGCONTEXT_VERSION_MIN ...
__SIGTRAMP_SIGCONTEXT_VERSION_MAX:
/* sigcontext, user supplied trampoline. */
if (tramp == NULL || !v0v1valid) {
return EINVAL;
}
break;
#endif /* __HAVE_STRUCT_SIGCONTEXT */
case __SIGTRAMP_SIGINFO_VERSION_MIN ...
__SIGTRAMP_SIGINFO_VERSION_MAX:
/* siginfo, user supplied trampoline. */
if (tramp == NULL) {
return EINVAL;
}
break;
default:
/* Invalid trampoline version. */
return EINVAL;
}
}
mutex_enter(p->p_lock);
ps = p->p_sigacts;
if (osa)
sigaction_copy(osa, &SIGACTION_PS(ps, signum));
if (!nsa)
goto out;
prop = sigprop[signum];
if ((nsa->sa_flags & ~SA_ALLBITS) || (prop & SA_CANTMASK)) {
error = EINVAL;
goto out;
}
sigaction_copy(&SIGACTION_PS(ps, signum), nsa);
ps->sa_sigdesc[signum].sd_tramp = tramp;
ps->sa_sigdesc[signum].sd_vers = vers;
sigminusset(&sigcantmask, &SIGACTION_PS(ps, signum).sa_mask);
if ((prop & SA_NORESET) != 0)
SIGACTION_PS(ps, signum).sa_flags &= ~SA_RESETHAND;
if (signum == SIGCHLD) {
if (nsa->sa_flags & SA_NOCLDSTOP)
p->p_sflag |= PS_NOCLDSTOP;
else
p->p_sflag &= ~PS_NOCLDSTOP;
if (nsa->sa_flags & SA_NOCLDWAIT) {
/*
* Paranoia: since SA_NOCLDWAIT is implemented by
* reparenting the dying child to PID 1 (and trust
* it to reap the zombie), PID 1 itself is forbidden
* to set SA_NOCLDWAIT.
*/
if (p->p_pid == 1)
p->p_flag &= ~PK_NOCLDWAIT;
else
p->p_flag |= PK_NOCLDWAIT;
} else
p->p_flag &= ~PK_NOCLDWAIT;
if (nsa->sa_handler == SIG_IGN) {
/*
* Paranoia: same as above.
*/
if (p->p_pid == 1)
p->p_flag &= ~PK_CLDSIGIGN;
else
p->p_flag |= PK_CLDSIGIGN;
} else
p->p_flag &= ~PK_CLDSIGIGN;
}
if ((nsa->sa_flags & SA_NODEFER) == 0)
sigaddset(&SIGACTION_PS(ps, signum).sa_mask, signum);
else
sigdelset(&SIGACTION_PS(ps, signum).sa_mask, signum);
/*
* Set bit in p_sigctx.ps_sigignore for signals that are set to
* SIG_IGN, and for signals set to SIG_DFL where the default is to
* ignore. However, don't put SIGCONT in p_sigctx.ps_sigignore, as
* we have to restart the process.
*/
if (nsa->sa_handler == SIG_IGN ||
(nsa->sa_handler == SIG_DFL && (prop & SA_IGNORE) != 0)) {
/* Never to be seen again. */
sigemptyset(&tset);
sigaddset(&tset, signum);
sigclearall(p, &tset, &kq);
if (signum != SIGCONT) {
/* Easier in psignal */
sigaddset(&p->p_sigctx.ps_sigignore, signum);
}
sigdelset(&p->p_sigctx.ps_sigcatch, signum);
} else {
sigdelset(&p->p_sigctx.ps_sigignore, signum);
if (nsa->sa_handler == SIG_DFL)
sigdelset(&p->p_sigctx.ps_sigcatch, signum);
else
sigaddset(&p->p_sigctx.ps_sigcatch, signum);
}
/*
* Previously held signals may now have become visible. Ensure that
* we check for them before returning to userspace.
*/
if (sigispending(l, 0)) {
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
out:
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
return error;
}
int
sigprocmask1(struct lwp *l, int how, const sigset_t *nss, sigset_t *oss)
{
sigset_t *mask = &l->l_sigmask;
bool more;
KASSERT(mutex_owned(l->l_proc->p_lock));
if (oss) {
*oss = *mask;
}
if (nss == NULL) {
return 0;
}
switch (how) {
case SIG_BLOCK:
sigplusset(nss, mask);
more = false;
break;
case SIG_UNBLOCK:
sigminusset(nss, mask);
more = true;
break;
case SIG_SETMASK:
*mask = *nss;
more = true;
break;
default:
return EINVAL;
}
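/*
* Only SIG_UNBLOCK and SIG_SETMASK can make previously blocked
* signals deliverable, hence `more' is only set for those cases.
*/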
sigminusset(&sigcantmask, mask);
if (more && sigispending(l, 0)) {
/*
* Check for pending signals on return to user.
*/
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
return 0;
}
void
sigpending1(struct lwp *l, sigset_t *ss)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
*ss = l->l_sigpend.sp_set;
sigplusset(&p->p_sigpend.sp_set, ss);
mutex_exit(p->p_lock);
}
void
sigsuspendsetup(struct lwp *l, const sigset_t *ss)
{
struct proc *p = l->l_proc;
/*
* When returning from sigsuspend/pselect/pollts, we want
* the old mask to be restored after the
* signal handler has finished. Thus, we
* save it here and mark the sigctx structure
* to indicate this.
*/
mutex_enter(p->p_lock);
l->l_sigrestore = 1;
l->l_sigoldmask = l->l_sigmask;
l->l_sigmask = *ss;
sigminusset(&sigcantmask, &l->l_sigmask);
/* Check for pending signals when sleeping. */
if (sigispending(l, 0)) {
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
mutex_exit(p->p_lock);
}
void
sigsuspendteardown(struct lwp *l)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
/* Check for pending signals when sleeping. */
if (l->l_sigrestore) {
if (sigispending(l, 0)) {
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
} else {
l->l_sigrestore = 0;
l->l_sigmask = l->l_sigoldmask;
}
}
mutex_exit(p->p_lock);
}
int
sigsuspend1(struct lwp *l, const sigset_t *ss)
{
if (ss)
sigsuspendsetup(l, ss);
while (kpause("pause", true, 0, NULL) == 0)
;
/* always return EINTR rather than ERESTART... */
return EINTR;
}
int
sigaltstack1(struct lwp *l, const stack_t *nss, stack_t *oss)
{
struct proc *p = l->l_proc;
int error = 0;
mutex_enter(p->p_lock);
if (oss)
*oss = l->l_sigstk;
if (nss) {
if (nss->ss_flags & ~SS_ALLBITS)
error = EINVAL;
else if (nss->ss_flags & SS_DISABLE) {
if (l->l_sigstk.ss_flags & SS_ONSTACK)
error = EINVAL;
} else if (nss->ss_size < MINSIGSTKSZ)
error = ENOMEM;
if (!error)
l->l_sigstk = *nss;
}
mutex_exit(p->p_lock);
return error;
}
int
sigtimedwait1(struct lwp *l, const struct sys_____sigtimedwait50_args *uap,
register_t *retval, copyin_t fetchss, copyout_t storeinf, copyin_t fetchts,
copyout_t storets)
{
/* {
syscallarg(const sigset_t *) set;
syscallarg(siginfo_t *) info;
syscallarg(struct timespec *) timeout;
} */
struct proc *p = l->l_proc;
int error, signum, timo;
struct timespec ts, tsstart, tsnow;
ksiginfo_t ksi;
/*
* Calculate timeout, if it was specified.
*
* NULL pointer means an infinite timeout.
* {.tv_sec = 0, .tv_nsec = 0} means do not block.
*/
if (SCARG(uap, timeout)) {
error = (*fetchts)(SCARG(uap, timeout), &ts, sizeof(ts));
if (error)
return error;
if ((error = itimespecfix(&ts)) != 0)
return error;
timo = tstohz(&ts);
if (timo == 0) {
if (ts.tv_sec == 0 && ts.tv_nsec == 0)
timo = -1; /* do not block */
else
timo = 1; /* the shortest possible timeout */
}
/*
* Remember current uptime, it would be used in
* ECANCELED/ERESTART case.
*/
getnanouptime(&tsstart);
} else {
memset(&tsstart, 0, sizeof(tsstart)); /* XXXgcc */
timo = 0; /* infinite timeout */
}
error = (*fetchss)(SCARG(uap, set), &l->l_sigwaitset,
sizeof(l->l_sigwaitset));
if (error)
return error;
/*
* Silently ignore SA_CANTMASK signals. psignal1() would ignore
* SA_CANTMASK signals in waitset, we do this only for the below
* siglist check.
*/
sigminusset(&sigcantmask, &l->l_sigwaitset);
memset(&ksi.ksi_info, 0, sizeof(ksi.ksi_info));
mutex_enter(p->p_lock);
/* Check for pending signals in the process, if no - then in LWP. */
if ((signum = sigget(&p->p_sigpend, &ksi, 0, &l->l_sigwaitset)) == 0)
signum = sigget(&l->l_sigpend, &ksi, 0, &l->l_sigwaitset);
if (signum != 0) {
/* If found a pending signal, just copy it out to the user. */
mutex_exit(p->p_lock);
goto out;
}
if (timo < 0) {
/* If not allowed to block, return an error */
mutex_exit(p->p_lock);
return EAGAIN;
}
/*
* Set up the sigwait list and wait for signal to arrive.
* We can either be woken up or time out.
*/
l->l_sigwaited = &ksi;
LIST_INSERT_HEAD(&p->p_sigwaiters, l, l_sigwaiter);
error = cv_timedwait_sig(&l->l_sigcv, p->p_lock, timo);
/*
* Need to find out if we woke as a result of _lwp_wakeup() or a
* signal outside our wait set.
*/
if (l->l_sigwaited != NULL) {
if (error == EINTR) {
/* Wakeup via _lwp_wakeup(). */
error = ECANCELED;
} else if (!error) {
/* Spurious wakeup - arrange for syscall restart. */
error = ERESTART;
}
l->l_sigwaited = NULL;
LIST_REMOVE(l, l_sigwaiter);
}
mutex_exit(p->p_lock);
/*
* If the sleep was interrupted (either by signal or wakeup), update
* the timeout and copyout new value back. It would be used when
* the syscall would be restarted or called again.
*/
if (timo && (error == ERESTART || error == ECANCELED)) {
getnanouptime(&tsnow);
/* Compute how much time has passed since start. */
timespecsub(&tsnow, &tsstart, &tsnow);
/* Subtract passed time from timeout. */
timespecsub(&ts, &tsnow, &ts);
if (ts.tv_sec < 0)
error = EAGAIN;
else {
/* Copy updated timeout to userland. */
error = (*storets)(&ts, SCARG(uap, timeout),
sizeof(ts));
}
}
out:
/*
* If a signal from the wait set arrived, copy it to userland.
* Copy only the used part of siginfo, the padding part is
* left unchanged (userland is not supposed to touch it anyway).
*/
if (error == 0 && SCARG(uap, info)) {
error = (*storeinf)(&ksi.ksi_info, SCARG(uap, info),
sizeof(ksi.ksi_info));
}
if (error == 0) {
*retval = ksi.ksi_info._signo;
SDT_PROBE(proc, kernel, , signal__clear, *retval,
&ksi, 0, 0, 0);
}
return error;
}
/* $NetBSD: kern_kthread.c,v 1.49 2023/09/23 14:40:42 ad Exp $ */
/*-
* Copyright (c) 1998, 1999, 2007, 2009, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_kthread.c,v 1.49 2023/09/23 14:40:42 ad Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/sched.h>
#include <sys/kmem.h>
#include <sys/msan.h>
#include <uvm/uvm_extern.h>
static kmutex_t kthread_lock;
static kcondvar_t kthread_cv;
void
kthread_sysinit(void)
{
mutex_init(&kthread_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&kthread_cv, "kthrwait");
}
/*
* kthread_create: create a kernel thread, that is, system-only LWP.
*/
int
kthread_create(pri_t pri, int flag, struct cpu_info *ci,
void (*func)(void *), void *arg, lwp_t **lp, const char *fmt, ...)
{
lwp_t *l;
vaddr_t uaddr;
int error, lc;
va_list ap;
KASSERT((flag & KTHREAD_INTR) == 0 || (flag & KTHREAD_MPSAFE) != 0);
uaddr = uvm_uarea_system_alloc(
(flag & (KTHREAD_INTR|KTHREAD_IDLE)) == KTHREAD_IDLE ? ci : NULL);
if (uaddr == 0) {
return ENOMEM;
}
kmsan_orig((void *)uaddr, USPACE, KMSAN_TYPE_POOL, __RET_ADDR);
if ((flag & KTHREAD_TS) != 0) {
lc = SCHED_OTHER;
} else {
lc = SCHED_RR;
}
error = lwp_create(&lwp0, &proc0, uaddr, LWP_DETACHED, NULL,
0, func, arg, &l, lc, &lwp0.l_sigmask, &lwp0.l_sigstk);
if (error) {
uvm_uarea_system_free(uaddr);
return error;
}
if (fmt != NULL) {
l->l_name = kmem_alloc(MAXCOMLEN, KM_SLEEP);
va_start(ap, fmt);
vsnprintf(l->l_name, MAXCOMLEN, fmt, ap);
va_end(ap);
}
/*
* Set parameters.
*/
if (pri == PRI_NONE) {
if ((flag & KTHREAD_TS) != 0) {
/* Maximum user priority level. */
pri = MAXPRI_USER;
} else {
/* Minimum kernel priority level. */
pri = PRI_KTHREAD;
}
}
mutex_enter(proc0.p_lock);
lwp_lock(l);
lwp_changepri(l, pri);
if (ci != NULL) {
if (ci != l->l_cpu) {
lwp_unlock_to(l, ci->ci_schedstate.spc_lwplock);
lwp_lock(l);
l->l_cpu = ci;
}
l->l_pflag |= LP_BOUND;
}
if ((flag & KTHREAD_MUSTJOIN) != 0) {
KASSERT(lp != NULL);
l->l_pflag |= LP_MUSTJOIN;
}
if ((flag & KTHREAD_INTR) != 0) {
l->l_pflag |= LP_INTR;
}
if ((flag & KTHREAD_MPSAFE) == 0) {
l->l_pflag &= ~LP_MPSAFE;
}
/*
* Set the new LWP running, unless the caller has requested
* otherwise.
*/
KASSERT(l->l_stat == LSIDL);
if ((flag & KTHREAD_IDLE) == 0) {
setrunnable(l);
/* LWP now unlocked */
} else {
lwp_unlock(l);
}
mutex_exit(proc0.p_lock);
/* All done! */
if (lp != NULL) {
*lp = l;
}
return 0;
}
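/*
* Hypothetical usage sketch (identifiers are illustrative, not taken
* from this file):
*
*	error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
*	    example_worker, softc, &softc->sc_lwp, "examplewk");
*
* The worker eventually terminates itself with kthread_exit(0); a
* creator that passed KTHREAD_MUSTJOIN would then call
* kthread_join() on the returned lwp.
*/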
/*
* Cause a kernel thread to exit. Assumes the exiting thread is the
* current context.
*/
void
kthread_exit(int ecode)
{
const char *name;
lwp_t *l = curlwp;
/* If the kernel lock is held, we need to drop it now. */
if ((l->l_pflag & LP_MPSAFE) == 0) {
KERNEL_UNLOCK_LAST(l);
}
/* We can't do much with the exit code, so just report it. */
if (ecode != 0) {
if ((name = l->l_name) == NULL)
name = "unnamed";
printf("WARNING: kthread `%s' (%d) exits with status %d\n",
name, l->l_lid, ecode);
}
/* Barrier for joining. */
if (l->l_pflag & LP_MUSTJOIN) {
bool *exitedp;
mutex_enter(&kthread_lock);
while ((exitedp = l->l_private) == NULL) {
cv_wait(&kthread_cv, &kthread_lock);
}
KASSERT(!*exitedp);
*exitedp = true;
cv_broadcast(&kthread_cv);
mutex_exit(&kthread_lock);
}
/* And exit.. */
lwp_exit(l);
panic("kthread_exit");
}
/*
* Wait for a kthread to exit, like pthread_join().
*/
int
kthread_join(lwp_t *l)
{
bool exited = false;
KASSERT((l->l_flag & LW_SYSTEM) != 0);
KASSERT((l->l_pflag & LP_MUSTJOIN) != 0);
/*
* - Ask the kthread to write to `exited'.
* - After this, touching l is forbidden -- it may be freed.
* - Wait until the kthread has written to `exited'.
*/
mutex_enter(&kthread_lock);
KASSERT(l->l_private == NULL);
l->l_private = &exited;
cv_broadcast(&kthread_cv);
while (!exited) {
cv_wait(&kthread_cv, &kthread_lock);
}
mutex_exit(&kthread_lock);
return 0;
}
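/*
* Example (sketch, not part of this file): a caller that must wait for
* its worker passes KTHREAD_MUSTJOIN, keeps the returned lwp pointer,
* and joins it after the worker has called kthread_exit().
* "example_worker" and "sc" are hypothetical.
*
*	error = kthread_create(PRI_NONE, KTHREAD_MPSAFE | KTHREAD_MUSTJOIN,
*	    NULL, example_worker, sc, &sc->sc_lwp, "examplewrk");
*	...
*	kthread_join(sc->sc_lwp);
*/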
/*
* kthread_fpu_enter()
*
* Allow the current lwp, which must be a kthread, to use the FPU.
* Return a cookie that must be passed to kthread_fpu_exit when
* done. Must be used only in thread context. Recursive -- you
* can call kthread_fpu_enter several times in a row as long as
* you pass the cookies in reverse order to kthread_fpu_exit.
*/
int
kthread_fpu_enter(void)
{
struct lwp *l = curlwp;
int s;
KASSERTMSG(!cpu_intr_p(),
"%s is not allowed in interrupt context", __func__);
KASSERTMSG(!cpu_softintr_p(),
"%s is not allowed in interrupt context", __func__);
/*
* Remember whether this thread already had FPU access, and
* mark this thread as having FPU access.
*/
lwp_lock(l);
KASSERTMSG(l->l_flag & LW_SYSTEM,
"%s is allowed only in kthreads", __func__);
s = l->l_flag & LW_SYSTEM_FPU;
l->l_flag |= LW_SYSTEM_FPU;
lwp_unlock(l);
/* Take MD steps to enable the FPU if necessary. */
if (s == 0)
kthread_fpu_enter_md();
return s;
}
/*
* kthread_fpu_exit(s)
*
* Restore the current lwp's FPU access to what it was before the
* matching call to kthread_fpu_enter() that returned s. Must be
* used only in thread context.
*/
void
kthread_fpu_exit(int s)
{
struct lwp *l = curlwp;
KASSERT(s == (s & LW_SYSTEM_FPU));
KASSERTMSG(!cpu_intr_p(),
"%s is not allowed in interrupt context", __func__);
KASSERTMSG(!cpu_softintr_p(),
"%s is not allowed in interrupt context", __func__);
lwp_lock(l);
KASSERTMSG(l->l_flag & LW_SYSTEM,
"%s is allowed only in kthreads", __func__);
KASSERT(l->l_flag & LW_SYSTEM_FPU);
l->l_flag ^= s ^ LW_SYSTEM_FPU;
lwp_unlock(l);
/* Take MD steps to zero and disable the FPU if necessary. */
if (s == 0)
kthread_fpu_exit_md();
}
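/*
* Example (sketch): nested use is legal as long as the cookies are
* passed back in reverse order, so the MD enable/disable steps happen
* only at the outermost pair.
*
*	int s1 = kthread_fpu_enter();
*	int s2 = kthread_fpu_enter();
*	... FPU-using code ...
*	kthread_fpu_exit(s2);
*	kthread_fpu_exit(s1);
*/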
/* $NetBSD: spl.h,v 1.10 2021/11/02 11:26:05 ryo Exp $ */
/*-
* Copyright (c)2005 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* this header is intended to be included by MD header.
*
* an assumption: makeiplcookie() is reasonably fast.
* if it isn't the case for your port, it's better to have MD optimized
* splxxx() functions, rather than using this header.
*/
#if !defined(_KERNEL) && !defined(_KMEMUSER)
#error not supposed to be exposed to userland.
#endif /* !defined(_KERNEL) && !defined(_KMEMUSER) */
#define _SPL_DECL(x, X) \
static __inline __always_inline int \
spl##x(void) \
{ return splraiseipl(makeiplcookie(IPL_##X)); }
#if defined(IPL_SOFTCLOCK)
_SPL_DECL(softclock, SOFTCLOCK)
#endif /* defined(IPL_SOFTCLOCK) */
#if defined(IPL_SOFTNET)
_SPL_DECL(softnet, SOFTNET)
#endif /* defined(IPL_SOFTNET) */
#if defined(IPL_SOFTSERIAL)
_SPL_DECL(softserial, SOFTSERIAL)
#endif /* defined(IPL_SOFTSERIAL) */
_SPL_DECL(vm, VM)
_SPL_DECL(sched, SCHED)
_SPL_DECL(high, HIGH)
#undef _SPL_DECL
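/*
* Illustration (not part of this header): on a port that defines
* IPL_SOFTNET, the _SPL_DECL(softnet, SOFTNET) line above expands to
* roughly the following.
*
*	static __inline __always_inline int
*	splsoftnet(void)
*	{ return splraiseipl(makeiplcookie(IPL_SOFTNET)); }
*/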
/* $NetBSD: kern_mutex.c,v 1.112 2023/10/15 10:28:23 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel mutex implementation, modeled after those found in Solaris,
* a description of which can be found in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*/
#define __MUTEX_PRIVATE
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mutex.c,v 1.112 2023/10/15 10:28:23 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <dev/lockstat.h>
#include <machine/lock.h>
/*
* When not running a debug kernel, spin mutexes are not much
* more than an splraiseipl() and splx() pair.
*/
#if defined(DIAGNOSTIC) || defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
#define FULL
#endif
/*
* Debugging support.
*/
#define MUTEX_WANTLOCK(mtx) \
LOCKDEBUG_WANTLOCK(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_TESTLOCK(mtx) \
LOCKDEBUG_WANTLOCK(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), -1)
#define MUTEX_LOCKED(mtx) \
LOCKDEBUG_LOCKED(MUTEX_DEBUG_P(mtx), (mtx), NULL, \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_UNLOCKED(mtx) \
LOCKDEBUG_UNLOCKED(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_ABORT(mtx, msg) \
mutex_abort(__func__, __LINE__, mtx, msg)
#if defined(LOCKDEBUG)
#define MUTEX_DASSERT(mtx, cond) \
do { \
if (__predict_false(!(cond))) \
MUTEX_ABORT(mtx, "assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else /* LOCKDEBUG */
#define MUTEX_DASSERT(mtx, cond) /* nothing */
#endif /* LOCKDEBUG */
#if defined(DIAGNOSTIC)
#define MUTEX_ASSERT(mtx, cond) \
do { \
if (__predict_false(!(cond))) \
MUTEX_ABORT(mtx, "assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else /* DIAGNOSTIC */
#define MUTEX_ASSERT(mtx, cond) /* nothing */
#endif /* DIAGNOSTIC */
/*
* Some architectures can't use __cpu_simple_lock as is so allow a way
* for them to use an alternate definition.
*/
#ifndef MUTEX_SPINBIT_LOCK_INIT
#define MUTEX_SPINBIT_LOCK_INIT(mtx) __cpu_simple_lock_init(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCKED_P
#define MUTEX_SPINBIT_LOCKED_P(mtx) __SIMPLELOCK_LOCKED_P(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCK_TRY
#define MUTEX_SPINBIT_LOCK_TRY(mtx) __cpu_simple_lock_try(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCK_UNLOCK
#define MUTEX_SPINBIT_LOCK_UNLOCK(mtx) __cpu_simple_unlock(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_INITIALIZE_SPIN_IPL
#define MUTEX_INITIALIZE_SPIN_IPL(mtx, ipl) \
((mtx)->mtx_ipl = makeiplcookie((ipl)))
#endif
/*
* Spin mutex SPL save / restore.
*/
#define MUTEX_SPIN_SPLRAISE(mtx) \
do { \
const int s = splraiseipl(MUTEX_SPIN_IPL(mtx)); \
struct cpu_info * const x__ci = curcpu(); \
const int x__cnt = x__ci->ci_mtx_count--; \
__insn_barrier(); \
if (x__cnt == 0) \
x__ci->ci_mtx_oldspl = s; \
} while (/* CONSTCOND */ 0)
#define MUTEX_SPIN_SPLRESTORE(mtx) \
do { \
struct cpu_info * const x__ci = curcpu(); \
const int s = x__ci->ci_mtx_oldspl; \
__insn_barrier(); \
if (++(x__ci->ci_mtx_count) == 0) \
splx(s); \
} while (/* CONSTCOND */ 0)
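/*
* Illustration (sketch) of how the per-CPU ci_mtx_count nesting counter
* behaves: it counts down from zero as spin mutexes are acquired, so the
* SPL is saved only by the outermost acquisition and restored only by the
* matching outermost release.
*
*	mutex_spin_enter(&a);	ci_mtx_count 0 -> -1, old SPL saved
*	mutex_spin_enter(&b);	ci_mtx_count -1 -> -2
*	mutex_spin_exit(&b);	ci_mtx_count -2 -> -1
*	mutex_spin_exit(&a);	ci_mtx_count -1 -> 0, splx(old SPL)
*/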
/*
* Memory barriers.
*/
#ifdef __HAVE_ATOMIC_AS_MEMBAR
#define MUTEX_MEMBAR_ENTER()
#else
#define MUTEX_MEMBAR_ENTER() membar_enter()
#endif
/*
* For architectures that provide 'simple' mutexes: they provide a
* CAS function that is either MP-safe, or does not need to be MP
* safe. Adaptive mutexes on these architectures do not require an
* additional interlock.
*/
#ifdef __HAVE_SIMPLE_MUTEXES
#define MUTEX_OWNER(owner) \
(owner & MUTEX_THREAD)
#define MUTEX_HAS_WAITERS(mtx) \
(((int)(mtx)->mtx_owner & MUTEX_BIT_WAITERS) != 0)
#define MUTEX_INITIALIZE_ADAPTIVE(mtx, dodebug) \
do { \
if (!dodebug) \
(mtx)->mtx_owner |= MUTEX_BIT_NODEBUG; \
} while (/* CONSTCOND */ 0)
#define MUTEX_INITIALIZE_SPIN(mtx, dodebug, ipl) \
do { \
(mtx)->mtx_owner = MUTEX_BIT_SPIN; \
if (!dodebug) \
(mtx)->mtx_owner |= MUTEX_BIT_NODEBUG; \
MUTEX_INITIALIZE_SPIN_IPL((mtx), (ipl)); \
MUTEX_SPINBIT_LOCK_INIT((mtx)); \
} while (/* CONSTCOND */ 0)
#define MUTEX_DESTROY(mtx) \
do { \
(mtx)->mtx_owner = MUTEX_THREAD; \
} while (/* CONSTCOND */ 0)
#define MUTEX_SPIN_P(owner) \
(((owner) & MUTEX_BIT_SPIN) != 0)
#define MUTEX_ADAPTIVE_P(owner) \
(((owner) & MUTEX_BIT_SPIN) == 0)
#ifndef MUTEX_CAS
#define MUTEX_CAS(p, o, n) \
(atomic_cas_ulong((volatile unsigned long *)(p), (o), (n)) == (o))
#endif /* MUTEX_CAS */
#define MUTEX_DEBUG_P(mtx) (((mtx)->mtx_owner & MUTEX_BIT_NODEBUG) == 0)
#if defined(LOCKDEBUG)
#define MUTEX_OWNED(owner) (((owner) & ~MUTEX_BIT_NODEBUG) != 0)
#define MUTEX_INHERITDEBUG(n, o) (n) |= (o) & MUTEX_BIT_NODEBUG
#else /* defined(LOCKDEBUG) */
#define MUTEX_OWNED(owner) ((owner) != 0)
#define MUTEX_INHERITDEBUG(n, o) /* nothing */
#endif /* defined(LOCKDEBUG) */
static inline int
MUTEX_ACQUIRE(kmutex_t *mtx, uintptr_t curthread)
{
int rv;
uintptr_t oldown = 0;
uintptr_t newown = curthread;
MUTEX_INHERITDEBUG(oldown, mtx->mtx_owner);
MUTEX_INHERITDEBUG(newown, oldown);
rv = MUTEX_CAS(&mtx->mtx_owner, oldown, newown);
membar_acquire();
return rv;
}
static inline int
MUTEX_SET_WAITERS(kmutex_t *mtx, uintptr_t owner)
{
int rv;
rv = MUTEX_CAS(&mtx->mtx_owner, owner, owner | MUTEX_BIT_WAITERS);
MUTEX_MEMBAR_ENTER();
return rv;
}
static inline void
MUTEX_RELEASE(kmutex_t *mtx)
{
uintptr_t newown;
newown = 0;
MUTEX_INHERITDEBUG(newown, mtx->mtx_owner);
atomic_store_release(&mtx->mtx_owner, newown);
}
#endif /* __HAVE_SIMPLE_MUTEXES */
/*
* Patch in stubs via strong alias where they are not available.
*/
#if defined(LOCKDEBUG)
#undef __HAVE_MUTEX_STUBS
#undef __HAVE_SPIN_MUTEX_STUBS
#endif
#ifndef __HAVE_MUTEX_STUBS
__strong_alias(mutex_enter,mutex_vector_enter);
__strong_alias(mutex_exit,mutex_vector_exit);
#endif
#ifndef __HAVE_SPIN_MUTEX_STUBS
__strong_alias(mutex_spin_enter,mutex_vector_enter);
__strong_alias(mutex_spin_exit,mutex_vector_exit);
#endif
static void mutex_abort(const char *, size_t, volatile const kmutex_t *,
const char *);
static void mutex_dump(const volatile void *, lockop_printer_t);
static lwp_t *mutex_owner(wchan_t);
lockops_t mutex_spin_lockops = {
.lo_name = "Mutex",
.lo_type = LOCKOPS_SPIN,
.lo_dump = mutex_dump,
};
lockops_t mutex_adaptive_lockops = {
.lo_name = "Mutex",
.lo_type = LOCKOPS_SLEEP,
.lo_dump = mutex_dump,
};
syncobj_t mutex_syncobj = {
.sobj_name = "mutex",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = turnstile_unsleep,
.sobj_changepri = turnstile_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = mutex_owner,
};
/*
* mutex_dump:
*
* Dump the contents of a mutex structure.
*/
static void
mutex_dump(const volatile void *cookie, lockop_printer_t pr)
{
const volatile kmutex_t *mtx = cookie;
uintptr_t owner = mtx->mtx_owner;
pr("owner field : %#018lx wait/spin: %16d/%d\n",
(long)MUTEX_OWNER(owner), MUTEX_HAS_WAITERS(mtx),
MUTEX_SPIN_P(owner));
}
/*
* mutex_abort:
*
* Dump information about an error and panic the system. This
* generates a lot of machine code in the DIAGNOSTIC case, so
* we ask the compiler to not inline it.
*/
static void __noinline
mutex_abort(const char *func, size_t line, volatile const kmutex_t *mtx,
const char *msg)
{
LOCKDEBUG_ABORT(func, line, mtx, (MUTEX_SPIN_P(mtx->mtx_owner) ?
&mutex_spin_lockops : &mutex_adaptive_lockops), msg);
}
/*
* mutex_init:
*
* Initialize a mutex for use. Note that adaptive mutexes are in
* essence spin mutexes that can sleep to avoid deadlock and wasting
* CPU time. We can't easily provide a type of mutex that always
* sleeps - see comments in mutex_vector_enter() about releasing
* mutexes unlocked.
*/
void
_mutex_init(kmutex_t *mtx, kmutex_type_t type, int ipl,
uintptr_t return_address)
{
lockops_t *lockops __unused;
bool dodebug;
memset(mtx, 0, sizeof(*mtx));
if (ipl == IPL_NONE || ipl == IPL_SOFTCLOCK ||
ipl == IPL_SOFTBIO || ipl == IPL_SOFTNET ||
ipl == IPL_SOFTSERIAL) {
lockops = (type == MUTEX_NODEBUG ?
NULL : &mutex_adaptive_lockops);
dodebug = LOCKDEBUG_ALLOC(mtx, lockops, return_address);
MUTEX_INITIALIZE_ADAPTIVE(mtx, dodebug);
} else {
lockops = (type == MUTEX_NODEBUG ?
NULL : &mutex_spin_lockops);
dodebug = LOCKDEBUG_ALLOC(mtx, lockops, return_address);
MUTEX_INITIALIZE_SPIN(mtx, dodebug, ipl);
}
}
void
mutex_init(kmutex_t *mtx, kmutex_type_t type, int ipl)
{
_mutex_init(mtx, type, ipl, (uintptr_t)__builtin_return_address(0));
}
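/*
* Example (sketch, not from this file): the IPL argument chooses the
* mutex class; IPL_NONE and the soft IPLs give an adaptive mutex, higher
* IPLs give a spin mutex.  "sc_lock" and "sc_intr_lock" are hypothetical.
*
*	mutex_init(&sc_lock, MUTEX_DEFAULT, IPL_NONE);		(adaptive)
*	mutex_init(&sc_intr_lock, MUTEX_DEFAULT, IPL_VM);	(spin)
*/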
/*
* mutex_destroy:
*
* Tear down a mutex.
*/
void
mutex_destroy(kmutex_t *mtx)
{
uintptr_t owner = mtx->mtx_owner;
if (MUTEX_ADAPTIVE_P(owner)) {
MUTEX_ASSERT(mtx, !MUTEX_OWNED(owner));
MUTEX_ASSERT(mtx, !MUTEX_HAS_WAITERS(mtx));
} else {
MUTEX_ASSERT(mtx, !MUTEX_SPINBIT_LOCKED_P(mtx));
}
LOCKDEBUG_FREE(MUTEX_DEBUG_P(mtx), mtx);
MUTEX_DESTROY(mtx);
}
#ifdef MULTIPROCESSOR
/*
* mutex_oncpu:
*
* Return true if an adaptive mutex owner is running on a CPU in the
* system. If the target is waiting on the kernel big lock, then we
* must release it. This is necessary to avoid deadlock.
*/
static bool
mutex_oncpu(uintptr_t owner)
{
struct cpu_info *ci;
lwp_t *l;
KASSERT(kpreempt_disabled());
if (!MUTEX_OWNED(owner)) {
return false;
}
/*
* See lwp_dtor() why dereference of the LWP pointer is safe.
* We must have kernel preemption disabled for that.
*/
l = (lwp_t *)MUTEX_OWNER(owner);
ci = l->l_cpu;
if (ci && ci->ci_curlwp == l) {
/* Target is running; do we need to block? */
return (atomic_load_relaxed(&ci->ci_biglock_wanted) != l);
}
/* Not running. It may be safe to block now. */
return false;
}
#endif /* MULTIPROCESSOR */
/*
* mutex_vector_enter:
*
* Support routine for mutex_enter() that must handle all cases. In
* the LOCKDEBUG case, mutex_enter() is always aliased here, even if
* fast-path stubs are available. If a mutex_spin_enter() stub is
* not available, then it is also aliased directly here.
*/
void
mutex_vector_enter(kmutex_t *mtx)
{
uintptr_t owner, curthread;
turnstile_t *ts;
#ifdef MULTIPROCESSOR
u_int count;
#endif
LOCKSTAT_COUNTER(spincnt);
LOCKSTAT_COUNTER(slpcnt);
LOCKSTAT_TIMER(spintime);
LOCKSTAT_TIMER(slptime);
LOCKSTAT_FLAG(lsflag);
/*
* Handle spin mutexes.
*/
KPREEMPT_DISABLE(curlwp);
owner = mtx->mtx_owner;
if (MUTEX_SPIN_P(owner)) {
#if defined(LOCKDEBUG) && defined(MULTIPROCESSOR)
u_int spins = 0;
#endif
KPREEMPT_ENABLE(curlwp);
MUTEX_SPIN_SPLRAISE(mtx);
MUTEX_WANTLOCK(mtx);
#ifdef FULL
if (MUTEX_SPINBIT_LOCK_TRY(mtx)) {
MUTEX_LOCKED(mtx);
return;
}
#if !defined(MULTIPROCESSOR)
MUTEX_ABORT(mtx, "locking against myself");
#else /* !MULTIPROCESSOR */
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
/*
* Spin testing the lock word and do exponential backoff
* to reduce cache line ping-ponging between CPUs.
*/
do {
while (MUTEX_SPINBIT_LOCKED_P(mtx)) {
SPINLOCK_SPIN_HOOK;
SPINLOCK_BACKOFF(count);
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins))
MUTEX_ABORT(mtx, "spinout");
#endif /* LOCKDEBUG */
}
} while (!MUTEX_SPINBIT_LOCK_TRY(mtx));
if (count != SPINLOCK_BACKOFF_MIN) {
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_EVENT(lsflag, mtx,
LB_SPIN_MUTEX | LB_SPIN, 1, spintime);
}
LOCKSTAT_EXIT(lsflag);
#endif /* !MULTIPROCESSOR */
#endif /* FULL */
MUTEX_LOCKED(mtx);
return;
}
curthread = (uintptr_t)curlwp;
MUTEX_DASSERT(mtx, MUTEX_ADAPTIVE_P(owner));
MUTEX_ASSERT(mtx, curthread != 0);
MUTEX_ASSERT(mtx, !cpu_intr_p());
MUTEX_WANTLOCK(mtx);
if (__predict_true(panicstr == NULL)) {
KDASSERT(pserialize_not_in_read_section());
LOCKDEBUG_BARRIER(&kernel_lock, 1);
}
LOCKSTAT_ENTER(lsflag);
/*
* Adaptive mutex; spin trying to acquire the mutex. If we
* determine that the owner is not running on a processor,
* then we stop spinning, and sleep instead.
*/
for (;;) {
if (!MUTEX_OWNED(owner)) {
/*
* Mutex owner clear could mean two things:
*
* * The mutex has been released.
* * The owner field hasn't been set yet.
*
* Try to acquire it again. If that fails,
* we'll just loop again.
*/
if (MUTEX_ACQUIRE(mtx, curthread))
break;
owner = mtx->mtx_owner;
continue;
}
if (__predict_false(MUTEX_OWNER(owner) == curthread)) {
MUTEX_ABORT(mtx, "locking against myself");
}
#ifdef MULTIPROCESSOR
/*
* Check to see if the owner is running on a processor.
* If so, then we should just spin, as the owner will
* likely release the lock very soon.
*/
if (mutex_oncpu(owner)) {
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
do {
KPREEMPT_ENABLE(curlwp);
SPINLOCK_BACKOFF(count);
KPREEMPT_DISABLE(curlwp);
owner = mtx->mtx_owner;
} while (mutex_oncpu(owner));
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_COUNT(spincnt, 1);
if (!MUTEX_OWNED(owner))
continue;
}
#endif
ts = turnstile_lookup(mtx);
/*
* Once we have the turnstile chain interlock, mark the
* mutex as having waiters. If that fails, spin again:
* chances are that the mutex has been released.
*/
if (!MUTEX_SET_WAITERS(mtx, owner)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
#ifdef MULTIPROCESSOR
/*
* mutex_exit() is permitted to release the mutex without
* any interlocking instructions, and the following can
* occur as a result:
*
* CPU 1: MUTEX_SET_WAITERS() CPU2: mutex_exit()
* ---------------------------- ----------------------------
* .. load mtx->mtx_owner
* .. see has-waiters bit clear
* set has-waiters bit ..
* .. store mtx->mtx_owner := 0
* return success
*
* There is another race that can occur: a third CPU could
* acquire the mutex as soon as it is released. Since
* adaptive mutexes are primarily spin mutexes, this is not
* something that we need to worry about too much. What we
* do need to ensure is that the waiters bit gets set.
*
* To allow the unlocked release, we need to make some
* assumptions here:
*
* o Release is the only non-atomic/unlocked operation
* that can be performed on the mutex. (It must still
* be atomic on the local CPU, e.g. in case interrupted
* or preempted).
*
* o At any given time on each mutex, MUTEX_SET_WAITERS()
* can only ever be in progress on one CPU in the
* system - guaranteed by the turnstile chain lock.
*
* o No other operations other than MUTEX_SET_WAITERS()
* and release can modify a mutex with a non-zero
* owner field.
*
* o If the holding LWP switches away, it posts a store
* fence before changing curlwp, ensuring that any
* overwrite of the mutex waiters flag by mutex_exit()
* completes before the modification of curlwp becomes
* visible to this CPU.
*
* o cpu_switchto() posts a store fence after setting curlwp
* and before resuming execution of an LWP.
*
* o _kernel_lock() posts a store fence before setting
* curcpu()->ci_biglock_wanted, and after clearing it.
* This ensures that any overwrite of the mutex waiters
* flag by mutex_exit() completes before the modification
* of ci_biglock_wanted becomes visible.
*
* After MUTEX_SET_WAITERS() succeeds, simultaneously
* confirming that the same LWP still holds the mutex
* since we took the turnstile lock and notifying it that
* we're waiting, we check the lock holder's status again.
* Some of the possible outcomes (not an exhaustive list;
* XXX this should be made exhaustive):
*
* 1. The on-CPU check returns true: the holding LWP is
* running again. The lock may be released soon and
* we should spin. Importantly, we can't trust the
* value of the waiters flag.
*
* 2. The on-CPU check returns false: the holding LWP is
* not running. We now have the opportunity to check
* if mutex_exit() has blatted the modifications made
* by MUTEX_SET_WAITERS().
*
* 3. The on-CPU check returns false: the holding LWP may
* or may not be running. It has context switched at
* some point during our check. Again, we have the
* chance to see if the waiters bit is still set or
* has been overwritten.
*
* 4. The on-CPU check returns false: the holding LWP is
* running on a CPU, but wants the big lock. It's OK
* to check the waiters field in this case.
*
* 5. The has-waiters check fails: the mutex has been
* released, the waiters flag cleared and another LWP
* now owns the mutex.
*
* 6. The has-waiters check fails: the mutex has been
* released.
*
* If the waiters bit is not set it's unsafe to go to sleep,
* as we might never be awoken.
*/
if (mutex_oncpu(owner)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
membar_consumer();
if (!MUTEX_HAS_WAITERS(mtx)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
#endif /* MULTIPROCESSOR */
LOCKSTAT_START_TIMER(lsflag, slptime);
turnstile_block(ts, TS_WRITER_Q, mtx, &mutex_syncobj);
LOCKSTAT_STOP_TIMER(lsflag, slptime);
LOCKSTAT_COUNT(slpcnt, 1);
owner = mtx->mtx_owner;
}
KPREEMPT_ENABLE(curlwp);
LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SLEEP1,
slpcnt, slptime);
LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SPIN,
spincnt, spintime);
LOCKSTAT_EXIT(lsflag);
MUTEX_DASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread);
MUTEX_LOCKED(mtx);
}
/*
* mutex_vector_exit:
*
* Support routine for mutex_exit() that handles all cases.
*/
void
mutex_vector_exit(kmutex_t *mtx)
{
turnstile_t *ts;
uintptr_t curthread;
if (MUTEX_SPIN_P(mtx->mtx_owner)) {
#ifdef FULL
if (__predict_false(!MUTEX_SPINBIT_LOCKED_P(mtx))) {
MUTEX_ABORT(mtx, "exiting unheld spin mutex");
}
MUTEX_UNLOCKED(mtx);
MUTEX_SPINBIT_LOCK_UNLOCK(mtx);
#endif
MUTEX_SPIN_SPLRESTORE(mtx);
return;
}
#ifndef __HAVE_MUTEX_STUBS
/*
* On some architectures without mutex stubs, we can enter here to
* release mutexes before interrupts and whatnot are up and running.
* We need this hack to keep them sweet.
*/
if (__predict_false(cold)) {
MUTEX_UNLOCKED(mtx);
MUTEX_RELEASE(mtx);
return;
}
#endif
curthread = (uintptr_t)curlwp;
MUTEX_DASSERT(mtx, curthread != 0);
MUTEX_ASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread);
MUTEX_UNLOCKED(mtx);
#if !defined(LOCKDEBUG)
__USE(curthread);
#endif
#ifdef LOCKDEBUG
/*
* Avoid having to take the turnstile chain lock every time
* around. Raise the priority level to splhigh() in order
* to disable preemption and so make the following atomic.
* This also blocks out soft interrupts that could set the
* waiters bit.
*/
{
int s = splhigh();
if (!MUTEX_HAS_WAITERS(mtx)) {
MUTEX_RELEASE(mtx);
splx(s);
return;
}
splx(s);
}
#endif
/*
* Get this lock's turnstile. This gets the interlock on
* the sleep queue. Once we have that, we can clear the
* lock. If there was no turnstile for the lock, there
* were no waiters remaining.
*/
ts = turnstile_lookup(mtx);
if (ts == NULL) {
MUTEX_RELEASE(mtx);
turnstile_exit(mtx);
} else {
MUTEX_RELEASE(mtx);
turnstile_wakeup(ts, TS_WRITER_Q,
TS_WAITERS(ts, TS_WRITER_Q), NULL);
}
}
#ifndef __HAVE_SIMPLE_MUTEXES
/*
* mutex_wakeup:
*
* Support routine for mutex_exit() that wakes up all waiters.
* We assume that the mutex has been released, but it need not
* be.
*/
void
mutex_wakeup(kmutex_t *mtx)
{
turnstile_t *ts;
ts = turnstile_lookup(mtx);
if (ts == NULL) {
turnstile_exit(mtx);
return;
}
MUTEX_CLEAR_WAITERS(mtx);
turnstile_wakeup(ts, TS_WRITER_Q, TS_WAITERS(ts, TS_WRITER_Q), NULL);
}
#endif /* !__HAVE_SIMPLE_MUTEXES */
/*
* mutex_owned:
*
* Return true if the current LWP (adaptive) or CPU (spin)
* holds the mutex.
*/
int
mutex_owned(const kmutex_t *mtx)
{
if (mtx == NULL)
return 0;
if (MUTEX_ADAPTIVE_P(mtx->mtx_owner))
return MUTEX_OWNER(mtx->mtx_owner) == (uintptr_t)curlwp;
#ifdef FULL
return MUTEX_SPINBIT_LOCKED_P(mtx);
#else
return 1;
#endif
}
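/*
* Example (sketch): mutex_owned() is mainly useful for assertions in
* code that requires a particular lock to be held, e.g.
*
*	KASSERT(mutex_owned(&sc->sc_lock));
*/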
/*
* mutex_owner:
*
* Return the current owner of an adaptive mutex. Used for
* priority inheritance.
*/
static lwp_t *
mutex_owner(wchan_t wchan)
{
volatile const kmutex_t *mtx = wchan;
MUTEX_ASSERT(mtx, MUTEX_ADAPTIVE_P(mtx->mtx_owner));
return (struct lwp *)MUTEX_OWNER(mtx->mtx_owner);
}
/*
* mutex_ownable:
*
* When compiled with DEBUG and LOCKDEBUG defined, ensure that
* the mutex is available. We cannot use !mutex_owned() since
* that won't work correctly for spin mutexes.
*/
int
mutex_ownable(const kmutex_t *mtx)
{
#ifdef LOCKDEBUG
MUTEX_TESTLOCK(mtx);
#endif
return 1;
}
/*
* mutex_tryenter:
*
* Try to acquire the mutex; return non-zero if we did.
*/
int
mutex_tryenter(kmutex_t *mtx)
{
uintptr_t curthread;
/*
* Handle spin mutexes.
*/
if (MUTEX_SPIN_P(mtx->mtx_owner)) {
MUTEX_SPIN_SPLRAISE(mtx);
#ifdef FULL
if (MUTEX_SPINBIT_LOCK_TRY(mtx)) {
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
return 1;
}
MUTEX_SPIN_SPLRESTORE(mtx);
#else
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
return 1;
#endif
} else {
curthread = (uintptr_t)curlwp;
MUTEX_ASSERT(mtx, curthread != 0);
if (MUTEX_ACQUIRE(mtx, curthread)) {
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
MUTEX_DASSERT(mtx,
MUTEX_OWNER(mtx->mtx_owner) == curthread);
return 1;
}
}
return 0;
}
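/*
* Example (sketch): callers of mutex_tryenter() typically fall back to
* another strategy instead of blocking when the lock is busy.
* "sc" is hypothetical.
*
*	if (!mutex_tryenter(&sc->sc_lock))
*		return EBUSY;
*	... critical section ...
*	mutex_exit(&sc->sc_lock);
*/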
#if defined(__HAVE_SPIN_MUTEX_STUBS) || defined(FULL)
/*
* mutex_spin_retry:
*
* Support routine for mutex_spin_enter(). Assumes that the caller
* has already raised the SPL, and adjusted counters.
*/
void
mutex_spin_retry(kmutex_t *mtx)
{
#ifdef MULTIPROCESSOR
u_int count;
LOCKSTAT_TIMER(spintime);
LOCKSTAT_FLAG(lsflag);
#ifdef LOCKDEBUG
u_int spins = 0;
#endif /* LOCKDEBUG */
MUTEX_WANTLOCK(mtx);
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
/*
* Spin testing the lock word and do exponential backoff
* to reduce cache line ping-ponging between CPUs.
*/
do {
while (MUTEX_SPINBIT_LOCKED_P(mtx)) {
SPINLOCK_BACKOFF(count);
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins))
MUTEX_ABORT(mtx, "spinout");
#endif /* LOCKDEBUG */
}
} while (!MUTEX_SPINBIT_LOCK_TRY(mtx));
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_EVENT(lsflag, mtx, LB_SPIN_MUTEX | LB_SPIN, 1, spintime);
LOCKSTAT_EXIT(lsflag);
MUTEX_LOCKED(mtx);
#else /* MULTIPROCESSOR */
MUTEX_ABORT(mtx, "locking against myself");
#endif /* MULTIPROCESSOR */
}
#endif /* defined(__HAVE_SPIN_MUTEX_STUBS) || defined(FULL) */
/* $NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_magiclinks.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/syslimits.h>
#include <sys/time.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/vnode_impl.h>
#include <sys/fstrans.h>
#include <sys/mount.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/hash.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/ktrace.h>
#include <sys/dirent.h>
#ifndef MAGICLINKS
#define MAGICLINKS 0
#endif
int vfs_magiclinks = MAGICLINKS;
__CTASSERT(MAXNAMLEN == NAME_MAX);
/*
* Substitute replacement text for 'magic' strings in symlinks.
* Returns 0 if successful, and returns non-zero if an error
* occurs. (Currently, the only possible error is running out
* of temporary pathname space.)
*
* Looks for "@<string>" and "@<string>/", where <string> is a
* recognized 'magic' string. Replaces the "@<string>" with the
* appropriate replacement text. (Note that in some cases the
* replacement text may have zero length.)
*
* This would have been table driven, but the variance in
* replacement strings (and replacement string lengths) made
* that impractical.
*/
#define VNL(x) \
(sizeof(x) - 1)
#define VO '{'
#define VC '}'
#define MATCH(str) \
((termchar == '/' && i + VNL(str) == *len) || \
(i + VNL(str) < *len && \
cp[i + VNL(str)] == termchar)) && \
!strncmp((str), &cp[i], VNL(str))
#define SUBSTITUTE(m, s, sl) \
if ((newlen + (sl)) >= MAXPATHLEN) \
return 1; \
i += VNL(m); \
if (termchar != '/') \
i++; \
(void)memcpy(&tmp[newlen], (s), (sl)); \
newlen += (sl); \
change = 1; \
termchar = '/';
static int
symlink_magic(struct proc *p, char *cp, size_t *len)
{
char *tmp;
size_t change, i, newlen, slen;
char termchar = '/';
char idtmp[11]; /* enough for 32 bit *unsigned* integer */
tmp = PNBUF_GET();
for (change = i = newlen = 0; i < *len; ) {
if (cp[i] != '@') {
tmp[newlen++] = cp[i++];
continue;
}
i++;
/* Check for @{var} syntax. */
if (cp[i] == VO) {
termchar = VC;
i++;
}
/*
* The following checks should be ordered according
* to frequency of use.
*/
if (MATCH("machine_arch")) {
slen = strlen(PROC_MACHINE_ARCH(p));
SUBSTITUTE("machine_arch", PROC_MACHINE_ARCH(p), slen); } else if (MATCH("machine")) {
slen = VNL(MACHINE);
SUBSTITUTE("machine", MACHINE, slen); } else if (MATCH("hostname")) { SUBSTITUTE("hostname", hostname, hostnamelen); } else if (MATCH("osrelease")) {
slen = strlen(osrelease);
SUBSTITUTE("osrelease", osrelease, slen); } else if (MATCH("emul")) {
slen = strlen(p->p_emul->e_name);
SUBSTITUTE("emul", p->p_emul->e_name, slen); } else if (MATCH("kernel_ident")) {
slen = strlen(kernel_ident);
SUBSTITUTE("kernel_ident", kernel_ident, slen); } else if (MATCH("domainname")) { SUBSTITUTE("domainname", domainname, domainnamelen); } else if (MATCH("ostype")) {
slen = strlen(ostype);
SUBSTITUTE("ostype", ostype, slen); } else if (MATCH("uid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_geteuid(kauth_cred_get()));
SUBSTITUTE("uid", idtmp, slen); } else if (MATCH("ruid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getuid(kauth_cred_get()));
SUBSTITUTE("ruid", idtmp, slen); } else if (MATCH("gid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getegid(kauth_cred_get()));
SUBSTITUTE("gid", idtmp, slen); } else if (MATCH("rgid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getgid(kauth_cred_get()));
SUBSTITUTE("rgid", idtmp, slen);
} else {
tmp[newlen++] = '@';
if (termchar == VC)
tmp[newlen++] = VO;
}
}
if (change) {
(void)memcpy(cp, tmp, newlen);
*len = newlen;
}
PNBUF_PUT(tmp);
return 0;
}
#undef VNL
#undef VO
#undef VC
#undef MATCH
#undef SUBSTITUTE
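/*
* Example (illustrative, assuming vfs_magiclinks is enabled): a symlink
* whose target is "/usr/pkg/@machine_arch/bin" would be rewritten during
* lookup to something like "/usr/pkg/x86_64/bin" on an x86_64 machine;
* the "@{machine_arch}" brace syntax handled above is used when the
* variable is not immediately followed by a '/'.
*/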
////////////////////////////////////////////////////////////
/*
* Determine the namei hash (for the namecache) for name.
* If *ep != NULL, hash from name to ep-1.
* If *ep == NULL, hash from name until the first NUL or '/', and
* return the location of this termination character in *ep.
*
* This function returns an equivalent hash to the MI hash32_strn().
* The latter isn't used because in the *ep == NULL case, determining
* the length of the string to the first NUL or `/' and then calling
* hash32_strn() involves unnecessary double-handling of the data.
*/
uint32_t
namei_hash(const char *name, const char **ep)
{
uint32_t hash;
hash = HASH32_STR_INIT;
if (*ep != NULL) {
for (; name < *ep; name++)
hash = hash * 33 + *(const uint8_t *)name;
} else {
for (; *name != '\0' && *name != '/'; name++)
hash = hash * 33 + *(const uint8_t *)name;
*ep = name;
}
return (hash + (hash >> 5));
}
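/*
* Example (sketch): hashing just the first component of a path, with the
* terminator location returned through ep.
*
*	const char *ep = NULL;
*	uint32_t h = namei_hash("usr/bin/ls", &ep);
*
* After the call, ep points at the first '/', and h covers only "usr".
*/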
////////////////////////////////////////////////////////////
/*
* Sealed abstraction for pathnames.
*
* System-call-layer level code that is going to call namei should
* first create a pathbuf and adjust all the bells and whistles on it
* as needed by context.
*/
struct pathbuf {
char *pb_path;
char *pb_pathcopy;
unsigned pb_pathcopyuses;
};
static struct pathbuf *
pathbuf_create_raw(void)
{
struct pathbuf *pb;
pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
pb->pb_path = PNBUF_GET();
if (pb->pb_path == NULL) {
kmem_free(pb, sizeof(*pb));
return NULL;
}
pb->pb_pathcopy = NULL;
pb->pb_pathcopyuses = 0;
return pb;
}
void
pathbuf_destroy(struct pathbuf *pb)
{
KASSERT(pb->pb_pathcopyuses == 0);
KASSERT(pb->pb_pathcopy == NULL);
PNBUF_PUT(pb->pb_path);
kmem_free(pb, sizeof(*pb));
}
struct pathbuf *
pathbuf_assimilate(char *pnbuf)
{
struct pathbuf *pb;
pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
pb->pb_path = pnbuf;
pb->pb_pathcopy = NULL;
pb->pb_pathcopyuses = 0;
return pb;
}
struct pathbuf *
pathbuf_create(const char *path)
{
struct pathbuf *pb;
int error;
pb = pathbuf_create_raw();
if (pb == NULL) {
return NULL;
}
error = copystr(path, pb->pb_path, PATH_MAX, NULL);
if (error != 0) {
KASSERT(!"kernel path too long in pathbuf_create");
/* make sure it's null-terminated, just in case */
pb->pb_path[PATH_MAX-1] = '\0';
}
return pb;
}
int
pathbuf_copyin(const char *userpath, struct pathbuf **ret)
{
struct pathbuf *pb;
int error;
pb = pathbuf_create_raw();
if (pb == NULL) {
return ENOMEM;
}
error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL);
if (error) {
pathbuf_destroy(pb);
return error;
}
*ret = pb;
return 0;
}
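/*
* Example (sketch, hypothetical syscall-layer caller): typical use of the
* pathbuf interface is to copy the path in, hand the pathbuf to namei,
* and destroy it afterwards.  "uap" is hypothetical; error handling and
* releasing nd.ni_vp are omitted.
*
*	struct pathbuf *pb;
*	struct nameidata nd;
*
*	error = pathbuf_copyin(SCARG(uap, path), &pb);
*	if (error)
*		return error;
*	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
*	error = namei(&nd);
*	...
*	pathbuf_destroy(pb);
*/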
/*
* XXX should not exist:
* 1. whether a pointer is kernel or user should be statically checkable.
* 2. copyin should be handled by the upper part of the syscall layer,
* not in here.
*/
int
pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret)
{
if (seg == UIO_USERSPACE) {
return pathbuf_copyin(path, ret);
} else {
*ret = pathbuf_create(path);
if (*ret == NULL) {
return ENOMEM;
}
return 0;
}
}
/*
* Get a copy of the path buffer as it currently exists. If this is
* called after namei starts the results may be arbitrary.
*/
void
pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen)
{
strlcpy(buf, pb->pb_path, maxlen);
}
/*
* These two functions allow access to a saved copy of the original
* path string. The first copy should be gotten before namei is
* called. Each copy that is gotten should be put back.
*/
const char *
pathbuf_stringcopy_get(struct pathbuf *pb)
{
if (pb->pb_pathcopyuses == 0) {
pb->pb_pathcopy = PNBUF_GET();
strcpy(pb->pb_pathcopy, pb->pb_path);
}
pb->pb_pathcopyuses++;
return pb->pb_pathcopy;
}
void
pathbuf_stringcopy_put(struct pathbuf *pb, const char *str)
{
KASSERT(str == pb->pb_pathcopy);
KASSERT(pb->pb_pathcopyuses > 0);
pb->pb_pathcopyuses--;
if (pb->pb_pathcopyuses == 0) {
PNBUF_PUT(pb->pb_pathcopy);
pb->pb_pathcopy = NULL;
}
}
////////////////////////////////////////////////////////////
/*
* namei: convert a pathname into a pointer to a (maybe-locked) vnode,
* and maybe also its parent directory vnode, and assorted other guff.
* See namei(9) for the interface documentation.
*
*
* The FOLLOW flag is set when symbolic links are to be followed
* when they occur at the end of the name translation process.
* Symbolic links are always followed for all other pathname
* components other than the last.
*
* The segflg defines whether the name is to be copied from user
* space or kernel space.
*
* Overall outline of namei:
*
* copy in name
* get starting directory
* while (!done && !error) {
* call lookup to search path.
* if symbolic link, massage name in buffer and continue
* }
*/
/*
* Search a pathname.
* This is a very central and rather complicated routine.
*
* The pathname is pointed to by ni_ptr and is of length ni_pathlen.
* The starting directory is passed in. The pathname is descended
* until done, or a symbolic link is encountered. The variable ni_more
* is clear if the path is completed; it is set to one if a symbolic
* link needing interpretation is encountered.
*
* The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
* whether the name is to be looked up, created, renamed, or deleted.
* When CREATE, RENAME, or DELETE is specified, information usable in
* creating, renaming, or deleting a directory entry may be calculated.
* If flag has LOCKPARENT or'ed into it, the parent directory is returned
* locked. Otherwise the parent directory is not returned. If the target
* of the pathname exists and LOCKLEAF is or'ed into the flag the target
* is returned locked, otherwise it is returned unlocked. When creating
* or renaming and LOCKPARENT is specified, the target may not be ".".
* When deleting and LOCKPARENT is specified, the target may be ".".
*
* Overall outline of lookup:
*
* dirloop:
* identify next component of name at ndp->ni_ptr
* handle degenerate case where name is null string
* if .. and crossing mount points and on mounted filesys, find parent
* call VOP_LOOKUP routine for next component name
* directory vnode returned in ni_dvp, locked.
* component vnode returned in ni_vp (if it exists), locked.
* if result vnode is mounted on and crossing mount points,
* find mounted on vnode
* if more components of name, do next level at dirloop
* return the answer in ni_vp, locked if LOCKLEAF set
* if LOCKPARENT set, return locked parent in ni_dvp
*/
/*
* Internal state for a namei operation.
*
* cnp is always equal to &ndp->ni_cnp.
*/
struct namei_state {
struct nameidata *ndp;
struct componentname *cnp;
int docache; /* == 0 do not cache last component */
int rdonly; /* lookup read-only flag bit */
int slashes;
unsigned attempt_retry:1; /* true if error allows emul retry */
unsigned root_referenced:1; /* true if ndp->ni_rootdir and
ndp->ni_erootdir were referenced */
};
/*
* Initialize the namei working state.
*/
static void
namei_init(struct namei_state *state, struct nameidata *ndp)
{
state->ndp = ndp;
state->cnp = &ndp->ni_cnd;
state->docache = 0;
state->rdonly = 0;
state->slashes = 0;
state->root_referenced = 0;
KASSERTMSG((state->cnp->cn_cred != NULL), "namei: bad cred/proc");
KASSERTMSG(((state->cnp->cn_nameiop & (~OPMASK)) == 0),
"namei: nameiop contaminated with flags: %08"PRIx32,
state->cnp->cn_nameiop);
KASSERTMSG(((state->cnp->cn_flags & OPMASK) == 0),
"name: flags contaminated with nameiops: %08"PRIx32,
state->cnp->cn_flags);
/*
* The buffer for name translation shall be the one inside the
* pathbuf.
*/
state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path;
}
/*
* Clean up the working namei state, leaving things ready for return
* from namei.
*/
static void
namei_cleanup(struct namei_state *state)
{
KASSERT(state->cnp == &state->ndp->ni_cnd);
if (state->root_referenced) {
if (state->ndp->ni_rootdir != NULL)
vrele(state->ndp->ni_rootdir);
if (state->ndp->ni_erootdir != NULL)
vrele(state->ndp->ni_erootdir);
}
}
//////////////////////////////
/*
* Get the directory context.
* Initializes the rootdir and erootdir state and returns a reference
* to the starting dir.
*/
static struct vnode *
namei_getstartdir(struct namei_state *state)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct cwdinfo *cwdi; /* pointer to cwd state */
struct lwp *self = curlwp; /* thread doing namei() */
struct vnode *rootdir, *erootdir, *curdir, *startdir;
if (state->root_referenced) {
if (state->ndp->ni_rootdir != NULL)
vrele(state->ndp->ni_rootdir);
if (state->ndp->ni_erootdir != NULL)
vrele(state->ndp->ni_erootdir);
state->root_referenced = 0;
}
cwdi = self->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
/* root dir */
if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) {
rootdir = rootvnode;
} else {
rootdir = cwdi->cwdi_rdir;
}
/* emulation root dir, if any */
if ((cnp->cn_flags & TRYEMULROOT) == 0) {
/* if we don't want it, don't fetch it */
erootdir = NULL;
} else if (cnp->cn_flags & EMULROOTSET) {
/* explicitly set emulroot; "/../" doesn't override this */
erootdir = ndp->ni_erootdir;
} else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) {
/* explicit reference to real rootdir */
erootdir = NULL;
} else {
/* may be null */
erootdir = cwdi->cwdi_edir;
}
/* current dir */
curdir = cwdi->cwdi_cdir;
if (ndp->ni_pnbuf[0] != '/') {
if (ndp->ni_atdir != NULL) {
startdir = ndp->ni_atdir;
} else {
startdir = curdir;
}
erootdir = NULL;
} else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) {
startdir = erootdir;
} else {
startdir = rootdir;
erootdir = NULL;
}
state->ndp->ni_rootdir = rootdir;
state->ndp->ni_erootdir = erootdir;
/*
* Get a reference to the start dir so we can safely unlock cwdi.
*
* Must hold references to rootdir and erootdir while we're running.
* A multithreaded process may chroot during namei.
*/
if (startdir != NULL)
vref(startdir);
if (state->ndp->ni_rootdir != NULL)
vref(state->ndp->ni_rootdir);
if (state->ndp->ni_erootdir != NULL)
vref(state->ndp->ni_erootdir);
state->root_referenced = 1;
rw_exit(&cwdi->cwdi_lock);
return startdir;
}
/*
* Get the directory context for the nfsd case, in parallel to
* getstartdir. Initializes the rootdir and erootdir state and
* returns a reference to the passed-in starting dir.
*/
static struct vnode *
namei_getstartdir_for_nfsd(struct namei_state *state)
{
KASSERT(state->ndp->ni_atdir != NULL);
/* always use the real root, and never set an emulation root */
if (rootvnode == NULL) {
return NULL;
}
state->ndp->ni_rootdir = rootvnode;
state->ndp->ni_erootdir = NULL;
vref(state->ndp->ni_atdir);
KASSERT(! state->root_referenced);
vref(state->ndp->ni_rootdir);
state->root_referenced = 1;
return state->ndp->ni_atdir;
}
/*
* Ktrace the namei operation.
*/
static void
namei_ktrace(struct namei_state *state)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct lwp *self = curlwp; /* thread doing namei() */
const char *emul_path;
if (ktrpoint(KTR_NAMEI)) {
if (ndp->ni_erootdir != NULL) {
/*
* To make any sense, the trace entry needs to have the
* text of the emulation path prepended.
* Usually we can get this from the current process,
* but when called from emul_find_interp() it is only
* in the exec_package - so we get it passed in ni_next
* (this is a hack).
*/
if (cnp->cn_flags & EMULROOTSET)
emul_path = ndp->ni_next;
else
emul_path = self->l_proc->p_emul->e_path;
ktrnamei2(emul_path, strlen(emul_path), ndp->ni_pnbuf, ndp->ni_pathlen);
} else
ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen);
}
}
/*
* Start up namei. Find the root dir and cwd, establish the starting
* directory for lookup, and lock it. Also calls ktrace when
* appropriate.
*/
static int
namei_start(struct namei_state *state, int isnfsd,
struct vnode **startdir_ret)
{
struct nameidata *ndp = state->ndp;
struct vnode *startdir;
/* length includes null terminator (was originally from copyinstr) */
ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1;
/*
* POSIX.1 requirement: "" is not a valid file name.
*/
if (ndp->ni_pathlen == 1) {
ndp->ni_erootdir = NULL;
return ENOENT;
}
ndp->ni_loopcnt = 0;
/* Get starting directory, set up root, and ktrace. */
if (isnfsd) {
startdir = namei_getstartdir_for_nfsd(state);
/* no ktrace */
} else {
startdir = namei_getstartdir(state);
namei_ktrace(state);
}
if (startdir == NULL) {
return ENOENT;
}
/* NDAT may feed us a non-directory start vnode via namei_getstartdir */
if (startdir->v_type != VDIR) {
vrele(startdir);
return ENOTDIR;
}
*startdir_ret = startdir;
return 0;
}
/*
* Check for being at a symlink that we're going to follow.
*/
static inline int
namei_atsymlink(struct namei_state *state, struct vnode *foundobj)
{
return (foundobj->v_type == VLNK) &&
(state->cnp->cn_flags & (FOLLOW|REQUIREDIR));
}
/*
* Follow a symlink.
*
* Updates searchdir. inhibitmagic causes magic symlinks to not be
* interpreted; this is used by nfsd.
*
* Unlocks foundobj on success (ugh)
*/
static inline int
namei_follow(struct namei_state *state, int inhibitmagic,
struct vnode *searchdir, struct vnode *foundobj,
struct vnode **newsearchdir_ret)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct lwp *self = curlwp; /* thread doing namei() */
struct iovec aiov; /* uio for reading symbolic links */
struct uio auio;
char *cp; /* pointer into pathname argument */
size_t linklen;
int error;
if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
return ELOOP;
}
vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) {
error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred);
if (error != 0) {
VOP_UNLOCK(foundobj);
return error;
}
}
/* FUTURE: fix this to not use a second buffer */
cp = PNBUF_GET();
aiov.iov_base = cp;
aiov.iov_len = MAXPATHLEN;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_resid = MAXPATHLEN;
UIO_SETUP_SYSSPACE(&auio);
error = VOP_READLINK(foundobj, &auio, cnp->cn_cred);
VOP_UNLOCK(foundobj);
if (error) {
PNBUF_PUT(cp);
return error;
}
linklen = MAXPATHLEN - auio.uio_resid;
if (linklen == 0) {
PNBUF_PUT(cp);
return ENOENT;
}
/*
* Do symlink substitution, if appropriate, and
* check length for potential overflow.
*
* Inhibit symlink substitution for nfsd.
* XXX: This is how it was before; is that a bug or a feature?
*/
if ((!inhibitmagic && vfs_magiclinks && symlink_magic(self->l_proc, cp, &linklen)) ||
(linklen + ndp->ni_pathlen >= MAXPATHLEN)) {
PNBUF_PUT(cp);
return ENAMETOOLONG;
}
if (ndp->ni_pathlen > 1) {
/* includes a null-terminator */
memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
} else {
cp[linklen] = '\0';
}
ndp->ni_pathlen += linklen;
memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen);
PNBUF_PUT(cp);
/* we're now starting from the beginning of the buffer again */
cnp->cn_nameptr = ndp->ni_pnbuf;
/*
* Check if root directory should replace current directory.
*/
if (ndp->ni_pnbuf[0] == '/') {
vrele(searchdir);
/* Keep absolute symbolic links inside emulation root */
searchdir = ndp->ni_erootdir;
if (searchdir == NULL ||
(ndp->ni_pnbuf[1] == '.'
&& ndp->ni_pnbuf[2] == '.' && ndp->ni_pnbuf[3] == '/')) {
ndp->ni_erootdir = NULL;
searchdir = ndp->ni_rootdir;
}
vref(searchdir);
while (cnp->cn_nameptr[0] == '/') {
cnp->cn_nameptr++;
ndp->ni_pathlen--;
}
}
*newsearchdir_ret = searchdir;
return 0;
}
//////////////////////////////
/*
* Inspect the leading path component and update the state accordingly.
*/
static int
lookup_parsepath(struct namei_state *state, struct vnode *searchdir)
{
const char *cp; /* pointer into pathname argument */
int error;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
KASSERT(cnp == &ndp->ni_cnd);
/*
* Search a new directory.
*
* The last component of the filename is left accessible via
* cnp->cn_nameptr for callers that need the name. Callers needing
* the name set the SAVENAME flag. When done, they assume
* responsibility for freeing the pathname buffer.
*
* At this point, our only vnode state is that the search dir
* is held.
*/
error = VOP_PARSEPATH(searchdir, cnp->cn_nameptr, &cnp->cn_namelen);
if (error) {
return error;
}
cp = cnp->cn_nameptr + cnp->cn_namelen;
if (cnp->cn_namelen > KERNEL_NAME_MAX) {
return ENAMETOOLONG;
}
#ifdef NAMEI_DIAGNOSTIC
{ char c = *cp;
*(char *)cp = '\0';
printf("{%s}: ", cnp->cn_nameptr);
*(char *)cp = c; }
#endif /* NAMEI_DIAGNOSTIC */
ndp->ni_pathlen -= cnp->cn_namelen;
ndp->ni_next = cp;
/*
* If this component is followed by a slash, then move the pointer to
* the next component forward, and remember that this component must be
* a directory.
*/
if (*cp == '/') {
do {
cp++;
} while (*cp == '/');
state->slashes = cp - ndp->ni_next;
ndp->ni_pathlen -= state->slashes;
ndp->ni_next = cp;
cnp->cn_flags |= REQUIREDIR;
} else {
state->slashes = 0;
cnp->cn_flags &= ~REQUIREDIR;
}
/*
* We do special processing on the last component, whether or not it's
* a directory. Cache all intervening lookups, but not the final one.
*/
if (*cp == '\0') {
if (state->docache)
cnp->cn_flags |= MAKEENTRY;
else
cnp->cn_flags &= ~MAKEENTRY;
cnp->cn_flags |= ISLASTCN;
} else {
cnp->cn_flags |= MAKEENTRY;
cnp->cn_flags &= ~ISLASTCN;
}
if (cnp->cn_namelen == 2 &&
cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
cnp->cn_flags |= ISDOTDOT;
else
cnp->cn_flags &= ~ISDOTDOT;
return 0;
}
/*
* Take care of crossing a mounted-on vnode. On error, foundobj_ret will be
* vrele'd, but searchdir is left alone.
*/
static int
lookup_crossmount(struct namei_state *state,
struct vnode **searchdir_ret,
struct vnode **foundobj_ret,
bool *searchdir_locked)
{
struct componentname *cnp = state->cnp;
struct vnode *foundobj, *vp;
struct vnode *searchdir;
struct mount *mp;
int error, lktype;
searchdir = *searchdir_ret;
foundobj = *foundobj_ret;
error = 0;
KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0);
/* First, unlock searchdir (oof). */
if (*searchdir_locked) {
KASSERT(searchdir != NULL);
lktype = VOP_ISLOCKED(searchdir);
VOP_UNLOCK(searchdir);
*searchdir_locked = false;
} else {
lktype = LK_NONE;
}
/*
* Do an unlocked check to see if the vnode has been mounted on; if
* so find the root of the mounted file system.
*/
while (foundobj->v_type == VDIR && (mp = foundobj->v_mountedhere) != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
/*
* Try the namecache first. If that doesn't work, do
* it the hard way.
*/
if (cache_lookup_mount(foundobj, &vp)) {
vrele(foundobj);
foundobj = vp;
} else {
/* First get the vnodes mount stable. */
while ((mp = foundobj->v_mountedhere) != NULL) {
fstrans_start(mp);
if (fstrans_held(mp) &&
mp == foundobj->v_mountedhere) {
break;
}
fstrans_done(mp);
}
if (mp == NULL) {
break;
}
/*
* Now get a reference on the root vnode.
* XXX Future - maybe allow only VDIR here.
*/
error = VFS_ROOT(mp, LK_NONE, &vp);
/*
* If successful, enter it into the cache while
* holding the mount busy (competing with unmount).
*/
if (error == 0) {
cache_enter_mount(foundobj, vp);
}
/* Finally, drop references to foundobj & mountpoint. */
vrele(foundobj);
fstrans_done(mp);
if (error) {
foundobj = NULL;
break;
}
foundobj = vp;
}
/*
* Avoid locking vnodes from two filesystems because
* it's prone to deadlock, e.g. when using puffs.
* Also, it isn't a good idea to propagate slowness of
* a filesystem up to the root directory. For now,
* only handle the common case, where foundobj is
* VDIR.
*
* In this case set searchdir to null to avoid using
* it again. It is not correct to set searchdir ==
* foundobj here as that will confuse the caller.
* (See PR 40740.)
*/
if (searchdir == NULL) {
/* already been here once; do nothing further */
} else if (foundobj->v_type == VDIR) {
vrele(searchdir);
*searchdir_ret = searchdir = NULL;
lktype = LK_NONE;
}
}
/* If searchdir is still around, re-lock it. */
if (error == 0 && lktype != LK_NONE) {
vn_lock(searchdir, lktype | LK_RETRY);
*searchdir_locked = true;
}
*foundobj_ret = foundobj;
return error;
}
/*
* Determine the desired locking mode for the directory of a lookup.
*/
static int
lookup_lktype(struct vnode *searchdir, struct componentname *cnp)
{
/*
* If the file system supports VOP_LOOKUP() with a shared lock, and
* we are not making any modifications (nameiop LOOKUP) or this is
* not the last component then get a shared lock. Where we can't do
* fast-forwarded lookups (for example with layered file systems)
* then this is the fallback for reducing lock contention.
*/
if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 &&
(cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) {
return LK_SHARED;
} else {
return LK_EXCLUSIVE;
}
}
/*
* Call VOP_LOOKUP for a single lookup; return a new search directory
* (used when crossing mountpoints up or searching union mounts down) and
* the found object, which for create operations may be NULL on success.
*
* Note that the new search directory may be null, which means the
* searchdir was unlocked and released. This happens in the common case
* when crossing a mount point downwards, in order to avoid coupling
* locks between different file system volumes. Importantly, this can
* happen even if the call fails. (XXX: this is gross and should be
* tidied somehow.)
*/
static int
lookup_once(struct namei_state *state,
struct vnode *searchdir,
struct vnode **newsearchdir_ret,
struct vnode **foundobj_ret,
bool *newsearchdir_locked_ret)
{
struct vnode *tmpvn; /* scratch vnode */
struct vnode *foundobj; /* result */
struct lwp *l = curlwp;
bool searchdir_locked = false;
int error, lktype;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
KASSERT(cnp == &ndp->ni_cnd);
*newsearchdir_ret = searchdir;
/*
* Handle "..": two special cases.
* 1. If at root directory (e.g. after chroot)
* or at absolute root directory
* then ignore it so can't get out.
* 1a. If at the root of the emulation filesystem go to the real
* root. So "/../<path>" is always absolute.
* 1b. If we have somehow gotten out of a jail, warn
* and also ignore it so we can't get farther out.
* 2. If this vnode is the root of a mounted
* filesystem, then replace it with the
* vnode which was mounted on so we take the
* .. in the other file system.
*/
if (cnp->cn_flags & ISDOTDOT) {
struct proc *p = l->l_proc;
for (;;) {
if (searchdir == ndp->ni_rootdir ||
searchdir == rootvnode) {
foundobj = searchdir;
vref(foundobj);
*foundobj_ret = foundobj;
if (cnp->cn_flags & LOCKPARENT) {
lktype = lookup_lktype(searchdir, cnp);
vn_lock(searchdir, lktype | LK_RETRY);
searchdir_locked = true;
}
error = 0;
goto done;
}
if (ndp->ni_rootdir != rootvnode) {
int retval;
retval = vn_isunder(searchdir, ndp->ni_rootdir, l);
if (!retval) {
/* Oops! We got out of jail! */
log(LOG_WARNING,
"chrooted pid %d uid %d (%s) "
"detected outside of its chroot\n",
p->p_pid, kauth_cred_geteuid(l->l_cred),
p->p_comm);
/* Put us at the jail root. */
vrele(searchdir);
searchdir = NULL;
foundobj = ndp->ni_rootdir;
vref(foundobj);
vref(foundobj);
*newsearchdir_ret = foundobj;
*foundobj_ret = foundobj;
error = 0;
goto done;
}
}
if ((searchdir->v_vflag & VV_ROOT) == 0 ||
(cnp->cn_flags & NOCROSSMOUNT))
break;
tmpvn = searchdir;
searchdir = searchdir->v_mount->mnt_vnodecovered;
vref(searchdir);
vrele(tmpvn);
*newsearchdir_ret = searchdir;
}
}
lktype = lookup_lktype(searchdir, cnp);
/*
* We now have a segment name to search for, and a directory to search.
* Our vnode state here is that "searchdir" is held.
*/
unionlookup:
foundobj = NULL;
if (!searchdir_locked) {
vn_lock(searchdir, lktype | LK_RETRY);
searchdir_locked = true;
}
error = VOP_LOOKUP(searchdir, &foundobj, cnp);
if (error != 0) {
KASSERTMSG((foundobj == NULL),
"leaf `%s' should be empty but is %p",
cnp->cn_nameptr, foundobj);
#ifdef NAMEI_DIAGNOSTIC
printf("not found\n");
#endif /* NAMEI_DIAGNOSTIC */
/*
* If ENOLCK, the file system needs us to retry the lookup
* with an exclusive lock. It's likely nothing was found in
* cache and/or modifications need to be made.
*/
if (error == ENOLCK) {
KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED);
KASSERT(searchdir_locked);
if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) {
VOP_UNLOCK(searchdir);
searchdir_locked = false;
}
lktype = LK_EXCLUSIVE;
goto unionlookup;
}
if ((error == ENOENT) && (searchdir->v_vflag & VV_ROOT) &&
(searchdir->v_mount->mnt_flag & MNT_UNION)) {
tmpvn = searchdir;
searchdir = searchdir->v_mount->mnt_vnodecovered;
vref(searchdir);
vput(tmpvn);
searchdir_locked = false;
*newsearchdir_ret = searchdir;
goto unionlookup;
}
if (error != EJUSTRETURN)
goto done;
/*
* If this was not the last component, or there were trailing
* slashes, and we are not going to create a directory,
* then the name must exist.
*/
if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) {
error = ENOENT;
goto done;
}
/*
* If creating and at end of pathname, then can consider
* allowing file to be created.
*/
if (state->rdonly) {
error = EROFS;
goto done;
}
/*
* We return success and a NULL foundobj to indicate
* that the entry doesn't currently exist, leaving a
* pointer to the (normally, locked) directory vnode
* as searchdir.
*/
*foundobj_ret = NULL;
error = 0;
goto done;
}
#ifdef NAMEI_DIAGNOSTIC
printf("found\n");
#endif /* NAMEI_DIAGNOSTIC */
/* Unlock, unless the caller needs the parent locked. */
if (searchdir != NULL) {
KASSERT(searchdir_locked);
if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) !=
(ISLASTCN | LOCKPARENT)) {
VOP_UNLOCK(searchdir);
searchdir_locked = false;
}
} else {
KASSERT(!searchdir_locked);
}
*foundobj_ret = foundobj;
error = 0;
done:
*newsearchdir_locked_ret = searchdir_locked;
return error;
}
/*
* Parse out the first path name component that we need to consider.
*
* While doing this, attempt to use the name cache to fast-forward through
* as many "easy" to find components of the path as possible.
*
* We use the namecache's node locks to form a chain, and avoid as many
* vnode references and locks as possible. In the ideal case, only the
* final vnode will have its reference count adjusted and lock taken.
*/
static int
lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret,
struct vnode **foundobj_ret)
{
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
krwlock_t *plock;
struct vnode *foundobj, *searchdir;
int error, error2;
size_t oldpathlen;
const char *oldnameptr;
bool terminal;
/*
* Eat as many path name components as possible before giving up and
* letting lookup_once() handle it. Remember the starting point in
* case we can't get vnode references and need to roll back.
*/
plock = NULL;
searchdir = *searchdir_ret;
oldnameptr = cnp->cn_nameptr;
oldpathlen = ndp->ni_pathlen;
terminal = false;
for (;;) {
foundobj = NULL;
/*
* Get the next component name. There should be no slashes
* here, and we shouldn't have looped around if we were
* done.
*/
KASSERT(cnp->cn_nameptr[0] != '/');
KASSERT(cnp->cn_nameptr[0] != '\0');
if ((error = lookup_parsepath(state, searchdir)) != 0) {
break;
}
/*
* Can't deal with DOTDOT lookups if NOCROSSMOUNT or the
* lookup is chrooted.
*/
if ((cnp->cn_flags & ISDOTDOT) != 0) {
if ((searchdir->v_vflag & VV_ROOT) != 0 &&
(cnp->cn_flags & NOCROSSMOUNT)) {
error = EOPNOTSUPP;
break;
}
if (ndp->ni_rootdir != rootvnode) {
error = EOPNOTSUPP;
break;
}
}
/*
* Can't deal with last component when modifying; this needs
* searchdir locked and VOP_LOOKUP() called (which can and
* does modify state, despite the name). NB: this case means
* terminal is never set true when LOCKPARENT.
*/
if ((cnp->cn_flags & ISLASTCN) != 0) {
if (cnp->cn_nameiop != LOOKUP ||
(cnp->cn_flags & LOCKPARENT) != 0) {
error = EOPNOTSUPP;
break;
}
}
/*
* Good, now look for it in cache. cache_lookup_linked()
* will fail if there's nothing there, or if there's no
* ownership info for the directory, or if the user doesn't
* have permission to look up files in this directory.
*/
if (!cache_lookup_linked(searchdir, cnp->cn_nameptr,
cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) {
error = EOPNOTSUPP;
break;
}
KASSERT(plock != NULL);
KASSERT(rw_lock_held(plock));
/*
* Scored a hit. Negative is good too (ENOENT). If there's
* a '-o union' mount here, punt and let lookup_once() deal
* with it.
*/
if (foundobj == NULL) {
if ((searchdir->v_vflag & VV_ROOT) != 0 &&
(searchdir->v_mount->mnt_flag & MNT_UNION) != 0) {
error = EOPNOTSUPP;
} else {
error = ENOENT;
terminal = ((cnp->cn_flags & ISLASTCN) != 0);
}
break;
}
/*
* Stop and get a hold on the vnode if we've encountered
* something other than a directory.
*/
if (foundobj->v_type != VDIR) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
} else {
terminal = (foundobj->v_type != VLNK &&
(cnp->cn_flags & ISLASTCN) != 0);
}
break;
}
/*
* Try to cross mountpoints, bearing in mind that they can
* be stacked. If at any point we can't go further, stop
* and try to get a reference on the vnode. If we are able
* to get a ref then lookup_crossmount() will take care of
* it, otherwise we'll fall through to lookup_once().
*/
if (foundobj->v_mountedhere != NULL) {
while (foundobj->v_mountedhere != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0 &&
cache_cross_mount(&foundobj, &plock)) {
KASSERT(foundobj != NULL);
KASSERT(foundobj->v_type == VDIR);
}
if (foundobj->v_mountedhere != NULL) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
}
break;
} else {
searchdir = NULL;
}
}
/*
* Time to stop if we found the last component & traversed
* all mounts.
*/
if ((cnp->cn_flags & ISLASTCN) != 0) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
} else {
terminal = (foundobj->v_type != VLNK);
}
break;
}
/*
* Otherwise, we're still in business. Set the found VDIR
* vnode as the search dir for the next component and
* continue on to it.
*/
cnp->cn_nameptr = ndp->ni_next;
searchdir = foundobj;
}
if (terminal) {
/*
* If we exited the loop above having successfully located
* the last component with a zero error code, and it's not a
* symbolic link, then the parent directory is not needed.
* Release reference to the starting parent and make the
* terminal parent disappear into thin air.
*/
KASSERT(plock != NULL);
rw_exit(plock);
vrele(*searchdir_ret);
*searchdir_ret = NULL;
} else if (searchdir != *searchdir_ret) {
/*
* Otherwise we need to return the parent. If we ended up
* with a new search dir, ref it before dropping the
* namecache's lock. The lock prevents both searchdir and
* foundobj from disappearing. If we can't ref the new
* searchdir, we have a bit of a problem. Roll back the
* fastforward to the beginning and let lookup_once() take
* care of it.
*/
if (searchdir == NULL) {
/*
* It's possible for searchdir to be NULL in the
* case of a root vnode being reclaimed while
* trying to cross a mount.
*/
error2 = EOPNOTSUPP;
} else {
error2 = vcache_tryvget(searchdir);
}
KASSERT(plock != NULL);
rw_exit(plock);
if (__predict_true(error2 == 0)) {
/* Returning new searchdir, and maybe new foundobj. */
vrele(*searchdir_ret);
*searchdir_ret = searchdir;
} else {
/* Returning nothing. */
if (foundobj != NULL) {
vrele(foundobj);
foundobj = NULL;
}
cnp->cn_nameptr = oldnameptr;
ndp->ni_pathlen = oldpathlen;
error = lookup_parsepath(state, *searchdir_ret);
if (error == 0) {
error = EOPNOTSUPP;
}
}
} else if (plock != NULL) {
/* Drop any namecache lock still held. */
rw_exit(plock);
}
KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL);
*foundobj_ret = foundobj;
return error;
}
//////////////////////////////
/*
* Do a complete path search from a single root directory.
* (This is called up to twice if TRYEMULROOT is in effect.)
*/
static int
namei_oneroot(struct namei_state *state,
int neverfollow, int inhibitmagic, int isnfsd)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct vnode *searchdir, *foundobj;
bool searchdir_locked = false;
int error;
error = namei_start(state, isnfsd, &searchdir);
if (error) {
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
return error;
}
KASSERT(searchdir->v_type == VDIR);
/*
* Setup: break out flag bits into variables.
*/
state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
if (cnp->cn_nameiop == DELETE)
state->docache = 0;
state->rdonly = cnp->cn_flags & RDONLY;
/*
* Keep going until we run out of path components.
*/
cnp->cn_nameptr = ndp->ni_pnbuf;
/* drop leading slashes (already used them to choose startdir) */
while (cnp->cn_nameptr[0] == '/') {
cnp->cn_nameptr++;
ndp->ni_pathlen--;
}
/* was it just "/"? */
if (cnp->cn_nameptr[0] == '\0') {
foundobj = searchdir;
searchdir = NULL;
cnp->cn_flags |= ISLASTCN;
/* bleh */
goto skiploop;
}
for (;;) {
KASSERT(searchdir != NULL);
KASSERT(!searchdir_locked);
/*
* Parse out the first path name component that we need
* to consider. While doing this, attempt to use the name
* cache to fast-forward through as many "easy" to find
* components of the path as possible.
*/
error = lookup_fastforward(state, &searchdir, &foundobj);
/*
* If we didn't get a good answer from the namecache, then
* go directly to the file system.
*/
if (error == EOPNOTSUPP) {
error = lookup_once(state, searchdir, &searchdir,
&foundobj, &searchdir_locked);
}
/*
* If the vnode we found is mounted on, then cross the mount
* and get the root vnode in foundobj. If this encounters
* an error, it will dispose of foundobj, but searchdir is
* untouched.
*/
if (error == 0 && foundobj != NULL &&
foundobj->v_type == VDIR &&
foundobj->v_mountedhere != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
error = lookup_crossmount(state, &searchdir,
&foundobj, &searchdir_locked);
}
if (error) {
if (searchdir != NULL) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
}
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
/*
* Note that if we're doing TRYEMULROOT we can
* retry with the normal root. Where this is
* currently set matches previous practice,
* but the previous practice didn't make much
* sense and somebody should sit down and
* figure out which cases should cause retry
* and which shouldn't. XXX.
*/
state->attempt_retry = 1;
return (error);
}
if (foundobj == NULL) {
/*
* Success with no object returned means we're
* creating something and it isn't already
* there. Break out of the main loop now so
* the code below doesn't have to test for
* foundobj == NULL.
*/
/* lookup_once can't have dropped the searchdir */
KASSERT(searchdir != NULL ||
(cnp->cn_flags & ISLASTCN) != 0);
break;
}
/*
* Check for symbolic link. If we've reached one,
* follow it, unless we aren't supposed to. Back up
* over any slashes that we skipped, as we will need
* them again.
*/
if (namei_atsymlink(state, foundobj)) {
/* Don't need searchdir locked any more. */
if (searchdir_locked) {
searchdir_locked = false;
VOP_UNLOCK(searchdir);
}
ndp->ni_pathlen += state->slashes;
ndp->ni_next -= state->slashes;
if (neverfollow) {
error = EINVAL;
} else if (searchdir == NULL) {
/*
* dholland 20160410: lookup_once only
* drops searchdir if it crossed a
* mount point. Therefore, if we get
* here it means we crossed a mount
* point to a mounted filesystem whose
* root vnode is a symlink. In theory
* we could continue at this point by
* using the pre-crossing searchdir
* (e.g. just take out an extra
* reference on it before calling
* lookup_once so we still have it),
* but this will make an ugly mess and
* it should never happen in practice
* as only badly broken filesystems
* have non-directory root vnodes. (I
* have seen this sort of thing with
* NFS occasionally but even then it
* means something's badly wrong.)
*/
error = ENOTDIR;
} else {
/*
* dholland 20110410: if we're at a
* union mount it might make sense to
* use the top of the union stack here
* rather than the layer we found the
* symlink in. (FUTURE)
*/
error = namei_follow(state, inhibitmagic,
searchdir, foundobj,
&searchdir);
}
if (error) {
KASSERT(searchdir != foundobj);
if (searchdir != NULL) {
vrele(searchdir);
}
vrele(foundobj);
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
return error;
}
vrele(foundobj);
foundobj = NULL;
/*
* If we followed a symlink to `/' and there
* are no more components after the symlink,
* we're done with the loop and what we found
* is the searchdir.
*/
if (cnp->cn_nameptr[0] == '\0') {
KASSERT(searchdir != NULL);
foundobj = searchdir;
searchdir = NULL;
cnp->cn_flags |= ISLASTCN;
break;
}
continue;
}
/*
* Not a symbolic link.
*
* Check for directory, if the component was
* followed by a series of slashes.
*/
if ((foundobj->v_type != VDIR) &&
(cnp->cn_flags & REQUIREDIR)) {
KASSERT(foundobj != searchdir);
if (searchdir) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
} else {
KASSERT(!searchdir_locked);
}
vrele(foundobj);
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
return ENOTDIR;
}
/*
* Stop if we've reached the last component.
*/
if (cnp->cn_flags & ISLASTCN) {
break;
}
/*
* Continue with the next component.
*/
cnp->cn_nameptr = ndp->ni_next;
if (searchdir != NULL) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
}
searchdir = foundobj;
foundobj = NULL;
}
KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL ||
VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
skiploop:
if (foundobj != NULL) {
if (foundobj == ndp->ni_erootdir) {
/*
* We are about to return the emulation root.
* This isn't a good idea because code might
* repeatedly lookup ".." until the file
* matches that returned for "/" and loop
* forever. So convert it to the real root.
*/
if (searchdir != NULL) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = ndp->ni_rootdir;
vref(foundobj);
}
/*
* If the caller requested the parent node (i.e. it's
* a CREATE, DELETE, or RENAME), and we don't have one
* (because this is the root directory, or we crossed
* a mount point), then we must fail.
*
* 20210604 dholland when NONEXCLHACK is set (open
* with O_CREAT but not O_EXCL) skip this logic. Since
* we have a foundobj, open will not be creating, so
* it doesn't actually need or use the searchdir, so
* it's ok to return it even if it's on a different
* volume, and it's also ok to return NULL; by setting
* NONEXCLHACK the open code promises to cope with
* those cases correctly. (That is, it should do what
* it would do anyway, that is, just release the
* searchdir, except not crash if it's null.) This is
* needed because otherwise opening mountpoints with
* O_CREAT but not O_EXCL fails... which is a silly
* thing to do but ought to work. (This whole issue
* came to light because 3rd party code wanted to open
* certain procfs nodes with O_CREAT for some 3rd
* party reason, and it failed.)
*
* Note that NONEXCLHACK is properly a different
* nameiop (it is partway between LOOKUP and CREATE)
* but it was stuffed in as a flag instead to make the
* resulting patch less invasive for pullup. Blah.
*/
if (cnp->cn_nameiop != LOOKUP &&
(searchdir == NULL ||
searchdir->v_mount != foundobj->v_mount) &&
(cnp->cn_flags & NONEXCLHACK) == 0) {
if (searchdir) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = NULL;
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
switch (cnp->cn_nameiop) {
case CREATE:
return EEXIST;
case DELETE:
case RENAME:
return EBUSY;
default:
break;
}
panic("Invalid nameiop\n");
}
/*
* Disallow directory write attempts on read-only lookups.
* Prefers EEXIST over EROFS for the CREATE case.
*/
if (state->rdonly &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
if (searchdir) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = NULL;
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
return EROFS;
}
/* Lock the leaf node if requested. */
if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT &&
searchdir == foundobj) {
/*
* Note: if LOCKPARENT but not LOCKLEAF is
* set, and searchdir == foundobj, this code
* necessarily unlocks the parent as well as
* the leaf. That is, just because you specify
* LOCKPARENT doesn't mean you necessarily get
* a locked parent vnode. The code in
* vfs_syscalls.c, and possibly elsewhere,
* that uses this combination "knows" this, so
* it can't be safely changed. Feh. XXX
*/
KASSERT(searchdir_locked);
VOP_UNLOCK(searchdir);
searchdir_locked = false;
} else if ((cnp->cn_flags & LOCKLEAF) != 0 && (searchdir != foundobj ||
(cnp->cn_flags & LOCKPARENT) == 0)) {
const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ?
LK_SHARED : LK_EXCLUSIVE;
vn_lock(foundobj, lktype | LK_RETRY);
}
}
/*
* Done.
*/
/*
* If LOCKPARENT is not set, the parent directory isn't returned.
*/
if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) {
vrele(searchdir);
searchdir = NULL;
}
ndp->ni_dvp = searchdir;
ndp->ni_vp = foundobj;
return 0;
}
/*
* Do namei; wrapper layer that handles TRYEMULROOT.
*/
static int
namei_tryemulroot(struct namei_state *state,
int neverfollow, int inhibitmagic, int isnfsd)
{
int error;
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
const char *savepath = NULL;
KASSERT(cnp == &ndp->ni_cnd);
if (cnp->cn_flags & TRYEMULROOT) {
savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf);
}
emul_retry:
state->attempt_retry = 0;
error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd);
if (error) {
/*
* Once namei has started up, the existence of ni_erootdir
* tells us whether we're working from an emulation root.
* The TRYEMULROOT flag isn't necessarily authoritative.
*/
if (ndp->ni_erootdir != NULL && state->attempt_retry) {
/* Retry the whole thing using the normal root */
cnp->cn_flags &= ~TRYEMULROOT;
state->attempt_retry = 0;
/* kinda gross */
strcpy(ndp->ni_pathbuf->pb_path, savepath);
pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
savepath = NULL;
goto emul_retry;
}
}
if (savepath != NULL) {
pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
}
return error;
}
/*
* External interface.
*/
int
namei(struct nameidata *ndp)
{
struct namei_state state;
int error;
namei_init(&state, ndp);
error = namei_tryemulroot(&state,
0/*!neverfollow*/, 0/*!inhibitmagic*/,
0/*isnfsd*/);
namei_cleanup(&state);
if (error) {
/* make sure no stray refs leak out */
KASSERT(ndp->ni_dvp == NULL);
KASSERT(ndp->ni_vp == NULL);
}
return error;
}
////////////////////////////////////////////////////////////
/*
* External interface used by nfsd. This is basically different from
* namei only in that it has the ability to pass in the "current
* directory", and uses an extra flag "neverfollow" for which there's
* no physical flag defined in namei.h. (There used to be a cut&paste
* copy of about half of namei in nfsd to allow these minor
* adjustments to exist.)
*
* XXX: the namei interface should be adjusted so nfsd can just use
* ordinary namei().
*/
int
lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow)
{
struct namei_state state;
int error;
KASSERT(ndp->ni_atdir == NULL);
ndp->ni_atdir = forcecwd;
namei_init(&state, ndp);
error = namei_tryemulroot(&state,
neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/);
namei_cleanup(&state);
if (error) {
/* make sure no stray refs leak out */
KASSERT(ndp->ni_dvp == NULL);
KASSERT(ndp->ni_vp == NULL);
}
return error;
}
/*
* A second external interface used by nfsd. This turns out to be a
* single lookup used by the WebNFS code (ha!) to get "index.html" or
* equivalent when asked for a directory. It should eventually evolve
* into some kind of namei_once() call; for the time being it's kind
* of a mess. XXX.
*
* dholland 20110109: I don't think it works, and I don't think it
* worked before I started hacking and slashing either, and I doubt
* anyone will ever notice.
*/
/*
* Internals. This calls lookup_once() after setting up the assorted
* pieces of state the way they ought to be.
*/
static int
do_lookup_for_nfsd_index(struct namei_state *state)
{
int error;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
struct vnode *startdir;
struct vnode *foundobj;
bool startdir_locked;
const char *cp; /* pointer into pathname argument */
KASSERT(cnp == &ndp->ni_cnd);
startdir = state->ndp->ni_atdir;
cnp->cn_nameptr = ndp->ni_pnbuf;
state->docache = 1;
state->rdonly = cnp->cn_flags & RDONLY;
ndp->ni_dvp = NULL;
error = VOP_PARSEPATH(startdir, cnp->cn_nameptr, &cnp->cn_namelen);
if (error) {
return error;
}
cp = cnp->cn_nameptr + cnp->cn_namelen;
KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX);
ndp->ni_pathlen -= cnp->cn_namelen;
ndp->ni_next = cp;
state->slashes = 0;
cnp->cn_flags &= ~REQUIREDIR;
cnp->cn_flags |= MAKEENTRY|ISLASTCN;
if (cnp->cn_namelen == 2 &&
cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
cnp->cn_flags |= ISDOTDOT;
else
cnp->cn_flags &= ~ISDOTDOT;
/*
* Because lookup_once can change the startdir, we need our
* own reference to it to avoid consuming the caller's.
*/
vref(startdir);
error = lookup_once(state, startdir, &startdir, &foundobj,
&startdir_locked);
KASSERT((cnp->cn_flags & LOCKPARENT) == 0);
if (startdir_locked) {
VOP_UNLOCK(startdir);
startdir_locked = false;
}
/*
* If the vnode we found is mounted on, then cross the mount and get
* the root vnode in foundobj. If this encounters an error, it will
* dispose of foundobj, but searchdir is untouched.
*/
if (error == 0 && foundobj != NULL &&
foundobj->v_type == VDIR &&
foundobj->v_mountedhere != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
error = lookup_crossmount(state, &startdir, &foundobj,
&startdir_locked);
}
/* Now toss startdir and see if we have an error. */
if (startdir != NULL)
vrele(startdir);
if (error)
foundobj = NULL;
else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0)
vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
ndp->ni_vp = foundobj;
return (error);
}
/*
* External interface. The partitioning between this function and the
* above isn't very clear - the above function exists mostly so code
* that uses "state->" can be shuffled around without having to change
* it to "state.".
*/
int
lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir)
{
struct namei_state state;
int error;
KASSERT(ndp->ni_atdir == NULL);
ndp->ni_atdir = startdir;
/*
* Note: the name sent in here (is not|should not be) allowed
* to contain a slash.
*/
if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) {
return ENAMETOOLONG;
}
if (strchr(ndp->ni_pathbuf->pb_path, '/')) {
return EINVAL;
}
ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1;
ndp->ni_pnbuf = NULL;
ndp->ni_cnd.cn_nameptr = NULL;
namei_init(&state, ndp);
error = do_lookup_for_nfsd_index(&state);
namei_cleanup(&state);
return error;
}
////////////////////////////////////////////////////////////
/*
* Reacquire a path name component.
* dvp is locked on entry and exit.
* *vpp is locked on exit unless it's NULL.
*/
int
relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy)
{
int rdonly; /* lookup read-only flag bit */
int error = 0;
#ifdef DEBUG
size_t newlen; /* DEBUG: check name len */
const char *cp; /* DEBUG: check name ptr */
#endif /* DEBUG */
(void)dummy;
/*
* Setup: break out flag bits into variables.
*/
rdonly = cnp->cn_flags & RDONLY;
/*
* Search a new directory.
*
* The cn_hash value is for use by vfs_cache.
* The last component of the filename is left accessible via
* cnp->cn_nameptr for callers that need the name. Callers needing
* the name set the SAVENAME flag. When done, they assume
* responsibility for freeing the pathname buffer.
*/
#ifdef DEBUG
#if 0
cp = NULL;
newhash = namei_hash(cnp->cn_nameptr, &cp);
if ((uint32_t)newhash != (uint32_t)cnp->cn_hash)
panic("relookup: bad hash");
#endif
error = VOP_PARSEPATH(dvp, cnp->cn_nameptr, &newlen);
if (error) {
panic("relookup: parsepath failed with error %d", error);
}
if (cnp->cn_namelen != newlen)
panic("relookup: bad len");
cp = cnp->cn_nameptr + cnp->cn_namelen;
while (*cp == '/')
cp++;
if (*cp != 0)
panic("relookup: not last component");
#endif /* DEBUG */
/*
* Check for degenerate name (e.g. / or "")
* which is a way of talking about a directory,
* e.g. like "/." or ".".
*/
if (cnp->cn_nameptr[0] == '\0')
panic("relookup: null name");
if (cnp->cn_flags & ISDOTDOT)
panic("relookup: lookup on dot-dot");
/*
* We now have a segment name to search for, and a directory to search.
*/
*vpp = NULL;
error = VOP_LOOKUP(dvp, vpp, cnp);
if (error != 0) {
KASSERTMSG((*vpp == NULL),
"leaf `%s' should be empty but is %p",
cnp->cn_nameptr, *vpp);
if (error != EJUSTRETURN)
goto bad;
}
/*
* Check for symbolic link
*/
KASSERTMSG((*vpp == NULL || (*vpp)->v_type != VLNK ||
(cnp->cn_flags & FOLLOW) == 0),
"relookup: symlink found");
/*
* Check for read-only lookups.
*/
if (rdonly && cnp->cn_nameiop != LOOKUP) {
error = EROFS;
if (*vpp) {
vrele(*vpp);
}
goto bad;
}
/*
* Lock result.
*/
if (*vpp && *vpp != dvp) {
error = vn_lock(*vpp, LK_EXCLUSIVE);
if (error != 0) {
vrele(*vpp);
goto bad;
}
}
return (0);
bad:
*vpp = NULL;
return (error);
}
/*
* namei_simple - simple forms of namei.
*
* These are wrappers to allow the simple case callers of namei to be
* left alone while everything else changes under them.
*/
/* Flags */
struct namei_simple_flags_type {
int dummy;
};
static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft;
const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn;
const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt;
const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn;
const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft;
static
int
namei_simple_convert_flags(namei_simple_flags_t sflags)
{
if (sflags == NSM_NOFOLLOW_NOEMULROOT)
return NOFOLLOW | 0;
if (sflags == NSM_NOFOLLOW_TRYEMULROOT)
return NOFOLLOW | TRYEMULROOT;
if (sflags == NSM_FOLLOW_NOEMULROOT)
return FOLLOW | 0;
if (sflags == NSM_FOLLOW_TRYEMULROOT)
return FOLLOW | TRYEMULROOT;
panic("namei_simple_convert_flags: bogus sflags\n");
return 0;
}
int
namei_simple_kernel(const char *path, namei_simple_flags_t sflags,
struct vnode **vp_ret)
{
return nameiat_simple_kernel(NULL, path, sflags, vp_ret);
}
int
nameiat_simple_kernel(struct vnode *dvp, const char *path,
namei_simple_flags_t sflags, struct vnode **vp_ret)
{
struct nameidata nd;
struct pathbuf *pb;
int err;
pb = pathbuf_create(path);
if (pb == NULL) {
return ENOMEM;
}
NDINIT(&nd,
LOOKUP,
namei_simple_convert_flags(sflags),
pb);
if (dvp != NULL)
NDAT(&nd, dvp);
err = namei(&nd);
if (err != 0) {
pathbuf_destroy(pb);
return err;
}
*vp_ret = nd.ni_vp;
pathbuf_destroy(pb);
return 0;
}
int
namei_simple_user(const char *path, namei_simple_flags_t sflags,
struct vnode **vp_ret)
{
return nameiat_simple_user(NULL, path, sflags, vp_ret);
}
int
nameiat_simple_user(struct vnode *dvp, const char *path,
namei_simple_flags_t sflags, struct vnode **vp_ret)
{
struct pathbuf *pb;
struct nameidata nd;
int err;
err = pathbuf_copyin(path, &pb);
if (err) {
return err;
}
NDINIT(&nd,
LOOKUP,
namei_simple_convert_flags(sflags),
pb);
if (dvp != NULL)
NDAT(&nd, dvp);
err = namei(&nd);
if (err != 0) {
pathbuf_destroy(pb);
return err;
}
*vp_ret = nd.ni_vp;
pathbuf_destroy(pb);
return 0;
}
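/*
 * Illustrative sketch (not part of the original source): roughly how an
 * in-kernel caller might use namei_simple_kernel() to resolve a fixed
 * path to a vnode.  The path string, the function name and the use made
 * of the vnode are hypothetical; namei_simple returns the vnode
 * referenced and unlocked, so the caller must vrele() it when done.
 */
#if 0
static int
example_lookup_conf(struct vnode **vpp)
{
int error;

/* Follow symlinks; do not consider any emulation root. */
error = namei_simple_kernel("/etc/example.conf",
NSM_FOLLOW_NOEMULROOT, vpp);
if (error != 0)
return error;

/* ... vn_lock() and inspect *vpp as needed, then ... */
vrele(*vpp);
*vpp = NULL;
return 0;
}
#endif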
/* $NetBSD: genfs_vnops.c,v 1.220 2023/03/03 10:02:51 hannken Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.220 2023/03/03 10:02:51 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/stat.h>
#include <sys/extattr.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
#include <miscfs/specfs/specdev.h>
static void filt_genfsdetach(struct knote *);
static int filt_genfsread(struct knote *, long);
static int filt_genfsvnode(struct knote *, long);
/*
* Find the end of the first path component in NAME and return its
* length.
*/
int
genfs_parsepath(void *v)
{
struct vop_parsepath_args /* {
struct vnode *a_dvp;
const char *a_name;
size_t *a_retval;
} */ *ap = v;
const char *name = ap->a_name;
size_t pos;
(void)ap->a_dvp;
pos = 0;
while (name[pos] != '\0' && name[pos] != '/') {
pos++;
}
*ap->a_retval = pos;
return 0;
}
int
genfs_poll(void *v)
{
struct vop_poll_args /* {
struct vnode *a_vp;
int a_events;
struct lwp *a_l;
} */ *ap = v;
return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
int
genfs_seek(void *v)
{
struct vop_seek_args /* {
struct vnode *a_vp;
off_t a_oldoff;
off_t a_newoff;
kauth_cred_t cred;
} */ *ap = v;
if (ap->a_newoff < 0)
return (EINVAL);
return (0);
}
int
genfs_abortop(void *v)
{
struct vop_abortop_args /* {
struct vnode *a_dvp;
struct componentname *a_cnp;
} */ *ap = v;
(void)ap;
return (0);
}
int
genfs_fcntl(void *v)
{
struct vop_fcntl_args /* {
struct vnode *a_vp;
u_int a_command;
void *a_data;
int a_fflag;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
if (ap->a_command == F_SETFL)
return (0);
else
return (EOPNOTSUPP);
}
/*ARGSUSED*/
int
genfs_badop(void *v)
{
panic("genfs: bad op");
}
/*ARGSUSED*/
int
genfs_nullop(void *v)
{
return (0);
}
/*ARGSUSED*/
int
genfs_einval(void *v)
{
return (EINVAL);
}
int
genfs_erofs_link(void *v)
{
/* also for symlink */
struct vop_link_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
VOP_ABORTOP(ap->a_dvp, ap->a_cnp);
return EROFS;
}
/*
* Called when an fs doesn't support a particular vop.
* This takes care to vrele, vput, or vunlock passed in vnodes
* and calls VOP_ABORTOP for a componentname (in non-rename VOP).
*/
int
genfs_eopnotsupp(void *v)
{
struct vop_generic_args /*
struct vnodeop_desc *a_desc;
/ * other random data follows, presumably * /
} */ *ap = v;
struct vnodeop_desc *desc = ap->a_desc;
struct vnode *vp, *vp_last = NULL;
int flags, i, j, offset_cnp, offset_vp;
KASSERT(desc->vdesc_offset != VOP_LOOKUP_DESCOFFSET);
KASSERT(desc->vdesc_offset != VOP_ABORTOP_DESCOFFSET);
/*
* Abort any componentname that lookup potentially left state in.
*
* As is logical, componentnames for VOP_RENAME are handled by
* the caller of VOP_RENAME. Yay, rename!
*/
if (desc->vdesc_offset != VOP_RENAME_DESCOFFSET &&
(offset_vp = desc->vdesc_vp_offsets[0]) != VDESC_NO_OFFSET &&
(offset_cnp = desc->vdesc_componentname_offset) != VDESC_NO_OFFSET){
struct componentname *cnp;
struct vnode *dvp;
dvp = *VOPARG_OFFSETTO(struct vnode **, offset_vp, ap);
cnp = *VOPARG_OFFSETTO(struct componentname **, offset_cnp, ap);
VOP_ABORTOP(dvp, cnp);
}
flags = desc->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; flags >>=1, i++) {
if ((offset_vp = desc->vdesc_vp_offsets[i]) == VDESC_NO_OFFSET)
break; /* stop at end of list */
if ((j = flags & VDESC_VP0_WILLPUT)) {
vp = *VOPARG_OFFSETTO(struct vnode **, offset_vp, ap);
/* Skip if NULL */
if (!vp)
continue;
switch (j) {
case VDESC_VP0_WILLPUT:
/* Check for dvp == vp cases */
if (vp == vp_last)
vrele(vp);
else {
vput(vp);
vp_last = vp;
}
break;
case VDESC_VP0_WILLRELE:
vrele(vp);
break;
}
}
}
return (EOPNOTSUPP);
}
/*ARGSUSED*/
int
genfs_ebadf(void *v)
{
return (EBADF);
}
/* ARGSUSED */
int
genfs_enoioctl(void *v)
{
return (EPASSTHROUGH);
}
/*
* Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
int
genfs_revoke(void *v)
{
struct vop_revoke_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
#ifdef DIAGNOSTIC
if ((ap->a_flags & REVOKEALL) == 0)
panic("genfs_revoke: not revokeall");
#endif
vrevoke(ap->a_vp);
return (0);
}
/*
* Lock the node (for deadfs).
*/
int
genfs_deadlock(void *v)
{
struct vop_lock_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int flags = ap->a_flags;
krw_t op;
if (! ISSET(flags, LK_RETRY))
return ENOENT;
if (ISSET(flags, LK_DOWNGRADE)) {
rw_downgrade(&vip->vi_lock);
} else if (ISSET(flags, LK_UPGRADE)) {
KASSERT(ISSET(flags, LK_NOWAIT));
if (!rw_tryupgrade(&vip->vi_lock)) {
return EBUSY;
}
} else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) {
op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER);
if (ISSET(flags, LK_NOWAIT)) {
if (!rw_tryenter(&vip->vi_lock, op))
return EBUSY;
} else {
rw_enter(&vip->vi_lock, op);
}
}
VSTATE_ASSERT_UNLOCKED(vp, VS_RECLAIMED);
return 0;
}
/*
* Unlock the node (for deadfs).
*/
int
genfs_deadunlock(void *v)
{
struct vop_unlock_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
rw_exit(&vip->vi_lock);
return 0;
}
/*
* Lock the node.
*/
int
genfs_lock(void *v)
{
struct vop_lock_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int flags = ap->a_flags;
krw_t op;
if (ISSET(flags, LK_DOWNGRADE)) {
rw_downgrade(&vip->vi_lock);
} else if (ISSET(flags, LK_UPGRADE)) {
KASSERT(ISSET(flags, LK_NOWAIT));
if (!rw_tryupgrade(&vip->vi_lock)) {
return EBUSY;
}
} else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) {
op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER);
if (ISSET(flags, LK_NOWAIT)) {
if (!rw_tryenter(&vip->vi_lock, op))
return EBUSY;
} else {
rw_enter(&vip->vi_lock, op);
}
}
VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
return 0;
}
/*
* Unlock the node.
*/
int
genfs_unlock(void *v)
{
struct vop_unlock_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
rw_exit(&vip->vi_lock);
return 0;
}
/*
* Return whether or not the node is locked.
*/
int
genfs_islocked(void *v)
{
struct vop_islocked_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
if (rw_write_held(&vip->vi_lock))
return LK_EXCLUSIVE;
if (rw_read_held(&vip->vi_lock))
return LK_SHARED;
return 0;
}
int
genfs_mmap(void *v)
{
return (0);
}
/*
* VOP_PUTPAGES() for vnodes which never have pages.
*/
int
genfs_null_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(vp->v_uobj.uo_npages == 0);
rw_exit(vp->v_uobj.vmobjlock);
return (0);
}
void
genfs_node_init(struct vnode *vp, const struct genfs_ops *ops)
{
struct genfs_node *gp = VTOG(vp);
rw_init(&gp->g_glock);
gp->g_op = ops;
}
void
genfs_node_destroy(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_destroy(&gp->g_glock);
}
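/*
 * Round "size" up to the end of the enclosing filesystem block
 * (as given by mnt_fs_bshift) and return the result via *eobp;
 * "flags" is unused here.
 */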
void
genfs_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
{
int bsize;
bsize = 1 << vp->v_mount->mnt_fs_bshift;
*eobp = (size + bsize - 1) & ~(bsize - 1);
}
static void
filt_genfsdetach(struct knote *kn)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
vn_knote_detach(vp, kn);
}
static int
filt_genfsread(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int rv;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, EV_ONESHOT);
return (1);
case 0:
mutex_enter(vp->v_interlock);
kn->kn_data = vp->v_size - ((file_t *)kn->kn_obj)->f_offset;
rv = (kn->kn_data != 0);
mutex_exit(vp->v_interlock);
return rv;
default:
KASSERT(mutex_owned(vp->v_interlock));
kn->kn_data = vp->v_size - ((file_t *)kn->kn_obj)->f_offset;
return (kn->kn_data != 0);
}
}
static int
filt_genfswrite(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, EV_ONESHOT);
return (1);
case 0:
mutex_enter(vp->v_interlock);
kn->kn_data = 0;
mutex_exit(vp->v_interlock);
return 1;
default:
KASSERT(mutex_owned(vp->v_interlock));
kn->kn_data = 0;
return 1;
}
}
static int
filt_genfsvnode(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int fflags;
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, 0);
if ((kn->kn_sfflags & hint) != 0)
kn->kn_fflags |= hint;
return (1);
case 0:
mutex_enter(vp->v_interlock);
fflags = kn->kn_fflags;
mutex_exit(vp->v_interlock);
break;
default:
KASSERT(mutex_owned(vp->v_interlock));
if ((kn->kn_sfflags & hint) != 0)
kn->kn_fflags |= hint;
fflags = kn->kn_fflags;
break;
}
return (fflags != 0);
}
static const struct filterops genfsread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfsread,
};
static const struct filterops genfswrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfswrite,
};
static const struct filterops genfsvnode_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfsvnode,
};
int
genfs_kqfilter(void *v)
{
struct vop_kqfilter_args /* {
struct vnode *a_vp;
struct knote *a_kn;
} */ *ap = v;
struct vnode *vp;
struct knote *kn;
vp = ap->a_vp;
kn = ap->a_kn;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &genfsread_filtops;
break;
case EVFILT_WRITE:
kn->kn_fop = &genfswrite_filtops;
break;
case EVFILT_VNODE:
kn->kn_fop = &genfsvnode_filtops;
break;
default:
return (EINVAL);
}
kn->kn_hook = vp;
vn_knote_attach(vp, kn);
return (0);
}
void
genfs_node_wrlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_enter(&gp->g_glock, RW_WRITER);
}
void
genfs_node_rdlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_enter(&gp->g_glock, RW_READER);
}
int
genfs_node_rdtrylock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
return rw_tryenter(&gp->g_glock, RW_READER);
}
void
genfs_node_unlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_exit(&gp->g_glock);
}
int
genfs_node_wrlocked(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
return rw_write_held(&gp->g_glock);
}
/*
* Common filesystem object access control check routine. Accepts a
* vnode, cred, uid, gid, mode, acl, requested access mode.
* Returns 0 on success, or an errno on failure.
*/
int
genfs_can_access(vnode_t *vp, kauth_cred_t cred, uid_t file_uid, gid_t file_gid,
mode_t file_mode, struct acl *acl, accmode_t accmode)
{
accmode_t dac_granted;
int error;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
/*
* Look for a normal, non-privileged way to access the file/directory
* as requested. If it exists, go with that.
*/
dac_granted = 0;
/* Check the owner. */
if (kauth_cred_geteuid(cred) == file_uid) {
dac_granted |= VADMIN;
if (file_mode & S_IXUSR)
dac_granted |= VEXEC;
if (file_mode & S_IRUSR)
dac_granted |= VREAD;
if (file_mode & S_IWUSR)
dac_granted |= (VWRITE | VAPPEND);
goto privchk;
}
/* Otherwise, check the groups. */
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error == 0) {
if (file_mode & S_IXGRP)
dac_granted |= VEXEC;
if (file_mode & S_IRGRP)
dac_granted |= VREAD;
if (file_mode & S_IWGRP)
dac_granted |= (VWRITE | VAPPEND);
goto privchk;
}
/* Otherwise, check everyone else. */
if (file_mode & S_IXOTH)
dac_granted |= VEXEC;
if (file_mode & S_IROTH)
dac_granted |= VREAD;
if (file_mode & S_IWOTH)
dac_granted |= (VWRITE | VAPPEND);
privchk:
if ((accmode & dac_granted) == accmode)
return 0;
return (accmode & VADMIN) ? EPERM : EACCES;
}
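/*
 * Illustrative sketch (not part of the original source): a minimal way
 * a file system's access check could lean on genfs_can_access() for the
 * plain mode-bit decision.  The uid/gid/mode parameters stand in for
 * whatever the file system keeps in its own inode structure.
 */
#if 0
static int
example_access(vnode_t *vp, accmode_t accmode, kauth_cred_t cred,
    uid_t uid, gid_t gid, mode_t mode)
{
/*
 * Plain discretionary decision; file systems normally feed this
 * result into kauth_authorize_vnode() as the fall-back decision so
 * that the active secmodel (e.g. suser) may still grant access.
 */
return genfs_can_access(vp, cred, uid, gid, mode, NULL, accmode);
}
#endif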
/*
* Implement a version of genfs_can_access() that understands POSIX.1e ACL
* semantics;
* the access ACL has already been prepared for evaluation by the file system
* and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an
* errno value.
*/
int
genfs_can_access_acl_posix1e(vnode_t *vp, kauth_cred_t cred, uid_t file_uid,
gid_t file_gid, mode_t file_mode, struct acl *acl, accmode_t accmode)
{
struct acl_entry *acl_other, *acl_mask;
accmode_t dac_granted;
accmode_t acl_mask_granted;
int group_matched, i;
int error;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
/*
* The owner matches if the effective uid associated with the
* credential matches that of the ACL_USER_OBJ entry. While we're
* doing the first scan, also cache the location of the ACL_MASK and
* ACL_OTHER entries, preventing some future iterations.
*/
acl_mask = acl_other = NULL;
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_USER_OBJ:
if (kauth_cred_geteuid(cred) != file_uid)
break;
dac_granted = 0;
dac_granted |= VADMIN;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
goto out;
case ACL_MASK:
acl_mask = ae;
break;
case ACL_OTHER:
acl_other = ae;
break;
default:
break;
}
}
/*
* An ACL_OTHER entry should always exist in a valid access ACL. If
* it doesn't, then generate a serious failure. For now, this means
* a debugging message and EPERM, but in the future should probably
* be a panic.
*/
if (acl_other == NULL) {
/*
* XXX This should never happen
*/
printf("%s: ACL_OTHER missing\n", __func__);
return EPERM;
}
/*
* Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are
* masked by an ACL_MASK entry, if any. As such, first identify the
* ACL_MASK field, then iterate through identifying potential user
* matches, then group matches. If there is no ACL_MASK, assume that
* the mask allows all requests to succeed.
*/
if (acl_mask != NULL) {
acl_mask_granted = 0;
if (acl_mask->ae_perm & ACL_EXECUTE)
acl_mask_granted |= VEXEC;
if (acl_mask->ae_perm & ACL_READ)
acl_mask_granted |= VREAD;
if (acl_mask->ae_perm & ACL_WRITE)
acl_mask_granted |= (VWRITE | VAPPEND);
} else
acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
/*
* Check ACL_USER ACL entries. There will either be one or no
* matches; if there is one, we accept or reject based on the
* match; otherwise, we continue on to groups.
*/
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_USER:
if (kauth_cred_geteuid(cred) != ae->ae_id)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
}
}
/*
* Group match is best-match, not first-match, so find a "best"
* match. Iterate across, testing each potential group match. Make
* sure we keep track of whether we found a match or not, so that we
* know if we should try again with any available privilege, or if we
* should move on to ACL_OTHER.
*/
group_matched = 0;
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
if ((accmode & dac_granted) == accmode)
return 0;
group_matched = 1;
break;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
if ((accmode & dac_granted) == accmode)
return 0;
group_matched = 1;
break;
default:
break;
}
}
if (group_matched == 1) {
/*
* There was a match, but it did not grant rights via pure
* DAC. Try again, this time with privilege.
*/
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
default:
break;
}
}
/*
* Even with privilege, group membership was not sufficient.
* Return failure.
*/
dac_granted = 0;
goto out;
}
/*
* Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
*/
dac_granted = 0;
if (acl_other->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (acl_other->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (acl_other->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
out:
if ((accmode & dac_granted) == accmode)
return 0;
return (accmode & VADMIN) ? EPERM : EACCES;
}
static struct {
accmode_t accmode;
int mask;
} accmode2mask[] = {
{ VREAD, ACL_READ_DATA },
{ VWRITE, ACL_WRITE_DATA },
{ VAPPEND, ACL_APPEND_DATA },
{ VEXEC, ACL_EXECUTE },
{ VREAD_NAMED_ATTRS, ACL_READ_NAMED_ATTRS },
{ VWRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS },
{ VDELETE_CHILD, ACL_DELETE_CHILD },
{ VREAD_ATTRIBUTES, ACL_READ_ATTRIBUTES },
{ VWRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES },
{ VDELETE, ACL_DELETE },
{ VREAD_ACL, ACL_READ_ACL },
{ VWRITE_ACL, ACL_WRITE_ACL },
{ VWRITE_OWNER, ACL_WRITE_OWNER },
{ VSYNCHRONIZE, ACL_SYNCHRONIZE },
{ 0, 0 },
};
static int
_access_mask_from_accmode(accmode_t accmode)
{
int access_mask = 0, i;
for (i = 0; accmode2mask[i].accmode != 0; i++) {
if (accmode & accmode2mask[i].accmode)
access_mask |= accmode2mask[i].mask;
}
/*
* VAPPEND is just a modifier for VWRITE; if the caller asked
* for 'VAPPEND | VWRITE', we want to check for ACL_APPEND_DATA only.
*/
if (access_mask & ACL_APPEND_DATA)
access_mask &= ~ACL_WRITE_DATA;
return (access_mask);
}
/*
* Return 0, iff access is allowed, 1 otherwise.
*/
static int
_acl_denies(const struct acl *aclp, int access_mask, kauth_cred_t cred,
int file_uid, int file_gid, int *denied_explicitly)
{
int i, error;
const struct acl_entry *ae;
if (denied_explicitly != NULL)
*denied_explicitly = 0;
KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES);
for (i = 0; i < aclp->acl_cnt; i++) {
ae = &(aclp->acl_entry[i]);
if (ae->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
ae->ae_entry_type != ACL_ENTRY_TYPE_DENY)
continue;
if (ae->ae_flags & ACL_ENTRY_INHERIT_ONLY)
continue;
switch (ae->ae_tag) {
case ACL_USER_OBJ:
if (kauth_cred_geteuid(cred) != file_uid)
continue;
break;
case ACL_USER:
if (kauth_cred_geteuid(cred) != ae->ae_id)
continue;
break;
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error != 0)
continue;
break;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error != 0)
continue;
break;
default:
KASSERT(ae->ae_tag == ACL_EVERYONE);
}
if (ae->ae_entry_type == ACL_ENTRY_TYPE_DENY) {
if (ae->ae_perm & access_mask) {
if (denied_explicitly != NULL)
*denied_explicitly = 1;
return (1);
}
}
access_mask &= ~(ae->ae_perm);
if (access_mask == 0)
return (0);
}
if (access_mask == 0)
return (0);
return (1);
}
int
genfs_can_access_acl_nfs4(vnode_t *vp, kauth_cred_t cred, uid_t file_uid,
gid_t file_gid, mode_t file_mode, struct acl *aclp, accmode_t accmode)
{
int denied, explicitly_denied, access_mask, is_directory,
must_be_owner = 0;
file_mode = 0;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND |
VEXPLICIT_DENY | VREAD_NAMED_ATTRS | VWRITE_NAMED_ATTRS |
VDELETE_CHILD | VREAD_ATTRIBUTES | VWRITE_ATTRIBUTES | VDELETE |
VREAD_ACL | VWRITE_ACL | VWRITE_OWNER | VSYNCHRONIZE)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
if (accmode & VADMIN)
must_be_owner = 1;
/*
* Ignore VSYNCHRONIZE permission.
*/
accmode &= ~VSYNCHRONIZE;
access_mask = _access_mask_from_accmode(accmode);
if (vp && vp->v_type == VDIR)
is_directory = 1;
else
is_directory = 0;
/*
* File owner is always allowed to read and write the ACL
* and basic attributes. This is to prevent a situation
* where user would change ACL in a way that prevents him
* from undoing the change.
*/
if (kauth_cred_geteuid(cred) == file_uid)
access_mask &= ~(ACL_READ_ACL | ACL_WRITE_ACL |
ACL_READ_ATTRIBUTES | ACL_WRITE_ATTRIBUTES);
/*
* Ignore append permission for regular files; use write
* permission instead.
*/
if (!is_directory && (access_mask & ACL_APPEND_DATA)) {
access_mask &= ~ACL_APPEND_DATA;
access_mask |= ACL_WRITE_DATA;
}
denied = _acl_denies(aclp, access_mask, cred, file_uid, file_gid,
&explicitly_denied);
if (must_be_owner) {
if (kauth_cred_geteuid(cred) != file_uid)
denied = EPERM;
}
/*
* For VEXEC, ensure that at least one execute bit is set for
* non-directories. We have to check the mode here to stay
* consistent with execve(2). See the test in
* exec_check_permissions().
*/
__acl_nfs4_sync_mode_from_acl(&file_mode, aclp);
if (!denied && !is_directory && (accmode & VEXEC) &&
(file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
denied = EACCES;
if (!denied)
return (0);
/*
* Access failed. Iff it was not denied explicitly and
* VEXPLICIT_DENY flag was specified, allow access.
*/
if ((accmode & VEXPLICIT_DENY) && explicitly_denied == 0)
return (0);
accmode &= ~VEXPLICIT_DENY;
if (accmode & (VADMIN_PERMS | VDELETE_CHILD | VDELETE))
denied = EPERM;
else
denied = EACCES;
return (denied);
}
/*
* Common routine to check if chmod() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not set the "sticky" bit (meaningless, see chmod(2))
* - You must be a member of the group if you're trying to set the
* SGID bit
*
* vp - vnode of the file-system object
* cred - credentials of the invoker
* cur_uid, cur_gid - current uid/gid of the file-system object
* new_mode - new mode for the file-system object
*
* Returns 0 if the change is allowed, or an error value otherwise.
*/
int
genfs_can_chmod(vnode_t *vp, kauth_cred_t cred, uid_t cur_uid,
gid_t cur_gid, mode_t new_mode)
{
int error;
/*
* To modify the permissions on a file, must possess VADMIN
* for that file.
*/
if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred)) != 0)
return (error);
/*
* Unprivileged users can't set the sticky bit on files.
*/
if ((vp->v_type != VDIR) && (new_mode & S_ISTXT))
return (EFTYPE);
/*
* If the invoker is trying to set the SGID bit on the file,
* check group membership.
*/
if (new_mode & S_ISGID) {
int ismember;
error = kauth_cred_ismember_gid(cred, cur_gid,
&ismember);
if (error || !ismember)
return (EPERM);
}
/*
* Deny setting setuid if we are not the file owner.
*/
if ((new_mode & S_ISUID) && cur_uid != kauth_cred_geteuid(cred))
return (EPERM);
return (0);
}
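/*
 * Illustrative sketch (not part of the original source, and the exact
 * call pattern is an assumption): a file system's chmod path would
 * typically combine genfs_can_chmod() with the kauth(9) vnode scope,
 * passing its verdict as the fall-back decision so a secmodel can
 * override the plain ownership policy.  The uid/gid/mode arguments
 * stand in for whatever the file system stores in its inode.
 */
#if 0
static int
example_chmod_check(vnode_t *vp, kauth_cred_t cred,
    uid_t cur_uid, gid_t cur_gid, mode_t new_mode)
{
return kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY,
    vp, NULL, genfs_can_chmod(vp, cred, cur_uid, cur_gid, new_mode));
}
#endif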
/*
* Common routine to check if chown() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not try to change ownership, and
* - You must be member of the new group
*
* vp - vnode
* cred - credentials of the invoker
* cur_uid, cur_gid - current uid/gid of the file-system object
* new_uid, new_gid - target uid/gid of the file-system object
*
* Returns 0 if the change is allowed, or an error value otherwise.
*/
int
genfs_can_chown(vnode_t *vp, kauth_cred_t cred, uid_t cur_uid,
gid_t cur_gid, uid_t new_uid, gid_t new_gid)
{
int error, ismember;
/*
* To modify the ownership of a file, must possess VADMIN for that
* file.
*/
if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred)) != 0)
return (error);
/*
* You can only change ownership of a file if:
* You own the file and...
*/
if (kauth_cred_geteuid(cred) == cur_uid) {
/*
* You don't try to change ownership, and...
*/
if (new_uid != cur_uid)
return (EPERM);
/*
* You don't try to change group (no-op), or...
*/
if (new_gid == cur_gid)
return (0);
/*
* Your effective gid is the new gid, or...
*/
if (kauth_cred_getegid(cred) == new_gid)
return (0);
/*
* The new gid is one you're a member of.
*/
ismember = 0;
error = kauth_cred_ismember_gid(cred, new_gid,
&ismember);
if (!error && ismember)
return (0);
}
return (EPERM);
}
int
genfs_can_chtimes(vnode_t *vp, kauth_cred_t cred, uid_t owner_uid,
u_int vaflags)
{
int error;
/*
* Grant permission if the caller is the owner of the file, or
* the super-user, or has ACL_WRITE_ATTRIBUTES permission on
* the file. If the time pointer is null, then write
* permission on the file is also sufficient.
*
* From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
* A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
* will be allowed to set the times [..] to the current
* server time.
*/
error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred);
if (error != 0 && (vaflags & VA_UTIMES_NULL) != 0)
error = VOP_ACCESS(vp, VWRITE, cred);
if (error)
return (vaflags & VA_UTIMES_NULL) == 0 ? EPERM : EACCES;
return 0;
}
/*
* Common routine to check if chflags() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not change system flags, and
* - You must not change flags on character/block devices.
*
* vp - vnode
* cred - credentials of the invoker
* owner_uid - uid of the file-system object
* changing_sysflags - true if the invoker wants to change system flags
*/
int
genfs_can_chflags(vnode_t *vp, kauth_cred_t cred,
uid_t owner_uid, bool changing_sysflags)
{
/* The user must own the file. */
if (kauth_cred_geteuid(cred) != owner_uid) {
return EPERM;
}
if (changing_sysflags) {
return EPERM;
}
/*
* Unprivileged users cannot change the flags on devices, even if they
* own them.
*/
if (vp->v_type == VCHR || vp->v_type == VBLK) {
return EPERM;
}
return 0;
}
/*
* Common "sticky" policy.
*
* When a directory is "sticky" (as determined by the caller), this
* function may help implement the following policy:
* - Renaming a file in it is only possible if the user owns the directory
* or the file being renamed.
* - Deleting a file from it is only possible if the user owns the
* directory or the file being deleted.
*/
int
genfs_can_sticky(vnode_t *vp, kauth_cred_t cred, uid_t dir_uid, uid_t file_uid)
{
if (kauth_cred_geteuid(cred) != dir_uid &&
kauth_cred_geteuid(cred) != file_uid)
return EPERM;
return 0;
}
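/*
 * Illustrative sketch, not part of this file: a remove/rename path on a
 * sticky directory might combine genfs_can_sticky() with the
 * KAUTH_VNODE_DELETE action.  example_can_remove() and its dir_uid,
 * file_uid and dir_is_sticky parameters are hypothetical.
 */
#if 0	/* example only */
static int
example_can_remove(vnode_t *dvp, vnode_t *vp, kauth_cred_t cred,
    uid_t dir_uid, uid_t file_uid, bool dir_is_sticky)
{
	int fs_decision = 0;

	if (dir_is_sticky)
		fs_decision = genfs_can_sticky(vp, cred, dir_uid, file_uid);
	return kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, vp, dvp,
	    fs_decision);
}
#endif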
int
genfs_can_extattr(vnode_t *vp, kauth_cred_t cred, accmode_t accmode,
int attrnamespace)
{
/*
	 * Kernel-invoked (NOCRED) requests always succeed.
*/
if (cred == NOCRED)
return 0;
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
return kauth_authorize_system(cred, KAUTH_SYSTEM_FS_EXTATTR,
0, vp->v_mount, NULL, NULL);
case EXTATTR_NAMESPACE_USER:
return VOP_ACCESS(vp, accmode, cred);
default:
return EPERM;
}
}
int
genfs_access(void *v)
{
struct vop_access_args *ap = v;
KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
VAPPEND)) == 0);
return VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred);
}
int
genfs_accessx(void *v)
{
struct vop_accessx_args *ap = v;
int error;
accmode_t accmode = ap->a_accmode;
error = vfs_unixify_accmode(&accmode);
if (error != 0)
return error;
if (accmode == 0)
return 0;
return VOP_ACCESS(ap->a_vp, accmode, ap->a_cred);
}
/*
* genfs_pathconf:
*
* Standard implementation of POSIX pathconf, to get information about limits
* for a filesystem.
* Override per filesystem for the case where the filesystem has smaller
* limits.
*/
int
genfs_pathconf(void *v)
{
struct vop_pathconf_args *ap = v;
switch (ap->a_name) {
case _PC_PATH_MAX:
*ap->a_retval = PATH_MAX;
return 0;
case _PC_ACL_EXTENDED:
case _PC_ACL_NFS4:
*ap->a_retval = 0;
return 0;
default:
return EINVAL;
}
}
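/*
 * Illustrative sketch, not part of this file: the genfs helpers above are
 * meant to be wired into a file system's vnode operations vector.  A file
 * system that implements VOP_ACCESS natively can use genfs_accessx() to
 * map VOP_ACCESSX onto it (or the reverse with genfs_access()), and
 * genfs_pathconf() to provide the generic limits.  The examplefs_* names
 * are hypothetical.
 */
#if 0	/* example only */
int (**examplefs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_access_desc, examplefs_access },	/* fs's own check */
	{ &vop_accessx_desc, genfs_accessx },	/* map onto VOP_ACCESS */
	{ &vop_pathconf_desc, genfs_pathconf },	/* generic limits */
	{ NULL, NULL }
};
const struct vnodeopv_desc examplefs_vnodeop_opv_desc =
	{ &examplefs_vnodeop_p, examplefs_vnodeop_entries };
#endif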
/* $NetBSD: uvm_fault_i.h,v 1.33 2020/02/23 15:46:43 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_fault_i.h,v 1.1.6.1 1997/12/08 16:07:12 chuck Exp
*/
#ifndef _UVM_UVM_FAULT_I_H_
#define _UVM_UVM_FAULT_I_H_
/*
* uvm_fault_i.h: fault inline functions
*/
void uvmfault_update_stats(struct uvm_faultinfo *);
/*
* uvmfault_unlockmaps: unlock the maps
*/
static __inline void
uvmfault_unlockmaps(struct uvm_faultinfo *ufi, bool write_locked)
{
/*
* ufi can be NULL when this isn't really a fault,
* but merely paging in anon data.
*/
if (ufi == NULL) {
return;
}
#ifndef __HAVE_NO_PMAP_STATS
uvmfault_update_stats(ufi);
#endif
if (write_locked) {
vm_map_unlock(ufi->map);
} else {
vm_map_unlock_read(ufi->map);
}
}
/*
* uvmfault_unlockall: unlock everything passed in.
*
* => maps must be read-locked (not write-locked).
*/
static __inline void
uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap,
struct uvm_object *uobj)
{
	if (uobj)
		rw_exit(uobj->vmobjlock);
	if (amap)
		amap_unlock(amap);
	uvmfault_unlockmaps(ufi, false);
}
/*
* uvmfault_lookup: lookup a virtual address in a map
*
* => caller must provide a uvm_faultinfo structure with the IN
* params properly filled in
* => we will lookup the map entry (handling submaps) as we go
* => if the lookup is a success we will return with the maps locked
* => if "write_lock" is true, we write_lock the map, otherwise we only
* get a read lock.
* => note that submaps can only appear in the kernel and they are
* required to use the same virtual addresses as the map they
* are referenced by (thus address translation between the main
* map and the submap is unnecessary).
*/
static __inline bool
uvmfault_lookup(struct uvm_faultinfo *ufi, bool write_lock)
{
struct vm_map *tmpmap;
/*
* init ufi values for lookup.
*/
ufi->map = ufi->orig_map;
ufi->size = ufi->orig_size;
/*
* keep going down levels until we are done. note that there can
* only be two levels so we won't loop very long.
*/
for (;;) {
/*
* lock map
*/
if (write_lock) {
vm_map_lock(ufi->map);
} else {
vm_map_lock_read(ufi->map);
}
/*
* lookup
*/
if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr,
&ufi->entry)) {
uvmfault_unlockmaps(ufi, write_lock);
return(false);
}
/*
* reduce size if necessary
*/
		if (ufi->entry->end - ufi->orig_rvaddr < ufi->size)
			ufi->size = ufi->entry->end - ufi->orig_rvaddr;
/*
* submap? replace map with the submap and lookup again.
* note: VAs in submaps must match VAs in main map.
*/
if (UVM_ET_ISSUBMAP(ufi->entry)) {
tmpmap = ufi->entry->object.sub_map;
if (write_lock) {
vm_map_unlock(ufi->map);
} else {
vm_map_unlock_read(ufi->map);
}
ufi->map = tmpmap;
continue;
}
/*
* got it!
*/
ufi->mapv = ufi->map->timestamp;
return(true);
} /* while loop */
/*NOTREACHED*/
}
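/*
 * Illustrative sketch, not part of this file: the caller fills in only the
 * IN parameters of struct uvm_faultinfo before uvmfault_lookup(), roughly
 * as uvm_fault() does.  "map" and "vaddr" are hypothetical locals.
 */
#if 0	/* example only */
	struct uvm_faultinfo ufi;

	ufi.orig_map = map;
	ufi.orig_rvaddr = trunc_page(vaddr);
	ufi.orig_size = PAGE_SIZE;

	if (uvmfault_lookup(&ufi, false) == false)
		return EFAULT;		/* no entry maps the address */

	/* ufi.map is now read-locked; ufi.entry and ufi.size are valid. */

	uvmfault_unlockmaps(&ufi, false);
#endif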
/*
* uvmfault_relock: attempt to relock the same version of the map
*
* => fault data structures should be unlocked before calling.
 * => on success (true), the maps will be locked after the call.
*/
static __inline bool
uvmfault_relock(struct uvm_faultinfo *ufi)
{
/*
* ufi can be NULL when this isn't really a fault,
* but merely paging in anon data.
*/
if (ufi == NULL) {
return true;
}
cpu_count(CPU_COUNT_FLTRELCK, 1);
/*
* relock map. fail if version mismatch (in which case nothing
* gets locked).
*/
vm_map_lock_read(ufi->map);
if (ufi->mapv != ufi->map->timestamp) {
vm_map_unlock_read(ufi->map);
return(false);
}
cpu_count(CPU_COUNT_FLTRELCKOK, 1);
return(true);
}
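/*
 * Illustrative sketch, not part of this file: the usual pattern is to drop
 * all fault locks around a sleeping operation and then relock, restarting
 * the fault if the map version changed.  "amap" and "uobj" are
 * hypothetical locals.
 */
#if 0	/* example only */
	uvmfault_unlockall(ufi, amap, uobj);

	/* ... perform I/O or wait for memory here ... */

	if (uvmfault_relock(ufi) == false)
		return ERESTART;	/* map changed; restart the fault */
#endif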
#endif /* _UVM_UVM_FAULT_I_H_ */
/* $NetBSD: uipc_usrreq.c,v 1.203 2022/05/28 22:08:46 andvar Exp $ */
/*-
* Copyright (c) 1998, 2000, 2004, 2008, 2009, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1997 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.203 2022/05/28 22:08:46 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/unpcb.h>
#include <sys/un.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mbuf.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/compat_stub.h>
#include <compat/sys/socket.h>
#include <compat/net/route_70.h>
/*
* Unix communications domain.
*
* TODO:
* RDM
* rethink name space problems
* need a proper out-of-band
*
* Notes on locking:
*
* The generic rules noted in uipc_socket2.c apply. In addition:
*
* o We have a global lock, uipc_lock.
*
* o All datagram sockets are locked by uipc_lock.
*
* o For stream socketpairs, the two endpoints are created sharing the same
* independent lock. Sockets presented to PRU_CONNECT2 must already have
* matching locks.
*
* o Stream sockets created via socket() start life with their own
* independent lock.
*
* o Stream connections to a named endpoint are slightly more complicated.
* Sockets that have called listen() have their lock pointer mutated to
* the global uipc_lock. When establishing a connection, the connecting
* socket also has its lock mutated to uipc_lock, which matches the head
* (listening socket). We create a new socket for accept() to return, and
* that also shares the head's lock. Until the connection is completely
* done on both ends, all three sockets are locked by uipc_lock. Once the
* connection is complete, the association with the head's lock is broken.
* The connecting socket and the socket returned from accept() have their
* lock pointers mutated away from uipc_lock, and back to the connecting
* socket's original, independent lock. The head continues to be locked
* by uipc_lock.
*
* o If uipc_lock is determined to be a significant source of contention,
* it could easily be hashed out. It is difficult to simply make it an
* independent lock because of visibility / garbage collection issues:
* if a socket has been associated with a lock at any point, that lock
* must remain valid until the socket is no longer visible in the system.
* The lock must not be freed or otherwise destroyed until any sockets
* that had referenced it have also been destroyed.
*/
const struct sockaddr_un sun_noname = {
.sun_len = offsetof(struct sockaddr_un, sun_path),
.sun_family = AF_LOCAL,
};
ino_t unp_ino; /* prototype for fake inode numbers */
static struct mbuf * unp_addsockcred(struct lwp *, struct mbuf *);
static void unp_discard_later(file_t *);
static void unp_discard_now(file_t *);
static void unp_disconnect1(struct unpcb *);
static bool unp_drop(struct unpcb *, int);
static int unp_internalize(struct mbuf **);
static void unp_mark(file_t *);
static void unp_scan(struct mbuf *, void (*)(file_t *), int);
static void unp_shutdown1(struct unpcb *);
static void unp_thread(void *);
static void unp_thread_kick(void);
static kmutex_t *uipc_lock;
static kcondvar_t unp_thread_cv;
static lwp_t *unp_thread_lwp;
static SLIST_HEAD(,file) unp_thread_discard;
static int unp_defer;
static struct sysctllog *usrreq_sysctllog;
static void unp_sysctl_create(void);
/* Compat interface */
struct mbuf * stub_compat_70_unp_addsockcred(lwp_t *, struct mbuf *);
struct mbuf * stub_compat_70_unp_addsockcred(struct lwp *lwp,
struct mbuf *control)
{
/* just copy our initial argument */
return control;
}
bool compat70_ocreds_valid = false;
/*
* Initialize Unix protocols.
*/
void
uipc_init(void)
{
int error;
unp_sysctl_create();
uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
cv_init(&unp_thread_cv, "unpgc");
error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
NULL, &unp_thread_lwp, "unpgc");
if (error != 0)
panic("uipc_init %d", error);
}
static void
unp_connid(struct lwp *l, struct unpcb *unp, int flags)
{
unp->unp_connid.unp_pid = l->l_proc->p_pid;
unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
unp->unp_flags |= flags;
}
/*
* A connection succeeded: disassociate both endpoints from the head's
* lock, and make them share their own lock. There is a race here: for
* a very brief time one endpoint will be locked by a different lock
* than the other end. However, since the current thread holds the old
* lock (the listening socket's lock, the head) access can still only be
* made to one side of the connection.
*/
static void
unp_setpeerlocks(struct socket *so, struct socket *so2)
{
struct unpcb *unp;
kmutex_t *lock;
KASSERT(solocked2(so, so2));
/*
* Bail out if either end of the socket is not yet fully
* connected or accepted. We only break the lock association
* with the head when the pair of sockets stand completely
* on their own.
*/
KASSERT(so->so_head == NULL);
if (so2->so_head != NULL)
return;
/*
* Drop references to old lock. A third reference (from the
* queue head) must be held as we still hold its lock. Bonus:
* we don't need to worry about garbage collecting the lock.
*/
lock = so->so_lock;
KASSERT(lock == uipc_lock);
mutex_obj_free(lock);
mutex_obj_free(lock);
/*
* Grab stream lock from the initiator and share between the two
* endpoints. Issue memory barrier to ensure all modifications
* become globally visible before the lock change. so2 is
* assumed not to have a stream lock, because it was created
* purely for the server side to accept this connection and
* started out life using the domain-wide lock.
*/
unp = sotounpcb(so);
KASSERT(unp->unp_streamlock != NULL);
KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
lock = unp->unp_streamlock;
unp->unp_streamlock = NULL;
mutex_obj_hold(lock);
/*
* Ensure lock is initialized before publishing it with
* solockreset. Pairs with atomic_load_consume in solock and
* various loops to reacquire lock after wakeup.
*/
membar_release();
/*
* possible race if lock is not held - see comment in
* uipc_usrreq(PRU_ACCEPT).
*/
KASSERT(mutex_owned(lock));
solockreset(so, lock);
solockreset(so2, lock);
}
/*
* Reset a socket's lock back to the domain-wide lock.
*/
static void
unp_resetlock(struct socket *so)
{
kmutex_t *olock, *nlock;
struct unpcb *unp;
KASSERT(solocked(so));
olock = so->so_lock;
nlock = uipc_lock;
if (olock == nlock)
return;
unp = sotounpcb(so);
KASSERT(unp->unp_streamlock == NULL);
unp->unp_streamlock = olock;
mutex_obj_hold(nlock);
mutex_enter(nlock);
solockreset(so, nlock);
mutex_exit(olock);
}
static void
unp_free(struct unpcb *unp)
{
	if (unp->unp_addr)
		free(unp->unp_addr, M_SONAME);
	if (unp->unp_streamlock != NULL)
		mutex_obj_free(unp->unp_streamlock);
kmem_free(unp, sizeof(*unp));
}
static int
unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp)
{
struct socket *so2;
const struct sockaddr_un *sun;
/* XXX: server side closed the socket */
if (unp->unp_conn == NULL)
return ECONNREFUSED;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked(so2));
if (unp->unp_addr)
sun = unp->unp_addr;
else
sun = &sun_noname;
	if (unp->unp_conn->unp_flags & UNP_WANTCRED)
		control = unp_addsockcred(curlwp, control);
	if (unp->unp_conn->unp_flags & UNP_OWANTCRED)
		MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
		    stub_compat_70_unp_addsockcred(curlwp, control), control);
if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
control) == 0) {
unp_dispose(control);
m_freem(control);
m_freem(m);
/* Don't call soroverflow because we're returning this
* error directly to the sender. */
so2->so_rcv.sb_overflowed++;
return ENOBUFS;
} else {
sorwakeup(so2);
return 0;
}
}
static void
unp_setaddr(struct socket *so, struct sockaddr *nam, bool peeraddr)
{
const struct sockaddr_un *sun = NULL;
struct unpcb *unp;
KASSERT(solocked(so));
unp = sotounpcb(so);
if (peeraddr) {
if (unp->unp_conn && unp->unp_conn->unp_addr)
sun = unp->unp_conn->unp_addr;
} else {
if (unp->unp_addr)
sun = unp->unp_addr;
}
if (sun == NULL)
sun = &sun_noname;
memcpy(nam, sun, sun->sun_len);
}
static int
unp_rcvd(struct socket *so, int flags, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
struct socket *so2;
u_int newhiwat;
KASSERT(solocked(so));
KASSERT(unp != NULL);
switch (so->so_type) {
case SOCK_DGRAM:
panic("uipc 1");
/*NOTREACHED*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
#define rcv (&so->so_rcv)
#define snd (&so2->so_snd)
if (unp->unp_conn == 0)
break;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked2(so, so2));
/*
* Adjust backpressure on sender
* and wakeup any waiting to write.
*/
snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
unp->unp_mbcnt = rcv->sb_mbcnt;
newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
(void)chgsbsize(so2->so_uidinfo,
&snd->sb_hiwat, newhiwat, RLIM_INFINITY);
unp->unp_cc = rcv->sb_cc;
sowwakeup(so2);
#undef snd
#undef rcv
break;
default:
panic("uipc 2");
}
return 0;
}
static int
unp_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
unp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
int error = 0;
u_int newhiwat;
struct socket *so2;
	KASSERT(solocked(so));
	KASSERT(unp != NULL);
	KASSERT(m != NULL);
/*
* Note: unp_internalize() rejects any control message
* other than SCM_RIGHTS, and only allows one. This
* has the side-effect of preventing a caller from
* forging SCM_CREDS.
*/
if (control) {
sounlock(so);
		error = unp_internalize(&control);
		solock(so);
		if (error != 0) {
m_freem(control);
m_freem(m);
return error;
}
}
switch (so->so_type) {
case SOCK_DGRAM: {
KASSERT(so->so_lock == uipc_lock);
if (nam) {
if ((so->so_state & SS_ISCONNECTED) != 0)
error = EISCONN;
else {
/*
* Note: once connected, the
* socket's lock must not be
* dropped until we have sent
* the message and disconnected.
* This is necessary to prevent
* intervening control ops, like
* another connection.
*/
error = unp_connect(so, nam, l);
}
} else {
if ((so->so_state & SS_ISCONNECTED) == 0)
error = ENOTCONN;
}
if (error) {
unp_dispose(control);
m_freem(control);
m_freem(m);
return error;
}
		error = unp_output(m, control, unp);
		if (nam)
			unp_disconnect1(unp);
break;
}
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
#define rcv (&so2->so_rcv)
#define snd (&so->so_snd)
if (unp->unp_conn == NULL) {
error = ENOTCONN;
break;
}
so2 = unp->unp_conn->unp_socket;
		KASSERT(solocked2(so, so2));
		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
/*
* Credentials are passed only once on
* SOCK_STREAM and SOCK_SEQPACKET.
*/
unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
control = unp_addsockcred(l, control);
}
if (unp->unp_conn->unp_flags & UNP_OWANTCRED) {
/*
* Credentials are passed only once on
* SOCK_STREAM and SOCK_SEQPACKET.
*/
unp->unp_conn->unp_flags &= ~UNP_OWANTCRED;
MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
stub_compat_70_unp_addsockcred(curlwp, control),
control);
}
/*
* Send to paired receive port, and then reduce
* send buffer hiwater marks to maintain backpressure.
* Wake up readers.
*/
if (control) {
if (sbappendcontrol(rcv, m, control) != 0)
control = NULL;
} else {
switch(so->so_type) {
case SOCK_SEQPACKET:
sbappendrecord(rcv, m);
break;
case SOCK_STREAM:
sbappend(rcv, m);
break;
default:
panic("uipc_usrreq");
break;
}
}
snd->sb_mbmax -=
rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
newhiwat = snd->sb_hiwat -
(rcv->sb_cc - unp->unp_conn->unp_cc);
(void)chgsbsize(so->so_uidinfo,
&snd->sb_hiwat, newhiwat, RLIM_INFINITY);
unp->unp_conn->unp_cc = rcv->sb_cc;
sorwakeup(so2);
#undef snd
#undef rcv
		if (control != NULL) {
			unp_dispose(control);
m_freem(control);
}
break;
default:
panic("uipc 4");
}
return error;
}
static int
unp_sendoob(struct socket *so, struct mbuf *m, struct mbuf * control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
/*
* Unix domain socket option processing.
*/
int
uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
struct unpcb *unp = sotounpcb(so);
int optval = 0, error = 0;
KASSERT(solocked(so));
if (sopt->sopt_level != SOL_LOCAL) {
error = ENOPROTOOPT;
} else switch (op) {
case PRCO_SETOPT:
switch (sopt->sopt_name) {
case LOCAL_OCREDS:
if (!compat70_ocreds_valid) {
error = ENOPROTOOPT;
break;
}
/* FALLTHROUGH */
case LOCAL_CREDS:
case LOCAL_CONNWAIT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (sopt->sopt_name) {
#define OPTSET(bit) \
if (optval) \
unp->unp_flags |= (bit); \
else \
unp->unp_flags &= ~(bit);
case LOCAL_CREDS:
OPTSET(UNP_WANTCRED);
break;
case LOCAL_CONNWAIT:
OPTSET(UNP_CONNWAIT);
break;
case LOCAL_OCREDS:
OPTSET(UNP_OWANTCRED);
break;
}
break;
#undef OPTSET
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
sounlock(so);
switch (sopt->sopt_name) {
case LOCAL_PEEREID:
if (unp->unp_flags & UNP_EIDSVALID) {
error = sockopt_set(sopt, &unp->unp_connid,
sizeof(unp->unp_connid));
} else {
error = EINVAL;
}
break;
case LOCAL_CREDS:
#define OPTBIT(bit) (unp->unp_flags & (bit) ? 1 : 0)
optval = OPTBIT(UNP_WANTCRED);
error = sockopt_setint(sopt, optval);
break;
case LOCAL_OCREDS:
if (compat70_ocreds_valid) {
optval = OPTBIT(UNP_OWANTCRED);
error = sockopt_setint(sopt, optval);
break;
}
#undef OPTBIT
/* FALLTHROUGH */
default:
error = ENOPROTOOPT;
break;
}
solock(so);
break;
}
return (error);
}
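/*
 * Userland sketch of the LOCAL_CREDS option handled above, assuming the
 * SOL_LOCAL and LOCAL_CREDS definitions from <sys/un.h>: the receiver
 * enables the option and the kernel then attaches an SCM_CREDS control
 * message (built by unp_addsockcred() below) to incoming data.
 */
#if 0	/* userland example only */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>

static void
want_peer_creds(int s)
{
	int on = 1;

	if (setsockopt(s, SOL_LOCAL, LOCAL_CREDS, &on, sizeof(on)) == -1)
		perror("setsockopt(LOCAL_CREDS)");
}
#endif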
/*
* Both send and receive buffers are allocated PIPSIZ bytes of buffering
* for stream sockets, although the total for sender and receiver is
* actually only PIPSIZ.
* Datagram sockets really use the sendspace as the maximum datagram size,
* and don't really want to reserve the sendspace. Their recvspace should
* be large enough for at least one max-size datagram plus address.
*/
#ifndef PIPSIZ
#define PIPSIZ 8192
#endif
u_long unpst_sendspace = PIPSIZ;
u_long unpst_recvspace = PIPSIZ;
u_long unpdg_sendspace = 2*1024; /* really max datagram size */
u_long unpdg_recvspace = 16*1024;
u_int unp_rights; /* files in flight */
u_int unp_rights_ratio = 2; /* limit, fraction of maxfiles */
static int
unp_attach(struct socket *so, int proto)
{
struct unpcb *unp = sotounpcb(so);
u_long sndspc, rcvspc;
int error;
	KASSERT(unp == NULL);

	switch (so->so_type) {
case SOCK_SEQPACKET:
/* FALLTHROUGH */
case SOCK_STREAM:
if (so->so_lock == NULL) {
so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
solock(so);
}
sndspc = unpst_sendspace;
rcvspc = unpst_recvspace;
break;
case SOCK_DGRAM:
if (so->so_lock == NULL) {
mutex_obj_hold(uipc_lock);
so->so_lock = uipc_lock;
solock(so);
}
sndspc = unpdg_sendspace;
rcvspc = unpdg_recvspace;
break;
default:
panic("unp_attach");
}
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, sndspc, rcvspc);
if (error) {
return error;
}
}
unp = kmem_zalloc(sizeof(*unp), KM_SLEEP);
nanotime(&unp->unp_ctime);
unp->unp_socket = so;
so->so_pcb = unp;
KASSERT(solocked(so));
return 0;
}
static void
unp_detach(struct socket *so)
{
struct unpcb *unp;
vnode_t *vp;
unp = sotounpcb(so);
	KASSERT(unp != NULL);
	KASSERT(solocked(so));
retry:
if ((vp = unp->unp_vnode) != NULL) {
sounlock(so);
/* Acquire v_interlock to protect against unp_connect(). */
/* XXXAD racy */
mutex_enter(vp->v_interlock);
vp->v_socket = NULL;
mutex_exit(vp->v_interlock);
vrele(vp);
solock(so);
unp->unp_vnode = NULL;
}
	if (unp->unp_conn)
		unp_disconnect1(unp);
while (unp->unp_refs) {
		KASSERT(solocked2(so, unp->unp_refs->unp_socket));
		if (unp_drop(unp->unp_refs, ECONNRESET)) {
			solock(so);
goto retry;
}
}
soisdisconnected(so);
so->so_pcb = NULL;
if (unp_rights) {
/*
* Normally the receive buffer is flushed later, in sofree,
* but if our receive buffer holds references to files that
* are now garbage, we will enqueue those file references to
* the garbage collector and kick it into action.
*/
sorflush(so);
		unp_free(unp);
		unp_thread_kick();
} else
unp_free(unp);
}
static int
unp_accept(struct socket *so, struct sockaddr *nam)
{
struct unpcb *unp = sotounpcb(so);
struct socket *so2;
KASSERT(solocked(so));
KASSERT(nam != NULL);
/* XXX code review required to determine if unp can ever be NULL */
if (unp == NULL)
return EINVAL;
KASSERT(so->so_lock == uipc_lock);
/*
* Mark the initiating STREAM socket as connected *ONLY*
* after it's been accepted. This prevents a client from
* overrunning a server and receiving ECONNREFUSED.
*/
if (unp->unp_conn == NULL) {
/*
* This will use the empty socket and will not
* allocate.
*/
unp_setaddr(so, nam, true);
return 0;
}
so2 = unp->unp_conn->unp_socket;
if (so2->so_state & SS_ISCONNECTING) {
KASSERT(solocked2(so, so->so_head));
KASSERT(solocked2(so2, so->so_head));
soisconnected(so2);
}
/*
* If the connection is fully established, break the
* association with uipc_lock and give the connected
* pair a separate lock to share.
* There is a race here: sotounpcb(so2)->unp_streamlock
* is not locked, so when changing so2->so_lock
* another thread can grab it while so->so_lock is still
* pointing to the (locked) uipc_lock.
	 * This should be harmless, except that it makes
	 * solocked2() and solocked() unreliable.
	 * Another problem is that unp_setaddr() expects the
	 * socket locked. Grabbing sotounpcb(so2)->unp_streamlock
* fixes both issues.
*/
mutex_enter(sotounpcb(so2)->unp_streamlock);
unp_setpeerlocks(so2, so);
/*
* Only now return peer's address, as we may need to
* block in order to allocate memory.
*
* XXX Minor race: connection can be broken while
* lock is dropped in unp_setaddr(). We will return
* error == 0 and sun_noname as the peer address.
*/
unp_setaddr(so, nam, true);
/* so_lock now points to unp_streamlock */
mutex_exit(so2->so_lock);
return 0;
}
static int
unp_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp)
{
return EOPNOTSUPP;
}
static int
unp_stat(struct socket *so, struct stat *ub)
{
struct unpcb *unp;
struct socket *so2;
KASSERT(solocked(so));
unp = sotounpcb(so);
if (unp == NULL)
return EINVAL;
ub->st_blksize = so->so_snd.sb_hiwat;
switch (so->so_type) {
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
if (unp->unp_conn == 0)
break;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked2(so, so2));
ub->st_blksize += so2->so_rcv.sb_cc;
break;
default:
break;
}
ub->st_dev = NODEV;
if (unp->unp_ino == 0)
unp->unp_ino = unp_ino++;
ub->st_atimespec = ub->st_mtimespec = ub->st_ctimespec = unp->unp_ctime;
ub->st_ino = unp->unp_ino;
ub->st_uid = so->so_uidinfo->ui_uid;
ub->st_gid = so->so_egid;
return (0);
}
static int
unp_peeraddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
KASSERT(nam != NULL);
unp_setaddr(so, nam, true);
return 0;
}
static int
unp_sockaddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
KASSERT(nam != NULL);
unp_setaddr(so, nam, false);
return 0;
}
/*
* we only need to perform this allocation until syscalls other than
* bind are adjusted to use sockaddr_big.
*/
static struct sockaddr_un *
makeun_sb(struct sockaddr *nam, size_t *addrlen)
{
struct sockaddr_un *sun;
*addrlen = nam->sa_len + 1;
sun = malloc(*addrlen, M_SONAME, M_WAITOK);
memcpy(sun, nam, nam->sa_len);
*(((char *)sun) + nam->sa_len) = '\0';
return sun;
}
static int
unp_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct sockaddr_un *sun;
struct unpcb *unp;
vnode_t *vp;
struct vattr vattr;
size_t addrlen;
int error;
struct pathbuf *pb;
struct nameidata nd;
proc_t *p;
unp = sotounpcb(so);
	KASSERT(solocked(so));
	KASSERT(unp != NULL);
	KASSERT(nam != NULL);

	if (unp->unp_vnode != NULL)
return (EINVAL);
if ((unp->unp_flags & UNP_BUSY) != 0) {
/*
* EALREADY may not be strictly accurate, but since this
* is a major application error it's hardly a big deal.
*/
return (EALREADY);
}
unp->unp_flags |= UNP_BUSY;
sounlock(so);
p = l->l_proc;
sun = makeun_sb(nam, &addrlen);
pb = pathbuf_create(sun->sun_path);
if (pb == NULL) {
error = ENOMEM;
goto bad;
}
NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, pb);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto bad;
}
vp = nd.ni_vp;
if (vp != NULL) {
VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
if (nd.ni_dvp == vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(vp);
pathbuf_destroy(pb);
error = EADDRINUSE;
goto bad;
}
vattr_null(&vattr);
vattr.va_type = VSOCK;
vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
if (error) {
vput(nd.ni_dvp);
pathbuf_destroy(pb);
goto bad;
}
vp = nd.ni_vp;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
solock(so);
vp->v_socket = unp->unp_socket;
unp->unp_vnode = vp;
unp->unp_addrlen = addrlen;
unp->unp_addr = sun;
VOP_UNLOCK(vp);
vput(nd.ni_dvp);
unp->unp_flags &= ~UNP_BUSY;
pathbuf_destroy(pb);
return (0);
bad:
free(sun, M_SONAME);
solock(so);
unp->unp_flags &= ~UNP_BUSY;
return (error);
}
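/*
 * Userland counterpart of unp_bind(), as a minimal sketch: the path in
 * sun_path becomes a VSOCK node on the file system, and an existing node
 * yields EADDRINUSE, which is why servers usually unlink() first.  Error
 * handling is trimmed and the path is a placeholder.
 */
#if 0	/* userland example only */
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>
#include <unistd.h>

static int
listen_on(const char *path)
{
	struct sockaddr_un sun;
	int s;

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1)
		return -1;
	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
	(void)unlink(path);
	if (bind(s, (struct sockaddr *)&sun, sizeof(sun)) == -1 ||
	    listen(s, 5) == -1) {
		close(s);
		return -1;
	}
	return s;
}
#endif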
static int
unp_listen(struct socket *so, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
	KASSERT(solocked(so));
	KASSERT(unp != NULL);
/*
* If the socket can accept a connection, it must be
* locked by uipc_lock.
*/
unp_resetlock(so);
if (unp->unp_vnode == NULL)
return EINVAL;
unp_connid(l, unp, UNP_EIDSBIND);
return 0;
}
static int
unp_disconnect(struct socket *so)
{
	KASSERT(solocked(so));
	KASSERT(sotounpcb(so) != NULL);
unp_disconnect1(sotounpcb(so));
return 0;
}
static int
unp_shutdown(struct socket *so)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
socantsendmore(so);
unp_shutdown1(sotounpcb(so));
return 0;
}
static int
unp_abort(struct socket *so)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
(void)unp_drop(sotounpcb(so), ECONNABORTED);
KASSERT(so->so_head == NULL);
KASSERT(so->so_pcb != NULL);
unp_detach(so);
return 0;
}
static int
unp_connect1(struct socket *so, struct socket *so2, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
struct unpcb *unp2;
if (so2->so_type != so->so_type)
return EPROTOTYPE;
/*
* All three sockets involved must be locked by same lock:
*
* local endpoint (so)
* remote endpoint (so2)
* queue head (so2->so_head, only if PR_CONNREQUIRED)
*/
	KASSERT(solocked2(so, so2));
	KASSERT(so->so_head == NULL);
	if (so2->so_head != NULL) {
		KASSERT(so2->so_lock == uipc_lock);
		KASSERT(solocked2(so2, so2->so_head));
}
unp2 = sotounpcb(so2);
unp->unp_conn = unp2;
switch (so->so_type) {
case SOCK_DGRAM:
unp->unp_nextref = unp2->unp_refs;
unp2->unp_refs = unp;
soisconnected(so);
break;
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
/*
* SOCK_SEQPACKET and SOCK_STREAM cases are handled by callers
* which are unp_connect() or unp_connect2().
*/
break;
default:
panic("unp_connect1");
}
return 0;
}
int
unp_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct sockaddr_un *sun;
vnode_t *vp;
struct socket *so2, *so3;
struct unpcb *unp, *unp2, *unp3;
size_t addrlen;
int error;
struct pathbuf *pb;
struct nameidata nd;
unp = sotounpcb(so);
if ((unp->unp_flags & UNP_BUSY) != 0) {
/*
* EALREADY may not be strictly accurate, but since this
* is a major application error it's hardly a big deal.
*/
return (EALREADY);
}
unp->unp_flags |= UNP_BUSY;
sounlock(so);
sun = makeun_sb(nam, &addrlen);
pb = pathbuf_create(sun->sun_path);
if (pb == NULL) {
error = ENOMEM;
goto bad2;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto bad2;
}
vp = nd.ni_vp;
pathbuf_destroy(pb);
if (vp->v_type != VSOCK) {
error = ENOTSOCK;
goto bad;
}
if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0)
goto bad;
/* Acquire v_interlock to protect against unp_detach(). */
mutex_enter(vp->v_interlock);
so2 = vp->v_socket;
if (so2 == NULL) {
mutex_exit(vp->v_interlock);
error = ECONNREFUSED;
goto bad;
}
if (so->so_type != so2->so_type) {
mutex_exit(vp->v_interlock);
error = EPROTOTYPE;
goto bad;
}
solock(so);
unp_resetlock(so);
mutex_exit(vp->v_interlock);
if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
/*
* This may seem somewhat fragile but is OK: if we can
* see SO_ACCEPTCONN set on the endpoint, then it must
* be locked by the domain-wide uipc_lock.
*/
KASSERT((so2->so_options & SO_ACCEPTCONN) == 0 ||
so2->so_lock == uipc_lock);
if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
(so3 = sonewconn(so2, false)) == NULL) {
error = ECONNREFUSED;
sounlock(so);
goto bad;
}
unp2 = sotounpcb(so2);
unp3 = sotounpcb(so3);
if (unp2->unp_addr) {
unp3->unp_addr = malloc(unp2->unp_addrlen,
M_SONAME, M_WAITOK);
memcpy(unp3->unp_addr, unp2->unp_addr,
unp2->unp_addrlen);
unp3->unp_addrlen = unp2->unp_addrlen;
}
unp3->unp_flags = unp2->unp_flags;
so2 = so3;
/*
* The connector's (client's) credentials are copied from its
* process structure at the time of connect() (which is now).
*/
unp_connid(l, unp3, UNP_EIDSVALID);
/*
* The receiver's (server's) credentials are copied from the
* unp_peercred member of socket on which the former called
* listen(); unp_listen() cached that process's credentials
* at that time so we can use them now.
*/
if (unp2->unp_flags & UNP_EIDSBIND) {
memcpy(&unp->unp_connid, &unp2->unp_connid,
sizeof(unp->unp_connid));
unp->unp_flags |= UNP_EIDSVALID;
}
}
error = unp_connect1(so, so2, l);
if (error) {
sounlock(so);
goto bad;
}
unp2 = sotounpcb(so2);
switch (so->so_type) {
/*
* SOCK_DGRAM and default cases are handled in prior call to
* unp_connect1(), do not add a default case without fixing
* unp_connect1().
*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
unp2->unp_conn = unp;
if ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)
soisconnecting(so);
else
soisconnected(so);
soisconnected(so2);
/*
* If the connection is fully established, break the
* association with uipc_lock and give the connected
* pair a separate lock to share.
*/
KASSERT(so2->so_head != NULL);
unp_setpeerlocks(so, so2);
break;
}
sounlock(so);
bad:
vput(vp);
bad2:
free(sun, M_SONAME);
solock(so);
unp->unp_flags &= ~UNP_BUSY;
return (error);
}
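/*
 * Userland counterpart of unp_connect(), as a fragment: namei() resolves
 * the path to the socket node and VOP_ACCESS(VWRITE) must pass, so the
 * caller needs write permission on the socket file.  The path is a
 * placeholder and error handling is trimmed.
 */
#if 0	/* userland example only */
	struct sockaddr_un sun;
	int s = socket(AF_LOCAL, SOCK_STREAM, 0);

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path));
	if (connect(s, (struct sockaddr *)&sun, sizeof(sun)) == -1)
		warn("connect");	/* e.g. ECONNREFUSED, EPROTOTYPE */
#endif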
int
unp_connect2(struct socket *so, struct socket *so2)
{
struct unpcb *unp = sotounpcb(so);
struct unpcb *unp2;
int error = 0;
KASSERT(solocked2(so, so2));
error = unp_connect1(so, so2, curlwp);
if (error)
return error;
unp2 = sotounpcb(so2);
switch (so->so_type) {
/*
* SOCK_DGRAM and default cases are handled in prior call to
* unp_connect1(), do not add a default case without fixing
* unp_connect1().
*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
unp2->unp_conn = unp;
soisconnected(so);
soisconnected(so2);
break;
}
return error;
}
static void
unp_disconnect1(struct unpcb *unp)
{
struct unpcb *unp2 = unp->unp_conn;
struct socket *so;
if (unp2 == 0)
return;
unp->unp_conn = 0;
so = unp->unp_socket;
switch (so->so_type) {
case SOCK_DGRAM:
if (unp2->unp_refs == unp)
unp2->unp_refs = unp->unp_nextref;
else {
unp2 = unp2->unp_refs;
for (;;) {
KASSERT(solocked2(so, unp2->unp_socket));
if (unp2 == 0)
panic("unp_disconnect1");
if (unp2->unp_nextref == unp)
break;
unp2 = unp2->unp_nextref;
}
unp2->unp_nextref = unp->unp_nextref;
}
unp->unp_nextref = 0;
so->so_state &= ~SS_ISCONNECTED;
break;
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
KASSERT(solocked2(so, unp2->unp_socket));
soisdisconnected(so);
unp2->unp_conn = 0;
soisdisconnected(unp2->unp_socket);
break;
}
}
static void
unp_shutdown1(struct unpcb *unp)
{
struct socket *so;
switch(unp->unp_socket->so_type) {
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
socantrcvmore(so);
break;
default:
break;
}
}
static bool
unp_drop(struct unpcb *unp, int errno)
{
struct socket *so = unp->unp_socket;
KASSERT(solocked(so));
so->so_error = errno;
unp_disconnect1(unp);
if (so->so_head) {
so->so_pcb = NULL;
/* sofree() drops the socket lock */
sofree(so);
unp_free(unp);
return true;
}
return false;
}
#ifdef notdef
static void
unp_drain(void)
{
}
#endif
int
unp_externalize(struct mbuf *rights, struct lwp *l, int flags)
{
struct cmsghdr * const cm = mtod(rights, struct cmsghdr *);
struct proc * const p = l->l_proc;
file_t **rp;
int error = 0;
const size_t nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
sizeof(file_t *);
if (nfds == 0)
goto noop;
int * const fdp = kmem_alloc(nfds * sizeof(int), KM_SLEEP);
rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
/* Make sure the recipient should be able to see the files.. */
rp = (file_t **)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++) {
file_t * const fp = *rp++;
if (fp == NULL) {
error = EINVAL;
goto out;
}
/*
* If we are in a chroot'ed directory, and
* someone wants to pass us a directory, make
* sure it's inside the subtree we're allowed
* to access.
*/
if (p->p_cwdi->cwdi_rdir != NULL && fp->f_type == DTYPE_VNODE) {
vnode_t *vp = fp->f_vnode;
if ((vp->v_type == VDIR) &&
!vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
error = EPERM;
goto out;
}
}
}
restart:
/*
* First loop -- allocate file descriptor table slots for the
* new files.
*/
for (size_t i = 0; i < nfds; i++) {
if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
/*
* Back out what we've done so far.
*/
while (i-- > 0) {
fd_abort(p, NULL, fdp[i]);
}
if (error == ENOSPC) {
fd_tryexpand(p);
error = 0;
goto restart;
}
/*
* This is the error that has historically
* been returned, and some callers may
* expect it.
*/
error = EMSGSIZE;
goto out;
}
}
/*
* Now that adding them has succeeded, update all of the
* file passing state and affix the descriptors.
*/
rp = (file_t **)CMSG_DATA(cm);
int *ofdp = (int *)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++) {
file_t * const fp = *rp++;
const int fd = fdp[i];
atomic_dec_uint(&unp_rights);
fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
fd_affix(p, fp, fd);
/*
		 * Done with this file pointer, replace it with an fd.
*/
*ofdp++ = fd;
mutex_enter(&fp->f_lock);
fp->f_msgcount--;
mutex_exit(&fp->f_lock);
/*
* Note that fd_affix() adds a reference to the file.
* The file may already have been closed by another
* LWP in the process, so we must drop the reference
* added by unp_internalize() with closef().
*/
closef(fp);
}
/*
* Adjust length, in case of transition from large file_t
* pointers to ints.
*/
if (sizeof(file_t *) != sizeof(int)) {
cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
rights->m_len = CMSG_SPACE(nfds * sizeof(int));
}
out:
if (__predict_false(error != 0)) {
file_t **const fpp = (file_t **)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++)
unp_discard_now(fpp[i]);
/*
* Truncate the array so that nobody will try to interpret
* what is now garbage in it.
*/
cm->cmsg_len = CMSG_LEN(0);
rights->m_len = CMSG_SPACE(0);
}
rw_exit(&p->p_cwdi->cwdi_lock);
kmem_free(fdp, nfds * sizeof(int));
noop:
/*
* Don't disclose kernel memory in the alignment space.
*/
KASSERT(cm->cmsg_len <= rights->m_len);
memset(&mtod(rights, char *)[cm->cmsg_len], 0, rights->m_len -
cm->cmsg_len);
return error;
}
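/*
 * Userland view of what unp_externalize() produces, as a minimal sketch:
 * the SCM_RIGHTS control message now carries descriptor numbers usable by
 * the receiver.  This assumes one descriptor per message; error handling
 * is trimmed.
 */
#if 0	/* userland example only */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
recv_fd(int s)
{
	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
	char byte;
	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
	struct msghdr msg;
	struct cmsghdr *cmsg;
	int fd = -1;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cm.buf;
	msg.msg_controllen = sizeof(cm.buf);
	if (recvmsg(s, &msg, 0) == -1)
		return -1;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	}
	return fd;
}
#endif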
static int
unp_internalize(struct mbuf **controlp)
{
filedesc_t *fdescp = curlwp->l_fd;
fdtab_t *dt;
struct mbuf *control = *controlp;
struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
file_t **rp, **files;
file_t *fp;
int i, fd, *fdp;
int nfds, error;
u_int maxmsg;
error = 0;
newcm = NULL;
/* Sanity check the control message header. */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    cm->cmsg_len > control->m_len ||
cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
return (EINVAL);
/*
* Verify that the file descriptors are valid, and acquire
* a reference to each.
*/
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
fdp = (int *)CMSG_DATA(cm);
maxmsg = maxfiles / unp_rights_ratio;
for (i = 0; i < nfds; i++) {
fd = *fdp++;
if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
atomic_dec_uint(&unp_rights);
nfds = i;
error = EAGAIN;
goto out;
}
if ((fp = fd_getfile(fd)) == NULL
|| fp->f_type == DTYPE_KQUEUE) {
if (fp)
fd_putfile(fd);
atomic_dec_uint(&unp_rights);
nfds = i;
error = EBADF;
goto out;
}
}
/* Allocate new space and copy header into it. */
newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
if (newcm == NULL) {
error = E2BIG;
goto out;
}
memcpy(newcm, cm, sizeof(struct cmsghdr));
memset(newcm + 1, 0, CMSG_LEN(0) - sizeof(struct cmsghdr));
files = (file_t **)CMSG_DATA(newcm);
/*
* Transform the file descriptors into file_t pointers, in
* reverse order so that if pointers are bigger than ints, the
	 * int won't get overwritten until we're done. No need to lock, as we have
* already validated the descriptors with fd_getfile().
*/
fdp = (int *)CMSG_DATA(cm) + nfds;
rp = files + nfds;
	for (i = 0; i < nfds; i++) {
		dt = atomic_load_consume(&fdescp->fd_dt);
		fp = atomic_load_consume(&dt->dt_ff[*--fdp]->ff_file);
		KASSERT(fp != NULL);
mutex_enter(&fp->f_lock);
*--rp = fp;
fp->f_count++;
fp->f_msgcount++;
mutex_exit(&fp->f_lock);
}
out:
/* Release descriptor references. */
fdp = (int *)CMSG_DATA(cm);
for (i = 0; i < nfds; i++) {
fd_putfile(*fdp++);
		if (error != 0) {
			atomic_dec_uint(&unp_rights);
}
}
	if (error == 0) {
		if (control->m_flags & M_EXT) {
m_freem(control);
*controlp = control = m_get(M_WAIT, MT_CONTROL);
}
MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
M_MBUF, NULL, NULL);
cm = newcm;
/*
* Adjust message & mbuf to note amount of space
* actually used.
*/
cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
}
return error;
}
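/*
 * Userland view of what unp_internalize() consumes, as a minimal sketch:
 * a single SCM_RIGHTS control message at SOL_SOCKET carrying an array of
 * ints (here just one).  Error handling is trimmed.
 */
#if 0	/* userland example only */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
send_fd(int s, int fd)
{
	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
	char byte = 0;
	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cm.buf;
	msg.msg_controllen = sizeof(cm.buf);
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	return sendmsg(s, &msg, 0) == -1 ? -1 : 0;
}
#endif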
struct mbuf *
unp_addsockcred(struct lwp *l, struct mbuf *control)
{
struct sockcred *sc;
struct mbuf *m;
void *p;
m = sbcreatecontrol1(&p, SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)),
SCM_CREDS, SOL_SOCKET, M_WAITOK);
if (m == NULL)
return control;
sc = p;
sc->sc_pid = l->l_proc->p_pid;
sc->sc_uid = kauth_cred_getuid(l->l_cred);
sc->sc_euid = kauth_cred_geteuid(l->l_cred);
sc->sc_gid = kauth_cred_getgid(l->l_cred);
sc->sc_egid = kauth_cred_getegid(l->l_cred);
sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
for (int i = 0; i < sc->sc_ngroups; i++)
sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);
return m_add(control, m);
}
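/*
 * Userland sketch of reading the struct sockcred built above, as a
 * fragment continuing the recvmsg() loop from the SCM_RIGHTS example:
 * the message is variable-length (SOCKCREDSIZE()), so the group list
 * follows the fixed fields.
 */
#if 0	/* userland example only */
	/* "cmsg" is a struct cmsghdr * obtained from recvmsg(), as above. */
	if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDS) {
		const struct sockcred *sc = (const void *)CMSG_DATA(cmsg);

		printf("peer pid %d euid %d egid %d, %d groups\n",
		    (int)sc->sc_pid, (int)sc->sc_euid, (int)sc->sc_egid,
		    sc->sc_ngroups);
	}
#endif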
/*
* Do a mark-sweep GC of files in the system, to free up any which are
* caught in flight to an about-to-be-closed socket. Additionally,
* process deferred file closures.
*/
static void
unp_gc(file_t *dp)
{
extern struct domain unixdomain;
file_t *fp, *np;
struct socket *so, *so1;
u_int i, oflags, rflags;
bool didwork;
KASSERT(curlwp == unp_thread_lwp);
KASSERT(mutex_owned(&filelist_lock));
/*
* First, process deferred file closures.
*/
while (!SLIST_EMPTY(&unp_thread_discard)) {
fp = SLIST_FIRST(&unp_thread_discard);
KASSERT(fp->f_unpcount > 0);
KASSERT(fp->f_count > 0);
KASSERT(fp->f_msgcount > 0);
KASSERT(fp->f_count >= fp->f_unpcount);
KASSERT(fp->f_count >= fp->f_msgcount);
KASSERT(fp->f_msgcount >= fp->f_unpcount);
SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
i = fp->f_unpcount;
fp->f_unpcount = 0;
mutex_exit(&filelist_lock);
for (; i != 0; i--) {
unp_discard_now(fp);
}
mutex_enter(&filelist_lock);
}
/*
* Clear mark bits. Ensure that we don't consider new files
* entering the file table during this loop (they will not have
* FSCAN set).
*/
unp_defer = 0;
LIST_FOREACH(fp, &filehead, f_list) {
for (oflags = fp->f_flag;; oflags = rflags) {
rflags = atomic_cas_uint(&fp->f_flag, oflags,
(oflags | FSCAN) & ~(FMARK|FDEFER));
if (__predict_true(oflags == rflags)) {
break;
}
}
}
/*
* Iterate over the set of sockets, marking ones believed (based on
* refcount) to be referenced from a process, and marking for rescan
 * sockets which are queued on a socket. Rescan continues descending
* and searching for sockets referenced by sockets (FDEFER), until
* there are no more socket->socket references to be discovered.
*/
do {
didwork = false;
for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
KASSERT(mutex_owned(&filelist_lock));
np = LIST_NEXT(fp, f_list);
mutex_enter(&fp->f_lock);
if ((fp->f_flag & FDEFER) != 0) {
atomic_and_uint(&fp->f_flag, ~FDEFER);
unp_defer--;
if (fp->f_count == 0) {
/*
* XXX: closef() doesn't pay attention
* to FDEFER
*/
mutex_exit(&fp->f_lock);
continue;
}
} else {
if (fp->f_count == 0 ||
(fp->f_flag & FMARK) != 0 ||
fp->f_count == fp->f_msgcount ||
fp->f_unpcount != 0) {
mutex_exit(&fp->f_lock);
continue;
}
}
atomic_or_uint(&fp->f_flag, FMARK);
if (fp->f_type != DTYPE_SOCKET ||
(so = fp->f_socket) == NULL ||
so->so_proto->pr_domain != &unixdomain ||
(so->so_proto->pr_flags & PR_RIGHTS) == 0) {
mutex_exit(&fp->f_lock);
continue;
}
/* Gain file ref, mark our position, and unlock. */
didwork = true;
LIST_INSERT_AFTER(fp, dp, f_list);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&filelist_lock);
/*
* Mark files referenced from sockets queued on the
* accept queue as well.
*/
solock(so);
unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
if ((so->so_options & SO_ACCEPTCONN) != 0) {
TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
}
TAILQ_FOREACH(so1, &so->so_q, so_qe) {
unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
}
}
sounlock(so);
/* Re-lock and restart from where we left off. */
closef(fp);
mutex_enter(&filelist_lock);
np = LIST_NEXT(dp, f_list);
LIST_REMOVE(dp, f_list);
}
/*
* Bail early if we did nothing in the loop above. Could
* happen because of concurrent activity causing unp_defer
* to get out of sync.
*/
} while (unp_defer != 0 && didwork);
/*
* Sweep pass.
*
* We grab an extra reference to each of the files that are
* not otherwise accessible and then free the rights that are
* stored in messages on them.
*/
for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
KASSERT(mutex_owned(&filelist_lock));
np = LIST_NEXT(fp, f_list);
mutex_enter(&fp->f_lock);
/*
* Ignore non-sockets.
* Ignore dead sockets, or sockets with pending close.
* Ignore sockets obviously referenced elsewhere.
* Ignore sockets marked as referenced by our scan.
* Ignore new sockets that did not exist during the scan.
*/
if (fp->f_type != DTYPE_SOCKET ||
fp->f_count == 0 || fp->f_unpcount != 0 ||
fp->f_count != fp->f_msgcount ||
(fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
mutex_exit(&fp->f_lock);
continue;
}
/* Gain file ref, mark our position, and unlock. */
LIST_INSERT_AFTER(fp, dp, f_list);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&filelist_lock);
/*
* Flush all data from the socket's receive buffer.
* This will cause files referenced only by the
* socket to be queued for close.
*/
so = fp->f_socket;
solock(so);
sorflush(so);
sounlock(so);
/* Re-lock and restart from where we left off. */
closef(fp);
mutex_enter(&filelist_lock);
np = LIST_NEXT(dp, f_list);
LIST_REMOVE(dp, f_list);
}
}
/*
* Garbage collector thread. While SCM_RIGHTS messages are in transit,
* wake once per second to garbage collect. Run continually while we
* have deferred closes to process.
*/
static void
unp_thread(void *cookie)
{
file_t *dp;
/* Allocate a dummy file for our scans. */
if ((dp = fgetdummy()) == NULL) {
panic("unp_thread");
}
mutex_enter(&filelist_lock);
for (;;) {
KASSERT(mutex_owned(&filelist_lock));
if (SLIST_EMPTY(&unp_thread_discard)) {
if (unp_rights != 0) {
(void)cv_timedwait(&unp_thread_cv,
&filelist_lock, hz);
} else {
cv_wait(&unp_thread_cv, &filelist_lock);
}
}
unp_gc(dp);
}
/* NOTREACHED */
}
/*
* Kick the garbage collector into action if there is something for
* it to process.
*/
static void
unp_thread_kick(void)
{
	if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
		mutex_enter(&filelist_lock);
cv_signal(&unp_thread_cv);
mutex_exit(&filelist_lock);
}
}
void
unp_dispose(struct mbuf *m)
{
	if (m)
		unp_scan(m, unp_discard_later, 1);
}
void
unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
{
struct mbuf *m;
file_t **rp, *fp;
struct cmsghdr *cm;
int i, qfds;
	while (m0) {
		for (m = m0; m; m = m->m_next) {
			if (m->m_type != MT_CONTROL ||
m->m_len < sizeof(*cm)) {
continue;
}
cm = mtod(m, struct cmsghdr *);
if (cm->cmsg_level != SOL_SOCKET ||
cm->cmsg_type != SCM_RIGHTS)
continue;
qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
/ sizeof(file_t *);
rp = (file_t **)CMSG_DATA(cm);
for (i = 0; i < qfds; i++) {
fp = *rp;
if (discard) {
*rp = 0;
}
(*op)(fp);
rp++;
}
}
m0 = m0->m_nextpkt;
}
}
void
unp_mark(file_t *fp)
{
if (fp == NULL)
return;
/* If we're already deferred, don't screw up the defer count */
mutex_enter(&fp->f_lock);
if (fp->f_flag & (FMARK | FDEFER)) {
mutex_exit(&fp->f_lock);
return;
}
/*
* Minimize the number of deferrals... Sockets are the only type of
* file which can hold references to another file, so just mark
* other files, and defer unmarked sockets for the next pass.
*/
if (fp->f_type == DTYPE_SOCKET) {
unp_defer++;
KASSERT(fp->f_count != 0);
atomic_or_uint(&fp->f_flag, FDEFER);
} else {
atomic_or_uint(&fp->f_flag, FMARK);
}
mutex_exit(&fp->f_lock);
}
static void
unp_discard_now(file_t *fp)
{
if (fp == NULL)
return;
KASSERT(fp->f_count > 0);
KASSERT(fp->f_msgcount > 0);
mutex_enter(&fp->f_lock);
fp->f_msgcount--;
mutex_exit(&fp->f_lock);
atomic_dec_uint(&unp_rights);
(void)closef(fp);
}
static void
unp_discard_later(file_t *fp)
{
if (fp == NULL)
return;
	KASSERT(fp->f_count > 0);
	KASSERT(fp->f_msgcount > 0);
mutex_enter(&filelist_lock);
	if (fp->f_unpcount++ == 0) {
		SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
}
mutex_exit(&filelist_lock);
}
static void
unp_sysctl_create(void)
{
KASSERT(usrreq_sysctllog == NULL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "sendspace",
SYSCTL_DESCR("Default stream send space"),
NULL, 0, &unpst_sendspace, 0,
CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "recvspace",
SYSCTL_DESCR("Default stream recv space"),
NULL, 0, &unpst_recvspace, 0,
CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "sendspace",
SYSCTL_DESCR("Default datagram send space"),
NULL, 0, &unpdg_sendspace, 0,
CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "recvspace",
SYSCTL_DESCR("Default datagram recv space"),
NULL, 0, &unpdg_recvspace, 0,
CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "inflight",
SYSCTL_DESCR("File descriptors in flight"),
NULL, 0, &unp_rights, 0,
CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "deferred",
SYSCTL_DESCR("File descriptors deferred for close"),
NULL, 0, &unp_defer, 0,
CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL);
}
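/*
 * The nodes created above land under net.local.* (PF_LOCAL, then the
 * socket type).  A userland sketch of reading one of them with
 * sysctlbyname(3); error handling is trimmed.
 */
#if 0	/* userland example only */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

static void
show_stream_sendspace(void)
{
	u_long space;
	size_t len = sizeof(space);

	if (sysctlbyname("net.local.stream.sendspace", &space, &len,
	    NULL, 0) == 0)
		printf("net.local.stream.sendspace = %lu\n", space);
}
#endif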
const struct pr_usrreqs unp_usrreqs = {
.pr_attach = unp_attach,
.pr_detach = unp_detach,
.pr_accept = unp_accept,
.pr_bind = unp_bind,
.pr_listen = unp_listen,
.pr_connect = unp_connect,
.pr_connect2 = unp_connect2,
.pr_disconnect = unp_disconnect,
.pr_shutdown = unp_shutdown,
.pr_abort = unp_abort,
.pr_ioctl = unp_ioctl,
.pr_stat = unp_stat,
.pr_peeraddr = unp_peeraddr,
.pr_sockaddr = unp_sockaddr,
.pr_rcvd = unp_rcvd,
.pr_recvoob = unp_recvoob,
.pr_send = unp_send,
.pr_sendoob = unp_sendoob,
};
/* $NetBSD: pktqueue.c,v 1.22 2023/05/28 08:09:34 andvar Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The packet queue (pktqueue) interface is a lockless IP input queue
* which also abstracts and handles network ISR scheduling. It provides
* a mechanism to enable receiver-side packet steering (RPS).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.22 2023/05/28 08:09:34 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/pcq.h>
#include <sys/intr.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/xcall.h>
#include <sys/once.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <net/pktqueue.h>
#include <net/rss_config.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
struct pktqueue {
/*
* The lock used for a barrier mechanism. The barrier counter,
* as well as the drop counter, are managed atomically though.
* Ensure this group is in a separate cache line.
*/
union {
struct {
kmutex_t pq_lock;
volatile u_int pq_barrier;
};
uint8_t _pad[COHERENCY_UNIT];
};
/* The size of the queue, counters and the interrupt handler. */
u_int pq_maxlen;
percpu_t * pq_counters;
void * pq_sih;
/* The per-CPU queues. */
struct percpu * pq_pcq; /* struct pcq * */
/* The linkage on the list of all pktqueues. */
LIST_ENTRY(pktqueue) pq_list;
};
/* The counters of the packet queue. */
#define PQCNT_ENQUEUE 0
#define PQCNT_DEQUEUE 1
#define PQCNT_DROP 2
#define PQCNT_NCOUNTERS 3
typedef struct {
uint64_t count[PQCNT_NCOUNTERS];
} pktq_counters_t;
/* Special marker value used by pktq_barrier() mechanism. */
#define PKTQ_MARKER ((void *)(~0ULL))
/*
* This is a list of all pktqueues. This list is used by
* pktq_ifdetach() to issue a barrier on every pktqueue.
*
* The r/w lock is acquired for writing in pktq_create() and
* pktq_destroy(), and for reading in pktq_ifdetach().
*
* This list is not performance critical, and will seldom be
* accessed.
*/
static LIST_HEAD(, pktqueue) pktqueue_list __read_mostly;
static krwlock_t pktqueue_list_lock __read_mostly;
static once_t pktqueue_list_init_once __read_mostly;
static int
pktqueue_list_init(void)
{
LIST_INIT(&pktqueue_list);
rw_init(&pktqueue_list_lock);
return 0;
}
static void
pktq_init_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
struct pcq **qp = vqp;
struct pktqueue *pq = vpq;
*qp = pcq_create(pq->pq_maxlen, KM_SLEEP);
}
static void
pktq_fini_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
struct pcq **qp = vqp, *q = *qp;
KASSERT(pcq_peek(q) == NULL);
pcq_destroy(q);
*qp = NULL; /* paranoia */
}
static struct pcq *
pktq_pcq(struct pktqueue *pq, struct cpu_info *ci)
{
struct pcq **qp, *q;
/*
* As long as preemption is disabled, the xcall to swap percpu
* buffers can't complete, so it is safe to read the pointer.
*/
KASSERT(kpreempt_disabled());
qp = percpu_getptr_remote(pq->pq_pcq, ci);
q = *qp;
return q;
}
pktqueue_t *
pktq_create(size_t maxlen, void (*intrh)(void *), void *sc)
{
const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU;
pktqueue_t *pq;
percpu_t *pc;
void *sih;
RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init);
pc = percpu_alloc(sizeof(pktq_counters_t));
if ((sih = softint_establish(sflags, intrh, sc)) == NULL) {
percpu_free(pc, sizeof(pktq_counters_t));
return NULL;
}
pq = kmem_zalloc(sizeof(*pq), KM_SLEEP);
mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE);
pq->pq_maxlen = maxlen;
pq->pq_counters = pc;
pq->pq_sih = sih;
pq->pq_pcq = percpu_create(sizeof(struct pcq *),
pktq_init_cpu, pktq_fini_cpu, pq);
rw_enter(&pktqueue_list_lock, RW_WRITER);
LIST_INSERT_HEAD(&pktqueue_list, pq, pq_list);
rw_exit(&pktqueue_list_lock);
return pq;
}
void
pktq_destroy(pktqueue_t *pq)
{
KASSERT(pktqueue_list_init_once.o_status == ONCE_DONE);
rw_enter(&pktqueue_list_lock, RW_WRITER);
LIST_REMOVE(pq, pq_list);
rw_exit(&pktqueue_list_lock);
percpu_free(pq->pq_pcq, sizeof(struct pcq *));
percpu_free(pq->pq_counters, sizeof(pktq_counters_t));
softint_disestablish(pq->pq_sih);
mutex_destroy(&pq->pq_lock);
kmem_free(pq, sizeof(*pq));
}
/*
* - pktq_inc_count: increment the counter given an ID.
* - pktq_collect_counts: handler to sum up the counts from each CPU.
* - pktq_get_count: return the effective count given an ID.
*/
static inline void
pktq_inc_count(pktqueue_t *pq, u_int i)
{
percpu_t *pc = pq->pq_counters;
pktq_counters_t *c;
c = percpu_getref(pc);
c->count[i]++;
percpu_putref(pc);
}
static void
pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci)
{
const pktq_counters_t *c = mem;
pktq_counters_t *sum = arg;
int s = splnet();
for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) {
sum->count[i] += c->count[i];
}
splx(s);
}
static uint64_t
pktq_get_count(pktqueue_t *pq, pktq_count_t c)
{
pktq_counters_t sum;
if (c != PKTQ_MAXLEN) {
memset(&sum, 0, sizeof(sum));
percpu_foreach_xcall(pq->pq_counters,
XC_HIGHPRI_IPL(IPL_SOFTNET), pktq_collect_counts, &sum);
}
switch (c) {
case PKTQ_NITEMS:
return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE];
case PKTQ_DROPS:
return sum.count[PQCNT_DROP];
case PKTQ_MAXLEN:
return pq->pq_maxlen;
}
return 0;
}
uint32_t
pktq_rps_hash(const pktq_rps_hash_func_t *funcp, const struct mbuf *m)
{
pktq_rps_hash_func_t func = atomic_load_relaxed(funcp);
KASSERT(func != NULL);
return (*func)(m);
}
static uint32_t
pktq_rps_hash_zero(const struct mbuf *m __unused)
{
return 0;
}
static uint32_t
pktq_rps_hash_curcpu(const struct mbuf *m __unused)
{
return cpu_index(curcpu());
}
static uint32_t
pktq_rps_hash_toeplitz(const struct mbuf *m)
{
struct ip *ip;
/*
* Exclude the UDP port from the hash: IP fragments are not
* currently handled, so including it would split traffic into a
* mix of 2-tuple and 4-tuple hashed packets.
*/
const u_int flag = RSS_TOEPLITZ_USE_TCP_PORT;
/* Glance at the IP version. */
if ((m->m_flags & M_PKTHDR) == 0)
return 0;
ip = mtod(m, struct ip *);
if (ip->ip_v == IPVERSION) {
if (__predict_false(m->m_len < sizeof(struct ip)))
return 0;
return rss_toeplitz_hash_from_mbuf_ipv4(m, flag);
} else if (ip->ip_v == 6) {
if (__predict_false(m->m_len < sizeof(struct ip6_hdr)))
return 0;
return rss_toeplitz_hash_from_mbuf_ipv6(m, flag);
}
return 0;
}
/*
* Toeplitz hash that never selects the current CPU, i.e. packets are
* always steered to other CPUs. Generally, this performs better than
* plain toeplitz.
*/
static uint32_t
pktq_rps_hash_toeplitz_othercpus(const struct mbuf *m)
{
uint32_t hash;
if (ncpu == 1)
return 0;
hash = pktq_rps_hash_toeplitz(m);
hash %= ncpu - 1;
if (hash >= cpu_index(curcpu()))
return hash + 1;
else
return hash;
}
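/*
* Worked example of the remapping above (illustrative only): with
* ncpu = 4 and the current CPU having cpu_index 1, the Toeplitz hash
* is first reduced modulo ncpu - 1 = 3, giving a value in {0, 1, 2}.
* Values greater than or equal to the current CPU's index are shifted
* up by one, so the possible results are {0, 2, 3}, i.e. every CPU
* except the current one.
*/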
static struct pktq_rps_hash_table {
const char* prh_type;
pktq_rps_hash_func_t prh_func;
} const pktq_rps_hash_tab[] = {
{ "zero", pktq_rps_hash_zero },
{ "curcpu", pktq_rps_hash_curcpu },
{ "toeplitz", pktq_rps_hash_toeplitz },
{ "toeplitz-othercpus", pktq_rps_hash_toeplitz_othercpus },
};
const pktq_rps_hash_func_t pktq_rps_hash_default =
#ifdef NET_MPSAFE
pktq_rps_hash_curcpu;
#else
pktq_rps_hash_zero;
#endif
static const char *
pktq_get_rps_hash_type(pktq_rps_hash_func_t func)
{
for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) {
if (func == pktq_rps_hash_tab[i].prh_func) {
return pktq_rps_hash_tab[i].prh_type;
}
}
return NULL;
}
static int
pktq_set_rps_hash_type(pktq_rps_hash_func_t *func, const char *type)
{
if (strcmp(type, pktq_get_rps_hash_type(*func)) == 0)
return 0;
for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) {
if (strcmp(type, pktq_rps_hash_tab[i].prh_type) == 0) {
atomic_store_relaxed(func, pktq_rps_hash_tab[i].prh_func);
return 0;
}
}
return ENOENT;
}
int
sysctl_pktq_rps_hash_handler(SYSCTLFN_ARGS)
{
struct sysctlnode node;
pktq_rps_hash_func_t *func;
int error;
char type[PKTQ_RPS_HASH_NAME_LEN];
node = *rnode;
func = node.sysctl_data;
strlcpy(type, pktq_get_rps_hash_type(*func), PKTQ_RPS_HASH_NAME_LEN);
node.sysctl_data = &type;
node.sysctl_size = sizeof(type);
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
error = pktq_set_rps_hash_type(func, type);
return error;
}
/*
* pktq_enqueue: inject the packet into the end of the queue.
*
* => Must be called from interrupt context or with preemption disabled.
* => Consumes the packet and returns true on success.
* => Returns false on failure; the caller is responsible for freeing the packet.
*/
bool
pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused)
{
#if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI)
struct cpu_info *ci = curcpu();
#else
struct cpu_info *ci = cpu_lookup(hash % ncpu);
#endif
KASSERT(kpreempt_disabled());
if (__predict_false(!pcq_put(pktq_pcq(pq, ci), m))) {
pktq_inc_count(pq, PQCNT_DROP);
return false;
}
softint_schedule_cpu(pq->pq_sih, ci);
pktq_inc_count(pq, PQCNT_ENQUEUE);
return true;
}
/*
* pktq_dequeue: take a packet from the queue.
*
* => Must be called with preemption disabled.
* => Must ensure there are no concurrent dequeue calls.
*/
struct mbuf *
pktq_dequeue(pktqueue_t *pq)
{
struct cpu_info *ci = curcpu();
struct mbuf *m;
KASSERT(kpreempt_disabled());
m = pcq_get(pktq_pcq(pq, ci));
if (__predict_false(m == PKTQ_MARKER)) {
/* Note the marker entry. */
atomic_inc_uint(&pq->pq_barrier);
/* Get the next queue entry. */
m = pcq_get(pktq_pcq(pq, ci));
/*
* There can only be one barrier operation pending
* on a pktqueue at any given time, so we can assert
* that the next item is not a marker.
*/
KASSERT(m != PKTQ_MARKER);
}
if (__predict_true(m != NULL)) {
pktq_inc_count(pq, PQCNT_DEQUEUE);
}
return m;
}
/*
* pktq_barrier: wait for a grace period during which all packets
* enqueued at the moment of the call will have been processed. This is
* used to ensure that e.g. packets referencing some interface have been
* drained.
*/
void
pktq_barrier(pktqueue_t *pq)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
u_int pending = 0;
mutex_enter(&pq->pq_lock);
KASSERT(pq->pq_barrier == 0);
for (CPU_INFO_FOREACH(cii, ci)) {
struct pcq *q;
kpreempt_disable();
q = pktq_pcq(pq, ci);
kpreempt_enable();
/* If the queue is empty - nothing to do. */
if (pcq_peek(q) == NULL) {
continue;
}
/* Otherwise, insert the marker entry. */
while (!pcq_put(q, PKTQ_MARKER)) {
kpause("pktqsync", false, 1, NULL);
}
kpreempt_disable();
softint_schedule_cpu(pq->pq_sih, ci);
kpreempt_enable();
pending++;
}
/* Wait for each queue to process the markers. */
while (pq->pq_barrier != pending) {
kpause("pktqsync", false, 1, NULL);
}
pq->pq_barrier = 0;
mutex_exit(&pq->pq_lock);
}
/*
* pktq_ifdetach: issue a barrier on all pktqueues when a network
* interface is detached.
*/
void
pktq_ifdetach(void)
{
pktqueue_t *pq;
/* Just in case no pktqueues have been created yet... */
RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init);
rw_enter(&pktqueue_list_lock, RW_READER);
LIST_FOREACH(pq, &pktqueue_list, pq_list) {
pktq_barrier(pq);
}
rw_exit(&pktqueue_list_lock);
}
/*
* pktq_flush: free mbufs in all queues.
*
* => The caller must ensure there are no concurrent writers or flush calls.
*/
void
pktq_flush(pktqueue_t *pq)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct mbuf *m, *m0 = NULL;
ASSERT_SLEEPABLE();
/*
* Run a dummy softint at IPL_SOFTNET on all CPUs to ensure that any
* already running handler for this pktqueue is no longer running.
*/
xc_barrier(XC_HIGHPRI_IPL(IPL_SOFTNET));
/*
* Acquire the barrier lock. While the caller ensures that
* no explicit pktq_barrier() calls will be issued, this holds
* off any implicit pktq_barrier() calls that would happen
* as the result of pktq_ifdetach().
*/
mutex_enter(&pq->pq_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
struct pcq *q;
kpreempt_disable();
q = pktq_pcq(pq, ci);
kpreempt_enable();
/*
* Pull the packets off the pcq and chain them into
* a list to be freed later.
*/
while ((m = pcq_get(q)) != NULL) {
pktq_inc_count(pq, PQCNT_DEQUEUE);
m->m_nextpkt = m0;
m0 = m;
}
}
mutex_exit(&pq->pq_lock);
/* Free the packets now that the critical section is over. */
while ((m = m0) != NULL) {
m0 = m->m_nextpkt;
m_freem(m);
}
}
static void
pktq_set_maxlen_cpu(void *vpq, void *vqs)
{
struct pktqueue *pq = vpq;
struct pcq **qp, *q, **qs = vqs;
unsigned i = cpu_index(curcpu());
int s;
s = splnet();
qp = percpu_getref(pq->pq_pcq);
q = *qp;
*qp = qs[i];
qs[i] = q;
percpu_putref(pq->pq_pcq);
splx(s);
}
/*
* pktq_set_maxlen: create per-CPU queues using a new size and replace
* the existing queues without losing any packets.
*
* XXX ncpu must remain stable throughout.
*/
int
pktq_set_maxlen(pktqueue_t *pq, size_t maxlen)
{
const u_int slotbytes = ncpu * sizeof(pcq_t *);
pcq_t **qs;
if (!maxlen || maxlen > PCQ_MAXLEN)
return EINVAL;
if (pq->pq_maxlen == maxlen)
return 0;
/* First, allocate the new queues. */
qs = kmem_zalloc(slotbytes, KM_SLEEP);
for (u_int i = 0; i < ncpu; i++) {
qs[i] = pcq_create(maxlen, KM_SLEEP);
}
/*
* Issue an xcall to replace the queue pointers on each CPU.
* This implies all the necessary memory barriers.
*/
mutex_enter(&pq->pq_lock);
xc_wait(xc_broadcast(XC_HIGHPRI, pktq_set_maxlen_cpu, pq, qs));
pq->pq_maxlen = maxlen;
mutex_exit(&pq->pq_lock);
/*
* At this point, the new packets are flowing into the new
* queues. However, the old queues may have some packets
* present which are no longer being processed. We are going
* to re-enqueue them. This may change the order of packet
* arrival, but it is not considered an issue.
*
* There may be in-flight interrupts calling pktq_dequeue()
* which reference the old queues. Issue a barrier to ensure
* that we are going to be the only pcq_get() callers on the
* old queues.
*/
pktq_barrier(pq);
for (u_int i = 0; i < ncpu; i++) {
struct pcq *q;
struct mbuf *m;
kpreempt_disable();
q = pktq_pcq(pq, cpu_lookup(i));
kpreempt_enable();
while ((m = pcq_get(qs[i])) != NULL) {
while (!pcq_put(q, m)) {
kpause("pktqrenq", false, 1, NULL);
}
}
pcq_destroy(qs[i]);
}
/* Well, that was fun. */
kmem_free(qs, slotbytes);
return 0;
}
static int
sysctl_pktq_maxlen(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
pktqueue_t * const pq = node.sysctl_data;
u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN);
int error;
node.sysctl_data = &nmaxlen;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
return pktq_set_maxlen(pq, nmaxlen);
}
static int
sysctl_pktq_count(SYSCTLFN_ARGS, u_int count_id)
{
struct sysctlnode node = *rnode;
pktqueue_t * const pq = node.sysctl_data;
uint64_t count = pktq_get_count(pq, count_id);
node.sysctl_data = &count;
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
static int
sysctl_pktq_nitems(SYSCTLFN_ARGS)
{
return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_NITEMS);
}
static int
sysctl_pktq_drops(SYSCTLFN_ARGS)
{
return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_DROPS);
}
/*
* pktq_sysctl_setup: set up the sysctl nodes for a pktqueue
* using standardized names at the specified parent node and
* node ID (or CTL_CREATE).
*/
void
pktq_sysctl_setup(pktqueue_t * const pq, struct sysctllog ** const clog,
const struct sysctlnode * const parent_node, const int qid)
{
const struct sysctlnode *rnode = parent_node, *cnode;
KASSERT(pq != NULL);
KASSERT(parent_node != NULL);
KASSERT(qid == CTL_CREATE || qid >= 0);
/* Create the "ifq" node below the parent node. */
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ifq",
SYSCTL_DESCR("Protocol input queue controls"),
NULL, 0, NULL, 0,
qid, CTL_EOL);
/* Now create the standard child nodes below "ifq". */
rnode = cnode;
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "len",
SYSCTL_DESCR("Current input queue length"),
sysctl_pktq_nitems, 0, (void *)pq, 0,
IFQCTL_LEN, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed input queue length"),
sysctl_pktq_maxlen, 0, (void *)pq, 0,
IFQCTL_MAXLEN, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Packets dropped due to full input queue"),
sysctl_pktq_drops, 0, (void *)pq, 0,
IFQCTL_DROPS, CTL_EOL);
}
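/*
* Illustrative call from a protocol's sysctl setup routine (a sketch
* only; "ip_pktq" and the parent "rnode" are assumptions about the
* caller, not anything defined in this file):
*/
#if 0
	pktq_sysctl_setup(ip_pktq, clog, rnode, CTL_CREATE);
#endif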
/* $NetBSD: wapbl.h,v 1.21 2018/12/10 21:19:33 jdolecek Exp $ */
/*-
* Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_WAPBL_H
#define _SYS_WAPBL_H
#include <sys/mutex.h>
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <miscfs/specfs/specdev.h>
#endif
/*
* This header file describes the API and data structures for
* write-ahead physical block logging (WAPBL) support.
*/
#if defined(_KERNEL_OPT)
#include "opt_wapbl.h"
#endif
#ifdef WAPBL_DEBUG
#ifndef WAPBL_DEBUG_PRINT
#define WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN)
#endif
#if 0
#define WAPBL_DEBUG_BUFBYTES
#endif
#endif
#ifdef WAPBL_DEBUG_PRINT
enum {
WAPBL_PRINT_OPEN = 0x1,
WAPBL_PRINT_FLUSH = 0x2,
WAPBL_PRINT_TRUNCATE = 0x4,
WAPBL_PRINT_TRANSACTION = 0x8,
WAPBL_PRINT_BUFFER = 0x10,
WAPBL_PRINT_BUFFER2 = 0x20,
WAPBL_PRINT_ALLOC = 0x40,
WAPBL_PRINT_INODE = 0x80,
WAPBL_PRINT_WRITE = 0x100,
WAPBL_PRINT_IO = 0x200,
WAPBL_PRINT_REPLAY = 0x400,
WAPBL_PRINT_ERROR = 0x800,
WAPBL_PRINT_DISCARD = 0x1000,
WAPBL_PRINT_BIODONE = 0x2000,
};
#define WAPBL_PRINTF(mask, a) if (wapbl_debug_print & (mask)) printf a
extern int wapbl_debug_print;
#else
#define WAPBL_PRINTF(mask, a)
#endif
/****************************************************************/
#include <sys/queue.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#ifdef _KERNEL
struct wapbl_entry;
struct wapbl_replay;
struct wapbl;
struct wapbl_dealloc {
TAILQ_ENTRY(wapbl_dealloc) wd_entries;
daddr_t wd_blkno; /* address of block */
int wd_len; /* size of block */
};
typedef void (*wapbl_flush_fn_t)(struct mount *, struct wapbl_dealloc *);
/*
* This structure holds per-transaction log information.
*/
struct wapbl_entry {
struct wapbl *we_wapbl;
SIMPLEQ_ENTRY(wapbl_entry) we_entries;
size_t we_bufcount; /* Count of unsynced buffers */
size_t we_reclaimable_bytes; /* Number of on-disk bytes for this
transaction */
int we_error;
#ifdef WAPBL_DEBUG_BUFBYTES
size_t we_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif
};
/* Start using a log */
int wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t,
size_t, size_t, struct wapbl_replay *,
wapbl_flush_fn_t, wapbl_flush_fn_t);
/* Discard the current transaction, potentially dangerous */
void wapbl_discard(struct wapbl *);
/* Stop using a log */
int wapbl_stop(struct wapbl *, int);
/*
* Begin a new transaction or increment transaction recursion
* level if called while a transaction is already in progress
* by the current process.
*/
int wapbl_begin(struct wapbl *, const char *, int);
/* End a transaction or decrement the transaction recursion level */
void wapbl_end(struct wapbl *);
/*
* Add a new buffer to the current transaction. The buffer's
* data will be copied to the current transaction log and the
* buffer will be marked B_LOCKED so that it will not be
* flushed to disk by the syncer or reallocated.
*/
void wapbl_add_buf(struct wapbl *, struct buf *);
/* Remove a buffer from the current transaction. */
void wapbl_remove_buf(struct wapbl *, struct buf *);
void wapbl_resize_buf(struct wapbl *, struct buf *, long, long);
/*
* This will flush all completed transactions to disk and
* start asynchronous writes on the associated buffers.
*/
int wapbl_flush(struct wapbl *, int);
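/*
* Typical metadata-update pattern using the interface above (a sketch
* under assumptions: "wl" is the mount's struct wapbl and "bp" is a
* metadata buffer; the error handling is a placeholder):
*/
#if 0
	error = wapbl_begin(wl, __FILE__, __LINE__);
	if (error)
		return error;
	/* ... modify the metadata held in bp ... */
	wapbl_add_buf(wl, bp);
	wapbl_end(wl);
	/* Optionally push completed transactions out to the log. */
	error = wapbl_flush(wl, 0);
#endif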
/*
* Inodes that are allocated but have zero link count
* must be registered with the current transaction
* so they may be recorded in the log and cleaned up later.
* Registration or unregistration of inode numbers that are already registered is OK.
*/
void wapbl_register_inode(struct wapbl *, ino_t, mode_t);
void wapbl_unregister_inode(struct wapbl *, ino_t, mode_t);
/*
* Metadata block deallocations must be registered so
* that revocation records can be written and to prevent
* the corresponding blocks from being reused as data
* blocks until the log is on disk.
*/
int wapbl_register_deallocation(struct wapbl *, daddr_t, int, bool,
void **);
void wapbl_unregister_deallocation(struct wapbl *, void *);
void wapbl_jlock_assert(struct wapbl *wl);
void wapbl_junlock_assert(struct wapbl *wl);
void wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...)
__printflike(1, 2));
#if defined(WAPBL_DEBUG) || defined(DDB)
void wapbl_dump(struct wapbl *);
#endif
void wapbl_biodone(struct buf *);
extern const struct wapbl_ops wapbl_ops;
static __inline struct mount *
wapbl_vptomp(struct vnode *vp)
{
struct mount *mp;
mp = NULL;
if (vp != NULL) {
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
}
return mp;
}
static __inline bool
wapbl_vphaswapbl(struct vnode *vp)
{
struct mount *mp;
if (vp == NULL)
return false;
mp = wapbl_vptomp(vp);
return mp && mp->mnt_wapbl;
}
#endif /* _KERNEL */
/****************************************************************/
/* Replay support */
#ifdef WAPBL_INTERNAL
LIST_HEAD(wapbl_blk_head, wapbl_blk);
struct wapbl_replay {
struct vnode *wr_logvp;
struct vnode *wr_devvp;
daddr_t wr_logpbn;
int wr_log_dev_bshift;
int wr_fs_dev_bshift;
int64_t wr_circ_off;
int64_t wr_circ_size;
uint32_t wr_generation;
void *wr_scratch;
struct wapbl_blk_head *wr_blkhash;
u_long wr_blkhashmask;
int wr_blkhashcnt;
off_t wr_inodeshead;
off_t wr_inodestail;
int wr_inodescnt;
struct {
uint32_t wr_inumber;
uint32_t wr_imode;
} *wr_inodes;
};
#define wapbl_replay_isopen(wr) ((wr)->wr_scratch != 0)
/* Supply these to provide I/O support */
int wapbl_write(void *, size_t, struct vnode *, daddr_t);
int wapbl_read(void *, size_t, struct vnode *, daddr_t);
/****************************************************************/
#else
struct wapbl_replay;
#endif /* WAPBL_INTERNAL */
/****************************************************************/
int wapbl_replay_start(struct wapbl_replay **, struct vnode *,
daddr_t, size_t, size_t);
void wapbl_replay_stop(struct wapbl_replay *);
void wapbl_replay_free(struct wapbl_replay *);
int wapbl_replay_write(struct wapbl_replay *, struct vnode *);
int wapbl_replay_can_read(struct wapbl_replay *, daddr_t, long);
int wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long);
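/*
* Typical replay sequence at mount time (a sketch only; the argument
* values are placeholders derived from the filesystem's superblock and
* are not defined here):
*/
#if 0
	error = wapbl_replay_start(&wr, devvp, logpbn, count, blksize);
	if (error == 0) {
		error = wapbl_replay_write(wr, devvp);
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
	}
#endif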
/****************************************************************/
#endif /* !_SYS_WAPBL_H */
/* $NetBSD: kern_ntptime.c,v 1.64 2022/10/26 23:23:52 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
***********************************************************************
* *
* Copyright (c) David L. Mills 1993-2001 *
* *
* Permission to use, copy, modify, and distribute this software and *
* its documentation for any purpose and without fee is hereby *
* granted, provided that the above copyright notice appears in all *
* copies and that both the copyright notice and this permission *
* notice appear in supporting documentation, and that the name *
* University of Delaware not be used in advertising or publicity *
* pertaining to distribution of the software without specific, *
* written prior permission. The University of Delaware makes no *
* representations about the suitability this software for any *
* purpose. It is provided "as is" without express or implied *
* warranty. *
* *
**********************************************************************/
/*
* Adapted from the original sources for FreeBSD and timecounters by:
* Poul-Henning Kamp <phk@FreeBSD.org>.
*
* The 32bit version of the "LP" macros seems a bit past its "sell by"
* date so I have retained only the 64bit version and included it directly
* in this file.
*
* Only minor changes done to interface with the timecounters over in
* sys/kern/kern_clock.c. Some of the comments below may be (even more)
* confusing and/or plain wrong in that context.
*/
#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_ntptime.c,v 1.59 2005/05/28 14:34:41 rwatson Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_ntptime.c,v 1.64 2022/10/26 23:23:52 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif
#include <sys/param.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <compat/sys/timex.h>
/*
* Single-precision macros for 64-bit machines
*/
typedef int64_t l_fp;
#define L_ADD(v, u) ((v) += (u))
#define L_SUB(v, u) ((v) -= (u))
#define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32)
#define L_NEG(v) ((v) = -(v))
#define L_RSHIFT(v, n) \
do { \
if ((v) < 0) \
(v) = -(-(v) >> (n)); \
else \
(v) = (v) >> (n); \
} while (0)
#define L_MPY(v, a) ((v) *= (a))
#define L_CLR(v) ((v) = 0)
#define L_ISNEG(v) ((v) < 0)
#define L_LINT(v, a) ((v) = (int64_t)((uint64_t)(a) << 32))
#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
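/*
* Worked example of the fixed-point macros (illustrative): L_LINT(v, 3)
* stores 3 << 32, i.e. the integer 3 with a zero fraction, and L_GINT(v)
* recovers 3. After L_LINT(u, 1) and L_ADD(v, u), L_RSHIFT(v, 1) yields
* 2 (the average of 3 and 1), with the fraction bits retaining sub-unit
* precision across repeated operations.
*/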
#ifdef NTP
/*
* Generic NTP kernel interface
*
* These routines constitute the Network Time Protocol (NTP) interfaces
* for user and daemon application programs. The ntp_gettime() routine
* provides the time, maximum error (synch distance) and estimated error
* (dispersion) to client user application programs. The ntp_adjtime()
* routine is used by the NTP daemon to adjust the system clock to an
* externally derived time. The time offset and related variables set by
* this routine are used by other routines in this module to adjust the
* phase and frequency of the clock discipline loop which controls the
* system clock.
*
* When the kernel time is reckoned directly in nanoseconds (NTP_NANO
* defined), the time at each tick interrupt is derived directly from
* the kernel time variable. When the kernel time is reckoned in
* microseconds, (NTP_NANO undefined), the time is derived from the
* kernel time variable together with a variable representing the
* leftover nanoseconds at the last tick interrupt. In either case, the
* current nanosecond time is reckoned from these values plus an
* interpolated value derived by the clock routines in another
* architecture-specific module. The interpolation can use either a
* dedicated counter or a processor cycle counter (PCC) implemented in
* some architectures.
*
* Note that all routines must run at priority splclock or higher.
*/
/*
* Phase/frequency-lock loop (PLL/FLL) definitions
*
* The nanosecond clock discipline uses two variable types, time
* variables and frequency variables. Both types are represented as 64-
* bit fixed-point quantities with the decimal point between two 32-bit
* halves. On a 32-bit machine, each half is represented as a single
* word and mathematical operations are done using multiple-precision
* arithmetic. On a 64-bit machine, ordinary computer arithmetic is
* used.
*
* A time variable is a signed 64-bit fixed-point number in ns and
* fraction. It represents the remaining time offset to be amortized
* over succeeding tick interrupts. The maximum time offset is about
* 0.5 s and the resolution is about 2.3e-10 ns.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s| ns |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* A frequency variable is a signed 64-bit fixed-point number in ns/s
* and fraction. It represents the ns and fraction to be added to the
* kernel time variable at each second. The maximum frequency offset is
* about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s s s s s s s s s s s| ns/s |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
/*
* The following variables establish the state of the PLL/FLL and the
* residual time and frequency offset of the local clock.
*/
#define SHIFT_PLL 4 /* PLL loop gain (shift) */
#define SHIFT_FLL 2 /* FLL loop gain (shift) */
static int time_state = TIME_OK; /* clock state */
static int time_status = STA_UNSYNC; /* clock status bits */
static long time_tai; /* TAI offset (s) */
static long time_monitor; /* last time offset scaled (ns) */
static long time_constant; /* poll interval (shift) (s) */
static long time_precision = 1; /* clock precision (ns) */
static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
static time_t time_reftime; /* time at last adjustment (s) */
static l_fp time_offset; /* time offset (ns) */
static l_fp time_freq; /* frequency offset (ns/s) */
#endif /* NTP */
static l_fp time_adj; /* tick adjust (ns/s) */
int64_t time_adjtime; /* correction from adjtime(2) (usec) */
#ifdef NTP
#ifdef PPS_SYNC
/*
* The following variables are used when a pulse-per-second (PPS) signal
* is available and connected via a modem control lead. They establish
* the engineering parameters of the clock discipline loop when
* controlled by the PPS signal.
*/
#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */
#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */
#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */
#define PPS_PAVG 4 /* phase avg interval (s) (shift) */
#define PPS_VALID 120 /* PPS signal watchdog max (s) */
#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */
#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */
static struct timespec pps_tf[3]; /* phase median filter */
static l_fp pps_freq; /* scaled frequency offset (ns/s) */
static long pps_fcount; /* frequency accumulator */
static long pps_jitter; /* nominal jitter (ns) */
static long pps_stabil; /* nominal stability (scaled ns/s) */
static long pps_lastsec; /* time at last calibration (s) */
static int pps_valid; /* signal watchdog counter */
static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */
static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */
static int pps_intcnt; /* wander counter */
/*
* PPS signal quality monitors
*/
static long pps_calcnt; /* calibration intervals */
static long pps_jitcnt; /* jitter limit exceeded */
static long pps_stbcnt; /* stability limit exceeded */
static long pps_errcnt; /* calibration errors */
#endif /* PPS_SYNC */
/*
* End of phase/frequency-lock loop (PLL/FLL) definitions
*/
static void hardupdate(long offset);
/*
* ntp_gettime() - NTP user application interface
*/
void
ntp_gettime(struct ntptimeval *ntv)
{
memset(ntv, 0, sizeof(*ntv));
mutex_spin_enter(&timecounter_lock);
nanotime(&ntv->time);
ntv->maxerror = time_maxerror;
ntv->esterror = time_esterror;
ntv->tai = time_tai;
ntv->time_state = time_state;
mutex_spin_exit(&timecounter_lock);
}
/* ARGSUSED */
/*
* ntp_adjtime() - NTP daemon application interface
*/
int
sys_ntp_adjtime(struct lwp *l, const struct sys_ntp_adjtime_args *uap, register_t *retval)
{
/* {
syscallarg(struct timex *) tp;
} */
struct timex ntv;
int error;
error = copyin((void *)SCARG(uap, tp), (void *)&ntv, sizeof(ntv));
if (error != 0)
return (error);
if (ntv.modes != 0 && (error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_NTPADJTIME, NULL,
NULL, NULL)) != 0)
return (error);
ntp_adjtime1(&ntv);
error = copyout((void *)&ntv, (void *)SCARG(uap, tp), sizeof(ntv));
if (!error)
*retval = ntp_timestatus();
return error;
}
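/*
* Illustrative userland invocation of this interface (a sketch; the
* offset value and the error handling are placeholders, not kernel
* code):
*
*	struct timex ntv = { .modes = MOD_STATUS | MOD_OFFSET,
*	    .status = STA_PLL, .offset = 1000 };
*	if (ntp_adjtime(&ntv) < 0)
*		err(1, "ntp_adjtime");
*
* The offset is interpreted in microseconds, or in nanoseconds once
* STA_NANO has been set via MOD_NANO.
*/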
void
ntp_adjtime1(struct timex *ntv)
{
long freq;
int modes;
/*
* Update selected clock variables - only the superuser can
* change anything. Note that there is no error checking here on
* the assumption that the superuser knows what it is doing.
* Note that either the time constant or the TAI offset is loaded
* from the ntv.constant member, depending on the mode bits. If
* the STA_PLL bit in the status word is cleared, the state and
* status words are reset to the initial values at boot.
*/
mutex_spin_enter(&timecounter_lock);
modes = ntv->modes;
if (modes != 0)
/* We need to save the system time during shutdown */
time_adjusted |= 2;
if (modes & MOD_MAXERROR)
time_maxerror = ntv->maxerror;
if (modes & MOD_ESTERROR)
time_esterror = ntv->esterror;
if (modes & MOD_STATUS) {
if (time_status & STA_PLL && !(ntv->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
#ifdef PPS_SYNC
pps_shift = PPS_FAVG;
#endif /* PPS_SYNC */
}
time_status &= STA_RONLY;
time_status |= ntv->status & ~STA_RONLY;
}
if (modes & MOD_TIMECONST) {
if (ntv->constant < 0)
time_constant = 0;
else if (ntv->constant > MAXTC)
time_constant = MAXTC;
else
time_constant = ntv->constant;
}
if (modes & MOD_TAI) {
if (ntv->constant > 0) /* XXX zero & negative numbers ? */
time_tai = ntv->constant;
}
#ifdef PPS_SYNC
if (modes & MOD_PPSMAX) {
if (ntv->shift < PPS_FAVG)
pps_shiftmax = PPS_FAVG;
else if (ntv->shift > PPS_FAVGMAX)
pps_shiftmax = PPS_FAVGMAX;
else
pps_shiftmax = ntv->shift;
}
#endif /* PPS_SYNC */
if (modes & MOD_NANO)
time_status |= STA_NANO;
if (modes & MOD_MICRO)
time_status &= ~STA_NANO;
if (modes & MOD_CLKB)
time_status |= STA_CLK;
if (modes & MOD_CLKA)
time_status &= ~STA_CLK;
if (modes & MOD_FREQUENCY) {
freq = MIN(INT32_MAX, MAX(INT32_MIN, ntv->freq));
freq = (freq * (int64_t)1000) >> 16;
if (freq > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (freq < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
else {
/*
* ntv.freq is [PPM * 2^16] = [us/s * 2^16]
* time_freq is [ns/s * 2^32]
*/
time_freq = ntv->freq * 1000LL * 65536LL;
}
#ifdef PPS_SYNC
pps_freq = time_freq;
#endif /* PPS_SYNC */
}
if (modes & MOD_OFFSET) {
if (time_status & STA_NANO) {
hardupdate(ntv->offset);
} else {
long offset = ntv->offset;
offset = MIN(offset, MAXPHASE/1000);
offset = MAX(offset, -MAXPHASE/1000);
hardupdate(offset * 1000);
}
}
/*
* Retrieve all clock variables. Note that the TAI offset is
* returned only by ntp_gettime().
*/
if (time_status & STA_NANO)
ntv->offset = L_GINT(time_offset);
else
ntv->offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
if (time_freq < 0)
ntv->freq = L_GINT(-((-time_freq / 1000LL) << 16));
else
ntv->freq = L_GINT((time_freq / 1000LL) << 16);
ntv->maxerror = time_maxerror;
ntv->esterror = time_esterror;
ntv->status = time_status;
ntv->constant = time_constant;
if (time_status & STA_NANO)
ntv->precision = time_precision;
else
ntv->precision = time_precision / 1000;
ntv->tolerance = MAXFREQ * SCALE_PPM;
#ifdef PPS_SYNC
ntv->shift = pps_shift;
ntv->ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
if (time_status & STA_NANO)
ntv->jitter = pps_jitter;
else
ntv->jitter = pps_jitter / 1000;
ntv->stabil = pps_stabil;
ntv->calcnt = pps_calcnt;
ntv->errcnt = pps_errcnt;
ntv->jitcnt = pps_jitcnt;
ntv->stbcnt = pps_stbcnt;
#endif /* PPS_SYNC */
mutex_spin_exit(&timecounter_lock);
}
#endif /* NTP */
/*
* second_overflow() - called after ntp_tick_adjust()
*
* This routine is ordinarily called immediately following the above
* routine ntp_tick_adjust(). While these two routines are normally
* combined, they are separated here only for the purposes of
* simulation.
*/
void
ntp_update_second(int64_t *adjustment, time_t *newsec)
{
int tickrate;
l_fp ftemp; /* 32/64-bit temporary */
KASSERT(mutex_owned(&timecounter_lock));
#ifdef NTP
/*
* On rollover of the second both the nanosecond and microsecond
* clocks are updated and the state machine cranked as
* necessary. The phase adjustment to be used for the next
* second is calculated and the maximum error is increased by
* the tolerance.
*/
time_maxerror += MAXFREQ / 1000;
/*
* Leap second processing. If in leap-insert state at
* the end of the day, the system clock is set back one
* second; if in leap-delete state, the system clock is
* set ahead one second. The nano_time() routine or
* external clock driver will ensure that reported time
* is always monotonic.
*/
switch (time_state) {
/*
* No warning.
*/
case TIME_OK:
if (time_status & STA_INS)
time_state = TIME_INS;
else if (time_status & STA_DEL)
time_state = TIME_DEL;
break;
/*
* Insert second 23:59:60 following second
* 23:59:59.
*/
case TIME_INS:
if (!(time_status & STA_INS))
time_state = TIME_OK;
else if ((*newsec) % 86400 == 0) {
(*newsec)--;
time_state = TIME_OOP;
time_tai++;
}
break;
/*
* Delete second 23:59:59.
*/
case TIME_DEL:
if (!(time_status & STA_DEL))
time_state = TIME_OK;
else if (((*newsec) + 1) % 86400 == 0) {
(*newsec)++;
time_tai--;
time_state = TIME_WAIT;
}
break;
/*
* Insert second in progress.
*/
case TIME_OOP:
time_state = TIME_WAIT;
break;
/*
* Wait for status bits to clear.
*/
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
}
/*
* Compute the total time adjustment for the next second
* in ns. The offset is reduced by a factor depending on
* whether the PPS signal is operating. Note that the
* value is in effect scaled by the clock frequency,
* since the adjustment is added at each tick interrupt.
*/
ftemp = time_offset;
#ifdef PPS_SYNC
/* XXX even if PPS signal dies we should finish adjustment ? */
if (time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)
L_RSHIFT(ftemp, pps_shift);
else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#endif /* PPS_SYNC */
time_adj = ftemp;
L_SUB(time_offset, ftemp);
L_ADD(time_adj, time_freq);
#ifdef PPS_SYNC
if (pps_valid > 0)
pps_valid--;
else
time_status &= ~STA_PPSSIGNAL;
#endif /* PPS_SYNC */
#else /* !NTP */
L_CLR(time_adj);
#endif /* !NTP */
/*
* Apply any correction from adjtime(2). If the clock is more than one
* second off, slew at a rate of 5ms/s (5000 PPM); otherwise slew at
* 500us/s (500 PPM) until the final < 500 usecs are applied in a
* single step.
*/
if (time_adjtime != 0) {
if (time_adjtime > 1000000)
tickrate = 5000;
else if (time_adjtime < -1000000)
tickrate = -5000;
else if (time_adjtime > 500)
tickrate = 500;
else if (time_adjtime < -500)
tickrate = -500;
else
tickrate = time_adjtime;
time_adjtime -= tickrate;
L_LINT(ftemp, tickrate * 1000);
L_ADD(time_adj, ftemp);
}
*adjustment = time_adj;
}
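/*
* Worked example of the slew above (illustrative): with time_adjtime
* set to 2000000 us (2 s), each call consumes 5000 us while more than
* 1000000 us remain, then 500 us per call, and the final residue below
* 500 us is applied in a single step.
*/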
/*
* ntp_init() - initialize variables and structures
*
* This routine must be called after the kernel variables hz and tick
* are set or changed and before the next tick interrupt. In this
* particular implementation, these values are assumed set elsewhere in
* the kernel. The design allows the clock frequency and tick interval
* to be changed while the system is running. So, this routine should
* probably be integrated with the code that does that.
*/
void
ntp_init(void)
{
/*
* The following variables are initialized only at startup. Only
* those structures not cleared by the compiler need to be
* initialized, and these only in the simulator. In the actual
* kernel, any nonzero values here will quickly evaporate.
*/
L_CLR(time_adj);
#ifdef NTP
L_CLR(time_offset);
L_CLR(time_freq);
#ifdef PPS_SYNC
pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
pps_fcount = 0;
L_CLR(pps_freq);
#endif /* PPS_SYNC */
#endif
}
#ifdef NTP
/*
* hardupdate() - local clock update
*
* This routine is called by ntp_adjtime() to update the local clock
* phase and frequency. The implementation is of an adaptive-parameter,
* hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
* time and frequency offset estimates for each call. If the kernel PPS
* discipline code is configured (PPS_SYNC), the PPS signal itself
* determines the new time offset, instead of the calling argument.
* Presumably, calls to ntp_adjtime() occur only when the caller
* believes the local clock is valid within some bound (+-128 ms with
* NTP). If the caller's time is far different than the PPS time, an
* argument will ensue, and it's not clear who will lose.
*
* For uncompensated quartz crystal oscillators and nominal update
* intervals less than 256 s, operation should be in phase-lock mode,
* where the loop is disciplined to phase. For update intervals greater
* than 1024 s, operation should be in frequency-lock mode, where the
* loop is disciplined to frequency. Between 256 s and 1024 s, the mode
* is selected by the STA_MODE status bit.
*
* Note: splclock() is in effect.
*/
void
hardupdate(long offset)
{
long mtemp;
l_fp ftemp;
KASSERT(mutex_owned(&timecounter_lock));
/*
* Select how the phase is to be controlled and from which
* source. If the PPS signal is present and enabled to
* discipline the time, the PPS offset is used; otherwise, the
* argument offset is used.
*/
if (!(time_status & STA_PLL))
return;
if (!(time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)) {
if (offset > MAXPHASE)
time_monitor = MAXPHASE;
else if (offset < -MAXPHASE)
time_monitor = -MAXPHASE;
else
time_monitor = offset;
L_LINT(time_offset, time_monitor);
}
/*
* Select how the frequency is to be controlled and in which
* mode (PLL or FLL). If the PPS signal is present and enabled
* to discipline the frequency, the PPS frequency is used;
* otherwise, the argument offset is used to compute it.
*/
if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
time_reftime = time_second;
return;
}
if (time_status & STA_FREQHOLD || time_reftime == 0)
time_reftime = time_second;
mtemp = time_second - time_reftime;
L_LINT(ftemp, time_monitor);
L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
L_MPY(ftemp, mtemp);
L_ADD(time_freq, ftemp);
time_status &= ~STA_MODE;
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
MAXSEC)) {
L_LINT(ftemp, (time_monitor << 4) / mtemp);
L_RSHIFT(ftemp, SHIFT_FLL + 4);
L_ADD(time_freq, ftemp);
time_status |= STA_MODE;
}
time_reftime = time_second;
if (L_GINT(time_freq) > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (L_GINT(time_freq) < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
}
#ifdef PPS_SYNC
/*
* hardpps() - discipline CPU clock oscillator to external PPS signal
*
* This routine is called at each PPS interrupt in order to discipline
* the CPU clock oscillator to the PPS signal. It measures the PPS phase
* and leaves it in a handy spot for the hardclock() routine. It
* integrates successive PPS phase differences and calculates the
* frequency offset. This is used in hardclock() to discipline the CPU
* clock oscillator so that intrinsic frequency error is cancelled out.
* The code requires the caller to capture the time and hardware counter
* value at the on-time PPS signal transition.
*
* Note that, on some Unix systems, this routine runs at an interrupt
* priority level higher than the timer interrupt routine hardclock().
* Therefore, the variables used are distinct from the hardclock()
* variables, except for certain exceptions: The PPS frequency pps_freq
* and phase pps_offset variables are determined by this routine and
* updated atomically. The time_tolerance variable can be considered a
* constant, since it is infrequently changed, and then only when the
* PPS signal is disabled. The watchdog counter pps_valid is updated
* once per second by hardclock() and is atomically cleared in this
* routine.
*/
void
hardpps(struct timespec *tsp, /* time at PPS */
long nsec /* hardware counter at PPS */)
{
long u_sec, u_nsec, v_nsec; /* temps */
l_fp ftemp;
KASSERT(mutex_owned(&timecounter_lock));
/*
* The signal is first processed by a range gate and frequency
* discriminator. The range gate rejects noise spikes outside
* the range +-500 us. The frequency discriminator rejects input
* signals with apparent frequency outside the range 1 +-500
* PPM. If two hits occur in the same second, we ignore the
* later hit; if not and a hit occurs outside the range gate,
* keep the later hit for later comparison, but do not process
* it.
*/
time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
pps_valid = PPS_VALID;
u_sec = tsp->tv_sec;
u_nsec = tsp->tv_nsec;
if (u_nsec >= (NANOSECOND >> 1)) {
u_nsec -= NANOSECOND;
u_sec++;
}
v_nsec = u_nsec - pps_tf[0].tv_nsec;
if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
MAXFREQ)
return;
pps_tf[2] = pps_tf[1];
pps_tf[1] = pps_tf[0];
pps_tf[0].tv_sec = u_sec;
pps_tf[0].tv_nsec = u_nsec;
/*
* Compute the difference between the current and previous
* counter values. If the difference exceeds 0.5 s, assume it
* has wrapped around, so correct 1.0 s. If the result exceeds
* the tick interval, the sample point has crossed a tick
* boundary during the last second, so correct the tick. Very
* intricate.
*/
u_nsec = nsec;
if (u_nsec > (NANOSECOND >> 1))
u_nsec -= NANOSECOND;
else if (u_nsec < -(NANOSECOND >> 1))
u_nsec += NANOSECOND;
pps_fcount += u_nsec;
if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
return;
time_status &= ~STA_PPSJITTER;
/*
* A three-stage median filter is used to help denoise the PPS
* time. The median sample becomes the time offset estimate; the
* difference between the other two samples becomes the time
* dispersion (jitter) estimate.
*/
if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
} else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
}
} else {
if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
} else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
}
}
/*
* Nominal jitter is due to PPS signal noise and interrupt
* latency. If it exceeds the popcorn threshold, the sample is
* discarded; otherwise, if so enabled, the time offset is
* updated. We can tolerate a modest loss of data here without
* much degrading time accuracy.
*/
if (u_nsec > (pps_jitter << PPS_POPCORN)) {
time_status |= STA_PPSJITTER;
pps_jitcnt++;
} else if (time_status & STA_PPSTIME) {
time_monitor = -v_nsec;
L_LINT(time_offset, time_monitor);
}
pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
u_sec = pps_tf[0].tv_sec - pps_lastsec;
if (u_sec < (1 << pps_shift))
return;
/*
* At the end of the calibration interval the difference between
* the first and last counter values becomes the scaled
* frequency. It will later be divided by the length of the
* interval to determine the frequency update. If the frequency
* exceeds a sanity threshold, or if the actual calibration
* interval is not equal to the expected length, the data are
* discarded. We can tolerate a modest loss of data here without
* much degrading frequency accuracy.
*/
pps_calcnt++;
v_nsec = -pps_fcount;
pps_lastsec = pps_tf[0].tv_sec;
pps_fcount = 0;
u_nsec = MAXFREQ << pps_shift;
if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
pps_shift)) {
time_status |= STA_PPSERROR;
pps_errcnt++;
return;
}
/*
* Here the raw frequency offset and wander (stability) is
* calculated. If the wander is less than the wander threshold
* for four consecutive averaging intervals, the interval is
* doubled; if it is greater than the threshold for four
* consecutive intervals, the interval is halved. The scaled
* frequency offset is converted to frequency offset. The
* stability metric is calculated as the average of recent
* frequency changes, but is used only for performance
* monitoring.
*/
L_LINT(ftemp, v_nsec);
L_RSHIFT(ftemp, pps_shift);
L_SUB(ftemp, pps_freq);
u_nsec = L_GINT(ftemp);
if (u_nsec > PPS_MAXWANDER) {
L_LINT(ftemp, PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else if (u_nsec < -PPS_MAXWANDER) {
L_LINT(ftemp, -PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else {
pps_intcnt++;
}
if (pps_intcnt >= 4) {
pps_intcnt = 4;
if (pps_shift < pps_shiftmax) {
pps_shift++;
pps_intcnt = 0;
}
} else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
pps_intcnt = -4;
if (pps_shift > PPS_FAVG) {
pps_shift--;
pps_intcnt = 0;
}
}
if (u_nsec < 0)
u_nsec = -u_nsec;
pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
/*
* The PPS frequency is recalculated and clamped to the maximum
* MAXFREQ. If enabled, the system clock frequency is updated as
* well.
*/
L_ADD(pps_freq, ftemp);
u_nsec = L_GINT(pps_freq);
if (u_nsec > MAXFREQ)
L_LINT(pps_freq, MAXFREQ);
else if (u_nsec < -MAXFREQ)
L_LINT(pps_freq, -MAXFREQ);
if (time_status & STA_PPSFREQ)
time_freq = pps_freq;
}
#endif /* PPS_SYNC */
#endif /* NTP */
#ifdef NTP
int
ntp_timestatus(void)
{
int rv;
/*
* Status word error decode. If any of these conditions
* occur, an error is returned, instead of the status
* word. Most applications will care only about the fact
* the system clock may not be trusted, not about the
* details.
*
* Hardware or software error
*/
mutex_spin_enter(&timecounter_lock);
if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
/*
* PPS signal lost when either time or frequency
* synchronization requested
*/
(time_status & (STA_PPSFREQ | STA_PPSTIME) &&
!(time_status & STA_PPSSIGNAL)) ||
/*
* PPS jitter exceeded when time synchronization
* requested
*/
(time_status & STA_PPSTIME &&
time_status & STA_PPSJITTER) ||
/*
* PPS wander exceeded or calibration error when
* frequency synchronization requested
*/
(time_status & STA_PPSFREQ &&
time_status & (STA_PPSWANDER | STA_PPSERROR)))
rv = TIME_ERROR;
else
rv = time_state;
mutex_spin_exit(&timecounter_lock);
return rv;
}
/*ARGSUSED*/
/*
* ntp_gettime() - NTP user application interface
*/
int
sys___ntp_gettime50(struct lwp *l, const struct sys___ntp_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(struct ntptimeval *) ntvp;
} */
struct ntptimeval ntv;
int error = 0;
if (SCARG(uap, ntvp)) {
ntp_gettime(&ntv);
error = copyout((void *)&ntv, (void *)SCARG(uap, ntvp),
sizeof(ntv));
}
if (!error) {
*retval = ntp_timestatus();
}
return(error);
}
/*
* return information about kernel precision timekeeping
*/
static int
sysctl_kern_ntptime(SYSCTLFN_ARGS)
{
struct sysctlnode node;
struct ntptimeval ntv;
ntp_gettime(&ntv);
node = *rnode;
node.sysctl_data = &ntv;
node.sysctl_size = sizeof(ntv);
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
SYSCTL_SETUP(sysctl_kern_ntptime_setup, "sysctl kern.ntptime node setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "ntptime",
SYSCTL_DESCR("Kernel clock values for NTP"),
sysctl_kern_ntptime, 0, NULL,
sizeof(struct ntptimeval),
CTL_KERN, KERN_NTPTIME, CTL_EOL);
}
#endif /* NTP */
/* $NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran and Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
*/
/*
* System calls of synchronous I/O multiplexing subsystem.
*
* Locking
*
* Two locks are used: <object-lock> and selcluster_t::sc_lock.
*
* The <object-lock> might be a device driver or another subsystem, e.g.
* socket or pipe. This lock is not exported, and thus invisible to this
* subsystem. Mainly, synchronisation between selrecord() and selnotify()
* routines depends on this lock, as it will be described in the comments.
*
* Lock order
*
* <object-lock> ->
* selcluster_t::sc_lock
*/
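/*
 * Illustrative sketch only (not part of this file): how a hypothetical
 * driver pairs its <object-lock> with selnotify().  The example_softc
 * structure, its lock and its "data ready" flag are assumptions made
 * for the example, not real interfaces; the block is kept under
 * "#if 0" so it is never compiled.
 */
#if 0
struct example_softc {
	kmutex_t	sc_objlock;	/* the <object-lock> */
	struct selinfo	sc_rsel;	/* readers in select()/poll() */
	bool		sc_ready;	/* data available to read */
};

static void
example_rx_complete(struct example_softc *sc)
{

	mutex_enter(&sc->sc_objlock);	/* same lock as the poll side */
	sc->sc_ready = true;
	/* knhint is passed to KNOTE(); 0 here, no knotes in the sketch */
	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
	mutex_exit(&sc->sc_objlock);
}
#endif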
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sleepq.h>
#include <sys/socketvar.h>
#include <sys/syncobj.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>
/* Flags for lwp::l_selflag. */
#define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
#define SEL_SCANNING 1 /* polling descriptors */
#define SEL_BLOCKING 2 /* blocking and waiting for event */
#define SEL_EVENT 3 /* interrupted, events set directly */
/*
* Per-cluster state for select()/poll(). For a system with fewer
* than 64 CPUs, this gives us per-CPU clusters.
*/
#define SELCLUSTERS 64
#define SELCLUSTERMASK (SELCLUSTERS - 1)
typedef struct selcluster {
kmutex_t *sc_lock;
sleepq_t sc_sleepq;
uint64_t sc_mask;
int sc_ncoll;
} selcluster_t;
static inline int selscan(char *, const int, const size_t, register_t *);
static inline int pollscan(struct pollfd *, const int, register_t *);
static void selclear(void);
static const int sel_flag[] = {
POLLRDNORM | POLLHUP | POLLERR,
POLLWRNORM | POLLHUP | POLLERR,
POLLRDBAND
};
/*
* LWPs are woken using the sleep queue only due to a collision, the case
* with the maximum Suck Factor. Save the cost of sorting for named waiters
* by inserting in LIFO order. In the future it would be preferable to not
* enqueue LWPs at all, unless subject to a collision.
*/
syncobj_t select_sobj = {
.sobj_name = "select",
.sobj_flag = SOBJ_SLEEPQ_LIFO,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
static selcluster_t *selcluster[SELCLUSTERS] __read_mostly;
static int direct_select __read_mostly = 0;
/* Operations: either select() or poll(). */
const char selop_select[] = "select";
const char selop_poll[] = "poll";
/*
* Select system call.
*/
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) nd;
syscallarg(fd_set *) in;
syscallarg(fd_set *) ou;
syscallarg(fd_set *) ex;
syscallarg(const struct timespec *) ts;
syscallarg(sigset_t *) mask;
} */
struct timespec ats, *ts = NULL;
sigset_t amask, *mask = NULL;
int error;
if (SCARG(uap, ts)) {
error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
if (error)
return error;
ts = &ats;
}
if (SCARG(uap, mask) != NULL) {
error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
if (error)
return error;
mask = &amask;
}
return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}
int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) nd;
syscallarg(fd_set *) in;
syscallarg(fd_set *) ou;
syscallarg(fd_set *) ex;
syscallarg(struct timeval *) tv;
} */
struct timeval atv;
struct timespec ats, *ts = NULL;
int error;
if (SCARG(uap, tv)) {
error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
if (error)
return error;
if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
return EINVAL;
TIMEVAL_TO_TIMESPEC(&atv, &ats);
ts = &ats;
}
return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}
/*
* sel_do_scan: common code to perform the scan on descriptors.
*/
static int
sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni,
struct timespec *ts, sigset_t *mask, register_t *retval)
{
lwp_t * const l = curlwp;
selcluster_t *sc;
kmutex_t *lock;
struct timespec sleepts;
int error, timo;
timo = 0;
if (ts && inittimeleft(ts, &sleepts) == -1) {
return EINVAL;
}
	if (__predict_false(mask))
		sigsuspendsetup(l, mask);
/*
* We may context switch during or at any time after picking a CPU
* and cluster to associate with, but it doesn't matter. In the
* unlikely event we migrate elsewhere all we risk is a little lock
* contention; correctness is not sacrificed.
*/
sc = curcpu()->ci_data.cpu_selcluster;
lock = sc->sc_lock;
l->l_selcluster = sc;
if (opname == selop_select) {
l->l_selbits = fds;
l->l_selni = ni;
} else {
l->l_selbits = NULL;
}
for (;;) {
int ncoll;
SLIST_INIT(&l->l_selwait);
l->l_selret = 0;
/*
* No need to lock. If this is overwritten by another value
* while scanning, we will retry below. We only need to see
* exact state from the descriptors that we are about to poll,
* and lock activity resulting from fo_poll is enough to
* provide an up to date value for new polling activity.
*/
if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) {
/* Non-blocking: no need for selrecord()/selclear() */
l->l_selflag = SEL_RESET;
} else {
l->l_selflag = SEL_SCANNING;
}
ncoll = sc->sc_ncoll;
membar_release();
if (opname == selop_select) {
error = selscan((char *)fds, nf, ni, retval);
} else {
error = pollscan((struct pollfd *)fds, nf, retval);
}
if (error || *retval)
break;
if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
break;
/*
* Acquire the lock and perform the (re)checks. Note, if
* collision has occurred, then our state does not matter,
* as we must perform re-scan. Therefore, check it first.
*/
state_check:
mutex_spin_enter(lock);
if (__predict_false(sc->sc_ncoll != ncoll)) {
/* Collision: perform re-scan. */
mutex_spin_exit(lock);
selclear();
continue;
}
if (__predict_true(l->l_selflag == SEL_EVENT)) {
/* Events occurred, they are set directly. */
mutex_spin_exit(lock);
break;
}
if (__predict_true(l->l_selflag == SEL_RESET)) {
/* Events occurred, but re-scan is requested. */
mutex_spin_exit(lock);
selclear();
continue;
}
		/* Nothing happened, therefore sleep. */
l->l_selflag = SEL_BLOCKING;
KASSERT(l->l_blcnt == 0);
(void)sleepq_enter(&sc->sc_sleepq, l, lock);
sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true);
error = sleepq_block(timo, true, &select_sobj, 0);
if (error != 0) {
break;
}
/* Awoken: need to check the state. */
goto state_check;
}
selclear();
/* Add direct events if any. */
	if (l->l_selflag == SEL_EVENT) {
		KASSERT(l->l_selret != 0);
		*retval += l->l_selret;
	}
	if (__predict_false(mask))
		sigsuspendteardown(l);
/* select and poll are not restarted after signals... */
if (error == ERESTART)
return EINTR;
if (error == EWOULDBLOCK)
		return 0;
	return error;
}
int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
char smallbits[howmany(FD_SETSIZE, NFDBITS) *
sizeof(fd_mask) * 6];
char *bits;
int error, nf;
size_t ni;
if (nd < 0)
return (EINVAL);
nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles;
if (nd > nf) {
/* forgiving; slightly wrong */
nd = nf;
}
ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
else
bits = smallbits;
#define getbits(name, x) \
if (u_ ## name) { \
error = copyin(u_ ## name, bits + ni * x, ni); \
if (error) \
goto fail; \
} else \
memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits
error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval);
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
fail:
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
return (error);
}
static inline int
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
fd_mask *ibitp, *obitp;
int msk, i, j, fd, n;
file_t *fp;
lwp_t *l;
ibitp = (fd_mask *)(bits + ni * 0);
obitp = (fd_mask *)(bits + ni * 3);
n = 0;
l = curlwp;
memset(obitp, 0, ni * 3);
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
fd_mask ibits, obits;
ibits = *ibitp;
obits = 0;
while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
ibits &= ~(1U << j);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
/*
* Setup an argument to selrecord(), which is
* a file descriptor number.
*/
l->l_selrec = fd;
				if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
					if (!direct_select) {
/*
* Have events: do nothing in
* selrecord().
*/
l->l_selflag = SEL_RESET;
}
obits |= (1U << j);
n++;
}
fd_putfile(fd);
}
if (obits != 0) {
if (direct_select) {
kmutex_t *lock;
lock = l->l_selcluster->sc_lock;
mutex_spin_enter(lock);
*obitp |= obits;
mutex_spin_exit(lock);
} else {
*obitp |= obits;
}
}
ibitp++;
obitp++;
}
}
*retval = n;
return (0);
}
/*
* Poll system call.
*/
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
/* {
syscallarg(struct pollfd *) fds;
syscallarg(u_int) nfds;
syscallarg(int) timeout;
} */
struct timespec ats, *ts = NULL;
	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
ts = &ats;
}
return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}
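/*
 * Userland view of the conversion above, for illustration only: poll(2)
 * takes its timeout in milliseconds (or INFTIM to block indefinitely),
 * which sys_poll() turns into a timespec before calling pollcommon().
 * The snippet below is ordinary userland C, not kernel code.
 */
#if 0
#include <poll.h>
#include <stdio.h>

static int
example_wait_readable(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int n;

	/* 1500 ms becomes ts = { .tv_sec = 1, .tv_nsec = 500000000 }. */
	n = poll(&pfd, 1, 1500);
	if (n > 0 && (pfd.revents & POLLIN))
		printf("fd %d is readable\n", fd);
	return n;
}
#endif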
/*
 * Poll system call with a timespec timeout and signal mask (pollts).
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct pollfd *) fds;
syscallarg(u_int) nfds;
syscallarg(const struct timespec *) ts;
syscallarg(const sigset_t *) mask;
} */
struct timespec ats, *ts = NULL;
sigset_t amask, *mask = NULL;
int error;
if (SCARG(uap, ts)) {
error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
if (error)
return error;
ts = &ats;
}
if (SCARG(uap, mask)) {
error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
if (error)
return error;
mask = &amask;
}
return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}
int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
struct timespec *ts, sigset_t *mask)
{
struct pollfd smallfds[32];
struct pollfd *fds;
int error;
size_t ni;
if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) {
/*
* Prevent userland from causing over-allocation.
		 * Raising the default limit too high can still cause
		 * a lot of memory to be allocated here, but in that case
		 * the file descriptor array itself will already be large.
*
* To reduce the memory requirements here, we could
* process the 'fds' array in chunks, but that
* is a lot of code that isn't normally useful.
* (Or just move the copyin/out into pollscan().)
*
* Historically the code silently truncated 'fds' to
* dt_nfiles entries - but that does cause issues.
*
* Using the max limit equivalent to sysctl
* kern.maxfiles is the moral equivalent of OPEN_MAX
* as specified by POSIX.
*
* We add a slop of 1000 in case the resource limit was
* changed after opening descriptors or the same descriptor
* was specified more than once.
*/
return EINVAL;
}
ni = nfds * sizeof(struct pollfd);
if (ni > sizeof(smallfds))
fds = kmem_alloc(ni, KM_SLEEP);
else
fds = smallfds;
error = copyin(u_fds, fds, ni);
if (error)
goto fail;
error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval);
	if (error == 0)
		error = copyout(fds, u_fds, ni);
fail:
	if (fds != smallfds)
		kmem_free(fds, ni);
return (error);
}
static inline int
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
file_t *fp;
int i, n = 0, revents;
for (i = 0; i < nfd; i++, fds++) {
fds->revents = 0;
if (fds->fd < 0) {
revents = 0;
} else if ((fp = fd_getfile(fds->fd)) == NULL) {
revents = POLLNVAL;
} else {
/*
* Perform poll: registers select request or returns
* the events which are set. Setup an argument for
* selrecord(), which is a pointer to struct pollfd.
*/
curlwp->l_selrec = (uintptr_t)fds;
revents = (*fp->f_ops->fo_poll)(fp,
fds->events | POLLERR | POLLHUP);
fd_putfile(fds->fd);
}
		if (revents) {
			if (!direct_select) {
/* Have events: do nothing in selrecord(). */
curlwp->l_selflag = SEL_RESET;
}
fds->revents = revents;
n++;
}
}
*retval = n;
return (0);
}
int
seltrue(dev_t dev, int events, lwp_t *l)
{
return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
/*
* Record a select request. Concurrency issues:
*
* The caller holds the same lock across calls to selrecord() and
* selnotify(), so we don't need to consider a concurrent wakeup
* while in this routine.
*
* The only activity we need to guard against is selclear(), called by
* another thread that is exiting sel_do_scan().
* `sel_lwp' can only become non-NULL while the caller's lock is held,
* so it cannot become non-NULL due to a change made by another thread
* while we are in this routine. It can only become _NULL_ due to a
* call to selclear().
*
* If it is non-NULL and != selector there is the potential for
* selclear() to be called by another thread. If either of those
* conditions are true, we're not interested in touching the `named
* waiter' part of the selinfo record because we need to record a
* collision. Hence there is no need for additional locking in this
* routine.
*/
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
selcluster_t *sc;
lwp_t *other;
KASSERT(selector == curlwp);
sc = selector->l_selcluster;
other = sip->sel_lwp;
if (selector->l_selflag == SEL_RESET) {
/* 0. We're not going to block - will poll again if needed. */
} else if (other == selector) {
/* 1. We (selector) already claimed to be the first LWP. */
KASSERT(sip->sel_cluster == sc);
} else if (other == NULL) {
/*
* 2. No first LWP, therefore we (selector) are the first.
*
* There may be unnamed waiters (collisions). Issue a memory
* barrier to ensure that we access sel_lwp (above) before
* other fields - this guards against a call to selclear().
*/
membar_acquire();
sip->sel_lwp = selector;
SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
/* Copy the argument, which is for selnotify(). */
sip->sel_fdinfo = selector->l_selrec;
/* Replace selinfo's lock with the chosen cluster's lock. */
sip->sel_cluster = sc;
} else {
/* 3. Multiple waiters: record a collision. */
sip->sel_collision |= sc->sc_mask;
KASSERT(sip->sel_cluster != NULL);
}
}
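/*
 * Illustrative sketch only: the fo_poll-side pattern that feeds
 * selrecord().  The example_softc members are assumptions for the
 * example (see the sketch near the top of this file); the object lock
 * held here is the same one the driver holds around selnotify().
 */
#if 0
static int
example_poll(struct example_softc *sc, int events, struct lwp *l)
{
	int revents = 0;

	mutex_enter(&sc->sc_objlock);
	if (events & (POLLIN | POLLRDNORM)) {
		if (sc->sc_ready)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(l, &sc->sc_rsel);
	}
	mutex_exit(&sc->sc_objlock);
	return revents;
}
#endif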
/*
* Record a knote.
*
* The caller holds the same lock as for selrecord().
*/
void
selrecord_knote(struct selinfo *sip, struct knote *kn)
{
klist_insert(&sip->sel_klist, kn);
}
/*
* Remove a knote.
*
* The caller holds the same lock as for selrecord().
*
* Returns true if the last knote was removed and the list
* is now empty.
*/
bool
selremove_knote(struct selinfo *sip, struct knote *kn)
{
return klist_remove(&sip->sel_klist, kn);
}
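/*
 * Illustrative sketch only: how a hypothetical kqueue filter would use
 * the two helpers above, attaching and detaching knotes under the same
 * object lock that guards selrecord()/selnotify().  example_softc and
 * its members are assumptions for the example.
 */
#if 0
static int
example_kqfilter_attach(struct example_softc *sc, struct knote *kn)
{

	kn->kn_hook = sc;
	mutex_enter(&sc->sc_objlock);
	selrecord_knote(&sc->sc_rsel, kn);
	mutex_exit(&sc->sc_objlock);
	return 0;
}

static void
example_kqfilter_detach(struct example_softc *sc, struct knote *kn)
{

	mutex_enter(&sc->sc_objlock);
	(void)selremove_knote(&sc->sc_rsel, kn);
	mutex_exit(&sc->sc_objlock);
}
#endif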
/*
* sel_setevents: a helper function for selnotify(), to set the events
* for LWP sleeping in selcommon() or pollcommon().
*/
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
const int oflag = l->l_selflag;
int ret = 0;
/*
* If we require re-scan or it was required by somebody else,
* then just (re)set SEL_RESET and return.
*/
if (__predict_false(events == 0 || oflag == SEL_RESET)) {
l->l_selflag = SEL_RESET;
return true;
}
/*
* Direct set. Note: select state of LWP is locked. First,
* determine whether it is selcommon() or pollcommon().
*/
if (l->l_selbits != NULL) {
const size_t ni = l->l_selni;
fd_mask *fds = (fd_mask *)l->l_selbits;
fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
const int idx = fd >> __NFDSHIFT;
int n;
for (n = 0; n < 3; n++) {
if ((fds[idx] & fbit) != 0 && (ofds[idx] & fbit) == 0 &&
(sel_flag[n] & events)) {
ofds[idx] |= fbit;
ret++;
}
fds = (fd_mask *)((char *)fds + ni);
ofds = (fd_mask *)((char *)ofds + ni);
}
} else {
struct pollfd *pfd = (void *)sip->sel_fdinfo;
int revents = events & (pfd->events | POLLERR | POLLHUP);
if (revents) {
if (pfd->revents == 0)
ret = 1;
pfd->revents |= revents;
}
}
/* Check whether there are any events to return. */
if (!ret) {
return false;
}
/* Indicate direct set and note the event (cluster lock is held). */
l->l_selflag = SEL_EVENT;
l->l_selret += ret;
return true;
}
/*
* Do a wakeup when a selectable event occurs. Concurrency issues:
*
* As per selrecord(), the caller's object lock is held. If there
* is a named waiter, we must acquire the associated selcluster's lock
* in order to synchronize with selclear() and pollers going to sleep
* in sel_do_scan().
*
 *	sip->sel_cluster cannot change at this point, as it is only changed
* in selrecord(), and concurrent calls to selrecord() are locked
* out by the caller.
*/
void
selnotify(struct selinfo *sip, int events, long knhint)
{
selcluster_t *sc;
uint64_t mask;
int index, oflag;
lwp_t *l;
kmutex_t *lock;
	KNOTE(&sip->sel_klist, knhint);
	if (sip->sel_lwp != NULL) {
/* One named LWP is waiting. */
sc = sip->sel_cluster;
lock = sc->sc_lock;
mutex_spin_enter(lock);
/* Still there? */
if (sip->sel_lwp != NULL) {
/*
* Set the events for our LWP and indicate that.
* Otherwise, request for a full re-scan.
*/
l = sip->sel_lwp;
oflag = l->l_selflag;
if (!direct_select) {
				l->l_selflag = SEL_RESET;
			} else if (!sel_setevents(l, sip, events)) {
/* No events to return. */
mutex_spin_exit(lock);
return;
}
/*
* If thread is sleeping, wake it up. If it's not
* yet asleep, it will notice the change in state
* and will re-poll the descriptors.
*/
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
sleepq_remove(l->l_sleepq, l, true);
}
}
mutex_spin_exit(lock);
}
if ((mask = sip->sel_collision) != 0) {
/*
* There was a collision (multiple waiters): we must
* inform all potentially interested waiters.
*/
sip->sel_collision = 0;
do {
index = ffs64(mask) - 1;
mask ^= __BIT(index);
sc = selcluster[index];
lock = sc->sc_lock;
mutex_spin_enter(lock);
sc->sc_ncoll++;
sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
} while (__predict_false(mask != 0));
}
}
/*
* Remove an LWP from all objects that it is waiting for. Concurrency
* issues:
*
* The object owner's (e.g. device driver) lock is not held here. Calls
* can be made to selrecord() and we do not synchronize against those
* directly using locks. However, we use `sel_lwp' to lock out changes.
* Before clearing it we must use memory barriers to ensure that we can
* safely traverse the list of selinfo records.
*/
static void
selclear(void)
{
struct selinfo *sip, *next;
selcluster_t *sc;
lwp_t *l;
kmutex_t *lock;
l = curlwp;
sc = l->l_selcluster;
lock = sc->sc_lock;
/*
* If the request was non-blocking, or we found events on the first
* descriptor, there will be no need to clear anything - avoid
* taking the lock.
*/
if (SLIST_EMPTY(&l->l_selwait)) {
return;
}
mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cluster == l->l_selcluster);
/*
* Read link to next selinfo record, if any.
* It's no longer safe to touch `sip' after clearing
* `sel_lwp', so ensure that the read of `sel_chain'
* completes before the clearing of sel_lwp becomes
* globally visible.
*/
next = SLIST_NEXT(sip, sel_chain);
/* Release the record for another named waiter to use. */
atomic_store_release(&sip->sel_lwp, NULL);
}
mutex_spin_exit(lock);
}
/*
* Initialize the select/poll system calls. Called once for each
* CPU in the system, as they are attached.
*/
void
selsysinit(struct cpu_info *ci)
{
selcluster_t *sc;
u_int index;
/* If already a cluster in place for this bit, re-use. */
index = cpu_index(ci) & SELCLUSTERMASK;
sc = selcluster[index];
if (sc == NULL) {
sc = kmem_alloc(roundup2(sizeof(selcluster_t),
coherency_unit) + coherency_unit, KM_SLEEP);
sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
sleepq_init(&sc->sc_sleepq);
sc->sc_ncoll = 0;
sc->sc_mask = __BIT(index);
selcluster[index] = sc;
}
ci->ci_data.cpu_selcluster = sc;
}
/*
* Initialize a selinfo record.
*/
void
selinit(struct selinfo *sip)
{
memset(sip, 0, sizeof(*sip));
klist_init(&sip->sel_klist);
}
/*
* Destroy a selinfo record. The owning object must not gain new
* references while this is in progress: all activity on the record
* must be stopped.
*
* Concurrency issues: we only need guard against a call to selclear()
* by a thread exiting sel_do_scan(). The caller has prevented further
* references being made to the selinfo record via selrecord(), and it
* will not call selnotify() again.
*/
void
seldestroy(struct selinfo *sip)
{
selcluster_t *sc;
kmutex_t *lock;
lwp_t *l;
klist_fini(&sip->sel_klist);
if (sip->sel_lwp == NULL)
return;
/*
* Lock out selclear(). The selcluster pointer can't change while
* we are here since it is only ever changed in selrecord(),
* and that will not be entered again for this record because
* it is dying.
*/
KASSERT(sip->sel_cluster != NULL);
sc = sip->sel_cluster;
lock = sc->sc_lock;
mutex_spin_enter(lock);
if ((l = sip->sel_lwp) != NULL) {
/*
* This should rarely happen, so although SLIST_REMOVE()
* is slow, using it here is not a problem.
*/
		KASSERT(l->l_selcluster == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
sip->sel_lwp = NULL;
}
mutex_spin_exit(lock);
}
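/*
 * Illustrative lifecycle sketch only: selinit() at attach time,
 * seldestroy() at detach time once no further selrecord()/selnotify()
 * calls can reach the record.  example_softc and IPL_SOFTNET are
 * assumptions for the example.
 */
#if 0
static void
example_attach(struct example_softc *sc)
{

	mutex_init(&sc->sc_objlock, MUTEX_DEFAULT, IPL_SOFTNET);
	selinit(&sc->sc_rsel);
}

static void
example_detach(struct example_softc *sc)
{

	/* All readers and notifiers must be stopped by this point. */
	seldestroy(&sc->sc_rsel);
	mutex_destroy(&sc->sc_objlock);
}
#endif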
/*
* System control nodes.
*/
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "direct_select",
SYSCTL_DESCR("Enable/disable direct select (for testing)"),
NULL, 0, &direct_select, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
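/*
 * The node above is created under CTL_KERN, so it should surface as
 * kern.direct_select.  Assuming that name, it can be toggled with
 * "sysctl -w kern.direct_select=1", or programmatically from userland
 * as sketched below (illustration only).
 */
#if 0
#include <sys/sysctl.h>

static int
example_enable_direct_select(void)
{
	int one = 1;

	return sysctlbyname("kern.direct_select", NULL, NULL,
	    &one, sizeof(one));
}
#endif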
/* $NetBSD: mount.h,v 1.16 2024/01/19 18:39:15 christos Exp $ */
/*
* Copyright (c) 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)mount.h 8.21 (Berkeley) 5/20/95
*/
#ifndef _COMPAT_SYS_MOUNT_H_
#define _COMPAT_SYS_MOUNT_H_
#ifdef _KERNEL_OPT
#include "opt_compat_43.h"
#endif
#define MFSNAMELEN 16
struct statfs12 {
short f_type; /* type of file system */
u_short f_oflags; /* deprecated copy of mount flags */
long f_bsize; /* fundamental file system block size */
long f_iosize; /* optimal transfer block size */
long f_blocks; /* total data blocks in file system */
long f_bfree; /* free blocks in fs */
long f_bavail; /* free blocks avail to non-superuser */
long f_files; /* total file nodes in file system */
long f_ffree; /* free file nodes in fs */
fsid_t f_fsid; /* file system id */
uid_t f_owner; /* user that mounted the file system */
long f_flags; /* copy of mount flags */
long f_syncwrites; /* count of sync writes since mount */
long f_asyncwrites; /* count of async writes since mount */
long f_spare[1]; /* spare for later */
char f_fstypename[MFSNAMELEN]; /* fs type name */
char f_mntonname[MNAMELEN]; /* directory on which mounted */
char f_mntfromname[MNAMELEN]; /* mounted file system */
};
#ifndef _KERNEL
#include <string.h>
#endif
/*
* Operations supported on mounted file system.
*/
/*
* Convert from a new statvfs to an old statfs structure.
*/
#define MOUNTNO_NONE 0
#define MOUNTNO_UFS 1 /* UNIX "Fast" Filesystem */
#define MOUNTNO_NFS 2 /* Network Filesystem */
#define MOUNTNO_MFS 3 /* Memory Filesystem */
#define MOUNTNO_MSDOS 4 /* MSDOS Filesystem */
#define MOUNTNO_CD9660 5 /* iso9660 cdrom */
#define MOUNTNO_FDESC 6 /* /dev/fd filesystem */
#define MOUNTNO_KERNFS 7 /* kernel variable filesystem */
#define MOUNTNO_DEVFS 8 /* device node filesystem */
#define MOUNTNO_AFS 9 /* AFS 3.x */
static const struct {
const char *name;
const int value;
} __nv[] = {
{ MOUNT_UFS, MOUNTNO_UFS },
{ MOUNT_NFS, MOUNTNO_NFS },
{ MOUNT_MFS, MOUNTNO_MFS },
{ MOUNT_MSDOS, MOUNTNO_MSDOS },
{ MOUNT_CD9660, MOUNTNO_CD9660 },
{ MOUNT_FDESC, MOUNTNO_FDESC },
{ MOUNT_KERNFS, MOUNTNO_KERNFS },
{ MOUNT_AFS, MOUNTNO_AFS },
};
static __inline void
statvfs_to_statfs12(const struct statvfs *fs, struct statfs12 *s12)
{
size_t i = 0;
memset(s12, 0, sizeof(*s12));
s12->f_type = 0;
s12->f_oflags = (short)fs->f_flag;
for (i = 0; i < sizeof(__nv) / sizeof(__nv[0]); i++) {
if (strcmp(__nv[i].name, fs->f_fstypename) == 0) {
s12->f_type = __nv[i].value;
break;
}
}
#define __STATFSCLAMP(a) (long)(((a) & ~LONG_MAX) ? LONG_MAX : (a))
s12->f_bsize = __STATFSCLAMP(fs->f_frsize);
s12->f_iosize = __STATFSCLAMP(fs->f_iosize);
s12->f_blocks = __STATFSCLAMP(fs->f_blocks);
s12->f_bfree = __STATFSCLAMP(fs->f_bfree);
if (fs->f_bfree > fs->f_bresvd)
s12->f_bavail = __STATFSCLAMP(fs->f_bfree - fs->f_bresvd);
else
s12->f_bavail = -__STATFSCLAMP(fs->f_bresvd - fs->f_bfree);
s12->f_files = __STATFSCLAMP(fs->f_files);
s12->f_ffree = __STATFSCLAMP(fs->f_ffree);
s12->f_fsid = fs->f_fsidx;
s12->f_owner = fs->f_owner;
s12->f_flags = (long)fs->f_flag;
s12->f_syncwrites = __STATFSCLAMP(fs->f_syncwrites);
s12->f_asyncwrites = __STATFSCLAMP(fs->f_asyncwrites);
memcpy(s12->f_fstypename, fs->f_fstypename, sizeof(s12->f_fstypename));
memcpy(s12->f_mntonname, fs->f_mntonname, sizeof(s12->f_mntonname));
memcpy(s12->f_mntfromname, fs->f_mntfromname,
sizeof(s12->f_mntfromname));
}
#ifdef _KERNEL
static __inline int
statvfs_to_statfs12_copy(const void *vs, void *vs12, size_t l)
{
struct statfs12 *s12 = kmem_zalloc(sizeof(*s12), KM_SLEEP);
int error;
statvfs_to_statfs12(vs, s12);
error = copyout(s12, vs12, sizeof(*s12));
kmem_free(s12, sizeof(*s12));
return error;
}
/*
* Filesystem configuration information. Not used by NetBSD, but
* defined here to provide a compatible sysctl interface to Lite2.
*/
struct vfsconf {
struct vfsops *vfc_vfsops; /* filesystem operations vector */
char vfc_name[MFSNAMELEN]; /* filesystem type name */
int vfc_typenum; /* historic filesystem type number */
int vfc_refcount; /* number mounted of this type */
int vfc_flags; /* permanent flags */
int (*vfc_mountroot)(void); /* if != NULL, routine to mount root */
struct vfsconf *vfc_next; /* next in list */
};
/* Old, fixed-size filehandle structures (used up to and including 3.x) */
struct compat_30_fid {
unsigned short fid_len;
unsigned short fid_reserved;
char fid_data[16];
};
struct compat_30_fhandle {
fsid_t fh_fsid;
struct compat_30_fid fh_fid;
};
#else
__BEGIN_DECLS
int __compat_fstatfs(int, struct statfs12 *) __dso_hidden;
int __compat_getfsstat(struct statfs12 *, long, int) __dso_hidden;
int __compat_statfs(const char *, struct statfs12 *) __dso_hidden;
int __compat_getmntinfo(struct statfs12 **, int) __dso_hidden;
#if defined(_NETBSD_SOURCE)
struct compat_30_fhandle;
int __compat_fhstatfs(const struct compat_30_fhandle *, struct statfs12 *)
__dso_hidden;
struct stat13;
int __compat_fhstat(const struct compat_30_fhandle *, struct stat13 *)
__dso_hidden;
struct stat30;
int __compat___fhstat30(const struct compat_30_fhandle *, struct stat30 *)
__dso_hidden;
int __compat___fhstat40(const void *, size_t, struct stat30 *) __dso_hidden;
struct stat;
int __fhstat50(const void *, size_t, struct stat *);
int __fhopen40(const void *, size_t, int);
int fhopen(const struct compat_30_fhandle *, int);
int __getfh30(const char *, void*, size_t *);
int getfh(const char *path, struct compat_30_fhandle *fhp);
int mount(const char *, const char *, int, void *);
int __mount50(const char *, const char *, int, void *, size_t);
#endif /* _NETBSD_SOURCE */
__END_DECLS
#endif /* _KERNEL */
#endif /* !_COMPAT_SYS_MOUNT_H_ */
/* $NetBSD: vfs_syscalls_90.c,v 1.1 2019/09/22 22:59:38 christos Exp $ */
/*-
* Copyright (c) 2005, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_90.c,v 1.1 2019/09/22 22:59:38 christos Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/socketvar.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/dirent.h>
#include <sys/malloc.h>
#include <sys/kauth.h>
#include <sys/vfs_syscalls.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/common/compat_mod.h>
#include <compat/common/compat_util.h>
#include <compat/sys/statvfs.h>
static const struct syscall_package vfs_syscalls_90_syscalls[] = {
{ SYS_compat_90_getvfsstat, 0, (sy_call_t *)compat_90_sys_getvfsstat },
{ SYS_compat_90_statvfs1, 0, (sy_call_t *)compat_90_sys_statvfs1 },
{ SYS_compat_90_fstatvfs1, 0, (sy_call_t *)compat_90_sys_fstatvfs1 },
{ SYS_compat_90_fhstatvfs1, 0, (sy_call_t *)compat_90_sys_fhstatvfs1 },
{ 0,0, NULL }
};
int
compat_90_sys_getvfsstat(struct lwp *l,
const struct compat_90_sys_getvfsstat_args *uap, register_t *retval)
{
/* {
syscallarg(struct statvfs90 *) buf;
syscallarg(size_t) bufsize;
syscallarg(int) flags;
} */
return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
SCARG(uap, flags), statvfs_to_statvfs90_copy,
sizeof(struct statvfs90), retval);
}
int
compat_90_sys_statvfs1(struct lwp *l,
const struct compat_90_sys_statvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
if (!error)
error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
compat_90_sys_fstatvfs1(struct lwp *l,
const struct compat_90_sys_fstatvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
if (!error)
error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
compat_90_sys_fhstatvfs1(struct lwp *l,
const struct compat_90_sys_fhstatvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(const void *) fhp;
syscallarg(size_t) fh_size;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size),
sb, SCARG(uap, flags));
	if (!error)
		error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
vfs_syscalls_90_init(void)
{
return syscall_establish(NULL, vfs_syscalls_90_syscalls);
}
int
vfs_syscalls_90_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_90_syscalls);
}
/* $NetBSD: entpool.c,v 1.1 2020/04/30 03:28:19 riastradh Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Entropy pool (`reseedable pseudorandom number generator') based on a
* sponge duplex, following the design described and analyzed in
*
* Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van
* Assche, `Sponge-Based Pseudo-Random Number Generators', in
* Stefan Mangard and François-Xavier Standaert, eds.,
* Cryptographic Hardware and Embedded Systems—CHES 2010, Springer
* LNCS 6225, pp. 33–47.
* https://link.springer.com/chapter/10.1007/978-3-642-15031-9_3
* https://keccak.team/files/SpongePRNG.pdf
*
* Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van
* Assche, `Duplexing the Sponge: Single-Pass Authenticated
* Encryption and Other Applications', in Ali Miri and Serge
* Vaudenay, eds., Selected Areas in Cryptography—SAC 2011,
* Springer LNCS 7118, pp. 320–337.
* https://link.springer.com/chapter/10.1007/978-3-642-28496-0_19
* https://keccak.team/files/SpongeDuplex.pdf
*
* We make the following tweaks that don't affect security:
*
 * - Samples are prefixed with their length, encoded in a 7-bit
 *   variable-length form.
* The encoding is still injective, so the security theorems
* continue to apply.
*
* - Output is not buffered -- callers should draw 32 bytes and
* expand with a stream cipher. In effect, every output draws
* the full rate, and we just discard whatever the caller didn't
* ask for; the impact is only on performance, not security.
*
* On top of the underlying sponge state, an entropy pool maintains an
* integer i in [0, RATE-1] indicating where to write the next byte in
* the input buffer. Zeroing an entropy pool initializes it.
*/
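/*
 * Illustrative usage sketch only (not part of this file): zero a pool
 * to initialize it, feed samples in, then draw a 32-byte seed and
 * expand it with a stream cipher chosen by the caller.  The function
 * below is an assumption made for the example.
 */
#if 0
static void
example_entpool_use(const void *sample, size_t samplelen)
{
	struct entpool P;
	uint8_t seed[32];

	memset(&P, 0, sizeof P);		/* zeroing initializes */
	entpool_enter(&P, sample, samplelen);	/* absorb a sample */
	entpool_extract(&P, seed, sizeof seed);	/* draw a seed */
	/* ... key a stream cipher with seed to generate output ... */
	explicit_memset(seed, 0, sizeof seed);	/* do not leak the seed */
}
#endif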
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: entpool.c,v 1.1 2020/04/30 03:28:19 riastradh Exp $");
#endif
#include "entpool.h"
#include ENTPOOL_HEADER
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/types.h>
#include <lib/libkern/libkern.h>
#define ASSERT KASSERT
#else
#include <sys/cdefs.h>
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#define ASSERT assert
#define CTASSERT __CTASSERT
#endif
#define secret /* must not use in variable-time operations; should zero */
#define arraycount(A) (sizeof(A)/sizeof((A)[0]))
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define RATE ENTPOOL_RATE
/*
* stir(P)
*
* Internal subroutine to apply the sponge permutation to the
* state in P. Resets P->i to 0 to indicate that the input buffer
* is empty.
*/
static void
stir(struct entpool *P)
{
size_t i;
/*
* Switch to the permutation's byte order, if necessary, apply
	 * permutation, and then switch back.  This way we can move data
	 * in and out byte by byte, but get the same answers out of test
* vectors.
*/
for (i = 0; i < arraycount(P->s.w); i++)
P->s.w[i] = ENTPOOL_WTOH(P->s.w[i]);
ENTPOOL_PERMUTE(P->s.w);
for (i = 0; i < arraycount(P->s.w); i++)
P->s.w[i] = ENTPOOL_HTOW(P->s.w[i]);
/* Reset the input buffer. */
P->i = 0;
}
/*
* entpool_enter(P, buf, len)
*
* Enter len bytes from buf into the entropy pool P, stirring as
* needed. Corresponds to P.feed in the paper.
*/
void
entpool_enter(struct entpool *P, const void *buf, size_t len)
{
const uint8_t *p = buf;
size_t n = len, n1 = n;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* Encode the length, stirring as needed. */
while (n1) {
if (P->i == RATE-1)
			stir(P);
		ASSERT(P->i < RATE-1);
P->s.u8[P->i++] ^= (n1 >= 0x80 ? 0x80 : 0) | (n1 & 0x7f);
n1 >>= 7;
}
/* Enter the sample, stirring as needed. */
while (n --> 0) {
if (P->i == RATE-1)
			stir(P);
		ASSERT(P->i < RATE-1);
P->s.u8[P->i++] ^= *p++;
}
/* If we filled the input buffer exactly, stir once more. */
if (P->i == RATE-1)
		stir(P);
	ASSERT(P->i < RATE-1);
}
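/*
 * Worked example of the length encoding used above: the length is
 * absorbed as a little-endian base-128 byte stream, with the top bit
 * of each byte flagging that more bytes follow.  A 1-byte sample
 * encodes its length as 0x01; a 300-byte sample encodes it as
 * 0xac 0x02 (300 = 0x2c + 2*128).  The standalone helper below mirrors
 * the loop above and exists only for illustration.
 */
#if 0
static size_t
example_encode_len(uint8_t *out, size_t n1)
{
	size_t k = 0;

	while (n1) {
		out[k++] = (n1 >= 0x80 ? 0x80 : 0) | (n1 & 0x7f);
		n1 >>= 7;
	}
	return k;
}
#endif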
/*
* entpool_enter_nostir(P, buf, len)
*
* Enter as many bytes as possible, up to len, from buf into the
* entropy pool P. Roughly corresponds to P.feed in the paper,
* but we stop if we would have run the permutation.
*
 *	Return true if the sample was consumed in its entirety, or false
 *	if the sample was truncated, in which case the caller should
 *	arrange to call entpool_stir when it is next convenient to do so.
*
* This function is cheap -- it only xors the input into the
* state, and never calls the underlying permutation, but it may
* truncate samples.
*/
bool
entpool_enter_nostir(struct entpool *P, const void *buf, size_t len)
{
const uint8_t *p = buf;
size_t n0, n;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If the input buffer is full, fail. */
if (P->i == RATE-1)
return false;
ASSERT(P->i < RATE-1);
/*
* Truncate the sample and enter it with 1-byte length encoding
* -- don't bother with variable-length encoding, not worth the
* trouble.
*/
n = n0 = MIN(127, MIN(len, RATE-1 - P->i - 1));
P->s.u8[P->i++] ^= n;
while (n --> 0)
P->s.u8[P->i++] ^= *p++;
/* Can't guarantee anything better than 0 <= i <= RATE-1. */
ASSERT(P->i <= RATE-1);
/* Return true if all done, false if truncated and in need of stir. */
return (n0 == len);
}
/*
* entpool_stir(P)
*
 *	Stir the entropy pool after entpool_enter_nostir fails.  If it
 *	has already been stirred, this has no effect.
*/
void
entpool_stir(struct entpool *P)
{
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If the input buffer is full, stir. */
if (P->i == RATE-1)
stir(P);
ASSERT(P->i < RATE-1);
}
/*
* entpool_extract(P, buf, len)
*
* Extract len bytes from the entropy pool P into buf.
* Corresponds to iterating P.fetch/P.forget in the paper.
* (Feeding the output back in -- as P.forget does -- is the same
* as zeroing what we just read out.)
*/
void
entpool_extract(struct entpool *P, secret void *buf, size_t len)
{
uint8_t *p = buf;
size_t n = len;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If input buffer is not empty, stir. */
if (P->i != 0)
stir(P);
ASSERT(P->i == 0);
/*
* Copy out and zero (RATE-1)-sized chunks at a time, stirring
* with a bit set to distinguish this from inputs.
*/
while (n >= RATE-1) {
memcpy(p, P->s.u8, RATE-1);
memset(P->s.u8, 0, RATE-1);
P->s.u8[RATE-1] ^= 0x80;
stir(P);
p += RATE-1;
n -= RATE-1;
}
/*
* If there's anything left, copy out a partial rate's worth
* and zero the entire rate's worth, stirring with a bit set to
* distinguish this from inputs.
*/
if (n) {
ASSERT(n < RATE-1);
memcpy(p, P->s.u8, n); /* Copy part of it. */
memset(P->s.u8, 0, RATE-1); /* Zero all of it. */
P->s.u8[RATE-1] ^= 0x80;
stir(P);
}
}
/*
* Known-answer tests
*/
#if ENTPOOL_SMALL
#define KATLEN 15
/* Gimli */
static const uint8_t known_answers[][KATLEN] = {
[0] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[1] = {
0x74,0x15,0x16,0x49,0x31,0x07,0x77,0xa1,
0x3b,0x4d,0x78,0xc6,0x5d,0xef,0x87,
},
[2] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[3] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[4] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[5] = {
0xa9,0x3c,0x3c,0xac,0x5f,0x6d,0x80,0xdc,
0x33,0x0c,0xb2,0xe3,0xdd,0x55,0x31,
},
[6] = {
0x2e,0x69,0x1a,0x2a,0x2d,0x09,0xd4,0x5e,
0x49,0xcc,0x8c,0xb2,0x0b,0xcc,0x42,
},
[7] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[8] = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,
},
[9] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[10] = {
0x2e,0x69,0x1a,0x2a,0x2d,0x09,0xd4,0x5e,
0x49,0xcc,0x8c,0xb2,0x0b,0xcc,0x42,
},
[11] = {
0x6f,0xfd,0xd2,0x29,0x78,0x46,0xc0,0x7d,
0xc7,0xf2,0x0a,0x2b,0x72,0xd6,0xc6,
},
[12] = {
0x86,0xf0,0xc1,0xf9,0x95,0x0f,0xc9,0x12,
0xde,0x38,0x39,0x10,0x1f,0x8c,0xc4,
},
};
#else /* !ENTPOOL_SMALL */
#define KATLEN 16
/* Keccak-p[1600, 24] */
static const uint8_t known_answers[][KATLEN] = {
[0] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[1] = {
0x57,0x49,0x6e,0x28,0x7f,0xaa,0xee,0x6c,
0xa8,0xb0,0xf5,0x0b,0x87,0xae,0xd6,0xd6,
},
[2] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[3] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[4] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[5] = {
0x95,0x23,0x77,0xe4,0x84,0xeb,0xaa,0x2e,
0x6a,0x99,0xc2,0x52,0x06,0x6d,0xdf,0xea,
},
[6] = {
0x8c,0xdd,0x1b,0xaf,0x0e,0xf6,0xe9,0x1d,
0x51,0x33,0x68,0x38,0x8d,0xad,0x55,0x84,
},
[7] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[8] = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
},
[9] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[10] = {
0x8c,0xdd,0x1b,0xaf,0x0e,0xf6,0xe9,0x1d,
0x51,0x33,0x68,0x38,0x8d,0xad,0x55,0x84,
},
[11] = {
0xf6,0xc1,0x14,0xbb,0x13,0x0a,0xaf,0xed,
0xca,0x0b,0x35,0x2c,0xf1,0x2b,0x1a,0x85,
},
[12] = {
0xf9,0x4b,0x05,0xd1,0x8b,0xcd,0xb3,0xd0,
0x77,0x27,0xfe,0x46,0xf9,0x33,0xb2,0xa2,
},
};
#endif
#define KAT_BEGIN(P, n) memset(P, 0, sizeof(*(P)))
#define KAT_ERROR() return -1
#define KAT_END(P, n) do \
{ \
uint8_t KAT_ACTUAL[KATLEN]; \
entpool_extract(P, KAT_ACTUAL, KATLEN); \
if (memcmp(KAT_ACTUAL, known_answers[n], KATLEN)) \
return -1; \
} while (0)
int
entpool_selftest(void)
{
struct entpool pool, *P = &pool;
uint8_t sample[1] = {0xff};
uint8_t scratch[RATE];
const uint8_t zero[RATE] = {0};
/* Test entpool_enter with empty buffer. */
KAT_BEGIN(P, 0);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 0);
/* Test entpool_enter with partial buffer. */
KAT_BEGIN(P, 1);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
entpool_enter(P, zero, RATE-3);
#else
entpool_enter(P, zero, RATE-4);
#endif
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 1);
/* Test entpool_enter with full buffer. */
KAT_BEGIN(P, 2);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 2);
/* Test entpool_enter with full buffer after stir. */
KAT_BEGIN(P, 3);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_stir(P);
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 3);
/* Test entpool_enter_nostir with empty buffer. */
KAT_BEGIN(P, 4);
entpool_stir(P); /* noop */
if (!entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P); /* noop */
KAT_END(P, 4);
/* Test entpool_enter_nostir with partial buffer. */
KAT_BEGIN(P, 5);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
entpool_enter(P, zero, RATE-3);
#else
entpool_enter(P, zero, RATE-4);
#endif
entpool_stir(P); /* noop */
if (entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P);
KAT_END(P, 5);
/* Test entpool_enter_nostir with full buffer. */
KAT_BEGIN(P, 6);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
if (entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P);
KAT_END(P, 6);
/* Test entpool_enter_nostir with full buffer after stir. */
KAT_BEGIN(P, 7);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_stir(P);
if (!entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P); /* noop */
KAT_END(P, 7);
/* Test entpool_extract with empty input buffer. */
KAT_BEGIN(P, 8);
entpool_stir(P); /* noop */
KAT_END(P, 8);
/* Test entpool_extract with nonempty input buffer. */
KAT_BEGIN(P, 9);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 9);
/* Test entpool_extract with full input buffer. */
KAT_BEGIN(P, 10);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
KAT_END(P, 10);
/* Test entpool_extract with iterated output. */
KAT_BEGIN(P, 11);
entpool_stir(P); /* noop */
entpool_extract(P, scratch, RATE-1 + 1);
entpool_stir(P); /* noop */
KAT_END(P, 11);
/* Test extract, enter, extract. */
KAT_BEGIN(P, 12);
entpool_stir(P); /* noop */
entpool_extract(P, scratch, 1);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 12);
return 0;
}
#if ENTPOOL_TEST
int
main(void)
{
return entpool_selftest();
}
#endif
/*
* Known-answer test generation
*
* This generates the known-answer test vectors from explicitly
* specified duplex inputs that correspond to what entpool_enter
* &c. induce, to confirm the encoding of inputs works as
* intended.
*/
#if ENTPOOL_GENKAT
#include <stdio.h>
struct event {
enum { IN, OUT, STOP } t;
uint8_t b[RATE-1];
};
/* Cases correspond to entpool_selftest above. */
static const struct event *const cases[] = {
[0] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[1] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-3, [RATE-2] = 1}},
#else
{IN, {0x80|((RATE-4)&0x7f), (RATE-4)>>7, [RATE-2] = 1}},
#endif
{IN, {0xff}},
{STOP, {0}},
},
[2] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[3] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[4] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[5] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-3, [RATE-2] = 0 /* truncated length */}},
#else
{IN, {0x80|((RATE-4)&0x7f), (RATE-4)>>7,
[RATE-2] = 0 /* truncated length */}},
#endif
{STOP, {0}},
},
[6] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{STOP, {0}},
},
[7] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[8] = (const struct event[]) {
{STOP, {0}},
},
[9] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[10] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{STOP, {0}},
},
[11] = (const struct event[]) {
{OUT, {0}},
{OUT, {0}},
{STOP, {0}},
},
[12] = (const struct event[]) {
{OUT, {0}},
{IN, {1, 0xff}},
{STOP, {0}},
},
};
static void
compute(uint8_t output[KATLEN], const struct event *events)
{
union {
uint8_t b[ENTPOOL_SIZE];
ENTPOOL_WORD w[ENTPOOL_SIZE/sizeof(ENTPOOL_WORD)];
} u;
unsigned i, j, k;
memset(&u.b, 0, sizeof u.b);
for (i = 0;; i++) {
if (events[i].t == STOP)
break;
for (j = 0; j < sizeof(events[i].b); j++)
u.b[j] ^= events[i].b[j];
if (events[i].t == OUT) {
memset(u.b, 0, RATE-1);
u.b[RATE-1] ^= 0x80;
}
for (k = 0; k < arraycount(u.w); k++)
u.w[k] = ENTPOOL_WTOH(u.w[k]);
ENTPOOL_PERMUTE(u.w);
for (k = 0; k < arraycount(u.w); k++)
u.w[k] = ENTPOOL_HTOW(u.w[k]);
}
for (j = 0; j < KATLEN; j++)
output[j] = u.b[j];
}
int
main(void)
{
uint8_t output[KATLEN];
unsigned i, j;
printf("static const uint8_t known_answers[][KATLEN] = {\n");
for (i = 0; i < arraycount(cases); i++) {
printf("\t[%u] = {\n", i);
compute(output, cases[i]);
for (j = 0; j < KATLEN; j++) {
if (j % 8 == 0)
printf("\t\t");
printf("0x%02hhx,", output[j]);
if (j % 8 == 7)
printf("\n");
}
if ((KATLEN % 8) != 0)
printf("\n");
printf("\t},\n");
}
printf("};\n");
fflush(stdout);
return ferror(stdout);
}
#endif
/* $NetBSD: if.c,v 1.529 2023/02/24 11:02:45 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by William Studenmund and Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.c 8.5 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if.c,v 1.529 2023/02/24 11:02:45 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_atalk.h"
#include "opt_wlan.h"
#include "opt_net_mpsafe.h"
#include "opt_mrouting.h"
#endif
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/xcall.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/module_hook.h>
#include <sys/compat_stub.h>
#include <sys/msan.h>
#include <sys/hook.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_ether.h>
#include <net/if_media.h>
#include <net80211/ieee80211.h>
#include <net80211/ieee80211_ioctl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <sys/module.h>
#ifdef NETATALK
#include <netatalk/at_extern.h>
#include <netatalk/at.h>
#endif
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_encap.h>
#include <net/bpf.h>
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif
#include "ether.h"
#include "bridge.h"
#if NBRIDGE > 0
#include <net/if_bridgevar.h>
#endif
#include "carp.h"
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
#include <compat/sys/sockio.h>
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
/*
 * XXX reusing (ifp)->if_snd.ifq_lock rather than having another spin mutex
* for each ifnet. It doesn't matter because:
* - if IFEF_MPSAFE is enabled, if_snd isn't used and lock contentions on
* ifq_lock don't happen
* - if IFEF_MPSAFE is disabled, there is no lock contention on ifq_lock
* because if_snd, if_link_state_change and if_link_state_change_process
* are all called with KERNEL_LOCK
*/
#define IF_LINK_STATE_CHANGE_LOCK(ifp) \
mutex_enter((ifp)->if_snd.ifq_lock)
#define IF_LINK_STATE_CHANGE_UNLOCK(ifp) \
mutex_exit((ifp)->if_snd.ifq_lock)
/*
* Global list of interfaces.
*/
/* DEPRECATED. Remove it once kvm(3) users have disappeared. */
struct ifnet_head ifnet_list;
struct pslist_head ifnet_pslist;
static ifnet_t ** ifindex2ifnet = NULL;
static u_int if_index = 1;
static size_t if_indexlim = 0;
static uint64_t index_gen;
/* Mutex to protect the above objects. */
kmutex_t ifnet_mtx __cacheline_aligned;
static struct psref_class *ifnet_psref_class __read_mostly;
static pserialize_t ifnet_psz;
static struct workqueue *ifnet_link_state_wq __read_mostly;
static struct workqueue *if_slowtimo_wq __read_mostly;
static kmutex_t if_clone_mtx;
struct ifnet *lo0ifp;
int ifqmaxlen = IFQ_MAXLEN;
struct psref_class *ifa_psref_class __read_mostly;
static int if_delroute_matcher(struct rtentry *, void *);
static bool if_is_unit(const char *);
static struct if_clone *if_clone_lookup(const char *, int *);
static LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners);
static int if_cloners_count;
/* Packet filtering hook for interfaces. */
pfil_head_t * if_pfil __read_mostly;
static kauth_listener_t if_listener;
static int doifioctl(struct socket *, u_long, void *, struct lwp *);
static void sysctl_sndq_setup(struct sysctllog **, const char *,
struct ifaltq *);
static void if_slowtimo_intr(void *);
static void if_slowtimo_work(struct work *, void *);
static int sysctl_if_watchdog(SYSCTLFN_PROTO);
static void sysctl_watchdog_setup(struct ifnet *);
static void if_attachdomain1(struct ifnet *);
static int ifconf(u_long, void *);
static int if_transmit(struct ifnet *, struct mbuf *);
static int if_clone_create(const char *);
static int if_clone_destroy(const char *);
static void if_link_state_change_work(struct work *, void *);
static void if_up_locked(struct ifnet *);
static void _if_down(struct ifnet *);
static void if_down_deactivated(struct ifnet *);
struct if_percpuq {
struct ifnet *ipq_ifp;
void *ipq_si;
struct percpu *ipq_ifqs; /* struct ifqueue */
};
static struct mbuf *if_percpuq_dequeue(struct if_percpuq *);
static void if_percpuq_drops(void *, void *, struct cpu_info *);
static int sysctl_percpuq_drops_handler(SYSCTLFN_PROTO);
static void sysctl_percpuq_setup(struct sysctllog **, const char *,
struct if_percpuq *);
struct if_deferred_start {
struct ifnet *ids_ifp;
void (*ids_if_start)(struct ifnet *);
void *ids_si;
};
static void if_deferred_start_softint(void *);
static void if_deferred_start_common(struct ifnet *);
static void if_deferred_start_destroy(struct ifnet *);
struct if_slowtimo_data {
kmutex_t isd_lock;
struct callout isd_ch;
struct work isd_work;
struct ifnet *isd_ifp;
bool isd_queued;
bool isd_dying;
bool isd_trigger;
};
/*
* Hook for if_vlan - needed by if_agr
*/
struct if_vlan_vlan_input_hook_t if_vlan_vlan_input_hook;
static void if_sysctl_setup(struct sysctllog **);
static int
if_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_network_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg1;
if (action != KAUTH_NETWORK_INTERFACE)
return result;
if ((req == KAUTH_REQ_NETWORK_INTERFACE_GET) ||
(req == KAUTH_REQ_NETWORK_INTERFACE_SET))
result = KAUTH_RESULT_ALLOW;
return result;
}
/*
* Network interface utility routines.
*
* Routines with ifa_ifwith* names take sockaddr *'s as
* parameters.
*/
void
ifinit(void)
{
#if (defined(INET) || defined(INET6))
encapinit();
#endif
if_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
if_listener_cb, NULL);
/* interfaces are available, inform socket code */
ifioctl = doifioctl;
}
/*
* XXX Initialization before configure().
* XXX hack to get pfil_add_hook working in autoconf.
*/
void
ifinit1(void)
{
int error __diagused;
#ifdef NET_MPSAFE
printf("NET_MPSAFE enabled\n");
#endif
mutex_init(&if_clone_mtx, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&ifnet_list);
mutex_init(&ifnet_mtx, MUTEX_DEFAULT, IPL_NONE);
ifnet_psz = pserialize_create();
ifnet_psref_class = psref_class_create("ifnet", IPL_SOFTNET);
ifa_psref_class = psref_class_create("ifa", IPL_SOFTNET);
error = workqueue_create(&ifnet_link_state_wq, "iflnkst",
if_link_state_change_work, NULL, PRI_SOFTNET, IPL_NET,
WQ_MPSAFE);
KASSERT(error == 0);
PSLIST_INIT(&ifnet_pslist);
error = workqueue_create(&if_slowtimo_wq, "ifwdog",
if_slowtimo_work, NULL, PRI_SOFTNET, IPL_SOFTCLOCK, WQ_MPSAFE);
KASSERTMSG(error == 0, "error=%d", error);
if_indexlim = 8;
if_pfil = pfil_head_create(PFIL_TYPE_IFNET, NULL);
KASSERT(if_pfil != NULL);
#if NETHER > 0 || defined(NETATALK) || defined(WLAN)
etherinit();
#endif
}
/* XXX must be after domaininit() */
void
ifinit_post(void)
{
if_sysctl_setup(NULL);
}
ifnet_t *
if_alloc(u_char type)
{
return kmem_zalloc(sizeof(ifnet_t), KM_SLEEP);
}
void
if_free(ifnet_t *ifp)
{
kmem_free(ifp, sizeof(ifnet_t));
}
void
if_initname(struct ifnet *ifp, const char *name, int unit)
{
(void)snprintf(ifp->if_xname, sizeof(ifp->if_xname),
"%s%d", name, unit);
}
/*
* Null routines used while an interface is going away. These routines
* just return an error.
*/
int
if_nulloutput(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *so, const struct rtentry *rt)
{
return ENXIO;
}
void
if_nullinput(struct ifnet *ifp, struct mbuf *m)
{
/* Nothing. */
}
void
if_nullstart(struct ifnet *ifp)
{
/* Nothing. */
}
int
if_nulltransmit(struct ifnet *ifp, struct mbuf *m)
{
m_freem(m);
return ENXIO;
}
int
if_nullioctl(struct ifnet *ifp, u_long cmd, void *data)
{
return ENXIO;
}
int
if_nullinit(struct ifnet *ifp)
{
return ENXIO;
}
void
if_nullstop(struct ifnet *ifp, int disable)
{
/* Nothing. */
}
void
if_nullslowtimo(struct ifnet *ifp)
{
/* Nothing. */
}
void
if_nulldrain(struct ifnet *ifp)
{
/* Nothing. */
}
void
if_set_sadl(struct ifnet *ifp, const void *lla, u_char addrlen, bool factory)
{
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifp->if_addrlen = addrlen;
if_alloc_sadl(ifp);
ifa = ifp->if_dl;
sdl = satosdl(ifa->ifa_addr);
(void)sockaddr_dl_setaddr(sdl, sdl->sdl_len, lla, ifp->if_addrlen);
if (factory) {
KASSERT(ifp->if_hwdl == NULL);
ifp->if_hwdl = ifp->if_dl;
ifaref(ifp->if_hwdl);
}
/* TBD routing socket */
}
struct ifaddr *
if_dl_create(const struct ifnet *ifp, const struct sockaddr_dl **sdlp)
{
unsigned socksize, ifasize;
int addrlen, namelen;
struct sockaddr_dl *mask, *sdl;
struct ifaddr *ifa;
namelen = strlen(ifp->if_xname);
addrlen = ifp->if_addrlen;
socksize = roundup(sockaddr_dl_measure(namelen, addrlen),
sizeof(long));
ifasize = sizeof(*ifa) + 2 * socksize;
ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
sdl = (struct sockaddr_dl *)(ifa + 1);
mask = (struct sockaddr_dl *)(socksize + (char *)sdl);
sockaddr_dl_init(sdl, socksize, ifp->if_index, ifp->if_type,
ifp->if_xname, namelen, NULL, addrlen);
mask->sdl_family = AF_LINK;
mask->sdl_len = sockaddr_dl_measure(namelen, 0);
memset(&mask->sdl_data[0], 0xff, namelen);
ifa->ifa_rtrequest = link_rtrequest;
ifa->ifa_addr = (struct sockaddr *)sdl;
ifa->ifa_netmask = (struct sockaddr *)mask;
ifa_psref_init(ifa);
*sdlp = sdl;
return ifa;
}
static void
if_sadl_setrefs(struct ifnet *ifp, struct ifaddr *ifa)
{
const struct sockaddr_dl *sdl;
ifp->if_dl = ifa;
ifaref(ifa);
sdl = satosdl(ifa->ifa_addr);
ifp->if_sadl = sdl;
}
/*
* Allocate the link level name for the specified interface. This
* is an attachment helper. It must be called after ifp->if_addrlen
* is initialized, which may not be the case when if_attach() is
* called.
*/
void
if_alloc_sadl(struct ifnet *ifp)
{
struct ifaddr *ifa;
const struct sockaddr_dl *sdl;
/*
* If the interface already has a link name, release it
* now. This is useful for interfaces that can change
* link types, and thus switch link names often.
*/
if (ifp->if_sadl != NULL)
if_free_sadl(ifp, 0);
ifa = if_dl_create(ifp, &sdl);
ifa_insert(ifp, ifa);
if_sadl_setrefs(ifp, ifa);
}
static void
if_deactivate_sadl(struct ifnet *ifp)
{
struct ifaddr *ifa;
KASSERT(ifp->if_dl != NULL);
ifa = ifp->if_dl;
ifp->if_sadl = NULL;
ifp->if_dl = NULL;
ifafree(ifa);
}
static void
if_replace_sadl(struct ifnet *ifp, struct ifaddr *ifa)
{
struct ifaddr *old;
KASSERT(ifp->if_dl != NULL);
old = ifp->if_dl;
ifaref(ifa);
/* XXX Update if_dl and if_sadl atomically */
ifp->if_dl = ifa;
ifp->if_sadl = satosdl(ifa->ifa_addr);
ifafree(old);
}
void
if_activate_sadl(struct ifnet *ifp, struct ifaddr *ifa0,
const struct sockaddr_dl *sdl)
{
struct ifaddr *ifa;
const int bound = curlwp_bind();
KASSERT(ifa_held(ifa0));
const int s = splsoftnet();
if_replace_sadl(ifp, ifa0);
int ss = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
ifa_acquire(ifa, &psref);
pserialize_read_exit(ss);
rtinit(ifa, RTM_LLINFO_UPD, 0);
ss = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(ss);
splx(s);
curlwp_bindx(bound);
}
/*
* Free the link level name for the specified interface. This is
* a detach helper. This is called from if_detach().
*/
void
if_free_sadl(struct ifnet *ifp, int factory)
{
struct ifaddr *ifa;
if (factory && ifp->if_hwdl != NULL) {
ifa = ifp->if_hwdl;
ifp->if_hwdl = NULL;
ifafree(ifa);
}
ifa = ifp->if_dl;
if (ifa == NULL) {
KASSERT(ifp->if_sadl == NULL);
return;
}
KASSERT(ifp->if_sadl != NULL);
const int s = splsoftnet();
KASSERT(ifa->ifa_addr->sa_family == AF_LINK);
ifa_remove(ifp, ifa);
if_deactivate_sadl(ifp);
splx(s);
}
static void
if_getindex(ifnet_t *ifp)
{
bool hitlimit = false;
char xnamebuf[HOOKNAMSIZ];
ifp->if_index_gen = index_gen++;
snprintf(xnamebuf, sizeof(xnamebuf), "%s-lshk", ifp->if_xname);
ifp->if_linkstate_hooks = simplehook_create(IPL_NET,
xnamebuf);
ifp->if_index = if_index;
if (ifindex2ifnet == NULL) {
if_index++;
goto skip;
}
while (if_byindex(ifp->if_index)) {
/*
* If we hit USHRT_MAX, we skip back to 0 since
* there are a number of places where the value
* of if_index or if_index itself is compared
* to or stored in an unsigned short. By
* jumping back, we won't botch those assignments
* or comparisons.
*/
if (++if_index == 0) {
if_index = 1;
} else if (if_index == USHRT_MAX) {
/*
			 * However, if we have to jump back to
			 * zero *twice* without finding an empty
			 * slot in ifindex2ifnet[], then there
			 * are too many (>65535) interfaces.
*/
if (hitlimit)
panic("too many interfaces");
hitlimit = true;
if_index = 1;
}
ifp->if_index = if_index;
}
skip:
/*
	 * ifindex2ifnet is indexed by if_index.  Since if_index can
	 * grow dynamically, the array must grow with it.
*/
if (ifindex2ifnet == NULL || ifp->if_index >= if_indexlim) {
size_t m, n, oldlim;
void *q;
oldlim = if_indexlim;
while (ifp->if_index >= if_indexlim)
if_indexlim <<= 1;
/* grow ifindex2ifnet */
m = oldlim * sizeof(struct ifnet *);
n = if_indexlim * sizeof(struct ifnet *);
q = malloc(n, M_IFADDR, M_WAITOK | M_ZERO);
if (ifindex2ifnet != NULL) {
memcpy(q, ifindex2ifnet, m);
free(ifindex2ifnet, M_IFADDR);
}
ifindex2ifnet = (struct ifnet **)q;
}
ifindex2ifnet[ifp->if_index] = ifp;
}
/*
* Initialize an interface and assign an index for it.
*
* It must be called prior to a device specific attach routine
* (e.g., ether_ifattach and ieee80211_ifattach) or if_alloc_sadl,
* and be followed by if_register:
*
* if_initialize(ifp);
* ether_ifattach(ifp, enaddr);
* if_register(ifp);
*/
void
if_initialize(ifnet_t *ifp)
{
KASSERT(if_indexlim > 0);
TAILQ_INIT(&ifp->if_addrlist);
/*
* Link level name is allocated later by a separate call to
* if_alloc_sadl().
*/
if (ifp->if_snd.ifq_maxlen == 0)
ifp->if_snd.ifq_maxlen = ifqmaxlen;
ifp->if_broadcastaddr = 0; /* reliably crash if used uninitialized */
ifp->if_link_state = LINK_STATE_UNKNOWN;
ifp->if_link_queue = -1; /* all bits set, see link_state_change() */
ifp->if_link_scheduled = false;
ifp->if_capenable = 0;
ifp->if_csum_flags_tx = 0;
ifp->if_csum_flags_rx = 0;
#ifdef ALTQ
ifp->if_snd.altq_type = 0;
ifp->if_snd.altq_disc = NULL;
ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE;
ifp->if_snd.altq_tbr = NULL;
ifp->if_snd.altq_ifp = ifp;
#endif
IFQ_LOCK_INIT(&ifp->if_snd);
ifp->if_pfil = pfil_head_create(PFIL_TYPE_IFNET, ifp);
pfil_run_ifhooks(if_pfil, PFIL_IFNET_ATTACH, ifp);
IF_AFDATA_LOCK_INIT(ifp);
PSLIST_ENTRY_INIT(ifp, if_pslist_entry);
PSLIST_INIT(&ifp->if_addr_pslist);
psref_target_init(&ifp->if_psref, ifnet_psref_class);
ifp->if_ioctl_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&ifp->if_multiaddrs);
if_stats_init(ifp);
IFNET_GLOBAL_LOCK();
if_getindex(ifp);
IFNET_GLOBAL_UNLOCK();
}
/*
* Register an interface to the list of "active" interfaces.
*/
void
if_register(ifnet_t *ifp)
{
/*
* If the driver has not supplied its own if_ioctl or if_stop,
* then supply the default.
*/
if (ifp->if_ioctl == NULL)
ifp->if_ioctl = ifioctl_common;
if (ifp->if_stop == NULL)
ifp->if_stop = if_nullstop;
sysctl_sndq_setup(&ifp->if_sysctl_log, ifp->if_xname, &ifp->if_snd);
if (!STAILQ_EMPTY(&domains))
if_attachdomain1(ifp);
/* Announce the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
if (ifp->if_slowtimo != NULL) {
struct if_slowtimo_data *isd;
isd = kmem_zalloc(sizeof(*isd), KM_SLEEP);
mutex_init(&isd->isd_lock, MUTEX_DEFAULT, IPL_SOFTCLOCK);
callout_init(&isd->isd_ch, CALLOUT_MPSAFE);
callout_setfunc(&isd->isd_ch, if_slowtimo_intr, ifp);
isd->isd_ifp = ifp;
ifp->if_slowtimo_data = isd;
if_slowtimo_intr(ifp);
sysctl_watchdog_setup(ifp);
}
if (ifp->if_transmit == NULL || ifp->if_transmit == if_nulltransmit)
ifp->if_transmit = if_transmit;
IFNET_GLOBAL_LOCK();
TAILQ_INSERT_TAIL(&ifnet_list, ifp, if_list);
IFNET_WRITER_INSERT_TAIL(ifp);
IFNET_GLOBAL_UNLOCK();
}
/*
* The if_percpuq framework
*
* It allows network device drivers to execute the network stack
* in softint (so called softint-based if_input). It utilizes
* softint and percpu ifqueue. It doesn't distribute any packets
* between CPUs, unlike pktqueue(9).
*
 * Currently we support two options for device drivers to apply the framework:
 * - Use it implicitly with fewer changes
 *   - If you use if_attach in the driver's _attach function and if_input in
 *     the driver's Rx interrupt handler, a packet is queued and a softint
 *     handles the packet implicitly
 * - Use it explicitly in each driver (recommended)
 *   - You can use if_percpuq_* directly in your driver
 *   - In this case, you need to allocate struct if_percpuq in the driver's
 *     softc
 *   - See wm(4) as a reference implementation
*/
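/*
 * A minimal usage sketch of the explicit option, assuming a hypothetical
 * driver softc field sc_ipq (not taken from any real driver):
 *
 *	sc->sc_ipq = if_percpuq_create(ifp);	(in the attach routine,
 *						 after if_initialize)
 *	if_percpuq_enqueue(sc->sc_ipq, m);	(in the Rx interrupt handler,
 *						 instead of if_input)
 *	if_percpuq_destroy(sc->sc_ipq);		(in the detach routine)
 */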
static void
if_percpuq_softint(void *arg)
{
struct if_percpuq *ipq = arg;
struct ifnet *ifp = ipq->ipq_ifp;
struct mbuf *m;
while ((m = if_percpuq_dequeue(ipq)) != NULL) {
if_statinc(ifp, if_ipackets);
bpf_mtap(ifp, m, BPF_D_IN);
ifp->_if_input(ifp, m);
}
}
static void
if_percpuq_init_ifq(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
memset(ifq, 0, sizeof(*ifq));
ifq->ifq_maxlen = IFQ_MAXLEN;
}
struct if_percpuq *
if_percpuq_create(struct ifnet *ifp)
{
struct if_percpuq *ipq;
u_int flags = SOFTINT_NET;
flags |= if_is_mpsafe(ifp) ? SOFTINT_MPSAFE : 0;
ipq = kmem_zalloc(sizeof(*ipq), KM_SLEEP);
ipq->ipq_ifp = ifp;
ipq->ipq_si = softint_establish(flags, if_percpuq_softint, ipq);
ipq->ipq_ifqs = percpu_alloc(sizeof(struct ifqueue));
percpu_foreach(ipq->ipq_ifqs, &if_percpuq_init_ifq, NULL);
sysctl_percpuq_setup(&ifp->if_sysctl_log, ifp->if_xname, ipq);
return ipq;
}
static struct mbuf *
if_percpuq_dequeue(struct if_percpuq *ipq)
{
struct mbuf *m;
struct ifqueue *ifq;
const int s = splnet();
ifq = percpu_getref(ipq->ipq_ifqs);
IF_DEQUEUE(ifq, m);
percpu_putref(ipq->ipq_ifqs);
splx(s);
return m;
}
static void
if_percpuq_purge_ifq(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
IF_PURGE(ifq);
}
void
if_percpuq_destroy(struct if_percpuq *ipq)
{
/* if_detach may already destroy it */
if (ipq == NULL)
return;
softint_disestablish(ipq->ipq_si);
percpu_foreach(ipq->ipq_ifqs, &if_percpuq_purge_ifq, NULL);
percpu_free(ipq->ipq_ifqs, sizeof(struct ifqueue));
kmem_free(ipq, sizeof(*ipq));
}
void
if_percpuq_enqueue(struct if_percpuq *ipq, struct mbuf *m)
{
struct ifqueue *ifq;
KASSERT(ipq != NULL);
const int s = splnet();
ifq = percpu_getref(ipq->ipq_ifqs);
if (IF_QFULL(ifq)) {
IF_DROP(ifq);
percpu_putref(ipq->ipq_ifqs);
m_freem(m);
goto out;
}
IF_ENQUEUE(ifq, m);
percpu_putref(ipq->ipq_ifqs);
softint_schedule(ipq->ipq_si);
out:
splx(s);
}
static void
if_percpuq_drops(void *p, void *arg, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
uint64_t *sum = arg;
*sum += ifq->ifq_drops;
}
static int
sysctl_percpuq_drops_handler(SYSCTLFN_ARGS)
{
struct sysctlnode node;
struct if_percpuq *ipq;
uint64_t sum = 0;
int error;
node = *rnode;
ipq = node.sysctl_data;
percpu_foreach(ipq->ipq_ifqs, if_percpuq_drops, &sum);
	node.sysctl_data = &sum;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error != 0 || newp == NULL)
return error;
return 0;
}
static void
sysctl_percpuq_setup(struct sysctllog **clog, const char* ifname,
struct if_percpuq *ipq)
{
const struct sysctlnode *cnode, *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, ifname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "rcvq",
SYSCTL_DESCR("Interface input queue controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
#ifdef NOTYET
/* XXX Should show each per-CPU queue length? */
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "len",
SYSCTL_DESCR("Current input queue length"),
sysctl_percpuq_len, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed input queue length"),
sysctl_percpuq_maxlen_handler, 0, (void *)ipq, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
#endif
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Total packets dropped due to full input queue"),
sysctl_percpuq_drops_handler, 0, (void *)ipq, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl nodes\n", ifname);
return;
}
/*
* The deferred if_start framework
*
 * These are the common APIs used to defer if_start to a softint when
 * if_start is requested from a device driver running in hardware
 * interrupt context.
*/
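/*
 * A minimal usage sketch, assuming a hypothetical driver that wants to kick
 * its transmit path from softint rather than from the hardware interrupt
 * handler (not taken from any real driver):
 *
 *	if_deferred_start_init(ifp, NULL);	(in attach; NULL selects the
 *						 default if_deferred_start_common)
 *	if_schedule_deferred_start(ifp);	(in the hardware interrupt
 *						 handler, instead of calling
 *						 if_start directly)
 */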
/*
* Call ifp->if_start (or equivalent) in a dedicated softint for
* deferred if_start.
*/
static void
if_deferred_start_softint(void *arg)
{
struct if_deferred_start *ids = arg;
struct ifnet *ifp = ids->ids_ifp;
ids->ids_if_start(ifp);
}
/*
* The default callback function for deferred if_start.
*/
static void
if_deferred_start_common(struct ifnet *ifp)
{
const int s = splnet();
if_start_lock(ifp);
splx(s);
}
static inline bool
if_snd_is_used(struct ifnet *ifp)
{
return ALTQ_IS_ENABLED(&ifp->if_snd) ||
ifp->if_transmit == if_transmit ||
ifp->if_transmit == NULL ||
ifp->if_transmit == if_nulltransmit;
}
/*
* Schedule deferred if_start.
*/
void
if_schedule_deferred_start(struct ifnet *ifp)
{
KASSERT(ifp->if_deferred_start != NULL);
if (if_snd_is_used(ifp) && IFQ_IS_EMPTY(&ifp->if_snd))
return;
softint_schedule(ifp->if_deferred_start->ids_si);
}
/*
 * Create an instance of deferred if_start.  A driver should call this
 * function only if it needs deferred if_start.  Drivers can set up their
 * own deferred if_start function via the second argument.
*/
void
if_deferred_start_init(struct ifnet *ifp, void (*func)(struct ifnet *))
{
struct if_deferred_start *ids;
u_int flags = SOFTINT_NET;
flags |= if_is_mpsafe(ifp) ? SOFTINT_MPSAFE : 0;
ids = kmem_zalloc(sizeof(*ids), KM_SLEEP);
ids->ids_ifp = ifp;
ids->ids_si = softint_establish(flags, if_deferred_start_softint, ids);
if (func != NULL)
ids->ids_if_start = func;
else
ids->ids_if_start = if_deferred_start_common;
ifp->if_deferred_start = ids;
}
static void
if_deferred_start_destroy(struct ifnet *ifp)
{
if (ifp->if_deferred_start == NULL)
return;
softint_disestablish(ifp->if_deferred_start->ids_si);
kmem_free(ifp->if_deferred_start, sizeof(*ifp->if_deferred_start));
ifp->if_deferred_start = NULL;
}
/*
* The common interface input routine that is called by device drivers,
* which should be used only when the driver's rx handler already runs
* in softint.
*/
void
if_input(struct ifnet *ifp, struct mbuf *m)
{
KASSERT(ifp->if_percpuq == NULL);
KASSERT(!cpu_intr_p());
if_statinc(ifp, if_ipackets);
bpf_mtap(ifp, m, BPF_D_IN);
ifp->_if_input(ifp, m);
}
/*
* DEPRECATED. Use if_initialize and if_register instead.
 * See the comment above if_initialize.
 *
 * Note that it implicitly enables if_percpuq so that drivers can migrate
 * to softint-based if_input with few changes.  If you don't want to
 * enable it, use if_initialize and if_register instead.
*/
void
if_attach(ifnet_t *ifp)
{
if_initialize(ifp);
ifp->if_percpuq = if_percpuq_create(ifp);
if_register(ifp);
}
void
if_attachdomain(void)
{
struct ifnet *ifp;
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
struct psref psref;
psref_acquire(&psref, &ifp->if_psref, ifnet_psref_class);
pserialize_read_exit(s);
if_attachdomain1(ifp);
s = pserialize_read_enter();
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
}
static void
if_attachdomain1(struct ifnet *ifp)
{
struct domain *dp;
const int s = splsoftnet();
/* address family dependent data region */
memset(ifp->if_afdata, 0, sizeof(ifp->if_afdata));
DOMAIN_FOREACH(dp) {
if (dp->dom_ifattach != NULL)
ifp->if_afdata[dp->dom_family] =
(*dp->dom_ifattach)(ifp);
}
splx(s);
}
/*
* Deactivate an interface. This points all of the procedure
* handles at error stubs. May be called from interrupt context.
*/
void
if_deactivate(struct ifnet *ifp)
{
const int s = splsoftnet();
ifp->if_output = if_nulloutput;
ifp->_if_input = if_nullinput;
ifp->if_start = if_nullstart;
ifp->if_transmit = if_nulltransmit;
ifp->if_ioctl = if_nullioctl;
ifp->if_init = if_nullinit;
ifp->if_stop = if_nullstop;
if (ifp->if_slowtimo)
ifp->if_slowtimo = if_nullslowtimo;
ifp->if_drain = if_nulldrain;
/* No more packets may be enqueued. */
ifp->if_snd.ifq_maxlen = 0;
splx(s);
}
bool
if_is_deactivated(const struct ifnet *ifp)
{
return ifp->if_output == if_nulloutput;
}
void
if_purgeaddrs(struct ifnet *ifp, int family,
void (*purgeaddr)(struct ifaddr *))
{
struct ifaddr *ifa, *nifa;
int s;
s = pserialize_read_enter();
for (ifa = IFADDR_READER_FIRST(ifp); ifa; ifa = nifa) {
nifa = IFADDR_READER_NEXT(ifa);
if (ifa->ifa_addr->sa_family != family)
continue;
pserialize_read_exit(s);
(*purgeaddr)(ifa);
s = pserialize_read_enter();
}
pserialize_read_exit(s);
}
#ifdef IFAREF_DEBUG
static struct ifaddr **ifa_list;
static int ifa_list_size;
/* Depends on the assumption that only one if_attach runs at a time */
static void
if_build_ifa_list(struct ifnet *ifp)
{
struct ifaddr *ifa;
int i;
KASSERT(ifa_list == NULL);
KASSERT(ifa_list_size == 0);
IFADDR_READER_FOREACH(ifa, ifp)
ifa_list_size++;
ifa_list = kmem_alloc(sizeof(*ifa) * ifa_list_size, KM_SLEEP);
i = 0;
IFADDR_READER_FOREACH(ifa, ifp) {
ifa_list[i++] = ifa;
ifaref(ifa);
}
}
static void
if_check_and_free_ifa_list(struct ifnet *ifp)
{
int i;
struct ifaddr *ifa;
if (ifa_list == NULL)
return;
for (i = 0; i < ifa_list_size; i++) {
char buf[64];
ifa = ifa_list[i];
sockaddr_format(ifa->ifa_addr, buf, sizeof(buf));
if (ifa->ifa_refcnt > 1) {
log(LOG_WARNING,
"ifa(%s) still referenced (refcnt=%d)\n",
buf, ifa->ifa_refcnt - 1);
} else
log(LOG_DEBUG,
"ifa(%s) not referenced (refcnt=%d)\n",
buf, ifa->ifa_refcnt - 1);
ifafree(ifa);
}
kmem_free(ifa_list, sizeof(*ifa) * ifa_list_size);
ifa_list = NULL;
ifa_list_size = 0;
}
#endif
/*
* Detach an interface from the list of "active" interfaces,
* freeing any resources as we go along.
*
* NOTE: This routine must be called with a valid thread context,
* as it may block.
*/
void
if_detach(struct ifnet *ifp)
{
struct socket so;
struct ifaddr *ifa;
#ifdef IFAREF_DEBUG
struct ifaddr *last_ifa = NULL;
#endif
struct domain *dp;
const struct protosw *pr;
int i, family, purged;
#ifdef IFAREF_DEBUG
if_build_ifa_list(ifp);
#endif
/*
* XXX It's kind of lame that we have to have the
* XXX socket structure...
*/
memset(&so, 0, sizeof(so));
const int s = splnet();
sysctl_teardown(&ifp->if_sysctl_log);
IFNET_LOCK(ifp);
/*
* Unset all queued link states and pretend a
* link state change is scheduled.
* This stops any more link state changes occurring for this
* interface while it's being detached so it's safe
* to drain the workqueue.
*/
IF_LINK_STATE_CHANGE_LOCK(ifp);
ifp->if_link_queue = -1; /* all bits set, see link_state_change() */
ifp->if_link_scheduled = true;
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
workqueue_wait(ifnet_link_state_wq, &ifp->if_link_work);
if_deactivate(ifp);
IFNET_UNLOCK(ifp);
/*
* Unlink from the list and wait for all readers to leave
* from pserialize read sections. Note that we can't do
* psref_target_destroy here. See below.
*/
IFNET_GLOBAL_LOCK();
ifindex2ifnet[ifp->if_index] = NULL;
TAILQ_REMOVE(&ifnet_list, ifp, if_list);
IFNET_WRITER_REMOVE(ifp);
pserialize_perform(ifnet_psz);
IFNET_GLOBAL_UNLOCK();
if (ifp->if_slowtimo != NULL) {
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
mutex_enter(&isd->isd_lock);
isd->isd_dying = true;
mutex_exit(&isd->isd_lock);
callout_halt(&isd->isd_ch, NULL);
workqueue_wait(if_slowtimo_wq, &isd->isd_work);
callout_destroy(&isd->isd_ch);
mutex_destroy(&isd->isd_lock);
kmem_free(isd, sizeof(*isd));
		ifp->if_slowtimo_data = NULL; /* paranoia */
ifp->if_slowtimo = NULL; /* paranoia */
}
if_deferred_start_destroy(ifp);
/*
* Do an if_down() to give protocols a chance to do something.
*/
if_down_deactivated(ifp);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
altq_disable(&ifp->if_snd);
if (ALTQ_IS_ATTACHED(&ifp->if_snd))
altq_detach(&ifp->if_snd);
#endif
#if NCARP > 0
/* Remove the interface from any carp group it is a part of. */
if (ifp->if_carp != NULL && ifp->if_type != IFT_CARP)
carp_ifdetach(ifp);
#endif
/*
* Ensure that all packets on protocol input pktqueues have been
* processed, or, at least, removed from the queues.
*
* A cross-call will ensure that the interrupts have completed.
* FIXME: not quite..
*/
pktq_ifdetach();
xc_barrier(0);
/*
* Rip all the addresses off the interface. This should make
* all of the routes go away.
*
* pr_usrreq calls can remove an arbitrary number of ifaddrs
* from the list, including our "cursor", ifa. For safety,
* and to honor the TAILQ abstraction, I just restart the
* loop after each removal. Note that the loop will exit
* when all of the remaining ifaddrs belong to the AF_LINK
* family. I am counting on the historical fact that at
* least one pr_usrreq in each address domain removes at
* least one ifaddr.
*/
again:
/*
	 * At this point, nothing else tries to remove ifaddrs from the
	 * list, so we don't need to take a lock or psref.  Avoid using
	 * IFADDR_READER_FOREACH so that pserialize's contract-violation
	 * checks are not tripped.
*/
IFADDR_WRITER_FOREACH(ifa, ifp) {
family = ifa->ifa_addr->sa_family;
#ifdef IFAREF_DEBUG
printf("if_detach: ifaddr %p, family %d, refcnt %d\n",
ifa, family, ifa->ifa_refcnt);
if (last_ifa != NULL && ifa == last_ifa)
panic("if_detach: loop detected");
last_ifa = ifa;
#endif
if (family == AF_LINK)
continue;
dp = pffinddomain(family);
KASSERTMSG(dp != NULL, "no domain for AF %d", family);
/*
* XXX These PURGEIF calls are redundant with the
* purge-all-families calls below, but are left in for
* now both to make a smaller change, and to avoid
* unplanned interactions with clearing of
* ifp->if_addrlist.
*/
purged = 0;
for (pr = dp->dom_protosw;
pr < dp->dom_protoswNPROTOSW; pr++) {
so.so_proto = pr;
if (pr->pr_usrreqs) {
(void) (*pr->pr_usrreqs->pr_purgeif)(&so, ifp);
purged = 1;
}
}
if (purged == 0) {
/*
* XXX What's really the best thing to do
* XXX here? --thorpej@NetBSD.org
*/
printf("if_detach: WARNING: AF %d not purged\n",
family);
ifa_remove(ifp, ifa);
}
goto again;
}
if_free_sadl(ifp, 1);
restart:
IFADDR_WRITER_FOREACH(ifa, ifp) {
family = ifa->ifa_addr->sa_family;
KASSERT(family == AF_LINK);
ifa_remove(ifp, ifa);
goto restart;
}
/* Delete stray routes from the routing table. */
for (i = 0; i <= AF_MAX; i++)
rt_delete_matched_entries(i, if_delroute_matcher, ifp, false);
DOMAIN_FOREACH(dp) {
if (dp->dom_ifdetach != NULL && ifp->if_afdata[dp->dom_family])
{
void *p = ifp->if_afdata[dp->dom_family];
if (p) {
ifp->if_afdata[dp->dom_family] = NULL;
(*dp->dom_ifdetach)(ifp, p);
}
}
/*
* One would expect multicast memberships (INET and
* INET6) on UDP sockets to be purged by the PURGEIF
* calls above, but if all addresses were removed from
* the interface prior to destruction, the calls will
* not be made (e.g. ppp, for which pppd(8) generally
* removes addresses before destroying the interface).
* Because there is no invariant that multicast
* memberships only exist for interfaces with IPv4
* addresses, we must call PURGEIF regardless of
* addresses. (Protocols which might store ifnet
* pointers are marked with PR_PURGEIF.)
*/
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
{
so.so_proto = pr;
if (pr->pr_usrreqs && pr->pr_flags & PR_PURGEIF)
(void)(*pr->pr_usrreqs->pr_purgeif)(&so, ifp);
}
}
/*
* Must be done after the above pr_purgeif because if_psref may be
* still used in pr_purgeif.
*/
psref_target_destroy(&ifp->if_psref, ifnet_psref_class);
PSLIST_ENTRY_DESTROY(ifp, if_pslist_entry);
pfil_run_ifhooks(if_pfil, PFIL_IFNET_DETACH, ifp);
(void)pfil_head_destroy(ifp->if_pfil);
/* Announce that the interface is gone. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
IF_AFDATA_LOCK_DESTROY(ifp);
if (ifp->if_percpuq != NULL) {
if_percpuq_destroy(ifp->if_percpuq);
ifp->if_percpuq = NULL;
}
mutex_obj_free(ifp->if_ioctl_lock);
ifp->if_ioctl_lock = NULL;
mutex_obj_free(ifp->if_snd.ifq_lock);
if_stats_fini(ifp);
KASSERT(!simplehook_has_hooks(ifp->if_linkstate_hooks));
simplehook_destroy(ifp->if_linkstate_hooks);
splx(s);
#ifdef IFAREF_DEBUG
if_check_and_free_ifa_list(ifp);
#endif
}
/*
* Callback for a radix tree walk to delete all references to an
* ifnet.
*/
static int
if_delroute_matcher(struct rtentry *rt, void *v)
{
struct ifnet *ifp = (struct ifnet *)v;
if (rt->rt_ifp == ifp)
return 1;
else
return 0;
}
/*
* Create a clone network interface.
*/
static int
if_clone_create(const char *name)
{
struct if_clone *ifc;
struct ifnet *ifp;
struct psref psref;
int unit;
KASSERT(mutex_owned(&if_clone_mtx));
ifc = if_clone_lookup(name, &unit);
if (ifc == NULL)
return EINVAL;
ifp = if_get(name, &psref);
if (ifp != NULL) {
if_put(ifp, &psref);
return EEXIST;
}
return (*ifc->ifc_create)(ifc, unit);
}
/*
* Destroy a clone network interface.
*/
static int
if_clone_destroy(const char *name)
{
struct if_clone *ifc;
struct ifnet *ifp;
struct psref psref;
int error;
int (*if_ioctlfn)(struct ifnet *, u_long, void *);
KASSERT(mutex_owned(&if_clone_mtx));
ifc = if_clone_lookup(name, NULL);
if (ifc == NULL)
return EINVAL;
if (ifc->ifc_destroy == NULL)
return EOPNOTSUPP;
ifp = if_get(name, &psref);
if (ifp == NULL)
return ENXIO;
/* We have to disable ioctls here */
IFNET_LOCK(ifp);
if_ioctlfn = ifp->if_ioctl;
ifp->if_ioctl = if_nullioctl;
IFNET_UNLOCK(ifp);
/*
	 * We cannot call ifc_destroy while holding a reference to ifp.
* Releasing ifp here is safe thanks to if_clone_mtx.
*/
if_put(ifp, &psref);
error = (*ifc->ifc_destroy)(ifp);
if (error != 0) {
/* We have to restore if_ioctl on error */
IFNET_LOCK(ifp);
ifp->if_ioctl = if_ioctlfn;
IFNET_UNLOCK(ifp);
}
return error;
}
static bool
if_is_unit(const char *name)
{
while (*name != '\0') {
if (*name < '0' || *name > '9')
return false;
name++;
}
return true;
}
/*
* Look up a network interface cloner.
*/
static struct if_clone *
if_clone_lookup(const char *name, int *unitp)
{
struct if_clone *ifc;
const char *cp;
char *dp, ifname[IFNAMSIZ + 3];
int unit;
KASSERT(mutex_owned(&if_clone_mtx));
strcpy(ifname, "if_");
/* separate interface name from unit */
	/* TODO: search for the unit number from the end of the name */
	for (dp = ifname + 3, cp = name; cp - name < IFNAMSIZ &&
	    *cp && !if_is_unit(cp);)
		*dp++ = *cp++;
if (cp == name || cp - name == IFNAMSIZ || !*cp)
return NULL; /* No name or unit number */
*dp++ = '\0';
again:
LIST_FOREACH(ifc, &if_cloners, ifc_list) {
if (strcmp(ifname + 3, ifc->ifc_name) == 0)
break;
}
if (ifc == NULL) {
int error;
if (*ifname == '\0')
return NULL;
mutex_exit(&if_clone_mtx);
error = module_autoload(ifname, MODULE_CLASS_DRIVER);
mutex_enter(&if_clone_mtx);
if (error)
return NULL;
*ifname = '\0';
goto again;
}
unit = 0;
	while (cp - name < IFNAMSIZ && *cp) {
		if (*cp < '0' || *cp > '9' || unit >= INT_MAX / 10) {
			/* Bogus unit number. */
			return NULL;
		}
		unit = (unit * 10) + (*cp++ - '0');
	}
	if (unitp != NULL)
		*unitp = unit;
return ifc;
}
/*
* Register a network interface cloner.
*/
void
if_clone_attach(struct if_clone *ifc)
{
mutex_enter(&if_clone_mtx);
LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list);
if_cloners_count++;
mutex_exit(&if_clone_mtx);
}
/*
* Unregister a network interface cloner.
*/
void
if_clone_detach(struct if_clone *ifc)
{
mutex_enter(&if_clone_mtx);
LIST_REMOVE(ifc, ifc_list);
if_cloners_count--;
mutex_exit(&if_clone_mtx);
}
/*
* Provide list of interface cloners to userspace.
*/
int
if_clone_list(int buf_count, char *buffer, int *total)
{
char outbuf[IFNAMSIZ], *dst;
struct if_clone *ifc;
int count, error = 0;
mutex_enter(&if_clone_mtx);
*total = if_cloners_count;
if ((dst = buffer) == NULL) {
/* Just asking how many there are. */
goto out;
}
if (buf_count < 0) {
error = EINVAL;
goto out;
}
count = (if_cloners_count < buf_count) ? if_cloners_count : buf_count;
for (ifc = LIST_FIRST(&if_cloners); ifc != NULL && count != 0;
ifc = LIST_NEXT(ifc, ifc_list), count--, dst += IFNAMSIZ) {
(void)strncpy(outbuf, ifc->ifc_name, sizeof(outbuf));
if (outbuf[sizeof(outbuf) - 1] != '\0') {
error = ENAMETOOLONG;
goto out;
}
error = copyout(outbuf, dst, sizeof(outbuf));
if (error != 0)
break;
}
out:
mutex_exit(&if_clone_mtx);
return error;
}
void
ifa_psref_init(struct ifaddr *ifa)
{
psref_target_init(&ifa->ifa_psref, ifa_psref_class);
}
void
ifaref(struct ifaddr *ifa)
{
atomic_inc_uint(&ifa->ifa_refcnt);
}
void
ifafree(struct ifaddr *ifa)
{
KASSERT(ifa != NULL);
KASSERTMSG(ifa->ifa_refcnt > 0, "ifa_refcnt=%d", ifa->ifa_refcnt);
membar_release();
if (atomic_dec_uint_nv(&ifa->ifa_refcnt) != 0)
return;
membar_acquire();
free(ifa, M_IFADDR);
}
bool
ifa_is_destroying(struct ifaddr *ifa)
{
return ISSET(ifa->ifa_flags, IFA_DESTROYING);
}
void
ifa_insert(struct ifnet *ifp, struct ifaddr *ifa)
{
ifa->ifa_ifp = ifp;
/*
* Check MP-safety for IFEF_MPSAFE drivers.
	 * Allow !IFF_RUNNING for initialization routines that normally don't
	 * take IFNET_LOCK; that is safe because there is no competitor yet.
* XXX there are false positive cases because IFF_RUNNING can be off on
* if_stop.
*/
KASSERT(!if_is_mpsafe(ifp) || !ISSET(ifp->if_flags, IFF_RUNNING) ||
IFNET_LOCKED(ifp));
TAILQ_INSERT_TAIL(&ifp->if_addrlist, ifa, ifa_list);
IFADDR_ENTRY_INIT(ifa);
IFADDR_WRITER_INSERT_TAIL(ifp, ifa);
ifaref(ifa);
}
void
ifa_remove(struct ifnet *ifp, struct ifaddr *ifa)
{
KASSERT(ifa->ifa_ifp == ifp);
/*
* Check MP-safety for IFEF_MPSAFE drivers.
* if_is_deactivated indicates ifa_remove is called from if_detach
* where it is safe even if IFNET_LOCK isn't held.
*/
KASSERT(!if_is_mpsafe(ifp) || if_is_deactivated(ifp) ||
IFNET_LOCKED(ifp));
TAILQ_REMOVE(&ifp->if_addrlist, ifa, ifa_list);
IFADDR_WRITER_REMOVE(ifa);
#ifdef NET_MPSAFE
IFNET_GLOBAL_LOCK();
pserialize_perform(ifnet_psz);
IFNET_GLOBAL_UNLOCK();
#endif
#ifdef NET_MPSAFE
psref_target_destroy(&ifa->ifa_psref, ifa_psref_class);
#endif
IFADDR_ENTRY_DESTROY(ifa);
ifafree(ifa);
}
void
ifa_acquire(struct ifaddr *ifa, struct psref *psref)
{
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifa->ifa_psref, ifa_psref_class);
}
void
ifa_release(struct ifaddr *ifa, struct psref *psref)
{
if (ifa == NULL)
return;
psref_release(psref, &ifa->ifa_psref, ifa_psref_class);
}
bool
ifa_held(struct ifaddr *ifa)
{
return psref_held(&ifa->ifa_psref, ifa_psref_class);
}
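/*
 * A minimal sketch of the lookup pattern these accessors support, mirroring
 * what the *_psref wrappers below do (caller in thread context):
 *
 *	struct ifaddr *ifa;
 *	struct psref psref;
 *	int s;
 *
 *	s = pserialize_read_enter();
 *	ifa = ifa_ifwithaddr(addr);
 *	if (ifa != NULL)
 *		ifa_acquire(ifa, &psref);
 *	pserialize_read_exit(s);
 *	if (ifa != NULL) {
 *		... use ifa ...
 *		ifa_release(ifa, &psref);
 *	}
 */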
static inline int
equal(const struct sockaddr *sa1, const struct sockaddr *sa2)
{
return sockaddr_cmp(sa1, sa2) == 0;
}
/*
* Locate an interface based on a complete address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithaddr(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
	IFNET_READER_FOREACH(ifp) {
		if (if_is_deactivated(ifp))
			continue;
		IFADDR_READER_FOREACH(ifa, ifp) {
			if (ifa->ifa_addr->sa_family != addr->sa_family)
				continue;
if (equal(addr, ifa->ifa_addr))
return ifa;
if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr &&
/* IP6 doesn't have broadcast */
ifa->ifa_broadaddr->sa_len != 0 &&
equal(ifa->ifa_broadaddr, addr))
return ifa;
}
}
return NULL;
}
struct ifaddr *
ifa_ifwithaddr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s = pserialize_read_enter();
ifa = ifa_ifwithaddr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Locate the point to point interface with a given destination address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithdstaddr(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != addr->sa_family ||
ifa->ifa_dstaddr == NULL)
continue;
if (equal(addr, ifa->ifa_dstaddr))
return ifa;
}
}
return NULL;
}
struct ifaddr *
ifa_ifwithdstaddr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithdstaddr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
 * Find an interface on a specific network.  If several match, the most
 * specific one found is chosen.
*/
struct ifaddr *
ifa_ifwithnet(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa, *ifa_maybe = NULL;
const struct sockaddr_dl *sdl;
u_int af = addr->sa_family;
const char *addr_data = addr->sa_data, *cplim;
if (af == AF_LINK) {
sdl = satocsdl(addr);
if (sdl->sdl_index && sdl->sdl_index < if_indexlim &&
ifindex2ifnet[sdl->sdl_index] &&
!if_is_deactivated(ifindex2ifnet[sdl->sdl_index])) {
return ifindex2ifnet[sdl->sdl_index]->if_dl;
}
}
#ifdef NETATALK
if (af == AF_APPLETALK) {
const struct sockaddr_at *sat, *sat2;
sat = (const struct sockaddr_at *)addr;
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
ifa = at_ifawithnet((const struct sockaddr_at *)addr,
ifp);
if (ifa == NULL)
continue;
sat2 = (struct sockaddr_at *)ifa->ifa_addr;
if (sat2->sat_addr.s_net == sat->sat_addr.s_net)
return ifa; /* exact match */
if (ifa_maybe == NULL) {
/* else keep the if with the right range */
ifa_maybe = ifa;
}
}
return ifa_maybe;
}
#endif
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
const char *cp, *cp2, *cp3;
if (ifa->ifa_addr->sa_family != af ||
ifa->ifa_netmask == NULL)
next: continue;
cp = addr_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = (const char *)ifa->ifa_netmask +
ifa->ifa_netmask->sa_len;
while (cp3 < cplim) {
if ((*cp++ ^ *cp2++) & *cp3++) {
/* want to continue for() loop */
goto next;
}
}
if (ifa_maybe == NULL ||
rt_refines(ifa->ifa_netmask,
ifa_maybe->ifa_netmask))
ifa_maybe = ifa;
}
}
return ifa_maybe;
}
struct ifaddr *
ifa_ifwithnet_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithnet(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Find the interface of the address.
*/
struct ifaddr *
ifa_ifwithladdr(const struct sockaddr *addr)
{
struct ifaddr *ia;
if ((ia = ifa_ifwithaddr(addr)) || (ia = ifa_ifwithdstaddr(addr)) ||
(ia = ifa_ifwithnet(addr)))
return ia;
return NULL;
}
struct ifaddr *
ifa_ifwithladdr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithladdr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Find an interface using a specific address family
*/
struct ifaddr *
ifa_ifwithaf(int af)
{
struct ifnet *ifp;
struct ifaddr *ifa = NULL;
int s;
s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family == af)
goto out;
}
}
out:
pserialize_read_exit(s);
return ifa;
}
/*
* Find an interface address specific to an interface best matching
* a given address.
*/
struct ifaddr *
ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
{
struct ifaddr *ifa;
const char *cp, *cp2, *cp3;
const char *cplim;
struct ifaddr *ifa_maybe = 0;
u_int af = addr->sa_family;
if (if_is_deactivated(ifp))
return NULL;
if (af >= AF_MAX)
return NULL;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != af)
continue;
ifa_maybe = ifa;
if (ifa->ifa_netmask == NULL) {
if (equal(addr, ifa->ifa_addr) ||
(ifa->ifa_dstaddr &&
equal(addr, ifa->ifa_dstaddr)))
return ifa;
continue;
}
cp = addr->sa_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
for (; cp3 < cplim; cp3++) {
if ((*cp++ ^ *cp2++) & *cp3)
break;
}
if (cp3 == cplim)
return ifa;
}
return ifa_maybe;
}
struct ifaddr *
ifaof_ifpforaddr_psref(const struct sockaddr *addr, struct ifnet *ifp,
struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifaof_ifpforaddr(addr, ifp);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Default action when installing a route with a Link Level gateway.
* Lookup an appropriate real ifa to point to.
* This should be moved to /sys/net/link.c eventually.
*/
void
link_rtrequest(int cmd, struct rtentry *rt, const struct rt_addrinfo *info)
{
struct ifaddr *ifa;
const struct sockaddr *dst;
struct ifnet *ifp;
struct psref psref;
if (cmd != RTM_ADD || ISSET(info->rti_flags, RTF_DONTCHANGEIFA))
return;
ifp = rt->rt_ifa->ifa_ifp;
dst = rt_getkey(rt);
if ((ifa = ifaof_ifpforaddr_psref(dst, ifp, &psref)) != NULL) {
rt_replace_ifa(rt, ifa);
if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
ifa->ifa_rtrequest(cmd, rt, info);
ifa_release(ifa, &psref);
}
}
/*
* bitmask macros to manage a densely packed link_state change queue.
* Because we need to store LINK_STATE_UNKNOWN(0), LINK_STATE_DOWN(1) and
* LINK_STATE_UP(2) we need 2 bits for each state change.
 * Because a stored state change may itself be 0 (LINK_STATE_UNKNOWN), an
 * item with all bits set is treated as unset.
*/
#define LQ_ITEM_BITS 2
#define LQ_ITEM_MASK ((1 << LQ_ITEM_BITS) - 1)
#define LQ_MASK(i) (LQ_ITEM_MASK << (i) * LQ_ITEM_BITS)
#define LINK_STATE_UNSET LQ_ITEM_MASK
#define LQ_ITEM(q, i) (((q) & LQ_MASK((i))) >> (i) * LQ_ITEM_BITS)
#define LQ_STORE(q, i, v) \
do { \
(q) &= ~LQ_MASK((i)); \
(q) |= (v) << (i) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_MAX(q) ((sizeof((q)) * NBBY) / LQ_ITEM_BITS)
#define LQ_POP(q, v) \
do { \
(v) = LQ_ITEM((q), 0); \
(q) >>= LQ_ITEM_BITS; \
(q) |= LINK_STATE_UNSET << (LQ_MAX((q)) - 1) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_PUSH(q, v) \
do { \
(q) >>= LQ_ITEM_BITS; \
(q) |= (v) << (LQ_MAX((q)) - 1) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_FIND_UNSET(q, i) \
for ((i) = 0; i < LQ_MAX((q)); (i)++) { \
if (LQ_ITEM((q), (i)) == LINK_STATE_UNSET) \
break; \
}
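/*
 * Worked example (a sketch, assuming if_link_queue is a 16-bit quantity so
 * that LQ_MAX(q) == 8):
 *
 *	q = 0xffff			all 8 items unset
 *	LQ_STORE(q, 0, LINK_STATE_UP)	q = 0xfffe (item 0 = UP)
 *	LQ_STORE(q, 1, LINK_STATE_DOWN)	q = 0xfff6 (item 1 = DOWN)
 *	LQ_POP(q, v)			v = LINK_STATE_UP, q = 0xfffd
 *					(item 0 = DOWN, the rest unset)
 */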
/*
* Handle a change in the interface link state and
* queue notifications.
*/
void
if_link_state_change(struct ifnet *ifp, int link_state)
{
int idx;
/* Ensure change is to a valid state */
switch (link_state) {
case LINK_STATE_UNKNOWN: /* FALLTHROUGH */
case LINK_STATE_DOWN: /* FALLTHROUGH */
case LINK_STATE_UP:
break;
default:
#ifdef DEBUG
printf("%s: invalid link state %d\n",
ifp->if_xname, link_state);
#endif
return;
}
IF_LINK_STATE_CHANGE_LOCK(ifp);
/* Find the last unset event in the queue. */
LQ_FIND_UNSET(ifp->if_link_queue, idx);
if (idx == 0) {
/*
* There is no queue of link state changes.
* As we have the lock we can safely compare against the
* current link state and return if the same.
* Otherwise, if scheduled is true then the interface is being
* detached and the queue is being drained so we need
* to avoid queuing more work.
*/
if (ifp->if_link_state == link_state ||
ifp->if_link_scheduled)
goto out;
} else {
/* Ensure link_state doesn't match the last queued state. */
if (LQ_ITEM(ifp->if_link_queue, idx - 1)
== (uint8_t)link_state)
goto out;
}
/* Handle queue overflow. */
if (idx == LQ_MAX(ifp->if_link_queue)) {
uint8_t lost;
/*
* The DOWN state must be protected from being pushed off
* the queue to ensure that userland will always be
* in a sane state.
* Because DOWN is protected, there is no need to protect
* UNKNOWN.
* It should be invalid to change from any other state to
* UNKNOWN anyway ...
*/
lost = LQ_ITEM(ifp->if_link_queue, 0);
LQ_PUSH(ifp->if_link_queue, (uint8_t)link_state);
if (lost == LINK_STATE_DOWN) {
lost = LQ_ITEM(ifp->if_link_queue, 0);
LQ_STORE(ifp->if_link_queue, 0, LINK_STATE_DOWN);
}
printf("%s: lost link state change %s\n",
ifp->if_xname,
lost == LINK_STATE_UP ? "UP" :
lost == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN");
} else
LQ_STORE(ifp->if_link_queue, idx, (uint8_t)link_state);
if (ifp->if_link_scheduled)
goto out;
ifp->if_link_scheduled = true;
workqueue_enqueue(ifnet_link_state_wq, &ifp->if_link_work, NULL);
out:
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
}
/*
* Handle interface link state change notifications.
*/
static void
if_link_state_change_process(struct ifnet *ifp, int link_state)
{
struct domain *dp;
const int s = splnet();
bool notify;
KASSERT(!cpu_intr_p());
IF_LINK_STATE_CHANGE_LOCK(ifp);
/* Ensure the change is still valid. */
if (ifp->if_link_state == link_state) {
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
splx(s);
return;
}
#ifdef DEBUG
log(LOG_DEBUG, "%s: link state %s (was %s)\n", ifp->if_xname,
link_state == LINK_STATE_UP ? "UP" :
link_state == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN",
ifp->if_link_state == LINK_STATE_UP ? "UP" :
ifp->if_link_state == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN");
#endif
/*
* When going from UNKNOWN to UP, we need to mark existing
* addresses as tentative and restart DAD as we may have
* erroneously not found a duplicate.
*
* This needs to happen before rt_ifmsg to avoid a race where
* listeners would have an address and expect it to work right
* away.
*/
notify = (link_state == LINK_STATE_UP &&
ifp->if_link_state == LINK_STATE_UNKNOWN);
ifp->if_link_state = link_state;
/* The following routines may sleep so release the spin mutex */
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
if (notify) {
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp,
LINK_STATE_DOWN);
}
}
/* Notify that the link state has changed. */
rt_ifmsg(ifp);
simplehook_dohooks(ifp->if_linkstate_hooks);
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp, link_state);
}
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
splx(s);
}
/*
* Process the interface link state change queue.
*/
static void
if_link_state_change_work(struct work *work, void *arg)
{
struct ifnet *ifp = container_of(work, struct ifnet, if_link_work);
uint8_t state;
KERNEL_LOCK_UNLESS_NET_MPSAFE();
const int s = splnet();
/*
* Pop a link state change from the queue and process it.
* If there is nothing to process then if_detach() has been called.
* We keep if_link_scheduled = true so the queue can safely drain
* without more work being queued.
*/
IF_LINK_STATE_CHANGE_LOCK(ifp);
LQ_POP(ifp->if_link_queue, state);
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
if (state == LINK_STATE_UNSET)
goto out;
if_link_state_change_process(ifp, state);
/* If there is a link state change to come, schedule it. */
IF_LINK_STATE_CHANGE_LOCK(ifp);
if (LQ_ITEM(ifp->if_link_queue, 0) != LINK_STATE_UNSET) {
ifp->if_link_scheduled = true;
workqueue_enqueue(ifnet_link_state_wq, &ifp->if_link_work,
NULL);
} else
ifp->if_link_scheduled = false;
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
out:
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
void *
if_linkstate_change_establish(struct ifnet *ifp, void (*fn)(void *), void *arg)
{
khook_t *hk;
hk = simplehook_establish(ifp->if_linkstate_hooks, fn, arg);
return (void *)hk;
}
void
if_linkstate_change_disestablish(struct ifnet *ifp, void *vhook,
kmutex_t *lock)
{
simplehook_disestablish(ifp->if_linkstate_hooks, vhook, lock);
}
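/*
 * A minimal usage sketch, assuming a hypothetical callback mydrv_linkstate
 * and its argument sc (not from any real driver); NULL may be passed as the
 * interlock when none is needed:
 *
 *	void *hook;
 *
 *	hook = if_linkstate_change_establish(ifp, mydrv_linkstate, sc);
 *	...
 *	if_linkstate_change_disestablish(ifp, hook, NULL);
 */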
/*
 * Used to mark addresses on an interface as DETACHED or TENTATIVE
* and thus start Duplicate Address Detection without changing the
* real link state.
*/
void
if_domain_link_state_change(struct ifnet *ifp, int link_state)
{
struct domain *dp;
const int s = splnet();
KERNEL_LOCK_UNLESS_NET_MPSAFE();
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp, link_state);
}
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
/*
* Default action when installing a local route on a point-to-point
* interface.
*/
void
p2p_rtrequest(int req, struct rtentry *rt,
__unused const struct rt_addrinfo *info)
{
struct ifnet *ifp = rt->rt_ifp;
struct ifaddr *ifa, *lo0ifa;
int s = pserialize_read_enter();
switch (req) {
case RTM_ADD:
if ((rt->rt_flags & RTF_LOCAL) == 0)
break;
rt->rt_ifp = lo0ifp;
if (ISSET(info->rti_flags, RTF_DONTCHANGEIFA))
break;
IFADDR_READER_FOREACH(ifa, ifp) {
if (equal(rt_getkey(rt), ifa->ifa_addr))
break;
}
if (ifa == NULL)
break;
/*
* Ensure lo0 has an address of the same family.
*/
IFADDR_READER_FOREACH(lo0ifa, lo0ifp) {
if (lo0ifa->ifa_addr->sa_family ==
ifa->ifa_addr->sa_family)
break;
}
if (lo0ifa == NULL)
break;
/*
* Make sure to set rt->rt_ifa to the interface
* address we are using, otherwise we will have trouble
* with source address selection.
*/
if (ifa != rt->rt_ifa)
rt_replace_ifa(rt, ifa);
break;
case RTM_DELETE:
default:
break;
}
pserialize_read_exit(s);
}
static void
_if_down(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct domain *dp;
struct psref psref;
ifp->if_flags &= ~IFF_UP;
nanotime(&ifp->if_lastchange);
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
IFQ_PURGE(&ifp->if_snd);
#if NCARP > 0
if (ifp->if_carp)
carp_carpdev_state(ifp);
#endif
rt_ifmsg(ifp);
DOMAIN_FOREACH(dp) {
if (dp->dom_if_down)
dp->dom_if_down(ifp);
}
}
static void
if_down_deactivated(struct ifnet *ifp)
{
KASSERT(if_is_deactivated(ifp));
_if_down(ifp);
}
void
if_down_locked(struct ifnet *ifp)
{
KASSERT(IFNET_LOCKED(ifp));
_if_down(ifp);
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splsoftnet or equivalent.
*/
void
if_down(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
if_down_locked(ifp);
IFNET_UNLOCK(ifp);
}
/*
 * Must be called while holding if_ioctl_lock.
*/
static void
if_up_locked(struct ifnet *ifp)
{
#ifdef notyet
struct ifaddr *ifa;
#endif
struct domain *dp;
KASSERT(IFNET_LOCKED(ifp));
KASSERT(!if_is_deactivated(ifp));
ifp->if_flags |= IFF_UP;
nanotime(&ifp->if_lastchange);
#ifdef notyet
/* this has no effect on IP, and will kill all ISO connections XXX */
IFADDR_READER_FOREACH(ifa, ifp)
pfctlinput(PRC_IFUP, ifa->ifa_addr);
#endif
#if NCARP > 0
if (ifp->if_carp)
carp_carpdev_state(ifp);
#endif
rt_ifmsg(ifp);
DOMAIN_FOREACH(dp) {
if (dp->dom_if_up)
dp->dom_if_up(ifp);
}
}
/*
* Handle interface slowtimo timer routine. Called
* from softclock, we decrement timer (if set) and
* call the appropriate interface routine on expiration.
*/
static bool
if_slowtimo_countdown(struct ifnet *ifp)
{
bool fire = false;
const int s = splnet();
KERNEL_LOCK(1, NULL);
if (ifp->if_timer != 0 && --ifp->if_timer == 0)
fire = true;
KERNEL_UNLOCK_ONE(NULL);
splx(s);
return fire;
}
static void
if_slowtimo_intr(void *arg)
{
struct ifnet *ifp = arg;
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
mutex_enter(&isd->isd_lock);
if (!isd->isd_dying) {
if (isd->isd_trigger || if_slowtimo_countdown(ifp)) {
if (!isd->isd_queued) {
isd->isd_queued = true;
workqueue_enqueue(if_slowtimo_wq,
&isd->isd_work, NULL);
}
} else
callout_schedule(&isd->isd_ch, hz / IFNET_SLOWHZ);
}
mutex_exit(&isd->isd_lock);
}
static void
if_slowtimo_work(struct work *work, void *arg)
{
struct if_slowtimo_data *isd =
container_of(work, struct if_slowtimo_data, isd_work);
struct ifnet *ifp = isd->isd_ifp;
const int s = splnet();
KERNEL_LOCK(1, NULL);
(*ifp->if_slowtimo)(ifp);
KERNEL_UNLOCK_ONE(NULL);
splx(s);
mutex_enter(&isd->isd_lock);
if (isd->isd_trigger) {
isd->isd_trigger = false;
printf("%s: watchdog triggered\n", ifp->if_xname);
}
isd->isd_queued = false;
if (!isd->isd_dying)
callout_schedule(&isd->isd_ch, hz / IFNET_SLOWHZ);
mutex_exit(&isd->isd_lock);
}
static int
sysctl_if_watchdog(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
struct ifnet *ifp = node.sysctl_data;
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg) {
mutex_enter(&isd->isd_lock);
KASSERT(!isd->isd_dying);
isd->isd_trigger = true;
callout_schedule(&isd->isd_ch, 0);
mutex_exit(&isd->isd_lock);
}
return 0;
}
static void
sysctl_watchdog_setup(struct ifnet *ifp)
{
struct sysctllog **clog = &ifp->if_sysctl_log;
const struct sysctlnode *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, ifp->if_xname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "watchdog",
SYSCTL_DESCR("Interface watchdog controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "trigger",
SYSCTL_DESCR("Trigger watchdog timeout"),
sysctl_if_watchdog, 0, (int *)ifp, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl watchdog nodes\n", ifp->if_xname);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splsoftnet or equivalent.
*/
void
if_up(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
if_up_locked(ifp);
IFNET_UNLOCK(ifp);
}
/*
* Set/clear promiscuous mode on interface ifp based on the truth value
* of pswitch. The calls are reference counted so that only the first
* "on" request actually has an effect, as does the final "off" request.
* Results are undefined if the "off" and "on" requests are not matched.
*/
int
ifpromisc_locked(struct ifnet *ifp, int pswitch)
{
int pcount, ret = 0;
u_short nflags;
KASSERT(IFNET_LOCKED(ifp));
pcount = ifp->if_pcount;
if (pswitch) {
/*
* Allow the device to be "placed" into promiscuous
* mode even if it is not configured up. It will
* consult IFF_PROMISC when it is brought up.
*/
if (ifp->if_pcount++ != 0)
goto out;
nflags = ifp->if_flags | IFF_PROMISC;
} else {
if (--ifp->if_pcount > 0)
goto out;
nflags = ifp->if_flags & ~IFF_PROMISC;
}
ret = if_flags_set(ifp, nflags);
/* Restore interface state if not successful. */
if (ret != 0)
ifp->if_pcount = pcount;
out:
return ret;
}
int
ifpromisc(struct ifnet *ifp, int pswitch)
{
int e;
IFNET_LOCK(ifp);
e = ifpromisc_locked(ifp, pswitch);
IFNET_UNLOCK(ifp);
return e;
}
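/*
 * A minimal usage sketch of the reference-counted interface (hypothetical,
 * for illustration only):
 *
 *	error = ifpromisc(ifp, 1);		(first "on" sets IFF_PROMISC)
 *	...
 *	if (error == 0)
 *		(void)ifpromisc(ifp, 0);	(matching "off"; the last one
 *						 clears IFF_PROMISC)
 */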
/*
* if_ioctl(ifp, cmd, data)
*
* Apply an ioctl command to the interface. Returns 0 on success,
* nonzero errno(3) number on failure.
*
* For SIOCADDMULTI/SIOCDELMULTI, caller need not hold locks -- it
* is the driver's responsibility to take any internal locks.
* (Kernel logic should generally invoke these only through
* if_mcast_op.)
*
* For all other ioctls, caller must hold ifp->if_ioctl_lock,
* a.k.a. IFNET_LOCK. May sleep.
*/
int
if_ioctl(struct ifnet *ifp, u_long cmd, void *data)
{
switch (cmd) {
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
default:
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
}
return (*ifp->if_ioctl)(ifp, cmd, data);
}
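/*
 * A minimal sketch of a locked ioctl call, assuming a caller-prepared
 * struct ifreq ifr (hypothetical, for illustration only):
 *
 *	IFNET_LOCK(ifp);
 *	error = if_ioctl(ifp, SIOCSIFFLAGS, &ifr);
 *	IFNET_UNLOCK(ifp);
 */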
/*
* if_init(ifp)
*
* Prepare the hardware underlying ifp to process packets
* according to its current configuration. Returns 0 on success,
* nonzero errno(3) number on failure.
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
int
if_init(struct ifnet *ifp)
{
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
return (*ifp->if_init)(ifp);
}
/*
* if_stop(ifp, disable)
*
* Stop the hardware underlying ifp from processing packets.
*
* If disable is true, ... XXX(?)
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
void
if_stop(struct ifnet *ifp, int disable)
{
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
(*ifp->if_stop)(ifp, disable);
}
/*
* Map interface name to
* interface structure pointer.
*/
struct ifnet *
ifunit(const char *name)
{
struct ifnet *ifp;
const char *cp = name;
u_int unit = 0;
u_int i;
/*
* If the entire name is a number, treat it as an ifindex.
*/
for (i = 0; i < IFNAMSIZ && *cp >= '0' && *cp <= '9'; i++, cp++)
unit = unit * 10 + (*cp - '0');
/*
* If the number took all of the name, then it's a valid ifindex.
*/
if (i == IFNAMSIZ || (cp != name && *cp == '\0'))
return if_byindex(unit);
ifp = NULL;
const int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if (strcmp(ifp->if_xname, name) == 0)
goto out;
}
out:
pserialize_read_exit(s);
return ifp;
}
/*
* Get a reference of an ifnet object by an interface name.
* The returned reference is protected by psref(9). The caller
* must release a returned reference by if_put after use.
*/
struct ifnet *
if_get(const char *name, struct psref *psref)
{
struct ifnet *ifp;
const char *cp = name;
u_int unit = 0;
u_int i;
/*
* If the entire name is a number, treat it as an ifindex.
*/
for (i = 0; i < IFNAMSIZ && *cp >= '0' && *cp <= '9'; i++, cp++)
unit = unit * 10 + (*cp - '0');
/*
* If the number took all of the name, then it's a valid ifindex.
*/
if (i == IFNAMSIZ || (cp != name && *cp == '\0'))
return if_get_byindex(unit, psref);
ifp = NULL;
const int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if (strcmp(ifp->if_xname, name) == 0) {
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifp->if_psref,
ifnet_psref_class);
goto out;
}
}
out:
pserialize_read_exit(s);
return ifp;
}
/*
* Release a reference of an ifnet object given by if_get, if_get_byindex
* or if_get_bylla.
*/
void
if_put(const struct ifnet *ifp, struct psref *psref)
{
if (ifp == NULL)
return;
psref_release(psref, &ifp->if_psref, ifnet_psref_class);
}
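/*
* Typical usage (sketch, mirroring doifioctl() and if_sdl_sysctl() below):
* bind the LWP for the lifetime of the psref, look the interface up by name,
* and always release the reference when done:
*
*	struct psref psref;
*	const int bound = curlwp_bind();
*	struct ifnet *ifp = if_get("lo0", &psref);
*	if (ifp != NULL) {
*		... use ifp ...
*		if_put(ifp, &psref);
*	}
*	curlwp_bindx(bound);
*/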
/*
* Return ifp having idx. Return NULL if not found. Normally if_byindex
* should be used.
*/
ifnet_t *
_if_byindex(u_int idx)
{
return (__predict_true(idx < if_indexlim)) ? ifindex2ifnet[idx] : NULL;
}
/*
* Return ifp having idx. Return NULL if not found or the found ifp is
* already deactivated.
*/
ifnet_t *
if_byindex(u_int idx)
{
ifnet_t *ifp;
ifp = _if_byindex(idx);
if (ifp != NULL && if_is_deactivated(ifp))
ifp = NULL;
return ifp;
}
/*
* Get a reference of an ifnet object by an interface index.
* The returned reference is protected by psref(9). The caller
* must release a returned reference by if_put after use.
*/
ifnet_t *
if_get_byindex(u_int idx, struct psref *psref)
{
ifnet_t *ifp;
const int s = pserialize_read_enter();
ifp = if_byindex(idx);
if (__predict_true(ifp != NULL)) {
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
return ifp;
}
ifnet_t *
if_get_bylla(const void *lla, unsigned char lla_len, struct psref *psref)
{
ifnet_t *ifp;
const int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if (ifp->if_addrlen != lla_len)
continue;
if (memcmp(lla, CLLADDR(ifp->if_sadl), lla_len) == 0) {
psref_acquire(psref, &ifp->if_psref,
ifnet_psref_class);
break;
}
}
pserialize_read_exit(s);
return ifp;
}
/*
* Note that this is safe only if the passed ifp is guaranteed not to be freed,
* for example because the caller is inside a pserialize read section, already
* holds a reference on the ifp, or holds some other object that guarantees the
* ifp cannot be freed indirectly.
*/
void
if_acquire(struct ifnet *ifp, struct psref *psref)
{
KASSERT(ifp->if_index != 0);
psref_acquire(psref, &ifp->if_psref, ifnet_psref_class);
}
bool
if_held(struct ifnet *ifp)
{
return psref_held(&ifp->if_psref, ifnet_psref_class);
}
/*
* Some tunnel interfaces can nest, e.g. IPv4 over IPv4 gif(4) tunnel over
* IPv4. Check the tunnel nesting count.
* Return > 0 if the tunnel nesting count exceeds the limit.
* Return 0 if the tunnel nesting count is equal to or less than the limit.
*/
int
if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, int limit)
{
struct m_tag *mtag;
int *count;
mtag = m_tag_find(m, PACKET_TAG_TUNNEL_INFO);
if (mtag != NULL) {
count = (int *)(mtag + 1);
if (++(*count) > limit) {
log(LOG_NOTICE,
"%s: recursively called too many times(%d)\n",
ifp->if_xname, *count);
return EIO;
}
} else {
mtag = m_tag_get(PACKET_TAG_TUNNEL_INFO, sizeof(*count),
M_NOWAIT);
if (mtag != NULL) {
m_tag_prepend(m, mtag);
count = (int *)(mtag + 1);
*count = 0;
} else {
log(LOG_DEBUG, "%s: m_tag_get() failed, "
"recursion calls are not prevented.\n",
ifp->if_xname);
}
}
return 0;
}
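/*
* Sketch of how a tunnel driver's output path might use this; the
* "max_nesting" limit here is a hypothetical per-driver setting, and a
* caller would typically drop the packet when the limit is exceeded:
*
*	error = if_tunnel_check_nesting(ifp, m, max_nesting);
*	if (error != 0) {
*		m_freem(m);
*		return error;
*	}
*/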
static void
if_tunnel_ro_init_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
tro->tr_ro = kmem_zalloc(sizeof(*tro->tr_ro), KM_SLEEP);
tro->tr_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}
static void
if_tunnel_ro_fini_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
rtcache_free(tro->tr_ro);
kmem_free(tro->tr_ro, sizeof(*tro->tr_ro));
mutex_obj_free(tro->tr_lock);
}
percpu_t *
if_tunnel_alloc_ro_percpu(void)
{
return percpu_create(sizeof(struct tunnel_ro),
if_tunnel_ro_init_pc, if_tunnel_ro_fini_pc, NULL);
}
void
if_tunnel_free_ro_percpu(percpu_t *ro_percpu)
{
percpu_free(ro_percpu, sizeof(struct tunnel_ro));
}
static void
if_tunnel_rtcache_free_pc(void *p, void *arg __unused,
struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
mutex_enter(tro->tr_lock);
rtcache_free(tro->tr_ro);
mutex_exit(tro->tr_lock);
}
void
if_tunnel_ro_percpu_rtcache_free(percpu_t *ro_percpu)
{
percpu_foreach(ro_percpu, if_tunnel_rtcache_free_pc, NULL);
}
void
if_export_if_data(ifnet_t * const ifp, struct if_data *ifi, bool zero_stats)
{
/* Collect the volatile stats first; this zeros *ifi. */
if_stats_to_if_data(ifp, ifi, zero_stats);
ifi->ifi_type = ifp->if_type;
ifi->ifi_addrlen = ifp->if_addrlen;
ifi->ifi_hdrlen = ifp->if_hdrlen;
ifi->ifi_link_state = ifp->if_link_state;
ifi->ifi_mtu = ifp->if_mtu;
ifi->ifi_metric = ifp->if_metric;
ifi->ifi_baudrate = ifp->if_baudrate;
ifi->ifi_lastchange = ifp->if_lastchange;
}
/* common */
int
ifioctl_common(struct ifnet *ifp, u_long cmd, void *data)
{
struct ifreq *ifr;
struct ifcapreq *ifcr;
struct ifdatareq *ifdr;
unsigned short flags;
char *descr;
int error;
switch (cmd) {
case SIOCSIFCAP:
ifcr = data;
if ((ifcr->ifcr_capenable & ~ifp->if_capabilities) != 0)
return EINVAL;
if (ifcr->ifcr_capenable == ifp->if_capenable)
return 0;
ifp->if_capenable = ifcr->ifcr_capenable;
/* Pre-compute the checksum flags mask. */
ifp->if_csum_flags_tx = 0;
ifp->if_csum_flags_rx = 0;
if (ifp->if_capenable & IFCAP_CSUM_IPv4_Tx)
ifp->if_csum_flags_tx |= M_CSUM_IPv4;
if (ifp->if_capenable & IFCAP_CSUM_IPv4_Rx)
ifp->if_csum_flags_rx |= M_CSUM_IPv4;
if (ifp->if_capenable & IFCAP_CSUM_TCPv4_Tx)
ifp->if_csum_flags_tx |= M_CSUM_TCPv4;
if (ifp->if_capenable & IFCAP_CSUM_TCPv4_Rx)
ifp->if_csum_flags_rx |= M_CSUM_TCPv4;
if (ifp->if_capenable & IFCAP_CSUM_UDPv4_Tx)
ifp->if_csum_flags_tx |= M_CSUM_UDPv4;
if (ifp->if_capenable & IFCAP_CSUM_UDPv4_Rx)
ifp->if_csum_flags_rx |= M_CSUM_UDPv4;
if (ifp->if_capenable & IFCAP_CSUM_TCPv6_Tx)
ifp->if_csum_flags_tx |= M_CSUM_TCPv6;
if (ifp->if_capenable & IFCAP_CSUM_TCPv6_Rx)
ifp->if_csum_flags_rx |= M_CSUM_TCPv6;
if (ifp->if_capenable & IFCAP_CSUM_UDPv6_Tx)
ifp->if_csum_flags_tx |= M_CSUM_UDPv6;
if (ifp->if_capenable & IFCAP_CSUM_UDPv6_Rx)
ifp->if_csum_flags_rx |= M_CSUM_UDPv6;
if (ifp->if_capenable & IFCAP_TSOv4)
ifp->if_csum_flags_tx |= M_CSUM_TSOv4;
if (ifp->if_capenable & IFCAP_TSOv6)
ifp->if_csum_flags_tx |= M_CSUM_TSOv6;
#if NBRIDGE > 0
if (ifp->if_bridge != NULL)
bridge_calc_csum_flags(ifp->if_bridge);
#endif
if (ifp->if_flags & IFF_UP)
return ENETRESET;
return 0;
case SIOCSIFFLAGS:
ifr = data;
/*
* If if_is_mpsafe(ifp), KERNEL_LOCK isn't held here, but if_up
* and if_down aren't MP-safe yet, so we must hold the lock.
*/
KERNEL_LOCK_IF_IFP_MPSAFE(ifp);
if (ifp->if_flags & IFF_UP && (ifr->ifr_flags & IFF_UP) == 0) {
const int s = splsoftnet();
if_down_locked(ifp);
splx(s);
}
if (ifr->ifr_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) {
const int s = splsoftnet();
if_up_locked(ifp);
splx(s);
}
KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp);
flags = (ifp->if_flags & IFF_CANTCHANGE) |
(ifr->ifr_flags &~ IFF_CANTCHANGE);
if (ifp->if_flags != flags) {
ifp->if_flags = flags;
/* Notify that the flags have changed. */
rt_ifmsg(ifp);
}
break;
case SIOCGIFFLAGS:
ifr = data;
ifr->ifr_flags = ifp->if_flags;
break;
case SIOCGIFMETRIC:
ifr = data;
ifr->ifr_metric = ifp->if_metric;
break;
case SIOCGIFMTU:
ifr = data;
ifr->ifr_mtu = ifp->if_mtu;
break;
case SIOCGIFDLT:
ifr = data;
ifr->ifr_dlt = ifp->if_dlt;
break;
case SIOCGIFCAP:
ifcr = data;
ifcr->ifcr_capabilities = ifp->if_capabilities;
ifcr->ifcr_capenable = ifp->if_capenable;
break;
case SIOCSIFMETRIC:
ifr = data;
ifp->if_metric = ifr->ifr_metric;
break;
case SIOCGIFDATA:
ifdr = data;
if_export_if_data(ifp, &ifdr->ifdr_data, false);
break;
case SIOCGIFINDEX:
ifr = data;
ifr->ifr_index = ifp->if_index;
break;
case SIOCZIFDATA:
ifdr = data;
if_export_if_data(ifp, &ifdr->ifdr_data, true);
getnanotime(&ifp->if_lastchange);
break;
case SIOCSIFMTU:
ifr = data;
if (ifp->if_mtu == ifr->ifr_mtu)
break;
ifp->if_mtu = ifr->ifr_mtu;
return ENETRESET;
case SIOCSIFDESCR:
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd),
NULL);
if (error)
return error;
ifr = data;
if (ifr->ifr_buflen > IFDESCRSIZE)
return ENAMETOOLONG;
if (ifr->ifr_buf == NULL || ifr->ifr_buflen == 0) {
/* unset description */
descr = NULL;
} else {
descr = kmem_zalloc(IFDESCRSIZE, KM_SLEEP);
/*
* copy (IFDESCRSIZE - 1) bytes to ensure
* terminating nul
*/
error = copyin(ifr->ifr_buf, descr, IFDESCRSIZE - 1);
if (error) {
kmem_free(descr, IFDESCRSIZE);
return error;
}
}
if (ifp->if_description != NULL)
kmem_free(ifp->if_description, IFDESCRSIZE);
ifp->if_description = descr;
break;
case SIOCGIFDESCR:
ifr = data;
descr = ifp->if_description;
if (descr == NULL)
return ENOMSG;
if (ifr->ifr_buflen < IFDESCRSIZE)
return EINVAL;
error = copyout(descr, ifr->ifr_buf, IFDESCRSIZE);
if (error)
return error;
break;
default:
return ENOTTY;
}
return 0;
}
int
ifaddrpref_ioctl(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
struct if_addrprefreq *ifap = (struct if_addrprefreq *)data;
struct ifaddr *ifa;
const struct sockaddr *any, *sa;
union {
struct sockaddr sa;
struct sockaddr_storage ss;
} u, v;
int s, error = 0;
switch (cmd) {
case SIOCSIFADDRPREF:
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd),
NULL);
if (error)
return error;
break;
case SIOCGIFADDRPREF:
break;
default:
return EOPNOTSUPP;
}
/* sanity checks */
if (data == NULL || ifp == NULL) {
panic("invalid argument to %s", __func__);
/*NOTREACHED*/
}
/* address must be specified on ADD and DELETE */
sa = sstocsa(&ifap->ifap_addr);
if (sa->sa_family != sofamily(so))
return EINVAL;
if ((any = sockaddr_any(sa)) == NULL || sa->sa_len != any->sa_len)
return EINVAL;
sockaddr_externalize(&v.sa, sizeof(v.ss), sa);
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != sa->sa_family)
continue;
sockaddr_externalize(&u.sa, sizeof(u.ss), ifa->ifa_addr);
if (sockaddr_cmp(&u.sa, &v.sa) == 0)
break;
}
if (ifa == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
switch (cmd) {
case SIOCSIFADDRPREF:
ifa->ifa_preference = ifap->ifap_preference;
goto out;
case SIOCGIFADDRPREF:
/* fill in the if_laddrreq structure */
(void)sockaddr_copy(sstosa(&ifap->ifap_addr),
sizeof(ifap->ifap_addr), ifa->ifa_addr);
ifap->ifap_preference = ifa->ifa_preference;
goto out;
default:
error = EOPNOTSUPP;
}
out:
pserialize_read_exit(s);
return error;
}
/*
* Interface ioctls.
*/
static int
doifioctl(struct socket *so, u_long cmd, void *data, struct lwp *l)
{
struct ifnet *ifp;
struct ifreq *ifr;
int error = 0;
u_long ocmd = cmd;
u_short oif_flags;
struct ifreq ifrb;
struct oifreq *oifr = NULL;
int r;
struct psref psref;
bool do_if43_post = false;
bool do_ifm80_post = false;
switch (cmd) {
case SIOCGIFCONF:
return ifconf(cmd, data);
case SIOCINITIFADDR:
return EPERM;
default:
MODULE_HOOK_CALL(uipc_syscalls_40_hook, (cmd, data), enosys(),
error);
if (error != ENOSYS)
return error;
MODULE_HOOK_CALL(uipc_syscalls_50_hook, (l, cmd, data),
enosys(), error);
if (error != ENOSYS)
return error;
error = 0;
break;
}
ifr = data;
/* Pre-conversion */
MODULE_HOOK_CALL(if_cvtcmd_43_hook, (&cmd, ocmd), enosys(), error);
if (cmd != ocmd) {
oifr = data;
data = ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
do_if43_post = true;
}
MODULE_HOOK_CALL(ifmedia_80_pre_hook, (ifr, &cmd, &do_ifm80_post),
enosys(), error);
switch (cmd) {
case SIOCIFCREATE:
case SIOCIFDESTROY: {
const int bound = curlwp_bind();
if (l != NULL) {
ifp = if_get(ifr->ifr_name, &psref);
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
KAUTH_ARG(cmd), NULL);
if (ifp != NULL)
if_put(ifp, &psref);
if (error != 0) {
curlwp_bindx(bound);
return error;
}
}
KERNEL_LOCK_UNLESS_NET_MPSAFE();
mutex_enter(&if_clone_mtx);
r = (cmd == SIOCIFCREATE) ?
if_clone_create(ifr->ifr_name) : if_clone_destroy(ifr->ifr_name);
mutex_exit(&if_clone_mtx);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
curlwp_bindx(bound);
return r;
}
case SIOCIFGCLONERS: {
struct if_clonereq *req = (struct if_clonereq *)data;
return if_clone_list(req->ifcr_count, req->ifcr_buffer,
&req->ifcr_total);
}
}
if ((cmd & IOC_IN) == 0 || IOCPARM_LEN(cmd) < sizeof(ifr->ifr_name))
return EINVAL;
const int bound = curlwp_bind();
ifp = if_get(ifr->ifr_name, &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
return ENXIO;
}
switch (cmd) {
case SIOCALIFADDR:
case SIOCDLIFADDR:
case SIOCSIFADDRPREF:
case SIOCSIFFLAGS:
case SIOCSIFCAP:
case SIOCSIFMETRIC:
case SIOCZIFDATA:
case SIOCSIFMTU:
case SIOCSIFPHYADDR:
case SIOCDIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
case SIOCSLIFPHYADDR:
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCSETHERCAP:
case SIOCSIFMEDIA:
case SIOCSDRVSPEC:
case SIOCG80211:
case SIOCS80211:
case SIOCS80211NWID:
case SIOCS80211NWKEY:
case SIOCS80211POWER:
case SIOCS80211BSSID:
case SIOCS80211CHANNEL:
case SIOCSLINKSTR:
if (l != NULL) {
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
KAUTH_ARG(cmd), NULL);
if (error != 0)
goto out;
}
}
oif_flags = ifp->if_flags;
KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp);
IFNET_LOCK(ifp);
error = if_ioctl(ifp, cmd, data);
if (error != ENOTTY)
;
else if (so->so_proto == NULL)
error = EOPNOTSUPP;
else {
KERNEL_LOCK_IF_IFP_MPSAFE(ifp);
MODULE_HOOK_CALL(if_ifioctl_43_hook,
(so, ocmd, cmd, data, l), enosys(), error);
if (error == ENOSYS)
error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so,
cmd, data, ifp);
KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp);
}
if (((oif_flags ^ ifp->if_flags) & IFF_UP) != 0) {
if ((ifp->if_flags & IFF_UP) != 0) {
const int s = splsoftnet();
if_up_locked(ifp);
splx(s);
}
}
/* Post-conversion */
if (do_ifm80_post && (error == 0))
MODULE_HOOK_CALL(ifmedia_80_post_hook, (ifr, cmd),
enosys(), error);
if (do_if43_post)
IFREQN2O_43(oifr, ifr);
IFNET_UNLOCK(ifp);
KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp);
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Return interface configuration
* of system. List may be used
* in later ioctl's (above) to get
* other information.
*
* Each record is a struct ifreq. Before the addition of
* sockaddr_storage, the API rule was that sockaddr flavors that did
* not fit would extend beyond the struct ifreq, with the next struct
* ifreq starting sa_len beyond the struct sockaddr. Because the
* union in struct ifreq includes struct sockaddr_storage, every kind
* of sockaddr must fit. Thus, there are no longer any overlength
* records.
*
* Records are added to the user buffer if they fit, and ifc_len is
* adjusted to the length that was written. Thus, the user is only
* assured of getting the complete list if ifc_len on return is at
* least sizeof(struct ifreq) less than it was on entry.
*
* If the user buffer pointer is NULL, this routine copies no data and
* returns the amount of space that would be needed.
*
* Invariants:
* ifrp points to the next part of the user's buffer to be used. If
* ifrp != NULL, space holds the number of bytes remaining that we may
* write at ifrp. Otherwise, space holds the number of bytes that
* would have been written had there been adequate space.
*/
/*ARGSUSED*/
static int
ifconf(u_long cmd, void *data)
{
struct ifconf *ifc = (struct ifconf *)data;
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifreq ifr, *ifrp = NULL;
int space = 0, error = 0;
const int sz = (int)sizeof(struct ifreq);
const bool docopy = ifc->ifc_req != NULL;
struct psref psref;
if (docopy) {
if (ifc->ifc_len < 0)
return EINVAL;
space = ifc->ifc_len;
ifrp = ifc->ifc_req;
}
memset(&ifr, 0, sizeof(ifr));
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
psref_acquire(&psref, &ifp->if_psref, ifnet_psref_class);
pserialize_read_exit(s);
(void)strncpy(ifr.ifr_name, ifp->if_xname,
sizeof(ifr.ifr_name));
if (ifr.ifr_name[sizeof(ifr.ifr_name) - 1] != '\0') {
error = ENAMETOOLONG;
goto release_exit;
}
if (IFADDR_READER_EMPTY(ifp)) {
/* Interface with no addresses - send zero sockaddr. */
memset(&ifr.ifr_addr, 0, sizeof(ifr.ifr_addr));
if (!docopy) {
space += sz;
goto next;
}
if (space >= sz) {
error = copyout(&ifr, ifrp, sz);
if (error != 0)
goto release_exit;
ifrp++;
space -= sz;
}
}
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct sockaddr *sa = ifa->ifa_addr;
/* all sockaddrs must fit in sockaddr_storage */
KASSERT(sa->sa_len <= sizeof(ifr.ifr_ifru));
if (!docopy) {
space += sz;
continue;
}
memcpy(&ifr.ifr_space, sa, sa->sa_len);
pserialize_read_exit(s);
if (space >= sz) {
error = copyout(&ifr, ifrp, sz);
if (error != 0)
goto release_exit;
ifrp++;
space -= sz;
}
s = pserialize_read_enter();
}
pserialize_read_exit(s);
next:
s = pserialize_read_enter();
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
if (docopy) {
KASSERT(0 <= space && space <= ifc->ifc_len);
ifc->ifc_len -= space;
} else {
KASSERT(space >= 0);
ifc->ifc_len = space;
}
return 0;
release_exit:
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
curlwp_bindx(bound);
return error;
}
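/*
* Userland sketch of the two-pass convention described above ifconf(): a
* first SIOCGIFCONF with ifc_req == NULL only reports the space needed, and
* a second call with a buffer of that size copies out the records:
*
*	struct ifconf ifc;
*	ifc.ifc_req = NULL;
*	ifc.ifc_len = 0;
*	ioctl(s, SIOCGIFCONF, &ifc);		(sizing pass)
*	ifc.ifc_req = malloc(ifc.ifc_len);
*	ioctl(s, SIOCGIFCONF, &ifc);		(copy-out pass)
*/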
int
ifreq_setaddr(u_long cmd, struct ifreq *ifr, const struct sockaddr *sa)
{
uint8_t len = sizeof(ifr->ifr_ifru.ifru_space);
struct ifreq ifrb;
struct oifreq *oifr = NULL;
u_long ocmd = cmd;
int hook;
MODULE_HOOK_CALL(if_cvtcmd_43_hook, (&cmd, ocmd), enosys(), hook);
if (hook != ENOSYS) {
if (cmd != ocmd) {
oifr = (struct oifreq *)(void *)ifr;
ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
len = sizeof(oifr->ifr_addr);
}
}
if (len < sa->sa_len)
return EFBIG;
memset(&ifr->ifr_addr, 0, len);
sockaddr_copy(&ifr->ifr_addr, len, sa);
if (cmd != ocmd)
IFREQN2O_43(oifr, ifr);
return 0;
}
/*
* Wrapper function for drivers that do not provide if_transmit().
*/
static int
if_transmit(struct ifnet *ifp, struct mbuf *m)
{
int error;
size_t pktlen = m->m_pkthdr.len;
bool mcast = (m->m_flags & M_MCAST) != 0;
const int s = splnet();
IFQ_ENQUEUE(&ifp->if_snd, m, error);
if (error != 0) {
/* mbuf is already freed */
goto out;
}
net_stat_ref_t nsr = IF_STAT_GETREF(ifp);
if_statadd_ref(nsr, if_obytes, pktlen);
if (mcast)
if_statinc_ref(nsr, if_omcasts);
IF_STAT_PUTREF(ifp);
if ((ifp->if_flags & IFF_OACTIVE) == 0)
if_start_lock(ifp);
out:
splx(s);
return error;
}
int
if_transmit_lock(struct ifnet *ifp, struct mbuf *m)
{
int error;
kmsan_check_mbuf(m);
#ifdef ALTQ
KERNEL_LOCK(1, NULL);
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
error = if_transmit(ifp, m);
KERNEL_UNLOCK_ONE(NULL);
} else {
KERNEL_UNLOCK_ONE(NULL);
error = (*ifp->if_transmit)(ifp, m);
/* mbuf is already freed */
}
#else /* !ALTQ */
error = (*ifp->if_transmit)(ifp, m);
/* mbuf is already freed */
#endif /* !ALTQ */
return error;
}
/*
* Queue message on interface, and start output if interface
* not yet active.
*/
int
ifq_enqueue(struct ifnet *ifp, struct mbuf *m)
{
return if_transmit_lock(ifp, m);
}
/*
* Queue message on interface, possibly using a second fast queue
*/
int
ifq_enqueue2(struct ifnet *ifp, struct ifqueue *ifq, struct mbuf *m)
{
int error = 0;
if (ifq != NULL
#ifdef ALTQ
&& ALTQ_IS_ENABLED(&ifp->if_snd) == 0
#endif
) {
if (IF_QFULL(ifq)) {
IF_DROP(&ifp->if_snd);
m_freem(m);
if (error == 0)
error = ENOBUFS;
} else
IF_ENQUEUE(ifq, m);
} else
IFQ_ENQUEUE(&ifp->if_snd, m, error);
if (error != 0) {
if_statinc(ifp, if_oerrors);
return error;
}
return 0;
}
int
if_addr_init(ifnet_t *ifp, struct ifaddr *ifa, const bool src)
{
int rc;
KASSERT(IFNET_LOCKED(ifp));
if (ifp->if_initaddr != NULL)
rc = (*ifp->if_initaddr)(ifp, ifa, src);
else if (src || (rc = if_ioctl(ifp, SIOCSIFDSTADDR, ifa)) == ENOTTY)
rc = if_ioctl(ifp, SIOCINITIFADDR, ifa);
return rc;
}
int
if_do_dad(struct ifnet *ifp)
{
if ((ifp->if_flags & IFF_LOOPBACK) != 0)
return 0;
switch (ifp->if_type) {
case IFT_FAITH:
/*
* These interfaces do not have the IFF_LOOPBACK flag,
* but loop packets back. We do not have to do DAD on such
* interfaces. We should even omit it, because loop-backed
* responses would confuse the DAD procedure.
*/
return 0;
default:
/*
* Our DAD routine requires the interface up and running.
* However, some interfaces can be up before the RUNNING
* status. Additionally, users may try to assign addresses
* before the interface becomes up (or running).
* We simply skip DAD in such a case as a work around.
* XXX: we should rather mark "tentative" on such addresses,
* and do DAD after the interface becomes ready.
*/
if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) !=
(IFF_UP | IFF_RUNNING))
return 0;
return 1;
}
}
/*
* if_flags_set(ifp, flags)
*
* Ask ifp to change ifp->if_flags to flags, as if with the
* SIOCSIFFLAGS ioctl command.
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
int
if_flags_set(ifnet_t *ifp, const u_short flags)
{
int rc;
KASSERT(IFNET_LOCKED(ifp));
if (ifp->if_setflags != NULL)
rc = (*ifp->if_setflags)(ifp, flags);
else {
u_short cantflags, chgdflags;
struct ifreq ifr;
chgdflags = ifp->if_flags ^ flags;
cantflags = chgdflags & IFF_CANTCHANGE;
if (cantflags != 0)
ifp->if_flags ^= cantflags;
/*
* Traditionally, we do not call if_ioctl after
* setting/clearing only IFF_PROMISC if the interface
* isn't IFF_UP. Uphold that tradition.
*/
if (chgdflags == IFF_PROMISC && (ifp->if_flags & IFF_UP) == 0)
return 0;
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_flags = flags & ~IFF_CANTCHANGE;
rc = if_ioctl(ifp, SIOCSIFFLAGS, &ifr);
if (rc != 0 && cantflags != 0)
ifp->if_flags ^= cantflags;
}
return rc;
}
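/*
* Example (sketch): code that already holds IFNET_LOCK can toggle a
* changeable flag without worrying about the IFF_CANTCHANGE bits, which
* this routine filters out, much as ifpromisc_locked() above does for
* IFF_PROMISC:
*
*	error = if_flags_set(ifp, ifp->if_flags | IFF_DEBUG);
*/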
/*
* if_mcast_op(ifp, cmd, sa)
*
* Apply a multicast command, SIOCADDMULTI/SIOCDELMULTI, to the
* interface. Returns 0 on success, nonzero errno(3) number on
* failure.
*
* May sleep.
*
* Use this, not if_ioctl, for the multicast commands.
*/
int
if_mcast_op(ifnet_t *ifp, const unsigned long cmd, const struct sockaddr *sa)
{
int rc;
struct ifreq ifr;
switch (cmd) {
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
default:
panic("invalid ifnet multicast command: 0x%lx", cmd);
}
ifreq_setaddr(cmd, &ifr, sa);
rc = if_ioctl(ifp, cmd, &ifr);
return rc;
}
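/*
* Example (sketch): protocol code joining and later leaving a link-level
* multicast group goes through if_mcast_op() rather than calling the
* driver ioctl directly; "sa" is the group's link-level sockaddr:
*
*	error = if_mcast_op(ifp, SIOCADDMULTI, sa);
*	...
*	error = if_mcast_op(ifp, SIOCDELMULTI, sa);
*/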
static void
sysctl_sndq_setup(struct sysctllog **clog, const char *ifname,
struct ifaltq *ifq)
{
const struct sysctlnode *cnode, *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, ifname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sndq",
SYSCTL_DESCR("Interface output queue controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "len",
SYSCTL_DESCR("Current output queue length"),
NULL, 0, &ifq->ifq_len, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed output queue length"),
NULL, 0, &ifq->ifq_maxlen, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Packets dropped due to full output queue"),
NULL, 0, &ifq->ifq_drops, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl nodes\n", ifname);
return;
}
static int
if_sdl_sysctl(SYSCTLFN_ARGS)
{
struct ifnet *ifp;
const struct sockaddr_dl *sdl;
struct psref psref;
int error = 0;
if (namelen != 1)
return EINVAL;
const int bound = curlwp_bind();
ifp = if_get_byindex(name[0], &psref);
if (ifp == NULL) {
error = ENODEV;
goto out0;
}
sdl = ifp->if_sadl;
if (sdl == NULL) {
*oldlenp = 0;
goto out1;
}
if (oldp == NULL) {
*oldlenp = sdl->sdl_alen;
goto out1;
}
if (*oldlenp >= sdl->sdl_alen)
*oldlenp = sdl->sdl_alen;
error = sysctl_copyout(l, &sdl->sdl_data[sdl->sdl_nlen],
oldp, *oldlenp);
out1:
if_put(ifp, &psref);
out0:
curlwp_bindx(bound);
return error;
}
static void
if_sysctl_setup(struct sysctllog **clog)
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sdl",
SYSCTL_DESCR("Get active link-layer address"),
if_sdl_sysctl, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: ufs_dirhash.c,v 1.41 2022/08/07 02:33:47 simonb Exp $ */
/*
* Copyright (c) 2001, 2002 Ian Dowse. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.41 2022/08/07 02:33:47 simonb Exp $");
/*
* This implements a hash-based lookup scheme for UFS directories.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/types.h>
#include <sys/hash.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <sys/atomic.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/dirhash.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
/*
* Defaults for dirhash cache sizes:
* - use up to 1/64th of system memory.
* - disable dirhash (set the cache size to 0 bytes) if the
* calculated cache size is less than 2MB.
* - cap maximum size of the dirhash cache at 32MB.
*/
#define DIRHASH_DEFAULT_DIVIDER 64
#define MIN_DEFAULT_DIRHASH_MEM (2 * 1024 * 1024)
#define MAX_DEFAULT_DIRHASH_MEM (32 * 1024 * 1024)
#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1))
#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1))
#define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen <= 0)
#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n))
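/*
* Worked examples of the defaults above (illustrative arithmetic only):
* with 1GB of RAM the default cache is 1024MB / 64 = 16MB; with 64MB of
* RAM the computed 1MB falls below the 2MB minimum, so dirhash is disabled;
* with 4GB of RAM the computed 64MB is clamped to the 32MB cap.
*/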
static u_int ufs_dirhashminblks = 5;
static u_int ufs_dirhashmaxmem = 0;
static u_int ufs_dirhashmem;
static u_int ufs_dirhashcheck = 0;
static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen);
static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff,
int dirblksiz);
static void ufsdirhash_delslot(struct dirhash *dh, int slot);
static int ufsdirhash_findslot(struct dirhash *dh, const char *name,
int namelen, doff_t offset);
static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset,
int dirblksiz);
static int ufsdirhash_recycle(int wanted);
static pool_cache_t ufsdirhashblk_cache;
static pool_cache_t ufsdirhash_cache;
#define DIRHASHLIST_LOCK() mutex_enter(&ufsdirhash_lock)
#define DIRHASHLIST_UNLOCK() mutex_exit(&ufsdirhash_lock)
#define DIRHASH_LOCK(dh) mutex_enter(&(dh)->dh_lock)
#define DIRHASH_UNLOCK(dh) mutex_exit(&(dh)->dh_lock)
#define DIRHASH_BLKALLOC() \
pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT)
#define DIRHASH_BLKFREE(ptr) \
pool_cache_put(ufsdirhashblk_cache, ptr)
/* Dirhash list; recently-used entries are near the tail. */
static TAILQ_HEAD(, dirhash) ufsdirhash_list;
/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
static kmutex_t ufsdirhash_lock;
/*
* Locking order:
* ufsdirhash_lock
* dh_lock
*
* The dh_lock mutex should be acquired either via the inode lock, or via
* ufsdirhash_lock. Only the owner of the inode may free the associated
* dirhash, but anything can steal its memory and set dh_hash to NULL.
*/
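/*
* Sketch of the acquisition order when both locks are needed, as in
* ufsdirhash_recycle() below:
*
*	DIRHASHLIST_LOCK();
*	DIRHASH_LOCK(dh);
*	...
*	DIRHASH_UNLOCK(dh);
*	DIRHASHLIST_UNLOCK();
*/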
/*
* Attempt to build up a hash table for the directory contents in
* inode 'ip'. Returns 0 on success, or -1 if the operation failed.
*/
int
ufsdirhash_build(struct inode *ip)
{
struct dirhash *dh;
struct buf *bp = NULL;
struct direct *ep;
struct vnode *vp;
doff_t bmask, pos;
int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
/* Check if we can/should use dirhash. */
if (ip->i_dirhash == NULL) {
if (ufs_dirhashmaxmem == 0 || ip->i_size < (ufs_dirhashminblks * dirblksiz) ||
OFSFMT(ip))
return (-1);
} else {
/* Hash exists, but sysctls could have changed. */
if (ip->i_size < (ufs_dirhashminblks * dirblksiz) ||
ufs_dirhashmem > ufs_dirhashmaxmem) {
ufsdirhash_free(ip);
return (-1);
}
/* Check if hash exists and is intact (note: unlocked read). */
if (ip->i_dirhash->dh_hash != NULL)
return (0);
/* Free the old, recycled hash and build a new one. */
ufsdirhash_free(ip);
}
/* Don't hash removed directories. */
if (ip->i_nlink == 0)
return (-1);
vp = ip->i_vnode;
/* Allocate 50% more entries than this dir size could ever need. */
KASSERT(ip->i_size >= dirblksiz);
nslots = ip->i_size / UFS_DIRECTSIZ(1);
nslots = (nslots * 3 + 1) / 2;
narrays = howmany(nslots, DH_NBLKOFF);
nslots = narrays * DH_NBLKOFF;
dirblocks = howmany(ip->i_size, dirblksiz);
nblocks = (dirblocks * 3 + 1) / 2;
memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
nblocks * sizeof(*dh->dh_blkfree);
while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) >
ufs_dirhashmaxmem) {
atomic_add_int(&ufs_dirhashmem, -memreqd);
if (memreqd > ufs_dirhashmaxmem / 2)
return (-1);
/* Try to free some space. */
if (ufsdirhash_recycle(memreqd) != 0)
return (-1);
else
DIRHASHLIST_UNLOCK();
}
/*
* Use non-blocking mallocs so that we will revert to a linear
* lookup on failure rather than potentially blocking forever.
*/
dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT);
if (dh == NULL) {
atomic_add_int(&ufs_dirhashmem, -memreqd);
return (-1);
}
memset(dh, 0, sizeof(*dh));
mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE);
DIRHASH_LOCK(dh);
dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]);
dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP);
dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]);
dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP);
if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
goto fail;
for (i = 0; i < narrays; i++) {
if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL)
goto fail;
for (j = 0; j < DH_NBLKOFF; j++)
dh->dh_hash[i][j] = DIRHASH_EMPTY;
}
/* Initialise the hash table and block statistics. */
dh->dh_narrays = narrays;
dh->dh_hlen = nslots;
dh->dh_nblk = nblocks;
dh->dh_dirblks = dirblocks;
for (i = 0; i < dirblocks; i++)
dh->dh_blkfree[i] = dirblksiz / DIRALIGN;
for (i = 0; i < DH_NFSTATS; i++)
dh->dh_firstfree[i] = -1;
dh->dh_firstfree[DH_NFSTATS] = 0;
dh->dh_seqopt = 0;
dh->dh_seqoff = 0;
dh->dh_score = DH_SCOREINIT;
ip->i_dirhash = dh;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
pos = 0;
while (pos < ip->i_size) {
preempt_point();
/* If necessary, get the next directory block. */
if ((pos & bmask) == 0) {
if (bp != NULL)
brelse(bp, 0);
if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0)
goto fail;
}
/* Add this entry to the hash. */
ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
if (ep->d_reclen == 0 || ep->d_reclen >
dirblksiz - (pos & (dirblksiz - 1))) {
/* Corrupted directory. */
brelse(bp, 0);
goto fail;
}
if (ep->d_ino != 0) {
/* Add the entry (simplified ufsdirhash_add). */
slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
slot = WRAPINCR(slot, dh->dh_hlen);
dh->dh_hused++;
DH_ENTRY(dh, slot) = pos;
ufsdirhash_adjfree(dh, pos, -UFS_DIRSIZ(0, ep, needswap),
dirblksiz);
}
pos += ep->d_reclen;
}
if (bp != NULL)
brelse(bp, 0);
DIRHASHLIST_LOCK();
TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 1;
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
return (0);
fail:
ip->i_dirhash = NULL;
DIRHASH_UNLOCK(dh);
if (dh->dh_hash != NULL) {
for (i = 0; i < narrays; i++)
if (dh->dh_hash[i] != NULL)
DIRHASH_BLKFREE(dh->dh_hash[i]);
kmem_free(dh->dh_hash, dh->dh_hashsz);
}
if (dh->dh_blkfree != NULL)
kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
mutex_destroy(&dh->dh_lock);
pool_cache_put(ufsdirhash_cache, dh);
atomic_add_int(&ufs_dirhashmem, -memreqd);
return (-1);
}
/*
* Free any hash table associated with inode 'ip'.
*/
void
ufsdirhash_free(struct inode *ip)
{
struct dirhash *dh;
int i, mem;
if ((dh = ip->i_dirhash) == NULL)
return;
ip->i_dirhash = NULL;
DIRHASHLIST_LOCK();
if (dh->dh_onlist)
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
DIRHASHLIST_UNLOCK();
/* The dirhash pointed to by 'dh' is exclusively ours now. */
mem = sizeof(*dh);
if (dh->dh_hash != NULL) {
for (i = 0; i < dh->dh_narrays; i++)
DIRHASH_BLKFREE(dh->dh_hash[i]);
kmem_free(dh->dh_hash, dh->dh_hashsz);
kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
mem += dh->dh_hashsz;
mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash);
mem += dh->dh_nblk * sizeof(*dh->dh_blkfree);
}
mutex_destroy(&dh->dh_lock);
pool_cache_put(ufsdirhash_cache, dh);
atomic_add_int(&ufs_dirhashmem, -mem);
}
/*
* Find the offset of the specified name within the given inode.
* Returns 0 on success, ENOENT if the entry does not exist, or
* EJUSTRETURN if the caller should revert to a linear search.
*
* If successful, the directory offset is stored in *offp, and a
* pointer to a struct buf containing the entry is stored in *bpp. If
* prevoffp is non-NULL, the offset of the previous entry within
* the UFS_DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
* is the first in a block, the start of the block is used).
*/
int
ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp,
struct buf **bpp, doff_t *prevoffp)
{
struct dirhash *dh, *dh_next;
struct direct *dp;
struct vnode *vp;
struct buf *bp;
doff_t blkoff, bmask, offset, prevoff;
int i, slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (EJUSTRETURN);
/*
* Move this dirhash towards the end of the list if it has a
* score higher than the next entry, and acquire the dh_lock.
* Optimise the case where it's already the last by performing
* an unlocked read of the TAILQ_NEXT pointer.
*
* In both cases, end up holding just dh_lock.
*/
if (TAILQ_NEXT(dh, dh_list) != NULL) {
DIRHASHLIST_LOCK();
DIRHASH_LOCK(dh);
/*
* If the new score will be greater than that of the next
* entry, then move this entry past it. With both mutexes
* held, dh_next won't go away, but its dh_score could
* change; that's not important since it is just a hint.
*/
if (dh->dh_hash != NULL && (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
dh->dh_score >= dh_next->dh_score) {
KASSERT(dh->dh_onlist);
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
dh_list);
}
DIRHASHLIST_UNLOCK();
} else {
/* Already the last, though that could change as we wait. */
DIRHASH_LOCK(dh);
}
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (EJUSTRETURN);
}
/* Update the score. */
if (dh->dh_score < DH_SCOREMAX)
dh->dh_score++;
vp = ip->i_vnode;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
blkoff = -1;
bp = NULL;
restart:
slot = ufsdirhash_hash(dh, name, namelen);
if (dh->dh_seqopt) {
/*
* Sequential access optimisation. dh_seqoff contains the
* offset of the directory entry immediately following
* the last entry that was looked up. Check if this offset
* appears in the hash chain for the name we are looking for.
*/
for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
i = WRAPINCR(i, dh->dh_hlen))
if (offset == dh->dh_seqoff)
break;
if (offset == dh->dh_seqoff) {
/*
* We found an entry with the expected offset. This
* is probably the entry we want, but if not, the
* code below will turn off seqoff and retry.
*/
slot = i;
} else
dh->dh_seqopt = 0;
}
for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
slot = WRAPINCR(slot, dh->dh_hlen)) {
if (offset == DIRHASH_DEL)
continue;
if (offset < 0 || offset >= ip->i_size)
panic("ufsdirhash_lookup: bad offset in hash array");
if ((offset & ~bmask) != blkoff) {
if (bp != NULL)
brelse(bp, 0);
blkoff = offset & ~bmask;
if (ufs_blkatoff(vp, (off_t)blkoff,
NULL, &bp, false) != 0) {
DIRHASH_UNLOCK(dh);
return (EJUSTRETURN);
}
}
dp = (struct direct *)((char *)bp->b_data + (offset & bmask));
if (dp->d_reclen == 0 || dp->d_reclen >
dirblksiz - (offset & (dirblksiz - 1))) {
/* Corrupted directory. */
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (EJUSTRETURN);
}
if (dp->d_namlen == namelen &&
memcmp(dp->d_name, name, namelen) == 0) {
/* Found. Get the prev offset if needed. */
if (prevoffp != NULL) {
if (offset & (dirblksiz - 1)) {
prevoff = ufsdirhash_getprev(dp,
offset, dirblksiz);
if (prevoff == -1) {
brelse(bp, 0);
return (EJUSTRETURN);
}
} else
prevoff = offset;
*prevoffp = prevoff;
}
/* Check for sequential access, and update offset. */
if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
dh->dh_seqopt = 1;
dh->dh_seqoff = offset + UFS_DIRSIZ(0, dp, needswap);
DIRHASH_UNLOCK(dh);
*bpp = bp;
*offp = offset;
return (0);
}
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
if (bp != NULL)
brelse(bp, 0);
ufsdirhash_free(ip);
return (EJUSTRETURN);
}
/*
* When the name doesn't match in the seqopt case, go back
* and search normally.
*/
if (dh->dh_seqopt) {
dh->dh_seqopt = 0;
goto restart;
}
}
DIRHASH_UNLOCK(dh);
if (bp != NULL)
brelse(bp, 0);
return (ENOENT);
}
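/*
* Sketch of how a caller might consume the three outcomes documented above;
* the linear scan stands in for whatever fallback path the caller already
* has:
*
*	error = ufsdirhash_lookup(ip, name, namelen, &offset, &bp, NULL);
*	if (error == 0)
*		... use offset and bp, then brelse(bp, 0) ...
*	else if (error == ENOENT)
*		... the entry definitely does not exist ...
*	else
*		... EJUSTRETURN: fall back to a linear directory scan ...
*/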
/*
* Find a directory block with room for 'slotneeded' bytes. Returns
* the offset of the directory entry that begins the free space.
* This will either be the offset of an existing entry that has free
* space at the end, or the offset of an entry with d_ino == 0 at
* the start of a UFS_DIRBLKSIZ block.
*
* To use the space, the caller may need to compact existing entries in
* the directory. The total number of bytes in all of the entries involved
* in the compaction is stored in *slotsize. In other words, all of
* the entries that must be compacted are exactly contained in the
* region beginning at the returned offset and spanning *slotsize bytes.
*
* Returns -1 if no space was found, indicating that the directory
* must be extended.
*/
doff_t
ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
{
struct direct *dp;
struct dirhash *dh;
struct buf *bp;
doff_t pos, slotstart;
int dirblock, error, freebytes, i;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (-1);
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (-1);
}
/* Find a directory block with the desired free space. */
dirblock = -1;
for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
if ((dirblock = dh->dh_firstfree[i]) != -1)
break;
if (dirblock == -1) {
DIRHASH_UNLOCK(dh);
return (-1);
}
KASSERT(dirblock < dh->dh_nblk &&
dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN));
pos = dirblock * dirblksiz;
error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false);
if (error) {
DIRHASH_UNLOCK(dh);
return (-1);
}
/* Find the first entry with free space. */
for (i = 0; i < dirblksiz; ) {
if (dp->d_reclen == 0) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
if (dp->d_ino == 0 || dp->d_reclen > UFS_DIRSIZ(0, dp, needswap))
break;
i += dp->d_reclen;
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
if (i > dirblksiz) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
slotstart = pos + i;
/* Find the range of entries needed to get enough space */
freebytes = 0;
while (i < dirblksiz && freebytes < slotneeded) {
freebytes += dp->d_reclen;
if (dp->d_ino != 0)
freebytes -= UFS_DIRSIZ(0, dp, needswap);
if (dp->d_reclen == 0) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
i += dp->d_reclen;
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
if (i > dirblksiz) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
if (freebytes < slotneeded)
panic("ufsdirhash_findfree: free mismatch");
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
*slotsize = pos + i - slotstart;
return (slotstart);
}
/*
* Return the start of the unused space at the end of a directory, or
* -1 if there are no trailing unused blocks.
*/
doff_t
ufsdirhash_enduseful(struct inode *ip)
{
struct dirhash *dh;
int i;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (-1);
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (-1);
}
if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) {
DIRHASH_UNLOCK(dh);
return (-1);
}
for (i = dh->dh_dirblks - 1; i >= 0; i--)
if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
break;
DIRHASH_UNLOCK(dh);
return ((doff_t)(i + 1) * dirblksiz);
}
/*
* Insert information into the hash about a new directory entry. dirp
* points to a struct direct containing the entry, and offset specifies
* the offset of this entry.
*/
void
ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
{
struct dirhash *dh;
int slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset < dh->dh_dirblks * dirblksiz);
/*
* Normal hash usage is < 66%. If the usage gets too high then
* remove the hash entirely and let it be rebuilt later.
*/
if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
/* Find a free hash slot (empty or deleted), and add the entry. */
slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
while (DH_ENTRY(dh, slot) >= 0)
slot = WRAPINCR(slot, dh->dh_hlen);
if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
dh->dh_hused++;
DH_ENTRY(dh, slot) = offset;
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, -UFS_DIRSIZ(0, dirp, needswap), dirblksiz);
DIRHASH_UNLOCK(dh);
}
/*
* Remove the specified directory entry from the hash. The entry to remove
* is defined by the name in `dirp', which must exist at the specified
* `offset' within the directory.
*/
void
ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
{
struct dirhash *dh;
int slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset < dh->dh_dirblks * dirblksiz);
/* Find the entry */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
/* Remove the hash entry. */
ufsdirhash_delslot(dh, slot);
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, UFS_DIRSIZ(0, dirp, needswap), dirblksiz);
DIRHASH_UNLOCK(dh);
}
/*
* Change the offset associated with a directory entry in the hash. Used
* when compacting directory blocks.
*/
void
ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
doff_t newoff)
{
struct dirhash *dh;
int slot;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz &&
newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz);
/* Find the entry, and update the offset. */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
DH_ENTRY(dh, slot) = newoff;
DIRHASH_UNLOCK(dh);
}
/*
* Inform dirhash that the directory has grown by one block that
* begins at offset (i.e. the new length is offset + UFS_DIRBLKSIZ).
*/
void
ufsdirhash_newblk(struct inode *ip, doff_t offset)
{
struct dirhash *dh;
int block;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset == dh->dh_dirblks * dirblksiz);
block = offset / dirblksiz;
if (block >= dh->dh_nblk) {
/* Out of space; must rebuild. */
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
dh->dh_dirblks = block + 1;
/* Account for the new free block. */
dh->dh_blkfree[block] = dirblksiz / DIRALIGN;
if (dh->dh_firstfree[DH_NFSTATS] == -1)
dh->dh_firstfree[DH_NFSTATS] = block;
DIRHASH_UNLOCK(dh);
}
/*
* Inform dirhash that the directory is being truncated.
*/
void
ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
{
struct dirhash *dh;
int block, i;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset <= dh->dh_dirblks * dirblksiz);
block = howmany(offset, dirblksiz);
/*
* If the directory shrinks to less than 1/8 of dh_nblk blocks
* (about 20% of its original size due to the 50% extra added in
* ufsdirhash_build) then free it, and let the caller rebuild
* if necessary.
*/
if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
/*
* Remove any `first free' information pertaining to the
* truncated blocks. All blocks we're removing should be
* completely unused.
*/
if (dh->dh_firstfree[DH_NFSTATS] >= block)
dh->dh_firstfree[DH_NFSTATS] = -1;
for (i = block; i < dh->dh_dirblks; i++)
if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
panic("ufsdirhash_dirtrunc: blocks in use");
for (i = 0; i < DH_NFSTATS; i++)
if (dh->dh_firstfree[i] >= block)
panic("ufsdirhash_dirtrunc: first free corrupt");
dh->dh_dirblks = block;
DIRHASH_UNLOCK(dh);
}
/*
* Debugging function to check that the dirhash information about
* a directory block matches its actual contents. Panics if a mismatch
* is detected.
*
* On entry, `sbuf' should point to the start of an in-core
* DIRBLKSIZ-sized directory block, and `offset' should contain the
* offset from the start of the directory of that block.
*/
void
ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset)
{
struct dirhash *dh;
struct direct *dp;
int block, ffslot, i, nfree;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if (!ufs_dirhashcheck)
return;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
block = offset / dirblksiz;
if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks)
panic("ufsdirhash_checkblock: bad offset");
nfree = 0;
for (i = 0; i < dirblksiz; i += dp->d_reclen) {
dp = (struct direct *)(sbuf + i);
if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz)
panic("ufsdirhash_checkblock: bad dir");
if (dp->d_ino == 0) {
#if 0
/*
* XXX entries with d_ino == 0 should only occur
* at the start of a DIRBLKSIZ block. However the
* ufs code is tolerant of such entries at other
* offsets, and fsck does not fix them.
*/
if (i != 0)
panic("ufsdirhash_checkblock: bad dir inode");
#endif
nfree += dp->d_reclen;
continue;
}
/* Check that the entry exists (will panic if it doesn't). */
ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
nfree += dp->d_reclen - UFS_DIRSIZ(0, dp, needswap);
}
if (i != dirblksiz)
panic("ufsdirhash_checkblock: bad dir end");
if (dh->dh_blkfree[block] * DIRALIGN != nfree)
panic("ufsdirhash_checkblock: bad free count");
ffslot = BLKFREE2IDX(nfree / DIRALIGN);
for (i = 0; i <= DH_NFSTATS; i++)
if (dh->dh_firstfree[i] == block && i != ffslot)
panic("ufsdirhash_checkblock: bad first-free");
if (dh->dh_firstfree[ffslot] == -1)
panic("ufsdirhash_checkblock: missing first-free entry");
DIRHASH_UNLOCK(dh);
}
/*
* Hash the specified filename into a dirhash slot.
*/
static int
ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen)
{
u_int32_t hash;
/*
* We hash the name and then some other bit of data that is
* invariant over the dirhash's lifetime. Otherwise names
* differing only in the last byte are placed close to one
* another in the table, which is bad for linear probing.
*/
hash = hash32_buf(name, namelen, HASH32_BUF_INIT);
hash = hash32_buf(&dh, sizeof(dh), hash);
return (hash % dh->dh_hlen);
}
/*
* Adjust the number of free bytes in the block containing `offset'
* by the value specified by `diff'.
*
* The caller must ensure we have exclusive access to `dh'; normally
* that means that dh_lock should be held, but this is also called
* from ufsdirhash_build() where exclusive access can be assumed.
*/
static void
ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz)
{
int block, i, nfidx, ofidx;
KASSERT(mutex_owned(&dh->dh_lock));
/* Update the per-block summary info. */
block = offset / dirblksiz;
KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks);
ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
/* Update the `first free' list if necessary. */
if (ofidx != nfidx) {
/* If removing, scan forward for the next block. */
if (dh->dh_firstfree[ofidx] == block) {
for (i = block + 1; i < dh->dh_dirblks; i++)
if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
break;
dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
}
/* Make this the new `first free' if necessary */
if (dh->dh_firstfree[nfidx] > block ||
dh->dh_firstfree[nfidx] == -1)
dh->dh_firstfree[nfidx] = block;
}
}
/*
* Find the specified name which should have the specified offset.
* Returns a slot number, and panics on failure.
*
* `dh' must be locked on entry and remains so on return.
*/
static int
ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen,
doff_t offset)
{
int slot;
KASSERT(mutex_owned(&dh->dh_lock));
/* Find the entry. */
KASSERT(dh->dh_hused < dh->dh_hlen);
slot = ufsdirhash_hash(dh, name, namelen);
while (DH_ENTRY(dh, slot) != offset &&
DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
slot = WRAPINCR(slot, dh->dh_hlen);
if (DH_ENTRY(dh, slot) != offset)
panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
return (slot);
}
/*
* Remove the entry corresponding to the specified slot from the hash array.
*
* `dh' must be locked on entry and remains so on return.
*/
static void
ufsdirhash_delslot(struct dirhash *dh, int slot)
{
int i;
KASSERT(mutex_owned(&dh->dh_lock));
/* Mark the entry as deleted. */
DH_ENTRY(dh, slot) = DIRHASH_DEL;
/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
i = WRAPINCR(i, dh->dh_hlen);
if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
i = WRAPDECR(i, dh->dh_hlen);
while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
DH_ENTRY(dh, i) = DIRHASH_EMPTY;
dh->dh_hused--;
i = WRAPDECR(i, dh->dh_hlen);
}
KASSERT(dh->dh_hused >= 0);
}
}
/*
* Given a directory entry and its offset, find the offset of the
* previous entry in the same UFS_DIRBLKSIZ-sized block. Returns an
* offset, or -1 if there is no previous entry in the block or some
* other problem occurred.
*/
static doff_t
ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz)
{
struct direct *dp;
char *blkbuf;
doff_t blkoff, prevoff;
int entrypos, i;
blkoff = offset & ~(dirblksiz - 1); /* offset of start of block */
entrypos = offset & (dirblksiz - 1); /* entry relative to block */
blkbuf = (char *)dirp - entrypos;
prevoff = blkoff;
/* If `offset' is the start of a block, there is no previous entry. */
if (entrypos == 0)
return (-1);
/* Scan from the start of the block until we get to the entry. */
for (i = 0; i < entrypos; i += dp->d_reclen) {
dp = (struct direct *)(blkbuf + i);
if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
return (-1); /* Corrupted directory. */
prevoff = blkoff + i;
}
return (prevoff);
}
/*
* Try to free up `wanted' bytes by stealing memory from existing
* dirhashes. Returns zero with list locked if successful.
*/
static int
ufsdirhash_recycle(int wanted)
{
struct dirhash *dh;
doff_t **hash;
u_int8_t *blkfree;
int i, mem, narrays;
size_t hashsz, blkfreesz;
DIRHASHLIST_LOCK();
while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
/* Find a dirhash, and lock it. */
if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
DIRHASHLIST_UNLOCK();
return (-1);
}
DIRHASH_LOCK(dh);
KASSERT(dh->dh_hash != NULL);
/* Decrement the score; only recycle if it becomes zero. */
if (--dh->dh_score > 0) {
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
return (-1);
}
/* Remove it from the list and detach its memory. */
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 0;
hash = dh->dh_hash;
hashsz = dh->dh_hashsz;
dh->dh_hash = NULL;
blkfree = dh->dh_blkfree;
blkfreesz = dh->dh_blkfreesz;
dh->dh_blkfree = NULL;
narrays = dh->dh_narrays;
mem = narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
dh->dh_nblk * sizeof(*dh->dh_blkfree);
/* Unlock everything, free the detached memory. */
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
for (i = 0; i < narrays; i++)
DIRHASH_BLKFREE(hash[i]);
kmem_free(hash, hashsz);
kmem_free(blkfree, blkfreesz);
/* Account for the returned memory, and repeat if necessary. */
DIRHASHLIST_LOCK();
atomic_add_int(&ufs_dirhashmem, -mem);
}
/* Success. */
return (0);
}
SYSCTL_SETUP(ufsdirhash_sysctl_init, "ufs_dirhash sysctl")
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ufs",
SYSCTL_DESCR("ufs"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "dirhash",
SYSCTL_DESCR("dirhash"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "minblocks",
SYSCTL_DESCR("minimum hashed directory size in blocks"),
NULL, 0, &ufs_dirhashminblks, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxmem",
SYSCTL_DESCR("maximum dirhash memory usage"),
NULL, 0, &ufs_dirhashmaxmem, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "memused",
SYSCTL_DESCR("current dirhash memory usage"),
NULL, 0, &ufs_dirhashmem, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "docheck",
SYSCTL_DESCR("enable extra sanity checks"),
NULL, 0, &ufs_dirhashcheck, 0,
CTL_CREATE, CTL_EOL);
}
void
ufsdirhash_init(void)
{
/*
* Only initialise the default dirhash size if it hasn't
* already been set.
*/
if (ufs_dirhashmaxmem == 0) {
/* Use 64-bit math to avoid overflows. */
uint64_t physmem_bytes, hash_bytes;
physmem_bytes = ctob((uint64_t)physmem);
hash_bytes = physmem_bytes / DIRHASH_DEFAULT_DIVIDER;
if (hash_bytes < MIN_DEFAULT_DIRHASH_MEM)
hash_bytes = 0;
if (hash_bytes > MAX_DEFAULT_DIRHASH_MEM)
hash_bytes = MAX_DEFAULT_DIRHASH_MEM;
ufs_dirhashmaxmem = (u_int)hash_bytes;
}
mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE);
ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0,
0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL);
ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0,
0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL);
TAILQ_INIT(&ufsdirhash_list);
}
void
ufsdirhash_done(void)
{
KASSERT(TAILQ_EMPTY(&ufsdirhash_list));
pool_cache_destroy(ufsdirhashblk_cache);
pool_cache_destroy(ufsdirhash_cache);
mutex_destroy(&ufsdirhash_lock);
}
/* $NetBSD: layer_vnops.c,v 1.72 2021/10/20 03:08:18 thorpej Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* John Heidemann of the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)null_vnops.c 8.6 (Berkeley) 5/27/95
*
* Ancestors:
* @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92
* Id: lofs_vnops.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp
* ...and...
* @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
*/
/*
* Generic layer vnode operations.
*
* The layer.h, layer_extern.h, layer_vfs.c, and layer_vnops.c files provide
* the core implementation of stacked file-systems.
*
* The layerfs duplicates a portion of the file system name space under
* a new name. In this respect, it is similar to the loopback file system.
* It differs from the loopback fs in two respects: it is implemented using
* a stackable layers technique, and its "layerfs-nodes" stack above all
* lower-layer vnodes, not just over directory vnodes.
*
* OPERATION OF LAYERFS
*
* The layerfs is the minimum file system layer, bypassing all possible
* operations to the lower layer for processing there. The majority of its
* activity centers on the bypass routine, through which nearly all vnode
* operations pass.
*
* The bypass routine accepts arbitrary vnode operations for handling by
* the lower layer. It begins by examining vnode operation arguments and
* replacing any layered nodes by their lower-layer equivalents. It then
* invokes an operation on the lower layer. Finally, it replaces the
* layered nodes in the arguments and, if a vnode is returned by the
* operation, stacks a layered node on top of the returned vnode.
*
* The bypass routine in this file, layer_bypass(), is suitable for use
* by many different layered filesystems. It can be used by multiple
* filesystems simultaneously. Alternatively, a layered fs may provide
* its own bypass routine, in which case layer_bypass() should be used as
* a model. For instance, the main functionality provided by umapfs, the user
* identity mapping file system, is handled by a custom bypass routine.
*
* Typically a layered fs registers its selected bypass routine as the
* default vnode operation in its vnodeopv_entry_desc table. Additionally
* the filesystem must store the bypass entry point in the layerm_bypass
* field of struct layer_mount. All other layer routines in this file will
* use the layerm_bypass() routine.
*
* Although the bypass routine handles most operations outright, a number
* of operations are special cased and handled by the layerfs. For instance,
* layer_getattr() must change the fsid being returned, while layer_lock()
* and layer_unlock() must handle any locking for the current vnode as well
* as pass the lock request down. layer_inactive() and layer_reclaim() are
* not bypassed so that they can handle freeing layerfs-specific data. Also,
* certain vnode operations (create, mknod, remove, link, rename, mkdir,
* rmdir, and symlink) change the locking state within the operation. Ideally
* these operations should not change the lock state, but should be changed
* to let the caller of the function unlock them. Otherwise, all intermediate
* vnode layers (such as union, umapfs, etc) must catch these functions to do
* the necessary locking at their layer.
*
* INSTANTIATING VNODE STACKS
*
* Mounting associates the "layerfs-nodes" stack with a lower layer, in effect
* stacking two VFSes. The initial mount creates a single vnode stack for
* the root of the new layerfs. All other vnode stacks are created as a
* result of vnode operations on this or other layerfs vnode stacks.
*
* New vnode stacks come into existence as a result of an operation which
* returns a vnode. The bypass routine stacks a layerfs-node above the new
* vnode before returning it to the caller.
*
* For example, imagine mounting a null layer with:
*
* "mount_null /usr/include /dev/layer/null"
*
* Changing directory to /dev/layer/null will assign the root layerfs-node
* (which was created when the null layer was mounted). Now consider opening
* "sys". A layer_lookup() would be performed on the root layerfs-node.
* This operation would bypass through to the lower layer which would return
* a vnode representing the UFS "sys". Then, layer_bypass() builds a
* layerfs-node aliasing the UFS "sys" and returns this to the caller.
* Later operations on the layerfs-node "sys" will repeat this process when
* constructing other vnode stacks.
*
* INVOKING OPERATIONS ON LOWER LAYERS
*
* There are two techniques to invoke operations on a lower layer when the
* operation cannot be completely bypassed. Each method is appropriate in
* different situations. In both cases, it is the responsibility of the
* aliasing layer to make the operation arguments "correct" for the lower
* layer by mapping any vnode arguments to the lower layer.
*
* The first approach is to call the aliasing layer's bypass routine. This
* method is most suitable when you wish to invoke the operation currently
* being handled on the lower layer. It has the advantage that the bypass
* routine already must do argument mapping. An example of this is
* layer_getattr().
*
* A second approach is to directly invoke vnode operations on the lower
* layer with the VOP_OPERATIONNAME interface. The advantage of this method
* is that it is easy to invoke arbitrary operations on the lower layer.
* The disadvantage is that the vnode arguments must be manually mapped.
*/
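/*
 * Illustrative sketch only (not part of the original file): how a
 * hypothetical layered filesystem, here called "examplefs", might register
 * layer_bypass() as its default vnode operation and special-case only the
 * handful of operations described above.  The "examplefs_*" names are
 * assumptions for the example; real layered filesystems (nullfs, umapfs,
 * overlayfs) follow the same pattern with their own names.
 */
#if 0
int (**examplefs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
	{ &vop_default_desc, layer_bypass },	/* everything else goes down */
	{ &vop_getattr_desc, layer_getattr },	/* must rewrite the fsid */
	{ &vop_lookup_desc, layer_lookup },	/* locking protocol */
	{ &vop_inactive_desc, layer_inactive },	/* layerfs-specific cleanup */
	{ &vop_reclaim_desc, layer_reclaim },
	{ NULL, NULL }
};
const struct vnodeopv_desc examplefs_vnodeop_opv_desc =
	{ &examplefs_vnodeop_p, examplefs_vnodeop_entries };
/* The mount code would also store layer_bypass in layerm_bypass. */
#endif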
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_vnops.c,v 1.72 2021/10/20 03:08:18 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/kauth.h>
#include <sys/fcntl.h>
#include <sys/fstrans.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
/*
* This is the 08-June-99 bypass routine, based on the 10-Apr-92 bypass
* routine by John Heidemann.
* The new element for this version is that the whole nullfs
* system gained the concept of locks on the lower node.
* The 10-Apr-92 version was optimized for speed, throwing away some
* safety checks. It should still always work, but it's not as
* robust to programmer errors.
*
* In general, we map all vnodes going down and unmap them on the way back.
*
* Also, some BSD vnode operations have the side effect of vrele'ing
* their arguments. With stacking, the reference counts are held
* by the upper node, not the lower one, so we must handle these
* side-effects here. This is not of concern in Sun-derived systems
* since there are no such side-effects.
*
* New for the 08-June-99 version: we also handle operations which unlock
* the passed-in node (typically they vput the node).
*
* This makes the following assumptions:
* - only one returned vpp
* - no INOUT vpp's (Sun's vop_open has one of these)
* - the vnode operation vector of the first vnode should be used
* to determine what implementation of the op should be invoked
* - all mapped vnodes are of our vnode-type (NEEDSWORK:
* problems on rmdir'ing mount points and renaming?)
*/
int
layer_bypass(void *v)
{
struct vop_generic_args /* {
struct vnodeop_desc *a_desc;
<other random data follows, presumably>
} */ *ap = v;
int (**our_vnodeop_p)(void *);
struct vnode **this_vp_p;
int error;
struct vnode *old_vps[VDESC_MAX_VPS], *vp0;
struct vnode **vps_p[VDESC_MAX_VPS];
struct vnode ***vppp;
struct mount *mp;
struct vnodeop_desc *descp = ap->a_desc;
int reles, i, flags;
#ifdef DIAGNOSTIC
/*
* We require at least one vp.
*/
if (descp->vdesc_vp_offsets == NULL ||
descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
panic("%s: no vp's in map.\n", __func__);
#endif
vps_p[0] =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
vp0 = *vps_p[0];
mp = vp0->v_mount;
flags = MOUNTTOLAYERMOUNT(mp)->layerm_flags;
our_vnodeop_p = vp0->v_op;
if (flags & LAYERFS_MBYPASSDEBUG)
printf("%s: %s\n", __func__, descp->vdesc_name);
/*
* Map the vnodes going in.
* Later, we'll invoke the operation based on
* the first mapped vnode's operation vector.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
vps_p[i] = this_vp_p =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i],
ap);
/*
* We're not guaranteed that any but the first vnode
* are of our type. Check for and don't map any
* that aren't. (We must always map first vp or vclean fails.)
*/
if (i && (*this_vp_p == NULL ||
(*this_vp_p)->v_op != our_vnodeop_p)) {
old_vps[i] = NULL;
} else {
old_vps[i] = *this_vp_p;
*(vps_p[i]) = LAYERVPTOLOWERVP(*this_vp_p);
/*
* XXX - Several operations have the side effect
* of vrele'ing their vp's. We must account for
* that. (This should go away in the future.)
*/
if (reles & VDESC_VP0_WILLRELE)
vref(*this_vp_p);
}
}
/*
* Call the operation on the lower layer
* with the modified argument structure.
*/
error = VCALL(*vps_p[0], descp->vdesc_offset, ap);
/*
* Maintain the illusion of call-by-value
* by restoring vnodes in the argument structure
* to their original value.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
if (old_vps[i]) {
*(vps_p[i]) = old_vps[i];
if (reles & VDESC_VP0_WILLRELE)
vrele(*(vps_p[i]));
}
}
/*
* Map the possible out-going vpp
* (Assumes that the lower layer always returns
* a VREF'ed vpp unless it gets an error.)
*/
if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !error) {
vppp = VOPARG_OFFSETTO(struct vnode***,
descp->vdesc_vpp_offset, ap);
/*
* Only vop_lookup, vop_create, vop_mkdir, vop_mknod
* and vop_symlink return vpp's. vop_lookup doesn't call bypass
* as a lookup on "." would generate a locking error.
* So all the calls which get us here have an unlocked vpp. :-)
*/
error = layer_node_create(mp, **vppp, *vppp);
if (error) {
vrele(**vppp);
**vppp = NULL;
}
}
return error;
}
/*
* We have to carry on the locking protocol on the layer vnodes
* as we progress through the tree. We also have to enforce read-only
* if this layer is mounted read-only.
*/
int
layer_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnodeop_desc *a_desc;
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
struct vnode *dvp, *lvp, *ldvp;
int error, flags = cnp->cn_flags;
dvp = ap->a_dvp;
if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
*ap->a_vpp = NULL;
return EROFS;
}
ldvp = LAYERVPTOLOWERVP(dvp);
ap->a_dvp = ldvp;
error = VCALL(ldvp, ap->a_desc->vdesc_offset, ap);
lvp = *ap->a_vpp;
*ap->a_vpp = NULL;
if (error == EJUSTRETURN && (flags & ISLASTCN) &&
(dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
error = EROFS;
/*
* We must do the same locking and unlocking at this layer as
* is done in the layers below us.
*/
if (ldvp == lvp) {
/*
* Got the same object back, because we looked up ".",
* or ".." in the root node of a mount point.
* So we make another reference to dvp and return it.
*/
vref(dvp);
*ap->a_vpp = dvp;
vrele(lvp);
} else if (lvp != NULL) {
/* Note: dvp and ldvp are both locked. */
KASSERT(error != ENOLCK);
error = layer_node_create(dvp->v_mount, lvp, ap->a_vpp);
if (error) {
vrele(lvp);
}
}
return error;
}
/*
* Setattr call. Disallow write attempts if the layer is mounted read-only.
*/
int
layer_setattr(void *v)
{
struct vop_setattr_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vattr *vap = ap->a_vap;
if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
(vp->v_mount->mnt_flag & MNT_RDONLY))
return EROFS;
if (vap->va_size != VNOVAL) {
switch (vp->v_type) {
case VDIR:
return EISDIR;
case VCHR:
case VBLK:
case VSOCK:
case VFIFO:
return 0;
case VREG:
case VLNK:
default:
/*
* Disallow write attempts if the filesystem is
* mounted read-only.
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
}
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* We handle getattr only to change the fsid.
*/
int
layer_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
int error;
error = LAYERFS_DO_BYPASS(vp, ap);
if (error) {
return error;
}
/* Requires that arguments be restored. */
ap->a_vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
return 0;
}
int
layer_access(void *v)
{
struct vop_access_args /* {
struct vnode *a_vp;
accmode_t a_accmode;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
accmode_t accmode = ap->a_accmode;
/*
* Disallow write attempts on read-only layers;
* unless the file is a socket, fifo, or a block or
* character device resident on the file system.
*/
if (accmode & VWRITE) {
switch (vp->v_type) {
case VDIR:
case VLNK:
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
break;
default:
break;
}
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* We must handle open to be able to catch MNT_NODEV and friends
* and increment the lower v_writecount.
*/
int
layer_open(void *v)
{
struct vop_open_args /* {
const struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
int error;
if (((lvp->v_type == VBLK) || (lvp->v_type == VCHR)) &&
(vp->v_mount->mnt_flag & MNT_NODEV))
return ENXIO;
error = LAYERFS_DO_BYPASS(vp, ap);
if (error == 0 && (ap->a_mode & FWRITE)) {
mutex_enter(lvp->v_interlock);
lvp->v_writecount++;
mutex_exit(lvp->v_interlock);
}
return error;
}
/*
* We must handle close to decrement the lower v_writecount.
*/
int
layer_close(void *v)
{
struct vop_close_args /* {
const struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
if ((ap->a_fflag & FWRITE)) {
mutex_enter(lvp->v_interlock);
KASSERT(lvp->v_writecount > 0);
lvp->v_writecount--;
mutex_exit(lvp->v_interlock);
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* If vinvalbuf is calling us, it's a "shallow fsync" -- don't bother
* syncing the underlying vnodes, since they'll be fsync'ed when
* reclaimed; otherwise, pass it through to the underlying layer.
*
* XXX Do we still need to worry about shallow fsync?
*/
int
layer_fsync(void *v)
{
struct vop_fsync_args /* {
struct vnode *a_vp;
kauth_cred_t a_cred;
int a_flags;
off_t offlo;
off_t offhi;
struct lwp *a_l;
} */ *ap = v;
int error;
if (ap->a_flags & FSYNC_RECLAIM) {
return 0;
}
if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) {
error = spec_fsync(v);
if (error)
return error;
}
return LAYERFS_DO_BYPASS(ap->a_vp, ap);
}
int
layer_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
/*
* If we did a remove, don't cache the node.
*/
*ap->a_recycle = ((VTOLAYER(vp)->layer_flags & LAYERFS_REMOVED) != 0);
/*
* Do nothing (and _don't_ bypass).
* Wait to vrele lowervp until reclaim,
* so that until then our layer_node is in the
* cache and reusable.
*
* NEEDSWORK: Someday, consider inactive'ing
* the lowervp and then trying to reactivate it
* with capabilities (v_id)
* like they do in the name lookup cache code.
* That's too much work for now.
*/
return 0;
}
int
layer_remove(void *v)
{
struct vop_remove_v3_args /* {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
nlink_t ctx_vp_new_nlink;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
int error;
vref(vp);
error = LAYERFS_DO_BYPASS(vp, ap);
if (error == 0) {
VTOLAYER(vp)->layer_flags |= LAYERFS_REMOVED;
}
vrele(vp);
return error;
}
int
layer_rename(void *v)
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
struct vnode *fdvp = ap->a_fdvp, *tvp;
int error;
tvp = ap->a_tvp;
if (tvp) {
if (tvp->v_mount != fdvp->v_mount)
tvp = NULL;
else
vref(tvp);
}
error = LAYERFS_DO_BYPASS(fdvp, ap);
if (tvp) {
if (error == 0)
VTOLAYER(tvp)->layer_flags |= LAYERFS_REMOVED;
vrele(tvp);
}
return error;
}
int
layer_rmdir(void *v)
{
struct vop_rmdir_v2_args /* {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
} */ *ap = v;
int error;
struct vnode *vp = ap->a_vp;
vref(vp);
error = LAYERFS_DO_BYPASS(vp, ap);
if (error == 0) {
VTOLAYER(vp)->layer_flags |= LAYERFS_REMOVED;
}
vrele(vp);
return error;
}
int
layer_revoke(void *v)
{
struct vop_revoke_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
int error;
/*
* We will most likely end up in vclean which uses the usecount
* to determine if a vnode is active. Take an extra reference on
* the lower vnode so it will always close and inactivate.
*/
vref(lvp);
error = LAYERFS_DO_BYPASS(vp, ap);
vrele(lvp);
return error;
}
int
layer_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct layer_mount *lmp = MOUNTTOLAYERMOUNT(vp->v_mount);
struct layer_node *xp = VTOLAYER(vp);
struct vnode *lowervp = xp->layer_lowervp;
VOP_UNLOCK(vp);
/*
* Note: in vop_reclaim, the node's struct lock has been
* decommissioned, so we have to be careful about calling
* VOP's on ourself. We must be careful as VXLOCK is set.
*/
if (vp == lmp->layerm_rootvp) {
/*
* Oops! We no longer have a root node. Most likely reason is
* that someone forcibly unmounted the underlying fs.
*
* Now getting the root vnode will fail. We're dead. :-(
*/
lmp->layerm_rootvp = NULL;
}
mutex_enter(vp->v_interlock);
KASSERT(vp->v_interlock == lowervp->v_interlock);
lowervp->v_writecount -= vp->v_writecount;
mutex_exit(vp->v_interlock);
/* After this assignment, this node will not be re-used. */
xp->layer_lowervp = NULL;
kmem_free(vp->v_data, lmp->layerm_size);
vp->v_data = NULL;
vrele(lowervp);
return 0;
}
/*
* We just feed the returned vnode up to the caller - there's no need
* to build a layer node on top of the node on which we're going to do
* i/o. :-)
*/
int
layer_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
struct vnode *vp;
vp = LAYERVPTOLOWERVP(ap->a_vp);
ap->a_vp = vp;
return VCALL(vp, ap->a_desc->vdesc_offset, ap);
}
int
layer_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
printf ("\ttag VT_LAYERFS, vp=%p, lowervp=%p\n", vp, LAYERVPTOLOWERVP(vp));
return 0;
}
int
layer_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct mount *mp = vp->v_mount;
int error;
krw_t op;
KASSERT(rw_lock_held(vp->v_uobj.vmobjlock));
if (ap->a_flags & PGO_LOCKED) {
return EBUSY;
}
ap->a_vp = LAYERVPTOLOWERVP(vp);
KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock);
/* Just pass the request on to the underlying layer. */
op = rw_lock_op(vp->v_uobj.vmobjlock);
rw_exit(vp->v_uobj.vmobjlock);
fstrans_start(mp);
rw_enter(vp->v_uobj.vmobjlock, op);
if (mp == vp->v_mount) {
/* Will release the lock. */
error = VCALL(ap->a_vp, VOFFSET(vop_getpages), ap);
} else {
rw_exit(vp->v_uobj.vmobjlock);
error = ENOENT;
}
fstrans_done(mp);
return error;
}
int
layer_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(rw_write_held(vp->v_uobj.vmobjlock));
ap->a_vp = LAYERVPTOLOWERVP(vp);
KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock);
if (ap->a_flags & PGO_RECLAIM) {
rw_exit(vp->v_uobj.vmobjlock);
return 0;
}
/* Just pass the request on to the underlying layer. */
return VCALL(ap->a_vp, VOFFSET(vop_putpages), ap);
}
/* $NetBSD: uvm_vnode.c,v 1.121 2024/04/05 13:05:41 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993
* The Regents of the University of California.
* Copyright (c) 1990 University of Utah.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vnode_pager.c 8.8 (Berkeley) 2/13/94
* from: Id: uvm_vnode.c,v 1.1.2.26 1998/02/02 20:38:07 chuck Exp
*/
/*
* uvm_vnode.c: the vnode pager.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_vnode.c,v 1.121 2024/04/05 13:05:41 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_uvmhist.h"
#endif
#include <sys/atomic.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/conf.h>
#include <sys/pool.h>
#include <sys/mount.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_page_array.h>
#ifdef UVMHIST
UVMHIST_DEFINE(ubchist);
#endif
/*
* functions
*/
static void uvn_alloc_ractx(struct uvm_object *);
static void uvn_detach(struct uvm_object *);
static int uvn_get(struct uvm_object *, voff_t, struct vm_page **, int *,
int, vm_prot_t, int, int);
static void uvn_markdirty(struct uvm_object *);
static int uvn_put(struct uvm_object *, voff_t, voff_t, int);
static void uvn_reference(struct uvm_object *);
static int uvn_findpage(struct uvm_object *, voff_t, struct vm_page **,
unsigned int, struct uvm_page_array *a,
unsigned int);
/*
* master pager structure
*/
const struct uvm_pagerops uvm_vnodeops = {
.pgo_reference = uvn_reference,
.pgo_detach = uvn_detach,
.pgo_get = uvn_get,
.pgo_put = uvn_put,
.pgo_markdirty = uvn_markdirty,
};
/*
* the ops!
*/
/*
* uvn_reference
*
* duplicate a reference to a VM object. Note that the reference
* count must already be at least one (the passed in reference) so
* there is no chance of the uvn being killed or locked out here.
*
* => caller must call with object unlocked.
* => caller must be using the same accessprot as was used at attach time
*/
static void
uvn_reference(struct uvm_object *uobj)
{
vref((struct vnode *)uobj);
}
/*
* uvn_detach
*
* remove a reference to a VM object.
*
* => caller must call with object unlocked and map locked.
*/
static void
uvn_detach(struct uvm_object *uobj)
{
vrele((struct vnode *)uobj);
}
/*
* uvn_put: flush page data to backing store.
*
* => object must be locked on entry! VOP_PUTPAGES must unlock it.
* => flags: PGO_SYNCIO -- use sync. I/O
*/
static int
uvn_put(struct uvm_object *uobj, voff_t offlo, voff_t offhi, int flags)
{
struct vnode *vp = (struct vnode *)uobj;
int error;
KASSERT(rw_write_held(uobj->vmobjlock));
error = VOP_PUTPAGES(vp, offlo, offhi, flags);
return error;
}
/*
* uvn_get: get pages (synchronously) from backing store
*
* => prefer map unlocked (not required)
* => object must be locked! we will _unlock_ it before starting any I/O.
* => flags: PGO_LOCKED: fault data structures are locked
* => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
* => NOTE: caller must check for released pages!!
*/
static int
uvn_get(struct uvm_object *uobj, voff_t offset,
struct vm_page **pps /* IN/OUT */,
int *npagesp /* IN (OUT if PGO_LOCKED)*/,
int centeridx, vm_prot_t access_type, int advice, int flags)
{
struct vnode *vp = (struct vnode *)uobj;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(ubchist, "vp %#jx off %#jx", (uintptr_t)vp, offset,
0, 0);
if (vp->v_type == VREG && (access_type & VM_PROT_WRITE) == 0
&& (flags & PGO_LOCKED) == 0 && vp->v_tag != VT_TMPFS) {
uvn_alloc_ractx(uobj);
uvm_ra_request(vp->v_ractx, advice, uobj, offset,
*npagesp << PAGE_SHIFT);
}
error = VOP_GETPAGES(vp, offset, pps, npagesp, centeridx,
access_type, advice, flags);
if (flags & PGO_LOCKED)
KASSERT(rw_lock_held(uobj->vmobjlock));
return error;
}
/*
* uvn_markdirty: called when the object gains first dirty page
*
* => uobj must be write locked.
*/
static void
uvn_markdirty(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
KASSERT(rw_write_held(uobj->vmobjlock));
mutex_enter(vp->v_interlock);
if ((vp->v_iflag & VI_ONWORKLST) == 0) {
vn_syncer_add_to_worklist(vp, filedelay);
}
mutex_exit(vp->v_interlock);
}
/*
* uvn_findpages:
* return the page for the uobj and offset requested, allocating if needed.
* => uobj must be locked.
* => returned pages will be BUSY.
*/
int
uvn_findpages(struct uvm_object *uobj, voff_t offset, unsigned int *npagesp,
struct vm_page **pgs, struct uvm_page_array *a, unsigned int flags)
{
unsigned int count, found, npages;
int i, rv;
struct uvm_page_array a_store;
if (a == NULL) {
/*
* XXX fragile API
* note that the array can be the one supplied by the caller of
* uvn_findpages. in that case, fillflags used by the caller
* might not match strictly with ours.
* in particular, the caller might have filled the array
* without DENSE but passed us UFP_DIRTYONLY (thus DENSE).
*/
const unsigned int fillflags =
((flags & UFP_BACKWARD) ? UVM_PAGE_ARRAY_FILL_BACKWARD : 0) |
((flags & UFP_DIRTYONLY) ?
(UVM_PAGE_ARRAY_FILL_DIRTY|UVM_PAGE_ARRAY_FILL_DENSE) : 0);
a = &a_store;
uvm_page_array_init(a, uobj, fillflags);
}
count = found = 0;
npages = *npagesp;
if (flags & UFP_BACKWARD) {
for (i = npages - 1; i >= 0; i--, offset -= PAGE_SIZE) {
rv = uvn_findpage(uobj, offset, &pgs[i], flags, a,
i + 1);
if (rv == 0) {
if (flags & UFP_DIRTYONLY)
break;
} else
found++;
count++;
}
} else {
for (i = 0; i < npages; i++, offset += PAGE_SIZE) {
rv = uvn_findpage(uobj, offset, &pgs[i], flags, a,
npages - i);
if (rv == 0) {
if (flags & UFP_DIRTYONLY)
break;
} else
found++;
count++;
}
}
if (a == &a_store) {
uvm_page_array_fini(a);
}
*npagesp = count;
return (found);
}
/*
* uvn_findpage: find a single page
*
* if a suitable page was found, put it in *pgp and return 1.
* otherwise return 0.
*/
static int
uvn_findpage(struct uvm_object *uobj, voff_t offset, struct vm_page **pgp,
unsigned int flags, struct uvm_page_array *a, unsigned int nleft)
{
struct vm_page *pg;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(ubchist, "vp %#jx off %#jx", (uintptr_t)uobj, offset,
0, 0);
/*
* NOBUSY must come with NOWAIT and NOALLOC. if NOBUSY is
* specified, this may be called with a reader lock.
*/
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT((flags & UFP_NOBUSY) == 0 || (flags & UFP_NOWAIT) != 0);
KASSERT((flags & UFP_NOBUSY) == 0 || (flags & UFP_NOALLOC) != 0);
KASSERT((flags & UFP_NOBUSY) != 0 || rw_write_held(uobj->vmobjlock));
if (*pgp != NULL) {
UVMHIST_LOG(ubchist, "dontcare", 0,0,0,0);
goto skip_offset;
}
for (;;) {
/*
* look for an existing page.
*/
pg = uvm_page_array_fill_and_peek(a, offset, nleft);
if (pg != NULL && pg->offset != offset) {
struct vm_page __diagused *tpg;
KASSERT(
((a->ar_flags & UVM_PAGE_ARRAY_FILL_BACKWARD) != 0)
== (pg->offset < offset));
KASSERT((tpg = uvm_pagelookup(uobj, offset)) == NULL ||
((a->ar_flags & UVM_PAGE_ARRAY_FILL_DIRTY) != 0 &&
!uvm_obj_page_dirty_p(tpg)));
pg = NULL;
if ((a->ar_flags & UVM_PAGE_ARRAY_FILL_DENSE) != 0) {
UVMHIST_LOG(ubchist, "dense", 0,0,0,0);
return 0;
}
}
/* nope? allocate one now */
if (pg == NULL) {
if (flags & UFP_NOALLOC) {
UVMHIST_LOG(ubchist, "noalloc", 0,0,0,0);
return 0;
}
pg = uvm_pagealloc(uobj, offset, NULL,
UVM_FLAG_COLORMATCH);
if (pg == NULL) {
if (flags & UFP_NOWAIT) {
UVMHIST_LOG(ubchist, "nowait",0,0,0,0);
return 0;
}
rw_exit(uobj->vmobjlock);
uvm_wait("uvnfp1");
uvm_page_array_clear(a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
UVMHIST_LOG(ubchist, "alloced %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
KASSERTMSG(uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg);
break;
} else if (flags & UFP_NOCACHE) {
UVMHIST_LOG(ubchist, "nocache",0,0,0,0);
goto skip;
}
/* page is there, see if we need to wait on it */
if ((pg->flags & PG_BUSY) != 0) {
if (flags & UFP_NOWAIT) {
UVMHIST_LOG(ubchist, "nowait",0,0,0,0);
goto skip;
}
UVMHIST_LOG(ubchist, "wait %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
uvm_pagewait(pg, uobj->vmobjlock, "uvnfp2");
uvm_page_array_clear(a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
/* skip PG_RDONLY pages if requested */
if ((flags & UFP_NORDONLY) && (pg->flags & PG_RDONLY)) {
UVMHIST_LOG(ubchist, "nordonly",0,0,0,0);
goto skip;
}
/* stop on clean pages if requested */
if (flags & UFP_DIRTYONLY) {
const bool dirty = uvm_pagecheckdirty(pg, false);
if (!dirty) {
UVMHIST_LOG(ubchist, "dirtonly", 0,0,0,0);
return 0;
}
}
/* mark the page BUSY and we're done. */
if ((flags & UFP_NOBUSY) == 0) {
pg->flags |= PG_BUSY;
UVM_PAGE_OWN(pg, "uvn_findpage");
}
UVMHIST_LOG(ubchist, "found %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
uvm_page_array_advance(a);
break;
}
*pgp = pg;
return 1;
skip_offset:
/*
* skip this offset
*/
pg = uvm_page_array_peek(a);
if (pg != NULL) {
if (pg->offset == offset) {
uvm_page_array_advance(a);
} else {
KASSERT((a->ar_flags & UVM_PAGE_ARRAY_FILL_DENSE) == 0);
}
}
return 0;
skip:
/*
* skip this page
*/
KASSERT(pg != NULL);
uvm_page_array_advance(a);
return 0;
}
/*
* uvm_vnp_setsize: grow or shrink a vnode uobj
*
* grow => just update size value
* shrink => toss un-needed pages
*
* => we assume that the caller has a reference of some sort to the
* vnode in question so that it will not be yanked out from under
* us.
*/
void
uvm_vnp_setsize(struct vnode *vp, voff_t newsize)
{
struct uvm_object *uobj = &vp->v_uobj;
voff_t pgend = round_page(newsize);
voff_t oldsize;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
rw_enter(uobj->vmobjlock, RW_WRITER);
UVMHIST_LOG(ubchist, "vp %#jx old %#jx new %#jx",
(uintptr_t)vp, vp->v_size, newsize, 0);
/*
* now check if the size has changed: if we shrink we had better
* toss some pages...
*/
KASSERT(newsize != VSIZENOTSET);
KASSERT(newsize >= 0);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize);
KASSERTMSG((vp->v_size == vp->v_writesize ||
newsize == vp->v_writesize || newsize <= vp->v_size),
"vp=%p v_size=0x%llx v_writesize=0x%llx newsize=0x%llx",
vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
oldsize = vp->v_writesize;
/*
* check whether size shrinks
* if old size hasn't been set, there are no pages to drop
* if there was an integer overflow in pgend, then this is no shrink
*/
if (oldsize > pgend && oldsize != VSIZENOTSET && pgend >= 0) {
(void) uvn_put(uobj, pgend, 0, PGO_FREE | PGO_SYNCIO);
rw_enter(uobj->vmobjlock, RW_WRITER);
}
mutex_enter(vp->v_interlock);
vp->v_size = vp->v_writesize = newsize;
mutex_exit(vp->v_interlock);
rw_exit(uobj->vmobjlock);
}
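#if 0
/*
 * Illustrative only (not part of the original file): a filesystem's
 * truncate/extend path would typically call uvm_vnp_setsize() after
 * committing the new length; "vp" and "new_length" are placeholders.
 */
uvm_vnp_setsize(vp, new_length);
#endif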
void
uvm_vnp_setwritesize(struct vnode *vp, voff_t newsize)
{
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
KASSERT(newsize != VSIZENOTSET);
KASSERT(newsize >= 0);
KASSERT(vp->v_size != VSIZENOTSET);
KASSERT(vp->v_writesize != VSIZENOTSET);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
KASSERTMSG(vp->v_size <= newsize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
mutex_enter(vp->v_interlock);
vp->v_writesize = newsize;
mutex_exit(vp->v_interlock);
rw_exit(vp->v_uobj.vmobjlock);
}
bool
uvn_text_p(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
int iflag;
/*
* v_interlock is not held here, but VI_EXECMAP is only ever changed
* with the vmobjlock held too.
*/
iflag = atomic_load_relaxed(&vp->v_iflag);
return (iflag & VI_EXECMAP) != 0;
}
static void
uvn_alloc_ractx(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
struct uvm_ractx *ra = NULL;
KASSERT(rw_write_held(uobj->vmobjlock));
if (vp->v_type != VREG) {
return;
}
if (vp->v_ractx != NULL) {
return;
}
if (vp->v_ractx == NULL) {
rw_exit(uobj->vmobjlock);
ra = uvm_ra_allocctx();
rw_enter(uobj->vmobjlock, RW_WRITER);
if (ra != NULL && vp->v_ractx == NULL) {
vp->v_ractx = ra;
ra = NULL;
}
}
if (ra != NULL) {
uvm_ra_freectx(ra);
}
}
/* $NetBSD: uvm_km.c,v 1.165 2023/04/09 09:00:56 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_kern.c 8.3 (Berkeley) 1/12/94
* from: Id: uvm_km.c,v 1.1.2.14 1998/02/06 05:19:27 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_km.c: handle kernel memory allocation and management
*/
/*
* overview of kernel memory management:
*
* the kernel virtual address space is mapped by "kernel_map." kernel_map
* starts at VM_MIN_KERNEL_ADDRESS and goes to VM_MAX_KERNEL_ADDRESS.
* note that VM_MIN_KERNEL_ADDRESS is equal to vm_map_min(kernel_map).
*
* the kernel_map has several "submaps." submaps can only appear in
* the kernel_map (user processes can't use them). submaps "take over"
* the management of a sub-range of the kernel's address space. submaps
* are typically allocated at boot time and are never released. kernel
* virtual address space that is mapped by a submap is locked by the
* submap's lock -- not the kernel_map's lock.
*
* thus, the useful feature of submaps is that they allow us to break
* up the locking and protection of the kernel address space into smaller
* chunks.
*
* the vm system has several standard kernel submaps/arenas, including:
* kmem_arena => used for kmem/pool (memoryallocators(9))
* pager_map => used to map "buf" structures into kernel space
* exec_map => used during exec to handle exec args
* etc...
*
* The kmem_arena is a "special submap", as it lives in a fixed map entry
* within the kernel_map and is controlled by vmem(9).
*
* the kernel allocates its private memory out of special uvm_objects whose
* reference count is set to UVM_OBJ_KERN (thus indicating that the objects
* are "special" and never die). all kernel objects should be thought of
* as large, fixed-sized, sparsely populated uvm_objects. each kernel
* object is equal to the size of kernel virtual address space (i.e. the
* value "VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS").
*
* note that just because a kernel object spans the entire kernel virtual
* address space doesn't mean that it has to be mapped into the entire space.
* large chunks of a kernel object's space go unused either because
* that area of kernel VM is unmapped, or there is some other type of
* object mapped into that range (e.g. a vnode). for submap's kernel
* objects, the only part of the object that can ever be populated is the
* offsets that are managed by the submap.
*
* note that the "offset" in a kernel object is always the kernel virtual
* address minus the VM_MIN_KERNEL_ADDRESS (aka vm_map_min(kernel_map)).
* example:
* suppose VM_MIN_KERNEL_ADDRESS is 0xf8000000 and the kernel does a
* uvm_km_alloc(kernel_map, PAGE_SIZE) [allocate 1 wired down page in the
* kernel map]. if uvm_km_alloc returns virtual address 0xf8235000,
* then that means that the page at offset 0x235000 in kernel_object is
* mapped at 0xf8235000.
*
* kernel objects have one other special property: when the kernel virtual
* memory mapping them is unmapped, the backing memory in the object is
* freed right away. this is done with the uvm_km_pgremove() function.
* this has to be done because there is no backing store for kernel pages
* and no need to save them after they are no longer referenced.
*
* Generic arenas:
*
* kmem_arena:
* Main arena controlling the kernel KVA used by other arenas.
*
* kmem_va_arena:
* Implements quantum caching in order to speed up allocations and
* reduce fragmentation. The pool(9), unless created with a custom
* meta-data allocator, and kmem(9) subsystems use this arena.
*
* Arenas for meta-data allocations are used by vmem(9) and pool(9).
* These arenas cannot use quantum cache. However, kmem_va_meta_arena
* compensates for this by importing larger chunks from kmem_arena.
*
* kmem_va_meta_arena:
* Space for meta-data.
*
* kmem_meta_arena:
* Imports from kmem_va_meta_arena. Allocations from this arena are
* backed with the pages.
*
* Arena stacking:
*
* kmem_arena
* kmem_va_arena
* kmem_va_meta_arena
* kmem_meta_arena
*/
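/*
 * Illustrative sketch only (not part of the original file): most kernel
 * code uses these arenas indirectly through kmem(9); a wired allocation
 * like the one below is ultimately satisfied from kmem_va_arena, which in
 * turn imports KVA from kmem_arena.  "struct foo" is a placeholder type.
 */
#if 0
	struct foo *p = kmem_alloc(sizeof(*p), KM_SLEEP);
	/* ... use p ... */
	kmem_free(p, sizeof(*p));
#endif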
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_km.c,v 1.165 2023/04/09 09:00:56 riastradh Exp $");
#include "opt_uvmhist.h"
#include "opt_kmempages.h"
#ifndef NKMEMPAGES
#define NKMEMPAGES 0
#endif
/*
* Defaults for lower and upper-bounds for the kmem_arena page count.
* Can be overridden by kernel config options.
*/
#ifndef NKMEMPAGES_MIN
#define NKMEMPAGES_MIN NKMEMPAGES_MIN_DEFAULT
#endif
#ifndef NKMEMPAGES_MAX
#define NKMEMPAGES_MAX NKMEMPAGES_MAX_DEFAULT
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/vmem.h>
#include <sys/vmem_impl.h>
#include <sys/kmem.h>
#include <sys/msan.h>
#include <uvm/uvm.h>
/*
* global data structures
*/
struct vm_map *kernel_map = NULL;
/*
* local data structures
*/
static struct vm_map kernel_map_store;
static struct vm_map_entry kernel_image_mapent_store;
static struct vm_map_entry kernel_kmem_mapent_store;
size_t nkmempages = 0;
vaddr_t kmembase;
vsize_t kmemsize;
static struct vmem kmem_arena_store;
vmem_t *kmem_arena = NULL;
static struct vmem kmem_va_arena_store;
vmem_t *kmem_va_arena;
/*
* kmeminit_nkmempages: calculate the size of kmem_arena.
*/
void
kmeminit_nkmempages(void)
{
size_t npages;
if (nkmempages != 0) {
/*
* It's already been set (by us being here before);
* bail out now.
*/
return;
}
#if defined(NKMEMPAGES_MAX_UNLIMITED) && !defined(KMSAN)
npages = physmem;
#else
#if defined(KMSAN)
npages = (physmem / 4);
#elif defined(PMAP_MAP_POOLPAGE)
npages = (physmem / 4);
#else
npages = (physmem / 3) * 2;
#endif /* defined(PMAP_MAP_POOLPAGE) */
#if !defined(NKMEMPAGES_MAX_UNLIMITED)
if (npages > NKMEMPAGES_MAX)
npages = NKMEMPAGES_MAX;
#endif
#endif
if (npages < NKMEMPAGES_MIN)
npages = NKMEMPAGES_MIN;
nkmempages = npages;
}
/*
* uvm_km_bootstrap: init kernel maps and objects to reflect reality (i.e.
* KVM already allocated for text, data, bss, and static data structures).
*
* => KVM is defined by VM_MIN_KERNEL_ADDRESS/VM_MAX_KERNEL_ADDRESS.
* we assume that [vmin -> start] has already been allocated and that
* "end" is the end.
*/
void
uvm_km_bootstrap(vaddr_t start, vaddr_t end)
{
bool kmem_arena_small;
vaddr_t base = VM_MIN_KERNEL_ADDRESS;
struct uvm_map_args args;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "start=%#jx end=%#jx", start, end, 0,0);
kmeminit_nkmempages();
kmemsize = (vsize_t)nkmempages * PAGE_SIZE;
kmem_arena_small = kmemsize < 64 * 1024 * 1024;
UVMHIST_LOG(maphist, "kmemsize=%#jx", kmemsize, 0,0,0);
/*
* next, init kernel memory objects.
*/
/* kernel_object: for pageable anonymous kernel memory */
uvm_kernel_object = uao_create(VM_MAX_KERNEL_ADDRESS -
VM_MIN_KERNEL_ADDRESS, UAO_FLAG_KERNOBJ);
/*
* init the map and reserve any kernel space that might already
* have been allocated before installing.
*/
uvm_map_setup(&kernel_map_store, base, end, VM_MAP_PAGEABLE);
kernel_map_store.pmap = pmap_kernel();
if (start != base) {
error = uvm_map_prepare(&kernel_map_store,
base, start - base,
NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args);
if (!error) {
kernel_image_mapent_store.flags =
UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE;
error = uvm_map_enter(&kernel_map_store, &args,
&kernel_image_mapent_store);
}
if (error)
panic(
"uvm_km_bootstrap: could not reserve space for kernel");
kmembase = args.uma_start + args.uma_size;
} else {
kmembase = base;
}
error = uvm_map_prepare(&kernel_map_store,
kmembase, kmemsize,
NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args);
if (!error) {
kernel_kmem_mapent_store.flags =
UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE;
error = uvm_map_enter(&kernel_map_store, &args,
&kernel_kmem_mapent_store);
}
if (error)
panic("uvm_km_bootstrap: could not reserve kernel kmem");
/*
* install!
*/
kernel_map = &kernel_map_store;
pool_subsystem_init();
kmem_arena = vmem_init(&kmem_arena_store, "kmem",
kmembase, kmemsize, PAGE_SIZE, NULL, NULL, NULL,
0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);
#ifdef PMAP_GROWKERNEL
/*
* kmem_arena VA allocations happen independently of uvm_map.
* grow kernel to accommodate the kmem_arena.
*/
if (uvm_maxkaddr < kmembase + kmemsize) {
uvm_maxkaddr = pmap_growkernel(kmembase + kmemsize);
KASSERTMSG(uvm_maxkaddr >= kmembase + kmemsize,
"%#"PRIxVADDR" %#"PRIxVADDR" %#"PRIxVSIZE,
uvm_maxkaddr, kmembase, kmemsize);
}
#endif
vmem_subsystem_init(kmem_arena);
UVMHIST_LOG(maphist, "kmem vmem created (base=%#jx, size=%#jx",
kmembase, kmemsize, 0,0);
kmem_va_arena = vmem_init(&kmem_va_arena_store, "kva",
0, 0, PAGE_SIZE, vmem_alloc, vmem_free, kmem_arena,
(kmem_arena_small ? 4 : VMEM_QCACHE_IDX_MAX) * PAGE_SIZE,
VM_NOSLEEP, IPL_VM);
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
/*
* uvm_km_init: init the kernel maps virtual memory caches
* and start the pool/kmem allocator.
*/
void
uvm_km_init(void)
{
kmem_init();
}
/*
* uvm_km_suballoc: allocate a submap in the kernel map. once a submap
* is allocated all references to that area of VM must go through it. this
* allows the locking of VAs in kernel_map to be broken up into regions.
*
* => if `fixed' is true, *vmin specifies where the region described
* by the submap must start
* => if submap is non NULL we use that as the submap, otherwise we
* alloc a new map
*/
struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *vmin /* IN/OUT */,
vaddr_t *vmax /* OUT */, vsize_t size, int flags, bool fixed,
struct vm_map *submap)
{
int mapflags = UVM_FLAG_NOMERGE | (fixed ? UVM_FLAG_FIXED : 0);
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(vm_map_pmap(map) == pmap_kernel());
size = round_page(size); /* round up to pagesize */
/*
* first allocate a blank spot in the parent map
*/
if (uvm_map(map, vmin, size, NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, mapflags)) != 0) {
panic("%s: unable to allocate space in parent map", __func__);
}
/*
* set VM bounds (vmin is filled in by uvm_map)
*/
*vmax = *vmin + size;
/*
* add references to pmap and create or init the submap
*/
pmap_reference(vm_map_pmap(map));
if (submap == NULL) {
submap = kmem_alloc(sizeof(*submap), KM_SLEEP);
}
uvm_map_setup(submap, *vmin, *vmax, flags);
submap->pmap = vm_map_pmap(map);
/*
* now let uvm_map_submap plug it in...
*/
if (uvm_map_submap(map, *vmin, *vmax, submap) != 0)
panic("uvm_km_suballoc: submap allocation failed");
return(submap);
}
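/*
 * Illustrative sketch only (not part of the original file): a typical
 * boot-time caller creating a submap, in the style of pager_map.  The
 * variable names and PAGER_MAP_SIZE are assumptions for the example.
 */
#if 0
	vaddr_t sva, eva;
	static struct vm_map pager_map_store;

	pager_map = uvm_km_suballoc(kernel_map, &sva, &eva, PAGER_MAP_SIZE,
	    0, false, &pager_map_store);
#endif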
/*
* uvm_km_pgremove: remove pages from a kernel uvm_object and KVA.
*/
void
uvm_km_pgremove(vaddr_t startva, vaddr_t endva)
{
struct uvm_object * const uobj = uvm_kernel_object;
const voff_t start = startva - vm_map_min(kernel_map);
const voff_t end = endva - vm_map_min(kernel_map);
struct vm_page *pg;
voff_t curoff, nextoff;
int swpgonlydelta = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(VM_MIN_KERNEL_ADDRESS <= startva);
KASSERT(startva < endva);
KASSERT(endva <= VM_MAX_KERNEL_ADDRESS);
rw_enter(uobj->vmobjlock, RW_WRITER);
pmap_remove(pmap_kernel(), startva, endva);
for (curoff = start; curoff < end; curoff = nextoff) {
nextoff = curoff + PAGE_SIZE;
pg = uvm_pagelookup(uobj, curoff);
if (pg != NULL && pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "km_pgrm");
rw_enter(uobj->vmobjlock, RW_WRITER);
nextoff = curoff;
continue;
}
/*
* free the swap slot, then the page.
*/
if (pg == NULL &&
uao_find_swslot(uobj, curoff >> PAGE_SHIFT) > 0) {
swpgonlydelta++;
}
uao_dropswap(uobj, curoff >> PAGE_SHIFT);
if (pg != NULL) {
uvm_pagefree(pg);
}
}
rw_exit(uobj->vmobjlock);
if (swpgonlydelta > 0) {
KASSERT(uvmexp.swpgonly >= swpgonlydelta);
atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
}
/*
* uvm_km_pgremove_intrsafe: like uvm_km_pgremove(), but for non object backed
* regions.
*
* => when you unmap a part of anonymous kernel memory you want to toss
* the pages right away. (this is called from uvm_unmap_...).
* => none of the pages will ever be busy, and none of them will ever
* be on the active or inactive queues (because they have no object).
*/
void
uvm_km_pgremove_intrsafe(struct vm_map *map, vaddr_t start, vaddr_t end)
{
#define __PGRM_BATCH 16
struct vm_page *pg;
paddr_t pa[__PGRM_BATCH];
int npgrm, i;
vaddr_t va, batch_vastart;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(VM_MAP_IS_KERNEL(map));
KASSERTMSG(vm_map_min(map) <= start,
"vm_map_min(map) [%#"PRIxVADDR"] <= start [%#"PRIxVADDR"]"
" (size=%#"PRIxVSIZE")",
vm_map_min(map), start, end - start);
KASSERT(start < end);
KASSERT(end <= vm_map_max(map));
for (va = start; va < end;) {
batch_vastart = va;
/* create a batch of at most __PGRM_BATCH pages to free */
for (i = 0;
i < __PGRM_BATCH && va < end;
va += PAGE_SIZE) {
if (!pmap_extract(pmap_kernel(), va, &pa[i])) {
continue;
}
i++;
}
npgrm = i;
/* now remove the mappings */
pmap_kremove(batch_vastart, va - batch_vastart);
/* and free the pages */
for (i = 0; i < npgrm; i++) {
pg = PHYS_TO_VM_PAGE(pa[i]);
KASSERT(pg);
KASSERT(pg->uobject == NULL);
KASSERT(pg->uanon == NULL);
KASSERT((pg->flags & PG_BUSY) == 0);
uvm_pagefree(pg);
}
}
#undef __PGRM_BATCH
}
#if defined(DEBUG)
void
uvm_km_check_empty(struct vm_map *map, vaddr_t start, vaddr_t end)
{
vaddr_t va;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KDASSERT(VM_MAP_IS_KERNEL(map));
KDASSERT(vm_map_min(map) <= start);
KDASSERT(start < end);
KDASSERT(end <= vm_map_max(map));
for (va = start; va < end; va += PAGE_SIZE) {
paddr_t pa;
if (pmap_extract(pmap_kernel(), va, &pa)) {
panic("uvm_km_check_empty: va %p has pa %#llx",
(void *)va, (long long)pa);
}
/*
* kernel_object should not have pages for the corresponding
* region. check it.
*
* why trylock? because:
* - caller might not want to block.
* - we can recurse when allocating radix_node for
* kernel_object.
*/
if (rw_tryenter(uvm_kernel_object->vmobjlock, RW_READER)) {
struct vm_page *pg;
pg = uvm_pagelookup(uvm_kernel_object,
va - vm_map_min(kernel_map));
rw_exit(uvm_kernel_object->vmobjlock);
if (pg) {
panic("uvm_km_check_empty: "
"has page hashed at %p",
(const void *)va);
}
}
}
}
#endif /* defined(DEBUG) */
/*
* uvm_km_alloc: allocate an area of kernel memory.
*
* => NOTE: we can return 0 even if we can wait if there is not enough
* free VM space in the map... caller should be prepared to handle
* this case.
* => we return KVA of memory allocated
*/
vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
vaddr_t kva, loopva;
vaddr_t offset;
vsize_t loopsize;
struct vm_page *pg;
struct uvm_object *obj;
int pgaflags;
vm_prot_t prot, vaprot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(vm_map_pmap(map) == pmap_kernel());
KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY);
KASSERT((flags & UVM_KMF_VAONLY) != 0 || (flags & UVM_KMF_COLORMATCH) == 0);
KASSERT((flags & UVM_KMF_COLORMATCH) == 0 || (flags & UVM_KMF_VAONLY) != 0);
/*
* setup for call
*/
kva = vm_map_min(map); /* hint */
size = round_page(size);
obj = (flags & UVM_KMF_PAGEABLE) ? uvm_kernel_object : NULL;
UVMHIST_LOG(maphist," (map=%#jx, obj=%#jx, size=%#jx, flags=%#jx)",
(uintptr_t)map, (uintptr_t)obj, size, flags);
/*
* allocate some virtual space
*/
vaprot = (flags & UVM_KMF_EXEC) ? UVM_PROT_ALL : UVM_PROT_RW;
if (__predict_false(uvm_map(map, &kva, size, obj, UVM_UNKNOWN_OFFSET,
align, UVM_MAPFLAG(vaprot, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM,
(flags & (UVM_KMF_TRYLOCK | UVM_KMF_NOWAIT | UVM_KMF_WAITVA
| UVM_KMF_COLORMATCH)))) != 0)) {
UVMHIST_LOG(maphist, "<- done (no VM)",0,0,0,0);
return(0);
}
/*
* if all we wanted was VA, return now
*/
if (flags & (UVM_KMF_VAONLY | UVM_KMF_PAGEABLE)) {
UVMHIST_LOG(maphist,"<- done valloc (kva=%#jx)", kva,0,0,0);
return(kva);
}
/*
* recover object offset from virtual address
*/
offset = kva - vm_map_min(kernel_map);
UVMHIST_LOG(maphist, " kva=%#jx, offset=%#jx", kva, offset,0,0);
/*
* now allocate and map in the memory... note that we are the only ones
* who should ever get a handle on this area of VM.
*/
loopva = kva;
loopsize = size;
pgaflags = UVM_FLAG_COLORMATCH;
if (flags & UVM_KMF_NOWAIT)
pgaflags |= UVM_PGA_USERESERVE;
if (flags & UVM_KMF_ZERO)
pgaflags |= UVM_PGA_ZERO;
prot = VM_PROT_READ | VM_PROT_WRITE;
if (flags & UVM_KMF_EXEC)
prot |= VM_PROT_EXECUTE;
while (loopsize) {
KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, NULL),
"loopva=%#"PRIxVADDR, loopva);
pg = uvm_pagealloc_strat(NULL, offset, NULL, pgaflags,
#ifdef UVM_KM_VMFREELIST
UVM_PGA_STRAT_ONLY, UVM_KM_VMFREELIST
#else
UVM_PGA_STRAT_NORMAL, 0
#endif
);
/*
* out of memory?
*/
if (__predict_false(pg == NULL)) {
if ((flags & UVM_KMF_NOWAIT) || ((flags & UVM_KMF_CANFAIL) && !uvm_reclaimable())) {
/* free everything! */
uvm_km_free(map, kva, size,
flags & UVM_KMF_TYPEMASK);
return (0);
} else {
uvm_wait("km_getwait2"); /* sleep here */
continue;
}
}
pg->flags &= ~PG_BUSY; /* new page */
UVM_PAGE_OWN(pg, NULL);
/*
* map it in
*/
pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
prot, PMAP_KMPAGE);
loopva += PAGE_SIZE;
offset += PAGE_SIZE;
loopsize -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
if ((flags & UVM_KMF_ZERO) == 0) {
kmsan_orig((void *)kva, size, KMSAN_TYPE_UVM, __RET_ADDR);
kmsan_mark((void *)kva, size, KMSAN_STATE_UNINIT);
}
UVMHIST_LOG(maphist,"<- done (kva=%#jx)", kva,0,0,0);
return(kva);
}
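/*
 * Example (illustrative sketch only, not part of this file): allocating
 * and releasing a wired, zeroed kernel buffer with uvm_km_alloc() and
 * uvm_km_free().  "len" is a hypothetical byte count.  As noted above,
 * uvm_km_alloc() can return 0 even when the caller may sleep, so the
 * failure path must be handled.
 */
#if 0	/* example only */
vaddr_t kva;

kva = uvm_km_alloc(kernel_map, round_page(len), 0,
    UVM_KMF_WIRED | UVM_KMF_ZERO | UVM_KMF_CANFAIL);
if (kva == 0)
	return ENOMEM;	/* no KVA or no physical pages available */
/* ... use the buffer at kva ... */
uvm_km_free(kernel_map, kva, round_page(len), UVM_KMF_WIRED);
#endif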
/*
* uvm_km_protect: change the protection of an allocated area
*/
int
uvm_km_protect(struct vm_map *map, vaddr_t addr, vsize_t size, vm_prot_t prot)
{
return uvm_map_protect(map, addr, addr + round_page(size), prot, false);
}
/*
* uvm_km_free: free an area of kernel memory
*/
void
uvm_km_free(struct vm_map *map, vaddr_t addr, vsize_t size, uvm_flag_t flags)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY);
KASSERT((addr & PAGE_MASK) == 0);
KASSERT(vm_map_pmap(map) == pmap_kernel());
size = round_page(size);
if (flags & UVM_KMF_PAGEABLE) {
uvm_km_pgremove(addr, addr + size);
} else if (flags & UVM_KMF_WIRED) {
/*
* Note: uvm_km_pgremove_intrsafe() extracts mapping, thus
* remove it after. See comment below about KVA visibility.
*/
uvm_km_pgremove_intrsafe(map, addr, addr + size);
}
/*
* Note: uvm_unmap_remove() calls pmap_update() for us, before
* KVA becomes globally available.
*/
uvm_unmap1(map, addr, addr + size, UVM_FLAG_VAONLY);
}
/* Sanity; must specify both or none. */
#if (defined(PMAP_MAP_POOLPAGE) || defined(PMAP_UNMAP_POOLPAGE)) && \
(!defined(PMAP_MAP_POOLPAGE) || !defined(PMAP_UNMAP_POOLPAGE))
#error Must specify MAP and UNMAP together.
#endif
#if defined(PMAP_ALLOC_POOLPAGE) && \
!defined(PMAP_MAP_POOLPAGE) && !defined(PMAP_UNMAP_POOLPAGE)
#error Must specify ALLOC with MAP and UNMAP
#endif
int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
vmem_addr_t *addr)
{
struct vm_page *pg;
vmem_addr_t va;
int rc;
vaddr_t loopva;
vsize_t loopsize;
size = round_page(size);
#if defined(PMAP_MAP_POOLPAGE)
if (size == PAGE_SIZE) {
again:
#ifdef PMAP_ALLOC_POOLPAGE
pg = PMAP_ALLOC_POOLPAGE((flags & VM_SLEEP) ?
0 : UVM_PGA_USERESERVE);
#else
pg = uvm_pagealloc(NULL, 0, NULL,
(flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE);
#endif /* PMAP_ALLOC_POOLPAGE */
if (__predict_false(pg == NULL)) {
if (flags & VM_SLEEP) {
uvm_wait("plpg");
goto again;
}
return ENOMEM;
}
va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg));
KASSERT(va != 0);
*addr = va;
return 0;
}
#endif /* PMAP_MAP_POOLPAGE */
rc = vmem_alloc(vm, size, flags, &va);
if (rc != 0)
return rc;
#ifdef PMAP_GROWKERNEL
/*
* These VA allocations happen independently of uvm_map
* so this allocation must not extend beyond the current limit.
*/
KASSERTMSG(uvm_maxkaddr >= va + size,
"%#"PRIxVADDR" %#"PRIxPTR" %#zx",
uvm_maxkaddr, va, size);
#endif
loopva = va;
loopsize = size;
while (loopsize) {
paddr_t pa __diagused;
KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, &pa),
"loopva=%#"PRIxVADDR" loopsize=%#"PRIxVSIZE
" pa=%#"PRIxPADDR" vmem=%p",
loopva, loopsize, pa, vm);
pg = uvm_pagealloc(NULL, loopva, NULL,
UVM_FLAG_COLORMATCH
| ((flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE));
if (__predict_false(pg == NULL)) {
if (flags & VM_SLEEP) {
uvm_wait("plpg");
continue;
} else {
uvm_km_pgremove_intrsafe(kernel_map, va,
va + size);
vmem_free(vm, va, size);
return ENOMEM;
}
}
pg->flags &= ~PG_BUSY; /* new page */
UVM_PAGE_OWN(pg, NULL);
pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
VM_PROT_READ|VM_PROT_WRITE, PMAP_KMPAGE);
loopva += PAGE_SIZE;
loopsize -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
*addr = va;
return 0;
}
void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, size_t size)
{
size = round_page(size);
#if defined(PMAP_UNMAP_POOLPAGE)
if (size == PAGE_SIZE) {
paddr_t pa;
pa = PMAP_UNMAP_POOLPAGE(addr);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
return;
}
#endif /* PMAP_UNMAP_POOLPAGE */
uvm_km_pgremove_intrsafe(kernel_map, addr, addr + size);
pmap_update(pmap_kernel());
vmem_free(vm, addr, size);
}
bool
uvm_km_va_starved_p(void)
{
vmem_size_t total;
vmem_size_t free;
if (kmem_arena == NULL)
return false;
total = vmem_size(kmem_arena, VMEM_ALLOC|VMEM_FREE);
free = vmem_size(kmem_arena, VMEM_FREE);
return (free < (total / 10));
}
/* $NetBSD: layer_vfsops.c,v 1.56 2022/12/09 10:33:18 hannken Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Generic layer VFS operations.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_vfsops.c,v 1.56 2022/12/09 10:33:18 hannken Exp $");
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
SYSCTL_SETUP_PROTO(sysctl_vfs_layerfs_setup);
MODULE(MODULE_CLASS_MISC, layerfs, NULL);
static int
layerfs_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return 0;
case MODULE_CMD_FINI:
return 0;
default:
return ENOTTY;
}
return 0;
}
/*
* VFS start. Nothing needed here - the start routine on the underlying
* filesystem will have been called when that filesystem was mounted.
*/
int
layerfs_start(struct mount *mp, int flags)
{
#ifdef notyet
return VFS_START(mp->mnt_lower, flags);
#else
return 0;
#endif
}
int
layerfs_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct vnode *vp;
vp = MOUNTTOLAYERMOUNT(mp)->layerm_rootvp;
if (vp == NULL) {
*vpp = NULL;
return EINVAL;
}
/*
* Return the root vnode locked and with a reference held.
*/
vref(vp);
vn_lock(vp, lktype | LK_RETRY);
*vpp = vp;
return 0;
}
int
layerfs_quotactl(struct mount *mp, struct quotactl_args *args)
{
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_QUOTACTL(mp->mnt_lower, args);
vfs_unbusy(mp);
}
return error;
}
int
layerfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
struct statvfs *sbuf;
int error;
sbuf = kmem_zalloc(sizeof(*sbuf), KM_SLEEP);
error = vfs_busy(mp);
if (error == 0) {
error = VFS_STATVFS(mp->mnt_lower, sbuf);
vfs_unbusy(mp);
}
if (error) {
goto done;
}
/* Copy across the relevant data and fake the rest. */
sbp->f_flag = sbuf->f_flag;
sbp->f_bsize = sbuf->f_bsize;
sbp->f_frsize = sbuf->f_frsize;
sbp->f_iosize = sbuf->f_iosize;
sbp->f_blocks = sbuf->f_blocks;
sbp->f_bfree = sbuf->f_bfree;
sbp->f_bavail = sbuf->f_bavail;
sbp->f_bresvd = sbuf->f_bresvd;
sbp->f_files = sbuf->f_files;
sbp->f_ffree = sbuf->f_ffree;
sbp->f_favail = sbuf->f_favail;
sbp->f_fresvd = sbuf->f_fresvd;
sbp->f_namemax = sbuf->f_namemax;
copy_statvfs_info(sbp, mp);
done:
kmem_free(sbuf, sizeof(*sbuf));
return error;
}
int
layerfs_sync(struct mount *mp, int waitfor,
kauth_cred_t cred)
{
/*
* XXX - Assumes no data cached at layer.
*/
return 0;
}
int
layerfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct layer_mount *lmp = MOUNTTOLAYERMOUNT(mp);
struct vnode *lowervp;
struct layer_node *xp;
KASSERT(key_len == sizeof(struct vnode *));
memcpy(&lowervp, key, key_len);
xp = kmem_alloc(lmp->layerm_size, KM_SLEEP);
/* Share the interlock, vmobjlock, and klist with the lower node. */
vshareilock(vp, lowervp);
rw_obj_hold(lowervp->v_uobj.vmobjlock);
uvm_obj_setlock(&vp->v_uobj, lowervp->v_uobj.vmobjlock);
vshareklist(vp, lowervp);
vp->v_tag = lmp->layerm_tag;
vp->v_type = lowervp->v_type;
vp->v_op = lmp->layerm_vnodeop_p;
if (vp->v_type == VBLK || vp->v_type == VCHR)
spec_node_init(vp, lowervp->v_rdev);
vp->v_data = xp;
xp->layer_vnode = vp;
xp->layer_lowervp = lowervp;
xp->layer_flags = 0;
uvm_vnp_setsize(vp, 0);
/* Add a reference to the lower node. */
vref(lowervp);
*new_key = &xp->layer_lowervp;
return 0;
}
int
layerfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
struct vnode *vp;
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_VGET(mp->mnt_lower, ino, lktype, &vp);
vfs_unbusy(mp);
}
if (error) {
*vpp = NULL;
return error;
}
VOP_UNLOCK(vp);
error = layer_node_create(mp, vp, vpp);
if (error) {
vrele(vp);
*vpp = NULL;
return error;
}
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
int
layerfs_fhtovp(struct mount *mp, struct fid *fidp, int lktype,
struct vnode **vpp)
{
struct vnode *vp;
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_FHTOVP(mp->mnt_lower, fidp, lktype, &vp);
vfs_unbusy(mp);
}
if (error) {
*vpp = NULL;
return error;
}
VOP_UNLOCK(vp);
error = layer_node_create(mp, vp, vpp);
if (error) {
vput(vp);
*vpp = NULL;
return (error);
}
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
int
layerfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
{
return VFS_VPTOFH(LAYERVPTOLOWERVP(vp), fhp, fh_size);
}
/*
* layerfs_snapshot - handle a snapshot through a layered file system
*
* At present, we do NOT support snapshotting through a layered file
* system as the ffs implementation changes v_vnlock of the snapshot
* vnodes to point to one common lock. As there is no way for us to
* absolutely pass this change up the stack, a layered file system
* would end up referencing the wrong lock.
*
* This routine serves as a central resource for this behavior; individual
* layered file systems don't need to worry about the above. Also, if
* things get fixed, all layers get the benefit.
*/
int
layerfs_snapshot(struct mount *mp, struct vnode *vp,
struct timespec *ts)
{
return EOPNOTSUPP;
}
/*
* layerfs_suspendctl - suspend a layered file system
*
* Here we should suspend the lower file system(s) too. At present
* this will deadlock as we don't know which to suspend first.
*
* This routine serves as a central resource for this behavior; individual
* layered file systems don't need to worry about the above. Also, if
* things get fixed, all layers get the benefit.
*/
int
layerfs_suspendctl(struct mount *mp, int cmd)
{
return genfs_suspendctl(mp, cmd);
}
SYSCTL_SETUP(sysctl_vfs_layerfs_setup, "sysctl vfs.layerfs subtree setup")
{
const struct sysctlnode *layerfs_node = NULL;
sysctl_createv(clog, 0, NULL, &layerfs_node,
#ifdef _MODULE
0,
#else
CTLFLAG_PERMANENT,
#endif
CTLTYPE_NODE, "layerfs",
SYSCTL_DESCR("Generic layered file system"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
#ifdef LAYERFS_DIAGNOSTIC
sysctl_createv(clog, 0, &layerfs_node, NULL,
#ifndef _MODULE
CTLFLAG_PERMANENT |
#endif
CTLFLAG_READWRITE,
CTLTYPE_INT,
"debug",
SYSCTL_DESCR("Verbose debugging messages"),
NULL, 0, &layerfs_debug, 0,
CTL_CREATE, CTL_EOL);
#endif
/*
* other subtrees should really be aliases to this, but since
* they can't tell if layerfs has been instantiated yet, they
* can't do that...not easily. not yet. :-)
*/
}
int
layerfs_renamelock_enter(struct mount *mp)
{
return VFS_RENAMELOCK_ENTER(mp->mnt_lower);
}
void
layerfs_renamelock_exit(struct mount *mp)
{
VFS_RENAMELOCK_EXIT(mp->mnt_lower);
}
/* $NetBSD: uipc_socket2.c,v 1.143 2024/01/03 18:10:42 andvar Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.143 2024/01/03 18:10:42 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_mbuftrace.h"
#include "opt_sb_max.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>
#ifdef DDB
#include <sys/filedesc.h>
#include <ddb/db_active.h>
#endif
/*
* Primitive routines for operating on sockets and socket buffers.
*
* Connection life-cycle:
*
* Normal sequence from the active (originating) side:
*
* - soisconnecting() is called during processing of connect() call,
* - resulting in an eventual call to soisconnected() if/when the
* connection is established.
*
* When the connection is torn down during processing of disconnect():
*
* - soisdisconnecting() is called and,
* - soisdisconnected() is called when the connection to the peer
* is totally severed.
*
* The semantics of these routines are such that connectionless protocols
* can call soisconnected() and soisdisconnected() only, bypassing the
* in-progress calls when setting up a ``connection'' takes no time.
*
* From the passive side, a socket is created with two queues of sockets:
*
* - so_q0 (0) for partial connections (i.e. connections in progress)
* - so_q (1) for connections already made and awaiting user acceptance.
*
* As a protocol is preparing incoming connections, it creates a socket
* structure queued on so_q0 by calling sonewconn(). When the connection
* is established, soisconnected() is called, and transfers the
* socket structure to so_q, making it available to accept().
*
* If a socket is closed with sockets on either so_q0 or so_q, these
* sockets are dropped.
*
* Locking rules and assumptions:
*
* o socket::so_lock can change on the fly. The low level routines used
* to lock sockets are aware of this. When so_lock is acquired, the
* routine locking must check to see if so_lock still points to the
* lock that was acquired. If so_lock has changed in the meantime, the
* now irrelevant lock that was acquired must be dropped and the lock
* operation retried. Although not proven here, this is completely safe
* on a multiprocessor system, even with relaxed memory ordering, given
* the next two rules:
*
* o In order to mutate so_lock, the lock pointed to by the current value
* of so_lock must be held: i.e., the socket must be held locked by the
* changing thread. The thread must issue membar_release() to prevent
* memory accesses being reordered, and can set so_lock to the desired
* value. If the lock pointed to by the new value of so_lock is not
* held by the changing thread, the socket must then be considered
* unlocked.
*
* o If so_lock is mutated, and the previous lock referred to by so_lock
* could still be visible to other threads in the system (e.g. via file
* descriptor or protocol-internal reference), then the old lock must
* remain valid until the socket and/or protocol control block has been
* torn down.
*
* o If a socket has a non-NULL so_head value (i.e. is in the process of
* connecting), then locking the socket must also lock the socket pointed
* to by so_head: their lock pointers must match.
*
* o If a socket has connections in progress (so_q, so_q0 not empty) then
* locking the socket must also lock the sockets attached to both queues.
* Again, their lock pointers must match.
*
* o Beyond the initial lock assignment in socreate(), assigning locks to
* sockets is the responsibility of the individual protocols / protocol
* domains.
*/
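/*
 * Example (illustrative sketch only): how a protocol might switch a
 * socket to a new lock while observing the rules above.  "newlock" is a
 * hypothetical kmutex_t the protocol manages; the socket must be locked
 * via the old lock when so_lock is changed, and the old lock must stay
 * valid for as long as other threads may still reference it.
 */
#if 0	/* example only */
	KASSERT(solocked(so));		/* old lock held */
	mutex_obj_hold(newlock);	/* new lock must outlive the socket */
	membar_release();		/* order prior stores before publishing */
	solockreset(so, newlock);	/* publish new lock, wake waiters */
#endif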
static pool_cache_t socket_cache;
u_long sb_max = SB_MAX;/* maximum socket buffer size */
static u_long sb_max_adj; /* adjusted sb_max */
void
soisconnecting(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTING;
}
void
soisconnected(struct socket *so)
{
struct socket *head;
head = so->so_head;
KASSERT(solocked(so));
KASSERT(head == NULL || solocked2(so, head));
so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTED;
if (head && so->so_onq == &head->so_q0) {
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
/*
* Re-enqueue and wake up any waiters, e.g.
* processes blocking on accept().
*/
soqremque(so, 0);
soqinsque(head, so, 1);
sorwakeup(head);
cv_broadcast(&head->so_cv);
} else {
so->so_upcall =
head->so_accf->so_accept_filter->accf_callback;
so->so_upcallarg = head->so_accf->so_accept_filter_arg;
so->so_rcv.sb_flags |= SB_UPCALL;
so->so_options &= ~SO_ACCEPTFILTER;
(*so->so_upcall)(so, so->so_upcallarg,
POLLIN|POLLRDNORM, M_DONTWAIT);
}
} else {
cv_broadcast(&so->so_cv);
sorwakeup(so);
sowwakeup(so);
}
}
void
soisdisconnecting(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~SS_ISCONNECTING;
so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
cv_broadcast(&so->so_cv);
sowwakeup(so);
sorwakeup(so);
}
void
soisdisconnected(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
cv_broadcast(&so->so_cv);
sowwakeup(so);
sorwakeup(so);
}
void
soinit2(void)
{
socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
"socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}
/*
* sonewconn: accept a new connection.
*
* When an attempt at a new connection is noted on a socket which accepts
* connections, sonewconn(9) is called. If the connection is possible
* (subject to space constraints, etc) then we allocate a new structure,
* properly linked into the data structure of the original socket.
*
* => If 'soready' is true, then socket will become ready for accept() i.e.
* inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken.
* => May be called from soft-interrupt context.
* => Listening socket should be locked.
* => Returns the new socket locked.
*/
struct socket *
sonewconn(struct socket *head, bool soready)
{
struct socket *so;
int soqueue, error;
KASSERT(solocked(head));
if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) {
/*
* Listen queue overflow. If there is an accept filter
* active, pass through the oldest connection it's handling.
*/
if (head->so_accf == NULL) {
return NULL;
} else {
struct socket *so2, *next;
/* Pass the oldest connection waiting in the
accept filter */
for (so2 = TAILQ_FIRST(&head->so_q0);
so2 != NULL; so2 = next) {
next = TAILQ_NEXT(so2, so_qe);
if (so2->so_upcall == NULL) {
continue;
}
so2->so_upcall = NULL;
so2->so_upcallarg = NULL;
so2->so_options &= ~SO_ACCEPTFILTER;
so2->so_rcv.sb_flags &= ~SB_UPCALL;
soisconnected(so2);
break;
}
/* If nothing was nudged out of the accept filter, bail
* out; otherwise proceed allocating the socket. */
if (so2 == NULL) {
return NULL;
}
}
}
if ((head->so_options & SO_ACCEPTFILTER) != 0) {
soready = false;
}
soqueue = soready ? 1 : 0;
if ((so = soget(false)) == NULL) {
return NULL;
}
so->so_type = head->so_type;
so->so_options = head->so_options & ~SO_ACCEPTCONN;
so->so_linger = head->so_linger;
so->so_state = head->so_state | SS_NOFDREF;
so->so_proto = head->so_proto;
so->so_timeo = head->so_timeo;
so->so_pgid = head->so_pgid;
so->so_send = head->so_send;
so->so_receive = head->so_receive;
so->so_uidinfo = head->so_uidinfo;
so->so_egid = head->so_egid;
so->so_cpid = head->so_cpid;
/*
* Share the lock with the listening-socket, it may get unshared
* once the connection is complete.
*
* so_lock is stable while we hold the socket locked, so no
* need for atomic_load_* here.
*/
mutex_obj_hold(head->so_lock);
so->so_lock = head->so_lock;
/*
* Reserve the space for socket buffers.
*/
#ifdef MBUFTRACE
so->so_mowner = head->so_mowner;
so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
goto out;
}
so->so_snd.sb_lowat = head->so_snd.sb_lowat;
so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
so->so_snd.sb_timeo = head->so_snd.sb_timeo;
so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
/*
* Finally, perform the protocol attach. Note: a new socket
* lock may be assigned at this point (if so, it will be held).
*/
error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0);
if (error) {
out:
KASSERT(solocked(so));
KASSERT(so->so_accf == NULL);
soput(so);
/* Note: the listening socket shall stay locked. */
KASSERT(solocked(head));
return NULL;
}
KASSERT(solocked2(head, so));
/*
* Insert into the queue. If ready, update the connection status
* and wake up any waiters, e.g. processes blocking on accept().
*/
soqinsque(head, so, soqueue);
if (soready) {
so->so_state |= SS_ISCONNECTED;
sorwakeup(head);
cv_broadcast(&head->so_cv);
}
return so;
}
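/*
 * Example (illustrative sketch only): a connection-oriented protocol
 * reacting to an incoming connection request on a listening socket
 * "head" (held locked).  With soready=false the new socket is queued on
 * so_q0; once the handshake completes, soisconnected() moves it to so_q
 * where accept() can pick it up.
 */
#if 0	/* example only */
	struct socket *so;

	KASSERT(solocked(head));
	so = sonewconn(head, false);
	if (so == NULL)
		return;		/* queue overflow or out of memory: drop */
	/* ... attach protocol state to so, continue the handshake ... */
	/* later, when the connection is fully established: */
	soisconnected(so);
#endif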
struct socket *
soget(bool waitok)
{
struct socket *so;
so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
if (__predict_false(so == NULL))
return (NULL);
memset(so, 0, sizeof(*so));
TAILQ_INIT(&so->so_q0);
TAILQ_INIT(&so->so_q);
cv_init(&so->so_cv, "socket");
cv_init(&so->so_rcv.sb_cv, "netio");
cv_init(&so->so_snd.sb_cv, "netio");
selinit(&so->so_rcv.sb_sel);
selinit(&so->so_snd.sb_sel);
so->so_rcv.sb_so = so;
so->so_snd.sb_so = so;
return so;
}
void
soput(struct socket *so)
{
KASSERT(!cv_has_waiters(&so->so_cv));
KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
seldestroy(&so->so_rcv.sb_sel);
seldestroy(&so->so_snd.sb_sel);
mutex_obj_free(so->so_lock);
cv_destroy(&so->so_cv);
cv_destroy(&so->so_rcv.sb_cv);
cv_destroy(&so->so_snd.sb_cv);
pool_cache_put(socket_cache, so);
}
/*
* soqinsque: insert socket of a new connection into the specified
* accept queue of the listening socket (head).
*
* q = 0: queue of partial connections
* q = 1: queue of incoming connections
*/
void
soqinsque(struct socket *head, struct socket *so, int q)
{
KASSERT(q == 0 || q == 1);
KASSERT(solocked2(head, so));
KASSERT(so->so_onq == NULL);
KASSERT(so->so_head == NULL);
so->so_head = head;
if (q == 0) {
head->so_q0len++;
so->so_onq = &head->so_q0;
} else {
head->so_qlen++;
so->so_onq = &head->so_q;
}
TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}
/*
* soqremque: remove socket from the specified queue.
*
* => Returns true if socket was removed from the specified queue.
* => False if socket was not removed (because it was in other queue).
*/
bool
soqremque(struct socket *so, int q)
{
struct socket *head = so->so_head;
KASSERT(q == 0 || q == 1);
KASSERT(solocked(so));
KASSERT(so->so_onq != NULL);
KASSERT(head != NULL);
if (q == 0) {
if (so->so_onq != &head->so_q0)
return false;
head->so_q0len--;
} else {
if (so->so_onq != &head->so_q)
return false;
head->so_qlen--;
}
KASSERT(solocked2(so, head));
TAILQ_REMOVE(so->so_onq, so, so_qe);
so->so_onq = NULL;
so->so_head = NULL;
return true;
}
/*
* socantsendmore: indicates that no more data will be sent on the
* socket; it is normally applied to a socket by the protocol code
* (e.g. in pr_shutdown()) when the user informs the system that no
* more data is to be sent.
*/
void
socantsendmore(struct socket *so)
{
KASSERT(solocked(so));
so->so_state |= SS_CANTSENDMORE;
sowwakeup(so);
}
/*
* socantrcvmore(): indicates that no more data will be received and
* will normally be applied to the socket by a protocol when it detects
* that the peer will send no more data. Data queued for reading in
* the socket may yet be read.
*/
void
socantrcvmore(struct socket *so)
{
KASSERT(solocked(so));
so->so_state |= SS_CANTRCVMORE;
sorwakeup(so);
}
/*
* soroverflow(): indicates that data was attempted to be sent
* but the receiving buffer overflowed.
*/
void
soroverflow(struct socket *so)
{
KASSERT(solocked(so));
so->so_rcv.sb_overflowed++;
if (so->so_options & SO_RERROR) {
so->so_rerror = ENOBUFS;
sorwakeup(so);
}
}
/*
* Wait for data to arrive at/drain from a socket buffer.
*/
int
sbwait(struct sockbuf *sb)
{
struct socket *so;
kmutex_t *lock;
int error;
so = sb->sb_so;
KASSERT(solocked(so));
sb->sb_flags |= SB_NOTIFY;
lock = so->so_lock;
if ((sb->sb_flags & SB_NOINTR) != 0)
error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
else
error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
return error;
}
/*
* Wakeup processes waiting on a socket buffer.
* Do asynchronous notification via SIGIO
* if the socket buffer has the SB_ASYNC flag set.
*/
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
int band;
KASSERT(solocked(so));
KASSERT(sb->sb_so == so);
switch (code) {
case POLL_IN:
band = POLLIN|POLLRDNORM;
break;
case POLL_OUT:
band = POLLOUT|POLLWRNORM;
break;
case POLL_HUP:
band = POLLHUP;
break;
default:
band = 0;
#ifdef DIAGNOSTIC
printf("bad siginfo code %d in socket notification.\n", code);
#endif
break;
}
sb->sb_flags &= ~SB_NOTIFY;
selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
cv_broadcast(&sb->sb_cv);
if (sb->sb_flags & SB_ASYNC)
fownsignal(so->so_pgid, SIGIO, code, band, so);
if (sb->sb_flags & SB_UPCALL)
(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}
/*
* Reset a socket's lock pointer. Wake all threads waiting on the
* socket's condition variables so that they can restart their waits
* using the new lock. The existing lock must be held.
*
* Caller must have issued membar_release before this.
*/
void
solockreset(struct socket *so, kmutex_t *lock)
{
KASSERT(solocked(so));
so->so_lock = lock;
cv_broadcast(&so->so_snd.sb_cv);
cv_broadcast(&so->so_rcv.sb_cv);
cv_broadcast(&so->so_cv);
}
/*
* Socket buffer (struct sockbuf) utility routines.
*
* Each socket contains two socket buffers: one for sending data and
* one for receiving data. Each buffer contains a queue of mbufs,
* information about the number of mbufs and amount of data in the
* queue, and other fields allowing poll() statements and notification
* on data availability to be implemented.
*
* Data stored in a socket buffer is maintained as a list of records.
* Each record is a list of mbufs chained together with the m_next
* field. Records are chained together with the m_nextpkt field. The upper
* level routine soreceive() expects the following conventions to be
* observed when placing information in the receive buffer:
*
* 1. If the protocol requires each message be preceded by the sender's
* name, then a record containing that name must be present before
* any associated data (mbuf's must be of type MT_SONAME).
* 2. If the protocol supports the exchange of ``access rights'' (really
* just additional data associated with the message), and there are
* ``rights'' to be received, then a record containing this data
* should be present (mbuf's must be of type MT_CONTROL).
* 3. If a name or rights record exists, then it must be followed by
* a data record, perhaps of zero length.
*
* Before using a new socket structure it is first necessary to reserve
* buffer space to the socket, by calling sbreserve(). This should commit
* some of the available buffer space in the system buffer pool for the
* socket (currently, it does nothing but enforce limits). The space
* should be released by calling sbrelease() when the socket is destroyed.
*/
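/*
 * Example (illustrative sketch only): a protocol attach routine
 * reserving buffer space before the socket is used.  The send and
 * receive space values are hypothetical per-protocol defaults.
 */
#if 0	/* example only */
	int error;

	error = soreserve(so, 8192 /* send space */, 8192 /* receive space */);
	if (error != 0)
		return error;	/* ENOBUFS: per-user or global limits hit */
#endif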
int
sb_max_set(u_long new_sbmax)
{
int s;
if (new_sbmax < (16 * 1024))
return (EINVAL);
s = splsoftnet();
sb_max = new_sbmax;
sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
splx(s);
return (0);
}
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
KASSERT(so->so_pcb == NULL || solocked(so));
/*
* there's at least one application (the configure script of screen)
* which expects a fifo to be writable even if it already has "some"
* bytes in its buffer.
* so we want to make sure (hiwat - lowat) >= (some bytes).
*
* PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
* we expect it's large enough for such applications.
*/
u_long lowat = MAX(sock_loan_thresh, MCLBYTES);
u_long hiwat = lowat + PIPE_BUF;
if (sndcc < hiwat)
sndcc = hiwat;
if (sbreserve(&so->so_snd, sndcc, so) == 0)
goto bad;
if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
goto bad2;
if (so->so_rcv.sb_lowat == 0)
so->so_rcv.sb_lowat = 1;
if (so->so_snd.sb_lowat == 0)
so->so_snd.sb_lowat = lowat;
if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
return (0);
bad2:
sbrelease(&so->so_snd, so);
bad:
return (ENOBUFS);
}
/*
* Allot mbufs to a sockbuf.
* Attempt to scale mbmax so that mbcnt doesn't become limiting
* if buffering efficiency is near the normal case.
*/
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
struct lwp *l = curlwp; /* XXX */
rlim_t maxcc;
struct uidinfo *uidinfo;
KASSERT(so->so_pcb == NULL || solocked(so));
KASSERT(sb->sb_so == so);
KASSERT(sb_max_adj != 0);
if (cc == 0 || cc > sb_max_adj)
return (0);
maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
uidinfo = so->so_uidinfo;
if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
return 0;
sb->sb_mbmax = uimin(cc * 2, sb_max);
if (sb->sb_lowat > sb->sb_hiwat)
sb->sb_lowat = sb->sb_hiwat;
return (1);
}
/*
* Free mbufs held by a socket, and reserved mbuf space. We do not assert
* that the socket is held locked here: see sorflush().
*/
void
sbrelease(struct sockbuf *sb, struct socket *so)
{
KASSERT(sb->sb_so == so);
sbflush(sb);
(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
sb->sb_mbmax = 0;
}
/*
* Routines to add and remove
* data from an mbuf queue.
*
* The routines sbappend() or sbappendrecord() are normally called to
* append new mbufs to a socket buffer, after checking that adequate
* space is available, comparing the function sbspace() with the amount
* of data to be added. sbappendrecord() differs from sbappend() in
* that data supplied is treated as the beginning of a new record.
* To place a sender's address, optional access rights, and data in a
* socket receive buffer, sbappendaddr() should be used. To place
* access rights and data in a socket receive buffer, sbappendrights()
* should be used. In either case, the new data begins a new record.
* Note that unlike sbappend() and sbappendrecord(), these routines check
* for the caller that there will be enough space to store the data.
* Each fails if there is not enough space, or if it cannot find mbufs
* to store additional information in.
*
* Reliable protocols may use the socket send buffer to hold data
* awaiting acknowledgement. Data is normally copied from a socket
* send buffer in a protocol with m_copym for output to a peer,
* and then removing the data from the socket buffer with sbdrop()
* or sbdroprecord() when the data is acknowledged by the peer.
*/
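/*
 * Example (illustrative sketch only): typical protocol use of the
 * routines described above.  "from", "m" and "acked" are hypothetical;
 * a datagram protocol appends each packet as its own record, while a
 * reliable protocol drops acknowledged data from its send buffer.
 */
#if 0	/* example only */
	/* receive side: new record = sender address + data */
	if (sbappendaddr(&so->so_rcv, from, m, NULL) == 0)
		m_freem(m);	/* no space or no mbufs: drop the packet */
	else
		sorwakeup(so);	/* wake readers and poll()ers */

	/* send side: the peer acknowledged "acked" bytes */
	sbdrop(&so->so_snd, acked);
	sowwakeup(so);		/* writers may now have space again */
#endif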
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
struct mbuf *m = sb->sb_mb;
KASSERT(solocked(sb->sb_so));
while (m && m->m_nextpkt)
m = m->m_nextpkt;
if (m != sb->sb_lastrecord) {
printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
sb->sb_mb, sb->sb_lastrecord, m);
printf("packet chain:\n");
for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
printf("\t%p\n", m);
panic("sblastrecordchk from %s", where);
}
}
void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
struct mbuf *m = sb->sb_mb;
struct mbuf *n;
KASSERT(solocked(sb->sb_so));
while (m && m->m_nextpkt)
m = m->m_nextpkt;
while (m && m->m_next)
m = m->m_next;
if (m != sb->sb_mbtail) {
printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
sb->sb_mb, sb->sb_mbtail, m);
printf("packet tree:\n");
for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
printf("\t");
for (n = m; n != NULL; n = n->m_next)
printf("%p ", n);
printf("\n");
}
panic("sblastmbufchk from %s", where);
}
}
#endif /* SOCKBUF_DEBUG */
/*
* Link a chain of records onto a socket buffer
*/
#define SBLINKRECORDCHAIN(sb, m0, mlast) \
do { \
if ((sb)->sb_lastrecord != NULL) \
(sb)->sb_lastrecord->m_nextpkt = (m0); \
else \
(sb)->sb_mb = (m0); \
(sb)->sb_lastrecord = (mlast); \
} while (/*CONSTCOND*/0)
#define SBLINKRECORD(sb, m0) \
SBLINKRECORDCHAIN(sb, m0, m0)
/*
* Append mbuf chain m to the last record in the
* socket buffer sb. The additional space associated
* the mbuf chain is recorded in sb. Empty mbufs are
* discarded and mbufs are compacted where possible.
*/
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
struct mbuf *n;
KASSERT(solocked(sb->sb_so));
if (m == NULL)
return;
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
SBLASTRECORDCHK(sb, "sbappend 1");
if ((n = sb->sb_lastrecord) != NULL) {
/*
* XXX Would like to simply use sb_mbtail here, but
* XXX I need to verify that I won't miss an EOR that
* XXX way.
*/
do {
if (n->m_flags & M_EOR) {
sbappendrecord(sb, m); /* XXXXXX!!!! */
return;
}
} while (n->m_next && (n = n->m_next));
} else {
/*
* If this is the first record in the socket buffer, it's
* also the last record.
*/
sb->sb_lastrecord = m;
}
sbcompress(sb, m, n);
SBLASTRECORDCHK(sb, "sbappend 2");
}
/*
* This version of sbappend() should only be used when the caller
* absolutely knows that there will never be more than one record
* in the socket buffer, that is, a stream protocol (such as TCP).
*/
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
KDASSERT(m->m_nextpkt == NULL);
KASSERT(sb->sb_mb == sb->sb_lastrecord);
SBLASTMBUFCHK(sb, __func__);
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
sbcompress(sb, m, sb->sb_mbtail);
sb->sb_lastrecord = sb->sb_mb;
SBLASTRECORDCHK(sb, __func__);
}
#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
struct mbuf *m, *m2;
u_long len, mbcnt;
KASSERT(solocked(sb->sb_so));
len = 0;
mbcnt = 0;
for (m = sb->sb_mb; m; m = m->m_nextpkt) {
for (m2 = m; m2 != NULL; m2 = m2->m_next) {
len += m2->m_len;
mbcnt += MSIZE;
if (m2->m_flags & M_EXT)
mbcnt += m2->m_ext.ext_size;
if (m2->m_nextpkt != NULL)
panic("sbcheck nextpkt");
}
}
if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
mbcnt, sb->sb_mbcnt);
panic("sbcheck");
}
}
#endif
/*
* As above, except the mbuf chain
* begins a new record.
*/
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
struct mbuf *m;
KASSERT(solocked(sb->sb_so));
if (m0 == NULL)
return;
#ifdef MBUFTRACE
m_claimm(m0, sb->sb_mowner);
#endif
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(sb, m0);
SBLASTRECORDCHK(sb, "sbappendrecord 1");
SBLINKRECORD(sb, m0);
m = m0->m_next;
m0->m_next = 0;
if (m && (m0->m_flags & M_EOR)) {
m0->m_flags &= ~M_EOR;
m->m_flags |= M_EOR;
}
sbcompress(sb, m, m0);
SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
/*
* As above except that OOB data
* is inserted at the beginning of the sockbuf,
* but after any other OOB data.
*/
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
struct mbuf *m, **mp;
KASSERT(solocked(sb->sb_so));
if (m0 == NULL)
return;
SBLASTRECORDCHK(sb, "sbinsertoob 1");
for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
again:
switch (m->m_type) {
case MT_OOBDATA:
continue; /* WANT next train */
case MT_CONTROL:
if ((m = m->m_next) != NULL)
goto again; /* inspect THIS train further */
}
break;
}
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(sb, m0);
m0->m_nextpkt = *mp;
if (*mp == NULL) {
/* m0 is actually the new tail */
sb->sb_lastrecord = m0;
}
*mp = m0;
m = m0->m_next;
m0->m_next = 0;
if (m && (m0->m_flags & M_EOR)) {
m0->m_flags &= ~M_EOR;
m->m_flags |= M_EOR;
}
sbcompress(sb, m, m0);
SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
/*
* Append address and data, and optionally, control (ancillary) data
* to the receive queue of a socket. If present,
* m0 must include a packet header with total length.
* Returns 0 if no space in sockbuf or insufficient mbufs.
*/
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
struct mbuf *control)
{
struct mbuf *m, *n, *nlast;
int space, len;
KASSERT(solocked(sb->sb_so));
space = asa->sa_len;
if (m0 != NULL) {
if ((m0->m_flags & M_PKTHDR) == 0)
panic("sbappendaddr");
space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
m_claimm(m0, sb->sb_mowner);
#endif
}
for (n = control; n; n = n->m_next) {
space += n->m_len;
MCLAIM(n, sb->sb_mowner);
if (n->m_next == NULL) /* keep pointer to last control buf */
break;
}
if (space > sbspace(sb))
return (0);
m = m_get(M_DONTWAIT, MT_SONAME);
if (m == NULL)
return (0);
MCLAIM(m, sb->sb_mowner);
/*
* XXX avoid 'comparison always true' warning which isn't easily
* avoided.
*/
len = asa->sa_len;
if (len > MLEN) {
MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return (0);
}
}
m->m_len = asa->sa_len;
memcpy(mtod(m, void *), asa, asa->sa_len);
if (n)
n->m_next = m0; /* concatenate data to control */
else
control = m0;
m->m_next = control;
SBLASTRECORDCHK(sb, "sbappendaddr 1");
for (n = m; n->m_next != NULL; n = n->m_next)
sballoc(sb, n);
sballoc(sb, n);
nlast = n;
SBLINKRECORD(sb, m);
sb->sb_mbtail = nlast;
SBLASTMBUFCHK(sb, "sbappendaddr");
SBLASTRECORDCHK(sb, "sbappendaddr 2");
return (1);
}
/*
* Helper for sbappendchainaddr: prepend a struct sockaddr* to
* an mbuf chain.
*/
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
const struct sockaddr *asa)
{
struct mbuf *m;
const int salen = asa->sa_len;
KASSERT(solocked(sb->sb_so));
/* only the first in each chain need be a pkthdr */
m = m_gethdr(M_DONTWAIT, MT_SONAME);
if (m == NULL)
return NULL;
MCLAIM(m, sb->sb_mowner);
#ifdef notyet
if (salen > MHLEN) {
MEXTMALLOC(m, salen, M_NOWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
#else
KASSERT(salen <= MHLEN);
#endif
m->m_len = salen;
memcpy(mtod(m, void *), asa, salen);
m->m_next = m0;
m->m_pkthdr.len = salen + m0->m_pkthdr.len;
return m;
}
int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
struct mbuf *m0, int sbprio)
{
struct mbuf *m, *n, *n0, *nlast;
int error;
KASSERT(solocked(sb->sb_so));
/*
* XXX sbprio reserved for encoding priority of this request:
* SB_PRIO_NONE --> honour normal sb limits
* SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
* take whole chain. Intended for large requests
* that should be delivered atomically (all, or none).
* SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
* over normal socket limits, for messages indicating
* buffer overflow in earlier normal/lower-priority messages
* SB_PRIO_BESTEFFORT --> ignore limits entirely.
* Intended for kernel-generated messages only.
* Up to generator to avoid total mbuf resource exhaustion.
*/
(void)sbprio;
if (m0 && (m0->m_flags & M_PKTHDR) == 0)
panic("sbappendaddrchain");
#ifdef notyet
space = sbspace(sb);
/*
* Enforce SB_PRIO_* limits as described above.
*/
#endif
n0 = NULL;
nlast = NULL;
for (m = m0; m; m = m->m_nextpkt) {
struct mbuf *np;
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
/* Prepend sockaddr to this record (m) of input chain m0 */
n = m_prepend_sockaddr(sb, m, asa);
if (n == NULL) {
error = ENOBUFS;
goto bad;
}
/* Append record (asa+m) to end of new chain n0 */
if (n0 == NULL) {
n0 = n;
} else {
nlast->m_nextpkt = n;
}
/* Keep track of last record on new chain */
nlast = n;
for (np = n; np; np = np->m_next)
sballoc(sb, np);
}
SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
/* Drop the entire chain of (asa+m) records onto the socket */
SBLINKRECORDCHAIN(sb, n0, nlast);
SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
for (m = nlast; m->m_next; m = m->m_next)
;
sb->sb_mbtail = m;
SBLASTMBUFCHK(sb, "sbappendaddrchain");
return (1);
bad:
/*
* On error, free the prepended addresses. For consistency
* with sbappendaddr(), leave it to our caller to free
* the input record chain passed to us as m0.
*/
while ((n = n0) != NULL) {
struct mbuf *np;
/* Undo the sballoc() of this record */
for (np = n; np; np = np->m_next)
sbfree(sb, np);
n0 = n->m_nextpkt; /* iterate at next prepended address */
np = m_free(n); /* free prepended address (not data) */
}
return error;
}
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
struct mbuf *m, *mlast, *n;
int space;
KASSERT(solocked(sb->sb_so));
space = 0;
if (control == NULL)
panic("sbappendcontrol");
for (m = control; ; m = m->m_next) {
space += m->m_len;
MCLAIM(m, sb->sb_mowner);
if (m->m_next == NULL)
break;
}
n = m; /* save pointer to last control buffer */
for (m = m0; m; m = m->m_next) {
MCLAIM(m, sb->sb_mowner);
space += m->m_len;
}
if (space > sbspace(sb))
return (0);
n->m_next = m0; /* concatenate data to control */
SBLASTRECORDCHK(sb, "sbappendcontrol 1");
for (m = control; m->m_next != NULL; m = m->m_next)
sballoc(sb, m);
sballoc(sb, m);
mlast = m;
SBLINKRECORD(sb, control);
sb->sb_mbtail = mlast;
SBLASTMBUFCHK(sb, "sbappendcontrol");
SBLASTRECORDCHK(sb, "sbappendcontrol 2");
return (1);
}
/*
* Compress mbuf chain m into the socket
* buffer sb following mbuf n. If n
* is null, the buffer is presumed empty.
*/
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
int eor;
struct mbuf *o;
KASSERT(solocked(sb->sb_so));
eor = 0;
while (m) {
eor |= m->m_flags & M_EOR;
if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) &&
o->m_type == m->m_type))) {
if (sb->sb_lastrecord == m)
sb->sb_lastrecord = m->m_next;
m = m_free(m);
continue;
}
if (n && (n->m_flags & M_EOR) == 0 &&
/* M_TRAILINGSPACE() checks buffer writeability */
m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
(unsigned)m->m_len);
n->m_len += m->m_len;
sb->sb_cc += m->m_len;
m = m_free(m);
continue;
}
if (n)
n->m_next = m;
else
sb->sb_mb = m;
sb->sb_mbtail = m;
sballoc(sb, m);
n = m;
m->m_flags &= ~M_EOR;
m = m->m_next;
n->m_next = 0;
}
if (eor) {
if (n)
n->m_flags |= eor;
else
printf("semi-panic: sbcompress\n");
}
SBLASTMBUFCHK(sb, __func__);
}
/*
* Free all mbufs in a sockbuf.
* Check that all resources are reclaimed.
*/
void
sbflush(struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
KASSERT((sb->sb_flags & SB_LOCK) == 0);
while (sb->sb_mbcnt)
sbdrop(sb, (int)sb->sb_cc);
KASSERT(sb->sb_cc == 0);
KASSERT(sb->sb_mb == NULL);
KASSERT(sb->sb_mbtail == NULL);
KASSERT(sb->sb_lastrecord == NULL);
}
/*
* Drop data from (the front of) a sockbuf.
*/
void
sbdrop(struct sockbuf *sb, int len)
{
struct mbuf *m, *next;
KASSERT(solocked(sb->sb_so));
next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
while (len > 0) {
if (m == NULL) {
if (next == NULL)
panic("sbdrop(%p,%d): cc=%lu",
sb, len, sb->sb_cc);
m = next;
next = m->m_nextpkt;
continue;
}
if (m->m_len > len) {
m->m_len -= len;
m->m_data += len;
sb->sb_cc -= len;
break;
}
len -= m->m_len;
sbfree(sb, m);
m = m_free(m);
}
while (m && m->m_len == 0) {
sbfree(sb, m);
m = m_free(m);
}
if (m) {
sb->sb_mb = m;
m->m_nextpkt = next;
} else
sb->sb_mb = next;
/*
* First part is an inline SB_EMPTY_FIXUP(). Second part
* makes sure sb_lastrecord is up-to-date if we dropped
* part of the last record.
*/
m = sb->sb_mb;
if (m == NULL) {
sb->sb_mbtail = NULL;
sb->sb_lastrecord = NULL;
} else if (m->m_nextpkt == NULL)
sb->sb_lastrecord = m;
}
/*
* Drop a record off the front of a sockbuf
* and move the next record to the front.
*/
void
sbdroprecord(struct sockbuf *sb)
{
struct mbuf *m, *mn;
KASSERT(solocked(sb->sb_so));
m = sb->sb_mb;
if (m) {
sb->sb_mb = m->m_nextpkt;
do {
sbfree(sb, m);
mn = m_free(m);
} while ((m = mn) != NULL);
}
SB_EMPTY_FIXUP(sb);
}
/*
* Create a "control" mbuf containing the specified data
* with the specified type for presentation on a socket buffer.
*/
struct mbuf *
sbcreatecontrol1(void **p, int size, int type, int level, int flags)
{
struct cmsghdr *cp;
struct mbuf *m;
int space = CMSG_SPACE(size);
if ((flags & M_DONTWAIT) && space > MCLBYTES) {
printf("%s: message too large %d\n", __func__, space);
return NULL;
}
if ((m = m_get(flags, MT_CONTROL)) == NULL)
return NULL;
if (space > MLEN) {
if (space > MCLBYTES)
MEXTMALLOC(m, space, M_WAITOK);
else
MCLGET(m, flags);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
cp = mtod(m, struct cmsghdr *);
*p = CMSG_DATA(cp);
m->m_len = space;
cp->cmsg_len = CMSG_LEN(size);
cp->cmsg_level = level;
cp->cmsg_type = type;
memset(cp + 1, 0, CMSG_LEN(0) - sizeof(*cp));
memset((uint8_t *)*p + size, 0, CMSG_ALIGN(size) - size);
return m;
}
struct mbuf *
sbcreatecontrol(void *p, int size, int type, int level)
{
struct mbuf *m;
void *v;
m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT);
if (m == NULL)
return NULL;
memcpy(v, p, size);
return m;
}
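/*
 * Example (illustrative sketch only): building a control mbuf that
 * carries a timestamp, as a protocol might do when SO_TIMESTAMP is set
 * on the receiving socket.
 */
#if 0	/* example only */
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET);
	if (control == NULL) {
		/* no mbuf available: deliver data without ancillary info */
	}
#endif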
void
solockretry(struct socket *so, kmutex_t *lock)
{
while (lock != atomic_load_relaxed(&so->so_lock)) {
mutex_exit(lock);
lock = atomic_load_consume(&so->so_lock);
mutex_enter(lock);
}
}
bool
solocked(const struct socket *so)
{
/*
* Used only for diagnostic assertions, so so_lock should be
* stable at this point, hence no need for atomic_load_*.
*/
return mutex_owned(so->so_lock);
}
bool
solocked2(const struct socket *so1, const struct socket *so2)
{
const kmutex_t *lock;
/*
* Used only for diagnostic assertions, so so_lock should be
* stable at this point, hence no need for atomic_load_*.
*/
lock = so1->so_lock;
if (lock != so2->so_lock)
return false;
return mutex_owned(lock);
}
/*
* sosetlock: assign a default lock to a new socket.
*/
void
sosetlock(struct socket *so)
{
if (so->so_lock == NULL) {
kmutex_t *lock = softnet_lock;
so->so_lock = lock;
mutex_obj_hold(lock);
mutex_enter(lock);
}
KASSERT(solocked(so));
}
/*
* Set lock on sockbuf sb; sleep if lock is already held.
* Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
* Returns error without lock if sleep is interrupted.
*/
int
sblock(struct sockbuf *sb, int wf)
{
struct socket *so;
kmutex_t *lock;
int error;
KASSERT(solocked(sb->sb_so));
for (;;) {
if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
sb->sb_flags |= SB_LOCK;
return 0;
}
if (wf != M_WAITOK)
return EWOULDBLOCK;
so = sb->sb_so;
lock = so->so_lock;
if ((sb->sb_flags & SB_NOINTR) != 0) {
cv_wait(&so->so_cv, lock);
error = 0;
} else
error = cv_wait_sig(&so->so_cv, lock);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
if (error != 0)
return error;
}
}
void
sbunlock(struct sockbuf *sb)
{
struct socket *so;
so = sb->sb_so;
KASSERT(solocked(so));
KASSERT((sb->sb_flags & SB_LOCK) != 0);
sb->sb_flags &= ~SB_LOCK;
cv_broadcast(&so->so_cv);
}
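/*
 * Example (illustrative sketch only): serializing a long operation on a
 * socket buffer with sblock()/sbunlock().  The socket lock must be held;
 * sblock() may sleep and, per the rules above, re-validate so_lock.
 */
#if 0	/* example only */
	solock(so);
	error = sblock(&so->so_rcv, M_WAITOK);
	if (error != 0) {
		sounlock(so);
		return error;	/* interrupted by a signal */
	}
	/* ... consume data from so->so_rcv ... */
	sbunlock(&so->so_rcv);
	sounlock(so);
#endif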
int
sowait(struct socket *so, bool catch_p, int timo)
{
kmutex_t *lock;
int error;
KASSERT(solocked(so));
KASSERT(catch_p || timo != 0);
lock = so->so_lock;
if (catch_p)
error = cv_timedwait_sig(&so->so_cv, lock, timo);
else
error = cv_timedwait(&so->so_cv, lock, timo);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
return error;
}
#ifdef DDB
/*
* Currently, sofindproc() is used only from DDB. It could be used from
* elsewhere by using db_mutex_enter().
*/
static inline int
db_mutex_enter(kmutex_t *mtx)
{
int rv;
if (!db_active) {
mutex_enter(mtx);
rv = 1;
} else
rv = mutex_tryenter(mtx);
return rv;
}
int
sofindproc(struct socket *so, int all, void (*pr)(const char *, ...))
{
proc_t *p;
filedesc_t *fdp;
fdtab_t *dt;
fdfile_t *ff;
file_t *fp = NULL;
int found = 0;
int i, t;
if (so == NULL)
return 0;
t = db_mutex_enter(&proc_lock);
if (!t) {
pr("could not acquire proc_lock mutex\n");
return 0;
}
PROCLIST_FOREACH(p, &allproc) {
if (p->p_stat == SIDL)
continue;
fdp = p->p_fd;
t = db_mutex_enter(&fdp->fd_lock);
if (!t) {
pr("could not acquire fd_lock mutex\n");
continue;
}
dt = atomic_load_consume(&fdp->fd_dt);
for (i = 0; i < dt->dt_nfiles; i++) {
ff = dt->dt_ff[i];
if (ff == NULL)
continue;
fp = atomic_load_consume(&ff->ff_file);
if (fp == NULL)
continue;
t = db_mutex_enter(&fp->f_lock);
if (!t) {
pr("could not acquire f_lock mutex\n");
continue;
}
if ((struct socket *)fp->f_data != so) {
mutex_exit(&fp->f_lock);
continue;
}
found++;
if (pr)
pr("socket %p: owner %s(pid=%d)\n",
so, p->p_comm, p->p_pid);
mutex_exit(&fp->f_lock);
if (all == 0)
break;
}
mutex_exit(&fdp->fd_lock);
if (all == 0 && found != 0)
break;
}
mutex_exit(&proc_lock);
return found;
}
void
socket_print(const char *modif, void (*pr)(const char *, ...))
{
file_t *fp;
struct socket *so;
struct sockbuf *sb_snd, *sb_rcv;
struct mbuf *m_rec, *m;
bool opt_v = false;
bool opt_m = false;
bool opt_a = false;
bool opt_p = false;
int nrecs, nmbufs;
char ch;
const char *family;
while ( (ch = *(modif++)) != '\0') {
switch (ch) {
case 'v':
opt_v = true;
break;
case 'm':
opt_m = true;
break;
case 'a':
opt_a = true;
break;
case 'p':
opt_p = true;
break;
}
}
if (opt_v == false && pr)
(pr)("Ignore empty sockets. use /v to print all.\n");
if (opt_p == true && pr)
(pr)("Don't search owner process.\n");
LIST_FOREACH(fp, &filehead, f_list) {
if (fp->f_type != DTYPE_SOCKET)
continue;
so = (struct socket *)fp->f_data;
if (so == NULL)
continue;
if (so->so_proto->pr_domain->dom_family == AF_INET)
family = "INET";
#ifdef INET6
else if (so->so_proto->pr_domain->dom_family == AF_INET6)
family = "INET6";
#endif
else if (so->so_proto->pr_domain->dom_family == pseudo_AF_KEY)
family = "KEY";
else if (so->so_proto->pr_domain->dom_family == AF_ROUTE)
family = "ROUTE";
else
continue;
sb_snd = &so->so_snd;
sb_rcv = &so->so_rcv;
if (opt_v != true &&
sb_snd->sb_cc == 0 && sb_rcv->sb_cc == 0)
continue;
pr("---SOCKET %p: type %s\n", so, family);
if (opt_p != true)
sofindproc(so, opt_a == true ? 1 : 0, pr);
pr("Send Buffer Bytes: %d [bytes]\n", sb_snd->sb_cc);
pr("Send Buffer mbufs:\n");
m_rec = m = sb_snd->sb_mb;
nrecs = 0;
nmbufs = 0;
while (m_rec) {
nrecs++;
if (opt_m == true)
pr(" mbuf chain %p\n", m_rec);
while (m) {
nmbufs++;
m = m->m_next;
}
m_rec = m = m_rec->m_nextpkt;
}
pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);
pr("Recv Buffer Usage: %d [bytes]\n", sb_rcv->sb_cc);
pr("Recv Buffer mbufs:\n");
m_rec = m = sb_rcv->sb_mb;
nrecs = 0;
nmbufs = 0;
while (m_rec) {
nrecs++;
if (opt_m == true)
pr(" mbuf chain %p\n", m_rec);
while (m) {
nmbufs++;
m = m->m_next;
}
m_rec = m = m_rec->m_nextpkt;
}
pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);
}
}
#endif /* DDB */
/* $NetBSD: ip6_var.h,v 1.94 2024/02/09 22:08:37 andvar Exp $ */
/* $KAME: ip6_var.h,v 1.33 2000/06/11 14:59:20 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_var.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET6_IP6_VAR_H_
#define _NETINET6_IP6_VAR_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/socketvar.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
struct ip6_moptions {
if_index_t im6o_multicast_if_index; /* I/F for outgoing multicasts */
u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */
u_char im6o_multicast_loop; /* 1 => hear sends if a member */
LIST_HEAD(, in6_multi_mship) im6o_memberships;
};
/*
* Control options for outgoing packets
*/
/* Routing header related info */
struct ip6po_rhinfo {
struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */
struct route ip6po_rhi_route; /* Route to the 1st hop */
};
#define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr
#define ip6po_route ip6po_rhinfo.ip6po_rhi_route
/* Nexthop related info */
struct ip6po_nhinfo {
struct sockaddr *ip6po_nhi_nexthop;
struct route ip6po_nhi_route; /* Route to the nexthop */
};
#define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop
#define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route
struct ip6_pktopts {
int ip6po_hlim; /* Hoplimit for outgoing packets */
struct in6_pktinfo *ip6po_pktinfo; /* Outgoing IF/address information */
struct ip6po_nhinfo ip6po_nhinfo; /* Next-hop address information */
struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */
struct ip6_dest *ip6po_dest1; /* Destination options header(1st part) */
struct ip6po_rhinfo ip6po_rhinfo; /* Routing header related info. */
struct ip6_dest *ip6po_dest2; /* Destination options header(2nd part) */
int ip6po_tclass; /* traffic class */
int ip6po_minmtu; /* fragment vs PMTU discovery policy */
#define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast */
#define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */
#define IP6PO_MINMTU_ALL 1 /* always send at min MTU */
int ip6po_prefer_tempaddr; /* whether temporary addresses are
* preferred as source address */
#define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */
#define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */
#define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */
int ip6po_flags;
#if 0 /* parameters in this block are obsolete; do not reuse the values. */
#define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. */
#define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */
#endif
#define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */
};
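/*
 * A caller preparing per-packet options would typically initialize a
 * struct ip6_pktopts with ip6_initpktopts() (declared below, kernel
 * only) and override just the fields it needs before handing it to
 * ip6_output().  Hedged sketch; the surrounding variables (m, ro, inp,
 * error) are hypothetical:
 *
 *	struct ip6_pktopts opts;
 *
 *	ip6_initpktopts(&opts);
 *	opts.ip6po_hlim = 1;
 *	opts.ip6po_minmtu = IP6PO_MINMTU_ALL;
 *	error = ip6_output(m, &opts, &ro, 0, NULL, inp, NULL);
 */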
/*
* IPv6 statistics.
* Each counter is an unsigned 64-bit value.
*/
#define IP6_STAT_TOTAL 0 /* total packets received */
#define IP6_STAT_TOOSHORT 1 /* packet too short */
#define IP6_STAT_TOOSMALL 2 /* not enough data */
#define IP6_STAT_FRAGMENTS 3 /* fragments received */
#define IP6_STAT_FRAGDROPPED 4 /* frags dropped (dups, out of space) */
#define IP6_STAT_FRAGTIMEOUT 5 /* fragments timed out */
#define IP6_STAT_FRAGOVERFLOW 6 /* fragments that exceed limit */
#define IP6_STAT_FORWARD 7 /* packets forwarded */
#define IP6_STAT_CANTFORWARD 8 /* packets rcvd for unreachable dst */
#define IP6_STAT_REDIRECTSENT 9 /* packets forwarded on same net */
#define IP6_STAT_DELIVERED 10 /* datagrams delivered to upper level */
#define IP6_STAT_LOCALOUT 11 /* total IP packets generated here */
#define IP6_STAT_ODROPPED 12 /* lost packets due to nobufs, etc. */
#define IP6_STAT_REASSEMBLED 13 /* total packets reassembled ok */
#define IP6_STAT_FRAGMENTED 14 /* datagrams successfully fragmented */
#define IP6_STAT_OFRAGMENTS 15 /* output fragments created */
#define IP6_STAT_CANTFRAG 16 /* don't fragment flag was set, etc. */
#define IP6_STAT_BADOPTIONS 17 /* error in option processing */
#define IP6_STAT_NOROUTE 18 /* packets discarded due to no route */
#define IP6_STAT_BADVERS 19 /* ip6 version != 6 */
#define IP6_STAT_RAWOUT 20 /* total raw ip packets generated */
#define IP6_STAT_BADSCOPE 21 /* scope error */
#define IP6_STAT_NOTMEMBER 22 /* don't join this multicast group */
#define IP6_STAT_NXTHIST 23 /* next header histogram */
/* space for 256 counters */
#define IP6_STAT_M1 279 /* one mbuf */
#define IP6_STAT_M2M 280 /* two or more mbuf */
/* space for 32 counters */
#define IP6_STAT_MEXT1 312 /* one ext mbuf */
#define IP6_STAT_MEXT2M 313 /* two or more ext mbuf */
#define IP6_STAT_EXTHDRTOOLONG 314 /* ext hdrs are not contiguous */
#define IP6_STAT_NOGIF 315 /* no matching gif found */
#define IP6_STAT_TOOMANYHDR 316 /* discarded due to too many headers */
/*
* statistics for improvement of the source address selection
* algorithm:
* XXX: hardcoded 16 = # of ip6 multicast scope types + 1
*/
#define IP6_STAT_SOURCES_NONE 317 /* number of times that address
selection fails */
#define IP6_STAT_SOURCES_SAMEIF 318 /* number of times that an address
on the outgoing I/F is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_OTHERIF 334 /* number of times that an address on
a non-outgoing I/F is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_SAMESCOPE 350 /* number of times that an address that
has the same scope from the dest.
is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_OTHERSCOPE 366 /* number of times that an address that
has a different scope from the dest.
is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_DEPRECATED 382 /* number of times that a deprecated
address is chosen */
/* space for 16 counters */
#define IP6_STAT_FORWARD_CACHEHIT 398
#define IP6_STAT_FORWARD_CACHEMISS 399
#define IP6_STAT_FASTFORWARD 400 /* packets fast forwarded */
#define IP6_STAT_FASTFORWARDFLOWS 401 /* number of fast forward flows */
#define IP6_STAT_NOIPSEC 402 /* no matching ipsec(4) found */
#define IP6_STAT_PFILDROP_IN 403 /* dropped by pfil (PFIL_IN) */
#define IP6_STAT_PFILDROP_OUT 404 /* dropped by pfil (PFIL_OUT) */
#define IP6_STAT_IPSECDROP_IN 405 /* dropped by IPsec SP check */
#define IP6_STAT_IPSECDROP_OUT 406 /* dropped by IPsec SP check */
#define IP6_STAT_IFDROP 407 /* dropped due to interface state */
#define IP6_STAT_IDROPPED 408 /* lost packets due to nobufs, etc. */
#define IP6_STAT_TIMXCEED 409 /* hop limit exceeded */
#define IP6_STAT_TOOBIG 410 /* packet bigger than MTU */
#define IP6_STAT_RTREJECT 411 /* rejected by route */
#define IP6_NSTATS 412
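/*
 * Counters are bumped by index via ip6_statinc() (declared below,
 * kernel only).  Ranges such as IP6_STAT_NXTHIST reserve a block of
 * consecutive slots addressed by adding an offset.  Illustrative
 * sketch, not taken from this header:
 *
 *	ip6_statinc(IP6_STAT_TOOSHORT);
 *	ip6_statinc(IP6_STAT_NXTHIST + nxt);	(nxt = next header value)
 */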
#define IP6FLOW_HASHBITS 6 /* should not be a multiple of 8 */
/*
* Structure for an IPv6 flow (ip6_fastforward).
*/
struct ip6flow {
TAILQ_ENTRY(ip6flow) ip6f_list; /* next in active list */
TAILQ_ENTRY(ip6flow) ip6f_hash; /* next ip6flow in bucket */
size_t ip6f_hashidx; /* own hash index of ipflowtable[] */
struct in6_addr ip6f_dst; /* destination address */
struct in6_addr ip6f_src; /* source address */
struct route ip6f_ro; /* associated route entry */
u_int32_t ip6f_flow; /* flow (tos) */
u_quad_t ip6f_uses; /* number of uses in this period */
u_quad_t ip6f_last_uses; /* number of uses in last period */
u_quad_t ip6f_dropped; /* ENOBUFS returned by if_output */
u_quad_t ip6f_forwarded; /* packets forwarded */
u_int ip6f_timer; /* lifetime timer */
};
#ifdef _KERNEL
#include <sys/protosw.h>
#include <sys/cprng.h>
/*
 * Auxiliary attributes of incoming IPv6 packets, which are initialized when
 * we come into ip6_input().
* XXX do not make it a kitchen sink!
*/
struct ip6aux {
/* ip6.ip6_dst */
struct in6_addr ip6a_src;
uint32_t ip6a_scope_id;
int ip6a_flags;
};
/* flags passed to ip6_output as last parameter */
#define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */
#define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */
#define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */
extern u_int32_t ip6_id; /* fragment identifier */
extern int ip6_defhlim; /* default hop limit */
extern int ip6_defmcasthlim; /* default multicast hop limit */
extern int ip6_forwarding; /* act as router? */
extern int ip6_sendredirect; /* send ICMPv6 redirect? */
extern int ip6_use_deprecated; /* allow deprecated addr as source */
extern int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */
extern int ip6_v6only;
extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */
extern int ip6_maxdynroutes; /* Max # of routes created via redirect */
extern int ip6_param_rt_msg; /* How to send parameter changing rtm */
extern struct socket *ip6_mrouter; /* multicast routing daemon */
extern int ip6_sendredirects; /* send IP redirects when forwarding? */
extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */
extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */
extern int ip6_keepfaith; /* Firewall Aided Internet Translator */
extern int ip6_log_interval;
extern time_t ip6_log_time;
extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */
extern int ip6_dad_count; /* DupAddrDetectionTransmits */
extern int ip6_auto_flowlabel;
extern int ip6_auto_linklocal;
extern int ip6_anonportmin; /* minimum ephemeral port */
extern int ip6_anonportmax; /* maximum ephemeral port */
extern int ip6_lowportmin; /* minimum reserved port */
extern int ip6_lowportmax; /* maximum reserved port */
extern int ip6_prefer_tempaddr; /* whether to prefer temporary addresses
in the source address selection */
extern int ip6_use_defzone; /* whether to use the default scope zone
when unspecified */
#ifdef GATEWAY
extern int ip6_maxflows; /* maximum amount of flows for ip6ff */
extern int ip6_hashsize; /* size of hash table */
#endif
struct inpcb;
extern const struct pr_usrreqs rip6_usrreqs;
int icmp6_ctloutput(int, struct socket *, struct sockopt *);
struct mbuf;
void ip6_init(void);
const struct ip6aux *ip6_getdstifaddr(struct mbuf *);
void ip6_freepcbopts(struct ip6_pktopts *);
void ip6_freemoptions(struct ip6_moptions *);
int ip6_unknown_opt(u_int8_t *, struct mbuf *, int);
int ip6_get_prevhdr(struct mbuf *, int);
int ip6_nexthdr(struct mbuf *, int, int, int *);
int ip6_lasthdr(struct mbuf *, int, int, int *);
struct ip6_hdr;
int ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
void ip6_savecontrol(struct inpcb *, struct mbuf **, struct ip6_hdr *,
struct mbuf *);
void ip6_notify_pmtu(struct inpcb *, const struct sockaddr_in6 *,
u_int32_t *);
int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t);
void ip6_forward(struct mbuf *, int, struct ifnet *);
void ip6_mloopback(struct ifnet *, struct mbuf *,
const struct sockaddr_in6 *);
int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route *, int,
struct ip6_moptions *, struct inpcb *, struct ifnet **);
int ip6_if_output(struct ifnet * const, struct ifnet * const,
struct mbuf * const,
const struct sockaddr_in6 * const, const struct rtentry *);
int ip6_ctloutput(int, struct socket *, struct sockopt *);
int ip6_raw_ctloutput(int, struct socket *, struct sockopt *);
void ip6_initpktopts(struct ip6_pktopts *);
int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *,
struct ip6_pktopts *, kauth_cred_t, int);
void ip6_clearpktopts(struct ip6_pktopts *, int);
struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int);
int ip6_optlen(struct inpcb *);
void ip6_statinc(u_int);
int route6_input(struct mbuf **, int *, int);
void frag6_init(void);
int frag6_input(struct mbuf **, int *, int);
int ip6_reass_packet(struct mbuf **, int);
void frag6_slowtimo(void);
void frag6_fasttimo(void);
void frag6_drain(void);
void frag6_drainstub(void);
int ip6flow_init(int);
void ip6flow_poolinit(void);
struct ip6flow *ip6flow_reap(int);
void ip6flow_create(struct route *, struct mbuf *);
void ip6flow_slowtimo(void);
int ip6flow_invalidate_all(int);
void rip6_init(void);
int rip6_input(struct mbuf **, int *, int);
void *rip6_ctlinput(int, const struct sockaddr *, void *);
int rip6_ctloutput(int, struct socket *, struct sockopt *);
int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *,
struct mbuf *);
int rip6_attach(struct socket *, int);
int rip6_usrreq(struct socket *,
int, struct mbuf *, struct mbuf *, struct mbuf *, struct lwp *);
int dest6_input(struct mbuf **, int *, int);
int none_input(struct mbuf **, int *, int);
struct route;
int in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route *, struct in6_addr *,
struct ifnet **, struct psref *, struct in6_addr *);
int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct route **, struct rtentry **, bool);
int ip6_get_membership(const struct sockopt *, struct ifnet **,
struct psref *, void *, size_t);
static __inline uint32_t
ip6_randomid(void)
{
return cprng_fast32();
}
static __inline uint32_t
ip6_randomflowlabel(void)
{
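	/* The IPv6 flow label field is 20 bits wide, hence the 0xfffff mask. */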
return cprng_fast32() & 0xfffff;
}
static __inline bool
ip6_dad_enabled(void)
{
return ip6_dad_count > 0;
}
#endif /* _KERNEL */
#endif /* !_NETINET6_IP6_VAR_H_ */
/* $NetBSD: bufq_impl.h,v 1.10 2016/11/16 00:46:46 pgoyette Exp $ */
/* NetBSD: bufq.h,v 1.3 2005/03/31 11:28:53 yamt Exp */
/* NetBSD: buf.h,v 1.75 2004/09/18 16:40:11 yamt Exp */
/*-
* Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
*/
#if !defined(_KERNEL)
#error not supposed to be exposed to userland.
#endif
struct bufq_strat;
/*
* Device driver buffer queue.
*/
struct bufq_state {
void (*bq_put)(struct bufq_state *, struct buf *);
struct buf *(*bq_get)(struct bufq_state *, int);
struct buf *(*bq_cancel)(struct bufq_state *, struct buf *);
void (*bq_fini)(struct bufq_state *);
void *bq_private;
int bq_flags; /* Flags from bufq_alloc() */
struct bufq_strat *bq_strat;
};
static __inline void *bufq_private(const struct bufq_state *) __unused;
static __inline bool buf_inorder(const struct buf *, const struct buf *, int)
__unused;
#include <sys/null.h> /* for NULL */
static __inline void *
bufq_private(const struct bufq_state *bufq)
{
return bufq->bq_private;
}
/*
 * Check if two bufs are in ascending order.
 *
 * This function considers a NULL buf to sort after any non-NULL buf.
 *
 * This function returns false if the two are "the same".
*/
static __inline bool
buf_inorder(const struct buf *bp, const struct buf *bq, int sortby)
{
	KASSERT(bp != NULL || bq != NULL);

	if (bp == NULL || bq == NULL)
		return (bq == NULL);
if (sortby == BUFQ_SORT_CYLINDER) {
if (bp->b_cylinder != bq->b_cylinder)
return bp->b_cylinder < bq->b_cylinder;
else
return bp->b_rawblkno < bq->b_rawblkno;
} else
return bp->b_rawblkno < bq->b_rawblkno;
}
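/*
 * A sorting strategy typically uses buf_inorder() to find the insertion
 * point in its queue.  Minimal sketch, assuming a strategy-private
 * TAILQ (q->q_list) and sort key (q->q_sortby), both hypothetical:
 *
 *	struct buf *nbp;
 *
 *	TAILQ_FOREACH(nbp, &q->q_list, b_actq) {
 *		if (buf_inorder(bp, nbp, q->q_sortby)) {
 *			TAILQ_INSERT_BEFORE(nbp, bp, b_actq);
 *			return;
 *		}
 *	}
 *	TAILQ_INSERT_TAIL(&q->q_list, bp, b_actq);
 */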
struct bufq_strat {
const char *bs_name;
void (*bs_initfn)(struct bufq_state *);
int bs_prio;
int bs_refcnt;
SLIST_ENTRY(bufq_strat) bs_next;
};
#define BUFQ_DEFINE(name, prio, initfn) \
static struct bufq_strat bufq_strat_##name = { \
.bs_name = #name, \
.bs_prio = prio, \
.bs_initfn = initfn, \
.bs_refcnt = 0 \
};
int bufq_register(struct bufq_strat *);
int bufq_unregister(struct bufq_strat *);
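/*
 * A strategy module would normally declare itself with BUFQ_DEFINE()
 * and register/unregister from its module command handler.  Hedged
 * sketch; the strategy name, priority, and init function are
 * hypothetical:
 *
 *	BUFQ_DEFINE(mystrat, 20, bufq_mystrat_init);
 *
 *	case MODULE_CMD_INIT:
 *		return bufq_register(&bufq_strat_mystrat);
 *	case MODULE_CMD_FINI:
 *		return bufq_unregister(&bufq_strat_mystrat);
 */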
/* $NetBSD: kern_entropy.c,v 1.66 2023/10/04 20:28:06 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Entropy subsystem
*
* * Each CPU maintains a per-CPU entropy pool so that gathering
* entropy requires no interprocessor synchronization, except
* early at boot when we may be scrambling to gather entropy as
* soon as possible.
*
* - entropy_enter gathers entropy and never drops it on the
* floor, at the cost of sometimes having to do cryptography.
*
* - entropy_enter_intr gathers entropy or drops it on the
* floor, with low latency. Work to stir the pool or kick the
* housekeeping thread is scheduled in soft interrupts.
*
* * entropy_enter immediately enters into the global pool if it
* can transition to full entropy in one swell foop. Otherwise,
* it defers to a housekeeping thread that consolidates entropy,
* but only when the CPUs collectively have full entropy, in
* order to mitigate iterative-guessing attacks.
*
* * The entropy housekeeping thread continues to consolidate
* entropy even after we think we have full entropy, in case we
* are wrong, but is limited to one discretionary consolidation
* per minute, and only when new entropy is actually coming in,
* to limit performance impact.
*
* * The entropy epoch is the number that changes when we
* transition from partial entropy to full entropy, so that
* users can easily determine when to reseed. This also
* facilitates an operator explicitly causing everything to
* reseed by sysctl -w kern.entropy.consolidate=1.
*
* * Entropy depletion is available for testing (or if you're into
* that sort of thing), with sysctl -w kern.entropy.depletion=1;
* the logic to support it is small, to minimize chance of bugs.
*
* * While cold, a single global entropy pool is available for
* entering and extracting, serialized through splhigh/splx.
* The per-CPU entropy pool data structures are initialized in
* entropy_init and entropy_init_late (separated mainly for
* hysterical raisins at this point), but are not used until the
* system is warm, at which point access to the global entropy
* pool is limited to thread and softint context and serialized
* by E->lock.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_entropy.c,v 1.66 2023/10/04 20:28:06 ad Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/compat_stub.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/entropy.h>
#include <sys/errno.h>
#include <sys/evcnt.h>
#include <sys/event.h>
#include <sys/file.h>
#include <sys/intr.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/lwp.h>
#include <sys/module_hook.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/reboot.h>
#include <sys/rnd.h> /* legacy kernel API */
#include <sys/rndio.h> /* userland ioctl interface */
#include <sys/rndsource.h> /* kernel rndsource driver API */
#include <sys/select.h>
#include <sys/selinfo.h>
#include <sys/sha1.h> /* for boot seed checksum */
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/xcall.h>
#include <lib/libkern/entpool.h>
#include <machine/limits.h>
#ifdef __HAVE_CPU_COUNTER
#include <machine/cpu_counter.h>
#endif
#define MINENTROPYBYTES ENTROPY_CAPACITY
#define MINENTROPYBITS (MINENTROPYBYTES*NBBY)
#define MINSAMPLES (2*MINENTROPYBITS)
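/*
 * For orientation: assuming the usual ENTROPY_CAPACITY of 32 bytes from
 * <sys/entropy.h>, MINENTROPYBYTES = 32, MINENTROPYBITS = 256, and
 * MINSAMPLES = 512.
 */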
/*
* struct entropy_cpu
*
* Per-CPU entropy state. The pool is allocated separately
* because percpu(9) sometimes moves per-CPU objects around
* without zeroing them, which would lead to unwanted copies of
* sensitive secrets. The evcnt is allocated separately because
* evcnt(9) assumes it stays put in memory.
*/
struct entropy_cpu {
struct entropy_cpu_evcnt {
struct evcnt softint;
struct evcnt intrdrop;
struct evcnt intrtrunc;
} *ec_evcnt;
struct entpool *ec_pool;
unsigned ec_bitspending;
unsigned ec_samplespending;
bool ec_locked;
};
/*
* struct entropy_cpu_lock
*
* State for locking the per-CPU entropy state.
*/
struct entropy_cpu_lock {
int ecl_s;
long ecl_pctr;
};
/*
* struct rndsource_cpu
*
* Per-CPU rndsource state.
*/
struct rndsource_cpu {
unsigned rc_entropybits;
unsigned rc_timesamples;
unsigned rc_datasamples;
rnd_delta_t rc_timedelta;
};
/*
* entropy_global (a.k.a. E for short in this file)
*
* Global entropy state. Writes protected by the global lock.
* Some fields, marked (A), can be read outside the lock, and are
* maintained with atomic_load/store_relaxed.
*/
struct {
kmutex_t lock; /* covers all global state */
struct entpool pool; /* global pool for extraction */
unsigned bitsneeded; /* (A) needed globally */
unsigned bitspending; /* pending in per-CPU pools */
unsigned samplesneeded; /* (A) needed globally */
unsigned samplespending; /* pending in per-CPU pools */
unsigned timestamp; /* (A) time of last consolidation */
unsigned epoch; /* (A) changes when needed -> 0 */
kcondvar_t cv; /* notifies state changes */
struct selinfo selq; /* notifies needed -> 0 */
struct lwp *sourcelock; /* lock on list of sources */
kcondvar_t sourcelock_cv; /* notifies sourcelock release */
LIST_HEAD(,krndsource) sources; /* list of entropy sources */
bool consolidate; /* kick thread to consolidate */
bool seed_rndsource; /* true if seed source is attached */
bool seeded; /* true if seed file already loaded */
} entropy_global __cacheline_aligned = {
/* Fields that must be initialized when the kernel is loaded. */
.bitsneeded = MINENTROPYBITS,
.samplesneeded = MINSAMPLES,
.epoch = (unsigned)-1, /* -1 means entropy never consolidated */
.sources = LIST_HEAD_INITIALIZER(entropy_global.sources),
};
#define E (&entropy_global) /* declutter */
/* Read-mostly globals */
static struct percpu *entropy_percpu __read_mostly; /* struct entropy_cpu */
static void *entropy_sih __read_mostly; /* softint handler */
static struct lwp *entropy_lwp __read_mostly; /* housekeeping thread */
static struct krndsource seed_rndsource __read_mostly;
/*
* Event counters
*
* Must be careful with adding these because they can serve as
* side channels.
*/
static struct evcnt entropy_discretionary_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "discretionary");
EVCNT_ATTACH_STATIC(entropy_discretionary_evcnt);
static struct evcnt entropy_immediate_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "immediate");
EVCNT_ATTACH_STATIC(entropy_immediate_evcnt);
static struct evcnt entropy_partial_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "partial");
EVCNT_ATTACH_STATIC(entropy_partial_evcnt);
static struct evcnt entropy_consolidate_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "consolidate");
EVCNT_ATTACH_STATIC(entropy_consolidate_evcnt);
static struct evcnt entropy_extract_fail_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "extract fail");
EVCNT_ATTACH_STATIC(entropy_extract_fail_evcnt);
static struct evcnt entropy_request_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "request");
EVCNT_ATTACH_STATIC(entropy_request_evcnt);
static struct evcnt entropy_deplete_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "deplete");
EVCNT_ATTACH_STATIC(entropy_deplete_evcnt);
static struct evcnt entropy_notify_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "notify");
EVCNT_ATTACH_STATIC(entropy_notify_evcnt);
/* Sysctl knobs */
static bool entropy_collection = 1;
static bool entropy_depletion = 0; /* Silly! */
static const struct sysctlnode *entropy_sysctlroot;
static struct sysctllog *entropy_sysctllog;
/* Forward declarations */
static void entropy_init_cpu(void *, void *, struct cpu_info *);
static void entropy_fini_cpu(void *, void *, struct cpu_info *);
static void entropy_account_cpu(struct entropy_cpu *);
static void entropy_enter(const void *, size_t, unsigned, bool);
static bool entropy_enter_intr(const void *, size_t, unsigned, bool);
static void entropy_softintr(void *);
static void entropy_thread(void *);
static bool entropy_pending(void);
static void entropy_pending_cpu(void *, void *, struct cpu_info *);
static void entropy_do_consolidate(void);
static void entropy_consolidate_xc(void *, void *);
static void entropy_notify(void);
static int sysctl_entropy_consolidate(SYSCTLFN_ARGS);
static int sysctl_entropy_gather(SYSCTLFN_ARGS);
static void filt_entropy_read_detach(struct knote *);
static int filt_entropy_read_event(struct knote *, long);
static int entropy_request(size_t, int);
static void rnd_add_data_internal(struct krndsource *, const void *,
uint32_t, uint32_t, bool);
static void rnd_add_data_1(struct krndsource *, const void *, uint32_t,
uint32_t, bool, uint32_t, bool);
static unsigned rndsource_entropybits(struct krndsource *);
static void rndsource_entropybits_cpu(void *, void *, struct cpu_info *);
static void rndsource_to_user(struct krndsource *, rndsource_t *);
static void rndsource_to_user_est(struct krndsource *, rndsource_est_t *);
static void rndsource_to_user_est_cpu(void *, void *, struct cpu_info *);
/*
* entropy_timer()
*
* Cycle counter, time counter, or anything that changes a wee bit
* unpredictably.
*/
static inline uint32_t
entropy_timer(void)
{
struct bintime bt;
uint32_t v;
/* If we have a CPU cycle counter, use the low 32 bits. */
#ifdef __HAVE_CPU_COUNTER
if (__predict_true(cpu_hascounter()))
return cpu_counter32();
#endif /* __HAVE_CPU_COUNTER */
/* If we're cold, tough. Can't binuptime while cold. */
if (__predict_false(cold))
return 0;
/* Fold the 128 bits of binuptime into 32 bits. */
binuptime(&bt);
v = bt.frac;
v ^= bt.frac >> 32;
v ^= bt.sec;
v ^= bt.sec >> 32;
return v;
}
static void
attach_seed_rndsource(void)
{
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* First called no later than entropy_init, while we are still
* single-threaded, so no need for RUN_ONCE.
*/
if (E->seed_rndsource)
return;
rnd_attach_source(&seed_rndsource, "seed", RND_TYPE_UNKNOWN,
RND_FLAG_COLLECT_VALUE);
E->seed_rndsource = true;
}
/*
* entropy_init()
*
* Initialize the entropy subsystem. Panic on failure.
*
* Requires percpu(9) and sysctl(9) to be initialized. Must run
* while cold.
*/
static void
entropy_init(void)
{
uint32_t extra[2];
struct krndsource *rs;
unsigned i = 0;
KASSERT(cold);
/* Grab some cycle counts early at boot. */
extra[i++] = entropy_timer();
/* Run the entropy pool cryptography self-test. */
if (entpool_selftest() == -1)
panic("entropy pool crypto self-test failed");
/* Create the sysctl directory. */
sysctl_createv(&entropy_sysctllog, 0, NULL, &entropy_sysctlroot,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "entropy",
SYSCTL_DESCR("Entropy (random number sources) options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
/* Create the sysctl knobs. */
/* XXX These shouldn't be writable at securelevel>0. */
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "collection",
SYSCTL_DESCR("Automatically collect entropy from hardware"),
NULL, 0, &entropy_collection, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "depletion",
SYSCTL_DESCR("`Deplete' entropy pool when observed"),
NULL, 0, &entropy_depletion, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "consolidate",
SYSCTL_DESCR("Trigger entropy consolidation now"),
sysctl_entropy_consolidate, 0, NULL, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "gather",
SYSCTL_DESCR("Trigger entropy gathering from sources now"),
sysctl_entropy_gather, 0, NULL, 0, CTL_CREATE, CTL_EOL);
/* XXX These should maybe not be readable at securelevel>0. */
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"needed",
SYSCTL_DESCR("Systemwide entropy deficit (bits of entropy)"),
NULL, 0, &E->bitsneeded, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"pending",
SYSCTL_DESCR("Number of bits of entropy pending on CPUs"),
NULL, 0, &E->bitspending, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"samplesneeded",
SYSCTL_DESCR("Systemwide entropy deficit (samples)"),
NULL, 0, &E->samplesneeded, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"samplespending",
SYSCTL_DESCR("Number of samples pending on CPUs"),
NULL, 0, &E->samplespending, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"epoch", SYSCTL_DESCR("Entropy epoch"),
NULL, 0, &E->epoch, 0, CTL_CREATE, CTL_EOL);
/* Initialize the global state for multithreaded operation. */
mutex_init(&E->lock, MUTEX_DEFAULT, IPL_SOFTSERIAL);
cv_init(&E->cv, "entropy");
selinit(&E->selq);
cv_init(&E->sourcelock_cv, "entsrclock");
/* Make sure the seed source is attached. */
attach_seed_rndsource();
/* Note if the bootloader didn't provide a seed. */
if (!E->seeded)
aprint_debug("entropy: no seed from bootloader\n");
/* Allocate the per-CPU records for all early entropy sources. */
LIST_FOREACH(rs, &E->sources, list)
rs->state = percpu_alloc(sizeof(struct rndsource_cpu));
/* Allocate and initialize the per-CPU state. */
entropy_percpu = percpu_create(sizeof(struct entropy_cpu),
entropy_init_cpu, entropy_fini_cpu, NULL);
/* Enter the boot cycle count to get started. */
extra[i++] = entropy_timer();
KASSERT(i == __arraycount(extra));
entropy_enter(extra, sizeof extra, /*nbits*/0, /*count*/false);
explicit_memset(extra, 0, sizeof extra);
}
/*
* entropy_init_late()
*
* Late initialization. Panic on failure.
*
* Requires CPUs to have been detected and LWPs to have started.
* Must run while cold.
*/
static void
entropy_init_late(void)
{
int error;
KASSERT(cold);
/*
* Establish the softint at the highest softint priority level.
* Must happen after CPU detection.
*/
entropy_sih = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
&entropy_softintr, NULL);
if (entropy_sih == NULL)
panic("unable to establish entropy softint");
/*
* Create the entropy housekeeping thread. Must happen after
* lwpinit.
*/
error = kthread_create(PRI_NONE, KTHREAD_MPSAFE|KTHREAD_TS, NULL,
entropy_thread, NULL, &entropy_lwp, "entbutler");
if (error)
panic("unable to create entropy housekeeping thread: %d",
error);
}
/*
* entropy_init_cpu(ptr, cookie, ci)
*
* percpu(9) constructor for per-CPU entropy pool.
*/
static void
entropy_init_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
const char *cpuname;
ec->ec_evcnt = kmem_alloc(sizeof(*ec->ec_evcnt), KM_SLEEP);
ec->ec_pool = kmem_zalloc(sizeof(*ec->ec_pool), KM_SLEEP);
ec->ec_bitspending = 0;
ec->ec_samplespending = 0;
ec->ec_locked = false;
/* XXX ci_cpuname may not be initialized early enough. */
cpuname = ci->ci_cpuname[0] == '\0' ? "cpu0" : ci->ci_cpuname;
evcnt_attach_dynamic(&ec->ec_evcnt->softint, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy softint");
evcnt_attach_dynamic(&ec->ec_evcnt->intrdrop, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy intrdrop");
evcnt_attach_dynamic(&ec->ec_evcnt->intrtrunc, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy intrtrunc");
}
/*
* entropy_fini_cpu(ptr, cookie, ci)
*
* percpu(9) destructor for per-CPU entropy pool.
*/
static void
entropy_fini_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
/*
* Zero any lingering data. Disclosure of the per-CPU pool
* shouldn't retroactively affect the security of any keys
* generated, because entpool(9) erases whatever we have just
* drawn out of any pool, but better safe than sorry.
*/
explicit_memset(ec->ec_pool, 0, sizeof(*ec->ec_pool));
evcnt_detach(&ec->ec_evcnt->intrtrunc);
evcnt_detach(&ec->ec_evcnt->intrdrop);
evcnt_detach(&ec->ec_evcnt->softint);
kmem_free(ec->ec_pool, sizeof(*ec->ec_pool));
kmem_free(ec->ec_evcnt, sizeof(*ec->ec_evcnt));
}
/*
* ec = entropy_cpu_get(&lock)
* entropy_cpu_put(&lock, ec)
*
* Lock and unlock the per-CPU entropy state. This only prevents
* access on the same CPU -- by hard interrupts, by soft
* interrupts, or by other threads.
*
* Blocks soft interrupts and preemption altogether; doesn't block
* hard interrupts, but causes samples in hard interrupts to be
* dropped.
*/
static struct entropy_cpu *
entropy_cpu_get(struct entropy_cpu_lock *lock)
{
struct entropy_cpu *ec;
ec = percpu_getref(entropy_percpu);
lock->ecl_s = splsoftserial();
KASSERT(!ec->ec_locked);
ec->ec_locked = true;
lock->ecl_pctr = lwp_pctr();
__insn_barrier();
return ec;
}
static void
entropy_cpu_put(struct entropy_cpu_lock *lock, struct entropy_cpu *ec)
{

	KASSERT(ec == percpu_getptr_remote(entropy_percpu, curcpu()));
	KASSERT(ec->ec_locked);
__insn_barrier();
KASSERT(lock->ecl_pctr == lwp_pctr());
ec->ec_locked = false;
splx(lock->ecl_s);
percpu_putref(entropy_percpu);
}
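/*
 * Usage sketch for the pair above; the buffer and length are
 * illustrative only.  The caller must not block between get and put
 * (lwp_pctr() is asserted unchanged):
 *
 *	struct entropy_cpu_lock lock;
 *	struct entropy_cpu *ec;
 *
 *	ec = entropy_cpu_get(&lock);
 *	entpool_enter(ec->ec_pool, buf, len);
 *	entropy_cpu_put(&lock, ec);
 */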
/*
* entropy_seed(seed)
*
* Seed the entropy pool with seed. Meant to be called as early
* as possible by the bootloader; may be called before or after
* entropy_init. Must be called before system reaches userland.
* Must be called in thread or soft interrupt context, not in hard
* interrupt context. Must be called at most once.
*
* Overwrites the seed in place. Caller may then free the memory.
*/
static void
entropy_seed(rndsave_t *seed)
{
SHA1_CTX ctx;
uint8_t digest[SHA1_DIGEST_LENGTH];
bool seeded;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* Verify the checksum. If the checksum fails, take the data
* but ignore the entropy estimate -- the file may have been
* incompletely written with garbage, which is harmless to add
* but may not be as unpredictable as alleged.
*/
SHA1Init(&ctx);
SHA1Update(&ctx, (const void *)&seed->entropy, sizeof(seed->entropy));
SHA1Update(&ctx, seed->data, sizeof(seed->data));
SHA1Final(digest, &ctx);
CTASSERT(sizeof(seed->digest) == sizeof(digest));
if (!consttime_memequal(digest, seed->digest, sizeof(digest))) {
printf("entropy: invalid seed checksum\n");
seed->entropy = 0;
}
explicit_memset(&ctx, 0, sizeof ctx);
explicit_memset(digest, 0, sizeof digest);
/*
* If the entropy is insensibly large, try byte-swapping.
* Otherwise assume the file is corrupted and act as though it
* has zero entropy.
*/
if (howmany(seed->entropy, NBBY) > sizeof(seed->data)) {
seed->entropy = bswap32(seed->entropy);
if (howmany(seed->entropy, NBBY) > sizeof(seed->data))
seed->entropy = 0;
}
/* Make sure the seed source is attached. */
attach_seed_rndsource();
/* Test and set E->seeded. */
seeded = E->seeded;
E->seeded = (seed->entropy > 0);
/*
 * If we've already been seeded, we may be re-entering the same seed
* (e.g., bootloader vs module init, or something). No harm in
* entering it twice, but it contributes no additional entropy.
*/
if (seeded) {
printf("entropy: double-seeded by bootloader\n");
seed->entropy = 0;
} else {
printf("entropy: entering seed from bootloader"
" with %u bits of entropy\n", (unsigned)seed->entropy);
}
/* Enter it into the pool and promptly zero it. */
rnd_add_data(&seed_rndsource, seed->data, sizeof(seed->data),
seed->entropy);
explicit_memset(seed, 0, sizeof(*seed));
}
/*
* entropy_bootrequest()
*
* Request entropy from all sources at boot, once config is
* complete and interrupts are running but we are still cold.
*/
void
entropy_bootrequest(void)
{
int error;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* Request enough to satisfy the maximum entropy shortage.
* This is harmless overkill if the bootloader provided a seed.
*/
error = entropy_request(MINENTROPYBYTES, ENTROPY_WAIT);
KASSERTMSG(error == 0, "error=%d", error);
}
/*
* entropy_epoch()
*
* Returns the current entropy epoch. If this changes, you should
 * reseed.  A value of -1 means the system has not yet reached full
 * entropy or been explicitly consolidated, and it never reverts to
 * -1 after that.  Never zero, so you can always use zero as an uninitialized
* sentinel value meaning `reseed ASAP'.
*
* Usage model:
*
* struct foo {
* struct crypto_prng prng;
* unsigned epoch;
* } *foo;
*
* unsigned epoch = entropy_epoch();
* if (__predict_false(epoch != foo->epoch)) {
* uint8_t seed[32];
* if (entropy_extract(seed, sizeof seed, 0) != 0)
* warn("no entropy");
* crypto_prng_reseed(&foo->prng, seed, sizeof seed);
* foo->epoch = epoch;
* }
*/
unsigned
entropy_epoch(void)
{
/*
* Unsigned int, so no need for seqlock for an atomic read, but
* make sure we read it afresh each time.
*/
return atomic_load_relaxed(&E->epoch);
}
/*
* entropy_ready()
*
* True if the entropy pool has full entropy.
*/
bool
entropy_ready(void)
{
return atomic_load_relaxed(&E->bitsneeded) == 0;
}
/*
* entropy_account_cpu(ec)
*
* Consider whether to consolidate entropy into the global pool
* after we just added some into the current CPU's pending pool.
*
* - If this CPU can provide enough entropy now, do so.
*
* - If this and whatever else is available on other CPUs can
* provide enough entropy, kick the consolidation thread.
*
* - Otherwise, do as little as possible, except maybe consolidate
* entropy at most once a minute.
*
* Caller must be bound to a CPU and therefore have exclusive
* access to ec. Will acquire and release the global lock.
*/
static void
entropy_account_cpu(struct entropy_cpu *ec)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec0;
unsigned bitsdiff, samplesdiff;
	KASSERT(!cpu_intr_p());
	KASSERT(!cold);
	KASSERT(curlwp->l_pflag & LP_BOUND);
/*
* If there's no entropy needed, and entropy has been
* consolidated in the last minute, do nothing.
*/
	if (__predict_true(atomic_load_relaxed(&E->bitsneeded) == 0) &&
	    __predict_true(!atomic_load_relaxed(&entropy_depletion)) &&
__predict_true((time_uptime - E->timestamp) <= 60))
return;
/*
* Consider consolidation, under the global lock and with the
* per-CPU state locked.
*/
mutex_enter(&E->lock);
	ec0 = entropy_cpu_get(&lock);
	KASSERT(ec0 == ec);

	if (ec->ec_bitspending == 0 && ec->ec_samplespending == 0) {
/* Raced with consolidation xcall. Nothing to do. */
} else if (E->bitsneeded != 0 && E->bitsneeded <= ec->ec_bitspending) {
/*
* If we have not yet attained full entropy but we can
* now, do so. This way we disseminate entropy
* promptly when it becomes available early at boot;
* otherwise we leave it to the entropy consolidation
* thread, which is rate-limited to mitigate side
* channels and abuse.
*/
uint8_t buf[ENTPOOL_CAPACITY];
/* Transfer from the local pool to the global pool. */
entpool_extract(ec->ec_pool, buf, sizeof buf);
entpool_enter(&E->pool, buf, sizeof buf);
		atomic_store_relaxed(&ec->ec_bitspending, 0);
		atomic_store_relaxed(&ec->ec_samplespending, 0);
atomic_store_relaxed(&E->bitsneeded, 0);
atomic_store_relaxed(&E->samplesneeded, 0);
/* Notify waiters that we now have full entropy. */
entropy_notify();
entropy_immediate_evcnt.ev_count++;
} else {
/* Determine how much we can add to the global pool. */
KASSERTMSG(E->bitspending <= MINENTROPYBITS,
"E->bitspending=%u", E->bitspending);
bitsdiff = MIN(ec->ec_bitspending,
MINENTROPYBITS - E->bitspending);
KASSERTMSG(E->samplespending <= MINSAMPLES,
"E->samplespending=%u", E->samplespending);
samplesdiff = MIN(ec->ec_samplespending,
MINSAMPLES - E->samplespending);
/*
* This should make a difference unless we are already
* saturated.
*/
KASSERTMSG((bitsdiff || samplesdiff ||
E->bitspending == MINENTROPYBITS ||
E->samplespending == MINSAMPLES),
"bitsdiff=%u E->bitspending=%u ec->ec_bitspending=%u"
"samplesdiff=%u E->samplespending=%u"
" ec->ec_samplespending=%u"
" minentropybits=%u minsamples=%u",
bitsdiff, E->bitspending, ec->ec_bitspending,
samplesdiff, E->samplespending, ec->ec_samplespending,
(unsigned)MINENTROPYBITS, (unsigned)MINSAMPLES);
/* Add to the global, subtract from the local. */
E->bitspending += bitsdiff;
KASSERTMSG(E->bitspending <= MINENTROPYBITS,
"E->bitspending=%u", E->bitspending);
atomic_store_relaxed(&ec->ec_bitspending,
ec->ec_bitspending - bitsdiff);
E->samplespending += samplesdiff;
KASSERTMSG(E->samplespending <= MINSAMPLES,
"E->samplespending=%u", E->samplespending);
atomic_store_relaxed(&ec->ec_samplespending,
ec->ec_samplespending - samplesdiff);
/* One or the other must have gone up from zero. */
		KASSERT(E->bitspending || E->samplespending);

		if (E->bitsneeded <= E->bitspending ||
E->samplesneeded <= E->samplespending) {
/*
* Enough bits or at least samples between all
* the per-CPU pools. Leave a note for the
* housekeeping thread to consolidate entropy
* next time it wakes up -- and wake it up if
* this is the first time, to speed things up.
*
* If we don't need any entropy, this doesn't
* mean much, but it is the only time we ever
* gather additional entropy in case the
* accounting has been overly optimistic. This
* happens at most once a minute, so there's
* negligible performance cost.
*/
E->consolidate = true;
			if (E->epoch == (unsigned)-1)
				cv_broadcast(&E->cv);
			if (E->bitsneeded == 0)
				entropy_discretionary_evcnt.ev_count++;
} else {
/* Can't get full entropy. Keep gathering. */
entropy_partial_evcnt.ev_count++;
}
}
entropy_cpu_put(&lock, ec);
mutex_exit(&E->lock);
}
/*
* entropy_enter_early(buf, len, nbits)
*
* Do entropy bookkeeping globally, before we have established
* per-CPU pools. Enter directly into the global pool in the hope
* that we enter enough before the first entropy_extract to thwart
* iterative-guessing attacks; entropy_extract will warn if not.
*/
static void
entropy_enter_early(const void *buf, size_t len, unsigned nbits)
{
bool notify = false;
int s;
KASSERT(cold);
/*
* We're early at boot before multithreading and multi-CPU
* operation, and we don't have softints yet to defer
* processing from interrupt context, so we have to enter the
* samples directly into the global pool. But interrupts may
* be enabled, and we enter this path from interrupt context,
* so block interrupts until we're done.
*/
s = splhigh();
/* Enter it into the pool. */
entpool_enter(&E->pool, buf, len);
/*
* Decide whether to notify reseed -- we will do so if either:
* (a) we transition from partial entropy to full entropy, or
* (b) we get a batch of full entropy all at once.
* We don't count timing samples because we assume, while cold,
* there's not likely to be much jitter yet.
*/
notify |= (E->bitsneeded && E->bitsneeded <= nbits);
notify |= (nbits >= MINENTROPYBITS);
/*
* Subtract from the needed count and notify if appropriate.
* We don't count samples here because entropy_timer might
* still be returning zero at this point if there's no CPU
* cycle counter.
*/
E->bitsneeded -= MIN(E->bitsneeded, nbits);
if (notify) {
entropy_notify();
entropy_immediate_evcnt.ev_count++;
}
splx(s);
}
/*
* entropy_enter(buf, len, nbits, count)
*
* Enter len bytes of data from buf into the system's entropy
* pool, stirring as necessary when the internal buffer fills up.
* nbits is a lower bound on the number of bits of entropy in the
* process that led to this sample.
*/
static void
entropy_enter(const void *buf, size_t len, unsigned nbits, bool count)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
unsigned bitspending, samplespending;
int bound;
KASSERTMSG(!cpu_intr_p(),
"use entropy_enter_intr from interrupt context");
KASSERTMSG(howmany(nbits, NBBY) <= len,
"impossible entropy rate: %u bits in %zu-byte string", nbits, len);
/*
* If we're still cold, just use entropy_enter_early to put
* samples directly into the global pool.
*/
if (__predict_false(cold)) {
entropy_enter_early(buf, len, nbits);
return;
}
/*
* Bind ourselves to the current CPU so we don't switch CPUs
* between entering data into the current CPU's pool (and
* updating the pending count) and transferring it to the
* global pool in entropy_account_cpu.
*/
bound = curlwp_bind();
/*
* With the per-CPU state locked, enter into the per-CPU pool
* and count up what we can add.
*
* We don't count samples while cold because entropy_timer
* might still be returning zero if there's no CPU cycle
* counter.
*/
ec = entropy_cpu_get(&lock);
entpool_enter(ec->ec_pool, buf, len);
bitspending = ec->ec_bitspending;
bitspending += MIN(MINENTROPYBITS - bitspending, nbits);
atomic_store_relaxed(&ec->ec_bitspending, bitspending);
samplespending = ec->ec_samplespending;
if (__predict_true(count)) {
samplespending += MIN(MINSAMPLES - samplespending, 1);
atomic_store_relaxed(&ec->ec_samplespending, samplespending);
}
entropy_cpu_put(&lock, ec);
/* Consolidate globally if appropriate based on what we added. */
	if (bitspending > 0 || samplespending >= MINSAMPLES)
		entropy_account_cpu(ec);

	curlwp_bindx(bound);
}
/*
* entropy_enter_intr(buf, len, nbits, count)
*
* Enter up to len bytes of data from buf into the system's
* entropy pool without stirring. nbits is a lower bound on the
* number of bits of entropy in the process that led to this
* sample. If the sample could be entered completely, assume
* nbits of entropy pending; otherwise assume none, since we don't
* know whether some parts of the sample are constant, for
* instance. Schedule a softint to stir the entropy pool if
* needed. Return true if used fully, false if truncated at all.
*
* Using this in thread or softint context with no spin locks held
* will work, but you might as well use entropy_enter in that
* case.
*/
static bool
entropy_enter_intr(const void *buf, size_t len, unsigned nbits, bool count)
{
struct entropy_cpu *ec;
bool fullyused = false;
uint32_t bitspending, samplespending;
int s;
KASSERTMSG(howmany(nbits, NBBY) <= len,
"impossible entropy rate: %u bits in %zu-byte string", nbits, len);
/*
* If we're still cold, just use entropy_enter_early to put
* samples directly into the global pool.
*/
if (__predict_false(cold)) {
entropy_enter_early(buf, len, nbits);
return true;
}
/*
* In case we were called in thread or interrupt context with
* interrupts unblocked, block soft interrupts up to
* IPL_SOFTSERIAL. This way logic that is safe in interrupt
* context or under a spin lock is also safe in less
* restrictive contexts.
*/
s = splsoftserial();
/*
* Acquire the per-CPU state. If someone is in the middle of
* using it, drop the sample. Otherwise, take the lock so that
* higher-priority interrupts will drop their samples.
*/
ec = percpu_getref(entropy_percpu);
if (ec->ec_locked) {
ec->ec_evcnt->intrdrop.ev_count++;
goto out0;
}
ec->ec_locked = true;
__insn_barrier();
/*
* Enter as much as we can into the per-CPU pool. If it was
* truncated, schedule a softint to stir the pool and stop.
*/
if (!entpool_enter_nostir(ec->ec_pool, buf, len)) {
		if (__predict_true(!cold))
			softint_schedule(entropy_sih);
ec->ec_evcnt->intrtrunc.ev_count++;
goto out1;
}
fullyused = true;
/*
* Count up what we can contribute.
*
* We don't count samples while cold because entropy_timer
* might still be returning zero if there's no CPU cycle
* counter.
*/
bitspending = ec->ec_bitspending;
bitspending += MIN(MINENTROPYBITS - bitspending, nbits);
	atomic_store_relaxed(&ec->ec_bitspending, bitspending);
	if (__predict_true(count)) {
samplespending = ec->ec_samplespending;
samplespending += MIN(MINSAMPLES - samplespending, 1);
atomic_store_relaxed(&ec->ec_samplespending, samplespending);
}
/* Schedule a softint if we added anything and it matters. */
	if (__predict_false(atomic_load_relaxed(&E->bitsneeded) ||
		atomic_load_relaxed(&entropy_depletion)) &&
	    (nbits != 0 || count) &&
__predict_true(!cold))
softint_schedule(entropy_sih);
out1: /* Release the per-CPU state. */
KASSERT(ec->ec_locked);
__insn_barrier();
ec->ec_locked = false;
out0: percpu_putref(entropy_percpu);
splx(s);
return fullyused;
}
/*
* entropy_softintr(cookie)
*
* Soft interrupt handler for entering entropy. Takes care of
* stirring the local CPU's entropy pool if it filled up during
* hard interrupts, and promptly crediting entropy from the local
* CPU's entropy pool to the global entropy pool if needed.
*/
static void
entropy_softintr(void *cookie)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
unsigned bitspending, samplespending;
/*
* With the per-CPU state locked, stir the pool if necessary
* and determine if there's any pending entropy on this CPU to
* account globally.
*/
ec = entropy_cpu_get(&lock);
ec->ec_evcnt->softint.ev_count++;
entpool_stir(ec->ec_pool);
bitspending = ec->ec_bitspending;
samplespending = ec->ec_samplespending;
entropy_cpu_put(&lock, ec);
/* Consolidate globally if appropriate based on what we added. */
if (bitspending > 0 || samplespending >= MINSAMPLES)
entropy_account_cpu(ec);
}
/*
* entropy_thread(cookie)
*
* Handle any asynchronous entropy housekeeping.
*/
static void
entropy_thread(void *cookie)
{
bool consolidate;
#ifndef _RUMPKERNEL /* XXX rump starts threads before cold */
KASSERT(!cold);
#endif
for (;;) {
/*
* Wait until there's full entropy somewhere among the
* CPUs, as confirmed at most once per minute, or
* someone wants to consolidate.
*/
if (entropy_pending()) {
consolidate = true;
} else {
mutex_enter(&E->lock);
if (!E->consolidate)
cv_timedwait(&E->cv, &E->lock, 60*hz);
consolidate = E->consolidate;
E->consolidate = false;
mutex_exit(&E->lock);
}
if (consolidate) {
/* Do it. */
entropy_do_consolidate();
/* Mitigate abuse. */
kpause("entropy", false, hz, NULL);
}
}
}
struct entropy_pending_count {
uint32_t bitspending;
uint32_t samplespending;
};
/*
* entropy_pending()
*
* True if enough bits or samples are pending on other CPUs to
* warrant consolidation.
*/
static bool
entropy_pending(void)
{
struct entropy_pending_count count = { 0, 0 }, *C = &count;
percpu_foreach(entropy_percpu, &entropy_pending_cpu, C);
return C->bitspending >= MINENTROPYBITS ||
C->samplespending >= MINSAMPLES;
}
static void
entropy_pending_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
struct entropy_pending_count *C = cookie;
uint32_t cpu_bitspending;
uint32_t cpu_samplespending;
cpu_bitspending = atomic_load_relaxed(&ec->ec_bitspending);
cpu_samplespending = atomic_load_relaxed(&ec->ec_samplespending);
C->bitspending += MIN(MINENTROPYBITS - C->bitspending,
cpu_bitspending);
C->samplespending += MIN(MINSAMPLES - C->samplespending,
cpu_samplespending);
}
/*
* entropy_do_consolidate()
*
* Issue a cross-call to gather entropy on all CPUs and advance
* the entropy epoch.
*/
static void
entropy_do_consolidate(void)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
struct entpool pool;
uint8_t buf[ENTPOOL_CAPACITY];
unsigned bitsdiff, samplesdiff;
uint64_t ticket;
KASSERT(!cold);
ASSERT_SLEEPABLE();
/* Gather entropy on all CPUs into a temporary pool. */
memset(&pool, 0, sizeof pool);
ticket = xc_broadcast(0, &entropy_consolidate_xc, &pool, NULL);
xc_wait(ticket);
/* Acquire the lock to notify waiters. */
mutex_enter(&E->lock);
/* Count another consolidation. */
entropy_consolidate_evcnt.ev_count++;
/* Note when we last consolidated, i.e. now. */
E->timestamp = time_uptime;
/* Mix what we gathered into the global pool. */
entpool_extract(&pool, buf, sizeof buf);
entpool_enter(&E->pool, buf, sizeof buf);
explicit_memset(&pool, 0, sizeof pool);
/* Count the entropy that was gathered. */
bitsdiff = MIN(E->bitsneeded, E->bitspending);
atomic_store_relaxed(&E->bitsneeded, E->bitsneeded - bitsdiff);
E->bitspending -= bitsdiff;
if (__predict_false(E->bitsneeded > 0) && bitsdiff != 0) {
if ((boothowto & AB_DEBUG) != 0 &&
ratecheck(&lasttime, &interval)) {
printf("WARNING:"
" consolidating less than full entropy\n");
}
}
samplesdiff = MIN(E->samplesneeded, E->samplespending);
atomic_store_relaxed(&E->samplesneeded,
E->samplesneeded - samplesdiff);
E->samplespending -= samplesdiff;
/* Advance the epoch and notify waiters. */
entropy_notify();
/* Release the lock. */
mutex_exit(&E->lock);
}
/*
* entropy_consolidate_xc(vpool, arg2)
*
* Extract output from the local CPU's input pool and enter it
* into a temporary pool passed as vpool.
*/
static void
entropy_consolidate_xc(void *vpool, void *arg2 __unused)
{
struct entpool *pool = vpool;
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
uint8_t buf[ENTPOOL_CAPACITY];
uint32_t extra[7];
unsigned i = 0;
/* Grab CPU number and cycle counter to mix extra into the pool. */
extra[i++] = cpu_number();
extra[i++] = entropy_timer();
/*
* With the per-CPU state locked, extract from the per-CPU pool
* and count it as no longer pending.
*/
ec = entropy_cpu_get(&lock);
extra[i++] = entropy_timer();
entpool_extract(ec->ec_pool, buf, sizeof buf);
atomic_store_relaxed(&ec->ec_bitspending, 0);
atomic_store_relaxed(&ec->ec_samplespending, 0);
extra[i++] = entropy_timer();
entropy_cpu_put(&lock, ec);
extra[i++] = entropy_timer();
/*
* Copy over statistics, and enter the per-CPU extract and the
* extra timing into the temporary pool, under the global lock.
*/
mutex_enter(&E->lock);
extra[i++] = entropy_timer();
entpool_enter(pool, buf, sizeof buf);
explicit_memset(buf, 0, sizeof buf);
extra[i++] = entropy_timer();
KASSERT(i == __arraycount(extra));
entpool_enter(pool, extra, sizeof extra);
explicit_memset(extra, 0, sizeof extra);
mutex_exit(&E->lock);
}
/*
* entropy_notify()
*
* Caller just contributed entropy to the global pool. Advance
* the entropy epoch and notify waiters.
*
* Caller must hold the global entropy lock.
*/
static void
entropy_notify(void)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
static bool ready = false, besteffort = false;
unsigned epoch;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
/*
* If this is the first time, print a message to the console
* that we're ready so operators can compare it to the timing
* of other events.
*
* If we didn't get full entropy from reliable sources, report
* instead that we are running on fumes with best effort. (If
* we ever do get full entropy after that, print the ready
* message once.)
*/
if (__predict_false(!ready)) {
if (E->bitsneeded == 0) {
printf("entropy: ready\n");
ready = true;
} else if (E->samplesneeded == 0 && !besteffort) {
printf("entropy: best effort\n");
besteffort = true;
}
}
/* Set the epoch; roll over from UINTMAX-1 to 1. */
if (__predict_true(!atomic_load_relaxed(&entropy_depletion)) ||
ratecheck(&lasttime, &interval)) {
epoch = E->epoch + 1;
if (epoch == 0 || epoch == (unsigned)-1)
epoch = 1;
atomic_store_relaxed(&E->epoch, epoch);
}
KASSERT(E->epoch != (unsigned)-1);
/* Notify waiters. */
if (__predict_true(!cold)) {
cv_broadcast(&E->cv);
selnotify(&E->selq, POLLIN|POLLRDNORM, NOTE_SUBMIT);
}
/* Count another notification. */
entropy_notify_evcnt.ev_count++;
}
/*
* entropy_consolidate()
*
* Trigger entropy consolidation and wait for it to complete.
*
* This should be used sparingly, not periodically -- requiring
* conscious intervention by the operator or a clear policy
* decision. Otherwise, the kernel will automatically consolidate
* when enough entropy has been gathered into per-CPU pools to
* transition to full entropy.
*/
void
entropy_consolidate(void)
{
uint64_t ticket;
int error;
KASSERT(!cold);
ASSERT_SLEEPABLE();
mutex_enter(&E->lock);
ticket = entropy_consolidate_evcnt.ev_count;
E->consolidate = true;
cv_broadcast(&E->cv);
while (ticket == entropy_consolidate_evcnt.ev_count) {
error = cv_wait_sig(&E->cv, &E->lock);
if (error)
break;
}
mutex_exit(&E->lock);
}
/*
* sysctl -w kern.entropy.consolidate=1
*
* Trigger entropy consolidation and wait for it to complete.
* Writable only by superuser. This, writing to /dev/random, and
* ioctl(RNDADDDATA) are the only ways for the system to
* consolidate entropy if the operator knows something the kernel
* doesn't about how unpredictable the pending entropy pools are.
*/
static int
sysctl_entropy_consolidate(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg)
entropy_consolidate();
return error;
}
/*
* sysctl -w kern.entropy.gather=1
*
* Trigger gathering entropy from all on-demand sources, and wait
* for synchronous sources (but not asynchronous sources) to
* complete. Writable only by superuser.
*/
static int
sysctl_entropy_gather(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg) {
mutex_enter(&E->lock);
error = entropy_request(ENTROPY_CAPACITY,
ENTROPY_WAIT|ENTROPY_SIG);
mutex_exit(&E->lock);
}
return 0;
}
/*
* entropy_extract(buf, len, flags)
*
* Extract len bytes from the global entropy pool into buf.
*
* Caller MUST NOT expose these bytes directly -- must use them
* ONLY to seed a cryptographic pseudorandom number generator
* (`CPRNG'), a.k.a. deterministic random bit generator (`DRBG'),
* and then erase them. entropy_extract does not, on its own,
* provide backtracking resistance -- it must be combined with a
* PRNG/DRBG that does.
*
* This may be used very early at boot, before even entropy_init
* has been called.
*
* You generally shouldn't use this directly -- use cprng(9)
* instead.
*
* Flags may have:
*
* ENTROPY_WAIT Wait for entropy if not available yet.
* ENTROPY_SIG Allow interruption by a signal during wait.
* ENTROPY_HARDFAIL Either fill the buffer with full entropy,
* or fail without filling it at all.
*
* Return zero on success, or error on failure:
*
* EWOULDBLOCK No entropy and ENTROPY_WAIT not set.
* EINTR/ERESTART No entropy, ENTROPY_SIG set, and interrupted.
*
* If ENTROPY_WAIT is set, allowed only in thread context. If
* ENTROPY_WAIT is not set, allowed also in softint context -- may
* sleep on an adaptive lock up to IPL_SOFTSERIAL. Forbidden in
* hard interrupt context.
*/
int
entropy_extract(void *buf, size_t len, int flags)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
bool printed = false;
int s = -1/*XXXGCC*/, error;
if (ISSET(flags, ENTROPY_WAIT)) {
ASSERT_SLEEPABLE();
KASSERT(!cold);
}
/* Refuse to operate in interrupt context. */
KASSERT(!cpu_intr_p());
/*
* If we're cold, we are only contending with interrupts on the
* current CPU, so block them. Otherwise, we are _not_
* contending with interrupts on the current CPU, but we are
* contending with other threads, to exclude them with a mutex.
*/
if (__predict_false(cold))
s = splhigh();
else
mutex_enter(&E->lock);
/* Wait until there is enough entropy in the system. */
error = 0;
if (E->bitsneeded > 0 && E->samplesneeded == 0) {
/*
* We don't have full entropy from reliable sources,
* but we gathered a plausible number of samples from
* other sources such as timers. Try asking for more
* from any sources we can, but don't worry if it
* fails -- best effort.
*/
(void)entropy_request(ENTROPY_CAPACITY, flags);
} else while (E->bitsneeded > 0 && E->samplesneeded > 0) {
/* Ask for more, synchronously if possible. */
error = entropy_request(len, flags);
if (error)
break;
/* If we got enough, we're done. */
if (E->bitsneeded == 0 || E->samplesneeded == 0) {
KASSERT(error == 0);
break;
}
/* If not waiting, stop here. */
if (!ISSET(flags, ENTROPY_WAIT)) {
error = EWOULDBLOCK;
break;
}
/* Wait for some entropy to come in and try again. */
KASSERT(!cold);
if (!printed) {
printf("entropy: pid %d (%s) waiting for entropy(7)\n",
curproc->p_pid, curproc->p_comm);
printed = true;
}
if (ISSET(flags, ENTROPY_SIG)) {
error = cv_timedwait_sig(&E->cv, &E->lock, hz);
if (error && error != EWOULDBLOCK)
break;
} else {
cv_timedwait(&E->cv, &E->lock, hz);
}
}
/*
* Count failure -- but fill the buffer nevertheless, unless
* the caller specified ENTROPY_HARDFAIL.
*/
if (error) {
if (ISSET(flags, ENTROPY_HARDFAIL))
goto out;
entropy_extract_fail_evcnt.ev_count++;
}
/*
* Report a warning if we haven't yet reached full entropy.
* This is the only case where we consider entropy to be
* `depleted' without kern.entropy.depletion enabled -- when we
* only have partial entropy, an adversary may be able to
* narrow the state of the pool down to a small number of
* possibilities; the output then enables them to confirm a
* guess, reducing its entropy from the adversary's perspective
* to zero.
*
* This should only happen if the operator has chosen to
* consolidate, either through sysctl kern.entropy.consolidate
* or by writing less than full entropy to /dev/random as root
* (which /dev/random promises will immediately affect
* subsequent output, for better or worse).
*/
if (E->bitsneeded > 0 && E->samplesneeded > 0) {
if (__predict_false(E->epoch == (unsigned)-1) &&
ratecheck(&lasttime, &interval)) {
printf("WARNING:"
" system needs entropy for security;"
" see entropy(7)\n");
}
atomic_store_relaxed(&E->bitsneeded, MINENTROPYBITS);
atomic_store_relaxed(&E->samplesneeded, MINSAMPLES);
}
/* Extract data from the pool, and `deplete' if we're doing that. */
entpool_extract(&E->pool, buf, len);
if (__predict_false(atomic_load_relaxed(&entropy_depletion)) &&
error == 0) {
unsigned cost = MIN(len, ENTROPY_CAPACITY)*NBBY;
unsigned bitsneeded = E->bitsneeded;
unsigned samplesneeded = E->samplesneeded;
bitsneeded += MIN(MINENTROPYBITS - bitsneeded, cost);
samplesneeded += MIN(MINSAMPLES - samplesneeded, cost);
atomic_store_relaxed(&E->bitsneeded, bitsneeded);
atomic_store_relaxed(&E->samplesneeded, samplesneeded);
entropy_deplete_evcnt.ev_count++;
}
out: /* Release the global lock and return the error. */
if (__predict_false(cold))
splx(s);
else
mutex_exit(&E->lock);
return error;
}
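/*
* Illustrative sketch, not part of the original source: a typical
* caller seeds a DRBG key from entropy_extract and promptly erases
* the seed. The names example_drbg, example_drbg_reseed, and
* example_seed_drbg are hypothetical stand-ins for a real CPRNG.
*/
#if 0 /* example only -- not compiled */
static int
example_seed_drbg(struct example_drbg *drbg)
{
uint8_t seed[32]; /* 256-bit seed */
int error;
/* With ENTROPY_WAIT|ENTROPY_SIG this may sleep; EINTR/ERESTART on signal. */
error = entropy_extract(seed, sizeof seed, ENTROPY_WAIT|ENTROPY_SIG);
if (error)
return error;
/* Use the bytes only to key the DRBG, then erase them. */
example_drbg_reseed(drbg, seed, sizeof seed);
explicit_memset(seed, 0, sizeof seed);
return 0;
}
#endif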
/*
* entropy_poll(events)
*
* Return the subset of events ready, and if it is not all of
* events, record curlwp as waiting for entropy.
*/
int
entropy_poll(int events)
{
int revents = 0;
KASSERT(!cold);
/* Always ready for writing. */
revents |= events & (POLLOUT|POLLWRNORM);
/* Narrow it down to reads. */
events &= POLLIN|POLLRDNORM;
if (events == 0)
return revents;
/*
* If we have reached full entropy and we're not depleting
* entropy, we are forever ready.
*/
if (__predict_true(atomic_load_relaxed(&E->bitsneeded) == 0 ||
atomic_load_relaxed(&E->samplesneeded) == 0) &&
__predict_true(!atomic_load_relaxed(&entropy_depletion)))
return revents | events;
/*
* Otherwise, check whether we need entropy under the lock. If
* we don't, we're ready; if we do, add ourselves to the queue.
*/
mutex_enter(&E->lock);
if (E->bitsneeded == 0 || E->samplesneeded == 0)
revents |= events;
else
selrecord(curlwp, &E->selq);
mutex_exit(&E->lock);
return revents;
}
/*
* filt_entropy_read_detach(kn)
*
* struct filterops::f_detach callback for entropy read events:
* remove kn from the list of waiters.
*/
static void
filt_entropy_read_detach(struct knote *kn)
{
KASSERT(!cold);
mutex_enter(&E->lock);
selremove_knote(&E->selq, kn);
mutex_exit(&E->lock);
}
/*
* filt_entropy_read_event(kn, hint)
*
* struct filterops::f_event callback for entropy read events:
* poll for entropy. Caller must hold the global entropy lock if
* hint is NOTE_SUBMIT, and must not if hint is not NOTE_SUBMIT.
*/
static int
filt_entropy_read_event(struct knote *kn, long hint)
{
int ret;
KASSERT(!cold);
/* Acquire the lock, if caller is outside entropy subsystem. */
if (hint == NOTE_SUBMIT)
KASSERT(mutex_owned(&E->lock));
else
mutex_enter(&E->lock);
/*
* If we still need entropy, can't read anything; if not, can
* read arbitrarily much.
*/
if (E->bitsneeded != 0 && E->samplesneeded != 0) {
ret = 0;
} else {
if (atomic_load_relaxed(&entropy_depletion))
kn->kn_data = ENTROPY_CAPACITY; /* bytes */
else
kn->kn_data = MIN(INT64_MAX, SSIZE_MAX);
ret = 1;
}
/* Release the lock, if caller is outside entropy subsystem. */
if (hint == NOTE_SUBMIT)
KASSERT(mutex_owned(&E->lock));
else
mutex_exit(&E->lock);
return ret;
}
/* XXX Makes sense only for /dev/u?random. */
static const struct filterops entropy_read_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_entropy_read_detach,
.f_event = filt_entropy_read_event,
};
/*
* entropy_kqfilter(kn)
*
* Register kn to receive entropy event notifications. May be
* EVFILT_READ or EVFILT_WRITE; anything else yields EINVAL.
*/
int
entropy_kqfilter(struct knote *kn)
{
KASSERT(!cold);
switch (kn->kn_filter) {
case EVFILT_READ:
/* Enter into the global select queue. */
mutex_enter(&E->lock);
kn->kn_fop = &entropy_read_filtops;
selrecord_knote(&E->selq, kn);
mutex_exit(&E->lock);
return 0;
case EVFILT_WRITE:
/* Can always dump entropy into the system. */
kn->kn_fop = &seltrue_filtops;
return 0;
default:
return EINVAL;
}
}
/*
* rndsource_setcb(rs, get, getarg)
*
* Set the request callback for the entropy source rs, if it can
* provide entropy on demand. Must precede rnd_attach_source.
*/
void
rndsource_setcb(struct krndsource *rs, void (*get)(size_t, void *),
void *getarg)
{
rs->get = get;
rs->getarg = getarg;
}
/*
* rnd_attach_source(rs, name, type, flags)
*
* Attach the entropy source rs. Must be done after
* rndsource_setcb, if any, and before any calls to rnd_add_data.
*/
void
rnd_attach_source(struct krndsource *rs, const char *name, uint32_t type,
uint32_t flags)
{
uint32_t extra[4];
unsigned i = 0;
KASSERTMSG(name[0] != '\0', "rndsource must have nonempty name");
/* Grab cycle counter to mix extra into the pool. */
extra[i++] = entropy_timer();
/*
* Apply some standard flags:
*
* - We do not bother with network devices by default, for
* hysterical raisins (perhaps: because it is often the case
* that an adversary can influence network packet timings).
*/
switch (type) {
case RND_TYPE_NET:
flags |= RND_FLAG_NO_COLLECT;
break;
}
/* Sanity-check the callback if RND_FLAG_HASCB is set. */
KASSERT(!ISSET(flags, RND_FLAG_HASCB) || rs->get != NULL);
/* Initialize the random source. */
memset(rs->name, 0, sizeof(rs->name)); /* paranoia */
strlcpy(rs->name, name, sizeof(rs->name));
memset(&rs->time_delta, 0, sizeof(rs->time_delta));
memset(&rs->value_delta, 0, sizeof(rs->value_delta));
rs->total = 0;
rs->type = type;
rs->flags = flags;
if (entropy_percpu != NULL)
rs->state = percpu_alloc(sizeof(struct rndsource_cpu));
extra[i++] = entropy_timer();
/* Wire it into the global list of random sources. */
if (__predict_true(!cold))
mutex_enter(&E->lock);
LIST_INSERT_HEAD(&E->sources, rs, list);
if (__predict_true(!cold))
mutex_exit(&E->lock);
extra[i++] = entropy_timer();
/* Request that it provide entropy ASAP, if we can. */
if (ISSET(flags, RND_FLAG_HASCB))
(*rs->get)(ENTROPY_CAPACITY, rs->getarg);
extra[i++] = entropy_timer();
/* Mix the extra into the pool. */
KASSERT(i == __arraycount(extra));
entropy_enter(extra, sizeof extra, 0, /*count*/__predict_true(!cold));
explicit_memset(extra, 0, sizeof extra);
}
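/*
* Illustrative sketch, not part of the original source: a hardware
* RNG driver sets its on-demand callback with rndsource_setcb and
* then attaches the source, in that order. The names example_softc,
* example_get, example_read_hwrng, and example_attach_rndsource are
* hypothetical; the one-bit-per-bit entropy claim is a driver policy
* choice, not a requirement.
*/
#if 0 /* example only -- not compiled */
static void
example_get(size_t nbytes, void *cookie)
{
struct example_softc *sc = cookie;
uint8_t buf[32];
size_t n;
while (nbytes) {
n = MIN(nbytes, sizeof buf);
if (example_read_hwrng(sc, buf, n) != 0)
break;
/* Claim one bit of entropy per bit of data from the HWRNG. */
rnd_add_data(&sc->sc_rndsource, buf, n, n*NBBY);
nbytes -= n;
}
explicit_memset(buf, 0, sizeof buf);
}
static void
example_attach_rndsource(struct example_softc *sc)
{
rndsource_setcb(&sc->sc_rndsource, example_get, sc);
rnd_attach_source(&sc->sc_rndsource, "example", RND_TYPE_RNG,
RND_FLAG_COLLECT_VALUE|RND_FLAG_HASCB);
}
#endif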
/*
* rnd_detach_source(rs)
*
* Detach the entropy source rs. May sleep waiting for users to
* drain. Further use is not allowed.
*/
void
rnd_detach_source(struct krndsource *rs)
{
/*
* If we're cold (shouldn't happen, but hey), just remove it
* from the list -- there's nothing allocated.
*/
if (__predict_false(cold) && entropy_percpu == NULL) {
LIST_REMOVE(rs, list);
return;
}
/* We may have to wait for entropy_request. */
ASSERT_SLEEPABLE();
/* Wait until the source list is not in use, and remove it. */
mutex_enter(&E->lock);
while (E->sourcelock)
cv_wait(&E->sourcelock_cv, &E->lock);
LIST_REMOVE(rs, list);
mutex_exit(&E->lock);
/* Free the per-CPU data. */
percpu_free(rs->state, sizeof(struct rndsource_cpu));
}
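/*
* Illustrative sketch, not part of the original source: the matching
* detach path for a driver that attached an rndsource as sketched
* above. The names example_softc and example_detach are hypothetical.
*/
#if 0 /* example only -- not compiled */
static int
example_detach(struct example_softc *sc)
{
/* May sleep while a concurrent entropy_request drains. */
rnd_detach_source(&sc->sc_rndsource);
return 0;
}
#endif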
/*
* rnd_lock_sources(flags)
*
* Lock the list of entropy sources. Caller must hold the global
* entropy lock. If successful, no rndsource will go away until
* rnd_unlock_sources even while the caller releases the global
* entropy lock.
*
* May be called very early at boot, before entropy_init.
*
* If flags & ENTROPY_WAIT, wait for concurrent access to finish.
* If flags & ENTROPY_SIG, allow interruption by signal.
*/
static int __attribute__((warn_unused_result))
rnd_lock_sources(int flags)
{
int error;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
while (E->sourcelock) {
KASSERT(!cold);
if (!ISSET(flags, ENTROPY_WAIT))
return EWOULDBLOCK;
if (ISSET(flags, ENTROPY_SIG)) {
error = cv_wait_sig(&E->sourcelock_cv, &E->lock);
if (error)
return error;
} else {
cv_wait(&E->sourcelock_cv, &E->lock);
}
}
E->sourcelock = curlwp;
return 0;
}
/*
* rnd_unlock_sources()
*
* Unlock the list of sources after rnd_lock_sources. Caller must
* hold the global entropy lock.
*
* May be called very early at boot, before entropy_init.
*/
static void
rnd_unlock_sources(void)
{
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
KASSERTMSG(E->sourcelock == curlwp, "lwp %p releasing lock held by %p",
curlwp, E->sourcelock);
E->sourcelock = NULL;
if (__predict_true(!cold))
cv_signal(&E->sourcelock_cv);
}
/*
* rnd_sources_locked()
*
* True if we hold the list of rndsources locked, for diagnostic
* assertions.
*
* May be called very early at boot, before entropy_init.
*/
static bool __diagused
rnd_sources_locked(void)
{
return E->sourcelock == curlwp;
}
/*
* entropy_request(nbytes, flags)
*
* Request nbytes bytes of entropy from all sources in the system.
* OK if we overdo it. Caller must hold the global entropy lock;
* will release and re-acquire it.
*
* May be called very early at boot, before entropy_init.
*
* If flags & ENTROPY_WAIT, wait for concurrent access to finish.
* If flags & ENTROPY_SIG, allow interruption by signal.
*/
static int
entropy_request(size_t nbytes, int flags)
{
struct krndsource *rs;
int error;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
if ((flags & ENTROPY_WAIT) != 0 && __predict_false(!cold))
ASSERT_SLEEPABLE();
/*
* Lock the list of entropy sources to block rnd_detach_source
* until we're done, and to serialize calls to the entropy
* callbacks as guaranteed to drivers.
*/
error = rnd_lock_sources(flags);
if (error)
return error;
entropy_request_evcnt.ev_count++;
/* Clamp to the maximum reasonable request. */
nbytes = MIN(nbytes, ENTROPY_CAPACITY);
/* Walk the list of sources. */
LIST_FOREACH(rs, &E->sources, list) {
/* Skip sources without callbacks. */
if (!ISSET(rs->flags, RND_FLAG_HASCB))
continue;
/*
* Skip sources that are disabled altogether -- we
* would just ignore their samples anyway.
*/
if (ISSET(rs->flags, RND_FLAG_NO_COLLECT))
continue;
/* Drop the lock while we call the callback. */
if (__predict_true(!cold))
mutex_exit(&E->lock);
(*rs->get)(nbytes, rs->getarg);
if (__predict_true(!cold))
mutex_enter(&E->lock);
}
/* Request done; unlock the list of entropy sources. */
rnd_unlock_sources();
return 0;
}
static inline uint32_t
rnd_delta_estimate(rnd_delta_t *d, uint32_t v, int32_t delta)
{
int32_t delta2, delta3;
/*
* Calculate the second and third order differentials
*/
delta2 = d->dx - delta;
if (delta2 < 0)
delta2 = -delta2; /* XXX arithmetic overflow */
delta3 = d->d2x - delta2;
if (delta3 < 0)
delta3 = -delta3; /* XXX arithmetic overflow */
d->x = v;
d->dx = delta;
d->d2x = delta2;
/*
* If any delta is 0, we got no entropy. If all are non-zero, we
* might have something.
*/
if (delta == 0 || delta2 == 0 || delta3 == 0)
return 0;
return 1;
}
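/*
* Illustrative worked example, not part of the original source: with
* previous state x=100, dx=10, d2x=2, a new sample v=115 arriving
* with delta=15 gives delta2=|10-15|=5 and delta3=|2-5|=3. All three
* differentials are nonzero, so the sample is counted (returns 1).
* A source whose samples arrive at a perfectly constant interval
* yields delta2=0 and is never counted.
*/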
static inline uint32_t
rnd_dt_estimate(struct krndsource *rs, uint32_t t)
{
int32_t delta;
uint32_t ret;
rnd_delta_t *d;
struct rndsource_cpu *rc;
rc = percpu_getref(rs->state);
d = &rc->rc_timedelta;
if (t < d->x) {
delta = UINT32_MAX - d->x + t;
} else {
delta = d->x - t;
}
if (delta < 0) {
delta = -delta; /* XXX arithmetic overflow */
}
ret = rnd_delta_estimate(d, t, delta);
KASSERT(d->x == t);
KASSERT(d->dx == delta);
percpu_putref(rs->state);
return ret;
}
/*
* rnd_add_uint32(rs, value)
*
* Enter 32 bits of data from an entropy source into the pool.
*
* May be called from any context or with spin locks held, but may
* drop data.
*
* This is meant for cheaply taking samples from devices that
* aren't designed to be hardware random number generators.
*/
void
rnd_add_uint32(struct krndsource *rs, uint32_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
void
_rnd_add_uint32(struct krndsource *rs, uint32_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
void
_rnd_add_uint64(struct krndsource *rs, uint64_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
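/*
* Illustrative sketch, not part of the original source: a device
* driver may cheaply sample completion events from its interrupt
* handler; only the timing of the call carries any entropy, and no
* entropy is claimed. The names example_softc and example_intr are
* hypothetical.
*/
#if 0 /* example only -- not compiled */
static int
example_intr(void *cookie)
{
struct example_softc *sc = cookie;
/* ... service the device ... */
rnd_add_uint32(&sc->sc_rndsource, sc->sc_last_status);
return 1;
}
#endif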
/*
* rnd_add_data(rs, buf, len, entropybits)
*
* Enter data from an entropy source into the pool, with a
* driver's estimate of how much entropy the physical source of
* the data has. If RND_FLAG_NO_ESTIMATE, we ignore the driver's
* estimate and treat it as zero.
*
* rs MAY but SHOULD NOT be NULL. If rs is NULL, MUST NOT be
* called from interrupt context or with spin locks held.
*
* If rs is non-NULL, MAY but SHOULD NOT be called from interrupt
* context, in which case act like rnd_add_data_intr -- if the
* sample buffer is full, schedule a softint and drop any
* additional data on the floor. (This may change later once we
* fix drivers that still call this from interrupt context to use
* rnd_add_data_intr instead.) MUST NOT be called with spin locks
* held if not in hard interrupt context -- i.e., MUST NOT be
* called in thread context or softint context with spin locks
* held.
*/
void
rnd_add_data(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
bool intr_p = cpu_intr_p(); /* XXX make this unconditionally false */
/*
* Weird legacy exception that we should rip out and replace by
* creating new rndsources to attribute entropy to the callers:
* If there's no rndsource, just enter the data and time now.
*/
if (rs == NULL) {
uint32_t extra;
KASSERT(!intr_p);
KASSERTMSG(howmany(entropybits, NBBY) <= len,
"%s: impossible entropy rate:"
" %"PRIu32" bits in %"PRIu32"-byte string",
rs ? rs->name : "(anonymous)", entropybits, len);
entropy_enter(buf, len, entropybits, /*count*/false);
extra = entropy_timer();
entropy_enter(&extra, sizeof extra, 0, /*count*/false);
explicit_memset(&extra, 0, sizeof extra);
return;
}
rnd_add_data_internal(rs, buf, len, entropybits, intr_p);
}
/*
* rnd_add_data_intr(rs, buf, len, entropybits)
*
* Try to enter data from an entropy source into the pool, with a
* driver's estimate of how much entropy the physical source of
* the data has. If RND_FLAG_NO_ESTIMATE, we ignore the driver's
* estimate and treat it as zero. If the sample buffer is full,
* schedule a softint and drop any additional data on the floor.
*/
void
rnd_add_data_intr(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
bool intr_p = true;
rnd_add_data_internal(rs, buf, len, entropybits, intr_p);
}
/*
* rnd_add_data_internal(rs, buf, len, entropybits, intr_p)
*
* Internal subroutine to decide whether or not to enter data or
* timing for a particular rndsource, and if so, to enter it.
*
* intr_p is true for callers from interrupt context or spin locks
* held, and false for callers from thread or soft interrupt
* context and no spin locks held.
*/
static void
rnd_add_data_internal(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits, bool intr_p)
{
uint32_t flags;
KASSERTMSG(howmany(entropybits, NBBY) <= len,
"%s: impossible entropy rate:"
" %"PRIu32" bits in %"PRIu32"-byte string",
rs ? rs->name : "(anonymous)", entropybits, len);
/*
* Hold up the reset xcall before it zeroes the entropy counts
* on this CPU or globally. Otherwise, we might leave some
* nonzero entropy attributed to an untrusted source in the
* event of a race with a change to flags.
*/
kpreempt_disable();
/* Load a snapshot of the flags. Ioctl may change them under us. */
flags = atomic_load_relaxed(&rs->flags);
/*
* Skip if:
* - we're not collecting entropy, or
* - the operator doesn't want to collect entropy from this, or
* - neither data nor timings are being collected from this.
*/
if (!atomic_load_relaxed(&entropy_collection) ||
ISSET(flags, RND_FLAG_NO_COLLECT) ||
!ISSET(flags, RND_FLAG_COLLECT_VALUE|RND_FLAG_COLLECT_TIME))
goto out;
/* If asked, ignore the estimate. */
if (ISSET(flags, RND_FLAG_NO_ESTIMATE))
entropybits = 0;
/* If we are collecting data, enter them. */
if (ISSET(flags, RND_FLAG_COLLECT_VALUE)) {
rnd_add_data_1(rs, buf, len, entropybits, /*count*/false,
RND_FLAG_COLLECT_VALUE, intr_p);
}
/* If we are collecting timings, enter one. */
if (ISSET(flags, RND_FLAG_COLLECT_TIME)) {
uint32_t extra;
bool count;
/* Sample a timer. */
extra = entropy_timer();
/* If asked, do entropy estimation on the time. */
if ((flags & (RND_FLAG_ESTIMATE_TIME|RND_FLAG_NO_ESTIMATE)) ==
RND_FLAG_ESTIMATE_TIME && __predict_true(!cold))
count = rnd_dt_estimate(rs, extra);
else
count = false;
rnd_add_data_1(rs, &extra, sizeof extra, 0, count,
RND_FLAG_COLLECT_TIME, intr_p);
}
out: /* Allow concurrent changes to flags to finish. */
kpreempt_enable();
}
static unsigned
add_sat(unsigned a, unsigned b)
{
unsigned c = a + b;
return (c < a ? UINT_MAX : c);
}
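/*
* Illustrative note, not part of the original source: add_sat
* saturates instead of wrapping, e.g. add_sat(UINT_MAX - 3, 10)
* returns UINT_MAX, so per-source sample and bit counters stick at
* the maximum rather than rolling over to small values.
*/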
/*
* rnd_add_data_1(rs, buf, len, entropybits, count, flag)
*
* Internal subroutine to call either entropy_enter_intr, if we're
* in interrupt context, or entropy_enter if not, and to count the
* entropy in an rndsource.
*/
static void
rnd_add_data_1(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits, bool count, uint32_t flag, bool intr_p)
{
bool fullyused;
/*
* For the interrupt-like path, use entropy_enter_intr and take
* note of whether it consumed the full sample; otherwise, use
* entropy_enter, which always consumes the full sample.
*/
if (intr_p) {
fullyused = entropy_enter_intr(buf, len, entropybits, count);
} else {
entropy_enter(buf, len, entropybits, count);
fullyused = true;
}
/*
* If we used the full sample, note how many bits were
* contributed from this source.
*/
if (fullyused) {
if (__predict_false(cold)) {
const int s = splhigh();
rs->total = add_sat(rs->total, entropybits);
switch (flag) {
case RND_FLAG_COLLECT_TIME:
rs->time_delta.insamples =
add_sat(rs->time_delta.insamples, 1);
break;
case RND_FLAG_COLLECT_VALUE:
rs->value_delta.insamples =
add_sat(rs->value_delta.insamples, 1);
break;
}
splx(s);
} else {
struct rndsource_cpu *rc = percpu_getref(rs->state);
atomic_store_relaxed(&rc->rc_entropybits,
add_sat(rc->rc_entropybits, entropybits));
switch (flag) {
case RND_FLAG_COLLECT_TIME:
atomic_store_relaxed(&rc->rc_timesamples,
add_sat(rc->rc_timesamples, 1));
break;
case RND_FLAG_COLLECT_VALUE:
atomic_store_relaxed(&rc->rc_datasamples,
add_sat(rc->rc_datasamples, 1));
break;
}
percpu_putref(rs->state);
}
}
}
/*
* rnd_add_data_sync(rs, buf, len, entropybits)
*
* Same as rnd_add_data. Originally used in rndsource callbacks,
* to break an unnecessary cycle; no longer really needed.
*/
void
rnd_add_data_sync(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
rnd_add_data(rs, buf, len, entropybits);
}
/*
* rndsource_entropybits(rs)
*
* Return approximately the number of bits of entropy that have
* been contributed via rs so far. Approximate if other CPUs may
* be calling rnd_add_data concurrently.
*/
static unsigned
rndsource_entropybits(struct krndsource *rs)
{
unsigned nbits = rs->total;
KASSERT(!cold);
KASSERT(rnd_sources_locked());
percpu_foreach(rs->state, rndsource_entropybits_cpu, &nbits);
return nbits;
}
static void
rndsource_entropybits_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct rndsource_cpu *rc = ptr;
unsigned *nbitsp = cookie;
unsigned cpu_nbits;
cpu_nbits = atomic_load_relaxed(&rc->rc_entropybits);
*nbitsp += MIN(UINT_MAX - *nbitsp, cpu_nbits);
}
/*
* rndsource_to_user(rs, urs)
*
* Copy a description of rs out to urs for userland.
*/
static void
rndsource_to_user(struct krndsource *rs, rndsource_t *urs)
{
KASSERT(!cold);
KASSERT(rnd_sources_locked());
/* Avoid kernel memory disclosure. */
memset(urs, 0, sizeof(*urs));
CTASSERT(sizeof(urs->name) == sizeof(rs->name));
strlcpy(urs->name, rs->name, sizeof(urs->name));
urs->total = rndsource_entropybits(rs);
urs->type = rs->type;
urs->flags = atomic_load_relaxed(&rs->flags);
}
/*
* rndsource_to_user_est(rs, urse)
*
* Copy a description of rs and estimation statistics out to urse
* for userland.
*/
static void
rndsource_to_user_est(struct krndsource *rs, rndsource_est_t *urse)
{
KASSERT(!cold);
KASSERT(rnd_sources_locked());
/* Avoid kernel memory disclosure. */
memset(urse, 0, sizeof(*urse));
/* Copy out the rndsource description. */
rndsource_to_user(rs, &urse->rt);
/* Gather the statistics. */
urse->dt_samples = rs->time_delta.insamples;
urse->dt_total = 0;
urse->dv_samples = rs->value_delta.insamples;
urse->dv_total = urse->rt.total;
percpu_foreach(rs->state, rndsource_to_user_est_cpu, urse);
}
static void
rndsource_to_user_est_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct rndsource_cpu *rc = ptr;
rndsource_est_t *urse = cookie;
urse->dt_samples = add_sat(urse->dt_samples,
atomic_load_relaxed(&rc->rc_timesamples));
urse->dv_samples = add_sat(urse->dv_samples,
atomic_load_relaxed(&rc->rc_datasamples));
}
/*
* entropy_reset_xc(arg1, arg2)
*
* Reset the current CPU's pending entropy to zero.
*/
static void
entropy_reset_xc(void *arg1 __unused, void *arg2 __unused)
{
uint32_t extra = entropy_timer();
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
/*
* With the per-CPU state locked, zero the pending count and
* enter a cycle count for fun.
*/
ec = entropy_cpu_get(&lock);
ec->ec_bitspending = 0;
ec->ec_samplespending = 0;
entpool_enter(ec->ec_pool, &extra, sizeof extra);
entropy_cpu_put(&lock, ec);
}
/*
* entropy_ioctl(cmd, data)
*
* Handle various /dev/random ioctl queries.
*/
int
entropy_ioctl(unsigned long cmd, void *data)
{
struct krndsource *rs;
bool privileged;
int error;
KASSERT(!cold);
/* Verify user's authorization to perform the ioctl. */
switch (cmd) {
case RNDGETENTCNT:
case RNDGETPOOLSTAT:
case RNDGETSRCNUM:
case RNDGETSRCNAME:
case RNDGETESTNUM:
case RNDGETESTNAME:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_GETPRIV, NULL, NULL, NULL, NULL);
break;
case RNDCTL:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_SETPRIV, NULL, NULL, NULL, NULL);
break;
case RNDADDDATA:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_ADDDATA, NULL, NULL, NULL, NULL);
/* Ascertain whether the user's inputs should be counted. */
if (kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_ADDDATA_ESTIMATE,
NULL, NULL, NULL, NULL) == 0)
privileged = true;
break;
default: {
/*
* XXX Hack to avoid changing module ABI so this can be
* pulled up. Later, we can just remove the argument.
*/
static const struct fileops fops = {
.fo_ioctl = rnd_system_ioctl,
};
struct file f = {
.f_ops = &fops,
};
MODULE_HOOK_CALL(rnd_ioctl_50_hook, (&f, cmd, data),
enosys(), error);
#if defined(_LP64)
if (error == ENOSYS)
MODULE_HOOK_CALL(rnd_ioctl32_50_hook, (&f, cmd, data),
enosys(), error);
#endif
if (error == ENOSYS)
error = ENOTTY;
break;
}
}
/* If anything went wrong with authorization, stop here. */
if (error)
return error;
/* Dispatch on the command. */
switch (cmd) {
case RNDGETENTCNT: { /* Get current entropy count in bits. */
uint32_t *countp = data;
mutex_enter(&E->lock);
*countp = MINENTROPYBITS - E->bitsneeded;
mutex_exit(&E->lock);
break;
}
case RNDGETPOOLSTAT: { /* Get entropy pool statistics. */
rndpoolstat_t *pstat = data;
mutex_enter(&E->lock);
/* parameters */
pstat->poolsize = ENTPOOL_SIZE/sizeof(uint32_t); /* words */
pstat->threshold = MINENTROPYBITS/NBBY; /* bytes */
pstat->maxentropy = ENTROPY_CAPACITY*NBBY; /* bits */
/* state */
pstat->added = 0; /* XXX total entropy_enter count */
pstat->curentropy = MINENTROPYBITS - E->bitsneeded; /* bits */
pstat->removed = 0; /* XXX total entropy_extract count */
pstat->discarded = 0; /* XXX bits of entropy beyond capacity */
/*
* This used to be bits of data fabricated in some
* sense; we'll take it to mean number of samples,
* excluding the bits of entropy from HWRNG or seed.
*/
pstat->generated = MINSAMPLES - E->samplesneeded;
pstat->generated -= MIN(pstat->generated, pstat->curentropy);
mutex_exit(&E->lock);
break;
}
case RNDGETSRCNUM: { /* Get entropy sources by number. */
rndstat_t *stat = data;
uint32_t start = 0, i = 0;
/* Skip if none requested; fail if too many requested. */
if (stat->count == 0)
break;
if (stat->count > RND_MAXSTATCOUNT)
return EINVAL;
/*
* Under the lock, find the first one, copy out as many
* as requested, and report how many we copied out.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (start++ == stat->start)
break;
}
while (i < stat->count && rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user(rs, &stat->source[i++]);
mutex_enter(&E->lock);
rs = LIST_NEXT(rs, list);
}
KASSERT(i <= stat->count);
stat->count = i;
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETESTNUM: { /* Get sources and estimates by number. */
rndstat_est_t *estat = data;
uint32_t start = 0, i = 0;
/* Skip if none requested; fail if too many requested. */
if (estat->count == 0)
break;
if (estat->count > RND_MAXSTATCOUNT)
return EINVAL;
/*
* Under the lock, find the first one, copy out as many
* as requested, and report how many we copied out.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (start++ == estat->start)
break;
}
while (i < estat->count && rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user_est(rs, &estat->source[i++]);
mutex_enter(&E->lock);
rs = LIST_NEXT(rs, list);
}
KASSERT(i <= estat->count);
estat->count = i;
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETSRCNAME: { /* Get entropy sources by name. */
rndstat_name_t *nstat = data;
const size_t n = sizeof(rs->name);
CTASSERT(sizeof(rs->name) == sizeof(nstat->name));
/*
* Under the lock, search by name. If found, copy it
* out; if not found, fail with ENOENT.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (strncmp(rs->name, nstat->name, n) == 0)
break;
}
if (rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user(rs, &nstat->source);
mutex_enter(&E->lock);
} else {
error = ENOENT;
}
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETESTNAME: { /* Get sources and estimates by name. */
rndstat_est_name_t *enstat = data;
const size_t n = sizeof(rs->name);
CTASSERT(sizeof(rs->name) == sizeof(enstat->name));
/*
* Under the lock, search by name. If found, copy it
* out; if not found, fail with ENOENT.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (strncmp(rs->name, enstat->name, n) == 0)
break;
}
if (rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user_est(rs, &enstat->source);
mutex_enter(&E->lock);
} else {
error = ENOENT;
}
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDCTL: { /* Modify entropy source flags. */
rndctl_t *rndctl = data;
const size_t n = sizeof(rs->name);
uint32_t resetflags = RND_FLAG_NO_ESTIMATE|RND_FLAG_NO_COLLECT;
uint32_t flags;
bool reset = false, request = false;
CTASSERT(sizeof(rs->name) == sizeof(rndctl->name));
/* Whitelist the flags that the user can change. */
rndctl->mask &= RND_FLAG_NO_ESTIMATE|RND_FLAG_NO_COLLECT;
/*
* For each matching rndsource, either by type if
* specified or by name if not, set the masked flags.
*/
mutex_enter(&E->lock);
LIST_FOREACH(rs, &E->sources, list) {
if (rndctl->type != 0xff) {
if (rs->type != rndctl->type)
continue;
} else if (rndctl->name[0] != '\0') {
if (strncmp(rs->name, rndctl->name, n) != 0)
continue;
}
flags = rs->flags & ~rndctl->mask;
flags |= rndctl->flags & rndctl->mask;
if ((rs->flags & resetflags) == 0 &&
(flags & resetflags) != 0)
reset = true;
if ((rs->flags ^ flags) & resetflags)
request = true;
atomic_store_relaxed(&rs->flags, flags);
}
mutex_exit(&E->lock);
/*
* If we disabled estimation or collection, nix all the
* pending entropy and set needed to the maximum.
*/
if (reset) {
xc_broadcast(0, &entropy_reset_xc, NULL, NULL);
mutex_enter(&E->lock);
E->bitspending = 0;
E->samplespending = 0;
atomic_store_relaxed(&E->bitsneeded, MINENTROPYBITS);
atomic_store_relaxed(&E->samplesneeded, MINSAMPLES);
E->consolidate = false;
mutex_exit(&E->lock);
}
/*
* If we changed any of the estimation or collection
* flags, request new samples from everyone -- either
* to make up for what we just lost, or to get new
* samples from what we just added.
*
* Failing on signal, while waiting for another process
* to finish requesting entropy, is OK here even though
* we have committed side effects, because this ioctl
* command is idempotent, so repeating it is safe.
*/
if (request) {
mutex_enter(&E->lock);
error = entropy_request(ENTROPY_CAPACITY,
ENTROPY_WAIT|ENTROPY_SIG);
mutex_exit(&E->lock);
}
break;
}
case RNDADDDATA: { /* Enter seed into entropy pool. */
rnddata_t *rdata = data;
unsigned entropybits = 0;
if (!atomic_load_relaxed(&entropy_collection))
break; /* thanks but no thanks */
if (rdata->len > MIN(sizeof(rdata->data), UINT32_MAX/NBBY))
return EINVAL;
/*
* This ioctl serves as the userland alternative to a
* bootloader-provided seed -- typically furnished by
* /etc/rc.d/random_seed. We accept the user's entropy
* claim only if
*
* (a) the user is privileged, and
* (b) we have not entered a bootloader seed.
*
* under the assumption that the user may use this to
* load a seed from disk that we have already loaded
* from the bootloader, so we don't double-count it.
*/
if (privileged && rdata->entropy && rdata->len) {
mutex_enter(&E->lock);
if (!E->seeded) {
entropybits = MIN(rdata->entropy,
MIN(rdata->len, ENTROPY_CAPACITY)*NBBY);
E->seeded = true;
}
mutex_exit(&E->lock);
}
/* Enter the data and consolidate entropy. */
rnd_add_data(&seed_rndsource, rdata->data, rdata->len,
entropybits);
entropy_consolidate();
break;
}
default:
error = ENOTTY;
}
/* Return any error that may have come up. */
return error;
}
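/*
* Illustrative userland sketch, not part of the original source: the
* RNDADDDATA path above roughly corresponds to what rndctl(8) does
* when /etc/rc.d/random_seed loads a saved seed at boot:
*
*	rnddata_t rd;
*	rd.len = read(seedfd, rd.data, sizeof(rd.data));
*	rd.entropy = nbits;	(entropy claimed for the saved seed)
*	ioctl(randomfd, RNDADDDATA, &rd);
*
* The variables seedfd, randomfd, and nbits are hypothetical.
*/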
/* Legacy entry points */
void
rnd_seed(void *seed, size_t len)
{
if (len != sizeof(rndsave_t)) {
printf("entropy: invalid seed length: %zu,"
" expected sizeof(rndsave_t) = %zu\n",
len, sizeof(rndsave_t));
return;
}
entropy_seed(seed);
}
void
rnd_init(void)
{
entropy_init();
}
void
rnd_init_softint(void)
{
entropy_init_late();
entropy_bootrequest();
}
int
rnd_system_ioctl(struct file *fp, unsigned long cmd, void *data)
{
return entropy_ioctl(cmd, data);
}
/* $NetBSD: kern_rwlock_obj.c,v 1.13 2023/10/02 21:03:55 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock_obj.c,v 1.13 2023/10/02 21:03:55 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/rwlock.h>
/* Rwlock object cache */
#define RW_OBJ_MAGIC 0x85d3c85d
struct krwobj {
krwlock_t ro_lock;
u_int ro_magic;
u_int ro_refcnt;
uint8_t mo_pad[COHERENCY_UNIT - sizeof(krwlock_t) -
sizeof(u_int) * 2];
};
/*
* rw_obj_alloc:
*
* Allocate a single lock object, waiting for memory if needed.
*/
krwlock_t *
rw_obj_alloc(void)
{
struct krwobj *ro;
ro = kmem_intr_alloc(sizeof(*ro), KM_SLEEP);
KASSERT(ALIGNED_POINTER(ro, coherency_unit));
_rw_init(&ro->ro_lock, (uintptr_t)__builtin_return_address(0));
ro->ro_magic = RW_OBJ_MAGIC;
ro->ro_refcnt = 1;
return (krwlock_t *)ro;
}
/*
* rw_obj_tryalloc:
*
* Allocate a single lock object, but fail if no memory is available.
*/
krwlock_t *
rw_obj_tryalloc(void)
{
struct krwobj *ro;
ro = kmem_intr_alloc(sizeof(*ro), KM_NOSLEEP);
KASSERT(ALIGNED_POINTER(ro, coherency_unit));
if (__predict_true(ro != NULL)) {
_rw_init(&ro->ro_lock, (uintptr_t)__builtin_return_address(0));
ro->ro_magic = RW_OBJ_MAGIC;
ro->ro_refcnt = 1;
}
return (krwlock_t *)ro;
}
/*
* rw_obj_hold:
*
* Add a single reference to a lock object. A reference to the object
* must already be held, and must be held across this call.
*/
void
rw_obj_hold(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
KASSERT(ro->ro_magic == RW_OBJ_MAGIC);
KASSERT(ro->ro_refcnt > 0);
atomic_inc_uint(&ro->ro_refcnt);
}
/*
* rw_obj_free:
*
* Drop a reference from a lock object. If the last reference is being
* dropped, free the object and return true. Otherwise, return false.
*/
bool
rw_obj_free(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
KASSERT(ro->ro_magic == RW_OBJ_MAGIC);
KASSERT(ro->ro_refcnt > 0);
membar_release();
if (atomic_dec_uint_nv(&ro->ro_refcnt) > 0) {
return false;
}
membar_acquire();
rw_destroy(&ro->ro_lock);
kmem_intr_free(ro, sizeof(*ro));
return true;
}
/*
* rw_obj_refcnt:
*
* Return the reference count for a lock object.
*/
u_int
rw_obj_refcnt(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
return ro->ro_refcnt;
}
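/*
* Illustrative sketch, not part of the original source: the intended
* life cycle of a shared, reference-counted lock object.
*/
#if 0 /* example only -- not compiled */
static void
example_rw_obj_lifecycle(void)
{
krwlock_t *lock;
lock = rw_obj_alloc(); /* refcnt == 1 */
rw_obj_hold(lock); /* refcnt == 2: a second user shares it */
rw_obj_free(lock); /* refcnt == 1: the object survives */
rw_obj_free(lock); /* refcnt == 0: the object is destroyed */
}
#endif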
/* $NetBSD: uipc_syscalls_50.c,v 1.12 2022/09/28 15:32:09 msaitoh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/msg.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <compat/net/if.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_mod.h>
/*ARGSUSED*/
static int
compat_ifdatareq(struct lwp *l, u_long cmd, void *data)
{
struct if_data ifi;
struct ifdatareq50 *ifdr = data;
struct ifnet *ifp;
int error;
/* Validate arguments. */
switch (cmd) {
case OSIOCGIFDATA:
case OSIOCZIFDATA:
break;
default:
return ENOSYS;
}
ifp = ifunit(ifdr->ifdr_name);
if (ifp == NULL)
return ENXIO;
/* Do work. */
switch (cmd) {
case OSIOCGIFDATA:
if_export_if_data(ifp, &ifi, false);
ifdatan2o(&ifdr->ifdr_data, &ifi);
return 0;
case OSIOCZIFDATA:
if (l != NULL) {
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
(void *)cmd, NULL);
if (error != 0)
return error;
}
if_export_if_data(ifp, &ifi, true);
ifdatan2o(&ifdr->ifdr_data, &ifi);
/* XXX if_lastchange? */
return 0;
default:
/* Impossible due to above validation, but makes gcc happy. */
return ENOSYS;
}
}
void
uipc_syscalls_50_init(void)
{
MODULE_HOOK_SET(uipc_syscalls_50_hook, compat_ifdatareq);
}
void
uipc_syscalls_50_fini(void)
{
MODULE_HOOK_UNSET(uipc_syscalls_50_hook);
}
/* $NetBSD: pmap.h,v 1.134 2022/08/20 23:49:31 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* pmap.h: see pmap.c for the history of this pmap module.
*/
#ifndef _X86_PMAP_H_
#define _X86_PMAP_H_
#if defined(_KERNEL)
#include <x86/pmap_pv.h>
#include <uvm/pmap/pmap_pvt.h>
/*
* MD flags that we use for pmap_enter and pmap_kenter_pa:
*/
/*
* macros
*/
#define pmap_clear_modify(pg) pmap_clear_attrs(pg, PP_ATTRS_D)
#define pmap_clear_reference(pg) pmap_clear_attrs(pg, PP_ATTRS_A)
#define pmap_copy(DP,SP,D,L,S) __USE(L)
#define pmap_is_modified(pg) pmap_test_attrs(pg, PP_ATTRS_D)
#define pmap_is_referenced(pg) pmap_test_attrs(pg, PP_ATTRS_A)
#define pmap_move(DP,SP,D,L,S)
#define pmap_phys_address(ppn) (x86_ptob(ppn) & ~X86_MMAP_FLAG_MASK)
#define pmap_mmap_flags(ppn) x86_mmap_flags(ppn)
#if defined(__x86_64__) || defined(PAE)
#define X86_MMAP_FLAG_SHIFT (64 - PGSHIFT)
#else
#define X86_MMAP_FLAG_SHIFT (32 - PGSHIFT)
#endif
#define X86_MMAP_FLAG_MASK 0xf
#define X86_MMAP_FLAG_PREFETCH 0x1
/*
* prototypes
*/
void pmap_activate(struct lwp *);
void pmap_bootstrap(vaddr_t);
bool pmap_clear_attrs(struct vm_page *, unsigned);
bool pmap_pv_clear_attrs(paddr_t, unsigned);
void pmap_deactivate(struct lwp *);
void pmap_page_remove(struct vm_page *);
void pmap_pv_remove(paddr_t);
void pmap_remove(struct pmap *, vaddr_t, vaddr_t);
bool pmap_test_attrs(struct vm_page *, unsigned);
void pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t);
void pmap_load(void);
paddr_t pmap_init_tmp_pgtbl(paddr_t);
bool pmap_remove_all(struct pmap *);
void pmap_ldt_cleanup(struct lwp *);
void pmap_ldt_sync(struct pmap *);
void pmap_kremove_local(vaddr_t, vsize_t);
#define __HAVE_PMAP_PV_TRACK 1
void pmap_pv_init(void);
void pmap_pv_track(paddr_t, psize_t);
void pmap_pv_untrack(paddr_t, psize_t);
u_int x86_mmap_flags(paddr_t);
#define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */
#define PMAP_FORK /* turn on pmap_fork interface */
/*
* inline functions
*/
/*
* pmap_page_protect: change the protection of all recorded mappings
* of a managed page
*
* => this function is a frontend for pmap_page_remove/pmap_clear_attrs
* => we only have to worry about making the page more protected.
* unprotecting a page is done on-demand at fault time.
*/
__inline static void __unused
pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
(void)pmap_clear_attrs(pg, PP_ATTRS_W);
} else {
pmap_page_remove(pg);
}
}
}
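/*
* Illustrative sketch, not part of the original source: typical uses
* of pmap_page_protect, e.g. write-protecting a page before cleaning
* it versus revoking all access. The name example_page_protect is
* hypothetical.
*/
#if 0 /* example only -- not compiled */
static void
example_page_protect(struct vm_page *pg)
{
pmap_page_protect(pg, VM_PROT_READ); /* clear write access everywhere */
pmap_page_protect(pg, VM_PROT_NONE); /* then remove all mappings of pg */
}
#endif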
/*
* pmap_pv_protect: change the protection of all recorded mappings
* of an unmanaged page
*/
__inline static void __unused
pmap_pv_protect(paddr_t pa, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
(void)pmap_pv_clear_attrs(pa, PP_ATTRS_W);
} else {
pmap_pv_remove(pa);
}
}
}
/*
* pmap_protect: change the protection of pages in a pmap
*
* => this function is a frontend for pmap_remove/pmap_write_protect
* => we only have to worry about making the page more protected.
* unprotecting a page is done on-demand at fault time.
*/
__inline static void __unused
pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
pmap_write_protect(pmap, sva, eva, prot);
} else {
pmap_remove(pmap, sva, eva);
}
}
}
paddr_t vtophys(vaddr_t);
vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t);
void pmap_cpu_init_late(struct cpu_info *);
/* pmap functions with machine addresses */
void pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t, u_int);
int pmap_enter_ma(struct pmap *, vaddr_t, paddr_t, paddr_t,
vm_prot_t, u_int, int);
bool pmap_extract_ma(pmap_t, vaddr_t, paddr_t *);
paddr_t pmap_get_physpage(void);
/*
* Hooks for the pool allocator.
*/
#define POOL_VTOPHYS(va) vtophys((vaddr_t) (va))
#ifdef __HAVE_DIRECT_MAP
extern vaddr_t pmap_direct_base;
extern vaddr_t pmap_direct_end;
#define PMAP_DIRECT_BASE pmap_direct_base
#define PMAP_DIRECT_END pmap_direct_end
#define PMAP_DIRECT_MAP(pa) ((vaddr_t)PMAP_DIRECT_BASE + (pa))
#define PMAP_DIRECT_UNMAP(va) ((paddr_t)(va) - PMAP_DIRECT_BASE)
/*
* Alternate mapping hooks for pool pages.
*/
#define PMAP_MAP_POOLPAGE(pa) PMAP_DIRECT_MAP((pa))
#define PMAP_UNMAP_POOLPAGE(va) PMAP_DIRECT_UNMAP((va))
#endif /* __HAVE_DIRECT_MAP */
#define __HAVE_VM_PAGE_MD
#define VM_MDPAGE_INIT(pg) \
memset(&(pg)->mdpage, 0, sizeof((pg)->mdpage)); \
PMAP_PAGE_INIT(&(pg)->mdpage.mp_pp)
struct vm_page_md {
struct pmap_page mp_pp;
};
#endif /* _KERNEL */
#endif /* _X86_PMAP_H_ */
/* $NetBSD: uvm_map.c,v 1.411 2024/02/09 22:08:38 andvar Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_map.c 8.3 (Berkeley) 1/12/94
* from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_map.c: uvm map operations
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.411 2024/02/09 22:08:38 andvar Exp $");
#include "opt_ddb.h"
#include "opt_pax.h"
#include "opt_uvmhist.h"
#include "opt_uvm.h"
#include "opt_sysv.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/pax.h>
#include <sys/vnode.h>
#include <sys/filedesc.h>
#include <sys/lockdebug.h>
#include <sys/atomic.h>
#include <sys/sysctl.h>
#ifndef __USER_VA0_IS_SAFE
#include <sys/kauth.h>
#include "opt_user_va0_disable_default.h"
#endif
#include <sys/shm.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#if defined(DDB) || defined(DEBUGPRINT)
#include <uvm/uvm_ddb.h>
#endif
#ifdef UVMHIST
#ifndef UVMHIST_MAPHIST_SIZE
#define UVMHIST_MAPHIST_SIZE 100
#endif
static struct kern_history_ent maphistbuf[UVMHIST_MAPHIST_SIZE];
UVMHIST_DEFINE(maphist) = UVMHIST_INITIALIZER(maphist, maphistbuf);
#endif
#if !defined(UVMMAP_COUNTERS)
#define UVMMAP_EVCNT_DEFINE(name) /* nothing */
#define UVMMAP_EVCNT_INCR(ev) /* nothing */
#define UVMMAP_EVCNT_DECR(ev) /* nothing */
#else /* defined(UVMMAP_COUNTERS) */
#include <sys/evcnt.h>
#define UVMMAP_EVCNT_DEFINE(name) \
struct evcnt uvmmap_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \
"uvmmap", #name); \
EVCNT_ATTACH_STATIC(uvmmap_evcnt_##name);
#define UVMMAP_EVCNT_INCR(ev) uvmmap_evcnt_##ev.ev_count++
#define UVMMAP_EVCNT_DECR(ev) uvmmap_evcnt_##ev.ev_count--
#endif /* !defined(UVMMAP_COUNTERS) */
UVMMAP_EVCNT_DEFINE(ubackmerge)
UVMMAP_EVCNT_DEFINE(uforwmerge)
UVMMAP_EVCNT_DEFINE(ubimerge)
UVMMAP_EVCNT_DEFINE(unomerge)
UVMMAP_EVCNT_DEFINE(kbackmerge)
UVMMAP_EVCNT_DEFINE(kforwmerge)
UVMMAP_EVCNT_DEFINE(kbimerge)
UVMMAP_EVCNT_DEFINE(knomerge)
UVMMAP_EVCNT_DEFINE(map_call)
UVMMAP_EVCNT_DEFINE(mlk_call)
UVMMAP_EVCNT_DEFINE(mlk_hint)
UVMMAP_EVCNT_DEFINE(mlk_tree)
UVMMAP_EVCNT_DEFINE(mlk_treeloop)
const char vmmapbsy[] = "vmmapbsy";
/*
* cache for dynamically-allocated map entries.
*/
static struct pool_cache uvm_map_entry_cache;
#ifdef PMAP_GROWKERNEL
/*
* This global represents the end of the kernel virtual address
* space. If we want to exceed this, we must grow the kernel
* virtual address space dynamically.
*
* Note, this variable is locked by kernel_map's lock.
*/
vaddr_t uvm_maxkaddr;
#endif
#ifndef __USER_VA0_IS_SAFE
#ifndef __USER_VA0_DISABLE_DEFAULT
#define __USER_VA0_DISABLE_DEFAULT 1
#endif
#ifdef USER_VA0_DISABLE_DEFAULT /* kernel config option overrides */
#undef __USER_VA0_DISABLE_DEFAULT
#define __USER_VA0_DISABLE_DEFAULT USER_VA0_DISABLE_DEFAULT
#endif
int user_va0_disable = __USER_VA0_DISABLE_DEFAULT;
#endif
/*
* macros
*/
/*
* uvm_map_align_va: round down or up virtual address
*/
static __inline void
uvm_map_align_va(vaddr_t *vap, vsize_t align, int topdown)
{
KASSERT(powerof2(align));
if (align != 0 && (*vap & (align - 1)) != 0) {
if (topdown)
*vap = rounddown2(*vap, align);
else
*vap = roundup2(*vap, align);
}
}
/*
* UVM_ET_ISCOMPATIBLE: check some requirements for map entry merging
*/
extern struct vm_map *pager_map;
#define UVM_ET_ISCOMPATIBLE(ent, type, uobj, meflags, \
prot, maxprot, inh, adv, wire) \
((ent)->etype == (type) && \
(((ent)->flags ^ (meflags)) & (UVM_MAP_NOMERGE)) == 0 && \
(ent)->object.uvm_obj == (uobj) && \
(ent)->protection == (prot) && \
(ent)->max_protection == (maxprot) && \
(ent)->inheritance == (inh) && \
(ent)->advice == (adv) && \
(ent)->wired_count == (wire))
/*
* uvm_map_entry_link: insert entry into a map
*
* => map must be locked
*/
#define uvm_map_entry_link(map, after_where, entry) do { \
uvm_mapent_check(entry); \
(map)->nentries++; \
(entry)->prev = (after_where); \
(entry)->next = (after_where)->next; \
(entry)->prev->next = (entry); \
(entry)->next->prev = (entry); \
uvm_rb_insert((map), (entry)); \
} while (/*CONSTCOND*/ 0)
/*
* uvm_map_entry_unlink: remove entry from a map
*
* => map must be locked
*/
#define uvm_map_entry_unlink(map, entry) do { \
KASSERT((entry) != (map)->first_free); \
KASSERT((entry) != (map)->hint); \
uvm_mapent_check(entry); \
(map)->nentries--; \
(entry)->next->prev = (entry)->prev; \
(entry)->prev->next = (entry)->next; \
uvm_rb_remove((map), (entry)); \
} while (/*CONSTCOND*/ 0)
/*
* SAVE_HINT: saves the specified entry as the hint for future lookups.
*
* => map need not be locked.
*/
#define SAVE_HINT(map, check, value) do { \
if ((map)->hint == (check)) \
(map)->hint = (value); \
} while (/*CONSTCOND*/ 0)
/*
* clear_hints: ensure that hints don't point to the entry.
*
* => map must be write-locked.
*/
static void
clear_hints(struct vm_map *map, struct vm_map_entry *ent)
{
SAVE_HINT(map, ent, ent->prev);
if (map->first_free == ent) {
map->first_free = ent->prev;
}
}
/*
* VM_MAP_RANGE_CHECK: check and correct range
*
* => map must at least be read locked
*/
#define VM_MAP_RANGE_CHECK(map, start, end) do { \
if (start < vm_map_min(map)) \
start = vm_map_min(map); \
if (end > vm_map_max(map)) \
end = vm_map_max(map); \
if (start > end) \
start = end; \
} while (/*CONSTCOND*/ 0)
/*
* local prototypes
*/
static struct vm_map_entry *
uvm_mapent_alloc(struct vm_map *, int);
static void uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *);
static void uvm_mapent_free(struct vm_map_entry *);
#if defined(DEBUG)
static void _uvm_mapent_check(const struct vm_map_entry *, int);
#define uvm_mapent_check(map) _uvm_mapent_check(map, __LINE__)
#else /* defined(DEBUG) */
#define uvm_mapent_check(e) /* nothing */
#endif /* defined(DEBUG) */
static void uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *);
static void uvm_map_reference_amap(struct vm_map_entry *, int);
static int uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int,
int, struct vm_map_entry *);
static void uvm_map_unreference_amap(struct vm_map_entry *, int);
int _uvm_map_sanity(struct vm_map *);
int _uvm_tree_sanity(struct vm_map *);
static vsize_t uvm_rb_maxgap(const struct vm_map_entry *);
#define ROOT_ENTRY(map) ((struct vm_map_entry *)(map)->rb_tree.rbt_root)
#define LEFT_ENTRY(entry) ((struct vm_map_entry *)(entry)->rb_node.rb_left)
#define RIGHT_ENTRY(entry) ((struct vm_map_entry *)(entry)->rb_node.rb_right)
#define PARENT_ENTRY(map, entry) \
(ROOT_ENTRY(map) == (entry) \
? NULL : (struct vm_map_entry *)RB_FATHER(&(entry)->rb_node))
/*
* These get filled in if/when SYSVSHM shared memory code is loaded
*
* We do this with function pointers rather than #ifdef SYSVSHM so the
* SYSVSHM code can be loaded and unloaded
*/
void (*uvm_shmexit)(struct vmspace *) = NULL;
void (*uvm_shmfork)(struct vmspace *, struct vmspace *) = NULL;
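/*
* Red-black tree comparison callbacks. Map entries are keyed by their
* start address; entries within one map must not overlap, which the
* KASSERTs in uvm_map_compare_nodes() check.
*/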
static int
uvm_map_compare_nodes(void *ctx, const void *nparent, const void *nkey)
{
const struct vm_map_entry *eparent = nparent;
const struct vm_map_entry *ekey = nkey;
KASSERT(eparent->start < ekey->start || eparent->start >= ekey->end);
KASSERT(ekey->start < eparent->start || ekey->start >= eparent->end);
if (eparent->start < ekey->start)
return -1;
if (eparent->end >= ekey->start)
return 1;
return 0;
}
static int
uvm_map_compare_key(void *ctx, const void *nparent, const void *vkey)
{
const struct vm_map_entry *eparent = nparent;
const vaddr_t va = *(const vaddr_t *) vkey;
if (eparent->start < va)
return -1;
if (eparent->end >= va)
return 1;
return 0;
}
static const rb_tree_ops_t uvm_map_tree_ops = {
.rbto_compare_nodes = uvm_map_compare_nodes,
.rbto_compare_key = uvm_map_compare_key,
.rbto_node_offset = offsetof(struct vm_map_entry, rb_node),
.rbto_context = NULL
};
/*
* uvm_rb_gap: return the gap size between our entry and next entry.
*/
static inline vsize_t
uvm_rb_gap(const struct vm_map_entry *entry)
{
KASSERT(entry->next != NULL);
return entry->next->start - entry->end;
}
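/*
* Worked example of the gap/maxgap caching (hypothetical addresses):
* with entries [0x1000,0x2000), [0x3000,0x4000) and [0x9000,0xa000),
* the first entry's gap is 0x1000 and the second entry's is 0x5000.
* Each node's maxgap caches the largest gap of itself and its subtree,
* which lets uvm_map_findspace() skip whole subtrees that cannot hold
* a request.
*/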
static vsize_t
uvm_rb_maxgap(const struct vm_map_entry *entry)
{
struct vm_map_entry *child;
vsize_t maxgap = entry->gap;
/*
* We need maxgap to be the largest gap of us or any of our
* descendants. Since each of our children's maxgap is the
* cached value of their largest gap of themselves or their
* descendants, we can just use that value and avoid recursing
* down the tree to calculate it.
*/
if ((child = LEFT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
maxgap = child->maxgap;
if ((child = RIGHT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
maxgap = child->maxgap;
return maxgap;
}
static void
uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *parent;
KASSERT(entry->gap == uvm_rb_gap(entry));
entry->maxgap = uvm_rb_maxgap(entry);
while ((parent = PARENT_ENTRY(map, entry)) != NULL) {
struct vm_map_entry *brother;
vsize_t maxgap = parent->gap;
unsigned int which;
KDASSERT(parent->gap == uvm_rb_gap(parent));
if (maxgap < entry->maxgap)
maxgap = entry->maxgap;
/*
* Since we work towards the root, we know entry's maxgap
* value is OK, but its brothers may now be out-of-date due
* to rebalancing. So refresh it.
*/
which = RB_POSITION(&entry->rb_node) ^ RB_DIR_OTHER;
brother = (struct vm_map_entry *)parent->rb_node.rb_nodes[which];
if (brother != NULL) {
KDASSERT(brother->gap == uvm_rb_gap(brother));
brother->maxgap = uvm_rb_maxgap(brother);
if (maxgap < brother->maxgap)
maxgap = brother->maxgap;
}
parent->maxgap = maxgap;
entry = parent;
}
}
static void
uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *ret __diagused;
entry->gap = entry->maxgap = uvm_rb_gap(entry);
if (entry->prev != &map->header)
entry->prev->gap = uvm_rb_gap(entry->prev);
ret = rb_tree_insert_node(&map->rb_tree, entry);
KASSERTMSG(ret == entry,
"uvm_rb_insert: map %p: duplicate entry %p", map, ret);
/*
* If the previous entry is not our immediate left child, then it's an
* ancestor and will be fixed up on the way to the root. We don't
* have to check entry->prev against &map->header since &map->header
* will never be in the tree.
*/
uvm_rb_fixup(map,
LEFT_ENTRY(entry) == entry->prev ? entry->prev : entry);
}
static void
uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *prev_parent = NULL, *next_parent = NULL;
/*
* If we are removing an interior node, then an adjacent node will
* be used to replace its position in the tree. Therefore we will
* need to fixup the tree starting at the parent of the replacement
* node. So record their parents for later use.
*/
if (entry->prev != &map->header)
prev_parent = PARENT_ENTRY(map, entry->prev);
if (entry->next != &map->header)
next_parent = PARENT_ENTRY(map, entry->next);
rb_tree_remove_node(&map->rb_tree, entry);
/*
* If the previous node has a new parent, fixup the tree starting
* at the previous node's old parent.
*/
if (entry->prev != &map->header) {
/*
* Update the previous entry's gap due to our absence.
*/
entry->prev->gap = uvm_rb_gap(entry->prev);
uvm_rb_fixup(map, entry->prev);
if (prev_parent != NULL && prev_parent != entry &&
prev_parent != PARENT_ENTRY(map, entry->prev))
uvm_rb_fixup(map, prev_parent);
}
/*
* If the next node has a new parent, fixup the tree starting
* at the next node's old parent.
*/
if (entry->next != &map->header) {
uvm_rb_fixup(map, entry->next);
if (next_parent != NULL && next_parent != entry &&
next_parent != PARENT_ENTRY(map, entry->next))
uvm_rb_fixup(map, next_parent);
}
}
#if defined(DEBUG)
int uvm_debug_check_map = 0;
int uvm_debug_check_rbtree = 0;
#define uvm_map_check(map, name) \
_uvm_map_check((map), (name), __FILE__, __LINE__)
static void
_uvm_map_check(struct vm_map *map, const char *name,
const char *file, int line)
{
if ((uvm_debug_check_map && _uvm_map_sanity(map)) ||
(uvm_debug_check_rbtree && _uvm_tree_sanity(map))) {
panic("uvm_map_check failed: \"%s\" map=%p (%s:%d)",
name, map, file, line);
}
}
#else /* defined(DEBUG) */
#define uvm_map_check(map, name) /* nothing */
#endif /* defined(DEBUG) */
#if defined(DEBUG) || defined(DDB)
int
_uvm_map_sanity(struct vm_map *map)
{
bool first_free_found = false;
bool hint_found = false;
const struct vm_map_entry *e;
struct vm_map_entry *hint = map->hint;
e = &map->header;
for (;;) {
if (map->first_free == e) {
first_free_found = true;
} else if (!first_free_found && e->next->start > e->end) {
printf("first_free %p should be %p\n",
map->first_free, e);
return -1;
}
if (hint == e) {
hint_found = true;
}
e = e->next;
if (e == &map->header) {
break;
}
}
if (!first_free_found) {
printf("stale first_free\n");
return -1;
}
if (!hint_found) {
printf("stale hint\n");
return -1;
}
return 0;
}
int
_uvm_tree_sanity(struct vm_map *map)
{
struct vm_map_entry *tmp, *trtmp;
int n = 0, i = 1;
for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
if (tmp->gap != uvm_rb_gap(tmp)) {
printf("%d/%d gap %#lx != %#lx %s\n",
n + 1, map->nentries,
(ulong)tmp->gap, (ulong)uvm_rb_gap(tmp),
tmp->next == &map->header ? "(last)" : "");
goto error;
}
/*
* If any entries are out of order, the unsigned subtraction in
* uvm_rb_gap() wraps around and tmp->gap will likely exceed the
* size of the map.
*/
if (tmp->gap >= vm_map_max(map) - vm_map_min(map)) {
printf("too large gap %zu\n", (size_t)tmp->gap);
goto error;
}
n++;
}
if (n != map->nentries) {
printf("nentries: %d vs %d\n", n, map->nentries);
goto error;
}
trtmp = NULL;
for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
if (tmp->maxgap != uvm_rb_maxgap(tmp)) {
printf("maxgap %#lx != %#lx\n",
(ulong)tmp->maxgap,
(ulong)uvm_rb_maxgap(tmp));
goto error;
}
if (trtmp != NULL && trtmp->start >= tmp->start) {
printf("corrupt: 0x%"PRIxVADDR"x >= 0x%"PRIxVADDR"x\n",
trtmp->start, tmp->start);
goto error;
}
trtmp = tmp;
}
for (tmp = map->header.next; tmp != &map->header;
tmp = tmp->next, i++) {
trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_LEFT);
if (trtmp == NULL)
trtmp = &map->header;
if (tmp->prev != trtmp) {
printf("lookup: %d: %p->prev=%p: %p\n",
i, tmp, tmp->prev, trtmp);
goto error;
}
trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_RIGHT);
if (trtmp == NULL)
trtmp = &map->header;
if (tmp->next != trtmp) {
printf("lookup: %d: %p->next=%p: %p\n",
i, tmp, tmp->next, trtmp);
goto error;
}
trtmp = rb_tree_find_node(&map->rb_tree, &tmp->start);
if (trtmp != tmp) {
printf("lookup: %d: %p - %p: %p\n", i, tmp, trtmp,
PARENT_ENTRY(map, tmp));
goto error;
}
}
return (0);
error:
return (-1);
}
#endif /* defined(DEBUG) || defined(DDB) */
/*
* vm_map_lock: acquire an exclusive (write) lock on a map.
*
* => The locking protocol provides for guaranteed upgrade from shared ->
* exclusive by whichever thread currently has the map marked busy.
* See "LOCKING PROTOCOL NOTES" in uvm_map.h. This is horrible; among
* other problems, it defeats any fairness guarantees provided by RW
* locks.
*/
void
vm_map_lock(struct vm_map *map)
{
for (;;) {
rw_enter(&map->lock, RW_WRITER);
if (map->busy == NULL || map->busy == curlwp) {
break;
}
mutex_enter(&map->misc_lock);
rw_exit(&map->lock);
if (map->busy != NULL) {
cv_wait(&map->cv, &map->misc_lock);
}
mutex_exit(&map->misc_lock);
}
map->timestamp++;
}
/*
* vm_map_lock_try: try to lock a map, failing if it is already locked.
*/
bool
vm_map_lock_try(struct vm_map *map)
{
if (!rw_tryenter(&map->lock, RW_WRITER)) {
return false;
}
if (map->busy != NULL) {
rw_exit(&map->lock);
return false;
}
map->timestamp++;
return true;
}
/*
* vm_map_unlock: release an exclusive lock on a map.
*/
void
vm_map_unlock(struct vm_map *map)
{
KASSERT(rw_write_held(&map->lock));
KASSERT(map->busy == NULL || map->busy == curlwp);
rw_exit(&map->lock);
}
/*
* vm_map_unbusy: mark the map as unbusy, and wake any waiters that
* want an exclusive lock.
*/
void
vm_map_unbusy(struct vm_map *map)
{
KASSERT(map->busy == curlwp);
/*
* Safe to clear 'busy' and 'waiters' with only a read lock held:
*
* o they can only be set with a write lock held
* o writers are blocked out with a read or write hold
* o at any time, only one thread owns the set of values
*/
mutex_enter(&map->misc_lock);
map->busy = NULL;
cv_broadcast(&map->cv);
mutex_exit(&map->misc_lock);
}
/*
* vm_map_lock_read: acquire a shared (read) lock on a map.
*/
void
vm_map_lock_read(struct vm_map *map)
{
rw_enter(&map->lock, RW_READER);
}
/*
* vm_map_unlock_read: release a shared lock on a map.
*/
void
vm_map_unlock_read(struct vm_map *map)
{
rw_exit(&map->lock);
}
/*
* vm_map_busy: mark a map as busy.
*
* => the caller must hold the map write locked
*/
void
vm_map_busy(struct vm_map *map)
{
KASSERT(rw_write_held(&map->lock));
KASSERT(map->busy == NULL);
map->busy = curlwp;
}
/*
* vm_map_locked_p: return true if the map is write locked.
*
* => only for debug purposes like KASSERTs.
* => should not be used to verify that a map is not locked.
*/
bool
vm_map_locked_p(struct vm_map *map)
{
return rw_write_held(&map->lock);
}
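/*
* Sketch of the busy protocol (hypothetical caller, not taken from this
* file): a thread that must drop the map lock while keeping other
* writers out marks the map busy first:
*
*	vm_map_lock(map);
*	vm_map_busy(map);
*	vm_map_unlock(map);
*	... work with the map unlocked ...
*	vm_map_lock(map);	(only the busy owner gets back in)
*	vm_map_unbusy(map);
*	vm_map_unlock(map);
*/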
/*
* uvm_mapent_alloc: allocate a map entry
*/
static struct vm_map_entry *
uvm_mapent_alloc(struct vm_map *map, int flags)
{
struct vm_map_entry *me;
int pflags = (flags & UVM_FLAG_NOWAIT) ? PR_NOWAIT : PR_WAITOK;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
me = pool_cache_get(&uvm_map_entry_cache, pflags);
if (__predict_false(me == NULL)) {
return NULL;
}
me->flags = 0;
UVMHIST_LOG(maphist, "<- new entry=%#jx [kentry=%jd]", (uintptr_t)me,
(map == kernel_map), 0, 0);
return me;
}
/*
* uvm_mapent_free: free map entry
*/
static void
uvm_mapent_free(struct vm_map_entry *me)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"<- freeing map entry=%#jx [flags=%#jx]",
(uintptr_t)me, me->flags, 0, 0);
pool_cache_put(&uvm_map_entry_cache, me);
}
/*
* uvm_mapent_copy: copy a map entry, preserving flags
*/
static inline void
uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
{
memcpy(dst, src, sizeof(*dst));
dst->flags = 0;
}
#if defined(DEBUG)
static void
_uvm_mapent_check(const struct vm_map_entry *entry, int line)
{
if (entry->start >= entry->end) {
goto bad;
}
if (UVM_ET_ISOBJ(entry)) {
if (entry->object.uvm_obj == NULL) {
goto bad;
}
} else if (UVM_ET_ISSUBMAP(entry)) {
if (entry->object.sub_map == NULL) {
goto bad;
}
} else {
if (entry->object.uvm_obj != NULL ||
entry->object.sub_map != NULL) {
goto bad;
}
}
if (!UVM_ET_ISOBJ(entry)) {
if (entry->offset != 0) {
goto bad;
}
}
return;
bad:
panic("%s: bad entry %p, line %d", __func__, entry, line);
}
#endif /* defined(DEBUG) */
/*
* uvm_map_entry_unwire: unwire a map entry
*
* => map should be locked by caller
*/
static inline void
uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry)
{
entry->wired_count = 0;
uvm_fault_unwire_locked(map, entry->start, entry->end);
}
/*
* wrapper for calling amap_ref()
*/
static inline void
uvm_map_reference_amap(struct vm_map_entry *entry, int flags)
{
amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff,
(entry->end - entry->start) >> PAGE_SHIFT, flags);
}
/*
* wrapper for calling amap_unref()
*/
static inline void
uvm_map_unreference_amap(struct vm_map_entry *entry, int flags)
{
amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff,
(entry->end - entry->start) >> PAGE_SHIFT, flags);
}
/*
* uvm_map_init: init mapping system at boot time.
*/
void
uvm_map_init(void)
{
/*
* first, init logging system.
*/
UVMHIST_FUNC(__func__);
UVMHIST_LINK_STATIC(maphist);
UVMHIST_LINK_STATIC(pdhist);
UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0);
/*
* initialize the global lock for kernel map entry.
*/
mutex_init(&uvm_kentry_lock, MUTEX_DRIVER, IPL_VM);
}
/*
* uvm_map_init_caches: init mapping system caches.
*/
void
uvm_map_init_caches(void)
{
/*
* initialize caches.
*/
pool_cache_bootstrap(&uvm_map_entry_cache, sizeof(struct vm_map_entry),
coherency_unit, 0, PR_LARGECACHE, "vmmpepl", NULL, IPL_NONE, NULL,
NULL, NULL);
}
/*
* clippers
*/
/*
* uvm_mapent_splitadj: adjust map entries for splitting, after uvm_mapent_copy.
*/
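/*
* For example (hypothetical values), splitting an entry covering
* [0x1000, 0x5000) at 0x3000 leaves entry1 as [0x1000, 0x3000) and
* entry2 as [0x3000, 0x5000), with entry2's object offset advanced by
* adj = 0x2000 when the entry is object-backed.
*/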
static void
uvm_mapent_splitadj(struct vm_map_entry *entry1, struct vm_map_entry *entry2,
vaddr_t splitat)
{
vaddr_t adj;
KASSERT(entry1->start < splitat);
KASSERT(splitat < entry1->end);
adj = splitat - entry1->start;
entry1->end = entry2->start = splitat;
if (entry1->aref.ar_amap) {
amap_splitref(&entry1->aref, &entry2->aref, adj);
}
if (UVM_ET_ISSUBMAP(entry1)) {
/* ... unlikely to happen, but play it safe */
uvm_map_reference(entry1->object.sub_map);
} else if (UVM_ET_ISOBJ(entry1)) {
KASSERT(entry1->object.uvm_obj != NULL); /* suppress coverity */
entry2->offset += adj;
if (entry1->object.uvm_obj->pgops &&
entry1->object.uvm_obj->pgops->pgo_reference)
entry1->object.uvm_obj->pgops->pgo_reference(
entry1->object.uvm_obj);
}
}
/*
* uvm_map_clip_start: ensure that the entry begins at or after
* the starting address, if it doesn't we split the entry.
*
* => caller should use UVM_MAP_CLIP_START macro rather than calling
* this directly
* => map must be locked by caller
*/
void
uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry,
vaddr_t start)
{
struct vm_map_entry *new_entry;
/* uvm_map_simplify_entry(map, entry); */ /* XXX */
uvm_map_check(map, "clip_start entry");
uvm_mapent_check(entry);
/*
* Split off the front portion. note that we must insert the new
* entry BEFORE this one, so that this entry has the specified
* starting address.
*/
new_entry = uvm_mapent_alloc(map, 0);
uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
uvm_mapent_splitadj(new_entry, entry, start);
uvm_map_entry_link(map, entry->prev, new_entry);
uvm_map_check(map, "clip_start leave");
}
/*
* uvm_map_clip_end: ensure that the entry ends at or before
* the ending address, if it doesn't we split the entry
*
* => caller should use UVM_MAP_CLIP_END macro rather than calling
* this directly
* => map must be locked by caller
*/
void
uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end)
{
struct vm_map_entry *new_entry;
uvm_map_check(map, "clip_end entry");
uvm_mapent_check(entry);
/*
* Create a new entry and insert it
* AFTER the specified entry
*/
new_entry = uvm_mapent_alloc(map, 0);
uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
uvm_mapent_splitadj(entry, new_entry, end);
uvm_map_entry_link(map, entry, new_entry);
uvm_map_check(map, "clip_end leave");
}
/*
* M A P - m a i n e n t r y p o i n t
*/
/*
* uvm_map: establish a valid mapping in a map
*
* => assume startp is page aligned.
* => assume size is a multiple of PAGE_SIZE.
* => assume sys_mmap provides enough of a "hint" to have us skip
* over text/data/bss area.
* => map must be unlocked (we will lock it)
* => <uobj,uoffset> value meanings (4 cases):
* [1] <NULL,uoffset> == uoffset is a hint for PMAP_PREFER
* [2] <NULL,UVM_UNKNOWN_OFFSET> == don't PMAP_PREFER
* [3] <uobj,uoffset> == normal mapping
* [4] <uobj,UVM_UNKNOWN_OFFSET> == uvm_map finds offset based on VA
*
* case [4] is for kernel mappings where we don't know the offset until
* we've found a virtual address. note that kernel object offsets are
* always relative to vm_map_min(kernel_map).
*
* => if `align' is non-zero, we align the virtual address to the specified
* alignment.
* this is provided as a mechanism for large pages.
*
* => XXXCDC: need way to map in external amap?
*/
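/*
* Example call (hypothetical caller): establish an anonymous,
* copy-on-write, read/write mapping at a kernel-chosen address:
*
*	vaddr_t va = 0;
*	error = uvm_map(map, &va, size, NULL, UVM_UNKNOWN_OFFSET, 0,
*	    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_COPY,
*		UVM_ADV_NORMAL, UVM_FLAG_COPYONW));
*/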
int
uvm_map(struct vm_map *map, vaddr_t *startp /* IN/OUT */, vsize_t size,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags)
{
struct uvm_map_args args;
struct vm_map_entry *new_entry;
int error;
KASSERT((size & PAGE_MASK) == 0);
KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
/*
* for pager_map, allocate the new entry first to avoid sleeping
* for memory while we have the map locked.
*/
new_entry = NULL;
if (map == pager_map) {
new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
if (__predict_false(new_entry == NULL))
return ENOMEM;
}
if (map == pager_map)
flags |= UVM_FLAG_NOMERGE;
error = uvm_map_prepare(map, *startp, size, uobj, uoffset, align,
flags, &args);
if (!error) {
error = uvm_map_enter(map, &args, new_entry);
*startp = args.uma_start;
} else if (new_entry) {
uvm_mapent_free(new_entry);
}
#if defined(DEBUG)
if (!error && VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
uvm_km_check_empty(map, *startp, *startp + size);
}
#endif /* defined(DEBUG) */
return error;
}
/*
* uvm_map_prepare:
*
* called with map unlocked.
* on success, returns the map locked.
*/
int
uvm_map_prepare(struct vm_map *map, vaddr_t start, vsize_t size,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags,
struct uvm_map_args *args)
{
struct vm_map_entry *prev_entry;
vm_prot_t prot = UVM_PROTECTION(flags);
vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%jx, flags=%#jx)",
(uintptr_t)map, start, size, flags);
UVMHIST_LOG(maphist, " uobj/offset %#jx/%jd", (uintptr_t)uobj,
uoffset,0,0);
/*
* detect a popular device driver bug.
*/
KASSERT(doing_shutdown || curlwp != NULL);
/*
* zero-sized mapping doesn't make any sense.
*/
KASSERT(size > 0);
KASSERT((~flags & (UVM_FLAG_NOWAIT | UVM_FLAG_WAITVA)) != 0);
uvm_map_check(map, "map entry");
/*
* check sanity of protection code
*/
if ((prot & maxprot) != prot) {
UVMHIST_LOG(maphist, "<- prot. failure: prot=%#jx, max=%#jx",
prot, maxprot,0,0);
return EACCES;
}
/*
* figure out where to put new VM range
*/
retry:
if (vm_map_lock_try(map) == false) {
if ((flags & UVM_FLAG_TRYLOCK) != 0) {
return EAGAIN;
}
vm_map_lock(map); /* could sleep here */
}
if (flags & UVM_FLAG_UNMAP) {
KASSERT(flags & UVM_FLAG_FIXED);
KASSERT((flags & UVM_FLAG_NOWAIT) == 0);
/*
* Set prev_entry to what it will need to be after any existing
* entries are removed later in uvm_map_enter().
*/
if (uvm_map_lookup_entry(map, start, &prev_entry)) {
if (start == prev_entry->start)
prev_entry = prev_entry->prev;
else
UVM_MAP_CLIP_END(map, prev_entry, start);
SAVE_HINT(map, map->hint, prev_entry);
}
} else {
prev_entry = uvm_map_findspace(map, start, size, &start,
uobj, uoffset, align, flags);
}
if (prev_entry == NULL) {
unsigned int timestamp;
timestamp = map->timestamp;
UVMHIST_LOG(maphist,"waiting va timestamp=%#jx",
timestamp,0,0,0);
map->flags |= VM_MAP_WANTVA;
vm_map_unlock(map);
/*
* try to reclaim kva and wait until someone does unmap.
* fragile locking here, so we awaken every second to
* recheck the condition.
*/
mutex_enter(&map->misc_lock);
while ((map->flags & VM_MAP_WANTVA) != 0 &&
map->timestamp == timestamp) {
if ((flags & UVM_FLAG_WAITVA) == 0) {
mutex_exit(&map->misc_lock);
UVMHIST_LOG(maphist,
"<- uvm_map_findspace failed!", 0,0,0,0);
return ENOMEM;
} else {
cv_timedwait(&map->cv, &map->misc_lock, hz);
}
}
mutex_exit(&map->misc_lock);
goto retry;
}
#ifdef PMAP_GROWKERNEL
/*
* If the kernel pmap can't map the requested space,
* then allocate more resources for it.
*/
if (map == kernel_map && uvm_maxkaddr < (start + size))
uvm_maxkaddr = pmap_growkernel(start + size);
#endif
UVMMAP_EVCNT_INCR(map_call);
/*
* if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER
* [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET. in
* either case we want to zero it before storing it in the map entry
* (because it looks strange and confusing when debugging...)
*
* if uobj is not null
* if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping
* and we do not need to change uoffset.
* if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset
* now (based on the starting address of the map). this case is
* for kernel object mappings where we don't know the offset until
* the virtual address is found (with uvm_map_findspace). the
* offset is the distance we are from the start of the map.
*/
if (uobj == NULL) {
uoffset = 0;
} else {
if (uoffset == UVM_UNKNOWN_OFFSET) {
KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
uoffset = start - vm_map_min(kernel_map);
}
}
args->uma_flags = flags;
args->uma_prev = prev_entry;
args->uma_start = start;
args->uma_size = size;
args->uma_uobj = uobj;
args->uma_uoffset = uoffset;
UVMHIST_LOG(maphist, "<- done!", 0,0,0,0);
return 0;
}
/*
* uvm_map_enter:
*
* called with map locked.
* unlock the map before returning.
*/
int
uvm_map_enter(struct vm_map *map, const struct uvm_map_args *args,
struct vm_map_entry *new_entry)
{
struct vm_map_entry *prev_entry = args->uma_prev;
struct vm_map_entry *dead = NULL, *dead_entries = NULL;
const uvm_flag_t flags = args->uma_flags;
const vm_prot_t prot = UVM_PROTECTION(flags);
const vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
const vm_inherit_t inherit = UVM_INHERIT(flags);
const int amapwaitflag = (flags & UVM_FLAG_NOWAIT) ?
AMAP_EXTEND_NOWAIT : 0;
const int advice = UVM_ADVICE(flags);
vaddr_t start = args->uma_start;
vsize_t size = args->uma_size;
struct uvm_object *uobj = args->uma_uobj;
voff_t uoffset = args->uma_uoffset;
const int kmap = (vm_map_pmap(map) == pmap_kernel());
int merged = 0;
int error;
int newetype;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%ju, flags=%#jx)",
(uintptr_t)map, start, size, flags);
UVMHIST_LOG(maphist, " uobj/offset %#jx/%jd", (uintptr_t)uobj,
uoffset,0,0);
KASSERT(map->hint == prev_entry); /* bimerge case assumes this */
KASSERT(vm_map_locked_p(map));
KASSERT((flags & (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP)) !=
(UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP));
if (uobj)
newetype = UVM_ET_OBJ;
else
newetype = 0;
if (flags & UVM_FLAG_COPYONW) {
newetype |= UVM_ET_COPYONWRITE;
if ((flags & UVM_FLAG_OVERLAY) == 0)
newetype |= UVM_ET_NEEDSCOPY;
}
/*
* For mappings with unmap, remove any old entries now. Adding the new
* entry cannot fail because that can only happen if UVM_FLAG_NOWAIT
* is set, and we do not support nowait and unmap together.
*/
if (flags & UVM_FLAG_UNMAP) {
KASSERT(flags & UVM_FLAG_FIXED);
uvm_unmap_remove(map, start, start + size, &dead_entries, 0);
#ifdef DEBUG
struct vm_map_entry *tmp_entry __diagused;
bool rv __diagused;
rv = uvm_map_lookup_entry(map, start, &tmp_entry);
KASSERT(!rv);
KASSERTMSG(prev_entry == tmp_entry,
"args %p prev_entry %p tmp_entry %p",
args, prev_entry, tmp_entry);
#endif
SAVE_HINT(map, map->hint, prev_entry);
}
/*
* try and insert in map by extending previous entry, if possible.
* XXX: we don't try and pull back the next entry. might be useful
* for a stack, but we are currently allocating our stack in advance.
*/
if (flags & UVM_FLAG_NOMERGE)
goto nomerge;
if (prev_entry->end == start &&
prev_entry != &map->header &&
UVM_ET_ISCOMPATIBLE(prev_entry, newetype, uobj, 0,
prot, maxprot, inherit, advice, 0)) {
if (uobj && prev_entry->offset +
(prev_entry->end - prev_entry->start) != uoffset)
goto forwardmerge;
/*
* can't extend a shared amap. note: no need to lock amap to
* look at refs since we don't care about its exact value.
* if it is one (i.e. we have the only reference) it will stay there
*/
if (prev_entry->aref.ar_amap &&
amap_refs(prev_entry->aref.ar_amap) != 1) {
goto forwardmerge;
}
if (prev_entry->aref.ar_amap) {
error = amap_extend(prev_entry, size,
amapwaitflag | AMAP_EXTEND_FORWARDS);
if (error)
goto nomerge;
}
if (kmap) {
UVMMAP_EVCNT_INCR(kbackmerge);
} else {
UVMMAP_EVCNT_INCR(ubackmerge);
}
UVMHIST_LOG(maphist," starting back merge", 0, 0, 0, 0);
/*
* drop our reference to uobj since we are extending a reference
* that we already have (the ref count can not drop to zero).
*/
if (uobj && uobj->pgops->pgo_detach)
uobj->pgops->pgo_detach(uobj);
/*
* Now that we've merged the entries, note that we've grown
* and our gap has shrunk. Then fix the tree.
*/
prev_entry->end += size;
prev_entry->gap -= size;
uvm_rb_fixup(map, prev_entry);
uvm_map_check(map, "map backmerged");
UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0);
merged++;
}
forwardmerge:
if (prev_entry->next->start == (start + size) &&
prev_entry->next != &map->header &&
UVM_ET_ISCOMPATIBLE(prev_entry->next, newetype, uobj, 0,
prot, maxprot, inherit, advice, 0)) {
if (uobj && prev_entry->next->offset != uoffset + size)
goto nomerge;
/*
* can't extend a shared amap. note: no need to lock amap to
* look at refs since we don't care about its exact value.
* if it is one (i.e. we have the only reference) it will stay there.
*
* note that we also can't merge two amaps, so if we
* merged with the previous entry which has an amap,
* and the next entry also has an amap, we give up.
*
* Interesting cases:
* amap, new, amap -> give up second merge (single fwd extend)
* amap, new, none -> double forward extend (extend again here)
* none, new, amap -> double backward extend (done here)
* uobj, new, amap -> single backward extend (done here)
*
* XXX should we attempt to deal with someone refilling
* the deallocated region between two entries that are
* backed by the same amap (ie, arefs is 2, "prev" and
* "next" refer to it, and adding this allocation will
* close the hole, thus restoring arefs to 1 and
* deallocating the "next" vm_map_entry)? -- @@@
*/
if (prev_entry->next->aref.ar_amap &&
(amap_refs(prev_entry->next->aref.ar_amap) != 1 ||
(merged && prev_entry->aref.ar_amap))) {
goto nomerge;
}
if (merged) {
/*
* Try to extend the amap of the previous entry to
* cover the next entry as well. If it doesn't work
* just skip on, don't actually give up, since we've
* already completed the back merge.
*/
if (prev_entry->aref.ar_amap) {
if (amap_extend(prev_entry,
prev_entry->next->end -
prev_entry->next->start,
amapwaitflag | AMAP_EXTEND_FORWARDS))
goto nomerge;
}
/*
* Try to extend the amap of the *next* entry
* back to cover the new allocation *and* the
* previous entry as well (the previous merge
* didn't have an amap already otherwise we
* wouldn't be checking here for an amap). If
* it doesn't work just skip on, again, don't
* actually give up, since we've already
* completed the back merge.
*/
else if (prev_entry->next->aref.ar_amap) {
if (amap_extend(prev_entry->next,
prev_entry->end -
prev_entry->start,
amapwaitflag | AMAP_EXTEND_BACKWARDS))
goto nomerge;
}
} else {
/*
* Pull the next entry's amap backwards to cover this
* new allocation.
*/
if (prev_entry->next->aref.ar_amap) {
error = amap_extend(prev_entry->next, size,
amapwaitflag | AMAP_EXTEND_BACKWARDS);
if (error)
goto nomerge;
}
}
if (merged) {
if (kmap) {
UVMMAP_EVCNT_DECR(kbackmerge);
UVMMAP_EVCNT_INCR(kbimerge);
} else {
UVMMAP_EVCNT_DECR(ubackmerge);
UVMMAP_EVCNT_INCR(ubimerge);
}
} else {
if (kmap) {
UVMMAP_EVCNT_INCR(kforwmerge);
} else {
UVMMAP_EVCNT_INCR(uforwmerge);
}
}
UVMHIST_LOG(maphist," starting forward merge", 0, 0, 0, 0);
/*
* drop our reference to uobj since we are extending a reference
* that we already have (the ref count can not drop to zero).
*/
if (uobj && uobj->pgops->pgo_detach)
uobj->pgops->pgo_detach(uobj);
if (merged) {
dead = prev_entry->next;
prev_entry->end = dead->end;
uvm_map_entry_unlink(map, dead);
if (dead->aref.ar_amap != NULL) {
prev_entry->aref = dead->aref;
dead->aref.ar_amap = NULL;
}
} else {
prev_entry->next->start -= size;
if (prev_entry != &map->header) {
prev_entry->gap -= size;
KASSERT(prev_entry->gap == uvm_rb_gap(prev_entry));
uvm_rb_fixup(map, prev_entry);
}
if (uobj)
prev_entry->next->offset = uoffset;
}
uvm_map_check(map, "map forwardmerged");
UVMHIST_LOG(maphist,"<- done forwardmerge", 0, 0, 0, 0);
merged++;
}
nomerge:
if (!merged) {
UVMHIST_LOG(maphist," allocating new map entry", 0, 0, 0, 0);
if (kmap) {
UVMMAP_EVCNT_INCR(knomerge);
} else {
UVMMAP_EVCNT_INCR(unomerge);
}
/*
* allocate new entry and link it in.
*/
if (new_entry == NULL) {
new_entry = uvm_mapent_alloc(map,
(flags & UVM_FLAG_NOWAIT));
if (__predict_false(new_entry == NULL)) {
error = ENOMEM;
goto done;
}
}
new_entry->start = start;
new_entry->end = new_entry->start + size;
new_entry->object.uvm_obj = uobj;
new_entry->offset = uoffset;
new_entry->etype = newetype;
if (flags & UVM_FLAG_NOMERGE) {
new_entry->flags |= UVM_MAP_NOMERGE;
}
new_entry->protection = prot;
new_entry->max_protection = maxprot;
new_entry->inheritance = inherit;
new_entry->wired_count = 0;
new_entry->advice = advice;
if (flags & UVM_FLAG_OVERLAY) {
/*
* to_add: for BSS we overallocate a little since we
* are likely to extend
*/
vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ?
UVM_AMAP_CHUNK << PAGE_SHIFT : 0;
struct vm_amap *amap = amap_alloc(size, to_add,
(flags & UVM_FLAG_NOWAIT));
if (__predict_false(amap == NULL)) {
error = ENOMEM;
goto done;
}
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = amap;
} else {
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = NULL;
}
uvm_map_entry_link(map, prev_entry, new_entry);
/*
* Update the free space hint
*/
if ((map->first_free == prev_entry) &&
(prev_entry->end >= new_entry->start))
map->first_free = new_entry;
new_entry = NULL;
}
map->size += size;
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
error = 0;
done:
vm_map_unlock(map);
if (new_entry) {
uvm_mapent_free(new_entry);
}
if (dead) {
KDASSERT(merged);
uvm_mapent_free(dead);
}
if (dead_entries)
uvm_unmap_detach(dead_entries, 0);
return error;
}
/*
* uvm_map_lookup_entry_bytree: lookup an entry in tree
*/
static inline bool
uvm_map_lookup_entry_bytree(struct vm_map *map, vaddr_t address,
struct vm_map_entry **entry /* OUT */)
{
struct vm_map_entry *prev = &map->header;
struct vm_map_entry *cur = ROOT_ENTRY(map);
while (cur) {
UVMMAP_EVCNT_INCR(mlk_treeloop);
if (address >= cur->start) {
if (address < cur->end) {
*entry = cur;
return true;
}
prev = cur;
cur = RIGHT_ENTRY(cur);
} else
cur = LEFT_ENTRY(cur);
}
*entry = prev;
return false;
}
/*
* uvm_map_lookup_entry: find map entry at or before an address
*
* => map must at least be read-locked by caller
* => entry is returned in "entry"
* => return value is true if address is in the returned entry
*/
bool
uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
struct vm_map_entry **entry /* OUT */)
{
struct vm_map_entry *cur;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,addr=%#jx,ent=%#jx)",
(uintptr_t)map, address, (uintptr_t)entry, 0);
/*
* make a quick check to see if we are already looking at
* the entry we want (which is usually the case). note also
* that we don't need to save the hint here... it is the
* same hint (unless we are at the header, in which case the
* hint didn't buy us anything anyway).
*/
cur = map->hint;
UVMMAP_EVCNT_INCR(mlk_call);
if (cur != &map->header && address >= cur->start && cur->end > address) {
UVMMAP_EVCNT_INCR(mlk_hint);
*entry = cur;
UVMHIST_LOG(maphist,"<- got it via hint (%#jx)",
(uintptr_t)cur, 0, 0, 0);
uvm_mapent_check(*entry);
return (true);
}
uvm_map_check(map, __func__);
/*
* lookup in the tree.
*/
UVMMAP_EVCNT_INCR(mlk_tree);
if (__predict_true(uvm_map_lookup_entry_bytree(map, address, entry))) {
SAVE_HINT(map, map->hint, *entry);
UVMHIST_LOG(maphist,"<- search got it (%#jx)",
(uintptr_t)cur, 0, 0, 0);
KDASSERT((*entry)->start <= address);
KDASSERT(address < (*entry)->end);
uvm_mapent_check(*entry);
return (true);
}
SAVE_HINT(map, map->hint, *entry);
UVMHIST_LOG(maphist,"<- failed!",0,0,0,0);
KDASSERT((*entry) == &map->header || (*entry)->end <= address);
KDASSERT((*entry)->next == &map->header ||
address < (*entry)->next->start);
return (false);
}
/*
* See if the range between start and start + length fits in the gap
* between entry->end and entry->next->start. Returns 1 if it fits,
* 0 if it doesn't fit, and -1 if the address wraps around.
*/
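/*
* Example of the UVM_FLAG_COLORMATCH adjustment (hypothetical values):
* with 4 page colors (colormask 3) and a requested color of 2, a hint
* on page 13 (color 1) is moved to page 14 bottom-up, or back to page
* 10 topdown, before the fit check below.
*/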
static int
uvm_map_space_avail(vaddr_t *start, vsize_t length, voff_t uoffset,
vsize_t align, int flags, int topdown, struct vm_map_entry *entry)
{
vaddr_t end;
#ifdef PMAP_PREFER
/*
* push start address forward as needed to avoid VAC alias problems.
* we only do this if a valid offset is specified.
*/
if (uoffset != UVM_UNKNOWN_OFFSET)
PMAP_PREFER(uoffset, start, length, topdown);
#endif
if ((flags & UVM_FLAG_COLORMATCH) != 0) {
KASSERT(align < uvmexp.ncolors);
if (uvmexp.ncolors > 1) {
const u_int colormask = uvmexp.colormask;
const u_int colorsize = colormask + 1;
vaddr_t hint = atop(*start);
const u_int color = hint & colormask;
if (color != align) {
hint -= color; /* adjust to color boundary */
KASSERT((hint & colormask) == 0);
if (topdown) {
if (align > color)
hint -= colorsize;
} else {
if (align < color)
hint += colorsize;
}
*start = ptoa(hint + align); /* adjust to color */
}
}
} else {
KASSERT(powerof2(align));
uvm_map_align_va(start, align, topdown);
/*
* XXX Should we PMAP_PREFER() here again?
* eh...i think we're okay
*/
}
/*
* Find the end of the proposed new region. Be sure we didn't
* wrap around the address; if so, we lose. Otherwise, if the
* proposed new region fits before the next entry, we win.
*/
end = *start + length;
if (end < *start)
return (-1);
if (entry->next->start >= end && *start >= entry->end)
return (1);
return (0);
}
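/*
* uvm_findspace_invariants: assert that uvm_map_findspace()'s search is
* monotonic, i.e. the candidate hint only moves down for topdown maps
* and only moves up otherwise.
*/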
static void
uvm_findspace_invariants(struct vm_map *map, vaddr_t orig_hint, vaddr_t length,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, int flags,
vaddr_t hint, struct vm_map_entry *entry, int line)
{
const int topdown = map->flags & VM_MAP_TOPDOWN;
KASSERTMSG( topdown || hint >= orig_hint,
"map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR
" length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE
" flags=%#x entry=%p (uvm_map_findspace line %d)",
map, hint, orig_hint,
length, uobj, (unsigned long long)uoffset, align,
flags, entry, line);
#ifndef __sh3__ /* XXXRO: kern/51254 */
KASSERTMSG(!topdown || hint <= orig_hint,
#else
if (__predict_false(!(!topdown || hint <= orig_hint)))
printf(
#endif
"map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR
" length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE
" flags=%#x entry=%p (uvm_map_findspace line %d)",
map, hint, orig_hint,
length, uobj, (unsigned long long)uoffset, align,
flags, entry, line);
}
/*
* uvm_map_findspace: find "length" sized space in "map".
*
* => "hint" is a hint about where we want it, unless UVM_FLAG_FIXED is
* set in "flags" (in which case we insist on using "hint").
* => "result" is VA returned
* => uobj/uoffset are to be used to handle VAC alignment, if required
* => if "align" is non-zero, we attempt to align to that value.
* => caller must at least have read-locked map
* => returns NULL on failure, or pointer to prev. map entry if success
* => note this is a cross between the old vm_map_findspace and vm_map_find
*/
struct vm_map_entry *
uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length,
vaddr_t *result /* OUT */, struct uvm_object *uobj, voff_t uoffset,
vsize_t align, int flags)
{
#define INVARIANTS() \
uvm_findspace_invariants(map, orig_hint, length, uobj, uoffset, align,\
flags, hint, entry, __LINE__)
struct vm_map_entry *entry = NULL;
struct vm_map_entry *child, *prev, *tmp;
vaddr_t orig_hint __diagused;
const int topdown = map->flags & VM_MAP_TOPDOWN;
int avail;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, hint=%#jx, len=%ju, flags=%#jx...",
(uintptr_t)map, hint, length, flags);
UVMHIST_LOG(maphist, " uobj=%#jx, uoffset=%#jx, align=%#jx)",
(uintptr_t)uobj, uoffset, align, 0);
KASSERT((flags & UVM_FLAG_COLORMATCH) != 0 || powerof2(align));
KASSERT((flags & UVM_FLAG_COLORMATCH) == 0 || align < uvmexp.ncolors);
KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
uvm_map_check(map, "map_findspace entry");
/*
* Clamp the hint to the VM map's min/max address, and remember the
* clamped original hint. If we are aligning, then we
* may have to try again with no alignment constraint if we
* fail the first time.
*
* We use the original hint to verify later that the search has
* been monotonic -- that is, nonincreasing or nondecreasing,
* according to topdown or !topdown respectively. But the
* clamping is not monotonic.
*/
if (hint < vm_map_min(map)) { /* check ranges ... */
if (flags & UVM_FLAG_FIXED) {
UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0);
return (NULL);
}
hint = vm_map_min(map);
}
if (hint > vm_map_max(map)) {
UVMHIST_LOG(maphist,"<- VA %#jx > range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
return (NULL);
}
orig_hint = hint;
INVARIANTS();
UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
/*
* hint may not be aligned properly; we need to round it up or down
* before proceeding further.
*/
if ((flags & UVM_FLAG_COLORMATCH) == 0) {
uvm_map_align_va(&hint, align, topdown);
INVARIANTS();
}
UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
/*
* Look for the first possible address; if there's already
* something at this address, we have to start after it.
*/
/*
* @@@: there are four, no, eight cases to consider.
*
* 0: found, fixed, bottom up -> fail
* 1: found, fixed, top down -> fail
* 2: found, not fixed, bottom up -> start after entry->end,
* loop up
* 3: found, not fixed, top down -> start before entry->start,
* loop down
* 4: not found, fixed, bottom up -> check entry->next->start, fail
* 5: not found, fixed, top down -> check entry->next->start, fail
* 6: not found, not fixed, bottom up -> check entry->next->start,
* loop up
* 7: not found, not fixed, top down -> check entry->next->start,
* loop down
*
* as you can see, it reduces to roughly five cases, and that
* adding top down mapping only adds one unique case (without
* it, there would be four cases).
*/
if ((flags & UVM_FLAG_FIXED) == 0 &&
hint == (topdown ? vm_map_max(map) : vm_map_min(map))) {
/*
* The uvm_map_findspace algorithm is monotonic -- for
* topdown VM it starts with a high hint and returns a
* lower free address; for !topdown VM it starts with a
* low hint and returns a higher free address. As an
* optimization, start with the first (highest for
* topdown, lowest for !topdown) free address.
*
* XXX This `optimization' probably doesn't actually do
* much in practice unless userland explicitly passes
* the VM map's minimum or maximum address, which
* varies from machine to machine (VM_MAX/MIN_ADDRESS,
* e.g. 0x7fbfdfeff000 on amd64 but 0xfffffffff000 on
* aarch64) and may vary according to other factors
* like sysctl vm.user_va0_disable. In particular, if
* the user specifies 0 as a hint to mmap, then mmap
* will choose a default address which is usually _not_
* VM_MAX/MIN_ADDRESS but something else instead like
* VM_MAX_ADDRESS - stack size - guard page overhead,
* in which case this branch is never hit.
*
* In fact, this branch appears to have been broken for
* two decades between when topdown was introduced in
* ~2003 and when it was adapted to handle the topdown
* case without violating the monotonicity assertion in
* 2022. Maybe Someone^TM should either ditch the
* optimization or find a better way to do it.
*/
entry = map->first_free;
} else {
if (uvm_map_lookup_entry(map, hint, &entry)) {
/* "hint" address already in use ... */
if (flags & UVM_FLAG_FIXED) {
UVMHIST_LOG(maphist, "<- fixed & VA in use",
0, 0, 0, 0);
return (NULL);
}
if (topdown)
/* Start from lower gap. */
entry = entry->prev;
} else if (flags & UVM_FLAG_FIXED) {
if (entry->next->start >= hint + length &&
hint + length > hint)
goto found;
/* "hint" address is a gap, but it is too small */
UVMHIST_LOG(maphist, "<- fixed mapping failed",
0, 0, 0, 0);
return (NULL); /* only one shot at it ... */
} else {
/*
* See if given hint fits in this gap.
*/
avail = uvm_map_space_avail(&hint, length,
uoffset, align, flags, topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
if (topdown) {
/*
* Still there is a chance to fit
* if hint > entry->end.
*/
} else {
/* Start from higher gap. */
entry = entry->next;
if (entry == &map->header)
goto notfound;
goto nextgap;
}
}
}
/*
* Note that the UVM_FLAG_FIXED case is already handled.
*/
KDASSERT((flags & UVM_FLAG_FIXED) == 0);
/* Try to find the space in the red-black tree */
/* Check slot before any entry */
if (topdown) {
KASSERTMSG(entry->next->start >= vm_map_min(map),
"map=%p entry=%p entry->next=%p"
" entry->next->start=0x%"PRIxVADDR" min=0x%"PRIxVADDR,
map, entry, entry->next,
entry->next->start, vm_map_min(map));
if (length > entry->next->start - vm_map_min(map))
hint = vm_map_min(map); /* XXX goto wraparound? */
else
hint = entry->next->start - length;
KASSERT(hint >= vm_map_min(map));
} else {
hint = entry->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align, flags,
topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
nextgap:
KDASSERT((flags & UVM_FLAG_FIXED) == 0);
/* If there is not enough space in the whole tree, we fail */
tmp = ROOT_ENTRY(map);
if (tmp == NULL || tmp->maxgap < length)
goto notfound;
prev = NULL; /* previous candidate */
/* Find an entry close to hint that has enough space */
for (; tmp;) {
KASSERT(tmp->next->start == tmp->end + tmp->gap);
if (topdown) {
if (tmp->next->start < hint + length &&
(prev == NULL || tmp->end > prev->end)) {
if (tmp->gap >= length)
prev = tmp;
else if ((child = LEFT_ENTRY(tmp)) != NULL
&& child->maxgap >= length)
prev = tmp;
}
} else {
if (tmp->end >= hint &&
(prev == NULL || tmp->end < prev->end)) {
if (tmp->gap >= length)
prev = tmp;
else if ((child = RIGHT_ENTRY(tmp)) != NULL
&& child->maxgap >= length)
prev = tmp;
}
}
if (tmp->next->start < hint + length)
child = RIGHT_ENTRY(tmp);
else if (tmp->end > hint)
child = LEFT_ENTRY(tmp);
else {
if (tmp->gap >= length)
break;
if (topdown)
child = LEFT_ENTRY(tmp);
else
child = RIGHT_ENTRY(tmp);
}
if (child == NULL || child->maxgap < length)
break;
tmp = child;
}
if (tmp != NULL && tmp->start < hint && hint < tmp->next->start) {
/*
* Check if the entry that we found satisfies the
* space requirement
*/
if (topdown) {
if (hint > tmp->next->start - length)
hint = tmp->next->start - length;
} else {
if (hint < tmp->end)
hint = tmp->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, tmp);
INVARIANTS();
switch (avail) {
case 1:
entry = tmp;
goto found;
case -1:
goto wraparound;
}
if (tmp->gap >= length)
goto listsearch;
}
if (prev == NULL)
goto notfound;
if (topdown) {
KASSERT(orig_hint >= prev->next->start - length ||
prev->next->start - length > prev->next->start);
hint = prev->next->start - length;
} else {
KASSERT(orig_hint <= prev->end);
hint = prev->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, prev);
INVARIANTS();
switch (avail) {
case 1:
entry = prev;
goto found;
case -1:
goto wraparound;
}
if (prev->gap >= length)
goto listsearch;
if (topdown)
tmp = LEFT_ENTRY(prev);
else
tmp = RIGHT_ENTRY(prev);
for (;;) {
KASSERT(tmp);
KASSERTMSG(tmp->maxgap >= length,
"tmp->maxgap=0x%"PRIxVSIZE" length=0x%"PRIxVSIZE,
tmp->maxgap, length);
if (topdown)
child = RIGHT_ENTRY(tmp);
else
child = LEFT_ENTRY(tmp);
if (child && child->maxgap >= length) {
tmp = child;
continue;
}
if (tmp->gap >= length)
break;
if (topdown)
tmp = LEFT_ENTRY(tmp);
else
tmp = RIGHT_ENTRY(tmp);
}
if (topdown) {
KASSERT(orig_hint >= tmp->next->start - length ||
tmp->next->start - length > tmp->next->start);
hint = tmp->next->start - length;
} else {
KASSERT(orig_hint <= tmp->end);
hint = tmp->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, tmp);
INVARIANTS();
switch (avail) {
case 1:
entry = tmp;
goto found;
case -1:
goto wraparound;
}
/*
* The tree fails to find an entry because of offset or alignment
* restrictions. Search the list instead.
*/
listsearch:
/*
* Look through the rest of the map, trying to fit a new region in
* the gap between existing regions, or after the very last region.
* note: entry->end = base VA of current gap,
* entry->next->start = VA of end of current gap
*/
INVARIANTS();
for (;;) {
/* Update hint for current gap. */
hint = topdown ? entry->next->start - length : entry->end;
INVARIANTS();
/* See if it fits. */
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
/* Advance to next/previous gap */
if (topdown) {
if (entry == &map->header) {
UVMHIST_LOG(maphist, "<- failed (off start)",
0,0,0,0);
goto notfound;
}
entry = entry->prev;
} else {
entry = entry->next;
if (entry == &map->header) {
UVMHIST_LOG(maphist, "<- failed (off end)",
0,0,0,0);
goto notfound;
}
}
}
found:
SAVE_HINT(map, map->hint, entry);
*result = hint;
UVMHIST_LOG(maphist,"<- got it! (result=%#jx)", hint, 0,0,0);
INVARIANTS();
KASSERT(entry->end <= hint);
KASSERT(hint + length <= entry->next->start);
return (entry);
wraparound:
UVMHIST_LOG(maphist, "<- failed (wrap around)", 0,0,0,0);
return (NULL);
notfound:
UVMHIST_LOG(maphist, "<- failed (notfound)", 0,0,0,0);
return (NULL);
#undef INVARIANTS
}
/*
* U N M A P - m a i n h e l p e r f u n c t i o n s
*/
/*
* uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "stop")
*
* => caller must check alignment and size
* => map must be locked by caller
* => we return a list of map entries that we've removed from the map
* in "entry_list"
*/
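/*
* Typical caller sketch (roughly what uvm_unmap() does): remove the
* entries with the map locked, then drop the references with the map
* unlocked:
*
*	vm_map_lock(map);
*	uvm_unmap_remove(map, start, end, &dead_entries, 0);
*	vm_map_unlock(map);
*	if (dead_entries != NULL)
*		uvm_unmap_detach(dead_entries, 0);
*/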
void
uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map_entry **entry_list /* OUT */, int flags)
{
struct vm_map_entry *entry, *first_entry, *next;
vaddr_t len;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx, start=%#jx, end=%#jx)",
(uintptr_t)map, start, end, 0);
VM_MAP_RANGE_CHECK(map, start, end);
uvm_map_check(map, "unmap_remove entry");
/*
* find first entry
*/
if (uvm_map_lookup_entry(map, start, &first_entry) == true) {
/* clip and go... */
entry = first_entry;
UVM_MAP_CLIP_START(map, entry, start);
/* critical! prevents stale hint */
SAVE_HINT(map, entry, entry->prev);
} else {
entry = first_entry->next;
}
/*
* save the free space hint
*/
if (map->first_free != &map->header &&
map->first_free->start >= start)
map->first_free = entry->prev;
/*
* note: we now re-use first_entry for a different task. we remove
* a number of map entries from the map and save them in a linked
* list headed by "first_entry". once we remove them from the map
* the caller should unlock the map and drop the references to the
* backing objects [c.f. uvm_unmap_detach]. the object is to
* separate unmapping from reference dropping. why?
* [1] the map has to be locked for unmapping
* [2] the map need not be locked for reference dropping
* [3] dropping references may trigger pager I/O, and if we hit
* a pager that does synchronous I/O we may have to wait for it.
* [4] we would like all waiting for I/O to occur with maps unlocked
* so that we don't block other threads.
*/
first_entry = NULL;
*entry_list = NULL;
/*
* break up the area into map entry sized regions and unmap. note
* that all mappings have to be removed before we can even consider
* dropping references to amaps or VM objects (otherwise we could end
* up with a mapping to a page on the free list which would be very bad)
*/
while ((entry != &map->header) && (entry->start < end)) {
KASSERT((entry->flags & UVM_MAP_STATIC) == 0);
UVM_MAP_CLIP_END(map, entry, end);
next = entry->next;
len = entry->end - entry->start;
/*
* unwire before removing addresses from the pmap; otherwise
* unwiring will put the entries back into the pmap (XXX).
*/
if (VM_MAPENT_ISWIRED(entry)) {
uvm_map_entry_unwire(map, entry);
}
if (flags & UVM_FLAG_VAONLY) {
/* nothing */
} else if ((map->flags & VM_MAP_PAGEABLE) == 0) {
/*
* if the map is non-pageable, any pages mapped there
* must be wired and entered with pmap_kenter_pa(),
* and we should free any such pages immediately.
* this is mostly used for kmem_map.
*/
KASSERT(vm_map_pmap(map) == pmap_kernel());
uvm_km_pgremove_intrsafe(map, entry->start, entry->end);
} else if (UVM_ET_ISOBJ(entry) &&
UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
panic("%s: kernel object %p %p\n",
__func__, map, entry);
} else if (UVM_ET_ISOBJ(entry) || entry->aref.ar_amap) {
/*
* remove mappings the standard way. lock object
* and/or amap to ensure vm_page state does not
* change while in pmap_remove().
*/
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(entry, RW_WRITER);
#else
uvm_map_lock_entry(entry, RW_READER);
#endif
pmap_remove(map->pmap, entry->start, entry->end);
/*
* note: if map is dying, leave pmap_update() for
* later. if the map is to be reused (exec) then
* pmap_update() will be called. if the map is
* being disposed of (exit) then pmap_destroy()
* will be called.
*/
if ((map->flags & VM_MAP_DYING) == 0) {
pmap_update(vm_map_pmap(map));
} else {
KASSERT(vm_map_pmap(map) != pmap_kernel());
}
uvm_map_unlock_entry(entry);
}
#if defined(UVMDEBUG)
/*
* check if there are any remaining mappings,
* which would be a bug in the caller.
*/
vaddr_t va;
for (va = entry->start; va < entry->end;
va += PAGE_SIZE) {
if (pmap_extract(vm_map_pmap(map), va, NULL)) {
panic("%s: %#"PRIxVADDR" has mapping",
__func__, va);
}
}
if (VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
uvm_km_check_empty(map, entry->start, entry->end);
}
#endif /* defined(UVMDEBUG) */
/*
* remove entry from map and put it on our list of entries
* that we've nuked. then go to next entry.
*/
UVMHIST_LOG(maphist, " removed map entry %#jx",
(uintptr_t)entry, 0, 0, 0);
/* critical! prevents stale hint */
SAVE_HINT(map, entry, entry->prev);
uvm_map_entry_unlink(map, entry);
KASSERT(map->size >= len);
map->size -= len;
entry->prev = NULL;
entry->next = first_entry;
first_entry = entry;
entry = next;
}
uvm_map_check(map, "unmap_remove leave");
/*
* now we've cleaned up the map and are ready for the caller to drop
* references to the mapped objects.
*/
*entry_list = first_entry;
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
if (map->flags & VM_MAP_WANTVA) {
mutex_enter(&map->misc_lock);
map->flags &= ~VM_MAP_WANTVA;
cv_broadcast(&map->cv);
mutex_exit(&map->misc_lock);
}
}
/*
* uvm_unmap_detach: drop references in a chain of map entries
*
* => we will free the map entries as we traverse the list.
*/
void
uvm_unmap_detach(struct vm_map_entry *first_entry, int flags)
{
struct vm_map_entry *next_entry;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
while (first_entry) {
KASSERT(!VM_MAPENT_ISWIRED(first_entry));
UVMHIST_LOG(maphist,
" detach %#jx: amap=%#jx, obj=%#jx, submap?=%jd",
(uintptr_t)first_entry,
(uintptr_t)first_entry->aref.ar_amap,
(uintptr_t)first_entry->object.uvm_obj,
UVM_ET_ISSUBMAP(first_entry));
/*
* drop reference to amap, if we've got one
*/
if (first_entry->aref.ar_amap)
uvm_map_unreference_amap(first_entry, flags);
/*
* drop reference to our backing object, if we've got one
*/
KASSERT(!UVM_ET_ISSUBMAP(first_entry));
if (UVM_ET_ISOBJ(first_entry) &&
first_entry->object.uvm_obj->pgops->pgo_detach) {
(*first_entry->object.uvm_obj->pgops->pgo_detach)
(first_entry->object.uvm_obj);
}
next_entry = first_entry->next;
uvm_mapent_free(first_entry);
first_entry = next_entry;
}
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
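/*
 * Example (illustrative sketch; example_unmap_range() is hypothetical and
 * nothing calls it): the two-phase protocol described above -- remove the
 * entries with the map locked, then drop the object/amap references with
 * the map unlocked.  uvm_unmap1() later in this file wraps the same steps.
 */
#if 0
static void
example_unmap_range(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	struct vm_map_entry *dead_entries;

	vm_map_lock(map);		/* [1] unmapping needs the lock */
	uvm_unmap_remove(map, start, end, &dead_entries, 0);
	vm_map_unlock(map);		/* [2]-[4] drop references unlocked */
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
}
#endif /* illustrative sketch */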
/*
* E X T R A C T I O N F U N C T I O N S
*/
/*
* uvm_map_reserve: reserve space in a vm_map for future use.
*
* => we reserve space in a map by putting a dummy map entry in the
* map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE)
* => map should be unlocked (we will write lock it)
* => we return true if we were able to reserve space
* => XXXCDC: should be inline?
*/
int
uvm_map_reserve(struct vm_map *map, vsize_t size,
vaddr_t offset /* hint for pmap_prefer */,
vsize_t align /* alignment */,
vaddr_t *raddr /* IN:hint, OUT: reserved VA */,
uvm_flag_t flags /* UVM_FLAG_FIXED or UVM_FLAG_COLORMATCH or 0 */)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, size=%#jx, offset=%#jx, addr=%#jx)",
(uintptr_t)map, size, offset, (uintptr_t)raddr);
size = round_page(size);
/*
* reserve some virtual space.
*/
if (uvm_map(map, raddr, size, NULL, offset, align,
UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_NOMERGE|flags)) != 0) {
UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0);
return (false);
}
UVMHIST_LOG(maphist, "<- done (*raddr=%#jx)", *raddr,0,0,0);
return (true);
}
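/*
 * Example (illustrative sketch; example_reserve_va() is hypothetical):
 * reserving virtual space for later use.  The PROT_NONE placeholder entry
 * installed here is what uvm_map_replace() below exchanges for real
 * mappings, e.g. on behalf of uvm_map_extract().
 */
#if 0
static int
example_reserve_va(struct vm_map *map, vsize_t len, vaddr_t *vap)
{
	vaddr_t va = vm_map_min(map);	/* starting hint only */

	if (!uvm_map_reserve(map, len, 0, 0, &va, 0))
		return ENOMEM;
	*vap = va;
	return 0;
}
#endif /* illustrative sketch */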
/*
* uvm_map_replace: replace a reserved (blank) area of memory with
* real mappings.
*
* => caller must WRITE-LOCK the map
* => we return true if replacement was a success
* => we expect the newents chain to have nnewents entries on it and
* we expect newents->prev to point to the last entry on the list
* => note newents is allowed to be NULL
*/
static int
uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map_entry *newents, int nnewents, vsize_t nsize,
struct vm_map_entry **oldentryp)
{
struct vm_map_entry *oldent, *last;
uvm_map_check(map, "map_replace entry");
/*
* first find the blank map entry at the specified address
*/
if (!uvm_map_lookup_entry(map, start, &oldent)) {
return (false);
}
/*
* check to make sure we have a proper blank entry
*/
if (end < oldent->end) {
UVM_MAP_CLIP_END(map, oldent, end);
}
if (oldent->start != start || oldent->end != end ||
oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) {
return (false);
}
#ifdef DIAGNOSTIC
/*
* sanity check the newents chain
*/
{
struct vm_map_entry *tmpent = newents;
int nent = 0;
vsize_t sz = 0;
vaddr_t cur = start;
while (tmpent) {
nent++;
sz += tmpent->end - tmpent->start;
if (tmpent->start < cur)
panic("uvm_map_replace1");
if (tmpent->start >= tmpent->end || tmpent->end > end) {
panic("uvm_map_replace2: "
"tmpent->start=%#"PRIxVADDR
", tmpent->end=%#"PRIxVADDR
", end=%#"PRIxVADDR,
tmpent->start, tmpent->end, end);
}
cur = tmpent->end;
if (tmpent->next) {
if (tmpent->next->prev != tmpent)
panic("uvm_map_replace3");
} else {
if (newents->prev != tmpent)
panic("uvm_map_replace4");
}
tmpent = tmpent->next;
}
if (nent != nnewents)
panic("uvm_map_replace5");
if (sz != nsize)
panic("uvm_map_replace6");
}
#endif
/*
* map entry is a valid blank! replace it. (this does all the
* work of map entry link/unlink...).
*/
if (newents) {
last = newents->prev;
/* critical: flush stale hints out of map */
SAVE_HINT(map, map->hint, newents);
if (map->first_free == oldent)
map->first_free = last;
last->next = oldent->next;
last->next->prev = last;
/* Fix RB tree */
uvm_rb_remove(map, oldent);
newents->prev = oldent->prev;
newents->prev->next = newents;
map->nentries = map->nentries + (nnewents - 1);
/* Fixup the RB tree */
{
int i;
struct vm_map_entry *tmp;
tmp = newents;
for (i = 0; i < nnewents && tmp; i++) {
uvm_rb_insert(map, tmp);
tmp = tmp->next;
}
}
} else {
/* NULL list of new entries: just remove the old one */
clear_hints(map, oldent);
uvm_map_entry_unlink(map, oldent);
}
map->size -= end - start - nsize;
uvm_map_check(map, "map_replace leave");
/*
* now we can free the old blank entry and return.
*/
*oldentryp = oldent;
return (true);
}
/*
* uvm_map_extract: extract a mapping from a map and put it somewhere
* (maybe removing the old mapping)
*
* => maps should be unlocked (we will write lock them)
* => returns 0 on success, error code otherwise
* => start must be page aligned
* => len must be page sized
* => flags:
* UVM_EXTRACT_REMOVE: remove mappings from srcmap
* UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only)
* UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs
* UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
* UVM_EXTRACT_PROT_ALL: set prot to UVM_PROT_ALL as we go
* >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<<
* >>>NOTE: QREF's must be unmapped via the QREF path, thus should only
* be used from within the kernel in a kernel level map <<<
*/
int
uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
struct vm_map *dstmap, vaddr_t *dstaddrp, int flags)
{
vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge;
struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry,
*deadentry, *oldentry;
struct vm_map_entry *resentry = NULL; /* a dummy reservation entry */
vsize_t elen __unused;
int nchain, error, copy_ok;
vsize_t nsize;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(srcmap=%#jx,start=%#jx, len=%#jx",
(uintptr_t)srcmap, start, len, 0);
UVMHIST_LOG(maphist," ...,dstmap=%#jx, flags=%#jx)",
(uintptr_t)dstmap, flags, 0, 0);
/*
* step 0: sanity check: start must be on a page boundary, length
* must be page sized. can't ask for CONTIG/QREF if you asked for
* REMOVE.
*/
KASSERTMSG((start & PAGE_MASK) == 0, "start=0x%"PRIxVADDR, start);
KASSERTMSG((len & PAGE_MASK) == 0, "len=0x%"PRIxVADDR, len);
KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 ||
(flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0);
/*
* step 1: reserve space in the target map for the extracted area
*/
if ((flags & UVM_EXTRACT_RESERVED) == 0) {
dstaddr = vm_map_min(dstmap);
if (!uvm_map_reserve(dstmap, len, start,
atop(start) & uvmexp.colormask, &dstaddr,
UVM_FLAG_COLORMATCH))
return (ENOMEM);
KASSERT((atop(start ^ dstaddr) & uvmexp.colormask) == 0);
*dstaddrp = dstaddr; /* pass address back to caller */
UVMHIST_LOG(maphist, " dstaddr=%#jx", dstaddr,0,0,0);
} else {
dstaddr = *dstaddrp;
}
/*
* step 2: setup for the extraction process loop by init'ing the
* map entry chain, locking src map, and looking up the first useful
* entry in the map.
*/
end = start + len;
newend = dstaddr + len;
chain = endchain = NULL;
nchain = 0;
nsize = 0;
vm_map_lock(srcmap);
if (uvm_map_lookup_entry(srcmap, start, &entry)) {
/* "start" is within an entry */
if (flags & UVM_EXTRACT_QREF) {
/*
* for quick references we don't clip the entry, so
* the entry may map space "before" the starting
* virtual address... this is the "fudge" factor
* (which can be non-zero only the first time
* through the "while" loop in step 3).
*/
fudge = start - entry->start;
} else {
/*
* normal reference: we clip the map to fit (thus
* fudge is zero)
*/
UVM_MAP_CLIP_START(srcmap, entry, start);
SAVE_HINT(srcmap, srcmap->hint, entry->prev);
fudge = 0;
}
} else {
/* "start" is not within an entry ... skip to next entry */
if (flags & UVM_EXTRACT_CONTIG) {
error = EINVAL;
goto bad; /* definite hole here ... */
}
entry = entry->next;
fudge = 0;
}
/* save values from srcmap for step 6 */
orig_entry = entry;
orig_fudge = fudge;
/*
* step 3: now start looping through the map entries, extracting
* as we go.
*/
while (entry->start < end && entry != &srcmap->header) {
/* if we are not doing a quick reference, clip it */
if ((flags & UVM_EXTRACT_QREF) == 0)
UVM_MAP_CLIP_END(srcmap, entry, end);
/* clear needs_copy (allow chunking) */
if (UVM_ET_ISNEEDSCOPY(entry)) {
amap_copy(srcmap, entry,
AMAP_COPY_NOWAIT|AMAP_COPY_NOMERGE, start, end);
if (UVM_ET_ISNEEDSCOPY(entry)) { /* failed? */
error = ENOMEM;
goto bad;
}
/* amap_copy could clip (during chunk)! update fudge */
if (fudge) {
fudge = start - entry->start;
orig_fudge = fudge;
}
}
/* calculate the offset of this from "start" */
oldoffset = (entry->start + fudge) - start;
/* allocate a new map entry */
newentry = uvm_mapent_alloc(dstmap, 0);
if (newentry == NULL) {
error = ENOMEM;
goto bad;
}
/* set up new map entry */
newentry->next = NULL;
newentry->prev = endchain;
newentry->start = dstaddr + oldoffset;
newentry->end =
newentry->start + (entry->end - (entry->start + fudge));
if (newentry->end > newend || newentry->end < newentry->start)
newentry->end = newend;
newentry->object.uvm_obj = entry->object.uvm_obj;
if (newentry->object.uvm_obj) {
if (newentry->object.uvm_obj->pgops->pgo_reference)
newentry->object.uvm_obj->pgops->
pgo_reference(newentry->object.uvm_obj);
newentry->offset = entry->offset + fudge;
} else {
newentry->offset = 0;
}
newentry->etype = entry->etype;
if (flags & UVM_EXTRACT_PROT_ALL) {
newentry->protection = newentry->max_protection =
UVM_PROT_ALL;
} else {
newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ?
entry->max_protection : entry->protection;
newentry->max_protection = entry->max_protection;
}
newentry->inheritance = entry->inheritance;
newentry->wired_count = 0;
newentry->aref.ar_amap = entry->aref.ar_amap;
if (newentry->aref.ar_amap) {
newentry->aref.ar_pageoff =
entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT);
uvm_map_reference_amap(newentry, AMAP_SHARED |
((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0));
} else {
newentry->aref.ar_pageoff = 0;
}
newentry->advice = entry->advice;
if ((flags & UVM_EXTRACT_QREF) != 0) {
newentry->flags |= UVM_MAP_NOMERGE;
}
/* now link it on the chain */
nchain++;
nsize += newentry->end - newentry->start;
if (endchain == NULL) {
chain = endchain = newentry;
} else {
endchain->next = newentry;
endchain = newentry;
}
/* end of 'while' loop! */
if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end &&
(entry->next == &srcmap->header ||
entry->next->start != entry->end)) {
error = EINVAL;
goto bad;
}
entry = entry->next;
fudge = 0;
}
/*
* step 4: close off chain (in format expected by uvm_map_replace)
*/
if (chain)
chain->prev = endchain;
/*
* step 5: attempt to lock the dest map so we can pmap_copy.
* note usage of copy_ok:
* 1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5)
* 0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7
*/
if (srcmap == dstmap || vm_map_lock_try(dstmap) == true) {
copy_ok = 1;
if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
nchain, nsize, &resentry)) {
if (srcmap != dstmap)
vm_map_unlock(dstmap);
error = EIO;
goto bad;
}
} else {
copy_ok = 0;
/* replace deferred until step 7 */
}
/*
* step 6: traverse the srcmap a second time to do the following:
* - if we got a lock on the dstmap do pmap_copy
* - if UVM_EXTRACT_REMOVE remove the entries
* we make use of orig_entry and orig_fudge (saved in step 2)
*/
if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) {
/* purge possible stale hints from srcmap */
if (flags & UVM_EXTRACT_REMOVE) {
SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev);
if (srcmap->first_free != &srcmap->header &&
srcmap->first_free->start >= start)
srcmap->first_free = orig_entry->prev;
}
entry = orig_entry;
fudge = orig_fudge;
deadentry = NULL; /* for UVM_EXTRACT_REMOVE */
while (entry->start < end && entry != &srcmap->header) {
if (copy_ok) {
oldoffset = (entry->start + fudge) - start;
elen = MIN(end, entry->end) -
(entry->start + fudge);
pmap_copy(dstmap->pmap, srcmap->pmap,
dstaddr + oldoffset, elen,
entry->start + fudge);
}
/* we advance "entry" in the following if statement */
if (flags & UVM_EXTRACT_REMOVE) {
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(entry, RW_WRITER);
#else
uvm_map_lock_entry(entry, RW_READER);
#endif
pmap_remove(srcmap->pmap, entry->start,
entry->end);
uvm_map_unlock_entry(entry);
oldentry = entry; /* save entry */
entry = entry->next; /* advance */
uvm_map_entry_unlink(srcmap, oldentry);
/* add to dead list */
oldentry->next = deadentry;
deadentry = oldentry;
} else {
entry = entry->next; /* advance */
}
/* end of 'while' loop */
fudge = 0;
}
pmap_update(srcmap->pmap);
/*
* unlock dstmap. we will dispose of deadentry in
* step 7 if needed
*/
if (copy_ok && srcmap != dstmap)
vm_map_unlock(dstmap);
} else {
deadentry = NULL;
}
/*
* step 7: we are done with the source map, unlock. if copy_ok
* is 0 then we have not replaced the dummy mapping in dstmap yet
* and we need to do so now.
*/
vm_map_unlock(srcmap);
if ((flags & UVM_EXTRACT_REMOVE) && deadentry)
uvm_unmap_detach(deadentry, 0); /* dispose of old entries */
/* now do the replacement if we didn't do it in step 5 */
if (copy_ok == 0) {
vm_map_lock(dstmap);
error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
nchain, nsize, &resentry);
vm_map_unlock(dstmap);
if (error == false) {
error = EIO;
goto bad2;
}
}
if (resentry != NULL)
uvm_mapent_free(resentry);
return (0);
/*
* bad: failure recovery
*/
bad:
vm_map_unlock(srcmap);
bad2: /* src already unlocked */
if (chain)
uvm_unmap_detach(chain,
(flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0);
if (resentry != NULL)
uvm_mapent_free(resentry);
if ((flags & UVM_EXTRACT_RESERVED) == 0) {
uvm_unmap(dstmap, dstaddr, dstaddr+len); /* ??? */
}
return (error);
}
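/*
 * Example (illustrative sketch; example_borrow_user_range() is hypothetical):
 * a temporary, kernel-side extraction using quick references, as the QREF
 * notes above require.  The source range must be page aligned, and the
 * borrowed range must later be torn down via the QREF path (unmap with the
 * kernel map locked and detach the dead entries with AMAP_REFALL).
 */
#if 0
static int
example_borrow_user_range(struct vm_map *umap, vaddr_t uva, vsize_t len,
vaddr_t *kvap)
{
	KASSERT((uva & PAGE_MASK) == 0);
	KASSERT((len & PAGE_MASK) == 0);
	return uvm_map_extract(umap, uva, len, kernel_map, kvap,
	    UVM_EXTRACT_QREF | UVM_EXTRACT_CONTIG | UVM_EXTRACT_FIXPROT);
}
#endif /* illustrative sketch */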
/* end of extraction functions */
/*
* uvm_map_submap: punch down part of a map into a submap
*
* => only the kernel_map is allowed to be submapped
* => the purpose of submapping is to break up the locking granularity
* of a larger map
* => the range specified must have been mapped previously with a uvm_map()
* call [with uobj==NULL] to create a blank map entry in the main map.
* [And it had better still be blank!]
* => maps which contain submaps should never be copied or forked.
* => to remove a submap, use uvm_unmap() on the main map
* and then uvm_map_deallocate() the submap.
* => main map must be unlocked.
* => submap must have been init'd and have a zero reference count.
* [need not be locked as we don't actually reference it]
*/
int
uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map *submap)
{
struct vm_map_entry *entry;
int error;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &entry)) {
UVM_MAP_CLIP_START(map, entry, start);
UVM_MAP_CLIP_END(map, entry, end); /* to be safe */
} else {
entry = NULL;
}
if (entry != NULL &&
entry->start == start && entry->end == end &&
entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
!UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
entry->etype |= UVM_ET_SUBMAP;
entry->object.sub_map = submap;
entry->offset = 0;
uvm_map_reference(submap);
error = 0;
} else {
error = EINVAL;
}
vm_map_unlock(map);
return error;
}
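/*
 * Example (illustrative sketch; example_make_submap() is hypothetical and
 * the pmap-sharing step is an assumption of this sketch): the order of
 * operations the comments above call for -- create a blank entry in the
 * parent with uvm_map() and uobj == NULL, initialize the submap with
 * uvm_map_setup(), then plug it in with uvm_map_submap().  In practice the
 * kernel normally reaches this code through uvm_km_suballoc().
 */
#if 0
static int
example_make_submap(struct vm_map *parent, vsize_t size, int mapflags,
struct vm_map *submap)
{
	vaddr_t base = vm_map_min(parent);
	vsize_t len = round_page(size);

	/* step 1: blank PROT_NONE entry in the parent map */
	if (uvm_map(parent, &base, len, NULL, 0, 0,
	    UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
	    UVM_ADV_RANDOM, UVM_FLAG_NOMERGE)) != 0)
		return ENOMEM;

	/* step 2: init the submap; it shares the parent's pmap (assumption) */
	uvm_map_setup(submap, base, base + len, mapflags);
	pmap_reference(vm_map_pmap(parent));
	submap->pmap = vm_map_pmap(parent);

	/* step 3: punch the blank entry down into the submap */
	return uvm_map_submap(parent, base, base + len, submap);
}
#endif /* illustrative sketch */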
/*
* uvm_map_protect_user: change map protection on behalf of the user.
* Enforces PAX settings as necessary.
*/
int
uvm_map_protect_user(struct lwp *l, vaddr_t start, vaddr_t end,
vm_prot_t new_prot)
{
int error;
if ((error = PAX_MPROTECT_VALIDATE(l, new_prot)))
return error;
return uvm_map_protect(&l->l_proc->p_vmspace->vm_map, start, end,
new_prot, false);
}
/*
* uvm_map_protect: change map protection
*
* => set_max means set max_protection.
* => map must be unlocked.
*/
#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
~VM_PROT_WRITE : VM_PROT_ALL)
int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t new_prot, bool set_max)
{
struct vm_map_entry *current, *entry;
int error = 0;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_prot=%#jx)",
(uintptr_t)map, start, end, new_prot);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &entry)) {
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = entry->next;
}
/*
* make a first pass to check for protection violations.
*/
current = entry;
while ((current != &map->header) && (current->start < end)) {
if (UVM_ET_ISSUBMAP(current)) {
error = EINVAL;
goto out;
}
if ((new_prot & current->max_protection) != new_prot) {
error = EACCES;
goto out;
}
/*
* Don't allow VM_PROT_EXECUTE to be set on entries that
* point to vnodes that are associated with a NOEXEC file
* system.
*/
if (UVM_ET_ISOBJ(current) &&
UVM_OBJ_IS_VNODE(current->object.uvm_obj)) {
struct vnode *vp =
(struct vnode *) current->object.uvm_obj;
if ((new_prot & VM_PROT_EXECUTE) != 0 &&
(vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
error = EACCES;
goto out;
}
}
current = current->next;
}
/* go back and fix up protections (no need to clip this time). */
current = entry;
while ((current != &map->header) && (current->start < end)) {
vm_prot_t old_prot;
UVM_MAP_CLIP_END(map, current, end);
old_prot = current->protection;
if (set_max)
current->protection =
(current->max_protection = new_prot) & old_prot;
else
current->protection = new_prot;
/*
* update physical map if necessary. worry about copy-on-write
* here -- CHECK THIS XXX
*/
if (current->protection != old_prot) {
/* update pmap! */
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(current, RW_WRITER);
#else
uvm_map_lock_entry(current, RW_READER);
#endif
pmap_protect(map->pmap, current->start, current->end,
current->protection & MASK(current));
uvm_map_unlock_entry(current);
/*
* If this entry points at a vnode, and the
* protection includes VM_PROT_EXECUTE, mark
* the vnode as VEXECMAP.
*/
if (UVM_ET_ISOBJ(current)) {
struct uvm_object *uobj =
current->object.uvm_obj;
if (UVM_OBJ_IS_VNODE(uobj) &&
(current->protection & VM_PROT_EXECUTE)) {
vn_markexec((struct vnode *) uobj);
}
}
}
/*
* If the map is configured to lock any future mappings,
* wire this entry now if the old protection was VM_PROT_NONE
* and the new protection is not VM_PROT_NONE.
*/
if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
VM_MAPENT_ISWIRED(current) == 0 &&
old_prot == VM_PROT_NONE &&
new_prot != VM_PROT_NONE) {
/*
* We must call pmap_update() here because the
* pmap_protect() call above might have removed some
* pmap entries and uvm_map_pageable() might create
* some new pmap entries that rely on the prior
* removals being completely finished.
*/
pmap_update(map->pmap);
if (uvm_map_pageable(map, current->start,
current->end, false,
UVM_LK_ENTER|UVM_LK_EXIT) != 0) {
/*
* If locking the entry fails, remember the
* error if it's the first one. Note we
* still continue setting the protection in
* the map, but will return the error
* condition regardless.
*
* XXX Ignore what the actual error is,
* XXX just call it a resource shortage
* XXX so that it doesn't get confused
* XXX what uvm_map_protect() itself would
* XXX normally return.
*/
error = ENOMEM;
}
}
current = current->next;
}
pmap_update(map->pmap);
out:
vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done, error=%jd",error,0,0,0);
return error;
}
#undef MASK
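/*
 * Example (illustrative sketch; example_make_readonly() is hypothetical):
 * a caller acting for a user process goes through uvm_map_protect_user()
 * so the PAX mprotect checks apply; a kernel-internal caller may use
 * uvm_map_protect() directly on an unlocked map.
 */
#if 0
static int
example_make_readonly(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	/* change the current protection only; max_protection is untouched */
	return uvm_map_protect(map, start, end, VM_PROT_READ, false);
}
#endif /* illustrative sketch */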
/*
* uvm_map_inherit: set inheritance code for range of addrs in map.
*
* => map must be unlocked
* => note that the inherit code is used during a "fork". see fork
* code for details.
*/
int
uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_inherit_t new_inheritance)
{
struct vm_map_entry *entry, *temp_entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_inh=%#jx)",
(uintptr_t)map, start, end, new_inheritance);
switch (new_inheritance) {
case MAP_INHERIT_NONE:
case MAP_INHERIT_COPY:
case MAP_INHERIT_SHARE:
case MAP_INHERIT_ZERO:
break;
default:
UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
return EINVAL;
}
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &temp_entry)) {
entry = temp_entry;
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = temp_entry->next;
}
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
entry->inheritance = new_inheritance;
entry = entry->next;
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
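/*
 * Example (illustrative sketch; example_fork_zero() is hypothetical):
 * minherit(2)-style use -- after this call a future fork gives the child
 * a zero-filled mapping for the range instead of a shared or copied one.
 */
#if 0
static int
example_fork_zero(struct proc *p, vaddr_t start, vaddr_t end)
{
	return uvm_map_inherit(&p->p_vmspace->vm_map, start, end,
	    MAP_INHERIT_ZERO);
}
#endif /* illustrative sketch */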
/*
* uvm_map_advice: set advice code for range of addrs in map.
*
* => map must be unlocked
*/
int
uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
{
struct vm_map_entry *entry, *temp_entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_adv=%#jx)",
(uintptr_t)map, start, end, new_advice);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &temp_entry)) {
entry = temp_entry;
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = temp_entry->next;
}
/*
* XXXJRT: disallow holes?
*/
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
switch (new_advice) {
case MADV_NORMAL:
case MADV_RANDOM:
case MADV_SEQUENTIAL:
/* nothing special here */
break;
default:
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
return EINVAL;
}
entry->advice = new_advice;
entry = entry->next;
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
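/*
 * Example (illustrative sketch; example_advise_random() is hypothetical):
 * madvise(2)-style use, telling UVM that the range will be accessed in a
 * random pattern so sequential read-ahead is not worthwhile.
 */
#if 0
static int
example_advise_random(struct proc *p, vaddr_t start, vaddr_t end)
{
	return uvm_map_advice(&p->p_vmspace->vm_map, start, end, MADV_RANDOM);
}
#endif /* illustrative sketch */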
/*
* uvm_map_willneed: apply MADV_WILLNEED
*/
int
uvm_map_willneed(struct vm_map *map, vaddr_t start, vaddr_t end)
{
struct vm_map_entry *entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx)",
(uintptr_t)map, start, end, 0);
vm_map_lock_read(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!uvm_map_lookup_entry(map, start, &entry)) {
entry = entry->next;
}
while (entry->start < end) {
struct vm_amap * const amap = entry->aref.ar_amap;
struct uvm_object * const uobj = entry->object.uvm_obj;
KASSERT(entry != &map->header);
KASSERT(start < entry->end);
/*
* For now, we handle only the easy but commonly-requested case.
* ie. start prefetching of backing uobj pages.
*
* XXX It might be useful to pmap_enter() the already-in-core
* pages by inventing a "weak" mode for uvm_fault() which would
* only do the PGO_LOCKED pgo_get().
*/
if (UVM_ET_ISOBJ(entry) && amap == NULL && uobj != NULL) {
off_t offset;
off_t size;
offset = entry->offset;
if (start < entry->start) {
offset += entry->start - start;
}
size = entry->offset + (entry->end - entry->start);
if (entry->end < end) {
size -= end - entry->end;
}
uvm_readahead(uobj, offset, size);
}
entry = entry->next;
}
vm_map_unlock_read(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
/*
* uvm_map_pageable: sets the pageability of a range in a map.
*
* => wires map entries. should not be used for transient page locking.
* for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()).
* => regions specified as not pageable require lock-down (wired) memory
* and page tables.
* => map must never be read-locked
* => if islocked is true, map is already write-locked
* => we always unlock the map, since we must downgrade to a read-lock
* to call uvm_fault_wire()
* => XXXCDC: check this and try and clean it up.
*/
int
uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
bool new_pageable, int lockflags)
{
struct vm_map_entry *entry, *start_entry, *failed_entry;
int rv;
#ifdef DIAGNOSTIC
u_int timestamp_save;
#endif
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_pageable=%ju)",
(uintptr_t)map, start, end, new_pageable);
KASSERT(map->flags & VM_MAP_PAGEABLE);
if ((lockflags & UVM_LK_ENTER) == 0)
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
/*
* only one pageability change may take place at one time, since
* uvm_fault_wire assumes it will be called only once for each
* wiring/unwiring. therefore, we have to make sure we're actually
* changing the pageability for the entire region. we do so before
* making any changes.
*/
if (uvm_map_lookup_entry(map, start, &start_entry) == false) {
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (fault)",0,0,0,0);
return EFAULT;
}
entry = start_entry;
if (start == end) { /* nothing required */
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (nothing)",0,0,0,0);
return 0;
}
/*
* handle wiring and unwiring separately.
*/
if (new_pageable) { /* unwire */
UVM_MAP_CLIP_START(map, entry, start);
/*
* unwiring. first ensure that the range to be unwired is
* really wired down and that there are no holes.
*/
while ((entry != &map->header) && (entry->start < end)) {
if (entry->wired_count == 0 ||
(entry->end < end &&
(entry->next == &map->header ||
entry->next->start > entry->end))) {
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done (INVAL)",0,0,0,0);
return EINVAL;
}
entry = entry->next;
}
/*
* POSIX 1003.1b - a single munlock call unlocks a region,
* regardless of the number of mlock calls made on that
* region.
*/
entry = start_entry;
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
if (VM_MAPENT_ISWIRED(entry))
uvm_map_entry_unwire(map, entry);
entry = entry->next;
}
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
return 0;
}
/*
* wire case: in two passes [XXXCDC: ugly block of code here]
*
* 1: holding the write lock, we create any anonymous maps that need
* to be created. then we clip each map entry to the region to
* be wired and increment its wiring count.
*
* 2: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the pages for any newly wired area (wired_count == 1).
*
* downgrading to a read lock for uvm_fault_wire avoids a possible
* deadlock with another thread that may have faulted on one of
* the pages to be wired (it would mark the page busy, blocking
* us, then in turn block on the map lock that we hold). because
* of problems in the recursive lock package, we cannot upgrade
* to a write lock in vm_map_lookup. thus, any actions that
* require the write lock must be done beforehand. because we
* keep the read lock on the map, the copy-on-write status of the
* entries we modify here cannot change.
*/
while ((entry != &map->header) && (entry->start < end)) {
if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
/*
* perform actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map
* for a copy-on-write region, or an anonymous map
* for a zero-fill region. (XXXCDC: submap case
* ok?)
*/
if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */
if (UVM_ET_ISNEEDSCOPY(entry) &&
((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL))) {
amap_copy(map, entry, 0, start, end);
/* XXXCDC: wait OK? */
}
}
}
UVM_MAP_CLIP_START(map, entry, start);
UVM_MAP_CLIP_END(map, entry, end);
entry->wired_count++;
/*
* Check for holes
*/
if (entry->protection == VM_PROT_NONE ||
(entry->end < end &&
(entry->next == &map->header ||
entry->next->start > entry->end))) {
/*
* found one. amap creation actions do not need to
* be undone, but the wired counts need to be restored.
*/
while (entry != &map->header && entry->end > start) {
entry->wired_count--;
entry = entry->prev;
}
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0);
return EINVAL;
}
entry = entry->next;
}
/*
* Pass 2.
*/
#ifdef DIAGNOSTIC
timestamp_save = map->timestamp;
#endif
vm_map_busy(map);
vm_map_unlock(map);
rv = 0;
entry = start_entry;
while (entry != &map->header && entry->start < end) {
if (entry->wired_count == 1) {
rv = uvm_fault_wire(map, entry->start, entry->end,
entry->max_protection, 1);
if (rv) {
/*
* wiring failed. break out of the loop.
* we'll clean up the map below, once we
* have a write lock again.
*/
break;
}
}
entry = entry->next;
}
if (rv) { /* failed? */
/*
* Get back to an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
#ifdef DIAGNOSTIC
if (timestamp_save + 1 != map->timestamp)
panic("uvm_map_pageable: stale map");
#endif
/*
* first drop the wiring count on all the entries
* which haven't actually been wired yet.
*/
failed_entry = entry;
while (entry != &map->header && entry->start < end) {
entry->wired_count--;
entry = entry->next;
}
/*
* now, unwire all the entries that were successfully
* wired above.
*/
entry = start_entry;
while (entry != failed_entry) {
entry->wired_count--;
if (VM_MAPENT_ISWIRED(entry) == 0)
uvm_map_entry_unwire(map, entry);
entry = entry->next;
}
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done (RV=%jd)", rv,0,0,0);
return (rv);
}
if ((lockflags & UVM_LK_EXIT) == 0) {
vm_map_unbusy(map);
} else {
/*
* Get back to an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
}
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
return 0;
}
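/*
 * Example (illustrative sketch; example_wire_range() is hypothetical):
 * mlock(2)-style wiring of a page-aligned range in a pageable map.
 * Passing new_pageable == true instead would unwire the same range.
 */
#if 0
static int
example_wire_range(struct proc *p, vaddr_t start, vaddr_t end)
{
	return uvm_map_pageable(&p->p_vmspace->vm_map, start, end,
	    false, 0);
}
#endif /* illustrative sketch */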
/*
* uvm_map_pageable_all: special case of uvm_map_pageable - affects
* all mapped regions.
*
* => map must not be locked.
* => if no flags are specified, all regions are unwired.
* => XXXJRT: has some of the same problems as uvm_map_pageable() above.
*/
int
uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
{
struct vm_map_entry *entry, *failed_entry;
vsize_t size;
int rv;
#ifdef DIAGNOSTIC
u_int timestamp_save;
#endif
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,flags=%#jx)", (uintptr_t)map, flags,
0, 0);
KASSERT(map->flags & VM_MAP_PAGEABLE);
vm_map_lock(map);
/*
* handle wiring and unwiring separately.
*/
if (flags == 0) { /* unwire */
/*
* POSIX 1003.1b -- munlockall unlocks all regions,
* regardless of how many times mlockall has been called.
*/
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (VM_MAPENT_ISWIRED(entry))
uvm_map_entry_unwire(map, entry);
}
map->flags &= ~VM_MAP_WIREFUTURE;
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
return 0;
}
if (flags & MCL_FUTURE) {
/*
* must wire all future mappings; remember this.
*/
map->flags |= VM_MAP_WIREFUTURE;
}
if ((flags & MCL_CURRENT) == 0) {
/*
* no more work to do!
*/
UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0);
vm_map_unlock(map);
return 0;
}
/*
* wire case: in three passes [XXXCDC: ugly block of code here]
*
* 1: holding the write lock, count all pages mapped by non-wired
* entries. if this would cause us to go over our limit, we fail.
*
* 2: still holding the write lock, we create any anonymous maps that
* need to be created. then we increment each entry's wiring count.
*
* 3: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the pages for any newly wired area (wired_count == 1).
*
* downgrading to a read lock for uvm_fault_wire avoids a possible
* deadlock with another thread that may have faulted on one of
* the pages to be wired (it would mark the page busy, blocking
* us, then in turn block on the map lock that we hold). because
* of problems in the recursive lock package, we cannot upgrade
* to a write lock in vm_map_lookup. thus, any actions that
* require the write lock must be done beforehand. because we
* keep the read lock on the map, the copy-on-write status of the
* entries we modify here cannot change.
*/
for (size = 0, entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->protection != VM_PROT_NONE &&
VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
size += entry->end - entry->start;
}
}
if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
vm_map_unlock(map);
return ENOMEM;
}
if (limit != 0 &&
(size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) {
vm_map_unlock(map);
return ENOMEM;
}
/*
* Pass 2.
*/
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
/*
* perform actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map
* for a copy-on-write region, or an anonymous map
* for a zero-fill region. (XXXCDC: submap case
* ok?)
*/
if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */
if (UVM_ET_ISNEEDSCOPY(entry) &&
((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL))) {
amap_copy(map, entry, 0, entry->start,
entry->end);
/* XXXCDC: wait OK? */
}
}
}
entry->wired_count++;
}
/*
* Pass 3.
*/
#ifdef DIAGNOSTIC
timestamp_save = map->timestamp;
#endif
vm_map_busy(map);
vm_map_unlock(map);
rv = 0;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->wired_count == 1) {
rv = uvm_fault_wire(map, entry->start, entry->end,
entry->max_protection, 1);
if (rv) {
/*
* wiring failed. break out of the loop.
* we'll clean up the map below, once we
* have a write lock again.
*/
break;
}
}
}
if (rv) {
/*
* Get back an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
#ifdef DIAGNOSTIC
if (timestamp_save + 1 != map->timestamp)
panic("uvm_map_pageable_all: stale map");
#endif
/*
* first drop the wiring count on all the entries
* which haven't actually been wired yet.
*
* Skip VM_PROT_NONE entries like we did above.
*/
failed_entry = entry;
for (/* nothing */; entry != &map->header;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
entry->wired_count--;
}
/*
* now, unwire all the entries that were successfully
* wired above.
*
* Skip VM_PROT_NONE entries like we did above.
*/
for (entry = map->header.next; entry != failed_entry;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
entry->wired_count--;
if (VM_MAPENT_ISWIRED(entry))
uvm_map_entry_unwire(map, entry);
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (RV=%jd)", rv,0,0,0);
return (rv);
}
vm_map_unbusy(map);
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
return 0;
}
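/*
 * Example (illustrative sketch; example_wire_all() is hypothetical):
 * mlockall(2)-style use -- wire everything currently mapped and request
 * that future mappings be wired too, subject to a caller-supplied limit.
 */
#if 0
static int
example_wire_all(struct proc *p, vsize_t limit)
{
	return uvm_map_pageable_all(&p->p_vmspace->vm_map,
	    MCL_CURRENT | MCL_FUTURE, limit);
}
#endif /* illustrative sketch */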
/*
* uvm_map_clean: clean out a map range
*
* => valid flags:
* if (flags & PGO_CLEANIT): dirty pages are cleaned first
* if (flags & PGO_SYNCIO): dirty pages are written synchronously
* if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
* if (flags & PGO_FREE): any cached pages are freed after clean
* => returns an error if any part of the specified range isn't mapped
* => never a need to flush amap layer since the anonymous memory has
* no permanent home, but may deactivate pages there
* => called from sys_msync() and sys_madvise()
* => caller must not have map locked
*/
int
uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{
struct vm_map_entry *current, *entry;
struct uvm_object *uobj;
struct vm_amap *amap;
struct vm_anon *anon;
struct vm_page *pg;
vaddr_t offset;
vsize_t size;
voff_t uoff;
int error, refs;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,flags=%#jx)",
(uintptr_t)map, start, end, flags);
KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
(PGO_FREE|PGO_DEACTIVATE));
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!uvm_map_lookup_entry(map, start, &entry)) {
vm_map_unlock(map);
return EFAULT;
}
/*
* Make a first pass to check for holes and wiring problems.
*/
for (current = entry; current->start < end; current = current->next) {
if (UVM_ET_ISSUBMAP(current)) {
vm_map_unlock(map);
return EINVAL;
}
if ((flags & PGO_FREE) != 0 && VM_MAPENT_ISWIRED(entry)) {
vm_map_unlock(map);
return EBUSY;
}
if (end <= current->end) {
break;
}
if (current->end != current->next->start) {
vm_map_unlock(map);
return EFAULT;
}
}
vm_map_busy(map);
vm_map_unlock(map);
error = 0;
for (current = entry; start < end; current = current->next) {
amap = current->aref.ar_amap; /* upper layer */
uobj = current->object.uvm_obj; /* lower layer */
KASSERT(start >= current->start);
/*
* No amap cleaning necessary if:
*
* (1) There's no amap.
*
* (2) We're not deactivating or freeing pages.
*/
if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
goto flush_object;
offset = start - current->start;
size = MIN(end, current->end) - start;
amap_lock(amap, RW_WRITER);
for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) {
anon = amap_lookup(&current->aref, offset);
if (anon == NULL)
continue;
KASSERT(anon->an_lock == amap->am_lock);
pg = anon->an_page;
if (pg == NULL) {
continue;
}
if (pg->flags & PG_BUSY) {
continue;
}
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
/*
* In these first 3 cases, we just deactivate the page.
*/
case PGO_CLEANIT|PGO_FREE:
case PGO_CLEANIT|PGO_DEACTIVATE:
case PGO_DEACTIVATE:
deactivate_it:
/*
* skip the page if it's loaned or wired,
* since it shouldn't be on a paging queue
* at all in these cases.
*/
if (pg->loan_count != 0 ||
pg->wire_count != 0) {
continue;
}
KASSERT(pg->uanon == anon);
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
continue;
case PGO_FREE:
/*
* If there are multiple references to
* the amap, just deactivate the page.
*/
if (amap_refs(amap) > 1)
goto deactivate_it;
/* skip the page if it's wired */
if (pg->wire_count != 0) {
continue;
}
amap_unadd(&current->aref, offset);
refs = --anon->an_ref;
if (refs == 0) {
uvm_anfree(anon);
}
continue;
}
}
amap_unlock(amap);
flush_object:
/*
* flush pages if we've got a valid backing object.
* note that we must always clean object pages before
* freeing them since otherwise we could reveal stale
* data from files.
*/
uoff = current->offset + (start - current->start);
size = MIN(end, current->end) - start;
if (uobj != NULL) {
rw_enter(uobj->vmobjlock, RW_WRITER);
if (uobj->pgops->pgo_put != NULL)
error = (uobj->pgops->pgo_put)(uobj, uoff,
uoff + size, flags | PGO_CLEANIT);
else
error = 0;
}
start += size;
}
vm_map_unbusy(map);
return error;
}
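/*
 * Example (illustrative sketch; example_sync_range() is hypothetical):
 * msync(2)-style use -- synchronously write back any dirty object pages
 * in the range without freeing or deactivating them.
 */
#if 0
static int
example_sync_range(struct proc *p, vaddr_t start, vaddr_t end)
{
	return uvm_map_clean(&p->p_vmspace->vm_map, start, end,
	    PGO_CLEANIT | PGO_SYNCIO);
}
#endif /* illustrative sketch */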
/*
* uvm_map_checkprot: check protection in map
*
* => must allow specified protection in a fully allocated region.
* => map must be read or write locked by caller.
*/
bool
uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t protection)
{
struct vm_map_entry *entry;
struct vm_map_entry *tmp_entry;
if (!uvm_map_lookup_entry(map, start, &tmp_entry)) {
return (false);
}
entry = tmp_entry;
while (start < end) {
if (entry == &map->header) {
return (false);
}
/*
* no holes allowed
*/
if (start < entry->start) {
return (false);
}
/*
* check protection associated with entry
*/
if ((entry->protection & protection) != protection) {
return (false);
}
start = entry->end;
entry = entry->next;
}
return (true);
}
/*
* uvmspace_alloc: allocate a vmspace structure.
*
* - structure includes vm_map and pmap
* - XXX: no locking on this structure
* - refcnt set to 1, rest must be init'd by caller
*/
struct vmspace *
uvmspace_alloc(vaddr_t vmin, vaddr_t vmax, bool topdown)
{
struct vmspace *vm;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
vm = kmem_alloc(sizeof(*vm), KM_SLEEP);
uvmspace_init(vm, NULL, vmin, vmax, topdown);
UVMHIST_LOG(maphist,"<- done (vm=%#jx)", (uintptr_t)vm, 0, 0, 0);
return (vm);
}
/*
* uvmspace_init: initialize a vmspace structure.
*
* - XXX: no locking on this structure
* - refcnt set to 1, rest must be init'd by caller
*/
void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin,
vaddr_t vmax, bool topdown)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(vm=%#jx, pmap=%#jx, vmin=%#jx, vmax=%#jx",
(uintptr_t)vm, (uintptr_t)pmap, vmin, vmax);
UVMHIST_LOG(maphist, " topdown=%ju)", topdown, 0, 0, 0);
memset(vm, 0, sizeof(*vm));
uvm_map_setup(&vm->vm_map, vmin, vmax, VM_MAP_PAGEABLE
| (topdown ? VM_MAP_TOPDOWN : 0)
);
if (pmap)
pmap_reference(pmap);
else
pmap = pmap_create();
vm->vm_map.pmap = pmap;
vm->vm_refcnt = 1;
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
}
/*
* uvmspace_share: share a vmspace between two processes
*
* - used for vfork, threads(?)
*/
void
uvmspace_share(struct proc *p1, struct proc *p2)
{
uvmspace_addref(p1->p_vmspace);
p2->p_vmspace = p1->p_vmspace;
}
#if 0
/*
* uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace
*
* - XXX: no locking on vmspace
*/
void
uvmspace_unshare(struct lwp *l)
{
struct proc *p = l->l_proc;
struct vmspace *nvm, *ovm = p->p_vmspace;
if (ovm->vm_refcnt == 1)
/* nothing to do: vmspace isn't shared in the first place */
return;
/* make a new vmspace, still holding old one */
nvm = uvmspace_fork(ovm);
kpreempt_disable();
pmap_deactivate(l); /* unbind old vmspace */
p->p_vmspace = nvm;
pmap_activate(l); /* switch to new vmspace */
kpreempt_enable();
uvmspace_free(ovm); /* drop reference to old vmspace */
}
#endif
/*
* uvmspace_spawn: a new process has been spawned and needs a vmspace
*/
void
uvmspace_spawn(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
{
struct proc *p = l->l_proc;
struct vmspace *nvm;
#ifdef __HAVE_CPU_VMSPACE_EXEC
cpu_vmspace_exec(l, start, end);
#endif
nvm = uvmspace_alloc(start, end, topdown);
kpreempt_disable();
p->p_vmspace = nvm;
pmap_activate(l);
kpreempt_enable();
}
/*
* uvmspace_exec: the process wants to exec a new program
*/
void
uvmspace_exec(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
{
struct proc *p = l->l_proc;
struct vmspace *nvm, *ovm = p->p_vmspace;
struct vm_map *map;
int flags;
KASSERT(ovm != NULL);
#ifdef __HAVE_CPU_VMSPACE_EXEC
cpu_vmspace_exec(l, start, end);
#endif
map = &ovm->vm_map;
/*
* see if more than one process is using this vmspace...
*/
if (ovm->vm_refcnt == 1
&& topdown == ((ovm->vm_map.flags & VM_MAP_TOPDOWN) != 0)) {
/*
* if p is the only process using its vmspace then we can safely
* recycle that vmspace for the program that is being exec'd.
* But only if TOPDOWN matches the requested value for the new
* vm space!
*/
/*
* SYSV SHM semantics require us to kill all segments on an exec
*/
if (uvm_shmexit && ovm->vm_shm)
(*uvm_shmexit)(ovm);
/*
* POSIX 1003.1b -- "lock future mappings" is revoked
* when a process execs another program image.
*/
map->flags &= ~VM_MAP_WIREFUTURE;
/*
* now unmap the old program.
*
* XXX set VM_MAP_DYING for the duration, so pmap_update()
* is not called until the pmap has been totally cleared out
* after pmap_remove_all(), or it can confuse some pmap
* implementations. it would be nice to handle this by
* deferring the pmap_update() while it is known the address
* space is not visible to any user LWP other than curlwp,
* but there isn't an elegant way of inferring that right
* now.
*/
flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
map->flags |= VM_MAP_DYING;
uvm_unmap1(map, vm_map_min(map), vm_map_max(map), flags);
map->flags &= ~VM_MAP_DYING;
pmap_update(map->pmap);
KASSERT(map->header.prev == &map->header);
KASSERT(map->nentries == 0);
/*
* resize the map
*/
vm_map_setmin(map, start);
vm_map_setmax(map, end);
} else {
/*
* p's vmspace is being shared, so we can't reuse it for p since
* it is still being used for others. allocate a new vmspace
* for p
*/
nvm = uvmspace_alloc(start, end, topdown);
/*
* install new vmspace and drop our ref to the old one.
*/
kpreempt_disable();
pmap_deactivate(l);
p->p_vmspace = nvm;
pmap_activate(l);
kpreempt_enable();
uvmspace_free(ovm);
}
}
/*
* uvmspace_addref: add a reference to a vmspace.
*/
void
uvmspace_addref(struct vmspace *vm)
{
KASSERT((vm->vm_map.flags & VM_MAP_DYING) == 0);
KASSERT(vm->vm_refcnt > 0);
atomic_inc_uint(&vm->vm_refcnt);
}
/*
* uvmspace_free: free a vmspace data structure
*/
void
uvmspace_free(struct vmspace *vm)
{
struct vm_map_entry *dead_entries;
struct vm_map *map = &vm->vm_map;
int flags;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(vm=%#jx) ref=%jd", (uintptr_t)vm,
vm->vm_refcnt, 0, 0);
membar_release();
if (atomic_dec_uint_nv(&vm->vm_refcnt) > 0)
return;
membar_acquire();
/*
* at this point, there should be no other references to the map.
* delete all of the mappings, then destroy the pmap.
*/
map->flags |= VM_MAP_DYING;
flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
/* Get rid of any SYSV shared memory segments. */
if (uvm_shmexit && vm->vm_shm != NULL)
(*uvm_shmexit)(vm);
if (map->nentries) {
uvm_unmap_remove(map, vm_map_min(map), vm_map_max(map),
&dead_entries, flags);
if (dead_entries != NULL)
uvm_unmap_detach(dead_entries, 0);
}
KASSERT(map->nentries == 0);
KASSERT(map->size == 0);
mutex_destroy(&map->misc_lock);
rw_destroy(&map->lock);
cv_destroy(&map->cv);
pmap_destroy(map->pmap);
kmem_free(vm, sizeof(*vm));
}
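/*
 * Example (illustrative sketch; example_vmspace_lifecycle() is hypothetical):
 * a vmspace starts with a reference count of one; sharers take extra
 * references and the map, pmap and structure are destroyed only when the
 * last reference is dropped.
 */
#if 0
static void
example_vmspace_lifecycle(vaddr_t vmin, vaddr_t vmax)
{
	struct vmspace *vm = uvmspace_alloc(vmin, vmax, true);

	uvmspace_addref(vm);	/* a second user of the address space */
	uvmspace_free(vm);	/* refcnt 2 -> 1, vmspace survives */
	uvmspace_free(vm);	/* last reference: mappings, pmap, vm freed */
}
#endif /* illustrative sketch */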
static struct vm_map_entry *
uvm_mapent_clone(struct vm_map *new_map, struct vm_map_entry *old_entry,
int flags)
{
struct vm_map_entry *new_entry;
new_entry = uvm_mapent_alloc(new_map, 0);
/* old_entry -> new_entry */
uvm_mapent_copy(old_entry, new_entry);
/* new pmap has nothing wired in it */
new_entry->wired_count = 0;
/*
* gain reference to object backing the map (can't
* be a submap, already checked this case).
*/
if (new_entry->aref.ar_amap)
uvm_map_reference_amap(new_entry, flags);
if (new_entry->object.uvm_obj &&
new_entry->object.uvm_obj->pgops->pgo_reference)
new_entry->object.uvm_obj->pgops->pgo_reference(
new_entry->object.uvm_obj);
/* insert entry at end of new_map's entry list */
uvm_map_entry_link(new_map, new_map->header.prev,
new_entry);
return new_entry;
}
/*
* share the mapping: this means we want the old and
* new entries to share amaps and backing objects.
*/
static void
uvm_mapent_forkshared(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
/*
* if the old_entry needs a new amap (due to prev fork)
* then we need to allocate it now so that we have
* something we own to share with the new_entry. [in
* other words, we need to clear needs_copy]
*/
if (UVM_ET_ISNEEDSCOPY(old_entry)) {
/* get our own amap, clears needs_copy */
amap_copy(old_map, old_entry, AMAP_COPY_NOCHUNK,
0, 0);
/* XXXCDC: WAITOK??? */
}
uvm_mapent_clone(new_map, old_entry, AMAP_SHARED);
}
static void
uvm_mapent_forkcopy(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
struct vm_map_entry *new_entry;
/*
* copy-on-write the mapping (using mmap's
* MAP_PRIVATE semantics)
*
* allocate new_entry, adjust reference counts.
* (note that new references are read-only).
*/
new_entry = uvm_mapent_clone(new_map, old_entry, 0);
new_entry->etype |=
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
/*
* the new entry will need an amap. it will either
* need to be copied from the old entry or created
* from scratch (if the old entry does not have an
* amap). can we defer this process until later
* (by setting "needs_copy") or do we need to copy
* the amap now?
*
* we must copy the amap now if any of the following
* conditions hold:
* 1. the old entry has an amap and that amap is
* being shared. this means that the old (parent)
* process is sharing the amap with another
* process. if we do not clear needs_copy here
* we will end up in a situation where both the
* parent and child process are referring to the
* same amap with "needs_copy" set. if the
* parent write-faults, the fault routine will
* clear "needs_copy" in the parent by allocating
* a new amap. this is wrong because the
* parent is supposed to be sharing the old amap
* and the new amap will break that.
*
* 2. if the old entry has an amap and a non-zero
* wire count then we are going to have to call
* amap_cow_now to avoid page faults in the
* parent process. since amap_cow_now requires
* "needs_copy" to be clear we might as well
* clear it here as well.
*
*/
if (old_entry->aref.ar_amap != NULL) {
if ((amap_flags(old_entry->aref.ar_amap) & AMAP_SHARED) != 0 ||
VM_MAPENT_ISWIRED(old_entry)) {
amap_copy(new_map, new_entry,
AMAP_COPY_NOCHUNK, 0, 0);
/* XXXCDC: M_WAITOK ... ok? */
}
}
/*
* if the parent's entry is wired down, then the
* parent process does not want page faults on
* access to that memory. this means that we
* cannot do copy-on-write because we can't write
* protect the old entry. in this case we
* resolve all copy-on-write faults now, using
* amap_cow_now. note that we have already
* allocated any needed amap (above).
*/
if (VM_MAPENT_ISWIRED(old_entry)) {
/*
* resolve all copy-on-write faults now
* (note that there is nothing to do if
* the old mapping does not have an amap).
*/
if (old_entry->aref.ar_amap)
amap_cow_now(new_map, new_entry);
} else {
/*
* setup mappings to trigger copy-on-write faults
* we must write-protect the parent if it has
* an amap and it is not already "needs_copy"...
* if it is already "needs_copy" then the parent
* has already been write-protected by a previous
* fork operation.
*/
if (old_entry->aref.ar_amap &&
!UVM_ET_ISNEEDSCOPY(old_entry)) {
if (old_entry->max_protection & VM_PROT_WRITE) {
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(old_entry, RW_WRITER);
#else
uvm_map_lock_entry(old_entry, RW_READER);
#endif
pmap_protect(old_map->pmap,
old_entry->start, old_entry->end,
old_entry->protection & ~VM_PROT_WRITE);
uvm_map_unlock_entry(old_entry);
}
old_entry->etype |= UVM_ET_NEEDSCOPY;
}
}
}
/*
* zero the mapping: the new entry will be zero initialized
*/
static void
uvm_mapent_forkzero(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
struct vm_map_entry *new_entry;
new_entry = uvm_mapent_clone(new_map, old_entry, 0);
new_entry->etype |=
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
if (new_entry->aref.ar_amap) {
uvm_map_unreference_amap(new_entry, 0);
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = NULL;
}
if (UVM_ET_ISOBJ(new_entry)) {
if (new_entry->object.uvm_obj->pgops->pgo_detach)
new_entry->object.uvm_obj->pgops->pgo_detach(
new_entry->object.uvm_obj);
new_entry->object.uvm_obj = NULL;
new_entry->offset = 0;
new_entry->etype &= ~UVM_ET_OBJ;
}
}
/*
* F O R K - m a i n e n t r y p o i n t
*/
/*
* uvmspace_fork: fork a process' main map
*
* => create a new vmspace for child process from parent.
* => parent's map must not be locked.
*/
struct vmspace *
uvmspace_fork(struct vmspace *vm1)
{
struct vmspace *vm2;
struct vm_map *old_map = &vm1->vm_map;
struct vm_map *new_map;
struct vm_map_entry *old_entry;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
vm_map_lock(old_map);
vm2 = uvmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
vm1->vm_map.flags & VM_MAP_TOPDOWN);
memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
(char *) (vm1 + 1) - (char *) &vm1->vm_startcopy);
new_map = &vm2->vm_map; /* XXX */
old_entry = old_map->header.next;
new_map->size = old_map->size;
/*
* go entry-by-entry
*/
while (old_entry != &old_map->header) {
/*
* first, some sanity checks on the old entry
*/
KASSERT(!UVM_ET_ISSUBMAP(old_entry));
KASSERT(UVM_ET_ISCOPYONWRITE(old_entry) ||
!UVM_ET_ISNEEDSCOPY(old_entry));
switch (old_entry->inheritance) {
case MAP_INHERIT_NONE:
/*
* drop the mapping, modify size
*/
new_map->size -= old_entry->end - old_entry->start;
break;
case MAP_INHERIT_SHARE:
uvm_mapent_forkshared(new_map, old_map, old_entry);
break;
case MAP_INHERIT_COPY:
uvm_mapent_forkcopy(new_map, old_map, old_entry);
break;
case MAP_INHERIT_ZERO:
uvm_mapent_forkzero(new_map, old_map, old_entry);
break;
default:
KASSERT(0);
break;
}
old_entry = old_entry->next;
}
pmap_update(old_map->pmap);
vm_map_unlock(old_map);
if (uvm_shmfork && vm1->vm_shm)
(*uvm_shmfork)(vm1, vm2);
#ifdef PMAP_FORK
pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap);
#endif
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
return (vm2);
}
/*
* uvm_mapent_trymerge: try to merge an entry with its neighbors.
*
* => called with map locked.
* => return non zero if successfully merged.
*/
int
uvm_mapent_trymerge(struct vm_map *map, struct vm_map_entry *entry, int flags)
{
struct uvm_object *uobj;
struct vm_map_entry *next;
struct vm_map_entry *prev;
vsize_t size;
int merged = 0;
bool copying;
int newetype;
if (entry->aref.ar_amap != NULL) {
return 0;
}
if ((entry->flags & UVM_MAP_NOMERGE) != 0) {
return 0;
}
uobj = entry->object.uvm_obj;
size = entry->end - entry->start;
copying = (flags & UVM_MERGE_COPYING) != 0;
newetype = copying ? (entry->etype & ~UVM_ET_NEEDSCOPY) : entry->etype;
next = entry->next;
if (next != &map->header &&
next->start == entry->end &&
((copying && next->aref.ar_amap != NULL &&
amap_refs(next->aref.ar_amap) == 1) ||
(!copying && next->aref.ar_amap == NULL)) &&
UVM_ET_ISCOMPATIBLE(next, newetype,
uobj, entry->flags, entry->protection,
entry->max_protection, entry->inheritance, entry->advice,
entry->wired_count) &&
(uobj == NULL || entry->offset + size == next->offset)) {
int error;
if (copying) {
error = amap_extend(next, size,
AMAP_EXTEND_NOWAIT|AMAP_EXTEND_BACKWARDS);
} else {
error = 0;
}
if (error == 0) {
if (uobj) {
if (uobj->pgops->pgo_detach) {
uobj->pgops->pgo_detach(uobj);
}
}
entry->end = next->end;
clear_hints(map, next);
uvm_map_entry_unlink(map, next);
if (copying) {
entry->aref = next->aref;
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
uvm_map_check(map, "trymerge forwardmerge");
uvm_mapent_free(next);
merged++;
}
}
prev = entry->prev;
if (prev != &map->header &&
prev->end == entry->start &&
((copying && !merged && prev->aref.ar_amap != NULL &&
amap_refs(prev->aref.ar_amap) == 1) ||
(!copying && prev->aref.ar_amap == NULL)) &&
UVM_ET_ISCOMPATIBLE(prev, newetype,
uobj, entry->flags, entry->protection,
entry->max_protection, entry->inheritance, entry->advice,
entry->wired_count) &&
(uobj == NULL ||
prev->offset + prev->end - prev->start == entry->offset)) {
int error;
if (copying) {
error = amap_extend(prev, size,
AMAP_EXTEND_NOWAIT|AMAP_EXTEND_FORWARDS);
} else {
error = 0;
}
if (error == 0) {
if (uobj) {
if (uobj->pgops->pgo_detach) {
uobj->pgops->pgo_detach(uobj);
}
entry->offset = prev->offset;
}
entry->start = prev->start;
clear_hints(map, prev);
uvm_map_entry_unlink(map, prev);
if (copying) {
entry->aref = prev->aref;
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
uvm_map_check(map, "trymerge backmerge");
uvm_mapent_free(prev);
merged++;
}
}
return merged;
}
/*
* uvm_map_setup: init map
*
* => map must not be in service yet.
*/
void
uvm_map_setup(struct vm_map *map, vaddr_t vmin, vaddr_t vmax, int flags)
{
rb_tree_init(&map->rb_tree, &uvm_map_tree_ops);
map->header.next = map->header.prev = &map->header;
map->nentries = 0;
map->size = 0;
map->ref_count = 1;
vm_map_setmin(map, vmin);
vm_map_setmax(map, vmax);
map->flags = flags;
map->first_free = &map->header;
map->hint = &map->header;
map->timestamp = 0;
map->busy = NULL;
rw_init(&map->lock);
cv_init(&map->cv, "vm_map");
mutex_init(&map->misc_lock, MUTEX_DRIVER, IPL_NONE);
}
/*
* U N M A P - m a i n e n t r y p o i n t
*/
/*
* uvm_unmap1: remove mappings from a vm_map (from "start" up to "stop")
*
* => caller must check alignment and size
* => map must be unlocked (we will lock it)
* => flags is UVM_FLAG_QUANTUM or 0.
*/
void
uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{
struct vm_map_entry *dead_entries;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (map=%#jx, start=%#jx, end=%#jx)",
(uintptr_t)map, start, end, 0);
KASSERTMSG(start < end,
"%s: map %p: start %#jx < end %#jx", __func__, map,
(uintmax_t)start, (uintmax_t)end);
if (map == kernel_map) {
LOCKDEBUG_MEM_CHECK((void *)start, end - start);
}
/*
* work now done by helper functions. wipe the pmap's and then
* detach from the dead entries...
*/
vm_map_lock(map);
uvm_unmap_remove(map, start, end, &dead_entries, flags);
vm_map_unlock(map);
if (dead_entries != NULL)
uvm_unmap_detach(dead_entries, 0);
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
/*
* uvm_map_reference: add reference to a map
*
* => map need not be locked
*/
void
uvm_map_reference(struct vm_map *map)
{
atomic_inc_uint(&map->ref_count);
}
void
uvm_map_lock_entry(struct vm_map_entry *entry, krw_t op)
{
if (entry->aref.ar_amap != NULL) {
amap_lock(entry->aref.ar_amap, op);
}
if (UVM_ET_ISOBJ(entry)) {
rw_enter(entry->object.uvm_obj->vmobjlock, op);
}
}
void
uvm_map_unlock_entry(struct vm_map_entry *entry)
{
if (UVM_ET_ISOBJ(entry)) {
rw_exit(entry->object.uvm_obj->vmobjlock);
}
if (entry->aref.ar_amap != NULL) {
amap_unlock(entry->aref.ar_amap);
}
}
#define UVM_VOADDR_TYPE_MASK 0x3UL
#define UVM_VOADDR_TYPE_UOBJ 0x1UL
#define UVM_VOADDR_TYPE_ANON 0x2UL
#define UVM_VOADDR_OBJECT_MASK ~UVM_VOADDR_TYPE_MASK
#define UVM_VOADDR_GET_TYPE(voa) \
((voa)->object & UVM_VOADDR_TYPE_MASK)
#define UVM_VOADDR_GET_OBJECT(voa) \
((voa)->object & UVM_VOADDR_OBJECT_MASK)
#define UVM_VOADDR_SET_OBJECT(voa, obj, type) \
do { \
KASSERT(((uintptr_t)(obj) & UVM_VOADDR_TYPE_MASK) == 0); \
(voa)->object = ((uintptr_t)(obj)) | (type); \
} while (/*CONSTCOND*/0)
#define UVM_VOADDR_GET_UOBJ(voa) \
((struct uvm_object *)UVM_VOADDR_GET_OBJECT(voa))
#define UVM_VOADDR_SET_UOBJ(voa, uobj) \
UVM_VOADDR_SET_OBJECT(voa, uobj, UVM_VOADDR_TYPE_UOBJ)
#define UVM_VOADDR_GET_ANON(voa) \
((struct vm_anon *)UVM_VOADDR_GET_OBJECT(voa))
#define UVM_VOADDR_SET_ANON(voa, anon) \
UVM_VOADDR_SET_OBJECT(voa, anon, UVM_VOADDR_TYPE_ANON)
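/*
 * Illustrative note (not from the original source): the macros above
 * steal the two low bits of the stored pointer to tag its type, relying
 * on uvm_object/vm_anon pointers being at least 4-byte aligned.  A
 * caller decodes a uvm_voaddr roughly as follows:
 *
 *	switch (UVM_VOADDR_GET_TYPE(voaddr)) {
 *	case UVM_VOADDR_TYPE_UOBJ:
 *		uobj = UVM_VOADDR_GET_UOBJ(voaddr);
 *		break;
 *	case UVM_VOADDR_TYPE_ANON:
 *		anon = UVM_VOADDR_GET_ANON(voaddr);
 *		break;
 *	}
 *
 * which is the same shape as the switch in uvm_voaddr_release() below.
 */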
/*
* uvm_voaddr_acquire: returns the virtual object address corresponding
* to the specified virtual address.
*
* => resolves COW so the true page identity is tracked.
*
* => acquires a reference on the page's owner (uvm_object or vm_anon)
*/
bool
uvm_voaddr_acquire(struct vm_map * const map, vaddr_t const va,
struct uvm_voaddr * const voaddr)
{
struct vm_map_entry *entry;
struct vm_anon *anon = NULL;
bool result = false;
bool exclusive = false;
void (*unlock_fn)(struct vm_map *);
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist,"(map=%#jx,va=%#jx)", (uintptr_t)map, va, 0, 0);
const vaddr_t start = trunc_page(va);
const vaddr_t end = round_page(va+1);
lookup_again:
if (__predict_false(exclusive)) {
vm_map_lock(map);
unlock_fn = vm_map_unlock;
} else {
vm_map_lock_read(map);
unlock_fn = vm_map_unlock_read;
}
if (__predict_false(!uvm_map_lookup_entry(map, start, &entry))) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (no entry)",0,0,0,0);
return false;
}
if (__predict_false(entry->protection == VM_PROT_NONE)) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (PROT_NONE)",0,0,0,0);
return false;
}
/*
* We have a fast path for the common case of "no COW resolution
* needed" whereby we have taken a read lock on the map and if
* we don't encounter any need to create a vm_anon then great!
* But if we do, we loop around again, instead taking an exclusive
* lock so that we can perform the fault.
*
* In the event that we have to resolve the fault, we do nearly the
* same work as uvm_map_pageable() does:
*
* 1: holding the write lock, we create any anonymous maps that need
* to be created. however, we do NOT need to clip the map entries
* in this case.
*
* 2: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the page (assuming the entry is not already wired). this
* is done because we need the vm_anon to be present.
*/
if (__predict_true(!VM_MAPENT_ISWIRED(entry))) {
bool need_fault = false;
/*
 * perform the actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map for
* a copy-on-write region, or an anonymous map for
* a zero-fill region.
*/
if (__predict_false(UVM_ET_ISSUBMAP(entry))) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (submap)",0,0,0,0);
return false;
}
if (__predict_false(UVM_ET_ISNEEDSCOPY(entry) &&
((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL)))) {
if (!exclusive) {
/* need to take the slow path */
KASSERT(unlock_fn == vm_map_unlock_read);
vm_map_unlock_read(map);
exclusive = true;
goto lookup_again;
}
need_fault = true;
amap_copy(map, entry, 0, start, end);
/* XXXCDC: wait OK? */
}
/*
* do a quick check to see if the fault has already
* been resolved to the upper layer.
*/
if (__predict_true(entry->aref.ar_amap != NULL &&
need_fault == false)) {
amap_lock(entry->aref.ar_amap, RW_WRITER);
anon = amap_lookup(&entry->aref, start - entry->start);
if (__predict_true(anon != NULL)) {
/* amap unlocked below */
goto found_anon;
}
amap_unlock(entry->aref.ar_amap);
need_fault = true;
}
/*
* we predict this test as false because if we reach
* this point, then we are likely dealing with a
* shared memory region backed by a uvm_object, in
* which case a fault to create the vm_anon is not
* necessary.
*/
if (__predict_false(need_fault)) {
if (exclusive) {
vm_map_busy(map);
vm_map_unlock(map);
unlock_fn = vm_map_unbusy;
}
if (uvm_fault_wire(map, start, end,
entry->max_protection, 1)) {
/* wiring failed */
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (wire failed)",
0,0,0,0);
return false;
}
/*
* now that we have resolved the fault, we can unwire
* the page.
*/
if (exclusive) {
vm_map_lock(map);
vm_map_unbusy(map);
unlock_fn = vm_map_unlock;
}
uvm_fault_unwire_locked(map, start, end);
}
}
/* check the upper layer */
if (entry->aref.ar_amap) {
amap_lock(entry->aref.ar_amap, RW_WRITER);
anon = amap_lookup(&entry->aref, start - entry->start);
if (anon) {
found_anon: KASSERT(anon->an_lock == entry->aref.ar_amap->am_lock);
anon->an_ref++;
rw_obj_hold(anon->an_lock);
KASSERT(anon->an_ref != 0);
UVM_VOADDR_SET_ANON(voaddr, anon);
voaddr->offset = va & PAGE_MASK;
result = true;
}
amap_unlock(entry->aref.ar_amap);
}
/* check the lower layer */
if (!result && UVM_ET_ISOBJ(entry)) {
struct uvm_object *uobj = entry->object.uvm_obj;
KASSERT(uobj != NULL);
(*uobj->pgops->pgo_reference)(uobj);
UVM_VOADDR_SET_UOBJ(voaddr, uobj);
voaddr->offset = entry->offset + (va - entry->start);
result = true;
}
unlock_fn(map);
if (result) {
UVMHIST_LOG(maphist,
"<- done OK (type=%jd,owner=%#jx,offset=%#jx)",
UVM_VOADDR_GET_TYPE(voaddr),
UVM_VOADDR_GET_OBJECT(voaddr),
voaddr->offset, 0);
} else {
UVMHIST_LOG(maphist,"<- done (failed)",0,0,0,0);
}
return result;
}
/*
* uvm_voaddr_release: release the references held by the
 * virtual object address.
*/
void
uvm_voaddr_release(struct uvm_voaddr * const voaddr)
{
switch (UVM_VOADDR_GET_TYPE(voaddr)) {
case UVM_VOADDR_TYPE_UOBJ: {
struct uvm_object * const uobj = UVM_VOADDR_GET_UOBJ(voaddr);
KASSERT(uobj != NULL);
KASSERT(uobj->pgops->pgo_detach != NULL);
(*uobj->pgops->pgo_detach)(uobj);
break;
}
case UVM_VOADDR_TYPE_ANON: {
struct vm_anon * const anon = UVM_VOADDR_GET_ANON(voaddr);
krwlock_t *lock;
KASSERT(anon != NULL);
rw_enter((lock = anon->an_lock), RW_WRITER);
KASSERT(anon->an_ref > 0);
if (--anon->an_ref == 0) {
uvm_anfree(anon);
}
rw_exit(lock);
rw_obj_free(lock);
break;
}
default:
panic("uvm_voaddr_release: bad type");
}
memset(voaddr, 0, sizeof(*voaddr));
}
/*
* uvm_voaddr_compare: compare two uvm_voaddr objects.
*
* => memcmp() semantics
*/
int
uvm_voaddr_compare(const struct uvm_voaddr * const voaddr1,
const struct uvm_voaddr * const voaddr2)
{
const uintptr_t type1 = UVM_VOADDR_GET_TYPE(voaddr1);
const uintptr_t type2 = UVM_VOADDR_GET_TYPE(voaddr2);
KASSERT(type1 == UVM_VOADDR_TYPE_UOBJ ||
type1 == UVM_VOADDR_TYPE_ANON);
KASSERT(type2 == UVM_VOADDR_TYPE_UOBJ ||
type2 == UVM_VOADDR_TYPE_ANON);
if (type1 < type2)
return -1;
if (type1 > type2)
return 1;
const uintptr_t addr1 = UVM_VOADDR_GET_OBJECT(voaddr1);
const uintptr_t addr2 = UVM_VOADDR_GET_OBJECT(voaddr2);
if (addr1 < addr2)
return -1;
if (addr1 > addr2)
return 1;
if (voaddr1->offset < voaddr2->offset)
return -1;
if (voaddr1->offset > voaddr2->offset)
return 1;
return 0;
}
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_map_printit: actually prints the map
*/
void
uvm_map_printit(struct vm_map *map, bool full,
void (*pr)(const char *, ...))
{
struct vm_map_entry *entry;
(*pr)("MAP %p: [%#lx->%#lx]\n", map, vm_map_min(map),
vm_map_max(map));
(*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d, flags=%#x\n",
map->nentries, map->size, map->ref_count, map->timestamp,
map->flags);
(*pr)("\tpmap=%p(resident=%ld, wired=%ld)\n", map->pmap,
pmap_resident_count(map->pmap), pmap_wired_count(map->pmap));
if (!full)
return;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
(*pr)(" - %p: %#lx->%#lx: obj=%p/%#llx, amap=%p/%d\n",
entry, entry->start, entry->end, entry->object.uvm_obj,
(long long)entry->offset, entry->aref.ar_amap,
entry->aref.ar_pageoff);
(*pr)(
"\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
"wc=%d, adv=%d%s\n",
(entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
(entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
(entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
entry->protection, entry->max_protection,
entry->inheritance, entry->wired_count, entry->advice,
entry == map->first_free ? " (first_free)" : "");
}
}
void
uvm_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
struct vm_map *map;
for (map = kernel_map;;) {
struct vm_map_entry *entry;
if (!uvm_map_lookup_entry_bytree(map, (vaddr_t)addr, &entry)) {
break;
}
(*pr)("%p is %p+%zu from VMMAP %p\n",
(void *)addr, (void *)entry->start,
(size_t)(addr - (uintptr_t)entry->start), map);
if (!UVM_ET_ISSUBMAP(entry)) {
break;
}
map = entry->object.sub_map;
}
}
#endif /* DDB || DEBUGPRINT */
#ifndef __USER_VA0_IS_SAFE
static int
sysctl_user_va0_disable(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int t, error;
node = *rnode;
node.sysctl_data = &t;
t = user_va0_disable;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return (error);
if (!t && user_va0_disable &&
kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MAP_VA_ZERO, 0,
NULL, NULL, NULL))
return EPERM;
user_va0_disable = !!t;
return 0;
}
#endif
static int
fill_vmentry(struct lwp *l, struct proc *p, struct kinfo_vmentry *kve,
struct vm_map *m, struct vm_map_entry *e)
{
#ifndef _RUMPKERNEL
int error;
memset(kve, 0, sizeof(*kve));
KASSERT(e != NULL);
if (UVM_ET_ISOBJ(e)) {
struct uvm_object *uobj = e->object.uvm_obj;
KASSERT(uobj != NULL);
kve->kve_ref_count = uobj->uo_refs;
kve->kve_count = uobj->uo_npages;
if (UVM_OBJ_IS_VNODE(uobj)) {
struct vattr va;
struct vnode *vp = (struct vnode *)uobj;
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, l->l_cred);
VOP_UNLOCK(vp);
kve->kve_type = KVME_TYPE_VNODE;
if (error == 0) {
kve->kve_vn_size = vp->v_size;
kve->kve_vn_type = (int)vp->v_type;
kve->kve_vn_mode = va.va_mode;
kve->kve_vn_rdev = va.va_rdev;
kve->kve_vn_fileid = va.va_fileid;
kve->kve_vn_fsid = va.va_fsid;
error = vnode_to_path(kve->kve_path,
sizeof(kve->kve_path) / 2, vp, l, p);
}
} else if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
kve->kve_type = KVME_TYPE_KERN;
} else if (UVM_OBJ_IS_DEVICE(uobj)) {
kve->kve_type = KVME_TYPE_DEVICE;
} else if (UVM_OBJ_IS_AOBJ(uobj)) {
kve->kve_type = KVME_TYPE_ANON;
} else {
kve->kve_type = KVME_TYPE_OBJECT;
}
} else if (UVM_ET_ISSUBMAP(e)) {
struct vm_map *map = e->object.sub_map;
KASSERT(map != NULL);
kve->kve_ref_count = map->ref_count;
kve->kve_count = map->nentries;
kve->kve_type = KVME_TYPE_SUBMAP;
} else
kve->kve_type = KVME_TYPE_UNKNOWN;
kve->kve_start = e->start;
kve->kve_end = e->end;
kve->kve_offset = e->offset;
kve->kve_wired_count = e->wired_count;
kve->kve_inheritance = e->inheritance;
kve->kve_attributes = 0; /* unused */
kve->kve_advice = e->advice;
#define PROT(p) (((p) & VM_PROT_READ) ? KVME_PROT_READ : 0) | \
(((p) & VM_PROT_WRITE) ? KVME_PROT_WRITE : 0) | \
(((p) & VM_PROT_EXECUTE) ? KVME_PROT_EXEC : 0)
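	/*
	 * Illustrative example (not from the original source): PROT()
	 * only translates VM_PROT_* bits into the exported KVME_PROT_*
	 * bits, e.g.
	 *
	 *	PROT(VM_PROT_READ|VM_PROT_WRITE)
	 *	    == (KVME_PROT_READ|KVME_PROT_WRITE)
	 */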
kve->kve_protection = PROT(e->protection);
kve->kve_max_protection = PROT(e->max_protection);
kve->kve_flags |= (e->etype & UVM_ET_COPYONWRITE)
? KVME_FLAG_COW : 0;
kve->kve_flags |= (e->etype & UVM_ET_NEEDSCOPY)
? KVME_FLAG_NEEDS_COPY : 0;
kve->kve_flags |= (m->flags & VM_MAP_TOPDOWN)
? KVME_FLAG_GROWS_DOWN : KVME_FLAG_GROWS_UP;
kve->kve_flags |= (m->flags & VM_MAP_PAGEABLE)
? KVME_FLAG_PAGEABLE : 0;
#endif
return 0;
}
static int
fill_vmentries(struct lwp *l, pid_t pid, u_int elem_size, void *oldp,
size_t *oldlenp)
{
int error;
struct proc *p;
struct kinfo_vmentry *vme;
struct vmspace *vm;
struct vm_map *map;
struct vm_map_entry *entry;
char *dp;
size_t count, vmesize;
if (elem_size == 0 || elem_size > 2 * sizeof(*vme))
return EINVAL;
if (oldp) {
if (*oldlenp > 10UL * 1024UL * 1024UL)
return E2BIG;
count = *oldlenp / elem_size;
if (count == 0)
return ENOMEM;
vmesize = count * sizeof(*vme);
} else
vmesize = 0;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
vme = NULL;
count = 0;
if ((error = proc_vmspace_getref(p, &vm)) != 0)
goto out;
map = &vm->vm_map;
vm_map_lock_read(map);
dp = oldp;
if (oldp)
vme = kmem_alloc(vmesize, KM_SLEEP);
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (oldp && (dp - (char *)oldp) < vmesize) {
error = fill_vmentry(l, p, &vme[count], map, entry);
if (error)
goto out;
dp += elem_size;
}
count++;
}
vm_map_unlock_read(map);
uvmspace_free(vm);
out:
if (pid != -1)
mutex_exit(p->p_lock);
if (error == 0) {
const u_int esize = uimin(sizeof(*vme), elem_size);
dp = oldp;
for (size_t i = 0; i < count; i++) {
if (oldp && (dp - (char *)oldp) < vmesize) {
error = sysctl_copyout(l, &vme[i], dp, esize);
if (error)
break;
dp += elem_size;
} else
break;
}
count *= elem_size;
if (oldp != NULL && *oldlenp < count)
error = ENOSPC;
*oldlenp = count;
}
if (vme)
kmem_free(vme, vmesize);
return error;
}
static int
sysctl_vmproc(SYSCTLFN_ARGS)
{
int error;
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (namelen == 0)
return EINVAL;
switch (name[0]) {
case VM_PROC_MAP:
if (namelen != 3)
return EINVAL;
sysctl_unlock();
error = fill_vmentries(l, name[1], name[2], oldp, oldlenp);
sysctl_relock();
return error;
default:
return EINVAL;
}
}
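/*
 * Illustrative sketch (an assumption, not part of the original source):
 * from userland the VM_PROC_MAP node above is reached with a 5-element
 * MIB whose last two components carry the pid and the caller's element
 * size, roughly:
 *
 *	struct kinfo_vmentry kve[NENT];
 *	size_t len = sizeof(kve);
 *	int mib[5] = { CTL_VM, VM_PROC, VM_PROC_MAP, pid,
 *	    sizeof(struct kinfo_vmentry) };
 *
 *	if (sysctl(mib, 5, kve, &len, NULL, 0) == 0)
 *		nfound = len / sizeof(struct kinfo_vmentry);
 *
 * Passing the element size lets fill_vmentries() copy out no more than
 * the structure size the caller was compiled against.
 */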
SYSCTL_SETUP(sysctl_uvmmap_setup, "sysctl uvmmap setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "proc",
SYSCTL_DESCR("Process vm information"),
sysctl_vmproc, 0, NULL, 0,
CTL_VM, VM_PROC, CTL_EOL);
#ifndef __USER_VA0_IS_SAFE
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "user_va0_disable",
SYSCTL_DESCR("Disable VA 0"),
sysctl_user_va0_disable, 0, &user_va0_disable, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
#endif
}
/* $NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include "opt_ktrace.h"
#include "opt_ptrace.h"
#include "opt_dtrace.h"
#endif
/* XXX To get syscall prototypes. */
#define SYSVSHM
#define SYSVSEM
#define SYSVMSG
#include <sys/param.h>
#include <sys/module.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/systm.h>
#include <sys/xcall.h>
#include <sys/ktrace.h>
#include <sys/ptrace.h>
int
sys_nomodule(struct lwp *l, const void *v, register_t *retval)
{
#ifdef MODULAR
const struct sysent *sy;
const struct emul *em;
const struct sc_autoload *auto_list;
u_int code;
/*
* Restart the syscall if we interrupted a module unload that
* failed. Acquiring kernconfig_lock delays us until any unload
* has been completed or rolled back.
*/
kernconfig_lock();
sy = l->l_sysent;
if (sy->sy_call != sys_nomodule) {
kernconfig_unlock();
return ERESTART;
}
/*
* Try to autoload a module to satisfy the request. If it
* works, retry the request.
*/
em = l->l_proc->p_emul;
code = sy - em->e_sysent;
if ((auto_list = em->e_sc_autoload) != NULL)
for (; auto_list->al_code > 0; auto_list++) {
if (auto_list->al_code != code) {
continue;
}
if (module_autoload(auto_list->al_module,
MODULE_CLASS_ANY) != 0 ||
sy->sy_call == sys_nomodule) {
break;
}
kernconfig_unlock();
return ERESTART;
}
kernconfig_unlock();
#endif /* MODULAR */
return sys_nosys(l, v, retval);
}
int
syscall_establish(const struct emul *em, const struct syscall_package *sp)
{
struct sysent *sy;
int i;
KASSERT(kernconfig_is_held());
if (em == NULL) {
em = &emul_netbsd;
}
sy = em->e_sysent;
/*
* Ensure that all preconditions are valid, since this is
* an all or nothing deal. Once a system call is entered,
* it can become busy and we could be unable to remove it
* on error.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
if (sp[i].sp_code >= SYS_NSYSENT)
return EINVAL;
if (sy[sp[i].sp_code].sy_call != sys_nomodule &&
sy[sp[i].sp_code].sy_call != sys_nosys) {
#ifdef DIAGNOSTIC
printf("syscall %d is busy\n", sp[i].sp_code);
#endif
return EBUSY;
}
}
/* Everything looks good, patch them in. */
for (i = 0; sp[i].sp_call != NULL; i++) {
sy[sp[i].sp_code].sy_call = sp[i].sp_call;
}
return 0;
}
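/*
 * Illustrative usage sketch (an assumption, not part of the original
 * source): a module typically passes a NULL-terminated package array,
 * roughly
 *
 *	static const struct syscall_package mymod_syscalls[] = {
 *		{ SYS_mycall, 0, (sy_call_t *)sys_mycall },
 *		{ 0, 0, NULL },
 *	};
 *	error = syscall_establish(NULL, mymod_syscalls);
 *
 * where mymod_syscalls, SYS_mycall and sys_mycall are hypothetical
 * names; passing NULL for the emul selects emul_netbsd, as above.
 */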
int
syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
{
struct sysent *sy;
const uint32_t *sb;
lwp_t *l;
int i;
KASSERT(kernconfig_is_held());
if (em == NULL) {
em = &emul_netbsd;
}
sy = em->e_sysent;
sb = em->e_nomodbits;
/*
* First, patch the system calls to sys_nomodule or sys_nosys
* to gate further activity.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
sy[sp[i].sp_code].sy_call =
sb[sp[i].sp_code / 32] & (1 << (sp[i].sp_code % 32)) ?
sys_nomodule : sys_nosys;
}
/*
* Run a cross call to cycle through all CPUs. This does two
* things: lock activity provides a barrier and makes our update
* of sy_call visible to all CPUs, and upon return we can be sure
* that we see pertinent values of l_sysent posted by remote CPUs.
*/
xc_barrier(0);
/*
* Now it's safe to check l_sysent. Run through all LWPs and see
* if anyone is still using the system call.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
if (l->l_sysent == &sy[sp[i].sp_code]) {
break;
}
}
mutex_exit(&proc_lock);
if (l == NULL) {
continue;
}
/*
* We lose: one or more calls are still in use. Put back
* the old entrypoints and act like nothing happened.
* When we drop kernconfig_lock, any system calls held in
* sys_nomodule() will be restarted.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
sy[sp[i].sp_code].sy_call = sp[i].sp_call;
}
return EBUSY;
}
return 0;
}
/*
* Return true if system call tracing is enabled for the specified process.
*/
bool
trace_is_enabled(struct proc *p)
{
#ifdef SYSCALL_DEBUG
return (true);
#endif
#ifdef KTRACE
if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET)))
return (true);
#endif
#ifdef PTRACE
if (ISSET(p->p_slflag, PSL_SYSCALL))
return (true);
#endif
return (false);
}
/*
* Start trace of particular system call. If process is being traced,
* this routine is called by MD syscall dispatch code just before
* a system call is actually executed.
*/
int
trace_enter(register_t code, const struct sysent *sy, const void *args)
{
int error = 0;
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
struct proc *p = curlwp->l_proc;
#endif
#ifdef KDTRACE_HOOKS
if (sy->sy_entry) {
struct emul *e = p->p_emul;
		if (e->e_dtrace_syscall)
			(*e->e_dtrace_syscall)(sy->sy_entry, code, sy, args,
NULL, 0);
}
#endif
#ifdef SYSCALL_DEBUG
scdebug_call(code, args);
#endif /* SYSCALL_DEBUG */
ktrsyscall(code, args, sy->sy_narg);
#ifdef PTRACE
if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
(PSL_SYSCALL|PSL_TRACED)) {
proc_stoptrace(TRAP_SCE, code, args, NULL, 0);
if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) {
/* tracer will emulate syscall for us */
error = EJUSTRETURN;
}
}
#endif
return error;
}
/*
* End trace of particular system call. If process is being traced,
* this routine is called by MD syscall dispatch code just after
* a system call finishes.
* MD caller guarantees the passed 'code' is within the supported
 * system call number range for the emulation the process runs under.
*/
void
trace_exit(register_t code, const struct sysent *sy, const void *args,
register_t rval[], int error)
{
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
struct proc *p = curlwp->l_proc;
#endif
#ifdef KDTRACE_HOOKS
if (sy->sy_return) {
struct emul *e = p->p_emul;
		if (e->e_dtrace_syscall)
			(*p->p_emul->e_dtrace_syscall)(sy->sy_return, code, sy,
args, rval, error);
}
#endif
#ifdef SYSCALL_DEBUG
scdebug_ret(code, error, rval);
#endif /* SYSCALL_DEBUG */
ktrsysret(code, error, rval);
#ifdef PTRACE
if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) ==
(PSL_SYSCALL|PSL_TRACED)) {
proc_stoptrace(TRAP_SCX, code, args, rval, error);
}
CLR(p->p_slflag, PSL_SYSCALLEMU);
#endif
}
/* $NetBSD: ufs_readwrite.c,v 1.128 2022/02/21 17:07:45 hannken Exp $ */
/*-
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.128 2022/02/21 17:07:45 hannken Exp $");
#define FS struct fs
#define I_FS i_fs
#define READ ffs_read
#define READ_S "ffs_read"
#define WRITE ffs_write
#define WRITE_S "ffs_write"
#define BUFRD ffs_bufrd
#define BUFWR ffs_bufwr
#define ufs_blkoff ffs_blkoff
#define ufs_blksize ffs_blksize
#define ufs_lblkno ffs_lblkno
#define ufs_lblktosize ffs_lblktosize
#define ufs_blkroundup ffs_blkroundup
static int ufs_post_read_update(struct vnode *, int, int);
static int ufs_post_write_update(struct vnode *, struct uio *, int,
kauth_cred_t, off_t, int, int);
/*
* Vnode op for reading.
*/
/* ARGSUSED */
int
READ(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp;
struct inode *ip;
struct uio *uio;
struct ufsmount *ump;
vsize_t bytelen;
int error, ioflag, advice;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ip->i_ump;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
error = 0;
	KASSERT(uio->uio_rw == UIO_READ);
	KASSERT(vp->v_type == VREG || vp->v_type == VDIR);
/* XXX Eliminate me by refusing directory reads from userland. */
if (vp->v_type == VDIR)
		return BUFRD(vp, uio, ioflag, ap->a_cred);
	if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT)
		return ffs_snapshot_read(vp, uio, ioflag);

	if (uio->uio_offset >= ip->i_size)
goto out;
KASSERT(vp->v_type == VREG);
advice = IO_ADV_DECODE(ap->a_ioflag);
	while (uio->uio_resid > 0) {
		if (ioflag & IO_DIRECT) {
			genfs_directio(vp, uio, ioflag);
}
bytelen = MIN(ip->i_size - uio->uio_offset, uio->uio_resid);
if (bytelen == 0)
break;
error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
UBC_READ | UBC_PARTIALOK | UBC_VNODE_FLAGS(vp));
if (error)
break;
}
out:
error = ufs_post_read_update(vp, ap->a_ioflag, error);
return (error);
}
/*
* UFS op for reading via the buffer cache
*/
int
BUFRD(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
{
struct inode *ip;
struct ufsmount *ump;
FS *fs;
struct buf *bp;
daddr_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
int error;
	KASSERT(VOP_ISLOCKED(vp));
	KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
	KASSERT(uio->uio_rw == UIO_READ);
ip = VTOI(vp);
ump = ip->i_ump;
fs = ip->I_FS;
error = 0;
	KASSERT(vp->v_type != VLNK || ip->i_size >= ump->um_maxsymlinklen);
	KASSERT(vp->v_type != VLNK || ump->um_maxsymlinklen != 0 ||
DIP(ip, blocks) != 0);
if (uio->uio_offset > ump->um_maxfilesize)
return EFBIG;
if (uio->uio_resid == 0)
return 0;
	KASSERT(!ISSET(ip->i_flags, (SF_SNAPSHOT | SF_SNAPINVAL)));

	if (uio->uio_offset >= ip->i_size)
goto out;
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
bytesinfile = ip->i_size - uio->uio_offset;
if (bytesinfile <= 0)
break;
lbn = ufs_lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
size = ufs_blksize(fs, ip, lbn);
blkoffset = ufs_blkoff(fs, uio->uio_offset);
xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
bytesinfile);
if (ufs_lblktosize(fs, nextlbn) >= ip->i_size)
error = bread(vp, lbn, size, 0, &bp);
else {
int nextsize = ufs_blksize(fs, ip, nextlbn);
error = breadn(vp, lbn,
size, &nextlbn, &nextsize, 1, 0, &bp);
}
if (error)
break;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
break;
xfersize = size;
}
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp, 0);
}
	if (bp != NULL)
		brelse(bp, 0);
out:
error = ufs_post_read_update(vp, ioflag, error);
return (error);
}
static int
ufs_post_read_update(struct vnode *vp, int ioflag, int oerror)
{
struct inode *ip = VTOI(vp);
int error = oerror;
if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
ip->i_flag |= IN_ACCESS;
		if ((ioflag & IO_SYNC) == IO_SYNC) {
			error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error)
goto out;
			error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
			UFS_WAPBL_END(vp->v_mount);
}
}
out:
/* Read error overrides any inode update error. */
if (oerror)
error = oerror;
return error;
}
/*
* Vnode op for writing.
*/
int
WRITE(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp;
struct uio *uio;
struct inode *ip;
FS *fs;
kauth_cred_t cred;
off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
int blkoffset, error, flags, ioflag, resid;
int aflag;
vsize_t bytelen;
bool async;
struct ufsmount *ump;
cred = ap->a_cred;
ioflag = ap->a_ioflag;
uio = ap->a_uio;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ip->i_ump;
	KASSERT(vp->v_size == ip->i_size);
	KASSERT(uio->uio_rw == UIO_WRITE);
	KASSERT(vp->v_type == VREG);
	KASSERT(!ISSET(ioflag, IO_JOURNALLOCKED));
	UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount);

	if (ioflag & IO_APPEND)
		uio->uio_offset = ip->i_size;
	if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
return (EPERM);
fs = ip->I_FS;
if (uio->uio_offset < 0 ||
(u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
flags = ioflag & IO_SYNC ? B_SYNC : 0;
async = vp->v_mount->mnt_flag & MNT_ASYNC;
origoff = uio->uio_offset;
resid = uio->uio_resid;
osize = ip->i_size;
error = 0;
KASSERT(vp->v_type == VREG);
/*
* XXX The entire write operation must occur in a single WAPBL
* transaction because it may allocate disk blocks, if
* appending or filling holes, which is allowed to happen only
* if the write fully succeeds.
*
* If ubc_uiomove fails in the middle with EFAULT, we can clean
* up at the end with UFS_TRUNCATE. But if the power fails in
* the middle, there would be nobody to deallocate the blocks,
* without an fsck to globally analyze the file system.
*
* If the increasingly inaccurately named WAPBL were augmented
* with rollback records for block allocations, then we could
* split this into multiple transactions and commit the
* allocations in the last one.
*
* But WAPBL doesn't have that notion now, so we'll have to
* live with gigantic transactions and WAPBL tentacles in
* genfs_getpages/putpages to cope with the possibility that
* the transaction may or may not be locked on entry to the
* page cache.
*
* And even if we added that notion to WAPBL, it wouldn't help
* us get rid of the tentacles in genfs_getpages/putpages
* because we'd have to interoperate with old implementations
* that assume they can replay the log without fsck.
*/
error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error) {
return error;
}
preallocoff = round_page(ufs_blkroundup(fs, MAX(osize, uio->uio_offset)));
aflag = ioflag & IO_SYNC ? B_SYNC : 0;
nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
endallocoff = nsize - ufs_blkoff(fs, nsize);
/*
* if we're increasing the file size, deal with expanding
* the fragment if there is one.
*/
if (nsize > osize && ufs_lblkno(fs, osize) < UFS_NDADDR &&
ufs_lblkno(fs, osize) != ufs_lblkno(fs, nsize) &&
ufs_blkroundup(fs, osize) != osize) {
off_t eob;
eob = ufs_blkroundup(fs, osize);
uvm_vnp_setwritesize(vp, eob);
error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
if (error)
goto out;
if (flags & B_SYNC) {
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
round_page(eob),
PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
}
}
while (uio->uio_resid > 0) {
int ubc_flags = UBC_WRITE;
		bool overwrite; /* true if we're overwriting a whole block */
off_t newoff;
		if (ioflag & IO_DIRECT) {
			genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
}
oldoff = uio->uio_offset;
blkoffset = ufs_blkoff(fs, uio->uio_offset);
bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
if (bytelen == 0) {
break;
}
/*
* if we're filling in a hole, allocate the blocks now and
* initialize the pages first. if we're extending the file,
* we can safely allocate blocks without initializing pages
* since the new blocks will be inaccessible until the write
* is complete.
*/
overwrite = uio->uio_offset >= preallocoff &&
uio->uio_offset < endallocoff;
		if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
		    ufs_blkoff(fs, uio->uio_offset) == 0 &&
(uio->uio_offset & PAGE_MASK) == 0) {
vsize_t len;
len = trunc_page(bytelen);
len -= ufs_blkoff(fs, len);
if (len > 0) {
overwrite = true;
bytelen = len;
}
}
newoff = oldoff + bytelen;
		if (vp->v_size < newoff) {
			uvm_vnp_setwritesize(vp, newoff);
}
if (!overwrite) {
error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
cred, aflag);
if (error)
break;
} else {
genfs_node_wrlock(vp);
error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
aflag, cred);
genfs_node_unlock(vp);
if (error)
break;
ubc_flags |= UBC_FAULTBUSY;
}
/*
* copy the data.
*/
error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
IO_ADV_DECODE(ioflag), ubc_flags | UBC_VNODE_FLAGS(vp));
/*
* update UVM's notion of the size now that we've
* copied the data into the vnode's pages.
*
* we should update the size even when uiomove failed.
*/
		if (vp->v_size < newoff) {
			uvm_vnp_setsize(vp, newoff);
}
if (error)
break;
/*
* flush what we just wrote if necessary.
* XXXUBC simplistic async flushing.
*/
if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
(uio->uio_offset >> 16) << 16,
PGO_CLEANIT | PGO_JOURNALLOCKED | PGO_LAZY);
if (error)
break;
}
}
	if (error == 0 && ioflag & IO_SYNC) {
		rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
round_page(ufs_blkroundup(fs, uio->uio_offset)),
PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
}
out:
error = ufs_post_write_update(vp, uio, ioflag, cred, osize, resid,
error);
UFS_WAPBL_END(vp->v_mount);
return (error);
}
/*
* UFS op for writing via the buffer cache
*/
int
BUFWR(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
{
struct inode *ip;
struct ufsmount *ump;
FS *fs;
int flags;
struct buf *bp;
off_t osize;
int resid, xfersize, size, blkoffset;
daddr_t lbn;
int error;
	KASSERT(ISSET(ioflag, IO_NODELOCKED));
	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
	KASSERT(vp->v_type != VDIR || ISSET(ioflag, IO_SYNC));
	KASSERT(uio->uio_rw == UIO_WRITE);
	KASSERT(ISSET(ioflag, IO_JOURNALLOCKED));
	UFS_WAPBL_JLOCK_ASSERT(vp->v_mount);
ip = VTOI(vp);
ump = ip->i_ump;
fs = ip->I_FS;
	KASSERT(vp->v_size == ip->i_size);
	if (uio->uio_offset < 0 || uio->uio_resid > ump->um_maxfilesize ||
uio->uio_offset > (ump->um_maxfilesize - uio->uio_resid))
return EFBIG;
if (uio->uio_resid == 0)
return 0;
flags = ioflag & IO_SYNC ? B_SYNC : 0;
resid = uio->uio_resid;
osize = ip->i_size;
error = 0;
KASSERT(vp->v_type != VREG);
/* XXX Should never have pages cached here. */
	KASSERT(vp->v_uobj.uo_npages == 0);

	while (uio->uio_resid > 0) {
lbn = ufs_lblkno(fs, uio->uio_offset);
blkoffset = ufs_blkoff(fs, uio->uio_offset);
xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
if (fs->fs_bsize > xfersize)
flags |= B_CLRBUF;
else
flags &= ~B_CLRBUF;
error = UFS_BALLOC(vp, uio->uio_offset, xfersize, cred, flags,
&bp);
if (error)
break;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
DIP_ASSIGN(ip, size, ip->i_size);
uvm_vnp_setsize(vp, ip->i_size);
}
size = ufs_blksize(fs, ip, lbn) - bp->b_resid;
if (xfersize > size)
xfersize = size;
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
/*
* if we didn't clear the block and the uiomove failed,
* the buf will now contain part of some other file,
* so we need to invalidate it.
*/
if (error && (flags & B_CLRBUF) == 0) {
brelse(bp, BC_INVAL);
break;
}
if (ioflag & IO_SYNC)
(void)bwrite(bp);
else if (xfersize + blkoffset == fs->fs_bsize)
bawrite(bp);
else
			bdwrite(bp);
		if (error || xfersize == 0)
break;
}
error = ufs_post_write_update(vp, uio, ioflag, cred, osize, resid,
error);
return (error);
}
static int
ufs_post_write_update(struct vnode *vp, struct uio *uio, int ioflag,
kauth_cred_t cred, off_t osize, int resid, int oerror)
{
struct inode *ip = VTOI(vp);
int error = oerror;
/* Trigger ctime and mtime updates, and atime if MNT_RELATIME. */
ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if (vp->v_mount->mnt_flag & MNT_RELATIME)
		ip->i_flag |= IN_ACCESS;
/*
* If we successfully wrote any data and we are not the superuser,
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
	if (resid > uio->uio_resid && cred) {
		if (ip->i_mode & ISUID) {
			if (kauth_authorize_vnode(cred,
KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) {
ip->i_mode &= ~ISUID;
DIP_ASSIGN(ip, mode, ip->i_mode);
}
}
		if (ip->i_mode & ISGID) {
			if (kauth_authorize_vnode(cred,
KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) {
ip->i_mode &= ~ISGID;
DIP_ASSIGN(ip, mode, ip->i_mode);
}
}
}
/*
* Update the size on disk: truncate back to original size on
* error, or reflect the new size on success.
*/
if (error) {
(void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, cred);
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
else
UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
/* Make sure the vnode uvm size matches the inode file size. */
KASSERT(vp->v_size == ip->i_size);
/* Write error overrides any inode update error. */
if (oerror)
error = oerror;
return error;
}
/* $NetBSD: kern_timeout.c,v 1.79 2023/10/08 13:23:05 ad Exp $ */
/*-
* Copyright (c) 2003, 2006, 2007, 2008, 2009, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Thomas Nordin <nordin@openbsd.org>
* Copyright (c) 2000-2001 Artur Grabowski <art@openbsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_timeout.c,v 1.79 2023/10/08 13:23:05 ad Exp $");
/*
* Timeouts are kept in a hierarchical timing wheel. The c_time is the
* value of c_cpu->cc_ticks when the timeout should be called. There are
* four levels with 256 buckets each. See 'Scheme 7' in "Hashed and
* Hierarchical Timing Wheels: Efficient Data Structures for Implementing
* a Timer Facility" by George Varghese and Tony Lauck.
*
* Some of the "math" in here is a bit tricky. We have to beware of
* wrapping ints.
*
* We use the fact that any element added to the queue must be added with
* a positive time. That means that any element `to' on the queue cannot
* be scheduled to timeout further in time than INT_MAX, but c->c_time can
* be positive or negative so comparing it with anything is dangerous.
* The only way we can use the c->c_time value in any predictable way is
* when we calculate how far in the future `to' will timeout - "c->c_time
* - c->c_cpu->cc_ticks". The result will always be positive for future
* timeouts and 0 or negative for due timeouts.
*/
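/*
 * Illustrative example (not from the original source): with
 * cc_ticks == INT_MAX - 5 and a timeout scheduled 10 ticks out,
 * c_time wraps to INT_MIN + 4.  Comparing c_time directly with
 * cc_ticks would call that timeout "due", but the subtraction used
 * below stays correct because it is performed in unsigned arithmetic:
 *
 *	int delta = (int)((unsigned)c->c_time - (unsigned)cc->cc_ticks);
 *
 * which yields 10 here, i.e. still in the future (see the same
 * expression in callout_softclock() below).
 */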
#define _CALLOUT_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_access.h>
#include <ddb/db_cpu.h>
#include <ddb/db_sym.h>
#include <ddb/db_output.h>
#endif
#define BUCKETS 1024
#define WHEELSIZE 256
#define WHEELMASK 255
#define WHEELBITS 8
#define MASKWHEEL(wheel, time) (((time) >> ((wheel)*WHEELBITS)) & WHEELMASK)
#define BUCKET(cc, rel, abs) \
(((rel) <= (1 << (2*WHEELBITS))) \
? ((rel) <= (1 << WHEELBITS)) \
? &(cc)->cc_wheel[MASKWHEEL(0, (abs))] \
: &(cc)->cc_wheel[MASKWHEEL(1, (abs)) + WHEELSIZE] \
: ((rel) <= (1 << (3*WHEELBITS))) \
? &(cc)->cc_wheel[MASKWHEEL(2, (abs)) + 2*WHEELSIZE] \
: &(cc)->cc_wheel[MASKWHEEL(3, (abs)) + 3*WHEELSIZE])
#define MOVEBUCKET(cc, wheel, time) \
CIRCQ_APPEND(&(cc)->cc_todo, \
&(cc)->cc_wheel[MASKWHEEL((wheel), (time)) + (wheel)*WHEELSIZE])
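/*
 * Illustrative note (not from the original source): BUCKET() picks a
 * wheel level from the relative expiry time and indexes it with the
 * matching byte of the absolute time.  For example, with rel == 300
 * (more than 2^8 but at most 2^16 ticks away) the callout lands in
 * the second wheel:
 *
 *	&(cc)->cc_wheel[MASKWHEEL(1, abs) + WHEELSIZE]
 *
 * MOVEBUCKET() later cascades such buckets onto cc_todo as the lower
 * wheels roll over (see callout_hardclock()).
 */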
/*
* Circular queue definitions.
*/
#define CIRCQ_INIT(list) \
do { \
(list)->cq_next_l = (list); \
(list)->cq_prev_l = (list); \
} while (/*CONSTCOND*/0)
#define CIRCQ_INSERT(elem, list) \
do { \
(elem)->cq_prev_e = (list)->cq_prev_e; \
(elem)->cq_next_l = (list); \
(list)->cq_prev_l->cq_next_l = (elem); \
(list)->cq_prev_l = (elem); \
} while (/*CONSTCOND*/0)
#define CIRCQ_APPEND(fst, snd) \
do { \
if (!CIRCQ_EMPTY(snd)) { \
(fst)->cq_prev_l->cq_next_l = (snd)->cq_next_l; \
(snd)->cq_next_l->cq_prev_l = (fst)->cq_prev_l; \
(snd)->cq_prev_l->cq_next_l = (fst); \
(fst)->cq_prev_l = (snd)->cq_prev_l; \
CIRCQ_INIT(snd); \
} \
} while (/*CONSTCOND*/0)
#define CIRCQ_REMOVE(elem) \
do { \
(elem)->cq_next_l->cq_prev_e = (elem)->cq_prev_e; \
(elem)->cq_prev_l->cq_next_e = (elem)->cq_next_e; \
} while (/*CONSTCOND*/0)
#define CIRCQ_FIRST(list) ((list)->cq_next_e)
#define CIRCQ_NEXT(elem) ((elem)->cq_next_e)
#define CIRCQ_LAST(elem,list) ((elem)->cq_next_l == (list))
#define CIRCQ_EMPTY(list) ((list)->cq_next_l == (list))
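/*
 * Illustrative sketch (not from the original source) of how the CIRCQ
 * macros are used below: a list head and a callout's c_list are the
 * same circular node type, so a bucket is drained by repeatedly taking
 * its first element, e.g.
 *
 *	while (!CIRCQ_EMPTY(&cc->cc_todo)) {
 *		c = CIRCQ_FIRST(&cc->cc_todo);
 *		CIRCQ_REMOVE(&c->c_list);
 *		...
 *	}
 *
 * which is exactly the loop shape of callout_softclock().
 */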
struct callout_cpu {
kmutex_t *cc_lock;
sleepq_t cc_sleepq;
u_int cc_nwait;
u_int cc_ticks;
lwp_t *cc_lwp;
callout_impl_t *cc_active;
struct evcnt cc_ev_late;
struct evcnt cc_ev_block;
struct callout_circq cc_todo; /* Worklist */
struct callout_circq cc_wheel[BUCKETS]; /* Queues of timeouts */
char cc_name1[12];
char cc_name2[12];
struct cpu_info *cc_cpu;
};
#ifdef DDB
static struct callout_cpu ccb;
#endif
#ifndef CRASH /* _KERNEL */
static void callout_softclock(void *);
static void callout_wait(callout_impl_t *, void *, kmutex_t *);
static struct callout_cpu callout_cpu0 __cacheline_aligned;
static void *callout_sih __read_mostly;
SDT_PROBE_DEFINE2(sdt, kernel, callout, init,
"struct callout *"/*ch*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE1(sdt, kernel, callout, destroy,
"struct callout *"/*ch*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, setfunc,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, schedule,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"int"/*ticks*/);
SDT_PROBE_DEFINE6(sdt, kernel, callout, migrate,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"struct cpu_info *"/*ocpu*/,
"struct cpu_info *"/*ncpu*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, entry,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, return,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, stop,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"bool"/*expired*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, halt,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, halt__done,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"bool"/*expired*/);
syncobj_t callout_syncobj = {
.sobj_name = "callout",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
static inline kmutex_t *
callout_lock(callout_impl_t *c)
{
struct callout_cpu *cc;
kmutex_t *lock;
for (;;) {
cc = c->c_cpu;
lock = cc->cc_lock;
mutex_spin_enter(lock);
if (__predict_true(cc == c->c_cpu))
return lock;
mutex_spin_exit(lock);
}
}
/*
* Check if the callout is currently running on an LWP that isn't curlwp.
*/
static inline bool
callout_running_somewhere_else(callout_impl_t *c, struct callout_cpu *cc)
{
	KASSERT(c->c_cpu == cc);

	return cc->cc_active == c && cc->cc_lwp != curlwp;
}
/*
* callout_startup:
*
* Initialize the callout facility, called at system startup time.
* Do just enough to allow callouts to be safely registered.
*/
void
callout_startup(void)
{
struct callout_cpu *cc;
int b;
KASSERT(curcpu()->ci_data.cpu_callout == NULL);
cc = &callout_cpu0;
cc->cc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
CIRCQ_INIT(&cc->cc_todo);
for (b = 0; b < BUCKETS; b++)
CIRCQ_INIT(&cc->cc_wheel[b]);
curcpu()->ci_data.cpu_callout = cc;
}
/*
* callout_init_cpu:
*
* Per-CPU initialization.
*/
CTASSERT(sizeof(callout_impl_t) <= sizeof(callout_t));
void
callout_init_cpu(struct cpu_info *ci)
{
struct callout_cpu *cc;
int b;
if ((cc = ci->ci_data.cpu_callout) == NULL) {
cc = kmem_zalloc(sizeof(*cc), KM_SLEEP);
cc->cc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
CIRCQ_INIT(&cc->cc_todo);
for (b = 0; b < BUCKETS; b++)
CIRCQ_INIT(&cc->cc_wheel[b]);
} else {
/* Boot CPU, one time only. */
callout_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
callout_softclock, NULL);
if (callout_sih == NULL)
panic("callout_init_cpu (2)");
}
sleepq_init(&cc->cc_sleepq);
snprintf(cc->cc_name1, sizeof(cc->cc_name1), "late/%u",
cpu_index(ci));
evcnt_attach_dynamic(&cc->cc_ev_late, EVCNT_TYPE_MISC,
NULL, "callout", cc->cc_name1);
snprintf(cc->cc_name2, sizeof(cc->cc_name2), "wait/%u",
cpu_index(ci));
evcnt_attach_dynamic(&cc->cc_ev_block, EVCNT_TYPE_MISC,
NULL, "callout", cc->cc_name2);
cc->cc_cpu = ci;
ci->ci_data.cpu_callout = cc;
}
/*
* callout_init:
*
* Initialize a callout structure. This must be quick, so we fill
* only the minimum number of fields.
*/
void
callout_init(callout_t *cs, u_int flags)
{
callout_impl_t *c = (callout_impl_t *)cs;
struct callout_cpu *cc;
	KASSERT((flags & ~CALLOUT_FLAGMASK) == 0);

	SDT_PROBE2(sdt, kernel, callout, init, cs, flags);
cc = curcpu()->ci_data.cpu_callout;
c->c_func = NULL;
c->c_magic = CALLOUT_MAGIC;
if (__predict_true((flags & CALLOUT_MPSAFE) != 0 && cc != NULL)) {
c->c_flags = flags;
c->c_cpu = cc;
return;
}
c->c_flags = flags | CALLOUT_BOUND;
c->c_cpu = &callout_cpu0;
}
/*
* callout_destroy:
*
* Destroy a callout structure. The callout must be stopped.
*/
void
callout_destroy(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
	SDT_PROBE1(sdt, kernel, callout, destroy, cs);

	KASSERTMSG(c->c_magic == CALLOUT_MAGIC,
"callout %p: c_magic (%#x) != CALLOUT_MAGIC (%#x)",
c, c->c_magic, CALLOUT_MAGIC);
/*
* It's not necessary to lock in order to see the correct value
* of c->c_flags. If the callout could potentially have been
* running, the current thread should have stopped it.
*/
KASSERTMSG((c->c_flags & CALLOUT_PENDING) == 0,
"pending callout %p: c_func (%p) c_flags (%#x) destroyed from %p",
c, c->c_func, c->c_flags, __builtin_return_address(0));
KASSERTMSG(!callout_running_somewhere_else(c, c->c_cpu),
"running callout %p: c_func (%p) c_flags (%#x) destroyed from %p",
c, c->c_func, c->c_flags, __builtin_return_address(0));
c->c_magic = 0;
}
/*
* callout_schedule_locked:
*
* Schedule a callout to run. The function and argument must
* already be set in the callout structure. Must be called with
* callout_lock.
*/
static void
callout_schedule_locked(callout_impl_t *c, kmutex_t *lock, int to_ticks)
{
struct callout_cpu *cc, *occ;
int old_time;
SDT_PROBE5(sdt, kernel, callout, schedule,
c, c->c_func, c->c_arg, c->c_flags, to_ticks);
	KASSERT(to_ticks >= 0);
	KASSERT(c->c_func != NULL);
/* Initialize the time here, it won't change. */
occ = c->c_cpu;
c->c_flags &= ~(CALLOUT_FIRED | CALLOUT_INVOKING);
/*
* If this timeout is already scheduled and now is moved
* earlier, reschedule it now. Otherwise leave it in place
* and let it be rescheduled later.
*/
if ((c->c_flags & CALLOUT_PENDING) != 0) {
/* Leave on existing CPU. */
old_time = c->c_time;
c->c_time = to_ticks + occ->cc_ticks;
		if (c->c_time - old_time < 0) {
			CIRCQ_REMOVE(&c->c_list);
CIRCQ_INSERT(&c->c_list, &occ->cc_todo);
}
mutex_spin_exit(lock);
return;
}
cc = curcpu()->ci_data.cpu_callout;
if ((c->c_flags & CALLOUT_BOUND) != 0 || cc == occ ||
!mutex_tryenter(cc->cc_lock)) {
/* Leave on existing CPU. */
c->c_time = to_ticks + occ->cc_ticks;
c->c_flags |= CALLOUT_PENDING;
CIRCQ_INSERT(&c->c_list, &occ->cc_todo);
} else {
/* Move to this CPU. */
c->c_cpu = cc;
c->c_time = to_ticks + cc->cc_ticks;
c->c_flags |= CALLOUT_PENDING;
CIRCQ_INSERT(&c->c_list, &cc->cc_todo);
mutex_spin_exit(cc->cc_lock);
SDT_PROBE6(sdt, kernel, callout, migrate,
c, c->c_func, c->c_arg, c->c_flags,
occ->cc_cpu, cc->cc_cpu);
}
mutex_spin_exit(lock);
}
/*
* callout_reset:
*
* Reset a callout structure with a new function and argument, and
* schedule it to run.
*/
void
callout_reset(callout_t *cs, int to_ticks, void (*func)(void *), void *arg)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(func != NULL);

	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, setfunc, cs, func, arg, c->c_flags);
c->c_func = func;
c->c_arg = arg;
callout_schedule_locked(c, lock, to_ticks);
}
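/*
 * Illustrative usage sketch (an assumption, not part of the original
 * source): a typical MP-safe consumer initializes the callout once and
 * then (re)arms it, e.g.
 *
 *	callout_init(&sc->sc_tick, CALLOUT_MPSAFE);
 *	callout_reset(&sc->sc_tick, hz, mydrv_tick, sc);
 *
 * where sc, sc_tick and mydrv_tick are hypothetical driver names.
 */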
/*
* callout_schedule:
*
* Schedule a callout to run. The function and argument must
* already be set in the callout structure.
*/
void
callout_schedule(callout_t *cs, int to_ticks)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
callout_schedule_locked(c, lock, to_ticks);
}
/*
* callout_stop:
*
* Try to cancel a pending callout. It may be too late: the callout
* could be running on another CPU. If called from interrupt context,
* the callout could already be in progress at a lower priority.
*/
bool
callout_stop(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool expired;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);

	if ((c->c_flags & CALLOUT_PENDING) != 0)
		CIRCQ_REMOVE(&c->c_list);
expired = ((c->c_flags & CALLOUT_FIRED) != 0);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
SDT_PROBE5(sdt, kernel, callout, stop,
c, c->c_func, c->c_arg, c->c_flags, expired);
mutex_spin_exit(lock);
return expired;
}
/*
* callout_halt:
*
* Cancel a pending callout. If in-flight, block until it completes.
* May not be called from a hard interrupt handler. If the callout
* can take locks, the caller of callout_halt() must not hold any of
* those locks, otherwise the two could deadlock. If 'interlock' is
* non-NULL and we must wait for the callout to complete, it will be
* released and re-acquired before returning.
*/
bool
callout_halt(callout_t *cs, void *interlock)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(!cpu_intr_p());
	KASSERT(interlock == NULL || mutex_owned(interlock));
/* Fast path. */
	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, halt,
c, c->c_func, c->c_arg, c->c_flags);
	if ((c->c_flags & CALLOUT_PENDING) != 0)
		CIRCQ_REMOVE(&c->c_list);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
	if (__predict_false(callout_running_somewhere_else(c, c->c_cpu))) {
		callout_wait(c, interlock, lock);
return true;
}
SDT_PROBE5(sdt, kernel, callout, halt__done,
c, c->c_func, c->c_arg, c->c_flags, /*expired*/false);
mutex_spin_exit(lock);
return false;
}
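/*
 * Illustrative usage sketch (an assumption, not part of the original
 * source): when the callout handler takes a driver lock, that lock is
 * passed as the interlock so it can be dropped while waiting, e.g.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_dying = true;
 *	callout_halt(&sc->sc_tick, &sc->sc_lock);
 *	mutex_exit(&sc->sc_lock);
 *
 * sc, sc_lock, sc_dying and sc_tick are hypothetical driver names.
 */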
/*
* callout_wait:
*
* Slow path for callout_halt(). Deliberately marked __noinline to
* prevent unneeded overhead in the caller.
*/
static void __noinline
callout_wait(callout_impl_t *c, void *interlock, kmutex_t *lock)
{
struct callout_cpu *cc;
struct lwp *l;
kmutex_t *relock;
int nlocks;
l = curlwp;
relock = NULL;
for (;;) {
/*
* At this point we know the callout is not pending, but it
* could be running on a CPU somewhere. That can be curcpu
* in a few cases:
*
* - curlwp is a higher priority soft interrupt
* - the callout blocked on a lock and is currently asleep
* - the callout itself has called callout_halt() (nice!)
*/
cc = c->c_cpu;
if (__predict_true(!callout_running_somewhere_else(c, cc)))
break;
/* It's running - need to wait for it to complete. */
if (interlock != NULL) {
/*
* Avoid potential scheduler lock order problems by
* dropping the interlock without the callout lock
* held; then retry.
*/
mutex_spin_exit(lock);
mutex_exit(interlock);
relock = interlock;
interlock = NULL;
} else {
/* XXX Better to do priority inheritance. */
KASSERT(l->l_wchan == NULL);
cc->cc_nwait++;
cc->cc_ev_block.ev_count++;
nlocks = sleepq_enter(&cc->cc_sleepq, l, cc->cc_lock);
sleepq_enqueue(&cc->cc_sleepq, cc, "callout",
&callout_syncobj, false);
sleepq_block(0, false, &callout_syncobj, nlocks);
}
/*
* Re-lock the callout and check the state of play again.
* It's a common design pattern for callouts to re-schedule
* themselves so put a stop to it again if needed.
*/
lock = callout_lock(c);
if ((c->c_flags & CALLOUT_PENDING) != 0)
CIRCQ_REMOVE(&c->c_list);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
}
SDT_PROBE5(sdt, kernel, callout, halt__done,
c, c->c_func, c->c_arg, c->c_flags, /*expired*/true);
mutex_spin_exit(lock);
if (__predict_false(relock != NULL))
mutex_enter(relock);
}
#ifdef notyet
/*
* callout_bind:
*
* Bind a callout so that it will only execute on one CPU.
* The callout must be stopped, and must be MPSAFE.
*
* XXX Disabled for now until it is decided how to handle
* offlined CPUs. We may want weak+strong binding.
*/
void
callout_bind(callout_t *cs, struct cpu_info *ci)
{
callout_impl_t *c = (callout_impl_t *)cs;
struct callout_cpu *cc;
kmutex_t *lock;
KASSERT((c->c_flags & CALLOUT_PENDING) == 0);
KASSERT(c->c_cpu->cc_active != c);
KASSERT(c->c_magic == CALLOUT_MAGIC);
KASSERT((c->c_flags & CALLOUT_MPSAFE) != 0);
lock = callout_lock(c);
cc = ci->ci_data.cpu_callout;
c->c_flags |= CALLOUT_BOUND;
if (c->c_cpu != cc) {
/*
* Assigning c_cpu effectively unlocks the callout
* structure, as we don't hold the new CPU's lock.
* Issue memory barrier to prevent accesses being
* reordered.
*/
membar_exit();
c->c_cpu = cc;
}
mutex_spin_exit(lock);
}
#endif
void
callout_setfunc(callout_t *cs, void (*func)(void *), void *arg)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(func != NULL);

	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, setfunc, cs, func, arg, c->c_flags);
c->c_func = func;
c->c_arg = arg;
mutex_spin_exit(lock);
}
bool
callout_expired(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
KASSERT(c->c_magic == CALLOUT_MAGIC);
lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_FIRED) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_active(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
rv = ((c->c_flags & (CALLOUT_PENDING|CALLOUT_FIRED)) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_pending(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_PENDING) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_invoking(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
KASSERT(c->c_magic == CALLOUT_MAGIC);
lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_INVOKING) != 0);
mutex_spin_exit(lock);
return rv;
}
void
callout_ack(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
KASSERT(c->c_magic == CALLOUT_MAGIC);
lock = callout_lock(c);
c->c_flags &= ~CALLOUT_INVOKING;
mutex_spin_exit(lock);
}
/*
* callout_hardclock:
*
* Called from hardclock() once every tick. We schedule a soft
* interrupt if there is work to be done.
*/
void
callout_hardclock(void)
{
struct callout_cpu *cc;
int needsoftclock, ticks;
cc = curcpu()->ci_data.cpu_callout;
mutex_spin_enter(cc->cc_lock);
ticks = ++cc->cc_ticks;
MOVEBUCKET(cc, 0, ticks);
if (MASKWHEEL(0, ticks) == 0) {
MOVEBUCKET(cc, 1, ticks);
if (MASKWHEEL(1, ticks) == 0) {
MOVEBUCKET(cc, 2, ticks);
if (MASKWHEEL(2, ticks) == 0)
MOVEBUCKET(cc, 3, ticks);
}
}
needsoftclock = !CIRCQ_EMPTY(&cc->cc_todo);
mutex_spin_exit(cc->cc_lock);
if (needsoftclock)
softint_schedule(callout_sih);
}
/*
* callout_softclock:
*
* Soft interrupt handler, scheduled above if there is work to
* be done. Callouts are made in soft interrupt context.
*/
static void
callout_softclock(void *v)
{
callout_impl_t *c;
struct callout_cpu *cc;
void (*func)(void *);
void *arg;
int mpsafe, count, ticks, delta;
u_int flags __unused;
lwp_t *l;
l = curlwp;
KASSERT(l->l_cpu == curcpu());
cc = l->l_cpu->ci_data.cpu_callout;
mutex_spin_enter(cc->cc_lock);
cc->cc_lwp = l;
while (!CIRCQ_EMPTY(&cc->cc_todo)) {
c = CIRCQ_FIRST(&cc->cc_todo);
KASSERT(c->c_magic == CALLOUT_MAGIC);
KASSERT(c->c_func != NULL);
KASSERT(c->c_cpu == cc);
KASSERT((c->c_flags & CALLOUT_PENDING) != 0);
KASSERT((c->c_flags & CALLOUT_FIRED) == 0);
CIRCQ_REMOVE(&c->c_list);
/* If due run it, otherwise insert it into the right bucket. */
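/*
* Editorial note: the unsigned subtraction below, cast back to int,
* yields a signed tick distance that remains correct even after the
* per-CPU tick counter wraps around.
*/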
ticks = cc->cc_ticks;
delta = (int)((unsigned)c->c_time - (unsigned)ticks);
if (delta > 0) {
CIRCQ_INSERT(&c->c_list, BUCKET(cc, delta, c->c_time));
continue;
}
if (delta < 0)
cc->cc_ev_late.ev_count++;
c->c_flags = (c->c_flags & ~CALLOUT_PENDING) |
(CALLOUT_FIRED | CALLOUT_INVOKING);
mpsafe = (c->c_flags & CALLOUT_MPSAFE);
func = c->c_func;
arg = c->c_arg;
cc->cc_active = c;
flags = c->c_flags;
mutex_spin_exit(cc->cc_lock);
KASSERT(func != NULL);
SDT_PROBE4(sdt, kernel, callout, entry, c, func, arg, flags);
if (__predict_false(!mpsafe)) {
KERNEL_LOCK(1, NULL);
(*func)(arg);
KERNEL_UNLOCK_ONE(NULL);
} else
(*func)(arg);
SDT_PROBE4(sdt, kernel, callout, return, c, func, arg, flags);
KASSERTMSG(l->l_blcnt == 0,
"callout %p func %p leaked %d biglocks",
c, func, l->l_blcnt);
mutex_spin_enter(cc->cc_lock);
/*
* We can't touch 'c' here because it might already
* have been freed. If any LWPs are waiting for the
* callout to complete, awaken them.
*/
cc->cc_active = NULL;
if ((count = cc->cc_nwait) != 0) {
cc->cc_nwait = 0;
/* sleepq_wake() drops the lock. */
sleepq_wake(&cc->cc_sleepq, cc, count, cc->cc_lock);
mutex_spin_enter(cc->cc_lock);
}
}
cc->cc_lwp = NULL;
mutex_spin_exit(cc->cc_lock);
}
#endif /* !CRASH */
#ifdef DDB
static void
db_show_callout_bucket(struct callout_cpu *cc, struct callout_circq *kbucket,
struct callout_circq *bucket)
{
callout_impl_t *c, ci;
db_expr_t offset;
const char *name;
static char question[] = "?";
int b;
if (CIRCQ_LAST(bucket, kbucket))
return;
for (c = CIRCQ_FIRST(bucket); /*nothing*/; c = CIRCQ_NEXT(&c->c_list)) {
db_read_bytes((db_addr_t)c, sizeof(ci), (char *)&ci);
c = &ci;
db_find_sym_and_offset((db_addr_t)(intptr_t)c->c_func, &name,
&offset);
name = name ? name : question;
b = (bucket - cc->cc_wheel);
if (b < 0)
b = -WHEELSIZE;
db_printf("%9d %2d/%-4d %16lx %s\n",
c->c_time - cc->cc_ticks, b / WHEELSIZE, b,
(u_long)c->c_arg, name);
if (CIRCQ_LAST(&c->c_list, kbucket))
break;
}
}
void
db_show_callout(db_expr_t addr, bool haddr, db_expr_t count, const char *modif)
{
struct callout_cpu *cc, ccb;
struct cpu_info *ci;
int b;
#ifndef CRASH
db_printf("hardclock_ticks now: %d\n", getticks());
#endif
db_printf(" ticks wheel arg func\n");
/*
* Don't lock the callwheel; all the other CPUs are paused
* anyhow, and we might be called in a circumstance where
* some other CPU was paused while holding the lock.
*/
for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
db_read_bytes((db_addr_t)ci +
offsetof(struct cpu_info, ci_data.cpu_callout),
sizeof(cc), (char *)&cc);
db_read_bytes((db_addr_t)cc, sizeof(ccb), (char *)&ccb);
db_show_callout_bucket(&ccb, &cc->cc_todo, &ccb.cc_todo);
}
for (b = 0; b < BUCKETS; b++) {
for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
db_read_bytes((db_addr_t)ci +
offsetof(struct cpu_info, ci_data.cpu_callout),
sizeof(cc), (char *)&cc);
db_read_bytes((db_addr_t)cc, sizeof(ccb), (char *)&ccb);
db_show_callout_bucket(&ccb, &cc->cc_wheel[b],
&ccb.cc_wheel[b]);
}
}
}
#endif /* DDB */
/* $NetBSD: cpu.c,v 1.209 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright (c) 2000-2020 NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Bill Sommerfeld of RedBack Networks Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1999 Stefan Grefen
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.209 2023/07/16 19:55:43 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_mpbios.h" /* for MPDEBUG */
#include "opt_mtrr.h"
#include "opt_multiprocessor.h"
#include "opt_svs.h"
#include "lapic.h"
#include "ioapic.h"
#include "acpica.h"
#include "hpet.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/cpu.h>
#include <sys/cpufreq.h>
#include <sys/idle.h>
#include <sys/atomic.h>
#include <sys/reboot.h>
#include <sys/csan.h>
#include <uvm/uvm.h>
#include "acpica.h" /* for NACPICA, for mp_verbose */
#include <x86/machdep.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#if defined(MULTIPROCESSOR)
#include <machine/mpbiosvar.h>
#endif
#include <machine/mpconfig.h> /* for mp_verbose */
#include <machine/pcb.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/gdt.h>
#include <machine/mtrr.h>
#include <machine/pio.h>
#include <machine/cpu_counter.h>
#include <machine/pmap_private.h>
#include <x86/fpu.h>
#if NACPICA > 0
#include <dev/acpi/acpi_srat.h>
#endif
#if NLAPIC > 0
#include <machine/apicvar.h>
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#endif
#include <dev/ic/mc146818reg.h>
#include <dev/ic/hpetvar.h>
#include <i386/isa/nvram.h>
#include <dev/isa/isareg.h>
#include "tsc.h"
#ifndef XENPV
#include "hyperv.h"
#if NHYPERV > 0
#include <x86/x86/hypervvar.h>
#endif
#endif
#ifdef XEN
#include <xen/hypervisor.h>
#endif
static int cpu_match(device_t, cfdata_t, void *);
static void cpu_attach(device_t, device_t, void *);
static void cpu_defer(device_t);
static int cpu_rescan(device_t, const char *, const int *);
static void cpu_childdetached(device_t, device_t);
static bool cpu_stop(device_t);
static bool cpu_suspend(device_t, const pmf_qual_t *);
static bool cpu_resume(device_t, const pmf_qual_t *);
static bool cpu_shutdown(device_t, int);
struct cpu_softc {
device_t sc_dev; /* device tree glue */
struct cpu_info *sc_info; /* pointer to CPU info */
bool sc_wasonline;
};
#ifdef MULTIPROCESSOR
int mp_cpu_start(struct cpu_info *, paddr_t);
void mp_cpu_start_cleanup(struct cpu_info *);
const struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
mp_cpu_start_cleanup };
#endif
CFATTACH_DECL2_NEW(cpu, sizeof(struct cpu_softc),
cpu_match, cpu_attach, NULL, NULL, cpu_rescan, cpu_childdetached);
/*
* Statically-allocated CPU info for the primary CPU (or the only
* CPU, on uniprocessors). The CPU info list is initialized to
* point at it.
*/
struct cpu_info cpu_info_primary __aligned(CACHE_LINE_SIZE) = {
.ci_dev = 0,
.ci_self = &cpu_info_primary,
.ci_idepth = -1,
.ci_curlwp = &lwp0,
.ci_curldt = -1,
.ci_kfpu_spl = -1,
};
struct cpu_info *cpu_info_list = &cpu_info_primary;
#ifdef i386
void cpu_set_tss_gates(struct cpu_info *);
#endif
static void cpu_init_idle_lwp(struct cpu_info *);
uint32_t cpu_feature[7] __read_mostly; /* X86 CPUID feature bits */
/* [0] basic features cpuid.1:%edx
* [1] basic features cpuid.1:%ecx (CPUID2_xxx bits)
* [2] extended features cpuid:80000001:%edx
* [3] extended features cpuid:80000001:%ecx
* [4] VIA padlock features
* [5] structured extended features cpuid.7:%ebx
* [6] structured extended features cpuid.7:%ecx
*/
#ifdef MULTIPROCESSOR
bool x86_mp_online;
paddr_t mp_trampoline_paddr = MP_TRAMPOLINE;
#endif
#if NLAPIC > 0
static vaddr_t cmos_data_mapping;
#endif
struct cpu_info *cpu_starting;
#ifdef MULTIPROCESSOR
void cpu_hatch(void *);
static void cpu_boot_secondary(struct cpu_info *ci);
static void cpu_start_secondary(struct cpu_info *ci);
#if NLAPIC > 0
static void cpu_copy_trampoline(paddr_t);
#endif
#endif /* MULTIPROCESSOR */
/*
* Runs once per boot once multiprocessor goo has been detected and
* the local APIC on the boot processor has been mapped.
*
* Called from lapic_boot_init() (from mpbios_scan()).
*/
#if NLAPIC > 0
void
cpu_init_first(void)
{
cpu_info_primary.ci_cpuid = lapic_cpu_number();
cmos_data_mapping = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
if (cmos_data_mapping == 0)
panic("No KVA for page 0");
pmap_kenter_pa(cmos_data_mapping, 0, VM_PROT_READ|VM_PROT_WRITE, 0);
pmap_update(pmap_kernel());
}
#endif
static int
cpu_match(device_t parent, cfdata_t match, void *aux)
{
return 1;
}
#ifdef __HAVE_PCPU_AREA
void
cpu_pcpuarea_init(struct cpu_info *ci)
{
struct vm_page *pg;
size_t i, npages;
vaddr_t base, va;
paddr_t pa;
CTASSERT(sizeof(struct pcpu_entry) % PAGE_SIZE == 0);
npages = sizeof(struct pcpu_entry) / PAGE_SIZE;
base = (vaddr_t)&pcpuarea->ent[cpu_index(ci)];
for (i = 0; i < npages; i++) {
pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
if (pg == NULL) {
panic("failed to allocate pcpu PA");
}
va = base + i * PAGE_SIZE;
pa = VM_PAGE_TO_PHYS(pg);
pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
}
#endif
static void
cpu_vm_init(struct cpu_info *ci)
{
unsigned int ncolors = 2;
/*
* XXX: for APs the cache info has not been initialized yet,
* but that does not matter because uvm only pays attention to
* the maximum. We should fix this once CPUs can have different
* cache sizes.
*/
for (unsigned int i = CAI_ICACHE; i <= CAI_L2CACHE; i++) {
struct x86_cache_info *cai;
unsigned int tcolors;
cai = &ci->ci_cinfo[i];
tcolors = atop(cai->cai_totalsize);
switch (cai->cai_associativity) {
case 0xff:
tcolors = 1; /* fully associative */
break;
case 0:
case 1:
break;
default:
tcolors /= cai->cai_associativity;
}
if (tcolors <= ncolors)
continue;
ncolors = tcolors;
}
/*
* If the desired number of colors is not a power of
* two, it won't be good. Find the greatest power of
* two which is an even divisor of the number of colors,
* to preserve even coloring of pages.
*/
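/*
* Editorial example: with ncolors = 12 the loop below tries the powers
* of two 1, 2, 4 and 8; the largest that divides 12 evenly is 4, so
* ncolors would be reduced to 4.
*/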
if (ncolors & (ncolors - 1) ) {
unsigned int try, picked = 1;
for (try = 1; try < ncolors; try *= 2) {
if (ncolors % try == 0) picked = try;
}
if (picked == 1) {
panic("desired number of cache colors %u is "
" > 1, but not even!", ncolors);
}
ncolors = picked;
}
/*
* Knowing the size of the largest cache on this CPU, potentially
* re-color our pages.
*/
aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors);
uvm_page_recolor(ncolors);
pmap_tlb_cpu_init(ci);
#ifndef __HAVE_DIRECT_MAP
pmap_vpage_cpu_init(ci);
#endif
}
static void
cpu_attach(device_t parent, device_t self, void *aux)
{
struct cpu_softc *sc = device_private(self);
struct cpu_attach_args *caa = aux;
struct cpu_info *ci;
uintptr_t ptr;
#if NLAPIC > 0
int cpunum = caa->cpu_number;
#endif
static bool again;
sc->sc_dev = self;
if (ncpu > maxcpus) {
#ifndef _LP64
aprint_error(": too many CPUs, please use NetBSD/amd64\n");
#else
aprint_error(": too many CPUs\n");
#endif
return;
}
/*
* If we're an Application Processor, allocate a cpu_info
* structure, otherwise use the primary's.
*/
if (caa->cpu_role == CPU_ROLE_AP) {
if ((boothowto & RB_MD1) != 0) {
aprint_error(": multiprocessor boot disabled\n");
if (!pmf_device_register(self, NULL, NULL))
aprint_error_dev(self,
"couldn't establish power handler\n");
return;
}
aprint_naive(": Application Processor\n");
ptr = (uintptr_t)uvm_km_alloc(kernel_map,
sizeof(*ci) + CACHE_LINE_SIZE - 1, 0,
UVM_KMF_WIRED|UVM_KMF_ZERO);
ci = (struct cpu_info *)roundup2(ptr, CACHE_LINE_SIZE);
ci->ci_curldt = -1;
} else {
aprint_naive(": %s Processor\n",
caa->cpu_role == CPU_ROLE_SP ? "Single" : "Boot");
ci = &cpu_info_primary;
#if NLAPIC > 0
if (cpunum != lapic_cpu_number()) {
/* XXX should be done earlier. */
uint32_t reg;
aprint_verbose("\n");
aprint_verbose_dev(self, "running CPU at apic %d"
" instead of at expected %d", lapic_cpu_number(),
cpunum);
reg = lapic_readreg(LAPIC_ID);
lapic_writereg(LAPIC_ID, (reg & ~LAPIC_ID_MASK) |
(cpunum << LAPIC_ID_SHIFT));
}
if (cpunum != lapic_cpu_number()) {
aprint_error_dev(self, "unable to reset apic id\n");
}
#endif
}
ci->ci_self = ci;
sc->sc_info = ci;
ci->ci_dev = self;
ci->ci_acpiid = caa->cpu_id;
ci->ci_cpuid = caa->cpu_number;
ci->ci_func = caa->cpu_func;
ci->ci_kfpu_spl = -1;
aprint_normal("\n");
/* Must be before mi_cpu_attach(). */
cpu_vm_init(ci);
if (caa->cpu_role == CPU_ROLE_AP) {
int error;
error = mi_cpu_attach(ci);
if (error != 0) {
aprint_error_dev(self,
"mi_cpu_attach failed with %d\n", error);
return;
}
#ifdef __HAVE_PCPU_AREA
cpu_pcpuarea_init(ci);
#endif
cpu_init_tss(ci);
} else {
KASSERT(ci->ci_data.cpu_idlelwp != NULL);
#if NACPICA > 0
/* Parse out NUMA info for cpu_identify(). */
acpisrat_init();
#endif
}
#ifdef SVS
cpu_svs_init(ci);
#endif
pmap_reference(pmap_kernel());
ci->ci_pmap = pmap_kernel();
ci->ci_tlbstate = TLBSTATE_STALE;
/*
* Boot processor may not be attached first, but the below
* must be done to allow booting other processors.
*/
if (!again) {
/* Make sure DELAY() (likely i8254_delay()) is initialized. */
DELAY(1);
/*
* Basic init. Compute an approximate frequency for the TSC
* using the i8254. If there's a HPET we'll redo it later.
*/
atomic_or_32(&ci->ci_flags, CPUF_PRESENT | CPUF_PRIMARY);
cpu_intr_init(ci);
tsc_setfunc(ci);
cpu_get_tsc_freq(ci);
cpu_init(ci);
#ifdef i386
cpu_set_tss_gates(ci);
#endif
pmap_cpu_init_late(ci);
#if NLAPIC > 0
if (caa->cpu_role != CPU_ROLE_SP) {
/* Enable lapic. */
lapic_enable();
lapic_set_lvt();
if (!vm_guest_is_xenpvh_or_pvhvm())
lapic_calibrate_timer(false);
}
#endif
kcsan_cpu_init(ci);
again = true;
}
/* further PCB init done later. */
switch (caa->cpu_role) {
case CPU_ROLE_SP:
atomic_or_32(&ci->ci_flags, CPUF_SP);
cpu_identify(ci);
x86_errata();
x86_cpu_idle_init();
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
break;
case CPU_ROLE_BP:
atomic_or_32(&ci->ci_flags, CPUF_BSP);
cpu_identify(ci);
x86_errata();
x86_cpu_idle_init();
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
break;
#ifdef MULTIPROCESSOR
case CPU_ROLE_AP:
/*
* report on an AP
*/
cpu_intr_init(ci);
idt_vec_init_cpu_md(&ci->ci_idtvec, cpu_index(ci));
gdt_alloc_cpu(ci);
#ifdef i386
cpu_set_tss_gates(ci);
#endif
pmap_cpu_init_late(ci);
cpu_start_secondary(ci);
if (ci->ci_flags & CPUF_PRESENT) {
struct cpu_info *tmp;
cpu_identify(ci);
tmp = cpu_info_list;
while (tmp->ci_next)
tmp = tmp->ci_next;
tmp->ci_next = ci;
}
break;
#endif
default:
panic("unknown processor type??\n");
}
pat_init(ci);
if (!pmf_device_register1(self, cpu_suspend, cpu_resume, cpu_shutdown))
aprint_error_dev(self, "couldn't establish power handler\n");
#ifdef MULTIPROCESSOR
if (mp_verbose) {
struct lwp *l = ci->ci_data.cpu_idlelwp;
struct pcb *pcb = lwp_getpcb(l);
aprint_verbose_dev(self,
"idle lwp at %p, idle sp at %p\n",
l,
#ifdef i386
(void *)pcb->pcb_esp
#else
(void *)pcb->pcb_rsp
#endif
);
}
#endif
/*
* Postpone the "cpufeaturebus" scan.
* It is safe to scan the pseudo-bus
* only after all CPUs have attached.
*/
(void)config_defer(self, cpu_defer);
}
static void
cpu_defer(device_t self)
{
cpu_rescan(self, NULL, NULL);
}
static int
cpu_rescan(device_t self, const char *ifattr, const int *locators)
{
struct cpu_softc *sc = device_private(self);
struct cpufeature_attach_args cfaa;
struct cpu_info *ci = sc->sc_info;
/*
* If we booted with RB_MD1 to disable multiprocessor, the
* auto-configuration data still contains the additional
* CPUs. But their initialization was mostly bypassed
* during attach, so we have to make sure we don't look at
* their featurebus info, since it wasn't retrieved.
*/
if (ci == NULL)
return 0;
memset(&cfaa, 0, sizeof(cfaa));
cfaa.ci = ci;
if (ifattr_match(ifattr, "cpufeaturebus")) {
if (ci->ci_frequency == NULL) {
cfaa.name = "frequency";
ci->ci_frequency =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_padlock == NULL) {
cfaa.name = "padlock";
ci->ci_padlock =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_temperature == NULL) {
cfaa.name = "temperature";
ci->ci_temperature =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_vm == NULL) {
cfaa.name = "vm";
ci->ci_vm =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
}
return 0;
}
static void
cpu_childdetached(device_t self, device_t child)
{
struct cpu_softc *sc = device_private(self);
struct cpu_info *ci = sc->sc_info;
if (ci->ci_frequency == child)
ci->ci_frequency = NULL;
if (ci->ci_padlock == child)
ci->ci_padlock = NULL;
if (ci->ci_temperature == child)
ci->ci_temperature = NULL;
if (ci->ci_vm == child)
ci->ci_vm = NULL;
}
/*
* Initialize the processor appropriately.
*/
void
cpu_init(struct cpu_info *ci)
{
extern int x86_fpu_save;
uint32_t cr4 = 0;
lcr0(rcr0() | CR0_WP);
/* If global TLB caching is supported, enable it */
if (cpu_feature[0] & CPUID_PGE)
cr4 |= CR4_PGE;
/*
* If we have FXSAVE/FXRESTOR, use them.
*/
if (cpu_feature[0] & CPUID_FXSR) {
cr4 |= CR4_OSFXSR;
/*
* If we have SSE/SSE2, enable XMM exceptions.
*/
if (cpu_feature[0] & (CPUID_SSE|CPUID_SSE2))
cr4 |= CR4_OSXMMEXCPT;
}
/* If xsave is supported, enable it */
if (cpu_feature[1] & CPUID2_XSAVE)
cr4 |= CR4_OSXSAVE;
/* If SMEP is supported, enable it */
if (cpu_feature[5] & CPUID_SEF_SMEP)
cr4 |= CR4_SMEP;
/* If SMAP is supported, enable it */
if (cpu_feature[5] & CPUID_SEF_SMAP)
cr4 |= CR4_SMAP;
#ifdef SVS
/* If PCID is supported, enable it */
if (svs_pcid)
cr4 |= CR4_PCIDE;
#endif
if (cr4) {
cr4 |= rcr4();
lcr4(cr4);
}
/*
* Changing CR4 register may change cpuid values. For example, setting
* CR4_OSXSAVE sets CPUID2_OSXSAVE. The CPUID2_OSXSAVE is in
* ci_feat_val[1], so update it.
* XXX Other than ci_feat_val[1] might be changed.
*/
if (cpuid_level >= 1) {
u_int descs[4];
x86_cpuid(1, descs);
ci->ci_feat_val[1] = descs[2];
}
if (CPU_IS_PRIMARY(ci) &&
x86_fpu_save >= FPU_SAVE_FXSAVE) {
fpuinit_mxcsr_mask();
}
/* If xsave is enabled, enable all fpu features */
if (cr4 & CR4_OSXSAVE)
wrxcr(0, x86_xsave_features & XCR0_FPU);
#ifdef MTRR
/*
* On a P6 or above, initialize MTRR's if the hardware supports them.
*/
if (cpu_feature[0] & CPUID_MTRR) {
if ((ci->ci_flags & CPUF_AP) == 0)
i686_mtrr_init_first();
mtrr_init_cpu(ci);
}
#ifdef i386
if (strcmp((char *)(ci->ci_vendor), "AuthenticAMD") == 0) {
/*
* Must be a K6-2 Step >= 7 or a K6-III.
*/
if (CPUID_TO_FAMILY(ci->ci_signature) == 5) {
if (CPUID_TO_MODEL(ci->ci_signature) > 8 ||
(CPUID_TO_MODEL(ci->ci_signature) == 8 &&
CPUID_TO_STEPPING(ci->ci_signature) >= 7)) {
mtrr_funcs = &k6_mtrr_funcs;
k6_mtrr_init_first();
mtrr_init_cpu(ci);
}
}
}
#endif /* i386 */
#endif /* MTRR */
if (ci != &cpu_info_primary) {
/* Synchronize TSC */
atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
tsc_sync_ap(ci);
} else {
atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
}
}
#ifdef MULTIPROCESSOR
void
cpu_boot_secondary_processors(void)
{
struct cpu_info *ci;
kcpuset_t *cpus;
u_long i;
/* Now that we know the number of CPUs, patch the text segment. */
x86_patch(false);
#if NACPICA > 0
/* Finished with NUMA info for now. */
acpisrat_exit();
#endif
kcpuset_create(&cpus, true);
kcpuset_set(cpus, cpu_index(curcpu()));
for (i = 0; i < maxcpus; i++) {
ci = cpu_lookup(i);
if (ci == NULL)
continue;
if (ci->ci_data.cpu_idlelwp == NULL)
continue;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
continue;
if (ci->ci_flags & (CPUF_BSP|CPUF_SP|CPUF_PRIMARY))
continue;
cpu_boot_secondary(ci);
kcpuset_set(cpus, cpu_index(ci));
}
while (!kcpuset_match(cpus, kcpuset_running))
;
kcpuset_destroy(cpus);
x86_mp_online = true;
/* Now that we know about the TSC, attach the timecounter. */
tsc_tc_init();
}
#endif
static void
cpu_init_idle_lwp(struct cpu_info *ci)
{
struct lwp *l = ci->ci_data.cpu_idlelwp;
struct pcb *pcb = lwp_getpcb(l);
pcb->pcb_cr0 = rcr0();
}
void
cpu_init_idle_lwps(void)
{
struct cpu_info *ci;
u_long i;
for (i = 0; i < maxcpus; i++) {
ci = cpu_lookup(i);
if (ci == NULL)
continue;
if (ci->ci_data.cpu_idlelwp == NULL)
continue;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
continue;
cpu_init_idle_lwp(ci);
}
}
#ifdef MULTIPROCESSOR
void
cpu_start_secondary(struct cpu_info *ci)
{
u_long psl;
int i;
#if NLAPIC > 0
paddr_t mp_pdirpa;
mp_pdirpa = pmap_init_tmp_pgtbl(mp_trampoline_paddr);
cpu_copy_trampoline(mp_pdirpa);
#endif
atomic_or_32(&ci->ci_flags, CPUF_AP);
ci->ci_curlwp = ci->ci_data.cpu_idlelwp;
if (CPU_STARTUP(ci, mp_trampoline_paddr) != 0) {
return;
}
/*
* Wait for it to become ready. Setting cpu_starting opens the
* initial gate and allows the AP to start soft initialization.
*/
KASSERT(cpu_starting == NULL);
cpu_starting = ci;
for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i > 0; i--) {
delay_func(10);
}
if ((ci->ci_flags & CPUF_PRESENT) == 0) {
aprint_error_dev(ci->ci_dev, "failed to become ready\n");
#if defined(MPDEBUG) && defined(DDB)
printf("dropping into debugger; continue from here to resume boot\n");
Debugger();
#endif
} else {
/*
* Synchronize time stamp counters. Invalidate cache and do
* it twice (in tsc_sync_bp) to minimize possible cache effects.
* Disable interrupts to try and rule out any external
* interference.
*/
psl = x86_read_psl();
x86_disable_intr();
tsc_sync_bp(ci);
x86_write_psl(psl);
}
CPU_START_CLEANUP(ci);
cpu_starting = NULL;
}
void
cpu_boot_secondary(struct cpu_info *ci)
{
int64_t drift;
u_long psl;
int i;
atomic_or_32(&ci->ci_flags, CPUF_GO);
for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i > 0; i--) {
delay_func(10);
}
if ((ci->ci_flags & CPUF_RUNNING) == 0) {
aprint_error_dev(ci->ci_dev, "failed to start\n");
#if defined(MPDEBUG) && defined(DDB)
printf("dropping into debugger; continue from here to resume boot\n");
Debugger();
#endif
} else {
/* Synchronize TSC again, check for drift. */
drift = ci->ci_data.cpu_cc_skew;
psl = x86_read_psl();
x86_disable_intr();
tsc_sync_bp(ci);
x86_write_psl(psl);
drift -= ci->ci_data.cpu_cc_skew;
aprint_debug_dev(ci->ci_dev, "TSC skew=%lld drift=%lld\n",
(long long)ci->ci_data.cpu_cc_skew, (long long)drift);
tsc_sync_drift(drift);
}
}
/*
* The CPU ends up here when it's ready to run.
* This is called from code in mptramp.s; at this point, we are running
* in the idle pcb/idle stack of the new CPU. When this function returns,
* this processor will enter the idle loop and start looking for work.
*/
void
cpu_hatch(void *v)
{
struct cpu_info *ci = (struct cpu_info *)v;
struct pcb *pcb;
int s, i;
/* ------------------------------------------------------------- */
/*
* This section of code must be compiled with SSP disabled, to
* prevent a race against cpu0. See sys/conf/ssp.mk.
*/
/*
* Initialize MSRs on this CPU:
*
* - On amd64: Enables SYSCALL/SYSRET.
*
* - On amd64: Sets up %fs and %gs so that %gs points to the
* current struct cpu_info as needed for CPUVAR(...),
* curcpu(), and curlwp.
*
* (On i386, CPUVAR(...), curcpu(), and curlwp are made to
* work first by the configuration of segment descriptors in
* the Global Descriptor Table (GDT) in initgdt.)
*
* - Enables the no-execute bit if supported.
*
* Thus, after this point, CPUVAR(...), curcpu(), and curlwp
* will work on this CPU.
*
* Note: The call to cpu_init_msrs for cpu0 happens in
* init386/init_x86_64.
*/
cpu_init_msrs(ci, true);
cpu_probe(ci);
cpu_speculation_init(ci);
#if NHYPERV > 0
hyperv_init_cpu(ci);
#endif
ci->ci_data.cpu_cc_freq = cpu_info_primary.ci_data.cpu_cc_freq;
/* cpu_get_tsc_freq(ci); */
KDASSERT((ci->ci_flags & CPUF_PRESENT) == 0);
/*
* Synchronize the TSC for the first time. Note that interrupts are
* off at this point.
*/
atomic_or_32(&ci->ci_flags, CPUF_PRESENT);
tsc_sync_ap(ci);
/* ------------------------------------------------------------- */
/*
* Wait to be brought online.
*
* Use MONITOR/MWAIT if available. These instructions put the CPU in
* a low consumption mode (C-state), and if the TSC is not invariant,
* this causes the TSC to drift. We want this to happen, so that we
* can later detect (in tsc_tc_init) any abnormal drift with invariant
* TSCs. That's just for safety; by definition such drifts should
* never occur with invariant TSCs.
*
* If not available, try PAUSE. We'd like to use HLT, but we have
* interrupts off.
*/
while ((ci->ci_flags & CPUF_GO) == 0) {
if ((cpu_feature[1] & CPUID2_MONITOR) != 0) {
x86_monitor(&ci->ci_flags, 0, 0);
if ((ci->ci_flags & CPUF_GO) != 0) {
continue;
}
x86_mwait(0, 0);
} else {
/*
* XXX The loop repetition count could be a lot higher, but
* XXX currently the qemu emulator takes a _very_long_time_ to
* XXX execute the pause instruction. So for now, use a low
* XXX value to allow the cpu to hatch before timing out.
*/
for (i = 50; i != 0; i--) {
x86_pause();
}
}
}
/* Because the text may have been patched in x86_patch(). */
wbinvd();
x86_flush();
tlbflushg();
KASSERT((ci->ci_flags & CPUF_RUNNING) == 0);
#ifdef PAE
pd_entry_t * l3_pd = ci->ci_pae_l3_pdir;
for (i = 0 ; i < PDP_SIZE; i++) {
l3_pd[i] = pmap_kernel()->pm_pdirpa[i] | PTE_P;
}
lcr3(ci->ci_pae_l3_pdirpa);
#else
lcr3(pmap_pdirpa(pmap_kernel(), 0));
#endif
pcb = lwp_getpcb(curlwp);
pcb->pcb_cr3 = rcr3();
pcb = lwp_getpcb(ci->ci_data.cpu_idlelwp);
lcr0(pcb->pcb_cr0);
cpu_init_idt(ci);
gdt_init_cpu(ci);
#if NLAPIC > 0
lapic_enable();
lapic_set_lvt();
#endif
fpuinit(ci);
lldt(GSYSSEL(GLDT_SEL, SEL_KPL));
ltr(ci->ci_tss_sel);
/*
* cpu_init will re-synchronize the TSC, and will detect any abnormal
* drift that would have been caused by the use of MONITOR/MWAIT
* above.
*/
cpu_init(ci);
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
(*x86_initclock_func)();
cpu_get_tsc_freq(ci);
s = splhigh();
#if NLAPIC > 0
lapic_write_tpri(0);
#endif
x86_enable_intr();
splx(s);
x86_errata();
aprint_debug_dev(ci->ci_dev, "running\n");
kcsan_cpu_init(ci);
idle_loop(NULL);
KASSERT(false);
}
#endif
#if defined(DDB)
#include <ddb/db_output.h>
#include <machine/db_machdep.h>
/*
* Dump CPU information from ddb.
*/
void
cpu_debug_dump(void)
{
struct cpu_info *ci;
CPU_INFO_ITERATOR cii;
const char sixtyfour64space[] =
#ifdef _LP64
" "
#endif
"";
db_printf("addr %sdev id flags ipis spl curlwp "
"\n", sixtyfour64space);
for (CPU_INFO_FOREACH(cii, ci)) {
db_printf("%p %s %ld %x %x %d %10p\n",
ci,
ci->ci_dev == NULL ? "BOOT" : device_xname(ci->ci_dev),
(long)ci->ci_cpuid,
ci->ci_flags, ci->ci_ipis, ci->ci_ilevel,
ci->ci_curlwp);
}
}
#endif
#ifdef MULTIPROCESSOR
#if NLAPIC > 0
static void
cpu_copy_trampoline(paddr_t pdir_pa)
{
extern uint32_t nox_flag;
extern u_char cpu_spinup_trampoline[];
extern u_char cpu_spinup_trampoline_end[];
vaddr_t mp_trampoline_vaddr;
struct {
uint32_t large;
uint32_t nox;
uint32_t pdir;
} smp_data;
CTASSERT(sizeof(smp_data) == 3 * 4);
smp_data.large = (pmap_largepages != 0);
smp_data.nox = nox_flag;
smp_data.pdir = (uint32_t)(pdir_pa & 0xFFFFFFFF);
/* Enter the physical address */
mp_trampoline_vaddr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
UVM_KMF_VAONLY);
pmap_kenter_pa(mp_trampoline_vaddr, mp_trampoline_paddr,
VM_PROT_READ | VM_PROT_WRITE, 0);
pmap_update(pmap_kernel());
/* Copy boot code */
memcpy((void *)mp_trampoline_vaddr,
cpu_spinup_trampoline,
cpu_spinup_trampoline_end - cpu_spinup_trampoline);
/* Copy smp_data at the end */
memcpy((void *)(mp_trampoline_vaddr + PAGE_SIZE - sizeof(smp_data)),
&smp_data, sizeof(smp_data));
pmap_kremove(mp_trampoline_vaddr, PAGE_SIZE);
pmap_update(pmap_kernel());
uvm_km_free(kernel_map, mp_trampoline_vaddr, PAGE_SIZE, UVM_KMF_VAONLY);
}
#endif
int
mp_cpu_start(struct cpu_info *ci, paddr_t target)
{
int error;
/*
* Bootstrap code must be addressable in real mode
* and it must be page aligned.
*/
KASSERT(target < 0x10000 && target % PAGE_SIZE == 0);
/*
* "The BSP must initialize CMOS shutdown code to 0Ah ..."
*/
outb(IO_RTC, NVRAM_RESET);
outb(IO_RTC+1, NVRAM_RESET_JUMP);
#if NLAPIC > 0
/*
* "and the warm reset vector (DWORD based at 40:67) to point
* to the AP startup code ..."
*/
unsigned short dwordptr[2];
dwordptr[0] = 0;
dwordptr[1] = target >> 4;
memcpy((uint8_t *)cmos_data_mapping + 0x467, dwordptr, 4);
#endif
if ((cpu_feature[0] & CPUID_APIC) == 0) {
aprint_error("mp_cpu_start: CPU does not have APIC\n");
return ENODEV;
}
/*
* ... prior to executing the following sequence:". We'll also add in
* a local cache flush, in case the BIOS has left the AP with its cache
* disabled. It may not be able to cope with MP coherency.
*/
wbinvd();
if (ci->ci_flags & CPUF_AP) {
error = x86_ipi_init(ci->ci_cpuid);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (1)\n",
__func__);
return error;
}
delay_func(10000);
error = x86_ipi_startup(ci->ci_cpuid, target / PAGE_SIZE);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (2)\n",
__func__);
return error;
}
delay_func(200);
error = x86_ipi_startup(ci->ci_cpuid, target / PAGE_SIZE);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (3)\n",
__func__);
return error;
}
delay_func(200);
}
return 0;
}
void
mp_cpu_start_cleanup(struct cpu_info *ci)
{
/*
* Ensure the NVRAM reset byte contains something vaguely sane.
*/
outb(IO_RTC, NVRAM_RESET);
outb(IO_RTC+1, NVRAM_RESET_RST);
}
#endif
#ifdef __x86_64__
typedef void (vector)(void);
extern vector Xsyscall, Xsyscall32, Xsyscall_svs;
#endif
/*
* cpu_init_msrs(ci, full)
*
* Initialize some Model-Specific Registers (MSRs) on the current
* CPU, whose struct cpu_info pointer is ci, for:
*
* - SYSCALL/SYSRET.
* - %fs/%gs on amd64 if `full' is true; needed to make
* CPUVAR(...), curcpu(), and curlwp work. (We do this at boot,
* but skip it on ACPI wakeup.)
* - No-execute bit, if supported.
*
* References:
*
* - Intel 64 and IA-32 Architectures Software Developer's Manual,
* Volume 3: System Programming Guide, Order Number 325384,
* April 2022, Sec. 5.8.8 `Fast System Calls in 64-Bit Mode',
* pp. 5-22 through 5-23.
*
* - Intel 64 and IA-32 Architectures Software Developer's Manual,
* Volume 4: Model-Specific Registers, Order Number 335592,
* April 2022, Sec. 2.1 `Architectural MSRs', Table 2-2,
* pp. 2-60 through 2-61.
*/
void
cpu_init_msrs(struct cpu_info *ci, bool full)
{
#ifdef __x86_64__
/*
* On amd64, set up the syscall target address registers
* for SYSCALL/SYSRET:
*
* - IA32_STAR, c000_0081h (MSR_STAR): System Call Target
* Address. Code and stack segment selectors for SYSRET
* (bits 48:63) and SYSCALL (bits 32:47).
*
* - IA32_LSTAR, c000_0082h (MSR_LSTAR): IA-32e Mode System
* Call Target Address. Target rip for SYSCALL when executed
* in 64-bit mode.
*
* - IA32_CSTAR, c000_0083h (MSR_CSTAR): IA-32e Mode System
* Call Target Address. Target rip for SYSCALL when executed
* in compatibility mode. (XXX Manual says this is `[n]ot
* used, as the SYSCALL instruction is not recognized in
* compatibility mode', so why do we set it?)
*
* - IA32_FMASK, c000_0084h (MSR_SFMASK): System Call Flag
* Mask. Mask for the RFLAGS register on SYSCALL.
*/
wrmsr(MSR_STAR,
((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((uint64_t)LSEL(LSYSRETBASE_SEL, SEL_UPL) << 48));
wrmsr(MSR_LSTAR, (uint64_t)Xsyscall);
wrmsr(MSR_CSTAR, (uint64_t)Xsyscall32);
wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_AC);
#ifdef SVS
if (svs_enabled)
wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs);
#endif
/*
* On amd64 if `full' is true -- used at boot, but not on ACPI
* wakeup -- then additionally set up %fs and %gs:
*
* - IA32_FS_BASE, c000_0100h (MSR_FSBASE): Base address of
* %fs. Not used in NetBSD kernel, so zero it.
*
* - IA32_GS_BASE, c000_0101h (MSR_GSBASE): Base address of
* %gs. Used in NetBSD kernel by CPUVAR(...), curcpu(), and
* curlwp for access to the CPU-local area, so set it to ci.
*
* - IA32_KERNEL_GS_BASE, c000_0102h (MSR_KERNELGSBASE): Base
* address of what swapgs will leave in %gs when switching to
* userland. Zero for now; will be set to pcb->pcb_gs in
* cpu_switchto for user threads.
*/
if (full) {
wrmsr(MSR_FSBASE, 0);
wrmsr(MSR_GSBASE, (uint64_t)ci);
wrmsr(MSR_KERNELGSBASE, 0);
}
#endif /* __x86_64__ */
/*
* If the no-execute bit is supported, enable it in:
*
* - IA32_EFER, c000_0080h (MSR_EFER): Extended Feature
* Enables.
*/
if (cpu_feature[2] & CPUID_NOX)
wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);
}
void
cpu_offline_md(void)
{
return;
}
/* XXX joerg restructure and restart CPUs individually */
static bool
cpu_stop(device_t dv)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
int err;
KASSERT((ci->ci_flags & CPUF_PRESENT) != 0);
if (CPU_IS_PRIMARY(ci))
return true;
if (ci->ci_data.cpu_idlelwp == NULL)
return true;
sc->sc_wasonline = !(ci->ci_schedstate.spc_flags & SPCF_OFFLINE);
if (sc->sc_wasonline) {
mutex_enter(&cpu_lock);
err = cpu_setstate(ci, false);
mutex_exit(&cpu_lock);
if (err != 0)
return false;
}
return true;
}
static bool
cpu_suspend(device_t dv, const pmf_qual_t *qual)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
else {
cpufreq_suspend(ci);
}
return cpu_stop(dv);
}
static bool
cpu_resume(device_t dv, const pmf_qual_t *qual)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
int err = 0;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
if (CPU_IS_PRIMARY(ci))
goto out;
if (ci->ci_data.cpu_idlelwp == NULL)
goto out;
if (sc->sc_wasonline) {
mutex_enter(&cpu_lock);
err = cpu_setstate(ci, true);
mutex_exit(&cpu_lock);
}
out:
if (err != 0)
return false;
cpufreq_resume(ci);
return true;
}
static bool
cpu_shutdown(device_t dv, int how)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
if ((ci->ci_flags & CPUF_BSP) != 0)
return false;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
return cpu_stop(dv);
}
/* Get the TSC frequency and set it to ci->ci_data.cpu_cc_freq. */
void
cpu_get_tsc_freq(struct cpu_info *ci)
{
uint64_t freq = 0, freq_from_cpuid, t0, t1;
int64_t overhead;
if (CPU_IS_PRIMARY(ci) && cpu_hascounter()) {
/*
* If it's the first call of this function, try to get TSC
* freq from CPUID by calling cpu_tsc_freq_cpuid().
* The function also sets the lapic_per_second variable if it's
* known. This is required for Intel's Comet Lake and newer
* processors to set LAPIC timer correctly.
*/
if (ci->ci_data.cpu_cc_freq == 0)
freq = freq_from_cpuid = cpu_tsc_freq_cpuid(ci);
if (freq != 0)
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from CPUID %" PRIu64 " Hz\n", freq);
#if NHPET > 0
if (freq == 0) {
freq = hpet_tsc_freq();
if (freq != 0)
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from HPET %" PRIu64 " Hz\n", freq);
}
#endif
if (freq == 0) {
/*
* Work out the approximate overhead involved below.
* Discard the result of the first go around the
* loop.
*/
overhead = 0;
for (int i = 0; i <= 8; i++) {
const int s = splhigh();
t0 = cpu_counter();
delay_func(0);
t1 = cpu_counter();
splx(s);
if (i > 0) {
overhead += (t1 - t0);
}
}
overhead >>= 3;
/*
* Now do the calibration.
*/
freq = 0;
for (int i = 0; i < 1000; i++) {
const int s = splhigh();
t0 = cpu_counter();
delay_func(100);
t1 = cpu_counter();
splx(s);
freq += t1 - t0 - overhead;
}
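/*
* Editorial note: 1000 iterations of delay_func(100) accumulate
* cycle counts over roughly 0.1 seconds, so multiplying the sum
* by 10 converts it to cycles per second (Hz).
*/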
freq = freq * 10;
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from delay %" PRIu64 " Hz\n", freq);
}
if (ci->ci_data.cpu_cc_freq != 0) {
freq_from_cpuid = cpu_tsc_freq_cpuid(ci);
if ((freq_from_cpuid != 0)
&& (freq != freq_from_cpuid))
aprint_verbose_dev(ci->ci_dev, "TSC freq "
"calibrated %" PRIu64 " Hz\n", freq);
}
} else {
freq = cpu_info_primary.ci_data.cpu_cc_freq;
}
ci->ci_data.cpu_cc_freq = freq;
}
void
x86_cpu_idle_mwait(void)
{
struct cpu_info *ci = curcpu();
KASSERT(ci->ci_ilevel == IPL_NONE);
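/*
* Editorial note: MONITOR is armed on ci_want_resched before the flag
* is re-checked, so a wakeup store that lands between the check and
* MWAIT still causes MWAIT to return promptly (no lost wakeup).
*/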
x86_monitor(&ci->ci_want_resched, 0, 0);
if (__predict_false(ci->ci_want_resched)) {
return;
}
x86_mwait(0, 0);
}
void
x86_cpu_idle_halt(void)
{
struct cpu_info *ci = curcpu();
KASSERT(ci->ci_ilevel == IPL_NONE);
x86_disable_intr();
if (!__predict_false(ci->ci_want_resched)) {
x86_stihlt();
} else {
x86_enable_intr();
}
}
/*
* Loads pmap for the current CPU.
*/
void
cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap)
{
KASSERT(kpreempt_disabled());
#ifdef SVS
if (svs_enabled && pmap_is_user(pmap)) {
svs_pdir_switch(pmap);
}
#endif
#ifdef PAE
struct cpu_info *ci = curcpu();
bool interrupts_enabled;
pd_entry_t *l3_pd = ci->ci_pae_l3_pdir;
int i;
/*
* Disable interrupts to block TLB shootdowns, which can reload cr3.
* While this doesn't block NMIs, that's probably OK, as NMIs are
* unlikely to reload cr3.
*/
interrupts_enabled = (x86_read_flags() & PSL_I) != 0;
if (interrupts_enabled)
x86_disable_intr();
for (i = 0 ; i < PDP_SIZE; i++) {
l3_pd[i] = pmap->pm_pdirpa[i] | PTE_P;
}
if (interrupts_enabled)
x86_enable_intr();
tlbflush();
#else
lcr3(pmap_pdirpa(pmap, 0));
#endif
}
/*
* Notify all other cpus to halt.
*/
void
cpu_broadcast_halt(void)
{
x86_broadcast_ipi(X86_IPI_HALT);
}
/*
* Send a dummy ipi to a cpu to force it to run splraise()/spllower(),
* and trigger an AST on the running LWP.
*/
void
cpu_kick(struct cpu_info *ci)
{
x86_send_ipi(ci, X86_IPI_AST);
}
/* $NetBSD: tmpfs_mem.c,v 1.14 2023/04/29 06:29:55 riastradh Exp $ */
/*
* Copyright (c) 2010, 2011, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* tmpfs memory allocation routines.
* Implements memory usage accounting and limiting.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_mem.c,v 1.14 2023/04/29 06:29:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <fs/tmpfs/tmpfs.h>
extern struct pool tmpfs_dirent_pool;
extern struct pool tmpfs_node_pool;
void
tmpfs_mntmem_init(struct tmpfs_mount *mp, uint64_t memlimit)
{
mutex_init(&mp->tm_acc_lock, MUTEX_DEFAULT, IPL_NONE);
mp->tm_mem_limit = memlimit;
mp->tm_bytes_used = 0;
}
void
tmpfs_mntmem_destroy(struct tmpfs_mount *mp)
{
KASSERT(mp->tm_bytes_used == 0);
mutex_destroy(&mp->tm_acc_lock);
}
int
tmpfs_mntmem_set(struct tmpfs_mount *mp, uint64_t memlimit)
{
int error;
mutex_enter(&mp->tm_acc_lock);
if (round_page(mp->tm_bytes_used) >= memlimit)
error = EBUSY;
else {
error = 0;
mp->tm_mem_limit = memlimit;
}
mutex_exit(&mp->tm_acc_lock);
return error;
}
/*
* tmpfs_mem_info: return the number of available memory pages.
*
* => If 'total' is true, then return the _total_ amount of pages.
* => If false, then return the amount of _free_ memory pages.
*
* Remember to remove uvmexp.freetarg from the returned value to avoid
* excessive memory usage.
*/
size_t
tmpfs_mem_info(bool total)
{
size_t size = 0;
size += uvmexp.swpgavail;
if (!total) {
size -= uvmexp.swpgonly;
}
size += uvm_availmem(true);
size += uvmexp.filepages;
if (size > uvmexp.wired) {
size -= uvmexp.wired;
} else {
size = 0;
}
return size;
}
uint64_t
tmpfs_bytes_max(struct tmpfs_mount *mp)
{
psize_t freepages = tmpfs_mem_info(false);
int freetarg = uvmexp.freetarg; // XXX unlocked
uint64_t avail_mem;
if (freepages < freetarg) {
freepages = 0;
} else {
freepages -= freetarg;
}
avail_mem = round_page(mp->tm_bytes_used) + (freepages << PAGE_SHIFT);
return MIN(mp->tm_mem_limit, avail_mem);
}
size_t
tmpfs_pages_avail(struct tmpfs_mount *mp)
{
return (tmpfs_bytes_max(mp) - mp->tm_bytes_used) >> PAGE_SHIFT;
}
bool
tmpfs_mem_incr(struct tmpfs_mount *mp, size_t sz)
{
uint64_t lim;
mutex_enter(&mp->tm_acc_lock);
lim = tmpfs_bytes_max(mp);
if (mp->tm_bytes_used + sz >= lim) {
mutex_exit(&mp->tm_acc_lock);
return false;
}
mp->tm_bytes_used += sz;
mutex_exit(&mp->tm_acc_lock);
return true;
}
void
tmpfs_mem_decr(struct tmpfs_mount *mp, size_t sz)
{
mutex_enter(&mp->tm_acc_lock);
KASSERT(mp->tm_bytes_used >= sz);
mp->tm_bytes_used -= sz;
mutex_exit(&mp->tm_acc_lock);
}
struct tmpfs_dirent *
tmpfs_dirent_get(struct tmpfs_mount *mp)
{
if (!tmpfs_mem_incr(mp, sizeof(struct tmpfs_dirent))) {
return NULL;
}
return pool_get(&tmpfs_dirent_pool, PR_WAITOK);
}
void
tmpfs_dirent_put(struct tmpfs_mount *mp, struct tmpfs_dirent *de)
{
tmpfs_mem_decr(mp, sizeof(struct tmpfs_dirent));
pool_put(&tmpfs_dirent_pool, de);
}
struct tmpfs_node *
tmpfs_node_get(struct tmpfs_mount *mp)
{
if (atomic_inc_uint_nv(&mp->tm_nodes_cnt) >= mp->tm_nodes_max) {
atomic_dec_uint(&mp->tm_nodes_cnt);
return NULL;
}
if (!tmpfs_mem_incr(mp, sizeof(struct tmpfs_node))) {
atomic_dec_uint(&mp->tm_nodes_cnt);
return NULL;
}
return pool_get(&tmpfs_node_pool, PR_WAITOK);
}
void
tmpfs_node_put(struct tmpfs_mount *mp, struct tmpfs_node *tn)
{
atomic_dec_uint(&mp->tm_nodes_cnt);
tmpfs_mem_decr(mp, sizeof(struct tmpfs_node));
pool_put(&tmpfs_node_pool, tn);
}
/*
* Quantum size to round up the tmpfs names in order to reduce re-allocations.
*/
#define TMPFS_NAME_QUANTUM (32)
char *
tmpfs_strname_alloc(struct tmpfs_mount *mp, size_t len)
{
const size_t sz = roundup2(len, TMPFS_NAME_QUANTUM);
KASSERT(sz > 0 && sz <= 1024);
if (!tmpfs_mem_incr(mp, sz)) {
return NULL;
}
return kmem_alloc(sz, KM_SLEEP);
}
void
tmpfs_strname_free(struct tmpfs_mount *mp, char *str, size_t len)
{
const size_t sz = roundup2(len, TMPFS_NAME_QUANTUM);
KASSERT(sz > 0 && sz <= 1024);
tmpfs_mem_decr(mp, sz);
kmem_free(str, sz);
}
bool
tmpfs_strname_neqlen(struct componentname *fcnp, struct componentname *tcnp)
{
const size_t fln = fcnp->cn_namelen;
const size_t tln = tcnp->cn_namelen;
return (fln != tln) || memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fln);
}
/* $NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $ */
/*-
* Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran,
* by Marshall Kirk McKusick and Greg Ganger at the University of Michigan.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_43.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm_ddb.h>
SDT_PROBE_DEFINE3(vfs, syncer, worklist, vnode__add,
"struct vnode *"/*vp*/,
"int"/*delayx*/,
"int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, vnode__update,
"struct vnode *"/*vp*/,
"int"/*delayx*/,
"int"/*oslot*/,
"int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, vnode__remove,
"struct vnode *"/*vp*/);
SDT_PROBE_DEFINE3(vfs, syncer, worklist, mount__add,
"struct mount *"/*mp*/,
"int"/*vdelay*/,
"int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, mount__update,
"struct mount *"/*vp*/,
"int"/*vdelay*/,
"int"/*oslot*/,
"int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, mount__remove,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, start,
"int"/*starttime*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__start,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, mount__done,
"struct mount *"/*mp*/,
"int"/*error*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__skip,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, vnode__start,
"struct vnode *"/*vp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__done,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__lock,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__vget,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, done,
"int"/*starttime*/,
"int"/*endtime*/);
const enum vtype iftovt_tab[16] = {
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
S_IFSOCK, S_IFIFO, S_IFMT,
};
/*
* Insq/Remq for the vnode usage lists.
*/
#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp) { \
LIST_REMOVE(bp, b_vnbufs); \
(bp)->b_vnbufs.le_next = NOLIST; \
}
int doforce = 1; /* 1 => permit forcible unmounting */
/*
* Local declarations.
*/
static void vn_initialize_syncerd(void);
/*
* Initialize the vnode management data structures.
*/
void
vntblinit(void)
{
vn_initialize_syncerd();
vfs_mount_sysinit();
vfs_vnode_sysinit();
}
/*
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
bool catch_p, int slptimeo)
{
struct buf *bp, *nbp;
int error;
int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
(flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
/* XXXUBC this doesn't look at flags or slp* */
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, 0, 0, flushflags);
if (error) {
return error;
}
if (flags & V_SAVE) {
error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
if (error)
return (error);
KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
}
mutex_enter(&bufcache_lock);
restart:
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
/*
* XXX Since there are no node locks for NFS, I believe
* there is a slight chance that a delayed write will
* occur while sleeping just above, so check for it.
*/
if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
printf("buffer still DELWRI\n");
#endif
bp->b_cflags |= BC_BUSY | BC_VFLUSH;
mutex_exit(&bufcache_lock);
VOP_BWRITE(bp->b_vp, bp);
mutex_enter(&bufcache_lock);
goto restart;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
#ifdef DIAGNOSTIC
if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
panic("vinvalbuf: flush failed, vp %p", vp);
#endif
mutex_exit(&bufcache_lock);
return (0);
}
/*
* Destroy any in core blocks past the truncation length.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo)
{
struct buf *bp, *nbp;
int error;
voff_t off;
off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
if (error) {
return error;
}
mutex_enter(&bufcache_lock);
restart:
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
mutex_exit(&bufcache_lock);
return (0);
}
/*
* Flush all dirty buffers from a vnode.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vflushbuf(struct vnode *vp, int flags)
{
struct buf *bp, *nbp;
int error, pflags;
bool dirty, sync;
sync = (flags & FSYNC_WAIT) != 0;
pflags = PGO_CLEANIT | PGO_ALLPAGES |
(sync ? PGO_SYNCIO : 0) |
((flags & FSYNC_LAZY) ? PGO_LAZY : 0);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
(void) VOP_PUTPAGES(vp, 0, 0, pflags);
loop:
mutex_enter(&bufcache_lock);
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if ((bp->b_cflags & BC_BUSY))
continue;
if ((bp->b_oflags & BO_DELWRI) == 0)
panic("vflushbuf: not dirty, bp %p", bp);
bp->b_cflags |= BC_BUSY | BC_VFLUSH;
mutex_exit(&bufcache_lock);
/*
* Wait for I/O associated with indirect blocks to complete,
* since there is no way to quickly wait for them below.
*/
if (bp->b_vp == vp || !sync)
(void) bawrite(bp);
else {
error = bwrite(bp);
if (error)
return error;
}
goto loop;
}
mutex_exit(&bufcache_lock);
if (!sync)
return 0;
mutex_enter(vp->v_interlock);
while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, vp->v_interlock);
dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
mutex_exit(vp->v_interlock);
if (dirty) {
vprint("vflushbuf: dirty", vp);
goto loop;
}
return 0;
}
/*
* Create a vnode for a block device.
* Used for root filesystem and swap areas.
* Also used for memory file system special devices.
*/
int
bdevvp(dev_t dev, vnode_t **vpp)
{
struct vattr va;
vattr_null(&va);
va.va_type = VBLK;
va.va_rdev = dev;
return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}
/*
* Create a vnode for a character device.
* Used for kernfs and some console handling.
*/
int
cdevvp(dev_t dev, vnode_t **vpp)
{
struct vattr va;
vattr_null(&va);
va.va_type = VCHR;
va.va_rdev = dev;
return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}
/*
* Associate a buffer with a vnode. There must already be a hold on
* the vnode.
*/
void
bgetvp(struct vnode *vp, struct buf *bp)
{
KASSERT(bp->b_vp == NULL);
KASSERT(bp->b_objlock == &buffer_lock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(mutex_owned(&bufcache_lock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
KASSERT(!cv_has_waiters(&bp->b_done));
vholdl(vp);
bp->b_vp = vp;
if (vp->v_type == VBLK || vp->v_type == VCHR)
bp->b_dev = vp->v_rdev;
else
bp->b_dev = NODEV;
/*
* Insert onto list for new vnode.
*/
bufinsvn(bp, &vp->v_cleanblkhd);
bp->b_objlock = vp->v_interlock;
}
/*
* Disassociate a buffer from a vnode.
*/
void
brelvp(struct buf *bp)
{
struct vnode *vp = bp->b_vp;
KASSERT(vp != NULL);
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(mutex_owned(&bufcache_lock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
KASSERT(!cv_has_waiters(&bp->b_done));
/*
* Delete from old vnode list, if on one.
*/
if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
bufremvn(bp);
if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST &&
LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
vn_syncer_remove_from_worklist(vp);
bp->b_objlock = &buffer_lock;
bp->b_vp = NULL;
holdrelel(vp);
}
/*
* Reassign a buffer from one vnode list to another.
* The list reassignment must be within the same vnode.
* Used to assign file specific control information
* (indirect blocks) to the list to which they belong.
*/
void
reassignbuf(struct buf *bp, struct vnode *vp)
{
struct buflists *listheadp;
int delayx;
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
/*
* Delete from old vnode list, if on one.
*/
if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
bufremvn(bp);
/*
* If dirty, put on list of dirty buffers;
* otherwise insert onto list of clean buffers.
*/
if ((bp->b_oflags & BO_DELWRI) == 0) {
listheadp = &vp->v_cleanblkhd;
if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST &&
LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
vn_syncer_remove_from_worklist(vp);
} else {
listheadp = &vp->v_dirtyblkhd;
if ((vp->v_iflag & VI_ONWORKLST) == 0) {
switch (vp->v_type) {
case VDIR:
delayx = dirdelay;
break;
case VBLK:
if (spec_node_getmountedfs(vp) != NULL) {
delayx = metadelay;
break;
}
/* fall through */
default:
delayx = filedelay;
break;
}
if (!vp->v_mount ||
(vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
vn_syncer_add_to_worklist(vp, delayx);
}
}
bufinsvn(bp, listheadp);
}
/*
* Lookup a vnode by device number and return it referenced.
*/
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{
return (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, vpp) == 0);
}
/*
* Revoke all the vnodes corresponding to the specified minor number
* range (endpoints inclusive) of the specified major.
*/
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
vnode_t *vp;
dev_t dev;
int mn;
for (mn = minl; mn <= minh; mn++) {
dev = makedev(maj, mn);
/*
* Notify anyone trying to get at this device that it
* has been detached, and then revoke it.
*/
switch (type) {
case VBLK:
bdev_detached(dev);
break;
case VCHR:
cdev_detached(dev);
break;
default:
panic("invalid specnode type: %d", type);
}
/*
* Passing 0 as flags, instead of VDEAD_NOWAIT, means
* spec_node_lookup_by_dev will wait for vnodes it
* finds concurrently being revoked before returning.
*/
while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) {
VOP_REVOKE(vp, REVOKEALL);
vrele(vp);
}
}
}
/*
* The filesystem synchronizer mechanism - syncer.
*
* It is useful to delay writes of file data and filesystem metadata for
* a certain amount of time so that quickly created and deleted files need
* not waste disk bandwidth being created and removed. To implement this,
* vnodes are appended to a "workitem" queue.
*
* Most pending metadata should not wait for more than ten seconds. Thus,
* metadata for filesystems mounted on block devices is delayed only about
* a third of the time that file data is delayed (see metadelay below).
* Similarly, directory updates are more critical, so they are delayed only
* about half the time that file data is delayed (see dirdelay below).
*
* There are SYNCER_MAXDELAY queues that are processed in a round-robin
* manner at a rate of one each second (driven off the filesystem syncer
* thread). The syncer_delayno variable indicates the next queue that is
* to be processed. Items that need to be processed soon are placed in
* this queue:
*
* syncer_workitem_pending[syncer_delayno]
*
* A delay of e.g. fifteen seconds is done by placing the request fifteen
* entries later in the queue:
*
* syncer_workitem_pending[(syncer_delayno + 15) % syncer_last]
*
* The flag VI_ONWORKLST indicates that the vnode is currently on one of
* these queues.
*/
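/*
 * Illustrative restatement (added for clarity, not original code): with the
 * variables defined below, queueing a vnode "delayx" seconds into the
 * future amounts to
 *
 *	slot = (syncer_delayno + delayx) % syncer_last;
 *	TAILQ_INSERT_TAIL(&syncer_workitem_pending[slot], vip, vi_synclist);
 *
 * which is what sync_delay_slot() and vn_syncer_add1() implement; the
 * syncer thread in sched_sync() then drains one slot per second.
 */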
#define SYNCER_MAXDELAY 32
typedef TAILQ_HEAD(synclist, vnode_impl) synclist_t;
static void vn_syncer_add1(struct vnode *, int);
static void sysctl_vfs_syncfs_setup(struct sysctllog **);
/*
* Defines and variables for the syncer process.
*/
int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
time_t syncdelay = 30; /* max time to delay syncing data */
time_t filedelay = 30; /* time to delay syncing files */
time_t dirdelay = 15; /* time to delay syncing directories */
time_t metadelay = 10; /* time to delay syncing metadata */
time_t lockdelay = 1; /* time to delay if locking fails */
static kmutex_t syncer_data_lock; /* short term lock on data structs */
static int syncer_delayno = 0;
static long syncer_last;
static synclist_t * syncer_workitem_pending;
static void
vn_initialize_syncerd(void)
{
int i;
syncer_last = SYNCER_MAXDELAY + 2;
sysctl_vfs_syncfs_setup(NULL);
syncer_workitem_pending =
kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);
for (i = 0; i < syncer_last; i++)
TAILQ_INIT(&syncer_workitem_pending[i]);
mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Return delay factor appropriate for the given file system. For
* WAPBL we use the sync vnode to burst out metadata updates: sync
* those file systems more frequently.
*/
static inline int
sync_delay(struct mount *mp)
{
return mp->mnt_wapbl != NULL ? metadelay : syncdelay;
}
/*
* Compute the next slot index from delay.
*/
static inline int
sync_delay_slot(int delayx)
{
if (delayx > syncer_maxdelay - 2)
delayx = syncer_maxdelay - 2;
return (syncer_delayno + delayx) % syncer_last;
}
/*
* Add an item to the syncer work queue.
*/
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
synclist_t *slp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(&syncer_data_lock));
if (vp->v_iflag & VI_ONWORKLST) {
/*
* Remove in order to adjust the position of the vnode.
* Note: called from sched_sync(), which will not hold
* interlock, therefore we cannot modify v_iflag here.
*/
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_REMOVE(slp, vip, vi_synclist);
} else {
KASSERT(mutex_owned(vp->v_interlock));
vp->v_iflag |= VI_ONWORKLST;
}
vip->vi_synclist_slot = sync_delay_slot(delayx);
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_INSERT_TAIL(slp, vip, vi_synclist);
}
void
vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(vp->v_interlock));
mutex_enter(&syncer_data_lock);
vn_syncer_add1(vp, delayx);
SDT_PROBE3(vfs, syncer, worklist, vnode__add,
vp, delayx, vip->vi_synclist_slot);
mutex_exit(&syncer_data_lock);
}
/*
* Remove an item from the syncer work queue.
*/
void
vn_syncer_remove_from_worklist(struct vnode *vp)
{
synclist_t *slp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_iflag & VI_ONWORKLST) {
mutex_enter(&syncer_data_lock);
SDT_PROBE1(vfs, syncer, worklist, vnode__remove, vp);
vp->v_iflag &= ~VI_ONWORKLST;
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_REMOVE(slp, vip, vi_synclist);
mutex_exit(&syncer_data_lock);
}
}
/*
* Add this mount point to the syncer.
*/
void
vfs_syncer_add_to_worklist(struct mount *mp)
{
static int start, incr, next;
int vdelay;
KASSERT(mutex_owned(mp->mnt_updating));
KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0);
/*
* We attempt to scatter the mount points on the list
* so that they will go off at evenly distributed times
* even if all the filesystems are mounted at once.
*/
next += incr;
if (next == 0 || next > syncer_maxdelay) {
start /= 2;
incr /= 2;
if (start == 0) {
start = syncer_maxdelay / 2;
incr = syncer_maxdelay;
}
next = start;
}
mp->mnt_iflag |= IMNT_ONWORKLIST;
vdelay = sync_delay(mp);
mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0;
SDT_PROBE3(vfs, syncer, worklist, mount__add,
mp, vdelay, mp->mnt_synclist_slot);
}
/*
* Remove the mount point from the syncer.
*/
void
vfs_syncer_remove_from_worklist(struct mount *mp)
{
KASSERT(mutex_owned(mp->mnt_updating));
KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0);
SDT_PROBE1(vfs, syncer, worklist, mount__remove, mp);
mp->mnt_iflag &= ~IMNT_ONWORKLIST;
}
/*
* Try lazy sync, return true on success.
*/
static bool
lazy_sync_vnode(struct vnode *vp)
{
bool synced;
int error;
KASSERT(mutex_owned(&syncer_data_lock));
synced = false;
if ((error = vcache_tryvget(vp)) == 0) {
mutex_exit(&syncer_data_lock);
if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT)) == 0) {
synced = true;
SDT_PROBE1(vfs, syncer, sync, vnode__start, vp);
error = VOP_FSYNC(vp, curlwp->l_cred,
FSYNC_LAZY, 0, 0);
SDT_PROBE2(vfs, syncer, sync, vnode__done, vp, error);
vput(vp);
} else {
SDT_PROBE2(vfs, syncer, sync, vnode__fail__lock,
vp, error);
vrele(vp);
}
mutex_enter(&syncer_data_lock);
} else {
SDT_PROBE2(vfs, syncer, sync, vnode__fail__vget, vp, error);
}
return synced;
}
/*
* System filesystem synchronizer daemon.
*/
void
sched_sync(void *arg)
{
mount_iterator_t *iter;
synclist_t *slp;
struct vnode_impl *vi;
struct vnode *vp;
struct mount *mp;
time_t starttime, endtime;
int vdelay, oslot, nslot, delayx;
bool synced;
int error;
for (;;) {
starttime = time_second;
SDT_PROBE1(vfs, syncer, sync, start, starttime);
/*
* Sync mounts whose dirty time has expired.
*/
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_trynext(iter)) != NULL) {
if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 ||
mp->mnt_synclist_slot != syncer_delayno) {
SDT_PROBE1(vfs, syncer, sync, mount__skip,
mp);
continue;
}
vdelay = sync_delay(mp);
oslot = mp->mnt_synclist_slot;
nslot = sync_delay_slot(vdelay);
mp->mnt_synclist_slot = nslot;
SDT_PROBE4(vfs, syncer, worklist, mount__update,
mp, vdelay, oslot, nslot);
SDT_PROBE1(vfs, syncer, sync, mount__start, mp);
error = VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred);
SDT_PROBE2(vfs, syncer, sync, mount__done,
mp, error);
}
mountlist_iterator_destroy(iter);
mutex_enter(&syncer_data_lock);
/*
* Push files whose dirty time has expired.
*/
slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno >= syncer_last)
syncer_delayno = 0;
while ((vi = TAILQ_FIRST(slp)) != NULL) {
vp = VIMPL_TO_VNODE(vi);
synced = lazy_sync_vnode(vp);
/*
* XXX The vnode may have been recycled, in which
* case it may have a new identity.
*/
vi = TAILQ_FIRST(slp);
if (vi != NULL && VIMPL_TO_VNODE(vi) == vp) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*
* Try again sooner rather than later if
* we were unable to lock the vnode. Lock
* failure should not prevent us from doing
* the sync "soon".
*
* If we did lock it but still arrive here, it's
* likely that lazy sync is in progress and
* so the vnode still has dirty metadata.
* syncdelay is mainly to get this vnode out
* of the way so we do not consider it again
* "soon" in this loop, so the delay time is
* not critical as long as it is not "soon".
* While write-back strategy is the file
* system's domain, we expect write-back to
* occur no later than syncdelay seconds
* into the future.
*/
delayx = synced ? syncdelay : lockdelay;
oslot = vi->vi_synclist_slot;
vn_syncer_add1(vp, delayx);
nslot = vi->vi_synclist_slot;
SDT_PROBE4(vfs, syncer, worklist,
vnode__update,
vp, delayx, oslot, nslot);
}
}
endtime = time_second;
SDT_PROBE2(vfs, syncer, sync, done, starttime, endtime);
/*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
*/
if (endtime == starttime) {
kpause("syncer", false, hz, &syncer_data_lock);
}
mutex_exit(&syncer_data_lock);
}
}
static void
sysctl_vfs_syncfs_setup(struct sysctllog **clog)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sync",
SYSCTL_DESCR("syncer options"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "delay",
SYSCTL_DESCR("max time to delay syncing data"),
NULL, 0, &syncdelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "filedelay",
SYSCTL_DESCR("time to delay syncing files"),
NULL, 0, &filedelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "dirdelay",
SYSCTL_DESCR("time to delay syncing directories"),
NULL, 0, &dirdelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "metadelay",
SYSCTL_DESCR("time to delay syncing metadata"),
NULL, 0, &metadelay, 0,
CTL_CREATE, CTL_EOL);
}
/*
* sysctl helper routine to return list of supported fstypes
*/
int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
char *where = oldp;
struct vfsops *v;
size_t needed, left, slen;
int error, first;
if (newp != NULL)
return (EPERM);
if (namelen != 0)
return (EINVAL);
first = 1;
error = 0;
needed = 0;
left = *oldlenp;
sysctl_unlock();
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (where == NULL)
needed += strlen(v->vfs_name) + 1;
else {
memset(bf, 0, sizeof(bf));
if (first) {
strncpy(bf, v->vfs_name, sizeof(bf));
first = 0;
} else {
bf[0] = ' ';
strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
}
bf[sizeof(bf)-1] = '\0';
slen = strlen(bf);
if (left < slen + 1)
break;
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
/* +1 to copy out the trailing NUL byte */
error = copyout(bf, where, slen + 1);
mutex_enter(&vfs_list_lock);
v->vfs_refcount--;
if (error)
break;
where += slen;
needed += slen;
left -= slen;
}
}
mutex_exit(&vfs_list_lock);
sysctl_relock();
*oldlenp = needed;
return (error);
}
int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP 10
/*
* Dump vnode list (via sysctl).
* Copyout address of vnode followed by vnode.
*/
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
char *where = oldp;
size_t *sizep = oldlenp;
struct mount *mp;
vnode_t *vp, vbuf;
mount_iterator_t *iter;
struct vnode_iterator *marker;
char *bp = where;
char *ewhere;
int error;
if (namelen != 0)
return (EOPNOTSUPP);
if (newp != NULL)
return (EPERM);
#define VPTRSZ sizeof(vnode_t *)
#define VNODESZ sizeof(vnode_t)
if (where == NULL) {
*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
return (0);
}
ewhere = where + *sizep;
sysctl_unlock();
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_next(iter)) != NULL) {
vfs_vnode_iterator_init(mp, &marker);
while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) {
if (bp + VPTRSZ + VNODESZ > ewhere) {
vrele(vp);
vfs_vnode_iterator_destroy(marker);
mountlist_iterator_destroy(iter);
sysctl_relock();
*sizep = bp - where;
return (ENOMEM);
}
memcpy(&vbuf, vp, VNODESZ);
if ((error = copyout(&vp, bp, VPTRSZ)) ||
(error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
vrele(vp);
vfs_vnode_iterator_destroy(marker);
mountlist_iterator_destroy(iter);
sysctl_relock();
return (error);
}
vrele(vp);
bp += VPTRSZ + VNODESZ;
}
vfs_vnode_iterator_destroy(marker);
}
mountlist_iterator_destroy(iter);
sysctl_relock();
*sizep = bp - where;
return (0);
}
/*
* Set vnode attributes to VNOVAL
*/
void
vattr_null(struct vattr *vap)
{
memset(vap, 0, sizeof(*vap));
vap->va_type = VNON;
/*
* Assign individually so that it is safe even if size and
* sign of each member are varied.
*/
vap->va_mode = VNOVAL;
vap->va_nlink = VNOVAL;
vap->va_uid = VNOVAL;
vap->va_gid = VNOVAL;
vap->va_fsid = VNOVAL;
vap->va_fileid = VNOVAL;
vap->va_size = VNOVAL;
vap->va_blocksize = VNOVAL;
vap->va_atime.tv_sec =
vap->va_mtime.tv_sec =
vap->va_ctime.tv_sec =
vap->va_birthtime.tv_sec = VNOVAL;
vap->va_atime.tv_nsec =
vap->va_mtime.tv_nsec =
vap->va_ctime.tv_nsec =
vap->va_birthtime.tv_nsec = VNOVAL;
vap->va_gen = VNOVAL;
vap->va_flags = VNOVAL;
vap->va_rdev = VNOVAL;
vap->va_bytes = VNOVAL;
}
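/*
 * Illustrative sketch (added, not original code): callers normally null the
 * attributes and then set only the fields they want changed before calling
 * VOP_SETATTR(), e.g. truncating a file on open:
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = 0;
 *	error = VOP_SETATTR(vp, &va, cred);
 *
 * Fields left at VNOVAL are ignored by the filesystem.
 */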
/*
* Vnode state to string.
*/
const char *
vstate_name(enum vnode_state state)
{
switch (state) {
case VS_ACTIVE:
return "ACTIVE";
case VS_MARKER:
return "MARKER";
case VS_LOADING:
return "LOADING";
case VS_LOADED:
return "LOADED";
case VS_BLOCKED:
return "BLOCKED";
case VS_RECLAIMING:
return "RECLAIMING";
case VS_RECLAIMED:
return "RECLAIMED";
default:
return "ILLEGAL";
}
}
/*
* Print a description of a vnode (common part).
*/
static void
vprint_common(struct vnode *vp, const char *prefix,
void (*pr)(const char *, ...) __printflike(1, 2))
{
int n;
char bf[96];
const uint8_t *cp;
vnode_impl_t *vip;
const char * const vnode_tags[] = { VNODE_TAGS };
const char * const vnode_types[] = { VNODE_TYPES };
const char vnode_flagbits[] = VNODE_FLAGBITS;
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
vip = VNODE_TO_VIMPL(vp);
snprintb(bf, sizeof(bf),
vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);
(*pr)("vnode %p flags %s\n", vp, bf);
(*pr)("%stag %s(%d) type %s(%d) mount %p typedata %p\n", prefix,
ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
vp->v_mount, vp->v_mountedhere);
(*pr)("%susecount %d writecount %d holdcount %d\n", prefix,
vrefcnt(vp), vp->v_writecount, vp->v_holdcnt);
(*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n",
prefix, vp->v_size, vp->v_writesize, vp->v_numoutput);
(*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock);
(*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state),
vip->vi_key.vk_mount, vip->vi_key.vk_key_len);
n = vip->vi_key.vk_key_len;
cp = vip->vi_key.vk_key;
while (n-- > 0)
(*pr)(" %02x", *cp++);
(*pr)("\n");
(*pr)("%slrulisthd %p\n", prefix, vip->vi_lrulisthd);
#undef ARRAY_PRINT
#undef ARRAY_SIZE
}
/*
* Print out a description of a vnode.
*/
void
vprint(const char *label, struct vnode *vp)
{
if (label != NULL)
printf("%s: ", label);
vprint_common(vp, "\t", printf);
if (vp->v_data != NULL) {
printf("\t");
VOP_PRINT(vp);
}
}
/*
* Given a file system name, look up the vfsops for that
* file system, or return NULL if file system isn't present
* in the kernel.
*/
struct vfsops *
vfs_getopsbyname(const char *name)
{
struct vfsops *v;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (strcmp(v->vfs_name, name) == 0)
break;
}
if (v != NULL)
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
return (v);
}
void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
const struct statvfs *mbp;
if (sbp == (mbp = &mp->mnt_stat))
return;
(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
sbp->f_fsid = mbp->f_fsid;
sbp->f_owner = mbp->f_owner;
sbp->f_flag = mbp->f_flag;
sbp->f_syncwrites = mbp->f_syncwrites;
sbp->f_asyncwrites = mbp->f_asyncwrites;
sbp->f_syncreads = mbp->f_syncreads;
sbp->f_asyncreads = mbp->f_asyncreads;
(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
sizeof(sbp->f_fstypename));
(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
sizeof(sbp->f_mntonname));
(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
sizeof(sbp->f_mntfromname));
(void)memcpy(sbp->f_mntfromlabel, mp->mnt_stat.f_mntfromlabel,
sizeof(sbp->f_mntfromlabel));
sbp->f_namemax = mbp->f_namemax;
}
int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
const char *vfsname, struct mount *mp, struct lwp *l)
{
int error;
size_t size;
struct statvfs *sfs = &mp->mnt_stat;
int (*fun)(const void *, void *, size_t, size_t *);
(void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
sizeof(mp->mnt_stat.f_fstypename));
if (onp) {
struct cwdinfo *cwdi = l->l_proc->p_cwdi;
fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
if (cwdi->cwdi_rdir != NULL) {
size_t len;
char *bp;
char *path = PNBUF_GET();
bp = path + MAXPATHLEN;
*--bp = '\0';
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
path, MAXPATHLEN / 2, 0, l);
rw_exit(&cwdi->cwdi_lock);
if (error) {
PNBUF_PUT(path);
return error;
}
len = strlen(bp);
if (len > sizeof(sfs->f_mntonname) - 1)
len = sizeof(sfs->f_mntonname) - 1;
(void)strncpy(sfs->f_mntonname, bp, len);
PNBUF_PUT(path);
if (len < sizeof(sfs->f_mntonname) - 1) {
error = (*fun)(onp, &sfs->f_mntonname[len],
sizeof(sfs->f_mntonname) - len - 1, &size);
if (error)
return error;
size += len;
} else {
size = len;
}
} else {
error = (*fun)(onp, &sfs->f_mntonname,
sizeof(sfs->f_mntonname) - 1, &size);
if (error)
return error;
}
(void)memset(sfs->f_mntonname + size, 0,
sizeof(sfs->f_mntonname) - size);
}
if (fromp) {
fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
error = (*fun)(fromp, sfs->f_mntfromname,
sizeof(sfs->f_mntfromname) - 1, &size);
if (error)
return error;
(void)memset(sfs->f_mntfromname + size, 0,
sizeof(sfs->f_mntfromname) - size);
}
return 0;
}
/*
* Knob to control the precision of file timestamps:
*
* 0 = seconds only; nanoseconds zeroed.
* 1 = seconds and nanoseconds, accurate within 1/HZ.
* 2 = seconds and nanoseconds, truncated to microseconds.
* >=3 = seconds and nanoseconds, maximum precision.
*/
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
int vfs_timestamp_precision __read_mostly = TSP_NSEC;
void
vfs_timestamp(struct timespec *tsp)
{
struct timeval tv;
switch (vfs_timestamp_precision) {
case TSP_SEC:
tsp->tv_sec = time_second;
tsp->tv_nsec = 0;
break;
case TSP_HZ:
getnanotime(tsp);
break;
case TSP_USEC:
microtime(&tv);
TIMEVAL_TO_TIMESPEC(&tv, tsp);
break;
case TSP_NSEC:
default:
nanotime(tsp);
break;
}
}
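/*
 * Illustrative sketch (added, not original code): a filesystem updating an
 * inode's modification time at the configured precision would do something
 * like the following ("ip" and its fields are hypothetical, fs-specific
 * names):
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *	ip->i_mtime = ts.tv_sec;
 *	ip->i_mtimensec = ts.tv_nsec;
 */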
/*
* The purpose of this routine is to remove granularity from accmode_t,
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
* VADMIN and VAPPEND.
*
* If it returns 0, the caller is supposed to continue with the usual
* access checks using 'accmode' as modified by this routine. If it
* returns nonzero value, the caller is supposed to return that value
* as errno.
*
* Note that after this routine runs, accmode may be zero.
*/
int
vfs_unixify_accmode(accmode_t *accmode)
{
/*
* There is no way to specify explicit "deny" rule using
* file mode or POSIX.1e ACLs.
*/
if (*accmode & VEXPLICIT_DENY) {
*accmode = 0;
return (0);
}
/*
* None of these can be translated into usual access bits.
* Also, the common case for NFSv4 ACLs is to not contain
* either of these bits. Caller should check for VWRITE
* on the containing directory instead.
*/
if (*accmode & (VDELETE_CHILD | VDELETE))
return (EPERM);
if (*accmode & VADMIN_PERMS) {
*accmode &= ~VADMIN_PERMS;
*accmode |= VADMIN;
}
/*
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
*/
*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
return (0);
}
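/*
 * Illustrative sketch (added, not original code) of the calling convention
 * described above, as seen from an access-check routine:
 *
 *	error = vfs_unixify_accmode(&accmode);
 *	if (error != 0)
 *		return error;
 *	if (accmode == 0)
 *		return 0;
 *	.. continue with the usual VEXEC/VREAD/VWRITE/VADMIN checks ..
 */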
time_t rootfstime; /* recorded root fs time, if known */
void
setrootfstime(time_t t)
{
rootfstime = t;
}
static const uint8_t vttodt_tab[ ] = {
[VNON] = DT_UNKNOWN,
[VREG] = DT_REG,
[VDIR] = DT_DIR,
[VBLK] = DT_BLK,
[VCHR] = DT_CHR,
[VLNK] = DT_LNK,
[VSOCK] = DT_SOCK,
[VFIFO] = DT_FIFO,
[VBAD] = DT_UNKNOWN
};
uint8_t
vtype2dt(enum vtype vt)
{
CTASSERT(VBAD == __arraycount(vttodt_tab) - 1);
return vttodt_tab[vt];
}
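/*
 * Illustrative sketch (added, not original code): a readdir implementation
 * that has the vnode type at hand can fill in the dirent type with
 *
 *	dp->d_type = vtype2dt(vp->v_type);
 *
 * where "dp" is the struct dirent being assembled.
 */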
int
VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
{
int mpsafe = mp->mnt_iflag & IMNT_MPSAFE;
int error;
/*
* Note: The first time through, the vfs_mount function may set
* IMNT_MPSAFE, so we have to cache it on entry in order to
* avoid leaking a kernel lock.
*
* XXX Maybe the MPSAFE bit should be set in struct vfsops and
* not in struct mount.
*/
if (!mpsafe) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
if (!mpsafe) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_START(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_start))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_UNMOUNT(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_unmount))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_ROOT(struct mount *mp, int lktype, struct vnode **a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_root))(mp, lktype, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_quotactl))(mp, args);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_STATVFS(struct mount *mp, struct statvfs *a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_FHTOVP(struct mount *mp, struct fid *a, int b, struct vnode **c)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b, c);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
{
int error;
if ((vp->v_vflag & VV_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
if ((vp->v_vflag & VV_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
{
int error;
KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */
error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
KERNEL_UNLOCK_ONE(NULL); /* XXX */
return error;
}
int
VFS_SUSPENDCTL(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
#if defined(DDB) || defined(DEBUGPRINT)
static const char buf_flagbits[] = BUF_FLAGBITS;
void
vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
{
char bf[1024];
(*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
PRIx64 " dev 0x%x\n",
bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);
snprintb(bf, sizeof(bf),
buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
(*pr)(" error %d flags %s\n", bp->b_error, bf);
(*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
bp->b_bufsize, bp->b_bcount, bp->b_resid);
(*pr)(" data %p saveaddr %p\n",
bp->b_data, bp->b_saveaddr);
(*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
}
void
vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
{
uvm_object_printit(&vp->v_uobj, full, pr);
(*pr)("\n");
vprint_common(vp, "", pr);
if (full) {
struct buf *bp;
(*pr)("clean bufs:\n");
LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
(*pr)(" bp %p\n", bp);
vfs_buf_print(bp, full, pr);
}
(*pr)("dirty bufs:\n");
LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
(*pr)(" bp %p\n", bp);
vfs_buf_print(bp, full, pr);
}
}
}
void
vfs_vnode_lock_print(void *vlock, int full, void (*pr)(const char *, ...))
{
struct mount *mp;
vnode_impl_t *vip;
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
if (&vip->vi_lock == vlock ||
VIMPL_TO_VNODE(vip)->v_interlock == vlock)
vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr);
}
}
}
void
vfs_mount_print_all(int full, void (*pr)(const char *, ...))
{
struct mount *mp;
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
vfs_mount_print(mp, full, pr);
}
void
vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
{
char sbuf[256];
(*pr)("vnodecovered = %p data = %p\n",
mp->mnt_vnodecovered, mp->mnt_data);
(*pr)("fs_bshift %d dev_bshift = %d\n",
mp->mnt_fs_bshift, mp->mnt_dev_bshift);
snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
(*pr)("flag = %s\n", sbuf);
snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
(*pr)("iflag = %s\n", sbuf);
(*pr)("refcnt = %d updating @ %p\n", mp->mnt_refcnt, mp->mnt_updating);
(*pr)("statvfs cache:\n");
(*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize);
(*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize);
(*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize);
(*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks);
(*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree);
(*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail);
(*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd);
(*pr)("\tfiles = %"PRIu64"\n", mp->mnt_stat.f_files);
(*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree);
(*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail);
(*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd);
(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
mp->mnt_stat.f_fsidx.__fsid_val[0],
mp->mnt_stat.f_fsidx.__fsid_val[1]);
(*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner);
(*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax);
snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);
(*pr)("\tflag = %s\n", sbuf);
(*pr)("\tsyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_syncwrites);
(*pr)("\tasyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_asyncwrites);
(*pr)("\tsyncreads = %" PRIu64 "\n", mp->mnt_stat.f_syncreads);
(*pr)("\tasyncreads = %" PRIu64 "\n", mp->mnt_stat.f_asyncreads);
(*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename);
(*pr)("\tmntonname = %s\n", mp->mnt_stat.f_mntonname);
(*pr)("\tmntfromname = %s\n", mp->mnt_stat.f_mntfromname);
{
int cnt = 0;
vnode_t *vp;
vnode_impl_t *vip;
(*pr)("locked vnodes =");
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (VOP_ISLOCKED(vp)) {
if ((++cnt % 6) == 0) {
(*pr)(" %p,\n\t", vp);
} else {
(*pr)(" %p,", vp);
}
}
}
(*pr)("\n");
}
if (full) {
int cnt = 0;
vnode_t *vp;
vnode_impl_t *vip;
(*pr)("all vnodes =");
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (!TAILQ_NEXT(vip, vi_mntvnodes)) {
(*pr)(" %p", vp);
} else if ((++cnt % 6) == 0) {
(*pr)(" %p,\n\t", vp);
} else {
(*pr)(" %p,", vp);
}
}
(*pr)("\n");
}
}
/*
* List all of the locked vnodes in the system.
*/
void printlockedvnodes(void);
void
printlockedvnodes(void)
{
struct mount *mp;
vnode_t *vp;
vnode_impl_t *vip;
printf("Locked vnodes\n");
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (VOP_ISLOCKED(vp))
vprint(NULL, vp);
}
}
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: sys_ptrace_common.c,v 1.92 2021/08/09 20:49:10 andvar Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
/*-
* Copyright (c) 1993 Jan-Simon Pendry.
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_ptrace_common.c,v 1.92 2021/08/09 20:49:10 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ptrace.h"
#include "opt_ktrace.h"
#include "opt_pax.h"
#include "opt_compat_netbsd32.h"
#endif
#if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \
&& !defined(_RUMPKERNEL)
#define COMPAT_NETBSD32
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/exec.h>
#include <sys/pax.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/ras.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/module.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
# ifdef PTRACE_DEBUG
# define DPRINTF(a) uprintf a
# else
# define DPRINTF(a)
# endif
static kauth_listener_t ptrace_listener;
static int process_auxv_offset(struct proc *, struct uio *);
extern int user_va0_disable;
#if 0
static int ptrace_cbref;
static kmutex_t ptrace_mtx;
static kcondvar_t ptrace_cv;
#endif
#ifdef PT_GETREGS
# define case_PT_GETREGS case PT_GETREGS:
#else
# define case_PT_GETREGS
#endif
#ifdef PT_SETREGS
# define case_PT_SETREGS case PT_SETREGS:
#else
# define case_PT_SETREGS
#endif
#ifdef PT_GETFPREGS
# define case_PT_GETFPREGS case PT_GETFPREGS:
#else
# define case_PT_GETFPREGS
#endif
#ifdef PT_SETFPREGS
# define case_PT_SETFPREGS case PT_SETFPREGS:
#else
# define case_PT_SETFPREGS
#endif
#ifdef PT_GETDBREGS
# define case_PT_GETDBREGS case PT_GETDBREGS:
#else
# define case_PT_GETDBREGS
#endif
#ifdef PT_SETDBREGS
# define case_PT_SETDBREGS case PT_SETDBREGS:
#else
# define case_PT_SETDBREGS
#endif
static int
ptrace_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
#ifdef PT_SETDBREGS
extern int user_set_dbregs;
#endif
result = KAUTH_RESULT_DEFER;
p = arg0;
#if 0
mutex_enter(&ptrace_mtx);
ptrace_cbref++;
mutex_exit(&ptrace_mtx);
#endif
if (action != KAUTH_PROCESS_PTRACE)
goto out;
switch ((u_long)arg1) {
#ifdef PT_SETDBREGS
case_PT_SETDBREGS
if (kauth_cred_getuid(cred) != 0 && user_set_dbregs == 0) {
result = KAUTH_RESULT_DENY;
break;
}
#endif
/* FALLTHROUGH */
case PT_TRACE_ME:
case PT_ATTACH:
case PT_WRITE_I:
case PT_WRITE_D:
case PT_READ_I:
case PT_READ_D:
case PT_IO:
case_PT_GETREGS
case_PT_SETREGS
case_PT_GETFPREGS
case_PT_SETFPREGS
case_PT_GETDBREGS
case PT_SET_EVENT_MASK:
case PT_GET_EVENT_MASK:
case PT_GET_PROCESS_STATE:
case PT_SET_SIGINFO:
case PT_GET_SIGINFO:
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
#endif
if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) ||
ISSET(p->p_flag, PK_SUGID)) {
break;
}
result = KAUTH_RESULT_ALLOW;
break;
#ifdef PT_STEP
case PT_STEP:
case PT_SETSTEP:
case PT_CLEARSTEP:
#endif
case PT_CONTINUE:
case PT_KILL:
case PT_DETACH:
case PT_LWPINFO:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_DUMPCORE:
case PT_RESUME:
case PT_SUSPEND:
case PT_STOP:
case PT_LWPSTATUS:
case PT_LWPNEXT:
case PT_SET_SIGPASS:
case PT_GET_SIGPASS:
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
out:
#if 0
mutex_enter(&ptrace_mtx);
if (--ptrace_cbref == 0)
cv_broadcast(&ptrace_cv);
mutex_exit(&ptrace_mtx);
#endif
return result;
}
static struct proc *
ptrace_find(struct lwp *l, int req, pid_t pid)
{
struct proc *t;
/* "A foolish consistency..." XXX */
if (req == PT_TRACE_ME) {
t = l->l_proc;
mutex_enter(t->p_lock);
return t;
}
/* Find the process we're supposed to be operating on. */
t = proc_find(pid);
if (t == NULL)
return NULL;
/* XXX-elad */
mutex_enter(t->p_lock);
int error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
t, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
if (error) {
mutex_exit(t->p_lock);
return NULL;
}
return t;
}
static int
ptrace_allowed(struct lwp *l, int req, struct proc *t, struct proc *p,
bool *locked)
{
*locked = false;
/*
* Grab a reference on the process to prevent it from execing or
* exiting.
*/
if (!rw_tryenter(&t->p_reflock, RW_READER))
return EBUSY;
*locked = true;
/* Make sure we can operate on it. */
switch (req) {
case PT_TRACE_ME:
/*
* A process can't ask its parent to start tracing it if:
* (1) the parent is initproc,
*/
if (p->p_pptr == initproc)
return EPERM;
/*
* (2) the process is initproc, or
*/
if (p == initproc)
return EPERM;
/*
* (3) the child is already traced.
*/
if (ISSET(p->p_slflag, PSL_TRACED))
return EBUSY;
return 0;
case PT_ATTACH:
/*
* You can't attach to a process if:
* (1) it's the process that's doing the attaching,
*/
if (t == p)
return EINVAL;
/*
* (2) it's a system process,
*/
if (t->p_flag & PK_SYSTEM)
return EPERM;
/*
* (3) the tracer is initproc,
*/
if (p == initproc)
return EPERM;
/*
* (4) it's already being traced,
*/
if (ISSET(t->p_slflag, PSL_TRACED))
return EBUSY;
/*
* (5) it's a vfork(2)ed parent of the current process, or
*/
if (ISSET(p->p_lflag, PL_PPWAIT) && p->p_pptr == t)
return EPERM;
/*
* (6) the tracer is chrooted, and its root directory is
* not at or above the root directory of the tracee
*/
mutex_exit(t->p_lock); /* XXXSMP */
int tmp = proc_isunder(t, l);
mutex_enter(t->p_lock); /* XXXSMP */
if (!tmp)
return EPERM;
return 0;
case PT_READ_I:
case PT_READ_D:
case PT_WRITE_I:
case PT_WRITE_D:
case PT_IO:
case PT_SET_SIGINFO:
case PT_GET_SIGINFO:
case_PT_GETREGS
case_PT_SETREGS
case_PT_GETFPREGS
case_PT_SETFPREGS
case_PT_GETDBREGS
case_PT_SETDBREGS
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
#endif
/*
* You can't read/write the memory or registers of a process
* if the tracer is chrooted, and its root directory is not at
* or above the root directory of the tracee.
*/
mutex_exit(t->p_lock); /* XXXSMP */
tmp = proc_isunder(t, l);
mutex_enter(t->p_lock); /* XXXSMP */
if (!tmp)
return EPERM;
/*FALLTHROUGH*/
case PT_CONTINUE:
case PT_KILL:
case PT_DETACH:
case PT_LWPINFO:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_DUMPCORE:
#ifdef PT_STEP
case PT_STEP:
case PT_SETSTEP:
case PT_CLEARSTEP:
#endif
case PT_SET_EVENT_MASK:
case PT_GET_EVENT_MASK:
case PT_GET_PROCESS_STATE:
case PT_RESUME:
case PT_SUSPEND:
case PT_STOP:
case PT_LWPSTATUS:
case PT_LWPNEXT:
case PT_SET_SIGPASS:
case PT_GET_SIGPASS:
/*
* You can't do what you want to the process if:
* (1) It's not being traced at all,
*/
if (!ISSET(t->p_slflag, PSL_TRACED))
return EPERM;
/*
* (2) it's not being traced by _you_, or
*/
if (t->p_pptr != p) {
DPRINTF(("parent %d != %d\n", t->p_pptr->p_pid,
p->p_pid));
return EBUSY;
}
/*
* (3) it's not currently stopped.
*
* As an exception allow PT_KILL and PT_STOP here.
*/
if (req != PT_KILL && req != PT_STOP &&
(t->p_stat != SSTOP || !t->p_waited /* XXXSMP */)) {
DPRINTF(("stat %d flag %d\n", t->p_stat,
!t->p_waited));
return EBUSY;
}
return 0;
default: /* It was not a legal request. */
return EINVAL;
}
}
static int
ptrace_needs_hold(int req)
{
switch (req) {
#ifdef PT_STEP
case PT_STEP:
#endif
case PT_CONTINUE:
case PT_DETACH:
case PT_KILL:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_ATTACH:
case PT_TRACE_ME:
case PT_GET_SIGINFO:
case PT_SET_SIGINFO:
case PT_STOP:
return 1;
default:
return 0;
}
}
static int
ptrace_get_siginfo(struct proc *t, struct ptrace_methods *ptm, void *addr,
size_t data)
{
struct ptrace_siginfo psi;
memset(&psi, 0, sizeof(psi));
psi.psi_siginfo._info = t->p_sigctx.ps_info;
psi.psi_lwpid = t->p_sigctx.ps_lwp;
DPRINTF(("%s: lwp=%d signal=%d\n", __func__, psi.psi_lwpid,
psi.psi_siginfo.si_signo));
return ptm->ptm_copyout_siginfo(&psi, addr, data);
}
static int
ptrace_set_siginfo(struct proc *t, struct lwp **lt, struct ptrace_methods *ptm,
void *addr, size_t data)
{
struct ptrace_siginfo psi;
int error = ptm->ptm_copyin_siginfo(&psi, addr, data);
if (error)
return error;
/* Check that the data is a valid signal number or zero. */
if (psi.psi_siginfo.si_signo < 0 || psi.psi_siginfo.si_signo >= NSIG)
return EINVAL;
t->p_sigctx.ps_faked = true;
t->p_sigctx.ps_info = psi.psi_siginfo._info;
t->p_sigctx.ps_lwp = psi.psi_lwpid;
DPRINTF(("%s: lwp=%d signal=%d\n", __func__, psi.psi_lwpid,
psi.psi_siginfo.si_signo));
return 0;
}
static int
ptrace_get_sigpass(struct proc *t, void *addr, size_t data)
{
sigset_t set;
if (data > sizeof(set) || data <= 0) {
DPRINTF(("%s: invalid data: %zu < %zu <= 0\n",
__func__, sizeof(set), data));
return EINVAL;
}
set = t->p_sigctx.ps_sigpass;
return copyout(&set, addr, data);
}
static int
ptrace_set_sigpass(struct proc *t, void *addr, size_t data)
{
sigset_t set;
int error;
if (data > sizeof(set) || data <= 0) {
DPRINTF(("%s: invalid data: %zu < %zu <= 0\n",
__func__, sizeof(set), data));
return EINVAL;
}
memset(&set, 0, sizeof(set));
if ((error = copyin(addr, &set, data)))
return error;
/* We catch SIGSTOP and cannot intercept SIGKILL. */
sigminusset(&sigcantmask, &set);
t->p_sigctx.ps_sigpass = set;
return 0;
}
static int
ptrace_get_event_mask(struct proc *t, void *addr, size_t data)
{
struct ptrace_event pe;
if (data != sizeof(pe)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pe)));
return EINVAL;
}
memset(&pe, 0, sizeof(pe));
pe.pe_set_event = ISSET(t->p_slflag, PSL_TRACEFORK) ?
PTRACE_FORK : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEVFORK) ?
PTRACE_VFORK : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEVFORK_DONE) ?
PTRACE_VFORK_DONE : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACELWP_CREATE) ?
PTRACE_LWP_CREATE : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACELWP_EXIT) ?
PTRACE_LWP_EXIT : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEPOSIX_SPAWN) ?
PTRACE_POSIX_SPAWN : 0;
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
t->p_sigctx.ps_lwp, pe.pe_set_event));
return copyout(&pe, addr, sizeof(pe));
}
static int
ptrace_set_event_mask(struct proc *t, void *addr, size_t data)
{
struct ptrace_event pe;
int error;
if (data != sizeof(pe)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pe)));
return EINVAL;
}
if ((error = copyin(addr, &pe, sizeof(pe))) != 0)
return error;
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
t->p_sigctx.ps_lwp, pe.pe_set_event));
if (pe.pe_set_event & PTRACE_FORK)
SET(t->p_slflag, PSL_TRACEFORK);
else
CLR(t->p_slflag, PSL_TRACEFORK);
if (pe.pe_set_event & PTRACE_VFORK)
SET(t->p_slflag, PSL_TRACEVFORK);
else
CLR(t->p_slflag, PSL_TRACEVFORK);
if (pe.pe_set_event & PTRACE_VFORK_DONE)
SET(t->p_slflag, PSL_TRACEVFORK_DONE);
else
CLR(t->p_slflag, PSL_TRACEVFORK_DONE);
if (pe.pe_set_event & PTRACE_LWP_CREATE)
SET(t->p_slflag, PSL_TRACELWP_CREATE);
else
CLR(t->p_slflag, PSL_TRACELWP_CREATE);
if (pe.pe_set_event & PTRACE_LWP_EXIT)
SET(t->p_slflag, PSL_TRACELWP_EXIT);
else
CLR(t->p_slflag, PSL_TRACELWP_EXIT);
if (pe.pe_set_event & PTRACE_POSIX_SPAWN)
SET(t->p_slflag, PSL_TRACEPOSIX_SPAWN);
else
CLR(t->p_slflag, PSL_TRACEPOSIX_SPAWN);
return 0;
}
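/*
 * Illustrative sketch (added, not original code): from userland a debugger
 * selects which events it wants reported with PT_SET_EVENT_MASK (needs
 * <sys/ptrace.h>; "pid" is the traced process), which ends up in
 * ptrace_set_event_mask() above:
 *
 *	struct ptrace_event pe;
 *
 *	memset(&pe, 0, sizeof(pe));
 *	pe.pe_set_event = PTRACE_FORK | PTRACE_LWP_CREATE;
 *	if (ptrace(PT_SET_EVENT_MASK, pid, &pe, sizeof(pe)) == -1)
 *		err(EXIT_FAILURE, "PT_SET_EVENT_MASK");
 */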
static int
ptrace_get_process_state(struct proc *t, void *addr, size_t data)
{
struct _ksiginfo *si;
struct ptrace_state ps;
if (data != sizeof(ps)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(ps)));
return EINVAL;
}
if (t->p_sigctx.ps_info._signo != SIGTRAP ||
(t->p_sigctx.ps_info._code != TRAP_CHLD &&
t->p_sigctx.ps_info._code != TRAP_LWP)) {
memset(&ps, 0, sizeof(ps));
} else {
si = &t->p_sigctx.ps_info;
KASSERT(si->_reason._ptrace_state._pe_report_event > 0);
KASSERT(si->_reason._ptrace_state._option._pe_other_pid > 0);
ps.pe_report_event = si->_reason._ptrace_state._pe_report_event;
CTASSERT(sizeof(ps.pe_other_pid) == sizeof(ps.pe_lwp));
ps.pe_other_pid =
si->_reason._ptrace_state._option._pe_other_pid;
}
DPRINTF(("%s: lwp=%d event=%#x pid=%d lwp=%d\n", __func__,
t->p_sigctx.ps_lwp, ps.pe_report_event,
ps.pe_other_pid, ps.pe_lwp));
return copyout(&ps, addr, sizeof(ps));
}
static int
ptrace_lwpinfo(struct proc *t, struct lwp **lt, void *addr, size_t data)
{
struct ptrace_lwpinfo pl;
if (data != sizeof(pl)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pl)));
return EINVAL;
}
int error = copyin(addr, &pl, sizeof(pl));
if (error)
return error;
lwpid_t tmp = pl.pl_lwpid;
lwp_delref(*lt);
mutex_enter(t->p_lock);
if (tmp == 0)
*lt = lwp_find_first(t);
else {
*lt = lwp_find(t, tmp);
if (*lt == NULL) {
mutex_exit(t->p_lock);
return ESRCH;
}
*lt = LIST_NEXT(*lt, l_sibling);
}
while (*lt != NULL && (!lwp_alive(*lt) ||
((*lt)->l_flag & LW_SYSTEM) != 0))
*lt = LIST_NEXT(*lt, l_sibling);
pl.pl_lwpid = 0;
pl.pl_event = 0;
if (*lt) {
lwp_addref(*lt);
pl.pl_lwpid = (*lt)->l_lid;
if ((*lt)->l_flag & LW_WSUSPEND)
pl.pl_event = PL_EVENT_SUSPENDED;
/*
* If we match the lwp, or it was sent to every lwp,
* we set PL_EVENT_SIGNAL.
* XXX: ps_lwp == 0 means everyone and no one, so
* check ps_signo too.
*/
else if ((*lt)->l_lid == t->p_sigctx.ps_lwp ||
(t->p_sigctx.ps_lwp == 0 &&
t->p_sigctx.ps_info._signo)) {
DPRINTF(("%s: lwp=%d siglwp=%d signo %d\n", __func__,
pl.pl_lwpid, t->p_sigctx.ps_lwp,
t->p_sigctx.ps_info._signo));
pl.pl_event = PL_EVENT_SIGNAL;
}
}
mutex_exit(t->p_lock);
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
pl.pl_lwpid, pl.pl_event));
return copyout(&pl, addr, sizeof(pl));
}
static int
ptrace_lwpstatus(struct proc *t, struct ptrace_methods *ptm, struct lwp **lt,
void *addr, size_t data, bool next)
{
struct ptrace_lwpstatus pls;
struct lwp *l;
int error;
if (data > sizeof(pls) || data < sizeof(lwpid_t)) {
DPRINTF(("%s: invalid data: %zu < %zu < %zu\n",
__func__, sizeof(lwpid_t), data, sizeof(pls)));
return EINVAL;
}
error = copyin(addr, &pls.pl_lwpid, sizeof(lwpid_t));
if (error)
return error;
if (next) {
lwp_delref(*lt);
lwpid_t tmp = pls.pl_lwpid;
mutex_enter(t->p_lock);
if (tmp == 0)
*lt = lwp_find_first(t);
else {
*lt = lwp_find(t, tmp);
if (*lt == NULL) {
mutex_exit(t->p_lock);
return ESRCH;
}
*lt = LIST_NEXT(*lt, l_sibling);
}
while (*lt != NULL && (!lwp_alive(*lt) ||
((*lt)->l_flag & LW_SYSTEM) != 0))
*lt = LIST_NEXT(*lt, l_sibling);
if (*lt == NULL) {
memset(&pls, 0, sizeof(pls));
mutex_exit(t->p_lock);
goto out;
}
lwp_addref(*lt);
mutex_exit(t->p_lock);
pls.pl_lwpid = (*lt)->l_lid;
} else {
if ((error = ptrace_update_lwp(t, lt, pls.pl_lwpid)) != 0)
return error;
}
l = *lt;
ptrace_read_lwpstatus(l, &pls);
out:
DPRINTF(("%s: lwp=%d sigpend=%02x%02x%02x%02x sigmask=%02x%02x%02x%02x "
"name='%s' private=%p\n", __func__, pls.pl_lwpid,
pls.pl_sigpend.__bits[0], pls.pl_sigpend.__bits[1],
pls.pl_sigpend.__bits[2], pls.pl_sigpend.__bits[3],
pls.pl_sigmask.__bits[0], pls.pl_sigmask.__bits[1],
pls.pl_sigmask.__bits[2], pls.pl_sigmask.__bits[3],
pls.pl_name, pls.pl_private));
return ptm->ptm_copyout_lwpstatus(&pls, addr, data);
}
static int
ptrace_startstop(struct proc *t, struct lwp **lt, int rq, void *addr,
size_t data)
{
int error;
if ((error = ptrace_update_lwp(t, lt, data)) != 0)
return error;
DPRINTF(("%s: lwp=%d request=%d\n", __func__, (*lt)->l_lid, rq));
lwp_lock(*lt);
if (rq == PT_SUSPEND)
(*lt)->l_flag |= LW_DBGSUSPEND;
else {
(*lt)->l_flag &= ~LW_DBGSUSPEND;
if ((*lt)->l_stat == LSSUSPENDED)
(*lt)->l_stat = LSSTOP;
}
lwp_unlock(*lt);
return 0;
}
#ifdef PT_REGISTERS
static int
ptrace_uio_dir(int req)
{
switch (req) {
case_PT_GETREGS
case_PT_GETFPREGS
case_PT_GETDBREGS
return UIO_READ;
case_PT_SETREGS
case_PT_SETFPREGS
case_PT_SETDBREGS
return UIO_WRITE;
default:
return -1;
}
}
static int
ptrace_regs(struct lwp *l, struct lwp **lt, int rq, struct ptrace_methods *ptm,
void *addr, size_t data)
{
int error;
struct proc *p, *t;
struct vmspace *vm;
p = l->l_proc; /* tracer */
t = (*lt)->l_proc; /* traced */
if ((error = ptrace_update_lwp(t, lt, data)) != 0)
return error;
int dir = ptrace_uio_dir(rq);
size_t size;
int (*func)(struct lwp *, struct lwp *, struct uio *);
DPRINTF(("%s: lwp=%d request=%d\n", __func__, l->l_lid, rq));
switch (rq) {
#if defined(PT_SETREGS) || defined(PT_GETREGS)
case_PT_GETREGS
case_PT_SETREGS
if (!process_validregs(*lt))
return EINVAL;
size = PROC_REGSZ(p);
func = ptm->ptm_doregs;
break;
#endif
#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS)
case_PT_GETFPREGS
case_PT_SETFPREGS
if (!process_validfpregs(*lt))
return EINVAL;
size = PROC_FPREGSZ(p);
func = ptm->ptm_dofpregs;
break;
#endif
#if defined(PT_SETDBREGS) || defined(PT_GETDBREGS)
case_PT_GETDBREGS
case_PT_SETDBREGS
if (!process_validdbregs(*lt))
return EINVAL;
size = PROC_DBREGSZ(p);
func = ptm->ptm_dodbregs;
break;
#endif
default:
return EINVAL;
}
error = proc_vmspace_getref(l->l_proc, &vm);
if (error)
return error;
struct uio uio;
struct iovec iov;
iov.iov_base = addr;
iov.iov_len = size;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_resid = iov.iov_len;
uio.uio_rw = dir;
uio.uio_vmspace = vm;
error = (*func)(l, *lt, &uio);
uvmspace_free(vm);
return error;
}
#endif
static int
ptrace_sendsig(struct lwp *l, int req, struct proc *t, struct lwp *lt,
int signo, int resume_all)
{
ksiginfo_t ksi;
/* Finally, deliver the requested signal (or none). */
if (t->p_stat == SSTOP) {
/*
* Unstop the process. If it needs to take a
* signal, make all efforts to ensure that at
* an LWP runs to see it.
*/
t->p_xsig = signo;
/*
* The signo > 0 check prevents a potential panic, as
* sigismember(&..., 0) is an invalid check and signo
* can be 0 as the special "no signal" case.
*/
if (signo > 0 && sigismember(&stopsigmask, signo)) {
t->p_waited = 0;
child_psignal(t, 0);
} else if (resume_all)
proc_unstop(t);
else
lwp_unstop(lt);
return 0;
}
KASSERT(req == PT_KILL || req == PT_STOP || req == PT_ATTACH);
KSI_INIT(&ksi);
ksi.ksi_signo = signo;
ksi.ksi_code = SI_USER;
ksi.ksi_pid = l->l_proc->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(l->l_cred);
t->p_sigctx.ps_faked = false;
DPRINTF(("%s: pid=%d.%d signal=%d resume_all=%d\n", __func__, t->p_pid,
lt->l_lid, signo, resume_all));
return kpsignal2(t, &ksi);
}
static int
ptrace_dumpcore(struct lwp *lt, char *path, size_t len)
{
int error;
if (path != NULL) {
if (len >= MAXPATHLEN)
return EINVAL;
char *src = path;
path = kmem_alloc(len + 1, KM_SLEEP);
error = copyin(src, path, len);
if (error)
goto out;
path[len] = '\0';
}
DPRINTF(("%s: lwp=%d\n", __func__, lt->l_lid));
MODULE_HOOK_CALL(coredump_hook, (lt, path), 0, error);
out:
if (path)
kmem_free(path, len + 1);
return error;
}
static int
ptrace_doio(struct lwp *l, struct proc *t, struct lwp *lt,
struct ptrace_io_desc *piod, void *addr, bool sysspace)
{
struct uio uio;
struct iovec iov;
int error, tmp;
error = 0;
iov.iov_base = piod->piod_addr;
iov.iov_len = piod->piod_len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(unsigned long)piod->piod_offs;
uio.uio_resid = piod->piod_len;
DPRINTF(("%s: lwp=%d request=%d\n", __func__, l->l_lid, piod->piod_op));
switch (piod->piod_op) {
case PIOD_READ_D:
case PIOD_READ_I:
uio.uio_rw = UIO_READ;
break;
case PIOD_WRITE_D:
case PIOD_WRITE_I:
/*
* Can't write to a RAS
*/
if (ras_lookup(t, addr) != (void *)-1) {
return EACCES;
}
uio.uio_rw = UIO_WRITE;
break;
case PIOD_READ_AUXV:
uio.uio_rw = UIO_READ;
tmp = t->p_execsw->es_arglen;
if (uio.uio_offset > tmp)
return EIO;
if (uio.uio_resid > tmp - uio.uio_offset)
uio.uio_resid = tmp - uio.uio_offset;
piod->piod_len = iov.iov_len = uio.uio_resid;
error = process_auxv_offset(t, &uio);
break;
default:
error = EINVAL;
break;
}
if (error)
return error;
if (sysspace) {
uio.uio_vmspace = vmspace_kernel();
} else {
error = proc_vmspace_getref(l->l_proc, &uio.uio_vmspace);
if (error)
return error;
}
error = process_domem(l, lt, &uio);
if (!sysspace)
uvmspace_free(uio.uio_vmspace);
if (error)
return error;
piod->piod_len -= uio.uio_resid;
return 0;
}
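/*
* Common ptrace(2) back-end: look up the target process, perform the
* kauth permission checks, take whatever locks the request requires and
* dispatch on the request code.
*/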
int
do_ptrace(struct ptrace_methods *ptm, struct lwp *l, int req, pid_t pid,
void *addr, int data, register_t *retval)
{
struct proc *p = l->l_proc;
struct lwp *lt = NULL;
struct lwp *lt2;
struct proc *t; /* target process */
struct ptrace_io_desc piod;
int error, write, tmp, pheld;
int signo = 0;
int resume_all;
bool locked;
error = 0;
/*
* If attaching or detaching, we need to get a write hold on the
* proclist lock so that we can re-parent the target process.
*/
mutex_enter(&proc_lock);
t = ptrace_find(l, req, pid);
if (t == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
pheld = 1;
if ((error = ptrace_allowed(l, req, t, p, &locked)) != 0)
goto out;
if ((error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_PTRACE, t, KAUTH_ARG(req), NULL, NULL)) != 0)
goto out;
if ((lt = lwp_find_first(t)) == NULL) {
error = ESRCH;
goto out;
}
/* Do single-step fixup if needed. */
FIX_SSTEP(t);
KASSERT(lt != NULL);
lwp_addref(lt);
/*
* Which locks do we need held? XXX Ugly.
*/
if ((pheld = ptrace_needs_hold(req)) == 0) {
mutex_exit(t->p_lock);
mutex_exit(&proc_lock);
}
/* Now do the operation. */
write = 0;
*retval = 0;
tmp = 0;
resume_all = 1;
switch (req) {
case PT_TRACE_ME:
/* Just set the trace flag. */
SET(t->p_slflag, PSL_TRACED);
t->p_opptr = t->p_pptr;
break;
/*
* The separate I and D address spaces were inherited from the PDP-11.
* 16-bit UNIX started with a single address space per program, but was
* later extended to two 16-bit (2 x 64kB) address spaces.
*
* None of the currently supported architectures maintains this feature,
* but we keep the API for backward compatibility. The I and D
* operations are now identical and debuggers do not distinguish them.
*/
case PT_WRITE_I:
case PT_WRITE_D:
write = 1;
tmp = data;
/* FALLTHROUGH */
case PT_READ_I:
case PT_READ_D:
piod.piod_addr = &tmp;
piod.piod_len = sizeof(tmp);
piod.piod_offs = addr;
piod.piod_op = write ? PIOD_WRITE_D : PIOD_READ_D;
if ((error = ptrace_doio(l, t, lt, &piod, addr, true)) != 0)
break;
/*
* For legacy reasons we treat two results here as success:
* - incomplete transfer: piod.piod_len < sizeof(tmp)
* - no transfer: piod.piod_len == 0
*
* This means that there is no way to determine whether the
* transfer was actually performed in PT_WRITE and PT_READ
* calls.
*/
if (!write)
*retval = tmp;
break;
case PT_IO:
if ((error = ptm->ptm_copyin_piod(&piod, addr, data)) != 0)
break;
if (piod.piod_len < 1) {
error = EINVAL;
break;
}
if ((error = ptrace_doio(l, t, lt, &piod, addr, false)) != 0)
break;
/*
* For legacy reasons we treat two results here as success:
* - incomplete transfer: piod.piod_len < requested length
* - no transfer: piod.piod_len == 0
*/
error = ptm->ptm_copyout_piod(&piod, addr, data);
break;
case PT_DUMPCORE:
error = ptrace_dumpcore(lt, addr, data);
break;
#ifdef PT_STEP
case PT_STEP:
/*
* From the 4.4BSD PRM:
* "Execution continues as in request PT_CONTINUE; however
* as soon as possible after execution of at least one
* instruction, execution stops again. [ ... ]"
*/
#endif
case PT_CONTINUE:
case PT_SYSCALL:
case PT_DETACH:
if (req == PT_SYSCALL) {
if (!ISSET(t->p_slflag, PSL_SYSCALL)) {
SET(t->p_slflag, PSL_SYSCALL);
#ifdef __HAVE_SYSCALL_INTERN
(*t->p_emul->e_syscall_intern)(t);
#endif
}
} else {
if (ISSET(t->p_slflag, PSL_SYSCALL)) {
CLR(t->p_slflag, PSL_SYSCALL);
#ifdef __HAVE_SYSCALL_INTERN
(*t->p_emul->e_syscall_intern)(t);
#endif
}
}
t->p_trace_enabled = trace_is_enabled(t);
/*
* Pick up the LWPID, if supplied. There are two cases:
* data < 0 : step or continue single thread, lwp = -data
* data > 0 in PT_STEP : step this thread, continue others
* For operations other than PT_STEP, data > 0 means
* data is the signo to deliver to the process.
*/
tmp = data;
if (tmp >= 0) {
#ifdef PT_STEP
if (req == PT_STEP)
signo = 0;
else
#endif
{
signo = tmp;
tmp = 0; /* don't search for LWP */
}
} else if (tmp == INT_MIN) {
error = ESRCH;
break;
} else {
tmp = -tmp;
}
if (tmp > 0) {
if (req == PT_DETACH) {
error = EINVAL;
break;
}
lwp_delref2(lt);
lt = lwp_find(t, tmp);
if (lt == NULL) {
error = ESRCH;
break;
}
lwp_addref(lt);
resume_all = 0;
signo = 0;
}
/*
* From the 4.4BSD PRM:
* "The data argument is taken as a signal number and the
* child's execution continues at location addr as if it
* incurred that signal. Normally the signal number will
* be either 0 to indicate that the signal that caused the
* stop should be ignored, or that value fetched out of
* the process's image indicating which signal caused
* the stop. If addr is (int *)1 then execution continues
* from where it stopped."
*/
/* Check that the data is a valid signal number or zero. */
if (signo < 0 || signo >= NSIG) {
error = EINVAL;
break;
}
/* Prevent a deadlock: refuse to resume if every LWP to be resumed is suspended. */
if (resume_all) {
#ifdef PT_STEP
if (req == PT_STEP) {
if (lt->l_flag &
(LW_WSUSPEND | LW_DBGSUSPEND)) {
error = EDEADLK;
break;
}
} else
#endif
{
error = EDEADLK;
LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
if ((lt2->l_flag &
(LW_WSUSPEND | LW_DBGSUSPEND)) == 0) {
error = 0;
break;
}
}
if (error != 0)
break;
}
} else {
if (lt->l_flag & (LW_WSUSPEND | LW_DBGSUSPEND)) {
error = EDEADLK;
break;
}
}
/*
* Reject setting the program counter to 0x0 if VA0 is disabled.
*
* Not all kernels allow the program counter to be set in the
* same PT_CONTINUE (or similar) operation. This causes
* portability issues, as passing address 0x0 is a no-op on
* those kernels but will usually fail on NetBSD.
*/
if (user_va0_disable && addr == 0) {
error = EINVAL;
break;
}
/* If the address parameter is not (int *)1, set the pc. */
if ((int *)addr != (int *)1) {
error = process_set_pc(lt, addr);
if (error != 0)
break;
}
#ifdef PT_STEP
/*
* Arrange for a single-step, if that's requested and possible.
* More precisely, set the single step status as requested for
* the requested thread, and clear it for other threads.
*/
LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
error = process_sstep(lt2,
ISSET(lt2->l_pflag, LP_SINGLESTEP));
if (error)
break;
}
if (error)
break;
error = process_sstep(lt,
ISSET(lt->l_pflag, LP_SINGLESTEP) || req == PT_STEP);
if (error)
break;
#endif
if (req == PT_DETACH) {
CLR(t->p_slflag,
PSL_TRACED|PSL_TRACEDCHILD|PSL_SYSCALL);
/* clear sigpass mask */
sigemptyset(&t->p_sigctx.ps_sigpass);
/* give process back to original parent or init */
if (t->p_opptr != t->p_pptr) {
struct proc *pp = t->p_opptr;
proc_reparent(t, pp ? pp : initproc);
}
/* not being traced any more */
t->p_opptr = NULL;
/* clear single step */
LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
CLR(lt2->l_pflag, LP_SINGLESTEP);
}
CLR(lt->l_pflag, LP_SINGLESTEP);
}
sendsig:
error = ptrace_sendsig(l, req, t, lt, signo, resume_all);
break;
case PT_SYSCALLEMU:
if (!ISSET(t->p_slflag, PSL_SYSCALL) || t->p_stat != SSTOP) {
error = EINVAL;
break;
}
SET(t->p_slflag, PSL_SYSCALLEMU);
break;
#ifdef PT_STEP
case PT_SETSTEP:
write = 1;
/* FALLTHROUGH */
case PT_CLEARSTEP:
/* write = 0 done above. */
if ((error = ptrace_update_lwp(t, &lt, data)) != 0)
break;
if (write)
SET(lt->l_pflag, LP_SINGLESTEP);
else
CLR(lt->l_pflag, LP_SINGLESTEP);
break;
#endif
case PT_KILL:
/* just send the process a KILL signal. */
signo = SIGKILL;
goto sendsig; /* in PT_CONTINUE, above. */
case PT_STOP:
/* just send the process a STOP signal. */
signo = SIGSTOP;
goto sendsig; /* in PT_CONTINUE, above. */
case PT_ATTACH:
/*
* Go ahead and set the trace flag.
* Save the old parent (it's reset in
* _DETACH, and also in kern_exit.c:wait4()).
* Reparent the process so that the tracing
* proc gets to see all the action.
* Stop the target.
*/
proc_changeparent(t, p);
signo = SIGSTOP;
goto sendsig;
case PT_GET_EVENT_MASK:
error = ptrace_get_event_mask(t, addr, data);
break;
case PT_SET_EVENT_MASK:
error = ptrace_set_event_mask(t, addr, data);
break;
case PT_GET_PROCESS_STATE:
error = ptrace_get_process_state(t, addr, data);
break;
case PT_LWPINFO:
error = ptrace_lwpinfo(t, &lt, addr, data);
break;
case PT_SET_SIGINFO:
error = ptrace_set_siginfo(t, &lt, ptm, addr, data);
break;
case PT_GET_SIGINFO:
error = ptrace_get_siginfo(t, ptm, addr, data);
break;
case PT_RESUME:
case PT_SUSPEND:
error = ptrace_startstop(t, &lt, req, addr, data);
break;
case PT_LWPSTATUS:
error = ptrace_lwpstatus(t, ptm, &lt, addr, data, false);
break;
case PT_LWPNEXT:
error = ptrace_lwpstatus(t, ptm, &lt, addr, data, true);
break;
case PT_SET_SIGPASS:
error = ptrace_set_sigpass(t, addr, data);
break;
case PT_GET_SIGPASS:
error = ptrace_get_sigpass(t, addr, data);
break;
#ifdef PT_REGISTERS
case_PT_SETREGS
case_PT_GETREGS
case_PT_SETFPREGS
case_PT_GETFPREGS
case_PT_SETDBREGS
case_PT_GETDBREGS
error = ptrace_regs(l, &lt, req, ptm, addr, data);
break;
#endif
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
error = ptrace_machdep_dorequest(l, &lt, req, addr, data);
break;
#endif
}
out:
if (pheld) {
mutex_exit(t->p_lock);
mutex_exit(&proc_lock);
}
if (lt != NULL)
lwp_delref(lt);
if (locked)
rw_exit(&t->p_reflock);
return error;
}
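/*
* Adjust the uio offset to the start of the target's auxiliary vector,
* which immediately follows the NULL-terminated environment pointer
* array; used by PIOD_READ_AUXV transfers.
*/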
static int
process_auxv_offset(struct proc *p, struct uio *uio)
{
struct ps_strings pss;
int error;
off_t off = (off_t)p->p_psstrp;
if ((error = copyin_psstrings(p, &pss)) != 0)
return error;
if (pss.ps_envstr == NULL)
return EIO;
#ifdef COMPAT_NETBSD32
if (p->p_flag & PK_32)
uio->uio_offset += (off_t)((vaddr_t)pss.ps_envstr +
sizeof(uint32_t) * (pss.ps_nenvstr + 1));
else
#endif
uio->uio_offset += (off_t)(vaddr_t)(pss.ps_envstr +
pss.ps_nenvstr + 1);
#ifdef __MACHINE_STACK_GROWS_UP
if (uio->uio_offset < off)
return EIO;
#else
if (uio->uio_offset > off)
return EIO;
if ((uio->uio_offset + uio->uio_resid) > off)
uio->uio_resid = off - uio->uio_offset;
#endif
return 0;
}
MODULE(MODULE_CLASS_EXEC, ptrace_common, NULL);
static int
ptrace_common_init(void)
{
#if 0
mutex_init(&ptrace_mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ptrace_cv, "ptracecb");
ptrace_cbref = 0;
#endif
ptrace_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
ptrace_listener_cb, NULL);
return 0;
}
static int
ptrace_common_fini(void)
{
kauth_unlisten_scope(ptrace_listener);
#if 0
/* Make sure no-one is executing our kauth listener */
mutex_enter(&ptrace_mtx);
while (ptrace_cbref != 0)
cv_wait(&ptrace_cv, &ptrace_mtx);
mutex_exit(&ptrace_mtx);
mutex_destroy(&ptrace_mtx);
cv_destroy(&ptrace_cv);
#endif
return 0;
}
static int
ptrace_common_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = ptrace_common_init();
break;
case MODULE_CMD_FINI:
error = ptrace_common_fini();
break;
default:
ptrace_hooks();
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $");
#include <sys/types.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/statvfs.h>
#include <sys/vnode.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
int
genfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
sbp->f_bsize = DEV_BSIZE;
sbp->f_frsize = DEV_BSIZE;
sbp->f_iosize = DEV_BSIZE;
sbp->f_blocks = 2; /* 1k to keep df happy */
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_bresvd = 0;
sbp->f_files = 0;
sbp->f_ffree = 0;
sbp->f_favail = 0;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, mp);
return 0;
}
int
genfs_renamelock_enter(struct mount *mp)
{
mutex_enter(mp->mnt_renamelock);
/* Preserve possible error return in case we become interruptible. */
return 0;
}
void
genfs_renamelock_exit(struct mount *mp)
{
mutex_exit(mp->mnt_renamelock);
}
int
genfs_suspendctl(struct mount *mp, int cmd)
{
int error;
switch (cmd) {
case SUSPEND_SUSPEND:
error = fstrans_setstate(mp, FSTRANS_SUSPENDING);
if (error)
return error;
error = fstrans_setstate(mp, FSTRANS_SUSPENDED);
return error;
case SUSPEND_RESUME:
error = fstrans_setstate(mp, FSTRANS_NORMAL);
KASSERT(error == 0);
return 0;
default:
panic("%s: bogus command %d", __func__, cmd);
}
}
/* $NetBSD: ip6_output.c,v 1.235 2024/04/19 00:55:35 riastradh Exp $ */
/* $KAME: ip6_output.c,v 1.172 2001/03/25 09:55:56 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_output.c,v 1.235 2024/04/19 00:55:35 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>
#include <netinet/icmp6.h>
#include <netinet/in_offload.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_offload.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif
extern pfil_head_t *inet6_pfil_hook; /* XXX */
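/*
* Per-packet extension headers built from the passed options; each
* member is a separate mbuf that is later spliced into the outgoing
* packet's mbuf chain.
*/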
struct ip6_exthdrs {
struct mbuf *ip6e_ip6;
struct mbuf *ip6e_hbh;
struct mbuf *ip6e_dest1;
struct mbuf *ip6e_rthdr;
struct mbuf *ip6e_dest2;
};
static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
kauth_cred_t, int);
static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, kauth_cred_t,
int, int, int);
static int ip6_setmoptions(const struct sockopt *, struct inpcb *);
static int ip6_getmoptions(struct sockopt *, struct inpcb *);
static int ip6_copyexthdr(struct mbuf **, void *, int);
static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
struct ip6_frag **);
static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
static int ip6_getpmtu(struct rtentry *, struct ifnet *, u_long *, int *);
static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
static int ip6_ifaddrvalid(const struct in6_addr *, const struct in6_addr *);
static int ip6_handle_rthdr(struct ip6_rthdr *, struct ip6_hdr *);
#ifdef RFC2292
static int ip6_pcbopts(struct ip6_pktopts **, struct socket *, struct sockopt *);
#endif
static int
ip6_handle_rthdr(struct ip6_rthdr *rh, struct ip6_hdr *ip6)
{
int error = 0;
switch (rh->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
/* Dropped, RFC5095. */
default: /* is it possible? */
error = EINVAL;
}
return error;
}
/*
* Send an IP packet to a host.
*/
int
ip6_if_output(struct ifnet * const ifp, struct ifnet * const origifp,
struct mbuf * const m, const struct sockaddr_in6 * const dst,
const struct rtentry *rt)
{
int error = 0;
if (rt != NULL) {
error = rt_check_reject_route(rt, ifp);
if (error != 0) {
IP6_STATINC(IP6_STAT_RTREJECT);
m_freem(m);
return error;
}
}
/* discard the packet if IPv6 operation is disabled on the interface */
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
m_freem(m);
return ENETDOWN; /* better error? */
}
if ((ifp->if_flags & IFF_LOOPBACK) != 0)
error = if_output_lock(ifp, origifp, m, sin6tocsa(dst), rt);
else
error = if_output_lock(ifp, ifp, m, sin6tocsa(dst), rt);
return error;
}
/*
* IP6 output. The packet in mbuf chain m contains a skeletal IP6
* header (with pri, len, nxt, hlim, src, dst).
*
* This function may modify ver and hlim only. The mbuf chain containing the
* packet will be freed. The mbuf opt, if present, will not be freed.
*
* Type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and
* nd_ifinfo.linkmtu is u_int32_t. So we use u_long to hold the largest
* of them, which is rt_rmx.rmx_mtu.
*/
int
ip6_output(
struct mbuf *m0,
struct ip6_pktopts *opt,
struct route *ro,
int flags,
struct ip6_moptions *im6o,
struct inpcb *inp,
struct ifnet **ifpp /* XXX: just for statistics */
)
{
struct ip6_hdr *ip6, *mhip6;
struct ifnet *ifp = NULL, *origifp = NULL;
struct mbuf *m = m0;
int tlen, len, off;
bool tso;
struct route ip6route;
struct rtentry *rt = NULL, *rt_pmtu;
const struct sockaddr_in6 *dst;
struct sockaddr_in6 src_sa, dst_sa;
int error = 0;
struct in6_ifaddr *ia = NULL;
u_long mtu;
int alwaysfrag, dontfrag;
u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
struct ip6_exthdrs exthdrs;
struct in6_addr finaldst, src0, dst0;
u_int32_t zone;
struct route *ro_pmtu = NULL;
int hdrsplit = 0;
int needipsec = 0;
#ifdef IPSEC
struct secpolicy *sp = NULL;
#endif
struct psref psref, psref_ia;
int bound = curlwp_bind();
bool release_psref_ia = false;
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("ip6_output: no HDR");
if ((m->m_pkthdr.csum_flags &
(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_TSOv4)) != 0) {
panic("ip6_output: IPv4 checksum offload flags: %d",
m->m_pkthdr.csum_flags);
}
if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) ==
(M_CSUM_TCPv6|M_CSUM_UDPv6)) {
panic("ip6_output: conflicting checksum offload flags: %d",
m->m_pkthdr.csum_flags);
}
#endif
M_CSUM_DATA_IPv6_SET(m->m_pkthdr.csum_data, sizeof(struct ip6_hdr));
#define MAKE_EXTHDR(hp, mp) \
do { \
if (hp) { \
struct ip6_ext *eh = (struct ip6_ext *)(hp); \
error = ip6_copyexthdr((mp), (void *)(hp), \
((eh)->ip6e_len + 1) << 3); \
if (error) \
goto freehdrs; \
} \
} while (/*CONSTCOND*/ 0)
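/*
* MAKE_EXTHDR copies a caller-supplied extension header (hp) into a
* freshly allocated mbuf (*mp); the length is derived from the header's
* ip6e_len field, which counts 8-byte units beyond the first 8 bytes.
*/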
memset(&exthdrs, 0, sizeof(exthdrs));
if (opt) {
/* Hop-by-Hop options header */
MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
/* Destination options header (1st part) */
MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
/* Routing header */
MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
/* Destination options header (2nd part) */
MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
}
/*
* Calculate the total length of the extension header chain.
* Keep the length of the unfragmentable part for fragmentation.
*/
optlen = 0;
if (exthdrs.ip6e_hbh)
optlen += exthdrs.ip6e_hbh->m_len;
if (exthdrs.ip6e_dest1)
optlen += exthdrs.ip6e_dest1->m_len;
if (exthdrs.ip6e_rthdr)
optlen += exthdrs.ip6e_rthdr->m_len;
unfragpartlen = optlen + sizeof(struct ip6_hdr);
/* NOTE: we don't add AH/ESP length here. do that later. */
if (exthdrs.ip6e_dest2)
optlen += exthdrs.ip6e_dest2->m_len;
#ifdef IPSEC
if (ipsec_used) {
/* Check the security policy (SP) for the packet */
sp = ipsec6_check_policy(m, inp, flags, &needipsec, &error);
if (error != 0) {
/*
* Hack: -EINVAL is used to signal that a packet
* should be silently discarded. This is typically
* because we asked key management for an SA and
* it was delayed (e.g. kicked up to IKE).
*/
if (error == -EINVAL)
error = 0;
IP6_STATINC(IP6_STAT_IPSECDROP_OUT);
goto freehdrs;
}
}
#endif
if (needipsec &&
(m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) {
in6_undefer_cksum_tcpudp(m);
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
/*
* If we need IPsec, or there is at least one extension header,
* separate IP6 header from the payload.
*/
if ((needipsec || optlen) && !hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/* adjust mbuf packet header length */
m->m_pkthdr.len += optlen;
plen = m->m_pkthdr.len - sizeof(*ip6);
/* If this is a jumbo payload, insert a jumbo payload option. */
if (plen > IPV6_MAXPACKET) {
if (!hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto freehdrs;
}
optlen += 8; /* XXX JUMBOOPTLEN */
ip6->ip6_plen = 0;
} else
ip6->ip6_plen = htons(plen);
/*
* Concatenate headers and fill in next header fields.
* Here we have, on "m"
* IPv6 payload
* and we insert headers accordingly. Finally, we should be getting:
* IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
*
* During the header composition process, "m" points to the IPv6
* header and "mprev" points to the extension header prior to esp.
*/
{
u_char *nexthdrp = &ip6->ip6_nxt;
struct mbuf *mprev = m;
/*
* we treat dest2 specially. this makes IPsec processing
* much easier. the goal here is to make mprev point to the
* mbuf prior to dest2.
*
* result: IPv6 dest2 payload
* m and mprev will point to IPv6 header.
*/
if (exthdrs.ip6e_dest2) {
if (!hdrsplit)
panic("assumption failed: hdr not split");
exthdrs.ip6e_dest2->m_next = m->m_next;
m->m_next = exthdrs.ip6e_dest2;
*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_DSTOPTS;
}
#define MAKE_CHAIN(m, mp, p, i)\
do {\
if (m) {\
if (!hdrsplit) \
panic("assumption failed: hdr not split"); \
*mtod((m), u_char *) = *(p);\
*(p) = (i);\
p = mtod((m), u_char *);\
(m)->m_next = (mp)->m_next;\
(mp)->m_next = (m);\
(mp) = (m);\
}\
} while (/*CONSTCOND*/ 0)
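/*
* MAKE_CHAIN links extension header mbuf "m" after mbuf "mp", copies the
* previous next-header value into the new header, stores protocol "i" in
* the preceding header through "p", and advances "p" to point at the new
* header's next-header field.
*/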
/*
* result: IPv6 hbh dest1 rthdr dest2 payload
* m will point to IPv6 header. mprev will point to the
* extension header prior to dest2 (rthdr in the above case).
*/
MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
IPPROTO_DSTOPTS);
MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
IPPROTO_ROUTING);
M_CSUM_DATA_IPv6_SET(m->m_pkthdr.csum_data,
sizeof(struct ip6_hdr) + optlen);
}
/* Need to save for pmtu */
finaldst = ip6->ip6_dst;
/*
* If there is a routing header, replace destination address field
* with the first hop of the routing header.
*/
if (exthdrs.ip6e_rthdr) {
struct ip6_rthdr *rh;
rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *);
error = ip6_handle_rthdr(rh, ip6);
if (error != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
}
/* Source address validation */
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
(flags & IPV6_UNSPECSRC) == 0) {
error = EOPNOTSUPP;
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
error = EOPNOTSUPP;
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
IP6_STATINC(IP6_STAT_LOCALOUT);
/*
* Route packet.
*/
/* initialize cached route */
if (ro == NULL) {
memset(&ip6route, 0, sizeof(ip6route));
ro = &ip6route;
}
ro_pmtu = ro;
if (opt && opt->ip6po_rthdr)
ro = &opt->ip6po_route;
/*
* if specified, try to fill in the traffic class field.
* do not override if a non-zero value is already set.
* we check the diffserv field and the ecn field separately.
*/
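/*
* In the IPv6 header the 8-bit traffic class sits between the 4-bit
* version and the 20-bit flow label; 0xfc masks its DSCP bits and 0x03
* its ECN bits.
*/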
if (opt && opt->ip6po_tclass >= 0) {
int mask = 0;
if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
mask |= 0xfc;
if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
mask |= 0x03;
if (mask != 0)
ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
}
/* fill in or override the hop limit field, if necessary. */
if (opt && opt->ip6po_hlim != -1)
ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
if (im6o != NULL)
ip6->ip6_hlim = im6o->im6o_multicast_hlim;
else
ip6->ip6_hlim = ip6_defmcasthlim;
}
#ifdef IPSEC
if (needipsec) {
error = ipsec6_process_packet(m, sp->req, flags);
/*
* Preserve KAME behaviour: ENOENT can be returned
* when an SA acquire is in progress. Don't propagate
* this to user-level; it confuses applications.
* XXX this will go away when the SADB is redone.
*/
if (error == ENOENT)
error = 0;
goto done;
}
#endif
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0);
/* We do not need a route for multicast */
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct in6_pktinfo *pi = NULL;
/*
* If the outgoing interface for the address is specified by
* the caller, use it.
*/
if (opt && (pi = opt->ip6po_pktinfo) != NULL) {
/* XXX boundary check is assumed to be already done. */
ifp = if_get_byindex(pi->ipi6_ifindex, &psref);
} else if (im6o != NULL) {
ifp = if_get_byindex(im6o->im6o_multicast_if_index,
&psref);
}
}
if (ifp == NULL) {
error = in6_selectroute(&dst_sa, opt, &ro, &rt, true);
if (error != 0)
goto bad;
ifp = if_get_byindex(rt->rt_ifp->if_index, &psref);
}
if (rt == NULL) {
/*
* If in6_selectroute() does not return a route entry,
* dst may not have been updated.
*/
error = rtcache_setdst(ro, sin6tosa(&dst_sa));
if (error) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
}
/*
* At this point rt (for unicast) and ifp must be valid (non-NULL).
*/
if ((flags & IPV6_FORWARDING) == 0) {
/* XXX: the FORWARDING flag can be set for mrouting. */
in6_ifstat_inc(ifp, ifs6_out_request);
}
if (rt != NULL) {
ia = (struct in6_ifaddr *)(rt->rt_ifa);
rt->rt_use++;
}
/*
* The outgoing interface must be in the zone of source and
* destination addresses. We should use ia_ifp to support the
* case of sending packets to an address of our own.
*/
if (ia != NULL) {
origifp = ia->ia_ifp;
if (if_is_deactivated(origifp)) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
if_acquire(origifp, &psref_ia);
release_psref_ia = true;
} else
origifp = ifp;
src0 = ip6->ip6_src;
if (in6_setscope(&src0, origifp, &zone))
goto badscope;
sockaddr_in6_init(&src_sa, &ip6->ip6_src, 0, 0, 0);
if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
goto badscope;
dst0 = ip6->ip6_dst;
if (in6_setscope(&dst0, origifp, &zone))
goto badscope;
/* re-initialize to be sure */
sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0);
if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id)
goto badscope;
/* scope check is done. */
/* Ensure we only send from a valid address. */
if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
(flags & IPV6_FORWARDING) == 0 &&
(error = ip6_ifaddrvalid(&src0, &dst0)) != 0)
{
char ip6buf[INET6_ADDRSTRLEN];
nd6log(LOG_ERR,
"refusing to send from invalid address %s (pid %d)\n",
IN6_PRINT(ip6buf, &src0), curproc->p_pid);
IP6_STATINC(IP6_STAT_ODROPPED);
in6_ifstat_inc(origifp, ifs6_out_discard);
if (error == 1)
/*
* Address exists, but is tentative or detached.
* We can't send from it because it's invalid,
* so we drop the packet.
*/
error = 0;
else
error = EADDRNOTAVAIL;
goto bad;
}
if (rt != NULL && (rt->rt_flags & RTF_GATEWAY) &&
!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
dst = satocsin6(rt->rt_gateway);
else
dst = satocsin6(rtcache_getdst(ro));
/*
* XXXXXX: original code follows:
*/
if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
else {
bool ingroup;
m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
in6_ifstat_inc(ifp, ifs6_out_mcast);
/*
* Confirm that the outgoing interface supports multicast.
*/
if (!(ifp->if_flags & IFF_MULTICAST)) {
IP6_STATINC(IP6_STAT_NOROUTE);
in6_ifstat_inc(ifp, ifs6_out_discard);
error = ENETUNREACH;
goto bad;
}
ingroup = in6_multi_group(&ip6->ip6_dst, ifp);
if (ingroup && (im6o == NULL || im6o->im6o_multicast_loop)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
*/
KASSERT(dst != NULL);
ip6_mloopback(ifp, m, dst);
} else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IPV6_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip6_mloopback(),
* above, will be forwarded by the ip6_input() routine,
* if necessary.
*/
if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
if (ip6_mforward(ip6, ifp, m) != 0) {
m_freem(m);
goto done;
}
}
}
/*
* Multicasts with a hoplimit of zero may be looped back,
* above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip6_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
m_freem(m);
goto done;
}
}
/*
* Fill the outgoing interface to tell the upper layer
* to increment per-interface statistics.
*/
if (ifpp)
*ifpp = ifp;
/* Determine path MTU. */
/*
* ro_pmtu represents the final destination while
* ro might represent an intermediate destination.
* Use the ro_pmtu destination since the MTU might differ.
*/
if (ro_pmtu != ro) {
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
/* ro_pmtu may not have a cache */
sockaddr_in6_init(&u.dst6, &finaldst, 0, 0, 0);
rt_pmtu = rtcache_lookup(ro_pmtu, &u.dst);
} else
rt_pmtu = rt;
error = ip6_getpmtu(rt_pmtu, ifp, &mtu, &alwaysfrag);
if (rt_pmtu != NULL && rt_pmtu != rt)
rtcache_unref(rt_pmtu, ro_pmtu);
KASSERT(error == 0); /* ip6_getpmtu never fails if ifp is passed */
/*
* The caller of this function may specify to use the minimum MTU
* in some cases.
* An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
* setting. The logic is a bit complicated; by default, unicast
* packets will follow path MTU while multicast packets will be sent at
* the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets
* including unicast ones will be sent at the minimum MTU. Multicast
* packets will always be sent at the minimum MTU unless
* IP6PO_MINMTU_DISABLE is explicitly specified.
* See RFC 3542 for more details.
*/
if (mtu > IPV6_MMTU) {
if ((flags & IPV6_MINMTU))
mtu = IPV6_MMTU;
else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
mtu = IPV6_MMTU;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
(opt == NULL ||
opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
mtu = IPV6_MMTU;
}
}
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
/*
* If the outgoing packet contains a hop-by-hop options header,
* it must be examined and processed even by the source node.
* (RFC 2460, section 4.)
*
* XXX Is this really necessary?
*/
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
u_int32_t dummy1 = 0; /* XXX unused */
u_int32_t dummy2; /* XXX unused */
int hoff = sizeof(struct ip6_hdr);
if (ip6_hopopts_input(&dummy1, &dummy2, &m, &hoff)) {
/* m was already freed at this point */
error = EINVAL;
goto done;
}
ip6 = mtod(m, struct ip6_hdr *);
}
/*
* Run through list of hooks for output packets.
*/
error = pfil_run_hooks(inet6_pfil_hook, &m, ifp, PFIL_OUT);
if (error != 0 || m == NULL) {
IP6_STATINC(IP6_STAT_PFILDROP_OUT);
goto done;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Send the packet to the outgoing interface.
* If necessary, do IPv6 fragmentation before sending.
*
* the logic here is rather complex:
* 1: normal case (dontfrag == 0, alwaysfrag == 0)
* 1-a: send as is if tlen <= path mtu
* 1-b: fragment if tlen > path mtu
*
* 2: if user asks us not to fragment (dontfrag == 1)
* 2-a: send as is if tlen <= interface mtu
* 2-b: error if tlen > interface mtu
*
* 3: if we always need to attach fragment header (alwaysfrag == 1)
* always fragment
*
* 4: if dontfrag == 1 && alwaysfrag == 1
* error, as we cannot handle this conflicting request
*/
tlen = m->m_pkthdr.len;
tso = (m->m_pkthdr.csum_flags & M_CSUM_TSOv6) != 0;
if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG))
dontfrag = 1;
else
dontfrag = 0;
if (dontfrag && alwaysfrag) { /* case 4 */
/* conflicting request - can't transmit */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
goto bad;
}
if (dontfrag && (!tso && tlen > ifp->if_mtu)) { /* case 2-b */
/*
* Even if the DONTFRAG option is specified, we cannot send the
* packet when the data length is larger than the MTU of the
* outgoing interface.
* Notify the error by sending IPV6_PATHMTU ancillary data as
* well as returning an error code (the latter is not described
* in the API spec.)
*/
u_int32_t mtu32;
struct ip6ctlparam ip6cp;
mtu32 = (u_int32_t)mtu;
memset(&ip6cp, 0, sizeof(ip6cp));
ip6cp.ip6c_cmdarg = (void *)&mtu32;
pfctlinput2(PRC_MSGSIZE,
rtcache_getdst(ro_pmtu), &ip6cp);
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
goto bad;
}
/*
* transmit packet without fragmentation
*/
if (dontfrag || (!alwaysfrag && (tlen <= mtu || tso))) {
/* case 1-a and 2-a */
struct in6_ifaddr *ia6;
int sw_csum;
int s;
ip6 = mtod(m, struct ip6_hdr *);
s = pserialize_read_enter();
ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
if (ia6) {
/* Record statistics for this interface address. */
ia6->ia_ifa.ifa_data.ifad_outbytes += m->m_pkthdr.len;
}
pserialize_read_exit(s);
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
if ((sw_csum & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) {
if (IN6_NEED_CHECKSUM(ifp,
sw_csum & (M_CSUM_UDPv6|M_CSUM_TCPv6))) {
in6_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
KASSERT(dst != NULL);
if (__predict_false(sw_csum & M_CSUM_TSOv6)) {
/*
* TSO6 is required by a packet, but disabled for
* the interface.
*/
error = ip6_tso_output(ifp, origifp, m, dst, rt);
} else
error = ip6_if_output(ifp, origifp, m, dst, rt);
goto done;
}
if (tso) {
IP6_STATINC(IP6_STAT_CANTFRAG); /* XXX */
error = EINVAL; /* XXX */
goto bad;
}
/*
* try to fragment the packet. case 1-b and 3
*/
if (mtu < IPV6_MMTU) {
/* path MTU cannot be less than IPV6_MMTU */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
} else if (ip6->ip6_plen == 0) {
/* jumbo payload cannot be fragmented */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
} else {
const uint32_t id = ip6_randomid();
struct mbuf **mnext, *m_frgpart;
const int hlen = unfragpartlen;
struct ip6_frag *ip6f;
u_char nextproto;
if (mtu > IPV6_MAXPACKET)
mtu = IPV6_MAXPACKET;
/*
* Must be able to put at least 8 bytes per fragment.
*/
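/*
* Round the per-fragment payload down to a multiple of 8, since
* fragment offsets are expressed in 8-byte units.
*/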
len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
if (len < 8) {
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
}
mnext = &m->m_nextpkt;
/*
* Change the next header field of the last header in the
* unfragmentable part.
*/
if (exthdrs.ip6e_rthdr) {
nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_dest1) {
nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_hbh) {
nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
} else {
nextproto = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_FRAGMENT;
}
if ((m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6))
!= 0) {
if (IN6_NEED_CHECKSUM(ifp,
m->m_pkthdr.csum_flags &
(M_CSUM_UDPv6|M_CSUM_TCPv6))) {
in6_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto
* chain.
*/
m0 = m;
for (off = hlen; off < tlen; off += len) {
struct mbuf *mlast;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (!m) {
error = ENOBUFS;
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
m_reset_rcvif(m);
m->m_flags = m0->m_flags & M_COPYFLAGS;
*mnext = m;
mnext = &m->m_nextpkt;
m->m_data += max_linkhdr;
mhip6 = mtod(m, struct ip6_hdr *);
*mhip6 = *ip6;
m->m_len = sizeof(*mhip6);
ip6f = NULL;
error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
if (error) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
/* Fill in the Frag6 Header */
ip6f->ip6f_offlg = htons((u_int16_t)((off - hlen) & ~7));
if (off + len >= tlen)
len = tlen - off;
else
ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
ip6f->ip6f_reserved = 0;
ip6f->ip6f_ident = id;
ip6f->ip6f_nxt = nextproto;
mhip6->ip6_plen = htons((u_int16_t)(len + hlen +
sizeof(*ip6f) - sizeof(struct ip6_hdr)));
if ((m_frgpart = m_copym(m0, off, len, M_DONTWAIT)) == NULL) {
error = ENOBUFS;
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
for (mlast = m; mlast->m_next; mlast = mlast->m_next)
;
mlast->m_next = m_frgpart;
m->m_pkthdr.len = len + hlen + sizeof(*ip6f);
m_reset_rcvif(m);
IP6_STATINC(IP6_STAT_OFRAGMENTS);
in6_ifstat_inc(ifp, ifs6_out_fragcreat);
}
in6_ifstat_inc(ifp, ifs6_out_fragok);
}
sendorfree:
m = m0->m_nextpkt;
m0->m_nextpkt = 0;
m_freem(m0);
for (m0 = m; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
if (error == 0) {
struct in6_ifaddr *ia6;
int s;
ip6 = mtod(m, struct ip6_hdr *);
s = pserialize_read_enter();
ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
if (ia6) {
/*
* Record statistics for this interface
* address.
*/
ia6->ia_ifa.ifa_data.ifad_outbytes +=
m->m_pkthdr.len;
}
pserialize_read_exit(s);
KASSERT(dst != NULL);
error = ip6_if_output(ifp, origifp, m, dst, rt);
} else
m_freem(m);
}
if (error == 0)
IP6_STATINC(IP6_STAT_FRAGMENTED);
done:
rtcache_unref(rt, ro);
if (ro == &ip6route)
rtcache_free(&ip6route);
#ifdef IPSEC
if (sp != NULL)
KEY_SP_UNREF(&sp);
#endif
if_put(ifp, &psref);
if (release_psref_ia)
if_put(origifp, &psref_ia);
curlwp_bindx(bound);
return error;
freehdrs:
m_freem(exthdrs.ip6e_hbh);
m_freem(exthdrs.ip6e_dest1);
m_freem(exthdrs.ip6e_rthdr);
m_freem(exthdrs.ip6e_dest2);
/* FALLTHROUGH */
bad:
m_freem(m);
goto done;
badscope:
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(origifp, ifs6_out_discard);
if (error == 0)
error = EHOSTUNREACH; /* XXX */
goto bad;
}
static int
ip6_copyexthdr(struct mbuf **mp, void *hdr, int hlen)
{
struct mbuf *m;
if (hlen > MCLBYTES)
return ENOBUFS; /* XXX */
MGET(m, M_DONTWAIT, MT_DATA);
if (!m)
return ENOBUFS;
if (hlen > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return ENOBUFS;
}
}
m->m_len = hlen;
if (hdr)
memcpy(mtod(m, void *), hdr, hlen);
*mp = m;
return 0;
}
/*
* Insert jumbo payload option.
*/
static int
ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
{
struct mbuf *mopt;
u_int8_t *optbuf;
u_int32_t v;
#define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */
/*
* If there is no hop-by-hop options header, allocate new one.
* If there is one but it doesn't have enough space to store the
* jumbo payload option, allocate a cluster to store the whole options.
* Otherwise, use it to store the options.
*/
if (exthdrs->ip6e_hbh == NULL) {
MGET(mopt, M_DONTWAIT, MT_DATA);
if (mopt == 0)
return (ENOBUFS);
mopt->m_len = JUMBOOPTLEN;
optbuf = mtod(mopt, u_int8_t *);
optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */
exthdrs->ip6e_hbh = mopt;
} else {
struct ip6_hbh *hbh;
mopt = exthdrs->ip6e_hbh;
if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
const int oldoptlen = mopt->m_len;
struct mbuf *n;
/*
* Assumptions:
* - exthdrs->ip6e_hbh is not referenced from places
* other than exthdrs.
* - exthdrs->ip6e_hbh is not an mbuf chain.
*/
KASSERT(mopt->m_next == NULL);
/*
* Give up if the whole (new) hbh header does not fit
* even in an mbuf cluster.
*/
if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
return ENOBUFS;
/*
* At this point, we must always prepare a cluster.
*/
MGET(n, M_DONTWAIT, MT_DATA);
if (n) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n)
return ENOBUFS;
n->m_len = oldoptlen + JUMBOOPTLEN;
bcopy(mtod(mopt, void *), mtod(n, void *),
oldoptlen);
optbuf = mtod(n, u_int8_t *) + oldoptlen;
m_freem(mopt);
mopt = exthdrs->ip6e_hbh = n;
} else {
optbuf = mtod(mopt, u_int8_t *) + mopt->m_len;
mopt->m_len += JUMBOOPTLEN;
}
optbuf[0] = IP6OPT_PADN;
optbuf[1] = 0;
/*
* Adjust the header length according to the pad and
* the jumbo payload option.
*/
hbh = mtod(mopt, struct ip6_hbh *);
hbh->ip6h_len += (JUMBOOPTLEN >> 3);
}
/* fill in the option. */
optbuf[2] = IP6OPT_JUMBO;
optbuf[3] = 4;
v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
memcpy(&optbuf[4], &v, sizeof(u_int32_t));
/* finally, adjust the packet header length */
exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
return 0;
#undef JUMBOOPTLEN
}
/*
* Insert fragment header and copy unfragmentable header portions.
*
* *frghdrp will not be read, and it is guaranteed that either an
* error is returned or that *frghdrp will point to space allocated
* for the fragment header.
*
* On entry, m contains:
* IPv6 Header
* On exit, it contains:
* IPv6 Header -> Unfragmentable Part -> Frag6 Header
*/
static int
ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
struct ip6_frag **frghdrp)
{
struct mbuf *n, *mlast;
if (hlen > sizeof(struct ip6_hdr)) {
n = m_copym(m0, sizeof(struct ip6_hdr),
hlen - sizeof(struct ip6_hdr), M_DONTWAIT);
if (n == NULL)
return ENOBUFS;
m->m_next = n;
} else
n = m;
/* Search for the last mbuf of unfragmentable part. */
for (mlast = n; mlast->m_next; mlast = mlast->m_next)
;
if ((mlast->m_flags & M_EXT) == 0 &&
M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
/* use the trailing space of the last mbuf for the fragment hdr */
*frghdrp = (struct ip6_frag *)(mtod(mlast, char *) +
mlast->m_len);
mlast->m_len += sizeof(struct ip6_frag);
} else {
/* allocate a new mbuf for the fragment header */
struct mbuf *mfrg;
MGET(mfrg, M_DONTWAIT, MT_DATA);
if (mfrg == NULL)
return ENOBUFS;
mfrg->m_len = sizeof(struct ip6_frag);
*frghdrp = mtod(mfrg, struct ip6_frag *);
mlast->m_next = mfrg;
}
return 0;
}
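/*
* Determine the path MTU to use: prefer the MTU recorded on the route,
* fall back to the interface MTU, and set *alwaysfragp when the recorded
* path MTU is below IPV6_MMTU so that a fragment header is always
* attached (RFC 2460 section 5).
*/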
static int
ip6_getpmtu(struct rtentry *rt, struct ifnet *ifp, u_long *mtup,
int *alwaysfragp)
{
u_int32_t mtu = 0;
int alwaysfrag = 0;
int error = 0;
if (rt != NULL) {
if (ifp == NULL)
ifp = rt->rt_ifp;
mtu = rt->rt_rmx.rmx_mtu;
if (mtu == 0)
mtu = ifp->if_mtu;
else if (mtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph:
* if we record ICMPv6 too big message with
* mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
* or smaller, with fragment header attached.
* (a fragment header is needed regardless of the
* packet size, for translators to identify packets)
*/
alwaysfrag = 1;
mtu = IPV6_MMTU;
} else if (mtu > ifp->if_mtu) {
/*
* The MTU on the route is larger than the MTU on
* the interface! This shouldn't happen, unless the
* MTU of the interface has been changed after the
* interface was brought up. Change the MTU in the
* route to match the interface MTU (as long as the
* field isn't locked).
*/
mtu = ifp->if_mtu;
if (!(rt->rt_rmx.rmx_locks & RTV_MTU))
rt->rt_rmx.rmx_mtu = mtu;
}
} else if (ifp) {
mtu = ifp->if_mtu;
} else
error = EHOSTUNREACH; /* XXX */
*mtup = mtu;
if (alwaysfragp)
*alwaysfragp = alwaysfrag;
return (error);
}
/*
* IP6 socket option processing.
*/
int
ip6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int optdatalen, uproto;
void *optdata;
struct inpcb *inp = sotoinpcb(so);
struct ip_moptions **mopts;
int error, optval;
int level, optname;
KASSERT(solocked(so));
KASSERT(sopt != NULL);
level = sopt->sopt_level;
optname = sopt->sopt_name;
error = optval = 0;
uproto = (int)so->so_proto->pr_protocol;
switch (level) {
case IPPROTO_IP:
switch (optname) {
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
case IP_MULTICAST_IF:
case IP_MULTICAST_LOOP:
case IP_MULTICAST_TTL:
mopts = &inp->inp_moptions;
switch (op) {
case PRCO_GETOPT:
return ip_getmoptions(*mopts, sopt);
case PRCO_SETOPT:
return ip_setmoptions(mopts, sopt);
default:
return EINVAL;
}
default:
return ENOPROTOOPT;
}
case IPPROTO_IPV6:
break;
default:
return ENOPROTOOPT;
}
switch (op) {
case PRCO_SETOPT:
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTOPTIONS:
error = ip6_pcbopts(&in6p_outputopts(inp), so, sopt);
break;
#endif
/*
* Use of some Hop-by-Hop options or some
* Destination options, might require special
* privilege. That is, normal applications
* (without special privilege) might be forbidden
* from setting certain options in outgoing packets,
* and might never see certain options in received
* packets. [RFC 2292 Section 6]
* KAME specific note:
* KAME prevents non-privileged users from sending or
* receiving ANY hbh/dst options in order to avoid
* overhead of parsing options in the kernel.
*/
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_RECVRTHDRDSTOPTS:
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP,
NULL, NULL, NULL);
if (error)
break;
/* FALLTHROUGH */
case IPV6_UNICAST_HOPS:
case IPV6_HOPLIMIT:
case IPV6_FAITH:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_RECVTCLASS:
case IPV6_V6ONLY:
case IPV6_BINDANY:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optname) {
case IPV6_UNICAST_HOPS:
if (optval < -1 || optval >= 256)
error = EINVAL;
else {
/* -1 = kernel default */
in6p_hops6(inp) = optval;
}
break;
#define OPTSET(bit) \
do { \
if (optval) \
inp->inp_flags |= (bit); \
else \
inp->inp_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#ifdef RFC2292
#define OPTSET2292(bit) \
do { \
inp->inp_flags |= IN6P_RFC2292; \
if (optval) \
inp->inp_flags |= (bit); \
else \
inp->inp_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#endif
#define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0)
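/*
* OPTSET sets or clears the given IN6P_* flag according to optval;
* OPTSET2292 does the same but also marks the pcb as using the RFC 2292
* API; OPTBIT tests whether a flag is set.
*/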
case IPV6_RECVPKTINFO:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_PKTINFO);
break;
case IPV6_HOPLIMIT:
{
struct ip6_pktopts **optp;
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(IPV6_HOPLIMIT,
(u_char *)&optval,
sizeof(optval),
optp,
kauth_cred_get(), uproto);
break;
}
case IPV6_RECVHOPLIMIT:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_HOPLIMIT);
break;
case IPV6_RECVHOPOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDRDSTOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_RTHDRDSTOPTS);
break;
case IPV6_RECVRTHDR:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_RTHDR);
break;
case IPV6_FAITH:
OPTSET(IN6P_FAITH);
break;
case IPV6_RECVPATHMTU:
/*
* We ignore this option for TCP
* sockets.
* (RFC3542 leaves this case
* unspecified.)
*/
if (uproto != IPPROTO_TCP)
OPTSET(IN6P_MTU);
break;
case IPV6_V6ONLY:
/*
* make setsockopt(IPV6_V6ONLY)
* available only prior to bind(2).
* see ipng mailing list, Jun 22 2001.
*/
if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
error = EINVAL;
break;
}
#ifdef INET6_BINDV6ONLY
if (!optval)
error = EINVAL;
#else
OPTSET(IN6P_IPV6_V6ONLY);
#endif
break;
case IPV6_RECVTCLASS:
#ifdef RFC2292
/* cannot mix with RFC2292 XXX */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_TCLASS);
break;
case IPV6_BINDANY:
error = kauth_authorize_network(
kauth_cred_get(), KAUTH_NETWORK_BIND,
KAUTH_REQ_NETWORK_BIND_ANYADDR, so, NULL,
NULL);
if (error)
break;
OPTSET(IN6P_BINDANY);
break;
}
break;
case IPV6_OTCLASS:
{
struct ip6_pktopts **optp;
u_int8_t tclass;
error = sockopt_get(sopt, &tclass, sizeof(tclass));
if (error)
break;
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname,
(u_char *)&tclass,
sizeof(tclass),
optp,
kauth_cred_get(), uproto);
break;
}
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
case IPV6_PREFER_TEMPADDR:
error = sockopt_getint(sopt, &optval);
if (error)
break;
{
struct ip6_pktopts **optp;
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname,
(u_char *)&optval,
sizeof(optval),
optp,
kauth_cred_get(), uproto);
break;
}
#ifdef RFC2292
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292HOPOPTS:
case IPV6_2292DSTOPTS:
case IPV6_2292RTHDR:
/* RFC 2292 */
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optname) {
case IPV6_2292PKTINFO:
OPTSET2292(IN6P_PKTINFO);
break;
case IPV6_2292HOPLIMIT:
OPTSET2292(IN6P_HOPLIMIT);
break;
case IPV6_2292HOPOPTS:
/*
* Check super-user privilege.
* See comments for IPV6_RECVHOPOPTS.
*/
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL,
NULL, NULL);
if (error)
return (error);
OPTSET2292(IN6P_HOPOPTS);
break;
case IPV6_2292DSTOPTS:
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL,
NULL, NULL);
if (error)
return (error);
OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
break;
case IPV6_2292RTHDR:
OPTSET2292(IN6P_RTHDR);
break;
}
break;
#endif
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_NEXTHOP: {
/* new advanced API (RFC3542) */
void *optbuf;
int optbuflen;
struct ip6_pktopts **optp;
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
optbuflen = sopt->sopt_size;
optbuf = malloc(optbuflen, M_IP6OPT, M_NOWAIT);
if (optbuf == NULL) {
error = ENOBUFS;
break;
}
error = sockopt_get(sopt, optbuf, optbuflen);
if (error) {
free(optbuf, M_IP6OPT);
break;
}
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname, optbuf, optbuflen,
optp, kauth_cred_get(), uproto);
free(optbuf, M_IP6OPT);
break;
}
#undef OPTSET
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
error = ip6_setmoptions(sopt, inp);
break;
case IPV6_PORTRANGE:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optval) {
case IPV6_PORTRANGE_DEFAULT:
inp->inp_flags &= ~(IN6P_LOWPORT);
inp->inp_flags &= ~(IN6P_HIGHPORT);
break;
case IPV6_PORTRANGE_HIGH:
inp->inp_flags &= ~(IN6P_LOWPORT);
inp->inp_flags |= IN6P_HIGHPORT;
break;
case IPV6_PORTRANGE_LOW:
inp->inp_flags &= ~(IN6P_HIGHPORT);
inp->inp_flags |= IN6P_LOWPORT;
break;
default:
error = EINVAL;
break;
}
break;
case IPV6_PORTALGO:
error = sockopt_getint(sopt, &optval);
if (error)
break;
error = portalgo_algo_index_select(inp, optval);
break;
#if defined(IPSEC)
case IPV6_IPSEC_POLICY:
if (ipsec_enabled) {
error = ipsec_set_policy(inp,
sopt->sopt_data, sopt->sopt_size,
kauth_cred_get());
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTOPTIONS:
/*
* RFC3542 (effectively) deprecated the
* semantics of the 2292-style pktoptions.
* Since it was not reliable in nature (i.e.,
* applications had to expect the lack of some
* information after all), it would make sense
* to simplify this part by always returning
* empty data.
*/
break;
#endif
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_RECVRTHDRDSTOPTS:
case IPV6_UNICAST_HOPS:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_FAITH:
case IPV6_V6ONLY:
case IPV6_PORTRANGE:
case IPV6_RECVTCLASS:
case IPV6_BINDANY:
switch (optname) {
case IPV6_RECVHOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDRDSTOPTS:
optval = OPTBIT(IN6P_RTHDRDSTOPTS);
break;
case IPV6_UNICAST_HOPS:
optval = in6p_hops6(inp);
break;
case IPV6_RECVPKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_RECVHOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_RECVRTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_RECVPATHMTU:
optval = OPTBIT(IN6P_MTU);
break;
case IPV6_FAITH:
optval = OPTBIT(IN6P_FAITH);
break;
case IPV6_V6ONLY:
optval = OPTBIT(IN6P_IPV6_V6ONLY);
break;
case IPV6_PORTRANGE:
{
int flags;
flags = inp->inp_flags;
if (flags & IN6P_HIGHPORT)
optval = IPV6_PORTRANGE_HIGH;
else if (flags & IN6P_LOWPORT)
optval = IPV6_PORTRANGE_LOW;
else
optval = 0;
break;
}
case IPV6_RECVTCLASS:
optval = OPTBIT(IN6P_TCLASS);
break;
case IPV6_BINDANY:
optval = OPTBIT(IN6P_BINDANY);
break;
}
if (error)
break;
error = sockopt_setint(sopt, optval);
break;
case IPV6_PATHMTU:
{
u_long pmtu = 0;
struct ip6_mtuinfo mtuinfo;
struct route *ro = &inp->inp_route;
struct rtentry *rt;
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
if (!(so->so_state & SS_ISCONNECTED))
return (ENOTCONN);
/*
* XXX: we do not consider the case of source
* routing, or optional information to specify
* the outgoing interface.
*/
sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0);
rt = rtcache_lookup(ro, &u.dst);
error = ip6_getpmtu(rt, NULL, &pmtu, NULL);
rtcache_unref(rt, ro);
if (error)
break;
if (pmtu > IPV6_MAXPACKET)
pmtu = IPV6_MAXPACKET;
memset(&mtuinfo, 0, sizeof(mtuinfo));
mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
optdata = (void *)&mtuinfo;
optdatalen = sizeof(mtuinfo);
if (optdatalen > MCLBYTES)
return (EMSGSIZE); /* XXX */
error = sockopt_set(sopt, optdata, optdatalen);
break;
}
#ifdef RFC2292
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292HOPOPTS:
case IPV6_2292RTHDR:
case IPV6_2292DSTOPTS:
switch (optname) {
case IPV6_2292PKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_2292HOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_2292HOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_2292RTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_2292DSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
break;
}
error = sockopt_setint(sopt, optval);
break;
#endif
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_NEXTHOP:
case IPV6_OTCLASS:
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
case IPV6_PREFER_TEMPADDR:
error = ip6_getpcbopt(in6p_outputopts(inp),
optname, sopt);
break;
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
error = ip6_getmoptions(sopt, inp);
break;
case IPV6_PORTALGO:
optval = inp->inp_portalgo;
error = sockopt_setint(sopt, optval);
break;
#if defined(IPSEC)
case IPV6_IPSEC_POLICY:
if (ipsec_used) {
struct mbuf *m = NULL;
/*
* XXX: this will return EINVAL as sopt is
* empty
*/
error = ipsec_get_policy(inp, sopt->sopt_data,
sopt->sopt_size, &m);
			if (!error)
				error = sockopt_setmbuf(sopt, m);
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
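/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the PRCO_SETOPT and PRCO_GETOPT paths above are reached through
 * setsockopt(2) and getsockopt(2) at the IPPROTO_IPV6 level.  The
 * descriptor "s" is an assumed AF_INET6 socket; the option names match
 * the cases handled above.
 *
 *	int range = IPV6_PORTRANGE_HIGH;
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_PORTRANGE,
 *	    &range, sizeof(range)) == -1)
 *		err(1, "IPV6_PORTRANGE");
 *
 *	int v6only;
 *	socklen_t len = sizeof(v6only);
 *	if (getsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, &len) == -1)
 *		err(1, "IPV6_V6ONLY");
 */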
int
ip6_raw_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int error = 0, optval;
const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
struct inpcb *inp = sotoinpcb(so);
int level, optname;
KASSERT(sopt != NULL);
level = sopt->sopt_level;
optname = sopt->sopt_name;
if (level != IPPROTO_IPV6) {
return ENOPROTOOPT;
}
switch (optname) {
case IPV6_CHECKSUM:
/*
		 * For ICMPv6 sockets, no modification is allowed for the
		 * checksum offset; permit "no change" values to help
		 * existing apps.
*
* XXX RFC3542 says: "An attempt to set IPV6_CHECKSUM
* for an ICMPv6 socket will fail." The current
* behavior does not meet RFC3542.
*/
switch (op) {
case PRCO_SETOPT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
if (optval < -1 ||
(optval > 0 && (optval % 2) != 0)) {
/*
* The API assumes non-negative even offset
* values or -1 as a special value.
*/
error = EINVAL;
} else if (so->so_proto->pr_protocol ==
IPPROTO_ICMPV6) {
if (optval != icmp6off)
error = EINVAL;
} else
in6p_cksum(inp) = optval;
break;
case PRCO_GETOPT:
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
optval = icmp6off;
else
optval = in6p_cksum(inp);
error = sockopt_setint(sopt, optval);
break;
default:
error = EINVAL;
break;
}
break;
default:
error = ENOPROTOOPT;
break;
}
return (error);
}
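/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * IPV6_CHECKSUM is typically used on raw sockets for protocols other than
 * ICMPv6, so that the kernel fills in the checksum at the given even
 * offset.  The protocol number (89, OSPFv3) and offset 12 (the OSPFv3
 * checksum field) are only assumed example values.
 *
 *	int s = socket(AF_INET6, SOCK_RAW, 89);
 *	int off = 12;
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM, &off, sizeof(off)) == -1)
 *		err(1, "IPV6_CHECKSUM");
 */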
#ifdef RFC2292
/*
* Set up IP6 options in pcb for insertion in output packets or
* specifying behavior of outgoing packets.
*/
static int
ip6_pcbopts(struct ip6_pktopts **pktopt, struct socket *so,
struct sockopt *sopt)
{
struct ip6_pktopts *opt = *pktopt;
struct mbuf *m;
int error = 0;
KASSERT(solocked(so));
/* turn off any old options. */
if (opt) {
#ifdef DIAGNOSTIC
		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
opt->ip6po_rhinfo.ip6po_rhi_rthdr)
printf("ip6_pcbopts: all specified options are cleared.\n");
#endif
ip6_clearpktopts(opt, -1);
} else {
opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT);
if (opt == NULL)
return (ENOBUFS);
}
*pktopt = NULL;
if (sopt == NULL || sopt->sopt_size == 0) {
/*
		 * We are only turning off any previous options, regardless
		 * of whether opt was just allocated above or passed in.
*/
free(opt, M_IP6OPT);
return (0);
}
/* set options specified by user. */
m = sockopt_getmbuf(sopt);
if (m == NULL) {
free(opt, M_IP6OPT);
return (ENOBUFS);
}
error = ip6_setpktopts(m, opt, NULL, kauth_cred_get(),
so->so_proto->pr_protocol);
m_freem(m);
if (error != 0) {
ip6_clearpktopts(opt, -1); /* XXX: discard all options */
free(opt, M_IP6OPT);
return (error);
}
*pktopt = opt;
return (0);
}
#endif
/*
* initialize ip6_pktopts. beware that there are non-zero default values in
* the struct.
*/
void
ip6_initpktopts(struct ip6_pktopts *opt)
{
memset(opt, 0, sizeof(*opt));
opt->ip6po_hlim = -1; /* -1 means default hop limit */
opt->ip6po_tclass = -1; /* -1 means default traffic class */
opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
}
#define sin6tosa(sin6) ((struct sockaddr *)(sin6)) /* XXX */
static int
ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
kauth_cred_t cred, int uproto)
{
struct ip6_pktopts *opt;
if (*pktopt == NULL) {
*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
M_NOWAIT);
if (*pktopt == NULL)
return (ENOBUFS);
ip6_initpktopts(*pktopt);
}
opt = *pktopt;
return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
}
static int
ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
{
void *optdata = NULL;
int optdatalen = 0;
struct ip6_ext *ip6e;
int error = 0;
struct in6_pktinfo null_pktinfo;
int deftclass = 0, on;
int defminmtu = IP6PO_MINMTU_MCASTONLY;
int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
switch (optname) {
case IPV6_PKTINFO:
		if (pktopt && pktopt->ip6po_pktinfo)
			optdata = (void *)pktopt->ip6po_pktinfo;
else {
/* XXX: we don't have to do this every time... */
memset(&null_pktinfo, 0, sizeof(null_pktinfo));
optdata = (void *)&null_pktinfo;
}
optdatalen = sizeof(struct in6_pktinfo);
break;
case IPV6_OTCLASS:
/* XXX */
return (EINVAL);
case IPV6_TCLASS:
if (pktopt && pktopt->ip6po_tclass >= 0)
optdata = (void *)&pktopt->ip6po_tclass;
else
optdata = (void *)&deftclass;
optdatalen = sizeof(int);
break;
case IPV6_HOPOPTS:
		if (pktopt && pktopt->ip6po_hbh) {
			optdata = (void *)pktopt->ip6po_hbh;
ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDR:
		if (pktopt && pktopt->ip6po_rthdr) {
			optdata = (void *)pktopt->ip6po_rthdr;
ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDRDSTOPTS:
		if (pktopt && pktopt->ip6po_dest1) {
			optdata = (void *)pktopt->ip6po_dest1;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_DSTOPTS:
		if (pktopt && pktopt->ip6po_dest2) {
			optdata = (void *)pktopt->ip6po_dest2;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_NEXTHOP:
		if (pktopt && pktopt->ip6po_nexthop) {
			optdata = (void *)pktopt->ip6po_nexthop;
optdatalen = pktopt->ip6po_nexthop->sa_len;
}
break;
case IPV6_USE_MIN_MTU:
if (pktopt)
optdata = (void *)&pktopt->ip6po_minmtu;
else
optdata = (void *)&defminmtu;
optdatalen = sizeof(int);
break;
case IPV6_DONTFRAG:
if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
on = 1;
else
on = 0;
optdata = (void *)&on;
optdatalen = sizeof(on);
break;
case IPV6_PREFER_TEMPADDR:
if (pktopt)
optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
else
optdata = (void *)&defpreftemp;
optdatalen = sizeof(int);
break;
default: /* should not happen */
#ifdef DIAGNOSTIC
panic("ip6_getpcbopt: unexpected option\n");
#endif
return (ENOPROTOOPT);
}
error = sockopt_set(sopt, optdata, optdatalen);
return (error);
}
void
ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
{
	if (optname == -1 || optname == IPV6_PKTINFO) {
		if (pktopt->ip6po_pktinfo)
			free(pktopt->ip6po_pktinfo, M_IP6OPT);
		pktopt->ip6po_pktinfo = NULL;
	}
	if (optname == -1 || optname == IPV6_HOPLIMIT)
		pktopt->ip6po_hlim = -1;
	if (optname == -1 || optname == IPV6_TCLASS)
		pktopt->ip6po_tclass = -1;
	if (optname == -1 || optname == IPV6_NEXTHOP) {
		rtcache_free(&pktopt->ip6po_nextroute);
		if (pktopt->ip6po_nexthop)
			free(pktopt->ip6po_nexthop, M_IP6OPT);
		pktopt->ip6po_nexthop = NULL;
	}
	if (optname == -1 || optname == IPV6_HOPOPTS) {
		if (pktopt->ip6po_hbh)
			free(pktopt->ip6po_hbh, M_IP6OPT);
		pktopt->ip6po_hbh = NULL;
	}
	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
		if (pktopt->ip6po_dest1)
			free(pktopt->ip6po_dest1, M_IP6OPT);
		pktopt->ip6po_dest1 = NULL;
	}
	if (optname == -1 || optname == IPV6_RTHDR) {
		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
		rtcache_free(&pktopt->ip6po_route);
	}
	if (optname == -1 || optname == IPV6_DSTOPTS) {
		if (pktopt->ip6po_dest2)
			free(pktopt->ip6po_dest2, M_IP6OPT);
		pktopt->ip6po_dest2 = NULL;
	}
}
#define PKTOPT_EXTHDRCPY(type) \
do { \
if (src->type) { \
int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
dst->type = malloc(hlen, M_IP6OPT, canwait); \
if (dst->type == NULL) \
goto bad; \
memcpy(dst->type, src->type, hlen); \
} \
} while (/*CONSTCOND*/ 0)
static int
copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
{
dst->ip6po_hlim = src->ip6po_hlim;
dst->ip6po_tclass = src->ip6po_tclass;
dst->ip6po_flags = src->ip6po_flags;
dst->ip6po_minmtu = src->ip6po_minmtu;
dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
if (src->ip6po_pktinfo) {
dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
M_IP6OPT, canwait);
if (dst->ip6po_pktinfo == NULL)
goto bad;
*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
}
if (src->ip6po_nexthop) {
dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
M_IP6OPT, canwait);
if (dst->ip6po_nexthop == NULL)
goto bad;
memcpy(dst->ip6po_nexthop, src->ip6po_nexthop,
src->ip6po_nexthop->sa_len);
}
PKTOPT_EXTHDRCPY(ip6po_hbh);
PKTOPT_EXTHDRCPY(ip6po_dest1);
PKTOPT_EXTHDRCPY(ip6po_dest2);
	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* does not copy the cached route */
return (0);
bad:
if (dst->ip6po_pktinfo) free(dst->ip6po_pktinfo, M_IP6OPT);
if (dst->ip6po_nexthop) free(dst->ip6po_nexthop, M_IP6OPT);
if (dst->ip6po_hbh) free(dst->ip6po_hbh, M_IP6OPT);
if (dst->ip6po_dest1) free(dst->ip6po_dest1, M_IP6OPT);
if (dst->ip6po_dest2) free(dst->ip6po_dest2, M_IP6OPT);
if (dst->ip6po_rthdr) free(dst->ip6po_rthdr, M_IP6OPT);
return (ENOBUFS);
}
#undef PKTOPT_EXTHDRCPY
struct ip6_pktopts *
ip6_copypktopts(struct ip6_pktopts *src, int canwait)
{
int error;
struct ip6_pktopts *dst;
dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
if (dst == NULL)
return (NULL);
ip6_initpktopts(dst);
if ((error = copypktopts(dst, src, canwait)) != 0) {
free(dst, M_IP6OPT);
return (NULL);
}
return (dst);
}
void
ip6_freepcbopts(struct ip6_pktopts *pktopt)
{
if (pktopt == NULL)
return;
ip6_clearpktopts(pktopt, -1);
free(pktopt, M_IP6OPT);
}
int
ip6_get_membership(const struct sockopt *sopt, struct ifnet **ifp,
struct psref *psref, void *v, size_t l)
{
struct ipv6_mreq mreq;
int error;
struct in6_addr *ia = &mreq.ipv6mr_multiaddr;
struct in_addr *ia4 = (void *)&ia->s6_addr32[3];
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error != 0)
return error;
if (IN6_IS_ADDR_UNSPECIFIED(ia)) {
/*
		 * The unspecified address is used to request acceptance of
		 * all multicast addresses.  Only the superuser is allowed
		 * to do this.
*/
if (kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_JOIN_MULTICAST, NULL, NULL, NULL))
return EACCES;
} else if (IN6_IS_ADDR_V4MAPPED(ia)) {
// Don't bother if we are not going to use ifp.
		if (l == sizeof(*ia)) {
			memcpy(v, ia, l);
return 0;
}
} else if (!IN6_IS_ADDR_MULTICAST(ia)) {
return EINVAL;
}
/*
* If no interface was explicitly specified, choose an
* appropriate one according to the given multicast address.
*/
if (mreq.ipv6mr_interface == 0) {
struct rtentry *rt;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
struct sockaddr_in6 dst6;
} u;
struct route ro;
/*
* Look up the routing table for the
* address, and choose the outgoing interface.
* XXX: is it a good approach?
*/
memset(&ro, 0, sizeof(ro));
		if (IN6_IS_ADDR_V4MAPPED(ia))
			sockaddr_in_init(&u.dst4, ia4, 0);
else
sockaddr_in6_init(&u.dst6, ia, 0, 0, 0);
error = rtcache_setdst(&ro, &u.dst);
if (error != 0)
return error;
rt = rtcache_init(&ro);
*ifp = rt != NULL ? if_get_byindex(rt->rt_ifp->if_index, psref) : NULL;
rtcache_unref(rt, &ro);
rtcache_free(&ro);
} else {
/*
* If the interface is specified, validate it.
*/
*ifp = if_get_byindex(mreq.ipv6mr_interface, psref);
if (*ifp == NULL)
return ENXIO; /* XXX EINVAL? */
}
if (sizeof(*ia) == l)
memcpy(v, ia, l);
else
memcpy(v, ia4, l);
return 0;
}
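/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the struct ipv6_mreq consumed above is normally filled in as follows
 * before an IPV6_JOIN_GROUP setsockopt(2) call.  The socket "s", the
 * group address and the interface name "wm0" are assumptions for the
 * example.
 *
 *	struct ipv6_mreq mreq;
 *	memset(&mreq, 0, sizeof(mreq));
 *	inet_pton(AF_INET6, "ff02::1:3", &mreq.ipv6mr_multiaddr);
 *	mreq.ipv6mr_interface = if_nametoindex("wm0");
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
 *	    &mreq, sizeof(mreq)) == -1)
 *		err(1, "IPV6_JOIN_GROUP");
 */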
/*
* Set the IP6 multicast options in response to user setsockopt().
*/
static int
ip6_setmoptions(const struct sockopt *sopt, struct inpcb *inp)
{
int error = 0;
u_int loop, ifindex;
struct ipv6_mreq mreq;
struct in6_addr ia;
struct ifnet *ifp;
struct ip6_moptions *im6o = in6p_moptions(inp);
struct in6_multi_mship *imm;
	KASSERT(inp_locked(inp));

	if (im6o == NULL) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
		im6o = malloc(sizeof(*im6o), M_IPMOPTS, M_NOWAIT);
		if (im6o == NULL)
return (ENOBUFS);
in6p_moptions(inp) = im6o;
im6o->im6o_multicast_if_index = 0;
im6o->im6o_multicast_hlim = ip6_defmcasthlim;
im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
LIST_INIT(&im6o->im6o_memberships);
}
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF: {
int s;
/*
* Select the interface for outgoing multicast packets.
*/
error = sockopt_get(sopt, &ifindex, sizeof(ifindex));
if (error != 0)
break;
s = pserialize_read_enter();
if (ifindex != 0) {
if ((ifp = if_byindex(ifindex)) == NULL) {
pserialize_read_exit(s);
error = ENXIO; /* XXX EINVAL? */
break;
}
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
} else
ifp = NULL;
im6o->im6o_multicast_if_index = if_get_index(ifp);
pserialize_read_exit(s);
break;
}
case IPV6_MULTICAST_HOPS:
{
/*
* Set the IP6 hoplimit for outgoing multicast packets.
*/
int optval;
error = sockopt_getint(sopt, &optval);
if (error != 0)
break;
if (optval < -1 || optval >= 256)
error = EINVAL;
else if (optval == -1)
im6o->im6o_multicast_hlim = ip6_defmcasthlim;
else
im6o->im6o_multicast_hlim = optval;
break;
}
case IPV6_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
error = sockopt_get(sopt, &loop, sizeof(loop));
if (error != 0)
break;
if (loop > 1) {
error = EINVAL;
break;
}
im6o->im6o_multicast_loop = loop;
break;
case IPV6_JOIN_GROUP: {
int bound;
struct psref psref;
/*
* Add a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
bound = curlwp_bind();
ifp = NULL;
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
if (error != 0) {
			KASSERT(ifp == NULL);
			curlwp_bindx(bound);
return error;
}
		if (IN6_IS_ADDR_V4MAPPED(&ia)) {
			error = ip_setmoptions(&inp->inp_moptions, sopt);
goto put_break;
}
/*
* See if we found an interface, and confirm that it
* supports multicast
*/
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
goto put_break;
}
if (in6_setscope(&ia, ifp, NULL)) {
error = EADDRNOTAVAIL; /* XXX: should not happen */
goto put_break;
}
/*
* See if the membership already exists.
*/
		LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) {
			if (imm->i6mm_maddr->in6m_ifp == ifp &&
			    IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
			    &ia))
				break;
}
if (imm != NULL) {
error = EADDRINUSE;
goto put_break;
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
imm = in6_joingroup(ifp, &ia, &error, 0);
if (imm == NULL)
goto put_break;
LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
put_break:
if_put(ifp, &psref);
curlwp_bindx(bound);
break;
}
case IPV6_LEAVE_GROUP: {
/*
* Drop a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error != 0)
break;
		if (IN6_IS_ADDR_V4MAPPED(&mreq.ipv6mr_multiaddr)) {
			error = ip_setmoptions(&inp->inp_moptions, sopt);
break;
}
/*
* If an interface address was specified, get a pointer
* to its ifnet structure.
*/
if (mreq.ipv6mr_interface != 0) {
if ((ifp = if_byindex(mreq.ipv6mr_interface)) == NULL) {
error = ENXIO; /* XXX EINVAL? */
break;
}
} else
ifp = NULL;
/* Fill in the scope zone ID */
if (ifp) {
if (in6_setscope(&mreq.ipv6mr_multiaddr, ifp, NULL)) {
/* XXX: should not happen */
error = EADDRNOTAVAIL;
break;
}
} else if (mreq.ipv6mr_interface != 0) {
/*
			 * XXX: This case would happen when the (positive)
			 * index is in the valid range, but the corresponding
			 * interface has been detached dynamically.  The above
			 * check probably prevents such a case from reaching
			 * here, but we check it explicitly for safety.
*/
error = EADDRNOTAVAIL;
break;
} else { /* ipv6mr_interface == 0 */
struct sockaddr_in6 sa6_mc;
/*
* The API spec says as follows:
* If the interface index is specified as 0, the
* system may choose a multicast group membership to
* drop by matching the multicast address only.
* On the other hand, we cannot disambiguate the scope
* zone unless an interface is provided. Thus, we
* check if there's ambiguity with the default scope
* zone as the last resort.
*/
sockaddr_in6_init(&sa6_mc, &mreq.ipv6mr_multiaddr,
0, 0, 0);
error = sa6_embedscope(&sa6_mc, ip6_use_defzone);
if (error != 0)
break;
			mreq.ipv6mr_multiaddr = sa6_mc.sin6_addr;
		}
/*
* Find the membership in the membership list.
*/
		LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) {
			if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) &&
IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
&mreq.ipv6mr_multiaddr))
break;
}
if (imm == NULL) {
/* Unable to resolve interface */
error = EADDRNOTAVAIL;
break;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
		/* in6m_ifp should not go away while we hold inp_lock */
break;
}
default:
error = EOPNOTSUPP;
break;
}
/*
	 * If all options have default values, there is no need to keep
	 * the option structure around.
*/
	if (im6o->im6o_multicast_if_index == 0 &&
	    im6o->im6o_multicast_hlim == ip6_defmcasthlim &&
	    im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP &&
LIST_EMPTY(&im6o->im6o_memberships)) {
free(in6p_moptions(inp), M_IPMOPTS);
in6p_moptions(inp) = NULL;
}
return (error);
}
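/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the remaining multicast options handled above take an interface index
 * (unsigned int), a hop limit (int) and a loopback flag (unsigned int),
 * matching the object sizes read by ip6_setmoptions().  "s" and "wm0"
 * are assumptions for the example.
 *
 *	unsigned int ifindex = if_nametoindex("wm0");
 *	int hops = 64;
 *	unsigned int loop = 0;
 *	setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF, &ifindex, sizeof(ifindex));
 *	setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, sizeof(hops));
 *	setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &loop, sizeof(loop));
 */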
/*
* Return the IP6 multicast options in response to user getsockopt().
*/
static int
ip6_getmoptions(struct sockopt *sopt, struct inpcb *inp)
{
u_int optval;
int error;
struct ip6_moptions *im6o = in6p_moptions(inp);
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF:
if (im6o == NULL || im6o->im6o_multicast_if_index == 0)
optval = 0;
else
optval = im6o->im6o_multicast_if_index;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IPV6_MULTICAST_HOPS:
if (im6o == NULL)
optval = ip6_defmcasthlim;
else
optval = im6o->im6o_multicast_hlim;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IPV6_MULTICAST_LOOP:
if (im6o == NULL)
optval = IPV6_DEFAULT_MULTICAST_LOOP;
else
optval = im6o->im6o_multicast_loop;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
default:
error = EOPNOTSUPP;
}
return (error);
}
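/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * reading one of the values returned above.  As in ip6_getmoptions(),
 * the result is a full-width unsigned integer; "s" is an assumed
 * AF_INET6 socket.
 *
 *	unsigned int hops;
 *	socklen_t len = sizeof(hops);
 *	if (getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, &len) == -1)
 *		err(1, "IPV6_MULTICAST_HOPS");
 */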
/*
* Discard the IP6 multicast options.
*/
void
ip6_freemoptions(struct ip6_moptions *im6o)
{
struct in6_multi_mship *imm, *nimm;
if (im6o == NULL)
return;
/* The owner of im6o (inp) should be protected by solock */
	LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships, i6mm_chain, nimm) {
		LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
free(im6o, M_IPMOPTS);
}
/*
* Set IPv6 outgoing packet options based on advanced API.
*/
int
ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
struct ip6_pktopts *stickyopt, kauth_cred_t cred, int uproto)
{
struct cmsghdr *cm = 0;
if (control == NULL || opt == NULL)
return (EINVAL);
ip6_initpktopts(opt);
if (stickyopt) {
int error;
/*
* If stickyopt is provided, make a local copy of the options
* for this particular packet, then override them by ancillary
* objects.
* XXX: copypktopts() does not copy the cached route to a next
* hop (if any). This is not very good in terms of efficiency,
* but we can allow this since this option should be rarely
* used.
*/
if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
return (error);
}
/*
* XXX: Currently, we assume all the optional information is stored
* in a single mbuf.
*/
if (control->m_next)
return (EINVAL);
/* XXX if cm->cmsg_len is not aligned, control->m_len can become <0 */
for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
int error;
if (control->m_len < CMSG_LEN(0))
return (EINVAL);
cm = mtod(control, struct cmsghdr *);
if (cm->cmsg_len < CMSG_LEN(0) || cm->cmsg_len > control->m_len)
return (EINVAL);
if (cm->cmsg_level != IPPROTO_IPV6)
continue;
error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
if (error)
return (error);
}
return (0);
}
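/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the ancillary data parsed above is built with the CMSG_*() macros on
 * the sending side, e.g. to attach an IPV6_PKTINFO object to a single
 * sendmsg(2) call.  The names "s", "dst" (a struct sockaddr_in6), "buf",
 * "buflen" and the interface name "wm0" are assumptions for the example.
 *
 *	struct msghdr msg;
 *	struct iovec iov;
 *	struct cmsghdr *cmsg;
 *	struct in6_pktinfo *pi;
 *	char cbuf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
 *
 *	memset(&msg, 0, sizeof(msg));
 *	memset(cbuf, 0, sizeof(cbuf));
 *	msg.msg_name = &dst;
 *	msg.msg_namelen = sizeof(dst);
 *	iov.iov_base = buf;
 *	iov.iov_len = buflen;
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = IPPROTO_IPV6;
 *	cmsg->cmsg_type = IPV6_PKTINFO;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
 *	pi = (struct in6_pktinfo *)CMSG_DATA(cmsg);
 *	memset(pi, 0, sizeof(*pi));
 *	pi->ipi6_ifindex = if_nametoindex("wm0");
 *	if (sendmsg(s, &msg, 0) == -1)
 *		err(1, "sendmsg");
 */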
/*
* Set a particular packet option, as a sticky option or an ancillary data
* item. "len" can be 0 only when it's a sticky option.
* We have 4 cases of combination of "sticky" and "cmsg":
* "sticky=0, cmsg=0": impossible
* "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
* "sticky=1, cmsg=0": RFC3542 socket option
* "sticky=1, cmsg=1": RFC2292 socket option
*/
static int
ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
kauth_cred_t cred, int sticky, int cmsg, int uproto)
{
int minmtupolicy;
int error;
if (!sticky && !cmsg) {
#ifdef DIAGNOSTIC
printf("ip6_setpktopt: impossible case\n");
#endif
return (EINVAL);
}
/*
* IPV6_2292xxx is for backward compatibility to RFC2292, and should
* not be specified in the context of RFC3542. Conversely,
* RFC3542 types should not be specified in the context of RFC2292.
*/
if (!cmsg) {
switch (optname) {
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292NEXTHOP:
case IPV6_2292HOPOPTS:
case IPV6_2292DSTOPTS:
case IPV6_2292RTHDR:
case IPV6_2292PKTOPTIONS:
return (ENOPROTOOPT);
}
}
if (sticky && cmsg) {
switch (optname) {
case IPV6_PKTINFO:
case IPV6_HOPLIMIT:
case IPV6_NEXTHOP:
case IPV6_HOPOPTS:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_RTHDR:
case IPV6_USE_MIN_MTU:
case IPV6_DONTFRAG:
case IPV6_OTCLASS:
case IPV6_TCLASS:
case IPV6_PREFER_TEMPADDR: /* XXX not an RFC3542 option */
return (ENOPROTOOPT);
}
}
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTINFO:
#endif
case IPV6_PKTINFO:
{
struct in6_pktinfo *pktinfo;
if (len != sizeof(struct in6_pktinfo))
return (EINVAL);
pktinfo = (struct in6_pktinfo *)buf;
/*
* An application can clear any sticky IPV6_PKTINFO option by
* doing a "regular" setsockopt with ipi6_addr being
* in6addr_any and ipi6_ifindex being zero.
* [RFC 3542, Section 6]
*/
		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
		    pktinfo->ipi6_ifindex == 0 &&
		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
			ip6_clearpktopts(opt, optname);
break;
}
		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
return (EINVAL);
}
/* Validate the interface index if specified. */
if (pktinfo->ipi6_ifindex) {
struct ifnet *ifp;
int s = pserialize_read_enter();
ifp = if_byindex(pktinfo->ipi6_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
}
/*
* We store the address anyway, and let in6_selectsrc()
* validate the specified address. This is because ipi6_addr
* may not have enough information about its scope zone, and
* we may need additional information (such as outgoing
* interface or the scope zone of a destination address) to
* disambiguate the scope.
* XXX: the delay of the validation may confuse the
* application when it is used as a sticky option.
*/
if (opt->ip6po_pktinfo == NULL) {
opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
M_IP6OPT, M_NOWAIT);
if (opt->ip6po_pktinfo == NULL)
return (ENOBUFS);
}
memcpy(opt->ip6po_pktinfo, pktinfo, sizeof(*pktinfo));
break;
}
#ifdef RFC2292
case IPV6_2292HOPLIMIT:
#endif
case IPV6_HOPLIMIT:
{
int *hlimp;
/*
* RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
* to simplify the ordering among hoplimit options.
*/
if (optname == IPV6_HOPLIMIT && sticky)
return (ENOPROTOOPT);
if (len != sizeof(int))
return (EINVAL);
hlimp = (int *)buf;
if (*hlimp < -1 || *hlimp > 255)
return (EINVAL);
opt->ip6po_hlim = *hlimp;
break;
}
case IPV6_OTCLASS:
if (len != sizeof(u_int8_t))
return (EINVAL);
opt->ip6po_tclass = *(u_int8_t *)buf;
break;
case IPV6_TCLASS:
{
int tclass;
if (len != sizeof(int))
return (EINVAL);
tclass = *(int *)buf;
if (tclass < -1 || tclass > 255)
return (EINVAL);
opt->ip6po_tclass = tclass;
break;
}
#ifdef RFC2292
case IPV6_2292NEXTHOP:
#endif
case IPV6_NEXTHOP:
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
if (len == 0) { /* just remove the option */
ip6_clearpktopts(opt, IPV6_NEXTHOP);
break;
}
/* check if cmsg_len is large enough for sa_len */
if (len < sizeof(struct sockaddr) || len < *buf)
return (EINVAL);
switch (((struct sockaddr *)buf)->sa_family) {
case AF_INET6:
{
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
if (sa6->sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
return (EINVAL);
}
if ((error = sa6_embedscope(sa6, ip6_use_defzone))
!= 0) {
return (error);
}
break;
}
case AF_LINK: /* eventually be supported? */
default:
return (EAFNOSUPPORT);
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_NEXTHOP);
opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_nexthop == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_nexthop, buf, *buf);
break;
#ifdef RFC2292
case IPV6_2292HOPOPTS:
#endif
case IPV6_HOPOPTS:
{
struct ip6_hbh *hbh;
int hbhlen;
/*
* XXX: We don't allow a non-privileged user to set ANY HbH
* options, since per-option restriction has too much
* overhead.
*/
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
		if (len == 0) {
			ip6_clearpktopts(opt, IPV6_HOPOPTS);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_hbh))
return (EINVAL);
hbh = (struct ip6_hbh *)buf;
hbhlen = (hbh->ip6h_len + 1) << 3;
if (len != hbhlen)
return (EINVAL);
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_HOPOPTS);
opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_hbh == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_hbh, hbh, hbhlen);
break;
}
#ifdef RFC2292
case IPV6_2292DSTOPTS:
#endif
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
{
struct ip6_dest *dest, **newdest = NULL;
int destlen;
/* XXX: see the comment for IPV6_HOPOPTS */
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
		if (len == 0) {
			ip6_clearpktopts(opt, optname);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_dest))
return (EINVAL);
dest = (struct ip6_dest *)buf;
destlen = (dest->ip6d_len + 1) << 3;
if (len != destlen)
return (EINVAL);
/*
* Determine the position that the destination options header
* should be inserted; before or after the routing header.
*/
switch (optname) {
case IPV6_2292DSTOPTS:
/*
* The old advanced API is ambiguous on this point.
			 * Our approach is to determine the position based on
			 * the existence of a routing header.
* Note, however, that this depends on the order of the
* extension headers in the ancillary data; the 1st
* part of the destination options header must appear
* before the routing header in the ancillary data,
* too.
* RFC3542 solved the ambiguity by introducing
* separate ancillary data or option types.
*/
if (opt->ip6po_rthdr == NULL)
newdest = &opt->ip6po_dest1;
else
newdest = &opt->ip6po_dest2;
break;
case IPV6_RTHDRDSTOPTS:
newdest = &opt->ip6po_dest1;
break;
case IPV6_DSTOPTS:
newdest = &opt->ip6po_dest2;
break;
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, optname);
*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
if (*newdest == NULL)
return (ENOBUFS);
memcpy(*newdest, dest, destlen);
break;
}
#ifdef RFC2292
case IPV6_2292RTHDR:
#endif
case IPV6_RTHDR:
{
struct ip6_rthdr *rth;
int rthlen;
		if (len == 0) {
			ip6_clearpktopts(opt, IPV6_RTHDR);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_rthdr))
return (EINVAL);
rth = (struct ip6_rthdr *)buf;
rthlen = (rth->ip6r_len + 1) << 3;
if (len != rthlen)
return (EINVAL);
switch (rth->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
/* Dropped, RFC5095. */
default:
return (EINVAL); /* not supported */
}
/* turn off the previous option */
ip6_clearpktopts(opt, IPV6_RTHDR);
opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_rthdr == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_rthdr, rth, rthlen);
break;
}
case IPV6_USE_MIN_MTU:
if (len != sizeof(int))
return (EINVAL);
minmtupolicy = *(int *)buf;
if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
minmtupolicy != IP6PO_MINMTU_DISABLE &&
minmtupolicy != IP6PO_MINMTU_ALL) {
return (EINVAL);
}
opt->ip6po_minmtu = minmtupolicy;
break;
case IPV6_DONTFRAG:
if (len != sizeof(int))
return (EINVAL);
if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
/*
* we ignore this option for TCP sockets.
* (RFC3542 leaves this case unspecified.)
*/
opt->ip6po_flags &= ~IP6PO_DONTFRAG;
} else
opt->ip6po_flags |= IP6PO_DONTFRAG;
break;
case IPV6_PREFER_TEMPADDR:
{
int preftemp;
if (len != sizeof(int))
return (EINVAL);
preftemp = *(int *)buf;
switch (preftemp) {
case IP6PO_TEMPADDR_SYSTEM:
case IP6PO_TEMPADDR_NOTPREFER:
case IP6PO_TEMPADDR_PREFER:
break;
default:
return (EINVAL);
}
opt->ip6po_prefer_tempaddr = preftemp;
break;
}
default:
return (ENOPROTOOPT);
} /* end of switch */
return (0);
}
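/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the same option names accepted above can also be installed as sticky
 * options with setsockopt(2), e.g. a traffic class for all packets sent
 * on the socket.  The value 0xb8 (DSCP EF in the upper six bits) and
 * the descriptor "s" are assumptions for the example.
 *
 *	int tclass = 0xb8;
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_TCLASS,
 *	    &tclass, sizeof(tclass)) == -1)
 *		err(1, "IPV6_TCLASS");
 */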
/*
* Routine called from ip6_output() to loop back a copy of an IP6 multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be lo0ifp -- easier than replicating that code here.
*/
void
ip6_mloopback(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr_in6 *dst)
{
struct mbuf *copym;
struct ip6_hdr *ip6;
copym = m_copypacket(m, M_DONTWAIT);
if (copym == NULL)
return;
/*
* Make sure to deep-copy IPv6 header portion in case the data
* is in an mbuf cluster, so that we can safely override the IPv6
* header portion later.
*/
if ((copym->m_flags & M_EXT) != 0 ||
copym->m_len < sizeof(struct ip6_hdr)) {
copym = m_pullup(copym, sizeof(struct ip6_hdr));
if (copym == NULL)
return;
}
#ifdef DIAGNOSTIC
if (copym->m_len < sizeof(*ip6)) {
m_freem(copym);
return;
}
#endif
ip6 = mtod(copym, struct ip6_hdr *);
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
(void)looutput(ifp, copym, (const struct sockaddr *)dst, NULL);
}
/*
* Chop IPv6 header off from the payload.
*/
static int
ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
{
struct mbuf *mh;
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
if (m->m_len > sizeof(*ip6)) {
MGETHDR(mh, M_DONTWAIT, MT_HEADER);
if (mh == NULL) {
m_freem(m);
return ENOBUFS;
}
m_move_pkthdr(mh, m);
m_align(mh, sizeof(*ip6));
m->m_len -= sizeof(*ip6);
m->m_data += sizeof(*ip6);
mh->m_next = m;
mh->m_len = sizeof(*ip6);
memcpy(mtod(mh, void *), (void *)ip6, sizeof(*ip6));
m = mh;
}
exthdrs->ip6e_ip6 = m;
return 0;
}
/*
* Compute IPv6 extension header length.
*/
int
ip6_optlen(struct inpcb *inp)
{
int len;
if (!in6p_outputopts(inp))
return 0;
len = 0;
#define elen(x) \
(((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
	len += elen(in6p_outputopts(inp)->ip6po_hbh);
	len += elen(in6p_outputopts(inp)->ip6po_dest1);
	len += elen(in6p_outputopts(inp)->ip6po_rthdr);
	len += elen(in6p_outputopts(inp)->ip6po_dest2);
return len;
#undef elen
}
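/*
 * Worked example (illustrative): with sticky options carrying an 8-byte
 * Hop-by-Hop header (ip6e_len 0) and a 16-byte destination options
 * header after the routing header (ip6e_len 1), and no other headers,
 * ip6_optlen() returns 8 + 0 + 0 + 16 = 24.
 */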
/*
* Ensure sending address is valid.
* Returns 0 on success, -1 if an error should be sent back or 1
* if the packet could be dropped without error (protocol dependent).
*/
static int
ip6_ifaddrvalid(const struct in6_addr *src, const struct in6_addr *dst)
{
struct sockaddr_in6 sin6;
int s, error;
struct ifaddr *ifa;
struct in6_ifaddr *ia6;
if (IN6_IS_ADDR_UNSPECIFIED(src))
return 0;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = *src;
s = pserialize_read_enter();
ifa = ifa_ifwithaddr(sin6tosa(&sin6));
if ((ia6 = ifatoia6(ifa)) == NULL ||
ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED))
error = -1;
else if (ia6->ia6_flags & IN6_IFF_TENTATIVE)
error = 1;
else if (ia6->ia6_flags & IN6_IFF_DETACHED &&
(sin6.sin6_addr = *dst, ifa_ifwithaddr(sin6tosa(&sin6)) == NULL))
/* Allow internal traffic to DETACHED addresses */
error = 1;
else
error = 0;
pserialize_read_exit(s);
return error;
}
/* $NetBSD: exec_script.c,v 1.83 2021/05/03 10:25:14 fcambus Exp $ */
/*
* Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_script.c,v 1.83 2021/05/03 10:25:14 fcambus Exp $");
#ifdef _KERNEL_OPT
#include "opt_script.h"
#endif
#if defined(SETUIDSCRIPTS) && !defined(FDSCRIPTS)
#define FDSCRIPTS /* Need this for safe set-id scripts. */
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/file.h>
#ifdef SETUIDSCRIPTS
#include <sys/stat.h>
#endif
#include <sys/filedesc.h>
#include <sys/exec.h>
#include <sys/resourcevar.h>
#include <sys/module.h>
#include <sys/exec_script.h>
#include <sys/exec_elf.h>
MODULE(MODULE_CLASS_EXEC, exec_script, NULL);
static struct execsw exec_script_execsw = {
.es_hdrsz = SCRIPT_HDR_SIZE,
.es_makecmds = exec_script_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = NULL,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = 0,
.es_copyargs = NULL,
.es_setregs = NULL,
.es_coredump = NULL,
.es_setup_stack = exec_setup_stack,
};
static int
exec_script_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(&exec_script_execsw, 1);
case MODULE_CMD_FINI:
return exec_remove(&exec_script_execsw, 1);
case MODULE_CMD_AUTOUNLOAD:
/*
* We don't want to be autounloaded because our use is
* transient: no executables with p_execsw equal to
* exec_script_execsw will exist, so FINI will never
* return EBUSY. However, the system will run scripts
* often. Return EBUSY here to prevent this module from
* ping-ponging in and out of the kernel.
*/
return EBUSY;
default:
return ENOTTY;
}
}
/*
* exec_script_makecmds(): Check if it's an executable shell script.
*
* Given a proc pointer and an exec package pointer, see if the referent
 * of the epp is a shell script.  If it is, then set things up so that
* the script can be run. This involves preparing the address space
* and arguments for the shell which will run the script.
*
* This function is ultimately responsible for creating a set of vmcmds
* which can be used to build the process's vm space and inserting them
* into the exec package.
*/
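/*
 * Worked example (illustrative): for a script whose first line is
 * "#!/bin/sh -x", the parsing below yields shellname "/bin/sh" and
 * shellarg "-x", and the fake argument list becomes, roughly,
 * { "/bin/sh", "-x", script path or "/dev/fd/N" }, which is prepended
 * to the user-supplied arguments when the interpreter is exec'd.
 */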
int
exec_script_makecmds(struct lwp *l, struct exec_package *epp)
{
int error, hdrlinelen, shellnamelen, shellarglen;
char *hdrstr = epp->ep_hdr;
char *cp, *shellname, *shellarg;
size_t shellargp_len;
struct exec_fakearg *shellargp;
struct exec_fakearg *tmpsap;
struct pathbuf *shell_pathbuf;
struct vnode *scriptvp;
#ifdef SETUIDSCRIPTS
/* Gcc needs those initialized for spurious uninitialized warning */
uid_t script_uid = (uid_t) -1;
gid_t script_gid = NOGROUP;
u_short script_sbits;
#endif
/*
* if the magic isn't that of a shell script, or we've already
* done shell script processing for this exec, punt on it.
*/
	if ((epp->ep_flags & EXEC_INDIR) != 0 ||
	    epp->ep_hdrvalid < EXEC_SCRIPT_MAGICLEN ||
strncmp(hdrstr, EXEC_SCRIPT_MAGIC, EXEC_SCRIPT_MAGICLEN))
return ENOEXEC;
/*
* Check that the shell spec is terminated by a newline, and that
* it isn't too large.
*/
hdrlinelen = uimin(epp->ep_hdrvalid, SCRIPT_HDR_SIZE);
for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; cp < hdrstr + hdrlinelen;
cp++) {
if (*cp == '\n') {
*cp = '\0';
break;
}
}
if (cp >= hdrstr + hdrlinelen)
return ENOEXEC;
/* strip spaces before the shell name */
for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; *cp == ' ' || *cp == '\t';
cp++)
;
if (*cp == '\0')
return ENOEXEC;
shellarg = NULL;
shellarglen = 0;
/* collect the shell name; remember its length for later */
shellname = cp;
shellnamelen = 0;
for ( /* cp = cp */ ; *cp != '\0' && *cp != ' ' && *cp != '\t'; cp++)
shellnamelen++;
if (*cp == '\0')
goto check_shell;
*cp++ = '\0';
/* skip spaces before any argument */
for ( /* cp = cp */ ; *cp == ' ' || *cp == '\t'; cp++)
;
if (*cp == '\0')
goto check_shell;
/*
* collect the shell argument. everything after the shell name
* is passed as ONE argument; that's the correct (historical)
* behaviour.
*/
shellarg = cp;
for ( /* cp = cp */ ; *cp != '\0'; cp++)
shellarglen++;
*cp++ = '\0';
check_shell:
#ifdef SETUIDSCRIPTS
/*
	 * MNT_NOSUID has already been taken care of by check_exec,
* so we don't need to worry about it now or later. We
* will need to check PSL_TRACED later, however.
*/
script_sbits = epp->ep_vap->va_mode & (S_ISUID | S_ISGID);
if (script_sbits != 0) {
script_uid = epp->ep_vap->va_uid;
script_gid = epp->ep_vap->va_gid;
}
#endif
#ifdef FDSCRIPTS
/*
* if the script isn't readable, or it's set-id, then we've
* gotta supply a "/dev/fd/..." for the shell to read.
* Note that stupid shells (csh) do the wrong thing, and
* close all open fd's when they start. That kills this
* method of implementing "safe" set-id and x-only scripts.
*/
vn_lock(epp->ep_vp, LK_SHARED | LK_RETRY);
error = VOP_ACCESS(epp->ep_vp, VREAD, l->l_cred);
VOP_UNLOCK(epp->ep_vp);
if (error == EACCES
#ifdef SETUIDSCRIPTS
|| script_sbits
#endif
) {
struct file *fp;
KASSERT(!(epp->ep_flags & EXEC_HASFD));
if ((error = fd_allocfile(&fp, &epp->ep_fd)) != 0) {
scriptvp = NULL;
shellargp = NULL;
goto fail;
}
epp->ep_flags |= EXEC_HASFD;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_vnode = epp->ep_vp;
fp->f_flag = FREAD;
fd_affix(curproc, fp, epp->ep_fd);
}
#endif
/* set up the fake args list */
shellargp_len = 4 * sizeof(*shellargp);
shellargp = kmem_alloc(shellargp_len, KM_SLEEP);
tmpsap = shellargp;
tmpsap->fa_len = shellnamelen + 1;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
strlcpy(tmpsap->fa_arg, shellname, tmpsap->fa_len);
tmpsap++;
	if (shellarg != NULL) {
		tmpsap->fa_len = shellarglen + 1;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
strlcpy(tmpsap->fa_arg, shellarg, tmpsap->fa_len);
tmpsap++;
}
tmpsap->fa_len = MAXPATHLEN;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
#ifdef FDSCRIPTS
if ((epp->ep_flags & EXEC_HASFD) == 0) {
#endif
/* normally can't fail, but check for it if diagnostic */
error = copystr(epp->ep_kname, tmpsap->fa_arg, MAXPATHLEN,
NULL);
KASSERT(error == 0);
tmpsap++;
#ifdef FDSCRIPTS
} else {
snprintf(tmpsap->fa_arg, MAXPATHLEN, "/dev/fd/%d", epp->ep_fd);
tmpsap++;
}
#endif
tmpsap->fa_arg = NULL;
/* Save the old vnode so we can clean it up later. */
scriptvp = epp->ep_vp;
epp->ep_vp = NULL;
/* Note that we're trying recursively. */
epp->ep_flags |= EXEC_INDIR;
/*
* mark the header we have as invalid; check_exec will read
* the header from the new executable
*/
epp->ep_hdrvalid = 0;
/* try loading the interpreter */
if ((error = exec_makepathbuf(l, shellname, UIO_SYSSPACE,
&shell_pathbuf, NULL)) == 0) {
error = check_exec(l, epp, shell_pathbuf, NULL);
pathbuf_destroy(shell_pathbuf);
}
/* note that we've clobbered the header */
epp->ep_flags |= EXEC_DESTR;
if (error == 0) {
/*
* It succeeded. Unlock the script and
* close it if we aren't using it any more.
* Also, set things up so that the fake args
* list will be used.
*/
		if ((epp->ep_flags & EXEC_HASFD) == 0) {
			vn_lock(scriptvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(scriptvp, FREAD, l->l_cred);
vput(scriptvp);
}
epp->ep_flags |= (EXEC_HASARGL | EXEC_SKIPARG);
epp->ep_fa = shellargp;
epp->ep_fa_len = shellargp_len;
#ifdef SETUIDSCRIPTS
/*
		 * set things up so that set-id scripts will be
* handled appropriately. PSL_TRACED will be
* checked later when the shell is actually
* exec'd.
*/
epp->ep_vap->va_mode |= script_sbits;
if (script_sbits & S_ISUID)
epp->ep_vap->va_uid = script_uid;
if (script_sbits & S_ISGID)
epp->ep_vap->va_gid = script_gid;
#endif
return (0);
}
#ifdef FDSCRIPTS
fail:
#endif
/* kill the opened file descriptor, else close the file */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
	} else if (scriptvp) {
		vn_lock(scriptvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(scriptvp, FREAD, l->l_cred);
vput(scriptvp);
}
/* free the fake arg list, because we're not returning it */
if ((tmpsap = shellargp) != NULL) {
while (tmpsap->fa_arg != NULL) {
kmem_free(tmpsap->fa_arg, tmpsap->fa_len);
tmpsap++;
}
kmem_free(shellargp, shellargp_len);
}
/*
* free any vmspace-creation commands,
* and release their references
*/
kill_vmcmds(&epp->ep_vmcmds);
return error;
}
/* $NetBSD: kern_scdebug.c,v 1.2 2019/03/14 19:51:49 palle Exp $ */
/*
* Copyright (c) 2015 Matthew R. Green
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_xxx.c 8.3 (Berkeley) 2/14/95
* from: NetBSD: kern_xxx.c,v 1.74 2017/10/28 00:37:11 pgoyette Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_scdebug.c,v 1.2 2019/03/14 19:51:49 palle Exp $");
#ifdef _KERNEL_OPT
#include "opt_syscall_debug.h"
#include "opt_kernhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/kernhist.h>
/*
* Pull in the indirect syscall functions here.
 * They are only actually used if the port's syscall entry code
 * doesn't special-case SYS_SYSCALL and SYS___SYSCALL.
*
* In some cases the generated code for the two functions is identical,
* but there isn't a MI way of determining that - so we don't try.
*/
#define SYS_SYSCALL sys_syscall
#include "sys_syscall.c"
#undef SYS_SYSCALL
#define SYS_SYSCALL sys___syscall
#include "sys_syscall.c"
#undef SYS_SYSCALL
#ifdef SYSCALL_DEBUG
#define SCDEBUG_CALLS 0x0001 /* show calls */
#define SCDEBUG_RETURNS 0x0002 /* show returns */
#define SCDEBUG_ALL 0x0004 /* even syscalls that are not implemented */
#define SCDEBUG_SHOWARGS 0x0008 /* show arguments to calls */
#define SCDEBUG_KERNHIST 0x0010 /* use kernhist instead of printf */
#ifndef SCDEBUG_DEFAULT
#define SCDEBUG_DEFAULT (SCDEBUG_CALLS|SCDEBUG_RETURNS|SCDEBUG_SHOWARGS)
#endif
int scdebug = SCDEBUG_DEFAULT;
#ifdef KERNHIST
KERNHIST_DEFINE(scdebughist);
#define SCDEBUG_KERNHIST_FUNC(a) KERNHIST_FUNC(a)
#define SCDEBUG_KERNHIST_CALLED(a) KERNHIST_CALLED(a)
#define SCDEBUG_KERNHIST_LOG(a,b,c,d,e,f) KERNHIST_LOG(a,b,c,d,e,f)
#else
#define SCDEBUG_KERNHIST_FUNC(a) {} /* nothing */
#define SCDEBUG_KERNHIST_CALLED(a) {} /* nothing */
#define SCDEBUG_KERNHIST_LOG(a,b,c,d,e,f) {} /* nothing */
/* The non-kernhist support version can elide all this code easily. */
#undef SCDEBUG_KERNHIST
#define SCDEBUG_KERNHIST 0
#endif
#ifdef __HAVE_MINIMAL_EMUL
#define CODE_NOT_OK(code, em) ((int)(code) < 0)
#else
#define CODE_NOT_OK(code, em) (((int)(code) < 0) || \
((int)(code) >= (em)->e_nsysent))
#endif
void
scdebug_call(register_t code, const register_t args[])
{
SCDEBUG_KERNHIST_FUNC("scdebug_call");
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
const struct sysent *sy;
const struct emul *em;
int i;
if ((scdebug & SCDEBUG_CALLS) == 0)
return;
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_CALLED(scdebughist);
em = p->p_emul;
sy = &em->e_sysent[code];
if ((scdebug & SCDEBUG_ALL) == 0 &&
(CODE_NOT_OK(code, em) || sy->sy_call == sys_nosys)) {
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_LOG(scdebughist, "", 0, 0, 0, 0);
return;
}
/*
* The kernhist version of scdebug needs to restrict the usage
* compared to the normal version. histories must avoid these
* sorts of usage:
*
* - the format string *must* be literal, as it is used
* at display time in the kernel or userland
* - strings in the format will cause vmstat -u to crash
* so avoid using %s formats
*
* to avoid these, we have a fairly long block to print args
* as the format needs to change for each, and we can't just
* call printf() on each argument until we're done.
*/
if (scdebug & SCDEBUG_KERNHIST) {
if (CODE_NOT_OK(code, em)) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: OUT OF RANGE (%jd)",
p->p_pid, l->l_lid, code, 0);
} else {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: num %jd call %#jx",
p->p_pid, l->l_lid, code, (uintptr_t)sy->sy_call);
if ((scdebug & SCDEBUG_SHOWARGS) == 0)
return;
if (sy->sy_narg > 7) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-7]: (%jx, %jx, %jx, %jx, ...)",
(long)args[4], (long)args[5],
(long)args[6], (long)args[7]);
} else if (sy->sy_narg > 6) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-6]: (%jx, %jx, %jx)",
(long)args[4], (long)args[5],
(long)args[6], 0);
} else if (sy->sy_narg > 5) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-5]: (%jx, %jx)",
(long)args[4], (long)args[5], 0, 0);
} else if (sy->sy_narg == 5) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4]: (%jx)",
(long)args[4], 0, 0, 0);
}
if (sy->sy_narg > 3) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-3]: (%jx, %jx, %jx, %jx, ...)",
(long)args[0], (long)args[1],
(long)args[2], (long)args[3]);
} else if (sy->sy_narg > 2) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-2]: (%jx, %jx, %jx)",
(long)args[0], (long)args[1],
(long)args[2], 0);
} else if (sy->sy_narg > 1) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-1]: (%jx, %jx)",
(long)args[0], (long)args[1], 0, 0);
} else if (sy->sy_narg == 1) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0]: (%jx)",
(long)args[0], 0, 0, 0);
}
}
return;
}
printf("proc %d (%s): %s num ", p->p_pid, p->p_comm, em->e_name);
if (CODE_NOT_OK(code, em))
printf("OUT OF RANGE (%ld)", (long)code);
else {
printf("%ld call: %s", (long)code, em->e_syscallnames[code]);
if (scdebug & SCDEBUG_SHOWARGS) {
printf("(");
for (i = 0; i < sy->sy_argsize/sizeof(register_t); i++)
printf("%s0x%lx", i == 0 ? "" : ", ",
(long)args[i]);
printf(")");
}
}
printf("\n");
}
void
scdebug_ret(register_t code, int error, const register_t retval[])
{
SCDEBUG_KERNHIST_FUNC("scdebug_ret");
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
const struct sysent *sy;
const struct emul *em;
if ((scdebug & SCDEBUG_RETURNS) == 0)
return;
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_CALLED(scdebughist);
em = p->p_emul;
sy = &em->e_sysent[code];
if ((scdebug & SCDEBUG_ALL) == 0 &&
(CODE_NOT_OK(code, em) || sy->sy_call == sys_nosys)) {
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_LOG(scdebughist, "", 0, 0, 0, 0);
return;
}
if (scdebug & SCDEBUG_KERNHIST) {
if (CODE_NOT_OK(code, em)) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: OUT OF RANGE (%jd)",
p->p_pid, l->l_lid, code, 0);
} else {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: num %jd",
p->p_pid, l->l_lid, code, 0);
SCDEBUG_KERNHIST_LOG(scdebughist,
"ret: err = %jd, rv = 0x%jx,0x%jx",
error, (long)retval[0], (long)retval[1], 0);
}
return;
}
printf("proc %d (%s): %s num ", p->p_pid, p->p_comm, em->e_name);
if (CODE_NOT_OK(code, em))
printf("OUT OF RANGE (%ld)", (long)code);
else
printf("%ld ret %s: err = %d, rv = 0x%lx,0x%lx", (long)code,
em->e_syscallnames[code], error,
(long)retval[0], (long)retval[1]);
printf("\n");
}
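/*
 * Illustrative output (hypothetical pid, program name and argument
 * values), as produced by the printf() paths above with the default
 * scdebug flags for the native "netbsd" emulation, assuming syscall 3
 * is read(2):
 *
 *	proc 153 (cat): netbsd num 3 call: read(0x3, 0x7f7fff79a000, 0x2000)
 *	proc 153 (cat): netbsd num 3 ret read: err = 0, rv = 0x2000,0x0
 */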
#endif /* SYSCALL_DEBUG */
#ifndef SCDEBUG_KERNHIST_SIZE
#define SCDEBUG_KERNHIST_SIZE 500
#endif
void
scdebug_init(void)
{
#if defined(SYSCALL_DEBUG) && defined(KERNHIST)
/* Setup scdebughist kernel history */
KERNHIST_INIT(scdebughist, SCDEBUG_KERNHIST_SIZE);
#endif
}
/* $NetBSD: vfs_lockf.c,v 1.81 2023/09/23 18:21:11 ad Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Scooter Morris at Genentech Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_lockf.c 8.4 (Berkeley) 10/26/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.81 2023/09/23 18:21:11 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>
/*
* The lockf structure is a kernel structure which contains the information
* associated with a byte range lock. The lockf structures are linked into
* the vnode structure. Locks are sorted by the starting byte of the lock for
* efficiency.
*
* lf_next is used for two purposes, depending on whether the lock is
* being held, or is in conflict with an existing lock. If this lock
* is held, it indicates the next lock on the same vnode.
* For pending locks, if lock->lf_next is non-NULL, then lock->lf_block
* must be queued on the lf_blkhd TAILQ of lock->lf_next.
*/
TAILQ_HEAD(locklist, lockf);
struct lockf {
kcondvar_t lf_cv; /* Signalling */
short lf_flags; /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
short lf_type; /* Lock type: F_RDLCK, F_WRLCK */
off_t lf_start; /* The byte # of the start of the lock */
off_t lf_end; /* The byte # of the end of the lock (-1=EOF)*/
void *lf_id; /* process or file description holding lock */
struct lockf **lf_head; /* Back pointer to the head of lockf list */
struct lockf *lf_next; /* Next lock on this vnode, or blocking lock */
struct locklist lf_blkhd; /* List of requests blocked on this lock */
TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */
struct uidinfo *lf_uip; /* Cached pointer to uidinfo */
};
/* Maximum length of sleep chains to traverse to try and detect deadlock. */
#define MAXDEPTH 50
static kmutex_t lockf_lock __cacheline_aligned;
static char lockstr[] = "lockf";
/*
* This variable controls the maximum number of processes that will
* be checked in doing deadlock detection.
*/
int maxlockdepth = MAXDEPTH;
#ifdef LOCKF_DEBUG
int lockf_debug = 0;
#endif
#define SELF 0x1
#define OTHERS 0x2
/*
* XXX TODO
* Misc cleanups: "void *id" should be visible in the API as a
* "struct proc *".
* (This requires rototilling all VFS's which support advisory locking).
*/
/*
* If there's a lot of lock contention on a single vnode, locking
 * schemes which allow for more parallelism would be needed.  Given how
* infrequently byte-range locks are actually used in typical BSD
* code, a more complex approach probably isn't worth it.
*/
/*
* We enforce a limit on locks by uid, so that a single user cannot
* run the kernel out of memory. For now, the limit is pretty coarse.
* There is no limit on root.
*
* Splitting a lock will always succeed, regardless of current allocations.
* If you're slightly above the limit, we still have to permit an allocation
* so that the unlock can succeed. If the unlocking causes too many splits,
 * however, you're totally cut off.
*/
#define MAXLOCKSPERUID (2 * maxfiles)
#ifdef LOCKF_DEBUG
/*
* Print out a lock.
*/
static void
lf_print(const char *tag, struct lockf *lock)
{
printf("%s: lock %p for ", tag, lock);
if (lock->lf_flags & F_POSIX)
printf("proc %d", ((struct proc *)lock->lf_id)->p_pid);
else
printf("file %p", (struct file *)lock->lf_id);
printf(" %s, start %jd, end %jd",
lock->lf_type == F_RDLCK ? "shared" :
lock->lf_type == F_WRLCK ? "exclusive" :
lock->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
if (TAILQ_FIRST(&lock->lf_blkhd))
printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd));
else
printf("\n");
}
static void
lf_printlist(const char *tag, struct lockf *lock)
{
struct lockf *lf, *blk;
printf("%s: Lock list:\n", tag);
for (lf = *lock->lf_head; lf; lf = lf->lf_next) {
printf("\tlock %p for ", lf);
if (lf->lf_flags & F_POSIX)
printf("proc %d", ((struct proc *)lf->lf_id)->p_pid);
else
printf("file %p", (struct file *)lf->lf_id);
printf(", %s, start %jd, end %jd",
lf->lf_type == F_RDLCK ? "shared" :
lf->lf_type == F_WRLCK ? "exclusive" :
lf->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
if (blk->lf_flags & F_POSIX)
printf("; proc %d",
((struct proc *)blk->lf_id)->p_pid);
else
printf("; file %p", (struct file *)blk->lf_id);
printf(", %s, start %jd, end %jd",
blk->lf_type == F_RDLCK ? "shared" :
blk->lf_type == F_WRLCK ? "exclusive" :
blk->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)blk->lf_start, (intmax_t)blk->lf_end);
if (TAILQ_FIRST(&blk->lf_blkhd))
panic("lf_printlist: bad list");
}
printf("\n");
}
}
#endif /* LOCKF_DEBUG */
/*
* 3 options for allowfail.
* 0 - always allocate. 1 - cutoff at limit. 2 - cutoff at double limit.
*/
static struct lockf *
lf_alloc(int allowfail)
{
struct uidinfo *uip;
struct lockf *lock;
u_long lcnt;
const uid_t uid = kauth_cred_geteuid(kauth_cred_get());
uip = uid_find(uid);
lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt);
if (uid && allowfail && lcnt >
(allowfail == 1 ? MAXLOCKSPERUID : (MAXLOCKSPERUID * 2))) {
atomic_dec_ulong(&uip->ui_lockcnt);
return NULL;
}
lock = kmem_alloc(sizeof(*lock), KM_SLEEP);
lock->lf_uip = uip;
cv_init(&lock->lf_cv, lockstr);
return lock;
}
static void
lf_free(struct lockf *lock)
{
atomic_dec_ulong(&lock->lf_uip->ui_lockcnt);
cv_destroy(&lock->lf_cv);
kmem_free(lock, sizeof(*lock));
}
/*
* Walk the list of locks for an inode to
* find an overlapping lock (if any).
*
* NOTE: this returns only the FIRST overlapping lock. There
* may be more than one.
*/
static int
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
struct lockf ***prev, struct lockf **overlap)
{
off_t start, end;
*overlap = lf;
if (lf == NULL)
return 0;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUG */
start = lock->lf_start;
end = lock->lf_end;
	while (lf != NULL) {
		if (((type == SELF) && lf->lf_id != lock->lf_id) ||
((type == OTHERS) && lf->lf_id == lock->lf_id)) {
*prev = &lf->lf_next;
*overlap = lf = lf->lf_next;
continue;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUG */
/*
* OK, check for overlap
*
* Six cases:
* 0) no overlap
* 1) overlap == lock
* 2) overlap contains lock
* 3) lock contains overlap
* 4) overlap starts before lock
* 5) overlap ends after lock
*/
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
/* Case 0 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("no overlap\n");
#endif /* LOCKF_DEBUG */
if ((type & SELF) && end != -1 && lf->lf_start > end)
return 0;
*prev = &lf->lf_next;
*overlap = lf = lf->lf_next;
continue;
}
if ((lf->lf_start == start) && (lf->lf_end == end)) {
/* Case 1 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap == lock\n");
#endif /* LOCKF_DEBUG */
return 1;
}
if ((lf->lf_start <= start) &&
(end != -1) &&
((lf->lf_end >= end) || (lf->lf_end == -1))) {
/* Case 2 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap contains lock\n");
#endif /* LOCKF_DEBUG */
return 2;
}
if (start <= lf->lf_start &&
(end == -1 || (lf->lf_end != -1 && end >= lf->lf_end))) {
/* Case 3 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("lock contains overlap\n");
#endif /* LOCKF_DEBUG */
return 3;
}
if ((lf->lf_start < start) &&
((lf->lf_end >= start) || (lf->lf_end == -1))) {
/* Case 4 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap starts before lock\n");
#endif /* LOCKF_DEBUG */
return 4;
}
if ((lf->lf_start > start) &&
(end != -1) &&
((lf->lf_end > end) || (lf->lf_end == -1))) {
/* Case 5 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap ends after lock\n");
#endif /* LOCKF_DEBUG */
return 5;
}
panic("lf_findoverlap: default");
}
return 0;
}
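/*
 * Illustrative sketch, not part of the original source: the same six-way
 * classification lf_findoverlap() performs above, applied to a requested
 * range [start,end] against an existing range [s,e], where an end of -1
 * means "through EOF".  The function name and the #if 0 guard are ours;
 * the block is not compiled and relies on this file's own includes.
 */
#if 0
static int
overlap_case(off_t start, off_t end, off_t s, off_t e)
{
	if ((e != -1 && start > e) || (end != -1 && s > end))
		return 0;	/* case 0: no overlap */
	if (s == start && e == end)
		return 1;	/* case 1: overlap == lock */
	if (s <= start && end != -1 && (e >= end || e == -1))
		return 2;	/* case 2: overlap contains lock */
	if (start <= s && (end == -1 || (e != -1 && end >= e)))
		return 3;	/* case 3: lock contains overlap */
	if (s < start && (e >= start || e == -1))
		return 4;	/* case 4: overlap starts before lock */
	if (s > start && end != -1 && (e > end || e == -1))
		return 5;	/* case 5: overlap ends after lock */
	return -1;		/* unreachable, mirrors the panic above */
}
#endif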
/*
* Split a lock and a contained region into
* two or three locks as necessary.
*/
static void
lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock)
{
struct lockf *splitlock;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2) {
lf_print("lf_split", lock1);
lf_print("splitting from", lock2);
}
#endif /* LOCKF_DEBUG */
/*
* Check to see if splitting into only two pieces.
*/
if (lock1->lf_start == lock2->lf_start) {
lock1->lf_start = lock2->lf_end + 1;
lock2->lf_next = lock1;
return;
}
if (lock1->lf_end == lock2->lf_end) {
lock1->lf_end = lock2->lf_start - 1;
lock2->lf_next = lock1->lf_next;
lock1->lf_next = lock2;
return;
}
/*
* Make a new lock consisting of the last part of
* the encompassing lock
*/
splitlock = *sparelock;
*sparelock = NULL;
cv_destroy(&splitlock->lf_cv);
memcpy(splitlock, lock1, sizeof(*splitlock));
cv_init(&splitlock->lf_cv, lockstr);
splitlock->lf_start = lock2->lf_end + 1;
TAILQ_INIT(&splitlock->lf_blkhd);
lock1->lf_end = lock2->lf_start - 1;
/*
* OK, now link it in
*/
splitlock->lf_next = lock1->lf_next;
lock2->lf_next = splitlock;
lock1->lf_next = lock2;
}
/*
* Wakeup a blocklist
*/
static void
lf_wakelock(struct lockf *listhead)
{
struct lockf *wakelock;
	while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) {
		KASSERT(wakelock->lf_next == listhead);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
wakelock->lf_next = NULL;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("lf_wakelock: awakening", wakelock);
#endif
cv_broadcast(&wakelock->lf_cv);
}
}
/*
* Remove a byte-range lock on an inode.
*
* Generally, find the lock (or an overlap to that lock)
* and remove it (or shrink it), then wakeup anyone we can.
*/
static int
lf_clearlock(struct lockf *unlock, struct lockf **sparelock)
{
struct lockf **head = unlock->lf_head;
struct lockf *lf = *head;
struct lockf *overlap, **prev;
int ovcase;
if (lf == NULL)
return 0;
#ifdef LOCKF_DEBUG
if (unlock->lf_type != F_UNLCK)
panic("lf_clearlock: bad type");
if (lockf_debug & 1)
lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
prev = head;
while ((ovcase = lf_findoverlap(lf, unlock, SELF,
&prev, &overlap)) != 0) {
/*
* Wakeup the list of locks to be retried.
*/
lf_wakelock(overlap);
switch (ovcase) {
case 1: /* overlap == lock */
*prev = overlap->lf_next;
lf_free(overlap);
break;
case 2: /* overlap contains lock: split it */
if (overlap->lf_start == unlock->lf_start) {
overlap->lf_start = unlock->lf_end + 1;
break;
}
lf_split(overlap, unlock, sparelock);
overlap->lf_next = unlock->lf_next;
break;
case 3: /* lock contains overlap */
*prev = overlap->lf_next;
lf = overlap->lf_next;
lf_free(overlap);
continue;
case 4: /* overlap starts before lock */
overlap->lf_end = unlock->lf_start - 1;
prev = &overlap->lf_next;
lf = overlap->lf_next;
continue;
case 5: /* overlap ends after lock */
overlap->lf_start = unlock->lf_end + 1;
break;
}
break;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
return 0;
}
/*
* Walk the list of locks for an inode and
* return the first blocking lock.
*/
static struct lockf *
lf_getblock(struct lockf *lock)
{
struct lockf **prev, *overlap, *lf = *(lock->lf_head);
prev = lock->lf_head;
while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) {
/*
* We've found an overlap, see if it blocks us
*/
if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
return overlap;
/*
* Nope, point to the next one on the list and
* see if it blocks us
*/
lf = overlap->lf_next;
}
return NULL;
}
/*
* Set a byte-range lock.
*/
static int
lf_setlock(struct lockf *lock, struct lockf **sparelock,
kmutex_t *interlock)
{
struct lockf *block;
struct lockf **head = lock->lf_head;
struct lockf **prev, *overlap, *ltmp;
int ovcase, needtolink, error;
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_print("lf_setlock", lock);
#endif /* LOCKF_DEBUG */
/*
* Scan lock list for this file looking for locks that would block us.
*/
while ((block = lf_getblock(lock)) != NULL) {
/*
* Free the structure and return if nonblocking.
*/
if ((lock->lf_flags & F_WAIT) == 0) {
lf_free(lock);
return EAGAIN;
}
/*
* We are blocked. Since flock style locks cover
* the whole file, there is no chance for deadlock.
* For byte-range locks we must check for deadlock.
*
* Deadlock detection is done by looking through the
* wait channels to see if there are any cycles that
* involve us. MAXDEPTH is set just to make sure we
* do not go off into neverneverland.
*/
if ((lock->lf_flags & F_POSIX) &&
(block->lf_flags & F_POSIX)) {
struct lwp *wlwp;
volatile const struct lockf *waitblock;
int i = 0;
struct proc *p;
p = (struct proc *)block->lf_id;
			KASSERT(p != NULL);
			while (i++ < maxlockdepth) {
mutex_enter(p->p_lock);
if (p->p_nlwps > 1) {
mutex_exit(p->p_lock);
break;
}
wlwp = LIST_FIRST(&p->p_lwps);
lwp_lock(wlwp);
if (wlwp->l_wchan == NULL ||
wlwp->l_wmesg != lockstr) {
lwp_unlock(wlwp);
mutex_exit(p->p_lock);
break;
}
waitblock = wlwp->l_wchan;
lwp_unlock(wlwp);
mutex_exit(p->p_lock);
/* Get the owner of the blocking lock */
waitblock = waitblock->lf_next;
if ((waitblock->lf_flags & F_POSIX) == 0)
break;
p = (struct proc *)waitblock->lf_id;
				if (p == curproc) {
					lf_free(lock);
return EDEADLK;
}
}
/*
* If we're still following a dependency chain
* after maxlockdepth iterations, assume we're in
* a cycle to be safe.
*/
if (i >= maxlockdepth) {
lf_free(lock);
return EDEADLK;
}
}
/*
* For flock type locks, we must first remove
* any shared locks that we hold before we sleep
* waiting for an exclusive lock.
*/
if ((lock->lf_flags & F_FLOCK) &&
lock->lf_type == F_WRLCK) {
lock->lf_type = F_UNLCK;
(void) lf_clearlock(lock, NULL);
lock->lf_type = F_WRLCK;
}
/*
* Add our lock to the blocked list and sleep until we're free.
* Remember who blocked us (for deadlock detection).
*/
lock->lf_next = block;
TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
#ifdef LOCKF_DEBUG
if (lockf_debug & 1) {
lf_print("lf_setlock: blocking on", block);
lf_printlist("lf_setlock", block);
}
#endif /* LOCKF_DEBUG */
error = cv_wait_sig(&lock->lf_cv, interlock);
/*
* We may have been awoken by a signal (in
* which case we must remove ourselves from the
* blocked list) and/or by another process
* releasing a lock (in which case we have already
* been removed from the blocked list and our
* lf_next field set to NULL).
*/
		if (lock->lf_next != NULL) {
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
lock->lf_next = NULL;
}
if (error) {
lf_free(lock);
return error;
}
}
/*
* No blocks!! Add the lock. Note that we will
* downgrade or upgrade any overlapping locks this
* process already owns.
*
* Skip over locks owned by other processes.
* Handle any locks that overlap and are owned by ourselves.
*/
prev = head;
block = *head;
needtolink = 1;
for (;;) {
ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
if (ovcase)
block = overlap->lf_next;
/*
* Six cases:
* 0) no overlap
* 1) overlap == lock
* 2) overlap contains lock
* 3) lock contains overlap
* 4) overlap starts before lock
* 5) overlap ends after lock
*/
switch (ovcase) {
case 0: /* no overlap */
if (needtolink) {
*prev = lock;
lock->lf_next = overlap;
}
break;
case 1: /* overlap == lock */
/*
* If downgrading lock, others may be
* able to acquire it.
*/
if (lock->lf_type == F_RDLCK &&
overlap->lf_type == F_WRLCK)
lf_wakelock(overlap);
overlap->lf_type = lock->lf_type;
lf_free(lock);
lock = overlap; /* for debug output below */
break;
case 2: /* overlap contains lock */
/*
* Check for common starting point and different types.
*/
if (overlap->lf_type == lock->lf_type) {
lf_free(lock);
lock = overlap; /* for debug output below */
break;
}
if (overlap->lf_start == lock->lf_start) {
*prev = lock;
lock->lf_next = overlap;
overlap->lf_start = lock->lf_end + 1;
} else
lf_split(overlap, lock, sparelock);
lf_wakelock(overlap);
break;
case 3: /* lock contains overlap */
/*
* If downgrading lock, others may be able to
* acquire it, otherwise take the list.
*/
if (lock->lf_type == F_RDLCK &&
overlap->lf_type == F_WRLCK) {
lf_wakelock(overlap);
} else {
			while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) {
				KASSERT(ltmp->lf_next == overlap);
				TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
				    lf_block);
ltmp->lf_next = lock;
TAILQ_INSERT_TAIL(&lock->lf_blkhd,
ltmp, lf_block);
}
}
/*
* Add the new lock if necessary and delete the overlap.
*/
if (needtolink) {
*prev = lock;
lock->lf_next = overlap->lf_next;
prev = &lock->lf_next;
needtolink = 0;
} else
*prev = overlap->lf_next;
lf_free(overlap);
continue;
case 4: /* overlap starts before lock */
/*
* Add lock after overlap on the list.
*/
lock->lf_next = overlap->lf_next;
overlap->lf_next = lock;
overlap->lf_end = lock->lf_start - 1;
prev = &lock->lf_next;
lf_wakelock(overlap);
needtolink = 0;
continue;
case 5: /* overlap ends after lock */
/*
* Add the new lock before overlap.
*/
			if (needtolink) {
				*prev = lock;
lock->lf_next = overlap;
}
overlap->lf_start = lock->lf_end + 1;
lf_wakelock(overlap);
break;
}
break;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 1) {
lf_print("lf_setlock: got the lock", lock);
lf_printlist("lf_setlock", lock);
}
#endif /* LOCKF_DEBUG */
return 0;
}
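/*
 * The EDEADLK path above can be exercised from userland with two
 * single-threaded processes taking POSIX byte-range locks in opposite
 * order.  A hedged sketch, not part of the kernel: build it as an
 * ordinary program; the file name and the sleep(1) used to sequence the
 * two processes are arbitrary choices of this example.
 */
#if 0
#include <sys/types.h>
#include <sys/wait.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

static int
wrlock(int fd, off_t start, off_t len)
{
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = start,
		.l_len = len,
	};

	return fcntl(fd, F_SETLKW, &fl);
}

int
main(void)
{
	int fd;
	pid_t pid;

	fd = open("lockfile", O_RDWR | O_CREAT, 0644);
	if (fd == -1)
		err(1, "open");
	if (ftruncate(fd, 200) == -1)
		err(1, "ftruncate");
	if (wrlock(fd, 0, 100) == -1)		/* parent takes A = [0,99] */
		err(1, "lock A");
	pid = fork();
	if (pid == -1)
		err(1, "fork");
	if (pid == 0) {
		/* child: take B = [100,199], then block waiting for A */
		if (wrlock(fd, 100, 100) == -1)
			err(1, "child lock B");
		(void)wrlock(fd, 0, 100);	/* sleeps until parent exits */
		_exit(0);
	}
	sleep(1);		/* crude: give the child time to block on A */
	if (wrlock(fd, 100, 100) == -1)
		warn("parent lock B");		/* expected: EDEADLK */
	close(fd);		/* releases A, unblocking the child */
	(void)wait(NULL);
	return 0;
}
#endif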
/*
* Check whether there is a blocking lock,
* and if so return its process identifier.
*/
static int
lf_getlock(struct lockf *lock, struct flock *fl)
{
struct lockf *block;
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUG */
if ((block = lf_getblock(lock)) != NULL) {
fl->l_type = block->lf_type;
fl->l_whence = SEEK_SET;
fl->l_start = block->lf_start;
if (block->lf_end == -1)
fl->l_len = 0;
else
fl->l_len = block->lf_end - block->lf_start + 1;
if (block->lf_flags & F_POSIX)
fl->l_pid = ((struct proc *)block->lf_id)->p_pid;
else
fl->l_pid = -1;
} else {
fl->l_type = F_UNLCK;
}
return 0;
}
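/*
 * From userland this surfaces through fcntl(F_GETLK): the caller
 * describes the lock it would like to take, and on return l_type is
 * either F_UNLCK or describes one blocking lock, including the holder's
 * pid for POSIX locks (-1 for flock-style locks).  A hedged sketch, not
 * part of the kernel; the function name is ours.
 */
#if 0
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

static void
who_blocks(int fd)
{
	struct flock fl = {
		.l_type = F_WRLCK,	/* "could I write-lock ..." */
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,		/* "... the whole file?" */
	};

	if (fcntl(fd, F_GETLK, &fl) == -1)
		err(1, "F_GETLK");
	if (fl.l_type == F_UNLCK)
		printf("no blocking lock\n");
	else
		printf("blocked by pid %jd: %s lock, start %jd, len %jd\n",
		    (intmax_t)fl.l_pid,
		    fl.l_type == F_WRLCK ? "exclusive" : "shared",
		    (intmax_t)fl.l_start, (intmax_t)fl.l_len);
}
#endif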
/*
* Do an advisory lock operation.
*/
int
lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size)
{
struct flock *fl = ap->a_fl;
struct lockf *lock = NULL;
struct lockf *sparelock;
kmutex_t *interlock = &lockf_lock;
off_t start, end;
int error = 0;
KASSERTMSG(size >= 0, "size=%jd", (intmax_t)size);
/*
* Convert the flock structure into a start and end.
*/
switch (fl->l_whence) {
case SEEK_SET:
case SEEK_CUR:
/*
* Caller is responsible for adding any necessary offset
* when SEEK_CUR is used.
*/
start = fl->l_start;
break;
case SEEK_END:
if (fl->l_start > __type_max(off_t) - size)
return EINVAL;
start = size + fl->l_start;
break;
default:
return EINVAL;
}
if (fl->l_len == 0)
end = -1;
else {
if (fl->l_len >= 0) {
if (start >= 0 &&
fl->l_len - 1 > __type_max(off_t) - start)
return EINVAL;
end = start + (fl->l_len - 1);
} else {
/* lockf() allows -ve lengths */
if (start < 0)
return EINVAL;
end = start - 1;
start += fl->l_len;
}
}
if (start < 0)
return EINVAL;
/*
* Allocate locks before acquiring the interlock. We need two
* locks in the worst case.
*/
switch (ap->a_op) {
case F_SETLK:
case F_UNLCK:
/*
* XXX For F_UNLCK case, we can re-use the lock.
*/
if ((ap->a_flags & F_FLOCK) == 0) {
/*
* Byte-range lock might need one more lock.
*/
sparelock = lf_alloc(0);
if (sparelock == NULL) {
error = ENOMEM;
goto quit;
}
break;
}
/* FALLTHROUGH */
case F_GETLK:
sparelock = NULL;
break;
default:
return EINVAL;
}
switch (ap->a_op) {
case F_SETLK:
lock = lf_alloc(1);
break;
case F_UNLCK:
if (start == 0 || end == -1) {
/* never split */
lock = lf_alloc(0);
} else {
/* might split */
lock = lf_alloc(2);
}
break;
case F_GETLK:
lock = lf_alloc(0);
break;
}
if (lock == NULL) {
error = ENOMEM;
goto quit;
}
mutex_enter(interlock);
/*
* Avoid the common case of unlocking when inode has no locks.
*/
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
error = 0;
goto quit_unlock;
}
}
/*
* Create the lockf structure.
*/
lock->lf_start = start;
lock->lf_end = end;
lock->lf_head = head;
lock->lf_type = fl->l_type;
lock->lf_next = (struct lockf *)0;
TAILQ_INIT(&lock->lf_blkhd);
lock->lf_flags = ap->a_flags;
	if (lock->lf_flags & F_POSIX) {
		KASSERT(curproc == (struct proc *)ap->a_id);
}
lock->lf_id = ap->a_id;
/*
* Do the requested operation.
*/
switch (ap->a_op) {
case F_SETLK:
error = lf_setlock(lock, &sparelock, interlock);
lock = NULL; /* lf_setlock freed it */
break;
case F_UNLCK:
error = lf_clearlock(lock, &sparelock);
break;
case F_GETLK:
error = lf_getlock(lock, fl);
break;
default:
break;
/* NOTREACHED */
}
quit_unlock:
mutex_exit(interlock);
quit:
if (lock)
lf_free(lock);
	if (sparelock)
		lf_free(sparelock);
return error;
}
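/*
 * The conversion above from struct flock to an inclusive [start,end]
 * range (end == -1 meaning "through EOF") is summarized by this
 * userland-style sketch.  It mirrors the checks above but is not part of
 * the kernel; OFF_MAX stands in for __type_max(off_t) and assumes a
 * 64-bit off_t, as on NetBSD.
 */
#if 0
#include <sys/types.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#define OFF_MAX	INT64_MAX	/* assumption: off_t is 64-bit */

static int
flock_to_range(const struct flock *fl, off_t size, off_t *startp, off_t *endp)
{
	off_t start, end;

	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:		/* caller already folded in the file offset */
		start = fl->l_start;
		break;
	case SEEK_END:
		if (fl->l_start > OFF_MAX - size)
			return EINVAL;
		start = size + fl->l_start;
		break;
	default:
		return EINVAL;
	}
	if (fl->l_len == 0)
		end = -1;			/* lock through EOF */
	else if (fl->l_len > 0) {
		if (start >= 0 && fl->l_len - 1 > OFF_MAX - start)
			return EINVAL;
		end = start + (fl->l_len - 1);
	} else {
		/* negative length: the range ends just before l_start */
		if (start < 0)
			return EINVAL;
		end = start - 1;
		start += fl->l_len;
	}
	if (start < 0)
		return EINVAL;
	*startp = start;
	*endp = end;
	return 0;
}
#endif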
/*
* Initialize subsystem. XXX We use a global lock. This could be the
* vnode interlock, but the deadlock detection code may need to inspect
* locks belonging to other files.
*/
void
lf_init(void)
{
mutex_init(&lockf_lock, MUTEX_DEFAULT, IPL_NONE);
}
/* $NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)spec_vnops.c 8.15 (Berkeley) 7/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/file.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/lockf.h>
#include <sys/tty.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/module.h>
#include <sys/atomic.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
/*
* Lock order:
*
* vnode lock
* -> device_lock
* -> struct vnode::v_interlock
*/
/* symbolic sleep message strings for devices */
const char devopn[] = "devopn";
const char devio[] = "devio";
const char devwait[] = "devwait";
const char devin[] = "devin";
const char devout[] = "devout";
const char devioc[] = "devioc";
const char devcls[] = "devcls";
#define SPECHSZ 64
#if ((SPECHSZ&(SPECHSZ-1)) == 0)
#define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1))
#else
#define SPECHASH(rdev) (((unsigned)((rdev>>5)+(rdev)))%SPECHSZ)
#endif
static vnode_t *specfs_hash[SPECHSZ];
extern struct mount *dead_rootmount;
/*
* This vnode operations vector is used for special device nodes
* created from whole cloth by the kernel. For the ops vector for
* vnodes built from special devices found in a filesystem, see (e.g)
* ffs_specop_entries[] in ffs_vnops.c or the equivalent for other
* filesystems.
*/
int (**spec_vnodeop_p)(void *);
const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
{ &vop_default_desc, vn_default_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, spec_lookup }, /* lookup */
{ &vop_create_desc, genfs_badop }, /* create */
{ &vop_mknod_desc, genfs_badop }, /* mknod */
{ &vop_open_desc, spec_open }, /* open */
{ &vop_close_desc, spec_close }, /* close */
{ &vop_access_desc, genfs_ebadf }, /* access */
{ &vop_accessx_desc, genfs_ebadf }, /* accessx */
{ &vop_getattr_desc, genfs_ebadf }, /* getattr */
{ &vop_setattr_desc, genfs_ebadf }, /* setattr */
{ &vop_read_desc, spec_read }, /* read */
{ &vop_write_desc, spec_write }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, spec_fdiscard }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, spec_ioctl }, /* ioctl */
{ &vop_poll_desc, spec_poll }, /* poll */
{ &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */
{ &vop_revoke_desc, genfs_revoke }, /* revoke */
{ &vop_mmap_desc, spec_mmap }, /* mmap */
{ &vop_fsync_desc, spec_fsync }, /* fsync */
{ &vop_seek_desc, spec_seek }, /* seek */
{ &vop_remove_desc, genfs_badop }, /* remove */
{ &vop_link_desc, genfs_badop }, /* link */
{ &vop_rename_desc, genfs_badop }, /* rename */
{ &vop_mkdir_desc, genfs_badop }, /* mkdir */
{ &vop_rmdir_desc, genfs_badop }, /* rmdir */
{ &vop_symlink_desc, genfs_badop }, /* symlink */
{ &vop_readdir_desc, genfs_badop }, /* readdir */
{ &vop_readlink_desc, genfs_badop }, /* readlink */
{ &vop_abortop_desc, genfs_badop }, /* abortop */
{ &vop_inactive_desc, spec_inactive }, /* inactive */
{ &vop_reclaim_desc, spec_reclaim }, /* reclaim */
{ &vop_lock_desc, genfs_lock }, /* lock */
{ &vop_unlock_desc, genfs_unlock }, /* unlock */
{ &vop_bmap_desc, spec_bmap }, /* bmap */
{ &vop_strategy_desc, spec_strategy }, /* strategy */
{ &vop_print_desc, spec_print }, /* print */
{ &vop_islocked_desc, genfs_islocked }, /* islocked */
{ &vop_pathconf_desc, spec_pathconf }, /* pathconf */
{ &vop_advlock_desc, spec_advlock }, /* advlock */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ &vop_getpages_desc, genfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_putpages }, /* putpages */
{ NULL, NULL }
};
const struct vnodeopv_desc spec_vnodeop_opv_desc =
{ &spec_vnodeop_p, spec_vnodeop_entries };
static kauth_listener_t rawio_listener;
static struct kcondvar specfs_iocv;
/*
* Returns true if vnode is /dev/mem or /dev/kmem.
*/
bool
iskmemvp(struct vnode *vp)
{
	return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev));
}
/*
* Returns true if dev is /dev/mem or /dev/kmem.
*/
int
iskmemdev(dev_t dev)
{
/* mem_no is emitted by config(8) to generated devsw.c */
extern const int mem_no;
/* minor 14 is /dev/io on i386 with COMPAT_10 */
return (major(dev) == mem_no && (minor(dev) < 2 || minor(dev) == 14));
}
static int
rawio_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
result = KAUTH_RESULT_DEFER;
if ((action != KAUTH_DEVICE_RAWIO_SPEC) &&
(action != KAUTH_DEVICE_RAWIO_PASSTHRU))
return result;
/* Access is mandated by permissions. */
result = KAUTH_RESULT_ALLOW;
return result;
}
void
spec_init(void)
{
rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
rawio_listener_cb, NULL);
cv_init(&specfs_iocv, "specio");
}
/*
* spec_io_enter(vp, &sn, &dev)
*
* Enter an operation that may not hold vp's vnode lock or an
* fstrans on vp's mount. Until spec_io_exit, the vnode will not
* be revoked.
*
* On success, set sn to the specnode pointer and dev to the dev_t
* number and return zero. Caller must later call spec_io_exit
* when done.
*
* On failure, return ENXIO -- the device has been revoked and no
* longer exists.
*/
static int
spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
{
dev_t dev;
struct specnode *sn;
unsigned iocnt;
int error = 0;
mutex_enter(vp->v_interlock);
/*
* Extract all the info we need from the vnode, unless the
* vnode has already been reclaimed. This can happen if the
* underlying device has been removed and all the device nodes
* for it have been revoked. The caller may not hold a vnode
* lock or fstrans to prevent this from happening before it has
* had an opportunity to notice the vnode is dead.
*/
	if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
	    (sn = vp->v_specnode) == NULL ||
(dev = vp->v_rdev) == NODEV) {
error = ENXIO;
goto out;
}
/*
* Notify spec_close that we are doing an I/O operation which
	 * may not be bracketed by fstrans(9) and thus is not
* blocked by vfs suspension.
*
* We could hold this reference with psref(9) instead, but we
* already have to take the interlock for vdead_check, so
* there's not much more cost here to another atomic operation.
*/
do {
		iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
		if (__predict_false(iocnt == UINT_MAX)) {
/*
* The I/O count is limited by the number of
* LWPs (which will never overflow this) --
* unless one driver uses another driver via
* specfs, which is rather unusual, but which
* could happen via pud(4) userspace drivers.
* We could use a 64-bit count, but can't use
* atomics for that on all platforms.
* (Probably better to switch to psref or
* localcount instead.)
*/
error = EBUSY;
goto out;
}
} while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
!= iocnt);
/* Success! */
*snp = sn;
*devp = dev;
error = 0;
out: mutex_exit(vp->v_interlock);
return error;
}
/*
* spec_io_exit(vp, sn)
*
* Exit an operation entered with a successful spec_io_enter --
* allow concurrent spec_node_revoke to proceed. The argument sn
 * must match the struct specnode pointer returned by spec_io_enter
* for vp.
*/
static void
spec_io_exit(struct vnode *vp, struct specnode *sn)
{
struct specdev *sd = sn->sn_dev;
unsigned iocnt;
KASSERT(vp->v_specnode == sn);
/*
* We are done. Notify spec_close if appropriate. The
* transition of 1 -> 0 must happen under device_lock so
* spec_close doesn't miss a wakeup.
*/
do {
		iocnt = atomic_load_relaxed(&sd->sd_iocnt);
		KASSERT(iocnt > 0);
if (iocnt == 1) {
mutex_enter(&device_lock);
			if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
				cv_broadcast(&specfs_iocv);
mutex_exit(&device_lock);
break;
}
} while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt);
}
/*
* spec_io_drain(sd)
*
* Wait for all existing spec_io_enter/exit sections to complete.
* Caller must ensure spec_io_enter will fail at this point.
*/
static void
spec_io_drain(struct specdev *sd)
{
/*
* I/O at the same time as closing is unlikely -- it often
* indicates an application bug.
*/
if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0))
return;
mutex_enter(&device_lock);
	while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
		cv_wait(&specfs_iocv, &device_lock);
	mutex_exit(&device_lock);
}
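/*
 * Sketch of the usage pattern, not part of the original source: any
 * devsw operation that runs without the vnode lock brackets itself with
 * spec_io_enter/spec_io_exit, exactly as spec_ioctl, spec_poll,
 * spec_kqfilter, spec_mmap and spec_strategy do below.  The function
 * name here is hypothetical and the block is not compiled.
 */
#if 0
static int
example_devsw_op(struct vnode *vp)
{
	struct specnode *sn;
	dev_t dev;
	int error;

	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		return error;		/* device was revoked */
	/* ... issue cdev_*()/bdev_*() calls on dev here ... */
	error = 0;
	spec_io_exit(vp, sn);
	return error;
}
#endif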
/*
* Initialize a vnode that represents a device.
*/
void
spec_node_init(vnode_t *vp, dev_t rdev)
{
specnode_t *sn;
specdev_t *sd;
vnode_t *vp2;
vnode_t **vpp;
	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
	KASSERT(vp->v_specnode == NULL);
/*
* Search the hash table for this device. If known, add a
* reference to the device structure. If not known, create
* a new entry to represent the device. In all cases add
* the vnode to the hash table.
*/
sn = kmem_alloc(sizeof(*sn), KM_SLEEP);
sd = kmem_alloc(sizeof(*sd), KM_SLEEP);
mutex_enter(&device_lock);
vpp = &specfs_hash[SPECHASH(rdev)];
	for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) {
		KASSERT(vp2->v_specnode != NULL);
		if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) {
break;
}
}
if (vp2 == NULL) {
/* No existing record, create a new one. */
sd->sd_mountpoint = NULL;
sd->sd_lockf = NULL;
sd->sd_refcnt = 1;
sd->sd_opencnt = 0;
sd->sd_bdevvp = NULL;
sd->sd_iocnt = 0;
sd->sd_opened = false;
sd->sd_closing = false;
sn->sn_dev = sd;
sd = NULL;
} else {
/* Use the existing record. */
sn->sn_dev = vp2->v_specnode->sn_dev;
sn->sn_dev->sd_refcnt++;
}
/* Insert vnode into the hash chain. */
sn->sn_opencnt = 0;
sn->sn_rdev = rdev;
sn->sn_gone = false;
vp->v_specnode = sn;
vp->v_specnext = *vpp;
*vpp = vp;
mutex_exit(&device_lock);
/* Free the record we allocated if unused. */
	if (sd != NULL) {
		kmem_free(sd, sizeof(*sd));
}
}
/*
* Lookup a vnode by device number and return it referenced.
*/
int
spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp)
{
int error;
vnode_t *vp;
top: mutex_enter(&device_lock);
	for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (type == vp->v_type && dev == vp->v_rdev) {
mutex_enter(vp->v_interlock);
/* If clean or being cleaned, then ignore it. */
if (vdead_check(vp, VDEAD_NOWAIT) == 0)
break;
if ((flags & VDEAD_NOWAIT) == 0) {
mutex_exit(&device_lock);
/*
* It may be being revoked as we speak,
* and the caller wants to wait until
* all revocation has completed. Let
* vcache_vget wait for it to finish
* dying; as a side effect, vcache_vget
* releases vp->v_interlock. Note that
* vcache_vget cannot succeed at this
* point because vdead_check already
* failed.
*/
error = vcache_vget(vp);
KASSERT(error);
goto top;
}
mutex_exit(vp->v_interlock);
}
}
KASSERT(vp == NULL || mutex_owned(vp->v_interlock));
if (vp == NULL) {
mutex_exit(&device_lock);
return ENOENT;
}
/*
* If it is an opened block device return the opened vnode.
*/
	if (type == VBLK && vp->v_specnode->sn_dev->sd_bdevvp != NULL) {
		mutex_exit(vp->v_interlock);
vp = vp->v_specnode->sn_dev->sd_bdevvp;
mutex_enter(vp->v_interlock);
}
mutex_exit(&device_lock);
error = vcache_vget(vp);
if (error)
return error;
*vpp = vp;
return 0;
}
/*
* Lookup a vnode by file system mounted on and return it referenced.
*/
int
spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp)
{
int i, error;
vnode_t *vp, *vq;
mutex_enter(&device_lock);
for (i = 0, vq = NULL; i < SPECHSZ && vq == NULL; i++) {
for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) {
if (vp->v_type != VBLK)
continue;
vq = vp->v_specnode->sn_dev->sd_bdevvp;
if (vq != NULL &&
vq->v_specnode->sn_dev->sd_mountpoint == mp)
break;
vq = NULL;
}
}
if (vq == NULL) {
mutex_exit(&device_lock);
return ENOENT;
}
mutex_enter(vq->v_interlock);
mutex_exit(&device_lock);
error = vcache_vget(vq);
if (error)
return error;
*vpp = vq;
return 0;
}
/*
* Get the file system mounted on this block device.
*
* XXX Caller should hold the vnode lock -- shared or exclusive -- so
 * that this can't be changed, and the vnode can't be revoked while we
* examine it. But not all callers do, and they're scattered through a
* lot of file systems, so we can't assert this yet.
*/
struct mount *
spec_node_getmountedfs(vnode_t *devvp)
{
struct mount *mp;
KASSERT(devvp->v_type == VBLK);
mp = devvp->v_specnode->sn_dev->sd_mountpoint;
return mp;
}
/*
* Set the file system mounted on this block device.
*
* XXX Caller should hold the vnode lock exclusively so this can't be
* changed or assumed by spec_node_getmountedfs while we change it, and
* the vnode can't be revoked while we handle it. But not all callers
* do, and they're scattered through a lot of file systems, so we can't
* assert this yet. Instead, for now, we'll take an I/O reference so
* at least the ioctl doesn't race with revoke/detach.
*
* If you do change this to assert an exclusive vnode lock, you must
* also do vdead_check before trying bdev_ioctl, because the vnode may
* have been revoked by the time the caller locked it, and this is
* _not_ a vop -- calls to spec_node_setmountedfs don't go through
* v_op, so revoking the vnode doesn't prevent further calls.
*
* XXX Caller should additionally have the vnode open, at least if mp
* is nonnull, but I'm not sure all callers do that -- need to audit.
* Currently udf closes the vnode before clearing the mount.
*/
void
spec_node_setmountedfs(vnode_t *devvp, struct mount *mp)
{
struct dkwedge_info dkw;
struct specnode *sn;
dev_t dev;
int error;
KASSERT(devvp->v_type == VBLK);
error = spec_io_enter(devvp, &sn, &dev);
if (error)
return;
KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL);
sn->sn_dev->sd_mountpoint = mp;
if (mp == NULL)
goto out;
error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp);
if (error)
goto out;
strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname,
sizeof(mp->mnt_stat.f_mntfromlabel));
out: spec_io_exit(devvp, sn);
}
/*
* A vnode representing a special device is going away. Close
* the device if the vnode holds it open.
*/
void
spec_node_revoke(vnode_t *vp)
{
specnode_t *sn;
specdev_t *sd;
struct vnode **vpp;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
sn = vp->v_specnode;
sd = sn->sn_dev;
	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
	KASSERT(vp->v_specnode != NULL);
	KASSERT(sn->sn_gone == false);
mutex_enter(&device_lock);
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sn->sn_gone = true;
if (sn->sn_opencnt != 0) {
sd->sd_opencnt -= (sn->sn_opencnt - 1);
sn->sn_opencnt = 1;
mutex_exit(&device_lock);
VOP_CLOSE(vp, FNONBLOCK, NOCRED);
mutex_enter(&device_lock);
KASSERT(sn->sn_opencnt == 0);
}
/*
* We may have revoked the vnode in this thread while another
* thread was in the middle of spec_close, in the window when
* spec_close releases the vnode lock to call .d_close for the
* last close. In that case, wait for the concurrent
* spec_close to complete.
*/
while (sd->sd_closing)
cv_wait(&specfs_iocv, &device_lock);
/*
* Remove from the hash so lookups stop returning this
* specnode. We will dissociate it from the specdev -- and
* possibly free the specdev -- in spec_node_destroy.
*/
	KASSERT(sn->sn_gone);
	KASSERT(sn->sn_opencnt == 0);
for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];;
	    vpp = &(*vpp)->v_specnext) {
		if (*vpp == vp) {
*vpp = vp->v_specnext;
vp->v_specnext = NULL;
break;
}
}
mutex_exit(&device_lock);
}
/*
* A vnode representing a special device is being recycled.
* Destroy the specfs component.
*/
void
spec_node_destroy(vnode_t *vp)
{
specnode_t *sn;
specdev_t *sd;
int refcnt;
sn = vp->v_specnode;
sd = sn->sn_dev;
	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
	KASSERT(vp->v_specnode != NULL);
	KASSERT(sn->sn_opencnt == 0);
mutex_enter(&device_lock);
sn = vp->v_specnode;
vp->v_specnode = NULL;
refcnt = sd->sd_refcnt--;
KASSERT(refcnt > 0);
mutex_exit(&device_lock);
/* If the device is no longer in use, destroy our record. */
	if (refcnt == 1) {
		KASSERT(sd->sd_iocnt == 0);
		KASSERT(sd->sd_opencnt == 0);
		KASSERT(sd->sd_bdevvp == NULL);
kmem_free(sd, sizeof(*sd));
}
kmem_free(sn, sizeof(*sn));
}
/*
* Trivial lookup routine that always fails.
*/
int
spec_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
*ap->a_vpp = NULL;
return ENOTDIR;
}
typedef int (*spec_ioctl_t)(dev_t, u_long, void *, int, struct lwp *);
/*
* Open a special file.
*/
/* ARGSUSED */
int
spec_open(void *v)
{
struct vop_open_args /* {
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct lwp *l = curlwp;
struct vnode *vp = ap->a_vp;
dev_t dev, dev1;
int error;
enum kauth_device_req req;
specnode_t *sn, *sn1;
specdev_t *sd;
spec_ioctl_t ioctl;
u_int gen = 0;
const char *name = NULL;
bool needclose = false;
struct partinfo pi;
	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
vp->v_type);
dev = vp->v_rdev;
sn = vp->v_specnode;
sd = sn->sn_dev;
/*
* Don't allow open if fs is mounted -nodev.
*/
if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
return ENXIO;
switch (ap->a_mode & (FREAD | FWRITE)) {
case FREAD | FWRITE:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW;
break;
case FWRITE:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE;
break;
default:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ;
break;
}
error = kauth_authorize_device_spec(ap->a_cred, req, vp);
if (error)
return error;
/*
* Acquire an open reference -- as long as we hold onto it, and
* the vnode isn't revoked, it can't be closed, and the vnode
* can't be revoked until we release the vnode lock.
*/
mutex_enter(&device_lock);
KASSERT(!sn->sn_gone);
switch (vp->v_type) {
case VCHR:
/*
* Character devices can accept opens from multiple
* vnodes. But first, wait for any close to finish.
* Wait under the vnode lock so we don't have to worry
* about the vnode being revoked while we wait.
*/
while (sd->sd_closing) {
error = cv_wait_sig(&specfs_iocv, &device_lock);
if (error)
break;
}
if (error)
break;
sd->sd_opencnt++;
sn->sn_opencnt++;
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
break;
case VBLK:
/*
* For block devices, permit only one open. The buffer
* cache cannot remain self-consistent with multiple
* vnodes holding a block device open.
*
* Treat zero opencnt with non-NULL mountpoint as open.
* This may happen after forced detach of a mounted device.
*
* Also treat sd_closing, meaning there is a concurrent
* close in progress, as still open.
*/
if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL ||
sd->sd_closing) {
error = EBUSY;
break;
}
KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u",
sn->sn_opencnt);
sn->sn_opencnt = 1;
sd->sd_opencnt = 1;
sd->sd_bdevvp = vp;
break;
default:
panic("invalid specfs vnode type: %d", vp->v_type);
}
mutex_exit(&device_lock);
if (error)
return error;
/*
* Set VV_ISTTY if this is a tty cdev.
*
* XXX This does the wrong thing if the module has to be
* autoloaded. We should maybe set this after autoloading
* modules and calling .d_open successfully, except (a) we need
* the vnode lock to touch it, and (b) once we acquire the
* vnode lock again, the vnode may have been revoked, and
* deadfs's dead_read needs VV_ISTTY to be already set in order
* to return the right answer. So this needs some additional
* synchronization to be made to work correctly with tty driver
* module autoload. For now, let's just hope it doesn't cause
* too much trouble for a tty from an autoloaded driver module
* to fail with EIO instead of returning EOF.
*/
	if (vp->v_type == VCHR) {
		if (cdev_type(dev) == D_TTY)
			vp->v_vflag |= VV_ISTTY;
}
/*
* Because opening the device may block indefinitely, e.g. when
* opening a tty, and loading a module may cross into many
* other subsystems, we must not hold the vnode lock while
* calling .d_open, so release it now and reacquire it when
* done.
*
* Take an I/O reference so that any concurrent spec_close via
* spec_node_revoke will wait for us to finish calling .d_open.
* The vnode can't be dead at this point because we have it
* locked. Note that if revoked, the driver must interrupt
* .d_open before spec_close starts waiting for I/O to drain so
* this doesn't deadlock.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn1, &dev1);
if (error) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
}
	KASSERT(sn1 == sn);
	KASSERT(dev1 == dev);
/*
* Open the device. If .d_open returns ENXIO (device not
* configured), the driver may not be loaded, so try
* autoloading a module and then try .d_open again if anything
* got loaded.
*/
switch (vp->v_type) {
case VCHR:
do {
const struct cdevsw *cdev;
gen = module_gen;
error = cdev_open(dev, ap->a_mode, S_IFCHR, l);
if (error != ENXIO)
break;
/* Check if we already have a valid driver */
mutex_enter(&device_lock);
cdev = cdevsw_lookup(dev);
mutex_exit(&device_lock);
if (cdev != NULL)
break;
/* Get device name from devsw_conv array */
if ((name = cdevsw_getname(major(dev))) == NULL)
break;
/* Try to autoload device module */
(void)module_autoload(name, MODULE_CLASS_DRIVER);
} while (gen != module_gen);
break;
case VBLK:
do {
const struct bdevsw *bdev;
gen = module_gen;
error = bdev_open(dev, ap->a_mode, S_IFBLK, l);
if (error != ENXIO)
break;
/* Check if we already have a valid driver */
mutex_enter(&device_lock);
bdev = bdevsw_lookup(dev);
mutex_exit(&device_lock);
if (bdev != NULL)
break;
/* Get device name from devsw_conv array */
if ((name = bdevsw_getname(major(dev))) == NULL)
break;
/* Try to autoload device module */
(void)module_autoload(name, MODULE_CLASS_DRIVER);
} while (gen != module_gen);
break;
default:
__unreachable();
}
/*
* Release the I/O reference now that we have called .d_open,
* and reacquire the vnode lock. At this point, the device may
* have been revoked, so we must tread carefully. However, sn
* and sd remain valid pointers until we drop our reference.
*/
spec_io_exit(vp, sn);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(vp->v_specnode == sn);
/*
* If it has been revoked since we released the vnode lock and
* reacquired it, then spec_node_revoke has closed it, and we
* must fail with EBADF.
*
* Otherwise, if opening it failed, back out and release the
* open reference. If it was ever successfully opened and we
* got the last reference this way, it's now our job to close
* it. This might happen in the following scenario:
*
* Thread 1 Thread 2
* VOP_OPEN
* ...
* .d_open -> 0 (success)
* acquire vnode lock
* do stuff VOP_OPEN
* release vnode lock ...
* .d_open -> EBUSY
* VOP_CLOSE
* acquire vnode lock
* --sd_opencnt != 0
* => no .d_close
* release vnode lock
* acquire vnode lock
* --sd_opencnt == 0
*
* We can't resolve this by making spec_close wait for .d_open
* to complete before examining sd_opencnt, because .d_open can
* hang indefinitely, e.g. for a tty.
*/
mutex_enter(&device_lock);
if (sn->sn_gone) {
if (error == 0)
error = EBADF;
} else if (error == 0) {
/*
* Device has not been revoked, so our opencnt can't
* have gone away at this point -- transition to
* sn_gone=true happens before transition to
* sn_opencnt=0 in spec_node_revoke.
*/
		KASSERT(sd->sd_opencnt);
		KASSERT(sn->sn_opencnt);
		KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
KASSERT(!sd->sd_closing);
sd->sd_opened = true;
} else if (sd->sd_opencnt == 1 && sd->sd_opened) {
/*
* We're the last reference to a _previous_ open even
* though this one failed, so we have to close it.
* Don't decrement the reference count here --
* spec_close will do that.
*/
KASSERT(sn->sn_opencnt == 1);
needclose = true;
} else {
		KASSERT(sd->sd_opencnt);
		KASSERT(sn->sn_opencnt);
		KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sd->sd_opencnt--;
sn->sn_opencnt--;
		if (vp->v_type == VBLK)
			sd->sd_bdevvp = NULL;
}
mutex_exit(&device_lock);
/*
* If this open failed, but the device was previously opened,
* and another thread concurrently closed the vnode while we
* were in the middle of reopening it, the other thread will
* see sd_opencnt > 0 and thus decide not to call .d_close --
* it is now our responsibility to do so.
*
* XXX The flags passed to VOP_CLOSE here are wrong, but
* drivers can't rely on FREAD|FWRITE anyway -- e.g., consider
* a device opened by thread 0 with O_READ, then opened by
* thread 1 with O_WRITE, then closed by thread 0, and finally
* closed by thread 1; the last .d_close call will have FWRITE
* but not FREAD. We should just eliminate the FREAD/FWRITE
* parameter to .d_close altogether.
*/
if (needclose) {
KASSERT(error);
VOP_CLOSE(vp, FNONBLOCK, NOCRED);
}
/* If anything went wrong, we're done. */
if (error)
return error;
/*
* For disk devices, automagically set the vnode size to the
* partition size, if we can. This applies to block devices
* and character devices alike -- every block device must have
* a corresponding character device. And if the module is
* loaded it will remain loaded until we're done here (it is
* forbidden to devsw_detach until closed). So it is safe to
* query cdev_type unconditionally here.
*/
if (cdev_type(dev) == D_DISK) {
ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl;
if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0)
uvm_vnp_setsize(vp,
(voff_t)pi.pi_secsize * pi.pi_size);
}
/* Success! */
return 0;
}
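/*
 * The VBLK branch above is visible from userland: while a block device
 * node is open (or holds a mounted file system), a second open fails
 * with EBUSY.  A hedged sketch, not part of the kernel; the device path
 * is only an example and opening it usually requires root.
 */
#if 0
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd1, fd2;

	fd1 = open("/dev/wd0d", O_RDONLY);	/* example block device */
	if (fd1 == -1)
		err(1, "first open");
	fd2 = open("/dev/wd0d", O_RDONLY);
	if (fd2 == -1 && errno == EBUSY)
		printf("second open refused: block devices are exclusive\n");
	else if (fd2 != -1)
		close(fd2);
	close(fd1);
	return 0;
}
#endif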
/*
* Vnode op for read
*/
/* ARGSUSED */
int
spec_read(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
struct specnode *sn;
dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
struct partinfo pi;
int n, on;
int error = 0;
int i, nra;
daddr_t lastbn, *rablks;
int *rasizes;
int nrablks, ratogo;
	KASSERT(uio->uio_rw == UIO_READ);
	KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
uio->uio_vmspace == curproc->p_vmspace),
"vmspace belongs to neither kernel nor curproc");
if (uio->uio_resid == 0)
return 0;
switch (vp->v_type) {
case VCHR:
/*
* Release the lock while we sleep -- possibly
* indefinitely, if this is, e.g., a tty -- in
* cdev_read, so we don't hold up everything else that
* might want access to the vnode.
*
* But before we issue the read, take an I/O reference
* to the specnode so close will know when we're done
* reading. Note that the moment we release the lock,
* the vnode's identity may change; hence spec_io_enter
* may fail, and the caller may have a dead vnode on
* their hands, if the file system on which vp lived
* has been unmounted.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
error = cdev_read(dev, uio, ap->a_ioflag);
spec_io_exit(vp, sn);
out: /* XXX What if the caller held an exclusive lock? */
vn_lock(vp, LK_SHARED | LK_RETRY);
return error;
case VBLK:
		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
		if (uio->uio_offset < 0)
return EINVAL;
		if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
			bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
else
bsize = BLKDEV_IOSIZE;
bscale = bsize >> DEV_BSHIFT;
nra = uimax(16 * MAXPHYS / bsize - 1, 511);
rablks = kmem_alloc(nra * sizeof(*rablks), KM_SLEEP);
rasizes = kmem_alloc(nra * sizeof(*rasizes), KM_SLEEP);
lastbn = ((uio->uio_offset + uio->uio_resid - 1) >> DEV_BSHIFT)
&~ (bscale - 1);
nrablks = ratogo = 0;
do {
bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
on = uio->uio_offset % bsize;
n = uimin((unsigned)(bsize - on), uio->uio_resid);
if (ratogo == 0) {
nrablks = uimin((lastbn - bn) / bscale, nra);
ratogo = nrablks;
for (i = 0; i < nrablks; ++i) {
rablks[i] = bn + (i+1) * bscale;
rasizes[i] = bsize;
}
error = breadn(vp, bn, bsize,
rablks, rasizes, nrablks,
0, &bp);
} else {
if (ratogo > 0)
--ratogo;
error = bread(vp, bn, bsize, 0, &bp);
}
if (error)
break;
n = uimin(n, bsize - bp->b_resid);
error = uiomove((char *)bp->b_data + on, n, uio);
brelse(bp, 0);
} while (error == 0 && uio->uio_resid > 0 && n != 0);
kmem_free(rablks, nra * sizeof(*rablks));
kmem_free(rasizes, nra * sizeof(*rasizes));
return error;
default:
panic("spec_read type");
}
/* NOTREACHED */
}
/*
* Vnode op for write
*/
/* ARGSUSED */
int
spec_write(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
struct specnode *sn;
dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
struct partinfo pi;
int n, on;
int error = 0;
	KASSERT(uio->uio_rw == UIO_WRITE);
	KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
uio->uio_vmspace == curproc->p_vmspace),
"vmspace belongs to neither kernel nor curproc");
switch (vp->v_type) {
case VCHR:
/*
* Release the lock while we sleep -- possibly
* indefinitely, if this is, e.g., a tty -- in
* cdev_write, so we don't hold up everything else that
* might want access to the vnode.
*
* But before we issue the write, take an I/O reference
* to the specnode so close will know when we're done
* writing. Note that the moment we release the lock,
* the vnode's identity may change; hence spec_io_enter
* may fail, and the caller may have a dead vnode on
* their hands, if the file system on which vp lived
* has been unmounted.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
error = cdev_write(dev, uio, ap->a_ioflag);
spec_io_exit(vp, sn);
out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
case VBLK:
		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
		if (uio->uio_resid == 0)
return 0;
if (uio->uio_offset < 0)
return EINVAL;
		if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
			bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
else
bsize = BLKDEV_IOSIZE;
bscale = bsize >> DEV_BSHIFT;
do {
bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
on = uio->uio_offset % bsize;
n = uimin((unsigned)(bsize - on), uio->uio_resid);
if (n == bsize)
bp = getblk(vp, bn, bsize, 0, 0);
else
error = bread(vp, bn, bsize, B_MODIFY, &bp);
if (error) {
return error;
}
n = uimin(n, bsize - bp->b_resid);
error = uiomove((char *)bp->b_data + on, n, uio);
if (error)
brelse(bp, 0);
else {
if (n + on == bsize)
bawrite(bp);
else
bdwrite(bp);
error = bp->b_error;
}
} while (error == 0 && uio->uio_resid > 0 && n != 0);
return error;
default:
panic("spec_write type");
}
/* NOTREACHED */
}
/*
* fdiscard, which on disk devices becomes TRIM.
*/
int
spec_fdiscard(void *v)
{
struct vop_fdiscard_args /* {
struct vnode *a_vp;
off_t a_pos;
off_t a_len;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
dev_t dev;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
dev = vp->v_rdev;
switch (vp->v_type) {
case VCHR:
#if 0 /* This is not stored for character devices. */
KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp);
#endif
return cdev_discard(dev, ap->a_pos, ap->a_len);
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
return bdev_discard(dev, ap->a_pos, ap->a_len);
default:
panic("spec_fdiscard: not a device\n");
}
}
/*
* Device ioctl operation.
*/
/* ARGSUSED */
int
spec_ioctl(void *v)
{
struct vop_ioctl_args /* {
struct vnode *a_vp;
u_long a_command;
void *a_data;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
switch (vp->v_type) {
case VCHR:
error = cdev_ioctl(dev, ap->a_command, ap->a_data,
ap->a_fflag, curlwp);
break;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
error = bdev_ioctl(dev, ap->a_command, ap->a_data,
ap->a_fflag, curlwp);
break;
default:
panic("spec_ioctl");
/* NOTREACHED */
}
spec_io_exit(vp, sn);
return error;
}
/* ARGSUSED */
int
spec_poll(void *v)
{
struct vop_poll_args /* {
struct vnode *a_vp;
int a_events;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int revents;
if (spec_io_enter(vp, &sn, &dev) != 0)
return POLLERR;
switch (vp->v_type) {
case VCHR:
revents = cdev_poll(dev, ap->a_events, curlwp);
break;
default:
revents = genfs_poll(v);
break;
}
spec_io_exit(vp, sn);
return revents;
}
/* ARGSUSED */
int
spec_kqfilter(void *v)
{
struct vop_kqfilter_args /* {
struct vnode *a_vp;
		struct knote *a_kn;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
switch (vp->v_type) {
case VCHR:
error = cdev_kqfilter(dev, ap->a_kn);
break;
default:
/*
* Block devices don't support kqfilter, and refuse it
* for any other files (like those vflush()ed) too.
*/
error = EOPNOTSUPP;
break;
}
spec_io_exit(vp, sn);
return error;
}
/*
* Allow mapping of only D_DISK. This is called only for VBLK.
*/
int
spec_mmap(void *v)
{
struct vop_mmap_args /* {
struct vnode *a_vp;
vm_prot_t a_prot;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
KASSERT(vp->v_type == VBLK);
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
error = bdev_type(dev) == D_DISK ? 0 : EINVAL;
spec_io_exit(vp, sn);
	return error;
}
/*
* Synch buffers associated with a block device
*/
/* ARGSUSED */
int
spec_fsync(void *v)
{
struct vop_fsync_args /* {
struct vnode *a_vp;
kauth_cred_t a_cred;
int a_flags;
off_t offlo;
off_t offhi;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct mount *mp;
int error;
	if (vp->v_type == VBLK) {
		if ((mp = spec_node_getmountedfs(vp)) != NULL) {
error = VFS_FSYNC(mp, vp, ap->a_flags);
if (error != EOPNOTSUPP)
return error;
}
return vflushbuf(vp, ap->a_flags);
}
return 0;
}
/*
* Just call the device strategy routine
*/
int
spec_strategy(void *v)
{
struct vop_strategy_args /* {
struct vnode *a_vp;
struct buf *a_bp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct buf *bp = ap->a_bp;
struct specnode *sn = NULL;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
bp->b_dev = dev;
if (!(bp->b_flags & B_READ)) {
#ifdef DIAGNOSTIC
if (bp->b_vp && bp->b_vp->v_type == VBLK) {
struct mount *mp = spec_node_getmountedfs(bp->b_vp);
if (mp && (mp->mnt_flag & MNT_RDONLY)) {
printf("%s blk %"PRId64" written while ro!\n",
mp->mnt_stat.f_mntonname, bp->b_blkno);
#ifdef DDB
db_stacktrace();
#endif
}
}
#endif /* DIAGNOSTIC */
error = fscow_run(bp, false);
if (error)
goto out;
}
bdev_strategy(bp);
error = 0;
out:	if (sn)
		spec_io_exit(vp, sn);
	if (error) {
		bp->b_error = error;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
return error;
}
int
spec_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
struct bool *a_recycle;
} */ *ap = v;
KASSERT(ap->a_vp->v_mount == dead_rootmount);
*ap->a_recycle = true;
return 0;
}
int
spec_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(vp->v_specnode->sn_opencnt == 0);
VOP_UNLOCK(vp);
KASSERT(vp->v_mount == dead_rootmount);
return 0;
}
/*
* This is a noop, simply returning what one has been given.
*/
int
spec_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
if (ap->a_vpp != NULL)
*ap->a_vpp = ap->a_vp;
if (ap->a_bnp != NULL)
*ap->a_bnp = ap->a_bn;
if (ap->a_runp != NULL)
*ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1;
return 0;
}
/*
* Device close routine
*/
/* ARGSUSED */
int
spec_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct session *sess;
dev_t dev;
int flags = ap->a_fflag;
int mode, error, count;
specnode_t *sn;
specdev_t *sd;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
mutex_enter(vp->v_interlock);
sn = vp->v_specnode;
dev = vp->v_rdev;
sd = sn->sn_dev;
/*
* If we're going away soon, make this non-blocking.
* Also ensures that we won't wedge in vn_lock below.
*/
if (vdead_check(vp, VDEAD_NOWAIT) != 0)
flags |= FNONBLOCK;
mutex_exit(vp->v_interlock);
switch (vp->v_type) {
case VCHR:
/*
* Hack: a tty device that is a controlling terminal
* has a reference from the session structure. We
* cannot easily tell that a character device is a
* controlling terminal, unless it is the closing
* process' controlling terminal. In that case, if the
* open count is 1 release the reference from the
* session. Also, remove the link from the tty back to
* the session and pgrp.
*
* XXX V. fishy.
*/
mutex_enter(&proc_lock);
sess = curlwp->l_proc->p_session;
if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) {
mutex_spin_enter(&tty_lock);
sess->s_ttyvp = NULL;
if (sess->s_ttyp->t_session != NULL) {
sess->s_ttyp->t_pgrp = NULL;
sess->s_ttyp->t_session = NULL;
mutex_spin_exit(&tty_lock);
/* Releases proc_lock. */
proc_sessrele(sess);
} else {
mutex_spin_exit(&tty_lock);
if (sess->s_ttyp->t_pgrp != NULL)
panic("spec_close: spurious pgrp ref"); mutex_exit(&proc_lock);
}
vrele(vp);
} else
mutex_exit(&proc_lock);
/*
* If the vnode is locked, then we are in the midst
	 * of forcibly closing the device, otherwise we only
* close on last reference.
*/
mode = S_IFCHR;
break;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
/*
* On last close of a block device (that isn't mounted)
* we must invalidate any in core blocks, so that
* we can, for instance, change floppy disks.
*/
error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0);
if (error)
return error;
/*
* We do not want to really close the device if it
* is still in use unless we are trying to close it
* forcibly. Since every use (buffer, vnode, swap, cmap)
* holds a reference to the vnode, and because we mark
* any other vnodes that alias this device, when the
* sum of the reference counts on all the aliased
* vnodes descends to one, we are on last close.
*/
mode = S_IFBLK;
break;
default:
panic("spec_close: not special");
}
/*
* Decrement the open reference count of this node and the
* device. For block devices, the open reference count must be
* 1 at this point. If the device's open reference count goes
* to zero, we're the last one out so get the lights.
*
* We may find --sd->sd_opencnt gives zero, and yet
* sd->sd_opened is false. This happens if the vnode is
* revoked at the same time as it is being opened, which can
* happen when opening a tty blocks indefinitely. In that
* case, we still must call close -- it is the job of close to
* interrupt the open. Either way, the device will no
* longer be open, so we have to clear sd->sd_opened; subsequent
* opens will have responsibility for issuing close.
*
* This has the side effect that the sequence of opens might
* happen out of order -- we might end up doing open, open,
* close, close, instead of open, close, open, close. This is
* unavoidable with the current devsw API, where open is
* allowed to block and close must be able to run concurrently
* to interrupt it. It is the driver's responsibility to
* ensure that close is idempotent so that this works. Drivers
* requiring per-open state and exact 1:1 correspondence
* between open and close can use fd_clone.
*/
mutex_enter(&device_lock);
KASSERT(sn->sn_opencnt);
KASSERT(sd->sd_opencnt);
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sn->sn_opencnt--;
count = --sd->sd_opencnt;
if (vp->v_type == VBLK) {
KASSERTMSG(count == 0, "block device with %u opens",
count + 1);
sd->sd_bdevvp = NULL;
}
if (count == 0) {
KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u",
sn->sn_opencnt);
KASSERT(!sd->sd_closing);
sd->sd_opened = false;
sd->sd_closing = true;
}
mutex_exit(&device_lock);
if (count != 0)
return 0;
/*
* If we're able to block, release the vnode lock & reacquire. We
* might end up sleeping for someone else who wants our queues. They
* won't get them if we hold the vnode locked.
*/
if (!(flags & FNONBLOCK))
VOP_UNLOCK(vp);
/*
* If we can cancel all outstanding I/O, then wait for it to
* drain before we call .d_close. Drivers that split up
* .d_cancel and .d_close this way need not have any internal
* mechanism for waiting in .d_close for I/O to drain.
*/
if (vp->v_type == VBLK)
error = bdev_cancel(dev, flags, mode, curlwp);
else
error = cdev_cancel(dev, flags, mode, curlwp);
if (error == 0)
spec_io_drain(sd);
else
KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d",
(unsigned long)dev, error);
if (vp->v_type == VBLK)
error = bdev_close(dev, flags, mode, curlwp);
else
error = cdev_close(dev, flags, mode, curlwp);
/*
* Wait for all other devsw operations to drain. After this
* point, no bdev/cdev_* can be active for this specdev.
*/
spec_io_drain(sd);
/*
* Wake any spec_open calls waiting for close to finish -- do
* this before reacquiring the vnode lock, because spec_open
* holds the vnode lock while waiting, so doing this after
* reacquiring the lock would deadlock.
*/
mutex_enter(&device_lock);
KASSERT(!sd->sd_opened);
KASSERT(sd->sd_closing);
sd->sd_closing = false;
cv_broadcast(&specfs_iocv);
mutex_exit(&device_lock);
if (!(flags & FNONBLOCK))
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
}
/*
* Print out the contents of a special device vnode.
*/
int
spec_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev),
(unsigned long long)minor(ap->a_vp->v_rdev));
return 0;
}
/*
* Return POSIX pathconf information applicable to special devices.
*/
int
spec_pathconf(void *v)
{
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
register_t *a_retval;
} */ *ap = v;
switch (ap->a_name) {
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
return 0;
case _PC_MAX_CANON:
*ap->a_retval = MAX_CANON;
return 0;
case _PC_MAX_INPUT:
*ap->a_retval = MAX_INPUT;
return 0;
case _PC_PIPE_BUF:
*ap->a_retval = PIPE_BUF;
return 0;
case _PC_CHOWN_RESTRICTED:
*ap->a_retval = 1;
return 0;
case _PC_VDISABLE:
*ap->a_retval = _POSIX_VDISABLE;
return 0;
case _PC_SYNC_IO:
*ap->a_retval = 1;
return 0;
default:
return genfs_pathconf(ap);
}
/* NOTREACHED */
}
/*
* Advisory record locking support.
*/
int
spec_advlock(void *v)
{
struct vop_advlock_args /* {
struct vnode *a_vp;
void *a_id;
int a_op;
struct flock *a_fl;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
return lf_advlock(ap, &vp->v_speclockf, (off_t)0);
}
/* $NetBSD: trap.c,v 1.129 2023/10/05 19:41:03 ad Exp $ */
/*
* Copyright (c) 1998, 2000, 2017 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum, and by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)trap.c 7.4 (Berkeley) 5/13/91
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.129 2023/10/05 19:41:03 ad Exp $");
#include "opt_ddb.h"
#include "opt_kgdb.h"
#include "opt_xen.h"
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/acct.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/ras.h>
#include <sys/signal.h>
#include <sys/syscall.h>
#include <sys/cpu.h>
#include <sys/ucontext.h>
#include <sys/module_hook.h>
#include <sys/compat_stub.h>
#include <uvm/uvm_extern.h>
#include <machine/cpufunc.h>
#include <x86/fpu.h>
#include <x86/dbregs.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/trap.h>
#include <machine/userret.h>
#include <machine/db_machdep.h>
#include <x86/nmi.h>
#ifndef XENPV
#include "isa.h"
#endif
#include <sys/kgdb.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
/*
* This is a hook which is initialized by the dtrace module to handle traps
* which might occur during DTrace probe execution.
*/
dtrace_trap_func_t dtrace_trap_func = NULL;
dtrace_doubletrap_func_t dtrace_doubletrap_func = NULL;
#endif
/*
* Module hook for amd64_oosyscall
*/
struct amd64_oosyscall_hook_t amd64_oosyscall_hook;
void nmitrap(struct trapframe *);
void doubletrap(struct trapframe *);
void trap(struct trapframe *);
const char * const trap_type[] = {
"privileged instruction fault", /* 0 T_PRIVINFLT */
"breakpoint trap", /* 1 T_BPTFLT */
"arithmetic trap", /* 2 T_ARITHTRAP */
"asynchronous system trap", /* 3 T_ASTFLT */
"protection fault", /* 4 T_PROTFLT */
"trace trap", /* 5 T_TRCTRAP */
"page fault", /* 6 T_PAGEFLT */
"alignment fault", /* 7 T_ALIGNFLT */
"integer divide fault", /* 8 T_DIVIDE */
"non-maskable interrupt", /* 9 T_NMI */
"overflow trap", /* 10 T_OFLOW */
"bounds check fault", /* 11 T_BOUND */
"FPU not available fault", /* 12 T_DNA */
"double fault", /* 13 T_DOUBLEFLT */
"FPU operand fetch fault", /* 14 T_FPOPFLT */
"invalid TSS fault", /* 15 T_TSSFLT */
"segment not present fault", /* 16 T_SEGNPFLT */
"stack fault", /* 17 T_STKFLT */
"machine check fault", /* 18 T_MCA */
"SSE FP exception", /* 19 T_XMM */
"reserved trap", /* 20 T_RESERVED */
};
int trap_types = __arraycount(trap_type);
#ifdef TRAP_SIGDEBUG
static void sigdebug(const struct trapframe *, const ksiginfo_t *, int);
#define SIGDEBUG(a, b, c) sigdebug(a, b, c)
#else
#define SIGDEBUG(a, b, c)
#endif
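/*
* Redirect a faulting kernel trap frame to a registered fault handler:
* set the saved instruction pointer to 'onfault' and place 'error' in the
* saved %rax, so the handler observes it as a return value.
*/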
static void
onfault_restore(struct trapframe *frame, void *onfault, int error)
{
frame->tf_rip = (uintptr_t)onfault;
frame->tf_rax = error;
}
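/*
* Find the fault handler for the current instruction pointer: prefer an
* explicit pcb_onfault, otherwise scan the extern onfault_table of
* { start, end, handler } entries for one whose range contains the
* faulting PC.
*/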
static void *
onfault_handler(const struct pcb *pcb, const struct trapframe *tf)
{
struct onfault_table {
uintptr_t start;
uintptr_t end;
void *handler;
};
extern const struct onfault_table onfault_table[];
const struct onfault_table *p;
uintptr_t pc;
if (pcb->pcb_onfault != NULL) {
return pcb->pcb_onfault;
}
pc = tf->tf_rip;
for (p = onfault_table; p->start; p++) {
if (p->start <= pc && pc < p->end) {
return p->handler;
}
}
return NULL;
}
static void
trap_print(const struct trapframe *frame, const lwp_t *l)
{
const int type = frame->tf_trapno;
if (frame->tf_trapno < trap_types) {
printf("fatal %s", trap_type[type]);
} else {
printf("unknown trap %d", type);
}
printf(" in %s mode\n", (type & T_USER) ? "user" : "supervisor");
printf("trap type %d code %#lx rip %#lx cs %#lx rflags %#lx cr2 %#lx "
"ilevel %#x rsp %#lx\n",
type, frame->tf_err, (u_long)frame->tf_rip, frame->tf_cs,
frame->tf_rflags, rcr2(), curcpu()->ci_ilevel, frame->tf_rsp);
printf("curlwp %p pid %d.%d lowest kstack %p\n",
l, l->l_proc->p_pid, l->l_lid, KSTACK_LOWEST_ADDR(l));
}
void
nmitrap(struct trapframe *frame)
{
const int type = T_NMI;
if (nmi_dispatch(frame))
return;
/* NMI can be hooked up to a pushbutton for debugging */
if (kgdb_trap(type, frame))
return;
if (kdb_trap(type, 0, frame))
return;
/* machine/parity/power fail/"kitchen sink" faults */
x86_nmi();
}
void
doubletrap(struct trapframe *frame)
{
const int type = T_DOUBLEFLT;
struct lwp *l = curlwp;
trap_print(frame, l);
if (kdb_trap(type, 0, frame))
return;
if (kgdb_trap(type, frame))
return;
panic("double fault");
}
/*
* trap(frame): exception, fault, and trap interface to BSD kernel.
*
* This common code is called from assembly language IDT gate entry routines
* that prepare a suitable stack frame, and restore this frame after the
* exception has been processed. Note that the effect is as if the arguments
* were passed call by reference.
*
* Note that the fpu traps (07 T_DNA, 10 T_ARITHTRAP and 13 T_XMM)
* jump directly into the code in x86/fpu.c so they get processed
* without interrupts being enabled.
*/
void
trap(struct trapframe *frame)
{
struct lwp *l = curlwp;
struct proc *p;
struct pcb *pcb;
extern char kcopy_fault[];
ksiginfo_t ksi;
void *onfault;
int type, error;
uint64_t cr2;
bool pfail;
if (__predict_true(l != NULL)) {
pcb = lwp_getpcb(l);
p = l->l_proc;
} else {
/*
* This can happen, e.g., on breakpoints early in boot.
*/
pcb = NULL;
p = NULL;
}
type = frame->tf_trapno;
if (!KERNELMODE(frame->tf_cs)) {
type |= T_USER;
l->l_md.md_regs = frame;
}
#ifdef KDTRACE_HOOKS
/*
* A trap can occur while DTrace executes a probe. Before
* executing the probe, DTrace blocks re-scheduling and sets
* a flag in its per-cpu flags to indicate that it doesn't
* want to fault. On returning from the probe, the no-fault
* flag is cleared and finally re-scheduling is enabled.
*
* If the DTrace kernel module has registered a trap handler,
* call it and if it returns non-zero, assume that it has
* handled the trap and modified the trap frame so that this
* function can return normally.
*/
if ((type == T_PROTFLT || type == T_PAGEFLT) &&
dtrace_trap_func != NULL) {
if ((*dtrace_trap_func)(frame, type)) {
return;
}
}
#endif
switch (type) {
default:
we_re_toast:
trap_print(frame, l);
if (kdb_trap(type, 0, frame))
return;
if (kgdb_trap(type, frame))
return;
/*
* If this is a breakpoint, don't panic if we're not connected.
*/
if (type == T_BPTFLT && kgdb_disconnected()) {
printf("kgdb: ignored %s\n", trap_type[type]);
return;
}
panic("trap");
/*NOTREACHED*/
case T_PROTFLT:
case T_SEGNPFLT:
case T_ALIGNFLT:
case T_STKFLT:
case T_TSSFLT:
if (p == NULL)
goto we_re_toast;
/* Check for copyin/copyout fault. */
onfault = onfault_handler(pcb, frame);
if (onfault != NULL) {
onfault_restore(frame, onfault, EFAULT);
return;
}
goto we_re_toast;
case T_PROTFLT|T_USER: /* protection fault */
{
int hook_ret;
MODULE_HOOK_CALL(amd64_oosyscall_hook, (p, frame),
ENOSYS, hook_ret);
if (hook_ret == 0) {
/* Do the syscall */
p->p_md.md_syscall(frame);
goto out;
}
}
/* FALLTHROUGH */
case T_TSSFLT|T_USER:
case T_SEGNPFLT|T_USER:
case T_STKFLT|T_USER:
case T_ALIGNFLT|T_USER:
KSI_INIT_TRAP(&ksi);
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)frame->tf_rip;
switch (type) {
case T_SEGNPFLT|T_USER:
case T_STKFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRERR;
break;
case T_TSSFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
break;
case T_ALIGNFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRALN;
break;
case T_PROTFLT|T_USER:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_ACCERR;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_PRIVINFLT|T_USER: /* privileged instruction fault */
case T_FPOPFLT|T_USER: /* coprocessor operand fault */
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGILL;
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *) frame->tf_rip;
switch (type) {
case T_PRIVINFLT|T_USER:
ksi.ksi_code = ILL_PRVOPC;
break;
case T_FPOPFLT|T_USER:
ksi.ksi_code = ILL_COPROC;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_ASTFLT|T_USER:
/* Allow process switch. */
//curcpu()->ci_data.cpu_nast++;
if (l->l_pflag & LP_OWEUPC) {
l->l_pflag &= ~LP_OWEUPC;
ADDUPROF(l);
}
goto out;
case T_BOUND|T_USER:
case T_OFLOW|T_USER:
case T_DIVIDE|T_USER:
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGFPE;
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)frame->tf_rip;
switch (type) {
case T_BOUND|T_USER:
ksi.ksi_code = FPE_FLTSUB;
break;
case T_OFLOW|T_USER:
ksi.ksi_code = FPE_INTOVF;
break;
case T_DIVIDE|T_USER:
ksi.ksi_code = FPE_INTDIV;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_PAGEFLT:
/* Allow page faults in kernel mode. */
if (__predict_false(l == NULL))
goto we_re_toast;
onfault = pcb->pcb_onfault;
if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
goto we_re_toast;
}
cr2 = rcr2();
if (frame->tf_err & PGEX_I) {
/* SMEP might have brought us here */
if (cr2 < VM_MAXUSER_ADDRESS) {
printf("prevented execution of %p (SMEP)\n",
(void *)cr2);
goto we_re_toast;
}
}
if ((frame->tf_err & PGEX_P) &&
cr2 < VM_MAXUSER_ADDRESS) {
/* SMAP might have brought us here */
if (onfault_handler(pcb, frame) == NULL) {
printf("prevented access to %p (SMAP)\n",
(void *)cr2);
goto we_re_toast;
}
}
goto pagefltcommon;
case T_PAGEFLT|T_USER: {
register vaddr_t va;
register struct vmspace *vm;
register struct vm_map *map;
vm_prot_t ftype;
extern struct vm_map *kernel_map;
cr2 = rcr2();
if (p->p_emul->e_usertrap != NULL &&
(*p->p_emul->e_usertrap)(l, cr2, frame) != 0)
return;
pagefltcommon:
vm = p->p_vmspace;
if (__predict_false(vm == NULL)) {
goto we_re_toast;
}
pcb->pcb_cr2 = cr2;
va = trunc_page((vaddr_t)cr2);
/*
* It is only a kernel address space fault iff:
* 1. (type & T_USER) == 0 and
* 2. pcb_onfault not set or
* 3. pcb_onfault set but supervisor space fault
* The last can occur during an exec() copyin where the
* argument space is lazy-allocated.
*/
if (type == T_PAGEFLT && va >= VM_MIN_KERNEL_ADDRESS)
map = kernel_map;
else
map = &vm->vm_map;
if (frame->tf_err & PGEX_W)
ftype = VM_PROT_WRITE;
else if (frame->tf_err & PGEX_I)
ftype = VM_PROT_EXECUTE;
else
ftype = VM_PROT_READ;
#ifdef DIAGNOSTIC
if (map == kernel_map && va == 0) {
printf("trap: bad kernel access at %lx\n", va);
goto we_re_toast;
}
#endif
/* Fault the original page in. */
onfault = pcb->pcb_onfault;
pcb->pcb_onfault = NULL;
error = uvm_fault(map, va, ftype);
pcb->pcb_onfault = onfault;
if (error == 0) {
if (map != kernel_map && (void *)va >= vm->vm_maxsaddr)
uvm_grow(p, va);
pfail = false;
while (type == T_PAGEFLT) {
/*
* we need to switch pmap now if we're in
* the middle of copyin/out.
*
* but we don't need to do so for kcopy as
* it never touches userspace.
*/
kpreempt_disable();
if (curcpu()->ci_want_pmapload) {
onfault = onfault_handler(pcb, frame);
if (onfault != kcopy_fault) {
pmap_load();
}
}
/*
* We need to keep the pmap loaded and
* so avoid being preempted until back
* into the copy functions. Disable
* interrupts at the hardware level before
* re-enabling preemption. Interrupts
* will be re-enabled by 'iret' when
* returning back out of the trap stub.
* They'll only be re-enabled when the
* program counter is once again in
* the copy functions, and so visible
* to cpu_kpreempt_exit().
*/
#ifndef XENPV
x86_disable_intr();
#endif
l->l_nopreempt--;
if (l->l_nopreempt > 0 || !l->l_dopreempt ||
pfail) {
return;
}
#ifndef XENPV
x86_enable_intr();
#endif
/*
* If preemption fails for some reason,
* don't retry it. The conditions won't
* change under our nose.
*/
pfail = kpreempt(0);
}
goto out;
}
if (type == T_PAGEFLT) {
onfault = onfault_handler(pcb, frame);
if (onfault != NULL) {
onfault_restore(frame, onfault, error);
return;
}
printf("uvm_fault(%p, 0x%lx, %d) -> %x\n",
map, va, ftype, error);
goto we_re_toast;
}
KSI_INIT_TRAP(&ksi);
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)cr2;
switch (error) {
case EINVAL:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRERR;
break;
case EACCES:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_ACCERR;
error = EFAULT;
break;
case ENOMEM:
ksi.ksi_signo = SIGKILL;
printf("UVM: pid %d.%d (%s), uid %d killed: "
"out of swap\n", p->p_pid, l->l_lid, p->p_comm,
l->l_cred ? kauth_cred_geteuid(l->l_cred) : -1);
break;
default:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_MAPERR;
break;
}
SIGDEBUG(frame, &ksi, error);
(*p->p_emul->e_trapsignal)(l, &ksi);
break;
}
case T_TRCTRAP:
/*
* Ignore debug register trace traps due to
* accesses in the user's address space, which
* can happen under several conditions such as
* if a user sets a watchpoint on a buffer and
* then passes that buffer to a system call.
* We still want to get TRCTRAPS for addresses
* in kernel space because that is useful when
* debugging the kernel.
*/
if (x86_dbregs_user_trap())
break;
goto we_re_toast;
case T_BPTFLT|T_USER: /* bpt instruction fault */
case T_TRCTRAP|T_USER: /* trace trap */
/*
* Don't go single-stepping into a RAS.
*/
if (p->p_raslist == NULL ||
(ras_lookup(p, (void *)frame->tf_rip) == (void *)-1)) {
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGTRAP;
ksi.ksi_trap = type & ~T_USER;
if (x86_dbregs_user_trap()) {
x86_dbregs_store_dr6(l);
ksi.ksi_code = TRAP_DBREG;
} else if (type == (T_BPTFLT|T_USER))
ksi.ksi_code = TRAP_BRKPT;
else
ksi.ksi_code = TRAP_TRACE;
(*p->p_emul->e_trapsignal)(l, &ksi);
}
break;
}
if ((type & T_USER) == 0)
return;
out:
userret(l);
return;
trapsignal:
SIGDEBUG(frame, &ksi, 0);
(*p->p_emul->e_trapsignal)(l, &ksi);
userret(l);
}
/*
* startlwp: start of a new LWP.
*/
void
startlwp(void *arg)
{
ucontext_t *uc = arg;
lwp_t *l = curlwp;
int error __diagused;
error = cpu_setmcontext(l, &uc->uc_mcontext, uc->uc_flags);
KASSERT(error == 0);
kmem_free(uc, sizeof(ucontext_t));
userret(l);
}
#ifdef TRAP_SIGDEBUG
static void
frame_dump(const struct trapframe *tf, struct pcb *pcb)
{
printf("trapframe %p\n", tf);
printf("rip %#018lx rsp %#018lx rfl %#018lx\n",
tf->tf_rip, tf->tf_rsp, tf->tf_rflags);
printf("rdi %#018lx rsi %#018lx rdx %#018lx\n",
tf->tf_rdi, tf->tf_rsi, tf->tf_rdx);
printf("rcx %#018lx r8 %#018lx r9 %#018lx\n",
tf->tf_rcx, tf->tf_r8, tf->tf_r9);
printf("r10 %#018lx r11 %#018lx r12 %#018lx\n",
tf->tf_r10, tf->tf_r11, tf->tf_r12);
printf("r13 %#018lx r14 %#018lx r15 %#018lx\n",
tf->tf_r13, tf->tf_r14, tf->tf_r15);
printf("rbp %#018lx rbx %#018lx rax %#018lx\n",
tf->tf_rbp, tf->tf_rbx, tf->tf_rax);
printf("cs %#04lx ds %#04lx es %#04lx "
"fs %#04lx gs %#04lx ss %#04lx\n",
tf->tf_cs & 0xffff, tf->tf_ds & 0xffff, tf->tf_es & 0xffff,
tf->tf_fs & 0xffff, tf->tf_gs & 0xffff, tf->tf_ss & 0xffff);
printf("fsbase %#018lx gsbase %#018lx\n", pcb->pcb_fs, pcb->pcb_gs);
printf("\n");
hexdump(printf, "Stack dump", tf, 256);
}
static void
sigdebug(const struct trapframe *tf, const ksiginfo_t *ksi, int e)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
printf("pid %d.%d (%s): signal %d code=%d (trap %#lx) "
"@rip %#lx addr %#lx error=%d\n",
p->p_pid, l->l_lid, p->p_comm, ksi->ksi_signo, ksi->ksi_code,
tf->tf_trapno, tf->tf_rip, rcr2(), e);
frame_dump(tf, lwp_getpcb(l));
}
#endif
/* $NetBSD: strlcpy.c,v 1.3 2007/06/04 18:19:27 christos Exp $ */
/* $OpenBSD: strlcpy.c,v 1.7 2003/04/12 21:56:39 millert Exp $ */
/*
* Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND TODD C. MILLER DISCLAIMS ALL
* WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL TODD C. MILLER BE LIABLE
* FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if !defined(_KERNEL) && !defined(_STANDALONE)
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strlcpy.c,v 1.3 2007/06/04 18:19:27 christos Exp $");
#endif /* LIBC_SCCS and not lint */
#ifdef _LIBC
#include "namespace.h"
#endif
#include <sys/types.h>
#include <assert.h>
#include <string.h>
#ifdef _LIBC
# ifdef __weak_alias
__weak_alias(strlcpy, _strlcpy)
# endif
#endif
#else
#include <lib/libkern/libkern.h>
#endif /* !_KERNEL && !_STANDALONE */
#if !HAVE_STRLCPY
/*
* Copy src to string dst of size siz. At most siz-1 characters
* will be copied. Always NUL terminates (unless siz == 0).
* Returns strlen(src); if retval >= siz, truncation occurred.
*/
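/*
* Illustrative usage (a sketch only; 'name' and 'input' are hypothetical
* and do not appear elsewhere in this file):
*
*	char name[16];
*	if (strlcpy(name, input, sizeof(name)) >= sizeof(name))
*		return ENAMETOOLONG;
*
* A return value >= sizeof(name) means 'input' was truncated.
*/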
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
char *d = dst;
const char *s = src;
size_t n = siz;
_DIAGASSERT(dst != NULL);
_DIAGASSERT(src != NULL);
/* Copy as many bytes as will fit */
if (n != 0 && --n != 0) {
do {
if ((*d++ = *s++) == 0)
break;
} while (--n != 0);
}
/* Not enough room in dst, add NUL and traverse rest of src */
if (n == 0) {
if (siz != 0)
*d = '\0'; /* NUL-terminate dst */
while (*s++)
;
}
return(s - src - 1); /* count does not include NUL */
}
#endif
/* $NetBSD: rtbl.c,v 1.7 2017/06/01 02:45:14 chs Exp $ */
/*-
* Copyright (c) 1998, 2008, 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Kevin M. Lahey of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.3 (Berkeley) 1/9/95
*/
#if defined(_KERNEL) && defined(_KERNEL_OPT)
#include "opt_route.h"
#endif /* _KERNEL && _KERNEL_OPT */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtbl.c,v 1.7 2017/06/01 02:45:14 chs Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/pool.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/raw_cb.h>
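/*
* Per-address-family routing tables, indexed by sa_family_t. They are
* attached by rtbl_init() via each domain's dom_rtattach hook, and the
* rt_*/rtbl_* functions below are thin wrappers around the radix trie
* (rn_*) interface.
*/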
static rtbl_t *rt_tables[AF_MAX+1];
int
rt_inithead(rtbl_t **tp, int off)
{
rtbl_t *t;
if (*tp != NULL)
return 1;
t = kmem_alloc(sizeof(*t), KM_SLEEP);
*tp = t;
return rn_inithead0(&t->t_rnh, off);
}
struct rtentry *
rt_matchaddr(rtbl_t *t, const struct sockaddr *dst)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
return NULL;
return (struct rtentry *)rn;
}
int
rt_addaddr(rtbl_t *t, struct rtentry *rt, const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_addaddr(rt_getkey(rt), netmask, rnh, rt->rt_nodes);
return (rn == NULL) ? EEXIST : 0;
}
struct rtentry *
rt_lookup(rtbl_t *t, const struct sockaddr *dst, const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_lookup(dst, netmask, rnh);
if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
return NULL;
return (struct rtentry *)rn;
}
struct rtentry *
rt_deladdr(rtbl_t *t, const struct sockaddr *dst,
const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL)
return NULL;
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("%s", __func__);
return (struct rtentry *)rn;
}
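/*
* Adapt the radix-node walker callback to the rtentry-based callback
* recorded in struct rtwalk by rtbl_walktree() and
* rtbl_search_matched_entry() below.
*/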
static int
rt_walktree_visitor(struct radix_node *rn, void *v)
{
struct rtwalk *rw = (struct rtwalk *)v;
return (*rw->rw_f)((struct rtentry *)rn, rw->rw_v);
}
int
rtbl_walktree(sa_family_t family, int (*f)(struct rtentry *, void *), void *v)
{
rtbl_t *t = rt_tables[family];
struct rtwalk rw;
if (t == NULL)
return 0;
rw.rw_f = f;
rw.rw_v = v;
return rn_walktree(&t->t_rnh, rt_walktree_visitor, &rw);
}
struct rtentry *
rtbl_search_matched_entry(sa_family_t family,
int (*f)(struct rtentry *, void *), void *v)
{
rtbl_t *t = rt_tables[family];
struct rtwalk rw;
if (t == NULL)
return 0;
rw.rw_f = f;
rw.rw_v = v;
return (struct rtentry *)
rn_search_matched(&t->t_rnh, rt_walktree_visitor, &rw);
}
rtbl_t *
rt_gettable(sa_family_t af)
{
if (af >= __arraycount(rt_tables))
return NULL;
return rt_tables[af];
}
void
rtbl_init(void)
{
struct domain *dom;
DOMAIN_FOREACH(dom)
if (dom->dom_rtattach)
dom->dom_rtattach(&rt_tables[dom->dom_family],
dom->dom_rtoffset);
}
void
rt_assert_inactive(const struct rtentry *rt)
{
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic ("rtfree 2");
}
int
rt_refines(const struct sockaddr *m_sa, const struct sockaddr *n_sa)
{
return rn_refines(m_sa, n_sa);
}
/* $NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $ */
/*-
* Copyright (c) 2008, 2009, 2021 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
* Copyright (c) 2009 Apple, Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
*/
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif /* _KERNEL_OPT */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/wait.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/conf.h>
#include <sys/atomic.h>
static int kqueue_scan(file_t *, size_t, struct kevent *,
const struct timespec *, register_t *,
const struct kevent_ops *, struct kevent *,
size_t);
static int kqueue_ioctl(file_t *, u_long, void *);
static int kqueue_fcntl(file_t *, u_int, void *);
static int kqueue_poll(file_t *, int);
static int kqueue_kqfilter(file_t *, struct knote *);
static int kqueue_stat(file_t *, struct stat *);
static int kqueue_close(file_t *);
static void kqueue_restart(file_t *);
static int kqueue_fpathconf(file_t *, int, register_t *);
static int kqueue_register(struct kqueue *, struct kevent *);
static void kqueue_doclose(struct kqueue *, struct klist *, int);
static void knote_detach(struct knote *, filedesc_t *fdp, bool);
static void knote_enqueue(struct knote *);
static void knote_activate(struct knote *);
static void knote_activate_locked(struct knote *);
static void knote_deactivate_locked(struct knote *);
static void filt_kqdetach(struct knote *);
static int filt_kqueue(struct knote *, long hint);
static int filt_procattach(struct knote *);
static void filt_procdetach(struct knote *);
static int filt_proc(struct knote *, long hint);
static int filt_fileattach(struct knote *);
static void filt_timerexpire(void *x);
static int filt_timerattach(struct knote *);
static void filt_timerdetach(struct knote *);
static int filt_timer(struct knote *, long hint);
static int filt_timertouch(struct knote *, struct kevent *, long type);
static int filt_userattach(struct knote *);
static void filt_userdetach(struct knote *);
static int filt_user(struct knote *, long hint);
static int filt_usertouch(struct knote *, struct kevent *, long type);
/*
* Private knote state that should never be exposed outside
* of kern_event.c
*
* Field locking:
*
* q kn_kq->kq_lock
*/
struct knote_impl {
struct knote ki_knote;
unsigned int ki_influx; /* q: in-flux counter */
kmutex_t ki_foplock; /* for kn_filterops */
};
#define KIMPL_TO_KNOTE(kip) (&(kip)->ki_knote)
#define KNOTE_TO_KIMPL(knp) container_of((knp), struct knote_impl, ki_knote)
static inline struct knote *
knote_alloc(bool sleepok)
{
struct knote_impl *ki;
ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
return KIMPL_TO_KNOTE(ki);
}
static inline void
knote_free(struct knote *kn)
{
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
mutex_destroy(&ki->ki_foplock);
kmem_free(ki, sizeof(*ki));
}
static inline void
knote_foplock_enter(struct knote *kn)
{
mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static inline void
knote_foplock_exit(struct knote *kn)
{
mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static inline bool __diagused
knote_foplock_owned(struct knote *kn)
{
return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static const struct fileops kqueueops = {
.fo_name = "kqueue",
.fo_read = (void *)enxio,
.fo_write = (void *)enxio,
.fo_ioctl = kqueue_ioctl,
.fo_fcntl = kqueue_fcntl,
.fo_poll = kqueue_poll,
.fo_stat = kqueue_stat,
.fo_close = kqueue_close,
.fo_kqfilter = kqueue_kqfilter,
.fo_restart = kqueue_restart,
.fo_fpathconf = kqueue_fpathconf,
};
static void
filt_nopdetach(struct knote *kn __unused)
{
}
static int
filt_nopevent(struct knote *kn __unused, long hint __unused)
{
return 0;
}
static const struct filterops nop_fd_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_nopdetach,
.f_event = filt_nopevent,
};
static const struct filterops nop_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_nopdetach,
.f_event = filt_nopevent,
};
static const struct filterops kqread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_kqdetach,
.f_event = filt_kqueue,
};
static const struct filterops proc_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_procattach,
.f_detach = filt_procdetach,
.f_event = filt_proc,
};
/*
* file_filtops is not marked MPSAFE because it's going to call
* fileops::fo_kqfilter(), which might not be. That function,
* however, will override the knote's filterops, and thus will
* inherit the MPSAFE-ness of the back-end at that time.
*/
static const struct filterops file_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = filt_fileattach,
.f_detach = NULL,
.f_event = NULL,
};
static const struct filterops timer_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_timerattach,
.f_detach = filt_timerdetach,
.f_event = filt_timer,
.f_touch = filt_timertouch,
};
static const struct filterops user_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_userattach,
.f_detach = filt_userdetach,
.f_event = filt_user,
.f_touch = filt_usertouch,
};
static u_int kq_ncallouts = 0;
static int kq_calloutmax = (4 * 1024);
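/*
* KN_HASH() folds the second byte of the identifier into the low bits
* before masking, so ids that differ only in higher bits still spread
* across the KN_HASHSIZE buckets.
*/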
#define KN_HASHSIZE 64 /* XXX should be tunable */
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
extern const struct filterops fs_filtops; /* vfs_syscalls.c */
extern const struct filterops sig_filtops; /* kern_sig.c */
/*
* Table for all system-defined filters.
* These should be listed in the numeric order of the EVFILT_* defines.
* If filtops is NULL, the filter isn't implemented in NetBSD.
* End of list is when name is NULL.
*
* Note that 'refcnt' is meaningless for built-in filters.
*/
struct kfilter {
const char *name; /* name of filter */
uint32_t filter; /* id of filter */
unsigned refcnt; /* reference count */
const struct filterops *filtops;/* operations for filter */
size_t namelen; /* length of name string */
};
/* System defined filters */
static struct kfilter sys_kfilters[] = {
{ "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 },
{ "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, },
{ "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 },
{ "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 },
{ "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 },
{ "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 },
{ "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 },
{ "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 },
{ "EVFILT_USER", EVFILT_USER, 0, &user_filtops, 0 },
{ "EVFILT_EMPTY", EVFILT_EMPTY, 0, &file_filtops, 0 },
{ NULL, 0, 0, NULL, 0 },
};
/* User defined kfilters */
static struct kfilter *user_kfilters; /* array */
static int user_kfilterc; /* current offset */
static int user_kfiltermaxc; /* max size so far */
static size_t user_kfiltersz; /* size of allocated memory */
/*
* Global Locks.
*
* Lock order:
*
* kqueue_filter_lock
* -> kn_kq->kq_fdp->fd_lock
* -> knote foplock (if taken)
* -> object lock (e.g., device driver lock, &c.)
* -> kn_kq->kq_lock
*
* Locking rules. ==> indicates the lock is acquired by the backing
* object, locks prior are acquired before calling filter ops:
*
* f_attach: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
*
* f_detach: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
*
* f_event via kevent: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
* N.B. NOTE_SUBMIT will never be set in the "hint" argument
* in this case.
*
* f_event via knote (via backing object): whatever the caller guarantees.
* Typically:
* f_event(NOTE_SUBMIT): caller has already acquired backing
* object lock.
* f_event(!NOTE_SUBMIT): caller has not acquired the backing object
* lock, or has possibly acquired KERNEL_LOCK. Backing object
* lock may or may not be acquired as-needed.
* N.B. the knote foplock will **not** be acquired in this case. The
* caller guarantees that klist_fini() will not be called concurrently
* with knote().
*
* f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
* N.B. knote foplock is **not** acquired in this case and
* the caller must guarantee that klist_fini() will never
* be called. kevent_register() restricts filters that
* provide f_touch to known-safe cases.
*
* klist_fini(): Caller must guarantee that no more knotes can
* be attached to the klist, and must **not** hold the backing
* object's lock; klist_fini() itself will acquire the foplock
* of each knote on the klist.
*
* Locking rules when detaching knotes:
*
* There are some situations where knote submission may require dropping
* locks (see knote_proc_fork()). In order to support this, it's possible
* to mark a knote as being 'in-flux'. Such a knote is guaranteed not to
* be detached while it remains in-flux. Because it will not be detached,
* locks can be dropped so e.g. memory can be allocated, locks on other
* data structures can be acquired, etc. During this time, any attempt to
* detach an in-flux knote must wait until the knote is no longer in-flux.
* When this happens, the knote is marked for death (KN_WILLDETACH) and the
* LWP who gets to finish the detach operation is recorded in the knote's
* 'udata' field (which is no longer required for its original purpose once
* a knote is so marked). Code paths that lead to knote_detach() must ensure
* that their LWP is the one tasked with its final demise after waiting for
* the in-flux status of the knote to clear. Note that once a knote is
* marked KN_WILLDETACH, no code paths may put it into an in-flux state.
*
* Once the special circumstances have been handled, the locks are re-
* acquired in the proper order (object lock -> kq_lock), the knote taken
* out of flux, and any waiters are notified. Because waiters must have
* also dropped *their* locks in order to safely block, they must re-
* validate all of their assumptions; see knote_detach_quiesce(). See also
* the kqueue_register() (EV_ADD, EV_DELETE) and kqueue_scan() (EV_ONESHOT)
* cases.
*
* When kqueue_scan() encounters an in-flux knote, the situation is
* treated like another LWP's list marker.
*
* LISTEN WELL: It is important to not hold knotes in flux for an
* extended period of time! In-flux knotes effectively block any
* progress of the kqueue_scan() operation. Any code paths that place
* knotes in-flux should be careful to not block for indefinite periods
* of time, such as for memory allocation (i.e. KM_NOSLEEP is OK, but
* KM_SLEEP is not).
*/
static krwlock_t kqueue_filter_lock; /* lock on filter lists */
#define KQ_FLUX_WAIT(kq) (void)cv_wait(&kq->kq_cv, &kq->kq_lock)
#define KQ_FLUX_WAKEUP(kq) cv_broadcast(&kq->kq_cv)
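/*
* Helpers for the per-knote in-flux counter described above. All of them
* require kn_kq->kq_lock to be held; kn_enter_flux() refuses to put a
* knote that is already marked KN_WILLDETACH into flux.
*/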
static inline bool
kn_in_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
}
static inline bool
kn_enter_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
if (kn->kn_status & KN_WILLDETACH) {
return false;
}
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
KASSERT(ki->ki_influx < UINT_MAX);
ki->ki_influx++;
return true;
}
static inline bool
kn_leave_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
KASSERT(ki->ki_influx > 0);
ki->ki_influx--;
return ki->ki_influx == 0;
}
static void
kn_wait_flux(struct knote *kn, bool can_loop)
{
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
bool loop;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
/*
* It may not be safe for us to touch the knote again after
* dropping the kq_lock. The caller has let us know in
* 'can_loop'.
*/
for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
KQ_FLUX_WAIT(kn->kn_kq);
}
}
#define KNOTE_WILLDETACH(kn) \
do { \
(kn)->kn_status |= KN_WILLDETACH; \
(kn)->kn_kevent.udata = curlwp; \
} while (/*CONSTCOND*/0)
/*
* Wait until the specified knote is in a quiescent state and
* safe to detach. Returns true if we potentially blocked (and
* thus dropped our locks).
*/
static bool
knote_detach_quiesce(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
filedesc_t *fdp = kq->kq_fdp;
KASSERT(mutex_owned(&fdp->fd_lock));
mutex_spin_enter(&kq->kq_lock);
/*
* There are two cases where we might see KN_WILLDETACH here:
*
* 1. Someone else has already started detaching the knote but
* had to wait for it to settle first.
*
* 2. We had to wait for it to settle, and had to come back
* around after re-acquiring the locks.
*
* When KN_WILLDETACH is set, we also set the LWP that claimed
* the prize of finishing the detach in the 'udata' field of the
* knote (which will never be used again for its usual purpose
* once the note is in this state). If it doesn't point to us,
* we must drop the locks and let them in to finish the job.
*
* Otherwise, once we have claimed the knote for ourselves, we
* can finish waiting for it to settle. This is the only scenario
* where touching a detaching knote is safe after dropping the
* locks.
*/
if ((kn->kn_status & KN_WILLDETACH) != 0 &&
kn->kn_kevent.udata != curlwp) {
/*
* N.B. it is NOT safe for us to touch the knote again
* after dropping the locks here. The caller must go
* back around and re-validate everything. However, if
* the knote is in-flux, we want to block to minimize
* busy-looping.
*/
mutex_exit(&fdp->fd_lock);
if (kn_in_flux(kn)) {
kn_wait_flux(kn, false);
mutex_spin_exit(&kq->kq_lock);
return true;
}
mutex_spin_exit(&kq->kq_lock);
preempt_point();
return true;
}
/*
* If we get here, we know that we will be claiming the
* detach responsibilities, or that we already have and
* this is the second attempt after re-validation.
*/
KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
kn->kn_kevent.udata == curlwp);
/*
* Similarly, if we get here, either we are just claiming it
* and may have to wait for it to settle, or if this is the
* second attempt after re-validation that no other code paths
* have put it in-flux.
*/
KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
kn_in_flux(kn) == false);
KNOTE_WILLDETACH(kn);
if (kn_in_flux(kn)) {
mutex_exit(&fdp->fd_lock);
kn_wait_flux(kn, true);
/*
* It is safe for us to touch the knote again after
* dropping the locks, but the caller must still
* re-validate everything because other aspects of
* the environment may have changed while we blocked.
*/
KASSERT(kn_in_flux(kn) == false);
mutex_spin_exit(&kq->kq_lock);
return true;
}
mutex_spin_exit(&kq->kq_lock);
return false;
}
/*
* Calls into the filterops need to be resilient against things which
* destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
* chasing garbage pointers (to data, or even potentially code in a
* module about to be unloaded). To that end, we acquire the
* knote foplock before calling into the filter ops. When a driver
* (or anything else) is tearing down its klist, klist_fini() enumerates
* each knote, acquires its foplock, and replaces the filterops with a
* nop stub, allowing knote detach (when descriptors are closed) to safely
* proceed.
*/
static int
filter_attach(struct knote *kn)
{
int rv;
KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_attach != NULL);
/*
* N.B. that kn->kn_fop may change as the result of calling
* f_attach(). After f_attach() returns, kn->kn_fop may not
* be modified by code outside of klist_fini().
*/
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
rv = kn->kn_fop->f_attach(kn);
} else {
KERNEL_LOCK(1, NULL);
rv = kn->kn_fop->f_attach(kn);
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
static void
filter_detach(struct knote *kn)
{
KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_detach != NULL);
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
kn->kn_fop->f_detach(kn);
} else {
KERNEL_LOCK(1, NULL);
kn->kn_fop->f_detach(kn);
KERNEL_UNLOCK_ONE(NULL);
}
}
static int
filter_event(struct knote *kn, long hint, bool submitting)
{
int rv;
/* See knote(). */
KASSERT(submitting || knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_event != NULL);
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
rv = kn->kn_fop->f_event(kn, hint);
} else {
KERNEL_LOCK(1, NULL);
rv = kn->kn_fop->f_event(kn, hint);
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
static int
filter_touch(struct knote *kn, struct kevent *kev, long type)
{
/*
* XXX We cannot assert that the knote foplock is held here
* XXX because we cannot safely acquire it in all cases
* XXX where "touch" will be used in kqueue_scan(). We just
* XXX have to assume that f_touch will always be safe to call,
* XXX and kqueue_register() allows only the two known-safe
* XXX users of that op.
*/
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_touch != NULL);
return kn->kn_fop->f_touch(kn, kev, type);
}
static kauth_listener_t kqueue_listener;
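/*
* kauth(9) listener for KAUTH_PROCESS_KEVENT_FILTER: allow attaching an
* EVFILT_PROC filter when the target process has the requester's uid and
* has not gained privileges through a set-id exec; otherwise defer the
* decision to other listeners.
*/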
static int
kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
if (action != KAUTH_PROCESS_KEVENT_FILTER)
return result;
if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
ISSET(p->p_flag, PK_SUGID)))
return result;
result = KAUTH_RESULT_ALLOW;
return result;
}
/*
* Initialize the kqueue subsystem.
*/
void
kqueue_init(void)
{
rw_init(&kqueue_filter_lock);
kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
kqueue_listener_cb, NULL);
}
/*
* Find kfilter entry by name, or NULL if not found.
*/
static struct kfilter *
kfilter_byname_sys(const char *name)
{
int i;
KASSERT(rw_lock_held(&kqueue_filter_lock));
for (i = 0; sys_kfilters[i].name != NULL; i++) {
if (strcmp(name, sys_kfilters[i].name) == 0)
return &sys_kfilters[i];
}
return NULL;
}
static struct kfilter *
kfilter_byname_user(const char *name)
{
int i;
KASSERT(rw_lock_held(&kqueue_filter_lock));
/* user filter slots have a NULL name if previously deregistered */
for (i = 0; i < user_kfilterc ; i++) {
if (user_kfilters[i].name != NULL &&
strcmp(name, user_kfilters[i].name) == 0)
return &user_kfilters[i];
}
return NULL;
}
static struct kfilter *
kfilter_byname(const char *name)
{
struct kfilter *kfilter;
KASSERT(rw_lock_held(&kqueue_filter_lock));
if ((kfilter = kfilter_byname_sys(name)) != NULL)
return kfilter;
return kfilter_byname_user(name);
}
/*
* Find kfilter entry by filter id, or NULL if not found.
* Assumes entries are indexed in filter id order, for speed.
*/
static struct kfilter *
kfilter_byfilter(uint32_t filter)
{
struct kfilter *kfilter;
KASSERT(rw_lock_held(&kqueue_filter_lock));
if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
kfilter = &sys_kfilters[filter];
else if (user_kfilters != NULL &&
filter < EVFILT_SYSCOUNT + user_kfilterc)
/* it's a user filter */
kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
else
return (NULL); /* out of range */
KASSERT(kfilter->filter == filter); /* sanity check! */
return (kfilter);
}
/*
* Register a new kfilter. Stores the entry in user_kfilters.
* Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
* If retfilter != NULL, the new filterid is returned in it.
*/
int
kfilter_register(const char *name, const struct filterops *filtops,
int *retfilter)
{
struct kfilter *kfilter;
size_t len;
int i;
if (name == NULL || name[0] == '\0' || filtops == NULL)
return (EINVAL); /* invalid args */
rw_enter(&kqueue_filter_lock, RW_WRITER);
if (kfilter_byname(name) != NULL) {
rw_exit(&kqueue_filter_lock);
return (EEXIST); /* already exists */
}
if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
rw_exit(&kqueue_filter_lock);
return (EINVAL); /* too many */
}
for (i = 0; i < user_kfilterc; i++) {
kfilter = &user_kfilters[i];
if (kfilter->name == NULL) {
/* Previously deregistered slot. Reuse. */
goto reuse;
}
}
/* check if need to grow user_kfilters */
if (user_kfilterc + 1 > user_kfiltermaxc) {
/* Grow in KFILTER_EXTENT chunks. */
user_kfiltermaxc += KFILTER_EXTENT;
len = user_kfiltermaxc * sizeof(*kfilter);
kfilter = kmem_alloc(len, KM_SLEEP);
memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
if (user_kfilters != NULL) {
memcpy(kfilter, user_kfilters, user_kfiltersz);
kmem_free(user_kfilters, user_kfiltersz);
}
user_kfiltersz = len;
user_kfilters = kfilter;
}
/* Adding new slot */
kfilter = &user_kfilters[user_kfilterc++];
reuse:
kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
if (retfilter != NULL)
*retfilter = kfilter->filter;
rw_exit(&kqueue_filter_lock);
return (0);
}
/*
* Unregister a kfilter previously registered with kfilter_register.
* This retains the filter id, but clears the name and frees filtops (filter
* operations), so that the number isn't reused during a boot.
* Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
*/
int
kfilter_unregister(const char *name)
{
struct kfilter *kfilter;
if (name == NULL || name[0] == '\0')
return (EINVAL); /* invalid name */
rw_enter(&kqueue_filter_lock, RW_WRITER);
if (kfilter_byname_sys(name) != NULL) {
rw_exit(&kqueue_filter_lock);
return (EINVAL); /* can't detach system filters */
}
kfilter = kfilter_byname_user(name);
if (kfilter == NULL) {
rw_exit(&kqueue_filter_lock);
return (ENOENT);
}
if (kfilter->refcnt != 0) {
rw_exit(&kqueue_filter_lock);
return (EBUSY);
}
/* Cast away const (but we know it's safe). */
kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
kfilter->name = NULL; /* mark as `not implemented' */
if (kfilter->filtops != NULL) {
/* Cast away const (but we know it's safe). */
kmem_free(__UNCONST(kfilter->filtops),
sizeof(*kfilter->filtops));
kfilter->filtops = NULL; /* mark as `not implemented' */
}
rw_exit(&kqueue_filter_lock);
return (0);
}
/*
* Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
* descriptors. Calls fileops kqfilter method for given file descriptor.
*/
static int
filt_fileattach(struct knote *kn)
{
file_t *fp;
fp = kn->kn_obj;
return (*fp->f_ops->fo_kqfilter)(fp, kn);
}
/*
* Filter detach method for EVFILT_READ on kqueue descriptor.
*/
static void
filt_kqdetach(struct knote *kn)
{
struct kqueue *kq;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
mutex_spin_enter(&kq->kq_lock);
selremove_knote(&kq->kq_sel, kn);
mutex_spin_exit(&kq->kq_lock);
}
/*
* Filter event method for EVFILT_READ on kqueue descriptor.
*/
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
struct kqueue *kq;
int rv;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
if (hint != NOTE_SUBMIT)
mutex_spin_enter(&kq->kq_lock);
kn->kn_data = KQ_COUNT(kq);
rv = (kn->kn_data > 0);
if (hint != NOTE_SUBMIT)
mutex_spin_exit(&kq->kq_lock);
return rv;
}
/*
* Filter attach method for EVFILT_PROC.
*/
static int
filt_procattach(struct knote *kn)
{
struct proc *p;
mutex_enter(&proc_lock);
p = proc_find(kn->kn_id);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
/*
* Fail if it's not owned by you, or the last exec gave us
* setuid/setgid privs (unless you're root).
*/
mutex_enter(p->p_lock);
mutex_exit(&proc_lock);
if (kauth_authorize_process(curlwp->l_cred,
KAUTH_PROCESS_KEVENT_FILTER, p, NULL, NULL, NULL) != 0) {
mutex_exit(p->p_lock);
return EACCES;
}
kn->kn_obj = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
/*
* NOTE_CHILD is only ever generated internally; don't let it
* leak in from user-space. See knote_proc_fork_track().
*/
kn->kn_sfflags &= ~NOTE_CHILD;
klist_insert(&p->p_klist, kn);
mutex_exit(p->p_lock);
return 0;
}
/*
* Filter detach method for EVFILT_PROC.
*
* The knote may be attached to a different process, which may exit,
* leaving nothing for the knote to be attached to. So when the process
* exits, the knote is marked as DETACHED and also flagged as ONESHOT so
* it will be deleted when read out. However, as part of the knote deletion,
* this routine is called, so a check is needed to avoid actually performing
* a detach, because the original process might not exist any more.
*/
static void
filt_procdetach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
struct proc *p;
/*
* We have to synchronize with knote_proc_exit(), but we
* are forced to acquire the locks in the wrong order here
* because we can't be sure kn->kn_obj is valid unless
* KN_DETACHED is not set.
*/
again:
mutex_spin_enter(&kq->kq_lock);
if ((kn->kn_status & KN_DETACHED) == 0) {
p = kn->kn_obj;
if (!mutex_tryenter(p->p_lock)) {
mutex_spin_exit(&kq->kq_lock);
preempt_point();
goto again;
}
kn->kn_status |= KN_DETACHED;
klist_remove(&p->p_klist, kn);
mutex_exit(p->p_lock);
}
mutex_spin_exit(&kq->kq_lock);
}
/*
* Filter event method for EVFILT_PROC.
*
* Due to some of the complexities of process locking, we have special
* entry points for delivering knote submissions. filt_proc() is used
* only to check for activation from kqueue_register() and kqueue_scan().
*/
static int
filt_proc(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
uint32_t fflags;
/*
* Because we share the same klist with signal knotes, just
* ensure that we're not being invoked for the proc-related
* submissions.
*/
KASSERT((hint & (NOTE_EXEC | NOTE_EXIT | NOTE_FORK)) == 0);
mutex_spin_enter(&kq->kq_lock);
fflags = kn->kn_fflags;
mutex_spin_exit(&kq->kq_lock);
return fflags != 0;
}
void
knote_proc_exec(struct proc *p)
{
struct knote *kn, *tmpkn;
struct kqueue *kq;
uint32_t fflags;
mutex_enter(p->p_lock);
SLIST_FOREACH_SAFE(kn, &p->p_klist, kn_selnext, tmpkn) {
/* N.B. EVFILT_SIGNAL knotes are on this same list. */
if (kn->kn_fop == &sig_filtops) {
continue;
}
KASSERT(kn->kn_fop == &proc_filtops);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
fflags = (kn->kn_fflags |= (kn->kn_sfflags & NOTE_EXEC));
if (fflags) {
knote_activate_locked(kn);
}
mutex_spin_exit(&kq->kq_lock);
}
mutex_exit(p->p_lock);
}
static int __noinline
knote_proc_fork_track(struct proc *p1, struct proc *p2, struct knote *okn)
{
struct kqueue *kq = okn->kn_kq;
KASSERT(mutex_owned(&kq->kq_lock));
KASSERT(mutex_owned(p1->p_lock));
/*
* We're going to put this knote into flux while we drop
* the locks and create and attach a new knote to track the
* child. If we are not able to enter flux, then this knote
* is about to go away, so skip the notification.
*/
if (!kn_enter_flux(okn)) {
return 0;
}
mutex_spin_exit(&kq->kq_lock);
mutex_exit(p1->p_lock);
/*
* We actually have to register *two* new knotes:
*
* ==> One for the NOTE_CHILD notification. This is a forced
* ONESHOT note.
*
* ==> One to actually track the child process as it subsequently
* forks, execs, and, ultimately, exits.
*
* If we only register a single knote, then it's possible for
* the NOTE_CHILD and NOTE_EXIT to be collapsed into a single
* notification if the child exits before the tracking process
* has received the NOTE_CHILD notification, which applications
* aren't expecting (the event's 'data' field would be clobbered,
* for example).
*
* To do this, what we have here is an **extremely** stripped-down
* version of kqueue_register() that has the following properties:
*
* ==> Does not block to allocate memory. If we are unable
* to allocate memory, we return ENOMEM.
*
* ==> Does not search for existing knotes; we know there
* are not any because this is a new process that isn't
* even visible to other processes yet.
*
* ==> Assumes that the knhash for our kq's descriptor table
* already exists (after all, we're already tracking
* processes with knotes if we got here).
*
* ==> Directly attaches the new tracking knote to the child
* process.
*
* The whole point is to do the minimum amount of work while the
* knote is held in-flux, and to avoid doing extra work in general
* (we already have the new child process; why bother looking it
* up again?).
*/
filedesc_t *fdp = kq->kq_fdp;
struct knote *knchild, *kntrack;
int error = 0;
knchild = knote_alloc(false);
kntrack = knote_alloc(false);
if (__predict_false(knchild == NULL || kntrack == NULL)) {
error = ENOMEM;
goto out;
}
kntrack->kn_obj = p2;
kntrack->kn_id = p2->p_pid;
kntrack->kn_kq = kq;
kntrack->kn_fop = okn->kn_fop;
kntrack->kn_kfilter = okn->kn_kfilter;
kntrack->kn_sfflags = okn->kn_sfflags;
kntrack->kn_sdata = p1->p_pid;
kntrack->kn_kevent.ident = p2->p_pid;
kntrack->kn_kevent.filter = okn->kn_filter;
kntrack->kn_kevent.flags =
okn->kn_flags | EV_ADD | EV_ENABLE | EV_CLEAR;
kntrack->kn_kevent.fflags = 0;
kntrack->kn_kevent.data = 0;
kntrack->kn_kevent.udata = okn->kn_kevent.udata; /* preserve udata */
/*
* The child note does not need to be attached to the
* new proc's klist at all.
*/
*knchild = *kntrack;
knchild->kn_status = KN_DETACHED;
knchild->kn_sfflags = 0;
knchild->kn_kevent.flags |= EV_ONESHOT;
knchild->kn_kevent.fflags = NOTE_CHILD;
knchild->kn_kevent.data = p1->p_pid; /* parent */
mutex_enter(&fdp->fd_lock);
/*
* We need to check to see if the kq is closing, and skip
* attaching the knote if so. Normally, this isn't necessary
* when coming in the front door because the file descriptor
* layer will synchronize this.
*
* It's safe to test KQ_CLOSING without taking the kq_lock
* here because that flag is only ever set when the fd_lock
* is also held.
*/
if (__predict_false(kq->kq_count & KQ_CLOSING)) {
mutex_exit(&fdp->fd_lock);
goto out;
}
/*
* We do the "insert into FD table" and "attach to klist" steps
* in the opposite order of kqueue_register() here to avoid
* having to take p2->p_lock twice. But this is OK because we
* hold fd_lock across the entire operation.
*/
mutex_enter(p2->p_lock);
error = kauth_authorize_process(curlwp->l_cred,
KAUTH_PROCESS_KEVENT_FILTER, p2, NULL, NULL, NULL);
if (__predict_false(error != 0)) {
mutex_exit(p2->p_lock);
mutex_exit(&fdp->fd_lock);
error = EACCES;
goto out;
}
klist_insert(&p2->p_klist, kntrack);
mutex_exit(p2->p_lock);
KASSERT(fdp->fd_knhashmask != 0);
KASSERT(fdp->fd_knhash != NULL);
struct klist *list = &fdp->fd_knhash[KN_HASH(kntrack->kn_id,
fdp->fd_knhashmask)];
SLIST_INSERT_HEAD(list, kntrack, kn_link);
SLIST_INSERT_HEAD(list, knchild, kn_link);
/* This adds references for knchild *and* kntrack. */
atomic_add_int(&kntrack->kn_kfilter->refcnt, 2);
knote_activate(knchild);
kntrack = NULL;
knchild = NULL;
mutex_exit(&fdp->fd_lock);
out:
if (__predict_false(knchild != NULL)) {
knote_free(knchild);
}
if (__predict_false(kntrack != NULL)) {
knote_free(kntrack);
}
mutex_enter(p1->p_lock);
mutex_spin_enter(&kq->kq_lock);
if (kn_leave_flux(okn)) {
KQ_FLUX_WAKEUP(kq);
}
return error;
}
void
knote_proc_fork(struct proc *p1, struct proc *p2)
{
struct knote *kn;
struct kqueue *kq;
uint32_t fflags;
mutex_enter(p1->p_lock);
/*
* N.B. We DO NOT use SLIST_FOREACH_SAFE() here because we
* don't want to pre-fetch the next knote; in the event we
* have to drop p_lock, we will have put the knote in-flux,
* meaning that no one will be able to detach it until we
* have taken the knote out of flux. However, that does
* NOT stop someone else from detaching the next note in the
* list while we have it unlocked. Thus, we want to fetch
* the next note in the list only after we have re-acquired
* the lock, and using SLIST_FOREACH() will satisfy that.
*/
SLIST_FOREACH(kn, &p1->p_klist, kn_selnext) {
/* N.B. EVFILT_SIGNAL knotes are on this same list. */
if (kn->kn_fop == &sig_filtops) {
continue;
}
KASSERT(kn->kn_fop == &proc_filtops);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_fflags |= (kn->kn_sfflags & NOTE_FORK);
if (__predict_false(kn->kn_sfflags & NOTE_TRACK)) {
/*
* This will drop kq_lock and p_lock and
* re-acquire them before it returns.
*/
if (knote_proc_fork_track(p1, p2, kn)) {
kn->kn_fflags |= NOTE_TRACKERR;
}
KASSERT(mutex_owned(p1->p_lock));
KASSERT(mutex_owned(&kq->kq_lock));
}
fflags = kn->kn_fflags;
if (fflags) {
knote_activate_locked(kn);
}
mutex_spin_exit(&kq->kq_lock);
}
mutex_exit(p1->p_lock);
}
void
knote_proc_exit(struct proc *p)
{
struct knote *kn;
struct kqueue *kq;
KASSERT(mutex_owned(p->p_lock));
while (!SLIST_EMPTY(&p->p_klist)) {
kn = SLIST_FIRST(&p->p_klist);
kq = kn->kn_kq;
KASSERT(kn->kn_obj == p);
mutex_spin_enter(&kq->kq_lock);
kn->kn_data = P_WAITSTATUS(p);
/*
* Mark as ONESHOT, so that the knote is g/c'ed
* when read.
*/
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_fflags |= kn->kn_sfflags & NOTE_EXIT;
/*
* Detach the knote from the process and mark it as such.
* N.B. EVFILT_SIGNAL are also on p_klist, but by the
* time we get here, all open file descriptors for this
* process have been released, meaning that signal knotes
* will have already been detached.
*
* We need to synchronize this with filt_procdetach().
*/
KASSERT(kn->kn_fop == &proc_filtops);
if ((kn->kn_status & KN_DETACHED) == 0) {
kn->kn_status |= KN_DETACHED;
SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext);
}
/*
* Always activate the knote for NOTE_EXIT regardless
* of whether or not the listener cares about it.
* This matches historical behavior.
*/
knote_activate_locked(kn);
mutex_spin_exit(&kq->kq_lock);
}
}
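/*
* FILT_TIMER_NOSCHED is a sentinel "ticks" value: when stored in kn_sdata
* (or returned by filt_timercompute()) it means the timer callout must not
* be scheduled or rescheduled.
*/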
#define FILT_TIMER_NOSCHED ((uintptr_t)-1)
static int
filt_timercompute(struct kevent *kev, uintptr_t *tticksp)
{
struct timespec ts;
uintptr_t tticks;
if (kev->fflags & ~(NOTE_TIMER_UNITMASK | NOTE_ABSTIME)) {
return EINVAL;
}
/*
* Convert the event 'data' to a timespec, then convert the
* timespec to callout ticks.
*/
switch (kev->fflags & NOTE_TIMER_UNITMASK) {
case NOTE_SECONDS:
ts.tv_sec = kev->data;
ts.tv_nsec = 0;
break;
case NOTE_MSECONDS: /* == historical value 0 */
ts.tv_sec = kev->data / 1000;
ts.tv_nsec = (kev->data % 1000) * 1000000;
break;
case NOTE_USECONDS:
ts.tv_sec = kev->data / 1000000;
ts.tv_nsec = (kev->data % 1000000) * 1000;
break;
case NOTE_NSECONDS:
ts.tv_sec = kev->data / 1000000000;
ts.tv_nsec = kev->data % 1000000000;
break;
default:
return EINVAL;
}
if (kev->fflags & NOTE_ABSTIME) {
struct timespec deadline = ts;
/*
* Get current time.
*
* XXX This is CLOCK_REALTIME. There is no way to
* XXX specify CLOCK_MONOTONIC.
*/
nanotime(&ts);
/* Absolute timers do not repeat. */
kev->data = FILT_TIMER_NOSCHED;
/* If we're past the deadline, then the event will fire. */
if (timespeccmp(&deadline, &ts, <=)) {
tticks = FILT_TIMER_NOSCHED;
goto out;
}
/* Calculate how much time is left. */
timespecsub(&deadline, &ts, &ts);
} else {
/* EV_CLEAR automatically set for relative timers. */
kev->flags |= EV_CLEAR;
}
tticks = tstohz(&ts);
/* if the supplied value is under our resolution, use 1 tick */
if (tticks == 0) {
if (kev->data == 0)
return EINVAL;
tticks = 1;
} else if (tticks > INT_MAX) {
return EINVAL;
}
if ((kev->flags & EV_ONESHOT) != 0) {
/* Timer does not repeat. */
kev->data = FILT_TIMER_NOSCHED;
} else {
KASSERT((uintptr_t)tticks != FILT_TIMER_NOSCHED);
kev->data = tticks;
}
out:
*tticksp = tticks;
return 0;
}
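/*
* Worked example of the conversion above (illustration only; assumes
* hz = 100, i.e. a 10ms tick): a relative EVFILT_TIMER with
* fflags = NOTE_MSECONDS and data = 1500 yields ts = { 1, 500000000 },
* which tstohz() turns into roughly 150 callout ticks. EV_CLEAR is set
* automatically, and kev->data keeps the tick count so filt_timerexpire()
* can reschedule the callout on each expiry.
*/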
static void
filt_timerexpire(void *knx)
{
struct knote *kn = knx;
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_data++;
knote_activate_locked(kn);
if (kn->kn_sdata != FILT_TIMER_NOSCHED) {
KASSERT(kn->kn_sdata > 0);
KASSERT(kn->kn_sdata <= INT_MAX);
callout_schedule((callout_t *)kn->kn_hook,
(int)kn->kn_sdata);
}
mutex_spin_exit(&kq->kq_lock);
}
static inline void
filt_timerstart(struct knote *kn, uintptr_t tticks)
{
callout_t *calloutp = kn->kn_hook;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
KASSERT(!callout_pending(calloutp));
if (__predict_false(tticks == FILT_TIMER_NOSCHED)) {
kn->kn_data = 1;
} else {
KASSERT(tticks <= INT_MAX);
callout_reset(calloutp, (int)tticks, filt_timerexpire, kn);
}
}
static int
filt_timerattach(struct knote *kn)
{
callout_t *calloutp;
struct kqueue *kq;
uintptr_t tticks;
int error;
struct kevent kev = {
.flags = kn->kn_flags,
.fflags = kn->kn_sfflags,
.data = kn->kn_sdata,
};
error = filt_timercompute(&kev, &tticks);
if (error) {
return error;
}
if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
(calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
atomic_dec_uint(&kq_ncallouts);
return ENOMEM;
}
callout_init(calloutp, CALLOUT_MPSAFE);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_sdata = kev.data;
kn->kn_flags = kev.flags;
KASSERT(kn->kn_sfflags == kev.fflags);
kn->kn_hook = calloutp;
filt_timerstart(kn, tticks);
mutex_spin_exit(&kq->kq_lock);
return (0);
}
static void
filt_timerdetach(struct knote *kn)
{
callout_t *calloutp;
struct kqueue *kq = kn->kn_kq;
/* prevent rescheduling when we expire */
mutex_spin_enter(&kq->kq_lock);
kn->kn_sdata = FILT_TIMER_NOSCHED;
mutex_spin_exit(&kq->kq_lock);
calloutp = (callout_t *)kn->kn_hook;
/*
* Attempt to stop the callout. This will block if it's
* already running.
*/
callout_halt(calloutp, NULL);
callout_destroy(calloutp);
kmem_free(calloutp, sizeof(*calloutp));
atomic_dec_uint(&kq_ncallouts);
}
static int
filt_timertouch(struct knote *kn, struct kevent *kev, long type)
{
struct kqueue *kq = kn->kn_kq;
callout_t *calloutp;
uintptr_t tticks;
int error;
KASSERT(mutex_owned(&kq->kq_lock));
switch (type) {
case EVENT_REGISTER:
/* Only relevant for EV_ADD. */
if ((kev->flags & EV_ADD) == 0) {
return 0;
}
/*
* Stop the timer, under the assumption that if
* an application is re-configuring the timer,
* they no longer care about the old one. We
* can safely drop the kq_lock while we wait
* because fdp->fd_lock will be held throughout,
* ensuring that no one can sneak in with an
* EV_DELETE or close the kq.
*/
KASSERT(mutex_owned(&kq->kq_fdp->fd_lock));
calloutp = kn->kn_hook;
callout_halt(calloutp, &kq->kq_lock);
KASSERT(mutex_owned(&kq->kq_lock));
knote_deactivate_locked(kn);
kn->kn_data = 0;
error = filt_timercompute(kev, &tticks);
if (error) {
return error;
}
kn->kn_sdata = kev->data;
kn->kn_flags = kev->flags;
kn->kn_sfflags = kev->fflags;
filt_timerstart(kn, tticks);
break;
case EVENT_PROCESS:
*kev = kn->kn_kevent;
break;
default:
panic("%s: invalid type (%ld)", __func__, type);
}
return 0;
}
static int
filt_timer(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
int rv;
mutex_spin_enter(&kq->kq_lock);
rv = (kn->kn_data != 0);
mutex_spin_exit(&kq->kq_lock);
return rv;
}
static int
filt_userattach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
mutex_spin_enter(&kq->kq_lock);
kn->kn_hook = NULL;
if (kn->kn_fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
else
kn->kn_hookid = 0;
mutex_spin_exit(&kq->kq_lock);
return (0);
}
static void
filt_userdetach(struct knote *kn)
{
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
}
static int
filt_user(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
int hookid;
mutex_spin_enter(&kq->kq_lock);
hookid = kn->kn_hookid;
mutex_spin_exit(&kq->kq_lock);
return hookid;
}
static int
filt_usertouch(struct knote *kn, struct kevent *kev, long type)
{
int ffctrl;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
switch (type) {
case EVENT_REGISTER:
if (kev->fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
ffctrl = kev->fflags & NOTE_FFCTRLMASK;
kev->fflags &= NOTE_FFLAGSMASK;
switch (ffctrl) {
case NOTE_FFNOP:
break;
case NOTE_FFAND:
kn->kn_sfflags &= kev->fflags;
break;
case NOTE_FFOR:
kn->kn_sfflags |= kev->fflags;
break;
case NOTE_FFCOPY:
kn->kn_sfflags = kev->fflags;
break;
default:
/* XXX Return error? */
break;
}
kn->kn_sdata = kev->data;
if (kev->flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
case EVENT_PROCESS:
*kev = kn->kn_kevent;
kev->fflags = kn->kn_sfflags;
kev->data = kn->kn_sdata;
if (kn->kn_flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
default:
panic("filt_usertouch() - invalid type (%ld)", type);
break;
}
return 0;
}
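/*
* Illustrative userland use of EVFILT_USER (a sketch, not part of this
* file): one thread registers the event and waits on the kqueue, another
* triggers it.
*
*	struct kevent kev;
*	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
*	kevent(kq, &kev, 1, NULL, 0, NULL);		register
*	...
*	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
*	kevent(kq, &kev, 1, NULL, 0, NULL);		trigger; waiters wake
*/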
/*
* filt_seltrue:
*
* This filter "event" routine simulates seltrue().
*/
int
filt_seltrue(struct knote *kn, long hint)
{
/*
* We don't know how much data can be read/written,
* but we know that it *can* be. This is about as
* good as select/poll does as well.
*/
kn->kn_data = 0;
return (1);
}
/*
* This provides full kqfilter entry for device switch tables, which
* has same effect as filter using filt_seltrue() as filter method.
*/
static void
filt_seltruedetach(struct knote *kn)
{
/* Nothing to do */
}
const struct filterops seltrue_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_seltruedetach,
.f_event = filt_seltrue,
};
int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
switch (kn->kn_filter) {
case EVFILT_READ:
case EVFILT_WRITE:
kn->kn_fop = &seltrue_filtops;
break;
default:
return (EINVAL);
}
/* Nothing more to do */
return (0);
}
/*
* kqueue(2) system call.
*/
static int
kqueue1(struct lwp *l, int flags, register_t *retval)
{
struct kqueue *kq;
file_t *fp;
int fd, error;
if ((error = fd_allocfile(&fp, &fd)) != 0)
return error;
fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
fp->f_type = DTYPE_KQUEUE;
fp->f_ops = &kqueueops;
kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
cv_init(&kq->kq_cv, "kqueue");
selinit(&kq->kq_sel);
TAILQ_INIT(&kq->kq_head);
fp->f_kqueue = kq;
*retval = fd;
kq->kq_fdp = curlwp->l_fd;
fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
fd_affix(curproc, fp, fd);
return error;
}
/*
* kqueue(2) system call.
*/
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
return kqueue1(l, 0, retval);
}
int
sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
register_t *retval)
{
/* {
syscallarg(int) flags;
} */
return kqueue1(l, SCARG(uap, flags), retval);
}
/*
* kevent(2) system call.
*/
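/*
* Native copy helpers hooked up via struct kevent_ops below: fetch a block
* of n changes from the user's changelist starting at 'index', and copy a
* block of n results out to the user's eventlist starting at 'index'.
*/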
int
kevent_fetch_changes(void *ctx, const struct kevent *changelist,
struct kevent *changes, size_t index, int n)
{
return copyin(changelist + index, changes, n * sizeof(*changes));
}
int
kevent_put_events(void *ctx, struct kevent *events,
struct kevent *eventlist, size_t index, int n)
{
return copyout(events, eventlist + index, n * sizeof(*events));
}
static const struct kevent_ops kevent_native_ops = {
.keo_private = NULL,
.keo_fetch_timeout = copyin,
.keo_fetch_changes = kevent_fetch_changes,
.keo_put_events = kevent_put_events,
};
int
sys___kevent100(struct lwp *l, const struct sys___kevent100_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct kevent *) changelist;
syscallarg(size_t) nchanges;
syscallarg(struct kevent *) eventlist;
syscallarg(size_t) nevents;
syscallarg(const struct timespec *) timeout;
} */
return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
SCARG(uap, timeout), &kevent_native_ops);
}
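/*
* Illustrative userland call sequence for the system calls above (a sketch
* only; "fd" stands for an arbitrary descriptor being monitored):
*
*	struct kevent ev;
*	int kq = kqueue();
*	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
*	kevent(kq, &ev, 1, NULL, 0, NULL);		register only
*	int n = kevent(kq, NULL, 0, &ev, 1, NULL);	wait for one event
*/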
int
kevent1(register_t *retval, int fd,
const struct kevent *changelist, size_t nchanges,
struct kevent *eventlist, size_t nevents,
const struct timespec *timeout,
const struct kevent_ops *keops)
{
struct kevent *kevp;
struct kqueue *kq;
struct timespec ts;
size_t i, n, ichange;
int nerrors, error;
struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */
file_t *fp;
/* check that we're dealing with a kq */
fp = fd_getfile(fd);
if (fp == NULL)
return (EBADF);
if (fp->f_type != DTYPE_KQUEUE) {
fd_putfile(fd);
return (EBADF);
}
if (timeout != NULL) {
error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
if (error)
goto done;
timeout = &ts;
}
kq = fp->f_kqueue;
nerrors = 0;
ichange = 0;
/* traverse list of events to register */
while (nchanges > 0) {
n = MIN(nchanges, __arraycount(kevbuf));
error = (*keops->keo_fetch_changes)(keops->keo_private,
changelist, kevbuf, ichange, n);
if (error)
goto done;
for (i = 0; i < n; i++) {
kevp = &kevbuf[i];
kevp->flags &= ~EV_SYSFLAGS;
/* register each knote */
error = kqueue_register(kq, kevp);
if (!error && !(kevp->flags & EV_RECEIPT))
continue;
if (nevents == 0)
goto done;
kevp->flags = EV_ERROR;
kevp->data = error;
error = (*keops->keo_put_events)
(keops->keo_private, kevp,
eventlist, nerrors, 1);
if (error)
goto done;
nevents--;
nerrors++;
}
nchanges -= n; /* update the results */
ichange += n;
}
if (nerrors) {
*retval = nerrors;
error = 0;
goto done;
}
/* actually scan through the events */
error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
kevbuf, __arraycount(kevbuf));
done:
fd_putfile(fd);
return (error);
}
/*
* Register a given kevent kev onto the kqueue
*/
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
struct kfilter *kfilter;
filedesc_t *fdp;
file_t *fp;
fdfile_t *ff;
struct knote *kn, *newkn;
struct klist *list;
int error, fd, rv;
fdp = kq->kq_fdp;
fp = NULL;
kn = NULL;
error = 0;
fd = 0;
newkn = knote_alloc(true);
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byfilter(kev->filter);
if (kfilter == NULL || kfilter->filtops == NULL) {
/* filter not found nor implemented */
rw_exit(&kqueue_filter_lock);
knote_free(newkn);
return (EINVAL);
}
/* search if knote already exists */
if (kfilter->filtops->f_flags & FILTEROP_ISFD) {
/* monitoring a file descriptor */
/* validate descriptor */
if (kev->ident > INT_MAX
|| (fp = fd_getfile(fd = kev->ident)) == NULL) {
rw_exit(&kqueue_filter_lock);
knote_free(newkn);
return EBADF;
}
mutex_enter(&fdp->fd_lock);
ff = fdp->fd_dt->dt_ff[fd];
if (ff->ff_refcnt & FR_CLOSING) {
error = EBADF;
goto doneunlock;
}
if (fd <= fdp->fd_lastkqfile) {
SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
if (kq == kn->kn_kq &&
kev->filter == kn->kn_filter)
break;
}
}
} else {
/*
* not monitoring a file descriptor, so
* lookup knotes in internal hash table
*/
mutex_enter(&fdp->fd_lock);
if (fdp->fd_knhashmask != 0) {
list = &fdp->fd_knhash[
KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
SLIST_FOREACH(kn, list, kn_link) {
if (kev->ident == kn->kn_id &&
kq == kn->kn_kq &&
kev->filter == kn->kn_filter)
break;
}
}
}
/* It's safe to test KQ_CLOSING while holding only the fd_lock. */
KASSERT(mutex_owned(&fdp->fd_lock));
KASSERT((kq->kq_count & KQ_CLOSING) == 0);
/*
* kn now contains the matching knote, or NULL if no match
*/
if (kn == NULL) {
if (kev->flags & EV_ADD) {
/* create new knote */
kn = newkn;
newkn = NULL;
kn->kn_obj = fp;
kn->kn_id = kev->ident;
kn->kn_kq = kq;
kn->kn_fop = kfilter->filtops;
kn->kn_kfilter = kfilter;
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
kev->fflags = 0;
kev->data = 0;
kn->kn_kevent = *kev;
KASSERT(kn->kn_fop != NULL);
/*
* XXX Allow only known-safe users of f_touch.
* XXX See filter_touch() for details.
*/
if (kn->kn_fop->f_touch != NULL &&
kn->kn_fop != &timer_filtops &&
kn->kn_fop != &user_filtops) {
error = ENOTSUP;
goto fail_ev_add;
}
/*
* apply reference count to knote structure, and
* do not release it at the end of this routine.
*/
fp = NULL;
if (!(kn->kn_fop->f_flags & FILTEROP_ISFD)) {
/*
* If knote is not on an fd, store on
* internal hash table.
*/
if (fdp->fd_knhashmask == 0) {
/* XXXAD can block with fd_lock held */
fdp->fd_knhash = hashinit(KN_HASHSIZE,
HASH_LIST, true,
&fdp->fd_knhashmask);
}
list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
fdp->fd_knhashmask)];
} else {
/* Otherwise, knote is on an fd. */
list = (struct klist *)
&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
if ((int)kn->kn_id > fdp->fd_lastkqfile)
fdp->fd_lastkqfile = kn->kn_id;
}
SLIST_INSERT_HEAD(list, kn, kn_link);
/*
* N.B. kn->kn_fop may change as the result
* of filter_attach()!
*/
knote_foplock_enter(kn);
error = filter_attach(kn);
if (error != 0) {
#ifdef DEBUG
struct proc *p = curlwp->l_proc;
const file_t *ft = kn->kn_obj;
printf("%s: %s[%d]: event type %d not "
"supported for file type %d/%s "
"(error %d)\n", __func__,
p->p_comm, p->p_pid,
kn->kn_filter, ft ? ft->f_type : -1,
ft ? ft->f_ops->fo_name : "?", error);
#endif
fail_ev_add:
/*
* N.B. no need to check for this note to
* be in-flux, since it was never visible
* to the monitored object.
*
* knote_detach() drops fdp->fd_lock
*/
knote_foplock_exit(kn);
mutex_enter(&kq->kq_lock);
KNOTE_WILLDETACH(kn);
KASSERT(kn_in_flux(kn) == false);
mutex_exit(&kq->kq_lock);
knote_detach(kn, fdp, false);
goto done;
}
atomic_inc_uint(&kfilter->refcnt);
goto done_ev_add;
} else {
/* No matching knote and the EV_ADD flag is not set. */
error = ENOENT;
goto doneunlock;
}
}
if (kev->flags & EV_DELETE) {
/*
* Let the world know that this knote is about to go
* away, and wait for it to settle if it's currently
* in-flux.
*/
mutex_spin_enter(&kq->kq_lock);
if (kn->kn_status & KN_WILLDETACH) {
/*
* This knote is already on its way out,
* so just be done.
*/
mutex_spin_exit(&kq->kq_lock);
goto doneunlock;
}
KNOTE_WILLDETACH(kn);
if (kn_in_flux(kn)) {
mutex_exit(&fdp->fd_lock);
/*
* It's safe for us to conclusively wait for
* this knote to settle because we know we'll
* be completing the detach.
*/
kn_wait_flux(kn, true);
KASSERT(kn_in_flux(kn) == false);
mutex_spin_exit(&kq->kq_lock);
mutex_enter(&fdp->fd_lock);
} else {
mutex_spin_exit(&kq->kq_lock);
}
/* knote_detach() drops fdp->fd_lock */
knote_detach(kn, fdp, true);
goto done;
}
/*
* The user may change some filter values after the
* initial EV_ADD, but doing so will not reset any
* filters which have already been triggered.
*/
knote_foplock_enter(kn);
kn->kn_kevent.udata = kev->udata;
KASSERT(kn->kn_fop != NULL);
if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
kn->kn_fop->f_touch != NULL) {
mutex_spin_enter(&kq->kq_lock);
error = filter_touch(kn, kev, EVENT_REGISTER);
mutex_spin_exit(&kq->kq_lock);
if (__predict_false(error != 0)) {
/* Never a new knote (which would consume newkn). */
KASSERT(newkn != NULL);
knote_foplock_exit(kn);
goto doneunlock;
}
} else {
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
}
/*
* We can get here if we are trying to attach
* an event to a file descriptor that does not
* support events, and the attach routine is
* broken and does not return an error.
*/
done_ev_add:
rv = filter_event(kn, 0, false);
if (rv)
knote_activate(kn);
knote_foplock_exit(kn);
/* disable knote */
if ((kev->flags & EV_DISABLE)) {
mutex_spin_enter(&kq->kq_lock);
if ((kn->kn_status & KN_DISABLED) == 0)
kn->kn_status |= KN_DISABLED;
mutex_spin_exit(&kq->kq_lock);
}
/* enable knote */
if ((kev->flags & EV_ENABLE)) {
knote_enqueue(kn);
}
doneunlock:
mutex_exit(&fdp->fd_lock);
done:
rw_exit(&kqueue_filter_lock);
if (newkn != NULL)
knote_free(newkn);
if (fp != NULL)
fd_putfile(fd);
return (error);
}
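/*
* KN_FMT: render a knote's kn_status flag bits into 'buf' with snprintb(9)
* for the diagnostic output below.
*/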
#define KN_FMT(buf, kn) \
(snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
#if defined(DDB)
void
kqueue_printit(struct kqueue *kq, bool full, void (*pr)(const char *, ...))
{
const struct knote *kn;
u_int count;
int nmarker;
char buf[128];
count = 0;
nmarker = 0;
(*pr)("kqueue %p (restart=%d count=%u):\n", kq,
!!(kq->kq_count & KQ_RESTART), KQ_COUNT(kq));
(*pr)(" Queued knotes:\n");
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if (kn->kn_status & KN_MARKER) {
nmarker++;
} else {
count++;
}
(*pr)(" knote %p: kq=%p status=%s\n",
kn, kn->kn_kq, KN_FMT(buf, kn));
(*pr)(" id=0x%lx (%lu) filter=%d\n",
(u_long)kn->kn_id, (u_long)kn->kn_id, kn->kn_filter);
if (kn->kn_kq != kq) {
(*pr)(" !!! kn->kn_kq != kq\n");
}
}
if (count != KQ_COUNT(kq)) {
(*pr)(" !!! count(%u) != KQ_COUNT(%u)\n",
count, KQ_COUNT(kq));
}
}
#endif /* DDB */
#if defined(DEBUG)
static void
kqueue_check(const char *func, size_t line, const struct kqueue *kq)
{
const struct knote *kn;
u_int count;
int nmarker;
char buf[128];
KASSERT(mutex_owned(&kq->kq_lock));
count = 0;
nmarker = 0;
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
func, line, kq, kn, KN_FMT(buf, kn));
}
if ((kn->kn_status & KN_MARKER) == 0) {
if (kn->kn_kq != kq) {
panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
func, line, kq, kn, kn->kn_kq,
KN_FMT(buf, kn));
}
if ((kn->kn_status & KN_ACTIVE) == 0) {
panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
func, line, kq, kn, KN_FMT(buf, kn));
}
count++;
if (count > KQ_COUNT(kq)) {
panic("%s,%zu: kq=%p kq->kq_count(%u) != "
"count(%d), nmarker=%d",
func, line, kq, KQ_COUNT(kq), count,
nmarker);
}
} else {
nmarker++;
}
}
}
#define kq_check(a) kqueue_check(__func__, __LINE__, (a))
#else /* defined(DEBUG) */
#define kq_check(a) /* nothing */
#endif /* defined(DEBUG) */
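/*
* Force threads sleeping in kqueue_scan() to wake up and return ERESTART
* so that they drop their reference to the file.
*/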
static void
kqueue_restart(file_t *fp)
{
struct kqueue *kq = fp->f_kqueue;
KASSERT(kq != NULL);
mutex_spin_enter(&kq->kq_lock);
kq->kq_count |= KQ_RESTART;
cv_broadcast(&kq->kq_cv);
mutex_spin_exit(&kq->kq_lock);
}
static int
kqueue_fpathconf(struct file *fp, int name, register_t *retval)
{
return EINVAL;
}
/*
* Scan through the list of events on fp (for a maximum of maxevents),
* returning the results to ulistp. Timeout is determined by tsp: if
* NULL, wait indefinitely; if zero-valued, perform a poll; otherwise wait
* as appropriate.
*/
static int
kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
const struct timespec *tsp, register_t *retval,
const struct kevent_ops *keops, struct kevent *kevbuf,
size_t kevcnt)
{
struct kqueue *kq;
struct kevent *kevp;
struct timespec ats, sleepts;
struct knote *kn, *marker;
struct knote_impl morker;
size_t count, nkev, nevents;
int timeout, error, touch, rv, influx;
filedesc_t *fdp;
fdp = curlwp->l_fd;
kq = fp->f_kqueue;
count = maxevents;
nkev = nevents = error = 0;
if (count == 0) {
*retval = 0;
return 0;
}
if (tsp) { /* timeout supplied */
ats = *tsp;
if (inittimeleft(&ats, &sleepts) == -1) {
*retval = maxevents;
return EINVAL;
}
timeout = tstohz(&ats);
if (timeout <= 0)
timeout = -1; /* do poll */
} else {
/* no timeout, wait forever */
timeout = 0;
}
memset(&morker, 0, sizeof(morker));
marker = &morker.ki_knote;
marker->kn_kq = kq;
marker->kn_status = KN_MARKER;
mutex_spin_enter(&kq->kq_lock);
retry:
kevp = kevbuf;
if (KQ_COUNT(kq) == 0) {
if (timeout >= 0) {
error = cv_timedwait_sig(&kq->kq_cv,
&kq->kq_lock, timeout);
if (error == 0) {
if (KQ_COUNT(kq) == 0 &&
(kq->kq_count & KQ_RESTART)) {
/* return to clear file reference */
error = ERESTART;
} else if (tsp == NULL || (timeout =
gettimeleft(&ats, &sleepts)) > 0) {
goto retry;
}
} else {
/* don't restart after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
}
}
mutex_spin_exit(&kq->kq_lock);
goto done;
}
/* mark end of knote list */
TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
influx = 0;
/*
* Acquire the fdp->fd_lock interlock to avoid races with
* file creation/destruction from other threads.
*/
mutex_spin_exit(&kq->kq_lock);
relock:
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
while (count != 0) {
/*
* Get next knote. We are guaranteed this will never
* be NULL because of the marker we inserted above.
*/
kn = TAILQ_FIRST(&kq->kq_head);
bool kn_is_other_marker =
(kn->kn_status & KN_MARKER) != 0 && kn != marker;
bool kn_is_detaching = (kn->kn_status & KN_WILLDETACH) != 0;
bool kn_is_in_flux = kn_in_flux(kn);
/*
* If we found a marker that's not ours, or this knote
* is in a state of flux, then wait for everything to
* settle down and go around again.
*/
if (kn_is_other_marker || kn_is_detaching || kn_is_in_flux) {
if (influx) {
influx = 0;
KQ_FLUX_WAKEUP(kq);
}
mutex_exit(&fdp->fd_lock);
if (kn_is_other_marker || kn_is_in_flux) {
KQ_FLUX_WAIT(kq);
mutex_spin_exit(&kq->kq_lock);
} else {
/*
* Detaching but not in-flux? Someone is
* actively trying to finish the job; just
* go around and try again.
*/
KASSERT(kn_is_detaching);
mutex_spin_exit(&kq->kq_lock);
preempt_point();
}
goto relock;
}
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
if (kn == marker) {
/* it's our marker, stop */
KQ_FLUX_WAKEUP(kq);
if (count == maxevents) {
mutex_exit(&fdp->fd_lock);
goto retry;
}
break;
}
KASSERT((kn->kn_status & KN_BUSY) == 0);
kq_check(kq);
kn->kn_status &= ~KN_QUEUED;
kn->kn_status |= KN_BUSY;
kq_check(kq);
if (kn->kn_status & KN_DISABLED) {
kn->kn_status &= ~KN_BUSY;
kq->kq_count--;
/* don't want disabled events */
continue;
}
if ((kn->kn_flags & EV_ONESHOT) == 0) {
mutex_spin_exit(&kq->kq_lock);
KASSERT(mutex_owned(&fdp->fd_lock));
knote_foplock_enter(kn);
rv = filter_event(kn, 0, false);
knote_foplock_exit(kn);
mutex_spin_enter(&kq->kq_lock);
/* Re-poll if note was re-enqueued. */
if ((kn->kn_status & KN_QUEUED) != 0) {
kn->kn_status &= ~KN_BUSY;
/* Re-enqueue raised kq_count, lower it again */
kq->kq_count--;
influx = 1;
continue;
}
if (rv == 0) {
/*
* non-ONESHOT event that hasn't triggered
* again, so it will remain de-queued.
*/
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
influx = 1;
continue;
}
} else {
/*
* Must NOT drop kq_lock until we can do
* the KNOTE_WILLDETACH() below.
*/
}
KASSERT(kn->kn_fop != NULL);
touch = (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
kn->kn_fop->f_touch != NULL);
/* XXXAD should be got from f_event if !oneshot. */
KASSERT((kn->kn_status & KN_WILLDETACH) == 0);
if (touch) {
(void)filter_touch(kn, kevp, EVENT_PROCESS);
} else {
*kevp = kn->kn_kevent;
}
kevp++;
nkev++;
influx = 1;
if (kn->kn_flags & EV_ONESHOT) {
/* delete ONESHOT events after retrieval */
KNOTE_WILLDETACH(kn);
kn->kn_status &= ~KN_BUSY;
kq->kq_count--;
KASSERT(kn_in_flux(kn) == false);
KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
KASSERT(kn->kn_kevent.udata == curlwp);
mutex_spin_exit(&kq->kq_lock);
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
} else if (kn->kn_flags & EV_CLEAR) {
/* clear state after retrieval */
kn->kn_data = 0;
kn->kn_fflags = 0;
/*
* Manually clear knotes who weren't
* 'touch'ed.
*/
if (touch == 0) {
kn->kn_data = 0;
kn->kn_fflags = 0;
}
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
} else if (kn->kn_flags & EV_DISPATCH) {
kn->kn_status |= KN_DISABLED;
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
} else {
/* add event back on list */
kq_check(kq);
kn->kn_status |= KN_QUEUED;
kn->kn_status &= ~KN_BUSY;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kq_check(kq);
}
if (nkev == kevcnt) {
/* do copyouts in kevcnt chunks */
influx = 0;
KQ_FLUX_WAKEUP(kq);
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
error = (*keops->keo_put_events)
(keops->keo_private,
kevbuf, ulistp, nevents, nkev);
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
nevents += nkev;
nkev = 0;
kevp = kevbuf;
}
count--;
if (error != 0 || count == 0) {
/* remove marker */
TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
break;
}
}
KQ_FLUX_WAKEUP(kq);
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
done:
if (nkev != 0) {
/* copyout remaining events */
error = (*keops->keo_put_events)(keops->keo_private,
kevbuf, ulistp, nevents, nkev);
}
*retval = maxevents - count;
return error;
}
/*
* fileops ioctl method for a kqueue descriptor.
*
* Two ioctls are currently supported. They both use struct kfilter_mapping:
* KFILTER_BYFILTER find name for filter, and return result in
* name, which is of size len.
* KFILTER_BYNAME find filter for name. len is ignored.
*/
/*ARGSUSED*/
static int
kqueue_ioctl(file_t *fp, u_long com, void *data)
{
struct kfilter_mapping *km;
const struct kfilter *kfilter;
char *name;
int error;
km = data;
error = 0;
name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
switch (com) {
case KFILTER_BYFILTER: /* convert filter -> name */
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byfilter(km->filter);
if (kfilter != NULL) {
strlcpy(name, kfilter->name, KFILTER_MAXNAME);
rw_exit(&kqueue_filter_lock);
error = copyoutstr(name, km->name, km->len, NULL);
} else {
rw_exit(&kqueue_filter_lock);
error = ENOENT;
}
break;
case KFILTER_BYNAME: /* convert name -> filter */
error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
if (error) {
break;
}
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byname(name);
if (kfilter != NULL)
km->filter = kfilter->filter;
else
error = ENOENT;
rw_exit(&kqueue_filter_lock);
break;
default:
error = ENOTTY;
break;
}
kmem_free(name, KFILTER_MAXNAME);
return (error);
}
/*
* fileops fcntl method for a kqueue descriptor.
*/
static int
kqueue_fcntl(file_t *fp, u_int com, void *data)
{
return (ENOTTY);
}
/*
* fileops poll method for a kqueue descriptor.
* Determine if kqueue has events pending.
*/
static int
kqueue_poll(file_t *fp, int events)
{
struct kqueue *kq;
int revents;
kq = fp->f_kqueue;
revents = 0;
if (events & (POLLIN | POLLRDNORM)) {
mutex_spin_enter(&kq->kq_lock);
if (KQ_COUNT(kq) != 0) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
selrecord(curlwp, &kq->kq_sel);
}
kq_check(kq);
mutex_spin_exit(&kq->kq_lock);
}
return revents;
}
/*
* fileops stat method for a kqueue descriptor.
* Returns dummy info, with st_size being number of events pending.
*/
static int
kqueue_stat(file_t *fp, struct stat *st)
{
struct kqueue *kq;
kq = fp->f_kqueue;
memset(st, 0, sizeof(*st));
st->st_size = KQ_COUNT(kq);
st->st_blksize = sizeof(struct kevent);
st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
st->st_blocks = 1;
st->st_uid = kauth_cred_geteuid(fp->f_cred);
st->st_gid = kauth_cred_getegid(fp->f_cred);
return 0;
}
static void
kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
{
struct knote *kn;
filedesc_t *fdp;
fdp = kq->kq_fdp;
KASSERT(mutex_owned(&fdp->fd_lock));
again:
for (kn = SLIST_FIRST(list); kn != NULL;) {
if (kq != kn->kn_kq) {
kn = SLIST_NEXT(kn, kn_link);
continue;
}
if (knote_detach_quiesce(kn)) {
mutex_enter(&fdp->fd_lock);
goto again;
}
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
kn = SLIST_FIRST(list);
}
}
/*
* fileops close method for a kqueue descriptor.
*/
static int
kqueue_close(file_t *fp)
{
struct kqueue *kq;
filedesc_t *fdp;
fdfile_t *ff;
int i;
kq = fp->f_kqueue;
fp->f_kqueue = NULL;
fp->f_type = 0;
fdp = curlwp->l_fd;
KASSERT(kq->kq_fdp == fdp);
mutex_enter(&fdp->fd_lock);
/*
* We're going to drop the fd_lock multiple times while
* we detach knotes. During this time, attempts to register
* knotes via the back door (e.g. knote_proc_fork_track())
* need to fail, lest they sneak in to attach a knote after
* we've already drained the list it's destined for.
*
* We must acquire kq_lock here to set KQ_CLOSING (to serialize
* with other code paths that modify kq_count without holding
* the fd_lock), but once this bit is set, it's only safe to
* test it while holding the fd_lock, and holding kq_lock while
* doing so is not necessary.
*/
mutex_enter(&kq->kq_lock);
kq->kq_count |= KQ_CLOSING;
mutex_exit(&kq->kq_lock);
for (i = 0; i <= fdp->fd_lastkqfile; i++) {
if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
continue;
kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
}
if (fdp->fd_knhashmask != 0) {
for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
}
}
mutex_exit(&fdp->fd_lock);
#if defined(DEBUG)
mutex_enter(&kq->kq_lock);
kq_check(kq);
mutex_exit(&kq->kq_lock);
#endif /* DEBUG */
KASSERT(TAILQ_EMPTY(&kq->kq_head));
KASSERT(KQ_COUNT(kq) == 0);
mutex_destroy(&kq->kq_lock);
cv_destroy(&kq->kq_cv);
seldestroy(&kq->kq_sel);
kmem_free(kq, sizeof(*kq));
return (0);
}
/*
* struct fileops kqfilter method for a kqueue descriptor.
* Event triggered when monitored kqueue changes.
*/
static int
kqueue_kqfilter(file_t *fp, struct knote *kn)
{
struct kqueue *kq;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
KASSERT(fp == kn->kn_obj);
if (kn->kn_filter != EVFILT_READ)
return EINVAL;
kn->kn_fop = &kqread_filtops;
mutex_enter(&kq->kq_lock);
selrecord_knote(&kq->kq_sel, kn);
mutex_exit(&kq->kq_lock);
return 0;
}
/*
* Walk down a list of knotes, activating them if their event has
* triggered. The caller's object lock (e.g. device driver lock)
* must be held.
*/
void
knote(struct klist *list, long hint)
{
struct knote *kn, *tmpkn;
SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
/*
* We assume here that the backing object's lock is
* already held if we're traversing the klist, and
* so acquiring the knote foplock would create a
* deadlock scenario. But we also know that the klist
* won't disappear on us while we're here, so not
* acquiring it is safe.
*/
if (filter_event(kn, hint, true)) {
knote_activate(kn);
}
}
}
/*
* Remove all knotes referencing a specified fd
*/
void
knote_fdclose(int fd)
{
struct klist *list;
struct knote *kn;
filedesc_t *fdp;
again:
fdp = curlwp->l_fd;
mutex_enter(&fdp->fd_lock);
list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
while ((kn = SLIST_FIRST(list)) != NULL) {
if (knote_detach_quiesce(kn)) {
goto again;
}
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
}
mutex_exit(&fdp->fd_lock);
}
/*
* Drop knote. Called with fdp->fd_lock held, and will drop it before
* returning.
*/
static void
knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
{
struct klist *list;
struct kqueue *kq;
kq = kn->kn_kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
KASSERT(kn->kn_fop != NULL);
KASSERT(mutex_owned(&fdp->fd_lock));
/* Remove from monitored object. */
if (dofop) {
knote_foplock_enter(kn);
filter_detach(kn);
knote_foplock_exit(kn);
}
/* Remove from descriptor table. */
if (kn->kn_fop->f_flags & FILTEROP_ISFD)
list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
else
list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
SLIST_REMOVE(list, kn, knote, kn_link);
/* Remove from kqueue. */
again:
mutex_spin_enter(&kq->kq_lock);
KASSERT(kn_in_flux(kn) == false);
if ((kn->kn_status & KN_QUEUED) != 0) {
kq_check(kq);
KASSERT(KQ_COUNT(kq) != 0);
kq->kq_count--;
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq_check(kq);
} else if (kn->kn_status & KN_BUSY) {
mutex_spin_exit(&kq->kq_lock);
goto again;
}
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
if (kn->kn_fop->f_flags & FILTEROP_ISFD)
fd_putfile(kn->kn_id);
atomic_dec_uint(&kn->kn_kfilter->refcnt);
knote_free(kn);
}
/*
* Queue new event for knote.
*/
static void
knote_enqueue(struct knote *kn)
{
struct kqueue *kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
/* Don't bother enqueueing a dying knote. */
goto out;
}
if ((kn->kn_status & KN_DISABLED) != 0) {
kn->kn_status &= ~KN_DISABLED;
}
if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
kq_check(kq);
kn->kn_status |= KN_QUEUED;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
kq->kq_count++;
kq_check(kq);
cv_broadcast(&kq->kq_cv);
selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
}
out:
mutex_spin_exit(&kq->kq_lock);
}
/*
* Activate a knote, queueing it on its kqueue if it is not already
* queued or disabled.
*/
static void
knote_activate_locked(struct knote *kn)
{
struct kqueue *kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
kq = kn->kn_kq;
if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
/* Don't bother enqueueing a dying knote. */
return;
}
kn->kn_status |= KN_ACTIVE;
if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
kq_check(kq);
kn->kn_status |= KN_QUEUED;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
kq->kq_count++;
kq_check(kq);
cv_broadcast(&kq->kq_cv);
selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
}
}
static void
knote_activate(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
knote_activate_locked(kn);
mutex_spin_exit(&kq->kq_lock);
}
static void
knote_deactivate_locked(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
if (kn->kn_status & KN_QUEUED) {
kq_check(kq);
kn->kn_status &= ~KN_QUEUED;
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) > 0);
kq->kq_count--;
kq_check(kq);
}
kn->kn_status &= ~KN_ACTIVE;
}
/*
* Set EV_EOF on the specified knote. Also allows additional
* EV_* flags to be set (e.g. EV_ONESHOT).
*/
void
knote_set_eof(struct knote *kn, uint32_t flags)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_flags |= EV_EOF | flags;
mutex_spin_exit(&kq->kq_lock);
}
/*
* Clear EV_EOF on the specified knote.
*/
void
knote_clear_eof(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_flags &= ~EV_EOF;
mutex_spin_exit(&kq->kq_lock);
}
/*
* Initialize a klist.
*/
void
klist_init(struct klist *list)
{
SLIST_INIT(list);
}
/*
* Finalize a klist.
*/
void
klist_fini(struct klist *list)
{
struct knote *kn;
/*
* Neuter all existing knotes on the klist because the list is
* being destroyed. The caller has guaranteed that no additional
* knotes will be added to the list, that the backing object's
* locks are not held (otherwise there is a locking order issue
* with acquiring the knote foplock), and that we can traverse
* the list safely in this state.
*/
SLIST_FOREACH(kn, list, kn_selnext) {
knote_foplock_enter(kn);
KASSERT(kn->kn_fop != NULL);
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
kn->kn_fop = &nop_fd_filtops;
} else {
kn->kn_fop = &nop_filtops;
}
knote_foplock_exit(kn);
}
}
/*
* Insert a knote into a klist.
*/
void
klist_insert(struct klist *list, struct knote *kn)
{
SLIST_INSERT_HEAD(list, kn, kn_selnext);
}
/*
* Remove a knote from a klist. Returns true if the last
* knote was removed and the list is now empty.
*/
bool
klist_remove(struct klist *list, struct knote *kn)
{
SLIST_REMOVE(list, kn, knote, kn_selnext);
return SLIST_EMPTY(list);
}
/* $NetBSD: uvm_glue.c,v 1.182 2023/10/04 20:34:19 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_glue.c 8.6 (Berkeley) 1/5/94
* from: Id: uvm_glue.c,v 1.1.2.8 1998/02/07 01:16:54 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.182 2023/10/04 20:34:19 ad Exp $");
#include "opt_kgdb.h"
#include "opt_kstack.h"
#include "opt_uvmhist.h"
/*
* uvm_glue.c: glue functions
*/
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/syncobj.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lwp.h>
#include <sys/asan.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>
/*
* uvm_kernacc: test if kernel can access a memory region.
*
* => Currently used only by /dev/kmem driver (dev/mm.c).
*/
bool
uvm_kernacc(void *addr, size_t len, vm_prot_t prot)
{
vaddr_t saddr = trunc_page((vaddr_t)addr);
vaddr_t eaddr = round_page(saddr + len);
bool rv;
vm_map_lock_read(kernel_map);
rv = uvm_map_checkprot(kernel_map, saddr, eaddr, prot);
vm_map_unlock_read(kernel_map);
return rv;
}
#ifdef KGDB
/*
* Change protections on kernel pages from addr to addr+len
* (presumably so debugger can plant a breakpoint).
*
* We force the protection change at the pmap level. If we were
* to use vm_map_protect(), a change to allow writing would be lazily
* applied, meaning we would still take a protection fault, something
* we really don't want to do. It would also fragment the kernel
* map unnecessarily. We cannot use pmap_protect since it also won't
* enforce a write-enable request. Using pmap_enter is the only way
* we can ensure the change takes place properly.
*/
void
uvm_chgkprot(void *addr, size_t len, int rw)
{
vm_prot_t prot;
paddr_t pa;
vaddr_t sva, eva;
prot = rw == B_READ ? VM_PROT_READ : VM_PROT_READ|VM_PROT_WRITE;
eva = round_page((vaddr_t)addr + len);
for (sva = trunc_page((vaddr_t)addr); sva < eva; sva += PAGE_SIZE) {
/*
* Extract physical address for the page.
*/
if (pmap_extract(pmap_kernel(), sva, &pa) == false)
panic("%s: invalid page", __func__);
pmap_enter(pmap_kernel(), sva, pa, prot, PMAP_WIRED);
}
pmap_update(pmap_kernel());
}
#endif
/*
* uvm_vslock: wire user memory for I/O
*
* - called from physio and sys___sysctl
* - XXXCDC: consider nuking this (or making it a macro?)
*/
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access_type)
{
struct vm_map *map;
vaddr_t start, end;
int error;
map = &vs->vm_map;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
error = uvm_fault_wire(map, start, end, access_type, 0);
return error;
}
/*
* uvm_vsunlock: unwire user memory wired by uvm_vslock()
*
* - called from physio and sys___sysctl
* - XXXCDC: consider nuking this (or making it a macro?)
*/
void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{
uvm_fault_unwire(&vs->vm_map, trunc_page((vaddr_t)addr),
round_page((vaddr_t)addr + len));
}
/*
* uvm_proc_fork: fork a virtual address space
*
* - the address space is copied as per parent map's inherit values
*/
void
uvm_proc_fork(struct proc *p1, struct proc *p2, bool shared)
{
if (shared == true) {
p2->p_vmspace = NULL;
uvmspace_share(p1, p2);
} else {
p2->p_vmspace = uvmspace_fork(p1->p_vmspace);
}
cpu_proc_fork(p1, p2);
}
/*
* uvm_lwp_fork: fork a thread
*
* - a new PCB structure is allocated for the child process,
* and filled in by MD layer
* - if specified, the child gets a new user stack described by
* stack and stacksize
* - NOTE: the kernel stack may be at a different location in the child
* process, and thus addresses of automatic variables may be invalid
* after cpu_lwp_fork returns in the child process. We do nothing here
* after cpu_lwp_fork returns.
*/
void
uvm_lwp_fork(struct lwp *l1, struct lwp *l2, void *stack, size_t stacksize,
void (*func)(void *), void *arg)
{
/* Fill stack with magic number. */
kstack_setup_magic(l2);
/*
* cpu_lwp_fork() copies and updates the pcb, and makes the child ready
* to run. If this is a normal user fork, the child will exit
* directly to user mode via child_return() on its first time
* slice and will not return here. If this is a kernel thread,
* the specified entry point will be executed.
*/
cpu_lwp_fork(l1, l2, stack, stacksize, func, arg);
}
#ifndef USPACE_ALIGN
#define USPACE_ALIGN 0
#endif
static pool_cache_t uvm_uarea_cache;
#if defined(__HAVE_CPU_UAREA_ROUTINES)
static pool_cache_t uvm_uarea_system_cache;
#else
#define uvm_uarea_system_cache uvm_uarea_cache
#endif
static void *
uarea_poolpage_alloc(struct pool *pp, int flags)
{
KASSERT((flags & PR_WAITOK) != 0);
#if defined(PMAP_MAP_POOLPAGE)
while (USPACE == PAGE_SIZE &&
(USPACE_ALIGN == 0 || USPACE_ALIGN == PAGE_SIZE)) {
struct vm_page *pg;
vaddr_t va;
#if defined(PMAP_ALLOC_POOLPAGE)
pg = PMAP_ALLOC_POOLPAGE(0);
#else
pg = uvm_pagealloc(NULL, 0, NULL, 0);
#endif
if (pg == NULL) {
uvm_wait("uarea");
continue;
}
va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg));
KASSERT(va != 0);
return (void *)va;
}
#endif
#if defined(__HAVE_CPU_UAREA_ROUTINES)
void *va = cpu_uarea_alloc(false);
if (va)
return (void *)va;
#endif
return (void *)uvm_km_alloc(kernel_map, pp->pr_alloc->pa_pagesz,
USPACE_ALIGN, UVM_KMF_WIRED | UVM_KMF_WAITVA);
}
static void
uarea_poolpage_free(struct pool *pp, void *addr)
{
#if defined(PMAP_MAP_POOLPAGE)
if (USPACE == PAGE_SIZE &&
(USPACE_ALIGN == 0 || USPACE_ALIGN == PAGE_SIZE)) {
paddr_t pa;
pa = PMAP_UNMAP_POOLPAGE((vaddr_t) addr);
KASSERT(pa != 0);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
return;
}
#endif
#if defined(__HAVE_CPU_UAREA_ROUTINES)
if (cpu_uarea_free(addr))
return;
#endif
uvm_km_free(kernel_map, (vaddr_t)addr, pp->pr_alloc->pa_pagesz,
UVM_KMF_WIRED);
}
static struct pool_allocator uvm_uarea_allocator = {
.pa_alloc = uarea_poolpage_alloc,
.pa_free = uarea_poolpage_free,
.pa_pagesz = USPACE,
};
#if defined(__HAVE_CPU_UAREA_ROUTINES)
static void *
uarea_system_poolpage_alloc(struct pool *pp, int flags)
{
void * const va = cpu_uarea_alloc(true);
if (va != NULL)
return va;
return (void *)uvm_km_alloc(kernel_map, pp->pr_alloc->pa_pagesz,
USPACE_ALIGN, UVM_KMF_WIRED |
((flags & PR_WAITOK) ? UVM_KMF_WAITVA :
(UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)));
}
static void
uarea_system_poolpage_free(struct pool *pp, void *addr)
{
if (cpu_uarea_free(addr))
return;
uvm_km_free(kernel_map, (vaddr_t)addr, pp->pr_alloc->pa_pagesz,
UVM_KMF_WIRED);
}
static struct pool_allocator uvm_uarea_system_allocator = {
.pa_alloc = uarea_system_poolpage_alloc,
.pa_free = uarea_system_poolpage_free,
.pa_pagesz = USPACE,
};
#endif /* __HAVE_CPU_UAREA_ROUTINES */
void
uvm_uarea_init(void)
{
int flags = PR_NOTOUCH;
/*
* specify PR_NOALIGN unless the alignment provided by
* the backend (USPACE_ALIGN) is sufficient to provide
* pool page size (USPACE) alignment.
*/
if ((USPACE_ALIGN == 0 && USPACE != PAGE_SIZE) ||
(USPACE_ALIGN % USPACE) != 0) {
flags |= PR_NOALIGN;
}
uvm_uarea_cache = pool_cache_init(USPACE, USPACE_ALIGN, 0, flags,
"uarea", &uvm_uarea_allocator, IPL_NONE, NULL, NULL, NULL);
#if defined(__HAVE_CPU_UAREA_ROUTINES)
uvm_uarea_system_cache = pool_cache_init(USPACE, USPACE_ALIGN,
0, flags, "uareasys", &uvm_uarea_system_allocator,
IPL_NONE, NULL, NULL, NULL);
#endif
}
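/*
 * Worked example of the PR_NOALIGN test above (values are illustrative):
 * with USPACE == 2 * PAGE_SIZE and USPACE_ALIGN == 0 the first clause is
 * true, so PR_NOALIGN is set and the pool does not rely on USPACE-aligned
 * items; with USPACE == PAGE_SIZE and USPACE_ALIGN == 0, page alignment
 * already implies USPACE alignment and the flag stays clear.
 */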
/*
* uvm_uarea_alloc: allocate a u-area
*/
vaddr_t
uvm_uarea_alloc(void)
{
return (vaddr_t)pool_cache_get(uvm_uarea_cache, PR_WAITOK);
}
vaddr_t
uvm_uarea_system_alloc(struct cpu_info *ci)
{
#ifdef __HAVE_CPU_UAREA_ALLOC_IDLELWP
if (__predict_false(ci != NULL))
return cpu_uarea_alloc_idlelwp(ci);
#endif
return (vaddr_t)pool_cache_get(uvm_uarea_system_cache, PR_WAITOK);
}
/*
* uvm_uarea_free: free a u-area
*/
void
uvm_uarea_free(vaddr_t uaddr)
{
kasan_mark((void *)uaddr, USPACE, USPACE, 0);
pool_cache_put(uvm_uarea_cache, (void *)uaddr);
}
void
uvm_uarea_system_free(vaddr_t uaddr)
{
kasan_mark((void *)uaddr, USPACE, USPACE, 0);
pool_cache_put(uvm_uarea_system_cache, (void *)uaddr);
}
vaddr_t
uvm_lwp_getuarea(lwp_t *l)
{
return (vaddr_t)l->l_addr - UAREA_PCB_OFFSET;
}
void
uvm_lwp_setuarea(lwp_t *l, vaddr_t addr)
{
l->l_addr = (void *)(addr + UAREA_PCB_OFFSET);
}
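/*
 * Note (illustrative): uvm_lwp_getuarea() and uvm_lwp_setuarea() are
 * inverses; for any u-area address "ua":
 *
 *	uvm_lwp_setuarea(l, ua);
 *	KASSERT(uvm_lwp_getuarea(l) == ua);
 *
 * l->l_addr always points UAREA_PCB_OFFSET bytes into the u-area, at
 * the MD PCB.
 */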
/*
* uvm_proc_exit: exit a virtual address space
*
* - borrow proc0's address space because freeing the vmspace
* of the dead process may block.
*/
void
uvm_proc_exit(struct proc *p)
{
struct lwp *l = curlwp; /* XXX */
struct vmspace *ovm;
KASSERT(p == l->l_proc);
ovm = p->p_vmspace;
KASSERT(ovm != NULL);
if (__predict_false(ovm == proc0.p_vmspace))
return;
/*
* borrow proc0's address space.
*/
kpreempt_disable();
pmap_deactivate(l);
p->p_vmspace = proc0.p_vmspace;
pmap_activate(l);
kpreempt_enable();
uvmspace_free(ovm);
}
void
uvm_lwp_exit(struct lwp *l)
{
vaddr_t va = uvm_lwp_getuarea(l);
bool system = (l->l_flag & LW_SYSTEM) != 0;
if (system)
uvm_uarea_system_free(va);
else
uvm_uarea_free(va);
#ifdef DIAGNOSTIC
uvm_lwp_setuarea(l, (vaddr_t)NULL);
#endif
}
/*
* uvm_init_limits: init per-process VM limits
*
* - called for process 0 and then inherited by all others.
*/
void
uvm_init_limits(struct proc *p)
{
/*
* Set up the initial limits on process VM. Set the maximum
* resident set size to be all of (reasonably) available memory.
* This causes any single, large process to start random page
* replacement once it fills memory.
*/
p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
p->p_rlimit[RLIMIT_STACK].rlim_max = maxsmap;
p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ;
p->p_rlimit[RLIMIT_DATA].rlim_max = maxdmap;
p->p_rlimit[RLIMIT_AS].rlim_cur = RLIM_INFINITY;
p->p_rlimit[RLIMIT_AS].rlim_max = RLIM_INFINITY;
p->p_rlimit[RLIMIT_RSS].rlim_cur = MIN(VM_MAXUSER_ADDRESS,
ctob((rlim_t)uvm_availmem(false)));
}
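/*
 * Worked example (numbers are illustrative): on a machine with 1 GiB of
 * available memory and 4 KiB pages, uvm_availmem(false) is about 262144
 * pages, ctob() of that is 1 GiB, and the initial RLIMIT_RSS soft limit
 * becomes MIN(VM_MAXUSER_ADDRESS, 1 GiB).
 */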
/*
* uvm_scheduler: process zero main loop.
*/
extern struct loadavg averunnable;
void
uvm_scheduler(void)
{
lwp_t *l = curlwp;
lwp_lock(l);
l->l_class = SCHED_FIFO;
lwp_changepri(l, PRI_VM);
lwp_unlock(l);
/* Start the freelist cache. */
uvm_pgflcache_start();
for (;;) {
/* Update legacy stats for post-mortem debugging. */
uvm_update_uvmexp();
/* See if the pagedaemon needs to generate some free pages. */
uvm_kick_pdaemon();
/* Calculate process statistics. */
sched_pstats();
(void)kpause("uvm", false, hz, NULL);
}
}
/*
* uvm_idle: called from the idle loop.
*/
void
uvm_idle(void)
{
struct cpu_info *ci = curcpu();
struct uvm_cpu *ucpu = ci->ci_data.cpu_uvm;
KASSERT(kpreempt_disabled());
uvmpdpol_idle(ucpu);
}
/* $NetBSD: ipsec.h,v 1.93 2022/10/28 05:23:09 ozaki-r Exp $ */
/* $FreeBSD: ipsec.h,v 1.2.4.2 2004/02/14 22:23:23 bms Exp $ */
/* $KAME: ipsec.h,v 1.53 2001/11/20 08:32:38 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _NETIPSEC_IPSEC_H_
#define _NETIPSEC_IPSEC_H_
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <net/pfkeyv2.h>
#ifdef _KERNEL
#include <sys/socketvar.h>
#include <sys/localcount.h>
#include <netinet/in_pcb.h>
#include <netipsec/keydb.h>
/*
* Security Policy Index
* Ensure that both address families in the "src" and "dst" are same.
* When the value of the ul_proto is ICMPv6, the port field in "src"
* specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code.
*/
struct secpolicyindex {
u_int8_t dir; /* direction of packet flow, see below */
union sockaddr_union src; /* IP src address for SP */
union sockaddr_union dst; /* IP dst address for SP */
u_int8_t prefs; /* prefix length in bits for src */
u_int8_t prefd; /* prefix length in bits for dst */
u_int16_t ul_proto; /* upper layer Protocol */
};
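/*
 * Illustrative sketch: when ul_proto is ICMPv6 the "port" fields carry
 * the ICMPv6 type and code rather than real ports, so a selector for
 * echo requests (type 128, code 0) would be filled in roughly as:
 */
#if 0
spidx.ul_proto = IPPROTO_ICMPV6;
spidx.src.sin6.sin6_port = htons(128); /* ICMPv6 type */
spidx.dst.sin6.sin6_port = htons(0); /* ICMPv6 code */
#endif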
/* Security Policy Data Base */
struct secpolicy {
struct pslist_entry pslist_entry;
struct localcount localcount; /* reference count */
struct secpolicyindex spidx; /* selector */
u_int32_t id; /* unique number of this SP on the system */
u_int state; /* 0: dead, others: alive */
#define IPSEC_SPSTATE_DEAD 0
#define IPSEC_SPSTATE_ALIVE 1
u_int origin; /* who generated this SP */
#define IPSEC_SPORIGIN_USER 0
#define IPSEC_SPORIGIN_KERNEL 1
u_int policy; /* DISCARD, NONE or IPSEC, see keyv2.h */
struct ipsecrequest *req;
/* pointer to the ipsec request tree; */
/* NULL unless policy == IPSEC. */
/*
* lifetime handler.
* the policy can be used without limitation if both lifetime and
* validtime are zero.
* "lifetime" is passed by sadb_lifetime.sadb_lifetime_addtime.
* "validtime" is passed by sadb_lifetime.sadb_lifetime_usetime.
*/
time_t created; /* time created the policy */
time_t lastused; /* updated whenever the kernel sends a packet */
time_t lifetime; /* duration of the lifetime of this policy */
time_t validtime; /* duration this policy is valid without use */
};
/* Request for IPsec */
struct ipsecrequest {
struct ipsecrequest *next;
/* pointer to next structure */
/* If NULL, it means the end of chain. */
struct secasindex saidx;/* hint for search proper SA */
/* if __ss_len == 0 then no address specified.*/
u_int level; /* IPsec level defined below. */
struct secpolicy *sp; /* back pointer to SP */
};
/* security policy in PCB */
struct inpcbpolicy {
struct secpolicy *sp_in;
struct secpolicy *sp_out;
int priv; /* privileged socket ? */
/* cached policy */
struct {
struct secpolicy *cachesp;
struct secpolicyindex cacheidx;
int cachehint; /* processing requirement hint: */
#define IPSEC_PCBHINT_UNKNOWN 0 /* Unknown */
#define IPSEC_PCBHINT_YES 1 /* IPsec processing is required */
#define IPSEC_PCBHINT_NO 2 /* IPsec processing not required */
u_int cachegen; /* spdgen when cache filled */
} sp_cache[3]; /* XXX 3 == IPSEC_DIR_MAX */
int sp_cacheflags;
#define IPSEC_PCBSP_CONNECTED 1
struct inpcb *sp_inp; /* back pointer */
};
extern u_int ipsec_spdgen;
static __inline bool
ipsec_pcb_skip_ipsec(struct inpcbpolicy *pcbsp, int dir)
{
KASSERT(inp_locked(pcbsp->sp_inp));
return pcbsp->sp_cache[(dir)].cachehint == IPSEC_PCBHINT_NO &&
pcbsp->sp_cache[(dir)].cachegen == ipsec_spdgen;
}
/* SP acquiring list table. */
struct secspacq {
LIST_ENTRY(secspacq) chain;
struct secpolicyindex spidx;
time_t created; /* for lifetime */
int count; /* for lifetime */
/* XXX: here is mbuf place holder to be sent ? */
};
#endif /* _KERNEL */
/* buffer size for formatted output of ipsec address (addr + '%' + scope_id?) */
#define IPSEC_ADDRSTRLEN (INET6_ADDRSTRLEN + 11)
/* buffer size for ipsec_logsastr() */
#define IPSEC_LOGSASTRLEN 192
/* according to IANA assignment, port 0x0000 and proto 0xff are reserved. */
#define IPSEC_PORT_ANY 0
#define IPSEC_ULPROTO_ANY 255
#define IPSEC_PROTO_ANY 255
/* mode of security protocol */
/* NOTE: DON'T use IPSEC_MODE_ANY in the SPD. It is only used in the SAD. */
#define IPSEC_MODE_ANY 0 /* i.e. wildcard. */
#define IPSEC_MODE_TRANSPORT 1
#define IPSEC_MODE_TUNNEL 2
#define IPSEC_MODE_TCPMD5 3 /* TCP MD5 mode */
/*
* Direction of security policy.
* NOTE: INVALID is used only as a flag;
* the others are also used as loop counters.
*/
#define IPSEC_DIR_ANY 0
#define IPSEC_DIR_INBOUND 1
#define IPSEC_DIR_OUTBOUND 2
#define IPSEC_DIR_MAX 3
#define IPSEC_DIR_INVALID 4
#define IPSEC_DIR_IS_VALID(dir) ((dir) >= 0 && (dir) <= IPSEC_DIR_MAX)
#define IPSEC_DIR_IS_INOROUT(dir) ((dir) == IPSEC_DIR_INBOUND || \
(dir) == IPSEC_DIR_OUTBOUND)
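/*
 * Illustrative sketch: IPSEC_DIR_INBOUND and IPSEC_DIR_OUTBOUND double
 * as array indices, which is why per-direction state is usually walked
 * like this:
 */
#if 0
int dir;

for (dir = IPSEC_DIR_INBOUND; dir <= IPSEC_DIR_OUTBOUND; dir++) {
/* operate on the per-direction entry for "dir" */
}
#endif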
/* Policy level */
/*
* IPSEC, ENTRUST and BYPASS are allowed for setsockopt() in PCB,
* DISCARD, IPSEC and NONE are allowed for setkey() in SPD.
* DISCARD and NONE are allowed for system default.
*/
#define IPSEC_POLICY_DISCARD 0 /* discarding packet */
#define IPSEC_POLICY_NONE 1 /* through IPsec engine */
#define IPSEC_POLICY_IPSEC 2 /* do IPsec */
#define IPSEC_POLICY_ENTRUST 3 /* consulting SPD if present. */
#define IPSEC_POLICY_BYPASS 4 /* only for privileged socket. */
/* Security protocol level */
#define IPSEC_LEVEL_DEFAULT 0 /* reference to system default */
#define IPSEC_LEVEL_USE 1 /* use SA if present. */
#define IPSEC_LEVEL_REQUIRE 2 /* require SA. */
#define IPSEC_LEVEL_UNIQUE 3 /* unique SA. */
#define IPSEC_MANUAL_REQID_MAX 0x3fff
/*
* if the security policy level is "unique", this id
* identifies the SA to use; otherwise it is zero.
* 1 - 0x3fff are reserved for manual keying and
* 0 is reserved for the reason above; the rest are
* for kernel use.
* Note that this id does not identify an SA
* by itself.
*/
#define IPSEC_REPLAYWSIZE 32
#ifdef _KERNEL
extern int ipsec_debug;
#ifdef IPSEC_DEBUG
extern int ipsec_replay;
extern int ipsec_integrity;
#endif
extern struct secpolicy ip4_def_policy;
extern int ip4_esp_trans_deflev;
extern int ip4_esp_net_deflev;
extern int ip4_ah_trans_deflev;
extern int ip4_ah_net_deflev;
extern int ip4_ah_cleartos;
extern int ip4_ah_offsetmask;
extern int ip4_ipsec_dfbit;
extern int ip4_ipsec_ecn;
extern int crypto_support;
#include <sys/syslog.h>
#define DPRINTF(fmt, args...) \
do { \
if (ipsec_debug) \
log(LOG_DEBUG, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#define IPSECLOG(level, fmt, args...) \
do { \
if (ipsec_debug) \
log(level, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#define ipsec_indone(m) \
((m->m_flags & M_AUTHIPHDR) || (m->m_flags & M_DECRYPTED))
#define ipsec_outdone(m) \
(m_tag_find((m), PACKET_TAG_IPSEC_OUT_DONE) != NULL)
static __inline bool
ipsec_skip_pfil(struct mbuf *m)
{
bool rv;
if (ipsec_indone(m) &&
((m->m_pkthdr.pkthdr_flags & PKTHDR_FLAG_IPSEC_SKIP_PFIL) != 0)) {
m->m_pkthdr.pkthdr_flags &= ~PKTHDR_FLAG_IPSEC_SKIP_PFIL;
rv = true;
} else {
rv = false;
}
return rv;
}
void ipsec_pcbconn(struct inpcbpolicy *);
void ipsec_pcbdisconn(struct inpcbpolicy *);
void ipsec_invalpcbcacheall(void);
struct inpcb;
int ipsec4_output(struct mbuf *, struct inpcb *, int, u_long *, bool *, bool *, bool *);
int ipsec_ip_input_checkpolicy(struct mbuf *, bool);
void ipsec_mtu(struct mbuf *, int *);
#ifdef INET6
void ipsec6_udp_cksum(struct mbuf *);
#endif
struct inpcb;
int ipsec_init_pcbpolicy(struct socket *so, struct inpcbpolicy **);
int ipsec_copy_policy(const struct inpcbpolicy *, struct inpcbpolicy *);
u_int ipsec_get_reqlevel(const struct ipsecrequest *);
int ipsec_set_policy(struct inpcb *, const void *, size_t, kauth_cred_t);
int ipsec_get_policy(struct inpcb *, const void *, size_t, struct mbuf **);
int ipsec_delete_pcbpolicy(struct inpcb *);
int ipsec_in_reject(struct mbuf *, struct inpcb *);
struct secasvar *ipsec_lookup_sa(const struct ipsecrequest *,
const struct mbuf *);
struct secas;
struct tcpcb;
int ipsec_chkreplay(u_int32_t, const struct secasvar *);
int ipsec_updatereplay(u_int32_t, const struct secasvar *);
size_t ipsec_hdrsiz(struct mbuf *, u_int, struct inpcb *);
size_t ipsec4_hdrsiz_tcp(struct tcpcb *);
union sockaddr_union;
const char *ipsec_address(const union sockaddr_union* sa, char *, size_t);
const char *ipsec_logsastr(const struct secasvar *, char *, size_t);
/* NetBSD protosw ctlin entrypoint */
void *esp4_ctlinput(int, const struct sockaddr *, void *);
void *ah4_ctlinput(int, const struct sockaddr *, void *);
void ipsec_output_init(void);
struct m_tag;
void ipsec4_common_input(struct mbuf *m, int, int);
int ipsec4_common_input_cb(struct mbuf *, struct secasvar *, int, int);
int ipsec4_process_packet(struct mbuf *, const struct ipsecrequest *, u_long *);
int ipsec_process_done(struct mbuf *, const struct ipsecrequest *,
struct secasvar *, int);
struct mbuf *m_clone(struct mbuf *);
struct mbuf *m_makespace(struct mbuf *, int, int, int *);
void *m_pad(struct mbuf *, int);
int m_striphdr(struct mbuf *, int, int);
extern int ipsec_used __read_mostly;
extern int ipsec_enabled __read_mostly;
#endif /* _KERNEL */
#ifndef _KERNEL
char *ipsec_set_policy(const char *, int);
int ipsec_get_policylen(char *);
char *ipsec_dump_policy(char *, const char *);
const char *ipsec_strerror(void);
#endif /* !_KERNEL */
#ifdef _KERNEL
/* External declarations of per-file init functions */
void ah_attach(void);
void esp_attach(void);
void ipcomp_attach(void);
void ipe4_attach(void);
void tcpsignature_attach(void);
void ipsec_attach(void);
void sysctl_net_inet_ipsec_setup(struct sysctllog **);
#ifdef INET6
void sysctl_net_inet6_ipsec6_setup(struct sysctllog **);
#endif
#endif /* _KERNEL */
#endif /* !_NETIPSEC_IPSEC_H_ */
/* $NetBSD: strncmp.c,v 1.3 2018/02/04 20:22:17 mrg Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)strncmp.c 8.1 (Berkeley) 6/4/93";
#else
__RCSID("$NetBSD: strncmp.c,v 1.3 2018/02/04 20:22:17 mrg Exp $");
#endif
#endif /* LIBC_SCCS and not lint */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
int
strncmp(const char *s1, const char *s2, size_t n)
{
if (n == 0)
return (0);
do {
if (*s1 != *s2++)
return (*(const unsigned char *)s1 -
*(const unsigned char *)--s2);
if (*s1++ == 0)
break;
} while (--n != 0);
return (0);
}
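/*
 * Usage sketch (illustrative): only the sign of the result matters, and
 * the comparison stops after n characters or at the first NUL:
 *
 *	strncmp("foobar", "foo", 3) == 0	first 3 chars equal
 *	strncmp("foo", "fop", 8) < 0		'o' < 'p'; NUL ends the scan
 *	strncmp("abc", "xyz", 0) == 0		n == 0 compares nothing
 */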
/* $NetBSD: userret.h,v 1.35 2024/01/28 10:06:19 skrll Exp $ */
/*-
* Copyright (c) 1998, 2000, 2003, 2006, 2008, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum, and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_USERRET_H_
#define _SYS_USERRET_H_
#include <sys/lockdebug.h>
#include <sys/intr.h>
#include <sys/psref.h>
/*
* Define the MI code needed before returning to user mode, for trap and
* syscall.
*
* We handle "exceptional" events: pending signals, stop/exit actions, etc.
* Note that the event must be flagged BEFORE any AST is posted as we are
* reading unlocked.
*/
static __inline void
mi_userret(struct lwp *l)
{
int exception;
KPREEMPT_DISABLE(l);
KASSERTMSG(l->l_cpu->ci_biglock_count == 0, "kernel_lock leaked");
KASSERT(l->l_blcnt == 0);
exception = l->l_cpu->ci_want_resched | (l->l_flag & LW_USERRET);
KPREEMPT_ENABLE(l);
if (__predict_false(exception)) {
lwp_userret(l);
}
LOCKDEBUG_BARRIER(NULL, 0);
KASSERT(l->l_nopreempt == 0);
PSREF_DEBUG_BARRIER();
KASSERT(l->l_psrefs == 0);
}
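/*
 * Illustrative sketch (hypothetical MD code): every path back to user
 * mode ends by calling mi_userret() on curlwp, e.g. at the tail of an
 * MD syscall or trap handler:
 */
#if 0
/* ... handle the syscall/trap ... */
mi_userret(l);
/* return to user mode */
#endif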
#endif /* !_SYS_USERRET_H_ */
/* $NetBSD: kern_sysctl.c,v 1.270 2023/09/09 16:01:09 christos Exp $ */
/*-
* Copyright (c) 2003, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Brown.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sysctl.c 8.9 (Berkeley) 5/20/95
*/
/*
* sysctl system call.
*/
#define __COMPAT_SYSCTL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sysctl.c,v 1.270 2023/09/09 16:01:09 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_defcorename.h"
#endif
#include "ksyms.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/cprng.h>
#include <sys/kauth.h>
#include <sys/ksyms.h>
#include <sys/ktrace.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/once.h>
#include <sys/rndsource.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <crypto/blake2/blake2s.h>
#define MAXDESCLEN 1024
MALLOC_DEFINE(M_SYSCTLNODE, "sysctlnode", "sysctl node structures");
MALLOC_DEFINE(M_SYSCTLDATA, "sysctldata", "misc sysctl data");
static int sysctl_mmap(SYSCTLFN_PROTO);
static int sysctl_alloc(struct sysctlnode *, int);
static int sysctl_realloc(struct sysctlnode *);
static int sysctl_cvt_in(struct lwp *, int *, const void *, size_t,
struct sysctlnode *);
static int sysctl_cvt_out(struct lwp *, int, const struct sysctlnode *,
void *, size_t, size_t *);
static int sysctl_log_add(struct sysctllog **, const struct sysctlnode *);
static int sysctl_log_realloc(struct sysctllog *);
typedef void sysctl_setup_func(struct sysctllog **);
#ifdef SYSCTL_DEBUG
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif
struct sysctllog {
const struct sysctlnode *log_root;
int *log_num;
int log_size, log_left;
};
/*
* the "root" of the new sysctl tree
*/
struct sysctlnode sysctl_root = {
.sysctl_flags = SYSCTL_VERSION|
CTLFLAG_ROOT|CTLFLAG_READWRITE|
CTLTYPE_NODE,
.sysctl_num = 0,
.sysctl_size = sizeof(struct sysctlnode),
.sysctl_name = "(root)",
};
/*
* link set of functions that add nodes at boot time (see also
* sysctl_buildtree())
*/
__link_set_decl(sysctl_funcs, sysctl_setup_func);
/*
* The `sysctl_treelock' is intended to serialize access to the sysctl
* tree. XXX This has serious problems; allocating memory and
* copying data out with the lock held is insane.
*/
krwlock_t sysctl_treelock;
kmutex_t sysctl_file_marker_lock;
/*
* Attributes stored in the kernel.
*/
char hostname[MAXHOSTNAMELEN];
int hostnamelen;
char domainname[MAXHOSTNAMELEN];
int domainnamelen;
long hostid;
#ifndef DEFCORENAME
#define DEFCORENAME "%n.core"
#endif
char defcorename[MAXPATHLEN] = DEFCORENAME;
/*
* ********************************************************************
* Section 0: Some simple glue
* ********************************************************************
* By wrapping copyin(), copyout(), and copyinstr() like this, we can
* stop caring about who's calling us and simplify some code a bunch.
* ********************************************************************
*/
int
sysctl_copyin(struct lwp *l, const void *uaddr, void *kaddr, size_t len)
{
int error;
if (l != NULL) {
error = copyin(uaddr, kaddr, len);
ktrmibio(-1, UIO_WRITE, uaddr, len, error);
} else {
error = kcopy(uaddr, kaddr, len);
}
return error;
}
int
sysctl_copyout(struct lwp *l, const void *kaddr, void *uaddr, size_t len)
{
int error;
if (l != NULL) {
error = copyout(kaddr, uaddr, len);
ktrmibio(-1, UIO_READ, uaddr, len, error);
} else {
error = kcopy(kaddr, uaddr, len);
}
return error;
}
int
sysctl_copyinstr(struct lwp *l, const void *uaddr, void *kaddr,
size_t len, size_t *done)
{
int error;
if (l != NULL) {
error = copyinstr(uaddr, kaddr, len, done);
ktrmibio(-1, UIO_WRITE, uaddr, len, error);
} else {
error = copystr(uaddr, kaddr, len, done);
}
return error;
}
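/*
 * Illustrative sketch: these wrappers let one helper serve both user
 * requests (l != NULL, real copyin/copyout) and purely in-kernel
 * requests (l == NULL, plain kcopy):
 */
#if 0
int v;

/* from a node handler; works for either kind of caller */
error = sysctl_copyin(l, newp, &v, sizeof(v));
#endif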
/*
* ********************************************************************
* Initialize sysctl subsystem.
* ********************************************************************
*/
void
sysctl_init(void)
{
sysctl_setup_func *const *sysctl_setup;
rw_init(&sysctl_treelock);
/*
* dynamic mib numbers start here
*/
sysctl_root.sysctl_num = CREATE_BASE;
sysctl_basenode_init();
__link_set_foreach(sysctl_setup, sysctl_funcs) {
(**sysctl_setup)(NULL);
}
mutex_init(&sysctl_file_marker_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Setting this means no more permanent nodes can be added,
* trees that claim to be readonly at the root now are, and if
* the main tree is readonly, *everything* is.
*
* Also starts up the PRNG used for the "random" sysctl: it's
* better to start it later than sooner.
*
* Call this at the end of kernel init.
*/
void
sysctl_finalize(void)
{
sysctl_root.sysctl_flags |= CTLFLAG_PERMANENT;
}
/*
* ********************************************************************
* The main native sysctl system call itself.
* ********************************************************************
*/
int
sys___sysctl(struct lwp *l, const struct sys___sysctl_args *uap, register_t *retval)
{
/* {
syscallarg(const int *) name;
syscallarg(u_int) namelen;
syscallarg(void *) old;
syscallarg(size_t *) oldlenp;
syscallarg(const void *) new;
syscallarg(size_t) newlen;
} */
int error, nerror, name[CTL_MAXNAME];
size_t oldlen, savelen, *oldlenp;
/*
* get oldlen
*/
oldlen = 0;
oldlenp = SCARG(uap, oldlenp);
if (oldlenp != NULL) {
error = copyin(oldlenp, &oldlen, sizeof(oldlen));
if (error)
return (error);
}
savelen = oldlen;
/*
* top-level sysctl names may or may not be non-terminal, but
* we don't care
*/
if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 1)
return (EINVAL);
error = copyin(SCARG(uap, name), &name,
SCARG(uap, namelen) * sizeof(int));
if (error)
return (error);
ktrmib(name, SCARG(uap, namelen));
sysctl_lock(SCARG(uap, newv) != NULL);
/*
* do sysctl work (NULL means main built-in default tree)
*/
error = sysctl_dispatch(&name[0], SCARG(uap, namelen),
SCARG(uap, oldv), &oldlen,
SCARG(uap, newv), SCARG(uap, newlen),
&name[0], l, NULL);
/*
* release the sysctl lock
*/
sysctl_unlock();
/*
* set caller's oldlen to new value even in the face of an
* error (if this gets an error and they didn't have one, they
* get this one)
*/
if (oldlenp) {
nerror = copyout(&oldlen, oldlenp, sizeof(oldlen));
if (error == 0)
error = nerror;
}
/*
* if the only problem is that we weren't given enough space,
* that's an ENOMEM error
*/
if (error == 0 && SCARG(uap, oldv) != NULL && savelen < oldlen)
error = ENOMEM;
return (error);
}
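/*
 * Usage sketch from the userland side (illustrative, not kernel code):
 * the oldlen in/out convention above supports the usual "probe the
 * size, then fetch" pattern, and a buffer that turns out too small
 * yields ENOMEM:
 */
#if 0
int mib[2] = { CTL_KERN, KERN_HOSTNAME };
size_t len = 0;
char *buf;

sysctl(mib, 2, NULL, &len, NULL, 0); /* len gets the required size */
buf = malloc(len);
if (sysctl(mib, 2, buf, &len, NULL, 0) == -1)
err(1, "sysctl"); /* ENOMEM if the value grew */
#endif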
/*
* ********************************************************************
* Section 1: How the tree is used
* ********************************************************************
* Implementations of sysctl for emulations typically need only
* these three functions in this order: lock the tree, dispatch
* request into it, unlock the tree.
* ********************************************************************
*/
void
sysctl_lock(bool write)
{
if (write) {
rw_enter(&sysctl_treelock, RW_WRITER);
curlwp->l_pflag |= LP_SYSCTLWRITE;
} else {
rw_enter(&sysctl_treelock, RW_READER);
curlwp->l_pflag &= ~LP_SYSCTLWRITE;
}
}
void
sysctl_relock(void)
{
if ((curlwp->l_pflag & LP_SYSCTLWRITE) != 0) {
rw_enter(&sysctl_treelock, RW_WRITER);
} else {
rw_enter(&sysctl_treelock, RW_READER);
}
}
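/*
 * Illustrative sketch of the pattern described above: an emulation's
 * sysctl entry point typically just brackets a dispatch with the tree
 * lock ("emul_root" is a hypothetical overlay tree root):
 */
#if 0
sysctl_lock(newp != NULL);
error = sysctl_dispatch(name, namelen, oldp, oldlenp,
newp, newlen, name, l, emul_root);
sysctl_unlock();
#endif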
/*
* ********************************************************************
* the main sysctl dispatch routine. scans the given tree and picks a
* function to call based on what it finds.
* ********************************************************************
*/
int
sysctl_dispatch(SYSCTLFN_ARGS)
{
int error;
sysctlfn fn;
int ni;
KASSERT(rw_lock_held(&sysctl_treelock));
if (rnode && SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_dispatch: rnode %p wrong version\n", rnode);
error = EINVAL;
goto out;
}
fn = NULL;
error = sysctl_locate(l, name, namelen, &rnode, &ni);
if (rnode->sysctl_func != NULL) {
/*
* the node we ended up at has a function, so call it. it can
* hand off to query or create if it wants to.
*/
fn = rnode->sysctl_func;
} else if (error == 0) {
/*
* we found the node they were looking for, so do a lookup.
*/
fn = (sysctlfn)sysctl_lookup; /* XXX may write to rnode */
} else if (error == ENOENT && (ni + 1) == namelen && name[ni] < 0) {
/*
* prospective parent node found, but the terminal node was
* not. generic operations associate with the parent.
*/
switch (name[ni]) {
case CTL_QUERY:
fn = sysctl_query;
break;
case CTL_CREATE:
#if NKSYMS > 0
case CTL_CREATESYM:
#endif /* NKSYMS > 0 */
if (newp == NULL) {
error = EINVAL;
break;
}
KASSERT(rw_write_held(&sysctl_treelock));
fn = (sysctlfn)sysctl_create; /* we own the rnode */
break;
case CTL_DESTROY:
if (newp == NULL) {
error = EINVAL;
break;
}
KASSERT(rw_write_held(&sysctl_treelock));
fn = (sysctlfn)sysctl_destroy; /* we own the rnode */
break;
case CTL_MMAP:
fn = (sysctlfn)sysctl_mmap; /* we own the rnode */
break;
case CTL_DESCRIBE:
fn = sysctl_describe;
break;
default:
error = EOPNOTSUPP;
break;
}
}
/*
* after all of that, maybe we found someone who knows how to
* get us what we want?
*/
if (fn != NULL)
error = (*fn)(name + ni, namelen - ni, oldp, oldlenp,
newp, newlen, name, l, rnode);
else if (error == 0)
error = EOPNOTSUPP;
out:
return (error);
}
/*
* ********************************************************************
* Releases the tree lock.
* ********************************************************************
*/
void
sysctl_unlock(void)
{
rw_exit(&sysctl_treelock);
}
/*
* ********************************************************************
* Section 2: The main tree interfaces
* ********************************************************************
* This is how sysctl_dispatch() does its work, and you can too, by
* calling these routines from helpers (though typically only
* sysctl_lookup() will be used). The tree MUST BE LOCKED when these
* are called.
* ********************************************************************
*/
/*
* sysctl_locate -- Finds the node matching the given mib under the
* given tree (via rnode). If no tree is given, we fall back to the
* native tree. The current process (via l) is used for access
* control on the tree (some nodes may be traversable only by root) and
* on return, nip will show how many numbers in the mib were consumed.
*/
int
sysctl_locate(struct lwp *l, const int *name, u_int namelen,
const struct sysctlnode **rnode, int *nip)
{
const struct sysctlnode *node, *pnode;
int tn, si, ni, error, alias;
KASSERT(rw_lock_held(&sysctl_treelock));
/*
* basic checks and setup
*/
if (*rnode == NULL)
*rnode = &sysctl_root;
if (nip)
*nip = 0;
if (namelen == 0)
return (0);
/*
* search starts from "root"
*/
pnode = *rnode;
if (SYSCTL_VERS(pnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_locate: pnode %p wrong version\n", pnode);
return (EINVAL);
}
node = pnode->sysctl_child;
error = 0;
/*
* scan for node to which new node should be attached
*/
for (ni = 0; ni < namelen; ni++) {
/*
* walked off bottom of tree
*/
if (node == NULL) {
if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE)
error = ENOENT;
else
error = ENOTDIR;
break;
}
/*
* can anyone traverse this node or only root?
*/
if (l != NULL && (pnode->sysctl_flags & CTLFLAG_PRIVATE) &&
(error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_PRVT,
NULL, NULL, NULL)) != 0)
return (error);
/*
* find a child node with the right number
*/
tn = name[ni];
alias = 0;
si = 0;
/*
* Note: ANYNUMBER only matches positive integers.
* Since ANYNUMBER is only permitted on single-node
* sub-trees (eg proc), check before the loop and skip
* it if we can.
*/
if ((node[si].sysctl_flags & CTLFLAG_ANYNUMBER) && (tn >= 0))
goto foundit;
for (; si < pnode->sysctl_clen; si++) {
if (node[si].sysctl_num == tn) {
if (node[si].sysctl_flags & CTLFLAG_ALIAS) {
if (alias++ == 4)
break;
else {
tn = node[si].sysctl_alias;
si = -1;
}
} else
goto foundit;
}
}
/*
* if we ran off the end, it obviously doesn't exist
*/
error = ENOENT;
break;
/*
* so far so good, move on down the line
*/
foundit:
pnode = &node[si];
if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE)
node = node[si].sysctl_child;
else
node = NULL;
}
*rnode = pnode;
if (nip)
*nip = ni;
return (error);
}
/*
* sysctl_query -- The auto-discovery engine. Copies out the structs
* describing nodes under the given node and handles overlay trees.
*/
int
sysctl_query(SYSCTLFN_ARGS)
{
int error, ni, elim, v;
size_t out, left, t;
const struct sysctlnode *enode, *onode;
struct sysctlnode qnode;
KASSERT(rw_lock_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_query: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (namelen != 1 || name[0] != CTL_QUERY)
return (EINVAL);
error = 0;
out = 0;
left = *oldlenp;
elim = 0;
enode = NULL;
/*
* translate the given request to a current node
*/
error = sysctl_cvt_in(l, &v, newp, newlen, &qnode);
if (error)
return (error);
/*
* if the request specifies a version, check it
*/
if (qnode.sysctl_ver != 0) {
enode = rnode;
if (qnode.sysctl_ver != enode->sysctl_ver &&
qnode.sysctl_ver != sysctl_rootof(enode)->sysctl_ver)
return (EINVAL);
}
/*
* process has overlay tree
*/
if (l && l->l_proc->p_emul->e_sysctlovly) {
enode = l->l_proc->p_emul->e_sysctlovly;
elim = (name - oname);
error = sysctl_locate(l, oname, elim, &enode, NULL);
if (error == 0) {
/* ah, found parent in overlay */
elim = enode->sysctl_clen;
enode = enode->sysctl_child;
} else {
error = 0;
elim = 0;
enode = NULL;
}
}
for (ni = 0; ni < rnode->sysctl_clen; ni++) {
onode = &rnode->sysctl_child[ni];
if (enode && enode->sysctl_num == onode->sysctl_num) {
if (SYSCTL_TYPE(enode->sysctl_flags) != CTLTYPE_NODE)
onode = enode;
if (--elim > 0)
enode++;
else
enode = NULL;
}
error = sysctl_cvt_out(l, v, onode, oldp, left, &t);
if (error)
return (error);
if (oldp != NULL)
oldp = (char*)oldp + t;
out += t;
left -= MIN(left, t);
}
/*
* overlay trees *MUST* be entirely consumed
*/
KASSERT(enode == NULL);
*oldlenp = out;
return (error);
}
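/*
 * Usage sketch (illustrative, userland side): a query appends CTL_QUERY
 * to the mib of the node to enumerate and reads back an array of
 * struct sysctlnode describing its children:
 */
#if 0
int mib[2] = { CTL_KERN, CTL_QUERY };
struct sysctlnode q, nodes[128];
size_t len = sizeof(nodes);

memset(&q, 0, sizeof(q));
q.sysctl_flags = SYSCTL_VERSION;
if (sysctl(mib, 2, nodes, &len, &q, sizeof(q)) == 0)
printf("%zu children\n", len / sizeof(nodes[0]));
#endif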
/*
* sysctl_create -- Adds a node (the description of which is taken
* from newp) to the tree, returning a copy of it in the space pointed
* to by oldp. In the event that the requested slot is already taken
* (either by name or by number), the offending node is returned
* instead. Yes, this is complex, but we want to make sure everything
* is proper.
*/
#ifdef SYSCTL_DEBUG_CREATE
int _sysctl_create(SYSCTLFN_ARGS);
int
_sysctl_create(SYSCTLFN_ARGS)
#else
int
sysctl_create(SYSCTLFN_ARGS)
#endif
{
struct sysctlnode nnode, *node, *pnode;
int error, ni, at, nm, type, nsz, sz, flags, anum, v;
void *own;
KASSERT(rw_write_held(&sysctl_treelock));
error = 0;
own = NULL;
anum = -1;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_create: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (namelen != 1 || (name[namelen - 1] != CTL_CREATE
#if NKSYMS > 0
&& name[namelen - 1] != CTL_CREATESYM
#endif /* NKSYMS > 0 */
))
return (EINVAL);
/*
* processes can only add nodes at securelevel 0, must be
* root, and can't add nodes to a parent that's not writeable
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_ADD, NULL, NULL, NULL);
if (error)
return (error);
if (!(rnode->sysctl_flags & CTLFLAG_READWRITE))
#endif /* SYSCTL_DISALLOW_CREATE */
return (EPERM);
}
/*
* nothing can add a node if:
* we've finished initial set up of this tree and
* (the tree itself is not writeable or
* the entire sysctl system is not writeable)
*/
if ((sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_PERMANENT) &&
(!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE)))
return (EPERM);
/*
* it must be a "node", not a "int" or something
*/
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (rnode->sysctl_flags & CTLFLAG_ALIAS) {
printf("sysctl_create: attempt to add node to aliased "
"node %p\n", rnode);
return (EINVAL);
}
pnode = __UNCONST(rnode); /* we are adding children to this node */
if (newp == NULL)
return (EINVAL);
error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return (error);
/*
* nodes passed in don't *have* parents
*/
if (nnode.sysctl_parent != NULL)
return (EINVAL);
/*
* if we are indeed adding it, it should be a "good" name and
* number
*/
nm = nnode.sysctl_num;
#if NKSYMS > 0
if (nm == CTL_CREATESYM)
nm = CTL_CREATE;
#endif /* NKSYMS > 0 */
if (nm < 0 && nm != CTL_CREATE)
return (EINVAL);
/*
* the name can't start with a digit
*/
if (nnode.sysctl_name[0] >= '0' &&
nnode.sysctl_name[0] <= '9')
return (EINVAL);
/*
* the name must be only alphanumerics or - or _, longer than
* 0 bytes and less than SYSCTL_NAMELEN
*/
nsz = 0;
while (nsz < SYSCTL_NAMELEN && nnode.sysctl_name[nsz] != '\0') {
if ((nnode.sysctl_name[nsz] >= '0' &&
nnode.sysctl_name[nsz] <= '9') ||
(nnode.sysctl_name[nsz] >= 'A' &&
nnode.sysctl_name[nsz] <= 'Z') ||
(nnode.sysctl_name[nsz] >= 'a' &&
nnode.sysctl_name[nsz] <= 'z') ||
nnode.sysctl_name[nsz] == '-' ||
nnode.sysctl_name[nsz] == '_')
nsz++;
else
return (EINVAL);
}
if (nsz == 0 || nsz == SYSCTL_NAMELEN)
return (EINVAL);
/*
* various checks revolve around size vs type, etc
*/
type = SYSCTL_TYPE(nnode.sysctl_flags);
flags = SYSCTL_FLAGS(nnode.sysctl_flags);
sz = nnode.sysctl_size;
/*
* find out if there's a collision, and if so, let the caller
* know what they collided with
*/
node = pnode->sysctl_child;
at = 0;
if (node) {
if ((flags | node->sysctl_flags) & CTLFLAG_ANYNUMBER)
/* No siblings for a CTLFLAG_ANYNUMBER node */
return EINVAL;
for (ni = 0; ni < pnode->sysctl_clen; ni++) {
if (nm == node[ni].sysctl_num ||
strcmp(nnode.sysctl_name, node[ni].sysctl_name) == 0) {
/*
* ignore error here, since we
* are already fixed on EEXIST
*/
(void)sysctl_cvt_out(l, v, &node[ni], oldp,
*oldlenp, oldlenp);
return (EEXIST);
}
if (nm > node[ni].sysctl_num)
at++;
}
}
/*
* use sysctl_ver to add to the tree iff it hasn't changed
*/
if (nnode.sysctl_ver != 0) {
/*
* a specified value must match either the parent
* node's version or the root node's version
*/
if (nnode.sysctl_ver != sysctl_rootof(rnode)->sysctl_ver &&
nnode.sysctl_ver != rnode->sysctl_ver) {
return (EINVAL);
}
}
/*
* only the kernel can assign functions to entries
*/
if (l != NULL && nnode.sysctl_func != NULL)
return (EPERM);
/*
* only the kernel can create permanent entries, and only then
* before the kernel is finished setting itself up
*/
if (l != NULL && (flags & ~SYSCTL_USERFLAGS))
return (EPERM);
if ((flags & CTLFLAG_PERMANENT) &
(sysctl_root.sysctl_flags & CTLFLAG_PERMANENT))
return (EPERM);
if ((flags & (CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE)) ==
(CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE))
return (EINVAL);
if ((flags & CTLFLAG_IMMEDIATE) &&
type != CTLTYPE_INT && type != CTLTYPE_QUAD && type != CTLTYPE_BOOL)
return (EINVAL);
/*
* check size, or set it if unset and we can figure it out.
* kernel created nodes are allowed to have a function instead
* of a size (or a data pointer).
*/
switch (type) {
case CTLTYPE_NODE:
/*
* only *i* can assert the size of a node
*/
if (flags & CTLFLAG_ALIAS) {
anum = nnode.sysctl_alias;
if (anum < 0)
return (EINVAL);
nnode.sysctl_alias = 0;
}
if (sz != 0 || nnode.sysctl_data != NULL)
return (EINVAL);
if (nnode.sysctl_csize != 0 ||
nnode.sysctl_clen != 0 ||
nnode.sysctl_child != 0)
return (EINVAL);
if (flags & CTLFLAG_OWNDATA)
return (EINVAL);
sz = sizeof(struct sysctlnode);
break;
case CTLTYPE_INT:
/*
* since an int is an int, if the size is not given or
* is wrong, we can "int-uit" it.
*/
if (sz != 0 && sz != sizeof(int))
return (EINVAL);
sz = sizeof(int);
break;
case CTLTYPE_STRING:
/*
* strings are a little more tricky
*/
if (sz == 0) {
if (l == NULL) {
if (nnode.sysctl_func == NULL) {
if (nnode.sysctl_data == NULL)
return (EINVAL);
else
sz = strlen(nnode.sysctl_data) +
1;
}
} else if (nnode.sysctl_data == NULL &&
flags & CTLFLAG_OWNDATA) {
return (EINVAL);
} else {
char *vp, *e;
size_t s;
/*
* we want a rough idea of what the
* size is now
*/
vp = malloc(PAGE_SIZE, M_SYSCTLDATA, M_WAITOK);
if (vp == NULL)
return (ENOMEM);
e = nnode.sysctl_data;
do {
error = copyinstr(e, vp, PAGE_SIZE, &s);
if (error) {
if (error != ENAMETOOLONG) {
free(vp, M_SYSCTLDATA);
return (error);
}
e += PAGE_SIZE;
if ((e - 32 * PAGE_SIZE) >
(char*)nnode.sysctl_data) {
free(vp, M_SYSCTLDATA);
return (ERANGE);
}
}
} while (error != 0);
sz = s + (e - (char*)nnode.sysctl_data);
free(vp, M_SYSCTLDATA);
}
}
break;
case CTLTYPE_QUAD:
if (sz != 0 && sz != sizeof(u_quad_t))
return (EINVAL);
sz = sizeof(u_quad_t);
break;
case CTLTYPE_BOOL:
/*
* since a bool is a bool, if the size is not given or
* is wrong, we can "intuit" it.
*/
if (sz != 0 && sz != sizeof(bool))
return (EINVAL);
sz = sizeof(bool);
break;
case CTLTYPE_STRUCT:
if (sz == 0) {
if (l != NULL || nnode.sysctl_func == NULL)
return (EINVAL);
if (flags & CTLFLAG_OWNDATA)
return (EINVAL);
}
break;
default:
return (EINVAL);
}
/*
* at this point, if sz is zero, we *must* have a
* function to go with it and we can't own it.
*/
/*
* l ptr own
* 0 0 0 -> EINVAL (if no func)
* 0 0 1 -> own
* 0 1 0 -> kptr
* 0 1 1 -> kptr
* 1 0 0 -> EINVAL
* 1 0 1 -> own
* 1 1 0 -> kptr, no own (fault on lookup)
* 1 1 1 -> uptr, own
*/
if (type != CTLTYPE_NODE) {
if (sz != 0) {
if (flags & CTLFLAG_OWNDATA) {
own = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (own == NULL)
return ENOMEM;
if (nnode.sysctl_data == NULL)
memset(own, 0, sz);
else {
error = sysctl_copyin(l,
nnode.sysctl_data, own, sz);
if (error != 0) {
free(own, M_SYSCTLDATA);
return (error);
}
}
} else if ((nnode.sysctl_data != NULL) &&
!(flags & CTLFLAG_IMMEDIATE)) {
#if NKSYMS > 0
if (name[namelen - 1] == CTL_CREATESYM) {
char symname[128]; /* XXX enough? */
u_long symaddr;
size_t symlen;
error = sysctl_copyinstr(l,
nnode.sysctl_data, symname,
sizeof(symname), &symlen);
if (error)
return (error);
error = ksyms_getval(NULL, symname,
&symaddr, KSYMS_EXTERN);
if (error)
return (error); /* EINVAL? */
nnode.sysctl_data = (void*)symaddr;
}
#endif /* NKSYMS > 0 */
/*
* Ideally, we'd like to verify here
* that this address is acceptable,
* but...
*
* - it might be valid now, only to
* become invalid later
*
* - it might be invalid only for the
* moment and valid later
*
* - or something else.
*
* Since we can't get a good answer,
* we'll just accept the address as
* given, and fault on individual
* lookups.
*/
}
} else if (nnode.sysctl_func == NULL)
return (EINVAL);
}
/*
* a process can't assign a function to a node, and the kernel
* can't create a node that has no function or data.
* (XXX somewhat redundant check)
*/
if (l != NULL || nnode.sysctl_func == NULL) {
if (type != CTLTYPE_NODE &&
!(flags & CTLFLAG_IMMEDIATE) &&
nnode.sysctl_data == NULL &&
own == NULL)
return (EINVAL);
}
#ifdef SYSCTL_DISALLOW_KWRITE
/*
* a process can't create a writable node unless it refers to
* new data.
*/
if (l != NULL && own == NULL && type != CTLTYPE_NODE &&
(flags & CTLFLAG_READWRITE) != CTLFLAG_READONLY &&
!(flags & CTLFLAG_IMMEDIATE))
return (EPERM);
#endif /* SYSCTL_DISALLOW_KWRITE */
/*
* make sure there's somewhere to put the new stuff.
*/
if (pnode->sysctl_child == NULL) {
if (flags & CTLFLAG_ANYNUMBER)
error = sysctl_alloc(pnode, 1);
else
error = sysctl_alloc(pnode, 0);
if (error) {
if (own != NULL)
free(own, M_SYSCTLDATA);
return (error);
}
}
node = pnode->sysctl_child;
/*
* no collisions, so pick a good dynamic number if we need to.
*/
if (nm == CTL_CREATE) {
nm = ++sysctl_root.sysctl_num;
for (ni = 0; ni < pnode->sysctl_clen; ni++) {
if (nm == node[ni].sysctl_num) {
nm++;
ni = -1;
} else if (nm > node[ni].sysctl_num)
at = ni + 1;
}
}
/*
* oops...ran out of space
*/
if (pnode->sysctl_clen == pnode->sysctl_csize) {
error = sysctl_realloc(pnode);
if (error) {
if (own != NULL)
free(own, M_SYSCTLDATA);
return (error);
}
node = pnode->sysctl_child;
}
/*
* insert new node data
*/
if (at < pnode->sysctl_clen) {
int t;
/*
* move the nodes that should come after the new one
*/
memmove(&node[at + 1], &node[at],
(pnode->sysctl_clen - at) * sizeof(struct sysctlnode));
memset(&node[at], 0, sizeof(struct sysctlnode));
node[at].sysctl_parent = pnode;
/*
* and...reparent any children of any moved nodes
*/
for (ni = at; ni <= pnode->sysctl_clen; ni++)
if (node[ni].sysctl_child != NULL)
for (t = 0; t < node[ni].sysctl_csize; t++)
node[ni].sysctl_child[t].sysctl_parent =
&node[ni];
}
node = &node[at];
pnode->sysctl_clen++;
strlcpy(node->sysctl_name, nnode.sysctl_name,
sizeof(node->sysctl_name));
node->sysctl_num = nm;
node->sysctl_size = sz;
node->sysctl_flags = SYSCTL_VERSION|type|flags; /* XXX other trees */
node->sysctl_csize = 0;
node->sysctl_clen = 0;
if (own) {
node->sysctl_data = own;
node->sysctl_flags |= CTLFLAG_OWNDATA;
} else if (flags & CTLFLAG_ALIAS) {
node->sysctl_alias = anum;
} else if (flags & CTLFLAG_IMMEDIATE) {
switch (type) {
case CTLTYPE_BOOL:
node->sysctl_bdata = nnode.sysctl_bdata;
break;
case CTLTYPE_INT:
node->sysctl_idata = nnode.sysctl_idata;
break;
case CTLTYPE_QUAD:
node->sysctl_qdata = nnode.sysctl_qdata;
break;
}
} else {
node->sysctl_data = nnode.sysctl_data;
node->sysctl_flags &= ~CTLFLAG_OWNDATA;
}
node->sysctl_func = nnode.sysctl_func;
node->sysctl_child = NULL;
/* node->sysctl_parent should already be done */
/*
* update "version" on path to "root"
*/
for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent)
;
pnode = node;
for (nm = rnode->sysctl_ver + 1; pnode != NULL;
pnode = pnode->sysctl_parent)
pnode->sysctl_ver = nm;
/* If this fails, the node is already added - the user won't know! */
error = sysctl_cvt_out(l, v, node, oldp, *oldlenp, oldlenp);
return (error);
}
/*
* ********************************************************************
* A wrapper around sysctl_create() that prints the thing we're trying
* to add.
* ********************************************************************
*/
#ifdef SYSCTL_DEBUG_CREATE
int
sysctl_create(SYSCTLFN_ARGS)
{
const struct sysctlnode *node;
int k, v, rc, ni, nl = namelen + (name - oname);
struct sysctlnode nnode;
if (newp == NULL)
return EINVAL;
int error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return error;
node = &nnode;
printf("namelen %d (", nl);
for (ni = 0; ni < nl - 1; ni++)
printf(" %d", oname[ni]);
printf(" %d )\t[%s]\tflags %08x (%08x %d %zu)\n",
k = node->sysctl_num,
node->sysctl_name,
node->sysctl_flags,
SYSCTL_FLAGS(node->sysctl_flags),
SYSCTL_TYPE(node->sysctl_flags),
node->sysctl_size);
node = rnode;
rc = _sysctl_create(SYSCTLFN_CALL(rnode));
printf("sysctl_create(");
for (ni = 0; ni < nl - 1; ni++)
printf(" %d", oname[ni]);
printf(" %d ) returned %d\n", k, rc);
return (rc);
}
#endif /* SYSCTL_DEBUG_CREATE */
/*
* sysctl_destroy -- Removes a node (as described by newp) from the
* given tree, returning (if successful) a copy of the dead node in
* oldp. Since we're removing stuff, there's not much to check.
*/
int
sysctl_destroy(SYSCTLFN_ARGS)
{
struct sysctlnode *node, *pnode, onode, nnode;
int ni, error, v;
KASSERT(rw_write_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_destroy: rnode %p wrong version\n", rnode);
return (EINVAL);
}
error = 0;
if (namelen != 1 || name[namelen - 1] != CTL_DESTROY)
return (EINVAL);
/*
* processes can only destroy nodes at securelevel 0, must be
* root, and can't remove nodes from a parent that's not
* writeable
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_DELETE, NULL, NULL, NULL);
if (error)
return (error);
if (!(rnode->sysctl_flags & CTLFLAG_READWRITE))
#endif /* SYSCTL_DISALLOW_CREATE */
return (EPERM);
}
/*
* nothing can remove a node if:
* the node is permanent (checked later) or
* the tree itself is not writeable or
* the entire sysctl system is not writeable
*
* note that we ignore whether setup is complete or not,
* because these rules always apply.
*/
if (!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))
return (EPERM);
if (newp == NULL)
return (EINVAL);
error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return (error);
memset(&onode, 0, sizeof(struct sysctlnode));
node = rnode->sysctl_child;
for (ni = 0; ni < rnode->sysctl_clen; ni++) {
if (nnode.sysctl_num == node[ni].sysctl_num) {
/*
* if name specified, must match
*/
if (nnode.sysctl_name[0] != '\0' &&
strcmp(nnode.sysctl_name, node[ni].sysctl_name))
continue;
/*
* if version specified, must match
*/
if (nnode.sysctl_ver != 0 &&
nnode.sysctl_ver != node[ni].sysctl_ver)
continue;
/*
* this must be the one
*/
break;
}
}
if (ni == rnode->sysctl_clen)
return (ENOENT);
node = &node[ni];
pnode = node->sysctl_parent;
/*
* if the kernel says permanent, it is, so there. nyah.
*/
if (SYSCTL_FLAGS(node->sysctl_flags) & CTLFLAG_PERMANENT)
return (EPERM);
/*
* can't delete non-empty nodes
*/
if (SYSCTL_TYPE(node->sysctl_flags) == CTLTYPE_NODE &&
node->sysctl_clen != 0)
return (ENOTEMPTY);
/*
* if the node "owns" data, release it now
*/
if (node->sysctl_flags & CTLFLAG_OWNDATA) {
if (node->sysctl_data != NULL)
free(node->sysctl_data, M_SYSCTLDATA);
node->sysctl_data = NULL;
}
if (node->sysctl_flags & CTLFLAG_OWNDESC) {
if (node->sysctl_desc != NULL)
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA);
node->sysctl_desc = NULL;
}
/*
* if the node to be removed is not the last one on the list,
* move the remaining nodes up, and reparent any grandchildren
*/
onode = *node;
if (ni < pnode->sysctl_clen - 1) {
int t;
memmove(&pnode->sysctl_child[ni], &pnode->sysctl_child[ni + 1],
(pnode->sysctl_clen - ni - 1) *
sizeof(struct sysctlnode));
for (; ni < pnode->sysctl_clen - 1; ni++)
if (SYSCTL_TYPE(pnode->sysctl_child[ni].sysctl_flags) ==
CTLTYPE_NODE)
for (t = 0;
t < pnode->sysctl_child[ni].sysctl_clen;
t++)
pnode->sysctl_child[ni].sysctl_child[t].
sysctl_parent =
&pnode->sysctl_child[ni];
ni = pnode->sysctl_clen - 1;
node = &pnode->sysctl_child[ni];
}
/*
* reset the space we just vacated
*/
memset(node, 0, sizeof(struct sysctlnode));
node->sysctl_parent = pnode;
pnode->sysctl_clen--;
/*
* if this parent just lost its last child, nuke the creche
*/
if (pnode->sysctl_clen == 0) {
free(pnode->sysctl_child, M_SYSCTLNODE);
pnode->sysctl_csize = 0;
pnode->sysctl_child = NULL;
}
/*
* update "version" on path to "root"
*/
for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent)
;
for (ni = rnode->sysctl_ver + 1; pnode != NULL;
pnode = pnode->sysctl_parent)
pnode->sysctl_ver = ni;
error = sysctl_cvt_out(l, v, &onode, oldp, *oldlenp, oldlenp);
return (error);
}
/*
* sysctl_lookup -- Handles copyin/copyout of new and old values.
* Partial reads are globally allowed. Only root can write to things
* unless the node says otherwise.
*/
int
sysctl_lookup(SYSCTLFN_ARGS)
{
int error, rw;
size_t sz, len;
void *d;
KASSERT(rw_lock_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("%s: rnode %p wrong version\n", __func__, rnode);
return EINVAL;
}
if (newlen == 0)
newp = NULL;
error = 0;
/*
* you can't "look up" a node. you can "query" it, but you
* can't "look it up".
*/
if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_NODE || namelen != 0) {
DPRINTF(("%s: can't lookup a node\n", __func__));
return EINVAL;
}
/*
* some nodes are private, so only root can look into them.
*/
if (l != NULL && (rnode->sysctl_flags & CTLFLAG_PRIVATE) &&
(error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)) != 0) {
DPRINTF(("%s: private node\n", __func__));
return error;
}
/*
* if a node wants to be writable according to different rules
* other than "only root can write to stuff unless a flag is
* set", then it needs its own function which should have been
* called and not us.
*/
if (l != NULL && newp != NULL && !(rnode->sysctl_flags & CTLFLAG_ANYWRITE) &&
(error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_MODIFY, NULL, NULL,
NULL)) != 0) {
DPRINTF(("%s: can't modify\n", __func__));
return error;
}
/*
* is this node supposedly writable?
*/
rw = (rnode->sysctl_flags & CTLFLAG_READWRITE) ? 1 : 0;
/*
* it appears not to be writable at this time, so if someone
* tried to write to it, we must tell them to go away
*/
if (!rw && newp != NULL) {
DPRINTF(("%s: not writable\n", __func__));
return EPERM;
}
/*
* step one, copy out the stuff we have presently
*/
if (rnode->sysctl_flags & CTLFLAG_IMMEDIATE) {
/*
* note that we discard const here because we are
* modifying the contents of the node (which is okay
* because it's ours)
*
* It also doesn't matter which field of the union we pick.
*/
d = __UNCONST(&rnode->sysctl_qdata);
} else
d = rnode->sysctl_data;
if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_STRING)
sz = strlen(d) + 1; /* XXX@@@ possible fault here */
else
sz = rnode->sysctl_size;
if (oldp != NULL) {
error = sysctl_copyout(l, d, oldp, MIN(sz, *oldlenp));
if (error) {
DPRINTF(("%s: bad copyout %d\n", __func__, error));
return error;
}
}
*oldlenp = sz;
/*
* are we done?
*/
if (newp == NULL)
return 0;
/*
* hmm...not done. must now "copy in" new value. re-adjust
* sz to maximum value (strings are "weird").
*/
sz = rnode->sysctl_size;
switch (SYSCTL_TYPE(rnode->sysctl_flags)) {
case CTLTYPE_BOOL: {
bool tmp;
/*
* these data must be *exactly* the same size coming
* in. bool may only be true or false.
*/
if (newlen != sz) {
DPRINTF(("%s: bad size %zu != %zu\n", __func__, newlen,
sz));
return EINVAL;
}
error = sysctl_copyin(l, newp, &tmp, sz);
if (error)
break;
if (tmp != true && tmp != false) {
DPRINTF(("%s: tmp %d\n", __func__, tmp));
return EINVAL;
}
*(bool *)d = tmp;
break;
}
case CTLTYPE_INT:
case CTLTYPE_QUAD:
case CTLTYPE_STRUCT:
/*
* these data must be *exactly* the same size coming
* in.
*/
if (newlen != sz)
goto bad_size;
error = sysctl_copyin(l, newp, d, sz);
rnd_add_data(NULL, d, sz, 0);
break;
case CTLTYPE_STRING: {
/*
* strings, on the other hand, can be shorter, and we
* let userland be sloppy about the trailing nul.
*/
char *newbuf;
/*
* too much new string?
*/
if (newlen > sz)
goto bad_size;
/*
* temporary copy of new inbound string
*/
len = MIN(sz, newlen);
newbuf = malloc(len, M_SYSCTLDATA, M_WAITOK);
if (newbuf == NULL) {
DPRINTF(("%s: oomem %zu\n", __func__, len));
return ENOMEM;
}
error = sysctl_copyin(l, newp, newbuf, len);
if (error) {
free(newbuf, M_SYSCTLDATA);
DPRINTF(("%s: copyin %d\n", __func__, error));
return error;
}
/*
* did they NUL terminate it, or do we have space
* left to do it ourselves?
*/
if (newbuf[len - 1] != '\0' && len == sz) {
free(newbuf, M_SYSCTLDATA);
DPRINTF(("%s: string too long\n", __func__));
return EINVAL;
}
/*
* looks good, so pop it into place and zero the rest.
*/
if (len > 0) {
memcpy(d, newbuf, len);
rnd_add_data(NULL, d, len, 0);
}
if (sz != len)
memset((char *)d + len, 0, sz - len);
free(newbuf, M_SYSCTLDATA);
break;
}
default:
DPRINTF(("%s: bad type\n", __func__));
return EINVAL;
}
if (error) {
DPRINTF(("%s: copyin %d\n", __func__, error));
}
return error;
bad_size:
DPRINTF(("%s: bad size %zu > %zu\n", __func__, newlen, sz));
return EINVAL;
}
/*
* sysctl_mmap -- Dispatches sysctl mmap requests to those nodes that
* purport to handle it. This interface isn't fully fleshed out yet,
* unfortunately.
*/
static int
sysctl_mmap(SYSCTLFN_ARGS)
{
const struct sysctlnode *node;
struct sysctlnode nnode;
int error;
int sysctl_num;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_mmap: rnode %p wrong version\n", rnode);
return (EINVAL);
}
/*
* let's just pretend that didn't happen, m'kay?
*/
if (l == NULL)
return (EPERM);
/*
* is this a sysctlnode description of an mmap request?
*/
if (newp == NULL || newlen != sizeof(struct sysctlnode))
return (EINVAL);
error = sysctl_copyin(l, newp, &nnode, sizeof(nnode));
if (error)
return (error);
/*
* does the node they asked for exist?
*/
if (namelen != 1)
return (EOPNOTSUPP);
node = rnode;
sysctl_num = nnode.sysctl_num;
error = sysctl_locate(l, &sysctl_num, 1, &node, NULL);
if (error)
return (error);
/*
* does this node that we have found purport to handle mmap?
*/
if (node->sysctl_func == NULL ||
!(node->sysctl_flags & CTLFLAG_MMAP))
return (EOPNOTSUPP);
/*
* well...okay, they asked for it.
*/
return ((*node->sysctl_func)(SYSCTLFN_CALL(node)));
}
int
sysctl_describe(SYSCTLFN_ARGS)
{
struct sysctldesc *d;
void *bf;
size_t sz, left, tot;
int i, error, v = -1;
struct sysctlnode *node;
struct sysctlnode dnode;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_query: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (namelen != 1 || name[0] != CTL_DESCRIBE)
return (EINVAL);
/*
* get ready...
*/
error = 0;
d = bf = malloc(MAXDESCLEN, M_TEMP, M_WAITOK);
if (bf == NULL)
return ENOMEM;
tot = 0;
node = rnode->sysctl_child;
left = *oldlenp;
/*
* no request -> all descriptions at this level
* request with desc unset -> just this node
* request with desc set -> set descr for this node
*/
if (newp != NULL) {
error = sysctl_cvt_in(l, &v, newp, newlen, &dnode);
if (error)
goto out;
if (dnode.sysctl_desc != NULL) {
/*
* processes cannot set descriptions above
* securelevel 0. and must be root. blah
* blah blah. a couple more checks are made
* once we find the node we want.
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_DESC, NULL,
NULL, NULL);
if (error)
goto out;
#else /* SYSCTL_DISALLOW_CREATE */
error = EPERM;
goto out;
#endif /* SYSCTL_DISALLOW_CREATE */
}
/*
* find node and try to set the description on it
*/
for (i = 0; i < rnode->sysctl_clen; i++)
if (node[i].sysctl_num == dnode.sysctl_num)
break;
if (i == rnode->sysctl_clen) {
error = ENOENT;
goto out;
}
node = &node[i];
/*
* did the caller specify a node version?
*/
if (dnode.sysctl_ver != 0 &&
dnode.sysctl_ver != node->sysctl_ver) {
error = EINVAL;
goto out;
}
/*
* okay...some rules:
* (1) if setup is done and either the tree or
* the whole system is read-only, no one can
* set a description
* (2) no one can set a description on a
* permanent node (it must be set when
* using createv)
* (3) processes cannot *change* a description
* (4) processes *can*, however, set a
* description on a read-only node so that
* one can be created and then described
* in two steps
* anything else come to mind?
*/
if ((sysctl_root.sysctl_flags & CTLFLAG_PERMANENT) &&
(!(sysctl_rootof(node)->sysctl_flags &
CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))) {
error = EPERM;
goto out;
}
if (node->sysctl_flags & CTLFLAG_PERMANENT) {
error = EPERM;
goto out;
}
if (l != NULL && node->sysctl_desc != NULL) {
error = EPERM;
goto out;
}
/*
* right, let's go ahead. the first step is
* making the description into something the
* node can "own", if need be.
*/
if (l != NULL ||
dnode.sysctl_flags & CTLFLAG_OWNDESC) {
char *nd, *k;
k = malloc(MAXDESCLEN, M_TEMP, M_WAITOK);
if (k == NULL) {
error = ENOMEM;
goto out;
}
error = sysctl_copyinstr(l, dnode.sysctl_desc,
k, MAXDESCLEN, &sz);
if (error) {
free(k, M_TEMP);
goto out;
}
nd = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (nd == NULL) {
free(k, M_TEMP);
error = ENOMEM;
goto out;
}
memcpy(nd, k, sz);
dnode.sysctl_flags |= CTLFLAG_OWNDESC;
dnode.sysctl_desc = nd;
free(k, M_TEMP);
}
/*
* now "release" the old description and
* attach the new one. ta-da.
*/
if ((node->sysctl_flags & CTLFLAG_OWNDESC) &&
node->sysctl_desc != NULL)
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA);
node->sysctl_desc = dnode.sysctl_desc;
node->sysctl_flags |=
(dnode.sysctl_flags & CTLFLAG_OWNDESC);
/*
* now we "fall out" and into the loop which
* will copy the new description back out for
* those interested parties
*/
}
}
/*
* scan for one description or just retrieve all descriptions
*/
for (i = 0; i < rnode->sysctl_clen; i++) {
/*
* did they ask for the description of only one node?
*/
if (v != -1 && node[i].sysctl_num != dnode.sysctl_num)
continue;
/*
* don't describe "private" nodes to non-suser users
*/
if ((node[i].sysctl_flags & CTLFLAG_PRIVATE) && (l != NULL) &&
!(kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)))
continue;
/*
* is this description "valid"?
*/
memset(bf, 0, MAXDESCLEN);
if (node[i].sysctl_desc == NULL)
sz = 1;
else if (copystr(node[i].sysctl_desc, &d->descr_str[0],
MAXDESCLEN - sizeof(*d), &sz) != 0) {
/*
* erase possible partial description
*/
memset(bf, 0, MAXDESCLEN);
sz = 1;
}
/*
* we've got it, stuff it into the caller's buffer
*/
d->descr_num = node[i].sysctl_num;
d->descr_ver = node[i].sysctl_ver;
d->descr_len = sz; /* includes trailing nul */
sz = (char *)NEXT_DESCR(d) - (char *)d;
if (oldp != NULL && left >= sz) {
error = sysctl_copyout(l, d, oldp, sz);
if (error)
goto out;
left -= sz;
oldp = (void *)__sysc_desc_adv(oldp, d->descr_len);
}
tot += sz;
/*
* if we get this far with v not "unset", they asked
* for a specific node and we found it
*/
if (v != -1)
break;
}
/*
* did we find it after all?
*/
if (v != -1 && tot == 0)
error = ENOENT;
else
*oldlenp = tot;
out:
free(bf, M_TEMP);
return (error);
}
/*
* ********************************************************************
* Section 3: Create and destroy from inside the kernel
* ********************************************************************
* sysctl_createv() and sysctl_destroyv() are simpler-to-use
* interfaces for the kernel to fling new entries into the mib and rip
* them out later. In the case of sysctl_createv(), the returned copy
* of the node (see sysctl_create()) will be translated back into a
* pointer to the actual node.
*
* Note that sysctl_createv() will return 0 if the create request
* matches an existing node (ala mkdir -p), and that sysctl_destroyv()
* will return 0 if the node to be destroyed already does not exist
* (aka rm -f) or if it is a parent of other nodes.
*
* This allows two (or more) different subsystems to assert sub-tree
* existence before populating their own nodes, and to remove their
* own nodes without orphaning the others when they are done.
* ********************************************************************
*/
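/*
 * An illustrative sketch (kept in a comment, not compiled): a typical
 * in-kernel caller first asserts the parent node and then hangs a leaf
 * off it, letting CTL_CREATE pick the child's number.  The "example"
 * subtree name, example_value, and example_log below are hypothetical
 * placeholders, not nodes defined elsewhere in the tree.
 *
 *	static int example_value;
 *	static struct sysctllog *example_log;
 *	const struct sysctlnode *rnode;
 *
 *	sysctl_createv(&example_log, 0, NULL, &rnode,
 *	    CTLFLAG_PERMANENT,
 *	    CTLTYPE_NODE, "example", SYSCTL_DESCR("example subtree"),
 *	    NULL, 0, NULL, 0,
 *	    CTL_KERN, CTL_CREATE, CTL_EOL);
 *	sysctl_createv(&example_log, 0, &rnode, NULL,
 *	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 *	    CTLTYPE_INT, "value", SYSCTL_DESCR("example leaf"),
 *	    NULL, 0, &example_value, 0,
 *	    CTL_CREATE, CTL_EOL);
 *
 * When the caller is done, sysctl_teardown(&example_log) removes the
 * logged nodes again.
 */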
#undef sysctl_createv
int
sysctl_createv(struct sysctllog **log, int cflags,
const struct sysctlnode **rnode, const struct sysctlnode **cnode,
int flags, int type, const char *namep, const char *descr,
sysctlfn func, u_quad_t qv, void *newp, size_t newlen,
...)
{
va_list ap;
int error, ni, namelen, name[CTL_MAXNAME];
const struct sysctlnode *root, *pnode;
struct sysctlnode nnode, onode, *dnode;
size_t sz;
const struct sysctlnode *snode __diagused;
/*
* where are we putting this?
*/
if (rnode != NULL && *rnode == NULL) {
printf("sysctl_createv: rnode NULL\n");
return (EINVAL);
}
root = rnode ? *rnode : NULL;
if (cnode != NULL)
*cnode = NULL;
if (cflags != 0)
return (EINVAL);
/*
* what is it?
*/
flags = SYSCTL_VERSION|SYSCTL_TYPE(type)|SYSCTL_FLAGS(flags);
if (log != NULL)
flags &= ~CTLFLAG_PERMANENT;
/*
* where do we put it?
*/
va_start(ap, newlen);
namelen = 0;
error = 0;
ni = -1;
do {
if (++ni == CTL_MAXNAME) {
error = ENAMETOOLONG;
break;
}
name[ni] = va_arg(ap, int);
/*
* sorry, this is not supported from here
*/
if (name[ni] == CTL_CREATESYM) {
error = EINVAL;
break;
}
} while (name[ni] != CTL_EOL && name[ni] != CTL_CREATE);
va_end(ap);
if (error)
return error;
namelen = ni + (name[ni] == CTL_CREATE ? 1 : 0);
/*
* what's it called
*/
if (strlcpy(nnode.sysctl_name, namep, sizeof(nnode.sysctl_name)) >=
sizeof(nnode.sysctl_name))
return (ENAMETOOLONG);
/*
* cons up the description of the new node
*/
nnode.sysctl_num = name[namelen - 1];
name[namelen - 1] = CTL_CREATE;
nnode.sysctl_size = newlen;
nnode.sysctl_flags = flags;
if (type == CTLTYPE_NODE) {
nnode.sysctl_csize = 0;
nnode.sysctl_clen = 0;
nnode.sysctl_child = NULL;
if (flags & CTLFLAG_ALIAS)
nnode.sysctl_alias = qv;
} else if (flags & CTLFLAG_IMMEDIATE) {
switch (type) {
case CTLTYPE_BOOL:
nnode.sysctl_bdata = qv;
break;
case CTLTYPE_INT:
nnode.sysctl_idata = qv;
break;
case CTLTYPE_QUAD:
nnode.sysctl_qdata = qv;
break;
default:
return (EINVAL);
}
} else {
nnode.sysctl_data = newp;
}
nnode.sysctl_func = func;
nnode.sysctl_parent = NULL;
nnode.sysctl_ver = 0;
/*
* initialize lock state -- we need locks if the main tree has
* been marked as complete, but since we could be called from
* either there, or from a device driver (say, at device
* insertion), or from a module (at module load time, say), we
* don't really want to "wait"...
*/
sysctl_lock(true);
/*
* locate the prospective parent of the new node, and if we
* find it, add the new node.
*/
sz = sizeof(onode);
pnode = root;
error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni);
if (error) {
/*
* XXX: If you are seeing this printf in early bringup
* stages, perhaps your setfault is not functioning and
* thus kcopy() is mis-behaving.
*/
printf("sysctl_createv: sysctl_locate(%s) returned %d\n",
nnode.sysctl_name, error);
sysctl_unlock();
return (error);
}
error = sysctl_create(&name[ni], namelen - ni, &onode, &sz,
&nnode, sizeof(nnode), &name[0], NULL,
pnode);
/*
* unfortunately the node we wanted to create is already
* there. if the node that's already there is a reasonable
* facsimile of the node we wanted to create, just pretend
* (for the caller's benefit) that we managed to create the
* node they wanted.
*/
if (error == EEXIST) {
/* name is the same as requested... */
if (strcmp(nnode.sysctl_name, onode.sysctl_name) == 0 &&
/* they want the same function... */
nnode.sysctl_func == onode.sysctl_func &&
/* number is the same as requested, or... */
(nnode.sysctl_num == onode.sysctl_num ||
/* they didn't pick a number... */
nnode.sysctl_num == CTL_CREATE)) {
/*
* collision here from trying to create
* something that already existed; let's give
* our customers a hand and tell them they got
* what they wanted.
*/
#ifdef SYSCTL_DEBUG_CREATE
printf("cleared\n");
#endif /* SYSCTL_DEBUG_CREATE */
error = 0;
}
}
if (error == 0 &&
(cnode != NULL || log != NULL || descr != NULL)) {
/*
* sysctl_create() gave us back a copy of the node,
* but we need to know where it actually is...
*/
pnode = root;
error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni);
snode = pnode;
/*
* manual scan of last layer so that aliased nodes
* aren't followed.
*/
if (error == 0) {
for (ni = 0; ni < pnode->sysctl_clen; ni++)
if (pnode->sysctl_child[ni].sysctl_num ==
onode.sysctl_num)
break;
if (ni < pnode->sysctl_clen)
pnode = &pnode->sysctl_child[ni];
else
error = ENOENT;
}
/*
* not expecting an error here, but...
*/
if (error == 0) {
KASSERTMSG(pnode->sysctl_parent == snode,
"sysctl parent mis-match pnode %s, snode %s",
pnode->sysctl_name, snode->sysctl_name);
if (log != NULL)
sysctl_log_add(log, pnode);
if (cnode != NULL)
*cnode = pnode;
if (descr != NULL) {
/*
* allow only the first caller that tries to
* *set* a description to actually set it
*
* discard const here so we can attach
* the description
*/
dnode = __UNCONST(pnode);
if (pnode->sysctl_desc != NULL)
/* skip it...we've got one */;
else if (flags & CTLFLAG_OWNDESC) {
size_t l = strlen(descr) + 1;
char *d = malloc(l, M_SYSCTLDATA,
M_WAITOK);
if (d != NULL) {
memcpy(d, descr, l);
dnode->sysctl_desc = d;
dnode->sysctl_flags |=
CTLFLAG_OWNDESC;
}
} else
dnode->sysctl_desc = descr;
}
} else {
printf("sysctl_create succeeded but node not found?!\n");
/*
* confusing, but the create said it
* succeeded, so...
*/
error = 0;
}
}
/*
* now it should be safe to release the lock state. note that
* the pointer to the newly created node being passed back may
* not be "good" for very long.
*/
sysctl_unlock();
if (error != 0) {
printf("sysctl_createv: sysctl_create(%s) returned %d\n",
nnode.sysctl_name, error);
#if 0
if (error != ENOENT)
sysctl_dump(&onode);
#endif
}
return (error);
}
int
sysctl_destroyv(struct sysctlnode *rnode, ...)
{
va_list ap;
int error, name[CTL_MAXNAME], namelen, ni;
const struct sysctlnode *pnode, *node;
struct sysctlnode dnode, *onode;
size_t sz;
va_start(ap, rnode);
namelen = 0;
ni = 0;
do {
if (ni == CTL_MAXNAME) {
va_end(ap);
return (ENAMETOOLONG);
}
name[ni] = va_arg(ap, int);
} while (name[ni++] != CTL_EOL);
namelen = ni - 1;
va_end(ap);
/*
* i can't imagine why we'd be destroying a node when the tree
* wasn't complete, but who knows?
*/
sysctl_lock(true);
/*
* where is it?
*/
node = rnode;
error = sysctl_locate(NULL, &name[0], namelen - 1, &node, &ni);
if (error) {
/* they want it gone and it's not there, so... */
sysctl_unlock();
return (error == ENOENT ? 0 : error);
}
/*
* set up the deletion
*/
pnode = node;
node = &dnode;
memset(&dnode, 0, sizeof(dnode));
dnode.sysctl_flags = SYSCTL_VERSION;
dnode.sysctl_num = name[namelen - 1];
/*
* we found it, now let's nuke it
*/
name[namelen - 1] = CTL_DESTROY;
sz = 0;
error = sysctl_destroy(&name[namelen - 1], 1, NULL, &sz,
node, sizeof(*node), &name[0], NULL,
pnode);
if (error == ENOTEMPTY) {
/*
* think of trying to delete "foo" when "foo.bar"
* (which someone else put there) is still in
* existence
*/
error = 0;
/*
* dunno who put the description there, but if this
* node can ever be removed, we need to make sure the
* string doesn't go out of context. that means we
* need to find the node that's still there (don't use
* sysctl_locate() because that follows aliasing).
*/
node = pnode->sysctl_child;
for (ni = 0; ni < pnode->sysctl_clen; ni++)
if (node[ni].sysctl_num == dnode.sysctl_num)
break;
node = (ni < pnode->sysctl_clen) ? &node[ni] : NULL;
/*
* if we found it, and this node has a description,
* and this node can be released, and it doesn't
* already own its own description...sigh. :)
*/
if (node != NULL && node->sysctl_desc != NULL &&
!(node->sysctl_flags & CTLFLAG_PERMANENT) &&
!(node->sysctl_flags & CTLFLAG_OWNDESC)) {
char *d;
sz = strlen(node->sysctl_desc) + 1;
d = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (d != NULL) {
/*
* discard const so that we can
* re-attach the description
*/
memcpy(d, node->sysctl_desc, sz);
onode = __UNCONST(node);
onode->sysctl_desc = d;
onode->sysctl_flags |= CTLFLAG_OWNDESC;
} else {
/*
* XXX drop the description? be
* afraid? don't care?
*/
}
}
}
sysctl_unlock();
return (error);
}
/*
* ********************************************************************
* Deletes an entire n-ary tree. Not recommended unless you know why
* you're doing it. Personally, I don't know why you'd even think
* about it.
* ********************************************************************
*/
void
sysctl_free(struct sysctlnode *rnode)
{
struct sysctlnode *node, *pnode;
rw_enter(&sysctl_treelock, RW_WRITER);
if (rnode == NULL)
rnode = &sysctl_root;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_free: rnode %p wrong version\n", rnode);
rw_exit(&sysctl_treelock);
return;
}
pnode = rnode;
node = pnode->sysctl_child;
do {
while (node != NULL && pnode->sysctl_csize > 0) {
while (node <
&pnode->sysctl_child[pnode->sysctl_clen] &&
(SYSCTL_TYPE(node->sysctl_flags) !=
CTLTYPE_NODE ||
node->sysctl_csize == 0)) {
if (SYSCTL_FLAGS(node->sysctl_flags) &
CTLFLAG_OWNDATA) {
if (node->sysctl_data != NULL) {
free(node->sysctl_data,
M_SYSCTLDATA);
node->sysctl_data = NULL;
}
}
if (SYSCTL_FLAGS(node->sysctl_flags) &
CTLFLAG_OWNDESC) {
if (node->sysctl_desc != NULL) {
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc),
M_SYSCTLDATA);
node->sysctl_desc = NULL;
}
}
node++;
}
if (node < &pnode->sysctl_child[pnode->sysctl_clen]) {
pnode = node;
node = node->sysctl_child;
} else
break;
}
if (pnode->sysctl_child != NULL)
free(pnode->sysctl_child, M_SYSCTLNODE);
pnode->sysctl_clen = 0;
pnode->sysctl_csize = 0;
pnode->sysctl_child = NULL;
node = pnode;
pnode = node->sysctl_parent;
} while (pnode != NULL && node != rnode);
rw_exit(&sysctl_treelock);
}
void
sysctl_log_print(const struct sysctllog *slog)
{
int i, len;
printf("root %p left %d size %d content", (const void *)slog->log_root,
slog->log_left, slog->log_size);
for (len = 0, i = slog->log_left; i < slog->log_size; i++) {
switch (len) {
case 0:
len = -1;
printf(" version %d", slog->log_num[i]);
break;
case -1:
len = -2;
printf(" type %d", slog->log_num[i]);
break;
case -2:
len = slog->log_num[i];
printf(" len %d:", slog->log_num[i]);
if (len <= 0)
len = -1;
break;
default:
len--;
printf(" %d", slog->log_num[i]);
break;
}
}
printf(" end\n");
}
int
sysctl_log_add(struct sysctllog **logp, const struct sysctlnode *node)
{
const int size0 = 16;
int name[CTL_MAXNAME], namelen, i;
const struct sysctlnode *pnode;
struct sysctllog *log;
if (node->sysctl_flags & CTLFLAG_PERMANENT)
return (0);
if (logp == NULL)
return (0);
if (*logp == NULL) {
log = malloc(sizeof(struct sysctllog),
M_SYSCTLDATA, M_WAITOK);
if (log == NULL) {
/* XXX print error message? */
return (-1);
}
log->log_num = malloc(size0 * sizeof(int),
M_SYSCTLDATA, M_WAITOK);
if (log->log_num == NULL) {
/* XXX print error message? */
free(log, M_SYSCTLDATA);
return (-1);
}
memset(log->log_num, 0, size0 * sizeof(int));
log->log_root = NULL;
log->log_size = size0;
log->log_left = size0;
*logp = log;
} else
log = *logp;
/*
* check that the root is proper. it's okay to record the
* address of the root of a tree. it's the only thing that's
* guaranteed not to shift around as nodes come and go.
*/
if (log->log_root == NULL)
log->log_root = sysctl_rootof(node);
else if (log->log_root != sysctl_rootof(node)) {
printf("sysctl: log %p root mismatch (%p)\n",
log->log_root, sysctl_rootof(node));
return (-1);
}
/*
* we will copy out name in reverse order
*/
for (pnode = node, namelen = 0;
pnode != NULL && !(pnode->sysctl_flags & CTLFLAG_ROOT);
pnode = pnode->sysctl_parent)
name[namelen++] = pnode->sysctl_num;
/*
* do we have space?
*/
if (log->log_left < (namelen + 3))
sysctl_log_realloc(log);
if (log->log_left < (namelen + 3))
return (-1);
/*
* stuff name in, then namelen, then node type, and finally,
* the version for non-node nodes.
*/
for (i = 0; i < namelen && i < CTL_MAXNAME; i++)
log->log_num[--log->log_left] = name[i];
log->log_num[--log->log_left] = namelen;
log->log_num[--log->log_left] = SYSCTL_TYPE(node->sysctl_flags);
if (log->log_num[log->log_left] != CTLTYPE_NODE)
log->log_num[--log->log_left] = node->sysctl_ver;
else
log->log_num[--log->log_left] = 0;
return (0);
}
void
sysctl_teardown(struct sysctllog **logp)
{
const struct sysctlnode *rnode;
struct sysctlnode node;
struct sysctllog *log;
uint namelen;
int *name, t, v, error, ni;
size_t sz;
if (logp == NULL || *logp == NULL)
return;
log = *logp;
rw_enter(&sysctl_treelock, RW_WRITER);
memset(&node, 0, sizeof(node));
while (log->log_left < log->log_size) {
KASSERT(log->log_left + 3 < log->log_size);
KASSERT(log->log_left + log->log_num[log->log_left + 2] <=
log->log_size);
v = log->log_num[log->log_left++];
t = log->log_num[log->log_left++];
namelen = log->log_num[log->log_left++];
name = &log->log_num[log->log_left];
node.sysctl_num = name[namelen - 1];
node.sysctl_flags = SYSCTL_VERSION|t;
node.sysctl_ver = v;
rnode = log->log_root;
error = sysctl_locate(NULL, &name[0], namelen, &rnode, &ni);
if (error == 0) {
name[namelen - 1] = CTL_DESTROY;
rnode = rnode->sysctl_parent;
sz = 0;
(void)sysctl_destroy(&name[namelen - 1], 1, NULL,
&sz, &node, sizeof(node),
&name[0], NULL, rnode);
}
log->log_left += namelen;
}
KASSERT(log->log_size == log->log_left);
free(log->log_num, M_SYSCTLDATA);
free(log, M_SYSCTLDATA);
*logp = NULL;
rw_exit(&sysctl_treelock);
}
/*
* ********************************************************************
* old_sysctl -- A routine to bridge old-style internal calls to the
* new infrastructure.
* ********************************************************************
*/
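/*
 * A minimal sketch of a legacy-style internal caller (the buffer and
 * the surrounding lwp `l' are hypothetical; this is not compiled):
 *
 *	int name[2] = { CTL_KERN, KERN_OSTYPE };
 *	char buf[32];
 *	size_t len = sizeof(buf);
 *	int error;
 *
 *	error = old_sysctl(name, 2, buf, &len, NULL, 0, l);
 *
 * On success, buf holds the kern.ostype string and len its length,
 * exactly as if the new-style dispatch had been used directly.
 */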
int
old_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen, struct lwp *l)
{
int error;
size_t oldlen = 0;
size_t savelen;
if (oldlenp) {
oldlen = *oldlenp;
}
savelen = oldlen;
sysctl_lock(newp != NULL);
error = sysctl_dispatch(name, namelen, oldp, &oldlen,
newp, newlen, name, l, NULL);
sysctl_unlock();
if (error == 0 && oldp != NULL && savelen < oldlen)
error = ENOMEM;
if (oldlenp) {
*oldlenp = oldlen;
}
return (error);
}
/*
* ********************************************************************
* Section 4: Generic helper routines
* ********************************************************************
* "helper" routines that can do more finely grained access control,
* construct structures from disparate information, create the
* appearance of more nodes and sub-trees, etc. for example, if
* CTL_PROC wanted a helper function, it could respond to a CTL_QUERY
* with a dynamically created list of nodes that represented the
* currently running processes at that instant.
* ********************************************************************
*/
/*
* first, a few generic helpers that provide:
*
* sysctl_needfunc() a readonly interface that emits a warning
* sysctl_notavail() returns EOPNOTSUPP (generic error)
* sysctl_null() an empty return buffer with no error
*/
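/*
 * A hedged example of how such a helper is usually attached: a
 * subsystem that has reserved a name but not written its handler yet
 * can point the node's function at sysctl_notavail, so queries still
 * work while everything else gets EOPNOTSUPP.  The "stub" node below
 * is a hypothetical placeholder.
 *
 *	sysctl_createv(clog, 0, NULL, NULL,
 *	    CTLFLAG_PERMANENT,
 *	    CTLTYPE_NODE, "stub", SYSCTL_DESCR("not yet implemented"),
 *	    sysctl_notavail, 0, NULL, 0,
 *	    CTL_KERN, CTL_CREATE, CTL_EOL);
 */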
int
sysctl_needfunc(SYSCTLFN_ARGS)
{
int error;
printf("!!SYSCTL_NEEDFUNC!!\n");
if (newp != NULL || namelen != 0)
return (EOPNOTSUPP);
error = 0;
if (oldp != NULL)
error = sysctl_copyout(l, rnode->sysctl_data, oldp,
MIN(rnode->sysctl_size, *oldlenp));
*oldlenp = rnode->sysctl_size;
return (error);
}
int
sysctl_notavail(SYSCTLFN_ARGS)
{
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
return (EOPNOTSUPP);
}
int
sysctl_null(SYSCTLFN_ARGS)
{
*oldlenp = 0;
return (0);
}
u_int
sysctl_map_flags(const u_int *map, u_int word)
{
u_int rv;
for (rv = 0; *map != 0; map += 2)
if ((word & map[0]) != 0)
rv |= map[1];
return rv;
}
/*
* ********************************************************************
* Section 5: The machinery that makes it all go
* ********************************************************************
* Memory "manglement" routines. Not much to this, eh?
* ********************************************************************
*/
static int
sysctl_alloc(struct sysctlnode *p, int x)
{
int i;
struct sysctlnode *n;
assert(p->sysctl_child == NULL);
if (x == 1)
n = malloc(sizeof(struct sysctlnode),
M_SYSCTLNODE, M_WAITOK);
else
n = malloc(SYSCTL_DEFSIZE * sizeof(struct sysctlnode),
M_SYSCTLNODE, M_WAITOK);
if (n == NULL)
return (ENOMEM);
if (x == 1) {
memset(n, 0, sizeof(struct sysctlnode));
p->sysctl_csize = 1;
} else {
memset(n, 0, SYSCTL_DEFSIZE * sizeof(struct sysctlnode));
p->sysctl_csize = SYSCTL_DEFSIZE;
}
p->sysctl_clen = 0;
for (i = 0; i < p->sysctl_csize; i++)
n[i].sysctl_parent = p;
p->sysctl_child = n;
return (0);
}
static int
sysctl_realloc(struct sysctlnode *p)
{
int i, j, olen;
struct sysctlnode *n;
assert(p->sysctl_csize == p->sysctl_clen);
/*
* how many do we have...how many should we make?
*/
olen = p->sysctl_clen;
n = malloc(2 * olen * sizeof(struct sysctlnode), M_SYSCTLNODE,
M_WAITOK);
if (n == NULL)
return (ENOMEM);
/*
* move old children over...initialize new children
*/
memcpy(n, p->sysctl_child, olen * sizeof(struct sysctlnode));
memset(&n[olen], 0, olen * sizeof(struct sysctlnode));
p->sysctl_csize = 2 * olen;
/*
* reattach moved (and new) children to parent; if a moved
* child node has children, reattach the parent pointers of
* grandchildren
*/
for (i = 0; i < p->sysctl_csize; i++) {
n[i].sysctl_parent = p;
if (n[i].sysctl_child != NULL) {
for (j = 0; j < n[i].sysctl_csize; j++)
n[i].sysctl_child[j].sysctl_parent = &n[i];
}
}
/*
* get out with the old and in with the new
*/
free(p->sysctl_child, M_SYSCTLNODE);
p->sysctl_child = n;
return (0);
}
static int
sysctl_log_realloc(struct sysctllog *log)
{
int *n, s, d;
s = log->log_size * 2;
d = log->log_size;
n = malloc(s * sizeof(int), M_SYSCTLDATA, M_WAITOK);
if (n == NULL)
return (-1);
memset(n, 0, s * sizeof(int));
memcpy(&n[d], log->log_num, d * sizeof(int));
free(log->log_num, M_SYSCTLDATA);
log->log_num = n;
if (d)
log->log_left += d;
else
log->log_left = s;
log->log_size = s;
return (0);
}
/*
* ********************************************************************
* Section 6: Conversion between API versions wrt the sysctlnode
* ********************************************************************
*/
static int
sysctl_cvt_in(struct lwp *l, int *vp, const void *i, size_t sz,
struct sysctlnode *node)
{
int error, flags;
if (i == NULL || sz < sizeof(flags))
return (EINVAL);
error = sysctl_copyin(l, i, &flags, sizeof(flags));
if (error)
return (error);
#if (SYSCTL_VERSION != SYSCTL_VERS_1)
#error sysctl_cvt_in: no support for SYSCTL_VERSION
#endif /* (SYSCTL_VERSION != SYSCTL_VERS_1) */
if (sz == sizeof(*node) &&
SYSCTL_VERS(flags) == SYSCTL_VERSION) {
error = sysctl_copyin(l, i, node, sizeof(*node));
if (error)
return (error);
*vp = SYSCTL_VERSION;
return (0);
}
return (EINVAL);
}
static int
sysctl_cvt_out(struct lwp *l, int v, const struct sysctlnode *i,
void *ovp, size_t left, size_t *szp)
{
size_t sz = sizeof(*i);
const void *src = i;
int error;
switch (v) {
case SYSCTL_VERS_0:
return (EINVAL);
#if (SYSCTL_VERSION != SYSCTL_VERS_1)
#error sysctl_cvt_out: no support for SYSCTL_VERSION
#endif /* (SYSCTL_VERSION != SYSCTL_VERS_1) */
case SYSCTL_VERSION:
/* nothing more to do here */
break;
}
if (ovp != NULL && left >= sz) {
error = sysctl_copyout(l, src, ovp, sz);
if (error)
return (error);
}
if (szp != NULL)
*szp = sz;
return (0);
}
static uint8_t address_key[32]; /* key used in address hashing */
static ONCE_DECL(random_inithook);
static int
random_address_init(void)
{
cprng_strong(kern_cprng, address_key, sizeof(address_key), 0);
return 0;
}
void
hash_value_ensure_initialized(void)
{
RUN_ONCE(&random_inithook, random_address_init);
}
void
hash_value(void *d, size_t ds, const void *s, size_t ss)
{
blake2s(d, ds, address_key, sizeof(address_key), s, ss);
}
/* $NetBSD: umap_vfsops.c,v 1.104 2022/11/04 11:20:40 hannken Exp $ */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)null_vfsops.c 1.5 (Berkeley) 7/10/92
* @(#)umap_vfsops.c 8.8 (Berkeley) 5/14/95
*/
/*
* Umap Layer
* (See mount_umap(8) for a description of this layer.)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_vfsops.c,v 1.104 2022/11/04 11:20:40 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/umapfs/umap.h>
#include <miscfs/genfs/layer_extern.h>
MODULE(MODULE_CLASS_VFS, umap, "layerfs");
VFS_PROTOS(umapfs);
/*
* Mount umap layer
*/
int
umapfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
struct pathbuf *pb;
struct nameidata nd;
struct umap_args *args = data;
struct vnode *lowerrootvp, *vp;
struct umap_mount *amp;
int error;
#ifdef UMAPFS_DIAGNOSTIC
int i;
#endif
fsid_t tfsid;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args) {
#ifdef UMAPFS_DIAGNOSTIC
printf("mount_umap: data len %d < args %d\n",
(int)*data_len, (int)(sizeof *args));
#endif
return EINVAL;
}
if (mp->mnt_flag & MNT_GETARGS) {
amp = MOUNTTOUMAPMOUNT(mp);
if (amp == NULL)
return EIO;
args->la.target = NULL;
args->nentries = amp->info_nentries;
args->gnentries = amp->info_gnentries;
*data_len = sizeof *args;
return 0;
}
/* only for root */
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_UMAP, NULL, NULL, NULL);
if (error)
return error;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_mount(mp = %p)\n", mp);
#endif
/*
* Update is not supported
*/
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/*
* Find lower node
*/
error = pathbuf_copyin(args->umap_target, &pb);
if (error) {
return error;
}
NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
return error;
}
/*
* Sanity check on lower vnode
*/
lowerrootvp = nd.ni_vp;
pathbuf_destroy(pb);
#ifdef UMAPFS_DIAGNOSTIC
printf("vp = %p, check for VDIR...\n", lowerrootvp);
#endif
if (lowerrootvp->v_type != VDIR) {
vput(lowerrootvp);
return (EINVAL);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("mp = %p\n", mp);
#endif
amp = kmem_zalloc(sizeof(struct umap_mount), KM_SLEEP);
mp->mnt_data = amp;
/*
* Now copy in the number of entries and maps for umap mapping.
*/
if (args->nentries < 0 || args->nentries > MAPFILEENTRIES ||
args->gnentries < 0 || args->gnentries > GMAPFILEENTRIES) {
vput(lowerrootvp);
return (EINVAL);
}
amp->info_nentries = args->nentries;
amp->info_gnentries = args->gnentries;
error = copyin(args->mapdata, amp->info_mapdata,
2*sizeof(u_long)*args->nentries);
if (error) {
vput(lowerrootvp);
return (error);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("umap_mount:nentries %d\n",args->nentries);
for (i = 0; i < args->nentries; i++)
printf(" %ld maps to %ld\n", amp->info_mapdata[i][0],
amp->info_mapdata[i][1]);
#endif
error = copyin(args->gmapdata, amp->info_gmapdata,
2*sizeof(u_long)*args->gnentries);
if (error) {
vput(lowerrootvp);
return (error);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("umap_mount:gnentries %d\n",args->gnentries);
for (i = 0; i < args->gnentries; i++)
printf("\tgroup %ld maps to %ld\n",
amp->info_gmapdata[i][0],
amp->info_gmapdata[i][1]);
#endif
/*
* Make sure the mount point's sufficiently initialized
* that the node create call will work.
*/
tfsid.__fsid_val[0] = (int32_t)args->fsid;
tfsid.__fsid_val[1] = makefstype(MOUNT_UMAP);
if (tfsid.__fsid_val[0] == 0) {
log(LOG_WARNING, "umapfs: fsid given as 0, ignoring\n");
vfs_getnewfsid(mp);
} else if (vfs_getvfs(&tfsid)) {
log(LOG_WARNING, "umapfs: fsid %x already mounted\n",
tfsid.__fsid_val[0]);
vfs_getnewfsid(mp);
} else {
mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1];
mp->mnt_stat.f_fsid = tfsid.__fsid_val[0];
}
log(LOG_DEBUG, "umapfs: using fsid %x/%x\n",
mp->mnt_stat.f_fsidx.__fsid_val[0],
mp->mnt_stat.f_fsidx.__fsid_val[1]);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(amp, sizeof(struct umap_mount));
return error;
}
amp->umapm_size = sizeof(struct umap_node);
amp->umapm_tag = VT_UMAP;
amp->umapm_bypass = umap_bypass;
amp->umapm_vnodeop_p = umap_vnodeop_p;
/*
* fix up umap node for root vnode.
*/
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
/*
* Make sure the node alias worked
*/
if (error) {
vrele(lowerrootvp);
kmem_free(amp, sizeof(struct umap_mount));
return error;
}
/*
* Keep a held reference to the root vnode.
* It is vrele'd in umapfs_unmount.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
amp->umapm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->umap_target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error)
return error;
if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
mp->mnt_flag |= MNT_LOCAL;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_mount: lower %s, alias at %s\n",
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
return 0;
}
/*
* Free reference to umap layer
*/
int
umapfs_unmount(struct mount *mp, int mntflags)
{
struct umap_mount *amp = MOUNTTOUMAPMOUNT(mp);
struct vnode *rtvp = amp->umapm_rootvp;
int error;
int flags = 0;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_unmount(mp = %p)\n", mp);
#endif
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(rtvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, rtvp, flags)) != 0)
return (error);
#ifdef UMAPFS_DIAGNOSTIC
vprint("alias root of lower", rtvp);
#endif
/*
* Blow it away for future re-use
*/
vgone(rtvp);
/*
* Finally, throw away the umap_mount structure
*/
kmem_free(amp, sizeof(struct umap_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc umapfs_vnodeop_opv_desc;
const struct vnodeopv_desc * const umapfs_vnodeopv_descs[] = {
&umapfs_vnodeop_opv_desc,
NULL,
};
struct vfsops umapfs_vfsops = {
.vfs_name = MOUNT_UMAP,
.vfs_min_mount_data = sizeof (struct umap_args),
.vfs_mount = umapfs_mount,
.vfs_start = layerfs_start,
.vfs_unmount = umapfs_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = umapfs_vnodeopv_descs
};
SYSCTL_SETUP(umapfs_sysctl_setup, "umapfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "umap",
SYSCTL_DESCR("UID/GID remapping file system"),
NULL, 0, NULL, 0,
CTL_VFS, 10, CTL_EOL);
/*
* XXX the "10" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "10" is the order as taken from sys/mount.h
*/
}
static int
umap_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&umapfs_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&umapfs_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: kern_malloc.c,v 1.158 2019/11/14 16:23:52 maxv Exp $ */
/*
* Copyright (c) 1987, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1996 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95
*/
/*
* Wrapper interface for obsolete malloc(9).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_malloc.c,v 1.158 2019/11/14 16:23:52 maxv Exp $");
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/asan.h>
#include <sys/msan.h>
/*
* Built-in malloc types. Note: ought to be removed.
*/
MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
MALLOC_DEFINE(M_DMAMAP, "DMA map", "bus_dma(9) structures");
MALLOC_DEFINE(M_FREE, "free", "should be on free list");
MALLOC_DEFINE(M_TEMP, "temp", "misc. temporary data buffers");
MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
MALLOC_DEFINE(M_FTABLE, "fragtbl", "fragment reassembly header");
MALLOC_DEFINE(M_UFSMNT, "UFS mount", "UFS mount structure");
MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
MALLOC_DEFINE(M_MRTABLE, "mrt", "multicast routing tables");
/*
* Header contains total size, including the header itself.
*/
struct malloc_header {
size_t mh_size;
#ifdef KASAN
size_t mh_rqsz;
#endif
} __aligned(ALIGNBYTES + 1);
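/*
 * Layout sketch (descriptive only): for a small request the header
 * sits immediately in front of the pointer handed back to the caller,
 * so an allocation of `size' bytes really consumes
 * sizeof(struct malloc_header) + size from kmem.  For requests of a
 * page or more, the allocation is padded by a full page and the
 * header is placed at the end of that leading page, so the returned
 * pointer stays page aligned:
 *
 *	small:	[ header | caller data ... ]
 *	large:	[ ... padding ... | header ][ caller data (page aligned) ]
 *
 * kern_free() recovers the header by stepping one header back from
 * the pointer it is given, in either case.
 */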
void *
kern_malloc(unsigned long reqsize, int flags)
{
const int kmflags = (flags & M_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
#ifdef KASAN
const size_t origsize = reqsize;
#endif
size_t size = reqsize;
size_t allocsize, hdroffset;
struct malloc_header *mh;
void *p;
kasan_add_redzone(&size);
if (size >= PAGE_SIZE) {
if (size > (ULONG_MAX-PAGE_SIZE))
allocsize = ULONG_MAX; /* this will fail later */
else
allocsize = PAGE_SIZE + size; /* for page alignment */
hdroffset = PAGE_SIZE - sizeof(struct malloc_header);
} else {
allocsize = sizeof(struct malloc_header) + size;
hdroffset = 0;
}
p = kmem_intr_alloc(allocsize, kmflags);
if (p == NULL)
return NULL;
kmsan_mark(p, allocsize, KMSAN_STATE_UNINIT);
kmsan_orig(p, allocsize, KMSAN_TYPE_MALLOC, __RET_ADDR);
if ((flags & M_ZERO) != 0) {
memset(p, 0, allocsize);
}
mh = (void *)((char *)p + hdroffset);
mh->mh_size = allocsize - hdroffset;
#ifdef KASAN
mh->mh_rqsz = origsize;
#endif
mh++;
kasan_mark(mh, origsize, size, KASAN_MALLOC_REDZONE);
return mh;
}
void
kern_free(void *addr)
{
struct malloc_header *mh;
mh = addr;
mh--;
kasan_mark(addr, mh->mh_size - sizeof(struct malloc_header),
mh->mh_size - sizeof(struct malloc_header), KASAN_MALLOC_REDZONE);
if (mh->mh_size >= PAGE_SIZE + sizeof(struct malloc_header)) {
kmsan_mark((char *)addr - PAGE_SIZE,
mh->mh_size + PAGE_SIZE - sizeof(struct malloc_header),
KMSAN_STATE_INITED);
kmem_intr_free((char *)addr - PAGE_SIZE,
mh->mh_size + PAGE_SIZE - sizeof(struct malloc_header));
} else {
kmsan_mark(mh, mh->mh_size, KMSAN_STATE_INITED);
kmem_intr_free(mh, mh->mh_size);
}
}
void *
kern_realloc(void *curaddr, unsigned long newsize, int flags)
{
struct malloc_header *mh;
unsigned long cursize;
void *newaddr;
/*
* realloc() with a NULL pointer is the same as malloc().
*/
if (curaddr == NULL)
return malloc(newsize, ksp, flags);
/*
* realloc() with zero size is the same as free().
*/
if (newsize == 0) {
free(curaddr, ksp);
return NULL;
}
if ((flags & M_NOWAIT) == 0) {
ASSERT_SLEEPABLE();
}
mh = curaddr;
mh--;
#ifdef KASAN
cursize = mh->mh_rqsz;
#else
cursize = mh->mh_size - sizeof(struct malloc_header);
#endif
/*
* If we already actually have as much as they want, we're done.
*/
if (newsize <= cursize)
return curaddr;
/*
* Can't satisfy the allocation with the existing block.
* Allocate a new one and copy the data.
*/
newaddr = malloc(newsize, ksp, flags);
if (__predict_false(newaddr == NULL)) {
/*
* malloc() failed, because flags included M_NOWAIT.
* Return NULL to indicate that failure. The old
* pointer is still valid.
*/
return NULL;
}
memcpy(newaddr, curaddr, cursize);
/*
* We were successful: free the old allocation and return
* the new one.
*/
free(curaddr, ksp);
return newaddr;
}
/* $NetBSD: subr_copy.c,v 1.19 2023/05/22 14:07:24 riastradh Exp $ */
/*-
* Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008, 2019
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_subr.c 8.4 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_copy.c,v 1.19 2023/05/22 14:07:24 riastradh Exp $");
#define __UFETCHSTORE_PRIVATE
#define __UCAS_PRIVATE
#include <sys/param.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
void
uio_setup_sysspace(struct uio *uio)
{
uio->uio_vmspace = vmspace_kernel();
}
int
uiomove(void *buf, size_t n, struct uio *uio)
{
struct vmspace *vm = uio->uio_vmspace;
struct iovec *iov;
size_t cnt;
int error = 0;
char *cp = buf;
ASSERT_SLEEPABLE();
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
while (n > 0 && uio->uio_resid) {
KASSERT(uio->uio_iovcnt > 0);
iov = uio->uio_iov;
cnt = iov->iov_len;
if (cnt == 0) {
KASSERT(uio->uio_iovcnt > 1);
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
if (cnt > n)
cnt = n;
if (!VMSPACE_IS_KERNEL_P(vm)) {
preempt_point();
}
if (uio->uio_rw == UIO_READ) {
error = copyout_vmspace(vm, cp, iov->iov_base,
cnt);
} else {
error = copyin_vmspace(vm, iov->iov_base, cp,
cnt);
}
if (error) {
break;
}
iov->iov_base = (char *)iov->iov_base + cnt;
iov->iov_len -= cnt;
uio->uio_resid -= cnt;
uio->uio_offset += cnt;
cp += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
return (error);
}
/*
* Wrapper for uiomove() that validates the arguments against a known-good
* kernel buffer.
*/
int
uiomove_frombuf(void *buf, size_t buflen, struct uio *uio)
{
size_t offset;
if (uio->uio_offset < 0 || /* uio->uio_resid < 0 || */
(offset = uio->uio_offset) != uio->uio_offset)
return (EINVAL);
if (offset >= buflen)
return (0);
return (uiomove((char *)buf + offset, buflen - offset, uio));
}
int
uiopeek(void *buf, size_t n, struct uio *uio)
{
struct vmspace *vm = uio->uio_vmspace;
struct iovec *iov;
size_t cnt;
int error = 0;
char *cp = buf;
size_t resid = uio->uio_resid;
int iovcnt = uio->uio_iovcnt;
char *base;
size_t len;
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
if (n == 0 || resid == 0)
return 0;
iov = uio->uio_iov;
base = iov->iov_base;
len = iov->iov_len;
while (n > 0 && resid > 0) {
KASSERT(iovcnt > 0);
cnt = len;
if (cnt == 0) {
KASSERT(iovcnt > 1);
iov++;
iovcnt--;
base = iov->iov_base;
len = iov->iov_len;
continue;
}
if (cnt > n)
cnt = n;
if (!VMSPACE_IS_KERNEL_P(vm)) {
preempt_point();
}
if (uio->uio_rw == UIO_READ) {
error = copyout_vmspace(vm, cp, base, cnt);
} else {
error = copyin_vmspace(vm, base, cp, cnt);
}
if (error) {
break;
}
base += cnt;
len -= cnt;
resid -= cnt;
cp += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
return error;
}
void
uioskip(size_t n, struct uio *uio)
{
struct iovec *iov;
size_t cnt;
KASSERTMSG(n <= uio->uio_resid, "n=%zu resid=%zu", n, uio->uio_resid);
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
while (n > 0 && uio->uio_resid) {
KASSERT(uio->uio_iovcnt > 0);
iov = uio->uio_iov;
cnt = iov->iov_len;
if (cnt == 0) {
KASSERT(uio->uio_iovcnt > 1);
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
if (cnt > n)
cnt = n;
iov->iov_base = (char *)iov->iov_base + cnt;
iov->iov_len -= cnt;
uio->uio_resid -= cnt;
uio->uio_offset += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
}
/*
* Give next character to user as result of read.
*/
int
ureadc(int c, struct uio *uio)
{
struct iovec *iov;
if (uio->uio_resid <= 0)
panic("ureadc: non-positive resid");
again:
if (uio->uio_iovcnt <= 0)
panic("ureadc: non-positive iovcnt");
iov = uio->uio_iov;
if (iov->iov_len <= 0) {
uio->uio_iovcnt--;
uio->uio_iov++;
goto again;
}
if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
int error;
if ((error = ustore_char(iov->iov_base, c)) != 0)
return (error);
} else {
*(char *)iov->iov_base = c;
}
iov->iov_base = (char *)iov->iov_base + 1;
iov->iov_len--;
uio->uio_resid--;
uio->uio_offset++;
return (0);
}
/*
* Like copyin(), but operates on an arbitrary vmspace.
*/
int
copyin_vmspace(struct vmspace *vm, const void *uaddr, void *kaddr, size_t len)
{
struct iovec iov;
struct uio uio;
int error;
if (len == 0)
return (0);
if (VMSPACE_IS_KERNEL_P(vm)) {
return kcopy(uaddr, kaddr, len);
}
if (__predict_true(vm == curproc->p_vmspace)) {
return copyin(uaddr, kaddr, len);
}
iov.iov_base = kaddr;
iov.iov_len = len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(uintptr_t)uaddr;
uio.uio_resid = len;
uio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&uio);
error = uvm_io(&vm->vm_map, &uio, 0);
return (error);
}
/*
* Like copyout(), but operates on an arbitrary vmspace.
*/
int
copyout_vmspace(struct vmspace *vm, const void *kaddr, void *uaddr, size_t len)
{
struct iovec iov;
struct uio uio;
int error;
if (len == 0)
return (0);
if (VMSPACE_IS_KERNEL_P(vm)) {
return kcopy(kaddr, uaddr, len);
}
if (__predict_true(vm == curproc->p_vmspace)) {
return copyout(kaddr, uaddr, len);
}
iov.iov_base = __UNCONST(kaddr); /* XXXUNCONST cast away const */
iov.iov_len = len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(uintptr_t)uaddr;
uio.uio_resid = len;
uio.uio_rw = UIO_WRITE;
UIO_SETUP_SYSSPACE(&uio);
error = uvm_io(&vm->vm_map, &uio, 0);
return (error);
}
/*
* Like copyin(), but operates on an arbitrary process.
*/
int
copyin_proc(struct proc *p, const void *uaddr, void *kaddr, size_t len)
{
struct vmspace *vm;
int error;
error = proc_vmspace_getref(p, &vm);
if (error) {
return error;
}
error = copyin_vmspace(vm, uaddr, kaddr, len);
uvmspace_free(vm);
return error;
}
/*
* Like copyout(), but operates on an arbitrary process.
*/
int
copyout_proc(struct proc *p, const void *kaddr, void *uaddr, size_t len)
{
struct vmspace *vm;
int error;
error = proc_vmspace_getref(p, &vm);
if (error) {
return error;
}
error = copyout_vmspace(vm, kaddr, uaddr, len);
uvmspace_free(vm);
return error;
}
/*
* Like copyin(), but operates on an arbitrary pid.
*/
int
copyin_pid(pid_t pid, const void *uaddr, void *kaddr, size_t len)
{
struct proc *p;
struct vmspace *vm;
int error;
mutex_enter(&proc_lock);
p = proc_find(pid);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
mutex_enter(p->p_lock);
error = proc_vmspace_getref(p, &vm);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
if (error == 0) {
error = copyin_vmspace(vm, uaddr, kaddr, len);
uvmspace_free(vm);
}
return error;
}
/*
* Like copyin(), except it operates on kernel addresses when the FKIOCTL
* flag is passed in `ioctlflags' from the ioctl call.
*/
int
ioctl_copyin(int ioctlflags, const void *src, void *dst, size_t len)
{
if (ioctlflags & FKIOCTL)
return kcopy(src, dst, len);
return copyin(src, dst, len);
}
/*
* Like copyout(), except it operates on kernel addresses when the FKIOCTL
* flag is passed in `ioctlflags' from the ioctl call.
*/
int
ioctl_copyout(int ioctlflags, const void *src, void *dst, size_t len)
{
if (ioctlflags & FKIOCTL)
return kcopy(src, dst, len);
return copyout(src, dst, len);
}
/*
* User-space CAS / fetch / store
*/
#ifdef __NO_STRICT_ALIGNMENT
#define CHECK_ALIGNMENT() __nothing
#else /* ! __NO_STRICT_ALIGNMENT */
static bool
ufetchstore_aligned(uintptr_t uaddr, size_t size)
{
return (uaddr & (size - 1)) == 0;
}
#define CHECK_ALIGNMENT() \
do { \
if (!ufetchstore_aligned((uintptr_t)uaddr, sizeof(*uaddr))) \
return EFAULT; \
} while (/*CONSTCOND*/0)
#endif /* __NO_STRICT_ALIGNMENT */
/*
* __HAVE_UCAS_FULL platforms provide _ucas_32() and _ucas_64() themselves.
* _RUMPKERNEL also provides its own _ucas_32() and _ucas_64().
*
* In all other cases, we provide generic implementations that work on
* all platforms.
*/
#if !defined(__HAVE_UCAS_FULL) && !defined(_RUMPKERNEL)
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/once.h>
#include <sys/mutex.h>
#include <sys/ipi.h>
static int ucas_critical_splcookie;
static volatile u_int ucas_critical_pausing_cpus;
static u_int ucas_critical_ipi;
static ONCE_DECL(ucas_critical_init_once)
static void
ucas_critical_cpu_gate(void *arg __unused)
{
int count = SPINLOCK_BACKOFF_MIN;
KASSERT(atomic_load_relaxed(&ucas_critical_pausing_cpus) > 0);
/*
* Notify ucas_critical_wait that we have stopped. Using
* store-release ensures all our memory operations up to the
* IPI happen before the ucas -- no buffered stores on our end
* can clobber it later on, for instance.
*
* Matches atomic_load_acquire in ucas_critical_wait -- turns
* the following atomic_dec_uint into a store-release.
*/
membar_release();
atomic_dec_uint(&ucas_critical_pausing_cpus);
/*
* Wait for ucas_critical_exit to reopen the gate and let us
* proceed. Using a load-acquire ensures the ucas happens
* before any of our memory operations when we return from the
* IPI and proceed -- we won't observe any stale cached value
* that the ucas overwrote, for instance.
*
* Matches atomic_store_release in ucas_critical_exit.
*/
while (atomic_load_acquire(&ucas_critical_pausing_cpus) != (u_int)-1) {
SPINLOCK_BACKOFF(count);
}
}
static int
ucas_critical_init(void)
{
ucas_critical_ipi = ipi_register(ucas_critical_cpu_gate, NULL);
return 0;
}
static void
ucas_critical_wait(void)
{
int count = SPINLOCK_BACKOFF_MIN;
/*
* Wait for all CPUs to stop at the gate. Using a load-acquire
* ensures all memory operations before they stop at the gate
* happen before the ucas -- no buffered stores in other CPUs
* can clobber it later on, for instance.
*
* Matches membar_release/atomic_dec_uint (store-release) in
* ucas_critical_cpu_gate.
*/
while (atomic_load_acquire(&ucas_critical_pausing_cpus) > 0) {
SPINLOCK_BACKOFF(count);
}
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
static inline void
ucas_critical_enter(lwp_t * const l)
{
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
if (ncpu > 1) {
RUN_ONCE(&ucas_critical_init_once, ucas_critical_init);
/*
* Acquire the mutex first, then go to splhigh() and
* broadcast the IPI to lock all of the other CPUs
* behind the gate.
*
* N.B. Going to splhigh() implicitly disables preemption,
* so there's no need to do it explicitly.
*/
mutex_enter(&cpu_lock);
ucas_critical_splcookie = splhigh();
ucas_critical_pausing_cpus = ncpu - 1;
ipi_trigger_broadcast(ucas_critical_ipi, true);
ucas_critical_wait();
return;
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
KPREEMPT_DISABLE(l);
}
static inline void
ucas_critical_exit(lwp_t * const l)
{
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
if (ncpu > 1) {
/*
* Open the gate and notify all CPUs in
* ucas_critical_cpu_gate that they can now proceed.
* Using a store-release ensures the ucas happens
* before any memory operations they issue after the
* IPI -- they won't observe any stale cache of the
* target word, for instance.
*
* Matches atomic_load_acquire in ucas_critical_cpu_gate.
*/
atomic_store_release(&ucas_critical_pausing_cpus, (u_int)-1);
splx(ucas_critical_splcookie);
mutex_exit(&cpu_lock);
return;
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
KPREEMPT_ENABLE(l);
}
int
_ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret)
{
lwp_t * const l = curlwp;
uint32_t *uva = ((void *)(uintptr_t)uaddr);
int error;
/*
* Wire the user address down to avoid taking a page fault during
* the critical section.
*/
error = uvm_vslock(l->l_proc->p_vmspace, uva, sizeof(*uaddr),
VM_PROT_READ | VM_PROT_WRITE);
if (error)
return error;
ucas_critical_enter(l);
error = _ufetch_32(uva, ret);
if (error == 0 && *ret == old) {
error = _ustore_32(uva, new);
}
ucas_critical_exit(l);
uvm_vsunlock(l->l_proc->p_vmspace, uva, sizeof(*uaddr));
return error;
}
#ifdef _LP64
int
_ucas_64(volatile uint64_t *uaddr, uint64_t old, uint64_t new, uint64_t *ret)
{
lwp_t * const l = curlwp;
uint64_t *uva = ((void *)(uintptr_t)uaddr);
int error;
/*
* Wire the user address down to avoid taking a page fault during
* the critical section.
*/
error = uvm_vslock(l->l_proc->p_vmspace, uva, sizeof(*uaddr),
VM_PROT_READ | VM_PROT_WRITE);
if (error)
return error;
ucas_critical_enter(l);
error = _ufetch_64(uva, ret);
if (error == 0 && *ret == old) {
error = _ustore_64(uva, new);
}
ucas_critical_exit(l);
uvm_vsunlock(l->l_proc->p_vmspace, uva, sizeof(*uaddr));
return error;
}
#endif /* _LP64 */
#endif /* ! __HAVE_UCAS_FULL && ! _RUMPKERNEL */
int
ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
#if (defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)) && \
!defined(_RUMPKERNEL)
if (ncpu > 1) {
return _ucas_32_mp(uaddr, old, new, ret);
}
#endif /* __HAVE_UCAS_MP && MULTIPROCESSOR */
return _ucas_32(uaddr, old, new, ret);
}
#ifdef _LP64
int
ucas_64(volatile uint64_t *uaddr, uint64_t old, uint64_t new, uint64_t *ret)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
#if (defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)) && \
!defined(_RUMPKERNEL)
if (ncpu > 1) {
return _ucas_64_mp(uaddr, old, new, ret);
}
#endif /* __HAVE_UCAS_MP && MULTIPROCESSOR */
return _ucas_64(uaddr, old, new, ret);
}
#endif /* _LP64 */
__strong_alias(ucas_int,ucas_32);
#ifdef _LP64
__strong_alias(ucas_ptr,ucas_64);
#else
__strong_alias(ucas_ptr,ucas_32);
#endif /* _LP64 */
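/*
* Illustrative sketch (not part of this file): a typical caller performs
* an atomic read-modify-write on a user word (here an increment) by
* retrying the compare-and-swap until it succeeds. `uaddr' is a
* hypothetical user-space pointer.
*
*	uint32_t cur, got;
*	int error;
*
*	error = ufetch_32(uaddr, &cur);
*	while (error == 0) {
*		error = ucas_32(uaddr, cur, cur + 1, &got);
*		if (error || got == cur)
*			break;		(swap done, or hard failure)
*		cur = got;		(lost the race; retry with new value)
*	}
*/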
int
ufetch_8(const uint8_t *uaddr, uint8_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_8(uaddr, valp);
}
int
ufetch_16(const uint16_t *uaddr, uint16_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_16(uaddr, valp);
}
int
ufetch_32(const uint32_t *uaddr, uint32_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_32(uaddr, valp);
}
#ifdef _LP64
int
ufetch_64(const uint64_t *uaddr, uint64_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_64(uaddr, valp);
}
#endif /* _LP64 */
__strong_alias(ufetch_char,ufetch_8);
__strong_alias(ufetch_short,ufetch_16);
__strong_alias(ufetch_int,ufetch_32);
#ifdef _LP64
__strong_alias(ufetch_long,ufetch_64);
__strong_alias(ufetch_ptr,ufetch_64);
#else
__strong_alias(ufetch_long,ufetch_32);
__strong_alias(ufetch_ptr,ufetch_32);
#endif /* _LP64 */
int
ustore_8(uint8_t *uaddr, uint8_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_8(uaddr, val);
}
int
ustore_16(uint16_t *uaddr, uint16_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_16(uaddr, val);
}
int
ustore_32(uint32_t *uaddr, uint32_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_32(uaddr, val);
}
#ifdef _LP64
int
ustore_64(uint64_t *uaddr, uint64_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_64(uaddr, val);
}
#endif /* _LP64 */
__strong_alias(ustore_char,ustore_8);
__strong_alias(ustore_short,ustore_16);
__strong_alias(ustore_int,ustore_32);
#ifdef _LP64
__strong_alias(ustore_long,ustore_64);
__strong_alias(ustore_ptr,ustore_64);
#else
__strong_alias(ustore_long,ustore_32);
__strong_alias(ustore_ptr,ustore_32);
#endif /* _LP64 */
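/*
* Illustrative sketch (not part of this file): fetching and updating a
* single user-space flag word from a sleepable syscall path. The
* pointer `uflagp' and the flag value are hypothetical. Note that a
* ufetch/ustore pair is not atomic; use the ucas_*() routines above
* when atomicity against other threads is required.
*
*	u_int flags;
*	int error;
*
*	error = ufetch_int(uflagp, &flags);
*	if (error == 0)
*		error = ustore_int(uflagp, flags | MY_FLAG);
*	if (error)
*		return error;	(EFAULT on bad address or alignment)
*/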
/* $NetBSD: kern_cfglock.c,v 1.1 2010/08/21 13:17:31 pgoyette Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cfglock.c,v 1.1 2010/08/21 13:17:31 pgoyette Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/mutex.h>
#include <sys/lwp.h>
#include <sys/systm.h>
static kmutex_t kernconfig_mutex;
static lwp_t *kernconfig_lwp;
static int kernconfig_recurse;
/*
* Functions for manipulating the kernel configuration lock. This
* recursive lock should be used to protect all additions and removals
* of kernel functionality, such as device configuration and loading
* of modular kernel components.
*/
void
kernconfig_lock_init(void)
{
mutex_init(&kernconfig_mutex, MUTEX_DEFAULT, IPL_NONE);
kernconfig_lwp = NULL;
kernconfig_recurse = 0;
}
void
kernconfig_lock(void)
{
lwp_t *my_lwp;
/*
* It's OK to check this unlocked, since it could only be set to
* curlwp by the current thread itself, and not by an interrupt
* or any other LWP.
*/
KASSERT(!cpu_intr_p());
my_lwp = curlwp;
if (kernconfig_lwp == my_lwp) {
kernconfig_recurse++;
KASSERT(kernconfig_recurse > 1);
} else {
mutex_enter(&kernconfig_mutex);
kernconfig_lwp = my_lwp;
kernconfig_recurse = 1;
}
}
void
kernconfig_unlock(void)
{
KASSERT(kernconfig_is_held());
KASSERT(kernconfig_recurse != 0);
if (--kernconfig_recurse == 0) {
kernconfig_lwp = NULL;
mutex_exit(&kernconfig_mutex);
}
}
bool
kernconfig_is_held(void)
{
return mutex_owned(&kernconfig_mutex);
}
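/*
* Illustrative sketch (not part of this file): code that attaches or
* detaches kernel functionality brackets the operation with the
* recursive config lock; the attach function named here is hypothetical
* and may itself take the lock again without deadlocking.
*
*	kernconfig_lock();
*	KASSERT(kernconfig_is_held());
*	error = attach_some_component();
*	kernconfig_unlock();
*/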
/* $NetBSD: tsc.c,v 1.60 2024/02/19 20:10:09 mrg Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.60 2024/02/19 20:10:09 mrg Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/lwp.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/lock.h>
#include <machine/cpu_counter.h>
#include <machine/cpuvar.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/cputypes.h>
#include "tsc.h"
#define TSC_SYNC_ROUNDS 1000
#define ABS(a) ((a) >= 0 ? (a) : -(a))
static u_int tsc_get_timecount(struct timecounter *);
static void tsc_delay(unsigned int);
static uint64_t tsc_dummy_cacheline __cacheline_aligned;
uint64_t tsc_freq __read_mostly; /* exported for sysctl */
static int64_t tsc_drift_max = 1000; /* max cycles */
static int64_t tsc_drift_observed;
uint64_t (*rdtsc)(void) = rdtsc_cpuid;
uint64_t (*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t (*cpu_counter32)(void) = cpu_counter32_cpuid;
int tsc_user_enabled = 1;
static volatile int64_t tsc_sync_val;
static volatile struct cpu_info *tsc_sync_cpu;
static struct timecounter tsc_timecounter = {
.tc_get_timecount = tsc_get_timecount,
.tc_counter_mask = ~0U,
.tc_name = "TSC",
.tc_quality = 3000,
};
bool
tsc_is_invariant(void)
{
struct cpu_info *ci;
uint32_t descs[4];
uint32_t family;
bool invariant;
if (!cpu_hascounter())
return false;
ci = curcpu();
invariant = false;
if (cpu_vendor == CPUVENDOR_INTEL) {
/*
* From Intel(tm) 64 and IA-32 Architectures Software
* Developer's Manual Volume 3A: System Programming Guide,
* Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
* where the TSC is known invariant:
*
* Pentium 4, Intel Xeon (family 0f, models 03 and higher)
* Core Solo and Core Duo processors (family 06, model 0e)
* Xeon 5100 series and Core 2 Duo (family 06, model 0f)
* Core 2 and Xeon (family 06, model 17)
* Atom (family 06, model 1c)
*
* We'll also assume that it's safe on the Pentium, and
* that it's safe on P-II and P-III Xeons due to the
* typical configuration of those systems.
*
*/
switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
case 0x05:
invariant = true;
break;
case 0x06:
invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
break;
case 0x0f:
invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
break;
}
} else if (cpu_vendor == CPUVENDOR_AMD) {
/*
* TSC and Power Management Events on AMD Processors
* Nov 2, 2005 Rich Brunner, AMD Fellow
* http://lkml.org/lkml/2005/11/4/173
*
* See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
* Management Features, AMD64 Architecture Programmer's
* Manual Volume 3: General-Purpose and System Instructions.
* The check is done below.
*/
/*
* AMD Errata 778: Processor Core Time Stamp Counters May
* Experience Drift
*
* This affects all family 15h and family 16h processors.
*/
switch (CPUID_TO_FAMILY(ci->ci_signature)) {
case 0x15:
case 0x16:
return false;
}
}
/*
* The best way to check whether the TSC counter is invariant or not
* is to check CPUID 80000007.
*/
family = CPUID_TO_BASEFAMILY(ci->ci_signature);
if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
&& ((family == 0x06) || (family == 0x0f))) {
x86_cpuid(0x80000000, descs);
if (descs[0] >= 0x80000007) {
x86_cpuid(0x80000007, descs);
invariant = (descs[3] & CPUID_APM_ITSC) != 0;
}
}
return invariant;
}
/* Setup function pointers for rdtsc() and timecounter(9). */
void
tsc_setfunc(struct cpu_info *ci)
{
bool use_lfence, use_mfence;
use_lfence = use_mfence = false;
/*
* XXX On AMD, we might be able to use lfence for some cases:
* a) if MSR_DE_CFG exists and bit 1 is set.
* b) family == 0x0f or 0x11. Those have no MSR_DE_CFG and
* lfence is always serializing.
*
* We don't use lfence here because test results showed mfence performed
* better than lfence even with MSR_DE_CFG set.
*/
if (cpu_vendor == CPUVENDOR_AMD)
use_mfence = true;
else if (cpu_vendor == CPUVENDOR_INTEL)
use_lfence = true;
/* LFENCE and MFENCE are applicable if SSE2 is set. */
if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
use_lfence = use_mfence = false;
#define TSC_SETFUNC(fence) \
do { \
rdtsc = rdtsc_##fence; \
cpu_counter = cpu_counter_##fence; \
cpu_counter32 = cpu_counter32_##fence; \
} while (/* CONSTCOND */ 0)
if (use_lfence)
TSC_SETFUNC(lfence);
else if (use_mfence)
TSC_SETFUNC(mfence);
else
TSC_SETFUNC(cpuid);
aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}
/*
* Initialize timecounter(9) and DELAY() function of TSC.
*
* This function is called after all secondary processors were brought up
* and drift has been measured, and after any other potential delay funcs
* have been installed (e.g. lapic_delay()).
*/
void
tsc_tc_init(void)
{
struct cpu_info *ci;
bool invariant;
if (!cpu_hascounter())
return;
ci = curcpu();
tsc_freq = ci->ci_data.cpu_cc_freq;
invariant = tsc_is_invariant();
if (!invariant) {
aprint_debug("TSC not known invariant on this CPU\n");
tsc_timecounter.tc_quality = -100;
} else if (tsc_drift_observed > tsc_drift_max) {
aprint_error("ERROR: %lld cycle TSC drift observed\n",
(long long)tsc_drift_observed);
tsc_timecounter.tc_quality = -100;
invariant = false;
} else if (vm_guest == VM_GUEST_NO) {
delay_func = tsc_delay;
} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
tsc_timecounter.tc_quality = -100;
}
if (tsc_freq != 0) {
tsc_timecounter.tc_frequency = tsc_freq;
tc_init(&tsc_timecounter);
}
}
/*
* Record drift (in clock cycles). Called during AP startup.
*/
void
tsc_sync_drift(int64_t drift)
{
if (drift < 0)
drift = -drift;
if (drift > tsc_drift_observed)
tsc_drift_observed = drift;
}
/*
* Called during startup of APs, by the boot processor. Interrupts
* are disabled on entry.
*/
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
uint64_t bptsc;
if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
panic("tsc_sync_bp: 1");
}
/* Prepare a cache miss for the other side. */
(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);
/* Flag our readiness. */
atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);
/* Wait for other side then read our TSC. */
while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
__insn_barrier();
}
bptsc = rdtsc();
/* Wait for the results to come in. */
while (tsc_sync_cpu == ci) {
x86_pause();
}
if (tsc_sync_cpu != NULL) {
panic("tsc_sync_bp: 2");
}
*bptscp = bptsc;
*aptscp = tsc_sync_val;
}
void
tsc_sync_bp(struct cpu_info *ci)
{
int64_t bptsc, aptsc, val, diff;
if (!cpu_hascounter())
return;
val = INT64_MAX;
for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
tsc_read_bp(ci, &bptsc, &aptsc);
diff = bptsc - aptsc;
if (ABS(diff) < ABS(val)) {
val = diff;
}
}
ci->ci_data.cpu_cc_skew = val;
}
/*
* Called during startup of AP, by the AP itself. Interrupts are
* disabled on entry.
*/
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
uint64_t tsc;
/* Wait for go-ahead from primary. */
while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
__insn_barrier();
}
/* Instruct primary to read its counter. */
atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);
/* Suffer a cache miss, then read TSC. */
__insn_barrier();
tsc = tsc_dummy_cacheline;
__insn_barrier();
tsc += rdtsc();
/* Post result. Ensure the whole value goes out atomically. */
(void)atomic_swap_64(&tsc_sync_val, tsc);
if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
panic("tsc_sync_ap");
}
}
void
tsc_sync_ap(struct cpu_info *ci)
{
if (!cpu_hascounter())
return;
for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
tsc_post_ap(ci);
}
}
static void
tsc_apply_cpu(void *arg1, void *arg2)
{
bool enable = arg1 != NULL;
if (enable) {
lcr4(rcr4() & ~CR4_TSD);
} else {
lcr4(rcr4() | CR4_TSD);
}
}
void
tsc_user_enable(void)
{
uint64_t xc;
xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
xc_wait(xc);
}
void
tsc_user_disable(void)
{
uint64_t xc;
xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
xc_wait(xc);
}
uint64_t
cpu_frequency(struct cpu_info *ci)
{
return ci->ci_data.cpu_cc_freq;
}
int
cpu_hascounter(void)
{
return cpu_feature[0] & CPUID_TSC;
}
static void
tsc_delay(unsigned int us)
{
uint64_t start, delta;
start = cpu_counter();
delta = (uint64_t)us * tsc_freq / 1000000;
while ((cpu_counter() - start) < delta) {
x86_pause();
}
}
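/*
* For example, with tsc_freq == 2000000000 (a 2 GHz TSC), a request to
* delay 10 microseconds spins until roughly 10 * 2000000000 / 1000000
* = 20000 counter ticks have elapsed.
*/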
static u_int
tsc_get_timecount(struct timecounter *tc)
{
#if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */
static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
static int lastwarn;
uint64_t cur, prev;
lwp_t *l = curlwp;
int ticks;
/*
* Previous value must be read before the counter and stored to
* after, because this routine can be called from interrupt context
* and may run over the top of an existing invocation. Ordering is
* guaranteed by "volatile" on md_tsc.
*/
prev = l->l_md.md_tsc;
cur = cpu_counter();
if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
__cpu_simple_lock_try(&lock)) {
ticks = getticks();
if (ticks - lastwarn >= hz) {
printf(
"WARNING: %s TSC went backwards by %u - "
"change sysctl(7) kern.timecounter?\n",
cpu_name(curcpu()), (unsigned)(prev - cur));
lastwarn = ticks;
}
__cpu_simple_unlock(&lock);
}
l->l_md.md_tsc = cur;
return (uint32_t)cur;
#else
return cpu_counter32();
#endif
}
/*
* tsc has been reset; zero the cached tsc of every lwp in the system
* so we don't spuriously report that the tsc has gone backward.
* Caller must ensure all LWPs are quiescent (except the current one,
* obviously) and interrupts are blocked while we update this.
*/
void
tsc_tc_reset(void)
{
struct lwp *l;
LIST_FOREACH(l, &alllwp, l_list)
l->l_md.md_tsc = 0;
}
/* $NetBSD: coda_vnops.c,v 1.118 2022/03/27 16:24:58 christos Exp $ */
/*
*
* Coda: an Experimental Distributed File System
* Release 3.1
*
* Copyright (c) 1987-1998 Carnegie Mellon University
* All Rights Reserved
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation, and
* that credit is given to Carnegie Mellon University in all documents
* and publicity pertaining to direct or indirect use of this code or its
* derivatives.
*
* CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS,
* SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS
* FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON
* DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
* RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF
* ANY DERIVATIVE WORK.
*
* Carnegie Mellon encourages users of this software to return any
* improvements or extensions that they make, and to grant Carnegie
* Mellon the rights to redistribute these changes without encumbrance.
*
* @(#) coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:26:46 rvb Exp $
*/
/*
* Mach Operating System
* Copyright (c) 1990 Carnegie-Mellon University
* Copyright (c) 1989 Carnegie-Mellon University
* All rights reserved. The CMU software License Agreement specifies
* the terms and conditions for use and redistribution.
*/
/*
* This code was written for the Coda file system at Carnegie Mellon
* University. Contributors include David Steere, James Kistler, and
* M. Satyanarayanan.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: coda_vnops.c,v 1.118 2022/03/27 16:24:58 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/errno.h>
#include <sys/acct.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/namei.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/dirent.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <coda/coda.h>
#include <coda/cnode.h>
#include <coda/coda_vnops.h>
#include <coda/coda_venus.h>
#include <coda/coda_opstats.h>
#include <coda/coda_subr.h>
#include <coda/coda_namecache.h>
#include <coda/coda_pioctl.h>
/*
* These flags select various performance enhancements.
*/
int coda_attr_cache = 1; /* Set to cache attributes in the kernel */
int coda_symlink_cache = 1; /* Set to cache symbolic link information */
int coda_access_cache = 1; /* Set to handle some access checks directly */
/* structure to keep track of vfs calls */
struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE];
#define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++)
#define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++)
#define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++)
#define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++)
/* What we are delaying for in printf */
static int coda_lockdebug = 0;
#define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__func__))
/* Definition of the vnode operation vector */
const struct vnodeopv_entry_desc coda_vnodeop_entries[] = {
{ &vop_default_desc, coda_vop_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, coda_lookup }, /* lookup */
{ &vop_create_desc, coda_create }, /* create */
{ &vop_mknod_desc, coda_vop_error }, /* mknod */
{ &vop_open_desc, coda_open }, /* open */
{ &vop_close_desc, coda_close }, /* close */
{ &vop_access_desc, coda_access }, /* access */
{ &vop_accessx_desc, genfs_accessx }, /* access */
{ &vop_getattr_desc, coda_getattr }, /* getattr */
{ &vop_setattr_desc, coda_setattr }, /* setattr */
{ &vop_read_desc, coda_read }, /* read */
{ &vop_write_desc, coda_write }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, coda_ioctl }, /* ioctl */
{ &vop_mmap_desc, genfs_mmap }, /* mmap */
{ &vop_fsync_desc, coda_fsync }, /* fsync */
{ &vop_remove_desc, coda_remove }, /* remove */
{ &vop_link_desc, coda_link }, /* link */
{ &vop_rename_desc, coda_rename }, /* rename */
{ &vop_mkdir_desc, coda_mkdir }, /* mkdir */
{ &vop_rmdir_desc, coda_rmdir }, /* rmdir */
{ &vop_symlink_desc, coda_symlink }, /* symlink */
{ &vop_readdir_desc, coda_readdir }, /* readdir */
{ &vop_readlink_desc, coda_readlink }, /* readlink */
{ &vop_abortop_desc, coda_abortop }, /* abortop */
{ &vop_inactive_desc, coda_inactive }, /* inactive */
{ &vop_reclaim_desc, coda_reclaim }, /* reclaim */
{ &vop_lock_desc, coda_lock }, /* lock */
{ &vop_unlock_desc, coda_unlock }, /* unlock */
{ &vop_bmap_desc, coda_bmap }, /* bmap */
{ &vop_strategy_desc, coda_strategy }, /* strategy */
{ &vop_print_desc, coda_vop_error }, /* print */
{ &vop_islocked_desc, coda_islocked }, /* islocked */
{ &vop_pathconf_desc, coda_pathconf }, /* pathconf */
{ &vop_advlock_desc, coda_vop_nop }, /* advlock */
{ &vop_bwrite_desc, coda_vop_error }, /* bwrite */
{ &vop_seek_desc, genfs_seek }, /* seek */
{ &vop_poll_desc, genfs_poll }, /* poll */
{ &vop_getpages_desc, coda_getpages }, /* getpages */
{ &vop_putpages_desc, coda_putpages }, /* putpages */
{ NULL, NULL }
};
static void coda_print_vattr(struct vattr *);
int (**coda_vnodeop_p)(void *);
const struct vnodeopv_desc coda_vnodeop_opv_desc =
{ &coda_vnodeop_p, coda_vnodeop_entries };
/* Definitions of NetBSD vnodeop interfaces */
/*
* A generic error routine. Return EIO without looking at arguments.
*/
int
coda_vop_error(void *anon) {
struct vnodeop_desc **desc = (struct vnodeop_desc **)anon;
if (codadebug) {
myprintf(("%s: Vnode operation %s called (error).\n",
__func__, (*desc)->vdesc_name));
}
return EIO;
}
/* A generic do-nothing. */
int
coda_vop_nop(void *anon) {
struct vnodeop_desc **desc = (struct vnodeop_desc **)anon;
if (codadebug) {
myprintf(("Vnode operation %s called, but unsupported\n",
(*desc)->vdesc_name));
}
return (0);
}
int
coda_vnodeopstats_init(void)
{
int i;
for(i=0;i<CODA_VNODEOPS_SIZE;i++) {
coda_vnodeopstats[i].opcode = i;
coda_vnodeopstats[i].entries = 0;
coda_vnodeopstats[i].sat_intrn = 0;
coda_vnodeopstats[i].unsat_intrn = 0;
coda_vnodeopstats[i].gen_intrn = 0;
}
return 0;
}
/*
* XXX The entire relationship between VOP_OPEN and having a container
* file (via venus_open) needs to be reexamined. In particular, it's
* valid to open/mmap/close and then reference. Instead of doing
* VOP_OPEN when getpages needs a container, we should do the
* venus_open part, and record that the vnode has opened the container
* for getpages, and do the matching logical close on coda_inactive.
* Further, coda_rdwr needs a container file, and sometimes needs to
* do the equivalent of open (core dumps).
*/
/*
* coda_open calls Venus to return the device and inode of the
* container file, and then obtains a vnode for that file. The
* container vnode is stored in the coda vnode, and a reference is
* added for each open file.
*/
int
coda_open(void *v)
{
/*
* NetBSD can pass the O_EXCL flag in mode, even though the check
* has already happened. Venus defensively assumes that if open
* is passed O_EXCL, it must be a bug. We strip the flag here.
*/
/* true args */
struct vop_open_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
int flag = ap->a_mode & (~O_EXCL);
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
dev_t dev; /* container file device, inode, vnode */
ino_t inode;
vnode_t *container_vp;
MARK_ENTRY(CODA_OPEN_STATS);
KASSERT(VOP_ISLOCKED(vp));
/* Check for open of control file. */
if (IS_CTL_VP(vp)) {
/* if (WRITABLE(flag)) */
if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) {
MARK_INT_FAIL(CODA_OPEN_STATS);
return(EACCES);
}
MARK_INT_SAT(CODA_OPEN_STATS);
return(0);
}
error = venus_open(vtomi(vp), &cp->c_fid, flag, cred, curlwp, &dev, &inode);
if (error)
return (error);
if (!error) {
CODADEBUG(CODA_OPEN, myprintf((
"%s: dev 0x%llx inode %llu result %d\n", __func__,
(unsigned long long)dev, (unsigned long long)inode, error));)
}
/*
* Obtain locked and referenced container vnode from container
* device/inode.
*/
error = coda_grab_vnode(vp, dev, inode, &container_vp);
if (error)
return (error);
/* Save the vnode pointer for the container file. */
if (cp->c_ovp == NULL) {
cp->c_ovp = container_vp;
} else {
if (cp->c_ovp != container_vp)
/*
* Perhaps venus returned a different container, or
* something else went wrong.
*/
panic("%s: cp->c_ovp != container_vp", __func__);
}
cp->c_ocount++;
/* Flush the attribute cache if writing the file. */
if (flag & FWRITE) {
cp->c_owrite++;
cp->c_flags &= ~C_VATTR;
}
/*
* Save the <device, inode> pair for the container file to speed
* up subsequent reads while closed (mmap, program execution).
* This is perhaps safe because venus will invalidate the node
* before changing the container file mapping.
*/
cp->c_device = dev;
cp->c_inode = inode;
/* Open the container file. */
error = VOP_OPEN(container_vp, flag, cred);
/*
* Drop the lock on the container, after we have done VOP_OPEN
* (which requires a locked vnode).
*/
VOP_UNLOCK(container_vp);
return(error);
}
/*
* Close the cache file used for I/O and notify Venus.
*/
int
coda_close(void *v)
{
/* true args */
struct vop_close_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
int flag = ap->a_fflag;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_CLOSE_STATS);
/* Check for close of control file. */
if (IS_CTL_VP(vp)) {
MARK_INT_SAT(CODA_CLOSE_STATS);
return(0);
}
/*
* XXX The IS_UNMOUNTING part of this is very suspect.
*/
if (IS_UNMOUNTING(cp)) {
if (cp->c_ovp) {
#ifdef CODA_VERBOSE
printf("%s: destroying container %d, ufs vp %p of vp %p/cp %p\n",
__func__, vrefcnt(vp), cp->c_ovp, vp, cp);
#endif
#ifdef hmm
vgone(cp->c_ovp);
#else
vn_lock(cp->c_ovp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(cp->c_ovp, flag, cred); /* Do errors matter here? */
vput(cp->c_ovp);
#endif
} else {
#ifdef CODA_VERBOSE
printf("%s: NO container vp %p/cp %p\n", __func__, vp, cp);
#endif
}
return ENODEV;
}
/* Lock the container node, and VOP_CLOSE it. */
vn_lock(cp->c_ovp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(cp->c_ovp, flag, cred); /* Do errors matter here? */
/*
* Drop the lock we just obtained, and vrele the container vnode.
* Decrement reference counts, and clear container vnode pointer on
* last close.
*/
vput(cp->c_ovp);
if (flag & FWRITE)
--cp->c_owrite;
if (--cp->c_ocount == 0)
cp->c_ovp = NULL;
error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, curlwp);
CODADEBUG(CODA_CLOSE, myprintf(("%s: result %d\n", __func__, error)); )
return(error);
}
int
coda_read(void *v)
{
struct vop_read_args *ap = v;
ENTRY;
return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ,
ap->a_ioflag, ap->a_cred, curlwp));
}
int
coda_write(void *v)
{
struct vop_write_args *ap = v;
ENTRY;
return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE,
ap->a_ioflag, ap->a_cred, curlwp));
}
int
coda_rdwr(vnode_t *vp, struct uio *uiop, enum uio_rw rw, int ioflag,
kauth_cred_t cred, struct lwp *l)
{
/* upcall decl */
/* NOTE: container file operation!!! */
/* locals */
struct cnode *cp = VTOC(vp);
vnode_t *cfvp = cp->c_ovp;
struct proc *p = l->l_proc;
int opened_internally = 0;
int error = 0;
MARK_ENTRY(CODA_RDWR_STATS);
CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %lu, %lld)\n", rw,
uiop->uio_iov->iov_base, (unsigned long) uiop->uio_resid,
(long long) uiop->uio_offset)); )
/* Check for rdwr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(EINVAL);
}
/* Redirect the request to UFS. */
/*
* If file is not already open this must be a page
* {read,write} request. Iget the cache file's inode
* pointer if we still have its <device, inode> pair.
* Otherwise, we must do an internal open to derive the
* pair.
* XXX Integrate this into a coherent strategy for container
* file acquisition.
*/
if (cfvp == NULL) {
/*
* If we're dumping core, do the internal open. Otherwise
* venus won't have the correct size of the core when
* it's completely written.
*/
if (cp->c_inode != 0 && !(p && (p->p_acflag & ACORE))) {
#ifdef CODA_VERBOSE
printf("%s: grabbing container vnode, losing reference\n",
__func__);
#endif
/* Get locked and refed vnode. */
error = coda_grab_vnode(vp, cp->c_device, cp->c_inode, &cfvp);
if (error) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(error);
}
/*
* Drop lock.
* XXX Where is reference released.
*/
VOP_UNLOCK(cfvp);
}
else {
#ifdef CODA_VERBOSE
printf("%s: internal VOP_OPEN\n", __func__);
#endif
opened_internally = 1;
MARK_INT_GEN(CODA_OPEN_STATS);
error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred);
#ifdef CODA_VERBOSE
printf("%s: Internally Opening %p\n", __func__, vp);
#endif
if (error) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(error);
}
cfvp = cp->c_ovp;
}
}
/* Have UFS handle the call. */
CODADEBUG(CODA_RDWR, myprintf(("%s: fid = %s, refcnt = %d\n", __func__,
coda_f2s(&cp->c_fid), vrefcnt(CTOV(cp)))); )
if (rw == UIO_READ) {
error = VOP_READ(cfvp, uiop, ioflag, cred);
} else {
error = VOP_WRITE(cfvp, uiop, ioflag, cred);
}
if (error)
MARK_INT_FAIL(CODA_RDWR_STATS);
else
MARK_INT_SAT(CODA_RDWR_STATS);
/* Do an internal close if necessary. */
if (opened_internally) {
MARK_INT_GEN(CODA_CLOSE_STATS);
(void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred);
}
/* Invalidate cached attributes if writing. */
if (rw == UIO_WRITE)
cp->c_flags &= ~C_VATTR;
return(error);
}
int
coda_ioctl(void *v)
{
/* true args */
struct vop_ioctl_args *ap = v;
vnode_t *vp = ap->a_vp;
int com = ap->a_command;
void *data = ap->a_data;
int flag = ap->a_fflag;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
vnode_t *tvp;
struct PioctlData *iap = (struct PioctlData *)data;
namei_simple_flags_t sflags;
MARK_ENTRY(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));)
/* Don't check for operation on a dying object, for ctlvp it
shouldn't matter */
/* Must be control object to succeed. */
if (!IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: vp != ctlvp", __func__));)
return (EOPNOTSUPP);
}
/* Look up the pathname. */
/* Should we use the name cache here? It would get it from
lookupname sooner or later anyway, right? */
sflags = iap->follow ? NSM_FOLLOW_NOEMULROOT : NSM_NOFOLLOW_NOEMULROOT;
error = namei_simple_user(iap->path, sflags, &tvp);
if (error) {
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: lookup returns %d\n",
__func__, error));)
return(error);
}
/*
* Make sure this is a coda style cnode, but it may be a
* different vfsp
*/
/* XXX: this totally violates the comment about vtagtype in vnode.h */
if (tvp->v_tag != VT_CODA) {
vrele(tvp);
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: %s not a coda object\n",
__func__, iap->path));)
return(EINVAL);
}
if (iap->vi.in_size > VC_MAXDATASIZE || iap->vi.out_size > VC_MAXDATASIZE) {
vrele(tvp);
return(EINVAL);
}
error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data,
cred, curlwp);
if (error)
MARK_INT_FAIL(CODA_IOCTL_STATS);
else
CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); )
vrele(tvp);
return(error);
}
/*
* To reduce the cost of a user-level venus, we cache attributes in
* the kernel. Each cnode has storage allocated for an attribute. If
* c_vattr is valid, return a reference to it. Otherwise, get the
* attributes from venus and store them in the cnode. There is some
* question if this method is a security leak. But I think that in
* order to make this call, the user must have done a lookup and
* opened the file, and therefore should already have access.
*/
int
coda_getattr(void *v)
{
/* true args */
struct vop_getattr_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct vattr *vap = ap->a_vap;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_GETATTR_STATS);
/* Check for getattr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_GETATTR_STATS);
return(ENOENT);
}
/* Check to see if the attributes have already been cached */
if (VALID_VATTR(cp)) {
CODADEBUG(CODA_GETATTR, { myprintf(("%s: attr cache hit: %s\n",
__func__, coda_f2s(&cp->c_fid)));})
CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR))
coda_print_vattr(&cp->c_vattr); )
*vap = cp->c_vattr;
MARK_INT_SAT(CODA_GETATTR_STATS);
return(0);
}
error = venus_getattr(vtomi(vp), &cp->c_fid, cred, curlwp, vap);
if (!error) {
CODADEBUG(CODA_GETATTR, myprintf(("%s miss %s: result %d\n",
__func__, coda_f2s(&cp->c_fid), error)); )
CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR))
coda_print_vattr(vap); )
/* If not open for write, store attributes in cnode */
if ((cp->c_owrite == 0) && (coda_attr_cache)) {
cp->c_vattr = *vap;
cp->c_flags |= C_VATTR;
}
}
return(error);
}
int
coda_setattr(void *v)
{
/* true args */
struct vop_setattr_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct vattr *vap = ap->a_vap;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_SETATTR_STATS);
/* Check for setattr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_SETATTR_STATS);
return(ENOENT);
}
if (codadebug & CODADBGMSK(CODA_SETATTR)) {
coda_print_vattr(vap);
}
error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, curlwp);
if (!error)
cp->c_flags &= ~C_VATTR;
CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); )
return(error);
}
int
coda_access(void *v)
{
/* true args */
struct vop_access_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
accmode_t accmode = ap->a_accmode;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_ACCESS_STATS);
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
/* Check for access of control object. Only read access is
allowed on it. */
if (IS_CTL_VP(vp)) {
/* bogus hack - all will be marked as successes */
MARK_INT_SAT(CODA_ACCESS_STATS);
return(((accmode & VREAD) && !(accmode & (VWRITE | VEXEC)))
? 0 : EACCES);
}
/*
* if the file is a directory, and we are checking exec (eg lookup)
* access, and the file is in the namecache, then the user must have
* lookup access to it.
*/
if (coda_access_cache) {
if ((vp->v_type == VDIR) && (accmode & VEXEC)) {
if (coda_nc_lookup(cp, ".", 1, cred)) {
MARK_INT_SAT(CODA_ACCESS_STATS);
return(0); /* it was in the cache */
}
}
}
error = venus_access(vtomi(vp), &cp->c_fid, accmode, cred, curlwp);
return(error);
}
/*
* CODA abort op, called after namei() when a CREATE/DELETE isn't actually
* done. If a buffer has been saved in anticipation of a coda_create or
* a coda_remove, delete it.
*/
/* ARGSUSED */
int
coda_abortop(void *v)
{
/* true args */
struct vop_abortop_args /* {
vnode_t *a_dvp;
struct componentname *a_cnp;
} */ *ap = v;
(void)ap;
/* upcall decl */
/* locals */
return (0);
}
int
coda_readlink(void *v)
{
/* true args */
struct vop_readlink_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct uio *uiop = ap->a_uio;
kauth_cred_t cred = ap->a_cred;
/* locals */
struct lwp *l = curlwp;
int error;
char *str;
int len;
MARK_ENTRY(CODA_READLINK_STATS);
/* Check for readlink of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_READLINK_STATS);
return(ENOENT);
}
if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */
uiop->uio_rw = UIO_READ;
error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop);
if (error)
MARK_INT_FAIL(CODA_READLINK_STATS);
else
MARK_INT_SAT(CODA_READLINK_STATS);
return(error);
}
error = venus_readlink(vtomi(vp), &cp->c_fid, cred, l, &str, &len);
if (!error) {
uiop->uio_rw = UIO_READ;
error = uiomove(str, len, uiop);
if (coda_symlink_cache) {
cp->c_symlink = str;
cp->c_symlen = len;
cp->c_flags |= C_SYMLINK;
} else
CODA_FREE(str, len);
}
CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));)
return(error);
}
int
coda_fsync(void *v)
{
/* true args */
struct vop_fsync_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
kauth_cred_t cred = ap->a_cred;
/* locals */
vnode_t *convp = cp->c_ovp;
int error;
MARK_ENTRY(CODA_FSYNC_STATS);
/* Check for fsync on an unmounting object */
/* The NetBSD kernel, in its infinite wisdom, can try to fsync
* after an unmount has been initiated. This is a Bad Thing,
* which we have to avoid. Not a legitimate failure for stats.
*/
if (IS_UNMOUNTING(cp)) {
return(ENODEV);
}
/* Check for fsync of control object or uninitialized cnode. */
if (IS_CTL_VP(vp) || vp->v_type == VNON) {
MARK_INT_SAT(CODA_FSYNC_STATS);
return(0);
}
if (convp)
VOP_FSYNC(convp, cred, MNT_WAIT, 0, 0);
/*
* We can expect fsync on any vnode at all if venus is purging it.
* Venus can't very well answer the fsync request, now can it?
* Hopefully, it won't have to, because hopefully, venus preserves
* the (possibly untrue) invariant that it never purges an open
* vnode. Hopefully.
*/
if (cp->c_flags & C_PURGING) {
return(0);
}
error = venus_fsync(vtomi(vp), &cp->c_fid, cred, curlwp);
CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); )
return(error);
}
/*
* vp is locked on entry, and we must unlock it.
* XXX This routine is suspect and probably needs rewriting.
*/
int
coda_inactive(void *v)
{
/* true args */
struct vop_inactive_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
kauth_cred_t cred __unused = NULL;
/* We don't need to send inactive to venus - DCS */
MARK_ENTRY(CODA_INACTIVE_STATS);
if (IS_CTL_VP(vp)) {
MARK_INT_SAT(CODA_INACTIVE_STATS);
return 0;
}
CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %s, vfsp %p\n",
coda_f2s(&cp->c_fid), vp->v_mount));)
if (vp->v_mount->mnt_data == NULL) {
myprintf(("Help! vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp));
panic("badness in coda_inactive");
}
#ifdef CODA_VERBOSE
/* Sanity checks that perhaps should be panic. */
if (vrefcnt(vp) > 1)
printf("%s: %p usecount %d\n", __func__, vp, vrefcnt(vp));
if (cp->c_ovp != NULL)
printf("%s: %p ovp != NULL\n", __func__, vp);
#endif
/* XXX Do we need to VOP_CLOSE container vnodes? */
if (!IS_UNMOUNTING(cp))
*ap->a_recycle = true;
MARK_INT_SAT(CODA_INACTIVE_STATS);
return(0);
}
/*
* Coda does not use the normal namecache, but a private version.
* Consider how to use the standard facility instead.
*/
int
coda_lookup(void *v)
{
/* true args */
struct vop_lookup_v2_args *ap = v;
/* (locked) vnode of dir in which to do lookup */
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
/* output variable for result */
vnode_t **vpp = ap->a_vpp;
/* name to lookup */
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
struct cnode *cp;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
CodaFid VFid;
int vtype;
int error = 0;
MARK_ENTRY(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: %s in %s\n", __func__,
nm, coda_f2s(&dcp->c_fid)));)
/*
* XXX componentname flags in MODMASK are not handled at all
*/
/*
* The overall strategy is to switch on the lookup type and get a
* result vnode that is vref'd but not locked.
*/
/* Check for lookup of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = coda_ctlvp;
vref(*vpp);
MARK_INT_SAT(CODA_LOOKUP_STATS);
goto exit;
}
/* Avoid trying to hand venus an unreasonably long name. */
if (len+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: name too long: %s (%s)\n",
__func__, coda_f2s(&dcp->c_fid), nm));)
*vpp = (vnode_t *)0;
error = EINVAL;
goto exit;
}
/*
* Try to resolve the lookup in the minicache. If that fails, ask
* venus to do the lookup. XXX The interaction between vnode
* locking and any locking that coda does is not clear.
*/
cp = coda_nc_lookup(dcp, nm, len, cred);
if (cp) {
*vpp = CTOV(cp);
vref(*vpp);
CODADEBUG(CODA_LOOKUP,
myprintf(("lookup result %d vpp %p\n",error,*vpp));)
} else {
/* The name wasn't cached, so ask Venus. */
error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, l, &VFid,
&vtype);
if (error) {
MARK_INT_FAIL(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: lookup error on %s (%s)%d\n",
__func__, coda_f2s(&dcp->c_fid), nm, error));)
*vpp = (vnode_t *)0;
} else {
MARK_INT_SAT(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: %s type %o result %d\n",
__func__, coda_f2s(&VFid), vtype, error)); )
cp = make_coda_node(&VFid, dvp->v_mount, vtype);
*vpp = CTOV(cp);
/* vpp is now vrefed. */
/*
* Unless this vnode is marked CODA_NOCACHE, enter it into
* the coda name cache to avoid a future venus round-trip.
* XXX Interaction with componentname NOCACHE is unclear.
*/
if (!(vtype & CODA_NOCACHE))
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
}
}
exit:
/*
* If we are creating, and this was the last name to be looked up,
* and the error was ENOENT, then make the leaf NULL and return
* success.
* XXX Check against new lookup rules.
*/
if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME))
&& (cnp->cn_flags & ISLASTCN)
&& (error == ENOENT))
{
error = EJUSTRETURN;
*ap->a_vpp = NULL;
}
return(error);
}
/*ARGSUSED*/
int
coda_create(void *v)
{
/* true args */
struct vop_create_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct vattr *va = ap->a_vap;
int exclusive = 1;
int mode = ap->a_vap->va_mode;
vnode_t **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
struct cnode *cp;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
CodaFid VFid;
struct vattr attr;
MARK_ENTRY(CODA_CREATE_STATS);
/* All creates are exclusive XXX */
/* I'm assuming the 'mode' argument is the file mode bits XXX */
/* Check for create of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_CREATE_STATS);
return(EACCES);
}
error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, l, &VFid, &attr);
if (!error) {
/*
* XXX Violation of venus/kernel invariants is a difficult case,
* but venus should not be able to cause a panic.
*/
/* If this is an exclusive create, panic if the file already exists. */
/* Venus should have detected the file and reported EEXIST. */
if ((exclusive == 1) &&
(coda_find(&VFid) != NULL))
panic("cnode existed for newly created file!");
cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type);
*vpp = CTOV(cp);
/* XXX vnodeops doesn't say this argument can be changed. */
/* Update va to reflect the new attributes. */
(*va) = attr;
/* Update the attribute cache and mark it as valid */
if (coda_attr_cache) {
VTOC(*vpp)->c_vattr = attr;
VTOC(*vpp)->c_flags |= C_VATTR;
}
/* Invalidate parent's attr cache (modification time has changed). */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* enter the new vnode in the Name Cache */
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
CODADEBUG(CODA_CREATE, myprintf(("%s: %s, result %d\n", __func__,
coda_f2s(&VFid), error)); )
} else {
*vpp = (vnode_t *)0;
CODADEBUG(CODA_CREATE, myprintf(("%s: create error %d\n", __func__,
error));)
}
if (!error) {
#ifdef CODA_VERBOSE
if ((cnp->cn_flags & LOCKLEAF) == 0)
/* This should not happen; flags are for lookup only. */
printf("%s: LOCKLEAF not set!\n", __func__);
#endif
}
return(error);
}
int
coda_remove(void *v)
{
/* true args */
struct vop_remove_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *cp = VTOC(dvp);
vnode_t *vp = ap->a_vp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *tp;
MARK_ENTRY(CODA_REMOVE_STATS);
CODADEBUG(CODA_REMOVE, myprintf(("%s: %s in %s\n", __func__,
nm, coda_f2s(&cp->c_fid)));)
/* Remove the file's entry from the CODA Name Cache */
/* We're being conservative here, it might be that this person
* doesn't really have sufficient access to delete the file
* but we feel zapping the entry won't really hurt anyone -- dcs
*/
/* I'm gonna go out on a limb here. If a file and a hardlink to it
* exist, and one is removed, the link count on the other will be
* off by 1. We could either invalidate the attrs if cached, or
* fix them. I'll try to fix them. DCS 11/8/94
*/
tp = coda_nc_lookup(VTOC(dvp), nm, len, cred);
if (tp) {
if (VALID_VATTR(tp)) { /* If attrs are cached */
if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */
tp->c_vattr.va_nlink--;
}
}
coda_nc_zapfile(VTOC(dvp), nm, len);
/* No need to flush it if it doesn't exist! */
}
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* Check for remove of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_REMOVE_STATS);
return(ENOENT);
}
error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, l);
CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); )
/*
* Unlock and release child (avoiding double if ".").
*/
if (dvp == vp) {
vrele(vp);
} else {
vput(vp);
}
return(error);
}
/*
* dvp is the directory where the link is to go, and is locked.
* vp is the object to be linked to, and is unlocked.
* At exit, we must unlock dvp, and vput dvp.
*/
int
coda_link(void *v)
{
/* true args */
struct vop_link_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
MARK_ENTRY(CODA_LINK_STATS);
if (codadebug & CODADBGMSK(CODA_LINK)) {
myprintf(("%s: vp fid: %s\n", __func__, coda_f2s(&cp->c_fid)));
myprintf(("%s: dvp fid: %s\n", __func__, coda_f2s(&dcp->c_fid)));
}
/* Check for link to/from control object. */
if (IS_CTL_NAME(dvp, nm, len) || IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_LINK_STATS);
return(EACCES);
}
/* If linking . to a name, error out earlier. */
if (vp == dvp) {
#ifdef CODA_VERBOSE
printf("%s coda_link vp==dvp\n", __func__);
#endif
error = EISDIR;
goto exit;
}
/* XXX Why does venus_link need the vnode to be locked? */
if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) {
#ifdef CODA_VERBOSE
printf("%s: couldn't lock vnode %p\n", __func__, vp);
#endif
error = EFAULT; /* XXX better value */
goto exit;
}
error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp,
dvp, 0);
if (error) {
/* Drop the lock taken above before bailing out. */
VOP_UNLOCK(vp);
goto exit;
}
error = venus_link(vtomi(vp), &cp->c_fid, &dcp->c_fid, nm, len, cred, l);
VOP_UNLOCK(vp);
/* Invalidate parent's attr cache (the modification time has changed). */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* Invalidate child's attr cache (XXX why). */
VTOC(vp)->c_flags &= ~C_VATTR;
CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); )
exit:
return(error);
}
int
coda_rename(void *v)
{
/* true args */
struct vop_rename_args *ap = v;
vnode_t *odvp = ap->a_fdvp;
struct cnode *odcp = VTOC(odvp);
struct componentname *fcnp = ap->a_fcnp;
vnode_t *ndvp = ap->a_tdvp;
struct cnode *ndcp = VTOC(ndvp);
struct componentname *tcnp = ap->a_tcnp;
kauth_cred_t cred = fcnp->cn_cred;
struct lwp *l = curlwp;
/* true args */
int error;
const char *fnm = fcnp->cn_nameptr;
int flen = fcnp->cn_namelen;
const char *tnm = tcnp->cn_nameptr;
int tlen = tcnp->cn_namelen;
MARK_ENTRY(CODA_RENAME_STATS);
/* Hmmm. The vnodes are already looked up. Perhaps they are locked?
This could be Bad. XXX */
#ifdef OLD_DIAGNOSTIC
if ((fcnp->cn_cred != tcnp->cn_cred)
|| (fcnp->cn_lwp != tcnp->cn_lwp))
{
panic("%s: component names don't agree", __func__);
}
#endif
/* Check for rename involving control object. */
if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) {
MARK_INT_FAIL(CODA_RENAME_STATS);
return(EACCES);
}
/* Problem with moving directories -- need to flush entry for .. */
if (odvp != ndvp) {
struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred);
if (ovcp) {
vnode_t *ovp = CTOV(ovcp);
if ((ovp) &&
(ovp->v_type == VDIR)) /* If it's a directory */
coda_nc_zapfile(VTOC(ovp),"..", 2);
}
}
/* Remove the entries for both source and target files */
coda_nc_zapfile(VTOC(odvp), fnm, flen);
coda_nc_zapfile(VTOC(ndvp), tnm, tlen);
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(odvp)->c_flags &= ~C_VATTR;
VTOC(ndvp)->c_flags &= ~C_VATTR;
if (flen+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_RENAME_STATS);
error = EINVAL;
goto exit;
}
if (tlen+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_RENAME_STATS);
error = EINVAL;
goto exit;
}
error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, l);
exit:
CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));)
/* XXX - do we need to call cache_purge on the moved vnode? */
cache_purge(ap->a_fvp);
/* It seems to be incumbent on us to drop locks on all four vnodes */
/* From-vnodes are not locked, only ref'd. To-vnodes are locked. */
vrele(ap->a_fvp);
vrele(odvp);
if (ap->a_tvp) {
if (ap->a_tvp == ndvp) {
vrele(ap->a_tvp);
} else {
vput(ap->a_tvp);
}
}
vput(ndvp);
return(error);
}
int
coda_mkdir(void *v)
{
/* true args */
struct vop_mkdir_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct componentname *cnp = ap->a_cnp;
struct vattr *va = ap->a_vap;
vnode_t **vpp = ap->a_vpp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *cp;
CodaFid VFid;
struct vattr ova;
MARK_ENTRY(CODA_MKDIR_STATS);
/* Check for mkdir of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_MKDIR_STATS);
return(EACCES);
}
if (len+1 > CODA_MAXNAMLEN) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_MKDIR_STATS);
return(EACCES);
}
error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, l, &VFid, &ova);
if (!error) {
if (coda_find(&VFid) != NULL)
panic("cnode existed for newly created directory!");
cp = make_coda_node(&VFid, dvp->v_mount, va->va_type);
*vpp = CTOV(cp);
/* enter the new vnode in the Name Cache */
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
/* as a side effect, enter "." and ".." for the directory */
coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp));
coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp));
if (coda_attr_cache) {
VTOC(*vpp)->c_vattr = ova; /* update the attr cache */
VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */
}
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(dvp)->c_flags &= ~C_VATTR;
CODADEBUG( CODA_MKDIR, myprintf(("%s: %s result %d\n", __func__,
coda_f2s(&VFid), error)); )
} else {
*vpp = (vnode_t *)0;
CODADEBUG(CODA_MKDIR, myprintf(("%s error %d\n", __func__, error));)
}
return(error);
}
int
coda_rmdir(void *v)
{
/* true args */
struct vop_rmdir_v2_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
vnode_t *vp = ap->a_vp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* true args */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *cp;
MARK_ENTRY(CODA_RMDIR_STATS);
/* Check for rmdir of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_RMDIR_STATS);
return(ENOENT);
}
/* Can't remove . in self. */
if (dvp == vp) {
#ifdef CODA_VERBOSE
printf("%s: dvp == vp\n", __func__);
#endif
error = EINVAL;
goto exit;
}
/*
* The caller may not have adequate permissions, and the venus
* operation may fail, but it doesn't hurt from a correctness
* viewpoint to invalidate cache entries.
* XXX Why isn't this done after the venus_rmdir call?
*/
/* Look up child in name cache (by name, from parent). */
cp = coda_nc_lookup(dcp, nm, len, cred);
/* If found, remove all children of the child (., ..). */
if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL);
/* Remove child's own entry. */
coda_nc_zapfile(dcp, nm, len);
/* Invalidate parent's attr cache (the modification time has changed). */
dcp->c_flags &= ~C_VATTR;
error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, l);
CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); )
exit:
/* unlock and release child */
if (dvp == vp) {
vrele(vp);
} else {
vput(vp);
}
return(error);
}
int
coda_symlink(void *v)
{
/* true args */
struct vop_symlink_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
/* a_vpp is used in place below */
struct componentname *cnp = ap->a_cnp;
struct vattr *tva = ap->a_vap;
char *path = ap->a_target;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
u_long saved_cn_flags;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
int plen = strlen(path);
/*
* Here's the strategy for the moment: perform the symlink, then
* do a lookup to grab the resulting vnode. I know this requires
* two communications with Venus for a new symbolic link, but
* that's the way the ball bounces. I don't yet want to change
* the way the Mach symlink works. When Mach support is
* deprecated, we should change symlink so that the common case
* returns the resultant vnode in a vpp argument.
*/
MARK_ENTRY(CODA_SYMLINK_STATS);
/* Check for symlink of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EACCES;
goto exit;
}
if (plen+1 > CODA_MAXPATHLEN) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EINVAL;
goto exit;
}
if (len+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EINVAL;
goto exit;
}
error = venus_symlink(vtomi(dvp), &dcp->c_fid, path, plen, nm, len, tva, cred, l);
/* Invalidate the parent's attr cache (modification time has changed). */
dcp->c_flags &= ~C_VATTR;
if (!error) {
/*
* VOP_SYMLINK is not defined to pay attention to cnp->cn_flags;
* these are defined only for VOP_LOOKUP. We desire to reuse
* cnp for a VOP_LOOKUP operation, and must be sure to not pass
* stray flags passed to us. Such stray flags can occur because
* sys_symlink makes a namei call and then reuses the
* componentname structure.
*/
/*
* XXX Arguably we should create our own componentname structure
* and not reuse the one that was passed in.
*/
saved_cn_flags = cnp->cn_flags;
cnp->cn_flags &= ~(MODMASK | OPMASK);
cnp->cn_flags |= LOOKUP;
error = VOP_LOOKUP(dvp, ap->a_vpp, cnp);
cnp->cn_flags = saved_cn_flags;
}
exit:
CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); )
return(error);
}
/*
* Read directory entries.
*/
int
coda_readdir(void *v)
{
/* true args */
struct vop_readdir_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct uio *uiop = ap->a_uio;
kauth_cred_t cred = ap->a_cred;
int *eofflag = ap->a_eofflag;
/* upcall decl */
/* locals */
size_t initial_resid = uiop->uio_resid;
int error = 0;
int opened_internally = 0;
int ncookies;
char *buf;
struct vnode *cvp;
struct dirent *dirp;
MARK_ENTRY(CODA_READDIR_STATS);
CODADEBUG(CODA_READDIR, myprintf(("%s: (%p, %lu, %lld)\n", __func__,
uiop->uio_iov->iov_base, (unsigned long) uiop->uio_resid,
(long long) uiop->uio_offset)); )
/* Check for readdir of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_READDIR_STATS);
return ENOENT;
}
/* If directory is not already open do an "internal open" on it. */
if (cp->c_ovp == NULL) {
opened_internally = 1;
MARK_INT_GEN(CODA_OPEN_STATS);
error = VOP_OPEN(vp, FREAD, cred);
#ifdef CODA_VERBOSE
printf("%s: Internally Opening %p\n", __func__, vp);
#endif
if (error)
return error;
KASSERT(cp->c_ovp != NULL);
}
cvp = cp->c_ovp;
CODADEBUG(CODA_READDIR, myprintf(("%s: fid = %s, refcnt = %d\n",
__func__, coda_f2s(&cp->c_fid), vrefcnt(cvp))); )
if (ap->a_ncookies) {
ncookies = ap->a_uio->uio_resid / _DIRENT_RECLEN(dirp, 1);
*ap->a_ncookies = 0;
*ap->a_cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
}
buf = kmem_alloc(CODA_DIRBLKSIZ, KM_SLEEP);
dirp = kmem_alloc(sizeof(*dirp), KM_SLEEP);
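/*
 * The loop below reads the container file in CODA_DIRBLKSIZ chunks and
 * converts each struct venus_dirent into a native struct dirent before
 * copying it out with uiomove().  uio_offset tracks offsets in the
 * Venus directory format, so it is saved and restored around uiomove()
 * and then advanced by the Venus record length rather than the native
 * one.
 */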
vn_lock(cvp, LK_EXCLUSIVE | LK_RETRY);
while (error == 0) {
size_t resid = 0;
char *dp, *ep;
if (!ALIGNED_POINTER(uiop->uio_offset, uint32_t)) {
error = EINVAL;
break;
}
error = vn_rdwr(UIO_READ, cvp, buf,
CODA_DIRBLKSIZ, uiop->uio_offset,
UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, curlwp);
if (error || resid == CODA_DIRBLKSIZ)
break;
for (dp = buf, ep = dp + CODA_DIRBLKSIZ - resid; dp < ep; ) {
off_t off;
struct venus_dirent *vd = (struct venus_dirent *)dp;
if (!ALIGNED_POINTER(vd, uint32_t) ||
!ALIGNED_POINTER(vd->d_reclen, uint32_t) ||
vd->d_reclen == 0) {
error = EINVAL;
break;
}
if (dp + vd->d_reclen > ep) {
error = ENAMETOOLONG;
break;
}
if (vd->d_namlen == 0) {
uiop->uio_offset += vd->d_reclen;
dp += vd->d_reclen;
continue;
}
dirp->d_fileno = vd->d_fileno;
dirp->d_type = vd->d_type;
dirp->d_namlen = vd->d_namlen;
dirp->d_reclen = _DIRENT_SIZE(dirp);
strlcpy(dirp->d_name, vd->d_name, dirp->d_namlen + 1);
if (uiop->uio_resid < dirp->d_reclen) {
error = ENAMETOOLONG;
break;
}
off = uiop->uio_offset;
error = uiomove(dirp, dirp->d_reclen, uiop);
uiop->uio_offset = off;
if (error)
break;
uiop->uio_offset += vd->d_reclen;
dp += vd->d_reclen;
if (ap->a_ncookies)
(*ap->a_cookies)[(*ap->a_ncookies)++] =
uiop->uio_offset;
}
}
VOP_UNLOCK(cvp);
kmem_free(dirp, sizeof(*dirp));
kmem_free(buf, CODA_DIRBLKSIZ);
if (eofflag && error == 0)
*eofflag = 1;
if (uiop->uio_resid < initial_resid && error == ENAMETOOLONG)
error = 0;
if (ap->a_ncookies && error) {
free(*ap->a_cookies, M_TEMP);
*ap->a_ncookies = 0;
*ap->a_cookies = NULL;
}
if (error)
MARK_INT_FAIL(CODA_READDIR_STATS);
else
MARK_INT_SAT(CODA_READDIR_STATS);
/* Do an "internal close" if necessary. */
if (opened_internally) {
MARK_INT_GEN(CODA_CLOSE_STATS);
(void)VOP_CLOSE(vp, FREAD, cred);
}
return error;
}
/*
* Convert from file system blocks to device blocks
*/
int
coda_bmap(void *v)
{
/* XXX on the global proc */
/* true args */
struct vop_bmap_args *ap = v;
vnode_t *vp __unused = ap->a_vp; /* file's vnode */
daddr_t bn __unused = ap->a_bn; /* fs block number */
vnode_t **vpp = ap->a_vpp; /* RETURN vp of device */
daddr_t *bnp __unused = ap->a_bnp; /* RETURN device block number */
struct lwp *l __unused = curlwp;
/* upcall decl */
/* locals */
*vpp = (vnode_t *)0;
myprintf(("coda_bmap called!\n"));
return(EINVAL);
}
/*
* I don't think the following two things are used anywhere, so I've
* commented them out
*
* struct buf *async_bufhead;
* int async_daemon_count;
*/
int
coda_strategy(void *v)
{
/* true args */
struct vop_strategy_args *ap = v;
struct buf *bp __unused = ap->a_bp;
struct lwp *l __unused = curlwp;
/* upcall decl */
/* locals */
myprintf(("coda_strategy called! "));
return(EINVAL);
}
int
coda_reclaim(void *v)
{
/* true args */
struct vop_reclaim_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
VOP_UNLOCK(vp);
/*
* Forced unmount/flush will let vnodes with a non-zero use count be destroyed!
*/
ENTRY;
if (IS_UNMOUNTING(cp)) {
#ifdef DEBUG
if (VTOC(vp)->c_ovp) {
if (IS_UNMOUNTING(cp))
printf("%s: c_ovp not void: vp %p, cp %p\n", __func__, vp, cp);
}
#endif
} else {
#ifdef OLD_DIAGNOSTIC
if (vrefcnt(vp) != 0)
print("%s: pushing active %p\n", __func__, vp);
if (VTOC(vp)->c_ovp) {
panic("%s: c_ovp not void", __func__);
}
#endif
}
/* If an array has been allocated to hold the symlink, deallocate it */
if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) {
if (cp->c_symlink == NULL)
panic("%s: null symlink pointer in cnode", __func__);
CODA_FREE(cp->c_symlink, cp->c_symlen);
cp->c_flags &= ~C_SYMLINK;
cp->c_symlen = 0;
}
mutex_enter(vp->v_interlock);
mutex_enter(&cp->c_lock);
SET_VTOC(vp) = NULL;
mutex_exit(&cp->c_lock);
mutex_exit(vp->v_interlock);
mutex_destroy(&cp->c_lock);
kmem_free(cp, sizeof(*cp));
return (0);
}
int
coda_lock(void *v)
{
/* true args */
struct vop_lock_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
ENTRY;
if (coda_lockdebug) {
myprintf(("Attempting lock on %s\n",
coda_f2s(&cp->c_fid)));
}
return genfs_lock(v);
}
int
coda_unlock(void *v)
{
/* true args */
struct vop_unlock_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
ENTRY;
if (coda_lockdebug) {
myprintf(("Attempting unlock on %s\n",
coda_f2s(&cp->c_fid)));
}
return genfs_unlock(v);
}
int
coda_islocked(void *v)
{
/* true args */
ENTRY;
return genfs_islocked(v);
}
int
coda_pathconf(void *v)
{
struct vop_pathconf_args *ap = v;
switch (ap->a_name) {
default:
return EINVAL;
}
/* NOTREACHED */
}
/*
* Given a device and inode, obtain a locked vnode. One reference is
* obtained and passed back to the caller.
*/
int
coda_grab_vnode(vnode_t *uvp, dev_t dev, ino_t ino, vnode_t **vpp)
{
int error;
struct mount *mp;
/* Obtain mount point structure from device. */
if (!(mp = devtomp(dev))) {
myprintf(("%s: devtomp(0x%llx) returns NULL\n", __func__,
(unsigned long long)dev));
return(ENXIO);
}
/*
* Obtain vnode from mount point and inode.
*/
error = VFS_VGET(mp, ino, LK_EXCLUSIVE, vpp);
if (error) {
myprintf(("%s: iget/vget(0x%llx, %llu) returns %p, err %d\n", __func__,
(unsigned long long)dev, (unsigned long long)ino, *vpp, error));
return(ENOENT);
}
/* share the underlying vnode lock with the coda vnode */
vshareilock(*vpp, uvp);
KASSERT(VOP_ISLOCKED(*vpp));
return(0);
}
static void
coda_print_vattr(struct vattr *attr)
{
const char *typestr;
switch (attr->va_type) {
case VNON:
typestr = "VNON";
break;
case VREG:
typestr = "VREG";
break;
case VDIR:
typestr = "VDIR";
break;
case VBLK:
typestr = "VBLK";
break;
case VCHR:
typestr = "VCHR";
break;
case VLNK:
typestr = "VLNK";
break;
case VSOCK:
typestr = "VSCK";
break;
case VFIFO:
typestr = "VFFO";
break;
case VBAD:
typestr = "VBAD";
break;
default:
typestr = "????";
break;
}
myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n",
typestr, (int)attr->va_mode, (int)attr->va_uid,
(int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev));
myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n",
(int)attr->va_fileid, (int)attr->va_nlink,
(int)attr->va_size,
(int)attr->va_blocksize,(int)attr->va_bytes));
myprintf((" gen %ld flags %ld vaflags %d\n",
attr->va_gen, attr->va_flags, attr->va_vaflags));
myprintf((" atime sec %d nsec %d\n",
(int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec));
myprintf((" mtime sec %d nsec %d\n",
(int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec));
myprintf((" ctime sec %d nsec %d\n",
(int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec));
}
/*
* Return a vnode for the given fid.
* If no cnode exists for this fid create one and put it
* in a table hashed by coda_f2i(). If the cnode for
* this fid is already in the table, return it (the ref count is
* incremented by coda_find). The cnode will be flushed from the
* table when coda_inactive calls coda_unsave.
*/
struct cnode *
make_coda_node(CodaFid *fid, struct mount *fvsp, short type)
{
int error __diagused;
struct vnode *vp;
struct cnode *cp;
error = vcache_get(fvsp, fid, sizeof(CodaFid), &vp);
KASSERT(error == 0);
mutex_enter(vp->v_interlock);
cp = VTOC(vp);
KASSERT(cp != NULL);
mutex_enter(&cp->c_lock);
mutex_exit(vp->v_interlock);
if (vp->v_type != type) {
if (vp->v_type == VCHR || vp->v_type == VBLK)
spec_node_destroy(vp);
vp->v_type = type;
if (type == VCHR || type == VBLK)
spec_node_init(vp, NODEV);
uvm_vnp_setsize(vp, 0);
}
mutex_exit(&cp->c_lock);
return cp;
}
/*
* coda_getpages may be called on a vnode which has not been opened,
* e.g. to fault in pages to execute a program. In that case, we must
* open the file to get the container. The vnode may or may not be
* locked, and we must leave it in the same state.
*/
int
coda_getpages(void *v)
{
struct vop_getpages_args /* {
vnode_t *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp, *cvp;
struct cnode *cp = VTOC(vp);
struct lwp *l = curlwp;
kauth_cred_t cred = l->l_cred;
int error, cerror;
int waslocked; /* 1 if vnode lock was held on entry */
int didopen = 0; /* 1 if we opened container file */
krw_t op;
/*
* Handle a case that uvm_fault doesn't quite use yet.
* See layer_vnops.c for inspiration.
*/
if (ap->a_flags & PGO_LOCKED) {
return EBUSY;
}
KASSERT(rw_lock_held(vp->v_uobj.vmobjlock));
/* Check for control object. */
if (IS_CTL_VP(vp)) {
#ifdef CODA_VERBOSE
printf("%s: control object %p\n", __func__, vp);
#endif
return(EINVAL);
}
/*
* XXX It's really not ok to be releasing the lock we get,
* because we could be overlapping with another call to
* getpages and drop a lock they are relying on. We need to
* figure out whether getpages ever is called holding the
* lock, and if we should serialize getpages calls by some
* mechanism.
*/
/* XXX VOP_ISLOCKED() may not be used for lock decisions. */
op = rw_lock_op(vp->v_uobj.vmobjlock);
waslocked = VOP_ISLOCKED(vp);
/* Get container file if not already present. */
cvp = cp->c_ovp;
if (cvp == NULL) {
/*
* VOP_OPEN requires a locked vnode. We must avoid
* locking the vnode if it is already locked, and
* leave it in the same state on exit.
*/
if (waslocked == 0) {
rw_exit(vp->v_uobj.vmobjlock);
cerror = vn_lock(vp, LK_EXCLUSIVE);
if (cerror) {
#ifdef CODA_VERBOSE
printf("%s: can't lock vnode %p\n",
__func__, vp);
#endif
return cerror;
}
#ifdef CODA_VERBOSE
printf("%s: locked vnode %p\n", __func__, vp);
#endif
}
/*
* Open file (causes upcall to venus).
* XXX Perhaps we should not fully open the file, but
* simply obtain a container file.
*/
/* XXX Is it ok to do this while holding the mutex? */
cerror = VOP_OPEN(vp, FREAD, cred);
if (cerror) {
#ifdef CODA_VERBOSE
printf("%s: cannot open vnode %p => %d\n", __func__,
vp, cerror);
#endif
if (waslocked == 0)
VOP_UNLOCK(vp);
return cerror;
}
#ifdef CODA_VERBOSE
printf("%s: opened vnode %p\n", __func__, vp);
#endif
cvp = cp->c_ovp;
didopen = 1;
if (waslocked == 0)
rw_enter(vp->v_uobj.vmobjlock, op);
}
KASSERT(cvp != NULL);
/* Munge the arg structure to refer to the container vnode. */
KASSERT(cvp->v_uobj.vmobjlock == vp->v_uobj.vmobjlock);
ap->a_vp = cp->c_ovp;
/* Finally, call getpages on it. */
error = VCALL(ap->a_vp, VOFFSET(vop_getpages), ap);
/* If we opened the vnode, we must close it. */
if (didopen) {
/*
* VOP_CLOSE requires a locked vnode, but we are still
* holding the lock (or riding a caller's lock).
*/
cerror = VOP_CLOSE(vp, FREAD, cred);
#ifdef CODA_VERBOSE
if (cerror != 0)
/* XXX How should we handle this? */
printf("%s: closed vnode %p -> %d\n", __func__,
vp, cerror);
#endif
/* If we obtained a lock, drop it. */
if (waslocked == 0)
VOP_UNLOCK(vp);
}
return error;
}
/*
* The protocol requires v_interlock to be held by the caller.
*/
int
coda_putpages(void *v)
{
struct vop_putpages_args /* {
vnode_t *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp, *cvp;
struct cnode *cp = VTOC(vp);
int error;
KASSERT(rw_write_held(vp->v_uobj.vmobjlock));
/* Check for control object. */
if (IS_CTL_VP(vp)) {
rw_exit(vp->v_uobj.vmobjlock);
#ifdef CODA_VERBOSE
printf("%s: control object %p\n", __func__, vp);
#endif
return 0;
}
/*
* If container object is not present, then there are no pages
* to put; just return without error. This happens all the
* time, apparently during discard of a closed vnode (which
* trivially can't have dirty pages).
*/
cvp = cp->c_ovp;
if (cvp == NULL) {
rw_exit(vp->v_uobj.vmobjlock);
return 0;
}
/* Munge the arg structure to refer to the container vnode. */
KASSERT(cvp->v_uobj.vmobjlock == vp->v_uobj.vmobjlock);
ap->a_vp = cvp;
/* Finally, call putpages on it. */
error = VCALL(ap->a_vp, VOFFSET(vop_putpages), ap);
return error;
}
/* $NetBSD: kern_descrip.c,v 1.262 2023/10/04 22:17:09 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
*/
/*
* File descriptor management.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.262 2023/10/04 22:17:09 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/pool.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/sysctl.h>
#include <sys/ktrace.h>
/*
* A list (head) of open files, counter, and lock protecting them.
*/
struct filelist filehead __cacheline_aligned;
static u_int nfiles __cacheline_aligned;
kmutex_t filelist_lock __cacheline_aligned;
static pool_cache_t filedesc_cache __read_mostly;
static pool_cache_t file_cache __read_mostly;
static int file_ctor(void *, void *, int);
static void file_dtor(void *, void *);
static void fdfile_ctor(fdfile_t *);
static void fdfile_dtor(fdfile_t *);
static int filedesc_ctor(void *, void *, int);
static void filedesc_dtor(void *, void *);
static int filedescopen(dev_t, int, int, lwp_t *);
static int sysctl_kern_file(SYSCTLFN_PROTO);
static int sysctl_kern_file2(SYSCTLFN_PROTO);
static void fill_file(struct file *, const struct file *);
static void fill_file2(struct kinfo_file *, const file_t *, const fdfile_t *,
int, pid_t);
const struct cdevsw filedesc_cdevsw = {
.d_open = filedescopen,
.d_close = noclose,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = noioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
/*
* Initialize the descriptor system.
*/
void
fd_sys_init(void)
{
static struct sysctllog *clog;
mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&filehead);
file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
KASSERT(file_cache != NULL);
filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
NULL);
KASSERT(filedesc_cache != NULL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "file",
SYSCTL_DESCR("System open file table"),
sysctl_kern_file, 0, NULL, 0,
CTL_KERN, KERN_FILE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "file2",
SYSCTL_DESCR("System open file table"),
sysctl_kern_file2, 0, NULL, 0,
CTL_KERN, KERN_FILE2, CTL_EOL);
}
static bool
fd_isused(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
return (fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0;
}
/*
* Verify that the bitmaps match the descriptor table.
*/
static inline void
fd_checkmaps(filedesc_t *fdp)
{
#ifdef DEBUG
fdtab_t *dt;
u_int fd;
KASSERT(fdp->fd_refcnt <= 1 || mutex_owned(&fdp->fd_lock));
dt = fdp->fd_dt;
if (fdp->fd_refcnt == -1) {
/*
* fd_free tears down the table without maintaining its bitmap.
*/
return;
}
for (fd = 0; fd < dt->dt_nfiles; fd++) {
if (fd < NDFDFILE) {
KASSERT(dt->dt_ff[fd] ==
(fdfile_t *)fdp->fd_dfdfile[fd]);
}
if (dt->dt_ff[fd] == NULL) {
KASSERT(!fd_isused(fdp, fd));
} else if (dt->dt_ff[fd]->ff_file != NULL) {
KASSERT(fd_isused(fdp, fd));
}
}
#endif
}
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
int i, off, maxoff;
uint32_t sub;
KASSERT(mutex_owned(&fdp->fd_lock));
fd_checkmaps(fdp);
if (want > bits)
return -1;
off = want >> NDENTRYSHIFT;
i = want & NDENTRYMASK;
if (i) {
sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
if (sub != ~0)
goto found;
off++;
}
maxoff = NDLOSLOTS(bits);
while (off < maxoff) {
if ((sub = bitmap[off]) != ~0)
goto found;
off++;
}
return -1;
found:
return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
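/*
 * A brief illustration of the two-level bitmap (assuming the usual
 * NDENTRIES of 32, i.e. NDENTRYSHIFT of 5): descriptor fd corresponds
 * to bit (fd & NDENTRYMASK) of fd_lomap[fd >> NDENTRYSHIFT], so e.g.
 * fd 70 lives in fd_lomap[2], bit 6.  When a fd_lomap word becomes all
 * ones, the matching bit in fd_himap is set, which lets the search in
 * fd_alloc() skip a whole word of descriptors at a time.
 */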
static int
fd_last_set(filedesc_t *fd, int last)
{
int off, i;
fdfile_t **ff = fd->fd_dt->dt_ff;
uint32_t *bitmap = fd->fd_lomap;
KASSERT(mutex_owned(&fd->fd_lock));
fd_checkmaps(fd);
off = (last - 1) >> NDENTRYSHIFT;
while (off >= 0 && !bitmap[off])
off--;
if (off < 0)
return -1;
i = ((off + 1) << NDENTRYSHIFT) - 1;
if (i >= last)
i = last - 1;
/* XXX should use bitmap */
while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
i--;
return i;
}
static inline void
fd_used(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
fdfile_t *ff;
ff = fdp->fd_dt->dt_ff[fd];
KASSERT(mutex_owned(&fdp->fd_lock));
KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) == 0);
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(!ff->ff_allocated);
ff->ff_allocated = true;
fdp->fd_lomap[off] |= 1U << (fd & NDENTRYMASK);
if (__predict_false(fdp->fd_lomap[off] == ~0)) {
KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
(1U << (off & NDENTRYMASK))) == 0);
fdp->fd_himap[off >> NDENTRYSHIFT] |= 1U << (off & NDENTRYMASK);
}
if ((int)fd > fdp->fd_lastfile) {
fdp->fd_lastfile = fd;
}
fd_checkmaps(fdp);
}
static inline void
fd_unused(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
fdfile_t *ff;
ff = fdp->fd_dt->dt_ff[fd];
KASSERT(mutex_owned(&fdp->fd_lock));
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(ff->ff_allocated);
if (fd < fdp->fd_freefile) {
fdp->fd_freefile = fd;
}
if (fdp->fd_lomap[off] == ~0) {
KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
(1U << (off & NDENTRYMASK))) != 0);
fdp->fd_himap[off >> NDENTRYSHIFT] &=
~(1U << (off & NDENTRYMASK));
}
KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0);
fdp->fd_lomap[off] &= ~(1U << (fd & NDENTRYMASK));
ff->ff_allocated = false;
KASSERT(fd <= fdp->fd_lastfile);
if (fd == fdp->fd_lastfile) {
fdp->fd_lastfile = fd_last_set(fdp, fd);
}
fd_checkmaps(fdp);
}
/*
* Look up the file structure corresponding to a file descriptor
* and return the file, holding a reference on the descriptor.
*/
file_t *
fd_getfile(unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
/*
* Look up the fdfile structure representing this descriptor.
* We are doing this unlocked. See fd_tryexpand().
*/
fdp = curlwp->l_fd;
dt = atomic_load_consume(&fdp->fd_dt);
if (__predict_false(fd >= dt->dt_nfiles)) {
return NULL;
}
ff = dt->dt_ff[fd];
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (__predict_false(ff == NULL)) {
return NULL;
}
/* Now get a reference to the descriptor. */
if (fdp->fd_refcnt == 1) {
/*
* Single threaded: don't need to worry about concurrent
* access (other than earlier calls to kqueue, which may
* hold a reference to the descriptor).
*/
ff->ff_refcnt++;
} else {
/*
* Multi threaded: issue a memory barrier to ensure that we
* acquire the file pointer _after_ adding a reference. If
* no memory barrier, we could fetch a stale pointer.
*
* In particular, we must coordinate the following four
* memory operations:
*
* A. fd_close store ff->ff_file = NULL
* B. fd_close refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
* C. fd_getfile atomic_inc_uint(&ff->ff_refcnt)
* D. fd_getfile load fp = ff->ff_file
*
* If the order is D;A;B;C:
*
* 1. D: fp = ff->ff_file
* 2. A: ff->ff_file = NULL
* 3. B: refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
* 4. C: atomic_inc_uint(&ff->ff_refcnt)
*
* then fd_close determines that there are no more
* references and decides to free fp immediately, at
* the same time that fd_getfile ends up with an fp that's
* about to be freed. *boom*
*
* By making B a release operation in fd_close, and by
* making C an acquire operation in fd_getfile, since
* they are atomic operations on the same object, which
* has a total modification order, we guarantee either:
*
* - B happens before C. Then since A is
* sequenced before B in fd_close, and C is
* sequenced before D in fd_getfile, we
* guarantee A happens before D, so fd_getfile
* reads a null fp and safely fails.
*
* - C happens before B. Then fd_getfile may read
* null or nonnull, but either way, fd_close
* will safely wait for references to drain.
*/
atomic_inc_uint(&ff->ff_refcnt);
membar_acquire();
}
/*
* If the file is not open or is being closed then put the
* reference back.
*/
fp = atomic_load_consume(&ff->ff_file);
if (__predict_true(fp != NULL)) {
return fp;
}
fd_putfile(fd);
return NULL;
}
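/*
 * A minimal usage sketch (not from this file): callers pair every
 * successful fd_getfile() with fd_putfile() on the same descriptor,
 * roughly
 *
 *	file_t *fp;
 *
 *	if ((fp = fd_getfile(fd)) == NULL)
 *		return EBADF;
 *	error = (*fp->f_ops->fo_ioctl)(fp, com, data);
 *	fd_putfile(fd);
 *
 * The reference taken is on the descriptor slot (ff_refcnt), not on
 * the file_t itself; fd_close() waits for such references to drain.
 */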
/*
* Release a reference to a file descriptor acquired with fd_getfile().
*/
void
fd_putfile(unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
u_int u, v;
fdp = curlwp->l_fd;
KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
KASSERT(ff != NULL);
KASSERT((ff->ff_refcnt & FR_MASK) > 0);
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (fdp->fd_refcnt == 1) {
/*
* Single threaded: don't need to worry about concurrent
* access (other than earlier calls to kqueue, which may
* hold a reference to the descriptor).
*/
if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
fd_close(fd);
return;
}
ff->ff_refcnt--;
return;
}
/*
* Ensure that any use of the file is complete and globally
* visible before dropping the final reference. If no membar,
* the current CPU could still access memory associated with
* the file after it has been freed or recycled by another
* CPU.
*/
membar_release();
/*
* Be optimistic and start out with the assumption that no other
* threads are trying to close the descriptor. If the CAS fails,
* we lost a race and/or it's being closed.
*/
for (u = ff->ff_refcnt & FR_MASK;; u = v) {
v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
if (__predict_true(u == v)) {
return;
}
if (__predict_false((v & FR_CLOSING) != 0)) {
break;
}
}
/* Another thread is waiting to close the file: join it. */
(void)fd_close(fd);
}
/*
* Convenience wrapper around fd_getfile() that returns reference
* to a vnode.
*/
int
fd_getvnode(unsigned fd, file_t **fpp)
{
vnode_t *vp;
file_t *fp;
fp = fd_getfile(fd);
if (__predict_false(fp == NULL)) {
return EBADF;
}
if (__predict_false(fp->f_type != DTYPE_VNODE)) {
fd_putfile(fd);
return EINVAL;
}
vp = fp->f_vnode;
if (__predict_false(vp->v_type == VBAD)) {
/* XXX Is this case really necessary? */
fd_putfile(fd);
return EBADF;
}
*fpp = fp;
return 0;
}
/*
* Convenience wrapper around fd_getfile() that returns reference
* to a socket.
*/
int
fd_getsock1(unsigned fd, struct socket **sop, file_t **fp)
{
*fp = fd_getfile(fd);
if (__predict_false(*fp == NULL)) {
return EBADF;
}
if (__predict_false((*fp)->f_type != DTYPE_SOCKET)) {
fd_putfile(fd);
return ENOTSOCK;
}
*sop = (*fp)->f_socket;
return 0;
}
int
fd_getsock(unsigned fd, struct socket **sop)
{
file_t *fp;
return fd_getsock1(fd, sop, &fp);
}
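/*
 * Usage sketch (illustrative): socket system calls fetch the socket
 * and drop the descriptor reference with fd_putfile() when done,
 * e.g.
 *
 *	struct socket *so;
 *	int error;
 *
 *	if ((error = fd_getsock(fd, &so)) != 0)
 *		return error;
 *	... operate on so ...
 *	fd_putfile(fd);
 */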
/*
* Look up the file structure corresponding to a file descriptor
* and return it with a reference held on the file, not the
* descriptor.
*
* This is heavyweight and only used when accessing descriptors
* from a foreign process. The caller must ensure that `p' does
* not exit or fork across this call.
*
* To release the file (not descriptor) reference, use closef().
*/
file_t *
fd_getfile2(proc_t *p, unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
fdp = p->p_fd;
mutex_enter(&fdp->fd_lock);
dt = fdp->fd_dt;
if (fd >= dt->dt_nfiles) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
if ((ff = dt->dt_ff[fd]) == NULL) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&fdp->fd_lock);
return fp;
}
/*
* Internal form of close. Must be called with a reference to the
* descriptor, and will drop the reference. When all descriptor
* references are dropped, releases the descriptor slot and a single
* reference to the file structure.
*/
int
fd_close(unsigned fd)
{
struct flock lf;
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
proc_t *p;
lwp_t *l;
u_int refcnt;
l = curlwp;
p = l->l_proc;
fdp = l->l_fd;
ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
mutex_enter(&fdp->fd_lock);
KASSERT((ff->ff_refcnt & FR_MASK) > 0);
fp = atomic_load_consume(&ff->ff_file);
if (__predict_false(fp == NULL)) {
/*
* Another user of the file is already closing, and is
* waiting for other users of the file to drain. Release
* our reference, and wake up the closer.
*/
membar_release();
atomic_dec_uint(&ff->ff_refcnt);
cv_broadcast(&ff->ff_closing);
mutex_exit(&fdp->fd_lock);
/*
* An application error, so pretend that the descriptor
* was already closed. We can't safely wait for it to
* be closed without potentially deadlocking.
*/
return (EBADF);
}
KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
/*
* There may be multiple users of this file within the process.
* Notify existing and new users that the file is closing. This
* will prevent them from adding additional uses to this file
* while we are closing it.
*/
atomic_store_relaxed(&ff->ff_file, NULL);
ff->ff_exclose = false;
/*
* We expect the caller to hold a descriptor reference - drop it.
* The reference count may increase beyond zero at this point due
* to an erroneous descriptor reference by an application, but
* fd_getfile() will notice that the file is being closed and drop
* the reference again.
*/
if (fdp->fd_refcnt == 1) {
/* Single threaded. */
refcnt = --(ff->ff_refcnt);
} else {
/* Multi threaded. */
membar_release();
refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
membar_acquire();
}
if (__predict_false(refcnt != 0)) {
/*
* Wait for other references to drain. This is typically
* an application error - the descriptor is being closed
* while still in use.
* (Or just a threaded application trying to unblock its
* thread that sleeps in (say) accept()).
*/
atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
/*
* Remove any knotes attached to the file. A knote
* attached to the descriptor can hold references on it.
*/
mutex_exit(&fdp->fd_lock);
if (!SLIST_EMPTY(&ff->ff_knlist)) {
knote_fdclose(fd);
}
/*
* Since the file system code doesn't know which fd
* each request came from (think dup()), we have to
* ask it to return ERESTART for any long-term blocks.
* The re-entry through read/write/etc will detect the
* closed fd and return EBADF.
* Blocked partial writes may return a short length.
*/
(*fp->f_ops->fo_restart)(fp);
mutex_enter(&fdp->fd_lock);
/*
* We need to see the count drop to zero at least once,
* in order to ensure that all pre-existing references
* have been drained. New references past this point are
* of no interest.
* XXX (dsl) this may need to call fo_restart() after a
* timeout to guarantee that all the system calls exit.
*/
while ((ff->ff_refcnt & FR_MASK) != 0) {
cv_wait(&ff->ff_closing, &fdp->fd_lock);
}
atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
} else {
/* If no references, there must be no knotes. */
KASSERT(SLIST_EMPTY(&ff->ff_knlist));
}
/*
* POSIX record locking dictates that any close releases ALL
* locks owned by this process. This is handled by setting
* a flag in the unlock to free ONLY locks obeying POSIX
* semantics, and not to free BSD-style file locks.
* If the descriptor was in a message, POSIX-style locks
* aren't passed with the descriptor.
*/
if (__predict_false((p->p_flag & PK_ADVLOCK) != 0) &&
fp->f_ops->fo_advlock != NULL) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
mutex_exit(&fdp->fd_lock);
(void)(*fp->f_ops->fo_advlock)(fp, p, F_UNLCK, &lf, F_POSIX);
mutex_enter(&fdp->fd_lock);
}
/* Free descriptor slot. */
fd_unused(fdp, fd);
mutex_exit(&fdp->fd_lock);
/* Now drop reference to the file itself. */
return closef(fp);
}
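/*
 * For orientation, the close(2) system call is essentially the
 * following sketch (simplified): take a descriptor reference to make
 * sure the slot is live, then hand it to fd_close(), which consumes
 * the reference.
 *
 *	if (fd_getfile(fd) == NULL)
 *		return EBADF;
 *	return fd_close(fd);
 */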
/*
* Duplicate a file descriptor.
*/
int
fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
{
proc_t *p = curproc;
fdtab_t *dt;
int error;
while ((error = fd_alloc(p, minfd, newp)) != 0) {
if (error != ENOSPC) {
return error;
}
fd_tryexpand(p);
}
dt = atomic_load_consume(&curlwp->l_fd->fd_dt);
dt->dt_ff[*newp]->ff_exclose = exclose;
fd_affix(p, fp, *newp);
return 0;
}
/*
* dup2 operation.
*/
int
fd_dup2(file_t *fp, unsigned newfd, int flags)
{
filedesc_t *fdp = curlwp->l_fd;
fdfile_t *ff;
fdtab_t *dt;
if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
return EINVAL;
/*
* Ensure there are enough slots in the descriptor table,
* and allocate an fdfile_t up front in case we need it.
*/
while (newfd >= atomic_load_consume(&fdp->fd_dt)->dt_nfiles) {
fd_tryexpand(curproc);
}
ff = kmem_alloc(sizeof(*ff), KM_SLEEP);
fdfile_ctor(ff);
/*
* If there is already a file open, close it. If the file is
* half open, wait for it to be constructed before closing it.
* XXX Potential for deadlock here?
*/
mutex_enter(&fdp->fd_lock);
while (fd_isused(fdp, newfd)) {
mutex_exit(&fdp->fd_lock);
if (fd_getfile(newfd) != NULL) {
(void)fd_close(newfd);
} else {
/*
* Crummy, but unlikely to happen.
* Can occur if we interrupt another
* thread while it is opening a file.
*/
kpause("dup2", false, 1, NULL);
}
mutex_enter(&fdp->fd_lock);
}
dt = fdp->fd_dt;
if (dt->dt_ff[newfd] == NULL) {
KASSERT(newfd >= NDFDFILE);
dt->dt_ff[newfd] = ff;
ff = NULL;
}
fd_used(fdp, newfd);
mutex_exit(&fdp->fd_lock);
dt->dt_ff[newfd]->ff_exclose = (flags & O_CLOEXEC) != 0;
fp->f_flag |= flags & (FNONBLOCK|FNOSIGPIPE);
/* Slot is now allocated. Insert copy of the file. */
fd_affix(curproc, fp, newfd);
if (ff != NULL) {
cv_destroy(&ff->ff_closing);
kmem_free(ff, sizeof(*ff));
}
return 0;
}
/*
* Drop reference to a file structure.
*/
int
closef(file_t *fp)
{
struct flock lf;
int error;
/*
* Drop reference. If referenced elsewhere it's still open
* and we have nothing more to do.
*/
mutex_enter(&fp->f_lock);
KASSERT(fp->f_count > 0);
if (--fp->f_count > 0) {
mutex_exit(&fp->f_lock);
return 0;
}
KASSERT(fp->f_count == 0);
mutex_exit(&fp->f_lock);
/* We held the last reference - release locks, close and free. */
if (fp->f_ops->fo_advlock == NULL) {
KASSERT((fp->f_flag & FHASLOCK) == 0);
} else if (fp->f_flag & FHASLOCK) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void)(*fp->f_ops->fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK);
}
if (fp->f_ops != NULL) {
error = (*fp->f_ops->fo_close)(fp);
} else {
error = 0;
}
KASSERT(fp->f_count == 0);
KASSERT(fp->f_cred != NULL);
pool_cache_put(file_cache, fp);
return error;
}
/*
* Allocate a file descriptor for the process.
*
* Future idea for experimentation: replace all of this with radixtree.
*/
int
fd_alloc(proc_t *p, int want, int *result)
{
filedesc_t *fdp = p->p_fd;
int i, lim, last, error, hi;
u_int off;
fdtab_t *dt;
KASSERT(p == curproc || p == &proc0);
/*
* Search for a free descriptor starting at the higher
* of want or fd_freefile.
*/
mutex_enter(&fdp->fd_lock);
fd_checkmaps(fdp);
dt = fdp->fd_dt;
KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
last = uimin(dt->dt_nfiles, lim);
for (;;) {
if ((i = want) < fdp->fd_freefile)
i = fdp->fd_freefile;
off = i >> NDENTRYSHIFT;
hi = fd_next_zero(fdp, fdp->fd_himap, off,
(last + NDENTRIES - 1) >> NDENTRYSHIFT);
if (hi == -1)
break;
i = fd_next_zero(fdp, &fdp->fd_lomap[hi],
hi > off ? 0 : i & NDENTRYMASK, NDENTRIES);
if (i == -1) {
/*
* Free file descriptor in this block was
* below want, try again with higher want.
*/
want = (hi + 1) << NDENTRYSHIFT;
continue;
}
i += (hi << NDENTRYSHIFT);
if (i >= last) {
break;
}
if (dt->dt_ff[i] == NULL) {
KASSERT(i >= NDFDFILE);
dt->dt_ff[i] = kmem_alloc(sizeof(fdfile_t), KM_SLEEP);
fdfile_ctor(dt->dt_ff[i]);
}
KASSERT(dt->dt_ff[i]->ff_file == NULL);
fd_used(fdp, i);
if (want <= fdp->fd_freefile) {
fdp->fd_freefile = i;
}
*result = i;
KASSERT(i >= NDFDFILE ||
dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
fd_checkmaps(fdp);
mutex_exit(&fdp->fd_lock);
return 0;
}
/* No space in current array. Let the caller expand and retry. */
error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
mutex_exit(&fdp->fd_lock);
return error;
}
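/*
 * Callers are expected to loop over fd_tryexpand() while fd_alloc()
 * returns ENOSPC (see fd_allocfile() and fd_dup()), roughly:
 *
 *	while ((error = fd_alloc(p, 0, &fd)) != 0) {
 *		if (error != ENOSPC)
 *			return error;
 *		fd_tryexpand(p);
 *	}
 */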
/*
* Allocate memory for a descriptor table.
*/
static fdtab_t *
fd_dtab_alloc(int n)
{
fdtab_t *dt;
size_t sz;
KASSERT(n > NDFILE);
sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
dt = kmem_alloc(sz, KM_SLEEP);
#ifdef DIAGNOSTIC
memset(dt, 0xff, sz);
#endif
dt->dt_nfiles = n;
dt->dt_link = NULL;
return dt;
}
/*
* Free a descriptor table, and all tables linked for deferred free.
*/
static void
fd_dtab_free(fdtab_t *dt)
{
fdtab_t *next;
size_t sz;
do {
next = dt->dt_link;
KASSERT(dt->dt_nfiles > NDFILE);
sz = sizeof(*dt) +
(dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
#ifdef DIAGNOSTIC
memset(dt, 0xff, sz);
#endif
kmem_free(dt, sz);
dt = next;
} while (dt != NULL);
}
/*
* Allocate descriptor bitmap.
*/
static void
fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
{
uint8_t *ptr;
size_t szlo, szhi;
KASSERT(n > NDENTRIES);
szlo = NDLOSLOTS(n) * sizeof(uint32_t);
szhi = NDHISLOTS(n) * sizeof(uint32_t);
ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
*lo = (uint32_t *)ptr;
*hi = (uint32_t *)(ptr + szlo);
}
/*
* Free descriptor bitmap.
*/
static void
fd_map_free(int n, uint32_t *lo, uint32_t *hi)
{
size_t szlo, szhi;
KASSERT(n > NDENTRIES);
szlo = NDLOSLOTS(n) * sizeof(uint32_t);
szhi = NDHISLOTS(n) * sizeof(uint32_t);
KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
kmem_free(lo, szlo + szhi);
}
/*
* Expand a process' descriptor table.
*/
void
fd_tryexpand(proc_t *p)
{
filedesc_t *fdp;
int i, numfiles, oldnfiles;
fdtab_t *newdt, *dt;
uint32_t *newhimap, *newlomap;
KASSERT(p == curproc || p == &proc0);
fdp = p->p_fd;
newhimap = NULL;
newlomap = NULL;
oldnfiles = atomic_load_consume(&fdp->fd_dt)->dt_nfiles;
if (oldnfiles < NDEXTENT)
numfiles = NDEXTENT;
else
numfiles = 2 * oldnfiles;
newdt = fd_dtab_alloc(numfiles);
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
fd_map_alloc(numfiles, &newlomap, &newhimap);
}
mutex_enter(&fdp->fd_lock);
dt = fdp->fd_dt;
KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
if (dt->dt_nfiles != oldnfiles) {
/* fdp changed; caller must retry */
mutex_exit(&fdp->fd_lock);
fd_dtab_free(newdt);
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
fd_map_free(numfiles, newlomap, newhimap);
}
return;
}
/* Copy the existing descriptor table and zero the new portion. */
i = sizeof(fdfile_t *) * oldnfiles;
memcpy(newdt->dt_ff, dt->dt_ff, i);
memset((uint8_t *)newdt->dt_ff + i, 0,
numfiles * sizeof(fdfile_t *) - i);
/*
* Link old descriptor array into list to be discarded. We defer
* freeing until the last reference to the descriptor table goes
* away (usually process exit). This allows us to do lockless
* lookups in fd_getfile().
*/
if (oldnfiles > NDFILE) {
if (fdp->fd_refcnt > 1) {
newdt->dt_link = dt;
} else {
fd_dtab_free(dt);
}
}
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
memcpy(newhimap, fdp->fd_himap, i);
memset((uint8_t *)newhimap + i, 0,
NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
memcpy(newlomap, fdp->fd_lomap, i);
memset((uint8_t *)newlomap + i, 0,
NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
}
fdp->fd_himap = newhimap;
fdp->fd_lomap = newlomap;
}
/*
* All other modifications must become globally visible before
* the change to fd_dt. See fd_getfile().
*/
atomic_store_release(&fdp->fd_dt, newdt);
KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
fd_checkmaps(fdp);
mutex_exit(&fdp->fd_lock);
}
/*
* Create a new open file structure and allocate a file descriptor
* for the current process.
*/
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
proc_t *p = curproc;
kauth_cred_t cred;
file_t *fp;
int error;
while ((error = fd_alloc(p, 0, resultfd)) != 0) {
if (error != ENOSPC) {
return error;
}
fd_tryexpand(p);
}
fp = pool_cache_get(file_cache, PR_WAITOK);
if (fp == NULL) {
fd_abort(p, NULL, *resultfd);
return ENFILE;
}
KASSERT(fp->f_count == 0);
KASSERT(fp->f_msgcount == 0);
KASSERT(fp->f_unpcount == 0);
/* Replace cached credentials if not what we need. */
cred = curlwp->l_cred;
if (__predict_false(cred != fp->f_cred)) {
kauth_cred_free(fp->f_cred);
fp->f_cred = kauth_cred_hold(cred);
}
/*
* Don't allow recycled files to be scanned.
* See uipc_usrreq.c.
*/
if (__predict_false((fp->f_flag & FSCAN) != 0)) {
mutex_enter(&fp->f_lock);
atomic_and_uint(&fp->f_flag, ~FSCAN);
mutex_exit(&fp->f_lock);
}
fp->f_advice = 0;
fp->f_offset = 0;
*resultfp = fp;
return 0;
}
/*
* Successful creation of a new descriptor: make visible to the process.
*/
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
fdfile_t *ff;
filedesc_t *fdp;
fdtab_t *dt;
KASSERT(p == curproc || p == &proc0);
/* Add a reference to the file structure. */
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
/*
* Insert the new file into the descriptor slot.
*/
fdp = p->p_fd;
dt = atomic_load_consume(&fdp->fd_dt);
ff = dt->dt_ff[fd];
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(ff->ff_allocated);
KASSERT(fd_isused(fdp, fd));
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
/* No need to lock in order to make file initially visible. */
atomic_store_release(&ff->ff_file, fp);
}
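/*
 * Illustrative open-path sketch (simplified): a new descriptor is
 * created with fd_allocfile(), filled in, and then either published
 * with fd_affix() or backed out with fd_abort():
 *
 *	file_t *fp;
 *	int fd, error;
 *
 *	if ((error = fd_allocfile(&fp, &fd)) != 0)
 *		return error;
 *	error = set_up_file(fp);	(set_up_file is a hypothetical helper)
 *	if (error) {
 *		fd_abort(curproc, fp, fd);
 *		return error;
 *	}
 *	fd_affix(curproc, fp, fd);
 *	return 0;
 */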
/*
* Abort creation of a new descriptor: free descriptor slot and file.
*/
void
fd_abort(proc_t *p, file_t *fp, unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
KASSERT(p == curproc || p == &proc0);
fdp = p->p_fd;
ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
ff->ff_exclose = false;
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
mutex_enter(&fdp->fd_lock);
KASSERT(fd_isused(fdp, fd));
fd_unused(fdp, fd);
mutex_exit(&fdp->fd_lock);
if (fp != NULL) {
KASSERT(fp->f_count == 0);
KASSERT(fp->f_cred != NULL);
pool_cache_put(file_cache, fp);
}
}
static int
file_ctor(void *arg, void *obj, int flags)
{
/*
* It's easy to exhaust the open file limit on a system with many
* CPUs due to caching. Allow a bit of leeway to reduce the element
* of surprise.
*/
u_int slop = PCG_NOBJECTS_NORMAL * (ncpu - 1);
file_t *fp = obj;
memset(fp, 0, sizeof(*fp));
mutex_enter(&filelist_lock);
if (__predict_false(nfiles >= slop + maxfiles)) {
mutex_exit(&filelist_lock);
tablefull("file", "increase kern.maxfiles or MAXFILES");
return ENFILE;
}
nfiles++;
LIST_INSERT_HEAD(&filehead, fp, f_list);
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
fp->f_cred = kauth_cred_hold(curlwp->l_cred);
mutex_exit(&filelist_lock);
return 0;
}
static void
file_dtor(void *arg, void *obj)
{
file_t *fp = obj;
mutex_enter(&filelist_lock);
nfiles--;
LIST_REMOVE(fp, f_list);
mutex_exit(&filelist_lock);
KASSERT(fp->f_count == 0);
kauth_cred_free(fp->f_cred);
mutex_destroy(&fp->f_lock);
}
static void
fdfile_ctor(fdfile_t *ff)
{
memset(ff, 0, sizeof(*ff));
cv_init(&ff->ff_closing, "fdclose");
}
static void
fdfile_dtor(fdfile_t *ff)
{
cv_destroy(&ff->ff_closing);
}
file_t *
fgetdummy(void)
{
file_t *fp;
fp = kmem_zalloc(sizeof(*fp), KM_SLEEP);
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
return fp;
}
void
fputdummy(file_t *fp)
{
mutex_destroy(&fp->f_lock);
kmem_free(fp, sizeof(*fp));
}
/*
* Create an initial filedesc structure.
*/
filedesc_t *
fd_init(filedesc_t *fdp)
{
#ifdef DIAGNOSTIC
unsigned fd;
#endif
if (__predict_true(fdp == NULL)) {
fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
} else {
KASSERT(fdp == &filedesc0);
filedesc_ctor(NULL, fdp, PR_WAITOK);
}
#ifdef DIAGNOSTIC
KASSERT(fdp->fd_lastfile == -1);
KASSERT(fdp->fd_lastkqfile == -1);
KASSERT(fdp->fd_knhash == NULL);
KASSERT(fdp->fd_freefile == 0);
KASSERT(fdp->fd_exclose == false);
KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
for (fd = 0; fd < NDFDFILE; fd++) {
KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] ==
(fdfile_t *)fdp->fd_dfdfile[fd]);
}
for (fd = NDFDFILE; fd < NDFILE; fd++) {
KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL);
}
KASSERT(fdp->fd_himap == fdp->fd_dhimap);
KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
#endif /* DIAGNOSTIC */
fdp->fd_refcnt = 1;
fd_checkmaps(fdp);
return fdp;
}
/*
* Initialize a file descriptor table.
*/
static int
filedesc_ctor(void *arg, void *obj, int flag)
{
filedesc_t *fdp = obj;
fdfile_t **ffp;
int i;
memset(fdp, 0, sizeof(*fdp));
mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
fdp->fd_lastfile = -1;
fdp->fd_lastkqfile = -1;
fdp->fd_dt = &fdp->fd_dtbuiltin;
fdp->fd_dtbuiltin.dt_nfiles = NDFILE;
fdp->fd_himap = fdp->fd_dhimap;
fdp->fd_lomap = fdp->fd_dlomap;
CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) {
fdfile_ctor(*ffp = (fdfile_t *)fdp->fd_dfdfile[i]);
}
return 0;
}
static void
filedesc_dtor(void *arg, void *obj)
{
filedesc_t *fdp = obj;
int i;
for (i = 0; i < NDFDFILE; i++) {
fdfile_dtor((fdfile_t *)fdp->fd_dfdfile[i]);
}
mutex_destroy(&fdp->fd_lock);
}
/*
* Make p share curproc's filedesc structure.
*/
void
fd_share(struct proc *p)
{
filedesc_t *fdp;
fdp = curlwp->l_fd;
p->p_fd = fdp;
atomic_inc_uint(&fdp->fd_refcnt);
}
/*
* Acquire a hold on a filedesc structure.
*/
void
fd_hold(lwp_t *l)
{
filedesc_t *fdp = l->l_fd;
atomic_inc_uint(&fdp->fd_refcnt);
}
/*
* Copy a filedesc structure.
*/
filedesc_t *
fd_copy(void)
{
filedesc_t *newfdp, *fdp;
fdfile_t *ff, **ffp, **nffp, *ff2;
int i, j, numfiles, lastfile, newlast;
file_t *fp;
fdtab_t *newdt;
fdp = curproc->p_fd;
newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
newfdp->fd_refcnt = 1;
#ifdef DIAGNOSTIC
KASSERT(newfdp->fd_lastfile == -1);
KASSERT(newfdp->fd_lastkqfile == -1);
KASSERT(newfdp->fd_knhash == NULL);
KASSERT(newfdp->fd_freefile == 0);
KASSERT(newfdp->fd_exclose == false);
KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
for (i = 0; i < NDFDFILE; i++) {
KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
(fdfile_t *)&newfdp->fd_dfdfile[i]);
}
for (i = NDFDFILE; i < NDFILE; i++) {
KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
}
#endif /* DIAGNOSTIC */
mutex_enter(&fdp->fd_lock);
fd_checkmaps(fdp);
numfiles = fdp->fd_dt->dt_nfiles;
lastfile = fdp->fd_lastfile;
/*
* If the number of open files fits in the internal arrays
* of the open file structure, use them, otherwise allocate
* additional memory for the number of descriptors currently
* in use.
*/
if (lastfile < NDFILE) {
i = NDFILE;
newdt = newfdp->fd_dt;
KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
} else {
/*
* Compute the smallest multiple of NDEXTENT needed
* for the file descriptors currently in use,
* allowing the table to shrink.
*/
i = numfiles;
while (i >= 2 * NDEXTENT && i > lastfile * 2) {
i /= 2;
}
KASSERT(i > NDFILE);
newdt = fd_dtab_alloc(i);
newfdp->fd_dt = newdt;
memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
NDFDFILE * sizeof(fdfile_t **));
memset(newdt->dt_ff + NDFDFILE, 0,
(i - NDFDFILE) * sizeof(fdfile_t **));
}
if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
newfdp->fd_himap = newfdp->fd_dhimap;
newfdp->fd_lomap = newfdp->fd_dlomap;
} else {
fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
KASSERT(i >= NDENTRIES * NDENTRIES);
memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
}
newfdp->fd_freefile = fdp->fd_freefile;
newfdp->fd_exclose = fdp->fd_exclose;
ffp = fdp->fd_dt->dt_ff;
nffp = newdt->dt_ff;
newlast = -1;
for (i = 0; i <= lastfile; i++, ffp++, nffp++) {
KASSERT(i >= NDFDFILE ||
*nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
ff = *ffp;
if (ff == NULL || (fp = atomic_load_consume(&ff->ff_file)) == NULL) {
/* Descriptor unused, or descriptor half open. */
KASSERT(!fd_isused(newfdp, i));
continue;
}
if (__predict_false(fp->f_type == DTYPE_KQUEUE)) {
/* kqueue descriptors cannot be copied. */
if (i < newfdp->fd_freefile) {
newfdp->fd_freefile = i;
}
continue;
}
/* It's active: add a reference to the file. */
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
/* Allocate an fdfile_t to represent it. */
if (i >= NDFDFILE) {
ff2 = kmem_alloc(sizeof(*ff2), KM_SLEEP);
fdfile_ctor(ff2);
*nffp = ff2;
} else {
ff2 = newdt->dt_ff[i];
}
ff2->ff_file = fp;
ff2->ff_exclose = ff->ff_exclose;
ff2->ff_allocated = true;
/* Fix up bitmaps. */
j = i >> NDENTRYSHIFT;
KASSERT((newfdp->fd_lomap[j] & (1U << (i & NDENTRYMASK))) == 0);
newfdp->fd_lomap[j] |= 1U << (i & NDENTRYMASK);
if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
(1U << (j & NDENTRYMASK))) == 0);
newfdp->fd_himap[j >> NDENTRYSHIFT] |=
1U << (j & NDENTRYMASK);
}
newlast = i;
}
KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
newfdp->fd_lastfile = newlast;
fd_checkmaps(newfdp);
mutex_exit(&fdp->fd_lock);
return newfdp;
}
/*
* Release a filedesc structure.
*/
void
fd_free(void)
{
fdfile_t *ff;
file_t *fp;
int fd, nf;
fdtab_t *dt;
lwp_t * const l = curlwp;
filedesc_t * const fdp = l->l_fd;
const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
KASSERT(atomic_load_consume(&fdp->fd_dt)->dt_ff[0] ==
(fdfile_t *)fdp->fd_dfdfile[0]);
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
membar_release();
if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
return;
membar_acquire();
/*
* Close any files that the process holds open.
*/
dt = fdp->fd_dt;
fd_checkmaps(fdp);
#ifdef DEBUG
fdp->fd_refcnt = -1; /* see fd_checkmaps */
#endif
for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
ff = dt->dt_ff[fd];
KASSERT(fd >= NDFDFILE ||
ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (ff == NULL)
continue;
if ((fp = atomic_load_consume(&ff->ff_file)) != NULL) {
/*
* Must use fd_close() here if there is
* a reference from kqueue or we might have posix
* advisory locks.
*/
if (__predict_true(ff->ff_refcnt == 0) && (noadvlock || fp->f_type != DTYPE_VNODE)) {
ff->ff_file = NULL;
ff->ff_exclose = false;
ff->ff_allocated = false;
closef(fp);
} else {
ff->ff_refcnt++;
fd_close(fd);
}
}
KASSERT(ff->ff_refcnt == 0);
KASSERT(ff->ff_file == NULL);
KASSERT(!ff->ff_exclose);
KASSERT(!ff->ff_allocated);
if (fd >= NDFDFILE) {
cv_destroy(&ff->ff_closing);
kmem_free(ff, sizeof(*ff));
dt->dt_ff[fd] = NULL;
}
}
/*
* Clean out the descriptor table for the next user and return
* to the cache.
*/
if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
fd_dtab_free(fdp->fd_dt);
/* Otherwise, done above. */
memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
(NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
fdp->fd_dt = &fdp->fd_dtbuiltin;
}
if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
KASSERT(fdp->fd_himap != fdp->fd_dhimap);
KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
}
if (__predict_false(fdp->fd_knhash != NULL)) {
hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
fdp->fd_knhash = NULL;
fdp->fd_knhashmask = 0;
} else {
KASSERT(fdp->fd_knhashmask == 0);
}
fdp->fd_dt = &fdp->fd_dtbuiltin;
fdp->fd_lastkqfile = -1;
fdp->fd_lastfile = -1;
fdp->fd_freefile = 0;
fdp->fd_exclose = false;
memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
offsetof(filedesc_t, fd_startzero));
fdp->fd_himap = fdp->fd_dhimap;
fdp->fd_lomap = fdp->fd_dlomap;
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
#ifdef DEBUG
fdp->fd_refcnt = 0; /* see fd_checkmaps */
#endif
fd_checkmaps(fdp);
pool_cache_put(filedesc_cache, fdp);
}
/*
* File Descriptor pseudo-device driver (/dev/fd/).
*
* Opening minor device N dup()s the file (if any) connected to file
* descriptor N belonging to the calling process. Note that this driver
* consists of only the ``open()'' routine, because all subsequent
* references to this file will be direct to the other driver.
*/
static int
filedescopen(dev_t dev, int mode, int type, lwp_t *l)
{
/*
* XXX Kludge: set dupfd to contain the value of the file
* descriptor being sought for duplication. The error
* return ensures that the vnode for this device will be released
* by vn_open. Open will detect this special error and take the
* actions in fd_dupopen below. Other callers of vn_open or VOP_OPEN
* will simply report the error.
*/
l->l_dupfd = minor(dev); /* XXX */
return EDUPFD;
}
/*
* Duplicate the specified descriptor to a free descriptor.
*
* old is the original fd.
* moveit is true if we should move rather than duplicate.
* flags are the open flags (converted from O_* to F*).
* newp returns the new fd on success.
*
* These two cases are produced by the EDUPFD and EMOVEFD magic
* errnos, but in the interest of removing that regrettable interface,
* vn_open has been changed to intercept them. Now vn_open returns
* either a vnode or a filehandle, and the filehandle is accompanied
* by a boolean that says whether we should dup (moveit == false) or
* move (moveit == true) the fd.
*
* The dup case is used by /dev/stderr, /proc/self/fd, and such. The
* move case is used by cloner devices that allocate a fd of their
* own (a layering violation that should go away eventually) that
* then needs to be put in the place open() expects it.
*/
int
fd_dupopen(int old, bool moveit, int flags, int *newp)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
int error;
if ((fp = fd_getfile(old)) == NULL) {
return EBADF;
}
fdp = curlwp->l_fd;
dt = atomic_load_consume(&fdp->fd_dt);
ff = dt->dt_ff[old];
/*
* There are two cases of interest here.
*
* 1. moveit == false (used to be the EDUPFD magic errno):
* simply dup (old) to file descriptor (new) and return.
*
* 2. moveit == true (used to be the EMOVEFD magic errno):
* steal away the file structure from (old) and store it in
* (new). (old) is effectively closed by this operation.
*/
if (moveit == false) {
/*
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
*/
if (((flags & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
error = EACCES;
goto out;
}
/* Copy it. */
error = fd_dup(fp, 0, newp, ff->ff_exclose);
} else {
/* Copy it. */
error = fd_dup(fp, 0, newp, ff->ff_exclose);
if (error != 0) {
goto out;
}
/* Steal away the file pointer from 'old'. */
(void)fd_close(old);
return 0;
}
out:
fd_putfile(old);
return error;
}
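/*
 * Concrete example of the dup case (illustrative, simplified): an
 * open of /dev/stderr reaches filedescopen() with minor number 2, the
 * open path intercepts the EDUPFD indication coming back through
 * vn_open(), and ends up calling fd_dupopen(2, false, flags, &newfd),
 * which duplicates descriptor 2 into a fresh slot via fd_dup().
 */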
/*
* Close open files on exec.
*/
void
fd_closeexec(void)
{
proc_t *p;
filedesc_t *fdp;
fdfile_t *ff;
lwp_t *l;
fdtab_t *dt;
int fd;
l = curlwp;
p = l->l_proc;
fdp = p->p_fd;
if (fdp->fd_refcnt > 1) {
fdp = fd_copy();
fd_free();
p->p_fd = fdp;
l->l_fd = fdp;
}
if (!fdp->fd_exclose) {
return;
}
fdp->fd_exclose = false;
dt = atomic_load_consume(&fdp->fd_dt);
for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
if ((ff = dt->dt_ff[fd]) == NULL) {
KASSERT(fd >= NDFDFILE);
continue;
}
KASSERT(fd >= NDFDFILE ||
ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (ff->ff_file == NULL)
continue;
if (ff->ff_exclose) {
/*
* We need a reference to close the file.
* No other threads can see the fdfile_t at
* this point, so don't bother locking.
*/
KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
ff->ff_refcnt++;
fd_close(fd);
}
}
}
/*
* Set the descriptor owner. If the owner is a process, 'pgid' is set
* to a positive value, the process ID. If the owner is a process
* group, 'pgid' is set to -pg_id.
*/
int
fsetown(pid_t *pgid, u_long cmd, const void *data)
{
pid_t id = *(const pid_t *)data;
int error;
if (id == INT_MIN)
return EINVAL;
switch (cmd) {
case TIOCSPGRP:
if (id < 0)
return EINVAL;
id = -id;
break;
default:
break;
}
if (id > 0) {
mutex_enter(&proc_lock);
error = proc_find(id) ? 0 : ESRCH;
mutex_exit(&proc_lock);
} else if (id < 0) {
error = pgid_in_session(curproc, -id);
} else {
error = 0;
}
if (!error) {
*pgid = id;
}
return error;
}
void
fd_set_exclose(struct lwp *l, int fd, bool exclose)
{
filedesc_t *fdp = l->l_fd;
fdfile_t *ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
ff->ff_exclose = exclose;
	if (exclose)
		fdp->fd_exclose = true;
}
/*
 * Return descriptor owner information.  If the value is positive,
 * it is a process ID.  If it is negative, it is a process group ID
 * and the sign must be removed before use.
*/
int
fgetown(pid_t pgid, u_long cmd, void *data)
{
switch (cmd) {
case TIOCGPGRP:
*(int *)data = -pgid;
break;
default:
*(int *)data = pgid;
break;
}
return 0;
}
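/*
 * Example (sketch): a driver that supports descriptor ownership would
 * typically delegate the FIOSETOWN/FIOGETOWN and TIOCSPGRP/TIOCGPGRP
 * ioctls to fsetown()/fgetown(), keeping the signed pgid in its softc.
 * The names "sc" and "sc_pgid" below are hypothetical.
 *
 *	case FIOSETOWN:
 *	case TIOCSPGRP:
 *		return fsetown(&sc->sc_pgid, cmd, data);
 *	case FIOGETOWN:
 *	case TIOCGPGRP:
 *		return fgetown(sc->sc_pgid, cmd, data);
 */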
/*
* Send signal to descriptor owner, either process or process group.
*/
void
fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
{
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
if (pgid == 0) {
return;
}
KSI_INIT(&ksi);
ksi.ksi_signo = signo;
ksi.ksi_code = code;
ksi.ksi_band = band;
mutex_enter(&proc_lock);
if (pgid > 0) {
struct proc *p1;
p1 = proc_find(pgid);
if (p1 != NULL) {
kpsignal(p1, &ksi, fdescdata);
}
} else {
struct pgrp *pgrp;
KASSERT(pgid < 0);
pgrp = pgrp_find(-pgid);
if (pgrp != NULL) {
kpgsignal(pgrp, &ksi, fdescdata, 0);
}
}
mutex_exit(&proc_lock);
}
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
void *data)
{
fdfile_t *ff;
filedesc_t *fdp;
fp->f_flag = flag & FMASK;
fdp = curproc->p_fd;
	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
	KASSERT(ff != NULL);
ff->ff_exclose = (flag & O_CLOEXEC) != 0;
fp->f_type = DTYPE_MISC;
fp->f_ops = fops;
fp->f_data = data;
curlwp->l_dupfd = fd;
fd_affix(curproc, fp, fd);
return EMOVEFD;
}
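/*
 * Example (sketch): a hypothetical cloning device open routine would
 * normally allocate a file and descriptor with fd_allocfile() and then
 * hand them to fd_clone(); the EMOVEFD return value and l_dupfd tell
 * the open path to move that descriptor into the slot open(2) was
 * about to use.  "mydevopen", "mydev_fileops" and "mydev_create" are
 * hypothetical names.
 *
 *	int
 *	mydevopen(dev_t dev, int flags, int mode, struct lwp *l)
 *	{
 *		file_t *fp;
 *		int error, fd;
 *
 *		if ((error = fd_allocfile(&fp, &fd)) != 0)
 *			return error;
 *		return fd_clone(fp, fd, flags, &mydev_fileops,
 *		    mydev_create(dev));
 *	}
 */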
int
fnullop_fcntl(file_t *fp, u_int cmd, void *data)
{
if (cmd == F_SETFL)
return 0;
return EOPNOTSUPP;
}
int
fnullop_poll(file_t *fp, int which)
{
return 0;
}
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{
return EOPNOTSUPP;
}
void
fnullop_restart(file_t *fp)
{
}
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
kauth_cred_t cred, int flags)
{
return EOPNOTSUPP;
}
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
kauth_cred_t cred, int flags)
{
return EOPNOTSUPP;
}
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{
return EOPNOTSUPP;
}
int
fbadop_stat(file_t *fp, struct stat *sb)
{
return EOPNOTSUPP;
}
int
fbadop_close(file_t *fp)
{
return EOPNOTSUPP;
}
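/*
 * Example (sketch): the fnullop_ and fbadop_ routines above exist so a
 * fileops table only has to implement the operations it cares about.
 * A hypothetical table (the exact member set varies between releases;
 * "mydev_ioctl" and "mydev_close" are made-up handlers):
 *
 *	static const struct fileops mydev_fileops = {
 *		.fo_name = "mydev",
 *		.fo_read = fbadop_read,
 *		.fo_write = fbadop_write,
 *		.fo_ioctl = mydev_ioctl,
 *		.fo_fcntl = fnullop_fcntl,
 *		.fo_poll = fnullop_poll,
 *		.fo_stat = fbadop_stat,
 *		.fo_close = mydev_close,
 *		.fo_kqfilter = fnullop_kqfilter,
 *		.fo_restart = fnullop_restart,
 *	};
 */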
/*
* sysctl routines pertaining to file descriptors
*/
/* Initialized in sysctl_init() for now... */
extern kmutex_t sysctl_file_marker_lock;
static u_int sysctl_file_marker = 1;
/*
* Expects to be called with proc_lock and sysctl_file_marker_lock locked.
*/
static void
sysctl_file_marker_reset(void)
{
struct proc *p;
PROCLIST_FOREACH(p, &allproc) {
struct filedesc *fd = p->p_fd;
fdtab_t *dt;
u_int i;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
struct file *fp;
fdfile_t *ff;
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
continue;
}
fp->f_marker = 0;
}
mutex_exit(&fd->fd_lock);
}
}
/*
* sysctl helper routine for kern.file pseudo-subtree.
*/
static int
sysctl_kern_file(SYSCTLFN_ARGS)
{
const bool allowaddr = get_expose_address(curproc);
struct filelist flist;
int error;
size_t buflen;
struct file *fp, fbuf;
char *start, *where;
struct proc *p;
start = where = oldp;
buflen = *oldlenp;
if (where == NULL) {
/*
* overestimate by 10 files
*/
*oldlenp = sizeof(filehead) + (nfiles + 10) *
sizeof(struct file);
return 0;
}
/*
* first sysctl_copyout filehead
*/
if (buflen < sizeof(filehead)) {
*oldlenp = 0;
return 0;
}
sysctl_unlock();
if (allowaddr) {
memcpy(&flist, &filehead, sizeof(flist));
} else {
memset(&flist, 0, sizeof(flist));
}
error = sysctl_copyout(l, &flist, where, sizeof(flist));
if (error) {
sysctl_relock();
return error;
}
buflen -= sizeof(flist);
where += sizeof(flist);
/*
* followed by an array of file structures
*/
mutex_enter(&sysctl_file_marker_lock);
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
struct filedesc *fd;
fdtab_t *dt;
u_int i;
if (p->p_stat == SIDL) {
/* skip embryonic processes */
continue;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
NULL, NULL);
mutex_exit(p->p_lock);
if (error != 0) {
/*
* Don't leak kauth retval if we're silently
* skipping this entry.
*/
error = 0;
continue;
}
/*
* Grab a hold on the process.
*/
if (!rw_tryenter(&p->p_reflock, RW_READER)) {
continue;
}
mutex_exit(&proc_lock);
fd = p->p_fd;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
fdfile_t *ff;
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
continue;
}
mutex_enter(&fp->f_lock);
if ((fp->f_count == 0) ||
(fp->f_marker == sysctl_file_marker)) {
mutex_exit(&fp->f_lock);
continue;
}
/* Check that we have enough space. */
if (buflen < sizeof(struct file)) {
*oldlenp = where - start;
mutex_exit(&fp->f_lock);
error = ENOMEM;
break;
}
fill_file(&fbuf, fp);
mutex_exit(&fp->f_lock);
error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf));
if (error) {
break;
}
buflen -= sizeof(struct file);
where += sizeof(struct file);
fp->f_marker = sysctl_file_marker;
}
mutex_exit(&fd->fd_lock);
/*
* Release reference to process.
*/
mutex_enter(&proc_lock);
rw_exit(&p->p_reflock);
if (error)
break;
}
sysctl_file_marker++;
/* Reset all markers if wrapped. */
if (sysctl_file_marker == 0) {
sysctl_file_marker_reset();
sysctl_file_marker++;
}
mutex_exit(&proc_lock);
mutex_exit(&sysctl_file_marker_lock);
*oldlenp = where - start;
sysctl_relock();
return error;
}
/*
* sysctl helper function for kern.file2
*/
static int
sysctl_kern_file2(SYSCTLFN_ARGS)
{
struct proc *p;
struct file *fp;
struct filedesc *fd;
struct kinfo_file kf;
char *dp;
u_int i, op;
size_t len, needed, elem_size, out_size;
int error, arg, elem_count;
fdfile_t *ff;
fdtab_t *dt;
if (namelen == 1 && name[0] == CTL_QUERY)
return sysctl_query(SYSCTLFN_CALL(rnode));
if (namelen != 4)
return EINVAL;
error = 0;
dp = oldp;
len = (oldp != NULL) ? *oldlenp : 0;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
out_size = MIN(sizeof(kf), elem_size);
needed = 0;
if (elem_size < 1 || elem_count < 0)
return EINVAL;
switch (op) {
case KERN_FILE_BYFILE:
case KERN_FILE_BYPID:
/*
* We're traversing the process list in both cases; the BYFILE
* case does additional work of keeping track of files already
* looked at.
*/
/* doesn't use arg so it must be zero */
if ((op == KERN_FILE_BYFILE) && (arg != 0))
return EINVAL;
if ((op == KERN_FILE_BYPID) && (arg < -1))
/* -1 means all processes */
return EINVAL;
sysctl_unlock();
if (op == KERN_FILE_BYFILE)
mutex_enter(&sysctl_file_marker_lock);
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
if (p->p_stat == SIDL) {
/* skip embryonic processes */
continue;
}
if (arg > 0 && p->p_pid != arg) {
/* pick only the one we want */
/* XXX want 0 to mean "kernel files" */
continue;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
NULL, NULL);
mutex_exit(p->p_lock);
if (error != 0) {
/*
* Don't leak kauth retval if we're silently
* skipping this entry.
*/
error = 0;
continue;
}
/*
* Grab a hold on the process.
*/
if (!rw_tryenter(&p->p_reflock, RW_READER)) {
continue;
}
mutex_exit(&proc_lock);
fd = p->p_fd;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) ==
NULL) {
continue;
}
if ((op == KERN_FILE_BYFILE) &&
(fp->f_marker == sysctl_file_marker)) {
continue;
}
if (len >= elem_size && elem_count > 0) {
mutex_enter(&fp->f_lock);
fill_file2(&kf, fp, ff, i, p->p_pid);
mutex_exit(&fp->f_lock);
mutex_exit(&fd->fd_lock);
error = sysctl_copyout(l,
&kf, dp, out_size);
mutex_enter(&fd->fd_lock);
if (error)
break;
dp += elem_size;
len -= elem_size;
}
if (op == KERN_FILE_BYFILE)
fp->f_marker = sysctl_file_marker;
needed += elem_size;
if (elem_count > 0 && elem_count != INT_MAX)
elem_count--;
}
mutex_exit(&fd->fd_lock);
/*
* Release reference to process.
*/
mutex_enter(&proc_lock);
rw_exit(&p->p_reflock);
}
if (op == KERN_FILE_BYFILE) {
sysctl_file_marker++;
/* Reset all markers if wrapped. */
if (sysctl_file_marker == 0) {
sysctl_file_marker_reset();
sysctl_file_marker++;
}
}
mutex_exit(&proc_lock);
if (op == KERN_FILE_BYFILE)
mutex_exit(&sysctl_file_marker_lock);
sysctl_relock();
break;
default:
return EINVAL;
}
if (oldp == NULL)
needed += KERN_FILESLOP * elem_size;
*oldlenp = needed;
return error;
}
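/*
 * Example (userland sketch): kern.file2 is queried with a six-level
 * MIB whose last four components are the op, arg, elem_size and
 * elem_count decoded above; a NULL oldp probe returns the required
 * size (plus KERN_FILESLOP slop).  Assumes <sys/sysctl.h> and a
 * target pid; error handling omitted.
 *
 *	int mib[6] = { CTL_KERN, KERN_FILE2, KERN_FILE_BYPID, pid,
 *	    sizeof(struct kinfo_file), INT_MAX };
 *	size_t len = 0;
 *	struct kinfo_file *kf;
 *
 *	sysctl(mib, 6, NULL, &len, NULL, 0);
 *	kf = malloc(len);
 *	sysctl(mib, 6, kf, &len, NULL, 0);
 */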
static void
fill_file(struct file *fp, const struct file *fpsrc)
{
const bool allowaddr = get_expose_address(curproc);
memset(fp, 0, sizeof(*fp));
fp->f_offset = fpsrc->f_offset;
COND_SET_PTR(fp->f_cred, fpsrc->f_cred, allowaddr);
COND_SET_CPTR(fp->f_ops, fpsrc->f_ops, allowaddr);
COND_SET_STRUCT(fp->f_undata, fpsrc->f_undata, allowaddr);
COND_SET_STRUCT(fp->f_list, fpsrc->f_list, allowaddr);
fp->f_flag = fpsrc->f_flag;
fp->f_marker = fpsrc->f_marker;
fp->f_type = fpsrc->f_type;
fp->f_advice = fpsrc->f_advice;
fp->f_count = fpsrc->f_count;
fp->f_msgcount = fpsrc->f_msgcount;
fp->f_unpcount = fpsrc->f_unpcount;
COND_SET_STRUCT(fp->f_unplist, fpsrc->f_unplist, allowaddr);
}
static void
fill_file2(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff,
int i, pid_t pid)
{
const bool allowaddr = get_expose_address(curproc);
memset(kp, 0, sizeof(*kp));
COND_SET_VALUE(kp->ki_fileaddr, PTRTOUINT64(fp), allowaddr);
kp->ki_flag = fp->f_flag;
kp->ki_iflags = 0;
kp->ki_ftype = fp->f_type;
kp->ki_count = fp->f_count;
kp->ki_msgcount = fp->f_msgcount;
COND_SET_VALUE(kp->ki_fucred, PTRTOUINT64(fp->f_cred), allowaddr);
kp->ki_fuid = kauth_cred_geteuid(fp->f_cred);
kp->ki_fgid = kauth_cred_getegid(fp->f_cred);
COND_SET_VALUE(kp->ki_fops, PTRTOUINT64(fp->f_ops), allowaddr);
kp->ki_foffset = fp->f_offset;
COND_SET_VALUE(kp->ki_fdata, PTRTOUINT64(fp->f_data), allowaddr);
/* vnode information to glue this file to something */
if (fp->f_type == DTYPE_VNODE) {
struct vnode *vp = fp->f_vnode;
COND_SET_VALUE(kp->ki_vun, PTRTOUINT64(vp->v_un.vu_socket),
allowaddr);
kp->ki_vsize = vp->v_size;
kp->ki_vtype = vp->v_type;
kp->ki_vtag = vp->v_tag;
COND_SET_VALUE(kp->ki_vdata, PTRTOUINT64(vp->v_data),
allowaddr);
}
/* process information when retrieved via KERN_FILE_BYPID */
if (ff != NULL) {
kp->ki_pid = pid;
kp->ki_fd = i;
kp->ki_ofileflags = ff->ff_exclose;
kp->ki_usecount = ff->ff_refcnt;
}
}
/* $NetBSD: scsi_base.c,v 1.93 2019/05/03 16:06:56 mlelstv Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scsi_base.c,v 1.93 2019/05/03 16:06:56 mlelstv Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/errno.h>
#include <sys/device.h>
#include <sys/proc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_disk.h>
#include <dev/scsipi/scsiconf.h>
#include <dev/scsipi/scsipi_base.h>
static void scsi_print_xfer_mode(struct scsipi_periph *);
/*
* Do a scsi operation, asking a device to run as SCSI-II if it can.
*/
int
scsi_change_def(struct scsipi_periph *periph, int flags)
{
struct scsi_changedef cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_CHANGE_DEFINITION;
cmd.how = SC_SCSI_2;
return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, 100000, NULL, flags));
}
/*
* ask the scsi driver to perform a command for us.
* tell it where to read/write the data, and how
* long the data is supposed to be. If we have a buf
* to associate with the transfer, we need that too.
*/
void
scsi_scsipi_cmd(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
SC_DEBUG(periph, SCSIPI_DB2, ("scsi_scsipi_cmd\n"));
/*
* Set the LUN in the CDB if we have an older device. We also
* set it for more modern SCSI-2 devices "just in case".
*/
if (periph->periph_version <= 2)
xs->cmd->bytes[0] |=
((periph->periph_lun << SCSI_CMD_LUN_SHIFT) &
SCSI_CMD_LUN_MASK);
}
/*
* Utility routines often used in SCSI stuff
*/
/*
* Print out the periph's address info.
*/
void
scsi_print_addr(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
struct scsipi_adapter *adapt = chan->chan_adapter;
printf("%s(%s:%d:%d:%d): ", periph->periph_dev != NULL ?
device_xname(periph->periph_dev) : "probe",
device_xname(adapt->adapt_dev),
chan->chan_channel, periph->periph_target,
periph->periph_lun);
}
/*
* Kill off all pending xfers for a periph.
*
* Must be called with channel lock held
*/
void
scsi_kill_pending(struct scsipi_periph *periph)
{
struct scsipi_xfer *xs;
TAILQ_FOREACH(xs, &periph->periph_xferq, device_q) {
callout_stop(&xs->xs_callout);
scsi_print_addr(periph);
printf("killed ");
scsipi_print_cdb(xs->cmd);
xs->error = XS_DRIVER_STUFFUP;
scsipi_done(xs);
}
}
/*
* scsi_print_xfer_mode:
*
* Print a parallel SCSI periph's capabilities.
*/
static void
scsi_print_xfer_mode(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
struct scsipi_adapter *adapt = chan->chan_adapter;
int period, freq, speed, mbs;
if (periph->periph_dev)
aprint_normal_dev(periph->periph_dev, "");
else
aprint_normal("probe(%s:%d:%d:%d): ",
device_xname(adapt->adapt_dev),
chan->chan_channel, periph->periph_target,
periph->periph_lun);
if (periph->periph_mode & (PERIPH_CAP_SYNC | PERIPH_CAP_DT)) {
period = scsipi_sync_factor_to_period(periph->periph_period);
aprint_normal("sync (%d.%02dns offset %d)",
period / 100, period % 100, periph->periph_offset);
} else
aprint_normal("async");
if (periph->periph_mode & PERIPH_CAP_WIDE32)
aprint_normal(", 32-bit");
else if (periph->periph_mode & (PERIPH_CAP_WIDE16 | PERIPH_CAP_DT))
aprint_normal(", 16-bit");
else
aprint_normal(", 8-bit");
if (periph->periph_mode & (PERIPH_CAP_SYNC | PERIPH_CAP_DT)) {
freq = scsipi_sync_factor_to_freq(periph->periph_period);
speed = freq;
if (periph->periph_mode & PERIPH_CAP_WIDE32)
speed *= 4;
else if (periph->periph_mode &
(PERIPH_CAP_WIDE16 | PERIPH_CAP_DT))
speed *= 2;
mbs = speed / 1000;
if (mbs > 0) {
aprint_normal(" (%d.%03dMB/s)", mbs,
speed % 1000);
} else
aprint_normal(" (%dKB/s)", speed % 1000);
}
aprint_normal(" transfers");
if (periph->periph_mode & PERIPH_CAP_TQING)
aprint_normal(", tagged queueing");
aprint_normal("\n");
}
/*
* scsi_async_event_xfer_mode:
*
* Update the xfer mode for all parallel SCSI periphs sharing the
* specified I_T Nexus.
*/
void
scsi_async_event_xfer_mode(struct scsipi_channel *chan, void *arg)
{
struct scsipi_xfer_mode *xm = arg;
struct scsipi_periph *periph;
int lun, announce, mode, period, offset;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, xm->xm_target, lun);
if (periph == NULL)
continue;
announce = 0;
/*
* Clamp the xfer mode down to this periph's capabilities.
*/
mode = xm->xm_mode & periph->periph_cap;
if (mode & PERIPH_CAP_SYNC) {
period = xm->xm_period;
offset = xm->xm_offset;
} else {
period = 0;
offset = 0;
}
/*
* If we do not have a valid xfer mode yet, or the parameters
* are different, announce them.
*/
if ((periph->periph_flags & PERIPH_MODE_VALID) == 0 ||
periph->periph_mode != mode ||
periph->periph_period != period ||
periph->periph_offset != offset)
announce = 1;
periph->periph_mode = mode;
periph->periph_period = period;
periph->periph_offset = offset;
periph->periph_flags |= PERIPH_MODE_VALID;
if (announce)
scsi_print_xfer_mode(periph);
}
}
/*
* scsipi_async_event_xfer_mode:
*
* Update the xfer mode for all SAS/FC periphs sharing the
* specified I_T Nexus.
*/
void
scsi_fc_sas_async_event_xfer_mode(struct scsipi_channel *chan, void *arg)
{
struct scsipi_xfer_mode *xm = arg;
struct scsipi_periph *periph;
int lun, announce, mode;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, xm->xm_target, lun);
if (periph == NULL)
continue;
announce = 0;
/*
* Clamp the xfer mode down to this periph's capabilities.
*/
mode = xm->xm_mode & periph->periph_cap;
/*
* If we do not have a valid xfer mode yet, or the parameters
* are different, announce them.
*/
if ((periph->periph_flags & PERIPH_MODE_VALID) == 0 ||
periph->periph_mode != mode)
announce = 1;
periph->periph_mode = mode;
periph->periph_flags |= PERIPH_MODE_VALID;
if (announce &&
(periph->periph_mode & PERIPH_CAP_TQING) != 0) {
aprint_normal_dev(periph->periph_dev,
"tagged queueing\n");
}
}
}
/* $NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $ */
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_prof.c 8.4 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $");
#ifdef _KERNEL_OPT
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/cpu.h>
#ifdef GPROF
#include <sys/malloc.h>
#include <sys/gmon.h>
#include <sys/xcall.h>
MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
static int sysctl_kern_profiling(SYSCTLFN_ARGS);
#ifdef MULTIPROCESSOR
void _gmonparam_merge(struct gmonparam *, struct gmonparam *);
#endif
/*
* Froms is actually a bunch of unsigned shorts indexing tos
*/
struct gmonparam _gmonparam = { .state = GMON_PROF_OFF };
/* Actual start of the kernel text segment. */
extern char kernel_text[];
extern char etext[];
void
kmstartup(void)
{
char *cp;
struct gmonparam *p = &_gmonparam;
unsigned long size;
/*
* Round lowpc and highpc to multiples of the density we're using
* so the rest of the scaling (here and in gprof) stays in ints.
*/
p->lowpc = rounddown(((u_long)kernel_text),
HISTFRACTION * sizeof(HISTCOUNTER));
p->highpc = roundup((u_long)etext,
HISTFRACTION * sizeof(HISTCOUNTER));
p->textsize = p->highpc - p->lowpc;
printf("Profiling kernel, textsize=%ld [%lx..%lx]\n",
p->textsize, p->lowpc, p->highpc);
p->kcountsize = p->textsize / HISTFRACTION;
p->hashfraction = HASHFRACTION;
p->fromssize = p->textsize / HASHFRACTION;
p->tolimit = p->textsize * ARCDENSITY / 100;
if (p->tolimit < MINARCS)
p->tolimit = MINARCS;
else if (p->tolimit > MAXARCS)
p->tolimit = MAXARCS;
p->tossize = p->tolimit * sizeof(struct tostruct);
size = p->kcountsize + p->fromssize + p->tossize;
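	/*
	 * Each buffer allocated below lays the three arrays out back to
	 * back: tos first, then kcount, then froms.  In the
	 * MULTIPROCESSOR case every per-CPU buffer is additionally
	 * prefixed by its own struct gmonparam:
	 *
	 *	[gmonparam][tos: tossize][kcount: kcountsize][froms: fromssize]
	 */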
#ifdef MULTIPROCESSOR
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
p = malloc(sizeof(struct gmonparam) + size, M_GPROF,
M_NOWAIT | M_ZERO);
if (p == NULL) {
printf("No memory for profiling on %s\n",
cpu_name(ci));
/* cannot profile on this cpu */
continue;
}
memcpy(p, &_gmonparam, sizeof(_gmonparam));
ci->ci_gmon = p;
/*
* To allow profiling to be controlled only by the global
* _gmonparam.state, set the default value for each CPU to
* GMON_PROF_ON. If _gmonparam.state is not ON, mcount will
* not be executed.
		 * This is for compatibility with the kgmon(8) kmem interface.
*/
p->state = GMON_PROF_ON;
cp = (char *)(p + 1);
p->tos = (struct tostruct *)cp;
p->kcount = (u_short *)(cp + p->tossize);
p->froms = (u_short *)(cp + p->tossize + p->kcountsize);
}
sysctl_createv(NULL, 0, NULL, NULL,
0, CTLTYPE_NODE, "percpu",
SYSCTL_DESCR("per cpu profiling information"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, CTL_EOL);
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
sysctl_createv(NULL, 0, NULL, NULL,
0, CTLTYPE_NODE, cpu_name(ci),
NULL,
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_INT, "state",
SYSCTL_DESCR("Profiling state"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_STATE, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "count",
SYSCTL_DESCR("Array of statistical program counters"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_COUNT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "froms",
SYSCTL_DESCR("Array indexed by program counter of "
"call-from points"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_FROMS, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "tos",
SYSCTL_DESCR("Array of structures describing "
"destination of calls and their counts"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_TOS, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "gmonparam",
SYSCTL_DESCR("Structure giving the sizes of the above "
"arrays"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_GMONPARAM, CTL_EOL);
}
/*
	 * For minimal compatibility with the kgmon(8) kmem interface,
	 * _gmonparam and cpu0's ci_gmon share buffers.
*/
p = curcpu()->ci_gmon;
if (p != NULL) {
_gmonparam.tos = p->tos;
_gmonparam.kcount = p->kcount;
_gmonparam.froms = p->froms;
}
#else /* MULTIPROCESSOR */
cp = malloc(size, M_GPROF, M_NOWAIT | M_ZERO);
	if (cp == NULL) {
printf("No memory for profiling.\n");
return;
}
p->tos = (struct tostruct *)cp;
cp += p->tossize;
p->kcount = (u_short *)cp;
cp += p->kcountsize;
p->froms = (u_short *)cp;
#endif /* MULTIPROCESSOR */
}
#ifdef MULTIPROCESSOR
static void
prof_set_state_xc(void *arg1, void *arg2 __unused)
{
int state = PTRTOUINT64(arg1);
struct gmonparam *gp = curcpu()->ci_gmon;
if (gp != NULL)
gp->state = state;
}
#endif /* MULTIPROCESSOR */
/*
 * sysctl helper routine for the kern.profiling subtree: returns kernel
 * profiling information, and enables/disables kernel profiling and
 * hands out copies of the profiling data.
 */
static int
sysctl_kern_profiling(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
struct gmonparam *gp;
int error;
#ifdef MULTIPROCESSOR
CPU_INFO_ITERATOR cii;
struct cpu_info *ci, *target_ci;
uint64_t where;
int state;
bool prof_on, do_merge;
target_ci = (struct cpu_info *)rnode->sysctl_data;
do_merge = (oldp != NULL) && (target_ci == NULL) &&
((node.sysctl_num == GPROF_COUNT) ||
(node.sysctl_num == GPROF_FROMS) ||
(node.sysctl_num == GPROF_TOS));
if (do_merge) {
/* kern.profiling.{count,froms,tos} */
unsigned long size;
char *cp;
/* allocate temporary gmonparam, and merge results of all CPU */
size = _gmonparam.kcountsize + _gmonparam.fromssize +
_gmonparam.tossize;
gp = malloc(sizeof(struct gmonparam) + size, M_GPROF,
M_NOWAIT | M_ZERO);
if (gp == NULL)
return ENOMEM;
memcpy(gp, &_gmonparam, sizeof(_gmonparam));
cp = (char *)(gp + 1);
gp->tos = (struct tostruct *)cp;
gp->kcount = (u_short *)(cp + gp->tossize);
gp->froms = (u_short *)(cp + gp->tossize + gp->kcountsize);
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
_gmonparam_merge(gp, ci->ci_gmon);
}
} else if (target_ci != NULL) {
/* kern.profiling.percpu.* */
gp = target_ci->ci_gmon;
} else {
/* kern.profiling.{state,gmonparam} */
gp = &_gmonparam;
}
#else /* MULTIPROCESSOR */
gp = &_gmonparam;
#endif
switch (node.sysctl_num) {
case GPROF_STATE:
#ifdef MULTIPROCESSOR
/*
* if _gmonparam.state is OFF, the state of each CPU is
* considered to be OFF, even if it is actually ON.
*/
if (_gmonparam.state == GMON_PROF_OFF ||
gp->state == GMON_PROF_OFF)
state = GMON_PROF_OFF;
else
state = GMON_PROF_ON;
node.sysctl_data = &state;
#else
node.sysctl_data = &gp->state;
#endif
break;
case GPROF_COUNT:
node.sysctl_data = gp->kcount;
node.sysctl_size = gp->kcountsize;
break;
case GPROF_FROMS:
node.sysctl_data = gp->froms;
node.sysctl_size = gp->fromssize;
break;
case GPROF_TOS:
node.sysctl_data = gp->tos;
node.sysctl_size = gp->tossize;
break;
case GPROF_GMONPARAM:
node.sysctl_data = gp;
node.sysctl_size = sizeof(*gp);
break;
default:
return (EOPNOTSUPP);
}
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
goto done;
#ifdef MULTIPROCESSOR
switch (node.sysctl_num) {
case GPROF_STATE:
if (target_ci != NULL) {
where = xc_unicast(0, prof_set_state_xc,
UINT64TOPTR(state), NULL, target_ci);
xc_wait(where);
			/* If any CPU is being profiled, enable the profiling clock. */
prof_on = false;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
if (ci->ci_gmon->state != GMON_PROF_OFF) {
prof_on = true;
break;
}
}
mutex_spin_enter(&proc0.p_stmutex);
if (prof_on)
startprofclock(&proc0);
else
stopprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
if (prof_on) {
_gmonparam.state = GMON_PROF_ON;
} else {
_gmonparam.state = GMON_PROF_OFF;
/*
				 * When _gmonparam.state and all per-CPU gmon
				 * states are OFF, set every CPU state back to
				 * ON so that profiling of all CPUs can again
				 * be controlled by _gmonparam.state alone.
*/
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
ci->ci_gmon->state = GMON_PROF_ON;
}
}
} else {
_gmonparam.state = state;
where = xc_broadcast(0, prof_set_state_xc,
UINT64TOPTR(state), NULL);
xc_wait(where);
mutex_spin_enter(&proc0.p_stmutex);
if (state == GMON_PROF_OFF)
stopprofclock(&proc0);
else
startprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
}
break;
case GPROF_COUNT:
/*
* if 'kern.profiling.{count,froms,tos}' is written, the same
* data will be written to 'kern.profiling.percpu.cpuN.xxx'
*/
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->kcount, gp->kcount,
newlen);
}
}
break;
case GPROF_FROMS:
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->froms, gp->froms, newlen);
}
}
break;
case GPROF_TOS:
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->tos, gp->tos, newlen);
}
}
break;
}
#else
if (node.sysctl_num == GPROF_STATE) {
mutex_spin_enter(&proc0.p_stmutex);
if (gp->state == GMON_PROF_OFF)
stopprofclock(&proc0);
else
startprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
}
#endif
done:
#ifdef MULTIPROCESSOR
if (do_merge)
free(gp, M_GPROF);
#endif
return error;
}
SYSCTL_SETUP(sysctl_kern_gprof_setup, "sysctl kern.profiling subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "profiling",
SYSCTL_DESCR("Profiling information (available)"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "state",
SYSCTL_DESCR("Profiling state"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_STATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "count",
SYSCTL_DESCR("Array of statistical program counters"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_COUNT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "froms",
SYSCTL_DESCR("Array indexed by program counter of "
"call-from points"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_FROMS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "tos",
SYSCTL_DESCR("Array of structures describing "
"destination of calls and their counts"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_TOS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "gmonparam",
SYSCTL_DESCR("Structure giving the sizes of the above "
"arrays"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_GMONPARAM, CTL_EOL);
}
#endif /* GPROF */
/*
* Profiling system call.
*
* The scale factor is a fixed point number with 16 bits of fraction, so that
* 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
*/
/* ARGSUSED */
int
sys_profil(struct lwp *l, const struct sys_profil_args *uap, register_t *retval)
{
/* {
syscallarg(char *) samples;
syscallarg(size_t) size;
syscallarg(u_long) offset;
syscallarg(u_int) scale;
} */
struct proc *p = l->l_proc;
struct uprof *upp;
if (SCARG(uap, scale) > (1 << 16))
return (EINVAL);
if (SCARG(uap, scale) == 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
return (0);
}
upp = &p->p_stats->p_prof;
/* Block profile interrupts while changing state. */
mutex_spin_enter(&p->p_stmutex);
upp->pr_off = SCARG(uap, offset);
upp->pr_scale = SCARG(uap, scale);
upp->pr_base = SCARG(uap, samples);
upp->pr_size = SCARG(uap, size);
startprofclock(p);
mutex_spin_exit(&p->p_stmutex);
return (0);
}
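/*
 * Example (userland sketch): a process turns statistical profiling on
 * by handing the kernel a sample buffer, the address its text starts
 * at and a scale, and turns it off again with a zero scale.
 * "text_start" and BUFLEN are hypothetical.
 *
 *	u_short buf[BUFLEN];
 *
 *	profil((char *)buf, sizeof(buf), text_start, 0x8000);
 *	...
 *	profil(NULL, 0, 0, 0);
 */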
/*
* Scale is a fixed-point number with the binary point 16 bits
* into the value, and is <= 1.0. pc is at most 32 bits, so the
* intermediate result is at most 48 bits.
*/
#define PC_TO_INDEX(pc, prof) \
((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
(u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
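/*
 * Worked example: with pr_scale = 0x8000 (i.e. 0.5), a pc lying 0x1000
 * bytes past pr_off maps to ((0x1000 * 0x8000) >> 16) & ~1 = 0x800, an
 * even byte offset into pr_base; each 16-bit counter therefore covers
 * four bytes of text at that scale.
 */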
/*
* Collect user-level profiling statistics; called on a profiling tick,
* when a process is running in user-mode. This routine may be called
* from an interrupt context. We schedule an AST that will vector us
* to trap() with a context in which copyin and copyout will work.
* Trap will then call addupc_task().
*
* XXX We could use ufetch/ustore here if the profile buffers were
* wired.
*
* Note that we may (rarely) not get around to the AST soon enough, and
* lose profile ticks when the next tick overwrites this one, but in this
* case the system is overloaded and the profile is probably already
* inaccurate.
*/
void
addupc_intr(struct lwp *l, u_long pc)
{
struct uprof *prof;
struct proc *p;
u_int i;
p = l->l_proc;
KASSERT(mutex_owned(&p->p_stmutex));
prof = &p->p_stats->p_prof;
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
return; /* out of range; ignore */
mutex_spin_exit(&p->p_stmutex);
/* XXXSMP */
prof->pr_addr = pc;
prof->pr_ticks++;
cpu_need_proftick(l);
mutex_spin_enter(&p->p_stmutex);
}
/*
* Much like before, but we can afford to take faults here. If the
* update fails, we simply turn off profiling.
*/
void
addupc_task(struct lwp *l, u_long pc, u_int ticks)
{
struct uprof *prof;
struct proc *p;
void *addr;
int error;
u_int i;
u_short v;
p = l->l_proc;
if (ticks == 0)
return;
mutex_spin_enter(&p->p_stmutex);
prof = &p->p_stats->p_prof;
/* Testing P_PROFIL may be unnecessary, but is certainly safe. */
if ((p->p_stflag & PST_PROFIL) == 0 || pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
mutex_spin_exit(&p->p_stmutex);
return;
}
addr = prof->pr_base + i;
mutex_spin_exit(&p->p_stmutex);
if ((error = copyin(addr, (void *)&v, sizeof(v))) == 0) {
v += ticks;
error = copyout((void *)&v, addr, sizeof(v));
}
if (error != 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
}
/* $NetBSD: ip6_mroute.c,v 1.132 2020/06/12 11:04:45 roy Exp $ */
/* $KAME: ip6_mroute.c,v 1.49 2001/07/25 09:21:18 jinmei Exp $ */
/*
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
*/
/*
* Copyright (c) 1989 Stephen Deering
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
*/
/*
* IP multicast forwarding procedures
*
* Written by David Waitzman, BBN Labs, August 1988.
* Modified by Steve Deering, Stanford, February 1989.
* Modified by Mark J. Steiglitz, Stanford, May, 1991
* Modified by Van Jacobson, LBL, January 1993
* Modified by Ajit Thyagarajan, PARC, August 1993
* Modified by Bill Fenner, PARC, April 1994
*
* MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_mroute.c,v 1.132 2020/06/12 11:04:45 roy Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_mrouting.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <net/net_stats.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/scope6_var.h>
#include <netinet6/pim6.h>
#include <netinet6/pim6_var.h>
#include <netinet6/nd6.h>
static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *);
static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
static int set_pim6(int *);
static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in6 *);
static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
/*
* Globals. All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static,
* except for netstat or debugging purposes.
*/
struct socket *ip6_mrouter = NULL;
int ip6_mrouter_ver = 0;
int ip6_mrtproto = IPPROTO_PIM; /* for netstat only */
struct mrt6stat mrt6stat;
#define NO_RTE_FOUND 0x1
#define RTE_FOUND 0x2
struct mf6c *mf6ctable[MF6CTBLSIZ];
u_char n6expire[MF6CTBLSIZ];
struct mif6 mif6table[MAXMIFS];
#ifdef MRT6DEBUG
u_int mrt6debug = 0; /* debug level */
#define DEBUG_MFC 0x02
#define DEBUG_FORWARD 0x04
#define DEBUG_EXPIRE 0x08
#define DEBUG_XMIT 0x10
#define DEBUG_REG 0x20
#define DEBUG_PIM 0x40
#define __mrt6debugused /* empty */
#else
#define __mrt6debugused __unused
#endif
static void expire_upcalls(void *);
#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
#define UPCALL_EXPIRE 6 /* number of timeouts */
#ifdef INET
#ifdef MROUTING
extern struct socket *ip_mrouter;
#endif
#endif
/*
* 'Interfaces' associated with decapsulator (so we can tell
* packets that went through it from ones that get reflected
* by a broken gateway). These interfaces are never linked into
* the system ifnet list & no routes point to them. I.e., packets
* can't be sent this way. They only exist as a placeholder for
* multicast source verification.
*/
struct ifnet multicast_register_if6;
#define ENCAP_HOPS 64
/*
* Private variables.
*/
static mifi_t nummifs = 0;
static mifi_t reg_mif_num = (mifi_t)-1;
static percpu_t *pim6stat_percpu;
#define PIM6_STATINC(x) _NET_STATINC(pim6stat_percpu, x)
static int pim6;
/*
* Hash function for a source, group entry
*/
#define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \
(a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \
(g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \
(g).s6_addr32[2] ^ (g).s6_addr32[3])
/*
 * Find a route for a given origin IPv6 address and multicast group address.
 * A quality-of-service parameter may be added in the future.
*/
#define MF6CFIND(o, g, rt) do { \
struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \
rt = NULL; \
mrt6stat.mrt6s_mfc_lookups++; \
while (_rt) { \
if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \
IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \
(_rt->mf6c_stall == NULL)) { \
rt = _rt; \
break; \
} \
_rt = _rt->mf6c_next; \
} \
if (rt == NULL) { \
mrt6stat.mrt6s_mfc_misses++; \
} \
} while (/*CONSTCOND*/ 0)
/*
* Macros to compute elapsed time efficiently
* Borrowed from Van Jacobson's scheduling code
*/
#define TV_DELTA(a, b, delta) do { \
int xxs; \
\
delta = (a).tv_usec - (b).tv_usec; \
if ((xxs = (a).tv_sec - (b).tv_sec)) { \
switch (xxs) { \
case 2: \
delta += 1000000; \
/* FALLTHROUGH */ \
case 1: \
delta += 1000000; \
break; \
default: \
delta += (1000000 * xxs); \
} \
} \
} while (/*CONSTCOND*/ 0)
#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
#ifdef UPCALL_TIMING
#define UPCALL_MAX 50
u_long upcall_data[UPCALL_MAX + 1];
static void collate();
#endif /* UPCALL_TIMING */
static int get_sg_cnt(struct sioc_sg_req6 *);
static int get_mif6_cnt(struct sioc_mif_req6 *);
static int ip6_mrouter_init(struct socket *, int, int);
static int add_m6if(struct mif6ctl *);
static int del_m6if(mifi_t *);
static int add_m6fc(struct mf6cctl *);
static int del_m6fc(struct mf6cctl *);
static void sysctl_net_inet6_pim6_setup(struct sysctllog **);
static callout_t expire_upcalls_ch;
void
pim6_init(void)
{
sysctl_net_inet6_pim6_setup(NULL);
pim6stat_percpu = percpu_alloc(sizeof(uint64_t) * PIM6_NSTATS);
}
/*
* Handle MRT setsockopt commands to modify the multicast routing tables.
*/
int
ip6_mrouter_set(struct socket *so, struct sockopt *sopt)
{
int error, optval;
struct mif6ctl mifc;
struct mf6cctl mfcc;
mifi_t mifi;
if (sopt->sopt_name != MRT6_INIT && so != ip6_mrouter)
return (EACCES);
error = 0;
switch (sopt->sopt_name) {
#ifdef MRT6_OINIT
case MRT6_OINIT:
#endif
case MRT6_INIT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
return (ip6_mrouter_init(so, optval, sopt->sopt_name));
case MRT6_DONE:
return (ip6_mrouter_done());
case MRT6_ADD_MIF:
error = sockopt_get(sopt, &mifc, sizeof(mifc));
if (error)
break;
return (add_m6if(&mifc));
case MRT6_DEL_MIF:
error = sockopt_get(sopt, &mifi, sizeof(mifi));
if (error)
break;
return (del_m6if(&mifi));
case MRT6_ADD_MFC:
error = sockopt_get(sopt, &mfcc, sizeof(mfcc));
if (error)
break;
return (add_m6fc(&mfcc));
case MRT6_DEL_MFC:
error = sockopt_get(sopt, &mfcc, sizeof(mfcc));
if (error)
break;
return (del_m6fc(&mfcc));
case MRT6_PIM:
error = sockopt_getint(sopt, &optval);
if (error)
break;
return (set_pim6(&optval));
default:
error = EOPNOTSUPP;
}
return (error);
}
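/*
 * Example (userland sketch): an IPv6 multicast routing daemon drives
 * this interface with setsockopt(2) on a raw ICMPv6 socket, roughly as
 * follows ("s", "mifc" and "mfcc" are hypothetical, and their fields
 * must be filled in first):
 *
 *	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
 *	int v = 1;
 *
 *	setsockopt(s, IPPROTO_IPV6, MRT6_INIT, &v, sizeof(v));
 *	setsockopt(s, IPPROTO_IPV6, MRT6_ADD_MIF, &mifc, sizeof(mifc));
 *	setsockopt(s, IPPROTO_IPV6, MRT6_ADD_MFC, &mfcc, sizeof(mfcc));
 *	...
 *	setsockopt(s, IPPROTO_IPV6, MRT6_DONE, NULL, 0);
 */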
/*
* Handle MRT getsockopt commands
*/
int
ip6_mrouter_get(struct socket *so, struct sockopt *sopt)
{
int error;
if (so != ip6_mrouter)
return EACCES;
error = 0;
switch (sopt->sopt_name) {
case MRT6_PIM:
error = sockopt_set(sopt, &pim6, sizeof(pim6));
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
/*
* Handle ioctl commands to obtain information from the cache
*/
int
mrt6_ioctl(u_long cmd, void *data)
{
switch (cmd) {
case SIOCGETSGCNT_IN6:
return (get_sg_cnt((struct sioc_sg_req6 *)data));
case SIOCGETMIFCNT_IN6:
return (get_mif6_cnt((struct sioc_mif_req6 *)data));
default:
return (EINVAL);
}
}
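/*
 * Example (userland sketch): the per-flow counters can be read back
 * over the same socket with ioctl(2) after filling in the source and
 * group ("s", "source_sin6" and "group_sin6" are hypothetical):
 *
 *	struct sioc_sg_req6 req;
 *
 *	memset(&req, 0, sizeof(req));
 *	req.src = source_sin6;
 *	req.grp = group_sin6;
 *	ioctl(s, SIOCGETSGCNT_IN6, &req);
 */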
/*
 * Return the packet, byte, and rpf-failure counts for the source/group provided
*/
static int
get_sg_cnt(struct sioc_sg_req6 *req)
{
struct mf6c *rt;
int s;
s = splsoftnet();
MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt);
splx(s);
if (rt != NULL) {
req->pktcnt = rt->mf6c_pkt_cnt;
req->bytecnt = rt->mf6c_byte_cnt;
req->wrong_if = rt->mf6c_wrong_if;
} else
return (ESRCH);
#if 0
req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
#endif
return 0;
}
/*
 * Return the input and output packet and byte counts for the mif provided
*/
static int
get_mif6_cnt(struct sioc_mif_req6 *req)
{
mifi_t mifi = req->mifi;
if (mifi >= nummifs)
return EINVAL;
req->icount = mif6table[mifi].m6_pkt_in;
req->ocount = mif6table[mifi].m6_pkt_out;
req->ibytes = mif6table[mifi].m6_bytes_in;
req->obytes = mif6table[mifi].m6_bytes_out;
return 0;
}
static int
set_pim6(int *i)
{
if ((*i != 1) && (*i != 0))
return EINVAL;
pim6 = *i;
return 0;
}
/*
* Enable multicast routing
*/
static int
ip6_mrouter_init(struct socket *so, int v, int cmd)
{
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG,
"ip6_mrouter_init: so_type = %d, pr_protocol = %d\n",
so->so_type, so->so_proto->pr_protocol);
#endif
if (so->so_type != SOCK_RAW ||
so->so_proto->pr_protocol != IPPROTO_ICMPV6)
return EOPNOTSUPP;
if (v != 1)
return ENOPROTOOPT;
if (ip6_mrouter != NULL)
return EADDRINUSE;
ip6_mrouter = so;
ip6_mrouter_ver = cmd;
memset((void *)mf6ctable, 0, sizeof(mf6ctable));
memset((void *)n6expire, 0, sizeof(n6expire));
	pim6 = 0;	/* used for stubbing out/in pim stuff */
callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE);
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
expire_upcalls, NULL);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "ip6_mrouter_init\n");
#endif
return 0;
}
/*
* Disable multicast routing
*/
int
ip6_mrouter_done(void)
{
mifi_t mifi;
int i;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
struct mf6c *rt;
struct rtdetq *rte;
int s;
s = splsoftnet();
/*
* For each phyint in use, disable promiscuous reception of all IPv6
* multicasts.
*/
#ifdef INET
#ifdef MROUTING
/*
	 * If an IPv4 multicast routing daemon is still running, we keep
	 * the interfaces receiving all multicast packets.
* XXX: there may be an interface in which the IPv4 multicast
* daemon is not interested...
*/
if (!ip_mrouter)
#endif
#endif
{
		for (mifi = 0; mifi < nummifs; mifi++) {
			if (mif6table[mifi].m6_ifp &&
			    !(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
ifp = mif6table[mifi].m6_ifp;
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
if_mcast_op(ifp, SIOCDELMULTI,
sin6tocsa(&sin6));
}
}
}
memset((void *)mif6table, 0, sizeof(mif6table));
nummifs = 0;
pim6 = 0; /* used to stub out/in pim specific code */
callout_stop(&expire_upcalls_ch);
/*
* Free all multicast forwarding cache entries.
*/
for (i = 0; i < MF6CTBLSIZ; i++) {
rt = mf6ctable[i];
while (rt) {
struct mf6c *frt;
for (rte = rt->mf6c_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
}
frt = rt;
rt = rt->mf6c_next;
free(frt, M_MRTABLE);
}
}
memset((void *)mf6ctable, 0, sizeof(mf6ctable));
/*
* Reset register interface
*/
	if (reg_mif_num != (mifi_t)-1) {
		if_detach(&multicast_register_if6);
reg_mif_num = (mifi_t)-1;
}
ip6_mrouter = NULL;
ip6_mrouter_ver = 0;
splx(s);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "ip6_mrouter_done\n");
#endif
return 0;
}
void
ip6_mrouter_detach(struct ifnet *ifp)
{
struct rtdetq *rte;
struct mf6c *mfc;
mifi_t mifi;
int i;
if (ip6_mrouter == NULL)
return;
/*
* Delete a mif which points to ifp.
*/
for (mifi = 0; mifi < nummifs; mifi++)
if (mif6table[mifi].m6_ifp == ifp)
del_m6if(&mifi);
/*
* Clear rte->ifp of cache entries received on ifp.
*/
for (i = 0; i < MF6CTBLSIZ; i++) {
if (n6expire[i] == 0)
continue;
for (mfc = mf6ctable[i]; mfc != NULL; mfc = mfc->mf6c_next) {
for (rte = mfc->mf6c_stall; rte != NULL; rte = rte->next) {
if (rte->ifp == ifp)
rte->ifp = NULL;
}
}
}
}
/*
* Add a mif to the mif table
*/
static int
add_m6if(struct mif6ctl *mifcp)
{
struct mif6 *mifp;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
int error, s;
if (mifcp->mif6c_mifi >= MAXMIFS)
return EINVAL;
mifp = mif6table + mifcp->mif6c_mifi;
if (mifp->m6_ifp)
return EADDRINUSE; /* XXX: is it appropriate? */
if (!mifcp->mif6c_pifi || (ifp = if_byindex(mifcp->mif6c_pifi)) == NULL)
return ENXIO;
if (mifcp->mif6c_flags & MIFF_REGISTER) {
ifp = &multicast_register_if6;
		if (reg_mif_num == (mifi_t)-1) {
			strlcpy(ifp->if_xname, "register_mif",
			    sizeof(ifp->if_xname));
ifp->if_flags |= IFF_LOOPBACK;
ifp->if_index = mifcp->mif6c_mifi;
reg_mif_num = mifcp->mif6c_mifi;
if_attach(ifp);
}
} else {
/* Make sure the interface supports multicast */
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return EOPNOTSUPP;
s = splsoftnet();
/*
* Enable promiscuous reception of all IPv6 multicasts
* from the interface.
*/
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
error = if_mcast_op(ifp, SIOCADDMULTI, sin6tosa(&sin6));
splx(s);
if (error)
return error;
}
s = splsoftnet();
mifp->m6_flags = mifcp->mif6c_flags;
mifp->m6_ifp = ifp;
/* initialize per mif pkt counters */
mifp->m6_pkt_in = 0;
mifp->m6_pkt_out = 0;
mifp->m6_bytes_in = 0;
mifp->m6_bytes_out = 0;
splx(s);
/* Adjust nummifs up if the mifi is higher than nummifs */
	if (nummifs <= mifcp->mif6c_mifi)
		nummifs = mifcp->mif6c_mifi + 1;
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG,
"add_mif #%d, phyint %s\n",
mifcp->mif6c_mifi, ifp->if_xname);
#endif
return 0;
}
/*
* Delete a mif from the mif table
*/
static int
del_m6if(mifi_t *mifip)
{
struct mif6 *mifp = mif6table + *mifip;
mifi_t mifi;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
int s;
if (*mifip >= nummifs)
return EINVAL;
if (mifp->m6_ifp == NULL)
return EINVAL;
s = splsoftnet();
if (!(mifp->m6_flags & MIFF_REGISTER)) {
/*
		 * XXX: what if an IPv4 multicast daemon is still
		 * using the interface?
*/
ifp = mifp->m6_ifp;
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
if_mcast_op(ifp, SIOCDELMULTI, sin6tosa(&sin6));
} else {
if (reg_mif_num != (mifi_t)-1) {
if_detach(&multicast_register_if6);
reg_mif_num = (mifi_t)-1;
}
}
memset((void *)mifp, 0, sizeof (*mifp));
/* Adjust nummifs down */
for (mifi = nummifs; mifi > 0; mifi--)
if (mif6table[mifi - 1].m6_ifp)
break;
nummifs = mifi;
splx(s);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs);
#endif
return 0;
}
/*
* Add an mfc entry
*/
static int
add_m6fc(struct mf6cctl *mfccp)
{
struct mf6c *rt;
u_long hash;
struct rtdetq *rte;
u_short nstl;
int s;
char ip6bufo[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN];
MF6CFIND(mfccp->mf6cc_origin.sin6_addr,
mfccp->mf6cc_mcastgrp.sin6_addr, rt);
/* If an entry already exists, just update the fields */
if (rt) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,"add_m6fc update o %s g %s p %x\n",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent);
#endif
s = splsoftnet();
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
splx(s);
return 0;
}
/*
* Find the entry for which the upcall was made and update
*/
s = splsoftnet();
hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr,
mfccp->mf6cc_mcastgrp.sin6_addr);
	for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) {
		if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
		    &mfccp->mf6cc_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
&mfccp->mf6cc_mcastgrp.sin6_addr) &&
(rt->mf6c_stall != NULL)) {
if (nstl++)
log(LOG_ERR,
"add_m6fc: %s o %s g %s p %x dbx %p\n",
"multiple kernel entries",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent, rt->mf6c_stall);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,
"add_m6fc o %s g %s p %x dbg %p\n",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent, rt->mf6c_stall);
#endif
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
rt->mf6c_expire = 0; /* Don't clean this guy up */
n6expire[hash]--;
/* free packets Qed at the end of this entry */
for (rte = rt->mf6c_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
				if (rte->ifp) {
					ip6_mdq(rte->m, rte->ifp, rt);
}
m_freem(rte->m);
#ifdef UPCALL_TIMING
collate(&(rte->t));
#endif
free(rte, M_MRTABLE);
rte = n;
}
rt->mf6c_stall = NULL;
}
}
/*
* It is possible that an entry is being inserted without an upcall
*/
if (nstl == 0) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,
"add_mfc no upcall h %ld o %s g %s p %x\n",
hash,
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent);
#endif
		for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
			if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
			    &mfccp->mf6cc_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
&mfccp->mf6cc_mcastgrp.sin6_addr)) {
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
				if (rt->mf6c_expire)
					n6expire[hash]--;
rt->mf6c_expire = 0;
}
}
if (rt == NULL) {
/* no upcall, so make a new entry */
rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
if (rt == NULL) {
splx(s);
return ENOBUFS;
}
/* insert new entry at head of hash chain */
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
rt->mf6c_expire = 0;
rt->mf6c_stall = NULL;
/* link into table */
rt->mf6c_next = mf6ctable[hash];
mf6ctable[hash] = rt;
}
}
splx(s);
return 0;
}
#ifdef UPCALL_TIMING
/*
* collect delay statistics on the upcalls
*/
static void
collate(struct timeval *t)
{
u_long d;
struct timeval tp;
u_long delta;
GET_TIME(tp);
if (TV_LT(*t, tp))
{
TV_DELTA(tp, *t, delta);
d = delta >> 10;
if (d > UPCALL_MAX)
d = UPCALL_MAX;
++upcall_data[d];
}
}
#endif /* UPCALL_TIMING */
/*
* Delete an mfc entry
*/
static int
del_m6fc(struct mf6cctl *mfccp)
{
struct sockaddr_in6 origin;
struct sockaddr_in6 mcastgrp;
struct mf6c *rt;
struct mf6c **nptr;
u_long hash;
int s;
origin = mfccp->mf6cc_origin;
mcastgrp = mfccp->mf6cc_mcastgrp;
hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC) {
char ip6bufo[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN];
log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n",
IN6_PRINT(ip6bufo, &origin.sin6_addr),
IN6_PRINT(ip6bufm, &mcastgrp.sin6_addr));
}
#endif
s = splsoftnet();
nptr = &mf6ctable[hash];
	while ((rt = *nptr) != NULL) {
		if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr,
		    &rt->mf6c_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr,
&rt->mf6c_mcastgrp.sin6_addr) &&
rt->mf6c_stall == NULL)
break;
nptr = &rt->mf6c_next;
}
if (rt == NULL) {
splx(s);
return EADDRNOTAVAIL;
}
*nptr = rt->mf6c_next;
free(rt, M_MRTABLE);
splx(s);
return 0;
}
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src)
{
if (s) {
if (sbappendaddr(&s->so_rcv, sin6tosa(src), mm, NULL) != 0) {
sorwakeup(s);
return 0;
}
soroverflow(s);
}
m_freem(mm);
return -1;
}
/*
* IPv6 multicast forwarding function. This function assumes that the packet
* pointed to by "ip6" has arrived on (or is about to be sent to) the interface
* pointed to by "ifp", and the packet is to be relayed to other networks
* that have members of the packet's destination IPv6 multicast group.
*
* The packet is returned unscathed to the caller, unless it is
* erroneous, in which case a non-zero return value tells the caller to
* discard it.
*/
int
ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m)
{
struct mf6c *rt;
struct mif6 *mifp;
struct mbuf *mm;
int s;
mifi_t mifi;
struct sockaddr_in6 sin6;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
ifp->if_index);
#endif
/*
* Don't forward a packet with Hop limit of zero or one,
* or a packet destined to a local-only group.
*/
if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst) ||
IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
return 0;
ip6->ip6_hlim--;
/*
* Source address check: do not forward packets with unspecified
* source. It was discussed in July 2000, on ipngwg mailing list.
* This is rather more serious than unicast cases, because some
* MLD packets can be sent with the unspecified source address
* (although such packets must normally set the hop limit field to 1).
*/
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
if (ip6_log_time + ip6_log_interval < time_uptime) {
ip6_log_time = time_uptime;
log(LOG_DEBUG,
"cannot forward "
"from %s to %s nxt %d received on %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt,
m->m_pkthdr.rcvif_index ?
if_name(m_get_rcvif_NOMPSAFE(m)) : "?");
}
return 0;
}
/*
* Determine forwarding mifs from the forwarding cache table
*/
s = splsoftnet();
MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt);
/* Entry exists, so forward if necessary */
if (rt) {
splx(s);
return ip6_mdq(m, ifp, rt);
} else {
/*
* If we don't have a route for packet's origin, make a copy
* of the packet and send message to routing daemon.
*/
struct mbuf *mb0;
struct rtdetq *rte;
u_long hash;
#ifdef UPCALL_TIMING
struct timeval tp;
GET_TIME(tp);
#endif
mrt6stat.mrt6s_no_route++;
#ifdef MRT6DEBUG
if (mrt6debug & (DEBUG_FORWARD | DEBUG_MFC))
log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
#endif
/*
* Allocate mbufs early so that we don't do extra work if we
* are just going to fail anyway.
*/
rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
if (rte == NULL) {
splx(s);
return ENOBUFS;
}
mb0 = m_copypacket(m, M_DONTWAIT);
/*
* Pullup packet header if needed before storing it,
* as other references may modify it in the meantime.
*/
if (mb0 && M_UNWRITABLE(mb0, sizeof(struct ip6_hdr)))
mb0 = m_pullup(mb0, sizeof(struct ip6_hdr));
if (mb0 == NULL) {
free(rte, M_MRTABLE);
splx(s);
return ENOBUFS;
}
/* is there an upcall waiting for this packet? */
hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst);
for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
&rt->mf6c_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&rt->mf6c_mcastgrp.sin6_addr) &&
(rt->mf6c_stall != NULL))
break;
}
if (rt == NULL) {
struct mrt6msg *im;
struct omrt6msg *oim;
/* no upcall, so make a new entry */
rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
if (rt == NULL) {
free(rte, M_MRTABLE);
m_freem(mb0);
splx(s);
return ENOBUFS;
}
/*
* Make a copy of the header to send to the user
* level process
*/
mm = m_copym(mb0, 0, sizeof(struct ip6_hdr), M_DONTWAIT);
if (mm == NULL) {
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return ENOBUFS;
}
/*
* Send message to routing daemon
*/
sockaddr_in6_init(&sin6, &ip6->ip6_src, 0, 0, 0);
im = NULL;
oim = NULL;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim = mtod(mm, struct omrt6msg *);
oim->im6_msgtype = MRT6MSG_NOCACHE;
oim->im6_mbz = 0;
break;
case MRT6_INIT:
im = mtod(mm, struct mrt6msg *);
im->im6_msgtype = MRT6MSG_NOCACHE;
im->im6_mbz = 0;
break;
default:
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return EINVAL;
}
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG,
"getting the iif info in the kernel\n");
#endif
for (mifp = mif6table, mifi = 0;
mifi < nummifs && mifp->m6_ifp != ifp;
mifp++, mifi++)
;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim->im6_mif = mifi;
break;
case MRT6_INIT:
im->im6_mif = mifi;
break;
}
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
log(LOG_WARNING, "ip6_mforward: ip6_mrouter "
"socket queue full\n");
mrt6stat.mrt6s_upq_sockfull++;
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return ENOBUFS;
}
mrt6stat.mrt6s_upcalls++;
/* insert new entry at head of hash chain */
memset(rt, 0, sizeof(*rt));
sockaddr_in6_init(&rt->mf6c_origin, &ip6->ip6_src,
0, 0, 0);
sockaddr_in6_init(&rt->mf6c_mcastgrp, &ip6->ip6_dst,
0, 0, 0);
rt->mf6c_expire = UPCALL_EXPIRE;
n6expire[hash]++;
rt->mf6c_parent = MF6C_INCOMPLETE_PARENT;
/* link into table */
rt->mf6c_next = mf6ctable[hash];
mf6ctable[hash] = rt;
/* Add this entry to the end of the queue */
rt->mf6c_stall = rte;
} else {
/* determine if q has overflowed */
struct rtdetq **p;
int npkts = 0;
for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) {
if (++npkts > MAX_UPQ6) {
mrt6stat.mrt6s_upq_ovflw++;
free(rte, M_MRTABLE);
m_freem(mb0);
splx(s);
return 0;
}
}
/* Add this entry to the end of the queue */
*p = rte;
}
rte->next = NULL;
rte->m = mb0;
rte->ifp = ifp;
#ifdef UPCALL_TIMING
rte->t = tp;
#endif
splx(s);
return 0;
}
}
/*
* Clean up cache entries if upcalls are not serviced
* Call from the Slow Timeout mechanism, every 0.25 seconds.
*/
static void
expire_upcalls(void *unused)
{
struct rtdetq *rte;
struct mf6c *mfc, **nptr;
int i;
/* XXX NOMPSAFE still need softnet_lock */
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
for (i = 0; i < MF6CTBLSIZ; i++) {
if (n6expire[i] == 0)
continue;
nptr = &mf6ctable[i];
while ((mfc = *nptr) != NULL) {
rte = mfc->mf6c_stall;
/*
* Skip real cache entries
* Make sure it wasn't marked to not expire (shouldn't happen)
* If it expires now
*/
if (rte != NULL &&
mfc->mf6c_expire != 0 &&
--mfc->mf6c_expire == 0) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_EXPIRE) {
char ip6bufo[INET6_ADDRSTRLEN];
char ip6bufm[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"expire_upcalls: expiring (%s %s)\n",
IN6_PRINT(ip6bufo,
&mfc->mf6c_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfc->mf6c_mcastgrp.sin6_addr));
}
#endif
/*
* drop all the packets
* free the mbuf with the pkt, if, timing info
*/
do {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
} while (rte != NULL);
mrt6stat.mrt6s_cache_cleanups++;
n6expire[i]--;
*nptr = mfc->mf6c_next;
free(mfc, M_MRTABLE);
} else {
nptr = &mfc->mf6c_next;
}
}
}
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
expire_upcalls, NULL);
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Macro to send packet on mif. Since RSVP packets don't get counted on
* input, they shouldn't get counted on output, so statistics keeping is
* separate.
*/
#define MC6_SEND(ip6, mifp, m) do { \
if ((mifp)->m6_flags & MIFF_REGISTER) \
register_send((ip6), (mifp), (m)); \
else \
phyint_send((ip6), (mifp), (m)); \
} while (/*CONSTCOND*/ 0)
/*
* Packet forwarding routine once entry in the cache is made
*/
static int
ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
mifi_t mifi, iif;
struct mif6 *mifp;
int plen = m->m_pkthdr.len;
struct in6_addr src0, dst0; /* copies for local work */
u_int32_t iszone, idzone, oszone, odzone;
int error = 0;
/*
* Don't forward if it didn't arrive from the parent mif
* for its origin.
*/
mifi = rt->mf6c_parent;
if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) {
/* came in the wrong interface */
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG,
"wrong if: ifid %d mifi %d mififid %x\n",
ifp->if_index, mifi,
mif6table[mifi].m6_ifp ?
mif6table[mifi].m6_ifp->if_index : -1);
#endif
mrt6stat.mrt6s_wrong_if++;
rt->mf6c_wrong_if++;
/*
* If we are doing PIM processing, and we are forwarding
* packets on this interface, send a message to the
* routing daemon.
*/
/* have to make sure this is a valid mif */
if (mifi < nummifs && mif6table[mifi].m6_ifp) {
if (pim6 && (m->m_flags & M_LOOP) == 0) {
/*
* Check the M_LOOP flag to avoid an
* unnecessary PIM assert.
* XXX: M_LOOP is an ad-hoc hack...
*/
struct sockaddr_in6 sin6;
struct mbuf *mm;
struct mrt6msg *im;
struct omrt6msg *oim;
mm = m_copym(m, 0, sizeof(struct ip6_hdr), M_DONTWAIT);
if (mm && M_UNWRITABLE(mm, sizeof(struct ip6_hdr)))
mm = m_pullup(mm, sizeof(struct ip6_hdr));
if (mm == NULL)
return ENOBUFS;
oim = NULL;
im = NULL;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim = mtod(mm, struct omrt6msg *);
oim->im6_msgtype = MRT6MSG_WRONGMIF;
oim->im6_mbz = 0;
break;
case MRT6_INIT:
im = mtod(mm, struct mrt6msg *);
im->im6_msgtype = MRT6MSG_WRONGMIF;
im->im6_mbz = 0;
break;
default:
m_freem(mm);
return EINVAL;
}
for (mifp = mif6table, iif = 0;
iif < nummifs && mifp &&
mifp->m6_ifp != ifp;
mifp++, iif++)
;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim->im6_mif = iif;
sin6.sin6_addr = oim->im6_src;
break;
case MRT6_INIT:
im->im6_mif = iif;
sin6.sin6_addr = im->im6_src;
break;
}
mrt6stat.mrt6s_upcalls++;
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n");
#endif
++mrt6stat.mrt6s_upq_sockfull;
return ENOBUFS;
}
}
}
return 0;
}
/* If I sourced this packet, it counts as output, else it was input. */
if (m->m_pkthdr.rcvif_index == 0) {
/* XXX: is rcvif really NULL when output?? */
mif6table[mifi].m6_pkt_out++;
mif6table[mifi].m6_bytes_out += plen;
} else {
mif6table[mifi].m6_pkt_in++;
mif6table[mifi].m6_bytes_in += plen;
}
rt->mf6c_pkt_cnt++;
rt->mf6c_byte_cnt += plen;
/*
* For each mif, forward a copy of the packet if there are group
* members downstream on the interface.
*/
src0 = ip6->ip6_src;
dst0 = ip6->ip6_dst;
if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 ||
(error = in6_setscope(&dst0, ifp, &idzone)) != 0) {
IP6_STATINC(IP6_STAT_BADSCOPE);
return error;
}
for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) {
if (IF_ISSET(mifi, &rt->mf6c_ifset)) {
if (mif6table[mifi].m6_ifp == NULL)
continue;
/*
* check if the outgoing packet is going to break
* a scope boundary.
* XXX: For packets through PIM register tunnel
* interface, we believe the routing daemon.
*/
if ((mif6table[rt->mf6c_parent].m6_flags &
MIFF_REGISTER) == 0 &&
(mif6table[mifi].m6_flags & MIFF_REGISTER) == 0) {
if (in6_setscope(&src0, mif6table[mifi].m6_ifp,
&oszone) ||
in6_setscope(&dst0, mif6table[mifi].m6_ifp,
&odzone) ||
iszone != oszone || idzone != odzone) {
IP6_STATINC(IP6_STAT_BADSCOPE);
continue;
}
}
mifp->m6_pkt_out++;
mifp->m6_bytes_out += plen;
MC6_SEND(ip6, mifp, m);
}
}
return 0;
}
static void
phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m)
{
struct mbuf *mb_copy;
struct ifnet *ifp = mifp->m6_ifp;
int error __mrt6debugused = 0;
int s;
static struct route ro;
bool ingroup;
struct sockaddr_in6 dst6;
s = splsoftnet();
/*
* Make a new reference to the packet; make sure that
* the IPv6 header is actually copied, not just referenced,
* so that ip6_output() only scribbles on the copy.
*/
mb_copy = m_copypacket(m, M_DONTWAIT);
if (mb_copy && M_UNWRITABLE(mb_copy, sizeof(struct ip6_hdr)))
mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr));
if (mb_copy == NULL) {
splx(s);
return;
}
/* set MCAST flag to the outgoing packet */
mb_copy->m_flags |= M_MCAST;
/*
* If we sourced the packet, call ip6_output since we may divide
* the packet into fragments when the packet is too big for the
* outgoing interface.
* Otherwise, we can simply send the packet to the interface
* sending queue.
*/
if (m->m_pkthdr.rcvif_index == 0) {
struct ip6_moptions im6o;
im6o.im6o_multicast_if_index = if_get_index(ifp);
/* XXX: ip6_output will override ip6->ip6_hlim */
im6o.im6o_multicast_hlim = ip6->ip6_hlim;
im6o.im6o_multicast_loop = 1;
error = ip6_output(mb_copy, NULL, &ro, IPV6_FORWARDING,
&im6o, NULL, NULL);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT)
log(LOG_DEBUG, "phyint_send on mif %td err %d\n",
mifp - mif6table, error);
#endif
splx(s);
return;
}
/*
* If we belong to the destination multicast group
* on the outgoing interface, loop back a copy.
*/
/*
* Does not have to check source info, as it's already covered by
* ip6_input
*/
sockaddr_in6_init(&dst6, &ip6->ip6_dst, 0, 0, 0);
ingroup = in6_multi_group(&ip6->ip6_dst, ifp);
if (ingroup) {
ip6_mloopback(ifp, m,
satocsin6(rtcache_getdst(&ro)));
}
/*
* Put the packet into the sending queue of the outgoing interface
* if it would fit in the MTU of the interface.
*/
if (mb_copy->m_pkthdr.len <= ifp->if_mtu || ifp->if_mtu < IPV6_MMTU) {
error = ip6_if_output(ifp, ifp, mb_copy, &dst6, NULL);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT)
log(LOG_DEBUG, "phyint_send on mif %td err %d\n",
mifp - mif6table, error);
#endif
} else {
/*
* pMTU discovery is intentionally disabled by default, since
* various routers may notify a pMTU for multicast traffic, which
* can be abused as a denial-of-service attack on a router.
*/
if (ip6_mcast_pmtu) {
icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0,
ifp->if_mtu);
} else {
/* simply discard the packet */
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"phyint_send: packet too big on %s o %s g %s"
" size %d(discarded)\n",
if_name(ifp),
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
mb_copy->m_pkthdr.len);
}
#endif
m_freem(mb_copy);
}
}
splx(s);
}
static int
register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m)
{
struct mbuf *mm;
int i, len = m->m_pkthdr.len;
struct sockaddr_in6 sin6;
struct mrt6msg *im6;
#ifdef MRT6DEBUG
if (mrt6debug) {
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
}
#endif
PIM6_STATINC(PIM6_STAT_SND_REGISTERS);
/* Make a copy of the packet to send to the user level process */
MGETHDR(mm, M_DONTWAIT, MT_HEADER);
if (mm == NULL)
return ENOBUFS;
mm->m_data += max_linkhdr;
mm->m_len = sizeof(struct ip6_hdr);
if ((mm->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
m_freem(mm);
return ENOBUFS;
}
i = MHLEN - M_LEADINGSPACE(mm);
if (i > len)
i = len;
mm = m_pullup(mm, i);
if (mm == NULL)
return ENOBUFS;
mm->m_pkthdr.len = len + sizeof(struct ip6_hdr);
/*
* Send message to routing daemon
*/
sockaddr_in6_init(&sin6, &ip6->ip6_src, 0, 0, 0);
im6 = mtod(mm, struct mrt6msg *);
im6->im6_msgtype = MRT6MSG_WHOLEPKT;
im6->im6_mbz = 0;
im6->im6_mif = mif - mif6table;
/* iif info is not given for register encapsulation */
mrt6stat.mrt6s_upcalls++;
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_WARNING,
"register_send: ip6_mrouter socket queue full\n");
#endif
++mrt6stat.mrt6s_upq_sockfull;
return ENOBUFS;
}
return 0;
}
/*
* PIM sparse mode hook. Receives the pim control messages, and passes them up
* to the listening socket, using rip6_input.
*
* The only message processed is the REGISTER pim message; the pim header
* is stripped off, and the inner packet is passed to register_mforward.
*/
int
pim6_input(struct mbuf **mp, int *offp, int proto)
{
struct pim *pim;
struct ip6_hdr *ip6 __mrt6debugused;
int pimlen;
struct mbuf *m = *mp;
int minlen;
int off = *offp;
PIM6_STATINC(PIM6_STAT_RCV_TOTAL);
ip6 = mtod(m, struct ip6_hdr *);
pimlen = m->m_pkthdr.len - off;
/*
* Validate lengths
*/
if (pimlen < PIM_MINLEN) {
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,"pim6_input: PIM packet too short\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* If the packet is at least as big as a REGISTER, go ahead
* and grab the PIM REGISTER header size, to avoid another
* possible m_pullup() later.
*
* PIM_MINLEN == pimhdr + u_int32 == 8
* PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40
*/
minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN;
/*
* Make sure that the IP6 and PIM headers are in contiguous memory, and
* possibly the PIM REGISTER header
*/
IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen);
if (pim == NULL) {
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
return IPPROTO_DONE;
}
/* PIM version check */
if (pim->pim_ver != PIM_VERSION) {
PIM6_STATINC(PIM6_STAT_RCV_BADVERSION);
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: incorrect version %d, expecting %d\n",
pim->pim_ver, PIM_VERSION);
#endif
m_freem(m);
return IPPROTO_DONE;
}
#define PIM6_CHECKSUM
#ifdef PIM6_CHECKSUM
{
int cksumlen;
/*
* Validate checksum.
* If PIM REGISTER, exclude the data packet
*/
if (pim->pim_type == PIM_REGISTER)
cksumlen = PIM_MINLEN;
else
cksumlen = pimlen;
if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) {
PIM6_STATINC(PIM6_STAT_RCV_BADSUM);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: invalid checksum\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
}
#endif /* PIM6_CHECKSUM */
if (pim->pim_type == PIM_REGISTER) {
/*
* since this is a REGISTER, we'll make a copy of the register
* headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the
* routing daemon.
*/
static const struct sockaddr_in6 dst = {
.sin6_len = sizeof(dst),
.sin6_family = AF_INET6,
};
struct mbuf *mcp;
struct ip6_hdr *eip6;
u_int32_t *reghdr;
PIM6_STATINC(PIM6_STAT_RCV_REGISTERS);
if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: register mif not set: %d\n",
reg_mif_num);
#endif
m_freem(m);
return IPPROTO_DONE;
}
reghdr = (u_int32_t *)(pim + 1);
if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
goto pim6_input_to_daemon;
/*
* Validate length
*/
if (pimlen < PIM6_REG_MINLEN) {
#ifdef MRT6DEBUG
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_ERR,
"pim6_input: register packet size too "
"small %d from %s\n",
pimlen, IN6_PRINT(ip6buf, &ip6->ip6_src));
#endif
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
m_freem(m);
return IPPROTO_DONE;
}
eip6 = (struct ip6_hdr *)(reghdr + 1);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input[register], eip6: %s -> %s, "
"eip6 plen %d\n",
IN6_PRINT(ip6bufs, &eip6->ip6_src),
IN6_PRINT(ip6bufd, &eip6->ip6_dst),
ntohs(eip6->ip6_plen));
}
#endif
/* verify the version number of the inner packet */
if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
#ifdef MRT6DEBUG
log(LOG_DEBUG, "pim6_input: invalid IP version (%d) "
"of the inner packet\n",
(eip6->ip6_vfc & IPV6_VERSION));
#endif
m_freem(m);
return IPPROTO_DONE;
}
/* verify the inner packet is destined to a mcast group */
if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) {
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input: inner packet of register "
"is not multicast %s\n",
IN6_PRINT(ip6buf, &eip6->ip6_dst));
}
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* make a copy of the whole header to pass to the daemon later.
*/
mcp = m_copym(m, 0, off + PIM6_REG_MINLEN, M_DONTWAIT);
if (mcp == NULL) {
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: pim register: "
"could not copy register head\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* forward the inner ip6 packet; point m_data at the inner ip6.
*/
m_adj(m, off + PIM_MINLEN);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input: forwarding decapsulated register: "
"src %s, dst %s, mif %d\n",
IN6_PRINT(ip6bufs, &eip6->ip6_src),
IN6_PRINT(ip6bufd, &eip6->ip6_dst),
reg_mif_num);
}
#endif
looutput(mif6table[reg_mif_num].m6_ifp, m, sin6tocsa(&dst),
NULL);
/* prepare the register head to send to the mrouting daemon */
m = mcp;
}
/*
* Pass the PIM message up to the daemon; if it is a register message
* pass the 'head' only up to the daemon. This includes the
* encapsulator ip6 header, pim header, register header and the
* encapsulated ip6 header.
*/
pim6_input_to_daemon:
/*
* Currently, rip6_input() is always called holding softnet_lock
* by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE).
*/
KASSERT(mutex_owned(softnet_lock));
rip6_input(&m, offp, proto);
return IPPROTO_DONE;
}
static int
sysctl_net_inet6_pim6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(pim6stat_percpu, PIM6_NSTATS));
}
static void
sysctl_net_inet6_pim6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "pim6",
SYSCTL_DESCR("PIMv6 settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_PIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("PIMv6 statistics"),
sysctl_net_inet6_pim6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_PIM, PIM6CTL_STATS,
CTL_EOL);
}
/* $NetBSD: prop_dictionary_util.c,v 1.9 2022/08/03 21:13:46 riastradh Exp $ */
/*-
* Copyright (c) 2006, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Utility routines to make it more convenient to work with values
* stored in dictionaries.
*
* Note: There is no special magic going on here. We use the standard
* proplib(3) APIs to do all of this work. Any application could do
* exactly what we're doing here.
*/
#include "prop_object_impl.h" /* only to hide kernel vs. not-kernel */
#include <prop/proplib.h>
bool
prop_dictionary_get_dict(prop_dictionary_t dict, const char *key,
prop_dictionary_t *dp)
{
prop_object_t o;
o = prop_dictionary_get(dict, key);
if (prop_object_type(o) != PROP_TYPE_DICTIONARY)
return false;
*dp = o;
return true;
}
bool
prop_dictionary_get_bool(prop_dictionary_t dict, const char *key, bool *valp)
{
prop_bool_t b;
b = prop_dictionary_get(dict, key);
if (prop_object_type(b) != PROP_TYPE_BOOL)
return (false);
*valp = prop_bool_true(b);
return (true);
}
bool
prop_dictionary_set_bool(prop_dictionary_t dict, const char *key, bool val)
{
return prop_dictionary_set_and_rel(dict, key, prop_bool_create(val));
}
#define TEMPLATE(name, typ) \
bool \
prop_dictionary_get_ ## name (prop_dictionary_t dict, \
const char *key, \
typ *valp) \
{ \
return prop_number_ ## name ## _value( \
prop_dictionary_get(dict, key), valp); \
}
TEMPLATE(schar, signed char)
TEMPLATE(short, short)
TEMPLATE(int, int)
TEMPLATE(long, long)
TEMPLATE(longlong, long long)
TEMPLATE(intptr, intptr_t)
TEMPLATE(int8, int8_t)
TEMPLATE(int16, int16_t)
TEMPLATE(int32, int32_t)
TEMPLATE(int64, int64_t)
TEMPLATE(uchar, unsigned char)
TEMPLATE(ushort, unsigned short)
TEMPLATE(uint, unsigned int)
TEMPLATE(ulong, unsigned long)
TEMPLATE(ulonglong, unsigned long long)
TEMPLATE(uintptr, uintptr_t)
TEMPLATE(uint8, uint8_t)
TEMPLATE(uint16, uint16_t)
TEMPLATE(uint32, uint32_t)
TEMPLATE(uint64, uint64_t)
#undef TEMPLATE
static bool
prop_dictionary_set_signed_number(prop_dictionary_t dict, const char *key,
intmax_t val)
{
return prop_dictionary_set_and_rel(dict, key,
prop_number_create_signed(val));
}
static bool
prop_dictionary_set_unsigned_number(prop_dictionary_t dict, const char *key,
uintmax_t val)
{
	/*LINTED: for conversion from 'long long' to 'long'*/
return prop_dictionary_set_and_rel(dict, key,
prop_number_create_unsigned(val));
}
#define TEMPLATE(name, which, typ) \
bool \
prop_dictionary_set_ ## name (prop_dictionary_t dict, \
const char *key, \
typ val) \
{ \
/*LINTED: for conversion from long long to 'long'*/ \
return prop_dictionary_set_ ## which ## _number(dict, key, val);\
}
#define STEMPLATE(name, typ) TEMPLATE(name, signed, typ)
#define UTEMPLATE(name, typ) TEMPLATE(name, unsigned, typ)
STEMPLATE(schar, signed char)
STEMPLATE(short, short)
STEMPLATE(int, int)
STEMPLATE(long, long)
STEMPLATE(longlong, long long)
STEMPLATE(intptr, intptr_t)
STEMPLATE(int8, int8_t)
STEMPLATE(int16, int16_t)
STEMPLATE(int32, int32_t)
STEMPLATE(int64, int64_t)
UTEMPLATE(uchar, unsigned char)
UTEMPLATE(ushort, unsigned short)
UTEMPLATE(uint, unsigned int)
UTEMPLATE(ulong, unsigned long)
UTEMPLATE(ulonglong, unsigned long long)
UTEMPLATE(uintptr, uintptr_t)
UTEMPLATE(uint8, uint8_t)
UTEMPLATE(uint16, uint16_t)
UTEMPLATE(uint32, uint32_t)
UTEMPLATE(uint64, uint64_t)
#undef STEMPLATE
#undef UTEMPLATE
#undef TEMPLATE
bool
prop_dictionary_get_string(prop_dictionary_t dict, const char *key,
const char **cpp)
{
prop_string_t str;
const char *cp;
str = prop_dictionary_get(dict, key);
if (prop_object_type(str) != PROP_TYPE_STRING)
return (false);
cp = prop_string_value(str);
if (cp == NULL)
return (false);
*cpp = cp;
return (true);
}
bool
prop_dictionary_set_string(prop_dictionary_t dict, const char *key,
const char *cp)
{
	return prop_dictionary_set_and_rel(dict, key,
	    prop_string_create_copy(cp));
}
bool
prop_dictionary_set_string_nocopy(prop_dictionary_t dict,
const char *key,
const char *cp)
{
	return prop_dictionary_set_and_rel(dict, key,
	    prop_string_create_nocopy(cp));
}
bool
prop_dictionary_get_data(prop_dictionary_t dict, const char *key,
const void **vp, size_t *sizep)
{
prop_data_t data;
const void *v;
data = prop_dictionary_get(dict, key);
if (prop_object_type(data) != PROP_TYPE_DATA)
return (false);
v = prop_data_value(data);
if (v == NULL)
return (false);
*vp = v;
if (sizep != NULL)
*sizep = prop_data_size(data);
return (true);
}
bool
prop_dictionary_set_data(prop_dictionary_t dict, const char *key,
const void *v, size_t size)
{
return prop_dictionary_set_and_rel(dict, key,
prop_data_create_copy(v, size));
}
bool
prop_dictionary_set_data_nocopy(prop_dictionary_t dict, const char *key,
const void *v, size_t size)
{
return prop_dictionary_set_and_rel(dict, key,
prop_data_create_nocopy(v, size));
}
_PROP_DEPRECATED(prop_dictionary_get_cstring,
"this program uses prop_dictionary_get_cstring(), "
"which is deprecated; use prop_dictionary_get_string() and copy instead.")
bool
prop_dictionary_get_cstring(prop_dictionary_t dict,
const char *key,
char **cpp)
{
prop_string_t str;
char *cp;
size_t len;
bool rv;
str = prop_dictionary_get(dict, key);
if (prop_object_type(str) != PROP_TYPE_STRING)
return (false);
len = prop_string_size(str);
cp = _PROP_MALLOC(len + 1, M_TEMP);
if (cp == NULL)
return (false);
rv = prop_string_copy_value(str, cp, len + 1);
if (rv)
*cpp = cp;
else
_PROP_FREE(cp, M_TEMP);
return (rv);
}
_PROP_DEPRECATED(prop_string_get_cstring_nocopy,
"this program uses prop_string_get_cstring_nocopy(), "
"which is deprecated; use prop_dictionary_get_string() instead.")
bool
prop_dictionary_get_cstring_nocopy(prop_dictionary_t dict,
const char *key,
const char **cpp)
{
return prop_dictionary_get_string(dict, key, cpp);
}
_PROP_DEPRECATED(prop_dictionary_set_cstring,
"this program uses prop_dictionary_set_cstring(), "
"which is deprecated; use prop_dictionary_set_string() instead.")
bool
prop_dictionary_set_cstring(prop_dictionary_t dict,
const char *key,
const char *cp)
{
return prop_dictionary_set_string(dict, key, cp);
}
_PROP_DEPRECATED(prop_dictionary_set_cstring_nocopy,
"this program uses prop_dictionary_set_cstring_nocopy(), "
"which is deprecated; use prop_dictionary_set_string_nocopy() instead.")
bool
prop_dictionary_set_cstring_nocopy(prop_dictionary_t dict,
const char *key,
const char *cp)
{
return prop_dictionary_set_string_nocopy(dict, key, cp);
}
bool
prop_dictionary_set_and_rel(prop_dictionary_t dict, const char *key,
prop_object_t po)
{
bool rv;
if (po == NULL)
return false;
rv = prop_dictionary_set(dict, key, po);
prop_object_release(po);
return rv;
}
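/*
 * Example (illustrative sketch, not compiled here): a typical round trip
 * through the helpers above.  "frobnicate" and the key names are
 * hypothetical; the getters simply fail if a key is absent or of the
 * wrong type.
 */
#if 0
	prop_dictionary_t dict = prop_dictionary_create();
	bool enable;
	uint32_t unit;

	prop_dictionary_set_bool(dict, "enable", true);
	prop_dictionary_set_uint32(dict, "unit", 3);

	if (prop_dictionary_get_bool(dict, "enable", &enable) &&
	    prop_dictionary_get_uint32(dict, "unit", &unit))
		frobnicate(enable, unit);	/* hypothetical consumer */

	prop_object_release(dict);
#endif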
/* $NetBSD: mbuf.h,v 1.239 2024/01/22 21:15:02 jdolecek Exp $ */
/*
* Copyright (c) 1996, 1997, 1999, 2001, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center and Matt Thomas of 3am Software Foundry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)mbuf.h 8.5 (Berkeley) 2/19/95
*/
#ifndef _SYS_MBUF_H_
#define _SYS_MBUF_H_
#ifdef _KERNEL_OPT
#include "opt_mbuftrace.h"
#endif
#ifndef M_WAITOK
#include <sys/malloc.h>
#endif
#include <sys/pool.h>
#include <sys/queue.h>
#if defined(_KERNEL)
#include <sys/percpu_types.h>
#include <sys/socket.h> /* for AF_UNSPEC */
#include <sys/psref.h>
#endif /* defined(_KERNEL) */
/* For offsetof() */
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/systm.h>
#else
#include <stddef.h>
#endif
#include <uvm/uvm_param.h> /* for MIN_PAGE_SIZE */
#include <net/if.h>
/*
* Mbufs are of a single size, MSIZE (machine/param.h), which
* includes overhead. An mbuf may add a single "mbuf cluster" of size
* MCLBYTES (also in machine/param.h), which has no additional overhead
* and is used instead of the internal data area; this is done when
* at least MINCLSIZE of data must be stored.
*/
/* Packet tags structure */
struct m_tag {
SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */
uint16_t m_tag_id; /* Tag ID */
uint16_t m_tag_len; /* Length of data */
};
/* mbuf ownership structure */
struct mowner {
char mo_name[16]; /* owner name (fxp0) */
char mo_descr[16]; /* owner description (input) */
LIST_ENTRY(mowner) mo_link; /* */
struct percpu *mo_counters;
};
#define MOWNER_INIT(x, y) { .mo_name = x, .mo_descr = y }
enum mowner_counter_index {
MOWNER_COUNTER_CLAIMS, /* # of small mbuf claimed */
MOWNER_COUNTER_RELEASES, /* # of small mbuf released */
MOWNER_COUNTER_CLUSTER_CLAIMS, /* # of cluster mbuf claimed */
MOWNER_COUNTER_CLUSTER_RELEASES,/* # of cluster mbuf released */
MOWNER_COUNTER_EXT_CLAIMS, /* # of M_EXT mbuf claimed */
MOWNER_COUNTER_EXT_RELEASES, /* # of M_EXT mbuf released */
MOWNER_COUNTER_NCOUNTERS,
};
#if defined(_KERNEL)
struct mowner_counter {
u_long mc_counter[MOWNER_COUNTER_NCOUNTERS];
};
#endif
/* userland-exported version of struct mowner */
struct mowner_user {
char mo_name[16]; /* owner name (fxp0) */
char mo_descr[16]; /* owner description (input) */
LIST_ENTRY(mowner) mo_link; /* unused padding; for compatibility */
u_long mo_counter[MOWNER_COUNTER_NCOUNTERS]; /* counters */
};
/*
* Macros for type conversion
* mtod(m,t) - convert mbuf pointer to data pointer of correct type
*/
#define mtod(m, t) ((t)((m)->m_data))
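/*
 * Example (illustrative sketch, not compiled here): once a header is known
 * to be stored contiguously in the first mbuf, mtod() casts the data
 * pointer to the desired header type.  "example_is_v6" is a hypothetical
 * helper; m_pullup() is used to make the header contiguous if needed.
 */
#if 0
static int
example_is_v6(struct mbuf **mp)
{
	struct ip6_hdr *ip6;

	if ((*mp)->m_len < sizeof(struct ip6_hdr) &&
	    (*mp = m_pullup(*mp, sizeof(struct ip6_hdr))) == NULL)
		return 0;	/* chain was freed by m_pullup() */
	ip6 = mtod(*mp, struct ip6_hdr *);
	return (ip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION;
}
#endif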
/* header at beginning of each mbuf */
struct m_hdr {
struct mbuf *mh_next; /* next buffer in chain */
struct mbuf *mh_nextpkt; /* next chain in queue/record */
char *mh_data; /* location of data */
struct mowner *mh_owner; /* mbuf owner */
int mh_len; /* amount of data in this mbuf */
int mh_flags; /* flags; see below */
paddr_t mh_paddr; /* physical address of mbuf */
short mh_type; /* type of data in this mbuf */
};
/*
* record/packet header in first mbuf of chain; valid if M_PKTHDR set
*
* A note about csum_data:
*
* o For the out-bound direction, the low 16 bits indicate the offset, from
*   the start of the L4 header, where the final L4 checksum value is to be
*   stored, and the high 16 bits hold the length of the L3 header (i.e. the
*   start of the data to be checksummed).
*
* o For the in-bound direction, it is only valid if the M_CSUM_DATA flag is
* set. In this case, an L4 checksum has been calculated by hardware and
* is stored in csum_data, but it is up to software to perform final
* verification.
*
* Note for in-bound TCP/UDP checksums: we expect the csum_data to NOT
* be bit-wise inverted (the final step in the calculation of an IP
* checksum) -- this is so we can accumulate the checksum for fragmented
* packets during reassembly.
*
* Size ILP32: 40
* LP64: 56
*/
struct pkthdr {
union {
void *ctx; /* for M_GETCTX/M_SETCTX */
if_index_t index; /* rcv interface index */
} _rcvif;
#define rcvif_index _rcvif.index
SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
int len; /* total packet length */
int csum_flags; /* checksum flags */
uint32_t csum_data; /* checksum data */
u_int segsz; /* segment size */
uint16_t ether_vtag; /* ethernet 802.1p+q vlan tag */
uint16_t pkthdr_flags; /* flags for pkthdr, see below */
#define PKTHDR_FLAG_IPSEC_SKIP_PFIL 0x0001 /* skip pfil_run_hooks() after ipsec decrypt */
/*
* The following three fields are an open-coded struct altq_pktattr,
* so that the struct pkthdr fields can be rearranged flexibly.
*/
int pattr_af; /* ALTQ: address family */
void *pattr_class; /* ALTQ: sched class set by classifier */
void *pattr_hdr; /* ALTQ: saved header position in mbuf */
};
/* Checksumming flags (csum_flags). */
#define M_CSUM_TCPv4 0x00000001 /* TCP header/payload */
#define M_CSUM_UDPv4 0x00000002 /* UDP header/payload */
#define M_CSUM_TCP_UDP_BAD 0x00000004 /* TCP/UDP checksum bad */
#define M_CSUM_DATA 0x00000008 /* consult csum_data */
#define M_CSUM_TCPv6 0x00000010 /* IPv6 TCP header/payload */
#define M_CSUM_UDPv6 0x00000020 /* IPv6 UDP header/payload */
#define M_CSUM_IPv4 0x00000040 /* IPv4 header */
#define M_CSUM_IPv4_BAD 0x00000080 /* IPv4 header checksum bad */
#define M_CSUM_TSOv4 0x00000100 /* TCPv4 segmentation offload */
#define M_CSUM_TSOv6 0x00000200 /* TCPv6 segmentation offload */
/* Checksum-assist quirks: keep separate from jump-table bits. */
#define M_CSUM_BLANK 0x40000000 /* csum is missing */
#define M_CSUM_NO_PSEUDOHDR 0x80000000 /* Rx csum_data does not include
* the UDP/TCP pseudo-hdr, and
* is not yet 1s-complemented.
*/
#define M_CSUM_BITS \
"\20\1TCPv4\2UDPv4\3TCP_UDP_BAD\4DATA\5TCPv6\6UDPv6\7IPv4\10IPv4_BAD" \
"\11TSOv4\12TSOv6\37BLANK\40NO_PSEUDOHDR"
/*
* Macros for manipulating csum_data on outgoing packets. These are
* used to pass information down from the L4/L3 to the L2.
*
* _IPHL: Length of the IPv{4/6} header, plus the options; in other
* words the offset of the UDP/TCP header in the packet.
* _OFFSET: Offset of the checksum field in the UDP/TCP header.
*/
#define M_CSUM_DATA_IPv4_IPHL(x) ((x) >> 16)
#define M_CSUM_DATA_IPv4_OFFSET(x) ((x) & 0xffff)
#define M_CSUM_DATA_IPv6_IPHL(x) ((x) >> 16)
#define M_CSUM_DATA_IPv6_OFFSET(x) ((x) & 0xffff)
#define M_CSUM_DATA_IPv6_SET(x, v) (x) = ((x) & 0xffff) | ((v) << 16)
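/*
 * Example (illustrative sketch, not compiled here): the L4 layer records
 * the offset of the checksum field and the L3 layer later folds in the
 * IPv4 header length, so a driver can recover both halves with the macros
 * above.  "hlen", "iphl" and "off" are hypothetical locals; values assume
 * a plain IPv4 header without options.
 */
#if 0
	/* L4 output (e.g. TCP): request offload, record checksum offset. */
	m->m_pkthdr.csum_flags |= M_CSUM_TCPv4;
	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);	/* 16 */

	/* L3 output (IPv4), once the header length "hlen" is known. */
	m->m_pkthdr.csum_data |= hlen << 16;				/* 20 << 16 */

	/* L2 driver: recover both halves. */
	iphl = M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data);
	off  = M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data);
#endif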
/*
* Max # of pages we can attach to m_ext. This is carefully chosen
* to be able to handle SOSEND_LOAN_CHUNK with our minimum sized page.
*/
#ifdef MIN_PAGE_SIZE
#define M_EXT_MAXPAGES ((65536 / MIN_PAGE_SIZE) + 1)
#endif
/*
* Description of external storage mapped into mbuf, valid if M_EXT set.
*/
struct _m_ext_storage {
unsigned int ext_refcnt;
char *ext_buf; /* start of buffer */
void (*ext_free) /* free routine if not the usual */
(struct mbuf *, void *, size_t, void *);
void *ext_arg; /* argument for ext_free */
size_t ext_size; /* size of buffer, for ext_free */
union {
/* M_EXT_CLUSTER: physical address */
paddr_t extun_paddr;
#ifdef M_EXT_MAXPAGES
/* M_EXT_PAGES: pages */
struct vm_page *extun_pgs[M_EXT_MAXPAGES];
#endif
} ext_un;
#define ext_paddr ext_un.extun_paddr
#define ext_pgs ext_un.extun_pgs
};
struct _m_ext {
struct mbuf *ext_ref;
struct _m_ext_storage ext_storage;
};
#define M_PADDR_INVALID POOL_PADDR_INVALID
/*
* Definition of "struct mbuf".
* Don't change this without understanding how MHLEN/MLEN are defined.
*/
#define MBUF_DEFINE(name, mhlen, mlen) \
struct name { \
struct m_hdr m_hdr; \
union { \
struct { \
struct pkthdr MH_pkthdr; \
union { \
struct _m_ext MH_ext; \
char MH_databuf[(mhlen)]; \
} MH_dat; \
} MH; \
char M_databuf[(mlen)]; \
} M_dat; \
}
#define m_next m_hdr.mh_next
#define m_len m_hdr.mh_len
#define m_data m_hdr.mh_data
#define m_owner m_hdr.mh_owner
#define m_type m_hdr.mh_type
#define m_flags m_hdr.mh_flags
#define m_nextpkt m_hdr.mh_nextpkt
#define m_paddr m_hdr.mh_paddr
#define m_pkthdr M_dat.MH.MH_pkthdr
#define m_ext_storage M_dat.MH.MH_dat.MH_ext.ext_storage
#define m_ext_ref M_dat.MH.MH_dat.MH_ext.ext_ref
#define m_ext m_ext_ref->m_ext_storage
#define m_pktdat M_dat.MH.MH_dat.MH_databuf
#define m_dat M_dat.M_databuf
/*
* Dummy mbuf structure to calculate the right values for MLEN/MHLEN, taking
* into account inter-structure padding.
*/
MBUF_DEFINE(_mbuf_dummy, 1, 1);
/* normal data len */
#define MLEN ((int)(MSIZE - offsetof(struct _mbuf_dummy, m_dat)))
/* data len w/pkthdr */
#define MHLEN ((int)(MSIZE - offsetof(struct _mbuf_dummy, m_pktdat)))
#define MINCLSIZE (MHLEN+MLEN+1) /* smallest amount to put in cluster */
/*
* The *real* struct mbuf
*/
MBUF_DEFINE(mbuf, MHLEN, MLEN);
/* mbuf flags */
#define M_EXT 0x00000001 /* has associated external storage */
#define M_PKTHDR 0x00000002 /* start of record */
#define M_EOR 0x00000004 /* end of record */
#define M_PROTO1 0x00000008 /* protocol-specific */
/* mbuf pkthdr flags, also in m_flags */
#define M_AUTHIPHDR 0x00000010 /* authenticated (IPsec) */
#define M_DECRYPTED 0x00000020 /* decrypted (IPsec) */
#define M_LOOP 0x00000040 /* received on loopback */
#define M_BCAST 0x00000100 /* send/received as L2 broadcast */
#define M_MCAST 0x00000200 /* send/received as L2 multicast */
#define M_CANFASTFWD 0x00000400 /* packet can be fast-forwarded */
#define M_ANYCAST6 0x00000800 /* received as IPv6 anycast */
#define M_LINK0 0x00001000 /* link layer specific flag */
#define M_LINK1 0x00002000 /* link layer specific flag */
#define M_LINK2 0x00004000 /* link layer specific flag */
#define M_LINK3 0x00008000 /* link layer specific flag */
#define M_LINK4 0x00010000 /* link layer specific flag */
#define M_LINK5 0x00020000 /* link layer specific flag */
#define M_LINK6 0x00040000 /* link layer specific flag */
#define M_LINK7 0x00080000 /* link layer specific flag */
#define M_VLANTAG 0x00100000 /* ether_vtag is valid */
/* additional flags for M_EXT mbufs */
#define M_EXT_FLAGS 0xff000000
#define M_EXT_CLUSTER 0x01000000 /* ext is a cluster */
#define M_EXT_PAGES 0x02000000 /* ext_pgs is valid */
#define M_EXT_ROMAP 0x04000000 /* ext mapping is r-o at MMU */
#define M_EXT_RW 0x08000000 /* ext storage is writable */
/* for source-level compatibility */
#define M_NOTIFICATION M_PROTO1
#define M_FLAGS_BITS \
"\20\1EXT\2PKTHDR\3EOR\4PROTO1\5AUTHIPHDR\6DECRYPTED\7LOOP\10NONE" \
"\11BCAST\12MCAST\13CANFASTFWD\14ANYCAST6\15LINK0\16LINK1\17LINK2\20LINK3" \
"\21LINK4\22LINK5\23LINK6\24LINK7" \
"\25VLANTAG" \
"\31EXT_CLUSTER\32EXT_PAGES\33EXT_ROMAP\34EXT_RW"
/* flags copied when copying m_pkthdr */
#define M_COPYFLAGS (M_PKTHDR|M_EOR|M_BCAST|M_MCAST|M_CANFASTFWD| \
M_ANYCAST6|M_LINK0|M_LINK1|M_LINK2|M_AUTHIPHDR|M_DECRYPTED|M_LOOP| \
M_VLANTAG)
/* flag copied when shallow-copying external storage */
#define M_EXTCOPYFLAGS (M_EXT|M_EXT_FLAGS)
/* mbuf types */
#define MT_FREE 0 /* should be on free list */
#define MT_DATA 1 /* dynamic (data) allocation */
#define MT_HEADER 2 /* packet header */
#define MT_SONAME 3 /* socket name */
#define MT_SOOPTS 4 /* socket options */
#define MT_FTABLE 5 /* fragment reassembly header */
#define MT_CONTROL 6 /* extra-data protocol message */
#define MT_OOBDATA 7 /* expedited data */
#ifdef MBUFTYPES
const char * const mbuftypes[] = {
"mbfree",
"mbdata",
"mbheader",
"mbsoname",
"mbsopts",
"mbftable",
"mbcontrol",
"mboobdata",
};
#else
extern const char * const mbuftypes[];
#endif
/* flags to m_get/MGET */
#define M_DONTWAIT M_NOWAIT
#define M_WAIT M_WAITOK
#ifdef MBUFTRACE
/* Mbuf allocation tracing. */
void mowner_init_owner(struct mowner *, const char *, const char *);
void mowner_init(struct mbuf *, int);
void mowner_ref(struct mbuf *, int);
void m_claim(struct mbuf *, struct mowner *);
void mowner_revoke(struct mbuf *, bool, int);
void mowner_attach(struct mowner *);
void mowner_detach(struct mowner *);
void m_claimm(struct mbuf *, struct mowner *);
#else
#define mowner_init_owner(mo, n, d) __nothing
#define mowner_init(m, type) __nothing
#define mowner_ref(m, flags) __nothing
#define mowner_revoke(m, all, flags) __nothing
#define m_claim(m, mowner) __nothing
#define mowner_attach(mo) __nothing
#define mowner_detach(mo) __nothing
#define m_claimm(m, mo) __nothing
#endif
#define MCLAIM(m, mo) m_claim((m), (mo))
#define MOWNER_ATTACH(mo) mowner_attach(mo)
#define MOWNER_DETACH(mo) mowner_detach(mo)
/*
* mbuf allocation/deallocation macros:
*
* MGET(struct mbuf *m, int how, int type)
* allocates an mbuf and initializes it to contain internal data.
*
* MGETHDR(struct mbuf *m, int how, int type)
* allocates an mbuf and initializes it to contain a packet header
* and internal data.
*
* If 'how' is M_WAIT, these macros (and the corresponding functions)
* are guaranteed to return successfully.
*/
#define MGET(m, how, type) m = m_get((how), (type))
#define MGETHDR(m, how, type) m = m_gethdr((how), (type))
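/*
 * Example (illustrative sketch, not compiled here): allocating a
 * packet-header mbuf and positioning a small header in it.  "struct
 * example_hdr" is a hypothetical structure; m_align() places it at the
 * end of the internal data area, longword aligned.
 */
#if 0
	struct mbuf *m;

	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;		/* M_DONTWAIT may fail */
	m_align(m, sizeof(struct example_hdr));
	m->m_len = m->m_pkthdr.len = sizeof(struct example_hdr);
#endif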
#if defined(_KERNEL)
#define MCLINITREFERENCE(m) \
do { \
KASSERT(((m)->m_flags & M_EXT) == 0); \
(m)->m_ext_ref = (m); \
(m)->m_ext.ext_refcnt = 1; \
} while (/* CONSTCOND */ 0)
/*
* Macros for mbuf external storage.
*
* MCLGET allocates and adds an mbuf cluster to a normal mbuf;
* the flag M_EXT is set upon success.
*
* MEXTMALLOC allocates external storage and adds it to
* a normal mbuf; the flag M_EXT is set upon success.
*
* MEXTADD adds pre-allocated external storage to
* a normal mbuf; the flag M_EXT is set upon success.
*/
#define MCLGET(m, how) m_clget((m), (how))
#define MEXTMALLOC(m, size, how) \
do { \
(m)->m_ext_storage.ext_buf = malloc((size), 0, (how)); \
if ((m)->m_ext_storage.ext_buf != NULL) { \
MCLINITREFERENCE(m); \
(m)->m_data = (m)->m_ext.ext_buf; \
(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | \
M_EXT|M_EXT_RW; \
(m)->m_ext.ext_size = (size); \
(m)->m_ext.ext_free = NULL; \
(m)->m_ext.ext_arg = NULL; \
mowner_ref((m), M_EXT); \
} \
} while (/* CONSTCOND */ 0)
#define MEXTADD(m, buf, size, type, free, arg) \
do { \
MCLINITREFERENCE(m); \
(m)->m_data = (m)->m_ext.ext_buf = (char *)(buf); \
(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | M_EXT; \
(m)->m_ext.ext_size = (size); \
(m)->m_ext.ext_free = (free); \
(m)->m_ext.ext_arg = (arg); \
mowner_ref((m), M_EXT); \
} while (/* CONSTCOND */ 0)
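/*
 * Example (illustrative sketch, not compiled here): attaching a cluster
 * when the payload will not fit in the internal data area.  "len" and "m"
 * are hypothetical locals; callers must test M_EXT afterwards because
 * MCLGET reports failure only through that flag.
 */
#if 0
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (len > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			return ENOBUFS;
		}
	}
	m->m_len = m->m_pkthdr.len = len;
#endif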
#define M_BUFADDR(m) \
(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
((m)->m_flags & M_PKTHDR) ? (m)->m_pktdat : (m)->m_dat)
#define M_BUFSIZE(m) \
(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \
((m)->m_flags & M_PKTHDR) ? MHLEN : MLEN)
#define MRESETDATA(m) (m)->m_data = M_BUFADDR(m)
/*
* Compute the offset of the beginning of the data buffer of a non-ext
* mbuf.
*/
#define M_BUFOFFSET(m) \
(((m)->m_flags & M_PKTHDR) ? \
offsetof(struct mbuf, m_pktdat) : offsetof(struct mbuf, m_dat))
/*
* Determine if an mbuf's data area is read-only. This is true
* if external storage is read-only mapped, or not marked as R/W,
* or referenced by more than one mbuf.
*/
#define M_READONLY(m) \
(((m)->m_flags & M_EXT) != 0 && \
(((m)->m_flags & (M_EXT_ROMAP|M_EXT_RW)) != M_EXT_RW || \
(m)->m_ext.ext_refcnt > 1))
#define M_UNWRITABLE(__m, __len) \
((__m)->m_len < (__len) || M_READONLY((__m)))
/*
* Determine if an mbuf's data area is read-only at the MMU.
*/
#define M_ROMAP(m) \
(((m)->m_flags & (M_EXT|M_EXT_ROMAP)) == (M_EXT|M_EXT_ROMAP))
/*
* Compute the amount of space available before the current start of
* data in an mbuf.
*/
#define M_LEADINGSPACE(m) \
(M_READONLY((m)) ? 0 : ((m)->m_data - M_BUFADDR(m)))
/*
* Compute the amount of space available
* after the end of data in an mbuf.
*/
#define _M_TRAILINGSPACE(m) \
((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size - \
((m)->m_data + (m)->m_len) : \
&(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len))
#define M_TRAILINGSPACE(m) \
(M_READONLY((m)) ? 0 : _M_TRAILINGSPACE((m)))
/*
* Arrange to prepend space of size plen to mbuf m.
* If a new mbuf must be allocated, how specifies whether to wait.
* If how is M_DONTWAIT and allocation fails, the original mbuf chain
* is freed and m is set to NULL.
*/
#define M_PREPEND(m, plen, how) \
do { \
if (M_LEADINGSPACE(m) >= (plen)) { \
(m)->m_data -= (plen); \
(m)->m_len += (plen); \
} else \
(m) = m_prepend((m), (plen), (how)); \
if ((m) && (m)->m_flags & M_PKTHDR) \
(m)->m_pkthdr.len += (plen); \
} while (/* CONSTCOND */ 0)
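/*
 * Example (illustrative sketch, not compiled here): prepending a
 * link-layer header the way an output path typically does.  "eh" is a
 * hypothetical local; on allocation failure M_PREPEND has already freed
 * the chain and set m to NULL.
 */
#if 0
	struct ether_header *eh;

	M_PREPEND(m, sizeof(struct ether_header), M_DONTWAIT);
	if (m == NULL)
		return ENOBUFS;
	eh = mtod(m, struct ether_header *);
#endif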
/* change mbuf to new type */
#define MCHTYPE(m, t) \
do { \
KASSERT((t) != MT_FREE); \
mbstat_type_add((m)->m_type, -1); \
mbstat_type_add(t, 1); \
(m)->m_type = t; \
} while (/* CONSTCOND */ 0)
#ifdef DIAGNOSTIC
#define M_VERIFY_PACKET(m) m_verify_packet(m)
#else
#define M_VERIFY_PACKET(m) __nothing
#endif
/* The "copy all" special length. */
#define M_COPYALL -1
/*
* Allow drivers and/or protocols to store private context information.
*/
#define M_GETCTX(m, t) ((t)(m)->m_pkthdr._rcvif.ctx)
#define M_SETCTX(m, c) ((void)((m)->m_pkthdr._rcvif.ctx = (void *)(c)))
#define M_CLEARCTX(m) M_SETCTX((m), NULL)
/*
* M_REGION_GET ensures that the "len"-sized region of type "typ" starting
* from "off" within "m" is located in a single mbuf, contiguously.
*
* The pointer to the region will be returned to pointer variable "val".
*/
#define M_REGION_GET(val, typ, m, off, len) \
do { \
struct mbuf *_t; \
int _tmp; \
if ((m)->m_len >= (off) + (len)) \
(val) = (typ)(mtod((m), char *) + (off)); \
else { \
_t = m_pulldown((m), (off), (len), &_tmp); \
if (_t) { \
if (_t->m_len < _tmp + (len)) \
panic("m_pulldown malfunction"); \
(val) = (typ)(mtod(_t, char *) + _tmp); \
} else { \
(val) = (typ)NULL; \
(m) = NULL; \
} \
} \
} while (/*CONSTCOND*/ 0)
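/*
 * Example (illustrative sketch, not compiled here): obtaining a contiguous
 * view of a TCP header located "off" bytes into the chain ("m" and "off"
 * are hypothetical locals).  On failure the macro leaves both "th" and "m"
 * NULL; m_pulldown() has already disposed of the chain in that case.
 */
#if 0
	struct tcphdr *th;

	M_REGION_GET(th, struct tcphdr *, m, off, sizeof(struct tcphdr));
	if (th == NULL)
		return;		/* mbuf chain is gone */
#endif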
#endif /* defined(_KERNEL) */
/*
* Simple mbuf queueing system
*
* this is basically a SIMPLEQ adapted to mbuf use (i.e. using
* m_nextpkt instead of field.sqe_next).
*
* m_next is ignored, so queueing chains of mbufs is possible
*/
#define MBUFQ_HEAD(name) \
struct name { \
struct mbuf *mq_first; \
struct mbuf **mq_last; \
}
#define MBUFQ_INIT(q) do { \
(q)->mq_first = NULL; \
(q)->mq_last = &(q)->mq_first; \
} while (/*CONSTCOND*/0)
#define MBUFQ_ENQUEUE(q, m) do { \
(m)->m_nextpkt = NULL; \
*(q)->mq_last = (m); \
(q)->mq_last = &(m)->m_nextpkt; \
} while (/*CONSTCOND*/0)
#define MBUFQ_PREPEND(q, m) do { \
if (((m)->m_nextpkt = (q)->mq_first) == NULL) \
(q)->mq_last = &(m)->m_nextpkt; \
(q)->mq_first = (m); \
} while (/*CONSTCOND*/0)
#define MBUFQ_DEQUEUE(q, m) do { \
if (((m) = (q)->mq_first) != NULL) { \
if (((q)->mq_first = (m)->m_nextpkt) == NULL) \
(q)->mq_last = &(q)->mq_first; \
else \
(m)->m_nextpkt = NULL; \
} \
} while (/*CONSTCOND*/0)
#define MBUFQ_DRAIN(q) do { \
struct mbuf *__m0; \
while ((__m0 = (q)->mq_first) != NULL) { \
(q)->mq_first = __m0->m_nextpkt; \
m_freem(__m0); \
} \
(q)->mq_last = &(q)->mq_first; \
} while (/*CONSTCOND*/0)
#define MBUFQ_FIRST(q) ((q)->mq_first)
#define MBUFQ_NEXT(m) ((m)->m_nextpkt)
#define MBUFQ_LAST(q) (*(q)->mq_last)
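/*
 * Example (illustrative sketch, not compiled here): a driver-style packet
 * queue built from the macros above.  "sc_rxq", "m0" and
 * "example_process" are hypothetical; queued packets keep their m_next
 * chains intact since only m_nextpkt is used for linkage.
 */
#if 0
	MBUFQ_HEAD(example_rxq) sc_rxq;
	struct mbuf *m;

	MBUFQ_INIT(&sc_rxq);
	MBUFQ_ENQUEUE(&sc_rxq, m0);		/* producer side */
	MBUFQ_DEQUEUE(&sc_rxq, m);		/* consumer side */
	if (m != NULL)
		example_process(m);
	MBUFQ_DRAIN(&sc_rxq);			/* teardown: frees leftovers */
#endif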
/*
* Mbuf statistics.
* For statistics related to mbuf and cluster allocations, see also the
* pool headers (mb_cache and mcl_cache).
*/
struct mbstat {
u_long _m_spare; /* formerly m_mbufs */
u_long _m_spare1; /* formerly m_clusters */
u_long _m_spare2; /* spare field */
u_long _m_spare3; /* formerly m_clfree - free clusters */
u_long m_drops; /* times failed to find space */
u_long m_wait; /* times waited for space */
u_long m_drain; /* times drained protocols for space */
u_short m_mtypes[256]; /* type specific mbuf allocations */
};
struct mbstat_cpu {
u_int m_mtypes[256]; /* type specific mbuf allocations */
};
/*
* Mbuf sysctl variables.
*/
#define MBUF_MSIZE 1 /* int: mbuf base size */
#define MBUF_MCLBYTES 2 /* int: mbuf cluster size */
#define MBUF_NMBCLUSTERS 3 /* int: limit on the # of clusters */
#define MBUF_MBLOWAT 4 /* int: mbuf low water mark */
#define MBUF_MCLLOWAT 5 /* int: mbuf cluster low water mark */
#define MBUF_STATS 6 /* struct: mbstat */
#define MBUF_MOWNERS 7 /* struct: m_owner[] */
#define MBUF_NMBCLUSTERS_LIMIT 8 /* int: limit of nmbclusters */
#ifdef _KERNEL
extern struct mbstat mbstat;
extern int nmbclusters; /* limit on the # of clusters */
extern int mblowat; /* mbuf low water mark */
extern int mcllowat; /* mbuf cluster low water mark */
extern int max_linkhdr; /* largest link-level header */
extern int max_protohdr; /* largest protocol header */
extern int max_hdr; /* largest link+protocol header */
extern int max_datalen; /* MHLEN - max_hdr */
extern const int msize; /* mbuf base size */
extern const int mclbytes; /* mbuf cluster size */
extern pool_cache_t mb_cache;
#ifdef MBUFTRACE
LIST_HEAD(mownerhead, mowner);
extern struct mownerhead mowners;
extern struct mowner unknown_mowners[];
extern struct mowner revoked_mowner;
#endif
MALLOC_DECLARE(M_MBUF);
MALLOC_DECLARE(M_SONAME);
struct mbuf *m_copym(struct mbuf *, int, int, int);
struct mbuf *m_copypacket(struct mbuf *, int);
struct mbuf *m_devget(char *, int, int, struct ifnet *);
struct mbuf *m_dup(struct mbuf *, int, int, int);
struct mbuf *m_get(int, int);
struct mbuf *m_gethdr(int, int);
struct mbuf *m_get_n(int, int, size_t, size_t);
struct mbuf *m_gethdr_n(int, int, size_t, size_t);
struct mbuf *m_prepend(struct mbuf *,int, int);
struct mbuf *m_pulldown(struct mbuf *, int, int, int *);
struct mbuf *m_pullup(struct mbuf *, int);
struct mbuf *m_copyup(struct mbuf *, int, int);
struct mbuf *m_split(struct mbuf *,int, int);
struct mbuf *m_getptr(struct mbuf *, int, int *);
void m_adj(struct mbuf *, int);
struct mbuf *m_defrag(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, unsigned int), void *);
void m_cat(struct mbuf *,struct mbuf *);
void m_clget(struct mbuf *, int);
void m_copyback(struct mbuf *, int, int, const void *);
struct mbuf *m_copyback_cow(struct mbuf *, int, int, const void *, int);
int m_makewritable(struct mbuf **, int, int, int);
struct mbuf *m_getcl(int, int, int);
void m_copydata(struct mbuf *, int, int, void *);
void m_verify_packet(struct mbuf *);
struct mbuf *m_free(struct mbuf *);
void m_freem(struct mbuf *);
void mbinit(void);
void m_remove_pkthdr(struct mbuf *);
void m_copy_pkthdr(struct mbuf *, struct mbuf *);
void m_move_pkthdr(struct mbuf *, struct mbuf *);
void m_align(struct mbuf *, int);
bool m_ensure_contig(struct mbuf **, int);
struct mbuf *m_add(struct mbuf *, struct mbuf *);
/* Inline routines. */
static __inline u_int m_length(const struct mbuf *) __unused;
/* Statistics */
void mbstat_type_add(int, int);
/* Packet tag routines */
struct m_tag *m_tag_get(int, int, int);
void m_tag_free(struct m_tag *);
void m_tag_prepend(struct mbuf *, struct m_tag *);
void m_tag_unlink(struct mbuf *, struct m_tag *);
void m_tag_delete(struct mbuf *, struct m_tag *);
void m_tag_delete_chain(struct mbuf *);
struct m_tag *m_tag_find(const struct mbuf *, int);
struct m_tag *m_tag_copy(struct m_tag *);
int m_tag_copy_chain(struct mbuf *, struct mbuf *);
/* Packet tag types */
#define PACKET_TAG_NONE 0 /* Nothing */
#define PACKET_TAG_SO 4 /* sending socket pointer */
#define PACKET_TAG_NPF 10 /* packet filter */
#define PACKET_TAG_PF 11 /* packet filter */
#define PACKET_TAG_ALTQ_QID 12 /* ALTQ queue id */
#define PACKET_TAG_IPSEC_OUT_DONE 18
#define PACKET_TAG_IPSEC_NAT_T_PORTS 25 /* two uint16_t */
#define PACKET_TAG_INET6 26 /* IPv6 info */
#define PACKET_TAG_TUNNEL_INFO 28 /* tunnel identification and
* protocol callback, for loop
* detection/recovery
*/
#define PACKET_TAG_MPLS 29 /* Indicate it's for MPLS */
#define PACKET_TAG_SRCROUTE 30 /* IPv4 source routing */
#define PACKET_TAG_ETHERNET_SRC 31 /* Ethernet source address */
/*
* Return the number of bytes in the mbuf chain, m.
*/
static __inline u_int
m_length(const struct mbuf *m)
{
const struct mbuf *m0;
u_int pktlen;
if ((m->m_flags & M_PKTHDR) != 0)
return m->m_pkthdr.len;
pktlen = 0;
for (m0 = m; m0 != NULL; m0 = m0->m_next)
pktlen += m0->m_len;
return pktlen;
}
static __inline void
m_set_rcvif(struct mbuf *m, const struct ifnet *ifp)
{
KASSERT(m->m_flags & M_PKTHDR);
m->m_pkthdr.rcvif_index = ifp->if_index;
}
static __inline void
m_reset_rcvif(struct mbuf *m)
{
KASSERT(m->m_flags & M_PKTHDR);
/* A caller may expect the whole _rcvif union to be zeroed */
/* m->m_pkthdr.rcvif_index = 0; */
m->m_pkthdr._rcvif.ctx = NULL;
}
static __inline void
m_copy_rcvif(struct mbuf *m, const struct mbuf *n)
{
KASSERT(m->m_flags & M_PKTHDR);
KASSERT(n->m_flags & M_PKTHDR);
m->m_pkthdr.rcvif_index = n->m_pkthdr.rcvif_index;
}
#define M_GET_ALIGNED_HDR(m, type, linkhdr) \
m_get_aligned_hdr((m), __alignof(type) - 1, sizeof(type), (linkhdr))
static __inline int
m_get_aligned_hdr(struct mbuf **m, int mask, size_t hlen, bool linkhdr)
{
#ifndef __NO_STRICT_ALIGNMENT
if (((uintptr_t)mtod(*m, void *) & mask) != 0)
*m = m_copyup(*m, hlen,
linkhdr ? (max_linkhdr + mask) & ~mask : 0);
else
#endif
if (__predict_false((size_t)(*m)->m_len < hlen))
*m = m_pullup(*m, hlen);
return *m == NULL;
}
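/*
 * Illustrative sketch (not from the original sources): using
 * M_GET_ALIGNED_HDR to obtain a contiguous, properly aligned header at the
 * start of an mbuf chain, as a protocol input routine might. The header
 * type (struct ip) is only an example here.
 */
#if 0
static int
example_get_ip_header(struct mbuf **mp)
{
	struct ip *ip;

	/* May replace *mp; returns non-zero (and leaves *mp NULL) on failure. */
	if (M_GET_ALIGNED_HDR(mp, struct ip, false) != 0)
		return ENOBUFS;		/* chain was freed */
	ip = mtod(*mp, struct ip *);
	(void)ip;			/* header is now contiguous and aligned */
	return 0;
}
#endif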
void m_print(const struct mbuf *, const char *, void (*)(const char *, ...)
__printflike(1, 2));
/* from uipc_mbufdebug.c */
void m_examine(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
/* parsers for m_examine() */
void m_examine_ether(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_pppoe(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ppp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_arp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ip(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_icmp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ip6(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_icmp6(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_tcp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_udp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_hex(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
/*
* Get rcvif of a mbuf.
*
* The caller must call m_put_rcvif after using rcvif if the returned rcvif
* isn't NULL. If the returned rcvif is NULL, the caller doesn't need to call
* m_put_rcvif (although calling it is safe).
*
* The caller must not block or sleep while using rcvif. The API ensures a
* returned rcvif isn't freed until m_put_rcvif is called.
*/
static __inline struct ifnet *
m_get_rcvif(const struct mbuf *m, int *s)
{
struct ifnet *ifp;
KASSERT(m->m_flags & M_PKTHDR);
*s = pserialize_read_enter();
ifp = if_byindex(m->m_pkthdr.rcvif_index);
if (__predict_false(ifp == NULL))
pserialize_read_exit(*s);
return ifp;
}
static __inline void
m_put_rcvif(struct ifnet *ifp, int *s)
{
if (ifp == NULL)
return;
pserialize_read_exit(*s);
}
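/*
 * Illustrative sketch (not from the original sources): the
 * m_get_rcvif/m_put_rcvif pattern described above. The returned ifp is only
 * valid inside the pserialize read section, so the caller must not sleep
 * before m_put_rcvif.
 */
#if 0
static void
example_use_rcvif(struct mbuf *m)
{
	struct ifnet *ifp;
	int s;

	ifp = m_get_rcvif(m, &s);
	if (ifp != NULL) {
		/* use ifp briefly, e.g. read ifp->if_index; do not sleep */
		m_put_rcvif(ifp, &s);
	}
}
#endif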
/*
* Get rcvif of a mbuf.
*
* The caller must call m_put_rcvif_psref after using rcvif. The API ensures
* the obtained rcvif isn't freed until m_put_rcvif_psref is called.
*/
static __inline struct ifnet *
m_get_rcvif_psref(const struct mbuf *m, struct psref *psref)
{
KASSERT(m->m_flags & M_PKTHDR);
return if_get_byindex(m->m_pkthdr.rcvif_index, psref);
}
static __inline void
m_put_rcvif_psref(struct ifnet *ifp, struct psref *psref)
{
if (ifp == NULL)
return;
if_put(ifp, psref);
}
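/*
 * Illustrative sketch (not from the original sources): the psref variant,
 * which allows holding the reference across code that may sleep, unlike the
 * pserialize-based m_get_rcvif above. The caller is assumed to be bound to
 * a CPU (curlwp_bind) as psref requires.
 */
#if 0
static void
example_use_rcvif_psref(struct mbuf *m)
{
	struct ifnet *ifp;
	struct psref psref;

	ifp = m_get_rcvif_psref(m, &psref);
	if (ifp != NULL) {
		/* ifp stays valid until released; sleeping is permitted */
		m_put_rcvif_psref(ifp, &psref);
	}
}
#endif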
/*
* Get rcvif of a mbuf.
*
* This is NOT an MP-safe API and shouldn't be used where MP-safety is required.
*/
static __inline struct ifnet *
m_get_rcvif_NOMPSAFE(const struct mbuf *m)
{
KASSERT(m->m_flags & M_PKTHDR);
return if_byindex(m->m_pkthdr.rcvif_index);
}
#endif /* _KERNEL */
#endif /* !_SYS_MBUF_H_ */
/* $NetBSD: ip_output.c,v 1.326 2023/04/19 22:00:18 mlelstv Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.326 2023/04/19 22:00:18 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_mrouting.h"
#include "opt_net_mpsafe.h"
#include "opt_mpls.h"
#endif
#include "arp.h"
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_private.h>
#include <netinet/in_offload.h>
#include <netinet/portalgo.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#include <netmpls/mpls_var.h>
#endif
static int ip_pcbopts(struct inpcb *, const struct sockopt *);
static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
static struct ifnet *ip_multicast_if(struct in_addr *, int *);
static void ip_mloopback(struct ifnet *, struct mbuf *,
const struct sockaddr_in *);
static int ip_ifaddrvalid(const struct in_ifaddr *);
extern pfil_head_t *inet_pfil_hook; /* XXX */
int ip_do_loopback_cksum = 0;
static int
ip_mark_mpls(struct ifnet * const ifp, struct mbuf * const m,
const struct rtentry *rt)
{
int error = 0;
#ifdef MPLS
union mpls_shim msh;
if (rt == NULL || rt_gettag(rt) == NULL ||
rt_gettag(rt)->sa_family != AF_MPLS ||
(m->m_flags & (M_MCAST | M_BCAST)) != 0 ||
ifp->if_type != IFT_ETHER)
return 0;
msh.s_addr = MPLS_GETSADDR(rt);
if (msh.shim.label != MPLS_LABEL_IMPLNULL) {
struct m_tag *mtag;
/*
* XXX tentative solution to tell ether_output
* it's MPLS; a more efficient mechanism is needed.
*/
mtag = m_tag_get(PACKET_TAG_MPLS,
sizeof(int) /* dummy */,
M_NOWAIT);
if (mtag == NULL)
return ENOMEM;
m_tag_prepend(m, mtag);
}
#endif
return error;
}
/*
* Send an IP packet to a host.
*/
int
ip_if_output(struct ifnet * const ifp, struct mbuf * const m,
const struct sockaddr * const dst, const struct rtentry *rt)
{
int error = 0;
if (rt != NULL) {
error = rt_check_reject_route(rt, ifp);
if (error != 0) {
IP_STATINC(IP_STAT_RTREJECT);
m_freem(m);
return error;
}
}
error = ip_mark_mpls(ifp, m, rt);
if (error != 0) {
m_freem(m);
return error;
}
error = if_output_lock(ifp, ifp, m, dst, rt);
return error;
}
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
*/
int
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
struct rtentry *rt;
struct ip *ip;
struct ifnet *ifp, *mifp = NULL;
struct mbuf *m = m0;
int len, hlen, error = 0;
struct route iproute;
const struct sockaddr_in *dst;
struct in_ifaddr *ia = NULL;
struct ifaddr *ifa;
int isbroadcast;
int sw_csum;
u_long mtu;
bool natt_frag = false;
bool rtmtu_nolock;
union {
struct sockaddr sa;
struct sockaddr_in sin;
} udst, usrc;
struct sockaddr *rdst = &udst.sa; /* real IP destination, as
* opposed to the nexthop
*/
struct psref psref, psref_ia;
int bound;
bool bind_need_restore = false;
const struct sockaddr *sa;
len = 0;
MCLAIM(m, &ip_tx_mowner);
KASSERT((m->m_flags & M_PKTHDR) != 0);
KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == 0);
KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) !=
(M_CSUM_TCPv4|M_CSUM_UDPv4));
KASSERT(m->m_len >= sizeof(struct ip));
hlen = sizeof(struct ip);
if (opt) {
m = ip_insertoptions(m, opt, &len);
hlen = len;
}
ip = mtod(m, struct ip *);
/*
* Fill in IP header.
*/
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_off = htons(0);
/* ip->ip_id filled in after we find out source ia */
ip->ip_hl = hlen >> 2;
IP_STATINC(IP_STAT_LOCALOUT);
} else {
hlen = ip->ip_hl << 2;
}
/*
* Route packet.
*/
if (ro == NULL) {
memset(&iproute, 0, sizeof(iproute));
ro = &iproute;
}
sockaddr_in_init(&udst.sin, &ip->ip_dst, 0);
dst = satocsin(rtcache_getdst(ro));
/*
* If there is a cached route, check that it is to the same
* destination and is still up. If not, free it and try again.
* The address family should also be checked in case of sharing
* the cache with IPv6.
*/
if (dst && (dst->sin_family != AF_INET ||
!in_hosteq(dst->sin_addr, ip->ip_dst)))
rtcache_free(ro);
/* XXX must be before rtcache operations */
bound = curlwp_bind();
bind_need_restore = true;
if ((rt = rtcache_validate(ro)) == NULL &&
(rt = rtcache_update(ro, 1)) == NULL) {
dst = &udst.sin;
error = rtcache_setdst(ro, &udst.sa);
if (error != 0) {
IP_STATINC(IP_STAT_ODROPPED);
goto bad;
}
}
/*
* If routing to interface only, short circuit routing lookup.
*/
if (flags & IP_ROUTETOIF) {
ifa = ifa_ifwithladdr_psref(sintocsa(dst), &psref_ia);
if (ifa == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/* ia is already referenced by psref_ia */
ia = ifatoia(ifa);
ifp = ia->ia_ifp;
mtu = ifp->if_mtu;
ip->ip_ttl = 1;
isbroadcast = in_broadcast(dst->sin_addr, ifp);
} else if (((IN_MULTICAST(ip->ip_dst.s_addr) ||
ip->ip_dst.s_addr == INADDR_BROADCAST) ||
(flags & IP_ROUTETOIFINDEX)) &&
imo != NULL && imo->imo_multicast_if_index != 0) {
ifp = mifp = if_get_byindex(imo->imo_multicast_if_index, &psref);
if (ifp == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
mtu = ifp->if_mtu;
ia = in_get_ia_from_ifp_psref(ifp, &psref_ia);
if (IN_MULTICAST(ip->ip_dst.s_addr) ||
ip->ip_dst.s_addr == INADDR_BROADCAST) {
isbroadcast = 0;
} else {
/* IP_ROUTETOIFINDEX */
isbroadcast = in_broadcast(dst->sin_addr, ifp);
if ((isbroadcast == 0) && ((ifp->if_flags &
(IFF_LOOPBACK | IFF_POINTOPOINT)) == 0) &&
(in_direct(dst->sin_addr, ifp) == 0)) {
/* gateway address required */
if (rt == NULL)
rt = rtcache_init(ro);
if (rt == NULL || rt->rt_ifp != ifp) {
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
rt->rt_use++;
if (rt->rt_flags & RTF_GATEWAY)
dst = satosin(rt->rt_gateway);
if (rt->rt_flags & RTF_HOST)
isbroadcast =
rt->rt_flags & RTF_BROADCAST;
}
}
} else {
if (rt == NULL)
rt = rtcache_init(ro);
if (rt == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
if (ifa_is_destroying(rt->rt_ifa)) {
rtcache_unref(rt, ro);
rt = NULL;
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
ifa_acquire(rt->rt_ifa, &psref_ia);
ia = ifatoia(rt->rt_ifa);
ifp = rt->rt_ifp;
if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
mtu = ifp->if_mtu;
rt->rt_use++;
if (rt->rt_flags & RTF_GATEWAY)
dst = satosin(rt->rt_gateway);
if (rt->rt_flags & RTF_HOST)
isbroadcast = rt->rt_flags & RTF_BROADCAST;
else
isbroadcast = in_broadcast(dst->sin_addr, ifp);
}
rtmtu_nolock = rt && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0;
if (IN_MULTICAST(ip->ip_dst.s_addr) ||
(ip->ip_dst.s_addr == INADDR_BROADCAST)) {
bool inmgroup;
m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
M_BCAST : M_MCAST;
/*
* See if the caller provided any multicast options
*/
if (imo != NULL)
ip->ip_ttl = imo->imo_multicast_ttl;
else
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
/*
* if we don't know the outgoing ifp yet, we can't generate
* output
*/
if (!ifp) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/*
* If the packet is multicast or broadcast, confirm that
* the outgoing interface can transmit it.
*/
if (((m->m_flags & M_MCAST) &&
(ifp->if_flags & IFF_MULTICAST) == 0) ||
((m->m_flags & M_BCAST) &&
(ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0)) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/*
* If source address not specified yet, use an address
* of outgoing interface.
*/
if (in_nullhost(ip->ip_src)) {
struct in_ifaddr *xia;
struct ifaddr *xifa;
struct psref _psref;
xia = in_get_ia_from_ifp_psref(ifp, &_psref);
if (!xia) {
IP_STATINC(IP_STAT_IFNOADDR);
error = EADDRNOTAVAIL;
goto bad;
}
xifa = &xia->ia_ifa;
if (xifa->ifa_getifa != NULL) {
ia4_release(xia, &_psref);
/* FIXME ifa_getifa is NOMPSAFE */
xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
if (xia == NULL) {
IP_STATINC(IP_STAT_IFNOADDR);
error = EADDRNOTAVAIL;
goto bad;
}
ia4_acquire(xia, &_psref);
}
ip->ip_src = xia->ia_addr.sin_addr;
ia4_release(xia, &_psref);
}
inmgroup = in_multi_group(ip->ip_dst, ifp, flags);
if (inmgroup && (imo == NULL || imo->imo_multicast_loop)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
*/
ip_mloopback(ifp, m, &udst.sin);
}
#ifdef MROUTING
else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IP_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip_mloopback(),
* above, will be forwarded by the ip_input() routine,
* if necessary.
*/
extern struct socket *ip_mrouter;
if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
if (ip_mforward(m, ifp) != 0) {
m_freem(m);
goto done;
}
}
}
#endif
/*
* Multicasts with a time-to-live of zero may be looped-
* back, above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
IP_STATINC(IP_STAT_ODROPPED);
m_freem(m);
goto done;
}
goto sendit;
}
/*
* If source address not specified yet, use address
* of outgoing interface.
*/
if (in_nullhost(ip->ip_src)) {
struct ifaddr *xifa;
xifa = &ia->ia_ifa;
if (xifa->ifa_getifa != NULL) {
ia4_release(ia, &psref_ia);
/* FIXME ifa_getifa is NOMPSAFE */
ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto bad;
}
ia4_acquire(ia, &psref_ia);
}
ip->ip_src = ia->ia_addr.sin_addr;
}
/*
* Packets with Class-D address as source are not valid per
* RFC1112.
*/
if (IN_MULTICAST(ip->ip_src.s_addr)) {
IP_STATINC(IP_STAT_ODROPPED);
error = EADDRNOTAVAIL;
goto bad;
}
/*
* Look for broadcast address and verify user is allowed to
* send such a packet.
*/
if (isbroadcast) {
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EADDRNOTAVAIL;
goto bad;
}
if ((flags & IP_ALLOWBROADCAST) == 0) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EACCES;
goto bad;
}
/* don't allow broadcast messages to be fragmented */
if (ntohs(ip->ip_len) > ifp->if_mtu) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EMSGSIZE;
goto bad;
}
m->m_flags |= M_BCAST;
} else
m->m_flags &= ~M_BCAST;
sendit:
if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
ip->ip_id = 0;
} else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
ip->ip_id = ip_newid(ia);
} else {
/*
* TSO capable interfaces (typically?) increment
* ip_id for each segment.
* "allocate" enough ids here to increase the chance
* for them to be unique.
*
* Note that the following calculation need not be
* precise; wasting some ip_id values is fine.
*/
unsigned int segsz = m->m_pkthdr.segsz;
unsigned int datasz = ntohs(ip->ip_len) - hlen;
unsigned int num = howmany(datasz, segsz);
ip->ip_id = ip_newid_range(ia, num);
}
}
if (ia != NULL) {
ia4_release(ia, &psref_ia);
ia = NULL;
}
/*
* If we're doing Path MTU Discovery, we need to set DF unless
* the route's MTU is locked.
*/
if ((flags & IP_MTUDISC) != 0 && rtmtu_nolock) {
ip->ip_off |= htons(IP_DF);
}
#ifdef IPSEC
if (ipsec_used) {
bool ipsec_done = false;
bool count_drop = false;
/* Perform IPsec processing, if any. */
error = ipsec4_output(m, inp, flags, &mtu, &natt_frag,
&ipsec_done, &count_drop);
if (count_drop)
IP_STATINC(IP_STAT_IPSECDROP_OUT);
if (error || ipsec_done)
goto done;
}
if (!ipsec_used || !natt_frag)
#endif
{
/*
* Run through list of hooks for output packets.
*/
error = pfil_run_hooks(inet_pfil_hook, &m, ifp, PFIL_OUT);
if (error || m == NULL) {
IP_STATINC(IP_STAT_PFILDROP_OUT);
goto done;
}
}
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
m->m_pkthdr.csum_data |= hlen << 16;
/*
* search for the source address structure to
* maintain output statistics, and verify address
* validity
*/
KASSERT(ia == NULL);
sockaddr_in_init(&usrc.sin, &ip->ip_src, 0);
ifa = ifaof_ifpforaddr_psref(&usrc.sa, ifp, &psref_ia);
if (ifa != NULL)
ia = ifatoia(ifa);
/*
* Ensure we only send from a valid address.
* A NULL address is valid because the packet could be
* generated from a packet filter.
*/
if (ia != NULL && (flags & IP_FORWARDING) == 0 &&
(error = ip_ifaddrvalid(ia)) != 0)
{
ARPLOG(LOG_ERR,
"refusing to send from invalid address %s (pid %d)\n",
ARPLOGADDR(&ip->ip_src), curproc->p_pid);
IP_STATINC(IP_STAT_ODROPPED);
if (error == 1)
/*
* Address exists, but is tentative or detached.
* We can't send from it because it's invalid,
* so we drop the packet.
*/
error = 0;
else
error = EADDRNOTAVAIL;
goto bad;
}
/* Maybe skip checksums on loopback interfaces. */
if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
}
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
/* Need to fragment the packet */
if (ntohs(ip->ip_len) > mtu &&
(m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
goto fragment;
}
#if IFA_STATS
if (ia)
ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
#endif
/*
* Always initialize the sum to 0! Some HW assisted
* checksumming requires this.
*/
ip->ip_sum = 0;
if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
/*
* Perform any checksums that the hardware can't do
* for us.
*
* XXX Does any hardware require the {th,uh}_sum
* XXX fields to be 0?
*/
if (sw_csum & M_CSUM_IPv4) {
KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
ip->ip_sum = in_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
}
if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
if (IN_NEED_CHECKSUM(ifp,
sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
in_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &=
~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
}
sa = (m->m_flags & M_MCAST) ? sintocsa(rdst) : sintocsa(dst);
/* Send it */
if (__predict_false(sw_csum & M_CSUM_TSOv4)) {
/*
* TSO4 is required by a packet, but disabled for
* the interface.
*/
error = ip_tso_output(ifp, m, sa, rt);
} else
error = ip_if_output(ifp, m, sa, rt);
goto done;
fragment:
/*
* We can't use HW checksumming if we're about to fragment the packet.
*
* XXX Some hardware can do this.
*/
if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
if (IN_NEED_CHECKSUM(ifp,
m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
in_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
if (ntohs(ip->ip_off) & IP_DF) {
if (flags & IP_RETURNMTU) {
KASSERT(inp != NULL);
in4p_errormtu(inp) = mtu;
}
error = EMSGSIZE;
IP_STATINC(IP_STAT_CANTFRAG);
goto bad;
}
error = ip_fragment(m, ifp, mtu);
if (error) {
m = NULL;
goto bad;
}
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
if (error) {
m_freem(m);
continue;
}
#if IFA_STATS
if (ia)
ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
#endif
/*
* If we get here, the packet was not handled by IPsec
* when it should have been. Now that it has been
* fragmented, re-inject it into ip_output so that IPsec
* processing can occur.
*/
if (natt_frag) {
error = ip_output(m, opt, NULL,
flags | IP_RAWOUTPUT | IP_NOIPNEWID,
imo, inp);
} else {
KASSERT((m->m_pkthdr.csum_flags &
(M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
error = ip_if_output(ifp, m, (m->m_flags & M_MCAST) ?
sintocsa(rdst) : sintocsa(dst), rt);
}
}
if (error == 0) {
IP_STATINC(IP_STAT_FRAGMENTED);
}
done:
ia4_release(ia, &psref_ia);
rtcache_unref(rt, ro);
if (ro == &iproute) {
rtcache_free(&iproute);
}
if (mifp != NULL) {
if_put(mifp, &psref);
}
if (bind_need_restore)
curlwp_bindx(bound);
return error;
bad:
m_freem(m);
goto done;
}
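/*
 * Illustrative sketch (not from the original sources): a minimal caller of
 * ip_output() as described in the comment above the function. It prepends a
 * skeletal IP header (len, ttl, proto, tos, src, dst) and lets ip_output()
 * fill in ip_v, ip_hl, ip_off and ip_id. The payload chain, protocol number
 * and addresses are assumptions for the example.
 */
#if 0
static int
example_ip_output(struct mbuf *payload, int paylen,
    struct in_addr src, struct in_addr dst)
{
	struct mbuf *m;
	struct ip *ip;

	m = m_gethdr(M_DONTWAIT, MT_HEADER);
	if (m == NULL) {
		m_freem(payload);
		return ENOBUFS;
	}
	m_align(m, sizeof(struct ip));
	m->m_len = sizeof(struct ip);
	m->m_pkthdr.len = sizeof(struct ip) + paylen;
	m->m_next = payload;

	ip = mtod(m, struct ip *);
	memset(ip, 0, sizeof(*ip));
	ip->ip_len = htons(sizeof(struct ip) + paylen);
	ip->ip_ttl = IPDEFTTL;
	ip->ip_p = IPPROTO_UDP;	/* arbitrary; payload assumed to carry the UDP header */
	ip->ip_src = src;
	ip->ip_dst = dst;

	/* No cached route, no options, no multicast options, no pcb. */
	return ip_output(m, NULL, NULL, 0, NULL, NULL);
}
#endif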
int
ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
{
struct ip *ip, *mhip;
struct mbuf *m0;
int len, hlen, off;
int mhlen, firstlen;
struct mbuf **mnext;
int sw_csum = m->m_pkthdr.csum_flags;
int fragments = 0;
int error = 0;
int ipoff, ipflg;
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
/* Preserve the offset and flags. */
ipoff = ntohs(ip->ip_off) & IP_OFFMASK;
ipflg = ntohs(ip->ip_off) & (IP_RF|IP_DF|IP_MF);
if (ifp != NULL)
sw_csum &= ~ifp->if_csum_flags_tx;
len = (mtu - hlen) &~ 7;
if (len < 8) {
IP_STATINC(IP_STAT_CANTFRAG);
m_freem(m);
return EMSGSIZE;
}
firstlen = len;
mnext = &m->m_nextpkt;
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
*/
m0 = m;
mhlen = sizeof(struct ip);
for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
goto sendorfree;
}
MCLAIM(m, m0->m_owner);
*mnext = m;
mnext = &m->m_nextpkt;
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
/* we must inherit the flags */
m->m_flags |= m0->m_flags & M_COPYFLAGS;
if (hlen > sizeof(struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
mhip->ip_off = ((off - hlen) >> 3) + ipoff;
mhip->ip_off |= ipflg;
if (off + len >= ntohs(ip->ip_len))
len = ntohs(ip->ip_len) - off;
else
mhip->ip_off |= IP_MF;
HTONS(mhip->ip_off);
mhip->ip_len = htons((u_int16_t)(len + mhlen));
m->m_next = m_copym(m0, off, len, M_DONTWAIT);
if (m->m_next == NULL) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
goto sendorfree;
}
m->m_pkthdr.len = mhlen + len;
m_reset_rcvif(m);
mhip->ip_sum = 0;
KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
if (sw_csum & M_CSUM_IPv4) {
mhip->ip_sum = in_cksum(m, mhlen);
} else {
/*
* checksum is hw-offloaded or not necessary.
*/
m->m_pkthdr.csum_flags |=
m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
m->m_pkthdr.csum_data |= mhlen << 16;
KASSERT(!(ifp != NULL &&
IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
(m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
}
IP_STATINC(IP_STAT_OFRAGMENTS);
fragments++;
}
/*
* Update first fragment by trimming what's been copied out
* and updating header, then send each fragment (in order).
*/
m = m0;
m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
m->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
ip->ip_off |= htons(IP_MF);
ip->ip_sum = 0;
if (sw_csum & M_CSUM_IPv4) {
ip->ip_sum = in_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
} else {
/*
* checksum is hw-offloaded or not necessary.
*/
KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
(m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
sizeof(struct ip));
}
sendorfree:
/*
* If there is no room for all the fragments, don't queue
* any of them.
*/
if (ifp != NULL) {
IFQ_LOCK(&ifp->if_snd);
if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
error == 0) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
IFQ_INC_DROPS(&ifp->if_snd);
}
IFQ_UNLOCK(&ifp->if_snd);
}
if (error) {
for (m = m0; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
m_freem(m);
}
}
return error;
}
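/*
 * Worked example (illustrative, not from the original sources): with an
 * interface MTU of 1500 and a 20-byte header, each fragment carries
 * len = (1500 - 20) & ~7 = 1480 payload bytes. A 4020-byte datagram
 * (4000 bytes of payload) therefore becomes 3 fragments with fragment
 * offsets 0, 185 and 370 in 8-byte units (byte offsets 0, 1480 and 2960),
 * the last one sent without IP_MF.
 */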
/*
* Determine the maximum length of the options to be inserted;
* we would far rather allocate too much space than too little.
*/
u_int
ip_optlen(struct inpcb *inp)
{
struct mbuf *m = inp->inp_options;
if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) {
return (m->m_len - offsetof(struct ipoption, ipopt_dst));
}
return 0;
}
/*
* Insert IP options into preformed packet.
* Adjust IP destination as required for IP source routing,
* as indicated by a non-zero in_addr at the start of the options.
*/
static struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
struct ipoption *p = mtod(opt, struct ipoption *);
struct mbuf *n;
struct ip *ip = mtod(m, struct ip *);
unsigned optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
KASSERT(optlen % 4 == 0);
if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET)
return m; /* XXX should fail */
if (!in_nullhost(p->ipopt_dst))
ip->ip_dst = p->ipopt_dst;
if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) {
MGETHDR(n, M_DONTWAIT, MT_HEADER);
if (n == NULL)
return m;
MCLAIM(n, m->m_owner);
m_move_pkthdr(n, m);
m->m_len -= sizeof(struct ip);
m->m_data += sizeof(struct ip);
n->m_next = m;
n->m_len = optlen + sizeof(struct ip);
n->m_data += max_linkhdr;
memcpy(mtod(n, void *), ip, sizeof(struct ip));
m = n;
} else {
m->m_data -= optlen;
m->m_len += optlen;
memmove(mtod(m, void *), ip, sizeof(struct ip));
}
m->m_pkthdr.len += optlen;
ip = mtod(m, struct ip *);
memcpy(ip + 1, p->ipopt_list, optlen);
*phlen = sizeof(struct ip) + optlen;
ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
return m;
}
/*
* Copy options from ipsrc to ipdst, omitting those not copied during
* fragmentation.
*/
int
ip_optcopy(struct ip *ipsrc, struct ip *ipdst)
{
u_char *cp, *dp;
int opt, optlen, cnt;
cp = (u_char *)(ipsrc + 1);
dp = (u_char *)(ipdst + 1);
cnt = (ipsrc->ip_hl << 2) - sizeof(struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP) {
/* Preserve for IP mcast tunnel's LSRR alignment. */
*dp++ = IPOPT_NOP;
optlen = 1;
continue;
}
KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp));
optlen = cp[IPOPT_OLEN];
KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen < cnt);
/* Invalid lengths should have been caught by ip_dooptions. */
if (optlen > cnt)
optlen = cnt;
if (IPOPT_COPIED(opt)) {
bcopy((void *)cp, (void *)dp, (unsigned)optlen);
dp += optlen;
}
}
for (optlen = dp - (u_char *)(ipdst+1); optlen & 0x3; optlen++) {
*dp++ = IPOPT_EOL;
}
return optlen;
}
/*
* IP socket option processing.
*/
int
ip_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
struct ip *ip = &in4p_ip(inp);
int inpflags = inp->inp_flags;
int optval = 0, error = 0;
struct in_pktinfo pktinfo;
KASSERT(solocked(so));
if (sopt->sopt_level != IPPROTO_IP) {
if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER)
return 0;
return ENOPROTOOPT;
}
switch (op) {
case PRCO_SETOPT:
switch (sopt->sopt_name) {
case IP_OPTIONS:
#ifdef notyet
case IP_RETOPTS:
#endif
error = ip_pcbopts(inp, sopt);
break;
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVIF:
case IP_RECVPKTINFO:
case IP_RECVTTL:
case IP_BINDANY:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (sopt->sopt_name) {
case IP_TOS:
ip->ip_tos = optval;
break;
case IP_TTL:
ip->ip_ttl = optval;
break;
case IP_MINTTL:
if (optval > 0 && optval <= MAXTTL)
in4p_ip_minttl(inp) = optval;
else
error = EINVAL;
break;
#define OPTSET(bit) \
if (optval) \
inpflags |= bit; \
else \
inpflags &= ~bit;
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
case IP_RECVPKTINFO:
OPTSET(INP_RECVPKTINFO);
break;
case IP_RECVRETOPTS:
OPTSET(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
OPTSET(INP_RECVDSTADDR);
break;
case IP_RECVIF:
OPTSET(INP_RECVIF);
break;
case IP_RECVTTL:
OPTSET(INP_RECVTTL);
break;
case IP_BINDANY:
error = kauth_authorize_network(
kauth_cred_get(), KAUTH_NETWORK_BIND,
KAUTH_REQ_NETWORK_BIND_ANYADDR, so,
NULL, NULL);
if (error == 0) {
OPTSET(INP_BINDANY);
}
break;
}
break;
case IP_PKTINFO:
error = sockopt_getint(sopt, &optval);
if (!error) {
/* Linux compatibility */
OPTSET(INP_RECVPKTINFO);
break;
}
error = sockopt_get(sopt, &pktinfo, sizeof(pktinfo));
if (error)
break;
if (pktinfo.ipi_ifindex == 0) {
in4p_prefsrcip(inp) = pktinfo.ipi_addr;
break;
}
/* Solaris compatibility */
struct ifnet *ifp;
struct in_ifaddr *ia;
int s;
/* pick up primary address */
s = pserialize_read_enter();
ifp = if_byindex(pktinfo.ipi_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
ia = in_get_ia_from_ifp(ifp);
if (ia == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
in4p_prefsrcip(inp) = IA_SIN(ia)->sin_addr;
pserialize_read_exit(s);
break;
#undef OPTSET
case IP_MULTICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
error = ip_setmoptions(&inp->inp_moptions, sopt);
break;
case IP_PORTRANGE:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optval) {
case IP_PORTRANGE_DEFAULT:
case IP_PORTRANGE_HIGH:
inpflags &= ~(INP_LOWPORT);
break;
case IP_PORTRANGE_LOW:
inpflags |= INP_LOWPORT;
break;
default:
error = EINVAL;
break;
}
break;
case IP_PORTALGO:
error = sockopt_getint(sopt, &optval);
if (error)
break;
error = portalgo_algo_index_select(inp, optval);
break;
#if defined(IPSEC)
case IP_IPSEC_POLICY:
if (ipsec_enabled) {
error = ipsec_set_policy(inp,
sopt->sopt_data, sopt->sopt_size,
curlwp->l_cred);
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (sopt->sopt_name) {
case IP_OPTIONS:
case IP_RETOPTS: {
struct mbuf *mopts = inp->inp_options;
if (mopts) {
struct mbuf *m;
m = m_copym(mopts, 0, M_COPYALL, M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
break;
}
error = sockopt_setmbuf(sopt, m);
}
break;
}
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVIF:
case IP_RECVPKTINFO:
case IP_RECVTTL:
case IP_ERRORMTU:
case IP_BINDANY:
switch (sopt->sopt_name) {
case IP_TOS:
optval = ip->ip_tos;
break;
case IP_TTL:
optval = ip->ip_ttl;
break;
case IP_MINTTL:
optval = in4p_ip_minttl(inp);
break;
case IP_ERRORMTU:
optval = in4p_errormtu(inp);
break;
#define OPTBIT(bit) (inpflags & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
break;
case IP_RECVPKTINFO:
optval = OPTBIT(INP_RECVPKTINFO);
break;
case IP_RECVRETOPTS:
optval = OPTBIT(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
optval = OPTBIT(INP_RECVDSTADDR);
break;
case IP_RECVIF:
optval = OPTBIT(INP_RECVIF);
break;
case IP_RECVTTL:
optval = OPTBIT(INP_RECVTTL);
break;
case IP_BINDANY:
optval = OPTBIT(INP_BINDANY);
break;
}
error = sockopt_setint(sopt, optval);
break;
case IP_PKTINFO:
switch (sopt->sopt_size) {
case sizeof(int):
/* Linux compatibility */
optval = OPTBIT(INP_RECVPKTINFO);
error = sockopt_setint(sopt, optval);
break;
case sizeof(struct in_pktinfo):
/* Solaris compatibility */
pktinfo.ipi_ifindex = 0;
pktinfo.ipi_addr = in4p_prefsrcip(inp);
error = sockopt_set(sopt, &pktinfo,
sizeof(pktinfo));
break;
default:
/*
* While the size is still 0, and also if the
* caller later doesn't use an exactly sized
* buffer for the data, default to Linux
* compatibility.
*/
optval = OPTBIT(INP_RECVPKTINFO);
error = sockopt_setint(sopt, optval);
break;
}
break;
#if 0 /* defined(IPSEC) */
case IP_IPSEC_POLICY:
{
struct mbuf *m = NULL;
/* XXX this will return EINVAL as sopt is empty */
error = ipsec_get_policy(inp, sopt->sopt_data,
sopt->sopt_size, &m);
if (error == 0)
error = sockopt_setmbuf(sopt, m);
break;
}
#endif /*IPSEC*/
case IP_MULTICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
error = ip_getmoptions(inp->inp_moptions, sopt);
break;
case IP_PORTRANGE:
if (inpflags & INP_LOWPORT)
optval = IP_PORTRANGE_LOW;
else
optval = IP_PORTRANGE_DEFAULT;
error = sockopt_setint(sopt, optval);
break;
case IP_PORTALGO:
optval = inp->inp_portalgo;
error = sockopt_setint(sopt, optval);
break;
default:
error = ENOPROTOOPT;
break;
}
break;
}
if (!error) {
inp->inp_flags = inpflags;
}
return error;
}
static int
ip_pktinfo_prepare(const struct inpcb *inp, const struct in_pktinfo *pktinfo,
struct ip_pktopts *pktopts, int *flags, kauth_cred_t cred)
{
struct ip_moptions *imo;
int error = 0;
bool addrset = false;
if (!in_nullhost(pktinfo->ipi_addr)) {
pktopts->ippo_laddr.sin_addr = pktinfo->ipi_addr;
/* EADDRNOTAVAIL? */
error = inpcb_bindableaddr(inp, &pktopts->ippo_laddr, cred);
if (error != 0)
return error;
addrset = true;
}
if (pktinfo->ipi_ifindex != 0) {
if (!addrset) {
struct ifnet *ifp;
struct in_ifaddr *ia;
int s;
/* pick up primary address */
s = pserialize_read_enter();
ifp = if_byindex(pktinfo->ipi_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
ia = in_get_ia_from_ifp(ifp);
if (ia == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
pktopts->ippo_laddr.sin_addr = IA_SIN(ia)->sin_addr;
pserialize_read_exit(s);
}
/*
* If ipi_ifindex is specified, use a copied or
* locally initialized ip_moptions. The original
* ip_moptions must not be modified.
*/
imo = &pktopts->ippo_imobuf; /* local buf in pktopts */
if (pktopts->ippo_imo != NULL) {
memcpy(imo, pktopts->ippo_imo, sizeof(*imo));
} else {
memset(imo, 0, sizeof(*imo));
imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
}
imo->imo_multicast_if_index = pktinfo->ipi_ifindex;
pktopts->ippo_imo = imo;
*flags |= IP_ROUTETOIFINDEX;
}
return error;
}
/*
* Set up IP outgoing packet options. Even if control is NULL,
* pktopts->ippo_laddr and pktopts->ippo_imo are set and used.
*/
int
ip_setpktopts(struct mbuf *control, struct ip_pktopts *pktopts, int *flags,
struct inpcb *inp, kauth_cred_t cred)
{
struct cmsghdr *cm;
struct in_pktinfo pktinfo;
int error;
pktopts->ippo_imo = inp->inp_moptions;
struct in_addr *ia = in_nullhost(in4p_prefsrcip(inp)) ? &in4p_laddr(inp) :
&in4p_prefsrcip(inp);
sockaddr_in_init(&pktopts->ippo_laddr, ia, 0);
if (control == NULL)
return 0;
/*
* XXX: Currently, we assume all the optional information is
* stored in a single mbuf.
*/
if (control->m_next)
return EINVAL;
for (; control->m_len > 0;
control->m_data += CMSG_ALIGN(cm->cmsg_len),
control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
cm = mtod(control, struct cmsghdr *);
if ((control->m_len < sizeof(*cm)) ||
(cm->cmsg_len == 0) ||
(cm->cmsg_len > control->m_len)) {
return EINVAL;
}
if (cm->cmsg_level != IPPROTO_IP)
continue;
switch (cm->cmsg_type) {
case IP_PKTINFO:
if (cm->cmsg_len != CMSG_LEN(sizeof(pktinfo)))
return EINVAL;
memcpy(&pktinfo, CMSG_DATA(cm), sizeof(pktinfo));
error = ip_pktinfo_prepare(inp, &pktinfo, pktopts,
flags, cred);
if (error)
return error;
break;
case IP_SENDSRCADDR: /* FreeBSD compatibility */
if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr)))
return EINVAL;
pktinfo.ipi_ifindex = 0;
pktinfo.ipi_addr =
((struct in_pktinfo *)CMSG_DATA(cm))->ipi_addr;
error = ip_pktinfo_prepare(inp, &pktinfo, pktopts,
flags, cred);
if (error)
return error;
break;
default:
return ENOPROTOOPT;
}
}
return 0;
}
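/*
 * Illustrative sketch (not from the original sources): the userland side of
 * ip_setpktopts() above, passing an IP_PKTINFO control message with
 * sendmsg(2) to select the outgoing interface and/or source address per
 * packet. Interface index 2 is an arbitrary assumption; msg_name and
 * msg_iov are assumed to be filled in by the caller.
 */
#if 0
static void
example_sendmsg_pktinfo(int s, struct msghdr *msg)
{
	static struct in_pktinfo pi = { .ipi_ifindex = 2 };
	static char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct cmsghdr *cm;

	msg->msg_control = cbuf;
	msg->msg_controllen = sizeof(cbuf);
	cm = CMSG_FIRSTHDR(msg);
	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type = IP_PKTINFO;
	cm->cmsg_len = CMSG_LEN(sizeof(pi));
	memcpy(CMSG_DATA(cm), &pi, sizeof(pi));
	(void)sendmsg(s, msg, 0);
}
#endif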
/*
* Set up IP options in pcb for insertion in output packets.
* Store in mbuf with pointer in pcbopt, adding pseudo-option
* with destination address if source routed.
*/
static int
ip_pcbopts(struct inpcb *inp, const struct sockopt *sopt)
{
struct mbuf *m;
const u_char *cp;
u_char *dp;
int cnt;
KASSERT(inp_locked(inp));
/* Turn off any old options. */
if (inp->inp_options) {
m_free(inp->inp_options);
}
inp->inp_options = NULL;
if ((cnt = sopt->sopt_size) == 0) {
/* Only turning off any previous options. */
return 0;
}
cp = sopt->sopt_data;
if (cnt % 4) {
/* Must be 4-byte aligned, because there's no padding. */
return EINVAL;
}
m = m_get(M_DONTWAIT, MT_SOOPTS);
if (m == NULL)
return ENOBUFS;
dp = mtod(m, u_char *);
memset(dp, 0, sizeof(struct in_addr));
dp += sizeof(struct in_addr);
m->m_len = sizeof(struct in_addr);
/*
* IP option list according to RFC791. Each option is of the form
*
* [optval] [olen] [(olen - 2) data bytes]
*
* We validate the list and copy options to an mbuf for prepending
* to data packets. The IP first-hop destination address will be
* stored before actual options and is zero if unset.
*/
while (cnt > 0) {
uint8_t optval, olen, offset;
optval = cp[IPOPT_OPTVAL];
if (optval == IPOPT_EOL || optval == IPOPT_NOP) {
olen = 1;
} else {
if (cnt < IPOPT_OLEN + 1)
goto bad;
olen = cp[IPOPT_OLEN];
if (olen < IPOPT_OLEN + 1 || olen > cnt)
goto bad;
}
if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) {
/*
* user process specifies route as:
* ->A->B->C->D
* D must be our final destination (but we can't
* check that since we may not have connected yet).
* A is first hop destination, which doesn't appear in
* actual IP option, but is stored before the options.
*/
if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr))
goto bad;
offset = cp[IPOPT_OFFSET];
memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1,
sizeof(struct in_addr));
cp += sizeof(struct in_addr);
cnt -= sizeof(struct in_addr);
olen -= sizeof(struct in_addr);
if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
memcpy(dp, cp, olen);
dp[IPOPT_OPTVAL] = optval;
dp[IPOPT_OLEN] = olen;
dp[IPOPT_OFFSET] = offset;
} else {
if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
memcpy(dp, cp, olen);
}
dp += olen;
m->m_len += olen;
if (optval == IPOPT_EOL)
break;
cp += olen;
cnt -= olen;
}
inp->inp_options = m;
return 0;
bad:
(void)m_free(m);
return EINVAL;
}
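/*
 * Illustrative sketch (not from the original sources): a userland
 * IP_OPTIONS buffer of the kind parsed by ip_pcbopts() above. The list must
 * be a multiple of 4 bytes; for LSRR the first gateway becomes the
 * first-hop destination and is stripped into the in_addr stored before the
 * options. The addresses are documentation-prefix examples.
 */
#if 0
static const uint8_t example_lsrr_opts[] = {
	IPOPT_NOP,			/* 1 byte of padding */
	IPOPT_LSRR, 11, IPOPT_MINOFF,	/* type, length, pointer */
	192, 0, 2, 1,			/* gateway A: first hop, stripped */
	198, 51, 100, 1,		/* gateway B: remains in the option */
};
/* setsockopt(s, IPPROTO_IP, IP_OPTIONS, example_lsrr_opts, sizeof(example_lsrr_opts)); */
#endif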
/*
* Following RFC 1724 section 3.3, addresses within 0.0.0.0/8 are interpreted as an interface index.
* Must be called in a pserialize critical section.
*/
static struct ifnet *
ip_multicast_if(struct in_addr *a, int *ifindexp)
{
int ifindex;
struct ifnet *ifp = NULL;
struct in_ifaddr *ia;
if (ifindexp)
*ifindexp = 0;
if (ntohl(a->s_addr) >> 24 == 0) {
ifindex = ntohl(a->s_addr) & 0xffffff;
ifp = if_byindex(ifindex);
if (!ifp)
return NULL;
if (ifindexp)
*ifindexp = ifindex;
} else {
IN_ADDRHASH_READER_FOREACH(ia, a->s_addr) {
if (in_hosteq(ia->ia_addr.sin_addr, *a) &&
(ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) {
ifp = ia->ia_ifp;
if (if_is_deactivated(ifp))
ifp = NULL;
break;
}
}
}
return ifp;
}
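/*
 * Illustrative sketch (not from the original sources): selecting the
 * outgoing multicast interface by index using the 0.0.0.0/8 encoding
 * accepted by ip_multicast_if() above. The interface name "wm0" is an
 * assumption for the example.
 */
#if 0
static int
example_set_multicast_if_by_index(int s)
{
	struct in_addr ifsel;
	unsigned int idx = if_nametoindex("wm0");

	if (idx == 0)
		return -1;
	/* high byte zero, interface index in the low 24 bits */
	ifsel.s_addr = htonl(idx & 0x00ffffff);
	return setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &ifsel, sizeof(ifsel));
}
#endif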
static int
ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval)
{
u_int tval;
u_char cval;
int error;
if (sopt == NULL)
return EINVAL;
switch (sopt->sopt_size) {
case sizeof(u_char):
error = sockopt_get(sopt, &cval, sizeof(u_char));
tval = cval;
break;
case sizeof(u_int):
error = sockopt_get(sopt, &tval, sizeof(u_int));
break;
default:
error = EINVAL;
}
if (error)
return error;
if (tval > maxval)
return EINVAL;
*val = tval;
return 0;
}
static int
ip_get_membership(const struct sockopt *sopt, struct ifnet **ifp,
struct psref *psref, struct in_addr *ia, bool add)
{
int error;
struct ip_mreq mreq;
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error)
return error;
if (!IN_MULTICAST(mreq.imr_multiaddr.s_addr))
return EINVAL;
memcpy(ia, &mreq.imr_multiaddr, sizeof(*ia));
if (in_nullhost(mreq.imr_interface)) {
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
struct route ro;
if (!add) {
*ifp = NULL;
return 0;
}
/*
* If no interface address was provided, use the interface of
* the route to the given multicast address.
*/
struct rtentry *rt;
memset(&ro, 0, sizeof(ro));
sockaddr_in_init(&u.dst4, ia, 0);
error = rtcache_setdst(&ro, &u.dst);
if (error != 0)
return error;
*ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp : NULL;
if (*ifp != NULL) {
if (if_is_deactivated(*ifp))
*ifp = NULL;
else
if_acquire(*ifp, psref);
}
rtcache_unref(rt, &ro);
rtcache_free(&ro);
} else {
int s = pserialize_read_enter();
*ifp = ip_multicast_if(&mreq.imr_interface, NULL);
if (!add && *ifp == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
if (*ifp != NULL) {
if (if_is_deactivated(*ifp))
*ifp = NULL;
else
if_acquire(*ifp, psref);
}
pserialize_read_exit(s);
}
return 0;
}
/*
* Add a multicast group membership.
* Group must be a valid IP multicast address.
*/
static int
ip_add_membership(struct ip_moptions *imo, const struct sockopt *sopt)
{
struct ifnet *ifp = NULL; // XXX: gcc [ppc]
struct in_addr ia;
int i, error, bound;
struct psref psref;
/* imo is protected by solock or referenced only by the caller */
bound = curlwp_bind();
if (sopt->sopt_size == sizeof(struct ip_mreq))
error = ip_get_membership(sopt, &ifp, &psref, &ia, true);
else {
#ifdef INET6
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
#else
error = EINVAL;
#endif
}
if (error)
goto out;
/*
* See if we found an interface, and confirm that it
* supports multicast.
*/
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
/*
* See if the membership already exists or if all the
* membership slots are full.
*/
for (i = 0; i < imo->imo_num_memberships; ++i) {
if (imo->imo_membership[i]->inm_ifp == ifp &&
in_hosteq(imo->imo_membership[i]->inm_addr, ia))
break;
}
if (i < imo->imo_num_memberships) {
error = EADDRINUSE;
goto out;
}
if (i == IP_MAX_MEMBERSHIPS) {
error = ETOOMANYREFS;
goto out;
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
imo->imo_membership[i] = in_addmulti(&ia, ifp);
if (imo->imo_membership[i] == NULL) {
error = ENOBUFS;
goto out;
}
++imo->imo_num_memberships;
error = 0;
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
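/*
 * Illustrative sketch (not from the original sources): joining a multicast
 * group from userland, which reaches ip_add_membership() above via
 * ip_setmoptions(). The group address is arbitrary; INADDR_ANY as
 * imr_interface lets the kernel pick the interface from the routing table.
 */
#if 0
static int
example_join_group(int s)
{
	struct ip_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
	return setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
}
#endif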
/*
* Drop a multicast group membership.
* Group must be a valid IP multicast address.
*/
static int
ip_drop_membership(struct ip_moptions *imo, const struct sockopt *sopt)
{
struct in_addr ia = { .s_addr = 0 }; // XXX: gcc [ppc]
struct ifnet *ifp = NULL; // XXX: gcc [ppc]
int i, error, bound;
struct psref psref;
/* imo is protected by solock or referenced only by the caller */
bound = curlwp_bind();
if (sopt->sopt_size == sizeof(struct ip_mreq))
error = ip_get_membership(sopt, &ifp, &psref, &ia, false);
else {
#ifdef INET6
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
#else
error = EINVAL;
#endif
}
if (error)
goto out;
/*
* Find the membership in the membership array.
*/
for (i = 0; i < imo->imo_num_memberships; ++i) {
if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) &&
in_hosteq(imo->imo_membership[i]->inm_addr, ia))
break;
}
if (i == imo->imo_num_memberships) {
error = EADDRNOTAVAIL;
goto out;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
in_delmulti(imo->imo_membership[i]);
/*
* Remove the gap in the membership array.
*/
for (++i; i < imo->imo_num_memberships; ++i)
imo->imo_membership[i-1] = imo->imo_membership[i];
--imo->imo_num_memberships;
error = 0;
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Set the IP multicast options in response to user setsockopt().
*/
int
ip_setmoptions(struct ip_moptions **pimo, const struct sockopt *sopt)
{
struct ip_moptions *imo = *pimo;
struct in_addr addr;
struct ifnet *ifp;
int ifindex, error = 0;
/* If the passed imo isn't NULL, it should be protected by solock */
if (!imo) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
imo = kmem_intr_alloc(sizeof(*imo), KM_NOSLEEP);
if (imo == NULL)
return ENOBUFS;
imo->imo_multicast_if_index = 0;
imo->imo_multicast_addr.s_addr = INADDR_ANY;
imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
imo->imo_num_memberships = 0;
*pimo = imo;
}
switch (sopt->sopt_name) {
case IP_MULTICAST_IF: {
int s;
/*
* Select the interface for outgoing multicast packets.
*/
error = sockopt_get(sopt, &addr, sizeof(addr));
if (error)
break;
/*
* INADDR_ANY is used to remove a previous selection.
* When no interface is selected, a default one is
* chosen every time a multicast packet is sent.
*/
if (in_nullhost(addr)) {
imo->imo_multicast_if_index = 0;
break;
}
/*
* The selected interface is identified by its local
* IP address. Find the interface and confirm that
* it supports multicasting.
*/
s = pserialize_read_enter();
ifp = ip_multicast_if(&addr, &ifindex);
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
imo->imo_multicast_if_index = ifp->if_index;
pserialize_read_exit(s);
if (ifindex)
imo->imo_multicast_addr = addr;
else
imo->imo_multicast_addr.s_addr = INADDR_ANY;
break;
}
case IP_MULTICAST_TTL:
/*
* Set the IP time-to-live for outgoing multicast packets.
*/
error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL);
break;
case IP_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1);
break;
case IP_ADD_MEMBERSHIP: /* IPV6_JOIN_GROUP */
error = ip_add_membership(imo, sopt);
break;
case IP_DROP_MEMBERSHIP: /* IPV6_LEAVE_GROUP */
error = ip_drop_membership(imo, sopt);
break;
default:
error = EOPNOTSUPP;
break;
}
/*
* If all options have default values, no need to keep the structure.
*/
if (imo->imo_multicast_if_index == 0 &&
imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
imo->imo_num_memberships == 0) {
kmem_intr_free(imo, sizeof(*imo));
*pimo = NULL;
}
return error;
}
/*
* Return the IP multicast options in response to user getsockopt().
*/
int
ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt)
{
struct in_addr addr;
uint8_t optval;
int error = 0;
/* imo is protected by solock or referenced only by the caller */
switch (sopt->sopt_name) {
case IP_MULTICAST_IF:
if (imo == NULL || imo->imo_multicast_if_index == 0)
addr = zeroin_addr;
else if (imo->imo_multicast_addr.s_addr) {
/* return the value user has set */
addr = imo->imo_multicast_addr;
} else {
struct ifnet *ifp;
struct in_ifaddr *ia = NULL;
int s = pserialize_read_enter();
ifp = if_byindex(imo->imo_multicast_if_index);
if (ifp != NULL) {
ia = in_get_ia_from_ifp(ifp);
}
addr = ia ? ia->ia_addr.sin_addr : zeroin_addr;
pserialize_read_exit(s);
}
error = sockopt_set(sopt, &addr, sizeof(addr));
break;
case IP_MULTICAST_TTL:
optval = imo ? imo->imo_multicast_ttl
: IP_DEFAULT_MULTICAST_TTL;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IP_MULTICAST_LOOP:
optval = imo ? imo->imo_multicast_loop
: IP_DEFAULT_MULTICAST_LOOP;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
default:
error = EOPNOTSUPP;
}
return error;
}
/*
* Discard the IP multicast options.
*/
void
ip_freemoptions(struct ip_moptions *imo)
{
int i;
/* The owner of imo (inp) should be protected by solock */
if (imo != NULL) {
for (i = 0; i < imo->imo_num_memberships; ++i) {
struct in_multi *inm = imo->imo_membership[i];
in_delmulti(inm);
/* ifp should not leave thanks to solock */
}
kmem_intr_free(imo, sizeof(*imo));
}
}
/*
* Routine called from ip_output() to loop back a copy of an IP multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be lo0ifp -- easier than replicating that code here.
*/
static void
ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst)
{
struct ip *ip;
struct mbuf *copym;
copym = m_copypacket(m, M_DONTWAIT);
if (copym != NULL &&
(copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip)))
copym = m_pullup(copym, sizeof(struct ip));
if (copym == NULL)
return;
/*
* We don't bother to fragment if the IP length is greater
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
in_undefer_cksum_tcpudp(copym);
copym->m_pkthdr.csum_flags &=
~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
(void)looutput(ifp, copym, sintocsa(dst), NULL);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
/*
* Ensure sending address is valid.
* Returns 0 on success, -1 if an error should be sent back or 1
* if the packet could be dropped without error (protocol dependent).
*/
static int
ip_ifaddrvalid(const struct in_ifaddr *ia)
{
if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
return 0;
if (ia->ia4_flags & IN_IFF_DUPLICATED)
return -1;
else if (ia->ia4_flags & (IN_IFF_TENTATIVE | IN_IFF_DETACHED))
return 1;
return 0;
}
/* $NetBSD: if_43.c,v 1.27 2023/03/30 17:48:10 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_43.c,v 1.27 2023/03/30 17:48:10 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/mbuf.h> /* for MLEN */
#include <sys/protosw.h>
#include <sys/compat_stub.h>
#include <sys/syscallargs.h>
#include <net/if.h>
#include <net/bpf.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <net/if_gre.h>
#include <net/if_tap.h>
#include <net80211/ieee80211_ioctl.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#include <compat/net/if.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_util.h>
#include <compat/common/compat_mod.h>
#include <uvm/uvm_extern.h>
#if defined(COMPAT_43)
/*
* Use a wrapper so that compat_cvtcmd() can return a u_long
*/
static int
do_compat_cvtcmd(u_long *ncmd, u_long ocmd)
{
*ncmd = compat_cvtcmd(ocmd);
return 0;
}
u_long
compat_cvtcmd(u_long cmd)
{
u_long ncmd;
if (IOCPARM_LEN(cmd) != sizeof(struct oifreq))
return cmd;
switch (cmd) {
case OSIOCSIFADDR:
return SIOCSIFADDR;
case OOSIOCGIFADDR:
return SIOCGIFADDR;
case OSIOCSIFDSTADDR:
return SIOCSIFDSTADDR;
case OOSIOCGIFDSTADDR:
return SIOCGIFDSTADDR;
case OSIOCSIFFLAGS:
return SIOCSIFFLAGS;
case OSIOCGIFFLAGS:
return SIOCGIFFLAGS;
case OOSIOCGIFBRDADDR:
return SIOCGIFBRDADDR;
case OSIOCSIFBRDADDR:
return SIOCSIFBRDADDR;
case OOSIOCGIFCONF:
return SIOCGIFCONF;
case OOSIOCGIFNETMASK:
return SIOCGIFNETMASK;
case OSIOCSIFNETMASK:
return SIOCSIFNETMASK;
case OSIOCGIFCONF:
return SIOCGIFCONF;
case OSIOCADDMULTI:
return SIOCADDMULTI;
case OSIOCDELMULTI:
return SIOCDELMULTI;
case SIOCSIFMEDIA_43:
return SIOCSIFMEDIA_80;
case OSIOCGIFMTU:
return SIOCGIFMTU;
case OSIOCGIFDATA:
return SIOCGIFDATA;
case OSIOCZIFDATA:
return SIOCZIFDATA;
case OBIOCGETIF:
return BIOCGETIF;
case OBIOCSETIF:
return BIOCSETIF;
case OTAPGIFNAME:
return TAPGIFNAME;
default:
/*
* XXX: the following code should be removed and the
* ioctls that need special treatment should be moved
* to the switch above.
*/
ncmd = ((cmd) & ~(IOCPARM_MASK << IOCPARM_SHIFT)) |
(sizeof(struct ifreq) << IOCPARM_SHIFT);
switch (ncmd) {
case BIOCGETIF:
case BIOCSETIF:
case GREDSOCK:
case GREGADDRD:
case GREGADDRS:
case GREGPROTO:
case GRESADDRD:
case GRESADDRS:
case GRESPROTO:
case GRESSOCK:
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCDIFADDR:
case SIOCDIFADDR_IN6:
case SIOCDIFPHYADDR:
case SIOCG80211NWID:
case SIOCG80211STATS:
case SIOCG80211ZSTATS:
case SIOCGIFADDR:
case SIOCGIFADDR_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCGIFALIFETIME_IN6:
case SIOCGIFBRDADDR:
case SIOCGIFDLT:
case SIOCGIFDSTADDR:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFFLAGS:
case SIOCGIFGENERIC:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
case SIOCGIFNETMASK:
case SIOCGIFNETMASK_IN6:
case SIOCGIFPDSTADDR:
case SIOCGIFPDSTADDR_IN6:
case SIOCGIFPSRCADDR:
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFSTAT_ICMP6:
case SIOCGIFSTAT_IN6:
case SIOCGVH:
case SIOCIFCREATE:
case SIOCIFDESTROY:
case SIOCS80211NWID:
case SIOCSIFADDR:
case SIOCSIFADDR_IN6:
case SIOCSIFBRDADDR:
case SIOCSIFDSTADDR:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFFLAGS:
case SIOCSIFGENERIC:
case SIOCSIFMEDIA:
case SIOCSIFMETRIC:
case SIOCSIFMTU:
case SIOCSIFNETMASK:
case SIOCSIFNETMASK_IN6:
case SIOCSVH:
case TAPGIFNAME:
return ncmd;
default:
{ int rv;
MODULE_HOOK_CALL(if43_cvtcmd_20_hook, (ncmd), enosys(),
rv);
if (rv == 0)
return ncmd;
return cmd;
}
}
}
}
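/*
 * Illustrative sketch (not compiled): the expected calling pattern for
 * compat_cvtcmd().  The helper name is made up for the example;
 * compat_ifioctl() below performs the same check on the real path.
 */
#if 0
static bool
example_is_oifreq_cmd(u_long ocmd)
{
/* The command word changes iff it was an oifreq-sized 4.3BSD request. */
return compat_cvtcmd(ocmd) != ocmd;
}
#endif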
int
compat_ifioctl(struct socket *so, u_long ocmd, u_long cmd, void *data,
struct lwp *l)
{
int error;
struct ifreq *ifr = (struct ifreq *)data;
struct ifreq ifrb;
struct oifreq *oifr = NULL;
struct ifnet *ifp;
struct sockaddr *sa;
struct psref psref;
int bound = curlwp_bind();
ifp = if_get(ifr->ifr_name, &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
return ENXIO;
}
/*
* If the command has not been converted yet, make sure that it is
* (the upper layer handles old socket calls, but not oifreq
* calls).
*/
if (cmd == ocmd) {
cmd = compat_cvtcmd(ocmd);
}
if (cmd != ocmd) {
oifr = data;
ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
}
switch (ocmd) {
enum { maxlen = sizeof(oifr->ifr_ifru) };
CTASSERT(maxlen == 16);
socklen_t famlen;
case OSIOCSIFADDR:
case OSIOCSIFDSTADDR:
case OSIOCSIFBRDADDR:
case OSIOCSIFNETMASK:
sa = &ifr->ifr_addr;
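/*
 * Background for the fixup below: the 4.3BSD struct sockaddr had no
 * sa_len and a 16-bit sa_family (roughly "u_int16_t sa_family; char
 * sa_data[14];").  Reinterpreted through the modern layout, the old
 * family's low byte lands in sa_len on little-endian hosts (leaving
 * sa_family zero), while big-endian hosts see the family in the right
 * place but a zero sa_len; both cases are patched up here before the
 * length is validated.
 */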
#if BYTE_ORDER != BIG_ENDIAN
if (sa->sa_family == 0 && sa->sa_len < maxlen) {
sa->sa_family = sa->sa_len;
sa->sa_len = maxlen;
}
#else
if (sa->sa_len == 0)
sa->sa_len = maxlen;
#endif
famlen = sockaddr_getsize_by_family(sa->sa_family);
if (famlen > sa->sa_len) {
curlwp_bindx(bound);
return EAFNOSUPPORT;
}
break;
}
error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so, cmd, ifr, ifp);
if_put(ifp, &psref);
curlwp_bindx(bound);
switch (ocmd) {
case OOSIOCGIFADDR:
case OOSIOCGIFDSTADDR:
case OOSIOCGIFBRDADDR:
case OOSIOCGIFNETMASK:
*(u_int16_t *)&ifr->ifr_addr =
((struct sockaddr *)&ifr->ifr_addr)->sa_family;
break;
}
if (cmd != ocmd)
IFREQN2O_43(oifr, ifr);
return error;
}
int
if_43_init(void)
{
MODULE_HOOK_SET(if_cvtcmd_43_hook, do_compat_cvtcmd);
MODULE_HOOK_SET(if_ifioctl_43_hook, compat_ifioctl);
return 0;
}
int
if_43_fini(void)
{
MODULE_HOOK_UNSET(if_cvtcmd_43_hook);
MODULE_HOOK_UNSET(if_ifioctl_43_hook);
return 0;
}
#endif /* defined(COMPAT_43) */
/* $NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $ */
/*
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Kentaro A. Kurahone.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
* $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
*/
/*
* @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif
#ifndef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>
/* SACK block pool. */
static struct pool sackhole_pool;
void
tcp_sack_init(void)
{
pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0,
"sackholepl", NULL, IPL_SOFTNET);
}
static struct sackhole *
sack_allochole(struct tcpcb *tp)
{
struct sackhole *hole;
if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
return NULL;
}
hole = pool_get(&sackhole_pool, PR_NOWAIT);
if (hole == NULL) {
return NULL;
}
tp->snd_numholes++;
tcp_sack_globalholes++;
return hole;
}
static struct sackhole *
sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
struct sackhole *prev)
{
struct sackhole *hole;
hole = sack_allochole(tp);
if (hole == NULL) {
return NULL;
}
hole->start = hole->rxmit = start;
hole->end = end;
if (prev != NULL) {
TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
} else {
TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
}
return hole;
}
static struct sackhole *
sack_removehole(struct tcpcb *tp, struct sackhole *hole)
{
struct sackhole *next;
next = TAILQ_NEXT(hole, sackhole_q);
tp->snd_numholes--;
tcp_sack_globalholes--;
TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
pool_put(&sackhole_pool, hole);
return next;
}
/*
* tcp_new_dsack: record the reception of a duplicated segment.
*/
void
tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
{
if (TCP_SACK_ENABLED(tp)) {
tp->rcv_dsack_block.left = seq;
tp->rcv_dsack_block.right = seq + len;
tp->rcv_sack_flags |= TCPSACK_HAVED;
}
}
/*
* tcp_sack_option: parse the given SACK option and update the scoreboard.
*/
void
tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
int optlen)
{
struct sackblk
t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
struct sackblk *sack = NULL;
struct sackhole *cur = NULL;
struct sackhole *tmp = NULL;
const char *lp = cp + 2;
int i, j, num_sack_blks;
tcp_seq left, right, acked;
/*
* If we aren't processing SACK responses, this is not an ACK,
* or the peer sent us a SACK option with an invalid length,
* don't update the scoreboard.
*/
if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
(optlen % 8 != 2 || optlen < 10)) {
return;
}
/*
* If we don't want any SACK holes to be allocated, just return.
*/
if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
return;
}
/* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
return;
/*
* Extract SACK blocks.
*
* Note that t_sack_block is sorted so that we only need to do
* one pass over the sequence number space. (SACK "fast-path")
*/
num_sack_blks = optlen / 8;
acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
memcpy(&left, lp, sizeof(uint32_t));
memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
left = ntohl(left);
right = ntohl(right);
if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
SEQ_GEQ(left, right)) {
/* SACK entry that's old, or invalid. */
i--;
num_sack_blks--;
continue;
}
/* Insertion sort. */
for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
j--) {
t_sack_block[j].left = t_sack_block[j - 1].left;
t_sack_block[j].right = t_sack_block[j - 1].right;
}
t_sack_block[j].left = left;
t_sack_block[j].right = right;
}
/* Update the scoreboard. */
cur = TAILQ_FIRST(&tp->snd_holes);
for (i = 0; i < num_sack_blks; i++) {
sack = &t_sack_block[i];
/*
* FACK TCP. Update snd_fack so we can enter Fast
* Recovery early.
*/
if (SEQ_GEQ(sack->right, tp->snd_fack))
tp->snd_fack = sack->right;
if (TAILQ_EMPTY(&tp->snd_holes)) {
/* First hole. */
cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
if (cur == NULL) {
/* ENOBUFS, bail out. */
return;
}
tp->rcv_lastsack = sack->right;
continue; /* With next sack block */
}
/* Go through the list of holes. */
while (cur) {
if (SEQ_LEQ(sack->right, cur->start))
/* SACKs data before the current hole */
break; /* No use going through more holes */
if (SEQ_GEQ(sack->left, cur->end)) {
/* SACKs data beyond the current hole */
cur = TAILQ_NEXT(cur, sackhole_q);
continue;
}
if (SEQ_LEQ(sack->left, cur->start)) {
/* Data acks at least the beginning of hole */
if (SEQ_GEQ(sack->right, cur->end)) {
/* Acks entire hole, so delete hole */
cur = sack_removehole(tp, cur);
break;
}
/* Otherwise, move start of hole forward */
cur->start = sack->right;
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
break;
}
if (SEQ_GEQ(sack->right, cur->end)) {
/* Move end of hole backward. */
cur->end = sack->left;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur = TAILQ_NEXT(cur, sackhole_q);
break;
}
if (SEQ_LT(cur->start, sack->left) &&
SEQ_GT(cur->end, sack->right)) {
/*
* ACKs some data in middle of a hole; need to
* split current hole
*/
tmp = sack_inserthole(tp, sack->right, cur->end,
cur);
if (tmp == NULL) {
return;
}
tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
cur->end = sack->left;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur = tmp;
break;
}
}
/* At this point, we have reached the tail of the list. */
if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
/*
* Need to append new hole at end.
*/
cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
NULL);
if (cur == NULL) {
return;
}
}
if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
tp->rcv_lastsack = sack->right;
}
}
}
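/*
 * Worked example (illustrative numbers): with snd_una = 100 and a
 * single hole [100, 500), a SACK block for [200, 300) acks data in the
 * middle of the hole, so the hole is split: the existing hole is
 * trimmed to [100, 200) and a new hole [300, 500) is inserted after it,
 * with its rxmit point taken from the old hole (clamped to the new
 * start).  rcv_lastsack then advances to 300 if it was smaller.
 */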
/*
* tcp_del_sackholes: remove holes covered by a cumulative ACK.
*/
void
tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
{
/* Max because this could be an older ack that just arrived. */
tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
th->th_ack : tp->snd_una;
struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
while (cur) {
if (SEQ_LEQ(cur->end, lastack)) {
cur = sack_removehole(tp, cur);
} else if (SEQ_LT(cur->start, lastack)) {
cur->start = lastack;
if (SEQ_LT(cur->rxmit, cur->start))
cur->rxmit = cur->start;
break;
} else
break;
}
}
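/*
 * Worked example (illustrative numbers): with holes [100, 200) and
 * [250, 300) on the scoreboard, a cumulative ACK of 260 removes the
 * first hole entirely and trims the second to [260, 300), pulling its
 * rxmit point forward if it lagged behind.
 */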
/*
* tcp_free_sackholes: clear the scoreboard.
*/
void
tcp_free_sackholes(struct tcpcb *tp)
{
struct sackhole *sack;
/* Free up the SACK hole list. */
while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
sack_removehole(tp, sack);
}
KASSERT(tp->snd_numholes == 0);
}
/*
* Returns pointer to a sackhole if there are any pending retransmissions;
* NULL otherwise.
*/
struct sackhole *
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
{
struct sackhole *cur = NULL;
if (!TCP_SACK_ENABLED(tp))
return (NULL);
*sack_bytes_rexmt = 0;
TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
if (SEQ_LT(cur->rxmit, cur->end)) {
if (SEQ_LT(cur->rxmit, tp->snd_una)) {
/* old SACK hole */
continue;
}
*sack_bytes_rexmt += (cur->rxmit - cur->start);
break;
}
*sack_bytes_rexmt += (cur->rxmit - cur->start);
}
return (cur);
}
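/*
 * Illustrative sketch (not compiled) of how an output path might
 * consume the result; the function name is made up for the example,
 * the real consumer is the TCP output code.
 */
#if 0
static void
example_sack_rexmit(struct tcpcb *tp)
{
int sack_bytes_rexmt;
struct sackhole *p;

p = tcp_sack_output(tp, &sack_bytes_rexmt);
if (p == NULL)
return;
/*
 * sack_bytes_rexmt now reports how much has already been
 * retransmitted into the holes.  Retransmit starting at p->rxmit,
 * at most p->end - p->rxmit bytes, then advance p->rxmit by the
 * amount actually sent.
 */
}
#endif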
/*
* After a timeout, the SACK list may be rebuilt. This SACK information
* should be used to avoid retransmitting SACKed data. This function
* traverses the SACK list to see if snd_nxt should be moved forward.
*/
void
tcp_sack_adjust(struct tcpcb *tp)
{
struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
struct sackhole *n = NULL;
if (TAILQ_EMPTY(&tp->snd_holes))
return; /* No holes */
if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
return; /* We're already beyond any SACKed blocks */
/*
* Two cases for which we want to advance snd_nxt:
* i) snd_nxt lies between end of one hole and beginning of another
* ii) snd_nxt lies between end of last hole and rcv_lastsack
*/
while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
if (SEQ_GEQ(tp->snd_nxt, n->start))
cur = n;
else {
tp->snd_nxt = n->start;
return;
}
}
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
tp->snd_nxt = tp->rcv_lastsack;
return;
}
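/*
 * Worked example (illustrative numbers): with holes [100, 140) and
 * [300, 340) and rcv_lastsack = 400, a snd_nxt of 150 lies beyond the
 * end of the first hole but before the start of the next, so it is
 * advanced to 300; if there were no further hole, it would be advanced
 * to rcv_lastsack instead.
 */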
/*
* tcp_sack_numblks: return the number of SACK blocks to send.
*/
int
tcp_sack_numblks(const struct tcpcb *tp)
{
int numblks;
if (!TCP_SACK_ENABLED(tp)) {
return 0;
}
numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
tp->t_segqlen;
if (numblks == 0) {
return 0;
}
if (numblks > TCP_SACK_MAX) {
numblks = TCP_SACK_MAX;
}
return numblks;
}
#if defined(DDB)
void sack_dump(const struct tcpcb *);
void
sack_dump(const struct tcpcb *tp)
{
const struct sackhole *cur;
printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
tp->snd_una, tp->snd_max);
printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
tp->rcv_lastsack, tp->snd_fack);
printf("numholes=%d\n", tp->snd_numholes);
TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
cur->start, cur->end, cur->rxmit);
}
}
#endif /* defined(DDB) */
/* $NetBSD: rtsock_50.c,v 1.16 2020/01/29 05:47:12 thorpej Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtsock_50.c,v 1.16 2020/01/29 05:47:12 thorpej Exp $");
#define COMPAT_RTSOCK /* Use the COMPATNAME/COMPATCALL macros and the
* various other compat definitions - see
* sys/net/rtsock_shared.c for details
*/
#include <net/rtsock_shared.c>
#include <compat/net/route_50.h>
static struct sysctllog *clog;
void
compat_50_rt_oifmsg(struct ifnet *ifp)
{
struct if_msghdr50 oifm;
struct if_data ifi;
struct mbuf *m;
struct rt_addrinfo info;
if (COMPATNAME(route_info).ri_cb.any_count == 0)
return;
(void)memset(&info, 0, sizeof(info));
(void)memset(&oifm, 0, sizeof(oifm));
if_export_if_data(ifp, &ifi, false);
oifm.ifm_index = ifp->if_index;
oifm.ifm_flags = ifp->if_flags;
oifm.ifm_data.ifi_type = ifi.ifi_type;
oifm.ifm_data.ifi_addrlen = ifi.ifi_addrlen;
oifm.ifm_data.ifi_hdrlen = ifi.ifi_hdrlen;
oifm.ifm_data.ifi_link_state = ifi.ifi_link_state;
oifm.ifm_data.ifi_mtu = ifi.ifi_mtu;
oifm.ifm_data.ifi_metric = ifi.ifi_metric;
oifm.ifm_data.ifi_baudrate = ifi.ifi_baudrate;
oifm.ifm_data.ifi_ipackets = ifi.ifi_ipackets;
oifm.ifm_data.ifi_ierrors = ifi.ifi_ierrors;
oifm.ifm_data.ifi_opackets = ifi.ifi_opackets;
oifm.ifm_data.ifi_oerrors = ifi.ifi_oerrors;
oifm.ifm_data.ifi_collisions = ifi.ifi_collisions;
oifm.ifm_data.ifi_ibytes = ifi.ifi_ibytes;
oifm.ifm_data.ifi_obytes = ifi.ifi_obytes;
oifm.ifm_data.ifi_imcasts = ifi.ifi_imcasts;
oifm.ifm_data.ifi_omcasts = ifi.ifi_omcasts;
oifm.ifm_data.ifi_iqdrops = ifi.ifi_iqdrops;
oifm.ifm_data.ifi_noproto = ifi.ifi_noproto;
TIMESPEC_TO_TIMEVAL(&oifm.ifm_data.ifi_lastchange,
&ifi.ifi_lastchange);
oifm.ifm_addrs = 0;
m = COMPATNAME(rt_msg1)(RTM_OIFINFO, &info, (void *)&oifm, sizeof(oifm));
if (m == NULL)
return;
COMPATNAME(route_enqueue)(m, 0);
}
int
compat_50_iflist(struct ifnet *ifp, struct rt_walkarg *w,
struct rt_addrinfo *info, size_t len)
{
struct if_msghdr50 *ifm;
struct if_data ifi;
int error;
ifm = (struct if_msghdr50 *)w->w_tmem;
if_export_if_data(ifp, &ifi, false);
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags;
ifm->ifm_data.ifi_type = ifi.ifi_type;
ifm->ifm_data.ifi_addrlen = ifi.ifi_addrlen;
ifm->ifm_data.ifi_hdrlen = ifi.ifi_hdrlen;
ifm->ifm_data.ifi_link_state = ifi.ifi_link_state;
ifm->ifm_data.ifi_mtu = ifi.ifi_mtu;
ifm->ifm_data.ifi_metric = ifi.ifi_metric;
ifm->ifm_data.ifi_baudrate = ifi.ifi_baudrate;
ifm->ifm_data.ifi_ipackets = ifi.ifi_ipackets;
ifm->ifm_data.ifi_ierrors = ifi.ifi_ierrors;
ifm->ifm_data.ifi_opackets = ifi.ifi_opackets;
ifm->ifm_data.ifi_oerrors = ifi.ifi_oerrors;
ifm->ifm_data.ifi_collisions = ifi.ifi_collisions;
ifm->ifm_data.ifi_ibytes = ifi.ifi_ibytes;
ifm->ifm_data.ifi_obytes = ifi.ifi_obytes;
ifm->ifm_data.ifi_imcasts = ifi.ifi_imcasts;
ifm->ifm_data.ifi_omcasts = ifi.ifi_omcasts;
ifm->ifm_data.ifi_iqdrops = ifi.ifi_iqdrops;
ifm->ifm_data.ifi_noproto = ifi.ifi_noproto;
TIMESPEC_TO_TIMEVAL(&ifm->ifm_data.ifi_lastchange,
&ifi.ifi_lastchange);
ifm->ifm_addrs = info->rti_addrs;
error = copyout(ifm, w->w_where, len);
if (error)
return error;
w->w_where = (char *)w->w_where + len;
return 0;
}
void
rtsock_50_init(void)
{
MODULE_HOOK_SET(rtsock_iflist_50_hook, compat_50_iflist);
MODULE_HOOK_SET(rtsock_oifmsg_50_hook, compat_50_rt_oifmsg);
MODULE_HOOK_SET(rtsock_rt_missmsg_50_hook, compat_50_rt_missmsg);
MODULE_HOOK_SET(rtsock_rt_ifmsg_50_hook, compat_50_rt_ifmsg);
MODULE_HOOK_SET(rtsock_rt_addrmsg_rt_50_hook, compat_50_rt_addrmsg_rt);
MODULE_HOOK_SET(rtsock_rt_addrmsg_src_50_hook,
compat_50_rt_addrmsg_src);
MODULE_HOOK_SET(rtsock_rt_addrmsg_50_hook, compat_50_rt_addrmsg);
MODULE_HOOK_SET(rtsock_rt_ifannouncemsg_50_hook,
compat_50_rt_ifannouncemsg);
MODULE_HOOK_SET(rtsock_rt_ieee80211msg_50_hook,
compat_50_rt_ieee80211msg);
sysctl_net_route_setup(&clog, PF_OROUTE, "ortable");
}
void
rtsock_50_fini(void)
{
sysctl_teardown(&clog);
MODULE_HOOK_UNSET(rtsock_iflist_50_hook);
MODULE_HOOK_UNSET(rtsock_oifmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_missmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ifmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_rt_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_src_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ifannouncemsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ieee80211msg_50_hook);
}
/* $NetBSD: sys_syscall.c,v 1.15 2022/06/29 16:33:09 hannken Exp $ */
/*-
* Copyright (c) 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by David Laight.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_syscall.c,v 1.15 2022/06/29 16:33:09 hannken Exp $");
#include <sys/syscall_stats.h>
#include <sys/syscallvar.h>
/*
* MI indirect system call support.
* Included from sys_indirect.c and compat/netbsd32/netbsd32_indirect.c
*
* SYS_SYSCALL is set to the required function name.
*/
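/*
 * For example (illustrative; the real definitions live in the including
 * files named above), an includer is expected to do something like
 *
 *	#define	SYS_SYSCALL	sys_syscall
 *	#include "sys_syscall.c"
 *
 * so that the handler below is instantiated under the desired name.
 */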
#define CONCAT(a,b) __CONCAT(a,b)
static void
CONCAT(SYS_SYSCALL, _biglockcheck)(struct proc *p, int code)
{
#ifdef DIAGNOSTIC
kpreempt_disable(); /* make curcpu() stable */
KASSERTMSG(curcpu()->ci_biglock_count == 0,
"syscall %ld of emul %s leaked %d kernel locks",
(long)code, p->p_emul->e_name, curcpu()->ci_biglock_count);
kpreempt_enable();
#endif
}
int
SYS_SYSCALL(struct lwp *l, const struct CONCAT(SYS_SYSCALL, _args) *uap,
register_t *rval)
{
/* {
syscallarg(int) code;
syscallarg(register_t) args[SYS_MAXSYSARGS];
} */
const struct sysent *callp;
struct proc *p = l->l_proc;
int code;
int error;
#ifdef NETBSD32_SYSCALL
register_t args64[SYS_MAXSYSARGS];
int i, narg;
#define TRACE_ARGS args64
#else
#define TRACE_ARGS &SCARG(uap, args[0])
#endif
callp = p->p_emul->e_sysent;
code = SCARG(uap, code) & (SYS_NSYSENT - 1);
SYSCALL_COUNT(syscall_counts, code);
callp += code;
if (__predict_false(callp->sy_flags & SYCALL_INDIRECT))
return ENOSYS;
if (__predict_true(!p->p_trace_enabled)) {
error = sy_call(callp, l, &uap->args, rval);
CONCAT(SYS_SYSCALL, _biglockcheck)(p, code);
return error;
}
#ifdef NETBSD32_SYSCALL
narg = callp->sy_narg;
for (i = 0; i < narg; i++)
args64[i] = SCARG(uap, args[i]);
#endif
error = trace_enter(code, callp, TRACE_ARGS);
if (__predict_true(error == 0))
error = sy_call(callp, l, &uap->args, rval);
trace_exit(code, callp, &uap->args, rval, error);
CONCAT(SYS_SYSCALL, _biglockcheck)(p, code);
return error;
#undef TRACE_ARGS
}
/* $NetBSD: union_vfsops.c,v 1.87 2023/02/13 08:39:40 hannken Exp $ */
/*
* Copyright (c) 1994 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1994 Jan-Simon Pendry.
* All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95
*/
/*
* Union Layer
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_vfsops.c,v 1.87 2023/02/13 08:39:40 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/genfs/genfs.h>
#include <fs/union/union.h>
MODULE(MODULE_CLASS_VFS, union, NULL);
/*
* Mount union filesystem
*/
int
union_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0;
struct union_args *args = data;
struct vnode *lowerrootvp = NULLVP;
struct vnode *upperrootvp = NULLVP;
struct union_mount *um = 0;
const char *cp;
char *xp;
int len;
size_t size;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p)\n", __func__, mp);
#endif
if (mp->mnt_flag & MNT_GETARGS) {
um = MOUNTTOUNIONMOUNT(mp);
if (um == NULL)
return EIO;
args->target = NULL;
args->mntflags = um->um_op;
*data_len = sizeof *args;
return 0;
}
/*
* Update is a no-op
*/
if (mp->mnt_flag & MNT_UPDATE) {
/*
* Need to provide:
* 1. a way to convert between rdonly and rdwr mounts.
* 2. support for nfs exports.
*/
error = EOPNOTSUPP;
goto bad;
}
lowerrootvp = mp->mnt_vnodecovered;
vref(lowerrootvp);
/*
* Find upper node.
*/
error = namei_simple_user(args->target,
NSM_FOLLOW_NOEMULROOT, &upperrootvp);
if (error != 0)
goto bad;
if (upperrootvp->v_type != VDIR) {
error = EINVAL;
goto bad;
}
um = kmem_zalloc(sizeof(*um), KM_SLEEP);
/*
* Keep a held reference to the target vnodes.
* They are vrele'd in union_unmount.
*
* Depending on the _BELOW flag, the filesystems are
* viewed in a different order. In effect, this is the
* same as providing a mount under option to the mount syscall.
*/
um->um_op = args->mntflags & UNMNT_OPMASK;
switch (um->um_op) {
case UNMNT_ABOVE:
um->um_lowervp = lowerrootvp;
um->um_uppervp = upperrootvp;
break;
case UNMNT_BELOW:
um->um_lowervp = upperrootvp;
um->um_uppervp = lowerrootvp;
break;
case UNMNT_REPLACE:
vrele(lowerrootvp);
lowerrootvp = NULLVP;
um->um_uppervp = upperrootvp;
um->um_lowervp = lowerrootvp;
break;
default:
error = EINVAL;
goto bad;
}
/*
* This mount is mp-safe if both lower mounts are mp-safe.
*/
if (((um->um_lowervp == NULLVP) ||
(um->um_lowervp->v_mount->mnt_iflag & IMNT_MPSAFE)) &&
(um->um_uppervp->v_mount->mnt_iflag & IMNT_MPSAFE))
mp->mnt_iflag |= IMNT_MPSAFE;
/*
* Unless the mount is readonly, ensure that the top layer
* supports whiteout operations
*/
if ((mp->mnt_flag & MNT_RDONLY) == 0) {
static struct componentname nullcn = {
.cn_nameiop = LOOKUP,
.cn_cred = NOCRED
};
vn_lock(um->um_uppervp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_WHITEOUT(um->um_uppervp, &nullcn, LOOKUP);
VOP_UNLOCK(um->um_uppervp);
if (error)
goto bad;
}
um->um_cred = l->l_cred;
kauth_cred_hold(um->um_cred);
um->um_cmode = UN_DIRMODE &~ l->l_proc->p_cwdi->cwdi_cmask;
/*
* Depending on what you think the MNT_LOCAL flag might mean,
* you may want the && to be || on the conditional below.
* At the moment it has been defined that the filesystem is
* only local if it is all local, i.e. the MNT_LOCAL flag implies
* that the entire namespace is local. If you think the MNT_LOCAL
* flag implies that some of the files might be stored locally
* then you will want to change the conditional.
*/
if (um->um_op == UNMNT_ABOVE) {
if (((um->um_lowervp == NULLVP) ||
(um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) &&
(um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL))
mp->mnt_flag |= MNT_LOCAL;
}
/*
* Copy in the upper layer's RDONLY flag. This is for the benefit
* of lookup() which explicitly checks the flag, rather than asking
* the filesystem for its own opinion. This means that an update
* mount of the underlying filesystem from rdonly to rdwr
* will leave the unioned view read-only.
*/
mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY);
mp->mnt_data = um;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, NULL, UIO_USERSPACE,
mp->mnt_op->vfs_name, mp, l);
if (error)
goto bad;
error = vfs_set_lowermount(mp, um->um_uppervp->v_mount);
if (error)
goto bad;
switch (um->um_op) {
case UNMNT_ABOVE:
cp = "<above>:";
break;
case UNMNT_BELOW:
cp = "<below>:";
break;
case UNMNT_REPLACE:
cp = "";
break;
default:
cp = "<invalid>:";
#ifdef DIAGNOSTIC
panic("%s: bad um_op", __func__);
#endif
break;
}
len = strlen(cp);
memcpy(mp->mnt_stat.f_mntfromname, cp, len);
xp = mp->mnt_stat.f_mntfromname + len;
len = MNAMELEN - len;
(void) copyinstr(args->target, xp, len - 1, &size);
memset(xp + size, 0, len - size);
#ifdef UNION_DIAGNOSTIC
printf("%s: from %s, on %s\n", __func__,
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
/* Set up the readdir hook if it's not set already. */
if (!vn_union_readdir_hook)
vn_union_readdir_hook = union_readdirhook;
return 0;
bad:
if (um) {
if (um->um_cred)
kauth_cred_free(um->um_cred);
kmem_free(um, sizeof(*um));
}
if (upperrootvp)
vrele(upperrootvp);
if (lowerrootvp)
vrele(lowerrootvp);
return error;
}
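/*
 * Illustrative sketch (not compiled): the mount arguments a userland
 * caller would pass in.  The paths are made up for the example.
 */
#if 0
struct union_args args = {
.target = "/new/upper/layer",	/* hypothetical upper directory */
.mntflags = UNMNT_ABOVE,	/* or UNMNT_BELOW / UNMNT_REPLACE */
};
/* mount(MOUNT_UNION, "/mnt/union", 0, &args, sizeof(args)); */
#endif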
/*
* VFS start. Nothing needed here - the start routine
* on the underlying filesystem(s) will have been called
* when that filesystem was mounted.
*/
/*ARGSUSED*/
int
union_start(struct mount *mp, int flags)
{
return 0;
}
/*
* Free reference to union layer
*/
static bool
union_unmount_selector(void *cl, struct vnode *vp)
{
int *count = cl;
KASSERT(mutex_owned(vp->v_interlock));
*count += 1;
return false;
}
int
union_unmount(struct mount *mp, int mntflags)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
int freeing;
int error;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p)\n", __func__, mp);
#endif
/*
* Keep flushing vnodes from the mount list.
* This is needed because of the un_pvp held
* reference to the parent vnode.
* If more vnodes have been freed on a given pass,
* then try again. The loop will iterate at most
* (d) times, where (d) is the maximum tree depth
* in the filesystem.
*/
for (freeing = 0; (error = vflush(mp, NULL, 0)) != 0;) {
struct vnode_iterator *marker;
int n;
/* count #vnodes held on mount list */
n = 0;
vfs_vnode_iterator_init(mp, &marker);
vfs_vnode_iterator_next(marker, union_unmount_selector, &n);
vfs_vnode_iterator_destroy(marker);
/* if this is unchanged then stop */
if (n == freeing)
break;
/* otherwise try one more time */
freeing = n;
}
/*
* Ok, now that we've tried doing it gently, get out the hammer.
*/
if (mntflags & MNT_FORCE)
error = vflush(mp, NULL, FORCECLOSE);
if (error)
return error;
/*
* Discard references to upper and lower target vnodes.
*/
if (um->um_lowervp)
vrele(um->um_lowervp);
vrele(um->um_uppervp);
kauth_cred_free(um->um_cred);
/*
* Finally, throw away the union_mount structure
*/
kmem_free(um, sizeof(*um));
mp->mnt_data = NULL;
return 0;
}
int
union_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
int error;
/*
* Return locked reference to root.
*/
vref(um->um_uppervp);
if (um->um_lowervp)
vref(um->um_lowervp);
error = union_allocvp(vpp, mp, NULL, NULL, NULL,
um->um_uppervp, um->um_lowervp, 1);
if (error) {
vrele(um->um_uppervp);
if (um->um_lowervp)
vrele(um->um_lowervp);
return error;
}
vn_lock(*vpp, lktype | LK_RETRY);
return 0;
}
int
union_statvfs(struct mount *mp, struct statvfs *sbp)
{
int error;
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
struct statvfs *sbuf = kmem_zalloc(sizeof(*sbuf), KM_SLEEP);
unsigned long lbsize;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p, lvp = %p, uvp = %p)\n", __func__, mp,
um->um_lowervp, um->um_uppervp);
#endif
if (um->um_lowervp) {
error = VFS_STATVFS(um->um_lowervp->v_mount, sbuf);
if (error)
goto done;
}
/* now copy across the "interesting" information and fake the rest */
lbsize = sbuf->f_bsize;
sbp->f_blocks = sbuf->f_blocks - sbuf->f_bfree;
sbp->f_files = sbuf->f_files - sbuf->f_ffree;
error = VFS_STATVFS(um->um_uppervp->v_mount, sbuf);
if (error)
goto done;
sbp->f_flag = sbuf->f_flag;
sbp->f_bsize = sbuf->f_bsize;
sbp->f_frsize = sbuf->f_frsize;
sbp->f_iosize = sbuf->f_iosize;
/*
* The "total" fields count total resources in all layers,
* the "free" fields count only those resources which are
* free in the upper layer (since only the upper layer
* is writable).
*/
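/*
 * Worked example (illustrative numbers): if the lower layer reports
 * f_bsize 512 with 1000 blocks in use while the upper layer uses
 * f_bsize 4096, the 1000 lower-sized blocks are rescaled below to
 * 1000 * 512 / 4096 = 125 upper-sized blocks before the upper layer's
 * own totals are added in.
 */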
if (sbuf->f_bsize != lbsize)
sbp->f_blocks = sbp->f_blocks * lbsize / sbuf->f_bsize;
sbp->f_blocks += sbuf->f_blocks;
sbp->f_bfree = sbuf->f_bfree;
sbp->f_bavail = sbuf->f_bavail;
sbp->f_bresvd = sbuf->f_bresvd;
sbp->f_files += sbuf->f_files;
sbp->f_ffree = sbuf->f_ffree;
sbp->f_favail = sbuf->f_favail;
sbp->f_fresvd = sbuf->f_fresvd;
copy_statvfs_info(sbp, mp);
done:
kmem_free(sbuf, sizeof(*sbuf));
return error;
}
/*ARGSUSED*/
int
union_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
{
/*
* XXX - Assumes no data cached at union layer.
*/
return 0;
}
/*ARGSUSED*/
int
union_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
return EOPNOTSUPP;
}
static int
union_renamelock_enter(struct mount *mp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
/* Lock just the upper fs, where the action happens. */
return VFS_RENAMELOCK_ENTER(um->um_uppervp->v_mount);
}
static void
union_renamelock_exit(struct mount *mp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
VFS_RENAMELOCK_EXIT(um->um_uppervp->v_mount);
}
extern const struct vnodeopv_desc union_vnodeop_opv_desc;
const struct vnodeopv_desc * const union_vnodeopv_descs[] = {
&union_vnodeop_opv_desc,
NULL,
};
struct vfsops union_vfsops = {
.vfs_name = MOUNT_UNION,
.vfs_min_mount_data = sizeof (struct union_args),
.vfs_mount = union_mount,
.vfs_start = union_start,
.vfs_unmount = union_unmount,
.vfs_root = union_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = union_statvfs,
.vfs_sync = union_sync,
.vfs_vget = union_vget,
.vfs_loadvnode = union_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = union_init,
.vfs_reinit = union_reinit,
.vfs_done = union_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = union_renamelock_enter,
.vfs_renamelock_exit = union_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = union_vnodeopv_descs
};
SYSCTL_SETUP(unionfs_sysctl_setup, "unionfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "union",
SYSCTL_DESCR("Union file system"),
NULL, 0, NULL, 0,
CTL_VFS, 15, CTL_EOL);
/*
* XXX the "15" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "15" is the order as taken from sys/mount.h
*/
}
static int
union_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return vfs_attach(&union_vfsops);
case MODULE_CMD_FINI:
return vfs_detach(&union_vfsops);
default:
return ENOTTY;
}
}
/* $NetBSD: scsipi_base.c,v 1.189 2022/04/09 23:38:32 riastradh Exp $ */
/*-
* Copyright (c) 1998, 1999, 2000, 2002, 2003, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum; by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scsipi_base.c,v 1.189 2022/04/09 23:38:32 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_scsi.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/errno.h>
#include <sys/device.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/hash.h>
#include <sys/atomic.h>
#include <dev/scsipi/scsi_sdt.h>
#include <dev/scsipi/scsi_spc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsipi_disk.h>
#include <dev/scsipi/scsipiconf.h>
#include <dev/scsipi/scsipi_base.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_message.h>
#include <machine/param.h>
SDT_PROVIDER_DEFINE(scsi);
SDT_PROBE_DEFINE3(scsi, base, tag, get,
"struct scsipi_xfer *"/*xs*/, "uint8_t"/*tag*/, "uint8_t"/*type*/);
SDT_PROBE_DEFINE3(scsi, base, tag, put,
"struct scsipi_xfer *"/*xs*/, "uint8_t"/*tag*/, "uint8_t"/*type*/);
SDT_PROBE_DEFINE3(scsi, base, adapter, request__start,
"struct scsipi_channel *"/*chan*/,
"scsipi_adapter_req_t"/*req*/,
"void *"/*arg*/);
SDT_PROBE_DEFINE3(scsi, base, adapter, request__done,
"struct scsipi_channel *"/*chan*/,
"scsipi_adapter_req_t"/*req*/,
"void *"/*arg*/);
SDT_PROBE_DEFINE1(scsi, base, queue, batch__start,
"struct scsipi_channel *"/*chan*/);
SDT_PROBE_DEFINE2(scsi, base, queue, run,
"struct scsipi_channel *"/*chan*/,
"struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, queue, batch__done,
"struct scsipi_channel *"/*chan*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, execute, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, enqueue, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, done, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, redone, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, complete, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, restart, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, free, "struct scsipi_xfer *"/*xs*/);
static int scsipi_complete(struct scsipi_xfer *);
static void scsipi_request_sense(struct scsipi_xfer *);
static int scsipi_enqueue(struct scsipi_xfer *);
static void scsipi_run_queue(struct scsipi_channel *chan);
static void scsipi_completion_thread(void *);
static void scsipi_get_tag(struct scsipi_xfer *);
static void scsipi_put_tag(struct scsipi_xfer *);
static int scsipi_get_resource(struct scsipi_channel *);
static void scsipi_put_resource(struct scsipi_channel *);
static void scsipi_async_event_max_openings(struct scsipi_channel *,
struct scsipi_max_openings *);
static void scsipi_async_event_channel_reset(struct scsipi_channel *);
static void scsipi_channel_freeze_locked(struct scsipi_channel *, int);
static void scsipi_adapter_lock(struct scsipi_adapter *adapt);
static void scsipi_adapter_unlock(struct scsipi_adapter *adapt);
static void scsipi_update_timeouts(struct scsipi_xfer *xs);
static struct pool scsipi_xfer_pool;
int scsipi_xs_count = 0;
/*
* scsipi_init:
*
* Called when a scsibus or atapibus is attached to the system
* to initialize shared data structures.
*/
void
scsipi_init(void)
{
static int scsipi_init_done;
if (scsipi_init_done)
return;
scsipi_init_done = 1;
/* Initialize the scsipi_xfer pool. */
pool_init(&scsipi_xfer_pool, sizeof(struct scsipi_xfer), 0,
0, 0, "scxspl", NULL, IPL_BIO);
pool_prime(&scsipi_xfer_pool, 1);
scsipi_ioctl_init();
}
/*
* scsipi_channel_init:
*
* Initialize a scsipi_channel when it is attached.
*/
int
scsipi_channel_init(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
int i;
/* Initialize shared data. */
scsipi_init();
/* Initialize the queues. */
TAILQ_INIT(&chan->chan_queue);
TAILQ_INIT(&chan->chan_complete);
for (i = 0; i < SCSIPI_CHAN_PERIPH_BUCKETS; i++)
LIST_INIT(&chan->chan_periphtab[i]);
/*
* Create the asynchronous completion thread.
*/
if (kthread_create(PRI_NONE, 0, NULL, scsipi_completion_thread, chan,
&chan->chan_thread, "%s", chan->chan_name)) {
aprint_error_dev(adapt->adapt_dev, "unable to create completion thread for "
"channel %d\n", chan->chan_channel);
panic("scsipi_channel_init");
}
return 0;
}
/*
* scsipi_channel_shutdown:
*
* Shutdown a scsipi_channel.
*/
void
scsipi_channel_shutdown(struct scsipi_channel *chan)
{
mutex_enter(chan_mtx(chan));
/*
* Shut down the completion thread.
*/
chan->chan_tflags |= SCSIPI_CHANT_SHUTDOWN;
cv_broadcast(chan_cv_complete(chan));
/*
* Now wait for the thread to exit.
*/
while (chan->chan_thread != NULL)
cv_wait(chan_cv_thread(chan), chan_mtx(chan));
mutex_exit(chan_mtx(chan));
}
static uint32_t
scsipi_chan_periph_hash(uint64_t t, uint64_t l)
{
uint32_t hash;
hash = hash32_buf(&t, sizeof(t), HASH32_BUF_INIT);
hash = hash32_buf(&l, sizeof(l), hash);
return hash & SCSIPI_CHAN_PERIPH_HASHMASK;
}
/*
* scsipi_insert_periph:
*
* Insert a periph into the channel.
*/
void
scsipi_insert_periph(struct scsipi_channel *chan, struct scsipi_periph *periph)
{
uint32_t hash;
hash = scsipi_chan_periph_hash(periph->periph_target,
periph->periph_lun);
mutex_enter(chan_mtx(chan));
LIST_INSERT_HEAD(&chan->chan_periphtab[hash], periph, periph_hash);
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_remove_periph:
*
* Remove a periph from the channel.
*/
void
scsipi_remove_periph(struct scsipi_channel *chan,
struct scsipi_periph *periph)
{
LIST_REMOVE(periph, periph_hash);
}
/*
* scsipi_lookup_periph:
*
* Lookup a periph on the specified channel.
*/
static struct scsipi_periph *
scsipi_lookup_periph_internal(struct scsipi_channel *chan, int target, int lun, bool lock)
{
struct scsipi_periph *periph;
uint32_t hash;
if (target >= chan->chan_ntargets ||
lun >= chan->chan_nluns)
return NULL;
hash = scsipi_chan_periph_hash(target, lun);
if (lock)
mutex_enter(chan_mtx(chan));
LIST_FOREACH(periph, &chan->chan_periphtab[hash], periph_hash) {
if (periph->periph_target == target &&
periph->periph_lun == lun)
break;
}
if (lock)
mutex_exit(chan_mtx(chan));
return periph;
}
struct scsipi_periph *
scsipi_lookup_periph_locked(struct scsipi_channel *chan, int target, int lun)
{
return scsipi_lookup_periph_internal(chan, target, lun, false);
}
struct scsipi_periph *
scsipi_lookup_periph(struct scsipi_channel *chan, int target, int lun)
{
return scsipi_lookup_periph_internal(chan, target, lun, true);
}
/*
* scsipi_get_resource:
*
* Allocate a single xfer `resource' from the channel.
*
* NOTE: Must be called with channel lock held
*/
static int
scsipi_get_resource(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
if (chan->chan_flags & SCSIPI_CHAN_OPENINGS) {
if (chan->chan_openings > 0) {
chan->chan_openings--;
return 1;
}
return 0;
}
if (adapt->adapt_openings > 0) {
adapt->adapt_openings--;
return 1;
}
return 0;
}
/*
* scsipi_grow_resources:
*
* Attempt to grow resources for a channel. If this succeeds,
* we allocate one for our caller.
*
* NOTE: Must be called with channel lock held
*/
static inline int
scsipi_grow_resources(struct scsipi_channel *chan)
{
if (chan->chan_flags & SCSIPI_CHAN_CANGROW) {
if ((chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
mutex_exit(chan_mtx(chan));
scsipi_adapter_request(chan,
ADAPTER_REQ_GROW_RESOURCES, NULL);
mutex_enter(chan_mtx(chan));
return scsipi_get_resource(chan);
}
/*
* Otherwise, ask the channel thread to do it; it will have to
* thaw the queue.
*/
scsipi_channel_freeze_locked(chan, 1);
chan->chan_tflags |= SCSIPI_CHANT_GROWRES;
cv_broadcast(chan_cv_complete(chan));
return 0;
}
return 0;
}
/*
* scsipi_put_resource:
*
* Free a single xfer `resource' to the channel.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_put_resource(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
if (chan->chan_flags & SCSIPI_CHAN_OPENINGS)
chan->chan_openings++;
else
adapt->adapt_openings++;
}
/*
* scsipi_get_tag:
*
* Get a tag ID for the specified xfer.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_get_tag(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int bit, tag;
u_int word;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
bit = 0; /* XXX gcc */
for (word = 0; word < PERIPH_NTAGWORDS; word++) {
bit = ffs(periph->periph_freetags[word]);
if (bit != 0)
break;
}
#ifdef DIAGNOSTIC
if (word == PERIPH_NTAGWORDS) {
scsipi_printaddr(periph);
printf("no free tags\n");
panic("scsipi_get_tag");
}
#endif
bit -= 1;
periph->periph_freetags[word] &= ~(1U << bit);
tag = (word << 5) | bit;
/* XXX Should eventually disallow this completely. */
if (tag >= periph->periph_openings) {
scsipi_printaddr(periph);
printf("WARNING: tag %d greater than available openings %d\n",
tag, periph->periph_openings);
}
xs->xs_tag_id = tag;
SDT_PROBE3(scsi, base, tag, get,
xs, xs->xs_tag_id, xs->xs_tag_type);
}
/*
* scsipi_put_tag:
*
* Put the tag ID for the specified xfer back into the pool.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_put_tag(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int word, bit;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
SDT_PROBE3(scsi, base, tag, put,
xs, xs->xs_tag_id, xs->xs_tag_type);
word = xs->xs_tag_id >> 5;
bit = xs->xs_tag_id & 0x1f;
periph->periph_freetags[word] |= (1U << bit);
}
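/*
 * Worked example (illustrative numbers): tag ID 35 decodes to word
 * 35 >> 5 == 1 and bit 35 & 0x1f == 3, i.e. bit 3 of
 * periph_freetags[1]; scsipi_get_tag() performs the inverse encoding,
 * (word << 5) | bit.
 */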
/*
* scsipi_get_xs:
*
* Allocate an xfer descriptor and associate it with the
* specified peripheral. If the peripheral has no more
* available command openings, we either block waiting for
* one to become available, or fail.
*
* When this routine is called with the channel lock held
* the flags must include XS_CTL_NOSLEEP.
*/
struct scsipi_xfer *
scsipi_get_xs(struct scsipi_periph *periph, int flags)
{
struct scsipi_xfer *xs;
bool lock = (flags & XS_CTL_NOSLEEP) == 0;
SC_DEBUG(periph, SCSIPI_DB3, ("scsipi_get_xs\n"));
KASSERT(!cold);
#ifdef DIAGNOSTIC
/*
* URGENT commands can never be ASYNC.
*/
if ((flags & (XS_CTL_URGENT|XS_CTL_ASYNC)) ==
(XS_CTL_URGENT|XS_CTL_ASYNC)) {
scsipi_printaddr(periph);
printf("URGENT and ASYNC\n");
panic("scsipi_get_xs");
}
#endif
/*
* Wait for a command opening to become available. Rules:
*
* - All xfers must wait for an available opening.
* Exception: URGENT xfers can proceed when
* active == openings, because we use the opening
* of the command we're recovering for.
* - If the periph has sense pending, only URGENT & REQSENSE
* xfers may proceed.
*
* - If the periph is recovering, only URGENT xfers may
* proceed.
*
* - If the periph is currently executing a recovery
* command, URGENT commands must block, because only
* one recovery command can execute at a time.
*/
if (lock)
mutex_enter(chan_mtx(periph->periph_channel));
for (;;) {
if (flags & XS_CTL_URGENT) {
if (periph->periph_active > periph->periph_openings)
goto wait_for_opening;
if (periph->periph_flags & PERIPH_SENSE) {
if ((flags & XS_CTL_REQSENSE) == 0)
goto wait_for_opening;
} else {
if ((periph->periph_flags &
PERIPH_RECOVERY_ACTIVE) != 0)
goto wait_for_opening;
periph->periph_flags |= PERIPH_RECOVERY_ACTIVE;
}
break;
}
if (periph->periph_active >= periph->periph_openings ||
(periph->periph_flags & PERIPH_RECOVERING) != 0)
goto wait_for_opening;
periph->periph_active++;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
break;
wait_for_opening:
if (flags & XS_CTL_NOSLEEP) {
KASSERT(!lock);
return NULL;
}
KASSERT(lock);
SC_DEBUG(periph, SCSIPI_DB3, ("sleeping\n"));
periph->periph_flags |= PERIPH_WAITING;
cv_wait(periph_cv_periph(periph),
chan_mtx(periph->periph_channel));
}
if (lock)
mutex_exit(chan_mtx(periph->periph_channel));
SC_DEBUG(periph, SCSIPI_DB3, ("calling pool_get\n"));
xs = pool_get(&scsipi_xfer_pool,
((flags & XS_CTL_NOSLEEP) != 0 ? PR_NOWAIT : PR_WAITOK));
if (xs == NULL) {
if (lock)
mutex_enter(chan_mtx(periph->periph_channel));
if (flags & XS_CTL_URGENT) {
if ((flags & XS_CTL_REQSENSE) == 0)
periph->periph_flags &= ~PERIPH_RECOVERY_ACTIVE;
} else
periph->periph_active--;
if (lock)
mutex_exit(chan_mtx(periph->periph_channel));
scsipi_printaddr(periph);
printf("unable to allocate %sscsipi_xfer\n",
(flags & XS_CTL_URGENT) ? "URGENT " : "");
}
SC_DEBUG(periph, SCSIPI_DB3, ("returning\n"));
if (xs != NULL) {
memset(xs, 0, sizeof(*xs));
callout_init(&xs->xs_callout, 0);
xs->xs_periph = periph;
xs->xs_control = flags;
xs->xs_status = 0;
if ((flags & XS_CTL_NOSLEEP) == 0)
mutex_enter(chan_mtx(periph->periph_channel));
TAILQ_INSERT_TAIL(&periph->periph_xferq, xs, device_q);
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
if ((flags & XS_CTL_NOSLEEP) == 0)
mutex_exit(chan_mtx(periph->periph_channel));
}
return xs;
}
/*
* scsipi_put_xs:
*
* Release an xfer descriptor, decreasing the outstanding command
* count for the peripheral. If there is a thread waiting for
* an opening, wake it up. If not, kick any queued I/O the
* peripheral may have.
*
* NOTE: Must be called with channel lock held
*/
void
scsipi_put_xs(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int flags = xs->xs_control;
SDT_PROBE1(scsi, base, xfer, free, xs);
SC_DEBUG(periph, SCSIPI_DB3, ("scsipi_free_xs\n"));
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
TAILQ_REMOVE(&periph->periph_xferq, xs, device_q);
callout_destroy(&xs->xs_callout);
pool_put(&scsipi_xfer_pool, xs);
#ifdef DIAGNOSTIC
if ((periph->periph_flags & PERIPH_RECOVERY_ACTIVE) != 0 &&
periph->periph_active == 0) {
scsipi_printaddr(periph);
printf("recovery without a command to recovery for\n");
panic("scsipi_put_xs");
}
#endif
if (flags & XS_CTL_URGENT) {
if ((flags & XS_CTL_REQSENSE) == 0)
periph->periph_flags &= ~PERIPH_RECOVERY_ACTIVE;
} else
periph->periph_active--;
if (periph->periph_active == 0 &&
(periph->periph_flags & PERIPH_WAITDRAIN) != 0) {
periph->periph_flags &= ~PERIPH_WAITDRAIN;
cv_broadcast(periph_cv_active(periph));
}
if (periph->periph_flags & PERIPH_WAITING) {
periph->periph_flags &= ~PERIPH_WAITING;
cv_broadcast(periph_cv_periph(periph));
} else {
if (periph->periph_switch->psw_start != NULL &&
device_is_active(periph->periph_dev)) {
SC_DEBUG(periph, SCSIPI_DB2,
("calling private start()\n"));
(*periph->periph_switch->psw_start)(periph);
}
}
}
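#if 0
/*
 * Illustrative sketch only (not compiled): how the locking rules of
 * scsipi_get_xs()/scsipi_put_xs() above combine for a caller that already
 * holds the channel lock.  Such a caller must pass XS_CTL_NOSLEEP and be
 * prepared for a NULL return; scsipi_put_xs() likewise expects the channel
 * lock to be held.  The function name is hypothetical.
 */
static void
example_xs_cycle(struct scsipi_periph *periph)
{
	struct scsipi_xfer *xs;

	mutex_enter(chan_mtx(periph->periph_channel));
	xs = scsipi_get_xs(periph, XS_CTL_NOSLEEP);
	if (xs != NULL) {
		/* ... fill in xs->cmd, xs->data, xs->timeout, ... */
		scsipi_put_xs(xs);	/* channel lock is still held here */
	}
	mutex_exit(chan_mtx(periph->periph_channel));
}
#endif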
/*
* scsipi_channel_freeze:
*
* Freeze a channel's xfer queue.
*/
void
scsipi_channel_freeze(struct scsipi_channel *chan, int count)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
chan->chan_qfreeze += count;
if (lock)
mutex_exit(chan_mtx(chan));
}
static void
scsipi_channel_freeze_locked(struct scsipi_channel *chan, int count)
{
chan->chan_qfreeze += count;
}
/*
* scsipi_channel_thaw:
*
* Thaw a channel's xfer queue.
*/
void
scsipi_channel_thaw(struct scsipi_channel *chan, int count)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
chan->chan_qfreeze -= count;
/*
* Don't let the freeze count go negative.
*
* Presumably the adapter driver could keep track of this,
* but it might just be easier to do this here so as to allow
* multiple callers, including those outside the adapter driver.
*/
if (chan->chan_qfreeze < 0) {
chan->chan_qfreeze = 0;
}
if (lock)
mutex_exit(chan_mtx(chan));
/*
* Don't kick the queue until the channel is actually running.
*/
if (!lock)
return;
/*
* Kick the channel's queue here. Note, we may be running in
* interrupt context (softclock or HBA's interrupt), so the adapter
* driver had better not sleep.
*/
if (chan->chan_qfreeze == 0)
scsipi_run_queue(chan);
}
/*
* scsipi_channel_timed_thaw:
*
* Thaw a channel after some time has expired. This will also
* run the channel's queue if the freeze count has reached 0.
*/
void
scsipi_channel_timed_thaw(void *arg)
{
struct scsipi_channel *chan = arg;
scsipi_channel_thaw(chan, 1);
}
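#if 0
/*
 * Illustrative sketch only (not compiled): the freeze/timed-thaw pattern
 * the routines above support.  An adapter hitting a transient shortage can
 * freeze its channel and let scsipi_channel_timed_thaw() undo the freeze
 * (and kick the queue) one second later.  "example_callout" and
 * "example_backoff" are hypothetical names.
 */
static struct callout example_callout;

static void
example_backoff(struct scsipi_channel *chan)
{
	scsipi_channel_freeze(chan, 1);
	callout_reset(&example_callout, hz, scsipi_channel_timed_thaw, chan);
}
#endif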
/*
* scsipi_periph_freeze:
*
* Freeze a device's xfer queue.
*/
void
scsipi_periph_freeze_locked(struct scsipi_periph *periph, int count)
{
periph->periph_qfreeze += count;
}
/*
* scsipi_periph_thaw:
*
* Thaw a device's xfer queue.
*/
void
scsipi_periph_thaw_locked(struct scsipi_periph *periph, int count)
{
periph->periph_qfreeze -= count;
#ifdef DIAGNOSTIC
if (periph->periph_qfreeze < 0) {
static const char pc[] = "periph freeze count < 0";
scsipi_printaddr(periph);
printf("%s\n", pc);
panic(pc);
}
#endif
if (periph->periph_qfreeze == 0 &&
(periph->periph_flags & PERIPH_WAITING) != 0)
cv_broadcast(periph_cv_periph(periph));
}
void
scsipi_periph_freeze(struct scsipi_periph *periph, int count)
{
mutex_enter(chan_mtx(periph->periph_channel));
scsipi_periph_freeze_locked(periph, count);
mutex_exit(chan_mtx(periph->periph_channel));
}
void
scsipi_periph_thaw(struct scsipi_periph *periph, int count)
{
mutex_enter(chan_mtx(periph->periph_channel));
scsipi_periph_thaw_locked(periph, count);
mutex_exit(chan_mtx(periph->periph_channel));
}
/*
* scsipi_periph_timed_thaw:
*
* Thaw a device after some time has expired.
*/
void
scsipi_periph_timed_thaw(void *arg)
{
struct scsipi_periph *periph = arg;
struct scsipi_channel *chan = periph->periph_channel;
callout_stop(&periph->periph_callout);
mutex_enter(chan_mtx(chan));
scsipi_periph_thaw_locked(periph, 1);
if ((periph->periph_channel->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/*
* Kick the channel's queue here. Note, we're running in
* interrupt context (softclock), so the adapter driver
* had better not sleep.
*/
mutex_exit(chan_mtx(chan));
scsipi_run_queue(periph->periph_channel);
} else {
/*
* Tell the completion thread to kick the channel's queue here.
*/
periph->periph_channel->chan_tflags |= SCSIPI_CHANT_KICK;
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
}
}
/*
* scsipi_wait_drain:
*
* Wait for a periph's pending xfers to drain.
*/
void
scsipi_wait_drain(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
mutex_enter(chan_mtx(chan));
while (periph->periph_active != 0) {
periph->periph_flags |= PERIPH_WAITDRAIN;
cv_wait(periph_cv_active(periph), chan_mtx(chan));
}
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_kill_pending:
*
* Kill off all pending xfers for a periph.
*
* NOTE: Must be called with channel lock held
*/
void
scsipi_kill_pending(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
(*chan->chan_bustype->bustype_kill_pending)(periph);
while (periph->periph_active != 0) {
periph->periph_flags |= PERIPH_WAITDRAIN;
cv_wait(periph_cv_active(periph), chan_mtx(chan));
}
}
/*
* scsipi_print_cdb:
* prints a command descriptor block (for debugging purposes, error
* messages, SCSIVERBOSE, ...)
*/
void
scsipi_print_cdb(struct scsipi_generic *cmd)
{
int i, j;
printf("0x%02x", cmd->opcode);
switch (CDB_GROUPID(cmd->opcode)) {
case CDB_GROUPID_0:
j = CDB_GROUP0;
break;
case CDB_GROUPID_1:
j = CDB_GROUP1;
break;
case CDB_GROUPID_2:
j = CDB_GROUP2;
break;
case CDB_GROUPID_3:
j = CDB_GROUP3;
break;
case CDB_GROUPID_4:
j = CDB_GROUP4;
break;
case CDB_GROUPID_5:
j = CDB_GROUP5;
break;
case CDB_GROUPID_6:
j = CDB_GROUP6;
break;
case CDB_GROUPID_7:
j = CDB_GROUP7;
break;
default:
j = 0;
}
if (j == 0)
j = sizeof (cmd->bytes);
for (i = 0; i < j-1; i++) /* already done the opcode */
printf(" %02x", cmd->bytes[i]);
}
/*
* scsipi_interpret_sense:
*
* Look at the returned sense and act on the error, determining
* the unix error number to pass back. (0 = report no error)
*
* NOTE: If we return ERESTART, we are expected to have
* thawed the device!
*
* THIS IS THE DEFAULT ERROR HANDLER FOR SCSI DEVICES.
*/
int
scsipi_interpret_sense(struct scsipi_xfer *xs)
{
struct scsi_sense_data *sense;
struct scsipi_periph *periph = xs->xs_periph;
u_int8_t key;
int error;
u_int32_t info;
static const char *error_mes[] = {
"soft error (corrected)",
"not ready", "medium error",
"non-media hardware failure", "illegal request",
"unit attention", "readonly device",
"no data found", "vendor unique",
"copy aborted", "command aborted",
"search returned equal", "volume overflow",
"verify miscompare", "unknown error key"
};
sense = &xs->sense.scsi_sense;
#ifdef SCSIPI_DEBUG
if (periph->periph_dbflags & SCSIPI_DB1) {
int count, len;
scsipi_printaddr(periph);
printf(" sense debug information:\n");
printf("\tcode 0x%x valid %d\n",
SSD_RCODE(sense->response_code),
sense->response_code & SSD_RCODE_VALID ? 1 : 0);
printf("\tseg 0x%x key 0x%x ili 0x%x eom 0x%x fmark 0x%x\n",
sense->segment,
SSD_SENSE_KEY(sense->flags),
sense->flags & SSD_ILI ? 1 : 0,
sense->flags & SSD_EOM ? 1 : 0,
sense->flags & SSD_FILEMARK ? 1 : 0);
printf("\ninfo: 0x%x 0x%x 0x%x 0x%x followed by %d "
"extra bytes\n",
sense->info[0],
sense->info[1],
sense->info[2],
sense->info[3],
sense->extra_len);
len = SSD_ADD_BYTES_LIM(sense);
printf("\textra (up to %d bytes): ", len);
for (count = 0; count < len; count++)
printf("0x%x ", sense->csi[count]);
printf("\n");
}
#endif
/*
* If the periph has its own error handler, call it first.
* If it returns a legit error value, return that, otherwise
* it wants us to continue with normal error processing.
*/
if (periph->periph_switch->psw_error != NULL) {
SC_DEBUG(periph, SCSIPI_DB2,
("calling private err_handler()\n"));
error = (*periph->periph_switch->psw_error)(xs);
if (error != EJUSTRETURN)
return error;
}
/* otherwise use the default */
switch (SSD_RCODE(sense->response_code)) {
/*
* Old SCSI-1 and SASI devices respond with
* codes other than 70.
*/
case 0x00: /* no error (command completed OK) */
return 0;
case 0x04: /* drive not ready after it was selected */
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control & XS_CTL_IGNORE_NOT_READY) != 0)
return 0;
/* XXX - display some sort of error here? */
return EIO;
case 0x20: /* invalid command */
if ((xs->xs_control &
XS_CTL_IGNORE_ILLEGAL_REQUEST) != 0)
return 0;
return EINVAL;
case 0x25: /* invalid LUN (Adaptec ACB-4000) */
return EACCES;
/*
* If it's code 70, use the extended stuff and
* interpret the key
*/
case 0x71: /* delayed error */
scsipi_printaddr(periph);
key = SSD_SENSE_KEY(sense->flags);
printf(" DEFERRED ERROR, key = 0x%x\n", key);
/* FALLTHROUGH */
case 0x70:
if ((sense->response_code & SSD_RCODE_VALID) != 0)
info = _4btol(sense->info);
else
info = 0;
key = SSD_SENSE_KEY(sense->flags);
switch (key) {
case SKEY_NO_SENSE:
case SKEY_RECOVERED_ERROR:
if (xs->resid == xs->datalen && xs->datalen) {
/*
* Why is this here?
*/
xs->resid = 0; /* not short read */
}
error = 0;
break;
case SKEY_EQUAL:
error = 0;
break;
case SKEY_NOT_READY:
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control & XS_CTL_IGNORE_NOT_READY) != 0)
return 0;
if (sense->asc == 0x3A) {
error = ENODEV; /* Medium not present */
if (xs->xs_control & XS_CTL_SILENT_NODEV)
return error;
} else
error = EIO;
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return error;
break;
case SKEY_ILLEGAL_REQUEST:
if ((xs->xs_control &
XS_CTL_IGNORE_ILLEGAL_REQUEST) != 0)
return 0;
/*
* Handle the case where a device reports
* Logical Unit Not Supported during discovery.
*/
if ((xs->xs_control & XS_CTL_DISCOVERY) != 0 &&
sense->asc == 0x25 &&
sense->ascq == 0x00)
return EINVAL;
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return EIO;
error = EINVAL;
break;
case SKEY_UNIT_ATTENTION:
if (sense->asc == 0x29 &&
sense->ascq == 0x00) {
/* device or bus reset */
return ERESTART;
}
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control &
XS_CTL_IGNORE_MEDIA_CHANGE) != 0 ||
/* XXX Should reupload any transient state. */
(periph->periph_flags &
PERIPH_REMOVABLE) == 0) {
return ERESTART;
}
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return EIO;
error = EIO;
break;
case SKEY_DATA_PROTECT:
error = EROFS;
break;
case SKEY_BLANK_CHECK:
error = 0;
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
break;
case SKEY_VOLUME_OVERFLOW:
error = ENOSPC;
break;
default:
error = EIO;
break;
}
/* Print verbose decode if appropriate and possible */
if ((key == 0) ||
((xs->xs_control & XS_CTL_SILENT) != 0) ||
(scsipi_print_sense(xs, 0) != 0))
return error;
/* Print brief(er) sense information */
scsipi_printaddr(periph);
printf("%s", error_mes[key - 1]);
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
switch (key) {
case SKEY_NOT_READY:
case SKEY_ILLEGAL_REQUEST:
case SKEY_UNIT_ATTENTION:
case SKEY_DATA_PROTECT:
break;
case SKEY_BLANK_CHECK:
printf(", requested size: %d (decimal)",
info);
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries)
printf(", retrying");
printf(", cmd 0x%x, info 0x%x",
xs->cmd->opcode, info);
break;
default:
printf(", info = %d (decimal)", info);
}
}
if (sense->extra_len != 0) {
int n;
printf(", data =");
for (n = 0; n < sense->extra_len; n++)
printf(" %02x",
sense->csi[n]);
}
printf("\n");
return error;
/*
* Some other code, just report it
*/
default:
#if defined(SCSIDEBUG) || defined(DEBUG)
{
static const char *uc = "undecodable sense error";
int i;
u_int8_t *cptr = (u_int8_t *) sense;
scsipi_printaddr(periph);
if (xs->cmd == &xs->cmdstore) {
printf("%s for opcode 0x%x, data=",
uc, xs->cmdstore.opcode);
} else {
printf("%s, data=", uc);
}
for (i = 0; i < sizeof(*sense); i++)
printf(" 0x%02x", *(cptr++) & 0xff);
printf("\n");
}
#else
scsipi_printaddr(periph);
printf("Sense Error Code 0x%x",
SSD_RCODE(sense->response_code));
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
struct scsi_sense_data_unextended *usense =
(struct scsi_sense_data_unextended *)sense;
printf(" at block no. %d (decimal)",
_3btol(usense->block));
}
printf("\n");
#endif
return EIO;
}
}
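#if 0
/*
 * Illustrative sketch only (not compiled): a peripheral driver's psw_error
 * hook, which scsipi_interpret_sense() above calls first.  Returning a real
 * errno (or 0) ends error processing; returning EJUSTRETURN falls through
 * to the default interpretation.  The handler name is hypothetical.
 */
static int
example_psw_error(struct scsipi_xfer *xs)
{
	struct scsi_sense_data *sense = &xs->sense.scsi_sense;

	if (SSD_SENSE_KEY(sense->flags) == SKEY_RECOVERED_ERROR)
		return 0;		/* handled: treat as success */
	return EJUSTRETURN;		/* defer to the default handler */
}
#endif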
/*
* scsipi_test_unit_ready:
*
* Issue a `test unit ready' request.
*/
int
scsipi_test_unit_ready(struct scsipi_periph *periph, int flags)
{
struct scsi_test_unit_ready cmd;
int retries;
/* some ATAPI drives don't support TEST UNIT READY. Sigh */
if (periph->periph_quirks & PQUIRK_NOTUR)
return 0;
if (flags & XS_CTL_DISCOVERY)
retries = 0;
else
retries = SCSIPIRETRIES;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_TEST_UNIT_READY;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
retries, 10000, NULL, flags);
}
static const struct scsipi_inquiry3_pattern {
const char vendor[8];
const char product[16];
const char revision[4];
} scsipi_inquiry3_quirk[] = {
{ "ES-6600 ", "", "" },
};
static int
scsipi_inquiry3_ok(const struct scsipi_inquiry_data *ib)
{
for (size_t i = 0; i < __arraycount(scsipi_inquiry3_quirk); i++) {
const struct scsipi_inquiry3_pattern *q =
&scsipi_inquiry3_quirk[i];
#define MATCH(field) \
(q->field[0] ? memcmp(ib->field, q->field, sizeof(ib->field)) == 0 : 1)
if (MATCH(vendor) && MATCH(product) && MATCH(revision))
return 0;
}
return 1;
}
/*
* scsipi_inquire:
*
* Ask the device about itself.
*/
int
scsipi_inquire(struct scsipi_periph *periph, struct scsipi_inquiry_data *inqbuf,
int flags)
{
struct scsipi_inquiry cmd;
int error;
int retries;
if (flags & XS_CTL_DISCOVERY)
retries = 0;
else
retries = SCSIPIRETRIES;
/*
* If we request more data than the device can provide, it SHOULD just
* return a short response. However, some devices error with an
* ILLEGAL REQUEST sense code, and yet others have even more special
* failure modes (such as the GL641USB flash adapter, which goes loony
* and sends corrupted CRCs). To work around this, and to bring our
* behavior more in line with other OSes, we do a shorter inquiry,
* covering all the SCSI-2 information, first, and then request more
* data iff the "additional length" field indicates there is more.
* - mycroft, 2003/10/16
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = INQUIRY;
cmd.length = SCSIPI_INQUIRY_LENGTH_SCSI2;
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)inqbuf, SCSIPI_INQUIRY_LENGTH_SCSI2, retries,
10000, NULL, flags | XS_CTL_DATA_IN);
if (!error &&
inqbuf->additional_length > SCSIPI_INQUIRY_LENGTH_SCSI2 - 4) {
if (scsipi_inquiry3_ok(inqbuf)) {
#if 0
printf("inquire: addlen=%d, retrying\n", inqbuf->additional_length);
#endif
cmd.length = SCSIPI_INQUIRY_LENGTH_SCSI3;
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)inqbuf, SCSIPI_INQUIRY_LENGTH_SCSI3, retries,
10000, NULL, flags | XS_CTL_DATA_IN);
#if 0
printf("inquire: error=%d\n", error);
#endif
}
}
#ifdef SCSI_OLD_NOINQUIRY
/*
* Kludge for the Adaptec ACB-4000 SCSI->MFM translator.
* This board doesn't support the INQUIRY command at all.
*/
if (error == EINVAL || error == EACCES) {
/*
* Conjure up an INQUIRY response.
*/
inqbuf->device = (error == EINVAL ?
SID_QUAL_LU_PRESENT :
SID_QUAL_LU_NOTPRESENT) | T_DIRECT;
inqbuf->dev_qual2 = 0;
inqbuf->version = 0;
inqbuf->response_format = SID_FORMAT_SCSI1;
inqbuf->additional_length = SCSIPI_INQUIRY_LENGTH_SCSI2 - 4;
inqbuf->flags1 = inqbuf->flags2 = inqbuf->flags3 = 0;
memcpy(inqbuf->vendor, "ADAPTEC ACB-4000 ", 28);
error = 0;
}
/*
* Kludge for the Emulex MT-02 SCSI->QIC translator.
* This board gives an empty response to an INQUIRY command.
*/
else if (error == 0 &&
inqbuf->device == (SID_QUAL_LU_PRESENT | T_DIRECT) &&
inqbuf->dev_qual2 == 0 &&
inqbuf->version == 0 &&
inqbuf->response_format == SID_FORMAT_SCSI1) {
/*
* Fill out the INQUIRY response.
*/
inqbuf->device = (SID_QUAL_LU_PRESENT | T_SEQUENTIAL);
inqbuf->dev_qual2 = SID_REMOVABLE;
inqbuf->additional_length = SCSIPI_INQUIRY_LENGTH_SCSI2 - 4;
inqbuf->flags1 = inqbuf->flags2 = inqbuf->flags3 = 0;
memcpy(inqbuf->vendor, "EMULEX MT-02 QIC ", 28);
}
#endif /* SCSI_OLD_NOINQUIRY */
return error;
}
/*
* scsipi_prevent:
*
* Prevent or allow the user to remove the media
*/
int
scsipi_prevent(struct scsipi_periph *periph, int type, int flags)
{
struct scsi_prevent_allow_medium_removal cmd;
if (periph->periph_quirks & PQUIRK_NODOORLOCK)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_PREVENT_ALLOW_MEDIUM_REMOVAL;
cmd.how = type;
return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, 5000, NULL, flags));
}
/*
* scsipi_start:
*
* Send a START UNIT.
*/
int
scsipi_start(struct scsipi_periph *periph, int type, int flags)
{
struct scsipi_start_stop cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = START_STOP;
cmd.byte2 = 0x00;
cmd.how = type;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, (type & SSS_START) ? 60000 : 10000, NULL, flags);
}
/*
* scsipi_mode_sense, scsipi_mode_sense_big:
* get a sense page from a device
*/
int
scsipi_mode_sense(struct scsipi_periph *periph, int byte2, int page,
struct scsi_mode_parameter_header_6 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_sense_6 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SENSE_6;
cmd.byte2 = byte2;
cmd.page = page;
cmd.length = len & 0xff;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_IN);
}
int
scsipi_mode_sense_big(struct scsipi_periph *periph, int byte2, int page,
struct scsi_mode_parameter_header_10 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_sense_10 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SENSE_10;
cmd.byte2 = byte2;
cmd.page = page;
_lto2b(len, cmd.length);
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_IN);
}
int
scsipi_mode_select(struct scsipi_periph *periph, int byte2,
struct scsi_mode_parameter_header_6 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_select_6 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SELECT_6;
cmd.byte2 = byte2;
cmd.length = len & 0xff;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_OUT);
}
int
scsipi_mode_select_big(struct scsipi_periph *periph, int byte2,
struct scsi_mode_parameter_header_10 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_select_10 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SELECT_10;
cmd.byte2 = byte2;
_lto2b(len, cmd.length);
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_OUT);
}
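#if 0
/*
 * Illustrative sketch only (not compiled): typical use of the MODE SENSE
 * helper above.  Callers declare a structure holding the 6-byte header
 * followed by room for the page itself; the page code (0x08, caching) and
 * function name here are just examples.
 */
static int
example_fetch_caching_page(struct scsipi_periph *periph)
{
	struct {
		struct scsi_mode_parameter_header_6 header;
		u_int8_t page[32];
	} buf;

	return scsipi_mode_sense(periph, 0, 0x08, &buf.header, sizeof(buf),
	    0, SCSIPIRETRIES, 10000);
}
#endif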
/*
* scsipi_get_opcodeinfo:
*
* Query the device for supported commands and their timeouts,
* building a timeout lookup table if timeout information is available.
*/
void
scsipi_get_opcodeinfo(struct scsipi_periph *periph)
{
u_int8_t *data;
int len = 16*1024;
int rc;
struct scsi_repsuppopcode cmd;
/* refrain from asking for supported opcodes */
if (periph->periph_quirks & PQUIRK_NOREPSUPPOPC ||
periph->periph_type == T_PROCESSOR || /* spec. */
periph->periph_type == T_CDROM) /* spec. */
return;
scsipi_free_opcodeinfo(periph);
/*
* query REPORT SUPPORTED OPERATION CODES
* if OK
* enumerate all codes
* if timeout exists insert maximum into opcode table
*/
data = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MAINTENANCE_IN;
cmd.svcaction = RSOC_REPORT_SUPPORTED_OPCODES;
cmd.repoption = RSOC_RCTD|RSOC_ALL;
_lto4b(len, cmd.alloclen);
rc = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, 0, 1000, NULL,
XS_CTL_DATA_IN|XS_CTL_SILENT);
if (rc == 0) {
int count;
int dlen = _4btol(data);
u_int8_t *c = data + 4;
SC_DEBUG(periph, SCSIPI_DB3,
("supported opcode timeout-values loaded\n"));
SC_DEBUG(periph, SCSIPI_DB3,
("CMD LEN SA spec nom. time cmd timeout\n"));
struct scsipi_opcodes *tot = malloc(sizeof(struct scsipi_opcodes),
M_DEVBUF, M_WAITOK|M_ZERO);
count = 0;
while (tot != NULL &&
dlen >= (int)sizeof(struct scsi_repsupopcode_all_commands_descriptor)) {
struct scsi_repsupopcode_all_commands_descriptor *acd
= (struct scsi_repsupopcode_all_commands_descriptor *)c;
#ifdef SCSIPI_DEBUG
int cdblen = _2btol((const u_int8_t *)&acd->cdblen);
#endif
dlen -= sizeof(struct scsi_repsupopcode_all_commands_descriptor);
c += sizeof(struct scsi_repsupopcode_all_commands_descriptor);
SC_DEBUG(periph, SCSIPI_DB3,
("0x%02x(%2d) ", acd->opcode, cdblen));
tot->opcode_info[acd->opcode].ti_flags = SCSIPI_TI_VALID;
if (acd->flags & RSOC_ACD_SERVACTV) {
SC_DEBUGN(periph, SCSIPI_DB3,
("0x%02x%02x ",
acd->serviceaction[0],
acd->serviceaction[1]));
} else {
SC_DEBUGN(periph, SCSIPI_DB3, (" "));
}
if (acd->flags & RSOC_ACD_CTDP
&& dlen >= (int)sizeof(struct scsi_repsupopcode_timeouts_descriptor)) {
struct scsi_repsupopcode_timeouts_descriptor *td
= (struct scsi_repsupopcode_timeouts_descriptor *)c;
long nomto = _4btol(td->nom_process_timeout);
long cmdto = _4btol(td->cmd_process_timeout);
long t = (cmdto > nomto) ? cmdto : nomto;
dlen -= sizeof(struct scsi_repsupopcode_timeouts_descriptor);
c += sizeof(struct scsi_repsupopcode_timeouts_descriptor);
SC_DEBUGN(periph, SCSIPI_DB3,
("0x%02x %10ld %10ld",
td->cmd_specific,
nomto, cmdto));
if (t > tot->opcode_info[acd->opcode].ti_timeout) {
tot->opcode_info[acd->opcode].ti_timeout = t;
++count;
}
}
SC_DEBUGN(periph, SCSIPI_DB3,("\n"));
}
if (count > 0) {
periph->periph_opcs = tot;
} else {
free(tot, M_DEVBUF);
SC_DEBUG(periph, SCSIPI_DB3,
("no usable timeout values available\n"));
}
} else {
SC_DEBUG(periph, SCSIPI_DB3,
("SCSI_MAINTENANCE_IN"
"[RSOC_REPORT_SUPPORTED_OPCODES] failed error=%d"
" - no device provided timeout "
"values available\n", rc));
}
free(data, M_DEVBUF);
}
/*
* scsipi_update_timeouts:
* Override timeout value if device/config provided
* timeouts are available.
*/
static void
scsipi_update_timeouts(struct scsipi_xfer *xs)
{
struct scsipi_opcodes *opcs;
u_int8_t cmd;
int timeout;
struct scsipi_opinfo *oi;
if (xs->timeout <= 0) {
return;
}
opcs = xs->xs_periph->periph_opcs;
if (opcs == NULL) {
return;
}
cmd = xs->cmd->opcode;
oi = &opcs->opcode_info[cmd];
timeout = 1000 * (int)oi->ti_timeout;
if (timeout > xs->timeout && timeout < 86400000) {
/*
* pick up device configured timeouts if they
* are longer than the requested ones but less
* than a day
*/
#ifdef SCSIPI_DEBUG
if ((oi->ti_flags & SCSIPI_TI_LOGGED) == 0) {
SC_DEBUG(xs->xs_periph, SCSIPI_DB3,
("Overriding command 0x%02x "
"timeout of %d with %d ms\n",
cmd, xs->timeout, timeout));
oi->ti_flags |= SCSIPI_TI_LOGGED;
}
#endif
xs->timeout = timeout;
}
}
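/*
 * Worked example of the override rule above (illustrative only): if the
 * device-reported command timeout for some opcode is 45 (seconds, per SPC),
 * ti_timeout is 45 and the computed value is 45000 ms.  A request issued
 * with a 30000 ms timeout is then raised to 45000 ms, a request issued with
 * 60000 ms keeps its own larger value, and anything of a day (86400000 ms)
 * or more is ignored.
 */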
/*
* scsipi_free_opcodeinfo:
*
* free the opcode information table
*/
void
scsipi_free_opcodeinfo(struct scsipi_periph *periph)
{
if (periph->periph_opcs != NULL) {
free(periph->periph_opcs, M_DEVBUF);
}
periph->periph_opcs = NULL;
}
/*
* scsipi_done:
*
* This routine is called by an adapter's interrupt handler when
* an xfer is completed.
*/
void
scsipi_done(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int freezecnt;
SC_DEBUG(periph, SCSIPI_DB2, ("scsipi_done\n"));
#ifdef SCSIPI_DEBUG
if (periph->periph_dbflags & SCSIPI_DB1)
show_scsipi_cmd(xs);
#endif
mutex_enter(chan_mtx(chan));
SDT_PROBE1(scsi, base, xfer, done, xs);
/*
* The resource this command was using is now free.
*/
if (xs->xs_status & XS_STS_DONE) {
/* XXX in certain circumstances, such as a device
* being detached, an xs that has already been
* scsipi_done()'d by the main thread will be done'd
* again by scsibusdetach(). Putting the xs on the
* chan_complete queue causes list corruption and
* everyone dies. This prevents that, but perhaps
* there should be better coordination somewhere such
* that this won't ever happen (and can be turned into
* a KASSERT()).
*/
SDT_PROBE1(scsi, base, xfer, redone, xs);
mutex_exit(chan_mtx(chan));
goto out;
}
scsipi_put_resource(chan);
xs->xs_periph->periph_sent--;
/*
* If the command was tagged, free the tag.
*/
if (XS_CTL_TAGTYPE(xs) != 0)
scsipi_put_tag(xs);
else
periph->periph_flags &= ~PERIPH_UNTAG;
/* Mark the command as `done'. */
xs->xs_status |= XS_STS_DONE;
#ifdef DIAGNOSTIC
if ((xs->xs_control & (XS_CTL_ASYNC|XS_CTL_POLL)) ==
(XS_CTL_ASYNC|XS_CTL_POLL))
panic("scsipi_done: ASYNC and POLL");
#endif
/*
* If the xfer had an error of any sort, freeze the
* periph's queue. Freeze it again if we were requested
* to do so in the xfer.
*/
freezecnt = 0;
if (xs->error != XS_NOERROR)
freezecnt++;
if (xs->xs_control & XS_CTL_FREEZE_PERIPH)
freezecnt++;
if (freezecnt != 0)
scsipi_periph_freeze_locked(periph, freezecnt);
/*
* record the xfer with a pending sense, in case a SCSI reset is
* received before the thread is woken up.
*/
if (xs->error == XS_BUSY && xs->status == SCSI_CHECK) {
periph->periph_flags |= PERIPH_SENSE;
periph->periph_xscheck = xs;
}
/*
* If this was an xfer that was not to complete asynchronously,
* let the requesting thread perform error checking/handling
* in its context.
*/
if ((xs->xs_control & XS_CTL_ASYNC) == 0) {
/*
* If it's a polling job, just return, to unwind the
* call graph. We don't need to restart the queue,
* because polling jobs are treated specially, and
* are really only used during crash dumps anyway
* (XXX or during boot-time autoconfiguration of
* ATAPI devices).
*/
if (xs->xs_control & XS_CTL_POLL) {
mutex_exit(chan_mtx(chan));
return;
}
cv_broadcast(xs_cv(xs));
mutex_exit(chan_mtx(chan));
goto out;
}
/*
* Catch the extremely common case of I/O completing
* without error; no use in taking a context switch
* if we can handle it in interrupt context.
*/
if (xs->error == XS_NOERROR) {
mutex_exit(chan_mtx(chan));
(void) scsipi_complete(xs);
goto out;
}
/*
* There is an error on this xfer. Put it on the channel's
* completion queue, and wake up the completion thread.
*/
TAILQ_INSERT_TAIL(&chan->chan_complete, xs, channel_q);
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
out:
/*
* If there are more xfers on the channel's queue, attempt to
* run them.
*/
scsipi_run_queue(chan);
}
/*
* scsipi_complete:
*
* Completion of a scsipi_xfer. This is the guts of scsipi_done().
*
* NOTE: This routine MUST be called with valid thread context
* except for the case where the following two conditions are
* true:
*
* xs->error == XS_NOERROR
* XS_CTL_ASYNC is set in xs->xs_control
*
* The semantics of this routine can be tricky, so here is an
* explanation:
*
* 0 Xfer completed successfully.
*
* ERESTART Xfer had an error, but was restarted.
*
* anything else Xfer had an error, return value is Unix
* errno.
*
* If the return value is anything but ERESTART:
*
* - If XS_CTL_ASYNC is set, `xs' has been freed back to
* the pool.
* - If there is a buf associated with the xfer,
* it has been biodone()'d.
*/
static int
scsipi_complete(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int error;
SDT_PROBE1(scsi, base, xfer, complete, xs);
#ifdef DIAGNOSTIC
if ((xs->xs_control & XS_CTL_ASYNC) != 0 && xs->bp == NULL)
panic("scsipi_complete: XS_CTL_ASYNC but no buf");
#endif
/*
* If command terminated with a CHECK CONDITION, we need to issue a
* REQUEST_SENSE command. Once the REQUEST_SENSE has been processed
* we'll have the real status.
* Must be processed with channel lock held to avoid missing
* a SCSI bus reset for this command.
*/
mutex_enter(chan_mtx(chan));
if (xs->error == XS_BUSY && xs->status == SCSI_CHECK) {
/* request sense for a request sense ? */
if (xs->xs_control & XS_CTL_REQSENSE) {
scsipi_printaddr(periph);
printf("request sense for a request sense ?\n");
/* XXX maybe we should reset the device ? */
/* we've been frozen because xs->error != XS_NOERROR */
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
if (xs->resid < xs->datalen) {
printf("we read %d bytes of sense anyway:\n",
xs->datalen - xs->resid);
scsipi_print_sense_data((void *)xs->data, 0);
}
return EINVAL;
}
mutex_exit(chan_mtx(chan)); // XXX allows other commands to queue or run
scsipi_request_sense(xs);
} else
mutex_exit(chan_mtx(chan));
/*
* If it's a user level request, bypass all usual completion
* processing; let the user work it out.
*/
if ((xs->xs_control & XS_CTL_USERCMD) != 0) {
SC_DEBUG(periph, SCSIPI_DB3, ("calling user done()\n"));
mutex_enter(chan_mtx(chan));
if (xs->error != XS_NOERROR)
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
scsipi_user_done(xs);
SC_DEBUG(periph, SCSIPI_DB3, ("returned from user done()\n "));
return 0;
}
switch (xs->error) {
case XS_NOERROR:
error = 0;
break;
case XS_SENSE:
case XS_SHORTSENSE:
error = (*chan->chan_bustype->bustype_interpret_sense)(xs);
break;
case XS_RESOURCE_SHORTAGE:
/*
* XXX Should freeze channel's queue.
*/
scsipi_printaddr(periph);
printf("adapter resource shortage\n");
/* FALLTHROUGH */
case XS_BUSY:
if (xs->error == XS_BUSY && xs->status == SCSI_QUEUE_FULL) {
struct scsipi_max_openings mo;
/*
* We set the openings to active - 1, assuming that
* the command that got us here is the first one that
* can't fit into the device's queue. If that's not
* the case, I guess we'll find out soon enough.
*/
mo.mo_target = periph->periph_target;
mo.mo_lun = periph->periph_lun;
if (periph->periph_active < periph->periph_openings)
mo.mo_openings = periph->periph_active - 1;
else
mo.mo_openings = periph->periph_openings - 1;
#ifdef DIAGNOSTIC
if (mo.mo_openings < 0) {
scsipi_printaddr(periph);
printf("QUEUE FULL resulted in < 0 openings\n");
panic("scsipi_done");
}
#endif
if (mo.mo_openings == 0) {
scsipi_printaddr(periph);
printf("QUEUE FULL resulted in 0 openings\n");
mo.mo_openings = 1;
}
scsipi_async_event(chan, ASYNC_EVENT_MAX_OPENINGS, &mo);
error = ERESTART;
} else if (xs->xs_retries != 0) {
xs->xs_retries--;
/*
* Wait one second, and try again.
*/
mutex_enter(chan_mtx(chan));
if ((xs->xs_control & XS_CTL_POLL) ||
(chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/* XXX: quite extreme */
kpause("xsbusy", false, hz, chan_mtx(chan));
} else if (!callout_pending(&periph->periph_callout)) {
scsipi_periph_freeze_locked(periph, 1);
callout_reset(&periph->periph_callout,
hz, scsipi_periph_timed_thaw, periph);
}
mutex_exit(chan_mtx(chan));
error = ERESTART;
} else
error = EBUSY;
break;
case XS_REQUEUE:
error = ERESTART;
break;
case XS_SELTIMEOUT:
case XS_TIMEOUT:
/*
* If the device hasn't gone away, honor retry counts.
*
* Note that if we're in the middle of probing it,
* it won't be found because it isn't here yet, so
* we won't honor the retry count in that case.
*/
if (scsipi_lookup_periph(chan, periph->periph_target,
periph->periph_lun) && xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
break;
case XS_RESET:
if (xs->xs_control & XS_CTL_REQSENSE) {
/*
* request sense interrupted by reset: signal it
* with EINTR return code.
*/
error = EINTR;
} else {
if (xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
}
break;
case XS_DRIVER_STUFFUP:
scsipi_printaddr(periph);
printf("generic HBA error\n");
error = EIO;
break;
default:
scsipi_printaddr(periph);
printf("invalid return code from adapter: %d\n", xs->error);
error = EIO;
break;
}
mutex_enter(chan_mtx(chan));
if (error == ERESTART) {
SDT_PROBE1(scsi, base, xfer, restart, xs);
/*
* If we get here, the periph has been thawed and frozen
* again if we had to issue recovery commands. Alternatively,
* it may have been frozen again and in a timed thaw. In
* any case, we thaw the periph once we re-enqueue the
* command. Once the periph is fully thawed, it will begin
* operation again.
*/
xs->error = XS_NOERROR;
xs->status = SCSI_OK;
xs->xs_status &= ~XS_STS_DONE;
xs->xs_requeuecnt++;
error = scsipi_enqueue(xs);
if (error == 0) {
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
return ERESTART;
}
}
/*
* scsipi_done() freezes the queue if not XS_NOERROR.
* Thaw it here.
*/
if (xs->error != XS_NOERROR)
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
if (periph->periph_switch->psw_done)
periph->periph_switch->psw_done(xs, error);
mutex_enter(chan_mtx(chan));
if (xs->xs_control & XS_CTL_ASYNC)
scsipi_put_xs(xs);
mutex_exit(chan_mtx(chan));
return error;
}
/*
* Issue a request sense for the given scsipi_xfer. Called when the xfer
* returns with a CHECK_CONDITION status. Must be called in valid thread
* context.
*/
static void
scsipi_request_sense(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int flags, error;
struct scsi_request_sense cmd;
periph->periph_flags |= PERIPH_SENSE;
/* if command was polling, request sense will too */
flags = xs->xs_control & XS_CTL_POLL;
/* Polling commands can't sleep */
if (flags)
flags |= XS_CTL_NOSLEEP;
flags |= XS_CTL_REQSENSE | XS_CTL_URGENT | XS_CTL_DATA_IN |
XS_CTL_THAW_PERIPH | XS_CTL_FREEZE_PERIPH;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_REQUEST_SENSE;
cmd.length = sizeof(struct scsi_sense_data);
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)&xs->sense.scsi_sense, sizeof(struct scsi_sense_data),
0, 1000, NULL, flags);
periph->periph_flags &= ~PERIPH_SENSE;
periph->periph_xscheck = NULL;
switch (error) {
case 0:
/* we have a valid sense */
xs->error = XS_SENSE;
return;
case EINTR:
/* REQUEST_SENSE interrupted by bus reset. */
xs->error = XS_RESET;
return;
case EIO:
/* request sense couldn't be performed */
/*
* XXX this isn't quite right but we don't have anything
* better for now
*/
xs->error = XS_DRIVER_STUFFUP;
return;
default:
/* Notify that request sense failed. */
xs->error = XS_DRIVER_STUFFUP;
scsipi_printaddr(periph);
printf("request sense failed with error %d\n", error);
return;
}
}
/*
* scsipi_enqueue:
*
* Enqueue an xfer on a channel.
*/
static int
scsipi_enqueue(struct scsipi_xfer *xs)
{
struct scsipi_channel *chan = xs->xs_periph->periph_channel;
struct scsipi_xfer *qxs;
SDT_PROBE1(scsi, base, xfer, enqueue, xs);
/*
* If the xfer is to be polled, and there are already jobs on
* the queue, we can't proceed.
*/
KASSERT(mutex_owned(chan_mtx(chan)));
if ((xs->xs_control & XS_CTL_POLL) != 0 &&
TAILQ_FIRST(&chan->chan_queue) != NULL) {
xs->error = XS_DRIVER_STUFFUP;
return EAGAIN;
}
/*
* If we have an URGENT xfer, it's an error recovery command
* and it should just go on the head of the channel's queue.
*/
if (xs->xs_control & XS_CTL_URGENT) {
TAILQ_INSERT_HEAD(&chan->chan_queue, xs, channel_q);
goto out;
}
/*
* If this xfer has already been on the queue before, we
* need to reinsert it in the correct order. That order is:
*
* Immediately before the first xfer for this periph
* with a requeuecnt less than xs->xs_requeuecnt.
*
* Failing that, at the end of the queue. (We'll end up
* there naturally.)
*/
if (xs->xs_requeuecnt != 0) {
for (qxs = TAILQ_FIRST(&chan->chan_queue); qxs != NULL;
qxs = TAILQ_NEXT(qxs, channel_q)) {
if (qxs->xs_periph == xs->xs_periph &&
qxs->xs_requeuecnt < xs->xs_requeuecnt)
break;
}
if (qxs != NULL) {
TAILQ_INSERT_AFTER(&chan->chan_queue, qxs, xs,
channel_q);
goto out;
}
}
TAILQ_INSERT_TAIL(&chan->chan_queue, xs, channel_q);
out:
if (xs->xs_control & XS_CTL_THAW_PERIPH)
scsipi_periph_thaw_locked(xs->xs_periph, 1);
return 0;
}
/*
* scsipi_run_queue:
*
* Start as many xfers as possible running on the channel.
*/
static void
scsipi_run_queue(struct scsipi_channel *chan)
{
struct scsipi_xfer *xs;
struct scsipi_periph *periph;
SDT_PROBE1(scsi, base, queue, batch__start, chan);
for (;;) {
mutex_enter(chan_mtx(chan));
/*
* If the channel is frozen, we can't do any work right
* now.
*/
if (chan->chan_qfreeze != 0) {
mutex_exit(chan_mtx(chan));
break;
}
/*
* Look for work to do, and make sure we can do it.
*/
for (xs = TAILQ_FIRST(&chan->chan_queue); xs != NULL;
xs = TAILQ_NEXT(xs, channel_q)) {
periph = xs->xs_periph;
if ((periph->periph_sent >= periph->periph_openings) || periph->periph_qfreeze != 0 ||
(periph->periph_flags & PERIPH_UNTAG) != 0)
continue;
if ((periph->periph_flags & (PERIPH_RECOVERING | PERIPH_SENSE)) != 0 &&
(xs->xs_control & XS_CTL_URGENT) == 0)
continue;
/*
* We can issue this xfer!
*/
goto got_one;
}
/*
* Can't find any work to do right now.
*/
mutex_exit(chan_mtx(chan));
break;
got_one:
/*
* Have an xfer to run. Allocate a resource from
* the adapter to run it. If we can't allocate that
* resource, we don't dequeue the xfer.
*/
if (scsipi_get_resource(chan) == 0) {
/*
* Adapter is out of resources. If the adapter
* supports it, attempt to grow them.
*/
if (scsipi_grow_resources(chan) == 0) {
/*
* Wasn't able to grow resources,
* nothing more we can do.
*/
if (xs->xs_control & XS_CTL_POLL) {
scsipi_printaddr(xs->xs_periph);
printf("polling command but no "
"adapter resources");
/* We'll panic shortly... */
}
mutex_exit(chan_mtx(chan));
/*
* XXX: We should be able to note that
* XXX: resources are needed here!
*/
break;
}
/*
* scsipi_grow_resources() allocated the resource
* for us.
*/
}
/*
* We have a resource to run this xfer, do it!
*/
TAILQ_REMOVE(&chan->chan_queue, xs, channel_q);
/*
* If the command is to be tagged, allocate a tag ID
* for it.
*/
if (XS_CTL_TAGTYPE(xs) != 0)
scsipi_get_tag(xs);
else
periph->periph_flags |= PERIPH_UNTAG;
periph->periph_sent++;
mutex_exit(chan_mtx(chan));
SDT_PROBE2(scsi, base, queue, run, chan, xs);
scsipi_adapter_request(chan, ADAPTER_REQ_RUN_XFER, xs);
}
SDT_PROBE1(scsi, base, queue, batch__done, chan);
}
/*
* scsipi_execute_xs:
*
* Begin execution of an xfer, waiting for it to complete, if necessary.
*/
int
scsipi_execute_xs(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int oasync, async, poll, error;
KASSERT(!cold);
scsipi_update_timeouts(xs);
(chan->chan_bustype->bustype_cmd)(xs);
xs->xs_status &= ~XS_STS_DONE;
xs->error = XS_NOERROR;
xs->resid = xs->datalen;
xs->status = SCSI_OK;
SDT_PROBE1(scsi, base, xfer, execute, xs);
#ifdef SCSIPI_DEBUG
if (xs->xs_periph->periph_dbflags & SCSIPI_DB3) {
printf("scsipi_execute_xs: ");
show_scsipi_xs(xs);
printf("\n");
}
#endif
/*
* Deal with command tagging:
*
* - If the device's current operating mode doesn't
* include tagged queueing, clear the tag mask.
*
* - If the device's current operating mode *does*
* include tagged queueing, set the tag_type in
* the xfer to the appropriate byte for the tag
* message.
*/
if ((PERIPH_XFER_MODE(periph) & PERIPH_CAP_TQING) == 0 ||
(xs->xs_control & XS_CTL_REQSENSE)) {
xs->xs_control &= ~XS_CTL_TAGMASK;
xs->xs_tag_type = 0;
} else {
/*
* If the request doesn't specify a tag, give Head
* tags to URGENT operations and Simple tags to
* everything else.
*/
if (XS_CTL_TAGTYPE(xs) == 0) {
if (xs->xs_control & XS_CTL_URGENT)
xs->xs_control |= XS_CTL_HEAD_TAG;
else
xs->xs_control |= XS_CTL_SIMPLE_TAG;
}
switch (XS_CTL_TAGTYPE(xs)) {
case XS_CTL_ORDERED_TAG:
xs->xs_tag_type = MSG_ORDERED_Q_TAG;
break;
case XS_CTL_SIMPLE_TAG:
xs->xs_tag_type = MSG_SIMPLE_Q_TAG;
break;
case XS_CTL_HEAD_TAG:
xs->xs_tag_type = MSG_HEAD_OF_Q_TAG;
break;
default:
scsipi_printaddr(periph);
printf("invalid tag mask 0x%08x\n",
XS_CTL_TAGTYPE(xs));
panic("scsipi_execute_xs");
}
}
/* If the adapter wants us to poll, poll. */
if (chan->chan_adapter->adapt_flags & SCSIPI_ADAPT_POLL_ONLY)
xs->xs_control |= XS_CTL_POLL;
/*
* If we don't yet have a completion thread, or we are to poll for
* completion, clear the ASYNC flag.
*/
oasync = (xs->xs_control & XS_CTL_ASYNC);
if (chan->chan_thread == NULL || (xs->xs_control & XS_CTL_POLL) != 0)
xs->xs_control &= ~XS_CTL_ASYNC;
async = (xs->xs_control & XS_CTL_ASYNC);
poll = (xs->xs_control & XS_CTL_POLL);
#ifdef DIAGNOSTIC
if (oasync != 0 && xs->bp == NULL)
panic("scsipi_execute_xs: XS_CTL_ASYNC but no buf");
#endif
/*
* Enqueue the transfer. If we're not polling for completion, this
* should ALWAYS return `no error'.
*/
error = scsipi_enqueue(xs);
if (error) {
if (poll == 0) {
scsipi_printaddr(periph);
printf("not polling, but enqueue failed with %d\n",
error);
panic("scsipi_execute_xs");
}
scsipi_printaddr(periph);
printf("should have flushed queue?\n");
goto free_xs;
}
mutex_exit(chan_mtx(chan));
restarted:
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
/*
* The xfer is enqueued, and possibly running. If it's to be
* completed asynchronously, just return now.
*/
if (async)
return 0;
/*
* Not an asynchronous command; wait for it to complete.
*/
while ((xs->xs_status & XS_STS_DONE) == 0) {
if (poll) {
scsipi_printaddr(periph);
printf("polling command not done\n");
panic("scsipi_execute_xs");
}
cv_wait(xs_cv(xs), chan_mtx(chan));
}
/*
* Command is complete. scsipi_done() has awakened us to perform
* the error handling.
*/
mutex_exit(chan_mtx(chan));
error = scsipi_complete(xs);
if (error == ERESTART)
goto restarted;
/*
* If it was meant to run async and we cleared async ourselves,
* don't return an error here. It has already been handled.
*/
if (oasync)
error = 0;
/*
* Command completed successfully or fatal error occurred. Fall
* into....
*/
mutex_enter(chan_mtx(chan));
free_xs:
scsipi_put_xs(xs);
mutex_exit(chan_mtx(chan));
/*
* Kick the queue, keep it running in case it stopped for some
* reason.
*/
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
return error;
}
/*
* scsipi_completion_thread:
*
* This is the completion thread. We wait for errors on
* asynchronous xfers, and perform the error handling
* function, restarting the command, if necessary.
*/
static void
scsipi_completion_thread(void *arg)
{
struct scsipi_channel *chan = arg;
struct scsipi_xfer *xs;
if (chan->chan_init_cb)
(*chan->chan_init_cb)(chan, chan->chan_init_cb_arg);
mutex_enter(chan_mtx(chan));
chan->chan_flags |= SCSIPI_CHAN_TACTIVE;
for (;;) {
xs = TAILQ_FIRST(&chan->chan_complete);
if (xs == NULL && chan->chan_tflags == 0) {
/* nothing to do; wait */
cv_wait(chan_cv_complete(chan), chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_CALLBACK) {
/* call chan_callback from thread context */
chan->chan_tflags &= ~SCSIPI_CHANT_CALLBACK;
chan->chan_callback(chan, chan->chan_callback_arg);
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_GROWRES) {
/* attempt to get more openings for this channel */
chan->chan_tflags &= ~SCSIPI_CHANT_GROWRES;
mutex_exit(chan_mtx(chan));
scsipi_adapter_request(chan,
ADAPTER_REQ_GROW_RESOURCES, NULL);
scsipi_channel_thaw(chan, 1);
if (chan->chan_tflags & SCSIPI_CHANT_GROWRES)
kpause("scsizzz", FALSE, hz/10, NULL);
mutex_enter(chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_KICK) {
/* explicitly run the queues for this channel */
chan->chan_tflags &= ~SCSIPI_CHANT_KICK;
mutex_exit(chan_mtx(chan));
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_SHUTDOWN) {
break;
}
if (xs) {
TAILQ_REMOVE(&chan->chan_complete, xs, channel_q);
mutex_exit(chan_mtx(chan));
/*
* Have an xfer with an error; process it.
*/
(void) scsipi_complete(xs);
/*
* Kick the queue; keep it running if it was stopped
* for some reason.
*/
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
}
}
chan->chan_thread = NULL;
/* In case parent is waiting for us to exit. */
cv_broadcast(chan_cv_thread(chan));
mutex_exit(chan_mtx(chan));
kthread_exit(0);
}
/*
* scsipi_thread_call_callback:
*
* request to call a callback from the completion thread
*/
int
scsipi_thread_call_callback(struct scsipi_channel *chan,
void (*callback)(struct scsipi_channel *, void *), void *arg)
{
mutex_enter(chan_mtx(chan));
if ((chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/* kernel thread doesn't exist yet */
mutex_exit(chan_mtx(chan));
return ESRCH;
}
if (chan->chan_tflags & SCSIPI_CHANT_CALLBACK) {
mutex_exit(chan_mtx(chan));
return EBUSY;
}
scsipi_channel_freeze(chan, 1);
chan->chan_callback = callback;
chan->chan_callback_arg = arg;
chan->chan_tflags |= SCSIPI_CHANT_CALLBACK;
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
return 0;
}
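#if 0
/*
 * Illustrative sketch only (not compiled): deferring blocking work to the
 * completion thread via scsipi_thread_call_callback().  The channel is
 * frozen on the caller's behalf; the callback runs in thread context and
 * is responsible for thawing it again.  The function names are
 * hypothetical.
 */
static void
example_reset_work(struct scsipi_channel *chan, void *arg)
{
	/* thread context: safe to sleep here */
	scsipi_channel_thaw(chan, 1);	/* undo the freeze taken for us */
}

static void
example_defer(struct scsipi_channel *chan)
{
	if (scsipi_thread_call_callback(chan, example_reset_work, NULL) != 0)
		printf("could not schedule completion-thread callback\n");
}
#endif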
/*
* scsipi_async_event:
*
* Handle an asynchronous event from an adapter.
*/
void
scsipi_async_event(struct scsipi_channel *chan, scsipi_async_event_t event,
void *arg)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
switch (event) {
case ASYNC_EVENT_MAX_OPENINGS:
scsipi_async_event_max_openings(chan,
(struct scsipi_max_openings *)arg);
break;
case ASYNC_EVENT_XFER_MODE:
if (chan->chan_bustype->bustype_async_event_xfer_mode) {
chan->chan_bustype->bustype_async_event_xfer_mode(
chan, arg);
}
break;
case ASYNC_EVENT_RESET:
scsipi_async_event_channel_reset(chan);
break;
}
if (lock)
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_async_event_max_openings:
*
* Update the maximum number of outstanding commands a
* device may have.
*/
static void
scsipi_async_event_max_openings(struct scsipi_channel *chan,
struct scsipi_max_openings *mo)
{
struct scsipi_periph *periph;
int minlun, maxlun;
if (mo->mo_lun == -1) {
/*
* Wildcarded; apply it to all LUNs.
*/
minlun = 0;
maxlun = chan->chan_nluns - 1;
} else
minlun = maxlun = mo->mo_lun;
/* XXX This could really suck with a large LUN space. */
for (; minlun <= maxlun; minlun++) {
periph = scsipi_lookup_periph_locked(chan, mo->mo_target, minlun);
if (periph == NULL)
continue;
if (mo->mo_openings < periph->periph_openings)
periph->periph_openings = mo->mo_openings;
else if (mo->mo_openings > periph->periph_openings &&
(periph->periph_flags & PERIPH_GROW_OPENINGS) != 0)
periph->periph_openings = mo->mo_openings;
}
}
/*
* scsipi_set_xfer_mode:
*
* Set the xfer mode for the specified I_T Nexus.
*/
void
scsipi_set_xfer_mode(struct scsipi_channel *chan, int target, int immed)
{
struct scsipi_xfer_mode xm;
struct scsipi_periph *itperiph;
int lun;
/*
* Go to the minimal xfer mode.
*/
xm.xm_target = target;
xm.xm_mode = 0;
xm.xm_period = 0; /* ignored */
xm.xm_offset = 0; /* ignored */
/*
* Find the first LUN we know about on this I_T Nexus.
*/
for (itperiph = NULL, lun = 0; lun < chan->chan_nluns; lun++) {
itperiph = scsipi_lookup_periph(chan, target, lun);
if (itperiph != NULL)
break;
}
if (itperiph != NULL) {
xm.xm_mode = itperiph->periph_cap;
/*
* Now issue the request to the adapter.
*/
scsipi_adapter_request(chan, ADAPTER_REQ_SET_XFER_MODE, &xm);
/*
* If we want this to happen immediately, issue a dummy
* command, since most adapters can't really negotiate unless
* they're executing a job.
*/
if (immed != 0) {
(void) scsipi_test_unit_ready(itperiph,
XS_CTL_DISCOVERY | XS_CTL_IGNORE_ILLEGAL_REQUEST |
XS_CTL_IGNORE_NOT_READY |
XS_CTL_IGNORE_MEDIA_CHANGE);
}
}
}
/*
* scsipi_channel_reset:
*
* handle scsi bus reset
* called with channel lock held
*/
static void
scsipi_async_event_channel_reset(struct scsipi_channel *chan)
{
struct scsipi_xfer *xs, *xs_next;
struct scsipi_periph *periph;
int target, lun;
/*
* Channel has been reset. Also mark pending REQUEST_SENSE commands
* as reset, since their sense data is no longer available. We
* can't call scsipi_done() from here, as the command has not been
* sent to the adapter yet (this would corrupt accounting).
*/
for (xs = TAILQ_FIRST(&chan->chan_queue); xs != NULL; xs = xs_next) {
xs_next = TAILQ_NEXT(xs, channel_q);
if (xs->xs_control & XS_CTL_REQSENSE) {
TAILQ_REMOVE(&chan->chan_queue, xs, channel_q);
xs->error = XS_RESET;
if ((xs->xs_control & XS_CTL_ASYNC) != 0)
TAILQ_INSERT_TAIL(&chan->chan_complete, xs,
channel_q);
}
}
cv_broadcast(chan_cv_complete(chan));
/* Catch xs with pending sense which may not have a REQSENSE xs yet */
for (target = 0; target < chan->chan_ntargets; target++) {
if (target == chan->chan_id)
continue;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, target, lun);
if (periph) {
xs = periph->periph_xscheck;
if (xs)
xs->error = XS_RESET;
}
}
}
}
/*
* scsipi_target_detach:
*
* detach all periphs associated with an I_T nexus
* must be called from valid thread context
*/
int
scsipi_target_detach(struct scsipi_channel *chan, int target, int lun,
int flags)
{
struct scsipi_periph *periph;
device_t tdev;
int ctarget, mintarget, maxtarget;
int clun, minlun, maxlun;
int error = 0;
if (target == -1) {
mintarget = 0;
maxtarget = chan->chan_ntargets;
} else {
if (target == chan->chan_id)
return EINVAL;
if (target < 0 || target >= chan->chan_ntargets)
return EINVAL;
mintarget = target;
maxtarget = target + 1;
}
if (lun == -1) {
minlun = 0;
maxlun = chan->chan_nluns;
} else {
if (lun < 0 || lun >= chan->chan_nluns)
return EINVAL;
minlun = lun;
maxlun = lun + 1;
}
/* for config_detach */
KERNEL_LOCK(1, curlwp);
mutex_enter(chan_mtx(chan));
for (ctarget = mintarget; ctarget < maxtarget; ctarget++) {
if (ctarget == chan->chan_id)
continue;
for (clun = minlun; clun < maxlun; clun++) {
periph = scsipi_lookup_periph_locked(chan, ctarget, clun);
if (periph == NULL)
continue;
tdev = periph->periph_dev;
mutex_exit(chan_mtx(chan));
error = config_detach(tdev, flags);
if (error)
goto out;
mutex_enter(chan_mtx(chan));
KASSERT(scsipi_lookup_periph_locked(chan, ctarget, clun) == NULL);
}
}
mutex_exit(chan_mtx(chan));
out:
KERNEL_UNLOCK_ONE(curlwp);
return error;
}
/*
* scsipi_adapter_addref:
*
* Add a reference to the adapter pointed to by the provided
* link, enabling the adapter if necessary.
*/
int
scsipi_adapter_addref(struct scsipi_adapter *adapt)
{
int error = 0;
if (atomic_inc_uint_nv(&adapt->adapt_refcnt) == 1
&& adapt->adapt_enable != NULL) {
scsipi_adapter_lock(adapt);
error = scsipi_adapter_enable(adapt, 1);
scsipi_adapter_unlock(adapt);
if (error)
atomic_dec_uint(&adapt->adapt_refcnt);
}
return error;
}
/*
* scsipi_adapter_delref:
*
* Delete a reference to the adapter pointed to by the provided
* link, disabling the adapter if possible.
*/
void
scsipi_adapter_delref(struct scsipi_adapter *adapt)
{
membar_release();
if (atomic_dec_uint_nv(&adapt->adapt_refcnt) == 0
&& adapt->adapt_enable != NULL) {
membar_acquire();
scsipi_adapter_lock(adapt);
(void) scsipi_adapter_enable(adapt, 0);
scsipi_adapter_unlock(adapt);
}
}
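#if 0
/*
 * Illustrative sketch only (not compiled): the reference-count discipline
 * for the two routines above.  Bracket any period during which the adapter
 * must stay enabled with addref/delref and propagate the enable error.
 * The function name is hypothetical.
 */
static int
example_use_adapter(struct scsipi_periph *periph)
{
	struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter;
	int error;

	error = scsipi_adapter_addref(adapt);
	if (error != 0)
		return error;
	/* ... issue commands while the reference is held ... */
	scsipi_adapter_delref(adapt);
	return 0;
}
#endif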
static struct scsipi_syncparam {
int ss_factor;
int ss_period; /* ns * 100 */
} scsipi_syncparams[] = {
{ 0x08, 625 }, /* FAST-160 (Ultra320) */
{ 0x09, 1250 }, /* FAST-80 (Ultra160) */
{ 0x0a, 2500 }, /* FAST-40 40MHz (Ultra2) */
{ 0x0b, 3030 }, /* FAST-40 33MHz (Ultra2) */
{ 0x0c, 5000 }, /* FAST-20 (Ultra) */
};
static const int scsipi_nsyncparams =
sizeof(scsipi_syncparams) / sizeof(scsipi_syncparams[0]);
int
scsipi_sync_period_to_factor(int period /* ns * 100 */)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (period <= scsipi_syncparams[i].ss_period)
return scsipi_syncparams[i].ss_factor;
}
return (period / 100) / 4;
}
int
scsipi_sync_factor_to_period(int factor)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (factor == scsipi_syncparams[i].ss_factor)
return scsipi_syncparams[i].ss_period;
}
return (factor * 4) * 100;
}
int
scsipi_sync_factor_to_freq(int factor)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (factor == scsipi_syncparams[i].ss_factor)
return 100000000 / scsipi_syncparams[i].ss_period;
}
return 10000000 / ((factor * 4) * 10);
}
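/*
 * Worked example of the conversions above (illustrative only): sync factor
 * 0x0a maps to a period of 2500, i.e. 25.00 ns, and hence to
 * 100000000 / 2500 = 40000 kHz (FAST-40/Ultra2).  A factor outside the
 * table, say 50, falls back to the generic formulas: period is
 * 50 * 4 * 100 = 20000 (200 ns) and frequency is 10000000 / (50 * 4 * 10)
 * = 5000 kHz.
 */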
static inline void
scsipi_adapter_lock(struct scsipi_adapter *adapt)
{
if ((adapt->adapt_flags & SCSIPI_ADAPT_MPSAFE) == 0)
KERNEL_LOCK(1, NULL);
}
static inline void
scsipi_adapter_unlock(struct scsipi_adapter *adapt)
{
if ((adapt->adapt_flags & SCSIPI_ADAPT_MPSAFE) == 0)
KERNEL_UNLOCK_ONE(NULL);
}
void
scsipi_adapter_minphys(struct scsipi_channel *chan, struct buf *bp)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
scsipi_adapter_lock(adapt);
(adapt->adapt_minphys)(bp);
scsipi_adapter_unlock(chan->chan_adapter);
}
void
scsipi_adapter_request(struct scsipi_channel *chan,
scsipi_adapter_req_t req, void *arg)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
scsipi_adapter_lock(adapt);
SDT_PROBE3(scsi, base, adapter, request__start, chan, req, arg);
(adapt->adapt_request)(chan, req, arg);
SDT_PROBE3(scsi, base, adapter, request__done, chan, req, arg);
scsipi_adapter_unlock(adapt);
}
int
scsipi_adapter_ioctl(struct scsipi_channel *chan, u_long cmd,
void *data, int flag, struct proc *p)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
int error;
if (adapt->adapt_ioctl == NULL)
return ENOTTY;
scsipi_adapter_lock(adapt);
error = (adapt->adapt_ioctl)(chan, cmd, data, flag, p);
scsipi_adapter_unlock(adapt);
return error;
}
int
scsipi_adapter_enable(struct scsipi_adapter *adapt, int enable)
{
int error;
scsipi_adapter_lock(adapt);
error = (adapt->adapt_enable)(adapt->adapt_dev, enable);
scsipi_adapter_unlock(adapt);
return error;
}
#ifdef SCSIPI_DEBUG
/*
* Given a scsipi_xfer, dump the request, in all its glory
*/
void
show_scsipi_xs(struct scsipi_xfer *xs)
{
printf("xs(%p): ", xs);
printf("xs_control(0x%08x)", xs->xs_control);
printf("xs_status(0x%08x)", xs->xs_status);
printf("periph(%p)", xs->xs_periph);
printf("retr(0x%x)", xs->xs_retries);
printf("timo(0x%x)", xs->timeout);
printf("cmd(%p)", xs->cmd);
printf("len(0x%x)", xs->cmdlen);
printf("data(%p)", xs->data);
printf("len(0x%x)", xs->datalen);
printf("res(0x%x)", xs->resid);
printf("err(0x%x)", xs->error);
printf("bp(%p)", xs->bp);
show_scsipi_cmd(xs);
}
void
show_scsipi_cmd(struct scsipi_xfer *xs)
{
u_char *b = (u_char *) xs->cmd;
int i = 0;
scsipi_printaddr(xs->xs_periph);
printf(" command: ");
if ((xs->xs_control & XS_CTL_RESET) == 0) {
while (i < xs->cmdlen) {
if (i)
printf(",");
printf("0x%x", b[i++]);
}
printf("-[%d bytes]\n", xs->datalen);
if (xs->datalen)
show_mem(xs->data, uimin(64, xs->datalen));
} else
printf("-RESET-\n");
}
void
show_mem(u_char *address, int num)
{
int x;
printf("------------------------------");
for (x = 0; x < num; x++) {
if ((x % 16) == 0)
printf("\n%03d: ", x);
printf("%02x ", *address++);
}
printf("\n------------------------------\n");
}
#endif /* SCSIPI_DEBUG */
/* $NetBSD: kern_lock.c,v 1.188 2024/01/14 11:46:05 andvar Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.188 2024/01/14 11:46:05 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_lockdebug.h"
#endif
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/syslog.h>
#include <sys/atomic.h>
#include <sys/lwp.h>
#include <sys/pserialize.h>
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
#include <sys/ksyms.h>
#endif
#include <machine/lock.h>
#include <dev/lockstat.h>
#define RETURN_ADDRESS (uintptr_t)__builtin_return_address(0)
bool kernel_lock_dodebug;
__cpu_simple_lock_t kernel_lock[CACHE_LINE_SIZE / sizeof(__cpu_simple_lock_t)]
__cacheline_aligned;
void
assert_sleepable(void)
{
const char *reason;
long pctr;
bool idle;
if (__predict_false(panicstr != NULL)) {
return;
}
LOCKDEBUG_BARRIER(kernel_lock, 1);
/*
* Avoid disabling/re-enabling preemption here since this
* routine may be called in delicate situations.
*/
do {
pctr = lwp_pctr();
idle = CURCPU_IDLE_P();
} while (__predict_false(pctr != lwp_pctr()));
reason = NULL;
if (__predict_false(idle) && !cold) {
reason = "idle";
goto panic;
}
if (__predict_false(cpu_intr_p())) {
reason = "interrupt";
goto panic;
}
if (__predict_false(cpu_softintr_p())) {
reason = "softint";
goto panic;
}
if (__predict_false(!pserialize_not_in_read_section())) {
reason = "pserialize";
goto panic;
}
return;
panic: panic("%s: %s caller=%p", __func__, reason, (void *)RETURN_ADDRESS);
}
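/*
 * Illustrative sketch (not part of the original source): a typical
 * caller asserts sleepability before doing something that may block.
 * It assumes the ASSERT_SLEEPABLE() wrapper from <sys/systm.h>, which
 * is expected to end up in assert_sleepable() above.
 */
#if 0
static void *
example_alloc_blocking(size_t len)
{
	ASSERT_SLEEPABLE();		/* must not be in interrupt/softint/pserialize context */
	return kmem_alloc(len, KM_SLEEP);	/* may sleep waiting for memory */
}
#endif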
/*
* Functions for manipulating the kernel_lock. We put them here
* so that they show up in profiles.
*/
#define _KERNEL_LOCK_ABORT(msg) \
LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg)
#ifdef LOCKDEBUG
#define _KERNEL_LOCK_ASSERT(cond) \
do { \
if (!(cond)) \
_KERNEL_LOCK_ABORT("assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else
#define _KERNEL_LOCK_ASSERT(cond) /* nothing */
#endif
static void _kernel_lock_dump(const volatile void *, lockop_printer_t);
lockops_t _kernel_lock_ops = {
.lo_name = "Kernel lock",
.lo_type = LOCKOPS_SPIN,
.lo_dump = _kernel_lock_dump,
};
#ifdef LOCKDEBUG
#ifdef DDB
#include <ddb/ddb.h>
#endif
static void
kernel_lock_trace_ipi(void *cookie)
{
printf("%s[%d %s]: hogging kernel lock\n", cpu_name(curcpu()),
curlwp->l_lid,
curlwp->l_name ? curlwp->l_name : curproc->p_comm);
#ifdef DDB
db_stacktrace();
#endif
}
#endif
/*
* Initialize the kernel lock.
*/
void
kernel_lock_init(void)
{
__cpu_simple_lock_init(kernel_lock);
kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops,
RETURN_ADDRESS);
}
CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t));
/*
* Print debugging information about the kernel lock.
*/
static void
_kernel_lock_dump(const volatile void *junk, lockop_printer_t pr)
{
struct cpu_info *ci = curcpu();
(void)junk;
pr("curcpu holds : %18d wanted by: %#018lx\n",
ci->ci_biglock_count, (long)ci->ci_biglock_wanted);
}
/*
* Acquire 'nlocks' holds on the kernel lock.
*
* Although it may not look it, this is one of the most central, intricate
* routines in the kernel, and tons of code elsewhere depends on its exact
* behaviour. If you change something in here, expect it to bite you in the
* rear.
*/
void
_kernel_lock(int nlocks)
{
struct cpu_info *ci;
LOCKSTAT_TIMER(spintime);
LOCKSTAT_FLAG(lsflag);
struct lwp *owant;
#ifdef LOCKDEBUG
static struct cpu_info *kernel_lock_holder;
u_int spins = 0;
u_int starttime = getticks();
#endif
int s;
struct lwp *l = curlwp;
_KERNEL_LOCK_ASSERT(nlocks > 0);
s = splvm();
ci = curcpu();
if (ci->ci_biglock_count != 0) {
_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
ci->ci_biglock_count += nlocks;
l->l_blcnt += nlocks;
splx(s);
return;
}
_KERNEL_LOCK_ASSERT(l->l_blcnt == 0);
LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS,
0);
if (__predict_true(__cpu_simple_lock_try(kernel_lock))) {
#ifdef LOCKDEBUG
kernel_lock_holder = curcpu();
#endif
ci->ci_biglock_count = nlocks;
l->l_blcnt = nlocks;
LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
RETURN_ADDRESS, 0);
splx(s);
return;
}
/*
* To remove the ordering constraint between adaptive mutexes
* and kernel_lock we must make it appear as if this thread is
* blocking. For non-interlocked mutex release, a store fence
* is required to ensure that the result of any mutex_exit()
* by the current LWP becomes visible on the bus before the set
* of ci->ci_biglock_wanted becomes visible.
*
* This membar_producer matches the membar_consumer in
* mutex_vector_enter.
*
* That way, if l has just released a mutex, mutex_vector_enter
* can't see this store ci->ci_biglock_wanted := l until it
* will also see the mutex_exit store mtx->mtx_owner := 0 which
* clears the has-waiters bit.
*/
membar_producer();
owant = ci->ci_biglock_wanted;
atomic_store_relaxed(&ci->ci_biglock_wanted, l);
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
l->l_ld_wanted = __builtin_return_address(0);
#endif
/*
* Spin until we acquire the lock. Once we have it, record the
* time spent with lockstat.
*/
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
do {
splx(s);
while (__SIMPLELOCK_LOCKED_P(kernel_lock)) {
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins) && start_init_exec &&
(getticks() - starttime) > 10*hz) {
ipi_msg_t msg = {
.func = kernel_lock_trace_ipi,
};
kpreempt_disable();
ipi_unicast(&msg, kernel_lock_holder);
ipi_wait(&msg);
kpreempt_enable();
_KERNEL_LOCK_ABORT("spinout");
}
#endif
SPINLOCK_BACKOFF_HOOK;
SPINLOCK_SPIN_HOOK;
}
s = splvm();
} while (!__cpu_simple_lock_try(kernel_lock));
ci->ci_biglock_count = nlocks;
l->l_blcnt = nlocks;
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
RETURN_ADDRESS, 0);
if (owant == NULL) {
LOCKSTAT_EVENT_RA(lsflag, kernel_lock,
LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS);
}
LOCKSTAT_EXIT(lsflag);
splx(s);
/*
* Now that we have kernel_lock, reset ci_biglock_wanted. This
* store must be visible on other CPUs before a mutex_exit() on
* this CPU can test the has-waiters bit.
*
* This membar_enter matches the membar_enter in
* mutex_vector_enter. (Yes, not membar_exit -- the legacy
* naming is confusing, but store-before-load usually pairs
* with store-before-load, in the extremely rare cases where it
* is used at all.)
*
* That way, mutex_vector_enter can't see this store
* ci->ci_biglock_wanted := owant until it has set the
* has-waiters bit.
*/
(void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_enter();
#endif
#ifdef LOCKDEBUG
kernel_lock_holder = curcpu();
#endif
}
/*
* Release 'nlocks' holds on the kernel lock. If 'nlocks' is zero, release
* all holds.
*/
void
_kernel_unlock(int nlocks, int *countp)
{
struct cpu_info *ci;
u_int olocks;
int s;
struct lwp *l = curlwp;
_KERNEL_LOCK_ASSERT(nlocks < 2);
olocks = l->l_blcnt;
if (olocks == 0) {
_KERNEL_LOCK_ASSERT(nlocks <= 0);
if (countp != NULL)
*countp = 0;
return;
}
_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
if (nlocks == 0)
nlocks = olocks;
else if (nlocks == -1) {
nlocks = 1;
_KERNEL_LOCK_ASSERT(olocks == 1);
}
s = splvm();
ci = curcpu();
_KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt);
if (ci->ci_biglock_count == nlocks) {
LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock,
RETURN_ADDRESS, 0);
ci->ci_biglock_count = 0;
__cpu_simple_unlock(kernel_lock);
l->l_blcnt -= nlocks;
splx(s);
if (l->l_dopreempt)
kpreempt(0);
} else {
ci->ci_biglock_count -= nlocks;
l->l_blcnt -= nlocks;
splx(s);
}
if (countp != NULL)
*countp = olocks;
}
bool
_kernel_locked_p(void)
{
return __SIMPLELOCK_LOCKED_P(kernel_lock);
}
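/*
 * Illustrative sketch (not part of the original source): the usual
 * pattern for code that needs a temporary hold on the big lock.  It
 * assumes the KERNEL_LOCK()/KERNEL_UNLOCK_ONE() wrappers from
 * <sys/systm.h>, which end up in _kernel_lock()/_kernel_unlock()
 * above.  Holds nest: a second KERNEL_LOCK() on the same CPU only
 * bumps ci_biglock_count.
 */
#if 0
static void
example_with_biglock(void)
{
	KERNEL_LOCK(1, NULL);		/* take one hold, spinning if another CPU owns it */

	/* ... call code that is not yet MP-safe ... */

	KERNEL_UNLOCK_ONE(NULL);	/* drop exactly the hold taken above */
}
#endif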
/* $NetBSD: kern_proc.c,v 1.274 2023/10/05 19:41:07 ad Exp $ */
/*-
* Copyright (c) 1999, 2006, 2007, 2008, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_proc.c,v 1.274 2023/10/05 19:41:07 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_kstack.h"
#include "opt_maxuprc.h"
#include "opt_dtrace.h"
#include "opt_compat_netbsd32.h"
#include "opt_kaslr.h"
#endif
#if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \
&& !defined(_RUMPKERNEL)
#define COMPAT_NETBSD32
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/acct.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <ufs/ufs/quota.h>
#include <sys/uio.h>
#include <sys/pool.h>
#include <sys/pset.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/ras.h>
#include <sys/filedesc.h>
#include <sys/syscall_stats.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/dtrace_bsd.h>
#include <sys/sysctl.h>
#include <sys/exec.h>
#include <sys/cpu.h>
#include <sys/compat_stub.h>
#include <sys/futex.h>
#include <sys/pserialize.h>
#include <uvm/uvm_extern.h>
/*
* Process lists.
*/
struct proclist allproc __cacheline_aligned;
struct proclist zombproc __cacheline_aligned;
kmutex_t proc_lock __cacheline_aligned;
static pserialize_t proc_psz;
/*
* pid to lwp/proc lookup is done by indexing the pid_table array.
* Since pid numbers are only allocated when an empty slot
* has been found, there is no need to search any lists ever.
* (an orphaned pgrp will lock the slot, a session will lock
* the pgrp with the same number.)
* If the table is too small it is reallocated with twice the
* previous size and the entries 'unzipped' into the two halves.
* A linked list of free entries is passed through the pt_lwp
* field of 'free' items - set odd to be an invalid ptr. Two
* additional bits are also used to indicate if the slot is
* currently occupied by a proc or lwp, and if the PID is
* hidden from certain kinds of lookups. We thus require a
* minimum alignment for proc and lwp structures (LWPs are
* at least 32-byte aligned).
*/
struct pid_table {
uintptr_t pt_slot;
struct pgrp *pt_pgrp;
pid_t pt_pid;
};
#define PT_F_FREE ((uintptr_t)__BIT(0))
#define PT_F_LWP 0 /* pseudo-flag */
#define PT_F_PROC ((uintptr_t)__BIT(1))
#define PT_F_TYPEBITS (PT_F_FREE|PT_F_PROC)
#define PT_F_ALLBITS (PT_F_FREE|PT_F_PROC)
#define PT_VALID(s) (((s) & PT_F_FREE) == 0)
#define PT_RESERVED(s) ((s) == 0)
#define PT_NEXT(s) ((u_int)(s) >> 1)
#define PT_SET_FREE(pid) (((pid) << 1) | PT_F_FREE)
#define PT_SET_LWP(l) ((uintptr_t)(l))
#define PT_SET_PROC(p) (((uintptr_t)(p)) | PT_F_PROC)
#define PT_SET_RESERVED 0
#define PT_GET_LWP(s) ((struct lwp *)((s) & ~PT_F_ALLBITS))
#define PT_GET_PROC(s) ((struct proc *)((s) & ~PT_F_ALLBITS))
#define PT_GET_TYPE(s) ((s) & PT_F_TYPEBITS)
#define PT_IS_LWP(s) (PT_GET_TYPE(s) == PT_F_LWP && (s) != 0)
#define PT_IS_PROC(s) (PT_GET_TYPE(s) == PT_F_PROC)
#define MIN_PROC_ALIGNMENT (PT_F_ALLBITS + 1)
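/*
 * Illustrative sketch (not part of the original source): how the tag
 * bits in pt_slot are used.  Because proc and lwp structures are at
 * least MIN_PROC_ALIGNMENT-aligned, the two low bits of the pointer
 * are free to carry the FREE/PROC type information.
 */
#if 0
static void
example_slot_encoding(struct lwp *l, struct proc *p)
{
	uintptr_t slot;

	slot = PT_SET_LWP(l);		/* low bits 00: slot holds an LWP */
	KASSERT(PT_IS_LWP(slot) && PT_GET_LWP(slot) == l);

	slot = PT_SET_PROC(p);		/* PT_F_PROC set: slot holds a proc */
	KASSERT(PT_IS_PROC(slot) && PT_GET_PROC(slot) == p);

	slot = PT_SET_FREE(246);	/* free slot: next free index, tagged */
	KASSERT(!PT_VALID(slot) && PT_NEXT(slot) == 246);
}
#endif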
/*
* Table of process IDs (PIDs).
*/
static struct pid_table *pid_table __read_mostly;
#define INITIAL_PID_TABLE_SIZE (1 << 5)
/* Table mask, threshold for growing and number of allocated PIDs. */
static u_int pid_tbl_mask __read_mostly;
static u_int pid_alloc_lim __read_mostly;
static u_int pid_alloc_cnt __cacheline_aligned;
/* Next free, last free and maximum PIDs. */
static u_int next_free_pt __cacheline_aligned;
static u_int last_free_pt __cacheline_aligned;
static pid_t pid_max __read_mostly;
/* Components of the first process -- never freed. */
struct session session0 = {
.s_count = 1,
.s_sid = 0,
};
struct pgrp pgrp0 = {
.pg_members = LIST_HEAD_INITIALIZER(&pgrp0.pg_members),
.pg_session = &session0,
};
filedesc_t filedesc0;
struct cwdinfo cwdi0 = {
.cwdi_cmask = CMASK,
.cwdi_refcnt = 1,
};
struct plimit limit0;
struct pstats pstat0;
struct vmspace vmspace0;
struct sigacts sigacts0;
struct proc proc0 = {
.p_lwps = LIST_HEAD_INITIALIZER(&proc0.p_lwps),
.p_sigwaiters = LIST_HEAD_INITIALIZER(&proc0.p_sigwaiters),
.p_nlwps = 1,
.p_nrlwps = 1,
.p_pgrp = &pgrp0,
.p_comm = "system",
/*
* Set P_NOCLDWAIT so that kernel threads are reparented to init(8)
* when they exit. init(8) can easily wait them out for us.
*/
.p_flag = PK_SYSTEM | PK_NOCLDWAIT,
.p_stat = SACTIVE,
.p_nice = NZERO,
.p_emul = &emul_netbsd,
.p_cwdi = &cwdi0,
.p_limit = &limit0,
.p_fd = &filedesc0,
.p_vmspace = &vmspace0,
.p_stats = &pstat0,
.p_sigacts = &sigacts0,
#ifdef PROC0_MD_INITIALIZERS
PROC0_MD_INITIALIZERS
#endif
};
kauth_cred_t cred0;
static const int nofile = NOFILE;
static const int maxuprc = MAXUPRC;
static int sysctl_doeproc(SYSCTLFN_PROTO);
static int sysctl_kern_proc_args(SYSCTLFN_PROTO);
static int sysctl_security_expose_address(SYSCTLFN_PROTO);
#ifdef KASLR
static int kern_expose_address = 0;
#else
static int kern_expose_address = 1;
#endif
/*
* The process list descriptors, used during pid allocation and
* by sysctl. No locking on this data structure is needed since
* it is completely static.
*/
const struct proclist_desc proclists[] = {
{ &allproc },
{ &zombproc },
{ NULL },
};
static struct pgrp * pg_remove(pid_t);
static void pg_delete(pid_t);
static void orphanpg(struct pgrp *);
static specificdata_domain_t proc_specificdata_domain;
static pool_cache_t proc_cache;
static kauth_listener_t proc_listener;
static void fill_proc(const struct proc *, struct proc *, bool);
static int fill_pathname(struct lwp *, pid_t, void *, size_t *);
static int fill_cwd(struct lwp *, pid_t, void *, size_t *);
static int
proc_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
switch (action) {
case KAUTH_PROCESS_CANSEE: {
enum kauth_process_req req;
req = (enum kauth_process_req)(uintptr_t)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_CANSEE_ARGS:
case KAUTH_REQ_PROCESS_CANSEE_ENTRY:
case KAUTH_REQ_PROCESS_CANSEE_OPENFILES:
case KAUTH_REQ_PROCESS_CANSEE_EPROC:
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_PROCESS_CANSEE_ENV:
if (kauth_cred_getuid(cred) !=
kauth_cred_getuid(p->p_cred) || kauth_cred_getuid(cred) !=
kauth_cred_getsvuid(p->p_cred))
break;
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_PROCESS_CANSEE_KPTR:
if (!kern_expose_address)
break;
if (kern_expose_address == 1 && !(p->p_flag & PK_KMEM))
break;
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_PROCESS_FORK: {
int lnprocs = (int)(unsigned long)arg2;
/*
* Don't allow a nonprivileged user to use the last few
* processes. The variable lnprocs is the current number of
* processes, maxproc is the limit.
*/
if (__predict_false((lnprocs >= maxproc - 5)))
break;
result = KAUTH_RESULT_ALLOW;
break;
}
case KAUTH_PROCESS_CORENAME:
case KAUTH_PROCESS_STOPFLAG:
if (proc_uidmatch(cred, p->p_cred) == 0)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return result;
}
static int
proc_ctor(void *arg __unused, void *obj, int flags __unused)
{
struct proc *p = obj;
memset(p, 0, sizeof(*p));
klist_init(&p->p_klist);
/*
* There is no need for a proc_dtor() to do a klist_fini(),
* since knote_proc_exit() ensures that p->p_klist is empty
* when a process exits.
*/
return 0;
}
static pid_t proc_alloc_pid_slot(struct proc *, uintptr_t);
/*
* Initialize global process hashing structures.
*/
void
procinit(void)
{
const struct proclist_desc *pd;
u_int i;
#define LINK_EMPTY ((PID_MAX + INITIAL_PID_TABLE_SIZE) & ~(INITIAL_PID_TABLE_SIZE - 1))
for (pd = proclists; pd->pd_list != NULL; pd++)
LIST_INIT(pd->pd_list);
mutex_init(&proc_lock, MUTEX_DEFAULT, IPL_NONE);
proc_psz = pserialize_create();
pid_table = kmem_alloc(INITIAL_PID_TABLE_SIZE
* sizeof(struct pid_table), KM_SLEEP);
pid_tbl_mask = INITIAL_PID_TABLE_SIZE - 1;
pid_max = PID_MAX;
/* Set free list running through table...
Preset 'use count' above PID_MAX so we allocate pid 1 next. */
for (i = 0; i <= pid_tbl_mask; i++) {
pid_table[i].pt_slot = PT_SET_FREE(LINK_EMPTY + i + 1);
pid_table[i].pt_pgrp = 0;
pid_table[i].pt_pid = 0;
}
/* slot 0 is just grabbed */
next_free_pt = 1;
/* Need to fix last entry. */
last_free_pt = pid_tbl_mask;
pid_table[last_free_pt].pt_slot = PT_SET_FREE(LINK_EMPTY);
/* point at which we grow table - to avoid reusing pids too often */
pid_alloc_lim = pid_tbl_mask - 1;
#undef LINK_EMPTY
/* Reserve PID 1 for init(8). */ /* XXX slightly gross */
mutex_enter(&proc_lock);
if (proc_alloc_pid_slot(&proc0, PT_SET_RESERVED) != 1)
panic("failed to reserve PID 1 for init(8)");
mutex_exit(&proc_lock);
proc_specificdata_domain = specificdata_domain_create();
KASSERT(proc_specificdata_domain != NULL);
size_t proc_alignment = coherency_unit;
if (proc_alignment < MIN_PROC_ALIGNMENT)
proc_alignment = MIN_PROC_ALIGNMENT;
proc_cache = pool_cache_init(sizeof(struct proc), proc_alignment, 0, 0,
"procpl", NULL, IPL_NONE, proc_ctor, NULL, NULL);
proc_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
proc_listener_cb, NULL);
}
void
procinit_sysctl(void)
{
static struct sysctllog *clog;
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "expose_address",
SYSCTL_DESCR("Enable exposing kernel addresses"),
sysctl_security_expose_address, 0,
&kern_expose_address, 0, CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc",
SYSCTL_DESCR("System-wide process information"),
sysctl_doeproc, 0, NULL, 0,
CTL_KERN, KERN_PROC, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc2",
SYSCTL_DESCR("Machine-independent process information"),
sysctl_doeproc, 0, NULL, 0,
CTL_KERN, KERN_PROC2, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc_args",
SYSCTL_DESCR("Process argument information"),
sysctl_kern_proc_args, 0, NULL, 0,
CTL_KERN, KERN_PROC_ARGS, CTL_EOL);
/*
"nodes" under these:
KERN_PROC_ALL
KERN_PROC_PID pid
KERN_PROC_PGRP pgrp
KERN_PROC_SESSION sess
KERN_PROC_TTY tty
KERN_PROC_UID uid
KERN_PROC_RUID uid
KERN_PROC_GID gid
KERN_PROC_RGID gid
all in all, probably not worth the effort...
*/
}
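/*
 * Illustrative sketch (not part of the original source, and not
 * kernel code): how a userland program would consume the kern.proc2
 * node created above via sysctl(3).  The MIB layout is
 * { CTL_KERN, KERN_PROC2, op, arg, elem_size, elem_count }.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>

static int
example_fetch_proc2(pid_t pid, struct kinfo_proc2 *kp)
{
	int mib[6] = { CTL_KERN, KERN_PROC2, KERN_PROC_PID, pid,
	    sizeof(*kp), 1 };
	size_t len = sizeof(*kp);

	/* Returns -1 with errno set on failure, e.g. ESRCH. */
	return sysctl(mib, 6, kp, &len, NULL, 0);
}
#endif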
/*
* Initialize process 0.
*/
void
proc0_init(void)
{
struct proc *p;
struct pgrp *pg;
struct rlimit *rlim;
rlim_t lim;
int i;
p = &proc0;
pg = &pgrp0;
mutex_init(&p->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
p->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
rw_init(&p->p_reflock);
cv_init(&p->p_waitcv, "wait");
cv_init(&p->p_lwpcv, "lwpwait");
LIST_INSERT_HEAD(&p->p_lwps, &lwp0, l_sibling);
KASSERT(lwp0.l_lid == 0);
pid_table[lwp0.l_lid].pt_slot = PT_SET_LWP(&lwp0);
LIST_INSERT_HEAD(&allproc, p, p_list);
pid_table[lwp0.l_lid].pt_pgrp = pg;
LIST_INSERT_HEAD(&pg->pg_members, p, p_pglist);
#ifdef __HAVE_SYSCALL_INTERN
(*p->p_emul->e_syscall_intern)(p);
#endif
/* Create credentials. */
cred0 = kauth_cred_alloc();
p->p_cred = cred0;
/* Create the CWD info. */
rw_init(&cwdi0.cwdi_lock);
/* Create the limits structures. */
mutex_init(&limit0.pl_lock, MUTEX_DEFAULT, IPL_NONE);
rlim = limit0.pl_rlimit;
for (i = 0; i < __arraycount(limit0.pl_rlimit); i++) {
rlim[i].rlim_cur = RLIM_INFINITY;
rlim[i].rlim_max = RLIM_INFINITY;
}
rlim[RLIMIT_NOFILE].rlim_max = maxfiles;
rlim[RLIMIT_NOFILE].rlim_cur = maxfiles < nofile ? maxfiles : nofile;
rlim[RLIMIT_NPROC].rlim_max = maxproc;
rlim[RLIMIT_NPROC].rlim_cur = maxproc < maxuprc ? maxproc : maxuprc;
lim = MIN(VM_MAXUSER_ADDRESS, ctob((rlim_t)uvm_availmem(false)));
rlim[RLIMIT_RSS].rlim_max = lim;
rlim[RLIMIT_MEMLOCK].rlim_max = lim;
rlim[RLIMIT_MEMLOCK].rlim_cur = lim / 3;
rlim[RLIMIT_NTHR].rlim_max = maxlwp;
rlim[RLIMIT_NTHR].rlim_cur = maxlwp / 2;
/* Note that default core name has zero length. */
limit0.pl_corename = defcorename;
limit0.pl_cnlen = 0;
limit0.pl_refcnt = 1;
limit0.pl_writeable = false;
limit0.pl_sv_limit = NULL;
/* Configure virtual memory system, set vm rlimits. */
uvm_init_limits(p);
/* Initialize file descriptor table for proc0. */
fd_init(&filedesc0);
/*
* Initialize proc0's vmspace, which uses the kernel pmap.
* All kernel processes (which never have user space mappings)
* share proc0's vmspace, and thus, the kernel pmap.
*/
uvmspace_init(&vmspace0, pmap_kernel(), round_page(VM_MIN_ADDRESS),
trunc_page(VM_MAXUSER_ADDRESS),
#ifdef __USE_TOPDOWN_VM
true
#else
false
#endif
);
/* Initialize signal state for proc0. XXX IPL_SCHED */
mutex_init(&p->p_sigacts->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
siginit(p);
proc_initspecific(p);
kdtrace_proc_ctor(NULL, p);
}
/*
* Session reference counting.
*/
void
proc_sesshold(struct session *ss)
{
KASSERT(mutex_owned(&proc_lock));
ss->s_count++;
}
void
proc_sessrele(struct session *ss)
{
struct pgrp *pg;
KASSERT(mutex_owned(&proc_lock));
KASSERT(ss->s_count > 0);
/*
* We keep the pgrp with the same id as the session in order to
* stop a process being given the same pid. Since the pgrp holds
* a reference to the session, it must be a 'zombie' pgrp by now.
*/
if (--ss->s_count == 0) {
pg = pg_remove(ss->s_sid);
} else {
pg = NULL;
ss = NULL;
}
mutex_exit(&proc_lock);
if (pg)
kmem_free(pg, sizeof(struct pgrp));
if (ss)
kmem_free(ss, sizeof(struct session));
}
/*
* Check that the specified process group is in the session of the
* specified process.
* Treats -ve ids as process ids.
* Used to validate TIOCSPGRP requests.
*/
int
pgid_in_session(struct proc *p, pid_t pg_id)
{
struct pgrp *pgrp;
struct session *session;
int error;
if (pg_id == INT_MIN)
return EINVAL;
mutex_enter(&proc_lock);
if (pg_id < 0) {
struct proc *p1 = proc_find(-pg_id);
if (p1 == NULL) {
error = EINVAL;
goto fail;
}
pgrp = p1->p_pgrp;
} else {
pgrp = pgrp_find(pg_id);
if (pgrp == NULL) {
error = EINVAL;
goto fail;
}
}
session = pgrp->pg_session;
error = (session != p->p_pgrp->pg_session) ? EPERM : 0;
fail:
mutex_exit(&proc_lock);
return error;
}
/*
* p_inferior: is p an inferior of q?
*/
static inline bool
p_inferior(struct proc *p, struct proc *q)
{
KASSERT(mutex_owned(&proc_lock));
for (; p != q; p = p->p_pptr)
if (p->p_pid == 0)
return false;
return true;
}
/*
* proc_find_lwp: locate an lwp in said proc by the ID.
*
* => Must be called with p::p_lock held.
* => LSIDL lwps are not returned because they are only partially
* constructed while occupying the slot.
* => Callers need to be careful about lwp::l_stat of the returned
* lwp.
*/
struct lwp *
proc_find_lwp(proc_t *p, pid_t pid)
{
struct pid_table *pt;
unsigned pt_mask;
struct lwp *l = NULL;
uintptr_t slot;
int s;
KASSERT(mutex_owned(p->p_lock));
/*
* Look in the pid_table. This is done unlocked inside a
* pserialize read section covering pid_table's memory
* allocation only, so take care to read things in the correct
* order:
*
* 1. First read the table mask -- this only ever increases, in
* expand_pid_table, so a stale value is safely
* conservative.
*
* 2. Next read the pid table -- this is always set _before_
* the mask increases, so if we see a new table and stale
* mask, the mask is still valid for the table.
*/
s = pserialize_read_enter();
pt_mask = atomic_load_acquire(&pid_tbl_mask);
pt = &atomic_load_consume(&pid_table)[pid & pt_mask];
slot = atomic_load_consume(&pt->pt_slot);
if (__predict_false(!PT_IS_LWP(slot))) {
pserialize_read_exit(s);
return NULL;
}
/*
* Check to see if the LWP is from the correct process. We won't
* see entries in pid_table from a prior process that also used "p",
* by virtue of the fact that allocating "p" means all prior updates
* to dependent data structures are visible to this thread.
*/
l = PT_GET_LWP(slot);
if (__predict_false(atomic_load_relaxed(&l->l_proc) != p)) {
pserialize_read_exit(s);
return NULL;
}
/*
* We now know that p->p_lock holds this LWP stable.
*
* If the status is not LSIDL, it means the LWP is intended to be
* findable by LID and l_lid cannot change behind us.
*
* No need to acquire the LWP's lock to check for LSIDL, as
* p->p_lock must be held to transition in and out of LSIDL.
* Any other observed state is of no particular interest.
*/
pserialize_read_exit(s);
return l->l_stat != LSIDL && l->l_lid == pid ? l : NULL;
}
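/*
 * Illustrative sketch (not part of the original source): looking up an
 * LWP by ID in a process whose p_lock the caller already holds, per
 * the contract described above.
 */
#if 0
static bool
example_lwp_exists(struct proc *p, pid_t lid)
{
	struct lwp *l;

	KASSERT(mutex_owned(p->p_lock));
	l = proc_find_lwp(p, lid);

	/* l (if found) stays stable for as long as p_lock is held. */
	return l != NULL;
}
#endif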
/*
* proc_find_lwp_unlocked: locate an lwp in said proc by the ID.
*
* => Called in a pserialize read section with no locks held.
* => LSIDL lwps are not returned because they are only partially
* constructed while occupying the slot.
* => Callers need to be careful about lwp::l_stat of the returned
* lwp.
* => If an LWP is found, it's returned locked.
*/
struct lwp *
proc_find_lwp_unlocked(proc_t *p, pid_t pid)
{
struct pid_table *pt;
unsigned pt_mask;
struct lwp *l = NULL;
uintptr_t slot;
KASSERT(pserialize_in_read_section());
/*
* Look in the pid_table. This is done unlocked inside a
* pserialize read section covering pid_table's memory
* allocation only, so take care to read things in the correct
* order:
*
* 1. First read the table mask -- this only ever increases, in
* expand_pid_table, so a stale value is safely
* conservative.
*
* 2. Next read the pid table -- this is always set _before_
* the mask increases, so if we see a new table and stale
* mask, the mask is still valid for the table.
*/
pt_mask = atomic_load_acquire(&pid_tbl_mask);
pt = &atomic_load_consume(&pid_table)[pid & pt_mask];
slot = atomic_load_consume(&pt->pt_slot);
if (__predict_false(!PT_IS_LWP(slot))) {
return NULL;
}
/*
* Lock the LWP we found to get it stable. If it's embryonic or
* reaped (LSIDL) then none of the other fields can safely be
* checked.
*/
l = PT_GET_LWP(slot);
lwp_lock(l);
if (__predict_false(l->l_stat == LSIDL)) {
lwp_unlock(l);
return NULL;
}
/*
* l_proc and l_lid are now known stable because the LWP is not
* LSIDL, so check those fields too to make sure we found the
* right thing.
*/
if (__predict_false(l->l_proc != p || l->l_lid != pid)) {
lwp_unlock(l);
return NULL;
}
/* Everything checks out, return it locked. */
return l;
}
/*
* proc_find_lwp_acquire_proc: locate an lwp and acquire a lock
* on its containing proc.
*
* => Similar to proc_find_lwp(), but does not require you to have
* the proc a priori.
* => Also returns proc * to caller, with p::p_lock held.
* => Same caveats apply.
*/
struct lwp *
proc_find_lwp_acquire_proc(pid_t pid, struct proc **pp)
{
struct pid_table *pt;
struct proc *p = NULL;
struct lwp *l = NULL;
uintptr_t slot;
KASSERT(pp != NULL);
mutex_enter(&proc_lock);
pt = &pid_table[pid & pid_tbl_mask];
slot = pt->pt_slot;
if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) {
l = PT_GET_LWP(slot);
p = l->l_proc;
mutex_enter(p->p_lock);
if (__predict_false(l->l_stat == LSIDL)) {
mutex_exit(p->p_lock);
l = NULL;
p = NULL;
}
}
mutex_exit(&proc_lock);
KASSERT(p == NULL || mutex_owned(p->p_lock));
*pp = p;
return l;
}
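/*
 * Illustrative sketch (not part of the original source): using
 * proc_find_lwp_acquire_proc() when the caller has only an LWP ID and
 * no proc pointer.  On success the containing proc is returned with
 * p_lock held, which the caller must release.
 */
#if 0
static int
example_with_lwp(pid_t lid)
{
	struct proc *p;
	struct lwp *l;

	l = proc_find_lwp_acquire_proc(lid, &p);
	if (l == NULL)
		return ESRCH;

	/* ... operate on l while p->p_lock keeps it stable ... */

	mutex_exit(p->p_lock);
	return 0;
}
#endif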
/*
* proc_find_raw_pid_table_locked: locate a process by the ID.
*
* => Must be called with proc_lock held.
*/
static proc_t *
proc_find_raw_pid_table_locked(pid_t pid, bool any_lwpid)
{
struct pid_table *pt;
proc_t *p = NULL;
uintptr_t slot;
/* No - used by DDB. KASSERT(mutex_owned(&proc_lock)); */
pt = &pid_table[pid & pid_tbl_mask];
slot = pt->pt_slot;
if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) {
/*
* When looking up processes, require a direct match
* on the PID assigned to the proc, not just one of
* its LWPs.
*
* N.B. We require lwp::l_proc of LSIDL LWPs to be
* valid here.
*/
p = PT_GET_LWP(slot)->l_proc;
if (__predict_false(p->p_pid != pid && !any_lwpid))
p = NULL;
} else if (PT_IS_PROC(slot) && pt->pt_pid == pid) {
p = PT_GET_PROC(slot);
}
return p;
}
proc_t *
proc_find_raw(pid_t pid)
{
return proc_find_raw_pid_table_locked(pid, false);
}
static proc_t *
proc_find_internal(pid_t pid, bool any_lwpid)
{
proc_t *p;
KASSERT(mutex_owned(&proc_lock));
p = proc_find_raw_pid_table_locked(pid, any_lwpid);
if (__predict_false(p == NULL)) {
return NULL;
}
/*
* Only allow live processes to be found by PID.
* XXX: p_stat might change, since proc unlocked.
*/
if (__predict_true(p->p_stat == SACTIVE || p->p_stat == SSTOP)) {
return p;
}
return NULL;
}
proc_t *
proc_find(pid_t pid)
{
return proc_find_internal(pid, false);
}
proc_t *
proc_find_lwpid(pid_t pid)
{
return proc_find_internal(pid, true);
}
/*
* pgrp_find: locate a process group by the ID.
*
* => Must be called with proc_lock held.
*/
struct pgrp *
pgrp_find(pid_t pgid)
{
struct pgrp *pg;
KASSERT(mutex_owned(&proc_lock));
pg = pid_table[pgid & pid_tbl_mask].pt_pgrp;
/*
* Cannot look up a process group that only exists because the
* session has not died yet (traditional).
*/
if (pg == NULL || pg->pg_id != pgid || LIST_EMPTY(&pg->pg_members)) {
return NULL;
}
return pg;
}
static void
expand_pid_table(void)
{
size_t pt_size, tsz;
struct pid_table *n_pt, *new_pt;
uintptr_t slot;
struct pgrp *pgrp;
pid_t pid, rpid;
u_int i;
uint new_pt_mask;
KASSERT(mutex_owned(&proc_lock));
/* Unlock the pid_table briefly to allocate memory. */
pt_size = pid_tbl_mask + 1;
mutex_exit(&proc_lock);
tsz = pt_size * 2 * sizeof(struct pid_table);
new_pt = kmem_alloc(tsz, KM_SLEEP);
new_pt_mask = pt_size * 2 - 1;
/* XXX For now. The practical limit is much lower anyway. */
KASSERT(new_pt_mask <= FUTEX_TID_MASK);
mutex_enter(&proc_lock);
if (pt_size != pid_tbl_mask + 1) {
/* Another process beat us to it... */
mutex_exit(&proc_lock);
kmem_free(new_pt, tsz);
goto out;
}
/*
* Copy entries from old table into new one.
* If 'pid' is 'odd' we need to place in the upper half,
* even pid's to the lower half.
* Free items stay in the low half so we don't have to
* fixup the reference to them.
* We stuff free items on the front of the freelist
* because we can't write to unmodified entries.
* Processing the table backwards maintains a semblance
* of issuing pid numbers that increase with time.
*/
i = pt_size - 1;
n_pt = new_pt + i;
for (; ; i--, n_pt--) {
slot = pid_table[i].pt_slot;
pgrp = pid_table[i].pt_pgrp;
if (!PT_VALID(slot)) {
/* Up 'use count' so that link is valid */
pid = (PT_NEXT(slot) + pt_size) & ~pt_size;
rpid = 0;
slot = PT_SET_FREE(pid);
if (pgrp)
pid = pgrp->pg_id;
} else {
pid = pid_table[i].pt_pid;
rpid = pid;
}
/* Save entry in appropriate half of table */
n_pt[pid & pt_size].pt_slot = slot;
n_pt[pid & pt_size].pt_pgrp = pgrp;
n_pt[pid & pt_size].pt_pid = rpid;
/* Put other piece on start of free list */
pid = (pid ^ pt_size) & ~pid_tbl_mask;
n_pt[pid & pt_size].pt_slot =
PT_SET_FREE((pid & ~pt_size) | next_free_pt);
n_pt[pid & pt_size].pt_pgrp = 0;
n_pt[pid & pt_size].pt_pid = 0;
next_free_pt = i | (pid & pt_size);
if (i == 0)
break;
}
/* Save old table size and switch tables */
tsz = pt_size * sizeof(struct pid_table);
n_pt = pid_table;
atomic_store_release(&pid_table, new_pt);
KASSERT(new_pt_mask >= pid_tbl_mask);
atomic_store_release(&pid_tbl_mask, new_pt_mask);
/*
* pid_max starts as PID_MAX (= 30000), once we have 16384
* allocated pids we need it to be larger!
*/
if (pid_tbl_mask > PID_MAX) {
pid_max = pid_tbl_mask * 2 + 1;
pid_alloc_lim |= pid_alloc_lim << 1;
} else
pid_alloc_lim <<= 1; /* doubles number of free slots... */
mutex_exit(&proc_lock);
/*
* Make sure that unlocked access to the old pid_table is complete
* and then free it.
*/
pserialize_perform(proc_psz);
kmem_free(n_pt, tsz);
out: /* Return with proc_lock held again. */
mutex_enter(&proc_lock);
}
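/*
 * Worked example (not part of the original source) of the "unzip"
 * above: with an old table of size 4, index 2 serves pids 2, 6, 10,
 * 14, ...  After doubling to size 8, pids with bit 2 clear (2, 10,
 * 18, ...) stay at index 2, while pids with bit 2 set (6, 14, ...)
 * move to index 6 = 2 + 4.  Free entries remain in the lower half so
 * the existing free-list links stay valid.
 */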
struct proc *
proc_alloc(void)
{
struct proc *p;
p = pool_cache_get(proc_cache, PR_WAITOK);
p->p_stat = SIDL; /* protect against others */
proc_initspecific(p);
kdtrace_proc_ctor(NULL, p);
/*
* Allocate a placeholder in the pid_table. When we create the
* first LWP for this process, it will take ownership of the
* slot.
*/
if (__predict_false(proc_alloc_pid(p) == -1)) {
/* Allocating the PID failed; unwind. */
proc_finispecific(p);
proc_free_mem(p);
p = NULL;
}
return p;
}
/*
* proc_alloc_pid_slot: allocate PID and record the occupant so that
* proc_find_raw() can find it by the PID.
*/
static pid_t __noinline
proc_alloc_pid_slot(struct proc *p, uintptr_t slot)
{
struct pid_table *pt;
pid_t pid;
int nxt;
KASSERT(mutex_owned(&proc_lock));
for (;; expand_pid_table()) {
if (__predict_false(pid_alloc_cnt >= pid_alloc_lim)) {
/* ensure pids cycle through 2000+ values */
continue;
}
/*
* The first user process *must* be given PID 1.
* it has already been reserved for us. This
* will be coming in from the proc_alloc() call
* above, and the entry will be usurped later when
* the first user LWP is created.
* XXX this is slightly gross.
*/
if (__predict_false(PT_RESERVED(pid_table[1].pt_slot) &&
p != &proc0)) {
KASSERT(PT_IS_PROC(slot));
pt = &pid_table[1];
pt->pt_slot = slot;
return 1;
}
pt = &pid_table[next_free_pt];
#ifdef DIAGNOSTIC
if (__predict_false(PT_VALID(pt->pt_slot) || pt->pt_pgrp))
panic("proc_alloc: slot busy");
#endif
nxt = PT_NEXT(pt->pt_slot);
if (nxt & pid_tbl_mask)
break;
/* Table full - expand (NB last entry not used....) */
}
/* pid is 'saved use count' + 'size' + entry */
pid = (nxt & ~pid_tbl_mask) + pid_tbl_mask + 1 + next_free_pt;
if ((uint)pid > (uint)pid_max)
pid &= pid_tbl_mask;
next_free_pt = nxt & pid_tbl_mask;
/* XXX For now. The practical limit is much lower anyway. */
KASSERT(pid <= FUTEX_TID_MASK);
/* Grab table slot */
pt->pt_slot = slot;
KASSERT(pt->pt_pid == 0);
pt->pt_pid = pid;
pid_alloc_cnt++;
return pid;
}
pid_t
proc_alloc_pid(struct proc *p)
{
pid_t pid;
KASSERT((((uintptr_t)p) & PT_F_ALLBITS) == 0);
KASSERT(p->p_stat == SIDL);
mutex_enter(&proc_lock);
pid = proc_alloc_pid_slot(p, PT_SET_PROC(p));
if (pid != -1)
p->p_pid = pid;
mutex_exit(&proc_lock);
return pid;
}
pid_t
proc_alloc_lwpid(struct proc *p, struct lwp *l)
{
struct pid_table *pt;
pid_t pid;
KASSERT((((uintptr_t)l) & PT_F_ALLBITS) == 0);
KASSERT(l->l_proc == p);
KASSERT(l->l_stat == LSIDL);
/*
* For unlocked lookup in proc_find_lwp(), make sure l->l_proc
* is globally visible before the LWP becomes visible via the
* pid_table.
*/
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_producer();
#endif
/*
* If the slot for p->p_pid currently points to the proc,
* then we should usurp this ID for the LWP. This happens
* at least once per process (for the first LWP), and can
* happen again if the first LWP for a process exits and
* before the process creates another.
*/
mutex_enter(&proc_lock);
pid = p->p_pid;
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(pt->pt_pid == pid);
if (PT_IS_PROC(pt->pt_slot)) {
KASSERT(PT_GET_PROC(pt->pt_slot) == p);
l->l_lid = pid;
pt->pt_slot = PT_SET_LWP(l);
} else {
/* Need to allocate a new slot. */
pid = proc_alloc_pid_slot(p, PT_SET_LWP(l));
if (pid != -1)
l->l_lid = pid;
}
mutex_exit(&proc_lock);
return pid;
}
static void __noinline
proc_free_pid_internal(pid_t pid, uintptr_t type __diagused)
{
struct pid_table *pt;
KASSERT(mutex_owned(&proc_lock));
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(PT_GET_TYPE(pt->pt_slot) == type);
KASSERT(pt->pt_pid == pid);
/* save pid use count in slot */
pt->pt_slot = PT_SET_FREE(pid & ~pid_tbl_mask);
pt->pt_pid = 0;
if (pt->pt_pgrp == NULL) {
/* link last freed entry onto ours */
pid &= pid_tbl_mask;
pt = &pid_table[last_free_pt];
pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pid);
pt->pt_pid = 0;
last_free_pt = pid;
pid_alloc_cnt--;
}
}
/*
* Free a process id - called from proc_free (in kern_exit.c)
*
* Called with the proc_lock held.
*/
void
proc_free_pid(pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
proc_free_pid_internal(pid, PT_F_PROC);
}
/*
* Free a process id used by an LWP. If this was the process's
* first LWP, we convert the slot to point to the process; the
* entry will get cleaned up later when the process finishes exiting.
*
* If not, then it's the same as proc_free_pid().
*/
void
proc_free_lwpid(struct proc *p, pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
if (__predict_true(p->p_pid == pid)) {
struct pid_table *pt;
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(pt->pt_pid == pid);
KASSERT(PT_IS_LWP(pt->pt_slot));
KASSERT(PT_GET_LWP(pt->pt_slot)->l_proc == p);
pt->pt_slot = PT_SET_PROC(p);
return;
}
proc_free_pid_internal(pid, PT_F_LWP);
}
void
proc_free_mem(struct proc *p)
{
kdtrace_proc_dtor(NULL, p);
pool_cache_put(proc_cache, p);
}
/*
* proc_enterpgrp: move p to a new or existing process group (and session).
*
* If we are creating a new pgrp, the pgid should equal
* the calling process' pid.
* It is only valid to enter a process group that is in the session
* of the process.
* Also, mksess should only be set if we are creating a process group.
*
* Only called from sys_setsid, sys_setpgid and posix_spawn/spawn_return.
*/
int
proc_enterpgrp(struct proc *curp, pid_t pid, pid_t pgid, bool mksess)
{
struct pgrp *new_pgrp, *pgrp;
struct session *sess;
struct proc *p;
int rval;
pid_t pg_id = NO_PGID;
/* Allocate data areas we might need before doing any validity checks */
sess = mksess ? kmem_alloc(sizeof(*sess), KM_SLEEP) : NULL;
new_pgrp = kmem_alloc(sizeof(*new_pgrp), KM_SLEEP);
mutex_enter(&proc_lock);
rval = EPERM; /* most common error (to save typing) */
/* Check pgrp exists or can be created */
pgrp = pid_table[pgid & pid_tbl_mask].pt_pgrp;
if (pgrp != NULL && pgrp->pg_id != pgid)
goto done;
/* Can only set another process under restricted circumstances. */
if (pid != curp->p_pid) {
/* Must exist and be one of our children... */
p = proc_find_internal(pid, false);
if (p == NULL || !p_inferior(p, curp)) {
rval = ESRCH;
goto done;
}
/* ... in the same session... */
if (sess != NULL || p->p_session != curp->p_session)
goto done;
/* ... existing pgid must be in same session ... */
if (pgrp != NULL && pgrp->pg_session != p->p_session)
goto done;
/* ... and not done an exec. */
if (p->p_flag & PK_EXEC) {
rval = EACCES;
goto done;
}
} else {
/* ... setsid() cannot re-enter a pgrp */
if (mksess && (curp->p_pgid == curp->p_pid || pgrp_find(curp->p_pid)))
goto done;
p = curp;
}
/* Changing the process group/session of a session
leader is definitely off limits. */
if (SESS_LEADER(p)) {
if (sess == NULL && p->p_pgrp == pgrp)
/* unless it's a definite noop */
rval = 0;
goto done;
}
/* Can only create a process group with id of process */
if (pgrp == NULL && pgid != pid)
goto done;
/* Can only create a session if creating pgrp */
if (sess != NULL && pgrp != NULL)
goto done;
/* Check we allocated memory for a pgrp... */
if (pgrp == NULL && new_pgrp == NULL)
goto done;
/* Don't attach to 'zombie' pgrp */
if (pgrp != NULL && LIST_EMPTY(&pgrp->pg_members))
goto done;
/* Expect to succeed now */
rval = 0;
if (pgrp == p->p_pgrp)
/* nothing to do */
goto done;
/* Ok all setup, link up required structures */
if (pgrp == NULL) {
pgrp = new_pgrp;
new_pgrp = NULL;
if (sess != NULL) {
sess->s_sid = p->p_pid;
sess->s_leader = p;
sess->s_count = 1;
sess->s_ttyvp = NULL;
sess->s_ttyp = NULL;
sess->s_flags = p->p_session->s_flags & ~S_LOGIN_SET;
memcpy(sess->s_login, p->p_session->s_login,
sizeof(sess->s_login));
p->p_lflag &= ~PL_CONTROLT;
} else {
sess = p->p_pgrp->pg_session;
proc_sesshold(sess);
}
pgrp->pg_session = sess;
sess = NULL;
pgrp->pg_id = pgid;
LIST_INIT(&pgrp->pg_members);
#ifdef DIAGNOSTIC
if (__predict_false(pid_table[pgid & pid_tbl_mask].pt_pgrp))
panic("enterpgrp: pgrp table slot in use");
if (__predict_false(mksess && p != curp))
panic("enterpgrp: mksession and p != curproc");
#endif
pid_table[pgid & pid_tbl_mask].pt_pgrp = pgrp;
pgrp->pg_jobc = 0;
}
/*
* Adjust eligibility of affected pgrps to participate in job control.
* Increment eligibility counts before decrementing, otherwise we
* could reach 0 spuriously during the first call.
*/
fixjobc(p, pgrp, 1);
fixjobc(p, p->p_pgrp, 0);
/* Interlock with ttread(). */
mutex_spin_enter(&tty_lock);
/* Move process to requested group. */
LIST_REMOVE(p, p_pglist);
if (LIST_EMPTY(&p->p_pgrp->pg_members))
/* defer delete until we've dumped the lock */
pg_id = p->p_pgrp->pg_id;
p->p_pgrp = pgrp;
LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
/* Done with the swap; we can release the tty mutex. */
mutex_spin_exit(&tty_lock);
done:
if (pg_id != NO_PGID) {
/* Releases proc_lock. */
pg_delete(pg_id);
} else {
mutex_exit(&proc_lock);
}
if (sess != NULL)
kmem_free(sess, sizeof(*sess));
if (new_pgrp != NULL)
kmem_free(new_pgrp, sizeof(*new_pgrp));
#ifdef DEBUG_PGRP
if (__predict_false(rval))
printf("enterpgrp(%d,%d,%d), curproc %d, rval %d\n",
pid, pgid, mksess, curp->p_pid, rval);
#endif
return rval;
}
/*
* proc_leavepgrp: remove a process from its process group.
* => must be called with the proc_lock held, which will be released;
*/
void
proc_leavepgrp(struct proc *p)
{
struct pgrp *pgrp;
KASSERT(mutex_owned(&proc_lock));
/* Interlock with ttread() */
mutex_spin_enter(&tty_lock);
pgrp = p->p_pgrp;
LIST_REMOVE(p, p_pglist);
p->p_pgrp = NULL;
mutex_spin_exit(&tty_lock);
if (LIST_EMPTY(&pgrp->pg_members)) {
/* Releases proc_lock. */
pg_delete(pgrp->pg_id);
} else {
mutex_exit(&proc_lock);
}
}
/*
* pg_remove: remove a process group from the table.
* => must be called with the proc_lock held;
* => returns process group to free;
*/
static struct pgrp *
pg_remove(pid_t pg_id)
{
struct pgrp *pgrp;
struct pid_table *pt;
KASSERT(mutex_owned(&proc_lock));
pt = &pid_table[pg_id & pid_tbl_mask];
pgrp = pt->pt_pgrp;
KASSERT(pgrp != NULL);
KASSERT(pgrp->pg_id == pg_id);
KASSERT(LIST_EMPTY(&pgrp->pg_members));
pt->pt_pgrp = NULL;
if (!PT_VALID(pt->pt_slot)) {
/* Orphaned pgrp, put slot onto free list. */
KASSERT((PT_NEXT(pt->pt_slot) & pid_tbl_mask) == 0);
pg_id &= pid_tbl_mask;
pt = &pid_table[last_free_pt];
pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pg_id);
KASSERT(pt->pt_pid == 0);
last_free_pt = pg_id;
pid_alloc_cnt--;
}
return pgrp;
}
/*
* pg_delete: delete and free a process group.
* => must be called with the proc_lock held, which will be released.
*/
static void
pg_delete(pid_t pg_id)
{
struct pgrp *pg;
struct tty *ttyp;
struct session *ss;
KASSERT(mutex_owned(&proc_lock));
pg = pid_table[pg_id & pid_tbl_mask].pt_pgrp;
if (pg == NULL || pg->pg_id != pg_id || !LIST_EMPTY(&pg->pg_members)) {
mutex_exit(&proc_lock);
return;
}
ss = pg->pg_session;
/* Remove reference (if any) from tty to this process group */
mutex_spin_enter(&tty_lock);
ttyp = ss->s_ttyp;
if (ttyp != NULL && ttyp->t_pgrp == pg) {
ttyp->t_pgrp = NULL;
KASSERT(ttyp->t_session == ss);
}
mutex_spin_exit(&tty_lock);
/*
* The leading process group in a session is freed by proc_sessrele(),
* if last reference. It will also release the locks.
*/
pg = (ss->s_sid != pg->pg_id) ? pg_remove(pg_id) : NULL;
proc_sessrele(ss);
if (pg != NULL) {
/* Free it, if was not done above. */
kmem_free(pg, sizeof(struct pgrp));
}
}
/*
* Adjust pgrp jobc counters when specified process changes process group.
* We count the number of processes in each process group that "qualify"
* the group for terminal job control (those with a parent in a different
* process group of the same session). If that count reaches zero, the
* process group becomes orphaned. Check both the specified process'
* process group and that of its children.
* entering == 0 => p is leaving specified group.
* entering == 1 => p is entering specified group.
*
* Call with proc_lock held.
*/
void
fixjobc(struct proc *p, struct pgrp *pgrp, int entering)
{
struct pgrp *hispgrp;
struct session *mysession = pgrp->pg_session;
struct proc *child;
KASSERT(mutex_owned(&proc_lock));
/*
* Check p's parent to see whether p qualifies its own process
* group; if so, adjust count for p's process group.
*/
hispgrp = p->p_pptr->p_pgrp;
if (hispgrp != pgrp && hispgrp->pg_session == mysession) {
if (entering) {
pgrp->pg_jobc++;
p->p_lflag &= ~PL_ORPHANPG;
} else {
/* KASSERT(pgrp->pg_jobc > 0); */
if (--pgrp->pg_jobc == 0)
orphanpg(pgrp);
}
}
/*
* Check this process' children to see whether they qualify
* their process groups; if so, adjust counts for children's
* process groups.
*/
LIST_FOREACH(child, &p->p_children, p_sibling) {
hispgrp = child->p_pgrp;
if (hispgrp != pgrp && hispgrp->pg_session == mysession && !P_ZOMBIE(child)) {
if (entering) {
child->p_lflag &= ~PL_ORPHANPG;
hispgrp->pg_jobc++;
} else {
KASSERT(hispgrp->pg_jobc > 0);
if (--hispgrp->pg_jobc == 0)
orphanpg(hispgrp);
}
}
}
}
/*
* A process group has become orphaned;
* if there are any stopped processes in the group,
* hang up all processes in that group.
*
* Call with proc_lock held.
*/
static void
orphanpg(struct pgrp *pg)
{
struct proc *p;
KASSERT(mutex_owned(&proc_lock));
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
if (p->p_stat == SSTOP) {
p->p_lflag |= PL_ORPHANPG;
psignal(p, SIGHUP);
psignal(p, SIGCONT);
}
}
}
#ifdef DDB
#include <ddb/db_output.h>
void pidtbl_dump(void);
void
pidtbl_dump(void)
{
struct pid_table *pt;
struct proc *p;
struct pgrp *pgrp;
uintptr_t slot;
int id;
db_printf("pid table %p size %x, next %x, last %x\n",
pid_table, pid_tbl_mask+1,
next_free_pt, last_free_pt);
for (pt = pid_table, id = 0; id <= pid_tbl_mask; id++, pt++) {
slot = pt->pt_slot;
if (!PT_VALID(slot) && !pt->pt_pgrp)
continue;
if (PT_IS_LWP(slot)) {
p = PT_GET_LWP(slot)->l_proc;
} else if (PT_IS_PROC(slot)) {
p = PT_GET_PROC(slot);
} else {
p = NULL;
}
db_printf(" id %x: ", id);
if (p != NULL)
db_printf("slotpid %d proc %p id %d (0x%x) %s\n",
pt->pt_pid, p, p->p_pid, p->p_pid, p->p_comm);
else
db_printf("next %x use %x\n",
PT_NEXT(slot) & pid_tbl_mask,
PT_NEXT(slot) & ~pid_tbl_mask);
if ((pgrp = pt->pt_pgrp)) {
db_printf("\tsession %p, sid %d, count %d, login %s\n",
pgrp->pg_session, pgrp->pg_session->s_sid,
pgrp->pg_session->s_count,
pgrp->pg_session->s_login);
db_printf("\tpgrp %p, pg_id %d, pg_jobc %d, members %p\n",
pgrp, pgrp->pg_id, pgrp->pg_jobc,
LIST_FIRST(&pgrp->pg_members));
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
db_printf("\t\tpid %d addr %p pgrp %p %s\n",
p->p_pid, p, p->p_pgrp, p->p_comm);
}
}
}
}
#endif /* DDB */
#ifdef KSTACK_CHECK_MAGIC
#define KSTACK_MAGIC 0xdeadbeaf
/* XXX should be per process basis? */
static int kstackleftmin = KSTACK_SIZE;
static int kstackleftthres = KSTACK_SIZE / 8;
void
kstack_setup_magic(const struct lwp *l)
{
uint32_t *ip;
uint32_t const *end;
KASSERT(l != NULL);
KASSERT(l != &lwp0);
/*
* Fill the whole stack with the magic number so that any later
* modification of it can be detected.
*/
ip = (uint32_t *)KSTACK_LOWEST_ADDR(l);
end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
for (; ip < end; ip++) {
*ip = KSTACK_MAGIC;
}
}
void
kstack_check_magic(const struct lwp *l)
{
uint32_t const *ip, *end;
int stackleft;
KASSERT(l != NULL);
/* don't check proc0 */ /*XXX*/
if (l == &lwp0)
return;
#ifdef __MACHINE_STACK_GROWS_UP
/* stack grows upwards (eg. hppa) */
ip = (uint32_t *)((void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
end = (uint32_t *)KSTACK_LOWEST_ADDR(l);
for (ip--; ip >= end; ip--)
if (*ip != KSTACK_MAGIC)
break;
stackleft = (void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE - (void *)ip;
#else /* __MACHINE_STACK_GROWS_UP */
/* stack grows downwards (eg. i386) */
ip = (uint32_t *)KSTACK_LOWEST_ADDR(l);
end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
for (; ip < end; ip++)
if (*ip != KSTACK_MAGIC)
break;
stackleft = ((const char *)ip) - (const char *)KSTACK_LOWEST_ADDR(l);
#endif /* __MACHINE_STACK_GROWS_UP */
if (kstackleftmin > stackleft) {
kstackleftmin = stackleft;
if (stackleft < kstackleftthres)
printf("warning: kernel stack left %d bytes"
"(pid %u:lid %u)\n", stackleft,
(u_int)l->l_proc->p_pid, (u_int)l->l_lid);
}
if (stackleft <= 0) {
panic("magic on the top of kernel stack changed for "
"pid %u, lid %u: maybe kernel stack overflow",
(u_int)l->l_proc->p_pid, (u_int)l->l_lid);
}
}
#endif /* KSTACK_CHECK_MAGIC */
int
proclist_foreach_call(struct proclist *list,
int (*callback)(struct proc *, void *arg), void *arg)
{
struct proc marker;
struct proc *p;
int ret = 0;
marker.p_flag = PK_MARKER;
mutex_enter(&proc_lock);
for (p = LIST_FIRST(list); ret == 0 && p != NULL;) {
if (p->p_flag & PK_MARKER) {
p = LIST_NEXT(p, p_list);
continue;
}
LIST_INSERT_AFTER(p, &marker, p_list);
ret = (*callback)(p, arg);
KASSERT(mutex_owned(&proc_lock));
p = LIST_NEXT(&marker, p_list);
LIST_REMOVE(&marker, p_list);
}
mutex_exit(&proc_lock);
return ret;
}
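/*
 * Illustrative sketch (not part of the original source): a caller of
 * proclist_foreach_call().  The callback runs with proc_lock held and
 * a non-zero return value stops the walk early.
 */
#if 0
static int
example_count_cb(struct proc *p, void *arg)
{
	int *countp = arg;

	(*countp)++;
	return 0;			/* keep iterating */
}

static int
example_count_procs(void)
{
	int count = 0;

	(void)proclist_foreach_call(&allproc, example_count_cb, &count);
	return count;
}
#endif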
int
proc_vmspace_getref(struct proc *p, struct vmspace **vm)
{
/* XXXCDC: how should locking work here? */
/* curproc exception is for coredump. */
if ((p != curproc && (p->p_sflag & PS_WEXIT) != 0) ||
(p->p_vmspace->vm_refcnt < 1)) {
return EFAULT;
}
uvmspace_addref(p->p_vmspace);
*vm = p->p_vmspace;
return 0;
}
/*
* Acquire a write lock on the process credential.
*/
void
proc_crmod_enter(void)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
kauth_cred_t oc;
/* Reset what needs to be reset in plimit. */
if (p->p_limit->pl_corename != defcorename) {
lim_setcorename(p, defcorename, 0);
}
mutex_enter(p->p_lock);
/* Ensure the LWP cached credentials are up to date. */
if ((oc = l->l_cred) != p->p_cred) {
l->l_cred = kauth_cred_hold(p->p_cred);
kauth_cred_free(oc);
}
}
/*
* Set in a new process credential, and drop the write lock. The credential
* must have a reference already. Optionally, free a no-longer required
* credential.
*/
void
proc_crmod_leave(kauth_cred_t scred, kauth_cred_t fcred, bool sugid)
{
struct lwp *l = curlwp, *l2;
struct proc *p = l->l_proc;
kauth_cred_t oc;
KASSERT(mutex_owned(p->p_lock));
/* Is there a new credential to set in? */
if (scred != NULL) {
p->p_cred = scred;
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
if (l2 != l) {
lwp_lock(l2);
l2->l_flag |= LW_CACHECRED;
lwp_need_userret(l2);
lwp_unlock(l2);
}
}
/* Ensure the LWP cached credentials are up to date. */
if ((oc = l->l_cred) != scred) {
l->l_cred = kauth_cred_hold(scred);
}
} else
oc = NULL; /* XXXgcc */
if (sugid) {
/*
* Mark process as having changed credentials, stops
* tracing etc.
*/
p->p_flag |= PK_SUGID;
}
mutex_exit(p->p_lock);
/* If there is a credential to be released, free it now. */
if (fcred != NULL) {
KASSERT(scred != NULL);
kauth_cred_free(fcred);
if (oc != scred)
kauth_cred_free(oc);
}
}
/*
* proc_specific_key_create --
* Create a key for subsystem proc-specific data.
*/
int
proc_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return (specificdata_key_create(proc_specificdata_domain, keyp, dtor));
}
/*
* proc_specific_key_delete --
* Delete a key for subsystem proc-specific data.
*/
void
proc_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(proc_specificdata_domain, key);
}
/*
* proc_initspecific --
* Initialize a proc's specificdata container.
*/
void
proc_initspecific(struct proc *p)
{
int error __diagused;
error = specificdata_init(proc_specificdata_domain, &p->p_specdataref);
KASSERT(error == 0);
}
/*
* proc_finispecific --
* Finalize a proc's specificdata container.
*/
void
proc_finispecific(struct proc *p)
{
specificdata_fini(proc_specificdata_domain, &p->p_specdataref);
}
/*
* proc_getspecific --
* Return proc-specific data corresponding to the specified key.
*/
void *
proc_getspecific(struct proc *p, specificdata_key_t key)
{
return (specificdata_getspecific(proc_specificdata_domain,
&p->p_specdataref, key));
}
/*
* proc_setspecific --
* Set proc-specific data corresponding to the specified key.
*/
void
proc_setspecific(struct proc *p, specificdata_key_t key, void *data)
{
specificdata_setspecific(proc_specificdata_domain,
&p->p_specdataref, key, data);
}
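/*
 * Illustrative sketch (not part of the original source): a subsystem
 * attaching private per-process data with the key routines above.
 * The key, destructor and data layout are made up for the example.
 */
#if 0
static specificdata_key_t example_key;

static void
example_dtor(void *data)
{
	kmem_free(data, sizeof(int));
}

static void
example_attach(struct proc *p)
{
	int *cookie;

	if (proc_specific_key_create(&example_key, example_dtor) != 0)
		return;

	cookie = kmem_alloc(sizeof(*cookie), KM_SLEEP);
	*cookie = 42;
	proc_setspecific(p, example_key, cookie);
	KASSERT(proc_getspecific(p, example_key) == cookie);
}
#endif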
int
proc_uidmatch(kauth_cred_t cred, kauth_cred_t target)
{
int r = 0;
if (kauth_cred_getuid(cred) != kauth_cred_getuid(target) ||
kauth_cred_getuid(cred) != kauth_cred_getsvuid(target)) {
/*
* suid proc of ours or proc not ours
*/
r = EPERM;
} else if (kauth_cred_getgid(target) != kauth_cred_getsvgid(target)) {
/*
* sgid proc has sgid back to us temporarily
*/
r = EPERM;
} else {
/*
* our rgid must be in target's group list (ie,
* sub-processes started by a sgid process)
*/
int ismember = 0;
if (kauth_cred_ismember_gid(cred,
kauth_cred_getgid(target), &ismember) != 0 ||
!ismember)
r = EPERM;
}
return (r);
}
/*
* sysctl stuff
*/
#define KERN_PROCSLOP (5 * sizeof(struct kinfo_proc))
static const u_int sysctl_flagmap[] = {
PK_ADVLOCK, P_ADVLOCK,
PK_EXEC, P_EXEC,
PK_NOCLDWAIT, P_NOCLDWAIT,
PK_32, P_32,
PK_CLDSIGIGN, P_CLDSIGIGN,
PK_SUGID, P_SUGID,
0
};
static const u_int sysctl_sflagmap[] = {
PS_NOCLDSTOP, P_NOCLDSTOP,
PS_WEXIT, P_WEXIT,
PS_STOPFORK, P_STOPFORK,
PS_STOPEXEC, P_STOPEXEC,
PS_STOPEXIT, P_STOPEXIT,
0
};
static const u_int sysctl_slflagmap[] = {
PSL_TRACED, P_TRACED,
PSL_CHTRACED, P_CHTRACED,
PSL_SYSCALL, P_SYSCALL,
0
};
static const u_int sysctl_lflagmap[] = {
PL_CONTROLT, P_CONTROLT,
PL_PPWAIT, P_PPWAIT,
0
};
static const u_int sysctl_stflagmap[] = {
PST_PROFIL, P_PROFIL,
0
};
/* used by kern_lwp also */
const u_int sysctl_lwpflagmap[] = {
LW_SINTR, L_SINTR,
LW_SYSTEM, L_SYSTEM,
0
};
/*
* Find the most ``active'' lwp of a process and return it for ps display
* purposes
*/
static struct lwp *
proc_active_lwp(struct proc *p)
{
static const int ostat[] = {
0,
2, /* LSIDL */
6, /* LSRUN */
5, /* LSSLEEP */
4, /* LSSTOP */
0, /* LSZOMB */
1, /* LSDEAD */
7, /* LSONPROC */
3 /* LSSUSPENDED */
};
struct lwp *l, *lp = NULL;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
KASSERT(l->l_stat >= 0);
KASSERT(l->l_stat < __arraycount(ostat));
if (lp == NULL ||
ostat[l->l_stat] > ostat[lp->l_stat] ||
(ostat[l->l_stat] == ostat[lp->l_stat] &&
l->l_cpticks > lp->l_cpticks)) {
lp = l;
continue;
}
}
return lp;
}
static int
sysctl_doeproc(SYSCTLFN_ARGS)
{
union {
struct kinfo_proc kproc;
struct kinfo_proc2 kproc2;
} *kbuf;
struct proc *p, *next, *marker;
char *where, *dp;
int type, op, arg, error;
u_int elem_size, kelem_size, elem_count;
size_t buflen, needed;
bool match, zombie, mmmbrains;
const bool allowaddr = get_expose_address(curproc);
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
dp = where = oldp;
buflen = where != NULL ? *oldlenp : 0;
error = 0;
needed = 0;
type = rnode->sysctl_num;
if (type == KERN_PROC) {
if (namelen == 0)
return EINVAL;
switch (op = name[0]) {
case KERN_PROC_ALL:
if (namelen != 1)
return EINVAL;
arg = 0;
break;
default:
if (namelen != 2)
return EINVAL;
arg = name[1];
break;
}
elem_count = 0; /* Hush little compiler, don't you cry */
kelem_size = elem_size = sizeof(kbuf->kproc);
} else {
if (namelen != 4)
return EINVAL;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
kelem_size = sizeof(kbuf->kproc2);
}
sysctl_unlock();
kbuf = kmem_zalloc(sizeof(*kbuf), KM_SLEEP);
marker = kmem_alloc(sizeof(*marker), KM_SLEEP);
marker->p_flag = PK_MARKER;
mutex_enter(&proc_lock);
/*
* Start with zombies to prevent reporting processes twice, in case they
* are dying and being moved from the list of alive processes to zombies.
*/
mmmbrains = true;
for (p = LIST_FIRST(&zombproc);; p = next) {
if (p == NULL) {
if (mmmbrains) {
p = LIST_FIRST(&allproc);
mmmbrains = false;
}
if (p == NULL)
break;
}
next = LIST_NEXT(p, p_list);
if ((p->p_flag & PK_MARKER) != 0)
continue;
/*
* Skip embryonic processes.
*/
if (p->p_stat == SIDL)
continue;
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_EPROC), NULL, NULL);
if (error != 0) {
mutex_exit(p->p_lock);
continue;
}
/*
* Handling all the operations in one switch, at the cost of some
* algorithmic complexity, is deliberate.  The win from splitting this
* function into several near-identical copies would be negligible on
* practical systems, while the maintenance burden and code growth
* would not.
*/
switch (op) {
case KERN_PROC_PID:
match = (p->p_pid == (pid_t)arg);
break;
case KERN_PROC_PGRP:
match = (p->p_pgrp->pg_id == (pid_t)arg);
break;
case KERN_PROC_SESSION:
match = (p->p_session->s_sid == (pid_t)arg);
break;
case KERN_PROC_TTY:
match = true;
if (arg == (int) KERN_PROC_TTY_REVOKE) {
if ((p->p_lflag & PL_CONTROLT) == 0 ||
p->p_session->s_ttyp == NULL ||
p->p_session->s_ttyvp != NULL) {
match = false;
}
} else if ((p->p_lflag & PL_CONTROLT) == 0 ||
p->p_session->s_ttyp == NULL) {
if ((dev_t)arg != KERN_PROC_TTY_NODEV) {
match = false;
}
} else if (p->p_session->s_ttyp->t_dev != (dev_t)arg) {
match = false;
}
break;
case KERN_PROC_UID:
match = (kauth_cred_geteuid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_RUID:
match = (kauth_cred_getuid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_GID:
match = (kauth_cred_getegid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_RGID:
match = (kauth_cred_getgid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_ALL:
match = true;
/* allow everything */
break;
default:
error = EINVAL;
mutex_exit(p->p_lock);
goto cleanup;
}
if (!match) {
mutex_exit(p->p_lock);
continue;
}
/*
* Grab a hold on the process.
*/
if (mmmbrains) {
zombie = true;
} else {
zombie = !rw_tryenter(&p->p_reflock, RW_READER);
}
if (zombie) {
LIST_INSERT_AFTER(p, marker, p_list);
}
if (buflen >= elem_size &&
(type == KERN_PROC || elem_count > 0)) {
ruspace(p); /* Update process vm resource use */
if (type == KERN_PROC) {
fill_proc(p, &kbuf->kproc.kp_proc, allowaddr);
fill_eproc(p, &kbuf->kproc.kp_eproc, zombie,
allowaddr);
} else {
fill_kproc2(p, &kbuf->kproc2, zombie,
allowaddr);
elem_count--;
}
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
/*
* Copy out elem_size, but not larger than kelem_size
*/
error = sysctl_copyout(l, kbuf, dp,
uimin(kelem_size, elem_size));
mutex_enter(&proc_lock);
if (error) {
goto bah;
}
dp += elem_size;
buflen -= elem_size;
} else {
mutex_exit(p->p_lock);
}
needed += elem_size;
/*
* Release reference to process.
*/
if (zombie) {
next = LIST_NEXT(marker, p_list);
LIST_REMOVE(marker, p_list);
} else {
rw_exit(&p->p_reflock);
next = LIST_NEXT(p, p_list);
}
/*
* Short-circuit break quickly!
*/
if (op == KERN_PROC_PID)
break;
}
mutex_exit(&proc_lock);
if (where != NULL) {
*oldlenp = dp - where;
if (needed > *oldlenp) {
error = ENOMEM;
goto out;
}
} else {
needed += KERN_PROCSLOP;
*oldlenp = needed;
}
kmem_free(kbuf, sizeof(*kbuf));
kmem_free(marker, sizeof(*marker));
sysctl_relock();
return 0;
bah:
if (zombie)
LIST_REMOVE(marker, p_list);
else
rw_exit(&p->p_reflock);
cleanup:
mutex_exit(&proc_lock);
out:
kmem_free(kbuf, sizeof(*kbuf));
kmem_free(marker, sizeof(*marker));
sysctl_relock();
return error;
}
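/*
 * Userland usage sketch (hedged, illustrative only): the kinfo_proc2 variant
 * handled above is driven by a six-integer name vector, with the caller
 * supplying elem_size and elem_count.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kinfo_proc2 kp;
	size_t len = sizeof(kp);
	int mib[6] = {
		CTL_KERN, KERN_PROC2, KERN_PROC_PID, (int)getpid(),
		(int)sizeof(kp), 1
	};

	/* Ask for this process' own kinfo_proc2 record. */
	if (sysctl(mib, 6, &kp, &len, NULL, 0) == -1)
		return 1;
	printf("pid %d comm %s\n", kp.p_pid, kp.p_comm);
	return 0;
}
#endif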
int
copyin_psstrings(struct proc *p, struct ps_strings *arginfo)
{
#if !defined(_RUMPKERNEL)
int retval;
if (p->p_flag & PK_32) {
MODULE_HOOK_CALL(kern_proc32_copyin_hook, (p, arginfo),
enosys(), retval);
return retval;
}
#endif /* !defined(_RUMPKERNEL) */
return copyin_proc(p, (void *)p->p_psstrp, arginfo, sizeof(*arginfo));
}
static int
copy_procargs_sysctl_cb(void *cookie_, const void *src, size_t off, size_t len)
{
void **cookie = cookie_;
struct lwp *l = cookie[0];
char *dst = cookie[1];
return sysctl_copyout(l, src, dst + off, len);
}
/*
* sysctl helper routine for kern.proc_args pseudo-subtree.
*/
static int
sysctl_kern_proc_args(SYSCTLFN_ARGS)
{
struct ps_strings pss;
struct proc *p;
pid_t pid;
int type, error;
void *cookie[2];
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (newp != NULL || namelen != 2)
return (EINVAL);
pid = name[0];
type = name[1];
switch (type) {
case KERN_PROC_PATHNAME:
sysctl_unlock();
error = fill_pathname(l, pid, oldp, oldlenp);
sysctl_relock();
return error;
case KERN_PROC_CWD:
sysctl_unlock();
error = fill_cwd(l, pid, oldp, oldlenp);
sysctl_relock();
return error;
case KERN_PROC_ARGV:
case KERN_PROC_NARGV:
case KERN_PROC_ENV:
case KERN_PROC_NENV:
/* ok */
break;
default:
return (EINVAL);
}
sysctl_unlock();
/* check pid */
mutex_enter(&proc_lock);
if ((p = proc_find(pid)) == NULL) {
error = EINVAL;
goto out_locked;
}
mutex_enter(p->p_lock);
/* Check permission. */
if (type == KERN_PROC_ARGV || type == KERN_PROC_NARGV)
error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ARGS), NULL, NULL);
else if (type == KERN_PROC_ENV || type == KERN_PROC_NENV)
error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENV), NULL, NULL);
else
error = EINVAL; /* XXXGCC */
if (error) {
mutex_exit(p->p_lock);
goto out_locked;
}
if (oldp == NULL) {
if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV)
*oldlenp = sizeof (int);
else
*oldlenp = ARG_MAX; /* XXX XXX XXX */
error = 0;
mutex_exit(p->p_lock);
goto out_locked;
}
/*
* Zombies don't have a stack, so we can't read their psstrings.
* System processes also don't have a user stack.
*/
if (P_ZOMBIE(p) || (p->p_flag & PK_SYSTEM) != 0) {
error = EINVAL;
mutex_exit(p->p_lock);
goto out_locked;
}
error = rw_tryenter(&p->p_reflock, RW_READER) ? 0 : EBUSY;
mutex_exit(p->p_lock);
if (error) {
goto out_locked;
}
mutex_exit(&proc_lock);
if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV) {
int value;
if ((error = copyin_psstrings(p, &pss)) == 0) {
if (type == KERN_PROC_NARGV)
value = pss.ps_nargvstr;
else
value = pss.ps_nenvstr;
error = sysctl_copyout(l, &value, oldp, sizeof(value));
*oldlenp = sizeof(value);
}
} else {
cookie[0] = l;
cookie[1] = oldp;
error = copy_procargs(p, type, oldlenp,
copy_procargs_sysctl_cb, cookie);
}
rw_exit(&p->p_reflock);
sysctl_relock();
return error;
out_locked:
mutex_exit(&proc_lock);
sysctl_relock();
return error;
}
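/*
 * Userland usage sketch (hedged, illustrative only): kern.proc_args takes a
 * pid and a request type as the two name components checked above, e.g. the
 * argument count of the calling process:
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int mib[4] = { CTL_KERN, KERN_PROC_ARGS, (int)getpid(),
	    KERN_PROC_NARGV };
	int nargv;
	size_t len = sizeof(nargv);

	/* KERN_PROC_NARGV returns a single int, per the code above. */
	if (sysctl(mib, 4, &nargv, &len, NULL, 0) == -1)
		return 1;
	printf("argc = %d\n", nargv);
	return 0;
}
#endif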
int
copy_procargs(struct proc *p, int oid, size_t *limit,
int (*cb)(void *, const void *, size_t, size_t), void *cookie)
{
struct ps_strings pss;
size_t len, i, loaded, entry_len;
struct uio auio;
struct iovec aiov;
int error, argvlen;
char *arg;
char **argv;
vaddr_t user_argv;
struct vmspace *vmspace;
/*
* Allocate a temporary buffer to hold the argument vector and
* the arguments themselves.
*/
arg = kmem_alloc(PAGE_SIZE, KM_SLEEP);
argv = kmem_alloc(PAGE_SIZE, KM_SLEEP);
/*
* Lock the process down in memory.
*/
vmspace = p->p_vmspace;
uvmspace_addref(vmspace);
/*
* Read in the ps_strings structure.
*/
if ((error = copyin_psstrings(p, &pss)) != 0)
goto done;
/*
* Now read the address of the argument vector.
*/
switch (oid) {
case KERN_PROC_ARGV:
user_argv = (uintptr_t)pss.ps_argvstr;
argvlen = pss.ps_nargvstr;
break;
case KERN_PROC_ENV:
user_argv = (uintptr_t)pss.ps_envstr;
argvlen = pss.ps_nenvstr;
break;
default:
error = EINVAL;
goto done;
}
if (argvlen < 0) {
error = EIO;
goto done;
}
/*
* Now copy each string.
*/
len = 0; /* bytes written to user buffer */
loaded = 0; /* bytes from argv already processed */
i = 0; /* To make compiler happy */
entry_len = PROC_PTRSZ(p);
for (; argvlen; --argvlen) {
int finished = 0;
vaddr_t base;
size_t xlen;
int j;
if (loaded == 0) {
size_t rem = entry_len * argvlen;
loaded = MIN(rem, PAGE_SIZE);
error = copyin_vmspace(vmspace,
(const void *)user_argv, argv, loaded);
if (error)
break;
user_argv += loaded;
i = 0;
}
#if !defined(_RUMPKERNEL)
if (p->p_flag & PK_32)
MODULE_HOOK_CALL(kern_proc32_base_hook,
(argv, i++), 0, base);
else
#endif /* !defined(_RUMPKERNEL) */
base = (vaddr_t)argv[i++];
loaded -= entry_len;
/*
* The program has messed around with its arguments,
* possibly deleting some, and replacing them with
* NULL's. Treat this as the last argument and not
* a failure.
*/
if (base == 0)
break;
while (!finished) {
xlen = PAGE_SIZE - (base & PAGE_MASK);
aiov.iov_base = arg;
aiov.iov_len = PAGE_SIZE;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = base;
auio.uio_resid = xlen;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
error = uvm_io(&vmspace->vm_map, &auio, 0);
if (error)
goto done;
/* Look for the end of the string */
for (j = 0; j < xlen; j++) {
if (arg[j] == '\0') {
xlen = j + 1;
finished = 1;
break;
}
}
/* Check for user buffer overflow */
if (len + xlen > *limit) {
finished = 1;
if (len > *limit)
xlen = 0;
else
xlen = *limit - len;
}
/* Copyout the page */
error = (*cb)(cookie, arg, len, xlen);
if (error)
goto done;
len += xlen;
base += xlen;
}
}
*limit = len;
done:
kmem_free(argv, PAGE_SIZE);
kmem_free(arg, PAGE_SIZE);
uvmspace_free(vmspace);
return error;
}
/*
* Fill in a proc structure for the specified process.
*/
static void
fill_proc(const struct proc *psrc, struct proc *p, bool allowaddr)
{
COND_SET_STRUCT(p->p_list, psrc->p_list, allowaddr);
memset(&p->p_auxlock, 0, sizeof(p->p_auxlock));
COND_SET_STRUCT(p->p_lock, psrc->p_lock, allowaddr);
memset(&p->p_stmutex, 0, sizeof(p->p_stmutex));
memset(&p->p_reflock, 0, sizeof(p->p_reflock));
COND_SET_STRUCT(p->p_waitcv, psrc->p_waitcv, allowaddr);
COND_SET_STRUCT(p->p_lwpcv, psrc->p_lwpcv, allowaddr);
COND_SET_PTR(p->p_cred, psrc->p_cred, allowaddr);
COND_SET_PTR(p->p_fd, psrc->p_fd, allowaddr);
COND_SET_PTR(p->p_cwdi, psrc->p_cwdi, allowaddr);
COND_SET_PTR(p->p_stats, psrc->p_stats, allowaddr);
COND_SET_PTR(p->p_limit, psrc->p_limit, allowaddr);
COND_SET_PTR(p->p_vmspace, psrc->p_vmspace, allowaddr);
COND_SET_PTR(p->p_sigacts, psrc->p_sigacts, allowaddr);
COND_SET_PTR(p->p_aio, psrc->p_aio, allowaddr);
p->p_mqueue_cnt = psrc->p_mqueue_cnt;
memset(&p->p_specdataref, 0, sizeof(p->p_specdataref));
p->p_exitsig = psrc->p_exitsig;
p->p_flag = psrc->p_flag;
p->p_sflag = psrc->p_sflag;
p->p_slflag = psrc->p_slflag;
p->p_lflag = psrc->p_lflag;
p->p_stflag = psrc->p_stflag;
p->p_stat = psrc->p_stat;
p->p_trace_enabled = psrc->p_trace_enabled;
p->p_pid = psrc->p_pid;
COND_SET_STRUCT(p->p_pglist, psrc->p_pglist, allowaddr);
COND_SET_PTR(p->p_pptr, psrc->p_pptr, allowaddr);
COND_SET_STRUCT(p->p_sibling, psrc->p_sibling, allowaddr);
COND_SET_STRUCT(p->p_children, psrc->p_children, allowaddr);
COND_SET_STRUCT(p->p_lwps, psrc->p_lwps, allowaddr);
COND_SET_PTR(p->p_raslist, psrc->p_raslist, allowaddr);
p->p_nlwps = psrc->p_nlwps;
p->p_nzlwps = psrc->p_nzlwps;
p->p_nrlwps = psrc->p_nrlwps;
p->p_nlwpwait = psrc->p_nlwpwait;
p->p_ndlwps = psrc->p_ndlwps;
p->p_nstopchild = psrc->p_nstopchild;
p->p_waited = psrc->p_waited;
COND_SET_PTR(p->p_zomblwp, psrc->p_zomblwp, allowaddr);
COND_SET_PTR(p->p_vforklwp, psrc->p_vforklwp, allowaddr);
COND_SET_PTR(p->p_sched_info, psrc->p_sched_info, allowaddr);
p->p_estcpu = psrc->p_estcpu;
p->p_estcpu_inherited = psrc->p_estcpu_inherited;
p->p_forktime = psrc->p_forktime;
p->p_pctcpu = psrc->p_pctcpu;
COND_SET_PTR(p->p_opptr, psrc->p_opptr, allowaddr);
COND_SET_PTR(p->p_timers, psrc->p_timers, allowaddr);
p->p_rtime = psrc->p_rtime;
p->p_uticks = psrc->p_uticks;
p->p_sticks = psrc->p_sticks;
p->p_iticks = psrc->p_iticks;
p->p_xutime = psrc->p_xutime;
p->p_xstime = psrc->p_xstime;
p->p_traceflag = psrc->p_traceflag;
COND_SET_PTR(p->p_tracep, psrc->p_tracep, allowaddr);
COND_SET_PTR(p->p_textvp, psrc->p_textvp, allowaddr);
COND_SET_PTR(p->p_emul, psrc->p_emul, allowaddr);
COND_SET_PTR(p->p_emuldata, psrc->p_emuldata, allowaddr);
COND_SET_CPTR(p->p_execsw, psrc->p_execsw, allowaddr);
COND_SET_STRUCT(p->p_klist, psrc->p_klist, allowaddr);
COND_SET_STRUCT(p->p_sigwaiters, psrc->p_sigwaiters, allowaddr);
COND_SET_STRUCT(p->p_sigpend.sp_info, psrc->p_sigpend.sp_info,
allowaddr);
p->p_sigpend.sp_set = psrc->p_sigpend.sp_set;
COND_SET_PTR(p->p_lwpctl, psrc->p_lwpctl, allowaddr);
p->p_ppid = psrc->p_ppid;
p->p_oppid = psrc->p_oppid;
COND_SET_PTR(p->p_path, psrc->p_path, allowaddr);
p->p_sigctx = psrc->p_sigctx;
p->p_nice = psrc->p_nice;
memcpy(p->p_comm, psrc->p_comm, sizeof(p->p_comm));
COND_SET_PTR(p->p_pgrp, psrc->p_pgrp, allowaddr);
COND_SET_VALUE(p->p_psstrp, psrc->p_psstrp, allowaddr);
p->p_pax = psrc->p_pax;
p->p_xexit = psrc->p_xexit;
p->p_xsig = psrc->p_xsig;
p->p_acflag = psrc->p_acflag;
COND_SET_STRUCT(p->p_md, psrc->p_md, allowaddr);
p->p_stackbase = psrc->p_stackbase;
COND_SET_PTR(p->p_dtrace, psrc->p_dtrace, allowaddr);
}
/*
* Fill in an eproc structure for the specified process.
*/
void
fill_eproc(struct proc *p, struct eproc *ep, bool zombie, bool allowaddr)
{
struct tty *tp;
struct lwp *l;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
COND_SET_PTR(ep->e_paddr, p, allowaddr);
COND_SET_PTR(ep->e_sess, p->p_session, allowaddr);
if (p->p_cred) {
kauth_cred_topcred(p->p_cred, &ep->e_pcred);
kauth_cred_toucred(p->p_cred, &ep->e_ucred);
}
if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) {
struct vmspace *vm = p->p_vmspace;
ep->e_vm.vm_rssize = vm_resident_count(vm);
ep->e_vm.vm_tsize = vm->vm_tsize;
ep->e_vm.vm_dsize = vm->vm_dsize;
ep->e_vm.vm_ssize = vm->vm_ssize;
ep->e_vm.vm_map.size = vm->vm_map.size;
/* Pick the primary (first) LWP */
l = proc_active_lwp(p);
KASSERT(l != NULL);
lwp_lock(l);
if (l->l_wchan)
strncpy(ep->e_wmesg, l->l_wmesg, WMESGLEN);
lwp_unlock(l);
}
ep->e_ppid = p->p_ppid;
if (p->p_pgrp && p->p_session) {
ep->e_pgid = p->p_pgrp->pg_id;
ep->e_jobc = p->p_pgrp->pg_jobc;
ep->e_sid = p->p_session->s_sid;
if ((p->p_lflag & PL_CONTROLT) &&
(tp = p->p_session->s_ttyp)) {
ep->e_tdev = tp->t_dev;
ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
COND_SET_PTR(ep->e_tsess, tp->t_session, allowaddr);
} else
ep->e_tdev = (uint32_t)NODEV;
ep->e_flag = p->p_session->s_ttyvp ? EPROC_CTTY : 0;
if (SESS_LEADER(p))
ep->e_flag |= EPROC_SLEADER;
strncpy(ep->e_login, p->p_session->s_login, MAXLOGNAME);
}
ep->e_xsize = ep->e_xrssize = 0;
ep->e_xccount = ep->e_xswrss = 0;
}
/*
* Fill in a kinfo_proc2 structure for the specified process.
*/
void
fill_kproc2(struct proc *p, struct kinfo_proc2 *ki, bool zombie, bool allowaddr)
{
struct tty *tp;
struct lwp *l;
struct timeval ut, st, rt;
sigset_t ss1, ss2;
struct rusage ru;
struct vmspace *vm;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
sigemptyset(&ss1);
sigemptyset(&ss2);
COND_SET_VALUE(ki->p_paddr, PTRTOUINT64(p), allowaddr);
COND_SET_VALUE(ki->p_fd, PTRTOUINT64(p->p_fd), allowaddr);
COND_SET_VALUE(ki->p_cwdi, PTRTOUINT64(p->p_cwdi), allowaddr);
COND_SET_VALUE(ki->p_stats, PTRTOUINT64(p->p_stats), allowaddr);
COND_SET_VALUE(ki->p_limit, PTRTOUINT64(p->p_limit), allowaddr);
COND_SET_VALUE(ki->p_vmspace, PTRTOUINT64(p->p_vmspace), allowaddr);
COND_SET_VALUE(ki->p_sigacts, PTRTOUINT64(p->p_sigacts), allowaddr);
COND_SET_VALUE(ki->p_sess, PTRTOUINT64(p->p_session), allowaddr);
ki->p_tsess = 0; /* may be changed if controlling tty below */
COND_SET_VALUE(ki->p_ru, PTRTOUINT64(&p->p_stats->p_ru), allowaddr);
ki->p_eflag = 0;
ki->p_exitsig = p->p_exitsig;
ki->p_flag = L_INMEM; /* Process never swapped out */
ki->p_flag |= sysctl_map_flags(sysctl_flagmap, p->p_flag);
ki->p_flag |= sysctl_map_flags(sysctl_sflagmap, p->p_sflag);
ki->p_flag |= sysctl_map_flags(sysctl_slflagmap, p->p_slflag);
ki->p_flag |= sysctl_map_flags(sysctl_lflagmap, p->p_lflag);
ki->p_flag |= sysctl_map_flags(sysctl_stflagmap, p->p_stflag);
ki->p_pid = p->p_pid;
ki->p_ppid = p->p_ppid;
ki->p_uid = kauth_cred_geteuid(p->p_cred);
ki->p_ruid = kauth_cred_getuid(p->p_cred);
ki->p_gid = kauth_cred_getegid(p->p_cred);
ki->p_rgid = kauth_cred_getgid(p->p_cred);
ki->p_svuid = kauth_cred_getsvuid(p->p_cred);
ki->p_svgid = kauth_cred_getsvgid(p->p_cred);
ki->p_ngroups = kauth_cred_ngroups(p->p_cred);
kauth_cred_getgroups(p->p_cred, ki->p_groups,
uimin(ki->p_ngroups, sizeof(ki->p_groups) / sizeof(ki->p_groups[0])),
UIO_SYSSPACE);
ki->p_uticks = p->p_uticks;
ki->p_sticks = p->p_sticks;
ki->p_iticks = p->p_iticks;
ki->p_tpgid = NO_PGID; /* may be changed if controlling tty below */
COND_SET_VALUE(ki->p_tracep, PTRTOUINT64(p->p_tracep), allowaddr);
ki->p_traceflag = p->p_traceflag;
memcpy(&ki->p_sigignore, &p->p_sigctx.ps_sigignore,sizeof(ki_sigset_t));
memcpy(&ki->p_sigcatch, &p->p_sigctx.ps_sigcatch, sizeof(ki_sigset_t));
ki->p_cpticks = 0;
ki->p_pctcpu = p->p_pctcpu;
ki->p_estcpu = 0;
ki->p_stat = p->p_stat; /* Will likely be overridden by LWP status */
ki->p_realstat = p->p_stat;
ki->p_nice = p->p_nice;
ki->p_xstat = P_WAITSTATUS(p);
ki->p_acflag = p->p_acflag;
strncpy(ki->p_comm, p->p_comm,
uimin(sizeof(ki->p_comm), sizeof(p->p_comm)));
strncpy(ki->p_ename, p->p_emul->e_name, sizeof(ki->p_ename));
ki->p_nlwps = p->p_nlwps;
ki->p_realflag = ki->p_flag;
if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) {
vm = p->p_vmspace;
ki->p_vm_rssize = vm_resident_count(vm);
ki->p_vm_tsize = vm->vm_tsize;
ki->p_vm_dsize = vm->vm_dsize;
ki->p_vm_ssize = vm->vm_ssize;
ki->p_vm_vsize = atop(vm->vm_map.size);
/*
* Since the stack is initially mapped mostly with
* PROT_NONE and grown as needed, adjust the "mapped size"
* to skip the unused stack portion.
*/
ki->p_vm_msize =
atop(vm->vm_map.size) - vm->vm_issize + vm->vm_ssize;
/* Pick the primary (first) LWP */
l = proc_active_lwp(p);
KASSERT(l != NULL);
lwp_lock(l);
ki->p_nrlwps = p->p_nrlwps;
ki->p_forw = 0;
ki->p_back = 0;
COND_SET_VALUE(ki->p_addr, PTRTOUINT64(l->l_addr), allowaddr);
ki->p_stat = l->l_stat;
ki->p_flag |= sysctl_map_flags(sysctl_lwpflagmap, l->l_flag);
ki->p_swtime = l->l_swtime;
ki->p_slptime = l->l_slptime;
if (l->l_stat == LSONPROC)
ki->p_schedflags = l->l_cpu->ci_schedstate.spc_flags;
else
ki->p_schedflags = 0;
ki->p_priority = lwp_eprio(l);
ki->p_usrpri = l->l_priority;
if (l->l_wchan)
strncpy(ki->p_wmesg, l->l_wmesg, sizeof(ki->p_wmesg));
COND_SET_VALUE(ki->p_wchan, PTRTOUINT64(l->l_wchan), allowaddr);
ki->p_cpuid = cpu_index(l->l_cpu);
lwp_unlock(l);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
/* This is hardly correct, but... */
sigplusset(&l->l_sigpend.sp_set, &ss1);
sigplusset(&l->l_sigmask, &ss2);
ki->p_cpticks += l->l_cpticks;
ki->p_pctcpu += l->l_pctcpu;
ki->p_estcpu += l->l_estcpu;
}
}
sigplusset(&p->p_sigpend.sp_set, &ss1);
memcpy(&ki->p_siglist, &ss1, sizeof(ki_sigset_t));
memcpy(&ki->p_sigmask, &ss2, sizeof(ki_sigset_t));
if (p->p_session != NULL) {
ki->p_sid = p->p_session->s_sid;
ki->p__pgid = p->p_pgrp->pg_id;
if (p->p_session->s_ttyvp)
ki->p_eflag |= EPROC_CTTY;
if (SESS_LEADER(p))
ki->p_eflag |= EPROC_SLEADER;
strncpy(ki->p_login, p->p_session->s_login,
uimin(sizeof ki->p_login - 1, sizeof p->p_session->s_login));
ki->p_jobc = p->p_pgrp->pg_jobc;
if ((p->p_lflag & PL_CONTROLT) && (tp = p->p_session->s_ttyp)) {
ki->p_tdev = tp->t_dev;
ki->p_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
COND_SET_VALUE(ki->p_tsess, PTRTOUINT64(tp->t_session),
allowaddr);
} else {
ki->p_tdev = (int32_t)NODEV;
}
}
if (!P_ZOMBIE(p) && !zombie) {
ki->p_uvalid = 1;
ki->p_ustart_sec = p->p_stats->p_start.tv_sec;
ki->p_ustart_usec = p->p_stats->p_start.tv_usec;
calcru(p, &ut, &st, NULL, &rt);
ki->p_rtime_sec = rt.tv_sec;
ki->p_rtime_usec = rt.tv_usec;
ki->p_uutime_sec = ut.tv_sec;
ki->p_uutime_usec = ut.tv_usec;
ki->p_ustime_sec = st.tv_sec;
ki->p_ustime_usec = st.tv_usec;
memcpy(&ru, &p->p_stats->p_ru, sizeof(ru));
rulwps(p, &ru);
ki->p_uru_nvcsw = ru.ru_nvcsw;
ki->p_uru_nivcsw = ru.ru_nivcsw;
ki->p_uru_maxrss = ru.ru_maxrss;
ki->p_uru_ixrss = ru.ru_ixrss;
ki->p_uru_idrss = ru.ru_idrss;
ki->p_uru_isrss = ru.ru_isrss;
ki->p_uru_minflt = ru.ru_minflt;
ki->p_uru_majflt = ru.ru_majflt;
ki->p_uru_nswap = ru.ru_nswap;
ki->p_uru_inblock = ru.ru_inblock;
ki->p_uru_oublock = ru.ru_oublock;
ki->p_uru_msgsnd = ru.ru_msgsnd;
ki->p_uru_msgrcv = ru.ru_msgrcv;
ki->p_uru_nsignals = ru.ru_nsignals;
timeradd(&p->p_stats->p_cru.ru_utime,
&p->p_stats->p_cru.ru_stime, &ut);
ki->p_uctime_sec = ut.tv_sec;
ki->p_uctime_usec = ut.tv_usec;
}
}
int
proc_find_locked(struct lwp *l, struct proc **p, pid_t pid)
{
int error;
mutex_enter(&proc_lock);
if (pid == -1)
*p = l->l_proc;
else
*p = proc_find(pid);
if (*p == NULL) {
if (pid != -1)
mutex_exit(&proc_lock);
return ESRCH;
}
if (pid != -1)
mutex_enter((*p)->p_lock);
mutex_exit(&proc_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, *p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
if (error) {
if (pid != -1)
mutex_exit((*p)->p_lock);
}
return error;
}
static int
fill_pathname(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp)
{
int error;
struct proc *p;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
if (p->p_path == NULL) {
if (pid != -1)
mutex_exit(p->p_lock);
return ENOENT;
}
size_t len = strlen(p->p_path) + 1;
if (oldp != NULL) {
size_t copylen = uimin(len, *oldlenp);
error = sysctl_copyout(l, p->p_path, oldp, copylen);
if (error == 0 && *oldlenp < len)
error = ENOSPC;
}
*oldlenp = len;
if (pid != -1)
mutex_exit(p->p_lock);
return error;
}
static int
fill_cwd(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp)
{
int error;
struct proc *p;
char *path;
char *bp, *bend;
struct cwdinfo *cwdi;
struct vnode *vp;
size_t len, lenused;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
len = MAXPATHLEN * 4;
path = kmem_alloc(len, KM_SLEEP);
bp = &path[len];
bend = bp;
*(--bp) = '\0';
cwdi = p->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
vp = cwdi->cwdi_cdir;
error = getcwd_common(vp, NULL, &bp, path, len/2, 0, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto out;
lenused = bend - bp;
if (oldp != NULL) {
size_t copylen = uimin(lenused, *oldlenp);
error = sysctl_copyout(l, bp, oldp, copylen);
if (error == 0 && *oldlenp < lenused)
error = ENOSPC;
}
*oldlenp = lenused;
out:
if (pid != -1)
mutex_exit(p->p_lock);
kmem_free(path, len);
return error;
}
int
proc_getauxv(struct proc *p, void **buf, size_t *len)
{
struct ps_strings pss;
int error;
void *uauxv, *kauxv;
size_t size;
if ((error = copyin_psstrings(p, &pss)) != 0)
return error;
if (pss.ps_envstr == NULL)
return EIO;
size = p->p_execsw->es_arglen;
if (size == 0)
return EIO;
size_t ptrsz = PROC_PTRSZ(p);
uauxv = (void *)((char *)pss.ps_envstr + (pss.ps_nenvstr + 1) * ptrsz);
kauxv = kmem_alloc(size, KM_SLEEP);
error = copyin_proc(p, uauxv, kauxv, size);
if (error) {
kmem_free(kauxv, size);
return error;
}
*buf = kauxv;
*len = size;
return 0;
}
static int
sysctl_security_expose_address(SYSCTLFN_ARGS)
{
int expose_address, error;
struct sysctlnode node;
node = *rnode;
node.sysctl_data = &expose_address;
expose_address = *(int *)rnode->sysctl_data;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_KERNADDR,
0, NULL, NULL, NULL))
return EPERM;
switch (expose_address) {
case 0:
case 1:
case 2:
break;
default:
return EINVAL;
}
*(int *)rnode->sysctl_data = expose_address;
return 0;
}
bool
get_expose_address(struct proc *p)
{
/* allow only if sysctl variable is set or privileged */
return kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_KPTR), NULL, NULL) == 0;
}
/* $NetBSD: subr_iostat.c,v 1.25 2019/05/22 08:47:02 hannken Exp $ */
/* NetBSD: subr_disk.c,v 1.69 2005/05/29 22:24:15 christos Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_iostat.c,v 1.25 2019/05/22 08:47:02 hannken Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/iostat.h>
#include <sys/sysctl.h>
#include <sys/rwlock.h>
/*
* Function prototypes for sysctl nodes
*/
static int sysctl_hw_disknames(SYSCTLFN_PROTO);
static int sysctl_hw_iostatnames(SYSCTLFN_PROTO);
static int sysctl_hw_iostats(SYSCTLFN_PROTO);
static int
iostati_getnames(int disk_only, char *oldp, size_t *oldlenp, const void *newp,
u_int namelen);
/*
* A global list of all drives attached to the system. May grow or
* shrink over time.
*/
struct iostatlist_head iostatlist = TAILQ_HEAD_INITIALIZER(iostatlist);
int iostat_count; /* number of drives in global drivelist */
krwlock_t iostatlist_lock;
static void sysctl_io_stats_setup(struct sysctllog **);
/*
* Initialise the iostat subsystem.
*/
void
iostat_init(void)
{
rw_init(&iostatlist_lock);
sysctl_io_stats_setup(NULL);
}
/*
* Searches the iostatlist for the iostat corresponding to the
* name provided.
*/
struct io_stats *
iostat_find(const char *name)
{
struct io_stats *iostatp;
KASSERT(name != NULL);
rw_enter(&iostatlist_lock, RW_READER);
TAILQ_FOREACH(iostatp, &iostatlist, io_link) {
if (strcmp(iostatp->io_name, name) == 0) {
break;
}
}
rw_exit(&iostatlist_lock);
return iostatp;
}
/*
* Allocate and initialise memory for the i/o statistics.
*/
struct io_stats *
iostat_alloc(int32_t type, void *parent, const char *name)
{
struct io_stats *stats;
stats = kmem_zalloc(sizeof(*stats), KM_SLEEP);
stats->io_type = type;
stats->io_parent = parent;
(void)strlcpy(stats->io_name, name, sizeof(stats->io_name));
/*
* Set the attached timestamp.
*/
getmicrouptime(&stats->io_attachtime);
/*
* Link into the drivelist.
*/
rw_enter(&iostatlist_lock, RW_WRITER);
TAILQ_INSERT_TAIL(&iostatlist, stats, io_link);
iostat_count++;
rw_exit(&iostatlist_lock);
return stats;
}
/*
* Remove i/o from stats collection.
*/
void
iostat_free(struct io_stats *stats)
{
/*
* Remove from the iostat list.
*/
if (iostat_count == 0)
panic("iostat_free: iostat_count == 0");
rw_enter(&iostatlist_lock, RW_WRITER);
TAILQ_REMOVE(&iostatlist, stats, io_link);
iostat_count--;
rw_exit(&iostatlist_lock);
kmem_free(stats, sizeof(*stats));
}
/*
* Rename i/o stats.
*/
void
iostat_rename(struct io_stats *stats, const char *name)
{
rw_enter(&iostatlist_lock, RW_WRITER);
(void)strlcpy(stats->io_name, name, sizeof(stats->io_name));
rw_exit(&iostatlist_lock);
}
/*
* multiply timeval by unsigned integer and add to result
*/
static void
timermac(struct timeval *a, uint64_t count, struct timeval *res)
{
struct timeval part = *a;
while (count) {
if (count & 1)
timeradd(res, &part, res);
timeradd(&part, &part, &part);
count >>= 1;
}
}
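/*
 * Worked example: with count == 5 (binary 101) the loop above adds 1*a on
 * the first iteration, skips the second (bit clear), and adds 4*a on the
 * third, so res gains 5*a in total; a shift-and-add multiply on timevals.
 */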
/*
* Increment the iostat wait counter.
* Accumulate wait time and timesum.
*
* Wait time is spent in the device bufq.
*/
void
iostat_wait(struct io_stats *stats)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_wait >= 0);
getmicrouptime(&dv_time);
timersub(&dv_time, &stats->io_waitstamp, &diff_time);
count = stats->io_wait++;
if (count != 0) {
timermac(&diff_time, count, &stats->io_waitsum);
timeradd(&stats->io_waittime, &diff_time, &stats->io_waittime);
}
stats->io_waitstamp = dv_time;
}
/*
* Decrement the iostat wait counter.
* Increment the iostat busy counter.
* Accumulate wait and busy times and timesums.
*
* Busy time is spent being processed by the device.
*
* Old devices do not yet measure wait time, so skip
* processing it if the counter is still zero.
*/
void
iostat_busy(struct io_stats *stats)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_wait >= 0); /* > 0 when iostat_wait is used */
KASSERT(stats->io_busy >= 0);
getmicrouptime(&dv_time);
timersub(&dv_time, &stats->io_waitstamp, &diff_time);
if (stats->io_wait != 0) {
count = stats->io_wait--;
timermac(&diff_time, count, &stats->io_waitsum);
timeradd(&stats->io_waittime, &diff_time, &stats->io_waittime);
}
stats->io_waitstamp = dv_time;
timersub(&dv_time, &stats->io_busystamp, &diff_time);
count = stats->io_busy++;
if (count != 0) {
timermac(&diff_time, count, &stats->io_busysum);
timeradd(&stats->io_busytime, &diff_time, &stats->io_busytime);
}
stats->io_busystamp = dv_time;
}
/*
* Decrement the iostat busy counter, increment the byte count.
* Accumulate busy time and timesum.
*/
void
iostat_unbusy(struct io_stats *stats, long bcount, int read)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_busy > 0);
getmicrouptime(&dv_time);
stats->io_timestamp = dv_time;
/* any op */
timersub(&dv_time, &stats->io_busystamp, &diff_time);
count = stats->io_busy--;
timermac(&diff_time, count, &stats->io_busysum);
timeradd(&stats->io_busytime, &diff_time, &stats->io_busytime);
stats->io_busystamp = dv_time;
if (bcount > 0) {
if (read) {
stats->io_rbytes += bcount;
stats->io_rxfer++;
} else {
stats->io_wbytes += bcount;
stats->io_wxfer++;
}
}
}
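/*
 * Hedged usage sketch for a block driver: allocate a handle at attach time
 * and bracket each transfer with the hooks above.  The softc field names and
 * buffer fields here are illustrative only.
 */
#if 0
	/* At attach: */
	sc->sc_stats = iostat_alloc(IOSTAT_DISK, sc, device_xname(sc->sc_dev));

	/* When a buffer is queued on the bufq: */
	iostat_wait(sc->sc_stats);

	/* When the transfer actually starts on the hardware: */
	iostat_busy(sc->sc_stats);

	/* On completion, with bp the finished buffer: */
	iostat_unbusy(sc->sc_stats, bp->b_bcount - bp->b_resid,
	    (bp->b_flags & B_READ) != 0);

	/* At detach: */
	iostat_free(sc->sc_stats);
#endif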
/*
* Return non-zero if a device has an I/O request in flight.
*/
bool
iostat_isbusy(struct io_stats *stats)
{
return stats->io_busy != 0;
}
/*
* Increment the seek counter. This does look almost redundant but it
* abstracts the stats gathering.
*/
void
iostat_seek(struct io_stats *stats)
{
stats->io_seek++;
}
static int
sysctl_hw_disknames(SYSCTLFN_ARGS)
{
return iostati_getnames(1, oldp, oldlenp, newp, namelen);
}
static int
sysctl_hw_iostatnames(SYSCTLFN_ARGS)
{
return iostati_getnames(0, oldp, oldlenp, newp, namelen);
}
static int
iostati_getnames(int disk_only, char *oldp, size_t *oldlenp, const void *newp,
u_int namelen)
{
char bf[IOSTATNAMELEN + 1];
char *where = oldp;
struct io_stats *stats;
size_t needed, left, slen;
int error, first;
if (newp != NULL)
return (EPERM);
if (namelen != 0)
return (EINVAL);
first = 1;
error = 0;
needed = 0;
left = *oldlenp;
rw_enter(&iostatlist_lock, RW_READER);
for (stats = TAILQ_FIRST(&iostatlist); stats != NULL;
stats = TAILQ_NEXT(stats, io_link)) {
if ((disk_only == 1) && (stats->io_type != IOSTAT_DISK))
continue;
if (where == NULL)
needed += strlen(stats->io_name) + 1;
else {
memset(bf, 0, sizeof(bf));
if (first) {
strncpy(bf, stats->io_name, sizeof(bf));
first = 0;
} else {
bf[0] = ' ';
strncpy(bf + 1, stats->io_name,
sizeof(bf) - 1);
}
bf[IOSTATNAMELEN] = '\0';
slen = strlen(bf);
if (left < slen + 1)
break;
/* +1 to copy out the trailing NUL byte */
error = copyout(bf, where, slen + 1);
if (error)
break;
where += slen;
needed += slen;
left -= slen;
}
}
rw_exit(&iostatlist_lock);
*oldlenp = needed;
return (error);
}
static int
sysctl_hw_iostats(SYSCTLFN_ARGS)
{
struct io_sysctl sdrive;
struct io_stats *stats;
char *where = oldp;
size_t tocopy, left;
int error;
if (newp != NULL)
return (EPERM);
/*
* The original hw.diskstats call was broken and did not require
* the userland to pass in its size of struct disk_sysctl. This
* was fixed after NetBSD 1.6 was released.
*/
if (namelen == 0)
tocopy = offsetof(struct io_sysctl, busy);
else
tocopy = name[0];
if (where == NULL) {
*oldlenp = iostat_count * tocopy;
return (0);
}
error = 0;
left = *oldlenp;
memset(&sdrive, 0, sizeof(sdrive));
*oldlenp = 0;
rw_enter(&iostatlist_lock, RW_READER);
TAILQ_FOREACH(stats, &iostatlist, io_link) {
if (left < tocopy)
break;
strncpy(sdrive.name, stats->io_name, sizeof(sdrive.name));
sdrive.attachtime_sec = stats->io_attachtime.tv_sec;
sdrive.attachtime_usec = stats->io_attachtime.tv_usec;
sdrive.timestamp_sec = stats->io_busystamp.tv_sec;
sdrive.timestamp_usec = stats->io_busystamp.tv_usec;
sdrive.time_sec = stats->io_busytime.tv_sec;
sdrive.time_usec = stats->io_busytime.tv_usec;
sdrive.seek = stats->io_seek;
sdrive.rxfer = stats->io_rxfer;
sdrive.wxfer = stats->io_wxfer;
sdrive.xfer = stats->io_rxfer + stats->io_wxfer;
sdrive.rbytes = stats->io_rbytes;
sdrive.wbytes = stats->io_wbytes;
sdrive.bytes = stats->io_rbytes + stats->io_wbytes;
sdrive.wait_sec = stats->io_waittime.tv_sec;
sdrive.wait_usec = stats->io_waittime.tv_usec;
sdrive.time_sec = stats->io_busytime.tv_sec;
sdrive.time_usec = stats->io_busytime.tv_usec;
sdrive.waitsum_sec = stats->io_waitsum.tv_sec;
sdrive.waitsum_usec = stats->io_waitsum.tv_usec;
sdrive.busysum_sec = stats->io_busysum.tv_sec;
sdrive.busysum_usec = stats->io_busysum.tv_usec;
sdrive.busy = stats->io_busy;
error = copyout(&sdrive, where, uimin(tocopy, sizeof(sdrive)));
if (error)
break;
where += tocopy;
*oldlenp += tocopy;
left -= tocopy;
}
rw_exit(&iostatlist_lock);
return (error);
}
static void
sysctl_io_stats_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "disknames",
SYSCTL_DESCR("List of disk drives present"),
sysctl_hw_disknames, 0, NULL, 0,
CTL_HW, HW_DISKNAMES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "iostatnames",
SYSCTL_DESCR("I/O stats are being collected for these"
" devices"),
sysctl_hw_iostatnames, 0, NULL, 0,
CTL_HW, HW_IOSTATNAMES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "iostats",
SYSCTL_DESCR("Statistics on device I/O operations"),
sysctl_hw_iostats, 0, NULL, 0,
CTL_HW, HW_IOSTATS, CTL_EOL);
}
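/*
 * Userland usage sketch (hedged, illustrative only): the hw.disknames node
 * created above returns a space-separated list of names; size the buffer
 * first, then fetch.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len;
	char *names;

	if (sysctlbyname("hw.disknames", NULL, &len, NULL, 0) == -1)
		return 1;
	if ((names = malloc(len)) == NULL)
		return 1;
	if (sysctlbyname("hw.disknames", names, &len, NULL, 0) == -1)
		return 1;
	printf("%s\n", names);
	free(names);
	return 0;
}
#endif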
/* $NetBSD: raw_usrreq.c,v 1.65 2022/09/02 23:48:11 thorpej Exp $ */
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_usrreq.c 8.1 (Berkeley) 6/10/93
*/
/*
* Raw protocol interface.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_usrreq.c,v 1.65 2022/09/02 23:48:11 thorpej Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
static inline int
equal(const struct sockaddr *a1, const struct sockaddr *a2)
{
return memcmp(a1, a2, a1->sa_len) == 0;
}
/*
* raw_input: find the socket associated with the packet and move it over.
* If nothing exists for this packet, drop it.
*/
void
raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src,
struct sockaddr *dst, struct rawcbhead *rawcbhead)
{
struct rawcb *rp;
struct mbuf *m = m0;
struct socket *last;
last = NULL;
LIST_FOREACH(rp, rawcbhead, rcb_list) {
if (rp->rcb_proto.sp_family != proto->sp_family)
continue;
if (rp->rcb_proto.sp_protocol &&
rp->rcb_proto.sp_protocol != proto->sp_protocol)
continue;
/*
* We assume the lower level routines have
* placed the address in a canonical format
* suitable for a structure comparison.
*
* Note that if the lengths are not the same
* the comparison will fail at the first byte.
*/
if (rp->rcb_laddr && !equal(rp->rcb_laddr, dst))
continue;
if (rp->rcb_faddr && !equal(rp->rcb_faddr, src))
continue;
/* Run any filtering that may have been installed. */
if (rp->rcb_filter != NULL && rp->rcb_filter(m, proto, rp) != 0)
continue;
if (last != NULL) {
struct mbuf *n;
if ((n = m_copypacket(m, M_DONTWAIT)) == NULL ||
sbappendaddr(&last->so_rcv, src, n, NULL) == 0)
{
if (n != NULL)
m_freem(n);
soroverflow(last);
} else
sorwakeup(last);
}
last = rp->rcb_socket;
}
if (last != NULL) {
if (sbappendaddr(&last->so_rcv, src, m, NULL) == 0) {
m_freem(m);
soroverflow(last);
} else
sorwakeup(last);
} else {
m_freem(m);
}
}
void *
raw_ctlinput(int cmd, const struct sockaddr *arg, void *d)
{
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
return NULL;
/* INCOMPLETE */
}
void
raw_setsockaddr(struct rawcb *rp, struct sockaddr *nam)
{
memcpy(nam, rp->rcb_laddr, rp->rcb_laddr->sa_len);
}
void
raw_setpeeraddr(struct rawcb *rp, struct sockaddr *nam)
{
memcpy(nam, rp->rcb_faddr, rp->rcb_faddr->sa_len);
}
int
raw_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l,
int (*output)(struct mbuf *, struct socket *))
{
struct rawcb *rp = sotorawcb(so);
int error = 0;
KASSERT(rp != NULL);
/*
* Ship a packet out. The appropriate raw output
* routine handles any massaging necessary.
*/
if (control && control->m_len) {
m_freem(control);
m_freem(m);
return EINVAL;
}
if (nam) {
if ((so->so_state & SS_ISCONNECTED) != 0) {
error = EISCONN;
goto die;
}
error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
if (error) {
die:
m_freem(m);
return error;
}
} else {
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
goto die;
}
}
error = (*output)(m, so);
if (nam)
raw_disconnect(rp);
return error;
}
int
raw_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
struct mbuf *control, struct lwp *l)
{
KASSERT(req != PRU_ATTACH);
KASSERT(req != PRU_DETACH);
KASSERT(req != PRU_ACCEPT);
KASSERT(req != PRU_BIND);
KASSERT(req != PRU_LISTEN);
KASSERT(req != PRU_CONNECT);
KASSERT(req != PRU_CONNECT2);
KASSERT(req != PRU_DISCONNECT);
KASSERT(req != PRU_SHUTDOWN);
KASSERT(req != PRU_ABORT);
KASSERT(req != PRU_CONTROL);
KASSERT(req != PRU_SENSE);
KASSERT(req != PRU_PEERADDR);
KASSERT(req != PRU_SOCKADDR);
KASSERT(req != PRU_RCVD);
KASSERT(req != PRU_RCVOOB);
KASSERT(req != PRU_SEND);
KASSERT(req != PRU_SENDOOB);
KASSERT(req != PRU_PURGEIF);
if (sotorawcb(so) == NULL)
return EINVAL;
panic("raw_usrreq");
return 0;
}
/* $NetBSD: ufs_bmap.c,v 1.54 2022/11/17 06:40:40 chs Exp $ */
/*
* Copyright (c) 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.8 (Berkeley) 8/11/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.54 2022/11/17 06:40:40 chs Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/trace.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
static bool
ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1)
{
/* For UFS, blocks in a hole are not 'contiguous'. */
if (daddr0 == 0)
return false;
return (daddr0 + ump->um_seqinc == daddr1);
}
/*
* Bmap converts the logical block number of a file to its physical block
* number on the disk. The conversion is done by using the logical block
* number to index into the array of block pointers described by the dinode.
*/
int
ufs_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
int error;
/*
* Check for underlying vnode requests and ensure that logical
* to physical mapping is requested.
*/
if (ap->a_vpp != NULL)
*ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
if (ap->a_bnp == NULL)
return (0);
error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
ap->a_runp, ufs_issequential);
return error;
}
/*
* Indirect blocks are now on the vnode for the file. They are given negative
* logical block numbers. Indirect blocks are addressed by the negative
* address of the first data block to which they point. Double indirect blocks
* are addressed by one less than the address of the first indirect block to
* which they point. Triple indirect blocks are addressed by one less than
* the address of the first double indirect block to which they point.
*
* ufs_bmaparray does the bmap conversion, and if requested returns the
* array of logical blocks which must be traversed to get to a block.
* Each entry contains the offset into that block that gets you to the
* next block and the disk address of the block (if it is assigned).
*/
int
ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
int *nump, int *runp, ufs_issequential_callback_t is_sequential)
{
struct inode *ip;
struct buf *bp, *cbp;
struct ufsmount *ump;
struct mount *mp;
struct indir a[UFS_NIADDR + 1], *xap;
daddr_t daddr;
daddr_t metalbn;
int error, maxrun = 0, num;
ip = VTOI(vp);
mp = vp->v_mount;
ump = ip->i_ump;
KASSERTMSG(((ap == NULL) == (nump == NULL)),
"ufs_bmaparray: invalid arguments: ap = %p, nump = %p", ap, nump);
if (runp) {
/*
* XXX
* If MAXBSIZE is the largest transfer the disks can handle,
* we probably want maxrun to be 1 block less so that we
* don't create a block larger than the device can handle.
*/
*runp = 0;
maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1;
}
if (bn >= 0 && bn < UFS_NDADDR) {
if (nump != NULL)
*nump = 0;
if (ump->um_fstype == UFS1)
daddr = ufs_rw32(ip->i_ffs1_db[bn],
UFS_MPNEEDSWAP(ump));
else
daddr = ufs_rw64(ip->i_ffs2_db[bn],
UFS_MPNEEDSWAP(ump));
*bnp = blkptrtodb(ump, daddr);
/*
* Since this is FFS independent code, we are out of
* scope for the definitions of BLK_NOCOPY and
* BLK_SNAP, but we do know that they will fall in
* the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts
* are made to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT && daddr > 0 &&
daddr < ump->um_seqinc) {
*bnp = -1;
} else if (*bnp == 0) {
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
== SF_SNAPSHOT) {
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
} else {
*bnp = -1;
}
} else if (runp) {
if (ump->um_fstype == UFS1) {
for (++bn; bn < UFS_NDADDR && *runp < maxrun &&
is_sequential(ump,
ufs_rw32(ip->i_ffs1_db[bn - 1],
UFS_MPNEEDSWAP(ump)),
ufs_rw32(ip->i_ffs1_db[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
} else {
for (++bn; bn < UFS_NDADDR && *runp < maxrun &&
is_sequential(ump,
ufs_rw64(ip->i_ffs2_db[bn - 1],
UFS_MPNEEDSWAP(ump)),
ufs_rw64(ip->i_ffs2_db[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
}
}
return (0);
} else if (bn < 0 && bn >= -UFS_NXADDR) {
KASSERT(ump->um_fstype == UFS2 && (ump->um_flags & UFS_EA) != 0);
daddr = ufs_rw64(ip->i_ffs2_extb[-1 - bn], UFS_MPNEEDSWAP(ump));
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0)
*bnp = -1;
return 0;
}
xap = ap == NULL ? a : ap;
if (!nump)
nump = #
if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
return (error);
num = *nump;
/* Get disk address out of indirect block array */
if (ump->um_fstype == UFS1)
daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off],
UFS_MPNEEDSWAP(ump));
else
daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off],
UFS_MPNEEDSWAP(ump));
for (bp = NULL, ++xap; --num; ++xap) {
/*
* Exit the loop if there is no disk address assigned yet and
* the indirect block isn't in the cache, or if we were
* looking for an indirect block and we've found it.
*/
metalbn = xap->in_lbn;
if (metalbn == bn)
break;
if (daddr == 0) {
mutex_enter(&bufcache_lock);
cbp = incore(vp, metalbn);
mutex_exit(&bufcache_lock);
if (cbp == NULL)
break;
}
/*
* If we get here, we've either got the block in the cache
* or we have a disk address for it, go fetch it.
*/
if (bp)
brelse(bp, 0);
xap->in_exists = 1;
bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
if (bp == NULL) {
/*
* getblk() above returns NULL only if we are the
* pagedaemon.  See the implementation of getblk()
* for details.
*/
return (ENOMEM);
}
if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
trace(TR_BREADHIT, pack(vp, size), metalbn);
} else {
KASSERTMSG((daddr != 0),
"ufs_bmaparray: indirect block not in cache");
trace(TR_BREADMISS, pack(vp, size), metalbn);
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
curlwp->l_ru.ru_inblock++; /* XXX */
if ((error = biowait(bp)) != 0) {
brelse(bp, 0);
return (error);
}
}
if (ump->um_fstype == UFS1) {
daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off],
UFS_MPNEEDSWAP(ump));
if (num == 1 && daddr && runp) {
for (bn = xap->in_off + 1;
bn < MNINDIR(ump) && *runp < maxrun &&
is_sequential(ump,
ufs_rw32(((int32_t *)bp->b_data)[bn-1],
UFS_MPNEEDSWAP(ump)),
ufs_rw32(((int32_t *)bp->b_data)[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
}
} else {
daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off],
UFS_MPNEEDSWAP(ump));
if (num == 1 && daddr && runp) {
for (bn = xap->in_off + 1;
bn < MNINDIR(ump) && *runp < maxrun &&
is_sequential(ump,
ufs_rw64(((int64_t *)bp->b_data)[bn-1],
UFS_MPNEEDSWAP(ump)),
ufs_rw64(((int64_t *)bp->b_data)[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
}
}
}
if (bp)
brelse(bp, 0);
/*
* Since this is FFS independent code, we are out of scope for the
* definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
* will fall in the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts are made
* to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT &&
daddr > 0 && daddr < ump->um_seqinc) {
*bnp = -1;
return (0);
}
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
== SF_SNAPSHOT) {
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
} else {
*bnp = -1;
}
}
return (0);
}
/*
* Create an array of logical block number/offset pairs which represent the
* path of indirect blocks required to access a data block. The first "pair"
* contains the logical block number of the appropriate single, double or
* triple indirect block and the offset into the inode indirect block array.
* Note, the logical block number of the inode single/double/triple indirect
* block appears twice in the array, once with the offset into the i_ffs1_ib and
* once with the offset into the page itself.
*/
int
ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump)
{
daddr_t metalbn, realbn;
struct ufsmount *ump;
int64_t blockcnt;
int lbc;
int i, numlevels, off;
ump = VFSTOUFS(vp->v_mount);
if (nump)
*nump = 0;
numlevels = 0;
realbn = bn;
if (bn < 0)
bn = -bn;
KASSERT(bn >= UFS_NDADDR);
/*
* Determine the number of levels of indirection. After this loop
* is done, blockcnt indicates the number of data blocks possible
* at the given level of indirection, and UFS_NIADDR - i is the number
* of levels of indirection needed to locate the requested block.
*/
bn -= UFS_NDADDR;
for (lbc = 0, i = UFS_NIADDR;; i--, bn -= blockcnt) {
if (i == 0)
return (EFBIG);
lbc += ump->um_lognindir;
blockcnt = (int64_t)1 << lbc;
if (bn < blockcnt)
break;
}
/* Calculate the address of the first meta-block. */
metalbn = -((realbn >= 0 ? realbn : -realbn) - bn + UFS_NIADDR - i);
/*
* At each iteration, off is the offset into the bap array which is
* an array of disk addresses at the current level of indirection.
* The logical block number and the offset in that block are stored
* into the argument array.
*/
ap->in_lbn = metalbn;
ap->in_off = off = UFS_NIADDR - i;
ap->in_exists = 0;
ap++;
for (++numlevels; i <= UFS_NIADDR; i++) {
/* If searching for a meta-data block, quit when found. */
if (metalbn == realbn)
break;
lbc -= ump->um_lognindir;
off = (bn >> lbc) & (MNINDIR(ump) - 1);
++numlevels;
ap->in_lbn = metalbn;
ap->in_off = off;
ap->in_exists = 0;
++ap;
metalbn -= -1 + ((int64_t)off << lbc);
}
if (nump)
*nump = numlevels;
return (0);
}
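/*
 * Worked example (hedged, assuming UFS_NDADDR == 12 and MNINDIR(ump) ==
 * 4096): for bn == 12, the first singly-indirected data block, the loops
 * above find one level of indirection and return numlevels == 2 with
 * entries { -12, 0 } (the single indirect block, offset 0 into i_ffs1_ib)
 * and { -12, 0 } (the same block, offset 0 within its contents), matching
 * the note above that the inode-level indirect block appears twice.
 */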
/* $NetBSD: null_vfsops.c,v 1.101 2023/02/06 10:32:58 hannken Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Null file-system: VFS operations.
*
* See null_vnops.c for a description.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: null_vfsops.c,v 1.101 2023/02/06 10:32:58 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/module.h>
#include <miscfs/nullfs/null.h>
#include <miscfs/genfs/layer_extern.h>
MODULE(MODULE_CLASS_VFS, null, "layerfs");
VFS_PROTOS(nullfs);
int
nullfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct vnode *lowerrootvp, *vp;
struct null_args *args = data;
struct null_mount *nmp;
struct layer_mount *lmp;
struct pathbuf *pb;
struct nameidata nd;
int error;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof(*args))
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
lmp = MOUNTTOLAYERMOUNT(mp);
if (lmp == NULL)
return EIO;
args->la.target = NULL;
*data_len = sizeof(*args);
return 0;
}
/* Update is not supported. */
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/* Find the lower vnode and lock it. */
error = pathbuf_copyin(args->la.target, &pb);
if (error) {
return error;
}
NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
return error;
}
lowerrootvp = nd.ni_vp;
pathbuf_destroy(pb);
/* Create the mount point. */
nmp = kmem_zalloc(sizeof(struct null_mount), KM_SLEEP);
mp->mnt_data = nmp;
mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_MPSAFE;
mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_SHRLOOKUP;
/*
* Make sure that the mount point is sufficiently initialized
* that the node create call will work.
*/
vfs_getnewfsid(mp);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(nmp, sizeof(struct null_mount));
return error;
}
nmp->nullm_size = sizeof(struct null_node);
nmp->nullm_tag = VT_NULL;
nmp->nullm_bypass = layer_bypass;
nmp->nullm_vnodeop_p = null_vnodeop_p;
/* Setup a null node for root vnode. */
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
if (error) {
vrele(lowerrootvp);
kmem_free(nmp, sizeof(struct null_mount));
return error;
}
/*
* Keep a held reference to the root vnode. It will be released on
* umount. Note: nullfs is MP-safe.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
nmp->nullm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->la.target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, curlwp);
if (error)
return error;
if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
mp->mnt_flag |= MNT_LOCAL;
return 0;
}
int
nullfs_unmount(struct mount *mp, int mntflags)
{
struct null_mount *nmp = MOUNTTONULLMOUNT(mp);
struct vnode *null_rootvp = nmp->nullm_rootvp;
int error, flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(null_rootvp) > 1 && (mntflags & MNT_FORCE) == 0)
return EBUSY;
if ((error = vflush(mp, null_rootvp, flags)) != 0)
return error;
/* Eliminate all activity and release the vnode. */
vgone(null_rootvp);
/* Finally, destroy the mount point structures. */
kmem_free(mp->mnt_data, sizeof(struct null_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc null_vnodeop_opv_desc;
const struct vnodeopv_desc * const nullfs_vnodeopv_descs[] = {
&null_vnodeop_opv_desc,
NULL,
};
struct vfsops nullfs_vfsops = {
.vfs_name = MOUNT_NULL,
.vfs_min_mount_data = sizeof (struct null_args),
.vfs_mount = nullfs_mount,
.vfs_start = layerfs_start,
.vfs_unmount = nullfs_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = nullfs_vnodeopv_descs
};
SYSCTL_SETUP(nullfs_sysctl_setup, "nullfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "null",
SYSCTL_DESCR("Loopback file system"),
NULL, 0, NULL, 0,
CTL_VFS, 9, CTL_EOL);
/*
* XXX the "9" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "9" is the order as taken from sys/mount.h
*/
}
static int
null_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&nullfs_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&nullfs_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: subr_pserialize.c,v 1.24 2023/10/04 20:28:06 ad Exp $ */
/*-
* Copyright (c) 2010, 2011, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Passive serialization.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pserialize.c,v 1.24 2023/10/04 20:28:06 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/evcnt.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <sys/xcall.h>
struct pserialize {
char psz_dummy;
};
static kmutex_t psz_lock __cacheline_aligned;
static struct evcnt psz_ev_excl __cacheline_aligned =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pserialize", "exclusive access");
EVCNT_ATTACH_STATIC(psz_ev_excl);
/*
* pserialize_init:
*
* Initialize passive serialization structures.
*/
void
pserialize_init(void)
{
mutex_init(&psz_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* pserialize_create:
*
* Create and initialize a passive serialization object.
*/
pserialize_t
pserialize_create(void)
{
pserialize_t psz;
psz = kmem_zalloc(sizeof(*psz), KM_SLEEP);
return psz;
}
/*
* pserialize_destroy:
*
* Destroy a passive serialization object.
*/
void
pserialize_destroy(pserialize_t psz)
{
kmem_free(psz, sizeof(*psz));
}
/*
* pserialize_perform:
*
* Perform the write side of passive serialization.
*/
void
pserialize_perform(pserialize_t psz)
{
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (__predict_false(panicstr != NULL)) {
return;
}
if (__predict_false(mp_online == false)) {
psz_ev_excl.ev_count++;
return;
}
/*
* Broadcast a NOP to all CPUs and wait until all of them complete.
*/
xc_barrier(XC_HIGHPRI);
mutex_enter(&psz_lock);
psz_ev_excl.ev_count++;
mutex_exit(&psz_lock);
}
int
pserialize_read_enter(void)
{
int s;
s = splsoftserial();
curcpu()->ci_psz_read_depth++;
__insn_barrier();
return s;
}
void
pserialize_read_exit(int s)
{
KASSERT(__predict_false(cold) || kpreempt_disabled());
__insn_barrier();
if (__predict_false(curcpu()->ci_psz_read_depth-- == 0))
panic("mismatching pserialize_read_exit()"); splx(s);
}
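/*
 * Illustrative sketch of the usual pserialize(9) pairing; the list,
 * lock and helper names are hypothetical, not taken from this file.
 * Readers wrap a lockless lookup in a read section; the updater removes
 * the element under its lock, calls pserialize_perform() to wait out
 * current readers, and only then frees the memory.
 *
 *	reader:
 *		s = pserialize_read_enter();
 *		e = lookup_lockless(&list, key);
 *		if (e != NULL)
 *			use(e);
 *		pserialize_read_exit(s);
 *
 *	updater:
 *		mutex_enter(&list_lock);
 *		remove_lockless(&list, e);
 *		pserialize_perform(psz);
 *		mutex_exit(&list_lock);
 *		kmem_free(e, sizeof(*e));
 */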
/*
* pserialize_in_read_section:
*
* True if the caller is in a pserialize read section. To be used
* only for diagnostic assertions where we want to guarantee the
* condition like:
*
* KASSERT(pserialize_in_read_section());
*/
bool
pserialize_in_read_section(void)
{
return kpreempt_disabled() && curcpu()->ci_psz_read_depth > 0;
}
/*
* pserialize_not_in_read_section:
*
* True if the caller is not in a pserialize read section. To be
* used only for diagnostic assertions where we want to guarantee
* the condition like:
*
* KASSERT(pserialize_not_in_read_section());
*/
bool
pserialize_not_in_read_section(void)
{
bool notin;
long pctr;
pctr = lwp_pctr();
notin = __predict_true(curcpu()->ci_psz_read_depth == 0);
/*
* If we had a context switch, we're definitely not in a
* pserialize read section because pserialize read sections
* block preemption.
*/
if (__predict_false(pctr != lwp_pctr()))
notin = true;
return notin;
}
/* $NetBSD: strnlen.c,v 1.2 2014/01/09 11:25:11 apb Exp $ */
/*-
* Copyright (c) 2009 David Schultz <das@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strnlen.c,v 1.2 2014/01/09 11:25:11 apb Exp $");
#endif /* LIBC_SCCS and not lint */
/* FreeBSD: src/lib/libc/string/strnlen.c,v 1.1 2009/02/28 06:00:58 das Exp */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
#if !HAVE_STRNLEN
size_t
strnlen(const char *s, size_t maxlen)
{
size_t len;
for (len = 0; len < maxlen; len++, s++) {
if (!*s)
break;
}
return (len);
}
#endif /* !HAVE_STRNLEN */
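/*
 * Illustrative sketch (hypothetical buffer): because the scan is bounded
 * by maxlen, strnlen() is safe on fields that may lack a terminating NUL,
 * e.g. printing a fixed-width record field:
 *
 *	char name[16];	(possibly not NUL-terminated)
 *
 *	printf("%.*s\n", (int)strnlen(name, sizeof(name)), name);
 */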
/* $NetBSD: prop_stack.c,v 1.3 2019/05/08 02:25:50 thorpej Exp $ */
/*-
* Copyright (c) 2007 Joerg Sonnenberger <joerg@NetBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "prop_object_impl.h"
#include "prop_stack.h"
void
_prop_stack_init(prop_stack_t stack)
{
stack->used_intern_elems = 0;
SLIST_INIT(&stack->extern_elems);
}
bool
_prop_stack_push(prop_stack_t stack, prop_object_t obj, void *data1,
void *data2, void *data3)
{
struct _prop_stack_extern_elem *eelem;
struct _prop_stack_intern_elem *ielem;
if (stack->used_intern_elems == PROP_STACK_INTERN_ELEMS) {
eelem = _PROP_MALLOC(sizeof(*eelem), M_TEMP);
if (eelem == NULL)
return false;
eelem->object = obj;
eelem->object_data[0] = data1;
eelem->object_data[1] = data2;
eelem->object_data[2] = data3;
SLIST_INSERT_HEAD(&stack->extern_elems, eelem, stack_link);
return true;
}
_PROP_ASSERT(stack->used_intern_elems < PROP_STACK_INTERN_ELEMS);
_PROP_ASSERT(SLIST_EMPTY(&stack->extern_elems));
ielem = &stack->intern_elems[stack->used_intern_elems];
ielem->object = obj;
ielem->object_data[0] = data1;
ielem->object_data[1] = data2;
ielem->object_data[2] = data3;
++stack->used_intern_elems;
return true;
}
bool
_prop_stack_pop(prop_stack_t stack, prop_object_t *obj, void **data1,
void **data2, void **data3)
{
struct _prop_stack_extern_elem *eelem;
struct _prop_stack_intern_elem *ielem;
if (stack->used_intern_elems == 0)
return false;
if ((eelem = SLIST_FIRST(&stack->extern_elems)) != NULL) {
_PROP_ASSERT(stack->used_intern_elems == PROP_STACK_INTERN_ELEMS);
SLIST_REMOVE_HEAD(&stack->extern_elems, stack_link);
if (obj)
*obj = eelem->object;
if (data1)
*data1 = eelem->object_data[0];
if (data2)
*data2 = eelem->object_data[1];
if (data3)
*data3 = eelem->object_data[2];
_PROP_FREE(eelem, M_TEMP);
return true;
}
--stack->used_intern_elems;
ielem = &stack->intern_elems[stack->used_intern_elems];
if (obj)
*obj = ielem->object;
if (data1)
*data1 = ielem->object_data[0];
if (data2)
*data2 = ielem->object_data[1];
if (data3)
*data3 = ielem->object_data[2];
return true;
}
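/*
 * Illustrative sketch (hypothetical caller): the stack turns recursive
 * traversal of nested property objects into iteration.  A caller pushes
 * the current object together with up to three words of cursor state
 * before descending, and pops them to resume once the child is done.
 * The variable names below are assumptions about the caller, not code
 * from this file.
 *
 *	struct _prop_stack stack;
 *	prop_object_t obj;
 *	void *cursor;
 *
 *	_prop_stack_init(&stack);
 *	if (!_prop_stack_push(&stack, parent, cursor, NULL, NULL))
 *		return false;
 *	...
 *	while (_prop_stack_pop(&stack, &obj, &cursor, NULL, NULL)) {
 *		resume iteration of 'obj' at 'cursor'
 *	}
 */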
/* $NetBSD: sys_generic.c,v 1.134 2022/07/10 23:12:12 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
*/
/*
* System calls relating to files.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.134 2022/07/10 23:12:12 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/ktrace.h>
#include <sys/atomic.h>
#include <sys/disklabel.h>
/*
* Read system call.
*/
/* ARGSUSED */
int
sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(void *) buf;
syscallarg(size_t) nbyte;
} */
file_t *fp;
int fd;
fd = SCARG(uap, fd);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
if ((fp->f_flag & FREAD) == 0) {
fd_putfile(fd);
return (EBADF);
}
/* dofileread() will unuse the descriptor for us */
return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
&fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
int
dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
off_t *offset, int flags, register_t *retval)
{
struct iovec aiov;
struct uio auio;
size_t cnt;
int error;
lwp_t *l;
l = curlwp;
aiov.iov_base = (void *)buf;
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = nbyte;
auio.uio_rw = UIO_READ;
auio.uio_vmspace = l->l_proc->p_vmspace;
/*
* Reads return ssize_t because -1 is returned on error. Therefore
* we must restrict the length to SSIZE_MAX to avoid garbage return
* values.
*/
if (auio.uio_resid > SSIZE_MAX) {
error = EINVAL;
goto out;
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
cnt -= auio.uio_resid;
ktrgenio(fd, UIO_READ, buf, cnt, error);
*retval = cnt;
out:
fd_putfile(fd);
return (error);
}
/*
* Scatter read system call.
*/
int
sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct iovec *) iovp;
syscallarg(int) iovcnt;
} */
return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
off_t *offset, int flags, register_t *retval)
{
struct uio auio;
struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
int i, error;
size_t cnt;
u_int iovlen;
struct file *fp;
struct iovec *ktriov = NULL;
if (iovcnt == 0)
return EINVAL;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((fp->f_flag & FREAD) == 0) {
fd_putfile(fd);
return EBADF;
}
if (offset == NULL)
offset = &fp->f_offset;
else {
/*
* Caller must not specify &fp->f_offset -- we can't
* safely dereference it for the call to fo_seek
* without holding some underlying object lock.
*/
KASSERT(offset != &fp->f_offset);
if (fp->f_ops->fo_seek == NULL) {
error = ESPIPE;
goto out;
}
error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL,
0);
if (error != 0)
goto out;
}
iovlen = iovcnt * sizeof(struct iovec);
if (flags & FOF_IOV_SYSSPACE)
iov = __UNCONST(iovp);
else {
iov = aiov;
if ((u_int)iovcnt > UIO_SMALLIOV) {
if ((u_int)iovcnt > IOV_MAX) {
error = EINVAL;
goto out;
}
iov = kmem_alloc(iovlen, KM_SLEEP);
needfree = iov;
}
error = copyin(iovp, iov, iovlen);
if (error)
goto done;
}
auio.uio_iov = iov;
auio.uio_iovcnt = iovcnt;
auio.uio_rw = UIO_READ;
auio.uio_vmspace = curproc->p_vmspace;
auio.uio_resid = 0;
for (i = 0; i < iovcnt; i++, iov++) {
auio.uio_resid += iov->iov_len;
/*
* Reads return ssize_t because -1 is returned on error.
* Therefore we must restrict the length to SSIZE_MAX to
* avoid garbage return values.
*/
if (iov->iov_len > SSIZE_MAX ||
auio.uio_resid > SSIZE_MAX - iov->iov_len) {
error = EINVAL;
goto done;
}
}
/*
* if tracing, save a copy of iovec
*/
if (ktrpoint(KTR_GENIO)) {
ktriov = kmem_alloc(iovlen, KM_SLEEP);
memcpy(ktriov, auio.uio_iov, iovlen);
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
cnt -= auio.uio_resid;
*retval = cnt;
if (ktriov != NULL) {
ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
kmem_free(ktriov, iovlen);
}
done:
if (needfree)
kmem_free(needfree, iovlen);
out:
fd_putfile(fd);
return (error);
}
/*
* Write system call
*/
int
sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const void *) buf;
syscallarg(size_t) nbyte;
} */
file_t *fp;
int fd;
fd = SCARG(uap, fd);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
if ((fp->f_flag & FWRITE) == 0) {
fd_putfile(fd);
return (EBADF);
}
/* dofilewrite() will unuse the descriptor for us */
return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
&fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
int
dofilewrite(int fd, struct file *fp, const void *buf,
size_t nbyte, off_t *offset, int flags, register_t *retval)
{
struct iovec aiov;
struct uio auio;
size_t cnt;
int error;
aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = nbyte;
auio.uio_rw = UIO_WRITE;
auio.uio_vmspace = curproc->p_vmspace;
/*
* Writes return ssize_t because -1 is returned on error. Therefore
* we must restrict the length to SSIZE_MAX to avoid garbage return
* values.
*/
if (auio.uio_resid > SSIZE_MAX) {
error = EINVAL;
goto out;
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
if (error == EPIPE && !(fp->f_flag & FNOSIGPIPE)) {
mutex_enter(&proc_lock);
psignal(curproc, SIGPIPE);
mutex_exit(&proc_lock);
}
}
cnt -= auio.uio_resid;
ktrgenio(fd, UIO_WRITE, buf, cnt, error);
*retval = cnt;
out:
fd_putfile(fd);
return (error);
}
/*
* Gather write system call
*/
int
sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct iovec *) iovp;
syscallarg(int) iovcnt;
} */
return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}
int
do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
off_t *offset, int flags, register_t *retval)
{
struct uio auio;
struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
int i, error;
size_t cnt;
u_int iovlen;
struct file *fp;
struct iovec *ktriov = NULL;
if (iovcnt == 0)
return EINVAL;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((fp->f_flag & FWRITE) == 0) {
fd_putfile(fd);
return EBADF;
}
if (offset == NULL)
offset = &fp->f_offset;
else {
/*
* Caller must not specify &fp->f_offset -- we can't
* safely dereference it for the call to fo_seek
* without holding some underlying object lock.
*/
KASSERT(offset != &fp->f_offset);
if (fp->f_ops->fo_seek == NULL) {
error = ESPIPE;
goto out;
}
error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL,
0);
if (error != 0)
goto out;
}
iovlen = iovcnt * sizeof(struct iovec);
if (flags & FOF_IOV_SYSSPACE)
iov = __UNCONST(iovp);
else {
iov = aiov;
if ((u_int)iovcnt > UIO_SMALLIOV) {
if ((u_int)iovcnt > IOV_MAX) {
error = EINVAL;
goto out;
}
iov = kmem_alloc(iovlen, KM_SLEEP);
needfree = iov;
}
error = copyin(iovp, iov, iovlen);
if (error)
goto done;
}
auio.uio_iov = iov;
auio.uio_iovcnt = iovcnt;
auio.uio_rw = UIO_WRITE;
auio.uio_vmspace = curproc->p_vmspace;
auio.uio_resid = 0;
for (i = 0; i < iovcnt; i++, iov++) {
auio.uio_resid += iov->iov_len;
/*
* Writes return ssize_t because -1 is returned on error.
* Therefore we must restrict the length to SSIZE_MAX to
* avoid garbage return values.
*/
if (iov->iov_len > SSIZE_MAX ||
auio.uio_resid > SSIZE_MAX - iov->iov_len) {
error = EINVAL;
goto done;
}
}
/*
* if tracing, save a copy of iovec
*/
if (ktrpoint(KTR_GENIO)) {
ktriov = kmem_alloc(iovlen, KM_SLEEP);
memcpy(ktriov, auio.uio_iov, iovlen);
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
if (error == EPIPE && !(fp->f_flag & FNOSIGPIPE)) {
mutex_enter(&proc_lock);
psignal(curproc, SIGPIPE);
mutex_exit(&proc_lock);
}
}
cnt -= auio.uio_resid;
*retval = cnt;
if (ktriov != NULL) {
ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
kmem_free(ktriov, iovlen);
}
done:
if (needfree)
kmem_free(needfree, iovlen);
out:
fd_putfile(fd);
return (error);
}
/*
* Ioctl system call
*/
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(u_long) com;
syscallarg(void *) data;
} */
struct file *fp;
proc_t *p;
u_long com;
int error;
size_t size, alloc_size;
void *data, *memp;
#define STK_PARAMS 128
u_long stkbuf[STK_PARAMS/sizeof(u_long)];
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
size_t zero_last = 0;
#define zero_size(SZ) ((SZ)+zero_last)
#else
#define zero_size(SZ) (SZ)
#endif
memp = NULL;
alloc_size = 0;
error = 0;
p = l->l_proc;
if ((fp = fd_getfile(SCARG(uap, fd))) == NULL)
return (EBADF);
if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
error = EBADF;
com = 0;
goto out;
}
switch (com = SCARG(uap, com)) {
case FIONCLEX:
case FIOCLEX:
fd_set_exclose(l, SCARG(uap, fd), com == FIOCLEX);
goto out;
}
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
size = IOCPARM_LEN(com);
alloc_size = size;
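/*
 * Worked example (the command name is hypothetical): for an ioctl
 * defined as _IOWR('x', 1, struct foo), IOCGROUP(com) == 'x',
 * IOCPARM_LEN(com) == sizeof(struct foo), and both IOC_IN and IOC_OUT
 * are set, so sizeof(struct foo) bytes are copied in before and copied
 * back out after the fo_ioctl call below.
 */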
/*
* The disklabel is now padded to a multiple of 8 bytes, whereas the old
* disklabel on 32-bit platforms wasn't. This leaves a difference in
* size of 4 bytes between the two, which are otherwise identical.
* To deal with this, we allocate enough space for the new disklabel
* but only copyin/out the smaller amount.
*/
if (IOCGROUP(com) == 'd') {
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
u_long ocom = com;
#endif
u_long ncom = com ^ (DIOCGDINFO ^ DIOCGDINFO32);
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
/*
* Userland might use a struct disklabel that is bigger than the
* kernel version (historic accident) - allocate the userland
* size and zero the unused part on copyout.
*/
#define DISKLABELLENDIFF (sizeof(struct partition) \
*(__TMPBIGMAXPARTITIONS-MAXPARTITIONS))
#define IOCFIXUP(NIOC) ((NIOC&~(IOCPARM_MASK<<IOCPARM_SHIFT)) | \
(IOCPARM_LEN(NIOC)-DISKLABELLENDIFF)<<IOCPARM_SHIFT)
switch (IOCFIXUP(ocom)) {
case DIOCGDINFO:
case DIOCWDINFO:
case DIOCSDINFO:
case DIOCGDEFLABEL:
com = ncom = IOCFIXUP(ocom);
zero_last = DISKLABELLENDIFF;
size -= DISKLABELLENDIFF;
goto done;
}
#endif
switch (ncom) {
case DIOCGDINFO:
case DIOCWDINFO:
case DIOCSDINFO:
case DIOCGDEFLABEL:
com = ncom;
if (IOCPARM_LEN(DIOCGDINFO32) < IOCPARM_LEN(DIOCGDINFO))
alloc_size = IOCPARM_LEN(DIOCGDINFO);
break;
}
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
done: ;
#endif
}
if (size > IOCPARM_MAX) {
error = ENOTTY;
goto out;
}
memp = NULL;
if ((com >> IOCPARM_SHIFT) == 0) {
/* UNIX-style ioctl. */
data = SCARG(uap, data);
} else {
if (alloc_size > sizeof(stkbuf)) {
memp = kmem_alloc(alloc_size, KM_SLEEP);
data = memp;
} else {
data = (void *)stkbuf;
}
if (com&IOC_IN) {
if (size) {
error = copyin(SCARG(uap, data), data, size);
if (error) {
goto out;
}
/*
* The data between size and alloc_size has
* not been overwritten. It shouldn't matter
* but let's clear that anyway.
*/
if (__predict_false(size < alloc_size)) {
memset((char *)data+size, 0,
alloc_size - size);
}
ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data), size, 0);
} else {
*(void **)data = SCARG(uap, data);
}
} else if ((com&IOC_OUT) && size) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
memset(data, 0, zero_size(size));
} else if (com&IOC_VOID) {
*(void **)data = SCARG(uap, data);
}
}
switch (com) {
case FIONBIO:
/* XXX Code block is not atomic */
if (*(int *)data != 0)
atomic_or_uint(&fp->f_flag, FNONBLOCK);
else
atomic_and_uint(&fp->f_flag, ~FNONBLOCK);
error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data);
break;
case FIOASYNC:
/* XXX Code block is not atomic */
if (*(int *)data != 0)
atomic_or_uint(&fp->f_flag, FASYNC);
else
atomic_and_uint(&fp->f_flag, ~FASYNC);
error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data);
break;
default:
error = (*fp->f_ops->fo_ioctl)(fp, com, data);
/*
* Copy any data to user, size was
* already set and checked above.
*/
if (error == 0 && (com&IOC_OUT) && size) {
error = copyout(data, SCARG(uap, data),
zero_size(size));
ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
size, error);
}
break;
}
out:
if (memp)
kmem_free(memp, alloc_size);
fd_putfile(SCARG(uap, fd));
switch (error) {
case -1:
printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
"pid=%d comm=%s\n",
(com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
(char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
p->p_pid, p->p_comm);
/* FALLTHROUGH */
case EPASSTHROUGH:
error = ENOTTY;
/* FALLTHROUGH */
default:
return (error);
}
}
/* $NetBSD: sys_ptrace.c,v 1.12 2022/07/10 14:07:55 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_ptrace.c,v 1.12 2022/07/10 14:07:55 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ptrace.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/exec.h>
#include <sys/pax.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/ras.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/syscall.h>
#include <sys/module.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
/*
* PTRACE methods
*/
static int
ptrace_copyin_piod(struct ptrace_io_desc *piod, const void *addr, size_t len)
{
if (len != 0 && sizeof(*piod) != len)
return EINVAL;
return copyin(addr, piod, sizeof(*piod));
}
static int
ptrace_copyout_piod(const struct ptrace_io_desc *piod, void *addr, size_t len)
{
if (len != 0 && sizeof(*piod) != len)
return EINVAL;
return copyout(piod, addr, sizeof(*piod));
}
static int
ptrace_copyin_siginfo(struct ptrace_siginfo *psi, const void *addr, size_t len)
{
if (sizeof(*psi) != len)
return EINVAL;
return copyin(addr, psi, sizeof(*psi));
}
static int
ptrace_copyout_siginfo(const struct ptrace_siginfo *psi, void *addr, size_t len)
{
if (sizeof(*psi) != len)
return EINVAL;
return copyout(psi, addr, sizeof(*psi));
}
static int
ptrace_copyout_lwpstatus(const struct ptrace_lwpstatus *pls, void *addr,
size_t len)
{
return copyout(pls, addr, len);
}
static struct ptrace_methods native_ptm = {
.ptm_copyin_piod = ptrace_copyin_piod,
.ptm_copyout_piod = ptrace_copyout_piod,
.ptm_copyin_siginfo = ptrace_copyin_siginfo,
.ptm_copyout_siginfo = ptrace_copyout_siginfo,
.ptm_copyout_lwpstatus = ptrace_copyout_lwpstatus,
.ptm_doregs = process_doregs,
.ptm_dofpregs = process_dofpregs,
.ptm_dodbregs = process_dodbregs,
};
static const struct syscall_package ptrace_syscalls[] = {
{ SYS_ptrace, 0, (sy_call_t *)sys_ptrace },
{ 0, 0, NULL },
};
/*
* Process debugging system call.
*/
int
sys_ptrace(struct lwp *l, const struct sys_ptrace_args *uap, register_t *retval)
{
/* {
syscallarg(int) req;
syscallarg(pid_t) pid;
syscallarg(void *) addr;
syscallarg(int) data;
} */
return do_ptrace(&native_ptm, l, SCARG(uap, req), SCARG(uap, pid),
SCARG(uap, addr), SCARG(uap, data), retval);
}
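/*
 * Illustrative sketch (userland, hypothetical pid, error handling
 * omitted): a debugger attaches, waits for the resulting stop, inspects
 * the target, and resumes it.
 *
 *	#include <sys/ptrace.h>
 *	#include <sys/wait.h>
 *
 *	ptrace(PT_ATTACH, pid, NULL, 0);
 *	waitpid(pid, &status, 0);
 *	... PT_GETREGS / PT_IO requests as needed ...
 *	ptrace(PT_CONTINUE, pid, (void *)1, 0);
 *	ptrace(PT_DETACH, pid, (void *)1, 0);
 */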
#define DEPS "ptrace_common"
MODULE(MODULE_CLASS_EXEC, ptrace, DEPS);
static int
ptrace_init(void)
{
int error;
error = syscall_establish(&emul_netbsd, ptrace_syscalls);
return error;
}
static int
ptrace_fini(void)
{
int error;
error = syscall_disestablish(&emul_netbsd, ptrace_syscalls);
return error;
}
static int
ptrace_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = ptrace_init();
break;
case MODULE_CMD_FINI:
error = ptrace_fini();
break;
default:
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: statvfs.h,v 1.5 2024/01/19 18:39:15 christos Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _COMPAT_SYS_STATVFS_H_
#define _COMPAT_SYS_STATVFS_H_
#include <sys/statvfs.h>
struct statvfs90 {
unsigned long f_flag; /* copy of mount exported flags */
unsigned long f_bsize; /* file system block size */
unsigned long f_frsize; /* fundamental file system block size */
unsigned long f_iosize; /* optimal file system block size */
/* The following are in units of f_frsize */
fsblkcnt_t f_blocks; /* number of blocks in file system, */
fsblkcnt_t f_bfree; /* free blocks avail in file system */
fsblkcnt_t f_bavail; /* free blocks avail to non-root */
fsblkcnt_t f_bresvd; /* blocks reserved for root */
fsfilcnt_t f_files; /* total file nodes in file system */
fsfilcnt_t f_ffree; /* free file nodes in file system */
fsfilcnt_t f_favail; /* free file nodes avail to non-root */
fsfilcnt_t f_fresvd; /* file nodes reserved for root */
uint64_t f_syncreads; /* count of sync reads since mount */
uint64_t f_syncwrites; /* count of sync writes since mount */
uint64_t f_asyncreads; /* count of async reads since mount */
uint64_t f_asyncwrites; /* count of async writes since mount */
fsid_t f_fsidx; /* NetBSD compatible fsid */
unsigned long f_fsid; /* Posix compatible fsid */
unsigned long f_namemax; /* maximum filename length */
uid_t f_owner; /* user that mounted the file system */
uint32_t f_spare[4]; /* spare space */
char f_fstypename[_VFS_NAMELEN]; /* fs type name */
char f_mntonname[_VFS_MNAMELEN]; /* directory on which mounted */
char f_mntfromname[_VFS_MNAMELEN]; /* mounted file system */
};
__BEGIN_DECLS
#ifndef _KERNEL
#include <string.h>
#endif
static __inline void
statvfs_to_statvfs90(const struct statvfs *s, struct statvfs90 *s90)
{
memset(s90, 0, sizeof(*s90));
s90->f_flag = s->f_flag;
s90->f_bsize = s->f_bsize;
s90->f_frsize = s->f_frsize;
s90->f_iosize = s->f_iosize;
s90->f_blocks = s->f_blocks;
s90->f_bfree = s->f_bfree;
s90->f_bavail = s->f_bavail;
s90->f_bresvd = s->f_bresvd;
s90->f_files = s->f_files;
s90->f_ffree = s->f_ffree;
s90->f_favail = s->f_favail;
s90->f_fresvd = s->f_fresvd;
s90->f_syncreads = s->f_syncreads;
s90->f_syncwrites = s->f_syncwrites;
s90->f_asyncreads = s->f_asyncreads;
s90->f_asyncwrites = s->f_asyncwrites;
s90->f_fsidx = s->f_fsidx;
s90->f_fsid = s->f_fsid;
s90->f_namemax = s->f_namemax;
s90->f_owner = s->f_owner;
memcpy(s90->f_fstypename, s->f_fstypename, sizeof(s90->f_fstypename));
memcpy(s90->f_mntonname, s->f_mntonname, sizeof(s90->f_mntonname));
memcpy(s90->f_mntfromname, s->f_mntfromname, sizeof(s90->f_mntfromname));
}
#ifdef _KERNEL
static __inline int
statvfs_to_statvfs90_copy(const void *vs, void *vs90, size_t l)
{
struct statvfs90 *s90 = kmem_zalloc(sizeof(*s90), KM_SLEEP);
int error;
statvfs_to_statvfs90(vs, s90);
error = copyout(s90, vs90, sizeof(*s90));
kmem_free(s90, sizeof(*s90));
return error;
}
#else
#ifdef __LIBC12_SOURCE__
int __compat_statvfs(const char *__restrict, struct statvfs90 *__restrict);
int __compat_statvfs1(const char *__restrict, struct statvfs90 *__restrict,
int);
int __compat_fstatvfs(int, struct statvfs90 *);
int __compat_fstatvfs1(int, struct statvfs90 *, int);
int __compat___getmntinfo13(struct statvfs90 **, int);
int __compat___fhstatvfs40(const void *, size_t, struct statvfs90 *);
int __compat___fhstatvfs140(const void *, size_t, struct statvfs90 *, int);
int __compat_getvfsstat(struct statvfs90 *, size_t, int);
int __statvfs90(const char *__restrict, struct statvfs *__restrict);
int __statvfs190(const char *__restrict, struct statvfs *__restrict, int);
int __fstatvfs90(int, struct statvfs *);
int __fstatvfs190(int, struct statvfs *, int);
int __fhstatvfs90(const void *, size_t, struct statvfs *);
int __fhstatvfs190(const void *, size_t, struct statvfs *, int);
int __getvfsstat90(struct statvfs *, size_t, int);
int __getmntinfo90(struct statvfs **, int);
struct compat_30_fhandle;
int fhstatvfs(const struct compat_30_fhandle *, struct statvfs90 *);
int fhstatvfs1(const struct compat_30_fhandle *, struct statvfs90 *, int);
#endif /* __LIBC12_SOURCE__ */
#endif /* _KERNEL */
__END_DECLS
#endif /* !_COMPAT_SYS_STATVFS_H_ */
/* $NetBSD: tmpfs.h,v 1.56 2020/05/17 19:39:15 ad Exp $ */
/*
* Copyright (c) 2005, 2006, 2007, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _FS_TMPFS_TMPFS_H_
#define _FS_TMPFS_TMPFS_H_
#if !defined(_KERNEL) && !defined(_KMEMUSER)
#error "not supposed to be exposed to userland"
#endif
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/vnode.h>
/*
* Internal representation of a tmpfs directory entry.
*
* All fields are protected by vnode lock.
*/
typedef struct tmpfs_dirent {
TAILQ_ENTRY(tmpfs_dirent) td_entries;
/* Pointer to the inode this entry refers to. */
struct tmpfs_node * td_node;
/* Sequence number, see tmpfs_dir_getseq(). */
uint32_t td_seq;
/* Name and its length. */
char * td_name;
uint16_t td_namelen;
} tmpfs_dirent_t;
TAILQ_HEAD(tmpfs_dir, tmpfs_dirent);
/*
* Internal representation of a tmpfs file system node -- inode.
*
* This structure is split in two parts: one holds attributes common
* to all file types and the other holds data that is only applicable to
* a particular type.
*
* All fields are protected by vnode lock. The vnode association itself
* is protected by vcache.
*/
typedef struct tmpfs_node {
LIST_ENTRY(tmpfs_node) tn_entries;
/*
* Each inode has a corresponding vnode. It is a bi-directional
* association. Whenever vnode is allocated, its v_data field is
* set to the inode it references, and tmpfs_node_t::tn_vnode is
* set to point to the said vnode.
*
* Further attempts to allocate a vnode for this same node will
* result in returning a new reference to the value stored in
* tn_vnode. It may be NULL when the node is unused (that is,
* no vnode has been allocated or it has been reclaimed).
*/
vnode_t * tn_vnode;
/* Prevent node from being reclaimed. */
uint32_t tn_holdcount;
/* Directory entry. Only a hint, since a hard link can have multiple. */
tmpfs_dirent_t * tn_dirent_hint;
/* The inode type: VBLK, VCHR, VDIR, VFIFO, VLNK, VREG or VSOCK. */
enum vtype tn_type;
/* Inode identifier and generation number. */
ino_t tn_id;
uint32_t tn_gen;
/* The inode size. */
off_t tn_size;
/* Generic node attributes. */
uid_t tn_uid;
gid_t tn_gid;
mode_t tn_mode;
int tn_flags;
nlink_t tn_links;
unsigned tn_tflags;
struct timespec tn_atime;
struct timespec tn_mtime;
struct timespec tn_ctime;
struct timespec tn_birthtime;
kmutex_t tn_timelock;
/* Head of byte-level lock list (used by tmpfs_advlock). */
struct lockf * tn_lockf;
union {
/* Type case: VBLK or VCHR. */
struct {
dev_t tn_rdev;
} tn_dev;
/* Type case: VDIR. */
struct {
/* Parent directory (root inode points to itself). */
struct tmpfs_node * tn_parent;
/* List of directory entries. */
struct tmpfs_dir tn_dir;
/* Last given sequence number and their arena. */
uint32_t tn_next_seq;
void * tn_seq_arena;
/*
* Pointer of the last directory entry returned
* by the readdir(3) operation.
*/
struct tmpfs_dirent * tn_readdir_lastp;
} tn_dir;
/* Type case: VLNK. */
struct tn_lnk {
/* The link's target. */
char * tn_link;
} tn_lnk;
/* Type case: VREG. */
struct tn_reg {
/* Underlying UVM object to store contents. */
struct uvm_object * tn_aobj;
size_t tn_aobj_pages;
} tn_reg;
} tn_spec;
} tmpfs_node_t;
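/*
 * Illustrative sketch (hypothetical variables, field paths as assumed
 * from the structures above): for a VDIR node the directory entries
 * hang off tn_spec.tn_dir.tn_dir, so a simple name lookup walks the
 * list like this:
 *
 *	tmpfs_dirent_t *de;
 *
 *	KASSERT(dnode->tn_type == VDIR);
 *	TAILQ_FOREACH(de, &dnode->tn_spec.tn_dir.tn_dir, td_entries) {
 *		if (de->td_namelen == namelen &&
 *		    memcmp(de->td_name, name, namelen) == 0)
 *			break;
 *	}
 */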
#if defined(_KERNEL)
VFS_PROTOS(tmpfs);
LIST_HEAD(tmpfs_node_list, tmpfs_node);
#define TMPFS_MAXNAMLEN 255
/* Validate maximum td_namelen length. */
CTASSERT(TMPFS_MAXNAMLEN < UINT16_MAX);
/*
* Reserved values for the virtual entries (the first must be 0) and EOF.
* The start/end of the incremental range, see tmpfs_dir_getseq().
*/
#define TMPFS_DIRSEQ_DOT 0
#define TMPFS_DIRSEQ_DOTDOT 1
#define TMPFS_DIRSEQ_EOF 2
#define TMPFS_DIRSEQ_START 3 /* inclusive */
#define TMPFS_DIRSEQ_END (1U << 30) /* exclusive */
/* Mark to indicate that the number is not set. */
#define TMPFS_DIRSEQ_NONE (1U << 31)
/* Flags: time update requests. */
#define TMPFS_UPDATE_ATIME 0x01
#define TMPFS_UPDATE_MTIME 0x02
#define TMPFS_UPDATE_CTIME 0x04
/*
* Bits indicating whiteout use for the directory.
* We abuse tmpfs_node_t::tn_gen for that.
*/
#define TMPFS_WHITEOUT_BIT (1U << 31)
#define TMPFS_NODE_GEN_MASK (TMPFS_WHITEOUT_BIT - 1)
#define TMPFS_NODE_GEN(node) \
((node)->tn_gen & TMPFS_NODE_GEN_MASK)
/* White-out inode indicator. */
#define TMPFS_NODE_WHITEOUT ((tmpfs_node_t *)-1)
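/*
 * Illustrative sketch (hypothetical variables): tn_gen carries both the
 * generation number and the whiteout flag, so the two are separated with
 * the macros above, e.g.:
 *
 *	dnode->tn_gen |= TMPFS_WHITEOUT_BIT;	(directory now uses whiteouts)
 *
 *	if (fid_gen != TMPFS_NODE_GEN(node))	(stale file handle check)
 *		return ESTALE;
 */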
/*
* Bit indicating this node must be reclaimed when holdcount reaches zero.
* Ored into tmpfs_node_t::tn_holdcount.
*/
#define TMPFS_NODE_RECLAIMED (1U << 30)
/*
* Internal representation of a tmpfs mount point.
*/
typedef struct tmpfs_mount {
/* Limit and number of bytes in use by the file system. */
uint64_t tm_mem_limit;
uint64_t tm_bytes_used;
kmutex_t tm_acc_lock;
/* Pointer to the root inode. */
tmpfs_node_t * tm_root;
/* Maximum number of possible nodes for this file system. */
unsigned int tm_nodes_max;
/* Number of nodes currently allocated. */
unsigned int tm_nodes_cnt;
/* List of inodes and the lock protecting it. */
kmutex_t tm_lock;
struct tmpfs_node_list tm_nodes;
} tmpfs_mount_t;
/*
* This structure maps a file identifier to a tmpfs node. Used by the
* NFS code.
*/
typedef struct tmpfs_fid {
uint16_t tf_len;
uint16_t tf_pad;
uint32_t tf_gen;
ino_t tf_id;
} tmpfs_fid_t;
/*
* Prototypes for tmpfs_subr.c.
*/
void tmpfs_free_node(tmpfs_mount_t *, tmpfs_node_t *);
int tmpfs_construct_node(vnode_t *, vnode_t **, struct vattr *,
struct componentname *, char *);
int tmpfs_alloc_dirent(tmpfs_mount_t *, const char *, uint16_t,
tmpfs_dirent_t **);
void tmpfs_free_dirent(tmpfs_mount_t *, tmpfs_dirent_t *);
void tmpfs_dir_attach(tmpfs_node_t *, tmpfs_dirent_t *, tmpfs_node_t *);
void tmpfs_dir_detach(tmpfs_node_t *, tmpfs_dirent_t *);
tmpfs_dirent_t *tmpfs_dir_lookup(tmpfs_node_t *, struct componentname *);
tmpfs_dirent_t *tmpfs_dir_cached(tmpfs_node_t *);
uint32_t tmpfs_dir_getseq(tmpfs_node_t *, tmpfs_dirent_t *);
tmpfs_dirent_t *tmpfs_dir_lookupbyseq(tmpfs_node_t *, off_t);
int tmpfs_dir_getdents(tmpfs_node_t *, struct uio *, off_t *);
int tmpfs_reg_resize(vnode_t *, off_t);
int tmpfs_chflags(vnode_t *, int, kauth_cred_t, lwp_t *);
int tmpfs_chmod(vnode_t *, mode_t, kauth_cred_t, lwp_t *);
int tmpfs_chown(vnode_t *, uid_t, gid_t, kauth_cred_t, lwp_t *);
int tmpfs_chsize(vnode_t *, u_quad_t, kauth_cred_t, lwp_t *);
int tmpfs_chtimes(vnode_t *, const struct timespec *,
const struct timespec *, const struct timespec *, int,
kauth_cred_t, lwp_t *);
void tmpfs_update(vnode_t *, unsigned);
void tmpfs_update_locked(vnode_t *, unsigned);
void tmpfs_update_lazily(vnode_t *, unsigned);
/*
* Prototypes for tmpfs_mem.c.
*/
void tmpfs_mntmem_init(tmpfs_mount_t *, uint64_t);
void tmpfs_mntmem_destroy(tmpfs_mount_t *);
int tmpfs_mntmem_set(tmpfs_mount_t *, uint64_t);
size_t tmpfs_mem_info(bool);
uint64_t tmpfs_bytes_max(tmpfs_mount_t *);
size_t tmpfs_pages_avail(tmpfs_mount_t *);
bool tmpfs_mem_incr(tmpfs_mount_t *, size_t);
void tmpfs_mem_decr(tmpfs_mount_t *, size_t);
tmpfs_dirent_t *tmpfs_dirent_get(tmpfs_mount_t *);
void tmpfs_dirent_put(tmpfs_mount_t *, tmpfs_dirent_t *);
tmpfs_node_t * tmpfs_node_get(tmpfs_mount_t *);
void tmpfs_node_put(tmpfs_mount_t *, tmpfs_node_t *);
char * tmpfs_strname_alloc(tmpfs_mount_t *, size_t);
void tmpfs_strname_free(tmpfs_mount_t *, char *, size_t);
bool tmpfs_strname_neqlen(struct componentname *, struct componentname *);
/*
* Ensures that the node pointed to by 'node' is a directory and that its
* contents are consistent with respect to directories.
*/
#define TMPFS_VALIDATE_DIR(node) \
KASSERT((node)->tn_vnode == NULL || VOP_ISLOCKED((node)->tn_vnode)); \
KASSERT((node)->tn_type == VDIR); \
KASSERT((node)->tn_size % sizeof(tmpfs_dirent_t) == 0);
/*
* Routines to convert VFS structures to tmpfs internal ones.
*/
static __inline tmpfs_mount_t *
VFS_TO_TMPFS(struct mount *mp)
{
tmpfs_mount_t *tmp = mp->mnt_data;
KASSERT(tmp != NULL);
return tmp;
}
static __inline tmpfs_node_t *
VP_TO_TMPFS_DIR(vnode_t *vp)
{
tmpfs_node_t *node = vp->v_data;
KASSERT(node != NULL);
TMPFS_VALIDATE_DIR(node);
return node;
}
#endif /* defined(_KERNEL) */
static __inline tmpfs_node_t *
VP_TO_TMPFS_NODE(vnode_t *vp)
{
tmpfs_node_t *node = vp->v_data;
#ifdef KASSERT
KASSERT(node != NULL);
#endif
return node;
}
#endif /* _FS_TMPFS_TMPFS_H_ */
/* $NetBSD: subr_cpu.c,v 1.22 2024/03/05 20:59:41 thorpej Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c)2007 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* CPU related routines shared with rump.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_cpu.c,v 1.22 2024/03/05 20:59:41 thorpej Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
static void cpu_topology_fake1(struct cpu_info *);
kmutex_t cpu_lock __cacheline_aligned;
int ncpu __read_mostly;
int ncpuonline __read_mostly;
bool mp_online __read_mostly;
static bool cpu_topology_present __read_mostly;
static bool cpu_topology_haveslow __read_mostly;
int64_t cpu_counts[CPU_COUNT_MAX];
/* An array of CPUs. There are ncpu entries. */
struct cpu_info **cpu_infos __read_mostly;
/* Note: set on mi_cpu_attach() and idle_loop(). */
kcpuset_t * kcpuset_attached __read_mostly = NULL;
kcpuset_t * kcpuset_running __read_mostly = NULL;
static char cpu_model[128];
/*
* mi_cpu_init: early initialisation of MI CPU related structures.
*
* Note: may not block and memory allocator is not yet available.
*/
void
mi_cpu_init(void)
{
struct cpu_info *ci;
mutex_init(&cpu_lock, MUTEX_DEFAULT, IPL_NONE);
kcpuset_create(&kcpuset_attached, true);
kcpuset_create(&kcpuset_running, true);
kcpuset_set(kcpuset_running, 0);
ci = curcpu();
cpu_topology_fake1(ci);
}
int
cpu_setmodel(const char *fmt, ...)
{
int len;
va_list ap;
va_start(ap, fmt);
len = vsnprintf(cpu_model, sizeof(cpu_model), fmt, ap);
va_end(ap);
return len;
}
const char *
cpu_getmodel(void)
{
return cpu_model;
}
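/*
 * Illustrative sketch (hypothetical strings): MD attach code records the
 * model string once and consumers read it back later, e.g. for the
 * hw.model sysctl or autoconf messages:
 *
 *	cpu_setmodel("%s %s", vendor, product);
 *	...
 *	aprint_normal(": %s\n", cpu_getmodel());
 */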
bool
cpu_softintr_p(void)
{
return (curlwp->l_pflag & LP_INTR) != 0;
}
bool
curcpu_stable(void)
{
struct lwp *const l = curlwp;
const int pflag = l->l_pflag;
const int nopreempt = l->l_nopreempt;
/*
* - Softints (LP_INTR) never migrate between CPUs.
* - Bound lwps (LP_BOUND), either kthreads created bound to
* a CPU or any lwps bound with curlwp_bind, never migrate.
* - If kpreemption is disabled, the lwp can't migrate.
* - If we're in interrupt context, preemption is blocked.
*
* We combine the LP_INTR, LP_BOUND, and l_nopreempt test into
* a single predicted-true branch so this is cheap to assert in
* most contexts where it will be used, then fall back to
* calling the full kpreempt_disabled() and cpu_intr_p() as
* subroutines.
*
* XXX Is cpu_intr_p redundant with kpreempt_disabled?
*/
return __predict_true(((pflag & (LP_INTR|LP_BOUND)) | nopreempt)
!= 0) ||
kpreempt_disabled() ||
cpu_intr_p();
}
/*
* Collect CPU topology information as each CPU is attached. This can be
* called early during boot, so we need to be careful what we do.
*/
void
cpu_topology_set(struct cpu_info *ci, u_int package_id, u_int core_id,
u_int smt_id, u_int numa_id)
{
enum cpu_rel rel;
cpu_topology_present = true;
ci->ci_package_id = package_id;
ci->ci_core_id = core_id;
ci->ci_smt_id = smt_id;
ci->ci_numa_id = numa_id;
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
ci->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = 1;
}
}
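/*
 * Illustrative sketch (not from the original sources): MD CPU attach code
 * would typically decode its identifiers from firmware or CPUID-style data
 * and then call, for example:
 *
 *	cpu_topology_set(ci, package_id, core_id, smt_id, numa_id);
 *	cpu_topology_setspeed(ci, is_slow_core);
 *
 * where the id variables are whatever the platform discovered; the names
 * here are placeholders.
 */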
/*
* Collect CPU relative speed
*/
void
cpu_topology_setspeed(struct cpu_info *ci, bool slow)
{
cpu_topology_haveslow |= slow;
ci->ci_is_slow = slow;
}
/*
* Link a CPU into the given circular list.
*/
static void
cpu_topology_link(struct cpu_info *ci, struct cpu_info *ci2, enum cpu_rel rel)
{
struct cpu_info *ci3;
/* Walk to the end of the existing circular list and append. */
for (ci3 = ci2;; ci3 = ci3->ci_sibling[rel]) {
ci3->ci_nsibling[rel]++;
if (ci3->ci_sibling[rel] == ci2) {
break;
}
}
ci->ci_sibling[rel] = ci2;
ci3->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = ci3->ci_nsibling[rel];
}
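/*
 * Worked example (illustrative, not from the original sources): if the
 * CPUREL_CORE list headed at ci2 currently contains ci2 -> ci4 -> ci2
 * (two members, count 2 each), linking ci6 walks from ci2 to ci4, bumps
 * both counts to 3, and splices ci6 in after ci4, so the list becomes
 * ci2 -> ci4 -> ci6 -> ci2 with ci6 inheriting the sibling count of 3.
 */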
/*
* Print out the topology lists.
*/
static void
cpu_topology_dump(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci, *ci2;
const char *names[] = { "core", "pkg", "1st" };
enum cpu_rel rel;
int i;
CTASSERT(__arraycount(names) >= __arraycount(ci->ci_sibling));
if (ncpu == 1) {
return;
}
for (CPU_INFO_FOREACH(cii, ci)) {
if (cpu_topology_haveslow)
aprint_debug("%s ", ci->ci_is_slow ? "slow" : "fast");
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
aprint_debug("%s has %d %s siblings:", cpu_name(ci),
ci->ci_nsibling[rel], names[rel]);
ci2 = ci->ci_sibling[rel];
i = 0;
do {
aprint_debug(" %s", cpu_name(ci2));
ci2 = ci2->ci_sibling[rel];
} while (++i < 64 && ci2 != ci->ci_sibling[rel]);
if (i == 64) {
aprint_debug(" GAVE UP");
}
aprint_debug("\n");
}
aprint_debug("%s first in package: %s\n", cpu_name(ci),
cpu_name(ci->ci_package1st));
}
}
/*
* Fake up topology info if we have none, or if what we got was bogus.
* Used early in boot, and by cpu_topology_fake().
*/
static void
cpu_topology_fake1(struct cpu_info *ci)
{
enum cpu_rel rel;
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
ci->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = 1;
}
if (!cpu_topology_present) {
ci->ci_package_id = cpu_index(ci);
}
ci->ci_schedstate.spc_flags |=
(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
ci->ci_package1st = ci;
if (!cpu_topology_haveslow) {
ci->ci_is_slow = false;
}
}
/*
* Fake up topology info if we have none, or if what we got was bogus.
* Don't override ci_package_id, etc, if cpu_topology_present is set.
* MD code also uses these.
*/
static void
cpu_topology_fake(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
cpu_topology_fake1(ci);
/* Undo (early boot) flag set so everything links OK. */
ci->ci_schedstate.spc_flags &=
~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
}
}
/*
* Fix up basic CPU topology info. Right now that means attach each CPU to
* circular lists of its siblings in the same core, and in the same package.
*/
void
cpu_topology_init(void)
{
CPU_INFO_ITERATOR cii, cii2;
struct cpu_info *ci, *ci2, *ci3;
u_int minsmt, mincore;
if (!cpu_topology_present) {
cpu_topology_fake();
goto linkit;
}
/* Find siblings in same core and package. */
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_schedstate.spc_flags &=
~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
for (CPU_INFO_FOREACH(cii2, ci2)) {
/* Avoid bad things happening. */
if (ci2->ci_package_id == ci->ci_package_id &&
ci2->ci_core_id == ci->ci_core_id &&
ci2->ci_smt_id == ci->ci_smt_id &&
ci2 != ci) {
#ifdef DEBUG
printf("cpu%u %p pkg %u core %u smt %u same as "
"cpu%u %p pkg %u core %u smt %u\n",
cpu_index(ci), ci, ci->ci_package_id,
ci->ci_core_id, ci->ci_smt_id,
cpu_index(ci2), ci2, ci2->ci_package_id,
ci2->ci_core_id, ci2->ci_smt_id);
#endif
printf("cpu_topology_init: info bogus, "
"faking it\n");
cpu_topology_fake();
goto linkit;
}
if (ci2 == ci ||
ci2->ci_package_id != ci->ci_package_id) {
continue;
}
/* Find CPUs in the same core. */
if (ci->ci_nsibling[CPUREL_CORE] == 1 &&
ci->ci_core_id == ci2->ci_core_id) {
cpu_topology_link(ci, ci2, CPUREL_CORE);
}
/* Find CPUs in the same package. */
if (ci->ci_nsibling[CPUREL_PACKAGE] == 1) {
cpu_topology_link(ci, ci2, CPUREL_PACKAGE);
}
if (ci->ci_nsibling[CPUREL_CORE] > 1 &&
ci->ci_nsibling[CPUREL_PACKAGE] > 1) {
break;
}
}
}
linkit:
/* Identify lowest numbered SMT in each core. */
for (CPU_INFO_FOREACH(cii, ci)) {
ci2 = ci3 = ci;
minsmt = ci->ci_smt_id;
do {
if (ci2->ci_smt_id < minsmt) {
ci3 = ci2;
minsmt = ci2->ci_smt_id;
}
ci2 = ci2->ci_sibling[CPUREL_CORE];
} while (ci2 != ci);
ci3->ci_schedstate.spc_flags |= SPCF_CORE1ST;
}
/* Identify lowest numbered SMT in each package. */
ci3 = NULL;
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) == 0) {
continue;
}
ci2 = ci3 = ci;
mincore = ci->ci_core_id;
do {
if ((ci2->ci_schedstate.spc_flags &
SPCF_CORE1ST) != 0 &&
ci2->ci_core_id < mincore) {
ci3 = ci2;
mincore = ci2->ci_core_id;
}
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
if ((ci3->ci_schedstate.spc_flags & SPCF_PACKAGE1ST) != 0) {
/* Already identified - nothing more to do. */
continue;
}
ci3->ci_schedstate.spc_flags |= SPCF_PACKAGE1ST;
/* Walk through all CPUs in package and point to first. */
ci2 = ci3;
do {
ci2->ci_package1st = ci3;
ci2->ci_sibling[CPUREL_PACKAGE1ST] = ci3;
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci3);
/* Now look for somebody else to link to. */
for (CPU_INFO_FOREACH(cii2, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_PACKAGE1ST)
!= 0 && ci2 != ci3) {
cpu_topology_link(ci3, ci2, CPUREL_PACKAGE1ST);
break;
}
}
}
/* Walk through all packages, starting with value of ci3 from above. */
KASSERT(ci3 != NULL);
ci = ci3;
do {
/* Walk through CPUs in the package and copy in PACKAGE1ST. */
ci2 = ci;
do {
ci2->ci_sibling[CPUREL_PACKAGE1ST] =
ci->ci_sibling[CPUREL_PACKAGE1ST];
ci2->ci_nsibling[CPUREL_PACKAGE1ST] =
ci->ci_nsibling[CPUREL_PACKAGE1ST];
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
ci = ci->ci_sibling[CPUREL_PACKAGE1ST];
} while (ci != ci3);
if (cpu_topology_haveslow) {
/*
* For asymmetric systems where some CPUs are slower than
* others, mark first class CPUs for the scheduler. This
* conflicts with SMT right now so whinge if observed.
*/
if (curcpu()->ci_nsibling[CPUREL_CORE] > 1) {
printf("cpu_topology_init: asymmetric & SMT??\n");
}
for (CPU_INFO_FOREACH(cii, ci)) {
if (!ci->ci_is_slow) {
ci->ci_schedstate.spc_flags |= SPCF_1STCLASS;
}
}
} else {
/*
* For any other configuration mark the 1st CPU in each
* core as a first class CPU.
*/
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) != 0) {
ci->ci_schedstate.spc_flags |= SPCF_1STCLASS;
}
}
}
cpu_topology_dump();
}
/*
* Adjust one count, for a counter that's NOT updated from interrupt
* context. Hardly worth making an inline due to preemption stuff.
*/
void
cpu_count(enum cpu_count idx, int64_t delta)
{
lwp_t *l = curlwp;
KPREEMPT_DISABLE(l);
l->l_cpu->ci_counts[idx] += delta;
KPREEMPT_ENABLE(l);
}
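/*
 * Illustrative usage sketch (not from the original sources): code that is
 * not running in interrupt context can bump a per-CPU counter like so; the
 * counter index is only an example.
 *
 *	cpu_count(CPU_COUNT_FORKS, 1);
 */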
/*
* Fetch fresh sum total for all counts. Expensive - don't call often.
*
* If poll is true, the caller is okay with less recent values (but
* no more than 1/hz seconds old). Where this is called very often, that
* should be the case.
*
* This should be reasonably quick so that any value collected isn't
* totally out of whack, and it can also be called from interrupt context,
* so go to splvm() while summing the counters. It's tempting to use a spin
* mutex here but this routine is called from DDB.
*/
void
cpu_count_sync(bool poll)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
int64_t sum[CPU_COUNT_MAX], *ptr;
static int lasttick;
int curtick, s;
enum cpu_count i;
KASSERT(sizeof(ci->ci_counts) == sizeof(cpu_counts));
if (__predict_false(!mp_online)) {
memcpy(cpu_counts, curcpu()->ci_counts, sizeof(cpu_counts));
return;
}
s = splvm();
curtick = getticks();
if (poll && atomic_load_acquire(&lasttick) == curtick) {
splx(s);
return;
}
memset(sum, 0, sizeof(sum));
curcpu()->ci_counts[CPU_COUNT_SYNC]++;
for (CPU_INFO_FOREACH(cii, ci)) {
ptr = ci->ci_counts;
for (i = 0; i < CPU_COUNT_MAX; i += 8) {
sum[i+0] += ptr[i+0];
sum[i+1] += ptr[i+1];
sum[i+2] += ptr[i+2];
sum[i+3] += ptr[i+3];
sum[i+4] += ptr[i+4];
sum[i+5] += ptr[i+5];
sum[i+6] += ptr[i+6];
sum[i+7] += ptr[i+7];
}
KASSERT(i == CPU_COUNT_MAX);
}
memcpy(cpu_counts, sum, sizeof(cpu_counts));
atomic_store_release(&lasttick, curtick);
splx(s);
}
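/*
 * Illustrative usage sketch (not from the original sources): a reader that
 * wants reasonably fresh totals refreshes the global array and then reads
 * the slot it cares about (the index is only an example).
 *
 *	cpu_count_sync(true);
 *	nforks = cpu_counts[CPU_COUNT_FORKS];
 */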
/* $NetBSD: vfs_syscalls_20.c,v 1.46 2020/06/28 14:37:53 christos Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_20.c,v 1.46 2020/06/28 14:37:53 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/dirent.h>
#include <sys/sysctl.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/vfs_syscalls.h>
#include <compat/common/compat_mod.h>
#include <compat/sys/mount.h>
#include <compat/sys/statvfs.h>
static const struct syscall_package vfs_syscalls_20_syscalls[] = {
{ SYS_compat_20_fhstatfs, 0, (sy_call_t *)compat_20_sys_fhstatfs },
{ SYS_compat_20_fstatfs, 0, (sy_call_t *)compat_20_sys_fstatfs },
{ SYS_compat_20_getfsstat, 0, (sy_call_t *)compat_20_sys_getfsstat },
{ SYS_compat_20_statfs, 0, (sy_call_t *)compat_20_sys_statfs },
{ 0, 0, NULL }
};
/*
* Get filesystem statistics.
*/
/* ARGSUSED */
int
compat_20_sys_statfs(struct lwp *l, const struct compat_20_sys_statfs_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(struct statfs12 *) buf;
} */
struct mount *mp;
struct statvfs *sbuf;
int error;
struct vnode *vp;
error = namei_simple_user(SCARG(uap, path),
NSM_FOLLOW_TRYEMULROOT, &vp);
if (error != 0)
return error;
mp = vp->v_mount;
sbuf = STATVFSBUF_GET();
if ((error = dostatvfs(mp, sbuf, l, 0, 1)) != 0)
goto done;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
done:
vrele(vp);
STATVFSBUF_PUT(sbuf);
return error;
}
/*
* Get filesystem statistics.
*/
/* ARGSUSED */
int
compat_20_sys_fstatfs(struct lwp *l, const struct compat_20_sys_fstatfs_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct statfs12 *) buf;
} */
struct file *fp;
struct mount *mp;
struct statvfs *sbuf;
int error;
/* fd_getvnode() will use the descriptor for us */
if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
return (error);
mp = fp->f_vnode->v_mount;
sbuf = STATVFSBUF_GET();
if ((error = dostatvfs(mp, sbuf, l, 0, 1)) != 0)
goto out;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
out:
fd_putfile(SCARG(uap, fd));
STATVFSBUF_PUT(sbuf);
return error;
}
/*
* Get statistics on all filesystems.
*/
int
compat_20_sys_getfsstat(struct lwp *l, const struct compat_20_sys_getfsstat_args *uap, register_t *retval)
{
/* {
syscallarg(struct statfs12 *) buf;
syscallarg(long) bufsize;
syscallarg(int) flags;
} */
return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
SCARG(uap, flags), statvfs_to_statfs12_copy,
sizeof(struct statfs12), retval);
}
int
compat_20_sys_fhstatfs(struct lwp *l, const struct compat_20_sys_fhstatfs_args *uap, register_t *retval)
{
/* {
syscallarg(const struct compat_30_fhandle *) fhp;
syscallarg(struct statfs12 *) buf;
} */
struct statvfs *sbuf;
struct compat_30_fhandle fh;
struct mount *mp;
struct vnode *vp;
int error;
/*
* Must be super user
*/
if ((error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)))
return (error);
if ((error = copyin(SCARG(uap, fhp), &fh, sizeof(fh))) != 0)
return (error);
if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
error = VFS_FHTOVP(mp, (struct fid*)&fh.fh_fid, LK_EXCLUSIVE, &vp);
if (error != 0)
return (error);
mp = vp->v_mount;
VOP_UNLOCK(vp);
sbuf = STATVFSBUF_GET();
if ((error = VFS_STATVFS(mp, sbuf)) != 0)
goto out;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
out:
vrele(vp);
STATVFSBUF_PUT(sbuf);
return error;
}
int
vfs_syscalls_20_init(void)
{
return syscall_establish(NULL, vfs_syscalls_20_syscalls);
}
int
vfs_syscalls_20_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_20_syscalls);
}
/* $NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $ */
/*-
* Copyright (c) 2000, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christopher G. Demetriou.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $");
#include <sys/param.h>
#include <sys/time.h>
/*
* ratecheck(): simple time-based rate-limit checking. see ratecheck(9)
* for usage and rationale.
*/
int
ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
{
struct timeval tv, delta;
int rv = 0;
getmicrouptime(&tv);
timersub(&tv, lasttime, &delta);
/*
* The check for 0,0 is so that the message will be seen at least once,
* even if the interval is huge.
*/
if (timercmp(&delta, mininterval, >=) ||
(lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
*lasttime = tv;
rv = 1;
}
return (rv);
}
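/*
 * Illustrative usage sketch (not from the original sources): limit an
 * error message to at most once every 10 seconds.
 *
 *	static struct timeval lasterr;
 *	static const struct timeval errinterval = { 10, 0 };
 *
 *	if (ratecheck(&lasterr, &errinterval))
 *		printf("example: transient device error\n");
 */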
/*
* ppsratecheck(): packets (or events) per second limitation.
*/
int
ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
{
struct timeval tv, delta;
int rv;
getmicrouptime(&tv);
timersub(&tv, lasttime, &delta);
/*
* The check for 0,0 is so that the message will be seen at least once.
* If more than one second has passed since the last update of
* lasttime, reset the counter.
*
* We increment *curpps even in the *curpps < maxpps case, as some
* callers may use *curpps for statistics as well.
*/
if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) ||
delta.tv_sec >= 1) {
*lasttime = tv;
*curpps = 0;
}
if (maxpps < 0)
rv = 1;
else if (*curpps < maxpps)
rv = 1;
else
rv = 0;
#if 1 /*DIAGNOSTIC?*/
/* be careful about wrap-around */
if (__predict_true(*curpps != INT_MAX))
*curpps = *curpps + 1;
#else
/*
* Assume that there are not too many calls to this function.
* Not sure if the assumption holds, as it depends on the *caller's*
* behavior, not the behavior of this function.
* IMHO it is wrong to make assumptions about the caller's behavior,
* so the above #if is #if 1, not #ifdef DIAGNOSTIC.
*/
*curpps = *curpps + 1;
#endif
return (rv);
}
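/*
 * Illustrative usage sketch (not from the original sources): log at most
 * 5 events per second, silently counting the rest.
 *
 *	static struct timeval lastlog;
 *	static int curpps;
 *
 *	if (ppsratecheck(&lastlog, &curpps, 5))
 *		printf("example: dropped packet\n");
 */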
/* $NetBSD: sleepq.h,v 1.42 2023/10/15 10:30:00 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_SLEEPQ_H_
#define _SYS_SLEEPQ_H_
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/wchan.h>
struct syncobj;
/*
* Generic sleep queues.
*/
typedef struct sleepq sleepq_t;
void sleepq_init(sleepq_t *);
void sleepq_remove(sleepq_t *, lwp_t *, bool);
int sleepq_enter(sleepq_t *, lwp_t *, kmutex_t *);
void sleepq_enqueue(sleepq_t *, wchan_t, const char *,
const struct syncobj *, bool);
void sleepq_transfer(lwp_t *, sleepq_t *, sleepq_t *, wchan_t, const char *,
const struct syncobj *, kmutex_t *, bool);
void sleepq_uncatch(lwp_t *);
void sleepq_unsleep(lwp_t *, bool);
void sleepq_timeout(void *);
void sleepq_wake(sleepq_t *, wchan_t, u_int, kmutex_t *);
int sleepq_abort(kmutex_t *, int);
void sleepq_changepri(lwp_t *, pri_t);
void sleepq_lendpri(lwp_t *, pri_t);
int sleepq_block(int, bool, const struct syncobj *, int);
#ifdef _KERNEL
#include <sys/kernel.h>
typedef union {
kmutex_t lock;
uint8_t pad[COHERENCY_UNIT];
} sleepqlock_t;
/*
* Return non-zero if it is unsafe to sleep.
*
* XXX This only exists because panic() is broken.
*/
static __inline bool
sleepq_dontsleep(lwp_t *l)
{
return cold || (doing_shutdown && (panicstr || CURCPU_IDLE_P()));
}
#endif /* _KERNEL */
#include <sys/sleeptab.h>
#endif /* _SYS_SLEEPQ_H_ */
/* $NetBSD: uvm_pdpolicy.h,v 1.9 2022/08/20 23:26:02 riastradh Exp $ */
/*-
* Copyright (c)2005, 2006 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _UVM_PDPOLICY_H_
#define _UVM_PDPOLICY_H_
#include <sys/mutex.h>
#include <sys/stdint.h>
#include <uvm/uvm_page.h>
struct krwlock;
struct uvm_cpu;
struct vm_anon;
struct vm_page;
/*
* These APIs are for uvm internal use only.
* Don't use them directly from outside of /sys/uvm.
*/
void uvmpdpol_idle(struct uvm_cpu *);
void uvmpdpol_init(void);
void uvmpdpol_init_cpu(struct uvm_cpu *);
void uvmpdpol_reinit(void);
void uvmpdpol_estimatepageable(int *, int *);
bool uvmpdpol_needsscan_p(void);
void uvmpdpol_pageactivate(struct vm_page *);
void uvmpdpol_pagedeactivate(struct vm_page *);
void uvmpdpol_pagedequeue(struct vm_page *);
void uvmpdpol_pageenqueue(struct vm_page *);
bool uvmpdpol_pageactivate_p(struct vm_page *);
bool uvmpdpol_pageisqueued_p(struct vm_page *);
void uvmpdpol_pagerealize(struct vm_page *);
void uvmpdpol_anfree(struct vm_anon *);
void uvmpdpol_tune(void);
void uvmpdpol_scaninit(void);
void uvmpdpol_scanfini(void);
struct vm_page *uvmpdpol_selectvictim(struct krwlock **);
void uvmpdpol_balancequeue(int);
void uvmpdpol_sysctlsetup(void);
/*
* uvmpdpol_set_intent: set an intended state for the page, taking care not
* to overwrite any of the other flags.
*/
static inline void
uvmpdpol_set_intent(struct vm_page *pg, uint32_t i)
{
KASSERT(mutex_owned(&pg->interlock));
pg->pqflags = PQ_INTENT_SET | (pg->pqflags & ~PQ_INTENT_MASK) | i;
}
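/*
 * Illustrative usage sketch (not from the original sources): a caller
 * holding the page interlock records an intent; PQ_INTENT_A is assumed
 * here as an example intent value defined elsewhere in uvm.
 *
 *	mutex_enter(&pg->interlock);
 *	uvmpdpol_set_intent(pg, PQ_INTENT_A);
 *	mutex_exit(&pg->interlock);
 */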
#endif /* !_UVM_PDPOLICY_H_ */
/* $NetBSD: copystr.c,v 1.1 2020/06/30 16:20:02 maxv Exp $ */
/*
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/systm.h>
#include <sys/errno.h>
int
copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done)
{
const char *src = kfaddr;
char *dst = kdaddr;
size_t i;
for (i = 0; i < len; i++) {
if ((*dst++ = *src++) == '\0') {
if (done) *done = i + 1;
return 0;
}
}
if (done)
*done = i;
return ENAMETOOLONG;
}
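/*
 * Illustrative usage sketch (not from the original sources): copy a
 * kernel string into a fixed buffer, detecting truncation. Here
 * some_kernel_string is a placeholder for any NUL-terminated
 * kernel-space string.
 *
 *	char buf[32];
 *	size_t done;
 *
 *	if (copystr(some_kernel_string, buf, sizeof(buf), &done) ==
 *	    ENAMETOOLONG)
 *		printf("name too long (%zu bytes copied)\n", done);
 */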
/* $NetBSD: cons.c,v 1.95 2023/09/02 17:44:59 riastradh Exp $ */
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: cons.c 1.7 92/01/21$
*
* @(#)cons.c 8.2 (Berkeley) 1/12/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cons.c,v 1.95 2023/09/02 17:44:59 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/heartbeat.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/vnode.h>
#include <dev/cons.h>
#include "nullcons.h"
dev_type_open(cnopen);
dev_type_close(cnclose);
dev_type_read(cnread);
dev_type_write(cnwrite);
dev_type_ioctl(cnioctl);
dev_type_poll(cnpoll);
dev_type_kqfilter(cnkqfilter);
static bool cn_redirect(dev_t *, int, int *, struct tty **);
static void cn_release(struct tty *);
const struct cdevsw cons_cdevsw = {
.d_open = cnopen,
.d_close = cnclose,
.d_read = cnread,
.d_write = cnwrite,
.d_ioctl = cnioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = cnpoll,
.d_mmap = nommap,
.d_kqfilter = cnkqfilter,
.d_discard = nodiscard,
.d_flag = D_TTY|D_MPSAFE,
};
static struct kmutex cn_lock;
struct tty *volatile constty; /* virtual console output device */
struct consdev *cn_tab; /* physical console device info */
struct vnode *cn_devvp[2]; /* vnode for underlying device. */
void
cn_set_tab(struct consdev *tab)
{
/*
* This is a point where we should have KASSERT(cold) or add
* synchronization in case this can happen after cold boot.
* However, cn_tab initialization is so critical to any
* diagnostics or debugging that we need to tread carefully
* about introducing new ways to crash. So let's put the
* assertion in only after we've audited most or all of the
* cn_tab updates.
*/
cn_tab = tab;
}
int
cnopen(dev_t dev, int flag, int mode, struct lwp *l)
{
dev_t cndev;
int unit, error;
unit = minor(dev);
if (unit > 1)
return ENODEV;
mutex_enter(&cn_lock);
if (cn_tab == NULL) {
error = 0;
goto out;
}
/*
* always open the 'real' console device, so we don't get nailed
* later. This follows normal device semantics; they always get
* open() calls.
*/
cndev = cn_tab->cn_dev;
#if NNULLCONS > 0
if (cndev == NODEV) {
nullconsattach(0);
}
#else /* NNULLCONS > 0 */
if (cndev == NODEV) {
/*
* This is most likely an error in the console attach
* code. Panicking looks better than jumping into nowhere
* through cdevsw below....
*/
panic("cnopen: no console device");
}
#endif /* NNULLCONS > 0 */
if (dev == cndev) {
/*
* This causes cnopen() to be called recursively, which
* is generally a bad thing. It is often caused when
* dev == 0 and cn_dev has not been set, but was probably
* initialised to 0.
*/
panic("cnopen: cn_tab->cn_dev == dev");
}
if (cn_devvp[unit] != NULLVP) {
error = 0;
goto out;
}
if ((error = cdevvp(cndev, &cn_devvp[unit])) != 0) {
printf("cnopen: unable to get vnode reference\n");
goto out;
}
vn_lock(cn_devvp[unit], LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(cn_devvp[unit], flag, kauth_cred_get());
VOP_UNLOCK(cn_devvp[unit]);
out: mutex_exit(&cn_lock);
return error;
}
int
cnclose(dev_t dev, int flag, int mode, struct lwp *l)
{
struct vnode *vp;
int unit, error;
unit = minor(dev);
if (unit > 1)
return ENODEV;
mutex_enter(&cn_lock);
if (cn_tab == NULL) {
error = 0;
goto out;
}
vp = cn_devvp[unit];
cn_devvp[unit] = NULL;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_CLOSE(vp, flag, kauth_cred_get());
VOP_UNLOCK(vp);
vrele(vp);
out: mutex_exit(&cn_lock);
return error;
}
int
cnread(dev_t dev, struct uio *uio, int flag)
{
struct tty *ctp = NULL;
int error;
/*
* If we would redirect input, punt. This will keep strange
* things from happening to people who are using the real
* console. Nothing should be using /dev/console for
* input (except a shell in single-user mode, but then,
* one wouldn't TIOCCONS then).
*/
if (!cn_redirect(&dev, 1, &error, &ctp))
return error;
error = cdev_read(dev, uio, flag);
cn_release(ctp);
return error;
}
int
cnwrite(dev_t dev, struct uio *uio, int flag)
{
struct tty *ctp = NULL;
int error;
/* Redirect output, if that's appropriate. */
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_write(dev, uio, flag);
cn_release(ctp);
return error;
}
int
cnioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
struct tty *ctp = NULL;
int error;
error = 0;
/*
* Superuser can always use this to wrest control of console
* output from the "virtual" console.
*/
if (cmd == TIOCCONS) {
struct tty *tp;
mutex_enter(&constty_lock);
tp = atomic_load_relaxed(&constty);
if (tp == NULL) {
mutex_exit(&constty_lock);
goto passthrough; /* XXX ??? */
}
error = kauth_authorize_device_tty(l->l_cred,
KAUTH_DEVICE_TTY_VIRTUAL, tp);
if (!error)
atomic_store_relaxed(&constty, NULL);
mutex_exit(&constty_lock);
return error;
}
passthrough:
/*
* Redirect the ioctl, if that's appropriate.
* Note that strange things can happen, if a program does
* ioctls on /dev/console, then the console is redirected
* out from under it.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_ioctl(dev, cmd, data, flag, l);
cn_release(ctp);
return error;
}
/*ARGSUSED*/
int
cnpoll(dev_t dev, int events, struct lwp *l)
{
struct tty *ctp = NULL;
int error;
/*
* Redirect the poll, if that's appropriate.
* I don't want to think of the possible side effects
* of console redirection here.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return POLLHUP;
error = cdev_poll(dev, events, l);
cn_release(ctp);
return error;
}
/*ARGSUSED*/
int
cnkqfilter(dev_t dev, struct knote *kn)
{
struct tty *ctp = NULL;
int error;
/*
* Redirect the kqfilter, if that's appropriate.
* I don't want to think of the possible side effects
* of console redirection here.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_kqfilter(dev, kn);
cn_release(ctp);
return error;
}
int
cngetc(void)
{
if (cn_tab == NULL)
return (0);
int s = splhigh();
for (;;) {
const int rv = (*cn_tab->cn_getc)(cn_tab->cn_dev);
if (rv >= 0) {
splx(s);
return rv;
}
docritpollhooks();
}
}
int
cngetsn(char *cp, int size)
{
char *lp;
int c, len;
cnpollc(1);
lp = cp;
len = 0;
for (;;) {
c = cngetc();
switch (c) {
case '\n':
case '\r':
printf("\n");
*lp++ = '\0';
cnpollc(0);
return (len);
case '\b':
case '\177':
case '#':
if (len) {
--len;
--lp;
printf("\b \b");
}
continue;
case '@':
case 'u'&037: /* CTRL-u */
len = 0;
lp = cp;
printf("\n");
continue;
default:
if (len + 1 >= size || c < ' ') {
printf("\007");
continue;
}
printf("%c", c);
++len;
*lp++ = c;
}
}
}
void
cnputc(int c)
{
if (cn_tab == NULL)
return;
/*
* XXX
* for some reason this causes ARCS firmware to output an endless stream of
* whitespaces with n32 kernels, so use the pre-1.74 code for now until I can
* figure out why this happens
*/
#ifndef sgimips
if (c) {
if (c == '\n') {
(*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
docritpollhooks();
}
(*cn_tab->cn_putc)(cn_tab->cn_dev, c);
}
#else
if (c) {
(*cn_tab->cn_putc)(cn_tab->cn_dev, c);
if (c == '\n') {
docritpollhooks();
(*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
}
}
#endif
}
void
cnpollc(int on)
{
static int refcount = 0;
if (cn_tab == NULL)
return;
if (!on)
--refcount;
if (refcount == 0) {
if (on) {
/*
* Bind to the current CPU by disabling
* preemption (more convenient than finding a
* place to store a stack to unwind for
* curlwp_bind/bindx, and preemption wouldn't
* happen anyway while spinning at high IPL in
* cngetc) so that curcpu() is stable so that
* we can suspend heartbeat checks for it.
*/
kpreempt_disable();
heartbeat_suspend();
}
(*cn_tab->cn_pollc)(cn_tab->cn_dev, on);
if (!on) {
heartbeat_resume();
kpreempt_enable();
}
}
if (on)
++refcount;
}
void
nullcnpollc(dev_t dev, int on)
{
}
void
cnbell(u_int pitch, u_int period, u_int volume)
{
if (cn_tab == NULL || cn_tab->cn_bell == NULL)
return;
(*cn_tab->cn_bell)(cn_tab->cn_dev, pitch, period, volume);
}
void
cnflush(void)
{
if (cn_tab == NULL || cn_tab->cn_flush == NULL)
return;
(*cn_tab->cn_flush)(cn_tab->cn_dev);
}
void
cnhalt(void)
{
if (cn_tab == NULL || cn_tab->cn_halt == NULL)
return;
(*cn_tab->cn_halt)(cn_tab->cn_dev);
}
/*
* Redirect output, if that's appropriate. If there's no real console,
* return ENXIO.
*/
static bool
cn_redirect(dev_t *devp, int is_read, int *error, struct tty **ctpp)
{
dev_t dev = *devp;
struct tty *ctp;
int s;
bool ok = false;
*error = ENXIO;
*ctpp = NULL;
s = pserialize_read_enter();
if ((ctp = atomic_load_consume(&constty)) != NULL &&
minor(dev) == 0 &&
(cn_tab == NULL || (cn_tab->cn_pri != CN_REMOTE))) {
if (is_read) {
*error = 0;
goto out;
}
tty_acquire(ctp);
*ctpp = ctp;
dev = ctp->t_dev;
} else if (cn_tab == NULL)
goto out;
else
dev = cn_tab->cn_dev;
ok = true;
*devp = dev;
out: pserialize_read_exit(s);
return ok;
}
static void
cn_release(struct tty *ctp)
{
if (ctp == NULL)
return;
tty_release(ctp);
}
MODULE(MODULE_CLASS_DRIVER, cons, NULL);
static int
cons_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
mutex_init(&cn_lock, MUTEX_DEFAULT, IPL_NONE);
return 0;
case MODULE_CMD_FINI:
mutex_destroy(&cn_lock);
return 0;
default:
return ENOTTY;
}
}
/* $NetBSD: kern_mod_80.c,v 1.6 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* System calls relating to loadable modules.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mod_80.c,v 1.6 2019/12/12 02:15:42 pgoyette Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_modular.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kobj.h>
#include <sys/module.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/compat_stub.h>
#include <compat/sys/module.h>
#include <compat/common/compat_mod.h>
static int
compat_80_modstat(int cmd, struct iovec *iov, void *arg)
{
omodstat_t *oms, *omso;
modinfo_t *mi;
module_t *mod;
vaddr_t addr;
size_t size;
size_t omslen;
size_t used;
int error;
int omscnt;
bool stataddr;
const char *suffix = "...";
if (cmd != MODCTL_OSTAT)
return EINVAL;
error = copyin(arg, iov, sizeof(*iov));
if (error != 0) {
return error;
}
/* If not privileged, don't expose kernel addresses. */
error = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE,
0, (void *)(uintptr_t)MODCTL_STAT, NULL, NULL);
stataddr = (error == 0);
kernconfig_lock();
omscnt = 0;
TAILQ_FOREACH(mod, &module_list, mod_chain) {
omscnt++;
mi = mod->mod_info;
}
TAILQ_FOREACH(mod, &module_builtins, mod_chain) {
omscnt++;
mi = mod->mod_info;
}
omslen = omscnt * sizeof(omodstat_t);
omso = kmem_zalloc(omslen, KM_SLEEP);
oms = omso;
TAILQ_FOREACH(mod, &module_list, mod_chain) {
mi = mod->mod_info;
strlcpy(oms->oms_name, mi->mi_name, sizeof(oms->oms_name));
if (mi->mi_required != NULL) {
used = strlcpy(oms->oms_required, mi->mi_required,
sizeof(oms->oms_required));
if (used >= sizeof(oms->oms_required)) {
oms->oms_required[sizeof(oms->oms_required) -
strlen(suffix) - 1] = '\0';
strlcat(oms->oms_required, suffix,
sizeof(oms->oms_required));
}
}
if (mod->mod_kobj != NULL && stataddr) {
kobj_stat(mod->mod_kobj, &addr, &size);
oms->oms_addr = addr;
oms->oms_size = size;
}
oms->oms_class = mi->mi_class;
oms->oms_refcnt = mod->mod_refcnt;
oms->oms_source = mod->mod_source;
oms->oms_flags = mod->mod_flags;
oms++;
}
TAILQ_FOREACH(mod, &module_builtins, mod_chain) {
mi = mod->mod_info;
strlcpy(oms->oms_name, mi->mi_name, sizeof(oms->oms_name));
if (mi->mi_required != NULL) {
used = strlcpy(oms->oms_required, mi->mi_required,
sizeof(oms->oms_required));
if (used >= sizeof(oms->oms_required)) {
oms->oms_required[sizeof(oms->oms_required) -
strlen(suffix) - 1] = '\0';
strlcat(oms->oms_required, suffix,
sizeof(oms->oms_required));
}
}
if (mod->mod_kobj != NULL && stataddr) {
kobj_stat(mod->mod_kobj, &addr, &size);
oms->oms_addr = addr;
oms->oms_size = size;
}
oms->oms_class = mi->mi_class;
oms->oms_refcnt = -1;
KASSERT(mod->mod_source == MODULE_SOURCE_KERNEL);
oms->oms_source = mod->mod_source;
oms++;
}
kernconfig_unlock();
error = copyout(omso, iov->iov_base, uimin(omslen, iov->iov_len));
kmem_free(omso, omslen);
if (error == 0) {
iov->iov_len = omslen;
error = copyout(iov, arg, sizeof(*iov));
}
return error;
}
void
kern_mod_80_init(void)
{
MODULE_HOOK_SET(compat_modstat_80_hook, compat_80_modstat);
}
void
kern_mod_80_fini(void)
{
MODULE_HOOK_UNSET(compat_modstat_80_hook);
}
/* $NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $ */
/*-
* Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_dtrace.h"
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/sched.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/rndsource.h>
#include <sys/heartbeat.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
#include <sys/cpu.h>
cyclic_clock_func_t cyclic_clock_func[MAXCPUS];
#endif
static int sysctl_kern_clockrate(SYSCTLFN_PROTO);
/*
* Clock handling routines.
*
* This code is written to operate with two timers that run independently of
* each other. The main clock, running hz times per second, is used to keep
* track of real time. The second timer handles kernel and user profiling,
* and does resource use estimation. If the second timer is programmable,
* it is randomized to avoid aliasing between the two clocks. For example,
* the randomization prevents an adversary from always giving up the CPU
* just before its quantum expires. Otherwise, it would never accumulate
* CPU ticks. The mean frequency of the second timer is stathz.
*
* If no second timer exists, stathz will be zero; in this case we drive
* profiling and statistics off the main clock. This WILL NOT be accurate;
* do not do it unless absolutely necessary.
*
* The statistics clock may (or may not) be run at a higher rate while
* profiling. This profile clock runs at profhz. We require that profhz
* be an integral multiple of stathz.
*
* If the statistics clock is running fast, it must be divided by the ratio
* profhz/stathz for statistics. (For profiling, every tick counts.)
*/
int stathz;
int profhz;
int profsrc;
int schedhz;
int profprocs;
static int hardclock_ticks;
static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */
static int psdiv; /* prof => stat divider */
int psratio; /* ratio: prof / stat */
struct clockrnd {
struct krndsource source;
unsigned needed;
};
static struct clockrnd hardclockrnd __aligned(COHERENCY_UNIT);
static struct clockrnd statclockrnd __aligned(COHERENCY_UNIT);
static void
clockrnd_get(size_t needed, void *cookie)
{
struct clockrnd *C = cookie;
/* Start sampling. */
atomic_store_relaxed(&C->needed, 2*NBBY*needed);
}
static void
clockrnd_sample(struct clockrnd *C)
{
struct cpu_info *ci = curcpu();
/* If there's nothing needed right now, stop here. */
if (__predict_true(atomic_load_relaxed(&C->needed) == 0))
return;
/*
* If we're not the primary core of a package, we're probably
* driven by the same clock as the primary core, so don't
* bother.
*/
if (ci != ci->ci_package1st)
return;
/* Take a sample and enter it into the pool. */
rnd_add_uint32(&C->source, 0);
/*
* On the primary CPU, count down. Using an atomic decrement
* here isn't really necessary -- on every platform we care
* about, stores to unsigned int are atomic, and the only other
* memory operation that could happen here is for another CPU
* to store a higher value for needed. But using an atomic
* decrement avoids giving the impression of data races, and is
* unlikely to hurt because only one CPU will ever be writing
* to the location.
*/
if (CPU_IS_PRIMARY(curcpu())) {
unsigned needed __diagused;
needed = atomic_dec_uint_nv(&C->needed);
KASSERT(needed != UINT_MAX);
}
}
static u_int get_intr_timecount(struct timecounter *);
static struct timecounter intr_timecounter = {
.tc_get_timecount = get_intr_timecount,
.tc_poll_pps = NULL,
.tc_counter_mask = ~0u,
.tc_frequency = 0,
.tc_name = "clockinterrupt",
/* quality - minimum implementation level for a clock */
.tc_quality = 0,
.tc_priv = NULL,
};
static u_int
get_intr_timecount(struct timecounter *tc)
{
return (u_int)getticks();
}
int
getticks(void)
{
return atomic_load_relaxed(&hardclock_ticks);
}
/*
* Initialize clock frequencies and start both clocks running.
*/
void
initclocks(void)
{
static struct sysctllog *clog;
int i;
/*
* Set divisors to 1 (normal case) and let the machine-specific
* code do its bit.
*/
psdiv = 1;
/*
* Call cpu_initclocks() before registering the default
* timecounter, in case it needs to adjust hz.
*/
const int old_hz = hz;
cpu_initclocks();
if (old_hz != hz) {
tick = 1000000 / hz;
tickadj = (240000 / (60 * hz)) ? (240000 / (60 * hz)) : 1;
}
/*
* Provide a minimum default time counter; it will only run at
* interrupt resolution.
*/
intr_timecounter.tc_frequency = hz;
tc_init(&intr_timecounter);
/*
* Compute profhz and stathz, fix profhz if needed.
*/
i = stathz ? stathz : hz;
if (profhz == 0)
profhz = i;
psratio = profhz / i;
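/*
 * Worked example (illustrative): with stathz == 0 and hz == 100,
 * i == 100 and profhz defaults to 100, so psratio == 1. With
 * stathz == 128 and profhz == 1024, psratio == 8.
 */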
if (schedhz == 0) {
/* 16Hz is best */
hardscheddiv = hz / 16;
if (hardscheddiv <= 0)
panic("hardscheddiv");
}
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "clockrate",
SYSCTL_DESCR("Kernel clock rates"),
sysctl_kern_clockrate, 0, NULL,
sizeof(struct clockinfo),
CTL_KERN, KERN_CLOCKRATE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "hardclock_ticks",
SYSCTL_DESCR("Number of hardclock ticks"),
NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks),
CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL);
rndsource_setcb(&hardclockrnd.source, clockrnd_get, &hardclockrnd);
rnd_attach_source(&hardclockrnd.source, "hardclock", RND_TYPE_SKEW,
RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|RND_FLAG_HASCB);
if (stathz) {
rndsource_setcb(&statclockrnd.source, clockrnd_get,
&statclockrnd);
rnd_attach_source(&statclockrnd.source, "statclock",
RND_TYPE_SKEW,
(RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|
RND_FLAG_HASCB));
}
}
/*
* The real-time timer, interrupting hz times per second.
*/
void
hardclock(struct clockframe *frame)
{
struct lwp *l;
struct cpu_info *ci;
clockrnd_sample(&hardclockrnd);
ci = curcpu();
l = ci->ci_onproc;
ptimer_tick(l, CLKF_USERMODE(frame));
/*
* If no separate statistics clock is available, run it from here.
*/
if (stathz == 0)
statclock(frame);
/*
* If no separate schedclock is provided, call it here
* at about 16 Hz.
*/
if (schedhz == 0) {
if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
schedclock(l);
ci->ci_schedstate.spc_schedticks = hardscheddiv;
}
}
if ((--ci->ci_schedstate.spc_ticks) <= 0)
sched_tick(ci);
if (CPU_IS_PRIMARY(ci)) {
atomic_store_relaxed(&hardclock_ticks,
atomic_load_relaxed(&hardclock_ticks) + 1);
tc_ticktock();
}
/*
* Make sure the CPUs and timecounter are making progress.
*/
heartbeat();
/*
* Update real-time timeout queue.
*/
callout_hardclock();
}
/*
* Start profiling on a process.
*
* Kernel profiling passes proc0 which never exits and hence
* keeps the profile clock running constantly.
*/
void
startprofclock(struct proc *p)
{
KASSERT(mutex_owned(&p->p_stmutex));
if ((p->p_stflag & PST_PROFIL) == 0) {
p->p_stflag |= PST_PROFIL;
/*
* This is only necessary if using the clock as the
* profiling source.
*/
if (++profprocs == 1 && stathz != 0)
psdiv = psratio;
}
}
/*
* Stop profiling on a process.
*/
void
stopprofclock(struct proc *p)
{
KASSERT(mutex_owned(&p->p_stmutex));
if (p->p_stflag & PST_PROFIL) {
p->p_stflag &= ~PST_PROFIL;
/*
* This is only necessary if using the clock as the
* profiling source.
*/
if (--profprocs == 0 && stathz != 0)
psdiv = 1;
}
}
void
schedclock(struct lwp *l)
{
if ((l->l_flag & LW_IDLE) != 0)
return;
sched_schedclock(l);
}
/*
* Statistics clock. Grab profile sample, and if divider reaches 0,
* do process and kernel statistics.
*/
void
statclock(struct clockframe *frame)
{
#ifdef GPROF
struct gmonparam *g;
intptr_t i;
#endif
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
struct proc *p;
struct lwp *l;
if (stathz)
clockrnd_sample(&statclockrnd);
/*
* Notice changes in divisor frequency, and adjust clock
* frequency accordingly.
*/
if (spc->spc_psdiv != psdiv) {
spc->spc_psdiv = psdiv;
spc->spc_pscnt = psdiv;
if (psdiv == 1) {
setstatclockrate(stathz);
} else {
setstatclockrate(profhz);
}
}
l = ci->ci_onproc;
if ((l->l_flag & LW_IDLE) != 0) {
/*
* don't account idle lwps as swapper.
*/
p = NULL;
} else {
p = l->l_proc;
mutex_spin_enter(&p->p_stmutex);
}
if (CLKF_USERMODE(frame)) {
KASSERT(p != NULL);
if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
addupc_intr(l, CLKF_PC(frame));
if (--spc->spc_pscnt > 0) {
mutex_spin_exit(&p->p_stmutex);
return;
}
/*
* Came from user mode; CPU was in user state.
* If this process is being profiled record the tick.
*/
p->p_uticks++;
if (p->p_nice > NZERO)
spc->spc_cp_time[CP_NICE]++;
else
spc->spc_cp_time[CP_USER]++;
} else {
#ifdef GPROF
/*
* Kernel statistics are just like addupc_intr, only easier.
*/
#if defined(MULTIPROCESSOR) && !defined(_RUMPKERNEL)
g = curcpu()->ci_gmon;
if (g != NULL &&
profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#else
g = &_gmonparam;
if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#endif
i = CLKF_PC(frame) - g->lowpc;
if (i < g->textsize) {
i /= HISTFRACTION * sizeof(*g->kcount);
g->kcount[i]++;
}
}
#endif
#ifdef LWP_PC
if (p != NULL && profsrc == PROFSRC_CLOCK &&
(p->p_stflag & PST_PROFIL)) {
addupc_intr(l, LWP_PC(l));
}
#endif
if (--spc->spc_pscnt > 0) {
if (p != NULL)
mutex_spin_exit(&p->p_stmutex);
return;
}
/*
* Came from kernel mode, so we were:
* - handling an interrupt,
* - doing syscall or trap work on behalf of the current
* user process, or
* - spinning in the idle loop.
* Whichever it is, charge the time as appropriate.
* Note that we charge interrupts to the current process,
* regardless of whether they are ``for'' that process,
* so that we know how much of its real time was spent
* in ``non-process'' (i.e., interrupt) work.
*/
if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
if (p != NULL) {
p->p_iticks++;
}
spc->spc_cp_time[CP_INTR]++;
} else if (p != NULL) {
p->p_sticks++;
spc->spc_cp_time[CP_SYS]++;
} else {
spc->spc_cp_time[CP_IDLE]++;
}
}
spc->spc_pscnt = psdiv;
if (p != NULL) {
atomic_inc_uint(&l->l_cpticks);
mutex_spin_exit(&p->p_stmutex);
}
#ifdef KDTRACE_HOOKS
cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)];
if (func) {
(*func)((struct clockframe *)frame);
}
#endif
}
/*
* sysctl helper routine for kern.clockrate. Assembles a struct on
* the fly to be returned to the caller.
*/
static int
sysctl_kern_clockrate(SYSCTLFN_ARGS)
{
struct clockinfo clkinfo;
struct sysctlnode node;
clkinfo.tick = tick;
clkinfo.tickadj = tickadj;
clkinfo.hz = hz;
clkinfo.profhz = profhz;
clkinfo.stathz = stathz ? stathz : hz;
node = *rnode;
node.sysctl_data = &clkinfo;
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
/* $NetBSD: tmpfs_subr.c,v 1.117 2023/04/29 08:15:13 riastradh Exp $ */
/*
* Copyright (c) 2005-2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program, and by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Efficient memory file system: interfaces for inode and directory entry
* construction, destruction and manipulation.
*
* Reference counting
*
* The link count of an inode (tmpfs_node_t::tn_links) is used as a
* reference counter. However, it has slightly different semantics.
*
* For directories, the link count represents the directory entries
* which refer to the directory, i.e. the count of sub-directories.
* It also takes into account the virtual '.' entry (which has no
* real entry in the list). For files, the link count represents the
* number of hard links. Since only empty directories can be removed,
* the link count is sufficient for the reference counting needs.
* Note: to check whether a directory is empty, the inode size
* (tmpfs_node_t::tn_size) can be used.
*
* The inode itself, as an object, gathers its first reference when a
* directory entry is attached via tmpfs_dir_attach(9). For instance,
* after a regular tmpfs_create(), a file would have a link count of 1,
* while a directory after tmpfs_mkdir() would have 2 (due to '.').
*
* Reclamation
*
* tmpfs inodes rely on a combination of vnode reference counting and
* link counting. That is, an inode can only be destroyed if its
* associated vnode is inactive. The destruction is done on vnode
* reclamation, i.e. tmpfs_reclaim(). Note that tmpfs_node_t::tn_links
* being 0 is a destruction criterion.
*
* If an inode has references within the file system (tn_links > 0) and
* its inactive vnode gets reclaimed/recycled, then the association is
* broken in tmpfs_reclaim(). In such a case, the inode will always pass
* tmpfs_lookup() and thus vcache_get() to associate a new vnode.
*
* Lock order
*
* vnode_t::v_vlock ->
* vnode_t::v_interlock
*/
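/*
 * Illustrative worked example of the rules above (not part of the original
 * source).  Take an existing directory "D" whose link count is N:
 *
 *	mkdir D/a	->	a->tn_links == 2	('.' plus the entry in D)
 *				D->tn_links == N + 1	(a's '..' reference)
 *	create D/f	->	f->tn_links == 1	(a single hard link)
 *				D->tn_links unchanged
 *	link D/f D/g	->	f->tn_links == 2	(second hard link)
 */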
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.117 2023/04/29 08:15:13 riastradh Exp $");
#include <sys/param.h>
#include <sys/cprng.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <uvm/uvm_aobj.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_fifoops.h>
#include <fs/tmpfs/tmpfs_specops.h>
#include <fs/tmpfs/tmpfs_vnops.h>
static void tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *);
/*
* Initialize vnode with tmpfs node.
*/
static void
tmpfs_init_vnode(struct vnode *vp, tmpfs_node_t *node)
{
krwlock_t *slock;
KASSERT(node->tn_vnode == NULL);
/* Share the interlock with the node. */
	if (node->tn_type == VREG) {
		slock = node->tn_spec.tn_reg.tn_aobj->vmobjlock;
rw_obj_hold(slock);
uvm_obj_setlock(&vp->v_uobj, slock);
}
vp->v_tag = VT_TMPFS;
vp->v_type = node->tn_type;
/* Type-specific initialization. */
switch (vp->v_type) {
case VBLK:
case VCHR:
vp->v_op = tmpfs_specop_p;
spec_node_init(vp, node->tn_spec.tn_dev.tn_rdev);
break;
case VFIFO:
vp->v_op = tmpfs_fifoop_p;
break;
case VDIR:
		if (node->tn_spec.tn_dir.tn_parent == node)
			vp->v_vflag |= VV_ROOT;
/* FALLTHROUGH */
case VLNK:
case VREG:
case VSOCK:
vp->v_op = tmpfs_vnodeop_p;
break;
default:
panic("bad node type %d", vp->v_type);
break;
}
vp->v_data = node;
node->tn_vnode = vp;
uvm_vnp_setsize(vp, node->tn_size);
KASSERT(node->tn_mode != VNOVAL);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
}
/*
* tmpfs_loadvnode: initialise a vnode for a specified inode.
*/
int
tmpfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
tmpfs_node_t *node;
KASSERT(key_len == sizeof(node));
memcpy(&node, key, key_len);
if (node->tn_links == 0)
return ENOENT;
tmpfs_init_vnode(vp, node);
*new_key = &vp->v_data;
return 0;
}
/*
* tmpfs_newvnode: allocate a new inode of a specified type and
* attach the vnode.
*/
int
tmpfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
struct vattr *vap, kauth_cred_t cred, void *extra,
size_t *key_len, const void **new_key)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(mp);
tmpfs_node_t *node, *dnode;
	if (dvp != NULL) {
		KASSERT(VOP_ISLOCKED(dvp));
		dnode = VP_TO_TMPFS_DIR(dvp);
		if (dnode->tn_links == 0)
return ENOENT;
if (vap->va_type == VDIR) {
/* Check for maximum links limit. */
if (dnode->tn_links == LINK_MAX)
return EMLINK;
KASSERT(dnode->tn_links < LINK_MAX);
}
} else
dnode = NULL;
node = tmpfs_node_get(tmp);
if (node == NULL)
return ENOSPC;
/* Initially, no references and no associations. */
node->tn_links = 0;
node->tn_vnode = NULL;
node->tn_holdcount = 0;
node->tn_dirent_hint = NULL;
/*
* XXX Where the pool is backed by a map larger than (4GB *
* sizeof(*node)), this may produce duplicate inode numbers
* for applications that do not understand 64-bit ino_t.
*/
node->tn_id = (ino_t)((uintptr_t)node / sizeof(*node));
/*
* Make sure the generation number is not zero.
* tmpfs_inactive() uses generation zero to mark dead nodes.
*/
do {
node->tn_gen = TMPFS_NODE_GEN_MASK & cprng_fast32();
} while (node->tn_gen == 0);
/* Generic initialization. */
KASSERT((int)vap->va_type != VNOVAL);
node->tn_type = vap->va_type;
node->tn_size = 0;
node->tn_flags = 0;
node->tn_lockf = NULL;
node->tn_tflags = 0;
vfs_timestamp(&node->tn_atime);
node->tn_birthtime = node->tn_atime;
node->tn_ctime = node->tn_atime;
node->tn_mtime = node->tn_atime;
mutex_init(&node->tn_timelock, MUTEX_DEFAULT, IPL_NONE);
if (dvp == NULL) {
KASSERT(vap->va_uid != VNOVAL && vap->va_gid != VNOVAL);
node->tn_uid = vap->va_uid;
node->tn_gid = vap->va_gid;
vp->v_vflag |= VV_ROOT;
} else {
KASSERT(dnode != NULL);
node->tn_uid = kauth_cred_geteuid(cred);
node->tn_gid = dnode->tn_gid;
}
KASSERT(vap->va_mode != VNOVAL);
node->tn_mode = vap->va_mode;
/* Type-specific initialization. */
switch (node->tn_type) {
case VBLK:
case VCHR:
/* Character/block special device. */
KASSERT(vap->va_rdev != VNOVAL);
node->tn_spec.tn_dev.tn_rdev = vap->va_rdev;
break;
case VDIR:
/* Directory. */
TAILQ_INIT(&node->tn_spec.tn_dir.tn_dir);
node->tn_spec.tn_dir.tn_parent = NULL;
node->tn_spec.tn_dir.tn_seq_arena = NULL;
node->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
node->tn_spec.tn_dir.tn_readdir_lastp = NULL;
/* Extra link count for the virtual '.' entry. */
node->tn_links++;
break;
case VFIFO:
case VSOCK:
break;
case VLNK:
node->tn_size = 0;
node->tn_spec.tn_lnk.tn_link = NULL;
break;
case VREG:
/* Regular file. Create an underlying UVM object. */
node->tn_spec.tn_reg.tn_aobj =
uao_create(INT64_MAX - PAGE_SIZE, 0);
node->tn_spec.tn_reg.tn_aobj_pages = 0;
break;
default:
panic("bad node type %d", vp->v_type);
break;
}
tmpfs_init_vnode(vp, node);
mutex_enter(&tmp->tm_lock);
LIST_INSERT_HEAD(&tmp->tm_nodes, node, tn_entries);
mutex_exit(&tmp->tm_lock);
*key_len = sizeof(vp->v_data);
*new_key = &vp->v_data;
return 0;
}
/*
* tmpfs_free_node: remove the inode from a list in the mount point and
* destroy the inode structures.
*/
void
tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node)
{
size_t objsz;
uint32_t hold;
mutex_enter(&tmp->tm_lock);
hold = atomic_or_32_nv(&node->tn_holdcount, TMPFS_NODE_RECLAIMED);
/* Defer destruction to last thread holding this node. */
if (hold != TMPFS_NODE_RECLAIMED) {
mutex_exit(&tmp->tm_lock);
return;
}
LIST_REMOVE(node, tn_entries);
mutex_exit(&tmp->tm_lock);
switch (node->tn_type) {
case VLNK:
if (node->tn_size > 0) {
tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
node->tn_size);
}
break;
case VREG:
/*
* Calculate the size of inode data, decrease the used-memory
* counter, and destroy the underlying UVM object (if any).
*/
objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
if (objsz != 0) {
tmpfs_mem_decr(tmp, objsz);
}
if (node->tn_spec.tn_reg.tn_aobj != NULL) {
uao_detach(node->tn_spec.tn_reg.tn_aobj);
}
break;
case VDIR:
KASSERT(node->tn_size == 0);
KASSERT(node->tn_spec.tn_dir.tn_seq_arena == NULL);
KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
node == tmp->tm_root);
break;
default:
break;
}
KASSERT(node->tn_vnode == NULL);
KASSERT(node->tn_links == 0);
mutex_destroy(&node->tn_timelock);
tmpfs_node_put(tmp, node);
}
/*
* tmpfs_construct_node: allocate a new file of the specified type and
* add it to the parent directory.
*
* => Credentials of the caller are used.
*/
int
tmpfs_construct_node(vnode_t *dvp, vnode_t **vpp, struct vattr *vap,
struct componentname *cnp, char *target)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount);
	tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node;
tmpfs_dirent_t *de, *wde;
char *slink = NULL;
int ssize = 0;
int error;
/* Allocate symlink target. */
	if (target != NULL) {
		KASSERT(vap->va_type == VLNK);
ssize = strlen(target);
		KASSERT(ssize < MAXPATHLEN);
		if (ssize > 0) {
slink = tmpfs_strname_alloc(tmp, ssize);
if (slink == NULL)
return ENOSPC;
memcpy(slink, target, ssize);
}
}
/* Allocate a directory entry that points to the new file. */
error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de);
if (error) {
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
return error;
}
/* Allocate a vnode that represents the new file. */
error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, vpp);
if (error) {
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
		tmpfs_free_dirent(tmp, de);
return error;
}
error = vn_lock(*vpp, LK_EXCLUSIVE);
if (error) {
vrele(*vpp);
*vpp = NULL;
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
		tmpfs_free_dirent(tmp, de);
return error;
}
	node = VP_TO_TMPFS_NODE(*vpp);
	if (slink != NULL) {
		node->tn_spec.tn_lnk.tn_link = slink;
node->tn_size = ssize;
}
/* Remove whiteout before adding the new entry. */
if (cnp->cn_flags & ISWHITEOUT) {
wde = tmpfs_dir_lookup(dnode, cnp);
KASSERT(wde != NULL && wde->td_node == TMPFS_NODE_WHITEOUT);
tmpfs_dir_detach(dnode, wde);
tmpfs_free_dirent(tmp, wde);
}
/* Associate inode and attach the entry into the directory. */
tmpfs_dir_attach(dnode, de, node);
/* Make node opaque if requested. */
	if (cnp->cn_flags & ISWHITEOUT)
		node->tn_flags |= UF_OPAQUE;
/* Update the parent's timestamps. */
tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME);
VOP_UNLOCK(*vpp);
cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
return 0;
}
/*
* tmpfs_alloc_dirent: allocates a new directory entry for the inode.
* The directory entry contains a path name component.
*/
int
tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len,
tmpfs_dirent_t **de)
{
tmpfs_dirent_t *nde;
nde = tmpfs_dirent_get(tmp);
if (nde == NULL)
return ENOSPC;
nde->td_name = tmpfs_strname_alloc(tmp, len);
if (nde->td_name == NULL) {
tmpfs_dirent_put(tmp, nde);
return ENOSPC;
}
nde->td_namelen = len;
memcpy(nde->td_name, name, len);
nde->td_seq = TMPFS_DIRSEQ_NONE;
nde->td_node = NULL; /* for asserts */
*de = nde;
return 0;
}
/*
* tmpfs_free_dirent: free a directory entry.
*/
void
tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de)
{
	KASSERT(de->td_node == NULL);
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
tmpfs_dirent_put(tmp, de);
}
/*
* tmpfs_dir_attach: associate directory entry with a specified inode,
* and attach the entry into the directory, specified by vnode.
*
* => Increases link count on the associated node.
* => Increases link count on directory node if our node is VDIR.
* => It is caller's responsibility to check for the LINK_MAX limit.
* => Triggers kqueue events here.
*/
void
tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node)
{
vnode_t *dvp = dnode->tn_vnode;
int events = NOTE_WRITE;
	KASSERT(dvp != NULL);
	KASSERT(VOP_ISLOCKED(dvp));
/* Get a new sequence number. */
KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
de->td_seq = tmpfs_dir_getseq(dnode, de);
/* Associate directory entry and the inode. */
de->td_node = node;
if (node != TMPFS_NODE_WHITEOUT) {
KASSERT(node->tn_links < LINK_MAX);
node->tn_links++;
/* Save the hint (might overwrite). */
node->tn_dirent_hint = de;
} else if ((dnode->tn_gen & TMPFS_WHITEOUT_BIT) == 0) {
/* Flag that there are whiteout entries. */
atomic_or_32(&dnode->tn_gen, TMPFS_WHITEOUT_BIT);
}
/* Insert the entry to the directory (parent of inode). */
	TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
	KASSERT(dnode->tn_size <= __type_max(off_t) - sizeof(tmpfs_dirent_t));
dnode->tn_size += sizeof(tmpfs_dirent_t);
uvm_vnp_setsize(dvp, dnode->tn_size);
if (node != TMPFS_NODE_WHITEOUT && node->tn_type == VDIR) {
/* Set parent. */
KASSERT(node->tn_spec.tn_dir.tn_parent == NULL);
node->tn_spec.tn_dir.tn_parent = dnode;
/* Increase the link count of parent. */
KASSERT(dnode->tn_links < LINK_MAX);
dnode->tn_links++;
events |= NOTE_LINK;
TMPFS_VALIDATE_DIR(node);
}
}
/*
* tmpfs_dir_detach: disassociate directory entry and its inode,
* and detach the entry from the directory, specified by vnode.
*
* => Decreases link count on the associated node.
* => Decreases the link count on directory node, if our node is VDIR.
* => Triggers kqueue events here.
*
* => Note: dvp and vp may be NULL only if called by tmpfs_unmount().
*/
void
tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
tmpfs_node_t *node = de->td_node;
vnode_t *dvp = dnode->tn_vnode;
KASSERT(dvp == NULL || VOP_ISLOCKED(dvp));
if (__predict_true(node != TMPFS_NODE_WHITEOUT)) {
/* Deassociate the inode and entry. */
node->tn_dirent_hint = NULL;
KASSERT(node->tn_links > 0);
node->tn_links--;
/* If directory - decrease the link count of parent. */
if (node->tn_type == VDIR) {
KASSERT(node->tn_spec.tn_dir.tn_parent == dnode);
node->tn_spec.tn_dir.tn_parent = NULL;
KASSERT(dnode->tn_links > 0);
dnode->tn_links--;
}
}
de->td_node = NULL;
/* Remove the entry from the directory. */
if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
}
TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
KASSERT(dnode->tn_size >= sizeof(tmpfs_dirent_t));
dnode->tn_size -= sizeof(tmpfs_dirent_t);
tmpfs_dir_putseq(dnode, de);
if (dvp) {
uvm_vnp_setsize(dvp, dnode->tn_size);
}
}
/*
* tmpfs_dir_lookup: find a directory entry in the specified inode.
*
* Note that the . and .. components are not allowed as they do not
* physically exist within directories.
*/
tmpfs_dirent_t *
tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp)
{
const char *name = cnp->cn_nameptr;
const uint16_t nlen = cnp->cn_namelen;
tmpfs_dirent_t *de;
	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	KASSERT(nlen != 1 || !(name[0] == '.'));
	KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.'));
	TMPFS_VALIDATE_DIR(node);
	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		if (de->td_namelen != nlen)
continue;
if (memcmp(de->td_name, name, nlen) != 0)
continue;
break;
}
return de;
}
/*
* tmpfs_dir_cached: get a cached directory entry if it is valid. Used to
* avoid unnecessary tmpfs_dir_lookup().
*
* => The vnode must be locked.
*/
tmpfs_dirent_t *
tmpfs_dir_cached(tmpfs_node_t *node)
{
tmpfs_dirent_t *de = node->tn_dirent_hint;
KASSERT(VOP_ISLOCKED(node->tn_vnode));
if (de == NULL) {
return NULL;
}
KASSERT(de->td_node == node);
/*
* Directories always have a valid hint. For files, check if there
* are any hard links. If there are, the hint might be invalid.
*/
return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de;
}
/*
* tmpfs_dir_getseq: get a per-directory sequence number for the entry.
*
* => Shall not be larger than 2^31 for linux32 compatibility.
*/
uint32_t
tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
uint32_t seq = de->td_seq;
vmem_t *seq_arena;
vmem_addr_t off;
int error __diagused;
TMPFS_VALIDATE_DIR(dnode);
if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) {
/* Already set. */
KASSERT(seq >= TMPFS_DIRSEQ_START);
return seq;
}
/*
* The "." and ".." and the end-of-directory have reserved numbers.
* The other sequence numbers are allocated as follows:
*
* - The first half of the 2^31 is assigned incrementally.
*
* - If that range is exceeded, then the second half of 2^31
* is used, but managed by vmem(9).
*/
seq = dnode->tn_spec.tn_dir.tn_next_seq;
	KASSERT(seq >= TMPFS_DIRSEQ_START);
	if (__predict_true(seq < TMPFS_DIRSEQ_END)) {
/* First half: just increment and return. */
dnode->tn_spec.tn_dir.tn_next_seq++;
return seq;
}
/*
* First half exceeded, use the second half. May need to create
* vmem(9) arena for the directory first.
*/
if ((seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena) == NULL) {
seq_arena = vmem_create("tmpfscoo", 0,
TMPFS_DIRSEQ_END - 1, 1, NULL, NULL, NULL, 0,
VM_SLEEP, IPL_NONE);
dnode->tn_spec.tn_dir.tn_seq_arena = seq_arena;
KASSERT(seq_arena != NULL);
}
error = vmem_alloc(seq_arena, 1, VM_SLEEP | VM_BESTFIT, &off);
	KASSERT(error == 0);
	KASSERT(off < TMPFS_DIRSEQ_END);
seq = off | TMPFS_DIRSEQ_END;
return seq;
}
static void
tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
vmem_t *seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena;
uint32_t seq = de->td_seq;
TMPFS_VALIDATE_DIR(dnode);
if (seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END) {
/* First half (or no sequence number set yet). */
KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
} else {
/* Second half. */
KASSERT(seq_arena != NULL);
KASSERT(seq >= TMPFS_DIRSEQ_END);
seq &= ~TMPFS_DIRSEQ_END;
vmem_free(seq_arena, seq, 1);
}
de->td_seq = TMPFS_DIRSEQ_NONE;
/* Empty? We can reset. */
if (seq_arena && dnode->tn_size == 0) {
dnode->tn_spec.tn_dir.tn_seq_arena = NULL;
dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
vmem_destroy(seq_arena);
}
}
/*
* tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number.
*/
tmpfs_dirent_t *
tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq)
{
tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp;
TMPFS_VALIDATE_DIR(node);
/*
* First, check the cache. If it does not match, perform a lookup.
*/
	if (de && de->td_seq == seq) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
return de;
}
	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
		if (de->td_seq == seq)
return de;
}
return NULL;
}
/*
* tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the
* dot meta entries, that is, "." or "..", and copy the entry to the UIO space.
*/
static int
tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio)
{
tmpfs_dirent_t *de;
off_t next = 0;
int error;
switch (uio->uio_offset) {
case TMPFS_DIRSEQ_DOT:
dp->d_fileno = node->tn_id;
strlcpy(dp->d_name, ".", sizeof(dp->d_name));
next = TMPFS_DIRSEQ_DOTDOT;
break;
case TMPFS_DIRSEQ_DOTDOT:
dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
strlcpy(dp->d_name, "..", sizeof(dp->d_name));
de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
break;
default:
KASSERT(false);
}
dp->d_type = DT_DIR;
dp->d_namlen = strlen(dp->d_name);
dp->d_reclen = _DIRENT_SIZE(dp);
if (dp->d_reclen > uio->uio_resid) {
return EJUSTRETURN;
}
if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
return error;
}
uio->uio_offset = next;
return error;
}
/*
* tmpfs_dir_getdents: helper function for tmpfs_readdir.
*
* => Returns as many directory entries as can fit in the uio space.
* => The read starts at uio->uio_offset.
*/
int
tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio, off_t *cntp)
{
tmpfs_dirent_t *de;
struct dirent dent;
int error = 0;
	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	TMPFS_VALIDATE_DIR(node);
/*
* First check for the "." and ".." cases.
* Note: tmpfs_dir_getdotents() will "seek" for us.
*/
memset(&dent, 0, sizeof(dent));
	if (uio->uio_offset == TMPFS_DIRSEQ_DOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
goto done;
}
(*cntp)++;
}
	if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
goto done;
}
(*cntp)++;
}
/* Done if we reached the end. */
if (uio->uio_offset == TMPFS_DIRSEQ_EOF) {
goto done;
}
/* Locate the directory entry given by the given sequence number. */
de = tmpfs_dir_lookupbyseq(node, uio->uio_offset);
if (de == NULL) {
error = EINVAL;
goto done;
}
/*
* Read as many entries as possible; i.e., until we reach the end
* of the directory or we exhaust UIO space.
*/
do {
if (de->td_node == TMPFS_NODE_WHITEOUT) {
dent.d_fileno = 1;
dent.d_type = DT_WHT;
} else {
dent.d_fileno = de->td_node->tn_id;
dent.d_type = vtype2dt(de->td_node->tn_type);
}
dent.d_namlen = de->td_namelen;
KASSERT(de->td_namelen < sizeof(dent.d_name));
memcpy(dent.d_name, de->td_name, de->td_namelen);
dent.d_name[de->td_namelen] = '\0';
dent.d_reclen = _DIRENT_SIZE(&dent);
if (dent.d_reclen > uio->uio_resid) {
/* Exhausted UIO space. */
error = EJUSTRETURN;
break;
}
/* Copy out the directory entry and continue. */
error = uiomove(&dent, dent.d_reclen, uio);
if (error) {
break;
}
(*cntp)++;
de = TAILQ_NEXT(de, td_entries);
} while (uio->uio_resid > 0 && de);
/* Cache the last entry or clear and mark EOF. */
uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
node->tn_spec.tn_dir.tn_readdir_lastp = de;
done:
tmpfs_update(node->tn_vnode, TMPFS_UPDATE_ATIME);
if (error == EJUSTRETURN) {
/* Exhausted UIO space - just return. */
error = 0;
}
KASSERT(error >= 0);
return error;
}
/*
* tmpfs_reg_resize: resize the underlying UVM object associated with the
* specified regular file.
*/
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize)
{
tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount);
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
size_t newpages, oldpages;
off_t oldsize;
KASSERT(vp->v_type == VREG);
KASSERT(newsize >= 0);
if (newsize > __type_max(off_t) - PAGE_SIZE + 1)
return EFBIG;
oldsize = node->tn_size;
oldpages = round_page(oldsize) >> PAGE_SHIFT;
newpages = round_page(newsize) >> PAGE_SHIFT;
KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);
if (newsize == oldsize) {
return 0;
}
if (newpages > oldpages) {
/* Increase the used-memory counter if getting extra pages. */
if (!tmpfs_mem_incr(tmp, (newpages - oldpages) << PAGE_SHIFT)) {
return ENOSPC;
}
} else if (newsize < oldsize) {
size_t zerolen;
zerolen = MIN(round_page(newsize), node->tn_size) - newsize;
ubc_zerorange(uobj, newsize, zerolen, UBC_VNODE_FLAGS(vp));
}
node->tn_spec.tn_reg.tn_aobj_pages = newpages;
node->tn_size = newsize;
uvm_vnp_setsize(vp, newsize);
/*
* Free "backing store".
*/
if (newpages < oldpages) {
rw_enter(uobj->vmobjlock, RW_WRITER);
uao_dropswap_range(uobj, newpages, oldpages);
rw_exit(uobj->vmobjlock);
/* Decrease the used-memory counter. */
tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
}
return 0;
}
/*
* tmpfs_chflags: change flags of the given vnode.
*/
int
tmpfs_chflags(vnode_t *vp, int flags, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS;
int error;
bool changing_sysflags = false;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/*
* If the new flags have non-user flags that are different from
* those on the node, we need special permission to change them.
*/
if ((flags & SF_SETTABLE) != (node->tn_flags & SF_SETTABLE)) {
action |= KAUTH_VNODE_WRITE_SYSFLAGS;
changing_sysflags = true;
}
/*
* Indicate that this node's flags have system attributes in them if
* that's the case.
*/
if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND)) {
action |= KAUTH_VNODE_HAS_SYSFLAGS;
}
error = kauth_authorize_vnode(cred, action, vp, NULL,
genfs_can_chflags(vp, cred, node->tn_uid, changing_sysflags));
if (error)
return error;
/*
* Set the flags. If we're not setting non-user flags, be careful not
* to overwrite them.
*
* XXX: Can't we always assign here? If the system flags are different,
* the code above should catch attempts to change them without
* proper permissions, and if we're here it means it's okay to
* change them...
*/
if (!changing_sysflags) {
/* Clear all user-settable flags and re-set them. */
node->tn_flags &= SF_SETTABLE;
node->tn_flags |= (flags & UF_SETTABLE);
} else {
node->tn_flags = flags;
}
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
return 0;
}
/*
* tmpfs_chmod: change access mode on the given vnode.
*/
int
tmpfs_chmod(vnode_t *vp, mode_t mode, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp,
NULL, genfs_can_chmod(vp, cred, node->tn_uid, node->tn_gid, mode));
if (error) {
return error;
}
node->tn_mode = (mode & ALLPERMS);
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
return 0;
}
/*
* tmpfs_chown: change ownership of the given vnode.
*
* => At least one of uid or gid must be different from VNOVAL.
* => The attribute is left unchanged in the VNOVAL case.
*/
int
tmpfs_chown(vnode_t *vp, uid_t uid, gid_t gid, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Assign default values if they are unknown. */
KASSERT(uid != VNOVAL || gid != VNOVAL);
if (uid == VNOVAL) {
uid = node->tn_uid;
}
if (gid == VNOVAL) {
gid = node->tn_gid;
}
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp,
NULL, genfs_can_chown(vp, cred, node->tn_uid, node->tn_gid, uid,
gid));
if (error) {
return error;
}
node->tn_uid = uid;
node->tn_gid = gid;
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
return 0;
}
/*
* tmpfs_chsize: change size of the given vnode.
*/
int
tmpfs_chsize(vnode_t *vp, u_quad_t size, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
const off_t length = size;
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Decide whether this is a valid operation based on the file type. */
switch (vp->v_type) {
case VDIR:
return EISDIR;
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY) {
return EROFS;
}
break;
case VBLK:
case VCHR:
case VFIFO:
/*
* Allow modifications of special files even if the file
* system is mounted read-only (we are not modifying the
* files themselves, but the objects they represent).
*/
return 0;
default:
return EOPNOTSUPP;
}
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND)) {
return EPERM;
}
if (length < 0) {
return EINVAL;
}
/* Note: tmpfs_reg_resize() will raise NOTE_EXTEND and NOTE_ATTRIB. */
if (node->tn_size != length &&
(error = tmpfs_reg_resize(vp, length)) != 0) {
return error;
}
tmpfs_update(vp, TMPFS_UPDATE_CTIME | TMPFS_UPDATE_MTIME);
return 0;
}
/*
* tmpfs_chtimes: change access and modification times for vnode.
*/
int
tmpfs_chtimes(vnode_t *vp, const struct timespec *atime,
const struct timespec *mtime, const struct timespec *btime,
int vaflags, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL,
genfs_can_chtimes(vp, cred, node->tn_uid, vaflags));
if (error)
return error;
mutex_enter(&node->tn_timelock);
if (atime->tv_sec != VNOVAL) {
atomic_and_uint(&node->tn_tflags, ~TMPFS_UPDATE_ATIME);
node->tn_atime = *atime;
}
if (mtime->tv_sec != VNOVAL) {
atomic_and_uint(&node->tn_tflags, ~TMPFS_UPDATE_MTIME);
node->tn_mtime = *mtime;
}
if (btime->tv_sec != VNOVAL) {
node->tn_birthtime = *btime;
}
mutex_exit(&node->tn_timelock);
return 0;
}
/*
* tmpfs_update_locked: update the timestamps as indicated by the flags.
*/
void
tmpfs_update_locked(vnode_t *vp, unsigned tflags)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
struct timespec nowtm;
	KASSERT(mutex_owned(&node->tn_timelock));
	if ((tflags |= atomic_swap_uint(&node->tn_tflags, 0)) == 0) {
return;
}
vfs_timestamp(&nowtm);
	if (tflags & TMPFS_UPDATE_ATIME) {
		node->tn_atime = nowtm;
	}
	if (tflags & TMPFS_UPDATE_MTIME) {
		node->tn_mtime = nowtm;
	}
	if (tflags & TMPFS_UPDATE_CTIME) {
		node->tn_ctime = nowtm;
	}
}
/*
* tmpfs_update: update the timestamps as indicated by the flags.
*/
void
tmpfs_update(vnode_t *vp, unsigned tflags)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	if ((tflags | atomic_load_relaxed(&node->tn_tflags)) == 0) {
return;
}
mutex_enter(&node->tn_timelock);
tmpfs_update_locked(vp, tflags);
mutex_exit(&node->tn_timelock);
}
/*
* tmpfs_update_lazily: schedule a deferred timestamp update.
*/
void
tmpfs_update_lazily(vnode_t *vp, unsigned tflags)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
unsigned cur;
cur = atomic_load_relaxed(&node->tn_tflags);
if ((cur & tflags) != tflags) {
atomic_or_uint(&node->tn_tflags, tflags);
return;
}
}
/* $NetBSD: kern_ras.c,v 1.42 2022/08/08 22:31:45 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Gregory McGarry, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_ras.c,v 1.42 2022/08/08 22:31:45 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/ras.h>
#include <sys/xcall.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_extern.h>
#define MAX_RAS_PER_PROC 16
u_int ras_per_proc = MAX_RAS_PER_PROC;
#ifdef DEBUG
int ras_debug = 0;
#define DPRINTF(x) if (ras_debug) printf x
#else
#define DPRINTF(x) /* nothing */
#endif
/*
* Force all CPUs through cpu_switchto(), waiting until complete.
* Context switching will drain the write buffer on the calling
* CPU.
*/
static void
ras_sync(void)
{
/* No need to sync if exiting or single threaded. */
	if (curproc->p_nlwps > 1 && ncpu > 1) {
		xc_barrier(0);
}
}
/*
* Check the specified address to see if it is within the
* sequence. If it is found, we return the restart address,
* otherwise we return -1. If we do perform a restart, we
* mark the sequence as hit.
*
* No locking required: we disable preemption and ras_sync()
* guarantees that individual entries are valid while we still
* have visibility of them.
*/
void *
ras_lookup(struct proc *p, void *addr)
{
struct ras *rp;
void *startaddr;
lwp_t *l;
startaddr = (void *)-1;
l = curlwp;
KPREEMPT_DISABLE(l);
	for (rp = p->p_raslist; rp != NULL; rp = rp->ras_next) {
		if (addr > rp->ras_startaddr && addr < rp->ras_endaddr) {
startaddr = rp->ras_startaddr;
DPRINTF(("RAS hit: p=%p %p\n", p, addr));
break;
}
}
KPREEMPT_ENABLE(l);
return startaddr;
}
/*
* During a fork, we copy all of the sequences from parent p1 to
* the child p2.
*
* No locking required as the parent must be paused.
*/
int
ras_fork(struct proc *p1, struct proc *p2)
{
struct ras *rp, *nrp;
for (rp = p1->p_raslist; rp != NULL; rp = rp->ras_next) {
nrp = kmem_alloc(sizeof(*nrp), KM_SLEEP);
nrp->ras_startaddr = rp->ras_startaddr;
nrp->ras_endaddr = rp->ras_endaddr;
nrp->ras_next = p2->p_raslist;
p2->p_raslist = nrp;
}
DPRINTF(("ras_fork: p1=%p, p2=%p\n", p1, p2));
return 0;
}
/*
* Nuke all sequences for this process.
*/
int
ras_purgeall(void)
{
struct ras *rp, *nrp;
proc_t *p;
p = curproc;
if (p->p_raslist == NULL)
return 0;
mutex_enter(&p->p_auxlock);
if ((rp = p->p_raslist) != NULL) {
p->p_raslist = NULL;
ras_sync();
for(; rp != NULL; rp = nrp) {
nrp = rp->ras_next;
kmem_free(rp, sizeof(*rp));
}
}
mutex_exit(&p->p_auxlock);
return 0;
}
#if defined(__HAVE_RAS)
/*
* Install the new sequence. If it already exists, return
* an error.
*/
static int
ras_install(void *addr, size_t len)
{
struct ras *rp;
struct ras *newrp;
void *endaddr;
int nras, error;
proc_t *p;
if (len == 0)
return EINVAL;
if ((uintptr_t)addr < VM_MIN_ADDRESS ||
(uintptr_t)addr > VM_MAXUSER_ADDRESS)
return EINVAL;
if (len > VM_MAXUSER_ADDRESS - (uintptr_t)addr)
return EINVAL;
endaddr = (char *)addr + len;
newrp = kmem_alloc(sizeof(*newrp), KM_SLEEP);
newrp->ras_startaddr = addr;
newrp->ras_endaddr = endaddr;
error = 0;
nras = 0;
p = curproc;
mutex_enter(&p->p_auxlock);
	for (rp = p->p_raslist; rp != NULL; rp = rp->ras_next) {
		if (++nras >= ras_per_proc) {
error = EINVAL;
break;
}
if (addr < rp->ras_endaddr && endaddr > rp->ras_startaddr) {
error = EEXIST;
break;
}
}
if (rp == NULL) {
newrp->ras_next = p->p_raslist;
p->p_raslist = newrp;
ras_sync();
mutex_exit(&p->p_auxlock);
} else {
mutex_exit(&p->p_auxlock);
kmem_free(newrp, sizeof(*newrp));
}
return error;
}
/*
* Nuke the specified sequence. Both address and len must
* match, otherwise we return an error.
*/
static int
ras_purge(void *addr, size_t len)
{
struct ras *rp, **link;
proc_t *p;
p = curproc;
mutex_enter(&p->p_auxlock);
link = &p->p_raslist;
	for (rp = *link; rp != NULL; link = &rp->ras_next, rp = *link) {
		if (addr == rp->ras_startaddr &&
(char *)rp->ras_endaddr - (char *)rp->ras_startaddr == len)
break;
}
if (rp != NULL) {
*link = rp->ras_next;
ras_sync();
mutex_exit(&p->p_auxlock);
kmem_free(rp, sizeof(*rp));
return 0;
} else {
mutex_exit(&p->p_auxlock);
return ESRCH;
}
}
#endif /* defined(__HAVE_RAS) */
/*ARGSUSED*/
int
sys_rasctl(struct lwp *l, const struct sys_rasctl_args *uap, register_t *retval)
{
#if defined(__HAVE_RAS)
/* {
syscallarg(void *) addr;
syscallarg(size_t) len;
syscallarg(int) op;
} */
void *addr;
size_t len;
int op;
int error;
/*
* first, extract syscall args from the uap.
*/
addr = (void *)SCARG(uap, addr);
len = (size_t)SCARG(uap, len);
op = SCARG(uap, op);
DPRINTF(("sys_rasctl: p=%p addr=%p, len=%ld, op=0x%x\n",
curproc, addr, (long)len, op));
switch (op) {
case RAS_INSTALL:
error = ras_install(addr, len);
break;
case RAS_PURGE:
error = ras_purge(addr, len);
break;
case RAS_PURGE_ALL:
error = ras_purgeall();
break;
default:
error = EINVAL;
break;
}
return (error);
#else
return (EOPNOTSUPP);
#endif
}
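/*
 * Illustrative userland sketch (not part of the original source): how the
 * rasctl(2) operations dispatched above might be driven.  "ras_start" and
 * "ras_len" are hypothetical placeholders for a machine-dependent code
 * range (typically obtained with the RAS_* macros described in rasctl(2)).
 */
#if 0
#include <sys/types.h>
#include <sys/ras.h>
#include <err.h>
static void
example_ras(void *ras_start, size_t ras_len)
{
	/* Register the restartable atomic sequence with the kernel. */
	if (rasctl(ras_start, ras_len, RAS_INSTALL) == -1)
		err(1, "rasctl(RAS_INSTALL)");
	/* ... run code that enters the registered sequence ... */
	/* Remove this sequence; RAS_PURGE_ALL would remove every one. */
	if (rasctl(ras_start, ras_len, RAS_PURGE) == -1)
		err(1, "rasctl(RAS_PURGE)");
}
#endif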
/* $NetBSD: sysv_sem_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $ */
/*-
* Copyright (c) 1999 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_sem_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/proc.h>
#include <sys/sem.h>
#ifndef SYSVSEM
#define SYSVSEM
#endif
#include <sys/syscallargs.h>
#include <compat/sys/sem.h>
int
compat_50_sys_____semctl13(struct lwp *l, const struct compat_50_sys_____semctl13_args *uap, register_t *retval)
{
/* {
syscallarg(int) semid;
syscallarg(int) semnum;
syscallarg(int) cmd;
syscallarg(union __semun *) arg;
} */
union __semun arg;
struct semid_ds sembuf;
struct semid_ds13 osembuf;
int cmd, error;
void *pass_arg;
cmd = SCARG(uap, cmd);
pass_arg = get_semctl_arg(cmd, &sembuf, &arg);
if (pass_arg != NULL) {
error = copyin(SCARG(uap, arg), &arg, sizeof(arg));
if (error)
return (error);
if (cmd == IPC_SET) {
error = copyin(arg.buf, &osembuf, sizeof(osembuf));
if (error)
return (error);
__semid_ds13_to_native(&osembuf, &sembuf);
}
}
error = semctl1(l, SCARG(uap, semid), SCARG(uap, semnum), cmd,
pass_arg, retval);
	if (error == 0 && cmd == IPC_STAT) {
		__native_to_semid_ds13(&sembuf, &osembuf);
error = copyout(&osembuf, arg.buf, sizeof(osembuf));
}
return (error);
}
/* $NetBSD: overlay_vfsops.c,v 1.73 2022/11/04 11:20:39 hannken Exp $ */
/*
* Copyright (c) 1999, 2000 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Overlay Layer
* (See overlay_vnops.c for a description of what this does.)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: overlay_vfsops.c,v 1.73 2022/11/04 11:20:39 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/module.h>
#include <miscfs/overlay/overlay.h>
#include <miscfs/genfs/layer_extern.h>
MODULE(MODULE_CLASS_VFS, overlay, "layerfs");
VFS_PROTOS(ov);
#define NOVERLAYNODECACHE 16
/*
* Mount overlay layer
*/
int
ov_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0;
struct overlay_args *args = data;
struct vnode *lowerrootvp, *vp;
struct overlay_mount *nmp;
struct layer_mount *lmp;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_mount(mp = %p)\n", mp);
#endif
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
lmp = MOUNTTOLAYERMOUNT(mp);
if (lmp == NULL)
return EIO;
args->la.target = NULL;
*data_len = sizeof *args;
return 0;
}
/*
* Update is not supported
*/
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/*
* Find lower node
*/
lowerrootvp = mp->mnt_vnodecovered;
vref(lowerrootvp);
if ((error = vn_lock(lowerrootvp, LK_EXCLUSIVE))) {
vrele(lowerrootvp);
return (error);
}
/*
* First cut at fixing up upper mount point
*/
nmp = kmem_zalloc(sizeof(struct overlay_mount), KM_SLEEP);
mp->mnt_data = nmp;
/*
* Make sure that the mount point is sufficiently initialized
* that the node create call will work.
*/
vfs_getnewfsid(mp);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(nmp, sizeof(struct overlay_mount));
return error;
}
nmp->ovm_size = sizeof (struct overlay_node);
nmp->ovm_tag = VT_OVERLAY;
nmp->ovm_bypass = layer_bypass;
nmp->ovm_vnodeop_p = overlay_vnodeop_p;
/*
* Fix up overlay node for root vnode
*/
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
/*
* Make sure the fixup worked
*/
if (error) {
vrele(lowerrootvp);
kmem_free(nmp, sizeof(struct overlay_mount));
return error;
}
/*
* Keep a held reference to the root vnode.
* It is vrele'd in ov_unmount.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
nmp->ovm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->la.target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error)
return error;
	if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
		mp->mnt_flag |= MNT_LOCAL;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_mount: lower %s, alias at %s\n",
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
return 0;
}
/*
* Free reference to overlay layer
*/
int
ov_unmount(struct mount *mp, int mntflags)
{
struct vnode *overlay_rootvp = MOUNTTOOVERLAYMOUNT(mp)->ovm_rootvp;
struct overlay_mount *omp;
int error;
int flags = 0;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_unmount(mp = %p)\n", mp);
#endif
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(overlay_rootvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, overlay_rootvp, flags)) != 0)
return (error);
#ifdef OVERLAYFS_DIAGNOSTIC
vprint("alias root of lower", overlay_rootvp);
#endif
/*
* Blow it away for future re-use
*/
vgone(overlay_rootvp);
/*
* Finally, throw away the overlay_mount structure
*/
omp = mp->mnt_data;
kmem_free(omp, sizeof(struct overlay_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc overlay_vnodeop_opv_desc;
const struct vnodeopv_desc * const ov_vnodeopv_descs[] = {
&overlay_vnodeop_opv_desc,
NULL,
};
struct vfsops overlay_vfsops = {
.vfs_name = MOUNT_OVERLAY,
.vfs_min_mount_data = sizeof (struct overlay_args),
.vfs_mount = ov_mount,
.vfs_start = layerfs_start,
.vfs_unmount = ov_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = ov_vnodeopv_descs
};
SYSCTL_SETUP(overlay_sysctl_setup, "overlay fs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "overlay",
SYSCTL_DESCR("Overlay file system"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
}
static int
overlay_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&overlay_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&overlay_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: vfs_getcwd.c,v 1.61 2021/06/29 22:39:21 dholland Exp $ */
/*-
* Copyright (c) 1999, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Bill Sommerfeld.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_getcwd.c,v 1.61 2021/06/29 22:39:21 dholland Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/dirent.h>
#include <sys/kauth.h>
#include <ufs/ufs/dir.h> /* XXX only for DIRBLKSIZ */
#include <sys/syscallargs.h>
/*
* Vnode variable naming conventions in this file:
*
* rvp: the current root we're aiming towards.
* lvp, *lvpp: the "lower" vnode
* uvp, *uvpp: the "upper" vnode.
*
* Since all the vnodes we're dealing with are directories, and the
* lookups are going *up* in the filesystem rather than *down*, the
* usual "pvp" (parent) or "dvp" (directory) naming conventions are
* too confusing.
*/
/*
* XXX Will infinite loop in certain cases if a directory read reliably
* returns EINVAL on last block.
* XXX is EINVAL the right thing to return if a directory is malformed?
*/
/*
* XXX Untested vs. mount -o union; probably does the wrong thing.
*/
/*
* Find the parent vnode of lvp and return it in *uvpp.
*
* If we care about the name, scan it looking for name of directory
* entry pointing at lvp.
*
* Place the name in the buffer which starts at bufp, immediately
* before *bpp, and move bpp backwards to point at the start of it.
*
* On entry, lvp is a locked, referenced vnode; on exit it has been
* unlocked (the caller's reference remains).
* On exit, *uvpp is either NULL or an unlocked, referenced vnode.
*/
static int
getcwd_scandir(struct vnode *lvp, struct vnode **uvpp, char **bpp,
char *bufp, struct lwp *l)
{
int error = 0;
int eofflag;
off_t off;
int tries;
struct uio uio;
struct iovec iov;
char *dirbuf = NULL;
int dirbuflen;
ino_t fileno;
struct vattr va;
struct vnode *uvp = NULL;
kauth_cred_t cred = l->l_cred;
struct componentname cn;
int len, reclen;
tries = 0;
/* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */
KASSERT(VOP_ISLOCKED(lvp) == LK_EXCLUSIVE);
/*
* If we want the filename, get some info we need while the
* current directory is still locked.
*/
if (bufp != NULL) {
error = VOP_GETATTR(lvp, &va, cred);
		if (error) {
			VOP_UNLOCK(lvp);
*uvpp = NULL;
return error;
}
}
/*
* Ok, we have to do it the hard way..
* Next, get parent vnode using lookup of ..
*/
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY;
cn.cn_cred = cred;
cn.cn_nameptr = "..";
cn.cn_namelen = 2;
/* At this point, lvp is locked */
error = VOP_LOOKUP(lvp, uvpp, &cn);
VOP_UNLOCK(lvp);
if (error) {
*uvpp = NULL;
return error;
}
uvp = *uvpp;
/* If we don't care about the pathname, we're done */
if (bufp == NULL) {
return 0;
}
fileno = va.va_fileid;
/* I guess UFS_DIRBLKSIZ is a good guess at a good size to use? */
dirbuflen = UFS_DIRBLKSIZ;
if (dirbuflen < va.va_blocksize)
dirbuflen = va.va_blocksize;
dirbuf = kmem_alloc(dirbuflen, KM_SLEEP);
/* Now lvp is unlocked, try to lock uvp */
error = vn_lock(uvp, LK_SHARED);
if (error) {
vrele(uvp);
*uvpp = NULL;
return error;
}
#if 0
unionread:
#endif
off = 0;
do {
/* call VOP_READDIR of parent */
iov.iov_base = dirbuf;
iov.iov_len = dirbuflen;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = off;
uio.uio_resid = dirbuflen;
uio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&uio);
eofflag = 0;
error = VOP_READDIR(uvp, &uio, cred, &eofflag, 0, 0);
off = uio.uio_offset;
/*
* Try again if NFS tosses its cookies.
* XXX this can still loop forever if the directory is busted
* such that the second or subsequent page of it always
* returns EINVAL
*/
if ((error == EINVAL) && (tries < 3)) {
off = 0;
tries++;
continue; /* once more, with feeling */
}
if (!error) {
char *cpos;
struct dirent *dp;
cpos = dirbuf;
tries = 0;
/* scan directory page looking for matching vnode */
for (len = (dirbuflen - uio.uio_resid); len > 0;
len -= reclen) {
dp = (struct dirent *) cpos;
reclen = dp->d_reclen;
/* check for malformed directory.. */
if (reclen < _DIRENT_MINSIZE(dp) ||
reclen > len) {
error = EINVAL;
goto out;
}
/*
* XXX should perhaps do VOP_LOOKUP to
* check that we got back to the right place,
* but getting the locking games for that
* right would be heinous.
*/
if ((dp->d_type != DT_WHT) &&
(dp->d_fileno == fileno)) {
char *bp = *bpp;
bp -= dp->d_namlen;
if (bp <= bufp) {
error = ERANGE;
goto out;
}
memcpy(bp, dp->d_name, dp->d_namlen);
error = 0;
*bpp = bp;
goto out;
}
cpos += reclen;
}
} else
goto out;
} while (!eofflag);
#if 0
/*
* Deal with mount -o union, which unions only the
* root directory of the mount.
*/
if ((uvp->v_vflag & VV_ROOT) &&
(uvp->v_mount->mnt_flag & MNT_UNION)) {
struct vnode *tvp = uvp;
uvp = uvp->v_mount->mnt_vnodecovered;
vput(tvp);
vref(uvp);
*uvpp = uvp;
vn_lock(uvp, LK_SHARED | LK_RETRY);
goto unionread;
}
#endif
error = ENOENT;
out:
VOP_UNLOCK(uvp);
kmem_free(dirbuf, dirbuflen);
return error;
}
/*
* common routine shared by sys___getcwd() and vn_isunder()
*/
int
getcwd_common(struct vnode *lvp, struct vnode *rvp, char **bpp, char *bufp,
int limit, int flags, struct lwp *l)
{
struct cwdinfo *cwdi = l->l_proc->p_cwdi;
kauth_cred_t cred = l->l_cred;
struct vnode *uvp = NULL;
char *bp = NULL;
int error;
accmode_t accmode = VEXEC;
error = 0;
	if (rvp == NULL) {
		rvp = cwdi->cwdi_rdir;
if (rvp == NULL)
rvp = rootvnode;
}
vref(rvp);
vref(lvp);
/*
* Error handling invariant:
* Before a `goto out':
* lvp is either NULL, or held.
* uvp is either NULL, or held.
*/
if (bufp)
bp = *bpp;
/*
* this loop will terminate when one of the following happens:
* - we hit the root
* - getdirentries or lookup fails
* - we run out of space in the buffer.
*/
	if (lvp == rvp) {
		if (bp)
			*(--bp) = '/';
goto out;
}
do {
/*
* access check here is optional, depending on
* whether or not caller cares.
*/
int chkaccess = (flags & GETCWD_CHECK_ACCESS);
bool locked = false;
/*
* step up if we're a covered vnode..
* check access on the first vnode only.
*/
if (lvp->v_vflag & VV_ROOT) {
vn_lock(lvp, LK_SHARED | LK_RETRY);
if (chkaccess) {
error = VOP_ACCESS(lvp, accmode, cred);
if (error) {
VOP_UNLOCK(lvp);
goto out;
}
chkaccess = 0;
}
while (lvp->v_vflag & VV_ROOT) {
struct vnode *tvp;
if (lvp == rvp) {
VOP_UNLOCK(lvp);
goto out;
}
tvp = lvp->v_mount->mnt_vnodecovered;
/*
* hodie natus est radici frater ("today a brother is born to the root")
*/
if (tvp == NULL) {
VOP_UNLOCK(lvp);
error = ENOENT;
goto out;
}
vref(tvp);
vput(lvp);
lvp = tvp;
			if (lvp->v_vflag & VV_ROOT)
				vn_lock(lvp, LK_SHARED | LK_RETRY);
}
}
/* Do we need to check access to the directory? */
if (chkaccess && !cache_have_id(lvp)) {
/* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_ACCESS(lvp, accmode, cred);
if (error) {
VOP_UNLOCK(lvp);
goto out;
}
chkaccess = 0;
locked = true;
}
/*
* Look in the name cache; if that fails, look in the
* directory..
*/
error = cache_revlookup(lvp, &uvp, &bp, bufp, chkaccess,
accmode);
if (error == -1) {
if (!locked) {
locked = true;
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
}
if (lvp->v_type != VDIR) {
VOP_UNLOCK(lvp);
error = ENOTDIR;
goto out;
}
error = getcwd_scandir(lvp, &uvp, &bp, bufp, l);
/* lvp now unlocked */
} else if (locked) {
VOP_UNLOCK(lvp);
}
if (error)
goto out;
#if DIAGNOSTIC
if (bufp && (bp <= bufp)) {
panic("getcwd: oops, went back too far");
}
#endif
accmode = VEXEC | VREAD;
		if (bp)
			*(--bp) = '/';
vrele(lvp);
lvp = uvp;
uvp = NULL;
limit--;
} while ((lvp != rvp) && (limit > 0));
out:
if (bpp)
*bpp = bp;
	if (uvp)
		vrele(uvp);
	if (lvp)
		vrele(lvp);
vrele(rvp);
return error;
}
/*
* Check if one directory can be found inside another in the directory
* hierarchy.
*
* Intended to be used in chroot, chdir, fchdir, etc., to ensure that
* chroot() actually means something.
*/
int
vn_isunder(struct vnode *lvp, struct vnode *rvp, struct lwp *l)
{
int error;
error = getcwd_common(lvp, rvp, NULL, NULL, MAXPATHLEN / 2, 0, l);
if (!error)
return 1;
else
return 0;
}
/*
* Returns true if proc p1's root directory is equal to or under p2's
* root directory.
*
* Intended to be used from ptrace/procfs sorts of things.
*/
int
proc_isunder(struct proc *p1, struct lwp *l2)
{
struct vnode *r1 = p1->p_cwdi->cwdi_rdir;
struct vnode *r2 = l2->l_proc->p_cwdi->cwdi_rdir;
if (r1 == NULL)
return (r2 == NULL);
else if (r2 == NULL)
return 1;
else
return vn_isunder(r1, r2, l2);
}
/*
* Find pathname of process's current directory.
*
* Use vfs vnode-to-name reverse cache; if that fails, fall back
* to reading directory contents.
*/
int
sys___getcwd(struct lwp *l, const struct sys___getcwd_args *uap, register_t *retval)
{
/* {
syscallarg(char *) bufp;
syscallarg(size_t) length;
} */
int error;
char *path;
char *bp, *bend;
int len = SCARG(uap, length);
int lenused;
struct cwdinfo *cwdi;
if (len > MAXPATHLEN * 4)
len = MAXPATHLEN * 4;
else if (len < 2)
return ERANGE;
path = kmem_alloc(len, KM_SLEEP);
bp = &path[len];
bend = bp;
*(--bp) = '\0';
/*
* 5th argument here is "max number of vnodes to traverse".
* Since each entry takes up at least 2 bytes in the output buffer,
* limit it to N/2 vnodes for an N byte buffer.
*/
cwdi = l->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path,
len/2, GETCWD_CHECK_ACCESS, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto out;
lenused = bend - bp;
*retval = lenused;
/* put the result into user buffer */
error = copyout(bp, SCARG(uap, bufp), lenused);
out:
kmem_free(path, len);
return error;
}
/*
* Try to find a pathname for a vnode. Since there is no mapping vnode ->
* parent directory, this needs the namecache to succeed. Caller holds a
* reference to the vnode.
*/
int
vnode_to_path(char *path, size_t len, struct vnode *vp, struct lwp *curl,
struct proc *p)
{
struct proc *curp = curl->l_proc;
int error, lenused, elen;
char *bp, *bend;
struct vnode *dvp;
KASSERT(vrefcnt(vp) > 0);
bp = bend = &path[len];
*(--bp) = '\0';
error = cache_revlookup(vp, &dvp, &bp, path, false, 0);
if (error != 0)
return (error == -1 ? ENOENT : error);
*(--bp) = '/';
error = getcwd_common(dvp, NULL, &bp, path, len / 2,
GETCWD_CHECK_ACCESS, curl);
vrele(dvp);
if (error != 0)
return error;
/*
* Strip off emulation path for emulated processes looking at
* the maps file of a process of the same emulation. (Won't
* work if /emul/xxx is a symlink..)
*/
if (curp->p_emul == p->p_emul && curp->p_emul->e_path != NULL) {
elen = strlen(curp->p_emul->e_path);
if (!strncmp(bp, curp->p_emul->e_path, elen))
bp = &bp[elen];
}
lenused = bend - bp;
memcpy(path, bp, lenused);
path[lenused] = '\0';
return 0;
}
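/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * caller resolving a vnode back to a pathname for diagnostic output.
 * The stack buffer and printf() here are assumptions made only for the
 * example; a real caller would likely allocate the buffer.
 */
#if 0
static void
report_vnode_path(struct vnode *vp, struct lwp *l)
{
	char buf[MAXPATHLEN];
	int error;

	/* vp must already be referenced, as vnode_to_path() requires. */
	error = vnode_to_path(buf, sizeof(buf), vp, l, l->l_proc);
	if (error == 0)
		printf("vnode %p resolves to %s\n", vp, buf);
	else
		printf("vnode %p: name not recoverable (error %d)\n",
		    vp, error);
}
#endif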
/* $NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Sleep queue implementation, used by turnstiles and general sleep/wakeup
* interfaces.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* for sleepq_abort:
* During autoconfiguration or after a panic, a sleep will simply lower the
* priority briefly to allow interrupts, then return. The priority to be
* used (IPL_SAFEPRI) is machine-dependent, thus this value is initialized and
* maintained in the machine-dependent layers. This priority will typically
* be 0, or the lowest priority that is safe for use on the interrupt stack;
* it can be made higher to block network software interrupts after panics.
*/
#ifndef IPL_SAFEPRI
#define IPL_SAFEPRI 0
#endif
static int sleepq_sigtoerror(lwp_t *, int);
/* General purpose sleep table, used by mtsleep() and condition variables. */
sleeptab_t sleeptab __cacheline_aligned;
sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE] __cacheline_aligned;
/*
* sleeptab_init:
*
* Initialize a sleep table.
*/
void
sleeptab_init(sleeptab_t *st)
{
static bool again;
int i;
for (i = 0; i < SLEEPTAB_HASH_SIZE; i++) {
if (!again) {
mutex_init(&sleepq_locks[i].lock, MUTEX_DEFAULT,
IPL_SCHED);
}
sleepq_init(&st->st_queue[i]);
}
again = true;
}
/*
* sleepq_init:
*
* Prepare a sleep queue for use.
*/
void
sleepq_init(sleepq_t *sq)
{
LIST_INIT(sq);
}
/*
* sleepq_remove:
*
* Remove an LWP from a sleep queue and wake it up. Distinguish
* between deliberate wakeups (which convey valuable information) and
* "unsleep" (where an out-of-band action must be taken).
*
* For a wakeup, convert any interruptible wait into a non-interruptible
* one before waking the LWP. Otherwise, if only one LWP is awoken it
* could fail to do something useful with the wakeup due to an error
* return and the caller of e.g. cv_signal() may not expect this.
*/
void
sleepq_remove(sleepq_t *sq, lwp_t *l, bool wakeup)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_NULL) == 0) {
KASSERT(sq != NULL);
LIST_REMOVE(l, l_sleepchain);
} else {
KASSERT(sq == NULL);
}
l->l_syncobj = &sched_syncobj;
l->l_wchan = NULL;
l->l_sleepq = NULL;
l->l_flag &= wakeup ? ~(LW_SINTR|LW_CATCHINTR|LW_STIMO) : ~LW_SINTR;
ci = l->l_cpu;
spc = &ci->ci_schedstate;
/*
* If not sleeping, the LWP must have been suspended. Let whoever
* holds it stopped set it running again.
*/
if (l->l_stat != LSSLEEP) {
KASSERT(l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED);
lwp_setlock(l, spc->spc_lwplock);
return;
}
/*
* If the LWP is still on the CPU, mark it as LSONPROC. It may be
* about to call mi_switch(), in which case it will yield.
*/
if ((l->l_pflag & LP_RUNNING) != 0) {
l->l_stat = LSONPROC;
l->l_slptime = 0;
lwp_setlock(l, spc->spc_lwplock);
return;
}
/* Update sleep time delta, call the wake-up handler of scheduler */
l->l_slpticksum += (getticks() - l->l_slpticks);
sched_wakeup(l);
/* Look for a CPU to wake up */
l->l_cpu = sched_takecpu(l);
ci = l->l_cpu;
spc = &ci->ci_schedstate;
/*
* Set it running.
*/
spc_lock(ci);
lwp_setlock(l, spc->spc_mutex);
sched_setrunnable(l);
l->l_stat = LSRUN;
l->l_slptime = 0;
sched_enqueue(l);
sched_resched_lwp(l, true);
/* LWP & SPC now unlocked, but we still hold sleep queue lock. */
}
/*
* sleepq_insert:
*
* Insert an LWP into the sleep queue, optionally sorting by priority.
*/
static void
sleepq_insert(sleepq_t *sq, lwp_t *l, syncobj_t *sobj)
{
if ((sobj->sobj_flag & SOBJ_SLEEPQ_NULL) != 0) {
KASSERT(sq == NULL);
return;
}
KASSERT(sq != NULL);
if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) {
lwp_t *l2, *l_last = NULL;
const pri_t pri = lwp_eprio(l);
LIST_FOREACH(l2, sq, l_sleepchain) {
l_last = l2;
if (lwp_eprio(l2) < pri) {
LIST_INSERT_BEFORE(l2, l, l_sleepchain);
return;
}
}
/*
* Ensure FIFO ordering if no waiters are of lower priority.
*/
if (l_last != NULL) {
LIST_INSERT_AFTER(l_last, l, l_sleepchain);
return;
}
}
LIST_INSERT_HEAD(sq, l, l_sleepchain);
}
/*
* sleepq_enter:
*
* Prepare to block on a sleep queue, after which any interlock can be
* safely released.
*/
int
sleepq_enter(sleepq_t *sq, lwp_t *l, kmutex_t *mp)
{
int nlocks;
KASSERT((sq != NULL) == (mp != NULL));
/*
* Acquire the per-LWP mutex and lend it our sleep queue lock.
* Once interlocked, we can release the kernel lock.
*/
lwp_lock(l);
if (mp != NULL) {
lwp_unlock_to(l, mp);
}
if (__predict_false((nlocks = l->l_blcnt) != 0)) {
KERNEL_UNLOCK_ALL(NULL, NULL);
}
return nlocks;
}
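/*
 * Illustrative sketch (not part of the original source): the canonical
 * enter/enqueue/block sequence described in the comments above, roughly
 * as a higher-level primitive would use it.  The parameters (sleep queue,
 * its spin lock, wait channel, sync object, interlock) are placeholders;
 * the sleep queue lock is assumed to be held already on entry.
 */
#if 0
static int
example_wait(kmutex_t *interlock, sleepq_t *sq, kmutex_t *sq_lock,
    wchan_t wchan, syncobj_t *sobj, int timo)
{
	lwp_t *l = curlwp;
	int nlocks, error;

	/* Lend the LWP our sleep queue lock; may drop the kernel lock. */
	nlocks = sleepq_enter(sq, l, sq_lock);

	/* Put the LWP on the queue and mark it LSSLEEP. */
	sleepq_enqueue(sq, wchan, "example", sobj, true);

	/* With the LWP interlocked, the caller's lock can be released. */
	mutex_exit(interlock);

	/* Switch away; returns once woken, interrupted or timed out. */
	error = sleepq_block(timo, true, sobj, nlocks);

	mutex_enter(interlock);
	return error;
}
#endif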
/*
* sleepq_enqueue:
*
* Enter an LWP into the sleep queue and prepare for sleep. The sleep
* queue must already be locked, and any interlock (such as the kernel
* lock) must have been released (see sleeptab_lookup(), sleepq_enter()).
*/
void
sleepq_enqueue(sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj,
bool catch_p)
{
lwp_t *l = curlwp;
KASSERT(lwp_locked(l, NULL));
KASSERT(l->l_stat == LSONPROC);
KASSERT(l->l_wchan == NULL);
KASSERT(l->l_sleepq == NULL);
KASSERT((l->l_flag & LW_SINTR) == 0);
l->l_syncobj = sobj;
l->l_wchan = wchan;
l->l_sleepq = sq;
l->l_wmesg = wmesg;
l->l_slptime = 0;
l->l_stat = LSSLEEP;
if (catch_p)
l->l_flag |= LW_SINTR;
sleepq_insert(sq, l, sobj);
/* Record the time at which the thread went to sleep */
l->l_slpticks = getticks();
sched_slept(l);
}
/*
* sleepq_transfer:
*
* Move an LWP from one sleep queue to another. Both sleep queues
* must already be locked.
*
* The LWP will be updated with the new sleepq, wchan, wmesg,
* sobj, and mutex. The interruptible flag will also be updated.
*/
void
sleepq_transfer(lwp_t *l, sleepq_t *from_sq, sleepq_t *sq, wchan_t wchan,
const char *wmesg, syncobj_t *sobj, kmutex_t *mp, bool catch_p)
{
KASSERT(l->l_sleepq == from_sq);
LIST_REMOVE(l, l_sleepchain);
l->l_syncobj = sobj;
l->l_wchan = wchan;
l->l_sleepq = sq;
l->l_wmesg = wmesg;
if (catch_p)
l->l_flag = LW_SINTR | LW_CATCHINTR;
else
l->l_flag = ~(LW_SINTR | LW_CATCHINTR);
/*
* This allows the transfer from one sleepq to another where
* it is known that they're both protected by the same lock.
*/
if (mp != NULL)
lwp_setlock(l, mp);
sleepq_insert(sq, l, sobj);
}
/*
* sleepq_uncatch:
*
* Mark the LWP as no longer sleeping interruptibly.
*/
void
sleepq_uncatch(lwp_t *l)
{
l->l_flag &= ~(LW_SINTR | LW_CATCHINTR | LW_STIMO);
}
/*
* sleepq_block:
*
* After any intermediate step such as releasing an interlock, switch.
* sleepq_block() may return early under exceptional conditions, for
* example if the LWP's containing process is exiting.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
sleepq_block(int timo, bool catch_p, syncobj_t *syncobj, int nlocks)
{
const int mask = LW_CANCELLED|LW_WEXIT|LW_WCORE|LW_PENDSIG;
int error = 0, sig, flag;
struct proc *p;
lwp_t *l = curlwp;
bool early = false;
ktrcsw(1, 0, syncobj);
/*
* If sleeping interruptibly, check for pending signals, exits or
* core dump events.
*
* Note the usage of LW_CATCHINTR. This expresses our intent
* to catch or not catch sleep interruptions, which might change
* while we are sleeping. It is independent from LW_SINTR because
* we don't want to leave LW_SINTR set when the LWP is not asleep.
*/
if (catch_p) {
if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) {
l->l_flag &= ~LW_CANCELLED;
error = EINTR;
early = true;
} else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0))
early = true;
l->l_flag |= LW_CATCHINTR;
} else
l->l_flag &= ~LW_CATCHINTR;
if (early) {
/* lwp_unsleep() will release the lock */
lwp_unsleep(l, true);
} else {
/*
* The LWP may have already been awoken if the caller
* dropped the sleep queue lock between sleepq_enqueue() and
* sleepq_block(). If that happens l_stat will be LSONPROC
* and mi_switch() will treat this as a preemption. No need
* to do anything special here.
*/
if (timo) {
l->l_flag &= ~LW_STIMO;
callout_schedule(&l->l_timeout_ch, timo);
}
l->l_boostpri = l->l_syncobj->sobj_boostpri;
spc_lock(l->l_cpu);
mi_switch(l);
/* The LWP and sleep queue are now unlocked. */
if (timo) {
/*
* Even if the callout appears to have fired, we
* need to stop it in order to synchronise with
* other CPUs. It's important that we do this in
* this LWP's context, and not during wakeup, in
* order to keep the callout & its cache lines
* co-located on the CPU with the LWP.
*/
(void)callout_halt(&l->l_timeout_ch, NULL);
error = (l->l_flag & LW_STIMO) ? EWOULDBLOCK : 0;
}
}
/*
* LW_CATCHINTR is only modified in this function OR when we
* are asleep (with the sleepq locked). We can therefore safely
* test it unlocked here as it is guaranteed to be stable by
* virtue of us running.
*
* We do not bother clearing it if set; that would require us
* to take the LWP lock, and it doesn't seem worth the hassle
* considering it is only meaningful here inside this function,
* and is set to reflect intent upon entry.
*/
flag = atomic_load_relaxed(&l->l_flag);
if (__predict_false((flag & mask) != 0)) {
if ((flag & LW_CATCHINTR) == 0 || error != 0)
/* nothing */;
else if ((flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0)
error = EINTR;
else if ((flag & LW_PENDSIG) != 0) {
/*
* Acquiring p_lock may cause us to recurse
* through the sleep path and back into this
* routine, but is safe because LWPs sleeping
* on locks are non-interruptible and we will
* not recurse again.
*/
p = l->l_proc;
mutex_enter(p->p_lock);
if (((sig = sigispending(l, 0)) != 0 && (sigprop[sig] & SA_STOP) == 0) ||
(sig = issignal(l)) != 0)
error = sleepq_sigtoerror(l, sig);
mutex_exit(p->p_lock);
}
}
ktrcsw(0, 0, syncobj);
if (__predict_false(nlocks != 0)) {
KERNEL_LOCK(nlocks, NULL);
}
return error;
}
/*
* sleepq_wake:
*
* Wake zero or more LWPs blocked on a single wait channel.
*/
void
sleepq_wake(sleepq_t *sq, wchan_t wchan, u_int expected, kmutex_t *mp)
{
lwp_t *l, *next;
KASSERT(mutex_owned(mp));
for (l = LIST_FIRST(sq); l != NULL; l = next) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
next = LIST_NEXT(l, l_sleepchain);
if (l->l_wchan != wchan)
continue;
sleepq_remove(sq, l, true);
if (--expected == 0)
break;
}
mutex_spin_exit(mp);
}
/*
* sleepq_unsleep:
*
* Remove an LWP from its sleep queue and set it runnable again.
* sleepq_unsleep() is called with the LWP's mutex held, and will
* release it if "unlock" is true.
*/
void
sleepq_unsleep(lwp_t *l, bool unlock)
{
sleepq_t *sq = l->l_sleepq;
kmutex_t *mp = l->l_mutex;
KASSERT(lwp_locked(l, mp));
KASSERT(l->l_wchan != NULL);
sleepq_remove(sq, l, false);
if (unlock) {
mutex_spin_exit(mp);
}
}
/*
* sleepq_timeout:
*
* Entered via the callout(9) subsystem to time out an LWP that is on a
* sleep queue.
*/
void
sleepq_timeout(void *arg)
{
lwp_t *l = arg;
/*
* Lock the LWP. Assuming it's still on the sleep queue, its
* current mutex will also be the sleep queue mutex.
*/
lwp_lock(l);
if (l->l_wchan == NULL || l->l_syncobj == &callout_syncobj) {
/*
* Somebody beat us to it, or the LWP is blocked in
* callout_halt() waiting for us to finish here. In
* neither case should the LWP produce EWOULDBLOCK.
*/
lwp_unlock(l);
return;
}
l->l_flag |= LW_STIMO;
lwp_unsleep(l, true);
}
/*
* sleepq_sigtoerror:
*
* Given a signal number, interpret and return an error code.
*/
static int
sleepq_sigtoerror(lwp_t *l, int sig)
{
struct proc *p = l->l_proc;
int error;
KASSERT(mutex_owned(p->p_lock));
/*
* If this sleep was canceled, don't let the syscall restart.
*/
if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
error = EINTR;
else
error = ERESTART;
return error;
}
/*
* sleepq_abort:
*
* After a panic or during autoconfiguration, lower the interrupt
* priority level to give pending interrupts a chance to run, and
* then return. Called if sleepq_dontsleep() returns non-zero, and
* always returns zero.
*/
int
sleepq_abort(kmutex_t *mtx, int unlock)
{
int s;
s = splhigh();
splx(IPL_SAFEPRI);
splx(s);
if (mtx != NULL && unlock != 0)
mutex_exit(mtx);
return 0;
}
/*
* sleepq_reinsert:
*
* Move the position of the lwp in the sleep queue after a possible
* change of the lwp's effective priority.
*/
static void
sleepq_reinsert(sleepq_t *sq, lwp_t *l)
{
KASSERT(l->l_sleepq == sq);
if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) {
return;
}
/*
* Don't let the sleep queue become empty, even briefly.
* cv_signal() and cv_broadcast() inspect it without the
* sleep queue lock held and need to see a non-empty queue
* head if there are waiters.
*/
if (LIST_FIRST(sq) == l && LIST_NEXT(l, l_sleepchain) == NULL) {
return;
}
LIST_REMOVE(l, l_sleepchain);
sleepq_insert(sq, l, l->l_syncobj);
}
/*
* sleepq_changepri:
*
* Adjust the priority of an LWP residing on a sleepq.
*/
void
sleepq_changepri(lwp_t *l, pri_t pri)
{
sleepq_t *sq = l->l_sleepq;
KASSERT(lwp_locked(l, NULL));
l->l_priority = pri;
sleepq_reinsert(sq, l);
}
/*
* sleepq_lendpri:
*
* Adjust the lended priority of an LWP residing on a sleepq.
*/
void
sleepq_lendpri(lwp_t *l, pri_t pri)
{
sleepq_t *sq = l->l_sleepq;
KASSERT(lwp_locked(l, NULL));
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
sleepq_reinsert(sq, l);
}
/* $NetBSD: sys_socket.c,v 1.81 2023/04/22 13:53:02 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_socket.c 8.3 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_socket.c,v 1.81 2023/04/22 13:53:02 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
static int soo_fpathconf(struct file *, int, register_t *);
static int soo_posix_fadvise(struct file *, off_t, off_t, int);
const struct fileops socketops = {
.fo_name = "socket",
.fo_read = soo_read,
.fo_write = soo_write,
.fo_ioctl = soo_ioctl,
.fo_fcntl = fnullop_fcntl,
.fo_poll = soo_poll,
.fo_stat = soo_stat,
.fo_close = soo_close,
.fo_kqfilter = soo_kqfilter,
.fo_restart = soo_restart,
.fo_fpathconf = soo_fpathconf,
.fo_posix_fadvise = soo_posix_fadvise,
};
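/*
 * Illustrative sketch (not part of the original source): roughly how the
 * generic descriptor layer reaches the fileops table above.  The exact
 * call made by the kernel's read path may differ; this only shows the
 * indirection that lands a read on a socket descriptor in soo_read().
 */
#if 0
static int
example_fo_read(file_t *fp, struct uio *uio, int flags)
{
	/* For a socket, fp->f_ops == &socketops, so fo_read == soo_read. */
	return (*fp->f_ops->fo_read)(fp, &fp->f_offset, uio, fp->f_cred,
	    flags);
}
#endif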
int (*ifioctl)(struct socket *, u_long, void *, struct lwp *) = (void *)eopnotsupp;
/* ARGSUSED */
int
soo_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct socket *so = fp->f_socket;
int error;
error = (*so->so_receive)(so, NULL, uio, NULL, NULL, NULL);
return error;
}
/* ARGSUSED */
int
soo_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct socket *so = fp->f_socket;
int error;
error = (*so->so_send)(so, NULL, uio, NULL, NULL, 0, curlwp);
return error;
}
int
soo_ioctl(file_t *fp, u_long cmd, void *data)
{
struct socket *so = fp->f_socket;
int error = 0;
switch (cmd) {
case FIONBIO:
solock(so);
if (*(int *)data)
so->so_state |= SS_NBIO;
else
so->so_state &= ~SS_NBIO;
sounlock(so);
break;
case FIOASYNC:
solock(so);
if (*(int *)data) {
so->so_rcv.sb_flags |= SB_ASYNC;
so->so_snd.sb_flags |= SB_ASYNC;
} else {
so->so_rcv.sb_flags &= ~SB_ASYNC;
so->so_snd.sb_flags &= ~SB_ASYNC;
}
sounlock(so);
break;
case FIONREAD:
*(int *)data = so->so_rcv.sb_cc;
break;
case FIONWRITE:
*(int *)data = so->so_snd.sb_cc;
break;
case FIONSPACE:
/*
* See the comment around sbspace()'s definition
* in sys/socketvar.h regarding the maximum counts
* to understand the following test. We detect overflow
* and return zero.
*/
solock(so);
if ((so->so_snd.sb_hiwat < so->so_snd.sb_cc) ||
(so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt))
*(int *)data = 0;
else
*(int *)data = sbspace(&so->so_snd);
sounlock(so);
break;
case SIOCSPGRP:
case FIOSETOWN:
case TIOCSPGRP:
error = fsetown(&so->so_pgid, cmd, data);
break;
case SIOCGPGRP:
case FIOGETOWN:
case TIOCGPGRP:
error = fgetown(so->so_pgid, cmd, data);
break;
case SIOCATMARK:
*(int *)data = (so->so_state&SS_RCVATMARK) != 0;
break;
case SIOCPEELOFF:
solock(so);
error = do_sys_peeloff(so, data);
sounlock(so);
break;
default:
/*
* Interface/routing/protocol specific ioctls:
* interface and routing ioctls should have a
* different entry since a socket is unnecessary for them.
*/
if (IOCGROUP(cmd) == 'i')
/*
* KERNEL_LOCK will be held later if if_ioctl() of the
* interface isn't MP-safe.
*/
error = ifioctl(so, cmd, data, curlwp);
else {
KERNEL_LOCK(1, NULL);
error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so,
cmd, data, NULL);
KERNEL_UNLOCK_ONE(NULL);
}
break;
}
return error;
}
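/*
 * Illustrative sketch (not part of the original source): how userland
 * reaches the FIONREAD and FIONBIO cases handled by soo_ioctl() above.
 * This is ordinary application code, shown only as an example; error
 * handling is minimal.
 */
#if 0
/* userland example, not kernel code */
#include <sys/ioctl.h>
#include <stdio.h>

static void
show_socket_state(int s)
{
	int nread = 0, on = 1;

	if (ioctl(s, FIONREAD, &nread) == 0)	/* so_rcv.sb_cc above */
		printf("%d bytes queued for reading\n", nread);
	if (ioctl(s, FIONBIO, &on) == 0)	/* sets SS_NBIO above */
		printf("socket is now non-blocking\n");
}
#endif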
int
soo_poll(file_t *fp, int events)
{
return sopoll(fp->f_socket, events);
}
int
soo_stat(file_t *fp, struct stat *ub)
{
struct socket *so = fp->f_socket;
int error;
memset(ub, 0, sizeof(*ub));
ub->st_mode = S_IFSOCK;
solock(so);
error = (*so->so_proto->pr_usrreqs->pr_stat)(so, ub);
sounlock(so);
return error;
}
/* ARGSUSED */
int
soo_close(file_t *fp)
{
int error = 0;
if (fp->f_socket)
error = soclose(fp->f_socket);
fp->f_socket = NULL;
return error;
}
void
soo_restart(file_t *fp)
{
sorestart(fp->f_socket);
}
static int
soo_fpathconf(struct file *fp, int name, register_t *retval)
{
switch (name) {
case _PC_PIPE_BUF:
*retval = PIPE_BUF;
return 0;
default:
return EINVAL;
}
}
static int
soo_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
return ESPIPE;
}
/* $NetBSD: dbregs.c,v 1.15 2020/01/31 08:55:38 maxv Exp $ */
/*
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/types.h>
#include <sys/lwp.h>
#include <sys/pool.h>
#include <x86/cpufunc.h>
#include <x86/dbregs.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_pmap.h>
#include <machine/pmap.h>
struct pool x86_dbregspl;
static struct dbreg initdbstate;
#define X86_BREAKPOINT_CONDITION_DETECTED ( \
X86_DR6_DR0_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR1_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR2_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR3_BREAKPOINT_CONDITION_DETECTED )
#define X86_GLOBAL_BREAKPOINT ( \
X86_DR7_GLOBAL_DR0_BREAKPOINT | \
X86_DR7_GLOBAL_DR1_BREAKPOINT | \
X86_DR7_GLOBAL_DR2_BREAKPOINT | \
X86_DR7_GLOBAL_DR3_BREAKPOINT )
void
x86_dbregs_init(void)
{
/* DR0-DR3 should always be 0 */
initdbstate.dr[0] = rdr0();
initdbstate.dr[1] = rdr1();
initdbstate.dr[2] = rdr2();
initdbstate.dr[3] = rdr3();
/* DR4-DR5 are reserved - skip */
/* DR6 and DR7 contain predefined nonzero bits */
initdbstate.dr[6] = rdr6();
initdbstate.dr[7] = rdr7();
/* DR8-DR15 are reserved - skip */
/*
* Explicitly reset some bits just in case they could be
* set by brave software/hardware before the kernel boot.
*/
initdbstate.dr[6] &= ~X86_BREAKPOINT_CONDITION_DETECTED;
initdbstate.dr[7] &= ~X86_DR7_GENERAL_DETECT_ENABLE;
pool_init(&x86_dbregspl, sizeof(struct dbreg), 16, 0, 0, "dbregs",
NULL, IPL_NONE);
}
static void
x86_dbregs_reset(void)
{
/*
* It's sufficient to just disable Debug Control Register (DR7).
* It will deactivate hardware watchpoints.
*/
ldr7(0);
/*
* However at some point we need to clear Debug Status Registers
* (DR6). The CPU will never do it automatically.
*
* Clear BREAKPOINT_CONDITION_DETECTED bits and ignore the rest.
*/
ldr6(rdr6() & ~X86_BREAKPOINT_CONDITION_DETECTED);
}
void
x86_dbregs_clear(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
struct dbreg *dbregs;
KASSERT(l == curlwp);
if (__predict_true(pcb->pcb_dbregs == NULL)) {
KASSERT((pcb->pcb_flags & PCB_DBREGS) == 0);
return;
}
dbregs = pcb->pcb_dbregs;
kpreempt_disable();
pcb->pcb_dbregs = NULL;
pcb->pcb_flags &= ~PCB_DBREGS;
x86_dbregs_reset();
kpreempt_enable();
pool_put(&x86_dbregspl, dbregs);
}
void
x86_dbregs_abandon(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
kpreempt_disable();
pcb->pcb_flags &= ~PCB_DBREGS;
x86_dbregs_reset();
kpreempt_enable();
}
void
x86_dbregs_read(struct lwp *l, struct dbreg *regs)
{
struct pcb *pcb = lwp_getpcb(l);
if (pcb->pcb_dbregs == NULL) {
pcb->pcb_dbregs = pool_get(&x86_dbregspl, PR_WAITOK);
memcpy(pcb->pcb_dbregs, &initdbstate, sizeof(initdbstate));
pcb->pcb_flags |= PCB_DBREGS;
}
memcpy(regs, pcb->pcb_dbregs, sizeof(*regs));
}
void
x86_dbregs_save(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
if (!(pcb->pcb_flags & PCB_DBREGS)) {
return;
}
KASSERT(pcb->pcb_dbregs != NULL);
pcb->pcb_dbregs->dr[0] = rdr0();
pcb->pcb_dbregs->dr[1] = rdr1();
pcb->pcb_dbregs->dr[2] = rdr2();
pcb->pcb_dbregs->dr[3] = rdr3();
pcb->pcb_dbregs->dr[6] = rdr6();
pcb->pcb_dbregs->dr[7] = rdr7();
}
void
x86_dbregs_restore(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
if (!(pcb->pcb_flags & PCB_DBREGS)) {
return;
}
KASSERT(pcb->pcb_dbregs != NULL);
ldr0(pcb->pcb_dbregs->dr[0]);
ldr1(pcb->pcb_dbregs->dr[1]);
ldr2(pcb->pcb_dbregs->dr[2]);
ldr3(pcb->pcb_dbregs->dr[3]);
ldr6(pcb->pcb_dbregs->dr[6]);
ldr7(pcb->pcb_dbregs->dr[7]);
}
void
x86_dbregs_store_dr6(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
KASSERT(l == curlwp);
KASSERT(pcb->pcb_dbregs != NULL);
pcb->pcb_dbregs->dr[6] = rdr6();
}
int
x86_dbregs_user_trap(void)
{
register_t dr7, dr6;
register_t bp;
dr7 = rdr7();
if ((dr7 & X86_GLOBAL_BREAKPOINT) == 0) {
/*
* All Global Breakpoint bits are zero, thus the trap couldn't
* have been caused by the hardware debug registers.
*/
return 0;
}
dr6 = rdr6();
bp = dr6 & X86_BREAKPOINT_CONDITION_DETECTED;
if (!bp) {
/*
* None of the breakpoint bits are set, meaning this
* trap was not caused by any of the debug registers.
*/
return 0;
}
/*
* At least one of the breakpoints was hit, check to see
* which ones and if any of them are user space addresses.
*/
if (bp & X86_DR6_DR0_BREAKPOINT_CONDITION_DETECTED)
if (rdr0() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR1_BREAKPOINT_CONDITION_DETECTED)
if (rdr1() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR2_BREAKPOINT_CONDITION_DETECTED)
if (rdr2() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR3_BREAKPOINT_CONDITION_DETECTED)
if (rdr3() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
return 0;
}
int
x86_dbregs_validate(const struct dbreg *regs)
{
size_t i;
/* Check that DR0-DR3 contain user-space address */
for (i = 0; i < X86_DBREGS; i++) {
if (regs->dr[i] >= (vaddr_t)VM_MAXUSER_ADDRESS)
return EINVAL;
}
#ifndef i386
if (regs->dr[6] & X86_DR6_MBZ) {
return EINVAL;
}
if (regs->dr[7] & X86_DR7_MBZ) {
return EINVAL;
}
#endif
if (regs->dr[7] & X86_DR7_GENERAL_DETECT_ENABLE) {
return EINVAL;
}
/*
* Skip checks for reserved registers (DR4-DR5, DR8-DR15).
*/
return 0;
}
void
x86_dbregs_write(struct lwp *l, const struct dbreg *regs)
{
struct pcb *pcb = lwp_getpcb(l);
if (pcb->pcb_dbregs == NULL) {
pcb->pcb_dbregs = pool_get(&x86_dbregspl, PR_WAITOK);
}
memcpy(pcb->pcb_dbregs, regs, sizeof(*regs));
pcb->pcb_flags |= PCB_DBREGS;
}
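/*
 * Illustrative sketch (not part of the original source): the expected
 * validate-then-write pattern for a debugger-style consumer of the
 * functions above.  The helper name set_lwp_watch_regs() is hypothetical.
 */
#if 0
static int
set_lwp_watch_regs(struct lwp *l, const struct dbreg *regs)
{
	int error;

	/* Reject kernel addresses and must-be-zero bits up front. */
	error = x86_dbregs_validate(regs);
	if (error)
		return error;

	/* Install; PCB_DBREGS makes the state follow the LWP on switch. */
	x86_dbregs_write(l, regs);
	return 0;
}
#endif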
/*
* Called with preemption disabled.
*/
void
x86_dbregs_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
struct pcb *oldpcb, *newpcb;
bool olddb, newdb;
oldpcb = lwp_getpcb(oldlwp);
newpcb = lwp_getpcb(newlwp);
olddb = (oldpcb->pcb_flags & PCB_DBREGS) != 0;
newdb = (newpcb->pcb_flags & PCB_DBREGS) != 0;
if (__predict_true(!olddb && !newdb)) {
/* fast path */
return;
}
if (olddb) {
x86_dbregs_save(oldlwp);
}
if (newdb) {
x86_dbregs_restore(newlwp);
} else if (olddb) {
x86_dbregs_reset();
}
}
/* $NetBSD: ffs_inode.c,v 1.131 2020/07/31 04:07:30 chs Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.131 2020/07/31 04:07:30 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/trace.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
static int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int,
int64_t *);
/*
* Update the access, modified, and inode change times as specified
* by the IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively.
* The IN_MODIFIED flag is used to specify that the inode needs to be
* updated but that the times have already been set. The access
* and modified times are taken from the second and third parameters;
* the inode change time is always taken from the current time. If
* the UPDATE_WAIT flag or the UPDATE_DIROP flag is set, then wait for the
* disk write of the inode to complete.
*/
int
ffs_update(struct vnode *vp, const struct timespec *acc,
const struct timespec *mod, int updflags)
{
struct fs *fs;
struct buf *bp;
struct inode *ip;
int error;
void *cp;
int waitfor, flags;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (0);
ip = VTOI(vp);
FFS_ITIMES(ip, acc, mod, NULL);
if (updflags & UPDATE_CLOSE)
flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED);
else
flags = ip->i_flag & IN_MODIFIED;
if (flags == 0)
return (0);
fs = ip->i_fs;
if ((flags & IN_MODIFIED) != 0 &&
(vp->v_mount->mnt_flag & MNT_ASYNC) == 0) {
waitfor = updflags & UPDATE_WAIT;
if ((updflags & UPDATE_DIROP) != 0)
waitfor |= UPDATE_WAIT;
} else
waitfor = 0;
/*
* Ensure that uid and gid are correct. This is a temporary
* fix until fsck has been changed to do the update.
*/
if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
ip->i_ffs1_ouid = ip->i_uid; /* XXX */
ip->i_ffs1_ogid = ip->i_gid; /* XXX */
} /* XXX */
error = bread(ip->i_devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, B_MODIFY, &bp);
if (error) {
return (error);
}
ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED);
/* Keep unlinked inode list up to date */
KDASSERTMSG(DIP(ip, nlink) == ip->i_nlink,
"DIP(ip, nlink) [%d] == ip->i_nlink [%d]",
DIP(ip, nlink), ip->i_nlink);
if (ip->i_mode) {
if (ip->i_nlink > 0) {
UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp,
ip->i_number, ip->i_mode);
} else {
UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp,
ip->i_number, ip->i_mode);
}
}
if (fs->fs_magic == FS_UFS1_MAGIC) {
cp = (char *)bp->b_data +
(ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode1_swap(ip->i_din.ffs1_din,
(struct ufs1_dinode *)cp);
else
#endif
memcpy(cp, ip->i_din.ffs1_din, DINODE1_SIZE);
} else {
cp = (char *)bp->b_data +
(ino_to_fsbo(fs, ip->i_number) * DINODE2_SIZE);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode2_swap(ip->i_din.ffs2_din,
(struct ufs2_dinode *)cp);
else
#endif
memcpy(cp, ip->i_din.ffs2_din, DINODE2_SIZE);
}
if (waitfor) {
return (bwrite(bp));
} else {
bdwrite(bp);
return (0);
}
}
#define SINGLE 0 /* index of single indirect block */
#define DOUBLE 1 /* index of double indirect block */
#define TRIPLE 2 /* index of triple indirect block */
/*
* Truncate the inode oip to at most length size, freeing the
* disk blocks.
*/
int
ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
{
daddr_t lastblock;
struct inode *oip = VTOI(ovp);
struct mount *omp = ovp->v_mount;
daddr_t bn, lastiblock[UFS_NIADDR], indir_lbn[UFS_NIADDR];
daddr_t blks[UFS_NDADDR + UFS_NIADDR], oldblks[UFS_NDADDR + UFS_NIADDR];
struct fs *fs;
int extblocks;
int offset, pgoffset, level;
int64_t blocksreleased = 0, datablocks;
int i, aflag, nblocks;
int error, allerror = 0;
off_t osize;
int sync;
struct ufsmount *ump = oip->i_ump;
void *dcookie;
long bsize;
bool wapbl = omp->mnt_wapbl != NULL;
UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp);
if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
KASSERT(oip->i_size == 0);
return 0;
}
if (length < 0)
return (EINVAL);
/*
* Historically clients did not have to specify which data
* they were truncating. So, if not specified, we assume
* traditional behavior, e.g., just the normal data.
*/
if ((ioflag & (IO_EXT | IO_NORMAL)) == 0)
ioflag |= IO_NORMAL;
fs = oip->i_fs;
#define i_din2 i_din.ffs2_din
extblocks = 0;
datablocks = DIP(oip, blocks);
if (fs->fs_magic == FS_UFS2_MAGIC && oip->i_din2->di_extsize > 0) {
extblocks = btodb(ffs_fragroundup(fs, oip->i_din2->di_extsize));
datablocks -= extblocks;
}
if ((ioflag & IO_EXT) && extblocks > 0) {
if (length != 0)
panic("ffs_truncate: partial trunc of extdata");
{
#ifdef QUOTA
(void) chkdq(oip, -extblocks, NOCRED, FORCE);
#endif
osize = oip->i_din2->di_extsize;
oip->i_din2->di_blocks -= extblocks;
oip->i_din2->di_extsize = 0;
for (i = 0; i < UFS_NXADDR; i++) {
binvalbuf(ovp, -1 - i);
oldblks[i] = oip->i_din2->di_extb[i];
oip->i_din2->di_extb[i] = 0;
}
oip->i_flag |= IN_CHANGE;
if ((error = ffs_update(ovp, NULL, NULL, 0)))
return (error);
for (i = 0; i < UFS_NXADDR; i++) {
if (oldblks[i] == 0)
continue;
bsize = ffs_sblksize(fs, osize, i);
if (wapbl) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(omp,
FFS_FSBTODB(fs, oldblks[i]), bsize, NULL);
if (error)
return error;
} else
ffs_blkfree(fs, oip->i_devvp, oldblks[i],
bsize, oip->i_number);
}
extblocks = 0;
}
}
if ((ioflag & IO_NORMAL) == 0)
return (0);
if (ovp->v_type == VLNK &&
(oip->i_size < ump->um_maxsymlinklen ||
(ump->um_maxsymlinklen == 0 && datablocks == 0))) {
KDASSERT(length == 0);
memset(SHORTLINK(oip), 0, (size_t)oip->i_size);
oip->i_size = 0;
DIP_ASSIGN(oip, size, 0);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (ffs_update(ovp, NULL, NULL, 0));
}
if (oip->i_size == length) {
/* still do a uvm_vnp_setsize() as writesize may be larger */
uvm_vnp_setsize(ovp, length);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (ffs_update(ovp, NULL, NULL, 0));
}
if (length > ump->um_maxfilesize)
return (EFBIG);
if ((oip->i_flags & SF_SNAPSHOT) != 0)
ffs_snapremove(ovp);
osize = oip->i_size;
aflag = ioflag & IO_SYNC ? B_SYNC : 0;
/*
* Lengthen the size of the file. We must ensure that the
* last byte of the file is allocated. Since the smallest
* value of osize is 0, length will be at least 1.
*/
if (osize < length) {
if (ffs_lblkno(fs, osize) < UFS_NDADDR &&
ffs_lblkno(fs, osize) != ffs_lblkno(fs, length) &&
ffs_blkroundup(fs, osize) != osize) {
off_t eob;
eob = ffs_blkroundup(fs, osize);
uvm_vnp_setwritesize(ovp, eob);
error = ufs_balloc_range(ovp, osize, eob - osize,
cred, aflag);
if (error) {
(void) ffs_truncate(ovp, osize,
ioflag & IO_SYNC, cred);
return error;
}
if (ioflag & IO_SYNC) {
rw_enter(ovp->v_uobj.vmobjlock, RW_WRITER);
VOP_PUTPAGES(ovp,
trunc_page(osize & fs->fs_bmask),
round_page(eob), PGO_CLEANIT | PGO_SYNCIO |
PGO_JOURNALLOCKED);
}
}
uvm_vnp_setwritesize(ovp, length);
error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag);
if (error) {
(void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred);
return (error);
}
uvm_vnp_setsize(ovp, length);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
KASSERT(ovp->v_size == oip->i_size);
return (ffs_update(ovp, NULL, NULL, 0));
}
/*
* When truncating a regular file down to a non-block-aligned size,
* we must zero the part of last block which is past the new EOF.
* We must synchronously flush the zeroed pages to disk
* since the new pages will be invalidated as soon as we
* inform the VM system of the new, smaller size.
* We must do this before acquiring the GLOCK, since fetching
* the pages will acquire the GLOCK internally.
* So there is a window where another thread could see a whole
* zeroed page past EOF, but that's life.
*/
offset = ffs_blkoff(fs, length);
pgoffset = length & PAGE_MASK;
if (ovp->v_type == VREG && (pgoffset != 0 || offset != 0) &&
osize > length) {
daddr_t lbn;
voff_t eoz;
int size;
if (offset != 0) {
error = ufs_balloc_range(ovp, length - 1, 1, cred,
aflag);
if (error)
return error;
}
lbn = ffs_lblkno(fs, length);
size = ffs_blksize(fs, oip, lbn);
eoz = MIN(MAX(ffs_lblktosize(fs, lbn) + size, round_page(pgoffset)),
osize);
ubc_zerorange(&ovp->v_uobj, length, eoz - length,
UBC_VNODE_FLAGS(ovp));
if (round_page(eoz) > round_page(length)) {
rw_enter(ovp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(ovp, round_page(length),
round_page(eoz),
PGO_CLEANIT | PGO_DEACTIVATE | PGO_JOURNALLOCKED |
((ioflag & IO_SYNC) ? PGO_SYNCIO : 0));
if (error)
return error;
}
}
genfs_node_wrlock(ovp);
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
uvm_vnp_setsize(ovp, length);
/*
* Calculate index into inode's block list of
* last direct and indirect blocks (if any)
* which we want to keep. Lastblock is -1 when
* the file is truncated to 0.
*/
lastblock = ffs_lblkno(fs, length + fs->fs_bsize - 1) - 1;
lastiblock[SINGLE] = lastblock - UFS_NDADDR;
lastiblock[DOUBLE] = lastiblock[SINGLE] - FFS_NINDIR(fs);
lastiblock[TRIPLE] = lastiblock[DOUBLE] - FFS_NINDIR(fs) * FFS_NINDIR(fs);
nblocks = btodb(fs->fs_bsize);
/*
* Update file and block pointers on disk before we start freeing
* blocks. If we crash before free'ing blocks below, the blocks
* will be returned to the free list. lastiblock values are also
* normalized to -1 for calls to ffs_indirtrunc below.
*/
sync = 0;
for (level = TRIPLE; level >= SINGLE; level--) {
blks[UFS_NDADDR + level] = DIP(oip, ib[level]);
if (lastiblock[level] < 0 && blks[UFS_NDADDR + level] != 0) {
sync = 1;
DIP_ASSIGN(oip, ib[level], 0);
lastiblock[level] = -1;
}
}
for (i = 0; i < UFS_NDADDR; i++) {
blks[i] = DIP(oip, db[i]);
if (i > lastblock && blks[i] != 0) {
sync = 1;
DIP_ASSIGN(oip, db[i], 0);
}
}
oip->i_flag |= IN_CHANGE | IN_UPDATE;
if (sync) {
error = ffs_update(ovp, NULL, NULL, UPDATE_WAIT);
if (error && !allerror)
allerror = error;
}
/*
* Having written the new inode to disk, save its new configuration
* and put back the old block pointers long enough to process them.
* Note that we save the new block configuration so we can check it
* when we are done.
*/
for (i = 0; i < UFS_NDADDR; i++) {
bn = DIP(oip, db[i]);
DIP_ASSIGN(oip, db[i], blks[i]);
blks[i] = bn;
}
for (i = 0; i < UFS_NIADDR; i++) {
bn = DIP(oip, ib[i]);
DIP_ASSIGN(oip, ib[i], blks[UFS_NDADDR + i]);
blks[UFS_NDADDR + i] = bn;
}
oip->i_size = osize;
DIP_ASSIGN(oip, size, osize);
error = vtruncbuf(ovp, lastblock + 1, 0, 0);
if (error && !allerror)
allerror = error;
/*
* Indirect blocks first.
*/
indir_lbn[SINGLE] = -UFS_NDADDR;
indir_lbn[DOUBLE] = indir_lbn[SINGLE] - FFS_NINDIR(fs) - 1;
indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - FFS_NINDIR(fs) * FFS_NINDIR(fs) - 1;
for (level = TRIPLE; level >= SINGLE; level--) {
bn = ffs_getib(fs, oip, level);
if (bn != 0) {
if (lastiblock[level] < 0 &&
oip->i_ump->um_mountp->mnt_wapbl) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp,
FFS_FSBTODB(fs, bn), fs->fs_bsize,
&dcookie);
if (error)
goto out;
} else {
dcookie = NULL;
}
error = ffs_indirtrunc(oip, indir_lbn[level],
FFS_FSBTODB(fs, bn), lastiblock[level], level,
&blocksreleased);
if (error) {
if (dcookie) {
UFS_WAPBL_UNREGISTER_DEALLOCATION(
oip->i_ump->um_mountp, dcookie);
}
goto out;
}
if (lastiblock[level] < 0) {
if (!dcookie)
ffs_blkfree(fs, oip->i_devvp, bn,
fs->fs_bsize, oip->i_number);
DIP_ASSIGN(oip, ib[level], 0);
blocksreleased += nblocks;
}
}
if (lastiblock[level] >= 0)
goto done;
}
/*
* All whole direct blocks or frags.
*/
for (i = UFS_NDADDR - 1; i > lastblock; i--) {
bn = ffs_getdb(fs, oip, i);
if (bn == 0)
continue;
bsize = ffs_blksize(fs, oip, i);
if ((oip->i_ump->um_mountp->mnt_wapbl) &&
(ovp->v_type != VREG)) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp,
FFS_FSBTODB(fs, bn), bsize, NULL);
if (error)
goto out;
} else
ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number);
DIP_ASSIGN(oip, db[i], 0);
blocksreleased += btodb(bsize);
}
if (lastblock < 0)
goto done;
/*
* Finally, look for a change in size of the
* last direct block; release any frags.
*/
bn = ffs_getdb(fs, oip, lastblock);
if (bn != 0) {
long oldspace, newspace;
/*
* Calculate amount of space we're giving
* back as old block size minus new block size.
*/
oldspace = ffs_blksize(fs, oip, lastblock);
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
newspace = ffs_blksize(fs, oip, lastblock);
if (newspace == 0)
panic("itrunc: newspace");
if (oldspace - newspace > 0) {
/*
* Block number of space to be free'd is
* the old block # plus the number of frags
* required for the storage we're keeping.
*/
bn += ffs_numfrags(fs, newspace);
if ((oip->i_ump->um_mountp->mnt_wapbl) &&
(ovp->v_type != VREG)) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp, FFS_FSBTODB(fs, bn),
oldspace - newspace, NULL);
if (error)
goto out;
} else
ffs_blkfree(fs, oip->i_devvp, bn,
oldspace - newspace, oip->i_number);
blocksreleased += btodb(oldspace - newspace);
}
}
done:
for (level = SINGLE; level <= TRIPLE; level++)
KASSERTMSG((blks[UFS_NDADDR + level] == DIP(oip, ib[level])),
"itrunc1 blk mismatch: %jx != %jx",
(uintmax_t)blks[UFS_NDADDR + level],
(uintmax_t)DIP(oip, ib[level]));
for (i = 0; i < UFS_NDADDR; i++)
KASSERTMSG((blks[i] == DIP(oip, db[i])),
"itrunc2 blk mismatch: %jx != %jx",
(uintmax_t)blks[i], (uintmax_t)DIP(oip, db[i]));
KASSERTMSG((length != 0 || extblocks || LIST_EMPTY(&ovp->v_cleanblkhd)),
"itrunc3: zero length and nonempty cleanblkhd");
KASSERTMSG((length != 0 || extblocks || LIST_EMPTY(&ovp->v_dirtyblkhd)),
"itrunc3: zero length and nonempty dirtyblkhd");
out:
/*
* Set length back to old size if deallocation failed. Some indirect
* blocks were deallocated creating a hole, but that is okay.
*/
if (error == EAGAIN) {
if (!allerror)
allerror = error;
length = osize;
uvm_vnp_setsize(ovp, length);
}
/*
* Put back the real size.
*/
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
DIP_ADD(oip, blocks, -blocksreleased);
genfs_node_unlock(ovp);
oip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0);
#if defined(QUOTA) || defined(QUOTA2)
(void) chkdq(oip, -blocksreleased, NOCRED, 0);
#endif
KASSERT(ovp->v_type != VREG || ovp->v_size == oip->i_size);
return (allerror);
}
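/*
 * Worked example (illustrative, not part of the original source) of the
 * lastblock/lastiblock computation in ffs_truncate() above.  Assume,
 * purely for illustration, fs_bsize == 16384, FFS_NINDIR(fs) == 2048 and
 * UFS_NDADDR == 12, truncating to length == 1 MiB (1048576 bytes):
 *
 *	lastblock          = howmany(1048576, 16384) - 1 = 64 - 1 = 63
 *	lastiblock[SINGLE] = 63 - 12           = 51    (partially kept)
 *	lastiblock[DOUBLE] = 51 - 2048         = -1997 (freed entirely)
 *	lastiblock[TRIPLE] = -1997 - 2048*2048 < 0     (freed entirely)
 *
 * A negative lastiblock[] entry is what causes the corresponding
 * indirect pointer to be zeroed and its whole subtree released.
 */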
/*
* Release blocks associated with the inode ip and stored in the indirect
* block bn. Blocks are free'd in LIFO order up to (but not including)
* lastbn. If level is greater than SINGLE, the block is an indirect block
* and recursive calls to indirtrunc must be used to cleanse other indirect
* blocks.
*
* NB: triple indirect blocks are untested.
*/
static int
ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn,
int level, int64_t *countp)
{
int i;
struct buf *bp;
struct fs *fs = ip->i_fs;
int32_t *bap1 = NULL;
int64_t *bap2 = NULL;
struct vnode *vp;
daddr_t nb, nlbn, last;
char *copy = NULL;
int64_t factor;
int64_t nblocks;
int error = 0, allerror = 0;
const int needswap = UFS_FSNEEDSWAP(fs);
const int wapbl = (ip->i_ump->um_mountp->mnt_wapbl != NULL);
void *dcookie;
#define RBAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? \
ufs_rw32(bap1[i], needswap) : ufs_rw64(bap2[i], needswap))
#define BAP_ASSIGN(ip, i, value) \
do { \
if ((ip)->i_ump->um_fstype == UFS1) \
bap1[i] = (value); \
else \
bap2[i] = (value); \
} while(0)
/*
* Calculate index in current block of last
* block to be kept. -1 indicates the entire
* block so we need not calculate the index.
*/
factor = 1;
for (i = SINGLE; i < level; i++)
factor *= FFS_NINDIR(fs);
last = lastbn;
if (lastbn > 0)
last /= factor;
nblocks = btodb(fs->fs_bsize);
/*
* Get buffer of block pointers, zero those entries corresponding
* to blocks to be free'd, and update the on-disk copy first. Since
* the double (triple) indirect block is freed before the single (double)
* indirect block, calls to bmap on these blocks will fail. However,
* we already have
* the on disk address, so we have to set the b_blkno field
* explicitly instead of letting bread do everything for us.
*/
vp = ITOV(ip);
error = ffs_getblk(vp, lbn, FFS_NOBLK, fs->fs_bsize, false, &bp);
if (error)
return error;
if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
/* Braces must be here in case trace evaluates to nothing. */
trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn);
} else {
trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn);
curlwp->l_ru.ru_inblock++; /* pay for read */
bp->b_flags |= B_READ;
bp->b_flags &= ~B_COWDONE; /* we change blkno below */
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
bp->b_blkno = dbn;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
error = biowait(bp);
if (error == 0)
error = fscow_run(bp, true);
}
if (error) {
brelse(bp, 0);
return error;
}
/*
* Clear reference to blocks to be removed on disk, before actually
* reclaiming them, so that fsck is more likely to be able to recover
* the filesystem if system goes down during the truncate process.
* This assumes the truncate process would not fail, contrary
* to the wapbl case.
*/
if (ip->i_ump->um_fstype == UFS1)
bap1 = (int32_t *)bp->b_data;
else
bap2 = (int64_t *)bp->b_data;
if (lastbn >= 0 && !wapbl) {
copy = kmem_alloc(fs->fs_bsize, KM_SLEEP);
memcpy((void *)copy, bp->b_data, (u_int)fs->fs_bsize);
for (i = last + 1; i < FFS_NINDIR(fs); i++)
BAP_ASSIGN(ip, i, 0);
error = bwrite(bp);
if (error)
allerror = error;
if (ip->i_ump->um_fstype == UFS1)
bap1 = (int32_t *)copy;
else
bap2 = (int64_t *)copy;
}
/*
* Recursively free totally unused blocks.
*/
for (i = FFS_NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
i--, nlbn += factor) {
nb = RBAP(ip, i);
if (nb == 0)
continue;
if ((ip->i_ump->um_mountp->mnt_wapbl) && ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
ip->i_ump->um_mountp,
FFS_FSBTODB(fs, nb), fs->fs_bsize,
&dcookie);
if (error)
goto out;
} else {
dcookie = NULL;
}
if (level > SINGLE) {
error = ffs_indirtrunc(ip, nlbn, FFS_FSBTODB(fs, nb),
(daddr_t)-1, level - 1, countp);
if (error) {
if (dcookie) {
UFS_WAPBL_UNREGISTER_DEALLOCATION(
ip->i_ump->um_mountp, dcookie);
}
goto out;
}
}
if (!dcookie)
ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize,
ip->i_number);
BAP_ASSIGN(ip, i, 0);
*countp += nblocks;
}
/*
* Recursively free blocks on the now last partial indirect block.
*/
if (level > SINGLE && lastbn >= 0) {
last = lastbn % factor;
nb = RBAP(ip, i);
if (nb != 0) {
error = ffs_indirtrunc(ip, nlbn, FFS_FSBTODB(fs, nb),
last, level - 1, countp);
if (error)
goto out;
}
}
out:
if (error && !allerror)
allerror = error;
if (copy != NULL) {
kmem_free(copy, fs->fs_bsize);
} else if (lastbn < 0 && error == 0) {
/* all freed, release without writing back */
brelse(bp, BC_INVAL);
} else if (wapbl) {
/* only partially freed, write the updated block */
error = bwrite(bp);
if (!allerror)
allerror = error;
}
return (allerror);
}
void
ffs_itimes(struct inode *ip, const struct timespec *acc,
const struct timespec *mod, const struct timespec *cre)
{
struct timespec now;
if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
return;
}
vfs_timestamp(&now);
if (ip->i_flag & IN_ACCESS) {
if (acc == NULL)
acc = &now;
DIP_ASSIGN(ip, atime, acc->tv_sec);
DIP_ASSIGN(ip, atimensec, acc->tv_nsec);
}
if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
if ((ip->i_flags & SF_SNAPSHOT) == 0) {
if (mod == NULL)
mod = &now;
DIP_ASSIGN(ip, mtime, mod->tv_sec);
DIP_ASSIGN(ip, mtimensec, mod->tv_nsec);
}
ip->i_modrev++;
}
if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
if (cre == NULL)
cre = &now;
DIP_ASSIGN(ip, ctime, cre->tv_sec);
DIP_ASSIGN(ip, ctimensec, cre->tv_nsec);
}
if (ip->i_flag & (IN_ACCESS | IN_MODIFY))
ip->i_flag |= IN_ACCESSED;
if (ip->i_flag & (IN_UPDATE | IN_CHANGE))
ip->i_flag |= IN_MODIFIED;
ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
}
/* $NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Generic rename abstraction.
*
* Rename is unbelievably hairy. Try to use this if you can --
* otherwise you are practically guaranteed to get it wrong.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $");
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/types.h>
#include <miscfs/genfs/genfs.h>
/*
* Sample copypasta for implementing VOP_RENAME via genfs_rename.
* Don't change this template without carefully considering whether
* every other file system that already uses it needs to change too.
* That way, once we have changed all the file systems to use it, we
* can easily replace mumblefs_rename by mumblefs_sane_rename and
* eliminate the insane API altogether.
*/
/* begin sample copypasta */
#if 0
static const struct genfs_rename_ops mumblefs_genfs_rename_ops;
/*
* mumblefs_sane_rename: The hairiest vop, with the saner API.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tcnp (to component name),
* . cred (credentials structure), and
* . posixly_correct (flag for behaviour if target & source link same file).
*
* fdvp and tdvp may be the same, and must be referenced and unlocked.
*/
static int
mumblefs_sane_rename(
struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct)
{
struct mumblefs_lookup_results fulr, tulr;
return genfs_sane_rename(&mumblefs_genfs_rename_ops,
fdvp, fcnp, &fulr, tdvp, tcnp, &tulr,
cred, posixly_correct);
}
/*
* mumblefs_rename: The hairiest vop, with the insanest API. Defer to
* genfs_insane_rename immediately.
*/
int
mumblefs_rename(void *v)
{
return genfs_insane_rename(v, &mumblefs_sane_rename);
}
#endif
/* end sample copypasta */
/*
* Forward declarations
*/
static int genfs_rename_enter(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode **,
struct vnode *, struct componentname *, void *, struct vnode **);
static int genfs_rename_enter_common(const struct genfs_rename_ops *,
struct mount *, kauth_cred_t, struct vnode *,
struct componentname *, void *, struct vnode **,
struct componentname *, void *, struct vnode **);
static int genfs_rename_enter_separate(const struct genfs_rename_ops *,
struct mount *, kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode **,
struct vnode *, struct componentname *, void *, struct vnode **);
static int genfs_rename_lock(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t, int, int, int,
struct vnode *, struct componentname *, bool, void *, struct vnode **,
struct vnode *, struct componentname *, bool, void *, struct vnode **);
static void genfs_rename_exit(const struct genfs_rename_ops *, struct mount *,
struct vnode *, struct vnode *,
struct vnode *, struct vnode *);
static int genfs_rename_remove(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode *, nlink_t *);
/*
* genfs_insane_rename: Generic implementation of the insane API for
* the rename vop.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fvp (from vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tvp (to vnode, or NULL), and
* . tcnp (to component name).
*
* Any pair of vnode parameters may have the same vnode.
*
* On entry,
*
* . fdvp, fvp, tdvp, and tvp are referenced,
* . fdvp and fvp are unlocked, and
* . tdvp and tvp (if nonnull) are locked.
*
* On exit,
*
* . fdvp, fvp, tdvp, and tvp (if nonnull) are unreferenced, and
* . tdvp and tvp (if nonnull) are unlocked.
*/
int
genfs_insane_rename(void *v,
int (*sane_rename)(struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct))
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
struct vnode *fdvp = ap->a_fdvp;
struct vnode *fvp = ap->a_fvp;
struct componentname *fcnp = ap->a_fcnp;
struct vnode *tdvp = ap->a_tdvp;
struct vnode *tvp = ap->a_tvp;
struct componentname *tcnp = ap->a_tcnp;
kauth_cred_t cred;
int error;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fcnp->cn_nameptr != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tcnp->cn_nameptr != NULL);
/* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
/* KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
cred = fcnp->cn_cred;
/*
* XXX Want a better equality test. `tcnp->cn_cred == cred'
* hoses p2k because puffs transmits the creds separately and
* allocates distinct but equivalent structures for them.
*/
KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
/*
* Sanitize our world from the VFS insanity. Unlock the target
* directory and node, which are locked. Release the children,
* which are referenced, since we'll be looking them up again
* later.
*/
VOP_UNLOCK(tdvp);
if ((tvp != NULL) && (tvp != tdvp))
VOP_UNLOCK(tvp);
vrele(fvp);
if (tvp != NULL)
vrele(tvp);
error = (*sane_rename)(fdvp, fcnp, tdvp, tcnp, cred, false);
/*
* All done, whether with success or failure. Release the
* directory nodes now, as the caller expects from the VFS
* protocol.
*/
vrele(fdvp);
vrele(tdvp);
return error;
}
/*
* genfs_sane_rename: Generic implementation of the saner API for the
* rename vop. Handles ancestry checks, locking, and permissions
* checks. Caller is responsible for implementing the genfs rename
* operations.
*
* fdvp and tdvp must be referenced and unlocked.
*/
int
genfs_sane_rename(const struct genfs_rename_ops *ops,
struct vnode *fdvp, struct componentname *fcnp, void *fde,
struct vnode *tdvp, struct componentname *tcnp, void *tde,
kauth_cred_t cred, bool posixly_correct)
{
struct mount *mp;
struct vnode *fvp = NULL, *tvp = NULL;
nlink_t tvp_new_nlink = 0;
int error;
KASSERT(ops != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
/* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
/* KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == tdvp->v_mount);
KASSERT(fcnp != tcnp);
KASSERT(fcnp->cn_nameiop == DELETE);
KASSERT(tcnp->cn_nameiop == RENAME);
/* XXX Want a better equality test. */
KASSERT(kauth_cred_uidmatch(cred, fcnp->cn_cred));
KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
mp = fdvp->v_mount;
KASSERT(mp != NULL);
KASSERT(mp == tdvp->v_mount);
/* XXX How can we be sure this stays true? */
KASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
/* Reject rename("x/..", ...) and rename(..., "x/..") early. */
if ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT)
return EINVAL; /* XXX EISDIR? */
error = genfs_rename_enter(ops, mp, cred,
fdvp, fcnp, fde, &fvp,
tdvp, tcnp, tde, &tvp);
if (error)
return error;
/*
* Check that everything is locked and looks right.
*/
KASSERT(fvp != NULL);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* If the source and destination are the same object, we need
* only at most delete the source entry. We are guaranteed at
* this point that the entries are distinct.
*/
if (fvp == tvp) {
KASSERT(tvp != NULL);
if (fvp->v_type == VDIR)
/* XXX This shouldn't be possible. */
error = EINVAL;
else if (posixly_correct)
/* POSIX sez to leave them alone. */
error = 0;
else if ((fdvp == tdvp) && (fcnp->cn_namelen == tcnp->cn_namelen) &&
(memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr,
fcnp->cn_namelen) == 0))
/* Renaming an entry over itself does nothing. */
error = 0;
else {
/* XXX Can't use VOP_REMOVE because of locking. */
error = genfs_rename_remove(ops, mp, cred,
fdvp, fcnp, fde, fvp, &tvp_new_nlink);
VN_KNOTE(fdvp, NOTE_WRITE);
VN_KNOTE(fvp,
tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK);
}
goto out;
}
KASSERT(fvp != tvp);
KASSERT((fdvp != tdvp) ||
(fcnp->cn_namelen != tcnp->cn_namelen) ||
(memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen)
!= 0));
/*
* If the target exists, refuse to rename a directory over a
* non-directory or vice versa, or to clobber a non-empty
* directory.
*/
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type == VDIR)
error =
(ops->gro_directory_empty_p(mp, cred, tvp, tdvp)?
0 : ENOTEMPTY);
else if (fvp->v_type == VDIR && tvp->v_type != VDIR)
error = ENOTDIR;
else if (fvp->v_type != VDIR && tvp->v_type == VDIR)
error = EISDIR;
else
error = 0;
if (error)
goto out;
KASSERT((fvp->v_type == VDIR) == (tvp->v_type == VDIR));
}
/*
* Authorize the rename.
*/
error = ops->gro_rename_check_possible(mp, fdvp, fvp, tdvp, tvp);
if (error)
goto out;
error = ops->gro_rename_check_permitted(mp, cred, fdvp, fvp, tdvp, tvp);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, fvp, fdvp,
error);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_RENAME, tvp, tdvp,
error);
if (error)
goto out;
/*
* Everything is hunky-dory. Shuffle the directory entries.
*/
error = ops->gro_rename(mp, cred,
fdvp, fcnp, fde, fvp,
tdvp, tcnp, tde, tvp,
&tvp_new_nlink);
if (error)
goto out;
/* Success! */
genfs_rename_knote(fdvp, fvp, tdvp, tvp, tvp_new_nlink);
out:
genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp);
return error;
}
/*
* genfs_rename_knote: Note events about the various vnodes in a
* rename. To be called by gro_rename on success. The only pair of
* vnodes that may be identical is {fdvp, tdvp}. tvp_new_nlink is
* the resulting link count of tvp.
*/
void
genfs_rename_knote(struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp, nlink_t tvp_new_nlink)
{
long fdvp_events, tdvp_events;
bool directory_p, reparent_p, replaced_p;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
directory_p = (fvp->v_type == VDIR);
reparent_p = (fdvp != tdvp);
replaced_p = (tvp != NULL);
KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR)));
fdvp_events = NOTE_WRITE;
if (directory_p && reparent_p)
fdvp_events |= NOTE_LINK;
VN_KNOTE(fdvp, fdvp_events);
VN_KNOTE(fvp, NOTE_RENAME);
if (reparent_p) {
tdvp_events = NOTE_WRITE;
if (!replaced_p) {
tdvp_events |= NOTE_EXTEND;
if (directory_p)
tdvp_events |= NOTE_LINK;
}
VN_KNOTE(tdvp, tdvp_events);
}
if (replaced_p)
VN_KNOTE(tvp, (tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK));
}
/*
* genfs_rename_cache_purge: Purge the name cache. To be called by
* gro_rename on success. The only pair of vnodes that may be
* identical is {fdvp, tdvp}.
*/
void
genfs_rename_cache_purge(struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
/*
* XXX What actually needs to be purged?
*/
cache_purge(fdvp);
if (fvp->v_type == VDIR)
cache_purge(fvp);
if (tdvp != fdvp)
cache_purge(tdvp);
if ((tvp != NULL) && (tvp->v_type == VDIR))
cache_purge(tvp);
}
/*
* genfs_rename_enter: Look up fcnp in fdvp, and store the lookup
* results in *fde_ret and the associated vnode in *fvp_ret; fail if
* not found. Look up tcnp in tdvp, and store the lookup results in
* *tde_ret and the associated vnode in *tvp_ret; store null instead if
* not found. Fail if anything has been mounted on any of the nodes
* involved.
*
* fdvp and tdvp must be referenced.
*
* On entry, nothing is locked.
*
* On success, everything is locked, and *fvp_ret, and *tvp_ret if
* nonnull, are referenced. The only pairs of vnodes that may be
* identical are {fdvp, tdvp} and {fvp, tvp}.
*
* On failure, everything remains as was.
*
* Locking everything including the source and target nodes is
* necessary to make sure that, e.g., link count updates are OK. The
* locking order is, in general, ancestor-first, matching the order you
* need to use to look up a descendant anyway.
*/
static int
genfs_rename_enter(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct vnode *tdvp, struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(fvp_ret != tvp_ret);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
if (fdvp == tdvp)
error = genfs_rename_enter_common(ops, mp, cred, fdvp,
fcnp, fde_ret, fvp_ret,
tcnp, tde_ret, tvp_ret);
else
error = genfs_rename_enter_separate(ops, mp, cred,
fdvp, fcnp, fde_ret, fvp_ret,
tdvp, tcnp, tde_ret, tvp_ret);
if (error)
return error;
KASSERT(*fvp_ret != NULL);
KASSERT(VOP_ISLOCKED(*fvp_ret) == LK_EXCLUSIVE);
KASSERT((*tvp_ret == NULL) || (VOP_ISLOCKED(*tvp_ret) == LK_EXCLUSIVE));
KASSERT(*fvp_ret != fdvp);
KASSERT(*fvp_ret != tdvp);
KASSERT(*tvp_ret != fdvp);
KASSERT(*tvp_ret != tdvp);
return 0;
}
/*
* genfs_rename_enter_common: Lock and look up with a common
* source/target directory.
*/
static int
genfs_rename_enter_common(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred, struct vnode *dvp,
struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
struct vnode *fvp, *tvp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(dvp->v_type == VDIR);
KASSERT(dvp->v_mount == mp);
error = ops->gro_lock_directory(mp, dvp);
if (error)
goto fail0;
/* Did we lose a race with mount? */
if (dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail1;
}
KASSERT(fcnp->cn_nameiop == DELETE);
error = ops->gro_lookup(mp, dvp, fcnp, fde_ret, &fvp);
if (error)
goto fail1;
KASSERT(fvp != NULL);
/* Refuse to rename `.'. */
if (fvp == dvp) {
error = EINVAL;
goto fail2;
}
KASSERT(fvp != dvp);
KASSERT(tcnp->cn_nameiop == RENAME);
error = ops->gro_lookup(mp, dvp, tcnp, tde_ret, &tvp);
if (error == ENOENT) {
tvp = NULL;
} else if (error) {
goto fail2;
} else {
KASSERT(tvp != NULL);
/* Refuse to rename over `.'. */
if (tvp == dvp) {
error = EISDIR; /* XXX EINVAL? */
goto fail2;
}
}
KASSERT(tvp != dvp);
/*
* We've looked up both nodes. Now lock them and check them.
*/
vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(fvp->v_mount == mp);
/* Refuse to rename a mount point. */
if ((fvp->v_type == VDIR) && (fvp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail3;
}
if ((tvp != NULL) && (tvp != fvp)) {
vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(tvp->v_mount == mp);
/* Refuse to rename over a mount point. */
if ((tvp->v_type == VDIR) && (tvp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail4;
}
}
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
*fvp_ret = fvp;
*tvp_ret = tvp;
return 0;
fail4: if ((tvp != NULL) && (tvp != fvp))
VOP_UNLOCK(tvp);
fail3: VOP_UNLOCK(fvp);
if (tvp != NULL)
vrele(tvp);
fail2: vrele(fvp);
fail1: VOP_UNLOCK(dvp);
fail0: return error;
}
/*
* genfs_rename_enter_separate: Lock and look up with separate source
* and target directories.
*/
static int
genfs_rename_enter_separate(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct vnode *tdvp, struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
struct vnode *intermediate_node;
struct vnode *fvp, *tvp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(fdvp != tdvp);
KASSERT(fcnp != tcnp);
KASSERT(fcnp->cn_nameiop == DELETE);
KASSERT(tcnp->cn_nameiop == RENAME);
KASSERT(fvp_ret != tvp_ret);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
error = ops->gro_genealogy(mp, cred, fdvp, tdvp, &intermediate_node);
if (error)
return error;
/*
* intermediate_node == NULL means fdvp is not an ancestor of tdvp.
*/
if (intermediate_node == NULL)
error = genfs_rename_lock(ops, mp, cred,
ENOTEMPTY, EISDIR, EINVAL,
tdvp, tcnp, true, tde_ret, &tvp,
fdvp, fcnp, false, fde_ret, &fvp);
else
error = genfs_rename_lock(ops, mp, cred,
EINVAL, EISDIR, EINVAL,
fdvp, fcnp, false, fde_ret, &fvp,
tdvp, tcnp, true, tde_ret, &tvp);
if (error)
goto out;
KASSERT(fvp != NULL);
/*
* Reject rename("foo/bar", "foo/bar/baz/quux/zot").
*/
if (fvp == intermediate_node) {
genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp);
error = EINVAL;
goto out;
}
*fvp_ret = fvp;
*tvp_ret = tvp;
error = 0;
out: if (intermediate_node != NULL)
vrele(intermediate_node);
return error;
}
/*
* genfs_rename_lock: Lookup and lock it all. The lock order is:
*
* a_dvp -> a_vp -> b_dvp -> b_vp,
*
* except if a_vp is a nondirectory in which case the lock order is:
*
* a_dvp -> b_dvp -> b_vp -> a_vp,
*
* which can't violate ancestor->descendant because a_vp has no
* descendants in this case. This edge case is necessary because some
* file systems can only lookup/lock/unlock, and we can't hold a_vp
* locked when we lookup/lock/unlock b_vp if they turn out to be the
* same, and we can't find out that they're the same until after the
* lookup.
*
* b_dvp must not be an ancestor of a_dvp, although a_dvp may be an
* ancestor of b_dvp.
*
* Fail with overlap_error if node a is directory b. Neither
* componentname may be `.' or `..'.
*
* a_dvp and b_dvp must be referenced.
*
* On entry, a_dvp and b_dvp are unlocked.
*
* On success,
* . a_dvp and b_dvp are locked,
* . *a_dirent_ret is filled with a directory entry whose node is
* locked and referenced,
* . *a_vp_ret is filled with the corresponding vnode,
* . *b_dirent_ret is filled either with null or with a directory entry
* whose node is locked and referenced,
* . *b_vp_ret is filled either with null or with the corresponding vnode,
* and
* . the only pair of vnodes that may be identical is a_vp and b_vp.
*
* On failure, a_dvp and b_dvp are left unlocked, and *a_dirent_ret,
* *a_vp, *b_dirent_ret, and *b_vp are left alone.
*/
static int
genfs_rename_lock(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
int overlap_error, int a_dot_error, int b_dot_error,
struct vnode *a_dvp, struct componentname *a_cnp, bool a_missing_ok,
void *a_de_ret, struct vnode **a_vp_ret,
struct vnode *b_dvp, struct componentname *b_cnp, bool b_missing_ok,
void *b_de_ret, struct vnode **b_vp_ret)
{
struct vnode *a_vp, *b_vp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(a_dvp != NULL);
KASSERT(a_cnp != NULL);
KASSERT(a_vp_ret != NULL);
KASSERT(b_dvp != NULL);
KASSERT(b_cnp != NULL);
KASSERT(b_vp_ret != NULL);
KASSERT(a_dvp != b_dvp);
KASSERT(a_vp_ret != b_vp_ret);
KASSERT(a_dvp->v_type == VDIR);
KASSERT(b_dvp->v_type == VDIR);
KASSERT(a_dvp->v_mount == mp);
KASSERT(b_dvp->v_mount == mp);
KASSERT(a_missing_ok != b_missing_ok);
/*
* 1. Lock a_dvp.
*/
error = ops->gro_lock_directory(mp, a_dvp);
if (error)
goto fail0;
/* Did we lose a race with mount? */
if (a_dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail1;
}
/*
* 2. Lookup a_vp. May lock/unlock a_vp.
*/
error = ops->gro_lookup(mp, a_dvp, a_cnp, a_de_ret, &a_vp);
if (error) {
if (a_missing_ok && (error == ENOENT))
a_vp = NULL;
else
goto fail1;
} else {
KASSERT(a_vp != NULL);
/* Refuse to rename (over) `.'. */
if (a_vp == a_dvp) {
error = a_dot_error;
goto fail2;
}
/* Reject rename("x", "x/y") or rename("x/y", "x"). */
if (a_vp == b_dvp) {
error = overlap_error;
goto fail2;
}
}
KASSERT(a_vp != a_dvp);
KASSERT(a_vp != b_dvp);
/*
* 3. Lock a_vp, if it is a directory.
*
* We already ruled out a_vp == a_dvp (i.e., a_cnp is `.'), so
* this is not locking against self, and we already ruled out
* a_vp == b_dvp, so this won't cause subsequent locking of
* b_dvp to lock against self.
*
* If a_vp is a nondirectory, we can't hold it when we lookup
* b_vp in case (a) the file system can only lookup/lock/unlock
* and (b) b_vp turns out to be the same file as a_vp due to
* hard links -- and we can't even detect that case until after
* we've looked up b_vp. Fortunately, if a_vp is a
* nondirectory, then it is a leaf, so we can safely lock it
* last.
*/
if (a_vp != NULL && a_vp->v_type == VDIR) {
vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(a_vp->v_mount == mp);
/* Refuse to rename (over) a mount point. */
if (a_vp->v_mountedhere != NULL) {
error = EBUSY;
goto fail3;
}
}
/*
* 4. Lock b_dvp.
*/
error = ops->gro_lock_directory(mp, b_dvp);
if (error)
goto fail3;
/* Did we lose a race with mount? */
if (b_dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail4;
}
/*
* 5. Lookup b_vp. May lock/unlock b_vp.
*/
error = ops->gro_lookup(mp, b_dvp, b_cnp, b_de_ret, &b_vp);
if (error) {
if (b_missing_ok && (error == ENOENT))
b_vp = NULL;
else
goto fail4;
} else {
KASSERT(b_vp != NULL);
/* Refuse to rename (over) `.'. */
if (b_vp == b_dvp) {
error = b_dot_error;
goto fail5;
}
/*
* b_dvp must not be an ancestor of a_dvp, so if we
* find b_dvp/b_vp=a_dvp/a_vp something is wrong.
*/
if (b_vp == a_dvp) {
/*
* We have a directory hard link before us.
* XXX What error should this return? EDEADLK?
* Panic?
*/
error = EIO;
goto fail5;
}
}
KASSERT(b_vp != b_dvp);
KASSERT(b_vp != a_dvp);
/*
* 6. Lock a_vp, if it is a nondirectory.
*
* In this case a_vp is a leaf, so it is either equal to or
* incommensurate with b_vp, and so we can safely lock it at
* any point now.
*/
if (a_vp != NULL && a_vp->v_type != VDIR) {
vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(a_vp->v_mount == mp);
/* (not a directory so can't have anything mounted here) */
}
/*
* 7. Lock b_vp, if it is not a_vp.
*
* b_vp and a_vp may be the same inode if they are hard links to
* one another.
*/
if ((b_vp != NULL) && (b_vp != a_vp)) {
vn_lock(b_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(b_vp->v_mount == mp);
/* Refuse to rename (over) a mount point. */
if ((b_vp->v_type == VDIR) && (b_vp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail6;
}
}
KASSERT(VOP_ISLOCKED(a_dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(b_dvp) == LK_EXCLUSIVE);
KASSERT(a_missing_ok || (a_vp != NULL));
KASSERT(b_missing_ok || (b_vp != NULL));
KASSERT((a_vp == NULL) || (VOP_ISLOCKED(a_vp) == LK_EXCLUSIVE));
KASSERT((b_vp == NULL) || (VOP_ISLOCKED(b_vp) == LK_EXCLUSIVE));
*a_vp_ret = a_vp;
*b_vp_ret = b_vp;
return 0;
fail6: if ((b_vp != NULL) && (b_vp != a_vp))
VOP_UNLOCK(b_vp);
if (a_vp != NULL && a_vp->v_type != VDIR)
VOP_UNLOCK(a_vp);
fail5: if (b_vp != NULL)
vrele(b_vp);
fail4: VOP_UNLOCK(b_dvp);
fail3: if (a_vp != NULL && a_vp->v_type == VDIR)
VOP_UNLOCK(a_vp);
fail2: if (a_vp != NULL)
vrele(a_vp);
fail1: VOP_UNLOCK(a_dvp);
fail0: return error;
}
/*
* genfs_rename_exit: Unlock everything we locked for rename.
*
* fdvp and tdvp must be referenced.
*
* On entry, everything is locked, and fvp and tvp referenced.
*
* On exit, everything is unlocked, and fvp and tvp are released.
*/
static void
genfs_rename_exit(const struct genfs_rename_ops *ops,
struct mount *mp,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)ops;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
if ((tvp != NULL) && (tvp != fvp))
VOP_UNLOCK(tvp);
VOP_UNLOCK(fvp);
if (tvp != NULL)
vrele(tvp);
if (tdvp != fdvp)
VOP_UNLOCK(tdvp);
vrele(fvp);
VOP_UNLOCK(fdvp);
}
/*
* genfs_rename_remove: Remove the entry for the non-directory vp with
* componentname cnp from the directory dvp, using the lookup results
* de. It is the responsibility of gro_remove to purge the name cache.
*
* Everything must be locked and referenced.
*/
static int
genfs_rename_remove(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp,
nlink_t *tvp_nlinkp)
{
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
error = ops->gro_remove_check_possible(mp, dvp, vp);
if (error)
return error;
error = ops->gro_remove_check_permitted(mp, cred, dvp, vp);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, vp, dvp,
error);
if (error)
return error;
error = ops->gro_remove(mp, cred, dvp, cnp, de, vp, tvp_nlinkp);
if (error)
return error;
return 0;
}
static int
genfs_ufslike_check_sticky(kauth_cred_t, mode_t, uid_t, struct vnode *, uid_t);
/*
* genfs_ufslike_rename_check_possible: Check whether a rename is
* possible independent of credentials, assuming UFS-like inode flag
* semantics. clobber_p is true iff the target node already exists.
*/
int
genfs_ufslike_rename_check_possible(
unsigned long fdflags, unsigned long fflags,
unsigned long tdflags, unsigned long tflags, bool clobber_p,
unsigned long immutable, unsigned long append)
{
if ((fdflags | fflags) & (immutable | append))
return EPERM;
if (tdflags & (immutable | (clobber_p? append : 0)))
return EPERM;
if (clobber_p && (tflags & (immutable | append)))
return EPERM;
return 0;
}
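/*
 * Illustrative sketch only (not part of genfs): how a UFS-like file
 * system might feed its on-disk inode flags into the check above.
 * The examplefs_* names and the example_inode fields are assumptions
 * made for this sketch; the immutable/append masks are the standard
 * chflags(2) bits from <sys/stat.h>.
 */
#if 0
static int
examplefs_rename_check_possible(const struct example_inode *fdip,
const struct example_inode *fip, const struct example_inode *tdip,
const struct example_inode *tip)
{
/* tip may be NULL when the target name does not yet exist. */
return genfs_ufslike_rename_check_possible(
fdip->i_flags, fip->i_flags,
tdip->i_flags, (tip != NULL? tip->i_flags : 0),
(tip != NULL),
SF_IMMUTABLE | UF_IMMUTABLE, SF_APPEND | UF_APPEND);
}
#endif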
/*
* genfs_ufslike_rename_check_permitted: Check whether a rename is
* permitted given our credentials, assuming UFS-like permission and
* ownership semantics.
*
* The only pair of vnodes that may be identical is {fdvp, tdvp}.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_rename_check_permitted(kauth_cred_t cred,
struct vnode *fdvp, mode_t fdmode, uid_t fduid,
struct vnode *fvp, uid_t fuid,
struct vnode *tdvp, mode_t tdmode, uid_t tduid,
struct vnode *tvp, uid_t tuid)
{
int error;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == fvp->v_mount);
KASSERT(fdvp->v_mount == tdvp->v_mount);
KASSERT((tvp == NULL) || (fdvp->v_mount == tvp->v_mount));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* We need to remove or change an entry in the source directory.
*/
error = VOP_ACCESS(fdvp, VWRITE, cred);
if (error)
return error;
/*
* If we are changing directories, then we need to write to the
* target directory to add or change an entry. Also, if fvp is
* a directory, we need to write to it to change its `..'
* entry.
*/
if (fdvp != tdvp) {
error = VOP_ACCESS(tdvp, VWRITE, cred);
if (error)
return error;
if (fvp->v_type == VDIR) {
error = VOP_ACCESS(fvp, VWRITE, cred);
if (error)
return error;
}
}
error = genfs_ufslike_check_sticky(cred, fdmode, fduid, fvp, fuid);
if (error)
return error;
error = genfs_ufslike_check_sticky(cred, tdmode, tduid, tvp, tuid);
if (error)
return error;
return 0;
}
/*
* genfs_ufslike_remove_check_possible: Check whether a remove is
* possible independent of credentials, assuming UFS-like inode flag
* semantics.
*/
int
genfs_ufslike_remove_check_possible(unsigned long dflags, unsigned long flags,
unsigned long immutable, unsigned long append)
{
/*
* We want to delete the entry. If the directory is immutable,
* we can't write to it to delete the entry. If the directory
* is append-only, the only change we can make is to add
* entries, so we can't delete entries. If the node is
* immutable, we can't change the links to it, so we can't
* delete the entry. If the node is append-only...well, this
* is what UFS does.
*/
if ((dflags | flags) & (immutable | append))
return EPERM;
return 0;
}
/*
* genfs_ufslike_remove_check_permitted: Check whether a remove is
* permitted given our credentials, assuming UFS-like permission and
* ownership semantics.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_remove_check_permitted(kauth_cred_t cred,
struct vnode *dvp, mode_t dmode, uid_t duid,
struct vnode *vp, uid_t uid)
{
int error;
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == vp->v_mount);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
/*
* We need to write to the directory to remove from it.
*/
error = VOP_ACCESS(dvp, VWRITE, cred);
if (error)
return error;
error = genfs_ufslike_check_sticky(cred, dmode, duid, vp, uid);
if (error)
return error;
return 0;
}
/*
* genfs_ufslike_check_sticky: Check whether a party with credentials
* cred may change an entry in a sticky directory, assuming UFS-like
* permission, ownership, and stickiness semantics: If the directory is
* sticky and the entry exists, the user must own either the directory
* or the entry's node in order to change the entry.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_check_sticky(kauth_cred_t cred, mode_t dmode, uid_t duid,
struct vnode *vp, uid_t uid)
{
if ((dmode & S_ISTXT) && (vp != NULL))
return genfs_can_sticky(vp, cred, duid, uid);
return 0;
}
/* $NetBSD: ufs_bswap.h,v 1.23 2018/04/19 21:50:10 christos Exp $ */
/*
* Copyright (c) 1998 Manuel Bouyer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef _UFS_UFS_BSWAP_H_
#define _UFS_UFS_BSWAP_H_
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#endif
#include <sys/bswap.h>
/* Macros to access UFS flags */
#ifdef FFS_EI
#define UFS_MPNEEDSWAP(ump) ((ump)->um_flags & UFS_NEEDSWAP)
#define UFS_FSNEEDSWAP(fs) ((fs)->fs_flags & FS_SWAPPED)
#define UFS_IPNEEDSWAP(ip) UFS_MPNEEDSWAP((ip)->i_ump)
#else
#define UFS_MPNEEDSWAP(ump) ((void)(ump), 0)
#define UFS_FSNEEDSWAP(fs) ((void)(fs), 0)
#define UFS_IPNEEDSWAP(ip) ((void)(ip), 0)
#endif
#if (!defined(_KERNEL) && !defined(NO_FFS_EI)) || defined(FFS_EI)
/* inlines for access to swapped data */
static __inline u_int16_t
ufs_rw16(uint16_t a, int ns)
{
return ((ns) ? bswap16(a) : (a));
}
static __inline u_int32_t
ufs_rw32(uint32_t a, int ns)
{
return ((ns) ? bswap32(a) : (a));
}
static __inline u_int64_t
ufs_rw64(uint64_t a, int ns)
{
return ((ns) ? bswap64(a) : (a));
}
#else
static __inline u_int16_t
ufs_rw16(uint16_t a, int ns)
{
return a;
}
static __inline u_int32_t
ufs_rw32(uint32_t a, int ns)
{
return a;
}
static __inline u_int64_t
ufs_rw64(uint64_t a, int ns)
{
return a;
}
#endif
#define ufs_add16(a, b, ns) \
(a) = ufs_rw16(ufs_rw16((a), (ns)) + (b), (ns))
#define ufs_add32(a, b, ns) \
(a) = ufs_rw32(ufs_rw32((a), (ns)) + (b), (ns))
#define ufs_add64(a, b, ns) \
(a) = ufs_rw64(ufs_rw64((a), (ns)) + (b), (ns))
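/*
 * Illustrative sketch only: typical use of the accessors above when
 * touching on-disk fields. The variables here are hypothetical; the
 * point is that reads go through ufs_rw*() and read-modify-write
 * updates go through ufs_add*(), with the "needswap" flag taken from
 * UFS_FSNEEDSWAP()/UFS_MPNEEDSWAP() for the superblock or mount.
 */
#if 0
static void
example_bswap_usage(struct fs *fs, uint32_t *ondisk_counter)
{
const int ns = UFS_FSNEEDSWAP(fs); /* nonzero if fs is byte-swapped */
uint32_t host_value;

/* Convert an on-disk 32-bit value to host byte order. */
host_value = ufs_rw32(*ondisk_counter, ns);

/* Increment the counter while keeping it in on-disk byte order. */
ufs_add32(*ondisk_counter, 1, ns);
(void)host_value;
}
#endif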
#endif /* !_UFS_UFS_BSWAP_H_ */
/* $NetBSD: uvm_anon.c,v 1.80 2020/10/25 00:05:26 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_anon.c: uvm anon ops
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_anon.c,v 1.80 2020/10/25 00:05:26 chs Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>
#include <uvm/uvm_pdpolicy.h>
static struct pool_cache uvm_anon_cache;
static int uvm_anon_ctor(void *, void *, int);
void
uvm_anon_init(void)
{
pool_cache_bootstrap(&uvm_anon_cache, sizeof(struct vm_anon), 0, 0,
PR_LARGECACHE, "anonpl", NULL, IPL_NONE, uvm_anon_ctor,
NULL, NULL);
}
static int
uvm_anon_ctor(void *arg, void *object, int flags)
{
struct vm_anon *anon = object;
anon->an_ref = 0;
anon->an_lock = NULL;
anon->an_page = NULL;
#if defined(VMSWAP)
anon->an_swslot = 0;
#endif
return 0;
}
/*
* uvm_analloc: allocate a new anon.
*
* => anon will have no lock associated.
*/
struct vm_anon *
uvm_analloc(void)
{
struct vm_anon *anon;
anon = pool_cache_get(&uvm_anon_cache, PR_NOWAIT);
if (anon) {
KASSERT(anon->an_ref == 0);
KASSERT(anon->an_lock == NULL);
KASSERT(anon->an_page == NULL);
#if defined(VMSWAP)
KASSERT(anon->an_swslot == 0);
#endif
anon->an_ref = 1;
}
return anon;
}
/*
* uvm_anfree: free a single anon structure
*
* => anon must be removed from the amap (if anon was in an amap).
* => amap must be locked, if anon was owned by amap.
* => we may drop and re-acquire the lock here (to break loans).
*/
void
uvm_anfree(struct vm_anon *anon)
{
struct vm_page *pg = anon->an_page, *pg2 __diagused;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(anon=%#jx)", (uintptr_t)anon, 0,0,0);
KASSERT(anon->an_lock == NULL || rw_write_held(anon->an_lock));
KASSERT(anon->an_ref == 0);
/*
* Dispose of the page, if it is resident.
*/
if (__predict_true(pg != NULL)) {
KASSERT(anon->an_lock != NULL);
/*
* If there is a resident page and it is loaned, then anon
* may not own it. Call out to uvm_anon_lockloanpg() to
* identify and lock the real owner of the page.
*/
if (__predict_false(pg->loan_count != 0)) {
pg2 = uvm_anon_lockloanpg(anon);
KASSERT(pg2 == pg);
}
/*
* If the page is owned by a UVM object (now locked),
* then kill the loan on the page rather than free it,
* and release the object lock.
*/
if (__predict_false(pg->uobject != NULL)) {
mutex_enter(&pg->interlock);
KASSERT(pg->loan_count > 0);
pg->loan_count--;
pg->uanon = NULL;
mutex_exit(&pg->interlock);
rw_exit(pg->uobject->vmobjlock);
} else {
/*
* If page has no UVM object, then anon is the owner,
* and it is already locked.
*/
KASSERT((pg->flags & PG_RELEASED) == 0);
pmap_page_protect(pg, VM_PROT_NONE);
/*
* If the page is busy, mark it as PG_RELEASED, so
* that uvm_anon_release(9) would release it later.
*/
if (__predict_false((pg->flags & PG_BUSY) != 0)) {
pg->flags |= PG_RELEASED;
rw_obj_hold(anon->an_lock);
return;
}
uvm_pagefree(pg);
UVMHIST_LOG(maphist, "anon %#jx, page %#jx: "
"freed now!", (uintptr_t)anon, (uintptr_t)pg,
0, 0);
}
} else {
#if defined(VMSWAP)
if (anon->an_swslot > 0) {
/* This page is no longer only in swap. */
KASSERT(uvmexp.swpgonly > 0);
atomic_dec_uint(&uvmexp.swpgonly);
}
#endif
}
anon->an_lock = NULL;
/*
* Free any swap resources, leave a page replacement hint.
*/
uvm_anon_dropswap(anon);
uvmpdpol_anfree(anon);
UVMHIST_LOG(maphist,"<- done!",0,0,0,0);
pool_cache_put(&uvm_anon_cache, anon);
}
/*
* uvm_anon_lockloanpg: given a locked anon, lock its resident page owner.
*
* => anon is locked by caller
* => on return: anon is locked
* if there is a resident page:
* if it has a uobject, it is locked by us
* if it is ownerless, we take over as owner
* we return the resident page (it can change during
* this function)
* => note that the only time an anon has an ownerless resident page
* is if the page was loaned from a uvm_object and the uvm_object
* disowned it
* => this only needs to be called when you want to do an operation
* on an anon's resident page and that page has a non-zero loan
* count.
*/
struct vm_page *
uvm_anon_lockloanpg(struct vm_anon *anon)
{
struct vm_page *pg;
krw_t op;
KASSERT(rw_lock_held(anon->an_lock));
/*
* loop while we have a resident page that has a non-zero loan count.
* if we successfully get our lock, we will "break" the loop.
* note that the test for pg->loan_count is not protected -- this
* may produce false positive results. note that a false positive
* result may cause us to do more work than we need to, but it will
* not produce an incorrect result.
*/
while (((pg = anon->an_page) != NULL) && pg->loan_count != 0) {
mutex_enter(&pg->interlock);
if (pg->uobject) {
/*
* if we didn't get a lock (try lock failed), then we
* toggle our anon lock and try again
*/
if (!rw_tryenter(pg->uobject->vmobjlock, RW_WRITER)) {
/*
* someone locking the object has a chance to
* lock us right now
*
* XXX Better than yielding but inadequate.
*/
mutex_exit(&pg->interlock);
op = rw_lock_op(anon->an_lock);
rw_exit(anon->an_lock);
kpause("lkloanpg", false, 1, NULL);
rw_enter(anon->an_lock, op);
continue;
}
}
/*
* If page is un-owned i.e. the object dropped its ownership,
* then we have to take the ownership.
*/
if (pg->uobject == NULL && (pg->flags & PG_ANON) == 0) {
pg->flags |= PG_ANON;
pg->loan_count--;
}
mutex_exit(&pg->interlock);
break;
}
return pg;
}
#if defined(VMSWAP)
/*
* uvm_anon_pagein: fetch an anon's page.
*
* => anon must be locked, and is unlocked upon return.
* => returns true if pagein was aborted due to lack of memory.
*/
bool
uvm_anon_pagein(struct vm_amap *amap, struct vm_anon *anon)
{
struct vm_page *pg;
struct uvm_object *uobj;
KASSERT(rw_write_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);
/*
* Get the page of the anon.
*/
switch (uvmfault_anonget(NULL, amap, anon)) {
case 0:
/* Success - we have the page. */
KASSERT(rw_write_held(anon->an_lock));
break;
case EIO:
case ERESTART:
/*
* Nothing more to do on errors. ERESTART means that the
* anon was freed.
*/
return false;
case ENOLCK:
panic("uvm_anon_pagein");
default:
return true;
}
/*
* Mark the page as dirty and clear its swslot.
*/
pg = anon->an_page;
uobj = pg->uobject;
if (anon->an_swslot > 0) {
uvm_swap_free(anon->an_swslot, 1);
}
anon->an_swslot = 0;
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
/*
* Deactivate the page (to put it on a page queue).
*/
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
rw_exit(anon->an_lock);
if (uobj) {
rw_exit(uobj->vmobjlock);
}
return false;
}
/*
* uvm_anon_dropswap: release any swap resources from this anon.
*
* => anon must be locked or have a reference count of 0.
*/
void
uvm_anon_dropswap(struct vm_anon *anon)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (anon->an_swslot == 0)
return;
UVMHIST_LOG(maphist,"freeing swap for anon %#jx, paged to swslot %#jx",
(uintptr_t)anon, anon->an_swslot, 0, 0);
uvm_swap_free(anon->an_swslot, 1);
anon->an_swslot = 0;
}
#endif
/*
* uvm_anon_release: release an anon and its page.
*
* => anon should not have any references.
* => anon must be locked.
*/
void
uvm_anon_release(struct vm_anon *anon)
{
struct vm_page *pg = anon->an_page;
krwlock_t *lock;
KASSERT(rw_write_held(anon->an_lock));
KASSERT(pg != NULL);
KASSERT((pg->flags & PG_RELEASED) != 0);
KASSERT((pg->flags & PG_BUSY) != 0);
KASSERT(pg->uobject == NULL);
KASSERT(pg->uanon == anon);
KASSERT(pg->loan_count == 0);
KASSERT(anon->an_ref == 0);
if ((pg->flags & PG_PAGEOUT) != 0) {
pg->flags &= ~PG_PAGEOUT;
uvm_pageout_done(1);
}
uvm_pagefree(pg);
KASSERT(anon->an_page == NULL);
lock = anon->an_lock;
uvm_anfree(anon);
rw_exit(lock);
/* Note: extra reference is held for PG_RELEASED case. */
rw_obj_free(lock);
}
/* $NetBSD: ffs_alloc.c,v 1.172 2023/01/07 19:41:30 chs Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.172 2023/01/07 19:41:30 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_uvm_page_trkown.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/cprng.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/cprng.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#ifdef UVM_PAGE_TRKOWN
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#endif
static daddr_t ffs_alloccg(struct inode *, u_int, daddr_t, int, int, int);
static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int);
static ino_t ffs_dirpref(struct inode *);
static daddr_t ffs_fragextend(struct inode *, u_int, daddr_t, int, int);
static void ffs_fserr(struct fs *, kauth_cred_t, const char *);
static daddr_t ffs_hashalloc(struct inode *, u_int, daddr_t, int, int, int,
daddr_t (*)(struct inode *, u_int, daddr_t, int, int, int));
static daddr_t ffs_nodealloccg(struct inode *, u_int, daddr_t, int, int, int);
static int32_t ffs_mapsearch(struct fs *, struct cg *,
daddr_t, int);
static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *,
daddr_t, long, bool);
static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t,
int, bool);
/* if 1, changes in optimization strategy are logged */
int ffs_log_changeopt = 0;
/* in ffs_tables.c */
extern const int inside[], around[];
extern const u_char * const fragtbl[];
/* Basic consistency check for block allocations */
static int
ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
long size, dev_t dev, ino_t inum)
{
if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 ||
ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) {
panic("%s: bad size: dev = 0x%llx, bno = %" PRId64
" bsize = %d, size = %ld, fs = %s", func,
(long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
}
if (bno >= fs->fs_size) {
printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno,
(unsigned long long)inum);
ffs_fserr(fs, NOCRED, "bad block");
return EINVAL;
}
return 0;
}
/*
* Allocate a block in the file system.
*
* The size of the requested block is given, which must be some
* multiple of fs_fsize and <= fs_bsize.
* A preference may be optionally specified. If a preference is given
* the following hierarchy is used to allocate a block:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate a block in the same cylinder group.
* 4) quadratically rehash into other cylinder groups, until an
* available block is located.
* If no block preference is given the following hierarchy is used
* to allocate a block:
* 1) allocate a block in the cylinder group that contains the
* inode for the file.
* 2) quadratically rehash into other cylinder groups, until an
* available block is located.
*
* => called with um_lock held
* => releases um_lock before returning
*/
int
ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
int flags, kauth_cred_t cred, daddr_t *bnp)
{
struct ufsmount *ump;
struct fs *fs;
daddr_t bno;
u_int cg;
#if defined(QUOTA) || defined(QUOTA2)
int error;
#endif
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
#ifdef UVM_PAGE_TRKOWN
/*
* Sanity-check that allocations within the file size
* do not allow other threads to read the stale contents
* of newly allocated blocks.
* Usually pages will exist to cover the new allocation.
* There is an optimization in ffs_write() where we skip
* creating pages if several conditions are met:
* - the file must not be mapped (in any user address space).
* - the write must cover whole pages and whole blocks.
* If those conditions are not met then pages must exist and
* be locked by the current thread.
*/
struct vnode *vp = ITOV(ip);
if (vp->v_type == VREG && (flags & IO_EXT) == 0 &&
ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) &&
((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 ||
ffs_blkoff(fs, size) != 0)) {
struct vm_page *pg __diagused;
struct uvm_object *uobj = &vp->v_uobj;
voff_t off = trunc_page(ffs_lblktosize(fs, lbn));
voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size);
rw_enter(uobj->vmobjlock, RW_WRITER);
while (off < endoff) {
pg = uvm_pagelookup(uobj, off);
KASSERT((pg != NULL && pg->owner_tag != NULL &&
pg->owner == curproc->p_pid &&
pg->lowner == curlwp->l_lid));
off += PAGE_SIZE;
}
rw_exit(uobj->vmobjlock);
}
#endif
*bnp = 0;
KASSERTMSG((cred != NOCRED), "missing credential");
KASSERTMSG(((u_int)size <= fs->fs_bsize),
"bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
(unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, size) == 0),
"bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
(unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
goto nospace;
if (freespace(fs, fs->fs_minfree) <= 0 &&
kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
NULL, NULL) != 0)
goto nospace;
#if defined(QUOTA) || defined(QUOTA2)
mutex_exit(&ump->um_lock);
if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
return (error);
mutex_enter(&ump->um_lock);
#endif
if (bpref >= fs->fs_size)
bpref = 0;
if (bpref == 0)
cg = ino_to_cg(fs, ip->i_number);
else
cg = dtog(fs, bpref);
bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg);
if (bno > 0) {
DIP_ADD(ip, blocks, btodb(size));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*bnp = bno;
return (0);
}
#if defined(QUOTA) || defined(QUOTA2)
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(size), cred, FORCE);
#endif
if (flags & B_CONTIG) {
/*
* XXX ump->um_lock handling is "suspect" at best.
* For the case where ffs_hashalloc() fails early
* in the B_CONTIG case we reach here with um_lock
* already unlocked, so we can't release it again
* like in the normal error path. See kern/39206.
*
*
* Fail silently - it's up to our caller to report
* errors.
*/
return (ENOSPC);
}
nospace:
mutex_exit(&ump->um_lock);
ffs_fserr(fs, cred, "file system full");
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
return (ENOSPC);
}
/*
* Reallocate a fragment to a bigger size
*
* The number and size of the old block is given, and a preference
* and new size is also specified. The allocator attempts to extend
* the original block. Failing that, the regular block allocator is
* invoked to get an appropriate block.
*
* => called with um_lock held
* => return with um_lock released
*/
int
ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bprev, daddr_t bpref,
int osize, int nsize, int flags, kauth_cred_t cred, struct buf **bpp,
daddr_t *blknop)
{
struct ufsmount *ump;
struct fs *fs;
struct buf *bp;
u_int cg, request;
int error;
daddr_t bno;
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
#ifdef UVM_PAGE_TRKOWN
/*
* Sanity-check that allocations within the file size
* do not allow other threads to read the stale contents
* of newly allocated blocks.
* Unlike in ffs_alloc(), here pages must always exist
* for such allocations, because only the last block of a file
* can be a fragment and ffs_write() will reallocate the
* fragment to the new size using ufs_balloc_range(),
* which always creates pages to cover blocks it allocates.
*/
if (ITOV(ip)->v_type == VREG) {
struct vm_page *pg __diagused;
struct uvm_object *uobj = &ITOV(ip)->v_uobj;
voff_t off = trunc_page(ffs_lblktosize(fs, lbprev));
voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize);
rw_enter(uobj->vmobjlock, RW_WRITER);
while (off < endoff) {
pg = uvm_pagelookup(uobj, off);
KASSERT(pg->owner == curproc->p_pid &&
pg->lowner == curlwp->l_lid);
off += PAGE_SIZE;
}
rw_exit(uobj->vmobjlock);
}
#endif
KASSERTMSG((cred != NOCRED), "missing credential");
KASSERTMSG(((u_int)osize <= fs->fs_bsize),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, osize) == 0),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG(((u_int)nsize <= fs->fs_bsize),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, nsize) == 0),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
if (freespace(fs, fs->fs_minfree) <= 0 &&
kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
NULL, NULL) != 0) {
mutex_exit(&ump->um_lock);
goto nospace;
}
if (bprev == 0) {
panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %"
PRId64 ", fs = %s", __func__,
(unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
fs->fs_fsmnt);
}
mutex_exit(&ump->um_lock);
/*
* Allocate the extra space in the buffer.
*/
if (bpp != NULL &&
(error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) {
return (error);
}
#if defined(QUOTA) || defined(QUOTA2)
if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
if (bpp != NULL) {
brelse(bp, 0);
}
return (error);
}
#endif
/*
* Check for extension in the existing location.
*/
cg = dtog(fs, bprev);
mutex_enter(&ump->um_lock);
if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
DIP_ADD(ip, blocks, btodb(nsize - osize));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
if (bp->b_blkno != FFS_FSBTODB(fs, bno)) {
panic("%s: bad blockno %#llx != %#llx",
__func__, (unsigned long long) bp->b_blkno,
(unsigned long long)FFS_FSBTODB(fs, bno));
}
allocbuf(bp, nsize, 1);
memset((char *)bp->b_data + osize, 0, nsize - osize);
mutex_enter(bp->b_objlock);
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_oflags |= BO_DONE;
mutex_exit(bp->b_objlock);
*bpp = bp;
}
if (blknop != NULL) {
*blknop = bno;
}
return (0);
}
/*
* Allocate a new disk location.
*/
if (bpref >= fs->fs_size)
bpref = 0;
switch ((int)fs->fs_optim) {
case FS_OPTSPACE:
/*
* Allocate an exact sized fragment. Although this makes
* best use of space, we will waste time relocating it if
* the file continues to grow. If the fragmentation is
* less than half of the minimum free reserve, we choose
* to begin optimizing for time.
*/
request = nsize;
if (fs->fs_minfree < 5 ||
fs->fs_cstotal.cs_nffree >
fs->fs_dsize * fs->fs_minfree / (2 * 100))
break;
if (ffs_log_changeopt) {
log(LOG_NOTICE,
"%s: optimization changed from SPACE to TIME\n",
fs->fs_fsmnt);
}
fs->fs_optim = FS_OPTTIME;
break;
case FS_OPTTIME:
/*
* At this point we have discovered a file that is trying to
* grow a small fragment to a larger fragment. To save time,
* we allocate a full sized block, then free the unused portion.
* If the file continues to grow, the `ffs_fragextend' call
* above will be able to grow it in place without further
* copying. If aberrant programs cause disk fragmentation to
* grow within 2% of the free reserve, we choose to begin
* optimizing for space.
*/
request = fs->fs_bsize;
if (fs->fs_cstotal.cs_nffree <
fs->fs_dsize * (fs->fs_minfree - 2) / 100)
break;
if (ffs_log_changeopt) {
log(LOG_NOTICE,
"%s: optimization changed from TIME to SPACE\n",
fs->fs_fsmnt);
}
fs->fs_optim = FS_OPTSPACE;
break;
default:
panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s",
__func__, (unsigned long long)ip->i_dev, fs->fs_optim,
fs->fs_fsmnt);
/* NOTREACHED */
}
bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg);
if (bno > 0) {
/*
* Use forced deallocation registration; we can't handle
* failure here. This is safe, as this path is hit at most
* once per write operation, when a fragment is extended to
* a longer fragment or to a full block.
*/
if ((ip->i_ump->um_mountp->mnt_wapbl) &&
(ITOV(ip)->v_type != VREG)) {
/* this should never fail */
error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(
ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev),
osize);
if (error)
panic("ffs_realloccg: dealloc registration failed");
} else {
ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
ip->i_number);
}
DIP_ADD(ip, blocks, btodb(nsize - osize));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
bp->b_blkno = FFS_FSBTODB(fs, bno);
allocbuf(bp, nsize, 1);
memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
mutex_enter(bp->b_objlock);
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_oflags |= BO_DONE;
mutex_exit(bp->b_objlock);
*bpp = bp;
}
if (blknop != NULL) {
	*blknop = bno;
}
return (0);
}
mutex_exit(&ump->um_lock);
#if defined(QUOTA) || defined(QUOTA2)
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
#endif
if (bpp != NULL) {
	brelse(bp, 0);
}
nospace:
/*
* no space available
*/
ffs_fserr(fs, cred, "file system full");
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
return (ENOSPC);
}
/*
* Allocate an inode in the file system.
*
* If allocating a directory, use ffs_dirpref to select the inode.
* If allocating in a directory, the following hierarchy is followed:
* 1) allocate the preferred inode.
* 2) allocate an inode in the same cylinder group.
* 3) quadratically rehash into other cylinder groups, until an
* available inode is located.
* If no inode preference is given the following hierarchy is used
* to allocate an inode:
* 1) allocate an inode in cylinder group 0.
* 2) quadratically rehash into other cylinder groups, until an
* available inode is located.
*
* => um_lock not held upon entry or return
*/
int
ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop)
{
struct ufsmount *ump;
struct inode *pip;
struct fs *fs;
ino_t ino, ipref;
u_int cg;
int error;
UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
pip = VTOI(pvp);
fs = pip->i_fs;
ump = pip->i_ump;
error = UFS_WAPBL_BEGIN(pvp->v_mount);
if (error) {
return error;
}
mutex_enter(&ump->um_lock);
if (fs->fs_cstotal.cs_nifree == 0)
goto noinodes;
if ((mode & IFMT) == IFDIR)
ipref = ffs_dirpref(pip);
else
ipref = pip->i_number;
if (ipref >= fs->fs_ncg * fs->fs_ipg)
ipref = 0;
cg = ino_to_cg(fs, ipref);
/*
* Track the number of dirs created one after another
* in the same cg without intervening files.
*/
if ((mode & IFMT) == IFDIR) {
if (fs->fs_contigdirs[cg] < 255)
fs->fs_contigdirs[cg]++;
} else {
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg);
if (ino == 0)
goto noinodes;
UFS_WAPBL_END(pvp->v_mount);
*inop = ino;
return 0;
noinodes:
mutex_exit(&ump->um_lock);
UFS_WAPBL_END(pvp->v_mount);
ffs_fserr(fs, cred, "out of inodes");
uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
return ENOSPC;
}
/*
* Find a cylinder group in which to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files' inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
static ino_t
ffs_dirpref(struct inode *pip)
{
register struct fs *fs;
u_int cg, prefcg;
uint64_t dirsize, cgsize, curdsz;
u_int avgifree, avgbfree, avgndir;
u_int minifree, minbfree, maxndir;
u_int mincg, minndir;
u_int maxcontigdirs;
KASSERT(mutex_owned(&pip->i_ump->um_lock));
fs = pip->i_fs;
avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
/*
* Force allocation in another cg if creating a first level dir.
*/
if (ITOV(pip)->v_vflag & VV_ROOT) {
prefcg = cprng_fast32() % fs->fs_ncg;
mincg = prefcg;
minndir = fs->fs_ipg;
for (cg = prefcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
	    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
	    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
return ((ino_t)(fs->fs_ipg * mincg));
}
/*
* Compute the various limits used for optimal
* allocation of a directory inode.
* Try cylinder groups with >75% avgifree and avgbfree.
* Avoid cylinder groups with no free blocks or inodes as that
* triggers an I/O-expensive cylinder group scan.
*/
maxndir = uimin(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
minifree = avgifree - avgifree / 4;
if (minifree < 1)
minifree = 1;
minbfree = avgbfree - avgbfree / 4;
if (minbfree < 1)
minbfree = 1;
cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
if (avgndir != 0) {
	curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
if (dirsize < curdsz)
dirsize = curdsz;
}
if (cgsize < dirsize * 255)
	maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize;
else
maxcontigdirs = 255;
if (fs->fs_avgfpdir > 0)
maxcontigdirs = uimin(maxcontigdirs,
fs->fs_ipg / fs->fs_avgfpdir);
if (maxcontigdirs == 0)
maxcontigdirs = 1;
/*
* Limit number of dirs in one cg and reserve space for
* regular files, but only if we have no deficit in
* inodes or space.
*/
prefcg = ino_to_cg(fs, pip->i_number);
for (cg = prefcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
	return ((ino_t)(fs->fs_ipg * cg));
}
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
	return ((ino_t)(fs->fs_ipg * cg));
}
/*
* This is a backstop when we are deficient in space.
*/
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
return ((ino_t)(fs->fs_ipg * cg));
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
break;
return ((ino_t)(fs->fs_ipg * cg));
}
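/*
 * An illustrative sketch of the maxcontigdirs computation above; the
 * helper and parameter names are hypothetical.  The real code also
 * raises dirsize to the measured per-directory footprint when that is
 * larger, clamps the result against fs_ipg / fs_avgfpdir, and never
 * lets it drop below 1.
 */
#if 0
static u_int
example_maxcontigdirs(uint64_t avgbfree, uint64_t bsize,
    uint64_t avgfilesize, uint64_t avgfpdir, uint64_t cgsize)
{
	/* expected on-disk footprint of one average directory */
	uint64_t dirsize = avgfilesize * avgfpdir;

	if (cgsize < dirsize * 255) {
		/* a cg cannot hold 255 such dirs: scale by the free space */
		return (u_int)((avgbfree * bsize) / dirsize);
	}
	return 255;
}
#endif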
/*
* Select the desired position for the next block in a file. The file is
* logically divided into sections. The first section is composed of the
* direct blocks. Each additional section contains fs_maxbpg blocks.
*
* If no blocks have been allocated in the first section, the policy is to
* request a block in the same cylinder group as the inode that describes
* the file. If no blocks have been allocated in any other section, the
* policy is to place the section in a cylinder group with a greater than
* average number of free blocks. An appropriate cylinder group is found
* by using a rotor that sweeps the cylinder groups. When a new group of
* blocks is needed, the sweep begins in the cylinder group following the
* cylinder group from which the previous allocation was made. The sweep
* continues until a cylinder group with greater than the average number
* of free blocks is found. If the allocation is for the first block in an
* indirect block, the information on the previous allocation is unavailable;
* here a best guess is made based upon the logical block number being
* allocated.
*
* If a section is already partially allocated, the policy is to
* contiguously allocate fs_maxcontig blocks. The end of one of these
* contiguous blocks and the beginning of the next is laid out
* contiguously if possible.
*
* => um_lock held on entry and exit
*/
daddr_t
ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
int32_t *bap /* XXX ondisk32 */)
{
struct fs *fs;
u_int cg;
u_int avgbfree, startcg;
KASSERT(mutex_owned(&ip->i_ump->um_lock));
fs = ip->i_fs;
/*
* If allocating a contiguous file with B_CONTIG, use the hints
* in the inode extensions to return the desired block.
*
* For metadata (indirect blocks) return the address of where
* the first indirect block resides - we'll scan for the next
* available slot if we need to allocate more than one indirect
* block. For data, return the address of the actual block
* relative to the address of the first data block.
*/
if (flags & B_CONTIG) {
KASSERT(ip->i_ffs_first_data_blk != 0);
KASSERT(ip->i_ffs_first_indir_blk != 0);
if (flags & B_METAONLY)
return ip->i_ffs_first_indir_blk;
else
return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
}
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
cg = ino_to_cg(fs, ip->i_number);
return (cgbase(fs, cg) + fs->fs_frag);
}
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg =
ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs,
ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
for (cg = 0; cg < startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
return (0);
}
/*
* We just always try to lay things out contiguously.
*/
return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}
daddr_t
ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
int64_t *bap)
{
struct fs *fs;
u_int cg;
u_int avgbfree, startcg;
KASSERT(mutex_owned(&ip->i_ump->um_lock));
fs = ip->i_fs;
/*
* If allocating a contiguous file with B_CONTIG, use the hints
* in the inode extensions to return the desired block.
*
* For metadata (indirect blocks) return the address of where
* the first indirect block resides - we'll scan for the next
* available slot if we need to allocate more than one indirect
* block. For data, return the address of the actual block
* relative to the address of the first data block.
*/
if (flags & B_CONTIG) {
KASSERT(ip->i_ffs_first_data_blk != 0);
KASSERT(ip->i_ffs_first_indir_blk != 0);
if (flags & B_METAONLY)
return ip->i_ffs_first_indir_blk;
else
return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
}
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
cg = ino_to_cg(fs, ip->i_number);
return (cgbase(fs, cg) + fs->fs_frag);
}
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg =
ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs,
ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
for (cg = 0; cg < startcg; cg++)
	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
return (0);
}
/*
* We just always try to lay things out contiguously.
*/
return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}
/*
* Implement the cylinder overflow algorithm.
*
* The policy implemented by this algorithm is:
* 1) allocate the block in its requested cylinder group.
* 2) quadratically rehash on the cylinder group number.
* 3) brute force search for a free block.
*
* => called with um_lock held
* => returns with um_lock released on success, held on failure
* (*allocator releases lock on success, retains lock on failure)
*/
/*VARARGS5*/
static daddr_t
ffs_hashalloc(struct inode *ip, u_int cg, daddr_t pref,
int size /* size for data blocks, mode for inodes */,
int realsize,
int flags,
daddr_t (*allocator)(struct inode *, u_int, daddr_t, int, int, int))
{
struct fs *fs;
daddr_t result;
u_int i, icg = cg;
fs = ip->i_fs;
/*
* 1: preferred cylinder group
*/
result = (*allocator)(ip, cg, pref, size, realsize, flags);
if (result)
return (result);
if (flags & B_CONTIG)
return (result);
/*
* 2: quadratic rehash
*/
for (i = 1; i < fs->fs_ncg; i *= 2) {
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
result = (*allocator)(ip, cg, 0, size, realsize, flags);
if (result)
return (result);
}
/*
* 3: brute force search
* Note that we start at i == 2, since 0 was checked initially,
* and 1 is always checked in the quadratic rehash.
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
result = (*allocator)(ip, cg, 0, size, realsize, flags);
if (result)
return (result);
cg++;
if (cg == fs->fs_ncg)
cg = 0;
}
return (0);
}
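/*
 * An illustrative sketch of the probe order used above; the helper name
 * is hypothetical.  For a preferred cylinder group icg, the quadratic
 * rehash in step 2 visits icg+1, icg+3, icg+7, icg+15, ... (mod
 * fs_ncg); step 3 then walks the remaining groups linearly starting at
 * icg+2.
 */
#if 0
static u_int
example_rehash_probe(u_int icg, u_int ncg, u_int npasses)
{
	u_int cg = icg, i, n;

	/* accumulate the doubling offsets 1, 2, 4, ... for npasses steps */
	for (n = 0, i = 1; n < npasses && i < ncg; n++, i *= 2) {
		cg += i;
		if (cg >= ncg)
			cg -= ncg;
	}
	return cg;
}
#endif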
/*
* Determine whether a fragment can be extended.
*
* Check to see if the necessary fragments are available, and
* if they are, allocate them.
*
* => called with um_lock held
* => returns with um_lock released on success, held on failure
*/
static daddr_t
ffs_fragextend(struct inode *ip, u_int cg, daddr_t bprev, int osize, int nsize)
{
struct ufsmount *ump;
struct fs *fs;
struct cg *cgp;
struct buf *bp;
daddr_t bno;
int frags, bbase;
int i, error;
u_int8_t *blksfree;
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize))
return (0);
frags = ffs_numfrags(fs, nsize);
bbase = ffs_fragnum(fs, bprev);
if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) {
/* cannot extend across a block boundary */
return (0);
}
mutex_exit(&ump->um_lock);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
goto fail;
cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
bno = dtogd(fs, bprev);
blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
for (i = ffs_numfrags(fs, osize); i < frags; i++)
	if (isclr(blksfree, bno + i))
goto fail;
/*
* The current fragment can be extended:
* deduct the count on the fragment being extended into,
* increase the count on the remaining fragment (if any),
* and allocate the extended piece.
*/
for (i = frags; i < fs->fs_frag - bbase; i++)
	if (isclr(blksfree, bno + i))
break;
ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1,
    UFS_FSNEEDSWAP(fs));
if (i != frags)
	ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
mutex_enter(&ump->um_lock);
for (i = ffs_numfrags(fs, osize); i < frags; i++) {
clrbit(blksfree, bno + i);
ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
fs->fs_cstotal.cs_nffree--;
fs->fs_cs(fs, cg).cs_nffree--;
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return (bprev);
fail:
if (bp != NULL)
	brelse(bp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Determine whether a block can be allocated.
*
* Check to see if a block of the appropriate size is available,
* and if it is, allocate it.
*/
static daddr_t
ffs_alloccg(struct inode *ip, u_int cg, daddr_t bpref, int size, int realsize,
int flags)
{
struct ufsmount *ump;
struct fs *fs = ip->i_fs;
struct cg *cgp;
struct buf *bp;
int32_t bno;
daddr_t blkno;
int error, frags, allocsiz, i;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
return (0);
mutex_exit(&ump->um_lock);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap) || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
goto fail;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, needswap);
if (size == fs->fs_bsize) {
mutex_enter(&ump->um_lock);
blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
/*
* If the size actually needed is smaller, free the extra blocks now.
* This is safe to call here, there is no outside reference
* to this block yet. It is not necessary to keep um_lock
* locked.
*/
if (realsize != 0 && realsize < size) {
	ffs_blkfree_common(ip->i_ump, ip->i_fs,
ip->i_devvp->v_rdev,
bp, blkno + ffs_numfrags(fs, realsize),
(long)(size - realsize), false);
}
bdwrite(bp);
return (blkno);
}
/*
* check to see if any fragments are already available
* allocsiz is the size which will be allocated, hacking
* it down to a smaller size if necessary
*/
blksfree = cg_blksfree(cgp, needswap);
frags = ffs_numfrags(fs, size);
for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
	if (cgp->cg_frsum[allocsiz] != 0)
break;
if (allocsiz == fs->fs_frag) {
/*
* no fragments were available, so a block will be
* allocated, and hacked up
*/
if (cgp->cg_cs.cs_nbfree == 0)
goto fail;
mutex_enter(&ump->um_lock);
blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
bno = dtogd(fs, blkno);
for (i = frags; i < fs->fs_frag; i++)
setbit(blksfree, bno + i);
i = fs->fs_frag - frags;
ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
fs->fs_fmod = 1;
ufs_add32(cgp->cg_frsum[i], 1, needswap);
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return (blkno);
}
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
#if 0
/*
* XXX fvdl mapsearch will panic, and never return -1
* also: returning NULL as daddr_t ?
*/
if (bno < 0)
goto fail;
#endif
for (i = 0; i < frags; i++)
clrbit(blksfree, bno + i);
mutex_enter(&ump->um_lock);
ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
fs->fs_cstotal.cs_nffree -= frags;
fs->fs_cs(fs, cg).cs_nffree -= frags;
fs->fs_fmod = 1;
ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
if (frags != allocsiz)
	ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
blkno = cgbase(fs, cg) + bno;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return blkno;
fail:
if (bp != NULL)
	brelse(bp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Allocate a block in a cylinder group.
*
* This algorithm implements the following policy:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate the next available block on the block rotor for the
* specified cylinder group.
* Note that this routine only allocates fs_bsize blocks; these
* blocks may be fragmented by the routine that allocates them.
*/
static daddr_t
ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize,
int flags)
{
struct fs *fs = ip->i_fs;
struct cg *cgp;
int cg;
daddr_t blkno;
int32_t bno;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(mutex_owned(&ip->i_ump->um_lock));
cgp = (struct cg *)bp->b_data;
blksfree = cg_blksfree(cgp, needswap);
if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
	bpref = ufs_rw32(cgp->cg_rotor, needswap);
} else {
bpref = ffs_blknum(fs, bpref);
bno = dtogd(fs, bpref);
/*
* if the requested block is available, use it
*/
if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno)))
goto gotit;
/*
* if the requested data block isn't available and we are
* trying to allocate a contiguous file, return an error.
*/
if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
return (0);
}
/*
* Take the next available block in this cylinder group.
*/
bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
#if 0
/*
* XXX jdolecek ffs_mapsearch() succeeds or panics
*/
if (bno < 0)
return (0);
#endif
cgp->cg_rotor = ufs_rw32(bno, needswap);
gotit:
blkno = ffs_fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, blkno);
ffs_clusteracct(fs, cgp, blkno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
int cylno;
cylno = old_cbtocylno(fs, bno);
KASSERT(cylno >= 0);
KASSERT(cylno < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, bno) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
    old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
}
fs->fs_fmod = 1;
cg = ufs_rw32(cgp->cg_cgx, needswap);
blkno = cgbase(fs, cg) + bno;
return (blkno);
}
/*
* Determine whether an inode can be allocated.
*
* Check to see if an inode is available, and if it is,
* allocate it using the following policy:
* 1) allocate the requested inode.
* 2) allocate the next available inode after the requested
* inode in the specified cylinder group.
*/
static daddr_t
ffs_nodealloccg(struct inode *ip, u_int cg, daddr_t ipref, int mode, int realsize,
int flags)
{
struct ufsmount *ump = ip->i_ump;
struct fs *fs = ip->i_fs;
struct cg *cgp;
struct buf *bp, *ibp;
u_int8_t *inosused;
int error, start, len, loc, map, i;
int32_t initediblk, maxiblk, irotor;
daddr_t nalloc;
struct ufs2_dinode *dp2;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(mutex_owned(&ump->um_lock));
UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
if (fs->fs_cs(fs, cg).cs_nifree == 0)
return (0);
mutex_exit(&ump->um_lock);
ibp = NULL;
if (fs->fs_magic == FS_UFS2_MAGIC) {
initediblk = -1;
} else {
initediblk = fs->fs_ipg;
}
maxiblk = initediblk;
retry:
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
goto fail;
if (ibp != NULL && initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
/* Another thread allocated more inodes so we retry the test. */
brelse(ibp, 0);
ibp = NULL;
}
/*
* Check to see if we need to initialize more inodes.
*/
if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
	initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
maxiblk = initediblk;
nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
if (nalloc + FFS_INOPB(fs) > initediblk && initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
/*
* We have to release the cg buffer here to prevent
* a deadlock: reading the inode block may trigger a
* copy-on-write that uses this cg.
*/
brelse(bp, 0);
bp = NULL;
error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs,
ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
FFS_NOBLK, fs->fs_bsize, false, &ibp);
if (error)
goto fail;
maxiblk += FFS_INOPB(fs);
goto retry;
}
}
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
    (fs->fs_old_flags & FS_FLAGS_UPDATED))
	cgp->cg_time = ufs_rw64(time_second, needswap);
inosused = cg_inosused(cgp, needswap);
if (ipref) {
ipref %= fs->fs_ipg;
/* safeguard to stay in (to be) allocated range */
if (ipref < maxiblk && isclr(inosused, ipref))
goto gotit;
}
irotor = ufs_rw32(cgp->cg_irotor, needswap);
KASSERTMSG(irotor < initediblk, "%s: allocation botch: cg=%d, irotor %d"
" out of bounds, initediblk=%d",
__func__, cg, irotor, initediblk);
start = irotor / NBBY;
len = howmany(maxiblk - irotor, NBBY);
loc = skpc(0xff, len, &inosused[start]);
if (loc == 0) {
len = start + 1;
start = 0;
loc = skpc(0xff, len, &inosused[0]);
if (loc == 0) {
panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s",
__func__, cg, ufs_rw32(cgp->cg_irotor, needswap),
fs->fs_fsmnt);
/* NOTREACHED */
}
}
i = start + len - loc;
map = inosused[i] ^ 0xff;
if (map == 0) {
panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt);
}
ipref = i * NBBY + ffs(map) - 1;
cgp->cg_irotor = ufs_rw32(ipref, needswap);
gotit:
KASSERTMSG(ipref < maxiblk, "%s: allocation botch: cg=%d attempt to "
"allocate inode index %d beyond max allocated index %d"
" of %d inodes/cg",
__func__, cg, (int)ipref, maxiblk, cgp->cg_niblk);
UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
mode);
/*
* Check to see if we need to initialize more inodes.
*/
if (ibp != NULL) {
	KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
memset(ibp->b_data, 0, fs->fs_bsize);
dp2 = (struct ufs2_dinode *)(ibp->b_data);
for (i = 0; i < FFS_INOPB(fs); i++) {
/*
* Don't bother to swap, it's supposed to be
* random, after all.
*/
dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
dp2++;
}
initediblk += FFS_INOPB(fs);
cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
}
mutex_enter(&ump->um_lock);
ACTIVECG_CLR(fs, cg);
setbit(inosused, ipref);
ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
fs->fs_cstotal.cs_nifree--;
fs->fs_cs(fs, cg).cs_nifree--;
fs->fs_fmod = 1;
if ((mode & IFMT) == IFDIR) {
	ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
fs->fs_cstotal.cs_ndir++;
fs->fs_cs(fs, cg).cs_ndir++;
}
mutex_exit(&ump->um_lock);
if (ibp != NULL) {
bwrite(ibp);
bwrite(bp);
} else
bdwrite(bp);
return ((ino_t)(cg * fs->fs_ipg + ipref));
fail:
if (bp != NULL)
	brelse(bp, 0);
if (ibp != NULL)
	brelse(ibp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Allocate a block or fragment.
*
* The specified block or fragment is removed from the
* free map, possibly fragmenting a block in the process.
*
* This implementation should mirror ffs_blkfree().
*
* => um_lock not held on entry or exit
*/
int
ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
{
int error;
error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
ip->i_dev, ip->i_uid);
if (error)
return error;
return ffs_blkalloc_ump(ip->i_ump, bno, size);
}
int
ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
{
struct fs *fs = ump->um_fs;
struct cg *cgp;
struct buf *bp;
int32_t fragno, cgbno;
int i, error, blk, frags, bbase;
u_int cg;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 &&
ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag);
KASSERT(bno < fs->fs_size);
cg = dtog(fs, bno);
error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error) {
return error;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return EIO;
}
cgp->cg_old_time = ufs_rw32(time_second, needswap);
cgp->cg_time = ufs_rw64(time_second, needswap);
cgbno = dtogd(fs, bno);
blksfree = cg_blksfree(cgp, needswap);
mutex_enter(&ump->um_lock);
if (size == fs->fs_bsize) {
fragno = ffs_fragstoblks(fs, cgbno);
if (!ffs_isblock(fs, blksfree, fragno)) {
mutex_exit(&ump->um_lock);
brelse(bp, 0);
return EBUSY;
}
ffs_clrblock(fs, blksfree, fragno);
ffs_clusteracct(fs, cgp, fragno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cg).cs_nbfree--;
} else {
bbase = cgbno - ffs_fragnum(fs, cgbno);
frags = ffs_numfrags(fs, size);
for (i = 0; i < frags; i++) {
if (isclr(blksfree, cgbno + i)) {
mutex_exit(&ump->um_lock);
brelse(bp, 0);
return EBUSY;
}
}
/*
* if a complete block is being split, account for it
*/
fragno = ffs_fragstoblks(fs, bbase);
if (ffs_isblock(fs, blksfree, fragno)) {
ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
fs->fs_cstotal.cs_nffree += fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
ffs_clusteracct(fs, cgp, fragno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cg).cs_nbfree--;
}
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
/*
* allocate the fragment
*/
for (i = 0; i < frags; i++) {
clrbit(blksfree, cgbno + i);
}
ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
fs->fs_cstotal.cs_nffree -= i;
fs->fs_cs(fs, cg).cs_nffree -= i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return 0;
}
/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*
* => um_lock not held on entry or exit
*/
static void
ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size)
{
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
daddr_t cgblkno;
int error;
u_int cg;
dev_t dev;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(!devvp_is_snapshot);
cg = dtog(fs, bno);
dev = devvp->v_rdev;
ump = VFSTOUFS(spec_node_getmountedfs(devvp));
KASSERT(fs == ump->um_fs);
cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg));
error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
	brelse(bp, 0);
return;
}
ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
bdwrite(bp);
}
struct discardopdata {
struct work wk; /* must be first */
struct vnode *devvp;
daddr_t bno;
long size;
};
struct discarddata {
struct fs *fs;
struct discardopdata *entry;
long maxsize;
kmutex_t entrylk;
struct workqueue *wq;
int wqcnt, wqdraining;
kmutex_t wqlk;
kcondvar_t wqcv;
/* timer for flush? */
};
static void
ffs_blkfree_td(struct fs *fs, struct discardopdata *td)
{
struct mount *mp = spec_node_getmountedfs(td->devvp);
long todo;
int error;
while (td->size) {
todo = uimin(td->size,
ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno))));
error = UFS_WAPBL_BEGIN(mp);
if (error) {
printf("ffs: failed to begin wapbl transaction"
" for discard: %d\n", error);
break;
}
ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
UFS_WAPBL_END(mp);
td->bno += ffs_numfrags(fs, todo);
td->size -= todo;
}
}
static void
ffs_discardcb(struct work *wk, void *arg)
{
struct discardopdata *td = (void *)wk;
struct discarddata *ts = arg;
struct fs *fs = ts->fs;
off_t start, len;
#ifdef TRIMDEBUG
int error;
#endif
/* like FSBTODB but emits bytes; XXX move to fs.h */
#ifndef FFS_FSBTOBYTES
#define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift)
#endif
start = FFS_FSBTOBYTES(fs, td->bno);
len = td->size;
vn_lock(td->devvp, LK_EXCLUSIVE | LK_RETRY);
#ifdef TRIMDEBUG
error =
#endif
VOP_FDISCARD(td->devvp, start, len);
VOP_UNLOCK(td->devvp);
#ifdef TRIMDEBUG
printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error);
#endif
ffs_blkfree_td(fs, td);
kmem_free(td, sizeof(*td));
mutex_enter(&ts->wqlk);
ts->wqcnt--;
if (ts->wqdraining && !ts->wqcnt)
cv_signal(&ts->wqcv);
mutex_exit(&ts->wqlk);
}
void *
ffs_discard_init(struct vnode *devvp, struct fs *fs)
{
struct discarddata *ts;
int error;
ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts,
PRI_USER, IPL_NONE, 0);
if (error) {
kmem_free(ts, sizeof (*ts));
return NULL;
}
mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ts->wqcv, "trimwqcv");
ts->maxsize = 100*1024; /* XXX */
ts->fs = fs;
return ts;
}
void
ffs_discard_finish(void *vts, int flags)
{
struct discarddata *ts = vts;
struct discardopdata *td = NULL;
/* wait for workqueue to drain */
mutex_enter(&ts->wqlk);
if (ts->wqcnt) {
ts->wqdraining = 1;
cv_wait(&ts->wqcv, &ts->wqlk);
}
mutex_exit(&ts->wqlk);
mutex_enter(&ts->entrylk);
if (ts->entry) {
td = ts->entry;
ts->entry = NULL;
}
mutex_exit(&ts->entrylk);
if (td) {
/* XXX don't tell the disk, it's optional */
ffs_blkfree_td(ts->fs, td);
#ifdef TRIMDEBUG
printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
kmem_free(td, sizeof(*td));
}
cv_destroy(&ts->wqcv);
mutex_destroy(&ts->entrylk);
mutex_destroy(&ts->wqlk);
workqueue_destroy(ts->wq);
kmem_free(ts, sizeof(*ts));
}
void
ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
ino_t inum)
{
struct ufsmount *ump;
int error;
dev_t dev;
struct discarddata *ts;
struct discardopdata *td;
dev = devvp->v_rdev;
ump = VFSTOUFS(spec_node_getmountedfs(devvp));
if (ffs_snapblkfree(fs, devvp, bno, size, inum))
return;
error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
if (error)
return;
if (!ump->um_discarddata) {
ffs_blkfree_cg(fs, devvp, bno, size);
return;
}
#ifdef TRIMDEBUG
printf("blkfree(%" PRId64 ",%ld)\n", bno, size);
#endif
ts = ump->um_discarddata;
td = NULL;
mutex_enter(&ts->entrylk);
if (ts->entry) {
td = ts->entry;
/* ffs deallocs backwards, check for prepend only */
if (td->bno == bno + ffs_numfrags(fs, size) && td->size + size <= ts->maxsize) {
td->bno = bno;
td->size += size;
if (td->size < ts->maxsize) {
#ifdef TRIMDEBUG
printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_exit(&ts->entrylk);
return;
}
size = 0; /* mark done */
}
ts->entry = NULL;
}
mutex_exit(&ts->entrylk);
if (td) {
#ifdef TRIMDEBUG
printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_enter(&ts->wqlk);
ts->wqcnt++;
mutex_exit(&ts->wqlk);
workqueue_enqueue(ts->wq, &td->wk, NULL);
}
if (!size)
return;
td = kmem_alloc(sizeof(*td), KM_SLEEP);
td->devvp = devvp;
td->bno = bno;
td->size = size;
if (td->size < ts->maxsize) { /* XXX always the case */
mutex_enter(&ts->entrylk);
if (!ts->entry) { /* possible race? */
#ifdef TRIMDEBUG
printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
ts->entry = td;
td = NULL;
}
mutex_exit(&ts->entrylk);
}
if (td) {
#ifdef TRIMDEBUG
printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_enter(&ts->wqlk);
ts->wqcnt++;
mutex_exit(&ts->wqlk);
workqueue_enqueue(ts->wq, &td->wk, NULL);
}
}
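/*
 * An illustrative sketch of the coalescing test used above; the helper
 * name is hypothetical.  Because ffs releases a file's blocks from the
 * end towards the start, a newly freed range can only ever abut the
 * front of the pending discard entry, so a single prepend check is
 * sufficient.
 */
#if 0
static bool
example_can_prepend(const struct discardopdata *pending, daddr_t bno,
    long size, long maxsize, struct fs *fs)
{
	return pending->bno == bno + ffs_numfrags(fs, size) &&
	    pending->size + size <= maxsize;
}
#endif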
/*
* Free a block or fragment from a snapshot cg copy.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*
* => um_lock not held on entry or exit
*/
void
ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
ino_t inum)
{
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
daddr_t cgblkno;
int error, cg;
dev_t dev;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(devvp_is_snapshot);
cg = dtog(fs, bno);
dev = VTOI(devvp)->i_devvp->v_rdev;
ump = VFSTOUFS(devvp->v_mount);
cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));
error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
if (error)
return;
error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return;
}
ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
bdwrite(bp);
}
static void
ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
{
struct cg *cgp;
int32_t fragno, cgbno;
int i, blk, frags, bbase;
u_int cg;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
cg = dtog(fs, bno);
cgp = (struct cg *)bp->b_data;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, needswap);
cgbno = dtogd(fs, bno);
blksfree = cg_blksfree(cgp, needswap);
mutex_enter(&ump->um_lock);
if (size == fs->fs_bsize) {
fragno = ffs_fragstoblks(fs, cgbno);
if (!ffs_isfreeblock(fs, blksfree, fragno)) {
if (devvp_is_snapshot) {
mutex_exit(&ump->um_lock);
return;
}
panic("%s: freeing free block: dev = 0x%llx, block = %"
PRId64 ", fs = %s", __func__,
(unsigned long long)dev, bno, fs->fs_fsmnt);
}
ffs_setblock(fs, blksfree, fragno);
ffs_clusteracct(fs, cgp, fragno, 1);
ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
i = old_cbtocylno(fs, cgbno);
KASSERT(i >= 0);
KASSERT(i < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, cgbno) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
    old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
}
} else {
bbase = cgbno - ffs_fragnum(fs, cgbno);
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
/*
* deallocate the fragment
*/
frags = ffs_numfrags(fs, size);
for (i = 0; i < frags; i++) {
if (isset(blksfree, cgbno + i)) {
panic("%s: freeing free frag: "
"dev = 0x%llx, block = %" PRId64
", fs = %s", __func__,
(unsigned long long)dev, bno + i,
fs->fs_fsmnt);
}
setbit(blksfree, cgbno + i);
}
ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
/*
* if a complete block has been reassembled, account for it
*/
fragno = ffs_fragstoblks(fs, bbase);
if (ffs_isblock(fs, blksfree, fragno)) {
	ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
ffs_clusteracct(fs, cgp, fragno, 1);
ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
i = old_cbtocylno(fs, bbase);
KASSERT(i >= 0);
KASSERT(i < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, bbase) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
    old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
bbase)], 1, needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
}
}
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
}
/*
* Free an inode.
*/
int
ffs_vfree(struct vnode *vp, ino_t ino, int mode)
{
return ffs_freefile(vp->v_mount, ino, mode);
}
/*
* Do the actual free operation.
* The specified inode is placed back in the free map.
*
* => um_lock not held on entry or exit
*/
int
ffs_freefile(struct mount *mp, ino_t ino, int mode)
{
struct ufsmount *ump = VFSTOUFS(mp);
struct fs *fs = ump->um_fs;
struct vnode *devvp;
struct cg *cgp;
struct buf *bp;
int error;
u_int cg;
daddr_t cgbno;
dev_t dev;
const int needswap = UFS_FSNEEDSWAP(fs);
cg = ino_to_cg(fs, ino);
devvp = ump->um_devvp;
dev = devvp->v_rdev;
cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
(long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
error = bread(devvp, cgbno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
	brelse(bp, 0);
return (0);
}
ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
bdwrite(bp);
return 0;
}
int
ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
{
struct ufsmount *ump;
struct cg *cgp;
struct buf *bp;
int error, cg;
daddr_t cgbno;
dev_t dev;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(devvp->v_type != VBLK);
cg = ino_to_cg(fs, ino);
dev = VTOI(devvp)->i_devvp->v_rdev;
ump = VFSTOUFS(devvp->v_mount);
cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
(unsigned long long)dev, (unsigned long long)ino,
fs->fs_fsmnt);
error = bread(devvp, cgbno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return (0);
}
ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
bdwrite(bp);
return 0;
}
static void
ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
{
u_int cg;
struct cg *cgp;
u_int8_t *inosused;
const int needswap = UFS_FSNEEDSWAP(fs);
ino_t cgino;
cg = ino_to_cg(fs, ino);
cgp = (struct cg *)bp->b_data;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
    (fs->fs_old_flags & FS_FLAGS_UPDATED))
	cgp->cg_time = ufs_rw64(time_second, needswap);
inosused = cg_inosused(cgp, needswap);
cgino = ino % fs->fs_ipg;
if (isclr(inosused, cgino)) {
printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
(unsigned long long)dev, (unsigned long long)ino,
fs->fs_fsmnt);
if (fs->fs_ronly == 0)
panic("%s: freeing free inode", __func__);
}
clrbit(inosused, cgino);
if (!devvp_is_snapshot)
	UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp, ino, mode);
if (cgino < ufs_rw32(cgp->cg_irotor, needswap))
	cgp->cg_irotor = ufs_rw32(cgino, needswap);
ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
mutex_enter(&ump->um_lock);
fs->fs_cstotal.cs_nifree++;
fs->fs_cs(fs, cg).cs_nifree++;
if ((mode & IFMT) == IFDIR) {
	ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
fs->fs_cstotal.cs_ndir--;
fs->fs_cs(fs, cg).cs_ndir--;
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
}
/*
* Check to see if a file is free.
*/
int
ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
{
struct cg *cgp;
struct buf *bp;
daddr_t cgbno;
int ret;
u_int cg;
u_int8_t *inosused;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
KASSERT(devvp_is_snapshot);
cg = ino_to_cg(fs, ino);
if (devvp_is_snapshot)
cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
else
cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
return 1;
if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) {
return 1;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
brelse(bp, 0);
return 1;
}
inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
ino %= fs->fs_ipg;
ret = isclr(inosused, ino);
brelse(bp, 0);
return ret;
}
/*
* Find a block of the specified size in the specified cylinder group.
*
* It is a panic if a request is made to find a block when
* none are available.
*/
static int32_t
ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
{
int32_t bno;
int start, len, loc, i;
int blk, field, subfield, pos;
int ostart, olen;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
/* KASSERT(mutex_owned(&ump->um_lock)); */
/*
* find the fragment by searching through the free block
* map for an appropriate bit pattern
*/
if (bpref)
start = dtogd(fs, bpref) / NBBY;
else
start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
blksfree = cg_blksfree(cgp, needswap);
len = howmany(fs->fs_fpg, NBBY) - start;
ostart = start;
olen = len;
loc = scanc((u_int)len,
(const u_char *)&blksfree[start],
(const u_char *)fragtbl[fs->fs_frag],
(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
if (loc == 0) {
len = start + 1;
start = 0;
loc = scanc((u_int)len,
(const u_char *)&blksfree[0],
(const u_char *)fragtbl[fs->fs_frag],
(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
if (loc == 0) {
panic("%s: map corrupted: start=%d, len=%d, "
"fs = %s, offset=%d/%ld, cg %d", __func__,
ostart, olen, fs->fs_fsmnt,
ufs_rw32(cgp->cg_freeoff, needswap),
(long)blksfree - (long)cgp, cgp->cg_cgx);
/* NOTREACHED */
}
}
bno = (start + len - loc) * NBBY;
cgp->cg_frotor = ufs_rw32(bno, needswap);
/*
* found the byte in the map
* sift through the bits to find the selected frag
*/
for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
blk = blkmap(fs, blksfree, bno);
blk <<= 1;
field = around[allocsiz];
subfield = inside[allocsiz];
for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
if ((blk & field) == subfield)
return (bno + pos);
field <<= 1;
subfield <<= 1;
}
}
panic("%s: block not in map: bno=%d, fs=%s", __func__,
bno, fs->fs_fsmnt);
/* return (-1); */
}
/*
* Fserr prints the name of a file system with an error diagnostic.
*
* The form of the error message is:
* fs: error message
*/
static void
ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp)
{
KASSERT(cred != NULL);
if (cred == NOCRED || cred == FSCRED) {
log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
curproc->p_pid, curproc->p_comm,
fs->fs_fsmnt, cp);
} else {
log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
fs->fs_fsmnt, cp);
}
}
/* $NetBSD: in_var.h,v 1.103 2022/11/19 08:00:51 yamt Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1985, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_var.h 8.2 (Berkeley) 1/9/95
*/
#ifndef _NETINET_IN_VAR_H_
#define _NETINET_IN_VAR_H_
#include <sys/queue.h>
#define IN_IFF_TENTATIVE 0x01 /* tentative address */
#define IN_IFF_DUPLICATED 0x02 /* DAD detected duplicate */
#define IN_IFF_DETACHED 0x04 /* may be detached from the link */
#define IN_IFF_TRYTENTATIVE 0x08 /* intent to try DAD */
#define IN_IFFBITS \
"\020\1TENTATIVE\2DUPLICATED\3DETACHED\4TRYTENTATIVE"
/* do not input/output */
#define IN_IFF_NOTREADY \
(IN_IFF_TRYTENTATIVE | IN_IFF_TENTATIVE | IN_IFF_DUPLICATED)
/*
* Interface address, Internet version. One of these structures
* is allocated for each interface with an Internet address.
* The ifaddr structure contains the protocol-independent part
* of the structure and is assumed to be first.
*/
struct in_ifaddr {
struct ifaddr ia_ifa; /* protocol-independent info */
#define ia_ifp ia_ifa.ifa_ifp
#define ia_flags ia_ifa.ifa_flags
/* ia_{,sub}net{,mask} in host order */
u_int32_t ia_net; /* network number of interface */
u_int32_t ia_netmask; /* mask of net part */
u_int32_t ia_subnet; /* subnet number, including net */
u_int32_t ia_subnetmask; /* mask of subnet part */
struct in_addr ia_netbroadcast; /* to recognize net broadcasts */
LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */
TAILQ_ENTRY(in_ifaddr) ia_list; /* list of internet addresses */
struct sockaddr_in ia_addr; /* reserve space for interface name */
struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
#define ia_broadaddr ia_dstaddr
struct sockaddr_in ia_sockmask; /* reserve space for general netmask */
LIST_HEAD(, in_multi) ia_multiaddrs; /* list of multicast addresses */
struct in_multi *ia_allhosts; /* multicast address record for
the allhosts multicast group */
uint16_t ia_idsalt; /* ip_id salt for this ia */
int ia4_flags; /* address flags */
void (*ia_dad_start) (struct ifaddr *); /* DAD start function */
void (*ia_dad_stop) (struct ifaddr *); /* DAD stop function */
time_t ia_dad_defended; /* last time of DAD defence */
#ifdef _KERNEL
struct pslist_entry ia_hash_pslist_entry;
struct pslist_entry ia_pslist_entry;
#endif
};
struct in_nbrinfo {
char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */
struct in_addr addr; /* IPv4 address of the neighbor */
long asked; /* number of queries already sent for this addr */
int state; /* reachability state */
int expire; /* lifetime for NDP state transition */
};
#ifdef _KERNEL
static __inline void
ia4_acquire(struct in_ifaddr *ia, struct psref *psref)
{
KASSERT(ia != NULL);
ifa_acquire(&ia->ia_ifa, psref);
}
static __inline void
ia4_release(struct in_ifaddr *ia, struct psref *psref)
{
if (ia == NULL)
return;
ifa_release(&ia->ia_ifa, psref);
}
#endif
struct in_aliasreq {
char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct sockaddr_in ifra_addr;
struct sockaddr_in ifra_dstaddr;
#define ifra_broadaddr ifra_dstaddr
struct sockaddr_in ifra_mask;
};
/*
* Given a pointer to an in_ifaddr (ifaddr),
* return a pointer to the addr as a sockaddr_in.
*/
#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr))
#ifdef _KERNEL
/* Note: 61, 127, 251, 509, 1021, 2039 are good. */
#ifndef IN_IFADDR_HASH_SIZE
#define IN_IFADDR_HASH_SIZE 509
#endif
/*
* This is a bit unconventional, and wastes a little bit of space, but
* because we want a very even hash function we don't use & in_ifaddrhash
* here, but rather % the hash size, which should obviously be prime.
*/
#define IN_IFADDR_HASH(x) in_ifaddrhashtbl[(u_long)(x) % IN_IFADDR_HASH_SIZE]
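/*
 * An illustrative sketch of the bucket selection above; the helper name
 * is hypothetical.  Reducing the address modulo the prime table size
 * lets the high-order (network) bits influence the bucket, which a
 * power-of-two mask on the low-order bits would not.
 */
#if 0
static u_long
example_in_ifaddr_bucket(u_long addr)
{
	return addr % IN_IFADDR_HASH_SIZE;
}
#endif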
LIST_HEAD(in_ifaddrhashhead, in_ifaddr); /* Type of the hash head */
TAILQ_HEAD(in_ifaddrhead, in_ifaddr); /* Type of the list head */
extern u_long in_ifaddrhash; /* size of hash table - 1 */
extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* Hash table head */
extern struct in_ifaddrhead in_ifaddrhead; /* List head (in ip_input) */
extern pserialize_t in_ifaddrhash_psz;
extern struct pslist_head *in_ifaddrhashtbl_pslist;
extern u_long in_ifaddrhash_pslist;
extern struct pslist_head in_ifaddrhead_pslist;
#define IN_IFADDR_HASH_PSLIST(x) \
in_ifaddrhashtbl_pslist[(u_long)(x) % IN_IFADDR_HASH_SIZE]
#define IN_ADDRHASH_READER_FOREACH(__ia, __addr) \
PSLIST_READER_FOREACH((__ia), &IN_IFADDR_HASH_PSLIST(__addr), \
struct in_ifaddr, ia_hash_pslist_entry)
#define IN_ADDRHASH_WRITER_INSERT_HEAD(__ia) \
PSLIST_WRITER_INSERT_HEAD( \
&IN_IFADDR_HASH_PSLIST((__ia)->ia_addr.sin_addr.s_addr), \
(__ia), ia_hash_pslist_entry)
#define IN_ADDRHASH_WRITER_REMOVE(__ia) \
PSLIST_WRITER_REMOVE((__ia), ia_hash_pslist_entry)
#define IN_ADDRHASH_ENTRY_INIT(__ia) \
PSLIST_ENTRY_INIT((__ia), ia_hash_pslist_entry);
#define IN_ADDRHASH_ENTRY_DESTROY(__ia) \
PSLIST_ENTRY_DESTROY((__ia), ia_hash_pslist_entry);
#define IN_ADDRHASH_READER_NEXT(__ia) \
PSLIST_READER_NEXT((__ia), struct in_ifaddr, ia_hash_pslist_entry)
#define IN_ADDRLIST_ENTRY_INIT(__ia) \
PSLIST_ENTRY_INIT((__ia), ia_pslist_entry)
#define IN_ADDRLIST_ENTRY_DESTROY(__ia) \
PSLIST_ENTRY_DESTROY((__ia), ia_pslist_entry);
#define IN_ADDRLIST_READER_EMPTY() \
(PSLIST_READER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry) == NULL)
#define IN_ADDRLIST_READER_FIRST() \
PSLIST_READER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry)
#define IN_ADDRLIST_READER_NEXT(__ia) \
PSLIST_READER_NEXT((__ia), struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_READER_FOREACH(__ia) \
PSLIST_READER_FOREACH((__ia), &in_ifaddrhead_pslist, \
struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_INSERT_HEAD(__ia) \
PSLIST_WRITER_INSERT_HEAD(&in_ifaddrhead_pslist, (__ia), \
ia_pslist_entry)
#define IN_ADDRLIST_WRITER_REMOVE(__ia) \
PSLIST_WRITER_REMOVE((__ia), ia_pslist_entry)
#define IN_ADDRLIST_WRITER_FOREACH(__ia) \
PSLIST_WRITER_FOREACH((__ia), &in_ifaddrhead_pslist, \
struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_FIRST() \
PSLIST_WRITER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry)
#define IN_ADDRLIST_WRITER_NEXT(__ia) \
PSLIST_WRITER_NEXT((__ia), struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_INSERT_AFTER(__ia, __new) \
PSLIST_WRITER_INSERT_AFTER((__ia), (__new), ia_pslist_entry)
#define IN_ADDRLIST_WRITER_EMPTY() \
(PSLIST_WRITER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry) == NULL)
#define IN_ADDRLIST_WRITER_INSERT_TAIL(__new) \
do { \
if (IN_ADDRLIST_WRITER_EMPTY()) { \
IN_ADDRLIST_WRITER_INSERT_HEAD((__new)); \
} else { \
struct in_ifaddr *__ia; \
IN_ADDRLIST_WRITER_FOREACH(__ia) { \
if (IN_ADDRLIST_WRITER_NEXT(__ia) == NULL) { \
IN_ADDRLIST_WRITER_INSERT_AFTER(__ia,\
(__new)); \
break; \
} \
} \
} \
} while (0)
extern const int inetctlerrmap[];
/*
* Find whether an internet address (in_addr) belongs to one
* of our interfaces (in_ifaddr). NULL if the address isn't ours.
*/
static __inline struct in_ifaddr *
in_get_ia(struct in_addr addr)
{
struct in_ifaddr *ia;
IN_ADDRHASH_READER_FOREACH(ia, addr.s_addr) {
if (in_hosteq(ia->ia_addr.sin_addr, addr))
break;
}
return ia;
}
static __inline struct in_ifaddr *
in_get_ia_psref(struct in_addr addr, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia(addr);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
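/*
 * An illustrative usage sketch; the function, the error handling and
 * the EADDRNOTAVAIL return are hypothetical (struct psref comes from
 * <sys/psref.h>).  The pattern is: look the address up under
 * pserialize, take a passive reference, use the ifaddr, then release.
 */
#if 0
static int
example_use_local_addr(struct in_addr addr)
{
	struct in_ifaddr *ia;
	struct psref psref;

	ia = in_get_ia_psref(addr, &psref);
	if (ia == NULL)
		return EADDRNOTAVAIL;	/* the address is not ours */

	/* ... ia->ia_ifp and ia->ia_addr may be used safely here ... */

	ia4_release(ia, &psref);
	return 0;
}
#endif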
/*
* Find whether an internet address (in_addr) belongs to a specified
* interface. NULL if the address isn't ours.
*/
static __inline struct in_ifaddr *
in_get_ia_on_iface(struct in_addr addr, struct ifnet *ifp)
{
struct in_ifaddr *ia;
IN_ADDRHASH_READER_FOREACH(ia, addr.s_addr) {
if (in_hosteq(ia->ia_addr.sin_addr, addr) &&
ia->ia_ifp == ifp)
break;
}
return ia;
}
static __inline struct in_ifaddr *
in_get_ia_on_iface_psref(struct in_addr addr, struct ifnet *ifp, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia_on_iface(addr, ifp);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
/*
* Find an internet address structure (in_ifaddr) corresponding
* to a given interface (ifnet structure).
*/
static __inline struct in_ifaddr *
in_get_ia_from_ifp(struct ifnet *ifp)
{
struct ifaddr *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
	if (ifa->ifa_addr->sa_family == AF_INET)
break;
}
return ifatoia(ifa);
}
static __inline struct in_ifaddr *
in_get_ia_from_ifp_psref(struct ifnet *ifp, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia_from_ifp(ifp);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
#include <netinet/in_selsrc.h>
/*
* IPv4 per-interface state.
*/
struct in_ifinfo {
struct lltable *ii_llt; /* ARP state */
struct in_ifsysctl *ii_selsrc;
#ifdef MBUFTRACE
struct mowner ii_mowner;
#endif
};
#endif /* _KERNEL */
/*
* Internet multicast address structure. There is one of these for each IP
* multicast group to which this host belongs on a given network interface.
* They are kept in a linked list, rooted in the interface's in_ifaddr
* structure.
*/
struct router_info;
struct in_multi {
LIST_ENTRY(in_multi) inm_list; /* list of multicast addresses */
struct router_info *inm_rti; /* router version info */
struct ifnet *inm_ifp; /* back pointer to ifnet */
struct in_addr inm_addr; /* IP multicast address */
u_int inm_refcount; /* no. membership claims by sockets */
u_int inm_timer; /* IGMP membership report timer */
u_int inm_state; /* state of membership */
};
#ifdef _KERNEL
#include <net/pktqueue.h>
#include <sys/cprng.h>
extern pktqueue_t *ip_pktq;
extern int ip_dad_count; /* Duplicate Address Detection probes */
static inline bool
ip_dad_enabled(void)
{
#if NARP > 0
return ip_dad_count > 0;
#else
return false;
#endif
}
#if defined(INET) && NARP > 0
extern int arp_debug;
#define ARPLOGADDR(a) IN_PRINT(_ipbuf, a)
#define ARPLOG(level, fmt, args...) \
do { \
char _ipbuf[INET_ADDRSTRLEN]; \
(void)_ipbuf; \
if (arp_debug) \
log(level, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#else
#define ARPLOG(level, fmt, args...)
#endif
/*
* Structure used by functions below to remember position when stepping
* through all of the in_multi records.
*/
struct in_multistep {
int i_n;
struct in_multi *i_inm;
};
bool in_multi_group(struct in_addr, struct ifnet *, int);
struct in_multi *in_first_multi(struct in_multistep *);
struct in_multi *in_next_multi(struct in_multistep *);
struct in_multi *in_lookup_multi(struct in_addr, struct ifnet *);
struct in_multi *in_addmulti(struct in_addr *, struct ifnet *);
void in_delmulti(struct in_multi *);
void in_multi_lock(int);
void in_multi_unlock(void);
int in_multi_lock_held(void);
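/*
* Sketch of stepping through all in_multi records with the in_multistep
* cursor described above.  (The RW_READER lock argument and the exact
* locking discipline are assumptions for illustration.)
*
*	struct in_multistep step;
*	struct in_multi *inm;
*
*	in_multi_lock(RW_READER);
*	for (inm = in_first_multi(&step); inm != NULL;
*	     inm = in_next_multi(&step)) {
*		... examine inm ...
*	}
*	in_multi_unlock();
*/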
struct ifaddr;
int in_ifinit(struct ifnet *, struct in_ifaddr *,
const struct sockaddr_in *, const struct sockaddr_in *, int);
void in_savemkludge(struct in_ifaddr *);
void in_restoremkludge(struct in_ifaddr *, struct ifnet *);
void in_purgemkludge(struct ifnet *);
void in_setmaxmtu(void);
int in_control(struct socket *, u_long, void *, struct ifnet *);
void in_purgeaddr(struct ifaddr *);
void in_purgeif(struct ifnet *);
void in_addrhash_insert(struct in_ifaddr *);
void in_addrhash_remove(struct in_ifaddr *);
int ipflow_fastforward(struct mbuf *);
extern uint16_t ip_id;
extern int ip_do_randomid;
static __inline uint16_t
ip_randomid(void)
{
uint16_t id = (uint16_t)cprng_fast32();
return id ? id : 1;
}
/*
* ip_newid_range: "allocate" num contiguous IP IDs.
*
* => Return the first ID.
*/
static __inline uint16_t
ip_newid_range(const struct in_ifaddr *ia, u_int num)
{
uint16_t id;
if (ip_do_randomid) {
/* XXX ignore num */
return ip_randomid();
}
/* Never allow an IP ID of 0 (detect wrap). */
if ((uint16_t)(ip_id + num) < ip_id) {
ip_id = 1;
}
id = htons(ip_id);
ip_id += num;
return id;
}
static __inline uint16_t
ip_newid(const struct in_ifaddr *ia)
{
return ip_newid_range(ia, 1);
}
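/*
* Typical use when filling in an IPv4 header (a sketch; the in_ifaddr
* argument is unused by the implementation above, so it may be NULL):
*
*	struct ip *ip = mtod(m, struct ip *);
*	ip->ip_id = ip_newid(ia);
*
* The sequential path already returns the identifier in network byte
* order (htons above); the random path is byte-order agnostic.
*/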
#ifdef SYSCTLFN_PROTO
int sysctl_inpcblist(SYSCTLFN_PROTO);
#endif
#define LLTABLE(ifp) \
((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt
#endif /* !_KERNEL */
/* INET6 stuff */
#include <netinet6/in6_var.h>
#endif /* !_NETINET_IN_VAR_H_ */
/* $NetBSD: tty_conf.c,v 1.57 2021/08/09 20:49:10 andvar Exp $ */
/*-
* Copyright (c) 2005, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tty_conf.c 8.5 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_conf.c,v 1.57 2021/08/09 20:49:10 andvar Exp $");
#define TTY_ALLOW_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/mutex.h>
#include <sys/queue.h>
static struct linesw termios_disc = {
.l_name = "termios",
.l_open = ttylopen,
.l_close = ttylclose,
.l_read = ttread,
.l_write = ttwrite,
.l_ioctl = ttynullioctl,
.l_rint = ttyinput,
.l_start = ttstart,
.l_modem = ttymodem,
.l_poll = ttpoll
};
/*
* This is for the benefit of old BSD TTY compatibility, but since it is
* identical to termios (except for the name), don't bother conditionalizing
* it.
*/
static struct linesw ntty_disc = { /* old NTTYDISC */
.l_name = "ntty",
.l_open = ttylopen,
.l_close = ttylclose,
.l_read = ttread,
.l_write = ttwrite,
.l_ioctl = ttynullioctl,
.l_rint = ttyinput,
.l_start = ttstart,
.l_modem = ttymodem,
.l_poll = ttpoll
};
static LIST_HEAD(, linesw) ttyldisc_list = LIST_HEAD_INITIALIZER(ttyldisc_head);
/*
* Note: We don't bother refcounting termios_disc and ntty_disc; they can't
* be removed from the list, and termios_disc is likely to have very many
* references (could we overflow the count?).
*/
#define TTYLDISC_ISSTATIC(disc) \
((disc) == &termios_disc || (disc) == &ntty_disc)
#define TTYLDISC_HOLD(disc) \
do { \
if (! TTYLDISC_ISSTATIC(disc)) { \
KASSERT((disc)->l_refcnt != UINT_MAX); \
(disc)->l_refcnt++; \
} \
} while (/*CONSTCOND*/0)
#define TTYLDISC_RELE(disc) \
do { \
if (! TTYLDISC_ISSTATIC(disc)) { \
KASSERT((disc)->l_refcnt != 0); \
(disc)->l_refcnt--; \
} \
} while (/*CONSTCOND*/0)
#define TTYLDISC_ISINUSE(disc) \
(TTYLDISC_ISSTATIC(disc) || (disc)->l_refcnt != 0)
/*
* Do-nothing version of the line-discipline-specific
* ioctl routine.
*/
/*ARGSUSED*/
int
ttynullioctl(struct tty *tp, u_long cmd, void *data, int flags, struct lwp *l)
{
return (EPASSTHROUGH);
}
/*
* Line-discipline-specific poll routine that
* always returns an error.
*/
/*ARGSUSED*/
int
ttyerrpoll(struct tty *tp, int events, struct lwp *l)
{
return (POLLERR);
}
void
ttyldisc_init(void)
{
if (ttyldisc_attach(&termios_disc) != 0)
panic("ttyldisc_init: termios_disc");
if (ttyldisc_attach(&ntty_disc) != 0)
panic("ttyldisc_init: ntty_disc");
}
static struct linesw *
ttyldisc_lookup_locked(const char *name)
{
struct linesw *disc;
LIST_FOREACH(disc, &ttyldisc_list, l_list) {
if (strcmp(name, disc->l_name) == 0)
return (disc);
}
return (NULL);
}
/*
* Look up a line discipline by its name. Caller holds a reference on
* the returned line discipline.
*/
struct linesw *
ttyldisc_lookup(const char *name)
{
struct linesw *disc;
mutex_spin_enter(&tty_lock);
disc = ttyldisc_lookup_locked(name);
if (disc != NULL)
TTYLDISC_HOLD(disc);
mutex_spin_exit(&tty_lock);
return (disc);
}
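/*
* Usage sketch (names illustrative): look up a discipline, switch the
* tty over to it, then drop the reference when it is no longer needed:
*
*	struct linesw *lp = ttyldisc_lookup("ppp");
*	if (lp == NULL)
*		return ENXIO;
*	... attach lp to the tty ...
*	ttyldisc_release(lp);
*/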
/*
* Look up a line discipline by its legacy number. Caller holds a
* reference on the returned line discipline.
*/
struct linesw *
ttyldisc_lookup_bynum(int num)
{
struct linesw *disc;
mutex_spin_enter(&tty_lock);
LIST_FOREACH(disc, &ttyldisc_list, l_list) {
if (disc->l_no == num) {
TTYLDISC_HOLD(disc);
mutex_spin_exit(&tty_lock);
return (disc);
}
}
mutex_spin_exit(&tty_lock);
return (NULL);
}
/*
* Release a reference on a line discipline previously added by
* ttyldisc_lookup() or ttyldisc_lookup_bynum().
*/
void
ttyldisc_release(struct linesw *disc)
{
if (disc == NULL)
return;
mutex_spin_enter(&tty_lock);
TTYLDISC_RELE(disc);
mutex_spin_exit(&tty_lock);
}
#define TTYLDISC_LEGACY_NUMBER_MIN 10
#define TTYLDISC_LEGACY_NUMBER_MAX INT_MAX
static void
ttyldisc_assign_legacy_number(struct linesw *disc)
{
static const struct {
const char *name;
int num;
} table[] = {
{ "termios", TTYDISC },
{ "ntty", 2 /* XXX old NTTYDISC */ },
{ "tablet", TABLDISC },
{ "slip", SLIPDISC },
{ "ppp", PPPDISC },
{ "strip", STRIPDISC },
{ "hdlc", HDLCDISC },
{ NULL, 0 }
};
struct linesw *ldisc;
int i;
for (i = 0; table[i].name != NULL; i++) {
if (strcmp(disc->l_name, table[i].name) == 0) {
disc->l_no = table[i].num;
return;
}
}
disc->l_no = TTYLDISC_LEGACY_NUMBER_MIN;
LIST_FOREACH(ldisc, &ttyldisc_list, l_list) {
if (disc->l_no == ldisc->l_no) {
KASSERT(disc->l_no < TTYLDISC_LEGACY_NUMBER_MAX);
disc->l_no++;
}
}
}
/*
* Register a line discipline.
*/
int
ttyldisc_attach(struct linesw *disc)
{
KASSERT(disc->l_name != NULL);
KASSERT(disc->l_open != NULL);
KASSERT(disc->l_close != NULL);
KASSERT(disc->l_read != NULL);
KASSERT(disc->l_write != NULL);
KASSERT(disc->l_ioctl != NULL);
KASSERT(disc->l_rint != NULL);
KASSERT(disc->l_start != NULL);
KASSERT(disc->l_modem != NULL);
KASSERT(disc->l_poll != NULL);
/* You are not allowed to exceed TTLINEDNAMELEN */
if (strlen(disc->l_name) >= TTLINEDNAMELEN)
return (ENAMETOOLONG);
mutex_spin_enter(&tty_lock);
if (ttyldisc_lookup_locked(disc->l_name) != NULL) {
mutex_spin_exit(&tty_lock);
return (EEXIST);
}
ttyldisc_assign_legacy_number(disc);
LIST_INSERT_HEAD(&ttyldisc_list, disc, l_list);
mutex_spin_exit(&tty_lock);
return (0);
}
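/*
* Registration sketch for a loadable line discipline (the "foo" name
* and foo* handlers are hypothetical; the tt* defaults are the ones
* used by the static disciplines above):
*
*	static struct linesw foo_disc = {
*		.l_name = "foo",
*		.l_open = fooopen,	.l_close = fooclose,
*		.l_read = ttread,	.l_write = ttwrite,
*		.l_ioctl = ttynullioctl, .l_rint = fooinput,
*		.l_start = ttstart,	.l_modem = ttymodem,
*		.l_poll = ttpoll,
*	};
*
*	error = ttyldisc_attach(&foo_disc);
*	...
*	error = ttyldisc_detach(&foo_disc);	(EBUSY while still in use)
*/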
/*
* Remove a line discipline.
*/
int
ttyldisc_detach(struct linesw *disc)
{
#ifdef DIAGNOSTIC
struct linesw *ldisc = ttyldisc_lookup(disc->l_name);
KASSERT(ldisc != NULL);
KASSERT(ldisc == disc);
ttyldisc_release(ldisc);
#endif
mutex_spin_enter(&tty_lock);
if (TTYLDISC_ISINUSE(disc)) {
mutex_spin_exit(&tty_lock);
return (EBUSY);
}
LIST_REMOVE(disc, l_list);
mutex_spin_exit(&tty_lock);
return (0);
}
/*
* Return the default line discipline.
*/
struct linesw *
ttyldisc_default(void)
{
return (&termios_disc);
}
/* $NetBSD: popcount32.c,v 1.5 2015/05/29 19:39:41 matt Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Joerg Sonnenberger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: popcount32.c,v 1.5 2015/05/29 19:39:41 matt Exp $");
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <limits.h>
#include <stdint.h>
#include <strings.h>
#else
#include <lib/libkern/libkern.h>
#include <machine/limits.h>
#endif
#ifndef popcount32 // might be a builtin
/*
* This is a hybrid bit-counting algorithm, combining parallel counting
* with a multiplication. The idea is to sum up the bits in each byte,
* so that the final accumulation can be done with a single multiplication.
* If the platform has a slow multiplication instruction, that step can be
* replaced by the commented-out shift/add version below.
*/
unsigned int
popcount32(uint32_t v)
{
unsigned int c;
v = v - ((v >> 1) & 0x55555555U);
v = (v & 0x33333333U) + ((v >> 2) & 0x33333333U);
v = (v + (v >> 4)) & 0x0f0f0f0fU;
c = (v * 0x01010101U) >> 24;
/*
* v = (v >> 16) + v;
* v = (v >> 8) + v;
* c = v & 255;
*/
return c;
}
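/*
* Worked example: for v = 0xf0f00f0f each byte has four bits set, so
* after the three masking steps v holds the per-byte counts 0x04040404,
* and the multiply by 0x01010101 accumulates 4+4+4+4 = 16 into the top
* byte, which the final shift by 24 extracts.
*/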
#if UINT_MAX == 0xffffffffU
__strong_alias(popcount, popcount32)
#endif
#if ULONG_MAX == 0xffffffffU
__strong_alias(popcountl, popcount32)
#endif
#endif /* !popcount32 */
/* $NetBSD: pslist.h,v 1.7 2019/12/01 15:28:19 riastradh Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_PSLIST_H
#define _SYS_PSLIST_H
#include <sys/param.h>
#include <sys/atomic.h>
struct pslist_head;
struct pslist_entry;
struct pslist_head {
struct pslist_entry *plh_first;
};
struct pslist_entry {
struct pslist_entry **ple_prevp;
struct pslist_entry *ple_next;
};
#ifdef _KERNEL
#define _PSLIST_ASSERT KASSERT
#else
#include <assert.h>
#define _PSLIST_ASSERT assert
#endif
#define _PSLIST_POISON ((void *)1ul)
/*
* Initialization. Allowed only when the caller has exclusive access,
* excluding writers and readers.
*/
static __inline void
pslist_init(struct pslist_head *head)
{
head->plh_first = NULL; /* not yet published, so no atomic */
}
static __inline void
pslist_destroy(struct pslist_head *head __diagused)
{
_PSLIST_ASSERT(head->plh_first == NULL);
}
static __inline void
pslist_entry_init(struct pslist_entry *entry)
{
entry->ple_next = NULL;
entry->ple_prevp = NULL;
}
static __inline void
pslist_entry_destroy(struct pslist_entry *entry)
{
_PSLIST_ASSERT(entry->ple_prevp == NULL);
/*
* Poison the next entry. If we used NULL here, then readers
* would think they were simply at the end of the list.
* Instead, cause readers to crash.
*/
atomic_store_relaxed(&entry->ple_next, _PSLIST_POISON);
}
/*
* Writer operations. Caller must exclude other writers, but not
* necessarily readers.
*
* Writes to initialize a new entry must precede its publication by
* writing to plh_first / ple_next / *ple_prevp.
*
* The ple_prevp field is serialized by the caller's exclusive lock and
* not read by readers, and hence its ordering relative to the internal
* memory barriers is inconsequential.
*/
static __inline void
pslist_writer_insert_head(struct pslist_head *head, struct pslist_entry *new)
{
_PSLIST_ASSERT(head->plh_first == NULL ||
head->plh_first->ple_prevp == &head->plh_first);
_PSLIST_ASSERT(new->ple_next == NULL);
_PSLIST_ASSERT(new->ple_prevp == NULL);
new->ple_prevp = &head->plh_first;
new->ple_next = head->plh_first; /* not yet published, so no atomic */
if (head->plh_first != NULL)
head->plh_first->ple_prevp = &new->ple_next;
atomic_store_release(&head->plh_first, new);
}
static __inline void
pslist_writer_insert_before(struct pslist_entry *entry,
struct pslist_entry *new)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
_PSLIST_ASSERT(entry->ple_prevp != NULL);
_PSLIST_ASSERT(*entry->ple_prevp == entry);
_PSLIST_ASSERT(new->ple_next == NULL);
_PSLIST_ASSERT(new->ple_prevp == NULL);
new->ple_prevp = entry->ple_prevp;
new->ple_next = entry; /* not yet published, so no atomic */
/*
* Pairs with atomic_load_consume in pslist_reader_first or
* pslist_reader_next.
*/
atomic_store_release(entry->ple_prevp, new);
entry->ple_prevp = &new->ple_next;
}
static __inline void
pslist_writer_insert_after(struct pslist_entry *entry,
struct pslist_entry *new)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
_PSLIST_ASSERT(entry->ple_prevp != NULL);
_PSLIST_ASSERT(*entry->ple_prevp == entry);
_PSLIST_ASSERT(new->ple_next == NULL);
_PSLIST_ASSERT(new->ple_prevp == NULL);
new->ple_prevp = &entry->ple_next;
new->ple_next = entry->ple_next; /* not yet published, so no atomic */
if (new->ple_next != NULL)
new->ple_next->ple_prevp = &new->ple_next;
/* Pairs with atomic_load_consume in pslist_reader_next. */
atomic_store_release(&entry->ple_next, new);
}
static __inline void
pslist_writer_remove(struct pslist_entry *entry)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
_PSLIST_ASSERT(entry->ple_prevp != NULL);
_PSLIST_ASSERT(*entry->ple_prevp == entry);
if (entry->ple_next != NULL)
entry->ple_next->ple_prevp = entry->ple_prevp;
/*
* No need for atomic_store_release because there's no
* initialization that this must happen after -- the store
* transitions from a good state with the entry to a good state
* without the entry, both of which are valid for readers to
* witness.
*/
atomic_store_relaxed(entry->ple_prevp, entry->ple_next);
entry->ple_prevp = NULL;
/*
* Leave entry->ple_next intact so that any extant readers can
* continue iterating through the list. The caller must then
* wait for readers to drain, e.g. with pserialize_perform,
* before destroying and reusing the entry.
*/
}
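/*
* Sketch of the full removal protocol described above (the pserialize
* handle psz and the containing object obj are illustrative):
*
*	pslist_writer_remove(&obj->entry);
*	pserialize_perform(psz);	   <- wait for readers to drain
*	pslist_entry_destroy(&obj->entry);
*	kmem_free(obj, sizeof(*obj));
*/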
static __inline struct pslist_entry *
pslist_writer_first(const struct pslist_head *head)
{
return head->plh_first;
}
static __inline struct pslist_entry *
pslist_writer_next(const struct pslist_entry *entry)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
return entry->ple_next;
}
static __inline void *
_pslist_writer_first_container(const struct pslist_head *head,
const ptrdiff_t offset)
{
struct pslist_entry *first = head->plh_first;
return (first == NULL ? NULL : (char *)first - offset);
}
static __inline void *
_pslist_writer_next_container(const struct pslist_entry *entry,
const ptrdiff_t offset)
{
struct pslist_entry *next = entry->ple_next;
_PSLIST_ASSERT(next != _PSLIST_POISON);
return (next == NULL ? NULL : (char *)next - offset);
}
/*
* Reader operations. Caller must block pserialize_perform or
* equivalent and be bound to a CPU. Only plh_first/ple_next may be
* read, and only with consuming memory order so that data-dependent
* loads happen afterward.
*/
static __inline struct pslist_entry *
pslist_reader_first(const struct pslist_head *head)
{
/*
* Pairs with atomic_store_release in pslist_writer_insert_head
* or pslist_writer_insert_before.
*/
return atomic_load_consume(&head->plh_first);
}
static __inline struct pslist_entry *
pslist_reader_next(const struct pslist_entry *entry)
{
/*
* Pairs with atomic_store_release in
* pslist_writer_insert_before or pslist_writer_insert_after.
*/
struct pslist_entry *next = atomic_load_consume(&entry->ple_next);
_PSLIST_ASSERT(next != _PSLIST_POISON);
return next;
}
static __inline void *
_pslist_reader_first_container(const struct pslist_head *head,
const ptrdiff_t offset)
{
struct pslist_entry *first = pslist_reader_first(head);
if (first == NULL)
return NULL;
return (char *)first - offset;
}
static __inline void *
_pslist_reader_next_container(const struct pslist_entry *entry,
const ptrdiff_t offset)
{
struct pslist_entry *next = pslist_reader_next(entry);
if (next == NULL)
return NULL;
return (char *)next - offset;
}
/*
* Type-safe macros for convenience.
*/
#if defined(__COVERITY__) || defined(__LGTM_BOT__)
#define _PSLIST_VALIDATE_PTRS(P, Q) 0
#define _PSLIST_VALIDATE_CONTAINER(P, T, F) 0
#else
#define _PSLIST_VALIDATE_PTRS(P, Q) \
(0 * sizeof((P) - (Q)) * sizeof(*(P)) * sizeof(*(Q)))
#define _PSLIST_VALIDATE_CONTAINER(P, T, F) \
(0 * sizeof((P) - &((T *)(((char *)(P)) - offsetof(T, F)))->F))
#endif
#define PSLIST_INITIALIZER { .plh_first = NULL }
#define PSLIST_ENTRY_INITIALIZER { .ple_next = NULL, .ple_prevp = NULL }
#define PSLIST_INIT(H) pslist_init((H))
#define PSLIST_DESTROY(H) pslist_destroy((H))
#define PSLIST_ENTRY_INIT(E, F) pslist_entry_init(&(E)->F)
#define PSLIST_ENTRY_DESTROY(E, F) pslist_entry_destroy(&(E)->F)
#define PSLIST_WRITER_INSERT_HEAD(H, V, F) \
pslist_writer_insert_head((H), &(V)->F)
#define PSLIST_WRITER_INSERT_BEFORE(E, N, F) \
pslist_writer_insert_before(&(E)->F + _PSLIST_VALIDATE_PTRS(E, N), \
&(N)->F)
#define PSLIST_WRITER_INSERT_AFTER(E, N, F) \
pslist_writer_insert_after(&(E)->F + _PSLIST_VALIDATE_PTRS(E, N), \
&(N)->F)
#define PSLIST_WRITER_REMOVE(E, F) \
pslist_writer_remove(&(E)->F)
#define PSLIST_WRITER_FIRST(H, T, F) \
((T *)(_pslist_writer_first_container((H), offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_writer_first(H), T, F))
#define PSLIST_WRITER_NEXT(V, T, F) \
((T *)(_pslist_writer_next_container(&(V)->F, offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_writer_next(&(V)->F), T, F))
#define PSLIST_WRITER_FOREACH(V, H, T, F) \
for ((V) = PSLIST_WRITER_FIRST((H), T, F); \
(V) != NULL; \
(V) = PSLIST_WRITER_NEXT((V), T, F))
#define PSLIST_READER_FIRST(H, T, F) \
((T *)(_pslist_reader_first_container((H), offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_reader_first(H), T, F))
#define PSLIST_READER_NEXT(V, T, F) \
((T *)(_pslist_reader_next_container(&(V)->F, offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_reader_next(&(V)->F), T, F))
#define PSLIST_READER_FOREACH(V, H, T, F) \
for ((V) = PSLIST_READER_FIRST((H), T, F); \
(V) != NULL; \
(V) = PSLIST_READER_NEXT((V), T, F))
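/*
* Usage sketch for the type-safe macros (struct and field names are
* hypothetical; the writer lock and pserialize object are assumed to
* be supplied by the caller):
*
*	struct frotz {
*		struct pslist_entry f_entry;
*		int f_value;
*	};
*	struct pslist_head frotz_list = PSLIST_INITIALIZER;
*
* Writer, with other writers excluded:
*
*	PSLIST_ENTRY_INIT(f, f_entry);
*	PSLIST_WRITER_INSERT_HEAD(&frotz_list, f, f_entry);
*
* Reader, inside a pserialize read section:
*
*	s = pserialize_read_enter();
*	PSLIST_READER_FOREACH(f, &frotz_list, struct frotz, f_entry) {
*		... read f->f_value ...
*	}
*	pserialize_read_exit(s);
*/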
#endif /* _SYS_PSLIST_H */
/* $NetBSD: raw_ip6.c,v 1.184 2024/02/24 21:41:13 mlelstv Exp $ */
/* $KAME: raw_ip6.c,v 1.82 2001/07/23 18:57:56 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_ip6.c,v 1.184 2024/02/24 21:41:13 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/net_stats.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/ip6_mroute.h>
#include <netinet/icmp6.h>
#include <netinet6/icmp6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#include <netinet6/raw_ip6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif
#include "faith.h"
#if defined(NFAITH) && 0 < NFAITH
#include <net/if_faith.h>
#endif
extern struct inpcbtable rawcbtable;
struct inpcbtable raw6cbtable;
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
/*
* Raw interface to IP6 protocol.
*/
static percpu_t *rip6stat_percpu;
#define RIP6_STATINC(x) _NET_STATINC(rip6stat_percpu, x)
static void sysctl_net_inet6_raw6_setup(struct sysctllog **);
/*
* Initialize raw connection block queue.
*/
void
rip6_init(void)
{
sysctl_net_inet6_raw6_setup(NULL);
in6pcb_init(&raw6cbtable, 1, 1);
rip6stat_percpu = percpu_alloc(sizeof(uint64_t) * RIP6_NSTATS);
}
static void
rip6_sbappendaddr(struct inpcb *last, struct ip6_hdr *ip6,
const struct sockaddr *sa, int hlen, struct mbuf *n)
{
struct mbuf *opts = NULL;
if (last->inp_flags & IN6P_CONTROLOPTS ||
SOOPT_TIMESTAMP(last->inp_socket->so_options))
ip6_savecontrol(last, &opts, ip6, n);
m_adj(n, hlen);
if (sbappendaddr(&last->inp_socket->so_rcv, sa, n, opts) == 0) {
soroverflow(last->inp_socket);
m_freem(n);
if (opts)
m_freem(opts);
RIP6_STATINC(RIP6_STAT_FULLSOCK);
} else {
sorwakeup(last->inp_socket);
}
}
/*
* Set up generic address and protocol structures
* for the raw_input routine, then pass them along with
* the mbuf chain.
*/
int
rip6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb *inp;
struct inpcb *last = NULL;
struct sockaddr_in6 rip6src;
struct mbuf *n;
RIP6_STATINC(RIP6_STAT_IPACKETS);
#if defined(NFAITH) && 0 < NFAITH
if (faithprefix(&ip6->ip6_dst)) {
/* send icmp6 host unreach? */
m_freem(m);
return IPPROTO_DONE;
}
#endif
sockaddr_in6_init(&rip6src, &ip6->ip6_src, 0, 0, 0);
if (sa6_recoverscope(&rip6src) != 0) {
/* XXX: should be impossible. */
m_freem(m);
return IPPROTO_DONE;
}
TAILQ_FOREACH(inp, &raw6cbtable.inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if (in6p_ip6(inp).ip6_nxt &&
in6p_ip6(inp).ip6_nxt != proto)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &ip6->ip6_dst))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) &&
!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &ip6->ip6_src))
continue;
if (in6p_cksum(inp) != -1) {
RIP6_STATINC(RIP6_STAT_ISUM);
/*
* Although in6_cksum() does not need the position of
* the checksum field for verification, enforce that it
* is located within the packet. Userland has given
* a checksum offset; a packet too short for that offset is
* invalid. Avoid overflow with the user-supplied offset.
*/
if (m->m_pkthdr.len < *offp + 2 ||
m->m_pkthdr.len - *offp - 2 < in6p_cksum(inp) ||
in6_cksum(m, proto, *offp,
m->m_pkthdr.len - *offp)) {
RIP6_STATINC(RIP6_STAT_BADSUM);
continue;
}
}
if (last == NULL) {
;
}
#ifdef IPSEC
else if (ipsec_used && ipsec_in_reject(m, last)) {
/* do not inject data into pcb */
}
#endif
else if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) {
rip6_sbappendaddr(last, ip6, sin6tosa(&rip6src),
*offp, n);
}
last = inp;
}
#ifdef IPSEC
if (ipsec_used && last && ipsec_in_reject(m, last)) {
m_freem(m);
IP6_STATDEC(IP6_STAT_DELIVERED);
/* do not inject data into pcb */
} else
#endif
if (last != NULL) {
rip6_sbappendaddr(last, ip6, sin6tosa(&rip6src), *offp, m);
} else {
RIP6_STATINC(RIP6_STAT_NOSOCK);
if (m->m_flags & M_MCAST)
RIP6_STATINC(RIP6_STAT_NOSOCKMCAST);
if (proto == IPPROTO_NONE)
m_freem(m);
else {
int s;
struct ifnet *rcvif = m_get_rcvif(m, &s);
const int prvnxt = ip6_get_prevhdr(m, *offp);
in6_ifstat_inc(rcvif, ifs6_in_protounknown);
m_put_rcvif(rcvif, &s);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_NEXTHEADER,
prvnxt);
}
IP6_STATDEC(IP6_STAT_DELIVERED);
}
return IPPROTO_DONE;
}
void *
rip6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct ip6_hdr *ip6;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
void (*notify)(struct inpcb *, int) = in6pcb_rtchange;
int nxt;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
ip6 = ip6cp->ip6c_ip6;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
nxt = ip6cp->ip6c_nxt;
} else {
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
nxt = -1;
}
if (ip6 && cmd == PRC_MSGSIZE) {
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
int valid = 0;
struct inpcb *inp;
/*
* Check to see if we have a valid raw IPv6 socket
* corresponding to the address in the ICMPv6 message
* payload, and whether the protocol (ip6_nxt) matches the socket.
* XXX chase extension headers, or pass final nxt value
* from icmp6_notify_error()
*/
inp = NULL;
inp = in6pcb_lookup(&raw6cbtable, &sa6->sin6_addr, 0,
(const struct in6_addr *)&sa6_src->sin6_addr, 0, 0, 0);
#if 0
if (!inp) {
/*
* As the use of sendto(2) is fairly popular,
* we may want to allow non-connected pcb too.
* But it could be too weak against attacks...
* We should at least check if the local
* address (= s) is really ours.
*/
inp = in6pcb_lookup_bound(&raw6cbtable,
&sa6->sin6_addr, 0, 0);
}
#endif
if (inp && in6p_ip6(inp).ip6_nxt &&
in6p_ip6(inp).ip6_nxt == nxt)
valid++;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* Regardless of whether we called icmp6_mtudisc_update(),
* we need to call in6pcb_notify() to report the path MTU
* change to userland (RFC 3542), because some
* unconnected sockets may share the same destination
* and want to know the path MTU.
*/
}
(void) in6pcb_notify(&raw6cbtable, sa, 0,
sin6tocsa(sa6_src), 0, cmd, cmdarg, notify);
return NULL;
}
/*
* Generate the IPv6 header and pass the packet to ip6_output.
* Tack on any options the user may have set up with a control call.
*/
int
rip6_output(struct mbuf *m, struct socket * const so,
struct sockaddr_in6 * const dstsock, struct mbuf * const control)
{
struct in6_addr *dst;
struct ip6_hdr *ip6;
struct inpcb *inp;
u_int plen = m->m_pkthdr.len;
int error = 0;
struct ip6_pktopts opt, *optp = NULL;
struct ifnet *oifp = NULL;
int type, code; /* for ICMPv6 output statistics only */
int scope_ambiguous = 0;
int bound = curlwp_bind();
struct psref psref;
inp = sotoinpcb(so);
dst = &dstsock->sin6_addr;
if (control) {
if ((error = ip6_setpktopts(control, &opt,
in6p_outputopts(inp),
kauth_cred_get(), so->so_proto->pr_protocol)) != 0) {
goto bad;
}
optp = &opt;
} else
optp = in6p_outputopts(inp);
/*
* Check and convert scope zone ID into internal form.
* XXX: we may still need to determine the zone later.
*/
if (!(so->so_state & SS_ISCONNECTED)) {
if (dstsock->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(dstsock, ip6_use_defzone)) != 0)
goto bad;
}
/*
* For an ICMPv6 packet, we should know its type and code
* to update statistics.
*/
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
if (m->m_len < sizeof(struct icmp6_hdr) &&
(m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
error = ENOBUFS;
goto bad;
}
icmp6 = mtod(m, struct icmp6_hdr *);
type = icmp6->icmp6_type;
code = icmp6->icmp6_code;
} else {
type = 0;
code = 0;
}
M_PREPEND(m, sizeof(*ip6), M_DONTWAIT);
if (!m) {
error = ENOBUFS;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* The next header might not be ICMPv6, but use its pseudo-header anyway.
*/
ip6->ip6_dst = *dst;
/*
* Source address selection.
*/
error = in6_selectsrc(dstsock, optp, in6p_moptions(inp),
&inp->inp_route, &in6p_laddr(inp), &oifp, &psref, &ip6->ip6_src);
if (error != 0)
goto bad;
if (oifp && scope_ambiguous) {
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined
* (when it is required), determine the zone ID based on the
* outgoing interface if we can determine that interface.
*/
error = in6_setscope(&dstsock->sin6_addr, oifp, NULL);
if (error != 0)
goto bad;
}
ip6->ip6_dst = dstsock->sin6_addr;
/* fill in the rest of the IPv6 header fields */
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be filled in by ip6_output, so don't fill it in here. */
ip6->ip6_nxt = in6p_ip6(inp).ip6_nxt;
ip6->ip6_hlim = in6pcb_selecthlim(inp, oifp);
if_put(oifp, &psref);
oifp = NULL;
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
in6p_cksum(inp) != -1) {
const uint8_t nxt = ip6->ip6_nxt;
int off;
u_int16_t sum;
/* compute checksum */
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
off = offsetof(struct icmp6_hdr, icmp6_cksum);
else
off = in6p_cksum(inp);
if (plen < 2 || plen - 2 < off) {
error = EINVAL;
goto bad;
}
off += sizeof(struct ip6_hdr);
sum = 0;
m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum,
M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
sum = in6_cksum(m, nxt, sizeof(*ip6), plen);
m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum,
M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
}
{
struct ifnet *ret_oifp = NULL;
error = ip6_output(m, optp, &inp->inp_route, 0,
in6p_moptions(inp), inp, &ret_oifp);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
if (ret_oifp)
icmp6_ifoutstat_inc(ret_oifp, type, code);
ICMP6_STATINC(ICMP6_STAT_OUTHIST + type);
} else
RIP6_STATINC(RIP6_STAT_OPACKETS);
}
goto freectl;
bad:
if (m)
m_freem(m);
freectl:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
if_put(oifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Raw IPv6 socket option processing.
*/
int
rip6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int error = 0;
if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) {
int optval;
/* need to fiddle w/ opt(IPPROTO_IPV6, IPV6_CHECKSUM)? */
if (op == PRCO_GETOPT) {
optval = 1;
error = sockopt_set(sopt, &optval, sizeof(optval));
} else if (op == PRCO_SETOPT) {
error = sockopt_getint(sopt, &optval);
if (error)
goto out;
if (optval == 0)
error = EINVAL;
}
goto out;
} else if (sopt->sopt_level != IPPROTO_IPV6)
return ip6_ctloutput(op, so, sopt);
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
if (op == PRCO_SETOPT)
error = ip6_mrouter_set(so, sopt);
else if (op == PRCO_GETOPT)
error = ip6_mrouter_get(so, sopt);
else
error = EINVAL;
break;
case IPV6_CHECKSUM:
return ip6_raw_ctloutput(op, so, sopt);
default:
return ip6_ctloutput(op, so, sopt);
}
out:
return error;
}
extern u_long rip6_sendspace;
extern u_long rip6_recvspace;
int
rip6_attach(struct socket *so, int proto)
{
struct inpcb *inp;
int s, error;
KASSERT(sotoinpcb(so) == NULL);
sosetlock(so);
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_RAWSOCK,
KAUTH_ARG(AF_INET6),
KAUTH_ARG(SOCK_RAW),
KAUTH_ARG(so->so_proto->pr_protocol));
if (error) {
return error;
}
s = splsoftnet();
error = soreserve(so, rip6_sendspace, rip6_recvspace);
if (error) {
splx(s);
return error;
}
if ((error = inpcb_create(so, &raw6cbtable)) != 0) {
splx(s);
return error;
}
splx(s);
inp = sotoinpcb(so);
in6p_ip6(inp).ip6_nxt = proto;
in6p_cksum(inp) = -1;
in6p_icmp6filt(inp) = kmem_alloc(sizeof(struct icmp6_filter), KM_SLEEP);
ICMP6_FILTER_SETPASSALL(in6p_icmp6filt(inp));
KASSERT(solocked(so));
return error;
}
static void
rip6_detach(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
KASSERT(solocked(so));
KASSERT(inp != NULL);
if (so == ip6_mrouter) {
ip6_mrouter_done();
}
/* xxx: RSVP */
if (in6p_icmp6filt(inp) != NULL) {
kmem_free(in6p_icmp6filt(inp), sizeof(struct icmp6_filter));
in6p_icmp6filt(inp) = NULL;
}
inpcb_destroy(inp);
}
static int
rip6_accept(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct ifaddr *ifa = NULL;
int error = 0;
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
KASSERT(nam != NULL);
if (addr->sin6_len != sizeof(*addr))
return EINVAL;
if (IFNET_READER_EMPTY() || addr->sin6_family != AF_INET6)
return EADDRNOTAVAIL;
if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0)
return error;
/*
* We don't support mapped addresses here; they would confuse
* users, so reject them.
*/
if (IN6_IS_ADDR_V4MAPPED(&addr->sin6_addr))
return EADDRNOTAVAIL;
s = pserialize_read_enter();
if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
(ifa = ifa_ifwithaddr(sin6tosa(addr))) == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
if (ifa && (ifatoia6(ifa))->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) {
error = EADDRNOTAVAIL;
goto out;
}
in6p_laddr(inp) = addr->sin6_addr;
error = 0;
out:
pserialize_read_exit(s);
return error;
}
static int
rip6_listen(struct socket *so, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct in6_addr in6a;
struct ifnet *ifp = NULL;
int scope_ambiguous = 0;
int error = 0;
struct psref psref;
int bound;
KASSERT(solocked(so));
KASSERT(inp != NULL);
KASSERT(nam != NULL);
if (IFNET_READER_EMPTY())
return EADDRNOTAVAIL;
if (addr->sin6_family != AF_INET6)
return EAFNOSUPPORT;
if (addr->sin6_len != sizeof(*addr))
return EINVAL;
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined,
* we'll see if we can determine the outgoing interface. If we
* can, determine the zone ID based on the interface below.
*/
if (addr->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0)
return error;
bound = curlwp_bind();
/* Source address selection. XXX: need pcblookup? */
error = in6_selectsrc(addr, in6p_outputopts(inp),
in6p_moptions(inp), &inp->inp_route,
&in6p_laddr(inp), &ifp, &psref, &in6a);
if (error != 0)
goto out;
/* XXX: see above */
if (ifp && scope_ambiguous &&
(error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) {
goto out;
}
in6p_laddr(inp) = in6a;
in6p_faddr(inp) = addr->sin6_addr;
soisconnected(so);
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
static int
rip6_connect2(struct socket *so, struct socket *so2)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_disconnect(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
KASSERT(solocked(so));
KASSERT(inp != NULL);
if ((so->so_state & SS_ISCONNECTED) == 0)
return ENOTCONN;
in6p_faddr(inp) = in6addr_any;
so->so_state &= ~SS_ISCONNECTED; /* XXX */
return 0;
}
static int
rip6_shutdown(struct socket *so)
{
KASSERT(solocked(so));
/*
* Mark the connection as being incapable of further output.
*/
socantsendmore(so);
return 0;
}
static int
rip6_abort(struct socket *so)
{
KASSERT(solocked(so));
soisdisconnected(so);
rip6_detach(so);
return 0;
}
static int
rip6_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp)
{
return in6_control(so, cmd, nam, ifp);
}
static int
rip6_stat(struct socket *so, struct stat *ub)
{
KASSERT(solocked(so));
/* stat: don't bother with a blocksize */
return 0;
}
static int
rip6_peeraddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
KASSERT(nam != NULL);
in6pcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
rip6_sockaddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
KASSERT(nam != NULL);
in6pcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
rip6_rcvd(struct socket *so, int flags, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 tmp;
struct sockaddr_in6 *dst;
int error = 0;
KASSERT(solocked(so));
KASSERT(inp != NULL);
KASSERT(m != NULL);
/*
* Ship a packet out. The appropriate raw output
* routine handles any messaging necessary.
*/
/* always copy sockaddr to avoid overwrites */
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
error = EISCONN;
goto release;
}
/* XXX */
sockaddr_in6_init(&tmp, &in6p_faddr(inp), 0, 0, 0);
dst = &tmp;
} else {
if (nam == NULL) {
error = ENOTCONN;
goto release;
}
tmp = *(struct sockaddr_in6 *)nam;
dst = &tmp;
if (dst->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
goto release;
}
if (dst->sin6_len != sizeof(*dst)) {
error = EINVAL;
goto release;
}
}
error = rip6_output(m, so, dst, control);
m = NULL;
release:
if (m)
m_freem(m);
return error;
}
static int
rip6_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
static int
rip6_purgeif(struct socket *so, struct ifnet *ifp)
{
mutex_enter(softnet_lock);
in6pcb_purgeif0(&raw6cbtable, ifp);
#ifdef NET_MPSAFE
mutex_exit(softnet_lock);
#endif
in6_purgeif(ifp);
#ifdef NET_MPSAFE
mutex_enter(softnet_lock);
#endif
in6pcb_purgeif(&raw6cbtable, ifp);
mutex_exit(softnet_lock);
return 0;
}
static int
sysctl_net_inet6_raw6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(rip6stat_percpu, RIP6_NSTATS));
}
static void
sysctl_net_inet6_raw6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "raw6",
SYSCTL_DESCR("Raw IPv6 settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_RAW, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pcblist",
SYSCTL_DESCR("Raw IPv6 control block list"),
sysctl_inpcblist, 0, &raw6cbtable, 0,
CTL_NET, PF_INET6, IPPROTO_RAW,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("Raw IPv6 statistics"),
sysctl_net_inet6_raw6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_RAW, RAW6CTL_STATS,
CTL_EOL);
}
PR_WRAP_USRREQS(rip6)
#define rip6_attach rip6_attach_wrapper
#define rip6_detach rip6_detach_wrapper
#define rip6_accept rip6_accept_wrapper
#define rip6_bind rip6_bind_wrapper
#define rip6_listen rip6_listen_wrapper
#define rip6_connect rip6_connect_wrapper
#define rip6_connect2 rip6_connect2_wrapper
#define rip6_disconnect rip6_disconnect_wrapper
#define rip6_shutdown rip6_shutdown_wrapper
#define rip6_abort rip6_abort_wrapper
#define rip6_ioctl rip6_ioctl_wrapper
#define rip6_stat rip6_stat_wrapper
#define rip6_peeraddr rip6_peeraddr_wrapper
#define rip6_sockaddr rip6_sockaddr_wrapper
#define rip6_rcvd rip6_rcvd_wrapper
#define rip6_recvoob rip6_recvoob_wrapper
#define rip6_send rip6_send_wrapper
#define rip6_sendoob rip6_sendoob_wrapper
#define rip6_purgeif rip6_purgeif_wrapper
const struct pr_usrreqs rip6_usrreqs = {
.pr_attach = rip6_attach,
.pr_detach = rip6_detach,
.pr_accept = rip6_accept,
.pr_bind = rip6_bind,
.pr_listen = rip6_listen,
.pr_connect = rip6_connect,
.pr_connect2 = rip6_connect2,
.pr_disconnect = rip6_disconnect,
.pr_shutdown = rip6_shutdown,
.pr_abort = rip6_abort,
.pr_ioctl = rip6_ioctl,
.pr_stat = rip6_stat,
.pr_peeraddr = rip6_peeraddr,
.pr_sockaddr = rip6_sockaddr,
.pr_rcvd = rip6_rcvd,
.pr_recvoob = rip6_recvoob,
.pr_send = rip6_send,
.pr_sendoob = rip6_sendoob,
.pr_purgeif = rip6_purgeif,
};
/* $NetBSD: fdesc_vfsops.c,v 1.96 2020/04/13 19:23:18 ad Exp $ */
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)fdesc_vfsops.c 8.10 (Berkeley) 5/14/95
*
* #Id: fdesc_vfsops.c,v 1.9 1993/04/06 15:28:33 jsp Exp #
*/
/*
* /dev/fd Filesystem
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: fdesc_vfsops.c,v 1.96 2020/04/13 19:23:18 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/filedesc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/dirent.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/fdesc/fdesc.h>
MODULE(MODULE_CLASS_VFS, fdesc, NULL);
VFS_PROTOS(fdesc);
/*
* Mount the per-process file descriptors (/dev/fd)
*/
int
fdesc_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0, ix;
struct vnode *rvp;
if (mp->mnt_flag & MNT_GETARGS) {
*data_len = 0;
return 0;
}
/*
* Update is a no-op
*/
if (mp->mnt_flag & MNT_UPDATE)
return (EOPNOTSUPP);
ix = FD_ROOT;
error = vcache_get(mp, &ix, sizeof(ix), &rvp);
if (error)
return error;
mp->mnt_stat.f_namemax = FDESC_MAXNAMLEN;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_data = rvp;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, "fdesc", UIO_SYSSPACE,
mp->mnt_op->vfs_name, mp, l);
return error;
}
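/*
* Illustrative ways to mount the filesystem (a sketch; the shell
* command is the conventional one and the C call assumes the standard
* five-argument mount(2) signature):
*
*	mount -t fdesc fdesc /dev/fd
*
* or, from C:
*
*	mount(MOUNT_FDESC, "/dev/fd", 0, NULL, 0);
*/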
int
fdesc_start(struct mount *mp, int flags)
{
return (0);
}
int
fdesc_unmount(struct mount *mp, int mntflags)
{
int error;
int flags = 0;
struct vnode *rtvp = mp->mnt_data;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(rtvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, rtvp, flags)) != 0)
return (error);
/*
* Blow it away for future re-use
*/
vgone(rtvp);
mp->mnt_data = NULL;
return (0);
}
int
fdesc_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct vnode *vp;
/*
* Return locked reference to root.
*/
vp = mp->mnt_data;
vref(vp);
vn_lock(vp, lktype | LK_RETRY);
*vpp = vp;
return (0);
}
/*ARGSUSED*/
int
fdesc_sync(struct mount *mp, int waitfor,
kauth_cred_t uc)
{
return (0);
}
/*
* Fdesc flat namespace lookup.
* Currently unsupported.
*/
int
fdesc_vget(struct mount *mp, ino_t ino, int lktype,
struct vnode **vpp)
{
return (EOPNOTSUPP);
}
int
fdesc_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
int ix;
struct fdescnode *fd;
KASSERT(key_len == sizeof(ix));
memcpy(&ix, key, key_len);
fd = kmem_alloc(sizeof(struct fdescnode), KM_SLEEP);
fd->fd_fd = -1;
fd->fd_link = NULL;
fd->fd_ix = ix;
fd->fd_vnode = vp;
vp->v_tag = VT_FDESC;
vp->v_op = fdesc_vnodeop_p;
vp->v_data = fd;
switch (ix) {
case FD_ROOT:
fd->fd_type = Froot;
vp->v_type = VDIR;
vp->v_vflag |= VV_ROOT;
break;
case FD_DEVFD:
fd->fd_type = Fdevfd;
vp->v_type = VDIR;
break;
case FD_CTTY:
fd->fd_type = Fctty;
vp->v_type = VCHR;
break;
case FD_STDIN:
fd->fd_type = Flink;
fd->fd_link = "fd/0";
vp->v_type = VLNK;
break;
case FD_STDOUT:
fd->fd_type = Flink;
fd->fd_link = "fd/1";
vp->v_type = VLNK;
break;
case FD_STDERR:
fd->fd_type = Flink;
fd->fd_link = "fd/2";
vp->v_type = VLNK;
break;
default:
KASSERT(ix >= FD_DESC);
fd->fd_type = Fdesc;
fd->fd_fd = ix - FD_DESC;
vp->v_type = VNON;
break;
}
uvm_vnp_setsize(vp, 0);
*new_key = &fd->fd_ix;
return 0;
}
extern const struct vnodeopv_desc fdesc_vnodeop_opv_desc;
const struct vnodeopv_desc * const fdesc_vnodeopv_descs[] = {
&fdesc_vnodeop_opv_desc,
NULL,
};
struct vfsops fdesc_vfsops = {
.vfs_name = MOUNT_FDESC,
.vfs_min_mount_data = 0,
.vfs_mount = fdesc_mount,
.vfs_start = fdesc_start,
.vfs_unmount = fdesc_unmount,
.vfs_root = fdesc_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = genfs_statvfs,
.vfs_sync = fdesc_sync,
.vfs_vget = fdesc_vget,
.vfs_loadvnode = fdesc_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = fdesc_init,
.vfs_done = fdesc_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = fdesc_vnodeopv_descs
};
SYSCTL_SETUP(fdesc_sysctl_setup, "fdesc sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "fdesc",
SYSCTL_DESCR("File-descriptor file system"),
NULL, 0, NULL, 0,
CTL_VFS, 7, CTL_EOL);
/*
* XXX the "7" above could be dynamic, thereby eliminating one
* more instance of the "number to vfs" mapping problem, but
* "7" is the order as taken from sys/mount.h
*/
}
static int
fdesc_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&fdesc_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&fdesc_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: sco_upper.c,v 1.16 2014/08/05 07:55:32 rtr Exp $ */
/*-
* Copyright (c) 2006 Itronix Inc.
* All rights reserved.
*
* Written by Iain Hibbert for Itronix Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Itronix Inc. may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sco_upper.c,v 1.16 2014/08/05 07:55:32 rtr Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <netbt/bluetooth.h>
#include <netbt/hci.h>
#include <netbt/sco.h>
/****************************************************************************
*
* SCO - Upper Protocol API
*/
struct sco_pcb_list sco_pcb = LIST_HEAD_INITIALIZER(sco_pcb);
/*
* sco_attach_pcb(handle, proto, upper)
*
* Attach a new instance of SCO pcb to handle
*/
int
sco_attach_pcb(struct sco_pcb **handle,
const struct btproto *proto, void *upper)
{
struct sco_pcb *pcb;
KASSERT(handle != NULL);
KASSERT(proto != NULL);
KASSERT(upper != NULL);
pcb = malloc(sizeof(struct sco_pcb), M_BLUETOOTH,
M_NOWAIT | M_ZERO);
if (pcb == NULL)
return ENOMEM;
pcb->sp_proto = proto;
pcb->sp_upper = upper;
LIST_INSERT_HEAD(&sco_pcb, pcb, sp_next);
*handle = pcb;
return 0;
}
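/*
* Sketch of how an upper layer typically drives this API (the
* sco_proto_ops table and the socket cookie "so" are hypothetical):
*
*	struct sco_pcb *pcb;
*
*	error = sco_attach_pcb(&pcb, &sco_proto_ops, so);
*	if (error == 0)
*		error = sco_bind_pcb(pcb, &local_sa);
*	if (error == 0)
*		error = sco_connect_pcb(pcb, &remote_sa);
*	...
*	sco_detach_pcb(&pcb);
*/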
/*
* sco_bind_pcb(pcb, sockaddr)
*
* Bind SCO pcb to local address
*/
int
sco_bind_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
if (pcb->sp_link != NULL || pcb->sp_flags & SP_LISTENING)
return EINVAL;
bdaddr_copy(&pcb->sp_laddr, &addr->bt_bdaddr);
return 0;
}
/*
* sco_sockaddr_pcb(pcb, sockaddr)
*
* Copy local address of PCB to sockaddr
*/
int
sco_sockaddr_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
memset(addr, 0, sizeof(struct sockaddr_bt));
addr->bt_len = sizeof(struct sockaddr_bt);
addr->bt_family = AF_BLUETOOTH;
bdaddr_copy(&addr->bt_bdaddr, &pcb->sp_laddr);
return 0;
}
/*
* sco_connect_pcb(pcb, sockaddr)
*
* Initiate a SCO connection to the destination address.
*/
int
sco_connect_pcb(struct sco_pcb *pcb, struct sockaddr_bt *dest)
{
hci_add_sco_con_cp cp;
struct hci_unit *unit;
struct hci_link *acl, *sco;
int err;
if (pcb->sp_flags & SP_LISTENING)
return EINVAL;
bdaddr_copy(&pcb->sp_raddr, &dest->bt_bdaddr);
if (bdaddr_any(&pcb->sp_raddr))
return EDESTADDRREQ;
if (bdaddr_any(&pcb->sp_laddr)) {
err = hci_route_lookup(&pcb->sp_laddr, &pcb->sp_raddr);
if (err)
return err;
}
unit = hci_unit_lookup(&pcb->sp_laddr);
if (unit == NULL)
return ENETDOWN;
/*
* We must have an already open ACL connection before we open the SCO
* connection, and since SCO connections don't happen on their own we
* will not open one; the application wanting this should have opened
* it previously.
*/
acl = hci_link_lookup_bdaddr(unit, &pcb->sp_raddr, HCI_LINK_ACL);
if (acl == NULL || acl->hl_state != HCI_LINK_OPEN)
return EHOSTUNREACH;
sco = hci_link_alloc(unit, &pcb->sp_raddr, HCI_LINK_SCO);
if (sco == NULL)
return ENOMEM;
sco->hl_link = hci_acl_open(unit, &pcb->sp_raddr);
KASSERT(sco->hl_link == acl);
cp.con_handle = htole16(acl->hl_handle);
cp.pkt_type = htole16(0x00e0); /* HV1, HV2, HV3 */
err = hci_send_cmd(unit, HCI_CMD_ADD_SCO_CON, &cp, sizeof(cp));
if (err) {
hci_link_free(sco, err);
return err;
}
sco->hl_sco = pcb;
pcb->sp_link = sco;
pcb->sp_mtu = unit->hci_max_sco_size;
return 0;
}
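/*
 * A note on the packet type mask above (for reference; bit values as
 * believed to be defined by the Bluetooth HCI specification and
 * netbt/hci.h): 0x00e0 is the OR of the HV1 (0x0020), HV2 (0x0040) and
 * HV3 (0x0080) SCO packet type bits, so the controller is free to use
 * any of the three packet types for the new connection.
 */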
/*
* sco_peeraddr_pcb(pcb, sockaddr)
*
* Copy remote address of SCO pcb to sockaddr
*/
int
sco_peeraddr_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
memset(addr, 0, sizeof(struct sockaddr_bt));
addr->bt_len = sizeof(struct sockaddr_bt);
addr->bt_family = AF_BLUETOOTH;
bdaddr_copy(&addr->bt_bdaddr, &pcb->sp_raddr);
return 0;
}
/*
* sco_disconnect_pcb(pcb, linger)
*
* Initiate disconnection of connected SCO pcb
*/
int
sco_disconnect_pcb(struct sco_pcb *pcb, int linger)
{
hci_discon_cp cp;
struct hci_link *sco;
int err;
sco = pcb->sp_link;
if (sco == NULL)
return EINVAL;
cp.con_handle = htole16(sco->hl_handle);
cp.reason = 0x13; /* "Remote User Terminated Connection" */
err = hci_send_cmd(sco->hl_unit, HCI_CMD_DISCONNECT, &cp, sizeof(cp));
if (err || linger == 0) {
sco->hl_sco = NULL;
pcb->sp_link = NULL;
hci_link_free(sco, err);
}
return err;
}
/*
* sco_detach_pcb(handle)
*
* Detach SCO pcb from handle and clear up
*/
void
sco_detach_pcb(struct sco_pcb **handle)
{
struct sco_pcb *pcb;
KASSERT(handle != NULL);
pcb = *handle;
*handle = NULL;
if (pcb->sp_link != NULL) {
sco_disconnect_pcb(pcb, 0);
pcb->sp_link = NULL;
}
LIST_REMOVE(pcb, sp_next);
free(pcb, M_BLUETOOTH);
}
/*
* sco_listen_pcb(pcb)
*
* Mark pcb as a listener.
*/
int
sco_listen_pcb(struct sco_pcb *pcb)
{
if (pcb->sp_link != NULL)
return EINVAL;
pcb->sp_flags |= SP_LISTENING;
return 0;
}
/*
* sco_send_pcb(pcb, mbuf)
*
* Send data on SCO pcb.
*
* Gross hackage: we just output the packet directly onto the unit queue.
* This will work fine for one channel per unit, but for more channels it
* really needs fixing. We set the context so that when the packet is sent,
* we can drop a record from the socket buffer.
*/
int
sco_send_pcb(struct sco_pcb *pcb, struct mbuf *m)
{
hci_scodata_hdr_t *hdr;
int plen;
if (pcb->sp_link == NULL) {
m_freem(m);
return EINVAL;
}
plen = m->m_pkthdr.len;
DPRINTFN(10, "%d bytes\n", plen);
/*
* This is a temporary limitation, as USB devices cannot
* handle SCO packet sizes that are not an integer number
* of Isochronous frames. See ubt(4)
*/
if (plen != pcb->sp_mtu) {
m_freem(m);
return EMSGSIZE;
}
M_PREPEND(m, sizeof(hci_scodata_hdr_t), M_DONTWAIT);
if (m == NULL)
return ENOMEM;
hdr = mtod(m, hci_scodata_hdr_t *);
hdr->type = HCI_SCO_DATA_PKT;
hdr->con_handle = htole16(pcb->sp_link->hl_handle);
hdr->length = plen;
pcb->sp_pending++;
M_SETCTX(m, pcb->sp_link);
hci_output_sco(pcb->sp_link->hl_unit, m);
return 0;
}
/*
* sco_setopt(pcb, sopt)
*
* Set SCO pcb options
*/
int
sco_setopt(struct sco_pcb *pcb, const struct sockopt *sopt)
{
int err = 0;
switch (sopt->sopt_name) {
default:
err = ENOPROTOOPT;
break;
}
return err;
}
/*
* sco_getopt(pcb, sopt)
*
* Get SCO pcb options
*/
int
sco_getopt(struct sco_pcb *pcb, struct sockopt *sopt)
{
switch (sopt->sopt_name) {
case SO_SCO_MTU:
return sockopt_set(sopt, &pcb->sp_mtu, sizeof(uint16_t));
case SO_SCO_HANDLE:
if (pcb->sp_link)
return sockopt_set(sopt,
&pcb->sp_link->hl_handle, sizeof(uint16_t));
return ENOTCONN;
default:
break;
}
return ENOPROTOOPT;
}
/* $NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_tcp_debug.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#ifdef TCP_SIGNATURE
#include <sys/md5.h>
#endif
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/nd6.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_congctl.h>
#include <netinet/tcp_debug.h>
#include <netinet/in_offload.h>
#include <netinet6/in6_offload.h>
/*
* Knob to enable Congestion Window Monitoring, and control
* the burst size it allows. Default burst is 4 packets, per
* the Internet draft.
*/
int tcp_cwm = 0;
int tcp_cwm_burstsize = 4;
int tcp_do_autosndbuf = 1;
int tcp_autosndbuf_inc = 8 * 1024;
int tcp_autosndbuf_max = 256 * 1024;
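/*
 * For reference (a sketch, not authoritative): these knobs are normally
 * tuned at run time through sysctl(8) rather than edited here, e.g.
 * something along the lines of
 *
 *	sysctl -w net.inet.tcp.cwm=1
 *	sysctl -w net.inet.tcp.cwm_burstsize=4
 *
 * The exact MIB names are created elsewhere (the TCP sysctl setup code)
 * and may differ; check "sysctl -a | grep tcp" on the running system.
 */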
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>
extern struct evcnt tcp_output_bigheader;
extern struct evcnt tcp_output_predict_hit;
extern struct evcnt tcp_output_predict_miss;
extern struct evcnt tcp_output_copysmall;
extern struct evcnt tcp_output_copybig;
extern struct evcnt tcp_output_refbig;
#define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++
#else
#define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */
#endif /* TCP_OUTPUT_COUNTERS */
static int
tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep,
bool *alwaysfragp)
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so = NULL;
struct rtentry *rt;
struct ifnet *ifp;
int size;
int hdrlen;
int optlen;
*alwaysfragp = false;
size = tcp_mssdflt;
switch (tp->t_family) {
case AF_INET:
hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
break;
#ifdef INET6
case AF_INET6:
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
break;
#endif
default:
hdrlen = 1; /* prevent zero sized segments */
goto out;
}
rt = inpcb_rtentry(inp);
so = inp->inp_socket;
if (rt == NULL) {
goto out;
}
ifp = rt->rt_ifp;
if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
#ifdef INET6
if (inp->inp_af == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph: if path MTU is
* smaller than 1280, use 1280 as packet size and
* attach fragment header.
*/
size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
*alwaysfragp = true;
} else
size = rt->rt_rmx.rmx_mtu - hdrlen;
#else
size = rt->rt_rmx.rmx_mtu - hdrlen;
#endif
} else if (ifp->if_flags & IFF_LOOPBACK)
size = ifp->if_mtu - hdrlen;
else if (inp->inp_af == AF_INET && tp->t_mtudisc)
size = ifp->if_mtu - hdrlen;
else if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
size = ifp->if_mtu - hdrlen;
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
/* mapped addr case */
struct in_addr d;
memcpy(&d, &in6p_faddr(inp).s6_addr32[3], sizeof(d));
if (tp->t_mtudisc || in_localaddr(d))
size = ifp->if_mtu - hdrlen;
} else {
/*
* for IPv6, path MTU discovery is always turned on,
* or the node must use packet size <= 1280.
*/
size = tp->t_mtudisc ? ifp->if_mtu : IPV6_MMTU;
size -= hdrlen;
}
}
#endif
inpcb_rtentry_unref(rt, inp);
out:
/*
* Now we must make room for whatever extra TCP/IP options are in
* the packet.
*/
optlen = tcp_optlen(tp);
/*
* XXX tp->t_ourmss should have the right size, but without this code
* fragmentation will occur... need more investigation
*/
if (inp->inp_af == AF_INET) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec4_hdrsiz_tcp(tp);
#endif
optlen += ip_optlen(inp);
}
#ifdef INET6
if (inp->inp_af == AF_INET6 && tp->t_family == AF_INET) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec4_hdrsiz_tcp(tp);
#endif
/* XXX size -= ip_optlen(in6p); */
} else if (inp->inp_af == AF_INET6) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec6_hdrsiz_tcp(tp);
#endif
optlen += ip6_optlen(inp);
}
#endif
size -= optlen;
/*
* There may not be any room for data if the MTU is too small. This
* includes the zero-sized case.
*/
if (size <= 0) {
return EMSGSIZE;
}
/*
* *rxsegsizep holds *estimated* inbound segment size (estimation
* assumes that path MTU is the same for both ways). This is only
* for silly window avoidance; do not use the value for other purposes.
*
* ipseclen is subtracted from both sides; this may not be right.
* I'm not quite sure about this (could someone comment?).
*/
*txsegsizep = uimin(tp->t_peermss - optlen, size);
*rxsegsizep = uimin(tp->t_ourmss - optlen, size);
/*
* Never send more than half a buffer full. This ensures that we can
* always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
* therefore acks will never be delayed unless we run out of data to
* transmit.
*/
if (so) {
*txsegsizep = uimin(so->so_snd.sb_hiwat >> 1, *txsegsizep);
}
/*
* A segment must at least store header + options
*/
if (*txsegsizep < hdrlen + optlen) {
return EMSGSIZE;
}
if (*txsegsizep != tp->t_segsz) {
/*
* If the new segment size is larger, we don't want to
* mess up the congestion window, but if it is smaller
* we'll have to reduce the congestion window to ensure
* that we don't get into trouble with initial windows
* and the rest. In any case, if the segment size
* has changed, chances are the path has, too, and
* our congestion window will be different.
*/
if (*txsegsizep < tp->t_segsz) {
tp->snd_cwnd = uimax((tp->snd_cwnd / tp->t_segsz)
* *txsegsizep, *txsegsizep);
tp->snd_ssthresh = uimax((tp->snd_ssthresh / tp->t_segsz)
* *txsegsizep, *txsegsizep);
}
tp->t_segsz = *txsegsizep;
}
return 0;
}
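/*
 * In rough terms (a summary sketch of the computation above, ignoring
 * the per-family and IPsec corner cases):
 *
 *	size      = (path MTU, if_mtu or tcp_mssdflt) - IP/TCP headers - optlen
 *	txsegsize = min(t_peermss - optlen, size, so_snd.sb_hiwat / 2)
 *	rxsegsize = min(t_ourmss  - optlen, size)
 *
 * and the congestion window is scaled down if the segment size shrank.
 */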
static int
tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
long len, int hdrlen, struct mbuf **mp)
{
struct mbuf *m, *m0;
uint64_t *tcps;
tcps = TCP_STAT_GETREF();
if (tp->t_force && len == 1)
tcps[TCP_STAT_SNDPROBE]++;
else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
tp->t_sndrexmitpack++;
tcps[TCP_STAT_SNDREXMITPACK]++;
tcps[TCP_STAT_SNDREXMITBYTE] += len;
} else {
tcps[TCP_STAT_SNDPACK]++;
tcps[TCP_STAT_SNDBYTE] += len;
}
TCP_STAT_PUTREF();
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (__predict_false(m == NULL))
return ENOBUFS;
MCLAIM(m, &tcp_tx_mowner);
/*
* XXX Because other code assumes headers will fit in
* XXX one header mbuf.
*
* (This code should almost *never* be run.)
*/
if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
return ENOBUFS;
}
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
/*
* To avoid traversing the whole sb_mb chain for correct
* data to send, remember last sent mbuf, its offset and
* the sent size. When called the next time, see if the
* data to send is directly following the previous transfer.
* This is important for large TCP windows.
*/
if (off == 0 || tp->t_lastm == NULL ||
(tp->t_lastoff + tp->t_lastlen) != off) {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
/*
* Either a new packet or a retransmit.
* Start from the beginning.
*/
tp->t_lastm = so->so_snd.sb_mb;
tp->t_inoff = off;
} else {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
tp->t_inoff += tp->t_lastlen;
}
/* Traverse forward to next packet */
while (tp->t_inoff > 0) {
if (tp->t_lastm == NULL)
panic("tp->t_lastm == NULL"); if (tp->t_inoff < tp->t_lastm->m_len)
break;
tp->t_inoff -= tp->t_lastm->m_len;
tp->t_lastm = tp->t_lastm->m_next;
}
tp->t_lastoff = off;
tp->t_lastlen = len;
m0 = tp->t_lastm;
off = tp->t_inoff;
if (len <= M_TRAILINGSPACE(m)) {
m_copydata(m0, off, (int)len, mtod(m, char *) + hdrlen);
m->m_len += len;
TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
} else {
m->m_next = m_copym(m0, off, (int)len, M_DONTWAIT);
if (m->m_next == NULL) {
m_freem(m);
return ENOBUFS;
}
#ifdef TCP_OUTPUT_COUNTERS
if (m->m_next->m_flags & M_EXT)
TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
else
TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
#endif
}
*mp = m;
return 0;
}
/*
* Tcp output routine: figure out what should be sent and send it.
*/
int
tcp_output(struct tcpcb *tp)
{
struct rtentry *rt = NULL;
struct socket *so;
struct route *ro;
long len, win;
int off, flags, error;
struct mbuf *m;
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
struct tcphdr *th;
u_char opt[MAX_TCPOPTLEN], *optp;
#define OPT_FITS(more) ((optlen + (more)) <= sizeof(opt))
unsigned optlen, hdrlen, packetlen;
unsigned int sack_numblks;
int idle, sendalot, txsegsize, rxsegsize;
int txsegsize_nosack;
int maxburst = TCP_MAXBURST;
int af; /* address family on the wire */
int iphdrlen;
int has_tso4, has_tso6;
int has_tso, use_tso;
bool alwaysfrag;
int sack_rxmit;
int sack_bytes_rxmt;
int ecn_tos;
struct sackhole *p;
#ifdef TCP_SIGNATURE
int sigoff = 0;
#endif
uint64_t *tcps;
so = tp->t_inpcb->inp_socket;
ro = &tp->t_inpcb->inp_route;
switch (af = tp->t_family) {
case AF_INET:
case AF_INET6:
if (tp->t_inpcb)
break;
return EINVAL;
default:
return EAFNOSUPPORT;
}
if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag))
return EMSGSIZE;
idle = (tp->snd_max == tp->snd_una);
/*
* Determine if we can use TCP segmentation offload:
* - If we're using IPv4
* - If there is not an IPsec policy that prevents it
* - If the interface can do it
*/
has_tso4 = has_tso6 = false;
has_tso4 = tp->t_inpcb->inp_af == AF_INET &&
#if defined(IPSEC)
(!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) &&
#endif
(rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
(rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0;
if (rt != NULL) {
rtcache_unref(rt, &tp->t_inpcb->inp_route);
rt = NULL;
}
#if defined(INET6)
has_tso6 = tp->t_inpcb->inp_af == AF_INET6 &&
#if defined(IPSEC)
(!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) &&
#endif
(rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
(rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0;
if (rt != NULL)
rtcache_unref(rt, &tp->t_inpcb->inp_route);
#endif /* defined(INET6) */
has_tso = (has_tso4 || has_tso6) && !alwaysfrag;
/*
* Restart Window computation. From draft-floyd-incr-init-win-03:
*
* Optionally, a TCP MAY set the restart window to the
* minimum of the value used for the initial window and
* the current value of cwnd (in other words, using a
* larger value for the restart window should never increase
* the size of cwnd).
*/
if (tcp_cwm) {
/*
* Hughes/Touch/Heidemann Congestion Window Monitoring.
* Count the number of packets currently pending
* acknowledgement, and limit our congestion window
* to a pre-determined allowed burst size plus that count.
* This prevents bursting once all pending packets have
* been acknowledged (i.e. transmission is idle).
*
* XXX Link this to Initial Window?
*/
tp->snd_cwnd = uimin(tp->snd_cwnd,
(tcp_cwm_burstsize * txsegsize) +
(tp->snd_nxt - tp->snd_una));
} else {
if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
/*
* We have been idle for "a while" and no acks are
* expected to clock out any data we send --
* slow start to get ack "clock" running again.
*/
int ss = tcp_init_win;
if (tp->t_inpcb->inp_af == AF_INET &&
in_localaddr(in4p_faddr(tp->t_inpcb)))
ss = tcp_init_win_local;
#ifdef INET6
else if (tp->t_inpcb->inp_af == AF_INET6 &&
in6_localaddr(&in6p_faddr(tp->t_inpcb)))
ss = tcp_init_win_local;
#endif
tp->snd_cwnd = uimin(tp->snd_cwnd,
TCP_INITIAL_WINDOW(ss, txsegsize));
}
}
txsegsize_nosack = txsegsize;
again:
ecn_tos = 0;
use_tso = has_tso;
if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) {
/* don't duplicate CWR/ECE. */
use_tso = 0;
}
TCP_REASS_LOCK(tp);
sack_numblks = tcp_sack_numblks(tp);
if (sack_numblks) {
int sackoptlen;
sackoptlen = TCP_SACK_OPTLEN(sack_numblks);
if (sackoptlen > txsegsize_nosack) {
sack_numblks = 0; /* give up SACK */
txsegsize = txsegsize_nosack;
} else {
if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
/* don't duplicate D-SACK. */
use_tso = 0;
}
txsegsize = txsegsize_nosack - sackoptlen;
}
} else {
txsegsize = txsegsize_nosack;
}
/*
* Determine length of data that should be transmitted, and
* flags that should be used. If there is some data or critical
* controls (SYN, RST) to send, then transmit; otherwise,
* investigate further.
*
* Readjust SACK information to avoid resending duplicate data.
*/
if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
win = uimin(tp->snd_wnd, tp->snd_cwnd);
flags = tcp_outflags[tp->t_state];
/*
* Send any SACK-generated retransmissions. If we're explicitly trying
* to send out new data (when sendalot is 1), bypass this function.
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
* we're replacing a (future) new transmission with a retransmission
* now, and we previously incremented snd_cwnd in tcp_input().
*/
/*
* Still in sack recovery, reset rxmit flag to zero.
*/
sack_rxmit = 0;
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
do {
long cwin;
if (!TCP_SACK_ENABLED(tp))
break;
if (tp->t_partialacks < 0)
break;
p = tcp_sack_output(tp, &sack_bytes_rxmt);
if (p == NULL)
break;
cwin = uimin(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
* (At least) part of sack hole extends beyond
* snd_recover. Check to see if we can rexmit data
* for this hole.
*/
if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
/*
* Can't rexmit any more data for this hole.
* That data will be rexmitted in the next
* sack recovery episode, when snd_recover
* moves past p->rxmit.
*/
p = NULL;
break;
}
/* Can rexmit part of the current hole */
len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
} else
len = ((long)ulmin(cwin, p->end - p->rxmit));
off = p->rxmit - tp->snd_una;
if (off + len > so->so_snd.sb_cc) {
/* 1 for TH_FIN */
KASSERT(off + len == so->so_snd.sb_cc + 1);
KASSERT(p->rxmit + len == tp->snd_max);
len = so->so_snd.sb_cc - off;
}
if (len > 0) {
sack_rxmit = 1;
sendalot = 1;
}
} while (/*CONSTCOND*/0);
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
* and timer expired, we will send what we can
* and go to transmit state.
*/
if (tp->t_force) {
if (win == 0) {
/*
* If we still have some data to send, then
* clear the FIN bit. Usually this would
* happen below when it realizes that we
* aren't sending all the data. However,
* if we have exactly 1 byte of unsent data,
* then it won't clear the FIN bit below,
* and if we are in persist state, we wind
* up sending the packet without recording
* that we sent the FIN bit.
*
* We can't just blindly clear the FIN bit,
* because if we don't have any more data
* to send then the probe will be the FIN
* itself.
*/
if (off < so->so_snd.sb_cc)
flags &= ~TH_FIN;
win = 1;
} else {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
}
if (sack_rxmit == 0) {
if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) {
long cwin;
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
if (tp->snd_wnd < so->so_snd.sb_cc) {
len = tp->snd_wnd - off;
flags &= ~TH_FIN;
} else {
len = so->so_snd.sb_cc - off;
}
/*
* From FreeBSD:
* Don't remove this (len > 0) check !
* We explicitly check for len > 0 here (although it
* isn't really necessary), to work around a gcc
* optimization issue - to force gcc to compute
* len above. Without this check, the computation
* of len is bungled by the optimizer.
*/
if (len > 0) {
cwin = tp->snd_cwnd -
(tp->snd_nxt - tp->sack_newdata) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
if (cwin < len) {
len = cwin;
flags &= ~TH_FIN;
}
}
} else if (win < so->so_snd.sb_cc) {
len = win - off;
flags &= ~TH_FIN;
} else {
len = so->so_snd.sb_cc - off;
}
}
if (len < 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
* len will be -1. Otherwise, window shrank
* after we sent into it. If window shrank to 0,
* cancel pending retransmit, pull snd_nxt back
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
* close completely, just wait for an ACK.
*
* If we have a pending FIN, either it has already been
* transmitted or it is outside the window, so drop it.
* If the FIN has been transmitted, but this is not a
* retransmission, then len must be -1. Therefore we also
* prevent here the sending of `gratuitous FINs'. This
* eliminates the need to check for that case below (e.g.
* to back up snd_nxt before the FIN so that the sequence
* number is correct).
*/
len = 0;
flags &= ~TH_FIN;
if (win == 0) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
tcp_setpersist(tp);
}
}
/*
* Automatic sizing enables the performance of large buffers
* and most of the efficiency of small ones by only allocating
* space when it is needed.
*
* The criteria to step up the send buffer one notch are:
* 1. receive window of remote host is larger than send buffer
* (with a fudge factor of 5/4th);
* 2. send buffer is filled to 7/8th with data (so we actually
* have data to make use of it);
* 3. send buffer fill has not hit maximal automatic size;
* 4. our send window (slow start and congestion controlled) is
* larger than sent but unacknowledged data in send buffer.
*
* The remote host receive window scaling factor may limit the
* growing of the send buffer before it reaches its allowed
* maximum.
*
* It scales directly with slow start or congestion window
* and does at most one step per received ACK. This fast
* scaling has the drawback of growing the send buffer beyond
* what is strictly necessary to make full use of a given
* delay*bandwidth product. However testing has shown this not
* to be much of a problem. At worst we are trading wasting
* of available bandwidth (the non-use of it) for wasting some
* socket buffer memory.
*
* TODO: Shrink send buffer during idle periods together
* with congestion window. Requires another timer.
*/
if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
so->so_snd.sb_cc < tcp_autosndbuf_max &&
win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
if (!sbreserve(&so->so_snd,
uimin(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
tcp_autosndbuf_max), so))
so->so_snd.sb_flags &= ~SB_AUTOSIZE;
}
}
if (len > txsegsize) {
if (use_tso) {
/*
* Truncate TSO transfers to IP_MAXPACKET, and make
* sure that we send equal size transfers down the
* stack (rather than big-small-big-small-...).
*/
#ifdef INET6
CTASSERT(IPV6_MAXPACKET == IP_MAXPACKET);
#endif
len = (uimin(len, IP_MAXPACKET) / txsegsize) * txsegsize;
if (len <= txsegsize) {
use_tso = 0;
}
} else
len = txsegsize;
flags &= ~TH_FIN;
sendalot = 1;
} else
use_tso = 0;
if (sack_rxmit) {
if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
}
win = sbspace(&so->so_rcv);
/*
* Sender silly window avoidance. If the connection is idle
* and we can send all the data, a full maximum segment, or
* at least a maximum default-size segment, then send it;
* likewise if we are forced. Otherwise don't bother.
* If peer's buffer is tiny, then send
* when window is at least half open.
* If retransmitting (possibly after persist timer forced us
* to send into a small window), then must resend.
*/
if (len) {
if (len >= txsegsize)
goto send;
if ((so->so_state & SS_MORETOCOME) == 0 &&
((idle || tp->t_flags & TF_NODELAY) &&
len + off >= so->so_snd.sb_cc))
goto send;
if (tp->t_force)
goto send;
if (len >= tp->max_sndwnd / 2)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max))
goto send;
if (sack_rxmit)
goto send;
}
/*
* Compare available window to amount of window known to peer
* (as advertised window less next expected input). If the
* difference is at least twice the size of the largest segment
* we expect to receive (i.e. two segments) or at least 50% of
* the maximum possible window, then we want to send a window update
* to peer.
*/
if (win > 0) {
/*
* "adv" is the amount we can increase the window,
* taking into account that we are limited by
* TCP_MAXWIN << tp->rcv_scale.
*/
long recwin = uimin(win, (long)TCP_MAXWIN << tp->rcv_scale);
long oldwin, adv;
/*
* rcv_nxt may overtake rcv_adv when we accept a
* zero-window probe.
*/
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
oldwin = tp->rcv_adv - tp->rcv_nxt;
else
oldwin = 0;
/*
* If the new window size ends up being the same as or
* less than the old size when it is scaled, then
* don't force a window update.
*/
if (recwin >> tp->rcv_scale <= oldwin >> tp->rcv_scale)
goto dontupdate;
adv = recwin - oldwin;
if (adv >= (long) (2 * rxsegsize))
goto send;
if (2 * adv >= (long) so->so_rcv.sb_hiwat)
goto send;
}
dontupdate:
/*
* Send if we owe peer an ACK.
*/
if (tp->t_flags & TF_ACKNOW)
goto send;
if (flags & (TH_SYN|TH_FIN|TH_RST))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
/*
* In SACK, it is possible for tcp_output to fail to send a segment
* after the retransmission timer has been turned off. Make sure
* that the retransmission timer is set.
*/
if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
!TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
!TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
goto just_return;
}
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to ensure receipt of window
* updates. The three ``states'' for the output side are:
* idle not doing retransmits or persists
* persisting to move a small or zero window
* (re)transmitting and thereby not persisting
*
* tp->t_timer[TCPT_PERSIST]
* is set when we are in persist state.
* tp->t_force
* is set when we are called to send a persist packet.
* tp->t_timer[TCPT_REXMT]
* is set when we are retransmitting
* The output side is idle when both timers are zero.
*
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state.
* If nothing happens soon, send when timer expires:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
/*
* No reason to send a segment, just return.
*/
just_return:
TCP_REASS_UNLOCK(tp);
return 0;
send:
/*
* Before ESTABLISHED, force sending of initial options unless TCP is
* set not to do any options.
*
* Note: we assume that the IP/TCP header plus TCP options always fit
* in a single mbuf, leaving room for a maximum link header, i.e.:
* max_linkhdr + IP_header + TCP_header + optlen <= MCLBYTES
*/
optlen = 0;
optp = opt;
switch (af) {
case AF_INET:
iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
break;
#ifdef INET6
case AF_INET6:
iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
break;
#endif
default: /*pacify gcc*/
iphdrlen = 0;
break;
}
hdrlen = iphdrlen;
if (flags & TH_SYN) {
struct rtentry *synrt;
synrt = inpcb_rtentry(tp->t_inpcb);
tp->snd_nxt = tp->iss;
tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ? synrt->rt_ifp : NULL, af);
inpcb_rtentry_unref(synrt, tp->t_inpcb);
if ((tp->t_flags & TF_NOOPT) == 0 && OPT_FITS(TCPOLEN_MAXSEG)) {
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
*optp++ = (tp->t_ourmss >> 8) & 0xff;
*optp++ = tp->t_ourmss & 0xff;
optlen += TCPOLEN_MAXSEG;
if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE)) &&
OPT_FITS(TCPOLEN_WINDOW + TCPOLEN_NOP)) {
*((uint32_t *)optp) = htonl(
TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 |
TCPOLEN_WINDOW << 8 |
tp->request_r_scale);
optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
}
if (tcp_do_sack && OPT_FITS(TCPOLEN_SACK_PERMITTED)) {
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
optlen += TCPOLEN_SACK_PERMITTED;
}
}
}
/*
* Send a timestamp and echo-reply if this is a SYN and our side
* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
* and our peer have sent timestamps in our SYN's.
*/
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(flags & TH_RST) == 0 &&
((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
(tp->t_flags & TF_RCVD_TSTMP))) {
int alen = 0;
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
alen++;
}
if (OPT_FITS(TCPOLEN_TIMESTAMP)) {
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
uint32_t *lp = (uint32_t *)optp;
/* Form timestamp option (appendix A of RFC 1323) */
*lp++ = htonl(TCP_TIMESTAMP(tp));
*lp = htonl(tp->ts_recent);
optp += TCPOLEN_TIMESTAMP - 2;
optlen += TCPOLEN_TIMESTAMP;
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = TCP_TIMESTAMP(tp);
} else {
optp -= alen;
optlen -= alen;
}
}
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE) {
/*
* Initialize TCP-MD5 option (RFC2385)
*/
if (!OPT_FITS(TCPOLEN_SIGNATURE))
goto reset;
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
sigoff = optlen + 2;
memset(optp, 0, TCP_SIGLEN);
optlen += TCPOLEN_SIGNATURE;
optp += TCP_SIGLEN;
}
#endif
/*
* Tack on the SACK block if it is necessary.
*/
if (sack_numblks) {
int alen = 0;
int sack_len = sack_numblks * 8;
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
alen++;
}
if (OPT_FITS(sack_len + 2)) {
struct ipqent *tiqe;
*optp++ = TCPOPT_SACK;
*optp++ = sack_len + 2;
uint32_t *lp = (uint32_t *)optp;
if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
sack_numblks--;
*lp++ = htonl(tp->rcv_dsack_block.left);
*lp++ = htonl(tp->rcv_dsack_block.right);
tp->rcv_sack_flags &= ~TCPSACK_HAVED;
}
for (tiqe = TAILQ_FIRST(&tp->timeq);
sack_numblks > 0;
tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
KASSERT(tiqe != NULL);
sack_numblks--;
*lp++ = htonl(tiqe->ipqe_seq);
*lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
}
optlen += sack_len + 2;
optp += sack_len;
} else {
optp -= alen;
optlen -= alen;
}
}
/* Terminate and pad TCP options to a 4 byte boundary. */
if (optlen % 4) {
if (!OPT_FITS(TCPOLEN_EOL)) {
reset:
TCP_REASS_UNLOCK(tp);
error = ECONNABORTED;
goto out;
}
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
/*
* According to RFC 793 (STD0007):
* "The content of the header beyond the End-of-Option option
* must be header padding (i.e., zero)."
* and later: "The padding is composed of zeros."
*/
while (optlen % 4) {
if (!OPT_FITS(TCPOLEN_PAD))
goto reset;
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
TCP_REASS_UNLOCK(tp);
hdrlen += optlen;
#ifdef DIAGNOSTIC
if (!use_tso && len > txsegsize)
panic("tcp data to be sent is larger than segment");
else if (use_tso && len > IP_MAXPACKET)
panic("tcp data to be sent is larger than max TSO size");
if (max_linkhdr + hdrlen > MCLBYTES)
panic("tcphdr too big");
#endif
/*
* Grab a header mbuf, attaching a copy of data to
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
if (len) {
error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
if (error)
goto out;
/*
* If we're sending everything we've got, set PUSH.
* (This will keep happy those implementations which only
* give data to the user when a buffer fills or
* a PUSH comes in.)
*/
if (off + len == so->so_snd.sb_cc)
flags |= TH_PUSH;
} else {
tcps = TCP_STAT_GETREF();
if (tp->t_flags & TF_ACKNOW)
tcps[TCP_STAT_SNDACKS]++;
else if (flags & (TH_SYN|TH_FIN|TH_RST))
tcps[TCP_STAT_SNDCTRL]++;
else if (SEQ_GT(tp->snd_up, tp->snd_una))
tcps[TCP_STAT_SNDURG]++;
else
tcps[TCP_STAT_SNDWINUP]++;
TCP_STAT_PUTREF();
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL) {
error = ENOBUFS;
goto out;
}
MCLAIM(m, &tcp_tx_mowner);
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
m_reset_rcvif(m);
switch (af) {
case AF_INET:
ip = mtod(m, struct ip *);
#ifdef INET6
ip6 = NULL;
#endif
th = (struct tcphdr *)(ip + 1);
break;
#ifdef INET6
case AF_INET6:
ip = NULL;
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
break;
#endif
default: /*pacify gcc*/
ip = NULL;
#ifdef INET6
ip6 = NULL;
#endif
th = NULL;
break;
}
if (tp->t_template == NULL)
panic("%s: no template", __func__);
if (tp->t_template->m_len < iphdrlen)
panic("%s: %d < %d", __func__, tp->t_template->m_len, iphdrlen);
bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen);
/*
* If we are starting a connection, send ECN setup
* SYN packet. If we are on a retransmit, we may
* resend those bits a number of times as per
* RFC 3168.
*/
if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
if (tp->t_flags & TF_SYN_REXMT) {
if (tp->t_ecn_retries--)
flags |= TH_ECE|TH_CWR;
} else {
flags |= TH_ECE|TH_CWR;
tp->t_ecn_retries = tcp_ecn_maxretries;
}
}
if (TCP_ECN_ALLOWED(tp)) {
/*
* If the peer has ECN, mark data packets
* ECN capable. Ignore pure ack packets, retransmissions
* and window probes.
*/
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
!(tp->t_force && len == 1)) {
ecn_tos = IPTOS_ECN_ECT0;
TCP_STATINC(TCP_STAT_ECN_ECT);
}
/*
* Reply with proper ECN notifications.
*/
if (tp->t_flags & TF_ECN_SND_CWR) {
flags |= TH_CWR;
tp->t_flags &= ~TF_ECN_SND_CWR;
}
if (tp->t_flags & TF_ECN_SND_ECE) {
flags |= TH_ECE;
}
}
/*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only
* packets, we do not want the sequence number of the
* retransmitted packet, we want the sequence number
* of the next unsent octet. So, if there is no data
* (and no SYN or FIN), use snd_max instead of snd_nxt
* when filling in ti_seq. But if we are in persist
* state, snd_max might reflect one byte beyond the
* right edge of the window, so use snd_nxt in that
* case, since we know we aren't doing a retransmission.
* (retransmit and persist are mutually exclusive...)
*/
if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
} else {
if (len || (flags & (TH_SYN|TH_FIN)) ||
TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
memcpy(th + 1, opt, optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
th->th_flags = flags;
/*
* Calculate receive window. Don't shrink window,
* but avoid silly window syndrome.
*/
if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
win = 0;
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
if (th->th_win == 0) {
tp->t_sndzerowin++;
}
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
u_int32_t urp = tp->snd_up - tp->snd_nxt;
if (urp > IP_MAXPACKET)
urp = IP_MAXPACKET;
th->th_urp = htons((u_int16_t)urp);
th->th_flags |= TH_URG;
} else
/*
* If no urgent pointer to send, then we pull
* the urgent pointer to the left edge of the send window
* so that it doesn't drift into the send window on sequence
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
#ifdef TCP_SIGNATURE
if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
struct secasvar *sav;
u_int8_t *sigp;
sav = tcp_signature_getsav(m);
if (sav == NULL) {
if (m)
m_freem(m);
return EPERM;
}
m->m_pkthdr.len = hdrlen + len;
sigp = (char *)th + sizeof(*th) + sigoff;
tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp);
key_sa_recordxfer(sav, m);
KEY_SA_UNREF(&sav);
}
#endif
/*
* Set ourselves up to be checksummed just before the packet
* hits the wire.
*/
switch (af) {
case AF_INET:
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
if (use_tso) {
m->m_pkthdr.segsz = txsegsize;
m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
} else {
m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
if (len + optlen) {
/* Fixup the pseudo-header checksum. */
/* XXXJRT Not IP Jumbogram safe. */
th->th_sum = in_cksum_addword(th->th_sum,
htons((u_int16_t) (len + optlen)));
}
}
break;
#ifdef INET6
case AF_INET6:
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
if (use_tso) {
m->m_pkthdr.segsz = txsegsize;
m->m_pkthdr.csum_flags = M_CSUM_TSOv6;
} else {
m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
if (len + optlen) {
/* Fixup the pseudo-header checksum. */
/* XXXJRT: Not IPv6 Jumbogram safe. */
th->th_sum = in_cksum_addword(th->th_sum,
htons((u_int16_t) (len + optlen)));
}
}
break;
#endif
}
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
*/
if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
tcp_seq startseq = tp->snd_nxt;
/*
* Advance snd_nxt over sequence space of this segment.
* There are no states in which we send both a SYN and a FIN,
* so we collapse the tests for these flags.
*/
if (flags & (TH_SYN|TH_FIN))
tp->snd_nxt++;
if (sack_rxmit)
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
tp->t_rtttime = tcp_now;
tp->t_rtseq = startseq;
TCP_STATINC(TCP_STAT_SEGSTIMED);
}
}
/*
* Set retransmit timer if not currently set,
* and not doing an ack or a keep-alive probe.
* Initial value for retransmit timer is smoothed
* round-trip time + 2 * round-trip time variance.
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
timer:
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) {
if ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
tp->snd_nxt != tp->snd_una) {
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
} else if (len == 0 && so->so_snd.sb_cc > 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
/*
* If we are sending a window probe and there's
* unacked data in the socket, make sure at
* least the persist timer is running.
*/
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
}
} else
if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
tp->snd_max = tp->snd_nxt + len;
#ifdef TCP_DEBUG
/*
* Trace.
*/
if (so->so_options & SO_DEBUG)
tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
#endif
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
* to handle ttl and tos; we could keep them in
* the template, but need a way to checksum without them.
*/
m->m_pkthdr.len = hdrlen + len;
switch (af) {
case AF_INET:
ip->ip_len = htons(m->m_pkthdr.len);
packetlen = m->m_pkthdr.len;
if (tp->t_inpcb->inp_af == AF_INET) {
ip->ip_ttl = in4p_ip(tp->t_inpcb).ip_ttl;
ip->ip_tos = in4p_ip(tp->t_inpcb).ip_tos | ecn_tos;
}
#ifdef INET6
else if (tp->t_inpcb->inp_af == AF_INET6) {
ip->ip_ttl = in6pcb_selecthlim(tp->t_inpcb, NULL); /*XXX*/
ip->ip_tos = ecn_tos; /*XXX*/
}
#endif
break;
#ifdef INET6
case AF_INET6:
packetlen = m->m_pkthdr.len;
ip6->ip6_nxt = IPPROTO_TCP;
if (tp->t_family == AF_INET6) {
/*
* we separately set hoplimit for every segment, since
* the user might want to change the value via
* setsockopt. Also, desired default hop limit might
* be changed via Neighbor Discovery.
*/
ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb);
}
ip6->ip6_flow |= htonl(ecn_tos << 20);
/* ip6->ip6_flow = ??? (from template) */
/* ip6_plen will be filled in ip6_output(). */
break;
#endif
default: /*pacify gcc*/
packetlen = 0;
break;
}
switch (af) {
case AF_INET:
{
struct mbuf *opts;
if (tp->t_inpcb->inp_af == AF_INET)
opts = tp->t_inpcb->inp_options;
else
opts = NULL;
error = ip_output(m, opts, ro,
(tp->t_mtudisc ? IP_MTUDISC : 0) |
(so->so_options & SO_DONTROUTE), NULL, tp->t_inpcb);
break;
}
#ifdef INET6
case AF_INET6:
{
struct ip6_pktopts *opts;
if (tp->t_inpcb->inp_af == AF_INET6)
opts = in6p_outputopts(tp->t_inpcb);
else
opts = NULL;
error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE,
NULL, tp->t_inpcb, NULL);
break;
}
#endif
default:
error = EAFNOSUPPORT;
break;
}
if (error) {
out:
if (error == ENOBUFS) {
TCP_STATINC(TCP_STAT_SELFQUENCH);
tcp_quench(tp->t_inpcb);
error = 0;
} else if ((error == EHOSTUNREACH || error == ENETDOWN ||
error == EHOSTDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_softerror = error;
error = 0;
}
/* Back out the sequence number advance. */
if (sack_rxmit)
p->rxmit -= len;
/* Restart the delayed ACK timer, if necessary. */
if (tp->t_flags & TF_DELACK)
TCP_RESTART_DELACK(tp);
return error;
}
if (packetlen > tp->t_pmtud_mtu_sent)
tp->t_pmtud_mtu_sent = packetlen;
tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDTOTAL]++;
if (tp->t_flags & TF_DELACK)
tcps[TCP_STAT_DELACK]++;
TCP_STAT_PUTREF();
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
* Any pending ACK has now been sent.
*/
if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + win;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~TF_ACKNOW;
TCP_CLEAR_DELACK(tp);
#ifdef DIAGNOSTIC
if (maxburst < 0)
printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
#endif
if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
goto again;
return 0;
}
void
tcp_setpersist(struct tcpcb *tp)
{
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
int nticks;
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
panic("tcp_output REXMT");
/*
* Start/restart the persistence timer.
*/
if (t < tp->t_rttmin)
t = tp->t_rttmin;
TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
TCPTV_PERSMIN, TCPTV_PERSMAX);
TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}
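/*
 * Worked example (illustrative numbers only): with t_srtt == 24 and
 * t_rttvar == 16 in their fixed-point units, t = ((24 >> 2) + 16) >> 3
 * == 2, clipped to at least t_rttmin; the persist timer is then armed
 * for t * tcp_backoff[t_rxtshift] ticks, clamped to the
 * [TCPTV_PERSMIN, TCPTV_PERSMAX] range.
 */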
/* $NetBSD: kern_module_hook.c,v 1.4 2019/12/13 08:02:53 skrll Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel module support.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_module_hook.c,v 1.4 2019/12/13 08:02:53 skrll Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/module_hook.h>
#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <uvm/uvm_extern.h>
/* Locking/synchronization stuff for module hooks */
static struct {
kmutex_t mtx;
kcondvar_t cv;
pserialize_t psz;
} module_hook __cacheline_aligned;
/*
* We use pserialize_perform() to issue a memory barrier on the current
* CPU and on all other CPUs so that all prior memory operations on the
* current CPU globally happen before all subsequent memory operations
* on the current CPU, as perceived by any other CPU.
*
* pserialize_perform() might be rather heavy-weight here, but it only
* happens during module loading, and it allows MODULE_HOOK_CALL() to
* work without any other memory barriers.
*/
void
module_hook_set(bool *hooked, struct localcount *lc)
{
KASSERT(kernconfig_is_held());
KASSERT(!*hooked);
localcount_init(lc);
/* Wait until setup has been witnessed by all CPUs. */
pserialize_perform(module_hook.psz);
/* Let others use it */
atomic_store_relaxed(hooked, true);
}
void
module_hook_unset(bool *hooked, struct localcount *lc)
{
KASSERT(kernconfig_is_held());
KASSERT(*hooked);
/* Get exclusive with pserialize and localcount. */
mutex_enter(&module_hook.mtx);
/* Prevent new calls to module_hook_tryenter(). */
atomic_store_relaxed(hooked, false);
/* Wait for existing calls to module_hook_tryenter(). */
pserialize_perform(module_hook.psz);
/* Wait for module_hook_exit. */
localcount_drain(lc, &module_hook.cv, &module_hook.mtx);
/* All done! */
mutex_exit(&module_hook.mtx);
localcount_fini(lc);
}
bool
module_hook_tryenter(bool *hooked, struct localcount *lc)
{
bool call_hook;
int s;
s = pserialize_read_enter();
call_hook = atomic_load_relaxed(hooked);
if (call_hook)
localcount_acquire(lc);
pserialize_read_exit(s);
return call_hook;
}
void
module_hook_exit(struct localcount *lc)
{
localcount_release(lc, &module_hook.cv, &module_hook.mtx);
}
void
module_hook_init(void)
{
mutex_init(&module_hook.mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&module_hook.cv, "mod_hook");
module_hook.psz = pserialize_create();
}
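/*
 * Typical caller-side pattern (a sketch only; real call sites go through
 * the MODULE_HOOK_CALL() machinery in sys/module_hook.h, and the names
 * foo_hook_set, foo_hook_lc and foo_hook_fn below are hypothetical):
 *
 *	if (module_hook_tryenter(&foo_hook_set, &foo_hook_lc)) {
 *		error = (*foo_hook_fn)(args);
 *		module_hook_exit(&foo_hook_lc);
 *	} else
 *		error = ENOSYS;		(hook module not loaded)
 *
 * module_hook_set()/module_hook_unset() are called by the hook-providing
 * module at load and unload time, under the kernel config lock.
 */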
/* $NetBSD: sched_4bsd.c,v 1.46 2022/10/26 23:24:09 riastradh Exp $ */
/*
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
* Daniel Sieger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.46 2022/10/26 23:24:09 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/lockdebug.h>
#include <sys/intr.h>
#include <sys/atomic.h>
static void updatepri(struct lwp *);
static void resetpriority(struct lwp *);
/* Number of hardclock ticks per sched_tick() */
u_int sched_rrticks __read_mostly;
/*
* Force switch among equal priority processes every 100ms.
* Called from hardclock every hz/10 == sched_rrticks hardclock ticks.
*/
/* ARGSUSED */
void
sched_tick(struct cpu_info *ci)
{
struct schedstate_percpu *spc = &ci->ci_schedstate;
pri_t pri = PRI_NONE;
lwp_t *l;
spc->spc_ticks = sched_rrticks;
if (CURCPU_IDLE_P()) {
spc_lock(ci);
sched_resched_cpu(ci, MAXPRI_KTHREAD, true);
/* spc now unlocked */
return;
}
l = ci->ci_onproc;
if (l == NULL) {
return;
}
/*
* Can only be spc_lwplock or a turnstile lock at this point
* (if we interrupted priority inheritance trylock dance).
*/
KASSERT(l->l_mutex != spc->spc_mutex);
switch (l->l_class) {
case SCHED_FIFO:
/* No timeslicing for FIFO jobs. */
break;
case SCHED_RR:
/* Force it into mi_switch() to look for other jobs to run. */
pri = MAXPRI_KERNEL_RT;
break;
default:
if (spc->spc_flags & SPCF_SHOULDYIELD) {
/*
* Process is stuck in kernel somewhere, probably
* due to buggy or inefficient code. Force a
* kernel preemption.
*/
pri = MAXPRI_KERNEL_RT;
} else if (spc->spc_flags & SPCF_SEENRR) {
/*
* The process has already been through a roundrobin
* without switching and may be hogging the CPU.
* Indicate that the process should yield.
*/
pri = MAXPRI_KTHREAD;
spc->spc_flags |= SPCF_SHOULDYIELD;
} else if ((spc->spc_flags & SPCF_1STCLASS) == 0) {
/*
* For SMT or asymmetric systems push a little
* harder: if this is not a 1st class CPU, try to
* find a better one to run this LWP.
*/
pri = MAXPRI_KTHREAD;
spc->spc_flags |= SPCF_SHOULDYIELD;
} else {
spc->spc_flags |= SPCF_SEENRR;
}
break;
}
if (pri != PRI_NONE) {
spc_lock(ci);
sched_resched_cpu(ci, pri, true);
/* spc now unlocked */
}
}
/*
* Why PRIO_MAX - 2? From setpriority(2):
*
* prio is a value in the range -20 to 20. The default priority is
* 0; lower priorities cause more favorable scheduling. A value of
* 19 or 20 will schedule a process only when nothing at priority <=
* 0 is runnable.
*
* This gives estcpu influence over 18 priority levels, and leaves nice
* with 40 levels. One way to think about it is that nice has 20 levels
* either side of estcpu's 18.
*/
#define ESTCPU_SHIFT 11
#define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT)
#define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1))
#define ESTCPULIM(e) uimin((e), ESTCPU_MAX)
/*
* The main parameter used by this algorithm is 'l_estcpu'. It is an estimate
* of the recent CPU utilization of the thread.
*
* l_estcpu is:
* - increased each time the hardclock ticks and the thread is found to
* be executing, in sched_schedclock() called from hardclock()
* - decreased (filtered) on each sched tick, in sched_pstats_hook()
* If the lwp is sleeping for more than a second, we don't touch l_estcpu: it
* will be updated in sched_setrunnable() when the lwp wakes up, in burst mode
* (ie, we decrease it n times).
*
* Note that hardclock updates l_estcpu and l_cpticks independently.
*
* -----------------------------------------------------------------------------
*
* Here we describe how l_estcpu is decreased.
*
* Constants for digital decay (filter):
* 90% of l_estcpu usage in (5 * loadavg) seconds
*
* We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds. That is, we
* want to compute a value of decay such that the following loop:
* for (i = 0; i < (5 * loadavg); i++)
* l_estcpu *= decay;
* will result in
* l_estcpu *= 0.1;
* for all values of loadavg.
*
* Mathematically this loop can be expressed by saying:
* decay ** (5 * loadavg) ~= .1
*
* And finally, the corresponding value of decay we're using is:
* decay = (2 * loadavg) / (2 * loadavg + 1)
*
* -----------------------------------------------------------------------------
*
* Now, let's prove that the value of decay stated above will always fulfill
* the equation:
* decay ** (5 * loadavg) ~= .1
*
* If we compute b as:
* b = 2 * loadavg
* then
* decay = b / (b + 1)
*
* We now need to prove two things:
* 1) Given [factor ** (5 * loadavg) =~ .1], prove [factor == b/(b+1)].
* 2) Given [b/(b+1) ** power =~ .1], prove [power == (5 * loadavg)].
*
* Facts:
* * For x real: exp(x) = x**0/0! + x**1/1! + x**2/2! + ...
* Therefore, for x close to zero, exp(x) =~ 1 + x.
* In turn, for b large enough, exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
*
* * For b large enough, (b-1)/b =~ b/(b+1).
*
* * For x belonging to [-1;1[, ln(1-x) = - x - x**2/2 - x**3/3 - ...
* Therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
*
* * ln(0.1) =~ -2.30
*
* Proof of (1):
* factor ** (5 * loadavg) =~ 0.1
* => ln(factor) =~ -2.30 / (5 * loadavg)
* => factor =~ exp(-1 / ((5 / 2.30) * loadavg))
* =~ exp(-1 / (2 * loadavg))
* =~ exp(-1 / b)
* =~ (b - 1) / b
* =~ b / (b + 1)
* =~ (2 * loadavg) / ((2 * loadavg) + 1)
*
* Proof of (2):
* (b / (b + 1)) ** power =~ .1
* => power * ln(b / (b + 1)) =~ -2.30
* => power * (-1 / (b + 1)) =~ -2.30
* => power =~ 2.30 * (b + 1)
* => power =~ 4.60 * loadavg + 2.30
* => power =~ 5 * loadavg
*
* Conclusion: decay = (2 * loadavg) / (2 * loadavg + 1)
*/
/* See calculations above */
#define loadfactor(loadavg) (2 * (loadavg))
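/*
 * Illustrative sketch (not compiled into the kernel): a standalone userland
 * program that numerically checks the claim above, i.e. that with
 * decay = (2 * loadavg) / (2 * loadavg + 1), applying the filter
 * 5 * loadavg times leaves roughly 10% of the original value.  The loadavg
 * samples below are arbitrary.
 */
#if 0
#include <stdio.h>
#include <math.h>

int
main(void)
{
	const double loadavg[] = { 1.0, 2.0, 5.0, 10.0 };

	for (size_t i = 0; i < sizeof(loadavg) / sizeof(loadavg[0]); i++) {
		double b = 2.0 * loadavg[i];
		double decay = b / (b + 1.0);
		double left = pow(decay, 5.0 * loadavg[i]);

		printf("loadavg %5.1f: decay^(5*loadavg) = %.4f (expect ~0.1)\n",
		    loadavg[i], left);
	}
	return 0;
}
#endif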
static fixpt_t
decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
{
if (estcpu == 0) {
return 0;
}
#if !defined(_LP64)
/* avoid 64bit arithmetics. */
#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
return estcpu * loadfac / (loadfac + FSCALE);
}
#endif
return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
}
static fixpt_t
decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
{
/*
* For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT),
* if we slept for at least seven times the loadfactor, we will decay
* l_estcpu to less than (1 << ESTCPU_SHIFT), and therefore we can
* return zero directly.
*
* Note that our ESTCPU_MAX is actually much smaller than
* (255 << ESTCPU_SHIFT).
*/
if ((n << FSHIFT) >= 7 * loadfac) {
return 0;
}
while (estcpu != 0 && n > 1) {
estcpu = decay_cpu(loadfac, estcpu);
n--;
}
return estcpu;
}
/*
* sched_pstats_hook:
*
* Periodically called from sched_pstats(); used to recalculate priorities.
*/
void
sched_pstats_hook(struct lwp *l, int batch)
{
fixpt_t loadfac;
/*
* If the LWP has slept an entire second, stop recalculating
* its priority until it wakes up.
*/
KASSERT(lwp_locked(l, NULL));
if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
l->l_stat == LSSUSPENDED) {
if (l->l_slptime > 1) {
return;
}
}
loadfac = loadfactor(averunnable.ldavg[0]);
l->l_estcpu = decay_cpu(loadfac, l->l_estcpu);
resetpriority(l);
}
/*
* Recalculate the priority of an LWP after it has slept for a while.
*/
static void
updatepri(struct lwp *l)
{
fixpt_t loadfac;
KASSERT(lwp_locked(l, NULL));
KASSERT(l->l_slptime > 1);
loadfac = loadfactor(averunnable.ldavg[0]);
l->l_slptime--; /* the first time was done in sched_pstats */
l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime);
resetpriority(l);
}
void
sched_rqinit(void)
{
}
void
sched_setrunnable(struct lwp *l)
{
if (l->l_slptime > 1)
updatepri(l);
}
void
sched_nice(struct proc *p, int n)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
p->p_nice = n;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
resetpriority(l);
lwp_unlock(l);
}
}
/*
* Recompute the priority of an LWP. Arrange to reschedule if
* the resulting priority is better than that of the current LWP.
*/
static void
resetpriority(struct lwp *l)
{
pri_t pri;
struct proc *p = l->l_proc;
KASSERT(lwp_locked(l, NULL));
if (l->l_class != SCHED_OTHER)
return;
/* See comments above ESTCPU_SHIFT definition. */
pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice;
pri = imax(pri, 0);
if (pri != l->l_priority)
lwp_changepri(l, pri);
}
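/*
 * Illustrative sketch (not compiled into the kernel): the user-priority
 * formula from resetpriority() above as a pure function.  PRI_KERNEL's
 * numeric value is version/configuration dependent, so it is taken as a
 * parameter here rather than assumed; estcpu is in the same ESTCPU_SHIFT
 * fixed-point scale used above.
 */
#if 0
static int
example_user_priority(int pri_kernel, u_int estcpu, int nice)
{
	int pri;

	/* More accumulated CPU time or a higher nice value lowers priority. */
	pri = (pri_kernel - 1) - (int)(estcpu >> ESTCPU_SHIFT) - nice;
	return pri > 0 ? pri : 0;
}
#endif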
/*
* We adjust the priority of the current LWP. The priority of a LWP
* gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu)
* is increased here. The formula for computing priorities will compute a
* different value each time l_estcpu increases. This can cause a switch,
* but unless the priority crosses a PPQ boundary the actual queue will not
* change. The CPU usage estimator ramps up quite quickly when the process
* is running (linearly), and decays away exponentially, at a rate which is
* proportionally slower when the system is busy. The basic principle is
* that the system will 90% forget that the process used a lot of CPU time
* in (5 * loadavg) seconds. This causes the system to favor processes which
* haven't run much recently, and to round-robin among other processes.
*/
void
sched_schedclock(struct lwp *l)
{
if (l->l_class != SCHED_OTHER)
return;
KASSERT(!CURCPU_IDLE_P());
l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM);
lwp_lock(l);
resetpriority(l);
lwp_unlock(l);
}
/*
* sched_proc_fork:
*
* Inherit the parent's scheduler history.
*/
void
sched_proc_fork(struct proc *parent, struct proc *child)
{
lwp_t *pl;
KASSERT(mutex_owned(parent->p_lock));
pl = LIST_FIRST(&parent->p_lwps);
child->p_estcpu_inherited = pl->l_estcpu;
child->p_forktime = sched_pstats_ticks;
}
/*
* sched_proc_exit:
*
* Chargeback parents for the sins of their children.
*/
void
sched_proc_exit(struct proc *parent, struct proc *child)
{
fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
fixpt_t estcpu;
lwp_t *pl, *cl;
/* XXX Only if parent != init?? */
mutex_enter(parent->p_lock);
pl = LIST_FIRST(&parent->p_lwps);
cl = LIST_FIRST(&child->p_lwps);
estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
sched_pstats_ticks - child->p_forktime);
if (cl->l_estcpu > estcpu) {
lwp_lock(pl);
pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu);
lwp_unlock(pl);
}
mutex_exit(parent->p_lock);
}
void
sched_wakeup(struct lwp *l)
{
}
void
sched_slept(struct lwp *l)
{
}
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{
l2->l_estcpu = l1->l_estcpu;
}
void
sched_lwp_collect(struct lwp *t)
{
lwp_t *l;
/* Absorb estcpu value of collected LWP. */
l = curlwp;
lwp_lock(l);
l->l_estcpu += t->l_estcpu;
lwp_unlock(l);
}
void
sched_oncpu(lwp_t *l)
{
}
void
sched_newts(lwp_t *l)
{
}
/*
* Sysctl nodes and initialization.
*/
static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int rttsms = hztoms(sched_rrticks);
node = *rnode;
node.sysctl_data = &rttsms;
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup")
{
const struct sysctlnode *node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sched",
SYSCTL_DESCR("Scheduler options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node == NULL)
return;
sched_rrticks = hz / 10;
sysctl_createv(NULL, 0, &node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "name", NULL,
NULL, 0, __UNCONST("4.4BSD"), 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(NULL, 0, &node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "rtts",
SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
sysctl_sched_rtts, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
}
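/*
 * Illustrative sketch (userland, not part of the kernel): reading the nodes
 * created above with sysctlbyname(3).  The MIB names "kern.sched.name" and
 * "kern.sched.rtts" follow from the sysctl_createv() calls; error handling
 * is kept minimal.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	char name[32];
	int rtts;
	size_t len;

	len = sizeof(name);
	if (sysctlbyname("kern.sched.name", name, &len, NULL, 0) == 0)
		printf("scheduler: %s\n", name);

	len = sizeof(rtts);
	if (sysctlbyname("kern.sched.rtts", &rtts, &len, NULL, 0) == 0)
		printf("round-robin time quantum: %d ms\n", rtts);

	return 0;
}
#endif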
/* $NetBSD: uvm_aobj.c,v 1.157 2023/02/24 11:03:13 riastradh Exp $ */
/*
* Copyright (c) 1998 Chuck Silvers, Charles D. Cranor and
* Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_aobj.c,v 1.1.2.5 1998/02/06 05:14:38 chs Exp
*/
/*
* uvm_aobj.c: anonymous memory uvm_object pager
*
* author: Chuck Silvers <chuq@chuq.com>
* started: Jan-1998
*
* - design mostly from Chuck Cranor
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_aobj.c,v 1.157 2023/02/24 11:03:13 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_page_array.h>
/*
* An anonymous UVM object (aobj) manages anonymous-memory. In addition to
* keeping the list of resident pages, it may also keep a list of allocated
* swap blocks. Depending on the size of the object, this list is either
* stored in an array (small objects) or in a hash table (large objects).
*
* Lock order
*
* uao_list_lock ->
* uvm_object::vmobjlock
*/
/*
* Note: for hash tables, we break the address space of the aobj into blocks
* of UAO_SWHASH_CLUSTER_SIZE pages, which shall be a power of two.
*/
#define UAO_SWHASH_CLUSTER_SHIFT 4
#define UAO_SWHASH_CLUSTER_SIZE (1 << UAO_SWHASH_CLUSTER_SHIFT)
/* Get the "tag" for this page index. */
#define UAO_SWHASH_ELT_TAG(idx) ((idx) >> UAO_SWHASH_CLUSTER_SHIFT)
#define UAO_SWHASH_ELT_PAGESLOT_IDX(idx) \
((idx) & (UAO_SWHASH_CLUSTER_SIZE - 1))
/* Given an ELT and a page index, find the swap slot. */
#define UAO_SWHASH_ELT_PAGESLOT(elt, idx) \
((elt)->slots[UAO_SWHASH_ELT_PAGESLOT_IDX(idx)])
/* Given an ELT, return its pageidx base. */
#define UAO_SWHASH_ELT_PAGEIDX_BASE(ELT) \
((elt)->tag << UAO_SWHASH_CLUSTER_SHIFT)
/* The hash function. */
#define UAO_SWHASH_HASH(aobj, idx) \
(&(aobj)->u_swhash[(((idx) >> UAO_SWHASH_CLUSTER_SHIFT) \
& (aobj)->u_swhashmask)])
/*
* The threshold which determines whether we will use an array or a
* hash table to store the list of allocated swap blocks.
*/
#define UAO_SWHASH_THRESHOLD (UAO_SWHASH_CLUSTER_SIZE * 4)
#define UAO_USES_SWHASH(aobj) \
((aobj)->u_pages > UAO_SWHASH_THRESHOLD)
/* The number of buckets in a hash, with an upper bound. */
#define UAO_SWHASH_MAXBUCKETS 256
#define UAO_SWHASH_BUCKETS(aobj) \
(MIN((aobj)->u_pages >> UAO_SWHASH_CLUSTER_SHIFT, UAO_SWHASH_MAXBUCKETS))
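/*
 * Illustrative sketch (not compiled into the kernel): how a page index
 * decomposes under the macros above.  With UAO_SWHASH_CLUSTER_SHIFT == 4
 * each hash element covers 16 pages; the sample page index is arbitrary.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	int pageidx = 0x123;
	int tag = pageidx >> 4;		/* cf. UAO_SWHASH_ELT_TAG */
	int slotidx = pageidx & 15;	/* cf. UAO_SWHASH_ELT_PAGESLOT_IDX */

	/* Prints: pageidx 0x123 -> tag 0x12, slot index 3 */
	printf("pageidx %#x -> tag %#x, slot index %d\n",
	    pageidx, tag, slotidx);
	return 0;
}
#endif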
/*
* uao_swhash_elt: when a hash table is being used, this structure defines
* the format of an entry in the bucket list.
*/
struct uao_swhash_elt {
LIST_ENTRY(uao_swhash_elt) list; /* the hash list */
voff_t tag; /* our 'tag' */
int count; /* our number of active slots */
int slots[UAO_SWHASH_CLUSTER_SIZE]; /* the slots */
};
/*
* uao_swhash: the swap hash table structure
*/
LIST_HEAD(uao_swhash, uao_swhash_elt);
/*
* uao_swhash_elt_pool: pool of uao_swhash_elt structures.
* Note: pages for this pool must not come from a pageable kernel map.
*/
static struct pool uao_swhash_elt_pool __cacheline_aligned;
/*
* uvm_aobj: the actual anon-backed uvm_object
*
* => the uvm_object is at the top of the structure, this allows
* (struct uvm_aobj *) == (struct uvm_object *)
* => only one of u_swslots and u_swhash is used in any given aobj
*/
struct uvm_aobj {
struct uvm_object u_obj; /* has: lock, pgops, #pages, #refs */
pgoff_t u_pages; /* number of pages in entire object */
int u_flags; /* the flags (see uvm_aobj.h) */
int *u_swslots; /* array of offset->swapslot mappings */
/*
* hashtable of offset->swapslot mappings
* (u_swhash is an array of bucket heads)
*/
struct uao_swhash *u_swhash;
u_long u_swhashmask; /* mask for hashtable */
LIST_ENTRY(uvm_aobj) u_list; /* global list of aobjs */
int u_freelist; /* freelist to allocate pages from */
};
static void uao_free(struct uvm_aobj *);
static int uao_get(struct uvm_object *, voff_t, struct vm_page **,
int *, int, vm_prot_t, int, int);
static int uao_put(struct uvm_object *, voff_t, voff_t, int);
#if defined(VMSWAP)
static struct uao_swhash_elt *uao_find_swhash_elt
(struct uvm_aobj *, int, bool);
static bool uao_pagein(struct uvm_aobj *, int, int);
static bool uao_pagein_page(struct uvm_aobj *, int);
#endif /* defined(VMSWAP) */
static struct vm_page *uao_pagealloc(struct uvm_object *, voff_t, int);
/*
* aobj_pager
*
* note that some functions (e.g. put) are handled elsewhere
*/
const struct uvm_pagerops aobj_pager = {
.pgo_reference = uao_reference,
.pgo_detach = uao_detach,
.pgo_get = uao_get,
.pgo_put = uao_put,
};
/*
* uao_list: global list of active aobjs, locked by uao_list_lock
*/
static LIST_HEAD(aobjlist, uvm_aobj) uao_list __cacheline_aligned;
static kmutex_t uao_list_lock __cacheline_aligned;
/*
* hash table/array related functions
*/
#if defined(VMSWAP)
/*
* uao_find_swhash_elt: find (or create) a hash table entry for a page
* offset.
*
* => the object should be locked by the caller
*/
static struct uao_swhash_elt *
uao_find_swhash_elt(struct uvm_aobj *aobj, int pageidx, bool create)
{
struct uao_swhash *swhash;
struct uao_swhash_elt *elt;
voff_t page_tag;
swhash = UAO_SWHASH_HASH(aobj, pageidx);
page_tag = UAO_SWHASH_ELT_TAG(pageidx);
/*
* now search the bucket for the requested tag
*/
LIST_FOREACH(elt, swhash, list) {
if (elt->tag == page_tag) {
return elt;
}
}
if (!create) {
return NULL;
}
/*
* allocate a new entry for the bucket and init/insert it in
*/
elt = pool_get(&uao_swhash_elt_pool, PR_NOWAIT);
if (elt == NULL) {
return NULL;
}
LIST_INSERT_HEAD(swhash, elt, list);
elt->tag = page_tag;
elt->count = 0;
memset(elt->slots, 0, sizeof(elt->slots));
return elt;
}
/*
* uao_find_swslot: find the swap slot number for an aobj/pageidx
*
* => object must be locked by caller
*/
int
uao_find_swslot(struct uvm_object *uobj, int pageidx)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash_elt *elt;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/*
* if noswap flag is set, then we never return a slot
*/
if (aobj->u_flags & UAO_FLAG_NOSWAP)
return 0;
/*
* if hashing, look in hash table.
*/
if (UAO_USES_SWHASH(aobj)) {
elt = uao_find_swhash_elt(aobj, pageidx, false);
return elt ? UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) : 0;
}
/*
* otherwise, look in the array
*/
return aobj->u_swslots[pageidx];
}
/*
* uao_set_swslot: set the swap slot for a page in an aobj.
*
* => setting a slot to zero frees the slot
* => object must be locked by caller
* => we return the old slot number, or -1 if we failed to allocate
* memory to record the new slot number
*/
int
uao_set_swslot(struct uvm_object *uobj, int pageidx, int slot)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash_elt *elt;
int oldslot;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "aobj %#jx pageidx %jd slot %jd",
(uintptr_t)aobj, pageidx, slot, 0);
KASSERT(rw_write_held(uobj->vmobjlock) || uobj->uo_refs == 0);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/*
* if noswap flag is set, then we can't set a non-zero slot.
*/
if (aobj->u_flags & UAO_FLAG_NOSWAP) {
KASSERTMSG(slot == 0, "uao_set_swslot: no swap object");
return 0;
}
/*
* are we using a hash table? if so, add it in the hash.
*/
if (UAO_USES_SWHASH(aobj)) {
/*
* Avoid allocating an entry just to free it again if
* the page had no swap slot in the first place, and
* we are freeing.
*/
elt = uao_find_swhash_elt(aobj, pageidx, slot != 0);
if (elt == NULL) {
return slot ? -1 : 0;
}
oldslot = UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) = slot;
/*
* now adjust the elt's reference counter and free it if we've
* dropped it to zero.
*/
if (slot) {
if (oldslot == 0)
elt->count++;
} else {
if (oldslot)
elt->count--;
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
} else {
/* we are using an array */
oldslot = aobj->u_swslots[pageidx];
aobj->u_swslots[pageidx] = slot;
}
return oldslot;
}
#endif /* defined(VMSWAP) */
/*
* end of hash/array functions
*/
/*
* uao_free: free all resources held by an aobj, and then free the aobj
*
* => the aobj should be dead
*/
static void
uao_free(struct uvm_aobj *aobj)
{
struct uvm_object *uobj = &aobj->u_obj;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
uao_dropswap_range(uobj, 0, 0);
rw_exit(uobj->vmobjlock);
#if defined(VMSWAP)
if (UAO_USES_SWHASH(aobj)) {
/*
* free the hash table itself.
*/
hashdone(aobj->u_swhash, HASH_LIST, aobj->u_swhashmask);
} else {
/*
* free the array itself.
*/
kmem_free(aobj->u_swslots, aobj->u_pages * sizeof(int));
}
#endif /* defined(VMSWAP) */
/*
* finally free the aobj itself
*/
uvm_obj_destroy(uobj, true);
kmem_free(aobj, sizeof(struct uvm_aobj));
}
/*
* pager functions
*/
/*
* uao_create: create an aobj of the given size and return its uvm_object.
*
* => for normal use, flags are always zero
* => for the kernel object, the flags are:
* UAO_FLAG_KERNOBJ - allocate the kernel object (can only happen once)
* UAO_FLAG_KERNSWAP - enable swapping of kernel object (" ")
*/
struct uvm_object *
uao_create(voff_t size, int flags)
{
static struct uvm_aobj kernel_object_store;
static krwlock_t bootstrap_kernel_object_lock;
static int kobj_alloced __diagused = 0;
pgoff_t pages = round_page((uint64_t)size) >> PAGE_SHIFT;
struct uvm_aobj *aobj;
int refs;
/*
* Allocate a new aobj, unless kernel object is requested.
*/
if (flags & UAO_FLAG_KERNOBJ) {
KASSERT(!kobj_alloced);
aobj = &kernel_object_store;
aobj->u_pages = pages;
aobj->u_flags = UAO_FLAG_NOSWAP;
refs = UVM_OBJ_KERN;
kobj_alloced = UAO_FLAG_KERNOBJ;
} else if (flags & UAO_FLAG_KERNSWAP) {
KASSERT(kobj_alloced == UAO_FLAG_KERNOBJ);
aobj = &kernel_object_store;
kobj_alloced = UAO_FLAG_KERNSWAP;
refs = 0xdeadbeaf; /* XXX: gcc */
} else {
aobj = kmem_alloc(sizeof(struct uvm_aobj), KM_SLEEP);
aobj->u_pages = pages;
aobj->u_flags = 0;
refs = 1;
}
/*
* no freelist by default
*/
aobj->u_freelist = VM_NFREELIST;
/*
* allocate hash/array if necessary
*
* note: in the KERNSWAP case there is no need to worry about locking;
* since we are still booting, we should be the only thread around.
*/
const int kernswap = (flags & UAO_FLAG_KERNSWAP) != 0;
if (flags == 0 || kernswap) {
#if defined(VMSWAP)
/* allocate hash table or array depending on object size */
if (UAO_USES_SWHASH(aobj)) {
aobj->u_swhash = hashinit(UAO_SWHASH_BUCKETS(aobj),
HASH_LIST, true, &aobj->u_swhashmask);
} else {
aobj->u_swslots = kmem_zalloc(pages * sizeof(int),
KM_SLEEP);
}
#endif /* defined(VMSWAP) */
/*
* Replace kernel_object's temporary static lock with
* a regular rw_obj. We cannot use uvm_obj_setlock()
* because that would try to free the old lock.
*/
if (kernswap) {
aobj->u_obj.vmobjlock = rw_obj_alloc();
rw_destroy(&bootstrap_kernel_object_lock);
}
if (flags) {
aobj->u_flags &= ~UAO_FLAG_NOSWAP; /* clear noswap */
return &aobj->u_obj;
}
}
/*
* Initialise UVM object.
*/
const bool kernobj = (flags & UAO_FLAG_KERNOBJ) != 0;
uvm_obj_init(&aobj->u_obj, &aobj_pager, !kernobj, refs);
if (__predict_false(kernobj)) {
/* Use a temporary static lock for kernel_object. */
rw_init(&bootstrap_kernel_object_lock);
uvm_obj_setlock(&aobj->u_obj, &bootstrap_kernel_object_lock);
}
/*
* now that aobj is ready, add it to the global list
*/
mutex_enter(&uao_list_lock);
LIST_INSERT_HEAD(&uao_list, aobj, u_list);
mutex_exit(&uao_list_lock);
return(&aobj->u_obj);
}
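/*
 * Illustrative sketch (not compiled): typical in-kernel use of uao_create()
 * above for a regular (non-kernel) aobj.  The function name is made up for
 * illustration; the caller owns the initial reference and releases it with
 * uao_detach(), which frees the object and its pages on the last reference.
 */
#if 0
static void
example_aobj_use(void)
{
	struct uvm_object *uobj;

	/* One megabyte of anonymous, swap-backed memory; flags are zero. */
	uobj = uao_create(1024 * 1024, 0);

	/* ... map it or fault pages in through the object as needed ... */

	/* Drop the reference; with no other holders, the aobj is destroyed. */
	uao_detach(uobj);
}
#endif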
/*
* uao_set_pgfl: allocate pages only from the specified freelist.
*
* => must be called before any pages are allocated for the object.
* => reset by setting it to VM_NFREELIST, meaning any freelist.
*/
void
uao_set_pgfl(struct uvm_object *uobj, int freelist)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
KASSERTMSG((0 <= freelist), "invalid freelist %d", freelist);
KASSERTMSG((freelist <= VM_NFREELIST), "invalid freelist %d",
freelist);
aobj->u_freelist = freelist;
}
/*
* uao_pagealloc: allocate a page for aobj.
*/
static inline struct vm_page *
uao_pagealloc(struct uvm_object *uobj, voff_t offset, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
if (__predict_true(aobj->u_freelist == VM_NFREELIST))
return uvm_pagealloc(uobj, offset, NULL, flags);
else
return uvm_pagealloc_strat(uobj, offset, NULL, flags,
UVM_PGA_STRAT_ONLY, aobj->u_freelist);
}
/*
* uao_init: set up aobj pager subsystem
*
* => called at boot time from uvm_pager_init()
*/
void
uao_init(void)
{
static int uao_initialized;
if (uao_initialized)
return;
uao_initialized = true;
LIST_INIT(&uao_list);
mutex_init(&uao_list_lock, MUTEX_DEFAULT, IPL_NONE);
pool_init(&uao_swhash_elt_pool, sizeof(struct uao_swhash_elt),
0, 0, 0, "uaoeltpl", NULL, IPL_VM);
}
/*
* uao_reference: hold a reference to an anonymous UVM object.
*/
void
uao_reference(struct uvm_object *uobj)
{
/* Kernel object is persistent. */
if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
return;
}
atomic_inc_uint(&uobj->uo_refs);
}
/*
* uao_detach: drop a reference to an anonymous UVM object.
*/
void
uao_detach(struct uvm_object *uobj)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uvm_page_array a;
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* Detaching from kernel object is a NOP.
*/
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
return;
/*
* Drop the reference. If it was the last one, destroy the object.
*/
KASSERT(uobj->uo_refs > 0);
UVMHIST_LOG(maphist," (uobj=%#jx) ref=%jd",
(uintptr_t)uobj, uobj->uo_refs, 0, 0);
membar_release();
if (atomic_dec_uint_nv(&uobj->uo_refs) > 0) {
UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0);
return;
}
membar_acquire();
/*
* Remove the aobj from the global list.
*/
mutex_enter(&uao_list_lock);
LIST_REMOVE(aobj, u_list);
mutex_exit(&uao_list_lock);
/*
* Free all the pages left in the aobj. For each page, when the
* page is no longer busy (and thus after any disk I/O that it is
* involved in is complete), release any swap resources and free
* the page itself.
*/
uvm_page_array_init(&a, uobj, 0);
rw_enter(uobj->vmobjlock, RW_WRITER);
while ((pg = uvm_page_array_fill_and_peek(&a, 0, 0)) != NULL) {
uvm_page_array_advance(&a);
pmap_page_protect(pg, VM_PROT_NONE);
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "uao_det");
uvm_page_array_clear(&a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
uvm_pagefree(pg);
}
uvm_page_array_fini(&a);
/*
* Finally, free the anonymous UVM object itself.
*/
uao_free(aobj);
}
/*
* uao_put: flush pages out of a uvm object
*
* => object should be locked by caller. we may _unlock_ the object
* if (and only if) we need to clean a page (PGO_CLEANIT).
* XXXJRT Currently, however, we don't. In the case of cleaning
* XXXJRT a page, we simply just deactivate it. Should probably
* XXXJRT handle this better, in the future (although "flushing"
* XXXJRT anonymous memory isn't terribly important).
* => if PGO_CLEANIT is not set, then we will neither unlock the object
* nor block.
* => if PGO_ALLPAGE is set, then all pages in the object are valid targets
* for flushing.
* => we return 0 unless we encountered some sort of I/O error
* XXXJRT currently never happens, as we never directly initiate
* XXXJRT I/O
*/
static int
uao_put(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uvm_page_array a;
struct vm_page *pg;
voff_t curoff;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
if (flags & PGO_ALLPAGES) {
start = 0;
stop = aobj->u_pages << PAGE_SHIFT;
} else {
start = trunc_page(start);
if (stop == 0) {
stop = aobj->u_pages << PAGE_SHIFT;
} else {
stop = round_page(stop);
}
if (stop > (uint64_t)(aobj->u_pages << PAGE_SHIFT)) {
printf("uao_put: strange, got an out of range "
"flush %#jx > %#jx (fixed)\n",
(uintmax_t)stop,
(uintmax_t)(aobj->u_pages << PAGE_SHIFT));
stop = aobj->u_pages << PAGE_SHIFT;
}
}
UVMHIST_LOG(maphist,
" flush start=%#jx, stop=%#jx, flags=%#jx",
start, stop, flags, 0);
/*
* Don't need to do any work here if we're not freeing
* or deactivating pages.
*/
if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) {
rw_exit(uobj->vmobjlock);
return 0;
}
/* locked: uobj */
uvm_page_array_init(&a, uobj, 0);
curoff = start;
while ((pg = uvm_page_array_fill_and_peek(&a, curoff, 0)) != NULL) {
if (pg->offset >= stop) {
break;
}
/*
* wait and try again if the page is busy.
*/
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "uao_put");
uvm_page_array_clear(&a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
uvm_page_array_advance(&a);
curoff = pg->offset + PAGE_SIZE;
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
/*
* XXX In these first 3 cases, we always just
* XXX deactivate the page. We may want to
* XXX handle the different cases more specifically
* XXX in the future.
*/
case PGO_CLEANIT|PGO_FREE:
case PGO_CLEANIT|PGO_DEACTIVATE:
case PGO_DEACTIVATE:
deactivate_it:
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
break;
case PGO_FREE:
/*
* If there are multiple references to
* the object, just deactivate the page.
*/
if (uobj->uo_refs > 1)
goto deactivate_it;
/*
* free the swap slot and the page.
*/
pmap_page_protect(pg, VM_PROT_NONE);
/*
* freeing swapslot here is not strictly necessary.
* however, leaving it here doesn't save much
* because we need to update swap accounting anyway.
*/
uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
uvm_pagefree(pg);
break;
default:
panic("%s: impossible", __func__);
}
}
rw_exit(uobj->vmobjlock);
uvm_page_array_fini(&a);
return 0;
}
/*
* uao_get: fetch me a page
*
* we have three cases:
* 1: page is resident -> just return the page.
* 2: page is zero-fill -> allocate a new page and zero it.
* 3: page is swapped out -> fetch the page from swap.
*
* case 1 can be handled with PGO_LOCKED, cases 2 and 3 cannot.
* so, if the "center" page hits case 2/3 then we will need to return EBUSY.
*
* => prefer map unlocked (not required)
* => object must be locked! we will _unlock_ it before starting any I/O.
* => flags: PGO_LOCKED: fault data structures are locked
* => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
* => NOTE: caller must check for released pages!!
*/
static int
uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
{
voff_t current_offset;
struct vm_page *ptmp;
int lcv, gotpages, maxpages, swslot, pageidx;
bool overwrite = ((flags & PGO_OVERWRITE) != 0);
struct uvm_page_array a;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "aobj=%#jx offset=%jd, flags=%#jx",
(uintptr_t)uobj, offset, flags,0);
/*
* the object must be locked. it can only be a read lock when
* processing a read fault with PGO_LOCKED.
*/
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT(rw_write_held(uobj->vmobjlock) ||
((flags & PGO_LOCKED) != 0 && (access_type & VM_PROT_WRITE) == 0));
/*
* get number of pages
*/
maxpages = *npagesp;
/*
* step 1: handle the case where fault data structures are locked.
*/
if (flags & PGO_LOCKED) {
/*
* step 1a: get pages that are already resident. only do
* this if the data structures are locked (i.e. the first
* time through).
*/
uvm_page_array_init(&a, uobj, 0);
gotpages = 0; /* # of pages we got so far */
for (lcv = 0; lcv < maxpages; lcv++) {
ptmp = uvm_page_array_fill_and_peek(&a,
offset + (lcv << PAGE_SHIFT), maxpages);
if (ptmp == NULL) {
break;
}
KASSERT(ptmp->offset >= offset);
lcv = (ptmp->offset - offset) >> PAGE_SHIFT;
if (lcv >= maxpages) {
break;
}
uvm_page_array_advance(&a);
/*
* to be useful must get a non-busy page
*/
if ((ptmp->flags & PG_BUSY) != 0) {
continue;
}
/*
* useful page: plug it in our result array
*/
KASSERT(uvm_pagegetdirty(ptmp) !=
UVM_PAGE_STATUS_CLEAN);
pps[lcv] = ptmp;
gotpages++;
}
uvm_page_array_fini(&a);
/*
* step 1b: now we've either done everything needed or we need
* to unlock and do some waiting or I/O.
*/
UVMHIST_LOG(pdhist, "<- done (done=%jd)",
(pps[centeridx] != NULL), 0,0,0);
*npagesp = gotpages;
return pps[centeridx] != NULL ? 0 : EBUSY;
}
/*
* step 2: get non-resident or busy pages.
* object is locked. data structures are unlocked.
*/
if ((flags & PGO_SYNCIO) == 0) {
goto done;
}
uvm_page_array_init(&a, uobj, 0);
for (lcv = 0, current_offset = offset ; lcv < maxpages ;) {
/*
* we have yet to locate the current page (pps[lcv]). we
* first look for a page that is already at the current offset.
* if we find a page, we check to see if it is busy or
* released. if that is the case, then we sleep on the page
* until it is no longer busy or released and repeat the lookup.
* if the page we found is neither busy nor released, then we
* busy it (so we own it) and plug it into pps[lcv]. we are
* ready to move on to the next page.
*/
ptmp = uvm_page_array_fill_and_peek(&a, current_offset,
maxpages - lcv);
if (ptmp != NULL && ptmp->offset == current_offset) {
/* page is there, see if we need to wait on it */
if ((ptmp->flags & PG_BUSY) != 0) {
UVMHIST_LOG(pdhist,
"sleeping, ptmp->flags %#jx\n",
ptmp->flags,0,0,0);
uvm_pagewait(ptmp, uobj->vmobjlock, "uao_get");
rw_enter(uobj->vmobjlock, RW_WRITER);
uvm_page_array_clear(&a);
continue;
}
/*
* if we get here then the page is resident and
* unbusy. we busy it now (so we own it). if
* overwriting, mark the page dirty up front as
* it will be zapped via an unmanaged mapping.
*/
KASSERT(uvm_pagegetdirty(ptmp) !=
UVM_PAGE_STATUS_CLEAN);
if (overwrite) {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_DIRTY);
}
/* we own it, caller must un-busy */
ptmp->flags |= PG_BUSY;
UVM_PAGE_OWN(ptmp, "uao_get2");
pps[lcv++] = ptmp;
current_offset += PAGE_SIZE;
uvm_page_array_advance(&a);
continue;
} else {
KASSERT(ptmp == NULL || ptmp->offset > current_offset);
}
/*
* not resident. allocate a new busy/fake/clean page in the
* object. if it's in swap we need to do I/O to fill in the
* data, otherwise the page needs to be cleared: if it's not
* destined to be overwritten, then zero it here and now.
*/
pageidx = current_offset >> PAGE_SHIFT;
swslot = uao_find_swslot(uobj, pageidx);
ptmp = uao_pagealloc(uobj, current_offset,
swslot != 0 || overwrite ? 0 : UVM_PGA_ZERO);
/* out of RAM? */
if (ptmp == NULL) {
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "sleeping, ptmp == NULL",0,0,0,0);
uvm_wait("uao_getpage");
rw_enter(uobj->vmobjlock, RW_WRITER);
uvm_page_array_clear(&a);
continue;
}
/*
* if swslot == 0, page hasn't existed before and is zeroed.
* otherwise we have a "fake/busy/clean" page that we just
* allocated. do the needed "i/o", reading from swap.
*/
if (swslot != 0) {
#if defined(VMSWAP)
int error;
UVMHIST_LOG(pdhist, "pagein from swslot %jd",
swslot, 0,0,0);
/*
* page in the swapped-out page.
* unlock object for i/o, relock when done.
*/
uvm_page_array_clear(&a);
rw_exit(uobj->vmobjlock);
error = uvm_swap_get(ptmp, swslot, PGO_SYNCIO);
rw_enter(uobj->vmobjlock, RW_WRITER);
/*
* I/O done. check for errors.
*/
if (error != 0) {
UVMHIST_LOG(pdhist, "<- done (error=%jd)",
error,0,0,0);
/*
* remove the swap slot from the aobj
* and mark the aobj as having no real slot.
* don't free the swap slot, thus preventing
* it from being used again.
*/
swslot = uao_set_swslot(uobj, pageidx,
SWSLOT_BAD);
if (swslot > 0) {
uvm_swap_markbad(swslot, 1);
}
uvm_pagefree(ptmp);
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "<- done (error)",
error,lcv,0,0);
if (lcv != 0) {
uvm_page_unbusy(pps, lcv);
}
memset(pps, 0, maxpages * sizeof(pps[0]));
uvm_page_array_fini(&a);
return error;
}
#else /* defined(VMSWAP) */
panic("%s: pagein", __func__);
#endif /* defined(VMSWAP) */
}
/*
* note that we will allow the page being writably-mapped
* (!PG_RDONLY) regardless of access_type. if overwrite,
* the page can be modified through an unmanaged mapping
* so mark it dirty up front.
*/
if (overwrite) {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_DIRTY);
} else {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_UNKNOWN);
}
/*
* we got the page! clear the fake flag (indicates valid
* data now in page) and plug into our result array. note
* that page is still busy.
*
* it is the callers job to:
* => check if the page is released
* => unbusy the page
* => activate the page
*/
KASSERT(uvm_pagegetdirty(ptmp) != UVM_PAGE_STATUS_CLEAN);
KASSERT((ptmp->flags & PG_FAKE) != 0);
KASSERT(ptmp->offset == current_offset);
ptmp->flags &= ~PG_FAKE;
pps[lcv++] = ptmp;
current_offset += PAGE_SIZE;
}
uvm_page_array_fini(&a);
/*
* finally, unlock object and return.
*/
done:
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "<- done (OK)",0,0,0,0);
return 0;
}
#if defined(VMSWAP)
/*
* uao_dropswap: release any swap resources from this aobj page.
*
* => aobj must be locked or have a reference count of 0.
*/
void
uao_dropswap(struct uvm_object *uobj, int pageidx)
{
int slot;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
slot = uao_set_swslot(uobj, pageidx, 0);
if (slot) {
uvm_swap_free(slot, 1);
}
}
/*
* page in every page in every aobj that is paged-out to a range of swslots.
*
* => nothing should be locked.
* => returns true if pagein was aborted due to lack of memory.
*/
bool
uao_swap_off(int startslot, int endslot)
{
struct uvm_aobj *aobj;
/*
* Walk the list of all anonymous UVM objects. Grab the first.
*/
mutex_enter(&uao_list_lock);
if ((aobj = LIST_FIRST(&uao_list)) == NULL) {
mutex_exit(&uao_list_lock);
return false;
}
uao_reference(&aobj->u_obj);
do {
struct uvm_aobj *nextaobj;
bool rv;
/*
* Prefetch the next object and immediately hold a reference
* on it, so neither the current nor the next entry could
* disappear while we are iterating.
*/
if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) {
uao_reference(&nextaobj->u_obj);
}
mutex_exit(&uao_list_lock);
/*
* Page in all pages in the swap slot range.
*/
rw_enter(aobj->u_obj.vmobjlock, RW_WRITER);
rv = uao_pagein(aobj, startslot, endslot);
rw_exit(aobj->u_obj.vmobjlock);
/* Drop the reference of the current object. */
uao_detach(&aobj->u_obj);
if (rv) {
if (nextaobj) {
uao_detach(&nextaobj->u_obj);
}
return rv;
}
aobj = nextaobj;
mutex_enter(&uao_list_lock);
} while (aobj);
mutex_exit(&uao_list_lock);
return false;
}
/*
* page in any pages from aobj in the given range.
*
* => aobj must be locked and is returned locked.
* => returns true if pagein was aborted due to lack of memory.
*/
static bool
uao_pagein(struct uvm_aobj *aobj, int startslot, int endslot)
{
bool rv;
if (UAO_USES_SWHASH(aobj)) {
struct uao_swhash_elt *elt;
int buck;
restart:
for (buck = aobj->u_swhashmask; buck >= 0; buck--) {
for (elt = LIST_FIRST(&aobj->u_swhash[buck]);
elt != NULL;
elt = LIST_NEXT(elt, list)) {
int i;
for (i = 0; i < UAO_SWHASH_CLUSTER_SIZE; i++) {
int slot = elt->slots[i];
/*
* if the slot isn't in range, skip it.
*/
if (slot < startslot ||
slot >= endslot) {
continue;
}
/*
* process the page,
* then start over on this object
* since the swhash elt
* may have been freed.
*/
rv = uao_pagein_page(aobj,
UAO_SWHASH_ELT_PAGEIDX_BASE(elt) + i);
if (rv) {
return rv;
}
goto restart;
}
}
}
} else {
int i;
for (i = 0; i < aobj->u_pages; i++) {
int slot = aobj->u_swslots[i];
/*
* if the slot isn't in range, skip it
*/
if (slot < startslot || slot >= endslot) {
continue;
}
/*
* process the page.
*/
rv = uao_pagein_page(aobj, i);
if (rv) {
return rv;
}
}
}
return false;
}
/*
* uao_pagein_page: page in a single page from an anonymous UVM object.
*
* => Returns true if pagein was aborted due to lack of memory.
* => Object must be locked and is returned locked.
*/
static bool
uao_pagein_page(struct uvm_aobj *aobj, int pageidx)
{
struct uvm_object *uobj = &aobj->u_obj;
struct vm_page *pg;
int rv, npages;
pg = NULL;
npages = 1;
KASSERT(rw_write_held(uobj->vmobjlock));
rv = uao_get(uobj, (voff_t)pageidx << PAGE_SHIFT, &pg, &npages,
0, VM_PROT_READ | VM_PROT_WRITE, 0, PGO_SYNCIO);
/*
* relock and finish up.
*/
rw_enter(uobj->vmobjlock, RW_WRITER);
switch (rv) {
case 0:
break;
case EIO:
case ERESTART:
/*
* nothing more to do on errors.
* ERESTART can only mean that the anon was freed,
* so again there's nothing to do.
*/
return false;
default:
return true;
}
/*
* ok, we've got the page now.
* mark it as dirty, clear its swslot and un-busy it.
*/
uao_dropswap(&aobj->u_obj, pageidx);
/*
* make sure it's on a page queue.
*/
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
UVM_PAGE_OWN(pg, NULL);
return false;
}
/*
* uao_dropswap_range: drop swapslots in the range.
*
* => aobj must be locked and is returned locked.
* => start is inclusive. end is exclusive.
*/
void
uao_dropswap_range(struct uvm_object *uobj, voff_t start, voff_t end)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int swpgonlydelta = 0;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
if (end == 0) {
end = INT64_MAX;
}
if (UAO_USES_SWHASH(aobj)) {
int i, hashbuckets = aobj->u_swhashmask + 1;
voff_t taghi;
voff_t taglo;
taglo = UAO_SWHASH_ELT_TAG(start);
taghi = UAO_SWHASH_ELT_TAG(end);
for (i = 0; i < hashbuckets; i++) {
struct uao_swhash_elt *elt, *next;
for (elt = LIST_FIRST(&aobj->u_swhash[i]);
elt != NULL;
elt = next) {
int startidx, endidx;
int j;
next = LIST_NEXT(elt, list);
if (elt->tag < taglo || taghi < elt->tag) {
continue;
}
if (elt->tag == taglo) {
startidx =
UAO_SWHASH_ELT_PAGESLOT_IDX(start);
} else {
startidx = 0;
}
if (elt->tag == taghi) {
endidx =
UAO_SWHASH_ELT_PAGESLOT_IDX(end);
} else {
endidx = UAO_SWHASH_CLUSTER_SIZE;
}
for (j = startidx; j < endidx; j++) {
int slot = elt->slots[j];
KASSERT(uvm_pagelookup(&aobj->u_obj,
(UAO_SWHASH_ELT_PAGEIDX_BASE(elt)
+ j) << PAGE_SHIFT) == NULL);
if (slot > 0) {
uvm_swap_free(slot, 1);
swpgonlydelta++;
KASSERT(elt->count > 0);
elt->slots[j] = 0;
elt->count--;
}
}
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
}
} else {
int i;
if (aobj->u_pages < end) {
end = aobj->u_pages;
}
for (i = start; i < end; i++) {
int slot = aobj->u_swslots[i];
if (slot > 0) {
uvm_swap_free(slot, 1);
swpgonlydelta++;
}
}
}
/*
* adjust the counter of pages only in swap for all
* the swap slots we've freed.
*/
if (swpgonlydelta > 0) {
KASSERT(uvmexp.swpgonly >= swpgonlydelta);
atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
}
#endif /* defined(VMSWAP) */
/* $NetBSD: netbsd32_exec_aout.c,v 1.31 2021/01/19 03:20:13 simonb Exp $ */
/* from: NetBSD: exec_aout.c,v 1.15 1996/09/26 23:34:46 cgd Exp */
/*
* Copyright (c) 1998, 2001 Matthew R. Green.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993, 1994 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: netbsd32_exec_aout.c,v 1.31 2021/01/19 03:20:13 simonb Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_aout.h>
#include <sys/resourcevar.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <compat/netbsd32/netbsd32.h>
#ifndef EXEC_AOUT
#define EXEC_AOUT
#endif
#include <compat/netbsd32/netbsd32_exec.h>
#include <machine/frame.h>
#include <machine/netbsd32_machdep.h>
#ifdef COMPAT_NOMID
static int netbsd32_exec_aout_nomid(struct lwp *, struct exec_package *);
#endif
/*
* exec_netbsd32_makecmds(): Check if it's a netbsd32 a.out format
* executable.
*
* Given a lwp pointer and an exec package pointer, see if the referent
* of the epp is in netbsd32 a.out format. Check 'standard' magic
* numbers for this architecture.
*
* This function, in the former case, or the hook, in the latter, is
* responsible for creating a set of vmcmds which can be used to build
* the process's vm space and inserting them into the exec package.
*/
int
exec_netbsd32_makecmds(struct lwp *l, struct exec_package *epp)
{
netbsd32_u_long midmag, magic;
u_short mid;
int error;
struct netbsd32_exec *execp = epp->ep_hdr;
if (epp->ep_hdrvalid < sizeof(struct netbsd32_exec))
return ENOEXEC;
midmag = (netbsd32_u_long)ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0x3ff;
magic = midmag & 0xffff;
midmag = mid << 16 | magic;
/* this is already needed by setup_stack() */
epp->ep_flags |= EXEC_32;
switch (midmag) {
case (NETBSD32_MID_MACHINE << 16) | ZMAGIC:
error = netbsd32_exec_aout_prep_zmagic(l, epp);
break;
case (NETBSD32_MID_MACHINE << 16) | NMAGIC:
error = netbsd32_exec_aout_prep_nmagic(l, epp);
break;
case (NETBSD32_MID_MACHINE << 16) | OMAGIC:
error = netbsd32_exec_aout_prep_omagic(l, epp);
break;
default:
#ifdef COMPAT_NOMID
error = netbsd32_exec_aout_nomid(l, epp);
#else
error = ENOEXEC;
#endif
break;
}
if (error) {
kill_vmcmds(&epp->ep_vmcmds);
epp->ep_flags &= ~EXEC_32;
} else
epp->ep_flags &= ~EXEC_TOPDOWN_VM;
return error;
}
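/*
 * Illustrative sketch (not compiled): the a_midmag decoding done in
 * exec_netbsd32_makecmds() above, as a standalone userland program.  The
 * 0x3ff machine-id mask and 16-bit magic field mirror the code above;
 * the sample machine id is made up, and 0413 (octal) is ZMAGIC.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>	/* ntohl()/htonl() in userland */

int
main(void)
{
	/* Header word as stored on disk (big-endian), with a made-up mid. */
	unsigned int a_midmag = htonl((0x8bU << 16) | 0413);
	unsigned int midmag = ntohl(a_midmag);
	unsigned short mid = (midmag >> 16) & 0x3ff;
	unsigned int magic = midmag & 0xffff;

	printf("mid %#x, magic %#o\n", mid, magic);
	return 0;
}
#endif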
/*
* netbsd32_exec_aout_prep_zmagic(): Prepare a 'native' ZMAGIC binary's
* exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
int
netbsd32_exec_aout_prep_zmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
error = vn_marktext(epp->ep_vp);
if (error)
return error;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, 0, VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
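/*
 * Illustrative sketch (not compiled): the ZMAGIC address layout computed in
 * netbsd32_exec_aout_prep_zmagic() above.  AOUT_LDPGSZ is machine-dependent,
 * so a placeholder value is used, and the header field sizes are made up.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned long ldpgsz = 4096;	/* stand-in for AOUT_LDPGSZ */
	unsigned long a_text = 0x4000, a_data = 0x2000, a_bss = 0x1000;

	unsigned long taddr = ldpgsz;		/* ep_taddr: text base */
	unsigned long daddr = taddr + a_text;	/* ep_daddr: data follows text */
	unsigned long dsize = a_data + a_bss;	/* ep_dsize: data + bss */

	printf("text %#lx(+%#lx)  data %#lx(+%#lx)\n",
	    taddr, a_text, daddr, dsize);
	return 0;
}
#endif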
/*
* netbsd32_exec_aout_prep_nmagic(): Prepare a 'native' NMAGIC binary's
* exec package
*/
int
netbsd32_exec_aout_prep_nmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_omagic(): Prepare a 'native' OMAGIC binary's
* exec package
*/
int
netbsd32_exec_aout_prep_omagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
dsize = epp->ep_dsize + execp->a_text - roundup(execp->a_text,
PAGE_SIZE);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
#ifdef COMPAT_NOMID
/*
* netbsd32_exec_aout_prep_oldzmagic():
* Prepare the vmcmds to build a vmspace for an old ZMAGIC
* binary. [386BSD/BSDI/4.4BSD/NetBSD0.8]
*
* Cloned from exec_aout_prep_zmagic() in kern/exec_aout.c; a more verbose
* description of operation is there.
* There were copies of this in the mac68k, hp300, and i386 ports.
*/
static int
netbsd32_exec_aout_prep_oldzmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
error = vn_marktext(epp->ep_vp);
if (error)
return error;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, PAGE_SIZE, /* XXX CLBYTES? */
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_data,
epp->ep_daddr, epp->ep_vp,
execp->a_text + PAGE_SIZE, /* XXX CLBYTES? */
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_oldnmagic():
* Prepare the vmcmds to build a vmspace for an old NMAGIC
* binary. [BSDI]
*
* Cloned from exec_aout_prep_nmagic() in kern/exec_aout.c; with text starting
* at 0.
* XXX: There must be a better way to share this code.
*/
static int
netbsd32_exec_aout_prep_oldnmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_oldomagic():
* Prepare the vmcmds to build a vmspace for an old OMAGIC
* binary. [BSDI]
*
* Cloned from exec_aout_prep_omagic() in kern/exec_aout.c; with text starting
* at 0.
* XXX: There must be a better way to share this code.
*/
static int
netbsd32_exec_aout_prep_oldomagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
dsize = epp->ep_dsize + execp->a_text - roundup(execp->a_text,
PAGE_SIZE);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
static int
netbsd32_exec_aout_nomid(struct lwp *l, struct exec_package *epp)
{
int error;
u_long midmag, magic;
u_short mid;
struct exec *execp = epp->ep_hdr;
/* check on validity of epp->ep_hdr performed by exec_aout_makecmds */
midmag = ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0xffff;
magic = midmag & 0xffff;
if (magic == 0) {
magic = (execp->a_midmag & 0xffff);
mid = MID_ZERO;
}
midmag = mid << 16 | magic;
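/*
 * Worked example (assuming a little-endian host): an old 386BSD ZMAGIC
 * header stores only the bare magic 0413 (0x010b) in host byte order
 * with no machine ID, so ntohl() above yields 0x0b010000 and the low
 * 16 bits come out as zero. The fallback then re-reads the magic in
 * host order and assigns MID_ZERO, giving (MID_ZERO << 16) | ZMAGIC
 * for the switch below.
 */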
switch (midmag) {
case (MID_ZERO << 16) | ZMAGIC:
/*
* 386BSD's ZMAGIC format:
*/
return netbsd32_exec_aout_prep_oldzmagic(l, epp);
break;
case (MID_ZERO << 16) | QMAGIC:
/*
* BSDI's QMAGIC format:
* same as new ZMAGIC format, but with different magic number
*/
return netbsd32_exec_aout_prep_zmagic(l, epp);
break;
case (MID_ZERO << 16) | NMAGIC:
/*
* BSDI's NMAGIC format:
* same as NMAGIC format, but with different magic number
* and with text starting at 0.
*/
return netbsd32_exec_aout_prep_oldnmagic(l, epp);
case (MID_ZERO << 16) | OMAGIC:
/*
* BSDI's OMAGIC format:
* same as OMAGIC format, but with different magic number
* and with text starting at 0.
*/
return netbsd32_exec_aout_prep_oldomagic(l, epp);
default:
return ENOEXEC;
}
return error;
}
#endif
/* $NetBSD: kern_softint.c,v 1.76 2024/03/01 04:32:38 mrg Exp $ */
/*-
* Copyright (c) 2007, 2008, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Generic software interrupt framework.
*
* Overview
*
* The soft interrupt framework provides a mechanism to schedule a
* low priority callback that runs with thread context. It allows
* for dynamic registration of software interrupts, and for fair
* queueing and prioritization of those interrupts. The callbacks
* can be scheduled to run from nearly any point in the kernel: by
* code running with thread context, by code running from a
* hardware interrupt handler, and at any interrupt priority
* level.
*
* Priority levels
*
* Since soft interrupt dispatch can be tied to the underlying
* architecture's interrupt dispatch code, it can be limited
* both by the capabilities of the hardware and the capabilities
* of the interrupt dispatch code itself. The number of priority
* levels is restricted to four. In order of priority (lowest to
* highest) the levels are: clock, bio, net, serial.
*
* The names are symbolic and in isolation do not have any direct
* connection with a particular kind of device activity: they are
* only meant as a guide.
*
* The four priority levels map directly to scheduler priority
* levels, and where the architecture implements 'fast' software
* interrupts, they also map onto interrupt priorities. The
* interrupt priorities are intended to be hidden from machine
* independent code, which should use thread-safe mechanisms to
* synchronize with software interrupts (for example: mutexes).
*
* Capabilities
*
* Software interrupts run with limited machine context. In
* particular, they do not possess any address space context. They
* should not try to operate on user space addresses, or to use
* virtual memory facilities other than those noted as interrupt
* safe.
*
* Unlike hardware interrupts, software interrupts do have thread
* context. They may block on synchronization objects, sleep, and
* resume execution at a later time.
*
* Since software interrupts are a limited resource and run with
* higher priority than most other LWPs in the system, all
* block-and-resume activity by a software interrupt must be kept
* short to allow further processing at that level to continue. By
* extension, code running with process context must take care to
* ensure that any lock that may be taken from a software interrupt
* can not be held for more than a short period of time.
*
* The kernel does not allow software interrupts to use facilities
* or perform actions that may block for a significant amount of
* time. This means that it's not valid for a software interrupt
* to sleep on condition variables or wait for resources to become
* available (for example, memory).
*
* Per-CPU operation
*
* If a soft interrupt is triggered on a CPU, it can only be
* dispatched on the same CPU. Each LWP dedicated to handling a
* soft interrupt is bound to its home CPU, so if the LWP blocks
* and needs to run again, it can only run there. Nearly all data
* structures used to manage software interrupts are per-CPU.
*
* The per-CPU requirement is intended to reduce "ping-pong" of
* cache lines between CPUs: lines occupied by data structures
* used to manage the soft interrupts, and lines occupied by data
* items being passed down to the soft interrupt. As a positive
* side effect, this also means that the soft interrupt dispatch
* code does not need to use spinlocks to synchronize.
*
* Generic implementation
*
* A generic, low performance implementation is provided that
* works across all architectures, with no machine-dependent
* modifications needed. This implementation uses the scheduler,
* and so has a number of restrictions:
*
* 1) The software interrupts are not currently preemptive, so
* must wait for the currently executing LWP to yield the CPU.
* This can introduce latency.
*
* 2) An expensive context switch is required for a software
* interrupt to be handled.
*
* 'Fast' software interrupts
*
* If an architecture defines __HAVE_FAST_SOFTINTS, it implements
* the fast mechanism. Threads running either in the kernel or in
* userspace will be interrupted, but will not be preempted. When
* the soft interrupt completes execution, the interrupted LWP
* is resumed. Interrupt dispatch code must provide the minimum
* level of context necessary for the soft interrupt to block and
* be resumed at a later time. The machine-dependent dispatch
* path looks something like the following:
*
* softintr()
* {
* go to IPL_HIGH if necessary for switch;
* save any necessary registers in a format that can be
* restored by cpu_switchto if the softint blocks;
* arrange for cpu_switchto() to restore into the
* trampoline function;
* identify LWP to handle this interrupt;
* switch to the LWP's stack;
* switch register stacks, if necessary;
* assign new value of curlwp;
* call MI softint_dispatch, passing old curlwp and IPL
* to execute interrupt at;
* switch back to old stack;
* switch back to old register stack, if necessary;
* restore curlwp;
* return to interrupted LWP;
* }
*
* If the soft interrupt blocks, a trampoline function is returned
* to in the context of the interrupted LWP, as arranged for by
* softint():
*
* softint_ret()
* {
* unlock soft interrupt LWP;
* resume interrupt processing, likely returning to
* interrupted LWP or dispatching another, different
* interrupt;
* }
*
* Once the soft interrupt has fired (and even if it has blocked),
* no further soft interrupts at that level will be triggered by
* MI code until the soft interrupt handler has ceased execution.
* If a soft interrupt handler blocks and is resumed, it resumes
* execution as a normal LWP (kthread) and gains VM context. Only
* when it has completed and is ready to fire again will it
* interrupt other threads.
*/
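/*
 * Usage sketch
 *
 *	The following is a minimal, illustrative sketch of how a client
 *	might use this interface; the handler name, softc and handle
 *	are hypothetical and not part of this file.
 *
 *	static void *example_sih;
 *
 *	static void
 *	example_softintr(void *arg)
 *	{
 *		... deferred work: runs with thread context at the
 *		    soft interrupt priority, may block only briefly ...
 *	}
 *
 *	example_sih = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE,
 *	    example_softintr, example_softc);
 *
 *	Work is then deferred, typically from a hardware interrupt
 *	handler (or with preemption otherwise disabled), with:
 *
 *	softint_schedule(example_sih);
 *
 *	and, once the caller commits to issuing no further triggers,
 *	torn down with:
 *
 *	softint_disestablish(example_sih);
 */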
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_softint.c,v 1.76 2024/03/01 04:32:38 mrg Exp $");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/intr.h>
#include <sys/ipi.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/evcnt.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/psref.h>
#include <sys/sdt.h>
#include <uvm/uvm_extern.h>
/* This could overlap with signal info in struct lwp. */
typedef struct softint {
SIMPLEQ_HEAD(, softhand) si_q;
struct lwp *si_lwp;
struct cpu_info *si_cpu;
uintptr_t si_machdep;
struct evcnt si_evcnt;
struct evcnt si_evcnt_block;
volatile int si_active;
int si_ipl;
char si_name[8];
char si_name_block[8+6];
} softint_t;
typedef struct softhand {
SIMPLEQ_ENTRY(softhand) sh_q;
void (*sh_func)(void *);
void *sh_arg;
softint_t *sh_isr;
u_int sh_flags;
u_int sh_ipi_id;
} softhand_t;
typedef struct softcpu {
struct cpu_info *sc_cpu;
softint_t sc_int[SOFTINT_COUNT];
softhand_t sc_hand[1];
} softcpu_t;
static void softint_thread(void *);
u_int softint_bytes = 32768;
u_int softint_timing;
static u_int softint_max;
static kmutex_t softint_lock;
SDT_PROBE_DEFINE4(sdt, kernel, softint, establish,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE1(sdt, kernel, softint, disestablish,
"void *"/*sih*/);
SDT_PROBE_DEFINE2(sdt, kernel, softint, schedule,
"void *"/*sih*/,
"struct cpu_info *"/*ci*/);
SDT_PROBE_DEFINE4(sdt, kernel, softint, entry,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE4(sdt, kernel, softint, return,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
/*
* softint_init_isr:
*
* Initialize a single interrupt level for a single CPU.
*/
static void
softint_init_isr(softcpu_t *sc, const char *desc, pri_t pri, u_int level,
int ipl)
{
struct cpu_info *ci;
softint_t *si;
int error;
si = &sc->sc_int[level];
ci = sc->sc_cpu;
si->si_cpu = ci;
SIMPLEQ_INIT(&si->si_q);
error = kthread_create(pri, KTHREAD_MPSAFE | KTHREAD_INTR |
KTHREAD_IDLE, ci, softint_thread, si, &si->si_lwp,
"soft%s/%u", desc, ci->ci_index);
if (error != 0)
panic("softint_init_isr: error %d", error);
snprintf(si->si_name, sizeof(si->si_name), "%s/%u", desc,
ci->ci_index);
evcnt_attach_dynamic(&si->si_evcnt, EVCNT_TYPE_MISC, NULL,
"softint", si->si_name);
snprintf(si->si_name_block, sizeof(si->si_name_block), "%s block/%u",
desc, ci->ci_index);
evcnt_attach_dynamic(&si->si_evcnt_block, EVCNT_TYPE_MISC, NULL,
"softint", si->si_name_block);
si->si_ipl = ipl;
si->si_lwp->l_private = si;
softint_init_md(si->si_lwp, level, &si->si_machdep);
}
/*
* softint_init:
*
* Initialize per-CPU data structures. Called from mi_cpu_attach().
*/
void
softint_init(struct cpu_info *ci)
{
static struct cpu_info *first;
softcpu_t *sc, *scfirst;
softhand_t *sh, *shmax;
if (first == NULL) {
/* Boot CPU. */
first = ci;
mutex_init(&softint_lock, MUTEX_DEFAULT, IPL_NONE);
softint_bytes = round_page(softint_bytes);
softint_max = (softint_bytes - sizeof(softcpu_t)) /
sizeof(softhand_t);
}
/* Use uvm_km(9) for persistent, page-aligned allocation. */
sc = (softcpu_t *)uvm_km_alloc(kernel_map, softint_bytes, 0,
UVM_KMF_WIRED | UVM_KMF_ZERO);
if (sc == NULL)
panic("softint_init_cpu: cannot allocate memory");
ci->ci_data.cpu_softcpu = sc;
ci->ci_data.cpu_softints = 0;
sc->sc_cpu = ci;
softint_init_isr(sc, "net", PRI_SOFTNET, SOFTINT_NET,
IPL_SOFTNET);
softint_init_isr(sc, "bio", PRI_SOFTBIO, SOFTINT_BIO,
IPL_SOFTBIO);
softint_init_isr(sc, "clk", PRI_SOFTCLOCK, SOFTINT_CLOCK,
IPL_SOFTCLOCK);
softint_init_isr(sc, "ser", PRI_SOFTSERIAL, SOFTINT_SERIAL,
IPL_SOFTSERIAL);
if (first != ci) {
mutex_enter(&softint_lock);
scfirst = first->ci_data.cpu_softcpu;
sh = sc->sc_hand;
memcpy(sh, scfirst->sc_hand, sizeof(*sh) * softint_max);
/* Update pointers for this CPU. */
for (shmax = sh + softint_max; sh < shmax; sh++) {
if (sh->sh_func == NULL)
continue;
sh->sh_isr =
&sc->sc_int[sh->sh_flags & SOFTINT_LVLMASK];
}
mutex_exit(&softint_lock);
}
}
/*
* softint_establish:
*
* Register a software interrupt handler.
*/
void *
softint_establish(u_int flags, void (*func)(void *), void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
softcpu_t *sc;
softhand_t *sh;
u_int level, index;
u_int ipi_id = 0;
void *sih;
level = (flags & SOFTINT_LVLMASK);
KASSERT(level < SOFTINT_COUNT);
KASSERT((flags & SOFTINT_IMPMASK) == 0);
mutex_enter(&softint_lock);
/* Find a free slot. */
sc = curcpu()->ci_data.cpu_softcpu;
for (index = 1; index < softint_max; index++) {
if (sc->sc_hand[index].sh_func == NULL)
break;
}
if (index == softint_max) {
mutex_exit(&softint_lock);
printf("WARNING: softint_establish: table full, "
"increase softint_bytes\n");
return NULL;
}
sih = (void *)((uint8_t *)&sc->sc_hand[index] - (uint8_t *)sc);
if (flags & SOFTINT_RCPU) {
if ((ipi_id = ipi_register(softint_schedule, sih)) == 0) {
mutex_exit(&softint_lock);
return NULL;
}
}
/* Set up the handler on each CPU. */
if (ncpu < 2) {
/* XXX hack for machines with no CPU_INFO_FOREACH() early on */
sc = curcpu()->ci_data.cpu_softcpu;
sh = &sc->sc_hand[index];
sh->sh_isr = &sc->sc_int[level];
sh->sh_func = func;
sh->sh_arg = arg;
sh->sh_flags = flags;
sh->sh_ipi_id = ipi_id;
} else for (CPU_INFO_FOREACH(cii, ci)) {
sc = ci->ci_data.cpu_softcpu;
sh = &sc->sc_hand[index];
sh->sh_isr = &sc->sc_int[level];
sh->sh_func = func;
sh->sh_arg = arg;
sh->sh_flags = flags;
sh->sh_ipi_id = ipi_id;
}
mutex_exit(&softint_lock);
SDT_PROBE4(sdt, kernel, softint, establish, sih, func, arg, flags);
return sih;
}
/*
* softint_disestablish:
*
* Unregister a software interrupt handler. The soft interrupt could
* still be active at this point, but the caller commits not to try
* and trigger it again once this call is made. The caller must not
* hold any locks that could be taken from soft interrupt context,
* because we will wait for the softint to complete if it's still
* running.
*/
void
softint_disestablish(void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
softcpu_t *sc;
softhand_t *sh;
uintptr_t offset;
offset = (uintptr_t)arg;
KASSERT(offset != 0);
KASSERTMSG(offset < softint_bytes, "%"PRIuPTR" %u",
offset, softint_bytes);
/*
* Unregister IPI handler if there is any. Note: there is no need
* to disable preemption here - ID is stable.
*/
sc = curcpu()->ci_data.cpu_softcpu;
sh = (softhand_t *)((uint8_t *)sc + offset);
if (sh->sh_ipi_id) {
ipi_unregister(sh->sh_ipi_id);
}
/*
* Run a dummy softint at the same level on all CPUs and wait for
* completion, to make sure this softint is no longer running
* anywhere.
*/
xc_barrier(XC_HIGHPRI_IPL(sh->sh_isr->si_ipl));
/*
* Notify dtrace probe when the old softint can't be running
* any more, but before it can be recycled for a new softint.
*/
SDT_PROBE1(sdt, kernel, softint, disestablish, arg);
/* Clear the handler on each CPU. */
mutex_enter(&softint_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
sc = ci->ci_data.cpu_softcpu;
sh = (softhand_t *)((uint8_t *)sc + offset);
KASSERT(sh->sh_func != NULL);
sh->sh_func = NULL;
}
mutex_exit(&softint_lock);
}
/*
* softint_schedule:
*
* Trigger a software interrupt. Must be called from a hardware
* interrupt handler, or with preemption disabled (since we are
* using the value of curcpu()).
*/
void
softint_schedule(void *arg)
{
softhand_t *sh;
softint_t *si;
uintptr_t offset;
int s;
SDT_PROBE2(sdt, kernel, softint, schedule, arg, /*ci*/NULL);
/*
* If this assert fires, rather than disabling preemption explicitly
* to make it stop, consider that you are probably using a softint
* when you don't need to.
*/
KASSERT(kpreempt_disabled());
/* Find the handler record for this CPU. */
offset = (uintptr_t)arg;
KASSERT(offset != 0);
KASSERTMSG(offset < softint_bytes, "%"PRIuPTR" %u",
offset, softint_bytes);
sh = (softhand_t *)((uint8_t *)curcpu()->ci_data.cpu_softcpu + offset);
/* If it's already pending there's nothing to do. */
if ((sh->sh_flags & SOFTINT_PENDING) != 0) {
return;
}
/*
* Enqueue the handler into the LWP's pending list.
* If the LWP is completely idle, then make it run.
*/
s = splhigh();
if ((sh->sh_flags & SOFTINT_PENDING) == 0) {
si = sh->sh_isr;
sh->sh_flags |= SOFTINT_PENDING;
SIMPLEQ_INSERT_TAIL(&si->si_q, sh, sh_q);
if (si->si_active == 0) {
si->si_active = 1;
softint_trigger(si->si_machdep);
}
}
splx(s);
}
/*
* softint_schedule_cpu:
*
* Trigger a software interrupt on a target CPU. This invokes
* softint_schedule() for the local CPU or sends an IPI to invoke
* this routine on the remote CPU. Preemption must be disabled.
*/
void
softint_schedule_cpu(void *arg, struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
if (curcpu() != ci) {
const softcpu_t *sc = ci->ci_data.cpu_softcpu;
const uintptr_t offset = (uintptr_t)arg;
const softhand_t *sh;
SDT_PROBE2(sdt, kernel, softint, schedule, arg, ci);
sh = (const softhand_t *)((const uint8_t *)sc + offset);
KASSERT((sh->sh_flags & SOFTINT_RCPU) != 0);
ipi_trigger(sh->sh_ipi_id, ci);
return;
}
/* Just a local CPU. */
softint_schedule(arg);
}
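/*
 * Illustrative sketch (the handle "example_sih" is hypothetical and
 * must have been established with SOFTINT_RCPU for the remote case):
 *
 *	kpreempt_disable();
 *	softint_schedule_cpu(example_sih, ci);
 *	kpreempt_enable();
 */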
/*
* softint_execute:
*
* Invoke handlers for the specified soft interrupt.
* Must be entered at splhigh. Will drop the priority
* to the level specified, but returns back at splhigh.
*/
static inline void
softint_execute(lwp_t *l, int s)
{
softint_t *si = l->l_private;
softhand_t *sh;
KASSERT(si->si_lwp == curlwp);
KASSERT(si->si_cpu == curcpu());
KASSERT(si->si_lwp->l_wchan == NULL);
KASSERT(si->si_active);
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d",
l, l->l_nopreempt);
/*
* Note: due to priority inheritance we may have interrupted a
* higher priority LWP. Since the soft interrupt must be quick
* and is non-preemptable, we don't bother yielding.
*/
while (!SIMPLEQ_EMPTY(&si->si_q)) {
/*
* Pick the longest waiting handler to run. We block
* interrupts but do not lock in order to do this, as
* we are protecting against the local CPU only.
*/
sh = SIMPLEQ_FIRST(&si->si_q);
SIMPLEQ_REMOVE_HEAD(&si->si_q, sh_q);
KASSERT((sh->sh_flags & SOFTINT_PENDING) != 0);
sh->sh_flags ^= SOFTINT_PENDING;
splx(s);
/* Run the handler. */
SDT_PROBE4(sdt, kernel, softint, entry,
((const char *)sh -
(const char *)curcpu()->ci_data.cpu_softcpu),
sh->sh_func, sh->sh_arg, sh->sh_flags);
if (__predict_true((sh->sh_flags & SOFTINT_MPSAFE) != 0)) {
(*sh->sh_func)(sh->sh_arg);
} else {
KERNEL_LOCK(1, l);
(*sh->sh_func)(sh->sh_arg);
KERNEL_UNLOCK_ONE(l);
}
SDT_PROBE4(sdt, kernel, softint, return,
((const char *)sh -
(const char *)curcpu()->ci_data.cpu_softcpu),
sh->sh_func, sh->sh_arg, sh->sh_flags);
/* Diagnostic: check that spin-locks have not leaked. */
KASSERTMSG(curcpu()->ci_mtx_count == 0,
"%s: ci_mtx_count (%d) != 0, sh_func %p\n",
__func__, curcpu()->ci_mtx_count, sh->sh_func);
/* Diagnostic: check that psrefs have not leaked. */
KASSERTMSG(l->l_psrefs == 0, "%s: l_psrefs=%d, sh_func=%p\n",
__func__, l->l_psrefs, sh->sh_func);
/* Diagnostic: check that biglocks have not leaked. */
KASSERTMSG(l->l_blcnt == 0,
"%s: sh_func=%p leaked %d biglocks",
__func__, sh->sh_func, curlwp->l_blcnt);
/* Diagnostic: check that LWP nopreempt remains zero. */
KASSERTMSG(l->l_nopreempt == 0,
"%s: lwp %p nopreempt %d func %p",
__func__, l, l->l_nopreempt, sh->sh_func);
(void)splhigh();
}
PSREF_DEBUG_BARRIER();
CPU_COUNT(CPU_COUNT_NSOFT, 1);
KASSERT(si->si_cpu == curcpu());
KASSERT(si->si_lwp->l_wchan == NULL);
KASSERT(si->si_active);
si->si_evcnt.ev_count++;
si->si_active = 0;
}
/*
* softint_block:
*
* Update statistics when the soft interrupt blocks.
*/
void
softint_block(lwp_t *l)
{
softint_t *si = l->l_private;
KASSERT((l->l_pflag & LP_INTR) != 0);
si->si_evcnt_block.ev_count++;
}
#ifndef __HAVE_FAST_SOFTINTS
#ifdef __HAVE_PREEMPTION
#error __HAVE_PREEMPTION requires __HAVE_FAST_SOFTINTS
#endif
/*
* softint_init_md:
*
* Slow path: perform machine-dependent initialization.
*/
void
softint_init_md(lwp_t *l, u_int level, uintptr_t *machdep)
{
struct proc *p;
softint_t *si;
*machdep = (1 << level);
si = l->l_private;
p = l->l_proc;
mutex_enter(p->p_lock);
lwp_lock(l);
/* Cheat and make the KASSERT in softint_thread() happy. */
si->si_active = 1;
setrunnable(l);
/* LWP now unlocked */
mutex_exit(p->p_lock);
}
/*
* softint_trigger:
*
* Slow path: cause a soft interrupt handler to begin executing.
* Called at IPL_HIGH.
*/
void
softint_trigger(uintptr_t machdep)
{
struct cpu_info *ci;
lwp_t *l;
ci = curcpu();
ci->ci_data.cpu_softints |= machdep;
l = ci->ci_onproc;
/*
* Arrange for mi_switch() to be called. If called from interrupt
* mode, we don't know if curlwp is executing in kernel or user, so
* post an AST and have it take a trip through userret(). If not in
* interrupt mode, curlwp is running in kernel and will notice the
* resched soon enough; avoid the AST.
*/
if (l == ci->ci_data.cpu_idlelwp) {
atomic_or_uint(&ci->ci_want_resched,
RESCHED_IDLE | RESCHED_UPREEMPT);
} else {
atomic_or_uint(&ci->ci_want_resched, RESCHED_UPREEMPT);
if (cpu_intr_p()) {
cpu_signotify(l);
}
}
}
/*
* softint_thread:
*
* Slow path: MI software interrupt dispatch.
*/
void
softint_thread(void *cookie)
{
softint_t *si;
lwp_t *l;
int s;
l = curlwp;
si = l->l_private;
for (;;) {
/* Clear pending status and run it. */
s = splhigh();
l->l_cpu->ci_data.cpu_softints &= ~si->si_machdep;
softint_execute(l, s);
splx(s);
/* Interrupts allowed to run again before switching. */
lwp_lock(l);
l->l_stat = LSIDL;
spc_lock(l->l_cpu);
mi_switch(l);
}
}
/*
* softint_picklwp:
*
* Slow path: called from mi_switch() to pick the highest priority
* soft interrupt LWP that needs to run.
*/
lwp_t *
softint_picklwp(void)
{
struct cpu_info *ci;
u_int mask;
softint_t *si;
lwp_t *l;
ci = curcpu();
si = ((softcpu_t *)ci->ci_data.cpu_softcpu)->sc_int;
mask = ci->ci_data.cpu_softints;
if ((mask & (1 << SOFTINT_SERIAL)) != 0) {
l = si[SOFTINT_SERIAL].si_lwp;
} else if ((mask & (1 << SOFTINT_NET)) != 0) {
l = si[SOFTINT_NET].si_lwp;
} else if ((mask & (1 << SOFTINT_BIO)) != 0) {
l = si[SOFTINT_BIO].si_lwp;
} else if ((mask & (1 << SOFTINT_CLOCK)) != 0) {
l = si[SOFTINT_CLOCK].si_lwp;
} else {
panic("softint_picklwp");
}
return l;
}
#else /* !__HAVE_FAST_SOFTINTS */
/*
* softint_thread:
*
* Fast path: the LWP is switched to without restoring any state,
* so we should not arrive here - there is a direct handoff between
* the interrupt stub and softint_dispatch().
*/
void
softint_thread(void *cookie)
{
panic("softint_thread");
}
/*
* softint_dispatch:
*
* Fast path: entry point from machine-dependent code.
*/
void
softint_dispatch(lwp_t *pinned, int s)
{
struct bintime now;
u_int timing;
lwp_t *l;
#ifdef DIAGNOSTIC
if ((pinned->l_pflag & LP_RUNNING) == 0 || curlwp->l_stat != LSIDL) {
struct lwp *onproc = curcpu()->ci_onproc;
int s2 = splhigh();
printf("curcpu=%d, spl=%d curspl=%d\n"
"onproc=%p => l_stat=%d l_flag=%08x l_cpu=%d\n"
"curlwp=%p => l_stat=%d l_flag=%08x l_cpu=%d\n"
"pinned=%p => l_stat=%d l_flag=%08x l_cpu=%d\n",
cpu_index(curcpu()), s, s2, onproc, onproc->l_stat,
onproc->l_flag, cpu_index(onproc->l_cpu), curlwp,
curlwp->l_stat, curlwp->l_flag,
cpu_index(curlwp->l_cpu), pinned, pinned->l_stat,
pinned->l_flag, cpu_index(pinned->l_cpu));
splx(s2);
panic("softint screwup");
}
#endif
/*
* Note the interrupted LWP, and mark the current LWP as running
* before proceeding. Although this must as a rule be done with
* the LWP locked, at this point no external agents will want to
* modify the interrupt LWP's state.
*/
timing = softint_timing;
l = curlwp;
l->l_switchto = pinned;
l->l_stat = LSONPROC;
/*
* Dispatch the interrupt. If softints are being timed, charge
* for it.
*/
if (timing) {
binuptime(&l->l_stime);
membar_producer(); /* for calcru */
l->l_pflag |= LP_TIMEINTR;
}
l->l_pflag |= LP_RUNNING;
softint_execute(l, s);
if (timing) {
binuptime(&now);
updatertime(l, &now);
l->l_pflag &= ~LP_TIMEINTR;
}
/*
* If we blocked while handling the interrupt, the pinned LWP is
* gone and we are now running as a kthread, so find another LWP to
* run. softint_dispatch() won't be reentered until the priority is
* finally dropped to IPL_NONE on entry to the next LWP on this CPU.
*/
l->l_stat = LSIDL;
if (l->l_switchto == NULL) {
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
/* NOTREACHED */
}
l->l_switchto = NULL;
l->l_pflag &= ~LP_RUNNING;
}
#endif /* !__HAVE_FAST_SOFTINTS */
/* $NetBSD: prop_number.c,v 1.34 2022/08/03 21:13:46 riastradh Exp $ */
/*-
* Copyright (c) 2006, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "prop_object_impl.h"
#include <prop/prop_number.h>
#include <sys/rbtree.h>
#if defined(_KERNEL)
#include <sys/systm.h>
#elif defined(_STANDALONE)
#include <sys/param.h>
#include <lib/libkern/libkern.h>
#else
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#endif
struct _prop_number_value {
union {
int64_t pnu_signed;
uint64_t pnu_unsigned;
} pnv_un;
#define pnv_signed pnv_un.pnu_signed
#define pnv_unsigned pnv_un.pnu_unsigned
unsigned int pnv_is_unsigned :1,
:31;
};
struct _prop_number {
struct _prop_object pn_obj;
struct rb_node pn_link;
struct _prop_number_value pn_value;
};
_PROP_POOL_INIT(_prop_number_pool, sizeof(struct _prop_number), "propnmbr")
static _prop_object_free_rv_t
_prop_number_free(prop_stack_t, prop_object_t *);
static bool _prop_number_externalize(
struct _prop_object_externalize_context *,
void *);
static _prop_object_equals_rv_t
_prop_number_equals(prop_object_t, prop_object_t,
void **, void **,
prop_object_t *, prop_object_t *);
static void _prop_number_lock(void);
static void _prop_number_unlock(void);
static const struct _prop_object_type _prop_object_type_number = {
.pot_type = PROP_TYPE_NUMBER,
.pot_free = _prop_number_free,
.pot_extern = _prop_number_externalize,
.pot_equals = _prop_number_equals,
.pot_lock = _prop_number_lock,
.pot_unlock = _prop_number_unlock,
};
#define prop_object_is_number(x) \
((x) != NULL && (x)->pn_obj.po_type == &_prop_object_type_number)
/*
* Number objects are immutable, and we are likely to have many number
* objects that have the same value. So, to save memory, we unique'ify
* numbers so we only have one copy of each.
*/
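/*
 * For example (a sketch, not part of the library itself):
 *
 *	prop_number_t a = prop_number_create_signed(42);
 *	prop_number_t b = prop_number_create_signed(42);
 *
 * yields a == b: the second call finds the existing object in the tree
 * and merely retains it, so each handle must still be released with
 * prop_object_release().
 */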
static int
_prop_number_compare_values(const struct _prop_number_value *pnv1,
const struct _prop_number_value *pnv2)
{
/* Signed numbers are sorted before unsigned numbers. */
if (pnv1->pnv_is_unsigned) {
if (! pnv2->pnv_is_unsigned)
return (1);
if (pnv1->pnv_unsigned < pnv2->pnv_unsigned)
return (-1);
if (pnv1->pnv_unsigned > pnv2->pnv_unsigned)
return (1);
return (0);
}
if (pnv2->pnv_is_unsigned)
return (-1);
if (pnv1->pnv_signed < pnv2->pnv_signed)
return (-1);
if (pnv1->pnv_signed > pnv2->pnv_signed)
return (1);
return (0);
}
static int
/*ARGSUSED*/
_prop_number_rb_compare_nodes(void *ctx _PROP_ARG_UNUSED,
const void *n1, const void *n2)
{
const struct _prop_number *pn1 = n1;
const struct _prop_number *pn2 = n2;
return _prop_number_compare_values(&pn1->pn_value, &pn2->pn_value);
}
static int
/*ARGSUSED*/
_prop_number_rb_compare_key(void *ctx _PROP_ARG_UNUSED,
const void *n, const void *v)
{
const struct _prop_number *pn = n;
const struct _prop_number_value *pnv = v;
return _prop_number_compare_values(&pn->pn_value, pnv);
}
static const rb_tree_ops_t _prop_number_rb_tree_ops = {
.rbto_compare_nodes = _prop_number_rb_compare_nodes,
.rbto_compare_key = _prop_number_rb_compare_key,
.rbto_node_offset = offsetof(struct _prop_number, pn_link),
.rbto_context = NULL
};
static struct rb_tree _prop_number_tree;
_PROP_MUTEX_DECL_STATIC(_prop_number_tree_mutex)
/* ARGSUSED */
static _prop_object_free_rv_t
_prop_number_free(prop_stack_t stack, prop_object_t *obj)
{
prop_number_t pn = *obj;
rb_tree_remove_node(&_prop_number_tree, pn);
_PROP_POOL_PUT(_prop_number_pool, pn);
return (_PROP_OBJECT_FREE_DONE);
}
_PROP_ONCE_DECL(_prop_number_init_once)
static int
_prop_number_init(void)
{
_PROP_MUTEX_INIT(_prop_number_tree_mutex);
rb_tree_init(&_prop_number_tree, &_prop_number_rb_tree_ops);
return 0;
}
static void
_prop_number_lock(void)
{
/* XXX: init necessary? */
_PROP_ONCE_RUN(_prop_number_init_once, _prop_number_init);
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
}
static void
_prop_number_unlock(void)
{
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
}
static bool
_prop_number_externalize(struct _prop_object_externalize_context *ctx,
void *v)
{
prop_number_t pn = v;
char tmpstr[32];
/*
* For unsigned numbers, we output in hex. For signed numbers,
* we output in decimal.
*/
if (pn->pn_value.pnv_is_unsigned)
snprintf(tmpstr, sizeof(tmpstr), "0x%" PRIx64,
pn->pn_value.pnv_unsigned);
else
snprintf(tmpstr, sizeof(tmpstr), "%" PRIi64,
pn->pn_value.pnv_signed);
if (_prop_object_externalize_start_tag(ctx, "integer") == false ||
_prop_object_externalize_append_cstring(ctx, tmpstr) == false ||
_prop_object_externalize_end_tag(ctx, "integer") == false)
return (false);
return (true);
}
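/*
 * For example, an unsigned number holding 3735928559 is externalized
 * as <integer>0xdeadbeef</integer>, while a signed -42 becomes
 * <integer>-42</integer>.
 */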
/* ARGSUSED */
static _prop_object_equals_rv_t
_prop_number_equals(prop_object_t v1, prop_object_t v2,
void **stored_pointer1, void **stored_pointer2,
prop_object_t *next_obj1, prop_object_t *next_obj2)
{
prop_number_t num1 = v1;
prop_number_t num2 = v2;
/*
* There is only ever one copy of a number object at any given
* time, so we can reduce this to a simple pointer equality check
* in the common case.
*/
if (num1 == num2)
return (_PROP_OBJECT_EQUALS_TRUE);
/*
* If the numbers are the same signed-ness, then we know they
* cannot be equal because they would have had pointer equality.
*/
if (num1->pn_value.pnv_is_unsigned == num2->pn_value.pnv_is_unsigned)
return (_PROP_OBJECT_EQUALS_FALSE);
/*
* We now have one signed value and one unsigned value. We can
* compare them iff:
* - The unsigned value is not larger than the signed value
* can represent.
* - The signed value is not smaller than the unsigned value
* can represent.
*/
if (num1->pn_value.pnv_is_unsigned) {
/*
* num1 is unsigned and num2 is signed.
*/
if (num1->pn_value.pnv_unsigned > INTMAX_MAX)
return (_PROP_OBJECT_EQUALS_FALSE);
if (num2->pn_value.pnv_signed < 0)
return (_PROP_OBJECT_EQUALS_FALSE);
} else {
/*
* num1 is signed and num2 is unsigned.
*/
if (num1->pn_value.pnv_signed < 0)
return (_PROP_OBJECT_EQUALS_FALSE);
if (num2->pn_value.pnv_unsigned > INTMAX_MAX)
return (_PROP_OBJECT_EQUALS_FALSE);
}
if (num1->pn_value.pnv_signed == num2->pn_value.pnv_signed)
return _PROP_OBJECT_EQUALS_TRUE;
else
return _PROP_OBJECT_EQUALS_FALSE;
}
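/*
 * For example, an unsigned number holding 5 compares equal to a signed
 * number holding 5, while an unsigned number holding a value greater
 * than INTMAX_MAX can never compare equal to any signed number.
 */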
static prop_number_t
_prop_number_alloc(const struct _prop_number_value *pnv)
{
prop_number_t opn, pn, rpn;
_PROP_ONCE_RUN(_prop_number_init_once, _prop_number_init);
/*
* Check to see if this already exists in the tree. If it does,
* we just retain it and return it.
*/
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
opn = rb_tree_find_node(&_prop_number_tree, pnv);
if (opn != NULL) {
prop_object_retain(opn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
return (opn);
}
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
/*
* Not in the tree. Create it now.
*/
pn = _PROP_POOL_GET(_prop_number_pool);
if (pn == NULL)
return (NULL);
_prop_object_init(&pn->pn_obj, &_prop_object_type_number);
pn->pn_value = *pnv;
/*
* We dropped the mutex when we allocated the new object, so
* we have to check again if it is in the tree.
*/
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
opn = rb_tree_find_node(&_prop_number_tree, pnv);
if (opn != NULL) {
prop_object_retain(opn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
_PROP_POOL_PUT(_prop_number_pool, pn);
return (opn);
}
rpn = rb_tree_insert_node(&_prop_number_tree, pn);
_PROP_ASSERT(rpn == pn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
return (rpn);
}
/*
* prop_number_create_signed --
* Create a prop_number_t and initialize it with the
* provided signed value.
*/
prop_number_t
prop_number_create_signed(intmax_t val)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
pnv.pnv_signed = val;
pnv.pnv_is_unsigned = false;
return (_prop_number_alloc(&pnv));
}
_PROP_DEPRECATED(prop_number_create_integer,
"this program uses prop_number_create_integer(), "
"which is deprecated; use prop_number_create_signed() instead.")
prop_number_t
prop_number_create_integer(int64_t val)
{
return prop_number_create_signed(val);
}
/*
* prop_number_create_unsigned --
* Create a prop_number_t and initialize it with the
* provided unsigned value.
*/
prop_number_t
prop_number_create_unsigned(uintmax_t val)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
pnv.pnv_unsigned = val;
pnv.pnv_is_unsigned = true;
return (_prop_number_alloc(&pnv));
}
_PROP_DEPRECATED(prop_number_create_unsigned_integer,
"this program uses prop_number_create_unsigned_integer(), "
"which is deprecated; use prop_number_create_unsigned() instead.")
prop_number_t
prop_number_create_unsigned_integer(uint64_t val)
{
return prop_number_create_unsigned(val);
}
/*
* prop_number_copy --
* Copy a prop_number_t.
*/
prop_number_t
prop_number_copy(prop_number_t opn)
{
if (! prop_object_is_number(opn))
return (NULL);
/*
* Because we only ever allocate one object for any given
* value, this can be reduced to a simple retain operation.
*/
prop_object_retain(opn);
return (opn);
}
/*
* prop_number_unsigned --
* Returns true if the prop_number_t has an unsigned value.
*/
bool
prop_number_unsigned(prop_number_t pn)
{
return (pn->pn_value.pnv_is_unsigned);
}
/*
* prop_number_size --
* Return the size, in bits, required to hold the value of
* the specified number.
*/
int
prop_number_size(prop_number_t pn)
{
struct _prop_number_value *pnv;
if (! prop_object_is_number(pn))
return (0);
pnv = &pn->pn_value;
if (pnv->pnv_is_unsigned) {
if (pnv->pnv_unsigned > UINT32_MAX)
return (64);
if (pnv->pnv_unsigned > UINT16_MAX)
return (32);
if (pnv->pnv_unsigned > UINT8_MAX)
return (16);
return (8);
}
if (pnv->pnv_signed > INT32_MAX || pnv->pnv_signed < INT32_MIN)
return (64);
if (pnv->pnv_signed > INT16_MAX || pnv->pnv_signed < INT16_MIN)
return (32);
if (pnv->pnv_signed > INT8_MAX || pnv->pnv_signed < INT8_MIN)
return (16);
return (8);
}
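/*
 * For example, a signed number created from 300 needs 16 bits (it
 * exceeds INT8_MAX but fits in an int16_t), while an unsigned value
 * greater than UINT32_MAX needs 64.
 */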
/*
* prop_number_signed_value --
* Get the signed value of a prop_number_t.
*/
intmax_t
prop_number_signed_value(prop_number_t pn)
{
/*
* XXX Impossible to distinguish between "not a prop_number_t"
* XXX and "prop_number_t has a value of 0".
*/
if (! prop_object_is_number(pn))
return (0);
return (pn->pn_value.pnv_signed);
}
_PROP_DEPRECATED(prop_number_integer_value,
"this program uses prop_number_integer_value(), "
"which is deprecated; use prop_number_signed_value() instead.")
int64_t
prop_number_integer_value(prop_number_t pn)
{
return prop_number_signed_value(pn);
}
/*
* prop_number_unsigned_value --
* Get the unsigned value of a prop_number_t.
*/
uintmax_t
prop_number_unsigned_value(prop_number_t pn)
{
/*
* XXX Impossible to distinguish between "not a prop_number_t"
* XXX and "prop_number_t has a value of 0".
*/
if (! prop_object_is_number(pn))
return (0);
return (pn->pn_value.pnv_unsigned);
}
_PROP_DEPRECATED(prop_number_unsigned_integer_value,
"this program uses prop_number_unsigned_integer_value(), "
"which is deprecated; use prop_number_unsigned_value() instead.")
uint64_t
prop_number_unsigned_integer_value(prop_number_t pn)
{
return prop_number_unsigned_value(pn);
}
/*
* prop_number_[...]_value --
* Retrieve the bounds-checked value as the specified type.
* Returns true if successful.
*/
#define TEMPLATE(name, typ, minv, maxv) \
bool \
prop_number_ ## name ## _value(prop_number_t pn, typ * const valp) \
{ \
\
if (! prop_object_is_number(pn)) \
return (false); \
\
if (pn->pn_value.pnv_is_unsigned) { \
if (pn->pn_value.pnv_unsigned > (maxv)) \
return (false); \
*valp = (typ) pn->pn_value.pnv_unsigned; \
} else { \
if ((pn->pn_value.pnv_signed > 0 && \
(uintmax_t)pn->pn_value.pnv_signed > (maxv)) || \
pn->pn_value.pnv_signed < (minv)) \
return (false); \
*valp = (typ) pn->pn_value.pnv_signed; \
} \
\
return (true); \
}
TEMPLATE(schar, signed char, SCHAR_MIN, SCHAR_MAX)
TEMPLATE(short, short, SHRT_MIN, SHRT_MAX)
TEMPLATE(int, int, INT_MIN, INT_MAX)
TEMPLATE(long, long, LONG_MIN, LONG_MAX)
TEMPLATE(longlong, long long, LLONG_MIN, LLONG_MAX)
TEMPLATE(intptr, intptr_t, INTPTR_MIN, INTPTR_MAX)
TEMPLATE(int8, int8_t, INT8_MIN, INT8_MAX)
TEMPLATE(int16, int16_t, INT16_MIN, INT16_MAX)
TEMPLATE(int32, int32_t, INT32_MIN, INT32_MAX)
TEMPLATE(int64, int64_t, INT64_MIN, INT64_MAX)
TEMPLATE(uchar, unsigned char, 0, UCHAR_MAX)
TEMPLATE(ushort, unsigned short, 0, USHRT_MAX)
TEMPLATE(uint, unsigned int, 0, UINT_MAX)
TEMPLATE(ulong, unsigned long, 0, ULONG_MAX)
TEMPLATE(ulonglong, unsigned long long, 0, ULLONG_MAX)
TEMPLATE(uintptr, uintptr_t, 0, UINTPTR_MAX)
TEMPLATE(uint8, uint8_t, 0, UINT8_MAX)
TEMPLATE(uint16, uint16_t, 0, UINT16_MAX)
TEMPLATE(uint32, uint32_t, 0, UINT32_MAX)
TEMPLATE(uint64, uint64_t, 0, UINT64_MAX)
#undef TEMPLATE
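/*
 * Usage sketch for the generated accessors (values illustrative):
 *
 *	prop_number_t pn = prop_number_create_signed(300);
 *	uint8_t u8;
 *	int16_t s16;
 *
 *	prop_number_uint8_value(pn, &u8);	returns false: 300 > UINT8_MAX
 *	prop_number_int16_value(pn, &s16);	returns true, s16 == 300
 *
 *	prop_object_release(pn);
 */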
/*
* prop_number_equals --
* Return true if two numbers are equivalent.
*/
bool
prop_number_equals(prop_number_t num1, prop_number_t num2)
{
if (!prop_object_is_number(num1) || !prop_object_is_number(num2))
return (false);
return (prop_object_equals(num1, num2));
}
/*
* prop_number_equals_signed --
* Return true if the number is equivalent to the specified signed
* value.
*/
bool
prop_number_equals_signed(prop_number_t pn, intmax_t val)
{
if (! prop_object_is_number(pn))
return (false);
if (pn->pn_value.pnv_is_unsigned &&
(pn->pn_value.pnv_unsigned > INTMAX_MAX || val < 0))
return (false);
return (pn->pn_value.pnv_signed == val);
}
_PROP_DEPRECATED(prop_number_equals_integer,
"this program uses prop_number_equals_integer(), "
"which is deprecated; use prop_number_equals_signed() instead.")
bool
prop_number_equals_integer(prop_number_t pn, int64_t val)
{
return prop_number_equals_signed(pn, val);
}
/*
* prop_number_equals_unsigned --
* Return true if the number is equivalent to the specified
* unsigned value.
*/
bool
prop_number_equals_unsigned(prop_number_t pn, uintmax_t val)
{
if (! prop_object_is_number(pn))
return (false);
if (! pn->pn_value.pnv_is_unsigned &&
(pn->pn_value.pnv_signed < 0 || val > INT64_MAX))
return (false);
return (pn->pn_value.pnv_unsigned == val);
}
_PROP_DEPRECATED(prop_number_equals_unsigned_integer,
"this program uses prop_number_equals_unsigned_integer(), "
"which is deprecated; use prop_number_equals_unsigned() instead.")
bool
prop_number_equals_unsigned_integer(prop_number_t pn, uint64_t val)
{
return prop_number_equals_unsigned(pn, val);
}
static bool
_prop_number_internalize_unsigned(struct _prop_object_internalize_context *ctx,
struct _prop_number_value *pnv)
{
char *cp;
_PROP_ASSERT(/*CONSTCOND*/sizeof(unsigned long long) ==
sizeof(uint64_t));
#ifndef _KERNEL
errno = 0;
#endif
pnv->pnv_unsigned = (uint64_t) strtoull(ctx->poic_cp, &cp, 0);
#ifndef _KERNEL /* XXX can't check for ERANGE in the kernel */
if (pnv->pnv_unsigned == UINT64_MAX && errno == ERANGE)
return (false);
#endif
pnv->pnv_is_unsigned = true;
ctx->poic_cp = cp;
return (true);
}
static bool
_prop_number_internalize_signed(struct _prop_object_internalize_context *ctx,
struct _prop_number_value *pnv)
{
char *cp;
_PROP_ASSERT(/*CONSTCOND*/sizeof(long long) == sizeof(int64_t));
#ifndef _KERNEL
errno = 0;
#endif
pnv->pnv_signed = (int64_t) strtoll(ctx->poic_cp, &cp, 0);
#ifndef _KERNEL /* XXX can't check for ERANGE in the kernel */
if ((pnv->pnv_signed == INT64_MAX || pnv->pnv_signed == INT64_MIN) &&
errno == ERANGE)
return (false);
#endif
pnv->pnv_is_unsigned = false;
ctx->poic_cp = cp;
return (true);
}
/*
* _prop_number_internalize --
* Parse a <number>...</number> and return the object created from
* the external representation.
*/
/* ARGSUSED */
bool
_prop_number_internalize(prop_stack_t stack, prop_object_t *obj,
struct _prop_object_internalize_context *ctx)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
/* No attributes, no empty elements. */
if (ctx->poic_tagattr != NULL || ctx->poic_is_empty_element)
return (true);
/*
* If the first character is '-', then we treat as signed.
* If the first two characters are "0x" (i.e. the number is
* in hex), then we treat as unsigned. Otherwise, we try
* signed first, and if that fails (presumably due to ERANGE),
* then we switch to unsigned.
*/
if (ctx->poic_cp[0] == '-') {
if (_prop_number_internalize_signed(ctx, &pnv) == false)
return (true);
} else if (ctx->poic_cp[0] == '0' && ctx->poic_cp[1] == 'x') {
if (_prop_number_internalize_unsigned(ctx, &pnv) == false)
return (true);
} else {
if (_prop_number_internalize_signed(ctx, &pnv) == false &&
_prop_number_internalize_unsigned(ctx, &pnv) == false)
return (true);
}
if (_prop_object_internalize_find_tag(ctx, "integer",
_PROP_TAG_TYPE_END) == false)
return (true);
*obj = _prop_number_alloc(&pnv);
return (true);
}
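/*
 * For example, "<integer>-42</integer>" is internalized as a signed
 * value and "<integer>0xdeadbeef</integer>" as unsigned, while a
 * decimal constant too large for intmax_t, such as
 * "<integer>18446744073709551615</integer>", falls back to the
 * unsigned parser (outside the kernel, where ERANGE can be detected).
 */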
/* $NetBSD: kern_ktrace_vfs.c,v 1.3 2021/06/29 22:40:53 dholland Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_ktrace_vfs.c,v 1.3 2021/06/29 22:40:53 dholland Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/ktrace.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
/*
* ktrace system call, the part of the ktrace framework that
* explicitly interacts with VFS
*/
/* ARGSUSED */
int
sys_ktrace(struct lwp *l, const struct sys_ktrace_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) fname;
syscallarg(int) ops;
syscallarg(int) facs;
syscallarg(int) pid;
} */
struct vnode *vp = NULL;
file_t *fp = NULL;
struct pathbuf *pb;
int error = 0;
int fd;
if (ktrenter(l))
return EAGAIN;
if (KTROP(SCARG(uap, ops)) != KTROP_CLEAR) {
/*
* an operation which requires a file argument.
*/
error = pathbuf_copyin(SCARG(uap, fname), &pb);
if (error) {
ktrexit(l);
return (error);
}
error = vn_open(NULL, pb, 0, FREAD|FWRITE, 0, &vp, NULL, NULL);
if (error != 0) {
pathbuf_destroy(pb);
ktrexit(l);
return (error);
}
pathbuf_destroy(pb);
VOP_UNLOCK(vp);
if (vp->v_type != VREG) {
vn_close(vp, FREAD|FWRITE, l->l_cred);
ktrexit(l);
return (EACCES);
}
/*
* This uses up a file descriptor slot in the
* tracing process for the duration of this syscall.
* This is not expected to be a problem.
*/
if ((error = fd_allocfile(&fp, &fd)) != 0) {
vn_close(vp, FWRITE, l->l_cred);
ktrexit(l);
return error;
}
fp->f_flag = FWRITE;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_vnode = vp;
vp = NULL;
}
error = ktrace_common(l, SCARG(uap, ops), SCARG(uap, facs),
SCARG(uap, pid), &fp);
if (KTROP(SCARG(uap, ops)) != KTROP_CLEAR)
fd_abort(curproc, fp, fd);
return (error);
}
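/*
 * Userland usage sketch (illustrative only; see ktrace(2) for the
 * authoritative interface):
 *
 *	ktrace("ktrace.out", KTROP_SET, KTRFAC_SYSCALL | KTRFAC_SYSRET,
 *	    getpid());
 *	... traced activity ...
 *	ktrace(NULL, KTROP_CLEAR, 0, getpid());
 *
 * The KTROP_CLEAR case skips the file handling above entirely, which
 * is why fname is not used for it.
 */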
/* $NetBSD: uvm_page.c,v 1.256 2024/03/05 14:33:50 thorpej Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.c 8.3 (Berkeley) 3/21/94
* from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_page.c: page ops.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.256 2024/03/05 14:33:50 thorpej Exp $");
#include "opt_ddb.h"
#include "opt_uvm.h"
#include "opt_uvmhist.h"
#include "opt_readahead.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/radixtree.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <ddb/db_active.h>
#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>
/*
* number of pages per-CPU to reserve for the kernel.
*/
#ifndef UVM_RESERVED_PAGES_PER_CPU
#define UVM_RESERVED_PAGES_PER_CPU 5
#endif
int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
/*
* physical memory size;
*/
psize_t physmem;
/*
* local variables
*/
/*
* these variables record the values returned by vm_page_bootstrap,
* for debugging purposes. The implementation of uvm_pageboot_alloc
* and pmap_startup here also uses them internally.
*/
static vaddr_t virtual_space_start;
static vaddr_t virtual_space_end;
/*
* we allocate an initial number of page colors in uvm_page_init(),
* and remember them. We may re-color pages as cache sizes are
* discovered during the autoconfiguration phase. But we can never
* free the initial set of buckets, since they are allocated using
* uvm_pageboot_alloc().
*/
static size_t recolored_pages_memsize /* = 0 */;
static char *recolored_pages_mem;
/*
* freelist locks - one per bucket.
*/
union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
__cacheline_aligned;
/*
* basic NUMA information.
*/
static struct uvm_page_numa_region {
struct uvm_page_numa_region *next;
paddr_t start;
paddr_t size;
u_int numa_id;
} *uvm_page_numa_region;
#ifdef DEBUG
kmutex_t uvm_zerochecklock __cacheline_aligned;
vaddr_t uvm_zerocheckkva;
#endif /* DEBUG */
/*
* These functions are reserved for uvm(9) internal use and are not
* exported in the header file uvm_physseg.h
*
* Thus they are redefined here.
*/
void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
/* returns a pgs array */
struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
/*
* inline functions
*/
/*
* uvm_pageinsert: insert a page in the object.
*
* => caller must lock object
* => call should have already set pg's object and offset pointers
* and bumped the version counter
*/
static inline void
uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
{
KASSERT(uobj == pg->uobject);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT((pg->flags & PG_TABLED) == 0);
if ((pg->flags & PG_STAT) != 0) {
/* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
if ((pg->flags & PG_FILE) != 0) {
if (uobj->uo_npages == 0) {
struct vnode *vp = (struct vnode *)uobj;
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_PAGES) == 0);
vp->v_iflag |= VI_PAGES;
vholdl(vp);
mutex_exit(vp->v_interlock);
}
if (UVM_OBJ_IS_VTEXT(uobj)) {
cpu_count(CPU_COUNT_EXECPAGES, 1);
}
cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
} else {
cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
}
}
pg->flags |= PG_TABLED;
uobj->uo_npages++;
}
static inline int
uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
{
const uint64_t idx = pg->offset >> PAGE_SHIFT;
int error;
KASSERT(rw_write_held(uobj->vmobjlock));
error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
if (error != 0) {
return error;
}
if ((pg->flags & PG_CLEAN) == 0) {
uvm_obj_page_set_dirty(pg);
}
KASSERT(((pg->flags & PG_CLEAN) == 0) ==
uvm_obj_page_dirty_p(pg));
return 0;
}
/*
* uvm_page_remove: remove page from object.
*
* => caller must lock object
*/
static inline void
uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
{
KASSERT(uobj == pg->uobject);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT(pg->flags & PG_TABLED);
if ((pg->flags & PG_STAT) != 0) {
/* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
if ((pg->flags & PG_FILE) != 0) {
if (uobj->uo_npages == 1) {
struct vnode *vp = (struct vnode *)uobj;
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_PAGES) != 0);
vp->v_iflag &= ~VI_PAGES;
holdrelel(vp);
mutex_exit(vp->v_interlock);
}
if (UVM_OBJ_IS_VTEXT(uobj)) {
cpu_count(CPU_COUNT_EXECPAGES, -1);
}
cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
} else {
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
}
uobj->uo_npages--;
pg->flags &= ~PG_TABLED;
pg->uobject = NULL;
}
static inline void
uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
{
struct vm_page *opg __unused;
KASSERT(rw_write_held(uobj->vmobjlock));
opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
KASSERT(pg == opg);
}
static void
uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
{
int i;
pgb->pgb_nfree = 0;
for (i = 0; i < uvmexp.ncolors; i++) {
LIST_INIT(&pgb->pgb_colors[i]);
}
pgfl->pgfl_buckets[num] = pgb;
}
/*
* uvm_page_init: init the page system. called from uvm_init().
*
* => we return the range of kernel virtual memory in kvm_startp/kvm_endp
*/
void
uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
{
static struct uvm_cpu uvm_boot_cpu __cacheline_aligned;
psize_t freepages, pagecount, bucketsize, n;
struct pgflbucket *pgb;
struct vm_page *pagearray;
char *bucketarray;
uvm_physseg_t bank;
int fl, b;
KASSERT(ncpu <= 1);
/*
* init the page queues and free page queue locks, except the
* free list; we allocate that later (with the initial vm_page
* structures).
*/
curcpu()->ci_data.cpu_uvm = &uvm_boot_cpu;
uvmpdpol_init();
for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
}
/*
* allocate vm_page structures.
*/
/*
* sanity check:
* before calling this function the MD code is expected to register
* some free RAM with the uvm_page_physload() function. our job
* now is to allocate vm_page structures for this memory.
*/
if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
panic("uvm_page_bootstrap: no memory pre-allocated");
/*
* first calculate the number of free pages...
*
* note that we use start/end rather than avail_start/avail_end.
* this allows us to allocate extra vm_page structures in case we
* want to return some memory to the pool after booting.
*/
freepages = 0;
for (bank = uvm_physseg_get_first();
uvm_physseg_valid_p(bank) ;
bank = uvm_physseg_get_next(bank)) {
freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
}
/*
* Let MD code initialize the number of colors, or default
* to 1 color if MD code doesn't care.
*/
if (uvmexp.ncolors == 0)
uvmexp.ncolors = 1;
uvmexp.colormask = uvmexp.ncolors - 1;
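/* ncolors must be a power of two so that colormask works as a bit mask;
* the KASSERT below checks exactly that. */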
KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
/* We always start with only 1 bucket. */
uvm.bucketcount = 1;
/*
* we now know we have (PAGE_SIZE * freepages) bytes of memory we can
* use. for each page of memory we use we need a vm_page structure.
* thus, the total number of pages we can use is the total size of
* the memory divided by the PAGE_SIZE plus the size of the vm_page
* structure. we add one to freepages as a fudge factor to avoid
* truncation errors (since we can only allocate in terms of whole
* pages).
*/
pagecount = ((freepages + 1) << PAGE_SHIFT) /
(PAGE_SIZE + sizeof(struct vm_page));
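/*
 * Worked example (figures illustrative only): with 4 KiB pages and a
 * vm_page of roughly 128 bytes, each managed page costs PAGE_SIZE bytes
 * for the page itself plus sizeof(struct vm_page) for its descriptor,
 * so pagecount comes out a few percent smaller than freepages.
 */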
bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
bucketsize = roundup2(bucketsize, coherency_unit);
bucketarray = (void *)uvm_pageboot_alloc(
bucketsize * VM_NFREELIST +
pagecount * sizeof(struct vm_page));
pagearray = (struct vm_page *)
(bucketarray + bucketsize * VM_NFREELIST);
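/*
 * The single boot-time allocation above is carved up: one bucket per
 * freelist first, followed by the vm_page array itself.
 */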
for (fl = 0; fl < VM_NFREELIST; fl++) {
pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
}
memset(pagearray, 0, pagecount * sizeof(struct vm_page));
/*
* init the freelist cache in the disabled state.
*/
uvm_pgflcache_init();
/*
* init the vm_page structures and put them in the correct place.
*/
/* First init the extent */
for (bank = uvm_physseg_get_first(),
uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
uvm_physseg_valid_p(bank);
bank = uvm_physseg_get_next(bank)) {
n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
uvm_physseg_seg_alloc_from_slab(bank, n);
uvm_physseg_init_seg(bank, pagearray);
/* set up page array pointers */
pagearray += n;
pagecount -= n;
}
/*
* pass up the values of virtual_space_start and
* virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
* layers of the VM.
*/
*kvm_startp = round_page(virtual_space_start);
*kvm_endp = trunc_page(virtual_space_end);
/*
* init various thresholds.
*/
uvmexp.reserve_pagedaemon = 1;
uvmexp.reserve_kernel = vm_page_reserve_kernel;
/*
* done!
*/
uvm.page_init_done = true;
}
/*
* uvm_pgfl_lock: lock all freelist buckets
*/
void
uvm_pgfl_lock(void)
{
int i;
for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
mutex_spin_enter(&uvm_freelist_locks[i].lock);
}
}
/*
* uvm_pgfl_unlock: unlock all freelist buckets
*/
void
uvm_pgfl_unlock(void)
{
int i;
for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
mutex_spin_exit(&uvm_freelist_locks[i].lock);
}
}
/*
* uvm_setpagesize: set the page size
*
* => sets uvmexp.pageshift and uvmexp.pagemask from uvmexp.pagesize.
*/
void
uvm_setpagesize(void)
{
/*
* If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
* to be a constant (indicated by being a non-zero value).
*/
if (uvmexp.pagesize == 0) {
if (PAGE_SIZE == 0)
panic("uvm_setpagesize: uvmexp.pagesize not set");
uvmexp.pagesize = PAGE_SIZE;
}
uvmexp.pagemask = uvmexp.pagesize - 1;
if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
panic("uvm_setpagesize: page size %u (%#x) not a power of two",
uvmexp.pagesize, uvmexp.pagesize);
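/* Derive pageshift as log2(pagesize) by linear search, e.g. 4096 -> 12. */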
for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
break;
}
/*
* uvm_pageboot_alloc: steal memory from physmem for bootstrapping
*/
vaddr_t
uvm_pageboot_alloc(vsize_t size)
{
static bool initialized = false;
vaddr_t addr;
#if !defined(PMAP_STEAL_MEMORY)
vaddr_t vaddr;
paddr_t paddr;
#endif
/*
* on first call to this function, initialize ourselves.
*/
if (initialized == false) {
pmap_virtual_space(&virtual_space_start, &virtual_space_end);
/* round it the way we like it */
virtual_space_start = round_page(virtual_space_start);
virtual_space_end = trunc_page(virtual_space_end);
initialized = true;
}
/* round to page size */
size = round_page(size);
uvmexp.bootpages += atop(size);
#if defined(PMAP_STEAL_MEMORY)
/*
* defer bootstrap allocation to MD code (it may want to allocate
* from a direct-mapped segment). pmap_steal_memory should adjust
* virtual_space_start/virtual_space_end if necessary.
*/
addr = pmap_steal_memory(size, &virtual_space_start,
&virtual_space_end);
return addr;
#else /* !PMAP_STEAL_MEMORY */
/*
* allocate virtual memory for this request
*/
if (virtual_space_start == virtual_space_end ||
(virtual_space_end - virtual_space_start) < size)
panic("uvm_pageboot_alloc: out of virtual space");
addr = virtual_space_start;
#ifdef PMAP_GROWKERNEL
/*
* If the kernel pmap can't map the requested space,
* then allocate more resources for it.
*/
if (uvm_maxkaddr < (addr + size)) {
uvm_maxkaddr = pmap_growkernel(addr + size);
if (uvm_maxkaddr < (addr + size))
panic("uvm_pageboot_alloc: pmap_growkernel() failed");
}
#endif
virtual_space_start += size;
/*
* allocate and mapin physical pages to back new virtual pages
*/
for (vaddr = round_page(addr) ; vaddr < addr + size ;
vaddr += PAGE_SIZE) {
if (!uvm_page_physget(&paddr))
panic("uvm_pageboot_alloc: out of memory");
/*
* Note this memory is no longer managed, so using
* pmap_kenter is safe.
*/
pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
return addr;
#endif /* PMAP_STEAL_MEMORY */
}
#if !defined(PMAP_STEAL_MEMORY)
/*
* uvm_page_physget: "steal" one page from the vm_physmem structure.
*
* => attempt to allocate it off the end of a segment in which the "avail"
* values match the start/end values. if we can't do that, then we
* will advance both values (making them equal, and removing some
* vm_page structures from the non-avail area).
* => return false if out of memory.
*/
/* subroutine: try to allocate from memory chunks on the specified freelist */
static bool uvm_page_physget_freelist(paddr_t *, int);
static bool
uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
{
uvm_physseg_t lcv;
/* pass 1: try allocating from a matching end */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
#else
for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
#endif
{
if (uvm.page_init_done == true)
panic("uvm_page_physget: called _after_ bootstrap");
/* Try to match at front or back on unused segment */
if (uvm_page_physunload(lcv, freelist, paddrp))
return true;
}
/* pass2: forget about matching ends, just allocate something */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
#else
for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
#endif
{
/* Try the front regardless. */
if (uvm_page_physunload_force(lcv, freelist, paddrp))
return true;
}
return false;
}
bool
uvm_page_physget(paddr_t *paddrp)
{
int i;
/* try in the order of freelist preference */
for (i = 0; i < VM_NFREELIST; i++)
if (uvm_page_physget_freelist(paddrp, i) == true)
return (true);
return (false);
}
#endif /* PMAP_STEAL_MEMORY */
paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{
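/*
 * Only the page-aligned part of phys_addr is the physical address proper;
 * the low-order bits are presumably reserved for per-page bookkeeping
 * (cf. the uvm_page_get_freelist()/uvm_page_get_bucket() accessors used
 * elsewhere in this file), so they are masked off here.
 */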
return pg->phys_addr & ~(PAGE_SIZE - 1);
}
/*
* uvm_page_numa_load: load NUMA range description.
*/
void
uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
{
struct uvm_page_numa_region *d;
KASSERT(numa_id < PGFL_MAX_BUCKETS);
d = kmem_alloc(sizeof(*d), KM_SLEEP);
d->start = start;
d->size = size;
d->numa_id = numa_id;
d->next = uvm_page_numa_region;
uvm_page_numa_region = d;
}
/*
* uvm_page_numa_lookup: lookup NUMA node for the given page.
*/
static u_int
uvm_page_numa_lookup(struct vm_page *pg)
{
struct uvm_page_numa_region *d;
static bool warned;
paddr_t pa;
KASSERT(uvm_page_numa_region != NULL);
pa = VM_PAGE_TO_PHYS(pg);
for (d = uvm_page_numa_region; d != NULL; d = d->next) {
if (pa >= d->start && pa < d->start + d->size) {
return d->numa_id;
}
}
if (!warned) {
printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
warned = true;
}
return 0;
}
/*
* uvm_page_redim: adjust freelist dimensions if they have changed.
*/
static void
uvm_page_redim(int newncolors, int newnbuckets)
{
struct pgfreelist npgfl;
struct pgflbucket *opgb, *npgb;
struct pgflist *ohead, *nhead;
struct vm_page *pg;
size_t bucketsize, bucketmemsize, oldbucketmemsize;
int fl, ob, oc, nb, nc, obuckets, ocolors;
char *bucketarray, *oldbucketmem, *bucketmem;
KASSERT(((newncolors - 1) & newncolors) == 0);
/* Anything to do? */
if (newncolors <= uvmexp.ncolors &&
newnbuckets == uvm.bucketcount) {
return;
}
if (uvm.page_init_done == false) {
uvmexp.ncolors = newncolors;
return;
}
bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
bucketsize = roundup2(bucketsize, coherency_unit);
bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
coherency_unit - 1;
bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
ocolors = uvmexp.ncolors;
obuckets = uvm.bucketcount;
/* Freelist cache mustn't be enabled. */
uvm_pgflcache_pause();
/* Make sure we should still do this. */
uvm_pgfl_lock();
if (newncolors <= uvmexp.ncolors &&
newnbuckets == uvm.bucketcount) {
uvm_pgfl_unlock();
uvm_pgflcache_resume();
kmem_free(bucketmem, bucketmemsize);
return;
}
uvmexp.ncolors = newncolors;
uvmexp.colormask = uvmexp.ncolors - 1;
uvm.bucketcount = newnbuckets;
for (fl = 0; fl < VM_NFREELIST; fl++) {
/* Init new buckets in new freelist. */
memset(&npgfl, 0, sizeof(npgfl));
for (nb = 0; nb < newnbuckets; nb++) {
npgb = (struct pgflbucket *)bucketarray;
uvm_page_init_bucket(&npgfl, npgb, nb);
bucketarray += bucketsize;
}
/* Now transfer pages from the old freelist. */
for (nb = ob = 0; ob < obuckets; ob++) {
opgb = uvm.page_free[fl].pgfl_buckets[ob];
for (oc = 0; oc < ocolors; oc++) {
ohead = &opgb->pgb_colors[oc];
while ((pg = LIST_FIRST(ohead)) != NULL) {
LIST_REMOVE(pg, pageq.list);
/*
* Here we decide on the NEW color &
* bucket for the page. For NUMA
* we'll use the info that the
* hardware gave us. For non-NUMA
* we take the physical page frame
* number and cache color into
* account. We do this to try and
* avoid defeating any memory
* interleaving in the hardware.
*/
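/*
 * Concretely, the non-NUMA arithmetic below sends every run of
 * 8 * uvmexp.ncolors physically contiguous pages to one bucket
 * before moving on to the next bucket (modulo newnbuckets).
 */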
KASSERT(
uvm_page_get_bucket(pg) == ob);
KASSERT(fl ==
uvm_page_get_freelist(pg));
if (uvm_page_numa_region != NULL) {
nb = uvm_page_numa_lookup(pg);
} else {
nb = atop(VM_PAGE_TO_PHYS(pg))
/ uvmexp.ncolors / 8
% newnbuckets;
}
uvm_page_set_bucket(pg, nb);
npgb = npgfl.pgfl_buckets[nb];
npgb->pgb_nfree++;
nc = VM_PGCOLOR(pg);
nhead = &npgb->pgb_colors[nc];
LIST_INSERT_HEAD(nhead, pg, pageq.list);
}
}
}
/* Install the new freelist. */
memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
}
/* Unlock and free the old memory. */
oldbucketmemsize = recolored_pages_memsize;
oldbucketmem = recolored_pages_mem;
recolored_pages_memsize = bucketmemsize;
recolored_pages_mem = bucketmem;
uvm_pgfl_unlock();
uvm_pgflcache_resume();
if (oldbucketmemsize) {
kmem_free(oldbucketmem, oldbucketmemsize);
}
/*
* this calls uvm_km_alloc() which may want to hold
* uvm_freelist_lock.
*/
uvm_pager_realloc_emerg();
}
/*
* uvm_page_recolor: Recolor the pages if the new color count is
* larger than the old one.
*/
void
uvm_page_recolor(int newncolors)
{
uvm_page_redim(newncolors, uvm.bucketcount);
}
/*
* uvm_page_rebucket: Determine a bucket structure and redim the free
* lists to match.
*/
void
uvm_page_rebucket(void)
{
u_int min_numa, max_numa, npackage, shift;
struct cpu_info *ci, *ci2, *ci3;
CPU_INFO_ITERATOR cii;
/*
* If we have more than one NUMA node, and the maximum NUMA node ID
* is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
* for free pages.
*/
min_numa = (u_int)-1;
max_numa = 0;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_numa_id < min_numa) {
min_numa = ci->ci_numa_id;
}
if (ci->ci_numa_id > max_numa) {
max_numa = ci->ci_numa_id;
}
}
if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
aprint_debug("UVM: using NUMA allocation scheme\n");
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
}
uvm_page_redim(uvmexp.ncolors, max_numa + 1);
return;
}
/*
* Otherwise we'll go with a scheme to maximise L2/L3 cache locality
* and minimise lock contention. Count the total number of CPU
* packages, and then try to distribute the buckets among CPU
* packages evenly.
*/
npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
/*
* Figure out how to arrange the packages & buckets, and the total
* number of buckets we need. XXX 2 may not be the best factor.
*/
for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
npackage >>= 1;
}
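/*
 * npackage has now been halved "shift" times, so each bucket will end up
 * shared by 1 << shift CPU packages (matching the debug message printed
 * at the end of this function).
 */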
uvm_page_redim(uvmexp.ncolors, npackage);
/*
* Now tell each CPU which bucket to use. In the outer loop, scroll
* through all CPU packages.
*/
npackage = 0;
ci = curcpu();
ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
do {
/*
* In the inner loop, scroll through all CPUs in the package
* and assign the same bucket ID.
*/
ci3 = ci2;
do {
ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
} while (ci3 != ci2);
npackage++;
ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
} while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
aprint_debug("UVM: using package allocation scheme, "
"%d package(s) per bucket\n", 1 << shift);
}
/*
* uvm_cpu_attach: initialize per-CPU data structures.
*/
void
uvm_cpu_attach(struct cpu_info *ci)
{
struct uvm_cpu *ucpu;
/* Already done in uvm_page_init(). */
if (!CPU_IS_PRIMARY(ci)) {
/* Add more reserve pages for this CPU. */
uvmexp.reserve_kernel += vm_page_reserve_kernel;
/* Allocate per-CPU data structures. */
ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
KM_SLEEP);
ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
coherency_unit);
ci->ci_data.cpu_uvm = ucpu;
} else {
ucpu = ci->ci_data.cpu_uvm;
}
uvmpdpol_init_cpu(ucpu);
}
/*
* uvm_availmem: fetch the total amount of free memory in pages. this can
* have a detrimental effect on performance due to false sharing; don't call
* unless needed.
*
* some users can request the amount of free memory so often that it begins
* to impact upon performance. if calling frequently and an inexact value
* is okay, call with cached = true.
*/
int
uvm_availmem(bool cached)
{
int64_t fp;
cpu_count_sync(cached);
if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
/*
* XXXAD could briefly go negative because it's impossible
* to get a clean snapshot. address this for other counters
* used as running totals before NetBSD 10 although less
* important for those.
*/
fp = 0;
}
return (int)fp;
}
/*
* uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
* specific freelist and specific bucket only.
*
* => must be at IPL_VM or higher to protect per-CPU data structures.
*/
static struct vm_page *
uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
{
int c, trycolor, colormask;
struct pgflbucket *pgb;
struct vm_page *pg;
kmutex_t *lock;
bool fill;
/*
* Skip the bucket if empty, no lock needed. There could be many
* empty freelists/buckets.
*/
pgb = uvm.page_free[f].pgfl_buckets[b];
if (pgb->pgb_nfree == 0) {
return NULL;
}
/* Skip bucket if low on memory. */
lock = &uvm_freelist_locks[b].lock;
mutex_spin_enter(lock);
if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
if ((flags & UVM_PGA_USERESERVE) == 0 ||
(pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
curlwp != uvm.pagedaemon_lwp)) {
mutex_spin_exit(lock);
return NULL;
}
fill = false;
} else {
fill = true;
}
/* Try all page colors as needed. */
c = trycolor = *trycolorp;
colormask = uvmexp.colormask;
do {
pg = LIST_FIRST(&pgb->pgb_colors[c]);
if (__predict_true(pg != NULL)) {
/*
* Got a free page! PG_FREE must be cleared under
* lock because of uvm_pglistalloc().
*/
LIST_REMOVE(pg, pageq.list);
KASSERT(pg->flags == PG_FREE);
pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
pgb->pgb_nfree--;
CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
/*
* While we have the bucket locked and our data
* structures fresh in L1 cache, we have an ideal
* opportunity to grab some pages for the freelist
* cache without causing extra contention. Only do
* so if we found pages in this CPU's preferred
* bucket.
*/
if (__predict_true(b == ucpu->pgflbucket && fill)) {
uvm_pgflcache_fill(ucpu, f, b, c);
}
mutex_spin_exit(lock);
KASSERT(uvm_page_get_bucket(pg) == b);
CPU_COUNT(c == trycolor ?
CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
CPU_COUNT(CPU_COUNT_CPUMISS, 1);
*trycolorp = c;
return pg;
}
c = (c + 1) & colormask;
} while (c != trycolor);
mutex_spin_exit(lock);
return NULL;
}
/*
* uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
* any color from any bucket, in a specific freelist.
*
* => must be at IPL_VM or higher to protect per-CPU data structures.
*/
static struct vm_page *
uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
{
int b, trybucket, bucketcount;
struct vm_page *pg;
/* Try for the exact thing in the per-CPU cache. */
if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
CPU_COUNT(CPU_COUNT_CPUHIT, 1);
CPU_COUNT(CPU_COUNT_COLORHIT, 1);
return pg;
}
/* Walk through all buckets, trying our preferred bucket first. */
trybucket = ucpu->pgflbucket;
b = trybucket;
bucketcount = uvm.bucketcount;
do {
pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
if (pg != NULL) {
return pg;
}
b = (b + 1 == bucketcount ? 0 : b + 1);
} while (b != trybucket);
return NULL;
}
/*
* uvm_pagealloc_strat: allocate vm_page from a particular free list.
*
* => return null if no pages free
* => wake up pagedaemon if number of free pages drops below low water mark
* => if obj != NULL, obj must be locked (to put in obj's tree)
* => if anon != NULL, anon must be locked (to put in anon)
* => only one of obj or anon can be non-null
* => caller must activate/deactivate page if it is not wired.
* => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
* => policy decision: it is more important to pull a page off of the
* appropriate priority free list than it is to get a page from the
* correct bucket or color bin. This is because we live with the
* consequences of a bad free list decision for the entire
* lifetime of the page, e.g. if the page comes from memory that
* is slower to access.
*/
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
int flags, int strat, int free_list)
{
int color, lcv, error, s;
struct uvm_cpu *ucpu;
struct vm_page *pg;
lwp_t *l;
KASSERT(obj == NULL || anon == NULL);
KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
KASSERT(off == trunc_page(off));
KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
KASSERT(anon == NULL || anon->an_lock == NULL ||
rw_write_held(anon->an_lock));
/*
* This implements a global round-robin page coloring
* algorithm.
*/
s = splvm();
ucpu = curcpu()->ci_data.cpu_uvm;
if (flags & UVM_FLAG_COLORMATCH) {
color = atop(off) & uvmexp.colormask;
} else {
color = ucpu->pgflcolor;
}
/*
* fail if any of these conditions is true:
* [1] there really are no free pages, or
* [2] only kernel "reserved" pages remain and
* reserved pages have not been requested.
* [3] only pagedaemon "reserved" pages remain and
* the requestor isn't the pagedaemon.
* we make kernel reserve pages available if called by a
* kernel thread.
*/
l = curlwp;
if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
flags |= UVM_PGA_USERESERVE;
}
again:
switch (strat) {
case UVM_PGA_STRAT_NORMAL:
/* Check freelists: descending priority (ascending id) order. */
for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
if (pg != NULL) {
goto gotit;
}
}
/* No pages free! Have pagedaemon free some memory. */
splx(s);
uvm_kick_pdaemon();
return NULL;
case UVM_PGA_STRAT_ONLY:
case UVM_PGA_STRAT_FALLBACK:
/* Attempt to allocate from the specified free list. */
KASSERT(free_list >= 0);
KASSERT(free_list < VM_NFREELIST);
pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
if (pg != NULL) {
goto gotit;
}
/* Fall back, if possible. */
if (strat == UVM_PGA_STRAT_FALLBACK) {
strat = UVM_PGA_STRAT_NORMAL;
goto again;
}
/* No pages free! Have pagedaemon free some memory. */
splx(s);
uvm_kick_pdaemon();
return NULL;
case UVM_PGA_STRAT_NUMA:
/*
* NUMA strategy (experimental): allocating from the correct
* bucket is more important than observing freelist
* priority. Look only to the current NUMA node; if that
* fails, we need to look to other NUMA nodes, so retry with
* the normal strategy.
*/
for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
pg = uvm_pgflcache_alloc(ucpu, lcv, color);
if (pg != NULL) {
CPU_COUNT(CPU_COUNT_CPUHIT, 1);
CPU_COUNT(CPU_COUNT_COLORHIT, 1);
goto gotit;
}
pg = uvm_pagealloc_pgb(ucpu, lcv,
ucpu->pgflbucket, &color, flags);
if (pg != NULL) {
goto gotit;
}
}
strat = UVM_PGA_STRAT_NORMAL;
goto again;
default:
panic("uvm_pagealloc_strat: bad strat %d", strat);
/* NOTREACHED */
}
gotit:
/*
* We now know which color we actually allocated from; set
* the next color accordingly.
*/
ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
/*
* while still at IPL_VM, update allocation statistics.
*/
if (anon) {
CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
}
splx(s);
KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));
/*
* assign the page to the object. as the page was free, we know
* that pg->uobject and pg->uanon are NULL. we only need to take
* the page's interlock if we are changing the values.
*/
if (anon != NULL || obj != NULL) {
mutex_enter(&pg->interlock);
}
pg->offset = off;
pg->uobject = obj;
pg->uanon = anon;
KASSERT(uvm_page_owner_locked_p(pg, true));
if (anon) {
anon->an_page = pg;
pg->flags |= PG_ANON;
mutex_exit(&pg->interlock);
} else if (obj) {
/*
* set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
*/
if (UVM_OBJ_IS_VNODE(obj)) {
pg->flags |= PG_FILE;
} else if (UVM_OBJ_IS_AOBJ(obj)) {
pg->flags |= PG_AOBJ;
}
uvm_pageinsert_object(obj, pg);
mutex_exit(&pg->interlock);
error = uvm_pageinsert_tree(obj, pg);
if (error != 0) {
mutex_enter(&pg->interlock);
uvm_pageremove_object(obj, pg);
mutex_exit(&pg->interlock);
uvm_pagefree(pg);
return NULL;
}
}
#if defined(UVM_PAGE_TRKOWN)
pg->owner_tag = NULL;
#endif
UVM_PAGE_OWN(pg, "new alloc");
if (flags & UVM_PGA_ZERO) {
/* A zero'd page is not clean. */
if (obj != NULL || anon != NULL) {
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
}
pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}
return(pg);
}
/*
* uvm_pagereplace: replace a page with another
*
* => object must be locked
* => page interlocks must be held
*/
void
uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
{
struct uvm_object *uobj = oldpg->uobject;
struct vm_page *pg __diagused;
uint64_t idx;
KASSERT((oldpg->flags & PG_TABLED) != 0);
KASSERT(uobj != NULL);
KASSERT((newpg->flags & PG_TABLED) == 0);
KASSERT(newpg->uobject == NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT(mutex_owned(&oldpg->interlock));
KASSERT(mutex_owned(&newpg->interlock));
newpg->uobject = uobj;
newpg->offset = oldpg->offset;
idx = newpg->offset >> PAGE_SHIFT;
pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
KASSERT(pg == oldpg);
if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
if ((newpg->flags & PG_CLEAN) != 0) {
uvm_obj_page_clear_dirty(newpg);
} else {
uvm_obj_page_set_dirty(newpg);
}
}
/*
* oldpg's PG_STAT is stable. newpg is not reachable by others yet.
*/
newpg->flags |=
(newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
uvm_pageinsert_object(uobj, newpg);
uvm_pageremove_object(uobj, oldpg);
}
/*
* uvm_pagerealloc: reallocate a page from one object to another
*
* => both objects must be locked
*/
int
uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
{
int error = 0;
/*
* remove it from the old object
*/
if (pg->uobject) {
uvm_pageremove_tree(pg->uobject, pg);
uvm_pageremove_object(pg->uobject, pg);
}
/*
* put it in the new object
*/
if (newobj) {
mutex_enter(&pg->interlock);
pg->uobject = newobj;
pg->offset = newoff;
if (UVM_OBJ_IS_VNODE(newobj)) {
pg->flags |= PG_FILE;
} else if (UVM_OBJ_IS_AOBJ(newobj)) {
pg->flags |= PG_AOBJ;
}
uvm_pageinsert_object(newobj, pg);
mutex_exit(&pg->interlock);
error = uvm_pageinsert_tree(newobj, pg);
if (error != 0) {
mutex_enter(&pg->interlock);
uvm_pageremove_object(newobj, pg);
mutex_exit(&pg->interlock);
}
}
return error;
}
/*
* uvm_pagefree: free page
*
* => erase page's identity (i.e. remove from object)
* => put page on free list
* => caller must lock owning object (either anon or uvm_object)
* => assumes all valid mappings of pg are gone
*/
void
uvm_pagefree(struct vm_page *pg)
{
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
struct uvm_cpu *ucpu;
kmutex_t *lock;
int bucket, s;
bool locked;
#ifdef DEBUG
if (pg->uobject == (void *)0xdeadbeef &&
pg->uanon == (void *)0xdeadbeef) {
panic("uvm_pagefree: freeing free page %p", pg);
}
#endif /* DEBUG */
KASSERT((pg->flags & PG_PAGEOUT) == 0);
KASSERT(!(pg->flags & PG_FREE));
KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
rw_write_held(pg->uanon->an_lock));
/*
* remove the page from the object's tree before acquiring any page
* interlocks: this can acquire locks to free radixtree nodes.
*/
if (pg->uobject != NULL) {
uvm_pageremove_tree(pg->uobject, pg);
}
/*
* if the page is loaned, resolve the loan instead of freeing.
*/
if (pg->loan_count) {
KASSERT(pg->wire_count == 0);
/*
* if the page is owned by an anon then we just want to
* drop anon ownership. the kernel will free the page when
* it is done with it. if the page is owned by an object,
* remove it from the object and mark it dirty for the benefit
* of possible anon owners.
*
* regardless of previous ownership, wakeup any waiters,
* unbusy the page, and we're done.
*/
uvm_pagelock(pg);
locked = true;
if (pg->uobject != NULL) {
uvm_pageremove_object(pg->uobject, pg);
pg->flags &= ~(PG_FILE|PG_AOBJ);
} else if (pg->uanon != NULL) {
if ((pg->flags & PG_ANON) == 0) {
pg->loan_count--;
} else {
const unsigned status = uvm_pagegetdirty(pg);
pg->flags &= ~PG_ANON;
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
pg->uanon->an_page = NULL;
pg->uanon = NULL;
}
if (pg->pqflags & PQ_WANTED) {
wakeup(pg);
}
pg->pqflags &= ~PQ_WANTED;
pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
#ifdef UVM_PAGE_TRKOWN
pg->owner_tag = NULL;
#endif
KASSERT((pg->flags & PG_STAT) == 0);
if (pg->loan_count) {
KASSERT(pg->uobject == NULL);
if (pg->uanon == NULL) {
uvm_pagedequeue(pg);
}
uvm_pageunlock(pg);
return;
}
} else if (pg->uobject != NULL || pg->uanon != NULL ||
pg->wire_count != 0) {
uvm_pagelock(pg);
locked = true;
} else {
locked = false;
}
/*
* remove page from its object or anon.
*/
if (pg->uobject != NULL) {
uvm_pageremove_object(pg->uobject, pg);
} else if (pg->uanon != NULL) {
const unsigned int status = uvm_pagegetdirty(pg);
pg->uanon->an_page = NULL;
pg->uanon = NULL;
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
/*
* if the page was wired, unwire it now.
*/
if (pg->wire_count) {
pg->wire_count = 0;
atomic_dec_uint(&uvmexp.wired);
}
if (locked) {
/*
* wake anyone waiting on the page.
*/
if ((pg->pqflags & PQ_WANTED) != 0) {
pg->pqflags &= ~PQ_WANTED;
wakeup(pg);
}
/*
* now remove the page from the queues.
*/
uvm_pagedequeue(pg);
uvm_pageunlock(pg);
} else {
KASSERT(!uvmpdpol_pageisqueued_p(pg));
}
/*
* and put on free queue
*/
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
#endif /* DEBUG */
/* Try to send the page to the per-CPU cache. */
s = splvm();
ucpu = curcpu()->ci_data.cpu_uvm;
bucket = uvm_page_get_bucket(pg);
if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
splx(s);
return;
}
/* Didn't work. Never mind, send it to a global bucket. */
pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
pgb = pgfl->pgfl_buckets[bucket];
lock = &uvm_freelist_locks[bucket].lock;
mutex_spin_enter(lock);
/* PG_FREE must be set under lock because of uvm_pglistalloc(). */
pg->flags = PG_FREE;
LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
pgb->pgb_nfree++;
CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
mutex_spin_exit(lock);
splx(s);
}
/*
* uvm_page_unbusy: unbusy an array of pages.
*
* => pages must either all belong to the same object, or all belong to anons.
* => if pages are object-owned, object must be locked.
* => if pages are anon-owned, anons must be locked.
* => caller must make sure that anon-owned pages are not PG_RELEASED.
*/
void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
struct vm_page *pg;
int i, pageout_done;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
pageout_done = 0;
for (i = 0; i < npgs; i++) {
pg = pgs[i];
if (pg == NULL || pg == PGO_DONTCARE) {
continue;
}
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(pg->flags & PG_BUSY);
if (pg->flags & PG_PAGEOUT) {
pg->flags &= ~PG_PAGEOUT;
pg->flags |= PG_RELEASED;
pageout_done++;
atomic_inc_uint(&uvmexp.pdfreed);
}
if (pg->flags & PG_RELEASED) {
UVMHIST_LOG(ubchist, "releasing pg %#jx",
(uintptr_t)pg, 0, 0, 0);
KASSERT(pg->uobject != NULL ||
(pg->uanon != NULL && pg->uanon->an_ref > 0));
pg->flags &= ~PG_RELEASED;
uvm_pagefree(pg);
} else {
UVMHIST_LOG(ubchist, "unbusying pg %#jx",
(uintptr_t)pg, 0, 0, 0);
KASSERT((pg->flags & PG_FAKE) == 0);
pg->flags &= ~PG_BUSY;
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
UVM_PAGE_OWN(pg, NULL);
}
}
if (pageout_done != 0) {
uvm_pageout_done(pageout_done);
}
}
/*
* uvm_pagewait: wait for a busy page
*
* => page must be known PG_BUSY
* => object must be read or write locked
* => object will be unlocked on return
*/
void
uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
{
KASSERT(rw_lock_held(lock));
KASSERT((pg->flags & PG_BUSY) != 0);
KASSERT(uvm_page_owner_locked_p(pg, false));
mutex_enter(&pg->interlock);
pg->pqflags |= PQ_WANTED;
rw_exit(lock);
UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
}
/*
* uvm_pagewakeup: wake anyone waiting on a page
*
* => page interlock must be held
*/
void
uvm_pagewakeup(struct vm_page *pg)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
KASSERT(mutex_owned(&pg->interlock));
UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
if ((pg->pqflags & PQ_WANTED) != 0) {
wakeup(pg);
pg->pqflags &= ~PQ_WANTED;
}
}
/*
* uvm_pagewanted_p: return true if someone is waiting on the page
*
* => object must be write locked (lock out all concurrent access)
*/
bool
uvm_pagewanted_p(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
}
#if defined(UVM_PAGE_TRKOWN)
/*
* uvm_page_own: set or release page ownership
*
* => this is a debugging function that keeps track of who sets PG_BUSY
* and where they do it. it can be used to track down problems
* such as a process setting "PG_BUSY" and never releasing it.
* => page's object [if any] must be locked
* => if "tag" is NULL then we are releasing page ownership
*/
void
uvm_page_own(struct vm_page *pg, const char *tag)
{
KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
KASSERT(uvm_page_owner_locked_p(pg, true));
/* gain ownership? */
if (tag) {
KASSERT((pg->flags & PG_BUSY) != 0);
if (pg->owner_tag) {
printf("uvm_page_own: page %p already owned "
"by proc %d.%d [%s]\n", pg,
pg->owner, pg->lowner, pg->owner_tag);
panic("uvm_page_own");
}
pg->owner = curproc->p_pid;
pg->lowner = curlwp->l_lid;
pg->owner_tag = tag;
return;
}
/* drop ownership */
KASSERT((pg->flags & PG_BUSY) == 0);
if (pg->owner_tag == NULL) {
printf("uvm_page_own: dropping ownership of an non-owned "
"page (%p)\n", pg);
panic("uvm_page_own");
}
pg->owner_tag = NULL;
}
#endif
/*
* uvm_pagelookup: look up a page
*
* => caller should lock object to keep someone from pulling the page
* out from under it
*/
struct vm_page *
uvm_pagelookup(struct uvm_object *obj, voff_t off)
{
struct vm_page *pg;
KASSERT(db_active || rw_lock_held(obj->vmobjlock));
pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
KASSERT(pg == NULL || obj->uo_npages != 0);
KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
(pg->flags & PG_BUSY) != 0);
return pg;
}
/*
* uvm_pagewire: wire the page, thus removing it from the daemon's grasp
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pagewire(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
if ((pg->flags & PG_READAHEAD) != 0) {
uvm_ra_hit.ev_count++;
pg->flags &= ~PG_READAHEAD;
}
#endif /* defined(READAHEAD_STATS) */
if (pg->wire_count == 0) {
uvm_pagedequeue(pg);
atomic_inc_uint(&uvmexp.wired);
}
pg->wire_count++;
KASSERT(pg->wire_count > 0); /* detect wraparound */
}
/*
* uvm_pageunwire: unwire the page.
*
* => activate if wire count goes to zero.
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageunwire(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(pg->wire_count != 0);
KASSERT(!uvmpdpol_pageisqueued_p(pg));
KASSERT(mutex_owned(&pg->interlock));
pg->wire_count--;
if (pg->wire_count == 0) {
uvm_pageactivate(pg);
KASSERT(uvmexp.wired != 0);
atomic_dec_uint(&uvmexp.wired);
}
}
/*
* uvm_pagedeactivate: deactivate page
*
* => caller must lock objects
* => caller must check to make sure page is not wired
* => object that page belongs to must be locked (so we can adjust pg->flags)
* => caller must clear the reference on the page before calling
* => caller must hold pg->interlock
*/
void
uvm_pagedeactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
if (pg->wire_count == 0) {
KASSERT(uvmpdpol_pageisqueued_p(pg));
uvmpdpol_pagedeactivate(pg);
}
}
/*
* uvm_pageactivate: activate page
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
if ((pg->flags & PG_READAHEAD) != 0) {
uvm_ra_hit.ev_count++;
pg->flags &= ~PG_READAHEAD;
}
#endif /* defined(READAHEAD_STATS) */
if (pg->wire_count == 0) {
uvmpdpol_pageactivate(pg);
}
}
/*
* uvm_pagedequeue: remove a page from any paging queue
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pagedequeue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(mutex_owned(&pg->interlock));
if (uvmpdpol_pageisqueued_p(pg)) {
uvmpdpol_pagedequeue(pg);
}
}
/*
* uvm_pageenqueue: add a page to a paging queue without activating.
* used where a page is not really demanded (yet), e.g. read-ahead
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageenqueue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
uvmpdpol_pageenqueue(pg);
}
}
/*
* uvm_pagelock: acquire page interlock
*/
void
uvm_pagelock(struct vm_page *pg)
{
mutex_enter(&pg->interlock);
}
/*
* uvm_pagelock2: acquire two page interlocks
*/
void
uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
{
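/*
 * Take the two interlocks in a fixed (address) order so that two threads
 * locking the same pair cannot deadlock against each other.
 */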
if (pg1 < pg2) {
mutex_enter(&pg1->interlock);
mutex_enter(&pg2->interlock);
} else {
mutex_enter(&pg2->interlock);
mutex_enter(&pg1->interlock);
}
}
/*
* uvm_pageunlock: release page interlock, and if a page replacement intent
* is set on the page, pass it to uvmpdpol to make real.
*
* => caller must hold pg->interlock
*/
void
uvm_pageunlock(struct vm_page *pg)
{
if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
(pg->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg->interlock);
return;
}
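/*
 * PQ_INTENT_QUEUED is set while the interlock is still held, so only one
 * of several concurrent unlockers hands the page to uvmpdpol_pagerealize().
 */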
pg->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg->interlock);
uvmpdpol_pagerealize(pg);
}
/*
* uvm_pageunlock2: release two page interlocks, and for both pages if a
* page replacement intent is set on the page, pass it to uvmpdpol to make
* real.
*
* => caller must hold pg->interlock
*/
void
uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
{
if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
(pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg1->interlock);
pg1 = NULL;
} else {
pg1->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg1->interlock);
}
if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
(pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg2->interlock);
pg2 = NULL;
} else {
pg2->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg2->interlock);
}
if (pg1 != NULL) {
uvmpdpol_pagerealize(pg1);
}
if (pg2 != NULL) {
uvmpdpol_pagerealize(pg2);
}
}
/*
* uvm_pagezero: zero fill a page
*
* => if page is part of an object then the object should be locked
* to protect pg->flags.
*/
void
uvm_pagezero(struct vm_page *pg)
{
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}
/*
* uvm_pagecopy: copy a page
*
* => if page is part of an object then the object should be locked
* to protect pg->flags.
*/
void
uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
{
uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
}
/*
* uvm_pageismanaged: test whether a page (specified by PA) is managed.
*/
bool
uvm_pageismanaged(paddr_t pa)
{
return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
}
/*
* uvm_page_lookup_freelist: look up the free list for the specified page
*/
int
uvm_page_lookup_freelist(struct vm_page *pg)
{
uvm_physseg_t upm;
upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
return uvm_physseg_get_free_list(upm);
}
/*
* uvm_page_owner_locked_p: return true if object associated with page is
* locked. this is a weak check for runtime assertions only.
*/
bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{
if (pg->uobject != NULL) {
return exclusive
? rw_write_held(pg->uobject->vmobjlock) : rw_lock_held(pg->uobject->vmobjlock);
}
if (pg->uanon != NULL) {
return exclusive
? rw_write_held(pg->uanon->an_lock) : rw_lock_held(pg->uanon->an_lock);
}
return true;
}
/*
* uvm_pagereadonly_p: return true if the page should be mapped read-only
*/
bool
uvm_pagereadonly_p(struct vm_page *pg)
{
struct uvm_object * const uobj = pg->uobject;
KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
if ((pg->flags & PG_RDONLY) != 0) {
return true;
}
if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
return true;
}
if (uobj == NULL) {
return false;
}
return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
}
#ifdef PMAP_DIRECT
/*
* Call pmap to translate a physical address into a virtual one and run a
* callback on it. This avoids actually mapping the pages; the pmap most
* likely uses a direct map or equivalent.
*/
int
uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
int (*process)(void *, size_t, void *), void *arg)
{
int error = 0;
paddr_t pa;
size_t todo;
voff_t pgoff = (off & PAGE_MASK);
struct vm_page *pg;
KASSERT(npages > 0);
KASSERT(len > 0);
for (int i = 0; i < npages; i++) {
pg = pgs[i];
KASSERT(len > 0);
/*
* Caller is responsible for ensuring all the pages are
* available.
*/
KASSERT(pg != NULL);
KASSERT(pg != PGO_DONTCARE);
pa = VM_PAGE_TO_PHYS(pg);
todo = MIN(len, PAGE_SIZE - pgoff);
error = pmap_direct_process(pa, pgoff, todo, process, arg);
if (error)
break;
pgoff = 0;
len -= todo;
}
KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
return error;
}
#endif /* PMAP_DIRECT */
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_page_printit: actually print the page
*/
static const char page_flagbits[] = UVM_PGFLAGBITS;
static const char page_pqflagbits[] = UVM_PQFLAGBITS;
void
uvm_page_printit(struct vm_page *pg, bool full,
void (*pr)(const char *, ...))
{
struct vm_page *tpg;
struct uvm_object *uobj;
struct pgflbucket *pgb;
struct pgflist *pgl;
char pgbuf[128];
(*pr)("PAGE %p:\n", pg);
snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
(*pr)(" flags=%s\n", pgbuf);
snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
(*pr)(" pqflags=%s\n", pgbuf);
(*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
pg->uobject, pg->uanon, (long long)pg->offset);
(*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
uvm_page_get_freelist(pg));
(*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
#if defined(UVM_PAGE_TRKOWN)
if (pg->flags & PG_BUSY)
(*pr)(" owning process = %d.%d, tag=%s\n",
pg->owner, pg->lowner, pg->owner_tag);
else
(*pr)(" page not busy, no owner\n");
#else
(*pr)(" [page ownership tracking disabled]\n");
#endif
if (!full)
return;
/* cross-verify object/anon */
if ((pg->flags & PG_FREE) == 0) {
if (pg->flags & PG_ANON) {
if (pg->uanon == NULL || pg->uanon->an_page != pg)
(*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
(pg->uanon) ? pg->uanon->an_page : NULL);
else
(*pr)(" anon backpointer is OK\n");
} else {
uobj = pg->uobject;
if (uobj) {
(*pr)(" checking object list\n");
tpg = uvm_pagelookup(uobj, pg->offset);
if (tpg)
(*pr)(" page found on object list\n");
else
(*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
}
}
}
/* cross-verify page queue */
if (pg->flags & PG_FREE) {
int fl = uvm_page_get_freelist(pg);
int b = uvm_page_get_bucket(pg);
pgb = uvm.page_free[fl].pgfl_buckets[b];
pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
(*pr)(" checking pageq list\n");
LIST_FOREACH(tpg, pgl, pageq.list) {
if (tpg == pg) {
break;
}
}
if (tpg)
(*pr)(" page found on pageq list\n");
else
(*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
}
}
/*
* uvm_page_printall - print a summary of all managed pages
*/
void
uvm_page_printall(void (*pr)(const char *, ...))
{
uvm_physseg_t i;
paddr_t pfn;
struct vm_page *pg;
(*pr)("%18s %4s %4s %18s %18s"
#ifdef UVM_PAGE_TRKOWN
" OWNER"
#endif
"\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
for (i = uvm_physseg_get_first();
uvm_physseg_valid_p(i);
i = uvm_physseg_get_next(i)) {
for (pfn = uvm_physseg_get_start(i);
pfn < uvm_physseg_get_end(i);
pfn++) {
pg = PHYS_TO_VM_PAGE(ptoa(pfn));
(*pr)("%18p %04x %08x %18p %18p",
pg, pg->flags, pg->pqflags, pg->uobject,
pg->uanon);
#ifdef UVM_PAGE_TRKOWN
if (pg->flags & PG_BUSY)
(*pr)(" %d [%s]", pg->owner, pg->owner_tag);
#endif
(*pr)("\n");
}
}
}
/*
* uvm_page_print_freelists - print a summary of the freelists
*/
void
uvm_page_print_freelists(void (*pr)(const char *, ...))
{
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
int fl, b, c;
(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
for (fl = 0; fl < VM_NFREELIST; fl++) {
pgfl = &uvm.page_free[fl];
(*pr)("freelist(%d) @ %p\n", fl, pgfl);
for (b = 0; b < uvm.bucketcount; b++) {
pgb = uvm.page_free[fl].pgfl_buckets[b];
(*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
b, pgb, pgb->pgb_nfree,
&uvm_freelist_locks[b].lock);
for (c = 0; c < uvmexp.ncolors; c++) {
(*pr)(" color(%d) @ %p, ", c,
&pgb->pgb_colors[c]);
(*pr)("first page = %p\n",
LIST_FIRST(&pgb->pgb_colors[c]));
}
}
}
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: uvm_user.c,v 1.14 2011/02/02 15:13:34 chuck Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_user.c,v 1.1.2.1 1997/08/14 19:10:41 chuck Exp
*/
/*
* uvm_user.c: high level uvm_allocate/uvm_deallocate interface into vm.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_user.c,v 1.14 2011/02/02 15:13:34 chuck Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <uvm/uvm.h>
/*
* uvm_deallocate: deallocate memory (unmap)
*/
void
uvm_deallocate(struct vm_map *map, vaddr_t start, vsize_t size)
{
if (size == 0)
return;
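/*
 * Round outward to page boundaries so the whole of [start, start + size)
 * is covered by the unmap.
 */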
uvm_unmap(map, trunc_page(start), round_page(start + size));
}
/* $NetBSD: tty.c,v 1.312 2023/12/07 09:00:32 pgoyette Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tty.c 8.13 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty.c,v 1.312 2023/12/07 09:00:32 pgoyette Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#define TTY_ALLOW_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#define TTYDEFCHARS
#include <sys/tty.h>
#undef TTYDEFCHARS
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/dkstat.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/kmem.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/poll.h>
#include <sys/kprintf.h>
#include <sys/namei.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/intr.h>
#include <sys/ioctl_compat.h>
#include <sys/module.h>
#include <sys/bitops.h>
#include <sys/compat_stub.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/pserialize.h>
static int ttnread(struct tty *);
static void ttyblock(struct tty *);
static void ttyecho(int, struct tty *);
static void ttyrubo(struct tty *, int);
static void ttyprintf_nolock(struct tty *, const char *fmt, ...)
__printflike(2, 3);
static int proc_compare_wrapper(struct proc *, struct proc *);
static void ttysigintr(void *);
/* Symbolic sleep message strings. */
const char ttclos[] = "ttycls";
const char ttopen[] = "ttyopn";
const char ttybg[] = "ttybg";
const char ttyin[] = "ttyin";
const char ttyout[] = "ttyout";
/*
* Used to determine whether we still have a connection. This is true in
* one of 3 cases:
* 1) We have carrier.
* 2) It's a locally attached terminal, and we are therefore ignoring carrier.
* 3) We're using a flow control mechanism that overloads the carrier signal.
*/
#define CONNECTED(tp) (ISSET(tp->t_state, TS_CARR_ON) || \
ISSET(tp->t_cflag, CLOCAL | MDMBUF))
/*
* Table with character classes and parity. The 8th bit indicates parity,
* the 7th bit indicates the character is an alphameric or underscore (for
* ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits
* are 0 then the character needs no special processing on output; classes
* other than 0 might be translated or (not currently) require delays.
*/
#define E 0x00 /* Even parity. */
#define O 0x80 /* Odd parity. */
#define PARITY(c) (char_type[c] & O)
#define ALPHA 0x40 /* Alpha or underscore. */
#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA)
#define CCLASSMASK 0x3f
#define CCLASS(c) (char_type[c] & CCLASSMASK)
#define BS BACKSPACE
#define CC CONTROL
#define CR RETURN
#define NA ORDINARY | ALPHA
#define NL NEWLINE
#define NO ORDINARY
#define TB TAB
#define VT VTAB
unsigned char const char_type[] = {
E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */
O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */
O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */
E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */
O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */
E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */
O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */
O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */
O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */
E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */
O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */
O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */
E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */
/*
* Meta chars; should be settable per character set;
* for now, treat them all as normal characters.
*/
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
};
#undef BS
#undef CC
#undef CR
#undef NA
#undef NL
#undef NO
#undef TB
#undef VT
static struct ttylist_head tty_sigqueue = TAILQ_HEAD_INITIALIZER(tty_sigqueue);
static void *tty_sigsih;
struct ttylist_head ttylist = TAILQ_HEAD_INITIALIZER(ttylist);
int tty_count;
kmutex_t tty_lock;
kmutex_t constty_lock;
static struct pserialize *constty_psz;
static kcondvar_t ttyref_cv;
struct ptm_pty *ptm = NULL;
uint64_t tk_cancc;
uint64_t tk_nin;
uint64_t tk_nout;
uint64_t tk_rawcc;
static kauth_listener_t tty_listener;
#define TTY_MINQSIZE 0x00400
#define TTY_MAXQSIZE 0x10000
int tty_qsize = TTY_MINQSIZE;
static int
tty_get_qsize(int *qsize, int newsize)
{
if (newsize <= 0)
return EINVAL;
newsize = 1 << ilog2(newsize); /* Make it a power of two */
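/* ilog2() truncates, so this rounds down to the nearest power of two. */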
if (newsize < TTY_MINQSIZE || newsize > TTY_MAXQSIZE)
return EINVAL;
*qsize = newsize;
return 0;
}
static int
tty_set_qsize(struct tty *tp, int newsize)
{
struct clist rawq, canq, outq;
struct clist orawq, ocanq, ooutq;
clalloc(&rawq, newsize, 1);
clalloc(&canq, newsize, 1);
clalloc(&outq, newsize, 0);
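/*
 * The replacement clists are allocated up front, before tty_lock (a spin
 * lock) is taken; the swap itself is then quick, and the old clists are
 * freed only after the lock has been dropped.
 */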
mutex_spin_enter(&tty_lock);
if (tp->t_outq.c_cc != 0) {
mutex_spin_exit(&tty_lock);
clfree(&rawq);
clfree(&canq);
clfree(&outq);
return EBUSY;
}
orawq = tp->t_rawq;
ocanq = tp->t_canq;
ooutq = tp->t_outq;
tp->t_qsize = newsize;
tp->t_rawq = rawq;
tp->t_canq = canq;
tp->t_outq = outq;
ttsetwater(tp);
mutex_spin_exit(&tty_lock);
clfree(&orawq);
clfree(&ocanq);
clfree(&ooutq);
return 0;
}
static int
sysctl_kern_tty_qsize(SYSCTLFN_ARGS)
{
int newsize;
int error;
struct sysctlnode node;
node = *rnode;
node.sysctl_data = &newsize;
newsize = tty_qsize;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
return tty_get_qsize(&tty_qsize, newsize);
}
static void
sysctl_kern_tty_setup(void)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "tkstat",
SYSCTL_DESCR("Number of characters sent and received "
"on ttys"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_TKSTAT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "nin",
SYSCTL_DESCR("Total number of tty input characters"),
NULL, 0, &tk_nin, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_NIN, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "nout",
SYSCTL_DESCR("Total number of tty output characters"),
NULL, 0, &tk_nout, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_NOUT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "cancc",
SYSCTL_DESCR("Number of canonical tty input characters"),
NULL, 0, &tk_cancc, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_CANCC, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "rawcc",
SYSCTL_DESCR("Number of raw tty input characters"),
NULL, 0, &tk_rawcc, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_RAWCC, CTL_EOL);
sysctl_createv(NULL, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "tty", NULL,
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(NULL, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "qsize",
SYSCTL_DESCR("TTY input and output queue size"),
sysctl_kern_tty_qsize, 0, &tty_qsize, 0,
CTL_CREATE, CTL_EOL);
}
/*
* ttylock(tp), ttyunlock(tp), ttylocked(tp)
*
* Exclusive lock on tty. Currently a single global lock.
*
* ttylocked is for positive DIAGNOSTIC assertions only.
*/
void
ttylock(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
}
void
ttyunlock(struct tty *tp)
{
mutex_spin_exit(&tty_lock);
}
bool
ttylocked(struct tty *tp)
{
return mutex_owned(&tty_lock);
}
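/*
 * ttyopen:
 *
 *	Common open-time checks shared by tty drivers.  A dialout open
 *	marks the tty TS_DIALOUT and fails if the device is already open
 *	for non-dialout use; a blocking non-dialout open waits for
 *	carrier and for any dialout user to finish first.
 */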
int
ttyopen(struct tty *tp, int dialout, int nonblock)
{
int error;
error = 0;
mutex_spin_enter(&tty_lock);
if (dialout) {
/*
* If the device is already open for non-dialout, fail.
* Otherwise, set TS_DIALOUT to block any pending non-dialout
* opens.
*/
if (ISSET(tp->t_state, TS_ISOPEN) &&
!ISSET(tp->t_state, TS_DIALOUT)) {
error = EBUSY;
goto out;
}
SET(tp->t_state, TS_DIALOUT);
} else {
if (!nonblock) {
/*
* Wait for carrier. Also wait for any dialout
* processes to close the tty first.
*/
while (ISSET(tp->t_state, TS_DIALOUT) || !CONNECTED(tp)) {
tp->t_wopen++;
error = ttysleep(tp, &tp->t_rawcv, true, 0);
tp->t_wopen--;
if (error)
goto out;
}
} else {
/*
* Don't allow a non-blocking non-dialout open if the
* device is already open for dialout.
*/
if (ISSET(tp->t_state, TS_DIALOUT)) {
error = EBUSY;
goto out;
}
}
}
out:
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Initial open of tty, or (re)entry to standard tty line discipline.
*/
int
ttylopen(dev_t device, struct tty *tp)
{
mutex_spin_enter(&tty_lock);
tp->t_dev = device;
	if (!ISSET(tp->t_state, TS_ISOPEN)) {
		SET(tp->t_state, TS_ISOPEN);
memset(&tp->t_winsize, 0, sizeof(tp->t_winsize));
tp->t_flags = 0;
}
mutex_spin_exit(&tty_lock);
	if (tp->t_qsize != tty_qsize)
		tty_set_qsize(tp, tty_qsize);
return (0);
}
/*
* Interrupt any pending I/O and make it fail. Used before close to
* interrupt pending open/read/write/&c. and make it fail promptly.
*/
void
ttycancel(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
tp->t_state |= TS_CANCEL;
cv_broadcast(&tp->t_outcv);
cv_broadcast(&tp->t_rawcv);
mutex_spin_exit(&tty_lock);
}
/*
* Handle close() on a tty line: flush and set to initial state,
* bumping generation number so that pending read/write calls
* can detect recycling of the tty.
*/
int
ttyclose(struct tty *tp)
{
struct session *sess;
/*
* Make sure this is not the constty. Without constty_lock it
* is always allowed to transition from nonnull to null.
*/
(void)atomic_cas_ptr(&constty, tp, NULL);
/*
* We don't know if this has _ever_ been the constty: another
* thread may have kicked it out as constty before we started
* to close.
*
* So we wait for all users that might be acquiring references
* to finish doing so -- after that, no more references can be
* made, at which point we can safely flush the tty, wait for
* the existing references to drain, and finally free or reuse
* the tty.
*/
pserialize_perform(constty_psz);
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD | FWRITE);
tp->t_gen++;
tp->t_pgrp = NULL;
tp->t_state = 0;
sess = tp->t_session;
tp->t_session = NULL;
while (tp->t_refcnt)
cv_wait(&ttyref_cv, &tty_lock);
mutex_spin_exit(&tty_lock);
if (sess != NULL) {
mutex_enter(&proc_lock);
/* Releases proc_lock. */
proc_sessrele(sess);
}
return (0);
}
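/*
 * FLUSHQ(q): discard every character currently stored in the clist q.
 */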
#define FLUSHQ(q) { \
if ((q)->c_cc) \
ndflush(q, (q)->c_cc); \
}
/*
* tty_acquire(tp), tty_release(tp)
*
* Acquire a reference to tp that prevents it from being closed
* until released. Caller must guarantee tp has not yet been
* closed, e.g. by obtaining tp from constty during a pserialize
* read section. Caller must not hold tty_lock.
*/
void
tty_acquire(struct tty *tp)
{
unsigned refcnt __diagused;
refcnt = atomic_inc_uint_nv(&tp->t_refcnt);
KASSERT(refcnt < UINT_MAX);
}
void
tty_release(struct tty *tp)
{
unsigned old, new;
KDASSERT(mutex_ownable(&tty_lock));
do {
old = atomic_load_relaxed(&tp->t_refcnt);
if (old == 1) {
mutex_spin_enter(&tty_lock);
if (atomic_dec_uint_nv(&tp->t_refcnt) == 0)
cv_broadcast(&ttyref_cv);
mutex_spin_exit(&tty_lock);
return;
}
KASSERT(old != 0);
new = old - 1;
} while (atomic_cas_uint(&tp->t_refcnt, old, new) != old);
}
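/*
 * Typical use of the reference counting above (a sketch only; the
 * pserialize/atomic helpers shown follow the pattern used by constty
 * readers elsewhere in the kernel):
 *
 *	s = pserialize_read_enter();
 *	tp = atomic_load_consume(&constty);
 *	if (tp != NULL)
 *		tty_acquire(tp);
 *	pserialize_read_exit(s);
 *	if (tp != NULL) {
 *		...use tp...
 *		tty_release(tp);
 *	}
 */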
/*
* This macro is used in canonical mode input processing, where a read
* request shall not return unless a 'line delimiter' ('\n') or 'break'
* (EOF, EOL, EOL2) character (or a signal) has been received. As EOL2
* is an extension to the POSIX.1 defined set of special characters,
* recognize it only if IEXTEN is set in the set of local flags.
*/
#define TTBREAKC(c, lflg) \
((c) == '\n' || (((c) == cc[VEOF] || (c) == cc[VEOL] || \
((c) == cc[VEOL2] && ISSET(lflg, IEXTEN))) && (c) != _POSIX_VDISABLE))
/*
* ttyinput() helper.
* Call with the tty lock held.
*/
/* XXX static */ int
ttyinput_wlock(int c, struct tty *tp)
{
int iflag, lflag, i, error;
u_char *cc;
KASSERT(mutex_owned(&tty_lock));
/*
* If input is pending take it first.
*/
lflag = tp->t_lflag;
if (ISSET(lflag, PENDIN))
ttypend(tp);
/*
* Gather stats.
*/
if (ISSET(lflag, ICANON)) {
++tk_cancc;
++tp->t_cancc;
} else {
++tk_rawcc;
++tp->t_rawcc;
}
++tk_nin;
cc = tp->t_cc;
/*
* Handle exceptional conditions (break, parity, framing).
*/
iflag = tp->t_iflag;
if ((error = (ISSET(c, TTY_ERRORMASK))) != 0) {
CLR(c, TTY_ERRORMASK);
if (ISSET(error, TTY_FE) && c == 0) { /* Break. */
if (ISSET(iflag, IGNBRK))
return (0);
else if (ISSET(iflag, BRKINT)) {
ttyflush(tp, FREAD | FWRITE);
ttysig(tp, TTYSIG_PG1, SIGINT);
return (0);
} else if (ISSET(iflag, PARMRK))
goto parmrk;
} else if ((ISSET(error, TTY_PE) && ISSET(iflag, INPCK)) ||
ISSET(error, TTY_FE)) {
if (ISSET(iflag, IGNPAR))
return (0);
else if (ISSET(iflag, PARMRK)) {
parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
(void)putc(0 | TTY_QUOTE, &tp->t_rawq);
(void)putc(c | TTY_QUOTE, &tp->t_rawq);
return (0);
} else
c = 0;
}
} else if (c == 0377 &&
ISSET(iflag, ISTRIP|IGNPAR|INPCK|PARMRK) == (INPCK|PARMRK)) {
/* "Escape" a valid character of '\377'. */
(void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
(void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
goto endcase;
}
/*
* In tandem mode, check high water mark.
*/
if (ISSET(iflag, IXOFF) || ISSET(tp->t_cflag, CHWFLOW))
ttyblock(tp);
if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP))
CLR(c, 0x80);
if (!ISSET(lflag, EXTPROC)) {
/*
* Check for literal nexting very first
*/
if (ISSET(tp->t_state, TS_LNCH)) {
SET(c, TTY_QUOTE);
CLR(tp->t_state, TS_LNCH);
}
/*
* Scan for special characters. This code
* is really just a big case statement with
* non-constant cases. The bottom of the
* case statement is labeled ``endcase'', so goto
* it after a case match, or similar.
*/
/*
* Control chars which aren't controlled
* by ICANON, ISIG, or IXON.
*/
if (ISSET(lflag, IEXTEN)) {
if (CCEQ(cc[VLNEXT], c)) {
if (ISSET(lflag, ECHO)) {
if (ISSET(lflag, ECHOE)) {
(void)ttyoutput('^', tp);
(void)ttyoutput('\b', tp);
} else
ttyecho(c, tp);
}
SET(tp->t_state, TS_LNCH);
goto endcase;
}
if (CCEQ(cc[VDISCARD], c)) {
if (ISSET(lflag, FLUSHO))
CLR(tp->t_lflag, FLUSHO);
else {
ttyflush(tp, FWRITE);
ttyecho(c, tp);
if (tp->t_rawq.c_cc + tp->t_canq.c_cc)
ttyretype(tp);
SET(tp->t_lflag, FLUSHO);
}
goto startoutput;
}
}
/*
* Signals.
*/
if (ISSET(lflag, ISIG)) {
if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) {
if (!ISSET(lflag, NOFLSH))
ttyflush(tp, FREAD | FWRITE);
ttyecho(c, tp);
ttysig(tp, TTYSIG_PG1, CCEQ(cc[VINTR], c) ?
SIGINT : SIGQUIT);
goto endcase;
}
if (CCEQ(cc[VSUSP], c)) {
if (!ISSET(lflag, NOFLSH))
ttyflush(tp, FREAD);
ttyecho(c, tp);
ttysig(tp, TTYSIG_PG1, SIGTSTP);
goto endcase;
}
}
/*
* Handle start/stop characters.
*/
if (ISSET(iflag, IXON)) {
if (CCEQ(cc[VSTOP], c)) {
if (!ISSET(tp->t_state, TS_TTSTOP)) {
SET(tp->t_state, TS_TTSTOP);
cdev_stop(tp, 0);
return (0);
}
if (!CCEQ(cc[VSTART], c))
return (0);
/*
* if VSTART == VSTOP then toggle
*/
goto endcase;
}
if (CCEQ(cc[VSTART], c))
goto restartoutput;
}
/*
* IGNCR, ICRNL, & INLCR
*/
if (c == '\r') {
if (ISSET(iflag, IGNCR))
goto endcase;
else if (ISSET(iflag, ICRNL))
c = '\n';
} else if (c == '\n' && ISSET(iflag, INLCR))
c = '\r';
}
if (!ISSET(lflag, EXTPROC) && ISSET(lflag, ICANON)) {
/*
* From here on down canonical mode character
* processing takes place.
*/
/*
* erase (^H / ^?)
*/
if (CCEQ(cc[VERASE], c)) {
if (tp->t_rawq.c_cc)
ttyrub(unputc(&tp->t_rawq), tp);
goto endcase;
}
/*
* kill (^U)
*/
if (CCEQ(cc[VKILL], c)) {
if (ISSET(lflag, ECHOKE) &&
tp->t_rawq.c_cc == tp->t_rocount &&
!ISSET(lflag, ECHOPRT))
while (tp->t_rawq.c_cc)
ttyrub(unputc(&tp->t_rawq), tp);
else {
ttyecho(c, tp);
if (ISSET(lflag, ECHOK) ||
ISSET(lflag, ECHOKE))
ttyecho('\n', tp);
FLUSHQ(&tp->t_rawq);
tp->t_rocount = 0;
}
CLR(tp->t_state, TS_LOCAL);
goto endcase;
}
/*
* Extensions to the POSIX.1 GTI set of functions.
*/
if (ISSET(lflag, IEXTEN)) {
/*
* word erase (^W)
*/
if (CCEQ(cc[VWERASE], c)) {
int alt = ISSET(lflag, ALTWERASE);
int ctype;
/*
* erase whitespace
*/
while ((c = unputc(&tp->t_rawq)) == ' ' ||
c == '\t')
ttyrub(c, tp);
if (c == -1)
goto endcase;
/*
* erase last char of word and remember the
* next chars type (for ALTWERASE)
*/
ttyrub(c, tp);
c = unputc(&tp->t_rawq);
if (c == -1)
goto endcase;
if (c == ' ' || c == '\t') {
(void)putc(c, &tp->t_rawq);
goto endcase;
}
ctype = ISALPHA(c);
/*
* erase rest of word
*/
do {
ttyrub(c, tp);
c = unputc(&tp->t_rawq);
if (c == -1)
goto endcase;
} while (c != ' ' && c != '\t' &&
(alt == 0 || ISALPHA(c) == ctype));
(void)putc(c, &tp->t_rawq);
goto endcase;
}
/*
* reprint line (^R)
*/
if (CCEQ(cc[VREPRINT], c)) {
ttyretype(tp);
goto endcase;
}
/*
* ^T - kernel info and generate SIGINFO
*/
if (CCEQ(cc[VSTATUS], c)) {
ttysig(tp, TTYSIG_PG1, SIGINFO);
goto endcase;
}
}
}
/*
* Check for input buffer overflow
*/
if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) {
if (ISSET(iflag, IMAXBEL)) {
if (tp->t_outq.c_cc < tp->t_hiwat)
(void)ttyoutput(CTRL('g'), tp);
} else
ttyflush(tp, FREAD | FWRITE);
goto endcase;
}
/*
* Put data char in q for user and
* wakeup on seeing a line delimiter.
*/
if (putc(c, &tp->t_rawq) >= 0) {
if (!ISSET(lflag, ICANON)) {
ttwakeup(tp);
ttyecho(c, tp);
goto endcase;
}
if (TTBREAKC(c, lflag)) {
tp->t_rocount = 0;
catq(&tp->t_rawq, &tp->t_canq);
ttwakeup(tp);
} else if (tp->t_rocount++ == 0)
tp->t_rocol = tp->t_column;
if (ISSET(tp->t_state, TS_ERASE)) {
/*
* end of prterase \.../
*/
CLR(tp->t_state, TS_ERASE);
(void)ttyoutput('/', tp);
}
i = tp->t_column;
ttyecho(c, tp);
if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) {
/*
* Place the cursor over the '^' of the ^D.
*/
i = uimin(2, tp->t_column - i);
while (i > 0) {
(void)ttyoutput('\b', tp);
i--;
}
}
}
endcase:
/*
* IXANY means allow any character to restart output.
*/
if (ISSET(tp->t_state, TS_TTSTOP) &&
!ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) {
return (0);
}
restartoutput:
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_TTSTOP);
startoutput:
return (ttstart(tp));
}
/*
* Process input of a single character received on a tty.
*
* XXX - this is a hack; all drivers must be changed to acquire the
* lock before calling linesw->l_rint()
*/
int
ttyinput(int c, struct tty *tp)
{
int error;
/*
* Unless the receiver is enabled, drop incoming data.
*/
if (!ISSET(tp->t_cflag, CREAD))
return (0);
mutex_spin_enter(&tty_lock);
error = ttyinput_wlock(c, tp);
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Output a single character on a tty, doing output processing
* as needed (expanding tabs, newline processing, etc.).
* Returns < 0 if it succeeds, otherwise returns the char to resend.
* Must be recursive.
*
* Call with tty lock held.
*/
int
ttyoutput(int c, struct tty *tp)
{
long oflag;
int col, notout;
KASSERT(mutex_owned(&tty_lock));
oflag = tp->t_oflag;
if (!ISSET(oflag, OPOST)) {
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
return (c);
return (-1);
}
/*
* Do tab expansion if OXTABS is set. Special case if we do external
* processing, we don't do the tab expansion because we'll probably
* get it wrong. If tab expansion needs to be done, let it happen
* externally.
*/
CLR(c, ~TTY_CHARMASK);
if (c == '\t' &&
ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) {
c = 8 - (tp->t_column & 7);
if (ISSET(tp->t_lflag, FLUSHO)) {
notout = 0;
} else {
notout = b_to_q(" ", c, &tp->t_outq);
c -= notout;
tk_nout += c;
tp->t_outcc += c;
}
tp->t_column += c;
return (notout ? '\t' : -1);
}
if (c == CEOT && ISSET(oflag, ONOEOT))
return (-1);
/*
* Newline translation: if ONLCR is set,
* translate newline into "\r\n".
*/
if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) {
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq))
return (c);
}
/* If OCRNL is set, translate "\r" into "\n". */
else if (c == '\r' && ISSET(tp->t_oflag, OCRNL))
c = '\n';
/* If ONOCR is set, don't transmit CRs when on column 0. */
else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0)
return (-1);
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
return (c);
col = tp->t_column;
switch (CCLASS(c)) {
case BACKSPACE:
if (col > 0)
--col;
break;
case CONTROL:
break;
case NEWLINE:
if (ISSET(tp->t_oflag, ONLCR | ONLRET))
col = 0;
break;
case RETURN:
col = 0;
break;
case ORDINARY:
++col;
break;
case TAB:
col = (col + 8) & ~7;
break;
}
tp->t_column = col;
return (-1);
}
/*
* Ioctls for all tty devices. Called after line-discipline specific ioctl
* has been called to do discipline-specific functions and/or reject any
* of these ioctl commands.
*/
/* ARGSUSED */
int
ttioctl(struct tty *tp, u_long cmd, void *data, int flag, struct lwp *l)
{
struct proc *p;
struct linesw *lp;
int s, error;
struct pathbuf *pb;
struct nameidata nd;
char infobuf[200];
KASSERT(l != NULL);
p = l->l_proc;
/* If the ioctl involves modification, hang if in the background. */
switch (cmd) {
case TIOCFLUSH:
case TIOCDRAIN:
case TIOCSBRK:
case TIOCCBRK:
case TIOCSTART:
case TIOCSETA:
case TIOCSETD:
case TIOCSLINED:
case TIOCSETAF:
case TIOCSETAW:
#ifdef notdef
case TIOCSPGRP:
case FIOSETOWN:
#endif
case TIOCSTAT:
case TIOCSTI:
case TIOCSWINSZ:
case TIOCSQSIZE:
case TIOCLBIC:
case TIOCLBIS:
case TIOCLSET:
case TIOCSETC:
case OTIOCSETD:
case TIOCSETN:
case TIOCSETP:
case TIOCSLTC:
mutex_spin_enter(&tty_lock);
		while (isbackground(curproc, tp) &&
		    p->p_pgrp->pg_jobc && (p->p_lflag & PL_PPWAIT) == 0 &&
		    !sigismasked(l, SIGTTOU)) {
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTOU, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
			error = ttypause(tp, hz);
			if (error) {
				mutex_spin_exit(&tty_lock);
				return (error);
			}
}
mutex_spin_exit(&tty_lock);
break;
}
switch (cmd) { /* Process the ioctl. */
case FIOASYNC: /* set/clear async i/o */
mutex_spin_enter(&tty_lock);
if (*(int *)data)
SET(tp->t_state, TS_ASYNC);
else
CLR(tp->t_state, TS_ASYNC);
mutex_spin_exit(&tty_lock);
break;
case FIONBIO: /* set/clear non-blocking i/o */
break; /* XXX: delete. */
case FIONREAD: /* get # bytes to read */
mutex_spin_enter(&tty_lock);
*(int *)data = ttnread(tp);
mutex_spin_exit(&tty_lock);
break;
	case FIONWRITE: /* get # bytes written but not yet sent */
mutex_spin_enter(&tty_lock);
*(int *)data = tp->t_outq.c_cc;
mutex_spin_exit(&tty_lock);
break;
	case FIONSPACE: /* get # bytes of free space in the output queue */
mutex_spin_enter(&tty_lock);
*(int *)data = tp->t_outq.c_cn - tp->t_outq.c_cc;
mutex_spin_exit(&tty_lock);
break;
case TIOCEXCL: /* set exclusive use of tty */
mutex_spin_enter(&tty_lock);
SET(tp->t_state, TS_XCLUDE);
mutex_spin_exit(&tty_lock);
break;
case TIOCFLUSH: { /* flush buffers */
int flags = *(int *)data;
if (flags == 0)
flags = FREAD | FWRITE;
else
flags &= FREAD | FWRITE;
mutex_spin_enter(&tty_lock);
ttyflush(tp, flags);
mutex_spin_exit(&tty_lock);
break;
}
case TIOCCONS: { /* become virtual console */
struct tty *ctp;
mutex_enter(&constty_lock);
error = 0;
ctp = atomic_load_relaxed(&constty);
if (*(int *)data) {
if (ctp != NULL && ctp != tp &&
ISSET(ctp->t_state, TS_CARR_ON | TS_ISOPEN) ==
(TS_CARR_ON | TS_ISOPEN)) {
error = EBUSY;
goto unlock_constty;
}
pb = pathbuf_create("/dev/console");
if (pb == NULL) {
error = ENOMEM;
goto unlock_constty;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto unlock_constty;
}
error = VOP_ACCESS(nd.ni_vp, VREAD, l->l_cred);
vput(nd.ni_vp);
pathbuf_destroy(pb);
if (error)
goto unlock_constty;
KASSERT(atomic_load_relaxed(&constty) == ctp ||
atomic_load_relaxed(&constty) == NULL);
atomic_store_release(&constty, tp);
} else if (tp == ctp) {
atomic_store_relaxed(&constty, NULL);
}
unlock_constty: mutex_exit(&constty_lock);
if (error)
return error;
break;
}
case TIOCDRAIN: /* wait till output drained */
if ((error = ttywait(tp)) != 0)
return (error);
break;
case TIOCGETA: { /* get termios struct */
struct termios *t = (struct termios *)data;
memcpy(t, &tp->t_termios, sizeof(struct termios));
break;
}
case TIOCGETD: /* get line discipline (old) */
*(int *)data = tp->t_linesw->l_no;
break;
case TIOCGLINED: /* get line discipline (new) */
(void)strncpy((char *)data, tp->t_linesw->l_name,
TTLINEDNAMELEN - 1);
break;
case TIOCGWINSZ: /* get window size */
*(struct winsize *)data = tp->t_winsize;
break;
case TIOCGQSIZE:
*(int *)data = tp->t_qsize;
break;
case FIOGETOWN:
mutex_enter(&proc_lock);
if (tp->t_session != NULL && !isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_pgrp ? -tp->t_pgrp->pg_id : 0;
mutex_exit(&proc_lock);
break;
case TIOCGPGRP: /* get pgrp of tty */
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
mutex_exit(&proc_lock);
break;
case TIOCGSID: /* get sid of tty */
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_session->s_sid;
mutex_exit(&proc_lock);
break;
#ifdef TIOCHPCL
case TIOCHPCL: /* hang up on last close */
mutex_spin_enter(&tty_lock);
SET(tp->t_cflag, HUPCL);
mutex_spin_exit(&tty_lock);
break;
#endif
case TIOCNXCL: /* reset exclusive use of tty */
mutex_spin_enter(&tty_lock);
CLR(tp->t_state, TS_XCLUDE);
mutex_spin_exit(&tty_lock);
break;
case TIOCOUTQ: /* output queue size */
*(int *)data = tp->t_outq.c_cc;
break;
case TIOCSETA: /* set termios struct */
case TIOCSETAW: /* drain output, set */
case TIOCSETAF: { /* drn out, fls in, set */
struct termios *t = (struct termios *)data;
		if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
			if ((error = ttywait(tp)) != 0)
				return (error);
			if (cmd == TIOCSETAF) {
				mutex_spin_enter(&tty_lock);
				ttyflush(tp, FREAD);
				mutex_spin_exit(&tty_lock);
			}
		}
s = spltty();
/*
* XXXSMP - some drivers call back on us from t_param(), so
* don't take the tty spin lock here.
* require t_param() to unlock upon callback?
*/
/* wanted here: mutex_spin_enter(&tty_lock); */
if (!ISSET(t->c_cflag, CIGNORE)) {
/*
* Set device hardware.
*/
if (tp->t_param && (error = (*tp->t_param)(tp, t))) {
/* wanted here: mutex_spin_exit(&tty_lock); */
splx(s);
return (error);
} else {
tp->t_cflag = t->c_cflag;
tp->t_ispeed = t->c_ispeed;
tp->t_ospeed = t->c_ospeed;
				if (t->c_ospeed == 0)
					ttysig(tp, TTYSIG_LEADER, SIGHUP);
}
ttsetwater(tp);
}
/* delayed lock acquiring */
mutex_spin_enter(&tty_lock);
		if (cmd != TIOCSETAF) {
			if (ISSET(t->c_lflag, ICANON) !=
			    ISSET(tp->t_lflag, ICANON)) {
if (ISSET(t->c_lflag, ICANON)) {
SET(tp->t_lflag, PENDIN);
ttwakeup(tp);
} else {
struct clist tq;
catq(&tp->t_rawq, &tp->t_canq);
tq = tp->t_rawq;
tp->t_rawq = tp->t_canq;
tp->t_canq = tq;
CLR(tp->t_lflag, PENDIN);
}
}
}
tp->t_iflag = t->c_iflag;
tp->t_oflag = t->c_oflag;
/*
* Make the EXTPROC bit read only.
*/
if (ISSET(tp->t_lflag, EXTPROC))
SET(t->c_lflag, EXTPROC);
else
CLR(t->c_lflag, EXTPROC);
tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN);
memcpy(tp->t_cc, t->c_cc, sizeof(t->c_cc));
mutex_spin_exit(&tty_lock);
splx(s);
break;
}
case TIOCSETD: /* set line discipline (old) */
lp = ttyldisc_lookup_bynum(*(int *)data);
goto setldisc;
case TIOCSLINED: { /* set line discipline (new) */
char *name = (char *)data;
dev_t device;
/* Null terminate to prevent buffer overflow */
name[TTLINEDNAMELEN - 1] = '\0';
lp = ttyldisc_lookup(name);
setldisc:
if (lp == NULL)
return (ENXIO);
if (lp != tp->t_linesw) {
device = tp->t_dev;
s = spltty();
(*tp->t_linesw->l_close)(tp, flag);
error = (*lp->l_open)(device, tp);
if (error) {
(void)(*tp->t_linesw->l_open)(device, tp);
splx(s);
ttyldisc_release(lp);
return (error);
}
ttyldisc_release(tp->t_linesw);
tp->t_linesw = lp;
splx(s);
} else {
/* Drop extra reference. */
ttyldisc_release(lp);
}
break;
}
case TIOCSTART: /* start output, like ^Q */
mutex_spin_enter(&tty_lock);
if (ISSET(tp->t_state, TS_TTSTOP) ||
ISSET(tp->t_lflag, FLUSHO)) {
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_TTSTOP);
ttstart(tp);
}
mutex_spin_exit(&tty_lock);
break;
case TIOCSTI: /* simulate terminal input */
if ((error = kauth_authorize_device_tty(l->l_cred,
KAUTH_DEVICE_TTY_STI, tp)) != 0) {
if (!ISSET(flag, FREAD))
return EPERM;
if (!isctty(p, tp))
return EACCES;
if (tp->t_session->s_leader->p_cred != p->p_cred)
return error;
}
(*tp->t_linesw->l_rint)(*(u_char *)data, tp);
break;
case TIOCSTOP: /* stop output, like ^S */
{
mutex_spin_enter(&tty_lock);
		if (!ISSET(tp->t_state, TS_TTSTOP)) {
			SET(tp->t_state, TS_TTSTOP);
cdev_stop(tp, 0);
}
mutex_spin_exit(&tty_lock);
break;
}
case TIOCSCTTY: /* become controlling tty */
mutex_enter(&proc_lock);
mutex_spin_enter(&tty_lock);
/* Session ctty vnode pointer set in vnode layer. */
if (!SESS_LEADER(p) ||
((p->p_session->s_ttyvp || tp->t_session) &&
(tp->t_session != p->p_session))) {
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
return (EPERM);
}
/*
* `p_session' acquires a reference.
* But note that if `t_session' is set at this point,
* it must equal `p_session', in which case the session
* already has the correct reference count.
*/
if (tp->t_session == NULL) {
proc_sesshold(p->p_session);
}
tp->t_session = p->p_session;
tp->t_pgrp = p->p_pgrp;
p->p_session->s_ttyp = tp;
p->p_lflag |= PL_CONTROLT;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
case FIOSETOWN: { /* set pgrp of tty */
pid_t pgid = *(pid_t *)data;
struct pgrp *pgrp;
mutex_enter(&proc_lock);
if (tp->t_session != NULL && !isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
if (pgid < 0) {
if (pgid == INT_MIN) {
mutex_exit(&proc_lock);
return (EINVAL);
}
pgrp = pgrp_find(-pgid);
			if (pgrp == NULL) {
				mutex_exit(&proc_lock);
return (EINVAL);
}
} else {
struct proc *p1;
p1 = proc_find(pgid);
if (!p1) {
mutex_exit(&proc_lock);
return (ESRCH);
}
pgrp = p1->p_pgrp;
}
if (pgrp->pg_session != p->p_session) {
mutex_exit(&proc_lock);
return (EPERM);
}
mutex_spin_enter(&tty_lock);
tp->t_pgrp = pgrp;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
}
case TIOCSPGRP: { /* set pgrp of tty */
struct pgrp *pgrp;
pid_t pgid = *(pid_t *)data;
if (pgid == NO_PGID)
return EINVAL;
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
pgrp = pgrp_find(pgid);
if (pgrp == NULL || pgrp->pg_session != p->p_session) {
mutex_exit(&proc_lock);
return (EPERM);
}
mutex_spin_enter(&tty_lock);
tp->t_pgrp = pgrp;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
}
case TIOCSTAT: /* get load avg stats */
mutex_enter(&proc_lock);
ttygetinfo(tp, 0, infobuf, sizeof(infobuf));
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
ttyputinfo(tp, infobuf);
mutex_spin_exit(&tty_lock);
break;
case TIOCSWINSZ: /* set window size */
mutex_spin_enter(&tty_lock);
if (memcmp((void *)&tp->t_winsize, data,
sizeof(struct winsize))) {
tp->t_winsize = *(struct winsize *)data;
ttysig(tp, TTYSIG_PG1, SIGWINCH);
}
mutex_spin_exit(&tty_lock);
break;
case TIOCSQSIZE:
if ((error = tty_get_qsize(&s, *(int *)data)) == 0 &&
s != tp->t_qsize)
error = tty_set_qsize(tp, s);
return error;
case TIOCSBRK:
case TIOCCBRK:
case TIOCSDTR:
case TIOCCDTR:
case TIOCSFLAGS:
case TIOCGFLAGS:
case TIOCMSET:
case TIOCMGET:
case TIOCMBIS:
case TIOCMBIC:
/* Handled by the driver layer */
return EPASSTHROUGH;
case TIOCEXT:
case TIOCPTSNAME:
case TIOCGRANTPT:
case TIOCPKT:
case TIOCUCNTL:
case TIOCREMOTE:
case TIOCSIG:
/* for ptys */
return EPASSTHROUGH;
default:
/* Pass through various console ioctls */
switch (IOCGROUP(cmd)) {
case 'c': /* syscons console */
case 'v': /* usl console, video - where one letter */
case 'K': /* usl console, keyboard - aint enough */
case 'V': /* pcvt compat */
case 'W': /* wscons console */
return EPASSTHROUGH;
default:
break;
}
/* We may have to load the compat_60 module for this. */
(void)module_autoload("compat_60", MODULE_CLASS_EXEC);
MODULE_HOOK_CALL(tty_ttioctl_60_hook,
(tp, cmd, data, flag, l), enosys(), error);
if (error != EPASSTHROUGH)
return error;
/* We may have to load the compat_43 module for this. */
(void)module_autoload("compat_43", MODULE_CLASS_EXEC);
MODULE_HOOK_CALL(tty_ttioctl_43_hook,
(tp, cmd, data, flag, l), enosys(), error);
return error;
}
return (0);
}
int
ttpoll(struct tty *tp, int events, struct lwp *l)
{
int revents;
revents = 0;
mutex_spin_enter(&tty_lock);
if (events & (POLLIN | POLLRDNORM))
if (ttnread(tp) > 0)
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (tp->t_outq.c_cc <= tp->t_lowat)
revents |= events & (POLLOUT | POLLWRNORM);
if (events & POLLHUP)
if (!CONNECTED(tp))
revents |= POLLHUP;
if (revents == 0) {
if (events & (POLLIN | POLLHUP | POLLRDNORM))
selrecord(l, &tp->t_rsel);
if (events & (POLLOUT | POLLWRNORM))
selrecord(l, &tp->t_wsel);
}
mutex_spin_exit(&tty_lock);
return (revents);
}
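/*
 * kqueue(2) filters for ttys: EVFILT_READ fires when ttnread() reports
 * pending input, EVFILT_WRITE when the output queue has drained to the
 * low water mark and the line is still connected.
 */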
static void
filt_ttyrdetach(struct knote *kn)
{
struct tty *tp;
tp = kn->kn_hook;
mutex_spin_enter(&tty_lock);
selremove_knote(&tp->t_rsel, kn);
mutex_spin_exit(&tty_lock);
}
static int
filt_ttyread(struct knote *kn, long hint)
{
struct tty *tp;
int rv;
tp = kn->kn_hook;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&tty_lock);
kn->kn_data = ttnread(tp);
rv = kn->kn_data > 0;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&tty_lock);
return rv;
}
static void
filt_ttywdetach(struct knote *kn)
{
struct tty *tp;
tp = kn->kn_hook;
mutex_spin_enter(&tty_lock);
selremove_knote(&tp->t_wsel, kn);
mutex_spin_exit(&tty_lock);
}
static int
filt_ttywrite(struct knote *kn, long hint)
{
struct tty *tp;
int canwrite;
tp = kn->kn_hook;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&tty_lock);
kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc;
canwrite = (tp->t_outq.c_cc <= tp->t_lowat) && CONNECTED(tp);
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&tty_lock);
return (canwrite);
}
static const struct filterops ttyread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_ttyrdetach,
.f_event = filt_ttyread,
};
static const struct filterops ttywrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_ttywdetach,
.f_event = filt_ttywrite,
};
int
ttykqfilter(dev_t dev, struct knote *kn)
{
struct tty *tp;
struct selinfo *sip;
if ((tp = cdev_tty(dev)) == NULL)
return (ENXIO);
switch (kn->kn_filter) {
case EVFILT_READ:
sip = &tp->t_rsel;
kn->kn_fop = &ttyread_filtops;
break;
case EVFILT_WRITE:
sip = &tp->t_wsel;
kn->kn_fop = &ttywrite_filtops;
break;
default:
return EINVAL;
}
kn->kn_hook = tp;
mutex_spin_enter(&tty_lock);
selrecord_knote(sip, kn);
mutex_spin_exit(&tty_lock);
return (0);
}
/*
* Find the number of chars ready to be read from this tty.
* Call with the tty lock held.
*/
static int
ttnread(struct tty *tp)
{
int nread;
	KASSERT(mutex_owned(&tty_lock));
	if (ISSET(tp->t_lflag, PENDIN))
		ttypend(tp);
nread = tp->t_canq.c_cc;
if (!ISSET(tp->t_lflag, ICANON)) {
nread += tp->t_rawq.c_cc;
if (nread < tp->t_cc[VMIN] && !tp->t_cc[VTIME])
nread = 0;
}
return (nread);
}
/*
* Wait for output to drain, or if this times out, flush it.
*/
static int
ttywait_timo(struct tty *tp, int timo)
{
int error;
error = 0;
mutex_spin_enter(&tty_lock);
	while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
	    CONNECTED(tp) && tp->t_oproc) {
(*tp->t_oproc)(tp);
error = ttysleep(tp, &tp->t_outcv, true, timo);
if (error == EWOULDBLOCK)
ttyflush(tp, FWRITE);
if (error)
break;
}
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Wait for output to drain.
*/
int
ttywait(struct tty *tp)
{
return ttywait_timo(tp, 0);
}
/*
* Wait for output to drain (bounded by a timeout), then flush the input queue.
*/
int
ttywflush(struct tty *tp)
{
int error;
error = ttywait_timo(tp, 5 * hz);
if (error == 0 || error == EWOULDBLOCK) {
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD);
mutex_spin_exit(&tty_lock);
}
return (error);
}
/*
* Flush tty read and/or write queues, notifying anyone waiting.
* Call with the tty lock held.
*/
void
ttyflush(struct tty *tp, int rw)
{
	KASSERT(mutex_owned(&tty_lock));
	if (rw & FREAD) {
		FLUSHQ(&tp->t_canq);
		FLUSHQ(&tp->t_rawq);
tp->t_rocount = 0;
tp->t_rocol = 0;
CLR(tp->t_state, TS_LOCAL);
ttwakeup(tp);
}
if (rw & FWRITE) {
CLR(tp->t_state, TS_TTSTOP);
cdev_stop(tp, rw);
FLUSHQ(&tp->t_outq);
cv_broadcast(&tp->t_outcv);
selnotify(&tp->t_wsel, 0, NOTE_SUBMIT);
}
}
/*
* Copy in the default termios characters.
*/
void
ttychars(struct tty *tp)
{
memcpy(tp->t_cc, ttydefchars, sizeof(ttydefchars));
}
/*
* Send stop character on input overflow.
* Call with the tty lock held.
*/
static void
ttyblock(struct tty *tp)
{
int total;
KASSERT(mutex_owned(&tty_lock));
total = tp->t_rawq.c_cc + tp->t_canq.c_cc;
if (tp->t_rawq.c_cc > TTYHOG) {
ttyflush(tp, FREAD | FWRITE);
CLR(tp->t_state, TS_TBLOCK);
}
/*
* Block further input iff: current input > threshold
* AND input is available to user program.
*/
if (total >= TTYHOG / 2 &&
!ISSET(tp->t_state, TS_TBLOCK) &&
(!ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0)) {
if (ISSET(tp->t_iflag, IXOFF) &&
tp->t_cc[VSTOP] != _POSIX_VDISABLE &&
putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) {
SET(tp->t_state, TS_TBLOCK);
ttstart(tp);
}
/* Try to block remote output via hardware flow control. */
if (ISSET(tp->t_cflag, CHWFLOW) && tp->t_hwiflow &&
(*tp->t_hwiflow)(tp, 1) != 0)
SET(tp->t_state, TS_TBLOCK);
}
}
/*
* Delayed line discipline output
*/
void
ttrstrt(void *tp_arg)
{
struct tty *tp;
#ifdef DIAGNOSTIC
if (tp_arg == NULL)
panic("ttrstrt");
#endif
tp = tp_arg;
mutex_spin_enter(&tty_lock);
CLR(tp->t_state, TS_TIMEOUT);
ttstart(tp); /* XXX - Shouldn't this be tp->l_start(tp)? */
mutex_spin_exit(&tty_lock);
}
/*
* start a line discipline
* Always call with tty lock held?
*/
int
ttstart(struct tty *tp)
{
	if (tp->t_oproc != NULL)	/* XXX: Kludge for pty. */
		(*tp->t_oproc)(tp);
return (0);
}
/*
* "close" a line discipline
*/
int
ttylclose(struct tty *tp, int flag)
{
if (flag & FNONBLOCK) {
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD | FWRITE);
mutex_spin_exit(&tty_lock);
} else
ttywflush(tp);
return (0);
}
/*
* Handle modem control transition on a tty.
* Flag indicates new state of carrier.
* Returns 0 if the line should be turned off, otherwise 1.
*/
int
ttymodem(struct tty *tp, int flag)
{
mutex_spin_enter(&tty_lock);
if (flag == 0) {
if (ISSET(tp->t_state, TS_CARR_ON)) {
/*
* Lost carrier.
*/
CLR(tp->t_state, TS_CARR_ON);
			if (ISSET(tp->t_state, TS_ISOPEN) && !CONNECTED(tp)) {
				ttysig(tp, TTYSIG_LEADER, SIGHUP);
ttyflush(tp, FREAD | FWRITE);
mutex_spin_exit(&tty_lock);
return (0);
}
}
} else {
if (!ISSET(tp->t_state, TS_CARR_ON)) {
/*
* Carrier now on.
*/
SET(tp->t_state, TS_CARR_ON);
ttwakeup(tp);
}
}
mutex_spin_exit(&tty_lock);
return (1);
}
/*
* Default modem control routine (for other line disciplines).
* Return argument flag, to turn off device on carrier drop.
*/
int
nullmodem(struct tty *tp, int flag)
{
mutex_spin_enter(&tty_lock);
if (flag)
SET(tp->t_state, TS_CARR_ON);
else {
CLR(tp->t_state, TS_CARR_ON);
if (!CONNECTED(tp)) {
ttysig(tp, TTYSIG_LEADER, SIGHUP);
mutex_spin_exit(&tty_lock);
return (0);
}
}
mutex_spin_exit(&tty_lock);
return (1);
}
/*
* Reinput pending characters after state switch.
*/
void
ttypend(struct tty *tp)
{
struct clist tq;
int c;
KASSERT(mutex_owned(&tty_lock));
CLR(tp->t_lflag, PENDIN);
SET(tp->t_state, TS_TYPEN);
tq = tp->t_rawq;
tp->t_rawq.c_cc = 0;
tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0;
while ((c = getc(&tq)) >= 0)
ttyinput_wlock(c, tp);
CLR(tp->t_state, TS_TYPEN);
}
/*
* Process a read call on a tty device.
*/
int
ttread(struct tty *tp, struct uio *uio, int flag)
{
struct clist *qp;
u_char *cc;
struct proc *p;
int c, first, error, has_stime, last_cc;
long lflag, slp;
struct timeval now, stime;
if (uio->uio_resid == 0)
return 0;
stime.tv_usec = 0; /* XXX gcc */
stime.tv_sec = 0; /* XXX gcc */
cc = tp->t_cc;
p = curproc;
error = 0;
has_stime = 0;
last_cc = 0;
slp = 0;
loop:
mutex_spin_enter(&tty_lock);
lflag = tp->t_lflag;
/*
* take pending input first
*/
if (ISSET(lflag, PENDIN))
ttypend(tp);
/*
* Hang process if it's in the background.
*/
if (isbackground(p, tp)) {
if (sigismasked(curlwp, SIGTTIN) ||
p->p_lflag & PL_PPWAIT || p->p_pgrp->pg_jobc == 0) {
mutex_spin_exit(&tty_lock);
return (EIO);
}
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTIN, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
error = ttypause(tp, hz);
mutex_spin_exit(&tty_lock);
if (error)
return (error);
goto loop;
}
if (!ISSET(lflag, ICANON)) {
int m = cc[VMIN];
long t = cc[VTIME];
qp = &tp->t_rawq;
/*
* Check each of the four combinations.
* (m > 0 && t == 0) is the normal read case.
* It should be fairly efficient, so we check that and its
* companion case (m == 0 && t == 0) first.
* For the other two cases, we compute the target sleep time
* into slp.
*/
if (t == 0) {
if (qp->c_cc < m)
goto sleep;
goto read;
}
t *= hz; /* time in deca-ticks */
/*
* Time difference in deca-ticks, split division to avoid numeric overflow.
* Ok for hz < ~200kHz
*/
#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 10 * hz + \
((t1).tv_usec - (t2).tv_usec) / 100 * hz / 1000)
if (m > 0) {
if (qp->c_cc <= 0)
goto sleep;
if (qp->c_cc >= m)
goto read;
if (!has_stime) {
/* first character, start timer */
has_stime = 1;
getmicrotime(&stime);
slp = t;
} else if (qp->c_cc > last_cc) {
/* got a character, restart timer */
getmicrotime(&stime);
slp = t;
} else {
/* nothing, check expiration */
getmicrotime(&now);
slp = t - diff(now, stime);
}
} else { /* m == 0 */
if (qp->c_cc > 0)
goto read;
if (!has_stime) {
has_stime = 1;
getmicrotime(&stime);
slp = t;
} else {
getmicrotime(&now);
slp = t - diff(now, stime);
}
}
last_cc = qp->c_cc;
#undef diff
if (slp > 0) {
/*
* Convert deca-ticks back to ticks.
* Rounding down may make us wake up just short
* of the target, so we round up.
* Maybe we should do 'slp/10 + 1' because the
			 * first tick may be almost immediate.
* However it is more useful for a program that sets
* VTIME=10 to wakeup every second not every 1.01
* seconds (if hz=100).
*/
slp = (slp + 9)/ 10;
goto sleep;
}
} else if ((qp = &tp->t_canq)->c_cc <= 0) {
int carrier;
sleep:
/*
* If there is no input, sleep on rawq
* awaiting hardware receipt and notification.
* If we have data, we don't need to check for carrier.
*/
carrier = CONNECTED(tp);
if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) {
mutex_spin_exit(&tty_lock);
return (0); /* EOF */
}
if (!has_stime || slp <= 0) {
if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
return (EWOULDBLOCK);
}
}
error = ttysleep(tp, &tp->t_rawcv, true, slp);
mutex_spin_exit(&tty_lock);
/* VMIN == 0: any quantity read satisfies */
if (cc[VMIN] == 0 && error == EWOULDBLOCK)
return (0);
if (error && error != EWOULDBLOCK)
return (error);
goto loop;
}
read:
/*
* Input present, check for input mapping and processing.
*/
first = 1;
while ((c = getc(qp)) >= 0) {
/*
* delayed suspend (^Y)
*/
if (CCEQ(cc[VDSUSP], c) &&
ISSET(lflag, IEXTEN|ISIG) == (IEXTEN|ISIG)) {
ttysig(tp, TTYSIG_PG1, SIGTSTP);
if (first) {
error = ttypause(tp, hz);
if (error)
break;
mutex_spin_exit(&tty_lock);
goto loop;
}
break;
}
/*
* Interpret EOF only in canonical mode.
*/
if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON))
break;
/*
* Give user character.
*/
mutex_spin_exit(&tty_lock);
error = ureadc(c, uio);
mutex_spin_enter(&tty_lock);
if (error)
break;
if (uio->uio_resid == 0)
break;
/*
* In canonical mode check for a "break character"
* marking the end of a "line of input".
*/
if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag))
break;
first = 0;
}
/*
* Look to unblock output now that (presumably)
* the input queue has gone down.
*/
if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG / 5) {
if (ISSET(tp->t_iflag, IXOFF) &&
cc[VSTART] != _POSIX_VDISABLE &&
putc(cc[VSTART], &tp->t_outq) == 0) {
CLR(tp->t_state, TS_TBLOCK);
ttstart(tp);
}
/* Try to unblock remote output via hardware flow control. */
if (ISSET(tp->t_cflag, CHWFLOW) && tp->t_hwiflow &&
(*tp->t_hwiflow)(tp, 0) != 0)
CLR(tp->t_state, TS_TBLOCK);
}
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Check the output queue on tp for space for a kernel message (from uprintf
* or tprintf). Allow some space over the normal hiwater mark so we don't
* lose messages due to normal flow control, but don't let the tty run amok.
* Sleeps here are not interruptible, but we return prematurely if new signals
* arrive.
* Call with tty lock held.
*/
static int
ttycheckoutq_wlock(struct tty *tp)
{
int hiwat;
KASSERT(mutex_owned(&tty_lock));
hiwat = tp->t_hiwat;
if (tp->t_outq.c_cc > hiwat + 200)
if (tp->t_outq.c_cc > hiwat) {
ttstart(tp);
return (0);
}
return (1);
}
int
ttycheckoutq(struct tty *tp)
{
int r;
mutex_spin_enter(&tty_lock);
r = ttycheckoutq_wlock(tp);
mutex_spin_exit(&tty_lock);
return (r);
}
/*
* Process a write call on a tty device.
*/
int
ttwrite(struct tty *tp, struct uio *uio, int flag)
{
u_char *cp;
struct proc *p;
int cc, cc0, ce, i, hiwat, error;
u_char obuf[OBUFSIZ];
cp = NULL;
hiwat = tp->t_hiwat;
error = 0;
cc0 = cc = 0;
loop:
mutex_spin_enter(&tty_lock);
if (!CONNECTED(tp)) {
if (ISSET(tp->t_state, TS_ISOPEN)) {
mutex_spin_exit(&tty_lock);
return (EIO);
} else if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
error = EWOULDBLOCK;
goto out;
} else {
/* Sleep awaiting carrier. */
error = ttysleep(tp, &tp->t_rawcv, true, 0);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
}
/*
* Hang the process if it's in the background.
*/
p = curproc;
if (isbackground(p, tp) &&
ISSET(tp->t_lflag, TOSTOP) && (p->p_lflag & PL_PPWAIT) == 0 &&
!sigismasked(curlwp, SIGTTOU)) {
if (p->p_pgrp->pg_jobc == 0) {
error = EIO;
mutex_spin_exit(&tty_lock);
goto out;
}
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTOU, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
error = ttypause(tp, hz);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
mutex_spin_exit(&tty_lock);
/*
* Process the user's data in at most OBUFSIZ chunks. Perform any
* output translation. Keep track of high water mark, sleep on
* overflow awaiting device aid in acquiring new space.
*/
while (uio->uio_resid > 0 || cc > 0) {
if (ISSET(tp->t_lflag, FLUSHO)) {
uio->uio_resid = 0;
return (0);
}
if (tp->t_outq.c_cc > hiwat)
goto ovhiwat;
/*
* Grab a hunk of data from the user, unless we have some
* leftover from last time.
*/
if (cc == 0) {
uioskip(cc0, uio);
cc0 = cc = uimin(uio->uio_resid, OBUFSIZ);
cp = obuf;
error = uiopeek(cp, cc, uio);
if (error) {
cc = 0;
goto out;
}
}
/*
* If nothing fancy need be done, grab those characters we
* can handle without any of ttyoutput's processing and
* just transfer them to the output q. For those chars
* which require special processing (as indicated by the
* bits in char_type), call ttyoutput. After processing
* a hunk of data, look for FLUSHO so ^O's will take effect
* immediately.
*/
mutex_spin_enter(&tty_lock);
while (cc > 0) {
if (!ISSET(tp->t_oflag, OPOST))
ce = cc;
else {
ce = cc - scanc((u_int)cc, cp, char_type,
CCLASSMASK);
/*
* If ce is zero, then we're processing
* a special character through ttyoutput.
*/
if (ce == 0) {
tp->t_rocount = 0;
if (ttyoutput(*cp, tp) >= 0) {
/* out of space */
mutex_spin_exit(&tty_lock);
goto overfull;
}
cp++;
cc--;
if (ISSET(tp->t_lflag, FLUSHO) ||
tp->t_outq.c_cc > hiwat) {
mutex_spin_exit(&tty_lock);
goto ovhiwat;
}
continue;
}
}
/*
* A bunch of normal characters have been found.
* Transfer them en masse to the output queue and
* continue processing at the top of the loop.
* If there are any further characters in this
* <= OBUFSIZ chunk, the first should be a character
* requiring special handling by ttyoutput.
*/
tp->t_rocount = 0;
i = b_to_q(cp, ce, &tp->t_outq);
ce -= i;
tp->t_column += ce;
cp += ce, cc -= ce, tk_nout += ce;
tp->t_outcc += ce;
if (i > 0) {
/* out of space */
mutex_spin_exit(&tty_lock);
goto overfull;
}
if (ISSET(tp->t_lflag, FLUSHO) ||
tp->t_outq.c_cc > hiwat)
break;
}
ttstart(tp);
mutex_spin_exit(&tty_lock);
}
out:
KASSERTMSG(error || cc == 0, "error=%d cc=%d", error, cc);
KASSERTMSG(cc0 >= cc, "cc0=%d cc=%d", cc0, cc);
uioskip(cc0 - cc, uio);
return (error);
overfull:
/*
* Since we are using ring buffers, if we can't insert any more into
* the output queue, we can assume the ring is full and that someone
* forgot to set the high water mark correctly. We set it and then
* proceed as normal.
*/
hiwat = tp->t_outq.c_cc - 1;
ovhiwat:
mutex_spin_enter(&tty_lock);
ttstart(tp);
/*
* This can only occur if FLUSHO is set in t_lflag,
* or if ttstart/oproc is synchronous (or very fast).
*/
if (tp->t_outq.c_cc <= hiwat) {
mutex_spin_exit(&tty_lock);
goto loop;
}
if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
error = EWOULDBLOCK;
goto out;
}
error = ttysleep(tp, &tp->t_outcv, true, 0);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
/*
* Try to pull more output from the producer. Return non-zero if
* there is output ready to be sent.
*/
bool
ttypull(struct tty *tp)
{
/* XXXSMP not yet KASSERT(mutex_owned(&tty_lock)); */
if (tp->t_outq.c_cc <= tp->t_lowat) {
cv_broadcast(&tp->t_outcv);
selnotify(&tp->t_wsel, 0, NOTE_SUBMIT);
}
return tp->t_outq.c_cc != 0;
}
/*
* Rubout one character from the rawq of tp
* as cleanly as possible.
* Called with tty lock held.
*/
void
ttyrub(int c, struct tty *tp)
{
u_char *cp;
int savecol, tabc;
KASSERT(mutex_owned(&tty_lock));
if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC))
return;
CLR(tp->t_lflag, FLUSHO);
if (ISSET(tp->t_lflag, ECHOE)) {
if (tp->t_rocount == 0) {
/*
* Screwed by ttwrite; retype
*/
ttyretype(tp);
return;
}
if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE))
ttyrubo(tp, 2);
else {
CLR(c, ~TTY_CHARMASK);
switch (CCLASS(c)) {
case ORDINARY:
ttyrubo(tp, 1);
break;
case BACKSPACE:
case CONTROL:
case NEWLINE:
case RETURN:
case VTAB:
if (ISSET(tp->t_lflag, ECHOCTL))
ttyrubo(tp, 2);
break;
case TAB:
if (tp->t_rocount < tp->t_rawq.c_cc) {
ttyretype(tp);
return;
}
savecol = tp->t_column;
SET(tp->t_state, TS_CNTTB);
SET(tp->t_lflag, FLUSHO);
tp->t_column = tp->t_rocol;
for (cp = firstc(&tp->t_rawq, &tabc); cp;
cp = nextc(&tp->t_rawq, cp, &tabc))
ttyecho(tabc, tp);
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_CNTTB);
/* savecol will now be length of the tab. */
savecol -= tp->t_column;
tp->t_column += savecol;
if (savecol > 8)
savecol = 8; /* overflow screw */
while (--savecol >= 0)
(void)ttyoutput('\b', tp);
break;
default: /* XXX */
(void)printf("ttyrub: would panic c = %d, "
"val = %d\n", c, CCLASS(c));
}
}
} else if (ISSET(tp->t_lflag, ECHOPRT)) {
if (!ISSET(tp->t_state, TS_ERASE)) {
SET(tp->t_state, TS_ERASE);
(void)ttyoutput('\\', tp);
}
ttyecho(c, tp);
} else
ttyecho(tp->t_cc[VERASE], tp);
--tp->t_rocount;
}
/*
* Back over cnt characters, erasing them.
* Called with tty lock held.
*/
static void
ttyrubo(struct tty *tp, int cnt)
{
KASSERT(mutex_owned(&tty_lock));
while (cnt-- > 0) {
(void)ttyoutput('\b', tp);
(void)ttyoutput(' ', tp);
(void)ttyoutput('\b', tp);
}
}
/*
* ttyretype --
* Reprint the rawq line. Note, it is assumed that c_cc has already
* been checked.
*
* Called with tty lock held.
*/
void
ttyretype(struct tty *tp)
{
u_char *cp;
int c;
KASSERT(mutex_owned(&tty_lock));
/* Echo the reprint character. */
if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE)
ttyecho(tp->t_cc[VREPRINT], tp);
(void)ttyoutput('\n', tp);
for (cp = firstc(&tp->t_canq, &c); cp; cp = nextc(&tp->t_canq, cp, &c))
ttyecho(c, tp);
for (cp = firstc(&tp->t_rawq, &c); cp; cp = nextc(&tp->t_rawq, cp, &c))
ttyecho(c, tp);
CLR(tp->t_state, TS_ERASE);
tp->t_rocount = tp->t_rawq.c_cc;
tp->t_rocol = 0;
}
/*
* Echo a typed character to the terminal.
* Called with tty lock held.
*/
static void
ttyecho(int c, struct tty *tp)
{
KASSERT(mutex_owned(&tty_lock));
if (!ISSET(tp->t_state, TS_CNTTB))
CLR(tp->t_lflag, FLUSHO);
if ((!ISSET(tp->t_lflag, ECHO) &&
(!ISSET(tp->t_lflag, ECHONL) || c != '\n')) ||
ISSET(tp->t_lflag, EXTPROC))
return;
if (((ISSET(tp->t_lflag, ECHOCTL) &&
(ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n')) ||
ISSET(c, TTY_CHARMASK) == 0177)) {
(void)ttyoutput('^', tp);
CLR(c, ~TTY_CHARMASK);
if (c == 0177)
c = '?';
else
c += 'A' - 1;
}
(void)ttyoutput(c, tp);
}
/*
* Wake up any readers on a tty.
* Called with tty lock held.
*/
void
ttwakeup(struct tty *tp)
{
	KASSERT(mutex_owned(&tty_lock));
selnotify(&tp->t_rsel, 0, NOTE_SUBMIT);
	if (ISSET(tp->t_state, TS_ASYNC))
		ttysig(tp, TTYSIG_PG2, SIGIO);
cv_broadcast(&tp->t_rawcv);
}
/*
* Look up a code for a specified speed in a conversion table;
* used by drivers to map software speed values to hardware parameters.
*/
int
ttspeedtab(int speed, const struct speedtab *table)
{
for (; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return (-1);
}
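/*
 * Example (hypothetical driver table; real drivers define their own
 * hardware-specific codes and terminate the table with sp_speed == -1):
 *
 *	static const struct speedtab foo_speedtab[] = {
 *		{    300, FOO_B300    },
 *		{   9600, FOO_B9600   },
 *		{ 115200, FOO_B115200 },
 *		{     -1, -1          },
 *	};
 *
 *	code = ttspeedtab(tp->t_ospeed, foo_speedtab);
 *	if (code < 0)
 *		return (EINVAL);	- speed not supported
 */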
/*
* Set tty hi and low water marks.
*
* Try to arrange the dynamics so there's about one second
* from hi to low water.
*/
void
ttsetwater(struct tty *tp)
{
int cps, x;
/* XXX not yet KASSERT(mutex_owned(&tty_lock)); */
#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x))
cps = tp->t_ospeed / 10;
tp->t_lowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT);
x += cps;
x = CLAMP(x, TTMAXHIWAT, TTMINHIWAT);
tp->t_hiwat = roundup(x, TTROUND);
#undef CLAMP
}
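/*
 * For example, at 9600 baud cps is 960, so the low water mark is
 * roughly cps/2 (subject to the TTMINLOWAT/TTMAXLOWAT clamps) and the
 * high water mark roughly low + cps, clamped and rounded up to TTROUND:
 * about one second's worth of output between the two marks.
 */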
/*
* Prepare report on state of foreground process group.
* Call with &proc_lock held.
*/
void
ttygetinfo(struct tty *tp, int fromsig, char *buf, size_t bufsz)
{
struct lwp *l;
struct proc *p, *pick = NULL;
struct timeval utime, stime;
int tmp;
fixpt_t pctcpu = 0;
const char *msg = NULL;
char lmsg[100];
long rss;
bool again = false;
KASSERT(mutex_owned(&proc_lock));
*buf = '\0';
retry:
if (tp->t_session == NULL)
msg = "not a controlling terminal\n";
else if (tp->t_pgrp == NULL)
msg = "no foreground process group\n";
else if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == NULL)
msg = "empty foreground process group\n";
else {
/* Pick interesting process. */
for (; p != NULL; p = LIST_NEXT(p, p_pglist)) {
struct proc *oldpick;
if (pick == NULL) {
pick = p;
continue;
}
if (pick->p_lock < p->p_lock) {
mutex_enter(pick->p_lock);
mutex_enter(p->p_lock);
} else if (pick->p_lock > p->p_lock) {
mutex_enter(p->p_lock);
mutex_enter(pick->p_lock);
} else
mutex_enter(p->p_lock);
oldpick = pick;
if (proc_compare_wrapper(pick, p))
pick = p;
mutex_exit(p->p_lock);
if (p->p_lock != oldpick->p_lock)
mutex_exit(oldpick->p_lock);
}
if (pick != NULL) {
mutex_enter(pick->p_lock);
if (P_ZOMBIE(pick)) {
mutex_exit(pick->p_lock);
pick = NULL;
if (!again) {
again = true;
goto retry;
}
msg = "found only zombie processes\n";
}
if (pick && fromsig &&
(SIGACTION_PS(pick->p_sigacts, SIGINFO).sa_flags &
SA_NOKERNINFO)) {
mutex_exit(pick->p_lock);
return;
}
}
}
/* Print load average. */
tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
snprintf(lmsg, sizeof(lmsg), "load: %d.%02d ", tmp / 100, tmp % 100);
strlcat(buf, lmsg, bufsz);
if (pick == NULL) {
strlcat(buf, msg, bufsz);
return;
}
snprintf(lmsg, sizeof(lmsg), " cmd: %s %d [", pick->p_comm,
pick->p_pid);
strlcat(buf, lmsg, bufsz);
KASSERT(mutex_owned(pick->p_lock));
LIST_FOREACH(l, &pick->p_lwps, l_sibling) {
const char *lp;
lwp_lock(l);
#ifdef LWP_PC
#define FMT_RUN "%#"PRIxVADDR
#define VAL_RUNNING (vaddr_t)LWP_PC(l)
#define VAL_RUNNABLE (vaddr_t)LWP_PC(l)
#else
#define FMT_RUN "%s"
#define VAL_RUNNING "running"
#define VAL_RUNNABLE "runnable"
#endif
switch (l->l_stat) {
case LSONPROC:
snprintf(lmsg, sizeof(lmsg), FMT_RUN"/%d", VAL_RUNNING,
cpu_index(l->l_cpu));
lp = lmsg;
break;
case LSRUN:
snprintf(lmsg, sizeof(lmsg), FMT_RUN, VAL_RUNNABLE);
lp = lmsg;
break;
default:
lp = l->l_wchan ? l->l_wmesg : "iowait";
break;
}
strlcat(buf, lp, bufsz);
strlcat(buf, LIST_NEXT(l, l_sibling) != NULL ? " " : "] ",
bufsz);
pctcpu += l->l_pctcpu;
lwp_unlock(l);
}
pctcpu += pick->p_pctcpu;
calcru(pick, &utime, &stime, NULL, NULL);
mutex_exit(pick->p_lock);
/* Round up and print user+system time, %CPU and RSS. */
utime.tv_usec += 5000;
if (utime.tv_usec >= 1000000) {
utime.tv_sec += 1;
utime.tv_usec -= 1000000;
}
stime.tv_usec += 5000;
if (stime.tv_usec >= 1000000) {
stime.tv_sec += 1;
stime.tv_usec -= 1000000;
}
#define pgtok(a) (((u_long) ((a) * PAGE_SIZE) / 1024))
tmp = (pctcpu * 10000 + FSCALE / 2) >> FSHIFT;
if (pick->p_stat == SIDL || P_ZOMBIE(pick))
rss = 0;
else
rss = pgtok(vm_resident_count(pick->p_vmspace));
snprintf(lmsg, sizeof(lmsg), "%ld.%02ldu %ld.%02lds %d%% %ldk",
(long)utime.tv_sec, (long)utime.tv_usec / 10000,
(long)stime.tv_sec, (long)stime.tv_usec / 10000,
tmp / 100, rss);
strlcat(buf, lmsg, bufsz);
}
/*
* Print report on state of foreground process group.
* Call with tty_lock held.
*/
void
ttyputinfo(struct tty *tp, char *buf)
{
KASSERT(mutex_owned(&tty_lock));
if (ttycheckoutq_wlock(tp) == 0)
return;
ttyprintf_nolock(tp, "%s\n", buf);
tp->t_rocount = 0; /* so pending input will be retyped if BS */
}
/*
* Returns 1 if p2 has a better chance of being the active foreground process
* in a terminal instead of p1.
*/
static int
proc_compare_wrapper(struct proc *p1, struct proc *p2)
{
lwp_t *l1, *l2;
KASSERT(mutex_owned(p1->p_lock));
KASSERT(mutex_owned(p2->p_lock));
l1 = LIST_FIRST(&p1->p_lwps);
l2 = LIST_FIRST(&p2->p_lwps);
return proc_compare(p1, l1, p2, l2);
}
/*
* Output char to tty; console putchar style.
* Can be called with the tty lock held through the kprintf() machinery.
*/
int
tputchar(int c, int flags, struct tty *tp)
{
int r = 0;
if ((flags & NOLOCK) == 0)
mutex_spin_enter(&tty_lock);
if (!CONNECTED(tp)) {
r = -1;
goto out;
}
if (c == '\n')
(void)ttyoutput('\r', tp);
(void)ttyoutput(c, tp);
ttstart(tp);
out:
if ((flags & NOLOCK) == 0)
mutex_spin_exit(&tty_lock);
return (r);
}
/*
* Sleep on chan, returning ERESTART if tty changed while we napped and
* returning any errors (e.g. EINTR/EWOULDBLOCK) reported by
* cv_timedwait(_sig).
* If the tty is revoked, restarting a pending call will redo validation done
* at the start of the call.
*
* Must be called with the tty lock held.
*/
int
ttysleep(struct tty *tp, kcondvar_t *cv, bool catch_p, int timo)
{
int error;
short gen;
KASSERT(mutex_owned(&tty_lock));
gen = tp->t_gen;
if (ISSET(tp->t_state, TS_CANCEL))
error = ERESTART;
else if (cv == NULL)
error = kpause("ttypause", catch_p, timo, &tty_lock);
else if (catch_p)
error = cv_timedwait_sig(cv, &tty_lock, timo);
else
error = cv_timedwait(cv, &tty_lock, timo);
if (error != 0)
return (error);
return (tp->t_gen == gen ? 0 : ERESTART);
}
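/*
 * Pause on the tty for up to timo ticks; a plain timeout is not
 * treated as an error.
 */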
int
ttypause(struct tty *tp, int timo)
{
int error;
error = ttysleep(tp, NULL, true, timo);
if (error == EWOULDBLOCK)
error = 0;
return error;
}
/*
* Attach a tty to the tty list.
*
* This should be called ONLY once per real tty (including pty's).
* E.g., on the sparc, the keyboard and mouse have struct tty's that are
* distinctly NOT usable as tty's, and thus should not be attached to
* the ttylist. This is why this call is not done from tty_alloc().
*
* Device drivers should attach tty's at a similar time that they are
* allocated, or, for the case of statically allocated struct tty's
* either in the attach or (first) open routine.
*/
void
tty_attach(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
TAILQ_INSERT_TAIL(&ttylist, tp, tty_link);
++tty_count;
mutex_spin_exit(&tty_lock);
}
/*
* Remove a tty from the tty list.
*/
void
tty_detach(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
--tty_count;
#ifdef DIAGNOSTIC
if (tty_count < 0)
panic("tty_detach: tty_count < 0");
#endif
TAILQ_REMOVE(&ttylist, tp, tty_link);
mutex_spin_exit(&tty_lock);
}
/*
* Allocate a tty structure and its associated buffers.
*/
struct tty *
tty_alloc(void)
{
struct tty *tp;
int i;
tp = kmem_zalloc(sizeof(*tp), KM_SLEEP);
callout_init(&tp->t_rstrt_ch, 0);
callout_setfunc(&tp->t_rstrt_ch, ttrstrt, tp);
tp->t_qsize = tty_qsize;
clalloc(&tp->t_rawq, tp->t_qsize, 1);
cv_init(&tp->t_rawcv, "ttyraw");
cv_init(&tp->t_rawcvf, "ttyrawf");
clalloc(&tp->t_canq, tp->t_qsize, 1);
cv_init(&tp->t_cancv, "ttycan");
cv_init(&tp->t_cancvf, "ttycanf");
/* output queue doesn't need quoting */
clalloc(&tp->t_outq, tp->t_qsize, 0);
cv_init(&tp->t_outcv, "ttyout");
cv_init(&tp->t_outcvf, "ttyoutf");
/* Set default line discipline. */
tp->t_linesw = ttyldisc_default();
tp->t_dev = NODEV;
selinit(&tp->t_rsel);
selinit(&tp->t_wsel);
for (i = 0; i < TTYSIG_COUNT; i++) {
sigemptyset(&tp->t_sigs[i]);
}
return tp;
}
/*
* Free a tty structure and its buffers.
*
* Be sure to call tty_detach() for any tty that has been
* tty_attach()ed.
*/
void
tty_free(struct tty *tp)
{
int i;
mutex_enter(&proc_lock);
mutex_enter(&tty_lock);
for (i = 0; i < TTYSIG_COUNT; i++)
sigemptyset(&tp->t_sigs[i]);
if (tp->t_sigcount != 0)
TAILQ_REMOVE(&tty_sigqueue, tp, t_sigqueue);
mutex_exit(&tty_lock);
mutex_exit(&proc_lock);
callout_halt(&tp->t_rstrt_ch, NULL);
callout_destroy(&tp->t_rstrt_ch);
ttyldisc_release(tp->t_linesw);
clfree(&tp->t_rawq);
clfree(&tp->t_canq);
clfree(&tp->t_outq);
cv_destroy(&tp->t_rawcv);
cv_destroy(&tp->t_rawcvf);
cv_destroy(&tp->t_cancv);
cv_destroy(&tp->t_cancvf);
cv_destroy(&tp->t_outcv);
cv_destroy(&tp->t_outcvf);
seldestroy(&tp->t_rsel);
seldestroy(&tp->t_wsel);
kmem_free(tp, sizeof(*tp));
}
/*
* tty_unit: map dev_t to tty unit number, as with TTUNIT
*
* => defined as function for use with struct cdevsw::d_devtounit
* => not for drivers with different unit numbering, e.g. TTUNIT(d) >> 4
*/
int
tty_unit(dev_t dev)
{
return TTUNIT(dev);
}
/*
* ttyprintf_nolock: send a message to a specific tty, without locking.
*
* => should be used only by tty driver or anything that knows the
* underlying tty will not be revoked(2)'d away. [otherwise,
* use tprintf]
*/
static void
ttyprintf_nolock(struct tty *tp, const char *fmt, ...)
{
va_list ap;
/* No mutex needed; going to process TTY. */
va_start(ap, fmt);
kprintf(fmt, TOTTY|NOLOCK, tp, NULL, ap);
va_end(ap);
}
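/*
 * kauth(9) device-scope listener: allow KAUTH_DEVICE_TTY_OPEN on a tty
 * that is not yet open, or is open but not exclusively; otherwise defer
 * the decision to the security model.
 */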
static int
tty_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct tty *tty;
int result;
result = KAUTH_RESULT_DEFER;
if (action != KAUTH_DEVICE_TTY_OPEN)
return result;
tty = arg0;
/* If it's not opened, we allow. */
if ((tty->t_state & TS_ISOPEN) == 0)
result = KAUTH_RESULT_ALLOW;
else {
/*
* If it's opened, we can only allow if it's not exclusively
* opened; otherwise, that's a privileged operation and we
* let the secmodel handle it.
*/
if ((tty->t_state & TS_XCLUDE) == 0)
result = KAUTH_RESULT_ALLOW;
}
return result;
}
/*
* Initialize the tty subsystem.
*/
void
tty_init(void)
{
mutex_init(&tty_lock, MUTEX_DEFAULT, IPL_VM);
mutex_init(&constty_lock, MUTEX_DEFAULT, IPL_NONE);
constty_psz = pserialize_create();
cv_init(&ttyref_cv, "ttyref");
tty_sigsih = softint_establish(SOFTINT_CLOCK, ttysigintr, NULL);
KASSERT(tty_sigsih != NULL);
tty_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
tty_listener_cb, NULL);
sysctl_kern_tty_setup();
}
/*
* Send a signal from a tty to its process group or session leader.
* Handoff to the target is deferred to a soft interrupt.
*/
void
ttysig(struct tty *tp, enum ttysigtype st, int sig)
{
sigset_t *sp;
/* XXXSMP not yet KASSERT(mutex_owned(&tty_lock)); */
sp = &tp->t_sigs[st];
if (sigismember(sp, sig))
return;
sigaddset(sp, sig);
	if (tp->t_sigcount++ == 0)
		TAILQ_INSERT_TAIL(&tty_sigqueue, tp, t_sigqueue);
softint_schedule(tty_sigsih);
}
/*
* Deliver deferred signals from ttys. Note that the process groups
* and sessions associated with the ttys may have changed from when
* the signal was originally sent, but in practice it should not matter.
* For signals produced as a result of a syscall, the soft interrupt
* will fire before the syscall returns to the user.
*/
static void
ttysigintr(void *cookie)
{
struct tty *tp;
enum ttysigtype st;
struct pgrp *pgrp;
struct session *sess;
int sig, lflag;
char infobuf[200];
mutex_enter(&proc_lock);
mutex_spin_enter(&tty_lock);
while ((tp = TAILQ_FIRST(&tty_sigqueue)) != NULL) {
KASSERT(tp->t_sigcount > 0);
for (st = TTYSIG_PG1; st < TTYSIG_COUNT; st++) {
if ((sig = firstsig(&tp->t_sigs[st])) != 0)
break;
}
KASSERT(st < TTYSIG_COUNT);
sigdelset(&tp->t_sigs[st], sig);
if (--tp->t_sigcount == 0)
TAILQ_REMOVE(&tty_sigqueue, tp, t_sigqueue);
pgrp = tp->t_pgrp;
sess = tp->t_session;
lflag = tp->t_lflag;
if (sig == SIGINFO) {
if (ISSET(tp->t_state, TS_SIGINFO)) {
/* Via ioctl: ignore tty option. */
tp->t_state &= ~TS_SIGINFO;
lflag |= ISIG;
}
if (!ISSET(lflag, NOKERNINFO)) {
mutex_spin_exit(&tty_lock);
ttygetinfo(tp, 1, infobuf, sizeof(infobuf));
mutex_spin_enter(&tty_lock);
ttyputinfo(tp, infobuf);
}
if (!ISSET(lflag, ISIG))
continue;
}
mutex_spin_exit(&tty_lock);
KASSERT(sig != 0);
switch (st) {
case TTYSIG_PG1:
if (pgrp != NULL)
pgsignal(pgrp, sig, 1);
break;
case TTYSIG_PG2:
if (pgrp != NULL)
pgsignal(pgrp, sig, sess != NULL);
break;
case TTYSIG_LEADER:
if (sess != NULL && sess->s_leader != NULL)
psignal(sess->s_leader, sig);
break;
default:
/* NOTREACHED */
break;
}
mutex_spin_enter(&tty_lock);
}
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
}
unsigned char
tty_getctrlchar(struct tty *tp, unsigned which)
{
KASSERT(which < NCCS);
return tp->t_cc[which];
}
void
tty_setctrlchar(struct tty *tp, unsigned which, unsigned char val)
{
KASSERT(which < NCCS);
tp->t_cc[which] = val;
}
int
tty_try_xonxoff(struct tty *tp, unsigned char c)
{
const struct cdevsw *cdev;
if (tp->t_iflag & IXON) {
if (c == tp->t_cc[VSTOP] && tp->t_cc[VSTOP] != _POSIX_VDISABLE) {
if ((tp->t_state & TS_TTSTOP) == 0) {
tp->t_state |= TS_TTSTOP;
cdev = cdevsw_lookup(tp->t_dev);
if (cdev != NULL)
(*cdev->d_stop)(tp, 0);
}
return 0;
}
if (c == tp->t_cc[VSTART] && tp->t_cc[VSTART] != _POSIX_VDISABLE) {
tp->t_state &= ~TS_TTSTOP;
if (tp->t_oproc != NULL) {
mutex_spin_enter(&tty_lock); /* XXX */
(*tp->t_oproc)(tp);
mutex_spin_exit(&tty_lock); /* XXX */
}
return 0;
}
}
return EAGAIN;
}
/* $NetBSD: cprng_fast.c,v 1.19 2023/08/05 11:39:18 riastradh Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.19 2023/08/05 11:39:18 riastradh Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/cprng.h>
#include <sys/cpu.h>
#include <sys/entropy.h>
#include <sys/evcnt.h>
#include <sys/kmem.h>
#include <sys/percpu.h>
#include <sys/pserialize.h>
#include <crypto/chacha/chacha.h>
#define CPRNG_FAST_SEED_BYTES CHACHA_STREAM_KEYBYTES
struct cprng_fast {
/* 128-bit vector unit generates 256 bytes at once */
uint8_t buf[256];
uint8_t key[CPRNG_FAST_SEED_BYTES];
uint8_t nonce[CHACHA_STREAM_NONCEBYTES];
unsigned i;
struct evcnt *reseed_evcnt;
unsigned epoch;
};
static void cprng_fast_init_cpu(void *, void *, struct cpu_info *);
static void cprng_fast_reseed(struct cprng_fast **, unsigned);
static void cprng_fast_seed(struct cprng_fast *, const void *);
static void cprng_fast_buf(struct cprng_fast *, void *, unsigned);
static void cprng_fast_buf_short(void *, size_t);
static void cprng_fast_buf_long(void *, size_t);
static percpu_t *cprng_fast_percpu __read_mostly;
void
cprng_fast_init(void)
{
cprng_fast_percpu = percpu_create(sizeof(struct cprng_fast),
cprng_fast_init_cpu, NULL, NULL);
}
static void
cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci)
{
struct cprng_fast *const cprng = p;
cprng->epoch = 0;
cprng->reseed_evcnt = kmem_alloc(sizeof(*cprng->reseed_evcnt),
KM_SLEEP);
evcnt_attach_dynamic(cprng->reseed_evcnt, EVCNT_TYPE_MISC, NULL,
ci->ci_cpuname, "cprng_fast reseed");
}
static int
cprng_fast_get(struct cprng_fast **cprngp)
{
struct cprng_fast *cprng;
unsigned epoch;
int s;
KASSERT(!cpu_intr_p());
KASSERT(pserialize_not_in_read_section());
*cprngp = cprng = percpu_getref(cprng_fast_percpu);
s = splsoftserial();
epoch = entropy_epoch();
if (__predict_false(cprng->epoch != epoch)) {
splx(s);
cprng_fast_reseed(cprngp, epoch);
s = splsoftserial();
}
return s;
}
static void
cprng_fast_put(struct cprng_fast *cprng, int s)
{
KASSERT((cprng == percpu_getref(cprng_fast_percpu)) &&
(percpu_putref(cprng_fast_percpu), true));
splx(s);
percpu_putref(cprng_fast_percpu);
}
static void
cprng_fast_reseed(struct cprng_fast **cprngp, unsigned epoch)
{
struct cprng_fast *cprng;
uint8_t seed[CPRNG_FAST_SEED_BYTES];
int s;
/*
* Drop the percpu(9) reference to extract a fresh seed from
* the entropy pool. cprng_strong may sleep on an adaptive
* lock, which invalidates our percpu(9) reference.
*
* This may race with reseeding in another thread, which is no
* big deal -- worst case, we rewind the entropy epoch here and
* cause the next caller to reseed again, and in the end we
* just reseed a couple more times than necessary.
*/
percpu_putref(cprng_fast_percpu);
cprng_strong(kern_cprng, seed, sizeof(seed), 0);
*cprngp = cprng = percpu_getref(cprng_fast_percpu);
s = splsoftserial();
cprng_fast_seed(cprng, seed);
cprng->epoch = epoch;
cprng->reseed_evcnt->ev_count++;
splx(s);
explicit_memset(seed, 0, sizeof(seed));
}
/* CPRNG algorithm */
static void
cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
{
(void)memset(cprng->buf, 0, sizeof cprng->buf);
(void)memcpy(cprng->key, seed, sizeof cprng->key);
(void)memset(cprng->nonce, 0, sizeof cprng->nonce);
cprng->i = sizeof cprng->buf;
}
static void
cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned len)
{
uint8_t *p = buf;
unsigned n = len, n0;
KASSERT(cprng->i <= sizeof(cprng->buf));
KASSERT(len <= sizeof(cprng->buf));
n0 = MIN(n, sizeof(cprng->buf) - cprng->i);
memcpy(p, &cprng->buf[cprng->i], n0);
if ((n -= n0) == 0) {
cprng->i += n0;
KASSERT(cprng->i <= sizeof(cprng->buf));
return;
}
p += n0;
le64enc(cprng->nonce, 1 + le64dec(cprng->nonce));
chacha_stream(cprng->buf, sizeof(cprng->buf), 0, cprng->nonce,
cprng->key, 8);
memcpy(p, cprng->buf, n);
cprng->i = n;
}
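/*
* Worked example (illustrative): with cprng->i == 200, a 100-byte
* request copies the last 56 bytes from buf[200..255], increments the
* nonce, regenerates the whole 256-byte block with one chacha_stream()
* call, copies the first 44 bytes of the fresh block, and leaves
* cprng->i == 44.
*/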
/* Public API */
static void
cprng_fast_buf_short(void *buf, size_t len)
{
struct cprng_fast *cprng;
int s;
KASSERT(len <= sizeof(cprng->buf));
s = cprng_fast_get(&cprng);
cprng_fast_buf(cprng, buf, len);
cprng_fast_put(cprng, s);
}
static void
cprng_fast_buf_long(void *buf, size_t len)
{
uint8_t seed[CHACHA_STREAM_KEYBYTES];
uint8_t nonce[CHACHA_STREAM_NONCEBYTES] = {0};
CTASSERT(sizeof(seed) <= sizeof(((struct cprng_fast *)0)->buf));
#if SIZE_MAX >= 0x3fffffffff
/* >=256 GB is not reasonable */
KASSERT(len <= 0x3fffffffff);
#endif
cprng_fast_buf_short(seed, sizeof seed);
chacha_stream(buf, len, 0, nonce, seed, 8);
(void)explicit_memset(seed, 0, sizeof seed);
}
uint32_t
cprng_fast32(void)
{
uint32_t v;
cprng_fast_buf_short(&v, sizeof v);
return v;
}
uint64_t
cprng_fast64(void)
{
uint64_t v;
cprng_fast_buf_short(&v, sizeof v);
return v;
}
size_t
cprng_fast(void *buf, size_t len)
{
/*
* We don't want to hog the CPU, so we use the short version,
* to generate output without preemption, only if we can do it
* with at most one ChaCha call.
*/
if (len <= sizeof(((struct cprng_fast *)0)->buf))
cprng_fast_buf_short(buf, len);
else
cprng_fast_buf_long(buf, len);
return len; /* hysterical raisins */
}
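/*
* Example usage (illustrative sketch, not part of this file): callers
* needing pseudorandom words or buffers simply do
*
*     uint32_t r = cprng_fast32();
*     uint8_t tmp[64];
*     (void)cprng_fast(tmp, sizeof(tmp));
*
* where "r" and "tmp" are hypothetical locals; the return value of
* cprng_fast() is always the requested length.
*/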
/* $NetBSD: tty_subr.c,v 1.43 2019/12/27 09:41:51 msaitoh Exp $ */
/*
* Copyright (c) 1993, 1994 Theo de Raadt
* All rights reserved.
*
* Per Lindqvist <pgd@compuram.bbt.se> supplied an almost fully working
* set of true clist functions that this is very loosely based on.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_subr.c,v 1.43 2019/12/27 09:41:51 msaitoh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/kmem.h>
/*
* At compile time, choose:
* There are two ways the TTY_QUOTE bit can be stored. If QBITS is
* defined we allocate an array of bits -- 1/8th as much memory but
* setbit(), clrbit(), and isset() take more CPU. If QBITS is
* undefined, we just use an array of bytes.
*
* If TTY_QUOTE functionality isn't required by a line discipline,
* it can free c_cq and set it to NULL. This speeds things up,
* and also does not use any extra memory. This is useful for (say)
* a SLIP line discipline that wants a 32K ring buffer for data
* but doesn't need quoting.
*/
#define QBITS
#ifdef QBITS
#define QMEM(n) ((((n)-1)/NBBY)+1)
#else
#define QMEM(n) (n)
#endif
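/*
* Worked example (illustrative): with QBITS defined and NBBY == 8,
* QMEM(1024) is ((1024 - 1) / 8) + 1 == 128, so a 1024-byte clist
* needs only 128 bytes of quote storage; with QBITS undefined it
* would need a full 1024 bytes, one per character.
*/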
#ifdef QBITS
static void clrbits(u_char *, unsigned int, unsigned int);
#endif
/*
* Initialize a particular clist. Ok, they are really ring buffers,
* of the specified length, with/without quoting support.
*/
int
clalloc(struct clist *clp, int size, int quot)
{
clp->c_cs = kmem_zalloc(size, KM_SLEEP);
if (quot)
clp->c_cq = kmem_zalloc(QMEM(size), KM_SLEEP);
else
clp->c_cq = NULL;
clp->c_cf = clp->c_cl = NULL;
clp->c_ce = clp->c_cs + size;
clp->c_cn = size;
clp->c_cc = 0;
return (0);
}
void
clfree(struct clist *clp)
{
if (clp->c_cs)
kmem_free(clp->c_cs, clp->c_cn);
if (clp->c_cq)
kmem_free(clp->c_cq, QMEM(clp->c_cn));
clp->c_cs = clp->c_cq = NULL;
}
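/*
* Example usage (illustrative sketch, not part of this file): a line
* discipline with a hypothetical 1024-byte queue "q" might do
*
*     struct clist q;
*     int c;
*
*     clalloc(&q, 1024, 1);                  with quoting support
*     (void)putc('x' | TTY_QUOTE, &q);       store a quoted character
*     c = getc(&q);                          c == ('x' | TTY_QUOTE)
*     clfree(&q);
*/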
/*
* Get a character from a clist.
*/
int
getc(struct clist *clp)
{
int c = -1;
int s;
s = spltty();
if (clp->c_cc == 0)
goto out;
c = *clp->c_cf & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, clp->c_cf - clp->c_cs) )
c |= TTY_QUOTE;
#else
if (*(clp->c_cf - clp->c_cs + clp->c_cq))
c |= TTY_QUOTE;
#endif
}
*clp->c_cf = 0; /* wipe out to avoid information disclosure */
if (++clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
if (--clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
return c;
}
/*
* Copy clist to buffer.
* Return number of bytes moved.
*/
int
q_to_b(struct clist *clp, u_char *cp, int count)
{
int cc;
u_char *p = cp;
int s;
s = spltty();
/* optimize this while loop */
while (count > 0 && clp->c_cc > 0) {
cc = clp->c_cl - clp->c_cf;
if (clp->c_cf >= clp->c_cl)
cc = clp->c_ce - clp->c_cf;
if (cc > count)
cc = count;
memcpy(p, clp->c_cf, cc);
count -= cc;
p += cc;
clp->c_cc -= cc;
clp->c_cf += cc;
if (clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
splx(s);
return p - cp;
}
/*
* Return count of contiguous characters in clist.
* Stop counting if flag&character is non-null.
*/
int
ndqb(struct clist *clp, int flag)
{
int count = 0;
int i;
int cc;
int s;
s = spltty();
if ((cc = clp->c_cc) == 0)
goto out;
if (flag == 0) {
count = clp->c_cl - clp->c_cf;
if (count <= 0)
count = clp->c_ce - clp->c_cf;
goto out;
}
i = clp->c_cf - clp->c_cs;
if (flag & TTY_QUOTE) {
while (cc-- > 0 && !(clp->c_cs[i++] & (flag & ~TTY_QUOTE) ||
isset(clp->c_cq, i))) {
count++;
if (i == clp->c_cn)
break;
}
} else {
while (cc-- > 0 && !(clp->c_cs[i++] & flag)) {
count++;
if (i == clp->c_cn)
break;
}
}
out:
splx(s);
return count;
}
/*
* Flush count bytes from clist.
*/
void
ndflush(struct clist *clp, int count)
{
int cc;
int s;
s = spltty();
if (count == clp->c_cc) {
clp->c_cc = 0;
clp->c_cf = clp->c_cl = (u_char *)0;
goto out;
}
/* optimize this while loop */
while (count > 0 && clp->c_cc > 0) {
cc = clp->c_cl - clp->c_cf;
if (clp->c_cf >= clp->c_cl)
cc = clp->c_ce - clp->c_cf;
if (cc > count)
cc = count;
count -= cc;
clp->c_cc -= cc;
clp->c_cf += cc;
if (clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
}
/*
* Put a character into the output queue.
*/
int
putc(int c, struct clist *clp)
{
int i;
int s;
s = spltty();
if (clp->c_cc == clp->c_cn)
goto out;
if (clp->c_cc == 0) {
if (!clp->c_cs) {
#if defined(DIAGNOSTIC) || 1
printf("putc: required clalloc\n");
#endif
if (clalloc(clp, clp->c_cn, 1)) {
out:
splx(s);
return -1;
}
}
clp->c_cf = clp->c_cl = clp->c_cs;
}
*clp->c_cl = c & 0xff;
i = clp->c_cl - clp->c_cs;
if (clp->c_cq) {
#ifdef QBITS
if (c & TTY_QUOTE)
setbit(clp->c_cq, i);
else
clrbit(clp->c_cq, i);
#else
clp->c_cq[i] = (c & TTY_QUOTE) ? 1 : 0;
#endif
}
clp->c_cc++;
clp->c_cl++;
if (clp->c_cl == clp->c_ce)
clp->c_cl = clp->c_cs;
splx(s);
return 0;
}
#ifdef QBITS
/*
* optimized version of
*
* for (i = 0; i < len; i++)
* clrbit(cp, off + i);
*/
static void
clrbits(u_char *cp, unsigned int off, unsigned int len)
{
unsigned int sbi, ebi;
u_char *scp, *ecp;
unsigned int end;
unsigned char mask;
scp = cp + off / NBBY;
sbi = off % NBBY;
end = off + len + NBBY - 1;
ecp = cp + end / NBBY - 1;
ebi = end % NBBY + 1;
if (scp >= ecp) {
mask = ((1 << len) - 1) << sbi;
*scp &= ~mask;
} else {
mask = (1 << sbi) - 1;
*scp++ &= mask;
mask = (1 << ebi) - 1;
*ecp &= ~mask;
while (scp < ecp)
*scp++ = 0x00;
}
}
#endif
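/*
* Worked example (illustrative): clrbits(cp, 5, 10) clears bits 5..14.
* With NBBY == 8, scp == cp, sbi == 5, end == 22, ecp == cp + 1 and
* ebi == 7: the first byte is ANDed with (1 << 5) - 1 == 0x1f, which
* keeps bits 0..4 and clears bits 5..7; the last byte is ANDed with
* ~((1 << 7) - 1) == ~0x7f, clearing bits 8..14 of the array; no whole
* bytes lie in between, so the clearing loop does not run.
*/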
/*
* Copy buffer to clist.
* Return number of bytes not transferred.
*/
int
b_to_q(const u_char *cp, int count, struct clist *clp)
{
int cc;
const u_char *p = cp;
int s;
if (count <= 0)
return 0;
s = spltty();
if (clp->c_cc == clp->c_cn)
goto out;
if (clp->c_cc == 0) {
if (!clp->c_cs) {
#if defined(DIAGNOSTIC) || 1
printf("b_to_q: required clalloc\n");
#endif
if (clalloc(clp, clp->c_cn, 1))
goto out;
}
clp->c_cf = clp->c_cl = clp->c_cs;
}
/* optimize this while loop */
while (count > 0 && clp->c_cc < clp->c_cn) {
cc = clp->c_ce - clp->c_cl;
if (clp->c_cf > clp->c_cl)
cc = clp->c_cf - clp->c_cl;
if (cc > count)
cc = count;
memcpy(clp->c_cl, p, cc);
if (clp->c_cq) {
#ifdef QBITS
clrbits(clp->c_cq, clp->c_cl - clp->c_cs, cc);
#else
memset(clp->c_cl - clp->c_cs + clp->c_cq, 0, cc);
#endif
}
p += cc;
count -= cc;
clp->c_cc += cc;
clp->c_cl += cc;
if (clp->c_cl == clp->c_ce)
clp->c_cl = clp->c_cs;
}
out:
splx(s);
return count;
}
static int tty_global_cc;
/*
* Given a non-NULL pointer into the clist return the pointer
* to the next character in the list or return NULL if no more chars.
*
* Callers must not allow getc's to happen between the firstc() and
* subsequent nextc() calls, or the pointer becomes invalid. Note that
* interrupts are NOT masked.
*/
u_char *
nextc(struct clist *clp, u_char *cp, int *c)
{
if (clp->c_cf == cp) {
/*
* First time initialization.
*/
tty_global_cc = clp->c_cc;
}
if (tty_global_cc == 0 || cp == NULL)
return NULL;
if (--tty_global_cc == 0)
return NULL;
if (++cp == clp->c_ce)
cp = clp->c_cs;
*c = *cp & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, cp - clp->c_cs))
*c |= TTY_QUOTE;
#else
if (*(cp - clp->c_cs + clp->c_cq))
*c |= TTY_QUOTE;
#endif
}
return cp;
}
/*
* Return a pointer to the first character in the clist, or NULL if the
* clist is empty.
*
* Callers must not allow getc's to happen between the firstc() and
* subsequent nextc() calls, or the pointer becomes invalid. Note that
* interrupts are NOT masked.
*
* *c is set to the character at the returned pointer.
*/
u_char *
firstc(struct clist *clp, int *c)
{
u_char *cp;
tty_global_cc = clp->c_cc;
if (tty_global_cc == 0)
return NULL;
cp = clp->c_cf;
*c = *cp & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, cp - clp->c_cs))
*c |= TTY_QUOTE;
#else
if (*(cp - clp->c_cs + clp->c_cq))
*c |= TTY_QUOTE;
#endif
}
return clp->c_cf;
}
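/*
* Example traversal (illustrative sketch, not part of this file):
* walk a clist "q" without consuming it, assuming no getc() calls
* intervene:
*
*     u_char *cp;
*     int c;
*
*     for (cp = firstc(&q, &c); cp != NULL; cp = nextc(&q, cp, &c)) {
*             examine c here, including its TTY_QUOTE bit
*     }
*/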
/*
* Remove the last character in the clist and return it.
*/
int
unputc(struct clist *clp)
{
unsigned int c = -1;
int s;
s = spltty();
if (clp->c_cc == 0)
goto out;
if (clp->c_cl == clp->c_cs)
clp->c_cl = clp->c_ce - 1;
else
--clp->c_cl;
clp->c_cc--;
c = *clp->c_cl & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, clp->c_cl - clp->c_cs))
c |= TTY_QUOTE;
#else
if (*(clp->c_cl - clp->c_cs + clp->c_cq))
c |= TTY_QUOTE;
#endif
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
return c;
}
/*
* Put the chars in the from queue on the end of the to queue.
*/
void
catq(struct clist *from, struct clist *to)
{
int c;
while ((c = getc(from)) != -1)
putc(c, to);
}
/* $NetBSD: in_pcb.c,v 1.202 2022/11/04 09:05:41 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 1998, 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Coyote Point Systems, Inc.
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in_pcb.c,v 1.202 2022/11/04 09:05:41 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/once.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>
#include <sys/domain.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/portalgo.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet/tcp_vtw.h>
struct in_addr zeroin_addr;
#define INPCBHASH_PORT(table, lport) \
&(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash]
#define INPCBHASH_BIND(table, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
((ntohl((laddr).s_addr) + ntohs(lport))) & (table)->inpt_bindhash]
#define INPCBHASH_CONNECT(table, faddr, fport, laddr, lport) \
&(table)->inpt_connecthashtbl[ \
((ntohl((faddr).s_addr) + ntohs(fport)) + \
(ntohl((laddr).s_addr) + ntohs(lport))) & (table)->inpt_connecthash]
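/*
* Worked example (illustrative): for a socket bound to 192.0.2.1 port
* 80, INPCBHASH_BIND adds the host-order address 0xc0000201 to the
* host-order port 80 and masks the sum with inpt_bindhash, which
* hashinit(9) sets to the (power-of-two) table size minus one.
* INPCBHASH_CONNECT additionally folds in the foreign address and
* port, so established connections spread over inpt_connecthashtbl
* independently of their local binding.
*/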
int anonportmin = IPPORT_ANONMIN;
int anonportmax = IPPORT_ANONMAX;
int lowportmin = IPPORT_RESERVEDMIN;
int lowportmax = IPPORT_RESERVEDMAX;
static pool_cache_t in4pcb_pool_cache;
#ifdef INET6
static pool_cache_t in6pcb_pool_cache;
#endif
static int
inpcb_poolinit(void)
{
in4pcb_pool_cache = pool_cache_init(sizeof(struct in4pcb), coherency_unit,
0, 0, "in4pcbpl", NULL, IPL_NET, NULL, NULL, NULL);
#ifdef INET6
in6pcb_pool_cache = pool_cache_init(sizeof(struct in6pcb), coherency_unit,
0, 0, "in6pcbpl", NULL, IPL_NET, NULL, NULL, NULL);
#endif
return 0;
}
void
inpcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize)
{
static ONCE_DECL(control);
TAILQ_INIT(&table->inpt_queue);
table->inpt_porthashtbl = hashinit(bindhashsize, HASH_LIST, true,
&table->inpt_porthash);
table->inpt_bindhashtbl = hashinit(bindhashsize, HASH_LIST, true,
&table->inpt_bindhash);
table->inpt_connecthashtbl = hashinit(connecthashsize, HASH_LIST, true,
&table->inpt_connecthash);
table->inpt_lastlow = IPPORT_RESERVEDMAX;
table->inpt_lastport = (in_port_t)anonportmax;
RUN_ONCE(&control, inpcb_poolinit);
}
/*
* inpcb_create: construct a new PCB and associate it with a given socket.
* Sets the PCB state to INP_ATTACHED and makes PCB globally visible.
*/
int
inpcb_create(struct socket *so, void *v)
{
struct inpcbtable *table = v;
struct inpcb *inp;
int s;
#ifdef INET6
KASSERT(soaf(so) == AF_INET || soaf(so) == AF_INET6);
if (soaf(so) == AF_INET)
inp = pool_cache_get(in4pcb_pool_cache, PR_NOWAIT);
else
inp = pool_cache_get(in6pcb_pool_cache, PR_NOWAIT);
#else
KASSERT(soaf(so) == AF_INET);
inp = pool_cache_get(in4pcb_pool_cache, PR_NOWAIT);
#endif
if (inp == NULL)
return ENOBUFS;
if (soaf(so) == AF_INET)
memset(inp, 0, sizeof(struct in4pcb));
#ifdef INET6
else
memset(inp, 0, sizeof(struct in6pcb));
#endif
inp->inp_af = soaf(so);
inp->inp_table = table;
inp->inp_socket = so;
inp->inp_portalgo = PORTALGO_DEFAULT;
inp->inp_bindportonsend = false;
if (inp->inp_af == AF_INET) {
in4p_errormtu(inp) = -1;
in4p_prefsrcip(inp).s_addr = INADDR_ANY;
}
#ifdef INET6
else {
in6p_hops6(inp) = -1; /* use kernel default */
if (ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
#if defined(IPSEC)
if (ipsec_enabled) {
int error = ipsec_init_pcbpolicy(so, &inp->inp_sp);
if (error != 0) {
#ifdef INET6
if (inp->inp_af == AF_INET)
pool_cache_put(in4pcb_pool_cache, inp);
else
pool_cache_put(in6pcb_pool_cache, inp);
#else
KASSERT(inp->inp_af == AF_INET);
pool_cache_put(in4pcb_pool_cache, inp);
#endif
return error;
}
inp->inp_sp->sp_inp = inp;
}
#endif
so->so_pcb = inp;
s = splsoftnet();
TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue);
LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), inp,
inp_lhash);
inpcb_set_state(inp, INP_ATTACHED);
splx(s);
return 0;
}
static int
inpcb_set_port(struct sockaddr_in *sin, struct inpcb *inp, kauth_cred_t cred)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
in_port_t *lastport;
in_port_t lport = 0;
enum kauth_network_req req;
int error;
if (inp->inp_flags & INP_LOWPORT) {
#ifndef IPNOPRIVPORTS
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
#else
req = KAUTH_REQ_NETWORK_BIND_PORT;
#endif
lastport = &table->inpt_lastlow;
} else {
req = KAUTH_REQ_NETWORK_BIND_PORT;
lastport = &table->inpt_lastport;
}
/* XXX-kauth: KAUTH_REQ_NETWORK_BIND_AUTOASSIGN_{,PRIV}PORT */
error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req, so, sin,
NULL);
if (error)
return EACCES;
/*
* Use RFC6056 randomized port selection
*/
error = portalgo_randport(&lport, inp, cred);
if (error)
return error;
inp->inp_flags |= INP_ANONPORT;
*lastport = lport;
lport = htons(lport);
inp->inp_lport = lport;
inpcb_set_state(inp, INP_BOUND);
return 0;
}
int
inpcb_bindableaddr(const struct inpcb *inp, struct sockaddr_in *sin,
kauth_cred_t cred)
{
int error = EADDRNOTAVAIL;
struct ifaddr *ifa = NULL;
int s;
if (sin->sin_family != AF_INET)
return EAFNOSUPPORT;
s = pserialize_read_enter();
if (IN_MULTICAST(sin->sin_addr.s_addr)) {
/* Always succeed; port reuse handled in inpcb_bind_port(). */
} else if (!in_nullhost(sin->sin_addr)) {
struct in_ifaddr *ia;
ia = in_get_ia(sin->sin_addr);
/* check for broadcast addresses */
if (ia == NULL) {
ifa = ifa_ifwithaddr(sintosa(sin));
if (ifa != NULL)
ia = ifatoia(ifa);
else if ((inp->inp_flags & INP_BINDANY) != 0) {
error = 0;
goto error;
}
}
if (ia == NULL)
goto error;
if (ia->ia4_flags & IN_IFF_DUPLICATED)
goto error;
}
error = 0;
error:
pserialize_read_exit(s);
return error;
}
static int
inpcb_bind_addr(struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred)
{
int error;
error = inpcb_bindableaddr(inp, sin, cred);
if (error == 0)
in4p_laddr(inp) = sin->sin_addr;
return error;
}
static int
inpcb_bind_port(struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
int reuseport = (so->so_options & SO_REUSEPORT);
int wild = 0, error;
if (IN_MULTICAST(sin->sin_addr.s_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT))
reuseport = SO_REUSEADDR|SO_REUSEPORT;
}
if (sin->sin_port == 0) {
error = inpcb_set_port(sin, inp, cred);
if (error)
return error;
} else {
struct inpcb *t;
vestigial_inpcb_t vestige;
#ifdef INET6
struct inpcb *t6;
struct in6_addr mapped;
#endif
enum kauth_network_req req;
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
wild = 1;
#ifndef IPNOPRIVPORTS
if (ntohs(sin->sin_port) < IPPORT_RESERVED)
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
else
#endif /* !IPNOPRIVPORTS */
req = KAUTH_REQ_NETWORK_BIND_PORT;
error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req,
so, sin, NULL);
if (error)
return EACCES;
#ifdef INET6
in6_in_2_v4mapin6(&sin->sin_addr, &mapped);
t6 = in6pcb_lookup_local(table, &mapped, sin->sin_port, wild, &vestige);
if (t6 && (reuseport & t6->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t6 && vestige.valid) {
if (!!reuseport != !!vestige.reuse_port) {
return EADDRINUSE;
}
}
#endif
/* XXX-kauth */
if (so->so_uidinfo->ui_uid && !IN_MULTICAST(sin->sin_addr.s_addr)) {
t = inpcb_lookup_local(table, sin->sin_addr, sin->sin_port, 1, &vestige);
/*
* XXX: investigate ramifications of loosening this
* restriction so that as long as both ports have
* SO_REUSEPORT allow the bind
*/
if (t &&
(!in_nullhost(sin->sin_addr) ||
!in_nullhost(in4p_laddr(t)) ||
(t->inp_socket->so_options & SO_REUSEPORT) == 0)
&& (so->so_uidinfo->ui_uid != t->inp_socket->so_uidinfo->ui_uid)) {
return EADDRINUSE;
}
if (!t && vestige.valid) {
if ((!in_nullhost(sin->sin_addr)
|| !in_nullhost(vestige.laddr.v4)
|| !vestige.reuse_port)
&& so->so_uidinfo->ui_uid != vestige.uid) {
return EADDRINUSE;
}
}
}
t = inpcb_lookup_local(table, sin->sin_addr, sin->sin_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
inp->inp_lport = sin->sin_port;
inpcb_set_state(inp, INP_BOUND);
}
LIST_REMOVE(inp, inp_lhash);
LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), inp,
inp_lhash);
return 0;
}
/*
* inpcb_bind: assign a local IP address and port number to the PCB.
*
* If the address is not a wildcard, verify that it corresponds to a
* local interface. If a port is specified and it is privileged, then
* check the permission. Check whether the address or port is in use,
* and if so, whether we can re-use them.
*/
int
inpcb_bind(void *v, struct sockaddr_in *sin, struct lwp *l)
{
struct inpcb *inp = v;
struct sockaddr_in lsin;
int error;
if (inp->inp_af != AF_INET)
return EINVAL;
if (inp->inp_lport || !in_nullhost(in4p_laddr(inp)))
return EINVAL;
if (NULL != sin) {
if (sin->sin_len != sizeof(*sin))
return EINVAL;
} else {
lsin = *((const struct sockaddr_in *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
sin = &lsin;
}
/* Bind address. */
error = inpcb_bind_addr(inp, sin, l->l_cred);
if (error)
return error;
/* Bind port. */
error = inpcb_bind_port(inp, sin, l->l_cred);
if (error) {
in4p_laddr(inp).s_addr = INADDR_ANY;
return error;
}
return 0;
}
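/*
* Example caller (illustrative sketch, not part of this file): a
* protocol's bind entry point would typically pass its PCB, the
* user-supplied name and the requesting lwp straight through, e.g.
*
*     struct inpcb *inp = sotoinpcb(so);
*     error = inpcb_bind(inp, (struct sockaddr_in *)nam, l);
*
* where "so", "nam" and "l" stand for the socket, sockaddr and lwp
* arguments of a hypothetical usrreq handler.
*/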
/*
* inpcb_connect: connect from a socket to a specified address, i.e.,
* assign a foreign IP address and port number to the PCB.
*
* Both address and port must be specified in the name argument.
* If there is no local address for this socket yet, then pick one.
*/
int
inpcb_connect(void *v, struct sockaddr_in *sin, struct lwp *l)
{
struct inpcb *inp = v;
vestigial_inpcb_t vestige;
int error;
struct in_addr laddr;
if (inp->inp_af != AF_INET)
return EINVAL;
if (sin->sin_len != sizeof (*sin))
return EINVAL;
if (sin->sin_family != AF_INET)
return EAFNOSUPPORT;
if (sin->sin_port == 0)
return EADDRNOTAVAIL;
if (IN_MULTICAST(sin->sin_addr.s_addr) &&
inp->inp_socket->so_type == SOCK_STREAM)
return EADDRNOTAVAIL;
if (!IN_ADDRLIST_READER_EMPTY()) {
/*
* If the destination address is INADDR_ANY,
* use any local address (likely loopback).
* If the supplied address is INADDR_BROADCAST,
* use the broadcast address of an interface
* which supports broadcast. (loopback does not)
*/
if (in_nullhost(sin->sin_addr)) {
/* XXX racy */
sin->sin_addr =
IN_ADDRLIST_READER_FIRST()->ia_addr.sin_addr;
} else if (sin->sin_addr.s_addr == INADDR_BROADCAST) {
struct in_ifaddr *ia;
int s = pserialize_read_enter();
IN_ADDRLIST_READER_FOREACH(ia) {
if (ia->ia_ifp->if_flags & IFF_BROADCAST) {
sin->sin_addr =
ia->ia_broadaddr.sin_addr;
break;
}
}
pserialize_read_exit(s);
}
}
/*
* If we haven't bound which network number to use as ours,
* we will use the number of the outgoing interface.
* This depends on having done a routing lookup, which
* we will probably have to do anyway, so we might
* as well do it now. On the other hand if we are
* sending to multiple destinations we may have already
* done the lookup, so see if we can use the route
* from before. In any case, we only
* chose a port number once, even if sending to multiple
* destinations.
*/
if (in_nullhost(in4p_laddr(inp))) {
int xerror;
struct in_ifaddr *ia, *_ia;
int s;
struct psref psref;
int bound;
bound = curlwp_bind();
ia = in_selectsrc(sin, &inp->inp_route,
inp->inp_socket->so_options, inp->inp_moptions, &xerror,
&psref);
if (ia == NULL) {
curlwp_bindx(bound);
if (xerror == 0)
xerror = EADDRNOTAVAIL;
return xerror;
}
s = pserialize_read_enter();
_ia = in_get_ia(IA_SIN(ia)->sin_addr);
if (_ia == NULL && (inp->inp_flags & INP_BINDANY) == 0) {
pserialize_read_exit(s);
ia4_release(ia, &psref);
curlwp_bindx(bound);
return EADDRNOTAVAIL;
}
pserialize_read_exit(s);
laddr = IA_SIN(ia)->sin_addr;
ia4_release(ia, &psref);
curlwp_bindx(bound);
} else
laddr = in4p_laddr(inp);
if (inpcb_lookup(inp->inp_table, sin->sin_addr, sin->sin_port,
laddr, inp->inp_lport, &vestige) != NULL ||
vestige.valid) {
return EADDRINUSE;
}
if (in_nullhost(in4p_laddr(inp))) {
if (inp->inp_lport == 0) {
error = inpcb_bind(inp, NULL, l);
/*
* This used to ignore the return value
* completely, but we need to check for an
* ephemeral port shortage, and for attempts
* to request low ports by non-root users.
*/
if (error != 0)
return error;
}
in4p_laddr(inp) = laddr;
}
in4p_faddr(inp) = sin->sin_addr;
inp->inp_fport = sin->sin_port;
/* Late bind, if needed */
if (inp->inp_bindportonsend) {
struct sockaddr_in lsin = *((const struct sockaddr_in *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin.sin_addr = in4p_laddr(inp);
lsin.sin_port = 0;
if ((error = inpcb_bind_port(inp, &lsin, l->l_cred)) != 0)
return error;
}
inpcb_set_state(inp, INP_CONNECTED);
#if defined(IPSEC)
if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM)
ipsec_pcbconn(inp->inp_sp);
#endif
return 0;
}
/*
* inpcb_disconnect: remove any foreign IP/port association.
*
* Note: destroys the PCB if socket was closed.
*/
void
inpcb_disconnect(void *v)
{
struct inpcb *inp = v;
if (inp->inp_af != AF_INET)
return;
in4p_faddr(inp) = zeroin_addr;
inp->inp_fport = 0;
inpcb_set_state(inp, INP_BOUND);
#if defined(IPSEC)
if (ipsec_enabled)
ipsec_pcbdisconn(inp->inp_sp);
#endif
if (inp->inp_socket->so_state & SS_NOFDREF)
inpcb_destroy(inp);
}
/*
* inpcb_destroy: destroy PCB as well as the associated socket.
*/
void
inpcb_destroy(void *v)
{
struct inpcb *inp = v;
struct socket *so = inp->inp_socket;
int s;
KASSERT(inp->inp_af == AF_INET || inp->inp_af == AF_INET6);
#if defined(IPSEC)
if (ipsec_enabled)
ipsec_delete_pcbpolicy(inp);
#endif
so->so_pcb = NULL;
s = splsoftnet();
inpcb_set_state(inp, INP_ATTACHED);
LIST_REMOVE(inp, inp_lhash);
TAILQ_REMOVE(&inp->inp_table->inpt_queue, inp, inp_queue);
splx(s);
if (inp->inp_options) {
m_free(inp->inp_options);
}
rtcache_free(&inp->inp_route);
ip_freemoptions(inp->inp_moptions);
#ifdef INET6
if (inp->inp_af == AF_INET6) {
if (in6p_outputopts(inp) != NULL) {
ip6_clearpktopts(in6p_outputopts(inp), -1);
free(in6p_outputopts(inp), M_IP6OPT);
}
ip6_freemoptions(in6p_moptions(inp));
}
#endif
sofree(so); /* drops the socket's lock */
#ifdef INET6
if (inp->inp_af == AF_INET)
pool_cache_put(in4pcb_pool_cache, inp);
else
pool_cache_put(in6pcb_pool_cache, inp);
#else
KASSERT(inp->inp_af == AF_INET);
pool_cache_put(in4pcb_pool_cache, inp);
#endif
mutex_enter(softnet_lock); /* reacquire the softnet_lock */
}
/*
* inpcb_fetch_sockaddr: fetch the local IP address and port number.
*/
void
inpcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in *sin)
{
if (inp->inp_af != AF_INET)
return;
sockaddr_in_init(sin, &in4p_laddr(inp), inp->inp_lport);
}
/*
* inpcb_fetch_peeraddr: fetch the foreign IP address and port number.
*/
void
inpcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in *sin)
{
if (inp->inp_af != AF_INET)
return;
sockaddr_in_init(sin, &in4p_faddr(inp), inp->inp_fport);
}
/*
* inpcb_notify: pass some notification to all connections of a protocol
* associated with destination address. The local address and/or port
* numbers may be specified to limit the search. The "usual action" will
* be taken, depending on the command.
*
* The caller must filter any commands that are not interesting (e.g.,
* no error in the map). Call the protocol specific routine (if any) to
* report any errors for each matching socket.
*
* Must be called at splsoftnet.
*/
int
inpcb_notify(struct inpcbtable *table, struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg, int errno,
void (*notify)(struct inpcb *, int))
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
int nmatch;
if (in_nullhost(faddr) || notify == NULL)
return 0;
nmatch = 0;
head = INPCBHASH_CONNECT(table, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr) &&
inp->inp_fport == fport &&
inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr)) {
(*notify)(inp, errno);
nmatch++;
}
}
return nmatch;
}
void
inpcb_notifyall(struct inpcbtable *table, struct in_addr faddr, int errno,
void (*notify)(struct inpcb *, int))
{
struct inpcb *inp;
if (in_nullhost(faddr) || notify == NULL)
return;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr))
(*notify)(inp, errno);
}
}
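/*
* Example caller (illustrative sketch, not part of this file): a
* datagram protocol's error handling might fan an ICMP error out to
* every matching PCB with a callback roughly like the following; the
* names "proto_notify" and "proto_pcbtable" are hypothetical.
*
*     static void
*     proto_notify(struct inpcb *inp, int errno)
*     {
*             inp->inp_socket->so_error = errno;
*             sorwakeup(inp->inp_socket);
*             sowwakeup(inp->inp_socket);
*     }
*
*     inpcb_notifyall(&proto_pcbtable, faddr, ECONNREFUSED, proto_notify);
*/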
void
in_purgeifmcast(struct ip_moptions *imo, struct ifnet *ifp)
{
int i, gap;
/* The owner of imo should be protected by solock */
KASSERT(ifp != NULL);
if (imo == NULL)
return;
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (imo->imo_multicast_if_index == ifp->if_index)
imo->imo_multicast_if_index = 0;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
*/
for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) {
if (imo->imo_membership[i]->inm_ifp == ifp) {
in_delmulti(imo->imo_membership[i]);
gap++;
} else if (gap != 0)
imo->imo_membership[i - gap] = imo->imo_membership[i];
}
imo->imo_num_memberships -= gap;
}
void
inpcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp)
{
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
bool need_unlock = false;
if (inp->inp_af != AF_INET)
continue;
/* The caller holds either one of inps' lock */
if (!inp_locked(inp)) {
inp_lock(inp);
need_unlock = true;
}
in_purgeifmcast(inp->inp_moptions, ifp);
if (need_unlock)
inp_unlock(inp);
}
}
void
inpcb_purgeif(struct inpcbtable *table, struct ifnet *ifp)
{
struct rtentry *rt;
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET)
continue;
if ((rt = rtcache_validate(&inp->inp_route)) != NULL &&
rt->rt_ifp == ifp) {
rtcache_unref(rt, &inp->inp_route);
inpcb_rtchange(inp, 0);
} else
rtcache_unref(rt, &inp->inp_route);
}
}
/*
* inpcb_losing: check for alternatives when higher level complains about
* service problems. For now, invalidate cached routing information.
* If the route was created dynamically (by a redirect), time to try a
* default gateway again.
*/
void
inpcb_losing(struct inpcb *inp)
{
struct rtentry *rt;
struct rt_addrinfo info;
if (inp->inp_af != AF_INET)
return;
if ((rt = rtcache_validate(&inp->inp_route)) == NULL)
return;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = rtcache_getdst(&inp->inp_route);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
if (rt->rt_flags & RTF_DYNAMIC) {
int error;
struct rtentry *nrt;
error = rtrequest(RTM_DELETE, rt_getkey(rt),
rt->rt_gateway, rt_mask(rt), rt->rt_flags, &nrt);
rtcache_unref(rt, &inp->inp_route);
if (error == 0) {
rt_newmsg_dynamic(RTM_DELETE, nrt);
rt_free(nrt);
}
} else
rtcache_unref(rt, &inp->inp_route);
/*
* A new route can be allocated
* the next time output is attempted.
*/
rtcache_free(&inp->inp_route);
}
/*
* inpcb_rtchange: after a routing change, flush old routing.
* A new route can be allocated the next time output is attempted.
*/
void
inpcb_rtchange(struct inpcb *inp, int errno)
{
if (inp->inp_af != AF_INET)
return;
rtcache_free(&inp->inp_route);
/* XXX SHOULD NOTIFY HIGHER-LEVEL PROTOCOLS */
}
/*
* inpcb_lookup_local: find a PCB by looking at the local port and matching
* the local address or resolving the wildcards. Primarily used to detect
* when the local address is already in use.
*/
struct inpcb *
inpcb_lookup_local(struct inpcbtable *table, struct in_addr laddr,
u_int lport_arg, int lookup_wildcard, vestigial_inpcb_t *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
struct inpcb *match = NULL;
int matchwild = 3;
int wildcard;
in_port_t lport = lport_arg;
if (vp)
vp->valid = 0;
head = INPCBHASH_PORT(table, lport);
LIST_FOREACH(inp, head, inp_lhash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport != lport)
continue;
/*
* check if inp's faddr and laddr match with ours.
* our faddr is considered null.
* count the number of wildcard matches. (0 - 2)
*
* null null match
* A null wildcard match
* null B wildcard match
* A B non match
* A A match
*/
wildcard = 0;
if (!in_nullhost(in4p_faddr(inp)))
wildcard++;
if (in_nullhost(in4p_laddr(inp))) {
if (!in_nullhost(laddr))
wildcard++;
} else {
if (in_nullhost(laddr))
wildcard++;
else {
if (!in_hosteq(in4p_laddr(inp), laddr))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
/*
* prefer an address with less wildcards.
*/
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (match && matchwild == 0)
return match;
if (vp && table->vestige) {
void *state = (*table->vestige->init_ports4)(laddr, lport_arg, lookup_wildcard);
vestigial_inpcb_t better;
bool has_better = false;
while (table->vestige
&& (*table->vestige->next_port4)(state, vp)) {
if (vp->lport != lport)
continue;
wildcard = 0;
if (!in_nullhost(vp->faddr.v4))
wildcard++;
if (in_nullhost(vp->laddr.v4)) {
if (!in_nullhost(laddr))
wildcard++;
} else {
if (in_nullhost(laddr))
wildcard++;
else {
if (!in_hosteq(vp->laddr.v4, laddr))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
better = *vp;
has_better = true;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (has_better) {
*vp = better;
return 0;
}
}
return match;
}
#ifdef DIAGNOSTIC
int inpcb_notifymiss = 0;
#endif
/*
* inpcb_lookup: perform a full 4-tuple PCB lookup.
*/
struct inpcb *
inpcb_lookup(struct inpcbtable *table,
struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg,
vestigial_inpcb_t *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
if (vp)
vp->valid = 0;
head = INPCBHASH_CONNECT(table, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr) &&
inp->inp_fport == fport &&
inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr))
goto out;
}
if (vp && table->vestige) {
if ((*table->vestige->lookup4)(faddr, fport_arg,
laddr, lport_arg, vp))
return 0;
}
#ifdef DIAGNOSTIC
if (inpcb_notifymiss) {
printf("inpcb_lookup: faddr=%08x fport=%d laddr=%08x lport=%d\n",
ntohl(faddr.s_addr), ntohs(fport),
ntohl(laddr.s_addr), ntohs(lport));
}
#endif
return 0;
out:
/* Move this PCB to the head of hash chain. */
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
/*
* inpcb_lookup_bound: find a PCB by looking at the local address and port.
* Primarily used to find the listening (i.e., already bound) socket.
*/
struct inpcb *
inpcb_lookup_bound(struct inpcbtable *table,
struct in_addr laddr, u_int lport_arg)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t lport = lport_arg;
head = INPCBHASH_BIND(table, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr))
goto out;
}
head = INPCBHASH_BIND(table, zeroin_addr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), zeroin_addr))
goto out;
}
#ifdef DIAGNOSTIC
if (inpcb_notifymiss) {
printf("inpcb_lookup_bound: laddr=%08x lport=%d\n",
ntohl(laddr.s_addr), ntohs(lport));
}
#endif
return 0;
out:
/* Move this PCB to the head of hash chain. */
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
void
inpcb_set_state(struct inpcb *inp, int state)
{
#ifdef INET6
if (inp->inp_af == AF_INET6) {
in6pcb_set_state(inp, state);
return;
}
#else
if (inp->inp_af != AF_INET)
return;
#endif
if (inp->inp_state > INP_ATTACHED)
LIST_REMOVE(inp, inp_hash);
switch (state) {
case INP_BOUND:
LIST_INSERT_HEAD(INPCBHASH_BIND(inp->inp_table,
in4p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
case INP_CONNECTED:
LIST_INSERT_HEAD(INPCBHASH_CONNECT(inp->inp_table,
in4p_faddr(inp), inp->inp_fport,
in4p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
}
inp->inp_state = state;
}
struct rtentry *
inpcb_rtentry(struct inpcb *inp)
{
struct route *ro;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
#ifdef INET6
if (inp->inp_af == AF_INET6)
return in6pcb_rtentry(inp);
#endif
if (inp->inp_af != AF_INET)
return NULL;
ro = &inp->inp_route;
sockaddr_in_init(&u.dst4, &in4p_faddr(inp), 0);
return rtcache_lookup(ro, &u.dst);
}
void
inpcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp)
{
rtcache_unref(rt, &inp->inp_route);
}
/* $NetBSD: exec_aout.c,v 1.41 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1993, 1994 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_aout.c,v 1.41 2019/11/20 19:37:53 pgoyette Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_aout.h>
#include <sys/resourcevar.h>
#include <sys/module.h>
#include <uvm/uvm_extern.h>
MODULE(MODULE_CLASS_EXEC, exec_aout, NULL);
static struct execsw exec_aout_execsw = {
.es_hdrsz = sizeof(struct exec),
.es_makecmds = exec_aout_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = 0,
.es_copyargs = copyargs,
.es_setregs = NULL,
.es_coredump = coredump_netbsd,
.es_setup_stack = exec_setup_stack,
};
static int
exec_aout_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(&exec_aout_execsw, 1);
case MODULE_CMD_FINI:
return exec_remove(&exec_aout_execsw, 1);
default:
return ENOTTY;
}
}
/*
* exec_aout_makecmds(): Check if it's an a.out-format executable.
*
* Given a lwp pointer and an exec package pointer, see if the referent
* of the epp is in a.out format. First check 'standard' magic numbers for
* this architecture. If that fails, try a CPU-dependent hook.
*
* This function, in the former case, or the hook, in the latter, is
* responsible for creating a set of vmcmds which can be used to build
* the process's vm space and inserting them into the exec package.
*/
int
exec_aout_makecmds(struct lwp *l, struct exec_package *epp)
{
u_long midmag, magic;
u_short mid;
int error;
struct exec *execp = epp->ep_hdr;
if (epp->ep_hdrvalid < sizeof(struct exec))
return ENOEXEC;
midmag = ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0x3ff;
magic = midmag & 0xffff;
midmag = mid << 16 | magic;
switch (midmag) {
case (MID_MACHINE << 16) | ZMAGIC:
error = exec_aout_prep_zmagic(l, epp);
break;
case (MID_MACHINE << 16) | NMAGIC:
error = exec_aout_prep_nmagic(l, epp);
break;
case (MID_MACHINE << 16) | OMAGIC:
error = exec_aout_prep_omagic(l, epp);
break;
default:
error = cpu_exec_aout_makecmds(l, epp);
}
if (error)
kill_vmcmds(&epp->ep_vmcmds);
else
epp->ep_flags &= ~EXEC_TOPDOWN_VM;
return error;
}
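/*
* Worked example (illustrative): on i386, where MID_MACHINE is
* MID_I386 (134) and ZMAGIC is 0413 (0x010b), a native demand-paged
* binary has mid == 134 and magic == 0x010b after the shifts and
* masks above, so the reassembled midmag is 0x0086010b and the switch
* dispatches to exec_aout_prep_zmagic().
*/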
/*
* exec_aout_prep_zmagic(): Prepare a 'native' ZMAGIC binary's exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
int
exec_aout_prep_zmagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
error = vn_marktext(epp->ep_vp);
if (error)
return (error);
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, round_page(execp->a_text),
epp->ep_taddr, epp->ep_vp, 0, VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, round_page(execp->a_data),
epp->ep_daddr, epp->ep_vp, execp->a_text,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* exec_aout_prep_nmagic(): Prepare a 'native' NMAGIC binary's exec package
*/
int
exec_aout_prep_nmagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = round_page(epp->ep_daddr + execp->a_data);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* exec_aout_prep_omagic(): Prepare a 'native' OMAGIC binary's exec package
*/
int
exec_aout_prep_omagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = round_page(epp->ep_daddr + execp->a_data);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
dsize = epp->ep_dsize + execp->a_text - round_page(execp->a_text);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
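/*
* Worked example (illustrative): with 4 KB pages, a_text == 6000,
* a_data == 3000 and a_bss == 1000, ep_dsize starts out as 4000.
* round_page(6000) == 8192, so the last text page already covers
* 8192 - 6000 == 2192 bytes of data, and ep_dsize is trimmed to
* 4000 + 6000 - 8192 == 1808 so that the rounded-up text and data
* sizes add up to the number of pages actually mapped.
*/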
/* $NetBSD: uvm_page_array.c,v 1.9 2020/05/26 21:52:12 ad Exp $ */
/*-
* Copyright (c)2011 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page_array.c,v 1.9 2020/05/26 21:52:12 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_page_array.h>
/*
* uvm_page_array_init: initialize the array.
*/
void
uvm_page_array_init(struct uvm_page_array *ar, struct uvm_object *uobj,
unsigned int flags)
{
ar->ar_idx = 0;
ar->ar_npages = 0;
ar->ar_uobj = uobj;
ar->ar_flags = flags;
}
/*
* uvm_page_array_fini: clean up the array.
*/
void
uvm_page_array_fini(struct uvm_page_array *ar)
{
/*
* currently nothing to do.
*/
#if defined(DIAGNOSTIC)
/*
* poison to trigger assertion in uvm_page_array_peek to
* detect usage errors.
*/
ar->ar_npages = 1;
ar->ar_idx = 1000;
#endif /* defined(DIAGNOSTIC) */
}
/*
* uvm_page_array_clear: forget the cached pages and initialize the array.
*/
void
uvm_page_array_clear(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
ar->ar_idx = 0;
ar->ar_npages = 0;
}
/*
* uvm_page_array_peek: return the next cached page.
*/
struct vm_page *
uvm_page_array_peek(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
if (ar->ar_idx == ar->ar_npages) {
return NULL;
}
return ar->ar_pages[ar->ar_idx];
}
/*
* uvm_page_array_advance: advance the array to the next cached page
*/
void
uvm_page_array_advance(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
ar->ar_idx++;
KASSERT(ar->ar_idx <= ar->ar_npages);
}
/*
* uvm_page_array_fill: lookup pages and keep them cached.
*
* return 0 on success. in that case, the results are cached in the array
* so that they will be picked up by later uvm_page_array_peek calls.
*
* nwant is the number of pages to fetch. a caller should treat it as a hint;
* nwant == 0 means the caller has no specific preference.
*
* return ENOENT if no pages are found.
*
* called with object lock held.
*/
int
uvm_page_array_fill(struct uvm_page_array *ar, voff_t off, unsigned int nwant)
{
unsigned int npages;
#if defined(DEBUG)
unsigned int i;
#endif /* defined(DEBUG) */
unsigned int maxpages = __arraycount(ar->ar_pages);
struct uvm_object *uobj = ar->ar_uobj;
const int flags = ar->ar_flags;
const bool dense = (flags & UVM_PAGE_ARRAY_FILL_DENSE) != 0;
const bool backward = (flags & UVM_PAGE_ARRAY_FILL_BACKWARD) != 0;
int error = 0;
if (nwant != 0 && nwant < maxpages) {
maxpages = nwant;
}
#if 0 /* called from DDB for "show obj/f" without lock */
KASSERT(rw_lock_held(uobj->vmobjlock));
#endif
KASSERT(uvm_page_array_peek(ar) == NULL);
if ((flags & UVM_PAGE_ARRAY_FILL_DIRTY) != 0) {
unsigned int tagmask = UVM_PAGE_DIRTY_TAG;
if ((flags & UVM_PAGE_ARRAY_FILL_WRITEBACK) != 0) {
tagmask |= UVM_PAGE_WRITEBACK_TAG;
}
npages =
(backward ? radix_tree_gang_lookup_tagged_node_reverse :
radix_tree_gang_lookup_tagged_node)(
&uobj->uo_pages, off >> PAGE_SHIFT, (void **)ar->ar_pages,
maxpages, dense, tagmask);
} else {
npages =
(backward ? radix_tree_gang_lookup_node_reverse :
radix_tree_gang_lookup_node)(
&uobj->uo_pages, off >> PAGE_SHIFT, (void **)ar->ar_pages,
maxpages, dense);
}
if (npages == 0) {
if (flags != 0) {
/*
* if dense or looking for tagged entries (or
* working backwards), fail right away.
*/
npages = 0;
} else {
/*
* there's nothing else to be found with the current
* set of arguments, in the current version of the
* tree.
*
* minimize repeated tree lookups by "finding" a
* null pointer, in case the caller keeps looping (a
* common use case).
*/
npages = 1;
ar->ar_pages[0] = NULL;
}
error = ENOENT;
}
KASSERT(npages <= maxpages);
ar->ar_npages = npages;
ar->ar_idx = 0;
#if defined(DEBUG)
for (i = 0; error == 0 && i < ar->ar_npages; i++) {
struct vm_page * const pg = ar->ar_pages[i];
KASSERT(pg != NULL);
KDASSERT(pg->uobject == uobj);
if (backward) {
KDASSERT(pg->offset <= off);
KDASSERT(i == 0 ||
pg->offset < ar->ar_pages[i - 1]->offset);
} else {
KDASSERT(pg->offset >= off);
KDASSERT(i == 0 ||
pg->offset > ar->ar_pages[i - 1]->offset);
}
}
#endif /* defined(DEBUG) */
return error;
}
/*
* uvm_page_array_fill_and_peek:
* same as uvm_page_array_peek except that, if the array is empty, try to fill
* it first.
*/
struct vm_page *
uvm_page_array_fill_and_peek(struct uvm_page_array *ar, voff_t off,
unsigned int nwant)
{
int error;
if (ar->ar_idx != ar->ar_npages) {
return ar->ar_pages[ar->ar_idx];
}
error = uvm_page_array_fill(ar, off, nwant);
if (error != 0) {
return NULL;
}
return uvm_page_array_peek(ar);
}
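/*
 * Example (illustrative sketch only, not part of the original file): a
 * typical consumer walks an object's pages with the API above, calling
 * uvm_page_array_fill_and_peek() and uvm_page_array_advance() in a loop.
 * This assumes the caller holds the object lock across the lookups;
 * "process_page" is a hypothetical stand-in for the caller's per-page work.
 *
 *	struct uvm_page_array a;
 *	struct vm_page *pg;
 *	voff_t off = startoff;
 *
 *	uvm_page_array_init(&a, uobj, 0);
 *	while ((pg = uvm_page_array_fill_and_peek(&a, off, 0)) != NULL) {
 *		process_page(pg);
 *		off = pg->offset + PAGE_SIZE;
 *		uvm_page_array_advance(&a);
 *	}
 *	uvm_page_array_fini(&a);
 */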
/* $NetBSD: subr_pool.c,v 1.290 2023/04/09 12:21:59 riastradh Exp $ */
/*
* Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018,
* 2020, 2021 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by
* Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.290 2023/04/09 12:21:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_pool.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/bitops.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vmem.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/debug.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/xcall.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/asan.h>
#include <sys/msan.h>
#include <sys/fault.h>
#include <uvm/uvm_extern.h>
/*
* Pool resource management utility.
*
* Memory is allocated in pages which are split into pieces according to
* the pool item size. Each page is kept on one of three lists in the
* pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
* for empty, full and partially-full pages respectively. The individual
* pool items are on a linked list headed by `ph_itemlist' in each page
* header. The memory for building the page list is either taken from
* the allocated pages themselves (for small pool items) or taken from
* an internal pool of page headers (`phpool').
*/
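/*
 * Example (illustrative sketch only): how a consumer of this facility
 * typically uses the pool(9) API implemented below. The pool name
 * "examplepl" and struct example are hypothetical.
 *
 *	static struct pool example_pool;
 *
 *	pool_init(&example_pool, sizeof(struct example), 0, 0, 0,
 *	    "examplepl", NULL, IPL_NONE);
 *
 *	struct example *e = pool_get(&example_pool, PR_WAITOK);
 *	...
 *	pool_put(&example_pool, e);
 *
 *	pool_destroy(&example_pool);
 */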
/* List of all pools. Non static as needed by 'vmstat -m' */
TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
/* Private pool for page header structures */
#define PHPOOL_MAX 8
static struct pool phpool[PHPOOL_MAX];
#define PHPOOL_FREELIST_NELEM(idx) \
(((idx) == 0) ? BITMAP_MIN_SIZE : BITMAP_SIZE * (1 << (idx)))
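/*
 * Worked example (illustrative, assuming LP64, where BITMAP_MIN_SIZE below
 * works out to 64 and BITMAP_SIZE to 32): PHPOOL_FREELIST_NELEM yields
 * 64, 64, 128, 256, 512, 1024, 2048 and 4096 items for indices 0 through 7,
 * i.e. each phpool past index 1 doubles the number of free-list bits its
 * page headers can track.
 */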
#if !defined(KMSAN) && (defined(DIAGNOSTIC) || defined(KASAN))
#define POOL_REDZONE
#endif
#if defined(POOL_QUARANTINE)
#define POOL_NOCACHE
#endif
#ifdef POOL_REDZONE
# ifdef KASAN
# define POOL_REDZONE_SIZE 8
# else
# define POOL_REDZONE_SIZE 2
# endif
static void pool_redzone_init(struct pool *, size_t);
static void pool_redzone_fill(struct pool *, void *);
static void pool_redzone_check(struct pool *, void *);
static void pool_cache_redzone_check(pool_cache_t, void *);
#else
# define pool_redzone_init(pp, sz) __nothing
# define pool_redzone_fill(pp, ptr) __nothing
# define pool_redzone_check(pp, ptr) __nothing
# define pool_cache_redzone_check(pc, ptr) __nothing
#endif
#ifdef KMSAN
static inline void pool_get_kmsan(struct pool *, void *);
static inline void pool_put_kmsan(struct pool *, void *);
static inline void pool_cache_get_kmsan(pool_cache_t, void *);
static inline void pool_cache_put_kmsan(pool_cache_t, void *);
#else
#define pool_get_kmsan(pp, ptr) __nothing
#define pool_put_kmsan(pp, ptr) __nothing
#define pool_cache_get_kmsan(pc, ptr) __nothing
#define pool_cache_put_kmsan(pc, ptr) __nothing
#endif
#ifdef POOL_QUARANTINE
static void pool_quarantine_init(struct pool *);
static void pool_quarantine_flush(struct pool *);
static bool pool_put_quarantine(struct pool *, void *,
struct pool_pagelist *);
#else
#define pool_quarantine_init(a) __nothing
#define pool_quarantine_flush(a) __nothing
#define pool_put_quarantine(a, b, c) false
#endif
#ifdef POOL_NOCACHE
static bool pool_cache_put_nocache(pool_cache_t, void *);
#else
#define pool_cache_put_nocache(a, b) false
#endif
#define NO_CTOR __FPTRCAST(int (*)(void *, void *, int), nullop)
#define NO_DTOR __FPTRCAST(void (*)(void *, void *), nullop)
#define pc_has_pser(pc) (((pc)->pc_roflags & PR_PSERIALIZE) != 0)
#define pc_has_ctor(pc) ((pc)->pc_ctor != NO_CTOR)
#define pc_has_dtor(pc) ((pc)->pc_dtor != NO_DTOR)
#define pp_has_pser(pp) (((pp)->pr_roflags & PR_PSERIALIZE) != 0)
#define pool_barrier() xc_barrier(0)
/*
* Pool backend allocators.
*
* Each pool has a backend allocator that handles allocation, deallocation,
* and any additional draining that might be needed.
*
* We provide two standard allocators:
*
* pool_allocator_kmem - the default when no allocator is specified
*
* pool_allocator_nointr - used for pools that will not be accessed
* in interrupt context.
*/
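/*
 * Example (illustrative sketch only): a pool may also supply its own
 * backend allocator by filling in a struct pool_allocator and passing it
 * to pool_init(). The callbacks "example_page_alloc"/"example_page_free"
 * are hypothetical; pa_pagesz == 0 means pool_init() falls back to
 * PAGE_SIZE.
 *
 *	static void *example_page_alloc(struct pool *, int);
 *	static void example_page_free(struct pool *, void *);
 *
 *	static struct pool_allocator example_allocator = {
 *		.pa_alloc = example_page_alloc,
 *		.pa_free = example_page_free,
 *		.pa_pagesz = 0,
 *	};
 *
 *	pool_init(&example_pool, sizeof(struct example), 0, 0, 0,
 *	    "examplepl", &example_allocator, IPL_VM);
 */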
void *pool_page_alloc(struct pool *, int);
void pool_page_free(struct pool *, void *);
static void *pool_page_alloc_meta(struct pool *, int);
static void pool_page_free_meta(struct pool *, void *);
struct pool_allocator pool_allocator_kmem = {
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 0
};
struct pool_allocator pool_allocator_nointr = {
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 0
};
struct pool_allocator pool_allocator_meta = {
.pa_alloc = pool_page_alloc_meta,
.pa_free = pool_page_free_meta,
.pa_pagesz = 0
};
#define POOL_ALLOCATOR_BIG_BASE 13
static struct pool_allocator pool_allocator_big[] = {
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 8),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 9),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 10),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 11),
}
};
static int pool_bigidx(size_t);
/* # of seconds to retain page after last use */
int pool_inactive_time = 10;
/* Next candidate for drainage (see pool_drain()) */
static struct pool *drainpp;
/* This lock protects both pool_head and drainpp. */
static kmutex_t pool_head_lock;
static kcondvar_t pool_busy;
/* This lock protects initialization of a potentially shared pool allocator */
static kmutex_t pool_allocator_lock;
static unsigned int poolid_counter = 0;
typedef uint32_t pool_item_bitmap_t;
#define BITMAP_SIZE (CHAR_BIT * sizeof(pool_item_bitmap_t))
#define BITMAP_MASK (BITMAP_SIZE - 1)
#define BITMAP_MIN_SIZE (CHAR_BIT * sizeof(((struct pool_item_header *)NULL)->ph_u2))
struct pool_item_header {
/* Page headers */
LIST_ENTRY(pool_item_header)
ph_pagelist; /* pool page list */
union {
/* !PR_PHINPAGE */
struct {
SPLAY_ENTRY(pool_item_header)
phu_node; /* off-page page headers */
} phu_offpage;
/* PR_PHINPAGE */
struct {
unsigned int phu_poolid;
} phu_onpage;
} ph_u1;
void * ph_page; /* this page's address */
uint32_t ph_time; /* last referenced */
uint16_t ph_nmissing; /* # of chunks in use */
uint16_t ph_off; /* start offset in page */
union {
/* !PR_USEBMAP */
struct {
LIST_HEAD(, pool_item)
phu_itemlist; /* chunk list for this page */
} phu_normal;
/* PR_USEBMAP */
struct {
pool_item_bitmap_t phu_bitmap[1];
} phu_notouch;
} ph_u2;
};
#define ph_node ph_u1.phu_offpage.phu_node
#define ph_poolid ph_u1.phu_onpage.phu_poolid
#define ph_itemlist ph_u2.phu_normal.phu_itemlist
#define ph_bitmap ph_u2.phu_notouch.phu_bitmap
#define PHSIZE ALIGN(sizeof(struct pool_item_header))
CTASSERT(offsetof(struct pool_item_header, ph_u2) +
BITMAP_MIN_SIZE / CHAR_BIT == sizeof(struct pool_item_header));
#if defined(DIAGNOSTIC) && !defined(KASAN)
#define POOL_CHECK_MAGIC
#endif
struct pool_item {
#ifdef POOL_CHECK_MAGIC
u_int pi_magic;
#endif
#define PI_MAGIC 0xdeaddeadU
/* Other entries use only this list entry */
LIST_ENTRY(pool_item) pi_list;
};
#define POOL_NEEDS_CATCHUP(pp) \
((pp)->pr_nitems < (pp)->pr_minitems || \
(pp)->pr_npages < (pp)->pr_minpages)
#define POOL_OBJ_TO_PAGE(pp, v) \
(void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask)
/*
* Pool cache management.
*
* Pool caches provide a way for constructed objects to be cached by the
* pool subsystem. This can lead to performance improvements by avoiding
* needless object construction/destruction; it is deferred until absolutely
* necessary.
*
* Caches are grouped into cache groups. Each cache group references up
* to PCG_NUMOBJECTS constructed objects. When a cache allocates an
* object from the pool, it calls the object's constructor and places it
* into a cache group. When a cache group frees an object back to the
* pool, it first calls the object's destructor. This allows the object
* to persist in constructed form while freed to the cache.
*
* The pool references each cache, so that when a pool is drained by the
* pagedaemon, it can drain each individual cache as well. Each time a
* cache is drained, the most idle cache group is freed to the pool in
* its entirety.
*
* Pool caches are laid on top of pools. By layering them, we can avoid
* the complexity of cache management for pools which would not benefit
* from it.
*/
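/*
 * Example (illustrative sketch only): consumers use the pool_cache(9)
 * interface implemented below roughly as follows; the cache name, the
 * struct and the constructor/destructor are hypothetical.
 *
 *	static pool_cache_t example_cache;
 *
 *	example_cache = pool_cache_init(sizeof(struct example),
 *	    coherency_unit, 0, 0, "examplecache", NULL, IPL_NONE,
 *	    example_ctor, example_dtor, NULL);
 *
 *	struct example *e = pool_cache_get(example_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(example_cache, e);
 *
 *	pool_cache_destroy(example_cache);
 */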
static struct pool pcg_normal_pool;
static struct pool pcg_large_pool;
static struct pool cache_pool;
static struct pool cache_cpu_pool;
static pcg_t *volatile pcg_large_cache __cacheline_aligned;
static pcg_t *volatile pcg_normal_cache __cacheline_aligned;
/* List of all caches. */
TAILQ_HEAD(,pool_cache) pool_cache_head =
TAILQ_HEAD_INITIALIZER(pool_cache_head);
int pool_cache_disable; /* global disable for caching */
static const pcg_t pcg_dummy; /* zero sized: always empty, yet always full */
static bool pool_cache_put_slow(pool_cache_t, pool_cache_cpu_t *, int,
void *);
static bool pool_cache_get_slow(pool_cache_t, pool_cache_cpu_t *, int,
void **, paddr_t *, int);
static void pool_cache_cpu_init1(struct cpu_info *, pool_cache_t);
static int pool_cache_invalidate_groups(pool_cache_t, pcg_t *);
static void pool_cache_invalidate_cpu(pool_cache_t, u_int);
static void pool_cache_transfer(pool_cache_t);
static int pool_pcg_get(pcg_t *volatile *, pcg_t **);
static int pool_pcg_put(pcg_t *volatile *, pcg_t *);
static pcg_t * pool_pcg_trunc(pcg_t *volatile *);
static int pool_catchup(struct pool *);
static void pool_prime_page(struct pool *, void *,
struct pool_item_header *);
static void pool_update_curpage(struct pool *);
static int pool_grow(struct pool *, int);
static void *pool_allocator_alloc(struct pool *, int);
static void pool_allocator_free(struct pool *, void *);
static void pool_print_pagelist(struct pool *, struct pool_pagelist *,
void (*)(const char *, ...) __printflike(1, 2));
static void pool_print1(struct pool *, const char *,
void (*)(const char *, ...) __printflike(1, 2));
static int pool_chk_page(struct pool *, const char *,
struct pool_item_header *);
/* -------------------------------------------------------------------------- */
static inline unsigned int
pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph,
const void *v)
{
const char *cp = v;
unsigned int idx;
KASSERT(pp->pr_roflags & PR_USEBMAP);
idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size;
if (__predict_false(idx >= pp->pr_itemsperpage)) {
panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx,
pp->pr_itemsperpage);
}
return idx;
}
static inline void
pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph,
void *obj)
{
unsigned int idx = pr_item_bitmap_index(pp, ph, obj);
pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE);
pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);
if (__predict_false((*bitmap & mask) != 0)) {
panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj);
}
*bitmap |= mask;
}
static inline void *
pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph)
{
pool_item_bitmap_t *bitmap = ph->ph_bitmap;
unsigned int idx;
int i;
for (i = 0; ; i++) {
int bit;
KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage);
bit = ffs32(bitmap[i]);
if (bit) {
pool_item_bitmap_t mask;
bit--;
idx = (i * BITMAP_SIZE) + bit;
mask = 1U << bit;
KASSERT((bitmap[i] & mask) != 0);
bitmap[i] &= ~mask;
break;
}
}
KASSERT(idx < pp->pr_itemsperpage);
return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size;
}
static inline void
pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph)
{
pool_item_bitmap_t *bitmap = ph->ph_bitmap;
const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE);
int i;
for (i = 0; i < n; i++) {
bitmap[i] = (pool_item_bitmap_t)-1;
}
}
/* -------------------------------------------------------------------------- */
static inline void
pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph,
void *obj)
{
struct pool_item *pi = obj;
KASSERT(!pp_has_pser(pp));
#ifdef POOL_CHECK_MAGIC
pi->pi_magic = PI_MAGIC;
#endif
if (pp->pr_redzone) {
/*
* Mark the pool_item as valid. The rest is already
* invalid.
*/
kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0);
}
LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
}
static inline void *
pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph)
{
struct pool_item *pi;
void *v;
v = pi = LIST_FIRST(&ph->ph_itemlist);
if (__predict_false(v == NULL)) {
mutex_exit(&pp->pr_lock);
panic("%s: [%s] page empty", __func__, pp->pr_wchan);
}
KASSERTMSG((pp->pr_nitems > 0),
"%s: [%s] nitems %u inconsistent on itemlist",
__func__, pp->pr_wchan, pp->pr_nitems);
#ifdef POOL_CHECK_MAGIC
KASSERTMSG((pi->pi_magic == PI_MAGIC),
"%s: [%s] free list modified: "
"magic=%x; page %p; item addr %p", __func__,
pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
#endif
/*
* Remove from item list.
*/
LIST_REMOVE(pi, pi_list);
return v;
}
/* -------------------------------------------------------------------------- */
static inline void
pr_phinpage_check(struct pool *pp, struct pool_item_header *ph, void *page,
void *object)
{
if (__predict_false((void *)ph->ph_page != page)) {
panic("%s: [%s] item %p not part of pool", __func__,
pp->pr_wchan, object);
}
if (__predict_false((char *)object < (char *)page + ph->ph_off)) {
panic("%s: [%s] item %p below item space", __func__,
pp->pr_wchan, object);
}
if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
panic("%s: [%s] item %p poolid %u != %u", __func__,
pp->pr_wchan, object, ph->ph_poolid, pp->pr_poolid);
}
}
static inline void
pc_phinpage_check(pool_cache_t pc, void *object)
{
struct pool_item_header *ph;
struct pool *pp;
void *page;
pp = &pc->pc_pool;
page = POOL_OBJ_TO_PAGE(pp, object);
ph = (struct pool_item_header *)page;
pr_phinpage_check(pp, ph, page, object);
}
/* -------------------------------------------------------------------------- */
static inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{
/*
* We consider a pool_item_header with a smaller ph_page to be bigger. This
* unnatural ordering is for the benefit of pr_find_pagehead.
*/
if (a->ph_page < b->ph_page)
return 1;
else if (a->ph_page > b->ph_page)
return -1;
else
return 0;
}
SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
static inline struct pool_item_header *
pr_find_pagehead_noalign(struct pool *pp, void *v)
{
struct pool_item_header *ph, tmp;
tmp.ph_page = (void *)(uintptr_t)v;
ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
if (ph == NULL) {
ph = SPLAY_ROOT(&pp->pr_phtree);
if (ph != NULL && phtree_compare(&tmp, ph) >= 0) {
ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph);
}
KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0);
}
return ph;
}
/*
* Return the pool page header based on item address.
*/
static inline struct pool_item_header *
pr_find_pagehead(struct pool *pp, void *v)
{
struct pool_item_header *ph, tmp;
if ((pp->pr_roflags & PR_NOALIGN) != 0) {
ph = pr_find_pagehead_noalign(pp, v);
} else {
void *page = POOL_OBJ_TO_PAGE(pp, v);
if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
ph = (struct pool_item_header *)page;
pr_phinpage_check(pp, ph, page, v);
} else {
tmp.ph_page = page;
ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
}
}
KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) ||
((char *)ph->ph_page <= (char *)v &&
(char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz));
return ph;
}
static void
pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq)
{
struct pool_item_header *ph;
while ((ph = LIST_FIRST(pq)) != NULL) {
LIST_REMOVE(ph, ph_pagelist);
pool_allocator_free(pp, ph->ph_page);
if ((pp->pr_roflags & PR_PHINPAGE) == 0)
pool_put(pp->pr_phpool, ph);
}
}
/*
* Remove a page from the pool.
*/
static inline void
pr_rmpage(struct pool *pp, struct pool_item_header *ph,
struct pool_pagelist *pq)
{
KASSERT(mutex_owned(&pp->pr_lock));
/*
* If the page was idle, decrement the idle page count.
*/
if (ph->ph_nmissing == 0) {
KASSERT(pp->pr_nidle != 0);
KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage),
"%s: [%s] nitems=%u < itemsperpage=%u", __func__,
pp->pr_wchan, pp->pr_nitems, pp->pr_itemsperpage);
pp->pr_nidle--;
}
pp->pr_nitems -= pp->pr_itemsperpage;
/*
* Unlink the page from the pool and queue it for release.
*/
LIST_REMOVE(ph, ph_pagelist);
if (pp->pr_roflags & PR_PHINPAGE) {
if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
panic("%s: [%s] ph %p poolid %u != %u",
__func__, pp->pr_wchan, ph, ph->ph_poolid,
pp->pr_poolid);
}
} else {
SPLAY_REMOVE(phtree, &pp->pr_phtree, ph);
}
LIST_INSERT_HEAD(pq, ph, ph_pagelist);
pp->pr_npages--;
pp->pr_npagefree++;
pool_update_curpage(pp);
}
/*
* Initialize all the pools listed in the "pools" link set.
*/
void
pool_subsystem_init(void)
{
size_t size;
int idx;
mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&pool_busy, "poolbusy");
/*
* Initialize private page header pool and cache magazine pool if we
* haven't done so yet.
*/
for (idx = 0; idx < PHPOOL_MAX; idx++) {
static char phpool_names[PHPOOL_MAX][6+1+6+1];
int nelem;
size_t sz;
nelem = PHPOOL_FREELIST_NELEM(idx);
KASSERT(nelem != 0);
snprintf(phpool_names[idx], sizeof(phpool_names[idx]),
"phpool-%d", nelem);
sz = offsetof(struct pool_item_header,
ph_bitmap[howmany(nelem, BITMAP_SIZE)]);
pool_init(&phpool[idx], sz, 0, 0, 0,
phpool_names[idx], &pool_allocator_meta, IPL_VM);
}
size = sizeof(pcg_t) +
(PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t);
pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0,
"pcgnormal", &pool_allocator_meta, IPL_VM);
size = sizeof(pcg_t) +
(PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t);
pool_init(&pcg_large_pool, size, coherency_unit, 0, 0,
"pcglarge", &pool_allocator_meta, IPL_VM);
pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit,
0, 0, "pcache", &pool_allocator_meta, IPL_NONE);
pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit,
0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE);
}
static inline bool
pool_init_is_phinpage(const struct pool *pp)
{
size_t pagesize;
if (pp->pr_roflags & PR_PHINPAGE) {
return true;
}
if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) {
return false;
}
pagesize = pp->pr_alloc->pa_pagesz;
/*
* Threshold: the item size is below 1/16 of a page size, and below
* 8 times the page header size. The latter ensures we go off-page
* if the page header would make us waste a rather big item.
*/
if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) {
return true;
}
/* Put the header into the page if it doesn't waste any items. */
if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) {
return true;
}
return false;
}
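/*
 * Worked example for the threshold above (illustrative, assuming a 4 kB
 * page and an LP64 pool_item_header of roughly 56 bytes, so PHSIZE * 8 is
 * about 448): MIN(4096 / 16, 448) == 256, so items smaller than 256 bytes
 * keep their page header on-page, while larger items fall through to the
 * exact "does the header cost us an item?" check at the end of the
 * function.
 */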
static inline bool
pool_init_is_usebmap(const struct pool *pp)
{
size_t bmapsize;
if (pp->pr_roflags & PR_NOTOUCH) {
return true;
}
/*
* If we're off-page, go with a bitmap.
*/
if (!(pp->pr_roflags & PR_PHINPAGE)) {
return true;
}
/*
* If we're on-page, and the page header can already contain a bitmap
* big enough to cover all the items of the page, go with a bitmap.
*/
bmapsize = roundup(PHSIZE, pp->pr_align) -
offsetof(struct pool_item_header, ph_bitmap[0]);
KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0);
if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) {
return true;
}
return false;
}
/*
* Initialize the given pool resource structure.
*
* We export this routine to allow other kernel parts to declare
* static pools that must be initialized before kmem(9) is available.
*/
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
const char *wchan, struct pool_allocator *palloc, int ipl)
{
struct pool *pp1;
size_t prsize;
int itemspace, slack;
/* XXX ioff will be removed. */
KASSERT(ioff == 0);
#ifdef DEBUG
if (__predict_true(!cold))
mutex_enter(&pool_head_lock);
/*
* Check that the pool hasn't already been initialised and
* added to the list of all pools.
*/
TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
if (pp == pp1)
panic("%s: [%s] already initialised", __func__,
wchan);
}
if (__predict_true(!cold))
mutex_exit(&pool_head_lock);
#endif
if (palloc == NULL)
palloc = &pool_allocator_kmem;
if (!cold)
mutex_enter(&pool_allocator_lock);
if (palloc->pa_refcnt++ == 0) {
if (palloc->pa_pagesz == 0)
palloc->pa_pagesz = PAGE_SIZE;
TAILQ_INIT(&palloc->pa_list);
mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM);
palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
}
if (!cold)
mutex_exit(&pool_allocator_lock);
/*
* PR_PSERIALIZE implies PR_NOTOUCH; freed objects must remain
* valid until the backing page is returned to the system.
*/
if (flags & PR_PSERIALIZE) {
flags |= PR_NOTOUCH;
}
if (align == 0)
align = ALIGN(1);
prsize = size;
if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item))
prsize = sizeof(struct pool_item);
prsize = roundup(prsize, align);
KASSERTMSG((prsize <= palloc->pa_pagesz),
"%s: [%s] pool item size (%zu) larger than page size (%u)",
__func__, wchan, prsize, palloc->pa_pagesz);
/*
* Initialize the pool structure.
*/
LIST_INIT(&pp->pr_emptypages);
LIST_INIT(&pp->pr_fullpages);
LIST_INIT(&pp->pr_partpages);
pp->pr_cache = NULL;
pp->pr_curpage = NULL;
pp->pr_npages = 0;
pp->pr_minitems = 0;
pp->pr_minpages = 0;
pp->pr_maxpages = UINT_MAX;
pp->pr_roflags = flags;
pp->pr_flags = 0;
pp->pr_size = prsize;
pp->pr_reqsize = size;
pp->pr_align = align;
pp->pr_wchan = wchan;
pp->pr_alloc = palloc;
pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter);
pp->pr_nitems = 0;
pp->pr_nout = 0;
pp->pr_hardlimit = UINT_MAX;
pp->pr_hardlimit_warning = NULL;
pp->pr_hardlimit_ratecap.tv_sec = 0;
pp->pr_hardlimit_ratecap.tv_usec = 0;
pp->pr_hardlimit_warning_last.tv_sec = 0;
pp->pr_hardlimit_warning_last.tv_usec = 0;
pp->pr_drain_hook = NULL;
pp->pr_drain_hook_arg = NULL;
pp->pr_freecheck = NULL;
pp->pr_redzone = false;
pool_redzone_init(pp, size);
pool_quarantine_init(pp);
/*
* Decide whether to put the page header off-page to avoid wasting too
* large a part of the page or too big an item. Off-page page headers
* go on a hash table, so we can match a returned item with its header
* based on the page address.
*/
if (pool_init_is_phinpage(pp)) {
/* Use the beginning of the page for the page header */
itemspace = palloc->pa_pagesz - roundup(PHSIZE, align);
pp->pr_itemoffset = roundup(PHSIZE, align);
pp->pr_roflags |= PR_PHINPAGE;
} else {
/* The page header will be taken from our page header pool */
itemspace = palloc->pa_pagesz;
pp->pr_itemoffset = 0;
SPLAY_INIT(&pp->pr_phtree);
}
pp->pr_itemsperpage = itemspace / pp->pr_size;
KASSERT(pp->pr_itemsperpage != 0);
/*
* Decide whether to use a bitmap or a linked list to manage freed
* items.
*/
if (pool_init_is_usebmap(pp)) {
pp->pr_roflags |= PR_USEBMAP;
}
/*
* If we're off-page, then we're using a bitmap; choose the appropriate
* pool to allocate page headers, whose size varies depending on the
* bitmap. If we're on-page, nothing to do.
*/
if (!(pp->pr_roflags & PR_PHINPAGE)) {
int idx;
KASSERT(pp->pr_roflags & PR_USEBMAP);
for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx);
idx++) {
/* nothing */
}
if (idx >= PHPOOL_MAX) {
/*
* if you see this panic, consider tweaking
* PHPOOL_MAX and PHPOOL_FREELIST_NELEM.
*/
panic("%s: [%s] too large itemsperpage(%d) for "
"PR_USEBMAP", __func__,
pp->pr_wchan, pp->pr_itemsperpage);
}
pp->pr_phpool = &phpool[idx];
} else {
pp->pr_phpool = NULL;
}
/*
* Use the slack between the chunks and the page header
* for "cache coloring".
*/
slack = itemspace - pp->pr_itemsperpage * pp->pr_size;
pp->pr_maxcolor = rounddown(slack, align);
pp->pr_curcolor = 0;
pp->pr_nget = 0;
pp->pr_nfail = 0;
pp->pr_nput = 0;
pp->pr_npagealloc = 0;
pp->pr_npagefree = 0;
pp->pr_hiwat = 0;
pp->pr_nidle = 0;
pp->pr_refcnt = 0;
mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl);
cv_init(&pp->pr_cv, wchan);
pp->pr_ipl = ipl;
/* Insert into the list of all pools. */
if (!cold)
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0)
break;
}
if (pp1 == NULL)
TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
else
TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist);
if (!cold)
mutex_exit(&pool_head_lock);
/* Insert this into the list of pools using this allocator. */
if (!cold)
mutex_enter(&palloc->pa_lock);
TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
if (!cold)
mutex_exit(&palloc->pa_lock);
}
/*
* De-commission a pool resource.
*/
void
pool_destroy(struct pool *pp)
{
struct pool_pagelist pq;
struct pool_item_header *ph;
pool_quarantine_flush(pp);
/* Remove from global pool list */
mutex_enter(&pool_head_lock);
while (pp->pr_refcnt != 0)
cv_wait(&pool_busy, &pool_head_lock);
TAILQ_REMOVE(&pool_head, pp, pr_poollist);
if (drainpp == pp)
drainpp = NULL;
mutex_exit(&pool_head_lock);
/* Remove this pool from its allocator's list of pools. */
mutex_enter(&pp->pr_alloc->pa_lock);
TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
mutex_exit(&pp->pr_alloc->pa_lock);
mutex_enter(&pool_allocator_lock);
if (--pp->pr_alloc->pa_refcnt == 0)
mutex_destroy(&pp->pr_alloc->pa_lock);
mutex_exit(&pool_allocator_lock);
mutex_enter(&pp->pr_lock);
KASSERT(pp->pr_cache == NULL);
KASSERTMSG((pp->pr_nout == 0),
"%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan,
pp->pr_nout);
KASSERT(LIST_EMPTY(&pp->pr_fullpages));
KASSERT(LIST_EMPTY(&pp->pr_partpages));
/* Remove all pages */
LIST_INIT(&pq);
while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
pr_rmpage(pp, ph, &pq);
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
cv_destroy(&pp->pr_cv);
mutex_destroy(&pp->pr_lock);
}
void
pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg)
{
/* XXX no locking -- must be used just after pool_init() */
KASSERTMSG((pp->pr_drain_hook == NULL),
"%s: [%s] already set", __func__, pp->pr_wchan);
pp->pr_drain_hook = fn;
pp->pr_drain_hook_arg = arg;
}
static struct pool_item_header *
pool_alloc_item_header(struct pool *pp, void *storage, int flags)
{
struct pool_item_header *ph;
if ((pp->pr_roflags & PR_PHINPAGE) != 0)
ph = storage;
else
ph = pool_get(pp->pr_phpool, flags);
return ph;
}
/*
* Grab an item from the pool.
*/
void *
pool_get(struct pool *pp, int flags)
{
struct pool_item_header *ph;
void *v;
KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
KASSERTMSG((pp->pr_itemsperpage != 0),
"%s: [%s] pr_itemsperpage is zero, "
"pool not initialized?", __func__, pp->pr_wchan);
KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p())
|| pp->pr_ipl != IPL_NONE || cold || panicstr != NULL),
"%s: [%s] is IPL_NONE, but called from interrupt context",
__func__, pp->pr_wchan);
if (flags & PR_WAITOK) {
ASSERT_SLEEPABLE();
}
if (flags & PR_NOWAIT) {
if (fault_inject())
return NULL;
}
mutex_enter(&pp->pr_lock);
startover:
/*
* Check to see if we've reached the hard limit. If we have,
* and we can wait, then wait until an item has been returned to
* the pool.
*/
KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit),
"%s: %s: crossed hard limit", __func__, pp->pr_wchan);
if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
if (pp->pr_drain_hook != NULL) {
/*
* Since the drain hook is going to free things
* back to the pool, unlock, call the hook, re-lock,
* and check the hardlimit condition again.
*/
mutex_exit(&pp->pr_lock);
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
mutex_enter(&pp->pr_lock);
if (pp->pr_nout < pp->pr_hardlimit)
goto startover;
}
if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
/*
* XXX: A warning isn't logged in this case. Should
* it be?
*/
pp->pr_flags |= PR_WANTED;
do {
cv_wait(&pp->pr_cv, &pp->pr_lock);
} while (pp->pr_flags & PR_WANTED);
goto startover;
}
/*
* Log a message that the hard limit has been hit.
*/
if (pp->pr_hardlimit_warning != NULL &&
ratecheck(&pp->pr_hardlimit_warning_last,
&pp->pr_hardlimit_ratecap))
log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
pp->pr_nfail++;
mutex_exit(&pp->pr_lock);
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return NULL;
}
/*
* The convention we use is that if `curpage' is not NULL, then
* it points at a non-empty bucket. In particular, `curpage'
* never points at a page header which has PR_PHINPAGE set and
* has no items in its bucket.
*/
if ((ph = pp->pr_curpage) == NULL) {
int error;
KASSERTMSG((pp->pr_nitems == 0),
"%s: [%s] curpage NULL, inconsistent nitems %u",
__func__, pp->pr_wchan, pp->pr_nitems);
/*
* Call the back-end page allocator for more memory.
* Release the pool lock, as the back-end page allocator
* may block.
*/
error = pool_grow(pp, flags);
if (error != 0) {
/*
* pool_grow aborts when another thread
* is allocating a new page; retry if we
* waited for that allocation to finish.
*/
if (error == ERESTART)
goto startover;
/*
* We were unable to allocate a page or item
* header, but we released the lock during
* allocation, so perhaps items were freed
* back to the pool. Check for this case.
*/
if (pp->pr_curpage != NULL)
goto startover;
pp->pr_nfail++;
mutex_exit(&pp->pr_lock);
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return NULL;
}
/* Start the allocation process over. */
goto startover;
}
if (pp->pr_roflags & PR_USEBMAP) {
KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage),
"%s: [%s] pool page empty", __func__, pp->pr_wchan);
v = pr_item_bitmap_get(pp, ph);
} else {
v = pr_item_linkedlist_get(pp, ph);
}
pp->pr_nitems--;
pp->pr_nout++;
if (ph->ph_nmissing == 0) {
KASSERT(pp->pr_nidle > 0);
pp->pr_nidle--;
/*
* This page was previously empty. Move it to the list of
* partially-full pages. This page is already curpage.
*/
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
}
ph->ph_nmissing++;
if (ph->ph_nmissing == pp->pr_itemsperpage) {
KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) ||
LIST_EMPTY(&ph->ph_itemlist)),
"%s: [%s] nmissing (%u) inconsistent", __func__,
pp->pr_wchan, ph->ph_nmissing);
/*
* This page is now full. Move it to the full list
* and select a new current page.
*/
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
pool_update_curpage(pp);
}
pp->pr_nget++;
/*
* If we have a low water mark and we are now below that low
* water mark, add more items to the pool.
*/
if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
/*
* XXX: Should we log a warning? Should we set up a timeout
* to try again in a second or so? The latter could break
* a caller's assumptions about interrupt protection, etc.
*/
}
mutex_exit(&pp->pr_lock);
KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0);
FREECHECK_OUT(&pp->pr_freecheck, v);
pool_redzone_fill(pp, v);
pool_get_kmsan(pp, v);
if (flags & PR_ZERO)
memset(v, 0, pp->pr_reqsize);
return v;
}
/*
* Internal version of pool_put(). Pool is already locked/entered.
*/
static void
pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq)
{
struct pool_item_header *ph;
KASSERT(mutex_owned(&pp->pr_lock));
pool_redzone_check(pp, v);
pool_put_kmsan(pp, v);
FREECHECK_IN(&pp->pr_freecheck, v);
LOCKDEBUG_MEM_CHECK(v, pp->pr_size);
KASSERTMSG((pp->pr_nout > 0),
"%s: [%s] putting with none out", __func__, pp->pr_wchan);
if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) {
panic("%s: [%s] page header missing", __func__, pp->pr_wchan);
}
/*
* Return to item list.
*/
if (pp->pr_roflags & PR_USEBMAP) {
pr_item_bitmap_put(pp, ph, v);
} else {
pr_item_linkedlist_put(pp, ph, v);
}
KDASSERT(ph->ph_nmissing != 0);
ph->ph_nmissing--;
pp->pr_nput++;
pp->pr_nitems++;
pp->pr_nout--;
/* Cancel "pool empty" condition if it exists */
if (pp->pr_curpage == NULL)
pp->pr_curpage = ph;
if (pp->pr_flags & PR_WANTED) {
pp->pr_flags &= ~PR_WANTED;
cv_broadcast(&pp->pr_cv);
}
/*
* If this page is now empty, do one of two things:
*
* (1) If we have more pages than the page high water mark,
* free the page back to the system. ONLY CONSIDER
* FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE
* CLAIM.
*
* (2) Otherwise, move the page to the empty page list.
*
* Either way, select a new current page (so we use a partially-full
* page if one is available).
*/
if (ph->ph_nmissing == 0) {
pp->pr_nidle++;
if (pp->pr_nitems - pp->pr_itemsperpage >= pp->pr_minitems && pp->pr_npages > pp->pr_minpages &&
pp->pr_npages > pp->pr_maxpages) {
pr_rmpage(pp, ph, pq);
} else {
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
/*
* Update the timestamp on the page. A page must
* be idle for some period of time before it can
* be reclaimed by the pagedaemon. This minimizes
* ping-pong'ing for memory.
*
* note for 64-bit time_t: truncating to 32-bit is not
* a problem for our usage.
*/
ph->ph_time = time_uptime;
}
pool_update_curpage(pp);
}
/*
* If the page was previously completely full, move it to the
* partially-full list and make it the current page. The next
* allocation will get the item from this page, instead of
* further fragmenting the pool.
*/
else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
pp->pr_curpage = ph;
}
}
void
pool_put(struct pool *pp, void *v)
{
struct pool_pagelist pq;
LIST_INIT(&pq);
mutex_enter(&pp->pr_lock);
if (!pool_put_quarantine(pp, v, &pq)) {
pool_do_put(pp, v, &pq);
}
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
}
/*
* pool_grow: grow a pool by a page.
*
* => called with pool locked.
* => unlock and relock the pool.
* => return with pool locked.
*/
static int
pool_grow(struct pool *pp, int flags)
{
struct pool_item_header *ph;
char *storage;
/*
* If there's a pool_grow in progress, wait for it to complete
* and try again from the top.
*/
if (pp->pr_flags & PR_GROWING) {
if (flags & PR_WAITOK) {
do {
cv_wait(&pp->pr_cv, &pp->pr_lock);
} while (pp->pr_flags & PR_GROWING);
return ERESTART;
} else {
if (pp->pr_flags & PR_GROWINGNOWAIT) {
/*
* This needs an unlock/relock dance so
* that the other caller has a chance to
* run and actually perform the allocation.
* Note that this is effectively a busy-wait.
*/
mutex_exit(&pp->pr_lock);
mutex_enter(&pp->pr_lock);
return ERESTART;
}
return EWOULDBLOCK;
}
}
pp->pr_flags |= PR_GROWING;
if (flags & PR_WAITOK)
mutex_exit(&pp->pr_lock);
else
pp->pr_flags |= PR_GROWINGNOWAIT;
storage = pool_allocator_alloc(pp, flags);
if (__predict_false(storage == NULL))
goto out;
ph = pool_alloc_item_header(pp, storage, flags);
if (__predict_false(ph == NULL)) {
pool_allocator_free(pp, storage);
goto out;
}
if (flags & PR_WAITOK)
mutex_enter(&pp->pr_lock);
pool_prime_page(pp, storage, ph);
pp->pr_npagealloc++;
KASSERT(pp->pr_flags & PR_GROWING);
pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
/*
* If anyone was waiting for pool_grow, notify them that we
* may have just done it.
*/
cv_broadcast(&pp->pr_cv);
return 0;
out:
if (flags & PR_WAITOK)
mutex_enter(&pp->pr_lock);
KASSERT(pp->pr_flags & PR_GROWING);
pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
return ENOMEM;
}
void
pool_prime(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_minpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
if (pp->pr_maxpages <= pp->pr_minpages)
pp->pr_maxpages = pp->pr_minpages + 1; /* XXX */
while (pp->pr_npages < pp->pr_minpages)
(void) pool_grow(pp, PR_WAITOK);
mutex_exit(&pp->pr_lock);
}
/*
* Add a page worth of items to the pool.
*
* Note, we must be called with the pool descriptor LOCKED.
*/
static void
pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph)
{
const unsigned int align = pp->pr_align;
struct pool_item *pi;
void *cp = storage;
int n;
KASSERT(mutex_owned(&pp->pr_lock));
KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) ||
(((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)),
"%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp);
/*
* Insert page header.
*/
LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
LIST_INIT(&ph->ph_itemlist);
ph->ph_page = storage;
ph->ph_nmissing = 0;
ph->ph_time = time_uptime;
if (pp->pr_roflags & PR_PHINPAGE)
ph->ph_poolid = pp->pr_poolid;
else
SPLAY_INSERT(phtree, &pp->pr_phtree, ph);
pp->pr_nidle++;
/*
* The item space starts after the on-page header, if any.
*/
ph->ph_off = pp->pr_itemoffset;
/*
* Color this page.
*/
ph->ph_off += pp->pr_curcolor;
cp = (char *)cp + ph->ph_off;
if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
pp->pr_curcolor = 0;
KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
/*
* Insert remaining chunks on the bucket list.
*/
n = pp->pr_itemsperpage;
pp->pr_nitems += n;
if (pp->pr_roflags & PR_USEBMAP) {
pr_item_bitmap_init(pp, ph);
} else {
while (n--) {
pi = (struct pool_item *)cp;
KASSERT((((vaddr_t)pi) & (align - 1)) == 0);
/* Insert on page list */
LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
#ifdef POOL_CHECK_MAGIC
pi->pi_magic = PI_MAGIC;
#endif
cp = (char *)cp + pp->pr_size;
KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
}
}
/*
* If the pool was depleted, point at the new page.
*/
if (pp->pr_curpage == NULL)
pp->pr_curpage = ph;
if (++pp->pr_npages > pp->pr_hiwat)
pp->pr_hiwat = pp->pr_npages;
}
/*
* Used by pool_get() when nitems drops below the low water mark. This
* is used to catch up pr_nitems with the low water mark.
*
* Note 1: we never wait for memory here; we let the caller decide what to do.
*
* Note 2: we must be called with the pool already locked, and we return
* with it locked.
*/
static int
pool_catchup(struct pool *pp)
{
int error = 0;
while (POOL_NEEDS_CATCHUP(pp)) {
error = pool_grow(pp, PR_NOWAIT);
if (error) {
if (error == ERESTART)
continue;
break;
}
}
return error;
}
static void
pool_update_curpage(struct pool *pp)
{
pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
if (pp->pr_curpage == NULL) {
pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
}
KASSERTMSG((pp->pr_curpage == NULL) == (pp->pr_nitems == 0),
"pp=%p curpage=%p nitems=%u", pp, pp->pr_curpage, pp->pr_nitems);
}
void
pool_setlowat(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_minitems = n;
/* Make sure we're caught up with the newly-set low water mark. */
if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
/*
* XXX: Should we log a warning? Should we set up a timeout
* to try again in a second or so? The latter could break
* a caller's assumptions about interrupt protection, etc.
*/
}
mutex_exit(&pp->pr_lock);
}
void
pool_sethiwat(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_maxitems = n;
mutex_exit(&pp->pr_lock);
}
void
pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap)
{
mutex_enter(&pp->pr_lock);
pp->pr_hardlimit = n;
pp->pr_hardlimit_warning = warnmess;
pp->pr_hardlimit_ratecap.tv_sec = ratecap;
pp->pr_hardlimit_warning_last.tv_sec = 0;
pp->pr_hardlimit_warning_last.tv_usec = 0;
pp->pr_maxpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
mutex_exit(&pp->pr_lock);
}
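/*
 * Example (illustrative sketch only): a subsystem that wants its pool
 * pre-populated and bounded might combine the knobs above right after
 * pool_init(); the numbers and the warning string are hypothetical.
 *
 *	pool_init(&example_pool, sizeof(struct example), 0, 0, 0,
 *	    "examplepl", NULL, IPL_NONE);
 *	pool_prime(&example_pool, 32);
 *	pool_setlowat(&example_pool, 16);
 *	pool_sethardlimit(&example_pool, 1024,
 *	    "WARNING: example_pool limit reached", 60);
 */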
unsigned int
pool_nget(struct pool *pp)
{
return pp->pr_nget;
}
unsigned int
pool_nput(struct pool *pp)
{
return pp->pr_nput;
}
/*
* Release all complete pages that have not been used recently.
*
* Must not be called from interrupt context.
*/
int
pool_reclaim(struct pool *pp)
{
struct pool_item_header *ph, *phnext;
struct pool_pagelist pq;
struct pool_cache *pc;
uint32_t curtime;
bool klock;
int rv;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (pp->pr_drain_hook != NULL) {
/*
* The drain hook must be called with the pool unlocked.
*/
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT);
}
/*
* XXXSMP Because we do not want to cause non-MPSAFE code
* to block.
*/
if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK ||
pp->pr_ipl == IPL_SOFTSERIAL) {
KERNEL_LOCK(1, NULL);
klock = true;
} else
klock = false;
/* Reclaim items from the pool's cache (if any). */
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL)
pool_cache_invalidate(pc);
if (mutex_tryenter(&pp->pr_lock) == 0) {
if (klock) {
KERNEL_UNLOCK_ONE(NULL);
}
return 0;
}
LIST_INIT(&pq);
curtime = time_uptime;
for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
phnext = LIST_NEXT(ph, ph_pagelist);
/* Check our minimum page claim */
if (pp->pr_npages <= pp->pr_minpages)
break;
KASSERT(ph->ph_nmissing == 0);
if (curtime - ph->ph_time < pool_inactive_time)
continue;
/*
* If freeing this page would put us below the minimum free items
* or the minimum pages, stop now.
*/
if (pp->pr_nitems - pp->pr_itemsperpage < pp->pr_minitems ||
pp->pr_npages - 1 < pp->pr_minpages)
break;
pr_rmpage(pp, ph, &pq);
}
mutex_exit(&pp->pr_lock);
if (LIST_EMPTY(&pq))
rv = 0;
else {
pr_pagelist_free(pp, &pq);
rv = 1;
}
if (klock) {
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
/*
* Drain pools, one at a time. The drained pool is returned within ppp.
*
* Note, must never be called from interrupt context.
*/
bool
pool_drain(struct pool **ppp)
{
bool reclaimed;
struct pool *pp;
KASSERT(!TAILQ_EMPTY(&pool_head));
pp = NULL;
/* Find next pool to drain, and add a reference. */
mutex_enter(&pool_head_lock);
do {
if (drainpp == NULL) {
drainpp = TAILQ_FIRST(&pool_head);
}
if (drainpp != NULL) {
pp = drainpp;
drainpp = TAILQ_NEXT(pp, pr_poollist);
}
/*
* Skip completely idle pools. We depend on at least
* one pool in the system being active.
*/
} while (pp == NULL || pp->pr_npages == 0);
pp->pr_refcnt++;
mutex_exit(&pool_head_lock);
/* Drain the cache (if any) and the pool. */
reclaimed = pool_reclaim(pp);
/* Finally, unlock the pool. */
mutex_enter(&pool_head_lock);
pp->pr_refcnt--;
cv_broadcast(&pool_busy);
mutex_exit(&pool_head_lock);
if (ppp != NULL)
*ppp = pp;
return reclaimed;
}
/*
* Calculate the total number of pages consumed by pools.
*/
int
pool_totalpages(void)
{
mutex_enter(&pool_head_lock);
int pages = pool_totalpages_locked();
mutex_exit(&pool_head_lock);
return pages;
}
int
pool_totalpages_locked(void)
{
struct pool *pp;
uint64_t total = 0;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
uint64_t bytes =
(uint64_t)pp->pr_npages * pp->pr_alloc->pa_pagesz;
if ((pp->pr_roflags & PR_RECURSIVE) != 0)
bytes -= ((uint64_t)pp->pr_nout * pp->pr_size);
total += bytes;
}
return atop(total);
}
/*
* Diagnostic helpers.
*/
void
pool_printall(const char *modif, void (*pr)(const char *, ...))
{
struct pool *pp;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
pool_printit(pp, modif, pr);
}
}
void
pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
{
if (pp == NULL) {
(*pr)("Must specify a pool to print.\n");
return;
}
pool_print1(pp, modif, pr);
}
static void
pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl,
void (*pr)(const char *, ...))
{
struct pool_item_header *ph;
LIST_FOREACH(ph, pl, ph_pagelist) {
(*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n",
ph->ph_page, ph->ph_nmissing, ph->ph_time);
#ifdef POOL_CHECK_MAGIC
struct pool_item *pi;
if (!(pp->pr_roflags & PR_USEBMAP)) {
LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
if (pi->pi_magic != PI_MAGIC) {
(*pr)("\t\t\titem %p, magic 0x%x\n",
pi, pi->pi_magic);
}
}
}
#endif
}
}
static void
pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
{
struct pool_item_header *ph;
pool_cache_t pc;
pcg_t *pcg;
pool_cache_cpu_t *cc;
uint64_t cpuhit, cpumiss, pchit, pcmiss;
uint32_t nfull;
int i;
bool print_log = false, print_pagelist = false, print_cache = false;
bool print_short = false, skip_empty = false;
char c;
while ((c = *modif++) != '\0') {
if (c == 'l')
print_log = true;
if (c == 'p')
print_pagelist = true;
if (c == 'c')
print_cache = true;
if (c == 's')
print_short = true;
if (c == 'S')
skip_empty = true;
}
if (skip_empty && pp->pr_nget == 0)
return;
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
(*pr)("POOLCACHE");
} else {
(*pr)("POOL");
}
/* Single line output. */
if (print_short) {
(*pr)(" %s:%p:%u:%u:%u:%u:%u:%u:%u:%u:%u:%u\n",
pp->pr_wchan, pp, pp->pr_size, pp->pr_align, pp->pr_npages,
pp->pr_nitems, pp->pr_nout, pp->pr_nget, pp->pr_nput,
pp->pr_npagealloc, pp->pr_npagefree, pp->pr_nidle);
return;
}
(*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
pp->pr_roflags);
(*pr)("\tpool %p, alloc %p\n", pp, pp->pr_alloc);
(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
(*pr)("\tnget %lu, nfail %lu, nput %lu\n",
pp->pr_nget, pp->pr_nfail, pp->pr_nput);
(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
if (!print_pagelist)
goto skip_pagelist;
if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
(*pr)("\n\tempty page list:\n");
pool_print_pagelist(pp, &pp->pr_emptypages, pr);
if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
(*pr)("\n\tfull page list:\n");
pool_print_pagelist(pp, &pp->pr_fullpages, pr);
if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
(*pr)("\n\tpartial-page list:\n");
pool_print_pagelist(pp, &pp->pr_partpages, pr);
if (pp->pr_curpage == NULL)
(*pr)("\tno current page\n");
else
(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
skip_pagelist:
if (print_log)
goto skip_log;
(*pr)("\n");
skip_log:
#define PR_GROUPLIST(pcg) \
(*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail); \
for (i = 0; i < pcg->pcg_size; i++) { \
if (pcg->pcg_objects[i].pcgo_pa != \
POOL_PADDR_INVALID) { \
(*pr)("\t\t\t%p, 0x%llx\n", \
pcg->pcg_objects[i].pcgo_va, \
(unsigned long long) \
pcg->pcg_objects[i].pcgo_pa); \
} else { \
(*pr)("\t\t\t%p\n", \
pcg->pcg_objects[i].pcgo_va); \
} \
}
if (pc != NULL) {
cpuhit = 0;
cpumiss = 0;
pcmiss = 0;
nfull = 0;
for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
if ((cc = pc->pc_cpus[i]) == NULL)
continue;
cpuhit += cc->cc_hits;
cpumiss += cc->cc_misses;
pcmiss += cc->cc_pcmisses;
nfull += cc->cc_nfull;
}
pchit = cpumiss - pcmiss;
(*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss);
(*pr)("\tcache layer hits %llu misses %llu\n", pchit, pcmiss);
(*pr)("\tcache layer full groups %u\n", nfull);
if (print_cache) {
(*pr)("\tfull cache groups:\n");
for (pcg = pc->pc_fullgroups; pcg != NULL;
pcg = pcg->pcg_next) {
PR_GROUPLIST(pcg);
}
}
}
#undef PR_GROUPLIST
}
static int
pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
{
struct pool_item *pi;
void *page;
int n;
if ((pp->pr_roflags & PR_NOALIGN) == 0) {
page = POOL_OBJ_TO_PAGE(pp, ph);
if (page != ph->ph_page &&
(pp->pr_roflags & PR_PHINPAGE) != 0) {
if (label != NULL)
printf("%s: ", label);
printf("pool(%p:%s): page inconsistency: page %p;"
" at page head addr %p (p %p)\n", pp,
pp->pr_wchan, ph->ph_page,
ph, page);
return 1;
}
}
if ((pp->pr_roflags & PR_USEBMAP) != 0)
return 0;
for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0;
pi != NULL;
pi = LIST_NEXT(pi,pi_list), n++) {
#ifdef POOL_CHECK_MAGIC
if (pi->pi_magic != PI_MAGIC) {
if (label != NULL)
printf("%s: ", label);
printf("pool(%s): free list modified: magic=%x;"
" page %p; item ordinal %d; addr %p\n",
pp->pr_wchan, pi->pi_magic, ph->ph_page,
n, pi);
panic("pool");
}
#endif
if ((pp->pr_roflags & PR_NOALIGN) != 0) {
continue;
}
page = POOL_OBJ_TO_PAGE(pp, pi);
if (page == ph->ph_page)
continue;
if (label != NULL)
printf("%s: ", label);
printf("pool(%p:%s): page inconsistency: page %p;"
" item ordinal %d; addr %p (p %p)\n", pp,
pp->pr_wchan, ph->ph_page,
n, pi, page);
return 1;
}
return 0;
}
int
pool_chk(struct pool *pp, const char *label)
{
struct pool_item_header *ph;
int r = 0;
mutex_enter(&pp->pr_lock);
LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
out:
mutex_exit(&pp->pr_lock);
return r;
}
/*
* pool_cache_init:
*
* Initialize a pool cache.
*/
pool_cache_t
pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags,
const char *wchan, struct pool_allocator *palloc, int ipl,
int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg)
{
pool_cache_t pc;
pc = pool_get(&cache_pool, PR_WAITOK);
if (pc == NULL)
return NULL;
pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan,
palloc, ipl, ctor, dtor, arg);
return pc;
}
/*
* pool_cache_bootstrap:
*
* Kernel-private version of pool_cache_init(). The caller
* provides initial storage.
*/
void
pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align,
u_int align_offset, u_int flags, const char *wchan,
struct pool_allocator *palloc, int ipl,
int (*ctor)(void *, void *, int), void (*dtor)(void *, void *),
void *arg)
{
CPU_INFO_ITERATOR cii;
pool_cache_t pc1;
struct cpu_info *ci;
struct pool *pp;
unsigned int ppflags;
pp = &pc->pc_pool;
if (palloc == NULL && ipl == IPL_NONE) {
if (size > PAGE_SIZE) {
int bigidx = pool_bigidx(size);
palloc = &pool_allocator_big[bigidx];
flags |= PR_NOALIGN;
} else
palloc = &pool_allocator_nointr;
}
ppflags = flags;
if (ctor == NULL) {
ctor = NO_CTOR;
}
if (dtor == NULL) {
dtor = NO_DTOR;
} else {
/*
* If we have a destructor, then the pool layer does not
* need to worry about PR_PSERIALIZE.
*/
ppflags &= ~PR_PSERIALIZE;
}
pool_init(pp, size, align, align_offset, ppflags, wchan, palloc, ipl);
pc->pc_fullgroups = NULL;
pc->pc_partgroups = NULL;
pc->pc_ctor = ctor;
pc->pc_dtor = dtor;
pc->pc_arg = arg;
pc->pc_refcnt = 0;
pc->pc_roflags = flags;
pc->pc_freecheck = NULL;
if ((flags & PR_LARGECACHE) != 0) {
pc->pc_pcgsize = PCG_NOBJECTS_LARGE;
pc->pc_pcgpool = &pcg_large_pool;
pc->pc_pcgcache = &pcg_large_cache;
} else {
pc->pc_pcgsize = PCG_NOBJECTS_NORMAL;
pc->pc_pcgpool = &pcg_normal_pool;
pc->pc_pcgcache = &pcg_normal_cache;
}
/* Allocate per-CPU caches. */
memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus));
pc->pc_ncpu = 0;
if (ncpu < 2) {
/* XXX For sparc: boot CPU is not attached yet. */
pool_cache_cpu_init1(curcpu(), pc);
} else {
for (CPU_INFO_FOREACH(cii, ci)) {
pool_cache_cpu_init1(ci, pc);
}
}
/* Add to list of all pools. */
if (__predict_true(!cold))
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) {
if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0)
break;
}
if (pc1 == NULL)
TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist);
else
TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist);
if (__predict_true(!cold))
mutex_exit(&pool_head_lock);
atomic_store_release(&pp->pr_cache, pc);
}
/*
* pool_cache_destroy:
*
* Destroy a pool cache.
*/
void
pool_cache_destroy(pool_cache_t pc)
{
pool_cache_bootstrap_destroy(pc);
pool_put(&cache_pool, pc);
}
/*
* pool_cache_bootstrap_destroy:
*
* Destroy a pool cache.
*/
void
pool_cache_bootstrap_destroy(pool_cache_t pc)
{
struct pool *pp = &pc->pc_pool;
u_int i;
/* Remove it from the global list. */
mutex_enter(&pool_head_lock);
while (pc->pc_refcnt != 0)
cv_wait(&pool_busy, &pool_head_lock);
TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist);
mutex_exit(&pool_head_lock);
/* First, invalidate the entire cache. */
pool_cache_invalidate(pc);
/* Disassociate it from the pool. */
mutex_enter(&pp->pr_lock);
atomic_store_relaxed(&pp->pr_cache, NULL);
mutex_exit(&pp->pr_lock);
/* Destroy per-CPU data */
for (i = 0; i < __arraycount(pc->pc_cpus); i++)
pool_cache_invalidate_cpu(pc, i);
/* Finally, destroy it. */
pool_destroy(pp);
}
/*
* pool_cache_cpu_init1:
*
* Called for each pool_cache whenever a new CPU is attached.
*/
static void
pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc)
{
pool_cache_cpu_t *cc;
int index;
index = ci->ci_index;
KASSERT(index < __arraycount(pc->pc_cpus));
if ((cc = pc->pc_cpus[index]) != NULL) {
return;
}
/*
* The first CPU is 'free'. This needs to be the case for
* bootstrap - we may not be able to allocate yet.
*/
if (pc->pc_ncpu == 0) {
cc = &pc->pc_cpu0;
pc->pc_ncpu = 1;
} else {
pc->pc_ncpu++;
cc = pool_get(&cache_cpu_pool, PR_WAITOK);
}
cc->cc_current = __UNCONST(&pcg_dummy);
cc->cc_previous = __UNCONST(&pcg_dummy);
cc->cc_pcgcache = pc->pc_pcgcache;
cc->cc_hits = 0;
cc->cc_misses = 0;
cc->cc_pcmisses = 0;
cc->cc_contended = 0;
cc->cc_nfull = 0;
cc->cc_npart = 0;
pc->pc_cpus[index] = cc;
}
/*
* pool_cache_cpu_init:
*
* Called whenever a new CPU is attached.
*/
void
pool_cache_cpu_init(struct cpu_info *ci)
{
pool_cache_t pc;
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) {
pc->pc_refcnt++;
mutex_exit(&pool_head_lock);
pool_cache_cpu_init1(ci, pc);
mutex_enter(&pool_head_lock);
pc->pc_refcnt--;
cv_broadcast(&pool_busy);
}
mutex_exit(&pool_head_lock);
}
/*
* pool_cache_reclaim:
*
* Reclaim memory from a pool cache.
*/
bool
pool_cache_reclaim(pool_cache_t pc)
{
return pool_reclaim(&pc->pc_pool);
}
static inline void
pool_cache_pre_destruct(pool_cache_t pc)
{
/*
* Perform a passive serialization barrier before destructing
* a batch of one or more objects.
*/
if (__predict_false(pc_has_pser(pc))) {
pool_barrier();
}
}
static void
pool_cache_destruct_object1(pool_cache_t pc, void *object)
{
(*pc->pc_dtor)(pc->pc_arg, object);
pool_put(&pc->pc_pool, object);
}
/*
* pool_cache_destruct_object:
*
* Force destruction of an object and its release back into
* the pool.
*/
void
pool_cache_destruct_object(pool_cache_t pc, void *object)
{
FREECHECK_IN(&pc->pc_freecheck, object);
pool_cache_pre_destruct(pc);
pool_cache_destruct_object1(pc, object);
}
/*
* pool_cache_invalidate_groups:
*
* Invalidate a chain of groups and destruct all objects. Return the
* number of groups that were invalidated.
*/
static int
pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg)
{
void *object;
pcg_t *next;
int i, n;
if (pcg == NULL) {
return 0;
}
pool_cache_pre_destruct(pc);
for (n = 0; pcg != NULL; pcg = next, n++) {
next = pcg->pcg_next;
for (i = 0; i < pcg->pcg_avail; i++) {
object = pcg->pcg_objects[i].pcgo_va;
pool_cache_destruct_object1(pc, object);
}
if (pcg->pcg_size == PCG_NOBJECTS_LARGE) {
pool_put(&pcg_large_pool, pcg);
} else {
KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL);
pool_put(&pcg_normal_pool, pcg);
}
}
return n;
}
/*
* pool_cache_invalidate:
*
* Invalidate a pool cache (destruct and release all of the
* cached objects). Does not reclaim objects from the pool.
*
* Note: For pool caches that provide constructed objects, there
* is an assumption that another level of synchronization is occurring
* between the input to the constructor and the cache invalidation.
*
* Invalidation is a costly process and should not be called from
* interrupt context.
*/
void
pool_cache_invalidate(pool_cache_t pc)
{
uint64_t where;
pcg_t *pcg;
int n, s;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (ncpu < 2 || !mp_online) {
/*
* We might be called early enough in the boot process
* for the CPU data structures to not be fully initialized.
* In this case, transfer the content of the local CPU's
* cache back into global cache as only this CPU is currently
* running.
*/
pool_cache_transfer(pc);
} else {
/*
* Signal all CPUs that they must transfer their local
* cache back to the global pool then wait for the xcall to
* complete.
*/
where = xc_broadcast(0,
__FPTRCAST(xcfunc_t, pool_cache_transfer), pc, NULL);
xc_wait(where);
}
/* Now dequeue and invalidate everything. */
pcg = pool_pcg_trunc(&pcg_normal_cache);
(void)pool_cache_invalidate_groups(pc, pcg);
pcg = pool_pcg_trunc(&pcg_large_cache);
(void)pool_cache_invalidate_groups(pc, pcg);
pcg = pool_pcg_trunc(&pc->pc_fullgroups);
n = pool_cache_invalidate_groups(pc, pcg);
s = splvm();
((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n;
splx(s);
pcg = pool_pcg_trunc(&pc->pc_partgroups);
n = pool_cache_invalidate_groups(pc, pcg);
s = splvm();
((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_npart -= n;
splx(s);
}
/*
* pool_cache_invalidate_cpu:
*
* Invalidate all CPU-bound cached objects in pool cache, the CPU being
* identified by its associated index.
* It is the caller's responsibility to ensure that no operation is
* taking place on this pool cache while doing this invalidation.
* WARNING: as no inter-CPU locking is enforced, trying to invalidate
* pool cached objects from a CPU different from the one currently running
* may result in undefined behaviour.
*/
static void
pool_cache_invalidate_cpu(pool_cache_t pc, u_int index)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
if ((cc = pc->pc_cpus[index]) == NULL)
return;
if ((pcg = cc->cc_current) != &pcg_dummy) {
pcg->pcg_next = NULL;
pool_cache_invalidate_groups(pc, pcg);
}
if ((pcg = cc->cc_previous) != &pcg_dummy) {
pcg->pcg_next = NULL;
pool_cache_invalidate_groups(pc, pcg);
}
if (cc != &pc->pc_cpu0)
pool_put(&cache_cpu_pool, cc);
}
void
pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg)
{
pool_set_drain_hook(&pc->pc_pool, fn, arg);
}
void
pool_cache_setlowat(pool_cache_t pc, int n)
{
pool_setlowat(&pc->pc_pool, n);
}
void
pool_cache_sethiwat(pool_cache_t pc, int n)
{
pool_sethiwat(&pc->pc_pool, n);
}
void
pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap)
{
pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap);
}
void
pool_cache_prime(pool_cache_t pc, int n)
{
pool_prime(&pc->pc_pool, n);
}
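/*
 * pool_cache_nget, pool_cache_nput:
 *
 *	Return the cumulative get/put counters of the cache's backing pool.
 */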
unsigned int
pool_cache_nget(pool_cache_t pc)
{
return pool_nget(&pc->pc_pool);
}
unsigned int
pool_cache_nput(pool_cache_t pc)
{
return pool_nput(&pc->pc_pool);
}
/*
* pool_pcg_get:
*
* Get a cache group from the specified list. Return true if
* contention was encountered. Must be called at IPL_VM because
* of spin wait vs. kernel_lock.
*/
static int
pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp)
{
int count = SPINLOCK_BACKOFF_MIN;
pcg_t *o, *n;
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
if (__predict_false(o == NULL)) {
break;
}
/* Lock out concurrent get/put. */
n = atomic_cas_ptr(head, o, __UNCONST(&pcg_dummy));
if (o == n) {
/* Fetch pointer to next item and then unlock. */
membar_datadep_consumer(); /* alpha */
n = atomic_load_relaxed(&o->pcg_next);
atomic_store_release(head, n);
break;
}
}
*pcgp = o;
return count != SPINLOCK_BACKOFF_MIN;
}
/*
* pool_pcg_trunc:
*
* Chop out entire list of pool cache groups.
*/
static pcg_t *
pool_pcg_trunc(pcg_t *volatile *head)
{
int count = SPINLOCK_BACKOFF_MIN, s;
pcg_t *o, *n;
s = splvm();
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
n = atomic_cas_ptr(head, o, NULL);
if (o == n) {
splx(s);
membar_datadep_consumer(); /* alpha */
return o;
}
}
}
/*
* pool_pcg_put:
*
* Put a pool cache group to the specified list. Return true if
* contention was encountered. Must be called at IPL_VM because of
* spin wait vs. kernel_lock.
*/
static int
pool_pcg_put(pcg_t *volatile *head, pcg_t *pcg)
{
int count = SPINLOCK_BACKOFF_MIN;
pcg_t *o, *n;
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
pcg->pcg_next = o;
membar_release();
n = atomic_cas_ptr(head, o, pcg);
if (o == n) {
return count != SPINLOCK_BACKOFF_MIN;
}
}
}
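/*
 * pool_cache_get_slow:
 *
 *	Slow path for pool_cache_get_paddr(): both per-CPU groups are
 *	empty.  Try to install a full group from the global cache;
 *	failing that, fall back to pool_get() and run the constructor.
 *	Returns true if the caller should retry the per-CPU fast path,
 *	false once *objectp has been settled (object or NULL).
 */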
static bool __noinline
pool_cache_get_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s,
void **objectp, paddr_t *pap, int flags)
{
pcg_t *pcg, *cur;
void *object;
KASSERT(cc->cc_current->pcg_avail == 0);
KASSERT(cc->cc_previous->pcg_avail == 0);
cc->cc_misses++;
/*
* If there's a full group, release our empty group back to the
* cache. Install the full group as cc_current and return.
*/
cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg);
if (__predict_true(pcg != NULL)) {
KASSERT(pcg->pcg_avail == pcg->pcg_size);
if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) {
KASSERT(cur->pcg_avail == 0);
(void)pool_pcg_put(cc->cc_pcgcache, cur);
}
cc->cc_nfull--;
cc->cc_current = pcg;
return true;
}
/*
* Nothing available locally or in cache. Take the slow
* path: fetch a new object from the pool and construct
* it.
*/
cc->cc_pcmisses++;
splx(s);
object = pool_get(&pc->pc_pool, flags);
*objectp = object;
if (__predict_false(object == NULL)) {
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return false;
}
if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) {
pool_put(&pc->pc_pool, object);
*objectp = NULL;
return false;
}
KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0);
if (pap != NULL) {
#ifdef POOL_VTOPHYS
*pap = POOL_VTOPHYS(object);
#else
*pap = POOL_PADDR_INVALID;
#endif
}
FREECHECK_OUT(&pc->pc_freecheck, object);
return false;
}
/*
* pool_cache_get{,_paddr}:
*
* Get an object from a pool cache (optionally returning
* the physical address of the object).
*/
void *
pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
void *object;
int s;
KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
if (pc->pc_pool.pr_ipl == IPL_NONE &&
__predict_true(!cold) &&
__predict_true(panicstr == NULL)) {
KASSERTMSG(!cpu_intr_p(),
"%s: [%s] is IPL_NONE, but called from interrupt context",
__func__, pc->pc_pool.pr_wchan);
KASSERTMSG(!cpu_softintr_p(),
"%s: [%s] is IPL_NONE,"
" but called from soft interrupt context",
__func__, pc->pc_pool.pr_wchan);
}
if (flags & PR_WAITOK) {
ASSERT_SLEEPABLE();
}
if (flags & PR_NOWAIT) {
if (fault_inject())
return NULL;
}
/* Lock out interrupts and disable preemption. */
s = splvm();
while (/* CONSTCOND */ true) {
/* Try and allocate an object from the current group. */
cc = pc->pc_cpus[curcpu()->ci_index];
pcg = cc->cc_current;
if (__predict_true(pcg->pcg_avail > 0)) {
object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va;
if (__predict_false(pap != NULL))
*pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa;
#if defined(DIAGNOSTIC)
pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL;
KASSERT(pcg->pcg_avail < pcg->pcg_size);
KASSERT(object != NULL);
#endif
cc->cc_hits++;
splx(s);
FREECHECK_OUT(&pc->pc_freecheck, object);
pool_redzone_fill(&pc->pc_pool, object);
pool_cache_get_kmsan(pc, object);
return object;
}
/*
* That failed. If the previous group isn't empty, swap
* it with the current group and allocate from there.
*/
pcg = cc->cc_previous;
if (__predict_true(pcg->pcg_avail > 0)) {
cc->cc_previous = cc->cc_current;
cc->cc_current = pcg;
continue;
}
/*
* Can't allocate from either group: try the slow path.
* If get_slow() allocated an object for us, or if
* no more objects are available, it will return false.
* Otherwise, we need to retry.
*/
if (!pool_cache_get_slow(pc, cc, s, &object, pap, flags)) {
if (object != NULL) {
kmsan_orig(object, pc->pc_pool.pr_size,
KMSAN_TYPE_POOL, __RET_ADDR);
}
break;
}
}
/*
* We would like to KASSERT(object || (flags & PR_NOWAIT)), but
* pool_cache_get can fail even in the PR_WAITOK case, if the
* constructor fails.
*/
return object;
}
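/*
 * pool_cache_put_slow:
 *
 *	Slow path for pool_cache_put_paddr(): both per-CPU groups are
 *	full.  Try to install an empty group (from the group cache, or
 *	freshly allocated); failing that, destruct the object right away.
 *	Returns true if the caller should retry the per-CPU fast path.
 */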
static bool __noinline
pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object)
{
pcg_t *pcg, *cur;
KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size);
KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size);
cc->cc_misses++;
/*
* Try to get an empty group from the cache. If there are no empty
* groups in the cache then allocate one.
*/
(void)pool_pcg_get(cc->cc_pcgcache, &pcg);
if (__predict_false(pcg == NULL)) {
if (__predict_true(!pool_cache_disable)) {
pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT);
}
if (__predict_true(pcg != NULL)) {
pcg->pcg_avail = 0;
pcg->pcg_size = pc->pc_pcgsize;
}
}
/*
* If there's an empty group, release our full group back to the
* cache. Install the empty group to the local CPU and return.
*/
if (pcg != NULL) {
KASSERT(pcg->pcg_avail == 0);
if (__predict_false(cc->cc_previous == &pcg_dummy)) {
cc->cc_previous = pcg;
} else {
cur = cc->cc_current;
if (__predict_true(cur != &pcg_dummy)) {
KASSERT(cur->pcg_avail == cur->pcg_size);
cc->cc_contended +=
pool_pcg_put(&pc->pc_fullgroups, cur);
cc->cc_nfull++;
}
cc->cc_current = pcg;
}
return true;
}
/*
* Nothing available locally or in cache, and we didn't
* allocate an empty group. Take the slow path and destroy
* the object here and now.
*/
cc->cc_pcmisses++;
splx(s);
pool_cache_destruct_object(pc, object);
return false;
}
/*
* pool_cache_put{,_paddr}:
*
* Put an object back to the pool cache (optionally caching the
* physical address of the object).
*/
void
pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
int s;
KASSERT(object != NULL);
pool_cache_put_kmsan(pc, object);
pool_cache_redzone_check(pc, object);
FREECHECK_IN(&pc->pc_freecheck, object);
if (pc->pc_pool.pr_roflags & PR_PHINPAGE) {
pc_phinpage_check(pc, object);
}
if (pool_cache_put_nocache(pc, object)) {
return;
}
/* Lock out interrupts and disable preemption. */
s = splvm();
while (/* CONSTCOND */ true) {
/* If the current group isn't full, release it there. */
cc = pc->pc_cpus[curcpu()->ci_index];
pcg = cc->cc_current;
if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object;
pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa;
pcg->pcg_avail++;
cc->cc_hits++;
splx(s);
return;
}
/*
* That failed. If the previous group isn't full, swap
* it with the current group and try again.
*/
pcg = cc->cc_previous;
if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
cc->cc_previous = cc->cc_current;
cc->cc_current = pcg;
continue;
}
/*
* Can't free to either group: try the slow path.
* If put_slow() releases the object for us, it
* will return false. Otherwise we need to retry.
*/
if (!pool_cache_put_slow(pc, cc, s, object))
break;
}
}
/*
* pool_cache_transfer:
*
* Transfer objects from the per-CPU cache to the global cache.
* Run within a cross-call thread.
*/
static void
pool_cache_transfer(pool_cache_t pc)
{
pool_cache_cpu_t *cc;
pcg_t *prev, *cur;
int s;
s = splvm();
cc = pc->pc_cpus[curcpu()->ci_index];
cur = cc->cc_current;
cc->cc_current = __UNCONST(&pcg_dummy);
prev = cc->cc_previous;
cc->cc_previous = __UNCONST(&pcg_dummy);
if (cur != &pcg_dummy) {
if (cur->pcg_avail == cur->pcg_size) {
(void)pool_pcg_put(&pc->pc_fullgroups, cur);
cc->cc_nfull++;
} else if (cur->pcg_avail == 0) {
(void)pool_pcg_put(pc->pc_pcgcache, cur);
} else {
(void)pool_pcg_put(&pc->pc_partgroups, cur);
cc->cc_npart++;
}
}
if (prev != &pcg_dummy) {
if (prev->pcg_avail == prev->pcg_size) {
(void)pool_pcg_put(&pc->pc_fullgroups, prev);
cc->cc_nfull++;
} else if (prev->pcg_avail == 0) {
(void)pool_pcg_put(pc->pc_pcgcache, prev);
} else {
(void)pool_pcg_put(&pc->pc_partgroups, prev);
cc->cc_npart++;
}
}
splx(s);
}
static int
pool_bigidx(size_t size)
{
int i;
for (i = 0; i < __arraycount(pool_allocator_big); i++) {
if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size)
return i;
}
panic("pool item size %zu too large, use a custom allocator", size);
}
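/*
 * pool_allocator_alloc:
 *
 *	Allocate a page from the pool's backing allocator.  In the
 *	PR_NOWAIT case a failed allocation runs the drain hook once and
 *	retries; in the PR_WAITOK case draining is left to pool_reclaim().
 */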
static void *
pool_allocator_alloc(struct pool *pp, int flags)
{
struct pool_allocator *pa = pp->pr_alloc;
void *res;
res = (*pa->pa_alloc)(pp, flags);
if (res == NULL && (flags & PR_WAITOK) == 0) {
/*
* We only run the drain hook here if PR_NOWAIT.
* In other cases, the hook will be run in
* pool_reclaim().
*/
if (pp->pr_drain_hook != NULL) {
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
res = (*pa->pa_alloc)(pp, flags);
}
}
return res;
}
static void
pool_allocator_free(struct pool *pp, void *v)
{
struct pool_allocator *pa = pp->pr_alloc;
if (pp->pr_redzone) {
KASSERT(!pp_has_pser(pp));
kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0);
} else if (__predict_false(pp_has_pser(pp))) {
/*
* Perform a passive serialization barrier before freeing
* the pool page back to the system.
*/
pool_barrier();
}
(*pa->pa_free)(pp, v);
}
void *
pool_page_alloc(struct pool *pp, int flags)
{
const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
vmem_addr_t va;
int ret;
ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz,
vflags | VM_INSTANTFIT, &va);
return ret ? NULL : (void *)va;
}
void
pool_page_free(struct pool *pp, void *v)
{
uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
}
static void *
pool_page_alloc_meta(struct pool *pp, int flags)
{
const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
vmem_addr_t va;
int ret;
ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz,
vflags | VM_INSTANTFIT, &va);
return ret ? NULL : (void *)va;
}
static void
pool_page_free_meta(struct pool *pp, void *v)
{
vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz);
}
#ifdef KMSAN
static inline void
pool_get_kmsan(struct pool *pp, void *p)
{
kmsan_orig(p, pp->pr_size, KMSAN_TYPE_POOL, __RET_ADDR);
kmsan_mark(p, pp->pr_size, KMSAN_STATE_UNINIT);
}
static inline void
pool_put_kmsan(struct pool *pp, void *p)
{
kmsan_mark(p, pp->pr_size, KMSAN_STATE_INITED);
}
static inline void
pool_cache_get_kmsan(pool_cache_t pc, void *p)
{
if (__predict_false(pc_has_ctor(pc))) {
return;
}
pool_get_kmsan(&pc->pc_pool, p);
}
static inline void
pool_cache_put_kmsan(pool_cache_t pc, void *p)
{
pool_put_kmsan(&pc->pc_pool, p);
}
#endif
#ifdef POOL_QUARANTINE
static void
pool_quarantine_init(struct pool *pp)
{
pp->pr_quar.rotor = 0;
memset(&pp->pr_quar, 0, sizeof(pp->pr_quar));
}
static void
pool_quarantine_flush(struct pool *pp)
{
pool_quar_t *quar = &pp->pr_quar;
struct pool_pagelist pq;
size_t i;
LIST_INIT(&pq);
mutex_enter(&pp->pr_lock);
for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) {
if (quar->list[i] == 0)
continue;
pool_do_put(pp, (void *)quar->list[i], &pq);
}
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
}
static bool
pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq)
{
pool_quar_t *quar = &pp->pr_quar;
uintptr_t old;
if (pp->pr_roflags & PR_NOTOUCH) {
return false;
}
pool_redzone_check(pp, v);
old = quar->list[quar->rotor];
quar->list[quar->rotor] = (uintptr_t)v;
quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH;
if (old != 0) {
pool_do_put(pp, (void *)old, pq);
}
return true;
}
#endif
#ifdef POOL_NOCACHE
static bool
pool_cache_put_nocache(pool_cache_t pc, void *p)
{
pool_cache_destruct_object(pc, p);
return true;
}
#endif
#ifdef POOL_REDZONE
#if defined(_LP64)
# define PRIME 0x9e37fffffffc0000UL
#else /* defined(_LP64) */
# define PRIME 0x9e3779b1
#endif /* defined(_LP64) */
#define STATIC_BYTE 0xFE
CTASSERT(POOL_REDZONE_SIZE > 1);
#ifndef KASAN
static inline uint8_t
pool_pattern_generate(const void *p)
{
return (uint8_t)(((uintptr_t)p) * PRIME
>> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT);
}
#endif
static void
pool_redzone_init(struct pool *pp, size_t requested_size)
{
size_t redzsz;
size_t nsz;
#ifdef KASAN
redzsz = requested_size;
kasan_add_redzone(&redzsz);
redzsz -= requested_size;
#else
redzsz = POOL_REDZONE_SIZE;
#endif
if (pp->pr_roflags & PR_NOTOUCH) {
pp->pr_redzone = false;
return;
}
/*
* We may have extended the requested size earlier; check if
* there's naturally space in the padding for a red zone.
*/
if (pp->pr_size - requested_size >= redzsz) {
pp->pr_reqsize_with_redzone = requested_size + redzsz;
pp->pr_redzone = true;
return;
}
/*
* No space in the natural padding; check if we can extend the
* pool's item size a bit.
* bit the size of the pool.
*
* Avoid using redzone for allocations half of a page or larger.
* For pagesize items, we'd waste a whole new page (could be
* unmapped?), and for half pagesize items, approximately half
* the space is lost (eg, 4K pages, you get one 2K allocation.)
*/
nsz = roundup(pp->pr_size + redzsz, pp->pr_align);
if (nsz <= (pp->pr_alloc->pa_pagesz / 2)) {
/* Ok, we can */
pp->pr_size = nsz;
pp->pr_reqsize_with_redzone = requested_size + redzsz;
pp->pr_redzone = true;
} else {
/* No space for a red zone... snif :'( */
pp->pr_redzone = false;
aprint_debug("pool redzone disabled for '%s'\n", pp->pr_wchan);
}
}
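/*
 * Worked example (figures illustrative only, non-KASAN case): with 4 KiB
 * pages and a red zone of a few bytes, an item requested at 120 bytes but
 * already rounded up to pr_size == 128 has 8 bytes of natural padding, so
 * the red zone fits without growing the item.  A 128-byte request with
 * pr_size == 128 has no padding, so the item grows by redzsz and is
 * rounded to the pool's alignment -- still well under half a page, so the
 * red zone is kept.  An item of half a page or more would roughly double
 * its per-item footprint, so the red zone is disabled for it instead.
 */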
static void
pool_redzone_fill(struct pool *pp, void *p)
{
if (!pp->pr_redzone)
return;
KASSERT(!pp_has_pser(pp));
#ifdef KASAN
kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone,
KASAN_POOL_REDZONE);
#else
uint8_t *cp, pat;
const uint8_t *ep;
cp = (uint8_t *)p + pp->pr_reqsize;
ep = cp + POOL_REDZONE_SIZE;
/*
* We really don't want the first byte of the red zone to be '\0';
* an off-by-one in a string may not be properly detected.
*/
pat = pool_pattern_generate(cp);
*cp = (pat == '\0') ? STATIC_BYTE: pat;
cp++;
while (cp < ep) {
*cp = pool_pattern_generate(cp);
cp++;
}
#endif
}
static void
pool_redzone_check(struct pool *pp, void *p)
{
if (!pp->pr_redzone)
return;
KASSERT(!pp_has_pser(pp));
#ifdef KASAN
kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED);
#else
uint8_t *cp, pat, expected;
const uint8_t *ep;
cp = (uint8_t *)p + pp->pr_reqsize;
ep = cp + POOL_REDZONE_SIZE;
pat = pool_pattern_generate(cp);
expected = (pat == '\0') ? STATIC_BYTE: pat;
if (__predict_false(*cp != expected)) {
panic("%s: [%s] 0x%02x != 0x%02x", __func__,
pp->pr_wchan, *cp, expected);
}
cp++;
while (cp < ep) {
expected = pool_pattern_generate(cp);
if (__predict_false(*cp != expected)) {
panic("%s: [%s] 0x%02x != 0x%02x", __func__,
pp->pr_wchan, *cp, expected);
}
cp++;
}
#endif
}
static void
pool_cache_redzone_check(pool_cache_t pc, void *p)
{
#ifdef KASAN
/*
* If there is a ctor/dtor, or if the cache objects use
* passive serialization, leave the data as valid.
*/
if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc) ||
pc_has_pser(pc))) {
return;
}
#endif
pool_redzone_check(&pc->pc_pool, p);
}
#endif /* POOL_REDZONE */
#if defined(DDB)
static bool
pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
{
return (uintptr_t)ph->ph_page <= addr &&
addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz;
}
static bool
pool_in_item(struct pool *pp, void *item, uintptr_t addr)
{
return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size;
}
static bool
pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr)
{
int i;
if (pcg == NULL) {
return false;
}
for (i = 0; i < pcg->pcg_avail; i++) {
if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) {
return true;
}
}
return false;
}
static bool
pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
{
if ((pp->pr_roflags & PR_USEBMAP) != 0) {
unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr);
pool_item_bitmap_t *bitmap =
ph->ph_bitmap + (idx / BITMAP_SIZE);
pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);
return (*bitmap & mask) == 0;
} else {
struct pool_item *pi;
LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
if (pool_in_item(pp, pi, addr)) {
return false;
}
}
return true;
}
}
void
pool_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
struct pool *pp;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
struct pool_item_header *ph;
struct pool_cache *pc;
uintptr_t item;
bool allocated = true;
bool incache = false;
bool incpucache = false;
char cpucachestr[32];
if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
goto found;
}
}
LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
allocated =
pool_allocated(pp, ph, addr);
goto found;
}
}
LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
allocated = false;
goto found;
}
}
continue;
} else {
ph = pr_find_pagehead_noalign(pp, (void *)addr);
if (ph == NULL || !pool_in_page(pp, ph, addr)) {
continue;
}
allocated = pool_allocated(pp, ph, addr);
}
found:
if (allocated &&
(pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
struct pool_cache_group *pcg;
int i;
for (pcg = pc->pc_fullgroups; pcg != NULL;
pcg = pcg->pcg_next) {
if (pool_in_cg(pp, pcg, addr)) {
incache = true;
goto print;
}
}
for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
pool_cache_cpu_t *cc;
if ((cc = pc->pc_cpus[i]) == NULL) {
continue;
}
if (pool_in_cg(pp, cc->cc_current, addr) ||
pool_in_cg(pp, cc->cc_previous, addr)) {
struct cpu_info *ci =
cpu_lookup(i);
incpucache = true;
snprintf(cpucachestr,
sizeof(cpucachestr),
"cached by CPU %u",
ci->ci_index);
goto print;
}
}
}
print:
item = (uintptr_t)ph->ph_page + ph->ph_off;
item = item + rounddown(addr - item, pp->pr_size);
(*pr)("%p is %p+%zu in POOL '%s' (%s)\n",
(void *)addr, item, (size_t)(addr - item),
pp->pr_wchan,
incpucache ? cpucachestr :
incache ? "cached" : allocated ? "allocated" : "free");
}
}
#endif /* defined(DDB) */
static int
pool_sysctl(SYSCTLFN_ARGS)
{
struct pool_sysctl data;
struct pool *pp;
struct pool_cache *pc;
pool_cache_cpu_t *cc;
int error;
size_t i, written;
if (oldp == NULL) {
*oldlenp = 0;
TAILQ_FOREACH(pp, &pool_head, pr_poollist)
*oldlenp += sizeof(data);
return 0;
}
memset(&data, 0, sizeof(data));
error = 0;
written = 0;
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
if (written + sizeof(data) > *oldlenp)
break;
pp->pr_refcnt++;
strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan));
data.pr_pagesize = pp->pr_alloc->pa_pagesz;
data.pr_flags = pp->pr_roflags | pp->pr_flags;
#define COPY(field) data.field = pp->field
COPY(pr_size);
COPY(pr_itemsperpage);
COPY(pr_nitems);
COPY(pr_nout);
COPY(pr_hardlimit);
COPY(pr_npages);
COPY(pr_minpages);
COPY(pr_maxpages);
COPY(pr_nget);
COPY(pr_nfail);
COPY(pr_nput);
COPY(pr_npagealloc);
COPY(pr_npagefree);
COPY(pr_hiwat);
COPY(pr_nidle);
#undef COPY
data.pr_cache_nmiss_pcpu = 0;
data.pr_cache_nhit_pcpu = 0;
data.pr_cache_nmiss_global = 0;
data.pr_cache_nempty = 0;
data.pr_cache_ncontended = 0;
data.pr_cache_npartial = 0;
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
uint32_t nfull = 0;
data.pr_cache_meta_size = pc->pc_pcgsize;
for (i = 0; i < pc->pc_ncpu; ++i) {
cc = pc->pc_cpus[i];
if (cc == NULL)
continue;
data.pr_cache_ncontended += cc->cc_contended;
data.pr_cache_nmiss_pcpu += cc->cc_misses;
data.pr_cache_nhit_pcpu += cc->cc_hits;
data.pr_cache_nmiss_global += cc->cc_pcmisses;
nfull += cc->cc_nfull; /* 32-bit rollover! */
data.pr_cache_npartial += cc->cc_npart;
}
data.pr_cache_nfull = nfull;
} else {
data.pr_cache_meta_size = 0;
data.pr_cache_nfull = 0;
}
data.pr_cache_nhit_global = data.pr_cache_nmiss_pcpu -
data.pr_cache_nmiss_global;
if (pp->pr_refcnt == UINT_MAX) /* XXX possible? */
continue;
mutex_exit(&pool_head_lock);
error = sysctl_copyout(l, &data, oldp, sizeof(data));
mutex_enter(&pool_head_lock);
if (--pp->pr_refcnt == 0)
cv_broadcast(&pool_busy);
if (error)
break;
written += sizeof(data);
oldp = (char *)oldp + sizeof(data);
}
mutex_exit(&pool_head_lock);
*oldlenp = written;
return error;
}
SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup")
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pool",
SYSCTL_DESCR("Get pool statistics"),
pool_sysctl, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
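/*
 * Illustrative sketch only (not compiled; header locations and field use
 * assumed): how a userland consumer could read the kern.pool node created
 * above.  It first probes with a NULL buffer, which makes pool_sysctl()
 * report only the total size, then reads the array of struct pool_sysctl
 * records.
 */
#if 0
#include <sys/sysctl.h>
#include <sys/pool.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static void
dump_pools(void)
{
	struct pool_sysctl *data;
	size_t len, i;

	/* First call: learn how much space the full report needs. */
	if (sysctlbyname("kern.pool", NULL, &len, NULL, 0) == -1)
		return;
	data = malloc(len);
	if (data == NULL)
		return;
	/* Second call: fetch one record per pool. */
	if (sysctlbyname("kern.pool", data, &len, NULL, 0) == 0) {
		for (i = 0; i < len / sizeof(*data); i++)
			printf("%s: %ju gets, %ju puts\n", data[i].pr_wchan,
			    (uintmax_t)data[i].pr_nget,
			    (uintmax_t)data[i].pr_nput);
	}
	free(data);
}
#endif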
/* $NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $ */
/*-
* Copyright (c) 2001, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Nathan J. Williams, and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Overview
*
* Lightweight processes (LWPs) are the basic unit or thread of
* execution within the kernel. The core state of an LWP is described
* by "struct lwp", also known as lwp_t.
*
* Each LWP is contained within a process (described by "struct proc").
* Every process contains at least one LWP, but may contain more. The
* process describes attributes shared among all of its LWPs such as a
* private address space, global execution state (stopped, active,
* zombie, ...), signal disposition and so on. On a multiprocessor
* machine, multiple LWPs may be executing concurrently in the kernel.
*
* Execution states
*
* At any given time, an LWP has overall state that is described by
* lwp::l_stat. The states are broken into two sets below. The first
* set is guaranteed to represent the absolute, current state of the
* LWP:
*
* LSONPROC
*
* On processor: the LWP is executing on a CPU, either in the
* kernel or in user space.
*
* LSRUN
*
* Runnable: the LWP is parked on a run queue, and may soon be
* chosen to run by an idle processor, or by a processor that
* has been asked to preempt a currently running but lower
* priority LWP.
*
* LSIDL
*
* Idle: the LWP has been created but has not yet executed, or
* it has ceased executing a unit of work and is waiting to be
* started again. This state exists so that the LWP can occupy
* a slot in the process & PID table, but without having to
* worry about being touched; lookups of the LWP by ID will
* fail while in this state. The LWP will become visible for
* lookup once its state transitions further. Some special
* kernel threads also (ab)use this state to indicate that they
* are idle (soft interrupts and idle LWPs).
*
* LSSUSPENDED:
*
* Suspended: the LWP has had its execution suspended by
* another LWP in the same process using the _lwp_suspend()
* system call. User-level LWPs also enter the suspended
* state when the system is shutting down.
*
* The second set represents a "statement of intent" on behalf of the
* LWP. The LWP may in fact be executing on a processor, or may be
* sleeping or idle. It is expected to take the necessary action to
* stop executing or become "running" again within a short timeframe.
* The LP_RUNNING flag in lwp::l_pflag indicates that an LWP is running.
* Importantly, it indicates that its state is tied to a CPU.
*
* LSZOMB:
*
* Dead or dying: the LWP has released most of its resources
* and is about to switch away into oblivion, or has already
* switched away. When it switches away, its few remaining
* resources can be collected.
*
* LSSLEEP:
*
* Sleeping: the LWP has entered itself onto a sleep queue, and
* has switched away or will switch away shortly to allow other
* LWPs to run on the CPU.
*
* LSSTOP:
*
* Stopped: the LWP has been stopped as a result of a job
* control signal, or as a result of the ptrace() interface.
*
* Stopped LWPs may run briefly within the kernel to handle
* signals that they receive, but will not return to user space
* until their process' state is changed away from stopped.
*
* Single LWPs within a process can not be set stopped
* selectively: all actions that can stop or continue LWPs
* occur at the process level.
*
* State transitions
*
* Note that the LSSTOP state may only be set when returning to
* user space in userret(), or when sleeping interruptibly. The
* LSSUSPENDED state may only be set in userret(). Before setting
* those states, we try to ensure that the LWPs will release all
* locks that they hold, and at a minimum try to ensure that the
* LWP can be set runnable again by a signal.
*
* LWPs may transition states in the following ways:
*
* RUN -------> ONPROC
*
* ONPROC ----> RUN
* > SLEEP
* > STOPPED
* > SUSPENDED
* > ZOMB
* > IDL (special cases)
*
* STOPPED ---> RUN
* > SLEEP
*
* SUSPENDED -> RUN
*
* SLEEP -----> ONPROC
* > RUN
* > STOPPED
*
* IDL -------> RUN
* > SUSPENDED
* > STOPPED
* > ONPROC (special cases)
*
* Some state transitions are only possible with kernel threads (eg
* ONPROC -> IDL) and happen under tightly controlled circumstances
* free of unwanted side effects.
*
* Migration
*
* Migration of threads from one CPU to another could be performed
* internally by the scheduler via sched_takecpu() or sched_catchlwp()
* functions. The universal lwp_migrate() function should be used for
* any other cases. Subsystems in the kernel must be aware that the
* CPU of an LWP may change while it is not locked.
*
* Locking
*
* The majority of fields in 'struct lwp' are covered by a single,
* general spin lock pointed to by lwp::l_mutex. The locks covering
* each field are documented in sys/lwp.h.
*
* State transitions must be made with the LWP's general lock held,
* and may cause the LWP's lock pointer to change. Manipulation of
* the general lock is not performed directly, but through calls to
* lwp_lock(), lwp_unlock() and others. It should be noted that the
* adaptive locks are not allowed to be released while the LWP's lock
* is being held (unlike for other spin-locks).
*
* States and their associated locks:
*
* LSIDL, LSONPROC, LSZOMB, LSSUSPENDED:
*
* Always covered by spc_lwplock, which protects LWPs not
* associated with any other sync object. This is a per-CPU
* lock and matches lwp::l_cpu.
*
* LSRUN:
*
* Always covered by spc_mutex, which protects the run queues.
* This is a per-CPU lock and matches lwp::l_cpu.
*
* LSSLEEP:
*
* Covered by a lock associated with the sleep queue (sometimes
* a turnstile sleep queue) that the LWP resides on. This can
* be spc_lwplock for SOBJ_SLEEPQ_NULL (an "untracked" sleep).
*
* LSSTOP:
*
* If the LWP was previously sleeping (l_wchan != NULL), then
* l_mutex references the sleep queue lock. If the LWP was
* runnable or on the CPU when halted, or has been removed from
* the sleep queue since halted, then the lock is spc_lwplock.
*
* The lock order is as follows:
*
* sleepq -> turnstile -> spc_lwplock -> spc_mutex
*
* Each process has a scheduler state lock (proc::p_lock), and a
* number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and
* so on. When an LWP is to be entered into or removed from one of the
* following states, p_lock must be held and the process wide counters
* adjusted:
*
* LSIDL, LSZOMB, LSSTOP, LSSUSPENDED
*
* (But not always for kernel threads. There are some special cases
* as mentioned above: soft interrupts, and the idle loops.)
*
* Note that an LWP is considered running or likely to run soon if in
* one of the following states. This affects the value of p_nrlwps:
*
* LSRUN, LSONPROC, LSSLEEP
*
* p_lock does not need to be held when transitioning among these
* three states, hence p_lock is rarely taken for state transitions.
*/
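/*
 * A minimal sketch (illustrative only) of the usual pattern for examining
 * or changing another LWP's state under the rules above; see lwp_suspend()
 * and lwp_unstop() below for real uses:
 *
 *	mutex_enter(p->p_lock);
 *	lwp_lock(l);
 *	... inspect or adjust l->l_stat, l->l_flag ...
 *	lwp_unlock(l);
 *	mutex_exit(p->p_lock);
 *
 * lwp_lock() takes whichever lock l_mutex currently points at; because
 * the pointer can change across state transitions, the general lock is
 * always manipulated through these wrappers.
 */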
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $");
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_dtrace.h"
#define _LWP_API_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cprng.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/futex.h>
#include <sys/intr.h>
#include <sys/kauth.h>
#include <sys/kcov.h>
#include <sys/kmem.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/msan.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/psref.h>
#include <sys/ptrace.h>
#include <sys/sdt.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uidinfo.h>
#include <sys/xcall.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
static pool_cache_t lwp_cache __read_mostly;
struct lwplist alllwp __cacheline_aligned;
static int lwp_ctor(void *, void *, int);
static void lwp_dtor(void *, void *);
/* DTrace proc provider probes */
SDT_PROVIDER_DEFINE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , lwp__create, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__start, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__exit, "struct lwp *");
struct turnstile turnstile0 __cacheline_aligned;
struct lwp lwp0 __aligned(MIN_LWP_ALIGNMENT) = {
#ifdef LWP0_CPU_INFO
.l_cpu = LWP0_CPU_INFO,
#endif
#ifdef LWP0_MD_INITIALIZER
.l_md = LWP0_MD_INITIALIZER,
#endif
.l_proc = &proc0,
.l_lid = 0, /* we own proc0's slot in the pid table */
.l_flag = LW_SYSTEM,
.l_stat = LSONPROC,
.l_ts = &turnstile0,
.l_syncobj = &sched_syncobj,
.l_refcnt = 0,
.l_priority = PRI_USER + NPRI_USER - 1,
.l_inheritedprio = -1,
.l_class = SCHED_OTHER,
.l_psid = PS_NONE,
.l_pi_lenders = SLIST_HEAD_INITIALIZER(&lwp0.l_pi_lenders),
.l_name = __UNCONST("swapper"),
.l_fd = &filedesc0,
};
static int
lwp_maxlwp(void)
{
/* Assume 1 LWP per 1MiB. */
uint64_t lwps_per = ctob(physmem) / (1024 * 1024);
return MAX(MIN(MAXMAXLWP, lwps_per), MAXLWP);
}
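/*
 * For example (figures illustrative only): with 4 GiB of physical memory,
 * lwps_per is 4096, so the default becomes 4096 provided that value lies
 * within the [MAXLWP, MAXMAXLWP] clamp applied above; machines with very
 * little or very much memory get the respective bound instead.
 */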
static int sysctl_kern_maxlwp(SYSCTLFN_PROTO);
/*
* sysctl helper routine for kern.maxlwp. Ensures that the new
* values are not too low or too high.
*/
static int
sysctl_kern_maxlwp(SYSCTLFN_ARGS)
{
int error, nmaxlwp;
struct sysctlnode node;
nmaxlwp = maxlwp;
node = *rnode;
node.sysctl_data = &nmaxlwp;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (nmaxlwp < 0 || nmaxlwp >= MAXMAXLWP)
return EINVAL;
if (nmaxlwp > lwp_maxlwp())
return EINVAL;
maxlwp = nmaxlwp;
return 0;
}
static void
sysctl_kern_lwp_setup(void)
{
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlwp",
SYSCTL_DESCR("Maximum number of simultaneous threads"),
sysctl_kern_maxlwp, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
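/*
 * The resulting node can be inspected and, within the bounds enforced by
 * sysctl_kern_maxlwp() above, adjusted at run time with sysctl(8), e.g.:
 *
 *	$ sysctl kern.maxlwp
 *	# sysctl -w kern.maxlwp=4096
 */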
void
lwpinit(void)
{
LIST_INIT(&alllwp);
lwpinit_specificdata();
/*
* Provide a barrier to ensure that all mutex_oncpu() and rw_oncpu()
* calls will exit before memory of LWPs is returned to the pool, where
* KVA of LWP structure might be freed and re-used for other purposes.
* Kernel preemption is disabled around mutex_oncpu() and rw_oncpu()
* callers, therefore a regular passive serialization barrier will
* do the job.
*/
lwp_cache = pool_cache_init(sizeof(lwp_t), MIN_LWP_ALIGNMENT, 0,
PR_PSERIALIZE, "lwppl", NULL, IPL_NONE, lwp_ctor, lwp_dtor, NULL);
maxlwp = lwp_maxlwp();
sysctl_kern_lwp_setup();
}
void
lwp0_init(void)
{
struct lwp *l = &lwp0;
KASSERT((void *)uvm_lwp_getuarea(l) != NULL);
LIST_INSERT_HEAD(&alllwp, l, l_list);
callout_init(&l->l_timeout_ch, CALLOUT_MPSAFE);
callout_setfunc(&l->l_timeout_ch, sleepq_timeout, l);
cv_init(&l->l_sigcv, "sigwait");
cv_init(&l->l_waitcv, "vfork");
l->l_cred = kauth_cred_hold(proc0.p_cred);
kdtrace_thread_ctor(NULL, l);
lwp_initspecific(l);
SYSCALL_TIME_LWP_INIT(l);
}
/*
* Initialize the non-zeroed portion of an lwp_t.
*/
static int
lwp_ctor(void *arg, void *obj, int flags)
{
lwp_t *l = obj;
l->l_stat = LSIDL;
l->l_cpu = curcpu();
l->l_mutex = l->l_cpu->ci_schedstate.spc_lwplock;
l->l_ts = kmem_alloc(sizeof(*l->l_ts), flags == PR_WAITOK ?
KM_SLEEP : KM_NOSLEEP);
if (l->l_ts == NULL) {
return ENOMEM;
} else {
turnstile_ctor(l->l_ts);
return 0;
}
}
static void
lwp_dtor(void *arg, void *obj)
{
lwp_t *l = obj;
/*
* The value of l->l_cpu must still be valid at this point.
*/
KASSERT(l->l_cpu != NULL);
/*
* We can't return turnstile0 to the pool (it didn't come from it),
* so if it comes up just drop it quietly and move on.
*/
if (l->l_ts != &turnstile0)
kmem_free(l->l_ts, sizeof(*l->l_ts));
}
/*
* Set an LWP suspended.
*
* Must be called with p_lock held, and the LWP locked. Will unlock the
* LWP before return.
*/
int
lwp_suspend(struct lwp *curl, struct lwp *t)
{
int error;
KASSERT(mutex_owned(t->l_proc->p_lock));
KASSERT(lwp_locked(t, NULL));
KASSERT(curl != t || curl->l_stat == LSONPROC);
/*
* If the current LWP has been told to exit, we must not suspend anyone
* else or deadlock could occur. We won't return to userspace.
*/
if ((curl->l_flag & (LW_WEXIT | LW_WCORE)) != 0) {
lwp_unlock(t);
return (EDEADLK);
}
if ((t->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(t);
return 0;
}
error = 0;
switch (t->l_stat) {
case LSRUN:
case LSONPROC:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
lwp_unlock(t);
break;
case LSSLEEP:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
/*
* Kick the LWP and try to get it to the kernel boundary
* so that it will release any locks that it holds.
* setrunnable() will release the lock.
*/
if ((t->l_flag & LW_SINTR) != 0)
setrunnable(t);
else
lwp_unlock(t);
break;
case LSSUSPENDED:
lwp_unlock(t);
break;
case LSSTOP:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
setrunnable(t);
break;
case LSIDL:
case LSZOMB:
error = EINTR; /* It's what Solaris does..... */
lwp_unlock(t);
break;
}
return (error);
}
/*
* Restart a suspended LWP.
*
* Must be called with p_lock held, and the LWP locked. Will unlock the
* LWP before return.
*/
void
lwp_continue(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
KASSERT(lwp_locked(l, NULL));
/* If rebooting or not suspended, then just bail out. */
if ((l->l_flag & LW_WREBOOT) != 0) {
lwp_unlock(l);
return;
}
l->l_flag &= ~LW_WSUSPEND;
if (l->l_stat != LSSUSPENDED || (l->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(l);
return;
}
/* setrunnable() will release the lock. */
setrunnable(l);
}
/*
* Restart a stopped LWP.
*
* Must be called with p_lock held, and the LWP NOT locked. Will unlock the
* LWP before return.
*/
void
lwp_unstop(struct lwp *l)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
lwp_lock(l);
KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
/* If not stopped, then just bail out. */
if (l->l_stat != LSSTOP) {
lwp_unlock(l);
return;
}
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
if (!p->p_waited)
p->p_pptr->p_nstopchild--;
if (l->l_wchan == NULL) {
/* setrunnable() will release the lock. */
setrunnable(l);
} else if (p->p_xsig && (l->l_flag & LW_SINTR) != 0) {
/* setrunnable() so we can receive the signal */
setrunnable(l);
} else {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
lwp_unlock(l);
}
}
/*
* Wait for an LWP within the current process to exit. If 'lid' is
* non-zero, we are waiting for a specific LWP.
*
* Must be called with p->p_lock held.
*/
int
lwp_wait(struct lwp *l, lwpid_t lid, lwpid_t *departed, bool exiting)
{
const lwpid_t curlid = l->l_lid;
proc_t *p = l->l_proc;
lwp_t *l2, *next;
int error;
KASSERT(mutex_owned(p->p_lock));
p->p_nlwpwait++;
l->l_waitingfor = lid;
for (;;) {
int nfound;
/*
* Avoid a race between exit1() and sigexit(): if the
* process is dumping core, then we need to bail out: call
* into lwp_userret() where we will be suspended until the
* deed is done.
*/
if ((p->p_sflag & PS_WCORE) != 0) {
mutex_exit(p->p_lock);
lwp_userret(l);
KASSERT(false);
}
/*
* First off, drain any detached LWP that is waiting to be
* reaped.
*/
if ((l2 = p->p_zomblwp) != NULL) {
p->p_zomblwp = NULL;
lwp_free(l2, false, false); /* releases proc mutex */
mutex_enter(p->p_lock);
continue;
}
/*
* Now look for an LWP to collect. If the whole process is
* exiting, count detached LWPs as eligible to be collected,
* but don't drain them here.
*/
nfound = 0;
error = 0;
/*
* If given a specific LID, go via pid_table and make sure
* it's not detached.
*/
if (lid != 0) {
l2 = proc_find_lwp(p, lid);
if (l2 == NULL) {
error = ESRCH;
break;
}
KASSERT(l2->l_lid == lid);
if ((l2->l_prflag & LPR_DETACHED) != 0) {
error = EINVAL;
break;
}
} else {
l2 = LIST_FIRST(&p->p_lwps);
}
for (; l2 != NULL; l2 = next) {
next = (lid != 0 ? NULL : LIST_NEXT(l2, l_sibling));
/*
* If a specific wait and the target is waiting on
* us, then avoid deadlock. This also traps LWPs
* that try to wait on themselves.
*
* Note that this does not handle more complicated
* cycles, like: t1 -> t2 -> t3 -> t1. The process
* can still be killed so it is not a major problem.
*/
if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
error = EDEADLK;
break;
}
if (l2 == l)
continue;
if ((l2->l_prflag & LPR_DETACHED) != 0) {
nfound += exiting;
continue;
}
if (lid != 0) {
/*
* Mark this LWP as the first waiter, if there
* is no other.
*/
if (l2->l_waiter == 0)
l2->l_waiter = curlid;
} else if (l2->l_waiter != 0) {
/*
* It already has a waiter - so don't
* collect it. If the waiter doesn't
* grab it we'll get another chance
* later.
*/
nfound++;
continue;
}
nfound++;
/* No need to lock the LWP in order to see LSZOMB. */
if (l2->l_stat != LSZOMB)
continue;
/*
* We're no longer waiting. Reset the "first waiter"
* pointer on the target, in case it was us.
*/
l->l_waitingfor = 0;
l2->l_waiter = 0;
p->p_nlwpwait--;
if (departed)
*departed = l2->l_lid;
sched_lwp_collect(l2);
/* lwp_free() releases the proc lock. */
lwp_free(l2, false, false);
mutex_enter(p->p_lock);
return 0;
}
if (error != 0)
break;
if (nfound == 0) {
error = ESRCH;
break;
}
/*
* Note: since the lock will be dropped, need to restart on
* wakeup to run all LWPs again, e.g. there may be new LWPs.
*/
if (exiting) {
KASSERT(p->p_nlwps > 1);
error = cv_timedwait(&p->p_lwpcv, p->p_lock, 1);
break;
}
/*
* Break out if all LWPs are in _lwp_wait(). There are
* other ways to hang the process with _lwp_wait(), but the
* sleep is interruptible so there is little point in checking for them.
*/
if (p->p_nlwpwait == p->p_nlwps) {
error = EDEADLK;
break;
}
/*
* Sit around and wait for something to happen. We'll be
* awoken if any of the conditions examined change: if an
* LWP exits, is collected, or is detached.
*/
if ((error = cv_wait_sig(&p->p_lwpcv, p->p_lock)) != 0)
break;
}
/*
* We didn't find any LWPs to collect, we may have received a
* signal, or some other condition has caused us to bail out.
*
* If waiting on a specific LWP, clear the waiters marker: some
* other LWP may want it. Then, kick all the remaining waiters
* so that they can re-check for zombies and for deadlock.
*/
if (lid != 0) {
l2 = proc_find_lwp(p, lid);
KASSERT(l2 == NULL || l2->l_lid == lid);
if (l2 != NULL && l2->l_waiter == curlid)
l2->l_waiter = 0;
}
p->p_nlwpwait--;
l->l_waitingfor = 0;
cv_broadcast(&p->p_lwpcv);
return error;
}
/*
* Create a new LWP within process 'p2', using LWP 'l1' as a template.
* The new LWP is created in state LSIDL and must be set running,
* suspended, or stopped by the caller.
*/
int
lwp_create(lwp_t *l1, proc_t *p2, vaddr_t uaddr, int flags,
void *stack, size_t stacksize, void (*func)(void *), void *arg,
lwp_t **rnewlwpp, int sclass, const sigset_t *sigmask,
const stack_t *sigstk)
{
struct lwp *l2;
KASSERT(l1 == curlwp || l1->l_proc == &proc0);
/*
* Enforce limits, excluding the first lwp and kthreads. We must
* use the process credentials here when adjusting the limit, as
* they are what's tied to the accounting entity. However for
* authorizing the action, we'll use the LWP's credentials.
*/
mutex_enter(p2->p_lock);
if (p2->p_nlwps != 0 && p2 != &proc0) {
uid_t uid = kauth_cred_getuid(p2->p_cred);
int count = chglwpcnt(uid, 1);
if (__predict_false(count >
p2->p_rlimit[RLIMIT_NTHR].rlim_cur)) {
if (kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_RLIMIT, p2,
KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p2->p_rlimit[RLIMIT_NTHR], KAUTH_ARG(RLIMIT_NTHR))
!= 0) {
(void)chglwpcnt(uid, -1);
mutex_exit(p2->p_lock);
return EAGAIN;
}
}
}
/*
* First off, reap any detached LWP waiting to be collected.
* We can re-use its LWP structure and turnstile.
*/
if ((l2 = p2->p_zomblwp) != NULL) {
p2->p_zomblwp = NULL;
lwp_free(l2, true, false);
/* p2 now unlocked by lwp_free() */
KASSERT(l2->l_ts != NULL);
KASSERT(l2->l_inheritedprio == -1);
KASSERT(SLIST_EMPTY(&l2->l_pi_lenders));
memset(&l2->l_startzero, 0, sizeof(*l2) -
offsetof(lwp_t, l_startzero));
} else {
mutex_exit(p2->p_lock);
l2 = pool_cache_get(lwp_cache, PR_WAITOK);
memset(&l2->l_startzero, 0, sizeof(*l2) -
offsetof(lwp_t, l_startzero));
SLIST_INIT(&l2->l_pi_lenders);
}
/*
* Because of lockless lookup via pid_table, the LWP can be locked
* and inspected briefly even after it's freed, so a few fields are
* kept stable.
*/
KASSERT(l2->l_stat == LSIDL);
KASSERT(l2->l_cpu != NULL);
KASSERT(l2->l_ts != NULL);
KASSERT(l2->l_mutex == l2->l_cpu->ci_schedstate.spc_lwplock);
l2->l_proc = p2;
l2->l_refcnt = 0;
l2->l_class = sclass;
/*
* Allocate a process ID for this LWP. We need to do this now
* while we can still unwind if it fails. Because we're marked
* as LSIDL, no lookups by the ID will succeed.
*
* N.B. this will always succeed for the first LWP in a process,
* because proc_alloc_lwpid() will usurp the slot. Also note
* that l2->l_proc MUST be valid so that lookups of the proc
* will succeed, even if the LWP itself is not visible.
*/
if (__predict_false(proc_alloc_lwpid(p2, l2) == -1)) {
pool_cache_put(lwp_cache, l2);
return EAGAIN;
}
/*
* If vfork(), we want the LWP to run fast and on the same CPU
* as its parent, so that it can reuse the VM context and cache
* footprint on the local CPU.
*/
l2->l_boostpri = ((flags & LWP_VFORK) ? PRI_KERNEL : PRI_USER);
l2->l_priority = l1->l_priority;
l2->l_inheritedprio = -1;
l2->l_protectprio = -1;
l2->l_auxprio = -1;
l2->l_flag = 0;
l2->l_pflag = LP_MPSAFE;
TAILQ_INIT(&l2->l_ld_locks);
l2->l_psrefs = 0;
kmsan_lwp_alloc(l2);
/*
* For vfork, borrow parent's lwpctl context if it exists.
* This also causes us to return via lwp_userret.
*/
if (flags & LWP_VFORK && l1->l_lwpctl) {
l2->l_lwpctl = l1->l_lwpctl;
l2->l_flag |= LW_LWPCTL;
}
/*
* If not the first LWP in the process, grab a reference to the
* descriptor table.
*/
l2->l_fd = p2->p_fd;
if (p2->p_nlwps != 0) {
KASSERT(l1->l_proc == p2);
fd_hold(l2);
} else {
KASSERT(l1->l_proc != p2);
}
if (p2->p_flag & PK_SYSTEM) {
/* Mark it as a system LWP. */
l2->l_flag |= LW_SYSTEM;
}
kdtrace_thread_ctor(NULL, l2);
lwp_initspecific(l2);
sched_lwp_fork(l1, l2);
callout_init(&l2->l_timeout_ch, CALLOUT_MPSAFE);
callout_setfunc(&l2->l_timeout_ch, sleepq_timeout, l2);
cv_init(&l2->l_sigcv, "sigwait");
cv_init(&l2->l_waitcv, "vfork");
l2->l_syncobj = &sched_syncobj;
PSREF_DEBUG_INIT_LWP(l2);
if (rnewlwpp != NULL)
*rnewlwpp = l2;
/*
* PCU state needs to be saved before calling uvm_lwp_fork() so that
* the MD cpu_lwp_fork() can copy the saved state to the new LWP.
*/
pcu_save_all(l1);
#if PCU_UNIT_COUNT > 0
l2->l_pcu_valid = l1->l_pcu_valid;
#endif
uvm_lwp_setuarea(l2, uaddr);
uvm_lwp_fork(l1, l2, stack, stacksize, func, (arg != NULL) ? arg : l2);
mutex_enter(p2->p_lock);
l2->l_cred = kauth_cred_hold(p2->p_cred);
if ((flags & LWP_DETACHED) != 0) {
l2->l_prflag = LPR_DETACHED;
p2->p_ndlwps++;
} else
l2->l_prflag = 0;
if (l1->l_proc == p2) {
/*
* These flags are set while p_lock is held. Copy with
* p_lock held too, so the LWP doesn't sneak into the
* process without them being set.
*/
l2->l_flag |= (l1->l_flag & (LW_WEXIT | LW_WREBOOT | LW_WCORE));
} else {
/* fork(): pending core/exit doesn't apply to child. */
l2->l_flag |= (l1->l_flag & LW_WREBOOT);
}
l2->l_sigstk = *sigstk;
l2->l_sigmask = *sigmask;
TAILQ_INIT(&l2->l_sigpend.sp_info);
sigemptyset(&l2->l_sigpend.sp_set);
LIST_INSERT_HEAD(&p2->p_lwps, l2, l_sibling);
p2->p_nlwps++;
p2->p_nrlwps++;
KASSERT(l2->l_affinity == NULL);
/* Inherit the affinity mask. */
if (l1->l_affinity) {
/*
* Note that we hold the state lock while inheriting
* the affinity to avoid race with sched_setaffinity().
*/
lwp_lock(l1);
if (l1->l_affinity) {
kcpuset_use(l1->l_affinity);
l2->l_affinity = l1->l_affinity;
}
lwp_unlock(l1);
}
/* Ensure a trip through lwp_userret() if needed. */
if ((l2->l_flag & LW_USERRET) != 0) {
lwp_need_userret(l2);
}
/* This marks the end of the "must be atomic" section. */
mutex_exit(p2->p_lock);
SDT_PROBE(proc, kernel, , lwp__create, l2, 0, 0, 0, 0);
mutex_enter(&proc_lock);
LIST_INSERT_HEAD(&alllwp, l2, l_list);
/* Inherit a processor-set */
l2->l_psid = l1->l_psid;
mutex_exit(&proc_lock);
SYSCALL_TIME_LWP_INIT(l2);
if (p2->p_emul->e_lwp_fork)
(*p2->p_emul->e_lwp_fork)(l1, l2);
return (0);
}
/*
* Set a new LWP running. If the process is stopping, then the LWP is
* created stopped.
*/
void
lwp_start(lwp_t *l, int flags)
{
proc_t *p = l->l_proc;
mutex_enter(p->p_lock);
lwp_lock(l);
KASSERT(l->l_stat == LSIDL);
if ((flags & LWP_SUSPENDED) != 0) {
/* It'll suspend itself in lwp_userret(). */
l->l_flag |= LW_WSUSPEND;
lwp_need_userret(l);
}
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
KASSERT(l->l_wchan == NULL);
l->l_stat = LSSTOP;
p->p_nrlwps--;
lwp_unlock(l);
} else {
setrunnable(l);
/* LWP now unlocked */
}
mutex_exit(p->p_lock);
}
/*
* Called by MD code when a new LWP begins execution. Must be called
* with the previous LWP locked (so at splsched), or if there is no
* previous LWP, at splsched.
*/
void
lwp_startup(struct lwp *prev, struct lwp *new_lwp)
{
kmutex_t *lock;
KASSERTMSG(new_lwp == curlwp, "l %p curlwp %p prevlwp %p", new_lwp, curlwp, prev);
KASSERT(kpreempt_disabled());
KASSERT(prev != NULL);
KASSERT((prev->l_pflag & LP_RUNNING) != 0);
KASSERT(curcpu()->ci_mtx_count == -2);
/*
* Immediately mark the previous LWP as no longer running and
* unlock (to keep lock wait times as short as possible). If a
* zombie, don't touch after clearing LP_RUNNING as it could be
* reaped by another CPU. Use atomic_store_release to ensure
* this -- matches atomic_load_acquire in lwp_free.
*/
lock = prev->l_mutex;
if (__predict_false(prev->l_stat == LSZOMB)) {
atomic_store_release(&prev->l_pflag,
prev->l_pflag & ~LP_RUNNING);
} else {
prev->l_pflag &= ~LP_RUNNING;
}
mutex_spin_exit(lock);
/* Correct spin mutex count after mi_switch(). */
curcpu()->ci_mtx_count = 0;
/* Install new VM context. */
if (__predict_true(new_lwp->l_proc->p_vmspace)) {
pmap_activate(new_lwp);
}
/* We remain at IPL_SCHED from mi_switch() - reset it. */
spl0();
LOCKDEBUG_BARRIER(NULL, 0);
SDT_PROBE(proc, kernel, , lwp__start, new_lwp, 0, 0, 0, 0);
/* For kthreads, acquire kernel lock if not MPSAFE. */
if (__predict_false((new_lwp->l_pflag & LP_MPSAFE) == 0)) {
KERNEL_LOCK(1, new_lwp);
}
}
/*
* Exit an LWP.
*
* *** WARNING *** This can be called with (l != curlwp) in error paths.
*/
void
lwp_exit(struct lwp *l)
{
struct proc *p = l->l_proc;
struct lwp *l2;
bool current;
current = (l == curlwp);
KASSERT(current || l->l_stat == LSIDL);
KASSERT(current || l->l_target_cpu == NULL);
KASSERT(p == curproc);
SDT_PROBE(proc, kernel, , lwp__exit, l, 0, 0, 0, 0);
/* Verify that we hold no locks; for DIAGNOSTIC check kernel_lock. */
LOCKDEBUG_BARRIER(NULL, 0);
KASSERTMSG(curcpu()->ci_biglock_count == 0, "kernel_lock leaked");
/*
* If we are the last live LWP in a process, we need to exit the
* entire process. We do so with an exit status of zero, because
* it's a "controlled" exit, and because that's what Solaris does.
*
* We are not quite a zombie yet, but for accounting purposes we
* must increment the count of zombies here.
*
* Note: the last LWP's specificdata will be deleted here.
*/
mutex_enter(p->p_lock);
if (p->p_nlwps - p->p_nzlwps == 1) {
KASSERT(current == true);
KASSERT(p != &proc0);
exit1(l, 0, 0);
/* NOTREACHED */
}
p->p_nzlwps++;
/*
* Perform any required thread cleanup. Do this early so
* anyone wanting to look us up with lwp_getref_lwpid() will
* fail to find us before we become a zombie.
*
* N.B. this will unlock p->p_lock on our behalf.
*/
lwp_thread_cleanup(l);
if (p->p_emul->e_lwp_exit)
(*p->p_emul->e_lwp_exit)(l);
/* Drop filedesc reference. */
fd_free();
/* Release fstrans private data. */
fstrans_lwp_dtor(l);
/* Delete the specificdata while it's still safe to sleep. */
lwp_finispecific(l);
/*
* Release our cached credentials.
*/
kauth_cred_free(l->l_cred);
callout_destroy(&l->l_timeout_ch);
/*
* If traced, report LWP exit event to the debugger.
*
* Remove the LWP from the global list.
* Free its LID from the PID namespace if needed.
*/
mutex_enter(&proc_lock);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACELWP_EXIT)) ==
(PSL_TRACED|PSL_TRACELWP_EXIT)) {
mutex_enter(p->p_lock);
if (ISSET(p->p_sflag, PS_WEXIT)) {
mutex_exit(p->p_lock);
/*
* We are exiting, bail out without informing parent
* about a terminating LWP as it would deadlock.
*/
} else {
eventswitch(TRAP_LWP, PTRACE_LWP_EXIT, l->l_lid);
mutex_enter(&proc_lock);
}
}
LIST_REMOVE(l, l_list);
mutex_exit(&proc_lock);
/*
* Get rid of all references to the LWP that others (e.g. procfs)
* may have, and mark the LWP as a zombie. If the LWP is detached,
* mark it waiting for collection in the proc structure. Note that
* before we can do that, we need to free any other dead, detached
* LWP waiting to meet its maker.
*
* All conditions need to be observed under the same hold of
* p_lock, because if the lock is dropped any of them can change.
*/
mutex_enter(p->p_lock);
for (;;) {
if (lwp_drainrefs(l))
continue;
if ((l->l_prflag & LPR_DETACHED) != 0) {
if ((l2 = p->p_zomblwp) != NULL) {
p->p_zomblwp = NULL;
lwp_free(l2, false, false);
/* proc now unlocked */
mutex_enter(p->p_lock);
continue;
}
p->p_zomblwp = l;
}
break;
}
/*
* If we find a pending signal for the process and we have been
* asked to check for signals, then we lose: arrange to have
* all other LWPs in the process check for signals.
*/
if ((l->l_flag & LW_PENDSIG) != 0 &&
firstsig(&p->p_sigpend.sp_set) != 0) {
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
lwp_lock(l2);
signotify(l2);
lwp_unlock(l2);
}
}
/*
* Release any PCU resources before becoming a zombie.
*/
pcu_discard_all(l);
lwp_lock(l);
l->l_stat = LSZOMB;
if (l->l_name != NULL) {
strcpy(l->l_name, "(zombie)");
}
lwp_unlock(l);
p->p_nrlwps--;
if (l->l_lwpctl != NULL)
l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;
mutex_exit(p->p_lock);
cv_broadcast(&p->p_lwpcv);
/*
* We can no longer block. At this point, lwp_free() may already
* be gunning for us. On a multi-CPU system, we may be off p_lwps.
*
* Free MD LWP resources.
*/
cpu_lwp_free(l, 0);
if (current) {
/* Switch away into oblivion. */
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
panic("lwp_exit");
}
}
/*
* Free a dead LWP's remaining resources.
*
* XXXLWP limits.
*/
void
lwp_free(struct lwp *l, bool recycle, bool last)
{
struct proc *p = l->l_proc;
struct rusage *ru;
ksiginfoq_t kq;
KASSERT(l != curlwp);
KASSERT(last || mutex_owned(p->p_lock));
/*
* We use the process credentials instead of the lwp credentials here
* because the lwp credentials may be cached (just after a setuid call)
* and we don't want to pay for syncing, since the lwp is going away
* anyway.
*/
if (p != &proc0 && p->p_nlwps != 1)
(void)chglwpcnt(kauth_cred_getuid(p->p_cred), -1);
/*
* In the unlikely event that the LWP is still on the CPU,
* then spin until it has switched away.
*
* atomic_load_acquire matches atomic_store_release in
* lwp_startup and mi_switch.
*/
while (__predict_false((atomic_load_acquire(&l->l_pflag) & LP_RUNNING)
!= 0)) {
SPINLOCK_BACKOFF_HOOK;
}
/*
* Now that the LWP's known off the CPU, reset its state back to
* LSIDL, which defeats anything that might have gotten a hold on
* the LWP via pid_table before the ID was freed. It's important
* to do this with both the LWP locked and p_lock held.
*
* Also reset the CPU and lock pointer back to curcpu(), since the
* LWP will in all likelihood be cached with the current CPU in
* lwp_cache when we free it and later allocated from there again
* (avoid incidental lock contention).
*/
lwp_lock(l);
l->l_stat = LSIDL;
l->l_cpu = curcpu();
lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_lwplock);
/*
* If this was not the last LWP in the process, then adjust counters
* and unlock. This is done differently for the last LWP in exit1().
*/
if (!last) {
/*
* Add the LWP's run time to the process' base value.
* This needs to coincide with coming off p_lwps.
*/
bintime_add(&p->p_rtime, &l->l_rtime);
p->p_pctcpu += l->l_pctcpu;
ru = &p->p_stats->p_ru;
ruadd(ru, &l->l_ru);
LIST_REMOVE(l, l_sibling);
p->p_nlwps--;
p->p_nzlwps--;
if ((l->l_prflag & LPR_DETACHED) != 0)
p->p_ndlwps--;
mutex_exit(p->p_lock);
/*
* Have any LWPs sleeping in lwp_wait() recheck for
* deadlock.
*/
cv_broadcast(&p->p_lwpcv);
/* Free the LWP ID. */
mutex_enter(&proc_lock);
proc_free_lwpid(p, l->l_lid);
mutex_exit(&proc_lock);
}
/*
* Destroy the LWP's remaining signal information.
*/
ksiginfo_queue_init(&kq);
sigclear(&l->l_sigpend, NULL, &kq);
ksiginfo_queue_drain(&kq);
cv_destroy(&l->l_sigcv);
cv_destroy(&l->l_waitcv);
/*
* Free lwpctl structure and affinity.
*/
if (l->l_lwpctl) {
lwp_ctl_free(l);
}
if (l->l_affinity) {
kcpuset_unuse(l->l_affinity, NULL);
l->l_affinity = NULL;
}
/*
* Free remaining data structures and the LWP itself unless the
* caller wants to recycle.
*/
if (l->l_name != NULL)
kmem_free(l->l_name, MAXCOMLEN);
kmsan_lwp_free(l);
kcov_lwp_free(l);
cpu_lwp_free2(l);
uvm_lwp_exit(l);
KASSERT(SLIST_EMPTY(&l->l_pi_lenders));
KASSERT(l->l_inheritedprio == -1);
KASSERT(l->l_blcnt == 0);
kdtrace_thread_dtor(NULL, l);
if (!recycle)
pool_cache_put(lwp_cache, l);
}
/*
* Migrate the LWP to another CPU. Unlocks the LWP.
*/
void
lwp_migrate(lwp_t *l, struct cpu_info *tci)
{
struct schedstate_percpu *tspc;
int lstat = l->l_stat;
KASSERT(lwp_locked(l, NULL));
KASSERT(tci != NULL);
/* If LWP is still on the CPU, it must be handled like LSONPROC */
if ((l->l_pflag & LP_RUNNING) != 0) {
lstat = LSONPROC;
}
/*
* The destination CPU could be changed while previous migration
* was not finished.
*/
if (l->l_target_cpu != NULL) {
l->l_target_cpu = tci;
lwp_unlock(l);
return;
}
/* Nothing to do if trying to migrate to the same CPU */
if (l->l_cpu == tci) {
lwp_unlock(l);
return;
}
KASSERT(l->l_target_cpu == NULL);
tspc = &tci->ci_schedstate;
switch (lstat) {
case LSRUN:
l->l_target_cpu = tci;
break;
case LSSLEEP:
l->l_cpu = tci;
break;
case LSIDL:
case LSSTOP:
case LSSUSPENDED:
l->l_cpu = tci;
if (l->l_wchan == NULL) {
lwp_unlock_to(l, tspc->spc_lwplock);
return;
}
break;
case LSONPROC:
l->l_target_cpu = tci;
spc_lock(l->l_cpu);
sched_resched_cpu(l->l_cpu, PRI_USER_RT, true);
/* spc now unlocked */
break;
}
lwp_unlock(l);
}
#define lwp_find_exclude(l) \
((l)->l_stat == LSIDL || (l)->l_stat == LSZOMB)
/*
* Find the LWP in the process. Arguments may be zero, in which case
* the calling process and the first LWP in the list will be used.
* On success - returns proc locked.
*
* => pid == 0 -> look in curproc.
* => pid == -1 -> match any proc.
* => otherwise look up the proc.
*
* => lid == 0 -> first LWP in the proc
* => otherwise specific LWP
*/
struct lwp *
lwp_find2(pid_t pid, lwpid_t lid)
{
proc_t *p;
lwp_t *l;
/* First LWP of specified proc. */
if (lid == 0) {
switch (pid) {
case -1:
/* No lookup keys. */
return NULL;
case 0:
p = curproc;
mutex_enter(p->p_lock);
break;
default:
mutex_enter(&proc_lock);
p = proc_find(pid);
if (__predict_false(p == NULL)) {
mutex_exit(&proc_lock);
return NULL;
}
mutex_enter(p->p_lock);
mutex_exit(&proc_lock);
break;
}
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (__predict_true(!lwp_find_exclude(l)))
break;
}
goto out;
}
l = proc_find_lwp_acquire_proc(lid, &p);
if (l == NULL)
return NULL;
KASSERT(p != NULL);
KASSERT(mutex_owned(p->p_lock));
if (__predict_false(lwp_find_exclude(l))) {
l = NULL;
goto out;
}
/* Apply proc filter, if applicable. */
switch (pid) {
case -1:
/* Match anything. */
break;
case 0:
if (p != curproc)
l = NULL;
break;
default:
if (p->p_pid != pid)
l = NULL;
break;
}
out:
if (__predict_false(l == NULL)) {
mutex_exit(p->p_lock);
}
return l;
}
/*
* Look up a live LWP within the specified process.
*
* Must be called with p->p_lock held (as it looks at the radix tree,
* and also wants to exclude idle and zombie LWPs).
*/
struct lwp *
lwp_find(struct proc *p, lwpid_t id)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
l = proc_find_lwp(p, id);
KASSERT(l == NULL || l->l_lid == id);
/*
* No need to lock - all of these conditions will
* be visible with the process level mutex held.
*/
if (__predict_false(l != NULL && lwp_find_exclude(l)))
l = NULL;
return l;
}
/*
* Verify that an LWP is locked, and optionally verify that the lock matches
* one we specify.
*/
int
lwp_locked(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *cur = l->l_mutex;
return mutex_owned(cur) && (mtx == cur || mtx == NULL);
}
/*
* Lend a new mutex to an LWP. The old mutex must be held.
*/
kmutex_t *
lwp_setlock(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *oldmtx = l->l_mutex;
KASSERT(mutex_owned(oldmtx));
atomic_store_release(&l->l_mutex, mtx);
return oldmtx;
}
/*
* Lend a new mutex to an LWP, and release the old mutex. The old mutex
* must be held.
*/
void
lwp_unlock_to(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *old;
KASSERT(lwp_locked(l, NULL));
old = l->l_mutex;
atomic_store_release(&l->l_mutex, mtx);
mutex_spin_exit(old);
}
int
lwp_trylock(struct lwp *l)
{
kmutex_t *old;
for (;;) {
if (!mutex_tryenter(old = atomic_load_consume(&l->l_mutex)))
return 0;
if (__predict_true(atomic_load_relaxed(&l->l_mutex) == old))
return 1;
mutex_spin_exit(old);
}
}
void
lwp_unsleep(lwp_t *l, bool unlock)
{
KASSERT(mutex_owned(l->l_mutex));
(*l->l_syncobj->sobj_unsleep)(l, unlock);
}
/*
* Lock an LWP.
*/
void
lwp_lock(lwp_t *l)
{
kmutex_t *old = atomic_load_consume(&l->l_mutex);
/*
* Note: mutex_spin_enter() will have posted a read barrier.
* Re-test l->l_mutex. If it has changed, we need to try again.
*/
mutex_spin_enter(old);
while (__predict_false(atomic_load_relaxed(&l->l_mutex) != old)) {
mutex_spin_exit(old);
old = atomic_load_consume(&l->l_mutex);
mutex_spin_enter(old);
}
}
/*
* Unlock an LWP.
*/
void
lwp_unlock(lwp_t *l)
{
mutex_spin_exit(l->l_mutex);
}
void
lwp_changepri(lwp_t *l, pri_t pri)
{
KASSERT(mutex_owned(l->l_mutex));
if (l->l_priority == pri)
return;
(*l->l_syncobj->sobj_changepri)(l, pri);
KASSERT(l->l_priority == pri);
}
void
lwp_lendpri(lwp_t *l, pri_t pri)
{
KASSERT(mutex_owned(l->l_mutex));
(*l->l_syncobj->sobj_lendpri)(l, pri);
KASSERT(l->l_inheritedprio == pri);
}
pri_t
lwp_eprio(lwp_t *l)
{
pri_t pri = l->l_priority;
KASSERT(mutex_owned(l->l_mutex));
/*
* Timeshared/user LWPs get a temporary priority boost for blocking
* in kernel. This is key to good interactive response on a loaded
* system: without it, things will seem very sluggish to the user.
*
* The function of the boost is to get the LWP onto a CPU and
* running quickly. Once that happens the LWP loses the priority
* boost and could be preempted very quickly by another LWP but that
* won't happen often enough to be an annoyance.
*/
if (pri <= MAXPRI_USER && l->l_boostpri > MAXPRI_USER)
pri = (pri >> 1) + l->l_boostpri;
return MAX(l->l_auxprio, pri);
}
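/*
 * Worked example of the boost calculation above (illustrative only;
 * the concrete numbers assume a priority layout where MAXPRI_USER is
 * 63): a timeshared LWP at priority 40 that blocked in the kernel with
 * l_boostpri == 72 yields (40 >> 1) + 72 == 92, and with no priority
 * inheritance in effect (l_auxprio == -1) lwp_eprio() returns 92.
 */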
/*
* Handle exceptions for mi_userret(). Called if a member of LW_USERRET is
* set or a preemption is required.
*/
void
lwp_userret(struct lwp *l)
{
struct proc *p;
int sig, f;
KASSERT(l == curlwp);
KASSERT(l->l_stat == LSONPROC);
p = l->l_proc;
for (;;) {
/*
* This is the main location that user preemptions are
* processed.
*/
preempt_point();
/*
* It is safe to do this unlocked and without raised SPL,
* since whenever a flag of interest is added to l_flag the
* LWP will take an AST and come down this path again. If a
* remote CPU posts the AST, it will be done with an IPI
* (strongly synchronising).
*/
if ((f = atomic_load_relaxed(&l->l_flag) & LW_USERRET) == 0) {
return;
}
/*
* Start out with the correct credentials.
*/
if ((f & LW_CACHECRED) != 0) {
kauth_cred_t oc = l->l_cred;
mutex_enter(p->p_lock);
l->l_cred = kauth_cred_hold(p->p_cred);
lwp_lock(l);
l->l_flag &= ~LW_CACHECRED;
lwp_unlock(l);
mutex_exit(p->p_lock);
kauth_cred_free(oc);
}
/*
* Process pending signals first, unless the process
* is dumping core or exiting, where we will instead
* enter the LW_WSUSPEND case below.
*/
if ((f & (LW_PENDSIG | LW_WCORE | LW_WEXIT)) == LW_PENDSIG) {
mutex_enter(p->p_lock);
while ((sig = issignal(l)) != 0)
postsig(sig);
mutex_exit(p->p_lock);
continue;
}
/*
* Core-dump or suspend pending.
*
* In case of core dump, suspend ourselves, so that the kernel
* stack and therefore the userland registers saved in the
* trapframe are around for coredump() to write them out.
* We also need to save any PCU resources that we have so that
* they are accessible to coredump(). We issue a wakeup on
* p->p_lwpcv so that sigexit() will write the core file out
* once all other LWPs are suspended.
*/
if ((f & LW_WSUSPEND) != 0) {
pcu_save_all(l);
mutex_enter(p->p_lock);
p->p_nrlwps--;
lwp_lock(l);
l->l_stat = LSSUSPENDED;
lwp_unlock(l);
mutex_exit(p->p_lock);
cv_broadcast(&p->p_lwpcv);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
continue;
}
/*
* Process is exiting. The core dump and signal cases must
* be handled first.
*/
if ((f & LW_WEXIT) != 0) {
lwp_exit(l);
KASSERT(0);
/* NOTREACHED */
}
/*
* Update lwpctl processor (for vfork child_return).
*/
if ((f & LW_LWPCTL) != 0) {
lwp_lock(l);
KASSERT(kpreempt_disabled());
l->l_lwpctl->lc_curcpu = (int)cpu_index(l->l_cpu);
l->l_lwpctl->lc_pctr++;
l->l_flag &= ~LW_LWPCTL;
lwp_unlock(l);
continue;
}
}
}
/*
* Force an LWP to enter the kernel, to take a trip through lwp_userret().
*/
void
lwp_need_userret(struct lwp *l)
{
KASSERT(!cpu_intr_p());
KASSERT(lwp_locked(l, NULL) || l->l_stat == LSIDL);
/*
* If the LWP is in any state other than LSONPROC, we know that it
* is executing in-kernel and will hit userret() on the way out.
*
* If the LWP is curlwp, then we know we'll be back out to userspace
* soon (can't be called from a hardware interrupt here).
*
* Otherwise, we can't be sure what the LWP is doing, so first make
* sure the update to l_flag will be globally visible, and then
* force the LWP to take a trip through trap() where it will do
* userret().
*/
if (l->l_stat == LSONPROC && l != curlwp) {
membar_producer();
cpu_signotify(l);
}
}
/*
* Add one reference to an LWP. This will prevent the LWP from
* exiting, thus keeping the lwp structure and PCB around to inspect.
*/
void
lwp_addref(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
KASSERT(l->l_stat != LSZOMB);
l->l_refcnt++;
}
/*
* Remove one reference to an LWP. If this is the last reference,
* then we must finalize the LWP's death.
*/
void
lwp_delref(struct lwp *l)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
lwp_delref2(l);
mutex_exit(p->p_lock);
}
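/*
 * Illustrative sketch (not part of the kernel proper) of the reference
 * pattern used by e.g. procfs when inspecting an LWP: take the
 * reference under p_lock, drop the lock while inspecting, then release
 * the reference. inspect_lwp() is a hypothetical placeholder.
 *
 *	mutex_enter(p->p_lock);
 *	lwp_addref(l);
 *	mutex_exit(p->p_lock);
 *
 *	inspect_lwp(l);
 *
 *	lwp_delref(l);
 */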
/*
* Remove one reference to an LWP. If this is the last reference,
* then we must finalize the LWP's death. The proc mutex is held
* on entry.
*/
void
lwp_delref2(struct lwp *l)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_stat != LSZOMB);
KASSERT(l->l_refcnt > 0);
if (--l->l_refcnt == 0)
cv_broadcast(&p->p_lwpcv);
}
/*
* Drain all references to the current LWP. Returns true if
* we blocked.
*/
bool
lwp_drainrefs(struct lwp *l)
{
struct proc *p = l->l_proc;
bool rv = false;
KASSERT(mutex_owned(p->p_lock));
l->l_prflag |= LPR_DRAINING;
while (l->l_refcnt > 0) {
rv = true;
cv_wait(&p->p_lwpcv, p->p_lock);
}
return rv;
}
/*
* Return true if the specified LWP is 'alive'. Only p->p_lock need
* be held.
*/
bool
lwp_alive(lwp_t *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
switch (l->l_stat) {
case LSSLEEP:
case LSRUN:
case LSONPROC:
case LSSTOP:
case LSSUSPENDED:
return true;
default:
return false;
}
}
/*
* Return first live LWP in the process.
*/
lwp_t *
lwp_find_first(proc_t *p)
{
lwp_t *l;
KASSERT(mutex_owned(p->p_lock));
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (lwp_alive(l)) {
return l;
}
}
return NULL;
}
/*
* Allocate a new lwpctl structure for a user LWP.
*/
int
lwp_ctl_alloc(vaddr_t *uaddr)
{
lcproc_t *lp;
u_int bit, i, offset;
struct uvm_object *uao;
int error;
lcpage_t *lcp;
proc_t *p;
lwp_t *l;
l = curlwp;
p = l->l_proc;
/* don't allow a vforked process to create lwp ctls */
if (p->p_lflag & PL_PPWAIT)
return EBUSY;
if (l->l_lcpage != NULL) {
lcp = l->l_lcpage;
*uaddr = lcp->lcp_uaddr + (vaddr_t)l->l_lwpctl - lcp->lcp_kaddr;
return 0;
}
/* First time around, allocate header structure for the process. */
if ((lp = p->p_lwpctl) == NULL) {
lp = kmem_alloc(sizeof(*lp), KM_SLEEP);
mutex_init(&lp->lp_lock, MUTEX_DEFAULT, IPL_NONE);
lp->lp_uao = NULL;
TAILQ_INIT(&lp->lp_pages);
mutex_enter(p->p_lock);
if (p->p_lwpctl == NULL) {
p->p_lwpctl = lp;
mutex_exit(p->p_lock);
} else {
mutex_exit(p->p_lock);
mutex_destroy(&lp->lp_lock);
kmem_free(lp, sizeof(*lp));
lp = p->p_lwpctl;
}
}
/*
* Set up an anonymous memory region to hold the shared pages.
* Map them into the process' address space. The user vmspace
* gets the first reference on the UAO.
*/
mutex_enter(&lp->lp_lock);
if (lp->lp_uao == NULL) {
lp->lp_uao = uao_create(LWPCTL_UAREA_SZ, 0);
lp->lp_cur = 0;
lp->lp_max = LWPCTL_UAREA_SZ;
lp->lp_uva = p->p_emul->e_vm_default_addr(p,
(vaddr_t)p->p_vmspace->vm_daddr, LWPCTL_UAREA_SZ,
p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
error = uvm_map(&p->p_vmspace->vm_map, &lp->lp_uva,
LWPCTL_UAREA_SZ, lp->lp_uao, 0, 0, UVM_MAPFLAG(UVM_PROT_RW,
UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_NORMAL, 0));
if (error != 0) {
uao_detach(lp->lp_uao);
lp->lp_uao = NULL;
mutex_exit(&lp->lp_lock);
return error;
}
}
/* Get a free block and allocate for this LWP. */
TAILQ_FOREACH(lcp, &lp->lp_pages, lcp_chain) {
if (lcp->lcp_nfree != 0)
break;
}
if (lcp == NULL) {
/* Nothing available - try to set up a free page. */
if (lp->lp_cur == lp->lp_max) {
mutex_exit(&lp->lp_lock);
return ENOMEM;
}
lcp = kmem_alloc(LWPCTL_LCPAGE_SZ, KM_SLEEP);
/*
* Wire the next page down in kernel space. Since this
* is a new mapping, we must add a reference.
*/
uao = lp->lp_uao;
(*uao->pgops->pgo_reference)(uao);
lcp->lcp_kaddr = vm_map_min(kernel_map);
error = uvm_map(kernel_map, &lcp->lcp_kaddr, PAGE_SIZE,
uao, lp->lp_cur, PAGE_SIZE,
UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
UVM_INH_NONE, UVM_ADV_RANDOM, 0));
if (error != 0) {
mutex_exit(&lp->lp_lock);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
(*uao->pgops->pgo_detach)(uao);
return error;
}
error = uvm_map_pageable(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE, FALSE, 0);
if (error != 0) {
mutex_exit(&lp->lp_lock);
uvm_unmap(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
return error;
}
/* Prepare the page descriptor and link into the list. */
lcp->lcp_uaddr = lp->lp_uva + lp->lp_cur;
lp->lp_cur += PAGE_SIZE;
lcp->lcp_nfree = LWPCTL_PER_PAGE;
lcp->lcp_rotor = 0;
memset(lcp->lcp_bitmap, 0xff, LWPCTL_BITMAP_SZ);
TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain);
}
for (i = lcp->lcp_rotor; lcp->lcp_bitmap[i] == 0;) {
if (++i >= LWPCTL_BITMAP_ENTRIES)
i = 0;
}
bit = ffs(lcp->lcp_bitmap[i]) - 1;
lcp->lcp_bitmap[i] ^= (1U << bit);
lcp->lcp_rotor = i;
lcp->lcp_nfree--;
l->l_lcpage = lcp;
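/*
 * Each bitmap word covers 32 slots, hence the shift by 5 when forming
 * the slot index within the page.
 */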
offset = (i << 5) + bit;
l->l_lwpctl = (lwpctl_t *)lcp->lcp_kaddr + offset;
*uaddr = lcp->lcp_uaddr + offset * sizeof(lwpctl_t);
mutex_exit(&lp->lp_lock);
KPREEMPT_DISABLE(l);
l->l_lwpctl->lc_curcpu = (int)cpu_index(curcpu());
KPREEMPT_ENABLE(l);
return 0;
}
/*
* Free an lwpctl structure back to the per-process list.
*/
void
lwp_ctl_free(lwp_t *l)
{
struct proc *p = l->l_proc;
lcproc_t *lp;
lcpage_t *lcp;
u_int map, offset;
/* don't free a lwp context we borrowed for vfork */
if (p->p_lflag & PL_PPWAIT) {
l->l_lwpctl = NULL;
return;
}
lp = p->p_lwpctl;
KASSERT(lp != NULL);
lcp = l->l_lcpage;
offset = (u_int)((lwpctl_t *)l->l_lwpctl - (lwpctl_t *)lcp->lcp_kaddr);
KASSERT(offset < LWPCTL_PER_PAGE);
mutex_enter(&lp->lp_lock);
lcp->lcp_nfree++;
map = offset >> 5;
lcp->lcp_bitmap[map] |= (1U << (offset & 31));
if (lcp->lcp_bitmap[lcp->lcp_rotor] == 0)
lcp->lcp_rotor = map;
if (TAILQ_FIRST(&lp->lp_pages)->lcp_nfree == 0) {
TAILQ_REMOVE(&lp->lp_pages, lcp, lcp_chain);
TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain);
}
mutex_exit(&lp->lp_lock);
}
/*
* Process is exiting; tear down lwpctl state. This can only be safely
* called by the last LWP in the process.
*/
void
lwp_ctl_exit(void)
{
lcpage_t *lcp, *next;
lcproc_t *lp;
proc_t *p;
lwp_t *l;
l = curlwp;
l->l_lwpctl = NULL;
l->l_lcpage = NULL;
p = l->l_proc;
lp = p->p_lwpctl;
KASSERT(lp != NULL);
KASSERT(p->p_nlwps == 1);
for (lcp = TAILQ_FIRST(&lp->lp_pages); lcp != NULL; lcp = next) {
next = TAILQ_NEXT(lcp, lcp_chain);
uvm_unmap(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
}
if (lp->lp_uao != NULL) {
uvm_unmap(&p->p_vmspace->vm_map, lp->lp_uva,
lp->lp_uva + LWPCTL_UAREA_SZ);
}
mutex_destroy(&lp->lp_lock);
kmem_free(lp, sizeof(*lp));
p->p_lwpctl = NULL;
}
/*
* Return the current LWP's "preemption counter". Used to detect
* preemption across operations that can tolerate preemption without
* crashing, but which may generate incorrect results if preempted.
*
* We do arithmetic in unsigned long to avoid undefined behaviour in
* the event of arithmetic overflow on LP32, and issue __insn_barrier()
* on both sides so this can safely be used to detect changes to the
* preemption counter in loops around other memory accesses even in the
* event of whole-program optimization (e.g., gcc -flto).
*/
long
lwp_pctr(void)
{
unsigned long pctr;
__insn_barrier();
pctr = curlwp->l_ru.ru_nvcsw;
pctr += curlwp->l_ru.ru_nivcsw;
__insn_barrier();
return pctr;
}
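/*
 * Illustrative sketch (not part of the kernel proper): the usual
 * consumer pattern is to sample the counter, do work that tolerates
 * but must detect preemption, and retry if the counter moved.
 * compute_from_percpu_state() is a hypothetical placeholder.
 *
 *	long pctr;
 *
 *	do {
 *		pctr = lwp_pctr();
 *		result = compute_from_percpu_state();
 *	} while (pctr != lwp_pctr());
 */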
/*
* Set an LWP's private data pointer.
*/
int
lwp_setprivate(struct lwp *l, void *ptr)
{
int error = 0;
l->l_private = ptr;
#ifdef __HAVE_CPU_LWP_SETPRIVATE
error = cpu_lwp_setprivate(l, ptr);
#endif
return error;
}
/*
* Perform any thread-related cleanup on LWP exit.
* N.B. l->l_proc->p_lock must be HELD on entry but will
* be released before returning!
*/
void
lwp_thread_cleanup(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
mutex_exit(l->l_proc->p_lock);
/*
* If the LWP has robust futexes, release them all
* now.
*/
if (__predict_false(l->l_robust_head != 0)) {
futex_release_all_lwp(l);
}
}
#if defined(DDB)
#include <machine/pcb.h>
void
lwp_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
lwp_t *l;
LIST_FOREACH(l, &alllwp, l_list) {
uintptr_t stack = (uintptr_t)KSTACK_LOWEST_ADDR(l);
if (addr < stack || stack + KSTACK_SIZE <= addr) {
continue;
}
(*pr)("%p is %p+%zu, LWP %p's stack\n",
(void *)addr, (void *)stack,
(size_t)(addr - stack), l);
}
}
#endif /* defined(DDB) */
/* $NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $ */
/*
* Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_object.c: operate with memory objects
*
* TODO:
* 1. Support PG_RELEASED-using objects
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_page_array.h>
/* Page count to fetch per single step. */
#define FETCH_PAGECOUNT 16
/*
* uvm_obj_init: initialize UVM memory object.
*/
void
uvm_obj_init(struct uvm_object *uo, const struct uvm_pagerops *ops,
bool alock, u_int refs)
{
#if 0 /* notyet */
KASSERT(ops);
#endif
if (alock) {
/* Allocate and assign a lock. */
uo->vmobjlock = rw_obj_alloc();
} else {
/* The lock will need to be set via uvm_obj_setlock(). */
uo->vmobjlock = NULL;
}
uo->pgops = ops;
LIST_INIT(&uo->uo_ubc);
uo->uo_npages = 0;
uo->uo_refs = refs;
radix_tree_init_tree(&uo->uo_pages);
}
/*
* uvm_obj_destroy: destroy UVM memory object.
*/
void
uvm_obj_destroy(struct uvm_object *uo, bool dlock)
{
KASSERT(radix_tree_empty_tree_p(&uo->uo_pages));
/* Purge any UBC entries associated with this object. */
ubc_purge(uo);
/* Destroy the lock, if requested. */
if (dlock) {
rw_obj_free(uo->vmobjlock);
}
radix_tree_fini_tree(&uo->uo_pages);
}
/*
* uvm_obj_setlock: assign a vmobjlock to the UVM object.
*
* => Caller is responsible for ensuring that the UVM object is not in use.
* => Only a dynamic lock may have been previously set; its reference is dropped.
*/
void
uvm_obj_setlock(struct uvm_object *uo, krwlock_t *lockptr)
{
krwlock_t *olockptr = uo->vmobjlock;
if (olockptr) {
/* Drop the reference on the old lock. */
rw_obj_free(olockptr);
}
if (lockptr == NULL) {
/* If new lock is not passed - allocate default one. */
lockptr = rw_obj_alloc();
}
uo->vmobjlock = lockptr;
}
/*
* uvm_obj_wirepages: wire the pages of an entire UVM object.
*
* => NOTE: this function should only be used for types of objects
* where PG_RELEASED flag is never set (aobj objects)
* => caller must pass page-aligned start and end values
*/
int
uvm_obj_wirepages(struct uvm_object *uobj, off_t start, off_t end,
struct pglist *list)
{
int i, npages, error;
struct vm_page *pgs[FETCH_PAGECOUNT], *pg = NULL;
off_t offset = start, left;
left = (end - start) >> PAGE_SHIFT;
rw_enter(uobj->vmobjlock, RW_WRITER);
while (left) {
npages = MIN(FETCH_PAGECOUNT, left);
/* Get the pages */
memset(pgs, 0, sizeof(pgs));
error = (*uobj->pgops->pgo_get)(uobj, offset, pgs, &npages, 0,
VM_PROT_READ | VM_PROT_WRITE, UVM_ADV_SEQUENTIAL,
PGO_SYNCIO);
if (error)
goto error;
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
KASSERT(pgs[i] != NULL);
KASSERT(!(pgs[i]->flags & PG_RELEASED));
/*
* Loan break
*/
if (pgs[i]->loan_count) {
while (pgs[i]->loan_count) {
pg = uvm_loanbreak(pgs[i]);
if (!pg) {
rw_exit(uobj->vmobjlock);
uvm_wait("uobjwirepg");
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
}
pgs[i] = pg;
}
if (pgs[i]->flags & PG_AOBJ) {
uvm_pagemarkdirty(pgs[i],
UVM_PAGE_STATUS_DIRTY);
uao_dropswap(uobj, i);
}
}
/* Wire the pages */
for (i = 0; i < npages; i++) {
uvm_pagelock(pgs[i]);
uvm_pagewire(pgs[i]);
uvm_pageunlock(pgs[i]);
if (list != NULL)
TAILQ_INSERT_TAIL(list, pgs[i], pageq.queue);
}
/* Unbusy the pages */
uvm_page_unbusy(pgs, npages);
left -= npages;
offset += npages << PAGE_SHIFT;
}
rw_exit(uobj->vmobjlock);
return 0;
error:
/* Unwire the pages which have been wired */
uvm_obj_unwirepages(uobj, start, offset);
return error;
}
/*
* uvm_obj_unwirepages: unwire the pages of an entire UVM object.
*
* => NOTE: this function should only be used for types of objects
* where PG_RELEASED flag is never set
* => caller must pass page-aligned start and end values
*/
void
uvm_obj_unwirepages(struct uvm_object *uobj, off_t start, off_t end)
{
struct vm_page *pg;
off_t offset;
rw_enter(uobj->vmobjlock, RW_WRITER);
for (offset = start; offset < end; offset += PAGE_SIZE) {
pg = uvm_pagelookup(uobj, offset);
KASSERT(pg != NULL);
KASSERT(!(pg->flags & PG_RELEASED));
uvm_pagelock(pg);
uvm_pageunwire(pg);
uvm_pageunlock(pg);
}
rw_exit(uobj->vmobjlock);
}
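/*
 * Illustrative sketch (not part of the kernel proper): wiring a
 * page-aligned range of an anonymous object and undoing it later.
 * "size" is a hypothetical page-aligned length.
 *
 *	struct uvm_object *uao = uao_create(size, 0);
 *	int error;
 *
 *	error = uvm_obj_wirepages(uao, 0, size, NULL);
 *	if (error == 0) {
 *		... the pages are resident and wired ...
 *		uvm_obj_unwirepages(uao, 0, size);
 *	}
 *	uao_detach(uao);
 */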
static inline bool
uvm_obj_notag_p(struct uvm_object *uobj, int tag)
{
KASSERT(rw_lock_held(uobj->vmobjlock));
return radix_tree_empty_tagged_tree_p(&uobj->uo_pages, tag);
}
bool
uvm_obj_clean_p(struct uvm_object *uobj)
{
return uvm_obj_notag_p(uobj, UVM_PAGE_DIRTY_TAG);
}
bool
uvm_obj_nowriteback_p(struct uvm_object *uobj)
{
return uvm_obj_notag_p(uobj, UVM_PAGE_WRITEBACK_TAG);
}
static inline bool
uvm_obj_page_tag_p(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_lock_held(uobj->vmobjlock));
return radix_tree_get_tag(&uobj->uo_pages, pgidx, tag) != 0;
}
static inline void
uvm_obj_page_set_tag(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
radix_tree_set_tag(&uobj->uo_pages, pgidx, tag);
}
static inline void
uvm_obj_page_clear_tag(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
radix_tree_clear_tag(&uobj->uo_pages, pgidx, tag);
}
bool
uvm_obj_page_dirty_p(struct vm_page *pg)
{
return uvm_obj_page_tag_p(pg, UVM_PAGE_DIRTY_TAG);
}
void
uvm_obj_page_set_dirty(struct vm_page *pg)
{
uvm_obj_page_set_tag(pg, UVM_PAGE_DIRTY_TAG);
}
void
uvm_obj_page_clear_dirty(struct vm_page *pg)
{
uvm_obj_page_clear_tag(pg, UVM_PAGE_DIRTY_TAG);
}
bool
uvm_obj_page_writeback_p(struct vm_page *pg)
{
return uvm_obj_page_tag_p(pg, UVM_PAGE_WRITEBACK_TAG);
}
void
uvm_obj_page_set_writeback(struct vm_page *pg)
{
uvm_obj_page_set_tag(pg, UVM_PAGE_WRITEBACK_TAG);
}
void
uvm_obj_page_clear_writeback(struct vm_page *pg)
{
uvm_obj_page_clear_tag(pg, UVM_PAGE_WRITEBACK_TAG);
}
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_object_printit: actually prints the object
*/
void
uvm_object_printit(struct uvm_object *uobj, bool full,
void (*pr)(const char *, ...))
{
struct uvm_page_array a;
struct vm_page *pg;
int cnt = 0;
voff_t off;
(*pr)("OBJECT %p: locked=%d, pgops=%p, npages=%d, ",
uobj, rw_write_held(uobj->vmobjlock), uobj->pgops, uobj->uo_npages);
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
(*pr)("refs=<SYSTEM>\n");
else
(*pr)("refs=%d\n", uobj->uo_refs);
if (!full) {
return;
}
(*pr)(" PAGES <pg,offset>:\n ");
uvm_page_array_init(&a, uobj, 0);
off = 0;
while ((pg = uvm_page_array_fill_and_peek(&a, off, 0)) != NULL) {
cnt++;
(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
if ((cnt % 3) == 0) {
(*pr)("\n ");
}
off = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
}
if ((cnt % 3) != 0) {
(*pr)("\n");
}
uvm_page_array_fini(&a);
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: subr_ipi.c,v 1.11 2023/02/24 11:02:27 riastradh Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Inter-processor interrupt (IPI) interface: asynchronous IPIs to
* invoke functions with a constant argument and synchronous IPIs
* with the cross-call support.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_ipi.c,v 1.11 2023/02/24 11:02:27 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/evcnt.h>
#include <sys/cpu.h>
#include <sys/ipi.h>
#include <sys/intr.h>
#include <sys/kcpuset.h>
#include <sys/kmem.h>
#include <sys/lock.h>
#include <sys/mutex.h>
/*
* An array of the IPI handlers used for asynchronous invocation.
* The lock protects the slot allocation.
*/
typedef struct {
ipi_func_t func;
void * arg;
} ipi_intr_t;
static kmutex_t ipi_mngmt_lock;
static ipi_intr_t ipi_intrs[IPI_MAXREG] __cacheline_aligned;
/*
* Per-CPU mailbox for IPI messages: it is a single cache line storing
* up to IPI_MSG_MAX messages. This interface is built on top of the
* synchronous IPIs.
*/
#define IPI_MSG_SLOTS (CACHE_LINE_SIZE / sizeof(ipi_msg_t *))
#define IPI_MSG_MAX IPI_MSG_SLOTS
typedef struct {
ipi_msg_t * msg[IPI_MSG_SLOTS];
} ipi_mbox_t;
/* Mailboxes for the synchronous IPIs. */
static ipi_mbox_t * ipi_mboxes __read_mostly;
static struct evcnt ipi_mboxfull_ev __cacheline_aligned;
static void ipi_msg_cpu_handler(void *);
/* Handler for the synchronous IPIs - it must be zero. */
#define IPI_SYNCH_ID 0
#ifndef MULTIPROCESSOR
#define cpu_ipi(ci) KASSERT(ci == NULL)
#endif
void
ipi_sysinit(void)
{
mutex_init(&ipi_mngmt_lock, MUTEX_DEFAULT, IPL_NONE);
memset(ipi_intrs, 0, sizeof(ipi_intrs));
/*
* Register the handler for synchronous IPIs. This mechanism
* is built on top of the asynchronous interface. Slot zero is
* reserved permanently; it is also handy to use zero as a failure
* value for other registrations (as that is potentially less error-prone).
*/
ipi_intrs[IPI_SYNCH_ID].func = ipi_msg_cpu_handler;
evcnt_attach_dynamic(&ipi_mboxfull_ev, EVCNT_TYPE_MISC, NULL,
"ipi", "full");
}
void
ipi_percpu_init(void)
{
const size_t len = ncpu * sizeof(ipi_mbox_t);
/* Initialise the per-CPU bit fields. */
for (u_int i = 0; i < ncpu; i++) {
struct cpu_info *ci = cpu_lookup(i);
memset(&ci->ci_ipipend, 0, sizeof(ci->ci_ipipend));
}
/* Allocate per-CPU IPI mailboxes. */
ipi_mboxes = kmem_zalloc(len, KM_SLEEP);
KASSERT(ipi_mboxes != NULL);
}
/*
* ipi_register: register an asynchronous IPI handler.
*
* => Returns IPI ID which is greater than zero; on failure - zero.
*/
u_int
ipi_register(ipi_func_t func, void *arg)
{
mutex_enter(&ipi_mngmt_lock);
for (u_int i = 0; i < IPI_MAXREG; i++) {
if (ipi_intrs[i].func == NULL) {
/* Register the function. */
ipi_intrs[i].func = func;
ipi_intrs[i].arg = arg;
mutex_exit(&ipi_mngmt_lock);
KASSERT(i != IPI_SYNCH_ID);
return i;
}
}
mutex_exit(&ipi_mngmt_lock);
printf("WARNING: ipi_register: table full, increase IPI_MAXREG\n");
return 0;
}
/*
* ipi_unregister: release the IPI handler given the ID.
*/
void
ipi_unregister(u_int ipi_id)
{
ipi_msg_t ipimsg = { .func = __FPTRCAST(ipi_func_t, nullop) };
KASSERT(ipi_id != IPI_SYNCH_ID);
KASSERT(ipi_id < IPI_MAXREG);
/* Release the slot. */
mutex_enter(&ipi_mngmt_lock);
KASSERT(ipi_intrs[ipi_id].func != NULL);
ipi_intrs[ipi_id].func = NULL;
/* Ensure that there are no IPIs in flight. */
kpreempt_disable();
ipi_broadcast(&ipimsg, false);
ipi_wait(&ipimsg);
kpreempt_enable();
mutex_exit(&ipi_mngmt_lock);
}
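/*
 * Illustrative sketch (not part of the kernel proper) of the
 * asynchronous interface: register a handler once, trigger it on a
 * remote CPU with preemption disabled, and release the slot when done.
 * my_ipi_handler and its argument are hypothetical placeholders.
 *
 *	static void
 *	my_ipi_handler(void *arg)
 *	{
 *		... runs on the target CPU from ipi_cpu_handler() ...
 *	}
 *
 *	u_int id = ipi_register(my_ipi_handler, arg);
 *
 *	kpreempt_disable();
 *	ipi_trigger(id, ci);	... ci must not be curcpu() ...
 *	kpreempt_enable();
 *
 *	ipi_unregister(id);
 */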
/*
* ipi_mark_pending: internal routine to mark an IPI pending on the
* specified CPU (which might be curcpu()).
*/
static bool
ipi_mark_pending(u_int ipi_id, struct cpu_info *ci)
{
const u_int i = ipi_id >> IPI_BITW_SHIFT;
const uint32_t bitm = 1U << (ipi_id & IPI_BITW_MASK);
KASSERT(ipi_id < IPI_MAXREG);
KASSERT(kpreempt_disabled());
/* Mark as pending and return true if not previously marked. */
if ((atomic_load_acquire(&ci->ci_ipipend[i]) & bitm) == 0) {
membar_release();
atomic_or_32(&ci->ci_ipipend[i], bitm);
return true;
}
return false;
}
/*
* ipi_trigger: asynchronously send an IPI to the specified CPU.
*/
void
ipi_trigger(u_int ipi_id, struct cpu_info *ci)
{
KASSERT(curcpu() != ci);
if (ipi_mark_pending(ipi_id, ci)) {
cpu_ipi(ci);
}
}
/*
* ipi_trigger_multi_internal: the guts of ipi_trigger_multi() and
* ipi_trigger_broadcast().
*/
static void
ipi_trigger_multi_internal(u_int ipi_id, const kcpuset_t *target,
bool skip_self)
{
const cpuid_t selfid = cpu_index(curcpu());
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(kpreempt_disabled());
KASSERT(target != NULL);
for (CPU_INFO_FOREACH(cii, ci)) {
const cpuid_t cpuid = cpu_index(ci);
if (!kcpuset_isset(target, cpuid) || cpuid == selfid) {
continue;
}
ipi_trigger(ipi_id, ci);
}
if (!skip_self && kcpuset_isset(target, selfid)) {
ipi_mark_pending(ipi_id, curcpu());
int s = splhigh();
ipi_cpu_handler();
splx(s);
}
}
/*
* ipi_trigger_multi: same as ipi_trigger() but sends to the multiple
* CPUs given the target CPU set.
*/
void
ipi_trigger_multi(u_int ipi_id, const kcpuset_t *target)
{
ipi_trigger_multi_internal(ipi_id, target, false);
}
/*
* ipi_trigger_broadcast: same as ipi_trigger_multi() to kcpuset_attached,
* optionally skipping the sending CPU.
*/
void
ipi_trigger_broadcast(u_int ipi_id, bool skip_self)
{
ipi_trigger_multi_internal(ipi_id, kcpuset_attached, skip_self);
}
/*
* put_msg: insert message into the mailbox.
*
* Caller is responsible for issuing membar_release first.
*/
static inline void
put_msg(ipi_mbox_t *mbox, ipi_msg_t *msg)
{
int count = SPINLOCK_BACKOFF_MIN;
again:
for (u_int i = 0; i < IPI_MSG_MAX; i++) {
if (atomic_cas_ptr(&mbox->msg[i], NULL, msg) == NULL) {
return;
}
}
/* All slots are full: we have to spin-wait. */
ipi_mboxfull_ev.ev_count++;
SPINLOCK_BACKOFF(count);
goto again;
}
/*
* ipi_cpu_handler: the IPI handler.
*/
void
ipi_cpu_handler(void)
{
struct cpu_info * const ci = curcpu();
/*
* Handle asynchronous IPIs: inspect per-CPU bit field, extract
* IPI ID numbers and execute functions in those slots.
*/
for (u_int i = 0; i < IPI_BITWORDS; i++) {
uint32_t pending, bit;
if (atomic_load_relaxed(&ci->ci_ipipend[i]) == 0) {
continue;
}
pending = atomic_swap_32(&ci->ci_ipipend[i], 0);
membar_acquire();
while ((bit = ffs(pending)) != 0) {
const u_int ipi_id = (i << IPI_BITW_SHIFT) | --bit;
ipi_intr_t *ipi_hdl = &ipi_intrs[ipi_id];
pending &= ~(1U << bit);
KASSERT(ipi_hdl->func != NULL);
ipi_hdl->func(ipi_hdl->arg);
}
}
}
/*
* ipi_msg_cpu_handler: handle synchronous IPIs - iterate mailbox,
* execute the passed functions and acknowledge the messages.
*/
static void
ipi_msg_cpu_handler(void *arg __unused)
{
const struct cpu_info * const ci = curcpu();
ipi_mbox_t *mbox = &ipi_mboxes[cpu_index(ci)];
for (u_int i = 0; i < IPI_MSG_MAX; i++) {
ipi_msg_t *msg;
/* Get the message. */
if ((msg = atomic_load_acquire(&mbox->msg[i])) == NULL) {
continue;
}
atomic_store_relaxed(&mbox->msg[i], NULL);
/* Execute the handler. */
KASSERT(msg->func);
msg->func(msg->arg);
/* Ack the request. */
membar_release();
atomic_dec_uint(&msg->_pending);
}
}
/*
* ipi_unicast: send an IPI to a single CPU.
*
* => The CPU must be remote; must not be local.
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_unicast(ipi_msg_t *msg, struct cpu_info *ci)
{
const cpuid_t id = cpu_index(ci);
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
msg->_pending = 1;
membar_release();
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
/*
* ipi_multicast: send an IPI to each CPU in the specified set.
*
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_multicast(ipi_msg_t *msg, const kcpuset_t *target)
{
const struct cpu_info * const self = curcpu();
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
u_int local;
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
local = !!kcpuset_isset(target, cpu_index(self));
msg->_pending = kcpuset_countset(target) - local;
membar_release();
for (CPU_INFO_FOREACH(cii, ci)) {
cpuid_t id;
if (__predict_false(ci == self)) {
continue;
}
id = cpu_index(ci);
if (!kcpuset_isset(target, id)) {
continue;
}
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
if (local) {
msg->func(msg->arg);
}
}
/*
* ipi_broadcast: send an IPI to all CPUs.
*
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_broadcast(ipi_msg_t *msg, bool skip_self)
{
const struct cpu_info * const self = curcpu();
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
msg->_pending = ncpu - 1;
membar_release();
/* Broadcast IPIs for remote CPUs. */
for (CPU_INFO_FOREACH(cii, ci)) {
cpuid_t id;
if (__predict_false(ci == self)) {
continue;
}
id = cpu_index(ci);
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
if (!skip_self) {
/* Finally, execute locally. */
msg->func(msg->arg);
}
}
/*
* ipi_wait: spin-wait until the message is processed.
*/
void
ipi_wait(ipi_msg_t *msg)
{
int count = SPINLOCK_BACKOFF_MIN;
while (atomic_load_acquire(&msg->_pending)) {
KASSERT(atomic_load_relaxed(&msg->_pending) < ncpu);
SPINLOCK_BACKOFF(count);
}
}
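/*
 * Illustrative sketch (not part of the kernel proper) of a synchronous
 * cross-call to one remote CPU using the message interface above.
 * do_on_cpu() and cookie are hypothetical placeholders.
 *
 *	ipi_msg_t msg = { .func = do_on_cpu, .arg = cookie };
 *
 *	kpreempt_disable();
 *	ipi_unicast(&msg, ci);	... ci must not be curcpu() ...
 *	ipi_wait(&msg);
 *	kpreempt_enable();
 */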
/* $NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $");
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/atomic.h>
/*
* Bits per map.
*/
#define BITMAP_BITS (32)
#define BITMAP_SHIFT (5)
#define BITMAP_MSB (0x80000000U)
#define BITMAP_MASK (BITMAP_BITS - 1)
const int schedppq = 1;
static void *sched_getrq(struct schedstate_percpu *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t * sched_catchlwp(struct cpu_info *);
#endif
/*
* Preemption control.
*/
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int sched_kpreempt_pri = 0;
# else
int sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int sched_kpreempt_pri = 1000;
#endif
/*
* Migration and balancing.
*/
static u_int cacheht_time; /* Cache hotness time */
static u_int min_catch; /* Minimal LWP count for catching */
static u_int skim_interval; /* Rate limit for stealing LWPs */
#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif
void
runq_init(void)
{
/* Pulling from remote packages, LWP must not have run for 10ms. */
cacheht_time = 10;
/* Minimal count of LWPs for catching */
min_catch = 1;
/* Steal from other CPUs at most every 10ms. */
skim_interval = 10;
}
void
sched_cpuattach(struct cpu_info *ci)
{
struct schedstate_percpu *spc;
size_t size;
void *p;
u_int i;
spc = &ci->ci_schedstate;
spc->spc_nextpkg = ci;
if (spc->spc_lwplock == NULL) {
spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
}
if (ci == lwp0.l_cpu) {
/* Initialize the scheduler structure of the primary LWP */
lwp0.l_mutex = spc->spc_lwplock;
}
if (spc->spc_mutex != NULL) {
/* Already initialized. */
return;
}
/* Allocate the run queue */
size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
coherency_unit;
p = kmem_alloc(size, KM_SLEEP);
spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);
/* Initialize run queues */
spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
for (i = 0; i < PRI_COUNT; i++)
TAILQ_INIT(&spc->spc_queue[i]);
}
/*
* Control of the runqueue.
*/
static inline void *
sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
{
KASSERT(prio < PRI_COUNT);
return &spc->spc_queue[prio];
}
/*
* Put an LWP onto a run queue. The LWP must be locked by spc_mutex for
* l_cpu.
*/
void
sched_enqueue(struct lwp *l)
{
struct schedstate_percpu *spc;
TAILQ_HEAD(, lwp) *q_head;
const pri_t eprio = lwp_eprio(l);
struct cpu_info *ci;
ci = l->l_cpu;
spc = &ci->ci_schedstate;
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
/* Enqueue the thread */
q_head = sched_getrq(spc, eprio);
if (TAILQ_EMPTY(q_head)) {
u_int i;
uint32_t q;
/* Mark bit */
i = eprio >> BITMAP_SHIFT;
q = BITMAP_MSB >> (eprio & BITMAP_MASK);
KASSERT((spc->spc_bitmap[i] & q) == 0);
spc->spc_bitmap[i] |= q;
}
/*
* Determine run queue position according to POSIX. XXX Explicitly
* lowering a thread's priority with pthread_setschedparam() is not
* handled.
*/
if ((l->l_pflag & LP_PREEMPTING) != 0) {
switch (l->l_class) {
case SCHED_OTHER:
TAILQ_INSERT_TAIL(q_head, l, l_runq);
break;
case SCHED_FIFO:
TAILQ_INSERT_HEAD(q_head, l, l_runq);
break;
case SCHED_RR:
if (getticks() - l->l_rticks >= sched_rrticks) {
TAILQ_INSERT_TAIL(q_head, l, l_runq);
} else {
TAILQ_INSERT_HEAD(q_head, l, l_runq);
}
break;
default:
panic("sched_enqueue: LWP %p has class %d\n",
l, l->l_class);
}
} else {
TAILQ_INSERT_TAIL(q_head, l, l_runq);
}
spc->spc_flags &= ~SPCF_IDLE;
spc->spc_count++;
if ((l->l_pflag & LP_BOUND) == 0) {
atomic_store_relaxed(&spc->spc_mcount,
atomic_load_relaxed(&spc->spc_mcount) + 1);
}
/*
* Update the value of highest priority in the runqueue,
* if priority of this thread is higher.
*/
if (eprio > spc->spc_maxpriority)
spc->spc_maxpriority = eprio;
sched_newts(l);
}
/*
* Remove an LWP from the run queue it's on. The LWP must be in state
* LSRUN.
*/
void
sched_dequeue(struct lwp *l)
{
TAILQ_HEAD(, lwp) *q_head;
struct schedstate_percpu *spc;
const pri_t eprio = lwp_eprio(l);
spc = &l->l_cpu->ci_schedstate;
KASSERT(lwp_locked(l, spc->spc_mutex));
KASSERT(eprio <= spc->spc_maxpriority);
KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
KASSERT(spc->spc_count > 0);
if (spc->spc_migrating == l)
spc->spc_migrating = NULL;
spc->spc_count--;
if ((l->l_pflag & LP_BOUND) == 0) {
atomic_store_relaxed(&spc->spc_mcount,
atomic_load_relaxed(&spc->spc_mcount) - 1);
}
q_head = sched_getrq(spc, eprio);
TAILQ_REMOVE(q_head, l, l_runq);
if (TAILQ_EMPTY(q_head)) {
u_int i;
uint32_t q;
/* Unmark bit */
i = eprio >> BITMAP_SHIFT;
q = BITMAP_MSB >> (eprio & BITMAP_MASK);
KASSERT((spc->spc_bitmap[i] & q) != 0);
spc->spc_bitmap[i] &= ~q;
/*
* Update the value of the highest priority in the runqueue, in
* case it was the last thread in the queue of highest priority.
*/
if (eprio != spc->spc_maxpriority)
return;
do {
if (spc->spc_bitmap[i] != 0) {
q = ffs(spc->spc_bitmap[i]);
spc->spc_maxpriority =
(i << BITMAP_SHIFT) + (BITMAP_BITS - q);
return;
}
} while (i--);
/* If not found - set the lowest value */
spc->spc_maxpriority = 0;
}
}
/*
* Cause a preemption on the given CPU, if the priority "pri" is higher
* priority than the running LWP. If "unlock" is specified, and ideally it
* will be for concurrency reasons, spc_mutex will be dropped before return.
*/
void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{
struct schedstate_percpu *spc;
u_int o, n, f;
lwp_t *l;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(spc->spc_mutex));
/*
* If the priority level we're evaluating wouldn't cause a new LWP
* to be run on the CPU, then we have nothing to do.
*/
if (pri <= spc->spc_curpriority || !mp_online) {
if (__predict_true(unlock)) {
spc_unlock(ci);
}
return;
}
/*
* Figure out what kind of preemption we should do.
*/
l = ci->ci_onproc;
if ((l->l_flag & LW_IDLE) != 0) {
f = RESCHED_IDLE | RESCHED_UPREEMPT;
} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
/* We can't currently preempt softints - should be able to. */
#ifdef __HAVE_PREEMPTION
f = RESCHED_KPREEMPT;
#else
/* Leave door open for test: set kpreempt_pri with sysctl. */
f = RESCHED_UPREEMPT;
#endif
/*
* l_dopreempt must be set with the CPU locked to sync with
* mi_switch(). It must also be set with an atomic to sync
* with kpreempt().
*/
atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
} else {
f = RESCHED_UPREEMPT;
}
if (ci != curcpu()) {
f |= RESCHED_REMOTE;
}
/*
* Things can start as soon as ci_want_resched is touched: x86 has
* an instruction that monitors the memory cell it's in. Drop the
* schedstate lock in advance, otherwise the remote CPU can awaken
* and immediately block on the lock.
*/
if (__predict_true(unlock)) {
spc_unlock(ci);
}
/*
* The caller almost always has a second scheduler lock held: either
* the running LWP lock (spc_lwplock), or a sleep queue lock. That
* keeps preemption disabled, which among other things ensures all
* LWPs involved won't be freed while we're here (see lwp_dtor()).
*/
KASSERT(kpreempt_disabled());
for (o = 0;; o = n) {
n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
if (__predict_true(o == n)) {
/*
* We're the first to set a resched on the CPU. Try
* to avoid causing a needless trip through trap()
* to handle an AST fault, if it's known the LWP
* will either block or go through userret() soon.
*/
if (l != curlwp || cpu_intr_p()) {
cpu_need_resched(ci, l, f);
}
break;
}
if (__predict_true(
(n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
(f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
/* Already in progress, nothing to do. */
break;
}
}
}
/*
* Cause a preemption on the given CPU, if the priority of LWP "l" in state
* LSRUN, is higher priority than the running LWP. If "unlock" is
* specified, and ideally it will be for concurrency reasons, spc_mutex will
* be dropped before return.
*/
void
sched_resched_lwp(struct lwp *l, bool unlock)
{
struct cpu_info *ci = l->l_cpu;
KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
KASSERT(l->l_stat == LSRUN);
sched_resched_cpu(ci, lwp_eprio(l), unlock);
}
/*
* Migration and balancing.
*/
#ifdef MULTIPROCESSOR
/*
* Estimate if LWP is cache-hot.
*/
static inline bool
lwp_cache_hot(const struct lwp *l)
{
/* Leave new LWPs in peace, determination has already been made. */
if (l->l_stat == LSIDL)
return true;
if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
return false;
return (getticks() - l->l_rticks < mstohz(cacheht_time));
}
/*
* Check if LWP can migrate to the chosen CPU.
*/
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
const struct schedstate_percpu *spc = &ci->ci_schedstate;
KASSERT(lwp_locked(__UNCONST(l), NULL));
/* Is CPU offline? */
if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
return false;
/* Is affinity set? */
if (__predict_false(l->l_affinity))
return kcpuset_isset(l->l_affinity, cpu_index(ci));
/* Is there a processor-set? */
return (spc->spc_psid == l->l_psid);
}
/*
* A small helper to do round robin through CPU packages.
*/
static struct cpu_info *
sched_nextpkg(void)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
spc->spc_nextpkg =
spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];
return spc->spc_nextpkg;
}
/*
* Find a CPU to run LWP "l". Look for the CPU with the lowest priority
* thread. In case of equal priority, prefer first class CPUs, and amongst
* the remainder choose the CPU with the fewest runqueue entries.
*
* Begin the search in the CPU package which "pivot" is a member of.
*/
static struct cpu_info * __noinline
sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
{
struct cpu_info *bestci, *curci, *outer;
struct schedstate_percpu *bestspc, *curspc;
pri_t bestpri, curpri;
/*
* If this fails (it shouldn't), run on the given CPU. This also
* gives us a weak preference for "pivot" to begin with.
*/
bestci = pivot;
bestspc = &bestci->ci_schedstate;
if (sched_migratable(l, bestci)) {
bestpri = MAX(bestspc->spc_curpriority,
bestspc->spc_maxpriority);
} else {
/* Invalidate the priority. */
bestpri = PRI_COUNT;
}
/* In the outer loop scroll through all CPU packages. */
pivot = pivot->ci_package1st;
outer = pivot;
do {
/* In the inner loop scroll through all CPUs in package. */
curci = outer;
do {
if (!sched_migratable(l, curci)) {
continue;
}
curspc = &curci->ci_schedstate;
/* If this CPU is idle and 1st class, we're done. */
if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
(SPCF_IDLE | SPCF_1STCLASS)) {
return curci;
}
curpri = MAX(curspc->spc_curpriority,
curspc->spc_maxpriority);
if (curpri > bestpri) {
continue;
}
if (curpri == bestpri) {
/* Prefer first class CPUs over others. */
if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
(bestspc->spc_flags & SPCF_1STCLASS) != 0) {
continue;
}
/*
* Pick the least busy CPU. Make sure this is not
* <=, otherwise it defeats the above preference.
*/
if (bestspc->spc_count < curspc->spc_count) {
continue;
}
}
bestpri = curpri;
bestci = curci;
bestspc = curspc;
} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
curci != outer);
} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
outer != pivot);
return bestci;
}
/*
* Estimate the migration of LWP to the other CPU.
* Take and return the CPU, if migration is needed.
*/
struct cpu_info *
sched_takecpu(struct lwp *l)
{
struct schedstate_percpu *spc, *tspc;
struct cpu_info *ci, *curci, *tci;
pri_t eprio;
int flags;
KASSERT(lwp_locked(l, NULL));
/* If thread is strictly bound, do not estimate other CPUs */
ci = l->l_cpu;
if (l->l_pflag & LP_BOUND)
return ci;
spc = &ci->ci_schedstate;
eprio = lwp_eprio(l);
/*
* Handle new LWPs. For vfork() with a timeshared child, make it
* run on the same CPU as the parent if no other LWPs in queue.
* Otherwise scatter far and wide - try for an even distribution
* across all CPU packages and CPUs.
*/
if (l->l_stat == LSIDL) {
if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
if (sched_migratable(l, curlwp->l_cpu) && eprio >
curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
return curlwp->l_cpu;
}
} else {
return sched_bestcpu(l, sched_nextpkg());
}
flags = SPCF_IDLE;
} else {
flags = SPCF_IDLE | SPCF_1STCLASS;
}
/*
* Try to send the LWP back to the first CPU in the same core if
* idle. This keeps LWPs clustered in the run queues of 1st class
* CPUs. This implies stickiness. If we didn't find a home for
* a vfork() child above, try to use any SMT sibling to help out.
*/
tci = ci;
do {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
return tci;
}
tci = tci->ci_sibling[CPUREL_CORE];
} while (tci != ci);
/*
* Otherwise the LWP is "sticky", i.e. generally preferring to stay
* on the same CPU.
*/
if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
(lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
return ci;
}
/*
* If the current CPU core is idle, run there and avoid the
* expensive scan of CPUs below.
*/
curci = curcpu();
tci = curci;
do {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
return tci;
}
tci = tci->ci_sibling[CPUREL_CORE];
} while (tci != curci);
/*
* Didn't find a new home above - happens infrequently. Start the
* search in last CPU package that the LWP ran in, but expand to
* include the whole system if needed.
*/
return sched_bestcpu(l, l->l_cpu);
}
/*
* Tries to catch an LWP from the runqueue of another CPU.
*/
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
struct cpu_info *curci = curcpu();
struct schedstate_percpu *spc, *curspc;
TAILQ_HEAD(, lwp) *q_head;
struct lwp *l;
bool gentle;
curspc = &curci->ci_schedstate;
spc = &ci->ci_schedstate;
/*
* Be more aggressive if this CPU is first class, and the other
* is not.
*/
gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 ||
(spc->spc_flags & SPCF_1STCLASS) != 0);
if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) ||
curspc->spc_psid != spc->spc_psid) {
spc_unlock(ci);
return NULL;
}
/* Take the highest priority thread */
q_head = sched_getrq(spc, spc->spc_maxpriority);
l = TAILQ_FIRST(q_head);
for (;;) {
/* Check the first and next result from the queue */
if (l == NULL) {
break;
}
KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
ci->ci_data.cpu_name,
l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);
/* Look for threads that are allowed to migrate. */
if ((l->l_pflag & LP_BOUND) ||
(gentle && lwp_cache_hot(l)) ||
!sched_migratable(l, curci)) {
l = TAILQ_NEXT(l, l_runq);
/* XXX Gap: could walk down priority list. */
continue;
}
/* Grab the thread, and move to the local run queue */
sched_dequeue(l);
l->l_cpu = curci;
lwp_unlock_to(l, curspc->spc_mutex);
sched_enqueue(l);
return l;
}
spc_unlock(ci);
return l;
}
/*
* Called from sched_idle() to handle migration. Return the CPU that we
* pushed the LWP to (may be NULL).
*/
static struct cpu_info *
sched_idle_migrate(void)
{
struct cpu_info *ci = curcpu(), *tci = NULL;
struct schedstate_percpu *spc, *tspc;
bool dlock = false;
spc = &ci->ci_schedstate;
spc_lock(ci);
for (;;) {
struct lwp *l;
l = spc->spc_migrating;
if (l == NULL)
break;
/*
* If second attempt, and target CPU has changed,
* drop the old lock.
*/
if (dlock == true && tci != l->l_target_cpu) {
KASSERT(tci != NULL);
spc_unlock(tci);
dlock = false;
}
/*
* Nothing to do if destination has changed to the
* local CPU, or migration was done by other CPU.
*/
tci = l->l_target_cpu;
if (tci == NULL || tci == ci) {
spc->spc_migrating = NULL;
l->l_target_cpu = NULL;
break;
}
tspc = &tci->ci_schedstate;
/*
* Double-lock the runqueues.
* We do that only once.
*/
if (dlock == false) {
dlock = true;
if (ci < tci) {
spc_lock(tci);
} else if (!mutex_tryenter(tspc->spc_mutex)) {
spc_unlock(ci);
spc_lock(tci);
spc_lock(ci);
/* Check the situation again.. */
continue;
}
}
/* Migrate the thread */
KASSERT(l->l_stat == LSRUN);
spc->spc_migrating = NULL;
l->l_target_cpu = NULL;
sched_dequeue(l);
l->l_cpu = tci;
lwp_setlock(l, tspc->spc_mutex);
sched_enqueue(l);
sched_resched_lwp(l, true);
/* tci now unlocked */
spc_unlock(ci);
return tci;
}
if (dlock == true) {
KASSERT(tci != NULL);
spc_unlock(tci);
}
spc_unlock(ci);
return NULL;
}
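/*
 * The double-lock above avoids deadlock by ordering the two run queue
 * locks: when the local CPU sorts below the target it can simply block on
 * the second lock, otherwise it must trylock and, on failure, release its
 * own lock and retake both in order before re-checking the migration
 * state. The same discipline in a standalone pthread sketch (hypothetical
 * helper, not the kernel's spc_dlock()):
 */
#if 0	/* illustrative sketch only; not compiled */
#include <pthread.h>

/* Acquire "other" while already holding "mine", without deadlocking. */
static void
lock_second(pthread_mutex_t *mine, pthread_mutex_t *other)
{

	if (mine < other) {
		/* Already holding the lower-ordered lock: safe to block. */
		pthread_mutex_lock(other);
	} else if (pthread_mutex_trylock(other) != 0) {
		/* Wrong order and contended: back off and retake in order. */
		pthread_mutex_unlock(mine);
		pthread_mutex_lock(other);
		pthread_mutex_lock(mine);
		/* Caller must re-validate any state read before backing off. */
	}
}
#endif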
/*
* Try to steal an LWP from "tci".
*/
static bool
sched_steal(struct cpu_info *ci, struct cpu_info *tci)
{
struct schedstate_percpu *spc, *tspc;
lwp_t *l;
spc = &ci->ci_schedstate;
tspc = &tci->ci_schedstate;
if (atomic_load_relaxed(&tspc->spc_mcount) != 0 &&
spc->spc_psid == tspc->spc_psid) {
spc_dlock(ci, tci);
l = sched_catchlwp(tci);
spc_unlock(ci);
if (l != NULL) {
return true;
}
}
return false;
}
/*
* Called from each CPU's idle loop.
*/
void
sched_idle(void)
{
struct cpu_info *ci, *inner, *outer, *first, *tci, *mci;
struct schedstate_percpu *spc, *tspc;
struct lwp *l;
ci = curcpu();
spc = &ci->ci_schedstate;
tci = NULL;
mci = NULL;
/*
* Handle LWP migrations off this CPU to another. If there is a
* migration to do then remember the CPU the LWP was sent to, and
* don't steal the LWP back from that CPU below.
*/
if (spc->spc_migrating != NULL) {
mci = sched_idle_migrate();
}
/* If this CPU is offline, or we have an LWP to run, we're done. */
if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
return;
}
/* Deal with SMT. */
if (ci->ci_nsibling[CPUREL_CORE] > 1) {
/* Try to help our siblings out. */
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
if (tci != mci && sched_steal(ci, tci)) {
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
/*
* If not the first SMT in the core, and in the default
* processor set, the search ends here.
*/
if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
spc->spc_psid == PS_NONE) {
return;
}
}
/*
* Find something to run, unless this CPU exceeded the rate limit.
* Start looking on the current package to maximise L2/L3 cache
* locality. Then expand to looking at the rest of the system.
*
* XXX Should probably look at 2nd class CPUs first, but they will
* shed jobs via preempt() anyway.
*/
if (spc->spc_nextskim > getticks()) {
return;
}
spc->spc_nextskim = getticks() + mstohz(skim_interval);
/* In the outer loop scroll through all CPU packages, starting here. */
first = ci->ci_package1st;
outer = first;
do {
/* In the inner loop scroll through all CPUs in package. */
inner = outer;
do {
/* Don't hit the locks unless needed. */
tspc = &inner->ci_schedstate;
if (ci == inner || ci == mci ||
spc->spc_psid != tspc->spc_psid ||
atomic_load_relaxed(&tspc->spc_mcount) < min_catch) {
continue;
}
spc_dlock(ci, inner);
l = sched_catchlwp(inner);
spc_unlock(ci);
if (l != NULL) {
/* Got it! */
return;
}
} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
inner != outer);
} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
outer != first);
}
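/*
 * The skim limiter above is a "next allowed tick" stamp: the scan is
 * skipped while getticks() has not yet reached spc_nextskim, and each scan
 * pushes the stamp skim_interval milliseconds into the future. The same
 * pattern as a standalone sketch, with a plain tick counter standing in
 * for getticks() (names are made up for the example):
 */
#if 0	/* illustrative sketch only; not compiled */
static unsigned next_allowed;		/* stand-in for spc_nextskim */

/* Return nonzero if the expensive scan may run at tick "now". */
static int
may_scan(unsigned now, unsigned interval_ticks)
{

	if (next_allowed > now)
		return 0;		/* still rate limited */
	next_allowed = now + interval_ticks;
	return 1;
}
#endif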
/*
* Called from mi_switch() when an LWP has been preempted / has yielded.
* The LWP is presently in the CPU's run queue. Here we look for a better
* CPU to teleport the LWP to; there may not be one.
*/
void
sched_preempted(struct lwp *l)
{
const int flags = SPCF_IDLE | SPCF_1STCLASS;
struct schedstate_percpu *tspc;
struct cpu_info *ci, *tci;
ci = l->l_cpu;
tspc = &ci->ci_schedstate;
KASSERT(tspc->spc_count >= 1);
/*
* Try to select another CPU if:
*
* - there is no migration pending already
* - and this LWP is running on a 2nd class CPU
* - or this LWP is a child of vfork() that has just done execve()
*/
if (l->l_target_cpu != NULL || ((tspc->spc_flags & SPCF_1STCLASS) != 0 &&
(l->l_pflag & LP_TELEPORT) == 0)) {
return;
}
/*
* Fast path: if the first SMT in the core is idle, send it back
* there, because the cache is shared (cheap) and we want all LWPs
* to be clustered on 1st class CPUs (either running there or on
* their runqueues).
*/
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
l->l_target_cpu = tci;
l->l_pflag &= ~LP_TELEPORT;
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
if ((l->l_pflag & LP_TELEPORT) != 0) {
/*
* A child of vfork(): now that the parent is released,
* scatter far and wide, to match the LSIDL distribution
* done in sched_takecpu().
*/
l->l_pflag &= ~LP_TELEPORT;
tci = sched_bestcpu(l, sched_nextpkg());
if (tci != ci) {
l->l_target_cpu = tci;
}
} else {
/*
* Try to find a better CPU to take it, but don't move to
* another 2nd class CPU, and don't move to a non-idle CPU,
* because that would prevent SMT being used to maximise
* throughput.
*
* Search in the current CPU package in order to try and
* keep L2/L3 cache locality, but expand to include the
* whole system if needed.
*/
tci = sched_bestcpu(l, l->l_cpu);
if (tci != ci &&
(tci->ci_schedstate.spc_flags & flags) == flags) {
l->l_target_cpu = tci;
}
}
}
/*
* Called during execve() by a child of vfork(). Does two things:
*
* - If the parent has been awoken and put back on curcpu then give the
* CPU back to the parent.
*
* - If curlwp is not on a 1st class CPU then find somewhere else to run,
* since it dodged the distribution in sched_takecpu() when first set
* runnable.
*/
void
sched_vforkexec(struct lwp *l, bool samecpu)
{
KASSERT(l == curlwp);
if ((samecpu && ncpu > 1) ||
(l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) {
l->l_pflag |= LP_TELEPORT;
preempt();
}
}
#else
/*
* stubs for !MULTIPROCESSOR
*/
struct cpu_info *
sched_takecpu(struct lwp *l)
{
return l->l_cpu;
}
void
sched_idle(void)
{
}
void
sched_preempted(struct lwp *l)
{
}
void
sched_vforkexec(struct lwp *l, bool samecpu)
{
KASSERT(l == curlwp);
}
#endif /* MULTIPROCESSOR */
/*
* Scheduling statistics and balancing.
*/
void
sched_lwp_stats(struct lwp *l)
{
int batch;
KASSERT(lwp_locked(l, NULL));
/* Update sleep time */
if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
l->l_stat == LSSUSPENDED)
l->l_slptime++;
/*
* Consider the thread CPU-bound if its accumulated run time exceeds its
* accumulated sleep time. The first such sample only latches LW_BATCH;
* the hook below is told the thread is CPU-bound from the second
* consecutive sample onwards.
*/
batch = (l->l_rticksum > l->l_slpticksum);
if (batch != 0) {
if ((l->l_flag & LW_BATCH) == 0)
batch = 0;
l->l_flag |= LW_BATCH;
} else
l->l_flag &= ~LW_BATCH;
/* Reset the time sums */
l->l_slpticksum = 0;
l->l_rticksum = 0;
/* Scheduler-specific hook */
sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
curthread = l;
#endif
}
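/*
 * The LW_BATCH logic above is a two-sample filter: the first interval in
 * which run time exceeds sleep time only latches the flag, and the hook is
 * told the LWP is CPU-bound from the second consecutive such interval. A
 * standalone sketch of the same filter with a made-up state struct:
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stdbool.h>

struct batch_state {
	bool latched;			/* stand-in for LW_BATCH */
};

/* Returns what would be passed as "batch" to the scheduler hook. */
static bool
batch_sample(struct batch_state *st, unsigned runticks, unsigned slpticks)
{
	bool cpubound = runticks > slpticks;
	bool report = cpubound && st->latched;

	st->latched = cpubound;
	return report;
}
#endif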
/*
* Scheduler mill.
*/
struct lwp *
sched_nextlwp(void)
{
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc;
TAILQ_HEAD(, lwp) *q_head;
struct lwp *l;
/* Update the last run time on switch */
l = curlwp;
l->l_rticksum += (getticks() - l->l_rticks);
/* Return to idle LWP if there is a migrating thread */
spc = &ci->ci_schedstate;
if (__predict_false(spc->spc_migrating != NULL))
return NULL;
/* Return to idle LWP if there is no runnable job */
if (__predict_false(spc->spc_count == 0))
return NULL;
/* Take the highest priority thread */
KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
q_head = sched_getrq(spc, spc->spc_maxpriority);
l = TAILQ_FIRST(q_head);
KASSERT(l != NULL);
sched_oncpu(l);
l->l_rticks = getticks();
return l;
}
/*
* sched_curcpu_runnable_p: return if curcpu() should exit the idle loop.
*/
bool
sched_curcpu_runnable_p(void)
{
const struct cpu_info *ci;
const struct schedstate_percpu *spc;
bool rv;
kpreempt_disable();
ci = curcpu();
spc = &ci->ci_schedstate;
rv = (spc->spc_count != 0);
#ifndef __HAVE_FAST_SOFTINTS
rv |= (ci->ci_data.cpu_softints != 0);
#endif
kpreempt_enable();
return rv;
}
/*
* Sysctl nodes and initialization.
*/
SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
const struct sysctlnode *node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sched",
SYSCTL_DESCR("Scheduler options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node == NULL)
return;
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "cacheht_time",
SYSCTL_DESCR("Cache hotness time (in ms)"),
NULL, 0, &cacheht_time, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "skim_interval",
SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
NULL, 0, &skim_interval, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "min_catch",
SYSCTL_DESCR("Minimal count of threads for catching"),
NULL, 0, &min_catch, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "timesoftints",
SYSCTL_DESCR("Track CPU time for soft interrupts"),
NULL, 0, &softint_timing, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "kpreempt_pri",
SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
NULL, 0, &sched_kpreempt_pri, 0,
CTL_CREATE, CTL_EOL);
}
/*
* Debugging.
*/
#ifdef DDB
void
sched_print_runqueue(void (*pr)(const char *, ...))
{
struct cpu_info *ci, *tci;
struct schedstate_percpu *spc;
struct lwp *l;
struct proc *p;
CPU_INFO_ITERATOR cii;
for (CPU_INFO_FOREACH(cii, ci)) {
int i;
spc = &ci->ci_schedstate;
(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
(*pr)(" pid.lid = %d.%d, r_count = %u, "
"maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
spc->spc_count, spc->spc_maxpriority,
spc->spc_migrating);
i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
do {
uint32_t q;
q = spc->spc_bitmap[i];
(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
} while (i--);
}
(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
"LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");
PROCLIST_FOREACH(p, &allproc) {
(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
ci = l->l_cpu;
tci = l->l_target_cpu;
(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
(int)l->l_lid, l->l_priority, lwp_eprio(l),
l->l_flag, l->l_stat == LSRUN ? "RQ" :
(l->l_stat == LSSLEEP ? "SQ" : "-"),
l, ci->ci_index, (tci ? tci->ci_index : -1),
(u_int)(getticks() - l->l_rticks));
}
}
}
#endif
/* $NetBSD: clockctl_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Emmanuel Dreyfus.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clockctl_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/time.h>
#include <sys/conf.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <sys/clockctl.h>
#include <compat/sys/clockctl.h>
#include <compat/sys/time_types.h>
int
compat50_clockctlioctl(dev_t dev, u_long cmd, void *data, int flags,
struct lwp *l)
{
int error = 0;
const struct cdevsw *cd = cdevsw_lookup(dev);
if (cd == NULL || cd->d_ioctl == NULL)
return ENXIO;
switch (cmd) {
case CLOCKCTL_OSETTIMEOFDAY: {
struct timeval50 tv50;
struct timeval tv;
struct clockctl50_settimeofday *args = data;
error = copyin(args->tv, &tv50, sizeof(tv50));
if (error)
return (error);
timeval50_to_timeval(&tv50, &tv);
error = settimeofday1(&tv, false, args->tzp, l, false);
break;
}
case CLOCKCTL_OADJTIME: {
struct timeval atv, oldatv;
struct timeval50 atv50;
struct clockctl50_adjtime *args = data;
if (args->delta) {
error = copyin(args->delta, &atv50, sizeof(atv50));
if (error)
return (error);
timeval50_to_timeval(&atv50, &atv);
}
adjtime1(args->delta ? &atv : NULL,
args->olddelta ? &oldatv : NULL, l->l_proc);
if (args->olddelta) {
timeval_to_timeval50(&oldatv, &atv50);
error = copyout(&atv50, args->olddelta, sizeof(atv50));
}
break;
}
case CLOCKCTL_OCLOCK_SETTIME: {
struct timespec50 tp50;
struct timespec tp;
struct clockctl50_clock_settime *args = data;
error = copyin(args->tp, &tp50, sizeof(tp50));
if (error)
return (error);
timespec50_to_timespec(&tp50, &tp);
error = clock_settime1(l->l_proc, args->clock_id, &tp, true);
break;
}
case CLOCKCTL_ONTP_ADJTIME: {
if (vec_ntp_timestatus == NULL) {
error = ENOTTY;
break;
}
/* The ioctl number changed but the data did not change. */
error = (cd->d_ioctl)(dev, CLOCKCTL_NTP_ADJTIME,
data, flags, l);
break;
}
default:
error = ENOTTY;
}
return (error);
}
void
clockctl_50_init(void)
{
MODULE_HOOK_SET(clockctl_ioctl_50_hook, compat50_clockctlioctl);
}
void
clockctl_50_fini(void)
{
MODULE_HOOK_UNSET(clockctl_ioctl_50_hook);
}
/* $NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
* Daniel Sieger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $");
#include "opt_kstack.h"
#include "opt_ddb.h"
#include "opt_dtrace.h"
#define __MUTEX_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
#include <dev/lockstat.h>
int dtrace_vtime_active=0;
dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
#ifdef DDB
#include <ddb/ddb.h>
#endif
static void sched_unsleep(struct lwp *, bool);
static void sched_changepri(struct lwp *, pri_t);
static void sched_lendpri(struct lwp *, pri_t);
syncobj_t sleep_syncobj = {
.sobj_name = "sleep",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
syncobj_t sched_syncobj = {
.sobj_name = "sched",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_USER,
.sobj_unsleep = sched_unsleep,
.sobj_changepri = sched_changepri,
.sobj_lendpri = sched_lendpri,
.sobj_owner = syncobj_noowner,
};
syncobj_t kpause_syncobj = {
.sobj_name = "kpause",
.sobj_flag = SOBJ_SLEEPQ_NULL,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
/* "Lightning bolt": once a second sleep address. */
kcondvar_t lbolt __cacheline_aligned;
u_int sched_pstats_ticks __cacheline_aligned;
/* Preemption event counters. */
static struct evcnt kpreempt_ev_crit __cacheline_aligned;
static struct evcnt kpreempt_ev_klock __cacheline_aligned;
static struct evcnt kpreempt_ev_immed __cacheline_aligned;
void
synch_init(void)
{
cv_init(&lbolt, "lbolt");
evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
"kpreempt", "defer: critical section");
evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
"kpreempt", "defer: kernel_lock");
evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
"kpreempt", "immediate");
}
/*
* OBSOLETE INTERFACE
*
* General sleep call. Suspends the current LWP until a wakeup is
* performed on the specified identifier. The LWP will then be made
* runnable with the specified priority. Sleeps at most timo/hz seconds (0
* means no timeout). If pri includes PCATCH flag, signals are checked
* before and after sleeping, else signals are not checked. Returns 0 if
* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
* signal needs to be delivered, ERESTART is returned if the current system
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal.
*/
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
struct lwp *l = curlwp;
sleepq_t *sq;
kmutex_t *mp;
bool catch_p;
int nlocks;
KASSERT((l->l_pflag & LP_INTR) == 0);
KASSERT(ident != &lbolt);
//KASSERT(KERNEL_LOCKED_P());
if (sleepq_dontsleep(l)) {
(void)sleepq_abort(NULL, 0);
return 0;
}
catch_p = priority & PCATCH;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
return sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);
}
int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
kmutex_t *mtx)
{
struct lwp *l = curlwp;
sleepq_t *sq;
kmutex_t *mp;
bool catch_p;
int error, nlocks;
KASSERT((l->l_pflag & LP_INTR) == 0);
KASSERT(ident != &lbolt);
if (sleepq_dontsleep(l)) {
(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
return 0;
}
catch_p = priority & PCATCH;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
mutex_exit(mtx);
error = sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);
if ((priority & PNORELOCK) == 0)
mutex_enter(mtx);
return error;
}
/*
* General sleep call for situations where a wake-up is not expected.
*/
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
struct lwp *l = curlwp;
int error, nlocks;
KASSERTMSG(timo != 0 || intr, "wmesg=%s intr=%s timo=%d mtx=%p",
wmesg, intr ? "true" : "false", timo, mtx);
if (sleepq_dontsleep(l))
return sleepq_abort(NULL, 0);
if (mtx != NULL)
mutex_exit(mtx);
nlocks = sleepq_enter(NULL, l, NULL);
sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr);
error = sleepq_block(timo, intr, &kpause_syncobj, nlocks);
if (mtx != NULL)
mutex_enter(mtx);
return error;
}
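/*
 * kpause() is the usual way for kernel code to pause when no wakeup is
 * expected. A hedged usage sketch: a hypothetical polling loop that probes
 * some device roughly every 100ms (hw_ready() and the "hwpoll" wmesg are
 * invented for the example, and no interlock is passed):
 */
#if 0	/* illustrative sketch only; not compiled */
static int
wait_for_hardware(void)
{
	int tries;

	for (tries = 0; tries < 50; tries++) {
		if (hw_ready())		/* hypothetical readiness check */
			return 0;
		/* Uninterruptible sleep for about 100ms. */
		(void)kpause("hwpoll", false, mstohz(100), NULL);
	}
	return ETIMEDOUT;
}
#endif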
/*
* OBSOLETE INTERFACE
*
* Make all LWPs sleeping on the specified identifier runnable.
*/
void
wakeup(wchan_t ident)
{
sleepq_t *sq;
kmutex_t *mp;
if (__predict_false(cold))
return;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
sleepq_wake(sq, ident, (u_int)-1, mp);
}
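/*
 * Although obsolete, tsleep()/wakeup() still show the basic contract: the
 * sleeper and the waker agree on an arbitrary wait channel address. A
 * hedged sketch with an invented flag and channel; it glosses over the
 * interlocking a real caller needs to avoid a lost wakeup, which is one
 * reason condvars are preferred today:
 */
#if 0	/* illustrative sketch only; not compiled */
static volatile int work_done;		/* hypothetical shared state */

static void
consumer_wait(void)
{

	while (!work_done) {
		/* Sleep on &work_done until a producer calls wakeup(). */
		(void)tsleep(&work_done, PWAIT, "workwt", 0);
	}
}

static void
producer_finish(void)
{

	work_done = 1;
	wakeup(&work_done);
}
#endif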
/*
* General yield call. Puts the current LWP back on its run queue and
* performs a context switch.
*/
void
yield(void)
{
struct lwp *l = curlwp;
int nlocks;
KERNEL_UNLOCK_ALL(l, &nlocks);
lwp_lock(l);
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
KASSERT(l->l_stat == LSONPROC);
spc_lock(l->l_cpu);
mi_switch(l);
KERNEL_LOCK(nlocks, l);
}
/*
* General preemption call. Puts the current LWP back on its run queue
* and performs an involuntary context switch. Different from yield()
* in that:
*
* - It's counted differently (involuntary vs. voluntary).
* - Realtime threads go to the head of their runqueue vs. tail for yield().
*/
void
preempt(void)
{
struct lwp *l = curlwp;
int nlocks;
KERNEL_UNLOCK_ALL(l, &nlocks);
lwp_lock(l);
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
KASSERT(l->l_stat == LSONPROC);
spc_lock(l->l_cpu);
l->l_pflag |= LP_PREEMPTING;
mi_switch(l);
KERNEL_LOCK(nlocks, l);
}
/*
* Return true if the current LWP should yield the processor. Intended to
* be used by long-running code in the kernel.
*/
inline bool
preempt_needed(void)
{
lwp_t *l = curlwp;
int needed;
KPREEMPT_DISABLE(l);
needed = l->l_cpu->ci_want_resched;
KPREEMPT_ENABLE(l);
return (needed != 0);
}
/*
* A breathing point for long-running code in the kernel.
*/
void
preempt_point(void)
{
if (__predict_false(preempt_needed())) {
preempt();
}
}
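/*
 * preempt_point() is meant to be dropped into long-running kernel loops so
 * the CPU can be handed over when someone of higher priority wants it. A
 * hedged sketch of such a loop (struct item and process_one_item() are
 * invented for the example):
 */
#if 0	/* illustrative sketch only; not compiled */
static void
process_many(struct item *items, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		process_one_item(&items[i]);	/* hypothetical unit of work */
		/* Give up the CPU here if another LWP wants it. */
		preempt_point();
	}
}
#endif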
/*
* Handle a request made by another agent to preempt the current LWP
* in-kernel. Usually called when l_dopreempt may be non-zero.
*
* Character addresses for lockstat only.
*/
static char kpreempt_is_disabled;
static char kernel_lock_held;
static char is_softint_lwp;
static char spl_is_raised;
bool
kpreempt(uintptr_t where)
{
uintptr_t failed;
lwp_t *l;
int s, dop, lsflag;
l = curlwp;
failed = 0;
while ((dop = l->l_dopreempt) != 0) {
if (l->l_stat != LSONPROC) {
/*
* About to block (or die), let it happen.
* Doesn't really count as "preemption has
* been blocked", since we're going to
* context switch.
*/
atomic_swap_uint(&l->l_dopreempt, 0);
return true;
}
KASSERT((l->l_flag & LW_IDLE) == 0);
if (__predict_false(l->l_nopreempt != 0)) {
/* LWP holds preemption disabled, explicitly. */
if ((dop & DOPREEMPT_COUNTED) == 0) {
kpreempt_ev_crit.ev_count++;
}
failed = (uintptr_t)&kpreempt_is_disabled;
break;
}
if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
/* Can't preempt soft interrupts yet. */
atomic_swap_uint(&l->l_dopreempt, 0);
failed = (uintptr_t)&is_softint_lwp;
break;
}
s = splsched();
if (__predict_false(l->l_blcnt != 0 ||
curcpu()->ci_biglock_wanted != NULL)) {
/* Hold or want kernel_lock, code is not MT safe. */
splx(s);
if ((dop & DOPREEMPT_COUNTED) == 0) {
kpreempt_ev_klock.ev_count++;
}
failed = (uintptr_t)&kernel_lock_held;
break;
}
if (__predict_false(!cpu_kpreempt_enter(where, s))) {
/*
* It may be that the IPL is too high.
* cpu_kpreempt_enter() can schedule an
* interrupt to retry later.
*/
splx(s);
failed = (uintptr_t)&spl_is_raised;
break;
}
/* Do it! */
if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
kpreempt_ev_immed.ev_count++;
}
lwp_lock(l);
l->l_pflag |= LP_PREEMPTING;
spc_lock(l->l_cpu);
mi_switch(l);
l->l_nopreempt++;
splx(s);
/* Take care of any MD cleanup. */
cpu_kpreempt_exit(where);
l->l_nopreempt--;
}
if (__predict_true(!failed)) {
return false;
}
/* Record preemption failure for reporting via lockstat. */
atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
lsflag = 0;
LOCKSTAT_ENTER(lsflag);
if (__predict_false(lsflag)) {
if (where == 0) {
where = (uintptr_t)__builtin_return_address(0);
}
/* Preemption is on, might recurse, so make it atomic. */
if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
(void *)where) == NULL) {
LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
l->l_pfaillock = failed;
}
}
LOCKSTAT_EXIT(lsflag);
return true;
}
/*
* Return true if preemption is explicitly disabled.
*/
bool
kpreempt_disabled(void)
{
const lwp_t *l = curlwp;
return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
(l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 ||
cpu_kpreempt_disabled();
}
/*
* Disable kernel preemption.
*/
void
kpreempt_disable(void)
{
KPREEMPT_DISABLE(curlwp);
}
/*
* Reenable kernel preemption.
*/
void
kpreempt_enable(void)
{
KPREEMPT_ENABLE(curlwp);
}
/*
* Compute the amount of time during which the current lwp was running.
*
* - update l_rtime unless it's an idle lwp.
*/
void
updatertime(lwp_t *l, const struct bintime *now)
{
static bool backwards = false;
if (__predict_false(l->l_flag & LW_IDLE))
return;
if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) {
char caller[128];
#ifdef DDB
db_symstr(caller, sizeof(caller),
(db_expr_t)(intptr_t)__builtin_return_address(0),
DB_STGY_PROC);
#else
snprintf(caller, sizeof(caller), "%p",
__builtin_return_address(0));
#endif
backwards = true;
printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:"
" timecounter went backwards"
" from (%jd + 0x%016"PRIx64"/2^64) sec"
" to (%jd + 0x%016"PRIx64"/2^64) sec"
" in %s\n",
(long)l->l_lid,
l->l_proc->p_comm,
l->l_name ? " " : "",
l->l_name ? l->l_name : "",
l->l_pflag,
(intmax_t)l->l_stime.sec, l->l_stime.frac,
(intmax_t)now->sec, now->frac,
caller);
}
/* rtime += now - stime */
bintime_add(&l->l_rtime, now);
bintime_sub(&l->l_rtime, &l->l_stime);
}
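/*
 * The "rtime += now - stime" update above is 64.64 fixed-point arithmetic:
 * struct bintime holds whole seconds plus a 64-bit binary fraction, and
 * add/subtract must carry or borrow between the two fields. A standalone
 * sketch of that carry logic with a local stand-in type (the real helpers
 * live in <sys/time.h>):
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stdint.h>

struct mybintime {
	int64_t  sec;
	uint64_t frac;		/* units of 1/2^64 second */
};

static void
mybintime_add(struct mybintime *bt, const struct mybintime *bt2)
{
	uint64_t ofrac = bt->frac;

	bt->frac += bt2->frac;
	if (bt->frac < ofrac)		/* fraction wrapped: carry a second */
		bt->sec++;
	bt->sec += bt2->sec;
}

static void
mybintime_sub(struct mybintime *bt, const struct mybintime *bt2)
{
	uint64_t ofrac = bt->frac;

	bt->frac -= bt2->frac;
	if (bt->frac > ofrac)		/* fraction wrapped: borrow a second */
		bt->sec--;
	bt->sec -= bt2->sec;
}
#endif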
/*
* Select the next LWP to run on the current CPU.
*/
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
lwp_t *newl;
/*
* Let sched_nextlwp() select the LWP to run the CPU next.
* If no LWP is runnable, select the idle LWP.
*
* On arrival here LWPs on a run queue are locked by spc_mutex which
* is currently held. Idle LWPs are always locked by spc_lwplock,
* which may or may not be held here. On exit from this code block,
* in all cases newl is locked by spc_lwplock.
*/
newl = sched_nextlwp();
if (newl != NULL) {
sched_dequeue(newl);
KASSERT(lwp_locked(newl, spc->spc_mutex));
KASSERT(newl->l_cpu == ci);
newl->l_stat = LSONPROC;
newl->l_pflag |= LP_RUNNING;
newl->l_boostpri = PRI_NONE;
spc->spc_curpriority = lwp_eprio(newl);
spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE);
lwp_setlock(newl, spc->spc_lwplock);
} else {
/*
* The idle LWP does not get set to LSONPROC, because
* otherwise it screws up the output from top(1) etc.
*/
newl = ci->ci_data.cpu_idlelwp;
newl->l_pflag |= LP_RUNNING;
spc->spc_curpriority = PRI_IDLE;
spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) |
SPCF_IDLE;
}
/*
* Only clear want_resched if there are no pending (slow) software
* interrupts. We can do this without an atomic, because no new
* LWPs can appear in the queue due to our hold on spc_mutex, and
* the update to ci_want_resched will become globally visible before
* the release of spc_mutex becomes globally visible.
*/
if (ci->ci_data.cpu_softints == 0)
ci->ci_want_resched = 0;
return newl;
}
/*
* The machine independent parts of context switch.
*
* NOTE: l->l_cpu is not changed in this routine, because an LWP never
* changes its own l_cpu (that would screw up curcpu on many ports and could
* cause all kinds of other evil stuff). l_cpu is always changed by some
* other actor, when it's known the LWP is not running (the LP_RUNNING flag
* is checked under lock).
*/
void
mi_switch(lwp_t *l)
{
struct cpu_info *ci;
struct schedstate_percpu *spc;
struct lwp *newl;
kmutex_t *lock;
int oldspl;
struct bintime bt;
bool returning;
KASSERT(lwp_locked(l, NULL));
KASSERT(kpreempt_disabled());
KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex));
KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked");
kstack_check_magic(l);
binuptime(&bt);
KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
KASSERT((l->l_pflag & LP_RUNNING) != 0);
KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN);
ci = curcpu();
spc = &ci->ci_schedstate;
returning = false;
newl = NULL;
/*
* If we have been asked to switch to a specific LWP, then there
* is no need to inspect the run queues. If a soft interrupt is
* blocking, then return to the interrupted thread without adjusting
* VM context or its start time: neither have been changed in order
* to take the interrupt.
*/
if (l->l_switchto != NULL) {
if ((l->l_pflag & LP_INTR) != 0) {
returning = true;
softint_block(l);
if ((l->l_pflag & LP_TIMEINTR) != 0)
updatertime(l, &bt);
}
newl = l->l_switchto;
l->l_switchto = NULL;
}
#ifndef __HAVE_FAST_SOFTINTS
else if (ci->ci_data.cpu_softints != 0) {
/* There are pending soft interrupts, so pick one. */
newl = softint_picklwp();
newl->l_stat = LSONPROC;
newl->l_pflag |= LP_RUNNING;
}
#endif /* !__HAVE_FAST_SOFTINTS */
/*
* If on the CPU and we have gotten this far, then we must yield.
*/
if (l->l_stat == LSONPROC && l != newl) {
KASSERT(lwp_locked(l, spc->spc_lwplock));
KASSERT((l->l_flag & LW_IDLE) == 0);
l->l_stat = LSRUN;
lwp_setlock(l, spc->spc_mutex);
sched_enqueue(l);
sched_preempted(l);
/*
* Handle migration. Note that "migrating LWP" may
* be reset here, if interrupt/preemption happens
* early in idle LWP.
*/
if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) {
KASSERT((l->l_pflag & LP_INTR) == 0);
spc->spc_migrating = l;
}
}
/* Pick new LWP to run. */
if (newl == NULL) {
newl = nextlwp(ci, spc);
}
/* Items that must be updated with the CPU locked. */
if (!returning) {
/* Count time spent in current system call */
SYSCALL_TIME_SLEEP(l);
updatertime(l, &bt);
/* Update the new LWP's start time. */
newl->l_stime = bt;
/*
* ci_curlwp changes when a fast soft interrupt occurs.
* We use ci_onproc to keep track of which kernel or
* user thread is running 'underneath' the software
* interrupt. This is important for time accounting,
* itimers and forcing user threads to preempt (aston).
*/
ci->ci_onproc = newl;
}
/*
* Preemption related tasks. Must be done holding spc_mutex. Clear
* l_dopreempt without an atomic - it's only ever set non-zero by
* sched_resched_cpu() which also holds spc_mutex, and only ever
* cleared by the LWP itself (us) with atomics when not under lock.
*/
l->l_dopreempt = 0;
if (__predict_false(l->l_pfailaddr != 0)) {
LOCKSTAT_FLAG(lsflag);
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
1, l->l_pfailtime, l->l_pfailaddr);
LOCKSTAT_EXIT(lsflag);
l->l_pfailtime = 0;
l->l_pfaillock = 0;
l->l_pfailaddr = 0;
}
if (l != newl) {
struct lwp *prevlwp;
/* Release all locks, but leave the current LWP locked */
if (l->l_mutex == spc->spc_mutex) {
/*
* Drop spc_lwplock, if the current LWP has been moved
* to the run queue (it is now locked by spc_mutex).
*/
mutex_spin_exit(spc->spc_lwplock);
} else {
/*
* Otherwise, drop the spc_mutex, we are done with the
* run queues.
*/
mutex_spin_exit(spc->spc_mutex);
}
/* We're down to only one lock, so do debug checks. */
LOCKDEBUG_BARRIER(l->l_mutex, 1);
/* Count the context switch. */
CPU_COUNT(CPU_COUNT_NSWTCH, 1);
if ((l->l_pflag & LP_PREEMPTING) != 0) {
l->l_ru.ru_nivcsw++;
l->l_pflag &= ~LP_PREEMPTING;
} else {
l->l_ru.ru_nvcsw++;
}
/*
* Increase the count of spin-mutexes before the release
* of the last lock - we must remain at IPL_SCHED after
* releasing the lock.
*/
KASSERTMSG(ci->ci_mtx_count == -1,
"%s: cpu%u: ci_mtx_count (%d) != -1 "
"(block with spin-mutex held)",
__func__, cpu_index(ci), ci->ci_mtx_count);
oldspl = MUTEX_SPIN_OLDSPL(ci);
ci->ci_mtx_count = -2;
/* Update status for lwpctl, if present. */
if (l->l_lwpctl != NULL) {
l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ?
LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE);
}
/*
* If curlwp is a soft interrupt LWP, there's nobody on the
* other side to unlock - we're returning into an assembly
* trampoline. Unlock now. This is safe because this is a
* kernel LWP and is bound to current CPU: the worst anyone
* else will do to it, is to put it back onto this CPU's run
* queue (and the CPU is busy here right now!).
*/
if (returning) {
/* Keep IPL_SCHED after this; MD code will fix up. */
l->l_pflag &= ~LP_RUNNING;
lwp_unlock(l);
} else {
/* A normal LWP: save old VM context. */
pmap_deactivate(l);
}
/*
* If DTrace has set the active vtime enum to anything
* other than INACTIVE (0), then it should have set the
* function to call.
*/
if (__predict_false(dtrace_vtime_active)) {
(*dtrace_vtime_switch_func)(newl);
}
/*
* We must ensure not to come here from inside a read section.
*/
KASSERT(pserialize_not_in_read_section());
/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
KASSERT(curlwp == ci->ci_curlwp);
#endif
KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
prevlwp = cpu_switchto(l, newl, returning);
ci = curcpu();
#ifdef MULTIPROCESSOR
KASSERT(curlwp == ci->ci_curlwp);
#endif
KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
l, curlwp, prevlwp);
KASSERT(prevlwp != NULL);
KASSERT(l->l_cpu == ci);
KASSERT(ci->ci_mtx_count == -2);
/*
* Immediately mark the previous LWP as no longer running
* and unlock (to keep lock wait times short as possible).
* We'll still be at IPL_SCHED afterwards. If a zombie,
* don't touch after clearing LP_RUNNING as it could be
* reaped by another CPU. Issue a memory barrier to ensure
* this.
*
* atomic_store_release matches atomic_load_acquire in
* lwp_free.
*/
KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0);
lock = prevlwp->l_mutex;
if (__predict_false(prevlwp->l_stat == LSZOMB)) {
atomic_store_release(&prevlwp->l_pflag,
prevlwp->l_pflag & ~LP_RUNNING);
} else {
prevlwp->l_pflag &= ~LP_RUNNING;
}
mutex_spin_exit(lock);
/*
* Switched away - we have new curlwp.
* Restore VM context and IPL.
*/
pmap_activate(l);
pcu_switchpoint(l);
/* Update status for lwpctl, if present. */
if (l->l_lwpctl != NULL) {
l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
l->l_lwpctl->lc_pctr++;
}
/*
* Normalize the spin mutex count and restore the previous
* SPL. Note that, unless the caller disabled preemption,
* we can be preempted at any time after this splx().
*/
KASSERT(l->l_cpu == ci);
KASSERT(ci->ci_mtx_count == -1);
ci->ci_mtx_count = 0;
splx(oldspl);
} else {
/* Nothing to do - just unlock and return. */
mutex_spin_exit(spc->spc_mutex);
l->l_pflag &= ~LP_PREEMPTING;
lwp_unlock(l);
}
KASSERT(l == curlwp);
KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0);
SYSCALL_TIME_WAKEUP(l);
LOCKDEBUG_BARRIER(NULL, 1);
}
/*
* setrunnable: change LWP state to be runnable, placing it on the run queue.
*
* Call with the process and LWP locked. Will return with the LWP unlocked.
*/
void
setrunnable(struct lwp *l)
{
struct proc *p = l->l_proc;
struct cpu_info *ci;
kmutex_t *oldlock;
KASSERT((l->l_flag & LW_IDLE) == 0);
KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
KASSERT(mutex_owned(p->p_lock));
KASSERT(lwp_locked(l, NULL));
KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
switch (l->l_stat) {
case LSSTOP:
/*
* If we're being traced (possibly because someone attached us
* while we were stopped), check for a signal from the debugger.
*/
if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
signotify(l);
p->p_nrlwps++;
break;
case LSSUSPENDED:
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
l->l_flag &= ~LW_WSUSPEND;
p->p_nrlwps++;
cv_broadcast(&p->p_lwpcv);
break;
case LSSLEEP:
KASSERT(l->l_wchan != NULL);
break;
case LSIDL:
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
break;
default:
panic("setrunnable: lwp %p state was %d", l, l->l_stat);
}
/*
* If the LWP was sleeping, start it again.
*/
if (l->l_wchan != NULL) {
l->l_stat = LSSLEEP;
/* lwp_unsleep() will release the lock. */
lwp_unsleep(l, true);
return;
}
/*
* If the LWP is still on the CPU, mark it as LSONPROC. It may be
* about to call mi_switch(), in which case it will yield.
*/
if ((l->l_pflag & LP_RUNNING) != 0) {
l->l_stat = LSONPROC;
l->l_slptime = 0;
lwp_unlock(l);
return;
}
/*
* Look for a CPU to run.
* Set the LWP runnable.
*/
ci = sched_takecpu(l);
l->l_cpu = ci;
spc_lock(ci);
oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex);
sched_setrunnable(l);
l->l_stat = LSRUN;
l->l_slptime = 0;
sched_enqueue(l);
sched_resched_lwp(l, true);
/* SPC & LWP now unlocked. */
mutex_spin_exit(oldlock);
}
/*
* suspendsched:
*
* Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
*/
void
suspendsched(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct lwp *l;
struct proc *p;
/*
* We do this by process in order not to violate the locking rules.
*/
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
mutex_enter(p->p_lock);
if ((p->p_flag & PK_SYSTEM) != 0) {
mutex_exit(p->p_lock);
continue;
}
if (p->p_stat != SSTOP) {
if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
p->p_pptr->p_nstopchild++;
p->p_waited = 0;
}
p->p_stat = SSTOP;
}
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (l == curlwp)
continue;
lwp_lock(l);
/*
* Set LW_WREBOOT so that the LWP will suspend itself
* when it tries to return to user mode. We want to
* get as many LWPs as possible to
* the user / kernel boundary, so that they will
* release any locks that they hold.
*/
l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
if (l->l_stat == LSSLEEP &&
(l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
continue;
}
lwp_unlock(l);
}
mutex_exit(p->p_lock);
}
mutex_exit(&proc_lock);
/*
* Kick all CPUs to make them preempt any LWPs running in user mode.
* They'll trap into the kernel and suspend themselves in userret().
*
* Unusually, we don't hold any other scheduler object locked, which
* would keep preemption off for sched_resched_cpu(), so disable it
* explicitly.
*/
kpreempt_disable();
for (CPU_INFO_FOREACH(cii, ci)) {
spc_lock(ci);
sched_resched_cpu(ci, PRI_KERNEL, true);
/* spc now unlocked */
}
kpreempt_enable();
}
/*
* sched_unsleep:
*
* This is called when the LWP has not been awoken normally but instead
* interrupted: for example, if the sleep timed out. Because of this,
* it's not a valid action for running or idle LWPs.
*/
static void
sched_unsleep(struct lwp *l, bool cleanup)
{
lwp_unlock(l);
panic("sched_unsleep");
}
static void
sched_changepri(struct lwp *l, pri_t pri)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
ci = l->l_cpu;
spc = &ci->ci_schedstate;
if (l->l_stat == LSRUN) {
KASSERT(lwp_locked(l, spc->spc_mutex));
sched_dequeue(l);
l->l_priority = pri;
sched_enqueue(l);
sched_resched_lwp(l, false);
} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
/* On priority drop, only evict realtime LWPs. */
KASSERT(lwp_locked(l, spc->spc_lwplock));
l->l_priority = pri;
spc_lock(ci);
sched_resched_cpu(ci, spc->spc_maxpriority, true);
/* spc now unlocked */
} else {
l->l_priority = pri;
}
}
static void
sched_lendpri(struct lwp *l, pri_t pri)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
ci = l->l_cpu;
spc = &ci->ci_schedstate;
if (l->l_stat == LSRUN) {
KASSERT(lwp_locked(l, spc->spc_mutex));
sched_dequeue(l);
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
sched_enqueue(l);
sched_resched_lwp(l, false);
} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
/* On priority drop, only evict realtime LWPs. */
KASSERT(lwp_locked(l, spc->spc_lwplock));
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
spc_lock(ci);
sched_resched_cpu(ci, spc->spc_maxpriority, true);
/* spc now unlocked */
} else {
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
}
}
struct lwp *
syncobj_noowner(wchan_t wchan)
{
return NULL;
}
/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
/*
* Constants for averages over 1, 5 and 15 minutes when sampling at
* 5 second intervals.
*/
static const fixpt_t cexp[ ] = {
0.9200444146293232 * FSCALE, /* exp(-1/12) */
0.9834714538216174 * FSCALE, /* exp(-1/60) */
0.9944598480048967 * FSCALE, /* exp(-1/180) */
};
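/*
 * Both ccpu and cexp[] are decay factors stored as fixed-point fractions
 * scaled by FSCALE; multiplying by one and shifting right by FSHIFT applies
 * a single decay step. Applied once per second, ccpu = exp(-1/20) leaves
 * roughly exp(-3), i.e. about 5%, of p_pctcpu after 60 steps, which is
 * where the "decay 95% in 60 seconds" figure comes from. A standalone
 * sketch of that arithmetic, assuming the customary FSHIFT of 11:
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stdio.h>
#include <stdint.h>

#define MY_FSHIFT	11
#define MY_FSCALE	(1 << MY_FSHIFT)

int
main(void)
{
	const uint64_t my_ccpu = (uint64_t)(0.95122942450071400909 * MY_FSCALE);
	uint64_t pctcpu = MY_FSCALE;	/* start at "100%" */
	int i;

	for (i = 0; i < 60; i++)
		pctcpu = (pctcpu * my_ccpu) >> MY_FSHIFT;
	/* Roughly 5% of the starting value survives 60 decay steps. */
	printf("%.1f%%\n", 100.0 * pctcpu / MY_FSCALE);
	return 0;
}
#endif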
/*
* sched_pstats:
*
* => Update process statistics and check CPU resource allocation.
* => Call scheduler-specific hook to eventually adjust LWP priorities.
* => Compute load average of a quantity on 1, 5 and 15 minute intervals.
*/
void
sched_pstats(void)
{
struct loadavg *avg = &averunnable;
const int clkhz = (stathz != 0 ? stathz : hz);
static bool backwardslwp = false;
static bool backwardsproc = false;
static u_int lavg_count = 0;
struct proc *p;
int nrun;
sched_pstats_ticks++;
if (++lavg_count >= 5) {
lavg_count = 0;
nrun = 0;
}
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
struct lwp *l;
struct rlimit *rlim;
time_t runtm;
int sig;
/* Increment sleep time (if sleeping), ignore overflow. */
mutex_enter(p->p_lock);
runtm = p->p_rtime.sec;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
fixpt_t lpctcpu;
u_int lcpticks;
if (__predict_false((l->l_flag & LW_IDLE) != 0))
continue;
lwp_lock(l);
if (__predict_false(l->l_rtime.sec < 0) &&
!backwardslwp) {
backwardslwp = true;
printf("WARNING: lwp %ld (%s%s%s): "
"negative runtime: "
"(%jd + 0x%016"PRIx64"/2^64) sec\n",
(long)l->l_lid,
l->l_proc->p_comm,
l->l_name ? " " : "",
l->l_name ? l->l_name : "",
(intmax_t)l->l_rtime.sec,
l->l_rtime.frac);
}
runtm += l->l_rtime.sec;
l->l_swtime++;
sched_lwp_stats(l);
/* For load average calculation. */
if (__predict_false(lavg_count == 0) &&
(l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
switch (l->l_stat) {
case LSSLEEP:
if (l->l_slptime > 1) {
break;
}
/* FALLTHROUGH */
case LSRUN:
case LSONPROC:
case LSIDL:
nrun++;
}
}
lwp_unlock(l);
l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
if (l->l_slptime != 0)
continue;
lpctcpu = l->l_pctcpu;
lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
lpctcpu += ((FSCALE - ccpu) *
(lcpticks * FSCALE / clkhz)) >> FSHIFT;
l->l_pctcpu = lpctcpu;
}
/* Calculating p_pctcpu only for ps(1) */
p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
if (__predict_false(runtm < 0)) {
if (!backwardsproc) {
backwardsproc = true;
printf("WARNING: pid %ld (%s): "
"negative runtime; "
"monotonic clock has gone backwards\n",
(long)p->p_pid, p->p_comm);
}
mutex_exit(p->p_lock);
continue;
}
/*
* Check if the process exceeds its CPU resource allocation.
* If over the hard limit, kill it with SIGKILL.
* If over the soft limit, send SIGXCPU and raise
* the soft limit a little.
*/
rlim = &p->p_rlimit[RLIMIT_CPU];
sig = 0;
if (__predict_false(runtm >= rlim->rlim_cur)) {
if (runtm >= rlim->rlim_max) {
sig = SIGKILL;
log(LOG_NOTICE,
"pid %d, command %s, is killed: %s\n",
p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
uprintf("pid %d, command %s, is killed: %s\n",
p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
} else {
sig = SIGXCPU;
if (rlim->rlim_cur < rlim->rlim_max)
rlim->rlim_cur += 5;
}
}
mutex_exit(p->p_lock);
if (__predict_false(sig)) {
KASSERT((p->p_flag & PK_SYSTEM) == 0);
psignal(p, sig);
}
}
/* Load average calculation. */
if (__predict_false(lavg_count == 0)) {
int i;
CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
for (i = 0; i < __arraycount(cexp); i++) {
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
}
}
/* Lightning bolt. */
cv_broadcast(&lbolt);
mutex_exit(&proc_lock);
}
/* $NetBSD: uipc_accf.c,v 1.13 2014/02/25 18:30:11 pooka Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 2000 Paycounter, Inc.
* Copyright (c) 2005 Robert N. M. Watson
* Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_accf.c,v 1.13 2014/02/25 18:30:11 pooka Exp $");
#define ACCEPT_FILTER_MOD
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/rwlock.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/queue.h>
#include <sys/once.h>
#include <sys/atomic.h>
#include <sys/module.h>
static krwlock_t accept_filter_lock;
static LIST_HEAD(, accept_filter) accept_filtlsthd =
LIST_HEAD_INITIALIZER(&accept_filtlsthd);
/*
* Names of Accept filter sysctl objects
*/
static struct sysctllog *ctllog;
static void
sysctl_net_inet_accf_setup(void)
{
sysctl_createv(&ctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET, CTL_EOL);
sysctl_createv(&ctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "accf",
SYSCTL_DESCR("Accept filters"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET, SO_ACCEPTFILTER, CTL_EOL);
}
int
accept_filt_add(struct accept_filter *filt)
{
struct accept_filter *p;
accept_filter_init();
rw_enter(&accept_filter_lock, RW_WRITER);
LIST_FOREACH(p, &accept_filtlsthd, accf_next) {
if (strcmp(p->accf_name, filt->accf_name) == 0) {
rw_exit(&accept_filter_lock);
return EEXIST;
}
}
LIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next);
rw_exit(&accept_filter_lock);
return 0;
}
int
accept_filt_del(struct accept_filter *p)
{
rw_enter(&accept_filter_lock, RW_WRITER);
if (p->accf_refcnt != 0) {
rw_exit(&accept_filter_lock);
return EBUSY;
}
LIST_REMOVE(p, accf_next);
rw_exit(&accept_filter_lock);
return 0;
}
struct accept_filter *
accept_filt_get(char *name)
{
struct accept_filter *p;
char buf[32];
u_int gen;
do {
rw_enter(&accept_filter_lock, RW_READER);
LIST_FOREACH(p, &accept_filtlsthd, accf_next) {
if (strcmp(p->accf_name, name) == 0) {
atomic_inc_uint(&p->accf_refcnt);
break;
}
}
rw_exit(&accept_filter_lock);
if (p != NULL) {
break;
}
/* Try to autoload a module to satisfy the request. */
strcpy(buf, "accf_");
strlcat(buf, name, sizeof(buf));
gen = module_gen;
(void)module_autoload(buf, MODULE_CLASS_ANY);
} while (gen != module_gen);
return p;
}
/*
* Accept filter initialization routine.
* This should be called only once.
*/
static int
accept_filter_init0(void)
{
rw_init(&accept_filter_lock);
sysctl_net_inet_accf_setup();
return 0;
}
/*
* Initialization routine: This can also be replaced with
* accept_filt_generic_mod_event for attaching a new accept filter.
*/
void
accept_filter_init(void)
{
static ONCE_DECL(accept_filter_init_once);
RUN_ONCE(&accept_filter_init_once, accept_filter_init0);
}
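/*
 * RUN_ONCE() with a static ONCE_DECL, as used above, is the standard idiom
 * for lazy, thread-safe one-time initialization. A hedged sketch of the
 * same idiom in some other, invented subsystem:
 */
#if 0	/* illustrative sketch only; not compiled */
#include <sys/once.h>

static int
mysubsys_init0(void)
{

	/* ...set up locks, lists, sysctl nodes... */
	return 0;
}

void
mysubsys_init(void)
{
	static ONCE_DECL(mysubsys_once);

	RUN_ONCE(&mysubsys_once, mysubsys_init0);
}
#endif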
int
accept_filt_getopt(struct socket *so, struct sockopt *sopt)
{
struct accept_filter_arg afa;
int error;
KASSERT(solocked(so));
if ((so->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto out;
}
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
error = EINVAL;
goto out;
}
memset(&afa, 0, sizeof(afa));
strcpy(afa.af_name, so->so_accf->so_accept_filter->accf_name);
if (so->so_accf->so_accept_filter_str != NULL)
strcpy(afa.af_arg, so->so_accf->so_accept_filter_str);
error = sockopt_set(sopt, &afa, sizeof(afa));
out:
return error;
}
/*
* Simple delete case, with socket locked.
*/
int
accept_filt_clear(struct socket *so)
{
struct accept_filter_arg afa;
struct accept_filter *afp;
struct socket *so2, *next;
struct so_accf *af;
KASSERT(solocked(so));
if ((so->so_options & SO_ACCEPTCONN) == 0) {
return EINVAL;
}
if (so->so_accf != NULL) {
/* Break in-flight processing. */
for (so2 = TAILQ_FIRST(&so->so_q0); so2 != NULL; so2 = next) {
next = TAILQ_NEXT(so2, so_qe);
if (so2->so_upcall == NULL) {
continue;
}
so2->so_upcall = NULL;
so2->so_upcallarg = NULL;
so2->so_options &= ~SO_ACCEPTFILTER;
so2->so_rcv.sb_flags &= ~SB_UPCALL;
soisconnected(so2);
}
af = so->so_accf;
afp = af->so_accept_filter;
if (afp != NULL && afp->accf_destroy != NULL) {
(*afp->accf_destroy)(so);
}
if (af->so_accept_filter_str != NULL) {
kmem_free(af->so_accept_filter_str,
sizeof(afa.af_name));
}
kmem_free(af, sizeof(*af));
so->so_accf = NULL;
atomic_dec_uint(&afp->accf_refcnt);
}
so->so_options &= ~SO_ACCEPTFILTER;
return 0;
}
/*
* setsockopt() for accept filters. Called with the socket unlocked,
* will always return it locked.
*/
int
accept_filt_setopt(struct socket *so, const struct sockopt *sopt)
{
struct accept_filter_arg afa;
struct accept_filter *afp;
struct so_accf *newaf;
int error;
accept_filter_init();
if (sopt == NULL || sopt->sopt_size == 0) {
solock(so);
return accept_filt_clear(so);
}
/*
* Pre-allocate any memory we may need later to avoid blocking at
* untimely moments. This does not optimize for invalid arguments.
*/
error = sockopt_get(sopt, &afa, sizeof(afa));
if (error) {
solock(so);
return error;
}
afa.af_name[sizeof(afa.af_name)-1] = '\0';
afa.af_arg[sizeof(afa.af_arg)-1] = '\0';
afp = accept_filt_get(afa.af_name);
if (afp == NULL) {
solock(so);
return ENOENT;
}
/*
* Allocate the new accept filter instance storage. We may
* have to free it again later if we fail to attach it. If
* attached properly, 'newaf' is NULLed to avoid a free()
* while in use.
*/
newaf = kmem_zalloc(sizeof(*newaf), KM_SLEEP);
if (afp->accf_create != NULL && afa.af_name[0] != '\0') {
/*
* FreeBSD did a variable-size allocation here
* with the actual string length from afa.af_name
* but it is so short, why bother tracking it?
* XXX as others have noted, this is an API mistake;
* XXX accept_filter_arg should have a mandatory namelen.
* XXX (but it's a bit too late to fix that now)
*/
newaf->so_accept_filter_str =
kmem_alloc(sizeof(afa.af_name), KM_SLEEP);
strcpy(newaf->so_accept_filter_str, afa.af_name);
}
/*
* Require a listen socket; don't try to replace an existing filter
* without first removing it.
*/
solock(so);
if ((so->so_options & SO_ACCEPTCONN) == 0 || so->so_accf != NULL) {
error = EINVAL;
goto out;
}
/*
* Invoke the accf_create() method of the filter if required. The
* socket lock is held over this call, so create methods for filters
* shouldn't block.
*/
if (afp->accf_create != NULL) {
newaf->so_accept_filter_arg =
(*afp->accf_create)(so, afa.af_arg);
if (newaf->so_accept_filter_arg == NULL) {
error = EINVAL;
goto out;
}
}
newaf->so_accept_filter = afp;
so->so_accf = newaf;
so->so_options |= SO_ACCEPTFILTER;
newaf = NULL;
out:
if (newaf != NULL) {
if (newaf->so_accept_filter_str != NULL)
kmem_free(newaf->so_accept_filter_str,
sizeof(afa.af_name));
kmem_free(newaf, sizeof(*newaf));
atomic_dec_uint(&afp->accf_refcnt);
}
return error;
}
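/*
 * Illustrative userland sketch (not part of the original source): the
 * setsockopt(2) path above is normally reached by a server enabling an
 * accept filter on its listening socket.  The filter name "dataready"
 * is assumed to be provided by a loaded accept filter module; error
 * handling is omitted for brevity.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static void
 *	set_accept_filter(int s)
 *	{
 *		struct accept_filter_arg afa;
 *
 *		memset(&afa, 0, sizeof(afa));
 *		strlcpy(afa.af_name, "dataready", sizeof(afa.af_name));
 *		(void)setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER,
 *		    &afa, sizeof(afa));
 *	}
 */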
/* $NetBSD: radix.c,v 1.49 2020/10/18 13:07:31 gson Exp $ */
/*
* Copyright (c) 1988, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)radix.c 8.6 (Berkeley) 10/17/95
*/
/*
* Routines to build and maintain radix trees for routing lookups.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: radix.c,v 1.49 2020/10/18 13:07:31 gson Exp $");
#ifndef _NET_RADIX_H_
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/kmem.h>
#ifdef _KERNEL
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#endif
#include <sys/systm.h>
#include <sys/malloc.h>
#define M_DONTWAIT M_NOWAIT
#include <sys/domain.h>
#else
#include <stdlib.h>
#endif
#include <sys/syslog.h>
#include <net/radix.h>
#endif
typedef void (*rn_printer_t)(void *, const char *fmt, ...);
int max_keylen;
struct radix_mask *rn_mkfreelist;
struct radix_node_head *mask_rnhead;
static char *addmask_key;
static const char normal_chars[] =
{0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, -1};
static char *rn_zeros, *rn_ones;
#define rn_masktop (mask_rnhead->rnh_treetop)
static int rn_satisfies_leaf(const char *, struct radix_node *, int);
static int rn_lexobetter(const void *, const void *);
static struct radix_mask *rn_new_radix_mask(struct radix_node *,
struct radix_mask *);
static struct radix_node *rn_walknext(struct radix_node *, rn_printer_t,
void *);
static struct radix_node *rn_walkfirst(struct radix_node *, rn_printer_t,
void *);
static void rn_nodeprint(struct radix_node *, rn_printer_t, void *,
const char *);
#define SUBTREE_OPEN "[ "
#define SUBTREE_CLOSE " ]"
#ifdef RN_DEBUG
static void rn_treeprint(struct radix_node_head *, rn_printer_t, void *);
#endif /* RN_DEBUG */
/*
* The data structure for the keys is a radix tree with one way
* branching removed. The index rn_b at an internal node n represents a bit
* position to be tested. The tree is arranged so that all descendants
* of a node n have keys whose bits all agree up to position rn_b - 1.
* (We say the index of n is rn_b.)
*
* There is at least one descendant which has a one bit at position rn_b,
* and at least one with a zero there.
*
* A route is determined by a pair of key and mask. We require that the
* bit-wise logical AND of the key and mask be the key.
* We define the index of the route associated with the mask to be
* the first bit number in the mask where 0 occurs (with bit number 0
* representing the highest order bit).
*
* We say a mask is normal if every bit is 0 past the index of the mask.
* If a node n has a descendant (k, m) with index(m) == index(n) == rn_b,
* and m is a normal mask, then the route applies to every descendant of n.
* If index(m) < rn_b, this implies that the last few bits of k
* before bit b are all 0 (and hence the same is true of every descendant
* of n), so the route applies to all descendants of the node as well.
*
* Similar logic shows that a non-normal mask m such that
* index(m) <= index(n) could potentially apply to many children of n.
* Thus, for each non-host route, we attach its mask to a list at an internal
* node as high in the tree as we can go.
*
* The present version of the code makes use of normal routes in short-
* circuiting an explicit mask and compare operation when testing whether
* a key satisfies a normal route, and also in remembering the unique leaf
* that governs a subtree.
*/
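/*
 * Illustrative example (not part of the original source): consider a
 * mask whose data bytes are 0xff 0xff 0xf0 0x00.  The first 0 bit is at
 * bit position 20 (bit 0 being the highest order bit), so index(m) = 20.
 * Every bit past position 20 is also 0, so the mask is normal in the
 * sense above.  A mask such as 0xff 0x00 0xff 0x00 is non-normal,
 * because a 1 bit appears after its first 0 bit (index 8).
 */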
struct radix_node *
rn_search(
const void *v_arg,
struct radix_node *head)
{
const u_char * const v = v_arg;
struct radix_node *x;
for (x = head; x->rn_b >= 0;) {
if (x->rn_bmask & v[x->rn_off])
x = x->rn_r;
else
x = x->rn_l;
}
return x;
}
struct radix_node *
rn_search_m(
const void *v_arg,
struct radix_node *head,
const void *m_arg)
{
struct radix_node *x;
const u_char * const v = v_arg;
const u_char * const m = m_arg;
for (x = head; x->rn_b >= 0;) {
if ((x->rn_bmask & m[x->rn_off]) &&
(x->rn_bmask & v[x->rn_off]))
x = x->rn_r;
else
x = x->rn_l;
}
return x;
}
int
rn_refines(
const void *m_arg,
const void *n_arg)
{
const char *m = m_arg;
const char *n = n_arg;
const char *lim = n + *(const u_char *)n;
const char *lim2 = lim;
int longer = (*(const u_char *)n++) - (int)(*(const u_char *)m++);
int masks_are_equal = 1;
if (longer > 0)
lim -= longer;
while (n < lim) {
if (*n & ~(*m))
return 0;
if (*n++ != *m++)
masks_are_equal = 0;
}
while (n < lim2)
if (*n++)
return 0;
if (masks_are_equal && (longer < 0))
for (lim2 = m - longer; m < lim2; )
if (*m++)
return 1;
return !masks_are_equal;
}
struct radix_node *
rn_lookup(
const void *v_arg,
const void *m_arg,
struct radix_node_head *head)
{
struct radix_node *x;
const char *netmask = NULL;
if (m_arg) {
if ((x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_off)) == 0)
return NULL;
netmask = x->rn_key;
}
x = rn_match(v_arg, head);
if (x != NULL && netmask != NULL) {
while (x != NULL && x->rn_mask != netmask)
x = x->rn_dupedkey;
}
return x;
}
static int
rn_satisfies_leaf(
const char *trial,
struct radix_node *leaf,
int skip)
{
const char *cp = trial;
const char *cp2 = leaf->rn_key;
const char *cp3 = leaf->rn_mask;
const char *cplim;
int length = uimin(*(const u_char *)cp, *(const u_char *)cp2);
if (cp3 == 0)
cp3 = rn_ones;
else
length = uimin(length, *(const u_char *)cp3);
cplim = cp + length; cp3 += skip; cp2 += skip;
for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
if ((*cp ^ *cp2) & *cp3)
return 0;
return 1;
}
struct radix_node *
rn_match(
const void *v_arg,
struct radix_node_head *head)
{
const char * const v = v_arg;
struct radix_node *t = head->rnh_treetop;
struct radix_node *top = t;
struct radix_node *x;
struct radix_node *saved_t;
const char *cp = v;
const char *cp2;
const char *cplim;
int off = t->rn_off;
int vlen = *(const u_char *)cp;
int matched_off;
int test, b, rn_b;
/*
* Open code rn_search(v, top) to avoid overhead of extra
* subroutine call.
*/
for (; t->rn_b >= 0; ) {
if (t->rn_bmask & cp[t->rn_off])
t = t->rn_r;
else
t = t->rn_l;
}
/*
* See if we match exactly as a host destination
* or at least learn how many bits match, for normal mask finesse.
*
* It doesn't hurt us to limit how many bytes to check
* to the length of the mask, since if it matches we had a genuine
* match and the leaf we have is the most specific one anyway;
* if it didn't match with a shorter length it would fail
* with a long one. This wins big for class B&C netmasks which
* are probably the most common case...
*/
if (t->rn_mask)
vlen = *(const u_char *)t->rn_mask;
cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
for (; cp < cplim; cp++, cp2++)
if (*cp != *cp2)
goto on1;
/*
* This extra grot is in case we are explicitly asked
* to look up the default. Ugh!
*/
if ((t->rn_flags & RNF_ROOT) && t->rn_dupedkey)
t = t->rn_dupedkey;
return t;
on1:
test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
for (b = 7; (test >>= 1) > 0;)
b--;
matched_off = cp - v;
b += matched_off << 3;
rn_b = -1 - b;
/*
* If there is a host route in a duped-key chain, it will be first.
*/
if ((saved_t = t)->rn_mask == 0)
t = t->rn_dupedkey;
for (; t; t = t->rn_dupedkey)
/*
* Even if we don't match exactly as a host,
* we may match if the leaf we wound up at is
* a route to a net.
*/
if (t->rn_flags & RNF_NORMAL) {
if (rn_b <= t->rn_b)
return t;
} else if (rn_satisfies_leaf(v, t, matched_off))
return t;
t = saved_t;
/* start searching up the tree */
do {
struct radix_mask *m;
t = t->rn_p;
m = t->rn_mklist;
if (m) {
/*
* If non-contiguous masks ever become important
* we can restore the masking and open coding of
* the search and satisfaction test and put the
* calculation of "off" back before the "do".
*/
do {
if (m->rm_flags & RNF_NORMAL) {
if (rn_b <= m->rm_b)
return m->rm_leaf;
} else {
off = uimin(t->rn_off, matched_off);
x = rn_search_m(v, t, m->rm_mask);
while (x && x->rn_mask != m->rm_mask)
x = x->rn_dupedkey;
if (x && rn_satisfies_leaf(v, x, off))
return x;
}
m = m->rm_mklist;
} while (m);
}
} while (t != top);
return NULL;
}
static void
rn_nodeprint(struct radix_node *rn, rn_printer_t printer, void *arg,
const char *delim)
{
(*printer)(arg, "%s(%s%p: p<%p> l<%p> r<%p>)",
delim, ((void *)rn == arg) ? "*" : "", rn, rn->rn_p,
rn->rn_l, rn->rn_r);
}
#ifdef RN_DEBUG
int rn_debug = 1;
static void
rn_dbg_print(void *arg, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
vlog(LOG_DEBUG, fmt, ap);
va_end(ap);
}
static void
rn_treeprint(struct radix_node_head *h, rn_printer_t printer, void *arg)
{
struct radix_node *dup, *rn;
const char *delim;
if (printer == NULL)
return;
rn = rn_walkfirst(h->rnh_treetop, printer, arg);
for (;;) {
/* Process leaves */
delim = "";
for (dup = rn; dup != NULL; dup = dup->rn_dupedkey) {
if ((dup->rn_flags & RNF_ROOT) != 0)
continue;
rn_nodeprint(dup, printer, arg, delim);
delim = ", ";
}
rn = rn_walknext(rn, printer, arg);
if (rn->rn_flags & RNF_ROOT)
return;
}
/* NOTREACHED */
}
#define traverse(__head, __rn) rn_treeprint((__head), rn_dbg_print, (__rn))
#endif /* RN_DEBUG */
struct radix_node *
rn_newpair(
const void *v,
int b,
struct radix_node nodes[2])
{
struct radix_node *tt = nodes;
struct radix_node *t = tt + 1;
t->rn_b = b; t->rn_bmask = 0x80 >> (b & 7);
t->rn_l = tt; t->rn_off = b >> 3;
tt->rn_b = -1; tt->rn_key = v; tt->rn_p = t;
tt->rn_flags = t->rn_flags = RNF_ACTIVE;
return t;
}
struct radix_node *
rn_insert(
const void *v_arg,
struct radix_node_head *head,
int *dupentry,
struct radix_node nodes[2])
{
struct radix_node *top = head->rnh_treetop;
struct radix_node *t = rn_search(v_arg, top);
struct radix_node *tt;
const char *v = v_arg;
int head_off = top->rn_off;
int vlen = *((const u_char *)v);
const char *cp = v + head_off;
int b;
/*
* Find first bit at which v and t->rn_key differ
*/
{
const char *cp2 = t->rn_key + head_off;
const char *cplim = v + vlen;
int cmp_res;
while (cp < cplim)
if (*cp2++ != *cp++)
goto on1;
*dupentry = 1;
return t;
on1:
*dupentry = 0;
cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
for (b = (cp - v) << 3; cmp_res; b--)
cmp_res >>= 1;
}
{
struct radix_node *p, *x = top;
cp = v;
do {
p = x;
if (cp[x->rn_off] & x->rn_bmask)
x = x->rn_r;
else x = x->rn_l;
} while (b > (unsigned) x->rn_b); /* x->rn_b < b && x->rn_b >= 0 */
#ifdef RN_DEBUG
if (rn_debug)
log(LOG_DEBUG, "%s: Going In:\n", __func__), traverse(head, p);
#endif
t = rn_newpair(v_arg, b, nodes); tt = t->rn_l;
if ((cp[p->rn_off] & p->rn_bmask) == 0)
p->rn_l = t;
else
p->rn_r = t;
x->rn_p = t; t->rn_p = p; /* frees x, p as temp vars below */
if ((cp[t->rn_off] & t->rn_bmask) == 0) {
t->rn_r = x;
} else {
t->rn_r = tt; t->rn_l = x;
}
#ifdef RN_DEBUG
if (rn_debug) {
log(LOG_DEBUG, "%s: Coming Out:\n", __func__),
traverse(head, p);
}
#endif /* RN_DEBUG */
}
return tt;
}
struct radix_node *
rn_addmask(
const void *n_arg,
int search,
int skip)
{
const char *netmask = n_arg;
const char *cp;
const char *cplim;
struct radix_node *x;
struct radix_node *saved_x;
int b = 0, mlen, j;
int maskduplicated, m0, isnormal;
static int last_zeroed = 0;
if ((mlen = *(const u_char *)netmask) > max_keylen)
mlen = max_keylen;
if (skip == 0)
skip = 1;
if (mlen <= skip)
return mask_rnhead->rnh_nodes;
if (skip > 1)
memmove(addmask_key + 1, rn_ones + 1, skip - 1);
if ((m0 = mlen) > skip)
memmove(addmask_key + skip, netmask + skip, mlen - skip);
/*
* Trim trailing zeroes.
*/
for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;)
cp--;
mlen = cp - addmask_key;
if (mlen <= skip) {
if (m0 >= last_zeroed)
last_zeroed = mlen;
return mask_rnhead->rnh_nodes;
}
if (m0 < last_zeroed)
memset(addmask_key + m0, 0, last_zeroed - m0);
*addmask_key = last_zeroed = mlen;
x = rn_search(addmask_key, rn_masktop);
if (memcmp(addmask_key, x->rn_key, mlen) != 0)
x = 0;
if (x || search)
return x;
R_Malloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
if ((saved_x = x) == NULL)
return NULL;
memset(x, 0, max_keylen + 2 * sizeof (*x));
cp = netmask = (void *)(x + 2);
memmove(x + 2, addmask_key, mlen);
x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
if (maskduplicated) {
log(LOG_ERR, "rn_addmask: mask impossibly already in tree\n");
Free(saved_x);
return x;
}
/*
* Calculate index of mask, and check for normalcy.
*/
cplim = netmask + mlen; isnormal = 1;
for (cp = netmask + skip; (cp < cplim) && *(const u_char *)cp == 0xff;)
cp++;
if (cp != cplim) {
for (j = 0x80; (j & *cp) != 0; j >>= 1)
b++;
if (*cp != normal_chars[b] || cp != (cplim - 1))
isnormal = 0;
}
b += (cp - netmask) << 3;
x->rn_b = -1 - b;
if (isnormal)
x->rn_flags |= RNF_NORMAL;
return x;
}
static int /* XXX: arbitrary ordering for non-contiguous masks */
rn_lexobetter(
const void *m_arg,
const void *n_arg)
{
const u_char *mp = m_arg;
const u_char *np = n_arg;
const u_char *lim;
if (*mp > *np)
return 1; /* not really, but need to check longer one first */
if (*mp == *np)
for (lim = mp + *mp; mp < lim;)
if (*mp++ > *np++)
return 1;
return 0;
}
static struct radix_mask *
rn_new_radix_mask(
struct radix_node *tt,
struct radix_mask *next)
{
struct radix_mask *m;
MKGet(m);
if (m == NULL) {
log(LOG_ERR, "Mask for route not entered\n");
return NULL;
}
memset(m, 0, sizeof(*m));
m->rm_b = tt->rn_b;
m->rm_flags = tt->rn_flags;
if (tt->rn_flags & RNF_NORMAL)
m->rm_leaf = tt;
else
m->rm_mask = tt->rn_mask;
m->rm_mklist = next;
tt->rn_mklist = m;
return m;
}
struct radix_node *
rn_addroute(
const void *v_arg,
const void *n_arg,
struct radix_node_head *head,
struct radix_node treenodes[2])
{
const char *v = v_arg, *netmask = n_arg;
struct radix_node *t, *x = NULL, *tt;
struct radix_node *saved_tt, *top = head->rnh_treetop;
short b = 0, b_leaf = 0;
int keyduplicated;
const char *mmask;
struct radix_mask *m, **mp;
/*
* In dealing with non-contiguous masks, there may be
* many different routes which have the same mask.
* We will find it useful to have a unique pointer to
* the mask to speed avoiding duplicate references at
* nodes and possibly save time in calculating indices.
*/
if (netmask != NULL) {
if ((x = rn_addmask(netmask, 0, top->rn_off)) == NULL)
return NULL;
b_leaf = x->rn_b;
b = -1 - x->rn_b;
netmask = x->rn_key;
}
/*
* Deal with duplicated keys: attach node to previous instance
*/
saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
if (keyduplicated) {
for (t = tt; tt != NULL; t = tt, tt = tt->rn_dupedkey) {
if (tt->rn_mask == netmask)
return NULL;
if (netmask == NULL ||
(tt->rn_mask != NULL &&
(b_leaf < tt->rn_b || /* index(netmask) > node */
rn_refines(netmask, tt->rn_mask) ||
rn_lexobetter(netmask, tt->rn_mask))))
break;
}
/*
* If the mask is not duplicated, we wouldn't
* find it among possible duplicate key entries
* anyway, so the above test doesn't hurt.
*
* We sort the masks for a duplicated key the same way as
* in a masklist -- most specific to least specific.
* This may require the unfortunate nuisance of relocating
* the head of the list.
*
* We also reverse, or doubly link the list through the
* parent pointer.
*/
if (tt == saved_tt) {
struct radix_node *xx = x;
/* link in at head of list */
(tt = treenodes)->rn_dupedkey = t;
tt->rn_flags = t->rn_flags;
tt->rn_p = x = t->rn_p;
t->rn_p = tt;
if (x->rn_l == t)
x->rn_l = tt;
else
x->rn_r = tt;
saved_tt = tt;
x = xx;
} else {
(tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
t->rn_dupedkey = tt;
tt->rn_p = t;
if (tt->rn_dupedkey)
tt->rn_dupedkey->rn_p = tt;
}
tt->rn_key = v;
tt->rn_b = -1;
tt->rn_flags = RNF_ACTIVE;
}
/*
* Put mask in tree.
*/
if (netmask != NULL) {
tt->rn_mask = netmask;
tt->rn_b = x->rn_b;
tt->rn_flags |= x->rn_flags & RNF_NORMAL;
}
t = saved_tt->rn_p;
if (keyduplicated)
goto on2;
b_leaf = -1 - t->rn_b;
if (t->rn_r == saved_tt)
x = t->rn_l;
else
x = t->rn_r;
/* Promote general routes from below */
if (x->rn_b < 0) {
for (mp = &t->rn_mklist; x != NULL; x = x->rn_dupedkey) {
if (x->rn_mask != NULL && x->rn_b >= b_leaf &&
x->rn_mklist == NULL) {
*mp = m = rn_new_radix_mask(x, NULL);
if (m != NULL)
mp = &m->rm_mklist;
}
}
} else if (x->rn_mklist != NULL) {
/*
* Skip over masks whose index is > that of new node
*/
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
if (m->rm_b >= b_leaf)
break;
t->rn_mklist = m;
*mp = NULL;
}
on2:
/* Add new route to highest possible ancestor's list */
if (netmask == NULL || b > t->rn_b)
return tt; /* can't lift at all */
b_leaf = tt->rn_b;
do {
x = t;
t = t->rn_p;
} while (b <= t->rn_b && x != top);
/*
* Search through routes associated with node to
* insert new route according to index.
* Need same criteria as when sorting dupedkeys to avoid
* double loop on deletion.
*/
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) {
if (m->rm_b < b_leaf)
continue;
if (m->rm_b > b_leaf)
break;
if (m->rm_flags & RNF_NORMAL) {
mmask = m->rm_leaf->rn_mask;
if (tt->rn_flags & RNF_NORMAL) {
log(LOG_ERR, "Non-unique normal route,"
" mask not entered\n");
return tt;
}
} else
mmask = m->rm_mask;
if (mmask == netmask) {
m->rm_refs++;
tt->rn_mklist = m;
return tt;
}
if (rn_refines(netmask, mmask) || rn_lexobetter(netmask, mmask))
break;
}
*mp = rn_new_radix_mask(tt, *mp);
return tt;
}
struct radix_node *
rn_delete1(
const void *v_arg,
const void *netmask_arg,
struct radix_node_head *head,
struct radix_node *rn)
{
struct radix_node *t, *p, *x, *tt;
struct radix_mask *m, *saved_m, **mp;
struct radix_node *dupedkey, *saved_tt, *top;
const char *v, *netmask;
int b, head_off, vlen;
v = v_arg;
netmask = netmask_arg;
x = head->rnh_treetop;
tt = rn_search(v, x);
head_off = x->rn_off;
vlen = *(const u_char *)v;
saved_tt = tt;
top = x;
if (tt == NULL ||
memcmp(v + head_off, tt->rn_key + head_off, vlen - head_off) != 0)
return NULL;
/*
* Delete our route from mask lists.
*/
if (netmask != NULL) {
if ((x = rn_addmask(netmask, 1, head_off)) == NULL)
return NULL;
netmask = x->rn_key;
while (tt->rn_mask != netmask)
if ((tt = tt->rn_dupedkey) == NULL)
return NULL;
}
if (tt->rn_mask == NULL || (saved_m = m = tt->rn_mklist) == NULL)
goto on1;
if (tt->rn_flags & RNF_NORMAL) {
if (m->rm_leaf != tt || m->rm_refs > 0) {
log(LOG_ERR, "rn_delete: inconsistent annotation\n");
return NULL; /* dangling ref could cause disaster */
}
} else {
if (m->rm_mask != tt->rn_mask) {
log(LOG_ERR, "rn_delete: inconsistent annotation\n");
goto on1;
}
if (--m->rm_refs >= 0)
goto on1;
}
b = -1 - tt->rn_b;
t = saved_tt->rn_p;
if (b > t->rn_b)
goto on1; /* Wasn't lifted at all */
do {
x = t;
t = t->rn_p;
} while (b <= t->rn_b && x != top);
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) {
if (m == saved_m) {
*mp = m->rm_mklist;
MKFree(m);
break;
}
}
if (m == NULL) {
log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
if (tt->rn_flags & RNF_NORMAL)
return NULL; /* Dangling ref to us */
}
on1:
/*
* Eliminate us from tree
*/
if (tt->rn_flags & RNF_ROOT)
return NULL;
#ifdef RN_DEBUG
if (rn_debug)
log(LOG_DEBUG, "%s: Going In:\n", __func__), traverse(head, tt);
#endif
t = tt->rn_p;
dupedkey = saved_tt->rn_dupedkey;
if (dupedkey != NULL) {
/*
* Here, tt is the deletion target, and
* saved_tt is the head of the dupedkey chain.
*/
if (tt == saved_tt) {
x = dupedkey;
x->rn_p = t;
if (t->rn_l == tt)
t->rn_l = x;
else
t->rn_r = x;
} else {
/* find node in front of tt on the chain */
for (x = p = saved_tt;
p != NULL && p->rn_dupedkey != tt;)
p = p->rn_dupedkey;
if (p != NULL) {
p->rn_dupedkey = tt->rn_dupedkey;
if (tt->rn_dupedkey != NULL)
tt->rn_dupedkey->rn_p = p;
} else
log(LOG_ERR, "rn_delete: couldn't find us\n");
}
t = tt + 1;
if (t->rn_flags & RNF_ACTIVE) {
*++x = *t;
p = t->rn_p;
if (p->rn_l == t)
p->rn_l = x;
else
p->rn_r = x;
x->rn_l->rn_p = x;
x->rn_r->rn_p = x;
}
goto out;
}
if (t->rn_l == tt)
x = t->rn_r;
else
x = t->rn_l;
p = t->rn_p;
if (p->rn_r == t)
p->rn_r = x;
else
p->rn_l = x;
x->rn_p = p;
/*
* Demote routes attached to us.
*/
if (t->rn_mklist == NULL)
;
else if (x->rn_b >= 0) {
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
;
*mp = t->rn_mklist;
} else {
/* If there are any key,mask pairs in a sibling
duped-key chain, some subset will appear sorted
in the same order attached to our mklist */
for (m = t->rn_mklist;
m != NULL && x != NULL;
x = x->rn_dupedkey) {
if (m == x->rn_mklist) {
struct radix_mask *mm = m->rm_mklist;
x->rn_mklist = NULL;
if (--(m->rm_refs) < 0)
MKFree(m);
m = mm;
}
}
if (m != NULL) {
log(LOG_ERR, "rn_delete: Orphaned Mask %p at %p\n",
m, x);
}
}
/*
* We may be holding an active internal node in the tree.
*/
x = tt + 1;
if (t != x) {
*t = *x;
t->rn_l->rn_p = t;
t->rn_r->rn_p = t;
p = x->rn_p;
if (p->rn_l == x)
p->rn_l = t;
else
p->rn_r = t;
}
out:
#ifdef RN_DEBUG
if (rn_debug) {
log(LOG_DEBUG, "%s: Coming Out:\n", __func__),
traverse(head, tt);
}
#endif /* RN_DEBUG */
tt->rn_flags &= ~RNF_ACTIVE;
tt[1].rn_flags &= ~RNF_ACTIVE;
return tt;
}
struct radix_node *
rn_delete(
const void *v_arg,
const void *netmask_arg,
struct radix_node_head *head)
{
return rn_delete1(v_arg, netmask_arg, head, NULL);
}
static struct radix_node *
rn_walknext(struct radix_node *rn, rn_printer_t printer, void *arg)
{
/* If at right child go back up, otherwise, go right */
while (rn->rn_p->rn_r == rn && (rn->rn_flags & RNF_ROOT) == 0) {
if (printer != NULL)
(*printer)(arg, SUBTREE_CLOSE);
rn = rn->rn_p;
}
if (printer)
rn_nodeprint(rn->rn_p, printer, arg, "");
/* Find the next *leaf* since next node might vanish, too */
for (rn = rn->rn_p->rn_r; rn->rn_b >= 0;) {
if (printer != NULL)
(*printer)(arg, SUBTREE_OPEN);
rn = rn->rn_l;
}
return rn;
}
static struct radix_node *
rn_walkfirst(struct radix_node *rn, rn_printer_t printer, void *arg)
{
/* First time through node, go left */
while (rn->rn_b >= 0) {
if (printer != NULL)
(*printer)(arg, SUBTREE_OPEN);
rn = rn->rn_l;
}
return rn;
}
int
rn_walktree(
struct radix_node_head *h,
int (*f)(struct radix_node *, void *),
void *w)
{
int error;
struct radix_node *base, *next, *rn;
/*
* This gets complicated because we may delete the node
* while applying the function f to it, so we need to calculate
* the successor node in advance.
*/
rn = rn_walkfirst(h->rnh_treetop, NULL, NULL);
for (;;) {
base = rn;
next = rn_walknext(rn, NULL, NULL);
/* Process leaves */
while ((rn = base) != NULL) {
base = rn->rn_dupedkey;
if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w)))
return error;
}
rn = next;
if (rn->rn_flags & RNF_ROOT)
return 0;
}
/* NOTREACHED */
}
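/*
 * Illustrative sketch (not part of the original source): rn_walktree()
 * applies a caller-supplied function to every leaf.  A hypothetical
 * callback that simply counts the leaves of an already-initialized head
 * "rnh" could look like this; a non-zero return from the callback
 * aborts the walk and propagates that value to the caller.
 *
 *	static int
 *	count_leaf(struct radix_node *rn, void *arg)
 *	{
 *		unsigned int *countp = arg;
 *
 *		(*countp)++;
 *		return 0;
 *	}
 *
 *	unsigned int count = 0;
 *	(void)rn_walktree(rnh, count_leaf, &count);
 */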
struct radix_node *
rn_search_matched(struct radix_node_head *h,
int (*matcher)(struct radix_node *, void *), void *w)
{
bool matched;
struct radix_node *base, *next, *rn;
/*
* This gets complicated because we may delete the node
* while applying the matcher function to it, so we need to calculate
* the successor node in advance.
*/
rn = rn_walkfirst(h->rnh_treetop, NULL, NULL);
for (;;) {
base = rn;
next = rn_walknext(rn, NULL, NULL);
/* Process leaves */
while ((rn = base) != NULL) {
base = rn->rn_dupedkey;
if (!(rn->rn_flags & RNF_ROOT)) {
matched = (*matcher)(rn, w);
if (matched)
return rn;
}
}
rn = next;
if (rn->rn_flags & RNF_ROOT)
return NULL;
}
/* NOTREACHED */
}
struct delayinit {
void **head;
int off;
SLIST_ENTRY(delayinit) entries;
};
static SLIST_HEAD(, delayinit) delayinits = SLIST_HEAD_INITIALIZER(delayheads);
static int radix_initialized;
/*
* Record a radix tree head to be initialized once radix itself has been
* initialized. Only for bootstrap.
* Assume that no concurrency protection is necessary at this stage.
*/
void
rn_delayedinit(void **head, int off)
{
struct delayinit *di;
if (radix_initialized)
return;
di = kmem_alloc(sizeof(*di), KM_SLEEP);
di->head = head;
di->off = off;
SLIST_INSERT_HEAD(&delayinits, di, entries);
}
int
rn_inithead(void **head, int off)
{
struct radix_node_head *rnh;
if (*head != NULL)
return 1;
R_Malloc(rnh, struct radix_node_head *, sizeof (*rnh));
if (rnh == NULL)
return 0;
*head = rnh;
return rn_inithead0(rnh, off);
}
int
rn_inithead0(struct radix_node_head *rnh, int off)
{
struct radix_node *t;
struct radix_node *tt;
struct radix_node *ttt;
memset(rnh, 0, sizeof(*rnh));
t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
ttt = rnh->rnh_nodes + 2;
t->rn_r = ttt;
t->rn_p = t;
tt = t->rn_l;
tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
tt->rn_b = -1 - off;
*ttt = *tt;
ttt->rn_key = rn_ones;
rnh->rnh_addaddr = rn_addroute;
rnh->rnh_deladdr = rn_delete;
rnh->rnh_matchaddr = rn_match;
rnh->rnh_lookup = rn_lookup;
rnh->rnh_treetop = t;
return 1;
}
void
rn_init(void)
{
char *cp, *cplim;
struct delayinit *di;
#ifdef _KERNEL
struct domain *dp;
if (radix_initialized)
panic("radix already initialized");
radix_initialized = 1;
DOMAIN_FOREACH(dp) {
if (dp->dom_maxrtkey > max_keylen)
max_keylen = dp->dom_maxrtkey;
}
#endif
if (max_keylen == 0) {
#ifndef _KERNEL
log(LOG_ERR,
"rn_init: radix functions require max_keylen be set\n");
#endif
return;
}
R_Malloc(rn_zeros, char *, 3 * max_keylen);
if (rn_zeros == NULL)
panic("rn_init");
memset(rn_zeros, 0, 3 * max_keylen);
rn_ones = cp = rn_zeros + max_keylen;
addmask_key = cplim = rn_ones + max_keylen;
while (cp < cplim)
*cp++ = -1;
if (rn_inithead((void *)&mask_rnhead, 0) == 0)
panic("rn_init 2");
while ((di = SLIST_FIRST(&delayinits)) != NULL) {
if (!rn_inithead(di->head, di->off))
panic("delayed rn_inithead failed");
SLIST_REMOVE_HEAD(&delayinits, entries);
kmem_free(di, sizeof(*di));
}
}
/* $NetBSD: compat_50_quota.c,v 1.4 2022/09/21 07:15:24 dholland Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: compat_50_quota.c,v 1.4 2022/09/21 07:15:24 dholland Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/param.h>
#include <sys/quota.h>
#include <sys/quotactl.h>
#include <sys/systm.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/vfs_syscalls.h>
#include <sys/vnode.h>
#include <ufs/ufs/quota1.h>
static const struct syscall_package vfs_syscalls_50_quota_syscalls[] = {
{ SYS_compat_50_quotactl, 0, (sy_call_t *)compat_50_sys_quotactl },
{ 0, 0, NULL }
};
/* ARGSUSED */
int
compat_50_sys_quotactl(struct lwp *l, const struct compat_50_sys_quotactl_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(int) cmd;
syscallarg(int) uid;
syscallarg(void *) arg;
} */
struct vnode *vp;
struct mount *mp;
int q1cmd;
int idtype;
char *qfile;
struct dqblk dqblk;
struct quotakey key;
struct quotaval blocks, files;
struct quotastat qstat;
int error;
error = namei_simple_user(SCARG(uap, path),
NSM_FOLLOW_TRYEMULROOT, &vp);
if (error != 0)
return (error);
mp = vp->v_mount;
q1cmd = SCARG(uap, cmd);
idtype = quota_idtype_from_ufs(q1cmd & SUBCMDMASK);
if (idtype == -1) {
return EINVAL;
}
switch ((q1cmd & ~SUBCMDMASK) >> SUBCMDSHIFT) {
case Q_QUOTAON:
qfile = PNBUF_GET();
error = copyinstr(SCARG(uap, arg), qfile, PATH_MAX, NULL);
if (error != 0) {
PNBUF_PUT(qfile);
break;
}
error = vfs_quotactl_quotaon(mp, idtype, qfile);
PNBUF_PUT(qfile);
break;
case Q_QUOTAOFF:
error = vfs_quotactl_quotaoff(mp, idtype);
break;
case Q_GETQUOTA:
key.qk_idtype = idtype;
key.qk_id = SCARG(uap, uid);
key.qk_objtype = QUOTA_OBJTYPE_BLOCKS;
error = vfs_quotactl_get(mp, &key, &blocks);
if (error) {
break;
}
key.qk_objtype = QUOTA_OBJTYPE_FILES;
error = vfs_quotactl_get(mp, &key, &files);
if (error) {
break;
}
quotavals_to_dqblk(&blocks, &files, &dqblk);
error = copyout(&dqblk, SCARG(uap, arg), sizeof(dqblk));
break;
case Q_SETQUOTA:
error = copyin(SCARG(uap, arg), &dqblk, sizeof(dqblk));
if (error) {
break;
}
dqblk_to_quotavals(&dqblk, &blocks, &files);
key.qk_idtype = idtype;
key.qk_id = SCARG(uap, uid);
key.qk_objtype = QUOTA_OBJTYPE_BLOCKS;
error = vfs_quotactl_put(mp, &key, &blocks);
if (error) {
break;
}
key.qk_objtype = QUOTA_OBJTYPE_FILES;
error = vfs_quotactl_put(mp, &key, &files);
break;
case Q_SYNC:
/*
* not supported but used only to see if quota is supported,
* emulate with stat
*
* XXX should probably be supported
*/
(void)idtype; /* not used */
error = vfs_quotactl_stat(mp, &qstat);
break;
case Q_SETUSE:
default:
error = EOPNOTSUPP;
break;
}
vrele(vp);
return error;
}
MODULE(MODULE_CLASS_EXEC, compat_50_quota, "compat_50,ufs");
static int
compat_50_quota_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return syscall_establish(NULL, vfs_syscalls_50_quota_syscalls);
case MODULE_CMD_FINI:
return syscall_disestablish(NULL, vfs_syscalls_50_quota_syscalls);
default:
return ENOTTY;
}
}
/* $NetBSD: hash.h,v 1.8 2014/09/05 05:46:15 matt Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Luke Mewburn.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_HASH_H_
#define _SYS_HASH_H_
#include <sys/types.h>
#ifdef __HAVE_MACHINE_HASH_H
#include <machine/hash.h>
#endif
#ifndef __HAVE_HASH32_BUF /* not overridden by MD hash */
#define HASH32_BUF_INIT 5381
/*
* uint32_t
* hash32_buf(const void *bf, size_t len, uint32_t hash)
* return a 32 bit hash of the binary buffer buf (size len),
* seeded with an initial hash value of hash (usually HASH32_BUF_INIT).
*/
static __inline uint32_t
hash32_buf(const void *bf, size_t len, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
while (len-- != 0) /* "nemesi": k=257, r=r*257 */
hash = hash * 257 + *s++;
return (hash * 257);
}
#endif /* __HAVE_HASH32_BUF */
#ifndef __HAVE_HASH32_STR /* not overridden by MD hash */
#define HASH32_STR_INIT 5381
/*
* uint32_t
* hash32_str(const void *bf, uint32_t hash)
* return a 32 bit hash of NUL terminated ASCII string buf,
* seeded with an initial hash value of hash (usually HASH32_STR_INIT).
*/
static __inline uint32_t
hash32_str(const void *bf, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
uint8_t c;
while ((c = *s++) != 0)
hash = hash * 33 + c; /* "perl": k=33, r=r+r/32 */
return (hash + (hash >> 5));
}
/*
* uint32_t
* hash32_strn(const void *bf, size_t len, uint32_t hash)
* return a 32 bit hash of NUL terminated ASCII string buf up to
* a maximum of len bytes,
* seeded with an initial hash value of hash (usually HASH32_STR_INIT).
*/
static __inline uint32_t
hash32_strn(const void *bf, size_t len, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
uint8_t c;
while ((c = *s++) != 0 && len-- != 0)
hash = hash * 33 + c; /* "perl": k=33, r=r+r/32 */
return (hash + (hash >> 5));
}
#endif /* __HAVE_HASH32_STR */
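/*
 * Illustrative sketch (not part of the original source): these hashes
 * are typically reduced to a bucket index for a power-of-two sized
 * table.  "namecache" and NC_HASHSIZE below are hypothetical names used
 * only for the example.
 *
 *	#define NC_HASHSIZE	128		// must be a power of two
 *	struct namecache *nc_hashtbl[NC_HASHSIZE];
 *
 *	static struct namecache **
 *	nc_bucket(const char *name)
 *	{
 *		uint32_t h = hash32_str(name, HASH32_STR_INIT);
 *
 *		return &nc_hashtbl[h & (NC_HASHSIZE - 1)];
 *	}
 */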
__BEGIN_DECLS
uint32_t murmurhash2(const void *, size_t, uint32_t);
__END_DECLS
#endif /* !_SYS_HASH_H_ */
/* $NetBSD: kern_fork.c,v 1.230 2023/02/25 08:22:00 skrll Exp $ */
/*-
* Copyright (c) 1999, 2001, 2004, 2006, 2007, 2008, 2019
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_fork.c 8.8 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_fork.c,v 1.230 2023/02/25 08:22:00 skrll Exp $");
#include "opt_ktrace.h"
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/ras.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/syscallargs.h>
#include <sys/uidinfo.h>
#include <sys/sdt.h>
#include <sys/ptrace.h>
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE3(proc, kernel, , create,
"struct proc *", /* new process */
"struct proc *", /* parent process */
"int" /* flags */);
u_int nprocs __cacheline_aligned = 1; /* process 0 */
/*
* Number of ticks to sleep if fork() would fail due to process hitting
* limits. Exported in milliseconds to userland via sysctl.
*/
int forkfsleep = 0;
int
sys_fork(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, 0, SIGCHLD, NULL, 0, NULL, NULL, retval);
}
/*
* vfork(2) system call compatible with 4.4BSD (i.e. BSD with Mach VM).
* Address space is not shared, but parent is blocked until child exit.
*/
int
sys_vfork(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, FORK_PPWAIT, SIGCHLD, NULL, 0, NULL, NULL,
retval);
}
/*
* New vfork(2) system call for NetBSD, which implements original 3BSD vfork(2)
* semantics. Address space is shared, and parent is blocked until child exit.
*/
int
sys___vfork14(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, FORK_PPWAIT|FORK_SHAREVM, SIGCHLD, NULL, 0,
NULL, NULL, retval);
}
/*
* Linux-compatible __clone(2) system call.
*/
int
sys___clone(struct lwp *l, const struct sys___clone_args *uap,
register_t *retval)
{
/* {
syscallarg(int) flags;
syscallarg(void *) stack;
} */
int flags, sig;
/*
* We don't support the CLONE_PTRACE flag.
*/
if (SCARG(uap, flags) & (CLONE_PTRACE))
return EINVAL;
/*
* Linux enforces CLONE_VM with CLONE_SIGHAND, do same.
*/
if (SCARG(uap, flags) & CLONE_SIGHAND
&& (SCARG(uap, flags) & CLONE_VM) == 0)
return EINVAL;
flags = 0;
if (SCARG(uap, flags) & CLONE_VM)
flags |= FORK_SHAREVM;
if (SCARG(uap, flags) & CLONE_FS)
flags |= FORK_SHARECWD;
if (SCARG(uap, flags) & CLONE_FILES)
flags |= FORK_SHAREFILES;
if (SCARG(uap, flags) & CLONE_SIGHAND)
flags |= FORK_SHARESIGS;
if (SCARG(uap, flags) & CLONE_VFORK)
flags |= FORK_PPWAIT;
sig = SCARG(uap, flags) & CLONE_CSIGNAL;
if (sig < 0 || sig >= _NSIG)
return EINVAL;
/*
* Note that the Linux API does not provide a portable way of
* specifying the stack area; the caller must know if the stack
* grows up or down. So, we pass a stack size of 0, so that the
* code that makes this adjustment is a noop.
*/
return fork1(l, flags, sig, SCARG(uap, stack), 0,
NULL, NULL, retval);
}
/*
* Print the 'table full' message once per 10 seconds.
*/
static struct timeval fork_tfmrate = { 10, 0 };
/*
* Check if a process is traced and shall inform about FORK events.
*/
static inline bool
tracefork(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEFORK|PSL_TRACED)) ==
(PSL_TRACEFORK|PSL_TRACED) && (flags & FORK_PPWAIT) == 0;
}
/*
* Check if a process is traced and shall inform about VFORK events.
*/
static inline bool
tracevfork(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEVFORK|PSL_TRACED)) ==
(PSL_TRACEVFORK|PSL_TRACED) && (flags & FORK_PPWAIT) != 0;
}
/*
* Check if a process is traced and shall inform about VFORK_DONE events.
*/
static inline bool
tracevforkdone(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEVFORK_DONE|PSL_TRACED)) ==
(PSL_TRACEVFORK_DONE|PSL_TRACED) && (flags & FORK_PPWAIT);
}
/*
* General fork call. Note that another LWP in the process may call exec()
* or exit() while we are forking. It's safe to continue here, because
* neither operation will complete until all LWPs have exited the process.
*/
int
fork1(struct lwp *l1, int flags, int exitsig, void *stack, size_t stacksize,
void (*func)(void *), void *arg, register_t *retval)
{
struct proc *p1, *p2, *parent;
struct plimit *p1_lim;
uid_t uid;
struct lwp *l2;
int count;
vaddr_t uaddr;
int tnprocs;
int error = 0;
p1 = l1->l_proc;
uid = kauth_cred_getuid(l1->l_cred);
tnprocs = atomic_inc_uint_nv(&nprocs);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create.
*/
if (__predict_false(tnprocs >= maxproc))
error = -1;
else
error = kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
if (error) {
static struct timeval lasttfm;
atomic_dec_uint(&nprocs);
if (ratecheck(&lasttfm, &fork_tfmrate))
tablefull("proc", "increase kern.maxproc or NPROC");
if (forkfsleep)
kpause("forkmx", false, forkfsleep, NULL);
return EAGAIN;
}
/*
* Enforce limits.
*/
count = chgproccnt(uid, 1);
if (__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
if (forkfsleep)
kpause("forkulim", false, forkfsleep, NULL);
return EAGAIN;
}
}
/*
* Allocate virtual address space for the U-area now, while it
* is still easy to abort the fork operation if we're out of
* kernel virtual address space.
*/
uaddr = uvm_uarea_alloc();
if (__predict_false(uaddr == 0)) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
return ENOMEM;
}
/* Allocate new proc. */
p2 = proc_alloc();
if (p2 == NULL) {
/* We were unable to allocate a process ID. */
uvm_uarea_free(uaddr);
mutex_enter(p1->p_lock);
uid = kauth_cred_getuid(p1->p_cred);
(void)chgproccnt(uid, -1);
mutex_exit(p1->p_lock);
atomic_dec_uint(&nprocs);
return EAGAIN;
}
/*
* We are now committed to the fork. From here on, we may
* block on resources, but resource allocation may NOT fail.
*/
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
memset(&p2->p_startzero, 0,
(unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
memcpy(&p2->p_startcopy, &p1->p_startcopy,
(unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
TAILQ_INIT(&p2->p_sigpend.sp_info);
LIST_INIT(&p2->p_lwps);
LIST_INIT(&p2->p_sigwaiters);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
* Inherit flags we want to keep. The flags related to SIGCHLD
* handling are important in order to keep a consistent behaviour
* for the child after the fork. If we are a 32-bit process, the
* child will be too.
*/
p2->p_flag =
p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
p2->p_emul = p1->p_emul;
p2->p_execsw = p1->p_execsw;
if (flags & FORK_SYSTEM) {
/*
* Mark it as a system process. Set P_NOCLDWAIT so that
* children are reparented to init(8) when they exit.
* init(8) can easily wait them out for us.
*/
p2->p_flag |= (PK_SYSTEM | PK_NOCLDWAIT);
}
mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&p2->p_reflock);
cv_init(&p2->p_waitcv, "wait");
cv_init(&p2->p_lwpcv, "lwpwait");
/*
* Share a lock between the processes if they are to share signal
* state: we must synchronize access to it.
*/
if (flags & FORK_SHARESIGS) {
p2->p_lock = p1->p_lock;
mutex_obj_hold(p1->p_lock);
} else
p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
kauth_proc_fork(p1, p2);
p2->p_raslist = NULL;
#if defined(__HAVE_RAS)
ras_fork(p1, p2);
#endif
/* bump references to the text vnode (for procfs) */
p2->p_textvp = p1->p_textvp;
if (p2->p_textvp)
vref(p2->p_textvp);
if (p1->p_path)
p2->p_path = kmem_strdupsize(p1->p_path, NULL, KM_SLEEP);
else
p2->p_path = NULL;
if (flags & FORK_SHAREFILES)
fd_share(p2);
else if (flags & FORK_CLEANFILES)
p2->p_fd = fd_init(NULL);
else
p2->p_fd = fd_copy();
/* XXX racy */
p2->p_mqueue_cnt = p1->p_mqueue_cnt;
if (flags & FORK_SHARECWD)
cwdshare(p2);
else
p2->p_cwdi = cwdinit();
/*
* Note: p_limit (rlimit stuff) is copy-on-write, so normally
* we just need to increase pl_refcnt.
*/
p1_lim = p1->p_limit;
if (!p1_lim->pl_writeable) {
lim_addref(p1_lim);
p2->p_limit = p1_lim;
} else {
p2->p_limit = lim_copy(p1_lim);
}
if (flags & FORK_PPWAIT) {
/* Mark ourselves as waiting for a child. */
p2->p_lflag = PL_PPWAIT;
l1->l_vforkwaiting = true;
p2->p_vforklwp = l1;
} else {
p2->p_lflag = 0;
l1->l_vforkwaiting = false;
}
p2->p_sflag = 0;
p2->p_slflag = 0;
parent = (flags & FORK_NOWAIT) ? initproc : p1;
p2->p_pptr = parent;
p2->p_ppid = parent->p_pid;
LIST_INIT(&p2->p_children);
p2->p_aio = NULL;
#ifdef KTRACE
/*
* Copy traceflag and tracefile if enabled.
* If not inherited, these were zeroed above.
*/
if (p1->p_traceflag & KTRFAC_INHERIT) {
mutex_enter(&ktrace_lock);
p2->p_traceflag = p1->p_traceflag;
if ((p2->p_tracep = p1->p_tracep) != NULL)
ktradref(p2);
mutex_exit(&ktrace_lock);
}
#endif
/*
* Create signal actions for the child process.
*/
p2->p_sigacts = sigactsinit(p1, flags & FORK_SHARESIGS);
mutex_enter(p1->p_lock);
p2->p_sflag |=
(p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
sched_proc_fork(p1, p2);
mutex_exit(p1->p_lock);
p2->p_stflag = p1->p_stflag;
/*
* p_stats.
* Copy parts of p_stats, and zero out the rest.
*/
p2->p_stats = pstatscopy(p1->p_stats);
/*
* Set up the new process address space.
*/
uvm_proc_fork(p1, p2, (flags & FORK_SHAREVM) ? true : false);
/*
* Finish creating the child process.
* It will return through a different path later.
*/
lwp_create(l1, p2, uaddr, (flags & FORK_PPWAIT) ? LWP_VFORK : 0,
stack, stacksize, (func != NULL) ? func : child_return, arg, &l2,
l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
/*
* Inherit l_private from the parent.
* Note that we cannot use lwp_setprivate() here since that
* also sets the CPU TLS register, which is incorrect if the
* process has changed that without letting the kernel know.
*/
l2->l_private = l1->l_private;
/*
* If emulation has a process fork hook, call it now.
*/
if (p2->p_emul->e_proc_fork)
(*p2->p_emul->e_proc_fork)(p2, l1, flags);
/*
* ...and finally, any other random fork hooks that subsystems
* might have registered.
*/
doforkhooks(p2, p1);
SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
/*
* It's now safe for the scheduler and other processes to see the
* child process.
*/
mutex_enter(&proc_lock);
if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
p2->p_lflag |= PL_CONTROLT;
LIST_INSERT_HEAD(&parent->p_children, p2, p_sibling);
p2->p_exitsig = exitsig; /* signal for parent on exit */
/*
* Trace fork(2) and vfork(2)-like events on demand in a debugger.
*/
if (tracefork(p1, flags) || tracevfork(p1, flags)) {
proc_changeparent(p2, p1->p_pptr);
SET(p2->p_slflag, PSL_TRACEDCHILD);
}
p2->p_oppid = p1->p_pid; /* Remember the original parent id. */
LIST_INSERT_AFTER(p1, p2, p_pglist);
LIST_INSERT_HEAD(&allproc, p2, p_list);
p2->p_trace_enabled = trace_is_enabled(p2);
#ifdef __HAVE_SYSCALL_INTERN
(*p2->p_emul->e_syscall_intern)(p2);
#endif
/*
* Update stats now that we know the fork was successful.
*/
KPREEMPT_DISABLE(l1);
CPU_COUNT(CPU_COUNT_FORKS, 1);
if (flags & FORK_PPWAIT)
CPU_COUNT(CPU_COUNT_FORKS_PPWAIT, 1);
if (flags & FORK_SHAREVM)
CPU_COUNT(CPU_COUNT_FORKS_SHAREVM, 1);
KPREEMPT_ENABLE(l1);
if (ktrpoint(KTR_EMUL))
p2->p_traceflag |= KTRFAC_TRC_EMUL;
/*
* Notify any interested parties about the new process.
*/
if (!SLIST_EMPTY(&p1->p_klist)) {
mutex_exit(&proc_lock);
knote_proc_fork(p1, p2);
mutex_enter(&proc_lock);
}
/*
* Make child runnable, set start time, and add to run queue except
* if the parent requested the child to start in SSTOP state.
*/
mutex_enter(p2->p_lock);
/*
* Start profiling.
*/
if ((p2->p_stflag & PST_PROFIL) != 0) {
mutex_spin_enter(&p2->p_stmutex);
startprofclock(p2);
mutex_spin_exit(&p2->p_stmutex);
}
getmicrotime(&p2->p_stats->p_start);
p2->p_acflag = AFORK;
lwp_lock(l2);
KASSERT(p2->p_nrlwps == 1);
KASSERT(l2->l_stat == LSIDL);
if (p2->p_sflag & PS_STOPFORK) {
p2->p_nrlwps = 0;
p2->p_stat = SSTOP;
p2->p_waited = 0;
p1->p_nstopchild++;
l2->l_stat = LSSTOP;
KASSERT(l2->l_wchan == NULL);
lwp_unlock(l2);
} else {
p2->p_nrlwps = 1;
p2->p_stat = SACTIVE;
setrunnable(l2);
/* LWP now unlocked */
}
/*
* Return child pid to parent process,
* marking us as parent via retval[1].
*/
if (retval != NULL) {
retval[0] = p2->p_pid;
retval[1] = 0;
}
mutex_exit(p2->p_lock);
/*
* Let the parent know that we are tracing its child.
*/
if (tracefork(p1, flags) || tracevfork(p1, flags)) {
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD,
tracefork(p1, flags) ? PTRACE_FORK : PTRACE_VFORK,
retval[0]);
mutex_enter(&proc_lock);
}
/*
* Preserve synchronization semantics of vfork. If waiting for
* child to exec or exit, sleep until it clears p_vforkwaiting.
*/
while (l1->l_vforkwaiting)
cv_wait(&l1->l_waitcv, &proc_lock);
/*
* Let the parent know that we are tracing its child.
*/
if (tracevforkdone(p1, flags)) {
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD, PTRACE_VFORK_DONE, retval[0]);
} else
mutex_exit(&proc_lock);
return 0;
}
/*
* MI code executed in each newly spawned process before returning to userland.
*/
void
child_return(void *arg)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) ==
(PSL_TRACED|PSL_TRACEDCHILD)) {
eventswitchchild(p, TRAP_CHLD,
ISSET(p->p_lflag, PL_PPWAIT) ? PTRACE_VFORK : PTRACE_FORK);
}
md_child_return(l);
/*
* Return SYS_fork for all fork types, including vfork(2) and clone(2).
*
* This approach simplifies the code and avoids extra locking.
*/
ktrsysret(SYS_fork, 0, 0);
}
/* $NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel condition variable implementation.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $");
#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* Accessors for the private contents of the kcondvar_t data type.
*
* cv_opaque[0] sleepq_t
* cv_opaque[1] description for ps(1)
*
* cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
* only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
* and dequeue).
*
* cv_opaque[1] (the wmesg) is static and does not change throughout the life
* of the CV.
*/
#define CV_SLEEPQ(cv) ((sleepq_t *)(cv)->cv_opaque)
#define CV_WMESG(cv) ((const char *)(cv)->cv_opaque[1])
#define CV_SET_WMESG(cv, v) (cv)->cv_opaque[1] = __UNCONST(v)
#define CV_DEBUG_P(cv) (CV_WMESG(cv) != nodebug)
#define CV_RA ((uintptr_t)__builtin_return_address(0))
static void cv_unsleep(lwp_t *, bool);
static inline void cv_wakeup_one(kcondvar_t *);
static inline void cv_wakeup_all(kcondvar_t *);
syncobj_t cv_syncobj = {
.sobj_name = "cv",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = cv_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
static const char deadcv[] = "deadcv";
/*
* cv_init:
*
* Initialize a condition variable for use.
*/
void
cv_init(kcondvar_t *cv, const char *wmesg)
{
KASSERT(wmesg != NULL);
CV_SET_WMESG(cv, wmesg);
sleepq_init(CV_SLEEPQ(cv));
}
/*
* cv_destroy:
*
* Tear down a condition variable.
*/
void
cv_destroy(kcondvar_t *cv)
{
sleepq_destroy(CV_SLEEPQ(cv));
#ifdef DIAGNOSTIC
KASSERT(cv_is_valid(cv));
KASSERT(!cv_has_waiters(cv));
CV_SET_WMESG(cv, deadcv);
#endif
}
/*
* cv_enter:
*
* Look up and lock the sleep queue corresponding to the given
* condition variable, and increment the number of waiters.
*/
static inline int
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
sleepq_t *sq;
kmutex_t *mp;
int nlocks;
KASSERT(cv_is_valid(cv));
KASSERT(!cpu_intr_p());
KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
mutex_exit(mtx);
KASSERT(cv_has_waiters(cv));
return nlocks;
}
/*
* cv_unsleep:
*
* Remove an LWP from the condition variable and sleep queue. This
* is called when the LWP has not been awoken normally but instead
* interrupted: for example, when a signal is received. Must be
* called with the LWP locked. Will unlock if "unlock" is true.
*/
static void
cv_unsleep(lwp_t *l, bool unlock)
{
kcondvar_t *cv __diagused;
cv = (kcondvar_t *)(uintptr_t)l->l_wchan;
KASSERT(l->l_wchan == (wchan_t)cv);
KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
KASSERT(cv_is_valid(cv));
KASSERT(cv_has_waiters(cv));
sleepq_unsleep(l, unlock);
}
/*
* cv_wait:
*
* Wait non-interruptibly on a condition variable until awoken.
*/
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
lwp_t *l = curlwp;
int nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, false);
(void)sleepq_block(0, false, &cv_syncobj, nlocks);
mutex_enter(mtx);
}
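/*
 * Illustrative use of cv_wait() (hypothetical "sc" from the sketch above):
 * the condition must always be re-tested in a loop, because a wakeup may be
 * spurious or intended for another waiter.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_busy)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	sc->sc_busy = true;
 *	mutex_exit(&sc->sc_lock);
 */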
/*
* cv_wait_sig:
*
* Wait on a condition variable until awoken or a signal is received.
* Will also return early if the process is exiting. Returns zero if
* awoken normally, ERESTART if a signal was received and the system
* call is restartable, or EINTR otherwise.
*/
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, true);
error = sleepq_block(0, true, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
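/*
 * Illustrative use of cv_wait_sig() (hypothetical "sc"): an interruptible
 * wait must check the return value and back out on EINTR/ERESTART so the
 * signal can be delivered or the system call restarted.
 *
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_busy && error == 0)
 *		error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock);
 *	if (error == 0)
 *		sc->sc_busy = true;
 *	mutex_exit(&sc->sc_lock);
 *	return error;
 */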
/*
* cv_timedwait:
*
* Wait on a condition variable until awoken or the specified timeout
* expires. Returns zero if awoken normally or EWOULDBLOCK if the
* timeout expired.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, false);
error = sleepq_block(timo, false, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
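/*
 * Illustrative use of cv_timedwait() (hypothetical "sc"), converting
 * milliseconds to ticks with mstohz(9). EWOULDBLOCK only means the ticks
 * ran out; the condition may have become true concurrently, so it is
 * re-tested before giving up.
 *
 *	int error;
 *
 *	mutex_enter(&sc->sc_lock);
 *	if (sc->sc_busy)
 *		(void)cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(100));
 *	error = sc->sc_busy ? ETIMEDOUT : 0;
 *	mutex_exit(&sc->sc_lock);
 */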
/*
* cv_timedwait_sig:
*
* Wait on a condition variable until awoken, the specified timeout
* expires, or a signal is received. Will also return early if the process is
* exiting. Returns zero if awoken normally, EWOULDBLOCK if the
* timeout expires, ERESTART if a signal was received and the system
* call is restartable, or EINTR otherwise.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, true);
error = sleepq_block(timo, true, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
/*
* Given a number of seconds, sec, and 2^64ths of a second, frac, we
* want a number of ticks for a timeout:
*
* timo = hz*(sec + frac/2^64)
* = hz*sec + hz*frac/2^64
* = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
* = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
*
* where frachi is the high 32 bits of frac and fraclo is the
* low 32 bits.
*
* We assume hz < INT_MAX/2 < UINT32_MAX, so
*
* hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
*
* since fraclo < 2^32.
*
* We clamp the result at INT_MAX/2 for a timeout in ticks, since we
* can't represent timeouts higher than INT_MAX in cv_timedwait, and
* spurious wakeup is OK. Moreover, we don't want to wrap around,
* because we compute end - start in ticks in order to compute the
* remaining timeout, and that difference cannot wrap around, so we use
* a timeout less than INT_MAX. Using INT_MAX/2 provides plenty of
* margin for paranoia and will exceed most waits in practice by far.
*/
static unsigned
bintime2timo(const struct bintime *bt)
{
KASSERT(hz < INT_MAX/2);
CTASSERT(INT_MAX/2 < UINT32_MAX);
if (bt->sec > ((INT_MAX/2)/hz))
return INT_MAX/2;
if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
return INT_MAX/2;
return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}
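/*
 * Worked example of the conversion above, assuming hz = 100 (10 ms ticks):
 * for a 2.5 second timeout, bt->sec = 2 and bt->frac = 2^63, so
 * frachi = bt->frac >> 32 = 2^31 and
 *
 *	timo = hz*sec + (hz*frachi >> 32)
 *	     = 100*2 + (100*2^31 >> 32)
 *	     = 200 + 50
 *	     = 250 ticks.
 */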
/*
* timo is in units of ticks. We want units of seconds and 2^64ths of
* a second. We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
* second), from which we can conclude 2^64 / hz = 1 (2^64th of a
* second)/tick. So for the fractional part, we compute
*
* frac = rem * 2^64 / hz
* = ((rem * 2^32) / hz) * 2^32
*
* Using truncating integer division instead of real division will
* leave us with only about 32 bits of precision, which means about
* 1/4-nanosecond resolution, which is good enough for our purposes.
*/
static struct bintime
timo2bintime(unsigned timo)
{
return (struct bintime) {
.sec = timo / hz,
.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
};
}
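/*
 * Worked example of the reverse conversion, again assuming hz = 100:
 * timo = 250 ticks gives
 *
 *	sec  = 250 / 100 = 2
 *	frac = ((250 % 100) * 2^32 / 100) << 32
 *	     = 2^31 << 32
 *	     = 2^63,
 *
 * i.e. 2.5 seconds, inverting the bintime2timo() example above exactly;
 * in general the truncating division loses only the sub-quarter-nanosecond
 * part noted in the comment.
 */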
/*
* cv_timedwaitbt:
*
* Wait on a condition variable until awoken or the specified
* timeout expires. Returns zero if awoken normally or
* EWOULDBLOCK if the timeout expires.
*
* On entry, bt is a timeout in bintime. cv_timedwaitbt subtracts
* the time slept, so on exit, bt is the time remaining after
* sleeping, possibly negative if the complete time has elapsed.
* No infinite timeout; use cv_wait instead.
*
* epsilon is a requested maximum error in timeout (excluding
* spurious wakeups). Currently not used, will be used in the
* future to choose between low- and high-resolution timers.
* Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
* where r is the finest resolution of clock available and s is
* scheduling delays for scheduler overhead and competing threads.
* Time is measured by the interrupt source implementing the
* timeout, not by another timecounter.
*/
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
const struct bintime *epsilon __diagused)
{
struct bintime slept;
unsigned start, end;
int timo;
int error;
KASSERTMSG(bt->sec >= 0, "negative timeout");
KASSERTMSG(epsilon != NULL, "specify maximum requested delay");
/* If there's nothing left to wait for, time out. */
if (bt->sec == 0 && bt->frac == 0)
return EWOULDBLOCK;
/* Convert to ticks, but clamp to be >=1. */
timo = bintime2timo(bt);
KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
if (timo == 0)
timo = 1;
/*
* getticks() is technically int, but nothing special happens on
* overflow, so we assume two's-complement wraparound and just treat
* it as unsigned.
*/
start = getticks();
error = cv_timedwait(cv, mtx, timo);
end = getticks();
/*
* Set bt to the time remaining, or zero, whichever is larger. We
* do not fail with EWOULDBLOCK here because this may have been
* an explicit wakeup, so the caller needs to check before they
* give up or else cv_signal would be lost.
*/
slept = timo2bintime(end - start);
if (bintimecmp(bt, &slept, <=)) {
bt->sec = 0;
bt->frac = 0;
} else {
/* bt := bt - slept */
bintime_sub(bt, &slept);
}
return error;
}
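/*
 * Illustrative use of cv_timedwaitbt() (hypothetical "sc"; the 5 second
 * budget and zero epsilon are arbitrary choices): because bt is updated in
 * place with the time remaining, the caller can loop on the condition
 * without recomputing the deadline, and must still re-test the condition
 * when EWOULDBLOCK is returned.
 *
 *	struct bintime bt = { .sec = 5, .frac = 0 };
 *	const struct bintime epsilon = { .sec = 0, .frac = 0 };
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_busy && error != EWOULDBLOCK)
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock, &bt,
 *		    &epsilon);
 *	error = sc->sc_busy ? ETIMEDOUT : 0;
 *	mutex_exit(&sc->sc_lock);
 */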
/*
* cv_timedwaitbt_sig:
*
* Wait on a condition variable until awoken, the specified
* timeout expires, or interrupted by a signal. Returns zero if
* awoken normally, EWOULDBLOCK if the timeout expires, or
* EINTR/ERESTART if interrupted by a signal.
*
* On entry, bt is a timeout in bintime. cv_timedwaitbt_sig
* subtracts the time slept, so on exit, bt is the time remaining
* after sleeping. No infinite timeout; use cv_wait_sig instead.
*
* epsilon is a requested maximum error in timeout (excluding
* spurious wakeups). Currently not used, will be used in the
* future to choose between low- and high-resolution timers.
*/
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
const struct bintime *epsilon __diagused)
{
struct bintime slept;
unsigned start, end;
int timo;
int error;
KASSERTMSG(bt->sec >= 0, "negative timeout");
KASSERTMSG(epsilon != NULL, "specify maximum requested delay");
/* If there's nothing left to wait for, time out. */
if (bt->sec == 0 && bt->frac == 0)
return EWOULDBLOCK;
/* Convert to ticks, but clamp to be >=1. */
timo = bintime2timo(bt);
KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
if (timo == 0)
timo = 1;
/*
* getticks() is technically int, but nothing special happens on
* overflow, so we assume two's-complement wraparound and just treat
* it as unsigned.
*/
start = getticks();
error = cv_timedwait_sig(cv, mtx, timo);
end = getticks();
/*
* Set bt to the time remaining, or zero, whichever is larger. We
* do not fail with EWOULDBLOCK here because this may have been
* an explicit wakeup, so the caller needs to check before they
* give up or else cv_signal would be lost.
*/
slept = timo2bintime(end - start);
if (bintimecmp(bt, &slept, <=)) {
bt->sec = 0;
bt->frac = 0;
} else {
/* bt := bt - slept */
bintime_sub(bt, &slept);
}
return error;
}
/*
* cv_signal:
*
* Wake the highest priority LWP waiting on a condition variable. Must
* be called with the interlocking mutex held or just after it has been
* released (so the awoken LWP will see the changed condition).
*/
void
cv_signal(kcondvar_t *cv)
{
KASSERT(cv_is_valid(cv));
if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
/*
* The compiler usually turns this into a tail call (a plain jmp),
* because the arguments are the same and there are no locals.
*/
cv_wakeup_one(cv);
}
}
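/*
 * Illustrative use of cv_signal() (hypothetical "sc"): the condition is
 * changed under the interlock before exactly one waiter is woken.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_busy = false;
 *	cv_signal(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */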
/*
* cv_wakeup_one:
*
* Slow path for cv_signal(). Deliberately marked __noinline to
* prevent the compiler from pulling it into cv_signal(), which adds
* extra prologue and epilogue code.
*/
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
sleepq_t *sq;
kmutex_t *mp;
lwp_t *l;
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
if (__predict_true((l = LIST_FIRST(sq)) != NULL)) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
KASSERT(l->l_wchan == cv);
sleepq_remove(sq, l, true);
}
mutex_spin_exit(mp);
}
/*
* cv_broadcast:
*
* Wake all LWPs waiting on a condition variable. Must be called with
* the interlocking mutex held or just after it has been released (so
* the awoken LWP will see the changed condition).
*/
void
cv_broadcast(kcondvar_t *cv)
{
KASSERT(cv_is_valid(cv));
if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
/*
* The compiler usually turns this into a tail call (a plain jmp),
* because the arguments are the same and there are no locals.
*/
cv_wakeup_all(cv);
}
}
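/*
 * Illustrative use of cv_broadcast() (hypothetical "sc"): broadcast is the
 * right call when every waiter may now be able to proceed, for example when
 * the object being waited on is torn down.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_dying = true;
 *	cv_broadcast(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */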
/*
* cv_wakeup_all:
*
* Slow path for cv_broadcast(). Deliberately marked __noinline to
* prevent the compiler from pulling it into cv_broadcast(), which adds
* extra prologue and epilogue code.
*/
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
sleepq_t *sq;
kmutex_t *mp;
lwp_t *l;
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
while ((l = LIST_FIRST(sq)) != NULL) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
KASSERT(l->l_wchan == cv);
sleepq_remove(sq, l, true);
}
mutex_spin_exit(mp);
}
/*
* cv_has_waiters:
*
* For diagnostic assertions: return non-zero if a condition
* variable has waiters.
*/
bool
cv_has_waiters(kcondvar_t *cv)
{
return !LIST_EMPTY(CV_SLEEPQ(cv));
}
/*
* cv_is_valid:
*
* For diagnostic assertions: return non-zero if a condition
* variable appears to be valid. No locks need be held.
*/
bool
cv_is_valid(kcondvar_t *cv)
{
return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}