/* $OpenBSD: kern_watchdog.c,v 1.16 2022/08/14 01:58:27 jsg Exp $ */
/*
* Copyright (c) 2003 Markus Friedl. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/timeout.h>
#include <sys/sysctl.h>
void wdog_tickle(void *arg);
int (*wdog_ctl_cb)(void *, int) = NULL;
void *wdog_ctl_cb_arg = NULL;
int wdog_period = 0;
int wdog_auto = 1;
struct timeout wdog_timeout;
void
wdog_register(int (*cb)(void *, int), void *cb_arg)
{
if (wdog_ctl_cb != NULL)
return;
wdog_ctl_cb = cb;
wdog_ctl_cb_arg = cb_arg;
timeout_set(&wdog_timeout, wdog_tickle, NULL);
}
void
wdog_tickle(void *arg)
{
if (wdog_ctl_cb == NULL)
return;
(void) (*wdog_ctl_cb)(wdog_ctl_cb_arg, wdog_period);
timeout_add_msec(&wdog_timeout, wdog_period * 1000 / 2);
}
void
wdog_shutdown(void *arg)
{
if (wdog_ctl_cb == NULL || wdog_ctl_cb_arg != arg)
return;
timeout_del(&wdog_timeout);
(void) (*wdog_ctl_cb)(wdog_ctl_cb_arg, 0);
wdog_ctl_cb = NULL;
wdog_period = 0;
wdog_auto = 1;
}
int
sysctl_wdog(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen)
{
int error, period;
if (wdog_ctl_cb == NULL)
return (EOPNOTSUPP);
switch (name[0]) {
case KERN_WATCHDOG_PERIOD:
period = wdog_period;
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&period, 0, INT_MAX);
if (error)
return (error);
		if (newp) {
			timeout_del(&wdog_timeout);
wdog_period = (*wdog_ctl_cb)(wdog_ctl_cb_arg, period);
}
break;
case KERN_WATCHDOG_AUTO:
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&wdog_auto, 0, 1);
if (error)
return (error);
break;
default:
return (EINVAL);
}
if (wdog_auto && wdog_period > 0) {
(void) (*wdog_ctl_cb)(wdog_ctl_cb_arg, wdog_period);
timeout_add_msec(&wdog_timeout, wdog_period * 1000 / 2);
} else
timeout_del(&wdog_timeout);
return (error);
}
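/*
 * Illustrative sketch, not part of the original file: how a hypothetical
 * hardware watchdog driver could plug into the interface above.  The
 * callback receives the requested period in seconds (0 disables the
 * watchdog) and returns the period it actually programmed; while
 * kern.watchdog.auto is set, wdog_tickle() re-arms the timer every
 * wdog_period/2 seconds.  The driver names and softc type below are
 * assumptions for the example only.
 */
#if 0
int
examplewdog_cb(void *self, int period)
{
	struct examplewdog_softc *sc = self;

	/* program (or, for period == 0, disable) the hardware timer */
	examplewdog_set_timer(sc, period);
	return (period);		/* period actually in effect */
}

void
examplewdog_attach(struct examplewdog_softc *sc)
{
	wdog_register(examplewdog_cb, sc);
}
#endif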
/* $OpenBSD: uipc_socket2.c,v 1.128 2022/09/05 14:56:09 bluhm Exp $ */
/* $NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/event.h>
#include <sys/pool.h>
/*
* Primitive routines for operating on sockets and socket buffers
*/
u_long sb_max = SB_MAX; /* patchable */
extern struct pool mclpools[];
extern struct pool mbpool;
/*
* Procedures to manipulate state flags of socket
* and do appropriate wakeups. Normal sequence from the
* active (originating) side is that soisconnecting() is
* called during processing of connect() call,
* resulting in an eventual call to soisconnected() if/when the
* connection is established. When the connection is torn down
* soisdisconnecting() is called during processing of disconnect() call,
* and soisdisconnected() is called when the connection to the peer
* is totally severed. The semantics of these routines are such that
* connectionless protocols can call soisconnected() and soisdisconnected()
* only, bypassing the in-progress calls when setting up a ``connection''
* takes no time.
*
* From the passive side, a socket is created with
* two queues of sockets: so_q0 for connections in progress
* and so_q for connections already made and awaiting user acceptance.
* As a protocol is preparing incoming connections, it creates a socket
* structure queued on so_q0 by calling sonewconn(). When the connection
* is established, soisconnected() is called, and transfers the
* socket structure to so_q, making it available to accept().
*
* If a socket is closed with sockets on either
* so_q0 or so_q, these sockets are dropped.
*
* If higher level protocols are implemented in
* the kernel, the wakeups done here will sometimes
* cause software-interrupt process scheduling.
*/
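/*
 * Illustrative sequence, not taken from a particular protocol: the
 * passive-open path described above, as a protocol's input routine might
 * drive it for a listening socket `head'.  The variables are placeholders.
 */
#if 0
	so = sonewconn(head, 0);	/* new socket queued on head->so_q0 */
	if (so == NULL)
		return;			/* queue full or out of memory */
	/* ... later, when the handshake completes ... */
	soisconnected(so);		/* moves it to head->so_q; accept() can return it */
#endif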
void
soisconnecting(struct socket *so)
{
soassertlocked(so);
so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTING;
}
void
soisconnected(struct socket *so)
{
struct socket *head = so->so_head;
soassertlocked(so);
so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTED;
if (head != NULL && so->so_onq == &head->so_q0) {
int persocket = solock_persocket(so);
if (persocket) {
soref(so);
soref(head);
sounlock(so);
solock(head);
solock(so);
if (so->so_onq != &head->so_q0) {
sounlock(head);
sorele(head);
sorele(so);
return;
}
sorele(head);
sorele(so);
}
soqremque(so, 0);
soqinsque(head, so, 1);
sorwakeup(head);
wakeup_one(&head->so_timeo);
if (persocket)
sounlock(head);
} else {
wakeup(&so->so_timeo);
sorwakeup(so);
sowwakeup(so);
}
}
void
soisdisconnecting(struct socket *so)
{
soassertlocked(so);
so->so_state &= ~SS_ISCONNECTING;
so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
wakeup(&so->so_timeo);
sowwakeup(so);
sorwakeup(so);
}
void
soisdisconnected(struct socket *so)
{
soassertlocked(so);
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
wakeup(&so->so_timeo);
sowwakeup(so);
sorwakeup(so);
}
/*
* When an attempt at a new connection is noted on a socket
* which accepts connections, sonewconn is called. If the
* connection is possible (subject to space constraints, etc.)
* then we allocate a new structure, properly linked into the
* data structure of the original socket, and return this.
* Connstatus may be 0 or SS_ISCONNECTED.
*/
struct socket *
sonewconn(struct socket *head, int connstatus)
{
struct socket *so;
int persocket = solock_persocket(head);
int error;
/*
* XXXSMP as long as `so' and `head' share the same lock, we
* can call soreserve() and pr_attach() below w/o explicitly
* locking `so'.
*/
soassertlocked(head);
if (m_pool_used() > 95)
return (NULL);
if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
return (NULL);
so = soalloc(PR_NOWAIT | PR_ZERO);
if (so == NULL)
return (NULL);
so->so_type = head->so_type;
so->so_options = head->so_options &~ SO_ACCEPTCONN;
so->so_linger = head->so_linger;
so->so_state = head->so_state | SS_NOFDREF;
so->so_proto = head->so_proto;
so->so_timeo = head->so_timeo;
so->so_euid = head->so_euid;
so->so_ruid = head->so_ruid;
so->so_egid = head->so_egid;
so->so_rgid = head->so_rgid;
so->so_cpid = head->so_cpid;
/*
* Lock order will be `head' -> `so' while these sockets are linked.
*/
if (persocket)
solock(so);
/*
* Inherit watermarks but those may get clamped in low mem situations.
*/
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
if (persocket)
sounlock(so);
pool_put(&socket_pool, so);
return (NULL);
}
so->so_snd.sb_wat = head->so_snd.sb_wat;
so->so_snd.sb_lowat = head->so_snd.sb_lowat;
so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
so->so_rcv.sb_wat = head->so_rcv.sb_wat;
so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
sigio_init(&so->so_sigio);
sigio_copy(&so->so_sigio, &head->so_sigio);
soqinsque(head, so, 0);
/*
* We need to unlock `head' because PCB layer could release
* solock() to enforce desired lock order.
*/
if (persocket) {
head->so_newconn++;
sounlock(head);
}
error = pru_attach(so, 0);
if (persocket) {
sounlock(so);
solock(head);
solock(so);
		if ((head->so_newconn--) == 0) {
			if ((head->so_state & SS_NEWCONN_WAIT) != 0) {
				head->so_state &= ~SS_NEWCONN_WAIT;
wakeup(&head->so_newconn);
}
}
}
	if (error) {
		soqremque(so, 0);
		if (persocket)
sounlock(so);
sigio_free(&so->so_sigio);
klist_free(&so->so_rcv.sb_sel.si_note);
klist_free(&so->so_snd.sb_sel.si_note);
pool_put(&socket_pool, so);
return (NULL);
}
	if (connstatus) {
		so->so_state |= connstatus;
soqremque(so, 0);
soqinsque(head, so, 1);
sorwakeup(head);
wakeup(&head->so_timeo);
}
if (persocket)
sounlock(so);
return (so);
}
void
soqinsque(struct socket *head, struct socket *so, int q)
{
soassertlocked(head);
soassertlocked(so);
KASSERT(so->so_onq == NULL);
so->so_head = head;
if (q == 0) {
head->so_q0len++;
so->so_onq = &head->so_q0;
} else {
head->so_qlen++;
so->so_onq = &head->so_q;
}
TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}
int
soqremque(struct socket *so, int q)
{
struct socket *head = so->so_head;
soassertlocked(so);
soassertlocked(head);
if (q == 0) {
if (so->so_onq != &head->so_q0)
return (0);
head->so_q0len--;
} else {
if (so->so_onq != &head->so_q)
return (0);
head->so_qlen--;
}
TAILQ_REMOVE(so->so_onq, so, so_qe);
so->so_onq = NULL;
so->so_head = NULL;
return (1);
}
/*
* Socantsendmore indicates that no more data will be sent on the
* socket; it would normally be applied to a socket when the user
* informs the system that no more data is to be sent, by the protocol
* code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data
* will be received, and will normally be applied to the socket by a
* protocol when it detects that the peer will send no more data.
* Data queued for reading in the socket may yet be read.
*/
void
socantsendmore(struct socket *so)
{
soassertlocked(so);
so->so_state |= SS_CANTSENDMORE;
sowwakeup(so);
}
void
socantrcvmore(struct socket *so)
{
soassertlocked(so);
so->so_state |= SS_CANTRCVMORE;
sorwakeup(so);
}
void
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
case PF_INET:
case PF_INET6:
NET_LOCK();
break;
default:
rw_enter_write(&so->so_lock);
break;
}
}
void
solock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
case PF_INET:
case PF_INET6:
if (so->so_proto->pr_usrreqs->pru_lock != NULL) {
NET_LOCK_SHARED();
pru_lock(so);
} else
NET_LOCK();
break;
default:
rw_enter_write(&so->so_lock);
break;
}
}
int
solock_persocket(struct socket *so)
{
switch (so->so_proto->pr_domain->dom_family) {
case PF_INET:
case PF_INET6:
return 0;
default:
return 1;
}
}
void
solock_pair(struct socket *so1, struct socket *so2)
{
	KASSERT(so1 != so2);
	KASSERT(so1->so_type == so2->so_type);
	KASSERT(solock_persocket(so1));

if (so1 < so2) {
solock(so1);
solock(so2);
} else {
solock(so2);
solock(so1);
}
}
void
sounlock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
case PF_INET:
case PF_INET6:
NET_UNLOCK();
break;
default:
rw_exit_write(&so->so_lock);
break;
}
}
void
sounlock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
case PF_INET:
case PF_INET6:
if (so->so_proto->pr_usrreqs->pru_unlock != NULL) {
pru_unlock(so);
NET_UNLOCK_SHARED();
} else
NET_UNLOCK();
break;
default:
rw_exit_write(&so->so_lock);
break;
}
}
void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
case PF_INET:
case PF_INET6:
NET_ASSERT_LOCKED();
break;
default:
rw_assert_wrlock(&so->so_lock);
break;
}
}
int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
uint64_t nsecs)
{
int ret;
switch (so->so_proto->pr_domain->dom_family) {
case PF_INET:
case PF_INET6:
if (so->so_proto->pr_usrreqs->pru_unlock != NULL &&
rw_status(&netlock) == RW_READ) {
pru_unlock(so);
}
ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
if (so->so_proto->pr_usrreqs->pru_lock != NULL &&
rw_status(&netlock) == RW_READ) {
pru_lock(so);
}
break;
default:
ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
break;
}
return ret;
}
/*
* Wait for data to arrive at/drain from a socket buffer.
*/
int
sbwait(struct socket *so, struct sockbuf *sb)
{
int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
soassertlocked(so);
sb->sb_flags |= SB_WAIT;
return sosleep_nsec(so, &sb->sb_cc, prio, "netio", sb->sb_timeo_nsecs);
}
int
sblock(struct socket *so, struct sockbuf *sb, int wait)
{
int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
soassertlocked(so);
if ((sb->sb_flags & SB_LOCK) == 0) {
sb->sb_flags |= SB_LOCK;
return (0);
}
if (wait & M_NOWAIT)
return (EWOULDBLOCK);
while (sb->sb_flags & SB_LOCK) {
sb->sb_flags |= SB_WANT;
error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", INFSLP);
if (error)
return (error);
}
sb->sb_flags |= SB_LOCK;
return (0);
}
void
sbunlock(struct socket *so, struct sockbuf *sb)
{
soassertlocked(so);
	sb->sb_flags &= ~SB_LOCK;
	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
wakeup(&sb->sb_flags);
}
}
/*
* Wakeup processes waiting on a socket buffer.
* Do asynchronous notification via SIGIO
* if the socket buffer has the SB_ASYNC flag set.
*/
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
soassertlocked(so);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
wakeup(&sb->sb_cc);
}
	if (sb->sb_flags & SB_ASYNC)
		pgsigio(&so->so_sigio, SIGIO, 0);
	KNOTE(&sb->sb_sel.si_note, 0);
}
/*
* Socket buffer (struct sockbuf) utility routines.
*
* Each socket contains two socket buffers: one for sending data and
* one for receiving data. Each buffer contains a queue of mbufs,
* information about the number of mbufs and amount of data in the
* queue, and other fields allowing select() statements and notification
* on data availability to be implemented.
*
* Data stored in a socket buffer is maintained as a list of records.
* Each record is a list of mbufs chained together with the m_next
* field. Records are chained together with the m_nextpkt field. The upper
* level routine soreceive() expects the following conventions to be
* observed when placing information in the receive buffer:
*
* 1. If the protocol requires each message be preceded by the sender's
* name, then a record containing that name must be present before
* any associated data (mbuf's must be of type MT_SONAME).
* 2. If the protocol supports the exchange of ``access rights'' (really
* just additional data associated with the message), and there are
* ``rights'' to be received, then a record containing this data
* should be present (mbuf's must be of type MT_CONTROL).
* 3. If a name or rights record exists, then it must be followed by
* a data record, perhaps of zero length.
*
* Before using a new socket structure it is first necessary to reserve
* buffer space to the socket, by calling sbreserve(). This should commit
* some of the available buffer space in the system buffer pool for the
* socket (currently, it does nothing but enforce limits). The space
* should be released by calling sbrelease() when the socket is destroyed.
*/
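/*
 * Illustrative sketch, an assumption rather than an excerpt from a real
 * protocol: an attach-style routine reserving buffer space with soreserve()
 * before any data may be queued.  The function name, signature, and
 * watermark values are placeholders.
 */
#if 0
int
exampleproto_attach(struct socket *so, int proto)
{
	int error;

	error = soreserve(so, 8192, 8192);	/* send hiwat, receive hiwat */
	if (error)
		return (error);			/* ENOBUFS */
	/* ... protocol specific initialisation ... */
	return (0);
}
#endif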
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
soassertlocked(so);
if (sbreserve(so, &so->so_snd, sndcc))
goto bad;
if (sbreserve(so, &so->so_rcv, rcvcc))
goto bad2;
so->so_snd.sb_wat = sndcc;
so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
return (0);
bad2:
sbrelease(so, &so->so_snd);
bad:
return (ENOBUFS);
}
/*
* Allot mbufs to a sockbuf.
* Attempt to scale mbmax so that mbcnt doesn't become limiting
* if buffering efficiency is near the normal case.
*/
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
soassertlocked(so);
if (cc == 0 || cc > sb_max)
return (1);
sb->sb_hiwat = cc;
sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
return (0);
}
/*
* In low memory situation, do not accept any greater than normal request.
*/
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
return (ENOBUFS);
return (0);
}
int
sbchecklowmem(void)
{
static int sblowmem;
unsigned int used = m_pool_used();
if (used < 60)
sblowmem = 0;
else if (used > 80)
		sblowmem = 1;

	return (sblowmem);
}
/*
* Free mbufs held by a socket, and reserved mbuf space.
*/
void
sbrelease(struct socket *so, struct sockbuf *sb)
{
sbflush(so, sb);
sb->sb_hiwat = sb->sb_mbmax = 0;
}
/*
* Routines to add and remove
* data from an mbuf queue.
*
* The routines sbappend() or sbappendrecord() are normally called to
* append new mbufs to a socket buffer, after checking that adequate
* space is available, comparing the function sbspace() with the amount
* of data to be added. sbappendrecord() differs from sbappend() in
* that data supplied is treated as the beginning of a new record.
* To place a sender's address, optional access rights, and data in a
* socket receive buffer, sbappendaddr() should be used. To place
* access rights and data in a socket receive buffer, sbappendrights()
* should be used. In either case, the new data begins a new record.
* Note that unlike sbappend() and sbappendrecord(), these routines check
* for the caller that there will be enough space to store the data.
* Each fails if there is not enough space, or if it cannot find mbufs
* to store additional information in.
*
* Reliable protocols may use the socket send buffer to hold data
* awaiting acknowledgement. Data is normally copied from a socket
* send buffer in a protocol with m_copym for output to a peer,
* and then removing the data from the socket buffer with sbdrop()
* or sbdroprecord() when the data is acknowledged by the peer.
*/
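/*
 * Illustrative sketch (assumption): a datagram receive path using
 * sbappendaddr() as described above.  `srcsa' is the sender's address and
 * `m' the arriving packet; both are placeholders for the protocol's data.
 */
#if 0
	if (sbappendaddr(so, &so->so_rcv, srcsa, m, NULL) == 0) {
		/* no buffer space or no mbufs: the datagram is dropped */
		m_freem(m);
	} else
		sorwakeup(so);		/* wake readers, deliver SIGIO/kevent */
#endif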
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
struct mbuf *m = sb->sb_mb;
while (m && m->m_nextpkt)
m = m->m_nextpkt;
if (m != sb->sb_lastrecord) {
printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
sb->sb_mb, sb->sb_lastrecord, m);
printf("packet chain:\n");
for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
printf("\t%p\n", m);
panic("sblastrecordchk from %s", where);
}
}
void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
struct mbuf *m = sb->sb_mb;
struct mbuf *n;
while (m && m->m_nextpkt)
m = m->m_nextpkt;
while (m && m->m_next)
m = m->m_next;
if (m != sb->sb_mbtail) {
printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
sb->sb_mb, sb->sb_mbtail, m);
printf("packet tree:\n");
for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
printf("\t");
for (n = m; n != NULL; n = n->m_next)
printf("%p ", n);
printf("\n");
}
panic("sblastmbufchk from %s", where);
}
}
#endif /* SOCKBUF_DEBUG */
#define SBLINKRECORD(sb, m0) \
do { \
if ((sb)->sb_lastrecord != NULL) \
(sb)->sb_lastrecord->m_nextpkt = (m0); \
else \
(sb)->sb_mb = (m0); \
(sb)->sb_lastrecord = (m0); \
} while (/*CONSTCOND*/0)
/*
* Append mbuf chain m to the last record in the
* socket buffer sb. The additional space associated
* the mbuf chain is recorded in sb. Empty mbufs are
* discarded and mbufs are compacted where possible.
*/
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
struct mbuf *n;
if (m == NULL)
return;
soassertlocked(so);
SBLASTRECORDCHK(sb, "sbappend 1");
if ((n = sb->sb_lastrecord) != NULL) {
/*
* XXX Would like to simply use sb_mbtail here, but
* XXX I need to verify that I won't miss an EOR that
* XXX way.
*/
do {
if (n->m_flags & M_EOR) {
sbappendrecord(so, sb, m); /* XXXXXX!!!! */
return;
}
} while (n->m_next && (n = n->m_next));
} else {
/*
* If this is the first record in the socket buffer, it's
* also the last record.
*/
sb->sb_lastrecord = m;
}
sbcompress(so, sb, m, n);
SBLASTRECORDCHK(sb, "sbappend 2");
}
/*
* This version of sbappend() should only be used when the caller
* absolutely knows that there will never be more than one record
* in the socket buffer, that is, a stream protocol (such as TCP).
*/
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
soassertlocked(so);
KDASSERT(m->m_nextpkt == NULL);
KASSERT(sb->sb_mb == sb->sb_lastrecord);
SBLASTMBUFCHK(sb, __func__);
sbcompress(so, sb, m, sb->sb_mbtail);
sb->sb_lastrecord = sb->sb_mb;
SBLASTRECORDCHK(sb, __func__);
}
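/*
 * Illustrative sketch (assumption): a stream protocol appending received
 * data with sbappendstream(), which keeps everything in one record.
 */
#if 0
	sbappendstream(so, &so->so_rcv, m);
	sorwakeup(so);
#endif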
#ifdef SOCKBUF_DEBUG
void
sbcheck(struct socket *so, struct sockbuf *sb)
{
struct mbuf *m, *n;
u_long len = 0, mbcnt = 0;
for (m = sb->sb_mb; m; m = m->m_nextpkt) {
for (n = m; n; n = n->m_next) {
len += n->m_len;
mbcnt += MSIZE;
if (n->m_flags & M_EXT)
mbcnt += n->m_ext.ext_size;
if (m != n && n->m_nextpkt)
panic("sbcheck nextpkt");
}
}
if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
mbcnt, sb->sb_mbcnt);
panic("sbcheck");
}
}
#endif
/*
* As above, except the mbuf chain
* begins a new record.
*/
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
struct mbuf *m;
KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
soassertlocked(so);
if (m0 == NULL)
return;
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(so, sb, m0);
SBLASTRECORDCHK(sb, "sbappendrecord 1");
SBLINKRECORD(sb, m0);
m = m0->m_next;
m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
m->m_flags |= M_EOR;
}
sbcompress(so, sb, m, m0);
SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
/*
* Append address and data, and optionally, control (ancillary) data
* to the receive queue of a socket. If present,
* m0 must include a packet header with total length.
* Returns 0 if no space in sockbuf or insufficient mbufs.
*/
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
struct mbuf *m0, struct mbuf *control)
{
struct mbuf *m, *n, *nlast;
int space = asa->sa_len;
soassertlocked(so);
	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
space += n->m_len;
if (n->m_next == NULL) /* keep pointer to last control buf */
break;
}
if (space > sbspace(so, sb))
return (0);
if (asa->sa_len > MLEN)
return (0);
MGET(m, M_DONTWAIT, MT_SONAME);
if (m == NULL)
return (0);
m->m_len = asa->sa_len;
memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
else
control = m0;
m->m_next = control;
SBLASTRECORDCHK(sb, "sbappendaddr 1");
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(so, sb, n);
	sballoc(so, sb, n);
nlast = n;
SBLINKRECORD(sb, m);
sb->sb_mbtail = nlast;
SBLASTMBUFCHK(sb, "sbappendaddr");
SBLASTRECORDCHK(sb, "sbappendaddr 2");
return (1);
}
int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
struct mbuf *control)
{
struct mbuf *m, *mlast, *n;
int space = 0;
if (control == NULL)
panic("sbappendcontrol");
for (m = control; ; m = m->m_next) {
space += m->m_len;
if (m->m_next == NULL)
break;
}
n = m; /* save pointer to last control buffer */
for (m = m0; m; m = m->m_next)
space += m->m_len;
if (space > sbspace(so, sb))
return (0);
n->m_next = m0; /* concatenate data to control */
SBLASTRECORDCHK(sb, "sbappendcontrol 1");
	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(so, sb, m);
	sballoc(so, sb, m);
mlast = m;
SBLINKRECORD(sb, control);
sb->sb_mbtail = mlast;
SBLASTMBUFCHK(sb, "sbappendcontrol");
SBLASTRECORDCHK(sb, "sbappendcontrol 2");
return (1);
}
/*
* Compress mbuf chain m into the socket
* buffer sb following mbuf n. If n
* is null, the buffer is presumed empty.
*/
void
sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
struct mbuf *n)
{
int eor = 0;
struct mbuf *o;
while (m) {
eor |= m->m_flags & M_EOR;
if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) &&
o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
m = m_free(m);
continue;
}
if (n && (n->m_flags & M_EOR) == 0 &&
/* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT) ? n->m_ext.ext_size :
		       MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
n->m_type == m->m_type) {
memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
m->m_len);
n->m_len += m->m_len;
sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
m = m_free(m);
continue;
}
if (n)
n->m_next = m;
else
sb->sb_mb = m;
sb->sb_mbtail = m;
sballoc(so, sb, m);
n = m;
m->m_flags &= ~M_EOR;
m = m->m_next;
n->m_next = NULL;
}
if (eor) {
if (n)
n->m_flags |= eor;
else
printf("semi-panic: sbcompress");
}
SBLASTMBUFCHK(sb, __func__);
}
/*
* Free all mbufs in a sockbuf.
* Check that all resources are reclaimed.
*/
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}
/*
* Drop data from (the front of) a sockbuf.
*/
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
struct mbuf *m, *mn;
struct mbuf *next;
KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
soassertlocked(so);
	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
if (next == NULL)
panic("sbdrop");
m = next;
next = m->m_nextpkt;
continue;
}
if (m->m_len > len) {
m->m_len -= len;
m->m_data += len;
sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
break;
}
len -= m->m_len;
sbfree(so, sb, m);
mn = m_free(m);
m = mn;
}
	while (m && m->m_len == 0) {
		sbfree(so, sb, m);
mn = m_free(m);
m = mn;
}
if (m) {
sb->sb_mb = m;
m->m_nextpkt = next;
} else
sb->sb_mb = next;
/*
* First part is an inline SB_EMPTY_FIXUP(). Second part
* makes sure sb_lastrecord is up-to-date if we dropped
* part of the last record.
*/
m = sb->sb_mb;
if (m == NULL) {
sb->sb_mbtail = NULL;
sb->sb_lastrecord = NULL;
} else if (m->m_nextpkt == NULL)
sb->sb_lastrecord = m;
}
/*
* Drop a record off the front of a sockbuf
* and move the next record to the front.
*/
void
sbdroprecord(struct socket *so, struct sockbuf *sb)
{
struct mbuf *m, *mn;
m = sb->sb_mb;
if (m) {
sb->sb_mb = m->m_nextpkt;
do {
sbfree(so, sb, m);
mn = m_free(m);
} while ((m = mn) != NULL);
}
SB_EMPTY_FIXUP(sb);
}
/*
* Create a "control" mbuf containing the specified data
* with the specified type for presentation on a socket buffer.
*/
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
struct cmsghdr *cp;
struct mbuf *m;
if (CMSG_SPACE(size) > MCLBYTES) {
printf("sbcreatecontrol: message too large %zu\n", size);
return (NULL);
}
if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
return (NULL);
if (CMSG_SPACE(size) > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
cp = mtod(m, struct cmsghdr *);
memset(cp, 0, CMSG_SPACE(size));
memcpy(CMSG_DATA(cp), p, size);
m->m_len = CMSG_SPACE(size);
cp->cmsg_len = CMSG_LEN(size);
cp->cmsg_level = level;
cp->cmsg_type = type;
return (m);
}
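/*
 * Illustrative sketch (assumption): building an ancillary-data mbuf with
 * sbcreatecontrol(), e.g. an SCM_TIMESTAMP control message that a receive
 * path could pass to sbappendaddr().
 */
#if 0
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET);
	/* a NULL return means the data is delivered without ancillary info */
#endif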
/* $OpenBSD: ip_icmp.c,v 1.191 2022/05/05 13:57:40 claudio Exp $ */
/* $NetBSD: ip_icmp.c,v 1.19 1996/02/13 23:42:22 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include "carp.h"
#include "pf.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
#include <netinet/icmp_var.h>
#if NCARP > 0
#include <net/if_types.h>
#include <netinet/ip_carp.h>
#endif
#if NPF > 0
#include <net/pfvar.h>
#endif
/*
* ICMP routines: error generation, receive packet processing, and
* routines to turnaround packets back to the originator, and
* host table maintenance routines.
*/
#ifdef ICMPPRINTFS
int icmpprintfs = 0; /* Settable from ddb */
#endif
/* values controllable via sysctl */
int icmpmaskrepl = 0;
int icmpbmcastecho = 0;
int icmptstamprepl = 1;
int icmperrppslim = 100;
int icmp_rediraccept = 0;
int icmp_redirtimeout = 10 * 60;
static int icmperrpps_count = 0;
static struct timeval icmperrppslim_last;
struct rttimer_queue ip_mtudisc_timeout_q;
struct rttimer_queue icmp_redirect_timeout_q;
struct cpumem *icmpcounters;
const struct sysctl_bounded_args icmpctl_vars[] = {
{ ICMPCTL_MASKREPL, &icmpmaskrepl, 0, 1 },
{ ICMPCTL_BMCASTECHO, &icmpbmcastecho, 0, 1 },
{ ICMPCTL_ERRPPSLIMIT, &icmperrppslim, -1, INT_MAX },
{ ICMPCTL_REDIRACCEPT, &icmp_rediraccept, 0, 1 },
{ ICMPCTL_TSTAMPREPL, &icmptstamprepl, 0, 1 },
};
void icmp_mtudisc_timeout(struct rtentry *, u_int);
int icmp_ratelimit(const struct in_addr *, const int, const int);
int icmp_input_if(struct ifnet *, struct mbuf **, int *, int, int);
int icmp_sysctl_icmpstat(void *, size_t *, void *);
void
icmp_init(void)
{
rt_timer_queue_init(&ip_mtudisc_timeout_q, ip_mtudisc_timeout,
&icmp_mtudisc_timeout);
rt_timer_queue_init(&icmp_redirect_timeout_q, icmp_redirtimeout,
NULL);
icmpcounters = counters_alloc(icps_ncounters);
}
struct mbuf *
icmp_do_error(struct mbuf *n, int type, int code, u_int32_t dest, int destmtu)
{
struct ip *oip = mtod(n, struct ip *), *nip;
unsigned oiplen = oip->ip_hl << 2;
struct icmp *icp;
struct mbuf *m;
unsigned icmplen, mblen;
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("icmp_error(%x, %d, %d)\n", oip, type, code);
#endif
	if (type != ICMP_REDIRECT)
		icmpstat_inc(icps_error);
/*
* Don't send error if not the first fragment of message.
* Don't error if the old packet protocol was ICMP
* error message, only known informational types.
*/
if (oip->ip_off & htons(IP_OFFMASK))
goto freeit;
	if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
	    n->m_len >= oiplen + ICMP_MINLEN &&
	    !ICMP_INFOTYPE(((struct icmp *)
((caddr_t)oip + oiplen))->icmp_type)) {
icmpstat_inc(icps_oldicmp);
goto freeit;
}
/* Don't send error in response to a multicast or broadcast packet */
if (n->m_flags & (M_BCAST|M_MCAST))
goto freeit;
/*
* First, do a rate limitation check.
*/
if (icmp_ratelimit(&oip->ip_src, type, code)) {
icmpstat_inc(icps_toofreq);
goto freeit;
}
/*
* Now, formulate icmp message
*/
icmplen = oiplen + min(8, ntohs(oip->ip_len));
/*
* Defend against mbuf chains shorter than oip->ip_len:
*/
mblen = 0;
for (m = n; m && (mblen < icmplen); m = m->m_next)
mblen += m->m_len;
icmplen = min(mblen, icmplen);
/*
* As we are not required to return everything we have,
* we return whatever we can return at ease.
*
* Note that ICMP datagrams longer than 576 octets are out of spec
* according to RFC1812;
*/
KASSERT(ICMP_MINLEN + sizeof (struct ip) <= MCLBYTES);
if (sizeof (struct ip) + icmplen + ICMP_MINLEN > MCLBYTES)
icmplen = MCLBYTES - ICMP_MINLEN - sizeof (struct ip);
m = m_gethdr(M_DONTWAIT, MT_HEADER);
if (m && ((sizeof (struct ip) + icmplen + ICMP_MINLEN +
sizeof(long) - 1) &~ (sizeof(long) - 1)) > MHLEN) {
MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
m = NULL;
}
}
if (m == NULL)
goto freeit;
/* keep in same rtable and preserve other pkthdr bits */
m->m_pkthdr.ph_rtableid = n->m_pkthdr.ph_rtableid;
m->m_pkthdr.ph_ifidx = n->m_pkthdr.ph_ifidx;
/* move PF_GENERATED to new packet, if existent XXX preserve more? */
	if (n->m_pkthdr.pf.flags & PF_TAG_GENERATED)
		m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
m->m_pkthdr.len = m->m_len = icmplen + ICMP_MINLEN;
m_align(m, m->m_len);
icp = mtod(m, struct icmp *);
if ((u_int)type > ICMP_MAXTYPE)
panic("icmp_error");
icmpstat_inc(icps_outhist + type);
icp->icmp_type = type;
if (type == ICMP_REDIRECT)
icp->icmp_gwaddr.s_addr = dest;
else {
icp->icmp_void = 0;
/*
* The following assignments assume an overlay with the
* zeroed icmp_void field.
*/
if (type == ICMP_PARAMPROB) {
icp->icmp_pptr = code;
code = 0;
} else if (type == ICMP_UNREACH &&
code == ICMP_UNREACH_NEEDFRAG && destmtu)
icp->icmp_nextmtu = htons(destmtu);
}
icp->icmp_code = code;
m_copydata(n, 0, icmplen, &icp->icmp_ip);
/*
* Now, copy old ip header (without options)
* in front of icmp message.
*/
m = m_prepend(m, sizeof(struct ip), M_DONTWAIT);
if (m == NULL)
goto freeit;
nip = mtod(m, struct ip *);
/* ip_v set in ip_output */
nip->ip_hl = sizeof(struct ip) >> 2;
nip->ip_tos = 0;
nip->ip_len = htons(m->m_len);
/* ip_id set in ip_output */
nip->ip_off = 0;
/* ip_ttl set in icmp_reflect */
nip->ip_p = IPPROTO_ICMP;
nip->ip_src = oip->ip_src;
nip->ip_dst = oip->ip_dst;
m_freem(n);
return (m);
freeit:
m_freem(n);
return (NULL);
}
/*
* Generate an error packet of type error
* in response to bad packet ip.
*
* The ip packet inside has ip_off and ip_len in host byte order.
*/
void
icmp_error(struct mbuf *n, int type, int code, u_int32_t dest, int destmtu)
{
struct mbuf *m;
m = icmp_do_error(n, type, code, dest, destmtu);
	if (m != NULL)
		if (!icmp_reflect(m, NULL, NULL))
			icmp_send(m, NULL);
}
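/*
 * Illustrative sketch (assumption): how a caller such as the forwarding
 * path might report an error with icmp_error().  The mbuf `m' holding the
 * offending packet is consumed either way; `ifp' is a placeholder.
 */
#if 0
	icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, ifp->if_mtu);
#endif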
/*
* Process a received ICMP message.
*/
int
icmp_input(struct mbuf **mp, int *offp, int proto, int af)
{
struct ifnet *ifp;
ifp = if_get((*mp)->m_pkthdr.ph_ifidx);
if (ifp == NULL) {
m_freemp(mp);
return IPPROTO_DONE;
}
proto = icmp_input_if(ifp, mp, offp, proto, af);
if_put(ifp);
return proto;
}
int
icmp_input_if(struct ifnet *ifp, struct mbuf **mp, int *offp, int proto, int af)
{
struct mbuf *m = *mp;
int hlen = *offp;
struct icmp *icp;
struct ip *ip = mtod(m, struct ip *);
struct sockaddr_in sin;
int icmplen, i, code;
struct in_ifaddr *ia;
void (*ctlfunc)(int, struct sockaddr *, u_int, void *);
struct mbuf *opts;
/*
* Locate icmp structure in mbuf, and check
* that not corrupted and of at least minimum length.
*/
icmplen = ntohs(ip->ip_len) - hlen;
#ifdef ICMPPRINTFS
if (icmpprintfs) {
char dst[INET_ADDRSTRLEN], src[INET_ADDRSTRLEN];
inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst));
inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src));
printf("icmp_input from %s to %s, len %d\n", src, dst, icmplen);
}
#endif
if (icmplen < ICMP_MINLEN) {
icmpstat_inc(icps_tooshort);
goto freeit;
}
i = hlen + min(icmplen, ICMP_ADVLENMAX);
if ((m = *mp = m_pullup(m, i)) == NULL) {
icmpstat_inc(icps_tooshort);
return IPPROTO_DONE;
}
ip = mtod(m, struct ip *);
if (in4_cksum(m, 0, hlen, icmplen)) {
icmpstat_inc(icps_checksum);
goto freeit;
}
icp = (struct icmp *)(mtod(m, caddr_t) + hlen);
#ifdef ICMPPRINTFS
/*
* Message type specific processing.
*/
if (icmpprintfs)
printf("icmp_input, type %d code %d\n", icp->icmp_type,
icp->icmp_code);
#endif
if (icp->icmp_type > ICMP_MAXTYPE)
goto raw;
#if NPF > 0
if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
switch (icp->icmp_type) {
/*
* As pf_icmp_mapping() considers redirects belonging to a
* diverted connection, we must include it here.
*/
case ICMP_REDIRECT:
/* FALLTHROUGH */
/*
* These ICMP types map to other connections. They must be
* delivered to pr_ctlinput() also for diverted connections.
*/
case ICMP_UNREACH:
case ICMP_TIMXCEED:
case ICMP_PARAMPROB:
case ICMP_SOURCEQUENCH:
/*
* Do not use the divert-to property of the TCP or UDP
* rule when doing the PCB lookup for the raw socket.
*/
m->m_pkthdr.pf.flags &=~ PF_TAG_DIVERTED;
break;
default:
goto raw;
}
}
#endif /* NPF */
icmpstat_inc(icps_inhist + icp->icmp_type);
code = icp->icmp_code;
switch (icp->icmp_type) {
case ICMP_UNREACH:
switch (code) {
case ICMP_UNREACH_NET:
case ICMP_UNREACH_HOST:
case ICMP_UNREACH_PROTOCOL:
case ICMP_UNREACH_PORT:
case ICMP_UNREACH_SRCFAIL:
code += PRC_UNREACH_NET;
break;
case ICMP_UNREACH_NEEDFRAG:
code = PRC_MSGSIZE;
break;
case ICMP_UNREACH_NET_UNKNOWN:
case ICMP_UNREACH_NET_PROHIB:
case ICMP_UNREACH_TOSNET:
code = PRC_UNREACH_NET;
break;
case ICMP_UNREACH_HOST_UNKNOWN:
case ICMP_UNREACH_ISOLATED:
case ICMP_UNREACH_HOST_PROHIB:
case ICMP_UNREACH_TOSHOST:
case ICMP_UNREACH_FILTER_PROHIB:
case ICMP_UNREACH_HOST_PRECEDENCE:
case ICMP_UNREACH_PRECEDENCE_CUTOFF:
code = PRC_UNREACH_HOST;
break;
default:
goto badcode;
}
goto deliver;
case ICMP_TIMXCEED:
if (code > 1)
goto badcode;
code += PRC_TIMXCEED_INTRANS;
goto deliver;
case ICMP_PARAMPROB:
if (code > 1)
goto badcode;
code = PRC_PARAMPROB;
goto deliver;
case ICMP_SOURCEQUENCH:
if (code)
goto badcode;
code = PRC_QUENCH;
deliver:
/*
* Problem with datagram; advise higher level routines.
*/
if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
icmpstat_inc(icps_badlen);
goto freeit;
}
if (IN_MULTICAST(icp->icmp_ip.ip_dst.s_addr))
goto badcode;
#ifdef INET6
/* Get more contiguous data for a v6 in v4 ICMP message. */
if (icp->icmp_ip.ip_p == IPPROTO_IPV6) {
if (icmplen < ICMP_V6ADVLENMIN ||
icmplen < ICMP_V6ADVLEN(icp)) {
icmpstat_inc(icps_badlen);
goto freeit;
}
}
#endif /* INET6 */
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
#endif
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_addr = icp->icmp_ip.ip_dst;
#if NCARP > 0
if (carp_lsdrop(ifp, m, AF_INET, &sin.sin_addr.s_addr,
&ip->ip_dst.s_addr, 1))
goto freeit;
#endif
/*
* XXX if the packet contains [IPv4 AH TCP], we can't make a
* notification to TCP layer.
*/
ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
if (ctlfunc)
(*ctlfunc)(code, sintosa(&sin), m->m_pkthdr.ph_rtableid,
&icp->icmp_ip);
break;
badcode:
icmpstat_inc(icps_badcode);
break;
case ICMP_ECHO:
if (!icmpbmcastecho &&
(m->m_flags & (M_MCAST | M_BCAST)) != 0) {
icmpstat_inc(icps_bmcastecho);
break;
}
icp->icmp_type = ICMP_ECHOREPLY;
goto reflect;
case ICMP_TSTAMP:
if (icmptstamprepl == 0)
break;
if (!icmpbmcastecho &&
(m->m_flags & (M_MCAST | M_BCAST)) != 0) {
icmpstat_inc(icps_bmcastecho);
break;
}
if (icmplen < ICMP_TSLEN) {
icmpstat_inc(icps_badlen);
break;
}
icp->icmp_type = ICMP_TSTAMPREPLY;
icp->icmp_rtime = iptime();
icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */
goto reflect;
case ICMP_MASKREQ:
if (icmpmaskrepl == 0)
break;
if (icmplen < ICMP_MASKLEN) {
icmpstat_inc(icps_badlen);
break;
}
/*
* We are not able to respond with all ones broadcast
* unless we receive it over a point-to-point interface.
*/
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_len = sizeof(struct sockaddr_in);
if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
ip->ip_dst.s_addr == INADDR_ANY)
sin.sin_addr = ip->ip_src;
else
sin.sin_addr = ip->ip_dst;
if (ifp == NULL)
break;
ia = ifatoia(ifaof_ifpforaddr(sintosa(&sin), ifp));
if (ia == NULL)
break;
icp->icmp_type = ICMP_MASKREPLY;
icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
if (ip->ip_src.s_addr == 0) {
if (ifp->if_flags & IFF_BROADCAST) {
if (ia->ia_broadaddr.sin_addr.s_addr)
ip->ip_src = ia->ia_broadaddr.sin_addr;
else
ip->ip_src.s_addr = INADDR_BROADCAST;
}
else if (ifp->if_flags & IFF_POINTOPOINT)
ip->ip_src = ia->ia_dstaddr.sin_addr;
}
reflect:
#if NCARP > 0
if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr,
&ip->ip_dst.s_addr, 1))
goto freeit;
#endif
icmpstat_inc(icps_reflect);
icmpstat_inc(icps_outhist + icp->icmp_type);
if (!icmp_reflect(m, &opts, NULL)) {
icmp_send(m, opts);
m_free(opts);
}
return IPPROTO_DONE;
case ICMP_REDIRECT:
{
struct sockaddr_in sdst;
struct sockaddr_in sgw;
struct sockaddr_in ssrc;
struct rtentry *newrt = NULL;
if (icmp_rediraccept == 0 || ipforwarding == 1)
goto freeit;
if (code > 3)
goto badcode;
if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
icmpstat_inc(icps_badlen);
break;
}
/*
* Short circuit routing redirects to force
* immediate change in the kernel's routing
* tables. The message is also handed to anyone
* listening on a raw socket (e.g. the routing
* daemon for use in updating its tables).
*/
memset(&sdst, 0, sizeof(sdst));
memset(&sgw, 0, sizeof(sgw));
memset(&ssrc, 0, sizeof(ssrc));
sdst.sin_family = sgw.sin_family = ssrc.sin_family = AF_INET;
sdst.sin_len = sgw.sin_len = ssrc.sin_len = sizeof(sdst);
memcpy(&sdst.sin_addr, &icp->icmp_ip.ip_dst,
sizeof(sdst.sin_addr));
memcpy(&sgw.sin_addr, &icp->icmp_gwaddr,
sizeof(sgw.sin_addr));
memcpy(&ssrc.sin_addr, &ip->ip_src,
sizeof(ssrc.sin_addr));
#ifdef ICMPPRINTFS
if (icmpprintfs) {
char gw[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
inet_ntop(AF_INET, &icp->icmp_gwaddr, gw, sizeof(gw));
inet_ntop(AF_INET, &icp->icmp_ip.ip_dst,
dst, sizeof(dst));
printf("redirect dst %s to %s\n", dst, gw);
}
#endif
#if NCARP > 0
if (carp_lsdrop(ifp, m, AF_INET, &sdst.sin_addr.s_addr,
&ip->ip_dst.s_addr, 1))
goto freeit;
#endif
rtredirect(sintosa(&sdst), sintosa(&sgw),
sintosa(&ssrc), &newrt, m->m_pkthdr.ph_rtableid);
if (newrt != NULL && icmp_redirtimeout > 0) {
rt_timer_add(newrt, &icmp_redirect_timeout_q,
m->m_pkthdr.ph_rtableid);
}
rtfree(newrt);
pfctlinput(PRC_REDIRECT_HOST, sintosa(&sdst));
break;
}
/*
* No kernel processing for the following;
* just fall through to send to raw listener.
*/
case ICMP_ECHOREPLY:
case ICMP_ROUTERADVERT:
case ICMP_ROUTERSOLICIT:
case ICMP_TSTAMPREPLY:
case ICMP_IREQREPLY:
case ICMP_MASKREPLY:
case ICMP_TRACEROUTE:
case ICMP_DATACONVERR:
case ICMP_MOBILE_REDIRECT:
case ICMP_IPV6_WHEREAREYOU:
case ICMP_IPV6_IAMHERE:
case ICMP_MOBILE_REGREQUEST:
case ICMP_MOBILE_REGREPLY:
case ICMP_PHOTURIS:
default:
break;
}
raw:
return rip_input(mp, offp, proto, af);
freeit:
m_freem(m);
return IPPROTO_DONE;
}
/*
* Reflect the ip packet back to the source
*/
int
icmp_reflect(struct mbuf *m, struct mbuf **op, struct in_ifaddr *ia)
{
struct ip *ip = mtod(m, struct ip *);
struct mbuf *opts = NULL;
struct sockaddr_in sin;
struct rtentry *rt = NULL;
int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
u_int rtableid;
u_int8_t pfflags;
if (!in_canforward(ip->ip_src) &&
((ip->ip_src.s_addr & IN_CLASSA_NET) !=
htonl(IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) {
m_freem(m); /* Bad return address */
return (EHOSTUNREACH);
}
if (m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) {
m_freem(m);
return (ELOOP);
}
rtableid = m->m_pkthdr.ph_rtableid;
pfflags = m->m_pkthdr.pf.flags;
m_resethdr(m);
m->m_pkthdr.ph_rtableid = rtableid;
m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED;
/*
* If the incoming packet was addressed directly to us,
* use dst as the src for the reply. For broadcast, use
* the address which corresponds to the incoming interface.
*/
if (ia == NULL) {
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr = ip->ip_dst;
rt = rtalloc(sintosa(&sin), 0, rtableid);
if (rtisvalid(rt) &&
ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST))
ia = ifatoia(rt->rt_ifa);
}
/*
* The following happens if the packet was not addressed to us.
* Use the new source address and do a route lookup. If it fails
* drop the packet as there is no path to the host.
*/
if (ia == NULL) {
rtfree(rt);
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr = ip->ip_src;
/* keep packet in the original virtual instance */
rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid);
if (rt == NULL) {
ipstat_inc(ips_noroute);
m_freem(m);
return (EHOSTUNREACH);
}
ia = ifatoia(rt->rt_ifa);
}
ip->ip_dst = ip->ip_src;
ip->ip_ttl = MAXTTL;
/* It is safe to dereference ``ia'' iff ``rt'' is valid. */
ip->ip_src = ia->ia_addr.sin_addr;
rtfree(rt);
if (optlen > 0) {
u_char *cp;
int opt, cnt;
u_int len;
/*
* Retrieve any source routing from the incoming packet;
* add on any record-route or timestamp options.
*/
cp = (u_char *) (ip + 1);
if (op && (opts = ip_srcroute(m)) == NULL &&
(opts = m_gethdr(M_DONTWAIT, MT_HEADER))) {
opts->m_len = sizeof(struct in_addr);
mtod(opts, struct in_addr *)->s_addr = 0;
}
if (op && opts) {
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("icmp_reflect optlen %d rt %d => ",
optlen, opts->m_len);
#endif
for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
len = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp))
break;
len = cp[IPOPT_OLEN];
if (len < IPOPT_OLEN + sizeof(*cp) ||
len > cnt)
break;
}
/*
* Should check for overflow, but it
* "can't happen"
*/
if (opt == IPOPT_RR || opt == IPOPT_TS ||
opt == IPOPT_SECURITY) {
memcpy(mtod(opts, caddr_t) +
opts->m_len, cp, len);
opts->m_len += len;
}
}
/* Terminate & pad, if necessary */
			if ((cnt = opts->m_len % 4) != 0)
				for (; cnt < 4; cnt++) {
*(mtod(opts, caddr_t) + opts->m_len) =
IPOPT_EOL;
opts->m_len++;
}
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("%d\n", opts->m_len);
#endif
}
ip_stripoptions(m);
}
m->m_flags &= ~(M_BCAST|M_MCAST);
	if (op)
		*op = opts;
return (0);
}
/*
* Send an icmp packet back to the ip level
*/
void
icmp_send(struct mbuf *m, struct mbuf *opts)
{
struct ip *ip = mtod(m, struct ip *);
int hlen;
struct icmp *icp;
hlen = ip->ip_hl << 2;
icp = (struct icmp *)(mtod(m, caddr_t) + hlen);
icp->icmp_cksum = 0;
m->m_pkthdr.csum_flags = M_ICMP_CSUM_OUT;
#ifdef ICMPPRINTFS
if (icmpprintfs) {
char dst[INET_ADDRSTRLEN], src[INET_ADDRSTRLEN];
inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst));
inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src));
printf("icmp_send dst %s src %s\n", dst, src);
}
#endif
/*
* ip_send() cannot handle IP options properly. So in case we have
* options fill out the IP header here and use ip_send_raw() instead.
*/
if (opts != NULL) {
m = ip_insertoptions(m, opts, &hlen);
ip = mtod(m, struct ip *);
ip->ip_hl = (hlen >> 2);
ip->ip_v = IPVERSION;
ip->ip_off &= htons(IP_DF);
ip->ip_id = htons(ip_randomid());
ipstat_inc(ips_localout);
ip_send_raw(m);
} else
ip_send(m);
}
u_int32_t
iptime(void)
{
struct timeval atv;
u_long t;
microtime(&atv);
t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000;
return (htonl(t));
}
int
icmp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen)
{
int error;
/* All sysctl names at this level are terminal. */
if (namelen != 1)
return (ENOTDIR);
switch (name[0]) {
case ICMPCTL_REDIRTIMEOUT:
NET_LOCK();
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&icmp_redirtimeout, 0, INT_MAX);
rt_timer_queue_change(&icmp_redirect_timeout_q,
icmp_redirtimeout);
NET_UNLOCK();
break;
case ICMPCTL_STATS:
error = icmp_sysctl_icmpstat(oldp, oldlenp, newp);
break;
default:
NET_LOCK();
error = sysctl_bounded_arr(icmpctl_vars, nitems(icmpctl_vars),
name, namelen, oldp, oldlenp, newp, newlen);
NET_UNLOCK();
break;
}
return (error);
}
int
icmp_sysctl_icmpstat(void *oldp, size_t *oldlenp, void *newp)
{
uint64_t counters[icps_ncounters];
struct icmpstat icmpstat;
u_long *words = (u_long *)&icmpstat;
int i;
CTASSERT(sizeof(icmpstat) == (nitems(counters) * sizeof(u_long)));
memset(&icmpstat, 0, sizeof icmpstat);
counters_read(icmpcounters, counters, nitems(counters));
for (i = 0; i < nitems(counters); i++)
words[i] = (u_long)counters[i];
return (sysctl_rdstruct(oldp, oldlenp, newp,
&icmpstat, sizeof(icmpstat)));
}
struct rtentry *
icmp_mtudisc_clone(struct in_addr dst, u_int rtableid, int ipsec)
{
struct sockaddr_in sin;
struct rtentry *rt;
int error;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_len = sizeof(sin);
sin.sin_addr = dst;
rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid);
/* Check if the route is actually usable */
if (!rtisvalid(rt))
goto bad;
/* IPsec needs the route only for PMTU, it can use reject for that */
if (!ipsec && (rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)))
goto bad;
/*
* No PMTU for local routes and permanent neighbors,
* ARP and NDP use the same expire timer as the route.
*/
if (ISSET(rt->rt_flags, RTF_LOCAL) ||
(ISSET(rt->rt_flags, RTF_LLINFO) && rt->rt_expire == 0))
goto bad;
/* If we didn't get a host route, allocate one */
if ((rt->rt_flags & RTF_HOST) == 0) {
struct rtentry *nrt;
struct rt_addrinfo info;
struct sockaddr_rtlabel sa_rl;
memset(&info, 0, sizeof(info));
info.rti_ifa = rt->rt_ifa;
info.rti_flags = RTF_GATEWAY | RTF_HOST | RTF_DYNAMIC;
info.rti_info[RTAX_DST] = sintosa(&sin);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_LABEL] =
rtlabel_id2sa(rt->rt_labelid, &sa_rl);
error = rtrequest(RTM_ADD, &info, rt->rt_priority, &nrt,
rtableid);
if (error)
goto bad;
nrt->rt_rmx = rt->rt_rmx;
rtfree(rt);
rt = nrt;
rtm_send(rt, RTM_ADD, 0, rtableid);
}
error = rt_timer_add(rt, &ip_mtudisc_timeout_q, rtableid);
if (error)
goto bad;
return (rt);
bad:
rtfree(rt);
return (NULL);
}
/* Table of common MTUs: */
static const u_short mtu_table[] = {
65535, 65280, 32000, 17914, 9180, 8166,
4352, 2002, 1492, 1006, 508, 296, 68, 0
};
void
icmp_mtudisc(struct icmp *icp, u_int rtableid)
{
struct rtentry *rt;
struct ifnet *ifp;
u_long mtu = ntohs(icp->icmp_nextmtu); /* Why a long? IPv6 */
rt = icmp_mtudisc_clone(icp->icmp_ip.ip_dst, rtableid, 0);
if (rt == NULL)
return;
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL) {
rtfree(rt);
return;
}
if (mtu == 0) {
int i = 0;
mtu = ntohs(icp->icmp_ip.ip_len);
/* Some 4.2BSD-based routers incorrectly adjust the ip_len */
if (mtu > rt->rt_mtu && rt->rt_mtu != 0)
mtu -= (icp->icmp_ip.ip_hl << 2);
/* If we still can't guess a value, try the route */
if (mtu == 0) {
mtu = rt->rt_mtu;
/* If no route mtu, default to the interface mtu */
if (mtu == 0)
mtu = ifp->if_mtu;
}
for (i = 0; i < nitems(mtu_table); i++)
if (mtu > mtu_table[i]) {
mtu = mtu_table[i];
break;
}
}
/*
* XXX: RTV_MTU is overloaded, since the admin can set it
* to turn off PMTU for a route, and the kernel can
* set it to indicate a serious problem with PMTU
* on a route. We should be using a separate flag
* for the kernel to indicate this.
*/
if ((rt->rt_locks & RTV_MTU) == 0) {
if (mtu < 296 || mtu > ifp->if_mtu)
rt->rt_locks |= RTV_MTU;
else if (rt->rt_mtu > mtu || rt->rt_mtu == 0)
rt->rt_mtu = mtu;
}
if_put(ifp);
rtfree(rt);
}
void
icmp_mtudisc_timeout(struct rtentry *rt, u_int rtableid)
{
struct ifnet *ifp;
NET_ASSERT_LOCKED();
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL)
return;
if ((rt->rt_flags & (RTF_DYNAMIC|RTF_HOST)) == (RTF_DYNAMIC|RTF_HOST)) {
void (*ctlfunc)(int, struct sockaddr *, u_int, void *);
struct sockaddr_in sin;
sin = *satosin(rt_key(rt));
rtdeletemsg(rt, ifp, rtableid);
/* Notify TCP layer of increased Path MTU estimate */
ctlfunc = inetsw[ip_protox[IPPROTO_TCP]].pr_ctlinput;
if (ctlfunc)
(*ctlfunc)(PRC_MTUINC, sintosa(&sin),
rtableid, NULL);
} else {
if ((rt->rt_locks & RTV_MTU) == 0)
rt->rt_mtu = 0;
}
if_put(ifp);
}
/*
* Perform rate limit check.
* Returns 0 if it is okay to send the icmp packet.
* Returns 1 if the router SHOULD NOT send this icmp packet due to rate
* limitation.
*
* XXX per-destination/type check necessary?
*/
int
icmp_ratelimit(const struct in_addr *dst, const int type, const int code)
{
/* PPS limit */
if (!ppsratecheck(&icmperrppslim_last, &icmperrpps_count,
icmperrppslim))
return 1; /* The packet is subject to rate limit */
return 0; /* okay to send */
}
int
icmp_do_exthdr(struct mbuf *m, u_int16_t class, u_int8_t ctype, void *buf,
size_t len)
{
struct ip *ip = mtod(m, struct ip *);
int hlen, off;
struct mbuf *n;
struct icmp *icp;
struct icmp_ext_hdr *ieh;
struct {
struct icmp_ext_hdr ieh;
struct icmp_ext_obj_hdr ieo;
} hdr;
hlen = ip->ip_hl << 2;
icp = (struct icmp *)(mtod(m, caddr_t) + hlen);
if (icp->icmp_type != ICMP_TIMXCEED && icp->icmp_type != ICMP_UNREACH &&
icp->icmp_type != ICMP_PARAMPROB)
/* exthdr not supported */
return (0);
if (icp->icmp_length != 0)
/* exthdr already present, giving up */
return (0);
/* the actual offset starts after the common ICMP header */
hlen += ICMP_MINLEN;
/* exthdr must start on a word boundary */
off = roundup(ntohs(ip->ip_len) - hlen, sizeof(u_int32_t));
/* ... and at an offset of ICMP_EXT_OFFSET or bigger */
off = max(off, ICMP_EXT_OFFSET);
icp->icmp_length = off / sizeof(u_int32_t);
memset(&hdr, 0, sizeof(hdr));
hdr.ieh.ieh_version = ICMP_EXT_HDR_VERSION;
hdr.ieo.ieo_length = htons(sizeof(struct icmp_ext_obj_hdr) + len);
hdr.ieo.ieo_cnum = class;
hdr.ieo.ieo_ctype = ctype;
if (m_copyback(m, hlen + off, sizeof(hdr), &hdr, M_NOWAIT) ||
m_copyback(m, hlen + off + sizeof(hdr), len, buf, M_NOWAIT)) {
m_freem(m);
return (ENOBUFS);
}
/* calculate checksum */
n = m_getptr(m, hlen + off, &off);
if (n == NULL)
panic("icmp_do_exthdr: m_getptr failure");
ieh = (struct icmp_ext_hdr *)(mtod(n, caddr_t) + off);
ieh->ieh_cksum = in4_cksum(n, 0, off, sizeof(hdr) + len);
ip->ip_len = htons(m->m_pkthdr.len);
return (0);
}
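/*
 * Layout sketch of what icmp_do_exthdr() appends (derived from the code
 * above; actual offsets depend on ICMP_EXT_OFFSET and the quoted datagram):
 *
 * [ ICMP header, icmp_length = off in 32-bit words ]
 * [ quoted original datagram, padded out to hlen + off ]
 * [ icmp_ext_hdr | icmp_ext_obj_hdr (class, ctype) | payload of len bytes ]
 *
 * The extension checksum computed above covers the icmp_ext_hdr, the object
 * header and the appended payload.
 */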
/* $OpenBSD: kern_sig.c,v 1.299 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_sig.c,v 1.54 1996/04/22 01:38:32 christos Exp $ */
/*
* Copyright (c) 1997 Theo de Raadt. All rights reserved.
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
*/
#include <sys/param.h>
#include <sys/signalvar.h>
#include <sys/queue.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/acct.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/wait.h>
#include <sys/ktrace.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/sched.h>
#include <sys/user.h>
#include <sys/syslog.h>
#include <sys/ttycom.h>
#include <sys/pledge.h>
#include <sys/witness.h>
#include <sys/exec_elf.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_extern.h>
#include <machine/tcb.h>
int nosuidcoredump = 1;
int filt_sigattach(struct knote *kn);
void filt_sigdetach(struct knote *kn);
int filt_signal(struct knote *kn, long hint);
const struct filterops sig_filtops = {
.f_flags = 0,
.f_attach = filt_sigattach,
.f_detach = filt_sigdetach,
.f_event = filt_signal,
};
/*
* The array below categorizes the signals and their default actions.
*/
const int sigprop[NSIG] = {
0, /* unused */
SA_KILL, /* SIGHUP */
SA_KILL, /* SIGINT */
SA_KILL|SA_CORE, /* SIGQUIT */
SA_KILL|SA_CORE, /* SIGILL */
SA_KILL|SA_CORE, /* SIGTRAP */
SA_KILL|SA_CORE, /* SIGABRT */
SA_KILL|SA_CORE, /* SIGEMT */
SA_KILL|SA_CORE, /* SIGFPE */
SA_KILL, /* SIGKILL */
SA_KILL|SA_CORE, /* SIGBUS */
SA_KILL|SA_CORE, /* SIGSEGV */
SA_KILL|SA_CORE, /* SIGSYS */
SA_KILL, /* SIGPIPE */
SA_KILL, /* SIGALRM */
SA_KILL, /* SIGTERM */
SA_IGNORE, /* SIGURG */
SA_STOP, /* SIGSTOP */
SA_STOP|SA_TTYSTOP, /* SIGTSTP */
SA_IGNORE|SA_CONT, /* SIGCONT */
SA_IGNORE, /* SIGCHLD */
SA_STOP|SA_TTYSTOP, /* SIGTTIN */
SA_STOP|SA_TTYSTOP, /* SIGTTOU */
SA_IGNORE, /* SIGIO */
SA_KILL, /* SIGXCPU */
SA_KILL, /* SIGXFSZ */
SA_KILL, /* SIGVTALRM */
SA_KILL, /* SIGPROF */
SA_IGNORE, /* SIGWINCH */
SA_IGNORE, /* SIGINFO */
SA_KILL, /* SIGUSR1 */
SA_KILL, /* SIGUSR2 */
SA_IGNORE, /* SIGTHR */
};
#define CONTSIGMASK (sigmask(SIGCONT))
#define STOPSIGMASK (sigmask(SIGSTOP) | sigmask(SIGTSTP) | \
sigmask(SIGTTIN) | sigmask(SIGTTOU))
void setsigvec(struct proc *, int, struct sigaction *);
void proc_stop(struct proc *p, int);
void proc_stop_sweep(void *);
void *proc_stop_si;
void setsigctx(struct proc *, int, struct sigctx *);
void postsig_done(struct proc *, int, sigset_t, int);
void postsig(struct proc *, int, struct sigctx *);
int cansignal(struct proc *, struct process *, int);
struct pool sigacts_pool; /* memory pool for sigacts structures */
void sigio_del(struct sigiolst *);
void sigio_unlink(struct sigio_ref *, struct sigiolst *);
struct mutex sigio_lock = MUTEX_INITIALIZER(IPL_HIGH);
/*
* Can thread p send the signal signum to process qr?
*/
int
cansignal(struct proc *p, struct process *qr, int signum)
{
struct process *pr = p->p_p;
struct ucred *uc = p->p_ucred;
struct ucred *quc = qr->ps_ucred;
if (uc->cr_uid == 0)
return (1); /* root can always signal */
if (pr == qr)
return (1); /* process can always signal itself */
/* optimization: if the same creds then the tests below will pass */
if (uc == quc)
return (1);
if (signum == SIGCONT && qr->ps_session == pr->ps_session)
return (1); /* SIGCONT in session */
/*
* Using kill(), only certain signals can be sent to setugid
* child processes
*/
if (qr->ps_flags & PS_SUGID) {
switch (signum) {
case 0:
case SIGKILL:
case SIGINT:
case SIGTERM:
case SIGALRM:
case SIGSTOP:
case SIGTTIN:
case SIGTTOU:
case SIGTSTP:
case SIGHUP:
case SIGUSR1:
case SIGUSR2:
if (uc->cr_ruid == quc->cr_ruid ||
uc->cr_uid == quc->cr_ruid)
return (1);
}
return (0);
}
if (uc->cr_ruid == quc->cr_ruid ||
uc->cr_ruid == quc->cr_svuid ||
uc->cr_uid == quc->cr_ruid ||
uc->cr_uid == quc->cr_svuid)
return (1);
return (0);
}
/*
* Initialize signal-related data structures.
*/
void
signal_init(void)
{
proc_stop_si = softintr_establish(IPL_SOFTCLOCK, proc_stop_sweep,
NULL);
if (proc_stop_si == NULL)
panic("signal_init failed to register softintr");
pool_init(&sigacts_pool, sizeof(struct sigacts), 0, IPL_NONE,
PR_WAITOK, "sigapl", NULL);
}
/*
* Initialize a new sigaltstack structure.
*/
void
sigstkinit(struct sigaltstack *ss)
{
ss->ss_flags = SS_DISABLE;
ss->ss_size = 0;
ss->ss_sp = NULL;
}
/*
* Create an initial sigacts structure, using the same signal state
* as pr.
*/
struct sigacts *
sigactsinit(struct process *pr)
{
struct sigacts *ps;
ps = pool_get(&sigacts_pool, PR_WAITOK);
memcpy(ps, pr->ps_sigacts, sizeof(struct sigacts));
return (ps);
}
/*
* Release a sigacts structure.
*/
void
sigactsfree(struct sigacts *ps)
{
pool_put(&sigacts_pool, ps);
}
int
sys_sigaction(struct proc *p, void *v, register_t *retval)
{
struct sys_sigaction_args /* {
syscallarg(int) signum;
syscallarg(const struct sigaction *) nsa;
syscallarg(struct sigaction *) osa;
} */ *uap = v;
struct sigaction vec;
#ifdef KTRACE
struct sigaction ovec;
#endif
struct sigaction *sa;
const struct sigaction *nsa;
struct sigaction *osa;
struct sigacts *ps = p->p_p->ps_sigacts;
int signum;
int bit, error;
signum = SCARG(uap, signum);
nsa = SCARG(uap, nsa);
osa = SCARG(uap, osa);
if (signum <= 0 || signum >= NSIG ||
(nsa && (signum == SIGKILL || signum == SIGSTOP)))
return (EINVAL);
sa = &vec;
if (osa) {
mtx_enter(&p->p_p->ps_mtx);
sa->sa_handler = ps->ps_sigact[signum];
sa->sa_mask = ps->ps_catchmask[signum];
bit = sigmask(signum);
sa->sa_flags = 0;
if ((ps->ps_sigonstack & bit) != 0)
sa->sa_flags |= SA_ONSTACK;
if ((ps->ps_sigintr & bit) == 0)
sa->sa_flags |= SA_RESTART;
if ((ps->ps_sigreset & bit) != 0)
sa->sa_flags |= SA_RESETHAND;
if ((ps->ps_siginfo & bit) != 0)
sa->sa_flags |= SA_SIGINFO;
if (signum == SIGCHLD) {
if ((ps->ps_sigflags & SAS_NOCLDSTOP) != 0)
sa->sa_flags |= SA_NOCLDSTOP;
if ((ps->ps_sigflags & SAS_NOCLDWAIT) != 0)
sa->sa_flags |= SA_NOCLDWAIT;
}
mtx_leave(&p->p_p->ps_mtx);
if ((sa->sa_mask & bit) == 0)
sa->sa_flags |= SA_NODEFER;
sa->sa_mask &= ~bit;
error = copyout(sa, osa, sizeof (vec));
if (error)
return (error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ovec = vec;
#endif
}
if (nsa) {
error = copyin(nsa, sa, sizeof (vec));
if (error)
return (error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrsigaction(p, sa);
#endif
setsigvec(p, signum, sa);
}
#ifdef KTRACE
if (osa && KTRPOINT(p, KTR_STRUCT))
ktrsigaction(p, &ovec);
#endif
return (0);
}
void
setsigvec(struct proc *p, int signum, struct sigaction *sa)
{
struct sigacts *ps = p->p_p->ps_sigacts;
int bit;
bit = sigmask(signum);
mtx_enter(&p->p_p->ps_mtx);
ps->ps_sigact[signum] = sa->sa_handler;
if ((sa->sa_flags & SA_NODEFER) == 0)
sa->sa_mask |= sigmask(signum);
ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask;
if (signum == SIGCHLD) {
if (sa->sa_flags & SA_NOCLDSTOP)
atomic_setbits_int(&ps->ps_sigflags, SAS_NOCLDSTOP);
else
atomic_clearbits_int(&ps->ps_sigflags, SAS_NOCLDSTOP);
/*
* If the SA_NOCLDWAIT flag is set or the handler
* is SIG_IGN we reparent the dying child to PID 1
* (init) which will reap the zombie. Because we use
* init to do our dirty work we never set SAS_NOCLDWAIT
* for PID 1.
* XXX exit1 rework means this is unnecessary?
*/
if (initprocess->ps_sigacts != ps &&
((sa->sa_flags & SA_NOCLDWAIT) ||
sa->sa_handler == SIG_IGN))
atomic_setbits_int(&ps->ps_sigflags, SAS_NOCLDWAIT);
else
atomic_clearbits_int(&ps->ps_sigflags, SAS_NOCLDWAIT);
}
if ((sa->sa_flags & SA_RESETHAND) != 0)
ps->ps_sigreset |= bit;
else
ps->ps_sigreset &= ~bit;
if ((sa->sa_flags & SA_SIGINFO) != 0)
ps->ps_siginfo |= bit;
else
ps->ps_siginfo &= ~bit;
if ((sa->sa_flags & SA_RESTART) == 0)
ps->ps_sigintr |= bit;
else
ps->ps_sigintr &= ~bit;
if ((sa->sa_flags & SA_ONSTACK) != 0)
ps->ps_sigonstack |= bit;
else
ps->ps_sigonstack &= ~bit;
/*
* Set bit in ps_sigignore for signals that are set to SIG_IGN,
* and for signals set to SIG_DFL where the default is to ignore.
* However, don't put SIGCONT in ps_sigignore,
* as we have to restart the process.
*/
if (sa->sa_handler == SIG_IGN ||
(sigprop[signum] & SA_IGNORE && sa->sa_handler == SIG_DFL)) {
atomic_clearbits_int(&p->p_siglist, bit);
atomic_clearbits_int(&p->p_p->ps_siglist, bit);
if (signum != SIGCONT)
ps->ps_sigignore |= bit; /* easier in psignal */
ps->ps_sigcatch &= ~bit;
} else {
ps->ps_sigignore &= ~bit;
if (sa->sa_handler == SIG_DFL)
ps->ps_sigcatch &= ~bit;
else
ps->ps_sigcatch |= bit;
}
mtx_leave(&p->p_p->ps_mtx);
}
/*
* Initialize signal state for process 0;
* set to ignore signals that are ignored by default.
*/
void
siginit(struct sigacts *ps)
{
int i;
for (i = 0; i < NSIG; i++)
if (sigprop[i] & SA_IGNORE && i != SIGCONT)
ps->ps_sigignore |= sigmask(i);
ps->ps_sigflags = SAS_NOCLDWAIT | SAS_NOCLDSTOP;
}
/*
* Reset signals for an exec by the specified thread.
*/
void
execsigs(struct proc *p)
{
struct sigacts *ps;
int nc, mask;
ps = p->p_p->ps_sigacts;
mtx_enter(&p->p_p->ps_mtx);
/*
* Reset caught signals. Held signals remain held
* through p_sigmask (unless they were caught,
* and are now ignored by default).
*/
while (ps->ps_sigcatch) {
nc = ffs((long)ps->ps_sigcatch);
mask = sigmask(nc);
ps->ps_sigcatch &= ~mask;
if (sigprop[nc] & SA_IGNORE) {
if (nc != SIGCONT)
ps->ps_sigignore |= mask;
atomic_clearbits_int(&p->p_siglist, mask);
atomic_clearbits_int(&p->p_p->ps_siglist, mask);
}
ps->ps_sigact[nc] = SIG_DFL;
}
/*
* Reset stack state to the user stack.
* Clear set of signals caught on the signal stack.
*/
sigstkinit(&p->p_sigstk);
atomic_clearbits_int(&ps->ps_sigflags, SAS_NOCLDWAIT);
if (ps->ps_sigact[SIGCHLD] == SIG_IGN)
ps->ps_sigact[SIGCHLD] = SIG_DFL;
mtx_leave(&p->p_p->ps_mtx);
}
/*
* Manipulate signal mask.
* Note that we receive new mask, not pointer,
* and return old mask as return value;
* the library stub does the rest.
*/
int
sys_sigprocmask(struct proc *p, void *v, register_t *retval)
{
struct sys_sigprocmask_args /* {
syscallarg(int) how;
syscallarg(sigset_t) mask;
} */ *uap = v;
int error = 0;
sigset_t mask;
KASSERT(p == curproc);
*retval = p->p_sigmask;
mask = SCARG(uap, mask) &~ sigcantmask;
switch (SCARG(uap, how)) {
case SIG_BLOCK:
atomic_setbits_int(&p->p_sigmask, mask);
break;
case SIG_UNBLOCK:
atomic_clearbits_int(&p->p_sigmask, mask);
break;
case SIG_SETMASK:
p->p_sigmask = mask;
break;
default:
error = EINVAL;
break;
}
return (error);
}
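/*
 * Sketch of the convention described above (hypothetical userland stub, not
 * the actual libc code): the new mask is passed by value and the old mask
 * comes back as the syscall return value, so a sigprocmask(2) wrapper only
 * has to store it:
 *
 * old = __syscall_sigprocmask(how, *set);
 * if (oset != NULL)
 * *oset = old;
 * return (0);
 */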
int
sys_sigpending(struct proc *p, void *v, register_t *retval)
{
*retval = p->p_siglist | p->p_p->ps_siglist;
return (0);
}
/*
* Temporarily replace calling proc's signal mask for the duration of a
* system call. Original signal mask will be restored by userret().
*/
void
dosigsuspend(struct proc *p, sigset_t newmask)
{
KASSERT(p == curproc);
p->p_oldmask = p->p_sigmask;
atomic_setbits_int(&p->p_flag, P_SIGSUSPEND);
p->p_sigmask = newmask;
}
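/*
 * Usage sketch (the pattern of the callers below, sys_sigsuspend() and
 * sys___thrsigdivert()): install the temporary mask, sleep interruptibly,
 * and rely on userret() to restore p_oldmask once P_SIGSUSPEND is seen:
 *
 * dosigsuspend(p, newmask & ~sigcantmask);
 * error = tsleep_nsec(&chan, PPAUSE | PCATCH, "sigsusp", INFSLP);
 * ("chan" is a placeholder wait channel)
 */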
/*
* Suspend thread until signal, providing mask to be set
* in the meantime. Note nonstandard calling convention:
* libc stub passes mask, not pointer, to save a copyin.
*/
int
sys_sigsuspend(struct proc *p, void *v, register_t *retval)
{
struct sys_sigsuspend_args /* {
syscallarg(int) mask;
} */ *uap = v;
dosigsuspend(p, SCARG(uap, mask) &~ sigcantmask);
while (tsleep_nsec(&nowake, PPAUSE|PCATCH, "sigsusp", INFSLP) == 0)
continue;
/* always return EINTR rather than ERESTART... */
return (EINTR);
}
int
sigonstack(size_t stack)
{
const struct sigaltstack *ss = &curproc->p_sigstk;
return (ss->ss_flags & SS_DISABLE ? 0 :
(stack - (size_t)ss->ss_sp < ss->ss_size));
}
int
sys_sigaltstack(struct proc *p, void *v, register_t *retval)
{
struct sys_sigaltstack_args /* {
syscallarg(const struct sigaltstack *) nss;
syscallarg(struct sigaltstack *) oss;
} */ *uap = v;
struct sigaltstack ss;
const struct sigaltstack *nss;
struct sigaltstack *oss;
int onstack = sigonstack(PROC_STACK(p));
int error;
nss = SCARG(uap, nss);
oss = SCARG(uap, oss);
if (oss != NULL) {
ss = p->p_sigstk;
if (onstack)
ss.ss_flags |= SS_ONSTACK;
if ((error = copyout(&ss, oss, sizeof(ss))))
return (error);
}
if (nss == NULL)
return (0);
error = copyin(nss, &ss, sizeof(ss));
if (error)
return (error);
if (onstack)
return (EPERM);
if (ss.ss_flags & ~SS_DISABLE)
return (EINVAL);
if (ss.ss_flags & SS_DISABLE) {
p->p_sigstk.ss_flags = ss.ss_flags;
return (0);
}
if (ss.ss_size < MINSIGSTKSZ)
return (ENOMEM);
error = uvm_map_remap_as_stack(p, (vaddr_t)ss.ss_sp, ss.ss_size);
if (error)
return (error);
p->p_sigstk = ss;
return (0);
}
int
sys_kill(struct proc *cp, void *v, register_t *retval)
{
struct sys_kill_args /* {
syscallarg(int) pid;
syscallarg(int) signum;
} */ *uap = v;
struct process *pr;
int pid = SCARG(uap, pid);
int signum = SCARG(uap, signum);
int error;
int zombie = 0;
if ((error = pledge_kill(cp, pid)) != 0)
return (error);
if (((u_int)signum) >= NSIG)
return (EINVAL);
if (pid > 0) {
if ((pr = prfind(pid)) == NULL) {
if ((pr = zombiefind(pid)) == NULL)
return (ESRCH);
else
zombie = 1;
}
if (!cansignal(cp, pr, signum))
return (EPERM);
/* kill single process */
if (signum && !zombie)
prsignal(pr, signum);
return (0);
}
switch (pid) {
case -1: /* broadcast signal */
return (killpg1(cp, signum, 0, 1));
case 0: /* signal own process group */
return (killpg1(cp, signum, 0, 0));
default: /* negative explicit process group */
return (killpg1(cp, signum, -pid, 0));
}
}
int
sys_thrkill(struct proc *cp, void *v, register_t *retval)
{
struct sys_thrkill_args /* {
syscallarg(pid_t) tid;
syscallarg(int) signum;
syscallarg(void *) tcb;
} */ *uap = v;
struct proc *p;
int tid = SCARG(uap, tid);
int signum = SCARG(uap, signum);
void *tcb;
if (((u_int)signum) >= NSIG)
return (EINVAL);
if (tid > THREAD_PID_OFFSET) {
if ((p = tfind(tid - THREAD_PID_OFFSET)) == NULL)
return (ESRCH);
/* can only kill threads in the same process */
if (p->p_p != cp->p_p)
return (ESRCH);
} else if (tid == 0)
p = cp;
else
return (EINVAL);
/* optionally require the target thread to have the given tcb addr */
tcb = SCARG(uap, tcb);
if (tcb != NULL && tcb != TCB_GET(p))
return (ESRCH);
if (signum)
ptsignal(p, signum, STHREAD);
return (0);
}
/*
* Common code for kill process group/broadcast kill.
* cp is calling process.
*/
int
killpg1(struct proc *cp, int signum, int pgid, int all)
{
struct process *pr;
struct pgrp *pgrp;
int nfound = 0;
if (all) {
/*
* broadcast
*/
LIST_FOREACH(pr, &allprocess, ps_list) {
if (pr->ps_pid <= 1 ||
pr->ps_flags & (PS_SYSTEM | PS_NOBROADCASTKILL) ||
pr == cp->p_p || !cansignal(cp, pr, signum))
continue;
nfound++;
if (signum)
prsignal(pr, signum);
}
} else {
if (pgid == 0)
/*
* zero pgid means send to my process group.
*/
pgrp = cp->p_p->ps_pgrp;
else {
pgrp = pgfind(pgid);
if (pgrp == NULL)
return (ESRCH);
}
LIST_FOREACH(pr, &pgrp->pg_members, ps_pglist) {
if (pr->ps_pid <= 1 || pr->ps_flags & PS_SYSTEM ||
!cansignal(cp, pr, signum))
continue;
nfound++;
if (signum)
prsignal(pr, signum);
}
}
return (nfound ? 0 : ESRCH);
}
#define CANDELIVER(uid, euid, pr) \
(euid == 0 || \
(uid) == (pr)->ps_ucred->cr_ruid || \
(uid) == (pr)->ps_ucred->cr_svuid || \
(uid) == (pr)->ps_ucred->cr_uid || \
(euid) == (pr)->ps_ucred->cr_ruid || \
(euid) == (pr)->ps_ucred->cr_svuid || \
(euid) == (pr)->ps_ucred->cr_uid)
#define CANSIGIO(cr, pr) \
CANDELIVER((cr)->cr_ruid, (cr)->cr_uid, (pr))
/*
* Send a signal to a process group. If checktty is 1,
* limit to members which have a controlling terminal.
*/
void
pgsignal(struct pgrp *pgrp, int signum, int checkctty)
{
struct process *pr;
if (pgrp)
LIST_FOREACH(pr, &pgrp->pg_members, ps_pglist)
if (checkctty == 0 || pr->ps_flags & PS_CONTROLT)
prsignal(pr, signum);
}
/*
* Send a SIGIO or SIGURG signal to a process or process group using stored
* credentials rather than those of the current process.
*/
void
pgsigio(struct sigio_ref *sir, int sig, int checkctty)
{
struct process *pr;
struct sigio *sigio;
if (sir->sir_sigio == NULL)
return;
KERNEL_LOCK();
mtx_enter(&sigio_lock);
sigio = sir->sir_sigio;
if (sigio == NULL)
goto out;
if (sigio->sio_pgid > 0) {
if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc))
prsignal(sigio->sio_proc, sig);
} else if (sigio->sio_pgid < 0) {
LIST_FOREACH(pr, &sigio->sio_pgrp->pg_members, ps_pglist) {
if (CANSIGIO(sigio->sio_ucred, pr) &&
(checkctty == 0 || (pr->ps_flags & PS_CONTROLT)))
prsignal(pr, sig);
}
}
out:
mtx_leave(&sigio_lock);
KERNEL_UNLOCK();
}
/*
* Recalculate the signal mask and reset the signal disposition after
* usermode frame for delivery is formed.
*/
void
postsig_done(struct proc *p, int signum, sigset_t catchmask, int reset)
{
p->p_ru.ru_nsignals++;
atomic_setbits_int(&p->p_sigmask, catchmask);
if (reset != 0) {
sigset_t mask = sigmask(signum);
struct sigacts *ps = p->p_p->ps_sigacts;
mtx_enter(&p->p_p->ps_mtx);
ps->ps_sigcatch &= ~mask;
if (signum != SIGCONT && sigprop[signum] & SA_IGNORE)
ps->ps_sigignore |= mask;
ps->ps_sigact[signum] = SIG_DFL;
mtx_leave(&p->p_p->ps_mtx);
}
}
/*
* Send a signal caused by a trap to the current thread
* If it will be caught immediately, deliver it with correct code.
* Otherwise, post it normally.
*/
void
trapsignal(struct proc *p, int signum, u_long trapno, int code,
union sigval sigval)
{
struct process *pr = p->p_p;
struct sigctx ctx;
int mask;
switch (signum) {
case SIGILL:
case SIGBUS:
case SIGSEGV:
pr->ps_acflag |= ATRAP;
break;
}
mask = sigmask(signum);
setsigctx(p, signum, &ctx);
if ((pr->ps_flags & PS_TRACED) == 0 && ctx.sig_catch != 0 &&
(p->p_sigmask & mask) == 0) {
siginfo_t si;
initsiginfo(&si, signum, trapno, code, sigval);
#ifdef KTRACE
if (KTRPOINT(p, KTR_PSIG)) {
ktrpsig(p, signum, ctx.sig_action,
p->p_sigmask, code, &si);
}
#endif
if (sendsig(ctx.sig_action, signum, p->p_sigmask, &si,
ctx.sig_info, ctx.sig_onstack)) {
KERNEL_LOCK();
sigexit(p, SIGILL);
/* NOTREACHED */
}
postsig_done(p, signum, ctx.sig_catchmask, ctx.sig_reset);
} else {
p->p_sisig = signum;
p->p_sitrapno = trapno; /* XXX for core dump/debugger */
p->p_sicode = code;
p->p_sigval = sigval;
/*
* If traced, stop if signal is masked, and stay stopped
* until released by the debugger. If our parent process
* is waiting for us, don't hang as we could deadlock.
*/
if (((pr->ps_flags & (PS_TRACED | PS_PPWAIT)) == PS_TRACED) &&
signum != SIGKILL && (p->p_sigmask & mask) != 0) {
int s;
single_thread_set(p, SINGLE_SUSPEND, 0);
pr->ps_xsig = signum;
SCHED_LOCK(s);
proc_stop(p, 1);
SCHED_UNLOCK(s);
signum = pr->ps_xsig;
single_thread_clear(p, 0);
/*
* If we are no longer being traced, or the parent
* didn't give us a signal, skip sending the signal.
*/
if ((pr->ps_flags & PS_TRACED) == 0 ||
signum == 0)
return;
/* update signal info */
p->p_sisig = signum;
mask = sigmask(signum);
}
/*
* Signals like SIGBUS and SIGSEGV should not, when
* generated by the kernel, be ignorable or blockable.
* If it is and we're not being traced, then just kill
* the process.
* After vfs_shutdown(9), init(8) cannot receive signals
* because new code pages of the signal handler cannot be
* mapped from halted storage. init(8) may not die or the
* kernel panics. Better loop between signal handler and
* page fault trap until the machine is halted.
*/
if ((pr->ps_flags & PS_TRACED) == 0 &&
(sigprop[signum] & SA_KILL) &&
((p->p_sigmask & mask) || ctx.sig_ignore) &&
pr->ps_pid != 1) {
KERNEL_LOCK();
sigexit(p, signum);
/* NOTREACHED */
}
KERNEL_LOCK();
ptsignal(p, signum, STHREAD);
KERNEL_UNLOCK();
}
}
/*
* Send the signal to the process. If the signal has an action, the action
* is usually performed by the target process rather than the caller; we add
* the signal to the set of pending signals for the process.
*
* Exceptions:
* o When a stop signal is sent to a sleeping process that takes the
* default action, the process is stopped without awakening it.
* o SIGCONT restarts stopped processes (or puts them back to sleep)
* regardless of the signal action (eg, blocked or ignored).
*
* Other ignored signals are discarded immediately.
*/
void
psignal(struct proc *p, int signum)
{
ptsignal(p, signum, SPROCESS);
}
/*
* type = SPROCESS process signal, can be diverted (sigwait())
* type = STHREAD thread signal, but should be propagated if unhandled
* type = SPROPAGATED propagated to this thread, so don't propagate again
*/
void
ptsignal(struct proc *p, int signum, enum signal_type type)
{
int s, prop;
sig_t action;
int mask;
int *siglist;
struct process *pr = p->p_p;
struct proc *q;
int wakeparent = 0;
KERNEL_ASSERT_LOCKED();
#ifdef DIAGNOSTIC
if ((u_int)signum >= NSIG || signum == 0)
panic("psignal signal number");
#endif
/* Ignore signal if the target process is exiting */
if (pr->ps_flags & PS_EXITING)
return;
mask = sigmask(signum);
if (type == SPROCESS) {
/* Accept SIGKILL to coredumping processes */
if (pr->ps_flags & PS_COREDUMP && signum == SIGKILL) {
atomic_setbits_int(&pr->ps_siglist, mask);
return;
}
/*
* If the current thread can process the signal
* immediately (it's unblocked) then have it take it.
*/
q = curproc;
if (q != NULL && q->p_p == pr && (q->p_flag & P_WEXIT) == 0 &&
(q->p_sigmask & mask) == 0)
p = q;
else {
/*
* A process-wide signal can be diverted to a
* different thread that's in sigwait() for this
* signal. If there isn't such a thread, then
* pick a thread that doesn't have it blocked so
* that the stop/kill consideration isn't
* delayed. Otherwise, mark it pending on the
* main thread.
*/
TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
/* ignore exiting threads */
if (q->p_flag & P_WEXIT)
continue;
/* skip threads that have the signal blocked */
if ((q->p_sigmask & mask) != 0)
continue;
/* okay, could send to this thread */
p = q;
/*
* sigsuspend, sigwait, ppoll/pselect, etc?
* Definitely go to this thread, as it's
* already blocked in the kernel.
*/
if (q->p_flag & P_SIGSUSPEND)
break;
}
}
}
if (type != SPROPAGATED)
KNOTE(&pr->ps_klist, NOTE_SIGNAL | signum);
prop = sigprop[signum];
/*
* If proc is traced, always give parent a chance.
*/
if (pr->ps_flags & PS_TRACED) {
action = SIG_DFL;
} else {
sigset_t sigcatch, sigignore;
/*
* If the signal is being ignored,
* then we forget about it immediately.
* (Note: we don't set SIGCONT in ps_sigignore,
* and if it is set to SIG_IGN,
* action will be SIG_DFL here.)
*/
mtx_enter(&pr->ps_mtx);
sigignore = pr->ps_sigacts->ps_sigignore;
sigcatch = pr->ps_sigacts->ps_sigcatch;
mtx_leave(&pr->ps_mtx);
if (sigignore & mask)
return;
if (p->p_sigmask & mask) {
action = SIG_HOLD;
} else if (sigcatch & mask) {
action = SIG_CATCH;
} else {
action = SIG_DFL;
if (prop & SA_KILL && pr->ps_nice > NZERO)
pr->ps_nice = NZERO;
/*
* If sending a tty stop signal to a member of an
* orphaned process group, discard the signal here if
* the action is default; don't stop the process below
* if sleeping, and don't clear any pending SIGCONT.
*/
if (prop & SA_TTYSTOP && pr->ps_pgrp->pg_jobc == 0)
return;
}
}
/*
* If delivered to process, mark as pending there. Continue and stop
* signals will be propagated to all threads. So they are always
* marked at thread level.
*/
siglist = (type == SPROCESS) ? &pr->ps_siglist : &p->p_siglist;
if (prop & SA_CONT) {
siglist = &p->p_siglist;
atomic_clearbits_int(siglist, STOPSIGMASK);
}
if (prop & SA_STOP) {
siglist = &p->p_siglist;
atomic_clearbits_int(siglist, CONTSIGMASK);
atomic_clearbits_int(&p->p_flag, P_CONTINUED);
}
/*
* XXX delay processing of SA_STOP signals unless action == SIG_DFL?
*/
if (prop & (SA_CONT | SA_STOP) && type != SPROPAGATED)
TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link)
if (q != p)
ptsignal(q, signum, SPROPAGATED);
/*
* Defer further processing for signals which are held,
* except that stopped processes must be continued by SIGCONT.
*/
if (action == SIG_HOLD && ((prop & SA_CONT) == 0 ||
p->p_stat != SSTOP)) {
atomic_setbits_int(siglist, mask);
return;
}
SCHED_LOCK(s);
switch (p->p_stat) {
case SSLEEP:
/*
* If process is sleeping uninterruptibly
* we can't interrupt the sleep... the signal will
* be noticed when the process returns through
* trap() or syscall().
*/
if ((p->p_flag & P_SINTR) == 0)
goto out;
/*
* Process is sleeping and traced... make it runnable
* so it can discover the signal in cursig() and stop
* for the parent.
*/
if (pr->ps_flags & PS_TRACED)
goto run;
/*
* If SIGCONT is default (or ignored) and process is
* asleep, we are finished; the process should not
* be awakened.
*/
if ((prop & SA_CONT) && action == SIG_DFL) {
mask = 0;
goto out;
}
/*
* When a sleeping process receives a stop
* signal, process immediately if possible.
*/
if ((prop & SA_STOP) && action == SIG_DFL) {
/*
* If a child holding parent blocked,
* stopping could cause deadlock.
*/
if (pr->ps_flags & PS_PPWAIT)
goto out;
mask = 0;
pr->ps_xsig = signum;
proc_stop(p, 0);
goto out;
}
/*
* All other (caught or default) signals
* cause the process to run.
*/
goto runfast;
/* NOTREACHED */
case SSTOP:
/*
* If traced process is already stopped,
* then no further action is necessary.
*/
if (pr->ps_flags & PS_TRACED)
goto out;
/*
* Kill signal always sets processes running.
*/
if (signum == SIGKILL) {
atomic_clearbits_int(&p->p_flag, P_SUSPSIG);
goto runfast;
}
if (prop & SA_CONT) {
/*
* If SIGCONT is default (or ignored), we continue the
* process but don't leave the signal in p_siglist, as
* it has no further action. If SIGCONT is held, we
* continue the process and leave the signal in
* p_siglist. If the process catches SIGCONT, let it
* handle the signal itself. If it isn't waiting on
* an event, then it goes back to run state.
* Otherwise, process goes back to sleep state.
*/
atomic_setbits_int(&p->p_flag, P_CONTINUED);
atomic_clearbits_int(&p->p_flag, P_SUSPSIG);
wakeparent = 1;
if (action == SIG_DFL)
atomic_clearbits_int(siglist, mask);
if (action == SIG_CATCH)
goto runfast;
if (p->p_wchan == NULL)
goto run;
p->p_stat = SSLEEP;
goto out;
}
if (prop & SA_STOP) {
/*
* Already stopped, don't need to stop again.
* (If we did the shell could get confused.)
*/
mask = 0;
goto out;
}
/*
* If process is sleeping interruptibly, then simulate a
* wakeup so that when it is continued, it will be made
* runnable and can look at the signal. But don't make
* the process runnable, leave it stopped.
*/
if (p->p_flag & P_SINTR)
unsleep(p);
goto out;
case SONPROC:
/* set siglist before issuing the ast */
atomic_setbits_int(siglist, mask);
mask = 0;
signotify(p);
/* FALLTHROUGH */
default:
/*
* SRUN, SIDL, SDEAD do nothing with the signal,
* other than kicking ourselves if we are running.
* It will either never be noticed, or noticed very soon.
*/
goto out;
}
/* NOTREACHED */
runfast:
/*
* Raise priority to at least PUSER.
*/
if (p->p_usrpri > PUSER)
p->p_usrpri = PUSER;
run:
setrunnable(p);
out:
/* finally adjust siglist */
if (mask)
atomic_setbits_int(siglist, mask);
SCHED_UNLOCK(s);
if (wakeparent)
wakeup(pr->ps_pptr);
}
/* fill the signal context which should be used by postsig() and issignal() */
void
setsigctx(struct proc *p, int signum, struct sigctx *sctx)
{
struct sigacts *ps = p->p_p->ps_sigacts;
sigset_t mask;
mtx_enter(&p->p_p->ps_mtx);
mask = sigmask(signum);
sctx->sig_action = ps->ps_sigact[signum];
sctx->sig_catchmask = ps->ps_catchmask[signum];
sctx->sig_reset = (ps->ps_sigreset & mask) != 0;
sctx->sig_info = (ps->ps_siginfo & mask) != 0;
sctx->sig_intr = (ps->ps_sigintr & mask) != 0;
sctx->sig_onstack = (ps->ps_sigonstack & mask) != 0;
sctx->sig_ignore = (ps->ps_sigignore & mask) != 0;
sctx->sig_catch = (ps->ps_sigcatch & mask) != 0;
mtx_leave(&p->p_p->ps_mtx);
}
/*
* Determine signal that should be delivered to process p, the current
* process, 0 if none.
*
* If the current process has received a signal (should be caught or cause
* termination, should interrupt current syscall), return the signal number.
* Stop signals with default action are processed immediately, then cleared;
* they aren't returned. This is checked after each entry to the system for
* a syscall or trap. The normal call sequence is
*
* while ((signum = cursig(p, &ctx)) != 0)
* postsig(p, signum, &ctx);
*
* Assumes that if the P_SINTR flag is set, we're holding both the
* kernel and scheduler locks.
*/
int
cursig(struct proc *p, struct sigctx *sctx)
{
struct process *pr = p->p_p;
int signum, mask, prop;
int dolock = (p->p_flag & P_SINTR) == 0;
sigset_t ps_siglist;
int s;
KASSERT(p == curproc);
for (;;) {
ps_siglist = READ_ONCE(pr->ps_siglist);
membar_consumer();
mask = SIGPENDING(p);
if (pr->ps_flags & PS_PPWAIT)
mask &= ~STOPSIGMASK;
if (mask == 0) /* no signal to send */
return (0);
signum = ffs((long)mask);
mask = sigmask(signum);
/* take the signal! */
if (atomic_cas_uint(&pr->ps_siglist, ps_siglist,
ps_siglist & ~mask) != ps_siglist) {
/* lost race taking the process signal, restart */
continue;
}
atomic_clearbits_int(&p->p_siglist, mask);
setsigctx(p, signum, sctx);
/*
* We should see pending but ignored signals
* only if PS_TRACED was on when they were posted.
*/
if (sctx->sig_ignore && (pr->ps_flags & PS_TRACED) == 0)
continue;
/*
* If traced, always stop, and stay stopped until released
* by the debugger. If our parent process is waiting for
* us, don't hang as we could deadlock.
*/
if (((pr->ps_flags & (PS_TRACED | PS_PPWAIT)) == PS_TRACED) &&
signum != SIGKILL) {
single_thread_set(p, SINGLE_SUSPEND, 0);
pr->ps_xsig = signum;
if (dolock)
SCHED_LOCK(s);
proc_stop(p, 1);
if (dolock)
SCHED_UNLOCK(s);
/*
* re-take the signal before releasing
* the other threads. Must check the continue
* conditions below and only take the signal if
* those are not true.
*/
signum = pr->ps_xsig;
mask = sigmask(signum);
setsigctx(p, signum, sctx);
if (!((pr->ps_flags & PS_TRACED) == 0 ||
signum == 0 ||
(p->p_sigmask & mask) != 0)) {
atomic_clearbits_int(&p->p_siglist, mask);
atomic_clearbits_int(&pr->ps_siglist, mask);
}
single_thread_clear(p, 0);
/*
* If we are no longer being traced, or the parent
* didn't give us a signal, look for more signals.
*/
if ((pr->ps_flags & PS_TRACED) == 0 ||
signum == 0)
continue;
/*
* If the new signal is being masked, look for other
* signals.
*/
if ((p->p_sigmask & mask) != 0)
continue;
}
prop = sigprop[signum];
/*
* Decide whether the signal should be returned.
* Return the signal's number, or fall through
* to clear it from the pending mask.
*/
switch ((long)sctx->sig_action) {
case (long)SIG_DFL:
/*
* Don't take default actions on system processes.
*/
if (pr->ps_pid <= 1) {
#ifdef DIAGNOSTIC
/*
* Are you sure you want to ignore SIGSEGV
* in init? XXX
*/
printf("Process (pid %d) got signal"
" %d\n", pr->ps_pid, signum);
#endif
break; /* == ignore */
}
/*
* If there is a pending stop signal to process
* with default action, stop here,
* then clear the signal. However,
* if process is member of an orphaned
* process group, ignore tty stop signals.
*/
if (prop & SA_STOP) {
if (pr->ps_flags & PS_TRACED || (pr->ps_pgrp->pg_jobc == 0 &&
prop & SA_TTYSTOP))
break; /* == ignore */
pr->ps_xsig = signum;
if (dolock)
SCHED_LOCK(s);
proc_stop(p, 1);
if (dolock)
SCHED_UNLOCK(s);
break;
} else if (prop & SA_IGNORE) {
/*
* Except for SIGCONT, shouldn't get here.
* Default action is to ignore; drop it.
*/
break; /* == ignore */
} else
goto keep;
/* NOTREACHED */
case (long)SIG_IGN:
/*
* Masking above should prevent us ever trying
* to take action on an ignored signal other
* than SIGCONT, unless process is traced.
*/
if ((prop & SA_CONT) == 0 &&
(pr->ps_flags & PS_TRACED) == 0)
printf("%s\n", __func__);
break; /* == ignore */
default:
/*
* This signal has an action, let
* postsig() process it.
*/
goto keep;
}
}
/* NOTREACHED */
keep:
atomic_setbits_int(&p->p_siglist, mask); /* leave the signal for later */
return (signum);
}
/*
* Put the argument process into the stopped state and notify the parent
* via wakeup. Signals are handled elsewhere. The process must not be
* on the run queue.
*/
void
proc_stop(struct proc *p, int sw)
{
struct process *pr = p->p_p;
#ifdef MULTIPROCESSOR
SCHED_ASSERT_LOCKED();
#endif
p->p_stat = SSTOP;
atomic_clearbits_int(&pr->ps_flags, PS_WAITED);
atomic_setbits_int(&pr->ps_flags, PS_STOPPED);
atomic_setbits_int(&p->p_flag, P_SUSPSIG);
/*
* We need this soft interrupt to be handled fast.
* Extra calls to softclock don't hurt.
*/
softintr_schedule(proc_stop_si);
if (sw)
mi_switch();
}
/*
* Called from a soft interrupt to send signals to the parents of stopped
* processes.
* We can't do this in proc_stop because it's called with nasty locks held
* and we would need recursive scheduler lock to deal with that.
*/
void
proc_stop_sweep(void *v)
{
struct process *pr;
LIST_FOREACH(pr, &allprocess, ps_list) {
if ((pr->ps_flags & PS_STOPPED) == 0)
continue;
atomic_clearbits_int(&pr->ps_flags, PS_STOPPED);
if ((pr->ps_pptr->ps_sigacts->ps_sigflags & SAS_NOCLDSTOP) == 0)
prsignal(pr->ps_pptr, SIGCHLD);
wakeup(pr->ps_pptr);
}
}
/*
* Take the action for the specified signal
* from the current set of pending signals.
*/
void
postsig(struct proc *p, int signum, struct sigctx *sctx)
{
u_long trapno;
int mask, returnmask;
siginfo_t si;
union sigval sigval;
int code;
KASSERT(signum != 0);
mask = sigmask(signum);
atomic_clearbits_int(&p->p_siglist, mask);
sigval.sival_ptr = NULL;
if (p->p_sisig != signum) {
trapno = 0;
code = SI_USER;
sigval.sival_ptr = NULL;
} else {
trapno = p->p_sitrapno;
code = p->p_sicode;
sigval = p->p_sigval;
}
initsiginfo(&si, signum, trapno, code, sigval);
#ifdef KTRACE
if (KTRPOINT(p, KTR_PSIG)) {
ktrpsig(p, signum, sctx->sig_action, p->p_flag & P_SIGSUSPEND ?
p->p_oldmask : p->p_sigmask, code, &si);
}
#endif
if (sctx->sig_action == SIG_DFL) {
/*
* Default action, where the default is to kill
* the process. (Other cases were ignored above.)
*/
KERNEL_LOCK();
sigexit(p, signum);
/* NOTREACHED */
} else {
/*
* If we get here, the signal must be caught.
*/
#ifdef DIAGNOSTIC
if (sctx->sig_action == SIG_IGN || (p->p_sigmask & mask))
panic("postsig action");
#endif
/*
* Set the new mask value and also defer further
* occurrences of this signal.
*
* Special case: user has done a sigpause. Here the
* current mask is not of interest, but rather the
* mask from before the sigpause is what we want
* restored after the signal processing is completed.
*/
if (p->p_flag & P_SIGSUSPEND) {
atomic_clearbits_int(&p->p_flag, P_SIGSUSPEND);
returnmask = p->p_oldmask;
} else {
returnmask = p->p_sigmask;
}
if (p->p_sisig == signum) {
p->p_sisig = 0;
p->p_sitrapno = 0;
p->p_sicode = SI_USER;
p->p_sigval.sival_ptr = NULL;
}
if (sendsig(sctx->sig_action, signum, returnmask, &si,
sctx->sig_info, sctx->sig_onstack)) {
KERNEL_LOCK();
sigexit(p, SIGILL);
/* NOTREACHED */
}
postsig_done(p, signum, sctx->sig_catchmask, sctx->sig_reset);
}
}
/*
* Force the current process to exit with the specified signal, dumping core
* if appropriate. We bypass the normal tests for masked and caught signals,
* allowing unrecoverable failures to terminate the process without changing
* signal state. Mark the accounting record with the signal termination.
* If dumping core, save the signal number for the debugger. Calls exit and
* does not return.
*/
void
sigexit(struct proc *p, int signum)
{
/* Mark process as going away */
atomic_setbits_int(&p->p_flag, P_WEXIT);
p->p_p->ps_acflag |= AXSIG;
if (sigprop[signum] & SA_CORE) {
p->p_sisig = signum;
/* if there are other threads, pause them */
if (P_HASSIBLING(p))
single_thread_set(p, SINGLE_SUSPEND, 1);
if (coredump(p) == 0)
signum |= WCOREFLAG;
}
exit1(p, 0, signum, EXIT_NORMAL);
/* NOTREACHED */
}
/*
* Send uncatchable SIGABRT for coredump.
*/
void
sigabort(struct proc *p)
{
struct sigaction sa;
memset(&sa, 0, sizeof sa);
sa.sa_handler = SIG_DFL;
setsigvec(p, SIGABRT, &sa);
atomic_clearbits_int(&p->p_sigmask, sigmask(SIGABRT));
psignal(p, SIGABRT);
}
/*
* Return 1 if `sig', a given signal, is ignored or masked for `p', a given
* thread, and 0 otherwise.
*/
int
sigismasked(struct proc *p, int sig)
{
struct process *pr = p->p_p;
int rv;
mtx_enter(&pr->ps_mtx);
rv = (pr->ps_sigacts->ps_sigignore & sigmask(sig)) ||
(p->p_sigmask & sigmask(sig));
mtx_leave(&pr->ps_mtx);
return !!rv;
}
struct coredump_iostate {
struct proc *io_proc;
struct vnode *io_vp;
struct ucred *io_cred;
off_t io_offset;
};
/*
* Dump core, into a file named "progname.core", unless the process was
* setuid/setgid.
*/
int
coredump(struct proc *p)
{
#ifdef SMALL_KERNEL
return EPERM;
#else
struct process *pr = p->p_p;
struct vnode *vp;
struct ucred *cred = p->p_ucred;
struct vmspace *vm = p->p_vmspace;
struct nameidata nd;
struct vattr vattr;
struct coredump_iostate io;
int error, len, incrash = 0;
char *name;
const char *dir = "/var/crash";
atomic_setbits_int(&pr->ps_flags, PS_COREDUMP);
/* Don't dump if will exceed file size limit. */
if (USPACE + ptoa(vm->vm_dsize + vm->vm_ssize) >= lim_cur(RLIMIT_CORE))
return (EFBIG);
name = pool_get(&namei_pool, PR_WAITOK);
/*
* If the process has inconsistent uids, nosuidcoredump
* determines coredump placement policy.
*/
if (((pr->ps_flags & PS_SUGID) && (error = suser(p))) ||
((pr->ps_flags & PS_SUGID) && nosuidcoredump)) {
if (nosuidcoredump == 3) {
/*
* If the program directory does not exist, dumps of
* that core will silently fail.
*/
len = snprintf(name, MAXPATHLEN, "%s/%s/%u.core",
dir, pr->ps_comm, pr->ps_pid);
incrash = KERNELPATH;
} else if (nosuidcoredump == 2) {
len = snprintf(name, MAXPATHLEN, "%s/%s.core",
dir, pr->ps_comm);
incrash = KERNELPATH;
} else {
pool_put(&namei_pool, name);
return (EPERM);
}
} else
len = snprintf(name, MAXPATHLEN, "%s.core", pr->ps_comm);
if (len >= MAXPATHLEN) {
pool_put(&namei_pool, name);
return (EACCES);
}
/*
* Control the UID used to write out. The normal case uses
* the real UID. If the sugid case is going to write into the
* controlled directory, we do so as root.
*/
if (incrash == 0) {
cred = crdup(cred);
cred->cr_uid = cred->cr_ruid;
cred->cr_gid = cred->cr_rgid;
} else {
if (p->p_fd->fd_rdir) {
vrele(p->p_fd->fd_rdir);
p->p_fd->fd_rdir = NULL;
}
p->p_ucred = crdup(p->p_ucred);
crfree(cred);
cred = p->p_ucred;
crhold(cred);
cred->cr_uid = 0;
cred->cr_gid = 0;
}
/* incrash should be 0 or KERNELPATH only */
NDINIT(&nd, 0, incrash, UIO_SYSSPACE, name, p);
error = vn_open(&nd, O_CREAT | FWRITE | O_NOFOLLOW | O_NONBLOCK,
S_IRUSR | S_IWUSR);
if (error)
goto out;
/*
* Don't dump to non-regular files, files with links, or files
* owned by someone else.
*/
vp = nd.ni_vp;
if ((error = VOP_GETATTR(vp, &vattr, cred, p)) != 0) {
VOP_UNLOCK(vp);
vn_close(vp, FWRITE, cred, p);
goto out;
}
if (vp->v_type != VREG || vattr.va_nlink != 1 ||
vattr.va_mode & ((VREAD | VWRITE) >> 3 | (VREAD | VWRITE) >> 6) ||
vattr.va_uid != cred->cr_uid) {
error = EACCES;
VOP_UNLOCK(vp);
vn_close(vp, FWRITE, cred, p);
goto out;
}
VATTR_NULL(&vattr);
vattr.va_size = 0;
VOP_SETATTR(vp, &vattr, cred, p);
pr->ps_acflag |= ACORE;
io.io_proc = p;
io.io_vp = vp;
io.io_cred = cred;
io.io_offset = 0;
VOP_UNLOCK(vp);
vref(vp);
error = vn_close(vp, FWRITE, cred, p);
if (error == 0)
error = coredump_elf(p, &io);
vrele(vp);
out:
crfree(cred);
pool_put(&namei_pool, name);
return (error);
#endif
}
#ifndef SMALL_KERNEL
int
coredump_write(void *cookie, enum uio_seg segflg, const void *data, size_t len)
{
struct coredump_iostate *io = cookie;
off_t coffset = 0;
size_t csize;
int chunk, error;
csize = len;
do {
if (sigmask(SIGKILL) &
(io->io_proc->p_siglist | io->io_proc->p_p->ps_siglist))
return (EINTR);
/* Rest of the loop sleeps with lock held, so... */
yield();
chunk = MIN(csize, MAXPHYS);
error = vn_rdwr(UIO_WRITE, io->io_vp,
(caddr_t)data + coffset, chunk,
io->io_offset + coffset, segflg,
IO_UNIT, io->io_cred, NULL, io->io_proc);
if (error) {
struct process *pr = io->io_proc->p_p;
if (error == ENOSPC)
log(LOG_ERR,
"coredump of %s(%d) failed, filesystem full\n",
pr->ps_comm, pr->ps_pid);
else
log(LOG_ERR,
"coredump of %s(%d), write failed: errno %d\n",
pr->ps_comm, pr->ps_pid, error);
return (error);
}
coffset += chunk;
csize -= chunk;
} while (csize > 0);
io->io_offset += len;
return (0);
}
void
coredump_unmap(void *cookie, vaddr_t start, vaddr_t end)
{
struct coredump_iostate *io = cookie;
uvm_unmap(&io->io_proc->p_vmspace->vm_map, start, end);
}
#endif /* !SMALL_KERNEL */
/*
* Nonexistent system call-- signal process (may want to handle it).
* Flag error in case process won't see signal immediately (blocked or ignored).
*/
int
sys_nosys(struct proc *p, void *v, register_t *retval)
{
ptsignal(p, SIGSYS, STHREAD);
return (ENOSYS);
}
int
sys___thrsigdivert(struct proc *p, void *v, register_t *retval)
{
static int sigwaitsleep;
struct sys___thrsigdivert_args /* {
syscallarg(sigset_t) sigmask;
syscallarg(siginfo_t *) info;
syscallarg(const struct timespec *) timeout;
} */ *uap = v;
struct sigctx ctx;
sigset_t mask = SCARG(uap, sigmask) &~ sigcantmask;
siginfo_t si;
uint64_t nsecs = INFSLP;
int timeinvalid = 0;
int error = 0;
memset(&si, 0, sizeof(si));
if (SCARG(uap, timeout) != NULL) {
struct timespec ts;
if ((error = copyin(SCARG(uap, timeout), &ts, sizeof(ts))) != 0)
return (error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrreltimespec(p, &ts);
#endif
if (!timespecisvalid(&ts))
timeinvalid = 1;
else
nsecs = TIMESPEC_TO_NSEC(&ts);
}
dosigsuspend(p, p->p_sigmask &~ mask);
for (;;) {
si.si_signo = cursig(p, &ctx);
if (si.si_signo != 0) {
sigset_t smask = sigmask(si.si_signo);
if (smask & mask) {
atomic_clearbits_int(&p->p_siglist, smask);
error = 0;
break;
}
}
/* per-POSIX, delay this error until after the above */
if (timeinvalid)
error = EINVAL;
/* per-POSIX, return immediately if timeout is zero-valued */
if (nsecs == 0)
error = EAGAIN;
if (error != 0)
break;
error = tsleep_nsec(&sigwaitsleep, PPAUSE|PCATCH, "sigwait",
nsecs);
}
if (error == 0) {
*retval = si.si_signo;
if (SCARG(uap, info) != NULL)
error = copyout(&si, SCARG(uap, info), sizeof(si));
} else if (error == ERESTART && SCARG(uap, timeout) != NULL) {
/*
* Restarting is wrong if there's a timeout, as it'll be
* for the same interval again
*/
error = EINTR;
}
return (error);
}
void
initsiginfo(siginfo_t *si, int sig, u_long trapno, int code, union sigval val)
{
memset(si, 0, sizeof(*si));
si->si_signo = sig;
si->si_code = code;
if (code == SI_USER) {
si->si_value = val;
} else {
switch (sig) {
case SIGSEGV:
case SIGILL:
case SIGBUS:
case SIGFPE:
si->si_addr = val.sival_ptr;
si->si_trapno = trapno;
break;
case SIGXFSZ:
break;
}
}
}
int
filt_sigattach(struct knote *kn)
{
struct process *pr = curproc->p_p;
int s;
if (kn->kn_id >= NSIG)
return EINVAL;
kn->kn_ptr.p_process = pr;
kn->kn_flags |= EV_CLEAR; /* automatically set */
s = splhigh();
klist_insert_locked(&pr->ps_klist, kn);
splx(s);
return (0);
}
void
filt_sigdetach(struct knote *kn)
{
struct process *pr = kn->kn_ptr.p_process;
int s;
s = splhigh();
klist_remove_locked(&pr->ps_klist, kn);
splx(s);
}
/*
* signal knotes are shared with proc knotes, so we apply a mask to
* the hint in order to differentiate them from process hints. This
* could be avoided by using a signal-specific knote list, but probably
* isn't worth the trouble.
*/
int
filt_signal(struct knote *kn, long hint)
{
if (hint & NOTE_SIGNAL) {
hint &= ~NOTE_SIGNAL;
if (kn->kn_id == hint)
kn->kn_data++;
}
return (kn->kn_data != 0);
}
void
userret(struct proc *p)
{
struct sigctx ctx;
int signum;
/* send SIGPROF or SIGVTALRM if their timers interrupted this thread */
if (p->p_flag & P_PROFPEND) {
atomic_clearbits_int(&p->p_flag, P_PROFPEND);
KERNEL_LOCK();
psignal(p, SIGPROF);
KERNEL_UNLOCK();
}
if (p->p_flag & P_ALRMPEND) {
atomic_clearbits_int(&p->p_flag, P_ALRMPEND);
KERNEL_LOCK();
psignal(p, SIGVTALRM);
KERNEL_UNLOCK();
}
if (SIGPENDING(p) != 0) {
while ((signum = cursig(p, &ctx)) != 0)
postsig(p, signum, &ctx);
}
/*
* If P_SIGSUSPEND is still set here, then we still need to restore
* the original sigmask before returning to userspace. Also, this
* might unmask some pending signals, so we need to check a second
* time for signals to post.
*/
if (p->p_flag & P_SIGSUSPEND) {
atomic_clearbits_int(&p->p_flag, P_SIGSUSPEND);
p->p_sigmask = p->p_oldmask;
while ((signum = cursig(p, &ctx)) != 0)
postsig(p, signum, &ctx);
}
if (p->p_flag & P_SUSPSINGLE)
single_thread_check(p, 0);
WITNESS_WARN(WARN_PANIC, NULL, "userret: returning");
p->p_cpu->ci_schedstate.spc_curpriority = p->p_usrpri;
}
int
single_thread_check_locked(struct proc *p, int deep, int s)
{
struct process *pr = p->p_p;
SCHED_ASSERT_LOCKED();
if (pr->ps_single != NULL && pr->ps_single != p) {
do {
/* if we're in deep, we need to unwind to the edge */
if (deep) {
if (pr->ps_flags & PS_SINGLEUNWIND)
return (ERESTART);
if (pr->ps_flags & PS_SINGLEEXIT)
return (EINTR);
}
if (atomic_dec_int_nv(&pr->ps_singlecount) == 0)
wakeup(&pr->ps_singlecount);
if (pr->ps_flags & PS_SINGLEEXIT) {
SCHED_UNLOCK(s);
KERNEL_LOCK();
exit1(p, 0, 0, EXIT_THREAD_NOCHECK);
/* NOTREACHED */
}
/* not exiting and don't need to unwind, so suspend */
p->p_stat = SSTOP;
mi_switch();
} while (pr->ps_single != NULL);
}
return (0);
}
int
single_thread_check(struct proc *p, int deep)
{
int s, error;
SCHED_LOCK(s);
error = single_thread_check_locked(p, deep, s);
SCHED_UNLOCK(s);
return error;
}
/*
* Stop other threads in the process. The mode controls how and
* where the other threads should stop:
* - SINGLE_SUSPEND: stop wherever they are, will later either be told to exit
* (by setting to SINGLE_EXIT) or be released (via single_thread_clear())
* - SINGLE_UNWIND: just unwind to kernel boundary, will be told to exit
* or released as with SINGLE_SUSPEND
* - SINGLE_EXIT: unwind to kernel boundary and exit
*/
int
single_thread_set(struct proc *p, enum single_thread_mode mode, int wait)
{
struct process *pr = p->p_p;
struct proc *q;
int error, s;
KASSERT(curproc == p);
SCHED_LOCK(s);
error = single_thread_check_locked(p, (mode == SINGLE_UNWIND), s);
if (error) {
SCHED_UNLOCK(s);
return error;
}
switch (mode) {
case SINGLE_SUSPEND:
break;
case SINGLE_UNWIND:
atomic_setbits_int(&pr->ps_flags, PS_SINGLEUNWIND);
break;
case SINGLE_EXIT:
atomic_setbits_int(&pr->ps_flags, PS_SINGLEEXIT);
atomic_clearbits_int(&pr->ps_flags, PS_SINGLEUNWIND);
break;
#ifdef DIAGNOSTIC
default:
panic("single_thread_mode = %d", mode);
#endif
}
pr->ps_singlecount = 0;
membar_producer();
pr->ps_single = p;
TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
if (q == p)
continue;
if (q->p_flag & P_WEXIT) {
if (mode == SINGLE_EXIT) {
if (q->p_stat == SSTOP) {
setrunnable(q);
atomic_inc_int(&pr->ps_singlecount);
}
}
continue;
}
atomic_setbits_int(&q->p_flag, P_SUSPSINGLE);
switch (q->p_stat) {
case SIDL:
case SRUN:
atomic_inc_int(&pr->ps_singlecount);
break;
case SSLEEP:
/* if it's not interruptible, then just have to wait */
if (q->p_flag & P_SINTR) {
/* merely need to suspend? just stop it */
if (mode == SINGLE_SUSPEND) {
q->p_stat = SSTOP;
break;
}
/* need to unwind or exit, so wake it */
setrunnable(q);
}
atomic_inc_int(&pr->ps_singlecount);
break;
case SSTOP:
if (mode == SINGLE_EXIT) {
setrunnable(q);
atomic_inc_int(&pr->ps_singlecount);
}
break;
case SDEAD:
break;
case SONPROC:
atomic_inc_int(&pr->ps_singlecount);
signotify(q);
break;
}
}
SCHED_UNLOCK(s);
if (wait)
single_thread_wait(pr, 1);
return 0;
}
/*
* Wait for the other threads to stop. If recheck is false, the function
* returns non-zero when the caller needs to restart the check and 0
* otherwise. If recheck is true, the return value is always 0.
*/
int
single_thread_wait(struct process *pr, int recheck)
{
struct sleep_state sls;
int wait;
/* wait until they're all suspended */
wait = pr->ps_singlecount > 0;
while (wait) {
sleep_setup(&sls, &pr->ps_singlecount, PWAIT, "suspend", 0);
wait = pr->ps_singlecount > 0;
sleep_finish(&sls, wait);
if (!recheck)
break;
}
return wait;
}
void
single_thread_clear(struct proc *p, int flag)
{
struct process *pr = p->p_p;
struct proc *q;
int s;
KASSERT(pr->ps_single == p);
KASSERT(curproc == p);
SCHED_LOCK(s);
pr->ps_single = NULL;
atomic_clearbits_int(&pr->ps_flags, PS_SINGLEUNWIND | PS_SINGLEEXIT);
TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
if (q == p || (q->p_flag & P_SUSPSINGLE) == 0)
continue;
atomic_clearbits_int(&q->p_flag, P_SUSPSINGLE);
/*
* if the thread was only stopped for single threading
* then clearing that either makes it runnable or puts
* it back into some sleep queue
*/
if (q->p_stat == SSTOP && (q->p_flag & flag) == 0) {
if (q->p_wchan == NULL)
setrunnable(q);
else
q->p_stat = SSLEEP;
}
}
SCHED_UNLOCK(s);
}
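/*
 * Usage sketch (the pairing used by callers in this file, e.g. trapsignal()
 * and cursig()): a thread that needs the process to itself suspends its
 * siblings, does its work, then releases them:
 *
 * single_thread_set(p, SINGLE_SUSPEND, 1); (1 = wait for siblings to park)
 * ... act while the other threads are stopped ...
 * single_thread_clear(p, 0);
 */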
void
sigio_del(struct sigiolst *rmlist)
{
struct sigio *sigio;
while ((sigio = LIST_FIRST(rmlist)) != NULL) {
LIST_REMOVE(sigio, sio_pgsigio);
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO, sizeof(*sigio));
}
}
void
sigio_unlink(struct sigio_ref *sir, struct sigiolst *rmlist)
{
struct sigio *sigio;
MUTEX_ASSERT_LOCKED(&sigio_lock);
sigio = sir->sir_sigio;
if (sigio != NULL) {
KASSERT(sigio->sio_myref == sir);
sir->sir_sigio = NULL;
if (sigio->sio_pgid > 0)
sigio->sio_proc = NULL;
else
sigio->sio_pgrp = NULL;
LIST_REMOVE(sigio, sio_pgsigio);
LIST_INSERT_HEAD(rmlist, sigio, sio_pgsigio);
}
}
void
sigio_free(struct sigio_ref *sir)
{
struct sigiolst rmlist;
if (sir->sir_sigio == NULL)
return;
LIST_INIT(&rmlist);
mtx_enter(&sigio_lock);
sigio_unlink(sir, &rmlist);
mtx_leave(&sigio_lock);
sigio_del(&rmlist);
}
void
sigio_freelist(struct sigiolst *sigiolst)
{
struct sigiolst rmlist;
struct sigio *sigio;
if (LIST_EMPTY(sigiolst))
return;
LIST_INIT(&rmlist);
mtx_enter(&sigio_lock);
while ((sigio = LIST_FIRST(sigiolst)) != NULL)
sigio_unlink(sigio->sio_myref, &rmlist);
mtx_leave(&sigio_lock);
sigio_del(&rmlist);
}
int
sigio_setown(struct sigio_ref *sir, u_long cmd, caddr_t data)
{
struct sigiolst rmlist;
struct proc *p = curproc;
struct pgrp *pgrp = NULL;
struct process *pr = NULL;
struct sigio *sigio;
int error;
pid_t pgid = *(int *)data;
if (pgid == 0) {
sigio_free(sir);
return (0);
}
if (cmd == TIOCSPGRP) {
if (pgid < 0)
return (EINVAL);
pgid = -pgid;
}
sigio = malloc(sizeof(*sigio), M_SIGIO, M_WAITOK);
sigio->sio_pgid = pgid;
sigio->sio_ucred = crhold(p->p_ucred);
sigio->sio_myref = sir;
LIST_INIT(&rmlist);
/*
* Holding the kernel lock, and not sleeping between prfind()/pgfind() and
* the linking of the sigio, ensure that the process or process group does
* not disappear unexpectedly.
*/
KERNEL_LOCK();
mtx_enter(&sigio_lock);
if (pgid > 0) {
pr = prfind(pgid);
if (pr == NULL) {
error = ESRCH;
goto fail;
}
/*
* Policy - Don't allow a process to FSETOWN a process
* in another session.
*
* Remove this test to allow maximum flexibility or
* restrict FSETOWN to the current process or process
* group for maximum safety.
*/
if (pr->ps_session != p->p_p->ps_session) {
error = EPERM;
goto fail;
}
if ((pr->ps_flags & PS_EXITING) != 0) {
error = ESRCH;
goto fail;
}
} else /* if (pgid < 0) */ {
pgrp = pgfind(-pgid);
if (pgrp == NULL) {
error = ESRCH;
goto fail;
}
/*
* Policy - Don't allow a process to FSETOWN a process
* in another session.
*
* Remove this test to allow maximum flexibility or
* restrict FSETOWN to the current process or process
* group for maximum safety.
*/
if (pgrp->pg_session != p->p_p->ps_session) {
error = EPERM;
goto fail;
}
}
if (pgid > 0) {
sigio->sio_proc = pr;
LIST_INSERT_HEAD(&pr->ps_sigiolst, sigio, sio_pgsigio);
} else {
sigio->sio_pgrp = pgrp;
LIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
}
sigio_unlink(sir, &rmlist);
sir->sir_sigio = sigio;
mtx_leave(&sigio_lock);
KERNEL_UNLOCK();
sigio_del(&rmlist);
return (0);
fail:
mtx_leave(&sigio_lock);
KERNEL_UNLOCK();
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO, sizeof(*sigio));
return (error);
}
void
sigio_getown(struct sigio_ref *sir, u_long cmd, caddr_t data)
{
struct sigio *sigio;
pid_t pgid = 0;
mtx_enter(&sigio_lock);
sigio = sir->sir_sigio;
if (sigio != NULL)
pgid = sigio->sio_pgid;
mtx_leave(&sigio_lock);
if (cmd == TIOCGPGRP)
pgid = -pgid;
*(int *)data = pgid;
}
void
sigio_copy(struct sigio_ref *dst, struct sigio_ref *src)
{
struct sigiolst rmlist;
struct sigio *newsigio, *sigio;
sigio_free(dst);
if (src->sir_sigio == NULL)
return;
newsigio = malloc(sizeof(*newsigio), M_SIGIO, M_WAITOK);
LIST_INIT(&rmlist);
mtx_enter(&sigio_lock);
sigio = src->sir_sigio;
if (sigio == NULL) {
mtx_leave(&sigio_lock);
free(newsigio, M_SIGIO, sizeof(*newsigio));
return;
}
newsigio->sio_pgid = sigio->sio_pgid;
newsigio->sio_ucred = crhold(sigio->sio_ucred);
newsigio->sio_myref = dst;
if (newsigio->sio_pgid > 0) {
newsigio->sio_proc = sigio->sio_proc;
LIST_INSERT_HEAD(&newsigio->sio_proc->ps_sigiolst, newsigio,
sio_pgsigio);
} else {
newsigio->sio_pgrp = sigio->sio_pgrp;
LIST_INSERT_HEAD(&newsigio->sio_pgrp->pg_sigiolst, newsigio,
sio_pgsigio);
}
sigio_unlink(dst, &rmlist);
dst->sir_sigio = newsigio;
mtx_leave(&sigio_lock);
sigio_del(&rmlist);
}
/* $OpenBSD: in_var.h,v 1.41 2018/10/18 15:23:04 cheloha Exp $ */
/* $NetBSD: in_var.h,v 1.16 1996/02/13 23:42:15 christos Exp $ */
/*
* Copyright (c) 1985, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_var.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET_IN_VAR_H_
#define _NETINET_IN_VAR_H_
#include <sys/queue.h>
#ifdef _KERNEL
/*
* Interface address, Internet version. One of these structures
* is allocated for each interface with an Internet address.
* The ifaddr structure contains the protocol-independent part
* of the structure and is assumed to be first.
*/
struct in_ifaddr {
struct ifaddr ia_ifa; /* protocol-independent info */
#define ia_ifp ia_ifa.ifa_ifp
#define ia_flags ia_ifa.ifa_flags
/* ia_net{,mask} in host order */
u_int32_t ia_net; /* network number of interface */
u_int32_t ia_netmask; /* mask of net part */
TAILQ_ENTRY(in_ifaddr) ia_list; /* list of internet addresses */
struct sockaddr_in ia_addr; /* reserve space for interface name */
struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
#define ia_broadaddr ia_dstaddr
struct sockaddr_in ia_sockmask; /* reserve space for general netmask */
struct in_multi *ia_allhosts; /* multicast address record for
the allhosts multicast group */
};
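/*
 * Since ia_ifa is the first member, a generic struct ifaddr pointer that
 * is known to describe an AF_INET address can be converted back with a
 * plain cast; that is what the ifatoia() helper used below relies on.
 * Minimal sketch (not part of the original header):
 */
#if 0
struct in_ifaddr *ia = (struct in_ifaddr *)ifa;	/* ifa: struct ifaddr * */
#endif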
#endif
struct in_aliasreq {
char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
union {
struct sockaddr_in ifrau_addr;
int ifrau_align;
} ifra_ifrau;
#ifndef ifra_addr
#define ifra_addr ifra_ifrau.ifrau_addr
#endif
struct sockaddr_in ifra_dstaddr;
#define ifra_broadaddr ifra_dstaddr
struct sockaddr_in ifra_mask;
};
#ifdef _KERNEL
/*
* Macro for finding the internet address structure (in_ifaddr) corresponding
* to a given interface (ifnet structure).
*/
#define IFP_TO_IA(ifp, ia) \
/* struct ifnet *ifp; */ \
/* struct in_ifaddr *ia; */ \
do { \
struct ifaddr *ifa; \
NET_ASSERT_LOCKED(); \
TAILQ_FOREACH(ifa, &(ifp)->if_addrlist, ifa_list) { \
if (ifa->ifa_addr->sa_family == AF_INET) \
break; \
} \
(ia) = ifatoia(ifa); \
} while (/* CONSTCOND */ 0)
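/*
 * Minimal usage sketch for IFP_TO_IA (not part of the original header):
 * fetch the first IPv4 address configured on an interface, if any.
 */
#if 0
struct in_ifaddr *ia;

NET_LOCK();
IFP_TO_IA(ifp, ia);	/* ia is NULL when ifp has no AF_INET address */
if (ia != NULL)
printf("%s: %x\n", ifp->if_xname, ntohl(ia->ia_addr.sin_addr.s_addr));
NET_UNLOCK();
#endif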
#endif
/*
* Per-interface router version information.
*/
struct router_info {
unsigned int rti_ifidx;
int rti_type; /* type of router on this interface */
int rti_age; /* time since last v1 query */
LIST_ENTRY(router_info) rti_list;
};
#ifdef _KERNEL
/*
* Internet multicast address structure. There is one of these for each IP
* multicast group to which this host belongs on a given network interface.
*/
struct in_multi {
struct ifmaddr inm_ifma; /* Protocol-independent info */
#define inm_refcnt inm_ifma.ifma_refcnt
#define inm_ifidx inm_ifma.ifma_ifidx
struct sockaddr_in inm_sin; /* IPv4 multicast address */
#define inm_addr inm_sin.sin_addr
u_int inm_state; /* state of membership */
u_int inm_timer; /* IGMP membership report timer */
struct router_info *inm_rti; /* router version info */
};
static __inline struct in_multi *
ifmatoinm(struct ifmaddr *ifma)
{
return ((struct in_multi *)(ifma));
}
/*
* Macro for looking up the in_multi record for a given IP multicast
* address on a given interface. If no matching record is found,
* "inm" is set to NULL.
*/
#define IN_LOOKUP_MULTI(addr, ifp, inm) \
/* struct in_addr addr; */ \
/* struct ifnet *ifp; */ \
/* struct in_multi *inm; */ \
do { \
struct ifmaddr *ifma; \
\
(inm) = NULL; \
NET_ASSERT_LOCKED(); \
TAILQ_FOREACH(ifma, &(ifp)->if_maddrlist, ifma_list) \
if (ifma->ifma_addr->sa_family == AF_INET && \
ifmatoinm(ifma)->inm_addr.s_addr == (addr).s_addr) {\
(inm) = ifmatoinm(ifma); \
break; \
} \
} while (/* CONSTCOND */ 0)
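/*
 * Minimal usage sketch for IN_LOOKUP_MULTI (not part of the original
 * header): check whether the all-hosts group 224.0.0.1 has been joined
 * on a given interface.
 */
#if 0
struct in_multi *inm;
struct in_addr allhosts;

allhosts.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
NET_LOCK();
IN_LOOKUP_MULTI(allhosts, ifp, inm);
NET_UNLOCK();
if (inm != NULL)
printf("%s: all-hosts group joined\n", ifp->if_xname);
#endif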
int in_ifinit(struct ifnet *,
struct in_ifaddr *, struct sockaddr_in *, int);
struct in_multi *in_addmulti(struct in_addr *, struct ifnet *);
void in_delmulti(struct in_multi *);
int in_hasmulti(struct in_addr *, struct ifnet *);
void in_ifscrub(struct ifnet *, struct in_ifaddr *);
int in_control(struct socket *, u_long, caddr_t, struct ifnet *);
int in_ioctl(u_long, caddr_t, struct ifnet *, int);
void in_prefixlen2mask(struct in_addr *, int);
#endif
#endif /* _NETINET_IN_VAR_H_ */
/* $OpenBSD: vfs_getcwd.c,v 1.37 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: vfs_getcwd.c,v 1.3.2.3 1999/07/11 10:24:09 sommerfeld Exp $ */
/*
* Copyright (c) 1999 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Bill Sommerfeld.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/stat.h>
#include <sys/lock.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/ktrace.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/dirent.h>
#include <ufs/ufs/dir.h> /* only for DIRBLKSIZ */
#include <sys/syscallargs.h>
/* Find parent vnode of *lvpp, return in *uvpp */
int
vfs_getcwd_scandir(struct vnode **lvpp, struct vnode **uvpp, char **bpp,
char *bufp, struct proc *p)
{
int eofflag, tries, dirbuflen = 0, len, reclen, error = 0;
off_t off;
struct uio uio;
struct iovec iov;
char *dirbuf = NULL;
ino_t fileno;
struct vattr va;
struct vnode *uvp = NULL;
struct vnode *lvp = *lvpp;
struct componentname cn;
tries = 0;
/*
* If we want the filename, get some info we need while the
* current directory is still locked.
*/
if (bufp != NULL) {
error = VOP_GETATTR(lvp, &va, p->p_ucred, p);
if (error) {
vput(lvp);
*lvpp = NULL;
*uvpp = NULL;
return (error);
}
}
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY;
cn.cn_proc = p;
cn.cn_cred = p->p_ucred;
cn.cn_pnbuf = NULL;
cn.cn_nameptr = "..";
cn.cn_namelen = 2;
cn.cn_consume = 0;
/* Get parent vnode using lookup of '..' */
error = VOP_LOOKUP(lvp, uvpp, &cn);
if (error) {
vput(lvp);
*lvpp = NULL;
*uvpp = NULL;
return (error);
}
uvp = *uvpp;
/* If we don't care about the pathname, we're done */
if (bufp == NULL) {
error = 0;
goto out;
}
fileno = va.va_fileid;
dirbuflen = DIRBLKSIZ;
if (dirbuflen < va.va_blocksize)
dirbuflen = va.va_blocksize;
/* XXX we need some limit for fuse, 1 MB should be enough */
if (dirbuflen > 0xfffff) {
error = EINVAL;
goto out;
}
dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
off = 0;
do {
char *cpos;
struct dirent *dp;
iov.iov_base = dirbuf;
iov.iov_len = dirbuflen;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = off;
uio.uio_resid = dirbuflen;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_procp = p;
eofflag = 0;
/* Call VOP_READDIR of parent */
error = VOP_READDIR(uvp, &uio, p->p_ucred, &eofflag);
off = uio.uio_offset;
/* Try again if NFS tosses its cookies */
if (error == EINVAL && tries < 3) {
tries++;
off = 0;
continue;
} else if (error) {
goto out; /* Old userland getcwd() behaviour */
}
cpos = dirbuf;
tries = 0;
/* Scan directory page looking for matching vnode */
for (len = (dirbuflen - uio.uio_resid); len > 0;
len -= reclen) {
dp = (struct dirent *)cpos;
reclen = dp->d_reclen;
/* Check for malformed directory */
if (reclen < DIRENT_RECSIZE(1) || reclen > len) {
error = EINVAL;
goto out;
}
if (dp->d_fileno == fileno) {
char *bp = *bpp;
if (offsetof(struct dirent, d_name) +
dp->d_namlen > reclen) {
error = EINVAL;
goto out;
}
bp -= dp->d_namlen;
if (bp <= bufp) {
error = ERANGE;
goto out;
}
memmove(bp, dp->d_name, dp->d_namlen);
error = 0;
*bpp = bp;
goto out;
}
cpos += reclen;
}
} while (!eofflag);
error = ENOENT;
out:
vrele(lvp);
*lvpp = NULL;
free(dirbuf, M_TEMP, dirbuflen);
return (error);
}
/* Do a lookup in the vnode-to-name reverse cache */
int
vfs_getcwd_getcache(struct vnode **lvpp, struct vnode **uvpp, char **bpp,
char *bufp)
{
struct vnode *lvp, *uvp = NULL;
char *obp;
int error, vpid;
lvp = *lvpp;
obp = *bpp; /* Save original position to restore to on error */
error = cache_revlookup(lvp, uvpp, bpp, bufp);
if (error) {
if (error != -1) {
vput(lvp);
*lvpp = NULL;
*uvpp = NULL;
}
return (error);
}
uvp = *uvpp;
vpid = uvp->v_id;
/* Release current lock before acquiring the parent lock */
VOP_UNLOCK(lvp);
error = vget(uvp, LK_EXCLUSIVE | LK_RETRY);
if (error)
*uvpp = NULL;
/*
* Verify that vget() succeeded, and check that vnode capability
* didn't change while we were waiting for the lock.
*/
if (error || (vpid != uvp->v_id)) {
/*
* Try to get our lock back. If that works, tell the caller to
* try things the hard way, otherwise give up.
*/
if (!error)
vput(uvp);
*uvpp = NULL;
error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
if (!error) {
*bpp = obp; /* restore the buffer */
return (-1);
}
}
vrele(lvp);
*lvpp = NULL;
return (error);
}
/* Common routine shared by sys___getcwd(), vn_isunder() and sys___realpath() */
int
vfs_getcwd_common(struct vnode *lvp, struct vnode *rvp, char **bpp, char *bufp,
int limit, int flags, struct proc *p)
{
struct filedesc *fdp = p->p_fd;
struct vnode *uvp = NULL;
char *bp = NULL;
int error, perms = VEXEC;
if (rvp == NULL) {
rvp = fdp->fd_rdir;
if (rvp == NULL)
rvp = rootvnode;
}
vref(rvp);
vref(lvp);
error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
if (error) {
vrele(lvp);
lvp = NULL;
goto out;
}
if (bufp)
bp = *bpp;
if (lvp == rvp) {
if (bp)
*(--bp) = '/';
goto out;
}
/*
* This loop will terminate when we hit the root, VOP_READDIR() or
* VOP_LOOKUP() fails, or we run out of space in the user buffer.
*/
do {
if (lvp->v_type != VDIR) {
error = ENOTDIR;
goto out;
}
/* Check for access if caller cares */
if (flags & GETCWD_CHECK_ACCESS) {
error = VOP_ACCESS(lvp, perms, p->p_ucred, p);
if (error)
goto out;
perms = VEXEC|VREAD;
}
/* Step up if we're a covered vnode */
while (lvp->v_flag & VROOT) {
struct vnode *tvp;
if (lvp == rvp)
goto out;
tvp = lvp;
lvp = lvp->v_mount->mnt_vnodecovered;
vput(tvp);
if (lvp == NULL) {
error = ENOENT;
goto out;
}
vref(lvp);
error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
if (error) {
vrele(lvp);
lvp = NULL;
goto out;
}
}
/* Look in the name cache */
error = vfs_getcwd_getcache(&lvp, &uvp, &bp, bufp);
if (error == -1) {
/* If that fails, look in the directory */
error = vfs_getcwd_scandir(&lvp, &uvp, &bp, bufp, p);
}
if (error)
goto out;
#ifdef DIAGNOSTIC
if (lvp != NULL)
panic("getcwd: oops, forgot to null lvp");
if (bufp && (bp <= bufp)) {
panic("getcwd: oops, went back too far");
}
#endif
if (bp)
*(--bp) = '/';
lvp = uvp;
uvp = NULL;
limit--;
} while ((lvp != rvp) && (limit > 0));
out:
if (bpp)
*bpp = bp;
if (uvp)
vput(uvp);
if (lvp)
vput(lvp);
vrele(rvp);
return (error);
}
/* Find pathname of a process's current directory */
int
sys___getcwd(struct proc *p, void *v, register_t *retval)
{
struct sys___getcwd_args *uap = v;
int error, len = SCARG(uap, len);
char *path, *bp;
if (len > MAXPATHLEN * 4)
len = MAXPATHLEN * 4;
else if (len < 2)
return (ERANGE);
path = malloc(len, M_TEMP, M_WAITOK);
bp = &path[len - 1];
*bp = '\0';
/*
* 5th argument here is "max number of vnodes to traverse".
* Since each entry takes up at least 2 bytes in the output
* buffer, limit it to N/2 vnodes for an N byte buffer.
*/
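/*
 * For example, with the clamp above the largest buffer is MAXPATHLEN * 4
 * bytes, so at most MAXPATHLEN * 2 vnodes will be traversed.
 */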
error = vfs_getcwd_common(p->p_fd->fd_cdir, NULL, &bp, path, len/2,
GETCWD_CHECK_ACCESS, p);
if (error)
goto out;
/* Put the result into user buffer */
error = copyoutstr(bp, SCARG(uap, buf), MAXPATHLEN, NULL);
#ifdef KTRACE
if (KTRPOINT(p, KTR_NAMEI))
ktrnamei(p, bp);
#endif
out:
free(path, M_TEMP, len);
return (error);
}
/* $OpenBSD: nd6.c,v 1.246 2022/08/09 21:10:03 kn Exp $ */
/* $KAME: nd6.c,v 1.280 2002/06/08 19:52:07 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/timeout.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/stdint.h>
#include <sys/task.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ip_ipsp.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/icmp6.h>
#define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
#define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
/* timer values */
time_t nd6_timer_next = -1; /* at which uptime nd6_timer runs */
time_t nd6_expire_next = -1; /* at which uptime nd6_expire runs */
int nd6_delay = 5; /* delay first probe time 5 seconds */
int nd6_umaxtries = 3; /* maximum unicast query */
int nd6_mmaxtries = 3; /* maximum multicast query */
int nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */
/* preventing too many loops in ND option parsing */
int nd6_maxndopt = 10; /* max # of ND options allowed */
int nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */
#ifdef ND6_DEBUG
int nd6_debug = 1;
#else
int nd6_debug = 0;
#endif
TAILQ_HEAD(llinfo_nd6_head, llinfo_nd6) nd6_list;
struct pool nd6_pool; /* pool for llinfo_nd6 structures */
int nd6_inuse;
void nd6_timer(void *);
void nd6_slowtimo(void *);
void nd6_expire(void *);
void nd6_expire_timer(void *);
void nd6_invalidate(struct rtentry *);
void nd6_free(struct rtentry *);
int nd6_llinfo_timer(struct rtentry *);
struct timeout nd6_timer_to;
struct timeout nd6_slowtimo_ch;
struct timeout nd6_expire_timeout;
struct task nd6_expire_task;
void
nd6_init(void)
{
static int nd6_init_done = 0;
if (nd6_init_done) {
log(LOG_NOTICE, "%s called more than once\n", __func__);
return;
}
TAILQ_INIT(&nd6_list);
pool_init(&nd6_pool, sizeof(struct llinfo_nd6), 0,
IPL_SOFTNET, 0, "nd6", NULL);
task_set(&nd6_expire_task, nd6_expire, NULL);
nd6_init_done = 1;
/* start timer */
timeout_set_proc(&nd6_timer_to, nd6_timer, NULL);
timeout_set_proc(&nd6_slowtimo_ch, nd6_slowtimo, NULL);
timeout_add_sec(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL);
timeout_set(&nd6_expire_timeout, nd6_expire_timer, NULL);
}
struct nd_ifinfo *
nd6_ifattach(struct ifnet *ifp)
{
struct nd_ifinfo *nd;
nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO);
nd->initialized = 1;
nd->basereachable = REACHABLE_TIME;
nd->reachable = ND_COMPUTE_RTIME(nd->basereachable);
nd->retrans = RETRANS_TIMER;
return nd;
}
void
nd6_ifdetach(struct nd_ifinfo *nd)
{
free(nd, M_IP6NDP, sizeof(*nd));
}
void
nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
{
bzero(ndopts, sizeof(*ndopts));
ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
ndopts->nd_opts_last
= (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len);
if (icmp6len == 0) {
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
}
/*
* Take one ND option.
*/
struct nd_opt_hdr *
nd6_option(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int olen;
if (!ndopts)
panic("%s: ndopts == NULL", __func__);
if (!ndopts->nd_opts_last)
panic("%s: uninitialized ndopts", __func__);
if (!ndopts->nd_opts_search)
return NULL;
if (ndopts->nd_opts_done)
return NULL;
nd_opt = ndopts->nd_opts_search;
/* make sure nd_opt_len is inside the buffer */
if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) {
bzero(ndopts, sizeof(*ndopts));
return NULL;
}
olen = nd_opt->nd_opt_len << 3;
if (olen == 0) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
bzero(ndopts, sizeof(*ndopts));
return NULL;
}
ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen);
if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
/* option overruns the end of buffer, invalid */
bzero(ndopts, sizeof(*ndopts));
return NULL;
} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
/* reached the end of options chain */
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
return nd_opt;
}
/*
* Parse multiple ND options.
* This function is much easier to use for ND routines that do not need
* multiple options of the same type.
*/
int
nd6_options(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int i = 0;
if (!ndopts)
panic("%s: ndopts == NULL", __func__);
if (!ndopts->nd_opts_last)
panic("%s: uninitialized ndopts", __func__);
if (!ndopts->nd_opts_search)
return 0;
while (1) {
nd_opt = nd6_option(ndopts);
if (!nd_opt && !ndopts->nd_opts_last) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
icmp6stat_inc(icp6s_nd_badopt);
bzero(ndopts, sizeof(*ndopts));
return -1;
}
if (!nd_opt)
goto skip1;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LINKADDR:
case ND_OPT_TARGET_LINKADDR:
case ND_OPT_MTU:
case ND_OPT_REDIRECTED_HEADER:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
nd6log((LOG_INFO,
"duplicated ND6 option found (type=%d)\n",
nd_opt->nd_opt_type));
/* XXX bark? */
} else {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
break;
case ND_OPT_PREFIX_INFORMATION:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
ndopts->nd_opts_pi_end =
(struct nd_opt_prefix_info *)nd_opt;
break;
case ND_OPT_DNSSL:
case ND_OPT_RDNSS:
/* Don't warn */
break;
default:
/*
* Unknown options must be silently ignored,
* to accommodate future extension to the protocol.
*/
nd6log((LOG_DEBUG,
"nd6_options: unsupported option %d - "
"option ignored\n", nd_opt->nd_opt_type));
}
skip1:
i++;
if (i > nd6_maxndopt) {
icmp6stat_inc(icp6s_nd_toomanyopt);
nd6log((LOG_INFO, "too many loops in nd options\n"));
break;
}
if (ndopts->nd_opts_done)
break;
}
return 0;
}
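/*
 * Minimal usage sketch (not from the original source; "first_opt" and
 * "icmp6len" are placeholder names): callers initialize the parse state
 * with nd6_option_init() and then let nd6_options() walk all options at
 * once.
 */
#if 0
union nd_opts ndopts;

nd6_option_init(first_opt, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0)
return;	/* malformed options; the stat counter was already bumped */
#endif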
/*
* ND6 timer routine to handle ND6 entries
*/
void
nd6_llinfo_settimer(const struct llinfo_nd6 *ln, unsigned int secs)
{
time_t expire = getuptime() + secs;
NET_ASSERT_LOCKED();
KASSERT(!ISSET(ln->ln_rt->rt_flags, RTF_LOCAL));
ln->ln_rt->rt_expire = expire;
if (!timeout_pending(&nd6_timer_to) || expire < nd6_timer_next) {
nd6_timer_next = expire;
timeout_add_sec(&nd6_timer_to, secs);
}
}
void
nd6_timer(void *unused)
{
struct llinfo_nd6 *ln, *nln;
time_t expire = getuptime() + nd6_gctimer;
int secs;
NET_LOCK();
TAILQ_FOREACH_SAFE(ln, &nd6_list, ln_list, nln) {
struct rtentry *rt = ln->ln_rt;
if (rt->rt_expire && rt->rt_expire <= getuptime())
if (nd6_llinfo_timer(rt))
continue;
if (rt->rt_expire && rt->rt_expire < expire)
expire = rt->rt_expire;
}
secs = expire - getuptime();
if (secs < 0)
secs = 0;
if (!TAILQ_EMPTY(&nd6_list)) {
nd6_timer_next = getuptime() + secs;
timeout_add_sec(&nd6_timer_to, secs);
}
NET_UNLOCK();
}
/*
* ND timer state handling.
*
* Returns 1 if `rt' should no longer be used, 0 otherwise.
*/
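/*
 * Usage sketch, mirroring the caller in nd6_timer() above:
 */
#if 0
if (rt->rt_expire && rt->rt_expire <= getuptime())
if (nd6_llinfo_timer(rt))
continue;	/* rt must not be used any further */
#endif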
int
nd6_llinfo_timer(struct rtentry *rt)
{
struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo;
struct sockaddr_in6 *dst = satosin6(rt_key(rt));
struct ifnet *ifp;
struct nd_ifinfo *ndi = NULL;
NET_ASSERT_LOCKED();
if ((ifp = if_get(rt->rt_ifidx)) == NULL)
return 1;
ndi = ND_IFINFO(ifp);
switch (ln->ln_state) {
case ND6_LLINFO_INCOMPLETE:
if (ln->ln_asked < nd6_mmaxtries) {
ln->ln_asked++;
nd6_llinfo_settimer(ln, ndi->retrans / 1000);
nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0);
} else {
struct mbuf *m = ln->ln_hold;
if (m) {
ln->ln_hold = NULL;
/*
* Fake rcvif to make the ICMP error
* more helpful to the receiver in
* diagnosing the failure.
* XXX: should we consider
* older rcvif?
*/
m->m_pkthdr.ph_ifidx = rt->rt_ifidx;
icmp6_error(m, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADDR, 0);
if (ln->ln_hold == m) {
/* m is back in ln_hold. Discard. */
m_freem(ln->ln_hold);
ln->ln_hold = NULL;
}
}
nd6_free(rt);
ln = NULL;
}
break;
case ND6_LLINFO_REACHABLE:
if (!ND6_LLINFO_PERMANENT(ln)) {
ln->ln_state = ND6_LLINFO_STALE;
nd6_llinfo_settimer(ln, nd6_gctimer);
}
break;
case ND6_LLINFO_STALE:
case ND6_LLINFO_PURGE:
/* Garbage Collection (RFC 2461 5.3) */
if (!ND6_LLINFO_PERMANENT(ln)) {
nd6_free(rt);
ln = NULL;
}
break;
case ND6_LLINFO_DELAY:
if (ndi) {
/* We need NUD */
ln->ln_asked = 1;
ln->ln_state = ND6_LLINFO_PROBE;
nd6_llinfo_settimer(ln, ndi->retrans / 1000);
nd6_ns_output(ifp, &dst->sin6_addr,
&dst->sin6_addr, ln, 0);
}
break;
case ND6_LLINFO_PROBE:
if (ln->ln_asked < nd6_umaxtries) {
ln->ln_asked++;
nd6_llinfo_settimer(ln, ndi->retrans / 1000);
nd6_ns_output(ifp, &dst->sin6_addr,
&dst->sin6_addr, ln, 0);
} else {
nd6_free(rt);
ln = NULL;
}
break;
}
if_put(ifp);
return (ln == NULL);
}
void
nd6_expire_timer_update(struct in6_ifaddr *ia6)
{
time_t expire_time = INT64_MAX;
int secs;
KERNEL_ASSERT_LOCKED();
if (ia6->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME)
expire_time = ia6->ia6_lifetime.ia6t_expire;
if (!(ia6->ia6_flags & IN6_IFF_DEPRECATED) &&
ia6->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME &&
expire_time > ia6->ia6_lifetime.ia6t_preferred)
expire_time = ia6->ia6_lifetime.ia6t_preferred;
if (expire_time == INT64_MAX)
return;
/*
* IFA6_IS_INVALID() and IFA6_IS_DEPRECATED() check for uptime
* greater than ia6t_expire or ia6t_preferred, not greater or equal.
* Schedule timeout one second later so that either IFA6_IS_INVALID()
* or IFA6_IS_DEPRECATED() is true.
*/
expire_time++;
if (!timeout_pending(&nd6_expire_timeout) ||
nd6_expire_next > expire_time) {
secs = expire_time - getuptime();
if (secs < 0)
secs = 0;
timeout_add_sec(&nd6_expire_timeout, secs);
nd6_expire_next = expire_time;
}
}
/*
* Expire interface addresses.
*/
void
nd6_expire(void *unused)
{
struct ifnet *ifp;
KERNEL_LOCK();
NET_LOCK();
TAILQ_FOREACH(ifp, &ifnet, if_list) {
struct ifaddr *ifa, *nifa;
struct in6_ifaddr *ia6;
TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrlist, ifa_list, nifa) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia6 = ifatoia6(ifa);
/* check address lifetime */
if (IFA6_IS_INVALID(ia6)) {
in6_purgeaddr(&ia6->ia_ifa);
} else {
if (IFA6_IS_DEPRECATED(ia6))
ia6->ia6_flags |= IN6_IFF_DEPRECATED;
nd6_expire_timer_update(ia6);
}
}
}
NET_UNLOCK();
KERNEL_UNLOCK();
}
void
nd6_expire_timer(void *unused)
{
task_add(net_tq(0), &nd6_expire_task);
}
/*
* Nuke neighbor cache/prefix/default router management table, right before
* ifp goes away.
*/
void
nd6_purge(struct ifnet *ifp)
{
struct llinfo_nd6 *ln, *nln;
NET_ASSERT_LOCKED();
/*
* Nuke neighbor cache entries for the ifp.
*/
TAILQ_FOREACH_SAFE(ln, &nd6_list, ln_list, nln) {
struct rtentry *rt;
struct sockaddr_dl *sdl;
rt = ln->ln_rt;
if (rt != NULL && rt->rt_gateway != NULL &&
rt->rt_gateway->sa_family == AF_LINK) {
sdl = satosdl(rt->rt_gateway);
if (sdl->sdl_index == ifp->if_index)
nd6_free(rt);
}
}
}
struct rtentry *
nd6_lookup(const struct in6_addr *addr6, int create, struct ifnet *ifp,
u_int rtableid)
{
struct rtentry *rt;
struct sockaddr_in6 sin6;
int flags;
bzero(&sin6, sizeof(sin6));
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = *addr6;
flags = (create) ? RT_RESOLVE : 0;
rt = rtalloc(sin6tosa(&sin6), flags, rtableid);
if (rt != NULL && (rt->rt_flags & RTF_LLINFO) == 0) {
/*
* This is the case for the default route.
* If we want to create a neighbor cache for the address, we
* should free the route for the destination and allocate an
* interface route.
*/
if (create) {
rtfree(rt);
rt = NULL;
}
}
if (rt == NULL) {
if (create && ifp) {
struct rt_addrinfo info;
struct ifaddr *ifa;
int error;
/*
* If no route is available and create is set,
* we allocate a host route for the destination
* and treat it like an interface route.
* This hack is necessary for a neighbor which can't
* be covered by our own prefix.
*/
ifa = ifaof_ifpforaddr(sin6tosa(&sin6), ifp);
if (ifa == NULL)
return (NULL);
/*
* Create a new route. RTF_LLINFO is necessary
* to create a Neighbor Cache entry for the
* destination in nd6_rtrequest which will be
* called in rtrequest.
*/
bzero(&info, sizeof(info));
info.rti_ifa = ifa;
info.rti_flags = RTF_HOST | RTF_LLINFO;
info.rti_info[RTAX_DST] = sin6tosa(&sin6);
info.rti_info[RTAX_GATEWAY] = sdltosa(ifp->if_sadl);
error = rtrequest(RTM_ADD, &info, RTP_CONNECTED, &rt,
rtableid);
if (error)
return (NULL);
if (rt->rt_llinfo != NULL) {
struct llinfo_nd6 *ln =
(struct llinfo_nd6 *)rt->rt_llinfo;
ln->ln_state = ND6_LLINFO_NOSTATE;
}
} else
return (NULL);
}
/*
* Validation for the entry.
* Note that the check for rt_llinfo is necessary because a cloned
* route from a parent route that has the L flag (e.g. the default
* route to a p2p interface) may have the flag, too, while the
* destination is not actually a neighbor.
*/
if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 ||
rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL ||
(ifp != NULL && rt->rt_ifidx != ifp->if_index)) {
if (create) {
char addr[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG, "%s: failed to lookup %s (if=%s)\n",
__func__,
inet_ntop(AF_INET6, addr6, addr, sizeof(addr)),
ifp ? ifp->if_xname : "unspec"));
}
rtfree(rt);
return (NULL);
}
return (rt);
}
/*
* Detect if a given IPv6 address identifies a neighbor on a given link.
* XXX: should take care of the destination of a p2p link?
*/
int
nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
{
struct in6_ifaddr *ia6;
struct ifaddr *ifa;
struct rtentry *rt;
/*
* A link-local address is always a neighbor.
* XXX: we should use the sin6_scope_id field rather than the embedded
* interface index.
* XXX: a link does not necessarily specify a single interface.
*/
if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr) &&
ntohs(*(u_int16_t *)&addr->sin6_addr.s6_addr[2]) == ifp->if_index)
return (1);
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia6 = ifatoia6(ifa);
/* Prefix check down below. */
if (ia6->ia6_flags & IN6_IFF_AUTOCONF)
continue;
if (IN6_ARE_MASKED_ADDR_EQUAL(&addr->sin6_addr,
&ia6->ia_addr.sin6_addr,
&ia6->ia_prefixmask.sin6_addr))
return (1);
}
/*
* Even if the address matches none of our addresses, it might be
* in the neighbor cache.
*/
rt = nd6_lookup(&addr->sin6_addr, 0, ifp, ifp->if_rdomain);
if (rt != NULL) {
rtfree(rt);
return (1);
}
return (0);
}
void
nd6_invalidate(struct rtentry *rt)
{
struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo;
struct sockaddr_dl *sdl = satosdl(rt->rt_gateway);
m_freem(ln->ln_hold);
sdl->sdl_alen = 0;
ln->ln_hold = NULL;
ln->ln_state = ND6_LLINFO_INCOMPLETE;
ln->ln_asked = 0;
}
/*
* Free an nd6 llinfo entry.
*/
void
nd6_free(struct rtentry *rt)
{
struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo;
struct in6_addr in6 = satosin6(rt_key(rt))->sin6_addr;
struct ifnet *ifp;
NET_ASSERT_LOCKED();
ifp = if_get(rt->rt_ifidx);
if (!ip6_forwarding) {
if (ln->ln_router) {
/*
* rt6_flush must be called whether or not the neighbor
* is in the Default Router List.
* See a corresponding comment in nd6_na_input().
*/
rt6_flush(&in6, ifp);
}
}
KASSERT(!ISSET(rt->rt_flags, RTF_LOCAL));
nd6_invalidate(rt);
/*
* Detach the route from the routing tree and the list of neighbor
* caches, and disable the route entry not to be used in already
* cached routes.
*/
if (!ISSET(rt->rt_flags, RTF_STATIC|RTF_CACHED))
rtdeletemsg(rt, ifp, ifp->if_rdomain);
if_put(ifp);
}
/*
* Upper-layer reachability hint for Neighbor Unreachability Detection.
*
* XXX cost-effective methods?
*/
void
nd6_nud_hint(struct rtentry *rt)
{
struct llinfo_nd6 *ln;
struct ifnet *ifp;
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL)
return;
if ((rt->rt_flags & RTF_GATEWAY) != 0 ||
(rt->rt_flags & RTF_LLINFO) == 0 ||
rt->rt_llinfo == NULL || rt->rt_gateway == NULL ||
rt->rt_gateway->sa_family != AF_LINK) {
/* This is not a host route. */
goto out;
}
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
if (ln->ln_state < ND6_LLINFO_REACHABLE)
goto out;
/*
* if we get upper-layer reachability confirmation many times,
* it is possible we have false information.
*/
ln->ln_byhint++;
if (ln->ln_byhint > nd6_maxnudhint)
goto out;
ln->ln_state = ND6_LLINFO_REACHABLE;
if (!ND6_LLINFO_PERMANENT(ln))
nd6_llinfo_settimer(ln, ND_IFINFO(ifp)->reachable);
out:
if_put(ifp);
}
void
nd6_rtrequest(struct ifnet *ifp, int req, struct rtentry *rt)
{
struct sockaddr *gate = rt->rt_gateway;
struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo;
struct ifaddr *ifa;
struct in6_ifaddr *ifa6;
if (ISSET(rt->rt_flags, RTF_GATEWAY|RTF_MULTICAST|RTF_MPLS))
return;
if (nd6_need_cache(ifp) == 0 && (rt->rt_flags & RTF_HOST) == 0) {
/*
* This is probably an interface direct route for a link
* which does not need neighbor caches (e.g. fe80::%lo0/64).
* We do not need special treatment below for such a route.
* Moreover, the RTF_LLINFO flag which would be set below
* would annoy the ndp(8) command.
*/
return;
}
if (req == RTM_RESOLVE && nd6_need_cache(ifp) == 0) {
/*
* For routing daemons like ospf6d we allow neighbor discovery
* based on the cloning route only. This allows us to send
* packets directly into a network without having an address
* with matching prefix on the interface. If the cloning
* route is used for an stf interface, we would mistakenly
* make a neighbor cache for the host route, and would see
* strange neighbor solicitation for the corresponding
* destination. In order to avoid confusion, we check if the
* interface is suitable for neighbor discovery, and stop the
* process if not. Additionally, we remove the LLINFO flag
* so that ndp(8) will not try to get the neighbor information
* of the destination.
*/
rt->rt_flags &= ~RTF_LLINFO;
return;
}
switch (req) {
case RTM_ADD:
if ((rt->rt_flags & RTF_CLONING) ||
((rt->rt_flags & (RTF_LLINFO | RTF_LOCAL)) && ln == NULL)) {
if (ln != NULL)
nd6_llinfo_settimer(ln, 0);
if ((rt->rt_flags & RTF_CLONING) != 0)
break;
}
/*
* In IPv4 code, we try to announce new RTF_ANNOUNCE entry here.
* We don't do that here since llinfo is not ready yet.
*
* There are also a couple of other things to be discussed:
* - unsolicited NA code needs improvement beforehand
* - RFC2461 says we MAY send multicast unsolicited NA
* (7.2.6 paragraph 4), however, it also says that we
* SHOULD provide a mechanism to prevent multicast NA storm.
* we don't have anything like it right now.
* note that the mechanism needs a mutual agreement
* between proxies, which means that we need to implement
* a new protocol, or a new kludge.
* - from RFC2461 6.2.4, host MUST NOT send an unsolicited NA.
* we need to check ip6forwarding before sending it.
* (or should we allow proxy ND configuration only for
* routers? there's no mention about proxy ND from hosts)
*/
#if 0
/* XXX it does not work */
if (rt->rt_flags & RTF_ANNOUNCE)
nd6_na_output(ifp,
&satosin6(rt_key(rt))->sin6_addr,
&satosin6(rt_key(rt))->sin6_addr,
ip6_forwarding ? ND_NA_FLAG_ROUTER : 0,
1, NULL);
#endif
/* FALLTHROUGH */
case RTM_RESOLVE:
if (gate->sa_family != AF_LINK ||
gate->sa_len < sizeof(struct sockaddr_dl)) {
log(LOG_DEBUG, "%s: bad gateway value: %s\n",
__func__, ifp->if_xname);
break;
}
satosdl(gate)->sdl_type = ifp->if_type;
satosdl(gate)->sdl_index = ifp->if_index;
if (ln != NULL)
break; /* This happens on a route change */
/*
* Case 2: This route may come from cloning, or a manual route
* add with a LL address.
*/
ln = pool_get(&nd6_pool, PR_NOWAIT | PR_ZERO);
rt->rt_llinfo = (caddr_t)ln;
if (ln == NULL) {
log(LOG_DEBUG, "%s: pool get failed\n", __func__);
break;
}
nd6_inuse++;
ln->ln_rt = rt;
/* this is required for "ndp" command. - shin */
if (req == RTM_ADD) {
/*
* gate should have some valid AF_LINK entry,
* and ln expire should have some lifetime
* which is specified by ndp command.
*/
ln->ln_state = ND6_LLINFO_REACHABLE;
ln->ln_byhint = 0;
} else {
/*
* When req == RTM_RESOLVE, rt is created and
* initialized in rtrequest(), so rt_expire is 0.
*/
ln->ln_state = ND6_LLINFO_NOSTATE;
nd6_llinfo_settimer(ln, 0);
}
rt->rt_flags |= RTF_LLINFO;
TAILQ_INSERT_HEAD(&nd6_list, ln, ln_list);
/*
* If we have too many cache entries, initiate immediate
* purging for some "less recently used" entries. Note that
* we cannot directly call nd6_free() here because it would
* cause re-entering rtable related routines triggering an LOR
* problem for FreeBSD.
*/
if (ip6_neighborgcthresh >= 0 &&
nd6_inuse >= ip6_neighborgcthresh) {
int i;
for (i = 0; i < 10; i++) {
struct llinfo_nd6 *ln_end;
ln_end = TAILQ_LAST(&nd6_list, llinfo_nd6_head);
if (ln_end == ln)
break;
/* Move this entry to the head */
TAILQ_REMOVE(&nd6_list, ln_end, ln_list);
TAILQ_INSERT_HEAD(&nd6_list, ln_end, ln_list);
if (ND6_LLINFO_PERMANENT(ln_end))
continue;
if (ln_end->ln_state > ND6_LLINFO_INCOMPLETE)
ln_end->ln_state = ND6_LLINFO_STALE;
else
ln_end->ln_state = ND6_LLINFO_PURGE;
nd6_llinfo_settimer(ln_end, 0);
}
}
/*
* check if rt_key(rt) is one of my address assigned
* to the interface.
*/
ifa6 = in6ifa_ifpwithaddr(ifp,
&satosin6(rt_key(rt))->sin6_addr);
ifa = ifa6 ? &ifa6->ia_ifa : NULL;
if (ifa) {
ln->ln_state = ND6_LLINFO_REACHABLE;
ln->ln_byhint = 0;
rt->rt_expire = 0;
KASSERT(ifa == rt->rt_ifa);
} else if (rt->rt_flags & RTF_ANNOUNCE) {
ln->ln_state = ND6_LLINFO_REACHABLE;
ln->ln_byhint = 0;
rt->rt_expire = 0;
/* join solicited node multicast for proxy ND */
if (ifp->if_flags & IFF_MULTICAST) {
struct in6_addr llsol;
int error;
llsol = satosin6(rt_key(rt))->sin6_addr;
llsol.s6_addr16[0] = htons(0xff02);
llsol.s6_addr16[1] = htons(ifp->if_index);
llsol.s6_addr32[1] = 0;
llsol.s6_addr32[2] = htonl(1);
llsol.s6_addr8[12] = 0xff;
if (in6_addmulti(&llsol, ifp, &error)) {
char addr[INET6_ADDRSTRLEN];
nd6log((LOG_ERR, "%s: failed to join "
"%s (errno=%d)\n", ifp->if_xname,
inet_ntop(AF_INET6, &llsol,
addr, sizeof(addr)),
error));
}
}
}
break;
case RTM_DELETE:
if (ln == NULL)
break;
/* leave from solicited node multicast for proxy ND */
if ((rt->rt_flags & RTF_ANNOUNCE) != 0 &&
(ifp->if_flags & IFF_MULTICAST) != 0) {
struct in6_addr llsol;
struct in6_multi *in6m;
llsol = satosin6(rt_key(rt))->sin6_addr;
llsol.s6_addr16[0] = htons(0xff02);
llsol.s6_addr16[1] = htons(ifp->if_index);
llsol.s6_addr32[1] = 0;
llsol.s6_addr32[2] = htonl(1);
llsol.s6_addr8[12] = 0xff;
IN6_LOOKUP_MULTI(llsol, ifp, in6m);
if (in6m)
in6_delmulti(in6m);
}
nd6_inuse--;
TAILQ_REMOVE(&nd6_list, ln, ln_list);
rt->rt_expire = 0;
rt->rt_llinfo = NULL;
rt->rt_flags &= ~RTF_LLINFO;
m_freem(ln->ln_hold);
pool_put(&nd6_pool, ln);
break;
case RTM_INVALIDATE:
if (ln == NULL)
break;
if (!ISSET(rt->rt_flags, RTF_LOCAL))
nd6_invalidate(rt);
break;
}
}
int
nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
{
struct in6_ndireq *ndi = (struct in6_ndireq *)data;
struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
struct rtentry *rt;
switch (cmd) {
case SIOCGIFINFO_IN6:
NET_LOCK_SHARED();
ndi->ndi = *ND_IFINFO(ifp);
NET_UNLOCK_SHARED();
return (0);
case SIOCGNBRINFO_IN6:
{
struct llinfo_nd6 *ln;
struct in6_addr nb_addr = nbi->addr; /* make local for safety */
time_t expire;
NET_LOCK_SHARED();
/*
* XXX: KAME specific hack for scoped addresses
* XXXX: for other scopes than link-local?
*/
if (IN6_IS_ADDR_LINKLOCAL(&nbi->addr) ||
IN6_IS_ADDR_MC_LINKLOCAL(&nbi->addr)) {
u_int16_t *idp = (u_int16_t *)&nb_addr.s6_addr[2];
if (*idp == 0)
*idp = htons(ifp->if_index);
}
rt = nd6_lookup(&nb_addr, 0, ifp, ifp->if_rdomain);
if (rt == NULL ||
(ln = (struct llinfo_nd6 *)rt->rt_llinfo) == NULL) {
rtfree(rt);
NET_UNLOCK_SHARED();
return (EINVAL);
}
expire = ln->ln_rt->rt_expire;
if (expire != 0) {
expire -= getuptime();
expire += gettime();
}
nbi->state = ln->ln_state;
nbi->asked = ln->ln_asked;
nbi->isrouter = ln->ln_router;
nbi->expire = expire;
rtfree(rt);
NET_UNLOCK_SHARED();
return (0);
}
}
return (0);
}
/*
* Create neighbor cache entry and cache link-layer address,
* on reception of inbound ND6 packets. (RS/RA/NS/redirect)
*
* type - ICMP6 type
* code - type dependent information
*/
void
nd6_cache_lladdr(struct ifnet *ifp, const struct in6_addr *from, char *lladdr,
int lladdrlen, int type, int code)
{
struct rtentry *rt = NULL;
struct llinfo_nd6 *ln = NULL;
int is_newentry;
struct sockaddr_dl *sdl = NULL;
int do_update;
int olladdr;
int llchange;
int newstate = 0;
if (!ifp)
panic("%s: ifp == NULL", __func__);
if (!from)
panic("%s: from == NULL", __func__);
/* nothing must be updated for unspecified address */
if (IN6_IS_ADDR_UNSPECIFIED(from))
return;
/*
* Validation about ifp->if_addrlen and lladdrlen must be done in
* the caller.
*
* XXX If the link does not have link-layer address, what should
* we do? (ifp->if_addrlen == 0)
* Spec says nothing in sections for RA, RS and NA. There's small
* description on it in NS section (RFC 2461 7.2.3).
*/
rt = nd6_lookup(from, 0, ifp, ifp->if_rdomain);
if (rt == NULL) {
rt = nd6_lookup(from, 1, ifp, ifp->if_rdomain);
is_newentry = 1;
} else {
/* do not overwrite local or static entry */
if (ISSET(rt->rt_flags, RTF_STATIC|RTF_LOCAL)) {
rtfree(rt);
return;
}
is_newentry = 0;
}
if (!rt)
return;
if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) {
fail:
nd6_free(rt);
rtfree(rt);
return;
}
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
if (ln == NULL)
goto fail;
if (rt->rt_gateway == NULL)
goto fail;
if (rt->rt_gateway->sa_family != AF_LINK)
goto fail;
sdl = satosdl(rt->rt_gateway);
olladdr = (sdl->sdl_alen) ? 1 : 0;
if (olladdr && lladdr) {
if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen))
llchange = 1;
else
llchange = 0;
} else
llchange = 0;
/*
* newentry olladdr lladdr llchange (*=record)
* 0 n n -- (1)
* 0 y n -- (2)
* 0 n y -- (3) * STALE
* 0 y y n (4) *
* 0 y y y (5) * STALE
* 1 -- n -- (6) NOSTATE(= PASSIVE)
* 1 -- y -- (7) * STALE
*/
if (llchange) {
char addr[INET6_ADDRSTRLEN];
log(LOG_INFO, "ndp info overwritten for %s by %s on %s\n",
inet_ntop(AF_INET6, from, addr, sizeof(addr)),
ether_sprintf(lladdr), ifp->if_xname);
}
if (lladdr) { /* (3-5) and (7) */
/*
* Record source link-layer address
* XXX is it dependent to ifp->if_type?
*/
sdl->sdl_alen = ifp->if_addrlen;
bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen);
}
if (!is_newentry) {
if ((!olladdr && lladdr) || /* (3) */
(olladdr && lladdr && llchange)) { /* (5) */
do_update = 1;
newstate = ND6_LLINFO_STALE;
} else /* (1-2,4) */
do_update = 0;
} else {
do_update = 1;
if (!lladdr) /* (6) */
newstate = ND6_LLINFO_NOSTATE;
else /* (7) */
newstate = ND6_LLINFO_STALE;
}
if (do_update) {
/*
* Update the state of the neighbor cache.
*/
ln->ln_state = newstate;
if (ln->ln_state == ND6_LLINFO_STALE) {
/*
* Since nd6_resolve() in ifp->if_output() will cause
* state transition to DELAY and reset the timer,
* we must set the timer now, although it is actually
* meaningless.
*/
nd6_llinfo_settimer(ln, nd6_gctimer);
if (ln->ln_hold) {
struct mbuf *n = ln->ln_hold;
ln->ln_hold = NULL;
/*
* we assume ifp is not a p2p interface here, so just
* set the 2nd argument as the 1st one.
*/
ifp->if_output(ifp, n, rt_key(rt), rt);
if (ln->ln_hold == n) {
/* n is back in ln_hold. Discard. */
m_freem(ln->ln_hold);
ln->ln_hold = NULL;
}
}
} else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
/* probe right away */
nd6_llinfo_settimer(ln, 0);
}
}
/*
* ICMP6 type dependent behavior.
*
* NS: clear IsRouter if new entry
* RS: clear IsRouter
* RA: set IsRouter if there's lladdr
* redir: clear IsRouter if new entry
*
* RA case, (1):
* The spec says that we must set IsRouter in the following cases:
* - If lladdr exist, set IsRouter. This means (1-5).
* - If it is old entry (!newentry), set IsRouter. This means (7).
* So, based on the spec, in (1-5) and (7) cases we must set IsRouter.
* A question arises for (1) case. (1) case has no lladdr in the
* neighbor cache, this is similar to (6).
* This case is rare but we figured that we MUST NOT set IsRouter.
*
* newentry olladdr lladdr llchange NS RS RA redir
* D R
* 0 n n -- (1) c ? s
* 0 y n -- (2) c s s
* 0 n y -- (3) c s s
* 0 y y n (4) c s s
* 0 y y y (5) c s s
* 1 -- n -- (6) c c c s
* 1 -- y -- (7) c c s c s
*
* (c=clear s=set)
*/
switch (type & 0xff) {
case ND_NEIGHBOR_SOLICIT:
/*
* New entry must have is_router flag cleared.
*/
if (is_newentry) /* (6-7) */
ln->ln_router = 0;
break;
case ND_REDIRECT:
/*
* If the icmp is a redirect to a better router, always set the
* is_router flag. Otherwise, if the entry is newly created,
* clear the flag. [RFC 2461, sec 8.3]
*/
if (code == ND_REDIRECT_ROUTER)
ln->ln_router = 1;
else if (is_newentry) /* (6-7) */
ln->ln_router = 0;
break;
case ND_ROUTER_SOLICIT:
/*
* is_router flag must always be cleared.
*/
ln->ln_router = 0;
break;
case ND_ROUTER_ADVERT:
/*
* Mark an entry with lladdr as a router.
*/
if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */
(is_newentry && lladdr)) { /* (7) */
ln->ln_router = 1;
}
break;
}
rtfree(rt);
}
void
nd6_slowtimo(void *ignored_arg)
{
struct nd_ifinfo *nd6if;
struct ifnet *ifp;
NET_LOCK();
timeout_add_sec(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL);
TAILQ_FOREACH(ifp, &ifnet, if_list) {
nd6if = ND_IFINFO(ifp);
if (nd6if->basereachable && /* already initialized */
(nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) {
/*
* Since reachable time rarely changes by router
* advertisements, we SHOULD ensure that a new random
* value gets recomputed at least once every few hours.
* (RFC 2461, 6.3.4)
*/
nd6if->recalctm = ND6_RECALC_REACHTM_INTERVAL;
nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable);
}
}
NET_UNLOCK();
}
int
nd6_resolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
struct sockaddr *dst, u_char *desten)
{
struct sockaddr_dl *sdl;
struct rtentry *rt;
struct llinfo_nd6 *ln = NULL;
if (m->m_flags & M_MCAST) {
ETHER_MAP_IPV6_MULTICAST(&satosin6(dst)->sin6_addr, desten);
return (0);
}
rt = rt_getll(rt0);
if (ISSET(rt->rt_flags, RTF_REJECT) &&
(rt->rt_expire == 0 || getuptime() < rt->rt_expire)) {
m_freem(m);
return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH);
}
/*
* Address resolution or Neighbor Unreachability Detection
* for the next hop.
* At this point, the destination of the packet must be a unicast
* or an anycast address(i.e. not a multicast).
*/
if (!ISSET(rt->rt_flags, RTF_LLINFO)) {
char addr[INET6_ADDRSTRLEN];
log(LOG_DEBUG, "%s: %s: route contains no ND information\n",
__func__, inet_ntop(AF_INET6,
&satosin6(rt_key(rt))->sin6_addr, addr, sizeof(addr)));
m_freem(m);
return (EINVAL);
}
if (rt->rt_gateway->sa_family != AF_LINK) {
printf("%s: something odd happens\n", __func__);
m_freem(m);
return (EINVAL);
}
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
KASSERT(ln != NULL);
/*
* Move this entry to the head of the queue so that it is less likely
* for this entry to be a target of forced garbage collection (see
* nd6_rtrequest()).
*/
TAILQ_REMOVE(&nd6_list, ln, ln_list);
TAILQ_INSERT_HEAD(&nd6_list, ln, ln_list);
/*
* The first time we send a packet to a neighbor whose entry is
* STALE, we have to change the state to DELAY and set a timer to
* expire in DELAY_FIRST_PROBE_TIME seconds, to ensure that we do
* neighbor unreachability detection on expiration.
* (RFC 2461 7.3.3)
*/
if (ln->ln_state == ND6_LLINFO_STALE) {
ln->ln_asked = 0;
ln->ln_state = ND6_LLINFO_DELAY;
nd6_llinfo_settimer(ln, nd6_delay);
}
/*
* If the neighbor cache entry has a state other than INCOMPLETE
* (i.e. its link-layer address is already resolved), just
* send the packet.
*/
if (ln->ln_state > ND6_LLINFO_INCOMPLETE) {
sdl = satosdl(rt->rt_gateway);
if (sdl->sdl_alen != ETHER_ADDR_LEN) {
char addr[INET6_ADDRSTRLEN];
log(LOG_DEBUG, "%s: %s: incorrect nd6 information\n",
__func__,
inet_ntop(AF_INET6, &satosin6(dst)->sin6_addr,
addr, sizeof(addr)));
m_freem(m);
return (EINVAL);
}
bcopy(LLADDR(sdl), desten, sdl->sdl_alen);
return (0);
}
/*
* There is a neighbor cache entry, but no ethernet address
* response yet. Replace the held mbuf (if any) with this
* latest one.
*/
if (ln->ln_state == ND6_LLINFO_NOSTATE)
ln->ln_state = ND6_LLINFO_INCOMPLETE;
m_freem(ln->ln_hold);
ln->ln_hold = m;
/*
* If there has been no NS for the neighbor after entering the
* INCOMPLETE state, send the first solicitation.
*/
if (!ND6_LLINFO_PERMANENT(ln) && ln->ln_asked == 0) {
ln->ln_asked++;
nd6_llinfo_settimer(ln, ND_IFINFO(ifp)->retrans / 1000);
nd6_ns_output(ifp, NULL, &satosin6(dst)->sin6_addr, ln, 0);
}
return (EAGAIN);
}
int
nd6_need_cache(struct ifnet *ifp)
{
/*
* RFC2893 says:
* - unidirectional tunnels needs no ND
*/
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_IEEE80211:
case IFT_CARP:
return (1);
default:
return (0);
}
}
/* $OpenBSD: kern_time.c,v 1.157 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_time.c,v 1.20 1996/02/18 11:57:06 fvdl Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_time.c 8.4 (Berkeley) 5/26/95
*/
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/ktrace.h>
#include <sys/signalvar.h>
#include <sys/stdint.h>
#include <sys/pledge.h>
#include <sys/task.h>
#include <sys/timeout.h>
#include <sys/timetc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <dev/clock_subr.h>
int itimerfix(struct itimerval *);
/*
* Time of day and interval timer support.
*
* These routines provide the kernel entry points to get and set
* the time-of-day and per-process interval timers. Subroutines
* here provide support for adding and subtracting timeval structures
* and decrementing interval timers, optionally reloading the interval
* timers when they expire.
*/
/* This function is used by clock_settime and settimeofday */
int
settime(const struct timespec *ts)
{
struct timespec now;
/*
* Don't allow the time to be set forward so far it will wrap
* and become negative, thus allowing an attacker to bypass
* the next check below. The cutoff is 1 year before rollover
* occurs, so even if the attacker uses adjtime(2) to move
* the time past the cutoff, it will take a very long time
* to get to the wrap point.
*
* XXX: we check against UINT_MAX until we can figure out
* how to deal with the hardware RTCs.
*/
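/*
 * Concretely: a 32-bit unsigned seconds counter wraps in February 2106,
 * so tv_sec values within roughly one year (365*24*60*60 seconds) of
 * UINT_MAX are rejected below.
 */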
if (ts->tv_sec > UINT_MAX - 365*24*60*60) {
printf("denied attempt to set clock forward to %lld\n",
(long long)ts->tv_sec);
return (EPERM);
}
/*
* If the system is secure, we do not allow the time to be
* set to an earlier value (it may be slowed using adjtime,
* but not set back). This feature prevents interlopers from
* setting arbitrary time stamps on files.
*/
nanotime(&now);
if (securelevel > 1 && timespeccmp(ts, &now, <=)) {
printf("denied attempt to set clock back %lld seconds\n",
(long long)now.tv_sec - ts->tv_sec);
return (EPERM);
}
tc_setrealtimeclock(ts);
KERNEL_LOCK();
resettodr();
KERNEL_UNLOCK();
return (0);
}
int
clock_gettime(struct proc *p, clockid_t clock_id, struct timespec *tp)
{
struct proc *q;
int error = 0;
switch (clock_id) {
case CLOCK_REALTIME:
nanotime(tp);
break;
case CLOCK_UPTIME:
nanoruntime(tp);
break;
case CLOCK_MONOTONIC:
case CLOCK_BOOTTIME:
nanouptime(tp);
break;
case CLOCK_PROCESS_CPUTIME_ID:
nanouptime(tp);
timespecsub(tp, &curcpu()->ci_schedstate.spc_runtime, tp);
timespecadd(tp, &p->p_p->ps_tu.tu_runtime, tp);
timespecadd(tp, &p->p_rtime, tp);
break;
case CLOCK_THREAD_CPUTIME_ID:
nanouptime(tp);
timespecsub(tp, &curcpu()->ci_schedstate.spc_runtime, tp);
timespecadd(tp, &p->p_tu.tu_runtime, tp);
timespecadd(tp, &p->p_rtime, tp);
break;
default:
/* check for clock from pthread_getcpuclockid() */
if (__CLOCK_TYPE(clock_id) == CLOCK_THREAD_CPUTIME_ID) {
KERNEL_LOCK();
q = tfind(__CLOCK_PTID(clock_id) - THREAD_PID_OFFSET);
if (q == NULL || q->p_p != p->p_p)
error = ESRCH;
else
*tp = q->p_tu.tu_runtime;
KERNEL_UNLOCK();
} else
error = EINVAL;
break;
}
return (error);
}
int
sys_clock_gettime(struct proc *p, void *v, register_t *retval)
{
struct sys_clock_gettime_args /* {
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
} */ *uap = v;
struct timespec ats;
int error;
memset(&ats, 0, sizeof(ats));
if ((error = clock_gettime(p, SCARG(uap, clock_id), &ats)) != 0)
return (error);
error = copyout(&ats, SCARG(uap, tp), sizeof(ats));
#ifdef KTRACE
if (error == 0 && KTRPOINT(p, KTR_STRUCT))
ktrabstimespec(p, &ats);
#endif
return (error);
}
int
sys_clock_settime(struct proc *p, void *v, register_t *retval)
{
struct sys_clock_settime_args /* {
syscallarg(clockid_t) clock_id;
syscallarg(const struct timespec *) tp;
} */ *uap = v;
struct timespec ats;
clockid_t clock_id;
int error;
if ((error = suser(p)) != 0)
return (error);
if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0)
return (error);
clock_id = SCARG(uap, clock_id);
switch (clock_id) {
case CLOCK_REALTIME:
if (!timespecisvalid(&ats))
return (EINVAL);
if ((error = settime(&ats)) != 0)
return (error);
break;
default: /* Other clocks are read-only */
return (EINVAL);
}
return (0);
}
int
sys_clock_getres(struct proc *p, void *v, register_t *retval)
{
struct sys_clock_getres_args /* {
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
} */ *uap = v;
clockid_t clock_id;
struct bintime bt;
struct timespec ts;
struct proc *q;
u_int64_t scale;
int error = 0, realstathz;
memset(&ts, 0, sizeof(ts));
realstathz = (stathz == 0) ? hz : stathz;
clock_id = SCARG(uap, clock_id);
switch (clock_id) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
case CLOCK_BOOTTIME:
case CLOCK_UPTIME:
memset(&bt, 0, sizeof(bt));
rw_enter_read(&tc_lock);
scale = ((1ULL << 63) / tc_getfrequency()) * 2;
bt.frac = tc_getprecision() * scale;
rw_exit_read(&tc_lock);
BINTIME_TO_TIMESPEC(&bt, &ts);
break;
case CLOCK_PROCESS_CPUTIME_ID:
case CLOCK_THREAD_CPUTIME_ID:
ts.tv_nsec = 1000000000 / realstathz;
break;
default:
/* check for clock from pthread_getcpuclockid() */
if (__CLOCK_TYPE(clock_id) == CLOCK_THREAD_CPUTIME_ID) {
KERNEL_LOCK();
q = tfind(__CLOCK_PTID(clock_id) - THREAD_PID_OFFSET);
if (q == NULL || q->p_p != p->p_p)
error = ESRCH;
else
ts.tv_nsec = 1000000000 / realstathz;
KERNEL_UNLOCK();
} else
error = EINVAL;
break;
}
if (error == 0 && SCARG(uap, tp)) {
ts.tv_nsec = MAX(ts.tv_nsec, 1);
error = copyout(&ts, SCARG(uap, tp), sizeof(ts));
#ifdef KTRACE
if (error == 0 && KTRPOINT(p, KTR_STRUCT))
ktrreltimespec(p, &ts);
#endif
}
return error;
}
int
sys_nanosleep(struct proc *p, void *v, register_t *retval)
{
static int chan;
struct sys_nanosleep_args/* {
syscallarg(const struct timespec *) rqtp;
syscallarg(struct timespec *) rmtp;
} */ *uap = v;
struct timespec elapsed, remainder, request, start, stop;
uint64_t nsecs;
struct timespec *rmtp;
int copyout_error, error;
rmtp = SCARG(uap, rmtp);
error = copyin(SCARG(uap, rqtp), &request, sizeof(request));
if (error)
return (error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrreltimespec(p, &request);
#endif
if (request.tv_sec < 0 || !timespecisvalid(&request))
return (EINVAL);
do {
getnanouptime(&start);
nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&request), MAXTSLP));
error = tsleep_nsec(&chan, PWAIT | PCATCH, "nanoslp", nsecs);
getnanouptime(&stop);
timespecsub(&stop, &start, &elapsed);
timespecsub(&request, &elapsed, &request);
if (request.tv_sec < 0)
timespecclear(&request);
if (error != EWOULDBLOCK)
break;
} while (timespecisset(&request));
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
if (rmtp) {
memset(&remainder, 0, sizeof(remainder));
remainder = request;
copyout_error = copyout(&remainder, rmtp, sizeof(remainder));
if (copyout_error)
error = copyout_error;
#ifdef KTRACE
if (copyout_error == 0 && KTRPOINT(p, KTR_STRUCT))
ktrreltimespec(p, &remainder);
#endif
}
return error;
}
int
sys_gettimeofday(struct proc *p, void *v, register_t *retval)
{
struct sys_gettimeofday_args /* {
syscallarg(struct timeval *) tp;
syscallarg(struct timezone *) tzp;
} */ *uap = v;
struct timeval atv;
static const struct timezone zerotz = { 0, 0 };
struct timeval *tp;
struct timezone *tzp;
int error = 0;
tp = SCARG(uap, tp);
tzp = SCARG(uap, tzp);
if (tp) {
memset(&atv, 0, sizeof(atv));
microtime(&atv);
if ((error = copyout(&atv, tp, sizeof (atv))))
return (error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrabstimeval(p, &atv);
#endif
}
if (tzp)
error = copyout(&zerotz, tzp, sizeof(zerotz));
return (error);
}
int
sys_settimeofday(struct proc *p, void *v, register_t *retval)
{
struct sys_settimeofday_args /* {
syscallarg(const struct timeval *) tv;
syscallarg(const struct timezone *) tzp;
} */ *uap = v;
struct timezone atz;
struct timeval atv;
const struct timeval *tv;
const struct timezone *tzp;
int error;
tv = SCARG(uap, tv);
tzp = SCARG(uap, tzp);
if ((error = suser(p)))
return (error);
/* Verify all parameters before changing time. */
if (tv && (error = copyin(tv, &atv, sizeof(atv))))
return (error);
if (tzp && (error = copyin(tzp, &atz, sizeof(atz))))
return (error);
if (tv) {
struct timespec ts;
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrabstimeval(p, &atv);
#endif
if (!timerisvalid(&atv))
return (EINVAL);
TIMEVAL_TO_TIMESPEC(&atv, &ts);
if ((error = settime(&ts)) != 0)
return (error);
}
return (0);
}
#define ADJFREQ_MAX (500000000LL << 32)
#define ADJFREQ_MIN (-ADJFREQ_MAX)
int
sys_adjfreq(struct proc *p, void *v, register_t *retval)
{
struct sys_adjfreq_args /* {
syscallarg(const int64_t *) freq;
syscallarg(int64_t *) oldfreq;
} */ *uap = v;
int error = 0;
int64_t f, oldf;
const int64_t *freq = SCARG(uap, freq);
int64_t *oldfreq = SCARG(uap, oldfreq);
if (freq) {
if ((error = suser(p)))
return (error);
if ((error = copyin(freq, &f, sizeof(f))))
return (error);
if (f < ADJFREQ_MIN || f > ADJFREQ_MAX)
return (EINVAL);
}
rw_enter(&tc_lock, (freq == NULL) ? RW_READ : RW_WRITE);
if (oldfreq) {
tc_adjfreq(&oldf, NULL);
if ((error = copyout(&oldf, oldfreq, sizeof(oldf))))
goto out;
}
if (freq)
tc_adjfreq(NULL, &f);
out:
rw_exit(&tc_lock);
return (error);
}
int
sys_adjtime(struct proc *p, void *v, register_t *retval)
{
struct sys_adjtime_args /* {
syscallarg(const struct timeval *) delta;
syscallarg(struct timeval *) olddelta;
} */ *uap = v;
struct timeval atv;
const struct timeval *delta = SCARG(uap, delta);
struct timeval *olddelta = SCARG(uap, olddelta);
int64_t adjustment, remaining;
int error;
error = pledge_adjtime(p, delta);
if (error)
return error;
if (delta) {
if ((error = suser(p)))
return (error);
if ((error = copyin(delta, &atv, sizeof(struct timeval))))
return (error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrreltimeval(p, &atv);
#endif
if (!timerisvalid(&atv))
return (EINVAL);
if (atv.tv_sec > INT64_MAX / 1000000)
return EINVAL;
if (atv.tv_sec < INT64_MIN / 1000000)
return EINVAL;
adjustment = atv.tv_sec * 1000000;
if (adjustment > INT64_MAX - atv.tv_usec)
return EINVAL;
adjustment += atv.tv_usec;
rw_enter_write(&tc_lock);
}
if (olddelta) {
tc_adjtime(&remaining, NULL);
memset(&atv, 0, sizeof(atv));
atv.tv_sec = remaining / 1000000;
atv.tv_usec = remaining % 1000000;
if (atv.tv_usec < 0) {
atv.tv_usec += 1000000;
atv.tv_sec--;
}
if ((error = copyout(&atv, olddelta, sizeof(struct timeval))))
goto out;
}
if (delta)
tc_adjtime(NULL, &adjustment);
out:
if (delta)
rw_exit_write(&tc_lock);
return (error);
}
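/*
 * Illustrative userland sketch (not part of this file): slewing the clock
 * 50 milliseconds forward with adjtime(2), which lands in sys_adjtime()
 * above. Privilege and error handling are omitted.
 *
 *	#include <sys/time.h>
 *
 *	struct timeval delta = { .tv_sec = 0, .tv_usec = 50000 };
 *	adjtime(&delta, NULL);
 */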
struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
/*
* Get or set value of an interval timer. The process virtual and
* profiling virtual time timers are kept internally in the
* way they are specified externally: in time until they expire.
*
* The real time interval timer's it_value, in contrast, is kept as an
* absolute time rather than as a delta, so that it is easy to keep
* periodic real-time signals from drifting.
*
* Virtual time timers are processed in the hardclock() routine of
* kern_clock.c. The real time timer is processed by a timeout
* routine, called from the softclock() routine. Since a callout
* may be delayed in real time due to interrupt processing in the system,
* it is possible for the real time timeout routine (realitexpire, given below)
* to be delayed in real time past when it is supposed to occur. It
* does not suffice, therefore, to reload the real timer .it_value from the
* real time timer's .it_interval. Rather, we compute the next time in
* absolute time the timer should go off.
*/
void
setitimer(int which, const struct itimerval *itv, struct itimerval *olditv)
{
struct itimerspec its, oldits;
struct timespec now;
struct itimerspec *itimer;
struct process *pr;
KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
pr = curproc->p_p;
itimer = &pr->ps_timer[which];
if (itv != NULL) {
TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
}
if (which == ITIMER_REAL) {
mtx_enter(&pr->ps_mtx);
nanouptime(&now);
} else
mtx_enter(&itimer_mtx);
if (olditv != NULL)
oldits = *itimer;
if (itv != NULL) {
if (which == ITIMER_REAL) {
if (timespecisset(&its.it_value)) {
timespecadd(&its.it_value, &now, &its.it_value);
timeout_at_ts(&pr->ps_realit_to, &its.it_value);
} else
timeout_del(&pr->ps_realit_to);
}
*itimer = its;
}
if (which == ITIMER_REAL)
mtx_leave(&pr->ps_mtx);
else
mtx_leave(&itimer_mtx);
if (olditv != NULL) {
if (which == ITIMER_REAL && timespecisset(&oldits.it_value)) {
if (timespeccmp(&oldits.it_value, &now, <))
timespecclear(&oldits.it_value);
else {
timespecsub(&oldits.it_value, &now,
&oldits.it_value);
}
}
TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
}
}
void
cancel_all_itimers(void)
{
struct itimerval itv;
int i;
timerclear(&itv.it_value);
timerclear(&itv.it_interval);
for (i = 0; i < nitems(curproc->p_p->ps_timer); i++)
setitimer(i, &itv, NULL);
}
int
sys_getitimer(struct proc *p, void *v, register_t *retval)
{
struct sys_getitimer_args /* {
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
struct itimerval aitv;
int which;
which = SCARG(uap, which);
if (which < ITIMER_REAL || which > ITIMER_PROF)
return EINVAL;
memset(&aitv, 0, sizeof(aitv));
setitimer(which, NULL, &aitv);
return copyout(&aitv, SCARG(uap, itv), sizeof(aitv));
}
int
sys_setitimer(struct proc *p, void *v, register_t *retval)
{
struct sys_setitimer_args /* {
syscallarg(int) which;
syscallarg(const struct itimerval *) itv;
syscallarg(struct itimerval *) oitv;
} */ *uap = v;
struct itimerval aitv, olditv;
struct itimerval *newitvp, *olditvp;
int error, which;
which = SCARG(uap, which);
if (which < ITIMER_REAL || which > ITIMER_PROF)
return EINVAL;
newitvp = olditvp = NULL;
if (SCARG(uap, itv) != NULL) {
error = copyin(SCARG(uap, itv), &aitv, sizeof(aitv));
if (error)
return error;
error = itimerfix(&aitv);
if (error)
return error;
newitvp = &aitv;
}
if (SCARG(uap, oitv) != NULL) {
memset(&olditv, 0, sizeof(olditv));
olditvp = &olditv;
}
if (newitvp == NULL && olditvp == NULL)
return 0;
setitimer(which, newitvp, olditvp);
if (SCARG(uap, oitv) != NULL)
return copyout(&olditv, SCARG(uap, oitv), sizeof(olditv));
return 0;
}
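/*
 * Illustrative userland sketch (not part of this file): arming a periodic
 * one-second ITIMER_REAL timer through sys_setitimer() above; expirations
 * are delivered as SIGALRM by realitexpire() below.
 *
 *	#include <sys/time.h>
 *	#include <signal.h>
 *
 *	void on_alarm(int sig) { }
 *
 *	signal(SIGALRM, on_alarm);
 *	struct itimerval it = {
 *		.it_value    = { .tv_sec = 1, .tv_usec = 0 },
 *		.it_interval = { .tv_sec = 1, .tv_usec = 0 },
 *	};
 *	setitimer(ITIMER_REAL, &it, NULL);
 */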
/*
* Real interval timer expired:
* send process whose timer expired an alarm signal.
* If time is not set up to reload, then just return.
* Else compute next time timer should go off which is > current time.
* This is where delay in processing this timeout causes multiple
* SIGALRM calls to be compressed into one.
*/
void
realitexpire(void *arg)
{
struct timespec cts;
struct process *pr = arg;
struct itimerspec *tp = &pr->ps_timer[ITIMER_REAL];
int need_signal = 0;
mtx_enter(&pr->ps_mtx);
/*
* Do nothing if the timer was cancelled or rescheduled while we
* were entering the mutex.
*/
if (!timespecisset(&tp->it_value) || timeout_pending(&pr->ps_realit_to))
goto out;
/* The timer expired. We need to send the signal. */
need_signal = 1;
/* One-shot timers are not reloaded. */
if (!timespecisset(&tp->it_interval)) {
timespecclear(&tp->it_value);
goto out;
}
/*
* Find the nearest future expiration point and restart
* the timeout.
*/
nanouptime(&cts);
while (timespeccmp(&tp->it_value, &cts, <=))
timespecadd(&tp->it_value, &tp->it_interval, &tp->it_value);
if ((pr->ps_flags & PS_EXITING) == 0)
timeout_at_ts(&pr->ps_realit_to, &tp->it_value);
out:
mtx_leave(&pr->ps_mtx);
if (need_signal)
prsignal(pr, SIGALRM);
}
/*
* Check if the given setitimer(2) input is valid. Clear it_interval
* if it_value is unset. Round it_interval up to the minimum interval
* if necessary.
*/
int
itimerfix(struct itimerval *itv)
{
static const struct timeval max = { .tv_sec = UINT_MAX, .tv_usec = 0 };
struct timeval min_interval = { .tv_sec = 0, .tv_usec = tick };
if (itv->it_value.tv_sec < 0 || !timerisvalid(&itv->it_value))
return EINVAL;
if (timercmp(&itv->it_value, &max, >))
return EINVAL;
if (itv->it_interval.tv_sec < 0 || !timerisvalid(&itv->it_interval))
return EINVAL;
if (timercmp(&itv->it_interval, &max, >))
return EINVAL;
if (!timerisset(&itv->it_value))
timerclear(&itv->it_interval);
if (timerisset(&itv->it_interval)) {
if (timercmp(&itv->it_interval, &min_interval, <))
itv->it_interval = min_interval;
}
return 0;
}
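/*
 * Worked example (illustrative, assuming hz=100 so tick is 10000 us):
 * a request of it_value = 5s / it_interval = 3us passes validation, but
 * the interval is rounded up to the 10000 us minimum by the code above.
 */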
/*
* Decrement an interval timer by the given number of nanoseconds.
* If the timer expires and it is periodic then reload it. When reloading
* the timer we subtract any overrun from the next period so that the timer
* does not drift.
*/
int
itimerdecr(struct itimerspec *itp, long nsec)
{
struct timespec decrement;
NSEC_TO_TIMESPEC(nsec, &decrement);
mtx_enter(&itimer_mtx);
/*
* Double-check that the timer is enabled. A different thread
* in setitimer(2) may have disabled it while we were entering
* the mutex.
*/
if (!timespecisset(&itp->it_value)) {
mtx_leave(&itimer_mtx);
return (1);
}
/*
* The timer is enabled. Update and reload it as needed.
*/
timespecsub(&itp->it_value, &decrement, &itp->it_value);
if (itp->it_value.tv_sec >= 0 && timespecisset(&itp->it_value)) {
mtx_leave(&itimer_mtx);
return (1);
}
if (!timespecisset(&itp->it_interval)) {
timespecclear(&itp->it_value);
mtx_leave(&itimer_mtx);
return (0);
}
while (itp->it_value.tv_sec < 0 || !timespecisset(&itp->it_value))
timespecadd(&itp->it_value, &itp->it_interval, &itp->it_value);
mtx_leave(&itimer_mtx);
return (0);
}
struct mutex ratecheck_mtx = MUTEX_INITIALIZER(IPL_HIGH);
/*
* ratecheck(): simple time-based rate-limit checking. see ratecheck(9)
* for usage and rationale.
*/
int
ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
{
struct timeval tv, delta;
int rv = 0;
getmicrouptime(&tv);
mtx_enter(&ratecheck_mtx);
timersub(&tv, lasttime, &delta);
/*
* the check for 0,0 is so that the message will be seen at least once,
* even if interval is huge.
*/
if (timercmp(&delta, mininterval, >=) ||
(lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
*lasttime = tv;
rv = 1;
}
mtx_leave(&ratecheck_mtx);
return (rv);
}
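/*
 * Illustrative use, in the style of ratecheck(9) (a sketch, not code from
 * this file): print a diagnostic at most once every 10 seconds.
 *
 *	static struct timeval lasttime;
 *	static const struct timeval interval = { .tv_sec = 10, .tv_usec = 0 };
 *
 *	if (ratecheck(&lasttime, &interval))
 *		printf("something noteworthy happened\n");
 */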
struct mutex ppsratecheck_mtx = MUTEX_INITIALIZER(IPL_HIGH);
/*
* ppsratecheck(): packets (or events) per second limitation.
*/
int
ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
{
struct timeval tv, delta;
int rv;
microuptime(&tv);
mtx_enter(&ppsratecheck_mtx);
timersub(&tv, lasttime, &delta);
/*
* the check for 0,0 is so that the message will be seen at least once.
* if more than one second has passed since the last update of
* lasttime, reset the counter.
*
* we do increment *curpps even in the *curpps < maxpps case, as some may
* try to use *curpps for stat purposes as well.
*/
if (maxpps == 0)
rv = 0;
else if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) ||
delta.tv_sec >= 1) {
*lasttime = tv;
*curpps = 0;
rv = 1;
} else if (maxpps < 0)
rv = 1;
else if (*curpps < maxpps)
rv = 1;
else
rv = 0;
/* be careful about wrap-around */
if (*curpps + 1 > *curpps)
*curpps = *curpps + 1;
mtx_leave(&ppsratecheck_mtx);
return (rv);
}
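/*
 * Illustrative use (a sketch, not code from this file): allow at most one
 * log message per second with the ppsratecheck() interface above.
 *
 *	static struct timeval lasttime;
 *	static int curpps;
 *
 *	if (ppsratecheck(&lasttime, &curpps, 1))
 *		log(LOG_NOTICE, "dropping packets\n");
 */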
todr_chip_handle_t todr_handle;
int inittodr_done;
#define MINYEAR ((OpenBSD / 100) - 1) /* minimum plausible year */
/*
* inittodr:
*
* Initialize time from the time-of-day register.
*/
void
inittodr(time_t base)
{
time_t deltat;
struct timeval rtctime;
struct timespec ts;
int badbase;
inittodr_done = 1;
if (base < (MINYEAR - 1970) * SECYR) {
printf("WARNING: preposterous time in file system\n");
/* read the system clock anyway */
base = (MINYEAR - 1970) * SECYR;
badbase = 1;
} else
badbase = 0;
rtctime.tv_sec = base;
rtctime.tv_usec = 0;
if (todr_handle == NULL ||
todr_gettime(todr_handle, &rtctime) != 0 ||
rtctime.tv_sec < (MINYEAR - 1970) * SECYR) {
/*
* Believe the time in the file system for lack of
* anything better, resetting the TODR.
*/
rtctime.tv_sec = base;
rtctime.tv_usec = 0;
if (todr_handle != NULL && !badbase)
printf("WARNING: bad clock chip time\n");
ts.tv_sec = rtctime.tv_sec;
ts.tv_nsec = rtctime.tv_usec * 1000;
tc_setclock(&ts);
goto bad;
} else {
ts.tv_sec = rtctime.tv_sec;
ts.tv_nsec = rtctime.tv_usec * 1000;
tc_setclock(&ts);
}
if (!badbase) {
/*
* See if we gained/lost two or more days; if
* so, assume something is amiss.
*/
deltat = rtctime.tv_sec - base;
if (deltat < 0)
deltat = -deltat;
if (deltat < 2 * SECDAY)
return; /* all is well */
#ifndef SMALL_KERNEL
printf("WARNING: clock %s %lld days\n",
rtctime.tv_sec < base ? "lost" : "gained",
(long long)(deltat / SECDAY));
#endif
}
bad:
printf("WARNING: CHECK AND RESET THE DATE!\n");
}
/*
* resettodr:
*
* Reset the time-of-day register with the current time.
*/
void
resettodr(void)
{
struct timeval rtctime;
/*
* Skip writing the RTC if inittodr(9) never ran. We don't
* want to overwrite a reasonable value with a nonsense value.
*/
if (!inittodr_done)
return;
microtime(&rtctime);
if (todr_handle != NULL &&
todr_settime(todr_handle, &rtctime) != 0)
printf("WARNING: can't update clock chip time\n");
}
void
todr_attach(struct todr_chip_handle *todr)
{
todr_handle = todr;
}
#define RESETTODR_PERIOD 1800
void periodic_resettodr(void *);
void perform_resettodr(void *);
struct timeout resettodr_to = TIMEOUT_INITIALIZER(periodic_resettodr, NULL);
struct task resettodr_task = TASK_INITIALIZER(perform_resettodr, NULL);
void
periodic_resettodr(void *arg __unused)
{
task_add(systq, &resettodr_task);
}
void
perform_resettodr(void *arg __unused)
{
resettodr();
timeout_add_sec(&resettodr_to, RESETTODR_PERIOD);
}
void
start_periodic_resettodr(void)
{
timeout_add_sec(&resettodr_to, RESETTODR_PERIOD);
}
void
stop_periodic_resettodr(void)
{
timeout_del(&resettodr_to);
task_del(systq, &resettodr_task);
}
/* $OpenBSD: in_pcb.c,v 1.275 2022/09/03 22:43:38 mvs Exp $ */
/* $NetBSD: in_pcb.c,v 1.25 1996/02/13 23:41:53 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include "pf.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/pfvar.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#ifdef IPSEC
#include <netinet/ip_esp.h>
#endif /* IPSEC */
#include "stoeplitz.h"
#if NSTOEPLITZ > 0
#include <net/toeplitz.h>
#endif
const struct in_addr zeroin_addr;
union {
struct in_addr za_in;
struct in6_addr za_in6;
} zeroin46_addr;
/*
* These configure the range of local port addresses assigned to
* "unspecified" outgoing connections/packets/whatever.
*/
int ipport_firstauto = IPPORT_RESERVED;
int ipport_lastauto = IPPORT_USERRESERVED;
int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;
int ipport_hilastauto = IPPORT_HILASTAUTO;
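/*
 * These defaults are run-time tunable; the usual knobs are the
 * net.inet.ip.portfirst/portlast and porthifirst/porthilast sysctls
 * (names given here as an assumption, not verified from this file), e.g.:
 *
 *	# sysctl net.inet.ip.portfirst=49152
 */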
struct baddynamicports baddynamicports;
struct baddynamicports rootonlyports;
struct pool inpcb_pool;
void in_pcbhash_insert(struct inpcb *);
struct inpcb *in_pcbhash_lookup(struct inpcbtable *, u_int,
const struct in_addr *, u_short, const struct in_addr *, u_short);
int in_pcbresize(struct inpcbtable *, int);
#define INPCBHASH_LOADFACTOR(_x) (((_x) * 3) / 4)
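/*
 * Worked example (illustrative): with a 128-bucket table the load factor
 * is 96, so in_pcballoc() below doubles the hash to 256 buckets once the
 * PCB count exceeds 96.
 */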
struct inpcbhead *in_pcbhash(struct inpcbtable *, u_int,
const struct in_addr *, u_short, const struct in_addr *, u_short);
struct inpcbhead *in_pcblhash(struct inpcbtable *, u_int, u_short);
/*
* in_pcb is used for inet and inet6. in6_pcb only contains special
* IPv6 cases. So the internet initializer is used for both domains.
*/
void
in_init(void)
{
pool_init(&inpcb_pool, sizeof(struct inpcb), 0,
IPL_SOFTNET, 0, "inpcb", NULL);
}
struct inpcbhead *
in_pcbhash(struct inpcbtable *table, u_int rdomain,
const struct in_addr *faddr, u_short fport,
const struct in_addr *laddr, u_short lport)
{
SIPHASH_CTX ctx;
u_int32_t nrdom = htonl(rdomain);
SipHash24_Init(&ctx, &table->inpt_key);
SipHash24_Update(&ctx, &nrdom, sizeof(nrdom));
SipHash24_Update(&ctx, faddr, sizeof(*faddr));
SipHash24_Update(&ctx, &fport, sizeof(fport));
SipHash24_Update(&ctx, laddr, sizeof(*laddr));
SipHash24_Update(&ctx, &lport, sizeof(lport));
return (&table->inpt_hashtbl[SipHash24_End(&ctx) & table->inpt_mask]);
}
struct inpcbhead *
in_pcblhash(struct inpcbtable *table, u_int rdomain, u_short lport)
{
SIPHASH_CTX ctx;
u_int32_t nrdom = htonl(rdomain);
SipHash24_Init(&ctx, &table->inpt_lkey);
SipHash24_Update(&ctx, &nrdom, sizeof(nrdom));
SipHash24_Update(&ctx, &lport, sizeof(lport));
return (&table->inpt_lhashtbl[SipHash24_End(&ctx) & table->inpt_lmask]);
}
void
in_pcbinit(struct inpcbtable *table, int hashsize)
{
mtx_init(&table->inpt_mtx, IPL_SOFTNET);
rw_init(&table->inpt_notify, "inpnotify");
TAILQ_INIT(&table->inpt_queue);
table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
&table->inpt_mask);
table->inpt_lhashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
&table->inpt_lmask);
table->inpt_count = 0;
table->inpt_size = hashsize;
arc4random_buf(&table->inpt_key, sizeof(table->inpt_key));
arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey));
}
/*
* Check if the specified port is invalid for dynamic allocation.
*/
int
in_baddynamic(u_int16_t port, u_int16_t proto)
{
switch (proto) {
case IPPROTO_TCP:
return (DP_ISSET(baddynamicports.tcp, port));
case IPPROTO_UDP:
#ifdef IPSEC
/* Cannot preset this as it is a sysctl */
if (port == udpencap_port)
return (1);
#endif
return (DP_ISSET(baddynamicports.udp, port));
default:
return (0);
}
}
int
in_rootonly(u_int16_t port, u_int16_t proto)
{
switch (proto) {
case IPPROTO_TCP:
return (port < IPPORT_RESERVED || DP_ISSET(rootonlyports.tcp, port));
case IPPROTO_UDP:
return (port < IPPORT_RESERVED || DP_ISSET(rootonlyports.udp, port));
default:
return (0);
}
}
int
in_pcballoc(struct socket *so, struct inpcbtable *table)
{
struct inpcb *inp;
inp = pool_get(&inpcb_pool, PR_NOWAIT|PR_ZERO);
if (inp == NULL)
return (ENOBUFS);
inp->inp_table = table;
inp->inp_socket = so;
refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
mtx_init(&inp->inp_mtx, IPL_SOFTNET);
inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT;
inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
inp->inp_seclevel[SL_IPCOMP] = IPSEC_IPCOMP_LEVEL_DEFAULT;
inp->inp_rtableid = curproc->p_p->ps_rtableid;
inp->inp_hops = -1;
#ifdef INET6
/*
* Small change in this function to set the INP_IPV6 flag so routines
* outside pcb-specific routines don't need to use sotopf(), and all
* of its pointer chasing, later.
*/
if (sotopf(so) == PF_INET6)
inp->inp_flags = INP_IPV6;
inp->inp_cksum6 = -1;
#endif /* INET6 */
mtx_enter(&table->inpt_mtx);
if (table->inpt_count++ > INPCBHASH_LOADFACTOR(table->inpt_size))
(void)in_pcbresize(table, table->inpt_size * 2);
TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue);
in_pcbhash_insert(inp);
mtx_leave(&table->inpt_mtx);
so->so_pcb = inp;
return (0);
}
int
in_pcbbind(struct inpcb *inp, struct mbuf *nam, struct proc *p)
{
struct socket *so = inp->inp_socket;
u_int16_t lport = 0;
int wild = 0;
void *laddr = &zeroin46_addr;
int error;
if (inp->inp_lport)
return (EINVAL);
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
(so->so_options & SO_ACCEPTCONN) == 0))
wild = INPLOOKUP_WILDCARD;
switch (sotopf(so)) {
#ifdef INET6
case PF_INET6:
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6))
return (EINVAL);
wild |= INPLOOKUP_IPV6;
if (nam) {
struct sockaddr_in6 *sin6;
if ((error = in6_nam2sin6(nam, &sin6)))
return (error);
if ((error = in6_pcbaddrisavail(inp, sin6, wild, p)))
return (error);
laddr = &sin6->sin6_addr;
lport = sin6->sin6_port;
}
break;
#endif
case PF_INET:
if (inp->inp_laddr.s_addr != INADDR_ANY)
return (EINVAL);
if (nam) {
struct sockaddr_in *sin;
if ((error = in_nam2sin(nam, &sin)))
return (error);
if ((error = in_pcbaddrisavail(inp, sin, wild, p)))
return (error);
laddr = &sin->sin_addr;
lport = sin->sin_port;
}
break;
default:
return (EINVAL);
}
if (lport == 0) {
if ((error = in_pcbpickport(&lport, laddr, wild, inp, p)))
return (error);
} else {
if (in_rootonly(ntohs(lport), so->so_proto->pr_protocol) &&
suser(p) != 0)
return (EACCES);
}
if (nam) {
switch (sotopf(so)) {
#ifdef INET6
case PF_INET6:
inp->inp_laddr6 = *(struct in6_addr *)laddr;
break;
#endif
case PF_INET:
inp->inp_laddr = *(struct in_addr *)laddr;
break;
}
}
inp->inp_lport = lport;
in_pcbrehash(inp);
return (0);
}
int
in_pcbaddrisavail(struct inpcb *inp, struct sockaddr_in *sin, int wild,
struct proc *p)
{
struct socket *so = inp->inp_socket;
struct inpcbtable *table = inp->inp_table;
u_int16_t lport = sin->sin_port;
int reuseport = (so->so_options & SO_REUSEPORT);
if (IN_MULTICAST(sin->sin_addr.s_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & (SO_REUSEADDR|SO_REUSEPORT))
reuseport = SO_REUSEADDR|SO_REUSEPORT;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
/*
* we must check that we are binding to an address we
* own except when:
* - SO_BINDANY is set or
* - we are binding a UDP socket to 255.255.255.255 or
* - we are binding a UDP socket to one of our broadcast
* addresses
*/
if (!ISSET(so->so_options, SO_BINDANY) &&
!(so->so_type == SOCK_DGRAM &&
sin->sin_addr.s_addr == INADDR_BROADCAST) &&
!(so->so_type == SOCK_DGRAM &&
in_broadcast(sin->sin_addr, inp->inp_rtableid))) {
struct ifaddr *ia;
sin->sin_port = 0;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
ia = ifa_ifwithaddr(sintosa(sin), inp->inp_rtableid);
sin->sin_port = lport;
if (ia == NULL)
return (EADDRNOTAVAIL);
}
}
if (lport) {
struct inpcb *t;
int error = 0;
if (so->so_euid && !IN_MULTICAST(sin->sin_addr.s_addr)) {
t = in_pcblookup_local(table, &sin->sin_addr, lport,
INPLOOKUP_WILDCARD, inp->inp_rtableid);
if (t && (so->so_euid != t->inp_socket->so_euid))
error = EADDRINUSE;
in_pcbunref(t);
if (error)
return (error);
}
t = in_pcblookup_local(table, &sin->sin_addr, lport,
wild, inp->inp_rtableid);
if (t && (reuseport & t->inp_socket->so_options) == 0)
error = EADDRINUSE;
in_pcbunref(t);
if (error)
return (error);
}
return (0);
}
int
in_pcbpickport(u_int16_t *lport, void *laddr, int wild, struct inpcb *inp,
struct proc *p)
{
struct socket *so = inp->inp_socket;
struct inpcbtable *table = inp->inp_table;
struct inpcb *t;
u_int16_t first, last, lower, higher, candidate, localport;
int count;
if (inp->inp_flags & INP_HIGHPORT) {
first = ipport_hifirstauto; /* sysctl */
last = ipport_hilastauto;
} else if (inp->inp_flags & INP_LOWPORT) {
if (suser(p))
return (EACCES);
first = IPPORT_RESERVED-1; /* 1023 */
last = 600; /* not IPPORT_RESERVED/2 */
} else {
first = ipport_firstauto; /* sysctl */
last = ipport_lastauto;
}
if (first < last) {
lower = first;
higher = last;
} else {
lower = last;
higher = first;
}
/*
* Simple check to ensure all ports are not used up causing
* a deadlock here.
*/
count = higher - lower;
candidate = lower + arc4random_uniform(count);
t = NULL;
do {
in_pcbunref(t);
do {
if (count-- < 0) /* completely used? */
return (EADDRNOTAVAIL);
++candidate;
if (candidate < lower || candidate > higher)
candidate = lower;
localport = htons(candidate);
} while (in_baddynamic(candidate, so->so_proto->pr_protocol));
t = in_pcblookup_local(table, laddr, localport, wild,
inp->inp_rtableid);
} while (t != NULL);
*lport = localport;
return (0);
}
/*
* Connect from a socket to a specified address.
* Both address and port must be specified in argument sin.
* If we don't have a local address for this socket yet,
* then pick one.
*/
int
in_pcbconnect(struct inpcb *inp, struct mbuf *nam)
{
struct in_addr ina;
struct sockaddr_in *sin;
struct inpcb *t;
int error;
#ifdef INET6
if (sotopf(inp->inp_socket) == PF_INET6)
return (in6_pcbconnect(inp, nam));
KASSERT((inp->inp_flags & INP_IPV6) == 0);
#endif /* INET6 */
if ((error = in_nam2sin(nam, &sin)))
return (error);
if (sin->sin_port == 0)
return (EADDRNOTAVAIL);
error = in_pcbselsrc(&ina, sin, inp);
if (error)
return (error);
t = in_pcblookup(inp->inp_table, sin->sin_addr, sin->sin_port,
ina, inp->inp_lport, inp->inp_rtableid);
if (t != NULL) {
in_pcbunref(t);
return (EADDRINUSE);
}
KASSERT(inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport);
if (inp->inp_laddr.s_addr == INADDR_ANY) {
if (inp->inp_lport == 0) {
error = in_pcbbind(inp, NULL, curproc);
if (error)
return (error);
t = in_pcblookup(inp->inp_table, sin->sin_addr,
sin->sin_port, ina, inp->inp_lport,
inp->inp_rtableid);
if (t != NULL) {
inp->inp_lport = 0;
in_pcbunref(t);
return (EADDRINUSE);
}
}
inp->inp_laddr = ina;
}
inp->inp_faddr = sin->sin_addr;
inp->inp_fport = sin->sin_port;
in_pcbrehash(inp);
#if NSTOEPLITZ > 0
inp->inp_flowid = stoeplitz_ip4port(inp->inp_faddr.s_addr,
inp->inp_laddr.s_addr, inp->inp_fport, inp->inp_lport);
#endif
return (0);
}
void
in_pcbdisconnect(struct inpcb *inp)
{
#if NPF > 0
if (inp->inp_pf_sk) {
pf_remove_divert_state(inp->inp_pf_sk);
/* pf_remove_divert_state() may have detached the state */
pf_inp_unlink(inp);
}
#endif
switch (sotopf(inp->inp_socket)) {
#ifdef INET6
case PF_INET6:
inp->inp_faddr6 = in6addr_any;
break;
#endif
case PF_INET:
inp->inp_faddr.s_addr = INADDR_ANY;
break;
}
inp->inp_fport = 0;
inp->inp_flowid = 0;
in_pcbrehash(inp);
if (inp->inp_socket->so_state & SS_NOFDREF)
in_pcbdetach(inp);
}
void
in_pcbdetach(struct inpcb *inp)
{
struct socket *so = inp->inp_socket;
struct inpcbtable *table = inp->inp_table;
so->so_pcb = NULL;
/*
* As long as the NET_LOCK() is the default lock for Internet
* sockets, do not release it to not introduce new sleeping
* points.
*/
sofree(so, 1);
m_freem(inp->inp_options);
if (inp->inp_route.ro_rt) {
rtfree(inp->inp_route.ro_rt);
inp->inp_route.ro_rt = NULL;
}
#ifdef INET6
if (inp->inp_flags & INP_IPV6) {
ip6_freepcbopts(inp->inp_outputopts6);
ip6_freemoptions(inp->inp_moptions6);
} else
#endif
ip_freemoptions(inp->inp_moptions);
#if NPF > 0
if (inp->inp_pf_sk) {
pf_remove_divert_state(inp->inp_pf_sk);
/* pf_remove_divert_state() may have detached the state */
pf_inp_unlink(inp);
}
#endif
mtx_enter(&table->inpt_mtx);
LIST_REMOVE(inp, inp_lhash);
LIST_REMOVE(inp, inp_hash);
TAILQ_REMOVE(&table->inpt_queue, inp, inp_queue);
table->inpt_count--;
mtx_leave(&table->inpt_mtx);
in_pcbunref(inp);
}
struct inpcb *
in_pcbref(struct inpcb *inp)
{
if (inp == NULL)
return NULL;
refcnt_take(&inp->inp_refcnt);
return inp;
}
void
in_pcbunref(struct inpcb *inp)
{
if (inp == NULL)
return;
if (refcnt_rele(&inp->inp_refcnt) == 0)
return;
KASSERT((LIST_NEXT(inp, inp_hash) == NULL) ||
(LIST_NEXT(inp, inp_hash) == _Q_INVALID));
KASSERT((LIST_NEXT(inp, inp_lhash) == NULL) ||
(LIST_NEXT(inp, inp_lhash) == _Q_INVALID));
KASSERT((TAILQ_NEXT(inp, inp_queue) == NULL) ||
(TAILQ_NEXT(inp, inp_queue) == _Q_INVALID));
pool_put(&inpcb_pool, inp);
}
void
in_setsockaddr(struct inpcb *inp, struct mbuf *nam)
{
struct sockaddr_in *sin;
nam->m_len = sizeof(*sin);
sin = mtod(nam, struct sockaddr_in *);
memset(sin, 0, sizeof(*sin));
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_port = inp->inp_lport;
sin->sin_addr = inp->inp_laddr;
}
void
in_setpeeraddr(struct inpcb *inp, struct mbuf *nam)
{
struct sockaddr_in *sin;
#ifdef INET6
if (sotopf(inp->inp_socket) == PF_INET6) {
in6_setpeeraddr(inp, nam);
return;
}
#endif /* INET6 */
nam->m_len = sizeof(*sin);
sin = mtod(nam, struct sockaddr_in *);
memset(sin, 0, sizeof(*sin));
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_port = inp->inp_fport;
sin->sin_addr = inp->inp_faddr;
}
int
in_sockaddr(struct socket *so, struct mbuf *nam)
{
struct inpcb *inp;
inp = sotoinpcb(so);
in_setsockaddr(inp, nam);
return (0);
}
int
in_peeraddr(struct socket *so, struct mbuf *nam)
{
struct inpcb *inp;
inp = sotoinpcb(so);
in_setpeeraddr(inp, nam);
return (0);
}
/*
* Pass some notification to all connections of a protocol
* associated with address dst. The "usual action" will be
* taken, depending on the ctlinput cmd. The caller must filter any
* cmds that are uninteresting (e.g., no error in the map).
* Call the protocol specific routine (if any) to report
* any errors for each matching socket.
*/
void
in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable,
int errno, void (*notify)(struct inpcb *, int))
{
SIMPLEQ_HEAD(, inpcb) inpcblist;
struct inpcb *inp;
struct in_addr faddr;
u_int rdomain;
if (dst->sa_family != AF_INET)
return;
faddr = satosin(dst)->sin_addr;
if (faddr.s_addr == INADDR_ANY)
return;
if (notify == NULL)
return;
/*
* Use a temporary notify list protected by rwlock to run over
* selected PCB. This is necessary as the list of all PCB is
* protected by a mutex. Notify may call ip_output() eventually
* which may sleep as pf lock is a rwlock. Also the SRP
* implementation of the routing table might sleep.
* The same inp_notify list entry and inpt_notify rwlock are
* used for UDP multicast and raw IP delivery.
*/
SIMPLEQ_INIT(&inpcblist);
rdomain = rtable_l2(rtable);
rw_enter_write(&table->inpt_notify);
mtx_enter(&table->inpt_mtx);
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
#ifdef INET6
if (inp->inp_flags & INP_IPV6)
continue;
#endif
if (inp->inp_faddr.s_addr != faddr.s_addr ||
rtable_l2(inp->inp_rtableid) != rdomain ||
inp->inp_socket == NULL) {
continue;
}
in_pcbref(inp);
SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
}
mtx_leave(&table->inpt_mtx);
while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
(*notify)(inp, errno);
in_pcbunref(inp);
}
rw_exit_write(&table->inpt_notify);
}
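/*
 * Illustrative sketch of a caller (an assumption about typical usage, not
 * code from this file): a protocol ctlinput handler pushing an ICMP-derived
 * error to every matching PCB, along the lines of
 *
 *	in_pcbnotifyall(&udbtable, sa, rdomain, EHOSTUNREACH, udp_notify);
 */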
/*
* Check for alternatives when higher level complains
* about service problems. For now, invalidate cached
* routing information. If the route was created dynamically
* (by a redirect), time to try a default gateway again.
*/
void
in_losing(struct inpcb *inp)
{
struct rtentry *rt = inp->inp_route.ro_rt;
if (rt) {
inp->inp_route.ro_rt = NULL;
if (rt->rt_flags & RTF_DYNAMIC) {
struct ifnet *ifp;
ifp = if_get(rt->rt_ifidx);
/*
* If the interface is gone, all its attached
* route entries have been removed from the table,
* so we're dealing with a stale cache and have
* nothing to do.
*/
if (ifp != NULL)
rtdeletemsg(rt, ifp, inp->inp_rtableid);
if_put(ifp);
}
/*
* A new route can be allocated
* the next time output is attempted.
* rtfree() needs to be called in any case because the inp
* is still holding a reference to rt.
*/
rtfree(rt);
}
}
/*
* After a routing change, flush old routing
* and allocate a (hopefully) better one.
*/
void
in_rtchange(struct inpcb *inp, int errno)
{
if (inp->inp_route.ro_rt) {
rtfree(inp->inp_route.ro_rt);
inp->inp_route.ro_rt = NULL;
/*
* A new route can be allocated the next time
* output is attempted.
*/
}
}
struct inpcb *
in_pcblookup_local(struct inpcbtable *table, void *laddrp, u_int lport_arg,
int flags, u_int rtable)
{
struct inpcb *inp, *match = NULL;
int matchwild = 3, wildcard;
u_int16_t lport = lport_arg;
struct in_addr laddr = *(struct in_addr *)laddrp;
#ifdef INET6
struct in6_addr *laddr6 = (struct in6_addr *)laddrp;
#endif
struct inpcbhead *head;
u_int rdomain;
rdomain = rtable_l2(rtable);
mtx_enter(&table->inpt_mtx);
head = in_pcblhash(table, rdomain, lport);
LIST_FOREACH(inp, head, inp_lhash) {
if (rtable_l2(inp->inp_rtableid) != rdomain)
continue;
if (inp->inp_lport != lport)
continue;
wildcard = 0;
#ifdef INET6
if (ISSET(flags, INPLOOKUP_IPV6)) {
if (!ISSET(inp->inp_flags, INP_IPV6))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6))
wildcard++;
if (!IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr6)) {
if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) ||
IN6_IS_ADDR_UNSPECIFIED(laddr6))
wildcard++;
else
continue;
}
} else
#endif /* INET6 */
{
#ifdef INET6
if (ISSET(inp->inp_flags, INP_IPV6))
continue;
#endif /* INET6 */
if (inp->inp_faddr.s_addr != INADDR_ANY)
wildcard++;
if (inp->inp_laddr.s_addr != laddr.s_addr) {
if (inp->inp_laddr.s_addr == INADDR_ANY ||
laddr.s_addr == INADDR_ANY)
wildcard++;
else
continue;
}
}
if ((!wildcard || (flags & INPLOOKUP_WILDCARD)) &&
wildcard < matchwild) {
match = inp;
if ((matchwild = wildcard) == 0)
break;
}
}
in_pcbref(match);
mtx_leave(&table->inpt_mtx);
return (match);
}
struct rtentry *
in_pcbrtentry(struct inpcb *inp)
{
struct route *ro;
ro = &inp->inp_route;
/* check if route is still valid */
if (!rtisvalid(ro->ro_rt)) {
rtfree(ro->ro_rt);
ro->ro_rt = NULL;
}
/*
* No route yet, so try to acquire one.
*/
if (ro->ro_rt == NULL) {
#ifdef INET6
memset(ro, 0, sizeof(struct route_in6));
#else
memset(ro, 0, sizeof(struct route));
#endif
switch(sotopf(inp->inp_socket)) {
#ifdef INET6
case PF_INET6:
if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6))
break;
ro->ro_dst.sa_family = AF_INET6;
ro->ro_dst.sa_len = sizeof(struct sockaddr_in6);
satosin6(&ro->ro_dst)->sin6_addr = inp->inp_faddr6;
ro->ro_tableid = inp->inp_rtableid;
ro->ro_rt = rtalloc_mpath(&ro->ro_dst,
&inp->inp_laddr6.s6_addr32[0], ro->ro_tableid);
break;
#endif /* INET6 */
case PF_INET:
if (inp->inp_faddr.s_addr == INADDR_ANY)
break;
ro->ro_dst.sa_family = AF_INET;
ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
satosin(&ro->ro_dst)->sin_addr = inp->inp_faddr;
ro->ro_tableid = inp->inp_rtableid;
ro->ro_rt = rtalloc_mpath(&ro->ro_dst,
&inp->inp_laddr.s_addr, ro->ro_tableid);
break;
}
}
return (ro->ro_rt);
}
/*
* Return an IPv4 address, which is the most appropriate for a given
* destination.
* If necessary, this function looks up the routing table and returns
* an entry to the caller for later use.
*/
int
in_pcbselsrc(struct in_addr *insrc, struct sockaddr_in *sin,
struct inpcb *inp)
{
struct ip_moptions *mopts = inp->inp_moptions;
struct route *ro = &inp->inp_route;
struct in_addr *laddr = &inp->inp_laddr;
u_int rtableid = inp->inp_rtableid;
struct sockaddr *ip4_source = NULL;
struct sockaddr_in *sin2;
struct in_ifaddr *ia = NULL;
/*
* If the socket(if any) is already bound, use that bound address
* unless it is INADDR_ANY or INADDR_BROADCAST.
*/
if (laddr->s_addr != INADDR_ANY &&
laddr->s_addr != INADDR_BROADCAST) {
*insrc = *laddr;
return (0);
}
/*
* If the destination address is multicast or limited
* broadcast (255.255.255.255) and an outgoing interface has
* been set as a multicast option, use the address of that
* interface as our source address.
*/
if ((IN_MULTICAST(sin->sin_addr.s_addr) ||
sin->sin_addr.s_addr == INADDR_BROADCAST) && mopts != NULL) {
struct ifnet *ifp;
ifp = if_get(mopts->imo_ifidx);
if (ifp != NULL) {
if (ifp->if_rdomain == rtable_l2(rtableid))
IFP_TO_IA(ifp, ia);
if (ia == NULL) {
if_put(ifp);
return (EADDRNOTAVAIL);
}
*insrc = ia->ia_addr.sin_addr;
if_put(ifp);
return (0);
}
}
/*
* If route is known or can be allocated now,
* our src addr is taken from the i/f, else punt.
*/
if (!rtisvalid(ro->ro_rt) || (ro->ro_tableid != rtableid) ||
(satosin(&ro->ro_dst)->sin_addr.s_addr != sin->sin_addr.s_addr)) {
rtfree(ro->ro_rt);
ro->ro_rt = NULL;
}
if (ro->ro_rt == NULL) {
/* No route yet, so try to acquire one */
ro->ro_dst.sa_family = AF_INET;
ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
satosin(&ro->ro_dst)->sin_addr = sin->sin_addr;
ro->ro_tableid = rtableid;
ro->ro_rt = rtalloc_mpath(&ro->ro_dst, NULL, ro->ro_tableid);
/*
* It is important to zero out the rest of the
* struct sockaddr_in when mixing v6 & v4!
*/
sin2 = satosin(&ro->ro_dst);
memset(sin2->sin_zero, 0, sizeof(sin2->sin_zero));
}
/*
* If we found a route, use the address
* corresponding to the outgoing interface.
*/
if (ro->ro_rt != NULL)
ia = ifatoia(ro->ro_rt->rt_ifa);
/*
* Use preferred source address if :
* - destination is not onlink
* - preferred source address is set
* - output interface is UP
*/
if (ro->ro_rt && !(ro->ro_rt->rt_flags & RTF_LLINFO) &&
!(ro->ro_rt->rt_flags & RTF_HOST)) {
ip4_source = rtable_getsource(rtableid, AF_INET);
if (ip4_source != NULL) {
struct ifaddr *ifa;
if ((ifa = ifa_ifwithaddr(ip4_source, rtableid)) != NULL &&
ISSET(ifa->ifa_ifp->if_flags, IFF_UP)) {
*insrc = satosin(ip4_source)->sin_addr;
return (0);
}
}
}
if (ia == NULL)
return (EADDRNOTAVAIL);
*insrc = ia->ia_addr.sin_addr;
return (0);
}
void
in_pcbrehash(struct inpcb *inp)
{
struct inpcbtable *table = inp->inp_table;
mtx_enter(&table->inpt_mtx);
LIST_REMOVE(inp, inp_lhash);
LIST_REMOVE(inp, inp_hash);
in_pcbhash_insert(inp);
mtx_leave(&table->inpt_mtx);
}
void
in_pcbhash_insert(struct inpcb *inp)
{
struct inpcbtable *table = inp->inp_table;
struct inpcbhead *head;
NET_ASSERT_LOCKED();
MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
head = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport);
LIST_INSERT_HEAD(head, inp, inp_lhash);
#ifdef INET6
if (inp->inp_flags & INP_IPV6)
head = in6_pcbhash(table, rtable_l2(inp->inp_rtableid),
&inp->inp_faddr6, inp->inp_fport,
&inp->inp_laddr6, inp->inp_lport);
else
#endif /* INET6 */
head = in_pcbhash(table, rtable_l2(inp->inp_rtableid),
&inp->inp_faddr, inp->inp_fport,
&inp->inp_laddr, inp->inp_lport);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
struct inpcb *
in_pcbhash_lookup(struct inpcbtable *table, u_int rdomain,
const struct in_addr *faddr, u_short fport,
const struct in_addr *laddr, u_short lport)
{
struct inpcbhead *head;
struct inpcb *inp;
NET_ASSERT_LOCKED();
MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
head = in_pcbhash(table, rdomain, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
if (ISSET(inp->inp_flags, INP_IPV6))
continue;
#endif
if (inp->inp_fport == fport && inp->inp_lport == lport &&
inp->inp_faddr.s_addr == faddr->s_addr &&
inp->inp_laddr.s_addr == laddr->s_addr &&
rtable_l2(inp->inp_rtableid) == rdomain) {
break;
}
}
if (inp != NULL) {
/*
* Move this PCB to the head of hash chain so that
* repeated accesses are quicker. This is analogous to
* the historic single-entry PCB cache.
*/
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
}
return (inp);
}
int
in_pcbresize(struct inpcbtable *table, int hashsize)
{
u_long nmask, nlmask;
int osize;
void *nhashtbl, *nlhashtbl, *ohashtbl, *olhashtbl;
struct inpcb *inp;
MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
ohashtbl = table->inpt_hashtbl;
olhashtbl = table->inpt_lhashtbl;
osize = table->inpt_size;
nhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nmask);
if (nhashtbl == NULL)
return ENOBUFS;
nlhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nlmask);
if (nlhashtbl == NULL) {
hashfree(nhashtbl, hashsize, M_PCB);
return ENOBUFS;
}
table->inpt_hashtbl = nhashtbl;
table->inpt_lhashtbl = nlhashtbl;
table->inpt_mask = nmask;
table->inpt_lmask = nlmask;
table->inpt_size = hashsize;
arc4random_buf(&table->inpt_key, sizeof(table->inpt_key));
arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey));
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
LIST_REMOVE(inp, inp_lhash);
LIST_REMOVE(inp, inp_hash);
in_pcbhash_insert(inp);
}
hashfree(ohashtbl, osize, M_PCB);
hashfree(olhashtbl, osize, M_PCB);
return (0);
}
#ifdef DIAGNOSTIC
int in_pcbnotifymiss = 0;
#endif
/*
* The in(6)_pcblookup functions are used to locate connected sockets
* quickly:
* faddr.fport <-> laddr.lport
* No wildcard matching is done so that listening sockets are not found.
* If the functions return NULL in(6)_pcblookup_listen can be used to
* find a listening/bound socket that may accept the connection.
* After those two lookups no other are necessary.
*/
struct inpcb *
in_pcblookup(struct inpcbtable *table, struct in_addr faddr,
u_int fport, struct in_addr laddr, u_int lport, u_int rtable)
{
struct inpcb *inp;
u_int rdomain;
rdomain = rtable_l2(rtable);
mtx_enter(&table->inpt_mtx);
inp = in_pcbhash_lookup(table, rdomain, &faddr, fport, &laddr, lport);
in_pcbref(inp);
mtx_leave(&table->inpt_mtx);
#ifdef DIAGNOSTIC
if (inp == NULL && in_pcbnotifymiss) {
printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n",
__func__, ntohl(faddr.s_addr), ntohs(fport),
ntohl(laddr.s_addr), ntohs(lport), rdomain);
}
#endif
return (inp);
}
/*
* The in(6)_pcblookup_listen functions are used to locate listening
* sockets quickly. These are sockets with an unspecified foreign address
* and port:
* *.* <-> laddr.lport
* *.* <-> *.lport
*/
struct inpcb *
in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
u_int lport_arg, struct mbuf *m, u_int rtable)
{
const struct in_addr *key1, *key2;
struct inpcb *inp;
u_int16_t lport = lport_arg;
u_int rdomain;
key1 = &laddr;
key2 = &zeroin_addr;
#if NPF > 0
if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
struct pf_divert *divert;
divert = pf_find_divert(m);
KASSERT(divert != NULL);
switch (divert->type) {
case PF_DIVERT_TO:
key1 = key2 = &divert->addr.v4;
lport = divert->port;
break;
case PF_DIVERT_REPLY:
return (NULL);
default:
panic("%s: unknown divert type %d, mbuf %p, divert %p",
__func__, divert->type, m, divert);
}
} else if (m && m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST) {
/*
* Redirected connections should not be treated the same
* as connections directed to 127.0.0.0/8 since localhost
* can only be accessed from the host itself.
* For example portmap(8) grants more permissions for
* connections to the socket bound to 127.0.0.1 than
* to the * socket.
*/
key1 = &zeroin_addr;
key2 = &laddr;
}
#endif
rdomain = rtable_l2(rtable);
mtx_enter(&table->inpt_mtx);
inp = in_pcbhash_lookup(table, rdomain, &zeroin_addr, 0, key1, lport);
if (inp == NULL && key1->s_addr != key2->s_addr) {
inp = in_pcbhash_lookup(table, rdomain,
&zeroin_addr, 0, key2, lport);
}
in_pcbref(inp);
mtx_leave(&table->inpt_mtx);
#ifdef DIAGNOSTIC
if (inp == NULL && in_pcbnotifymiss) {
printf("%s: laddr=%08x lport=%d rdom=%u\n",
__func__, ntohl(laddr.s_addr), ntohs(lport), rdomain);
}
#endif
return (inp);
}
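/*
 * Illustrative sketch of the two-step lookup described above (an assumption
 * about a typical transport input path; ip, th, m and rtable are assumed
 * locals, not names from this file):
 *
 *	inp = in_pcblookup(table, ip->ip_src, th->th_sport,
 *	    ip->ip_dst, th->th_dport, rtable);
 *	if (inp == NULL)
 *		inp = in_pcblookup_listen(table, ip->ip_dst, th->th_dport,
 *		    m, rtable);
 *	...
 *	in_pcbunref(inp);
 */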
/* $OpenBSD: ffs_vnops.c,v 1.100 2022/06/26 05:20:43 visa Exp $ */
/* $NetBSD: ffs_vnops.c,v 1.7 1996/05/11 18:27:24 mycroft Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_vnops.c 8.10 (Berkeley) 8/10/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/event.h>
#include <sys/specdev.h>
#include <miscfs/fifofs/fifo.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
const struct vops ffs_vops = {
.vop_lookup = ufs_lookup,
.vop_create = ufs_create,
.vop_mknod = ufs_mknod,
.vop_open = ufs_open,
.vop_close = ufs_close,
.vop_access = ufs_access,
.vop_getattr = ufs_getattr,
.vop_setattr = ufs_setattr,
.vop_read = ffs_read,
.vop_write = ffs_write,
.vop_ioctl = ufs_ioctl,
.vop_kqfilter = ufs_kqfilter,
.vop_revoke = vop_generic_revoke,
.vop_fsync = ffs_fsync,
.vop_remove = ufs_remove,
.vop_link = ufs_link,
.vop_rename = ufs_rename,
.vop_mkdir = ufs_mkdir,
.vop_rmdir = ufs_rmdir,
.vop_symlink = ufs_symlink,
.vop_readdir = ufs_readdir,
.vop_readlink = ufs_readlink,
.vop_abortop = vop_generic_abortop,
.vop_inactive = ufs_inactive,
.vop_reclaim = ffs_reclaim,
.vop_lock = ufs_lock,
.vop_unlock = ufs_unlock,
.vop_bmap = ufs_bmap,
.vop_strategy = ufs_strategy,
.vop_print = ufs_print,
.vop_islocked = ufs_islocked,
.vop_pathconf = ufs_pathconf,
.vop_advlock = ufs_advlock,
.vop_bwrite = vop_generic_bwrite
};
const struct vops ffs_specvops = {
.vop_close = ufsspec_close,
.vop_access = ufs_access,
.vop_getattr = ufs_getattr,
.vop_setattr = ufs_setattr,
.vop_read = ufsspec_read,
.vop_write = ufsspec_write,
.vop_fsync = ffs_fsync,
.vop_inactive = ufs_inactive,
.vop_reclaim = ffs_reclaim,
.vop_lock = ufs_lock,
.vop_unlock = ufs_unlock,
.vop_print = ufs_print,
.vop_islocked = ufs_islocked,
/* XXX: Keep in sync with spec_vops */
.vop_lookup = vop_generic_lookup,
.vop_create = vop_generic_badop,
.vop_mknod = vop_generic_badop,
.vop_open = spec_open,
.vop_ioctl = spec_ioctl,
.vop_kqfilter = spec_kqfilter,
.vop_revoke = vop_generic_revoke,
.vop_remove = vop_generic_badop,
.vop_link = vop_generic_badop,
.vop_rename = vop_generic_badop,
.vop_mkdir = vop_generic_badop,
.vop_rmdir = vop_generic_badop,
.vop_symlink = vop_generic_badop,
.vop_readdir = vop_generic_badop,
.vop_readlink = vop_generic_badop,
.vop_abortop = vop_generic_badop,
.vop_bmap = vop_generic_bmap,
.vop_strategy = spec_strategy,
.vop_pathconf = spec_pathconf,
.vop_advlock = spec_advlock,
.vop_bwrite = vop_generic_bwrite,
};
#ifdef FIFO
const struct vops ffs_fifovops = {
.vop_close = ufsfifo_close,
.vop_access = ufs_access,
.vop_getattr = ufs_getattr,
.vop_setattr = ufs_setattr,
.vop_read = ufsfifo_read,
.vop_write = ufsfifo_write,
.vop_fsync = ffs_fsync,
.vop_inactive = ufs_inactive,
.vop_reclaim = ffsfifo_reclaim,
.vop_lock = ufs_lock,
.vop_unlock = ufs_unlock,
.vop_print = ufs_print,
.vop_islocked = ufs_islocked,
.vop_bwrite = vop_generic_bwrite,
/* XXX: Keep in sync with fifo_vops */
.vop_lookup = vop_generic_lookup,
.vop_create = vop_generic_badop,
.vop_mknod = vop_generic_badop,
.vop_open = fifo_open,
.vop_ioctl = fifo_ioctl,
.vop_kqfilter = fifo_kqfilter,
.vop_revoke = vop_generic_revoke,
.vop_remove = vop_generic_badop,
.vop_link = vop_generic_badop,
.vop_rename = vop_generic_badop,
.vop_mkdir = vop_generic_badop,
.vop_rmdir = vop_generic_badop,
.vop_symlink = vop_generic_badop,
.vop_readdir = vop_generic_badop,
.vop_readlink = vop_generic_badop,
.vop_abortop = vop_generic_badop,
.vop_bmap = vop_generic_bmap,
.vop_strategy = vop_generic_badop,
.vop_pathconf = fifo_pathconf,
.vop_advlock = fifo_advlock
};
#endif /* FIFO */
/*
* Vnode op for reading.
*/
int
ffs_read(void *v)
{
struct vop_read_args *ap = v;
struct vnode *vp;
struct inode *ip;
struct uio *uio;
struct fs *fs;
struct buf *bp;
daddr_t lbn, nextlbn;
off_t bytesinfile;
int size, xfersize, blkoffset;
mode_t mode;
int error;
vp = ap->a_vp;
ip = VTOI(vp);
mode = DIP(ip, mode);
uio = ap->a_uio;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
panic("ffs_read: mode"); if (vp->v_type == VLNK) { if (DIP(ip, size) < ip->i_ump->um_maxsymlinklen || (ip->i_ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0))
panic("ffs_read: short symlink");
} else if (vp->v_type != VREG && vp->v_type != VDIR)
panic("ffs_read: type %d", vp->v_type);
#endif
fs = ip->i_fs;
if (uio->uio_offset < 0)
return (EINVAL);
if (uio->uio_resid == 0)
return (0);
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = DIP(ip, size) - uio->uio_offset) <= 0)
break;
lbn = lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
size = fs->fs_bsize; /* WAS blksize(fs, ip, lbn); */
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (bytesinfile < xfersize)
xfersize = bytesinfile;
if (lblktosize(fs, nextlbn) >= DIP(ip, size))
error = bread(vp, lbn, size, &bp);
else if (lbn - 1 == ip->i_ci.ci_lastr ||
uio->uio_resid > xfersize) {
error = bread_cluster(vp, lbn, size, &bp);
} else
error = bread(vp, lbn, size, &bp);
if (error)
break;
ip->i_ci.ci_lastr = lbn;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < xfersize) {
if (size == 0)
break;
xfersize = size;
}
error = uiomove(bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp);
}
if (bp != NULL)
brelse(bp);
if (!(vp->v_mount->mnt_flag & MNT_NOATIME) ||
(ip->i_flag & (IN_CHANGE | IN_UPDATE))) {
ip->i_flag |= IN_ACCESS;
}
return (error);
}
/*
* Vnode op for writing.
*/
int
ffs_write(void *v)
{
struct vop_write_args *ap = v;
struct vnode *vp;
struct uio *uio;
struct inode *ip;
struct fs *fs;
struct buf *bp;
daddr_t lbn;
off_t osize;
int blkoffset, error, extended, flags, ioflag, size, xfersize;
size_t resid;
ssize_t overrun;
extended = 0;
ioflag = ap->a_ioflag;
uio = ap->a_uio;
vp = ap->a_vp;
ip = VTOI(vp);
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_WRITE)
panic("ffs_write: mode");
#endif
/*
* If writing 0 bytes, succeed and do not change
* update time or file offset (standards compliance)
*/
if (uio->uio_resid == 0)
return (0);
switch (vp->v_type) {
case VREG:
if (ioflag & IO_APPEND)
uio->uio_offset = DIP(ip, size);
if ((DIP(ip, flags) & APPEND) && uio->uio_offset != DIP(ip, size))
return (EPERM);
/* FALLTHROUGH */
case VLNK:
break;
case VDIR:
if ((ioflag & IO_SYNC) == 0)
panic("ffs_write: nonsync dir write");
break;
default:
panic("ffs_write: type %d", vp->v_type);
}
fs = ip->i_fs;
if (uio->uio_offset < 0 ||
(u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
return (EFBIG);
/* do the filesize rlimit check */
if ((error = vn_fsizechk(vp, uio, ioflag, &overrun)))
return (error);
resid = uio->uio_resid;
osize = DIP(ip, size);
flags = ioflag & IO_SYNC ? B_SYNC : 0;
for (error = 0; uio->uio_resid > 0;) {
lbn = lblkno(fs, uio->uio_offset);
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (fs->fs_bsize > xfersize)
flags |= B_CLRBUF;
else
flags &= ~B_CLRBUF;
if ((error = UFS_BUF_ALLOC(ip, uio->uio_offset, xfersize,
ap->a_cred, flags, &bp)) != 0)
break;
if (uio->uio_offset + xfersize > DIP(ip, size)) {
DIP_ASSIGN(ip, size, uio->uio_offset + xfersize);
uvm_vnp_setsize(vp, DIP(ip, size));
extended = 1;
}
(void)uvm_vnp_uncache(vp);
size = blksize(fs, ip, lbn) - bp->b_resid;
if (size < xfersize)
xfersize = size;
error = uiomove(bp->b_data + blkoffset, xfersize, uio);
/*
* If the buffer is not already filled and we encounter an
* error while trying to fill it, we have to clear out any
* garbage data from the pages instantiated for the buffer.
* If we do not, a failed uiomove() during a write can leave
* the prior contents of the pages exposed to a userland mmap.
*
* Note that we don't need to clear buffers that were
* allocated with the B_CLRBUF flag set.
*/
if (error != 0 && !(flags & B_CLRBUF))
memset(bp->b_data + blkoffset, 0, xfersize);
if (ioflag & IO_NOCACHE)
bp->b_flags |= B_NOCACHE;
if (ioflag & IO_SYNC)
(void)bwrite(bp);
else if (xfersize + blkoffset == fs->fs_bsize) {
bawrite(bp);
} else
bdwrite(bp);
if (error || xfersize == 0)
break;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
/*
* If we successfully wrote any data, and we are not the superuser
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0 &&
!vnoperm(vp))
DIP_ASSIGN(ip, mode, DIP(ip, mode) & ~(ISUID | ISGID));
if (resid > uio->uio_resid)
VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
if (error) {
if (ioflag & IO_UNIT) {
(void)UFS_TRUNCATE(ip, osize,
ioflag & IO_SYNC, ap->a_cred);
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
}
} else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
error = UFS_UPDATE(ip, 1);
}
/* correct the result for writes clamped by vn_fsizechk() */
uio->uio_resid += overrun;
return (error);
}
/*
* Synch an open file.
*/
int
ffs_fsync(void *v)
{
struct vop_fsync_args *ap = v;
struct vnode *vp = ap->a_vp;
struct buf *bp, *nbp;
int s, error, passes, skipmeta;
if (vp->v_type == VBLK && vp->v_specmountpoint != NULL &&
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP))
softdep_fsync_mountdev(vp, ap->a_waitfor);
/*
* Flush all dirty buffers associated with a vnode.
*/
passes = NIADDR + 1;
skipmeta = 0;
if (ap->a_waitfor == MNT_WAIT)
skipmeta = 1;
s = splbio();
loop:
LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
bp->b_flags &= ~B_SCANNED;
}
LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) {
/*
* Reasons to skip this buffer: it has already been considered
* on this pass, this pass is the first time through on a
* synchronous flush request and the buffer being considered
* is metadata, the buffer has dependencies that will cause
* it to be redirtied and it has not already been deferred,
* or it is already being written.
*/
if (bp->b_flags & (B_BUSY | B_SCANNED))
continue;
if ((bp->b_flags & B_DELWRI) == 0)
panic("ffs_fsync: not dirty"); if (skipmeta && bp->b_lblkno < 0)
continue;
if (ap->a_waitfor != MNT_WAIT &&
LIST_FIRST(&bp->b_dep) != NULL &&
(bp->b_flags & B_DEFERRED) == 0 &&
buf_countdeps(bp, 0, 1)) {
bp->b_flags |= B_DEFERRED;
continue;
}
bremfree(bp);
buf_acquire(bp);
bp->b_flags |= B_SCANNED;
splx(s);
/*
* On our final pass through, do all I/O synchronously
* so that we can find out if our flush is failing
* because of write errors.
*/
if (passes > 0 || ap->a_waitfor != MNT_WAIT)
(void) bawrite(bp);
else if ((error = bwrite(bp)) != 0)
return (error);
s = splbio();
/*
* Since we may have slept during the I/O, we need
* to start from a known point.
*/
nbp = LIST_FIRST(&vp->v_dirtyblkhd);
}
if (skipmeta) {
skipmeta = 0;
goto loop;
}
if (ap->a_waitfor == MNT_WAIT) {
vwaitforio(vp, 0, "ffs_fsync", INFSLP);
/*
* Ensure that any filesystem metadata associated
* with the vnode has been written.
*/
splx(s);
if ((error = softdep_sync_metadata(ap)) != 0)
return (error);
s = splbio();
if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
/*
* Block devices associated with filesystems may
* have new I/O requests posted for them even if
* the vnode is locked, so no amount of trying will
* get them clean. Thus we give block devices a
* good effort, then just give up. For all other file
* types, go around and try again until it is clean.
*/
if (passes > 0) {
passes -= 1;
goto loop;
}
#ifdef DIAGNOSTIC
if (vp->v_type != VBLK)
vprint("ffs_fsync: dirty", vp);
#endif
}
}
splx(s);
return (UFS_UPDATE(VTOI(vp), ap->a_waitfor == MNT_WAIT));
}
/*
* Reclaim an inode so that it can be used for other purposes.
*/
int
ffs_reclaim(void *v)
{
struct vop_reclaim_args *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
int error;
if ((error = ufs_reclaim(vp)) != 0)
return (error);
if (ip->i_din1 != NULL) {
#ifdef FFS2
if (ip->i_ump->um_fstype == UM_UFS2)
pool_put(&ffs_dinode2_pool, ip->i_din2);
else
#endif
pool_put(&ffs_dinode1_pool, ip->i_din1);
}
pool_put(&ffs_ino_pool, ip);
vp->v_data = NULL;
return (0);
}
#ifdef FIFO
int
ffsfifo_reclaim(void *v)
{
fifo_reclaim(v);
return (ffs_reclaim(v));
}
#endif
/* $OpenBSD: ppp_tty.c,v 1.54 2022/01/02 22:36:04 jsg Exp $ */
/* $NetBSD: ppp_tty.c,v 1.12 1997/03/24 21:23:10 christos Exp $ */
/*
* ppp_tty.c - Point-to-Point Protocol (PPP) driver for asynchronous
* tty devices.
*
* Copyright (c) 1984-2000 Carnegie Mellon University. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The name "Carnegie Mellon University" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For permission or any legal
* details, please contact
* Office of Technology Transfer
* Carnegie Mellon University
* 5000 Forbes Avenue
* Pittsburgh, PA 15213-3890
* (412) 268-4387, fax: (412) 268-7395
* tech-transfer@andrew.cmu.edu
*
* 4. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by Computing Services
* at Carnegie Mellon University (http://www.cmu.edu/computing/)."
*
* CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Based on:
* @(#)if_sl.c 7.6.1.2 (Berkeley) 2/15/89
*
* Copyright (c) 1987 Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms are permitted
* provided that the above copyright notice and this paragraph are
* duplicated in all such forms and that any documentation,
* advertising materials, and other materials related to such
* distribution and use acknowledge that the software was developed
* by the University of California, Berkeley. The name of the
* University may not be used to endorse or promote products derived
* from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*
* Serial Line interface
*
* Rick Adams
* Center for Seismic Studies
* 1300 N 17th Street, Suite 1450
* Arlington, Virginia 22209
* (703)276-7900
* rick@seismo.ARPA
* seismo!rick
*
* Pounded on heavily by Chris Torek (chris@mimsy.umd.edu, umcp-cs!chris).
* Converted to 4.3BSD Beta by Chris Torek.
* Other changes made at Berkeley, based in part on code by Kirk Smith.
*
* Converted to 4.3BSD+ 386BSD by Brad Parker (brad@cayman.com)
* Added VJ tcp header compression; more unified ioctls
*
* Extensively modified by Paul Mackerras (paulus@cs.anu.edu.au).
* Cleaned up a lot of the mbuf-related code to fix bugs that
* caused system crashes and packet corruption. Changed pppstart
* so that it doesn't just give up with a collision if the whole
* packet doesn't fit in the output ring buffer.
*
* Added priority queueing for interactive IP packets, following
* the model of if_sl.c, plus hooks for bpf.
* Paul Mackerras (paulus@cs.anu.edu.au).
*/
/* from if_sl.c,v 1.11 84/10/04 12:54:47 rick Exp */
/* from NetBSD: if_ppp.c,v 1.15.2.2 1994/07/28 05:17:58 cgd Exp */
#include "ppp.h"
#if NPPP > 0
#define VJC
#define PPP_COMPRESS
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/timeout.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/tty.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/systm.h>
#include <sys/rwlock.h>
#include <sys/pool.h>
#include <net/if.h>
#include <net/if_var.h>
#ifdef VJC
#include <netinet/in.h>
#include <netinet/ip.h>
#include <net/slcompress.h>
#endif
#include <net/bpf.h>
#include <net/ppp_defs.h>
#include <net/if_ppp.h>
#include <net/if_pppvar.h>
int pppstart_internal(struct tty *tp, int);
u_int16_t pppfcs(u_int16_t fcs, u_char *cp, int len);
void pppasyncstart(struct ppp_softc *);
void pppasyncctlp(struct ppp_softc *);
void pppasyncrelinq(struct ppp_softc *);
void ppp_timeout(void *);
void ppppkt(struct ppp_softc *sc);
void pppdumpb(u_char *b, int l);
void ppplogchar(struct ppp_softc *, int);
struct rwlock ppp_pkt_init = RWLOCK_INITIALIZER("ppppktini");
struct pool ppp_pkts;
#define PKT_MAXLEN(_sc) ((_sc)->sc_mru + PPP_HDRLEN + PPP_FCSLEN)
/*
* Does c need to be escaped?
*/
#define ESCAPE_P(c) (sc->sc_asyncmap[(c) >> 5] & (1 << ((c) & 0x1F)))
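/*
 * Worked example of the map layout: ESCAPE_P(c) tests bit (c & 0x1f)
 * of 32-bit word (c >> 5).  With the transmit map set up in pppopen()
 * below (sc_asyncmap[0] = 0xffffffff, sc_asyncmap[3] = 0x60000000),
 * c = 0x11 (XON) selects word 0 bit 17, so all of 0x00-0x1f is
 * escaped; c = 0x7d selects word 3 bit 29 and c = 0x7e word 3 bit 30,
 * which is why PPPIOCSXASYNCMAP forces asyncmap[3] |= 0x60000000 and
 * clears word 2 bit 30 (0x5e) further down.
 */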
/*
* Procedures for using an async tty interface for PPP.
*/
/* This is a NetBSD-1.0 or later kernel. */
#define CCOUNT(q) ((q)->c_cc)
/*
* Line specific open routine for async tty devices.
* Attach the given tty to the first available ppp unit.
* Called from device open routine or ttioctl.
*/
int
pppopen(dev_t dev, struct tty *tp, struct proc *p)
{
struct ppp_softc *sc;
int error, s;
if ((error = suser(p)) != 0)
return (error);
rw_enter_write(&ppp_pkt_init);
if (ppp_pkts.pr_size == 0) {
extern const struct kmem_pa_mode kp_dma_contig;
pool_init(&ppp_pkts, sizeof(struct ppp_pkt), 0,
IPL_TTY, 0, "ppppkts", NULL); /* IPL_SOFTTTY */
pool_set_constraints(&ppp_pkts, &kp_dma_contig);
}
rw_exit_write(&ppp_pkt_init);
s = spltty();
if (tp->t_line == PPPDISC) {
sc = (struct ppp_softc *) tp->t_sc;
if (sc != NULL && sc->sc_devp == (void *) tp) {
splx(s);
return (0);
}
}
if ((sc = pppalloc(p->p_p->ps_pid)) == NULL) {
splx(s);
return ENXIO;
}
if (sc->sc_relinq)
(*sc->sc_relinq)(sc); /* get previous owner to relinquish the unit */
timeout_set(&sc->sc_timo, ppp_timeout, sc);
sc->sc_ilen = 0;
sc->sc_pkt = NULL;
bzero(sc->sc_asyncmap, sizeof(sc->sc_asyncmap));
sc->sc_asyncmap[0] = 0xffffffff;
sc->sc_asyncmap[3] = 0x60000000;
sc->sc_rasyncmap = 0;
sc->sc_devp = (void *) tp;
sc->sc_start = pppasyncstart;
sc->sc_ctlp = pppasyncctlp;
sc->sc_relinq = pppasyncrelinq;
sc->sc_outm = NULL;
ppppkt(sc);
sc->sc_if.if_flags |= IFF_RUNNING;
sc->sc_if.if_baudrate = tp->t_ospeed;
tp->t_sc = (caddr_t) sc;
ttyflush(tp, FREAD | FWRITE);
splx(s);
return (0);
}
/*
* Line specific close routine, called from device close routine
* and from ttioctl.
* Detach the tty from the ppp unit.
* Mimics part of ttyclose().
*/
int
pppclose(struct tty *tp, int flag, struct proc *p)
{
struct ppp_softc *sc;
int s;
s = spltty();
ttyflush(tp, FREAD|FWRITE);
tp->t_line = 0;
sc = (struct ppp_softc *) tp->t_sc;
if (sc != NULL) {
tp->t_sc = NULL;
if (tp == (struct tty *) sc->sc_devp) {
pppasyncrelinq(sc);
pppdealloc(sc);
}
}
splx(s);
return 0;
}
/*
* Relinquish the interface unit to another device.
*/
void
pppasyncrelinq(struct ppp_softc *sc)
{
int s;
KERNEL_LOCK();
s = spltty();
m_freem(sc->sc_outm);
sc->sc_outm = NULL;
if (sc->sc_pkt != NULL) {
ppp_pkt_free(sc->sc_pkt);
sc->sc_pkt = sc->sc_pktc = NULL;
}
if (sc->sc_flags & SC_TIMEOUT) {
timeout_del(&sc->sc_timo);
sc->sc_flags &= ~SC_TIMEOUT;
}
splx(s);
KERNEL_UNLOCK();
}
/*
* Line specific (tty) read routine.
*/
int
pppread(struct tty *tp, struct uio *uio, int flag)
{
struct ppp_softc *sc = (struct ppp_softc *)tp->t_sc;
struct mbuf *m, *m0;
int s;
int error = 0;
if (sc == NULL)
return 0;
/*
* Loop waiting for input, checking that nothing disastrous
* happens in the meantime.
*/
s = spltty();
for (;;) {
if (tp != (struct tty *) sc->sc_devp || tp->t_line != PPPDISC) {
splx(s);
return 0;
}
/* Get the packet from the input queue */
m0 = mq_dequeue(&sc->sc_inq);
if (m0 != NULL)
break;
if ((tp->t_state & TS_CARR_ON) == 0 && (tp->t_cflag & CLOCAL) == 0
&& (tp->t_state & TS_ISOPEN)) {
splx(s);
return 0; /* end of file */
}
if (tp->t_state & TS_ASYNC || flag & IO_NDELAY) {
splx(s);
return (EWOULDBLOCK);
}
error = ttysleep(tp, (caddr_t)&tp->t_rawq, TTIPRI|PCATCH, ttyin);
if (error) {
splx(s);
return error;
}
}
/* Pull place-holder byte out of canonical queue */
getc(&tp->t_canq);
splx(s);
for (m = m0; m && uio->uio_resid; m = m->m_next)
if ((error = uiomove(mtod(m, u_char *), m->m_len, uio)) != 0)
break;
m_freem(m0);
return (error);
}
/*
* Line specific (tty) write routine.
*/
int
pppwrite(struct tty *tp, struct uio *uio, int flag)
{
struct ppp_softc *sc = (struct ppp_softc *)tp->t_sc;
struct mbuf *m, *m0, **mp;
struct sockaddr dst;
u_int len;
int error;
if ((tp->t_state & TS_CARR_ON) == 0 && (tp->t_cflag & CLOCAL) == 0)
return 0; /* wrote 0 bytes */
if (tp->t_line != PPPDISC)
return (EINVAL);
if (sc == NULL || tp != (struct tty *) sc->sc_devp)
return EIO;
if (uio->uio_resid > sc->sc_if.if_mtu + PPP_HDRLEN ||
uio->uio_resid < PPP_HDRLEN)
return (EMSGSIZE);
for (mp = &m0; uio->uio_resid; mp = &m->m_next) {
if (mp == &m0) {
MGETHDR(m, M_WAIT, MT_DATA);
m->m_pkthdr.len = uio->uio_resid - PPP_HDRLEN;
m->m_pkthdr.ph_ifidx = 0;
} else
MGET(m, M_WAIT, MT_DATA);
*mp = m;
m->m_len = 0;
if (uio->uio_resid >= MCLBYTES / 2)
MCLGET(m, M_DONTWAIT);
len = m_trailingspace(m);
if (len > uio->uio_resid)
len = uio->uio_resid;
if ((error = uiomove(mtod(m, u_char *), len, uio)) != 0) {
m_freem(m0);
return (error);
}
m->m_len = len;
}
dst.sa_family = AF_UNSPEC;
bcopy(mtod(m0, u_char *), dst.sa_data, PPP_HDRLEN);
m0->m_data += PPP_HDRLEN;
m0->m_len -= PPP_HDRLEN;
return sc->sc_if.if_output(&sc->sc_if, m0, &dst, NULL);
}
/*
* Line specific (tty) ioctl routine.
* This discipline requires that tty device drivers call
* the line specific l_ioctl routine from their ioctl routines.
*/
int
ppptioctl(struct tty *tp, u_long cmd, caddr_t data, int flag, struct proc *p)
{
struct ppp_softc *sc = (struct ppp_softc *) tp->t_sc;
int error, s;
if (sc == NULL || tp != (struct tty *) sc->sc_devp)
return -1;
error = 0;
switch (cmd) {
case PPPIOCSASYNCMAP:
if ((error = suser(p)) != 0)
break;
sc->sc_asyncmap[0] = *(u_int *)data;
break;
case PPPIOCGASYNCMAP:
*(u_int *)data = sc->sc_asyncmap[0];
break;
case PPPIOCSRASYNCMAP:
if ((error = suser(p)) != 0)
break;
sc->sc_rasyncmap = *(u_int *)data;
break;
case PPPIOCGRASYNCMAP:
*(u_int *)data = sc->sc_rasyncmap;
break;
case PPPIOCSXASYNCMAP:
if ((error = suser(p)) != 0)
break;
s = spltty();
bcopy(data, sc->sc_asyncmap, sizeof(sc->sc_asyncmap));
sc->sc_asyncmap[1] = 0; /* mustn't escape 0x20 - 0x3f */
sc->sc_asyncmap[2] &= ~0x40000000; /* mustn't escape 0x5e */
sc->sc_asyncmap[3] |= 0x60000000; /* must escape 0x7d, 0x7e */
splx(s);
break;
case PPPIOCGXASYNCMAP:
bcopy(sc->sc_asyncmap, data, sizeof(sc->sc_asyncmap));
break;
default:
NET_LOCK();
error = pppioctl(sc, cmd, data, flag, p);
NET_UNLOCK();
if (error == 0 && cmd == PPPIOCSMRU)
ppppkt(sc);
}
return error;
}
/*
* FCS lookup table as calculated by genfcstab.
*/
static u_int16_t fcstab[256] = {
0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf,
0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7,
0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e,
0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876,
0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd,
0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5,
0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c,
0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974,
0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb,
0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3,
0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a,
0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72,
0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9,
0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1,
0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738,
0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70,
0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7,
0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff,
0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036,
0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e,
0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5,
0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd,
0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134,
0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c,
0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3,
0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb,
0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232,
0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a,
0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1,
0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9,
0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330,
0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78
};
/*
* Calculate a new FCS given the current FCS and the new data.
*/
u_int16_t
pppfcs(u_int16_t fcs, u_char *cp, int len)
{
while (len--)
fcs = PPP_FCS(fcs, *cp++);
return (fcs);
}
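/*
 * Illustrative sketch of how the transmit and receive paths below use
 * pppfcs(); PPP_INITFCS (0xffff) and PPP_GOODFCS (0xf0b8) come from
 * <net/ppp_defs.h>:
 *
 *	u_int16_t fcs = pppfcs(PPP_INITFCS, data, len);
 *	fcs = ~fcs;		complement before transmission
 *	append fcs & 0xff, then (fcs >> 8) & 0xff, to the frame
 *
 * A receiver that runs pppfcs() over the data plus those two trailer
 * bytes gets PPP_GOODFCS back when the frame arrived intact, which is
 * exactly the check made in pppinput() below.
 */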
/*
* This gets called from pppoutput when a new packet is
* put on a queue.
*/
void
pppasyncstart(struct ppp_softc *sc)
{
struct tty *tp = (struct tty *) sc->sc_devp;
struct mbuf *m;
int len;
u_char *start, *stop, *cp;
int n, ndone, done, idle;
struct mbuf *m2;
int s;
KERNEL_LOCK();
idle = 0;
while (CCOUNT(&tp->t_outq) < tp->t_hiwat) {
/*
* See if we have an existing packet partly sent.
* If not, get a new packet and start sending it.
*/
m = sc->sc_outm;
if (m == NULL) {
/*
* Get another packet to be sent.
*/
m = ppp_dequeue(sc);
if (m == NULL) {
idle = 1;
break;
}
/*
* The extra PPP_FLAG will start up a new packet, and thus
* will flush any accumulated garbage. We do this whenever
* the line may have been idle for some time.
*/
if (CCOUNT(&tp->t_outq) == 0) {
++sc->sc_stats.ppp_obytes;
(void) putc(PPP_FLAG, &tp->t_outq);
}
/* Calculate the FCS for the first mbuf's worth. */
sc->sc_outfcs = pppfcs(PPP_INITFCS, mtod(m, u_char *), m->m_len);
}
for (;;) {
start = mtod(m, u_char *);
len = m->m_len;
stop = start + len;
while (len > 0) {
/*
* Find out how many bytes in the string we can
* handle without doing something special.
*/
for (cp = start; cp < stop; cp++)
if (ESCAPE_P(*cp))
break;
n = cp - start;
if (n) {
/* NetBSD (0.9 or later), 4.3-Reno or similar. */
ndone = n - b_to_q(start, n, &tp->t_outq);
len -= ndone;
start += ndone;
sc->sc_stats.ppp_obytes += ndone;
if (ndone < n)
break; /* packet doesn't fit */
}
/*
* If there are characters left in the mbuf,
* the first one must be special.
* Put it out in a different form.
*/
if (len) {
s = spltty();
if (putc(PPP_ESCAPE, &tp->t_outq)) {
splx(s);
break;
}
if (putc(*start ^ PPP_TRANS, &tp->t_outq)) {
(void) unputc(&tp->t_outq);
splx(s);
break;
}
splx(s);
sc->sc_stats.ppp_obytes += 2;
start++;
len--;
}
}
/*
* If we didn't empty this mbuf, remember where we're up to.
* If we emptied the last mbuf, try to add the FCS and closing
* flag, and if we can't, leave sc_outm pointing to m, but with
* m->m_len == 0, to remind us to output the FCS and flag later.
*/
done = len == 0;
if (done && m->m_next == NULL) {
u_char *p, *q;
int c;
u_char endseq[8];
/*
* We may have to escape the bytes in the FCS.
*/
p = endseq;
c = ~sc->sc_outfcs & 0xFF;
if (ESCAPE_P(c)) {
*p++ = PPP_ESCAPE;
*p++ = c ^ PPP_TRANS;
} else
*p++ = c;
c = (~sc->sc_outfcs >> 8) & 0xFF;
if (ESCAPE_P(c)) {
*p++ = PPP_ESCAPE;
*p++ = c ^ PPP_TRANS;
} else
*p++ = c;
*p++ = PPP_FLAG;
/*
* Try to output the FCS and flag. If the bytes
* don't all fit, back out.
*/
s = spltty();
for (q = endseq; q < p; ++q)
if (putc(*q, &tp->t_outq)) {
done = 0;
for (; q > endseq; --q)
unputc(&tp->t_outq);
break;
}
splx(s);
if (done)
sc->sc_stats.ppp_obytes += q - endseq;
}
if (!done) {
/* remember where we got to */
m->m_data = start;
m->m_len = len;
break;
}
/* Finished with this mbuf; free it and move on. */
m2 = m_free(m);
m = m2;
if (m == NULL) {
/* Finished a packet */
break;
}
sc->sc_outfcs = pppfcs(sc->sc_outfcs, mtod(m, u_char *), m->m_len);
}
/*
* If m == NULL, we have finished a packet.
* If m != NULL, we've either done as much work this time
* as we need to, or else we've filled up the output queue.
*/
sc->sc_outm = m;
if (m)
break;
}
/* Call pppstart to start output again if necessary. */
s = spltty();
pppstart_internal(tp, 0);
/*
* This timeout is needed for operation on a pseudo-tty,
* because the pty code doesn't call pppstart after it has
* drained the t_outq.
*/
if (!idle && (sc->sc_flags & SC_TIMEOUT) == 0) {
timeout_add(&sc->sc_timo, 1);
sc->sc_flags |= SC_TIMEOUT;
}
splx(s);
KERNEL_UNLOCK();
}
/*
* This gets called when a received packet is placed on
* the inq.
*/
void
pppasyncctlp(struct ppp_softc *sc)
{
struct tty *tp;
int s;
KERNEL_LOCK();
/* Put a placeholder byte in canq for ttpoll()/ttnread(). */
s = spltty();
tp = (struct tty *) sc->sc_devp;
putc(0, &tp->t_canq);
ttwakeup(tp);
splx(s);
KERNEL_UNLOCK();
}
/*
* Start output on async tty interface. If the transmit queue
* has drained sufficiently, arrange for pppasyncstart to be
* called later.
*/
int
pppstart_internal(struct tty *tp, int force)
{
struct ppp_softc *sc = (struct ppp_softc *) tp->t_sc;
/*
* If there is stuff in the output queue, send it now.
* We are being called in lieu of ttstart and must do what it would.
*/
if (tp->t_oproc != NULL)
(*tp->t_oproc)(tp);
/*
* If the transmit queue has drained and the tty has not hung up
* or been disconnected from the ppp unit, then tell if_ppp.c that
* we need more output.
*/
if ((CCOUNT(&tp->t_outq) < tp->t_lowat || force)
&& !((tp->t_state & TS_CARR_ON) == 0 && (tp->t_cflag & CLOCAL) == 0)
&& sc != NULL && tp == (struct tty *) sc->sc_devp) {
ppp_restart(sc);
}
return 0;
}
int
pppstart(struct tty *tp)
{
return pppstart_internal(tp, 0);
}
/*
* Timeout routine - try to start some more output.
*/
void
ppp_timeout(void *x)
{
struct ppp_softc *sc = (struct ppp_softc *) x;
struct tty *tp = (struct tty *) sc->sc_devp;
int s;
s = spltty();
sc->sc_flags &= ~SC_TIMEOUT;
pppstart_internal(tp, 1);
splx(s);
}
/*
* Allocate enough ppp_pkt buffers to handle the current MRU.
*/
void
ppppkt(struct ppp_softc *sc)
{
struct ppp_pkt **pktp, *pkt;
int len;
int s;
s = spltty();
pktp = &sc->sc_pkt;
for (len = PKT_MAXLEN(sc); len > 0; len -= sizeof(pkt->p_buf)) {
pkt = *pktp;
if (pkt == NULL) {
pkt = pool_get(&ppp_pkts, PR_NOWAIT);
if (pkt == NULL)
break;
PKT_NEXT(pkt) = NULL;
PKT_PREV(pkt) = *pktp;
PKT_LEN(pkt) = 0;
*pktp = pkt;
}
pktp = &PKT_NEXT(pkt);
}
splx(s);
}
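/*
 * Hedged example: with the usual default MRU of 1500, PKT_MAXLEN is
 * 1500 + PPP_HDRLEN + PPP_FCSLEN = 1506 bytes, so the loop above links
 * however many ppp_pkt buffers (each holding sizeof(pkt->p_buf) bytes)
 * are needed to cover that, and pppinput() calls ppppkt() again to top
 * the chain back up after each received frame is passed to ppppktin().
 */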
void
ppp_pkt_free(struct ppp_pkt *pkt)
{
struct ppp_pkt *next;
while (pkt != NULL) {
next = PKT_NEXT(pkt);
pool_put(&ppp_pkts, pkt);
pkt = next;
}
}
/*
* tty interface receiver interrupt.
*/
static unsigned int paritytab[8] = {
0x96696996, 0x69969669, 0x69969669, 0x96696996,
0x69969669, 0x96696996, 0x96696996, 0x69969669
};
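/*
 * paritytab packs the parity of every 8-bit value into eight 32-bit
 * words: bit (c & 0x1f) of word (c >> 5) is set when c has an odd
 * number of one bits.  For example c = 0x01 hits word 0 bit 1 of
 * 0x96696996, which is set (odd parity), while c = 0x03 hits bit 3,
 * which is clear (even parity).  pppinput() uses this to record
 * SC_RCV_ODDP or SC_RCV_EVNP for each received character.
 */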
int
pppinput(int c, struct tty *tp)
{
struct ppp_softc *sc;
struct ppp_pkt *pkt;
int ilen, s;
sc = (struct ppp_softc *) tp->t_sc;
if (sc == NULL || tp != (struct tty *) sc->sc_devp)
return 0;
++tk_nin;
++sc->sc_stats.ppp_ibytes;
if (c & TTY_FE) {
/* framing error or overrun on this char - abort packet */
if (sc->sc_flags & SC_DEBUG)
printf("%s: bad char %x\n", sc->sc_if.if_xname, c);
goto flush;
}
c &= 0xff;
/*
* Handle software flow control of output.
*/
if (tp->t_iflag & IXON) {
if (c == tp->t_cc[VSTOP] && tp->t_cc[VSTOP] != _POSIX_VDISABLE) {
if ((tp->t_state & TS_TTSTOP) == 0) {
tp->t_state |= TS_TTSTOP;
(*cdevsw[major(tp->t_dev)].d_stop)(tp, 0);
}
return 0;
}
if (c == tp->t_cc[VSTART] && tp->t_cc[VSTART] != _POSIX_VDISABLE) {
tp->t_state &= ~TS_TTSTOP;
if (tp->t_oproc != NULL)
(*tp->t_oproc)(tp);
return 0;
}
}
s = spltty();
if (c & 0x80)
sc->sc_flags |= SC_RCV_B7_1;
else
sc->sc_flags |= SC_RCV_B7_0;
if (paritytab[c >> 5] & (1 << (c & 0x1F)))
sc->sc_flags |= SC_RCV_ODDP;
else
sc->sc_flags |= SC_RCV_EVNP;
splx(s);
if (sc->sc_flags & SC_LOG_RAWIN)
ppplogchar(sc, c);
if (c == PPP_FLAG) {
ilen = sc->sc_ilen;
sc->sc_ilen = 0;
if (sc->sc_rawin_count > 0)
ppplogchar(sc, -1);
/*
* If SC_ESCAPED is set, then we've seen the packet
* abort sequence "}~".
*/
if (sc->sc_flags & (SC_FLUSH | SC_ESCAPED)
|| (ilen > 0 && sc->sc_fcs != PPP_GOODFCS)) {
s = spltty();
sc->sc_flags |= SC_PKTLOST; /* note the dropped packet */
if ((sc->sc_flags & (SC_FLUSH | SC_ESCAPED)) == 0){
if (sc->sc_flags & SC_DEBUG)
printf("%s: bad fcs %x\n", sc->sc_if.if_xname,
sc->sc_fcs);
sc->sc_if.if_ierrors++;
sc->sc_stats.ppp_ierrors++;
} else
sc->sc_flags &= ~(SC_FLUSH | SC_ESCAPED);
splx(s);
return 0;
}
if (ilen < PPP_HDRLEN + PPP_FCSLEN) {
if (ilen) {
if (sc->sc_flags & SC_DEBUG)
printf("%s: too short (%d)\n", sc->sc_if.if_xname, ilen);
s = spltty();
sc->sc_if.if_ierrors++;
sc->sc_stats.ppp_ierrors++;
sc->sc_flags |= SC_PKTLOST;
splx(s);
}
return 0;
}
/*
* Remove FCS trailer.
*/
ilen -= 2;
pkt = sc->sc_pktc;
if (--PKT_LEN(pkt) == 0) {
pkt = PKT_PREV(pkt);
sc->sc_pktc = pkt;
}
PKT_LEN(pkt)--;
/* excise this mbuf chain */
pkt = sc->sc_pkt;
sc->sc_pkt = sc->sc_pktc = PKT_NEXT(sc->sc_pktc);
PKT_NEXT(pkt) = NULL;
ppppktin(sc, pkt, sc->sc_flags & SC_PKTLOST);
if (sc->sc_flags & SC_PKTLOST) {
s = spltty();
sc->sc_flags &= ~SC_PKTLOST;
splx(s);
}
ppppkt(sc);
return 0;
}
if (sc->sc_flags & SC_FLUSH) {
if (sc->sc_flags & SC_LOG_FLUSH)
ppplogchar(sc, c);
return 0;
}
if (c < 0x20 && (sc->sc_rasyncmap & (1 << c)))
return 0;
s = spltty();
if (sc->sc_flags & SC_ESCAPED) {
sc->sc_flags &= ~SC_ESCAPED;
c ^= PPP_TRANS;
} else if (c == PPP_ESCAPE) {
sc->sc_flags |= SC_ESCAPED;
splx(s);
return 0;
}
splx(s);
/*
* Initialize buffer on first octet received.
* First octet could be address or protocol (when compressing
* address/control).
* Second octet is control.
* Third octet is first or second (when compressing protocol)
* octet of protocol.
* Fourth octet is second octet of protocol.
*/
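/*
 * Hedged example of the header forms handled here: an uncompressed LCP
 * frame begins ff 03 c0 21 (address PPP_ALLSTATIONS, control PPP_UI,
 * then the 16-bit protocol).  With address/control compression the
 * peer sends c0 21 directly and the code below re-inserts ff 03; with
 * protocol compression an IP packet may begin with the single odd
 * octet 21 (for 0x0021), and the zero high octet is re-inserted at
 * sc_ilen == 2.
 */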
if (sc->sc_ilen == 0) {
/* reset the first input mbuf */
if (sc->sc_pkt == NULL) {
ppppkt(sc);
if (sc->sc_pkt == NULL) {
if (sc->sc_flags & SC_DEBUG)
printf("%s: no input mbufs!\n", sc->sc_if.if_xname);
goto flush;
}
}
pkt = sc->sc_pkt;
PKT_LEN(pkt) = 0;
sc->sc_pktc = pkt;
sc->sc_pktp = pkt->p_buf;
sc->sc_fcs = PPP_INITFCS;
if (c != PPP_ALLSTATIONS) {
if (sc->sc_flags & SC_REJ_COMP_AC) {
if (sc->sc_flags & SC_DEBUG)
printf("%s: garbage received: 0x%x (need 0xFF)\n",
sc->sc_if.if_xname, c);
goto flush;
}
*sc->sc_pktp++ = PPP_ALLSTATIONS;
*sc->sc_pktp++ = PPP_UI;
sc->sc_ilen += 2;
PKT_LEN(pkt) += 2;
}
}
if (sc->sc_ilen == 1 && c != PPP_UI) {
if (sc->sc_flags & SC_DEBUG)
printf("%s: missing UI (0x3), got 0x%x\n",
sc->sc_if.if_xname, c);
goto flush;
}
if (sc->sc_ilen == 2 && (c & 1) == 1) {
/* a compressed protocol */
*sc->sc_pktp++ = 0;
sc->sc_ilen++;
PKT_LEN(sc->sc_pktc)++;
}
if (sc->sc_ilen == 3 && (c & 1) == 0) {
if (sc->sc_flags & SC_DEBUG)
printf("%s: bad protocol %x\n", sc->sc_if.if_xname,
(sc->sc_pktp[-1] << 8) + c);
goto flush;
}
/* packet beyond configured mru? */
if (++sc->sc_ilen > PKT_MAXLEN(sc)) {
if (sc->sc_flags & SC_DEBUG)
printf("%s: packet too big\n", sc->sc_if.if_xname);
goto flush;
}
/* is this packet full? */
pkt = sc->sc_pktc;
if (PKT_LEN(pkt) >= sizeof(pkt->p_buf)) {
if (PKT_NEXT(pkt) == NULL) {
ppppkt(sc);
if (PKT_NEXT(pkt) == NULL) {
if (sc->sc_flags & SC_DEBUG)
printf("%s: too few input packets!\n", sc->sc_if.if_xname);
goto flush;
}
}
sc->sc_pktc = pkt = PKT_NEXT(pkt);
PKT_LEN(pkt) = 0;
sc->sc_pktp = pkt->p_buf;
}
++PKT_LEN(pkt);
*sc->sc_pktp++ = c;
sc->sc_fcs = PPP_FCS(sc->sc_fcs, c);
return 0;
flush:
if (!(sc->sc_flags & SC_FLUSH)) {
s = spltty();
sc->sc_if.if_ierrors++;
sc->sc_stats.ppp_ierrors++;
sc->sc_flags |= SC_FLUSH;
splx(s);
if (sc->sc_flags & SC_LOG_FLUSH)
ppplogchar(sc, c);
}
return 0;
}
#define MAX_DUMP_BYTES 128
void
ppplogchar(struct ppp_softc *sc, int c)
{
if (c >= 0)
sc->sc_rawin[sc->sc_rawin_count++] = c;
if (sc->sc_rawin_count >= sizeof(sc->sc_rawin)
|| (c < 0 && sc->sc_rawin_count > 0)) {
printf("%s input: ", sc->sc_if.if_xname);
pppdumpb(sc->sc_rawin, sc->sc_rawin_count);
sc->sc_rawin_count = 0;
}
}
void
pppdumpb(u_char *b, int l)
{
char buf[3*MAX_DUMP_BYTES+4];
char *bp = buf;
static char digits[] = "0123456789abcdef";
while (l--) {
if (bp >= buf + sizeof(buf) - 3) {
*bp++ = '>';
break;
}
*bp++ = digits[*b >> 4]; /* convert byte to ascii hex */
*bp++ = digits[*b++ & 0xf];
*bp++ = ' ';
}
*bp = 0;
printf("%s\n", buf);
}
#endif /* NPPP > 0 */
/* $OpenBSD: uvm_page.c,v 1.170 2022/08/29 02:58:13 jsg Exp $ */
/* $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.c 8.3 (Berkeley) 3/21/94
* from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_page.c: page ops.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/smr.h>
#include <uvm/uvm.h>
/*
* for object trees
*/
RBT_GENERATE(uvm_objtree, vm_page, objt, uvm_pagecmp);
int
uvm_pagecmp(const struct vm_page *a, const struct vm_page *b)
{
return a->offset < b->offset ? -1 : a->offset > b->offset;
}
/*
* global vars... XXXCDC: move to uvm. structure.
*/
/*
* physical memory config is stored in vm_physmem.
*/
struct vm_physseg vm_physmem[VM_PHYSSEG_MAX]; /* XXXCDC: uvm.physmem */
int vm_nphysseg = 0; /* XXXCDC: uvm.nphysseg */
/*
* Some supported CPUs in a given architecture don't support all
* of the things necessary to do idle page zero'ing efficiently.
* We therefore provide a way to disable it from machdep code here.
*/
/*
* local variables
*/
/*
* these variables record the values returned by vm_page_bootstrap,
* for debugging purposes. The implementation of uvm_pageboot_alloc
* and pmap_startup here also uses them internally.
*/
static vaddr_t virtual_space_start;
static vaddr_t virtual_space_end;
/*
* local prototypes
*/
static void uvm_pageinsert(struct vm_page *);
static void uvm_pageremove(struct vm_page *);
int uvm_page_owner_locked_p(struct vm_page *);
/*
* inline functions
*/
/*
* uvm_pageinsert: insert a page in the object
*
* => caller must lock object
* => call should have already set pg's object and offset pointers
* and bumped the version counter
*/
static inline void
uvm_pageinsert(struct vm_page *pg)
{
struct vm_page *dupe;
KASSERT(UVM_OBJ_IS_DUMMY(pg->uobject) ||
rw_write_held(pg->uobject->vmobjlock));
KASSERT((pg->pg_flags & PG_TABLED) == 0);
dupe = RBT_INSERT(uvm_objtree, &pg->uobject->memt, pg);
/* not allowed to insert over another page */
KASSERT(dupe == NULL);
atomic_setbits_int(&pg->pg_flags, PG_TABLED);
pg->uobject->uo_npages++;
}
/*
* uvm_page_remove: remove page from object
*
* => caller must lock object
*/
static inline void
uvm_pageremove(struct vm_page *pg)
{
KASSERT(UVM_OBJ_IS_DUMMY(pg->uobject) ||
rw_write_held(pg->uobject->vmobjlock));
KASSERT(pg->pg_flags & PG_TABLED);
RBT_REMOVE(uvm_objtree, &pg->uobject->memt, pg);
atomic_clearbits_int(&pg->pg_flags, PG_TABLED);
pg->uobject->uo_npages--;
pg->uobject = NULL;
pg->pg_version++;
}
/*
* uvm_page_init: init the page system. called from uvm_init().
*
* => we return the range of kernel virtual memory in kvm_startp/kvm_endp
*/
void
uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
{
vsize_t freepages, pagecount, n;
vm_page_t pagearray, curpg;
int lcv, i;
paddr_t paddr, pgno;
struct vm_physseg *seg;
/*
* init the page queues and page queue locks
*/
TAILQ_INIT(&uvm.page_active);
TAILQ_INIT(&uvm.page_inactive);
mtx_init(&uvm.pageqlock, IPL_VM);
mtx_init(&uvm.fpageqlock, IPL_VM);
uvm_pmr_init();
/*
* allocate vm_page structures.
*/
/*
* sanity check:
* before calling this function the MD code is expected to register
* some free RAM with the uvm_page_physload() function. our job
* now is to allocate vm_page structures for this memory.
*/
if (vm_nphysseg == 0)
panic("uvm_page_bootstrap: no memory pre-allocated");
/*
* first calculate the number of free pages...
*
* note that we use start/end rather than avail_start/avail_end.
* this allows us to allocate extra vm_page structures in case we
* want to return some memory to the pool after booting.
*/
freepages = 0;
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg ; lcv++, seg++)
freepages += (seg->end - seg->start);
/*
* we now know we have (PAGE_SIZE * freepages) bytes of memory we can
* use. for each page of memory we use we need a vm_page structure.
* thus, the total number of pages we can use is the total size of
* the memory divided by the PAGE_SIZE plus the size of the vm_page
* structure. we add one to freepages as a fudge factor to avoid
* truncation errors (since we can only allocate in terms of whole
* pages).
*/
pagecount = (((paddr_t)freepages + 1) << PAGE_SHIFT) /
(PAGE_SIZE + sizeof(struct vm_page));
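/*
 * Hedged example of the ratio above: with a 4096-byte page and a
 * struct vm_page on the order of 80-100 bytes (the exact size is
 * MD- and option-dependent), each managed page costs a little over
 * 4 KB, so roughly 98% of the free memory counted above ends up as
 * usable pages; the "+ 1" fudge keeps the division from losing a
 * page to truncation.
 */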
pagearray = (vm_page_t)uvm_pageboot_alloc(pagecount *
sizeof(struct vm_page));
memset(pagearray, 0, pagecount * sizeof(struct vm_page));
/* init the vm_page structures and put them in the correct place. */
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg ; lcv++, seg++) {
n = seg->end - seg->start;
if (n > pagecount) {
panic("uvm_page_init: lost %ld page(s) in init",
(long)(n - pagecount));
/* XXXCDC: shouldn't happen? */
/* n = pagecount; */
}
/* set up page array pointers */
seg->pgs = pagearray;
pagearray += n;
pagecount -= n;
seg->lastpg = seg->pgs + (n - 1);
/* init and free vm_pages (we've already zeroed them) */
pgno = seg->start;
paddr = ptoa(pgno);
for (i = 0, curpg = seg->pgs; i < n;
i++, curpg++, pgno++, paddr += PAGE_SIZE) {
curpg->phys_addr = paddr;
VM_MDPAGE_INIT(curpg);
if (pgno >= seg->avail_start &&
pgno < seg->avail_end) {
uvmexp.npages++;
}
}
/* Add pages to free pool. */
uvm_pmr_freepages(&seg->pgs[seg->avail_start - seg->start],
seg->avail_end - seg->avail_start);
}
/*
* pass up the values of virtual_space_start and
* virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
* layers of the VM.
*/
*kvm_startp = round_page(virtual_space_start);
*kvm_endp = trunc_page(virtual_space_end);
/* init locks for kernel threads */
mtx_init(&uvm.aiodoned_lock, IPL_BIO);
/*
* init reserve thresholds
* XXXCDC - values may need adjusting
*/
uvmexp.reserve_pagedaemon = 4;
uvmexp.reserve_kernel = 8;
uvmexp.anonminpct = 10;
uvmexp.vnodeminpct = 10;
uvmexp.vtextminpct = 5;
uvmexp.anonmin = uvmexp.anonminpct * 256 / 100;
uvmexp.vnodemin = uvmexp.vnodeminpct * 256 / 100;
uvmexp.vtextmin = uvmexp.vtextminpct * 256 / 100;
uvm.page_init_done = TRUE;
}
/*
* uvm_setpagesize: set the page size
*
* => sets page_shift and page_mask from uvmexp.pagesize.
*/
void
uvm_setpagesize(void)
{
if (uvmexp.pagesize == 0)
uvmexp.pagesize = DEFAULT_PAGE_SIZE;
uvmexp.pagemask = uvmexp.pagesize - 1;
if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
panic("uvm_setpagesize: page size not a power of two");
for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
break;
}
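/*
 * For example, uvmexp.pagesize == 4096 yields pagemask 0xfff and
 * pageshift 12; a size that is not a power of two shares a bit with
 * its mask (e.g. 4097 & 4096 != 0) and trips the panic above.
 */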
/*
* uvm_pageboot_alloc: steal memory from physmem for bootstrapping
*/
vaddr_t
uvm_pageboot_alloc(vsize_t size)
{
#if defined(PMAP_STEAL_MEMORY)
vaddr_t addr;
/*
* defer bootstrap allocation to MD code (it may want to allocate
* from a direct-mapped segment). pmap_steal_memory should round
* off virtual_space_start/virtual_space_end.
*/
addr = pmap_steal_memory(size, &virtual_space_start,
&virtual_space_end);
return addr;
#else /* !PMAP_STEAL_MEMORY */
static boolean_t initialized = FALSE;
vaddr_t addr, vaddr;
paddr_t paddr;
/* round to page size */
size = round_page(size);
/* on first call to this function, initialize ourselves. */
if (initialized == FALSE) {
pmap_virtual_space(&virtual_space_start, &virtual_space_end);
/* round it the way we like it */
virtual_space_start = round_page(virtual_space_start);
virtual_space_end = trunc_page(virtual_space_end);
initialized = TRUE;
}
/* allocate virtual memory for this request */
if (virtual_space_start == virtual_space_end ||
(virtual_space_end - virtual_space_start) < size)
panic("uvm_pageboot_alloc: out of virtual space");
addr = virtual_space_start;
#ifdef PMAP_GROWKERNEL
/*
* If the kernel pmap can't map the requested space,
* then allocate more resources for it.
*/
if (uvm_maxkaddr < (addr + size)) {
uvm_maxkaddr = pmap_growkernel(addr + size);
if (uvm_maxkaddr < (addr + size))
panic("uvm_pageboot_alloc: pmap_growkernel() failed");
}
#endif
virtual_space_start += size;
/* allocate and mapin physical pages to back new virtual pages */
for (vaddr = round_page(addr) ; vaddr < addr + size ;
vaddr += PAGE_SIZE) {
if (!uvm_page_physget(&paddr))
panic("uvm_pageboot_alloc: out of memory");
/*
* Note this memory is no longer managed, so using
* pmap_kenter is safe.
*/
pmap_kenter_pa(vaddr, paddr, PROT_READ | PROT_WRITE);
}
pmap_update(pmap_kernel());
return addr;
#endif /* PMAP_STEAL_MEMORY */
}
#if !defined(PMAP_STEAL_MEMORY)
/*
* uvm_page_physget: "steal" one page from the vm_physmem structure.
*
* => attempt to allocate it off the end of a segment in which the "avail"
* values match the start/end values. if we can't do that, then we
* will advance both values (making them equal, and removing some
* vm_page structures from the non-avail area).
* => return false if out of memory.
*/
boolean_t
uvm_page_physget(paddr_t *paddrp)
{
int lcv;
struct vm_physseg *seg;
/* pass 1: try allocating from a matching end */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) || \
(VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
for (lcv = vm_nphysseg - 1, seg = vm_physmem + lcv; lcv >= 0;
lcv--, seg--)
#else
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg ; lcv++, seg++)
#endif
{
if (uvm.page_init_done == TRUE)
panic("uvm_page_physget: called _after_ bootstrap");
/* try from front */
if (seg->avail_start == seg->start &&
seg->avail_start < seg->avail_end) {
*paddrp = ptoa(seg->avail_start);
seg->avail_start++;
seg->start++;
/* nothing left? nuke it */
if (seg->avail_start == seg->end) {
if (vm_nphysseg == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysseg--;
for (; lcv < vm_nphysseg; lcv++, seg++)
/* structure copy */
seg[0] = seg[1];
}
return TRUE;
}
/* try from rear */
if (seg->avail_end == seg->end &&
seg->avail_start < seg->avail_end) {
*paddrp = ptoa(seg->avail_end - 1);
seg->avail_end--;
seg->end--;
/* nothing left? nuke it */
if (seg->avail_end == seg->start) {
if (vm_nphysseg == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysseg--;
for (; lcv < vm_nphysseg ; lcv++, seg++)
/* structure copy */
seg[0] = seg[1];
}
return TRUE;
}
}
/* pass2: forget about matching ends, just allocate something */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) || \
(VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
for (lcv = vm_nphysseg - 1, seg = vm_physmem + lcv; lcv >= 0;
lcv--, seg--)
#else
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg ; lcv++, seg++)
#endif
{
/* any room in this bank? */
if (seg->avail_start >= seg->avail_end)
continue; /* nope */
*paddrp = ptoa(seg->avail_start);
seg->avail_start++;
/* truncate! */
seg->start = seg->avail_start;
/* nothing left? nuke it */
if (seg->avail_start == seg->end) {
if (vm_nphysseg == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysseg--;
for (; lcv < vm_nphysseg ; lcv++, seg++)
/* structure copy */
seg[0] = seg[1];
}
return TRUE;
}
return FALSE; /* whoops! */
}
#endif /* PMAP_STEAL_MEMORY */
/*
* uvm_page_physload: load physical memory into VM system
*
* => all args are PFs
* => all pages in start/end get vm_page structures
* => areas marked by avail_start/avail_end get added to the free page pool
* => we are limited to VM_PHYSSEG_MAX physical memory segments
*/
void
uvm_page_physload(paddr_t start, paddr_t end, paddr_t avail_start,
paddr_t avail_end, int flags)
{
int preload, lcv;
psize_t npages;
struct vm_page *pgs;
struct vm_physseg *ps, *seg;
#ifdef DIAGNOSTIC
if (uvmexp.pagesize == 0)
panic("uvm_page_physload: page size not set!");
if (start >= end)
panic("uvm_page_physload: start >= end");
#endif
/* do we have room? */
if (vm_nphysseg == VM_PHYSSEG_MAX) {
printf("uvm_page_physload: unable to load physical memory "
"segment\n");
printf("\t%d segments allocated, ignoring 0x%llx -> 0x%llx\n",
VM_PHYSSEG_MAX, (long long)start, (long long)end);
printf("\tincrease VM_PHYSSEG_MAX\n");
return;
}
/*
* check to see if this is a "preload" (i.e. uvm_mem_init hasn't been
* called yet, so malloc is not available).
*/
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg; lcv++, seg++) {
if (seg->pgs)
break;
}
preload = (lcv == vm_nphysseg);
/* if VM is already running, attempt to malloc() vm_page structures */
if (!preload) {
/*
* XXXCDC: need some sort of lockout for this case
* right now it is only used by devices so it should be alright.
*/
paddr_t paddr;
npages = end - start; /* # of pages */
pgs = km_alloc(round_page(npages * sizeof(*pgs)),
&kv_any, &kp_zero, &kd_waitok);
if (pgs == NULL) {
printf("uvm_page_physload: can not malloc vm_page "
"structs for segment\n");
printf("\tignoring 0x%lx -> 0x%lx\n", start, end);
return;
}
/* init phys_addr and free pages, XXX uvmexp.npages */
for (lcv = 0, paddr = ptoa(start); lcv < npages;
lcv++, paddr += PAGE_SIZE) {
pgs[lcv].phys_addr = paddr;
VM_MDPAGE_INIT(&pgs[lcv]);
if (atop(paddr) >= avail_start &&
atop(paddr) < avail_end) {
if (flags & PHYSLOAD_DEVICE) {
atomic_setbits_int(&pgs[lcv].pg_flags,
PG_DEV);
pgs[lcv].wire_count = 1;
} else {
#if defined(VM_PHYSSEG_NOADD)
panic("uvm_page_physload: tried to add RAM after vm_mem_init");
#endif
}
}
}
/* Add pages to free pool. */
if ((flags & PHYSLOAD_DEVICE) == 0) {
uvm_pmr_freepages(&pgs[avail_start - start],
avail_end - avail_start);
}
/* XXXCDC: need hook to tell pmap to rebuild pv_list, etc... */
} else {
/* gcc complains if these don't get init'd */
pgs = NULL;
npages = 0;
}
/* now insert us in the proper place in vm_physmem[] */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_RANDOM)
/* random: put it at the end (easy!) */
ps = &vm_physmem[vm_nphysseg];
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
{
int x;
/* sort by address for binary search */
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg; lcv++, seg++)
if (start < seg->start)
break;
ps = seg;
/* move back other entries, if necessary ... */
for (x = vm_nphysseg, seg = vm_physmem + x - 1; x > lcv;
x--, seg--)
/* structure copy */
seg[1] = seg[0];
}
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
{
int x;
/* sort by largest segment first */
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg; lcv++, seg++)
if ((end - start) >
(seg->end - seg->start))
break;
ps = &vm_physmem[lcv];
/* move back other entries, if necessary ... */
for (x = vm_nphysseg, seg = vm_physmem + x - 1; x > lcv;
x--, seg--)
/* structure copy */
seg[1] = seg[0];
}
#else
panic("uvm_page_physload: unknown physseg strategy selected!");
#endif
ps->start = start;
ps->end = end;
ps->avail_start = avail_start;
ps->avail_end = avail_end;
if (preload) {
ps->pgs = NULL;
} else {
ps->pgs = pgs;
ps->lastpg = pgs + npages - 1;
}
vm_nphysseg++;
return;
}
#ifdef DDB /* XXXCDC: TMP TMP TMP DEBUG DEBUG DEBUG */
void uvm_page_physdump(void); /* SHUT UP GCC */
/* call from DDB */
void
uvm_page_physdump(void)
{
int lcv;
struct vm_physseg *seg;
printf("uvm_page_physdump: physical memory config [segs=%d of %d]:\n",
vm_nphysseg, VM_PHYSSEG_MAX);
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg ; lcv++, seg++)
printf("0x%llx->0x%llx [0x%llx->0x%llx]\n",
(long long)seg->start,
(long long)seg->end,
(long long)seg->avail_start,
(long long)seg->avail_end);
printf("STRATEGY = ");
switch (VM_PHYSSEG_STRAT) {
case VM_PSTRAT_RANDOM: printf("RANDOM\n"); break;
case VM_PSTRAT_BSEARCH: printf("BSEARCH\n"); break;
case VM_PSTRAT_BIGFIRST: printf("BIGFIRST\n"); break;
default: printf("<<UNKNOWN>>!!!!\n");
}
}
#endif
void
uvm_shutdown(void)
{
#ifdef UVM_SWAP_ENCRYPT
uvm_swap_finicrypt_all();
#endif
smr_flush();
}
/*
* Perform insert of a given page in the specified anon of obj.
* This is basically uvm_pagealloc, but with the page already given.
*/
void
uvm_pagealloc_pg(struct vm_page *pg, struct uvm_object *obj, voff_t off,
struct vm_anon *anon)
{
int flags;
KASSERT(obj == NULL || anon == NULL);
KASSERT(anon == NULL || off == 0);
KASSERT(off == trunc_page(off));
KASSERT(obj == NULL || UVM_OBJ_IS_DUMMY(obj) ||
rw_write_held(obj->vmobjlock));
KASSERT(anon == NULL || anon->an_lock == NULL ||
rw_write_held(anon->an_lock));
flags = PG_BUSY | PG_FAKE;
pg->offset = off;
pg->uobject = obj;
pg->uanon = anon;
KASSERT(uvm_page_owner_locked_p(pg));
if (anon) {
anon->an_page = pg;
flags |= PQ_ANON;
} else if (obj)
uvm_pageinsert(pg);
atomic_setbits_int(&pg->pg_flags, flags);
#if defined(UVM_PAGE_TRKOWN)
pg->owner_tag = NULL;
#endif
UVM_PAGE_OWN(pg, "new alloc");
}
/*
* uvm_pglistalloc: allocate a list of pages
*
* => allocated pages are placed at the tail of rlist. rlist is
* assumed to be properly initialized by caller.
* => returns 0 on success or errno on failure
* => doesn't take into account clean non-busy pages on inactive list
* that could be used(?)
* => params:
* size the size of the allocation, rounded to page size.
* low the low address of the allowed allocation range.
* high the high address of the allowed allocation range.
* alignment memory must be aligned to this power-of-two boundary.
* boundary no segment in the allocation may cross this
* power-of-two boundary (relative to zero).
* => flags:
* UVM_PLA_NOWAIT fail if allocation fails
* UVM_PLA_WAITOK wait for memory to become avail
* UVM_PLA_ZERO return zeroed memory
*/
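/*
 * Hedged usage sketch (the error variable is illustrative): allocate
 * four zeroed, DMA-reachable pages onto a caller-initialized pglist.
 *
 *	struct pglist pl;
 *
 *	TAILQ_INIT(&pl);
 *	error = uvm_pglistalloc(4 * PAGE_SIZE, dma_constraint.ucr_low,
 *	    dma_constraint.ucr_high, 0, 0, &pl, 4,
 *	    UVM_PLA_WAITOK | UVM_PLA_ZERO);
 *
 * uvm_pagealloc_multi() further down follows this pattern for the
 * buffer cache.
 */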
int
uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
paddr_t boundary, struct pglist *rlist, int nsegs, int flags)
{
KASSERT((alignment & (alignment - 1)) == 0);
KASSERT((boundary & (boundary - 1)) == 0);
KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
if (size == 0)
return EINVAL;
size = atop(round_page(size));
/*
* XXX uvm_pglistalloc is currently only used for kernel
* objects. Unlike the checks in uvm_pagealloc, below, here
* we are always allowed to use the kernel reserve.
*/
flags |= UVM_PLA_USERESERVE;
if ((high & PAGE_MASK) != PAGE_MASK) {
printf("uvm_pglistalloc: Upper boundary 0x%lx "
"not on pagemask.\n", (unsigned long)high);
}
/*
* Our allocations are always page granularity, so our alignment
* must be, too.
*/
if (alignment < PAGE_SIZE)
alignment = PAGE_SIZE;
low = atop(roundup(low, alignment));
/*
* high + 1 may result in overflow, in which case high becomes 0x0,
* which is the 'don't care' value.
* The only requirement in that case is that low is also 0x0, or the
* low<high assert will fail.
*/
high = atop(high + 1);
alignment = atop(alignment);
if (boundary < PAGE_SIZE && boundary != 0)
boundary = PAGE_SIZE;
boundary = atop(boundary);
return uvm_pmr_getpages(size, low, high, alignment, boundary, nsegs,
flags, rlist);
}
/*
* uvm_pglistfree: free a list of pages
*
* => pages should already be unmapped
*/
void
uvm_pglistfree(struct pglist *list)
{
uvm_pmr_freepageq(list);
}
/*
* interface used by the buffer cache to allocate a buffer at a time.
* The pages are allocated wired in DMA accessible memory
*/
int
uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
int flags)
{
struct pglist plist;
struct vm_page *pg;
int i, r;
KASSERT(UVM_OBJ_IS_BUFCACHE(obj));
KERNEL_ASSERT_LOCKED();
TAILQ_INIT(&plist);
r = uvm_pglistalloc(size, dma_constraint.ucr_low,
dma_constraint.ucr_high, 0, 0, &plist, atop(round_page(size)),
flags);
if (r == 0) {
i = 0;
while ((pg = TAILQ_FIRST(&plist)) != NULL) {
pg->wire_count = 1;
atomic_setbits_int(&pg->pg_flags, PG_CLEAN | PG_FAKE);
KASSERT((pg->pg_flags & PG_DEV) == 0);
TAILQ_REMOVE(&plist, pg, pageq);
uvm_pagealloc_pg(pg, obj, off + ptoa(i++), NULL);
}
}
return r;
}
/*
* interface used by the buffer cache to reallocate a buffer at a time.
* The pages are reallocated wired outside the DMA accessible region.
*
*/
int
uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
int flags, struct uvm_constraint_range *where)
{
struct pglist plist;
struct vm_page *pg, *tpg;
int i, r;
voff_t offset;
KASSERT(UVM_OBJ_IS_BUFCACHE(obj));
KERNEL_ASSERT_LOCKED();
TAILQ_INIT(&plist);
if (size == 0)
panic("size 0 uvm_pagerealloc");
r = uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0,
0, &plist, atop(round_page(size)), flags);
if (r == 0) {
i = 0;
while ((pg = TAILQ_FIRST(&plist)) != NULL) {
offset = off + ptoa(i++);
tpg = uvm_pagelookup(obj, offset);
KASSERT(tpg != NULL);
pg->wire_count = 1;
atomic_setbits_int(&pg->pg_flags, PG_CLEAN | PG_FAKE);
KASSERT((pg->pg_flags & PG_DEV) == 0);
TAILQ_REMOVE(&plist, pg, pageq);
uvm_pagecopy(tpg, pg);
KASSERT(tpg->wire_count == 1);
tpg->wire_count = 0;
uvm_lock_pageq();
uvm_pagefree(tpg);
uvm_unlock_pageq();
uvm_pagealloc_pg(pg, obj, offset, NULL);
}
}
return r;
}
/*
* uvm_pagealloc: allocate vm_page from a particular free list.
*
* => return null if no pages free
* => wake up pagedaemon if number of free pages drops below low water mark
* => only one of obj or anon can be non-null
* => caller must activate/deactivate page if it is not wired.
*/
struct vm_page *
uvm_pagealloc(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
int flags)
{
struct vm_page *pg;
struct pglist pgl;
int pmr_flags;
KASSERT(obj == NULL || anon == NULL);
KASSERT(anon == NULL || off == 0);
KASSERT(off == trunc_page(off));
KASSERT(obj == NULL || UVM_OBJ_IS_DUMMY(obj) ||
rw_write_held(obj->vmobjlock));
KASSERT(anon == NULL || anon->an_lock == NULL ||
rw_write_held(anon->an_lock));
pmr_flags = UVM_PLA_NOWAIT;
/*
* We're allowed to use the kernel reserve if the page is
* being allocated to a kernel object.
*/
if ((flags & UVM_PGA_USERESERVE) ||
(obj != NULL && UVM_OBJ_IS_KERN_OBJECT(obj)))
pmr_flags |= UVM_PLA_USERESERVE;
if (flags & UVM_PGA_ZERO)
pmr_flags |= UVM_PLA_ZERO;
TAILQ_INIT(&pgl);
if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, pmr_flags, &pgl) != 0)
goto fail;
pg = TAILQ_FIRST(&pgl);
KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL);
uvm_pagealloc_pg(pg, obj, off, anon);
KASSERT((pg->pg_flags & PG_DEV) == 0);
if (flags & UVM_PGA_ZERO)
atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
else
atomic_setbits_int(&pg->pg_flags, PG_CLEAN);
return pg;
fail:
return NULL;
}
/*
* uvm_pagerealloc: reallocate a page from one object to another
*/
void
uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
{
/* remove it from the old object */
if (pg->uobject) {
uvm_pageremove(pg);
}
/* put it in the new object */
if (newobj) {
pg->uobject = newobj;
pg->offset = newoff;
pg->pg_version++;
uvm_pageinsert(pg);
}
}
/*
* uvm_pageclean: clean page
*
* => erase page's identity (i.e. remove from object)
* => caller must lock page queues if `pg' is managed
* => assumes all valid mappings of pg are gone
*/
void
uvm_pageclean(struct vm_page *pg)
{
u_int flags_to_clear = 0;
if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) &&
(pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject)))
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
#ifdef DEBUG
if (pg->uobject == (void *)0xdeadbeef &&
pg->uanon == (void *)0xdeadbeef) {
panic("uvm_pagefree: freeing free page %p", pg);
}
#endif
KASSERT((pg->pg_flags & PG_DEV) == 0);
KASSERT(pg->uobject == NULL || UVM_OBJ_IS_DUMMY(pg->uobject) ||
rw_write_held(pg->uobject->vmobjlock));
KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
rw_write_held(pg->uanon->an_lock));
/*
* if the page was an object page (and thus "TABLED"), remove it
* from the object.
*/
if (pg->pg_flags & PG_TABLED)
uvm_pageremove(pg);
/*
* now remove the page from the queues
*/
uvm_pagedequeue(pg);
/*
* if the page was wired, unwire it now.
*/
if (pg->wire_count) {
pg->wire_count = 0;
uvmexp.wired--;
}
if (pg->uanon) {
pg->uanon->an_page = NULL;
pg->uanon = NULL;
}
/* Clean page state bits. */
flags_to_clear |= PQ_ANON|PQ_AOBJ|PQ_ENCRYPT|PG_ZERO|PG_FAKE|PG_BUSY|
PG_RELEASED|PG_CLEAN|PG_CLEANCHK;
atomic_clearbits_int(&pg->pg_flags, flags_to_clear);
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->offset = 0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
#endif
}
/*
* uvm_pagefree: free page
*
* => erase page's identity (i.e. remove from object)
* => put page on free list
* => caller must lock page queues if `pg' is managed
* => assumes all valid mappings of pg are gone
*/
void
uvm_pagefree(struct vm_page *pg)
{
if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) &&
(pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject)))
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
uvm_pageclean(pg);
uvm_pmr_freepages(pg, 1);
}
/*
* uvm_page_unbusy: unbusy an array of pages.
*
* => pages must either all belong to the same object, or all belong to anons.
* => if pages are anon-owned, anons must have 0 refcount.
*/
void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
struct vm_page *pg;
struct uvm_object *uobj;
int i;
for (i = 0; i < npgs; i++) {
pg = pgs[i];
if (pg == NULL || pg == PGO_DONTCARE) {
continue;
}
#if notyet
/*
* XXX swap case in uvm_aio_aiodone() is not holding the lock.
*
* This isn't compatible with the PG_RELEASED anon case below.
*/
KASSERT(uvm_page_owner_locked_p(pg));
#endif
KASSERT(pg->pg_flags & PG_BUSY);
if (pg->pg_flags & PG_WANTED) {
wakeup(pg);
}
if (pg->pg_flags & PG_RELEASED) {
uobj = pg->uobject;
if (uobj != NULL) {
uvm_lock_pageq();
pmap_page_protect(pg, PROT_NONE);
/* XXX won't happen right now */
if (pg->pg_flags & PQ_AOBJ)
uao_dropswap(uobj,
pg->offset >> PAGE_SHIFT);
uvm_pagefree(pg);
uvm_unlock_pageq();
} else {
rw_enter(pg->uanon->an_lock, RW_WRITE);
uvm_anon_release(pg->uanon);
}
} else {
atomic_clearbits_int(&pg->pg_flags, PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(pg, NULL);
}
}
}
/*
* uvm_pagewait: wait for a busy page
*
* => page must be known PG_BUSY
* => object must be locked
* => object will be unlocked on return
*/
void
uvm_pagewait(struct vm_page *pg, struct rwlock *lock, const char *wmesg)
{
KASSERT(rw_lock_held(lock));
KASSERT((pg->pg_flags & PG_BUSY) != 0);
atomic_setbits_int(&pg->pg_flags, PG_WANTED);
rwsleep_nsec(pg, lock, PVM | PNORELOCK, wmesg, INFSLP);
}
#if defined(UVM_PAGE_TRKOWN)
/*
* uvm_page_own: set or release page ownership
*
* => this is a debugging function that keeps track of who sets PG_BUSY
* and where they do it. it can be used to track down problems
* such as a thread setting "PG_BUSY" and never releasing it.
* => if "tag" is NULL then we are releasing page ownership
*/
void
uvm_page_own(struct vm_page *pg, char *tag)
{
/* gain ownership? */
if (tag) {
if (pg->owner_tag) {
printf("uvm_page_own: page %p already owned "
"by thread %d [%s]\n", pg,
pg->owner, pg->owner_tag);
panic("uvm_page_own");
}
pg->owner = (curproc) ? curproc->p_tid : (pid_t) -1;
pg->owner_tag = tag;
return;
}
/* drop ownership */
if (pg->owner_tag == NULL) {
printf("uvm_page_own: dropping ownership of an non-owned "
"page (%p)\n", pg);
panic("uvm_page_own");
}
pg->owner_tag = NULL;
return;
}
#endif
/*
* when VM_PHYSSEG_MAX is 1, we can simplify these functions
*/
#if VM_PHYSSEG_MAX > 1
/*
* vm_physseg_find: find vm_physseg structure that belongs to a PA
*/
int
vm_physseg_find(paddr_t pframe, int *offp)
{
struct vm_physseg *seg;
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
/* binary search for it */
int start, len, try;
/*
* if try is too large (thus target is less than try) we reduce
* the length to trunc(len/2) [i.e. everything smaller than "try"]
*
* if the try is too small (thus target is greater than try) then
* we set the new start to be (try + 1). this means we need to
* reduce the length to (round(len/2) - 1).
*
* note "adjust" below which takes advantage of the fact that
* (round(len/2) - 1) == trunc((len - 1) / 2)
* for any value of len we may have
*/
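/*
 * Worked example: len == 5 tries the middle element (start + 2).  If
 * the target lies above it, start becomes try + 1 and len is first
 * decremented to 4, so the loop's halving leaves len == 2, which is
 * round(5/2) - 1 as the identity above promises.
 */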
for (start = 0, len = vm_nphysseg ; len != 0 ; len = len / 2) {
try = start + (len / 2); /* try in the middle */
seg = vm_physmem + try;
/* start past our try? */
if (pframe >= seg->start) {
/* was try correct? */
if (pframe < seg->end) {
if (offp)
*offp = pframe - seg->start;
return try; /* got it */
}
start = try + 1; /* next time, start here */
len--; /* "adjust" */
} else {
/*
* pframe before try, just reduce length of
* region, done in "for" loop
*/
}
}
return -1;
#else
/* linear search for it */
int lcv;
for (lcv = 0, seg = vm_physmem; lcv < vm_nphysseg ; lcv++, seg++) {
if (pframe >= seg->start && pframe < seg->end) {
if (offp)
*offp = pframe - seg->start;
return lcv; /* got it */
}
}
return -1;
#endif
}
/*
* PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
* back from an I/O mapping (ugh!). used in some MD code as well.
*/
struct vm_page *
PHYS_TO_VM_PAGE(paddr_t pa)
{
paddr_t pf = atop(pa);
int off;
int psi;
psi = vm_physseg_find(pf, &off);
return (psi == -1) ? NULL : &vm_physmem[psi].pgs[off];
}
#endif /* VM_PHYSSEG_MAX > 1 */
/*
* uvm_pagelookup: look up a page
*/
struct vm_page *
uvm_pagelookup(struct uvm_object *obj, voff_t off)
{
/* XXX if stack is too much, handroll */
struct vm_page pg;
pg.offset = off;
return RBT_FIND(uvm_objtree, &obj->memt, &pg);
}
/*
* uvm_pagewire: wire the page, thus removing it from the daemon's grasp
*
* => caller must lock page queues
*/
void
uvm_pagewire(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg));
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
if (pg->wire_count == 0) {
uvm_pagedequeue(pg);
uvmexp.wired++;
}
pg->wire_count++;
}
/*
* uvm_pageunwire: unwire the page.
*
* => activate if wire count goes to zero.
* => caller must lock page queues
*/
void
uvm_pageunwire(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg));
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
pg->wire_count--;
if (pg->wire_count == 0) {
uvm_pageactivate(pg);
uvmexp.wired--;
}
}
/*
* uvm_pagedeactivate: deactivate page -- no pmaps have access to page
*
* => caller must lock page queues
* => caller must check to make sure page is not wired
* => object that page belongs to must be locked (so we can adjust pg->flags)
*/
void
uvm_pagedeactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg));
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
if (pg->pg_flags & PQ_ACTIVE) {
TAILQ_REMOVE(&uvm.page_active, pg, pageq);
atomic_clearbits_int(&pg->pg_flags, PQ_ACTIVE);
uvmexp.active--;
}
if ((pg->pg_flags & PQ_INACTIVE) == 0) {
KASSERT(pg->wire_count == 0);
TAILQ_INSERT_TAIL(&uvm.page_inactive, pg, pageq);
atomic_setbits_int(&pg->pg_flags, PQ_INACTIVE);
uvmexp.inactive++;
pmap_clear_reference(pg);
/*
* update the "clean" bit. this isn't 100%
* accurate, and doesn't have to be. we'll
* re-sync it after we zap all mappings when
* scanning the inactive list.
*/
if ((pg->pg_flags & PG_CLEAN) != 0 &&
pmap_is_modified(pg))
atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
}
}
/*
* uvm_pageactivate: activate page
*
* => caller must lock page queues
*/
void
uvm_pageactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg));
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
uvm_pagedequeue(pg);
if (pg->wire_count == 0) {
TAILQ_INSERT_TAIL(&uvm.page_active, pg, pageq);
atomic_setbits_int(&pg->pg_flags, PQ_ACTIVE);
uvmexp.active++;
}
}
/*
* uvm_pagedequeue: remove a page from any paging queue
*/
void
uvm_pagedequeue(struct vm_page *pg)
{
if (pg->pg_flags & PQ_ACTIVE) {
TAILQ_REMOVE(&uvm.page_active, pg, pageq);
atomic_clearbits_int(&pg->pg_flags, PQ_ACTIVE);
uvmexp.active--;
}
if (pg->pg_flags & PQ_INACTIVE) {
TAILQ_REMOVE(&uvm.page_inactive, pg, pageq);
atomic_clearbits_int(&pg->pg_flags, PQ_INACTIVE);
uvmexp.inactive--;
}
}
/*
* uvm_pagezero: zero fill a page
*/
void
uvm_pagezero(struct vm_page *pg)
{
atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
pmap_zero_page(pg);
}
/*
* uvm_pagecopy: copy a page
*/
void
uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
{
atomic_clearbits_int(&dst->pg_flags, PG_CLEAN);
pmap_copy_page(src, dst);
}
/*
* uvm_page_owner_locked_p: return true if object associated with page is
* locked. this is a weak check for runtime assertions only.
*/
int
uvm_page_owner_locked_p(struct vm_page *pg)
{
if (pg->uobject != NULL) {
if (UVM_OBJ_IS_DUMMY(pg->uobject))
return 1;
return rw_write_held(pg->uobject->vmobjlock);
}
if (pg->uanon != NULL) {
return rw_write_held(pg->uanon->an_lock);
}
return 1;
}
/*
* uvm_pagecount: count the number of physical pages in the address range.
*/
psize_t
uvm_pagecount(struct uvm_constraint_range* constraint)
{
int lcv;
psize_t sz;
paddr_t low, high;
paddr_t ps_low, ps_high;
/* Algorithm uses page numbers. */
low = atop(constraint->ucr_low);
high = atop(constraint->ucr_high);
sz = 0;
for (lcv = 0; lcv < vm_nphysseg; lcv++) {
ps_low = MAX(low, vm_physmem[lcv].avail_start);
ps_high = MIN(high, vm_physmem[lcv].avail_end);
if (ps_low < ps_high)
sz += ps_high - ps_low;
}
return sz;
}
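/*
 * Hedged usage sketch for uvm_pagecount() (not compiled; the constraint
 * values and function name below are illustrative, not from this file):
 * count the managed pages falling inside the low 4GB, e.g. when sizing a
 * DMA-able allocation.
 */
#if 0
void
example_count_low_pages(void)
{
struct uvm_constraint_range low4g = { 0, 0xffffffffUL };
psize_t npages;
npages = uvm_pagecount(&low4g);
printf("pages below 4GB: %lld\n", (long long)npages);
}
#endif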
/* $OpenBSD: kern_rwlock.c,v 1.48 2022/05/10 16:56:16 bluhm Exp $ */
/*
* Copyright (c) 2002, 2003 Artur Grabowski <art@openbsd.org>
* Copyright (c) 2011 Thordur Bjornsson <thib@secnorth.net>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/limits.h>
#include <sys/atomic.h>
#include <sys/witness.h>
void rw_do_exit(struct rwlock *, unsigned long);
/* XXX - temporary measure until proc0 is properly aligned */
#define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)
/*
* Other OSes implement more sophisticated mechanisms to determine how long
* the process attempting to acquire the lock should spin. We start with
* the simplest approach: we make at most RW_SPINS attempts before giving
* up and putting the process on the sleep queue.
*/
#define RW_SPINS 1000
#ifdef MULTIPROCESSOR
#define rw_cas(p, o, n) (atomic_cas_ulong(p, o, n) != o)
#else
static inline int
rw_cas(volatile unsigned long *p, unsigned long o, unsigned long n)
{
if (*p != o)
return (1);
*p = n;
return (0);
}
#endif
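/*
 * A note on rw_cas() semantics, which are inverted relative to
 * atomic_cas_ulong(): rw_cas(p, o, n) evaluates to 0 when the word at p
 * matched o and was replaced by n (success), and to non-zero when the
 * compare failed and nothing was written. Minimal sketch of the fast-path
 * pattern used below (illustrative only):
 *
 * owner = rwl->rwl_owner;
 * if (rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR))
 *	(somebody raced us; fall back to the slow path in rw_enter())
 */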
/*
* Magic wand for lock operations. Every operation checks if certain
* flags are set and if they aren't, it increments the lock with some
* value (that might need some computing in a few cases). If the operation
* fails, we need to set certain flags while waiting for the lock.
*
* RW_WRITE The lock must be completely empty. We increment it with
* RWLOCK_WRLOCK and the proc pointer of the holder.
* Sets RWLOCK_WAIT|RWLOCK_WRWANT while waiting.
* RW_READ RWLOCK_WRLOCK|RWLOCK_WRWANT may not be set. We increment
* with RWLOCK_READ_INCR. RWLOCK_WAIT while waiting.
*/
static const struct rwlock_op {
unsigned long inc;
unsigned long check;
unsigned long wait_set;
long proc_mult;
int wait_prio;
} rw_ops[] = {
{ /* RW_WRITE */
RWLOCK_WRLOCK,
ULONG_MAX,
RWLOCK_WAIT | RWLOCK_WRWANT,
1,
PLOCK - 4
},
{ /* RW_READ */
RWLOCK_READ_INCR,
RWLOCK_WRLOCK | RWLOCK_WRWANT,
RWLOCK_WAIT,
0,
PLOCK
},
{ /* Sparse Entry. */
0,
},
{ /* RW_DOWNGRADE */
RWLOCK_READ_INCR - RWLOCK_WRLOCK,
0,
0,
-1,
PLOCK
},
};
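/*
 * Worked example of the owner-word arithmetic driven by the table above
 * (a sketch; "p" stands for curproc and the values are symbolic):
 *
 * initial: owner == 0
 * rw_enter(RW_WRITE): owner == RW_PROC(p) | RWLOCK_WRLOCK
 * rw_enter(RW_DOWNGRADE): owner += RWLOCK_READ_INCR - RWLOCK_WRLOCK - RW_PROC(p)
 *	(proc_mult == -1), leaving owner == RWLOCK_READ_INCR, i.e. one reader
 * rw_enter(RW_READ): owner += RWLOCK_READ_INCR, i.e. two readers
 */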
void
rw_enter_read(struct rwlock *rwl)
{
unsigned long owner = rwl->rwl_owner;
if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) ||
rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
rw_enter(rwl, RW_READ);
else {
membar_enter_after_atomic();
WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, NULL);
WITNESS_LOCK(&rwl->rwl_lock_obj, 0);
}
}
void
rw_enter_write(struct rwlock *rwl)
{
struct proc *p = curproc;
if (__predict_false(rw_cas(&rwl->rwl_owner, 0,
RW_PROC(p) | RWLOCK_WRLOCK)))
rw_enter(rwl, RW_WRITE);
else {
membar_enter_after_atomic();
WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
}
}
void
rw_exit_read(struct rwlock *rwl)
{
unsigned long owner;
rw_assert_rdlock(rwl);
WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
membar_exit_before_atomic();
owner = rwl->rwl_owner;
if (__predict_false((owner & RWLOCK_WAIT) ||
rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
rw_do_exit(rwl, 0);
}
void
rw_exit_write(struct rwlock *rwl)
{
unsigned long owner;
rw_assert_wrlock(rwl);
WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
membar_exit_before_atomic();
owner = rwl->rwl_owner;
if (__predict_false((owner & RWLOCK_WAIT) ||
rw_cas(&rwl->rwl_owner, owner, 0)))
rw_do_exit(rwl, RWLOCK_WRLOCK);
}
#ifdef DIAGNOSTIC
/*
* Put the diagnostic functions here to keep the main code free
* from ifdef clutter.
*/
static void
rw_enter_diag(struct rwlock *rwl, int flags)
{
switch (flags & RW_OPMASK) {
case RW_WRITE:
case RW_READ:
if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner))
panic("rw_enter: %s locking against myself",
rwl->rwl_name);
break;
case RW_DOWNGRADE:
/*
* If we're downgrading, we must hold the write lock.
*/
if ((rwl->rwl_owner & RWLOCK_WRLOCK) == 0)
panic("rw_enter: %s downgrade of non-write lock",
rwl->rwl_name);
if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner))
panic("rw_enter: %s downgrade, not holder",
rwl->rwl_name);
break;
default:
panic("rw_enter: unknown op 0x%x", flags);
}
}
#else
#define rw_enter_diag(r, f)
#endif
static void
_rw_init_flags_witness(struct rwlock *rwl, const char *name, int lo_flags,
const struct lock_type *type)
{
rwl->rwl_owner = 0;
rwl->rwl_name = name;
#ifdef WITNESS
rwl->rwl_lock_obj.lo_flags = lo_flags;
rwl->rwl_lock_obj.lo_name = name;
rwl->rwl_lock_obj.lo_type = type;
WITNESS_INIT(&rwl->rwl_lock_obj, type);
#else
(void)type;
(void)lo_flags;
#endif
}
void
_rw_init_flags(struct rwlock *rwl, const char *name, int flags,
const struct lock_type *type)
{
_rw_init_flags_witness(rwl, name, RWLOCK_LO_FLAGS(flags), type);
}
int
rw_enter(struct rwlock *rwl, int flags)
{
const struct rwlock_op *op;
struct sleep_state sls;
unsigned long inc, o;
#ifdef MULTIPROCESSOR
/*
* If process holds the kernel lock, then we want to give up on CPU
* as soon as possible so other processes waiting for the kernel lock
* can progress. Hence no spinning if we hold the kernel lock.
*/
unsigned int spin = (_kernel_lock_held()) ? 0 : RW_SPINS;
#endif
int error, prio;
#ifdef WITNESS
int lop_flags;
lop_flags = LOP_NEWORDER;
if (flags & RW_WRITE)
lop_flags |= LOP_EXCLUSIVE;
if (flags & RW_DUPOK)
lop_flags |= LOP_DUPOK;
if ((flags & RW_NOSLEEP) == 0 && (flags & RW_DOWNGRADE) == 0)
WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL);
#endif
op = &rw_ops[(flags & RW_OPMASK) - 1];
inc = op->inc + RW_PROC(curproc) * op->proc_mult;
retry:
while (__predict_false(((o = rwl->rwl_owner) & op->check) != 0)) {
unsigned long set = o | op->wait_set;
int do_sleep;
/* Avoid deadlocks after panic or in DDB */
if (panicstr || db_active)
return (0);
#ifdef MULTIPROCESSOR
/*
* It makes sense to try to spin just in case the lock
* is held by a writer.
*/
if ((o & RWLOCK_WRLOCK) && (spin != 0)) {
spin--;
CPU_BUSY_CYCLE();
continue;
}
#endif
rw_enter_diag(rwl, flags);
if (flags & RW_NOSLEEP)
return (EBUSY);
prio = op->wait_prio;
if (flags & RW_INTR)
prio |= PCATCH;
sleep_setup(&sls, rwl, prio, rwl->rwl_name, 0);
do_sleep = !rw_cas(&rwl->rwl_owner, o, set);
error = sleep_finish(&sls, do_sleep);
if ((flags & RW_INTR) &&
(error != 0))
return (error);
if (flags & RW_SLEEPFAIL)
return (EAGAIN);
}
if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
goto retry;
membar_enter_after_atomic();
/*
* If the old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we
* downgraded a write lock and there may be waiting readers; wake them
* so they can retry the lock.
*/
if (__predict_false((o & (RWLOCK_WRLOCK|RWLOCK_WAIT)) ==
(RWLOCK_WRLOCK|RWLOCK_WAIT)))
wakeup(rwl);
if (flags & RW_DOWNGRADE)
WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags);
else
WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags);
return (0);
}
void
rw_exit(struct rwlock *rwl)
{
unsigned long wrlock;
/* Avoid deadlocks after panic or in DDB */
if (panicstr || db_active)
return;
wrlock = rwl->rwl_owner & RWLOCK_WRLOCK;
if (wrlock)
rw_assert_wrlock(rwl);
else
rw_assert_rdlock(rwl);
WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
membar_exit_before_atomic();
rw_do_exit(rwl, wrlock);
}
/* membar_exit_before_atomic() has to precede call of this function. */
void
rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
{
unsigned long owner, set;
do {
owner = rwl->rwl_owner;
if (wrlock)
set = 0;
else
set = (owner - RWLOCK_READ_INCR) &
~(RWLOCK_WAIT|RWLOCK_WRWANT);
/*
* Potential MP race here. If the owner had WRWANT set, we
* cleared it and a reader can sneak in before a writer.
*/
} while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
if (owner & RWLOCK_WAIT)
wakeup(rwl);
}
int
rw_status(struct rwlock *rwl)
{
unsigned long owner = rwl->rwl_owner;
if (owner & RWLOCK_WRLOCK) {
if (RW_PROC(curproc) == RW_PROC(owner))
return RW_WRITE;
else
return RW_WRITE_OTHER;
}
if (owner)
return RW_READ;
return (0);
}
#ifdef DIAGNOSTIC
void
rw_assert_wrlock(struct rwlock *rwl)
{
if (panicstr || db_active)
return;
#ifdef WITNESS
witness_assert(&rwl->rwl_lock_obj, LA_XLOCKED);
#else
if (!(rwl->rwl_owner & RWLOCK_WRLOCK))
panic("%s: lock not held", rwl->rwl_name);
if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner))
panic("%s: lock not held by this process", rwl->rwl_name);
#endif
}
void
rw_assert_rdlock(struct rwlock *rwl)
{
if (panicstr || db_active)
return;
#ifdef WITNESS
witness_assert(&rwl->rwl_lock_obj, LA_SLOCKED);
#else
if (!RW_PROC(rwl->rwl_owner) || (rwl->rwl_owner & RWLOCK_WRLOCK))
panic("%s: lock not shared", rwl->rwl_name);
#endif
}
void
rw_assert_anylock(struct rwlock *rwl)
{
if (panicstr || db_active)
return;
#ifdef WITNESS
witness_assert(&rwl->rwl_lock_obj, LA_LOCKED);
#else
switch (rw_status(rwl)) {
case RW_WRITE_OTHER:
panic("%s: lock held by different process", rwl->rwl_name);
case 0:
panic("%s: lock not held", rwl->rwl_name);
}
#endif
}
void
rw_assert_unlocked(struct rwlock *rwl)
{
if (panicstr || db_active)
return;
#ifdef WITNESS
witness_assert(&rwl->rwl_lock_obj, LA_UNLOCKED);
#else
if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner))
panic("%s: lock held", rwl->rwl_name);
#endif
}
#endif
/* recursive rwlocks; */
void
_rrw_init_flags(struct rrwlock *rrwl, const char *name, int flags,
const struct lock_type *type)
{
memset(rrwl, 0, sizeof(struct rrwlock));
_rw_init_flags_witness(&rrwl->rrwl_lock, name, RRWLOCK_LO_FLAGS(flags),
type);
}
int
rrw_enter(struct rrwlock *rrwl, int flags)
{
int rv;
if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) {
if (flags & RW_RECURSEFAIL)
return (EDEADLK);
else {
rrwl->rrwl_wcnt++;
WITNESS_LOCK(&rrwl->rrwl_lock.rwl_lock_obj,
LOP_EXCLUSIVE);
return (0);
}
}
rv = rw_enter(&rrwl->rrwl_lock, flags);
if (rv == 0)
rrwl->rrwl_wcnt = 1;
return (rv);
}
void
rrw_exit(struct rrwlock *rrwl)
{
if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) {
KASSERT(rrwl->rrwl_wcnt > 0);
rrwl->rrwl_wcnt--;
if (rrwl->rrwl_wcnt != 0) {
WITNESS_UNLOCK(&rrwl->rrwl_lock.rwl_lock_obj,
LOP_EXCLUSIVE);
return;
}
}
rw_exit(&rrwl->rrwl_lock);
}
int
rrw_status(struct rrwlock *rrwl)
{
return (rw_status(&rrwl->rrwl_lock));
}
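/*
 * Hedged usage sketch for the recursive variant (not compiled; assumes the
 * rrw_init() wrapper from <sys/rwlock.h>, and the names are illustrative):
 */
#if 0
struct rrwlock example_lock;
void
example_recursive_use(void)
{
rrw_init(&example_lock, "example");
rrw_enter(&example_lock, RW_WRITE); /* takes the rwlock, wcnt becomes 1 */
rrw_enter(&example_lock, RW_WRITE); /* same proc re-enters, wcnt becomes 2 */
rrw_exit(&example_lock); /* wcnt back to 1, lock still held */
rrw_exit(&example_lock); /* releases the underlying rwlock */
}
#endif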
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#define RWLOCK_OBJ_MAGIC 0x5aa3c85d
struct rwlock_obj {
struct rwlock ro_lock;
u_int ro_magic;
u_int ro_refcnt;
};
struct pool rwlock_obj_pool;
/*
* rw_obj_init:
*
* Initialize the rwlock object store.
*/
void
rw_obj_init(void)
{
pool_init(&rwlock_obj_pool, sizeof(struct rwlock_obj), 0, IPL_MPFLOOR,
PR_WAITOK, "rwobjpl", NULL);
}
/*
* rw_obj_alloc:
*
* Allocate a single lock object.
*/
void
_rw_obj_alloc_flags(struct rwlock **lock, const char *name, int flags,
struct lock_type *type)
{
struct rwlock_obj *mo;
mo = pool_get(&rwlock_obj_pool, PR_WAITOK);
mo->ro_magic = RWLOCK_OBJ_MAGIC;
_rw_init_flags(&mo->ro_lock, name, flags, type);
mo->ro_refcnt = 1;
*lock = &mo->ro_lock;
}
/*
* rw_obj_hold:
*
* Add a single reference to a lock object. A reference to the object
* must already be held, and must be held across this call.
*/
void
rw_obj_hold(struct rwlock *lock)
{
struct rwlock_obj *mo = (struct rwlock_obj *)lock;
KASSERTMSG(mo->ro_magic == RWLOCK_OBJ_MAGIC,
"%s: lock %p: mo->ro_magic (%#x) != RWLOCK_OBJ_MAGIC (%#x)",
__func__, mo, mo->ro_magic, RWLOCK_OBJ_MAGIC);
KASSERTMSG(mo->ro_refcnt > 0,
"%s: lock %p: mo->ro_refcnt (%#x) == 0",
__func__, mo, mo->ro_refcnt);
atomic_inc_int(&mo->ro_refcnt);
}
/*
* rw_obj_free:
*
* Drop a reference from a lock object. If the last reference is being
* dropped, free the object and return true. Otherwise, return false.
*/
int
rw_obj_free(struct rwlock *lock)
{
struct rwlock_obj *mo = (struct rwlock_obj *)lock;
KASSERTMSG(mo->ro_magic == RWLOCK_OBJ_MAGIC,
"%s: lock %p: mo->ro_magic (%#x) != RWLOCK_OBJ_MAGIC (%#x)",
__func__, mo, mo->ro_magic, RWLOCK_OBJ_MAGIC);
KASSERTMSG(mo->ro_refcnt > 0,
"%s: lock %p: mo->ro_refcnt (%#x) == 0",
__func__, mo, mo->ro_refcnt);
if (atomic_dec_int_nv(&mo->ro_refcnt) > 0) {
return false;
}
#if notyet
WITNESS_DESTROY(&mo->ro_lock);
#endif
pool_put(&rwlock_obj_pool, mo);
return true;
}
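/*
 * Hedged lifecycle sketch for the reference-counted lock objects above
 * (not compiled; assumes the rw_obj_alloc() wrapper from <sys/rwlock.h>,
 * and the names are illustrative):
 */
#if 0
void
example_shared_lock(void)
{
struct rwlock *lk;
rw_obj_alloc(&lk, "examplelk"); /* refcnt == 1 */
rw_obj_hold(lk); /* handed to a second user, refcnt == 2 */
/* ... both users eventually drop their references ... */
rw_obj_free(lk); /* refcnt 2 -> 1, returns false */
rw_obj_free(lk); /* last reference, object freed, returns true */
}
#endif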
#ifndef __DRM_VMA_MANAGER_H__
#define __DRM_VMA_MANAGER_H__
/*
* Copyright (c) 2013 David Herrmann <dh.herrmann@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <drm/drm_mm.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/types.h>
/* We make up offsets for buffer objects so we can recognize them at
* mmap time. pgoff in mmap is an unsigned long, so we need to make sure
* that the faked up offset will fit
*/
#if BITS_PER_LONG == 64
#define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFFUL >> PAGE_SHIFT) + 1)
#define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFFUL >> PAGE_SHIFT) * 256)
#else
#define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFUL >> PAGE_SHIFT) + 1)
#define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFUL >> PAGE_SHIFT) * 16)
#endif
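/*
 * Worked example of the ranges above, assuming 64-bit longs and 4 KiB
 * pages (PAGE_SHIFT == 12): DRM_FILE_PAGE_OFFSET_START is 0x100000 pages,
 * so fake offsets begin at the 4 GiB byte mark, and
 * DRM_FILE_PAGE_OFFSET_SIZE is 0xFFFFF * 256 pages, roughly 1 TiB of fake
 * offset space for buffer objects.
 */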
struct drm_file;
struct drm_vma_offset_file {
struct rb_node vm_rb;
struct drm_file *vm_tag;
unsigned long vm_count;
};
struct drm_vma_offset_node {
struct mutex vm_lock;
struct drm_mm_node vm_node;
struct rb_root vm_files;
void *driver_private;
};
struct drm_vma_offset_manager {
struct mutex vm_lock;
struct drm_mm vm_addr_space_mm;
};
void drm_vma_offset_manager_init(struct drm_vma_offset_manager *mgr,
unsigned long page_offset, unsigned long size);
void drm_vma_offset_manager_destroy(struct drm_vma_offset_manager *mgr);
struct drm_vma_offset_node *drm_vma_offset_lookup_locked(struct drm_vma_offset_manager *mgr,
unsigned long start,
unsigned long pages);
int drm_vma_offset_add(struct drm_vma_offset_manager *mgr,
struct drm_vma_offset_node *node, unsigned long pages);
void drm_vma_offset_remove(struct drm_vma_offset_manager *mgr,
struct drm_vma_offset_node *node);
int drm_vma_node_allow(struct drm_vma_offset_node *node, struct drm_file *tag);
void drm_vma_node_revoke(struct drm_vma_offset_node *node,
struct drm_file *tag);
bool drm_vma_node_is_allowed(struct drm_vma_offset_node *node,
struct drm_file *tag);
/**
* drm_vma_offset_exact_lookup_locked() - Look up node by exact address
* @mgr: Manager object
* @start: Start address (page-based, not byte-based)
* @pages: Size of object (page-based)
*
* Same as drm_vma_offset_lookup_locked() but does not allow any offset into the node.
* It only returns the exact object with the given start address.
*
* RETURNS:
* Node at exact start address @start.
*/
static inline struct drm_vma_offset_node *
drm_vma_offset_exact_lookup_locked(struct drm_vma_offset_manager *mgr,
unsigned long start,
unsigned long pages)
{
struct drm_vma_offset_node *node;
node = drm_vma_offset_lookup_locked(mgr, start, pages);
return (node && node->vm_node.start == start) ? node : NULL;
}
/**
* drm_vma_offset_lock_lookup() - Lock lookup for extended private use
* @mgr: Manager object
*
* Lock VMA manager for extended lookups. Only locked VMA function calls
* are allowed while holding this lock. All other contexts are blocked from VMA
* until the lock is released via drm_vma_offset_unlock_lookup().
*
* Use this if you need to take a reference to the objects returned by
* drm_vma_offset_lookup_locked() before releasing this lock again.
*
* This lock must not be used for anything other than extended lookups. You must
* not call any other VMA helpers while holding this lock.
*
* Note: You're in atomic-context while holding this lock!
*/
static inline void drm_vma_offset_lock_lookup(struct drm_vma_offset_manager *mgr)
{
read_lock(&mgr->vm_lock);
}
/**
* drm_vma_offset_unlock_lookup() - Unlock lookup for extended private use
* @mgr: Manager object
*
* Release lookup-lock. See drm_vma_offset_lock_lookup() for more information.
*/
static inline void drm_vma_offset_unlock_lookup(struct drm_vma_offset_manager *mgr)
{
read_unlock(&mgr->vm_lock);
}
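/*
 * Hedged usage sketch for the lookup lock (not compiled; "mgr", "start",
 * "pages" and the reference-taking step are illustrative):
 */
#if 0
static struct drm_vma_offset_node *
example_lookup_and_ref(struct drm_vma_offset_manager *mgr,
unsigned long start, unsigned long pages)
{
struct drm_vma_offset_node *node;
drm_vma_offset_lock_lookup(mgr);
node = drm_vma_offset_lookup_locked(mgr, start, pages);
if (node != NULL) {
/* take a reference on the object containing "node" while locked */
}
drm_vma_offset_unlock_lookup(mgr);
return node;
}
#endif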
/**
* drm_vma_node_reset() - Initialize or reset node object
* @node: Node to initialize or reset
*
* Reset a node to its initial state. This must be called before using it with
* any VMA offset manager.
*
* This must not be called on an already allocated node, or you will leak
* memory.
*/
static inline void drm_vma_node_reset(struct drm_vma_offset_node *node)
{
memset(node, 0, sizeof(*node));
node->vm_files = RB_ROOT;
mtx_init(&node->vm_lock, IPL_NONE);
}
/**
* drm_vma_node_start() - Return start address for page-based addressing
* @node: Node to inspect
*
* Return the start address of the given node. This can be used as offset into
* the linear VM space that is provided by the VMA offset manager. Note that
* this can only be used for page-based addressing. If you need a proper offset
* for user-space mappings, you must apply "<< PAGE_SHIFT" or use the
* drm_vma_node_offset_addr() helper instead.
*
* RETURNS:
* Start address of @node for page-based addressing. 0 if the node does not
* have an offset allocated.
*/
static inline unsigned long drm_vma_node_start(const struct drm_vma_offset_node *node)
{
return node->vm_node.start;
}
/**
* drm_vma_node_size() - Return size (page-based)
* @node: Node to inspect
*
* Return the size as number of pages for the given node. This is the same size
* that was passed to drm_vma_offset_add(). If no offset is allocated for the
* node, this is 0.
*
* RETURNS:
* Size of @node as number of pages. 0 if the node does not have an offset
* allocated.
*/
static inline unsigned long drm_vma_node_size(struct drm_vma_offset_node *node)
{
return node->vm_node.size;
}
/**
* drm_vma_node_offset_addr() - Return sanitized offset for user-space mmaps
* @node: Linked offset node
*
* Same as drm_vma_node_start() but returns the address as a valid offset that
* can be used for user-space mappings during mmap().
* This must not be called on unlinked nodes.
*
* RETURNS:
* Offset of @node for byte-based addressing. 0 if the node does not have an
* object allocated.
*/
static inline __u64 drm_vma_node_offset_addr(struct drm_vma_offset_node *node)
{
return ((__u64)node->vm_node.start) << PAGE_SHIFT;
}
/**
* drm_vma_node_unmap() - Unmap offset node
* @node: Offset node
* @file_mapping: Address space to unmap @node from
*
* Unmap all userspace mappings for a given offset node. The mappings must be
* associated with the @file_mapping address-space. If no offset exists
* nothing is done.
*
* This call is unlocked. The caller must guarantee that drm_vma_offset_remove()
* is not called on this node concurrently.
*/
#ifdef __linux__
static inline void drm_vma_node_unmap(struct drm_vma_offset_node *node,
struct address_space *file_mapping)
{
if (drm_mm_node_allocated(&node->vm_node))
unmap_mapping_range(file_mapping,
drm_vma_node_offset_addr(node),
drm_vma_node_size(node) << PAGE_SHIFT, 1);
}
#endif
/**
* drm_vma_node_verify_access() - Access verification helper for TTM
* @node: Offset node
* @tag: Tag of file to check
*
* This checks whether @tag is granted access to @node. It is the same as
* drm_vma_node_is_allowed() but suitable as drop-in helper for TTM
* verify_access() callbacks.
*
* RETURNS:
* 0 if access is granted, -EACCES otherwise.
*/
static inline int drm_vma_node_verify_access(struct drm_vma_offset_node *node,
struct drm_file *tag)
{
return drm_vma_node_is_allowed(node, tag) ? 0 : -EACCES;
}
#endif /* __DRM_VMA_MANAGER_H__ */
/* $OpenBSD: ufs_dirhash.c,v 1.42 2019/03/15 05:42:38 kevlo Exp $ */
/*
* Copyright (c) 2001, 2002 Ian Dowse. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This implements a hash-based lookup scheme for UFS directories.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <crypto/siphash.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/dirhash.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1))
#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1))
#define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen == 0)
#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n))
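/*
 * For example, with a hash of dh_hlen slots, WRAPINCR(dh_hlen - 1, dh_hlen)
 * yields 0 and WRAPDECR(0, dh_hlen) yields dh_hlen - 1; the linear-probing
 * loops below rely on this when they walk off either end of the array.
 */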
int ufs_mindirhashsize;
int ufs_dirhashmaxmem;
int ufs_dirhashmem;
int ufs_dirhashcheck;
SIPHASH_KEY ufsdirhash_key;
int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
void ufsdirhash_delslot(struct dirhash *dh, int slot);
int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
doff_t offset);
doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
int ufsdirhash_recycle(int wanted);
struct pool ufsdirhash_pool;
#define DIRHASHLIST_LOCK() rw_enter_write(&ufsdirhash_mtx)
#define DIRHASHLIST_UNLOCK() rw_exit_write(&ufsdirhash_mtx)
#define DIRHASH_LOCK(dh) rw_enter_write(&(dh)->dh_mtx)
#define DIRHASH_UNLOCK(dh) rw_exit_write(&(dh)->dh_mtx)
#define DIRHASH_BLKALLOC_WAITOK() pool_get(&ufsdirhash_pool, PR_WAITOK)
#define DIRHASH_BLKFREE(v) pool_put(&ufsdirhash_pool, v)
#define mtx_assert(l, f) /* nothing */
#define DIRHASH_ASSERT(e, m) KASSERT((e))
/* Dirhash list; recently-used entries are near the tail. */
TAILQ_HEAD(, dirhash) ufsdirhash_list;
/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
struct rwlock ufsdirhash_mtx;
/*
* Locking order:
* ufsdirhash_mtx
* dh_mtx
*
* The dh_mtx mutex should be acquired either via the inode lock, or via
* ufsdirhash_mtx. Only the owner of the inode may free the associated
* dirhash, but anything can steal its memory and set dh_hash to NULL.
*/
/*
* Attempt to build up a hash table for the directory contents in
* inode 'ip'. Returns 0 on success, or -1 if the operation failed.
*/
int
ufsdirhash_build(struct inode *ip)
{
struct dirhash *dh;
struct buf *bp = NULL;
struct direct *ep;
struct vnode *vp;
doff_t bmask, pos;
int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
/* Check if we can/should use dirhash. */
if (ip->i_dirhash == NULL) {
if (DIP(ip, size) < ufs_mindirhashsize || OFSFMT(ip))
return (-1);
} else {
/* Hash exists, but sysctls could have changed. */
if (DIP(ip, size) < ufs_mindirhashsize ||
ufs_dirhashmem > ufs_dirhashmaxmem) {
ufsdirhash_free(ip);
return (-1);
}
/* Check if hash exists and is intact (note: unlocked read). */
if (ip->i_dirhash->dh_hash != NULL)
return (0);
/* Free the old, recycled hash and build a new one. */
ufsdirhash_free(ip);
}
/* Don't hash removed directories. */
if (ip->i_effnlink == 0)
return (-1);
vp = ip->i_vnode;
/* Allocate 50% more entries than this dir size could ever need. */
DIRHASH_ASSERT(DIP(ip, size) >= DIRBLKSIZ, ("ufsdirhash_build size"));
nslots = DIP(ip, size) / DIRECTSIZ(1);
nslots = (nslots * 3 + 1) / 2;
narrays = howmany(nslots, DH_NBLKOFF);
nslots = narrays * DH_NBLKOFF;
dirblocks = howmany(DIP(ip, size), DIRBLKSIZ);
nblocks = (dirblocks * 3 + 1) / 2;
memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
nblocks * sizeof(*dh->dh_blkfree);
DIRHASHLIST_LOCK();
if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
DIRHASHLIST_UNLOCK();
if (memreqd > ufs_dirhashmaxmem / 2)
return (-1);
/* Try to free some space. */
if (ufsdirhash_recycle(memreqd) != 0)
return (-1);
/* Enough was freed, and list has been locked. */
}
ufs_dirhashmem += memreqd;
DIRHASHLIST_UNLOCK();
/*
* Use non-blocking mallocs so that we will revert to a linear
* lookup on failure rather than potentially blocking forever.
*/
dh = malloc(sizeof(*dh), M_DIRHASH, M_NOWAIT|M_ZERO);
if (dh == NULL) {
DIRHASHLIST_LOCK();
ufs_dirhashmem -= memreqd;
DIRHASHLIST_UNLOCK();
return (-1);
}
dh->dh_hash = mallocarray(narrays, sizeof(dh->dh_hash[0]),
M_DIRHASH, M_NOWAIT|M_ZERO);
dh->dh_blkfree = mallocarray(nblocks, sizeof(dh->dh_blkfree[0]),
M_DIRHASH, M_NOWAIT | M_ZERO);
if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
goto fail;
for (i = 0; i < narrays; i++) {
if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL)
goto fail;
for (j = 0; j < DH_NBLKOFF; j++)
dh->dh_hash[i][j] = DIRHASH_EMPTY;
}
/* Initialise the hash table and block statistics. */
rw_init(&dh->dh_mtx, "dirhash");
dh->dh_narrays = narrays;
dh->dh_hlen = nslots;
dh->dh_nblk = nblocks;
dh->dh_dirblks = dirblocks;
for (i = 0; i < dirblocks; i++)
dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
for (i = 0; i < DH_NFSTATS; i++)
dh->dh_firstfree[i] = -1;
dh->dh_firstfree[DH_NFSTATS] = 0;
dh->dh_seqopt = 0;
dh->dh_seqoff = 0;
dh->dh_score = DH_SCOREINIT;
ip->i_dirhash = dh;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
pos = 0;
while (pos < DIP(ip, size)) {
/* If necessary, get the next directory block. */
if ((pos & bmask) == 0) {
if (bp != NULL)
brelse(bp);
if (UFS_BUFATOFF(ip, (off_t)pos, NULL, &bp) != 0)
goto fail;
}
/* Add this entry to the hash. */
ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
if (ep->d_reclen == 0 || ep->d_reclen >
DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
/* Corrupted directory. */
brelse(bp);
goto fail;
}
if (ep->d_ino != 0) {
/* Add the entry (simplified ufsdirhash_add). */
slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
slot = WRAPINCR(slot, dh->dh_hlen);
dh->dh_hused++;
DH_ENTRY(dh, slot) = pos;
ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
}
pos += ep->d_reclen;
}
if (bp != NULL)
brelse(bp);
DIRHASHLIST_LOCK();
TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 1;
DIRHASHLIST_UNLOCK();
return (0);
fail:
if (dh->dh_hash != NULL) {
for (i = 0; i < narrays; i++)
if (dh->dh_hash[i] != NULL)
DIRHASH_BLKFREE(dh->dh_hash[i]);
free(dh->dh_hash, M_DIRHASH,
narrays * sizeof(dh->dh_hash[0]));
}
if (dh->dh_blkfree != NULL)
free(dh->dh_blkfree, M_DIRHASH,
nblocks * sizeof(dh->dh_blkfree[0]));
free(dh, M_DIRHASH, sizeof(*dh));
ip->i_dirhash = NULL;
DIRHASHLIST_LOCK();
ufs_dirhashmem -= memreqd;
DIRHASHLIST_UNLOCK();
return (-1);
}
/*
* Free any hash table associated with inode 'ip'.
*/
void
ufsdirhash_free(struct inode *ip)
{
struct dirhash *dh;
int i, mem;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASHLIST_LOCK();
DIRHASH_LOCK(dh);
if (dh->dh_onlist)
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
/* The dirhash pointed to by 'dh' is exclusively ours now. */
mem = sizeof(*dh);
if (dh->dh_hash != NULL) {
for (i = 0; i < dh->dh_narrays; i++)
DIRHASH_BLKFREE(dh->dh_hash[i]);
free(dh->dh_hash, M_DIRHASH,
dh->dh_narrays * sizeof(dh->dh_hash[0]));
free(dh->dh_blkfree, M_DIRHASH,
dh->dh_nblk * sizeof(dh->dh_blkfree[0]));
mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
dh->dh_nblk * sizeof(*dh->dh_blkfree);
}
free(dh, M_DIRHASH, sizeof(*dh));
ip->i_dirhash = NULL;
DIRHASHLIST_LOCK();
ufs_dirhashmem -= mem;
DIRHASHLIST_UNLOCK();
}
/*
* Find the offset of the specified name within the given inode.
* Returns 0 on success, ENOENT if the entry does not exist, or
* EJUSTRETURN if the caller should revert to a linear search.
*
* If successful, the directory offset is stored in *offp, and a
* pointer to a struct buf containing the entry is stored in *bpp. If
* prevoffp is non-NULL, the offset of the previous entry within
* the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
* is the first in a block, the start of the block is used).
*/
int
ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
struct buf **bpp, doff_t *prevoffp)
{
struct dirhash *dh, *dh_next;
struct direct *dp;
struct vnode *vp;
struct buf *bp;
doff_t blkoff, bmask, offset, prevoff;
int i, slot;
if ((dh = ip->i_dirhash) == NULL)
return (EJUSTRETURN);
/*
* Move this dirhash towards the end of the list if it has a
* score higher than the next entry, and acquire the dh_mtx.
* Optimise the case where it's already the last by performing
* an unlocked read of the TAILQ_NEXT pointer.
*
* In both cases, end up holding just dh_mtx.
*/
if (TAILQ_NEXT(dh, dh_list) != NULL) {
DIRHASHLIST_LOCK();
DIRHASH_LOCK(dh);
/*
* If the new score will be greater than that of the next
* entry, then move this entry past it. With both mutexes
* held, dh_next won't go away, but its dh_score could
* change; that's not important since it is just a hint.
*/
if (dh->dh_hash != NULL && (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
dh->dh_score >= dh_next->dh_score) {
DIRHASH_ASSERT(dh->dh_onlist, ("dirhash: not on list"));
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
dh_list);
}
DIRHASHLIST_UNLOCK();
} else {
/* Already the last, though that could change as we wait. */
DIRHASH_LOCK(dh);
}
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (EJUSTRETURN);
}
/* Update the score. */
if (dh->dh_score < DH_SCOREMAX)
dh->dh_score++;
vp = ip->i_vnode;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
blkoff = -1;
bp = NULL;
restart:
slot = ufsdirhash_hash(dh, name, namelen);
if (dh->dh_seqopt) {
/*
* Sequential access optimisation. dh_seqoff contains the
* offset of the directory entry immediately following
* the last entry that was looked up. Check if this offset
* appears in the hash chain for the name we are looking for.
*/
for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
i = WRAPINCR(i, dh->dh_hlen))
if (offset == dh->dh_seqoff)
break;
if (offset == dh->dh_seqoff) {
/*
* We found an entry with the expected offset. This
* is probably the entry we want, but if not, the
* code below will turn off seqopt and retry.
*/
slot = i;
} else
dh->dh_seqopt = 0;
}
for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
slot = WRAPINCR(slot, dh->dh_hlen)) {
if (offset == DIRHASH_DEL)
continue;
DIRHASH_UNLOCK(dh);
if (offset < 0 || offset >= DIP(ip, size))
panic("ufsdirhash_lookup: bad offset in hash array");
if ((offset & ~bmask) != blkoff) {
if (bp != NULL)
brelse(bp);
blkoff = offset & ~bmask;
if (UFS_BUFATOFF(ip, (off_t)blkoff, NULL, &bp) != 0)
return (EJUSTRETURN);
}
dp = (struct direct *)(bp->b_data + (offset & bmask));
if (dp->d_reclen == 0 || dp->d_reclen >
DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
/* Corrupted directory. */
brelse(bp);
return (EJUSTRETURN);
}
if (dp->d_namlen == namelen &&
memcmp(dp->d_name, name, namelen) == 0) {
/* Found. Get the prev offset if needed. */
if (prevoffp != NULL) {
if (offset & (DIRBLKSIZ - 1)) {
prevoff = ufsdirhash_getprev(dp,
offset);
if (prevoff == -1) {
brelse(bp);
return (EJUSTRETURN);
}
} else
prevoff = offset;
*prevoffp = prevoff;
}
/* Check for sequential access, and update offset. */
if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
dh->dh_seqopt = 1;
dh->dh_seqoff = offset + DIRSIZ(0, dp);
*bpp = bp;
*offp = offset;
return (0);
}
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
if (bp != NULL)
brelse(bp);
ufsdirhash_free(ip);
return (EJUSTRETURN);
}
/*
* When the name doesn't match in the seqopt case, go back
* and search normally.
*/
if (dh->dh_seqopt) {
dh->dh_seqopt = 0;
goto restart;
}
}
DIRHASH_UNLOCK(dh);
if (bp != NULL)
brelse(bp);
return (ENOENT);
}
/*
* Find a directory block with room for 'slotneeded' bytes. Returns
* the offset of the directory entry that begins the free space.
* This will either be the offset of an existing entry that has free
* space at the end, or the offset of an entry with d_ino == 0 at
* the start of a DIRBLKSIZ block.
*
* To use the space, the caller may need to compact existing entries in
* the directory. The total number of bytes in all of the entries involved
* in the compaction is stored in *slotsize. In other words, all of
* the entries that must be compacted are exactly contained in the
* region beginning at the returned offset and spanning *slotsize bytes.
*
* Returns -1 if no space was found, indicating that the directory
* must be extended.
*/
doff_t
ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
{
struct direct *dp;
struct dirhash *dh;
struct buf *bp;
doff_t pos, slotstart;
int dirblock, error, freebytes, i;
if ((dh = ip->i_dirhash) == NULL)
return (-1);
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (-1);
}
/* Find a directory block with the desired free space. */
dirblock = -1;
for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
if ((dirblock = dh->dh_firstfree[i]) != -1)
break;
if (dirblock == -1) {
DIRHASH_UNLOCK(dh);
return (-1);
}
DIRHASH_ASSERT(dirblock < dh->dh_nblk &&
dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
("ufsdirhash_findfree: bad stats"));
DIRHASH_UNLOCK(dh);
pos = dirblock * DIRBLKSIZ;
error = UFS_BUFATOFF(ip, (off_t)pos, (char **)&dp, &bp);
if (error)
return (-1);
/* Find the first entry with free space. */
for (i = 0; i < DIRBLKSIZ; ) {
if (dp->d_reclen == 0) {
brelse(bp);
return (-1);
}
if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
break;
i += dp->d_reclen;
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
if (i > DIRBLKSIZ) {
brelse(bp);
return (-1);
}
slotstart = pos + i;
/* Find the range of entries needed to get enough space */
freebytes = 0;
while (i < DIRBLKSIZ && freebytes < slotneeded) {
freebytes += dp->d_reclen;
if (dp->d_ino != 0)
freebytes -= DIRSIZ(0, dp);
if (dp->d_reclen == 0) {
brelse(bp);
return (-1);
}
i += dp->d_reclen;
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
if (i > DIRBLKSIZ) {
brelse(bp);
return (-1);
}
if (freebytes < slotneeded)
panic("ufsdirhash_findfree: free mismatch");
brelse(bp);
*slotsize = pos + i - slotstart;
return (slotstart);
}
/*
* Return the start of the unused space at the end of a directory, or
* -1 if there are no trailing unused blocks.
*/
doff_t
ufsdirhash_enduseful(struct inode *ip)
{
struct dirhash *dh;
int i;
if ((dh = ip->i_dirhash) == NULL)
return (-1);
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (-1);
}
if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
DIRHASH_UNLOCK(dh);
return (-1);
}
for (i = dh->dh_dirblks - 1; i >= 0; i--)
if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
break;
DIRHASH_UNLOCK(dh);
return ((doff_t)(i + 1) * DIRBLKSIZ);
}
/*
* Insert information into the hash about a new directory entry. dirp
* points to a struct direct containing the entry, and offset specifies
* the offset of this entry.
*/
void
ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
{
struct dirhash *dh;
int slot;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
DIRHASH_ASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_add: bad offset"));
/*
* Normal hash usage is < 66%. If the usage gets too high then
* remove the hash entirely and let it be rebuilt later.
*/
if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
/* Find a free hash slot (empty or deleted), and add the entry. */
slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
while (DH_ENTRY(dh, slot) >= 0)
slot = WRAPINCR(slot, dh->dh_hlen);
if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
dh->dh_hused++;
DH_ENTRY(dh, slot) = offset;
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
DIRHASH_UNLOCK(dh);
}
/*
* Remove the specified directory entry from the hash. The entry to remove
* is defined by the name in `dirp', which must exist at the specified
* `offset' within the directory.
*/
void
ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
{
struct dirhash *dh;
int slot;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
DIRHASH_ASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_remove: bad offset"));
/* Find the entry */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
/* Remove the hash entry. */
ufsdirhash_delslot(dh, slot);
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
DIRHASH_UNLOCK(dh);
}
/*
* Change the offset associated with a directory entry in the hash. Used
* when compacting directory blocks.
*/
void
ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
doff_t newoff)
{
struct dirhash *dh;
int slot;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
DIRHASH_ASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
newoff < dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_move: bad offset"));
/* Find the entry, and update the offset. */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
DH_ENTRY(dh, slot) = newoff;
DIRHASH_UNLOCK(dh);
}
/*
* Inform dirhash that the directory has grown by one block that
* begins at offset (i.e. the new length is offset + DIRBLKSIZ).
*/
void
ufsdirhash_newblk(struct inode *ip, doff_t offset)
{
struct dirhash *dh;
int block;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
DIRHASH_ASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_newblk: bad offset"));
block = offset / DIRBLKSIZ;
if (block >= dh->dh_nblk) {
/* Out of space; must rebuild. */
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
dh->dh_dirblks = block + 1;
/* Account for the new free block. */
dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
if (dh->dh_firstfree[DH_NFSTATS] == -1)
dh->dh_firstfree[DH_NFSTATS] = block;
DIRHASH_UNLOCK(dh);
}
/*
* Inform dirhash that the directory is being truncated.
*/
void
ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
{
struct dirhash *dh;
int block, i;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
DIRHASH_ASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_dirtrunc: bad offset"));
block = howmany(offset, DIRBLKSIZ);
/*
* If the directory shrinks to less than 1/8 of dh_nblk blocks
* (about 20% of its original size due to the 50% extra added in
* ufsdirhash_build) then free it, and let the caller rebuild
* if necessary.
*/
if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
/*
* Remove any `first free' information pertaining to the
* truncated blocks. All blocks we're removing should be
* completely unused.
*/
if (dh->dh_firstfree[DH_NFSTATS] >= block)
dh->dh_firstfree[DH_NFSTATS] = -1;
for (i = block; i < dh->dh_dirblks; i++)
if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
panic("ufsdirhash_dirtrunc: blocks in use");
for (i = 0; i < DH_NFSTATS; i++)
if (dh->dh_firstfree[i] >= block)
panic("ufsdirhash_dirtrunc: first free corrupt");
dh->dh_dirblks = block;
DIRHASH_UNLOCK(dh);
}
/*
* Debugging function to check that the dirhash information about
* a directory block matches its actual contents. Panics if a mismatch
* is detected.
*
* On entry, `buf' should point to the start of an in-core
* DIRBLKSIZ-sized directory block, and `offset' should contain the
* offset from the start of the directory of that block.
*/
void
ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
{
struct dirhash *dh;
struct direct *dp;
int block, ffslot, i, nfree;
if (!ufs_dirhashcheck)
return;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
block = offset / DIRBLKSIZ;
if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
panic("ufsdirhash_checkblock: bad offset");
nfree = 0;
for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
dp = (struct direct *)(buf + i);
if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
panic("ufsdirhash_checkblock: bad dir");
if (dp->d_ino == 0) {
#if 0
/*
* XXX entries with d_ino == 0 should only occur
* at the start of a DIRBLKSIZ block. However the
* ufs code is tolerant of such entries at other
* offsets, and fsck does not fix them.
*/
if (i != 0)
panic("ufsdirhash_checkblock: bad dir inode");
#endif
nfree += dp->d_reclen;
continue;
}
/* Check that the entry exists (will panic if it doesn't). */
ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
nfree += dp->d_reclen - DIRSIZ(0, dp);
}
if (i != DIRBLKSIZ)
panic("ufsdirhash_checkblock: bad dir end");
if (dh->dh_blkfree[block] * DIRALIGN != nfree)
panic("ufsdirhash_checkblock: bad free count");
ffslot = BLKFREE2IDX(nfree / DIRALIGN);
for (i = 0; i <= DH_NFSTATS; i++)
if (dh->dh_firstfree[i] == block && i != ffslot)
panic("ufsdirhash_checkblock: bad first-free");
if (dh->dh_firstfree[ffslot] == -1)
panic("ufsdirhash_checkblock: missing first-free entry");
DIRHASH_UNLOCK(dh);
}
/*
* Hash the specified filename into a dirhash slot.
*/
int
ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
{
return SipHash24(&ufsdirhash_key, name, namelen) % dh->dh_hlen;
}
/*
* Adjust the number of free bytes in the block containing `offset'
* by the value specified by `diff'.
*
* The caller must ensure we have exclusive access to `dh'; normally
* that means that dh_mtx should be held, but this is also called
* from ufsdirhash_build() where exclusive access can be assumed.
*/
void
ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
{
int block, i, nfidx, ofidx;
/* Update the per-block summary info. */
block = offset / DIRBLKSIZ;
DIRHASH_ASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
("dirhash bad offset"));
ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
/* Update the `first free' list if necessary. */
if (ofidx != nfidx) {
/* If removing, scan forward for the next block. */
if (dh->dh_firstfree[ofidx] == block) {
for (i = block + 1; i < dh->dh_dirblks; i++)
if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
break;
dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
}
/* Make this the new `first free' if necessary */
if (dh->dh_firstfree[nfidx] > block ||
dh->dh_firstfree[nfidx] == -1)
dh->dh_firstfree[nfidx] = block;
}
}
/*
* Find the specified name which should have the specified offset.
* Returns a slot number, and panics on failure.
*
* `dh' must be locked on entry and remains so on return.
*/
int
ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
{
int slot;
mtx_assert(&dh->dh_mtx, MA_OWNED);
/* Find the entry. */
DIRHASH_ASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
slot = ufsdirhash_hash(dh, name, namelen);
while (DH_ENTRY(dh, slot) != offset &&
DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
slot = WRAPINCR(slot, dh->dh_hlen);
if (DH_ENTRY(dh, slot) != offset)
panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
return (slot);
}
/*
* Remove the entry corresponding to the specified slot from the hash array.
*
* `dh' must be locked on entry and remains so on return.
*/
void
ufsdirhash_delslot(struct dirhash *dh, int slot)
{
int i;
mtx_assert(&dh->dh_mtx, MA_OWNED);
/* Mark the entry as deleted. */
DH_ENTRY(dh, slot) = DIRHASH_DEL;
/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
i = WRAPINCR(i, dh->dh_hlen);
if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
i = WRAPDECR(i, dh->dh_hlen);
while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
DH_ENTRY(dh, i) = DIRHASH_EMPTY;
dh->dh_hused--;
i = WRAPDECR(i, dh->dh_hlen);
}
DIRHASH_ASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
}
}
/*
* Given a directory entry and its offset, find the offset of the
* previous entry in the same DIRBLKSIZ-sized block. Returns an
* offset, or -1 if there is no previous entry in the block or some
* other problem occurred.
*/
doff_t
ufsdirhash_getprev(struct direct *dirp, doff_t offset)
{
struct direct *dp;
char *blkbuf;
doff_t blkoff, prevoff;
int entrypos, i;
blkoff = offset & ~(DIRBLKSIZ - 1); /* offset of start of block */
entrypos = offset & (DIRBLKSIZ - 1); /* entry relative to block */
blkbuf = (char *)dirp - entrypos;
prevoff = blkoff;
/* If `offset' is the start of a block, there is no previous entry. */
if (entrypos == 0)
return (-1);
/* Scan from the start of the block until we get to the entry. */
for (i = 0; i < entrypos; i += dp->d_reclen) {
dp = (struct direct *)(blkbuf + i);
if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
return (-1); /* Corrupted directory. */
prevoff = blkoff + i;
}
return (prevoff);
}
/*
* Try to free up `wanted' bytes by stealing memory from existing
* dirhashes. Returns zero with list locked if successful.
*/
int
ufsdirhash_recycle(int wanted)
{
struct dirhash *dh;
doff_t **hash;
u_int8_t *blkfree;
int i, mem, narrays, nblk;
DIRHASHLIST_LOCK();
while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
/* Find a dirhash, and lock it. */
if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
DIRHASHLIST_UNLOCK();
return (-1);
}
DIRHASH_LOCK(dh);
DIRHASH_ASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
/* Decrement the score; only recycle if it becomes zero. */
if (--dh->dh_score > 0) {
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
return (-1);
}
/* Remove it from the list and detach its memory. */
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 0;
hash = dh->dh_hash;
dh->dh_hash = NULL;
blkfree = dh->dh_blkfree;
dh->dh_blkfree = NULL;
narrays = dh->dh_narrays;
nblk = dh->dh_nblk;
mem = narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
dh->dh_nblk * sizeof(*dh->dh_blkfree);
/* Unlock everything, free the detached memory. */
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
for (i = 0; i < narrays; i++)
DIRHASH_BLKFREE(hash[i]);
free(hash, M_DIRHASH, narrays * sizeof(hash[0]));
free(blkfree, M_DIRHASH, nblk * sizeof(blkfree[0]));
/* Account for the returned memory, and repeat if necessary. */
DIRHASHLIST_LOCK();
ufs_dirhashmem -= mem;
}
/* Success; return with list locked. */
return (0);
}
void
ufsdirhash_init(void)
{
pool_init(&ufsdirhash_pool, DH_NBLKOFF * sizeof(doff_t), 0, IPL_NONE,
PR_WAITOK, "dirhash", NULL);
rw_init(&ufsdirhash_mtx, "dirhash_list");
arc4random_buf(&ufsdirhash_key, sizeof(ufsdirhash_key));
TAILQ_INIT(&ufsdirhash_list);
ufs_dirhashmaxmem = 5 * 1024 * 1024;
ufs_mindirhashsize = 5 * DIRBLKSIZ;
}
void
ufsdirhash_uninit(void)
{
DIRHASH_ASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit"));
pool_destroy(&ufsdirhash_pool);
}
/* $OpenBSD: mpls_raw.c,v 1.19 2022/02/22 01:15:02 guenther Exp $ */
/*
* Copyright (C) 1999, 2000 and 2001 AYAME Project, WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netmpls/mpls.h>
int mpls_defttl = 255;
int mpls_push_expnull_ip = 0;
int mpls_push_expnull_ip6 = 0;
int mpls_mapttl_ip = 1;
int mpls_mapttl_ip6 = 0;
const struct sysctl_bounded_args mplsctl_vars[] = {
{ MPLSCTL_DEFTTL, &mpls_defttl, 0, 255 },
{ MPLSCTL_MAPTTL_IP, &mpls_mapttl_ip, 0, 1 },
{ MPLSCTL_MAPTTL_IP6, &mpls_mapttl_ip6, 0, 1 },
};
int
mpls_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen)
{
return sysctl_bounded_arr(mplsctl_vars, nitems(mplsctl_vars),
name, namelen, oldp, oldlenp, newp, newlen);
}
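/*
 * Hedged userland sketch (not part of this kernel file): reading the MPLS
 * default TTL through sysctl(2). The mib constants are assumed to come
 * from <netmpls/mpls.h> and <sys/socket.h>.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netmpls/mpls.h>
#include <stdio.h>
int
example_get_mpls_ttl(void)
{
int mib[3] = { CTL_NET, PF_MPLS, MPLSCTL_DEFTTL };
int ttl;
size_t len = sizeof(ttl);
if (sysctl(mib, 3, &ttl, &len, NULL, 0) == -1)
return -1;
printf("mpls default ttl: %d\n", ttl);
return 0;
}
#endif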
/* $OpenBSD: uvm_fault.c,v 1.132 2022/08/31 01:27:04 guenther Exp $ */
/* $NetBSD: uvm_fault.c,v 1.51 2000/08/06 00:22:53 thorpej Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp
*/
/*
* uvm_fault.c: fault handler
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/percpu.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/tracepoint.h>
#include <uvm/uvm.h>
/*
*
* a word on page faults:
*
* types of page faults we handle:
*
* CASE 1: upper layer faults                   CASE 2: lower layer faults
*
*    CASE 1A         CASE 1B                   CASE 2A       CASE 2B
*    read/write1     write>1                   read/write   +-cow_write/zero
*         |             |                         |        |
*      +--|--+       +--|--+     +-----+       +  |  +     | +-----+
* amap |  V  |       |  --------->new  |          |        | |  ^  |
*      +-----+       +-----+     +-----+       +  |  +     | +--|--+
*                                                 |        |    |
*      +-----+       +-----+                   +--|--+     | +--|--+
* uobj | d/c |       | d/c |                   |  V  |     +----+  |
*      +-----+       +-----+                   +-----+       +-----+
*
* d/c = don't care
*
* case [0]: layerless fault
* no amap or uobj is present. this is an error.
*
* case [1]: upper layer fault [anon active]
* 1A: [read] or [write with anon->an_ref == 1]
* I/O takes place in upper level anon and uobj is not touched.
* 1B: [write with anon->an_ref > 1]
* new anon is alloc'd and data is copied off ["COW"]
*
* case [2]: lower layer fault [uobj]
* 2A: [read on non-NULL uobj] or [write to non-copy_on_write area]
* I/O takes place directly in object.
* 2B: [write to copy_on_write] or [read on NULL uobj]
* data is "promoted" from uobj to a new anon.
* if uobj is null, then we zero fill.
*
* we follow the standard UVM locking protocol ordering:
*
* MAPS => AMAP => UOBJ => ANON => PAGE QUEUES (PQ)
* we hold a PG_BUSY page if we unlock for I/O
*
*
* the code is structured as follows:
*
* - init the "IN" params in the ufi structure
* ReFault: (ERESTART returned to the loop in uvm_fault)
* - do lookups [locks maps], check protection, handle needs_copy
* - check for case 0 fault (error)
* - establish "range" of fault
* - if we have an amap lock it and extract the anons
* - if sequential advice deactivate pages behind us
* - at the same time check pmap for unmapped areas and anon for pages
* that we could map in (and do map it if found)
* - check object for resident pages that we could map in
* - if (case 2) goto Case2
* - >>> handle case 1
* - ensure source anon is resident in RAM
* - if case 1B alloc new anon and copy from source
* - map the correct page in
* Case2:
* - >>> handle case 2
* - ensure source page is resident (if uobj)
* - if case 2B alloc new anon and copy from source (could be zero
* fill if uobj == NULL)
* - map the correct page in
* - done!
*
* note on paging:
* if we have to do I/O we place a PG_BUSY page in the correct object,
* unlock everything, and do the I/O. when I/O is done we must reverify
* the state of the world before assuming that our data structures are
* valid. [because mappings could change while the map is unlocked]
*
* alternative 1: unbusy the page in question and restart the page fault
* from the top (ReFault). this is easy but does not take advantage
* of the information that we already have from our previous lookup,
* although it is possible that the "hints" in the vm_map will help here.
*
* alternative 2: the system already keeps track of a "version" number of
* a map. [i.e. every time you write-lock a map (e.g. to change a
* mapping) you bump the version number up by one...] so, we can save
* the version number of the map before we release the lock and start I/O.
* then when I/O is done we can relock and check the version numbers
* to see if anything changed. this might save us some work over
* alternative 1 because we don't have to unbusy the page and it may
* mean fewer compares(?).
*
* alternative 3: put in backpointers or a way to "hold" part of a map
* in place while I/O is in progress. this could be complex to
* implement (especially with structures like amap that can be referenced
* by multiple map entries, and figuring out what should wait could be
* complex as well...).
*
* we use alternative 2. given that we are multi-threaded now we may want
* to reconsider the choice.
*/
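/*
 * illustrative sketch (not part of this file): machine dependent trap
 * code typically resolves a user page fault along these lines.  the
 * names "fault_addr" and "wasWrite" are placeholders; the real MD
 * handlers differ per architecture.
 *
 *	vaddr_t va = trunc_page(fault_addr);
 *	vm_prot_t access_type = wasWrite ? PROT_WRITE : PROT_READ;
 *	error = uvm_fault(&p->p_vmspace->vm_map, va, 0, access_type);
 *	if (error != 0)
 *		deliver SIGSEGV/SIGBUS to the faulting process
 */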
/*
* local data structures
*/
struct uvm_advice {
int nback;
int nforw;
};
/*
* page range array: set up in uvmfault_init().
*/
static struct uvm_advice uvmadvice[MADV_MASK + 1];
#define UVM_MAXRANGE 16 /* must be max() of nback+nforw+1 */
/*
* private prototypes
*/
static void uvmfault_amapcopy(struct uvm_faultinfo *);
static inline void uvmfault_anonflush(struct vm_anon **, int);
void uvmfault_unlockmaps(struct uvm_faultinfo *, boolean_t);
void uvmfault_update_stats(struct uvm_faultinfo *);
/*
* inline functions
*/
/*
* uvmfault_anonflush: try and deactivate pages in specified anons
*
* => does not have to deactivate page if it is busy
*/
static inline void
uvmfault_anonflush(struct vm_anon **anons, int n)
{
int lcv;
struct vm_page *pg;
for (lcv = 0; lcv < n; lcv++) {
if (anons[lcv] == NULL)
continue;
KASSERT(rw_lock_held(anons[lcv]->an_lock));
pg = anons[lcv]->an_page;
if (pg && (pg->pg_flags & PG_BUSY) == 0) {
uvm_lock_pageq();
if (pg->wire_count == 0) {
pmap_page_protect(pg, PROT_NONE);
uvm_pagedeactivate(pg);
}
uvm_unlock_pageq();
}
}
}
/*
* normal functions
*/
/*
* uvmfault_init: compute proper values for the uvmadvice[] array.
*/
void
uvmfault_init(void)
{
int npages;
npages = atop(16384);
if (npages > 0) {
KASSERT(npages <= UVM_MAXRANGE / 2);
uvmadvice[MADV_NORMAL].nforw = npages;
uvmadvice[MADV_NORMAL].nback = npages - 1;
}
npages = atop(32768);
if (npages > 0) {
KASSERT(npages <= UVM_MAXRANGE / 2);
uvmadvice[MADV_SEQUENTIAL].nforw = npages - 1;
uvmadvice[MADV_SEQUENTIAL].nback = npages;
}
}
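/*
 * worked example, assuming 4KB pages (PAGE_SIZE == 4096):
 * atop(16384) == 4, so MADV_NORMAL faults map up to 3 pages behind and
 * 4 pages ahead of the faulting page (3 + 4 + 1 == 8 pages total);
 * atop(32768) == 8, so MADV_SEQUENTIAL maps up to 8 behind and 7 ahead
 * (8 + 7 + 1 == 16 pages, which is exactly UVM_MAXRANGE).
 */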
/*
* uvmfault_amapcopy: clear "needs_copy" in a map.
*
* => called with VM data structures unlocked (usually, see below)
* => we get a write lock on the maps and clear needs_copy for a VA
* => if we are out of RAM we sleep (waiting for more)
*/
static void
uvmfault_amapcopy(struct uvm_faultinfo *ufi)
{
for (;;) {
/*
* no mapping? give up.
*/
if (uvmfault_lookup(ufi, TRUE) == FALSE)
return;
/*
* copy if needed.
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry))
amap_copy(ufi->map, ufi->entry, M_NOWAIT,
UVM_ET_ISSTACK(ufi->entry) ? FALSE : TRUE,
ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
/*
* didn't work? must be out of RAM. unlock and sleep.
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
uvmfault_unlockmaps(ufi, TRUE);
uvm_wait("fltamapcopy");
continue;
}
/*
* got it! unlock and return.
*/
uvmfault_unlockmaps(ufi, TRUE);
return;
}
/*NOTREACHED*/
}
/*
* uvmfault_anonget: get data in an anon into a non-busy, non-released
* page in that anon.
*
* => Map, amap and thus anon should be locked by caller.
* => If we fail, we unlock everything and error is returned.
* => If we are successful, return with everything still locked.
* => We do not move the page on the queues [gets moved later]. If we
* allocate a new page [we_own], it gets put on the queues. Either way,
* the result is that the page is on the queues at return time
*/
int
uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
struct vm_anon *anon)
{
struct vm_page *pg;
int error;
KASSERT(rw_lock_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);
/* Increment the counters.*/
counters_inc(uvmexp_counters, flt_anget);
if (anon->an_page) {
curproc->p_ru.ru_minflt++;
} else {
curproc->p_ru.ru_majflt++;
}
error = 0;
/*
* Loop until we get the anon data, or fail.
*/
for (;;) {
boolean_t we_own, locked;
/*
* Note: 'we_own' will become true if we set PG_BUSY on a page.
*/
we_own = FALSE;
pg = anon->an_page;
/*
* Is page resident? Make sure it is not busy/released.
*/
if (pg) {
KASSERT(pg->pg_flags & PQ_ANON);
KASSERT(pg->uanon == anon);
/*
* if the page is busy, we drop all the locks and
* try again.
*/
if ((pg->pg_flags & (PG_BUSY|PG_RELEASED)) == 0)
return (VM_PAGER_OK);
atomic_setbits_int(&pg->pg_flags, PG_WANTED);
counters_inc(uvmexp_counters, flt_pgwait);
/*
* The last unlock must be an atomic unlock and wait
* on the owner of page.
*/
if (pg->uobject) {
/* Owner of page is UVM object. */
uvmfault_unlockall(ufi, amap, NULL);
rwsleep_nsec(pg, pg->uobject->vmobjlock,
PVM | PNORELOCK, "anonget1", INFSLP);
} else {
/* Owner of page is anon. */
uvmfault_unlockall(ufi, NULL, NULL);
rwsleep_nsec(pg, anon->an_lock, PVM | PNORELOCK,
"anonget2", INFSLP);
}
} else {
/*
* No page, therefore allocate one.
*/
pg = uvm_pagealloc(NULL, 0, anon, 0);
if (pg == NULL) {
/* Out of memory. Wait a little. */
uvmfault_unlockall(ufi, amap, NULL);
counters_inc(uvmexp_counters, flt_noram);
uvm_wait("flt_noram1");
} else {
/* PG_BUSY bit is set. */
we_own = TRUE;
uvmfault_unlockall(ufi, amap, NULL);
/*
* Pass a PG_BUSY+PG_FAKE+PG_CLEAN page into
* the uvm_swap_get() function with all data
* structures unlocked. Note that it is OK
* to read an_swslot here, because we hold
* PG_BUSY on the page.
*/
counters_inc(uvmexp_counters, pageins);
error = uvm_swap_get(pg, anon->an_swslot,
PGO_SYNCIO);
/*
* We clean up after the I/O below in the
* 'we_own' case.
*/
}
}
/*
* Re-lock the map and anon.
*/
locked = uvmfault_relock(ufi);
if (locked || we_own) {
rw_enter(anon->an_lock, RW_WRITE);
}
/*
* If we own the page (i.e. we set PG_BUSY), then we need
* to clean up after the I/O. There are three cases to
* consider:
*
* 1) Page was released during I/O: free anon and ReFault.
* 2) I/O not OK. Free the page and cause the fault to fail.
* 3) I/O OK! Activate the page and sync with the non-we_own
* case (i.e. drop anon lock if not locked).
*/
if (we_own) {
if (pg->pg_flags & PG_WANTED) {
wakeup(pg);
}
/*
* if we were RELEASED during I/O, then our anon is
* no longer part of an amap. we need to free the
* anon and try again.
*/
if (pg->pg_flags & PG_RELEASED) {
pmap_page_protect(pg, PROT_NONE);
KASSERT(anon->an_ref == 0);
/*
* Released while we had unlocked amap.
*/
if (locked)
uvmfault_unlockall(ufi, NULL, NULL);
uvm_anon_release(anon); /* frees page for us */
counters_inc(uvmexp_counters, flt_pgrele);
return (VM_PAGER_REFAULT); /* refault! */
}
if (error != VM_PAGER_OK) {
KASSERT(error != VM_PAGER_PEND);
/* remove page from anon */
anon->an_page = NULL;
/*
* Remove the swap slot from the anon and
* mark the anon as having no real slot.
* Do not free the swap slot, thus preventing
* it from being used again.
*/
uvm_swap_markbad(anon->an_swslot, 1);
anon->an_swslot = SWSLOT_BAD;
/*
* Note: page was never !PG_BUSY, so it
* cannot be mapped and thus no need to
* pmap_page_protect() it.
*/
uvm_lock_pageq();
uvm_pagefree(pg);
uvm_unlock_pageq();
if (locked) {
uvmfault_unlockall(ufi, NULL, NULL);
}
rw_exit(anon->an_lock);
return (VM_PAGER_ERROR);
}
/*
* We have successfully read the page, activate it.
*/
pmap_clear_modify(pg);
uvm_lock_pageq();
uvm_pageactivate(pg);
uvm_unlock_pageq();
atomic_clearbits_int(&pg->pg_flags,
PG_WANTED|PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
}
/*
* We were not able to re-lock the map - restart the fault.
*/
if (!locked) {
if (we_own) {
rw_exit(anon->an_lock);
}
return (VM_PAGER_REFAULT);
}
/*
* Verify that no one has touched the amap and moved
* the anon on us.
*/
if (ufi != NULL && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start) != anon) {
uvmfault_unlockall(ufi, amap, NULL);
return (VM_PAGER_REFAULT);
}
/*
* Retry..
*/
counters_inc(uvmexp_counters, flt_anretry);
continue;
}
/*NOTREACHED*/
}
/*
* Update statistics after fault resolution.
* - maxrss
*/
void
uvmfault_update_stats(struct uvm_faultinfo *ufi)
{
struct vm_map *map;
struct proc *p;
vsize_t res;
map = ufi->orig_map;
/*
* If this is a nested pmap (eg, a virtual machine pmap managed
* by vmm(4) on amd64/i386), don't do any updating, just return.
*
* pmap_nested() on other archs is #defined to 0, so this is a
* no-op.
*/
if (pmap_nested(map->pmap))
return;
/* Update the maxrss for the process. */
if (map->flags & VM_MAP_ISVMSPACE) {
p = curproc;
KASSERT(p != NULL && &p->p_vmspace->vm_map == map);
res = pmap_resident_count(map->pmap);
/* Convert res from pages to kilobytes. */
res <<= (PAGE_SHIFT - 10);
if (p->p_ru.ru_maxrss < res)
p->p_ru.ru_maxrss = res;
}
}
/*
* F A U L T - m a i n e n t r y p o i n t
*/
/*
* uvm_fault: page fault handler
*
* => called from MD code to resolve a page fault
* => VM data structures usually should be unlocked. however, it is
* possible to call here with the main map locked if the caller
* gets a write lock, sets it recursive, and then calls us (c.f.
* uvm_map_pageable). this should be avoided because it keeps
* the map locked off during I/O.
* => MUST NEVER BE CALLED IN INTERRUPT CONTEXT
*/
#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
~PROT_WRITE : PROT_MASK)
struct uvm_faultctx {
/*
* the following members are set up by uvm_fault_check() and
* read-only after that.
*/
vm_prot_t enter_prot;
vm_prot_t access_type;
vaddr_t startva;
int npages;
int centeridx;
boolean_t narrow;
boolean_t wired;
paddr_t pa_flags;
};
int uvm_fault_check(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon ***);
int uvm_fault_upper(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon **, vm_fault_t);
boolean_t uvm_fault_upper_lookup(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_anon **, struct vm_page **);
int uvm_fault_lower(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_page **, vm_fault_t);
int
uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type,
vm_prot_t access_type)
{
struct uvm_faultinfo ufi;
struct uvm_faultctx flt;
boolean_t shadowed;
struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
struct vm_page *pages[UVM_MAXRANGE];
int error;
counters_inc(uvmexp_counters, faults);
TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
/*
* init the IN parameters in the ufi
*/
ufi.orig_map = orig_map;
ufi.orig_rvaddr = trunc_page(vaddr);
ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */
if (fault_type == VM_FAULT_WIRE)
flt.narrow = TRUE; /* don't look for neighborhood
* pages on wire */
else
flt.narrow = FALSE; /* normal fault */
flt.access_type = access_type;
error = ERESTART;
while (error == ERESTART) { /* ReFault: */
anons = anons_store;
error = uvm_fault_check(&ufi, &flt, &anons);
if (error != 0)
continue;
/* True if there is an anon at the faulting address */
shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
if (shadowed == TRUE) {
/* case 1: fault on an anon in our amap */
error = uvm_fault_upper(&ufi, &flt, anons, fault_type);
} else {
struct uvm_object *uobj = ufi.entry->object.uvm_obj;
/*
* if the desired page is not shadowed by the amap and
* we have a backing object, then we check to see if
* the backing object would prefer to handle the fault
* itself (rather than letting us do it with the usual
* pgo_get hook). the backing object signals this by
* providing a pgo_fault routine.
*/
if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
KERNEL_LOCK();
rw_enter(uobj->vmobjlock, RW_WRITE);
error = uobj->pgops->pgo_fault(&ufi,
flt.startva, pages, flt.npages,
flt.centeridx, fault_type, flt.access_type,
PGO_LOCKED);
KERNEL_UNLOCK();
if (error == VM_PAGER_OK)
error = 0;
else if (error == VM_PAGER_REFAULT)
error = ERESTART;
else
error = EACCES;
} else {
/* case 2: fault on backing obj or zero fill */
error = uvm_fault_lower(&ufi, &flt, pages,
fault_type);
}
}
}
return error;
}
/*
* uvm_fault_check: check prot, handle needs-copy, etc.
*
* 1. lookup entry.
* 2. check protection.
* 3. adjust fault condition (mainly for simulated fault).
* 4. handle needs-copy (lazy amap copy).
* 5. establish range of interest for neighbor fault (aka pre-fault).
* 6. look up anons (if amap exists).
* 7. flush pages (if MADV_SEQUENTIAL)
*
* => called with nothing locked.
* => if we fail (result != 0) we unlock everything.
* => initialize/adjust many members of flt.
*/
int
uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon ***ranons)
{
struct vm_amap *amap;
struct uvm_object *uobj;
int nback, nforw;
/*
* lookup and lock the maps
*/
if (uvmfault_lookup(ufi, FALSE) == FALSE) {
return EFAULT;
}
/* locked: maps(read) */
#ifdef DIAGNOSTIC
if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0)
panic("uvm_fault: fault on non-pageable map (%p, 0x%lx)",
ufi->map, ufi->orig_rvaddr);
#endif
/*
* check protection
*/
if ((ufi->entry->protection & flt->access_type) != flt->access_type) {
uvmfault_unlockmaps(ufi, FALSE);
return EACCES;
}
/*
* "enter_prot" is the protection we want to enter the page in at.
* for certain pages (e.g. copy-on-write pages) this protection can
* be more strict than ufi->entry->protection. "wired" means either
* the entry is wired or we are fault-wiring the pg.
*/
flt->enter_prot = ufi->entry->protection;
flt->pa_flags = UVM_ET_ISWC(ufi->entry) ? PMAP_WC : 0;
flt->wired = VM_MAPENT_ISWIRED(ufi->entry) || (flt->narrow == TRUE);
if (flt->wired)
flt->access_type = flt->enter_prot; /* full access for wired */
/* handle "needs_copy" case. */
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
if ((flt->access_type & PROT_WRITE) ||
(ufi->entry->object.uvm_obj == NULL)) {
/* need to clear */
uvmfault_unlockmaps(ufi, FALSE);
uvmfault_amapcopy(ufi);
counters_inc(uvmexp_counters, flt_amcopy);
return ERESTART;
} else {
/*
* ensure that we pmap_enter page R/O since
* needs_copy is still true
*/
flt->enter_prot &= ~PROT_WRITE;
}
}
/*
* identify the players
*/
amap = ufi->entry->aref.ar_amap; /* upper layer */
uobj = ufi->entry->object.uvm_obj; /* lower layer */
/*
* check for a case 0 fault. if nothing backing the entry then
* error now.
*/
if (amap == NULL && uobj == NULL) {
uvmfault_unlockmaps(ufi, FALSE);
return EFAULT;
}
/*
* for a case 2B fault waste no time on adjacent pages because
* they are likely already entered.
*/
if (uobj != NULL && amap != NULL &&
(flt->access_type & PROT_WRITE) != 0) {
/* wide fault (!narrow) */
flt->narrow = TRUE;
}
/*
* establish range of interest based on advice from mapper
* and then clip to fit map entry. note that we only want
* to do this the first time through the fault. if we
* ReFault we will disable this by setting "narrow" to true.
*/
if (flt->narrow == FALSE) {
/* wide fault (!narrow) */
nback = min(uvmadvice[ufi->entry->advice].nback,
(ufi->orig_rvaddr - ufi->entry->start) >> PAGE_SHIFT);
flt->startva = ufi->orig_rvaddr - ((vsize_t)nback << PAGE_SHIFT);
nforw = min(uvmadvice[ufi->entry->advice].nforw,
((ufi->entry->end - ufi->orig_rvaddr) >> PAGE_SHIFT) - 1);
/*
* note: "-1" because we don't want to count the
* faulting page as forw
*/
flt->npages = nback + nforw + 1;
flt->centeridx = nback;
flt->narrow = TRUE; /* ensure only once per-fault */
} else {
/* narrow fault! */
nback = nforw = 0;
flt->startva = ufi->orig_rvaddr;
flt->npages = 1;
flt->centeridx = 0;
}
/*
* if we've got an amap then lock it and extract current anons.
*/
if (amap) {
amap_lock(amap);
amap_lookups(&ufi->entry->aref,
flt->startva - ufi->entry->start, *ranons, flt->npages);
} else {
*ranons = NULL; /* to be safe */
}
/*
* for MADV_SEQUENTIAL mappings we want to deactivate the back pages
* now and then forget about them (for the rest of the fault).
*/
if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) {
/* flush back-page anons? */
if (amap)
uvmfault_anonflush(*ranons, nback);
/*
* flush object?
*/
if (uobj) {
voff_t uoff;
uoff = (flt->startva - ufi->entry->start) + ufi->entry->offset;
rw_enter(uobj->vmobjlock, RW_WRITE);
(void) uobj->pgops->pgo_flush(uobj, uoff, uoff +
((vsize_t)nback << PAGE_SHIFT), PGO_DEACTIVATE);
rw_exit(uobj->vmobjlock);
}
/* now forget about the backpages */
if (amap)
*ranons += nback;
flt->startva += ((vsize_t)nback << PAGE_SHIFT);
flt->npages -= nback;
flt->centeridx = 0;
}
return 0;
}
/*
* uvm_fault_upper_lookup: look up existing h/w mapping and amap.
*
* iterate range of interest:
* 1. check if h/w mapping exists. if yes, we don't care
* 2. check if anon exists. if not, page is lower.
* 3. if anon exists, enter h/w mapping for neighbors.
*
* => called with amap locked (if exists).
*/
boolean_t
uvm_fault_upper_lookup(struct uvm_faultinfo *ufi,
const struct uvm_faultctx *flt, struct vm_anon **anons,
struct vm_page **pages)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct vm_anon *anon;
boolean_t shadowed;
vaddr_t currva;
paddr_t pa;
int lcv;
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_write_held(amap->am_lock));
/*
* map in the backpages and frontpages we found in the amap in hopes
* of preventing future faults. we also init the pages[] array as
* we go.
*/
currva = flt->startva;
shadowed = FALSE;
for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
/*
* don't play with VAs that are already mapped
* (except for center)
*/
if (lcv != flt->centeridx &&
pmap_extract(ufi->orig_map->pmap, currva, &pa)) {
pages[lcv] = PGO_DONTCARE;
continue;
}
/*
* unmapped or center page. check if any anon at this level.
*/
if (amap == NULL || anons[lcv] == NULL) {
pages[lcv] = NULL;
continue;
}
/*
* check for present page and map if possible.
*/
pages[lcv] = PGO_DONTCARE;
if (lcv == flt->centeridx) { /* save center for later! */
shadowed = TRUE;
continue;
}
anon = anons[lcv];
KASSERT(anon->an_lock == amap->am_lock);
if (anon->an_page &&
(anon->an_page->pg_flags & (PG_RELEASED|PG_BUSY)) == 0) {
uvm_lock_pageq();
uvm_pageactivate(anon->an_page); /* reactivate */
uvm_unlock_pageq();
counters_inc(uvmexp_counters, flt_namap);
/*
* Since this isn't the page that's actually faulting,
* ignore pmap_enter() failures; it's not critical
* that we enter these right now.
*/
(void) pmap_enter(ufi->orig_map->pmap, currva,
VM_PAGE_TO_PHYS(anon->an_page) | flt->pa_flags,
(anon->an_ref > 1) ?
(flt->enter_prot & ~PROT_WRITE) : flt->enter_prot,
PMAP_CANFAIL |
(VM_MAPENT_ISWIRED(ufi->entry) ? PMAP_WIRED : 0));
}
}
if (flt->npages > 1)
pmap_update(ufi->orig_map->pmap);
return shadowed;
}
/*
* uvm_fault_upper: handle upper fault.
*
* 1. acquire anon lock.
* 2. get anon. let uvmfault_anonget do the dirty work.
* 3. if COW, promote data to new anon
* 4. enter h/w mapping
*/
int
uvm_fault_upper(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon **anons, vm_fault_t fault_type)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct vm_anon *oanon, *anon = anons[flt->centeridx];
struct vm_page *pg = NULL;
int error, ret;
/* locked: maps(read), amap, anon */
KASSERT(rw_write_held(amap->am_lock));
KASSERT(anon->an_lock == amap->am_lock);
/*
* no matter if we have case 1A or case 1B we are going to need to
* have the anon's memory resident. ensure that now.
*/
/*
* let uvmfault_anonget do the dirty work.
* if it fails (!OK) it will unlock everything for us.
* if it succeeds, locks are still valid and locked.
* also, if it is OK, then the anon's page is on the queues.
* if the page is on loan from a uvm_object, then anonget will
* lock that object for us if it does not fail.
*/
error = uvmfault_anonget(ufi, amap, anon);
switch (error) {
case VM_PAGER_OK:
break;
case VM_PAGER_REFAULT:
return ERESTART;
case VM_PAGER_ERROR:
/*
* An error occurred while trying to bring in the
* page -- this is the only error we return right
* now.
*/
return EACCES; /* XXX */
default:
#ifdef DIAGNOSTIC
panic("uvm_fault: uvmfault_anonget -> %d", error);
#else
return EACCES;
#endif
}
KASSERT(rw_write_held(amap->am_lock));
KASSERT(anon->an_lock == amap->am_lock);
/*
* if we are case 1B then we will need to allocate a new blank
* anon to transfer the data into. note that we have a lock
* on anon, so no one can busy or release the page until we are done.
* also note that the ref count can't drop to zero here because
* it is > 1 and we are only dropping one ref.
*
* in the (hopefully very rare) case that we are out of RAM we
* will unlock, wait for more RAM, and refault.
*
* if we are out of anon VM we wait for RAM to become available.
*/
if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
counters_inc(uvmexp_counters, flt_acow);
oanon = anon; /* oanon = old */
anon = uvm_analloc();
if (anon) {
anon->an_lock = amap->am_lock;
pg = uvm_pagealloc(NULL, 0, anon, 0);
}
/* check for out of RAM */
if (anon == NULL || pg == NULL) {
uvmfault_unlockall(ufi, amap, NULL);
if (anon == NULL)
counters_inc(uvmexp_counters, flt_noanon);
else {
anon->an_lock = NULL;
anon->an_ref--;
uvm_anfree(anon);
counters_inc(uvmexp_counters, flt_noram);
}
if (uvm_swapisfull())
return ENOMEM;
/* out of RAM, wait for more */
if (anon == NULL)
uvm_anwait();
else
uvm_wait("flt_noram3");
return ERESTART;
}
/* got all resources, replace anon with nanon */
uvm_pagecopy(oanon->an_page, pg); /* pg now !PG_CLEAN */
/* un-busy! new page */
atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
ret = amap_add(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start, anon, 1);
KASSERT(ret == 0);
/* deref: can not drop to zero here by defn! */
oanon->an_ref--;
#if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW)
/*
* If there are multiple threads, either uvm or the
* pmap has to make sure no threads see the old RO
* mapping once any have seen the new RW mapping.
* uvm does it by inserting the new mapping RO and
* letting it fault again.
* This is only a problem on MP systems.
*/
if (P_HASSIBLING(curproc)) {
flt->enter_prot &= ~PROT_WRITE;
flt->access_type &= ~PROT_WRITE;
}
#endif
/*
* note: anon is _not_ locked, but we have the sole reference
* to it from the amap.
* thus, no one can get at it until we are done with it.
*/
} else {
counters_inc(uvmexp_counters, flt_anon);
oanon = anon;
pg = anon->an_page;
if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */
flt->enter_prot = flt->enter_prot & ~PROT_WRITE;
}
/*
* now map the page in.
*/
if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
VM_PAGE_TO_PHYS(pg) | flt->pa_flags, flt->enter_prot,
flt->access_type | PMAP_CANFAIL | (flt->wired ? PMAP_WIRED : 0)) != 0) {
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
*
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/
uvmfault_unlockall(ufi, amap, NULL);
if (uvm_swapisfull()) {
/* XXX instrumentation */
return ENOMEM;
}
/* XXX instrumentation */
uvm_wait("flt_pmfail1");
return ERESTART;
}
/*
* ... update the page queues.
*/
uvm_lock_pageq();
if (fault_type == VM_FAULT_WIRE) {
uvm_pagewire(pg);
/*
* since the now-wired page cannot be paged out,
* release its swap resources for others to use.
* since an anon with no swap cannot be PG_CLEAN,
* clear its clean flag now.
*/
atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
uvm_anon_dropswap(anon);
} else {
/* activate it */
uvm_pageactivate(pg);
}
uvm_unlock_pageq();
/*
* done case 1! finish up by unlocking everything and returning success
*/
uvmfault_unlockall(ufi, amap, NULL);
pmap_update(ufi->orig_map->pmap);
return 0;
}
/*
* uvm_fault_lower_lookup: look up on-memory uobj pages.
*
* 1. get on-memory pages.
* 2. if failed, give up (get only center page later).
* 3. if succeeded, enter h/w mapping of neighbor pages.
*/
struct vm_page *
uvm_fault_lower_lookup(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_page **pages)
{
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
struct vm_page *uobjpage = NULL;
int lcv, gotpages;
vaddr_t currva;
rw_enter(uobj->vmobjlock, RW_WRITE);
counters_inc(uvmexp_counters, flt_lget);
gotpages = flt->npages;
(void) uobj->pgops->pgo_get(uobj,
ufi->entry->offset + (flt->startva - ufi->entry->start),
pages, &gotpages, flt->centeridx,
flt->access_type & MASK(ufi->entry), ufi->entry->advice,
PGO_LOCKED);
/*
* check for pages to map, if we got any
*/
if (gotpages == 0) {
return NULL;
}
currva = flt->startva;
for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
if (pages[lcv] == NULL ||
pages[lcv] == PGO_DONTCARE)
continue;
KASSERT((pages[lcv]->pg_flags & PG_RELEASED) == 0);
/*
* if center page is resident and not
* PG_BUSY, then pgo_get made it PG_BUSY
* for us and gave us a handle to it.
* remember this page as "uobjpage."
* (for later use).
*/
if (lcv == flt->centeridx) {
uobjpage = pages[lcv];
continue;
}
/*
* note: calling pgo_get with locked data
* structures returns us pages which are
* neither busy nor released, so we don't
* need to check for this. we can just
* directly enter the page (after moving it
* to the head of the active queue [useful?]).
*/
uvm_lock_pageq();
uvm_pageactivate(pages[lcv]); /* reactivate */
uvm_unlock_pageq();
counters_inc(uvmexp_counters, flt_nomap);
/*
* Since this page isn't the page that's
* actually faulting, ignore pmap_enter()
* failures; it's not critical that we
* enter these right now.
*/
(void) pmap_enter(ufi->orig_map->pmap, currva,
VM_PAGE_TO_PHYS(pages[lcv]) | flt->pa_flags,
flt->enter_prot & MASK(ufi->entry),
PMAP_CANFAIL |
(flt->wired ? PMAP_WIRED : 0));
/*
* NOTE: page can't be PG_WANTED because
* we've held the lock the whole time
* we've had the handle.
*/
atomic_clearbits_int(&pages[lcv]->pg_flags, PG_BUSY);
UVM_PAGE_OWN(pages[lcv], NULL);
}
pmap_update(ufi->orig_map->pmap);
return uobjpage;
}
/*
* uvm_fault_lower: handle lower fault.
*
*/
int
uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_page **pages, vm_fault_t fault_type)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
boolean_t promote, locked;
int result;
struct vm_page *uobjpage, *pg = NULL;
struct vm_anon *anon = NULL;
voff_t uoff;
/*
* now, if the desired page is not shadowed by the amap and we have
* a backing object that does not have a special fault routine, then
* we ask (with pgo_get) the object for resident pages that we care
* about and attempt to map them in. we do not let pgo_get block
* (PGO_LOCKED).
*/
if (uobj == NULL) {
/* zero fill; don't care about neighbor pages */
uobjpage = NULL;
} else {
uobjpage = uvm_fault_lower_lookup(ufi, flt, pages);
}
/*
* note that at this point we are done with any front or back pages.
* we are now going to focus on the center page (i.e. the one we've
* faulted on). if we have faulted on the bottom (uobj)
* layer [i.e. case 2] and the page was both present and available,
* then we've got a pointer to it as "uobjpage" and we've already
* made it BUSY.
*/
/*
* locked:
*/
KASSERT(amap == NULL ||
rw_write_held(amap->am_lock));
KASSERT(uobj == NULL ||
rw_write_held(uobj->vmobjlock));
/*
* note that uobjpage can not be PGO_DONTCARE at this point. we now
* set uobjpage to PGO_DONTCARE if we are doing a zero fill. if we
* have a backing object, check and see if we are going to promote
* the data up to an anon during the fault.
*/
if (uobj == NULL) {
uobjpage = PGO_DONTCARE;
promote = TRUE; /* always need anon here */
} else {
KASSERT(uobjpage != PGO_DONTCARE);
promote = (flt->access_type & PROT_WRITE) &&
UVM_ET_ISCOPYONWRITE(ufi->entry);
}
/*
* if uobjpage is not null then we do not need to do I/O to get the
* uobjpage.
*
* if uobjpage is null, then we need to ask the pager to
* get the data for us. once we have the data, we need to reverify
* the state of the world. we are currently not holding any resources.
*/
if (uobjpage) {
/* update rusage counters */
curproc->p_ru.ru_minflt++;
} else {
int gotpages;
/* update rusage counters */
curproc->p_ru.ru_majflt++;
uvmfault_unlockall(ufi, amap, NULL);
counters_inc(uvmexp_counters, flt_get);
gotpages = 1;
uoff = (ufi->orig_rvaddr - ufi->entry->start) + ufi->entry->offset;
result = uobj->pgops->pgo_get(uobj, uoff, &uobjpage, &gotpages,
0, flt->access_type & MASK(ufi->entry), ufi->entry->advice,
PGO_SYNCIO);
/*
* recover from I/O
*/
if (result != VM_PAGER_OK) {
KASSERT(result != VM_PAGER_PEND);
if (result == VM_PAGER_AGAIN) {
tsleep_nsec(&nowake, PVM, "fltagain2",
MSEC_TO_NSEC(5));
return ERESTART;
}
if (!UVM_ET_ISNOFAULT(ufi->entry))
return (EIO);
uobjpage = PGO_DONTCARE;
uobj = NULL;
promote = TRUE;
}
/* re-verify the state of the world. */
locked = uvmfault_relock(ufi);
if (locked && amap != NULL)
amap_lock(amap);
/* might be changed */
if (uobjpage != PGO_DONTCARE) {
uobj = uobjpage->uobject;
rw_enter(uobj->vmobjlock, RW_WRITE);
}
/*
* Re-verify that amap slot is still free. if there is
* a problem, we clean up.
*/
if (locked && amap && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start)) {
if (locked)
uvmfault_unlockall(ufi, amap, NULL);
locked = FALSE;
}
/* didn't get the lock? release the page and retry. */
if (locked == FALSE && uobjpage != PGO_DONTCARE) {
uvm_lock_pageq();
/* make sure it is in queues */
uvm_pageactivate(uobjpage);
uvm_unlock_pageq();
if (uobjpage->pg_flags & PG_WANTED)
/* still holding object lock */
wakeup(uobjpage);
atomic_clearbits_int(&uobjpage->pg_flags,
PG_BUSY|PG_WANTED);
UVM_PAGE_OWN(uobjpage, NULL);
}
if (locked == FALSE) {
if (uobjpage != PGO_DONTCARE)
rw_exit(uobj->vmobjlock);
return ERESTART;
}
/*
* we have the data in uobjpage which is PG_BUSY
*/
}
/*
* notes:
* - at this point uobjpage can not be NULL
* - at this point uobjpage could be PG_WANTED (handle later)
*/
if (promote == FALSE) {
/*
* we are not promoting. if the mapping is COW ensure that we
* don't give more access than we should (e.g. when doing a read
* fault on a COPYONWRITE mapping we want to map the COW page in
* R/O even though the entry protection could be R/W).
*
* set "pg" to the page we want to map in (uobjpage, usually)
*/
counters_inc(uvmexp_counters, flt_obj);
if (UVM_ET_ISCOPYONWRITE(ufi->entry))
flt->enter_prot &= ~PROT_WRITE;
pg = uobjpage; /* map in the actual object */
/* assert(uobjpage != PGO_DONTCARE) */
/*
* we are faulting directly on the page.
*/
} else {
/*
* if we are going to promote the data to an anon we
* allocate a blank anon here and plug it into our amap.
*/
#ifdef DIAGNOSTIC
if (amap == NULL)
panic("uvm_fault: want to promote data, but no anon");
#endif
anon = uvm_analloc();
if (anon) {
/*
* In `Fill in data...' below, if
* uobjpage == PGO_DONTCARE, we want
* a zero'd, dirty page, so have
* uvm_pagealloc() do that for us.
*/
anon->an_lock = amap->am_lock;
pg = uvm_pagealloc(NULL, 0, anon,
(uobjpage == PGO_DONTCARE) ? UVM_PGA_ZERO : 0);
}
/*
* out of memory resources?
*/
if (anon == NULL || pg == NULL) {
/*
* arg! must unbusy our page and fail or sleep.
*/
if (uobjpage != PGO_DONTCARE) {
uvm_lock_pageq();
uvm_pageactivate(uobjpage);
uvm_unlock_pageq();
if (uobjpage->pg_flags & PG_WANTED)
wakeup(uobjpage);
atomic_clearbits_int(&uobjpage->pg_flags,
PG_BUSY|PG_WANTED);
UVM_PAGE_OWN(uobjpage, NULL);
}
/* unlock and fail ... */
uvmfault_unlockall(ufi, amap, uobj);
if (anon == NULL)
counters_inc(uvmexp_counters, flt_noanon);
else {
anon->an_lock = NULL;
anon->an_ref--;
uvm_anfree(anon);
counters_inc(uvmexp_counters, flt_noram);
}
if (uvm_swapisfull())
return (ENOMEM);
/* out of RAM, wait for more */
if (anon == NULL)
uvm_anwait();
else
uvm_wait("flt_noram5");
return ERESTART;
}
/*
* fill in the data
*/
if (uobjpage != PGO_DONTCARE) {
counters_inc(uvmexp_counters, flt_prcopy);
/* copy page [pg now dirty] */
uvm_pagecopy(uobjpage, pg);
/*
* promote to shared amap? make sure all sharing
* procs see it
*/
if ((amap_flags(amap) & AMAP_SHARED) != 0) {
pmap_page_protect(uobjpage, PROT_NONE);
}
/* dispose of uobjpage. drop handle to uobj as well. */
if (uobjpage->pg_flags & PG_WANTED)
wakeup(uobjpage);
atomic_clearbits_int(&uobjpage->pg_flags,
PG_BUSY|PG_WANTED);
UVM_PAGE_OWN(uobjpage, NULL);
uvm_lock_pageq();
uvm_pageactivate(uobjpage);
uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
uobj = NULL;
} else {
counters_inc(uvmexp_counters, flt_przero);
/*
* Page is zero'd and marked dirty by uvm_pagealloc()
* above.
*/
}
if (amap_add(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start, anon, 0)) {
uvmfault_unlockall(ufi, amap, uobj);
uvm_anfree(anon);
counters_inc(uvmexp_counters, flt_noamap);
if (uvm_swapisfull())
return (ENOMEM);
amap_populate(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start);
return ERESTART;
}
}
/* note: pg is either the uobjpage or the new page in the new anon */
/*
* all resources are present. we can now map it in and free our
* resources.
*/
if (amap == NULL)
KASSERT(anon == NULL);
else {
KASSERT(rw_write_held(amap->am_lock));
KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
}
if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
VM_PAGE_TO_PHYS(pg) | flt->pa_flags, flt->enter_prot,
flt->access_type | PMAP_CANFAIL | (flt->wired ? PMAP_WIRED : 0)) != 0) {
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
*
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/
if (pg->pg_flags & PG_WANTED)
wakeup(pg);
atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE|PG_WANTED);
UVM_PAGE_OWN(pg, NULL);
uvmfault_unlockall(ufi, amap, uobj);
if (uvm_swapisfull()) {
/* XXX instrumentation */
return (ENOMEM);
}
/* XXX instrumentation */
uvm_wait("flt_pmfail2");
return ERESTART;
}
if (fault_type == VM_FAULT_WIRE) {
uvm_lock_pageq();
uvm_pagewire(pg);
uvm_unlock_pageq();
if (pg->pg_flags & PQ_AOBJ) {
/*
* since the now-wired page cannot be paged out,
* release its swap resources for others to use.
* since an aobj page with no swap cannot be clean,
* mark it dirty now.
*
* use pg->uobject here. if the page is from a
* tmpfs vnode, the pages are backed by its UAO and
* not the vnode.
*/
KASSERT(uobj != NULL);
KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock);
atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
}
} else {
/* activate it */
uvm_lock_pageq();
uvm_pageactivate(pg);
uvm_unlock_pageq();
}
if (pg->pg_flags & PG_WANTED)
wakeup(pg);
atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE|PG_WANTED);
UVM_PAGE_OWN(pg, NULL);
uvmfault_unlockall(ufi, amap, uobj);
pmap_update(ufi->orig_map->pmap);
return (0);
}
/*
* uvm_fault_wire: wire down a range of virtual addresses in a map.
*
* => map may be read-locked by caller, but MUST NOT be write-locked.
* => if map is read-locked, any operations which may cause map to
* be write-locked in uvm_fault() must be taken care of by
* the caller. See uvm_map_pageable().
*/
int
uvm_fault_wire(vm_map_t map, vaddr_t start, vaddr_t end, vm_prot_t access_type)
{
vaddr_t va;
int rv;
/*
* now fault it in a page at a time. if the fault fails then we have
* to undo what we have done. note that in uvm_fault PROT_NONE
* is replaced with the max protection if fault_type is VM_FAULT_WIRE.
*/
for (va = start ; va < end ; va += PAGE_SIZE) {
rv = uvm_fault(map, va, VM_FAULT_WIRE, access_type);
if (rv) {
if (va != start) {
uvm_fault_unwire(map, start, va);
}
return (rv);
}
}
return (0);
}
/*
* uvm_fault_unwire(): unwire range of virtual space.
*/
void
uvm_fault_unwire(vm_map_t map, vaddr_t start, vaddr_t end)
{
vm_map_lock_read(map);
uvm_fault_unwire_locked(map, start, end);
vm_map_unlock_read(map);
}
/*
* uvm_fault_unwire_locked(): the guts of uvm_fault_unwire().
*
* => map must be at least read-locked.
*/
void
uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end)
{
vm_map_entry_t entry, oentry = NULL, next;
pmap_t pmap = vm_map_pmap(map);
vaddr_t va;
paddr_t pa;
struct vm_page *pg;
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
/*
* we assume that the area we are unwiring has actually been wired
* in the first place. this means that we should be able to extract
* the PAs from the pmap.
*/
/*
* find the beginning map entry for the region.
*/
KASSERT(start >= vm_map_min(map) && end <= vm_map_max(map));
if (uvm_map_lookup_entry(map, start, &entry) == FALSE)
panic("uvm_fault_unwire_locked: address not in map"); for (va = start; va < end ; va += PAGE_SIZE) { if (pmap_extract(pmap, va, &pa) == FALSE)
continue;
/*
* find the map entry for the current address.
*/
KASSERT(va >= entry->start);
while (entry && va >= entry->end) {
next = RBT_NEXT(uvm_map_addr, entry);
entry = next;
}
if (entry == NULL)
return;
if (va < entry->start)
continue;
/*
* lock it.
*/
if (entry != oentry) {
if (oentry != NULL) {
uvm_map_unlock_entry(oentry);
}
uvm_map_lock_entry(entry);
oentry = entry;
}
/*
* if the entry is no longer wired, tell the pmap.
*/
if (VM_MAPENT_ISWIRED(entry) == 0)
pmap_unwire(pmap, va);
pg = PHYS_TO_VM_PAGE(pa);
if (pg) {
uvm_lock_pageq();
uvm_pageunwire(pg);
uvm_unlock_pageq();
}
}
if (oentry != NULL) {
uvm_map_unlock_entry(oentry);
}
}
/*
* uvmfault_unlockmaps: unlock the maps
*/
void
uvmfault_unlockmaps(struct uvm_faultinfo *ufi, boolean_t write_locked)
{
/*
* ufi can be NULL when this isn't really a fault,
* but merely paging in anon data.
*/
if (ufi == NULL) {
return;
}
uvmfault_update_stats(ufi);
if (write_locked) {
vm_map_unlock(ufi->map);
} else {
vm_map_unlock_read(ufi->map);
}
}
/*
* uvmfault_unlockall: unlock everything passed in.
*
* => maps must be read-locked (not write-locked).
*/
void
uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap,
struct uvm_object *uobj)
{
if (uobj)
rw_exit(uobj->vmobjlock);
if (amap != NULL)
amap_unlock(amap);
uvmfault_unlockmaps(ufi, FALSE);
}
/*
* uvmfault_lookup: lookup a virtual address in a map
*
* => caller must provide a uvm_faultinfo structure with the IN
* params properly filled in
* => we will lookup the map entry (handling submaps) as we go
* => if the lookup is a success we will return with the maps locked
* => if "write_lock" is TRUE, we write_lock the map, otherwise we only
* get a read lock.
* => note that submaps can only appear in the kernel and they are
* required to use the same virtual addresses as the map they
* are referenced by (thus address translation between the main
* map and the submap is unnecessary).
*/
boolean_t
uvmfault_lookup(struct uvm_faultinfo *ufi, boolean_t write_lock)
{
vm_map_t tmpmap;
/*
* init ufi values for lookup.
*/
ufi->map = ufi->orig_map;
ufi->size = ufi->orig_size;
/*
* keep going down levels until we are done. note that there can
* only be two levels so we won't loop very long.
*/
while (1) {
if (ufi->orig_rvaddr < ufi->map->min_offset ||
ufi->orig_rvaddr >= ufi->map->max_offset)
return FALSE;
/* lock map */
if (write_lock) {
vm_map_lock(ufi->map);
} else {
vm_map_lock_read(ufi->map);
}
/* lookup */
if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr,
&ufi->entry)) {
uvmfault_unlockmaps(ufi, write_lock);
return FALSE;
}
/* reduce size if necessary */
if (ufi->entry->end - ufi->orig_rvaddr < ufi->size)
ufi->size = ufi->entry->end - ufi->orig_rvaddr;
/*
* submap? replace map with the submap and lookup again.
* note: VAs in submaps must match VAs in main map.
*/
if (UVM_ET_ISSUBMAP(ufi->entry)) {
tmpmap = ufi->entry->object.sub_map;
uvmfault_unlockmaps(ufi, write_lock);
ufi->map = tmpmap;
continue;
}
/*
* got it!
*/
ufi->mapv = ufi->map->timestamp;
return TRUE;
} /* while loop */
/*NOTREACHED*/
}
/*
* uvmfault_relock: attempt to relock the same version of the map
*
* => fault data structures should be unlocked before calling.
* => if a success (TRUE) maps will be locked after call.
*/
boolean_t
uvmfault_relock(struct uvm_faultinfo *ufi)
{
/*
* ufi can be NULL when this isn't really a fault,
* but merely paging in anon data.
*/
if (ufi == NULL) {
return TRUE;
}
counters_inc(uvmexp_counters, flt_relck);
/*
* relock map. fail if version mismatch (in which case nothing
* gets locked).
*/
vm_map_lock_read(ufi->map);
if (ufi->mapv != ufi->map->timestamp) {
vm_map_unlock_read(ufi->map);
return FALSE;
}
counters_inc(uvmexp_counters, flt_relckok);
return TRUE; /* got it! */
}
/* $OpenBSD: event.h,v 1.67 2022/03/31 01:41:22 millert Exp $ */
/*-
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: src/sys/sys/event.h,v 1.11 2001/02/24 01:41:31 jlemon Exp $
*/
#ifndef _SYS_EVENT_H_
#define _SYS_EVENT_H_
#define EVFILT_READ (-1)
#define EVFILT_WRITE (-2)
#define EVFILT_AIO (-3) /* attached to aio requests */
#define EVFILT_VNODE (-4) /* attached to vnodes */
#define EVFILT_PROC (-5) /* attached to struct process */
#define EVFILT_SIGNAL (-6) /* attached to struct process */
#define EVFILT_TIMER (-7) /* timers */
#define EVFILT_DEVICE (-8) /* devices */
#define EVFILT_EXCEPT (-9) /* exceptional conditions */
#define EVFILT_SYSCOUNT 9
#define EV_SET(kevp, a, b, c, d, e, f) do { \
struct kevent *__kevp = (kevp); \
(__kevp)->ident = (a); \
(__kevp)->filter = (b); \
(__kevp)->flags = (c); \
(__kevp)->fflags = (d); \
(__kevp)->data = (e); \
(__kevp)->udata = (f); \
} while(0)
struct kevent {
__uintptr_t ident; /* identifier for this event */
short filter; /* filter for event */
unsigned short flags; /* action flags for kqueue */
unsigned int fflags; /* filter flag value */
__int64_t data; /* filter data value */
void *udata; /* opaque user data identifier */
};
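/*
 * example (userland, illustrative only): register interest in read
 * readiness on a descriptor fd and wait for a single event.  error
 * handling is omitted for brevity.
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	(register the change)
 *	kevent(kq, NULL, 0, &kev, 1, NULL);	(block until it fires)
 */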
/* actions */
#define EV_ADD 0x0001 /* add event to kq (implies enable) */
#define EV_DELETE 0x0002 /* delete event from kq */
#define EV_ENABLE 0x0004 /* enable event */
#define EV_DISABLE 0x0008 /* disable event (not reported) */
/* flags */
#define EV_ONESHOT 0x0010 /* only report one occurrence */
#define EV_CLEAR 0x0020 /* clear event state after reporting */
#define EV_RECEIPT 0x0040 /* force EV_ERROR on success, data=0 */
#define EV_DISPATCH 0x0080 /* disable event after reporting */
#define EV_SYSFLAGS 0xf800 /* reserved by system */
#define EV_FLAG1 0x2000 /* filter-specific flag */
/* returned values */
#define EV_EOF 0x8000 /* EOF detected */
#define EV_ERROR 0x4000 /* error, data contains errno */
/*
* data/hint flags for EVFILT_{READ|WRITE}, shared with userspace
*/
#define NOTE_LOWAT 0x0001 /* low water mark */
#define NOTE_EOF 0x0002 /* return on EOF */
/*
* data/hint flags for EVFILT_EXCEPT, shared with userspace and with
* EVFILT_{READ|WRITE}
*/
#define NOTE_OOB 0x0004 /* OOB data on a socket */
/*
* data/hint flags for EVFILT_VNODE, shared with userspace
*/
#define NOTE_DELETE 0x0001 /* vnode was removed */
#define NOTE_WRITE 0x0002 /* data contents changed */
#define NOTE_EXTEND 0x0004 /* size increased */
#define NOTE_ATTRIB 0x0008 /* attributes changed */
#define NOTE_LINK 0x0010 /* link count changed */
#define NOTE_RENAME 0x0020 /* vnode was renamed */
#define NOTE_REVOKE 0x0040 /* vnode access was revoked */
#define NOTE_TRUNCATE 0x0080 /* vnode was truncated */
/*
* data/hint flags for EVFILT_PROC, shared with userspace
*/
#define NOTE_EXIT 0x80000000 /* process exited */
#define NOTE_FORK 0x40000000 /* process forked */
#define NOTE_EXEC 0x20000000 /* process exec'd */
#define NOTE_PCTRLMASK 0xf0000000 /* mask for hint bits */
#define NOTE_PDATAMASK 0x000fffff /* mask for pid */
/* additional flags for EVFILT_PROC */
#define NOTE_TRACK 0x00000001 /* follow across forks */
#define NOTE_TRACKERR 0x00000002 /* could not track child */
#define NOTE_CHILD 0x00000004 /* am a child process */
/* data/hint flags for EVFILT_DEVICE, shared with userspace */
#define NOTE_CHANGE 0x00000001 /* device change event */
/*
* This is currently visible to userland to work around broken
* programs which pull in <sys/proc.h> or <sys/selinfo.h>.
*/
#include <sys/queue.h>
struct klistops;
struct knote;
SLIST_HEAD(knlist, knote);
struct klist {
struct knlist kl_list;
const struct klistops *kl_ops;
void *kl_arg;
};
#ifdef _KERNEL
/* kernel-only flags */
#define __EV_SELECT 0x0800 /* match behavior of select */
#define __EV_POLL 0x1000 /* match behavior of poll */
#define __EV_HUP EV_FLAG1 /* device or socket disconnected */
#define EVFILT_MARKER 0xf /* placemarker for tailq */
/*
* hint flag for in-kernel use - must not equal any existing note
*/
#define NOTE_SUBMIT 0x01000000 /* initial knote submission */
#define KNOTE(list, hint) do { \
struct klist *__list = (list); \
if (!klist_empty(__list)) \
knote(__list, hint); \
} while (0)
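/*
 * typical use (sketch): a driver that maintains a klist of attached
 * knotes calls KNOTE() when its condition changes, e.g. on data arrival:
 *
 *	KNOTE(&sc->sc_rsel.si_note, 0);
 *
 * "sc" and "sc_rsel" stand in for a driver softc and its selinfo; the
 * hint argument is interpreted by the filter's f_event routine.
 */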
#define KN_HASHSIZE 64 /* XXX should be tunable */
/*
* Flag indicating hint is a signal. Used by EVFILT_SIGNAL, and also
* shared by EVFILT_PROC (all knotes attached to p->p_klist)
*/
#define NOTE_SIGNAL 0x08000000
/*
* = Event filter interface
*
* == .f_flags
*
* Defines properties of the event filter:
*
* - FILTEROP_ISFD Each knote of this filter is associated
* with a file descriptor.
*
* - FILTEROP_MPSAFE The kqueue subsystem can invoke .f_attach(),
* .f_detach(), .f_modify() and .f_process() without
* the kernel lock.
*
* == .f_attach()
*
* Attaches the knote to the object.
*
* == .f_detach()
*
* Detaches the knote from the object. The object must not use this knote
* for delivering events after this callback has returned.
*
* == .f_event()
*
* Notifies the filter about an event. Called through knote().
*
* == .f_modify()
*
* Modifies the knote with new state from the user.
*
* Returns non-zero if the knote has become active.
*
* == .f_process()
*
* Checks if the event is active and returns non-zero if the event should be
* returned to the user.
*
* If kev is non-NULL and the event is active, the callback should store
* the event's state in kev for delivery to the user.
*
* == Concurrency control
*
* The kqueue subsystem serializes calls of .f_attach(), .f_detach(),
* .f_modify() and .f_process().
*/
#define FILTEROP_ISFD 0x00000001 /* ident == filedescriptor */
#define FILTEROP_MPSAFE 0x00000002 /* safe without kernel lock */
struct filterops {
int f_flags;
int (*f_attach)(struct knote *kn);
void (*f_detach)(struct knote *kn);
int (*f_event)(struct knote *kn, long hint);
int (*f_modify)(struct kevent *kev, struct knote *kn);
int (*f_process)(struct knote *kn, struct kevent *kev);
};
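/*
 * a filter is normally published as a constant table (sketch; the
 * filt_foo* names are hypothetical):
 *
 *	const struct filterops foo_filtops = {
 *		.f_flags	= FILTEROP_ISFD,
 *		.f_attach	= filt_fooattach,
 *		.f_detach	= filt_foodetach,
 *		.f_event	= filt_fooevent,
 *	};
 *
 * filters that do their own locking can additionally set FILTEROP_MPSAFE
 * and provide .f_modify and .f_process callbacks.
 */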
/*
* Locking:
* I immutable after creation
* o object lock
* q kn_kq->kq_lock
*/
struct knote {
SLIST_ENTRY(knote) kn_link; /* for fd */
SLIST_ENTRY(knote) kn_selnext; /* for struct selinfo */
TAILQ_ENTRY(knote) kn_tqe;
struct kqueue *kn_kq; /* [I] which queue we are on */
struct kevent kn_kevent;
int kn_status; /* [q] */
int kn_sfflags; /* [o] saved filter flags */
__int64_t kn_sdata; /* [o] saved data field */
union {
struct file *p_fp; /* file data pointer */
struct process *p_process; /* process pointer */
} kn_ptr;
const struct filterops *kn_fop;
void *kn_hook; /* [o] */
unsigned int kn_pollid; /* [I] */
#define KN_ACTIVE 0x0001 /* event has been triggered */
#define KN_QUEUED 0x0002 /* event is on queue */
#define KN_DISABLED 0x0004 /* event is disabled */
#define KN_DETACHED 0x0008 /* knote is detached */
#define KN_PROCESSING 0x0010 /* knote is being processed */
#define KN_WAITING 0x0020 /* waiting on processing */
#define kn_id kn_kevent.ident /* [I] */
#define kn_filter kn_kevent.filter /* [I] */
#define kn_flags kn_kevent.flags /* [o] */
#define kn_fflags kn_kevent.fflags /* [o] */
#define kn_data kn_kevent.data /* [o] */
#define kn_udata kn_kevent.udata /* [o] */
#define kn_fp kn_ptr.p_fp /* [o] */
};
struct klistops {
void (*klo_assertlk)(void *);
int (*klo_lock)(void *);
void (*klo_unlock)(void *, int);
};
struct kqueue_scan_state {
struct kqueue *kqs_kq; /* kqueue of this scan */
struct knote kqs_start; /* start marker */
struct knote kqs_end; /* end marker */
int kqs_nevent; /* number of events collected */
int kqs_queued; /* if set, end marker is
* in queue */
};
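/*
 * intended call sequence for the scan interface (sketch, based on the
 * prototypes declared below; the kevent(2) implementation is the real
 * consumer):
 *
 *	struct kqueue_scan_state scan;
 *
 *	kqueue_scan_setup(&scan, kq);
 *	nevents = kqueue_scan(&scan, maxevents, kevp, timeout, p, &error);
 *	kqueue_scan_finish(&scan);
 */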
struct mutex;
struct proc;
struct rwlock;
struct timespec;
extern const struct filterops sig_filtops;
extern const struct filterops dead_filtops;
extern const struct klistops socket_klistops;
extern void kqpoll_init(unsigned int);
extern void kqpoll_done(unsigned int);
extern void kqpoll_exit(void);
extern void knote(struct klist *list, long hint);
extern void knote_fdclose(struct proc *p, int fd);
extern void knote_processexit(struct process *);
extern void knote_assign(const struct kevent *, struct knote *);
extern void knote_submit(struct knote *, struct kevent *);
extern void kqueue_init(void);
extern void kqueue_init_percpu(void);
extern int kqueue_register(struct kqueue *kq, struct kevent *kev,
unsigned int pollid, struct proc *p);
extern int kqueue_scan(struct kqueue_scan_state *, int, struct kevent *,
struct timespec *, struct proc *, int *);
extern void kqueue_scan_setup(struct kqueue_scan_state *, struct kqueue *);
extern void kqueue_scan_finish(struct kqueue_scan_state *);
extern int filt_seltrue(struct knote *kn, long hint);
extern int seltrue_kqfilter(dev_t, struct knote *);
extern void klist_init(struct klist *, const struct klistops *, void *);
extern void klist_init_mutex(struct klist *, struct mutex *);
extern void klist_init_rwlock(struct klist *, struct rwlock *);
extern void klist_free(struct klist *);
extern void klist_insert(struct klist *, struct knote *);
extern void klist_insert_locked(struct klist *, struct knote *);
extern void klist_remove(struct klist *, struct knote *);
extern void klist_remove_locked(struct klist *, struct knote *);
extern void klist_invalidate(struct klist *);
static inline int
knote_modify_fn(const struct kevent *kev, struct knote *kn,
int (*f_event)(struct knote *, long))
{
knote_assign(kev, kn);
return ((*f_event)(kn, 0));
}
static inline int
knote_modify(const struct kevent *kev, struct knote *kn)
{
return (knote_modify_fn(kev, kn, kn->kn_fop->f_event));
}
static inline int
knote_process_fn(struct knote *kn, struct kevent *kev,
int (*f_event)(struct knote *, long))
{
int active;
/*
* If called from kqueue_scan(), skip f_event
* when EV_ONESHOT is set, to preserve old behaviour.
*/
if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
active = 1;
else
active = (*f_event)(kn, 0);
if (active)
knote_submit(kn, kev);
return (active);
}
static inline int
knote_process(struct knote *kn, struct kevent *kev)
{
return (knote_process_fn(kn, kev, kn->kn_fop->f_event));
}
static inline int
klist_empty(struct klist *klist)
{
return (SLIST_EMPTY(&klist->kl_list));
}
#else /* !_KERNEL */
#include <sys/cdefs.h>
struct timespec;
__BEGIN_DECLS
int kqueue(void);
int kevent(int kq, const struct kevent *changelist, int nchanges,
struct kevent *eventlist, int nevents,
const struct timespec *timeout);
__END_DECLS
#endif /* !_KERNEL */
#endif /* !_SYS_EVENT_H_ */
/* $OpenBSD: kern_acct.c,v 1.47 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_acct.c,v 1.42 1996/02/04 02:15:12 christos Exp $ */
/*-
* Copyright (c) 1994 Christopher G. Demetriou
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_acct.c 8.1 (Berkeley) 6/14/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/acct.h>
#include <sys/resourcevar.h>
#include <sys/tty.h>
#include <sys/kthread.h>
#include <sys/rwlock.h>
#include <sys/syscallargs.h>
/*
* The routines implemented in this file are described in:
* Leffler, et al.: The Design and Implementation of the 4.3BSD
 * UNIX Operating System (Addison-Wesley, 1989)
* on pages 62-63.
*
* Arguably, to simplify accounting operations, this mechanism should
* be replaced by one in which an accounting log file (similar to /dev/klog)
* is read by a user process, etc. However, that has its own problems.
*/
/*
* Internal accounting functions.
*/
comp_t encode_comp_t(u_long, u_long);
int acct_start(void);
void acct_thread(void *);
void acct_shutdown(void);
/*
* Accounting vnode pointer, and saved vnode pointer.
*/
struct vnode *acctp;
struct vnode *savacctp;
/*
* Lock protecting acctp and savacctp.
*/
struct rwlock acct_lock = RWLOCK_INITIALIZER("acctlk");
/*
* Values associated with enabling and disabling accounting
*/
int acctsuspend = 2; /* stop accounting when < 2% free space left */
int acctresume = 4; /* resume when free space risen to > 4% */
int acctrate = 15; /* delay (in seconds) between space checks */
struct proc *acct_proc;
/*
* Accounting system call. Written based on the specification and
* previous implementation done by Mark Tinguely.
*/
int
sys_acct(struct proc *p, void *v, register_t *retval)
{
struct sys_acct_args /* {
syscallarg(const char *) path;
} */ *uap = v;
struct nameidata nd;
int error;
/* Make sure that the caller is root. */
if ((error = suser(p)) != 0)
return (error);
/*
* If accounting is to be started to a file, open that file for
* writing and make sure it's 'normal'.
*/
if (SCARG(uap, path) != NULL) {
NDINIT(&nd, 0, 0, UIO_USERSPACE, SCARG(uap, path), p);
if ((error = vn_open(&nd, FWRITE|O_APPEND, 0)) != 0)
return (error);
VOP_UNLOCK(nd.ni_vp);
		if (nd.ni_vp->v_type != VREG) {
			vn_close(nd.ni_vp, FWRITE, p->p_ucred, p);
return (EACCES);
}
}
rw_enter_write(&acct_lock);
/*
* If accounting was previously enabled, kill the old space-watcher,
	 * close the file, and, if no new file was specified, leave.
*/
	if (acctp != NULL || savacctp != NULL) {
		wakeup(&acct_proc);
(void)vn_close((acctp != NULL ? acctp : savacctp), FWRITE,
p->p_ucred, p);
acctp = savacctp = NULL;
}
if (SCARG(uap, path) == NULL)
goto out;
/*
* Save the new accounting file vnode, and schedule the new
* free space watcher.
*/
acctp = nd.ni_vp;
	if ((error = acct_start()) != 0) {
		acctp = NULL;
(void)vn_close(nd.ni_vp, FWRITE, p->p_ucred, p);
}
out:
rw_exit_write(&acct_lock);
return (error);
}
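/*
 * Illustration only (not part of the original source): userland normally
 * toggles accounting through accton(8), which ends up in this system call,
 * roughly as sketched below ("/var/account/acct" is the conventional
 * accounting file):
 *
 *	if (acct("/var/account/acct") == -1)	(enable accounting)
 *		err(1, "acct");
 *	...
 *	acct(NULL);				(disable it again)
 */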
/*
 * Write out process accounting information on process exit.
 * The data to be written out are specified in Leffler, et al.,
 * and are enumerated below.  (They are also noted in the system
 * "acct.h" header file.)
*/
int
acct_process(struct proc *p)
{
struct acct acct;
struct process *pr = p->p_p;
struct rusage *r;
struct timespec booted, elapsed, realstart, st, tmp, uptime, ut;
int t;
struct vnode *vp;
int error = 0;
/* If accounting isn't enabled, don't bother */
if (acctp == NULL)
return (0);
rw_enter_read(&acct_lock);
/*
* Check the vnode again in case accounting got disabled while waiting
* for the lock.
*/
vp = acctp;
if (vp == NULL)
goto out;
/*
* Get process accounting information.
*/
/* (1) The name of the command that ran */
memcpy(acct.ac_comm, pr->ps_comm, sizeof acct.ac_comm);
/* (2) The amount of user and system time that was used */
calctsru(&pr->ps_tu, &ut, &st, NULL);
acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_nsec);
acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_nsec);
/* (3) The elapsed time the command ran (and its starting time) */
nanouptime(&uptime);
nanoboottime(&booted);
timespecadd(&booted, &pr->ps_start, &realstart);
acct.ac_btime = realstart.tv_sec;
timespecsub(&uptime, &pr->ps_start, &elapsed);
acct.ac_etime = encode_comp_t(elapsed.tv_sec, elapsed.tv_nsec);
/* (4) The average amount of memory used */
r = &p->p_ru;
timespecadd(&ut, &st, &tmp);
t = tmp.tv_sec * hz + tmp.tv_nsec / (1000 * tick);
if (t)
acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t;
else
acct.ac_mem = 0;
/* (5) The number of disk I/O operations done */
acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0);
/* (6) The UID and GID of the process */
acct.ac_uid = pr->ps_ucred->cr_ruid;
acct.ac_gid = pr->ps_ucred->cr_rgid;
/* (7) The terminal from which the process was started */
if ((pr->ps_flags & PS_CONTROLT) &&
pr->ps_pgrp->pg_session->s_ttyp)
acct.ac_tty = pr->ps_pgrp->pg_session->s_ttyp->t_dev;
else
acct.ac_tty = -1;
	/* (8) The boolean flags that tell how the process terminated or misbehaved. */
acct.ac_flag = pr->ps_acflag;
/* Extensions */
acct.ac_pid = pr->ps_pid;
/*
* Now, just write the accounting information to the file.
*/
error = vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct),
(off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT|IO_NOLIMIT,
p->p_ucred, NULL, p);
out:
rw_exit_read(&acct_lock);
return (error);
}
/*
 * encode_comp_t() converts a time given in seconds and nanoseconds
 * into ticks of 1/AHZ seconds.  The encoding is described in
* Leffler, et al., on page 63.
*/
#define MANTSIZE 13 /* 13 bit mantissa. */
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
comp_t
encode_comp_t(u_long s, u_long ns)
{
int exp, rnd;
exp = 0;
rnd = 0;
s *= AHZ;
s += ns / (1000000000 / AHZ); /* Maximize precision. */
while (s > MAXFRACT) {
rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */
s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
exp++;
}
/* If we need to round up, do it (and handle overflow correctly). */
if (rnd && (++s > MAXFRACT)) {
s >>= EXPSIZE;
exp++;
}
/* Clean it up and polish it off. */
exp <<= MANTSIZE; /* Shift the exponent into place */
exp += s; /* and add on the mantissa. */
return (exp);
}
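/*
 * Worked example (illustration only, assuming AHZ is 64 as defined in
 * <sys/acct.h>): encoding 1000 seconds gives s = 1000 * 64 = 64000 ticks.
 * Since 64000 exceeds MAXFRACT (8191), the mantissa is shifted right by
 * EXPSIZE once, leaving s = 8000 and exp = 1, so the stored value is
 * (1 << MANTSIZE) + 8000 = 16192.  Decoding gives 8000 * 8^1 = 64000
 * ticks, i.e. the original 1000 seconds.
 */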
int
acct_start(void)
{
/* Already running. */
if (acct_proc != NULL)
return (0);
return (kthread_create(acct_thread, NULL, &acct_proc, "acct"));
}
/*
* Periodically check the file system to see if accounting
* should be turned on or off. Beware the case where the vnode
* has been vgone()'d out from underneath us, e.g. when the file
* system containing the accounting file has been forcibly unmounted.
*/
void
acct_thread(void *arg)
{
struct statfs sb;
struct proc *p = curproc;
rw_enter_write(&acct_lock);
for (;;) {
if (savacctp != NULL) {
if (savacctp->v_type == VBAD) {
(void) vn_close(savacctp, FWRITE, NOCRED, p);
savacctp = NULL;
break;
}
(void)VFS_STATFS(savacctp->v_mount, &sb, NULL);
if (sb.f_bavail > acctresume * sb.f_blocks / 100) {
acctp = savacctp;
savacctp = NULL;
log(LOG_NOTICE, "Accounting resumed\n");
}
} else if (acctp != NULL) {
if (acctp->v_type == VBAD) {
(void) vn_close(acctp, FWRITE, NOCRED, p);
acctp = NULL;
break;
}
(void)VFS_STATFS(acctp->v_mount, &sb, NULL);
if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) {
savacctp = acctp;
acctp = NULL;
log(LOG_NOTICE, "Accounting suspended\n");
}
} else {
break;
}
rwsleep_nsec(&acct_proc, &acct_lock, PPAUSE, "acct",
SEC_TO_NSEC(acctrate));
}
acct_proc = NULL;
rw_exit_write(&acct_lock);
kthread_exit(0);
}
void
acct_shutdown(void)
{
struct proc *p = curproc;
rw_enter_write(&acct_lock);
if (acctp != NULL || savacctp != NULL) {
vn_close((acctp != NULL ? acctp : savacctp), FWRITE,
NOCRED, p);
acctp = savacctp = NULL;
}
rw_exit_write(&acct_lock);
}
/* $OpenBSD: uvm_anon.c,v 1.54 2021/03/26 13:40:05 mpi Exp $ */
/* $NetBSD: uvm_anon.c,v 1.10 2000/11/25 06:27:59 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_anon.c: uvm anon ops
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>
struct pool uvm_anon_pool;
void
uvm_anon_init(void)
{
pool_init(&uvm_anon_pool, sizeof(struct vm_anon), 0, IPL_MPFLOOR,
PR_WAITOK, "anonpl", NULL);
pool_sethiwat(&uvm_anon_pool, uvmexp.free / 16);
}
/*
* uvm_analloc: allocate a new anon.
*
* => anon will have no lock associated.
*/
struct vm_anon *
uvm_analloc(void)
{
struct vm_anon *anon;
anon = pool_get(&uvm_anon_pool, PR_NOWAIT);
	if (anon) {
		anon->an_lock = NULL;
anon->an_ref = 1;
anon->an_page = NULL;
anon->an_swslot = 0;
}
return anon;
}
/*
* uvm_anfree_list: free a single anon structure
*
* => anon must be removed from the amap (if anon was in an amap).
* => amap must be locked, if anon was owned by amap.
* => we may lock the pageq's.
*/
void
uvm_anfree_list(struct vm_anon *anon, struct pglist *pgl)
{
struct vm_page *pg = anon->an_page;
	KASSERT(anon->an_lock == NULL || rw_write_held(anon->an_lock));
	KASSERT(anon->an_ref == 0);
/*
* Dispose of the page, if it is resident.
*/
if (pg != NULL) {
KASSERT(anon->an_lock != NULL);
/*
		 * If the page is busy, mark it as PG_RELEASED so
		 * that uvm_anon_release(9) will release it later.
*/
if ((pg->pg_flags & PG_BUSY) != 0) {
atomic_setbits_int(&pg->pg_flags, PG_RELEASED);
rw_obj_hold(anon->an_lock);
return;
}
pmap_page_protect(pg, PROT_NONE);
if (pgl != NULL) {
/*
			 * Clean the page and put it on the pglist
			 * for later freeing.
*/
uvm_lock_pageq();
uvm_pageclean(pg);
uvm_unlock_pageq();
TAILQ_INSERT_HEAD(pgl, pg, pageq);
} else {
uvm_lock_pageq(); /* lock out pagedaemon */
uvm_pagefree(pg); /* bye bye */
uvm_unlock_pageq(); /* free the daemon */
}
} else {
if (anon->an_swslot != 0) {
/* This page is no longer only in swap. */
			KASSERT(uvmexp.swpgonly > 0);
			atomic_dec_int(&uvmexp.swpgonly);
}
}
anon->an_lock = NULL;
/*
* Free any swap resources, leave a page replacement hint.
*/
uvm_anon_dropswap(anon);
KASSERT(anon->an_page == NULL);
KASSERT(anon->an_swslot == 0);
pool_put(&uvm_anon_pool, anon);
}
/*
* uvm_anwait: wait for memory to become available to allocate an anon.
*/
void
uvm_anwait(void)
{
struct vm_anon *anon;
/* XXX: Want something like pool_wait()? */
anon = pool_get(&uvm_anon_pool, PR_WAITOK);
pool_put(&uvm_anon_pool, anon);
}
/*
* uvm_anon_pagein: fetch an anon's page.
*
* => anon must be locked, and is unlocked upon return.
* => returns true if pagein was aborted due to lack of memory.
*/
boolean_t
uvm_anon_pagein(struct vm_amap *amap, struct vm_anon *anon)
{
struct vm_page *pg;
int rv;
KASSERT(rw_write_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);
/*
* Get the page of the anon.
*/
rv = uvmfault_anonget(NULL, amap, anon);
switch (rv) {
case VM_PAGER_OK:
KASSERT(rw_write_held(anon->an_lock));
break;
case VM_PAGER_ERROR:
case VM_PAGER_REFAULT:
/*
* Nothing more to do on errors.
* VM_PAGER_REFAULT means that the anon was freed.
*/
return FALSE;
default:
#ifdef DIAGNOSTIC
panic("anon_pagein: uvmfault_anonget -> %d", rv);
#else
return FALSE;
#endif
}
/*
* Mark the page as dirty and clear its swslot.
*/
pg = anon->an_page;
if (anon->an_swslot > 0) {
uvm_swap_free(anon->an_swslot, 1);
}
anon->an_swslot = 0;
atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
/*
* Deactivate the page (to put it on a page queue).
*/
pmap_clear_reference(pg);
pmap_page_protect(pg, PROT_NONE);
uvm_lock_pageq();
uvm_pagedeactivate(pg);
uvm_unlock_pageq();
rw_exit(anon->an_lock);
return FALSE;
}
/*
* uvm_anon_dropswap: release any swap resources from this anon.
*
* => anon must be locked or have a reference count of 0.
*/
void
uvm_anon_dropswap(struct vm_anon *anon)
{
	KASSERT(anon->an_ref == 0 || rw_lock_held(anon->an_lock));

	if (anon->an_swslot == 0)
return;
uvm_swap_free(anon->an_swslot, 1);
anon->an_swslot = 0;
}
/*
* uvm_anon_release: release an anon and its page.
*
* => anon should not have any references.
* => anon must be locked.
*/
void
uvm_anon_release(struct vm_anon *anon)
{
struct vm_page *pg = anon->an_page;
struct rwlock *lock;
KASSERT(rw_write_held(anon->an_lock));
KASSERT(pg != NULL);
KASSERT((pg->pg_flags & PG_RELEASED) != 0);
KASSERT((pg->pg_flags & PG_BUSY) != 0);
KASSERT(pg->uobject == NULL);
KASSERT(pg->uanon == anon);
KASSERT(anon->an_ref == 0);
uvm_lock_pageq();
uvm_pagefree(pg);
uvm_unlock_pageq();
KASSERT(anon->an_page == NULL);
lock = anon->an_lock;
uvm_anfree(anon);
rw_exit(lock);
/* Note: extra reference is held for PG_RELEASED case. */
rw_obj_free(lock);
}
/* $OpenBSD: ip_var.h,v 1.104 2022/09/03 22:43:38 mvs Exp $ */
/* $NetBSD: ip_var.h,v 1.16 1996/02/13 23:43:20 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_var.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET_IP_VAR_H_
#define _NETINET_IP_VAR_H_
/*
* Structure stored in mbuf in inpcb.ip_options
* and passed to ip_output when ip options are in use.
* The actual length of the options (including ipopt_dst)
* is in m_len.
*/
#define MAX_IPOPTLEN 40
/*
* Overlay for ip header used by other protocols (tcp, udp).
*/
struct ipovly {
u_int8_t ih_x1[9]; /* (unused) */
u_int8_t ih_pr; /* protocol */
u_int16_t ih_len; /* protocol length */
struct in_addr ih_src; /* source internet address */
struct in_addr ih_dst; /* destination internet address */
};
struct ipstat {
u_long ips_total; /* total packets received */
u_long ips_badsum; /* checksum bad */
u_long ips_tooshort; /* packet too short */
u_long ips_toosmall; /* not enough data */
u_long ips_badhlen; /* ip header length < data size */
u_long ips_badlen; /* ip length < ip header length */
u_long ips_fragments; /* fragments received */
u_long ips_fragdropped; /* frags dropped (dups, out of space) */
u_long ips_fragtimeout; /* fragments timed out */
u_long ips_forward; /* packets forwarded */
u_long ips_cantforward; /* packets rcvd for unreachable dest */
u_long ips_redirectsent; /* packets forwarded on same net */
u_long ips_noproto; /* unknown or unsupported protocol */
u_long ips_delivered; /* datagrams delivered to upper level*/
u_long ips_localout; /* total ip packets generated here */
u_long ips_odropped; /* lost output due to nobufs, etc. */
u_long ips_reassembled; /* total packets reassembled ok */
u_long ips_fragmented; /* datagrams successfully fragmented */
u_long ips_ofragments; /* output fragments created */
u_long ips_cantfrag; /* don't fragment flag was set, etc. */
u_long ips_badoptions; /* error in option processing */
u_long ips_noroute; /* packets discarded due to no route */
u_long ips_badvers; /* ip version != 4 */
u_long ips_rawout; /* total raw ip packets generated */
u_long ips_badfrags; /* malformed fragments (bad length) */
u_long ips_rcvmemdrop; /* frags dropped for lack of memory */
u_long ips_toolong; /* ip length > max ip packet size */
u_long ips_nogif; /* no match gif found */
u_long ips_badaddr; /* invalid address on header */
u_long ips_inswcsum; /* software checksummed on input */
u_long ips_outswcsum; /* software checksummed on output */
u_long ips_notmember; /* multicasts for unregistered groups */
u_long ips_wrongif; /* packet received on wrong interface */
u_long ips_idropped; /* lost input due to nobufs, etc. */
};
struct ipoption {
struct in_addr ipopt_dst; /* first-hop dst if source routed */
int8_t ipopt_list[MAX_IPOPTLEN]; /* options proper */
};
#ifdef _KERNEL
#include <sys/percpu.h>
enum ipstat_counters {
ips_total, /* total packets received */
ips_badsum, /* checksum bad */
ips_tooshort, /* packet too short */
ips_toosmall, /* not enough data */
ips_badhlen, /* ip header length < data size */
ips_badlen, /* ip length < ip header length */
ips_fragments, /* fragments received */
ips_fragdropped, /* frags dropped (dups, out of space) */
ips_fragtimeout, /* fragments timed out */
ips_forward, /* packets forwarded */
ips_cantforward, /* packets rcvd for unreachable dest */
ips_redirectsent, /* packets forwarded on same net */
ips_noproto, /* unknown or unsupported protocol */
ips_delivered, /* datagrams delivered to upper level*/
ips_localout, /* total ip packets generated here */
ips_odropped, /* lost output packets due to nobufs, etc. */
ips_reassembled, /* total packets reassembled ok */
ips_fragmented, /* datagrams successfully fragmented */
ips_ofragments, /* output fragments created */
ips_cantfrag, /* don't fragment flag was set, etc. */
ips_badoptions, /* error in option processing */
ips_noroute, /* packets discarded due to no route */
ips_badvers, /* ip version != 4 */
ips_rawout, /* total raw ip packets generated */
ips_badfrags, /* malformed fragments (bad length) */
ips_rcvmemdrop, /* frags dropped for lack of memory */
ips_toolong, /* ip length > max ip packet size */
ips_nogif, /* no match gif found */
ips_badaddr, /* invalid address on header */
ips_inswcsum, /* software checksummed on input */
ips_outswcsum, /* software checksummed on output */
ips_notmember, /* multicasts for unregistered groups */
ips_wrongif, /* packet received on wrong interface */
ips_idropped, /* lost input packets due to nobufs, etc. */
ips_ncounters
};
extern struct cpumem *ipcounters;
static inline void
ipstat_inc(enum ipstat_counters c)
{
counters_inc(ipcounters, c);
}
static inline void
ipstat_add(enum ipstat_counters c, uint64_t v)
{
counters_add(ipcounters, c, v);
}
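/*
 * Illustration only (not part of the original header): code in the IP
 * input and output paths updates the per-CPU counters through the
 * helpers above, for example:
 *
 *	ipstat_inc(ips_total);		(one more packet received)
 *	ipstat_add(ips_ofragments, n);	(n output fragments created at once)
 */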
/*
* Structure attached to inpcb.ip_moptions and
* passed to ip_output when IP multicast options are in use.
*/
struct ip_moptions {
struct in_multi **imo_membership; /* group memberships */
unsigned short imo_ifidx; /* ifp index for outgoing multicasts */
u_int8_t imo_ttl; /* TTL for outgoing multicasts */
u_int8_t imo_loop; /* 1 => hear sends if a member */
u_int16_t imo_num_memberships; /* no. memberships this socket */
u_int16_t imo_max_memberships; /* max memberships this socket */
};
#include <sys/queue.h>
/*
* Ip reassembly queue structures.
*/
LIST_HEAD(ipqehead, ipqent);
struct ipqent {
LIST_ENTRY(ipqent) ipqe_q;
struct ip *ipqe_ip;
struct mbuf *ipqe_m; /* mbuf contains packet */
uint16_t ipqe_mff; /* for IP fragmentation */
};
/*
* Ip reassembly queue structure. Each fragment
* being reassembled is attached to one of these structures.
* They are timed out after ipq_ttl drops to 0, and may also
* be reclaimed if memory becomes tight.
*/
struct ipq {
LIST_ENTRY(ipq) ipq_q; /* to other reass headers */
u_int8_t ipq_ttl; /* time for reass q to live */
u_int8_t ipq_p; /* protocol of this fragment */
u_int16_t ipq_id; /* sequence id for reassembly */
struct ipqehead ipq_fragq; /* to ip fragment queue */
struct in_addr ipq_src, ipq_dst;
};
/* flags passed to ip_output */
#define IP_FORWARDING 0x1 /* most of ip header exists */
#define IP_RAWOUTPUT 0x2 /* raw ip header exists */
#define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */
#define IP_MTUDISC 0x0800 /* pmtu discovery, set DF */
extern struct ipstat ipstat;
extern int ip_defttl; /* default IP ttl */
#define IPMTUDISCTIMEOUT (10 * 60) /* as per RFC 1191 */
extern int ip_mtudisc; /* mtu discovery */
extern int ip_mtudisc_timeout; /* seconds to timeout mtu discovery */
extern int ipport_firstauto; /* min port for port allocation */
extern int ipport_lastauto; /* max port for port allocation */
extern int ipport_hifirstauto; /* min dynamic/private port number */
extern int ipport_hilastauto; /* max dynamic/private port number */
extern int ipforwarding; /* enable IP forwarding */
#ifdef MROUTING
extern int ipmforwarding; /* enable multicast forwarding */
#endif
extern int ipmultipath; /* enable multipath routing */
extern int la_hold_total;
extern const struct pr_usrreqs rip_usrreqs;
extern struct rttimer_queue ip_mtudisc_timeout_q;
extern struct pool ipqent_pool;
struct route;
struct inpcb;
int ip_ctloutput(int, struct socket *, int, int, struct mbuf *);
int ip_fragment(struct mbuf *, struct mbuf_list *, struct ifnet *, u_long);
void ip_freemoptions(struct ip_moptions *);
int ip_getmoptions(int, struct ip_moptions *, struct mbuf *);
void ip_init(void);
struct mbuf*
ip_insertoptions(struct mbuf *, struct mbuf *, int *);
int ip_mforward(struct mbuf *, struct ifnet *);
int ip_optcopy(struct ip *, struct ip *);
int ip_output(struct mbuf *, struct mbuf *, struct route *, int,
struct ip_moptions *, struct inpcb *, u_int32_t);
u_int16_t
ip_randomid(void);
void ip_send(struct mbuf *);
void ip_send_raw(struct mbuf *);
void ip_slowtimo(void);
struct mbuf *
ip_srcroute(struct mbuf *);
void ip_stripoptions(struct mbuf *);
int ip_sysctl(int *, u_int, void *, size_t *, void *, size_t);
void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
struct mbuf *);
int ip_input_if(struct mbuf **, int *, int, int, struct ifnet *);
int ip_deliver(struct mbuf **, int *, int, int);
void ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int);
int rip_ctloutput(int, struct socket *, int, int, struct mbuf *);
void rip_init(void);
int rip_input(struct mbuf **, int *, int, int);
int rip_output(struct mbuf *, struct socket *, struct sockaddr *,
struct mbuf *);
int rip_attach(struct socket *, int);
int rip_detach(struct socket *);
int rip_bind(struct socket *so, struct mbuf *, struct proc *);
int rip_connect(struct socket *, struct mbuf *);
int rip_disconnect(struct socket *);
int rip_shutdown(struct socket *);
int rip_send(struct socket *, struct mbuf *, struct mbuf *,
struct mbuf *);
int rip_abort(struct socket *);
#ifdef MROUTING
extern struct socket *ip_mrouter[]; /* multicast routing daemon */
#endif
#endif /* _KERNEL */
#endif /* _NETINET_IP_VAR_H_ */
/* $OpenBSD: raw_ip.c,v 1.147 2022/09/03 22:43:38 mvs Exp $ */
/* $NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_icmp.h>
#include <net/pfvar.h>
#include "pf.h"
struct inpcbtable rawcbtable;
/*
* Nominal space allocated to a raw ip socket.
*/
#define RIPSNDQ 8192
#define RIPRCVQ 8192
/*
* Raw interface to IP protocol.
*/
const struct pr_usrreqs rip_usrreqs = {
.pru_attach = rip_attach,
.pru_detach = rip_detach,
.pru_bind = rip_bind,
.pru_connect = rip_connect,
.pru_disconnect = rip_disconnect,
.pru_shutdown = rip_shutdown,
.pru_send = rip_send,
.pru_abort = rip_abort,
.pru_control = in_control,
.pru_sockaddr = in_sockaddr,
.pru_peeraddr = in_peeraddr,
};
/*
* Initialize raw connection block q.
*/
void
rip_init(void)
{
in_pcbinit(&rawcbtable, 1);
}
struct mbuf *rip_chkhdr(struct mbuf *, struct mbuf *);
int
rip_input(struct mbuf **mp, int *offp, int proto, int af)
{
struct mbuf *m = *mp;
struct ip *ip = mtod(m, struct ip *);
struct inpcb *inp;
SIMPLEQ_HEAD(, inpcb) inpcblist;
struct in_addr *key;
struct counters_ref ref;
uint64_t *counters;
struct sockaddr_in ripsrc;
KASSERT(af == AF_INET);
memset(&ripsrc, 0, sizeof(ripsrc));
ripsrc.sin_family = AF_INET;
ripsrc.sin_len = sizeof(ripsrc);
ripsrc.sin_addr = ip->ip_src;
key = &ip->ip_dst;
#if NPF > 0
if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
struct pf_divert *divert;
divert = pf_find_divert(m);
KASSERT(divert != NULL);
switch (divert->type) {
case PF_DIVERT_TO:
key = &divert->addr.v4;
break;
case PF_DIVERT_REPLY:
break;
default:
panic("%s: unknown divert type %d, mbuf %p, divert %p",
__func__, divert->type, m, divert);
}
}
#endif
SIMPLEQ_INIT(&inpcblist);
rw_enter_write(&rawcbtable.inpt_notify);
mtx_enter(&rawcbtable.inpt_mtx);
TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
if (inp->inp_socket->so_state & SS_CANTRCVMORE)
continue;
#ifdef INET6
if (inp->inp_flags & INP_IPV6)
continue;
#endif
if (rtable_l2(inp->inp_rtableid) !=
rtable_l2(m->m_pkthdr.ph_rtableid))
continue;
if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p)
continue;
if (inp->inp_laddr.s_addr &&
inp->inp_laddr.s_addr != key->s_addr)
continue;
if (inp->inp_faddr.s_addr &&
inp->inp_faddr.s_addr != ip->ip_src.s_addr)
continue;
in_pcbref(inp);
SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
}
mtx_leave(&rawcbtable.inpt_mtx);
if (SIMPLEQ_EMPTY(&inpcblist)) {
rw_exit_write(&rawcbtable.inpt_notify);
if (ip->ip_p != IPPROTO_ICMP)
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
0, 0);
else
m_freem(m);
counters = counters_enter(&ref, ipcounters);
counters[ips_noproto]++;
counters[ips_delivered]--;
counters_leave(&ref, ipcounters);
return IPPROTO_DONE;
}
while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
struct mbuf *n, *opts = NULL;
SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
if (SIMPLEQ_EMPTY(&inpcblist))
n = m;
else
n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
if (n != NULL) {
if (inp->inp_flags & INP_CONTROLOPTS ||
inp->inp_socket->so_options & SO_TIMESTAMP)
ip_savecontrol(inp, &opts, ip, n);
if (sbappendaddr(inp->inp_socket,
&inp->inp_socket->so_rcv,
sintosa(&ripsrc), n, opts) == 0) {
/* should notify about lost packet */
m_freem(n);
m_freem(opts);
} else
sorwakeup(inp->inp_socket);
}
in_pcbunref(inp);
}
rw_exit_write(&rawcbtable.inpt_notify);
return IPPROTO_DONE;
}
/*
* Generate IP header and pass packet to ip_output.
* Tack on options user may have setup with control call.
*/
int
rip_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr,
struct mbuf *control)
{
struct sockaddr_in *dst = satosin(dstaddr);
struct ip *ip;
struct inpcb *inp;
int flags, error;
inp = sotoinpcb(so);
flags = IP_ALLOWBROADCAST;
/*
* If the user handed us a complete IP packet, use it.
* Otherwise, allocate an mbuf for a header and fill it in.
*/
if ((inp->inp_flags & INP_HDRINCL) == 0) {
if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) {
m_freem(m);
return (EMSGSIZE);
}
M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
if (!m)
return (ENOBUFS);
ip = mtod(m, struct ip *);
ip->ip_tos = inp->inp_ip.ip_tos;
ip->ip_off = htons(0);
ip->ip_p = inp->inp_ip.ip_p;
ip->ip_len = htons(m->m_pkthdr.len);
ip->ip_src.s_addr = INADDR_ANY;
ip->ip_dst = dst->sin_addr;
ip->ip_ttl = inp->inp_ip.ip_ttl ? inp->inp_ip.ip_ttl : MAXTTL;
} else {
if (m->m_pkthdr.len > IP_MAXPACKET) {
m_freem(m);
return (EMSGSIZE);
}
m = rip_chkhdr(m, inp->inp_options);
if (m == NULL)
return (EINVAL);
ip = mtod(m, struct ip *);
if (ip->ip_id == 0)
ip->ip_id = htons(ip_randomid());
dst->sin_addr = ip->ip_dst;
/* XXX prevent ip_output from overwriting header fields */
flags |= IP_RAWOUTPUT;
ipstat_inc(ips_rawout);
}
if (ip->ip_src.s_addr == INADDR_ANY) {
error = in_pcbselsrc(&ip->ip_src, dst, inp);
if (error != 0)
return (error);
}
#ifdef INET6
/*
* A thought: Even though raw IP shouldn't be able to set IPv6
* multicast options, if it does, the last parameter to
* ip_output should be guarded against v6/v4 problems.
*/
#endif
/* force routing table */
m->m_pkthdr.ph_rtableid = inp->inp_rtableid;
#if NPF > 0
if (inp->inp_socket->so_state & SS_ISCONNECTED &&
ip->ip_p != IPPROTO_ICMP)
pf_mbuf_link_inpcb(m, inp);
#endif
error = ip_output(m, inp->inp_options, &inp->inp_route, flags,
inp->inp_moptions, inp, 0);
return (error);
}
struct mbuf *
rip_chkhdr(struct mbuf *m, struct mbuf *options)
{
struct ip *ip;
int hlen, opt, optlen, cnt;
u_char *cp;
if (m->m_pkthdr.len < sizeof(struct ip)) {
m_freem(m);
return NULL;
}
m = m_pullup(m, sizeof (struct ip));
if (m == NULL)
return NULL;
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
/* Don't allow packet length sizes that will crash. */
if (hlen < sizeof (struct ip) ||
ntohs(ip->ip_len) < hlen ||
ntohs(ip->ip_len) != m->m_pkthdr.len) {
m_freem(m);
return NULL;
}
m = m_pullup(m, hlen);
if (m == NULL)
return NULL;
ip = mtod(m, struct ip *);
if (ip->ip_v != IPVERSION) {
m_freem(m);
return NULL;
}
/*
	 * Don't allow both user-specified and setsockopt options.
	 * If options are present, verify them.
*/
if (hlen != sizeof(struct ip)) {
if (options) {
m_freem(m);
return NULL;
} else {
cp = (u_char *)(ip + 1);
cnt = hlen - sizeof(struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp)) {
m_freem(m);
return NULL;
}
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) ||
optlen > cnt) {
m_freem(m);
return NULL;
}
}
}
}
}
return m;
}
/*
* Raw IP socket option processing.
*/
int
rip_ctloutput(int op, struct socket *so, int level, int optname,
struct mbuf *m)
{
struct inpcb *inp = sotoinpcb(so);
int error;
if (level != IPPROTO_IP)
return (EINVAL);
switch (optname) {
case IP_HDRINCL:
error = 0;
if (op == PRCO_SETOPT) {
if (m == NULL || m->m_len < sizeof (int))
error = EINVAL;
else if (*mtod(m, int *))
inp->inp_flags |= INP_HDRINCL;
else
inp->inp_flags &= ~INP_HDRINCL;
} else {
m->m_len = sizeof(int);
*mtod(m, int *) = inp->inp_flags & INP_HDRINCL;
}
return (error);
case MRT_INIT:
case MRT_DONE:
case MRT_ADD_VIF:
case MRT_DEL_VIF:
case MRT_ADD_MFC:
case MRT_DEL_MFC:
case MRT_VERSION:
case MRT_ASSERT:
case MRT_API_SUPPORT:
case MRT_API_CONFIG:
#ifdef MROUTING
switch (op) {
case PRCO_SETOPT:
error = ip_mrouter_set(so, optname, m);
break;
case PRCO_GETOPT:
error = ip_mrouter_get(so, optname, m);
break;
default:
error = EINVAL;
break;
}
return (error);
#else
return (EOPNOTSUPP);
#endif
}
return (ip_ctloutput(op, so, level, optname, m));
}
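/*
 * Illustration only (not part of the original source): a privileged
 * userland program typically requests header inclusion on a raw socket
 * roughly as follows, after which it must construct complete IP headers
 * itself (subject to the checks in rip_chkhdr() above):
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	int on = 1;
 *
 *	if (setsockopt(s, IPPROTO_IP, IP_HDRINCL, &on, sizeof(on)) == -1)
 *		err(1, "setsockopt");
 */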
u_long rip_sendspace = RIPSNDQ;
u_long rip_recvspace = RIPRCVQ;
int
rip_attach(struct socket *so, int proto)
{
struct inpcb *inp;
int error;
if (so->so_pcb)
panic("rip_attach"); if ((so->so_state & SS_PRIV) == 0)
return EACCES;
if (proto < 0 || proto >= IPPROTO_MAX)
return EPROTONOSUPPORT;
if ((error = soreserve(so, rip_sendspace, rip_recvspace)))
return error;
	NET_ASSERT_LOCKED();
	if ((error = in_pcballoc(so, &rawcbtable)))
return error;
inp = sotoinpcb(so);
inp->inp_ip.ip_p = proto;
return 0;
}
int
rip_detach(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
soassertlocked(so);
if (inp == NULL)
return (EINVAL);
#ifdef MROUTING
if (so == ip_mrouter[inp->inp_rtableid])
ip_mrouter_done(so);
#endif
in_pcbdetach(inp);
return (0);
}
int
rip_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in *addr;
int error;
soassertlocked(so);
if ((error = in_nam2sin(nam, &addr)))
return (error);
if (!((so->so_options & SO_BINDANY) ||
addr->sin_addr.s_addr == INADDR_ANY ||
addr->sin_addr.s_addr == INADDR_BROADCAST ||
in_broadcast(addr->sin_addr, inp->inp_rtableid) ||
ifa_ifwithaddr(sintosa(addr), inp->inp_rtableid)))
return (EADDRNOTAVAIL);
inp->inp_laddr = addr->sin_addr;
return (0);
}
int
rip_connect(struct socket *so, struct mbuf *nam)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in *addr;
int error;
soassertlocked(so);
if ((error = in_nam2sin(nam, &addr)))
return (error);
inp->inp_faddr = addr->sin_addr;
soisconnected(so);
return (0);
}
int
rip_disconnect(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
soassertlocked(so);
if ((so->so_state & SS_ISCONNECTED) == 0)
return (ENOTCONN);
soisdisconnected(so);
inp->inp_faddr.s_addr = INADDR_ANY;
return (0);
}
int
rip_shutdown(struct socket *so)
{
/*
* Mark the connection as being incapable of further input.
*/
soassertlocked(so);
socantsendmore(so);
return (0);
}
int
rip_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
struct mbuf *control)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in dst;
int error;
soassertlocked(so);
/*
* Ship a packet out. The appropriate raw output
* routine handles any massaging necessary.
*/
memset(&dst, 0, sizeof(dst));
dst.sin_family = AF_INET;
dst.sin_len = sizeof(dst);
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
error = EISCONN;
goto out;
}
dst.sin_addr = inp->inp_faddr;
} else {
struct sockaddr_in *addr;
if (nam == NULL) {
error = ENOTCONN;
goto out;
}
if ((error = in_nam2sin(nam, &addr)))
goto out;
dst.sin_addr = addr->sin_addr;
}
#ifdef IPSEC
/* XXX Find an IPsec TDB */
#endif
error = rip_output(m, so, sintosa(&dst), NULL);
m = NULL;
out:
m_freem(control);
m_freem(m);
return (error);
}
int
rip_abort(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
soassertlocked(so);
soisdisconnected(so);
#ifdef MROUTING
if (so == ip_mrouter[inp->inp_rtableid])
ip_mrouter_done(so);
#endif
in_pcbdetach(inp);
return (0);
}
/* $OpenBSD: proc.h,v 1.334 2022/07/23 22:10:59 cheloha Exp $ */
/* $NetBSD: proc.h,v 1.44 1996/04/22 01:23:21 christos Exp $ */
/*-
* Copyright (c) 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)proc.h 8.8 (Berkeley) 1/21/94
*/
#ifndef _SYS_PROC_H_
#define _SYS_PROC_H_
#include <machine/proc.h> /* Machine-dependent proc substruct. */
#include <sys/selinfo.h> /* For struct selinfo */
#include <sys/syslimits.h> /* For LOGIN_NAME_MAX */
#include <sys/queue.h>
#include <sys/timeout.h> /* For struct timeout */
#include <sys/event.h> /* For struct klist */
#include <sys/mutex.h> /* For struct mutex */
#include <sys/resource.h> /* For struct rusage */
#include <sys/rwlock.h> /* For struct rwlock */
#include <sys/sigio.h> /* For struct sigio */
#ifdef _KERNEL
#include <sys/atomic.h>
#define __need_process
#endif
/*
* One structure allocated per session.
*/
struct process;
struct session {
int s_count; /* Ref cnt; pgrps in session. */
struct process *s_leader; /* Session leader. */
struct vnode *s_ttyvp; /* Vnode of controlling terminal. */
struct tty *s_ttyp; /* Controlling terminal. */
char s_login[LOGIN_NAME_MAX]; /* Setlogin() name. */
pid_t s_verauthppid;
uid_t s_verauthuid;
struct timeout s_verauthto;
};
void zapverauth(/* struct session */ void *);
/*
* One structure allocated per process group.
*/
struct pgrp {
LIST_ENTRY(pgrp) pg_hash; /* Hash chain. */
LIST_HEAD(, process) pg_members;/* Pointer to pgrp members. */
struct session *pg_session; /* Pointer to session. */
struct sigiolst pg_sigiolst; /* List of sigio structures. */
pid_t pg_id; /* Pgrp id. */
int pg_jobc; /* # procs qualifying pgrp for job control */
};
/*
* time usage: accumulated times in ticks
* Once a second, each thread's immediate counts (p_[usi]ticks) are
* accumulated into these.
*/
struct tusage {
struct timespec tu_runtime; /* Realtime. */
uint64_t tu_uticks; /* Statclock hits in user mode. */
uint64_t tu_sticks; /* Statclock hits in system mode. */
uint64_t tu_iticks; /* Statclock hits processing intr. */
};
/*
* Description of a process.
*
* These structures contain the information needed to manage a thread of
* control, known in UN*X as a process; it has references to substructures
* containing descriptions of things that the process uses, but may share
* with related processes.
*
* struct process is the higher level process containing information
* shared by all threads in a process, while struct proc contains the
* run-time information needed by threads.
*/
#ifdef __need_process
struct futex;
LIST_HEAD(futex_list, futex);
struct proc;
struct tslpentry;
TAILQ_HEAD(tslpqueue, tslpentry);
struct unveil;
/*
* Locks used to protect struct members in this file:
* I immutable after creation
* a atomic operations
* K kernel lock
* m this process' `ps_mtx'
* p this process' `ps_lock'
* R rlimit_lock
* S scheduler lock
* T itimer_mtx
*/
struct process {
/*
* ps_mainproc is the original thread in the process.
* It's only still special for the handling of
* some signal and ptrace behaviors that need to be fixed.
*/
struct proc *ps_mainproc;
struct ucred *ps_ucred; /* Process owner's identity. */
LIST_ENTRY(process) ps_list; /* List of all processes. */
TAILQ_HEAD(,proc) ps_threads; /* [K|S] Threads in this process. */
LIST_ENTRY(process) ps_pglist; /* List of processes in pgrp. */
struct process *ps_pptr; /* Pointer to parent process. */
LIST_ENTRY(process) ps_sibling; /* List of sibling processes. */
LIST_HEAD(, process) ps_children;/* Pointer to list of children. */
LIST_ENTRY(process) ps_hash; /* Hash chain. */
/*
	 * An orphan is a child that has been re-parented to the
	 * debugger as a result of attaching to it.  We need to keep
	 * track of orphans so that the original parent can collect
	 * the exit status of what used to be its children.
*/
LIST_ENTRY(process) ps_orphan; /* List of orphan processes. */
LIST_HEAD(, process) ps_orphans;/* Pointer to list of orphans. */
struct sigiolst ps_sigiolst; /* List of sigio structures. */
struct sigacts *ps_sigacts; /* [I] Signal actions, state */
struct vnode *ps_textvp; /* Vnode of executable. */
struct filedesc *ps_fd; /* Ptr to open files structure */
struct vmspace *ps_vmspace; /* Address space */
pid_t ps_pid; /* Process identifier. */
struct futex_list ps_ftlist; /* futexes attached to this process */
struct tslpqueue ps_tslpqueue; /* [p] queue of threads in thrsleep */
struct rwlock ps_lock; /* per-process rwlock */
struct mutex ps_mtx; /* per-process mutex */
/* The following fields are all zeroed upon creation in process_new. */
#define ps_startzero ps_klist
struct klist ps_klist; /* knotes attached to this process */
u_int ps_flags; /* [a] PS_* flags. */
int ps_siglist; /* Signals pending for the process. */
struct proc *ps_single; /* [S] Thread for single-threading. */
u_int ps_singlecount; /* [a] Not yet suspended threads. */
int ps_traceflag; /* Kernel trace points. */
struct vnode *ps_tracevp; /* Trace to vnode. */
struct ucred *ps_tracecred; /* Creds for writing trace */
u_int ps_xexit; /* Exit status for wait */
int ps_xsig; /* Stopping or killing signal */
pid_t ps_ppid; /* [a] Cached parent pid */
pid_t ps_oppid; /* [a] Save parent pid during ptrace. */
int ps_ptmask; /* Ptrace event mask */
struct ptrace_state *ps_ptstat;/* Ptrace state */
struct rusage *ps_ru; /* sum of stats for dead threads. */
struct tusage ps_tu; /* accumulated times. */
struct rusage ps_cru; /* sum of stats for reaped children */
struct itimerspec ps_timer[3]; /* [m] ITIMER_REAL timer */
/* [T] ITIMER_{VIRTUAL,PROF} timers */
struct timeout ps_rucheck_to; /* [] resource limit check timer */
time_t ps_nextxcpu; /* when to send next SIGXCPU, */
/* in seconds of process runtime */
u_int64_t ps_wxcounter;
struct unveil *ps_uvpaths; /* unveil vnodes and names */
ssize_t ps_uvvcount; /* count of unveil vnodes held */
size_t ps_uvncount; /* count of unveil names allocated */
int ps_uvdone; /* no more unveil is permitted */
/* End area that is zeroed on creation. */
#define ps_endzero ps_startcopy
/* The following fields are all copied upon creation in process_new. */
#define ps_startcopy ps_limit
struct plimit *ps_limit; /* [m,R] Process limits. */
struct pgrp *ps_pgrp; /* Pointer to process group. */
char ps_comm[_MAXCOMLEN]; /* command name, incl NUL */
vaddr_t ps_strings; /* User pointers to argv/env */
vaddr_t ps_timekeep; /* User pointer to timekeep */
vaddr_t ps_sigcode; /* [I] User pointer to signal code */
vaddr_t ps_sigcoderet; /* [I] User ptr to sigreturn retPC */
u_long ps_sigcookie; /* [I] */
u_int ps_rtableid; /* [a] Process routing table/domain. */
char ps_nice; /* Process "nice" value. */
struct uprof { /* profile arguments */
caddr_t pr_base; /* buffer base */
size_t pr_size; /* buffer size */
u_long pr_off; /* pc offset */
u_int pr_scale; /* pc scaling */
} ps_prof;
u_int32_t ps_acflag; /* Accounting flags. */
uint64_t ps_pledge; /* [m] pledge promises */
uint64_t ps_execpledge; /* [m] execpledge promises */
int64_t ps_kbind_cookie; /* [m] */
u_long ps_kbind_addr; /* [m] */
/* End area that is copied on creation. */
#define ps_endcopy ps_refcnt
int ps_refcnt; /* Number of references. */
struct timespec ps_start; /* starting uptime. */
struct timeout ps_realit_to; /* [m] ITIMER_REAL timeout */
};
#define ps_session ps_pgrp->pg_session
#define ps_pgid ps_pgrp->pg_id
#endif /* __need_process */
/*
* These flags are kept in ps_flags.
*/
#define PS_CONTROLT 0x00000001 /* Has a controlling terminal. */
#define PS_EXEC 0x00000002 /* Process called exec. */
#define PS_INEXEC 0x00000004 /* Process is doing an exec right now */
#define PS_EXITING 0x00000008 /* Process is exiting. */
#define PS_SUGID 0x00000010 /* Had set id privs since last exec. */
#define PS_SUGIDEXEC 0x00000020 /* last execve() was set[ug]id */
#define PS_PPWAIT 0x00000040 /* Parent waits for exec/exit. */
#define PS_ISPWAIT 0x00000080 /* Is parent of PPWAIT child. */
#define PS_PROFIL 0x00000100 /* Has started profiling. */
#define PS_TRACED 0x00000200 /* Being ptraced. */
#define PS_WAITED 0x00000400 /* Stopped proc was waited for. */
#define PS_COREDUMP 0x00000800 /* Busy coredumping */
#define PS_SINGLEEXIT 0x00001000 /* Other threads must die. */
#define PS_SINGLEUNWIND 0x00002000 /* Other threads must unwind. */
#define PS_NOZOMBIE 0x00004000 /* No signal or zombie at exit. */
#define PS_STOPPED 0x00008000 /* Just stopped, need sig to parent. */
#define PS_SYSTEM 0x00010000 /* No sigs, stats or swapping. */
#define PS_EMBRYO 0x00020000 /* New process, not yet fledged */
#define PS_ZOMBIE 0x00040000 /* Dead and ready to be waited for */
#define PS_NOBROADCASTKILL 0x00080000 /* Process excluded from kill -1. */
#define PS_PLEDGE 0x00100000 /* Has called pledge(2) */
#define PS_WXNEEDED 0x00200000 /* Process allowed to violate W^X */
#define PS_EXECPLEDGE 0x00400000 /* Has exec pledges */
#define PS_ORPHAN 0x00800000 /* Process is on an orphan list */
#define PS_CHROOT 0x01000000 /* Process is chrooted */
#define PS_BITS \
("\20" "\01CONTROLT" "\02EXEC" "\03INEXEC" "\04EXITING" "\05SUGID" \
"\06SUGIDEXEC" "\07PPWAIT" "\010ISPWAIT" "\011PROFIL" "\012TRACED" \
"\013WAITED" "\014COREDUMP" "\015SINGLEEXIT" "\016SINGLEUNWIND" \
"\017NOZOMBIE" "\020STOPPED" "\021SYSTEM" "\022EMBRYO" "\023ZOMBIE" \
"\024NOBROADCASTKILL" "\025PLEDGE" "\026WXNEEDED" "\027EXECPLEDGE" \
"\030ORPHAN" "\031CHROOT")
struct kcov_dev;
struct lock_list_entry;
struct kqueue;
struct p_inentry {
u_long ie_serial;
vaddr_t ie_start;
vaddr_t ie_end;
};
/*
* Locks used to protect struct members in this file:
* I immutable after creation
* S scheduler lock
* l read only reference, see lim_read_enter()
* o owned (read/modified only) by this thread
*/
struct proc {
TAILQ_ENTRY(proc) p_runq; /* [S] current run/sleep queue */
LIST_ENTRY(proc) p_list; /* List of all threads. */
struct process *p_p; /* [I] The process of this thread. */
TAILQ_ENTRY(proc) p_thr_link; /* Threads in a process linkage. */
TAILQ_ENTRY(proc) p_fut_link; /* Threads in a futex linkage. */
struct futex *p_futex; /* Current sleeping futex. */
/* substructures: */
struct filedesc *p_fd; /* copy of p_p->ps_fd */
struct vmspace *p_vmspace; /* [I] copy of p_p->ps_vmspace */
struct p_inentry p_spinentry; /* [o] cache for SP check */
struct p_inentry p_pcinentry; /* [o] cache for PC check */
int p_flag; /* P_* flags. */
u_char p_spare; /* unused */
char p_stat; /* [S] S* process status. */
u_char p_runpri; /* [S] Runqueue priority */
u_char p_descfd; /* if not 255, fdesc permits this fd */
pid_t p_tid; /* Thread identifier. */
LIST_ENTRY(proc) p_hash; /* Hash chain. */
/* The following fields are all zeroed upon creation in fork. */
#define p_startzero p_dupfd
int p_dupfd; /* Sideways return value from filedescopen. XXX */
/* scheduling */
int p_cpticks; /* Ticks of cpu time. */
const volatile void *p_wchan; /* [S] Sleep address. */
struct timeout p_sleep_to;/* timeout for tsleep() */
const char *p_wmesg; /* [S] Reason for sleep. */
fixpt_t p_pctcpu; /* [S] %cpu for this thread */
u_int p_slptime; /* [S] Time since last blocked. */
u_int p_uticks; /* Statclock hits in user mode. */
u_int p_sticks; /* Statclock hits in system mode. */
u_int p_iticks; /* Statclock hits processing intr. */
struct cpu_info * volatile p_cpu; /* [S] CPU we're running on. */
struct rusage p_ru; /* Statistics */
struct tusage p_tu; /* accumulated times. */
struct timespec p_rtime; /* Real time. */
struct plimit *p_limit; /* [l] read ref. of p_p->ps_limit */
struct kcov_dev *p_kd; /* kcov device handle */
struct lock_list_entry *p_sleeplocks; /* WITNESS lock tracking */
struct kqueue *p_kq; /* [o] select/poll queue of evts */
unsigned long p_kq_serial; /* [o] to check against enqueued evts */
int p_siglist; /* [a] Signals arrived & not delivered*/
/* End area that is zeroed on creation. */
#define p_endzero p_startcopy
/* The following fields are all copied upon creation in fork. */
#define p_startcopy p_sigmask
sigset_t p_sigmask; /* [a] Current signal mask */
u_char p_slppri; /* [S] Sleeping priority */
u_char p_usrpri; /* [S] Priority based on p_estcpu & ps_nice */
u_int p_estcpu; /* [S] Time averaged val of p_cpticks */
int p_pledge_syscall; /* Cache of current syscall */
struct ucred *p_ucred; /* [o] cached credentials */
struct sigaltstack p_sigstk; /* sp & on stack state variable */
u_long p_prof_addr; /* tmp storage for profiling addr until AST */
u_long p_prof_ticks; /* tmp storage for profiling ticks until AST */
/* End area that is copied on creation. */
#define p_endcopy p_addr
struct user *p_addr; /* Kernel virtual addr of u-area */
struct mdproc p_md; /* Any machine-dependent fields. */
sigset_t p_oldmask; /* Saved mask from before sigpause */
int p_sisig; /* For core dump/debugger XXX */
union sigval p_sigval; /* For core dump/debugger XXX */
long p_sitrapno; /* For core dump/debugger XXX */
int p_sicode; /* For core dump/debugger XXX */
};
/* Status values. */
#define SIDL 1 /* Thread being created by fork. */
#define SRUN 2 /* Currently runnable. */
#define SSLEEP 3 /* Sleeping on an address. */
#define SSTOP 4 /* Debugging or suspension. */
#define SZOMB 5 /* unused */
#define SDEAD 6 /* Thread is almost gone */
#define SONPROC 7 /* Thread is currently on a CPU. */
#define P_ZOMBIE(p) ((p)->p_stat == SDEAD)
#define P_HASSIBLING(p) (TAILQ_FIRST(&(p)->p_p->ps_threads) != (p) || \
TAILQ_NEXT((p), p_thr_link) != NULL)
/*
* These flags are per-thread and kept in p_flag
*/
#define P_INKTR 0x00000001 /* In a ktrace op, don't recurse */
#define P_PROFPEND 0x00000002 /* SIGPROF needs to be posted */
#define P_ALRMPEND 0x00000004 /* SIGVTALRM needs to be posted */
#define P_SIGSUSPEND 0x00000008 /* Need to restore before-suspend mask*/
#define P_CANTSLEEP 0x00000010 /* insomniac thread */
#define P_SINTR 0x00000080 /* Sleep is interruptible. */
#define P_SYSTEM 0x00000200 /* No sigs, stats or swapping. */
#define P_TIMEOUT 0x00000400 /* Timing out during sleep. */
#define P_WEXIT 0x00002000 /* Working on exiting. */
#define P_OWEUPC 0x00008000 /* Owe proc an addupc() at next ast. */
#define P_SUSPSINGLE 0x00080000 /* Need to stop for single threading. */
#define P_CONTINUED 0x00800000 /* Proc has continued from a stopped state. */
#define P_THREAD 0x04000000 /* Only a thread, not a real process */
#define P_SUSPSIG 0x08000000 /* Stopped from signal. */
#define P_SOFTDEP 0x10000000 /* Stuck processing softdep worklist */
#define P_CPUPEG 0x40000000 /* Do not move to another cpu. */
#define P_BITS \
("\20" "\01INKTR" "\02PROFPEND" "\03ALRMPEND" "\04SIGSUSPEND" \
"\05CANTSLEEP" "\010SINTR" "\012SYSTEM" "\013TIMEOUT" \
"\016WEXIT" "\020OWEUPC" "\024SUSPSINGLE" "\027XX" \
"\030CONTINUED" "\033THREAD" "\034SUSPSIG" "\035SOFTDEP" "\037CPUPEG")
#define THREAD_PID_OFFSET 100000
#ifdef _KERNEL
struct uidinfo {
LIST_ENTRY(uidinfo) ui_hash;
uid_t ui_uid;
long ui_proccnt; /* proc structs */
long ui_lockcnt; /* lockf structs */
};
struct uidinfo *uid_find(uid_t);
void uid_release(struct uidinfo *);
/*
* We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t,
* as it is used to represent "no process group".
 * We set PID_MAX to 99999 to keep it in 5 columns in ps(1).
* When exposed to userspace, thread IDs have THREAD_PID_OFFSET
* added to keep them from overlapping the PID range. For them,
 * we use a (0 .. 2^n] range for cheapness, picking 'n' such
* that 2^n + THREAD_PID_OFFSET and THREAD_PID_OFFSET have
* the same number of columns when printed.
*/
#define PID_MAX 99999
#define TID_MASK 0x7ffff
#define NO_PID (PID_MAX+1)
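/*
 * Illustration only (not part of the original header): with
 * THREAD_PID_OFFSET at 100000 and TID_MASK at 0x7ffff, a thread whose
 * in-kernel tid is 42 would, for example, show up to userspace (via
 * getthrid(2)) as 100042, safely above the 1..99999 process ID range.
 */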
#define SESS_LEADER(pr) ((pr)->ps_session->s_leader == (pr))
#define SESSHOLD(s) ((s)->s_count++)
#define SESSRELE(s) do { \
if (--(s)->s_count == 0) { \
timeout_del(&(s)->s_verauthto); \
pool_put(&session_pool, (s)); \
} \
} while (/* CONSTCOND */ 0)
/*
* Flags to fork1().
*/
#define FORK_FORK 0x00000001
#define FORK_VFORK 0x00000002
#define FORK_IDLE 0x00000004
#define FORK_PPWAIT 0x00000008
#define FORK_SHAREFILES 0x00000010
#define FORK_SYSTEM 0x00000020
#define FORK_NOZOMBIE 0x00000040
#define FORK_SHAREVM 0x00000080
#define FORK_PTRACE 0x00000400
#define EXIT_NORMAL 0x00000001
#define EXIT_THREAD 0x00000002
#define EXIT_THREAD_NOCHECK 0x00000003
#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash])
extern LIST_HEAD(tidhashhead, proc) *tidhashtbl;
extern u_long tidhash;
#define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash])
extern LIST_HEAD(pidhashhead, process) *pidhashtbl;
extern u_long pidhash;
#define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash])
extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
extern u_long pgrphash;
extern struct proc proc0; /* Process slot for swapper. */
extern struct process process0; /* Process slot for kernel threads. */
extern int nprocesses, maxprocess; /* Cur and max number of processes. */
extern int nthreads, maxthread; /* Cur and max number of threads. */
LIST_HEAD(proclist, proc);
LIST_HEAD(processlist, process);
extern struct processlist allprocess; /* List of all processes. */
extern struct processlist zombprocess; /* List of zombie processes. */
extern struct proclist allproc; /* List of all threads. */
extern struct process *initprocess; /* Process slot for init. */
extern struct proc *reaperproc; /* Thread slot for reaper. */
extern struct proc *syncerproc; /* filesystem syncer daemon */
extern struct pool process_pool; /* memory pool for processes */
extern struct pool proc_pool; /* memory pool for procs */
extern struct pool rusage_pool; /* memory pool for zombies */
extern struct pool ucred_pool; /* memory pool for ucreds */
extern struct pool session_pool; /* memory pool for sessions */
extern struct pool pgrp_pool; /* memory pool for pgrps */
void freepid(pid_t);
struct process *prfind(pid_t); /* Find process by id. */
struct process *zombiefind(pid_t); /* Find zombie process by id. */
struct proc *tfind(pid_t); /* Find thread by id. */
struct pgrp *pgfind(pid_t); /* Find process group by id. */
void proc_printit(struct proc *p, const char *modif,
int (*pr)(const char *, ...));
int chgproccnt(uid_t uid, int diff);
void enternewpgrp(struct process *, struct pgrp *, struct session *);
void enterthispgrp(struct process *, struct pgrp *);
int inferior(struct process *, struct process *);
void leavepgrp(struct process *);
void killjobc(struct process *);
void preempt(void);
void procinit(void);
void setpriority(struct proc *, uint32_t, uint8_t);
void setrunnable(struct proc *);
void endtsleep(void *);
int wakeup_proc(struct proc *, const volatile void *);
void unsleep(struct proc *);
void reaper(void *);
__dead void exit1(struct proc *, int, int, int);
void exit2(struct proc *);
int dowait4(struct proc *, pid_t, int *, int, struct rusage *,
register_t *);
void cpu_fork(struct proc *_curp, struct proc *_child, void *_stack,
void *_tcb, void (*_func)(void *), void *_arg);
void cpu_exit(struct proc *);
void process_initialize(struct process *, struct proc *);
int fork1(struct proc *_curp, int _flags, void (*_func)(void *),
void *_arg, register_t *_retval, struct proc **_newprocp);
int thread_fork(struct proc *_curp, void *_stack, void *_tcb,
pid_t *_tidptr, register_t *_retval);
int groupmember(gid_t, struct ucred *);
void dorefreshcreds(struct process *, struct proc *);
void dosigsuspend(struct proc *, sigset_t);
static inline void
refreshcreds(struct proc *p)
{
struct process *pr = p->p_p;
/* this is an unlocked access to ps_ucred, but the result is benign */
if (pr->ps_ucred != p->p_ucred)
dorefreshcreds(pr, p);
}
enum single_thread_mode {
SINGLE_SUSPEND, /* other threads to stop wherever they are */
SINGLE_UNWIND, /* other threads to unwind and stop */
SINGLE_EXIT /* other threads to unwind and then exit */
};
int single_thread_set(struct proc *, enum single_thread_mode, int);
int single_thread_wait(struct process *, int);
void single_thread_clear(struct proc *, int);
int single_thread_check(struct proc *, int);
void child_return(void *);
int proc_cansugid(struct proc *);
struct sleep_state {
int sls_s;
int sls_catch;
int sls_timeout;
};
struct cond {
unsigned int c_wait; /* [a] initialized and waiting */
};
#define COND_INITIALIZER() { .c_wait = 1 }
#if defined(MULTIPROCESSOR)
void proc_trampoline_mp(void); /* XXX */
#endif
/*
* functions to handle sets of cpus.
*
* For now we keep the cpus in ints so that we can use the generic
* atomic ops.
*/
#define CPUSET_ASIZE(x) (((x) - 1)/32 + 1)
#define CPUSET_SSIZE CPUSET_ASIZE(MAXCPUS)
struct cpuset {
int cs_set[CPUSET_SSIZE];
};
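/*
 * Illustrative sketch (not part of the original header): CPU number `num'
 * would map to word num / 32, bit num % 32 of cs_set[], so set operations
 * can be built from the generic atomic ops. A hypothetical add helper:
 */
#if 0
static inline void
cpuset_add_sketch(struct cpuset *cs, unsigned int num)
{
atomic_setbits_int((volatile unsigned int *)&cs->cs_set[num / 32],
    1U << (num % 32));
}
#endif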
void cpuset_init_cpu(struct cpu_info *);
void cpuset_clear(struct cpuset *);
void cpuset_add(struct cpuset *, struct cpu_info *);
void cpuset_del(struct cpuset *, struct cpu_info *);
int cpuset_isset(struct cpuset *, struct cpu_info *);
void cpuset_add_all(struct cpuset *);
void cpuset_copy(struct cpuset *, struct cpuset *);
void cpuset_union(struct cpuset *, struct cpuset *, struct cpuset *);
void cpuset_intersection(struct cpuset *t, struct cpuset *, struct cpuset *);
void cpuset_complement(struct cpuset *, struct cpuset *, struct cpuset *);
int cpuset_cardinality(struct cpuset *);
struct cpu_info *cpuset_first(struct cpuset *);
#endif /* _KERNEL */
#endif /* !_SYS_PROC_H_ */
/* $OpenBSD: ip6_var.h,v 1.102 2022/09/03 22:43:38 mvs Exp $ */
/* $KAME: ip6_var.h,v 1.33 2000/06/11 14:59:20 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_var.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET6_IP6_VAR_H_
#define _NETINET6_IP6_VAR_H_
/*
* IP6 reassembly queue structure. Each fragment
* being reassembled is attached to one of these structures.
*/
struct ip6q {
TAILQ_ENTRY(ip6q) ip6q_queue;
LIST_HEAD(ip6asfrag_list, ip6asfrag) ip6q_asfrag;
struct in6_addr ip6q_src, ip6q_dst;
int ip6q_unfrglen; /* len of unfragmentable part */
int ip6q_nfrag; /* # of fragments */
u_int32_t ip6q_ident; /* fragment identification */
u_int8_t ip6q_nxt; /* ip6f_nxt in first fragment */
u_int8_t ip6q_ecn;
u_int8_t ip6q_ttl; /* time to live in slowtimo units */
};
struct ip6asfrag {
LIST_ENTRY(ip6asfrag) ip6af_list;
struct mbuf *ip6af_m;
int ip6af_offset; /* offset in ip6af_m to next header */
int ip6af_frglen; /* fragmentable part length */
int ip6af_off; /* fragment offset */
u_int16_t ip6af_mff; /* more fragment bit in frag off */
};
struct ip6_moptions {
LIST_HEAD(, in6_multi_mship) im6o_memberships;
unsigned short im6o_ifidx; /* ifp index for outgoing multicasts */
u_char im6o_hlim; /* hoplimit for outgoing multicasts */
u_char im6o_loop; /* 1 => hear sends if a member */
};
/*
* Control options for outgoing packets
*/
/* Routing header related info */
struct ip6po_rhinfo {
struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */
struct route_in6 ip6po_rhi_route; /* Route to the 1st hop */
};
#define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr
#define ip6po_route ip6po_rhinfo.ip6po_rhi_route
struct ip6_pktopts {
/* Hoplimit for outgoing packets */
int ip6po_hlim;
/* Outgoing IF/address information */
struct in6_pktinfo *ip6po_pktinfo;
/* Hop-by-Hop options header */
struct ip6_hbh *ip6po_hbh;
/* Destination options header (before a routing header) */
struct ip6_dest *ip6po_dest1;
/* Routing header related info. */
struct ip6po_rhinfo ip6po_rhinfo;
/* Destination options header (after a routing header) */
struct ip6_dest *ip6po_dest2;
/* traffic class */
int ip6po_tclass;
/* fragment vs PMTU discovery policy */
int ip6po_minmtu;
#define IP6PO_MINMTU_MCASTONLY -1 /* default: send at min MTU for multicast */
#define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */
#define IP6PO_MINMTU_ALL 1 /* always send at min MTU */
int ip6po_flags;
#define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */
};
struct ip6stat {
u_int64_t ip6s_total; /* total packets received */
u_int64_t ip6s_tooshort; /* packet too short */
u_int64_t ip6s_toosmall; /* not enough data */
u_int64_t ip6s_fragments; /* fragments received */
u_int64_t ip6s_fragdropped; /* frags dropped(dups, out of space) */
u_int64_t ip6s_fragtimeout; /* fragments timed out */
u_int64_t ip6s_fragoverflow; /* fragments that exceeded limit */
u_int64_t ip6s_forward; /* packets forwarded */
u_int64_t ip6s_cantforward; /* packets rcvd for unreachable dest */
u_int64_t ip6s_redirectsent; /* packets forwarded on same net */
u_int64_t ip6s_delivered; /* datagrams delivered to upper level*/
u_int64_t ip6s_localout; /* total ip packets generated here */
u_int64_t ip6s_odropped; /* lost output due to nobufs, etc. */
u_int64_t ip6s_reassembled; /* total packets reassembled ok */
u_int64_t ip6s_fragmented; /* datagrams successfully fragmented */
u_int64_t ip6s_ofragments; /* output fragments created */
u_int64_t ip6s_cantfrag; /* don't fragment flag was set, etc. */
u_int64_t ip6s_badoptions; /* error in option processing */
u_int64_t ip6s_noroute; /* packets discarded due to no route */
u_int64_t ip6s_badvers; /* ip6 version != 6 */
u_int64_t ip6s_rawout; /* total raw ip packets generated */
u_int64_t ip6s_badscope; /* scope error */
u_int64_t ip6s_notmember; /* don't join this multicast group */
u_int64_t ip6s_nxthist[256]; /* next header history */
u_int64_t ip6s_m1; /* one mbuf */
u_int64_t ip6s_m2m[32]; /* two or more mbuf */
u_int64_t ip6s_mext1; /* one ext mbuf */
u_int64_t ip6s_mext2m; /* two or more ext mbuf */
u_int64_t ip6s_nogif; /* no match gif found */
u_int64_t ip6s_toomanyhdr; /* discarded due to too many headers */
/*
* statistics for improvement of the source address selection
* algorithm:
* XXX: hardcoded 16 = # of ip6 multicast scope types + 1
*/
/* number of times that address selection fails */
u_int64_t ip6s_sources_none;
/* number of times that an address on the outgoing I/F is chosen */
u_int64_t ip6s_sources_sameif[16];
/* number of times that an address on a non-outgoing I/F is chosen */
u_int64_t ip6s_sources_otherif[16];
/*
* number of times that an address that has the same scope
* from the destination is chosen.
*/
u_int64_t ip6s_sources_samescope[16];
/*
* number of times that an address that has a different scope
* from the destination is chosen.
*/
u_int64_t ip6s_sources_otherscope[16];
/* number of times that an deprecated address is chosen */
u_int64_t ip6s_sources_deprecated[16];
u_int64_t ip6s_forward_cachehit;
u_int64_t ip6s_forward_cachemiss;
u_int64_t ip6s_wrongif; /* packet received on wrong interface */
u_int64_t ip6s_idropped; /* lost input due to nobufs, etc. */
};
#ifdef _KERNEL
#include <sys/percpu.h>
enum ip6stat_counters {
ip6s_total,
ip6s_tooshort,
ip6s_toosmall,
ip6s_fragments,
ip6s_fragdropped,
ip6s_fragtimeout,
ip6s_fragoverflow,
ip6s_forward,
ip6s_cantforward,
ip6s_redirectsent,
ip6s_delivered,
ip6s_localout,
ip6s_odropped,
ip6s_reassembled,
ip6s_fragmented,
ip6s_ofragments,
ip6s_cantfrag,
ip6s_badoptions,
ip6s_noroute,
ip6s_badvers,
ip6s_rawout,
ip6s_badscope,
ip6s_notmember,
ip6s_nxthist,
ip6s_m1 = ip6s_nxthist + 256,
ip6s_m2m,
ip6s_mext1 = ip6s_m2m + 32,
ip6s_mext2m,
ip6s_nogif,
ip6s_toomanyhdr,
ip6s_sources_none,
ip6s_sources_sameif,
ip6s_sources_otherif = ip6s_sources_sameif + 16,
ip6s_sources_samescope = ip6s_sources_otherif + 16,
ip6s_sources_otherscope = ip6s_sources_samescope + 16,
ip6s_sources_deprecated = ip6s_sources_otherscope + 16,
ip6s_forward_cachehit = ip6s_sources_deprecated + 16,
ip6s_forward_cachemiss,
ip6s_wrongif,
ip6s_idropped,
ip6s_ncounters,
};
extern struct cpumem *ip6counters;
static inline void
ip6stat_inc(enum ip6stat_counters c)
{
counters_inc(ip6counters, c);
}
static inline void
ip6stat_add(enum ip6stat_counters c, uint64_t v)
{
counters_add(ip6counters, c, v);
}
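/*
 * Usage sketch (illustrative, hypothetical call sites): scalar counters are
 * bumped by name, while array-valued counters such as ip6s_nxthist are
 * indexed by offset from their base enumerator, e.g.:
 *
 *	ip6stat_inc(ip6s_total);
 *	ip6stat_inc(ip6s_nxthist + nxt);	where 0 <= nxt <= 255
 */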
/* flags passed to ip6_output as last parameter */
#define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */
#define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */
#define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */
extern int ip6_mtudisc_timeout; /* mtu discovery */
extern struct rttimer_queue icmp6_mtudisc_timeout_q;
extern int ip6_defhlim; /* default hop limit */
extern int ip6_defmcasthlim; /* default multicast hop limit */
extern int ip6_forwarding; /* act as router? */
extern int ip6_mforwarding; /* act as multicast router? */
extern int ip6_multipath; /* use multipath routes */
extern int ip6_sendredirect; /* send ICMPv6 redirect? */
extern int ip6_use_deprecated; /* allow deprecated addr as source */
extern int ip6_mcast_pmtu; /* path MTU discovery for multicast */
extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */
extern int ip6_maxdynroutes; /* Max # of routes created via redirect */
extern struct socket *ip6_mrouter[RT_TABLEID_MAX + 1]; /* multicast routing daemon */
extern int ip6_sendredirects; /* send IP redirects when forwarding? */
extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */
extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */
extern int ip6_log_interval;
extern time_t ip6_log_time;
extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */
extern int ip6_dad_count; /* DupAddrDetectionTransmits */
extern int ip6_dad_pending; /* number of currently running DADs */
extern int ip6_auto_flowlabel;
extern int ip6_auto_linklocal;
#define IP6_SOIIKEY_LEN 16
extern uint8_t ip6_soiikey[IP6_SOIIKEY_LEN];
extern const struct pr_usrreqs rip6_usrreqs;
struct in6pcb;
struct inpcb;
int icmp6_ctloutput(int, struct socket *, int, int, struct mbuf *);
void ip6_init(void);
void ip6intr(void);
int ip6_input_if(struct mbuf **, int *, int, int, struct ifnet *);
void ip6_freepcbopts(struct ip6_pktopts *);
void ip6_freemoptions(struct ip6_moptions *);
int ip6_unknown_opt(struct mbuf **, u_int8_t *, int);
int ip6_get_prevhdr(struct mbuf *, int);
int ip6_nexthdr(struct mbuf *, int, int, int *);
int ip6_lasthdr(struct mbuf *, int, int, int *);
int ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int ip6_process_hopopts(struct mbuf **, u_int8_t *, int, u_int32_t *,
u_int32_t *);
void ip6_savecontrol(struct inpcb *, struct mbuf *, struct mbuf **);
int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t);
void ip6_forward(struct mbuf *, struct rtentry *, int);
void ip6_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in6 *);
int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route_in6 *, int,
struct ip6_moptions *, struct inpcb *);
int ip6_fragment(struct mbuf *, struct mbuf_list *, int, u_char, u_long);
int ip6_ctloutput(int, struct socket *, int, int, struct mbuf *);
int ip6_raw_ctloutput(int, struct socket *, int, int, struct mbuf *);
void ip6_initpktopts(struct ip6_pktopts *);
int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *,
struct ip6_pktopts *, int, int);
void ip6_clearpktopts(struct ip6_pktopts *, int);
void ip6_randomid_init(void);
u_int32_t ip6_randomid(void);
void ip6_send(struct mbuf *);
int route6_input(struct mbuf **, int *, int, int);
void frag6_init(void);
int frag6_input(struct mbuf **, int *, int, int);
int frag6_deletefraghdr(struct mbuf *, int);
void frag6_slowtimo(void);
void rip6_init(void);
int rip6_input(struct mbuf **, int *, int, int);
void rip6_ctlinput(int, struct sockaddr *, u_int, void *);
int rip6_ctloutput(int, struct socket *, int, int, struct mbuf *);
int rip6_output(struct mbuf *, struct socket *, struct sockaddr *,
struct mbuf *);
int rip6_attach(struct socket *, int);
int rip6_detach(struct socket *);
int rip6_bind(struct socket *, struct mbuf *, struct proc *);
int rip6_connect(struct socket *, struct mbuf *);
int rip6_disconnect(struct socket *);
int rip6_shutdown(struct socket *);
int rip6_send(struct socket *, struct mbuf *, struct mbuf *,
struct mbuf *);
int rip6_abort(struct socket *);
int rip6_sysctl(int *, u_int, void *, size_t *, void *, size_t);
int dest6_input(struct mbuf **, int *, int, int);
int none_input(struct mbuf **, int *, int);
int in6_pcbselsrc(struct in6_addr **, struct sockaddr_in6 *,
struct inpcb *, struct ip6_pktopts *);
int in6_selectsrc(struct in6_addr **, struct sockaddr_in6 *,
struct ip6_moptions *, unsigned int);
struct rtentry *in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct route_in6 *, unsigned int rtableid);
u_int32_t ip6_randomflowlabel(void);
#ifdef IPSEC
struct tdb;
int ip6_output_ipsec_lookup(struct mbuf *, struct inpcb *, struct tdb **);
int ip6_output_ipsec_send(struct tdb *, struct mbuf *, struct route_in6 *,
int, int);
#endif /* IPSEC */
#endif /* _KERNEL */
#endif /* !_NETINET6_IP6_VAR_H_ */
/* $OpenBSD: uvm_device.c,v 1.66 2021/12/15 12:53:53 mpi Exp $ */
/* $NetBSD: uvm_device.c,v 1.30 2000/11/25 06:27:59 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_device.c,v 1.1.2.9 1998/02/06 05:11:47 chs Exp
*/
/*
* uvm_device.c: the device pager.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <uvm/uvm.h>
#include <uvm/uvm_device.h>
#include "drm.h"
/*
* private global data structure
*
* we keep a list of active device objects in the system.
*/
LIST_HEAD(, uvm_device) udv_list = LIST_HEAD_INITIALIZER(udv_list);
struct mutex udv_lock = MUTEX_INITIALIZER(IPL_NONE);
/*
* functions
*/
static void udv_reference(struct uvm_object *);
static void udv_detach(struct uvm_object *);
static int udv_fault(struct uvm_faultinfo *, vaddr_t,
vm_page_t *, int, int, vm_fault_t,
vm_prot_t, int);
static boolean_t udv_flush(struct uvm_object *, voff_t, voff_t,
int);
/*
* master pager structure
*/
const struct uvm_pagerops uvm_deviceops = {
.pgo_reference = udv_reference,
.pgo_detach = udv_detach,
.pgo_fault = udv_fault,
.pgo_flush = udv_flush,
};
/*
* the ops!
*/
/*
* udv_attach
*
* get a VM object that is associated with a device. allocate a new
* one if needed.
*
* => nothing should be locked so that we can sleep here.
*
* The last two arguments (off and size) are only used for access checking.
*/
struct uvm_object *
udv_attach(dev_t device, vm_prot_t accessprot, voff_t off, vsize_t size)
{
struct uvm_device *udv, *lcv;
paddr_t (*mapfn)(dev_t, off_t, int);
#if NDRM > 0
struct uvm_object *obj;
#endif
/*
* before we do anything, ensure this device supports mmap
*/
mapfn = cdevsw[major(device)].d_mmap;
if (mapfn == NULL ||
mapfn == (paddr_t (*)(dev_t, off_t, int)) enodev ||
mapfn == (paddr_t (*)(dev_t, off_t, int)) nullop)
return(NULL);
/*
* Negative offsets on the object are not allowed.
*/
if (off < 0)
return(NULL);
#if NDRM > 0
obj = udv_attach_drm(device, accessprot, off, size);
if (obj)
return(obj);
#endif
/*
* Check that the specified range of the device allows the
* desired protection.
*
* XXX clobbers off and size, but nothing else here needs them.
*/
while (size != 0) {
if ((*mapfn)(device, off, accessprot) == -1)
return (NULL);
off += PAGE_SIZE;
size -= PAGE_SIZE;
}
/*
* keep looping until we get it
*/
for (;;) {
/*
* first, attempt to find it on the main list
*/
mtx_enter(&udv_lock);
LIST_FOREACH(lcv, &udv_list, u_list) {
if (device == lcv->u_device)
break;
}
/*
* got it on main list. put a hold on it and unlock udv_lock.
*/
if (lcv) {
/*
* if someone else has a hold on it, sleep and start
* over again. Else, we need to take the HOLD flag so we
* don't have to re-order locking here.
*/
if (lcv->u_flags & UVM_DEVICE_HOLD) {
lcv->u_flags |= UVM_DEVICE_WANTED;
msleep_nsec(lcv, &udv_lock, PVM | PNORELOCK,
"udv_attach", INFSLP);
continue;
}
/* we are now holding it */
lcv->u_flags |= UVM_DEVICE_HOLD;
mtx_leave(&udv_lock);
/*
* bump reference count, unhold, return.
*/
rw_enter(lcv->u_obj.vmobjlock, RW_WRITE);
lcv->u_obj.uo_refs++;
rw_exit(lcv->u_obj.vmobjlock);
mtx_enter(&udv_lock);
if (lcv->u_flags & UVM_DEVICE_WANTED)
wakeup(lcv);
lcv->u_flags &= ~(UVM_DEVICE_WANTED|UVM_DEVICE_HOLD);
mtx_leave(&udv_lock);
return(&lcv->u_obj);
}
/*
* Did not find it on main list. Need to allocate a new one.
*/
mtx_leave(&udv_lock);
/* NOTE: we could sleep in the following malloc() */
udv = malloc(sizeof(*udv), M_TEMP, M_WAITOK);
uvm_obj_init(&udv->u_obj, &uvm_deviceops, 1);
mtx_enter(&udv_lock);
/*
* now we have to double check to make sure no one added it
* to the list while we were sleeping...
*/
LIST_FOREACH(lcv, &udv_list, u_list) {
if (device == lcv->u_device)
break;
}
/*
* did we lose a race to someone else?
* free our memory and retry.
*/
if (lcv) {
mtx_leave(&udv_lock);
uvm_obj_destroy(&udv->u_obj);
free(udv, M_TEMP, sizeof(*udv));
continue;
}
/*
* we have it! init the data structures, add to list
* and return.
*/
udv->u_flags = 0;
udv->u_device = device;
LIST_INSERT_HEAD(&udv_list, udv, u_list);
mtx_leave(&udv_lock);
return(&udv->u_obj);
}
/*NOTREACHED*/
}
/*
* udv_reference
*
* add a reference to a VM object. Note that the reference count must
* already be one (the passed in reference) so there is no chance of the
* udv being released or locked out here.
*/
static void
udv_reference(struct uvm_object *uobj)
{
rw_enter(uobj->vmobjlock, RW_WRITE);
uobj->uo_refs++;
rw_exit(uobj->vmobjlock);
}
/*
* udv_detach
*
* remove a reference to a VM object.
*/
static void
udv_detach(struct uvm_object *uobj)
{
struct uvm_device *udv = (struct uvm_device *)uobj;
KERNEL_ASSERT_LOCKED();
/*
* loop until done
*/
again:
rw_enter(uobj->vmobjlock, RW_WRITE);
if (uobj->uo_refs > 1) {
uobj->uo_refs--;
rw_exit(uobj->vmobjlock);
return;
}
KASSERT(uobj->uo_npages == 0 && RBT_EMPTY(uvm_objtree, &uobj->memt));
/*
* is it being held? if so, wait until others are done.
*/
mtx_enter(&udv_lock);
if (udv->u_flags & UVM_DEVICE_HOLD) {
udv->u_flags |= UVM_DEVICE_WANTED;
rw_exit(uobj->vmobjlock);
msleep_nsec(udv, &udv_lock, PVM | PNORELOCK, "udv_detach",
INFSLP);
goto again;
}
/*
* got it! nuke it now.
*/
LIST_REMOVE(udv, u_list);
if (udv->u_flags & UVM_DEVICE_WANTED)
wakeup(udv);
mtx_leave(&udv_lock);
rw_exit(uobj->vmobjlock);
uvm_obj_destroy(uobj);
free(udv, M_TEMP, sizeof(*udv));
}
/*
* udv_flush
*
* flush pages out of a uvm object. a no-op for devices.
*/
static boolean_t
udv_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{
return(TRUE);
}
/*
* udv_fault: non-standard fault routine for device "pages"
*
* => rather than having a "get" function, we have a fault routine
* since we don't return vm_pages we need full control over the
* pmap_enter map in
* => on return, we unlock all fault data structures
* => flags: PGO_ALLPAGES: get all of the pages
* PGO_LOCKED: fault data structures are locked
* XXX: currently PGO_LOCKED is always required ... consider removing
* it as a flag
* => NOTE: vaddr is the VA of pps[0] in ufi->entry, _NOT_ pps[centeridx]
*/
static int
udv_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, vm_page_t *pps, int npages,
int centeridx, vm_fault_t fault_type, vm_prot_t access_type, int flags)
{
struct vm_map_entry *entry = ufi->entry;
struct uvm_object *uobj = entry->object.uvm_obj;
struct uvm_device *udv = (struct uvm_device *)uobj;
vaddr_t curr_va;
off_t curr_offset;
paddr_t paddr;
int lcv, retval;
dev_t device;
paddr_t (*mapfn)(dev_t, off_t, int);
vm_prot_t mapprot;
KERNEL_ASSERT_LOCKED();
/*
* we do not allow device mappings to be mapped copy-on-write
* so we kill any attempt to do so here.
*/
if (UVM_ET_ISCOPYONWRITE(entry)) {
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj);
return(VM_PAGER_ERROR);
}
/*
* get device map function.
*/
device = udv->u_device;
mapfn = cdevsw[major(device)].d_mmap;
/*
* now we must determine the offset in udv to use and the VA to
* use for pmap_enter. note that we always use orig_map's pmap
* for pmap_enter (even if we have a submap). since virtual
* addresses in a submap must match the main map, this is ok.
*/
/* udv offset = (offset from start of entry) + entry's offset */
curr_offset = entry->offset + (vaddr - entry->start);
/* pmap va = vaddr (virtual address of pps[0]) */
curr_va = vaddr;
/*
* loop over the page range entering in as needed
*/
retval = VM_PAGER_OK;
for (lcv = 0 ; lcv < npages ; lcv++, curr_offset += PAGE_SIZE,
curr_va += PAGE_SIZE) {
if ((flags & PGO_ALLPAGES) == 0 && lcv != centeridx)
continue;
if (pps[lcv] == PGO_DONTCARE)
continue;
paddr = (*mapfn)(device, curr_offset, access_type);
if (paddr == -1) {
retval = VM_PAGER_ERROR;
break;
}
mapprot = ufi->entry->protection;
if (pmap_enter(ufi->orig_map->pmap, curr_va, paddr,
mapprot, PMAP_CANFAIL | mapprot) != 0) {
/*
* pmap_enter() didn't have the resource to
* enter this mapping. Unlock everything,
* wait for the pagedaemon to free up some
* pages, and then tell uvm_fault() to start
* the fault again.
*
* XXX Needs some rethinking for the PGO_ALLPAGES
* XXX case.
*/
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap,
uobj);
/* sync what we have so far */
pmap_update(ufi->orig_map->pmap);
uvm_wait("udv_fault");
return (VM_PAGER_REFAULT);
}
}
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj);
pmap_update(ufi->orig_map->pmap);
return (retval);
}
/* $OpenBSD: tty_conf.c,v 1.23 2015/12/22 20:31:51 sf Exp $ */
/* $NetBSD: tty_conf.c,v 1.18 1996/05/19 17:17:55 jonathan Exp $ */
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tty_conf.c 8.4 (Berkeley) 1/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/conf.h>
#include "ppp.h"
#include "nmea.h"
#include "msts.h"
#include "endrun.h"
#define ttynodisc ((int (*)(dev_t, struct tty *, struct proc *))enodev)
#define ttyerrclose ((int (*)(struct tty *, int flags, struct proc *))enodev)
#define ttyerrio ((int (*)(struct tty *, struct uio *, int))enodev)
#define ttyerrinput ((int (*)(int c, struct tty *))enodev)
#define ttyerrstart ((int (*)(struct tty *))enodev)
struct linesw linesw[] =
{
{ ttyopen, ttylclose, ttread, ttwrite, nullioctl,
ttyinput, ttstart, ttymodem }, /* 0- termios */
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem }, /* 1- defunct */
/* 2- old NTTYDISC (defunct) */
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem },
/* 3- TABLDISC (defunct) */
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem },
/* 4- SLIPDISC (defunct) */
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem },
#if NPPP > 0
{ pppopen, pppclose, pppread, pppwrite, ppptioctl,
pppinput, pppstart, ttymodem }, /* 5- PPPDISC */
#else
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem },
#endif
/* 6- STRIPDISC (defunct) */
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem },
#if NNMEA > 0
{ nmeaopen, nmeaclose, ttread, ttwrite, nullioctl,
nmeainput, ttstart, ttymodem }, /* 7- NMEADISC */
#else
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem },
#endif
#if NMSTS > 0
{ mstsopen, mstsclose, ttread, ttwrite, nullioctl,
mstsinput, ttstart, ttymodem }, /* 8- MSTSDISC */
#else
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem },
#endif
#if NENDRUN > 0
{ endrunopen, endrunclose, ttread, ttwrite, nullioctl,
endruninput, ttstart, ttymodem }, /* 9- ENDRUNDISC */
#else
{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
ttyerrinput, ttyerrstart, nullmodem },
#endif
};
int nlinesw = sizeof (linesw) / sizeof (linesw[0]);
/*
* Do nothing specific version of line
* discipline specific ioctl command.
*/
int
nullioctl(struct tty *tp, u_long cmd, char *data, int flags, struct proc *p)
{
return (-1);
}
/* $OpenBSD: ufs_inode.c,v 1.44 2020/02/27 09:10:31 mpi Exp $ */
/* $NetBSD: ufs_inode.c,v 1.7 1996/05/11 18:27:52 mycroft Exp $ */
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_inode.c 8.7 (Berkeley) 7/22/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/namei.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dir.h>
#include <ufs/ufs/dirhash.h>
#endif
/*
* Last reference to an inode. If necessary, write or delete it.
*/
int
ufs_inactive(void *v)
{
struct vop_inactive_args *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
mode_t mode;
int error = 0;
#ifdef DIAGNOSTIC
extern int prtactive;
if (prtactive && vp->v_usecount != 0)
vprint("ufs_inactive: pushing active", vp);
#endif
/*
* Ignore inodes related to stale file handles.
*/
if (ip->i_din1 == NULL || DIP(ip, mode) == 0)
goto out;
if (DIP(ip, nlink) <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
if (getinoquota(ip) == 0)
(void)ufs_quota_free_inode(ip, NOCRED);
error = UFS_TRUNCATE(ip, (off_t)0, 0, NOCRED);
DIP_ASSIGN(ip, rdev, 0);
mode = DIP(ip, mode);
DIP_ASSIGN(ip, mode, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Setting the mode to zero needs to wait for the inode to be
* written just as does a change to the link count. So, rather
* than creating a new entry point to do the same thing, we
* just use softdep_change_linkcnt(). Also, we can't let
* softdep co-opt us to help on its worklist, as we may end up
* trying to recycle vnodes and getting to this same point a
* couple of times, blowing the kernel stack. However, this
* could be optimized by checking if we are coming from
* vrele(), vput() or vclean() (by checking for VXLOCK) and
* just avoiding the co-opt to happen in the last case.
*/
if (DOINGSOFTDEP(vp))
softdep_change_linkcnt(ip, 1);
UFS_INODE_FREE(ip, ip->i_number, mode);
}
if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) {
UFS_UPDATE(ip, 0);
}
out:
VOP_UNLOCK(vp);
/*
* If we are done with the inode, reclaim it
* so that it can be reused immediately.
*/
if (ip->i_din1 == NULL || DIP(ip, mode) == 0)
vrecycle(vp, ap->a_p);
return (error);
}
/*
* Reclaim an inode so that it can be used for other purposes.
*/
int
ufs_reclaim(struct vnode *vp)
{
struct inode *ip;
#ifdef DIAGNOSTIC
extern int prtactive;
if (prtactive && vp->v_usecount != 0)
vprint("ufs_reclaim: pushing active", vp);
#endif
ip = VTOI(vp);
/*
* Stop deferring timestamp writes
*/
if (ip->i_flag & IN_LAZYMOD) {
ip->i_flag |= IN_MODIFIED;
UFS_UPDATE(ip, 0);
}
/*
* Remove the inode from its hash chain.
*/
ufs_ihashrem(ip);
/*
* Purge old data structures associated with the inode.
*/
cache_purge(vp);
if (ip->i_devvp) {
vrele(ip->i_devvp);
}
#ifdef UFS_DIRHASH
if (ip->i_dirhash != NULL)
ufsdirhash_free(ip);
#endif
ufs_quota_delete(ip);
return (0);
}
/* $OpenBSD: toeplitz.c,v 1.10 2021/02/21 02:37:38 dlg Exp $ */
/*
* Copyright (c) 2009 The DragonFly Project. All rights reserved.
*
* This code is derived from software contributed to The DragonFly Project
* by Sepherosa Ziehau <sepherosa@gmail.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of The DragonFly Project nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific, prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 2019 David Gwynne <dlg@openbsd.org>
* Copyright (c) 2020 Theo Buehler <tb@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <net/toeplitz.h>
/*
* symmetric toeplitz
*/
static stoeplitz_key stoeplitz_keyseed = STOEPLITZ_KEYSEED;
static struct stoeplitz_cache stoeplitz_syskey_cache;
const struct stoeplitz_cache *const
stoeplitz_cache = &stoeplitz_syskey_cache;
/* parity of n16: count (mod 2) of ones in the binary representation. */
int
parity(uint16_t n16)
{
n16 = ((n16 & 0xaaaa) >> 1) ^ (n16 & 0x5555);
n16 = ((n16 & 0xcccc) >> 2) ^ (n16 & 0x3333);
n16 = ((n16 & 0xf0f0) >> 4) ^ (n16 & 0x0f0f);
n16 = ((n16 & 0xff00) >> 8) ^ (n16 & 0x00ff);
return (n16);
}
/*
* The Toeplitz matrix obtained from a seed is invertible if and only if the
* parity of the seed is 1. Generate such a seed uniformly at random.
*/
stoeplitz_key
stoeplitz_random_seed(void)
{
stoeplitz_key seed;
seed = arc4random() & UINT16_MAX;
if (parity(seed) == 0)
seed ^= 1;
return (seed);
}
void
stoeplitz_init(void)
{
stoeplitz_keyseed = stoeplitz_random_seed();
stoeplitz_cache_init(&stoeplitz_syskey_cache, stoeplitz_keyseed);
}
#define NBSK (NBBY * sizeof(stoeplitz_key))
/*
* The Toeplitz hash of a 16-bit number considered as a column vector over
* the field with two elements is calculated as a matrix multiplication with
* a 16x16 circulant Toeplitz matrix T generated by skey.
*
* The first eight columns H of T generate the remaining eight columns using
* the byteswap operation J = swap16: T = [H JH]. Thus, the Toeplitz hash of
* n = [hi lo] is computed via the formula T * n = (H * hi) ^ swap16(H * lo).
*
* Therefore the results H * val for all values of a byte are cached in scache.
*/
void
stoeplitz_cache_init(struct stoeplitz_cache *scache, stoeplitz_key skey)
{
uint16_t column[NBBY];
unsigned int b, shift, val;
bzero(column, sizeof(column));
/* Calculate the first eight columns H of the Toeplitz matrix T. */
for (b = 0; b < NBBY; ++b)
column[b] = skey << b | skey >> (NBSK - b);
/* Cache the results of H * val for all possible values of a byte. */
for (val = 0; val < 256; ++val) {
uint16_t res = 0;
for (b = 0; b < NBBY; ++b) {
shift = NBBY - b - 1;
if (val & (1 << shift))
res ^= column[b];
}
scache->bytes[val] = res;
}
}
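/*
 * Illustrative sketch (hypothetical helper, not part of this file): per the
 * formula above, the Toeplitz hash of a 16-bit value n = [hi lo] can be
 * recovered from the cache as (H * hi) ^ swap16(H * lo), assuming the
 * swap16() byte swap from <sys/endian.h>.
 */
#if 0
static uint16_t
stoeplitz_hash_n16_sketch(const struct stoeplitz_cache *scache, uint16_t n16)
{
uint16_t hi = scache->bytes[n16 >> 8];
uint16_t lo = scache->bytes[n16 & 0xff];
return (hi ^ swap16(lo));
}
#endif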
uint16_t
stoeplitz_hash_ip4(const struct stoeplitz_cache *scache,
in_addr_t faddr, in_addr_t laddr)
{
return (stoeplitz_hash_n32(scache, faddr ^ laddr));
}
uint16_t
stoeplitz_hash_ip4port(const struct stoeplitz_cache *scache,
in_addr_t faddr, in_addr_t laddr, in_port_t fport, in_port_t lport)
{
return (stoeplitz_hash_n32(scache, faddr ^ laddr ^ fport ^ lport));
}
#ifdef INET6
uint16_t
stoeplitz_hash_ip6(const struct stoeplitz_cache *scache,
const struct in6_addr *faddr6, const struct in6_addr *laddr6)
{
uint32_t n32 = 0;
size_t i;
for (i = 0; i < nitems(faddr6->s6_addr32); i++)
n32 ^= faddr6->s6_addr32[i] ^ laddr6->s6_addr32[i];
return (stoeplitz_hash_n32(scache, n32));
}
uint16_t
stoeplitz_hash_ip6port(const struct stoeplitz_cache *scache,
const struct in6_addr *faddr6, const struct in6_addr *laddr6,
in_port_t fport, in_port_t lport)
{
uint32_t n32 = 0;
size_t i;
for (i = 0; i < nitems(faddr6->s6_addr32); i++)
n32 ^= faddr6->s6_addr32[i] ^ laddr6->s6_addr32[i];
n32 ^= fport ^ lport;
return (stoeplitz_hash_n32(scache, n32));
}
#endif /* INET6 */
uint16_t
stoeplitz_hash_eaddr(const struct stoeplitz_cache *scache,
const uint8_t ea[static 6])
{
const uint16_t *ea16 = (const uint16_t *)ea;
return (stoeplitz_hash_n16(scache, ea16[0] ^ ea16[1] ^ ea16[2]));
}
void
stoeplitz_to_key(void *key, size_t klen)
{
uint8_t *k = key;
uint16_t skey = htons(stoeplitz_keyseed);
size_t i;
KASSERT((klen % 2) == 0);
for (i = 0; i < klen; i += sizeof(skey)) {
k[i + 0] = skey >> 8;
k[i + 1] = skey;
}
}
/* $OpenBSD: in6.c,v 1.248 2022/08/29 07:51:45 bluhm Exp $ */
/* $KAME: in6.c,v 1.372 2004/06/14 08:14:21 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.2 (Berkeley) 11/15/93
*/
#include "carp.h"
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#ifdef MROUTING
#include <netinet6/ip6_mroute.h>
#endif
#include <netinet6/in6_ifattach.h>
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
/*
* Definitions of some constant IP6 addresses.
*/
const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
const struct in6_addr in6addr_intfacelocal_allnodes =
IN6ADDR_INTFACELOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allnodes =
IN6ADDR_LINKLOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allrouters =
IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
const struct in6_addr in6mask0 = IN6MASK0;
const struct in6_addr in6mask32 = IN6MASK32;
const struct in6_addr in6mask64 = IN6MASK64;
const struct in6_addr in6mask96 = IN6MASK96;
const struct in6_addr in6mask128 = IN6MASK128;
int in6_ioctl(u_long, caddr_t, struct ifnet *, int);
int in6_ioctl_change_ifaddr(u_long, caddr_t, struct ifnet *);
int in6_ioctl_get(u_long, caddr_t, struct ifnet *);
int in6_check_embed_scope(struct sockaddr_in6 *, unsigned int);
int in6_clear_scope_id(struct sockaddr_in6 *, unsigned int);
int in6_ifinit(struct ifnet *, struct in6_ifaddr *, int);
void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *);
const struct sockaddr_in6 sa6_any = {
sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0
};
int
in6_mask2len(struct in6_addr *mask, u_char *lim0)
{
int x = 0, y;
u_char *lim = lim0, *p;
/* ignore the scope_id part */
if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask))
lim = (u_char *)mask + sizeof(*mask);
for (p = (u_char *)mask; p < lim; x++, p++) {
if (*p != 0xff)
break;
}
y = 0;
if (p < lim) {
for (y = 0; y < 8; y++) {
if ((*p & (0x80 >> y)) == 0)
break;
}
}
/*
* when the limit pointer is given, do a stricter check on the
* remaining bits.
*/
if (p < lim) {
if (y != 0 && (*p & (0x00ff >> y)) != 0)
return (-1);
for (p = p + 1; p < lim; p++)
if (*p != 0)
return (-1);
}
return x * 8 + y;
}
int
in6_nam2sin6(const struct mbuf *nam, struct sockaddr_in6 **sin6)
{
struct sockaddr *sa = mtod(nam, struct sockaddr *);
if (nam->m_len < offsetof(struct sockaddr, sa_data))
return EINVAL;
if (sa->sa_family != AF_INET6)
return EAFNOSUPPORT;
if (sa->sa_len != nam->m_len)
return EINVAL;
if (sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
*sin6 = satosin6(sa);
return 0;
}
int
in6_sa2sin6(struct sockaddr *sa, struct sockaddr_in6 **sin6)
{
if (sa->sa_family != AF_INET6)
return EAFNOSUPPORT;
if (sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
*sin6 = satosin6(sa);
return 0;
}
int
in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp)
{
int privileged;
int error;
privileged = 0;
if ((so->so_state & SS_PRIV) != 0)
privileged++;
switch (cmd) {
#ifdef MROUTING
case SIOCGETSGCNT_IN6:
case SIOCGETMIFCNT_IN6:
error = mrt6_ioctl(so, cmd, data);
break;
#endif /* MROUTING */
default:
error = in6_ioctl(cmd, data, ifp, privileged);
break;
}
return error;
}
int
in6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, int privileged)
{
if (ifp == NULL)
return (ENXIO);
switch (cmd) {
case SIOCGIFINFO_IN6:
case SIOCGNBRINFO_IN6:
return (nd6_ioctl(cmd, data, ifp));
case SIOCGIFDSTADDR_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCGIFALIFETIME_IN6:
return (in6_ioctl_get(cmd, data, ifp));
case SIOCAIFADDR_IN6:
case SIOCDIFADDR_IN6:
if (!privileged)
return (EPERM);
return (in6_ioctl_change_ifaddr(cmd, data, ifp));
case SIOCSIFADDR:
case SIOCSIFDSTADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
/*
* Do not pass those ioctls to the driver handler since they are not
* properly set up. Instead just error out.
*/
return (EINVAL);
default:
return (EOPNOTSUPP);
}
}
int
in6_ioctl_change_ifaddr(u_long cmd, caddr_t data, struct ifnet *ifp)
{
struct in6_ifaddr *ia6 = NULL;
struct in6_aliasreq *ifra = (struct in6_aliasreq *)data;
struct sockaddr *sa;
struct sockaddr_in6 *sa6 = NULL;
int error = 0, newifaddr = 0, plen;
/*
* Find address for this interface, if it exists.
*
* In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation
* only, and used the first interface address as the target of other
* operations (without checking ifra_addr). This was because netinet
* code/API assumed at most 1 interface address per interface.
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, we almost always look and check the
* presence of ifra_addr, and reject invalid ones here.
* It also decreases duplicated code among SIOC*_IN6 operations.
*
* We always require users to specify a valid IPv6 address for
* the corresponding operation.
*/
switch (cmd) {
case SIOCAIFADDR_IN6:
sa = sin6tosa(&ifra->ifra_addr);
break;
case SIOCDIFADDR_IN6:
sa = sin6tosa(&((struct in6_ifreq *)data)->ifr_addr);
break;
default:
panic("%s: invalid ioctl %lu", __func__, cmd);
}
if (sa->sa_family == AF_INET6) {
error = in6_sa2sin6(sa, &sa6);
if (error)
return (error);
}
NET_LOCK();
if (sa6 != NULL) {
error = in6_check_embed_scope(sa6, ifp->if_index);
if (error)
goto err;
error = in6_clear_scope_id(sa6, ifp->if_index);
if (error)
goto err;
ia6 = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr);
}
switch (cmd) {
case SIOCDIFADDR_IN6:
/*
* for IPv4, we look for existing in_ifaddr here to allow
* "ifconfig if0 delete" to remove the first IPv4 address on
* the interface. For IPv6, as the spec allows multiple
* interface addresses from day one, we consider "remove the
* first one" semantics not preferable.
*/
if (ia6 == NULL) {
error = EADDRNOTAVAIL;
break;
}
in6_purgeaddr(&ia6->ia_ifa);
if_addrhooks_run(ifp);
break;
case SIOCAIFADDR_IN6:
if (ifra->ifra_addr.sin6_family != AF_INET6 ||
ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) {
error = EAFNOSUPPORT;
break;
}
/* reject read-only flags */
if ((ifra->ifra_flags & IN6_IFF_DUPLICATED) != 0 ||
(ifra->ifra_flags & IN6_IFF_DETACHED) != 0 ||
(ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
error = EINVAL;
break;
}
if (ia6 == NULL)
newifaddr = 1;
/*
* Make the address tentative before joining multicast
* addresses, so that corresponding MLD responses would
* not have a tentative source address.
*/
if (newifaddr && in6if_do_dad(ifp))
ifra->ifra_flags |= IN6_IFF_TENTATIVE;
/*
* first, make or update the interface address structure,
* and link it to the list. try to enable inet6 if there
* is no link-local yet.
*/
error = in6_ifattach(ifp);
if (error)
break;
error = in6_update_ifa(ifp, ifra, ia6);
if (error)
break;
ia6 = NULL;
if (sa6 != NULL)
ia6 = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr);
if (ia6 == NULL) {
/*
* this can happen when the user specifies a 0 valid
* lifetime.
*/
break;
}
/* Perform DAD, if needed. */
if (ia6->ia6_flags & IN6_IFF_TENTATIVE)
nd6_dad_start(&ia6->ia_ifa);
if (!newifaddr) {
if_addrhooks_run(ifp);
break;
}
plen = in6_mask2len(&ia6->ia_prefixmask.sin6_addr, NULL);
if ((ifp->if_flags & IFF_LOOPBACK) || plen == 128) {
if_addrhooks_run(ifp);
break; /* No need to install a connected route. */
}
error = rt_ifa_add(&ia6->ia_ifa,
RTF_CLONING | RTF_CONNECTED | RTF_MPATH,
ia6->ia_ifa.ifa_addr, ifp->if_rdomain);
if (error) {
in6_purgeaddr(&ia6->ia_ifa);
break;
}
if_addrhooks_run(ifp);
break;
}
err:
NET_UNLOCK();
return (error);
}
int
in6_ioctl_get(u_long cmd, caddr_t data, struct ifnet *ifp)
{
struct in6_ifreq *ifr = (struct in6_ifreq *)data;
struct in6_ifaddr *ia6 = NULL;
struct sockaddr *sa;
struct sockaddr_in6 *sa6 = NULL;
int error = 0;
sa = sin6tosa(&ifr->ifr_addr);
if (sa->sa_family == AF_INET6) {
sa->sa_len = sizeof(struct sockaddr_in6);
error = in6_sa2sin6(sa, &sa6);
if (error)
return (error);
}
NET_LOCK_SHARED();
if (sa6 != NULL) {
error = in6_check_embed_scope(sa6, ifp->if_index);
if (error)
goto err;
error = in6_clear_scope_id(sa6, ifp->if_index);
if (error)
goto err;
ia6 = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr);
}
/* must think again about its semantics */
if (ia6 == NULL) {
error = EADDRNOTAVAIL;
goto err;
}
switch (cmd) {
case SIOCGIFDSTADDR_IN6:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
break;
}
/*
* XXX: should we check if ifa_dstaddr is NULL and return
* an error?
*/
ifr->ifr_dstaddr = ia6->ia_dstaddr;
break;
case SIOCGIFNETMASK_IN6:
ifr->ifr_addr = ia6->ia_prefixmask;
break;
case SIOCGIFAFLAG_IN6:
ifr->ifr_ifru.ifru_flags6 = ia6->ia6_flags;
break;
case SIOCGIFALIFETIME_IN6:
ifr->ifr_ifru.ifru_lifetime = ia6->ia6_lifetime;
if (ia6->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
time_t expire, maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire =
(time_t)~(1ULL << ((sizeof(maxexpire) * 8) - 1));
if (ia6->ia6_lifetime.ia6t_vltime <
maxexpire - ia6->ia6_updatetime) {
expire = ia6->ia6_updatetime +
ia6->ia6_lifetime.ia6t_vltime;
if (expire != 0) {
expire -= getuptime();
expire += gettime();
}
retlt->ia6t_expire = expire;
} else
retlt->ia6t_expire = maxexpire;
}
if (ia6->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
time_t expire, maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire =
(time_t)~(1ULL << ((sizeof(maxexpire) * 8) - 1));
if (ia6->ia6_lifetime.ia6t_pltime <
maxexpire - ia6->ia6_updatetime) {
expire = ia6->ia6_updatetime +
ia6->ia6_lifetime.ia6t_pltime;
if (expire != 0) {
expire -= getuptime();
expire += gettime();
}
retlt->ia6t_preferred = expire;
} else
retlt->ia6t_preferred = maxexpire;
}
break;
default:
panic("%s: invalid ioctl %lu", __func__, cmd);
}
err:
NET_UNLOCK_SHARED();
return (error);
}
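/*
* Validate or fill in the KAME-style embedded scope: for a link-local
* address the kernel keeps the interface index in s6_addr16[1].
*/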
int
in6_check_embed_scope(struct sockaddr_in6 *sa6, unsigned int ifidx)
{
if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) {
if (sa6->sin6_addr.s6_addr16[1] == 0) {
/* link ID is not embedded by the user */
sa6->sin6_addr.s6_addr16[1] = htons(ifidx);
} else if (sa6->sin6_addr.s6_addr16[1] != htons(ifidx))
return EINVAL; /* link ID contradicts */
}
return 0;
}
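/*
* Reject a sin6_scope_id that contradicts the given interface, then
* clear it so that only the embedded form is used internally.
*/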
int
in6_clear_scope_id(struct sockaddr_in6 *sa6, unsigned int ifidx)
{
if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) {
if (sa6->sin6_scope_id) {
if (sa6->sin6_scope_id != (u_int32_t)ifidx)
return EINVAL;
sa6->sin6_scope_id = 0; /* XXX: good way? */
}
}
return 0;
}
/*
* Update parameters of an IPv6 interface address.
* If necessary, a new entry is created and linked into address chains.
* This function is separated from in6_control().
*/
int
in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
struct in6_ifaddr *ia6)
{
int error = 0, hostIsNew = 0, plen = -1;
struct sockaddr_in6 dst6;
struct in6_addrlifetime *lt;
struct in6_multi_mship *imm;
struct rtentry *rt;
char addr[INET6_ADDRSTRLEN];
NET_ASSERT_LOCKED();
/* Validate parameters */
if (ifp == NULL || ifra == NULL) /* this may be redundant */
return (EINVAL);
/*
* The destination address for a p2p link must have a family
* of AF_UNSPEC or AF_INET6.
*/
if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
ifra->ifra_dstaddr.sin6_family != AF_INET6 &&
ifra->ifra_dstaddr.sin6_family != AF_UNSPEC)
return (EAFNOSUPPORT);
/*
* validate ifra_prefixmask. don't check sin6_family; a netmask
* does not carry fields other than sin6_len.
*/
if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6))
return (EINVAL);
/*
* Because the IPv6 address architecture is classless, we require
* users to specify a (non-zero) prefix length (mask) for a new address.
* We also require that the prefix mask (when specified) is valid, and
* thus reject a non-contiguous mask.
*/
if (ia6 == NULL && ifra->ifra_prefixmask.sin6_len == 0)
return (EINVAL);
if (ifra->ifra_prefixmask.sin6_len != 0) {
plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
(u_char *)&ifra->ifra_prefixmask +
ifra->ifra_prefixmask.sin6_len);
if (plen <= 0)
return (EINVAL);
} else {
/*
* In this case, ia6 must not be NULL. We just use its prefix
* length.
*/
plen = in6_mask2len(&ia6->ia_prefixmask.sin6_addr, NULL);
}
/*
* If the destination address on a p2p interface is specified,
* and the address is a scoped one, validate/set the scope
* zone identifier.
*/
dst6 = ifra->ifra_dstaddr;
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 &&
(dst6.sin6_family == AF_INET6)) {
error = in6_check_embed_scope(&dst6, ifp->if_index);
if (error)
return error;
}
/*
* The destination address can be specified only for a p2p or a
* loopback interface. If specified, the corresponding prefix length
* must be 128.
*/
if (ifra->ifra_dstaddr.sin6_family == AF_INET6) {
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0)
return (EINVAL);
if (plen != 128)
return (EINVAL);
}
/* lifetime consistency check */
lt = &ifra->ifra_lifetime;
if (lt->ia6t_pltime > lt->ia6t_vltime)
return (EINVAL);
if (lt->ia6t_vltime == 0) {
/*
* the following log might be noisy, but this is a typical
* configuration mistake or a tool's bug.
*/
nd6log((LOG_INFO, "%s: valid lifetime is 0 for %s\n", __func__,
inet_ntop(AF_INET6, &ifra->ifra_addr.sin6_addr,
addr, sizeof(addr))));
if (ia6 == NULL)
return (0); /* there's nothing to do */
}
/*
* If this is a new address, allocate a new ifaddr and link it
* into chains.
*/
if (ia6 == NULL) {
hostIsNew = 1;
ia6 = malloc(sizeof(*ia6), M_IFADDR, M_WAITOK | M_ZERO);
refcnt_init_trace(&ia6->ia_ifa.ifa_refcnt,
DT_REFCNT_IDX_IFADDR);
LIST_INIT(&ia6->ia6_memberships);
/* Initialize the address and masks, and put time stamp */
ia6->ia_ifa.ifa_addr = sin6tosa(&ia6->ia_addr);
ia6->ia_addr.sin6_family = AF_INET6;
ia6->ia_addr.sin6_len = sizeof(ia6->ia_addr);
ia6->ia6_updatetime = getuptime();
if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
/*
* XXX: some functions expect that ifa_dstaddr is not
* NULL for p2p interfaces.
*/
ia6->ia_ifa.ifa_dstaddr = sin6tosa(&ia6->ia_dstaddr);
} else {
ia6->ia_ifa.ifa_dstaddr = NULL;
}
ia6->ia_ifa.ifa_netmask = sin6tosa(&ia6->ia_prefixmask);
ia6->ia_ifp = ifp;
ia6->ia_addr = ifra->ifra_addr;
ifa_add(ifp, &ia6->ia_ifa);
}
/* set prefix mask */
if (ifra->ifra_prefixmask.sin6_len) {
/*
* We prohibit changing the prefix length of an existing
* address, because
* + such an operation should be rare in IPv6, and
* + the operation would confuse prefix management.
*/
if (ia6->ia_prefixmask.sin6_len &&
in6_mask2len(&ia6->ia_prefixmask.sin6_addr, NULL) != plen) {
error = EINVAL;
goto unlink;
}
ia6->ia_prefixmask = ifra->ifra_prefixmask;
}
/*
* If a new destination address is specified, scrub the old one and
* install the new destination. Note that the interface must be
* p2p or loopback (see the check above.)
*/
if ((ifp->if_flags & IFF_POINTOPOINT) && dst6.sin6_family == AF_INET6 &&
!IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr, &ia6->ia_dstaddr.sin6_addr)) {
struct ifaddr *ifa = &ia6->ia_ifa;
if ((ia6->ia_flags & IFA_ROUTE) != 0 &&
rt_ifa_del(ifa, RTF_HOST, ifa->ifa_dstaddr,
ifp->if_rdomain) != 0) {
nd6log((LOG_ERR, "%s: failed to remove a route "
"to the old destination: %s\n", __func__,
inet_ntop(AF_INET6, &ia6->ia_addr.sin6_addr,
addr, sizeof(addr))));
/* proceed anyway... */
} else
ia6->ia_flags &= ~IFA_ROUTE;
ia6->ia_dstaddr = dst6;
}
/*
* Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred
* to see if the address is deprecated or invalidated, but initialize
* these members for applications.
*/
ia6->ia6_updatetime = getuptime();
ia6->ia6_lifetime = ifra->ifra_lifetime;
if (ia6->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
ia6->ia6_lifetime.ia6t_expire =
getuptime() + ia6->ia6_lifetime.ia6t_vltime;
} else
ia6->ia6_lifetime.ia6t_expire = 0;
if (ia6->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
ia6->ia6_lifetime.ia6t_preferred =
getuptime() + ia6->ia6_lifetime.ia6t_pltime;
} else
ia6->ia6_lifetime.ia6t_preferred = 0;
/* reset the interface and routing table appropriately. */
if ((error = in6_ifinit(ifp, ia6, hostIsNew)) != 0)
goto unlink;
/* re-run DAD */
if (ia6->ia6_flags & (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED))
ifra->ifra_flags |= IN6_IFF_TENTATIVE;
/*
* configure address flags.
*/
ia6->ia6_flags = ifra->ifra_flags;
nd6_expire_timer_update(ia6);
/*
* We are done if we have simply modified an existing address.
*/
if (!hostIsNew)
return (error);
/*
* Beyond this point, we should call in6_purgeaddr upon an error,
* not just go to unlink.
*/
/* join necessary multicast groups */
if ((ifp->if_flags & IFF_MULTICAST) != 0) {
struct sockaddr_in6 mltaddr, mltmask;
/* join solicited multicast addr for new host id */
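/*
* The address built below has the solicited-node form
* ff02::1:ffXX:XXXX (RFC 4291), carrying the low 24 bits of the
* new address, with the interface index embedded in word 1.
*/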
struct sockaddr_in6 llsol;
bzero(&llsol, sizeof(llsol));
llsol.sin6_family = AF_INET6;
llsol.sin6_len = sizeof(llsol);
llsol.sin6_addr.s6_addr16[0] = htons(0xff02);
llsol.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
llsol.sin6_addr.s6_addr32[1] = 0;
llsol.sin6_addr.s6_addr32[2] = htonl(1);
llsol.sin6_addr.s6_addr32[3] =
ifra->ifra_addr.sin6_addr.s6_addr32[3];
llsol.sin6_addr.s6_addr8[12] = 0xff;
imm = in6_joingroup(ifp, &llsol.sin6_addr, &error);
if (!imm)
goto cleanup;
LIST_INSERT_HEAD(&ia6->ia6_memberships, imm, i6mm_chain);
bzero(&mltmask, sizeof(mltmask));
mltmask.sin6_len = sizeof(struct sockaddr_in6);
mltmask.sin6_family = AF_INET6;
mltmask.sin6_addr = in6mask32;
/*
* join link-local all-nodes address
*/
bzero(&mltaddr, sizeof(mltaddr));
mltaddr.sin6_len = sizeof(struct sockaddr_in6);
mltaddr.sin6_family = AF_INET6;
mltaddr.sin6_addr = in6addr_linklocal_allnodes;
mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
mltaddr.sin6_scope_id = 0;
/*
* XXX: do we really need these automatic routes?
* We should probably reconsider this stuff. Most applications
* actually do not need the routes, since they usually specify
* the outgoing interface.
*/
rt = rtalloc(sin6tosa(&mltaddr), 0, ifp->if_rdomain);
if (rt) {
/* 32bit came from "mltmask" */
if (memcmp(&mltaddr.sin6_addr,
&satosin6(rt_key(rt))->sin6_addr,
32 / 8)) {
rtfree(rt);
rt = NULL;
}
}
if (!rt) {
struct rt_addrinfo info;
bzero(&info, sizeof(info));
info.rti_ifa = &ia6->ia_ifa;
info.rti_info[RTAX_DST] = sin6tosa(&mltaddr);
info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia6->ia_addr);
info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask);
info.rti_info[RTAX_IFA] = sin6tosa(&ia6->ia_addr);
info.rti_flags = RTF_MULTICAST;
error = rtrequest(RTM_ADD, &info, RTP_CONNECTED, NULL,
ifp->if_rdomain);
if (error)
goto cleanup;
} else {
rtfree(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error);
if (!imm)
goto cleanup;
LIST_INSERT_HEAD(&ia6->ia6_memberships, imm, i6mm_chain);
/*
* join interface-local all-nodes address.
* (ff01::1%ifN, and ff01::%ifN/32)
*/
bzero(&mltaddr, sizeof(mltaddr));
mltaddr.sin6_len = sizeof(struct sockaddr_in6);
mltaddr.sin6_family = AF_INET6;
mltaddr.sin6_addr = in6addr_intfacelocal_allnodes;
mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
mltaddr.sin6_scope_id = 0;
/* XXX: again, do we really need the route? */
rt = rtalloc(sin6tosa(&mltaddr), 0, ifp->if_rdomain);
if (rt) {
/* 32bit came from "mltmask" */
if (memcmp(&mltaddr.sin6_addr,
&satosin6(rt_key(rt))->sin6_addr,
32 / 8)) {
rtfree(rt);
rt = NULL;
}
}
if (!rt) {
struct rt_addrinfo info;
bzero(&info, sizeof(info));
info.rti_ifa = &ia6->ia_ifa;
info.rti_info[RTAX_DST] = sin6tosa(&mltaddr);
info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia6->ia_addr);
info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask);
info.rti_info[RTAX_IFA] = sin6tosa(&ia6->ia_addr);
info.rti_flags = RTF_MULTICAST;
error = rtrequest(RTM_ADD, &info, RTP_CONNECTED, NULL,
ifp->if_rdomain);
if (error)
goto cleanup;
} else {
rtfree(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error);
if (!imm)
goto cleanup;
LIST_INSERT_HEAD(&ia6->ia6_memberships, imm, i6mm_chain);
}
return (error);
unlink:
/*
* XXX: if a change of an existing address failed, keep the entry
* anyway.
*/
if (hostIsNew)
in6_unlink_ifa(ia6, ifp);
return (error);
cleanup:
in6_purgeaddr(&ia6->ia_ifa);
return error;
}
void
in6_purgeaddr(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct in6_ifaddr *ia6 = ifatoia6(ifa);
struct in6_multi_mship *imm;
/* stop DAD processing */
nd6_dad_stop(ifa);
/*
* delete route to the destination of the address being purged.
* The interface must be p2p or loopback in this case.
*/
if ((ifp->if_flags & IFF_POINTOPOINT) && (ia6->ia_flags & IFA_ROUTE) &&
ia6->ia_dstaddr.sin6_len != 0) {
int e;
e = rt_ifa_del(ifa, RTF_HOST, ifa->ifa_dstaddr,
ifp->if_rdomain);
if (e != 0) {
char addr[INET6_ADDRSTRLEN];
log(LOG_ERR, "in6_purgeaddr: failed to remove "
"a route to the p2p destination: %s on %s, "
"errno=%d\n",
inet_ntop(AF_INET6, &ia6->ia_addr.sin6_addr,
addr, sizeof(addr)),
ifp->if_xname, e);
/* proceed anyway... */
} else
ia6->ia_flags &= ~IFA_ROUTE;
}
/* Remove ownaddr's loopback rtentry, if it exists. */
rt_ifa_dellocal(&(ia6->ia_ifa));
/*
* leave the multicast groups we have joined for the interface
*/
while (!LIST_EMPTY(&ia6->ia6_memberships)) {
imm = LIST_FIRST(&ia6->ia6_memberships);
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
in6_unlink_ifa(ia6, ifp);
}
void
in6_unlink_ifa(struct in6_ifaddr *ia6, struct ifnet *ifp)
{
struct ifaddr *ifa = &ia6->ia_ifa;
int plen;
NET_ASSERT_LOCKED();
/* Release the reference to the base prefix. */
plen = in6_mask2len(&ia6->ia_prefixmask.sin6_addr, NULL);
if ((ifp->if_flags & IFF_LOOPBACK) == 0 && plen != 128) {
rt_ifa_del(ifa, RTF_CLONING | RTF_CONNECTED,
ifa->ifa_addr, ifp->if_rdomain);
}
rt_ifa_purge(ifa);
ifa_del(ifp, ifa);
ia6->ia_ifp = NULL;
ifafree(&ia6->ia_ifa);
}
/*
* Initialize an interface's inet6 address
* and routing table entry.
*/
int
in6_ifinit(struct ifnet *ifp, struct in6_ifaddr *ia6, int newhost)
{
int error = 0, plen, ifacount = 0;
struct ifaddr *ifa;
NET_ASSERT_LOCKED();
/*
* Give the interface a chance to initialize
* if this is its first address (or it is a CARP interface)
* and to validate the address if necessary.
*/
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifacount++;
}
if ((ifacount <= 1 || ifp->if_type == IFT_CARP ||
(ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT))) &&
(error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia6))) {
return (error);
}
ia6->ia_ifa.ifa_metric = ifp->if_metric;
/* we could do in(6)_socktrim here, but just omit it at this moment. */
/*
* Special case:
* If the destination address is specified for a point-to-point
* interface, install a route to the destination as an interface
* direct route.
*/
plen = in6_mask2len(&ia6->ia_prefixmask.sin6_addr, NULL); /* XXX */
if ((ifp->if_flags & IFF_POINTOPOINT) && plen == 128 &&
ia6->ia_dstaddr.sin6_family == AF_INET6) {
ifa = &ia6->ia_ifa;
error = rt_ifa_add(ifa, RTF_HOST | RTF_MPATH,
ifa->ifa_dstaddr, ifp->if_rdomain);
if (error != 0)
return (error);
ia6->ia_flags |= IFA_ROUTE;
}
if (newhost)
error = rt_ifa_addlocal(&(ia6->ia_ifa));
return (error);
}
/*
* Add an address to the list of IP6 multicast addresses for a
* given interface.
*/
struct in6_multi *
in6_addmulti(struct in6_addr *maddr6, struct ifnet *ifp, int *errorp)
{
struct in6_ifreq ifr;
struct in6_multi *in6m;
NET_ASSERT_LOCKED();
*errorp = 0;
/*
* See if address already in list.
*/
IN6_LOOKUP_MULTI(*maddr6, ifp, in6m);
if (in6m != NULL) {
/*
* Found it; just increment the reference count.
*/
in6m->in6m_refcnt++;
} else {
/*
* New address; allocate a new multicast record
* and link it into the interface's multicast list.
*/
in6m = malloc(sizeof(*in6m), M_IPMADDR, M_NOWAIT | M_ZERO);
if (in6m == NULL) {
*errorp = ENOBUFS;
return (NULL);
}
in6m->in6m_sin.sin6_len = sizeof(struct sockaddr_in6);
in6m->in6m_sin.sin6_family = AF_INET6;
in6m->in6m_sin.sin6_addr = *maddr6;
in6m->in6m_refcnt = 1;
in6m->in6m_ifidx = ifp->if_index;
in6m->in6m_ifma.ifma_addr = sin6tosa(&in6m->in6m_sin);
/*
* Ask the network driver to update its multicast reception
* filter appropriately for the new address.
*/
memcpy(&ifr.ifr_addr, &in6m->in6m_sin, sizeof(in6m->in6m_sin));
*errorp = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
if (*errorp) {
free(in6m, M_IPMADDR, sizeof(*in6m));
return (NULL);
}
TAILQ_INSERT_HEAD(&ifp->if_maddrlist, &in6m->in6m_ifma,
ifma_list);
/*
* Let MLD6 know that we have joined a new IP6 multicast
* group.
*/
mld6_start_listening(in6m);
}
return (in6m);
}
/*
* Delete a multicast address record.
*/
void
in6_delmulti(struct in6_multi *in6m)
{
struct in6_ifreq ifr;
struct ifnet *ifp;
NET_ASSERT_LOCKED();
if (--in6m->in6m_refcnt == 0) {
/*
* No remaining claims to this record; let MLD6 know
* that we are leaving the multicast group.
*/
mld6_stop_listening(in6m);
ifp = if_get(in6m->in6m_ifidx);
/*
* Notify the network driver to update its multicast
* reception filter.
*/
if (ifp != NULL) {
bzero(&ifr.ifr_addr, sizeof(struct sockaddr_in6));
ifr.ifr_addr.sin6_len = sizeof(struct sockaddr_in6);
ifr.ifr_addr.sin6_family = AF_INET6;
ifr.ifr_addr.sin6_addr = in6m->in6m_addr;
KERNEL_LOCK();
(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
KERNEL_UNLOCK();
TAILQ_REMOVE(&ifp->if_maddrlist, &in6m->in6m_ifma,
ifma_list);
}
if_put(ifp);
free(in6m, M_IPMADDR, sizeof(*in6m));
}
}
/*
* Return 1 if the multicast group represented by ``maddr6'' has been
* joined by interface ``ifp'', 0 otherwise.
*/
int
in6_hasmulti(struct in6_addr *maddr6, struct ifnet *ifp)
{
struct in6_multi *in6m;
int joined;
IN6_LOOKUP_MULTI(*maddr6, ifp, in6m);
joined = (in6m != NULL);
return (joined);
}
struct in6_multi_mship *
in6_joingroup(struct ifnet *ifp, struct in6_addr *addr, int *errorp)
{
struct in6_multi_mship *imm;
imm = malloc(sizeof(*imm), M_IPMADDR, M_NOWAIT);
if (!imm) {
*errorp = ENOBUFS;
return NULL;
}
imm->i6mm_maddr = in6_addmulti(addr, ifp, errorp);
if (!imm->i6mm_maddr) {
/* *errorp is already set */
free(imm, M_IPMADDR, sizeof(*imm));
return NULL;
}
return imm;
}
void
in6_leavegroup(struct in6_multi_mship *imm)
{
if (imm->i6mm_maddr)
in6_delmulti(imm->i6mm_maddr);
free(imm, M_IPMADDR, sizeof(*imm));
}
/*
* Find an IPv6 interface link-local address specific to an interface.
*/
struct in6_ifaddr *
in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags)
{
struct ifaddr *ifa;
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) {
if ((ifatoia6(ifa)->ia6_flags & ignoreflags) != 0)
continue;
break;
}
}
return (ifatoia6(ifa));
}
/*
* find the internet address corresponding to a given interface and address.
*/
struct in6_ifaddr *
in6ifa_ifpwithaddr(struct ifnet *ifp, struct in6_addr *addr)
{
struct ifaddr *ifa;
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa)))
break;
}
return (ifatoia6(ifa));
}
/*
* Get a scope of the address. Node-local, link-local, site-local or global.
*/
int
in6_addrscope(struct in6_addr *addr)
{
int scope;
if (addr->s6_addr8[0] == 0xfe) {
scope = addr->s6_addr8[1] & 0xc0;
switch (scope) {
case 0x80:
return __IPV6_ADDR_SCOPE_LINKLOCAL;
break;
case 0xc0:
return __IPV6_ADDR_SCOPE_SITELOCAL;
break;
default:
return __IPV6_ADDR_SCOPE_GLOBAL; /* just in case */
break;
}
}
if (addr->s6_addr8[0] == 0xff) {
scope = addr->s6_addr8[1] & 0x0f;
/*
* because other scope values (such as reserved) exist,
* simply returning the raw scope value doesn't work.
*/
switch (scope) {
case __IPV6_ADDR_SCOPE_INTFACELOCAL:
return __IPV6_ADDR_SCOPE_INTFACELOCAL;
break;
case __IPV6_ADDR_SCOPE_LINKLOCAL:
return __IPV6_ADDR_SCOPE_LINKLOCAL;
break;
case __IPV6_ADDR_SCOPE_SITELOCAL:
return __IPV6_ADDR_SCOPE_SITELOCAL;
break;
default:
return __IPV6_ADDR_SCOPE_GLOBAL;
break;
}
}
if (bcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) {
if (addr->s6_addr8[15] == 1) /* loopback */
return __IPV6_ADDR_SCOPE_INTFACELOCAL;
if (addr->s6_addr8[15] == 0) /* unspecified */
return __IPV6_ADDR_SCOPE_LINKLOCAL;
}
return __IPV6_ADDR_SCOPE_GLOBAL;
}
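/*
* Map an address to the zone identifier used by this kernel: the
* interface index for interface- and link-local scopes, 0 otherwise.
*/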
int
in6_addr2scopeid(unsigned int ifidx, struct in6_addr *addr)
{
int scope = in6_addrscope(addr);
switch(scope) {
case __IPV6_ADDR_SCOPE_INTFACELOCAL:
case __IPV6_ADDR_SCOPE_LINKLOCAL:
/* XXX: we do not distinguish between a link and an I/F. */
return (ifidx);
case __IPV6_ADDR_SCOPE_SITELOCAL:
return (0); /* XXX: invalid. */
default:
return (0); /* XXX: treat as global. */
}
}
/*
* return the number of leading bits that dst and src have in common
* (hard coded for 128-bit IPv6 addresses)
*/
int
in6_matchlen(struct in6_addr *src, struct in6_addr *dst)
{
int match = 0;
u_char *s = (u_char *)src, *d = (u_char *)dst;
u_char *lim = s + 16, r;
while (s < lim)
if ((r = (*d++ ^ *s++)) != 0) {
while (r < 128) {
match++;
r <<= 1;
}
break;
} else
match += 8;
return match;
}
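/*
* Convert a prefix length (0-128) into an address mask.
* For example, len = 70 sets the first 8 bytes to 0xff and the
* ninth byte to 0xfc.
*/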
void
in6_prefixlen2mask(struct in6_addr *maskp, int len)
{
u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
int bytelen, bitlen, i;
/* sanity check */
if (0 > len || len > 128) {
log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n",
len);
return;
}
bzero(maskp, sizeof(*maskp));
bytelen = len / 8;
bitlen = len % 8;
for (i = 0; i < bytelen; i++)
maskp->s6_addr[i] = 0xff;
/* len == 128 is ok because bitlen == 0 then */
if (bitlen)
maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
}
/*
* return the best source address for the given destination, following
* the source address selection rules (RFC 6724) implemented below
*/
struct in6_ifaddr *
in6_ifawithscope(struct ifnet *oifp, struct in6_addr *dst, u_int rdomain)
{
int dst_scope = in6_addrscope(dst), src_scope, best_scope = 0;
int blen = -1;
struct ifaddr *ifa;
struct ifnet *ifp;
struct in6_ifaddr *ia6_best = NULL;
if (oifp == NULL) {
printf("%s: output interface is not specified\n", __func__);
return (NULL);
}
/* We search for all addresses on all interfaces from the beginning. */
TAILQ_FOREACH(ifp, &ifnet, if_list) {
if (ifp->if_rdomain != rdomain)
continue;
#if NCARP > 0
/*
* Never use a carp address of an interface which is not
* the master.
*/
if (ifp->if_type == IFT_CARP && !carp_iamatch(ifp))
continue;
#endif
/*
* We can never take an address that breaks the scope zone
* of the destination.
*/
if (in6_addr2scopeid(ifp->if_index, dst) !=
in6_addr2scopeid(oifp->if_index, dst))
continue;
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
int tlen = -1;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
src_scope = in6_addrscope(IFA_IN6(ifa));
/*
* Don't use an address before completing DAD
* nor a duplicated address.
*/
if (ifatoia6(ifa)->ia6_flags &
(IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED))
continue;
/*
* RFC 6724 allows anycast addresses as source address
* because the restriction was removed in RFC 4291.
* However RFC 4443 states that ICMPv6 responses
* MUST use a unicast source address.
*
* XXX Skip anycast addresses for now since
* icmp6_reflect() uses this function for source
* address selection.
*/
if (ifatoia6(ifa)->ia6_flags & IN6_IFF_ANYCAST)
continue;
if (ifatoia6(ifa)->ia6_flags & IN6_IFF_DETACHED)
continue;
/*
* If this is the first address we find,
* keep it anyway.
*/
if (ia6_best == NULL)
goto replace;
/*
* ia6_best is never NULL beyond this line except
* within the block labeled "replace".
*/
/*
* Rule 2: Prefer appropriate scope.
* Find the address with the smallest scope that is
* bigger (or equal) to the scope of the destination
* address.
* Accept an address with smaller scope than the
* destination if none exists with bigger scope.
*/
if (best_scope < src_scope) {
if (best_scope < dst_scope)
goto replace;
else
continue;
} else if (src_scope < best_scope) {
if (src_scope < dst_scope)
continue;
else
goto replace;
}
/* Rule 3: Avoid deprecated addresses. */
if (ifatoia6(ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
/*
* Ignore any deprecated addresses if
* specified by configuration.
*/
if (!ip6_use_deprecated)
continue;
/*
* If we have already found a non-deprecated
* candidate, just ignore deprecated addresses.
*/
if ((ia6_best->ia6_flags & IN6_IFF_DEPRECATED)
== 0)
continue;
} else if ((ia6_best->ia6_flags & IN6_IFF_DEPRECATED))
goto replace;
/*
* Rule 4: Prefer home addresses.
* We do not support home addresses.
*/
/* Rule 5: Prefer outgoing interface */
if (ia6_best->ia_ifp == oifp && ifp != oifp)
continue;
if (ia6_best->ia_ifp != oifp && ifp == oifp)
goto replace;
/*
* Rule 5.5: Prefer addresses in a prefix advertised
* by the next-hop.
* We do not track this information.
*/
/*
* Rule 6: Prefer matching label.
* We do not implement policy tables.
*/
/* Rule 7: Prefer temporary addresses. */
if ((ia6_best->ia6_flags & IN6_IFF_TEMPORARY) &&
!(ifatoia6(ifa)->ia6_flags & IN6_IFF_TEMPORARY))
continue;
if (!(ia6_best->ia6_flags & IN6_IFF_TEMPORARY) &&
(ifatoia6(ifa)->ia6_flags & IN6_IFF_TEMPORARY))
goto replace;
/* Rule 8: Use longest matching prefix. */
tlen = in6_matchlen(IFA_IN6(ifa), dst);
if (tlen > blen) {
#if NCARP > 0
/*
* Don't let carp interfaces win a tie against
* the output interface based on matchlen.
* We should only use a carp address if no
* other interface has a usable address.
* Otherwise, when communicating from a carp
* master to a carp backup, the backup system
* won't respond since the carp address is also
* configured as a local address on the backup.
* Note that carp interfaces in backup state
* were already skipped above.
*/
if (ifp->if_type == IFT_CARP &&
oifp->if_type != IFT_CARP)
continue;
#endif
goto replace;
} else if (tlen < blen)
continue;
/*
* If the eight rules fail to choose a single address,
* the tiebreaker is implementation-specific.
*/
/* Prefer address with highest pltime. */
if (ia6_best->ia6_updatetime +
ia6_best->ia6_lifetime.ia6t_pltime <
ifatoia6(ifa)->ia6_updatetime +
ifatoia6(ifa)->ia6_lifetime.ia6t_pltime)
goto replace;
else if (ia6_best->ia6_updatetime +
ia6_best->ia6_lifetime.ia6t_pltime >
ifatoia6(ifa)->ia6_updatetime +
ifatoia6(ifa)->ia6_lifetime.ia6t_pltime)
continue;
/* Prefer address with highest vltime. */
if (ia6_best->ia6_updatetime +
ia6_best->ia6_lifetime.ia6t_vltime <
ifatoia6(ifa)->ia6_updatetime +
ifatoia6(ifa)->ia6_lifetime.ia6t_vltime)
goto replace;
else if (ia6_best->ia6_updatetime +
ia6_best->ia6_lifetime.ia6t_vltime >
ifatoia6(ifa)->ia6_updatetime +
ifatoia6(ifa)->ia6_lifetime.ia6t_vltime)
continue;
continue;
replace:
ia6_best = ifatoia6(ifa);
blen = tlen >= 0 ? tlen :
in6_matchlen(IFA_IN6(ifa), dst);
best_scope =
in6_addrscope(&ia6_best->ia_addr.sin6_addr);
}
}
/* count statistics for future improvements */
if (ia6_best == NULL)
ip6stat_inc(ip6s_sources_none);
else {
if (oifp == ia6_best->ia_ifp)
ip6stat_inc(ip6s_sources_sameif + best_scope);
else
ip6stat_inc(ip6s_sources_otherif + best_scope);
if (best_scope == dst_scope)
ip6stat_inc(ip6s_sources_samescope + best_scope);
else
ip6stat_inc(ip6s_sources_otherscope + best_scope);
if ((ia6_best->ia6_flags & IN6_IFF_DEPRECATED) != 0)
ip6stat_inc(ip6s_sources_deprecated + best_scope);
}
return (ia6_best);
}
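/*
* Decide whether Duplicate Address Detection should be performed
* on this interface.
*/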
int
in6if_do_dad(struct ifnet *ifp)
{
if ((ifp->if_flags & IFF_LOOPBACK) != 0)
return (0);
switch (ifp->if_type) {
#if NCARP > 0
case IFT_CARP:
/*
* XXX: DAD does not work currently on carp(4)
* so disable it for now.
*/
return (0);
#endif
default:
/*
* Our DAD routine requires the interface to be up and running.
* However, some interfaces can be up before reaching the RUNNING
* state. Additionally, users may try to assign addresses
* before the interface becomes up (or running).
* We simply skip DAD in such cases as a workaround.
* XXX: we should rather mark such addresses as "tentative"
* and do DAD after the interface becomes ready.
*/
if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) !=
(IFF_UP|IFF_RUNNING))
return (0);
return (1);
}
}
void *
in6_domifattach(struct ifnet *ifp)
{
struct in6_ifextra *ext;
ext = malloc(sizeof(*ext), M_IFADDR, M_WAITOK | M_ZERO);
ext->nd_ifinfo = nd6_ifattach(ifp);
ext->nprefixes = 0;
ext->ndefrouters = 0;
return ext;
}
void
in6_domifdetach(struct ifnet *ifp, void *aux)
{
struct in6_ifextra *ext = (struct in6_ifextra *)aux;
nd6_ifdetach(ext->nd_ifinfo);
free(ext, M_IFADDR, sizeof(*ext));
}
/* $OpenBSD: ffs_inode.c,v 1.81 2021/12/12 09:14:59 visa Exp $ */
/* $NetBSD: ffs_inode.c,v 1.10 1996/05/11 18:27:19 mycroft Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int, long *);
/*
* Update the access, modified, and inode change times as specified by the
* IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. The IN_MODIFIED
* flag is used to specify that the inode needs to be updated but that the
* times have already been set. The IN_LAZYMOD flag is used to specify
* that the inode needs to be updated at some point, by reclaim if not
* in the course of other changes; this is used to defer writes just to
* update device timestamps. If waitfor is set, then wait for the disk
* write of the inode to complete.
*/
int
ffs_update(struct inode *ip, int waitfor)
{
struct vnode *vp;
struct fs *fs;
struct buf *bp;
int error;
vp = ITOV(ip);
ufs_itimes(vp);
if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0)
return (0);
ip->i_flag &= ~(IN_MODIFIED | IN_LAZYMOD);
fs = ip->i_fs;
/*
* Ensure that uid and gid are correct. This is a temporary
* fix until fsck has been changed to do the update.
*/
if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_inodefmt < FS_44INODEFMT) {
ip->i_din1->di_ouid = ip->i_ffs1_uid;
ip->i_din1->di_ogid = ip->i_ffs1_gid;
}
error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, &bp);
if (error) {
brelse(bp);
return (error);
}
if (DOINGSOFTDEP(vp))
softdep_update_inodeblock(ip, bp, waitfor);
else if (ip->i_effnlink != DIP(ip, nlink))
panic("ffs_update: bad link cnt");
#ifdef FFS2
if (ip->i_ump->um_fstype == UM_UFS2)
*((struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
else
#endif
*((struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
if (waitfor && !DOINGASYNC(vp)) {
return (bwrite(bp));
} else {
bdwrite(bp);
return (0);
}
}
#define SINGLE 0 /* index of single indirect block */
#define DOUBLE 1 /* index of double indirect block */
#define TRIPLE 2 /* index of triple indirect block */
/*
* Truncate the inode oip to at most length size, freeing the
* disk blocks.
*/
int
ffs_truncate(struct inode *oip, off_t length, int flags, struct ucred *cred)
{
struct vnode *ovp;
daddr_t lastblock;
daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR];
daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
struct fs *fs;
struct buf *bp;
int offset, size, level;
long count, nblocks, vflags, blocksreleased = 0;
int i, aflags, error, allerror;
off_t osize;
if (length < 0)
return (EINVAL);
ovp = ITOV(oip);
if (ovp->v_type != VREG &&
ovp->v_type != VDIR &&
ovp->v_type != VLNK)
return (0);
if (DIP(oip, size) == length)
return (0);
if (ovp->v_type == VLNK &&
(DIP(oip, size) < oip->i_ump->um_maxsymlinklen ||
(oip->i_ump->um_maxsymlinklen == 0 &&
oip->i_din1->di_blocks == 0))) {
#ifdef DIAGNOSTIC
if (length != 0)
panic("ffs_truncate: partial truncate of symlink");
#endif
memset(SHORTLINK(oip), 0, (size_t) DIP(oip, size));
DIP_ASSIGN(oip, size, 0);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (UFS_UPDATE(oip, 1));
}
if ((error = getinoquota(oip)) != 0)
return (error);
fs = oip->i_fs;
if (length > fs->fs_maxfilesize)
return (EFBIG);
uvm_vnp_setsize(ovp, length);
oip->i_ci.ci_lasta = oip->i_ci.ci_clen
= oip->i_ci.ci_cstart = oip->i_ci.ci_lastw = 0;
if (DOINGSOFTDEP(ovp)) {
if (length > 0 || softdep_slowdown(ovp)) {
/*
* If a file is only partially truncated, then
* we have to clean up the data structures
* describing the allocation past the truncation
* point. Finding and deallocating those structures
* is a lot of work. Since partial truncation occurs
* rarely, we solve the problem by syncing the file
* so that it will have no data structures left.
*/
if ((error = VOP_FSYNC(ovp, cred, MNT_WAIT,
curproc)) != 0)
return (error);
} else {
(void)ufs_quota_free_blocks(oip, DIP(oip, blocks),
NOCRED);
softdep_setup_freeblocks(oip, length);
vinvalbuf(ovp, 0, cred, curproc, 0, INFSLP);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (UFS_UPDATE(oip, 0));
}
}
osize = DIP(oip, size);
/*
* Lengthen the size of the file. We must ensure that the
* last byte of the file is allocated. Since the smallest
* value of osize is 0, length will be at least 1.
*/
if (osize < length) {
aflags = B_CLRBUF;
if (flags & IO_SYNC)
aflags |= B_SYNC;
error = UFS_BUF_ALLOC(oip, length - 1, 1,
cred, aflags, &bp);
if (error)
return (error);
DIP_ASSIGN(oip, size, length);
uvm_vnp_setsize(ovp, length);
(void) uvm_vnp_uncache(ovp);
if (aflags & B_SYNC)
bwrite(bp);
else
bawrite(bp);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (UFS_UPDATE(oip, 1));
}
uvm_vnp_setsize(ovp, length);
/*
* Shorten the size of the file. If the file is not being
* truncated to a block boundary, the contents of the
* partial block following the end of the file must be
* zero'ed in case it ever becomes accessible again because
* of subsequent file growth. Directories however are not
* zero'ed as they should grow back initialized to empty.
*/
offset = blkoff(fs, length);
if (offset == 0) {
DIP_ASSIGN(oip, size, length);
} else {
lbn = lblkno(fs, length);
aflags = B_CLRBUF;
if (flags & IO_SYNC)
aflags |= B_SYNC;
error = UFS_BUF_ALLOC(oip, length - 1, 1,
cred, aflags, &bp);
if (error)
return (error);
/*
* When we are doing soft updates and the UFS_BALLOC
* above fills in a direct block hole with a full sized
* block that will be truncated down to a fragment below,
* we must flush out the block dependency with an FSYNC
* so that we do not get a soft updates inconsistency
* when we create the fragment below.
*/
if (DOINGSOFTDEP(ovp) && lbn < NDADDR &&
fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
(error = VOP_FSYNC(ovp, cred, MNT_WAIT, curproc)) != 0)
return (error);
DIP_ASSIGN(oip, size, length);
size = blksize(fs, oip, lbn);
(void) uvm_vnp_uncache(ovp);
if (ovp->v_type != VDIR)
memset(bp->b_data + offset, 0, size - offset);
buf_adjcnt(bp, size);
if (aflags & B_SYNC)
bwrite(bp);
else
bawrite(bp);
}
/*
* Calculate index into inode's block list of
* last direct and indirect blocks (if any)
* which we want to keep. Lastblock is -1 when
* the file is truncated to 0.
*/
lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
lastiblock[SINGLE] = lastblock - NDADDR;
lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
nblocks = btodb(fs->fs_bsize);
/*
* Update file and block pointers on disk before we start freeing
* blocks. If we crash before free'ing blocks below, the blocks
* will be returned to the free list. lastiblock values are also
* normalized to -1 for calls to ffs_indirtrunc below.
*/
for (level = TRIPLE; level >= SINGLE; level--) {
oldblks[NDADDR + level] = DIP(oip, ib[level]);
if (lastiblock[level] < 0) {
DIP_ASSIGN(oip, ib[level], 0);
lastiblock[level] = -1;
}
}
for (i = 0; i < NDADDR; i++) {
oldblks[i] = DIP(oip, db[i]);
if (i > lastblock)
DIP_ASSIGN(oip, db[i], 0);
}
oip->i_flag |= IN_CHANGE | IN_UPDATE;
if ((error = UFS_UPDATE(oip, 1)) != 0)
allerror = error;
/*
* Having written the new inode to disk, save its new configuration
* and put back the old block pointers long enough to process them.
* Note that we save the new block configuration so we can check it
* when we are done.
*/
for (i = 0; i < NDADDR; i++) {
newblks[i] = DIP(oip, db[i]);
DIP_ASSIGN(oip, db[i], oldblks[i]);
}
for (i = 0; i < NIADDR; i++) {
newblks[NDADDR + i] = DIP(oip, ib[i]);
DIP_ASSIGN(oip, ib[i], oldblks[NDADDR + i]);
}
DIP_ASSIGN(oip, size, osize);
vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA;
allerror = vinvalbuf(ovp, vflags, cred, curproc, 0, INFSLP);
/*
* Indirect blocks first.
*/
indir_lbn[SINGLE] = -NDADDR;
indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1;
indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1;
for (level = TRIPLE; level >= SINGLE; level--) {
bn = DIP(oip, ib[level]);
if (bn != 0) {
error = ffs_indirtrunc(oip, indir_lbn[level],
fsbtodb(fs, bn), lastiblock[level], level, &count);
if (error)
allerror = error;
blocksreleased += count;
if (lastiblock[level] < 0) {
DIP_ASSIGN(oip, ib[level], 0);
ffs_blkfree(oip, bn, fs->fs_bsize);
blocksreleased += nblocks;
}
}
if (lastiblock[level] >= 0)
goto done;
}
/*
* All whole direct blocks or frags.
*/
for (i = NDADDR - 1; i > lastblock; i--) {
long bsize;
bn = DIP(oip, db[i]);
if (bn == 0)
continue;
DIP_ASSIGN(oip, db[i], 0);
bsize = blksize(fs, oip, i);
ffs_blkfree(oip, bn, bsize);
blocksreleased += btodb(bsize);
}
if (lastblock < 0)
goto done;
/*
* Finally, look for a change in size of the
* last direct block; release any frags.
*/
bn = DIP(oip, db[lastblock]);
if (bn != 0) {
long oldspace, newspace;
/*
* Calculate amount of space we're giving
* back as old block size minus new block size.
*/
oldspace = blksize(fs, oip, lastblock);
DIP_ASSIGN(oip, size, length);
newspace = blksize(fs, oip, lastblock);
if (newspace == 0)
panic("ffs_truncate: newspace");
if (oldspace - newspace > 0) {
/*
* Block number of space to be free'd is
* the old block # plus the number of frags
* required for the storage we're keeping.
*/
bn += numfrags(fs, newspace);
ffs_blkfree(oip, bn, oldspace - newspace);
blocksreleased += btodb(oldspace - newspace);
}
}
done:
#ifdef DIAGNOSTIC
for (level = SINGLE; level <= TRIPLE; level++)
if (newblks[NDADDR + level] != DIP(oip, ib[level]))
panic("ffs_truncate1");
for (i = 0; i < NDADDR; i++)
if (newblks[i] != DIP(oip, db[i]))
panic("ffs_truncate2");
#endif /* DIAGNOSTIC */
/*
* Put back the real size.
*/
DIP_ASSIGN(oip, size, length);
if (DIP(oip, blocks) >= blocksreleased)
DIP_ADD(oip, blocks, -blocksreleased);
else /* sanity */
DIP_ASSIGN(oip, blocks, 0);
oip->i_flag |= IN_CHANGE;
(void)ufs_quota_free_blocks(oip, blocksreleased, NOCRED);
return (allerror);
}
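/*
* BAP/BAP_ASSIGN access an indirect block's pointer array with the
* correct width: 32-bit entries (bap1) for UFS1, 64-bit entries
* (bap2) for UFS2.
*/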
#ifdef FFS2
#define BAP(ip, i) (((ip)->i_ump->um_fstype == UM_UFS2) ? bap2[i] : bap1[i])
#define BAP_ASSIGN(ip, i, value) \
do { \
if ((ip)->i_ump->um_fstype == UM_UFS2) \
bap2[i] = (value); \
else \
bap1[i] = (value); \
} while (0)
#else
#define BAP(ip, i) bap1[i]
#define BAP_ASSIGN(ip, i, value) do { bap1[i] = (value); } while (0)
#endif /* FFS2 */
/*
* Release blocks associated with the inode ip and stored in the indirect
* block bn. Blocks are free'd in LIFO order up to (but not including)
* lastbn. If level is greater than SINGLE, the block is an indirect block
* and recursive calls to indirtrunc must be used to cleanse other indirect
* blocks.
*
* NB: triple indirect blocks are untested.
*/
int
ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn,
daddr_t lastbn, int level, long *countp)
{
int i;
struct buf *bp;
struct fs *fs = ip->i_fs;
struct vnode *vp;
void *copy = NULL;
daddr_t nb, nlbn, last;
long blkcount, factor;
int nblocks, blocksreleased = 0;
int error = 0, allerror = 0;
int32_t *bap1 = NULL;
#ifdef FFS2
int64_t *bap2 = NULL;
#endif
/*
* Calculate index in current block of last
* block to be kept. -1 indicates the entire
* block so we need not calculate the index.
*/
factor = 1;
for (i = SINGLE; i < level; i++)
factor *= NINDIR(fs);
last = lastbn;
if (lastbn > 0)
last /= factor;
nblocks = btodb(fs->fs_bsize);
/*
* Get buffer of block pointers, zero those entries corresponding
* to blocks to be free'd, and update on disk copy first. Since
* double (triple) indirect blocks are freed before single (double)
* indirect blocks, calls to bmap on them will fail. However, we
* already have the on-disk address, so we have to set the b_blkno field
* explicitly instead of letting bread do everything for us.
*/
vp = ITOV(ip);
bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, INFSLP);
if (!(bp->b_flags & (B_DONE | B_DELWRI))) {
curproc->p_ru.ru_inblock++; /* pay for read */
bcstats.pendingreads++;
bcstats.numreads++;
bp->b_flags |= B_READ;
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
bp->b_blkno = dbn;
VOP_STRATEGY(bp->b_vp, bp);
error = biowait(bp);
}
if (error) {
brelse(bp);
*countp = 0;
return (error);
}
#ifdef FFS2
if (ip->i_ump->um_fstype == UM_UFS2)
bap2 = (int64_t *)bp->b_data;
else
#endif
bap1 = (int32_t *)bp->b_data;
if (lastbn != -1) {
copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK);
memcpy(copy, bp->b_data, fs->fs_bsize);
for (i = last + 1; i < NINDIR(fs); i++)
BAP_ASSIGN(ip, i, 0);
if (!DOINGASYNC(vp)) {
error = bwrite(bp);
if (error)
allerror = error;
} else {
bawrite(bp);
}
#ifdef FFS2
if (ip->i_ump->um_fstype == UM_UFS2)
bap2 = (int64_t *)copy;
else
#endif
bap1 = (int32_t *)copy;
}
/*
* Recursively free totally unused blocks.
*/
for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
i--, nlbn += factor) {
nb = BAP(ip, i);
if (nb == 0)
continue;
if (level > SINGLE) {
error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
-1, level - 1, &blkcount);
if (error)
allerror = error;
blocksreleased += blkcount;
}
ffs_blkfree(ip, nb, fs->fs_bsize);
blocksreleased += nblocks;
}
/*
* Recursively free last partial block.
*/
if (level > SINGLE && lastbn >= 0) {
last = lastbn % factor;
nb = BAP(ip, i);
if (nb != 0) {
error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
last, level - 1, &blkcount);
if (error)
allerror = error;
blocksreleased += blkcount;
}
}
if (copy != NULL) {
free(copy, M_TEMP, fs->fs_bsize);
} else {
bp->b_flags |= B_INVAL;
brelse(bp);
}
*countp = blocksreleased;
return (allerror);
}
/* $OpenBSD: in6_var.h,v 1.74 2022/01/02 22:36:04 jsg Exp $ */
/* $KAME: in6_var.h,v 1.55 2001/02/16 12:49:45 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1985, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_var.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET6_IN6_VAR_H_
#define _NETINET6_IN6_VAR_H_
/*
* Interface address, Internet version. One of these structures
* is allocated for each interface with an Internet address.
* The ifaddr structure contains the protocol-independent part
* of the structure and is assumed to be first.
*/
/*
* pltime/vltime are just for future reference (required to implement the
* 2-hour rule for hosts). they should never be modified by nd6_timeout or
* anywhere else.
* userland -> kernel: accept pltime/vltime
* kernel -> userland: throw up everything
* in kernel: modify preferred/expire only
*/
struct in6_addrlifetime {
time_t ia6t_expire; /* valid lifetime expiration time */
time_t ia6t_preferred; /* preferred lifetime expiration time */
u_int32_t ia6t_vltime; /* valid lifetime */
u_int32_t ia6t_pltime; /* prefix lifetime */
};
#ifdef _KERNEL
struct nd_ifinfo;
struct in6_ifextra {
struct nd_ifinfo *nd_ifinfo;
void *rs_lhcookie;
int nprefixes;
int ndefrouters;
};
struct in6_ifaddr {
struct ifaddr ia_ifa; /* protocol-independent info */
#define ia_ifp ia_ifa.ifa_ifp
#define ia_flags ia_ifa.ifa_flags
struct sockaddr_in6 ia_addr; /* interface address */
struct sockaddr_in6 ia_dstaddr; /* space for destination addr */
struct sockaddr_in6 ia_prefixmask; /* prefix mask */
TAILQ_ENTRY(in6_ifaddr) ia_list; /* list of IP6 addresses */
int ia6_flags;
struct in6_addrlifetime ia6_lifetime;
time_t ia6_updatetime;
/* multicast addresses joined from the kernel */
LIST_HEAD(, in6_multi_mship) ia6_memberships;
};
#endif /* _KERNEL */
/*
* IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12).
*/
struct in6_ifstat {
u_int64_t ifs6_in_receive; /* # of total input datagram */
u_int64_t ifs6_in_hdrerr; /* # of datagrams with invalid hdr */
u_int64_t ifs6_in_toobig; /* # of datagrams exceeded MTU */
u_int64_t ifs6_in_noroute; /* # of datagrams with no route */
u_int64_t ifs6_in_addrerr; /* # of datagrams with invalid dst */
u_int64_t ifs6_in_protounknown; /* # of datagrams with unknown proto */
/* NOTE: increment on final dst if */
u_int64_t ifs6_in_truncated; /* # of truncated datagrams */
u_int64_t ifs6_in_discard; /* # of discarded datagrams */
/* NOTE: fragment timeout is not here */
u_int64_t ifs6_in_deliver; /* # of datagrams delivered to ULP */
/* NOTE: increment on final dst if */
u_int64_t ifs6_out_forward; /* # of datagrams forwarded */
/* NOTE: increment on outgoing if */
u_int64_t ifs6_out_request; /* # of outgoing datagrams from ULP */
/* NOTE: does not include forwards */
u_int64_t ifs6_out_discard; /* # of discarded datagrams */
u_int64_t ifs6_out_fragok; /* # of datagrams fragmented */
u_int64_t ifs6_out_fragfail; /* # of datagrams failed on fragment */
u_int64_t ifs6_out_fragcreat; /* # of fragment datagrams */
/* NOTE: this is # after fragment */
u_int64_t ifs6_reass_reqd; /* # of incoming fragmented packets */
/* NOTE: increment on final dst if */
u_int64_t ifs6_reass_ok; /* # of reassembled packets */
/* NOTE: this is # after reass */
/* NOTE: increment on final dst if */
u_int64_t ifs6_reass_fail; /* # of reass failures */
/* NOTE: may not be packet count */
/* NOTE: increment on final dst if */
u_int64_t ifs6_in_mcast; /* # of inbound multicast datagrams */
u_int64_t ifs6_out_mcast; /* # of outbound multicast datagrams */
};
/*
* ICMPv6 interface statistics, as defined in RFC2466 Ipv6IfIcmpEntry.
* XXX: I'm not sure if this file is the right place for this structure...
*/
struct icmp6_ifstat {
/*
* Input statistics
*/
/* ipv6IfIcmpInMsgs, total # of input messages */
u_int64_t ifs6_in_msg;
/* ipv6IfIcmpInErrors, # of input error messages */
u_int64_t ifs6_in_error;
/* ipv6IfIcmpInDestUnreachs, # of input dest unreach errors */
u_int64_t ifs6_in_dstunreach;
/* ipv6IfIcmpInAdminProhibs, # of input administratively prohibited errs */
u_int64_t ifs6_in_adminprohib;
/* ipv6IfIcmpInTimeExcds, # of input time exceeded errors */
u_int64_t ifs6_in_timeexceed;
/* ipv6IfIcmpInParmProblems, # of input parameter problem errors */
u_int64_t ifs6_in_paramprob;
/* ipv6IfIcmpInPktTooBigs, # of input packet too big errors */
u_int64_t ifs6_in_pkttoobig;
/* ipv6IfIcmpInEchos, # of input echo requests */
u_int64_t ifs6_in_echo;
/* ipv6IfIcmpInEchoReplies, # of input echo replies */
u_int64_t ifs6_in_echoreply;
/* ipv6IfIcmpInRouterSolicits, # of input router solicitations */
u_int64_t ifs6_in_routersolicit;
/* ipv6IfIcmpInRouterAdvertisements, # of input router advertisements */
u_int64_t ifs6_in_routeradvert;
/* ipv6IfIcmpInNeighborSolicits, # of input neighbor solicitations */
u_int64_t ifs6_in_neighborsolicit;
/* ipv6IfIcmpInNeighborAdvertisements, # of input neighbor advertisements */
u_int64_t ifs6_in_neighboradvert;
/* ipv6IfIcmpInRedirects, # of input redirects */
u_int64_t ifs6_in_redirect;
/* ipv6IfIcmpInGroupMembQueries, # of input MLD queries */
u_int64_t ifs6_in_mldquery;
/* ipv6IfIcmpInGroupMembResponses, # of input MLD reports */
u_int64_t ifs6_in_mldreport;
/* ipv6IfIcmpInGroupMembReductions, # of input MLD done */
u_int64_t ifs6_in_mlddone;
/*
* Output statistics. We should solve unresolved routing problem...
*/
/* ipv6IfIcmpOutMsgs, total # of output messages */
u_int64_t ifs6_out_msg;
/* ipv6IfIcmpOutErrors, # of output error messages */
u_int64_t ifs6_out_error;
/* ipv6IfIcmpOutDestUnreachs, # of output dest unreach errors */
u_int64_t ifs6_out_dstunreach;
/* ipv6IfIcmpOutAdminProhibs, # of output administratively prohibited errs */
u_int64_t ifs6_out_adminprohib;
/* ipv6IfIcmpOutTimeExcds, # of output time exceeded errors */
u_int64_t ifs6_out_timeexceed;
/* ipv6IfIcmpOutParmProblems, # of output parameter problem errors */
u_int64_t ifs6_out_paramprob;
/* ipv6IfIcmpOutPktTooBigs, # of output packet too big errors */
u_int64_t ifs6_out_pkttoobig;
/* ipv6IfIcmpOutEchos, # of output echo requests */
u_int64_t ifs6_out_echo;
/* ipv6IfIcmpOutEchoReplies, # of output echo replies */
u_int64_t ifs6_out_echoreply;
/* ipv6IfIcmpOutRouterSolicits, # of output router solicitations */
u_int64_t ifs6_out_routersolicit;
/* ipv6IfIcmpOutRouterAdvertisements, # of output router advertisements */
u_int64_t ifs6_out_routeradvert;
/* ipv6IfIcmpOutNeighborSolicits, # of output neighbor solicitations */
u_int64_t ifs6_out_neighborsolicit;
/* ipv6IfIcmpOutNeighborAdvertisements, # of output neighbor advertisements */
u_int64_t ifs6_out_neighboradvert;
/* ipv6IfIcmpOutRedirects, # of output redirects */
u_int64_t ifs6_out_redirect;
/* ipv6IfIcmpOutGroupMembQueries, # of output MLD queries */
u_int64_t ifs6_out_mldquery;
/* ipv6IfIcmpOutGroupMembResponses, # of output MLD reports */
u_int64_t ifs6_out_mldreport;
/* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */
u_int64_t ifs6_out_mlddone;
};
struct in6_ifreq {
char ifr_name[IFNAMSIZ];
union {
struct sockaddr_in6 ifru_addr;
struct sockaddr_in6 ifru_dstaddr;
short ifru_flags;
int ifru_flags6;
int ifru_metric;
caddr_t ifru_data;
struct in6_addrlifetime ifru_lifetime;
struct in6_ifstat ifru_stat;
struct icmp6_ifstat ifru_icmp6stat;
} ifr_ifru;
};
struct in6_aliasreq {
char ifra_name[IFNAMSIZ];
union {
struct sockaddr_in6 ifrau_addr;
int ifrau_align;
} ifra_ifrau;
#ifndef ifra_addr
#define ifra_addr ifra_ifrau.ifrau_addr
#endif
struct sockaddr_in6 ifra_dstaddr;
struct sockaddr_in6 ifra_prefixmask;
int ifra_flags;
struct in6_addrlifetime ifra_lifetime;
};
/*
* Given a pointer to an in6_ifaddr (ifaddr),
* return a pointer to the addr as a sockaddr_in6
*/
#define IA6_IN6(ia) (&((ia)->ia_addr.sin6_addr))
#define IA6_DSTIN6(ia) (&((ia)->ia_dstaddr.sin6_addr))
#define IA6_MASKIN6(ia) (&((ia)->ia_prefixmask.sin6_addr))
#define IA6_SIN6(ia) (&((ia)->ia_addr))
#define IA6_DSTSIN6(ia) (&((ia)->ia_dstaddr))
#define IFA_IN6(x) (&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr)
#define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr)
#define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq)
#define SIOCAIFADDR_IN6 _IOW('i', 26, struct in6_aliasreq)
#define SIOCGIFDSTADDR_IN6 _IOWR('i', 34, struct in6_ifreq)
#define SIOCGIFNETMASK_IN6 _IOWR('i', 37, struct in6_ifreq)
#define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq)
#define SIOCGIFINFO_IN6 _IOWR('i', 108, struct in6_ndireq)
#define SIOCGNBRINFO_IN6 _IOWR('i', 78, struct in6_nbrinfo)
#define SIOCGIFALIFETIME_IN6 _IOWR('i', 81, struct in6_ifreq)
#define SIOCGETSGCNT_IN6 _IOWR('u', 106, struct sioc_sg_req6)
#define SIOCGETMIFCNT_IN6 _IOWR('u', 107, struct sioc_mif_req6)
#define IN6_IFF_ANYCAST 0x01 /* anycast address */
#define IN6_IFF_TENTATIVE 0x02 /* tentative address */
#define IN6_IFF_DUPLICATED 0x04 /* DAD detected duplicate */
#define IN6_IFF_DETACHED 0x08 /* may be detached from the link */
#define IN6_IFF_DEPRECATED 0x10 /* deprecated address */
#define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */
#define IN6_IFF_TEMPORARY 0x80 /* RFC 4941 temporary address */
#ifdef _KERNEL
#define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
(((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \
(((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \
(((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \
(((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 )
#define IN6_ARE_SCOPE_CMP(a,b) ((a)-(b))
#define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b))
/*
* Multi-cast membership entry. One for each group/ifp that a PCB
* belongs to.
*/
struct in6_multi_mship {
struct in6_multi *i6mm_maddr; /* Multicast address pointer */
LIST_ENTRY(in6_multi_mship) i6mm_chain; /* multicast options chain */
};
struct in6_multi {
struct ifmaddr in6m_ifma; /* Protocol-independent info */
#define in6m_refcnt in6m_ifma.ifma_refcnt
#define in6m_ifidx in6m_ifma.ifma_ifidx
struct sockaddr_in6 in6m_sin; /* IPv6 multicast address */
#define in6m_addr in6m_sin.sin6_addr
u_int in6m_state; /* state of membership */
u_int in6m_timer; /* MLD6 membership report timer */
};
static __inline struct in6_multi *
ifmatoin6m(struct ifmaddr *ifma)
{
return ((struct in6_multi *)(ifma));
}
/*
* Macros for looking up the in6_multi record for a given IP6 multicast
* address on a given interface. If no matching record is found, "in6m"
* returns NULL.
*/
#define IN6_LOOKUP_MULTI(addr, ifp, in6m) \
/* struct in6_addr addr; */ \
/* struct ifnet *ifp; */ \
/* struct in6_multi *in6m; */ \
do { \
struct ifmaddr *ifma; \
\
(in6m) = NULL; \
TAILQ_FOREACH(ifma, &(ifp)->if_maddrlist, ifma_list) \
if (ifma->ifma_addr->sa_family == AF_INET6 && \
IN6_ARE_ADDR_EQUAL(&ifmatoin6m(ifma)->in6m_addr, \
&(addr))) { \
(in6m) = ifmatoin6m(ifma); \
break; \
} \
} while (/* CONSTCOND */ 0)
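/*
* Typical use, as in in6_addmulti() above:
*	struct in6_multi *in6m;
*	IN6_LOOKUP_MULTI(*maddr6, ifp, in6m);
*	if (in6m != NULL)
*		in6m->in6m_refcnt++;
*/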
struct in6_multi *in6_addmulti(struct in6_addr *, struct ifnet *, int *);
void in6_delmulti(struct in6_multi *);
int in6_hasmulti(struct in6_addr *, struct ifnet *);
struct in6_multi_mship *in6_joingroup(struct ifnet *, struct in6_addr *, int *);
void in6_leavegroup(struct in6_multi_mship *);
int in6_control(struct socket *, u_long, caddr_t, struct ifnet *);
int in6_ioctl(u_long, caddr_t, struct ifnet *, int);
int in6_update_ifa(struct ifnet *, struct in6_aliasreq *,
struct in6_ifaddr *);
void in6_purgeaddr(struct ifaddr *);
int in6if_do_dad(struct ifnet *);
void *in6_domifattach(struct ifnet *);
void in6_domifdetach(struct ifnet *, void *);
struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int);
struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, struct in6_addr *);
int in6_addr2scopeid(unsigned int, struct in6_addr *);
int in6_matchlen(struct in6_addr *, struct in6_addr *);
void in6_prefixlen2mask(struct in6_addr *, int);
void in6_purgeprefix(struct ifnet *);
#endif /* _KERNEL */
#endif /* _NETINET6_IN6_VAR_H_ */
/* $OpenBSD: uvm_swap.c,v 1.164 2022/08/29 11:09:31 mpi Exp $ */
/* $NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $ */
/*
* Copyright (c) 1995, 1996, 1997 Matthew R. Green
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
* from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/extent.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/disk.h>
#include <sys/task.h>
#include <sys/pledge.h>
#if defined(NFSCLIENT)
#include <sys/socket.h>
#include <netinet/in.h>
#include <nfs/nfsproto.h>
#include <nfs/nfsdiskless.h>
#endif
#include <uvm/uvm.h>
#ifdef UVM_SWAP_ENCRYPT
#include <uvm/uvm_swap_encrypt.h>
#endif
#include <sys/specdev.h>
#include "vnd.h"
/*
* uvm_swap.c: manage configuration and i/o to swap space.
*/
/*
* swap space is managed in the following way:
*
* each swap partition or file is described by a "swapdev" structure.
* each "swapdev" structure contains a "swapent" structure which contains
* information that is passed up to the user (via system calls).
*
* each swap partition is assigned a "priority" (int) which controls
* swap partition usage.
*
* the system maintains a global data structure describing all swap
* partitions/files. there is a sorted LIST of "swappri" structures
* which describe "swapdev"'s at that priority. this LIST is headed
* by the "swap_priority" global var. each "swappri" contains a
* TAILQ of "swapdev" structures at that priority.
*
* locking:
* - swap_syscall_lock (sleep lock): this lock serializes the swapctl
* system call and prevents the swap priority list from changing
* while we are in the middle of a system call (e.g. SWAP_STATS).
* - uvm_swap_data_lock (mutex): this lock protects all swap data
* structures including the priority list, the swapdev structures,
* and the swapmap arena.
*
* each swap device has the following info:
* - swap device in use (could be disabled, preventing future use)
* - swap enabled (allows new allocations on swap)
* - map info in /dev/drum
* - vnode pointer
* for swap files only:
* - block size
* - max byte count in buffer
* - buffer
* - credentials to use when doing i/o to file
*
* userland controls and configures swap with the swapctl(2) system call.
* the sys_swapctl performs the following operations:
* [1] SWAP_NSWAP: returns the number of swap devices currently configured
* [2] SWAP_STATS: given a pointer to an array of swapent structures
* (passed in via "arg") of a size passed in via "misc" ... we load
* the current swap config into the array.
* [3] SWAP_ON: given a pathname in arg (could be device or file) and a
* priority in "misc", start swapping on it.
* [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
* [5] SWAP_CTL: changes the priority of a swap device (new priority in
* "misc")
*/
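/*
 * A minimal userland sketch of the SWAP_NSWAP/SWAP_STATS sequence
 * described above (illustration only; error handling omitted; assumes
 * the swapctl(2) prototype and struct swapent as used by swapctl(8)):
 *
 *	int n = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *sep = calloc(n, sizeof(*sep));
 *	n = swapctl(SWAP_STATS, sep, n);
 *
 * on return, the first "n" entries of sep describe the configured
 * swap devices, as filled in by the SWAP_STATS handler below.
 */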
/*
* swapdev: describes a single swap partition/file
*
* note the following should be true:
* swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
* swd_nblks <= swd_mapsize [because mapsize includes disklabel]
*/
struct swapdev {
struct swapent swd_se;
#define swd_dev swd_se.se_dev /* device id */
#define swd_flags swd_se.se_flags /* flags:inuse/enable/fake */
#define swd_priority swd_se.se_priority /* our priority */
#define swd_inuse swd_se.se_inuse /* blocks used */
#define swd_nblks swd_se.se_nblks /* total blocks */
char *swd_path; /* saved pathname of device */
int swd_pathlen; /* length of pathname */
int swd_npages; /* #pages we can use */
int swd_npginuse; /* #pages in use */
int swd_npgbad; /* #pages bad */
int swd_drumoffset; /* page0 offset in drum */
int swd_drumsize; /* #pages in drum */
blist_t swd_blist; /* blist for this swapdev */
struct vnode *swd_vp; /* backing vnode */
TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */
int swd_bsize; /* blocksize (bytes) */
int swd_maxactive; /* max active i/o reqs */
int swd_active; /* # of active i/o reqs */
struct bufq swd_bufq;
struct ucred *swd_cred; /* cred for file access */
#ifdef UVM_SWAP_ENCRYPT
#define SWD_KEY_SHIFT 7 /* One key per 0.5 MByte */
#define SWD_KEY(x,y) &((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])
#define SWD_KEY_SIZE(x) (((x) + (1 << SWD_KEY_SHIFT) - 1) >> SWD_KEY_SHIFT)
#define SWD_DCRYPT_SHIFT 5
#define SWD_DCRYPT_BITS 32
#define SWD_DCRYPT_MASK (SWD_DCRYPT_BITS - 1)
#define SWD_DCRYPT_OFF(x) ((x) >> SWD_DCRYPT_SHIFT)
#define SWD_DCRYPT_BIT(x) ((x) & SWD_DCRYPT_MASK)
#define SWD_DCRYPT_SIZE(x) (SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
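/*
 * Worked example for the macros above (assuming 4 KB pages, i.e.
 * PAGE_SHIFT == 12): one swap_key covers 1 << SWD_KEY_SHIFT == 128
 * pages, i.e. 512 KB ("one key per 0.5 MByte"), and SWD_KEY_SIZE()
 * yields the number of keys needed, i.e. npages rounded up and divided
 * by 128.  The decrypt bitmap keeps one bit per page packed into
 * u_int32_t words, so SWD_DCRYPT_SIZE(npages) is about npages / 8
 * bytes.
 */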
u_int32_t *swd_decrypt; /* bitmap for decryption */
struct swap_key *swd_keys; /* keys for different parts */
#endif
};
/*
* swap device priority entry; the list is kept sorted on `spi_priority'.
*/
struct swappri {
int spi_priority; /* priority */
TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
/* tailq of swapdevs at this priority */
LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
};
/*
* The following two structures are used to keep track of data transfers
* on swap devices associated with regular files.
* NOTE: this code is more or less a copy of vnd.c; we use the same
* structure names here to ease porting..
*/
struct vndxfer {
struct buf *vx_bp; /* Pointer to parent buffer */
struct swapdev *vx_sdp;
int vx_error;
int vx_pending; /* # of pending aux buffers */
int vx_flags;
#define VX_BUSY 1
#define VX_DEAD 2
};
struct vndbuf {
struct buf vb_buf;
struct vndxfer *vb_vnx;
struct task vb_task;
};
/*
* We keep a pool of vndbuf and vndxfer structures.
*/
struct pool vndxfer_pool;
struct pool vndbuf_pool;
/*
* local variables
*/
struct extent *swapmap; /* controls the mapping of /dev/drum */
/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
struct swap_priority swap_priority; /* [S] */
/* locks */
struct mutex uvm_swap_data_lock = MUTEX_INITIALIZER(IPL_NONE);
struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
struct vm_page *oompps[SWCLUSTPAGES];
int oom = 0;
/*
* prototypes
*/
void swapdrum_add(struct swapdev *, int);
struct swapdev *swapdrum_getsdp(int);
struct swapdev *swaplist_find(struct vnode *, int);
void swaplist_insert(struct swapdev *,
struct swappri *, int);
void swaplist_trim(void);
int swap_on(struct proc *, struct swapdev *);
int swap_off(struct proc *, struct swapdev *);
void sw_reg_strategy(struct swapdev *, struct buf *, int);
void sw_reg_iodone(struct buf *);
void sw_reg_iodone_internal(void *);
void sw_reg_start(struct swapdev *);
int uvm_swap_io(struct vm_page **, int, int, int);
void swapmount(void);
int uvm_swap_allocpages(struct vm_page **, int, int);
#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
void uvm_swap_markdecrypt(struct swapdev *, int, int, int);
boolean_t uvm_swap_needdecrypt(struct swapdev *, int);
void uvm_swap_initcrypt(struct swapdev *, int);
#endif
/*
* uvm_swap_init: init the swap system data structures and locks
*
* => called at boot time from init_main.c after the filesystems
* are brought up (which happens after uvm_init())
*/
void
uvm_swap_init(void)
{
int error;
/*
* first, init the swap list, its counter, and its lock.
* then get a handle on the vnode for /dev/drum by using
* its dev_t number ("swapdev", from MD conf.c).
*/
LIST_INIT(&swap_priority);
uvmexp.nswapdev = 0;
if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp))
panic("uvm_swap_init: can't get vnode for swap device");
/*
* create swap block extent to map /dev/drum. The extent spans
* 1 to INT_MAX, which allows 2 gigablocks of swap space. Note that
* block 0 is reserved (used to indicate an allocation failure,
* or no allocation).
*/
swapmap = extent_create("swapmap", 1, INT_MAX,
M_VMSWAP, 0, 0, EX_NOWAIT);
if (swapmap == 0)
panic("uvm_swap_init: extent_create failed");
/* allocate pools for structures used for swapping to files. */
pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, IPL_BIO, 0,
"swp vnx", NULL);
pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
"swp vnd", NULL);
/* allocate pages for OOM situations. */
error = uvm_swap_allocpages(oompps, SWCLUSTPAGES, UVM_PLA_NOWAIT);
KASSERT(error == 0);
/* Setup the initial swap partition */
swapmount();
}
#ifdef UVM_SWAP_ENCRYPT
void
uvm_swap_initcrypt_all(void)
{
struct swapdev *sdp;
struct swappri *spp;
int npages;
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_decrypt == NULL) {
npages = dbtob((uint64_t)sdp->swd_nblks) >>
PAGE_SHIFT;
uvm_swap_initcrypt(sdp, npages);
}
}
}
}
void
uvm_swap_initcrypt(struct swapdev *sdp, int npages)
{
/*
* keep information if a page needs to be decrypted when we get it
* from the swap device.
* We cannot chance a malloc later; if we are doing ASYNC puts,
* we may not call malloc with M_WAITOK. This consumes only
* 8KB memory for a 256MB swap partition.
*/
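/*
 * (Worked check of the figure above, assuming 4 KB pages: 256 MB of
 * swap is 65536 pages, and SWD_DCRYPT_SIZE(65536) is
 * 65536 / 32 * 4 bytes = 8 KB for the bitmap.)
 */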
sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP,
M_WAITOK|M_ZERO);
sdp->swd_keys = mallocarray(SWD_KEY_SIZE(npages),
sizeof(struct swap_key), M_VMSWAP, M_WAITOK|M_ZERO);
}
#endif /* UVM_SWAP_ENCRYPT */
int
uvm_swap_allocpages(struct vm_page **pps, int npages, int flags)
{
struct pglist pgl;
int error, i;
KASSERT(npages <= SWCLUSTPAGES);
TAILQ_INIT(&pgl);
again:
error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
dma_constraint.ucr_high, 0, 0, &pgl, npages, flags);
if (error && (curproc == uvm.pagedaemon_proc)) {
mtx_enter(&oommtx);
if (oom) {
msleep_nsec(&oom, &oommtx, PVM | PNORELOCK,
"oom", INFSLP);
goto again;
}
oom = 1;
for (i = 0; i < npages; i++) {
pps[i] = oompps[i];
atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
}
mtx_leave(&oommtx);
return 0;
}
if (error)
return error;
for (i = 0; i < npages; i++) {
pps[i] = TAILQ_FIRST(&pgl);
/* *sigh* */
atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
TAILQ_REMOVE(&pgl, pps[i], pageq);
}
return 0;
}
void
uvm_swap_freepages(struct vm_page **pps, int npages)
{
int i;
if (pps[0] == oompps[0]) {
for (i = 0; i < npages; i++)
uvm_pageclean(pps[i]);
mtx_enter(&oommtx);
KASSERT(oom == 1);
oom = 0;
mtx_leave(&oommtx);
wakeup(&oom);
return;
}
uvm_lock_pageq();
for (i = 0; i < npages; i++)
uvm_pagefree(pps[i]);
uvm_unlock_pageq();
}
#ifdef UVM_SWAP_ENCRYPT
/*
* Mark pages on the swap device for later decryption
*/
void
uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
int decrypt)
{
int pagestart, i;
int off, bit;
if (!sdp)
return;
pagestart = startslot - sdp->swd_drumoffset;
for (i = 0; i < npages; i++, pagestart++) {
off = SWD_DCRYPT_OFF(pagestart);
bit = SWD_DCRYPT_BIT(pagestart);
if (decrypt)
/* pages read need decryption */
sdp->swd_decrypt[off] |= 1 << bit;
else
/* pages read do not need decryption */
sdp->swd_decrypt[off] &= ~(1 << bit);
}
}
/*
* Check if the page that we got from disk needs to be decrypted
*/
boolean_t
uvm_swap_needdecrypt(struct swapdev *sdp, int off)
{
if (!sdp)
return FALSE;
off -= sdp->swd_drumoffset;
return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
TRUE : FALSE;
}
void
uvm_swap_finicrypt_all(void)
{
struct swapdev *sdp;
struct swappri *spp;
struct swap_key *key;
unsigned int nkeys;
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_decrypt == NULL)
continue;
nkeys = dbtob((uint64_t)sdp->swd_nblks) >> PAGE_SHIFT;
key = sdp->swd_keys + (SWD_KEY_SIZE(nkeys) - 1);
do {
if (key->refcount != 0)
swap_key_delete(key);
} while (key-- != sdp->swd_keys);
}
}
}
#endif /* UVM_SWAP_ENCRYPT */
/*
* swaplist functions: functions that operate on the list of swap
* devices on the system.
*/
/*
* swaplist_insert: insert swap device "sdp" into the global list
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
* => caller must provide a newly allocated swappri structure (we will
* FREE it if we don't need it... this is to prevent allocation
* blocking here while adding swap)
*/
void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
struct swappri *spp, *pspp;
KASSERT(rw_write_held(&swap_syscall_lock));
MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
/*
* find entry at or after which to insert the new device.
*/
pspp = NULL;
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
if (priority <= spp->spi_priority)
break;
pspp = spp;
}
/*
* new priority?
*/
if (spp == NULL || spp->spi_priority != priority) {
spp = newspp; /* use newspp! */
spp->spi_priority = priority;
TAILQ_INIT(&spp->spi_swapdev);
if (pspp)
LIST_INSERT_AFTER(pspp, spp, spi_swappri);
else
LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
} else {
/* we don't need a new priority structure, free it */
free(newspp, M_VMSWAP, sizeof(*newspp));
}
/*
* priority found (or created). now insert on the priority's
* tailq list and bump the total number of swapdevs.
*/
sdp->swd_priority = priority;
TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev++;
}
/*
* swaplist_find: find and optionally remove a swap device from the
* global list.
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
* => we return the swapdev we found (and removed)
*/
struct swapdev *
swaplist_find(struct vnode *vp, boolean_t remove)
{
struct swapdev *sdp;
struct swappri *spp;
KASSERT(rw_write_held(&swap_syscall_lock));
MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
/*
* search the lists for the requested vp
*/
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_vp != vp)
continue;
if (remove) {
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev--;
}
return (sdp);
}
}
return (NULL);
}
/*
* swaplist_trim: scan priority list for empty priority entries and kill
* them.
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
*/
void
swaplist_trim(void)
{
struct swappri *spp, *nextspp;
KASSERT(rw_write_held(&swap_syscall_lock));
MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
if (!TAILQ_EMPTY(&spp->spi_swapdev))
continue;
LIST_REMOVE(spp, spi_swappri);
free(spp, M_VMSWAP, sizeof(*spp));
}
}
/*
* swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
*
* => caller must hold swap_syscall_lock
* => uvm_swap_data_lock should be unlocked (we may sleep)
*/
void
swapdrum_add(struct swapdev *sdp, int npages)
{
u_long result;
if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
EX_WAITOK, &result))
panic("swapdrum_add");
sdp->swd_drumoffset = result;
sdp->swd_drumsize = npages;
}
/*
* swapdrum_getsdp: given a page offset in /dev/drum, convert it back
* to the "swapdev" that maps that section of the drum.
*
* => each swapdev takes one big contig chunk of the drum
* => caller must hold uvm_swap_data_lock
*/
struct swapdev *
swapdrum_getsdp(int pgno)
{
struct swapdev *sdp;
struct swappri *spp;
MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (pgno >= sdp->swd_drumoffset &&
pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
return sdp;
}
}
}
return NULL;
}
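/*
 * Drum layout illustration (hypothetical numbers): if swapdev A was
 * added with drumoffset 1 and drumsize 4096, and swapdev B with
 * drumoffset 4097 and drumsize 8192, then swapdrum_getsdp(5000)
 * returns B, since 4097 <= 5000 < 4097 + 8192, and drum page 5000 is
 * page 5000 - 4097 = 903 within B itself.
 */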
/*
* sys_swapctl: main entry point for swapctl(2) system call
* [with two helper functions: swap_on and swap_off]
*/
int
sys_swapctl(struct proc *p, void *v, register_t *retval)
{
struct sys_swapctl_args /* {
syscallarg(int) cmd;
syscallarg(void *) arg;
syscallarg(int) misc;
} */ *uap = (struct sys_swapctl_args *)v;
struct vnode *vp;
struct nameidata nd;
struct swappri *spp;
struct swapdev *sdp;
struct swapent *sep;
char userpath[MAXPATHLEN];
size_t len;
int count, error, misc;
int priority;
misc = SCARG(uap, misc);
if ((error = pledge_swapctl(p, SCARG(uap, cmd))))
return error;
/*
* ensure serialized syscall access by grabbing the swap_syscall_lock
*/
rw_enter_write(&swap_syscall_lock);
/*
* we handle the non-priv NSWAP and STATS request first.
*
* SWAP_NSWAP: return number of config'd swap devices
* [can also be obtained with uvmexp sysctl]
*/
if (SCARG(uap, cmd) == SWAP_NSWAP) {
*retval = uvmexp.nswapdev;
error = 0;
goto out;
}
/*
* SWAP_STATS: get stats on current # of configured swap devs
*
* note that the swap_priority list can't change as long
* as we are holding the swap_syscall_lock. we don't want
* to grab the uvm_swap_data_lock because we may fault&sleep during
* copyout() and we don't want to be holding that lock then!
*/
if (SCARG(uap, cmd) == SWAP_STATS) {
sep = (struct swapent *)SCARG(uap, arg);
count = 0;
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (count >= misc)
continue;
sdp->swd_inuse =
btodb((u_int64_t)sdp->swd_npginuse <<
PAGE_SHIFT);
error = copyout(&sdp->swd_se, sep,
sizeof(struct swapent));
if (error)
goto out;
/* now copy out the path if necessary */
error = copyoutstr(sdp->swd_path,
sep->se_path, sizeof(sep->se_path), NULL);
if (error)
goto out;
count++;
sep++;
}
}
*retval = count;
error = 0;
goto out;
}
/* all other requests require superuser privs. verify. */
if ((error = suser(p)))
goto out;
/*
* at this point we expect a path name in arg. we will
* use namei() to gain a vnode reference (vref), and lock
* the vnode (VOP_LOCK).
*/
error = copyinstr(SCARG(uap, arg), userpath, sizeof(userpath), &len);
if (error)
goto out;
disk_map(userpath, userpath, sizeof(userpath), DM_OPENBLCK);
NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, userpath, p);
if ((error = namei(&nd)))
goto out;
vp = nd.ni_vp;
/* note: "vp" is referenced and locked */
error = 0; /* assume no error */
switch(SCARG(uap, cmd)) {
case SWAP_DUMPDEV:
if (vp->v_type != VBLK) {
error = ENOTBLK;
break;
}
dumpdev = vp->v_rdev;
break;
case SWAP_CTL:
/*
* get new priority, remove old entry (if any) and then
* reinsert it in the correct place. finally, prune out
* any empty priority structures.
*/
priority = SCARG(uap, misc);
spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
mtx_enter(&uvm_swap_data_lock);
if ((sdp = swaplist_find(vp, 1)) == NULL) {
error = ENOENT;
} else {
swaplist_insert(sdp, spp, priority);
swaplist_trim();
}
mtx_leave(&uvm_swap_data_lock);
if (error)
free(spp, M_VMSWAP, sizeof(*spp));
break;
case SWAP_ON:
/*
* If the device is a regular file, make sure the filesystem
* can be used for swapping.
*/
if (vp->v_type == VREG &&
(vp->v_mount->mnt_flag & MNT_SWAPPABLE) == 0) {
error = ENOTSUP;
break;
}
/*
* check for duplicates. if none found, then insert a
* dummy entry on the list to prevent someone else from
* trying to enable this device while we are working on
* it.
*/
priority = SCARG(uap, misc);
sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
sdp->swd_flags = SWF_FAKE; /* placeholder only */
sdp->swd_vp = vp;
sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
/*
* XXX Is NFS elaboration necessary?
*/
if (vp->v_type == VREG) {
sdp->swd_cred = crdup(p->p_ucred);
}
mtx_enter(&uvm_swap_data_lock);
if (swaplist_find(vp, 0) != NULL) {
error = EBUSY;
mtx_leave(&uvm_swap_data_lock);
if (vp->v_type == VREG) {
crfree(sdp->swd_cred);
}
free(sdp, M_VMSWAP, sizeof *sdp);
free(spp, M_VMSWAP, sizeof *spp);
break;
}
swaplist_insert(sdp, spp, priority);
mtx_leave(&uvm_swap_data_lock);
sdp->swd_pathlen = len;
sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
strlcpy(sdp->swd_path, userpath, len);
/*
* we've now got a FAKE placeholder in the swap list.
* now attempt to enable swap on it. if we fail, undo
* what we've done and kill the fake entry we just inserted.
* if swap_on is a success, it will clear the SWF_FAKE flag
*/
if ((error = swap_on(p, sdp)) != 0) {
mtx_enter(&uvm_swap_data_lock);
(void) swaplist_find(vp, 1); /* kill fake entry */
swaplist_trim();
mtx_leave(&uvm_swap_data_lock);
if (vp->v_type == VREG) {
crfree(sdp->swd_cred);
}
free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
free(sdp, M_VMSWAP, sizeof(*sdp));
break;
}
break;
case SWAP_OFF:
mtx_enter(&uvm_swap_data_lock);
if ((sdp = swaplist_find(vp, 0)) == NULL) {
mtx_leave(&uvm_swap_data_lock);
error = ENXIO;
break;
}
/*
* If a device isn't in use or enabled, we
* can't stop swapping from it (again).
*/
if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
mtx_leave(&uvm_swap_data_lock);
error = EBUSY;
break;
}
/*
* do the real work.
*/
error = swap_off(p, sdp);
break;
default:
error = EINVAL;
}
/* done! release the ref gained by namei() and unlock. */
vput(vp);
out:
rw_exit_write(&swap_syscall_lock);
return (error);
}
/*
* swap_on: attempt to enable a swapdev for swapping. note that the
* swapdev is already on the global list, but disabled (marked
* SWF_FAKE).
*
* => we avoid the start of the disk (to protect disk labels)
* => caller should leave uvm_swap_data_lock unlocked, we may lock it
* if needed.
*/
int
swap_on(struct proc *p, struct swapdev *sdp)
{
struct vnode *vp;
int error, npages, nblocks, size;
long addr;
struct vattr va;
#if defined(NFSCLIENT)
extern const struct vops nfs_vops;
#endif /* defined(NFSCLIENT) */
dev_t dev;
/*
* we want to enable swapping on sdp. the swd_vp contains
* the vnode we want (locked and ref'd), and the swd_dev
* contains the dev_t of the file, if it is a block device.
*/
vp = sdp->swd_vp;
dev = sdp->swd_dev;
#if NVND > 0
/* no swapping to vnds. */
if (bdevsw[major(dev)].d_strategy == vndstrategy)
return (EOPNOTSUPP);
#endif
/*
* open the swap file (mostly useful for block device files to
* let device driver know what is up).
*
* we skip the open/close for root on swap because the root
* has already been opened when root was mounted (mountroot).
*/
if (vp != rootvp) {
if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
return (error);
}
/* XXX this only works for block devices */
/*
* we now need to determine the size of the swap area. for
* block specials we can call the d_psize function.
* for normal files, we must stat [get attrs].
*
* we put the result in nblks.
* for normal files, we also want the filesystem block size
* (which we get with statfs).
*/
switch (vp->v_type) {
case VBLK:
if (bdevsw[major(dev)].d_psize == 0 ||
(nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
error = ENXIO;
goto bad;
}
break;
case VREG:
if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
goto bad;
nblocks = (int)btodb(va.va_size);
if ((error =
VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
goto bad;
sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
/*
* limit the max # of outstanding I/O requests we issue
* at any one time. take it easy on NFS servers.
*/
#if defined(NFSCLIENT)
if (vp->v_op == &nfs_vops)
sdp->swd_maxactive = 2; /* XXX */
else
#endif /* defined(NFSCLIENT) */
sdp->swd_maxactive = 8; /* XXX */
bufq_init(&sdp->swd_bufq, BUFQ_FIFO);
break;
default:
error = ENXIO;
goto bad;
}
/*
* save nblocks in a safe place and convert to pages.
*/
sdp->swd_nblks = nblocks;
npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;
/*
* for block special files, we want to make sure that we leave
* the disklabel and bootblocks alone, so we arrange to skip
* over them (arbitrarily choosing to skip PAGE_SIZE bytes).
* note that because of this the "size" can be less than the
* actual number of blocks on the device.
*/
if (vp->v_type == VBLK) {
/* we use pages 1 to (size - 1) [inclusive] */
size = npages - 1;
addr = 1;
} else {
/* we use pages 0 to (size - 1) [inclusive] */
size = npages;
addr = 0;
}
/*
* make sure we have enough blocks for a reasonable sized swap
* area. we want at least one page.
*/
if (size < 1) {
error = EINVAL;
goto bad;
}
/*
* now we need to allocate a blist to manage this swap device
*/
sdp->swd_blist = blist_create(npages);
/* mark all except the `saved' region free. */
blist_free(sdp->swd_blist, addr, size);
#ifdef HIBERNATE
/*
* Lock down the last region of primary disk swap, in case
* hibernate needs to place a signature there.
*/
if (dev == swdevt[0].sw_dev && vp->v_type == VBLK && size > 3) {
if (blist_fill(sdp->swd_blist, npages - 1, 1) != 1)
panic("hibernate reserve");
}
#endif
/* add a ref to vp to reflect usage as a swap device. */
vref(vp);
#ifdef UVM_SWAP_ENCRYPT
if (uvm_doswapencrypt)
uvm_swap_initcrypt(sdp, npages);
#endif
/* now add the new swapdev to the drum and enable. */
swapdrum_add(sdp, npages);
sdp->swd_npages = size;
mtx_enter(&uvm_swap_data_lock);
sdp->swd_flags &= ~SWF_FAKE; /* going live */
sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
uvmexp.swpages += size;
mtx_leave(&uvm_swap_data_lock);
return (0);
/*
* failure: clean up and return error.
*/
bad:
if (vp != rootvp)
(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
return (error);
}
/*
* swap_off: stop swapping on swapdev
*
* => swap data should be locked, we will unlock.
*/
int
swap_off(struct proc *p, struct swapdev *sdp)
{
int npages = sdp->swd_npages;
int error = 0;
KASSERT(rw_write_held(&swap_syscall_lock));
MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
/* disable the swap area being removed */
sdp->swd_flags &= ~SWF_ENABLE;
mtx_leave(&uvm_swap_data_lock);
/*
* the idea is to find all the pages that are paged out to this
* device, and page them all in. in uvm, swap-backed pageable
* memory can take two forms: aobjs and anons. call the
* swapoff hook for each subsystem to bring in pages.
*/
if (uao_swap_off(sdp->swd_drumoffset,
sdp->swd_drumoffset + sdp->swd_drumsize) ||
amap_swap_off(sdp->swd_drumoffset,
sdp->swd_drumoffset + sdp->swd_drumsize)) {
error = ENOMEM;
} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
error = EBUSY;
}
if (error) {
mtx_enter(&uvm_swap_data_lock);
sdp->swd_flags |= SWF_ENABLE;
mtx_leave(&uvm_swap_data_lock);
return error;
}
/*
* done with the vnode and saved creds.
* drop our ref on the vnode before calling VOP_CLOSE()
* so that spec_close() can tell if this is the last close.
*/
if (sdp->swd_vp->v_type == VREG) {
crfree(sdp->swd_cred);
}
vrele(sdp->swd_vp);
if (sdp->swd_vp != rootvp) {
(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
}
mtx_enter(&uvm_swap_data_lock);
uvmexp.swpages -= npages;
if (swaplist_find(sdp->swd_vp, 1) == NULL)
panic("swap_off: swapdev not in list");
swaplist_trim();
mtx_leave(&uvm_swap_data_lock);
/*
* free all resources!
*/
extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
EX_WAITOK);
blist_destroy(sdp->swd_blist);
/* free sdp->swd_path ? */
free(sdp, M_VMSWAP, sizeof(*sdp));
return (0);
}
/*
* /dev/drum interface and i/o functions
*/
/*
* swstrategy: perform I/O on the drum
*
* => we must map the i/o request from the drum to the correct swapdev.
*/
void
swstrategy(struct buf *bp)
{
struct swapdev *sdp;
int s, pageno, bn;
/*
* convert block number to swapdev. note that swapdev can't
* be yanked out from under us because we are holding resources
* in it (i.e. the blocks we are doing I/O on).
*/
pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
mtx_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(pageno);
mtx_leave(&uvm_swap_data_lock);
if (sdp == NULL) {
bp->b_error = EINVAL;
bp->b_flags |= B_ERROR;
s = splbio();
biodone(bp);
splx(s);
return;
}
/* convert drum page number to block number on this swapdev. */
pageno -= sdp->swd_drumoffset; /* page # on swapdev */
bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
/*
* for block devices we finish up here.
* for regular files we have to do more work which we delegate
* to sw_reg_strategy().
*/
switch (sdp->swd_vp->v_type) {
default:
panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
case VBLK:
/*
* must convert "bp" from an I/O on /dev/drum to an I/O
* on the swapdev (sdp).
*/
s = splbio();
buf_replacevnode(bp, sdp->swd_vp);
bp->b_blkno = bn;
splx(s);
VOP_STRATEGY(bp->b_vp, bp);
return;
case VREG:
/* delegate to sw_reg_strategy function. */
sw_reg_strategy(sdp, bp, bn);
return;
}
/* NOTREACHED */
}
/*
* sw_reg_strategy: handle swap i/o to regular files
*/
void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
struct vnode *vp;
struct vndxfer *vnx;
daddr_t nbn;
caddr_t addr;
off_t byteoff;
int s, off, nra, error, sz, resid;
/*
* allocate a vndxfer head for this transfer and point it to
* our buffer.
*/
vnx = pool_get(&vndxfer_pool, PR_WAITOK);
vnx->vx_flags = VX_BUSY;
vnx->vx_error = 0;
vnx->vx_pending = 0;
vnx->vx_bp = bp;
vnx->vx_sdp = sdp;
/*
* setup for main loop where we read filesystem blocks into
* our buffer.
*/
error = 0;
bp->b_resid = bp->b_bcount; /* nothing transferred yet! */
addr = bp->b_data; /* current position in buffer */
byteoff = dbtob((u_int64_t)bn);
for (resid = bp->b_resid; resid; resid -= sz) {
struct vndbuf *nbp;
/*
* translate byteoffset into block number. return values:
* vp = vnode of underlying device
* nbn = new block number (on underlying vnode dev)
* nra = num blocks we can read-ahead (excludes requested
* block)
*/
nra = 0;
error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
&vp, &nbn, &nra);
if (error == 0 && nbn == -1) {
/*
* this used to just set error, but that doesn't
* do the right thing. Instead, it causes random
* memory errors. The panic() should remain until
* this condition doesn't destabilize the system.
*/
#if 1
panic("sw_reg_strategy: swap to sparse file");
#else
error = EIO; /* failure */
#endif
}
/*
* punt if there was an error or a hole in the file.
* we must wait for any i/o ops we have already started
* to finish before returning.
*
* XXX we could deal with holes here but it would be
* a hassle (in the write case).
*/
if (error) {
s = splbio();
vnx->vx_error = error; /* pass error up */
goto out;
}
/*
* compute the size ("sz") of this transfer (in bytes).
*/
off = byteoff % sdp->swd_bsize;
sz = (1 + nra) * sdp->swd_bsize - off;
if (sz > resid)
sz = resid;
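/*
 * Worked example with made-up numbers: if swd_bsize is 64 KB,
 * byteoff is 80 KB and VOP_BMAP() reported nra == 1, then
 * off = 80 KB % 64 KB = 16 KB and sz = 2 * 64 KB - 16 KB = 112 KB,
 * clamped to "resid" if less than that remains in the request.
 */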
/*
* now get a buf structure. note that the vb_buf is
* at the front of the nbp structure so that you can
* cast pointers between the two structures easily.
*/
nbp = pool_get(&vndbuf_pool, PR_WAITOK);
nbp->vb_buf.b_flags = bp->b_flags | B_CALL;
nbp->vb_buf.b_bcount = sz;
nbp->vb_buf.b_bufsize = sz;
nbp->vb_buf.b_error = 0;
nbp->vb_buf.b_data = addr;
nbp->vb_buf.b_bq = NULL;
nbp->vb_buf.b_blkno = nbn + btodb(off);
nbp->vb_buf.b_proc = bp->b_proc;
nbp->vb_buf.b_iodone = sw_reg_iodone;
nbp->vb_buf.b_vp = NULLVP;
nbp->vb_buf.b_vnbufs.le_next = NOLIST;
LIST_INIT(&nbp->vb_buf.b_dep);
/*
* set b_dirtyoff/end and b_validoff/end. this is
* required by the NFS client code (otherwise it will
* just discard our I/O request).
*/
if (bp->b_dirtyend == 0) {
nbp->vb_buf.b_dirtyoff = 0;
nbp->vb_buf.b_dirtyend = sz;
} else {
nbp->vb_buf.b_dirtyoff =
max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
nbp->vb_buf.b_dirtyend =
min(sz,
max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
}
if (bp->b_validend == 0) {
nbp->vb_buf.b_validoff = 0;
nbp->vb_buf.b_validend = sz;
} else {
nbp->vb_buf.b_validoff =
max(0, bp->b_validoff - (bp->b_bcount-resid));
nbp->vb_buf.b_validend =
min(sz,
max(0, bp->b_validend - (bp->b_bcount-resid)));
}
/* patch it back to the vnx */
nbp->vb_vnx = vnx;
task_set(&nbp->vb_task, sw_reg_iodone_internal, nbp);
s = splbio();
if (vnx->vx_error != 0) {
pool_put(&vndbuf_pool, nbp);
goto out;
}
vnx->vx_pending++;
/* assoc new buffer with underlying vnode */
bgetvp(vp, &nbp->vb_buf);
/* start I/O if we are not over our limit */
bufq_queue(&sdp->swd_bufq, &nbp->vb_buf);
sw_reg_start(sdp);
splx(s);
/*
* advance to the next I/O
*/
byteoff += sz;
addr += sz;
}
s = splbio();
out: /* Arrive here at splbio */
vnx->vx_flags &= ~VX_BUSY;
if (vnx->vx_pending == 0) {
if (vnx->vx_error != 0) {
bp->b_error = vnx->vx_error;
bp->b_flags |= B_ERROR;
}
pool_put(&vndxfer_pool, vnx);
biodone(bp);
}
splx(s);
}
/* sw_reg_start: start an I/O request on the requested swapdev. */
void
sw_reg_start(struct swapdev *sdp)
{
struct buf *bp;
/* XXX: recursion control */
if ((sdp->swd_flags & SWF_BUSY) != 0)
return;
sdp->swd_flags |= SWF_BUSY;
while (sdp->swd_active < sdp->swd_maxactive) {
bp = bufq_dequeue(&sdp->swd_bufq);
if (bp == NULL)
break;
sdp->swd_active++;
if ((bp->b_flags & B_READ) == 0)
bp->b_vp->v_numoutput++;
VOP_STRATEGY(bp->b_vp, bp);
}
sdp->swd_flags &= ~SWF_BUSY;
}
/*
* sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
*
* => note that we can recover the vndbuf struct by casting the buf ptr
*
* XXX:
* We only put this onto a taskq here, because of the maxactive game since
* it basically requires us to call back into VOP_STRATEGY() (where we must
* be able to sleep) via sw_reg_start().
*/
void
sw_reg_iodone(struct buf *bp)
{
struct vndbuf *vbp = (struct vndbuf *)bp;
task_add(systq, &vbp->vb_task);
}
void
sw_reg_iodone_internal(void *xvbp)
{
struct vndbuf *vbp = xvbp;
struct vndxfer *vnx = vbp->vb_vnx;
struct buf *pbp = vnx->vx_bp; /* parent buffer */
struct swapdev *sdp = vnx->vx_sdp;
int resid, s;
s = splbio();
resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
pbp->b_resid -= resid;
vnx->vx_pending--;
/* pass error upward */
if (vbp->vb_buf.b_error)
vnx->vx_error = vbp->vb_buf.b_error;
/* disassociate this buffer from the vnode (if any). */
if (vbp->vb_buf.b_vp != NULL) {
brelvp(&vbp->vb_buf);
}
/* kill vbp structure */
pool_put(&vndbuf_pool, vbp);
/*
* wrap up this transaction if it has run to completion or, in
* case of an error, when all auxiliary buffers have returned.
*/
if (vnx->vx_error != 0) {
/* pass error upward */
pbp->b_flags |= B_ERROR;
pbp->b_error = vnx->vx_error;
if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
pool_put(&vndxfer_pool, vnx);
biodone(pbp);
}
} else if (pbp->b_resid == 0) {
KASSERT(vnx->vx_pending == 0);
if ((vnx->vx_flags & VX_BUSY) == 0) {
pool_put(&vndxfer_pool, vnx);
biodone(pbp);
}
}
/*
* done! start next swapdev I/O if one is pending
*/
sdp->swd_active--;
sw_reg_start(sdp);
splx(s);
}
/*
* uvm_swap_alloc: allocate space on swap
*
* => allocation is done "round robin" down the priority list, as we
* allocate in a priority we "rotate" the tail queue.
* => space can be freed with uvm_swap_free
* => we return the page slot number in /dev/drum (0 == invalid slot)
* => we lock uvm_swap_data_lock
* => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
*/
int
uvm_swap_alloc(int *nslots, boolean_t lessok)
{
struct swapdev *sdp;
struct swappri *spp;
/*
* no swap devices configured yet? definite failure.
*/
if (uvmexp.nswapdev < 1)
return 0;
/*
* lock data lock, convert slots into blocks, and enter loop
*/
KERNEL_ASSERT_LOCKED();
mtx_enter(&uvm_swap_data_lock);
ReTry: /* XXXMRG */
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
swblk_t result;
/* if it's not enabled, then we can't swap from it */
if ((sdp->swd_flags & SWF_ENABLE) == 0)
continue;
if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
continue;
result = blist_alloc(sdp->swd_blist, *nslots);
if (result == SWAPBLK_NONE) {
continue;
}
KASSERT(result < sdp->swd_drumsize);
/*
* successful allocation! now rotate the tailq.
*/
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
sdp->swd_npginuse += *nslots;
uvmexp.swpginuse += *nslots;
mtx_leave(&uvm_swap_data_lock);
/* done! return drum slot number */
return result + sdp->swd_drumoffset;
}
}
/* XXXMRG: BEGIN HACK */
if (*nslots > 1 && lessok) {
*nslots = 1;
/* XXXMRG: ugh! blist should support this for us */
goto ReTry;
}
/* XXXMRG: END HACK */
mtx_leave(&uvm_swap_data_lock);
return 0; /* failed */
}
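/*
 * Usage sketch for uvm_swap_alloc() (illustration only, not a verbatim
 * caller from this file):
 *
 *	int nslots = npages;
 *	int startslot = uvm_swap_alloc(&nslots, TRUE);
 *
 *	if (startslot == 0)
 *		... no swap space could be allocated ...
 *	else
 *		... do i/o on "nslots" pages starting at drum slot
 *		    "startslot", and later release them with
 *		    uvm_swap_free(startslot, nslots) ...
 *
 * note that with lessok == TRUE the allocator may shrink *nslots to 1.
 */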
/*
* uvm_swapisfull: return true if all of available swap is allocated
* and in use.
*/
int
uvm_swapisfull(void)
{
int result;
mtx_enter(&uvm_swap_data_lock);
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
result = (uvmexp.swpgonly == uvmexp.swpages);
mtx_leave(&uvm_swap_data_lock);
return result;
}
/*
* uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
*
* => we lock uvm_swap_data_lock
*/
void
uvm_swap_markbad(int startslot, int nslots)
{
struct swapdev *sdp;
mtx_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
if (sdp != NULL) {
/*
* we just keep track of how many pages have been marked bad
* in this device, to make everything add up in swap_off().
* we assume here that the range of slots will all be within
* one swap device.
*/
sdp->swd_npgbad += nslots;
}
mtx_leave(&uvm_swap_data_lock);
}
/*
* uvm_swap_free: free swap slots
*
* => this can be all or part of an allocation made by uvm_swap_alloc
* => we lock uvm_swap_data_lock
*/
void
uvm_swap_free(int startslot, int nslots)
{
struct swapdev *sdp;
/*
* ignore attempts to free the "bad" slot.
*/
if (startslot == SWSLOT_BAD) {
return;
}
/*
* convert drum slot offset back to sdp, free the blocks
* in the extent, and return. must hold pri lock to do
* lookup and access the extent.
*/
KERNEL_LOCK();
mtx_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
KASSERT(uvmexp.nswapdev >= 1);
KASSERT(sdp != NULL);
KASSERT(sdp->swd_npginuse >= nslots);
blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
sdp->swd_npginuse -= nslots;
uvmexp.swpginuse -= nslots;
mtx_leave(&uvm_swap_data_lock);
#ifdef UVM_SWAP_ENCRYPT
{
int i;
if (swap_encrypt_initialized) {
/* Dereference keys */
for (i = 0; i < nslots; i++)
if (uvm_swap_needdecrypt(sdp, startslot + i)) {
struct swap_key *key;
key = SWD_KEY(sdp, startslot + i);
if (key->refcount != 0)
SWAP_KEY_PUT(sdp, key);
}
/* Mark range as not decrypt */
uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
}
}
#endif /* UVM_SWAP_ENCRYPT */
KERNEL_UNLOCK();
}
/*
* uvm_swap_put: put any number of pages into a contig place on swap
*
* => can be sync or async
*/
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
int result;
result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
return (result);
}
/*
* uvm_swap_get: get a single page from swap
*
* => usually a sync op (from fault)
*/
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
int result;
atomic_inc_int(&uvmexp.nswget);
KASSERT(flags & PGO_SYNCIO);
if (swslot == SWSLOT_BAD) {
return VM_PAGER_ERROR;
}
KERNEL_LOCK();
result = uvm_swap_io(&page, swslot, 1, B_READ);
KERNEL_UNLOCK();
if (result == VM_PAGER_OK || result == VM_PAGER_PEND) {
/*
* this page is no longer only in swap.
*/
atomic_dec_int(&uvmexp.swpgonly);
}
return (result);
}
/*
* uvm_swap_io: do an i/o operation to swap
*/
int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
daddr_t startblk;
struct buf *bp;
vaddr_t kva;
int result, s, mapinflags, pflag, bounce = 0, i;
boolean_t write, async;
vaddr_t bouncekva;
struct vm_page *tpps[SWCLUSTPAGES];
int pdaemon = (curproc == uvm.pagedaemon_proc);
#ifdef UVM_SWAP_ENCRYPT
struct swapdev *sdp;
int encrypt = 0;
#endif
KERNEL_ASSERT_LOCKED();
write = (flags & B_READ) == 0;
async = (flags & B_ASYNC) != 0;
/* convert starting drum slot to block number */
startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
pflag = (async || pdaemon) ? PR_NOWAIT : PR_WAITOK;
bp = pool_get(&bufpool, pflag | PR_ZERO);
if (bp == NULL)
return (VM_PAGER_AGAIN);
/*
* map the pages into the kernel (XXX: currently required
* by buffer system).
*/
mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
if (!async)
mapinflags |= UVMPAGER_MAPIN_WAITOK;
kva = uvm_pagermapin(pps, npages, mapinflags);
if (kva == 0) {
pool_put(&bufpool, bp);
return (VM_PAGER_AGAIN);
}
#ifdef UVM_SWAP_ENCRYPT
if (write) {
/*
* Check if we need to do swap encryption on old pages.
* Later we need a different scheme, that swap encrypts
* all pages of a process that had at least one page swap
* encrypted. Then we might not need to copy all pages
* in the cluster, and avoid the memory overhead in
* swapping.
*/
if (uvm_doswapencrypt)
encrypt = 1;
}
if (swap_encrypt_initialized || encrypt) {
/*
* we need to know the swap device that we are swapping to/from
* to see if the pages need to be marked for decryption or
* actually need to be decrypted.
* XXX - does this information stay the same over the whole
* execution of this function?
*/
mtx_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
mtx_leave(&uvm_swap_data_lock);
}
/*
* Check that we are dma capable for read (write always bounces
* through the swapencrypt anyway).
*/
if (write && encrypt) {
bounce = 1; /* bounce through swapencrypt always */
} else {
#else
{
#endif
for (i = 0; i < npages; i++) {
if (VM_PAGE_TO_PHYS(pps[i]) < dma_constraint.ucr_low ||
VM_PAGE_TO_PHYS(pps[i]) > dma_constraint.ucr_high) {
bounce = 1;
break;
}
}
}
if (bounce) {
int swmapflags, plaflags;
/* We always need write access. */
swmapflags = UVMPAGER_MAPIN_READ;
plaflags = UVM_PLA_NOWAIT;
if (!async) {
swmapflags |= UVMPAGER_MAPIN_WAITOK;
plaflags = UVM_PLA_WAITOK;
}
if (uvm_swap_allocpages(tpps, npages, plaflags)) {
pool_put(&bufpool, bp);
uvm_pagermapout(kva, npages);
return (VM_PAGER_AGAIN);
}
bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
if (bouncekva == 0) {
pool_put(&bufpool, bp);
uvm_pagermapout(kva, npages);
uvm_swap_freepages(tpps, npages);
return (VM_PAGER_AGAIN);
}
}
/* encrypt to swap */
if (write && bounce) {
int i, opages;
caddr_t src, dst;
u_int64_t block;
src = (caddr_t) kva;
dst = (caddr_t) bouncekva;
block = startblk;
for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
struct swap_key *key;
if (encrypt) {
key = SWD_KEY(sdp, startslot + i);
SWAP_KEY_GET(sdp, key); /* add reference */
swap_encrypt(key, src, dst, block, PAGE_SIZE);
block += btodb(PAGE_SIZE);
} else {
#else
{
#endif /* UVM_SWAP_ENCRYPT */
memcpy(dst, src, PAGE_SIZE);
}
/* this just tells async callbacks to free */
atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT);
src += PAGE_SIZE;
dst += PAGE_SIZE;
}
uvm_pagermapout(kva, npages);
/* dispose of pages we dont use anymore */
opages = npages;
uvm_pager_dropcluster(NULL, NULL, pps, &opages,
PGO_PDFREECLUST);
kva = bouncekva;
}
/*
* prevent ASYNC reads.
* uvm_swap_io is only called from uvm_swap_get, which
* assumes that all gets are SYNCIO. Just make sure here.
* XXXARTUBC - might not be true anymore.
*/
if (!write) {
flags &= ~B_ASYNC;
async = 0;
}
/*
* fill in the bp. we currently route our i/o through
* /dev/drum's vnode [swapdev_vp].
*/
bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC));
bp->b_proc = &proc0; /* XXX */
bp->b_vnbufs.le_next = NOLIST;
if (bounce)
bp->b_data = (caddr_t)bouncekva;
else
bp->b_data = (caddr_t)kva;
bp->b_bq = NULL;
bp->b_blkno = startblk;
LIST_INIT(&bp->b_dep);
s = splbio();
bp->b_vp = NULL;
buf_replacevnode(bp, swapdev_vp);
splx(s);
bp->b_bufsize = bp->b_bcount = (long)npages << PAGE_SHIFT;
/*
* for pageouts we must set "dirtyoff" [NFS client code needs it].
* and we bump v_numoutput (counter of number of active outputs).
*/
if (write) {
bp->b_dirtyoff = 0;
bp->b_dirtyend = npages << PAGE_SHIFT;
#ifdef UVM_SWAP_ENCRYPT
/* mark the pages in the drum for decryption */
if (swap_encrypt_initialized)
uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
#endif
s = splbio();
swapdev_vp->v_numoutput++;
splx(s);
}
/* for async ops we must set up the iodone handler. */
if (async) {
bp->b_flags |= B_CALL | (pdaemon ? B_PDAEMON : 0);
bp->b_iodone = uvm_aio_biodone;
}
/* now we start the I/O, and if async, return. */
VOP_STRATEGY(bp->b_vp, bp);
if (async)
return (VM_PAGER_PEND);
/* must be sync i/o. wait for it to finish */
(void) biowait(bp);
result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
/* decrypt swap */
if (!write && !(bp->b_flags & B_ERROR)) {
int i;
caddr_t data = (caddr_t)kva;
caddr_t dst = (caddr_t)kva;
u_int64_t block = startblk;
if (bounce)
data = (caddr_t)bouncekva;
for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
struct swap_key *key;
/* Check if we need to decrypt */
if (swap_encrypt_initialized &&
uvm_swap_needdecrypt(sdp, startslot + i)) {
key = SWD_KEY(sdp, startslot + i);
if (key->refcount == 0) {
result = VM_PAGER_ERROR;
break;
}
swap_decrypt(key, data, dst, block, PAGE_SIZE);
} else if (bounce) {
#else
if (bounce) {
#endif
memcpy(dst, data, PAGE_SIZE);
}
data += PAGE_SIZE;
dst += PAGE_SIZE;
block += btodb(PAGE_SIZE);
}
if (bounce)
uvm_pagermapout(bouncekva, npages);
}
/* kill the pager mapping */
uvm_pagermapout(kva, npages);
/* No longer needed, free after encryption/bouncing */
if (!write && bounce)
uvm_swap_freepages(tpps, npages);
/* now dispose of the buf */
s = splbio();
if (bp->b_vp)
brelvp(bp);
if (write && bp->b_vp)
vwakeup(bp->b_vp);
pool_put(&bufpool, bp);
splx(s);
/* finally return. */
return (result);
}
void
swapmount(void)
{
struct swapdev *sdp;
struct swappri *spp;
struct vnode *vp;
dev_t swap_dev = swdevt[0].sw_dev;
char *nam;
char path[MNAMELEN + 1];
if (swap_dev == NODEV)
return;
rw_enter_write(&swap_syscall_lock);
#if defined(NFSCLIENT)
if (swap_dev == NETDEV) {
extern struct nfs_diskless nfs_diskless;
snprintf(path, sizeof(path), "%s",
nfs_diskless.nd_swap.ndm_host);
vp = nfs_diskless.sw_vp;
goto gotit;
} else
#endif
if (bdevvp(swap_dev, &vp))
return;
/* Construct a potential path to swap */
if ((nam = findblkname(major(swap_dev))))
snprintf(path, sizeof(path), "/dev/%s%d%c", nam,
DISKUNIT(swap_dev), 'a' + DISKPART(swap_dev));
else
snprintf(path, sizeof(path), "blkdev0x%x",
swap_dev);
#if defined(NFSCLIENT)
gotit:
#endif
sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK|M_ZERO);
spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);
sdp->swd_flags = SWF_FAKE;
sdp->swd_dev = swap_dev;
sdp->swd_pathlen = strlen(path) + 1;
sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK | M_ZERO);
strlcpy(sdp->swd_path, path, sdp->swd_pathlen);
sdp->swd_vp = vp;
mtx_enter(&uvm_swap_data_lock);
swaplist_insert(sdp, spp, 0);
mtx_leave(&uvm_swap_data_lock);
if (swap_on(curproc, sdp)) {
mtx_enter(&uvm_swap_data_lock);
swaplist_find(vp, 1);
swaplist_trim();
vput(sdp->swd_vp);
mtx_leave(&uvm_swap_data_lock);
rw_exit_write(&swap_syscall_lock);
free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
free(sdp, M_VMSWAP, sizeof(*sdp));
return;
}
rw_exit_write(&swap_syscall_lock);
}
#ifdef HIBERNATE
int
uvm_hibswap(dev_t dev, u_long *sp, u_long *ep)
{
struct swapdev *sdp, *swd = NULL;
struct swappri *spp;
/* no swap devices configured yet? */
if (uvmexp.nswapdev < 1 || dev != swdevt[0].sw_dev)
return (1);
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_dev == dev)
swd = sdp;
}
}
if (swd == NULL || (swd->swd_flags & SWF_ENABLE) == 0)
return (1);
blist_gapfind(swd->swd_blist, sp, ep);
if (*ep - *sp == 0)
/* no gap found */
return (1);
/*
* blist_gapfind() returns the gap as the half-open interval [sp, ep),
* whereas the closed interval [sp, ep] is expected from uvm_hibswap().
*/
*ep -= 1;
return (0);
}
#endif /* HIBERNATE */
#ifdef DDB
void
swap_print_all(int (*pr)(const char *, ...))
{
struct swappri *spp;
struct swapdev *sdp;
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
#ifdef HIBERNATE
u_long bgap = 0, egap = 0;
#endif
pr("swap %p path \"%s\" flags 0x%x\n", sdp,
sdp->swd_path, sdp->swd_flags);
blist_print(sdp->swd_blist);
#ifdef HIBERNATE
if (!uvm_hibswap(sdp->swd_dev, &bgap, &egap))
pr("hibernate gap: [0x%lx, 0x%lx] size=%lu\n",
bgap, egap, (egap - bgap + 1));
else
pr("hibernate gap: not found\n");
#endif
}
}
}
#endif /* DDB */
/* $OpenBSD: ipi.c,v 1.17 2020/01/21 02:01:50 mlarkin Exp $ */
/* $NetBSD: ipi.c,v 1.2 2003/03/01 13:05:37 fvdl Exp $ */
/*-
* Copyright (c) 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by RedBack Networks Inc.
*
* Author: Bill Sommerfeld
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/device.h>
#include <sys/systm.h>
#include <machine/intr.h>
#include <machine/atomic.h>
#include <machine/cpuvar.h>
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
void
x86_send_ipi(struct cpu_info *ci, int ipimask)
{
x86_atomic_setbits_u32(&ci->ci_ipis, ipimask);
/* Don't send IPI to cpu which isn't (yet) running. */
if (!(ci->ci_flags & CPUF_RUNNING))
return;
x86_ipi(LAPIC_IPI_VECTOR, ci->ci_apicid, LAPIC_DLMODE_FIXED);
}
int
x86_fast_ipi(struct cpu_info *ci, int ipi)
{
if (!(ci->ci_flags & CPUF_RUNNING))
return (ENOENT);
x86_ipi(ipi, ci->ci_apicid, LAPIC_DLMODE_FIXED);
return 0;
}
void
x86_broadcast_ipi(int ipimask)
{
struct cpu_info *ci, *self = curcpu();
int count = 0;
CPU_INFO_ITERATOR cii;
CPU_INFO_FOREACH(cii, ci) {
if (ci == self)
continue;
if ((ci->ci_flags & CPUF_RUNNING) == 0)
continue;
x86_atomic_setbits_u32(&ci->ci_ipis, ipimask);
count++;
}
if (!count)
return;
x86_ipi(LAPIC_IPI_VECTOR, LAPIC_DEST_ALLEXCL, LAPIC_DLMODE_FIXED);
}
void
x86_ipi_handler(void)
{
extern struct evcount ipi_count;
struct cpu_info *ci = curcpu();
u_int32_t pending;
int bit;
int floor;
floor = ci->ci_handled_intr_level;
ci->ci_handled_intr_level = ci->ci_ilevel;
pending = atomic_swap_uint(&ci->ci_ipis, 0);
for (bit = 0; bit < X86_NIPI && pending; bit++) {
if (pending & (1 << bit)) {
pending &= ~(1 << bit);
(*ipifunc[bit])(ci);
ipi_count.ec_count++;
}
}
ci->ci_handled_intr_level = floor;
}
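/*
 * Example of the dispatch above: if the atomically swapped-out
 * "pending" mask had bits 0 and 2 set, the loop calls ipifunc[0] and
 * ipifunc[2] exactly once each and increments ipi_count twice, no
 * matter how many CPUs set those bits before the swap.
 */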
/* $OpenBSD: ip_output.c,v 1.382 2022/08/12 17:04:16 bluhm Exp $ */
/* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include "pf.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_enc.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp_var.h>
#if NPF > 0
#include <net/pfvar.h>
#endif
#ifdef IPSEC
#ifdef ENCDEBUG
#define DPRINTF(fmt, args...) \
do { \
if (encdebug) \
printf("%s: " fmt "\n", __func__, ## args); \
} while (0)
#else
#define DPRINTF(fmt, args...) \
do { } while (0)
#endif
#endif /* IPSEC */
int ip_pcbopts(struct mbuf **, struct mbuf *);
int ip_multicast_if(struct ip_mreqn *, u_int, unsigned int *);
int ip_setmoptions(int, struct ip_moptions **, struct mbuf *, u_int);
void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *);
static __inline u_int16_t __attribute__((__unused__))
in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t);
void in_delayed_cksum(struct mbuf *);
int in_ifcap_cksum(struct mbuf *, struct ifnet *, int);
int ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp,
struct tdb **, int ipsecflowinfo);
void ip_output_ipsec_pmtu_update(struct tdb *, struct route *, struct in_addr,
int, int);
int ip_output_ipsec_send(struct tdb *, struct mbuf *, struct route *, int);
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
*/
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp, u_int32_t ipsecflowinfo)
{
struct ip *ip;
struct ifnet *ifp = NULL;
struct mbuf_list fml;
int hlen = sizeof (struct ip);
int error = 0;
struct route iproute;
struct sockaddr_in *dst;
struct tdb *tdb = NULL;
u_long mtu;
#if NPF > 0
u_int orig_rtableid;
#endif
NET_ASSERT_LOCKED();
#ifdef IPSEC
if (inp && (inp->inp_flags & INP_IPV6) != 0)
panic("ip_output: IPv6 pcb is passed");
#endif /* IPSEC */
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("ip_output no HDR");
#endif
if (opt)
m = ip_insertoptions(m, opt, &hlen);
ip = mtod(m, struct ip *);
/*
* Fill in IP header.
*/
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_off &= htons(IP_DF);
ip->ip_id = htons(ip_randomid());
ip->ip_hl = hlen >> 2;
ipstat_inc(ips_localout);
} else {
hlen = ip->ip_hl << 2;
}
/*
* Both Stevens and RFCs 5735 section 3 and 1122 sections 3.2.1.3
* and 3.3.6 say that we should not send traffic to the 0/8 network.
*/
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == 0) {
error = ENETUNREACH;
goto bad;
}
#if NPF > 0
orig_rtableid = m->m_pkthdr.ph_rtableid;
reroute:
#endif
/*
* Do a route lookup now in case we need the source address to
* do an SPD lookup in IPsec; for most packets, the source address
* is set at a higher level protocol. ICMPs and other packets
* though (e.g., traceroute) have a source address of zeroes.
*/
if (ro == NULL) {
ro = &iproute;
memset(ro, 0, sizeof(*ro));
}
dst = satosin(&ro->ro_dst);
/*
* If there is a cached route, check that it is to the same
* destination and is still up. If not, free it and try again.
*/
if (!rtisvalid(ro->ro_rt) || dst->sin_addr.s_addr != ip->ip_dst.s_addr ||
ro->ro_tableid != m->m_pkthdr.ph_rtableid) {
rtfree(ro->ro_rt);
ro->ro_rt = NULL;
}
if (ro->ro_rt == NULL) {
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = ip->ip_dst;
ro->ro_tableid = m->m_pkthdr.ph_rtableid;
}
if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
(ip->ip_dst.s_addr == INADDR_BROADCAST)) &&
imo != NULL && (ifp = if_get(imo->imo_ifidx)) != NULL) {
mtu = ifp->if_mtu;
if (ip->ip_src.s_addr == INADDR_ANY) {
struct in_ifaddr *ia;
IFP_TO_IA(ifp, ia);
if (ia != NULL)
ip->ip_src = ia->ia_addr.sin_addr;
}
} else {
struct in_ifaddr *ia;
if (ro->ro_rt == NULL)
ro->ro_rt = rtalloc_mpath(&ro->ro_dst,
&ip->ip_src.s_addr, ro->ro_tableid);
if (ro->ro_rt == NULL) {
ipstat_inc(ips_noroute);
error = EHOSTUNREACH;
goto bad;
}
ia = ifatoia(ro->ro_rt->rt_ifa);
if (ISSET(ro->ro_rt->rt_flags, RTF_LOCAL))
ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid));
else
ifp = if_get(ro->ro_rt->rt_ifidx);
/*
* We aren't using rtisvalid() here because the UP/DOWN state
* machine is broken with some Ethernet drivers like em(4).
* As a result we might try to use an invalid cached route
* entry while an interface is being detached.
*/
if (ifp == NULL) {
ipstat_inc(ips_noroute);
error = EHOSTUNREACH;
goto bad;
}
if ((mtu = ro->ro_rt->rt_mtu) == 0)
mtu = ifp->if_mtu;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = satosin(ro->ro_rt->rt_gateway);
/* Set the source IP address */
if (ip->ip_src.s_addr == INADDR_ANY && ia)
ip->ip_src = ia->ia_addr.sin_addr;
}
#ifdef IPSEC
if (ipsec_in_use || inp != NULL) {
/* Do we have any pending SAs to apply ? */
error = ip_output_ipsec_lookup(m, hlen, inp, &tdb,
ipsecflowinfo);
if (error) {
/* Should silently drop packet */
if (error == -EINVAL)
error = 0;
goto bad;
}
if (tdb != NULL) {
/*
* If it needs TCP/UDP hardware-checksumming, do the
* computation now.
*/
in_proto_cksum_out(m, NULL);
}
}
#endif /* IPSEC */
if (IN_MULTICAST(ip->ip_dst.s_addr) ||
(ip->ip_dst.s_addr == INADDR_BROADCAST)) {
m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
M_BCAST : M_MCAST;
/*
* IP destination address is multicast. Make sure "dst"
* still points to the address in "ro". (It may have been
* changed to point to a gateway address, above.)
*/
dst = satosin(&ro->ro_dst);
/*
* See if the caller provided any multicast options
*/
if (imo != NULL)
ip->ip_ttl = imo->imo_ttl;
else
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
/*
* if we don't know the outgoing ifp yet, we can't generate
* output
*/
if (!ifp) {
ipstat_inc(ips_noroute);
error = EHOSTUNREACH;
goto bad;
}
/*
* Confirm that the outgoing interface supports multicast,
* but only if the packet actually is going out on that
* interface (i.e., no IPsec is applied).
*/
if ((((m->m_flags & M_MCAST) &&
(ifp->if_flags & IFF_MULTICAST) == 0) ||
((m->m_flags & M_BCAST) &&
(ifp->if_flags & IFF_BROADCAST) == 0)) && (tdb == NULL)) {
ipstat_inc(ips_noroute);
error = ENETUNREACH;
goto bad;
}
/*
* If source address not specified yet, use address
* of outgoing interface.
*/
if (ip->ip_src.s_addr == INADDR_ANY) {
struct in_ifaddr *ia;
IFP_TO_IA(ifp, ia);
if (ia != NULL)
ip->ip_src = ia->ia_addr.sin_addr;
}
if ((imo == NULL || imo->imo_loop) &&
in_hasmulti(&ip->ip_dst, ifp)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
* Can't defer TCP/UDP checksumming, do the
* computation now.
*/
in_proto_cksum_out(m, NULL);
ip_mloopback(ifp, m, dst);
}
#ifdef MROUTING
else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IP_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip_mloopback(),
* above, will be forwarded by the ip_input() routine,
* if necessary.
*/
if (ipmforwarding && ip_mrouter[ifp->if_rdomain] &&
(flags & IP_FORWARDING) == 0) {
int rv;
KERNEL_LOCK();
rv = ip_mforward(m, ifp);
KERNEL_UNLOCK();
if (rv != 0)
goto bad;
}
}
#endif
/*
* Multicasts with a time-to-live of zero may be looped-
* back, above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0)
goto bad;
goto sendit;
}
/*
* Look for broadcast address and verify user is allowed to send
* such a packet; if the packet is going in an IPsec tunnel, skip
* this check.
*/
if ((tdb == NULL) && ((dst->sin_addr.s_addr == INADDR_BROADCAST) ||
(ro && ro->ro_rt && ISSET(ro->ro_rt->rt_flags, RTF_BROADCAST)))) {
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EADDRNOTAVAIL;
goto bad;
}
if ((flags & IP_ALLOWBROADCAST) == 0) {
error = EACCES;
goto bad;
}
/* Don't allow broadcast messages to be fragmented */
if (ntohs(ip->ip_len) > ifp->if_mtu) {
error = EMSGSIZE;
goto bad;
}
m->m_flags |= M_BCAST;
} else
m->m_flags &= ~M_BCAST;
sendit:
/*
* If we're doing Path MTU discovery, we need to set DF unless
* the route's MTU is locked.
*/
if ((flags & IP_MTUDISC) && ro && ro->ro_rt &&
(ro->ro_rt->rt_locks & RTV_MTU) == 0)
ip->ip_off |= htons(IP_DF);
#ifdef IPSEC
/*
* Check if the packet needs encapsulation.
*/
if (tdb != NULL) {
/* Callee frees mbuf */
error = ip_output_ipsec_send(tdb, m, ro,
(flags & IP_FORWARDING) ? 1 : 0);
goto done;
}
#endif /* IPSEC */
/*
* Packet filter
*/
#if NPF > 0
if (pf_test(AF_INET, (flags & IP_FORWARDING) ? PF_FWD : PF_OUT,
ifp, &m) != PF_PASS) {
error = EACCES;
goto bad;
}
if (m == NULL)
goto done;
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) ==
(PF_TAG_REROUTE | PF_TAG_GENERATED))
/* already rerun the route lookup, go on */
m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE);
else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) {
/* tag as generated to skip over pf_test on rerun */
m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
ro = NULL;
if_put(ifp); /* drop reference since target changed */
ifp = NULL;
goto reroute;
}
#endif
in_proto_cksum_out(m, ifp);
#ifdef IPSEC
if (ipsec_in_use && (flags & IP_FORWARDING) && (ipforwarding == 2) &&
(m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) == NULL)) {
error = EHOSTUNREACH;
goto bad;
}
#endif
/*
* If small enough for interface, can just send directly.
*/
if (ntohs(ip->ip_len) <= mtu) {
ip->ip_sum = 0;
if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4))
m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
else {
ipstat_inc(ips_outswcsum);
ip->ip_sum = in_cksum(m, hlen);
}
error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt);
goto done;
}
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
if (ip->ip_off & htons(IP_DF)) {
#ifdef IPSEC
if (ip_mtudisc)
ipsec_adjust_mtu(m, ifp->if_mtu);
#endif
error = EMSGSIZE;
#if NPF > 0
/* pf changed routing table, use orig rtable for path MTU */
if (ro->ro_tableid != orig_rtableid) {
rtfree(ro->ro_rt);
ro->ro_tableid = orig_rtableid;
ro->ro_rt = icmp_mtudisc_clone(
satosin(&ro->ro_dst)->sin_addr, ro->ro_tableid, 0);
}
#endif
/*
* This case can happen if the user changed the MTU
* of an interface after enabling IP on it. Because
* most netifs don't keep track of routes pointing to
* them, there is no way for one to update all its
* routes when the MTU is changed.
*/
if (rtisvalid(ro->ro_rt) && ISSET(ro->ro_rt->rt_flags, RTF_HOST) &&
!(ro->ro_rt->rt_locks & RTV_MTU) &&
(ro->ro_rt->rt_mtu > ifp->if_mtu)) {
ro->ro_rt->rt_mtu = ifp->if_mtu;
}
ipstat_inc(ips_cantfrag);
goto bad;
}
error = ip_fragment(m, &fml, ifp, mtu);
if (error)
goto done;
while ((m = ml_dequeue(&fml)) != NULL) {
error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt);
if (error)
break;
}
if (error)
ml_purge(&fml);
else
ipstat_inc(ips_fragmented);
done:
if (ro == &iproute && ro->ro_rt)
rtfree(ro->ro_rt);
if_put(ifp);
#ifdef IPSEC
tdb_unref(tdb);
#endif /* IPSEC */
return (error);
bad:
m_freem(m);
goto done;
}
#ifdef IPSEC
int
ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp,
struct tdb **tdbout, int ipsecflowinfo)
{
struct m_tag *mtag;
struct tdb_ident *tdbi;
struct tdb *tdb;
struct ipsec_ids *ids = NULL;
int error;
/* Do we have any pending SAs to apply ? */
if (ipsecflowinfo)
ids = ipsp_ids_lookup(ipsecflowinfo);
error = ipsp_spd_lookup(m, AF_INET, hlen, IPSP_DIRECTION_OUT,
NULL, inp, &tdb, ids);
ipsp_ids_free(ids);
if (error || tdb == NULL) {
*tdbout = NULL;
return error;
}
/* Loop detection */
for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) {
if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE)
continue;
tdbi = (struct tdb_ident *)(mtag + 1);
if (tdbi->spi == tdb->tdb_spi &&
tdbi->proto == tdb->tdb_sproto &&
tdbi->rdomain == tdb->tdb_rdomain &&
!memcmp(&tdbi->dst, &tdb->tdb_dst,
sizeof(union sockaddr_union))) {
/* no IPsec needed */
tdb_unref(tdb);
*tdbout = NULL;
return 0;
}
}
*tdbout = tdb;
return 0;
}
void
ip_output_ipsec_pmtu_update(struct tdb *tdb, struct route *ro,
struct in_addr dst, int rtableid, int transportmode)
{
struct rtentry *rt = NULL;
int rt_mtucloned = 0;
/* Find a host route to store the mtu in */
if (ro != NULL)
rt = ro->ro_rt;
/* but don't add a PMTU route for transport mode SAs */
if (transportmode)
rt = NULL;
else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) {
rt = icmp_mtudisc_clone(dst, rtableid, 1);
rt_mtucloned = 1;
}
DPRINTF("spi %08x mtu %d rt %p cloned %d",
ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned);
if (rt != NULL) {
rt->rt_mtu = tdb->tdb_mtu;
if (ro != NULL && ro->ro_rt != NULL) {
rtfree(ro->ro_rt);
ro->ro_rt = rtalloc(&ro->ro_dst, RT_RESOLVE, rtableid);
}
if (rt_mtucloned)
rtfree(rt);
}
}
int
ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd)
{
#if NPF > 0
struct ifnet *encif;
#endif
struct ip *ip;
struct in_addr dst;
int error, rtableid;
#if NPF > 0
/*
* Packet filter
*/
if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL ||
pf_test(AF_INET, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) {
m_freem(m);
return EACCES;
}
if (m == NULL)
return 0;
/*
* PF_TAG_REROUTE handling or not...
* Packet is entering IPsec so the routing is
* already overruled by the IPsec policy.
* Until now the change was not reconsidered.
* What's the behaviour?
*/
in_proto_cksum_out(m, encif);
#endif
/* Check if we are allowed to fragment */
ip = mtod(m, struct ip *);
dst = ip->ip_dst;
rtableid = m->m_pkthdr.ph_rtableid;
if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu &&
ntohs(ip->ip_len) > tdb->tdb_mtu &&
tdb->tdb_mtutimeout > gettime()) {
int transportmode;
transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) &&
(tdb->tdb_dst.sin.sin_addr.s_addr == dst.s_addr);
ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid,
transportmode);
ipsec_adjust_mtu(m, tdb->tdb_mtu);
m_freem(m);
return EMSGSIZE;
}
/* propagate IP_DF for v4-over-v6 */
if (ip_mtudisc && ip->ip_off & htons(IP_DF))
SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
/*
* Clear these -- they'll be set in the recursive invocation
* as needed.
*/
m->m_flags &= ~(M_MCAST | M_BCAST);
/* Callee frees mbuf */
KERNEL_LOCK();
error = ipsp_process_packet(m, tdb, AF_INET, 0);
KERNEL_UNLOCK();
if (error) {
ipsecstat_inc(ipsec_odrops);
tdbstat_inc(tdb, tdb_odrops);
}
if (ip_mtudisc && error == EMSGSIZE)
ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 0);
return error;
}
#endif /* IPSEC */
int
ip_fragment(struct mbuf *m0, struct mbuf_list *fml, struct ifnet *ifp,
u_long mtu)
{
struct mbuf *m;
struct ip *ip;
int firstlen, hlen, tlen, len, off;
int error;
ml_init(fml);
ml_enqueue(fml, m0);
ip = mtod(m0, struct ip *);
hlen = ip->ip_hl << 2;
tlen = m0->m_pkthdr.len;
len = (mtu - hlen) &~ 7;
if (len < 8) {
error = EMSGSIZE;
goto bad;
}
firstlen = len;
/*
* If we are doing fragmentation, we can't defer TCP/UDP
* checksumming; compute the checksum and clear the flag.
*/
in_proto_cksum_out(m0, NULL);
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
*/
for (off = hlen + firstlen; off < tlen; off += len) {
struct ip *mhip;
int mhlen;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
ml_enqueue(fml, m);
if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0)
goto bad;
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
if (hlen > sizeof(struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
mhip->ip_hl = mhlen >> 2;
} else
mhlen = sizeof(struct ip);
m->m_len = mhlen;
mhip->ip_off = ((off - hlen) >> 3) +
(ntohs(ip->ip_off) & ~IP_MF);
if (ip->ip_off & htons(IP_MF))
mhip->ip_off |= IP_MF;
if (off + len >= tlen)
len = tlen - off;
else
mhip->ip_off |= IP_MF;
mhip->ip_off = htons(mhip->ip_off);
m->m_pkthdr.len = mhlen + len;
mhip->ip_len = htons(m->m_pkthdr.len);
m->m_next = m_copym(m0, off, len, M_NOWAIT);
if (m->m_next == NULL) {
error = ENOBUFS;
goto bad;
}
mhip->ip_sum = 0;
if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4))
m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
else {
ipstat_inc(ips_outswcsum);
mhip->ip_sum = in_cksum(m, mhlen);
}
}
/*
* Update first fragment by trimming what's been copied out
* and updating header, then send each fragment (in order).
*/
m = m0;
m_adj(m, hlen + firstlen - tlen);
ip->ip_off |= htons(IP_MF);
ip->ip_len = htons(m->m_pkthdr.len);
ip->ip_sum = 0;
if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4))
m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
else {
ipstat_inc(ips_outswcsum);
ip->ip_sum = in_cksum(m, hlen);
}
ipstat_add(ips_ofragments, ml_len(fml));
return (0);
bad:
ipstat_inc(ips_odropped);
ml_purge(fml);
return (error);
}
/*
* Insert IP options into preformed packet.
* Adjust IP destination as required for IP source routing,
* as indicated by a non-zero in_addr at the start of the options.
*/
struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
struct ipoption *p = mtod(opt, struct ipoption *);
struct mbuf *n;
struct ip *ip = mtod(m, struct ip *);
unsigned int optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET)
return (m); /* XXX should fail */
/* check if options will fit to IP header */
if ((optlen + sizeof(struct ip)) > (0x0f << 2)) {
*phlen = sizeof(struct ip);
return (m);
}
if (p->ipopt_dst.s_addr)
ip->ip_dst = p->ipopt_dst;
if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
MGETHDR(n, M_DONTWAIT, MT_HEADER);
if (n == NULL)
return (m);
M_MOVE_HDR(n, m);
n->m_pkthdr.len += optlen;
m->m_len -= sizeof(struct ip);
m->m_data += sizeof(struct ip);
n->m_next = m;
m = n;
m->m_len = optlen + sizeof(struct ip);
m->m_data += max_linkhdr;
memcpy(mtod(m, caddr_t), ip, sizeof(struct ip));
} else {
m->m_data -= optlen;
m->m_len += optlen;
m->m_pkthdr.len += optlen;
memmove(mtod(m, caddr_t), (caddr_t)ip, sizeof(struct ip));
}
ip = mtod(m, struct ip *);
memcpy(ip + 1, p->ipopt_list, optlen);
*phlen = sizeof(struct ip) + optlen;
ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
return (m);
}
/*
* Copy options from ip to jp,
* omitting those not copied during fragmentation.
*/
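/*
* Note (added for clarity): whether an option is carried into each
* fragment is encoded in the "copied" bit of the option type octet,
* tested via IPOPT_COPIED() below. For example, LSRR (0x83) has the
* copied bit set, while Record Route (0x07) does not.
*/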
int
ip_optcopy(struct ip *ip, struct ip *jp)
{
u_char *cp, *dp;
int opt, optlen, cnt;
cp = (u_char *)(ip + 1);
dp = (u_char *)(jp + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP) {
/* Preserve for IP mcast tunnel's LSRR alignment. */
*dp++ = IPOPT_NOP;
optlen = 1;
continue;
}
#ifdef DIAGNOSTIC
if (cnt < IPOPT_OLEN + sizeof(*cp))
panic("malformed IPv4 option passed to ip_optcopy");
#endif
optlen = cp[IPOPT_OLEN];
#ifdef DIAGNOSTIC
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
panic("malformed IPv4 option passed to ip_optcopy");
#endif
/* bogus lengths should have been caught by ip_dooptions */
if (optlen > cnt)
optlen = cnt;
if (IPOPT_COPIED(opt)) {
memcpy(dp, cp, optlen);
dp += optlen;
}
}
for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
*dp++ = IPOPT_EOL;
return (optlen);
}
/*
* IP socket option processing.
*/
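/*
* Illustrative userland counterpart (not part of this file): options at
* the IPPROTO_IP level reach this routine via setsockopt(2) and
* getsockopt(2), e.g.
*
* int tos = IPTOS_LOWDELAY;
* setsockopt(s, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
*/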
int
ip_ctloutput(int op, struct socket *so, int level, int optname,
struct mbuf *m)
{
struct inpcb *inp = sotoinpcb(so);
int optval = 0;
struct proc *p = curproc; /* XXX */
int error = 0;
u_int rtableid, rtid = 0;
if (level != IPPROTO_IP)
return (EINVAL);
rtableid = p->p_p->ps_rtableid;
switch (op) {
case PRCO_SETOPT:
switch (optname) {
case IP_OPTIONS:
return (ip_pcbopts(&inp->inp_options, m));
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVIF:
case IP_RECVTTL:
case IP_RECVDSTPORT:
case IP_RECVRTABLE:
case IP_IPSECFLOWINFO:
if (m == NULL || m->m_len != sizeof(int))
error = EINVAL;
else {
optval = *mtod(m, int *);
switch (optname) {
case IP_TOS:
inp->inp_ip.ip_tos = optval;
break;
case IP_TTL:
if (optval > 0 && optval <= MAXTTL)
inp->inp_ip.ip_ttl = optval;
else if (optval == -1)
inp->inp_ip.ip_ttl = ip_defttl;
else
error = EINVAL;
break;
case IP_MINTTL:
if (optval >= 0 && optval <= MAXTTL)
inp->inp_ip_minttl = optval;
else
error = EINVAL;
break;
#define OPTSET(bit) \
if (optval) \
inp->inp_flags |= bit; \
else \
inp->inp_flags &= ~bit;
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
case IP_RECVRETOPTS:
OPTSET(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
OPTSET(INP_RECVDSTADDR);
break;
case IP_RECVIF:
OPTSET(INP_RECVIF);
break;
case IP_RECVTTL:
OPTSET(INP_RECVTTL);
break;
case IP_RECVDSTPORT:
OPTSET(INP_RECVDSTPORT);
break;
case IP_RECVRTABLE:
OPTSET(INP_RECVRTABLE);
break;
case IP_IPSECFLOWINFO:
OPTSET(INP_IPSECFLOWINFO);
break;
}
}
break;
#undef OPTSET
case IP_MULTICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
error = ip_setmoptions(optname, &inp->inp_moptions, m,
inp->inp_rtableid);
break;
case IP_PORTRANGE:
if (m == NULL || m->m_len != sizeof(int))
error = EINVAL;
else {
optval = *mtod(m, int *);
switch (optval) {
case IP_PORTRANGE_DEFAULT:
inp->inp_flags &= ~(INP_LOWPORT);
inp->inp_flags &= ~(INP_HIGHPORT);
break;
case IP_PORTRANGE_HIGH:
inp->inp_flags &= ~(INP_LOWPORT);
inp->inp_flags |= INP_HIGHPORT;
break;
case IP_PORTRANGE_LOW:
inp->inp_flags &= ~(INP_HIGHPORT);
inp->inp_flags |= INP_LOWPORT;
break;
default:
error = EINVAL;
break;
}
}
break;
case IP_AUTH_LEVEL:
case IP_ESP_TRANS_LEVEL:
case IP_ESP_NETWORK_LEVEL:
case IP_IPCOMP_LEVEL:
#ifndef IPSEC
error = EOPNOTSUPP;
#else
if (m == NULL || m->m_len != sizeof(int)) {
error = EINVAL;
break;
}
optval = *mtod(m, int *);
if (optval < IPSEC_LEVEL_BYPASS ||
optval > IPSEC_LEVEL_UNIQUE) {
error = EINVAL;
break;
}
switch (optname) {
case IP_AUTH_LEVEL:
if (optval < IPSEC_AUTH_LEVEL_DEFAULT &&
suser(p)) {
error = EACCES;
break;
}
inp->inp_seclevel[SL_AUTH] = optval;
break;
case IP_ESP_TRANS_LEVEL:
if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT &&
suser(p)) {
error = EACCES;
break;
}
inp->inp_seclevel[SL_ESP_TRANS] = optval;
break;
case IP_ESP_NETWORK_LEVEL:
if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT &&
suser(p)) {
error = EACCES;
break;
}
inp->inp_seclevel[SL_ESP_NETWORK] = optval;
break;
case IP_IPCOMP_LEVEL:
if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT &&
suser(p)) {
error = EACCES;
break;
}
inp->inp_seclevel[SL_IPCOMP] = optval;
break;
}
#endif
break;
case IP_IPSEC_LOCAL_ID:
case IP_IPSEC_REMOTE_ID:
error = EOPNOTSUPP;
break;
case SO_RTABLE:
if (m == NULL || m->m_len < sizeof(u_int)) {
error = EINVAL;
break;
}
rtid = *mtod(m, u_int *);
if (inp->inp_rtableid == rtid)
break;
/* needs privileges to switch when already set */
if (rtableid != rtid && rtableid != 0 &&
(error = suser(p)) != 0)
break;
/* table must exist */
if (!rtable_exists(rtid)) {
error = EINVAL;
break;
}
if (inp->inp_lport) {
error = EBUSY;
break;
}
inp->inp_rtableid = rtid;
in_pcbrehash(inp);
break;
case IP_PIPEX:
if (m != NULL && m->m_len == sizeof(int))
inp->inp_pipex = *mtod(m, int *);
else
error = EINVAL;
break;
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (optname) {
case IP_OPTIONS:
case IP_RETOPTS:
if (inp->inp_options) {
m->m_len = inp->inp_options->m_len;
memcpy(mtod(m, caddr_t),
mtod(inp->inp_options, caddr_t), m->m_len);
} else
m->m_len = 0;
break;
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVIF:
case IP_RECVTTL:
case IP_RECVDSTPORT:
case IP_RECVRTABLE:
case IP_IPSECFLOWINFO:
case IP_IPDEFTTL:
m->m_len = sizeof(int);
switch (optname) {
case IP_TOS:
optval = inp->inp_ip.ip_tos;
break;
case IP_TTL:
optval = inp->inp_ip.ip_ttl;
break;
case IP_MINTTL:
optval = inp->inp_ip_minttl;
break;
case IP_IPDEFTTL:
optval = ip_defttl;
break;
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
break;
case IP_RECVRETOPTS:
optval = OPTBIT(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
optval = OPTBIT(INP_RECVDSTADDR);
break;
case IP_RECVIF:
optval = OPTBIT(INP_RECVIF);
break;
case IP_RECVTTL:
optval = OPTBIT(INP_RECVTTL);
break;
case IP_RECVDSTPORT:
optval = OPTBIT(INP_RECVDSTPORT);
break;
case IP_RECVRTABLE:
optval = OPTBIT(INP_RECVRTABLE);
break;
case IP_IPSECFLOWINFO:
optval = OPTBIT(INP_IPSECFLOWINFO);
break;
}
*mtod(m, int *) = optval;
break;
case IP_MULTICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
error = ip_getmoptions(optname, inp->inp_moptions, m);
break;
case IP_PORTRANGE:
m->m_len = sizeof(int);
if (inp->inp_flags & INP_HIGHPORT)
optval = IP_PORTRANGE_HIGH;
else if (inp->inp_flags & INP_LOWPORT)
optval = IP_PORTRANGE_LOW;
else
optval = 0;
*mtod(m, int *) = optval;
break;
case IP_AUTH_LEVEL:
case IP_ESP_TRANS_LEVEL:
case IP_ESP_NETWORK_LEVEL:
case IP_IPCOMP_LEVEL:
#ifndef IPSEC
m->m_len = sizeof(int);
*mtod(m, int *) = IPSEC_LEVEL_NONE;
#else
m->m_len = sizeof(int);
switch (optname) {
case IP_AUTH_LEVEL:
optval = inp->inp_seclevel[SL_AUTH];
break;
case IP_ESP_TRANS_LEVEL:
optval = inp->inp_seclevel[SL_ESP_TRANS];
break;
case IP_ESP_NETWORK_LEVEL:
optval = inp->inp_seclevel[SL_ESP_NETWORK];
break;
case IP_IPCOMP_LEVEL:
optval = inp->inp_seclevel[SL_IPCOMP];
break;
}
*mtod(m, int *) = optval;
#endif
break;
case IP_IPSEC_LOCAL_ID:
case IP_IPSEC_REMOTE_ID:
error = EOPNOTSUPP;
break;
case SO_RTABLE:
m->m_len = sizeof(u_int);
*mtod(m, u_int *) = inp->inp_rtableid;
break;
case IP_PIPEX:
m->m_len = sizeof(int);
*mtod(m, int *) = inp->inp_pipex;
break;
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
/*
* Set up IP options in pcb for insertion in output packets.
* Store in mbuf with pointer in pcbopt, adding pseudo-option
* with destination address if source routed.
*/
int
ip_pcbopts(struct mbuf **pcbopt, struct mbuf *m)
{
struct mbuf *n;
struct ipoption *p;
int cnt, off, optlen;
u_char *cp;
u_char opt;
/* turn off any old options */
m_freem(*pcbopt);
*pcbopt = NULL;
if (m == NULL || m->m_len == 0) {
/*
* Only turning off any previous options.
*/
return (0);
}
if (m->m_len % sizeof(int32_t) ||
m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
return (EINVAL);
/* Don't sleep because NET_LOCK() is held. */
if ((n = m_get(M_NOWAIT, MT_SOOPTS)) == NULL)
return (ENOBUFS);
p = mtod(n, struct ipoption *);
memset(p, 0, sizeof (*p)); /* 0 = IPOPT_EOL, needed for padding */
n->m_len = sizeof(struct in_addr);
off = 0;
cnt = m->m_len;
cp = mtod(m, u_char *);
while (cnt > 0) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_NOP || opt == IPOPT_EOL) {
optlen = 1;
} else {
if (cnt < IPOPT_OLEN + sizeof(*cp))
goto bad;
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
goto bad;
}
switch (opt) {
default:
memcpy(p->ipopt_list + off, cp, optlen);
break;
case IPOPT_LSRR:
case IPOPT_SSRR:
/*
* user process specifies route as:
* ->A->B->C->D
* D must be our final destination (but we can't
* check that since we may not have connected yet).
* A is first hop destination, which doesn't appear in
* actual IP option, but is stored before the options.
*/
if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
goto bad;
/*
* Optlen is smaller because first address is popped.
* Cnt and cp will be adjusted a bit later to reflect
* this.
*/
optlen -= sizeof(struct in_addr);
p->ipopt_list[off + IPOPT_OPTVAL] = opt;
p->ipopt_list[off + IPOPT_OLEN] = optlen;
/*
* Move first hop before start of options.
*/
memcpy(&p->ipopt_dst, cp + IPOPT_OFFSET,
sizeof(struct in_addr));
cp += sizeof(struct in_addr);
cnt -= sizeof(struct in_addr);
/*
* Then copy rest of options
*/
memcpy(p->ipopt_list + off + IPOPT_OFFSET,
cp + IPOPT_OFFSET, optlen - IPOPT_OFFSET);
break;
}
off += optlen;
cp += optlen;
cnt -= optlen;
if (opt == IPOPT_EOL)
break;
}
/* pad options to next word, since p was zeroed just adjust off */
off = (off + sizeof(int32_t) - 1) & ~(sizeof(int32_t) - 1);
n->m_len += off;
if (n->m_len > sizeof(*p)) {
bad:
m_freem(n);
return (EINVAL);
}
*pcbopt = n;
return (0);
}
/*
* Lookup the interface based on the information in the ip_mreqn struct.
*/
int
ip_multicast_if(struct ip_mreqn *mreq, u_int rtableid, unsigned int *ifidx)
{
struct sockaddr_in sin;
struct rtentry *rt;
/*
* In case userland provides the imr_ifindex use this as interface.
* If no interface address was provided, use the interface of
* the route to the given multicast address.
*/
if (mreq->imr_ifindex != 0) {
*ifidx = mreq->imr_ifindex;
} else if (mreq->imr_address.s_addr == INADDR_ANY) {
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr = mreq->imr_multiaddr;
rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid);
if (!rtisvalid(rt)) {
rtfree(rt);
return EADDRNOTAVAIL;
}
*ifidx = rt->rt_ifidx;
rtfree(rt);
} else {
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr = mreq->imr_address;
rt = rtalloc(sintosa(&sin), 0, rtableid);
if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) {
rtfree(rt);
return EADDRNOTAVAIL;
}
*ifidx = rt->rt_ifidx;
rtfree(rt);
}
return 0;
}
/*
* Set the IP multicast options in response to user setsockopt().
*/
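/*
* Illustrative userland counterpart (not part of this file), joining an
* mDNS group on the default interface:
*
* struct ip_mreq mr;
* mr.imr_multiaddr.s_addr = inet_addr("224.0.0.251");
* mr.imr_interface.s_addr = INADDR_ANY;
* setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr));
*/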
int
ip_setmoptions(int optname, struct ip_moptions **imop, struct mbuf *m,
u_int rtableid)
{
struct in_addr addr;
struct in_ifaddr *ia;
struct ip_mreqn mreqn;
struct ifnet *ifp = NULL;
struct ip_moptions *imo = *imop;
struct in_multi **immp;
struct sockaddr_in sin;
unsigned int ifidx;
int i, error = 0;
u_char loop;
if (imo == NULL) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK|M_ZERO);
immp = mallocarray(IP_MIN_MEMBERSHIPS, sizeof(*immp), M_IPMOPTS,
M_WAITOK|M_ZERO);
*imop = imo;
imo->imo_ifidx = 0;
imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL;
imo->imo_loop = IP_DEFAULT_MULTICAST_LOOP;
imo->imo_num_memberships = 0;
imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
imo->imo_membership = immp;
}
switch (optname) {
case IP_MULTICAST_IF:
/*
* Select the interface for outgoing multicast packets.
*/
if (m == NULL) {
error = EINVAL;
break;
}
if (m->m_len == sizeof(struct in_addr)) {
addr = *(mtod(m, struct in_addr *));
} else if (m->m_len == sizeof(struct ip_mreq) ||
m->m_len == sizeof(struct ip_mreqn)) {
memset(&mreqn, 0, sizeof(mreqn));
memcpy(&mreqn, mtod(m, void *), m->m_len);
/*
* If an interface index is given use this
* index to set the imo_ifidx but check first
* that the interface actually exists.
* In the other case just set the addr to
* the imr_address and fall through to the
* regular code.
*/
if (mreqn.imr_ifindex != 0) {
ifp = if_get(mreqn.imr_ifindex);
if (ifp == NULL ||
ifp->if_rdomain != rtable_l2(rtableid)) {
error = EADDRNOTAVAIL;
if_put(ifp);
break;
}
imo->imo_ifidx = ifp->if_index;
if_put(ifp);
break;
} else
addr = mreqn.imr_address;
} else {
error = EINVAL;
break;
}
/*
* INADDR_ANY is used to remove a previous selection.
* When no interface is selected, a default one is
* chosen every time a multicast packet is sent.
*/
if (addr.s_addr == INADDR_ANY) {
imo->imo_ifidx = 0;
break;
}
/*
* The selected interface is identified by its local
* IP address. Find the interface and confirm that
* it supports multicasting.
*/
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr = addr;
ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid));
if (ia == NULL ||
(ia->ia_ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
break;
}
imo->imo_ifidx = ia->ia_ifp->if_index;
break;
case IP_MULTICAST_TTL:
/*
* Set the IP time-to-live for outgoing multicast packets.
*/
if (m == NULL || m->m_len != 1) {
error = EINVAL;
break;
}
imo->imo_ttl = *(mtod(m, u_char *));
break;
case IP_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
if (m == NULL || m->m_len != 1 ||
(loop = *(mtod(m, u_char *))) > 1) {
error = EINVAL;
break;
}
imo->imo_loop = loop;
break;
case IP_ADD_MEMBERSHIP:
/*
* Add a multicast group membership.
* Group must be a valid IP multicast address.
*/
if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) ||
m->m_len == sizeof(struct ip_mreqn))) {
error = EINVAL;
break;
}
memset(&mreqn, 0, sizeof(mreqn));
memcpy(&mreqn, mtod(m, void *), m->m_len);
if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) {
error = EINVAL;
break;
}
error = ip_multicast_if(&mreqn, rtableid, &ifidx);
if (error)
break;
/*
* See if we found an interface, and confirm that it
* supports multicast.
*/
ifp = if_get(ifidx);
if (ifp == NULL || ifp->if_rdomain != rtable_l2(rtableid) ||
(ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
if_put(ifp);
break;
}
/*
* See if the membership already exists or if all the
* membership slots are full.
*/
for (i = 0; i < imo->imo_num_memberships; ++i) {
if (imo->imo_membership[i]->inm_ifidx == ifidx &&
imo->imo_membership[i]->inm_addr.s_addr
== mreqn.imr_multiaddr.s_addr)
break;
}
if (i < imo->imo_num_memberships) {
error = EADDRINUSE;
if_put(ifp);
break;
}
if (imo->imo_num_memberships == imo->imo_max_memberships) {
struct in_multi **nmships, **omships;
size_t newmax;
/*
* Resize the vector to next power-of-two minus 1. If
* the size would exceed the maximum then we know we've
* really run out of entries. Otherwise, we reallocate
* the vector.
*/
nmships = NULL;
omships = imo->imo_membership;
newmax = ((imo->imo_max_memberships + 1) * 2) - 1;
if (newmax <= IP_MAX_MEMBERSHIPS) {
nmships = mallocarray(newmax, sizeof(*nmships),
M_IPMOPTS, M_NOWAIT|M_ZERO);
if (nmships != NULL) {
memcpy(nmships, omships,
sizeof(*omships) *
imo->imo_max_memberships);
free(omships, M_IPMOPTS,
sizeof(*omships) *
imo->imo_max_memberships);
imo->imo_membership = nmships;
imo->imo_max_memberships = newmax;
}
}
if (nmships == NULL) {
error = ENOBUFS;
if_put(ifp);
break;
}
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
if ((imo->imo_membership[i] =
in_addmulti(&mreqn.imr_multiaddr, ifp)) == NULL) {
error = ENOBUFS;
if_put(ifp);
break;
}
++imo->imo_num_memberships;
if_put(ifp);
break;
case IP_DROP_MEMBERSHIP:
/*
* Drop a multicast group membership.
* Group must be a valid IP multicast address.
*/
if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) ||
m->m_len == sizeof(struct ip_mreqn))) {
error = EINVAL;
break;
}
memset(&mreqn, 0, sizeof(mreqn));
memcpy(&mreqn, mtod(m, void *), m->m_len);
if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) {
error = EINVAL;
break;
}
/*
* If an interface address was specified, get a pointer
* to its ifnet structure.
*/
error = ip_multicast_if(&mreqn, rtableid, &ifidx);
if (error)
break;
/*
* Find the membership in the membership array.
*/
for (i = 0; i < imo->imo_num_memberships; ++i) {
if ((ifidx == 0 ||
imo->imo_membership[i]->inm_ifidx == ifidx) &&
imo->imo_membership[i]->inm_addr.s_addr ==
mreqn.imr_multiaddr.s_addr)
break;
}
if (i == imo->imo_num_memberships) {
error = EADDRNOTAVAIL;
break;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
in_delmulti(imo->imo_membership[i]);
/*
* Remove the gap in the membership array.
*/
for (++i; i < imo->imo_num_memberships; ++i)
imo->imo_membership[i-1] = imo->imo_membership[i];
--imo->imo_num_memberships;
break;
default:
error = EOPNOTSUPP;
break;
}
/*
* If all options have default values, no need to keep the data.
*/
if (imo->imo_ifidx == 0 && imo->imo_ttl == IP_DEFAULT_MULTICAST_TTL &&
imo->imo_loop == IP_DEFAULT_MULTICAST_LOOP &&
imo->imo_num_memberships == 0) {
free(imo->imo_membership, M_IPMOPTS,
imo->imo_max_memberships * sizeof(struct in_multi *));
free(*imop, M_IPMOPTS, sizeof(**imop));
*imop = NULL;
}
return (error);
}
/*
* Return the IP multicast options in response to user getsockopt().
*/
int
ip_getmoptions(int optname, struct ip_moptions *imo, struct mbuf *m)
{
u_char *ttl;
u_char *loop;
struct in_addr *addr;
struct in_ifaddr *ia;
struct ifnet *ifp;
switch (optname) {
case IP_MULTICAST_IF:
addr = mtod(m, struct in_addr *);
m->m_len = sizeof(struct in_addr);
if (imo == NULL || (ifp = if_get(imo->imo_ifidx)) == NULL)
addr->s_addr = INADDR_ANY;
else {
IFP_TO_IA(ifp, ia);
addr->s_addr = (ia == NULL) ? INADDR_ANY
: ia->ia_addr.sin_addr.s_addr;
if_put(ifp);
}
return (0);
case IP_MULTICAST_TTL:
ttl = mtod(m, u_char *);
m->m_len = 1;
*ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL : imo->imo_ttl;
return (0);
case IP_MULTICAST_LOOP:
loop = mtod(m, u_char *);
m->m_len = 1;
*loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP : imo->imo_loop;
return (0);
default:
return (EOPNOTSUPP);
}
}
/*
* Discard the IP multicast options.
*/
void
ip_freemoptions(struct ip_moptions *imo)
{
int i;
if (imo != NULL) {
for (i = 0; i < imo->imo_num_memberships; ++i)
in_delmulti(imo->imo_membership[i]);
free(imo->imo_membership, M_IPMOPTS,
imo->imo_max_memberships * sizeof(struct in_multi *));
free(imo, M_IPMOPTS, sizeof(*imo));
}
}
/*
* Routine called from ip_output() to loop back a copy of an IP multicast
* packet to the input queue of a specified interface.
*/
void
ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst)
{
struct ip *ip;
struct mbuf *copym;
copym = m_dup_pkt(m, max_linkhdr, M_DONTWAIT);
if (copym != NULL) {
/*
* We don't bother to fragment if the IP length is greater
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
if_input_local(ifp, copym, dst->sin_family);
}
}
/*
* Compute significant parts of the IPv4 checksum pseudo-header
* for use in a delayed TCP/UDP checksum calculation.
*/
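/*
* The pseudo-header covers the source and destination addresses plus the
* protocol number and the TCP/UDP length; callers pass the latter two
* pre-combined, as in_proto_cksum_out() below does (shown here for
* illustration, where hlen is the IP header length):
*
* csum = in_cksum_phdr(ip->ip_src.s_addr, ip->ip_dst.s_addr,
* htonl(ntohs(ip->ip_len) - hlen + ip->ip_p));
*/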
static __inline u_int16_t __attribute__((__unused__))
in_cksum_phdr(u_int32_t src, u_int32_t dst, u_int32_t lenproto)
{
u_int32_t sum;
sum = lenproto +
(u_int16_t)(src >> 16) +
(u_int16_t)(src /*& 0xffff*/) +
(u_int16_t)(dst >> 16) +
(u_int16_t)(dst /*& 0xffff*/);
sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/);
if (sum > 0xffff)
sum -= 0xffff;
return (sum);
}
/*
* Process a delayed payload checksum calculation.
*/
void
in_delayed_cksum(struct mbuf *m)
{
struct ip *ip;
u_int16_t csum, offset;
ip = mtod(m, struct ip *);
offset = ip->ip_hl << 2;
csum = in4_cksum(m, 0, offset, m->m_pkthdr.len - offset);
if (csum == 0 && ip->ip_p == IPPROTO_UDP)
csum = 0xffff;
switch (ip->ip_p) {
case IPPROTO_TCP:
offset += offsetof(struct tcphdr, th_sum);
break;
case IPPROTO_UDP:
offset += offsetof(struct udphdr, uh_sum);
break;
case IPPROTO_ICMP:
offset += offsetof(struct icmp, icmp_cksum);
break;
default:
return;
}
if ((offset + sizeof(u_int16_t)) > m->m_len)
m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT);
else
*(u_int16_t *)(mtod(m, caddr_t) + offset) = csum;
}
void
in_proto_cksum_out(struct mbuf *m, struct ifnet *ifp)
{
struct ip *ip = mtod(m, struct ip *);
/* some hw and in_delayed_cksum need the pseudo header cksum */
if (m->m_pkthdr.csum_flags &
(M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) {
u_int16_t csum = 0, offset;
offset = ip->ip_hl << 2;
if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
csum = in_cksum_phdr(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htonl(ntohs(ip->ip_len) -
offset + ip->ip_p));
if (ip->ip_p == IPPROTO_TCP)
offset += offsetof(struct tcphdr, th_sum);
else if (ip->ip_p == IPPROTO_UDP)
offset += offsetof(struct udphdr, uh_sum);
else if (ip->ip_p == IPPROTO_ICMP)
offset += offsetof(struct icmp, icmp_cksum);
if ((offset + sizeof(u_int16_t)) > m->m_len)
m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT);
else
*(u_int16_t *)(mtod(m, caddr_t) + offset) = csum;
}
if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) {
if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_TCPv4) ||
ip->ip_hl != 5) {
tcpstat_inc(tcps_outswcsum);
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */
}
} else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) {
if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_UDPv4) ||
ip->ip_hl != 5) {
udpstat_inc(udps_outswcsum);
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */
}
} else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) {
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */
}
}
int
in_ifcap_cksum(struct mbuf *m, struct ifnet *ifp, int ifcap)
{
if ((ifp == NULL) || !ISSET(ifp->if_capabilities, ifcap) ||
(ifp->if_bridgeidx != 0))
return (0);
/*
* Simplex interface sends packet back without hardware cksum.
* Keep this check in sync with the condition where ether_resolve()
* calls if_input_local().
*/
if (ISSET(m->m_flags, M_BCAST) && ISSET(ifp->if_flags, IFF_SIMPLEX) &&
!m->m_pkthdr.pf.routed)
return (0);
return (1);
}
/* $OpenBSD: subr_prf.c,v 1.106 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: subr_prf.c,v 1.45 1997/10/24 18:14:25 chuck Exp $ */
/*-
* Copyright (c) 1986, 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_prf.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/tprintf.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/mutex.h>
#include <dev/cons.h>
/*
* note that stdarg.h and the ansi style va_start macro are used for both
* ansi and traditional c compilers.
*/
#include <sys/stdarg.h>
#ifdef DDB
#include <ddb/db_output.h> /* db_printf, db_putchar prototypes */
#include <ddb/db_var.h> /* db_log, db_radix */
#endif
/*
* defines
*/
/* flags for kprintf */
#define TOCONS 0x01 /* to the console */
#define TOTTY 0x02 /* to the process' tty */
#define TOLOG 0x04 /* to the kernel message buffer */
#define TOBUFONLY 0x08 /* to the buffer (only) [for snprintf] */
#define TODDB 0x10 /* to ddb console */
#define TOCOUNT 0x20 /* act like [v]snprintf */
/* max size buffer kprintf needs to print quad_t [size in base 8 + \0] */
#define KPRINTF_BUFSIZE (sizeof(quad_t) * NBBY / 3 + 2)
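/*
* For example, with a 64-bit quad_t and NBBY == 8 this works out to
* 8 * 8 / 3 + 2 = 23 bytes: room for the 22 octal digits of a maximal
* quad_t plus a terminating NUL.
*/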
/*
* local prototypes
*/
int kprintf(const char *, int, void *, char *, va_list);
void kputchar(int, int, struct tty *);
struct mutex kprintf_mutex =
MUTEX_INITIALIZER_FLAGS(IPL_HIGH, "kprintf", MTX_NOWITNESS);
/*
* globals
*/
extern int log_open; /* subr_log: is /dev/klog open? */
const char *panicstr; /* arg to first call to panic (used as a flag
to indicate that panic has already been called). */
#ifdef DDB
/*
* Enter ddb on panic.
*/
int db_panic = 1;
/*
* db_console controls whether we can enter ddb from a special key
* combination (machine dependent).
* If DDB_SAFE_CONSOLE is defined in the kernel configuration it allows
* breaking into the console during boot. It's _really_ useful when debugging
* some things in the kernel that can cause init(8) to crash.
*/
#ifdef DDB_SAFE_CONSOLE
int db_console = 1;
#else
int db_console = 0;
#endif
#endif
/*
* panic on spl assertion failure?
*/
#ifdef SPLASSERT_WATCH
int splassert_ctl = 3;
#else
int splassert_ctl = 1;
#endif
/*
* v_putc: routine to putc on virtual console
*
* the v_putc pointer can be used to redirect the console cnputc elsewhere
* [e.g. to a "virtual console"].
*/
void (*v_putc)(int) = cnputc; /* start with cnputc (normal cons) */
/*
* Silence kernel printf when masquerading as a bootloader.
*/
#ifdef BOOT_QUIET
int printf_flags = TOLOG;
#else
int printf_flags = TOCONS | TOLOG;
#endif
/*
* functions
*/
/*
* Partial support (the failure case) of the assertion facility
* commonly found in userland.
*/
void
__assert(const char *t, const char *f, int l, const char *e)
{
panic(__KASSERTSTR, t, e, f, l);
}
/*
* tablefull: warn that a system table is full
*/
void
tablefull(const char *tab)
{
log(LOG_ERR, "%s: table is full\n", tab);
}
/*
* If we have panicked, prefer db_printf() and db_vprintf() where
* available.
*/
#ifdef DDB
#define panic_printf(...) db_printf(__VA_ARGS__)
#define panic_vprintf(...) db_vprintf(__VA_ARGS__)
#else
#define panic_printf(...) printf(__VA_ARGS__)
#define panic_vprintf(...) vprintf(__VA_ARGS__)
#endif
/*
* panic: handle an unresolvable fatal error
*
* prints "panic: <message>" and reboots. if called twice (i.e. recursive
* call) we avoid trying to sync the disk and just reboot (to avoid
* recursive panics).
*/
void
panic(const char *fmt, ...)
{
struct cpu_info *ci = curcpu();
int bootopt;
va_list ap;
bootopt = RB_AUTOBOOT | RB_DUMP;
if (atomic_cas_ptr(&panicstr, NULL, ci->ci_panicbuf) != NULL)
bootopt |= RB_NOSYNC;
/* do not trigger assertions, we know that we are inconsistent */
splassert_ctl = 0;
#ifdef BOOT_QUIET
printf_flags |= TOCONS; /* make sure we see kernel printf output */
#endif
/*
* All panic messages are printed, but only the first panic on a
* given CPU is written to its panicbuf.
*/
if (ci->ci_panicbuf[0] == '\0') {
va_start(ap, fmt);
vsnprintf(ci->ci_panicbuf, sizeof(ci->ci_panicbuf), fmt, ap);
va_end(ap);
panic_printf("panic: %s\n", ci->ci_panicbuf);
} else {
panic_printf("panic: ");
va_start(ap, fmt);
panic_vprintf(fmt, ap);
va_end(ap);
panic_printf("\n");
}
#ifdef DDB
if (db_panic)
db_enter();
else
db_stack_dump();
#endif
reboot(bootopt);
/* NOTREACHED */
}
/*
* We print only the function name. The file name is usually very long and
* would eat tons of space in the kernel.
*/
void
splassert_fail(int wantipl, int haveipl, const char *func)
{
if (panicstr || db_active)
return;
printf("splassert: %s: want %d have %d\n", func, wantipl, haveipl);
switch (splassert_ctl) {
case 1:
break;
case 2:
#ifdef DDB
db_stack_dump();
#endif
break;
case 3:
#ifdef DDB
db_stack_dump();
db_enter();
#endif
break;
default:
panic("spl assertion failure in %s", func);
}
}
/*
* kernel logging functions: log, logpri, addlog
*/
/*
* log: write to the log buffer
*
* => will not sleep [so safe to call from interrupt]
* => will log to console if /dev/klog isn't open
*/
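/*
* Typical use (illustrative only; "sc" is a hypothetical driver softc):
*
* log(LOG_ERR, "%s: device timeout\n", sc->sc_dev.dv_xname);
*/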
void
log(int level, const char *fmt, ...)
{
int s;
va_list ap;
s = splhigh();
logpri(level); /* log the level first */
va_start(ap, fmt);
kprintf(fmt, TOLOG, NULL, NULL, ap);
va_end(ap);
splx(s);
if (!log_open) {
va_start(ap, fmt);
mtx_enter(&kprintf_mutex);
kprintf(fmt, TOCONS, NULL, NULL, ap);
mtx_leave(&kprintf_mutex);
va_end(ap);
}
logwakeup(); /* wake up anyone waiting for log msgs */
}
/*
* logpri: log the priority level to the klog
*/
void
logpri(int level)
{
char *p;
char snbuf[KPRINTF_BUFSIZE];
kputchar('<', TOLOG, NULL);
snprintf(snbuf, sizeof snbuf, "%d", level);
for (p = snbuf ; *p ; p++)
kputchar(*p, TOLOG, NULL);
kputchar('>', TOLOG, NULL);
}
/*
* addlog: add info to previous log message
*/
int
addlog(const char *fmt, ...)
{
int s;
va_list ap;
s = splhigh();
va_start(ap, fmt);
kprintf(fmt, TOLOG, NULL, NULL, ap);
va_end(ap);
splx(s);
if (!log_open) {
va_start(ap, fmt);
mtx_enter(&kprintf_mutex);
kprintf(fmt, TOCONS, NULL, NULL, ap);
mtx_leave(&kprintf_mutex);
va_end(ap);
}
logwakeup();
return(0);
}
/*
* kputchar: print a single character on console or user terminal.
*
* => if console, then the last MSGBUFS chars are saved in msgbuf
* for inspection later (e.g. dmesg/syslog)
*/
void
kputchar(int c, int flags, struct tty *tp)
{
extern int msgbufmapped;
if (panicstr)
constty = NULL;
if ((flags & TOCONS) && tp == NULL && constty != NULL && !db_active) {
tp = constty;
flags |= TOTTY;
}
if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 &&
(flags & TOCONS) && tp == constty)
constty = NULL;
if ((flags & TOLOG) &&
c != '\0' && c != '\r' && c != 0177 && msgbufmapped)
msgbuf_putchar(msgbufp, c);
if ((flags & TOCONS) && (constty == NULL || db_active) && c != '\0')
(*v_putc)(c);
#ifdef DDB
if (flags & TODDB)
db_putchar(c);
#endif
}
/*
* uprintf: print to the controlling tty of the current process
*
* => we may block if the tty queue is full
* => no message is printed if the queue doesn't clear in a reasonable
* time
*/
void
uprintf(const char *fmt, ...)
{
struct process *pr = curproc->p_p;
va_list ap;
if (pr->ps_flags & PS_CONTROLT && pr->ps_session->s_ttyvp) {
va_start(ap, fmt);
kprintf(fmt, TOTTY, pr->ps_session->s_ttyp, NULL, ap);
va_end(ap);
}
}
#if defined(NFSSERVER) || defined(NFSCLIENT)
/*
* tprintf functions: used to send messages to a specific process
*
* usage:
* get a tpr_t handle on a process "p" by using "tprintf_open(p)"
* use the handle when calling "tprintf"
* when done, do a "tprintf_close" to drop the handle
*/
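/*
* Sketch of that sequence (illustrative only; "p" is the target proc and
* "hostname" a hypothetical string):
*
* tpr_t tpr = tprintf_open(p);
* tprintf(tpr, "nfs server %s: not responding\n", hostname);
* tprintf_close(tpr);
*/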
/*
* tprintf_open: get a tprintf handle on a process "p"
* XXX change s/proc/process
*
* => returns NULL if process can't be printed to
*/
tpr_t
tprintf_open(struct proc *p)
{
struct process *pr = p->p_p;
if (pr->ps_flags & PS_CONTROLT && pr->ps_session->s_ttyvp) {
SESSHOLD(pr->ps_session);
return ((tpr_t)pr->ps_session);
}
return ((tpr_t) NULL);
}
/*
* tprintf_close: dispose of a tprintf handle obtained with tprintf_open
*/
void
tprintf_close(tpr_t sess)
{
if (sess)
SESSRELE((struct session *) sess);
}
/*
* tprintf: given tprintf handle to a process [obtained with tprintf_open],
* send a message to the controlling tty for that process.
*
* => also sends message to /dev/klog
*/
void
tprintf(tpr_t tpr, const char *fmt, ...)
{
struct session *sess = (struct session *)tpr;
struct tty *tp = NULL;
int flags = TOLOG;
va_list ap;
logpri(LOG_INFO);
if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) {
flags |= TOTTY;
tp = sess->s_ttyp;
}
va_start(ap, fmt);
kprintf(fmt, flags, tp, NULL, ap);
va_end(ap);
logwakeup();
}
#endif /* NFSSERVER || NFSCLIENT */
/*
* ttyprintf: send a message to a specific tty
*
* => should be used only by tty driver or anything that knows the
* underlying tty will not be revoked(2)'d away. [otherwise,
* use tprintf]
*/
void
ttyprintf(struct tty *tp, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
kprintf(fmt, TOTTY, tp, NULL, ap);
va_end(ap);
}
#ifdef DDB
/*
* db_printf: printf for DDB (via db_putchar)
*/
int
db_printf(const char *fmt, ...)
{
va_list ap;
int retval;
va_start(ap, fmt);
retval = db_vprintf(fmt, ap);
va_end(ap);
return(retval);
}
int
db_vprintf(const char *fmt, va_list ap)
{
int flags;
flags = TODDB;
if (db_log)
flags |= TOLOG;
return (kprintf(fmt, flags, NULL, NULL, ap));
}
#endif /* DDB */
/*
* normal kernel printf functions: printf, vprintf, snprintf
*/
/*
* printf: print a message to the console and the log
*/
int
printf(const char *fmt, ...)
{
va_list ap;
int retval;
va_start(ap, fmt);
mtx_enter(&kprintf_mutex);
retval = kprintf(fmt, printf_flags, NULL, NULL, ap);
mtx_leave(&kprintf_mutex);
va_end(ap);
if (!panicstr)
logwakeup();
return(retval);
}
/*
* vprintf: print a message to the console and the log [already have a
* va_list]
*/
int
vprintf(const char *fmt, va_list ap)
{
int retval;
mtx_enter(&kprintf_mutex);
retval = kprintf(fmt, TOCONS | TOLOG, NULL, NULL, ap);
mtx_leave(&kprintf_mutex);
if (!panicstr)
logwakeup();
return (retval);
}
/*
* snprintf: print a message to a buffer
*/
int
snprintf(char *buf, size_t size, const char *fmt, ...)
{
int retval;
va_list ap;
char *p;
p = buf;
if (size > 0)
p += size - 1;
va_start(ap, fmt);
retval = kprintf(fmt, TOBUFONLY | TOCOUNT, &p, buf, ap);
va_end(ap);
if (size > 0)
*p = '\0'; /* null terminate */
return(retval);
}
/*
* vsnprintf: print a message to a buffer [already have va_alist]
*/
int
vsnprintf(char *buf, size_t size, const char *fmt, va_list ap)
{
int retval;
char *p;
p = buf + size - 1;
if (size < 1)
p = buf;
retval = kprintf(fmt, TOBUFONLY | TOCOUNT, &p, buf, ap);
if (size > 0)
*(p) = 0; /* null terminate */
return(retval);
}
/*
* kprintf: scaled down version of printf(3).
*
* this version based on vfprintf() from libc which was derived from
* software contributed to Berkeley by Chris Torek.
*
* The additional format %b is supported to decode error registers.
* Its usage is:
*
* printf("reg=%b\n", regval, "<base><arg>*");
*
* where <base> is the output base expressed as a control character, e.g.
* \10 gives octal; \20 gives hex. Each arg is a sequence of characters,
* the first of which gives the bit number to be inspected (origin 1), and
* the next characters (up to a control character, i.e. a character <= 32),
* give the name of the register. Thus:
*
* kprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
*
* would produce output:
*
* reg=3<BITTWO,BITONE>
*
* To support larger integers (> 32 bits), %b formatting will also accept
* control characters in the region 0x80 - 0xff. 0x80 refers to bit 0,
* 0x81 refers to bit 1, and so on. The equivalent string to the above is:
*
* kprintf("reg=%b\n", 3, "\10\201BITTWO\200BITONE\n");
*
* and would produce the same output.
*
* Like the rest of printf, %b can be prefixed to handle various size
* modifiers, e.g. %b is for "int", %lb is for "long", and %llb supports
* "long long".
*
* This code is large and complicated...
*/
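/*
* A worked example of the wide-bit syntax above (illustrative only; the
* variable and bit names are made up): bit 36 is named by the control
* character 0x80 + 36 = 0xa4 (octal \244), so
*
* printf("status=%llb\n", status, "\20\244ERR36\201ENABLE\200VALID");
*
* with status = (1ULL << 36) | 1 would print
* "status=1000000001<ERR36,VALID>".
*/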
/*
* macros for converting digits to letters and vice versa
*/
#define to_digit(c) ((c) - '0')
#define is_digit(c) ((unsigned)to_digit(c) <= 9)
#define to_char(n) ((n) + '0')
/*
* flags used during conversion.
*/
#define ALT 0x001 /* alternate form */
#define HEXPREFIX 0x002 /* add 0x or 0X prefix */
#define LADJUST 0x004 /* left adjustment */
#define LONGDBL 0x008 /* long double; unimplemented */
#define LONGINT 0x010 /* long integer */
#define QUADINT 0x020 /* quad integer */
#define SHORTINT 0x040 /* short integer */
#define ZEROPAD 0x080 /* zero (as opposed to blank) pad */
#define FPT 0x100 /* Floating point number */
#define SIZEINT 0x200 /* (signed) size_t */
/*
* To extend shorts properly, we need both signed and unsigned
* argument extraction methods.
*/
#define SARG() \
(flags&QUADINT ? va_arg(ap, quad_t) : \
flags&LONGINT ? va_arg(ap, long) : \
flags&SIZEINT ? va_arg(ap, ssize_t) : \
flags&SHORTINT ? (long)(short)va_arg(ap, int) : \
(long)va_arg(ap, int))
#define UARG() \
(flags&QUADINT ? va_arg(ap, u_quad_t) : \
flags&LONGINT ? va_arg(ap, u_long) : \
flags&SIZEINT ? va_arg(ap, size_t) : \
flags&SHORTINT ? (u_long)(u_short)va_arg(ap, int) : \
(u_long)va_arg(ap, u_int))
#define KPRINTF_PUTCHAR(C) do { \
int chr = (C); \
ret += 1; \
if (oflags & TOBUFONLY) { \
if ((vp != NULL) && (sbuf == tailp)) { \
if (!(oflags & TOCOUNT)) \
goto overflow; \
} else \
*sbuf++ = chr; \
} else { \
kputchar(chr, oflags, (struct tty *)vp); \
} \
} while(0)
int
kprintf(const char *fmt0, int oflags, void *vp, char *sbuf, va_list ap)
{
char *fmt; /* format string */
int ch; /* character from fmt */
int n; /* handy integer (short term usage) */
char *cp = NULL; /* handy char pointer (short term usage) */
int flags; /* flags as above */
int ret; /* return value accumulator */
int width; /* width from format (%8d), or 0 */
int prec; /* precision from format (%.3d), or -1 */
char sign; /* sign prefix (' ', '+', '-', or \0) */
u_quad_t _uquad; /* integer arguments %[diouxX] */
enum { OCT, DEC, HEX } base;/* base for [diouxX] conversion */
int dprec; /* a copy of prec if [diouxX], 0 otherwise */
int realsz; /* field size expanded by dprec */
int size = 0; /* size of converted field or string */
char *xdigs = NULL; /* digits for [xX] conversion */
char buf[KPRINTF_BUFSIZE]; /* space for %c, %[diouxX] */
char *tailp = NULL; /* tail pointer for snprintf */
if (oflags & TOCONS)
MUTEX_ASSERT_LOCKED(&kprintf_mutex);
if ((oflags & TOBUFONLY) && (vp != NULL))
tailp = *(char **)vp;
fmt = (char *)fmt0;
ret = 0;
/*
* Scan the format for conversions (`%' character).
*/
for (;;) {
while (*fmt != '%' && *fmt) {
KPRINTF_PUTCHAR(*fmt++);
}
if (*fmt == 0)
goto done;
fmt++; /* skip over '%' */
flags = 0;
dprec = 0;
width = 0;
prec = -1;
sign = '\0';
rflag: ch = *fmt++;
reswitch: switch (ch) {
/* XXX: non-standard '%b' format */
case 'b': {
char *b, *z;
int tmp;
_uquad = UARG();
b = va_arg(ap, char *);
if (*b == 8)
snprintf(buf, sizeof buf, "%llo", _uquad);
else if (*b == 10)
snprintf(buf, sizeof buf, "%lld", _uquad);
else if (*b == 16)
snprintf(buf, sizeof buf, "%llx", _uquad);
else
break;
b++;
z = buf;
while (*z) {
KPRINTF_PUTCHAR(*z++);
}
if (_uquad) {
tmp = 0;
while ((n = *b++) != 0) {
if (n & 0x80)
n &= 0x7f;
else if (n <= ' ')
n = n - 1;
if (_uquad & (1LL << n)) {
KPRINTF_PUTCHAR(tmp ? ',' : '<');
while (*b > ' ' &&
(*b & 0x80) == 0) {
KPRINTF_PUTCHAR(*b);
b++;
}
tmp = 1;
} else {
while (*b > ' ' &&
(*b & 0x80) == 0)
b++;
}
}
if (tmp) {
KPRINTF_PUTCHAR('>');
}
}
continue; /* no output */
}
case ' ':
/*
* ``If the space and + flags both appear, the space
* flag will be ignored.''
* -- ANSI X3J11
*/
if (!sign)
sign = ' ';
goto rflag;
case '#':
flags |= ALT;
goto rflag;
case '*':
/*
* ``A negative field width argument is taken as a
* - flag followed by a positive field width.''
* -- ANSI X3J11
* They don't exclude field widths read from args.
*/
if ((width = va_arg(ap, int)) >= 0)
goto rflag;
width = -width;
/* FALLTHROUGH */
case '-':
flags |= LADJUST;
goto rflag;
case '+':
sign = '+';
goto rflag;
case '.':
if ((ch = *fmt++) == '*') {
n = va_arg(ap, int);
prec = n < 0 ? -1 : n;
goto rflag;
}
n = 0;
while (is_digit(ch)) {
n = 10 * n + to_digit(ch);
ch = *fmt++;
}
prec = n < 0 ? -1 : n;
goto reswitch;
case '0':
/*
* ``Note that 0 is taken as a flag, not as the
* beginning of a field width.''
* -- ANSI X3J11
*/
flags |= ZEROPAD;
goto rflag;
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
n = 0;
do {
n = 10 * n + to_digit(ch);
ch = *fmt++;
} while (is_digit(ch));
width = n;
goto reswitch;
case 'h':
flags |= SHORTINT;
goto rflag;
case 'l':
if (*fmt == 'l') {
fmt++;
flags |= QUADINT;
} else {
flags |= LONGINT;
}
goto rflag;
case 'q':
flags |= QUADINT;
goto rflag;
case 'z':
flags |= SIZEINT;
goto rflag;
case 'c':
*(cp = buf) = va_arg(ap, int);
size = 1;
sign = '\0';
break;
case 't':
/* ptrdiff_t */
/* FALLTHROUGH */
case 'D':
flags |= LONGINT;
/*FALLTHROUGH*/
case 'd':
case 'i':
_uquad = SARG();
if ((quad_t)_uquad < 0) {
_uquad = -_uquad;
sign = '-';
}
base = DEC;
goto number;
case 'n':
panic("no %%n support");
break;
case 'O':
flags |= LONGINT;
/*FALLTHROUGH*/
case 'o':
_uquad = UARG();
base = OCT;
goto nosign;
case 'p':
/*
* ``The argument shall be a pointer to void. The
* value of the pointer is converted to a sequence
* of printable characters, in an implementation-
* defined manner.''
* -- ANSI X3J11
*/
_uquad = (u_long)va_arg(ap, void *);
base = HEX;
xdigs = "0123456789abcdef";
flags |= HEXPREFIX;
ch = 'x';
goto nosign;
case 's':
if ((cp = va_arg(ap, char *)) == NULL)
cp = "(null)";
if (prec >= 0) {
/*
* can't use strlen; can only look for the
* NUL in the first `prec' characters, and
* strlen() will go further.
*/
char *p = memchr(cp, 0, prec);
if (p != NULL) {
size = p - cp;
if (size > prec)
size = prec;
} else
size = prec;
} else
size = strlen(cp);
sign = '\0';
break;
case 'U':
flags |= LONGINT;
/*FALLTHROUGH*/
case 'u':
_uquad = UARG();
base = DEC;
goto nosign;
case 'X':
xdigs = "0123456789ABCDEF";
goto hex;
case 'x':
xdigs = "0123456789abcdef";
hex: _uquad = UARG();
base = HEX;
/* leading 0x/X only if non-zero */
if (flags & ALT && _uquad != 0)
flags |= HEXPREFIX;
/* unsigned conversions */
nosign: sign = '\0';
/*
* ``... diouXx conversions ... if a precision is
* specified, the 0 flag will be ignored.''
* -- ANSI X3J11
*/
number: if ((dprec = prec) >= 0)
flags &= ~ZEROPAD;
/*
* ``The result of converting a zero value with an
* explicit precision of zero is no characters.''
* -- ANSI X3J11
*/
cp = buf + KPRINTF_BUFSIZE;
if (_uquad != 0 || prec != 0) {
/*
* Unsigned mod is hard, and unsigned mod
* by a constant is easier than that by
* a variable; hence this switch.
*/
switch (base) {
case OCT:
do {
*--cp = to_char(_uquad & 7);
_uquad >>= 3;
} while (_uquad);
/* handle octal leading 0 */
if (flags & ALT && *cp != '0')
*--cp = '0';
break;
case DEC:
/* many numbers are 1 digit */
while (_uquad >= 10) {
*--cp = to_char(_uquad % 10);
_uquad /= 10;
}
*--cp = to_char(_uquad);
break;
case HEX:
do {
*--cp = xdigs[_uquad & 15];
_uquad >>= 4;
} while (_uquad);
break;
default:
cp = "bug in kprintf: bad base";
size = strlen(cp);
goto skipsize;
}
}
size = buf + KPRINTF_BUFSIZE - cp;
skipsize:
break;
default: /* "%?" prints ?, unless ? is NUL */
if (ch == '\0')
goto done;
/* pretend it was %c with argument ch */
cp = buf;
*cp = ch;
size = 1;
sign = '\0';
break;
}
/*
* All reasonable formats wind up here. At this point, `cp'
* points to a string which (if not flags&LADJUST) should be
* padded out to `width' places. If flags&ZEROPAD, it should
* first be prefixed by any sign or other prefix; otherwise,
* it should be blank padded before the prefix is emitted.
* After any left-hand padding and prefixing, emit zeroes
* required by a decimal [diouxX] precision, then print the
* string proper, then emit zeroes required by any leftover
* floating precision; finally, if LADJUST, pad with blanks.
*
* Compute actual size, so we know how much to pad.
* size excludes decimal prec; realsz includes it.
*/
realsz = dprec > size ? dprec : size;
if (sign)
realsz++;
else if (flags & HEXPREFIX)
realsz += 2;
/* right-adjusting blank padding */
if ((flags & (LADJUST|ZEROPAD)) == 0) {
n = width - realsz;
while (n-- > 0) KPRINTF_PUTCHAR(' ');
}
/* prefix */
if (sign) {
KPRINTF_PUTCHAR(sign);
} else if (flags & HEXPREFIX) {
KPRINTF_PUTCHAR('0');
KPRINTF_PUTCHAR(ch);
}
/* right-adjusting zero padding */
if ((flags & (LADJUST|ZEROPAD)) == ZEROPAD) {
n = width - realsz;
while (n-- > 0) KPRINTF_PUTCHAR('0');
}
/* leading zeroes from decimal precision */
n = dprec - size;
while (n-- > 0) KPRINTF_PUTCHAR('0');
/* the string or number proper */
while (size--) KPRINTF_PUTCHAR(*cp++);
/* left-adjusting padding (always blank) */
if (flags & LADJUST) {
n = width - realsz;
while (n-- > 0) KPRINTF_PUTCHAR(' ');
}
}
done:
if ((oflags & TOBUFONLY) && (vp != NULL))
*(char **)vp = sbuf;
overflow:
return (ret);
/* NOTREACHED */
}
#if __GNUC_PREREQ__(2,96)
/*
* XXX - these functions shouldn't be in the kernel, but gcc 3.X feels like
* translating some printf calls to puts and since it doesn't seem
* possible to just turn off parts of those optimizations (some of
* them are really useful), we have to provide a dummy puts and putchar
* that are wrappers around printf.
*/
int puts(const char *);
int putchar(int c);
int
puts(const char *str)
{
printf("%s\n", str);
return (0);
}
int
putchar(int c)
{
printf("%c", c);
return (c);
}
#endif
/* $OpenBSD: icmp6.c,v 1.242 2022/05/05 13:57:40 claudio Exp $ */
/* $KAME: icmp6.c,v 1.217 2001/06/20 15:03:29 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94
*/
#include "carp.h"
#include "pf.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/mld6_var.h>
#include <netinet/in_pcb.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6protosw.h>
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
#if NPF > 0
#include <net/pfvar.h>
#endif
struct cpumem *icmp6counters;
extern int icmp6errppslim;
static int icmp6errpps_count = 0;
static struct timeval icmp6errppslim_last;
/*
* List of callbacks to notify when Path MTU changes are made.
*/
struct icmp6_mtudisc_callback {
LIST_ENTRY(icmp6_mtudisc_callback) mc_list;
void (*mc_func)(struct sockaddr_in6 *, u_int);
};
LIST_HEAD(, icmp6_mtudisc_callback) icmp6_mtudisc_callbacks =
LIST_HEAD_INITIALIZER(icmp6_mtudisc_callbacks);
struct rttimer_queue icmp6_mtudisc_timeout_q;
/* XXX do these values make any sense? */
static int icmp6_mtudisc_hiwat = 1280;
static int icmp6_mtudisc_lowat = 256;
/*
* keep track of # of redirect routes.
*/
struct rttimer_queue icmp6_redirect_timeout_q;
/* XXX experimental, turned off */
static int icmp6_redirect_lowat = -1;
void icmp6_errcount(int, int);
int icmp6_ratelimit(const struct in6_addr *, const int, const int);
const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *,
struct in6_addr *);
int icmp6_notify_error(struct mbuf *, int, int, int);
void icmp6_mtudisc_timeout(struct rtentry *, u_int);
void
icmp6_init(void)
{
mld6_init();
rt_timer_queue_init(&icmp6_mtudisc_timeout_q, ip6_mtudisc_timeout,
&icmp6_mtudisc_timeout);
rt_timer_queue_init(&icmp6_redirect_timeout_q, icmp6_redirtimeout,
NULL);
icmp6counters = counters_alloc(icp6s_ncounters);
}
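/*
 * Map an outgoing ICMPv6 error's type/code pair to the matching
 * per-type output counter and increment it.
 */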
void
icmp6_errcount(int type, int code)
{
enum icmp6stat_counters c = icp6s_ounknown;
switch (type) {
case ICMP6_DST_UNREACH:
switch (code) {
case ICMP6_DST_UNREACH_NOROUTE:
c = icp6s_odst_unreach_noroute;
break;
case ICMP6_DST_UNREACH_ADMIN:
c = icp6s_odst_unreach_admin;
break;
case ICMP6_DST_UNREACH_BEYONDSCOPE:
c = icp6s_odst_unreach_beyondscope;
break;
case ICMP6_DST_UNREACH_ADDR:
c = icp6s_odst_unreach_addr;
break;
case ICMP6_DST_UNREACH_NOPORT:
c = icp6s_odst_unreach_noport;
break;
}
break;
case ICMP6_PACKET_TOO_BIG:
c = icp6s_opacket_too_big;
break;
case ICMP6_TIME_EXCEEDED:
switch (code) {
case ICMP6_TIME_EXCEED_TRANSIT:
c = icp6s_otime_exceed_transit;
break;
case ICMP6_TIME_EXCEED_REASSEMBLY:
c = icp6s_otime_exceed_reassembly;
break;
}
break;
case ICMP6_PARAM_PROB:
switch (code) {
case ICMP6_PARAMPROB_HEADER:
c = icp6s_oparamprob_header;
break;
case ICMP6_PARAMPROB_NEXTHEADER:
c = icp6s_oparamprob_nextheader;
break;
case ICMP6_PARAMPROB_OPTION:
c = icp6s_oparamprob_option;
break;
}
break;
case ND_REDIRECT:
c = icp6s_oredirect;
break;
}
icmp6stat_inc(c);
}
/*
* Register a Path MTU Discovery callback.
*/
void
icmp6_mtudisc_callback_register(void (*func)(struct sockaddr_in6 *, u_int))
{
struct icmp6_mtudisc_callback *mc;
LIST_FOREACH(mc, &icmp6_mtudisc_callbacks, mc_list) {
if (mc->mc_func == func)
return;
}
mc = malloc(sizeof(*mc), M_PCB, M_NOWAIT);
if (mc == NULL)
panic("%s", __func__);
mc->mc_func = func;
LIST_INSERT_HEAD(&icmp6_mtudisc_callbacks, mc, mc_list);
}
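/*
 * Build an ICMPv6 error of the given type/code in response to the
 * offending packet `m'.  On success an mbuf with fresh IPv6 and ICMPv6
 * headers prepended to the (possibly truncated) offending packet is
 * returned; NULL is returned and the packet is consumed when the error
 * must be suppressed or an allocation fails.
 */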
struct mbuf *
icmp6_do_error(struct mbuf *m, int type, int code, int param)
{
struct ip6_hdr *oip6, *nip6;
struct icmp6_hdr *icmp6;
u_int preplen;
int off;
int nxt;
icmp6stat_inc(icp6s_error);
/* count per-type-code statistics */
icmp6_errcount(type, code);
if (m->m_len < sizeof(struct ip6_hdr)) {
m = m_pullup(m, sizeof(struct ip6_hdr));
if (m == NULL)
return (NULL);
}
oip6 = mtod(m, struct ip6_hdr *);
/*
* If the destination address of the erroneous packet is a multicast
* address, or the packet was sent using link-layer multicast,
* we should basically suppress sending an error (RFC 2463, Section
* 2.4).
* We have two exceptions (the item e.2 in that section):
* - the Packet Too Big message can be sent for path MTU discovery.
* - the Parameter Problem Message that can be allowed an icmp6 error
* in the option type field. This check has been done in
* ip6_unknown_opt(), so we can just check the type and code.
*/
if ((m->m_flags & (M_BCAST|M_MCAST) ||
IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) &&
(type != ICMP6_PACKET_TOO_BIG &&
(type != ICMP6_PARAM_PROB ||
code != ICMP6_PARAMPROB_OPTION)))
goto freeit;
/*
* RFC 2463, 2.4 (e.5): source address check.
* XXX: the case of anycast source?
*/
if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) ||
IN6_IS_ADDR_MULTICAST(&oip6->ip6_src))
goto freeit;
/*
* If we are about to send ICMPv6 against ICMPv6 error/redirect,
* don't do it.
*/
nxt = -1;
off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
if (off >= 0 && nxt == IPPROTO_ICMPV6) {
struct icmp6_hdr *icp;
IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off,
sizeof(*icp));
if (icp == NULL) {
icmp6stat_inc(icp6s_tooshort);
return (NULL);
}
if (icp->icmp6_type < ICMP6_ECHO_REQUEST ||
icp->icmp6_type == ND_REDIRECT) {
/*
* ICMPv6 error
* Special case: for redirect (which is
* informational) we must not send icmp6 error.
*/
icmp6stat_inc(icp6s_canterror);
goto freeit;
} else {
/* ICMPv6 informational - send the error */
}
}
else {
/* non-ICMPv6 - send the error */
}
oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */
/* Finally, do rate limitation check. */
if (icmp6_ratelimit(&oip6->ip6_src, type, code)) {
icmp6stat_inc(icp6s_toofreq);
goto freeit;
}
/*
* OK, ICMP6 can be generated.
*/
if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN)
m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len);
preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
M_PREPEND(m, preplen, M_DONTWAIT);
if (m && m->m_len < preplen)
m = m_pullup(m, preplen);
if (m == NULL) {
nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__));
return (NULL);
}
nip6 = mtod(m, struct ip6_hdr *);
nip6->ip6_src = oip6->ip6_src;
nip6->ip6_dst = oip6->ip6_dst;
if (IN6_IS_SCOPE_EMBED(&oip6->ip6_src))
oip6->ip6_src.s6_addr16[1] = 0;
if (IN6_IS_SCOPE_EMBED(&oip6->ip6_dst))
oip6->ip6_dst.s6_addr16[1] = 0;
icmp6 = (struct icmp6_hdr *)(nip6 + 1);
icmp6->icmp6_type = type;
icmp6->icmp6_code = code;
icmp6->icmp6_pptr = htonl((u_int32_t)param);
/*
* icmp6_reflect() is designed to be in the input path.
* icmp6_error() can be called from both input and output path,
* and if we are in output path rcvif could contain bogus value.
* clear m->m_pkthdr.ph_ifidx for safety, we should have enough
* scope information in ip header (nip6).
*/
m->m_pkthdr.ph_ifidx = 0;
icmp6stat_inc(icp6s_outhist + type);
return (m);
freeit:
/*
* If we can't tell whether or not we can generate ICMP6, free it.
*/
return (m_freem(m));
}
/*
* Generate an error packet of type error in response to bad IP6 packet.
*/
void
icmp6_error(struct mbuf *m, int type, int code, int param)
{
struct mbuf *n;
n = icmp6_do_error(m, type, code, param);
if (n != NULL) {
/* header order: IPv6 - ICMPv6 */
if (!icmp6_reflect(&n, sizeof(struct ip6_hdr), NULL))
ip6_send(n);
}
}
/*
* Process a received ICMP6 message.
*/
int
icmp6_input(struct mbuf **mp, int *offp, int proto, int af)
{
#if NCARP > 0
struct ifnet *ifp;
#endif
struct mbuf *m = *mp, *n;
struct ip6_hdr *ip6, *nip6;
struct icmp6_hdr *icmp6, *nicmp6;
int off = *offp;
int icmp6len = m->m_pkthdr.len - *offp;
int code, sum, noff;
char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
/*
* Locate icmp6 structure in mbuf, and check
* that not corrupted and of at least minimum length
*/
ip6 = mtod(m, struct ip6_hdr *);
if (icmp6len < sizeof(struct icmp6_hdr)) {
icmp6stat_inc(icp6s_tooshort);
goto freeit;
}
/*
* calculate the checksum
*/
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6));
if (icmp6 == NULL) {
icmp6stat_inc(icp6s_tooshort);
return IPPROTO_DONE;
}
code = icmp6->icmp6_code;
if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) {
nd6log((LOG_ERR,
"ICMP6 checksum error(%d|%x) %s\n",
icmp6->icmp6_type, sum,
inet_ntop(AF_INET6, &ip6->ip6_src, src, sizeof(src))));
icmp6stat_inc(icp6s_checksum);
goto freeit;
}
#if NPF > 0
if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
switch (icmp6->icmp6_type) {
/*
* These ICMP6 types map to other connections. They must be
* delivered to pr_ctlinput() also for diverted connections.
*/
case ICMP6_DST_UNREACH:
case ICMP6_PACKET_TOO_BIG:
case ICMP6_TIME_EXCEEDED:
case ICMP6_PARAM_PROB:
/*
* Do not use the divert-to property of the TCP or UDP
* rule when doing the PCB lookup for the raw socket.
*/
m->m_pkthdr.pf.flags &= ~PF_TAG_DIVERTED;
break;
default:
goto raw;
}
}
#endif /* NPF */
#if NCARP > 0
ifp = if_get(m->m_pkthdr.ph_ifidx);
if (ifp == NULL)
goto freeit;
if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST &&
carp_lsdrop(ifp, m, AF_INET6, ip6->ip6_src.s6_addr32,
ip6->ip6_dst.s6_addr32, 1)) {
if_put(ifp);
goto freeit;
}
if_put(ifp);
#endif
icmp6stat_inc(icp6s_inhist + icmp6->icmp6_type);
switch (icmp6->icmp6_type) {
case ICMP6_DST_UNREACH:
switch (code) {
case ICMP6_DST_UNREACH_NOROUTE:
code = PRC_UNREACH_NET;
break;
case ICMP6_DST_UNREACH_ADMIN:
code = PRC_UNREACH_PROTOCOL; /* is this a good code? */
break;
case ICMP6_DST_UNREACH_ADDR:
code = PRC_HOSTDEAD;
break;
case ICMP6_DST_UNREACH_BEYONDSCOPE:
/* I mean "source address was incorrect." */
code = PRC_PARAMPROB;
break;
case ICMP6_DST_UNREACH_NOPORT:
code = PRC_UNREACH_PORT;
break;
default:
goto badcode;
}
goto deliver;
case ICMP6_PACKET_TOO_BIG:
/* MTU is checked in icmp6_mtudisc_update. */
code = PRC_MSGSIZE;
/*
* Updating the path MTU will be done after examining
* intermediate extension headers.
*/
goto deliver;
case ICMP6_TIME_EXCEEDED:
switch (code) {
case ICMP6_TIME_EXCEED_TRANSIT:
code = PRC_TIMXCEED_INTRANS;
break;
case ICMP6_TIME_EXCEED_REASSEMBLY:
code = PRC_TIMXCEED_REASS;
break;
default:
goto badcode;
}
goto deliver;
case ICMP6_PARAM_PROB:
switch (code) {
case ICMP6_PARAMPROB_NEXTHEADER:
code = PRC_UNREACH_PROTOCOL;
break;
case ICMP6_PARAMPROB_HEADER:
case ICMP6_PARAMPROB_OPTION:
code = PRC_PARAMPROB;
break;
default:
goto badcode;
}
goto deliver;
case ICMP6_ECHO_REQUEST:
if (code != 0)
goto badcode;
/*
* Copy mbuf to send to two data paths: userland socket(s),
* and to the querier (echo reply).
* m: a copy for socket, n: a copy for querier
*/
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* Give up local */
n = m;
m = *mp = NULL;
goto deliverecho;
}
/*
* If the first mbuf is shared, or the first mbuf is too short,
* copy the first part of the data into a fresh mbuf.
* Otherwise, we will wrongly overwrite both copies.
*/
if ((n->m_flags & M_EXT) != 0 ||
n->m_len < off + sizeof(struct icmp6_hdr)) {
struct mbuf *n0 = n;
const int maxlen = sizeof(*nip6) + sizeof(*nicmp6);
/*
* Prepare an internal mbuf. m_pullup() doesn't
* always copy the length we specified.
*/
if (maxlen >= MCLBYTES) {
/* Give up remote */
m_freem(n0);
break;
}
MGETHDR(n, M_DONTWAIT, n0->m_type);
if (n && maxlen >= MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (n == NULL) {
/* Give up local */
m_freem(n0);
n = m;
m = *mp = NULL;
goto deliverecho;
}
M_MOVE_PKTHDR(n, n0);
/*
* Copy IPv6 and ICMPv6 only.
*/
nip6 = mtod(n, struct ip6_hdr *);
bcopy(ip6, nip6, sizeof(struct ip6_hdr));
nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
noff = sizeof(struct ip6_hdr);
n->m_len = noff + sizeof(struct icmp6_hdr);
/*
* Adjust mbuf. ip6_plen will be adjusted in
* ip6_output().
* n->m_pkthdr.len == n0->m_pkthdr.len at this point.
*/
n->m_pkthdr.len += noff + sizeof(struct icmp6_hdr);
n->m_pkthdr.len -= (off + sizeof(struct icmp6_hdr));
m_adj(n0, off + sizeof(struct icmp6_hdr));
n->m_next = n0;
} else {
deliverecho:
IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off,
sizeof(*nicmp6));
noff = off;
}
if (n) {
nicmp6->icmp6_type = ICMP6_ECHO_REPLY;
nicmp6->icmp6_code = 0;
icmp6stat_inc(icp6s_reflect);
icmp6stat_inc(icp6s_outhist + ICMP6_ECHO_REPLY);
if (!icmp6_reflect(&n, noff, NULL))
ip6_send(n);
}
if (!m)
goto freeit;
break;
case ICMP6_ECHO_REPLY:
if (code != 0)
goto badcode;
break;
case MLD_LISTENER_QUERY:
case MLD_LISTENER_REPORT:
if (icmp6len < sizeof(struct mld_hdr))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
mld6_input(m, off);
m = NULL;
goto freeit;
}
mld6_input(n, off);
/* m stays. */
break;
case MLD_LISTENER_DONE:
if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */
goto badlen;
break; /* nothing to be done in kernel */
case MLD_MTRACE_RESP:
case MLD_MTRACE:
/* XXX: these two are experimental. not officially defined. */
/* XXX: per-interface statistics? */
break; /* just pass it to applications */
case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */
/* IPv6 Node Information Queries are not supported */
break;
case ICMP6_WRUREPLY:
break;
case ND_ROUTER_SOLICIT:
case ND_ROUTER_ADVERT:
if (code != 0)
goto badcode;
if ((icmp6->icmp6_type == ND_ROUTER_SOLICIT && icmp6len <
sizeof(struct nd_router_solicit)) ||
(icmp6->icmp6_type == ND_ROUTER_ADVERT && icmp6len <
sizeof(struct nd_router_advert)))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
nd6_rtr_cache(m, off, icmp6len,
icmp6->icmp6_type);
m = NULL;
goto freeit;
}
nd6_rtr_cache(n, off, icmp6len, icmp6->icmp6_type);
/* m stays. */
break;
case ND_NEIGHBOR_SOLICIT:
if (code != 0)
goto badcode;
if (icmp6len < sizeof(struct nd_neighbor_solicit))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
nd6_ns_input(m, off, icmp6len);
m = NULL;
goto freeit;
}
nd6_ns_input(n, off, icmp6len);
/* m stays. */
break;
case ND_NEIGHBOR_ADVERT:
if (code != 0)
goto badcode;
if (icmp6len < sizeof(struct nd_neighbor_advert))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
nd6_na_input(m, off, icmp6len);
m = NULL;
goto freeit;
}
nd6_na_input(n, off, icmp6len);
/* m stays. */
break;
case ND_REDIRECT:
if (code != 0)
goto badcode;
if (icmp6len < sizeof(struct nd_redirect))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
icmp6_redirect_input(m, off);
m = NULL;
goto freeit;
}
icmp6_redirect_input(n, off);
/* m stays. */
break;
case ICMP6_ROUTER_RENUMBERING:
if (code != ICMP6_ROUTER_RENUMBERING_COMMAND &&
code != ICMP6_ROUTER_RENUMBERING_RESULT)
goto badcode;
if (icmp6len < sizeof(struct icmp6_router_renum))
goto badlen;
break;
default:
nd6log((LOG_DEBUG,
"icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%u)\n",
icmp6->icmp6_type,
inet_ntop(AF_INET6, &ip6->ip6_src, src, sizeof(src)),
inet_ntop(AF_INET6, &ip6->ip6_dst, dst, sizeof(dst)),
m->m_pkthdr.ph_ifidx));
if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) {
/* ICMPv6 error: MUST deliver it by spec... */
code = PRC_NCMDS;
/* deliver */
} else {
/* ICMPv6 informational: MUST not deliver */
break;
}
deliver:
if (icmp6_notify_error(m, off, icmp6len, code)) {
/* In this case, m should've been freed. */
return (IPPROTO_DONE);
}
break;
badcode:
icmp6stat_inc(icp6s_badcode);
break;
badlen:
icmp6stat_inc(icp6s_badlen);
break;
}
#if NPF > 0
raw:
#endif
/* deliver the packet to appropriate sockets */
return rip6_input(mp, offp, proto, af);
freeit:
m_freem(m);
return IPPROTO_DONE;
}
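/*
 * Examine the packet embedded in an ICMPv6 error, determine its final
 * destination and upper layer protocol, and hand the error to that
 * protocol's pr_ctlinput handler.  Returns 0 on success; a non-zero
 * return means the mbuf has been consumed.
 */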
int
icmp6_notify_error(struct mbuf *m, int off, int icmp6len, int code)
{
struct icmp6_hdr *icmp6;
struct ip6_hdr *eip6;
u_int32_t notifymtu;
struct sockaddr_in6 icmp6src, icmp6dst;
if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) {
icmp6stat_inc(icp6s_tooshort);
goto freeit;
}
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off,
sizeof(*icmp6) + sizeof(struct ip6_hdr));
if (icmp6 == NULL) {
icmp6stat_inc(icp6s_tooshort);
return (-1);
}
eip6 = (struct ip6_hdr *)(icmp6 + 1);
/* Detect the upper level protocol */
{
void (*ctlfunc)(int, struct sockaddr *, u_int, void *);
u_int8_t nxt = eip6->ip6_nxt;
int eoff = off + sizeof(struct icmp6_hdr) +
sizeof(struct ip6_hdr);
struct ip6ctlparam ip6cp;
struct in6_addr *finaldst = NULL;
int icmp6type = icmp6->icmp6_type;
struct ip6_frag *fh;
struct ip6_rthdr *rth;
struct ip6_rthdr0 *rth0;
int rthlen;
while (1) { /* XXX: should avoid infinite loop explicitly? */
struct ip6_ext *eh;
switch (nxt) {
case IPPROTO_HOPOPTS:
case IPPROTO_DSTOPTS:
case IPPROTO_AH:
IP6_EXTHDR_GET(eh, struct ip6_ext *, m,
eoff, sizeof(*eh));
if (eh == NULL) {
icmp6stat_inc(icp6s_tooshort);
return (-1);
}
if (nxt == IPPROTO_AH)
eoff += (eh->ip6e_len + 2) << 2;
else
eoff += (eh->ip6e_len + 1) << 3;
nxt = eh->ip6e_nxt;
break;
case IPPROTO_ROUTING:
/*
* When the erroneous packet contains a
* routing header, we should examine the
* header to determine the final destination.
* Otherwise, we can't properly update
* information that depends on the final
* destination (e.g. path MTU).
*/
IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m,
eoff, sizeof(*rth));
if (rth == NULL) {
icmp6stat_inc(icp6s_tooshort);
return (-1);
}
rthlen = (rth->ip6r_len + 1) << 3;
/*
* XXX: currently there is no
* officially defined type other
* than type-0.
* Note that if the segment left field
* is 0, all intermediate hops must
* have been passed.
*/
if (rth->ip6r_segleft &&
rth->ip6r_type == IPV6_RTHDR_TYPE_0) {
int hops;
IP6_EXTHDR_GET(rth0,
struct ip6_rthdr0 *, m,
eoff, rthlen);
if (rth0 == NULL) {
icmp6stat_inc(icp6s_tooshort);
return (-1);
}
/* just ignore a bogus header */
if ((rth0->ip6r0_len % 2) == 0 &&
(hops = rth0->ip6r0_len/2))
finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1);
}
eoff += rthlen;
nxt = rth->ip6r_nxt;
break;
case IPPROTO_FRAGMENT:
IP6_EXTHDR_GET(fh, struct ip6_frag *, m,
eoff, sizeof(*fh));
if (fh == NULL) {
icmp6stat_inc(icp6s_tooshort);
return (-1);
}
/*
* Data after a fragment header is meaningless
* unless it is the first fragment, but
* we'll go to the notify label for path MTU
* discovery.
*/
if (fh->ip6f_offlg & IP6F_OFF_MASK)
goto notify;
eoff += sizeof(struct ip6_frag);
nxt = fh->ip6f_nxt;
break;
default:
/*
* This case includes ESP and the No Next
* Header. In such cases going to the notify
* label does not have any meaning
* (i.e. ctlfunc will be NULL), but we go
* anyway since we might have to update
* path MTU information.
*/
goto notify;
}
}
notify:
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off,
sizeof(*icmp6) + sizeof(struct ip6_hdr));
if (icmp6 == NULL) {
icmp6stat_inc(icp6s_tooshort);
return (-1);
}
eip6 = (struct ip6_hdr *)(icmp6 + 1);
bzero(&icmp6dst, sizeof(icmp6dst));
icmp6dst.sin6_len = sizeof(struct sockaddr_in6);
icmp6dst.sin6_family = AF_INET6;
if (finaldst == NULL)
icmp6dst.sin6_addr = eip6->ip6_dst;
else
icmp6dst.sin6_addr = *finaldst;
icmp6dst.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.ph_ifidx,
&icmp6dst.sin6_addr);
if (in6_embedscope(&icmp6dst.sin6_addr, &icmp6dst, NULL)) {
/* should be impossible */
nd6log((LOG_DEBUG,
"icmp6_notify_error: in6_embedscope failed\n"));
goto freeit;
}
/*
* retrieve parameters from the inner IPv6 header, and convert
* them into sockaddr structures.
*/
bzero(&icmp6src, sizeof(icmp6src));
icmp6src.sin6_len = sizeof(struct sockaddr_in6);
icmp6src.sin6_family = AF_INET6;
icmp6src.sin6_addr = eip6->ip6_src;
icmp6src.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.ph_ifidx,
&icmp6src.sin6_addr);
if (in6_embedscope(&icmp6src.sin6_addr, &icmp6src, NULL)) {
/* should be impossible */
nd6log((LOG_DEBUG,
"icmp6_notify_error: in6_embedscope failed\n"));
goto freeit;
}
icmp6src.sin6_flowinfo =
(eip6->ip6_flow & IPV6_FLOWLABEL_MASK);
if (finaldst == NULL)
finaldst = &eip6->ip6_dst;
ip6cp.ip6c_m = m;
ip6cp.ip6c_icmp6 = icmp6;
ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1);
ip6cp.ip6c_off = eoff;
ip6cp.ip6c_finaldst = finaldst;
ip6cp.ip6c_src = &icmp6src;
ip6cp.ip6c_nxt = nxt;
#if NPF > 0
pf_pkt_addr_changed(m);
#endif
if (icmp6type == ICMP6_PACKET_TOO_BIG) {
notifymtu = ntohl(icmp6->icmp6_mtu);
ip6cp.ip6c_cmdarg = (void *)&notifymtu;
}
ctlfunc = inet6sw[ip6_protox[nxt]].pr_ctlinput;
if (ctlfunc)
(*ctlfunc)(code, sin6tosa(&icmp6dst),
m->m_pkthdr.ph_rtableid, &ip6cp);
}
return (0);
freeit:
m_freem(m);
return (-1);
}
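/*
 * Handle the MTU reported by a Packet Too Big message: apply the
 * hiwat/lowat limits on dynamic routes, clone a host route to the
 * final destination if necessary, record the smaller MTU and notify
 * the registered path MTU discovery callbacks.
 */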
void
icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated)
{
unsigned long rtcount;
struct icmp6_mtudisc_callback *mc;
struct in6_addr *dst = ip6cp->ip6c_finaldst;
struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6;
struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */
u_int mtu = ntohl(icmp6->icmp6_mtu);
struct rtentry *rt = NULL;
struct sockaddr_in6 sin6;
if (mtu < IPV6_MMTU)
return;
/*
* allow non-validated cases if memory is plentiful, to keep traffic
* from non-connected pcb happy.
*/
rtcount = rt_timer_queue_count(&icmp6_mtudisc_timeout_q);
if (validated) {
if (0 <= icmp6_mtudisc_hiwat && rtcount > icmp6_mtudisc_hiwat)
return;
else if (0 <= icmp6_mtudisc_lowat &&
rtcount > icmp6_mtudisc_lowat) {
/*
* XXX nuke a victim, install the new one.
*/
}
} else {
if (0 <= icmp6_mtudisc_lowat && rtcount > icmp6_mtudisc_lowat)
return;
}
bzero(&sin6, sizeof(sin6));
sin6.sin6_family = PF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_addr = *dst;
/* XXX normally, this won't happen */
if (IN6_IS_ADDR_LINKLOCAL(dst)) {
sin6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.ph_ifidx);
}
sin6.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.ph_ifidx,
&sin6.sin6_addr);
rt = icmp6_mtudisc_clone(&sin6, m->m_pkthdr.ph_rtableid, 0);
if (rt != NULL && ISSET(rt->rt_flags, RTF_HOST) &&
!(rt->rt_locks & RTV_MTU) &&
(rt->rt_mtu > mtu || rt->rt_mtu == 0)) {
struct ifnet *ifp;
ifp = if_get(rt->rt_ifidx);
if (ifp != NULL && mtu < ifp->if_mtu) {
icmp6stat_inc(icp6s_pmtuchg);
rt->rt_mtu = mtu;
}
if_put(ifp);
}
rtfree(rt);
/*
* Notify protocols that the MTU for this destination
* has changed.
*/
LIST_FOREACH(mc, &icmp6_mtudisc_callbacks, mc_list)
(*mc->mc_func)(&sin6, m->m_pkthdr.ph_rtableid);
}
/*
* Reflect the ip6 packet back to the source.
* OFF points to the icmp6 header, counted from the top of the mbuf.
*/
int
icmp6_reflect(struct mbuf **mp, size_t off, struct sockaddr *sa)
{
struct mbuf *m = *mp;
struct rtentry *rt = NULL;
struct ip6_hdr *ip6;
struct icmp6_hdr *icmp6;
struct in6_addr t, *src = NULL;
struct sockaddr_in6 sa6_src, sa6_dst;
u_int rtableid;
u_int8_t pfflags;
CTASSERT(sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) <= MHLEN);
/* too short to reflect */
if (off < sizeof(struct ip6_hdr)) {
nd6log((LOG_DEBUG,
"sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n",
(u_long)off, (u_long)sizeof(struct ip6_hdr),
__FILE__, __LINE__));
goto bad;
}
if (m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) {
m_freemp(mp);
return (ELOOP);
}
rtableid = m->m_pkthdr.ph_rtableid;
pfflags = m->m_pkthdr.pf.flags;
m_resethdr(m);
m->m_pkthdr.ph_rtableid = rtableid;
m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED;
/*
* If there are extra headers between IPv6 and ICMPv6, strip
* off that header first.
*/
if (off > sizeof(struct ip6_hdr)) {
size_t l;
struct ip6_hdr nip6;
l = off - sizeof(struct ip6_hdr);
m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6);
m_adj(m, l);
l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
if (m->m_len < l) {
if ((m = *mp = m_pullup(m, l)) == NULL)
return (EMSGSIZE);
}
memcpy(mtod(m, caddr_t), &nip6, sizeof(nip6));
} else /* off == sizeof(struct ip6_hdr) */ {
size_t l;
l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
if (m->m_len < l) {
if ((m = *mp = m_pullup(m, l)) == NULL)
return (EMSGSIZE);
}
}
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_nxt = IPPROTO_ICMPV6;
icmp6 = (struct icmp6_hdr *)(ip6 + 1);
t = ip6->ip6_dst;
/*
* ip6_input() drops a packet if its src is multicast.
* So, the src is never multicast.
*/
ip6->ip6_dst = ip6->ip6_src;
/*
* XXX: make sure to embed scope zone information, using
* already embedded IDs or the received interface (if any).
* Note that rcvif may be NULL.
* TODO: scoped routing case (XXX).
*/
bzero(&sa6_src, sizeof(sa6_src));
sa6_src.sin6_family = AF_INET6;
sa6_src.sin6_len = sizeof(sa6_src);
sa6_src.sin6_addr = ip6->ip6_dst;
bzero(&sa6_dst, sizeof(sa6_dst));
sa6_dst.sin6_family = AF_INET6;
sa6_dst.sin6_len = sizeof(sa6_dst);
sa6_dst.sin6_addr = t;
if (sa == NULL) {
/*
* If the incoming packet was addressed directly to us (i.e.
* unicast), use dst as the src for the reply. The
* IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED case would be VERY rare,
* but is possible (for example) when we encounter an error
* while forwarding a packet destined to a duplicated address
* of ours.
*/
rt = rtalloc(sin6tosa(&sa6_dst), 0, rtableid);
if (rtisvalid(rt) && ISSET(rt->rt_flags, RTF_LOCAL) &&
!ISSET(ifatoia6(rt->rt_ifa)->ia6_flags,
IN6_IFF_ANYCAST|IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED)) {
src = &t;
}
rtfree(rt);
rt = NULL;
sa = sin6tosa(&sa6_src);
}
if (src == NULL) {
struct in6_ifaddr *ia6;
/*
* This case matches to multicasts, our anycast, or unicasts
* that we do not own. Select a source address based on the
* source address of the erroneous packet.
*/
rt = rtalloc(sa, RT_RESOLVE, rtableid);
if (!rtisvalid(rt)) {
char addr[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG,
"%s: source can't be determined: dst=%s\n",
__func__, inet_ntop(AF_INET6, &sa6_src.sin6_addr,
addr, sizeof(addr))));
rtfree(rt);
goto bad;
}
ia6 = in6_ifawithscope(rt->rt_ifa->ifa_ifp, &t, rtableid);
if (ia6 != NULL)
src = &ia6->ia_addr.sin6_addr;
if (src == NULL)
src = &ifatoia6(rt->rt_ifa)->ia_addr.sin6_addr;
}
ip6->ip6_src = *src;
rtfree(rt);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = ip6_defhlim;
icmp6->icmp6_cksum = 0;
m->m_pkthdr.csum_flags = M_ICMP_CSUM_OUT;
/*
* XXX option handling
*/
m->m_flags &= ~(M_BCAST|M_MCAST);
return (0);
bad:
m_freemp(mp);
return (EHOSTUNREACH);
}
void
icmp6_fasttimo(void)
{
mld6_fasttimeo();
}
const char *
icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6,
struct in6_addr *tgt6)
{
static char buf[1024]; /* XXX */
char src[INET6_ADDRSTRLEN];
char dst[INET6_ADDRSTRLEN];
char tgt[INET6_ADDRSTRLEN];
snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)",
inet_ntop(AF_INET6, src6, src, sizeof(src)),
inet_ntop(AF_INET6, dst6, dst, sizeof(dst)),
inet_ntop(AF_INET6, tgt6, tgt, sizeof(tgt)));
return buf;
}
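/*
 * Validate a received ND Redirect and, if acceptable, update the
 * neighbor cache, install a redirected route for the better-router
 * case and notify sockets via pfctlinput().
 */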
void
icmp6_redirect_input(struct mbuf *m, int off)
{
struct ifnet *ifp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_redirect *nd_rd;
int icmp6len = ntohs(ip6->ip6_plen);
char *lladdr = NULL;
int lladdrlen = 0;
struct rtentry *rt = NULL;
int is_router;
int is_onlink;
struct in6_addr src6 = ip6->ip6_src;
struct in6_addr redtgt6;
struct in6_addr reddst6;
union nd_opts ndopts;
char addr[INET6_ADDRSTRLEN];
ifp = if_get(m->m_pkthdr.ph_ifidx);
if (ifp == NULL)
return;
/* XXX if we are router, we don't update route by icmp6 redirect */
if (ip6_forwarding)
goto freeit;
if (!(ifp->if_xflags & IFXF_AUTOCONF6))
goto freeit;
IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len);
if (nd_rd == NULL) {
icmp6stat_inc(icp6s_tooshort);
if_put(ifp);
return;
}
redtgt6 = nd_rd->nd_rd_target;
reddst6 = nd_rd->nd_rd_dst;
if (IN6_IS_ADDR_LINKLOCAL(&redtgt6))
redtgt6.s6_addr16[1] = htons(ifp->if_index);
if (IN6_IS_ADDR_LINKLOCAL(&reddst6))
reddst6.s6_addr16[1] = htons(ifp->if_index);
/* validation */
if (!IN6_IS_ADDR_LINKLOCAL(&src6)) {
nd6log((LOG_ERR,
"ICMP6 redirect sent from %s rejected; "
"must be from linklocal\n",
inet_ntop(AF_INET6, &src6, addr, sizeof(addr))));
goto bad;
}
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"ICMP6 redirect sent from %s rejected; "
"hlim=%d (must be 255)\n",
inet_ntop(AF_INET6, &src6, addr, sizeof(addr)),
ip6->ip6_hlim));
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&reddst6)) {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; "
"redirect dst must be unicast: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
goto bad;
}
{
/* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */
struct sockaddr_in6 sin6;
struct in6_addr *gw6;
bzero(&sin6, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
memcpy(&sin6.sin6_addr, &reddst6, sizeof(reddst6));
rt = rtalloc(sin6tosa(&sin6), 0, m->m_pkthdr.ph_rtableid);
if (rt) {
if (rt->rt_gateway == NULL ||
rt->rt_gateway->sa_family != AF_INET6) {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; no route "
"with inet6 gateway found for redirect dst: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
rtfree(rt);
goto bad;
}
gw6 = &(satosin6(rt->rt_gateway)->sin6_addr);
if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; "
"not equal to gw-for-src=%s (must be same): "
"%s\n",
inet_ntop(AF_INET6, gw6, addr, sizeof(addr)),
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
rtfree(rt);
goto bad;
}
} else {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; "
"no route found for redirect dst: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
goto bad;
}
rtfree(rt);
rt = NULL;
}
is_router = is_onlink = 0;
if (IN6_IS_ADDR_LINKLOCAL(&redtgt6))
is_router = 1; /* router case */
if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0)
is_onlink = 1; /* on-link destination case */
if (!is_router && !is_onlink) {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; "
"neither router case nor onlink case: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
goto bad;
}
/* validation passed */
icmp6len -= sizeof(*nd_rd);
nd6_option_init(nd_rd + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO, "icmp6_redirect_input: "
"invalid ND option, rejected: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
/* nd6_options have incremented stats */
goto freeit;
}
if (ndopts.nd_opts_tgt_lladdr) {
lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
}
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO,
"icmp6_redirect_input: lladdrlen mismatch for %s "
"(if %d, icmp6 packet %d): %s\n",
inet_ntop(AF_INET6, &redtgt6, addr, sizeof(addr)),
ifp->if_addrlen, lladdrlen - 2,
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
goto bad;
}
/* RFC 2461 8.3 */
nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT,
is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER);
if (!is_onlink) { /* better router case. perform rtredirect. */
/* perform rtredirect */
struct sockaddr_in6 sdst;
struct sockaddr_in6 sgw;
struct sockaddr_in6 ssrc;
unsigned long rtcount;
struct rtentry *newrt = NULL;
/*
* do not install redirect route, if the number of entries
* is too much (> hiwat). note that, the node (= host) will
* work just fine even if we do not install redirect route
* (there will be additional hops, though).
*/
rtcount = rt_timer_queue_count(&icmp6_redirect_timeout_q);
if (0 <= ip6_maxdynroutes && rtcount >= ip6_maxdynroutes)
goto freeit;
else if (0 <= icmp6_redirect_lowat &&
rtcount > icmp6_redirect_lowat) {
/*
* XXX nuke a victim, install the new one.
*/
}
bzero(&sdst, sizeof(sdst));
bzero(&sgw, sizeof(sgw));
bzero(&ssrc, sizeof(ssrc));
sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6;
sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len =
sizeof(struct sockaddr_in6);
memcpy(&sgw.sin6_addr, &redtgt6, sizeof(struct in6_addr));
memcpy(&sdst.sin6_addr, &reddst6, sizeof(struct in6_addr));
memcpy(&ssrc.sin6_addr, &src6, sizeof(struct in6_addr));
rtredirect(sin6tosa(&sdst), sin6tosa(&sgw), sin6tosa(&ssrc),
&newrt, m->m_pkthdr.ph_rtableid);
if (newrt != NULL && icmp6_redirtimeout > 0) {
rt_timer_add(newrt, &icmp6_redirect_timeout_q,
m->m_pkthdr.ph_rtableid);
}
rtfree(newrt);
}
/* finally update cached route in each socket via pfctlinput */
{
struct sockaddr_in6 sdst;
bzero(&sdst, sizeof(sdst));
sdst.sin6_family = AF_INET6;
sdst.sin6_len = sizeof(struct sockaddr_in6);
memcpy(&sdst.sin6_addr, &reddst6, sizeof(struct in6_addr));
pfctlinput(PRC_REDIRECT_HOST, sin6tosa(&sdst));
}
freeit:
if_put(ifp);
m_freem(m);
return;
bad:
if_put(ifp);
icmp6stat_inc(icp6s_badredirect);
m_freem(m);
}
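/*
 * Send an ND Redirect for the packet m0 being forwarded via rt,
 * advising the sender of a better first hop (RFC 2461, section 8.2).
 */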
void
icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt)
{
struct ifnet *ifp = NULL;
struct in6_addr *ifp_ll6;
struct in6_addr *nexthop;
struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */
struct mbuf *m = NULL; /* newly allocated one */
struct ip6_hdr *ip6; /* m as struct ip6_hdr */
struct nd_redirect *nd_rd;
size_t maxlen;
u_char *p;
struct sockaddr_in6 src_sa;
icmp6_errcount(ND_REDIRECT, 0);
/* if we are not router, we don't send icmp6 redirect */
if (!ip6_forwarding)
goto fail;
/* sanity check */
if (m0 == NULL || !rtisvalid(rt))
goto fail;
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL)
goto fail;
/*
* Address check:
* the source address must identify a neighbor, and
* the destination address must not be a multicast address
* [RFC 2461, sec 8.2]
*/
sip6 = mtod(m0, struct ip6_hdr *);
bzero(&src_sa, sizeof(src_sa));
src_sa.sin6_family = AF_INET6;
src_sa.sin6_len = sizeof(src_sa);
src_sa.sin6_addr = sip6->ip6_src;
/* we don't currently use sin6_scope_id, but will eventually use it */
src_sa.sin6_scope_id = in6_addr2scopeid(ifp->if_index, &sip6->ip6_src);
if (nd6_is_addr_neighbor(&src_sa, ifp) == 0)
goto fail;
if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst))
goto fail; /* what should we do here? */
/* rate limit */
if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0))
goto fail;
/*
* Since we are going to append up to 1280 bytes (= IPV6_MMTU),
* we almost always ask for an mbuf cluster for simplicity.
* (MHLEN < IPV6_MMTU is almost always true)
*/
#if IPV6_MMTU >= MCLBYTES
# error assumption failed about IPV6_MMTU and MCLBYTES
#endif
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m && IPV6_MMTU >= MHLEN)
MCLGET(m, M_DONTWAIT);
if (!m)
goto fail;
m->m_pkthdr.ph_ifidx = 0;
m->m_len = 0;
maxlen = m_trailingspace(m);
maxlen = min(IPV6_MMTU, maxlen);
/* just for safety */
if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) +
((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) {
goto fail;
}
{
/* get ip6 linklocal address for ifp(my outgoing interface). */
struct in6_ifaddr *ia6;
if ((ia6 = in6ifa_ifpforlinklocal(ifp, IN6_IFF_TENTATIVE|
IN6_IFF_DUPLICATED|IN6_IFF_ANYCAST)) == NULL)
goto fail;
ifp_ll6 = &ia6->ia_addr.sin6_addr;
}
/* get ip6 linklocal address for the router. */
if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) {
struct sockaddr_in6 *sin6;
sin6 = satosin6(rt->rt_gateway);
nexthop = &sin6->sin6_addr;
if (!IN6_IS_ADDR_LINKLOCAL(nexthop))
nexthop = NULL;
} else
nexthop = NULL;
/* ip6 */
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6->ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
/* ip6->ip6_src must be linklocal addr for my outgoing if. */
bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr));
bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr));
/* ND Redirect */
nd_rd = (struct nd_redirect *)(ip6 + 1);
nd_rd->nd_rd_type = ND_REDIRECT;
nd_rd->nd_rd_code = 0;
nd_rd->nd_rd_reserved = 0;
if (rt->rt_flags & RTF_GATEWAY) {
/*
* nd_rd->nd_rd_target must be a link-local address in
* better router cases.
*/
if (!nexthop)
goto fail;
bcopy(nexthop, &nd_rd->nd_rd_target,
sizeof(nd_rd->nd_rd_target));
bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
sizeof(nd_rd->nd_rd_dst));
} else {
/* make sure redtgt == reddst */
nexthop = &sip6->ip6_dst;
bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target,
sizeof(nd_rd->nd_rd_target));
bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
sizeof(nd_rd->nd_rd_dst));
}
p = (u_char *)(nd_rd + 1);
{
/* target lladdr option */
struct rtentry *nrt;
int len;
struct sockaddr_dl *sdl;
struct nd_opt_hdr *nd_opt;
char *lladdr;
len = sizeof(*nd_opt) + ifp->if_addrlen;
len = (len + 7) & ~7; /* round by 8 */
/* safety check */
if (len + (p - (u_char *)ip6) > maxlen)
goto nolladdropt;
nrt = nd6_lookup(nexthop, 0, ifp, ifp->if_rdomain);
if ((nrt != NULL) &&
(nrt->rt_flags & (RTF_GATEWAY|RTF_LLINFO)) == RTF_LLINFO &&
(nrt->rt_gateway->sa_family == AF_LINK) &&
(sdl = satosdl(nrt->rt_gateway)) &&
sdl->sdl_alen) {
nd_opt = (struct nd_opt_hdr *)p;
nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
nd_opt->nd_opt_len = len >> 3;
lladdr = (char *)(nd_opt + 1);
bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen);
p += len;
}
rtfree(nrt);
}
nolladdropt:;
m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
/* just to be safe */
if (p - (u_char *)ip6 > maxlen)
goto noredhdropt;
{
/* redirected header option */
int len;
struct nd_opt_rd_hdr *nd_opt_rh;
/*
* compute the maximum size for icmp6 redirect header option.
* XXX room for auth header?
*/
len = maxlen - (p - (u_char *)ip6);
len &= ~7;
/*
* Redirected header option spec (RFC2461 4.6.3) talks nothing
* about padding/truncate rule for the original IP packet.
* From the discussion on IPv6imp in Feb 1999,
* the consensus was:
* - "attach as much as possible" is the goal
* - pad if not aligned (original size can be guessed by
* original ip6 header)
* Following code adds the padding if it is simple enough,
* and truncates if not.
*/
if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) {
/* not enough room, truncate */
m_adj(m0, (len - sizeof(*nd_opt_rh)) -
m0->m_pkthdr.len);
} else {
/*
* enough room, truncate if not aligned.
* we don't pad here for simplicity.
*/
size_t extra;
extra = m0->m_pkthdr.len % 8;
if (extra) {
/* truncate */
m_adj(m0, -extra);
}
len = m0->m_pkthdr.len + sizeof(*nd_opt_rh);
}
nd_opt_rh = (struct nd_opt_rd_hdr *)p;
bzero(nd_opt_rh, sizeof(*nd_opt_rh));
nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER;
nd_opt_rh->nd_opt_rh_len = len >> 3;
p += sizeof(*nd_opt_rh);
m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
/* connect m0 to m */
m->m_pkthdr.len += m0->m_pkthdr.len;
m_cat(m, m0);
m0 = NULL;
}
noredhdropt:
m_freem(m0);
m0 = NULL;
sip6 = mtod(m, struct ip6_hdr *);
if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_src))
sip6->ip6_src.s6_addr16[1] = 0;
if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_dst))
sip6->ip6_dst.s6_addr16[1] = 0;
#if 0
if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src))
ip6->ip6_src.s6_addr16[1] = 0;
if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst))
ip6->ip6_dst.s6_addr16[1] = 0;
#endif
if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_target))
nd_rd->nd_rd_target.s6_addr16[1] = 0;
if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_dst))
nd_rd->nd_rd_dst.s6_addr16[1] = 0;
ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
nd_rd->nd_rd_cksum = 0;
m->m_pkthdr.csum_flags = M_ICMP_CSUM_OUT;
/* send the packet to outside... */
ip6_output(m, NULL, NULL, 0, NULL, NULL);
icmp6stat_inc(icp6s_outhist + ND_REDIRECT);
if_put(ifp);
return;
fail:
if_put(ifp);
m_freem(m);
m_freem(m0);
}
/*
* ICMPv6 socket option processing.
*/
int
icmp6_ctloutput(int op, struct socket *so, int level, int optname,
struct mbuf *m)
{
int error = 0;
struct inpcb *in6p = sotoinpcb(so);
if (level != IPPROTO_ICMPV6)
return EINVAL;
switch (op) {
case PRCO_SETOPT:
switch (optname) {
case ICMP6_FILTER:
{
struct icmp6_filter *p;
if (m == NULL || m->m_len != sizeof(*p)) {
error = EMSGSIZE;
break;
}
p = mtod(m, struct icmp6_filter *);
if (!p || !in6p->inp_icmp6filt) {
error = EINVAL;
break;
}
bcopy(p, in6p->inp_icmp6filt,
sizeof(struct icmp6_filter));
error = 0;
break;
}
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (optname) {
case ICMP6_FILTER:
{
struct icmp6_filter *p;
if (!in6p->inp_icmp6filt) {
error = EINVAL;
break;
}
m->m_len = sizeof(struct icmp6_filter);
p = mtod(m, struct icmp6_filter *);
bcopy(in6p->inp_icmp6filt, p,
sizeof(struct icmp6_filter));
error = 0;
break;
}
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
/*
* Perform rate limit check.
* Returns 0 if it is okay to send the icmp6 packet.
* Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate
* limitation.
*
* XXX per-destination/type check necessary?
*
* dst - not used at this moment
* type - not used at this moment
* code - not used at this moment
*/
int
icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code)
{
/* PPS limit */
if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count,
icmp6errppslim))
return 1; /* The packet is subject to rate limit */
return 0; /* okay to send */
}
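/*
 * Return a host route to dst suitable for storing path MTU state,
 * cloning a dynamic host route from a less specific one if necessary,
 * and arm the PMTU expiry timer on it.
 */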
struct rtentry *
icmp6_mtudisc_clone(struct sockaddr_in6 *dst, u_int rtableid, int ipsec)
{
struct rtentry *rt;
int error;
rt = rtalloc(sin6tosa(dst), RT_RESOLVE, rtableid);
/* Check if the route is actually usable */
if (!rtisvalid(rt))
goto bad;
/* IPsec needs the route only for PMTU, it can use reject for that */
if (!ipsec && (rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)))
goto bad;
/*
* No PMTU for local routes and permanent neighbors,
* ARP and NDP use the same expire timer as the route.
*/
if (ISSET(rt->rt_flags, RTF_LOCAL) ||
(ISSET(rt->rt_flags, RTF_LLINFO) && rt->rt_expire == 0))
goto bad;
/* If we didn't get a host route, allocate one */
if ((rt->rt_flags & RTF_HOST) == 0) {
struct rtentry *nrt;
struct rt_addrinfo info;
struct sockaddr_rtlabel sa_rl;
memset(&info, 0, sizeof(info));
info.rti_ifa = rt->rt_ifa;
info.rti_flags = RTF_GATEWAY | RTF_HOST | RTF_DYNAMIC;
info.rti_info[RTAX_DST] = sin6tosa(dst);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_LABEL] =
rtlabel_id2sa(rt->rt_labelid, &sa_rl);
error = rtrequest(RTM_ADD, &info, rt->rt_priority, &nrt,
rtableid);
if (error)
goto bad;
nrt->rt_rmx = rt->rt_rmx;
rtfree(rt);
rt = nrt;
rtm_send(rt, RTM_ADD, 0, rtableid);
}
error = rt_timer_add(rt, &icmp6_mtudisc_timeout_q, rtableid);
if (error)
goto bad;
return (rt);
bad:
rtfree(rt);
return (NULL);
}
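/*
 * Expiry of a PMTU host route: delete routes we cloned dynamically,
 * otherwise just forget the discovered MTU.
 */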
void
icmp6_mtudisc_timeout(struct rtentry *rt, u_int rtableid)
{
struct ifnet *ifp;
NET_ASSERT_LOCKED();
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL)
return;
if ((rt->rt_flags & (RTF_DYNAMIC|RTF_HOST)) == (RTF_DYNAMIC|RTF_HOST)) {
rtdeletemsg(rt, ifp, rtableid);
} else {
if (!(rt->rt_locks & RTV_MTU))
rt->rt_mtu = 0;
}
if_put(ifp);
}
const struct sysctl_bounded_args icmpv6ctl_vars[] = {
{ ICMPV6CTL_ND6_DELAY, &nd6_delay, 0, INT_MAX },
{ ICMPV6CTL_ND6_UMAXTRIES, &nd6_umaxtries, 0, INT_MAX },
{ ICMPV6CTL_ND6_MMAXTRIES, &nd6_mmaxtries, 0, INT_MAX },
{ ICMPV6CTL_ERRPPSLIMIT, &icmp6errppslim, -1, 1000 },
{ ICMPV6CTL_ND6_MAXNUDHINT, &nd6_maxnudhint, 0, INT_MAX },
{ ICMPV6CTL_MTUDISC_HIWAT, &icmp6_mtudisc_hiwat, -1, INT_MAX },
{ ICMPV6CTL_MTUDISC_LOWAT, &icmp6_mtudisc_lowat, -1, INT_MAX },
{ ICMPV6CTL_ND6_DEBUG, &nd6_debug, 0, 1 },
};
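/*
 * Read the per-CPU ICMPv6 counters into a struct icmp6stat snapshot
 * and export it read-only via sysctl.
 */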
int
icmp6_sysctl_icmp6stat(void *oldp, size_t *oldlenp, void *newp)
{
struct icmp6stat *icmp6stat;
int ret;
CTASSERT(sizeof(*icmp6stat) == icp6s_ncounters * sizeof(uint64_t));
icmp6stat = malloc(sizeof(*icmp6stat), M_TEMP, M_WAITOK|M_ZERO);
counters_read(icmp6counters, (uint64_t *)icmp6stat, icp6s_ncounters);
ret = sysctl_rdstruct(oldp, oldlenp, newp,
icmp6stat, sizeof(*icmp6stat));
free(icmp6stat, M_TEMP, sizeof(*icmp6stat));
return (ret);
}
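/*
 * Handle the ICMPv6 sysctl tree: the redirect timeout and statistics
 * nodes are special-cased, everything else is served from the bounded
 * integer table above.
 */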
int
icmp6_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
int error;
/* All sysctl names at this level are terminal. */
if (namelen != 1)
return (ENOTDIR);
switch (name[0]) {
case ICMPV6CTL_REDIRTIMEOUT:
NET_LOCK();
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&icmp6_redirtimeout, 0, INT_MAX);
rt_timer_queue_change(&icmp6_redirect_timeout_q,
icmp6_redirtimeout);
NET_UNLOCK();
break;
case ICMPV6CTL_STATS:
error = icmp6_sysctl_icmp6stat(oldp, oldlenp, newp);
break;
default:
NET_LOCK();
error = sysctl_bounded_arr(icmpv6ctl_vars,
nitems(icmpv6ctl_vars), name, namelen, oldp, oldlenp, newp,
newlen);
NET_UNLOCK();
break;
}
return (error);
}
/* $OpenBSD: uvm_unix.c,v 1.71 2020/10/21 21:24:57 deraadt Exp $ */
/* $NetBSD: uvm_unix.c,v 1.18 2000/09/13 15:00:25 thorpej Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993 The Regents of the University of California.
* Copyright (c) 1988 University of Utah.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_unix.c 1.1 89/11/07$
* @(#)vm_unix.c 8.1 (Berkeley) 6/11/93
* from: Id: uvm_unix.c,v 1.1.2.2 1997/08/25 18:52:30 chuck Exp
*/
/*
* uvm_unix.c: traditional sbrk/grow interface to vm.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm.h>
/*
* sys_obreak: set break
*/
int
sys_obreak(struct proc *p, void *v, register_t *retval)
{
struct sys_obreak_args /* {
syscallarg(char *) nsize;
} */ *uap = v;
struct vmspace *vm = p->p_vmspace;
vaddr_t new, old, base;
int error;
base = (vaddr_t)vm->vm_daddr;
new = round_page((vaddr_t)SCARG(uap, nsize));
if (new < base || (new - base) > lim_cur(RLIMIT_DATA))
return (ENOMEM);
old = round_page(base + ptoa(vm->vm_dsize));
if (new == old)
return (0);
/* grow or shrink? */
if (new > old) {
error = uvm_map(&vm->vm_map, &old, new - old, NULL,
UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE | PROT_EXEC, MAP_INHERIT_COPY,
MADV_NORMAL, UVM_FLAG_FIXED|UVM_FLAG_COPYONW));
if (error) {
uprintf("sbrk: grow %ld failed, error = %d\n",
new - old, error);
return (ENOMEM);
}
vm->vm_dsize += atop(new - old);
} else {
uvm_unmap(&vm->vm_map, new, old);
vm->vm_dsize -= atop(old - new);
}
return (0);
}
/*
* uvm_grow: enlarge the "stack segment" to include sp.
*/
void
uvm_grow(struct proc *p, vaddr_t sp)
{
struct vmspace *vm = p->p_vmspace;
vm_map_t map = &vm->vm_map;
int si;
/* For user defined stacks (from sendsig). */
if (sp < (vaddr_t)vm->vm_maxsaddr)
return;
#ifdef MACHINE_STACK_GROWS_UP
if (sp >= (vaddr_t)vm->vm_minsaddr)
return;
#endif
vm_map_lock(map);
/* For common case of already allocated (from trap). */
#ifdef MACHINE_STACK_GROWS_UP
if (sp < (vaddr_t)vm->vm_maxsaddr + ptoa(vm->vm_ssize))
#else
if (sp >= (vaddr_t)vm->vm_minsaddr - ptoa(vm->vm_ssize))
#endif
goto out;
/* Really need to check vs limit and increment stack size if ok. */
#ifdef MACHINE_STACK_GROWS_UP
si = atop(sp - (vaddr_t)vm->vm_maxsaddr) - vm->vm_ssize + 1;
#else
si = atop((vaddr_t)vm->vm_minsaddr - sp) - vm->vm_ssize;
#endif
if (vm->vm_ssize + si <= atop(lim_cur(RLIMIT_STACK)))
vm->vm_ssize += si;
out:
vm_map_unlock(map);
}
#ifndef SMALL_KERNEL
#define WALK_CHUNK 32
/*
* Not all the pages in an amap may be present. When dumping core,
* we don't want to force all the pages to be present: it's a waste
* of time and memory when we already know what they contain (zeros)
* and the ELF format at least can adequately represent them as a
* segment with memory size larger than its file size.
*
* So, we walk the amap with calls to amap_lookups() and scan the
* resulting pointers to find ranges of zero or more present pages
* followed by at least one absent page or the end of the amap.
* We then pass that range to the walk callback with 'start'
* pointing to the start of the present range, 'realend' pointing
* to the first absent page (or the end of the entry), and 'end'
* pointing to the page past the last absent page (or the end of
* the entry).
*
* Note that if the first page of the amap is empty then the callback
* must be invoked with 'start' == 'realend' so it can present that
* first range of absent pages.
*/
int
uvm_coredump_walk_amap(struct vm_map_entry *entry, int *nsegmentp,
uvm_coredump_walk_cb *walk, void *cookie)
{
struct vm_anon *anons[WALK_CHUNK];
vaddr_t pos, start, realend, end, entry_end;
vm_prot_t prot;
int nsegment, absent, npages, i, error;
prot = entry->protection;
nsegment = *nsegmentp;
start = entry->start;
entry_end = MIN(entry->end, VM_MAXUSER_ADDRESS);
absent = 0;
for (pos = start; pos < entry_end; pos += npages << PAGE_SHIFT) {
npages = (entry_end - pos) >> PAGE_SHIFT;
if (npages > WALK_CHUNK)
npages = WALK_CHUNK;
amap_lookups(&entry->aref, pos - entry->start, anons, npages);
for (i = 0; i < npages; i++) {
if ((anons[i] == NULL) == absent)
continue;
if (!absent) {
/* going from present to absent: set realend */
realend = pos + (i << PAGE_SHIFT);
absent = 1;
continue;
}
/* going from absent to present: invoke callback */
end = pos + (i << PAGE_SHIFT);
if (start != end) {
error = (*walk)(start, realend, end, prot,
nsegment, cookie);
if (error)
return error;
nsegment++;
}
start = realend = end;
absent = 0;
}
}
if (!absent)
realend = entry_end;
error = (*walk)(start, realend, entry_end, prot, nsegment, cookie);
*nsegmentp = nsegment + 1;
return error;
}
/*
* Common logic for whether a map entry should be included in a coredump
*/
static inline int
uvm_should_coredump(struct proc *p, struct vm_map_entry *entry)
{
if (!(entry->protection & PROT_WRITE) &&
entry->aref.ar_amap == NULL &&
entry->start != p->p_p->ps_sigcode &&
entry->start != p->p_p->ps_timekeep)
return 0;
/*
* Skip ranges marked as unreadable, as uiomove(UIO_USERSPACE)
* will fail on them. Maybe this really should be a test of
* entry->max_protection, but doing
* uvm_map_extract(UVM_EXTRACT_FIXPROT)
* on each such page would suck.
*/
if ((entry->protection & PROT_READ) == 0)
return 0;
/* Skip ranges excluded from coredumps. */
if (UVM_ET_ISCONCEAL(entry))
return 0;
/* Don't dump mmaped devices. */
if (entry->object.uvm_obj != NULL &&
UVM_OBJ_IS_DEVICE(entry->object.uvm_obj))
return 0;
if (entry->start >= VM_MAXUSER_ADDRESS)
return 0;
return 1;
}
/* do nothing callback for uvm_coredump_walk_amap() */
static int
noop(vaddr_t start, vaddr_t realend, vaddr_t end, vm_prot_t prot,
int nsegment, void *cookie)
{
return 0;
}
/*
* Walk the VA space for a process to identify what to write to
* a coredump. First the number of contiguous ranges is counted,
* then the 'setup' callback is invoked to prepare for actually
* recording the ranges, then the VA is walked again, invoking
* the 'walk' callback for each range. The number of ranges walked
* is guaranteed to match the count seen by the 'setup' callback.
*/
int
uvm_coredump_walkmap(struct proc *p, uvm_coredump_setup_cb *setup,
uvm_coredump_walk_cb *walk, void *cookie)
{
struct vmspace *vm = p->p_vmspace;
struct vm_map *map = &vm->vm_map;
struct vm_map_entry *entry;
vaddr_t end;
int refed_amaps = 0;
int nsegment, error;
/*
* Walk the map once to count the segments. If an amap is
 * referenced more than once then take *another* reference
* and treat the amap as exactly one segment instead of
* checking page presence inside it. On the second pass
* we'll recognize which amaps we did that for by the ref
* count being >1...and decrement it then.
*/
nsegment = 0;
RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
/* should never happen for a user process */
if (UVM_ET_ISSUBMAP(entry)) {
panic("%s: user process with submap?", __func__);
}
if (! uvm_should_coredump(p, entry))
continue;
if (entry->aref.ar_amap != NULL) {
if (entry->aref.ar_amap->am_ref == 1) {
uvm_coredump_walk_amap(entry, &nsegment,
&noop, cookie);
continue;
}
/*
* Multiple refs currently, so take another and
* treat it as a single segment
*/
entry->aref.ar_amap->am_ref++;
refed_amaps++;
}
nsegment++;
}
/*
* Okay, we have a count in nsegment. Prepare to
* walk it again, then invoke the setup callback.
*/
entry = RBT_MIN(uvm_map_addr, &map->addr);
error = (*setup)(nsegment, cookie);
if (error)
goto cleanup;
/*
* Setup went okay, so do the second walk, invoking the walk
* callback on the counted segments and cleaning up references
* as we go.
*/
nsegment = 0;
for (; entry != NULL; entry = RBT_NEXT(uvm_map_addr, entry)) {
if (! uvm_should_coredump(p, entry))
continue;
if (entry->aref.ar_amap != NULL &&
entry->aref.ar_amap->am_ref == 1) {
error = uvm_coredump_walk_amap(entry, &nsegment,
walk, cookie);
if (error)
break;
continue;
}
end = entry->end;
if (end > VM_MAXUSER_ADDRESS)
end = VM_MAXUSER_ADDRESS;
error = (*walk)(entry->start, end, end, entry->protection,
nsegment, cookie);
if (error)
break;
nsegment++;
if (entry->aref.ar_amap != NULL &&
entry->aref.ar_amap->am_ref > 1) {
/* multiple refs, so we need to drop one */
entry->aref.ar_amap->am_ref--;
refed_amaps--;
}
}
if (error) {
cleanup:
/* clean up the extra references from where we left off */
if (refed_amaps > 0) {
for (; entry != NULL;
entry = RBT_NEXT(uvm_map_addr, entry)) {
if (entry->aref.ar_amap == NULL ||
entry->aref.ar_amap->am_ref == 1)
continue;
if (! uvm_should_coredump(p, entry))
continue;
entry->aref.ar_amap->am_ref--;
if (refed_amaps-- == 0)
break;
}
}
}
return error;
}
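/*
 * Illustrative sketch (not part of the original source): a minimal
 * setup/walk callback pair matching the contract described above.  It
 * only tallies segments and the bytes that are actually backed; the
 * struct and function names below are hypothetical.
 */
struct coredump_tally {
	int	ct_nsegments;	/* segment count passed to the setup cb */
	size_t	ct_nbytes;	/* backed bytes across all walked ranges */
};

static int
coredump_tally_setup(int nsegment, void *cookie)
{
	struct coredump_tally *ct = cookie;

	/* the walk callback will be invoked exactly nsegment times */
	ct->ct_nsegments = nsegment;
	return 0;
}

static int
coredump_tally_walk(vaddr_t start, vaddr_t realend, vaddr_t end,
    vm_prot_t prot, int nsegment, void *cookie)
{
	struct coredump_tally *ct = cookie;

	/* [start, realend) is backed memory; [realend, end) is a hole */
	ct->ct_nbytes += realend - start;
	return 0;
}
/* usage: uvm_coredump_walkmap(p, coredump_tally_setup, coredump_tally_walk, &ct) */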
#endif /* !SMALL_KERNEL */
/* $OpenBSD: bus_dma.c,v 1.51 2019/06/09 12:52:04 kettenis Exp $ */
/* $NetBSD: bus_dma.c,v 1.3 2003/05/07 21:33:58 fvdl Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The following is included because _bus_dma_uiomove is derived from
* uiomove() in kern_subr.c.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <machine/bus.h>
#include <uvm/uvm_extern.h>
int _bus_dmamap_load_buffer(bus_dma_tag_t, bus_dmamap_t, void *, bus_size_t,
struct proc *, int, paddr_t *, int *, int);
/*
* Common function for DMA map creation. May be called by bus-specific
* DMA map creation functions.
*/
int
_bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments,
bus_size_t maxsegsz, bus_size_t boundary, int flags, bus_dmamap_t *dmamp)
{
struct bus_dmamap *map;
void *mapstore;
size_t mapsize;
/*
* Allocate and initialize the DMA map. The end of the map
* is a variable-sized array of segments, so we allocate enough
* room for them in one shot.
*
* Note we don't preserve the WAITOK or NOWAIT flags. Preservation
* of ALLOCNOW notifies others that we've reserved these resources,
* and they are not to be freed.
*
* The bus_dmamap_t includes one bus_dma_segment_t, hence
* the (nsegments - 1).
*/
mapsize = sizeof(struct bus_dmamap) +
(sizeof(bus_dma_segment_t) * (nsegments - 1));
if ((mapstore = malloc(mapsize, M_DEVBUF,
(flags & BUS_DMA_NOWAIT) ?
(M_NOWAIT|M_ZERO) : (M_WAITOK|M_ZERO))) == NULL)
return (ENOMEM);
map = (struct bus_dmamap *)mapstore;
map->_dm_size = size;
map->_dm_segcnt = nsegments;
map->_dm_maxsegsz = maxsegsz;
map->_dm_boundary = boundary;
map->_dm_flags = flags & ~(BUS_DMA_WAITOK|BUS_DMA_NOWAIT);
*dmamp = map;
return (0);
}
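/*
 * Illustrative sketch (not part of the original source): the
 * machine-independent bus_dma(9) sequence a driver typically performs on
 * top of these common functions, for one buffer handed to a device for a
 * write.  The function name and the tag/buffer/length arguments are
 * hypothetical, and a single segment is assumed to be enough (i.e. the
 * buffer is physically contiguous); real drivers size nsegments for
 * their hardware.
 */
static int
example_dma_write(bus_dma_tag_t dmat, void *buf, bus_size_t len)
{
	bus_dmamap_t map;
	int error;

	error = bus_dmamap_create(dmat, len, 1, len, 0, BUS_DMA_NOWAIT, &map);
	if (error)
		return (error);
	/* translate the kernel virtual buffer into DMA segments */
	error = bus_dmamap_load(dmat, map, buf, len, NULL, BUS_DMA_NOWAIT);
	if (error) {
		bus_dmamap_destroy(dmat, map);
		return (error);
	}
	/* make the buffer visible to the device before starting DMA */
	bus_dmamap_sync(dmat, map, 0, len, BUS_DMASYNC_PREWRITE);
	/* ... program map->dm_segs[0].ds_addr into the device here ... */
	bus_dmamap_sync(dmat, map, 0, len, BUS_DMASYNC_POSTWRITE);
	bus_dmamap_unload(dmat, map);
	bus_dmamap_destroy(dmat, map);
	return (0);
}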
/*
* Common function for DMA map destruction. May be called by bus-specific
* DMA map destruction functions.
*/
void
_bus_dmamap_destroy(bus_dma_tag_t t, bus_dmamap_t map)
{
size_t mapsize;
mapsize = sizeof(struct bus_dmamap) +
(sizeof(bus_dma_segment_t) * (map->_dm_segcnt - 1));
free(map, M_DEVBUF, mapsize);
}
/*
* Common function for loading a DMA map with a linear buffer. May
* be called by bus-specific DMA map load functions.
*/
int
_bus_dmamap_load(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
bus_size_t buflen, struct proc *p, int flags)
{
bus_addr_t lastaddr = 0;
int seg, error;
/*
* Make sure that on error condition we return "no valid mappings".
*/
map->dm_mapsize = 0;
map->dm_nsegs = 0;
if (buflen > map->_dm_size)
return (EINVAL);
seg = 0;
error = _bus_dmamap_load_buffer(t, map, buf, buflen, p, flags,
&lastaddr, &seg, 1);
	if (error == 0) {
		map->dm_mapsize = buflen;
		map->dm_nsegs = seg + 1;
	}
return (error);
}
/*
* Like _bus_dmamap_load(), but for mbufs.
*/
int
_bus_dmamap_load_mbuf(bus_dma_tag_t t, bus_dmamap_t map, struct mbuf *m0,
int flags)
{
paddr_t lastaddr = 0;
int seg, error, first;
struct mbuf *m;
/*
* Make sure that on error condition we return "no valid mappings".
*/
map->dm_mapsize = 0;
map->dm_nsegs = 0;
#ifdef DIAGNOSTIC
if ((m0->m_flags & M_PKTHDR) == 0)
panic("_bus_dmamap_load_mbuf: no packet header");
#endif
if (m0->m_pkthdr.len > map->_dm_size)
return (EINVAL);
first = 1;
seg = 0;
error = 0;
	for (m = m0; m != NULL && error == 0; m = m->m_next) {
		if (m->m_len == 0)
			continue;
error = _bus_dmamap_load_buffer(t, map, m->m_data, m->m_len,
NULL, flags, &lastaddr, &seg, first);
first = 0;
}
	if (error == 0) {
		map->dm_mapsize = m0->m_pkthdr.len;
		map->dm_nsegs = seg + 1;
	}
return (error);
}
/*
* Like _bus_dmamap_load(), but for uios.
*/
int
_bus_dmamap_load_uio(bus_dma_tag_t t, bus_dmamap_t map, struct uio *uio,
int flags)
{
paddr_t lastaddr = 0;
int seg, i, error, first;
bus_size_t minlen, resid;
struct proc *p = NULL;
struct iovec *iov;
caddr_t addr;
/*
* Make sure that on error condition we return "no valid mappings".
*/
map->dm_mapsize = 0;
map->dm_nsegs = 0;
resid = uio->uio_resid;
iov = uio->uio_iov;
if (uio->uio_segflg == UIO_USERSPACE) {
p = uio->uio_procp;
#ifdef DIAGNOSTIC
if (p == NULL)
panic("_bus_dmamap_load_uio: USERSPACE but no proc");
#endif
}
first = 1;
seg = 0;
error = 0;
for (i = 0; i < uio->uio_iovcnt && resid != 0 && error == 0; i++) {
/*
* Now at the first iovec to load. Load each iovec
* until we have exhausted the residual count.
*/
minlen = resid < iov[i].iov_len ? resid : iov[i].iov_len;
addr = (caddr_t)iov[i].iov_base;
error = _bus_dmamap_load_buffer(t, map, addr, minlen,
p, flags, &lastaddr, &seg, first);
first = 0;
resid -= minlen;
}
if (error == 0) {
map->dm_mapsize = uio->uio_resid;
map->dm_nsegs = seg + 1;
}
return (error);
}
/*
* Like _bus_dmamap_load(), but for raw memory allocated with
* bus_dmamem_alloc().
*/
int
_bus_dmamap_load_raw(bus_dma_tag_t t, bus_dmamap_t map, bus_dma_segment_t *segs,
int nsegs, bus_size_t size, int flags)
{
bus_addr_t paddr, baddr, bmask, lastaddr = 0;
bus_size_t plen, sgsize, mapsize;
int first = 1;
int i, seg = 0;
/*
* Make sure that on error condition we return "no valid mappings".
*/
map->dm_mapsize = 0;
map->dm_nsegs = 0;
if (nsegs > map->_dm_segcnt || size > map->_dm_size)
return (EINVAL);
mapsize = size;
bmask = ~(map->_dm_boundary - 1);
for (i = 0; i < nsegs && size > 0; i++) {
paddr = segs[i].ds_addr;
plen = MIN(segs[i].ds_len, size);
while (plen > 0) {
/*
* Compute the segment size, and adjust counts.
*/
sgsize = PAGE_SIZE - ((u_long)paddr & PGOFSET);
if (plen < sgsize)
sgsize = plen;
if (paddr > dma_constraint.ucr_high &&
(map->_dm_flags & BUS_DMA_64BIT) == 0)
panic("Non dma-reachable buffer at paddr %#lx(raw)",
paddr);
/*
* Make sure we don't cross any boundaries.
*/
if (map->_dm_boundary > 0) {
baddr = (paddr + map->_dm_boundary) & bmask;
if (sgsize > (baddr - paddr))
sgsize = (baddr - paddr);
}
/*
* Insert chunk into a segment, coalescing with
* previous segment if possible.
*/
if (first) {
map->dm_segs[seg].ds_addr = paddr;
map->dm_segs[seg].ds_len = sgsize;
first = 0;
} else {
if (paddr == lastaddr &&
(map->dm_segs[seg].ds_len + sgsize) <=
map->_dm_maxsegsz &&
(map->_dm_boundary == 0 ||
(map->dm_segs[seg].ds_addr & bmask) ==
(paddr & bmask)))
map->dm_segs[seg].ds_len += sgsize;
else {
if (++seg >= map->_dm_segcnt)
return (EINVAL);
map->dm_segs[seg].ds_addr = paddr;
map->dm_segs[seg].ds_len = sgsize;
}
}
paddr += sgsize;
plen -= sgsize;
size -= sgsize;
lastaddr = paddr;
}
}
map->dm_mapsize = mapsize;
map->dm_nsegs = seg + 1;
return (0);
}
/*
* Common function for unloading a DMA map. May be called by
* bus-specific DMA map unload functions.
*/
void
_bus_dmamap_unload(bus_dma_tag_t t, bus_dmamap_t map)
{
/*
* No resources to free; just mark the mappings as
* invalid.
*/
map->dm_mapsize = 0;
map->dm_nsegs = 0;
}
/*
* Common function for DMA map synchronization. May be called
* by bus-specific DMA map synchronization functions.
*/
void
_bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t addr,
bus_size_t size, int op)
{
/* Nothing to do here. */
}
/*
* Common function for DMA-safe memory allocation. May be called
* by bus-specific DMA memory allocation functions.
*/
int
_bus_dmamem_alloc(bus_dma_tag_t t, bus_size_t size, bus_size_t alignment,
bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs,
int flags)
{
/*
* XXX in the presence of decent (working) iommus and bouncebuffers
* we can then fallback this allocation to a range of { 0, -1 }.
* However for now we err on the side of caution and allocate dma
* memory under the 4gig boundary.
*/
return (_bus_dmamem_alloc_range(t, size, alignment, boundary,
segs, nsegs, rsegs, flags, (bus_addr_t)0, (bus_addr_t)0xffffffff));
}
/*
* Common function for freeing DMA-safe memory. May be called by
* bus-specific DMA memory free functions.
*/
void
_bus_dmamem_free(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs)
{
struct vm_page *m;
bus_addr_t addr;
struct pglist mlist;
int curseg;
/*
* Build a list of pages to free back to the VM system.
*/
TAILQ_INIT(&mlist);
for (curseg = 0; curseg < nsegs; curseg++) {
for (addr = segs[curseg].ds_addr;
addr < (segs[curseg].ds_addr + segs[curseg].ds_len);
addr += PAGE_SIZE) {
m = PHYS_TO_VM_PAGE(addr);
TAILQ_INSERT_TAIL(&mlist, m, pageq);
}
}
uvm_pglistfree(&mlist);
}
/*
* Common function for mapping DMA-safe memory. May be called by
* bus-specific DMA memory map functions.
*/
int
_bus_dmamem_map(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs,
size_t size, caddr_t *kvap, int flags)
{
vaddr_t va, sva;
size_t ssize;
bus_addr_t addr;
int curseg, pmapflags = 0, error;
const struct kmem_dyn_mode *kd;
if (nsegs == 1 && (flags & BUS_DMA_NOCACHE) == 0) {
*kvap = (caddr_t)PMAP_DIRECT_MAP(segs[0].ds_addr);
return (0);
}
if (flags & BUS_DMA_NOCACHE)
pmapflags |= PMAP_NOCACHE;
size = round_page(size);
kd = flags & BUS_DMA_NOWAIT ? &kd_trylock : &kd_waitok;
va = (vaddr_t)km_alloc(size, &kv_any, &kp_none, kd);
if (va == 0)
return (ENOMEM);
*kvap = (caddr_t)va;
sva = va;
ssize = size;
for (curseg = 0; curseg < nsegs; curseg++) {
for (addr = segs[curseg].ds_addr;
addr < (segs[curseg].ds_addr + segs[curseg].ds_len);
addr += PAGE_SIZE, va += PAGE_SIZE, size -= PAGE_SIZE) {
if (size == 0)
panic("_bus_dmamem_map: size botch");
error = pmap_enter(pmap_kernel(), va, addr | pmapflags,
PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE | PMAP_WIRED | PMAP_CANFAIL);
if (error) {
pmap_update(pmap_kernel());
km_free((void *)sva, ssize, &kv_any, &kp_none);
return (error);
}
}
}
pmap_update(pmap_kernel());
return (0);
}
/*
* Common function for unmapping DMA-safe memory. May be called by
* bus-specific DMA memory unmapping functions.
*/
void
_bus_dmamem_unmap(bus_dma_tag_t t, caddr_t kva, size_t size)
{
#ifdef DIAGNOSTIC
if ((u_long)kva & PGOFSET)
panic("_bus_dmamem_unmap");
#endif
if (kva >= (caddr_t)PMAP_DIRECT_BASE && kva <= (caddr_t)PMAP_DIRECT_END)
return;
km_free(kva, round_page(size), &kv_any, &kp_none);
}
/*
* Common function for mmap(2)'ing DMA-safe memory. May be called by
* bus-specific DMA mmap(2)'ing functions.
*/
paddr_t
_bus_dmamem_mmap(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs, off_t off,
int prot, int flags)
{
int i, pmapflags = 0;
if (flags & BUS_DMA_NOCACHE)
pmapflags |= PMAP_NOCACHE;
for (i = 0; i < nsegs; i++) {
#ifdef DIAGNOSTIC
if (off & PGOFSET)
panic("_bus_dmamem_mmap: offset unaligned");
if (segs[i].ds_addr & PGOFSET)
panic("_bus_dmamem_mmap: segment unaligned");
if (segs[i].ds_len & PGOFSET)
panic("_bus_dmamem_mmap: segment size not multiple"
" of page size");
#endif
if (off >= segs[i].ds_len) {
off -= segs[i].ds_len;
continue;
}
return ((segs[i].ds_addr + off) | pmapflags);
}
/* Page not found. */
return (-1);
}
/**********************************************************************
* DMA utility functions
**********************************************************************/
/*
* Utility function to load a linear buffer. lastaddrp holds state
* between invocations (for multiple-buffer loads). segp contains
* the starting segment on entrance, and the ending segment on exit.
* first indicates if this is the first invocation of this function.
*/
int
_bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
bus_size_t buflen, struct proc *p, int flags, paddr_t *lastaddrp, int *segp,
int first)
{
bus_size_t sgsize;
bus_addr_t curaddr, lastaddr, baddr, bmask;
vaddr_t vaddr = (vaddr_t)buf;
int seg;
pmap_t pmap;
	if (p != NULL)
		pmap = p->p_vmspace->vm_map.pmap;
	else
		pmap = pmap_kernel();
lastaddr = *lastaddrp;
bmask = ~(map->_dm_boundary - 1);
for (seg = *segp; buflen > 0 ; ) {
/*
* Get the physical address for this segment.
*/
pmap_extract(pmap, vaddr, (paddr_t *)&curaddr);
if (curaddr > dma_constraint.ucr_high &&
(map->_dm_flags & BUS_DMA_64BIT) == 0)
panic("Non dma-reachable buffer at curaddr %#lx(raw)",
curaddr);
/*
* Compute the segment size, and adjust counts.
*/
sgsize = PAGE_SIZE - ((u_long)vaddr & PGOFSET);
if (buflen < sgsize)
sgsize = buflen;
/*
* Make sure we don't cross any boundaries.
*/
if (map->_dm_boundary > 0) {
baddr = (curaddr + map->_dm_boundary) & bmask;
if (sgsize > (baddr - curaddr))
sgsize = (baddr - curaddr);
}
/*
* Insert chunk into a segment, coalescing with
* previous segment if possible.
*/
if (first) {
map->dm_segs[seg].ds_addr = curaddr;
map->dm_segs[seg].ds_len = sgsize;
first = 0;
} else {
if (curaddr == lastaddr &&
(map->dm_segs[seg].ds_len + sgsize) <=
map->_dm_maxsegsz && (map->_dm_boundary == 0 ||
(map->dm_segs[seg].ds_addr & bmask) ==
(curaddr & bmask)))
map->dm_segs[seg].ds_len += sgsize;
else {
if (++seg >= map->_dm_segcnt)
break;
map->dm_segs[seg].ds_addr = curaddr;
map->dm_segs[seg].ds_len = sgsize;
}
}
lastaddr = curaddr + sgsize;
vaddr += sgsize;
buflen -= sgsize;
}
*segp = seg;
*lastaddrp = lastaddr;
/*
* Did we fit?
*/
if (buflen != 0)
return (EFBIG); /* XXX better return value here? */
return (0);
}
/*
* Allocate physical memory from the given physical address range.
* Called by DMA-safe memory allocation methods.
*/
int
_bus_dmamem_alloc_range(bus_dma_tag_t t, bus_size_t size, bus_size_t alignment,
bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs,
int flags, bus_addr_t low, bus_addr_t high)
{
paddr_t curaddr, lastaddr;
struct vm_page *m;
struct pglist mlist;
int curseg, error, plaflag;
/* Always round the size. */
size = round_page(size);
segs[0]._ds_boundary = boundary;
segs[0]._ds_align = alignment;
/*
* Allocate pages from the VM system.
*/
plaflag = flags & BUS_DMA_NOWAIT ? UVM_PLA_NOWAIT : UVM_PLA_WAITOK;
if (flags & BUS_DMA_ZERO)
plaflag |= UVM_PLA_ZERO;
TAILQ_INIT(&mlist);
error = uvm_pglistalloc(size, low, high, alignment, boundary,
&mlist, nsegs, plaflag);
if (error)
return (error);
/*
* Compute the location, size, and number of segments actually
* returned by the VM code.
*/
m = TAILQ_FIRST(&mlist);
curseg = 0;
lastaddr = segs[curseg].ds_addr = VM_PAGE_TO_PHYS(m);
segs[curseg].ds_len = PAGE_SIZE;
for (m = TAILQ_NEXT(m, pageq); m != NULL; m = TAILQ_NEXT(m, pageq)) {
curaddr = VM_PAGE_TO_PHYS(m);
#ifdef DIAGNOSTIC
if (curseg == nsegs) {
printf("uvm_pglistalloc returned too many\n");
panic("_bus_dmamem_alloc_range");
}
if (curaddr < low || curaddr >= high) {
printf("uvm_pglistalloc returned non-sensical"
" address 0x%lx\n", curaddr);
panic("_bus_dmamem_alloc_range");
}
#endif
if (curaddr == (lastaddr + PAGE_SIZE))
segs[curseg].ds_len += PAGE_SIZE;
else {
curseg++;
segs[curseg].ds_addr = curaddr;
segs[curseg].ds_len = PAGE_SIZE;
}
lastaddr = curaddr;
}
*rsegs = curseg + 1;
return (0);
}
/* $OpenBSD: in_cksum.c,v 1.9 2019/04/22 22:47:49 bluhm Exp $ */
/* $NetBSD: in_cksum.c,v 1.11 1996/04/08 19:55:37 jonathan Exp $ */
/*
* Copyright (c) 1988, 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
/*
* Checksum routine for Internet Protocol family headers (Portable Version).
*
* This routine is very heavily used in the network
* code and should be modified for each CPU to be as fast as possible.
*/
#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
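/*
 * Illustrative sketch (not part of the original source): the same 16-bit
 * one's complement sum computed naively over a flat, 16-bit aligned
 * buffer.  The mbuf-chain version below produces the same result while
 * also handling odd alignment, words split across mbufs and loop
 * unrolling.  The function name is hypothetical.
 */
static uint16_t
example_cksum_flat(const void *data, size_t len)
{
	const uint16_t *w = data;
	uint32_t sum = 0;

	/* add 16-bit words in memory order */
	while (len > 1) {
		sum += *w++;
		len -= 2;
	}
	/* an odd trailing byte is treated as if padded with a zero byte */
	if (len == 1) {
		uint16_t last = 0;

		memcpy(&last, w, 1);
		sum += last;
	}
	/* fold the carries back into 16 bits (cf. REDUCE/ADDCARRY above) */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (~sum & 0xffff);
}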
int
in_cksum(struct mbuf *m, int len)
{
uint16_t *w;
int sum = 0;
int mlen = 0;
int byte_swapped = 0;
union {
uint8_t c[2];
uint16_t s;
} s_util;
union {
uint16_t s[2];
uint32_t l;
} l_util;
	for (;m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
w = mtod(m, uint16_t *);
if (mlen == -1) {
/*
* The first byte of this mbuf is the continuation
* of a word spanning between this mbuf and the
* last mbuf.
*
* s_util.c[0] is already saved when scanning previous
* mbuf.
*/
s_util.c[1] = *(uint8_t *)w;
sum += s_util.s;
w = (uint16_t *)((uint8_t *)w + 1);
mlen = m->m_len - 1;
len--;
} else
mlen = m->m_len;
if (len < mlen)
mlen = len;
len -= mlen;
/*
* Force to even boundary.
*/
if ((1 & (long) w) && (mlen > 0)) {
REDUCE;
sum <<= 8;
s_util.c[0] = *(uint8_t *)w;
w = (uint16_t *)((uint8_t *)w + 1);
mlen--;
byte_swapped = 1;
}
/*
* Unroll the loop to make overhead from
* branches &c small.
*/
while ((mlen -= 32) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
w += 16;
}
mlen += 32;
while ((mlen -= 8) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
w += 4;
}
mlen += 8;
if (mlen == 0 && byte_swapped == 0)
continue;
REDUCE;
while ((mlen -= 2) >= 0) {
sum += *w++;
}
if (byte_swapped) {
REDUCE;
sum <<= 8;
byte_swapped = 0;
			if (mlen == -1) {
				s_util.c[1] = *(uint8_t *)w;
sum += s_util.s;
mlen = 0;
} else
mlen = -1;
		} else if (mlen == -1)
			s_util.c[0] = *(uint8_t *)w;
}
if (len)
panic("%s: out of data, len %d", __func__, len); if (mlen == -1) {
/* The last mbuf has odd # of bytes. Follow the
standard (the odd byte may be shifted left by 8 bits
or not as determined by endian-ness of the machine) */
s_util.c[1] = 0;
sum += s_util.s;
}
REDUCE;
return (~sum & 0xffff);
}
/* $OpenBSD: uvm_amap.c,v 1.91 2022/08/01 14:15:46 mpi Exp $ */
/* $NetBSD: uvm_amap.c,v 1.27 2000/11/25 06:27:59 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_amap.c: amap operations
*
* this file contains functions that perform operations on amaps. see
* uvm_amap.h for a brief explanation of the role of amaps in uvm.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>
/*
* pools for allocation of vm_amap structures. note that in order to
* avoid an endless loop, the amap pool's allocator cannot allocate
* memory from an amap (it currently goes through the kernel uobj, so
* we are ok).
*/
struct pool uvm_amap_pool;
struct pool uvm_small_amap_pool[UVM_AMAP_CHUNK];
struct pool uvm_amap_chunk_pool;
LIST_HEAD(, vm_amap) amap_list;
struct rwlock amap_list_lock = RWLOCK_INITIALIZER("amaplstlk");
#define amap_lock_list() rw_enter_write(&amap_list_lock)
#define amap_unlock_list() rw_exit_write(&amap_list_lock)
static char amap_small_pool_names[UVM_AMAP_CHUNK][9];
/*
* local functions
*/
static struct vm_amap *amap_alloc1(int, int, int);
static inline void amap_list_insert(struct vm_amap *);
static inline void amap_list_remove(struct vm_amap *);
struct vm_amap_chunk *amap_chunk_get(struct vm_amap *, int, int, int);
void amap_chunk_free(struct vm_amap *, struct vm_amap_chunk *);
/*
* if we enable PPREF, then we have a couple of extra functions that
* we need to prototype here...
*/
#ifdef UVM_AMAP_PPREF
#define PPREF_NONE ((int *) -1) /* not using ppref */
void amap_pp_adjref(struct vm_amap *, int, vsize_t, int);
void amap_pp_establish(struct vm_amap *);
void amap_wiperange_chunk(struct vm_amap *, struct vm_amap_chunk *, int,
int);
void amap_wiperange(struct vm_amap *, int, int);
#endif /* UVM_AMAP_PPREF */
static inline void
amap_list_insert(struct vm_amap *amap)
{
amap_lock_list();
LIST_INSERT_HEAD(&amap_list, amap, am_list);
amap_unlock_list();
}
static inline void
amap_list_remove(struct vm_amap *amap)
{
amap_lock_list();
LIST_REMOVE(amap, am_list);
amap_unlock_list();
}
/*
* amap_chunk_get: lookup a chunk for slot. if create is non-zero,
* the chunk is created if it does not yet exist.
*
* => returns the chunk on success or NULL on error
*/
struct vm_amap_chunk *
amap_chunk_get(struct vm_amap *amap, int slot, int create, int waitf)
{
int bucket = UVM_AMAP_BUCKET(amap, slot);
int baseslot = AMAP_BASE_SLOT(slot);
int n;
struct vm_amap_chunk *chunk, *newchunk, *pchunk = NULL;
	if (UVM_AMAP_SMALL(amap))
		return &amap->am_small;
	for (chunk = amap->am_buckets[bucket]; chunk != NULL;
chunk = TAILQ_NEXT(chunk, ac_list)) {
if (UVM_AMAP_BUCKET(amap, chunk->ac_baseslot) != bucket)
break;
if (chunk->ac_baseslot == baseslot)
return chunk;
pchunk = chunk;
}
if (!create)
return NULL;
if (amap->am_nslot - baseslot >= UVM_AMAP_CHUNK)
n = UVM_AMAP_CHUNK;
else
n = amap->am_nslot - baseslot;
newchunk = pool_get(&uvm_amap_chunk_pool, waitf | PR_ZERO);
if (newchunk == NULL)
return NULL;
if (pchunk == NULL) {
TAILQ_INSERT_TAIL(&amap->am_chunks, newchunk, ac_list);
		KASSERT(amap->am_buckets[bucket] == NULL);
		amap->am_buckets[bucket] = newchunk;
} else
TAILQ_INSERT_AFTER(&amap->am_chunks, pchunk, newchunk,
ac_list);
amap->am_ncused++;
newchunk->ac_baseslot = baseslot;
newchunk->ac_nslot = n;
return newchunk;
}
void
amap_chunk_free(struct vm_amap *amap, struct vm_amap_chunk *chunk)
{
int bucket = UVM_AMAP_BUCKET(amap, chunk->ac_baseslot);
struct vm_amap_chunk *nchunk;
if (UVM_AMAP_SMALL(amap))
return;
nchunk = TAILQ_NEXT(chunk, ac_list);
TAILQ_REMOVE(&amap->am_chunks, chunk, ac_list);
	if (amap->am_buckets[bucket] == chunk) {
		if (nchunk != NULL &&
UVM_AMAP_BUCKET(amap, nchunk->ac_baseslot) == bucket)
amap->am_buckets[bucket] = nchunk;
else
amap->am_buckets[bucket] = NULL;
}
pool_put(&uvm_amap_chunk_pool, chunk);
amap->am_ncused--;
}
#ifdef UVM_AMAP_PPREF
/*
* what is ppref? ppref is an _optional_ amap feature which is used
* to keep track of reference counts on a per-page basis. it is enabled
* when UVM_AMAP_PPREF is defined.
*
* when enabled, an array of ints is allocated for the pprefs. this
* array is allocated only when a partial reference is added to the
* map (either by unmapping part of the amap, or gaining a reference
* to only a part of an amap). if the allocation of the array fails
* (M_NOWAIT), then we set the array pointer to PPREF_NONE to indicate
* that we tried to do ppref's but couldn't alloc the array so just
* give up (after all, this is an optional feature!).
*
* the array is divided into page sized "chunks." for chunks of length 1,
* the chunk reference count plus one is stored in that chunk's slot.
* for chunks of length > 1 the first slot contains (the reference count
* plus one) * -1. [the negative value indicates that the length is
* greater than one.] the second slot of the chunk contains the length
* of the chunk. here is an example:
*
* actual REFS: 2 2 2 2 3 1 1 0 0 0 4 4 0 1 1 1
* ppref: -3 4 x x 4 -2 2 -1 3 x -5 2 1 -2 3 x
* <----------><-><----><-------><----><-><------->
* (x = don't care)
*
* this allows us to allow one int to contain the ref count for the whole
* chunk. note that the "plus one" part is needed because a reference
 * count of zero is neither positive nor negative (need a way to tell
* if we've got one zero or a bunch of them).
*
* here are some in-line functions to help us.
*/
/*
* pp_getreflen: get the reference and length for a specific offset
*
* => ppref's amap must be locked
*/
static inline void
pp_getreflen(int *ppref, int offset, int *refp, int *lenp)
{
	if (ppref[offset] > 0) {		/* chunk size must be 1 */
		*refp = ppref[offset] - 1;	/* don't forget to adjust */
*lenp = 1;
} else {
*refp = (ppref[offset] * -1) - 1;
*lenp = ppref[offset+1];
}
}
/*
* pp_setreflen: set the reference and length for a specific offset
*
* => ppref's amap must be locked
*/
static inline void
pp_setreflen(int *ppref, int offset, int ref, int len)
{
	if (len == 1) {
		ppref[offset] = ref + 1;
} else {
ppref[offset] = (ref + 1) * -1;
ppref[offset+1] = len;
}
}
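/*
 * Illustrative sketch (not part of the original source): one
 * encode/decode round trip with the helpers above, matching the first
 * run in the "actual REFS"/"ppref" diagram (four slots with reference
 * count 2 are stored as { -3, 4 }).  The function name is hypothetical.
 */
static void
example_ppref_roundtrip(void)
{
	int ppref[8] = { 0 };
	int ref, len;

	pp_setreflen(ppref, 0, 2, 4);	/* slots 0-3: refcount 2 -> { -3, 4 } */
	pp_setreflen(ppref, 4, 1, 1);	/* slot 4: refcount 1 -> { 2 } */
	pp_getreflen(ppref, 0, &ref, &len);
	KASSERT(ref == 2 && len == 4);
	pp_getreflen(ppref, 4, &ref, &len);
	KASSERT(ref == 1 && len == 1);
}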
#endif /* UVM_AMAP_PPREF */
/*
* amap_init: called at boot time to init global amap data structures
*/
void
amap_init(void)
{
int i;
size_t size;
/* Initialize the vm_amap pool. */
pool_init(&uvm_amap_pool, sizeof(struct vm_amap),
0, IPL_MPFLOOR, PR_WAITOK, "amappl", NULL);
pool_sethiwat(&uvm_amap_pool, 4096);
/* initialize small amap pools */
for (i = 0; i < nitems(uvm_small_amap_pool); i++) {
snprintf(amap_small_pool_names[i],
sizeof(amap_small_pool_names[0]), "amappl%d", i + 1);
size = offsetof(struct vm_amap, am_small.ac_anon) +
(i + 1) * sizeof(struct vm_anon *);
pool_init(&uvm_small_amap_pool[i], size, 0, IPL_MPFLOOR,
PR_WAITOK, amap_small_pool_names[i], NULL);
}
pool_init(&uvm_amap_chunk_pool, sizeof(struct vm_amap_chunk) +
UVM_AMAP_CHUNK * sizeof(struct vm_anon *),
0, IPL_MPFLOOR, PR_WAITOK, "amapchunkpl", NULL);
pool_sethiwat(&uvm_amap_chunk_pool, 4096);
}
/*
* amap_alloc1: allocate an amap, but do not initialise the overlay.
*
* => Note: lock is not set.
*/
static inline struct vm_amap *
amap_alloc1(int slots, int waitf, int lazyalloc)
{
struct vm_amap *amap;
struct vm_amap_chunk *chunk, *tmp;
int chunks, log_chunks, chunkperbucket = 1, hashshift = 0;
int buckets, i, n;
int pwaitf = (waitf & M_WAITOK) ? PR_WAITOK : PR_NOWAIT;
KASSERT(slots > 0);
/*
* Cast to unsigned so that rounding up cannot cause integer overflow
* if slots is large.
*/
chunks = roundup((unsigned int)slots, UVM_AMAP_CHUNK) / UVM_AMAP_CHUNK;
if (lazyalloc) {
/*
* Basically, the amap is a hash map where the number of
* buckets is fixed. We select the number of buckets using the
* following strategy:
*
* 1. The maximal number of entries to search in a bucket upon
* a collision should be less than or equal to
* log2(slots / UVM_AMAP_CHUNK). This is the worst-case number
* of lookups we would have if we could chunk the amap. The
* log2(n) comes from the fact that amaps are chunked by
* splitting up their vm_map_entries and organizing those
* in a binary search tree.
*
* 2. The maximal number of entries in a bucket must be a
* power of two.
*
* The maximal number of entries per bucket is used to hash
* a slot to a bucket.
*
* In the future, this strategy could be refined to make it
* even harder/impossible that the total amount of KVA needed
* for the hash buckets of all amaps to exceed the maximal
* amount of KVA memory reserved for amaps.
*/
for (log_chunks = 1; (chunks >> log_chunks) > 0; log_chunks++)
continue;
chunkperbucket = 1 << hashshift;
while (chunkperbucket + 1 < log_chunks) {
hashshift++;
chunkperbucket = 1 << hashshift;
}
}
if (slots > UVM_AMAP_CHUNK)
amap = pool_get(&uvm_amap_pool, pwaitf);
else
amap = pool_get(&uvm_small_amap_pool[slots - 1],
pwaitf | PR_ZERO);
if (amap == NULL)
return NULL;
amap->am_lock = NULL;
amap->am_ref = 1;
amap->am_flags = 0;
#ifdef UVM_AMAP_PPREF
amap->am_ppref = NULL;
#endif
amap->am_nslot = slots;
amap->am_nused = 0;
if (UVM_AMAP_SMALL(amap)) {
amap->am_small.ac_nslot = slots;
return amap;
}
amap->am_ncused = 0;
TAILQ_INIT(&amap->am_chunks);
amap->am_hashshift = hashshift;
amap->am_buckets = NULL;
buckets = howmany(chunks, chunkperbucket);
amap->am_buckets = mallocarray(buckets, sizeof(*amap->am_buckets),
M_UVMAMAP, waitf | (lazyalloc ? M_ZERO : 0));
if (amap->am_buckets == NULL)
goto fail1;
amap->am_nbuckets = buckets;
	if (!lazyalloc) {
		for (i = 0; i < buckets; i++) {
if (i == buckets - 1) {
n = slots % UVM_AMAP_CHUNK;
if (n == 0)
n = UVM_AMAP_CHUNK;
} else
n = UVM_AMAP_CHUNK;
chunk = pool_get(&uvm_amap_chunk_pool,
PR_ZERO | pwaitf);
if (chunk == NULL)
goto fail1;
amap->am_buckets[i] = chunk;
amap->am_ncused++;
chunk->ac_baseslot = i * UVM_AMAP_CHUNK;
chunk->ac_nslot = n;
TAILQ_INSERT_TAIL(&amap->am_chunks, chunk, ac_list);
}
}
return amap;
fail1:
	free(amap->am_buckets, M_UVMAMAP, buckets * sizeof(*amap->am_buckets));
	TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, tmp)
pool_put(&uvm_amap_chunk_pool, chunk);
pool_put(&uvm_amap_pool, amap);
return NULL;
}
static void
amap_lock_alloc(struct vm_amap *amap)
{
rw_obj_alloc(&amap->am_lock, "amaplk");
}
/*
* amap_alloc: allocate an amap to manage "sz" bytes of anonymous VM
*
* => caller should ensure sz is a multiple of PAGE_SIZE
* => reference count to new amap is set to one
* => new amap is returned unlocked
*/
struct vm_amap *
amap_alloc(vaddr_t sz, int waitf, int lazyalloc)
{
struct vm_amap *amap;
size_t slots;
	AMAP_B2SLOT(slots, sz);		/* load slots */
	if (slots > INT_MAX)
return NULL;
amap = amap_alloc1(slots, waitf, lazyalloc);
if (amap != NULL) {
amap_lock_alloc(amap);
amap_list_insert(amap);
}
return amap;
}
/*
* amap_free: free an amap
*
* => the amap must be unlocked
* => the amap should have a zero reference count and be empty
*/
void
amap_free(struct vm_amap *amap)
{
struct vm_amap_chunk *chunk, *tmp;
	KASSERT(amap->am_ref == 0 && amap->am_nused == 0);
	KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
	if (amap->am_lock != NULL) {
		KASSERT(amap->am_lock == NULL || !rw_write_held(amap->am_lock));
		rw_obj_free(amap->am_lock);
}
#ifdef UVM_AMAP_PPREF
	if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
		free(amap->am_ppref, M_UVMAMAP, amap->am_nslot * sizeof(int));
#endif
if (UVM_AMAP_SMALL(amap))
pool_put(&uvm_small_amap_pool[amap->am_nslot - 1], amap);
else {
TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, tmp)
pool_put(&uvm_amap_chunk_pool, chunk);
free(amap->am_buckets, M_UVMAMAP,
amap->am_nbuckets * sizeof(*amap->am_buckets));
pool_put(&uvm_amap_pool, amap);
}
}
/*
* amap_wipeout: wipeout all anon's in an amap; then free the amap!
*
* => Called from amap_unref(), when reference count drops to zero.
* => amap must be locked.
*/
void
amap_wipeout(struct vm_amap *amap)
{
int slot;
struct vm_anon *anon;
struct vm_amap_chunk *chunk;
struct pglist pgl;
	KASSERT(rw_write_held(amap->am_lock));
	KASSERT(amap->am_ref == 0);
if (__predict_false((amap->am_flags & AMAP_SWAPOFF) != 0)) {
/*
* Note: amap_swap_off() will call us again.
*/
amap_unlock(amap);
return;
}
TAILQ_INIT(&pgl);
amap_list_remove(amap);
AMAP_CHUNK_FOREACH(chunk, amap) {
int i, refs, map = chunk->ac_usedmap;
for (i = ffs(map); i != 0; i = ffs(map)) {
slot = i - 1;
map ^= 1 << slot;
anon = chunk->ac_anon[slot];
if (anon == NULL || anon->an_ref == 0)
panic("amap_wipeout: corrupt amap");
KASSERT(anon->an_lock == amap->am_lock);
/*
* Drop the reference.
*/
refs = --anon->an_ref;
			if (refs == 0) {
				uvm_anfree_list(anon, &pgl);
}
}
}
/* free the pages */
uvm_pglistfree(&pgl);
/*
* Finally, destroy the amap.
*/
amap->am_ref = 0; /* ... was one */
amap->am_nused = 0;
amap_unlock(amap);
amap_free(amap);
}
/*
* amap_copy: ensure that a map entry's "needs_copy" flag is false
* by copying the amap if necessary.
*
* => an entry with a null amap pointer will get a new (blank) one.
* => the map that the map entry belongs to must be locked by caller.
* => the amap currently attached to "entry" (if any) must be unlocked.
* => if canchunk is true, then we may clip the entry into a chunk
* => "startva" and "endva" are used only if canchunk is true. they are
* used to limit chunking (e.g. if you have a large space that you
* know you are going to need to allocate amaps for, there is no point
* in allowing that to be chunked)
*/
void
amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
boolean_t canchunk, vaddr_t startva, vaddr_t endva)
{
struct vm_amap *amap, *srcamap;
int slots, lcv, lazyalloc = 0;
vaddr_t chunksize;
int i, j, k, n, srcslot;
struct vm_amap_chunk *chunk = NULL, *srcchunk = NULL;
struct vm_anon *anon;
KASSERT(map != kernel_map); /* we use sleeping locks */
/*
* Is there an amap to copy? If not, create one.
*/
if (entry->aref.ar_amap == NULL) {
/*
* Check to see if we have a large amap that we can
* chunk. We align startva/endva to chunk-sized
* boundaries and then clip to them.
*
* If we cannot chunk the amap, allocate it in a way
* that makes it grow or shrink dynamically with
* the number of slots.
*/
		if (atop(entry->end - entry->start) >= UVM_AMAP_LARGE) {
			if (canchunk) {
/* convert slots to bytes */
chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT;
startva = (startva / chunksize) * chunksize;
endva = roundup(endva, chunksize);
UVM_MAP_CLIP_START(map, entry, startva);
/* watch out for endva wrap-around! */
				if (endva >= startva)
					UVM_MAP_CLIP_END(map, entry, endva);
} else
lazyalloc = 1;
}
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap_alloc(entry->end - entry->start,
waitf, lazyalloc);
		if (entry->aref.ar_amap != NULL)
			entry->etype &= ~UVM_ET_NEEDSCOPY;
return;
}
/*
* First check and see if we are the only map entry referencing
 * the amap we currently have.  If so, then just take it over instead
 * of copying it.  Note that we are reading am_ref without lock held
 * as the value can only be one if we have the only reference
* to the amap (via our locked map). If the value is greater than
* one, then allocate amap and re-check the value.
*/
if (entry->aref.ar_amap->am_ref == 1) {
entry->etype &= ~UVM_ET_NEEDSCOPY;
return;
}
/*
* Allocate a new amap (note: not initialised, etc).
*/
	AMAP_B2SLOT(slots, entry->end - entry->start);
	if (!UVM_AMAP_SMALL(entry->aref.ar_amap) &&
	    entry->aref.ar_amap->am_hashshift != 0)
lazyalloc = 1;
amap = amap_alloc1(slots, waitf, lazyalloc);
if (amap == NULL)
return;
srcamap = entry->aref.ar_amap;
/*
* Make the new amap share the source amap's lock, and then lock
* both.
*/
amap->am_lock = srcamap->am_lock;
rw_obj_hold(amap->am_lock);
amap_lock(srcamap);
/*
* Re-check the reference count with the lock held. If it has
* dropped to one - we can take over the existing map.
*/
if (srcamap->am_ref == 1) {
/* Just take over the existing amap. */
entry->etype &= ~UVM_ET_NEEDSCOPY;
amap_unlock(srcamap);
/* Destroy the new (unused) amap. */
amap->am_ref--;
amap_free(amap);
return;
}
/*
* Copy the slots.
*/
for (lcv = 0; lcv < slots; lcv += n) {
srcslot = entry->aref.ar_pageoff + lcv;
i = UVM_AMAP_SLOTIDX(lcv);
j = UVM_AMAP_SLOTIDX(srcslot);
n = UVM_AMAP_CHUNK;
if (i > j)
n -= i;
else
n -= j;
if (lcv + n > slots)
n = slots - lcv;
srcchunk = amap_chunk_get(srcamap, srcslot, 0, PR_NOWAIT);
if (srcchunk == NULL)
continue;
chunk = amap_chunk_get(amap, lcv, 1, PR_NOWAIT);
if (chunk == NULL) {
/* amap_wipeout() releases the lock. */
amap->am_ref = 0;
amap_wipeout(amap);
return;
}
for (k = 0; k < n; i++, j++, k++) {
chunk->ac_anon[i] = anon = srcchunk->ac_anon[j];
if (anon == NULL)
continue;
			KASSERT(anon->an_lock == srcamap->am_lock);
			KASSERT(anon->an_ref > 0);
			chunk->ac_usedmap |= (1 << i);
anon->an_ref++;
amap->am_nused++;
}
}
/*
* Drop our reference to the old amap (srcamap) and unlock.
* Since the reference count on srcamap is greater than one,
* (we checked above), it cannot drop to zero while it is locked.
*/
	srcamap->am_ref--;
	KASSERT(srcamap->am_ref > 0);
	if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0)
		srcamap->am_flags &= ~AMAP_SHARED;	/* clear shared flag */
#ifdef UVM_AMAP_PPREF
	if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) {
		amap_pp_adjref(srcamap, entry->aref.ar_pageoff,
(entry->end - entry->start) >> PAGE_SHIFT, -1);
}
#endif
/*
* If we referenced any anons, then share the source amap's lock.
* Otherwise, we have nothing in common, so allocate a new one.
*/
	KASSERT(amap->am_lock == srcamap->am_lock);
	if (amap->am_nused == 0) {
		rw_obj_free(amap->am_lock);
amap->am_lock = NULL;
}
amap_unlock(srcamap);
	if (amap->am_lock == NULL)
		amap_lock_alloc(amap);
/*
* Install new amap.
*/
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap;
entry->etype &= ~UVM_ET_NEEDSCOPY;
amap_list_insert(amap);
}
/*
* amap_cow_now: resolve all copy-on-write faults in an amap now for fork(2)
*
* called during fork(2) when the parent process has a wired map
* entry. in that case we want to avoid write-protecting pages
* in the parent's map (e.g. like what you'd do for a COW page)
* so we resolve the COW here.
*
* => assume parent's entry was wired, thus all pages are resident.
* => the parent and child vm_map must both be locked.
* => caller passes child's map/entry in to us
* => XXXCDC: out of memory should cause fork to fail, but there is
* currently no easy way to do this (needs fix)
*/
void
amap_cow_now(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_amap *amap = entry->aref.ar_amap;
int slot;
struct vm_anon *anon, *nanon;
struct vm_page *pg, *npg;
struct vm_amap_chunk *chunk;
/*
* note that if we unlock the amap then we must ReStart the "lcv" for
* loop because some other process could reorder the anon's in the
* am_anon[] array on us while the lock is dropped.
*/
ReStart:
amap_lock(amap);
AMAP_CHUNK_FOREACH(chunk, amap) {
int i, map = chunk->ac_usedmap;
for (i = ffs(map); i != 0; i = ffs(map)) {
slot = i - 1;
map ^= 1 << slot;
anon = chunk->ac_anon[slot];
pg = anon->an_page;
KASSERT(anon->an_lock == amap->am_lock);
/*
* The old page must be resident since the parent is
* wired.
*/
KASSERT(pg != NULL);
/*
* if the anon ref count is one, we are safe (the child
* has exclusive access to the page).
*/
if (anon->an_ref <= 1)
continue;
/*
* If the page is busy, then we have to unlock, wait for
* it and then restart.
*/
if (pg->pg_flags & PG_BUSY) {
uvm_pagewait(pg, amap->am_lock, "cownow");
goto ReStart;
}
/*
* Perform a copy-on-write.
* First - get a new anon and a page.
*/
nanon = uvm_analloc();
if (nanon != NULL) {
/* the new anon will share the amap's lock */
nanon->an_lock = amap->am_lock;
npg = uvm_pagealloc(NULL, 0, nanon, 0);
} else
npg = NULL; /* XXX: quiet gcc warning */
if (nanon == NULL || npg == NULL) {
/* out of memory */
amap_unlock(amap);
if (nanon != NULL) {
nanon->an_lock = NULL;
nanon->an_ref--;
KASSERT(nanon->an_ref == 0);
uvm_anfree(nanon);
}
uvm_wait("cownowpage");
goto ReStart;
}
/*
* Copy the data and replace anon with the new one.
			 * Also, set up its lock (share it with the amap's lock).
*/
uvm_pagecopy(pg, npg);
anon->an_ref--;
KASSERT(anon->an_ref > 0);
chunk->ac_anon[slot] = nanon;
/*
* Drop PG_BUSY on new page. Since its owner was write
* locked all this time - it cannot be PG_RELEASED or
* PG_WANTED.
*/
atomic_clearbits_int(&npg->pg_flags, PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(npg, NULL);
uvm_lock_pageq();
uvm_pageactivate(npg);
uvm_unlock_pageq();
}
}
amap_unlock(amap);
}
/*
* amap_splitref: split a single reference into two separate references
*
* => called from uvm_map's clip routines
* => origref's map should be locked
* => origref->ar_amap should be unlocked (we will lock)
*/
void
amap_splitref(struct vm_aref *origref, struct vm_aref *splitref, vaddr_t offset)
{
struct vm_amap *amap = origref->ar_amap;
int leftslots;
	KASSERT(splitref->ar_amap == amap);
	AMAP_B2SLOT(leftslots, offset);
if (leftslots == 0)
panic("amap_splitref: split at zero offset");
amap_lock(amap);
if (amap->am_nslot - origref->ar_pageoff - leftslots <= 0)
panic("amap_splitref: map size check failed");
#ifdef UVM_AMAP_PPREF
/* Establish ppref before we add a duplicate reference to the amap. */
if (amap->am_ppref == NULL)
amap_pp_establish(amap);
#endif
/* Note: not a share reference. */
amap->am_ref++;
splitref->ar_amap = amap;
splitref->ar_pageoff = origref->ar_pageoff + leftslots;
amap_unlock(amap);
}
#ifdef UVM_AMAP_PPREF
/*
* amap_pp_establish: add a ppref array to an amap, if possible.
*
 * => amap should be locked by caller
*/
void
amap_pp_establish(struct vm_amap *amap)
{
KASSERT(rw_write_held(amap->am_lock));
amap->am_ppref = mallocarray(amap->am_nslot, sizeof(int),
M_UVMAMAP, M_NOWAIT|M_ZERO);
if (amap->am_ppref == NULL) {
/* Failure - just do not use ppref. */
amap->am_ppref = PPREF_NONE;
return;
}
pp_setreflen(amap->am_ppref, 0, amap->am_ref, amap->am_nslot);
}
/*
* amap_pp_adjref: adjust reference count to a part of an amap using the
* per-page reference count array.
*
* => caller must check that ppref != PPREF_NONE before calling.
* => map and amap must be locked.
*/
void
amap_pp_adjref(struct vm_amap *amap, int curslot, vsize_t slotlen, int adjval)
{
int stopslot, *ppref, lcv, prevlcv;
int ref, len, prevref, prevlen;
KASSERT(rw_write_held(amap->am_lock));
stopslot = curslot + slotlen;
ppref = amap->am_ppref;
prevlcv = 0;
/*
* Advance to the correct place in the array, fragment if needed.
*/
for (lcv = 0 ; lcv < curslot ; lcv += len) {
pp_getreflen(ppref, lcv, &ref, &len);
if (lcv + len > curslot) { /* goes past start? */
pp_setreflen(ppref, lcv, ref, curslot - lcv);
pp_setreflen(ppref, curslot, ref, len - (curslot -lcv));
len = curslot - lcv; /* new length of entry @ lcv */
}
prevlcv = lcv;
}
if (lcv != 0)
pp_getreflen(ppref, prevlcv, &prevref, &prevlen);
else {
/*
* Ensure that the "prevref == ref" test below always
* fails, since we are starting from the beginning of
* the ppref array; that is, there is no previous chunk.
*/
prevref = -1;
prevlen = 0;
}
/*
* Now adjust reference counts in range. Merge the first
* changed entry with the last unchanged entry if possible.
*/
if (lcv != curslot)
panic("amap_pp_adjref: overshot target"); for (/* lcv already set */; lcv < stopslot ; lcv += len) {
pp_getreflen(ppref, lcv, &ref, &len);
if (lcv + len > stopslot) { /* goes past end? */
pp_setreflen(ppref, lcv, ref, stopslot - lcv);
pp_setreflen(ppref, stopslot, ref,
len - (stopslot - lcv));
len = stopslot - lcv;
}
ref += adjval;
if (ref < 0)
panic("amap_pp_adjref: negative reference count");
if (lcv == prevlcv + prevlen && ref == prevref) {
pp_setreflen(ppref, prevlcv, ref, prevlen + len);
} else {
pp_setreflen(ppref, lcv, ref, len);
}
if (ref == 0)
amap_wiperange(amap, lcv, len);
}
}
void
amap_wiperange_chunk(struct vm_amap *amap, struct vm_amap_chunk *chunk,
int slotoff, int slots)
{
int curslot, i, map;
int startbase, endbase;
struct vm_anon *anon;
startbase = AMAP_BASE_SLOT(slotoff);
endbase = AMAP_BASE_SLOT(slotoff + slots - 1);
map = chunk->ac_usedmap;
if (startbase == chunk->ac_baseslot)
map &= ~((1 << (slotoff - startbase)) - 1);
if (endbase == chunk->ac_baseslot)
map &= (1 << (slotoff + slots - endbase)) - 1;
for (i = ffs(map); i != 0; i = ffs(map)) {
int refs;
curslot = i - 1;
map ^= 1 << curslot;
chunk->ac_usedmap ^= 1 << curslot;
anon = chunk->ac_anon[curslot];
KASSERT(anon->an_lock == amap->am_lock);
/* remove it from the amap */
chunk->ac_anon[curslot] = NULL;
amap->am_nused--;
/* drop anon reference count */
refs = --anon->an_ref;
		if (refs == 0) {
			uvm_anfree(anon);
}
/*
* done with this anon, next ...!
*/
} /* end of 'for' loop */
}
/*
* amap_wiperange: wipe out a range of an amap.
* Note: different from amap_wipeout because the amap is kept intact.
*
* => Both map and amap must be locked by caller.
*/
void
amap_wiperange(struct vm_amap *amap, int slotoff, int slots)
{
int bucket, startbucket, endbucket;
struct vm_amap_chunk *chunk, *nchunk;
KASSERT(rw_write_held(amap->am_lock));
startbucket = UVM_AMAP_BUCKET(amap, slotoff);
endbucket = UVM_AMAP_BUCKET(amap, slotoff + slots - 1);
/*
* We can either traverse the amap by am_chunks or by am_buckets.
* Determine which way is less expensive.
*/
if (UVM_AMAP_SMALL(amap))
amap_wiperange_chunk(amap, &amap->am_small, slotoff, slots);
else if (endbucket + 1 - startbucket >= amap->am_ncused) {
		TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, nchunk) {
			if (chunk->ac_baseslot + chunk->ac_nslot <= slotoff)
continue;
if (chunk->ac_baseslot >= slotoff + slots)
continue;
amap_wiperange_chunk(amap, chunk, slotoff, slots);
if (chunk->ac_usedmap == 0)
amap_chunk_free(amap, chunk);
}
} else {
		for (bucket = startbucket; bucket <= endbucket; bucket++) {
			for (chunk = amap->am_buckets[bucket]; chunk != NULL;
chunk = nchunk) {
nchunk = TAILQ_NEXT(chunk, ac_list);
if (UVM_AMAP_BUCKET(amap, chunk->ac_baseslot) !=
bucket)
break;
if (chunk->ac_baseslot + chunk->ac_nslot <=
slotoff)
continue;
if (chunk->ac_baseslot >= slotoff + slots)
continue;
amap_wiperange_chunk(amap, chunk, slotoff,
slots);
if (chunk->ac_usedmap == 0)
amap_chunk_free(amap, chunk);
}
}
}
}
#endif
/*
* amap_swap_off: pagein anonymous pages in amaps and drop swap slots.
*
* => note that we don't always traverse all anons.
* eg. amaps being wiped out, released anons.
* => return TRUE if failed.
*/
boolean_t
amap_swap_off(int startslot, int endslot)
{
struct vm_amap *am;
struct vm_amap *am_next;
struct vm_amap marker;
boolean_t rv = FALSE;
amap_lock_list();
for (am = LIST_FIRST(&amap_list); am != NULL && !rv; am = am_next) {
int i, map;
struct vm_amap_chunk *chunk;
amap_lock(am);
if (am->am_nused == 0) {
amap_unlock(am);
am_next = LIST_NEXT(am, am_list);
continue;
}
LIST_INSERT_AFTER(am, &marker, am_list);
amap_unlock_list();
again:
AMAP_CHUNK_FOREACH(chunk, am) {
map = chunk->ac_usedmap;
for (i = ffs(map); i != 0; i = ffs(map)) {
int swslot;
int slot = i - 1;
struct vm_anon *anon;
map ^= 1 << slot;
anon = chunk->ac_anon[slot];
swslot = anon->an_swslot;
if (swslot < startslot || endslot <= swslot) {
continue;
}
am->am_flags |= AMAP_SWAPOFF;
rv = uvm_anon_pagein(am, anon);
amap_lock(am);
am->am_flags &= ~AMAP_SWAPOFF;
if (amap_refs(am) == 0) {
amap_wipeout(am);
am = NULL;
goto nextamap;
}
if (rv)
goto nextamap;
goto again;
}
}
nextamap:
if (am != NULL)
amap_unlock(am);
amap_lock_list();
am_next = LIST_NEXT(&marker, am_list);
LIST_REMOVE(&marker, am_list);
}
amap_unlock_list();
return rv;
}
/*
* amap_lookup: look up a page in an amap.
*
* => amap should be locked by caller.
*/
struct vm_anon *
amap_lookup(struct vm_aref *aref, vaddr_t offset)
{
int slot;
struct vm_amap *amap = aref->ar_amap;
struct vm_amap_chunk *chunk;
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
chunk = amap_chunk_get(amap, slot, 0, PR_NOWAIT);
if (chunk == NULL)
return NULL;
return chunk->ac_anon[UVM_AMAP_SLOTIDX(slot)];
}
/*
* amap_lookups: look up a range of pages in an amap.
*
* => amap should be locked by caller.
* => XXXCDC: this interface is biased toward array-based amaps. fix.
*/
void
amap_lookups(struct vm_aref *aref, vaddr_t offset,
struct vm_anon **anons, int npages)
{
int i, lcv, n, slot;
struct vm_amap *amap = aref->ar_amap;
struct vm_amap_chunk *chunk = NULL;
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
	KASSERT((slot + (npages - 1)) < amap->am_nslot);
	for (i = 0, lcv = slot; lcv < slot + npages; i += n, lcv += n) {
n = UVM_AMAP_CHUNK - UVM_AMAP_SLOTIDX(lcv);
if (lcv + n > slot + npages)
n = slot + npages - lcv;
chunk = amap_chunk_get(amap, lcv, 0, PR_NOWAIT);
if (chunk == NULL)
memset(&anons[i], 0, n * sizeof(*anons));
else
memcpy(&anons[i],
&chunk->ac_anon[UVM_AMAP_SLOTIDX(lcv)],
n * sizeof(*anons));
}
}
/*
* amap_populate: ensure that the amap can store an anon for the page at
* offset. This function can sleep until memory to store the anon is
* available.
*/
void
amap_populate(struct vm_aref *aref, vaddr_t offset)
{
int slot;
struct vm_amap *amap = aref->ar_amap;
struct vm_amap_chunk *chunk;
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
chunk = amap_chunk_get(amap, slot, 1, PR_WAITOK);
KASSERT(chunk != NULL);
}
/*
* amap_add: add (or replace) a page to an amap.
*
* => amap should be locked by caller.
* => anon must have the lock associated with this amap.
*/
int
amap_add(struct vm_aref *aref, vaddr_t offset, struct vm_anon *anon,
boolean_t replace)
{
int slot;
struct vm_amap *amap = aref->ar_amap;
struct vm_amap_chunk *chunk;
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
chunk = amap_chunk_get(amap, slot, 1, PR_NOWAIT);
if (chunk == NULL)
return 1;
slot = UVM_AMAP_SLOTIDX(slot);
if (replace) {
struct vm_anon *oanon = chunk->ac_anon[slot];
		KASSERT(oanon != NULL);
		if (oanon->an_page && (amap->am_flags & AMAP_SHARED) != 0) {
pmap_page_protect(oanon->an_page, PROT_NONE);
/*
* XXX: suppose page is supposed to be wired somewhere?
*/
}
} else { /* !replace */
if (chunk->ac_anon[slot] != NULL)
panic("amap_add: slot in use"); chunk->ac_usedmap |= 1 << slot;
amap->am_nused++;
}
chunk->ac_anon[slot] = anon;
return 0;
}
/*
* amap_unadd: remove a page from an amap.
*
* => amap should be locked by caller.
*/
void
amap_unadd(struct vm_aref *aref, vaddr_t offset)
{
struct vm_amap *amap = aref->ar_amap;
struct vm_amap_chunk *chunk;
int slot;
	KASSERT(rw_write_held(amap->am_lock));
	AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
chunk = amap_chunk_get(amap, slot, 0, PR_NOWAIT);
KASSERT(chunk != NULL);
slot = UVM_AMAP_SLOTIDX(slot);
KASSERT(chunk->ac_anon[slot] != NULL);
chunk->ac_anon[slot] = NULL;
chunk->ac_usedmap &= ~(1 << slot);
amap->am_nused--;
if (chunk->ac_usedmap == 0)
amap_chunk_free(amap, chunk);
}
/*
* amap_adjref_anons: adjust the reference count(s) on amap and its anons.
*/
static void
amap_adjref_anons(struct vm_amap *amap, vaddr_t offset, vsize_t len,
int refv, boolean_t all)
{
#ifdef UVM_AMAP_PPREF
KASSERT(rw_write_held(amap->am_lock));
/*
* We must establish the ppref array before changing am_ref
* so that the ppref values match the current amap refcount.
*/
if (amap->am_ppref == NULL && !all && len != amap->am_nslot) {
amap_pp_establish(amap);
}
#endif
amap->am_ref += refv;
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
if (all) {
amap_pp_adjref(amap, 0, amap->am_nslot, refv);
} else {
amap_pp_adjref(amap, offset, len, refv);
}
}
#endif
amap_unlock(amap);
}
/*
* amap_ref: gain a reference to an amap.
*
* => amap must not be locked (we will lock).
* => "offset" and "len" are in units of pages.
* => Called at fork time to gain the child's reference.
*/
void
amap_ref(struct vm_amap *amap, vaddr_t offset, vsize_t len, int flags)
{
amap_lock(amap);
if (flags & AMAP_SHARED)
amap->am_flags |= AMAP_SHARED;
amap_adjref_anons(amap, offset, len, 1, (flags & AMAP_REFALL) != 0);
}
/*
* amap_unref: remove a reference to an amap.
*
* => All pmap-level references to this amap must be already removed.
* => Called from uvm_unmap_detach(); entry is already removed from the map.
* => We will lock amap, so it must be unlocked.
*/
void
amap_unref(struct vm_amap *amap, vaddr_t offset, vsize_t len, boolean_t all)
{
amap_lock(amap);
KASSERT(amap->am_ref > 0);
if (amap->am_ref == 1) {
/*
* If the last reference - wipeout and destroy the amap.
*/
amap->am_ref--;
amap_wipeout(amap);
return;
}
/*
* Otherwise, drop the reference count(s) on anons.
*/
if (amap->am_ref == 2 && (amap->am_flags & AMAP_SHARED) != 0) {
amap->am_flags &= ~AMAP_SHARED;
}
amap_adjref_anons(amap, offset, len, -1, all);
}
/* $OpenBSD: raw_ip6.c,v 1.168 2022/09/03 22:43:38 mvs Exp $ */
/* $KAME: raw_ip6.c,v 1.69 2001/03/04 15:55:44 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.2 (Berkeley) 1/4/94
*/
#include "pf.h"
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#ifdef MROUTING
#include <netinet6/ip6_mroute.h>
#endif
#include <netinet/icmp6.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/raw_ip6.h>
#if NPF > 0
#include <net/pfvar.h>
#endif
#include <sys/stdarg.h>
/*
* Raw interface to IP6 protocol.
*/
struct inpcbtable rawin6pcbtable;
struct cpumem *rip6counters;
const struct pr_usrreqs rip6_usrreqs = {
.pru_attach = rip6_attach,
.pru_detach = rip6_detach,
.pru_bind = rip6_bind,
.pru_connect = rip6_connect,
.pru_disconnect = rip6_disconnect,
.pru_shutdown = rip6_shutdown,
.pru_send = rip6_send,
.pru_abort = rip6_abort,
.pru_control = in6_control,
.pru_sockaddr = in6_sockaddr,
.pru_peeraddr = in6_peeraddr,
};
/*
* Initialize raw connection block queue.
*/
void
rip6_init(void)
{
in_pcbinit(&rawin6pcbtable, 1);
rip6counters = counters_alloc(rip6s_ncounters);
}
int
rip6_input(struct mbuf **mp, int *offp, int proto, int af)
{
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb *in6p;
SIMPLEQ_HEAD(, inpcb) inpcblist;
struct in6_addr *key;
struct sockaddr_in6 rip6src;
uint8_t type;
KASSERT(af == AF_INET6);
if (proto == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp,
sizeof(*icmp6));
if (icmp6 == NULL)
return IPPROTO_DONE;
type = icmp6->icmp6_type;
} else
rip6stat_inc(rip6s_ipackets);
bzero(&rip6src, sizeof(rip6src));
rip6src.sin6_len = sizeof(struct sockaddr_in6);
rip6src.sin6_family = AF_INET6;
/* KAME hack: recover scopeid */
in6_recoverscope(&rip6src, &ip6->ip6_src);
key = &ip6->ip6_dst;
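/*
 * For packets diverted by pf divert-to, match listening sockets
 * against the divert address instead of the packet's destination.
 */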
#if NPF > 0
if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
struct pf_divert *divert;
divert = pf_find_divert(m);
KASSERT(divert != NULL);
switch (divert->type) {
case PF_DIVERT_TO:
key = &divert->addr.v6;
break;
case PF_DIVERT_REPLY:
break;
default:
panic("%s: unknown divert type %d, mbuf %p, divert %p",
__func__, divert->type, m, divert);
}
}
#endif
SIMPLEQ_INIT(&inpcblist);
rw_enter_write(&rawin6pcbtable.inpt_notify);
mtx_enter(&rawin6pcbtable.inpt_mtx);
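/*
 * Collect every raw PCB that matches rdomain, protocol and the
 * address pair; each matching socket later receives its own copy
 * of the packet (the last one gets the original mbuf).
 */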
TAILQ_FOREACH(in6p, &rawin6pcbtable.inpt_queue, inp_queue) {
if (in6p->inp_socket->so_state & SS_CANTRCVMORE)
continue;
if (rtable_l2(in6p->inp_rtableid) !=
rtable_l2(m->m_pkthdr.ph_rtableid))
continue;
if (!(in6p->inp_flags & INP_IPV6))
continue;
if ((in6p->inp_ipv6.ip6_nxt || proto == IPPROTO_ICMPV6) &&
in6p->inp_ipv6.ip6_nxt != proto)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->inp_laddr6) &&
!IN6_ARE_ADDR_EQUAL(&in6p->inp_laddr6, key))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->inp_faddr6) &&
!IN6_ARE_ADDR_EQUAL(&in6p->inp_faddr6, &ip6->ip6_src))
continue;
if (proto == IPPROTO_ICMPV6 && in6p->inp_icmp6filt) {
if (ICMP6_FILTER_WILLBLOCK(type, in6p->inp_icmp6filt))
continue;
}
if (proto != IPPROTO_ICMPV6 && in6p->inp_cksum6 != -1) {
rip6stat_inc(rip6s_isum);
/*
* Although in6_cksum() does not need the position of
* the checksum field for verification, enforce that it
* is located within the packet.  Userland has given a
* checksum offset; a packet too short for that offset is
* invalid.  Avoid overflow with the user-supplied offset.
*/
if (m->m_pkthdr.len < *offp + 2 ||
m->m_pkthdr.len - *offp - 2 < in6p->inp_cksum6 ||
in6_cksum(m, proto, *offp,
m->m_pkthdr.len - *offp)) {
rip6stat_inc(rip6s_badsum);
continue;
}
}
in_pcbref(in6p);
SIMPLEQ_INSERT_TAIL(&inpcblist, in6p, inp_notify);
}
mtx_leave(&rawin6pcbtable.inpt_mtx);
if (SIMPLEQ_EMPTY(&inpcblist)) {
struct counters_ref ref;
uint64_t *counters;
rw_exit_write(&rawin6pcbtable.inpt_notify);
if (proto != IPPROTO_ICMPV6) {
rip6stat_inc(rip6s_nosock);
if (m->m_flags & M_MCAST)
rip6stat_inc(rip6s_nosockmcast);
}
if (proto == IPPROTO_NONE || proto == IPPROTO_ICMPV6) {
m_freem(m);
} else {
int prvnxt = ip6_get_prevhdr(m, *offp);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_NEXTHEADER, prvnxt);
}
counters = counters_enter(&ref, ip6counters);
counters[ip6s_delivered]--;
counters_leave(&ref, ip6counters);
return IPPROTO_DONE;
}
while ((in6p = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
struct mbuf *n, *opts = NULL;
SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
if (SIMPLEQ_EMPTY(&inpcblist))
n = m;
else
n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
if (n != NULL) {
if (in6p->inp_flags & IN6P_CONTROLOPTS)
ip6_savecontrol(in6p, n, &opts);
/* strip intermediate headers */
m_adj(n, *offp);
if (sbappendaddr(in6p->inp_socket,
&in6p->inp_socket->so_rcv,
sin6tosa(&rip6src), n, opts) == 0) {
/* should notify about lost packet */
m_freem(n);
m_freem(opts);
rip6stat_inc(rip6s_fullsock);
} else
sorwakeup(in6p->inp_socket);
}
in_pcbunref(in6p);
}
rw_exit_write(&rawin6pcbtable.inpt_notify);
return IPPROTO_DONE;
}
void
rip6_ctlinput(int cmd, struct sockaddr *sa, u_int rdomain, void *d)
{
struct ip6_hdr *ip6;
struct ip6ctlparam *ip6cp = NULL;
struct sockaddr_in6 *sa6 = satosin6(sa);
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
void (*notify)(struct inpcb *, int) = in_rtchange;
int nxt;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
if (PRC_IS_REDIRECT(cmd))
notify = in_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (inet6ctlerrmap[cmd] == 0)
return;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
ip6 = ip6cp->ip6c_ip6;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
nxt = ip6cp->ip6c_nxt;
} else {
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
nxt = -1;
}
if (ip6 && cmd == PRC_MSGSIZE) {
int valid = 0;
struct inpcb *in6p;
/*
* Check to see if we have a valid raw IPv6 socket
* corresponding to the address in the ICMPv6 message
* payload, and the protocol (ip6_nxt) meets the socket.
* XXX chase extension headers, or pass final nxt value
* from icmp6_notify_error()
*/
in6p = in6_pcblookup(&rawin6pcbtable, &sa6->sin6_addr, 0,
&sa6_src->sin6_addr, 0, rdomain);
if (in6p && in6p->inp_ipv6.ip6_nxt &&
in6p->inp_ipv6.ip6_nxt == nxt)
valid = 1;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
in_pcbunref(in6p);
/*
* Regardless of whether we called icmp6_mtudisc_update(),
* we need to call in6_pcbnotify() to notify the path
* MTU change to userland (2292bis-02), because
* some unconnected sockets may share the same
* destination and want to know the path MTU.
*/
}
in6_pcbnotify(&rawin6pcbtable, sa6, 0,
sa6_src, 0, rdomain, cmd, cmdarg, notify);
}
/*
* Generate IPv6 header and pass packet to ip6_output.
* Tack on options user may have setup with control call.
*/
int
rip6_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr,
struct mbuf *control)
{
struct in6_addr *dst;
struct ip6_hdr *ip6;
struct inpcb *in6p;
u_int plen = m->m_pkthdr.len;
int error = 0;
struct ip6_pktopts opt, *optp = NULL, *origoptp;
int type; /* for ICMPv6 output statistics only */
int priv = 0;
int flags;
in6p = sotoinpcb(so);
priv = 0;
if ((so->so_state & SS_PRIV) != 0)
priv = 1;
if (control) {
if ((error = ip6_setpktopts(control, &opt,
in6p->inp_outputopts6,
priv, so->so_proto->pr_protocol)) != 0)
goto bad;
optp = &opt;
} else
optp = in6p->inp_outputopts6;
if (dstaddr->sa_family != AF_INET6) {
error = EAFNOSUPPORT;
goto bad;
}
dst = &satosin6(dstaddr)->sin6_addr;
if (IN6_IS_ADDR_V4MAPPED(dst)) {
error = EADDRNOTAVAIL;
goto bad;
}
/*
* For an ICMPv6 packet, we should know its type and code
* to update statistics.
*/
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
if (m->m_len < sizeof(struct icmp6_hdr) &&
(m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
error = ENOBUFS;
goto bad;
}
icmp6 = mtod(m, struct icmp6_hdr *);
type = icmp6->icmp6_type;
}
M_PREPEND(m, sizeof(*ip6), M_DONTWAIT);
if (!m) {
error = ENOBUFS;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Next header might not be ICMP6 but use its pseudo header anyway.
*/
ip6->ip6_dst = *dst;
/* KAME hack: embed scopeid */
origoptp = in6p->inp_outputopts6;
in6p->inp_outputopts6 = optp;
if (in6_embedscope(&ip6->ip6_dst, satosin6(dstaddr), in6p) != 0) {
error = EINVAL;
goto bad;
}
in6p->inp_outputopts6 = origoptp;
/*
* Source address selection.
*/
{
struct in6_addr *in6a;
error = in6_pcbselsrc(&in6a, satosin6(dstaddr), in6p, optp);
if (error)
goto bad;
ip6->ip6_src = *in6a;
}
ip6->ip6_flow = in6p->inp_flowinfo & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
#if 0 /* ip6_plen will be filled in ip6_output. */
ip6->ip6_plen = htons((u_short)plen);
#endif
ip6->ip6_nxt = in6p->inp_ipv6.ip6_nxt;
ip6->ip6_hlim = in6_selecthlim(in6p);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
in6p->inp_cksum6 != -1) {
struct mbuf *n;
int off;
u_int16_t *sump;
int sumoff;
/* compute checksum */
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
off = offsetof(struct icmp6_hdr, icmp6_cksum);
else
off = in6p->inp_cksum6;
if (plen < 2 || plen - 2 < off) {
error = EINVAL;
goto bad;
}
off += sizeof(struct ip6_hdr);
n = m_pulldown(m, off, sizeof(*sump), &sumoff);
if (n == NULL) {
m = NULL;
error = ENOBUFS;
goto bad;
}
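/*
 * Zero the checksum field inside the packet, then store the
 * checksum computed over the pseudo-header and payload.
 */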
sump = (u_int16_t *)(mtod(n, caddr_t) + sumoff);
*sump = 0;
*sump = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen);
}
flags = 0;
if (in6p->inp_flags & IN6P_MINMTU)
flags |= IPV6_MINMTU;
/* force routing table */
m->m_pkthdr.ph_rtableid = in6p->inp_rtableid;
#if NPF > 0
if (in6p->inp_socket->so_state & SS_ISCONNECTED &&
so->so_proto->pr_protocol != IPPROTO_ICMPV6)
pf_mbuf_link_inpcb(m, in6p);
#endif
error = ip6_output(m, optp, &in6p->inp_route6, flags,
in6p->inp_moptions6, in6p);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
icmp6stat_inc(icp6s_outhist + type);
} else
rip6stat_inc(rip6s_opackets);
goto freectl;
bad:
m_freem(m);
freectl:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return (error);
}
/*
* Raw IPv6 socket option processing.
*/
int
rip6_ctloutput(int op, struct socket *so, int level, int optname,
struct mbuf *m)
{
#ifdef MROUTING
int error;
#endif
switch (level) {
case IPPROTO_IPV6:
switch (optname) {
#ifdef MROUTING
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
if (op == PRCO_SETOPT) {
error = ip6_mrouter_set(optname, so, m);
} else if (op == PRCO_GETOPT)
error = ip6_mrouter_get(optname, so, m);
else
error = EINVAL;
return (error);
#endif
case IPV6_CHECKSUM:
return (ip6_raw_ctloutput(op, so, level, optname, m));
default:
return (ip6_ctloutput(op, so, level, optname, m));
}
case IPPROTO_ICMPV6:
/*
* XXX: is it better to call icmp6_ctloutput() directly
* from protosw?
*/
return (icmp6_ctloutput(op, so, level, optname, m));
default:
return EINVAL;
}
}
extern u_long rip6_sendspace;
extern u_long rip6_recvspace;
int
rip6_attach(struct socket *so, int proto)
{
struct inpcb *in6p;
int error;
if (so->so_pcb)
panic("%s", __func__); if ((so->so_state & SS_PRIV) == 0)
return (EACCES);
if (proto < 0 || proto >= IPPROTO_MAX)
return EPROTONOSUPPORT;
if ((error = soreserve(so, rip6_sendspace, rip6_recvspace)))
return error;
NET_ASSERT_LOCKED();
if ((error = in_pcballoc(so, &rawin6pcbtable)))
return error;
in6p = sotoinpcb(so);
in6p->inp_ipv6.ip6_nxt = proto;
in6p->inp_cksum6 = -1;
in6p->inp_icmp6filt = malloc(sizeof(struct icmp6_filter),
M_PCB, M_NOWAIT);
if (in6p->inp_icmp6filt == NULL) {
in_pcbdetach(in6p);
return ENOMEM;
}
ICMP6_FILTER_SETPASSALL(in6p->inp_icmp6filt);
return 0;
}
int
rip6_detach(struct socket *so)
{
struct inpcb *in6p = sotoinpcb(so);
soassertlocked(so);
if (in6p == NULL)
panic("%s", __func__);
#ifdef MROUTING
if (so == ip6_mrouter[in6p->inp_rtableid])
ip6_mrouter_done(so);
#endif
free(in6p->inp_icmp6filt, M_PCB, sizeof(struct icmp6_filter));
in6p->inp_icmp6filt = NULL;
in_pcbdetach(in6p);
return (0);
}
int
rip6_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
struct inpcb *in6p = sotoinpcb(so);
struct sockaddr_in6 *addr;
int error;
soassertlocked(so);
if ((error = in6_nam2sin6(nam, &addr)))
return (error);
/*
* Make sure not to enter in_pcblookup_local(); local ports
* are nonsensical for raw sockets.
*/
addr->sin6_port = 0;
if ((error = in6_pcbaddrisavail(in6p, addr, 0, p)))
return (error);
in6p->inp_laddr6 = addr->sin6_addr;
return (0);
}
int
rip6_connect(struct socket *so, struct mbuf *nam)
{
struct inpcb *in6p = sotoinpcb(so);
struct sockaddr_in6 *addr;
struct in6_addr *in6a = NULL;
int error;
soassertlocked(so);
if ((error = in6_nam2sin6(nam, &addr)))
return (error);
/* Source address selection. XXX: need pcblookup? */
error = in6_pcbselsrc(&in6a, addr, in6p, in6p->inp_outputopts6);
if (error)
return (error);
in6p->inp_laddr6 = *in6a;
in6p->inp_faddr6 = addr->sin6_addr;
soisconnected(so);
return (0);
}
int
rip6_disconnect(struct socket *so)
{
struct inpcb *in6p = sotoinpcb(so);
soassertlocked(so);
if ((so->so_state & SS_ISCONNECTED) == 0)
return (ENOTCONN);
in6p->inp_faddr6 = in6addr_any;
so->so_state &= ~SS_ISCONNECTED; /* XXX */
return (0);
}
int
rip6_shutdown(struct socket *so)
{
/*
* Mark the connection as being incapable of further input.
*/
soassertlocked(so);
socantsendmore(so);
return (0);
}
int
rip6_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
struct mbuf *control)
{
struct inpcb *in6p = sotoinpcb(so);
struct sockaddr_in6 dst;
int error;
soassertlocked(so);
/*
* Ship a packet out. The appropriate raw output
* routine handles any messaging necessary.
*/
/* always copy sockaddr to avoid overwrites */
memset(&dst, 0, sizeof(dst));
dst.sin6_family = AF_INET6;
dst.sin6_len = sizeof(dst);
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
error = EISCONN;
goto out;
}
dst.sin6_addr = in6p->inp_faddr6;
} else {
struct sockaddr_in6 *addr6;
if (nam == NULL) {
error = ENOTCONN;
goto out;
}
if ((error = in6_nam2sin6(nam, &addr6)))
goto out;
dst.sin6_addr = addr6->sin6_addr;
dst.sin6_scope_id = addr6->sin6_scope_id;
}
error = rip6_output(m, so, sin6tosa(&dst), control);
control = NULL;
m = NULL;
out:
m_freem(control);
m_freem(m);
return (error);
}
int
rip6_abort(struct socket *so)
{
struct inpcb *in6p = sotoinpcb(so);
soassertlocked(so);
soisdisconnected(so);
#ifdef MROUTING
if (so == ip6_mrouter[in6p->inp_rtableid])
ip6_mrouter_done(so);
#endif
free(in6p->inp_icmp6filt, M_PCB, sizeof(struct icmp6_filter));
in6p->inp_icmp6filt = NULL;
in_pcbdetach(in6p);
return (0);
}
int
rip6_sysctl_rip6stat(void *oldp, size_t *oldplen, void *newp)
{
struct rip6stat rip6stat;
CTASSERT(sizeof(rip6stat) == rip6s_ncounters * sizeof(uint64_t));
counters_read(rip6counters, (uint64_t *)&rip6stat, rip6s_ncounters);
return (sysctl_rdstruct(oldp, oldplen, newp,
&rip6stat, sizeof(rip6stat)));
}
int
rip6_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
/* All sysctl names at this level are terminal. */
if (namelen != 1)
return ENOTDIR;
switch (name[0]) {
case RIPV6CTL_STATS:
return (rip6_sysctl_rip6stat(oldp, oldlenp, newp));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
/* $OpenBSD: ip6_output.c,v 1.271 2022/08/12 17:04:17 bluhm Exp $ */
/* $KAME: ip6_output.c,v 1.172 2001/03/25 09:55:56 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include "pf.h"
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_enc.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <crypto/idgen.h>
#if NPF > 0
#include <net/pfvar.h>
#endif
#ifdef IPSEC
#include <netinet/ip_ipsp.h>
#include <netinet/ip_ah.h>
#include <netinet/ip_esp.h>
#ifdef ENCDEBUG
#define DPRINTF(fmt, args...) \
do { \
if (encdebug) \
printf("%s: " fmt "\n", __func__, ## args); \
} while (0)
#else
#define DPRINTF(fmt, args...) \
do { } while (0)
#endif
#endif /* IPSEC */
struct ip6_exthdrs {
struct mbuf *ip6e_ip6;
struct mbuf *ip6e_hbh;
struct mbuf *ip6e_dest1;
struct mbuf *ip6e_rthdr;
struct mbuf *ip6e_dest2;
};
int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, int, int);
int ip6_getpcbopt(struct ip6_pktopts *, int, struct mbuf *);
int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, int, int, int);
int ip6_setmoptions(int, struct ip6_moptions **, struct mbuf *, unsigned int);
int ip6_getmoptions(int, struct ip6_moptions *, struct mbuf *);
int ip6_copyexthdr(struct mbuf **, caddr_t, int);
int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
struct ip6_frag **);
int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
int ip6_getpmtu(struct rtentry *, struct ifnet *, u_long *);
int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *);
static __inline u_int16_t __attribute__((__unused__))
in6_cksum_phdr(const struct in6_addr *, const struct in6_addr *,
u_int32_t, u_int32_t);
void in6_delayed_cksum(struct mbuf *, u_int8_t);
int ip6_output_ipsec_pmtu_update(struct tdb *, struct route_in6 *,
struct in6_addr *, int, int, int);
/* Context for non-repeating IDs */
struct idgen32_ctx ip6_id_ctx;
/*
* IP6 output. The packet in mbuf chain m contains a skeletal IP6
* header (with pri, len, nxt, hlim, src, dst).
* This function may modify ver and hlim only.
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
*
* type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and
* nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one,
* which is rt_mtu.
*/
int
ip6_output(struct mbuf *m, struct ip6_pktopts *opt, struct route_in6 *ro,
int flags, struct ip6_moptions *im6o, struct inpcb *inp)
{
struct ip6_hdr *ip6;
struct ifnet *ifp = NULL;
struct mbuf_list fml;
int hlen, tlen;
struct route_in6 ip6route;
struct rtentry *rt = NULL;
struct sockaddr_in6 *dst, dstsock;
int error = 0;
u_long mtu;
int dontfrag;
u_int16_t src_scope, dst_scope;
u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
struct ip6_exthdrs exthdrs;
struct in6_addr finaldst;
struct route_in6 *ro_pmtu = NULL;
int hdrsplit = 0;
u_int8_t sproto = 0;
u_char nextproto;
#ifdef IPSEC
struct tdb *tdb = NULL;
#endif /* IPSEC */
#ifdef IPSEC
if (inp && (inp->inp_flags & INP_IPV6) == 0)
panic("%s: IPv4 pcb is passed", __func__);
#endif /* IPSEC */
ip6 = mtod(m, struct ip6_hdr *);
finaldst = ip6->ip6_dst;
#define MAKE_EXTHDR(hp, mp) \
do { \
if (hp) { \
struct ip6_ext *eh = (struct ip6_ext *)(hp); \
error = ip6_copyexthdr((mp), (caddr_t)(hp), \
((eh)->ip6e_len + 1) << 3); \
if (error) \
goto freehdrs; \
} \
} while (0)
bzero(&exthdrs, sizeof(exthdrs));
if (opt) {
/* Hop-by-Hop options header */
MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
/* Destination options header(1st part) */
MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
/* Routing header */
MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
/* Destination options header(2nd part) */
MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
}
#ifdef IPSEC
if (ipsec_in_use || inp != NULL) {
error = ip6_output_ipsec_lookup(m, inp, &tdb);
if (error) {
/*
* -EINVAL is used to indicate that the packet should
* be silently dropped, typically because we've asked
* key management for an SA.
*/
if (error == -EINVAL) /* Should silently drop packet */
error = 0;
goto freehdrs;
}
}
#endif /* IPSEC */
/*
* Calculate the total length of the extension header chain.
* Keep the length of the unfragmentable part for fragmentation.
*/
optlen = 0;
if (exthdrs.ip6e_hbh)
optlen += exthdrs.ip6e_hbh->m_len;
if (exthdrs.ip6e_dest1)
optlen += exthdrs.ip6e_dest1->m_len;
if (exthdrs.ip6e_rthdr)
optlen += exthdrs.ip6e_rthdr->m_len;
unfragpartlen = optlen + sizeof(struct ip6_hdr);
/* NOTE: we don't add AH/ESP length here. do that later. */
if (exthdrs.ip6e_dest2)
optlen += exthdrs.ip6e_dest2->m_len;
/*
* If we need IPsec, or there is at least one extension header,
* separate IP6 header from the payload.
*/
if ((sproto || optlen) && !hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/* adjust mbuf packet header length */
m->m_pkthdr.len += optlen;
plen = m->m_pkthdr.len - sizeof(*ip6);
/* If this is a jumbo payload, insert a jumbo payload option. */
if (plen > IPV6_MAXPACKET) {
if (!hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
goto freehdrs;
ip6->ip6_plen = 0;
} else
ip6->ip6_plen = htons(plen);
/*
* Concatenate headers and fill in next header fields.
* Here we have, on "m"
* IPv6 payload
* and we insert headers accordingly. Finally, we should be getting:
* IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
*
* during the header composing process, "m" points to IPv6 header.
* "mprev" points to an extension header prior to esp.
*/
{
u_char *nexthdrp = &ip6->ip6_nxt;
struct mbuf *mprev = m;
/*
* we treat dest2 specially. this makes IPsec processing
* much easier. the goal here is to make mprev point to the
* mbuf prior to dest2.
*
* result: IPv6 dest2 payload
* m and mprev will point to IPv6 header.
*/
if (exthdrs.ip6e_dest2) {
if (!hdrsplit)
panic("%s: assumption failed: hdr not split",
__func__);
exthdrs.ip6e_dest2->m_next = m->m_next;
m->m_next = exthdrs.ip6e_dest2;
*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_DSTOPTS;
}
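/*
 * MAKE_CHAIN links extension header mbuf "m" after "mp", stores the
 * previous next-header value at the start of "m", writes protocol "i"
 * into the field that "p" points to, then advances "p" and "mp".
 */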
#define MAKE_CHAIN(m, mp, p, i)\
do {\
if (m) {\
if (!hdrsplit) \
panic("assumption failed: hdr not split"); \
*mtod((m), u_char *) = *(p);\
*(p) = (i);\
p = mtod((m), u_char *);\
(m)->m_next = (mp)->m_next;\
(mp)->m_next = (m);\
(mp) = (m);\
}\
} while (0)
/*
* result: IPv6 hbh dest1 rthdr dest2 payload
* m will point to IPv6 header. mprev will point to the
* extension header prior to dest2 (rthdr in the above case).
*/
MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
IPPROTO_DSTOPTS);
MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
IPPROTO_ROUTING);
}
/*
* If there is a routing header, replace the destination address field
* with the first hop of the routing header.
*/
if (exthdrs.ip6e_rthdr) {
struct ip6_rthdr *rh;
struct ip6_rthdr0 *rh0;
struct in6_addr *addr;
rh = (struct ip6_rthdr *)(mtod(exthdrs.ip6e_rthdr,
struct ip6_rthdr *));
switch (rh->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
rh0 = (struct ip6_rthdr0 *)rh;
addr = (struct in6_addr *)(rh0 + 1);
ip6->ip6_dst = addr[0];
bcopy(&addr[1], &addr[0],
sizeof(struct in6_addr) * (rh0->ip6r0_segleft - 1));
addr[rh0->ip6r0_segleft - 1] = finaldst;
break;
default: /* is it possible? */
error = EINVAL;
goto bad;
}
}
/* Source address validation */
if (!(flags & IPV6_UNSPECSRC) &&
IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
/*
* XXX: we can probably assume validation in the caller, but
* we explicitly check the address here for safety.
*/
error = EOPNOTSUPP;
ip6stat_inc(ip6s_badscope);
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
error = EOPNOTSUPP;
ip6stat_inc(ip6s_badscope);
goto bad;
}
ip6stat_inc(ip6s_localout);
/*
* Route packet.
*/
#if NPF > 0
reroute:
#endif
/* initialize cached route */
if (ro == NULL) {
ro = &ip6route;
bzero((caddr_t)ro, sizeof(*ro));
}
ro_pmtu = ro;
if (opt && opt->ip6po_rthdr)
ro = &opt->ip6po_route;
dst = &ro->ro_dst;
/*
* if specified, try to fill in the traffic class field.
* do not override if a non-zero value is already set.
* we check the diffserv field and the ecn field separately.
*/
if (opt && opt->ip6po_tclass >= 0) {
int mask = 0;
if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
mask |= 0xfc;
if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
mask |= 0x03;
if (mask != 0)
ip6->ip6_flow |=
htonl((opt->ip6po_tclass & mask) << 20);
}
/* fill in or override the hop limit field, if necessary. */
if (opt && opt->ip6po_hlim != -1)
ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
if (im6o != NULL)
ip6->ip6_hlim = im6o->im6o_hlim;
else
ip6->ip6_hlim = ip6_defmcasthlim;
}
#ifdef IPSEC
if (tdb != NULL) {
/*
* XXX what should we do if ip6_hlim == 0 and the
* packet gets tunneled?
*/
/*
* if we are source-routing, do not attempt to tunnel the
* packet just because ip6_dst is different from what tdb has.
* XXX
*/
error = ip6_output_ipsec_send(tdb, m, ro,
exthdrs.ip6e_rthdr ? 1 : 0, 0);
goto done;
}
#endif /* IPSEC */
bzero(&dstsock, sizeof(dstsock));
dstsock.sin6_family = AF_INET6;
dstsock.sin6_addr = ip6->ip6_dst;
dstsock.sin6_len = sizeof(dstsock);
ro->ro_tableid = m->m_pkthdr.ph_rtableid;
if (IN6_IS_ADDR_MULTICAST(&dstsock.sin6_addr)) {
struct in6_pktinfo *pi = NULL;
/*
* If the caller specifies the outgoing interface
* explicitly, use it.
*/
if (opt != NULL && (pi = opt->ip6po_pktinfo) != NULL)
ifp = if_get(pi->ipi6_ifindex);
if (ifp == NULL && im6o != NULL)
ifp = if_get(im6o->im6o_ifidx);
}
if (ifp == NULL) {
rt = in6_selectroute(&dstsock, opt, ro, ro->ro_tableid);
if (rt == NULL) {
ip6stat_inc(ip6s_noroute);
error = EHOSTUNREACH;
goto bad;
}
if (ISSET(rt->rt_flags, RTF_LOCAL))
ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid));
else
ifp = if_get(rt->rt_ifidx);
/*
* We aren't using rtisvalid() here because the UP/DOWN state
* machine is broken with some Ethernet drivers like em(4).
* As a result we might try to use an invalid cached route
* entry while an interface is being detached.
*/
if (ifp == NULL) {
ip6stat_inc(ip6s_noroute);
error = EHOSTUNREACH;
goto bad;
}
} else {
*dst = dstsock;
}
if (rt && (rt->rt_flags & RTF_GATEWAY) &&
!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
dst = satosin6(rt->rt_gateway);
if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/* Unicast */
m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
} else {
/* Multicast */
m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
/*
* Confirm that the outgoing interface supports multicast.
*/
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
ip6stat_inc(ip6s_noroute);
error = ENETUNREACH;
goto bad;
}
if ((im6o == NULL || im6o->im6o_loop) &&
in6_hasmulti(&ip6->ip6_dst, ifp)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
* Can't defer TCP/UDP checksumming, do the
* computation now.
*/
in6_proto_cksum_out(m, NULL);
ip6_mloopback(ifp, m, dst);
}
#ifdef MROUTING
else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IPV6_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip6_mloopback(),
* above, will be forwarded by the ip6_input() routine,
* if necessary.
*/
if (ip6_mforwarding && ip6_mrouter[ifp->if_rdomain] &&
(flags & IPV6_FORWARDING) == 0) {
if (ip6_mforward(ip6, ifp, m) != 0) {
m_freem(m);
goto done;
}
}
}
#endif
/*
* Multicasts with a hoplimit of zero may be looped back,
* above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip6_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
m_freem(m);
goto done;
}
}
/*
* If this packet is going through a loopback interface we won't
* be able to restore its scope ID using the interface index.
*/
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
if (ifp->if_flags & IFF_LOOPBACK)
src_scope = ip6->ip6_src.s6_addr16[1];
ip6->ip6_src.s6_addr16[1] = 0;
}
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
if (ifp->if_flags & IFF_LOOPBACK)
dst_scope = ip6->ip6_dst.s6_addr16[1];
ip6->ip6_dst.s6_addr16[1] = 0;
}
/* Determine path MTU. */
if ((error = ip6_getpmtu(ro_pmtu->ro_rt, ifp, &mtu)) != 0)
goto bad;
/*
* The caller of this function may specify to use the minimum MTU
* in some cases.
* An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
* setting. The logic is a bit complicated; by default, unicast
* packets will follow path MTU while multicast packets will be sent at
* the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets
* including unicast ones will be sent at the minimum MTU. Multicast
* packets will always be sent at the minimum MTU unless
* IP6PO_MINMTU_DISABLE is explicitly specified.
* See RFC 3542 for more details.
*/
if (mtu > IPV6_MMTU) {
if ((flags & IPV6_MINMTU))
mtu = IPV6_MMTU;
else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
mtu = IPV6_MMTU;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL ||
opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
mtu = IPV6_MMTU;
}
}
/*
* If the outgoing packet contains a hop-by-hop options header,
* it must be examined and processed even by the source node.
* (RFC 2460, section 4.)
*/
if (exthdrs.ip6e_hbh) {
struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
u_int32_t rtalert; /* returned value is ignored */
u_int32_t plen = 0; /* no more than 1 jumbo payload option! */
m->m_pkthdr.ph_ifidx = ifp->if_index;
if (ip6_process_hopopts(&m, (u_int8_t *)(hbh + 1),
((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
&rtalert, &plen) < 0) {
/* m was already freed at this point */
error = EINVAL;/* better error? */
goto done;
}
m->m_pkthdr.ph_ifidx = 0;
}
#if NPF > 0
if (pf_test(AF_INET6, PF_OUT, ifp, &m) != PF_PASS) {
error = EACCES;
m_freem(m);
goto done;
}
if (m == NULL)
goto done;
ip6 = mtod(m, struct ip6_hdr *);
if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) ==
(PF_TAG_REROUTE | PF_TAG_GENERATED)) {
/* already rerun the route lookup, go on */
m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE);
} else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) {
/* tag as generated to skip over pf_test on rerun */
m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
finaldst = ip6->ip6_dst;
ro = NULL;
if_put(ifp); /* drop reference since destination changed */
ifp = NULL;
goto reroute;
}
#endif
/*
* If the packet is not going on the wire it can be destined
* to any local address. In this case do not clear its scopes
* to let ip6_input() find a matching local route.
*/
if (ifp->if_flags & IFF_LOOPBACK) {
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
ip6->ip6_src.s6_addr16[1] = src_scope;
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
ip6->ip6_dst.s6_addr16[1] = dst_scope;
}
in6_proto_cksum_out(m, ifp);
/*
* Send the packet to the outgoing interface.
* If necessary, do IPv6 fragmentation before sending.
*
* the logic here is rather complex:
* 1: normal case (dontfrag == 0)
* 1-a: send as is if tlen <= path mtu
* 1-b: fragment if tlen > path mtu
*
* 2: if user asks us not to fragment (dontfrag == 1)
* 2-a: send as is if tlen <= interface mtu
* 2-b: error if tlen > interface mtu
*/
tlen = m->m_pkthdr.len;
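/*
 * A "don't fragment" request may come either from the mbuf checksum
 * flags (M_IPV6_DF_OUT) or from the IP6PO_DONTFRAG packet option.
 */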
if (ISSET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT)) {
CLR(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
dontfrag = 1;
} else if (opt && ISSET(opt->ip6po_flags, IP6PO_DONTFRAG))
dontfrag = 1;
else
dontfrag = 0;
if (dontfrag && tlen > ifp->if_mtu) { /* case 2-b */
#ifdef IPSEC
if (ip_mtudisc)
ipsec_adjust_mtu(m, mtu);
#endif
error = EMSGSIZE;
goto bad;
}
/*
* transmit packet without fragmentation
*/
if (dontfrag || (tlen <= mtu)) { /* case 1-a and 2-a */
error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt);
goto done;
}
/*
* try to fragment the packet. case 1-b
*/
if (mtu < IPV6_MMTU) {
/* path MTU cannot be less than IPV6_MMTU */
error = EMSGSIZE;
goto bad;
} else if (ip6->ip6_plen == 0) {
/* jumbo payload cannot be fragmented */
error = EMSGSIZE;
goto bad;
}
/*
* Too large for the destination or interface;
* fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
hlen = unfragpartlen;
if (mtu > IPV6_MAXPACKET)
mtu = IPV6_MAXPACKET;
/*
* If we are doing fragmentation, we can't defer TCP/UDP
* checksumming; compute the checksum and clear the flag.
*/
in6_proto_cksum_out(m, NULL);
/*
* Change the next header field of the last header in the
* unfragmentable part.
*/
if (exthdrs.ip6e_rthdr) {
nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_dest1) {
nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_hbh) {
nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
} else {
nextproto = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_FRAGMENT;
}
error = ip6_fragment(m, &fml, hlen, nextproto, mtu);
if (error)
goto done;
while ((m = ml_dequeue(&fml)) != NULL) {
error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt);
if (error)
break;
}
if (error)
ml_purge(&fml);
else
ip6stat_inc(ip6s_fragmented);
done:
if (ro == &ip6route && ro->ro_rt) {
rtfree(ro->ro_rt);
} else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
rtfree(ro_pmtu->ro_rt);
}
if_put(ifp);
#ifdef IPSEC
tdb_unref(tdb);
#endif /* IPSEC */
return (error);
freehdrs:
m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */
m_freem(exthdrs.ip6e_dest1);
m_freem(exthdrs.ip6e_rthdr);
m_freem(exthdrs.ip6e_dest2);
/* FALLTHROUGH */
bad:
m_freem(m);
goto done;
}
int
ip6_fragment(struct mbuf *m0, struct mbuf_list *fml, int hlen,
u_char nextproto, u_long mtu)
{
struct mbuf *m;
struct ip6_hdr *ip6;
u_int32_t id;
int tlen, len, off;
int error;
ml_init(fml);
ip6 = mtod(m0, struct ip6_hdr *);
tlen = m0->m_pkthdr.len;
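/*
 * The payload carried by each fragment must be a multiple of 8 bytes;
 * round the per-fragment data length down accordingly.
 */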
len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
if (len < 8) {
error = EMSGSIZE;
goto bad;
}
id = htonl(ip6_randomid());
/*
* Loop through length of segment,
* make new header and copy data of each part and link onto chain.
*/
for (off = hlen; off < tlen; off += len) {
struct mbuf *mlast;
struct ip6_hdr *mhip6;
struct ip6_frag *ip6f;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
ml_enqueue(fml, m);
if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0)
goto bad;
m->m_data += max_linkhdr;
mhip6 = mtod(m, struct ip6_hdr *);
*mhip6 = *ip6;
m->m_len = sizeof(struct ip6_hdr);
if ((error = ip6_insertfraghdr(m0, m, hlen, &ip6f)) != 0)
goto bad;
ip6f->ip6f_offlg = htons((off - hlen) & ~7);
if (off + len >= tlen)
len = tlen - off;
else
ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
m->m_pkthdr.len = hlen + sizeof(struct ip6_frag) + len;
mhip6->ip6_plen = htons(m->m_pkthdr.len -
sizeof(struct ip6_hdr));
for (mlast = m; mlast->m_next; mlast = mlast->m_next)
;
mlast->m_next = m_copym(m0, off, len, M_DONTWAIT);
if (mlast->m_next == NULL) {
error = ENOBUFS;
goto bad;
}
ip6f->ip6f_reserved = 0;
ip6f->ip6f_ident = id;
ip6f->ip6f_nxt = nextproto;
}
ip6stat_add(ip6s_ofragments, ml_len(fml));
m_freem(m0);
return (0);
bad:
ip6stat_inc(ip6s_odropped);
ml_purge(fml);
m_freem(m0);
return (error);
}
int
ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
{
struct mbuf *m;
if (hlen > MCLBYTES)
return (ENOBUFS); /* XXX */
MGET(m, M_DONTWAIT, MT_DATA);
if (!m)
return (ENOBUFS);
if (hlen > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return (ENOBUFS);
}
}
m->m_len = hlen;
if (hdr)
memcpy(mtod(m, caddr_t), hdr, hlen);
*mp = m;
return (0);
}
/*
* Insert jumbo payload option.
*/
int
ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
{
struct mbuf *mopt;
u_int8_t *optbuf;
u_int32_t v;
#define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */
/*
* If there is no hop-by-hop options header, allocate new one.
* If there is one but it doesn't have enough space to store the
* jumbo payload option, allocate a cluster to store the whole options.
* Otherwise, use it to store the options.
*/
if (exthdrs->ip6e_hbh == 0) {
MGET(mopt, M_DONTWAIT, MT_DATA);
if (mopt == NULL)
return (ENOBUFS);
mopt->m_len = JUMBOOPTLEN;
optbuf = mtod(mopt, u_int8_t *);
optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */
exthdrs->ip6e_hbh = mopt;
} else {
struct ip6_hbh *hbh;
mopt = exthdrs->ip6e_hbh;
if (m_trailingspace(mopt) < JUMBOOPTLEN) {
/*
* XXX assumption:
* - exthdrs->ip6e_hbh is not referenced from places
* other than exthdrs.
* - exthdrs->ip6e_hbh is not an mbuf chain.
*/
int oldoptlen = mopt->m_len;
struct mbuf *n;
/*
* XXX: give up if the whole (new) hbh header does
* not fit even in an mbuf cluster.
*/
if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
return (ENOBUFS);
/*
* As a consequence, we must always prepare a cluster
* at this point.
*/
MGET(n, M_DONTWAIT, MT_DATA);
if (n) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n)
return (ENOBUFS);
n->m_len = oldoptlen + JUMBOOPTLEN;
memcpy(mtod(n, caddr_t), mtod(mopt, caddr_t),
oldoptlen);
optbuf = mtod(n, u_int8_t *) + oldoptlen;
m_freem(mopt);
mopt = exthdrs->ip6e_hbh = n;
} else {
optbuf = mtod(mopt, u_int8_t *) + mopt->m_len;
mopt->m_len += JUMBOOPTLEN;
}
optbuf[0] = IP6OPT_PADN;
optbuf[1] = 0;
/*
* Adjust the header length according to the pad and
* the jumbo payload option.
*/
hbh = mtod(mopt, struct ip6_hbh *);
hbh->ip6h_len += (JUMBOOPTLEN >> 3);
}
/* fill in the option. */
optbuf[2] = IP6OPT_JUMBO;
optbuf[3] = 4;
v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
memcpy(&optbuf[4], &v, sizeof(u_int32_t));
/* finally, adjust the packet header length */
exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
return (0);
#undef JUMBOOPTLEN
}
/*
* Insert fragment header and copy unfragmentable header portions.
*/
int
ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
struct ip6_frag **frghdrp)
{
struct mbuf *n, *mlast;
if (hlen > sizeof(struct ip6_hdr)) {
n = m_copym(m0, sizeof(struct ip6_hdr),
hlen - sizeof(struct ip6_hdr), M_DONTWAIT);
if (n == NULL)
return (ENOBUFS);
m->m_next = n;
} else
n = m;
/* Search for the last mbuf of unfragmentable part. */
for (mlast = n; mlast->m_next; mlast = mlast->m_next)
;
if ((mlast->m_flags & M_EXT) == 0 &&
m_trailingspace(mlast) >= sizeof(struct ip6_frag)) {
/* use the trailing space of the last mbuf for fragment hdr */
*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
mlast->m_len);
mlast->m_len += sizeof(struct ip6_frag);
m->m_pkthdr.len += sizeof(struct ip6_frag);
} else {
/* allocate a new mbuf for the fragment header */
struct mbuf *mfrg;
MGET(mfrg, M_DONTWAIT, MT_DATA);
if (mfrg == NULL)
return (ENOBUFS);
mfrg->m_len = sizeof(struct ip6_frag);
*frghdrp = mtod(mfrg, struct ip6_frag *);
mlast->m_next = mfrg;
}
return (0);
}
int
ip6_getpmtu(struct rtentry *rt, struct ifnet *ifp, u_long *mtup)
{
u_int32_t mtu = 0;
int error = 0;
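/*
 * Use the route MTU if set, otherwise the interface MTU; never go
 * below IPV6_MMTU and never above the interface MTU.
 */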
if (rt != NULL) {
mtu = rt->rt_mtu;
if (mtu == 0)
mtu = ifp->if_mtu;
else if (mtu < IPV6_MMTU) {
/* RFC8021 IPv6 Atomic Fragments Considered Harmful */
mtu = IPV6_MMTU;
} else if (mtu > ifp->if_mtu) {
/*
* The MTU on the route is larger than the MTU on
* the interface! This shouldn't happen, unless the
* MTU of the interface has been changed after the
* interface was brought up. Change the MTU in the
* route to match the interface MTU (as long as the
* field isn't locked).
*/
mtu = ifp->if_mtu;
if (!(rt->rt_locks & RTV_MTU))
rt->rt_mtu = mtu;
}
} else {
mtu = ifp->if_mtu;
}
*mtup = mtu;
return (error);
}
/*
* IP6 socket option processing.
*/
int
ip6_ctloutput(int op, struct socket *so, int level, int optname,
struct mbuf *m)
{
int privileged, optdatalen, uproto;
void *optdata;
struct inpcb *inp = sotoinpcb(so);
int error, optval;
struct proc *p = curproc; /* For IPsec and rdomain */
u_int rtableid, rtid = 0;
error = optval = 0;
privileged = (inp->inp_socket->so_state & SS_PRIV);
uproto = (int)so->so_proto->pr_protocol;
if (level != IPPROTO_IPV6)
return (EINVAL);
rtableid = p->p_p->ps_rtableid;
switch (op) {
case PRCO_SETOPT:
switch (optname) {
/*
* Use of some Hop-by-Hop options or some
* Destination options, might require special
* privilege. That is, normal applications
* (without special privilege) might be forbidden
* from setting certain options in outgoing packets,
* and might never see certain options in received
* packets. [RFC 2292 Section 6]
* KAME specific note:
* KAME prevents non-privileged users from sending or
* receiving ANY hbh/dst options in order to avoid
* overhead of parsing options in the kernel.
*/
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
if (!privileged) {
error = EPERM;
break;
}
/* FALLTHROUGH */
case IPV6_UNICAST_HOPS:
case IPV6_MINHOPCOUNT:
case IPV6_HOPLIMIT:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_RECVTCLASS:
case IPV6_V6ONLY:
case IPV6_AUTOFLOWLABEL:
case IPV6_RECVDSTPORT:
if (m == NULL || m->m_len != sizeof(int)) {
error = EINVAL;
break;
}
optval = *mtod(m, int *);
switch (optname) {
case IPV6_UNICAST_HOPS:
if (optval < -1 || optval >= 256)
error = EINVAL;
else {
/* -1 = kernel default */
inp->inp_hops = optval;
}
break;
case IPV6_MINHOPCOUNT:
if (optval < 0 || optval > 255)
error = EINVAL;
else
inp->inp_ip6_minhlim = optval;
break;
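/*
 * OPTSET sets or clears the given flag bit in inp_flags based on
 * optval; OPTBIT reports whether the bit is currently set.
 */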
#define OPTSET(bit) \
do { \
if (optval) \
inp->inp_flags |= (bit); \
else \
inp->inp_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0)
case IPV6_RECVPKTINFO:
OPTSET(IN6P_PKTINFO);
break;
case IPV6_HOPLIMIT:
{
struct ip6_pktopts **optp;
optp = &inp->inp_outputopts6;
error = ip6_pcbopt(IPV6_HOPLIMIT,
(u_char *)&optval, sizeof(optval), optp,
privileged, uproto);
break;
}
case IPV6_RECVHOPLIMIT:
OPTSET(IN6P_HOPLIMIT);
break;
case IPV6_RECVHOPOPTS:
OPTSET(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
OPTSET(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDR:
OPTSET(IN6P_RTHDR);
break;
case IPV6_RECVPATHMTU:
/*
* We ignore this option for TCP
* sockets.
* (RFC3542 leaves this case
* unspecified.)
*/
if (uproto != IPPROTO_TCP)
OPTSET(IN6P_MTU);
break;
case IPV6_V6ONLY:
/*
* make setsockopt(IPV6_V6ONLY)
* available only prior to bind(2).
* see ipng mailing list, Jun 22 2001.
*/
if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(
&inp->inp_laddr6)) {
error = EINVAL;
break;
}
/* No support for IPv4-mapped addresses. */
if (!optval)
error = EINVAL;
else
error = 0;
break;
case IPV6_RECVTCLASS:
OPTSET(IN6P_TCLASS);
break;
case IPV6_AUTOFLOWLABEL:
OPTSET(IN6P_AUTOFLOWLABEL);
break;
case IPV6_RECVDSTPORT:
OPTSET(IN6P_RECVDSTPORT);
break;
}
break;
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
if (m == NULL || m->m_len != sizeof(optval)) {
error = EINVAL;
break;
}
optval = *mtod(m, int *);
{
struct ip6_pktopts **optp;
optp = &inp->inp_outputopts6;
error = ip6_pcbopt(optname, (u_char *)&optval,
sizeof(optval), optp, privileged, uproto);
break;
}
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
{
/* new advanced API (RFC3542) */
u_char *optbuf;
int optbuflen;
struct ip6_pktopts **optp;
if (m && m->m_next) {
error = EINVAL; /* XXX */
break;
}
if (m) {
optbuf = mtod(m, u_char *);
optbuflen = m->m_len;
} else {
optbuf = NULL;
optbuflen = 0;
}
optp = &inp->inp_outputopts6;
error = ip6_pcbopt(optname, optbuf, optbuflen, optp,
privileged, uproto);
break;
}
#undef OPTSET
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
error = ip6_setmoptions(optname,
&inp->inp_moptions6,
m, inp->inp_rtableid);
break;
case IPV6_PORTRANGE:
if (m == NULL || m->m_len != sizeof(int)) {
error = EINVAL;
break;
}
optval = *mtod(m, int *);
switch (optval) {
case IPV6_PORTRANGE_DEFAULT:
inp->inp_flags &= ~(IN6P_LOWPORT);
inp->inp_flags &= ~(IN6P_HIGHPORT);
break;
case IPV6_PORTRANGE_HIGH:
inp->inp_flags &= ~(IN6P_LOWPORT);
inp->inp_flags |= IN6P_HIGHPORT;
break;
case IPV6_PORTRANGE_LOW:
inp->inp_flags &= ~(IN6P_HIGHPORT);
inp->inp_flags |= IN6P_LOWPORT;
break;
default:
error = EINVAL;
break;
}
break;
case IPSEC6_OUTSA:
error = EINVAL;
break;
case IPV6_AUTH_LEVEL:
case IPV6_ESP_TRANS_LEVEL:
case IPV6_ESP_NETWORK_LEVEL:
case IPV6_IPCOMP_LEVEL:
#ifndef IPSEC
error = EINVAL;
#else
if (m == NULL || m->m_len != sizeof(int)) {
error = EINVAL;
break;
}
optval = *mtod(m, int *);
if (optval < IPSEC_LEVEL_BYPASS ||
optval > IPSEC_LEVEL_UNIQUE) {
error = EINVAL;
break;
}
switch (optname) {
case IPV6_AUTH_LEVEL:
if (optval < IPSEC_AUTH_LEVEL_DEFAULT &&
suser(p)) {
error = EACCES;
break;
}
inp->inp_seclevel[SL_AUTH] = optval;
break;
case IPV6_ESP_TRANS_LEVEL:
if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT &&
suser(p)) {
error = EACCES;
break;
}
inp->inp_seclevel[SL_ESP_TRANS] = optval;
break;
case IPV6_ESP_NETWORK_LEVEL:
if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT &&
suser(p)) {
error = EACCES;
break;
}
inp->inp_seclevel[SL_ESP_NETWORK] = optval;
break;
case IPV6_IPCOMP_LEVEL:
if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT &&
suser(p)) {
error = EACCES;
break;
}
inp->inp_seclevel[SL_IPCOMP] = optval;
break;
}
#endif
break;
case SO_RTABLE:
if (m == NULL || m->m_len < sizeof(u_int)) {
error = EINVAL;
break;
}
rtid = *mtod(m, u_int *);
if (inp->inp_rtableid == rtid)
break;
/* needs privileges to switch when already set */
if (rtableid != rtid && rtableid != 0 &&
(error = suser(p)) != 0)
break;
/* table must exist */
if (!rtable_exists(rtid)) {
error = EINVAL;
break;
}
if (inp->inp_lport) {
error = EBUSY;
break;
}
inp->inp_rtableid = rtid;
in_pcbrehash(inp);
break;
case IPV6_PIPEX:
if (m != NULL && m->m_len == sizeof(int))
inp->inp_pipex = *mtod(m, int *);
else
error = EINVAL;
break;
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (optname) {
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_UNICAST_HOPS:
case IPV6_MINHOPCOUNT:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_V6ONLY:
case IPV6_PORTRANGE:
case IPV6_RECVTCLASS:
case IPV6_AUTOFLOWLABEL:
case IPV6_RECVDSTPORT:
switch (optname) {
case IPV6_RECVHOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS);
break;
case IPV6_UNICAST_HOPS:
optval = inp->inp_hops;
break;
case IPV6_MINHOPCOUNT:
optval = inp->inp_ip6_minhlim;
break;
case IPV6_RECVPKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_RECVHOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_RECVRTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_RECVPATHMTU:
optval = OPTBIT(IN6P_MTU);
break;
case IPV6_V6ONLY:
optval = 1;
break;
case IPV6_PORTRANGE:
{
int flags;
flags = inp->inp_flags;
if (flags & IN6P_HIGHPORT)
optval = IPV6_PORTRANGE_HIGH;
else if (flags & IN6P_LOWPORT)
optval = IPV6_PORTRANGE_LOW;
else
optval = 0;
break;
}
case IPV6_RECVTCLASS:
optval = OPTBIT(IN6P_TCLASS);
break;
case IPV6_AUTOFLOWLABEL:
optval = OPTBIT(IN6P_AUTOFLOWLABEL);
break;
case IPV6_RECVDSTPORT:
optval = OPTBIT(IN6P_RECVDSTPORT);
break;
}
if (error)
break;
m->m_len = sizeof(int);
*mtod(m, int *) = optval;
break;
case IPV6_PATHMTU:
{
u_long pmtu = 0;
struct ip6_mtuinfo mtuinfo;
struct ifnet *ifp;
struct rtentry *rt;
if (!(so->so_state & SS_ISCONNECTED))
return (ENOTCONN);
rt = in_pcbrtentry(inp);
if (!rtisvalid(rt))
return (EHOSTUNREACH);
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL)
return (EHOSTUNREACH);
/*
* XXX: we do not consider the case of source
* routing, or optional information to specify
* the outgoing interface.
*/
error = ip6_getpmtu(rt, ifp, &pmtu);
if_put(ifp);
if (error)
break;
if (pmtu > IPV6_MAXPACKET)
pmtu = IPV6_MAXPACKET;
bzero(&mtuinfo, sizeof(mtuinfo));
mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
optdata = (void *)&mtuinfo;
optdatalen = sizeof(mtuinfo);
if (optdatalen > MCLBYTES)
return (EMSGSIZE); /* XXX */
if (optdatalen > MLEN)
MCLGET(m, M_WAIT);
m->m_len = optdatalen;
bcopy(optdata, mtod(m, void *), optdatalen);
break;
}
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
error = ip6_getpcbopt(inp->inp_outputopts6,
optname, m);
break;
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
error = ip6_getmoptions(optname,
inp->inp_moptions6, m);
break;
case IPSEC6_OUTSA:
error = EINVAL;
break;
case IPV6_AUTH_LEVEL:
case IPV6_ESP_TRANS_LEVEL:
case IPV6_ESP_NETWORK_LEVEL:
case IPV6_IPCOMP_LEVEL:
#ifndef IPSEC
m->m_len = sizeof(int);
*mtod(m, int *) = IPSEC_LEVEL_NONE;
#else
m->m_len = sizeof(int);
switch (optname) {
case IPV6_AUTH_LEVEL:
optval = inp->inp_seclevel[SL_AUTH];
break;
case IPV6_ESP_TRANS_LEVEL:
optval =
inp->inp_seclevel[SL_ESP_TRANS];
break;
case IPV6_ESP_NETWORK_LEVEL:
optval =
inp->inp_seclevel[SL_ESP_NETWORK];
break;
case IPV6_IPCOMP_LEVEL:
optval = inp->inp_seclevel[SL_IPCOMP];
break;
}
*mtod(m, int *) = optval;
#endif
break;
case SO_RTABLE:
m->m_len = sizeof(u_int);
*mtod(m, u_int *) = inp->inp_rtableid;
break;
case IPV6_PIPEX:
m->m_len = sizeof(int);
*mtod(m, int *) = inp->inp_pipex;
break;
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
int
ip6_raw_ctloutput(int op, struct socket *so, int level, int optname,
struct mbuf *m)
{
int error = 0, optval;
const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
struct inpcb *inp = sotoinpcb(so);
if (level != IPPROTO_IPV6)
return (EINVAL);
switch (optname) {
case IPV6_CHECKSUM:
/*
* For ICMPv6 sockets, no modification allowed for checksum
* offset, permit "no change" values to help existing apps.
*
* RFC3542 says: "An attempt to set IPV6_CHECKSUM
* for an ICMPv6 socket will fail."
* The current behavior does not meet RFC3542.
*/
switch (op) {
case PRCO_SETOPT:
if (m == NULL || m->m_len != sizeof(int)) {
error = EINVAL;
break;
}
optval = *mtod(m, int *);
if (optval < -1 ||
(optval > 0 && (optval % 2) != 0)) {
/*
* The API assumes non-negative even offset
* values or -1 as a special value.
*/
error = EINVAL;
} else if (so->so_proto->pr_protocol ==
IPPROTO_ICMPV6) {
if (optval != icmp6off)
error = EINVAL;
} else
inp->inp_cksum6 = optval;
break;
case PRCO_GETOPT:
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
optval = icmp6off;
else
optval = inp->inp_cksum6;
m->m_len = sizeof(int);
*mtod(m, int *) = optval;
break;
default:
error = EINVAL;
break;
}
break;
default:
error = ENOPROTOOPT;
break;
}
return (error);
}
/*
* initialize ip6_pktopts. beware that there are non-zero default values in
* the struct.
*/
void
ip6_initpktopts(struct ip6_pktopts *opt)
{
bzero(opt, sizeof(*opt));
opt->ip6po_hlim = -1; /* -1 means default hop limit */
opt->ip6po_tclass = -1; /* -1 means default traffic class */
opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
}
int
ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
int priv, int uproto)
{
struct ip6_pktopts *opt;
if (*pktopt == NULL) {
*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
M_WAITOK);
ip6_initpktopts(*pktopt);
}
opt = *pktopt;
return (ip6_setpktopt(optname, buf, len, opt, priv, 1, uproto));
}
int
ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct mbuf *m)
{
void *optdata = NULL;
int optdatalen = 0;
struct ip6_ext *ip6e;
int error = 0;
struct in6_pktinfo null_pktinfo;
int deftclass = 0, on;
int defminmtu = IP6PO_MINMTU_MCASTONLY;
switch (optname) {
case IPV6_PKTINFO:
if (pktopt && pktopt->ip6po_pktinfo)
optdata = (void *)pktopt->ip6po_pktinfo;
else {
/* XXX: we don't have to do this every time... */
bzero(&null_pktinfo, sizeof(null_pktinfo));
optdata = (void *)&null_pktinfo;
}
optdatalen = sizeof(struct in6_pktinfo);
break;
case IPV6_TCLASS:
if (pktopt && pktopt->ip6po_tclass >= 0)
optdata = (void *)&pktopt->ip6po_tclass;
else
optdata = (void *)&deftclass;
optdatalen = sizeof(int);
break;
case IPV6_HOPOPTS:
if (pktopt && pktopt->ip6po_hbh) {
optdata = (void *)pktopt->ip6po_hbh;
ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDR:
if (pktopt && pktopt->ip6po_rthdr) {
optdata = (void *)pktopt->ip6po_rthdr;
ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDRDSTOPTS:
if (pktopt && pktopt->ip6po_dest1) {
optdata = (void *)pktopt->ip6po_dest1;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_DSTOPTS:
if (pktopt && pktopt->ip6po_dest2) {
optdata = (void *)pktopt->ip6po_dest2;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_USE_MIN_MTU:
if (pktopt)
optdata = (void *)&pktopt->ip6po_minmtu;
else
optdata = (void *)&defminmtu;
optdatalen = sizeof(int);
break;
case IPV6_DONTFRAG:
if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
on = 1;
else
on = 0;
optdata = (void *)&on;
optdatalen = sizeof(on);
break;
default: /* should not happen */
#ifdef DIAGNOSTIC
panic("%s: unexpected option", __func__);
#endif
return (ENOPROTOOPT);
}
if (optdatalen > MCLBYTES)
return (EMSGSIZE); /* XXX */
if (optdatalen > MLEN)
MCLGET(m, M_WAIT);
m->m_len = optdatalen;
if (optdatalen)
bcopy(optdata, mtod(m, void *), optdatalen);
return (error);
}
void
ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
{
if (optname == -1 || optname == IPV6_PKTINFO) {
if (pktopt->ip6po_pktinfo)
free(pktopt->ip6po_pktinfo, M_IP6OPT, 0);
pktopt->ip6po_pktinfo = NULL;
}
if (optname == -1 || optname == IPV6_HOPLIMIT)
pktopt->ip6po_hlim = -1;
if (optname == -1 || optname == IPV6_TCLASS)
pktopt->ip6po_tclass = -1;
if (optname == -1 || optname == IPV6_HOPOPTS) {
if (pktopt->ip6po_hbh)
free(pktopt->ip6po_hbh, M_IP6OPT, 0);
pktopt->ip6po_hbh = NULL;
}
if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
if (pktopt->ip6po_dest1)
free(pktopt->ip6po_dest1, M_IP6OPT, 0);
pktopt->ip6po_dest1 = NULL;
}
if (optname == -1 || optname == IPV6_RTHDR) {
if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT, 0);
pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
if (pktopt->ip6po_route.ro_rt) {
rtfree(pktopt->ip6po_route.ro_rt);
pktopt->ip6po_route.ro_rt = NULL;
}
}
if (optname == -1 || optname == IPV6_DSTOPTS) {
if (pktopt->ip6po_dest2)
free(pktopt->ip6po_dest2, M_IP6OPT, 0);
pktopt->ip6po_dest2 = NULL;
}
}
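/*
 * PKTOPT_EXTHDRCPY copies one variable-length extension header.  An
 * ip6_ext header stores its length in units of 8 octets, not counting
 * the first 8 octets, so the total size in bytes is (ip6e_len + 1) << 3.
 */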
#define PKTOPT_EXTHDRCPY(type) \
do {\
if (src->type) {\
size_t hlen;\
hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
dst->type = malloc(hlen, M_IP6OPT, M_NOWAIT);\
if (dst->type == NULL)\
goto bad;\
memcpy(dst->type, src->type, hlen);\
}\
} while (/*CONSTCOND*/ 0)
int
copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src)
{
dst->ip6po_hlim = src->ip6po_hlim;
dst->ip6po_tclass = src->ip6po_tclass;
dst->ip6po_flags = src->ip6po_flags;
if (src->ip6po_pktinfo) {
dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
M_IP6OPT, M_NOWAIT);
if (dst->ip6po_pktinfo == NULL)
goto bad;
*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
}
PKTOPT_EXTHDRCPY(ip6po_hbh);
PKTOPT_EXTHDRCPY(ip6po_dest1);
PKTOPT_EXTHDRCPY(ip6po_dest2);
PKTOPT_EXTHDRCPY(ip6po_rthdr); /* do not copy the cached route */
return (0);
bad:
ip6_clearpktopts(dst, -1);
return (ENOBUFS);
}
#undef PKTOPT_EXTHDRCPY
void
ip6_freepcbopts(struct ip6_pktopts *pktopt)
{
if (pktopt == NULL)
return;
ip6_clearpktopts(pktopt, -1);
free(pktopt, M_IP6OPT, 0);
}
/*
* Set the IP6 multicast options in response to user setsockopt().
*/
int
ip6_setmoptions(int optname, struct ip6_moptions **im6op, struct mbuf *m,
unsigned int rtableid)
{
int error = 0;
u_int loop, ifindex;
struct ipv6_mreq *mreq;
struct ifnet *ifp;
struct ip6_moptions *im6o = *im6op;
struct in6_multi_mship *imm;
struct proc *p = curproc; /* XXX */
if (im6o == NULL) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
im6o = malloc(sizeof(*im6o), M_IPMOPTS, M_WAITOK);
if (im6o == NULL)
return (ENOBUFS);
*im6op = im6o;
im6o->im6o_ifidx = 0;
im6o->im6o_hlim = ip6_defmcasthlim;
im6o->im6o_loop = IPV6_DEFAULT_MULTICAST_LOOP;
LIST_INIT(&im6o->im6o_memberships);
}
switch (optname) {
case IPV6_MULTICAST_IF:
/*
* Select the interface for outgoing multicast packets.
*/
if (m == NULL || m->m_len != sizeof(u_int)) {
error = EINVAL;
break;
}
memcpy(&ifindex, mtod(m, u_int *), sizeof(ifindex));
if (ifindex != 0) {
ifp = if_get(ifindex);
if (ifp == NULL) {
error = ENXIO; /* XXX EINVAL? */
break;
}
if (ifp->if_rdomain != rtable_l2(rtableid) ||
(ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
if_put(ifp);
break;
}
if_put(ifp);
}
im6o->im6o_ifidx = ifindex;
break;
case IPV6_MULTICAST_HOPS:
{
/*
* Set the IP6 hoplimit for outgoing multicast packets.
*/
int optval;
if (m == NULL || m->m_len != sizeof(int)) {
error = EINVAL;
break;
}
memcpy(&optval, mtod(m, u_int *), sizeof(optval));
if (optval < -1 || optval >= 256)
error = EINVAL;
else if (optval == -1)
im6o->im6o_hlim = ip6_defmcasthlim;
else
im6o->im6o_hlim = optval;
break;
}
case IPV6_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
if (m == NULL || m->m_len != sizeof(u_int)) {
error = EINVAL;
break;
}
memcpy(&loop, mtod(m, u_int *), sizeof(loop));
if (loop > 1) {
error = EINVAL;
break;
}
im6o->im6o_loop = loop;
break;
case IPV6_JOIN_GROUP:
/*
* Add a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) {
error = EINVAL;
break;
}
mreq = mtod(m, struct ipv6_mreq *);
if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) {
/*
* We use the unspecified address to specify to accept
* all multicast addresses. Only super user is allowed
* to do this.
*/
if (suser(p)) {
error = EACCES;
break;
}
} else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) {
error = EINVAL;
break;
}
/*
* If no interface was explicitly specified, choose an
* appropriate one according to the given multicast address.
*/
if (mreq->ipv6mr_interface == 0) {
struct rtentry *rt;
struct sockaddr_in6 dst;
memset(&dst, 0, sizeof(dst));
dst.sin6_len = sizeof(dst);
dst.sin6_family = AF_INET6;
dst.sin6_addr = mreq->ipv6mr_multiaddr;
rt = rtalloc(sin6tosa(&dst), RT_RESOLVE, rtableid);
if (rt == NULL) {
error = EADDRNOTAVAIL;
break;
}
ifp = if_get(rt->rt_ifidx);
rtfree(rt);
} else {
/*
* If the interface is specified, validate it.
*/
ifp = if_get(mreq->ipv6mr_interface);
if (ifp == NULL) {
error = ENXIO; /* XXX EINVAL? */
break;
}
}
/*
* See if we found an interface, and confirm that it
* supports multicast
*/
if (ifp == NULL || ifp->if_rdomain != rtable_l2(rtableid) ||
(ifp->if_flags & IFF_MULTICAST) == 0) {
if_put(ifp);
error = EADDRNOTAVAIL;
break;
}
/*
* Put interface index into the multicast address,
* if the address has link/interface-local scope.
*/
if (IN6_IS_SCOPE_EMBED(&mreq->ipv6mr_multiaddr)) {
mreq->ipv6mr_multiaddr.s6_addr16[1] =
htons(ifp->if_index);
}
/*
* See if the membership already exists.
*/
LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain)
if (imm->i6mm_maddr->in6m_ifidx == ifp->if_index &&
IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
&mreq->ipv6mr_multiaddr))
break;
if (imm != NULL) {
if_put(ifp);
error = EADDRINUSE;
break;
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
imm = in6_joingroup(ifp, &mreq->ipv6mr_multiaddr, &error);
if_put(ifp);
if (!imm)
break;
LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
break;
case IPV6_LEAVE_GROUP:
/*
* Drop a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) {
error = EINVAL;
break;
}
mreq = mtod(m, struct ipv6_mreq *);
if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) {
if (suser(p)) {
error = EACCES;
break;
}
} else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) {
error = EINVAL;
break;
}
/*
* Put interface index into the multicast address,
* if the address has link-local scope.
*/
if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) {
mreq->ipv6mr_multiaddr.s6_addr16[1] =
htons(mreq->ipv6mr_interface);
}
/*
* If an interface address was specified, get a pointer
* to its ifnet structure.
*/
if (mreq->ipv6mr_interface == 0)
ifp = NULL;
else {
ifp = if_get(mreq->ipv6mr_interface);
if (ifp == NULL) {
error = ENXIO; /* XXX EINVAL? */
break;
}
}
/*
* Find the membership in the membership list.
*/
LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) {
if ((ifp == NULL ||
imm->i6mm_maddr->in6m_ifidx == ifp->if_index) &&
IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
&mreq->ipv6mr_multiaddr))
break;
}
if_put(ifp);
if (imm == NULL) {
/* Unable to resolve interface */
error = EADDRNOTAVAIL;
break;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
break;
default:
error = EOPNOTSUPP;
break;
}
/*
* If all options have default values, no need to keep the option
* structure.
*/
if (im6o->im6o_ifidx == 0 &&
im6o->im6o_hlim == ip6_defmcasthlim &&
im6o->im6o_loop == IPV6_DEFAULT_MULTICAST_LOOP &&
LIST_EMPTY(&im6o->im6o_memberships)) {
free(*im6op, M_IPMOPTS, sizeof(**im6op));
*im6op = NULL;
}
return (error);
}
/*
* Return the IP6 multicast options in response to user getsockopt().
*/
int
ip6_getmoptions(int optname, struct ip6_moptions *im6o, struct mbuf *m)
{
u_int *hlim, *loop, *ifindex;
switch (optname) {
case IPV6_MULTICAST_IF:
ifindex = mtod(m, u_int *);
m->m_len = sizeof(u_int);
if (im6o == NULL || im6o->im6o_ifidx == 0)
*ifindex = 0;
else
*ifindex = im6o->im6o_ifidx;
return (0);
case IPV6_MULTICAST_HOPS:
hlim = mtod(m, u_int *);
m->m_len = sizeof(u_int);
if (im6o == NULL)
*hlim = ip6_defmcasthlim;
else
*hlim = im6o->im6o_hlim;
return (0);
case IPV6_MULTICAST_LOOP:
loop = mtod(m, u_int *);
m->m_len = sizeof(u_int);
if (im6o == NULL)
*loop = IPV6_DEFAULT_MULTICAST_LOOP;
else
*loop = im6o->im6o_loop;
return (0);
default:
return (EOPNOTSUPP);
}
}
/*
* Discard the IP6 multicast options.
*/
void
ip6_freemoptions(struct ip6_moptions *im6o)
{
struct in6_multi_mship *imm;
if (im6o == NULL)
return;
while (!LIST_EMPTY(&im6o->im6o_memberships)) {
imm = LIST_FIRST(&im6o->im6o_memberships);
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
free(im6o, M_IPMOPTS, sizeof(*im6o));
}
/*
* Set IPv6 outgoing packet options based on advanced API.
*/
int
ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
struct ip6_pktopts *stickyopt, int priv, int uproto)
{
u_int clen;
struct cmsghdr *cm = 0;
caddr_t cmsgs;
int error;
if (control == NULL || opt == NULL)
return (EINVAL);
ip6_initpktopts(opt);
if (stickyopt) {
int error;
/*
* If stickyopt is provided, make a local copy of the options
* for this particular packet, then override them by ancillary
* objects.
* XXX: copypktopts() does not copy the cached route to a next
* hop (if any). This is not very good in terms of efficiency,
* but we can allow this since this option should be rarely
* used.
*/
if ((error = copypktopts(opt, stickyopt)) != 0)
return (error);
}
/*
* XXX: Currently, we assume all the optional information is stored
* in a single mbuf.
*/
if (control->m_next)
return (EINVAL);
clen = control->m_len;
cmsgs = mtod(control, caddr_t);
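/*
 * Walk the control buffer one cmsghdr at a time: each header's length
 * is validated against the remaining buffer before its data is handed
 * to ip6_setpktopt(), and the cursor advances by the aligned length.
 */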
do {
if (clen < CMSG_LEN(0))
return (EINVAL);
cm = (struct cmsghdr *)cmsgs;
if (cm->cmsg_len < CMSG_LEN(0) || cm->cmsg_len > clen ||
CMSG_ALIGN(cm->cmsg_len) > clen)
return (EINVAL);
if (cm->cmsg_level == IPPROTO_IPV6) {
error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
cm->cmsg_len - CMSG_LEN(0), opt, priv, 0, uproto);
if (error)
return (error);
}
clen -= CMSG_ALIGN(cm->cmsg_len);
cmsgs += CMSG_ALIGN(cm->cmsg_len);
} while (clen);
return (0);
}
/*
* Set a particular packet option, as a sticky option or an ancillary data
* item. "len" can be 0 only when it's a sticky option.
*/
int
ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
int priv, int sticky, int uproto)
{
int minmtupolicy;
switch (optname) {
case IPV6_PKTINFO:
{
struct ifnet *ifp = NULL;
struct in6_pktinfo *pktinfo;
if (len != sizeof(struct in6_pktinfo))
return (EINVAL);
pktinfo = (struct in6_pktinfo *)buf;
/*
* An application can clear any sticky IPV6_PKTINFO option by
* doing a "regular" setsockopt with ipi6_addr being
* in6addr_any and ipi6_ifindex being zero.
* [RFC 3542, Section 6]
*/
if (opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 &&
IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
ip6_clearpktopts(opt, optname);
break;
}
if (uproto == IPPROTO_TCP && sticky &&
!IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
return (EINVAL);
}
if (pktinfo->ipi6_ifindex) {
ifp = if_get(pktinfo->ipi6_ifindex);
if (ifp == NULL)
return (ENXIO);
if_put(ifp);
}
/*
* We store the address anyway, and let in6_selectsrc()
* validate the specified address. This is because ipi6_addr
* may not have enough information about its scope zone, and
* we may need additional information (such as outgoing
* interface or the scope zone of a destination address) to
* disambiguate the scope.
* XXX: the delay of the validation may confuse the
* application when it is used as a sticky option.
*/
if (opt->ip6po_pktinfo == NULL) {
opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
M_IP6OPT, M_NOWAIT);
if (opt->ip6po_pktinfo == NULL)
return (ENOBUFS);
}
bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
break;
}
case IPV6_HOPLIMIT:
{
int *hlimp;
/*
* RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
* to simplify the ordering among hoplimit options.
*/
if (sticky)
return (ENOPROTOOPT);
if (len != sizeof(int))
return (EINVAL);
hlimp = (int *)buf;
if (*hlimp < -1 || *hlimp > 255)
return (EINVAL);
opt->ip6po_hlim = *hlimp;
break;
}
case IPV6_TCLASS:
{
int tclass;
if (len != sizeof(int))
return (EINVAL);
tclass = *(int *)buf;
if (tclass < -1 || tclass > 255)
return (EINVAL);
opt->ip6po_tclass = tclass;
break;
}
case IPV6_HOPOPTS:
{
struct ip6_hbh *hbh;
int hbhlen;
/*
* XXX: We don't allow a non-privileged user to set ANY HbH
* options, since per-option restriction has too much
* overhead.
*/
if (!priv)
return (EPERM);
if (len == 0) {
ip6_clearpktopts(opt, IPV6_HOPOPTS);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_hbh))
return (EINVAL);
hbh = (struct ip6_hbh *)buf;
hbhlen = (hbh->ip6h_len + 1) << 3;
if (len != hbhlen)
return (EINVAL);
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_HOPOPTS);
opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_hbh == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_hbh, hbh, hbhlen);
break;
}
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
{
struct ip6_dest *dest, **newdest = NULL;
int destlen;
if (!priv) /* XXX: see the comment for IPV6_HOPOPTS */
return (EPERM);
if (len == 0) {
ip6_clearpktopts(opt, optname);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_dest))
return (EINVAL);
dest = (struct ip6_dest *)buf;
destlen = (dest->ip6d_len + 1) << 3;
if (len != destlen)
return (EINVAL);
/*
* Determine the position that the destination options header
* should be inserted; before or after the routing header.
*/
switch (optname) {
case IPV6_RTHDRDSTOPTS:
newdest = &opt->ip6po_dest1;
break;
case IPV6_DSTOPTS:
newdest = &opt->ip6po_dest2;
break;
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, optname);
*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
if (*newdest == NULL)
return (ENOBUFS);
memcpy(*newdest, dest, destlen);
break;
}
case IPV6_RTHDR:
{
struct ip6_rthdr *rth;
int rthlen;
if (len == 0) {
ip6_clearpktopts(opt, IPV6_RTHDR);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_rthdr))
return (EINVAL);
rth = (struct ip6_rthdr *)buf;
rthlen = (rth->ip6r_len + 1) << 3;
if (len != rthlen)
return (EINVAL);
switch (rth->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
if (rth->ip6r_len == 0) /* must contain one addr */
return (EINVAL);
if (rth->ip6r_len % 2) /* length must be even */
return (EINVAL);
if (rth->ip6r_len / 2 != rth->ip6r_segleft)
return (EINVAL);
break;
default:
return (EINVAL); /* not supported */
}
/* turn off the previous option */
ip6_clearpktopts(opt, IPV6_RTHDR);
opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_rthdr == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_rthdr, rth, rthlen);
break;
}
case IPV6_USE_MIN_MTU:
if (len != sizeof(int))
return (EINVAL);
minmtupolicy = *(int *)buf;
if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
minmtupolicy != IP6PO_MINMTU_DISABLE &&
minmtupolicy != IP6PO_MINMTU_ALL) {
return (EINVAL);
}
opt->ip6po_minmtu = minmtupolicy;
break;
case IPV6_DONTFRAG:
if (len != sizeof(int))
return (EINVAL);
if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
/*
* we ignore this option for TCP sockets.
* (RFC3542 leaves this case unspecified.)
*/
opt->ip6po_flags &= ~IP6PO_DONTFRAG;
} else
opt->ip6po_flags |= IP6PO_DONTFRAG;
break;
default:
return (ENOPROTOOPT);
} /* end of switch */
return (0);
}
/*
* Routine called from ip6_output() to loop back a copy of an IP6 multicast
* packet to the input queue of a specified interface.
*/
void
ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst)
{
struct mbuf *copym;
struct ip6_hdr *ip6;
/*
* Duplicate the packet.
*/
copym = m_copym(m, 0, M_COPYALL, M_NOWAIT);
if (copym == NULL)
return;
/*
* Make sure to deep-copy IPv6 header portion in case the data
* is in an mbuf cluster, so that we can safely override the IPv6
* header portion later.
*/
if ((copym->m_flags & M_EXT) != 0 ||
copym->m_len < sizeof(struct ip6_hdr)) {
copym = m_pullup(copym, sizeof(struct ip6_hdr));
if (copym == NULL)
return;
}
#ifdef DIAGNOSTIC
if (copym->m_len < sizeof(*ip6)) {
m_freem(copym);
return;
}
#endif
ip6 = mtod(copym, struct ip6_hdr *);
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
ip6->ip6_src.s6_addr16[1] = 0;
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
ip6->ip6_dst.s6_addr16[1] = 0;
if_input_local(ifp, copym, dst->sin6_family);
}
/*
* Chop IPv6 header off from the payload.
*/
int
ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
{
struct mbuf *mh;
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
if (m->m_len > sizeof(*ip6)) {
MGET(mh, M_DONTWAIT, MT_HEADER);
if (mh == NULL) {
m_freem(m);
return ENOBUFS;
}
M_MOVE_PKTHDR(mh, m);
m_align(mh, sizeof(*ip6));
m->m_len -= sizeof(*ip6);
m->m_data += sizeof(*ip6);
mh->m_next = m;
m = mh;
m->m_len = sizeof(*ip6);
bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
}
exthdrs->ip6e_ip6 = m;
return 0;
}
u_int32_t
ip6_randomid(void)
{
return idgen32(&ip6_id_ctx);
}
void
ip6_randomid_init(void)
{
idgen32_init(&ip6_id_ctx);
}
/*
* Compute significant parts of the IPv6 checksum pseudo-header
* for use in a delayed TCP/UDP checksum calculation.
*/
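/*
 * Addresses with an embedded scope carry the interface index in
 * s6_addr16[1]; that word is zero on the wire, so it is skipped below.
 * The folded 16-bit partial sum is stored in the transport checksum
 * field and later completed over the payload by the hardware or by
 * in6_delayed_cksum().
 */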
static __inline u_int16_t __attribute__((__unused__))
in6_cksum_phdr(const struct in6_addr *src, const struct in6_addr *dst,
u_int32_t len, u_int32_t nxt)
{
u_int32_t sum = 0;
const u_int16_t *w;
w = (const u_int16_t *) src;
sum += w[0];
if (!IN6_IS_SCOPE_EMBED(src))
sum += w[1];
sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5];
sum += w[6]; sum += w[7];
w = (const u_int16_t *) dst;
sum += w[0];
if (!IN6_IS_SCOPE_EMBED(dst))
sum += w[1];
sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5];
sum += w[6]; sum += w[7];
sum += (u_int16_t)(len >> 16) + (u_int16_t)(len /*& 0xffff*/);
sum += (u_int16_t)(nxt >> 16) + (u_int16_t)(nxt /*& 0xffff*/);
sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/);
if (sum > 0xffff)
sum -= 0xffff;
return (sum);
}
/*
* Process a delayed payload checksum calculation.
*/
void
in6_delayed_cksum(struct mbuf *m, u_int8_t nxt)
{
int nxtp, offset;
u_int16_t csum;
offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxtp);
if (offset <= 0 || nxtp != nxt)
/* If the desired next protocol isn't found, punt. */
return;
csum = (u_int16_t)(in6_cksum(m, 0, offset, m->m_pkthdr.len - offset));
switch (nxt) {
case IPPROTO_TCP:
offset += offsetof(struct tcphdr, th_sum);
break;
case IPPROTO_UDP:
offset += offsetof(struct udphdr, uh_sum);
if (csum == 0)
csum = 0xffff;
break;
case IPPROTO_ICMPV6:
offset += offsetof(struct icmp6_hdr, icmp6_cksum);
break;
}
if ((offset + sizeof(u_int16_t)) > m->m_len)
m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT);
else
*(u_int16_t *)(mtod(m, caddr_t) + offset) = csum;
}
void
in6_proto_cksum_out(struct mbuf *m, struct ifnet *ifp)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
/* some hw and in6_delayed_cksum need the pseudo header cksum */
if (m->m_pkthdr.csum_flags &
(M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) {
int nxt, offset;
u_int16_t csum;
offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
csum = in6_cksum_phdr(&ip6->ip6_src, &ip6->ip6_dst,
htonl(m->m_pkthdr.len - offset), htonl(nxt));
if (nxt == IPPROTO_TCP)
offset += offsetof(struct tcphdr, th_sum);
else if (nxt == IPPROTO_UDP)
offset += offsetof(struct udphdr, uh_sum);
else if (nxt == IPPROTO_ICMPV6)
offset += offsetof(struct icmp6_hdr, icmp6_cksum);
if ((offset + sizeof(u_int16_t)) > m->m_len)
m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT);
else
*(u_int16_t *)(mtod(m, caddr_t) + offset) = csum;
}
if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) {
if (!ifp || !(ifp->if_capabilities & IFCAP_CSUM_TCPv6) ||
ip6->ip6_nxt != IPPROTO_TCP ||
ifp->if_bridgeidx != 0) {
tcpstat_inc(tcps_outswcsum);
in6_delayed_cksum(m, IPPROTO_TCP);
m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */
}
} else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) {
if (!ifp || !(ifp->if_capabilities & IFCAP_CSUM_UDPv6) ||
ip6->ip6_nxt != IPPROTO_UDP ||
ifp->if_bridgeidx != 0) {
udpstat_inc(udps_outswcsum);
in6_delayed_cksum(m, IPPROTO_UDP);
m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */
}
} else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) {
in6_delayed_cksum(m, IPPROTO_ICMPV6);
m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */
}
}
#ifdef IPSEC
int
ip6_output_ipsec_lookup(struct mbuf *m, struct inpcb *inp, struct tdb **tdbout)
{
struct tdb *tdb;
struct m_tag *mtag;
struct tdb_ident *tdbi;
int error;
/*
* Check if there was an outgoing SA bound to the flow
* from a transport protocol.
*/
/* Do we have any pending SAs to apply ? */
error = ipsp_spd_lookup(m, AF_INET6, sizeof(struct ip6_hdr),
IPSP_DIRECTION_OUT, NULL, inp, &tdb, NULL);
if (error || tdb == NULL) {
*tdbout = NULL;
return error;
}
/* Loop detection */
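/*
 * The IPsec output path tags a packet with PACKET_TAG_IPSEC_OUT_DONE
 * (carrying the SA's identity) after each SA it traverses.  If the
 * candidate TDB matches one of those tags, the packet has already
 * passed through this SA and no further IPsec is applied.
 */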
for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) {
if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE)
continue;
tdbi = (struct tdb_ident *)(mtag + 1);
if (tdbi->spi == tdb->tdb_spi &&
tdbi->proto == tdb->tdb_sproto &&
tdbi->rdomain == tdb->tdb_rdomain &&
!memcmp(&tdbi->dst, &tdb->tdb_dst,
sizeof(union sockaddr_union))) {
/* no IPsec needed */
tdb_unref(tdb);
*tdbout = NULL;
return 0;
}
}
*tdbout = tdb;
return 0;
}
int
ip6_output_ipsec_pmtu_update(struct tdb *tdb, struct route_in6 *ro,
struct in6_addr *dst, int ifidx, int rtableid, int transportmode)
{
struct rtentry *rt = NULL;
int rt_mtucloned = 0;
/* Find a host route to store the mtu in */
if (ro != NULL)
rt = ro->ro_rt;
/* but don't add a PMTU route for transport mode SAs */
if (transportmode)
rt = NULL;
else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) {
struct sockaddr_in6 sin6;
int error;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = *dst;
sin6.sin6_scope_id = in6_addr2scopeid(ifidx, dst);
error = in6_embedscope(dst, &sin6, NULL);
if (error) {
/* should be impossible */
return error;
}
rt = icmp6_mtudisc_clone(&sin6, rtableid, 1);
rt_mtucloned = 1;
}
DPRINTF("spi %08x mtu %d rt %p cloned %d",
ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned);
if (rt != NULL) {
rt->rt_mtu = tdb->tdb_mtu;
if (ro != NULL && ro->ro_rt != NULL) {
rtfree(ro->ro_rt);
ro->ro_rt = rtalloc(sin6tosa(&ro->ro_dst), RT_RESOLVE,
rtableid);
}
if (rt_mtucloned)
rtfree(rt);
}
return 0;
}
int
ip6_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route_in6 *ro,
int tunalready, int fwd)
{
#if NPF > 0
struct ifnet *encif;
#endif
struct ip6_hdr *ip6;
struct in6_addr dst;
int error, ifidx, rtableid;
#if NPF > 0
/*
* Packet filter
*/
if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL ||
pf_test(AF_INET6, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) {
m_freem(m);
return EACCES;
}
if (m == NULL)
return 0;
/*
* PF_TAG_REROUTE handling or not...
* Packet is entering IPsec so the routing is
* already overruled by the IPsec policy.
* Until now the change was not reconsidered.
* What's the behaviour?
*/
in6_proto_cksum_out(m, encif);
#endif
/* Check if we are allowed to fragment */
ip6 = mtod(m, struct ip6_hdr *);
dst = ip6->ip6_dst;
ifidx = m->m_pkthdr.ph_ifidx;
rtableid = m->m_pkthdr.ph_rtableid;
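/*
 * If PMTU discovery is enabled and the packet exceeds the SA's current
 * MTU while the SA's MTU timeout has not yet expired, record the
 * reduced MTU on a host route and reject the packet with EMSGSIZE so
 * the sender can adapt.
 */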
if (ip_mtudisc && tdb->tdb_mtu &&
sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) > tdb->tdb_mtu &&
tdb->tdb_mtutimeout > gettime()) {
int transportmode;
transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET6) &&
(IN6_ARE_ADDR_EQUAL(&tdb->tdb_dst.sin6.sin6_addr, &dst));
error = ip6_output_ipsec_pmtu_update(tdb, ro, &dst, ifidx,
rtableid, transportmode);
if (error) {
ipsecstat_inc(ipsec_odrops);
tdbstat_inc(tdb, tdb_odrops);
m_freem(m);
return error;
}
ipsec_adjust_mtu(m, tdb->tdb_mtu);
m_freem(m);
return EMSGSIZE;
}
/* propagate don't fragment for v6-over-v6 */
if (ip_mtudisc)
SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
/*
* Clear these -- they'll be set in the recursive invocation
* as needed.
*/
m->m_flags &= ~(M_BCAST | M_MCAST);
/* Callee frees mbuf */
KERNEL_LOCK();
error = ipsp_process_packet(m, tdb, AF_INET6, tunalready);
KERNEL_UNLOCK();
if (error) {
ipsecstat_inc(ipsec_odrops);
tdbstat_inc(tdb, tdb_odrops);
}
if (ip_mtudisc && error == EMSGSIZE)
ip6_output_ipsec_pmtu_update(tdb, ro, &dst, ifidx, rtableid, 0);
return error;
}
#endif /* IPSEC */
/* $OpenBSD: sys_socket.c,v 1.54 2022/09/02 13:12:31 mvs Exp $ */
/* $NetBSD: sys_socket.c,v 1.13 1995/08/12 23:59:09 mycroft Exp $ */
/*
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_socket.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/fcntl.h>
#include <net/if.h>
const struct fileops socketops = {
.fo_read = soo_read,
.fo_write = soo_write,
.fo_ioctl = soo_ioctl,
.fo_kqfilter = soo_kqfilter,
.fo_stat = soo_stat,
.fo_close = soo_close
};
int
soo_read(struct file *fp, struct uio *uio, int fflags)
{
struct socket *so = (struct socket *)fp->f_data;
int flags = 0;
if (fp->f_flag & FNONBLOCK)
flags |= MSG_DONTWAIT;
return (soreceive(so, NULL, uio, NULL, NULL, &flags, 0));
}
int
soo_write(struct file *fp, struct uio *uio, int fflags)
{
struct socket *so = (struct socket *)fp->f_data;
int flags = 0;
if (fp->f_flag & FNONBLOCK)
flags |= MSG_DONTWAIT;
return (sosend(so, NULL, uio, NULL, NULL, flags));
}
int
soo_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
struct socket *so = (struct socket *)fp->f_data;
int error = 0;
switch (cmd) {
case FIONBIO:
break;
case FIOASYNC:
solock(so);
if (*(int *)data) {
so->so_rcv.sb_flags |= SB_ASYNC;
so->so_snd.sb_flags |= SB_ASYNC;
} else {
so->so_rcv.sb_flags &= ~SB_ASYNC;
so->so_snd.sb_flags &= ~SB_ASYNC;
}
sounlock(so);
break;
case FIONREAD:
*(int *)data = so->so_rcv.sb_datacc;
break;
case FIOSETOWN:
case SIOCSPGRP:
case TIOCSPGRP:
error = sigio_setown(&so->so_sigio, cmd, data);
break;
case FIOGETOWN:
case SIOCGPGRP:
case TIOCGPGRP:
sigio_getown(&so->so_sigio, cmd, data);
break;
case SIOCATMARK:
*(int *)data = (so->so_state&SS_RCVATMARK) != 0;
break;
default:
/*
* Interface/routing/protocol specific ioctls:
* interface and routing ioctls should have a
* different entry since a socket's unnecessary
*/
if (IOCGROUP(cmd) == 'i') {
KERNEL_LOCK();
error = ifioctl(so, cmd, data, p);
KERNEL_UNLOCK();
return (error);
}
if (IOCGROUP(cmd) == 'r')
return (EOPNOTSUPP);
KERNEL_LOCK();
error = pru_control(so, cmd, data, NULL);
KERNEL_UNLOCK();
break;
}
return (error);
}
int
soo_stat(struct file *fp, struct stat *ub, struct proc *p)
{
struct socket *so = fp->f_data;
memset(ub, 0, sizeof (*ub));
ub->st_mode = S_IFSOCK;
solock(so);
if ((so->so_state & SS_CANTRCVMORE) == 0 || so->so_rcv.sb_cc != 0)
ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
if ((so->so_state & SS_CANTSENDMORE) == 0)
ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
ub->st_uid = so->so_euid;
ub->st_gid = so->so_egid;
(void)pru_sense(so, ub);
sounlock(so);
return (0);
}
int
soo_close(struct file *fp, struct proc *p)
{
int flags, error = 0;
if (fp->f_data) {
flags = (fp->f_flag & FNONBLOCK) ? MSG_DONTWAIT : 0;
error = soclose(fp->f_data, flags);
}
fp->f_data = NULL;
return (error);
}
/* $OpenBSD: strlcpy.c,v 1.9 2019/01/25 00:19:26 millert Exp $ */
/*
* Copyright (c) 1998, 2015 Todd C. Miller <millert@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <lib/libkern/libkern.h>
/*
* Copy string src to buffer dst of size dsize. At most dsize-1
* chars will be copied. Always NUL terminates (unless dsize == 0).
* Returns strlen(src); if retval >= dsize, truncation occurred.
*/
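/*
 * Example: with char buf[4], strlcpy(buf, "abcdef", sizeof(buf)) stores
 * "abc" plus the NUL in buf and returns 6; since 6 >= sizeof(buf) the
 * caller knows the string was truncated.
 */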
size_t
strlcpy(char *dst, const char *src, size_t dsize)
{
const char *osrc = src;
size_t nleft = dsize;
/* Copy as many bytes as will fit. */
if (nleft != 0) {
while (--nleft != 0) {
if ((*dst++ = *src++) == '\0')
break;
}
}
/* Not enough room in dst, add NUL and traverse rest of src. */
if (nleft == 0) {
if (dsize != 0)
*dst = '\0'; /* NUL-terminate dst */
while (*src++)
;
}
return(src - osrc - 1); /* count does not include NUL */
}
/* $OpenBSD: uvm_swap_encrypt.c,v 1.24 2021/03/12 14:15:49 jsg Exp $ */
/*
* Copyright 1999 Niels Provos <provos@citi.umich.edu>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Niels Provos.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <crypto/rijndael.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap_encrypt.h>
struct swap_key *kcur = NULL;
rijndael_ctx swap_ctxt;
int uvm_doswapencrypt = 1;
u_int uvm_swpkeyscreated = 0;
u_int uvm_swpkeysdeleted = 0;
int swap_encrypt_initialized = 0;
int
swap_encrypt_ctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen, struct proc *p)
{
/* all sysctl names at this level are terminal */
if (namelen != 1)
return (ENOTDIR); /* overloaded */
switch (name[0]) {
case SWPENC_ENABLE: {
int doencrypt = uvm_doswapencrypt;
int result;
result = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&doencrypt, 0, 1);
if (result)
return result;
/*
* Swap Encryption has been turned on, we need to
* initialize state for swap devices that have been
* added.
*/
if (doencrypt)
uvm_swap_initcrypt_all();
uvm_doswapencrypt = doencrypt;
return (0);
}
case SWPENC_CREATED:
return (sysctl_rdint(oldp, oldlenp, newp, uvm_swpkeyscreated));
case SWPENC_DELETED:
return (sysctl_rdint(oldp, oldlenp, newp, uvm_swpkeysdeleted));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
void
swap_key_create(struct swap_key *key)
{
arc4random_buf(key->key, sizeof(key->key));
uvm_swpkeyscreated++;
}
void
swap_key_delete(struct swap_key *key)
{
/* Make sure that this key gets removed if we just used it */
swap_key_cleanup(key);
explicit_bzero(key, sizeof(*key));
uvm_swpkeysdeleted++;
}
/*
* Encrypt the data before it goes to swap, the size should be 64-bit
* aligned.
*/
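/*
 * The cipher is Rijndael (AES) in CBC mode.  The IV is derived by
 * encrypting the 64-bit swap block number and its complement, so each
 * swap block is chained from a distinct starting value.
 */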
void
swap_encrypt(struct swap_key *key, caddr_t src, caddr_t dst, u_int64_t block,
size_t count)
{
u_int32_t *dsrc = (u_int32_t *)src;
u_int32_t *ddst = (u_int32_t *)dst;
u_int32_t iv[4];
u_int32_t iv1, iv2, iv3, iv4;
if (!swap_encrypt_initialized)
swap_encrypt_initialized = 1;
swap_key_prepare(key, 1);
count /= sizeof(u_int32_t);
iv[0] = block >> 32; iv[1] = block; iv[2] = ~iv[0]; iv[3] = ~iv[1];
rijndael_encrypt(&swap_ctxt, (u_char *)iv, (u_char *)iv);
iv1 = iv[0]; iv2 = iv[1]; iv3 = iv[2]; iv4 = iv[3];
for (; count > 0; count -= 4) {
ddst[0] = dsrc[0] ^ iv1;
ddst[1] = dsrc[1] ^ iv2;
ddst[2] = dsrc[2] ^ iv3;
ddst[3] = dsrc[3] ^ iv4;
/*
* Do not worry about endianness, it only needs to decrypt
* on this machine.
*/
rijndael_encrypt(&swap_ctxt, (u_char *)ddst, (u_char *)ddst);
iv1 = ddst[0];
iv2 = ddst[1];
iv3 = ddst[2];
iv4 = ddst[3];
dsrc += 4;
ddst += 4;
}
}
/*
* Decrypt the data after we retrieved it from swap, the size should be 64-bit
* aligned.
*/
void
swap_decrypt(struct swap_key *key, caddr_t src, caddr_t dst, u_int64_t block,
size_t count)
{
u_int32_t *dsrc = (u_int32_t *)src;
u_int32_t *ddst = (u_int32_t *)dst;
u_int32_t iv[4];
u_int32_t iv1, iv2, iv3, iv4, niv1, niv2, niv3, niv4;
if (!swap_encrypt_initialized)
panic("swap_decrypt: key not initialized");
swap_key_prepare(key, 0);
count /= sizeof(u_int32_t);
iv[0] = block >> 32; iv[1] = block; iv[2] = ~iv[0]; iv[3] = ~iv[1];
rijndael_encrypt(&swap_ctxt, (u_char *)iv, (u_char *)iv);
iv1 = iv[0]; iv2 = iv[1]; iv3 = iv[2]; iv4 = iv[3];
for (; count > 0; count -= 4) {
ddst[0] = niv1 = dsrc[0];
ddst[1] = niv2 = dsrc[1];
ddst[2] = niv3 = dsrc[2];
ddst[3] = niv4 = dsrc[3];
rijndael_decrypt(&swap_ctxt, (u_char *)ddst, (u_char *)ddst);
ddst[0] ^= iv1;
ddst[1] ^= iv2;
ddst[2] ^= iv3;
ddst[3] ^= iv4;
iv1 = niv1;
iv2 = niv2;
iv3 = niv3;
iv4 = niv4;
dsrc += 4;
ddst += 4;
}
}
void
swap_key_prepare(struct swap_key *key, int encrypt)
{
/*
* Check if we have prepared for this key already,
* if we only have the encryption schedule, we have
* to recompute and get the decryption schedule also.
*/
if (kcur == key && (encrypt || !swap_ctxt.enc_only))
return;
if (encrypt)
rijndael_set_key_enc_only(&swap_ctxt, (u_char *)key->key,
sizeof(key->key) * 8);
else
rijndael_set_key(&swap_ctxt, (u_char *)key->key,
sizeof(key->key) * 8);
kcur = key;
}
/*
* Make sure that a specific key is no longer available.
*/
void
swap_key_cleanup(struct swap_key *key)
{
/* Check if we have a key */
if (kcur == NULL || kcur != key)
return;
/* Zero out the subkeys */
explicit_bzero(&swap_ctxt, sizeof(swap_ctxt));
kcur = NULL;
}
/* $OpenBSD: exec_script.c,v 1.48 2019/07/15 04:11:03 visa Exp $ */
/* $NetBSD: exec_script.c,v 1.13 1996/02/04 02:15:06 christos Exp $ */
/*
* Copyright (c) 1993, 1994 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/namei.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/exec.h>
#include <sys/exec_script.h>
/*
* exec_script_makecmds(): Check if it's an executable shell script.
*
* Given a proc pointer and an exec package pointer, see if the referent
* of the epp is in shell script. If it is, then set things up so that
* the script can be run. This involves preparing the address space
* and arguments for the shell which will run the script.
*
* This function is ultimately responsible for creating a set of vmcmds
* which can be used to build the process's vm space and inserting them
* into the exec package.
*/
int
exec_script_makecmds(struct proc *p, struct exec_package *epp)
{
int error, hdrlinelen, shellnamelen, shellarglen;
char *hdrstr = epp->ep_hdr;
char *cp, *shellname, *shellarg, *oldpnbuf;
char **shellargp = NULL, **tmpsap;
struct vnode *scriptvp;
uid_t script_uid = -1;
gid_t script_gid = -1;
u_short script_sbits;
/*
* remember the old vp and pnbuf for later, so we can restore
* them if check_exec() fails.
*/
scriptvp = epp->ep_vp;
oldpnbuf = epp->ep_ndp->ni_cnd.cn_pnbuf;
/*
* if the magic isn't that of a shell script, or we've already
* done shell script processing for this exec, punt on it.
*/
if ((epp->ep_flags & EXEC_INDIR) != 0 ||
epp->ep_hdrvalid < EXEC_SCRIPT_MAGICLEN ||
strncmp(hdrstr, EXEC_SCRIPT_MAGIC, EXEC_SCRIPT_MAGICLEN))
return ENOEXEC;
/*
* check that the shell spec is terminated by a newline,
* and that it isn't too large. Don't modify the
* buffer unless we're ready to commit to handling it.
* (The latter requirement means that we have to check
* for both spaces and tabs later on.)
*/
hdrlinelen = min(epp->ep_hdrvalid, MAXINTERP);
for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; cp < hdrstr + hdrlinelen;
cp++) {
if (*cp == '\n') {
*cp = '\0';
break;
}
}
if (cp >= hdrstr + hdrlinelen)
return ENOEXEC;
shellname = NULL;
shellarg = NULL;
shellarglen = 0;
/* strip spaces before the shell name */
for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; *cp == ' ' || *cp == '\t';
cp++)
;
/* collect the shell name; remember its length for later */
shellname = cp;
shellnamelen = 0;
if (*cp == '\0')
goto check_shell;
for ( /* cp = cp */ ; *cp != '\0' && *cp != ' ' && *cp != '\t'; cp++)
shellnamelen++;
if (*cp == '\0')
goto check_shell;
*cp++ = '\0';
/* skip spaces before any argument */
for ( /* cp = cp */ ; *cp == ' ' || *cp == '\t'; cp++)
;
if (*cp == '\0')
goto check_shell;
/*
* collect the shell argument. everything after the shell name
* is passed as ONE argument; that's the correct (historical)
* behaviour.
*/
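/*
 * For example, a first line of "#! /bin/sh -x -u" yields a shellname
 * of "/bin/sh" and the single shell argument "-x -u".
 */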
shellarg = cp;
for ( /* cp = cp */ ; *cp != '\0'; cp++)
shellarglen++;
*cp++ = '\0';
check_shell:
/*
* MNT_NOSUID and STRC are already taken care of by check_exec,
* so we don't need to worry about them now or later.
*/
script_sbits = epp->ep_vap->va_mode & (VSUID | VSGID);
if (script_sbits != 0) {
script_uid = epp->ep_vap->va_uid;
script_gid = epp->ep_vap->va_gid;
}
/*
* if the script isn't readable, or it's set-id, then we've
* gotta supply a "/dev/fd/..." for the shell to read.
* Note that stupid shells (csh) do the wrong thing, and
* close all open fd's when they start. That kills this
* method of implementing "safe" set-id and x-only scripts.
*/
vn_lock(scriptvp, LK_EXCLUSIVE|LK_RETRY);
error = VOP_ACCESS(scriptvp, VREAD, p->p_ucred, p);
VOP_UNLOCK(scriptvp);
if (error == EACCES || script_sbits) {
struct file *fp;
#ifdef DIAGNOSTIC
if (epp->ep_flags & EXEC_HASFD)
panic("exec_script_makecmds: epp already has a fd");
#endif
fdplock(p->p_fd);
error = falloc(p, &fp, &epp->ep_fd);
if (error) {
fdpunlock(p->p_fd);
goto fail;
}
epp->ep_flags |= EXEC_HASFD;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_data = (caddr_t) scriptvp;
fp->f_flag = FREAD;
fdinsert(p->p_fd, epp->ep_fd, 0, fp);
fdpunlock(p->p_fd);
FRELE(fp, p);
}
/* set up the parameters for the recursive check_exec() call */
epp->ep_ndp->ni_dirfd = AT_FDCWD;
epp->ep_ndp->ni_dirp = shellname;
epp->ep_ndp->ni_segflg = UIO_SYSSPACE;
epp->ep_flags |= EXEC_INDIR;
/* and set up the fake args list, for later */
shellargp = mallocarray(4, sizeof(char *), M_EXEC, M_WAITOK);
tmpsap = shellargp;
*tmpsap = malloc(shellnamelen + 1, M_EXEC, M_WAITOK);
strlcpy(*tmpsap++, shellname, shellnamelen + 1);
if (shellarg != NULL) {
*tmpsap = malloc(shellarglen + 1, M_EXEC, M_WAITOK);
strlcpy(*tmpsap++, shellarg, shellarglen + 1);
}
*tmpsap = malloc(MAXPATHLEN, M_EXEC, M_WAITOK);
if ((epp->ep_flags & EXEC_HASFD) == 0) {
error = copyinstr(epp->ep_name, *tmpsap, MAXPATHLEN,
NULL);
if (error != 0) {
*(tmpsap + 1) = NULL;
goto fail;
}
} else
snprintf(*tmpsap, MAXPATHLEN, "/dev/fd/%d", epp->ep_fd);
tmpsap++;
*tmpsap = NULL;
/*
* mark the header we have as invalid; check_exec will read
* the header from the new executable
*/
epp->ep_hdrvalid = 0;
if ((error = check_exec(p, epp)) == 0) {
/* note that we've clobbered the header */
epp->ep_flags |= EXEC_DESTR;
/*
* It succeeded. Unlock the script and
* close it if we aren't using it any more.
* Also, set things up so that the fake args
* list will be used.
*/
if ((epp->ep_flags & EXEC_HASFD) == 0)
vn_close(scriptvp, FREAD, p->p_ucred, p);
/* free the old pathname buffer */
pool_put(&namei_pool, oldpnbuf);
epp->ep_flags |= (EXEC_HASARGL | EXEC_SKIPARG);
epp->ep_fa = shellargp;
/*
* set things up so that set-id scripts will be
* handled appropriately
*/
epp->ep_vap->va_mode |= script_sbits;
if (script_sbits & VSUID)
epp->ep_vap->va_uid = script_uid;
if (script_sbits & VSGID)
epp->ep_vap->va_gid = script_gid;
return (0);
}
/* XXX oldpnbuf not set for "goto fail" path */
epp->ep_ndp->ni_cnd.cn_pnbuf = oldpnbuf;
fail:
/* note that we've clobbered the header */
epp->ep_flags |= EXEC_DESTR;
/* kill the opened file descriptor, else close the file */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fdplock(p->p_fd);
/* fdrelease() unlocks p->p_fd. */
(void) fdrelease(p, epp->ep_fd);
} else
vn_close(scriptvp, FREAD, p->p_ucred, p);
pool_put(&namei_pool, epp->ep_ndp->ni_cnd.cn_pnbuf);
/* free the fake arg list, because we're not returning it */
if (shellargp != NULL) {
free(shellargp[0], M_EXEC, shellnamelen + 1);
if (shellargp[2] != NULL) {
free(shellargp[1], M_EXEC, shellarglen + 1);
free(shellargp[2], M_EXEC, MAXPATHLEN);
} else
free(shellargp[1], M_EXEC, MAXPATHLEN);
free(shellargp, M_EXEC, 4 * sizeof(char *));
}
/*
* free any vmspace-creation commands,
* and release their references
*/
kill_vmcmds(&epp->ep_vmcmds);
return error;
}
/* $OpenBSD: kern_tc.c,v 1.77 2022/08/12 02:20:36 cheloha Exp $ */
/*
* Copyright (c) 2000 Poul-Henning Kamp <phk@FreeBSD.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* If we meet some day, and you think this stuff is worth it, you
* can buy me a beer in return. Poul-Henning Kamp
*/
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/stdint.h>
#include <sys/timeout.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timetc.h>
#include <sys/queue.h>
#include <sys/malloc.h>
u_int dummy_get_timecount(struct timecounter *);
int sysctl_tc_hardware(void *, size_t *, void *, size_t);
int sysctl_tc_choice(void *, size_t *, void *, size_t);
/*
* Implement a dummy timecounter which we can use until we get a real one
* in the air. This allows the console and other early stuff to use
* time services.
*/
u_int
dummy_get_timecount(struct timecounter *tc)
{
static u_int now;
return atomic_inc_int_nv(&now);
}
static struct timecounter dummy_timecounter = {
.tc_get_timecount = dummy_get_timecount,
.tc_poll_pps = NULL,
.tc_counter_mask = ~0u,
.tc_frequency = 1000000,
.tc_name = "dummy",
.tc_quality = -1000000,
.tc_priv = NULL,
.tc_user = 0,
};
/*
* Locks used to protect struct members, global variables in this file:
* I immutable after initialization
* T tc_lock
* W windup_mtx
*/
struct timehands {
/* These fields must be initialized by the driver. */
struct timecounter *th_counter; /* [W] */
int64_t th_adjtimedelta; /* [T,W] */
struct bintime th_next_ntp_update; /* [T,W] */
int64_t th_adjustment; /* [W] */
u_int64_t th_scale; /* [W] */
u_int th_offset_count; /* [W] */
struct bintime th_boottime; /* [T,W] */
struct bintime th_offset; /* [W] */
struct bintime th_naptime; /* [W] */
struct timeval th_microtime; /* [W] */
struct timespec th_nanotime; /* [W] */
/* Fields not to be copied in tc_windup start with th_generation. */
volatile u_int th_generation; /* [W] */
struct timehands *th_next; /* [I] */
};
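/*
 * The timehands form a two-element ring: tc_windup() always writes the
 * idle element (th_next) and then flips the timehands pointer, so
 * lock-free readers can keep using the previous element until they
 * notice the generation change.
 */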
static struct timehands th0;
static struct timehands th1 = {
.th_next = &th0
};
static struct timehands th0 = {
.th_counter = &dummy_timecounter,
.th_scale = UINT64_MAX / 1000000,
.th_offset = { .sec = 1, .frac = 0 },
.th_generation = 1,
.th_next = &th1
};
struct rwlock tc_lock = RWLOCK_INITIALIZER("tc_lock");
/*
* tc_windup() must be called before leaving this mutex.
*/
struct mutex windup_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
static struct timehands *volatile timehands = &th0; /* [W] */
struct timecounter *timecounter = &dummy_timecounter; /* [T] */
static SLIST_HEAD(, timecounter) tc_list = SLIST_HEAD_INITIALIZER(tc_list);
/*
* These are updated from tc_windup(). They are useful when
* examining kernel core dumps.
*/
volatile time_t naptime = 0;
volatile time_t time_second = 1;
volatile time_t time_uptime = 0;
static int timestepwarnings;
void ntp_update_second(struct timehands *);
void tc_windup(struct bintime *, struct bintime *, int64_t *);
/*
* Return the difference between the timehands' counter value now and what
* was when we copied it to the timehands' offset_count.
*/
static __inline u_int
tc_delta(struct timehands *th)
{
struct timecounter *tc;
tc = th->th_counter;
return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
tc->tc_counter_mask);
}
/*
* Functions for reading the time. We have to loop until we are sure that
* the timehands that we operated on was not updated under our feet. See
* the comment in <sys/time.h> for a description of these functions.
*/
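/*
 * tc_windup() zeroes th_generation while it updates a timehands and
 * restores it afterwards, so a reader retries whenever it observes a
 * generation of zero or a generation change across its copy.
 */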
void
binboottime(struct bintime *bt)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
*bt = th->th_boottime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
microboottime(struct timeval *tvp)
{
struct bintime bt;
binboottime(&bt);
BINTIME_TO_TIMEVAL(&bt, tvp);
}
void
nanoboottime(struct timespec *tsp)
{
struct bintime bt;
binboottime(&bt);
BINTIME_TO_TIMESPEC(&bt, tsp);
}
void
binuptime(struct bintime *bt)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
TIMECOUNT_TO_BINTIME(tc_delta(th), th->th_scale, bt);
bintimeadd(bt, &th->th_offset, bt);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getbinuptime(struct bintime *bt)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
nanouptime(struct timespec *tsp)
{
struct bintime bt;
binuptime(&bt);
BINTIME_TO_TIMESPEC(&bt, tsp);
}
void
microuptime(struct timeval *tvp)
{
struct bintime bt;
binuptime(&bt);
BINTIME_TO_TIMEVAL(&bt, tvp);
}
time_t
getuptime(void)
{
#if defined(__LP64__)
return time_uptime; /* atomic */
#else
time_t now;
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
now = th->th_offset.sec;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
return now;
#endif
}
uint64_t
nsecuptime(void)
{
struct bintime bt;
binuptime(&bt);
return BINTIME_TO_NSEC(&bt);
}
uint64_t
getnsecuptime(void)
{
struct bintime bt;
getbinuptime(&bt);
return BINTIME_TO_NSEC(&bt);
}
void
binruntime(struct bintime *bt)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
TIMECOUNT_TO_BINTIME(tc_delta(th), th->th_scale, bt);
bintimeadd(bt, &th->th_offset, bt);
bintimesub(bt, &th->th_naptime, bt);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
nanoruntime(struct timespec *ts)
{
struct bintime bt;
binruntime(&bt);
BINTIME_TO_TIMESPEC(&bt, ts);
}
void
bintime(struct bintime *bt)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
TIMECOUNT_TO_BINTIME(tc_delta(th), th->th_scale, bt);
bintimeadd(bt, &th->th_offset, bt);
bintimeadd(bt, &th->th_boottime, bt);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
nanotime(struct timespec *tsp)
{
struct bintime bt;
bintime(&bt);
BINTIME_TO_TIMESPEC(&bt, tsp);
}
void
microtime(struct timeval *tvp)
{
struct bintime bt;
bintime(&bt);
BINTIME_TO_TIMEVAL(&bt, tvp);
}
time_t
gettime(void)
{
#if defined(__LP64__)
return time_second; /* atomic */
#else
time_t now;
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
now = th->th_microtime.tv_sec;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
return now;
#endif
}
void
getnanouptime(struct timespec *tsp)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
BINTIME_TO_TIMESPEC(&th->th_offset, tsp);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getmicrouptime(struct timeval *tvp)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
BINTIME_TO_TIMEVAL(&th->th_offset, tvp);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanotime(struct timespec *tsp)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
*tsp = th->th_nanotime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getmicrotime(struct timeval *tvp)
{
struct timehands *th;
u_int gen;
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
*tvp = th->th_microtime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
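/*
 * Editorial note: the get*() and *time() readers above all follow the
 * same lockless pattern: load the generation, read the snapshot fields,
 * and retry if the generation was zero (update in progress) or changed.
 * A minimal sketch of that reader, against a hypothetical
 * generation-protected structure (illustrative only, not part of the
 * original source):
 */
#if 0
struct snap {
	u_int		gen;		/* 0 while the producer updates */
	u_int64_t	value;
};

static u_int64_t
snap_read(const struct snap *s)
{
	u_int64_t v;
	u_int gen;

	do {
		gen = s->gen;
		membar_consumer();	/* order gen load before data load */
		v = s->value;
		membar_consumer();	/* order data load before re-check */
	} while (gen == 0 || gen != s->gen);

	return v;
}
#endif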
/*
* Initialize a new timecounter and possibly use it.
*/
void
tc_init(struct timecounter *tc)
{
u_int64_t tmp;
u_int u;
u = tc->tc_frequency / tc->tc_counter_mask;
/* XXX: We need some margin here, 10% is a guess */
u *= 11;
u /= 10;
if (tc->tc_quality >= 0) {
if (u > hz) {
tc->tc_quality = -2000;
printf("Timecounter \"%s\" frequency %lu Hz",
tc->tc_name, (unsigned long)tc->tc_frequency);
printf(" -- Insufficient hz, needs at least %u\n", u);
}
}
/* Determine the counter's precision. */
for (tmp = 1; (tmp & tc->tc_counter_mask) == 0; tmp <<= 1)
continue;
tc->tc_precision = tmp;
SLIST_INSERT_HEAD(&tc_list, tc, tc_next);
/*
* Never automatically use a timecounter with negative quality.
* Even though we run on the dummy counter, switching here may be
* worse since this timecounter may not be monotonic.
*/
if (tc->tc_quality < 0)
return;
if (tc->tc_quality < timecounter->tc_quality)
return;
if (tc->tc_quality == timecounter->tc_quality &&
tc->tc_frequency < timecounter->tc_frequency)
return;
(void)tc->tc_get_timecount(tc);
enqueue_randomness(tc->tc_get_timecount(tc));
timecounter = tc;
}
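/*
 * Editorial note: the precision loop in tc_init() isolates the lowest
 * set bit of tc_counter_mask.  A minimal standalone sketch of the same
 * scan, with sample results (illustrative only):
 */
#if 0
static u_int64_t
counter_precision(u_int counter_mask)
{
	u_int64_t tmp;

	for (tmp = 1; (tmp & counter_mask) == 0; tmp <<= 1)
		continue;
	return tmp;	/* 0xffffffff -> 1, 0xffffff00 -> 256 */
}
#endif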
/*
* Change the given timecounter's quality. If it is the active
* counter and it is no longer the best counter, activate the
* best counter.
*/
void
tc_reset_quality(struct timecounter *tc, int quality)
{
struct timecounter *best = &dummy_timecounter, *tmp;
if (tc == &dummy_timecounter)
panic("%s: cannot change dummy counter quality", __func__);
tc->tc_quality = quality;
if (timecounter == tc) {
SLIST_FOREACH(tmp, &tc_list, tc_next) {
if (tmp->tc_quality < 0)
continue;
if (tmp->tc_quality < best->tc_quality)
continue;
if (tmp->tc_quality == best->tc_quality &&
tmp->tc_frequency < best->tc_frequency)
continue;
best = tmp;
}
if (best != tc) {
enqueue_randomness(best->tc_get_timecount(best));
timecounter = best;
}
}
}
/* Report the frequency of the current timecounter. */
u_int64_t
tc_getfrequency(void)
{
return (timehands->th_counter->tc_frequency);
}
/* Report the precision of the current timecounter. */
u_int64_t
tc_getprecision(void)
{
return (timehands->th_counter->tc_precision);
}
/*
* Step our concept of UTC, aka the realtime clock.
* This is done by modifying our estimate of when we booted.
*
* Any ongoing adjustment is meaningless after a clock jump,
* so we zero adjtimedelta here as well.
*/
void
tc_setrealtimeclock(const struct timespec *ts)
{
struct bintime boottime, old_utc, uptime, utc;
struct timespec tmp;
int64_t zero = 0;
TIMESPEC_TO_BINTIME(ts, &utc);
rw_enter_write(&tc_lock);
mtx_enter(&windup_mtx);
binuptime(&uptime);
bintimesub(&utc, &uptime, &boottime);
bintimeadd(&timehands->th_boottime, &uptime, &old_utc);
/* XXX fiddle all the little crinkly bits around the fiords... */
tc_windup(&boottime, NULL, &zero);
mtx_leave(&windup_mtx);
rw_exit_write(&tc_lock);
enqueue_randomness(ts->tv_sec);
if (timestepwarnings) {
BINTIME_TO_TIMESPEC(&old_utc, &tmp);
log(LOG_INFO, "Time stepped from %lld.%09ld to %lld.%09ld\n",
(long long)tmp.tv_sec, tmp.tv_nsec,
(long long)ts->tv_sec, ts->tv_nsec);
}
}
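/*
 * Editorial note: the step above reduces to boottime = requested UTC -
 * current uptime; every later bintime() read then returns boottime +
 * uptime as before.  Whole-second sketch (illustrative only):
 */
#if 0
static long long
new_boottime_sec(long long requested_utc_sec, long long uptime_sec)
{
	/* uptime 500 s, clock set to 1700000000 -> boottime 1699999500 */
	return requested_utc_sec - uptime_sec;
}
#endif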
/*
* Step the monotonic and realtime clocks, triggering any timeouts that
* should have occurred across the interval.
*/
void
tc_setclock(const struct timespec *ts)
{
struct bintime new_naptime, old_naptime, uptime, utc;
struct timespec tmp;
static int first = 1;
#ifndef SMALL_KERNEL
struct bintime elapsed;
long long adj_ticks;
#endif
/*
* When we're called for the first time, during boot when
* the root partition is mounted, we need to set boottime.
*/
if (first) {
tc_setrealtimeclock(ts);
first = 0;
return;
}
enqueue_randomness(ts->tv_sec);
TIMESPEC_TO_BINTIME(ts, &utc);
mtx_enter(&windup_mtx);
bintimesub(&utc, &timehands->th_boottime, &uptime);
old_naptime = timehands->th_naptime;
/* XXX fiddle all the little crinkly bits around the fiords... */
tc_windup(NULL, &uptime, NULL);
new_naptime = timehands->th_naptime;
mtx_leave(&windup_mtx);
if (bintimecmp(&old_naptime, &new_naptime, ==)) {
BINTIME_TO_TIMESPEC(&uptime, &tmp);
printf("%s: cannot rewind uptime to %lld.%09ld\n",
__func__, (long long)tmp.tv_sec, tmp.tv_nsec);
}
#ifndef SMALL_KERNEL
/* convert the bintime to ticks */
bintimesub(&new_naptime, &old_naptime, &elapsed);
adj_ticks = BINTIME_TO_NSEC(&elapsed) / tick_nsec;
if (adj_ticks > 0) {
if (adj_ticks > INT_MAX)
adj_ticks = INT_MAX;
timeout_adjust_ticks(adj_ticks);
}
#endif
}
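/*
 * Editorial note: the nap interval is turned into hardclock ticks by
 * dividing the elapsed nanoseconds by tick_nsec (10^9 / hz), clamped to
 * INT_MAX.  Sketch of the same conversion (illustrative only):
 */
#if 0
static long long
nap_to_ticks(u_int64_t elapsed_nsec, u_int64_t tick_nsec)
{
	long long adj_ticks = elapsed_nsec / tick_nsec;

	if (adj_ticks > INT_MAX)
		adj_ticks = INT_MAX;
	return adj_ticks;	/* 2 s napped at hz=100 -> 200 ticks */
}
#endif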
void
tc_update_timekeep(void)
{
static struct timecounter *last_tc = NULL;
struct timehands *th;
MUTEX_ASSERT_LOCKED(&windup_mtx);
if (timekeep == NULL)
return;
th = timehands;
timekeep->tk_generation = 0;
membar_producer();
timekeep->tk_scale = th->th_scale;
timekeep->tk_offset_count = th->th_offset_count;
timekeep->tk_offset = th->th_offset;
timekeep->tk_naptime = th->th_naptime;
timekeep->tk_boottime = th->th_boottime;
if (last_tc != th->th_counter) {
timekeep->tk_counter_mask = th->th_counter->tc_counter_mask;
timekeep->tk_user = th->th_counter->tc_user;
last_tc = th->th_counter;
}
membar_producer();
timekeep->tk_generation = th->th_generation;
return;
}
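/*
 * Editorial note: tc_update_timekeep() is the producer half of the
 * generation protocol used by the lockless readers earlier in this
 * file: zero the generation, publish the fields, then restore a
 * non-zero generation, with membar_producer() keeping the stores
 * ordered.  Sketch against the hypothetical struct snap from the reader
 * sketch above (illustrative only):
 */
#if 0
static void
snap_write(struct snap *s, u_int64_t value, u_int gen)
{
	s->gen = 0;		/* readers spin until we are done */
	membar_producer();
	s->value = value;
	membar_producer();
	if (gen == 0)
		gen = 1;	/* never publish generation zero */
	s->gen = gen;
}
#endif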
/*
* Initialize the next struct timehands in the ring and make
* it the active timehands. Along the way we might switch to a different
* timecounter and/or do seconds processing in NTP. Slightly magic.
*/
void
tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
int64_t *new_adjtimedelta)
{
struct bintime bt;
struct timecounter *active_tc;
struct timehands *th, *tho;
u_int64_t scale;
u_int delta, ncount, ogen;
if (new_boottime != NULL || new_adjtimedelta != NULL)
rw_assert_wrlock(&tc_lock);
MUTEX_ASSERT_LOCKED(&windup_mtx);
active_tc = timecounter;
/*
* Make the next timehands a copy of the current one, but do not
* overwrite the generation or next pointer. While we update
* the contents, the generation must be zero.
*/
tho = timehands;
ogen = tho->th_generation;
th = tho->th_next;
th->th_generation = 0;
membar_producer();
memcpy(th, tho, offsetof(struct timehands, th_generation));
/*
* Capture a timecounter delta on the current timecounter and if
* changing timecounters, a counter value from the new timecounter.
* Update the offset fields accordingly.
*/
delta = tc_delta(th);
if (th->th_counter != active_tc)
ncount = active_tc->tc_get_timecount(active_tc);
else
ncount = 0;
th->th_offset_count += delta;
th->th_offset_count &= th->th_counter->tc_counter_mask;
TIMECOUNT_TO_BINTIME(delta, th->th_scale, &bt);
bintimeadd(&th->th_offset, &bt, &th->th_offset);
/*
* Ignore new offsets that predate the current offset.
* If changing the offset, first increase the naptime
* accordingly.
*/
if (new_offset != NULL && bintimecmp(&th->th_offset, new_offset, <)) {
bintimesub(new_offset, &th->th_offset, &bt);
bintimeadd(&th->th_naptime, &bt, &th->th_naptime);
naptime = th->th_naptime.sec;
th->th_offset = *new_offset;
}
#ifdef notyet
/*
* Hardware latching timecounters may not generate interrupts on
* PPS events, so instead we poll them. There is a finite risk that
* the hardware might capture a count which is later than the one we
* got above, and therefore possibly in the next NTP second which might
* have a different rate than the current NTP second. It doesn't
* matter in practice.
*/
if (tho->th_counter->tc_poll_pps)
tho->th_counter->tc_poll_pps(tho->th_counter);
#endif
/*
* If changing the boot time or clock adjustment, do so before
* NTP processing.
*/
if (new_boottime != NULL)
th->th_boottime = *new_boottime;
if (new_adjtimedelta != NULL) {
th->th_adjtimedelta = *new_adjtimedelta;
/* Reset the NTP update period. */
bintimesub(&th->th_offset, &th->th_naptime,
&th->th_next_ntp_update);
}
/*
* Deal with NTP second processing. The while-loop normally
* iterates at most once, but in extreme situations it might
* keep NTP sane if tc_windup() is not run for several seconds.
*/
bintimesub(&th->th_offset, &th->th_naptime, &bt);
while (bintimecmp(&th->th_next_ntp_update, &bt, <=)) {
ntp_update_second(th);
th->th_next_ntp_update.sec++;
}
/* Update the UTC timestamps used by the get*() functions. */
bintimeadd(&th->th_boottime, &th->th_offset, &bt);
BINTIME_TO_TIMEVAL(&bt, &th->th_microtime);
BINTIME_TO_TIMESPEC(&bt, &th->th_nanotime);
/* Now is a good time to change timecounters. */
if (th->th_counter != active_tc) {
th->th_counter = active_tc;
th->th_offset_count = ncount;
}
/*-
* Recalculate the scaling factor. We want the number of 1/2^64
* fractions of a second per period of the hardware counter, taking
* into account the th_adjustment factor which the NTP PLL/adjtime(2)
* processing provides us with.
*
* The th_adjustment is nanoseconds per second with 32 bit binary
* fraction and we want 64 bit binary fraction of second:
*
* x = a * 2^32 / 10^9 = a * 4.294967296
*
* The range of th_adjustment is +/- 5000PPM so inside a 64bit int
* we can only multiply by about 850 without overflowing, but that
* leaves suitably precise fractions for multiply before divide.
*
* Divide before multiply with a fraction of 2199/512 results in a
* systematic undercompensation of 10PPM of th_adjustment. On a
* 5000PPM adjustment this is a 0.05PPM error. This is acceptable.
*
* We happily sacrifice the lowest of the 64 bits of our result
* to the goddess of code clarity.
*
*/
scale = (u_int64_t)1 << 63;
scale += \
((th->th_adjustment + th->th_counter->tc_freq_adj) / 1024) * 2199;
scale /= th->th_counter->tc_frequency;
th->th_scale = scale * 2;
/*
* Now that the struct timehands is again consistent, set the new
* generation number, making sure to not make it zero.
*/
if (++ogen == 0)
ogen = 1;
membar_producer();
th->th_generation = ogen;
/* Go live with the new struct timehands. */
time_second = th->th_microtime.tv_sec;
time_uptime = th->th_offset.sec;
membar_producer();
timehands = th;
tc_update_timekeep();
}
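/*
 * Editorial note: ignoring the adjustment term, the scale computed
 * above is roughly 2^64 / tc_frequency, built as (2^63 / freq) * 2 so
 * the intermediate stays within 64 bits.  Multiplying a counter delta
 * by it yields the 64-bit binary fraction of a second that
 * TIMECOUNT_TO_BINTIME() stores.  Worked sketch for a 1 MHz counter
 * (illustrative only):
 */
#if 0
static u_int64_t
counter_scale(u_int64_t frequency)
{
	u_int64_t scale = (u_int64_t)1 << 63;

	scale /= frequency;	/* 1 MHz -> 9223372036854 */
	return scale * 2;	/* ~18446744073708; 500000 ticks * scale
				 * ~= 2^63, i.e. half a second */
}
#endif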
/* Report or change the active timecounter hardware. */
int
sysctl_tc_hardware(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
char newname[32];
struct timecounter *newtc, *tc;
int error;
tc = timecounter;
strlcpy(newname, tc->tc_name, sizeof(newname));
error = sysctl_string(oldp, oldlenp, newp, newlen, newname, sizeof(newname));
if (error != 0 || strcmp(newname, tc->tc_name) == 0)
return (error);
SLIST_FOREACH(newtc, &tc_list, tc_next) {
if (strcmp(newname, newtc->tc_name) != 0)
continue;
/* Warm up new timecounter. */
(void)newtc->tc_get_timecount(newtc);
(void)newtc->tc_get_timecount(newtc);
rw_enter_write(&tc_lock);
timecounter = newtc;
rw_exit_write(&tc_lock);
return (0);
}
return (EINVAL);
}
/* Report or change the active timecounter hardware. */
int
sysctl_tc_choice(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
char buf[32], *spc, *choices;
struct timecounter *tc;
int error, maxlen;
if (SLIST_EMPTY(&tc_list))
return (sysctl_rdstring(oldp, oldlenp, newp, ""));
spc = "";
maxlen = 0;
SLIST_FOREACH(tc, &tc_list, tc_next)
maxlen += sizeof(buf);
choices = malloc(maxlen, M_TEMP, M_WAITOK);
*choices = '\0';
SLIST_FOREACH(tc, &tc_list, tc_next) {
snprintf(buf, sizeof(buf), "%s%s(%d)",
spc, tc->tc_name, tc->tc_quality);
spc = " ";
strlcat(choices, buf, maxlen);
}
error = sysctl_rdstring(oldp, oldlenp, newp, choices);
free(choices, M_TEMP, maxlen);
return (error);
}
/*
* Timecounters need to be updated every so often to prevent the hardware
* counter from overflowing. Updating also recalculates the cached values
* used by the get*() family of functions, so their precision depends on
* the update frequency.
*/
static int tc_tick;
void
tc_ticktock(void)
{
static int count;
if (++count < tc_tick)
return;
if (!mtx_enter_try(&windup_mtx))
return;
count = 0;
tc_windup(NULL, NULL, NULL);
mtx_leave(&windup_mtx);
}
void
inittimecounter(void)
{
#ifdef DEBUG
u_int p;
#endif
/*
* Set the initial timeout to
* max(1, <approx. number of hardclock ticks in a millisecond>).
* People should probably not use the sysctl to set the timeout
* to smaller than its initial value, since that value is the
* smallest reasonable one. If they want better timestamps they
* should use the non-"get"* functions.
*/
if (hz > 1000)
tc_tick = (hz + 500) / 1000;
else
tc_tick = 1;
#ifdef DEBUG
p = (tc_tick * 1000000) / hz;
printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);
#endif
/* warm up new timecounter (again) and get rolling. */
(void)timecounter->tc_get_timecount(timecounter);
(void)timecounter->tc_get_timecount(timecounter);
}
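/*
 * Editorial note: tc_tick ends up as max(1, hz rounded to ticks per
 * millisecond), so tc_ticktock() calls tc_windup() roughly once per
 * millisecond.  Sketch of the same computation (illustrative only):
 */
#if 0
static int
compute_tc_tick(int hz)
{
	if (hz > 1000)
		return (hz + 500) / 1000;	/* hz=4000 -> 4 */
	return 1;				/* hz=100 -> every tick */
}
#endif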
const struct sysctl_bounded_args tc_vars[] = {
{ KERN_TIMECOUNTER_TICK, &tc_tick, SYSCTL_INT_READONLY },
{ KERN_TIMECOUNTER_TIMESTEPWARNINGS, &timestepwarnings, 0, 1 },
};
/*
* Return timecounter-related information.
*/
int
sysctl_tc(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
if (namelen != 1)
return (ENOTDIR);
switch (name[0]) {
case KERN_TIMECOUNTER_HARDWARE:
return (sysctl_tc_hardware(oldp, oldlenp, newp, newlen));
case KERN_TIMECOUNTER_CHOICE:
return (sysctl_tc_choice(oldp, oldlenp, newp, newlen));
default:
return (sysctl_bounded_arr(tc_vars, nitems(tc_vars), name,
namelen, oldp, oldlenp, newp, newlen));
}
/* NOTREACHED */
}
/*
* Skew the timehands according to any adjtime(2) adjustment.
*/
void
ntp_update_second(struct timehands *th)
{
int64_t adj;
MUTEX_ASSERT_LOCKED(&windup_mtx);
if (th->th_adjtimedelta > 0)
adj = MIN(5000, th->th_adjtimedelta);
else
adj = MAX(-5000, th->th_adjtimedelta);
th->th_adjtimedelta -= adj;
th->th_adjustment = (adj * 1000) << 32;
}
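/*
 * Editorial note: each elapsed second consumes at most 5000
 * microseconds of the pending adjtime(2) delta and publishes it as
 * nanoseconds per second in 32.32 fixed point, matching the +/- 5000PPM
 * range discussed in tc_windup().  E.g. a pending delta of +12000 us is
 * applied as +5000, +5000, +2000 us over three seconds.  Sketch
 * assuming the MIN/MAX macros from <sys/param.h> (illustrative only):
 */
#if 0
static int64_t
adjtime_step(int64_t *pending_usec)
{
	int64_t adj;

	if (*pending_usec > 0)
		adj = MIN(5000, *pending_usec);
	else
		adj = MAX(-5000, *pending_usec);
	*pending_usec -= adj;
	return (adj * 1000) << 32;	/* ns/s, 32-bit binary fraction */
}
#endif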
void
tc_adjfreq(int64_t *old, int64_t *new)
{
if (old != NULL) {
rw_assert_anylock(&tc_lock);
*old = timecounter->tc_freq_adj;
}
if (new != NULL) {
rw_assert_wrlock(&tc_lock);
mtx_enter(&windup_mtx);
timecounter->tc_freq_adj = *new;
tc_windup(NULL, NULL, NULL);
mtx_leave(&windup_mtx);
}
}
void
tc_adjtime(int64_t *old, int64_t *new)
{
struct timehands *th;
u_int gen;
if (old != NULL) {
do {
th = timehands;
gen = th->th_generation;
membar_consumer();
*old = th->th_adjtimedelta;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
if (new != NULL) {
rw_assert_wrlock(&tc_lock);
mtx_enter(&windup_mtx);
tc_windup(NULL, NULL, new);
mtx_leave(&windup_mtx);
}
}
/* $OpenBSD: sysv_ipc.c,v 1.8 2015/03/14 03:38:50 jsg Exp $ */
/* $NetBSD: sysv_ipc.c,v 1.10 1995/06/03 05:53:28 mycroft Exp $ */
/*
* Copyright (c) 1995 Charles M. Hannum. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Charles M. Hannum.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/ipc.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/vnode.h>
/*
* Check for ipc permission
*/
int
ipcperm(struct ucred *cred, struct ipc_perm *perm, int mode)
{
if (mode == IPC_M) {
if (cred->cr_uid == 0 || cred->cr_uid == perm->uid ||
    cred->cr_uid == perm->cuid)
return (0);
return (EPERM);
}
if (vaccess(VNON, perm->mode, perm->uid, perm->gid, mode, cred) == 0 ||
    vaccess(VNON, perm->mode, perm->cuid, perm->cgid, mode, cred) == 0)
return (0);
return (EACCES);
}
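/*
 * Editorial note: a hypothetical caller checking read access on a SysV
 * permission structure before using the object; IPC_R is the read bit
 * from <sys/ipc.h>, and IPC_M selects the owner/creator check handled
 * by the first branch above (illustrative only):
 */
#if 0
static int
example_ipc_read_check(struct ucred *cred, struct ipc_perm *perm)
{
	return ipcperm(cred, perm, IPC_R);	/* 0, EACCES or EPERM */
}
#endif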
/* $OpenBSD: libkern.h,v 1.36 2020/02/26 14:23:15 visa Exp $ */
/* $NetBSD: libkern.h,v 1.7 1996/03/14 18:52:08 christos Exp $ */
/*-
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)libkern.h 8.1 (Berkeley) 6/10/93
*/
#ifndef __LIBKERN_H__
#define __LIBKERN_H__
#include <sys/types.h>
#ifndef LIBKERN_INLINE
#define LIBKERN_INLINE static __inline
#define LIBKERN_BODY
#endif
LIBKERN_INLINE int imax(int, int);
LIBKERN_INLINE int imin(int, int);
LIBKERN_INLINE u_int max(u_int, u_int);
LIBKERN_INLINE u_int min(u_int, u_int);
LIBKERN_INLINE long lmax(long, long);
LIBKERN_INLINE long lmin(long, long);
LIBKERN_INLINE u_long ulmax(u_long, u_long);
LIBKERN_INLINE u_long ulmin(u_long, u_long);
LIBKERN_INLINE int abs(int);
#ifdef LIBKERN_BODY
LIBKERN_INLINE int
imax(int a, int b)
{
return (a > b ? a : b);
}
LIBKERN_INLINE int
imin(int a, int b)
{
return (a < b ? a : b);
}
LIBKERN_INLINE long
lmax(long a, long b)
{
return (a > b ? a : b);
}
LIBKERN_INLINE long
lmin(long a, long b)
{
return (a < b ? a : b);
}
LIBKERN_INLINE u_int
max(u_int a, u_int b)
{
return (a > b ? a : b);
}
LIBKERN_INLINE u_int
min(u_int a, u_int b)
{
return (a < b ? a : b);
}
LIBKERN_INLINE u_long
ulmax(u_long a, u_long b)
{
return (a > b ? a : b);
}
LIBKERN_INLINE u_long
ulmin(u_long a, u_long b)
{
return (a < b ? a : b);
}
LIBKERN_INLINE int
abs(int j)
{
return(j < 0 ? -j : j);
}
#endif
#ifdef NDEBUG /* tradition! */
#define assert(e) ((void)0)
#else
#define assert(e) ((e) ? (void)0 : \
__assert("", __FILE__, __LINE__, #e))
#endif
#define __KASSERTSTR "kernel %sassertion \"%s\" failed: file \"%s\", line %d"
#ifndef DIAGNOSTIC
#define KASSERTMSG(e, msg, ...) ((void)0)
#define KASSERT(e) ((void)0)
#else
#define KASSERTMSG(e, msg, ...) ((e) ? (void)0 : \
panic(__KASSERTSTR " " msg, "diagnostic ", #e, \
__FILE__, __LINE__, ## __VA_ARGS__))
#define KASSERT(e) ((e) ? (void)0 : \
__assert("diagnostic ", __FILE__, __LINE__, #e))
#endif
#ifndef DEBUG
#define KDASSERTMSG(e, msg, ...) ((void)0)
#define KDASSERT(e) ((void)0)
#else
#define KDASSERTMSG(e, msg, ...) ((e) ? (void)0 : \
panic(__KASSERTSTR " " msg, "debugging ", #e, \
__FILE__, __LINE__, ## __VA_ARGS__))
#define KDASSERT(e) ((e) ? (void)0 : \
__assert("debugging ", __FILE__, __LINE__, #e))
#endif
#define CTASSERT(x) extern char _ctassert[(x) ? 1 : -1 ] \
__attribute__((__unused__))
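/*
 * Editorial note: CTASSERT() is checked at compile time, KASSERT() and
 * KASSERTMSG() only fire in DIAGNOSTIC kernels, and the KDASSERT()
 * variants only in DEBUG kernels.  Usage sketch (illustrative only):
 */
#if 0
CTASSERT(sizeof(long) >= sizeof(int));		/* compile-time check */

static int
example_div(int num, int denom)
{
	KASSERT(denom != 0);				/* DIAGNOSTIC only */
	KASSERTMSG(num >= 0, "negative num %d", num);	/* with a message */
	KDASSERT(num < 1000);				/* DEBUG only */
	return num / denom;
}
#endif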
/* Prototypes for non-quad routines. */
void __assert(const char *, const char *, int, const char *)
__attribute__ ((__noreturn__));
int bcmp(const void *, const void *, size_t);
void bzero(void *, size_t);
void explicit_bzero(void *, size_t);
int ffs(int);
int fls(int);
int flsl(long);
void *memchr(const void *, int, size_t);
int memcmp(const void *, const void *, size_t);
void *memset(void *, int c, size_t len);
u_int32_t random(void);
int scanc(u_int, const u_char *, const u_char [], int);
int skpc(int, size_t, u_char *);
size_t strlen(const char *);
char *strncpy(char *, const char *, size_t)
__attribute__ ((__bounded__(__string__,1,3)));
size_t strnlen(const char *, size_t);
size_t strlcpy(char *, const char *, size_t)
__attribute__ ((__bounded__(__string__,1,3)));
size_t strlcat(char *, const char *, size_t)
__attribute__ ((__bounded__(__string__,1,3)));
int strcmp(const char *, const char *);
int strncmp(const char *, const char *, size_t);
int strncasecmp(const char *, const char *, size_t);
size_t getsn(char *, size_t)
__attribute__ ((__bounded__(__string__,1,2)));
char *strchr(const char *, int);
char *strrchr(const char *, int);
int timingsafe_bcmp(const void *, const void *, size_t);
#endif /* __LIBKERN_H__ */
/* $OpenBSD: mem.c,v 1.35 2021/03/24 14:26:39 bluhm Exp $ */
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)mem.c 8.3 (Berkeley) 1/12/94
*/
/*
* Memory special file
*/
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/ioccom.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <machine/cpu.h>
#include <uvm/uvm_extern.h>
caddr_t zeropage;
extern int start, end, etext;
/* open counter for aperture */
#ifdef APERTURE
static int ap_open_count = 0;
extern int allowaperture;
#define VGA_START 0xA0000
#define BIOS_END 0xFFFFF
#endif
#ifdef MTRR
struct mem_range_softc mem_range_softc;
int mem_ioctl(dev_t, u_long, caddr_t, int, struct proc *);
int mem_range_attr_get(struct mem_range_desc *, int *);
int mem_range_attr_set(struct mem_range_desc *, int *);
#endif
int
mmopen(dev_t dev, int flag, int mode, struct proc *p)
{
extern int allowkmem;
switch (minor(dev)) {
case 0:
case 1:
if (securelevel <= 0 || allowkmem)
break;
return (EPERM);
case 2:
case 12:
break;
#ifdef APERTURE
case 4:
if (suser(p) != 0 || !allowaperture)
return (EPERM);
/* authorize only one simultaneous open() unless
* allowaperture=3 */
if (ap_open_count > 0 && allowaperture < 3)
return (EPERM);
ap_open_count++;
break;
#endif
default:
return (ENXIO);
}
return (0);
}
int
mmclose(dev_t dev, int flag, int mode, struct proc *p)
{
#ifdef APERTURE
if (minor(dev) == 4)
ap_open_count = 0;
#endif
return (0);
}
int
mmrw(dev_t dev, struct uio *uio, int flags)
{
extern vaddr_t kern_end;
vaddr_t v;
size_t c;
struct iovec *iov;
int error = 0;
while (uio->uio_resid > 0 && error == 0) {
iov = uio->uio_iov;
if (iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
if (uio->uio_iovcnt < 0)
panic("mmrw");
continue;
}
switch (minor(dev)) {
/* minor device 0 is physical memory */
case 0:
v = PMAP_DIRECT_MAP(uio->uio_offset);
error = uiomove((caddr_t)v, uio->uio_resid, uio);
continue;
/* minor device 1 is kernel memory */
case 1:
v = uio->uio_offset;
c = ulmin(iov->iov_len, MAXPHYS);
if (v >= (vaddr_t)&start && v < kern_end - c) {
if (v < (vaddr_t)&etext - c &&
uio->uio_rw == UIO_WRITE)
return EFAULT;
} else if ((!uvm_kernacc((caddr_t)v, c,
uio->uio_rw == UIO_READ ? B_READ : B_WRITE)) &&
(v < PMAP_DIRECT_BASE || v > PMAP_DIRECT_END - c))
return (EFAULT);
error = uiomove((caddr_t)v, c, uio);
continue;
/* minor device 2 is /dev/null */
case 2:
if (uio->uio_rw == UIO_WRITE)
uio->uio_resid = 0;
return (0);
/* minor device 12 is /dev/zero */
case 12:
if (uio->uio_rw == UIO_WRITE) {
c = iov->iov_len;
break;
}
if (zeropage == NULL)
zeropage =
malloc(PAGE_SIZE, M_TEMP, M_WAITOK|M_ZERO);
c = ulmin(iov->iov_len, PAGE_SIZE);
error = uiomove(zeropage, c, uio);
continue;
default:
return (ENXIO);
}
iov->iov_base += c;
iov->iov_len -= c;
uio->uio_offset += c;
uio->uio_resid -= c;
}
return (error);
}
paddr_t
mmmmap(dev_t dev, off_t off, int prot)
{
struct proc *p = curproc; /* XXX */
switch (minor(dev)) {
/* minor device 0 is physical memory */
case 0:
if (suser(p) != 0 && amd64_pa_used(off))
return -1;
return off;
#ifdef APERTURE
/* minor device 4 is aperture driver */
case 4:
/* Check if a write combining mapping is requested. */
if (off >= MEMRANGE_WC_RANGE)
off = (off - MEMRANGE_WC_RANGE) | PMAP_WC;
switch (allowaperture) {
case 1:
/* Allow mapping of the VGA framebuffer & BIOS only */
if ((off >= VGA_START && off <= BIOS_END) || !amd64_pa_used(off))
return off;
else
return -1;
case 2:
case 3:
/* Allow mapping of the whole 1st megabyte
for x86emu */
if (off <= BIOS_END || !amd64_pa_used(off))
return off;
else
return -1;
default:
return -1;
}
#endif
default:
return -1;
}
}
int
mmioctl(dev_t dev, u_long cmd, caddr_t data, int flags, struct proc *p)
{
switch (cmd) {
case FIONBIO:
case FIOASYNC:
/* handled by fd layer */
return 0;
}
#ifdef MTRR
switch (minor(dev)) {
case 0:
case 4:
return mem_ioctl(dev, cmd, data, flags, p);
}
#endif
return (ENODEV);
}
#ifdef MTRR
/*
* Operations for changing memory attributes.
*
* This is basically just an ioctl shim for mem_range_attr_get
* and mem_range_attr_set.
*/
int
mem_ioctl(dev_t dev, u_long cmd, caddr_t data, int flags, struct proc *p)
{
int nd, error = 0;
struct mem_range_op *mo = (struct mem_range_op *)data;
struct mem_range_desc *md;
/* is this for us? */
if ((cmd != MEMRANGE_GET) &&
(cmd != MEMRANGE_SET))
return (ENOTTY);
/* any chance we can handle this? */
if (mem_range_softc.mr_op == NULL)
return (EOPNOTSUPP);
/* do we have any descriptors? */
if (mem_range_softc.mr_ndesc == 0)
return (ENXIO);
switch (cmd) {
case MEMRANGE_GET:
nd = imin(mo->mo_arg[0], mem_range_softc.mr_ndesc);
if (nd > 0) {
md = mallocarray(nd, sizeof(struct mem_range_desc),
M_MEMDESC, M_WAITOK);
error = mem_range_attr_get(md, &nd);
if (!error)
error = copyout(md, mo->mo_desc,
nd * sizeof(struct mem_range_desc));
free(md, M_MEMDESC, nd * sizeof(struct mem_range_desc));
} else {
nd = mem_range_softc.mr_ndesc;
}
mo->mo_arg[0] = nd;
break;
case MEMRANGE_SET:
md = malloc(sizeof(struct mem_range_desc), M_MEMDESC, M_WAITOK);
error = copyin(mo->mo_desc, md, sizeof(struct mem_range_desc));
/* clamp description string */
md->mr_owner[sizeof(md->mr_owner) - 1] = 0;
if (error == 0)
error = mem_range_attr_set(md, &mo->mo_arg[0]);
free(md, M_MEMDESC, sizeof(struct mem_range_desc));
break;
}
return (error);
}
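/*
 * Editorial note: a hypothetical userland use of MEMRANGE_GET on a file
 * descriptor opened on /dev/mem (assumes <sys/memrange.h>, <sys/ioctl.h>
 * and the usual libc headers).  Passing mo_arg[0] == 0 first returns
 * the descriptor count; a second call fills the caller's buffer
 * (illustrative only):
 */
#if 0
static struct mem_range_desc *
example_get_ranges(int memfd, int *countp)
{
	struct mem_range_op mo;
	struct mem_range_desc *md;

	memset(&mo, 0, sizeof(mo));
	if (ioctl(memfd, MEMRANGE_GET, &mo) == -1)	/* ask for count */
		return NULL;
	if ((md = calloc(mo.mo_arg[0], sizeof(*md))) == NULL)
		return NULL;
	mo.mo_desc = md;
	if (ioctl(memfd, MEMRANGE_GET, &mo) == -1) {	/* fetch descriptors */
		free(md);
		return NULL;
	}
	*countp = mo.mo_arg[0];
	return md;
}
#endif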
/*
* Implementation-neutral, kernel-callable functions for manipulating
* memory range attributes.
*/
int
mem_range_attr_get(struct mem_range_desc *mrd, int *arg)
{
/* can we handle this? */
if (mem_range_softc.mr_op == NULL)
return (EOPNOTSUPP);
if (*arg == 0) {
*arg = mem_range_softc.mr_ndesc;
} else {
memcpy(mrd, mem_range_softc.mr_desc, (*arg) * sizeof(struct mem_range_desc));
}
return (0);
}
int
mem_range_attr_set(struct mem_range_desc *mrd, int *arg)
{
/* can we handle this? */
if (mem_range_softc.mr_op == NULL)
return (EOPNOTSUPP);
return (mem_range_softc.mr_op->set(&mem_range_softc, mrd, arg));
}
#endif /* MTRR */
/* $OpenBSD: uvm_pmemrange.h,v 1.14 2016/09/16 02:47:09 dlg Exp $ */
/*
* Copyright (c) 2009 Ariane van der Steldt <ariane@stack.nl>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* uvm_pmemrange.h: describe and manage free physical memory.
*/
#ifndef _UVM_UVM_PMEMRANGE_H_
#define _UVM_UVM_PMEMRANGE_H_
RBT_HEAD(uvm_pmr_addr, vm_page);
RBT_HEAD(uvm_pmr_size, vm_page);
/*
* Page types available:
* - DIRTY: this page may contain random data.
* - ZERO: this page has been zeroed.
*/
#define UVM_PMR_MEMTYPE_DIRTY 0
#define UVM_PMR_MEMTYPE_ZERO 1
#define UVM_PMR_MEMTYPE_MAX 2
/*
* An address range of memory.
*/
struct uvm_pmemrange {
struct uvm_pmr_addr addr; /* Free page chunks, sorted by addr. */
struct uvm_pmr_size size[UVM_PMR_MEMTYPE_MAX];
/* Free page chunks, sorted by size. */
TAILQ_HEAD(, vm_page) single[UVM_PMR_MEMTYPE_MAX];
/* single page regions (uses pageq) */
paddr_t low; /* Start of address range (pgno). */
paddr_t high; /* End +1 (pgno). */
int use; /* Use counter. */
psize_t nsegs; /* Current range count. */
TAILQ_ENTRY(uvm_pmemrange) pmr_use;
/* pmr, sorted by use */
RBT_ENTRY(uvm_pmemrange) pmr_addr;
/* pmr, sorted by address */
};
/*
* Description of failing memory allocation.
*
* Two ways new pages can become available:
* [1] page daemon drops them (we notice because they are freed)
* [2] a process calls free
*
* The buffer cache and page daemon can decide that they don't have the
* ability to make pages available in the requested range. In that case,
* the FAIL bit will be set.
* XXX There's a possibility that a page is no longer on the queues but
* XXX has not yet been freed, or that a page was busy.
* XXX Also, wired pages are not considered for paging, so they could
* XXX cause a failure that may be recoverable.
*/
struct uvm_pmalloc {
TAILQ_ENTRY(uvm_pmalloc) pmq;
/*
* Allocation request parameters.
*/
struct uvm_constraint_range pm_constraint;
psize_t pm_size;
/*
* State flags.
*/
int pm_flags;
};
/*
* uvm_pmalloc flags.
*/
#define UVM_PMA_LINKED 0x01 /* uvm_pmalloc is on list */
#define UVM_PMA_BUSY 0x02 /* entry is busy with fpageq unlocked */
#define UVM_PMA_FAIL 0x10 /* page daemon cannot free pages */
#define UVM_PMA_FREED 0x20 /* at least one page in the range was freed */
RBT_HEAD(uvm_pmemrange_addr, uvm_pmemrange);
TAILQ_HEAD(uvm_pmemrange_use, uvm_pmemrange);
/*
* pmr control structure. Contained in uvm.pmr_control.
*/
struct uvm_pmr_control {
struct uvm_pmemrange_addr addr;
struct uvm_pmemrange_use use;
/* Only changed while fpageq is locked. */
TAILQ_HEAD(, uvm_pmalloc) allocs;
};
void uvm_pmr_freepages(struct vm_page *, psize_t);
void uvm_pmr_freepageq(struct pglist *);
int uvm_pmr_getpages(psize_t, paddr_t, paddr_t, paddr_t, paddr_t,
int, int, struct pglist *);
void uvm_pmr_init(void);
int uvm_wait_pla(paddr_t, paddr_t, paddr_t, int);
void uvm_wakeup_pla(paddr_t, psize_t);
#if defined(DDB) || defined(DEBUG)
int uvm_pmr_isfree(struct vm_page *pg);
#endif
/*
* Internal tree logic.
*/
int uvm_pmr_addr_cmp(const struct vm_page *, const struct vm_page *);
int uvm_pmr_size_cmp(const struct vm_page *, const struct vm_page *);
RBT_PROTOTYPE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
RBT_PROTOTYPE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
RBT_PROTOTYPE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
uvm_pmemrange_addr_cmp);
struct vm_page *uvm_pmr_insert_addr(struct uvm_pmemrange *,
struct vm_page *, int);
void uvm_pmr_insert_size(struct uvm_pmemrange *,
struct vm_page *);
struct vm_page *uvm_pmr_insert(struct uvm_pmemrange *,
struct vm_page *, int);
void uvm_pmr_remove_addr(struct uvm_pmemrange *,
struct vm_page *);
void uvm_pmr_remove_size(struct uvm_pmemrange *,
struct vm_page *);
void uvm_pmr_remove(struct uvm_pmemrange *,
struct vm_page *);
struct vm_page *uvm_pmr_extract_range(struct uvm_pmemrange *,
struct vm_page *, paddr_t, paddr_t,
struct pglist *);
#endif /* _UVM_UVM_PMEMRANGE_H_ */
/* $OpenBSD: kref.h,v 1.4 2020/06/17 02:58:15 jsg Exp $ */
/*
* Copyright (c) 2015 Mark Kettenis
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifndef _LINUX_KREF_H
#define _LINUX_KREF_H
#include <sys/types.h>
#include <sys/rwlock.h>
#include <sys/atomic.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/refcount.h>
struct kref {
uint32_t refcount;
};
static inline void
kref_init(struct kref *ref)
{
atomic_set(&ref->refcount, 1);
}
static inline unsigned int
kref_read(const struct kref *ref)
{
return atomic_read(&ref->refcount);
}
static inline void
kref_get(struct kref *ref)
{
atomic_inc_int(&ref->refcount);
}
static inline int
kref_get_unless_zero(struct kref *ref)
{
if (ref->refcount != 0) {
atomic_inc_int(&ref->refcount);
return (1);
} else {
return (0);
}
}
static inline int
kref_put(struct kref *ref, void (*release)(struct kref *ref))
{
if (atomic_dec_int_nv(&ref->refcount) == 0) {
release(ref);
return 1;
}
return 0;
}
static inline int
kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref),
struct rwlock *lock)
{
if (!atomic_add_unless(&kref->refcount, -1, 1)) {
rw_enter_write(lock);
if (likely(atomic_dec_and_test(&kref->refcount))) {
release(kref);
return 1;
}
rw_exit_write(lock);
return 0;
}
return 0;
}
static inline int
kref_put_lock(struct kref *kref, void (*release)(struct kref *kref),
struct mutex *lock)
{
if (!atomic_add_unless(&kref->refcount, -1, 1)) {
mtx_enter(lock);
if (likely(atomic_dec_and_test(&kref->refcount))) {
release(kref);
return 1;
}
mtx_leave(lock);
return 0;
}
return 0;
}
#endif
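/*
 * Editorial note: typical use of the kref shim above against a
 * hypothetical object type; assumes the compat container_of() macro and
 * a drm malloc type (illustrative only, not part of the original
 * header):
 */
#if 0
struct example_obj {
	struct kref	ref;
	int		value;
};

static void
example_obj_release(struct kref *ref)
{
	struct example_obj *obj = container_of(ref, struct example_obj, ref);

	free(obj, M_DRM, sizeof(*obj));
}

static struct example_obj *
example_obj_create(void)
{
	struct example_obj *obj;

	obj = malloc(sizeof(*obj), M_DRM, M_WAITOK | M_ZERO);
	kref_init(&obj->ref);			/* refcount starts at 1 */
	return obj;
}

/* Later: kref_get(&obj->ref) per extra user,
 * kref_put(&obj->ref, example_obj_release) when done. */
#endif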
/* $OpenBSD: mld6.c,v 1.60 2022/09/05 15:47:39 bluhm Exp $ */
/* $KAME: mld6.c,v 1.26 2001/02/16 14:50:35 itojun Exp $ */
/*
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988 Stephen Deering.
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/mld6.h>
#include <netinet6/mld6_var.h>
static struct ip6_pktopts ip6_opts;
int mld6_timers_are_running; /* [N] shortcut for fast timer */
void mld6_checktimer(struct ifnet *);
static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *);
void
mld6_init(void)
{
static u_int8_t hbh_buf[8];
struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf;
u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD);
mld6_timers_are_running = 0;
/* ip6h_nxt will be filled in later */
hbh->ip6h_len = 0; /* (8 >> 3) - 1 */
/* XXX: grotty hard coding... */
hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */
hbh_buf[3] = 0;
hbh_buf[4] = IP6OPT_ROUTER_ALERT;
hbh_buf[5] = IP6OPT_RTALERT_LEN - 2;
memcpy(&hbh_buf[6], (caddr_t)&rtalert_code, sizeof(u_int16_t));
ip6_initpktopts(&ip6_opts);
ip6_opts.ip6po_hbh = hbh;
}
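/*
 * Editorial note: equivalent static initialization of the 8-byte
 * hop-by-hop buffer built above, shown for clarity; the real code
 * stores htons(IP6OPT_RTALERT_MLD) into bytes 6-7 (illustrative only):
 */
#if 0
static u_int8_t hbh_example[8] = {
	0,			/* ip6h_nxt, filled in by ip6_output() */
	0,			/* ip6h_len: (0 + 1) * 8 = 8 bytes total */
	IP6OPT_PADN, 0,		/* 2-byte PadN option */
	IP6OPT_ROUTER_ALERT,
	IP6OPT_RTALERT_LEN - 2,	/* 2 bytes of option data follow */
	0, IP6OPT_RTALERT_MLD,	/* router alert code, network byte order */
};
#endif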
void
mld6_start_listening(struct in6_multi *in6m)
{
/* XXX: These are necessary for KAME's link-local hack */
struct in6_addr all_nodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
/*
* RFC2710 page 10:
* The node never sends a Report or Done for the link-scope all-nodes
* address.
* MLD messages are never sent for multicast addresses whose scope is 0
* (reserved) or 1 (node-local).
*/
all_nodes.s6_addr16[1] = htons(in6m->in6m_ifidx);
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_nodes) ||
__IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
__IPV6_ADDR_SCOPE_LINKLOCAL) {
in6m->in6m_timer = 0;
in6m->in6m_state = MLD_OTHERLISTENER;
} else {
mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_timer =
MLD_RANDOM_DELAY(MLD_V1_MAX_RI *
PR_FASTHZ);
in6m->in6m_state = MLD_IREPORTEDLAST;
mld6_timers_are_running = 1;
}
}
void
mld6_stop_listening(struct in6_multi *in6m)
{
/* XXX: These are necessary for KAME's link-local hack */
struct in6_addr all_nodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
struct in6_addr all_routers = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
all_nodes.s6_addr16[1] = htons(in6m->in6m_ifidx);
/* XXX: necessary when mrouting */
all_routers.s6_addr16[1] = htons(in6m->in6m_ifidx);
if (in6m->in6m_state == MLD_IREPORTEDLAST &&
    (!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_nodes)) &&
__IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) >
__IPV6_ADDR_SCOPE_INTFACELOCAL)
mld6_sendpkt(in6m, MLD_LISTENER_DONE, &all_routers);
}
void
mld6_input(struct mbuf *m, int off)
{
struct ip6_hdr *ip6;
struct mld_hdr *mldh;
struct ifnet *ifp;
struct in6_multi *in6m;
struct ifmaddr *ifma;
int timer; /* timer value in the MLD query header */
/* XXX: These are necessary for KAME's link-local hack */
struct in6_addr all_nodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh));
if (mldh == NULL) {
icmp6stat_inc(icp6s_tooshort);
return;
}
/* source address validation */
ip6 = mtod(m, struct ip6_hdr *);	/* in case of m_pullup() */
if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) {
#if 0
char src[INET6_ADDRSTRLEN], grp[INET6_ADDRSTRLEN];
log(LOG_ERR,
"mld_input: src %s is not link-local (grp=%s)\n",
inet_ntop(AF_INET6, &ip6->ip6_src, src, sizeof(src)),
inet_ntop(AF_INET6, &mldh->mld_addr, grp, sizeof(grp)));
#endif
/*
* spec (RFC2710) does not explicitly
* specify to discard the packet from a non link-local
* source address. But we believe it's expected to do so.
*/
m_freem(m);
return;
}
ifp = if_get(m->m_pkthdr.ph_ifidx);
if (ifp == NULL) {
m_freem(m);
return;
}
/*
* In the MLD6 specification, there are 3 states and a flag.
*
* In Non-Listener state, we simply don't have a membership record.
* In Delaying Listener state, our timer is running (in6m->in6m_timer)
* In Idle Listener state, our timer is not running (in6m->in6m_timer==0)
*
* The flag is in6m->in6m_state, it is set to MLD_OTHERLISTENER if
* we have heard a report from another member, or MLD_IREPORTEDLAST
* if we sent the last report.
*/
switch(mldh->mld_type) {
case MLD_LISTENER_QUERY:
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN6_IS_ADDR_UNSPECIFIED(&mldh->mld_addr) &&
!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr))
break; /* print error or log stat? */
if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
mldh->mld_addr.s6_addr16[1] =
htons(ifp->if_index); /* XXX */
/*
* - Start the timers in all of our membership records
* that the query applies to for the interface on
* which the query arrived excl. those that belong
* to the "all-nodes" group (ff02::1).
* - Restart any timer that is already running but has
* A value longer than the requested timeout.
* - Use the value specified in the query message as
* the maximum timeout.
*/
/*
* XXX: System timer resolution is too low to handle Max
* Response Delay, so set the internal timer to 1 even if
* the calculated value equals zero when Max Response
* Delay is positive.
*/
timer = ntohs(mldh->mld_maxdelay)*PR_FASTHZ/MLD_TIMER_SCALE;
if (timer == 0 && mldh->mld_maxdelay)
timer = 1;
all_nodes.s6_addr16[1] = htons(ifp->if_index);
TAILQ_FOREACH(ifma, &ifp->if_maddrlist, ifma_list) {
if (ifma->ifma_addr->sa_family != AF_INET6)
continue;
in6m = ifmatoin6m(ifma);
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_nodes) ||
__IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
__IPV6_ADDR_SCOPE_LINKLOCAL)
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&mldh->mld_addr) ||
IN6_ARE_ADDR_EQUAL(&mldh->mld_addr,
&in6m->in6m_addr))
{
if (timer == 0) {
/* send a report immediately */
mld6_sendpkt(in6m, MLD_LISTENER_REPORT,
NULL);
in6m->in6m_timer = 0; /* reset timer */
in6m->in6m_state = MLD_IREPORTEDLAST;
} else if (in6m->in6m_timer == 0 || /* idle */
in6m->in6m_timer > timer) {
in6m->in6m_timer =
MLD_RANDOM_DELAY(timer);
mld6_timers_are_running = 1;
}
}
}
if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
mldh->mld_addr.s6_addr16[1] = 0; /* XXX */
break;
case MLD_LISTENER_REPORT:
/*
* For fast leave to work, we have to know that we are the
* last person to send a report for this group. Reports
* can potentially get looped back if we are a multicast
* router, so discard reports sourced by me.
* Note that it is impossible to check IFF_LOOPBACK flag of
* ifp for this purpose, since ip6_mloopback passes the physical
* interface to if_input_local().
*/
if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */
break;
if (!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr))
break;
if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
mldh->mld_addr.s6_addr16[1] =
htons(ifp->if_index); /* XXX */
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
IN6_LOOKUP_MULTI(mldh->mld_addr, ifp, in6m);
if (in6m) {
in6m->in6m_timer = 0; /* transit to idle state */
in6m->in6m_state = MLD_OTHERLISTENER; /* clear flag */
}
if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
mldh->mld_addr.s6_addr16[1] = 0; /* XXX */
break;
default: /* this is impossible */
#if 0
/*
* this case should be impossible because of filtering in
* icmp6_input(). But we explicitly disabled this part
* just in case.
*/
log(LOG_ERR, "mld_input: illegal type(%d)", mldh->mld_type);
#endif
break;
}
if_put(ifp);
m_freem(m);
}
void
mld6_fasttimeo(void)
{
struct ifnet *ifp;
/*
* Quick check to see if any work needs to be done, in order
* to minimize the overhead of fasttimo processing.
* The variable mld6_timers_are_running is read atomically but without
* a lock, intentionally. If it is not yet set due to MP races, we may
* miss checking the timers this round; they will run at the next fast timeout.
*/
if (!mld6_timers_are_running)
return;
NET_LOCK();
mld6_timers_are_running = 0;
TAILQ_FOREACH(ifp, &ifnet, if_list)
mld6_checktimer(ifp);
NET_UNLOCK();
}
void
mld6_checktimer(struct ifnet *ifp)
{
struct in6_multi *in6m;
struct ifmaddr *ifma;
NET_ASSERT_LOCKED();
TAILQ_FOREACH(ifma, &ifp->if_maddrlist, ifma_list) {
if (ifma->ifma_addr->sa_family != AF_INET6)
continue;
in6m = ifmatoin6m(ifma);
if (in6m->in6m_timer == 0) {
/* do nothing */
} else if (--in6m->in6m_timer == 0) {
mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_state = MLD_IREPORTEDLAST;
} else {
mld6_timers_are_running = 1;
}
}
}
static void
mld6_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst)
{
struct mbuf *mh, *md;
struct mld_hdr *mldh;
struct ip6_hdr *ip6;
struct ip6_moptions im6o;
struct in6_ifaddr *ia6;
struct ifnet *ifp;
int ignflags;
ifp = if_get(in6m->in6m_ifidx);
if (ifp == NULL)
return;
/*
* At first, find a link local address on the outgoing interface
* to use as the source address of the MLD packet.
* We do not reject tentative addresses for MLD report to deal with
* the case where we first join a link-local address.
*/
ignflags = IN6_IFF_DUPLICATED|IN6_IFF_ANYCAST;
if ((ia6 = in6ifa_ifpforlinklocal(ifp, ignflags)) == NULL) {
if_put(ifp);
return;
}
if ((ia6->ia6_flags & IN6_IFF_TENTATIVE))
ia6 = NULL;
/*
* Allocate mbufs to store ip6 header and MLD header.
* We allocate 2 mbufs and make chain in advance because
* it is more convenient when inserting the hop-by-hop option later.
*/
MGETHDR(mh, M_DONTWAIT, MT_HEADER);
if (mh == NULL) {
if_put(ifp);
return;
}
MGET(md, M_DONTWAIT, MT_DATA);
if (md == NULL) {
m_free(mh);
if_put(ifp);
return;
}
mh->m_next = md;
mh->m_pkthdr.ph_ifidx = 0;
mh->m_pkthdr.ph_rtableid = ifp->if_rdomain;
mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
mh->m_len = sizeof(struct ip6_hdr);
m_align(mh, sizeof(struct ip6_hdr));
/* fill in the ip6 header */
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
/* ip6_hlim will be set by im6o.im6o_hlim */
ip6->ip6_src = ia6 ? ia6->ia_addr.sin6_addr : in6addr_any;
ip6->ip6_dst = dst ? *dst : in6m->in6m_addr;
/* fill in the MLD header */
md->m_len = sizeof(struct mld_hdr);
mldh = mtod(md, struct mld_hdr *);
mldh->mld_type = type;
mldh->mld_code = 0;
mldh->mld_cksum = 0;
/* XXX: we assume the function will not be called for query messages */
mldh->mld_maxdelay = 0;
mldh->mld_reserved = 0;
mldh->mld_addr = in6m->in6m_addr;
if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
mldh->mld_addr.s6_addr16[1] = 0; /* XXX */
mh->m_pkthdr.csum_flags |= M_ICMP_CSUM_OUT;
/* construct multicast option */
bzero(&im6o, sizeof(im6o));
im6o.im6o_ifidx = ifp->if_index;
im6o.im6o_hlim = 1;
/*
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
#ifdef MROUTING
im6o.im6o_loop = (ip6_mrouter[ifp->if_rdomain] != NULL);
#endif
if_put(ifp);
icmp6stat_inc(icp6s_outhist + type);
ip6_output(mh, &ip6_opts, NULL, ia6 ? 0 : IPV6_UNSPECSRC, &im6o,
NULL);
}
/* $OpenBSD: uvm_pager.c,v 1.89 2022/08/19 05:53:19 mpi Exp $ */
/* $NetBSD: uvm_pager.c,v 1.36 2000/11/27 18:26:41 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp
*/
/*
* uvm_pager.c: generic functions used to assist the pagers.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/buf.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
const struct uvm_pagerops *uvmpagerops[] = {
&aobj_pager,
&uvm_deviceops,
&uvm_vnodeops,
};
/*
* the pager map: provides KVA for I/O
*
* Each uvm_pseg has room for MAX_PAGER_SEGS pager io space of
* MAXBSIZE bytes.
*
* The number of uvm_pseg instances is dynamic using an array segs.
* At most UVM_PSEG_COUNT instances can exist.
*
* psegs[0/1] always exist (so that the pager can always map in pages).
* psegs[0/1] slots 0 and 1 are always reserved for the pagedaemon.
*
* Any other pseg is automatically created when no space is available
* and automatically destroyed when it is no longer in use.
*/
#define MAX_PAGER_SEGS 16
#define PSEG_NUMSEGS (PAGER_MAP_SIZE / MAX_PAGER_SEGS / MAXBSIZE)
struct uvm_pseg {
/* Start of virtual space; 0 if not inited. */
vaddr_t start;
/* Bitmap of the segments in use in this pseg. */
int use;
};
struct mutex uvm_pseg_lck;
struct uvm_pseg psegs[PSEG_NUMSEGS];
#define UVM_PSEG_FULL(pseg) ((pseg)->use == (1 << MAX_PAGER_SEGS) - 1)
#define UVM_PSEG_EMPTY(pseg) ((pseg)->use == 0)
#define UVM_PSEG_INUSE(pseg,id) (((pseg)->use & (1 << (id))) != 0)
void uvm_pseg_init(struct uvm_pseg *);
vaddr_t uvm_pseg_get(int);
void uvm_pseg_release(vaddr_t);
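/*
 * Editorial note: the "use" word is a MAX_PAGER_SEGS-bit allocation
 * bitmap, one bit per MAXBSIZE-sized slot, and slot i lives at
 * start + i * MAXBSIZE.  For example, use == 0x0005 means slots 0 and 2
 * are busy, so the pseg is neither empty nor full.  Sketch of the slot
 * address math used by uvm_pseg_get()/uvm_pseg_release() (illustrative
 * only):
 */
#if 0
static vaddr_t
example_slot_va(const struct uvm_pseg *pseg, int id)
{
	KASSERT(UVM_PSEG_INUSE(pseg, id));
	return pseg->start + id * MAXBSIZE;
}
#endif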
/*
* uvm_pager_init: init pagers (at boot time)
*/
void
uvm_pager_init(void)
{
int lcv;
/* init pager map */
uvm_pseg_init(&psegs[0]);
uvm_pseg_init(&psegs[1]);
mtx_init(&uvm_pseg_lck, IPL_VM);
/* init ASYNC I/O queue */
TAILQ_INIT(&uvm.aio_done);
/* call pager init functions */
for (lcv = 0 ; lcv < sizeof(uvmpagerops)/sizeof(struct uvm_pagerops *);
lcv++) {
if (uvmpagerops[lcv]->pgo_init)
uvmpagerops[lcv]->pgo_init();
}
}
/*
* Initialize a uvm_pseg.
*
* May fail, in which case seg->start == 0.
*
* Caller locks uvm_pseg_lck.
*/
void
uvm_pseg_init(struct uvm_pseg *pseg)
{
KASSERT(pseg->start == 0);
KASSERT(pseg->use == 0);
pseg->start = (vaddr_t)km_alloc(MAX_PAGER_SEGS * MAXBSIZE,
&kv_any, &kp_none, &kd_trylock);
}
/*
* Acquire a pager map segment.
*
* Returns a vaddr for paging. 0 on failure.
*
* Caller does not lock.
*/
vaddr_t
uvm_pseg_get(int flags)
{
int i;
struct uvm_pseg *pseg;
/*
* XXX Prevent lock ordering issue in uvm_unmap_detach(). A real
* fix would be to move the KERNEL_LOCK() out of uvm_unmap_detach().
*
* witness_checkorder() at witness_checkorder+0xba0
* __mp_lock() at __mp_lock+0x5f
* uvm_unmap_detach() at uvm_unmap_detach+0xc5
* uvm_map() at uvm_map+0x857
* uvm_km_valloc_try() at uvm_km_valloc_try+0x65
* uvm_pseg_get() at uvm_pseg_get+0x6f
* uvm_pagermapin() at uvm_pagermapin+0x45
* uvn_io() at uvn_io+0xcf
* uvn_get() at uvn_get+0x156
* uvm_fault_lower() at uvm_fault_lower+0x28a
* uvm_fault() at uvm_fault+0x1b3
* upageflttrap() at upageflttrap+0x62
*/
KERNEL_LOCK();
mtx_enter(&uvm_pseg_lck);
pager_seg_restart:
/* Find first pseg that has room. */
for (pseg = &psegs[0]; pseg != &psegs[PSEG_NUMSEGS]; pseg++) {
if (UVM_PSEG_FULL(pseg))
continue;
if (pseg->start == 0) {
/* Need initialization. */
uvm_pseg_init(pseg);
if (pseg->start == 0)
goto pager_seg_fail;
}
/* Keep indexes 0,1 reserved for pagedaemon. */
if ((pseg == &psegs[0] || pseg == &psegs[1]) &&
(curproc != uvm.pagedaemon_proc))
i = 2;
else
i = 0;
for (; i < MAX_PAGER_SEGS; i++) {
if (!UVM_PSEG_INUSE(pseg, i)) {
pseg->use |= 1 << i;
mtx_leave(&uvm_pseg_lck);
KERNEL_UNLOCK();
return pseg->start + i * MAXBSIZE;
}
}
}
pager_seg_fail:
if ((flags & UVMPAGER_MAPIN_WAITOK) != 0) {
msleep_nsec(&psegs, &uvm_pseg_lck, PVM, "pagerseg", INFSLP);
goto pager_seg_restart;
}
mtx_leave(&uvm_pseg_lck);
KERNEL_UNLOCK();
return 0;
}
/*
* Release a pager map segment.
*
* Caller does not lock.
*
* Deallocates pseg if it is no longer in use.
*/
void
uvm_pseg_release(vaddr_t segaddr)
{
int id;
struct uvm_pseg *pseg;
vaddr_t va = 0;
mtx_enter(&uvm_pseg_lck);
for (pseg = &psegs[0]; pseg != &psegs[PSEG_NUMSEGS]; pseg++) {
if (pseg->start <= segaddr &&
segaddr < pseg->start + MAX_PAGER_SEGS * MAXBSIZE)
break;
}
KASSERT(pseg != &psegs[PSEG_NUMSEGS]);
id = (segaddr - pseg->start) / MAXBSIZE;
KASSERT(id >= 0 && id < MAX_PAGER_SEGS);
/* test for no remainder */
KDASSERT(segaddr == pseg->start + id * MAXBSIZE);
KASSERT(UVM_PSEG_INUSE(pseg, id));
pseg->use &= ~(1 << id);
wakeup(&psegs);
if ((pseg != &psegs[0] && pseg != &psegs[1]) && UVM_PSEG_EMPTY(pseg)) {
va = pseg->start;
pseg->start = 0;
}
mtx_leave(&uvm_pseg_lck);
if (va) {
km_free((void *)va, MAX_PAGER_SEGS * MAXBSIZE,
&kv_any, &kp_none);
}
}
/*
* uvm_pagermapin: map pages into KVA for I/O that needs mappings
*
* We basically just km_valloc a blank map entry to reserve the space in the
* kernel map and then use pmap_enter() to put the mappings in by hand.
*/
vaddr_t
uvm_pagermapin(struct vm_page **pps, int npages, int flags)
{
vaddr_t kva, cva;
vm_prot_t prot;
vsize_t size;
struct vm_page *pp;
#if defined(__HAVE_PMAP_DIRECT)
/*
* Use direct mappings for single page, unless there is a risk
* of aliasing.
*/
if (npages == 1 && PMAP_PREFER_ALIGN() == 0) {
KASSERT(pps[0]);
KASSERT(pps[0]->pg_flags & PG_BUSY);
return pmap_map_direct(pps[0]);
}
#endif
prot = PROT_READ;
if (flags & UVMPAGER_MAPIN_READ)
prot |= PROT_WRITE;
size = ptoa(npages);
KASSERT(size <= MAXBSIZE);
kva = uvm_pseg_get(flags);
if (kva == 0)
return 0;
for (cva = kva ; size != 0 ; size -= PAGE_SIZE, cva += PAGE_SIZE) {
pp = *pps++;
KASSERT(pp);
KASSERT(pp->pg_flags & PG_BUSY);
/* Allow pmap_enter to fail. */
if (pmap_enter(pmap_kernel(), cva, VM_PAGE_TO_PHYS(pp),
prot, PMAP_WIRED | PMAP_CANFAIL | prot) != 0) {
pmap_remove(pmap_kernel(), kva, cva);
pmap_update(pmap_kernel());
uvm_pseg_release(kva);
return 0;
}
}
pmap_update(pmap_kernel());
return kva;
}
/*
* uvm_pagermapout: remove KVA mapping
*
* We remove our mappings by hand and then release the pager segment.
*/
void
uvm_pagermapout(vaddr_t kva, int npages)
{
#if defined(__HAVE_PMAP_DIRECT)
/*
* Use direct mappings for single page, unless there is a risk
* of aliasing.
*/
if (npages == 1 && PMAP_PREFER_ALIGN() == 0) {
pmap_unmap_direct(kva);
return;
}
#endif
pmap_remove(pmap_kernel(), kva, kva + ((vsize_t)npages << PAGE_SHIFT));
pmap_update(pmap_kernel());
uvm_pseg_release(kva);
}
/*
* uvm_mk_pcluster
*
* generic "make 'pager put' cluster" function. a pager can either
* [1] set pgo_mk_pcluster to NULL (never cluster), [2] set it to this
* generic function, or [3] set it to a pager specific function.
*
* => caller must lock object _and_ pagequeues (since we need to look
* at active vs. inactive bits, etc.)
* => caller must make center page busy and write-protect it
* => we mark all cluster pages busy for the caller
* => the caller must unbusy all pages (and check wanted/released
* status if it drops the object lock)
* => flags:
* PGO_ALLPAGES: all pages in object are valid targets
* !PGO_ALLPAGES: use "lo" and "hi" to limit range of cluster
* PGO_DOACTCLUST: include active pages in cluster.
* PGO_FREE: set the PG_RELEASED bits on the cluster so they'll be freed
* in async io (caller must clean on error).
* NOTE: the caller should clear PG_CLEANCHK bits if PGO_DOACTCLUST.
* PG_CLEANCHK is only a hint, but clearing will help reduce
* the number of calls we make to the pmap layer.
*/
struct vm_page **
uvm_mk_pcluster(struct uvm_object *uobj, struct vm_page **pps, int *npages,
struct vm_page *center, int flags, voff_t mlo, voff_t mhi)
{
struct vm_page **ppsp, *pclust;
voff_t lo, hi, curoff;
int center_idx, forward, incr;
/*
* center page should already be busy and write protected. XXX:
* suppose page is wired? if we lock, then a process could
* fault/block on it. if we don't lock, a process could write the
* pages in the middle of an I/O. (consider an msync()). let's
* lock it for now (better to delay than corrupt data?).
*/
/* get cluster boundaries, check sanity, and apply our limits as well.*/
uobj->pgops->pgo_cluster(uobj, center->offset, &lo, &hi);
if ((flags & PGO_ALLPAGES) == 0) {
if (lo < mlo)
lo = mlo;
if (hi > mhi)
hi = mhi;
}
if ((hi - lo) >> PAGE_SHIFT > *npages) { /* pps too small, bail out! */
pps[0] = center;
*npages = 1;
return pps;
}
/* now determine the center and attempt to cluster around the edges */
center_idx = (center->offset - lo) >> PAGE_SHIFT;
pps[center_idx] = center; /* plug in the center page */
ppsp = &pps[center_idx];
*npages = 1;
/*
* attempt to cluster around the left [backward], and then
* the right side [forward].
*
* note that for inactive pages (pages that have been deactivated)
* there are no valid mappings and PG_CLEAN should be up to date.
* [i.e. there is no need to query the pmap with pmap_is_modified
* since there are no mappings].
*/
for (forward = 0 ; forward <= 1 ; forward++) {
incr = forward ? PAGE_SIZE : -PAGE_SIZE;
curoff = center->offset + incr;
for ( ;(forward == 0 && curoff >= lo) ||
(forward && curoff < hi);
curoff += incr) {
pclust = uvm_pagelookup(uobj, curoff); /* lookup page */
if (pclust == NULL) {
break; /* no page */
}
/* handle active pages */
/* NOTE: inactive pages don't have pmap mappings */
if ((pclust->pg_flags & PQ_INACTIVE) == 0) {
if ((flags & PGO_DOACTCLUST) == 0) {
/* don't want mapped pages at all */
break;
}
/* make sure "clean" bit is sync'd */
if ((pclust->pg_flags & PG_CLEANCHK) == 0) {
if ((pclust->pg_flags & (PG_CLEAN|PG_BUSY)) == PG_CLEAN &&
pmap_is_modified(pclust))
atomic_clearbits_int(
&pclust->pg_flags,
PG_CLEAN);
/* now checked */
atomic_setbits_int(&pclust->pg_flags,
PG_CLEANCHK);
}
}
/* is page available for cleaning and does it need it */
if ((pclust->pg_flags & (PG_CLEAN|PG_BUSY)) != 0) {
break; /* page is already clean or is busy */
}
/* yes! enroll the page in our array */
atomic_setbits_int(&pclust->pg_flags, PG_BUSY);
UVM_PAGE_OWN(pclust, "uvm_mk_pcluster");
/*
* If we want to free after io is done, and we're
* async, set the released flag
*/
if ((flags & (PGO_FREE|PGO_SYNCIO)) == PGO_FREE)
atomic_setbits_int(&pclust->pg_flags,
PG_RELEASED);
/* XXX: protect wired page? see above comment. */
pmap_page_protect(pclust, PROT_READ);
if (!forward) {
ppsp--; /* back up one page */
*ppsp = pclust;
} else {
/* move forward one page */
ppsp[*npages] = pclust;
}
(*npages)++;
}
}
/*
* done! return the cluster array to the caller!!!
*/
return ppsp;
}
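/*
 * Illustrative sketch (not kernel code): how the cluster array is built
 * around the center page in uvm_mk_pcluster() above.  Pages are stood in
 * for by integer offsets; all names and sizes here are invented for the
 * example.
 */
#include <stdio.h>
#define EX_PGSZ 4096
int
main(void)
{
	int pps[8] = { 0 };		/* caller-provided slot array */
	int lo = 0, center_off = 3 * EX_PGSZ;
	int center_idx = (center_off - lo) / EX_PGSZ;
	int *ppsp = &pps[center_idx];	/* window starts at the center page */
	int npages = 1;
	pps[center_idx] = center_off;
	/* backward neighbour found: back the window up one slot */
	ppsp--;
	*ppsp = center_off - EX_PGSZ;
	npages++;
	/* forward neighbour found: append at the end of the window */
	ppsp[npages] = center_off + EX_PGSZ;
	npages++;
	for (int i = 0; i < npages; i++)
		printf("cluster[%d] = offset %d\n", i, ppsp[i]);
	return 0;
}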
/*
* uvm_pager_put: high level pageout routine
*
* we want to pageout page "pg" to backing store, clustering if
* possible.
*
* => page queues must be locked by caller
* => if page is not swap-backed, then "uobj" points to the object
* backing it.
* => if page is swap-backed, then "uobj" should be NULL.
* => "pg" should be PG_BUSY (by caller), and !PG_CLEAN
* for swap-backed memory, "pg" can be NULL if there is no page
* of interest [sometimes the case for the pagedaemon]
* => "ppsp_ptr" should point to an array of npages vm_page pointers
* for possible cluster building
* => flags (first two for non-swap-backed pages)
* PGO_ALLPAGES: all pages in uobj are valid targets
* PGO_DOACTCLUST: include "PQ_ACTIVE" pages as valid targets
* PGO_SYNCIO: do SYNC I/O (no async)
* PGO_PDFREECLUST: pagedaemon: drop cluster on successful I/O
* PGO_FREE: tell the aio daemon to free pages in the async case.
* => start/stop: if (uobj && !PGO_ALLPAGES) limit targets to this range
* if (!uobj) start is the (daddr_t) of the starting swapblk
* => return state:
* 1. we return the VM_PAGER status code of the pageout
* 2. we return with the page queues unlocked
* 3. on errors we always drop the cluster. thus, if we return
* !PEND, !OK, then the caller only has to worry about
* un-busying the main page (not the cluster pages).
* 4. on success, if !PGO_PDFREECLUST, we return the cluster
* with all pages busy (caller must un-busy and check
* wanted/released flags).
*/
int
uvm_pager_put(struct uvm_object *uobj, struct vm_page *pg,
struct vm_page ***ppsp_ptr, int *npages, int flags,
voff_t start, voff_t stop)
{
int result;
daddr_t swblk;
struct vm_page **ppsp = *ppsp_ptr;
/*
* note that uobj is null if we are doing a swap-backed pageout.
* note that uobj is !null if we are doing normal object pageout.
* note that the page queues must be locked to cluster.
*/
if (uobj) { /* if !swap-backed */
/*
* attempt to build a cluster for pageout using its
* make-put-cluster function (if it has one).
*/
if (uobj->pgops->pgo_mk_pcluster) {
ppsp = uobj->pgops->pgo_mk_pcluster(uobj, ppsp,
npages, pg, flags, start, stop);
*ppsp_ptr = ppsp; /* update caller's pointer */
} else {
ppsp[0] = pg;
*npages = 1;
}
swblk = 0; /* XXX: keep gcc happy */
} else {
/*
* for swap-backed pageout, the caller (the pagedaemon) has
* already built the cluster for us. the starting swap
* block we are writing to has been passed in as "start."
* "pg" could be NULL if there is no page we are especially
* interested in (in which case the whole cluster gets dropped
* in the event of an error or a sync "done").
*/
swblk = start;
/* ppsp and npages should be ok */
}
/* now that we've clustered we can unlock the page queues */
uvm_unlock_pageq();
/*
* now attempt the I/O. if we have a failure and we are
* clustered, we will drop the cluster and try again.
*/
ReTry:
if (uobj) {
result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
} else {
/* XXX daddr_t -> int */
result = uvm_swap_put(swblk, ppsp, *npages, flags);
}
/*
* we have attempted the I/O.
*
* if the I/O was a success then:
* if !PGO_PDFREECLUST, we return the cluster to the
* caller (who must un-busy all pages)
* else we un-busy cluster pages for the pagedaemon
*
* if I/O is pending (async i/o) then we return the pending code.
* [in this case the async i/o done function must clean up when
* i/o is done...]
*/
if (result == VM_PAGER_PEND || result == VM_PAGER_OK) {
if (result == VM_PAGER_OK && (flags & PGO_PDFREECLUST)) {
/* drop cluster */
if (*npages > 1 || pg == NULL)
uvm_pager_dropcluster(uobj, pg, ppsp, npages,
PGO_PDFREECLUST);
}
return (result);
}
/*
* a pager error occurred (even after dropping the cluster, if there
* was one). give up! the caller only has one page ("pg")
* to worry about.
*/
if (*npages > 1 || pg == NULL) {
uvm_pager_dropcluster(uobj, pg, ppsp, npages, PGO_REALLOCSWAP);
/*
* for failed swap-backed pageouts with a "pg",
* we need to reset pg's swslot to either:
* "swblk" (for transient errors, so we can retry),
* or 0 (for hard errors).
*/
if (uobj == NULL && pg != NULL) {
/* XXX daddr_t -> int */
int nswblk = (result == VM_PAGER_AGAIN) ? swblk : 0;
if (pg->pg_flags & PQ_ANON) {
rw_enter(pg->uanon->an_lock, RW_WRITE);
pg->uanon->an_swslot = nswblk;
rw_exit(pg->uanon->an_lock);
} else {
rw_enter(pg->uobject->vmobjlock, RW_WRITE);
uao_set_swslot(pg->uobject,
pg->offset >> PAGE_SHIFT,
nswblk);
rw_exit(pg->uobject->vmobjlock);
}
}
if (result == VM_PAGER_AGAIN) {
/*
* for transient failures, free all the swslots that
* we're not going to retry with.
*/
if (uobj == NULL) {
if (pg) {
/* XXX daddr_t -> int */
uvm_swap_free(swblk + 1, *npages - 1);
} else {
/* XXX daddr_t -> int */
uvm_swap_free(swblk, *npages);
}
}
if (pg) {
ppsp[0] = pg;
*npages = 1;
goto ReTry;
}
} else if (uobj == NULL) {
/*
* for hard errors on swap-backed pageouts,
* mark the swslots as bad. note that we do not
* free swslots that we mark bad.
*/
/* XXX daddr_t -> int */
uvm_swap_markbad(swblk, *npages);
}
}
/*
* a pager error occurred (even after dropping the cluster, if there
* was one). give up! the caller only has one page ("pg")
* to worry about.
*/
return result;
}
/*
* uvm_pager_dropcluster: drop a cluster we have built (because we
* got an error, or, if PGO_PDFREECLUST we are un-busying the
* cluster pages on behalf of the pagedaemon).
*
* => uobj, if non-null, is a non-swap-backed object
* => page queues are not locked
* => pg is our page of interest (the one we clustered around, can be null)
* => ppsp/npages is our current cluster
* => flags: PGO_PDFREECLUST: pageout was a success: un-busy cluster
* pages on behalf of the pagedaemon.
* PGO_REALLOCSWAP: drop previously allocated swap slots for
* clustered swap-backed pages (except for "pg" if !NULL)
* "swblk" is the start of swap alloc (e.g. for ppsp[0])
* [only meaningful if swap-backed (uobj == NULL)]
*/
void
uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg,
struct vm_page **ppsp, int *npages, int flags)
{
int lcv;
KASSERT(uobj == NULL || rw_write_held(uobj->vmobjlock));
/* drop all pages but "pg" */
for (lcv = 0 ; lcv < *npages ; lcv++) {
/* skip "pg" or empty slot */
if (ppsp[lcv] == pg || ppsp[lcv] == NULL)
continue;
/*
* Note that PQ_ANON bit can't change as long as we are holding
* the PG_BUSY bit (so there is no need to lock the page
* queues to test it).
*/
if (!uobj) {
if (ppsp[lcv]->pg_flags & PQ_ANON) {
rw_enter(ppsp[lcv]->uanon->an_lock, RW_WRITE);
if (flags & PGO_REALLOCSWAP)
/* zap swap block */
ppsp[lcv]->uanon->an_swslot = 0;
} else {
rw_enter(ppsp[lcv]->uobject->vmobjlock,
RW_WRITE);
if (flags & PGO_REALLOCSWAP)
uao_set_swslot(ppsp[lcv]->uobject,
ppsp[lcv]->offset >> PAGE_SHIFT, 0);
}
}
/* did someone want the page while we had it busy-locked? */
if (ppsp[lcv]->pg_flags & PG_WANTED) {
wakeup(ppsp[lcv]);
}
/* if page was released, release it. otherwise un-busy it */
if (ppsp[lcv]->pg_flags & PG_RELEASED &&
ppsp[lcv]->pg_flags & PQ_ANON) {
/* kills anon and frees pg */
uvm_anon_release(ppsp[lcv]->uanon);
continue;
} else {
/*
* if we were planning on async io then we would
* have PG_RELEASED set, clear that with the others.
*/
atomic_clearbits_int(&ppsp[lcv]->pg_flags,
PG_BUSY|PG_WANTED|PG_FAKE|PG_RELEASED);
UVM_PAGE_OWN(ppsp[lcv], NULL);
}
/*
* if we are operating on behalf of the pagedaemon and we
* had a successful pageout update the page!
*/
if (flags & PGO_PDFREECLUST) {
pmap_clear_reference(ppsp[lcv]);
pmap_clear_modify(ppsp[lcv]);
atomic_setbits_int(&ppsp[lcv]->pg_flags, PG_CLEAN);
}
/* if anonymous cluster, unlock object and move on */
if (!uobj) {
if (ppsp[lcv]->pg_flags & PQ_ANON)
rw_exit(ppsp[lcv]->uanon->an_lock);
else
rw_exit(ppsp[lcv]->uobject->vmobjlock);
}
}
}
/*
* interrupt-context iodone handler for single-buf i/os
* or the top-level buf of a nested-buf i/o.
*
* => must be at splbio().
*/
void
uvm_aio_biodone(struct buf *bp)
{
splassert(IPL_BIO);
/* reset b_iodone for when this is a single-buf i/o. */
bp->b_iodone = uvm_aio_aiodone;
mtx_enter(&uvm.aiodoned_lock);
TAILQ_INSERT_TAIL(&uvm.aio_done, bp, b_freelist);
wakeup(&uvm.aiodoned);
mtx_leave(&uvm.aiodoned_lock);
}
void
uvm_aio_aiodone_pages(struct vm_page **pgs, int npages, boolean_t write,
int error)
{
struct vm_page *pg;
struct uvm_object *uobj;
boolean_t swap;
int i;
uobj = NULL;
for (i = 0; i < npages; i++) {
pg = pgs[i];
if (i == 0) {
swap = (pg->pg_flags & PQ_SWAPBACKED) != 0;
if (!swap) {
uobj = pg->uobject;
rw_enter(uobj->vmobjlock, RW_WRITE);
}
}
KASSERT(swap || pg->uobject == uobj);
/*
* if this is a read and we got an error, mark the pages
* PG_RELEASED so that uvm_page_unbusy() will free them.
*/
if (!write && error) {
atomic_setbits_int(&pg->pg_flags, PG_RELEASED);
continue;
}
KASSERT(!write || (pgs[i]->pg_flags & PG_FAKE) == 0);
/*
* if this is a read and the page is PG_FAKE,
* or this was a successful write,
* mark the page PG_CLEAN and not PG_FAKE.
*/
if ((pgs[i]->pg_flags & PG_FAKE) || (write && error != ENOMEM)) {
pmap_clear_reference(pgs[i]);
pmap_clear_modify(pgs[i]);
atomic_setbits_int(&pgs[i]->pg_flags, PG_CLEAN);
atomic_clearbits_int(&pgs[i]->pg_flags, PG_FAKE);
}
}
uvm_page_unbusy(pgs, npages);
if (!swap) {
rw_exit(uobj->vmobjlock);
}
}
/*
* uvm_aio_aiodone: do iodone processing for async i/os.
* this should be called in thread context, not interrupt context.
*/
void
uvm_aio_aiodone(struct buf *bp)
{
int npages = bp->b_bufsize >> PAGE_SHIFT;
struct vm_page *pgs[MAXPHYS >> PAGE_SHIFT];
int i, error;
boolean_t write;
KASSERT(npages <= MAXPHYS >> PAGE_SHIFT);
splassert(IPL_BIO);
error = (bp->b_flags & B_ERROR) ? (bp->b_error ? bp->b_error : EIO) : 0;
write = (bp->b_flags & B_READ) == 0;
for (i = 0; i < npages; i++)
pgs[i] = uvm_atopg((vaddr_t)bp->b_data +
((vsize_t)i << PAGE_SHIFT));
uvm_pagermapout((vaddr_t)bp->b_data, npages);
#ifdef UVM_SWAP_ENCRYPT
/*
* XXX - assumes that we only get ASYNC writes. used to be above.
*/
if (pgs[0]->pg_flags & PQ_ENCRYPT) {
uvm_swap_freepages(pgs, npages);
goto freed;
}
#endif /* UVM_SWAP_ENCRYPT */
uvm_aio_aiodone_pages(pgs, npages, write, error);
#ifdef UVM_SWAP_ENCRYPT
freed:
#endif
pool_put(&bufpool, bp);
}
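/*
 * Illustrative sketch (not kernel code): recovering per-page addresses
 * from a buffer's data pointer and size, as the loop in uvm_aio_aiodone()
 * above does.  The 4 KB page size and the address are assumptions for
 * the example.
 */
#include <stdio.h>
#include <stdint.h>
#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1 << EX_PAGE_SHIFT)
int
main(void)
{
	uintptr_t data = 0x20000000UL;		/* assumed mapped address */
	size_t bufsize = 4 * EX_PAGE_SIZE;	/* b_bufsize of the I/O */
	int npages = bufsize >> EX_PAGE_SHIFT;
	for (int i = 0; i < npages; i++)
		printf("page %d mapped at %#lx\n", i,
		    (unsigned long)(data + ((uintptr_t)i << EX_PAGE_SHIFT)));
	return 0;
}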
/* $OpenBSD: spec_vnops.c,v 1.109 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: spec_vnops.c,v 1.29 1996/04/22 01:42:38 christos Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)spec_vnops.c 8.8 (Berkeley) 11/21/94
*/
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
#include <sys/disklabel.h>
#include <sys/lockf.h>
#include <sys/dkio.h>
#include <sys/malloc.h>
#include <sys/specdev.h>
#include <sys/unistd.h>
#define v_lastr v_specinfo->si_lastr
int spec_open_clone(struct vop_open_args *);
struct vnodechain speclisth[SPECHSZ];
const struct vops spec_vops = {
.vop_lookup = vop_generic_lookup,
.vop_create = vop_generic_badop,
.vop_mknod = vop_generic_badop,
.vop_open = spec_open,
.vop_close = spec_close,
.vop_access = spec_access,
.vop_getattr = spec_getattr,
.vop_setattr = spec_setattr,
.vop_read = spec_read,
.vop_write = spec_write,
.vop_ioctl = spec_ioctl,
.vop_kqfilter = spec_kqfilter,
.vop_revoke = vop_generic_revoke,
.vop_fsync = spec_fsync,
.vop_remove = vop_generic_badop,
.vop_link = vop_generic_badop,
.vop_rename = vop_generic_badop,
.vop_mkdir = vop_generic_badop,
.vop_rmdir = vop_generic_badop,
.vop_symlink = vop_generic_badop,
.vop_readdir = vop_generic_badop,
.vop_readlink = vop_generic_badop,
.vop_abortop = vop_generic_badop,
.vop_inactive = spec_inactive,
.vop_reclaim = nullop,
.vop_lock = nullop,
.vop_unlock = nullop,
.vop_islocked = nullop,
.vop_bmap = vop_generic_bmap,
.vop_strategy = spec_strategy,
.vop_print = spec_print,
.vop_pathconf = spec_pathconf,
.vop_advlock = spec_advlock,
.vop_bwrite = vop_generic_bwrite,
};
/*
* Open a special file.
*/
int
spec_open(void *v)
{
struct vop_open_args *ap = v;
struct proc *p = ap->a_p;
struct vnode *vp = ap->a_vp;
struct vnode *bvp;
dev_t bdev;
dev_t dev = (dev_t)vp->v_rdev;
int maj = major(dev);
int error;
/*
* Don't allow open if fs is mounted -nodev.
*/
if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
return (ENXIO);
switch (vp->v_type) {
case VCHR:
if ((u_int)maj >= nchrdev)
return (ENXIO);
if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) {
/*
* When running in very secure mode, do not allow
* opens for writing of any disk character devices.
*/
if (securelevel >= 2 && cdevsw[maj].d_type == D_DISK)
return (EPERM);
/*
* When running in secure mode, do not allow opens
* for writing of /dev/mem, /dev/kmem, or character
* devices whose corresponding block devices are
* currently mounted.
*/
if (securelevel >= 1) {
if ((bdev = chrtoblk(dev)) != NODEV &&
vfinddev(bdev, VBLK, &bvp) && bvp->v_usecount > 0 &&
(error = vfs_mountedon(bvp)))
return (error);
if (iskmemdev(dev))
return (EPERM);
}
}
if (cdevsw[maj].d_type == D_TTY)
vp->v_flag |= VISTTY;
if (cdevsw[maj].d_flags & D_CLONE)
return (spec_open_clone(ap));
VOP_UNLOCK(vp);
error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return (error);
case VBLK:
if ((u_int)maj >= nblkdev)
return (ENXIO);
/*
* When running in very secure mode, do not allow
* opens for writing of any disk block devices.
*/
if (securelevel >= 2 && ap->a_cred != FSCRED &&
(ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
return (EPERM);
/*
* Do not allow opens of block devices that are
* currently mounted.
*/
if ((error = vfs_mountedon(vp)) != 0)
return (error);
return ((*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p));
case VNON:
case VLNK:
case VDIR:
case VREG:
case VBAD:
case VFIFO:
case VSOCK:
break;
}
return (0);
}
/*
* Vnode op for read
*/
int
spec_read(void *v)
{
struct vop_read_args *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
struct buf *bp;
daddr_t bn, nextbn, bscale;
int bsize;
struct partinfo dpart;
size_t n;
int on, majordev;
int (*ioctl)(dev_t, u_long, caddr_t, int, struct proc *);
int error = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
panic("spec_read mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("spec_read proc");
#endif
if (uio->uio_resid == 0)
return (0);
switch (vp->v_type) {
case VCHR:
VOP_UNLOCK(vp);
error = (*cdevsw[major(vp->v_rdev)].d_read)
(vp->v_rdev, uio, ap->a_ioflag);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return (error);
case VBLK:
if (uio->uio_offset < 0)
return (EINVAL);
bsize = BLKDEV_IOSIZE;
if ((majordev = major(vp->v_rdev)) < nblkdev &&
(ioctl = bdevsw[majordev].d_ioctl) != NULL &&
(*ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) {
u_int32_t frag =
DISKLABELV1_FFS_FRAG(dpart.part->p_fragblock);
u_int32_t fsize =
DISKLABELV1_FFS_FSIZE(dpart.part->p_fragblock);
if (dpart.part->p_fstype == FS_BSDFFS && frag != 0 &&
fsize != 0)
bsize = frag * fsize;
}
bscale = btodb(bsize);
do {
bn = btodb(uio->uio_offset) & ~(bscale - 1);
on = uio->uio_offset % bsize;
n = ulmin((bsize - on), uio->uio_resid);
if (vp->v_lastr + bscale == bn) {
nextbn = bn + bscale;
error = breadn(vp, bn, bsize, &nextbn, &bsize,
1, &bp);
} else
error = bread(vp, bn, bsize, &bp);
vp->v_lastr = bn;
n = ulmin(n, bsize - bp->b_resid);
if (error) {
brelse(bp);
return (error);
}
error = uiomove((char *)bp->b_data + on, n, uio);
brelse(bp);
} while (error == 0 && uio->uio_resid > 0 && n != 0);
return (error);
default:
panic("spec_read type");
}
/* NOTREACHED */
}
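/*
 * Illustrative sketch (not kernel code): the block/offset arithmetic used
 * in spec_read()/spec_write() above for raw block-device transfers.
 * The 512-byte sector size and 16 KB block size are assumptions chosen
 * for the example.
 */
#include <stdio.h>
#include <stdint.h>
#define EX_DEV_BSIZE	512
#define ex_btodb(x)	((x) / EX_DEV_BSIZE)
int
main(void)
{
	int64_t offset = 70000;		/* byte offset requested by the caller */
	size_t resid = 30000;		/* bytes still wanted */
	int bsize = 16384;		/* per-transfer block size */
	int64_t bscale = ex_btodb(bsize);
	/* sector number of the containing block, rounded down */
	int64_t bn = ex_btodb(offset) & ~(bscale - 1);
	/* byte offset of the request within that block */
	int on = offset % bsize;
	/* bytes we can copy out of this block */
	size_t n = (size_t)(bsize - on) < resid ? (size_t)(bsize - on) : resid;
	printf("bn=%lld on=%d n=%zu\n", (long long)bn, on, n);
	/* expected: bn=128 (byte 65536), on=4464, n=11920 */
	return 0;
}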
int
spec_inactive(void *v)
{
struct vop_inactive_args *ap = v;
VOP_UNLOCK(ap->a_vp);
return (0);
}
/*
* Vnode op for write
*/
int
spec_write(void *v)
{
struct vop_write_args *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
struct buf *bp;
daddr_t bn, bscale;
int bsize;
struct partinfo dpart;
size_t n;
int on, majordev;
int (*ioctl)(dev_t, u_long, caddr_t, int, struct proc *);
int error = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_WRITE)
panic("spec_write mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("spec_write proc");
#endif
switch (vp->v_type) {
case VCHR:
VOP_UNLOCK(vp);
error = (*cdevsw[major(vp->v_rdev)].d_write)
(vp->v_rdev, uio, ap->a_ioflag);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return (error);
case VBLK:
if (uio->uio_resid == 0)
return (0);
if (uio->uio_offset < 0)
return (EINVAL);
bsize = BLKDEV_IOSIZE;
if ((majordev = major(vp->v_rdev)) < nblkdev &&
(ioctl = bdevsw[majordev].d_ioctl) != NULL &&
(*ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) {
u_int32_t frag =
DISKLABELV1_FFS_FRAG(dpart.part->p_fragblock);
u_int32_t fsize =
DISKLABELV1_FFS_FSIZE(dpart.part->p_fragblock);
if (dpart.part->p_fstype == FS_BSDFFS && frag != 0 &&
fsize != 0)
bsize = frag * fsize;
}
bscale = btodb(bsize);
do {
bn = btodb(uio->uio_offset) & ~(bscale - 1);
on = uio->uio_offset % bsize;
n = ulmin((bsize - on), uio->uio_resid);
error = bread(vp, bn, bsize, &bp);
n = ulmin(n, bsize - bp->b_resid);
if (error) {
brelse(bp);
return (error);
}
error = uiomove((char *)bp->b_data + on, n, uio);
if (n + on == bsize)
bawrite(bp);
else
bdwrite(bp);
} while (error == 0 && uio->uio_resid > 0 && n != 0);
return (error);
default:
panic("spec_write type");
}
/* NOTREACHED */
}
/*
* Device ioctl operation.
*/
int
spec_ioctl(void *v)
{
struct vop_ioctl_args *ap = v;
dev_t dev = ap->a_vp->v_rdev;
int maj = major(dev);
switch (ap->a_vp->v_type) {
case VCHR:
return ((*cdevsw[maj].d_ioctl)(dev, ap->a_command, ap->a_data,
ap->a_fflag, ap->a_p));
case VBLK:
return ((*bdevsw[maj].d_ioctl)(dev, ap->a_command, ap->a_data,
ap->a_fflag, ap->a_p));
default:
panic("spec_ioctl");
/* NOTREACHED */
}
}
int
spec_kqfilter(void *v)
{
struct vop_kqfilter_args *ap = v;
dev_t dev;
dev = ap->a_vp->v_rdev;
switch (ap->a_vp->v_type) {
default:
if (ap->a_kn->kn_flags & (__EV_POLL | __EV_SELECT))
return seltrue_kqfilter(dev, ap->a_kn);
break;
case VCHR:
if (cdevsw[major(dev)].d_kqfilter)
return (*cdevsw[major(dev)].d_kqfilter)(dev, ap->a_kn);
}
return (EOPNOTSUPP);
}
/*
* Synch buffers associated with a block device
*/
int
spec_fsync(void *v)
{
struct vop_fsync_args *ap = v;
struct vnode *vp = ap->a_vp;
struct buf *bp;
struct buf *nbp;
int s;
if (vp->v_type == VCHR)
return (0);
/*
* Flush all dirty buffers associated with a block device.
*/
loop:
s = splbio();
LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) {
if ((bp->b_flags & B_BUSY))
continue;
if ((bp->b_flags & B_DELWRI) == 0)
panic("spec_fsync: not dirty");
bremfree(bp);
buf_acquire(bp);
splx(s);
bawrite(bp);
goto loop;
}
if (ap->a_waitfor == MNT_WAIT) {
vwaitforio (vp, 0, "spec_fsync", INFSLP);
#ifdef DIAGNOSTIC
if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
splx(s);
vprint("spec_fsync: dirty", vp);
goto loop;
}
#endif
}
splx(s);
return (0);
}
int
spec_strategy(void *v)
{
struct vop_strategy_args *ap = v;
struct buf *bp = ap->a_bp;
int maj = major(bp->b_dev);
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_start(bp);
(*bdevsw[maj].d_strategy)(bp);
return (0);
}
/*
* Device close routine
*/
int
spec_close(void *v)
{
struct vop_close_args *ap = v;
struct proc *p = ap->a_p;
struct vnode *vp = ap->a_vp;
dev_t dev = vp->v_rdev;
int (*devclose)(dev_t, int, int, struct proc *);
int mode, relock, xlocked, error;
int clone = 0;
switch (vp->v_type) {
case VCHR:
/*
* Hack: a tty device that is a controlling terminal
* has a reference from the session structure.
* We cannot easily tell that a character device is
* a controlling terminal, unless it is the closing
* process' controlling terminal. In that case,
* if the reference count is 2 (this last descriptor
* plus the session), release the reference from the session.
*/
if (vcount(vp) == 2 && p != NULL && p->p_p->ps_pgrp &&
vp == p->p_p->ps_pgrp->pg_session->s_ttyvp) {
vrele(vp);
p->p_p->ps_pgrp->pg_session->s_ttyvp = NULL;
}
if (cdevsw[major(dev)].d_flags & D_CLONE) {
clone = 1;
} else {
/*
* If the vnode is locked, then we are in the midst
* of forcibly closing the device, otherwise we only
* close on last reference.
*/
mtx_enter(&vnode_mtx);
xlocked = (vp->v_lflag & VXLOCK);
mtx_leave(&vnode_mtx);
if (vcount(vp) > 1 && !xlocked)
return (0);
}
devclose = cdevsw[major(dev)].d_close;
mode = S_IFCHR;
break;
case VBLK:
/*
* On last close of a block device (that isn't mounted)
* we must invalidate any in core blocks, so that
* we can, for instance, change floppy disks. In order to do
* that, we must lock the vnode. If we are coming from
* vclean(), the vnode is already locked.
*/
mtx_enter(&vnode_mtx);
xlocked = (vp->v_lflag & VXLOCK);
mtx_leave(&vnode_mtx);
if (!xlocked)
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = vinvalbuf(vp, V_SAVE, ap->a_cred, p, 0, INFSLP);
if (!xlocked)
VOP_UNLOCK(vp);
if (error)
return (error);
/*
* We do not want to really close the device if it
* is still in use unless we are trying to close it
* forcibly. Since every use (buffer, vnode, swap, cmap)
* holds a reference to the vnode, and because we mark
* any other vnodes that alias this device, when the
* sum of the reference counts on all the aliased
* vnodes descends to one, we are on last close.
*/
mtx_enter(&vnode_mtx);
xlocked = (vp->v_lflag & VXLOCK);
mtx_leave(&vnode_mtx);
if (vcount(vp) > 1 && !xlocked)
return (0);
devclose = bdevsw[major(dev)].d_close;
mode = S_IFBLK;
break;
default:
panic("spec_close: not special");
}
/* release lock if held and this isn't coming from vclean() */
mtx_enter(&vnode_mtx);
xlocked = (vp->v_lflag & VXLOCK);
mtx_leave(&vnode_mtx);
relock = VOP_ISLOCKED(vp) && !xlocked;
if (relock)
VOP_UNLOCK(vp);
error = (*devclose)(dev, ap->a_fflag, mode, p);
if (relock)
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (error == 0 && clone) {
struct vnode *pvp;
pvp = vp->v_specparent; /* get parent device */
clrbit(pvp->v_specbitmap, minor(dev) >> CLONE_SHIFT);
vrele(pvp);
}
return (error);
}
int
spec_getattr(void *v)
{
struct vop_getattr_args *ap = v;
struct vnode *vp = ap->a_vp;
int error;
if (!(vp->v_flag & VCLONE))
return (EBADF);
vn_lock(vp->v_specparent, LK_EXCLUSIVE|LK_RETRY);
error = VOP_GETATTR(vp->v_specparent, ap->a_vap, ap->a_cred, ap->a_p);
VOP_UNLOCK(vp->v_specparent);
return (error);
}
int
spec_setattr(void *v)
{
struct vop_setattr_args *ap = v;
struct proc *p = ap->a_p;
struct vnode *vp = ap->a_vp;
int error;
if (!(vp->v_flag & VCLONE))
return (EBADF);
vn_lock(vp->v_specparent, LK_EXCLUSIVE|LK_RETRY);
error = VOP_SETATTR(vp->v_specparent, ap->a_vap, ap->a_cred, p);
VOP_UNLOCK(vp->v_specparent);
return (error);
}
int
spec_access(void *v)
{
struct vop_access_args *ap = v;
struct vnode *vp = ap->a_vp;
int error;
if (!(vp->v_flag & VCLONE))
return (EBADF);
vn_lock(vp->v_specparent, LK_EXCLUSIVE|LK_RETRY);
error = VOP_ACCESS(vp->v_specparent, ap->a_mode, ap->a_cred, ap->a_p);
VOP_UNLOCK(vp->v_specparent);
return (error);
}
/*
* Print out the contents of a special device vnode.
*/
int
spec_print(void *v)
{
struct vop_print_args *ap = v;
printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev),
minor(ap->a_vp->v_rdev));
return 0;
}
/*
* Return POSIX pathconf information applicable to special devices.
*/
int
spec_pathconf(void *v)
{
struct vop_pathconf_args *ap = v;
int error = 0;
switch (ap->a_name) {
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
break;
case _PC_MAX_CANON:
*ap->a_retval = MAX_CANON;
break;
case _PC_MAX_INPUT:
*ap->a_retval = MAX_INPUT;
break;
case _PC_CHOWN_RESTRICTED:
*ap->a_retval = 1;
break;
case _PC_VDISABLE:
*ap->a_retval = _POSIX_VDISABLE;
break;
case _PC_TIMESTAMP_RESOLUTION:
*ap->a_retval = 1;
break;
default:
error = EINVAL;
break;
}
return (error);
}
/*
* Special device advisory byte-level locks.
*/
int
spec_advlock(void *v)
{
struct vop_advlock_args *ap = v;
struct vnode *vp = ap->a_vp;
return (lf_advlock(&vp->v_speclockf, (off_t)0, ap->a_id,
ap->a_op, ap->a_fl, ap->a_flags));
}
/*
* Copyright (c) 2006 Pedro Martelletto <pedro@ambientworks.net>
* Copyright (c) 2006 Thordur Bjornsson <thib@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef CLONE_DEBUG
#define DNPRINTF(m...) do { printf(m); } while (0)
#else
#define DNPRINTF(m...) /* nothing */
#endif
int
spec_open_clone(struct vop_open_args *ap)
{
struct vnode *cvp, *vp = ap->a_vp;
struct cloneinfo *cip;
int error, i;
DNPRINTF("cloning vnode\n");
if (minor(vp->v_rdev) >= (1 << CLONE_SHIFT))
return (ENXIO);
for (i = 1; i < CLONE_MAPSZ * NBBY; i++)
if (isclr(vp->v_specbitmap, i)) {
setbit(vp->v_specbitmap, i);
break;
}
if (i == CLONE_MAPSZ * NBBY)
return (EBUSY); /* too many open instances */
error = cdevvp(makedev(major(vp->v_rdev),
(i << CLONE_SHIFT) | minor(vp->v_rdev)), &cvp);
if (error) {
clrbit(vp->v_specbitmap, i);
return (error); /* out of vnodes */
}
VOP_UNLOCK(vp);
error = cdevsw[major(vp->v_rdev)].d_open(cvp->v_rdev, ap->a_mode,
S_IFCHR, ap->a_p);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (error) {
vput(cvp);
clrbit(vp->v_specbitmap, i);
return (error); /* device open failed */
}
cvp->v_flag |= VCLONE;
cip = malloc(sizeof(struct cloneinfo), M_TEMP, M_WAITOK);
cip->ci_data = vp->v_data;
cip->ci_vp = cvp;
cvp->v_specparent = vp;
vp->v_flag |= VCLONED;
vp->v_data = cip;
DNPRINTF("clone of vnode %p is vnode %p\n", vp, cvp);
return (0); /* device cloned */
}
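/*
 * Illustrative sketch (not kernel code): how spec_open_clone() above
 * folds a clone instance number into the minor number.  CLONE_SHIFT = 8
 * is an assumption for the example; only minors below (1 << CLONE_SHIFT)
 * can be cloned.
 */
#include <stdio.h>
#define EX_CLONE_SHIFT	8
int
main(void)
{
	int base_minor = 3;	/* minor of the vnode being opened */
	int instance = 5;	/* first free bit found in the clone bitmap */
	int clone_minor = (instance << EX_CLONE_SHIFT) | base_minor;
	printf("clone minor = %d\n", clone_minor);		/* 1283 */
	printf("instance back out = %d\n", clone_minor >> EX_CLONE_SHIFT);
	printf("base back out = %d\n", clone_minor & ((1 << EX_CLONE_SHIFT) - 1));
	return 0;
}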
/* $OpenBSD: exec_subr.c,v 1.57 2019/11/29 06:34:45 deraadt Exp $ */
/* $NetBSD: exec_subr.c,v 1.9 1994/12/04 03:10:42 mycroft Exp $ */
/*
* Copyright (c) 1993, 1994 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/mman.h>
#include <sys/resourcevar.h>
#include <uvm/uvm_extern.h>
#ifdef DEBUG
/*
* new_vmcmd():
* create a new vmcmd structure and fill in its fields based
* on function call arguments. make sure objects ref'd by
* the vmcmd are 'held'.
*
* If not debugging, this is a macro, so it's expanded inline.
*/
void
new_vmcmd(struct exec_vmcmd_set *evsp,
int (*proc)(struct proc *, struct exec_vmcmd *), u_long len, u_long addr,
struct vnode *vp, u_long offset, u_int prot, int flags)
{
struct exec_vmcmd *vcp;
if (evsp->evs_used >= evsp->evs_cnt)
vmcmdset_extend(evsp);
vcp = &evsp->evs_cmds[evsp->evs_used++];
vcp->ev_proc = proc;
vcp->ev_len = len;
vcp->ev_addr = addr;
if ((vcp->ev_vp = vp) != NULL)
vref(vp);
vcp->ev_offset = offset;
vcp->ev_prot = prot;
vcp->ev_flags = flags;
}
#endif /* DEBUG */
void
vmcmdset_extend(struct exec_vmcmd_set *evsp)
{
struct exec_vmcmd *nvcp;
u_int ocnt;
#ifdef DIAGNOSTIC
if (evsp->evs_used < evsp->evs_cnt)
panic("vmcmdset_extend: not necessary");
#endif
ocnt = evsp->evs_cnt;
KASSERT(ocnt > 0);
/* figure out number of entries in new set */
evsp->evs_cnt += ocnt;
/* reallocate the command set */
nvcp = mallocarray(evsp->evs_cnt, sizeof(*nvcp), M_EXEC,
M_WAITOK);
memcpy(nvcp, evsp->evs_cmds, ocnt * sizeof(*nvcp));
if (evsp->evs_cmds != evsp->evs_start)
free(evsp->evs_cmds, M_EXEC, ocnt * sizeof(*nvcp));
evsp->evs_cmds = nvcp;
}
void
kill_vmcmds(struct exec_vmcmd_set *evsp)
{
struct exec_vmcmd *vcp;
int i;
for (i = 0; i < evsp->evs_used; i++) {
vcp = &evsp->evs_cmds[i];
if (vcp->ev_vp != NULLVP)
vrele(vcp->ev_vp);
}
/*
* Free old vmcmds and reset the array.
*/
evsp->evs_used = 0;
if (evsp->evs_cmds != evsp->evs_start)
free(evsp->evs_cmds, M_EXEC,
evsp->evs_cnt * sizeof(struct exec_vmcmd));
evsp->evs_cmds = evsp->evs_start;
evsp->evs_cnt = EXEC_DEFAULT_VMCMD_SETSIZE;
}
int
exec_process_vmcmds(struct proc *p, struct exec_package *epp)
{
struct exec_vmcmd *base_vc = NULL;
int error = 0;
int i;
for (i = 0; i < epp->ep_vmcmds.evs_used && !error; i++) {
struct exec_vmcmd *vcp;
vcp = &epp->ep_vmcmds.evs_cmds[i];
if (vcp->ev_flags & VMCMD_RELATIVE) {
#ifdef DIAGNOSTIC
if (base_vc == NULL)
panic("exec_process_vmcmds: RELATIVE no base");
#endif
vcp->ev_addr += base_vc->ev_addr;
}
error = (*vcp->ev_proc)(p, vcp);
if (vcp->ev_flags & VMCMD_BASE) {
base_vc = vcp;
}
}
kill_vmcmds(&epp->ep_vmcmds);
return (error);
}
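/*
 * Illustrative sketch (not kernel code): how VMCMD_BASE/VMCMD_RELATIVE
 * interact when walking a vmcmd list, as exec_process_vmcmds() above
 * does.  The flag values and struct are simplified stand-ins invented
 * for the example.
 */
#include <stdio.h>
#include <stdint.h>
#define EX_BASE		0x1
#define EX_RELATIVE	0x2
struct ex_vmcmd {
	uint64_t addr;
	int flags;
};
int
main(void)
{
	struct ex_vmcmd cmds[] = {
		{ 0x400000, EX_BASE },		/* text: becomes the base */
		{ 0x1000, EX_RELATIVE },	/* data: offset from the base */
		{ 0x3000, EX_RELATIVE },	/* bss: offset from the base */
	};
	uint64_t base = 0;
	for (size_t i = 0; i < sizeof(cmds) / sizeof(cmds[0]); i++) {
		/* relative commands are rebased before they are "run" */
		if (cmds[i].flags & EX_RELATIVE)
			cmds[i].addr += base;
		/* a base command records its address for later commands */
		if (cmds[i].flags & EX_BASE)
			base = cmds[i].addr;
		printf("cmd %zu maps at %#llx\n", i,
		    (unsigned long long)cmds[i].addr);
	}
	return 0;
}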
/*
* vmcmd_map_pagedvn():
* handle vmcmd which specifies that a vnode should be mmap'd.
* appropriate for handling demand-paged text and data segments.
*/
int
vmcmd_map_pagedvn(struct proc *p, struct exec_vmcmd *cmd)
{
/*
* note that if you're going to map part of a process as being
* paged from a vnode, that vnode had damn well better be marked as
* VTEXT. that's handled in the routine which sets up the vmcmd to
* call this routine.
*/
struct uvm_object *uobj;
unsigned int syscalls = 0;
int error;
/*
* map the vnode in using uvm_map.
*/
if (cmd->ev_len == 0)
return (0);
if (cmd->ev_offset & PAGE_MASK)
return (EINVAL);
if (cmd->ev_addr & PAGE_MASK)
return (EINVAL);
if (cmd->ev_len & PAGE_MASK)
return (EINVAL);
/*
* first, attach to the object
*/
uobj = uvn_attach(cmd->ev_vp, PROT_READ | PROT_EXEC);
if (uobj == NULL)
return (ENOMEM);
/*
* do the map
*/
if ((cmd->ev_flags & VMCMD_SYSCALL) && (cmd->ev_prot & PROT_EXEC))
syscalls |= UVM_FLAG_SYSCALL;
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr, cmd->ev_len,
uobj, cmd->ev_offset, 0,
UVM_MAPFLAG(cmd->ev_prot, PROT_MASK, MAP_INHERIT_COPY,
MADV_NORMAL, UVM_FLAG_COPYONW | UVM_FLAG_FIXED | syscalls));
/*
* check for error
*/
if (error) {
/*
* error: detach from object
*/
uobj->pgops->pgo_detach(uobj);
}
return (error);
}
/*
* vmcmd_map_readvn():
* handle vmcmd which specifies that a vnode should be read from.
* appropriate for non-demand-paged text/data segments, i.e. impure
* objects (a la OMAGIC and NMAGIC).
*/
int
vmcmd_map_readvn(struct proc *p, struct exec_vmcmd *cmd)
{
int error;
vm_prot_t prot;
if (cmd->ev_len == 0)
return (0);
prot = cmd->ev_prot;
cmd->ev_addr = trunc_page(cmd->ev_addr); /* required by uvm_map */
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr,
round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(prot | PROT_WRITE, PROT_MASK, MAP_INHERIT_COPY,
MADV_NORMAL, UVM_FLAG_FIXED|UVM_FLAG_OVERLAY|UVM_FLAG_COPYONW));
if (error)
return (error);
error = vn_rdwr(UIO_READ, cmd->ev_vp, (caddr_t)cmd->ev_addr,
cmd->ev_len, cmd->ev_offset, UIO_USERSPACE, IO_UNIT,
p->p_ucred, NULL, p);
if (error)
return (error);
if ((prot & PROT_WRITE) == 0) {
/*
* we had to map in the area at PROT_WRITE so that vn_rdwr()
* could write to it. however, the caller seems to want
* it mapped read-only, so now we are going to have to call
* uvm_map_protect() to fix up the protection. ICK.
*/
return (uvm_map_protect(&p->p_vmspace->vm_map,
trunc_page(cmd->ev_addr),
round_page(cmd->ev_addr + cmd->ev_len),
prot, FALSE));
}
return (0);
}
/*
* vmcmd_map_zero():
* handle vmcmd which specifies a zero-filled address space region.
*/
int
vmcmd_map_zero(struct proc *p, struct exec_vmcmd *cmd)
{
if (cmd->ev_len == 0)
return (0);
cmd->ev_addr = trunc_page(cmd->ev_addr); /* required by uvm_map */
return (uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr,
round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(cmd->ev_prot, PROT_MASK, MAP_INHERIT_COPY,
MADV_NORMAL, UVM_FLAG_FIXED|UVM_FLAG_COPYONW |
(cmd->ev_flags & VMCMD_STACK ? UVM_FLAG_STACK : 0))));
}
/*
* vmcmd_randomize():
* handle vmcmd which specifies a randomized address space region.
*/
#define RANDOMIZE_CTX_THRESHOLD 512
int
vmcmd_randomize(struct proc *p, struct exec_vmcmd *cmd)
{
int error;
struct arc4random_ctx *ctx;
char *buf;
size_t sublen, off = 0;
size_t len = cmd->ev_len;
if (len == 0)
return (0);
if (len > ELF_RANDOMIZE_LIMIT)
return (EINVAL);
buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
if (len < RANDOMIZE_CTX_THRESHOLD) {
arc4random_buf(buf, len);
error = copyout(buf, (void *)cmd->ev_addr, len);
explicit_bzero(buf, len);
} else {
ctx = arc4random_ctx_new();
do {
sublen = MIN(len, PAGE_SIZE);
arc4random_ctx_buf(ctx, buf, sublen);
error = copyout(buf, (void *)cmd->ev_addr + off, sublen);
if (error)
break;
off += sublen;
len -= sublen;
sched_pause(yield);
} while (len);
arc4random_ctx_free(ctx);
explicit_bzero(buf, PAGE_SIZE);
}
free(buf, M_TEMP, PAGE_SIZE);
return (error);
}
#ifndef MAXSSIZ_GUARD
#define MAXSSIZ_GUARD (1024 * 1024)
#endif
/*
* exec_setup_stack(): Set up the stack segment for an executable.
*
* Note that the ep_ssize parameter must be set to be the current stack
* limit; this is adjusted in the body of execve() to yield the
* appropriate stack segment usage once the argument length is
* calculated.
*
* This function returns an int for uniformity with other (future) formats'
* stack setup functions. They might have errors to return.
*/
int
exec_setup_stack(struct proc *p, struct exec_package *epp)
{
vaddr_t sgap;
#ifdef MACHINE_STACK_GROWS_UP
epp->ep_maxsaddr = USRSTACK;
epp->ep_minsaddr = USRSTACK + MAXSSIZ;
#else
epp->ep_maxsaddr = USRSTACK - MAXSSIZ - MAXSSIZ_GUARD;
epp->ep_minsaddr = USRSTACK;
#endif
epp->ep_ssize = round_page(lim_cur(RLIMIT_STACK));
if (stackgap_random != 0) {
sgap = arc4random() & (stackgap_random - 1);
sgap = trunc_page(sgap);
#ifdef MACHINE_STACK_GROWS_UP
epp->ep_maxsaddr += sgap;
epp->ep_minsaddr += sgap;
#else
epp->ep_maxsaddr -= sgap;
epp->ep_minsaddr -= sgap;
#endif
}
/*
* set up commands for stack. note that this takes *two*, one to
* map the part of the stack which we can access, and one to map
* the part which we can't.
*
* arguably, it could be made into one, but that would require the
* addition of another mapping proc, which is unnecessary
*
* note that in memory, things assumed to be: 0 ....... ep_maxsaddr
* <stack> ep_minsaddr
*/
#ifdef MACHINE_STACK_GROWS_UP
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero,
((epp->ep_minsaddr - epp->ep_ssize) - epp->ep_maxsaddr),
epp->ep_maxsaddr + epp->ep_ssize, NULLVP, 0,
PROT_NONE);
NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, epp->ep_ssize,
epp->ep_maxsaddr, NULLVP, 0,
PROT_READ | PROT_WRITE, VMCMD_STACK);
#else
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero,
((epp->ep_minsaddr - epp->ep_ssize) - epp->ep_maxsaddr),
epp->ep_maxsaddr, NULLVP, 0,
PROT_NONE);
NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, epp->ep_ssize,
(epp->ep_minsaddr - epp->ep_ssize), NULLVP, 0,
PROT_READ | PROT_WRITE, VMCMD_STACK);
#endif
return (0);
}
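/*
 * Illustrative sketch (not kernel code): the two stack mappings set up
 * by exec_setup_stack() above for the downward-growing case.  The sizes
 * are made-up example values, not the real USRSTACK/MAXSSIZ.
 */
#include <stdio.h>
#include <stdint.h>
int
main(void)
{
	uint64_t usrstack = 0x7f0000000000ULL;	/* assumed top of stack */
	uint64_t maxssiz  = 32ULL << 20;	/* assumed hard stack limit */
	uint64_t guard    = 1ULL << 20;		/* assumed MAXSSIZ_GUARD */
	uint64_t ssize    = 8ULL << 20;		/* current RLIMIT_STACK */
	uint64_t minsaddr = usrstack;
	uint64_t maxsaddr = usrstack - maxssiz - guard;
	/* PROT_NONE region: everything between maxsaddr and the live stack */
	uint64_t none_len  = (minsaddr - ssize) - maxsaddr;
	uint64_t none_addr = maxsaddr;
	/* read/write region: the currently usable part of the stack */
	uint64_t rw_len  = ssize;
	uint64_t rw_addr = minsaddr - ssize;
	printf("PROT_NONE: addr=%#llx len=%llu MB\n",
	    (unsigned long long)none_addr, (unsigned long long)(none_len >> 20));
	printf("RW stack : addr=%#llx len=%llu MB\n",
	    (unsigned long long)rw_addr, (unsigned long long)(rw_len >> 20));
	return 0;
}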
/* $OpenBSD: tsc.c,v 1.26 2022/08/25 17:38:16 cheloha Exp $ */
/*
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* Copyright (c) 2016,2017 Reyk Floeter <reyk@openbsd.org>
* Copyright (c) 2017 Adam Steen <adam@adamsteen.com.au>
* Copyright (c) 2017 Mike Belopuhov <mike@openbsd.org>
* Copyright (c) 2019 Paul Irofti <paul@irofti.net>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/timetc.h>
#include <sys/atomic.h>
#include <machine/cpu.h>
#include <machine/cpufunc.h>
#define RECALIBRATE_MAX_RETRIES 5
#define RECALIBRATE_SMI_THRESHOLD 50000
#define RECALIBRATE_DELAY_THRESHOLD 50
int tsc_recalibrate;
uint64_t tsc_frequency;
int tsc_is_invariant;
u_int tsc_get_timecount(struct timecounter *tc);
void tsc_delay(int usecs);
#include "lapic.h"
#if NLAPIC > 0
extern u_int32_t lapic_per_second;
#endif
struct timecounter tsc_timecounter = {
.tc_get_timecount = tsc_get_timecount,
.tc_poll_pps = NULL,
.tc_counter_mask = ~0u,
.tc_frequency = 0,
.tc_name = "tsc",
.tc_quality = -1000,
.tc_priv = NULL,
.tc_user = TC_TSC,
};
uint64_t
tsc_freq_cpuid(struct cpu_info *ci)
{
uint64_t count;
uint32_t eax, ebx, khz, dummy;
if (!strcmp(cpu_vendor, "GenuineIntel") &&
cpuid_level >= 0x15) {
eax = ebx = khz = dummy = 0;
CPUID(0x15, eax, ebx, khz, dummy);
khz /= 1000;
if (khz == 0) {
switch (ci->ci_model) {
case 0x4e: /* Skylake mobile */
case 0x5e: /* Skylake desktop */
case 0x8e: /* Kabylake mobile */
case 0x9e: /* Kabylake desktop */
case 0xa5: /* CML-H CML-S62 CML-S102 */
case 0xa6: /* CML-U62 */
khz = 24000; /* 24.0 MHz */
break;
case 0x5f: /* Atom Denverton */
khz = 25000; /* 25.0 MHz */
break;
case 0x5c: /* Atom Goldmont */
khz = 19200; /* 19.2 MHz */
break;
}
}
if (ebx == 0 || eax == 0)
count = 0;
else if ((count = (uint64_t)khz * (uint64_t)ebx / eax) != 0) {
#if NLAPIC > 0
lapic_per_second = khz * 1000;
#endif
return (count * 1000);
}
}
return (0);
}
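/*
 * Illustrative sketch (not kernel code): the CPUID leaf 0x15 arithmetic
 * used by tsc_freq_cpuid() above.  The register values are made up for
 * the example; on real hardware they come from CPUID(0x15), with the
 * crystal frequency taken from ECX or the model table when ECX is zero.
 */
#include <stdio.h>
#include <stdint.h>
int
main(void)
{
	uint32_t eax = 2;	/* denominator of the TSC/crystal ratio */
	uint32_t ebx = 176;	/* numerator of the TSC/crystal ratio */
	uint32_t khz = 24000;	/* crystal clock in kHz */
	if (eax == 0 || ebx == 0) {
		printf("leaf 0x15 unusable\n");
		return 1;
	}
	uint64_t tsc_hz = (uint64_t)khz * ebx / eax * 1000;
	printf("TSC frequency = %llu Hz\n", (unsigned long long)tsc_hz);
	/* 24000 kHz * 176 / 2 = 2112000 kHz -> 2.112 GHz */
	return 0;
}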
void
tsc_identify(struct cpu_info *ci)
{
if (!(ci->ci_flags & CPUF_PRIMARY) ||
!(ci->ci_flags & CPUF_CONST_TSC) ||
!(ci->ci_flags & CPUF_INVAR_TSC))
return;
tsc_is_invariant = 1;
tsc_frequency = tsc_freq_cpuid(ci);
if (tsc_frequency > 0)
delay_init(tsc_delay, 5000);
}
static inline int
get_tsc_and_timecount(struct timecounter *tc, uint64_t *tsc, uint64_t *count)
{
uint64_t n, tsc1, tsc2;
int i;
for (i = 0; i < RECALIBRATE_MAX_RETRIES; i++) {
tsc1 = rdtsc_lfence();
n = (tc->tc_get_timecount(tc) & tc->tc_counter_mask);
tsc2 = rdtsc_lfence();
if ((tsc2 - tsc1) < RECALIBRATE_SMI_THRESHOLD) {
*count = n;
*tsc = tsc2;
return (0);
}
}
return (1);
}
static inline uint64_t
calculate_tsc_freq(uint64_t tsc1, uint64_t tsc2, int usec)
{
uint64_t delta;
delta = (tsc2 - tsc1);
return (delta * 1000000 / usec);
}
static inline uint64_t
calculate_tc_delay(struct timecounter *tc, uint64_t count1, uint64_t count2)
{
uint64_t delta;
if (count2 < count1)
count2 += tc->tc_counter_mask;
delta = (count2 - count1);
return (delta * 1000000 / tc->tc_frequency);
}
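/*
 * Illustrative sketch (not kernel code): the arithmetic behind
 * calculate_tsc_freq() and calculate_tc_delay() above, with made-up
 * counter readings.  The reference frequency is an assumed
 * ACPI-timer-like 3.579545 MHz.
 */
#include <stdio.h>
#include <stdint.h>
int
main(void)
{
	uint64_t tc_freq = 3579545;			/* reference counter Hz */
	uint64_t count1 = 1000, count2 = 359000;	/* reference readings */
	uint64_t tsc1 = 50000000, tsc2 = 250000000;	/* TSC readings */
	/* elapsed microseconds according to the reference counter */
	uint64_t usec = (count2 - count1) * 1000000 / tc_freq;
	/* TSC cycles per second implied by that interval */
	uint64_t tsc_hz = (tsc2 - tsc1) * 1000000 / usec;
	printf("usec=%llu tsc_hz=%llu\n",
	    (unsigned long long)usec, (unsigned long long)tsc_hz);
	return 0;
}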
uint64_t
measure_tsc_freq(struct timecounter *tc)
{
uint64_t count1, count2, frequency, min_freq, tsc1, tsc2;
u_long s;
int delay_usec, i, err1, err2, usec, success = 0;
/* warmup the timers */
for (i = 0; i < 3; i++) {
(void)tc->tc_get_timecount(tc);
(void)rdtsc();
}
min_freq = ULLONG_MAX;
delay_usec = 100000;
for (i = 0; i < 3; i++) {
s = intr_disable();
err1 = get_tsc_and_timecount(tc, &tsc1, &count1);
delay(delay_usec);
err2 = get_tsc_and_timecount(tc, &tsc2, &count2);
intr_restore(s);
if (err1 || err2)
continue;
usec = calculate_tc_delay(tc, count1, count2);
if ((usec < (delay_usec - RECALIBRATE_DELAY_THRESHOLD)) ||
(usec > (delay_usec + RECALIBRATE_DELAY_THRESHOLD)))
continue;
frequency = calculate_tsc_freq(tsc1, tsc2, usec);
min_freq = MIN(min_freq, frequency);
success++;
}
return (success > 1 ? min_freq : 0);
}
void
calibrate_tsc_freq(void)
{
struct timecounter *reference = tsc_timecounter.tc_priv;
uint64_t freq;
if (!reference || !tsc_recalibrate)
return;
if ((freq = measure_tsc_freq(reference)) == 0)
return;
tsc_frequency = freq;
tsc_timecounter.tc_frequency = freq;
if (tsc_is_invariant)
tsc_timecounter.tc_quality = 2000;
}
void
cpu_recalibrate_tsc(struct timecounter *tc)
{
struct timecounter *reference = tsc_timecounter.tc_priv;
/* Prevent recalibration with a worse timecounter source */
if (reference && reference->tc_quality > tc->tc_quality)
return;
tsc_timecounter.tc_priv = tc;
calibrate_tsc_freq();
}
u_int
tsc_get_timecount(struct timecounter *tc)
{
return rdtsc_lfence();
}
void
tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq)
{
if (!(ci->ci_flags & CPUF_PRIMARY) ||
!(ci->ci_flags & CPUF_CONST_TSC) ||
!(ci->ci_flags & CPUF_INVAR_TSC))
return;
/* Newer CPUs don't require recalibration */
if (tsc_frequency > 0) {
tsc_timecounter.tc_frequency = tsc_frequency;
tsc_timecounter.tc_quality = 2000;
} else {
tsc_recalibrate = 1;
tsc_frequency = cpufreq;
tsc_timecounter.tc_frequency = cpufreq;
calibrate_tsc_freq();
}
tc_init(&tsc_timecounter);
}
void
tsc_delay(int usecs)
{
uint64_t interval, start;
interval = (uint64_t)usecs * tsc_frequency / 1000000;
start = rdtsc_lfence();
while (rdtsc_lfence() - start < interval)
CPU_BUSY_CYCLE();
}
#ifdef MULTIPROCESSOR
#define TSC_DEBUG 1
/*
* Protections for global variables in this code:
*
* a Modified atomically
* b Protected by a barrier
* p Only modified by the primary CPU
*/
#define TSC_TEST_MSECS 1 /* Test round duration */
#define TSC_TEST_ROUNDS 2 /* Number of test rounds */
/*
* tsc_test_status.val is isolated to its own cache line to limit
* false sharing and reduce the test's margin of error.
*/
struct tsc_test_status {
volatile uint64_t val; /* [a] Latest RDTSC value */
uint64_t pad1[7];
uint64_t lag_count; /* [b] Number of lags seen by CPU */
uint64_t lag_max; /* [b] Biggest lag seen by CPU */
int64_t adj; /* [b] Initial IA32_TSC_ADJUST value */
uint64_t pad2[5];
} __aligned(64);
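/*
 * Illustrative sketch (not kernel code): checking that the hot "val"
 * field sits alone in a 64-byte cache line, which is what the padding in
 * the struct above is for.  The field names only mirror the struct for
 * the example; the alignment attribute spelling is the GCC/Clang one.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
struct ex_status {
	volatile uint64_t val;
	uint64_t pad1[7];
	uint64_t lag_count;
	uint64_t lag_max;
	int64_t adj;
	uint64_t pad2[5];
} __attribute__((aligned(64)));
int
main(void)
{
	/* val occupies bytes 0..7; pad1 pushes the next field to byte 64 */
	assert(offsetof(struct ex_status, lag_count) == 64);
	assert(sizeof(struct ex_status) % 64 == 0);
	printf("sizeof = %zu, lag_count at %zu\n",
	    sizeof(struct ex_status), offsetof(struct ex_status, lag_count));
	return 0;
}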
struct tsc_test_status tsc_ap_status; /* Test results from AP */
struct tsc_test_status tsc_bp_status; /* Test results from BP */
uint64_t tsc_test_cycles; /* [p] TSC cycles per test round */
const char *tsc_ap_name; /* [b] Name of AP running test */
volatile u_int tsc_egress_barrier; /* [a] Test end barrier */
volatile u_int tsc_ingress_barrier; /* [a] Test start barrier */
volatile u_int tsc_test_rounds; /* [p] Remaining test rounds */
int tsc_is_synchronized = 1; /* [p] Have we ever failed the test? */
void tsc_report_test_results(void);
void tsc_reset_adjust(struct tsc_test_status *);
void tsc_test_ap(void);
void tsc_test_bp(void);
void
tsc_test_sync_bp(struct cpu_info *ci)
{
if (!tsc_is_invariant)
return;
#ifndef TSC_DEBUG
/* No point in testing again if we already failed. */
if (!tsc_is_synchronized)
return;
#endif
/* Reset IA32_TSC_ADJUST if it exists. */
tsc_reset_adjust(&tsc_bp_status);
/* Reset the test cycle limit and round count. */
tsc_test_cycles = TSC_TEST_MSECS * tsc_frequency / 1000;
tsc_test_rounds = TSC_TEST_ROUNDS;
do {
/*
* Pass through the ingress barrier, run the test,
* then wait for the AP to reach the egress barrier.
*/
atomic_inc_int(&tsc_ingress_barrier);
while (tsc_ingress_barrier != 2)
CPU_BUSY_CYCLE();
tsc_test_bp();
while (tsc_egress_barrier != 1)
CPU_BUSY_CYCLE();
/*
* Report what happened. Adjust the TSC's quality
* if this is the first time we've failed the test.
*/
tsc_report_test_results();
if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) {
if (tsc_is_synchronized) {
tsc_is_synchronized = 0;
tc_reset_quality(&tsc_timecounter, -1000);
}
tsc_test_rounds = 0;
} else
tsc_test_rounds--;
/*
* Clean up for the next round. It is safe to reset the
* ingress barrier because at this point we know the AP
* has reached the egress barrier.
*/
memset(&tsc_ap_status, 0, sizeof tsc_ap_status);
memset(&tsc_bp_status, 0, sizeof tsc_bp_status);
tsc_ingress_barrier = 0;
if (tsc_test_rounds == 0)
tsc_ap_name = NULL;
/*
* Pass through the egress barrier and release the AP.
* The AP is responsible for resetting the egress barrier.
*/
if (atomic_inc_int_nv(&tsc_egress_barrier) != 2)
panic("%s: unexpected egress count", __func__);
} while (tsc_test_rounds > 0);
}
void
tsc_test_sync_ap(struct cpu_info *ci)
{
if (!tsc_is_invariant)
return;
#ifndef TSC_DEBUG
if (!tsc_is_synchronized)
return;
#endif
/* The BP needs our name in order to report any problems. */
if (atomic_cas_ptr(&tsc_ap_name, NULL, ci->ci_dev->dv_xname) != NULL) {
panic("%s: %s: tsc_ap_name is not NULL: %s",
__func__, ci->ci_dev->dv_xname, tsc_ap_name);
}
tsc_reset_adjust(&tsc_ap_status);
/*
* The AP is only responsible for running the test and
* resetting the egress barrier. The BP handles everything
* else.
*/
do {
atomic_inc_int(&tsc_ingress_barrier);
while (tsc_ingress_barrier != 2)
CPU_BUSY_CYCLE();
tsc_test_ap();
atomic_inc_int(&tsc_egress_barrier);
while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2)
CPU_BUSY_CYCLE();
} while (tsc_test_rounds > 0);
}
void
tsc_report_test_results(void)
{
u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1;
if (tsc_bp_status.adj != 0) {
printf("tsc: cpu0: IA32_TSC_ADJUST: %lld -> 0\n",
tsc_bp_status.adj);
}
if (tsc_ap_status.adj != 0) {
printf("tsc: %s: IA32_TSC_ADJUST: %lld -> 0\n",
tsc_ap_name, tsc_ap_status.adj);
}
if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0) {
printf("tsc: cpu0/%s: sync test round %u/%u failed\n",
tsc_ap_name, round, TSC_TEST_ROUNDS);
}
if (tsc_bp_status.lag_count > 0) {
printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n",
tsc_ap_name, tsc_bp_status.lag_count,
tsc_bp_status.lag_max);
}
if (tsc_ap_status.lag_count > 0) {
printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n",
tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count,
tsc_ap_status.lag_max);
}
}
/*
* Reset IA32_TSC_ADJUST if we have it.
*
* XXX We should rearrange cpu_hatch() so that the feature
* flags are already set before we get here. Check CPUID
* by hand until then.
*/
void
tsc_reset_adjust(struct tsc_test_status *tts)
{
uint32_t eax, ebx, ecx, edx;
CPUID(0, eax, ebx, ecx, edx);
if (eax >= 7) {
CPUID_LEAF(7, 0, eax, ebx, ecx, edx);
if (ISSET(ebx, SEFF0EBX_TSC_ADJUST)) {
tts->adj = rdmsr(MSR_TSC_ADJUST);
if (tts->adj != 0)
wrmsr(MSR_TSC_ADJUST, 0);
}
}
}
void
tsc_test_ap(void)
{
uint64_t ap_val, bp_val, end, lag;
ap_val = rdtsc_lfence();
end = ap_val + tsc_test_cycles;
while (__predict_true(ap_val < end)) {
/*
* Get the BP's latest TSC value, then read the AP's
* TSC. LFENCE is a serializing instruction, so we
* know bp_val predates ap_val. If ap_val is smaller
* than bp_val then the AP's TSC must trail that of
* the BP and the counters cannot be synchronized.
*/
bp_val = tsc_bp_status.val;
ap_val = rdtsc_lfence();
tsc_ap_status.val = ap_val;
/*
* Record the magnitude of the problem if the AP's TSC
* trails the BP's TSC.
*/
if (__predict_false(ap_val < bp_val)) {
tsc_ap_status.lag_count++;
lag = bp_val - ap_val;
if (tsc_ap_status.lag_max < lag)
tsc_ap_status.lag_max = lag;
}
}
}
/*
* This is similar to tsc_test_ap(), but with all relevant variables
* flipped around to run the test from the BP's perspective.
*/
void
tsc_test_bp(void)
{
uint64_t ap_val, bp_val, end, lag;
bp_val = rdtsc_lfence();
end = bp_val + tsc_test_cycles;
while (__predict_true(bp_val < end)) {
ap_val = tsc_ap_status.val;
bp_val = rdtsc_lfence();
tsc_bp_status.val = bp_val;
if (__predict_false(bp_val < ap_val)) {
tsc_bp_status.lag_count++;
lag = ap_val - bp_val;
if (tsc_bp_status.lag_max < lag)
tsc_bp_status.lag_max = lag;
}
}
}
#endif /* MULTIPROCESSOR */
/* $OpenBSD: vfs_subr.c,v 1.317 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
/*
* External virtual filesystem routines
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/lockf.h>
#include <sys/stat.h>
#include <sys/acct.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/syscallargs.h>
#include <sys/pool.h>
#include <sys/tree.h>
#include <sys/specdev.h>
#include <sys/atomic.h>
#include <netinet/in.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_vnode.h>
#include "softraid.h"
void sr_quiesce(void);
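/*
 * Conversion tables between the stat(2) file type bits (S_IFMT values)
 * and vnode types, and back again.
 */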
enum vtype iftovt_tab[16] = {
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
S_IFSOCK, S_IFIFO, S_IFMT,
};
int prtactive = 0; /* 1 => print out reclaim of active vnodes */
int suid_clear = 1; /* 1 => clear SUID / SGID on owner change */
/*
* Insq/Remq for the vnode usage lists.
*/
#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp) { \
LIST_REMOVE(bp, b_vnbufs); \
LIST_NEXT(bp, b_vnbufs) = NOLIST; \
}
TAILQ_HEAD(freelst, vnode);
struct freelst vnode_hold_list; /* list of vnodes referencing buffers */
struct freelst vnode_free_list; /* vnode free list */
struct mntlist mountlist; /* mounted filesystem list */
void vclean(struct vnode *, int, struct proc *);
void insmntque(struct vnode *, struct mount *);
int getdevvp(dev_t, struct vnode **, enum vtype);
int vfs_hang_addrlist(struct mount *, struct netexport *,
struct export_args *);
int vfs_free_netcred(struct radix_node *, void *, u_int);
void vfs_free_addrlist(struct netexport *);
void vputonfreelist(struct vnode *);
int vflush_vnode(struct vnode *, void *);
int maxvnodes;
struct mutex vnode_mtx = MUTEX_INITIALIZER(IPL_BIO);
void vfs_unmountall(void);
#ifdef DEBUG
void printlockedvnodes(void);
#endif
struct pool vnode_pool;
struct pool uvm_vnode_pool;
static inline int rb_buf_compare(const struct buf *b1, const struct buf *b2);
RBT_GENERATE(buf_rb_bufs, buf, b_rbbufs, rb_buf_compare);
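/*
 * Per-vnode buffer tree comparison: buffers are ordered by logical
 * block number.
 */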
static inline int
rb_buf_compare(const struct buf *b1, const struct buf *b2)
{
if (b1->b_lblkno < b2->b_lblkno)
return(-1);
if (b1->b_lblkno > b2->b_lblkno)
return(1);
return(0);
}
/*
* Initialize the vnode management data structures.
*/
void
vntblinit(void)
{
/* buffer cache may need a vnode for each buffer */
maxvnodes = 2 * initialvnodes;
pool_init(&vnode_pool, sizeof(struct vnode), 0, IPL_NONE,
PR_WAITOK, "vnodes", NULL);
pool_init(&uvm_vnode_pool, sizeof(struct uvm_vnode), 0, IPL_NONE,
PR_WAITOK, "uvmvnodes", NULL);
TAILQ_INIT(&vnode_hold_list);
TAILQ_INIT(&vnode_free_list);
TAILQ_INIT(&mountlist);
/*
* Initialize the filesystem syncer.
*/
vn_initialize_syncerd();
#ifdef NFSSERVER
rn_init(sizeof(struct sockaddr_in));
#endif /* NFSSERVER */
}
/*
* Allocate a mount point.
*
* The returned mount point is marked as busy.
*/
struct mount *
vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp)
{
struct mount *mp;
mp = malloc(sizeof(*mp), M_MOUNT, M_WAITOK|M_ZERO);
rw_init_flags(&mp->mnt_lock, "vfslock", RWL_IS_VNODE);
(void)vfs_busy(mp, VB_READ|VB_NOWAIT);
TAILQ_INIT(&mp->mnt_vnodelist);
mp->mnt_vnodecovered = vp;
atomic_inc_int(&vfsp->vfc_refcount);
mp->mnt_vfc = vfsp;
mp->mnt_op = vfsp->vfc_vfsops;
mp->mnt_flag = vfsp->vfc_flags;
strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
return (mp);
}
/*
* Release a mount point.
*/
void
vfs_mount_free(struct mount *mp)
{
atomic_dec_int(&mp->mnt_vfc->vfc_refcount);
free(mp, M_MOUNT, sizeof(*mp));
}
/*
* Mark a mount point as busy. Used to synchronize access and to delay
* unmounting.
*
* Default behaviour is to attempt getting a READ lock and in case of an
* ongoing unmount, to wait for it to finish and then return failure.
*/
int
vfs_busy(struct mount *mp, int flags)
{
int rwflags = 0;
if (flags & VB_WRITE)
rwflags |= RW_WRITE;
else
rwflags |= RW_READ;
if (flags & VB_WAIT)
rwflags |= RW_SLEEPFAIL;
else
rwflags |= RW_NOSLEEP;
#ifdef WITNESS
if (flags & VB_DUPOK)
rwflags |= RW_DUPOK;
#endif
if (rw_enter(&mp->mnt_lock, rwflags))
return (EBUSY);
return (0);
}
/*
* Free a busy file system
*/
void
vfs_unbusy(struct mount *mp)
{
rw_exit(&mp->mnt_lock);
}
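/*
 * Check whether a mount point is currently busied by some thread.
 */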
int
vfs_isbusy(struct mount *mp)
{
if (RWLOCK_OWNER(&mp->mnt_lock) > 0)
return (1);
else
return (0);
}
/*
* Lookup a filesystem type, and if found allocate and initialize
* a mount structure for it.
*
* Devname is usually updated by mount(8) after booting.
*/
int
vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
{
struct vfsconf *vfsp;
struct mount *mp;
vfsp = vfs_byname(fstypename);
if (vfsp == NULL)
return (ENODEV);
mp = vfs_mount_alloc(NULLVP, vfsp);
mp->mnt_flag |= MNT_RDONLY;
mp->mnt_stat.f_mntonname[0] = '/';
copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN, NULL);
copystr(devname, mp->mnt_stat.f_mntfromspec, MNAMELEN, NULL);
*mpp = mp;
return (0);
}
/*
* Lookup a mount point by filesystem identifier.
*/
struct mount *
vfs_getvfs(fsid_t *fsid)
{
struct mount *mp;
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
return (mp);
}
}
return (NULL);
}
/*
* Get a new unique fsid
*/
void
vfs_getnewfsid(struct mount *mp)
{
static u_short xxxfs_mntid;
fsid_t tfsid;
int mtype;
mtype = mp->mnt_vfc->vfc_typenum;
mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
mp->mnt_stat.f_fsid.val[1] = mtype;
if (xxxfs_mntid == 0)
++xxxfs_mntid;
tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
tfsid.val[1] = mtype;
if (!TAILQ_EMPTY(&mountlist)) {
while (vfs_getvfs(&tfsid)) {
tfsid.val[0]++;
xxxfs_mntid++;
}
}
mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
}
/*
* Set vnode attributes to VNOVAL
*/
void
vattr_null(struct vattr *vap)
{
vap->va_type = VNON;
/*
* Don't get fancy: u_quad_t = u_int = VNOVAL leaves the u_quad_t
* with 2^31-1 instead of 2^64-1. Just write'm out and let
* the compiler do its job.
*/
vap->va_mode = VNOVAL;
vap->va_nlink = VNOVAL;
vap->va_uid = VNOVAL;
vap->va_gid = VNOVAL;
vap->va_fsid = VNOVAL;
vap->va_fileid = VNOVAL;
vap->va_size = VNOVAL;
vap->va_blocksize = VNOVAL;
vap->va_atime.tv_sec = VNOVAL;
vap->va_atime.tv_nsec = VNOVAL;
vap->va_mtime.tv_sec = VNOVAL;
vap->va_mtime.tv_nsec = VNOVAL;
vap->va_ctime.tv_sec = VNOVAL;
vap->va_ctime.tv_nsec = VNOVAL;
vap->va_gen = VNOVAL;
vap->va_flags = VNOVAL;
vap->va_rdev = VNOVAL;
vap->va_bytes = VNOVAL;
vap->va_filerev = VNOVAL;
vap->va_vaflags = 0;
}
/*
* Routines having to do with the management of the vnode table.
*/
long numvnodes;
/*
* Return the next vnode from the free list.
*/
int
getnewvnode(enum vtagtype tag, struct mount *mp, const struct vops *vops,
struct vnode **vpp)
{
struct proc *p = curproc;
struct freelst *listhd;
static int toggle;
struct vnode *vp;
int s;
/*
* allow maxvnodes to increase if the buffer cache itself
* is big enough to justify it. (we don't shrink it ever)
*/
maxvnodes = maxvnodes < bcstats.numbufs ? bcstats.numbufs
: maxvnodes;
/*
* We must choose whether to allocate a new vnode or recycle an
* existing one. The criterion for allocating a new one is that
* the total number of vnodes is less than the number desired or
* there are no vnodes on either free list. Generally we only
* want to recycle vnodes that have no buffers associated with
* them, so we look first on the vnode_free_list. If it is empty,
* we next consider vnodes with referencing buffers on the
* vnode_hold_list. The toggle ensures that half the time we
* will use a buffer from the vnode_hold_list, and half the time
* we will allocate a new one unless the list has grown to twice
* the desired size. We are reluctant to recycle vnodes from the
* vnode_hold_list because we will lose the identity of all its
* referencing buffers.
*/
toggle ^= 1;
if (numvnodes / 2 > maxvnodes)
toggle = 0;
s = splbio();
	if ((numvnodes < maxvnodes) ||
	    ((TAILQ_FIRST(listhd = &vnode_free_list) == NULL) &&
	    ((TAILQ_FIRST(listhd = &vnode_hold_list) == NULL) || toggle))) {
splx(s);
vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO);
vp->v_uvm = pool_get(&uvm_vnode_pool, PR_WAITOK | PR_ZERO);
vp->v_uvm->u_vnode = vp;
uvm_obj_init(&vp->v_uvm->u_obj, &uvm_vnodeops, 0);
RBT_INIT(buf_rb_bufs, &vp->v_bufs_tree);
cache_tree_init(&vp->v_nc_tree);
TAILQ_INIT(&vp->v_cache_dst);
numvnodes++;
} else {
TAILQ_FOREACH(vp, listhd, v_freelist) {
if (VOP_ISLOCKED(vp) == 0)
break;
}
/*
* Unless this is a bad time of the month, at most
* the first NCPUS items on the free list are
* locked, so this is close enough to being empty.
*/
if (vp == NULL) {
splx(s);
tablefull("vnode");
*vpp = NULL;
return (ENFILE);
}
#ifdef DIAGNOSTIC
if (vp->v_usecount) {
vprint("free vnode", vp);
panic("free vnode isn't");
}
#endif
TAILQ_REMOVE(listhd, vp, v_freelist);
vp->v_bioflag &= ~VBIOONFREELIST;
splx(s);
		if (vp->v_type != VBAD)
			vgonel(vp, p);
#ifdef DIAGNOSTIC
if (vp->v_data) {
vprint("cleaned vnode", vp);
panic("cleaned vnode isn't");
}
s = splbio();
if (vp->v_numoutput)
panic("Clean vnode has pending I/O's"); splx(s);
#endif
vp->v_flag = 0;
vp->v_socket = NULL;
}
cache_purge(vp);
vp->v_type = VNON;
vp->v_tag = tag;
vp->v_op = vops;
insmntque(vp, mp);
*vpp = vp;
vp->v_usecount = 1;
vp->v_data = NULL;
return (0);
}
/*
* Move a vnode from one mount queue to another.
*/
void
insmntque(struct vnode *vp, struct mount *mp)
{
/*
* Delete from old mount point vnode list, if on one.
*/
	if (vp->v_mount != NULL)
		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
/*
* Insert into list of vnodes for the new mount point, if available.
*/
	if ((vp->v_mount = mp) != NULL)
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
}
/*
* Create a vnode for a block device.
* Used for root filesystem, argdev, and swap areas.
* Also used for memory file system special devices.
*/
int
bdevvp(dev_t dev, struct vnode **vpp)
{
return (getdevvp(dev, vpp, VBLK));
}
/*
* Create a vnode for a character device.
* Used for console handling.
*/
int
cdevvp(dev_t dev, struct vnode **vpp)
{
return (getdevvp(dev, vpp, VCHR));
}
/*
* Create a vnode for a device.
* Used by bdevvp (block device) for root file system etc.,
* and by cdevvp (character device) for console.
*/
int
getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
{
struct vnode *vp;
struct vnode *nvp;
int error;
if (dev == NODEV) {
*vpp = NULLVP;
return (0);
}
error = getnewvnode(VT_NON, NULL, &spec_vops, &nvp);
if (error) {
*vpp = NULLVP;
return (error);
}
vp = nvp;
vp->v_type = type;
if ((nvp = checkalias(vp, dev, NULL)) != NULL) {
vput(vp);
vp = nvp;
}
if (vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_type == D_TTY)
vp->v_flag |= VISTTY;
*vpp = vp;
return (0);
}
/*
* Check to see if the new vnode represents a special device
* for which we already have a vnode (either because of
* bdevvp() or because of a different vnode representing
* the same block device). If such an alias exists, deallocate
* the existing contents and return the aliased vnode. The
* caller is responsible for filling it with its new contents.
*/
struct vnode *
checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
{
struct proc *p = curproc;
struct vnode *vp;
struct vnodechain *vchain;
if (nvp->v_type != VBLK && nvp->v_type != VCHR)
return (NULLVP);
vchain = &speclisth[SPECHASH(nvp_rdev)];
loop:
	SLIST_FOREACH(vp, vchain, v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) {
continue;
}
/*
* Alias, but not in use, so flush it out.
*/
if (vp->v_usecount == 0) {
vgonel(vp, p);
goto loop;
}
if (vget(vp, LK_EXCLUSIVE)) {
goto loop;
}
break;
}
/*
* Common case is actually in the if statement
*/
if (vp == NULL || !(vp->v_tag == VT_NON && vp->v_type == VBLK)) {
nvp->v_specinfo = malloc(sizeof(struct specinfo), M_VNODE,
M_WAITOK);
nvp->v_rdev = nvp_rdev;
nvp->v_hashchain = vchain;
nvp->v_specmountpoint = NULL;
nvp->v_speclockf = NULL;
nvp->v_specbitmap = NULL;
		if (nvp->v_type == VCHR &&
		    (cdevsw[major(nvp_rdev)].d_flags & D_CLONE) &&
(minor(nvp_rdev) >> CLONE_SHIFT == 0)) {
if (vp != NULLVP)
nvp->v_specbitmap = vp->v_specbitmap;
else
nvp->v_specbitmap = malloc(CLONE_MAPSZ,
M_VNODE, M_WAITOK | M_ZERO);
}
SLIST_INSERT_HEAD(vchain, nvp, v_specnext);
if (vp != NULLVP) {
nvp->v_flag |= VALIASED;
vp->v_flag |= VALIASED;
vput(vp);
}
return (NULLVP);
}
/*
* This code is the uncommon case. It is called in case
* we found an alias that was VT_NON && vtype of VBLK
* This means we found a block device that was created
* using bdevvp.
* An example of such a vnode is the root partition device vnode
* created in ffs_mountroot.
*
* The vnodes created by bdevvp should not be aliased (why?).
*/
VOP_UNLOCK(vp);
vclean(vp, 0, p);
vp->v_op = nvp->v_op;
vp->v_tag = nvp->v_tag;
nvp->v_type = VNON;
insmntque(vp, mp);
return (vp);
}
/*
* Grab a particular vnode from the free list, increment its
* reference count and lock it. If the vnode lock bit is set,
* the vnode is being eliminated in vgone. In that case, we
* cannot grab it, so the process is awakened when the
* transition is completed, and an error code is returned to
* indicate that the vnode is no longer usable, possibly
* having been changed to a new file system type.
*/
int
vget(struct vnode *vp, int flags)
{
int error, s, onfreelist;
/*
* If the vnode is in the process of being cleaned out for
* another use, we wait for the cleaning to finish and then
* return failure. Cleaning is determined by checking that
* the VXLOCK flag is set.
*/
mtx_enter(&vnode_mtx);
if (vp->v_lflag & VXLOCK) {
if (flags & LK_NOWAIT) {
mtx_leave(&vnode_mtx);
return (EBUSY);
}
vp->v_lflag |= VXWANT;
msleep_nsec(vp, &vnode_mtx, PINOD, "vget", INFSLP);
mtx_leave(&vnode_mtx);
return (ENOENT);
}
mtx_leave(&vnode_mtx);
s = splbio();
onfreelist = vp->v_bioflag & VBIOONFREELIST;
if (vp->v_usecount == 0 && onfreelist) {
if (vp->v_holdcnt > 0)
TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
else
TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
vp->v_bioflag &= ~VBIOONFREELIST;
}
splx(s);
vp->v_usecount++;
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags)) != 0) {
vp->v_usecount--;
			if (vp->v_usecount == 0 && onfreelist)
				vputonfreelist(vp);
}
return (error);
}
return (0);
}
/* Vnode reference: add a reference to an already-referenced vnode. */
void
vref(struct vnode *vp)
{
	KERNEL_ASSERT_LOCKED();
#ifdef DIAGNOSTIC
if (vp->v_usecount == 0)
panic("vref used where vget required");
if (vp->v_type == VNON)
panic("vref on a VNON vnode");
#endif
vp->v_usecount++;
}
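/*
 * Put an unreferenced vnode on the appropriate free list: the hold
 * list if buffers still reference it, the free list otherwise.
 * Bad vnodes go to the head so they are reused first.
 */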
void
vputonfreelist(struct vnode *vp)
{
int s;
struct freelst *lst;
s = splbio();
#ifdef DIAGNOSTIC
if (vp->v_usecount != 0)
panic("Use count is not zero!");
/*
* If the hold count is still positive, one or many threads could still
* be waiting on the vnode lock inside uvn_io().
*/
	if (vp->v_holdcnt == 0 && vp->v_lockcount != 0)
		panic("%s: lock count is not zero", __func__);
if (vp->v_bioflag & VBIOONFREELIST) {
vprint("vnode already on free list: ", vp);
panic("vnode already on free list");
}
#endif
vp->v_bioflag |= VBIOONFREELIST;
vp->v_bioflag &= ~VBIOERROR;
if (vp->v_holdcnt > 0)
lst = &vnode_hold_list;
else
lst = &vnode_free_list;
if (vp->v_type == VBAD)
TAILQ_INSERT_HEAD(lst, vp, v_freelist);
else
TAILQ_INSERT_TAIL(lst, vp, v_freelist);
splx(s);
}
/*
* vput(), just unlock and vrele()
*/
void
vput(struct vnode *vp)
{
struct proc *p = curproc;
int s;
#ifdef DIAGNOSTIC
if (vp == NULL)
panic("vput: null vp");
#endif
#ifdef DIAGNOSTIC
if (vp->v_usecount == 0) {
vprint("vput: bad ref count", vp);
panic("vput: ref cnt");
}
#endif
vp->v_usecount--;
KASSERT(vp->v_usecount > 0 || vp->v_uvcount == 0);
if (vp->v_usecount > 0) {
VOP_UNLOCK(vp);
return;
}
#ifdef DIAGNOSTIC
if (vp->v_writecount != 0) {
vprint("vput: bad writecount", vp);
panic("vput: v_writecount != 0");
}
#endif
VOP_INACTIVE(vp, p);
s = splbio();
	if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
		vputonfreelist(vp);
splx(s);
}
/*
* Vnode release - use for active VNODES.
* If count drops to zero, call inactive routine and return to freelist.
* Returns 0 if it did not sleep.
*/
int
vrele(struct vnode *vp)
{
struct proc *p = curproc;
int s;
#ifdef DIAGNOSTIC
if (vp == NULL)
panic("vrele: null vp");
#endif
#ifdef DIAGNOSTIC
if (vp->v_usecount == 0) {
vprint("vrele: bad ref count", vp);
panic("vrele: ref cnt");
}
#endif
vp->v_usecount--;
if (vp->v_usecount > 0) {
return (0);
}
#ifdef DIAGNOSTIC
if (vp->v_writecount != 0) {
vprint("vrele: bad writecount", vp);
panic("vrele: v_writecount != 0");
}
#endif
if (vn_lock(vp, LK_EXCLUSIVE)) {
#ifdef DIAGNOSTIC
vprint("vrele: cannot lock", vp);
#endif
return (1);
}
VOP_INACTIVE(vp, p);
s = splbio();
	if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
		vputonfreelist(vp);
splx(s);
return (1);
}
/* Page or buffer structure gets a reference. */
void
vhold(struct vnode *vp)
{
int s;
s = splbio();
/*
* If it is on the freelist and the hold count is currently
* zero, move it to the hold list.
*/
	if ((vp->v_bioflag & VBIOONFREELIST) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
}
vp->v_holdcnt++;
splx(s);
}
/* Lose interest in a vnode. */
void
vdrop(struct vnode *vp)
{
int s;
s = splbio();
#ifdef DIAGNOSTIC
if (vp->v_holdcnt == 0)
panic("vdrop: zero holdcnt");
#endif
vp->v_holdcnt--;
/*
* If it is on the holdlist and the hold count drops to
* zero, move it to the free list.
*/
if ((vp->v_bioflag & VBIOONFREELIST) &&
vp->v_holdcnt == 0 && vp->v_usecount == 0) {
TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
}
splx(s);
}
/*
* Remove any vnodes in the vnode table belonging to mount point mp.
*
* If MNT_NOFORCE is specified, there should not be any active ones,
* return error if any are found (nb: this is a user error, not a
* system error). If MNT_FORCE is specified, detach any active vnodes
* that are found.
*/
#ifdef DEBUG_SYSCTL
int busyprt = 0; /* print out busy vnodes */
struct ctldebug debug_vfs_busyprt = { "vfs_busyprt", &busyprt };
#endif
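/*
 * Apply func to every vnode on mp's vnode list, stopping at the first
 * error.  The traversal is restarted if a vnode has moved to another
 * mount point in the meantime.
 */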
int
vfs_mount_foreach_vnode(struct mount *mp,
    int (*func)(struct vnode *, void *), void *arg)
{
struct vnode *vp, *nvp;
int error = 0;
loop:
	TAILQ_FOREACH_SAFE(vp, &mp->mnt_vnodelist, v_mntvnodes, nvp) {
if (vp->v_mount != mp)
goto loop;
error = func(vp, arg);
if (error != 0)
break;
}
return (error);
}
struct vflush_args {
struct vnode *skipvp;
int busy;
int flags;
};
int
vflush_vnode(struct vnode *vp, void *arg)
{
struct vflush_args *va = arg;
struct proc *p = curproc;
int empty, s;
if (vp == va->skipvp) {
return (0);
}
if ((va->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
return (0);
}
/*
* If WRITECLOSE is set, only flush out regular file
* vnodes open for writing.
*/
if ((va->flags & WRITECLOSE) &&
(vp->v_writecount == 0 || vp->v_type != VREG)) {
return (0);
}
/*
* With v_usecount == 0, all we need to do is clear
* out the vnode data structures and we are done.
*/
if (vp->v_usecount == 0) {
vgonel(vp, p);
return (0);
}
/*
* If FORCECLOSE is set, forcibly close the vnode.
* For block or character devices, revert to an
* anonymous device. For all other files, just kill them.
*/
if (va->flags & FORCECLOSE) {
if (vp->v_type != VBLK && vp->v_type != VCHR) {
vgonel(vp, p);
} else {
vclean(vp, 0, p);
vp->v_op = &spec_vops;
insmntque(vp, NULL);
}
return (0);
}
/*
* If set, this is allowed to ignore vnodes which don't
* have changes pending to disk.
* XXX Might be nice to check per-fs "inode" flags, but
* generally the filesystem is sync'd already, right?
*/
s = splbio();
empty = (va->flags & IGNORECLEAN) && LIST_EMPTY(&vp->v_dirtyblkhd);
splx(s);
if (empty)
return (0);
#ifdef DEBUG_SYSCTL
if (busyprt)
vprint("vflush: busy vnode", vp);
#endif
va->busy++;
return (0);
}
int
vflush(struct mount *mp, struct vnode *skipvp, int flags)
{
struct vflush_args va;
va.skipvp = skipvp;
va.busy = 0;
va.flags = flags;
vfs_mount_foreach_vnode(mp, vflush_vnode, &va);
if (va.busy)
return (EBUSY);
return (0);
}
/*
* Disassociate the underlying file system from a vnode.
*/
void
vclean(struct vnode *vp, int flags, struct proc *p)
{
int active, do_wakeup = 0;
int s;
/*
* Check to see if the vnode is in use.
* If so we have to reference it before we clean it out
* so that its count cannot fall to zero and generate a
* race against ourselves to recycle it.
*/
	if ((active = vp->v_usecount) != 0)
		vp->v_usecount++;
/*
* Prevent the vnode from being recycled or
* brought into use while we clean it out.
*/
mtx_enter(&vnode_mtx);
if (vp->v_lflag & VXLOCK)
panic("vclean: deadlock");
vp->v_lflag |= VXLOCK;
if (vp->v_lockcount > 0) {
/*
* Ensure that any thread currently waiting on the same lock has
* observed that the vnode is about to be exclusively locked
* before continuing.
*/
msleep_nsec(&vp->v_lockcount, &vnode_mtx, PINOD, "vop_lock",
INFSLP);
KASSERT(vp->v_lockcount == 0);
}
mtx_leave(&vnode_mtx);
/*
* Even if the count is zero, the VOP_INACTIVE routine may still
* have the object locked while it cleans it out. The VOP_LOCK
* ensures that the VOP_INACTIVE routine is done with its work.
* For active vnodes, it ensures that no other activity can
* occur while the underlying object is being cleaned out.
*/
VOP_LOCK(vp, LK_EXCLUSIVE | LK_DRAIN);
/*
* Clean out any VM data associated with the vnode.
*/
uvm_vnp_terminate(vp);
/*
* Clean out any buffers associated with the vnode.
*/
if (flags & DOCLOSE)
vinvalbuf(vp, V_SAVE, NOCRED, p, 0, INFSLP);
/*
* If purging an active vnode, it must be closed and
* deactivated before being reclaimed. Note that the
* VOP_INACTIVE will unlock the vnode
*/
if (active) {
if (flags & DOCLOSE)
VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
VOP_INACTIVE(vp, p);
} else {
/*
* Any other processes trying to obtain this lock must first
* wait for VXLOCK to clear, then call the new lock operation.
*/
VOP_UNLOCK(vp);
}
/*
* Reclaim the vnode.
*/
if (VOP_RECLAIM(vp, p))
panic("vclean: cannot reclaim");
if (active) {
vp->v_usecount--;
if (vp->v_usecount == 0) {
s = splbio();
if (vp->v_holdcnt > 0)
panic("vclean: not clean"); vputonfreelist(vp);
splx(s);
}
}
cache_purge(vp);
/*
* Done with purge, notify sleepers of the grim news.
*/
vp->v_op = &dead_vops;
VN_KNOTE(vp, NOTE_REVOKE);
vp->v_tag = VT_NON;
#ifdef VFSLCKDEBUG
vp->v_flag &= ~VLOCKSWORK;
#endif
mtx_enter(&vnode_mtx);
vp->v_lflag &= ~VXLOCK;
if (vp->v_lflag & VXWANT) {
vp->v_lflag &= ~VXWANT;
do_wakeup = 1;
}
mtx_leave(&vnode_mtx);
	if (do_wakeup)
		wakeup(vp);
}
/*
* Recycle an unused vnode to the front of the free list.
*/
int
vrecycle(struct vnode *vp, struct proc *p)
{
	if (vp->v_usecount == 0) {
		vgonel(vp, p);
return (1);
}
return (0);
}
/*
* Eliminate all activity associated with a vnode
* in preparation for reuse.
*/
void
vgone(struct vnode *vp)
{
struct proc *p = curproc;
vgonel(vp, p);
}
/*
* vgone, with struct proc.
*/
void
vgonel(struct vnode *vp, struct proc *p)
{
struct vnode *vq;
struct vnode *vx;
int s;
KASSERT(vp->v_uvcount == 0);
/*
* If a vgone (or vclean) is already in progress,
* wait until it is done and return.
*/
mtx_enter(&vnode_mtx);
if (vp->v_lflag & VXLOCK) {
vp->v_lflag |= VXWANT;
msleep_nsec(vp, &vnode_mtx, PINOD, "vgone", INFSLP);
mtx_leave(&vnode_mtx);
return;
}
mtx_leave(&vnode_mtx);
/*
* Clean out the filesystem specific data.
*/
vclean(vp, DOCLOSE, p);
/*
* Delete from old mount point vnode list, if on one.
*/
	if (vp->v_mount != NULL)
		insmntque(vp, NULL);
/*
* If special device, remove it from special device alias list
* if it is on one.
*/
if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
vp->v_specinfo != NULL) {
		if ((vp->v_flag & VALIASED) == 0 && vp->v_type == VCHR &&
		    (cdevsw[major(vp->v_rdev)].d_flags & D_CLONE) &&
(minor(vp->v_rdev) >> CLONE_SHIFT == 0)) {
free(vp->v_specbitmap, M_VNODE, CLONE_MAPSZ);
}
		SLIST_REMOVE(vp->v_hashchain, vp, vnode, v_specnext);
		if (vp->v_flag & VALIASED) {
vx = NULL;
			SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
vq->v_type != vp->v_type)
continue;
if (vx)
break;
vx = vq;
}
if (vx == NULL)
panic("missing alias");
if (vq == NULL)
vx->v_flag &= ~VALIASED;
vp->v_flag &= ~VALIASED;
}
lf_purgelocks(&vp->v_speclockf);
free(vp->v_specinfo, M_VNODE, sizeof(struct specinfo));
vp->v_specinfo = NULL;
}
/*
* If it is on the freelist and not already at the head,
* move it to the head of the list.
*/
vp->v_type = VBAD;
/*
* Move onto the free list, unless we were called from
* getnewvnode and we're not on any free list
*/
s = splbio();
if (vp->v_usecount == 0 &&
(vp->v_bioflag & VBIOONFREELIST)) {
if (vp->v_holdcnt > 0)
panic("vgonel: not clean"); if (TAILQ_FIRST(&vnode_free_list) != vp) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
}
}
splx(s);
}
/*
* Lookup a vnode by device number.
*/
int
vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
{
struct vnode *vp;
	int rc = 0;
SLIST_FOREACH(vp, &speclisth[SPECHASH(dev)], v_specnext) {
if (dev != vp->v_rdev || type != vp->v_type)
continue;
*vpp = vp;
rc = 1;
break;
}
return (rc);
}
/*
* Revoke all the vnodes corresponding to the specified minor number
* range (endpoints inclusive) of the specified major.
*/
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
struct vnode *vp;
int mn;
for (mn = minl; mn <= minh; mn++)
if (vfinddev(makedev(maj, mn), type, &vp))
VOP_REVOKE(vp, REVOKEALL);
}
/*
* Calculate the total number of references to a special device.
*/
int
vcount(struct vnode *vp)
{
struct vnode *vq;
int count;
loop:
if ((vp->v_flag & VALIASED) == 0)
return (vp->v_usecount);
count = 0;
	SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
continue;
/*
* Alias, but not in use, so flush it out.
*/
if (vq->v_usecount == 0 && vq != vp) {
vgone(vq);
goto loop;
}
count += vq->v_usecount;
}
return (count);
}
#if defined(DEBUG) || defined(DIAGNOSTIC)
/*
* Print out a description of a vnode.
*/
static char *typename[] =
{ "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
void
vprint(char *label, struct vnode *vp)
{
char buf[64];
if (label != NULL)
printf("%s: ", label);
printf("%p, type %s, use %u, write %u, hold %u,",
vp, typename[vp->v_type], vp->v_usecount, vp->v_writecount,
vp->v_holdcnt);
buf[0] = '\0';
if (vp->v_flag & VROOT)
strlcat(buf, "|VROOT", sizeof buf);
if (vp->v_flag & VTEXT)
strlcat(buf, "|VTEXT", sizeof buf);
if (vp->v_flag & VSYSTEM)
strlcat(buf, "|VSYSTEM", sizeof buf);
if (vp->v_lflag & VXLOCK)
strlcat(buf, "|VXLOCK", sizeof buf);
if (vp->v_lflag & VXWANT)
strlcat(buf, "|VXWANT", sizeof buf);
if (vp->v_bioflag & VBIOWAIT)
strlcat(buf, "|VBIOWAIT", sizeof buf);
if (vp->v_bioflag & VBIOONFREELIST)
strlcat(buf, "|VBIOONFREELIST", sizeof buf);
if (vp->v_bioflag & VBIOONSYNCLIST)
strlcat(buf, "|VBIOONSYNCLIST", sizeof buf);
if (vp->v_flag & VALIASED)
strlcat(buf, "|VALIASED", sizeof buf);
if (buf[0] != '\0')
printf(" flags (%s)", &buf[1]);
if (vp->v_data == NULL) {
printf("\n");
} else {
printf("\n\t");
VOP_PRINT(vp);
}
}
#endif /* DEBUG || DIAGNOSTIC */
#ifdef DEBUG
/*
* List all of the locked vnodes in the system.
* Called when debugging the kernel.
*/
void
printlockedvnodes(void)
{
struct mount *mp;
struct vnode *vp;
printf("Locked vnodes\n");
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (vfs_busy(mp, VB_READ|VB_NOWAIT))
continue;
TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
if (VOP_ISLOCKED(vp))
vprint(NULL, vp);
}
vfs_unbusy(mp);
}
}
#endif
/*
* Top level filesystem related information gathering.
*/
int
vfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
struct vfsconf *vfsp, *tmpvfsp;
int ret;
/* all sysctl names at this level are at least name and field */
if (namelen < 2)
return (ENOTDIR); /* overloaded */
if (name[0] != VFS_GENERIC) {
vfsp = vfs_bytypenum(name[0]);
if (vfsp == NULL || vfsp->vfc_vfsops->vfs_sysctl == NULL)
return (EOPNOTSUPP);
return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
oldp, oldlenp, newp, newlen, p));
}
switch (name[1]) {
case VFS_MAXTYPENUM:
return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
case VFS_CONF:
if (namelen < 3)
return (ENOTDIR); /* overloaded */
vfsp = vfs_bytypenum(name[2]);
if (vfsp == NULL)
return (EOPNOTSUPP);
/* Make a copy, clear out kernel pointers */
tmpvfsp = malloc(sizeof(*tmpvfsp), M_TEMP, M_WAITOK|M_ZERO);
memcpy(tmpvfsp, vfsp, sizeof(*tmpvfsp));
tmpvfsp->vfc_vfsops = NULL;
ret = sysctl_rdstruct(oldp, oldlenp, newp, tmpvfsp,
sizeof(struct vfsconf));
free(tmpvfsp, M_TEMP, sizeof(*tmpvfsp));
return (ret);
case VFS_BCACHESTAT: /* buffer cache statistics */
ret = sysctl_rdstruct(oldp, oldlenp, newp, &bcstats,
sizeof(struct bcachestats));
return(ret);
}
return (EOPNOTSUPP);
}
/*
* Check to see if a filesystem is mounted on a block device.
*/
int
vfs_mountedon(struct vnode *vp)
{
struct vnode *vq;
int error = 0;
if (vp->v_specmountpoint != NULL)
return (EBUSY);
if (vp->v_flag & VALIASED) {
SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
if (vq->v_rdev != vp->v_rdev ||
vq->v_type != vp->v_type)
continue;
if (vq->v_specmountpoint != NULL) {
error = EBUSY;
break;
}
}
}
return (error);
}
#ifdef NFSSERVER
/*
* Build hash lists of net addresses and hang them off the mount point.
* Called by vfs_export() to set up the lists of export addresses.
*/
int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
struct export_args *argp)
{
struct netcred *np;
struct radix_node_head *rnh;
int nplen, i;
struct radix_node *rn;
struct sockaddr *saddr, *smask = NULL;
int error;
if (argp->ex_addrlen == 0) {
if (mp->mnt_flag & MNT_DEFEXPORTED)
return (EPERM);
np = &nep->ne_defexported;
/* fill in the kernel's ucred from userspace's xucred */
if ((error = crfromxucred(&np->netc_anon, &argp->ex_anon)))
return (error);
mp->mnt_flag |= MNT_DEFEXPORTED;
goto finish;
}
if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN ||
argp->ex_addrlen < 0 || argp->ex_masklen < 0)
return (EINVAL);
nplen = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
np = (struct netcred *)malloc(nplen, M_NETADDR, M_WAITOK|M_ZERO);
np->netc_len = nplen;
saddr = (struct sockaddr *)(np + 1);
error = copyin(argp->ex_addr, saddr, argp->ex_addrlen);
if (error)
goto out;
if (saddr->sa_len > argp->ex_addrlen)
saddr->sa_len = argp->ex_addrlen;
if (argp->ex_masklen) {
smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
error = copyin(argp->ex_mask, smask, argp->ex_masklen);
if (error)
goto out;
if (smask->sa_len > argp->ex_masklen)
smask->sa_len = argp->ex_masklen;
}
/* fill in the kernel's ucred from userspace's xucred */
if ((error = crfromxucred(&np->netc_anon, &argp->ex_anon)))
goto out;
i = saddr->sa_family;
switch (i) {
case AF_INET:
if ((rnh = nep->ne_rtable_inet) == NULL) {
if (!rn_inithead((void **)&nep->ne_rtable_inet,
offsetof(struct sockaddr_in, sin_addr))) {
error = ENOBUFS;
goto out;
}
rnh = nep->ne_rtable_inet;
}
break;
default:
error = EINVAL;
goto out;
}
rn = rn_addroute(saddr, smask, rnh, np->netc_rnodes, 0);
if (rn == NULL || np != (struct netcred *)rn) { /* already exists */
error = EPERM;
goto out;
}
finish:
np->netc_exflags = argp->ex_flags;
return (0);
out:
free(np, M_NETADDR, np->netc_len);
return (error);
}
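/*
 * rn_walktree() callback: delete a network credential entry from the
 * export radix tree and free it.
 */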
int
vfs_free_netcred(struct radix_node *rn, void *w, u_int id)
{
struct radix_node_head *rnh = (struct radix_node_head *)w;
struct netcred * np = (struct netcred *)rn;
rn_delete(rn->rn_key, rn->rn_mask, rnh, NULL);
free(np, M_NETADDR, np->netc_len);
return (0);
}
/*
* Free the net address hash lists that are hanging off the mount points.
*/
void
vfs_free_addrlist(struct netexport *nep)
{
struct radix_node_head *rnh;
if ((rnh = nep->ne_rtable_inet) != NULL) {
rn_walktree(rnh, vfs_free_netcred, rnh);
free(rnh, M_RTABLE, sizeof(*rnh));
nep->ne_rtable_inet = NULL;
}
}
#endif /* NFSSERVER */
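/*
 * Update the export list of a mount point: delete the existing export
 * addresses and/or hang a new address list, as requested by argp.
 */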
int
vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
{
#ifdef NFSSERVER
int error;
if (argp->ex_flags & MNT_DELEXPORT) {
vfs_free_addrlist(nep);
mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
}
if (argp->ex_flags & MNT_EXPORTED) {
if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
return (error);
mp->mnt_flag |= MNT_EXPORTED;
}
return (0);
#else
return (ENOTSUP);
#endif /* NFSSERVER */
}
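/*
 * Look up the export credentials that apply to the client address in
 * nam, falling back to the default export entry if one exists.
 */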
struct netcred *
vfs_export_lookup(struct mount *mp, struct netexport *nep, struct mbuf *nam)
{
#ifdef NFSSERVER
struct netcred *np;
struct radix_node_head *rnh;
struct sockaddr *saddr;
np = NULL;
if (mp->mnt_flag & MNT_EXPORTED) {
/*
* Lookup in the export list first.
*/
if (nam != NULL) {
saddr = mtod(nam, struct sockaddr *);
switch(saddr->sa_family) {
case AF_INET:
rnh = nep->ne_rtable_inet;
break;
default:
rnh = NULL;
break;
}
if (rnh != NULL)
np = (struct netcred *)rn_match(saddr, rnh);
}
/*
* If no address match, use the default if it exists.
*/
if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
np = &nep->ne_defexported;
}
return (np);
#else
return (NULL);
#endif /* NFSSERVER */
}
/*
* Do the usual access checking.
* file_mode, uid and gid are from the vnode in question,
* while acc_mode and cred are from the VOP_ACCESS parameter list
*/
int
vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
mode_t acc_mode, struct ucred *cred)
{
mode_t mask;
/* User id 0 always gets read/write access. */
if (cred->cr_uid == 0) {
/* For VEXEC, at least one of the execute bits must be set. */
if ((acc_mode & VEXEC) && type != VDIR &&
(file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
return EACCES;
return 0;
}
mask = 0;
/* Otherwise, check the owner. */
if (cred->cr_uid == uid) {
if (acc_mode & VEXEC)
mask |= S_IXUSR;
if (acc_mode & VREAD)
mask |= S_IRUSR;
if (acc_mode & VWRITE)
mask |= S_IWUSR;
return (file_mode & mask) == mask ? 0 : EACCES;
}
/* Otherwise, check the groups. */
if (groupmember(gid, cred)) {
if (acc_mode & VEXEC)
mask |= S_IXGRP;
if (acc_mode & VREAD)
mask |= S_IRGRP;
if (acc_mode & VWRITE)
mask |= S_IWGRP;
return (file_mode & mask) == mask ? 0 : EACCES;
}
/* Otherwise, check everyone else. */
if (acc_mode & VEXEC)
mask |= S_IXOTH;
if (acc_mode & VREAD)
mask |= S_IROTH;
if (acc_mode & VWRITE)
mask |= S_IWOTH;
return (file_mode & mask) == mask ? 0 : EACCES;
}
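/*
 * Check whether the vnode's mount point has MNT_NOPERM set.  The root
 * vnode of a mount and vnodes without a mount point are exempt.
 */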
int
vnoperm(struct vnode *vp)
{
	if (vp->v_flag & VROOT || vp->v_mount == NULL)
return 0;
return (vp->v_mount->mnt_flag & MNT_NOPERM);
}
struct rwlock vfs_stall_lock = RWLOCK_INITIALIZER("vfs_stall");
unsigned int vfs_stalling = 0;
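/*
 * Stall or resume all mounted file systems.  Stalling syncs every
 * mount and keeps it busied; threads calling vfs_stall_barrier()
 * block until the stall is lifted.
 */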
int
vfs_stall(struct proc *p, int stall)
{
struct mount *mp;
int allerror = 0, error;
if (stall) {
atomic_inc_int(&vfs_stalling);
rw_enter_write(&vfs_stall_lock);
}
/*
* The loop variable mp is protected by vfs_busy() so that it cannot
* be unmounted while VFS_SYNC() sleeps. Traverse forward to keep the
* lock order consistent with dounmount().
*/
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (stall) {
error = vfs_busy(mp, VB_WRITE|VB_WAIT|VB_DUPOK);
if (error) {
printf("%s: busy\n", mp->mnt_stat.f_mntonname);
allerror = error;
continue;
}
uvm_vnp_sync(mp);
error = VFS_SYNC(mp, MNT_WAIT, stall, p->p_ucred, p);
if (error) {
printf("%s: failed to sync\n",
mp->mnt_stat.f_mntonname);
vfs_unbusy(mp);
allerror = error;
continue;
}
mp->mnt_flag |= MNT_STALLED;
} else {
if (mp->mnt_flag & MNT_STALLED) {
vfs_unbusy(mp);
mp->mnt_flag &= ~MNT_STALLED;
}
}
}
if (!stall) {
rw_exit_write(&vfs_stall_lock);
atomic_dec_int(&vfs_stalling);
}
return (allerror);
}
void
vfs_stall_barrier(void)
{
	if (__predict_false(vfs_stalling)) {
		rw_enter_read(&vfs_stall_lock);
rw_exit_read(&vfs_stall_lock);
}
}
/*
* Unmount all file systems.
* We traverse the list in reverse order under the assumption that doing so
* will avoid needing to worry about dependencies.
*/
void
vfs_unmountall(void)
{
struct mount *mp, *nmp;
int allerror, error, again = 1;
retry:
allerror = 0;
TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, nmp) {
if (vfs_busy(mp, VB_WRITE|VB_NOWAIT))
continue;
/* XXX Here is a race, the next pointer is not locked. */
if ((error = dounmount(mp, MNT_FORCE, curproc)) != 0) {
printf("unmount of %s failed with error %d\n",
mp->mnt_stat.f_mntonname, error);
allerror = 1;
}
}
if (allerror) {
printf("WARNING: some file systems would not unmount\n");
if (again) {
printf("retrying\n");
again = 0;
goto retry;
}
}
}
/*
* Sync and unmount file systems before shutting down.
*/
void
vfs_shutdown(struct proc *p)
{
#ifdef ACCOUNTING
acct_shutdown();
#endif
printf("syncing disks...");
if (panicstr == NULL) {
/* Sync before unmount, in case we hang on something. */
sys_sync(p, NULL, NULL);
vfs_unmountall();
}
#if NSOFTRAID > 0
sr_quiesce();
#endif
if (vfs_syncwait(p, 1))
printf(" giving up\n");
else
printf(" done\n");
}
/*
* perform sync() operation and wait for buffers to flush.
*/
int
vfs_syncwait(struct proc *p, int verbose)
{
struct buf *bp;
int iter, nbusy, dcount, s;
#ifdef MULTIPROCESSOR
int hold_count;
#endif
sys_sync(p, NULL, NULL);
/* Wait for sync to finish. */
dcount = 10000;
for (iter = 0; iter < 20; iter++) {
nbusy = 0;
LIST_FOREACH(bp, &bufhead, b_list) {
if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
nbusy++;
/*
* With soft updates, some buffers that are
* written will be remarked as dirty until other
* buffers are written.
*/
if (bp->b_flags & B_DELWRI) {
s = splbio();
bremfree(bp);
buf_acquire(bp);
splx(s);
nbusy++;
bawrite(bp);
if (dcount-- <= 0) {
if (verbose)
printf("softdep ");
return 1;
}
}
}
if (nbusy == 0)
break;
if (verbose)
printf("%d ", nbusy);
#ifdef MULTIPROCESSOR
if (_kernel_lock_held())
hold_count = __mp_release_all(&kernel_lock);
else
hold_count = 0;
#endif
DELAY(40000 * iter);
#ifdef MULTIPROCESSOR
if (hold_count)
__mp_acquire_count(&kernel_lock, hold_count);
#endif
}
return nbusy;
}
/*
* posix file system related system variables.
*/
int
fs_posix_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen, struct proc *p)
{
/* all sysctl names at this level are terminal */
if (namelen != 1)
return (ENOTDIR);
switch (name[0]) {
case FS_POSIX_SETUID:
return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
&suid_clear));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
/*
* file system related system variables.
*/
int
fs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
sysctlfn *fn;
switch (name[0]) {
case FS_POSIX:
fn = fs_posix_sysctl;
break;
default:
return (EOPNOTSUPP);
}
return (*fn)(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p);
}
/*
* Routines dealing with vnodes and buffers
*/
/*
* Wait for all outstanding I/Os to complete
*
* Manipulates v_numoutput. Must be called at splbio()
*/
int
vwaitforio(struct vnode *vp, int slpflag, char *wmesg, uint64_t timeo)
{
int error = 0;
	splassert(IPL_BIO);
	while (vp->v_numoutput) {
vp->v_bioflag |= VBIOWAIT;
error = tsleep_nsec(&vp->v_numoutput,
slpflag | (PRIBIO + 1), wmesg, timeo);
if (error)
break;
}
return (error);
}
/*
* Update outstanding I/O count and do wakeup if requested.
*
* Manipulates v_numoutput. Must be called at splbio()
*/
void
vwakeup(struct vnode *vp)
{
splassert(IPL_BIO);
if (vp != NULL) {
if (vp->v_numoutput-- == 0)
panic("vwakeup: neg numoutput");
if ((vp->v_bioflag & VBIOWAIT) && vp->v_numoutput == 0) {
vp->v_bioflag &= ~VBIOWAIT;
wakeup(&vp->v_numoutput);
}
}
}
/*
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying object locked.
*/
int
vinvalbuf(struct vnode *vp, int flags, struct ucred *cred, struct proc *p,
int slpflag, uint64_t slptimeo)
{
struct buf *bp;
struct buf *nbp, *blist;
int s, error;
#ifdef VFSLCKDEBUG
	if ((vp->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp))
		panic("%s: vp isn't locked, vp %p", __func__, vp);
#endif
if (flags & V_SAVE) {
s = splbio();
vwaitforio(vp, 0, "vinvalbuf", INFSLP);
if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
splx(s);
if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
return (error);
s = splbio();
if (vp->v_numoutput > 0 ||
!LIST_EMPTY(&vp->v_dirtyblkhd))
panic("%s: dirty bufs, vp %p", __func__, vp);
}
splx(s);
}
loop:
s = splbio();
for (;;) {
int count = 0;
if ((blist = LIST_FIRST(&vp->v_cleanblkhd)) &&
(flags & V_SAVEMETA))
while (blist && blist->b_lblkno < 0)
blist = LIST_NEXT(blist, b_vnbufs);
if (blist == NULL &&
(blist = LIST_FIRST(&vp->v_dirtyblkhd)) &&
(flags & V_SAVEMETA))
while (blist && blist->b_lblkno < 0)
blist = LIST_NEXT(blist, b_vnbufs);
if (!blist)
break;
for (bp = blist; bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
if (flags & V_SAVEMETA && bp->b_lblkno < 0)
continue;
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
error = tsleep_nsec(bp, slpflag | (PRIBIO + 1),
"vinvalbuf", slptimeo);
if (error) {
splx(s);
return (error);
}
break;
}
bremfree(bp);
/*
* XXX Since there are no node locks for NFS, I believe
* there is a slight chance that a delayed write will
* occur while sleeping just above, so check for it.
*/
if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
buf_acquire(bp);
splx(s);
(void) VOP_BWRITE(bp);
goto loop;
}
buf_acquire_nomap(bp);
bp->b_flags |= B_INVAL;
brelse(bp);
count++;
/*
* XXX Temporary workaround XXX
*
* If this is a gigantisch vnode and we are
* trashing a ton of buffers, drop the lock
* and yield every so often. The longer term
* fix is to add a separate list for these
* invalid buffers so we don't have to do the
* work to free these here.
*/
if (count > 100) {
splx(s);
sched_pause(yield);
goto loop;
}
}
}
	if (!(flags & V_SAVEMETA) &&
	    (!LIST_EMPTY(&vp->v_dirtyblkhd) || !LIST_EMPTY(&vp->v_cleanblkhd)))
panic("%s: flush failed, vp %p", __func__, vp);
splx(s);
return (0);
}
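/*
 * Start output on all dirty buffers associated with a vnode.  If sync
 * is set, also wait for the writes to complete and repeat until the
 * dirty list is empty.
 */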
void
vflushbuf(struct vnode *vp, int sync)
{
struct buf *bp, *nbp;
int s;
loop:
s = splbio();
LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) {
if ((bp->b_flags & B_BUSY))
continue;
if ((bp->b_flags & B_DELWRI) == 0)
panic("vflushbuf: not dirty");
bremfree(bp);
buf_acquire(bp);
splx(s);
/*
* Wait for I/O associated with indirect blocks to complete,
* since there is no way to quickly wait for them below.
*/
if (bp->b_vp == vp || sync == 0)
(void) bawrite(bp);
else
(void) bwrite(bp);
goto loop;
}
if (sync == 0) {
splx(s);
return;
}
vwaitforio(vp, 0, "vflushbuf", INFSLP);
if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
splx(s);
#ifdef DIAGNOSTIC
vprint("vflushbuf: dirty", vp);
#endif
goto loop;
}
splx(s);
}
/*
* Associate a buffer with a vnode.
*
* Manipulates buffer vnode queues. Must be called at splbio().
*/
void
bgetvp(struct vnode *vp, struct buf *bp)
{
	splassert(IPL_BIO);
if (bp->b_vp)
panic("bgetvp: not free");
vhold(vp);
bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
else
bp->b_dev = NODEV;
/*
* Insert onto list for new vnode.
*/
bufinsvn(bp, &vp->v_cleanblkhd);
}
/*
* Disassociate a buffer from a vnode.
*
* Manipulates vnode buffer queues. Must be called at splbio().
*/
void
brelvp(struct buf *bp)
{
struct vnode *vp;
splassert(IPL_BIO);
if ((vp = bp->b_vp) == (struct vnode *) 0)
panic("brelvp: NULL");
/*
* Delete from old vnode list, if on one.
*/
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);
	if ((vp->v_bioflag & VBIOONSYNCLIST) &&
LIST_EMPTY(&vp->v_dirtyblkhd)) {
vp->v_bioflag &= ~VBIOONSYNCLIST;
LIST_REMOVE(vp, v_synclist);
}
bp->b_vp = NULL;
vdrop(vp);
}
/*
* Replaces the current vnode associated with the buffer, if any,
* with a new vnode.
*
* If an output I/O is pending on the buffer, the old vnode
* I/O count is adjusted.
*
* Ignores vnode buffer queues. Must be called at splbio().
*/
void
buf_replacevnode(struct buf *bp, struct vnode *newvp)
{
struct vnode *oldvp = bp->b_vp;
splassert(IPL_BIO);
if (oldvp)
brelvp(bp);
if ((bp->b_flags & (B_READ | B_DONE)) == 0) {
newvp->v_numoutput++; /* put it on swapdev */
vwakeup(oldvp);
}
bgetvp(newvp, bp);
bufremvn(bp);
}
/*
* Used to assign buffers to the appropriate clean or dirty list on
* the vnode and to add newly dirty vnodes to the appropriate
* filesystem syncer list.
*
* Manipulates vnode buffer queues. Must be called at splbio().
*/
void
reassignbuf(struct buf *bp)
{
struct buflists *listheadp;
int delay;
struct vnode *vp = bp->b_vp;
splassert(IPL_BIO);
/*
* Delete from old vnode list, if on one.
*/
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);
/*
* If dirty, put on list of dirty buffers;
* otherwise insert onto list of clean buffers.
*/
if ((bp->b_flags & B_DELWRI) == 0) {
listheadp = &vp->v_cleanblkhd;
if ((vp->v_bioflag & VBIOONSYNCLIST) &&
LIST_EMPTY(&vp->v_dirtyblkhd)) {
vp->v_bioflag &= ~VBIOONSYNCLIST;
LIST_REMOVE(vp, v_synclist);
}
} else {
listheadp = &vp->v_dirtyblkhd;
		if ((vp->v_bioflag & VBIOONSYNCLIST) == 0) {
			switch (vp->v_type) {
case VDIR:
delay = syncdelay / 2;
break;
case VBLK:
			if (vp->v_specmountpoint != NULL) {
				delay = syncdelay / 3;
break;
}
/* FALLTHROUGH */
default:
delay = syncdelay;
}
vn_syncer_add_to_worklist(vp, delay);
}
}
bufinsvn(bp, listheadp);
}
/*
* Check if vnode represents a disk device
*/
int
vn_isdisk(struct vnode *vp, int *errp)
{
if (vp->v_type != VBLK && vp->v_type != VCHR)
return (0);
return (1);
}
#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
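/*
 * ddb(4) helpers for printing buffers, vnodes and mount points.
 */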
void
vfs_buf_print(void *b, int full,
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
struct buf *bp = b;
(*pr)(" vp %p lblkno 0x%llx blkno 0x%llx dev 0x%x\n"
" proc %p error %d flags %lb\n",
bp->b_vp, (int64_t)bp->b_lblkno, (int64_t)bp->b_blkno, bp->b_dev,
bp->b_proc, bp->b_error, bp->b_flags, B_BITS);
(*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n"
" data %p saveaddr %p dep %p iodone %p\n",
bp->b_bufsize, bp->b_bcount, (long)bp->b_resid,
bp->b_data, bp->b_saveaddr,
LIST_FIRST(&bp->b_dep), bp->b_iodone);
(*pr)(" dirty {off 0x%x end 0x%x} valid {off 0x%x end 0x%x}\n",
bp->b_dirtyoff, bp->b_dirtyend, bp->b_validoff, bp->b_validend);
#ifdef FFS_SOFTUPDATES
if (full)
softdep_print(bp, full, pr);
#endif
}
const char *vtypes[] = { VTYPE_NAMES };
const char *vtags[] = { VTAG_NAMES };
void
vfs_vnode_print(void *v, int full,
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
struct vnode *vp = v;
(*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
(u_int)vp->v_tag >= nitems(vtags)? "<unk>":vtags[vp->v_tag],
vp->v_tag,
(u_int)vp->v_type >= nitems(vtypes)? "<unk>":vtypes[vp->v_type],
vp->v_type, vp->v_mount, vp->v_mountedhere);
(*pr)("data %p usecount %d writecount %d holdcnt %d numoutput %d\n",
vp->v_data, vp->v_usecount, vp->v_writecount,
vp->v_holdcnt, vp->v_numoutput);
/* uvm_object_printit(&vp->v_uobj, full, pr); */
if (full) {
struct buf *bp;
(*pr)("clean bufs:\n");
LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
(*pr)(" bp %p\n", bp);
vfs_buf_print(bp, full, pr);
}
(*pr)("dirty bufs:\n");
LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
(*pr)(" bp %p\n", bp);
vfs_buf_print(bp, full, pr);
}
}
}
void
vfs_mount_print(struct mount *mp, int full,
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
struct vfsconf *vfc = mp->mnt_vfc;
struct vnode *vp;
int cnt;
(*pr)("flags %b\nvnodecovered %p syncer %p data %p\n",
mp->mnt_flag, MNT_BITS,
mp->mnt_vnodecovered, mp->mnt_syncer, mp->mnt_data);
(*pr)("vfsconf: ops %p name \"%s\" num %d ref %u flags 0x%x\n",
vfc->vfc_vfsops, vfc->vfc_name, vfc->vfc_typenum,
vfc->vfc_refcount, vfc->vfc_flags);
(*pr)("statvfs cache: bsize %x iosize %x\n"
"blocks %llu free %llu avail %lld\n",
mp->mnt_stat.f_bsize, mp->mnt_stat.f_iosize, mp->mnt_stat.f_blocks,
mp->mnt_stat.f_bfree, mp->mnt_stat.f_bavail);
(*pr)(" files %llu ffiles %llu favail %lld\n", mp->mnt_stat.f_files,
mp->mnt_stat.f_ffree, mp->mnt_stat.f_favail);
(*pr)(" f_fsidx {0x%x, 0x%x} owner %u ctime 0x%llx\n",
mp->mnt_stat.f_fsid.val[0], mp->mnt_stat.f_fsid.val[1],
mp->mnt_stat.f_owner, mp->mnt_stat.f_ctime);
(*pr)(" syncwrites %llu asyncwrites = %llu\n",
mp->mnt_stat.f_syncwrites, mp->mnt_stat.f_asyncwrites);
(*pr)(" syncreads %llu asyncreads = %llu\n",
mp->mnt_stat.f_syncreads, mp->mnt_stat.f_asyncreads);
(*pr)(" fstype \"%s\" mnton \"%s\" mntfrom \"%s\" mntspec \"%s\"\n",
mp->mnt_stat.f_fstypename, mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromspec);
(*pr)("locked vnodes:");
/* XXX would take mountlist lock, except ddb has no context */
cnt = 0;
TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
if (VOP_ISLOCKED(vp)) {
if (cnt == 0)
(*pr)("\n %p", vp);
else if ((cnt % (72 / (sizeof(void *) * 2 + 4))) == 0)
(*pr)(",\n %p", vp);
else
(*pr)(", %p", vp);
cnt++;
}
}
(*pr)("\n");
if (full) {
(*pr)("all vnodes:");
/* XXX would take mountlist lock, except ddb has no context */
cnt = 0;
TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
if (cnt == 0)
(*pr)("\n %p", vp);
else if ((cnt % (72 / (sizeof(void *) * 2 + 4))) == 0)
(*pr)(",\n %p", vp);
else
(*pr)(", %p", vp);
cnt++;
}
(*pr)("\n");
}
}
#endif /* DDB */
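/*
 * Copy the cached statfs information of a mount point into sbp,
 * refreshing the file system type name.  If sbp is the mount point's
 * own mnt_stat, only the name is updated.
 */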
void
copy_statfs_info(struct statfs *sbp, const struct mount *mp)
{
const struct statfs *mbp;
strncpy(sbp->f_fstypename, mp->mnt_vfc->vfc_name, MFSNAMELEN);
if (sbp == (mbp = &mp->mnt_stat))
return;
sbp->f_fsid = mbp->f_fsid;
sbp->f_owner = mbp->f_owner;
sbp->f_flags = mbp->f_flags;
sbp->f_syncwrites = mbp->f_syncwrites;
sbp->f_asyncwrites = mbp->f_asyncwrites;
sbp->f_syncreads = mbp->f_syncreads;
sbp->f_asyncreads = mbp->f_asyncreads;
sbp->f_namemax = mbp->f_namemax;
memcpy(sbp->f_mntonname, mp->mnt_stat.f_mntonname, MNAMELEN);
memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, MNAMELEN);
memcpy(sbp->f_mntfromspec, mp->mnt_stat.f_mntfromspec, MNAMELEN);
memcpy(&sbp->mount_info, &mp->mnt_stat.mount_info,
sizeof(union mount_info));
}
/* $OpenBSD: ip_input.c,v 1.381 2022/08/29 14:43:56 bluhm Exp $ */
/* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
#include "pf.h"
#include "carp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/pool.h>
#include <sys/task.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <net/if_types.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#if NPF > 0
#include <net/pfvar.h>
#endif
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif
#ifdef IPSEC
#include <netinet/ip_ipsp.h>
#endif /* IPSEC */
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
/* values controllable via sysctl */
int ipforwarding = 0;
int ipmforwarding = 0;
int ipmultipath = 0;
int ipsendredirects = 1;
int ip_dosourceroute = 0;
int ip_defttl = IPDEFTTL;
int ip_mtudisc = 1;
int ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
int ip_directedbcast = 0;
/* Protects `ipq' and `ip_frags'. */
struct mutex ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);
/* IP reassembly queue */
LIST_HEAD(, ipq) ipq;
/* Keep track of memory used for reassembly */
int ip_maxqueue = 300;
int ip_frags = 0;
const struct sysctl_bounded_args ipctl_vars[] = {
#ifdef MROUTING
{ IPCTL_MRTPROTO, &ip_mrtproto, SYSCTL_INT_READONLY },
#endif
{ IPCTL_FORWARDING, &ipforwarding, 0, 2 },
{ IPCTL_SENDREDIRECTS, &ipsendredirects, 0, 1 },
{ IPCTL_DEFTTL, &ip_defttl, 0, 255 },
{ IPCTL_DIRECTEDBCAST, &ip_directedbcast, 0, 1 },
{ IPCTL_IPPORT_FIRSTAUTO, &ipport_firstauto, 0, 65535 },
{ IPCTL_IPPORT_LASTAUTO, &ipport_lastauto, 0, 65535 },
{ IPCTL_IPPORT_HIFIRSTAUTO, &ipport_hifirstauto, 0, 65535 },
{ IPCTL_IPPORT_HILASTAUTO, &ipport_hilastauto, 0, 65535 },
{ IPCTL_IPPORT_MAXQUEUE, &ip_maxqueue, 0, 10000 },
{ IPCTL_MFORWARDING, &ipmforwarding, 0, 1 },
{ IPCTL_MULTIPATH, &ipmultipath, 0, 1 },
{ IPCTL_ARPTIMEOUT, &arpt_keep, 0, INT_MAX },
{ IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX },
};
struct niqueue ipintrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IP);
struct pool ipqent_pool;
struct pool ipq_pool;
struct cpumem *ipcounters;
int ip_sysctl_ipstat(void *, size_t *, void *);
static struct mbuf_queue ipsend_mq;
static struct mbuf_queue ipsendraw_mq;
extern struct niqueue arpinq;
int ip_ours(struct mbuf **, int *, int, int);
int ip_dooptions(struct mbuf *, struct ifnet *);
int in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);
int ip_fragcheck(struct mbuf **, int *);
struct mbuf * ip_reass(struct ipqent *, struct ipq *);
void ip_freef(struct ipq *);
void ip_flush(void);
static void ip_send_dispatch(void *);
static void ip_sendraw_dispatch(void *);
static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, &ipsend_mq);
static struct task ipsendraw_task =
TASK_INITIALIZER(ip_sendraw_dispatch, &ipsendraw_mq);
/*
* Used to save the IP options in case a protocol wants to respond
* to an incoming packet over the same route if the packet got here
* using IP source routing. This allows connection establishment and
* maintenance when the remote end is on a network that is not known
* to us.
*/
struct ip_srcrt {
int isr_nhops; /* number of hops */
struct in_addr isr_dst; /* final destination */
char isr_nop; /* one NOP to align */
char isr_hdr[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN & OFFSET */
struct in_addr isr_routes[MAX_IPOPTLEN/sizeof(struct in_addr)];
};
void save_rte(struct mbuf *, u_char *, struct in_addr);
/*
* IP initialization: fill in IP protocol switch table.
* All protocols not implemented in kernel go to raw IP protocol handler.
*/
void
ip_init(void)
{
const struct protosw *pr;
int i;
const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP;
const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP;
const u_int16_t defrootonlyports_tcp[] = DEFROOTONLYPORTS_TCP;
const u_int16_t defrootonlyports_udp[] = DEFROOTONLYPORTS_UDP;
ipcounters = counters_alloc(ips_ncounters);
pool_init(&ipqent_pool, sizeof(struct ipqent), 0,
IPL_SOFTNET, 0, "ipqe", NULL);
pool_init(&ipq_pool, sizeof(struct ipq), 0,
IPL_SOFTNET, 0, "ipq", NULL);
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
if (pr == NULL)
panic("ip_init");
for (i = 0; i < IPPROTO_MAX; i++)
ip_protox[i] = pr - inetsw;
for (pr = inetdomain.dom_protosw;
pr < inetdomain.dom_protoswNPROTOSW; pr++)
if (pr->pr_domain->dom_family == PF_INET &&
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW &&
pr->pr_protocol < IPPROTO_MAX)
ip_protox[pr->pr_protocol] = pr - inetsw;
LIST_INIT(&ipq);
/* Fill in list of ports not to allocate dynamically. */
memset(&baddynamicports, 0, sizeof(baddynamicports));
for (i = 0; defbaddynamicports_tcp[i] != 0; i++)
DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]);
for (i = 0; defbaddynamicports_udp[i] != 0; i++)
DP_SET(baddynamicports.udp, defbaddynamicports_udp[i]);
/* Fill in list of ports only root can bind to. */
memset(&rootonlyports, 0, sizeof(rootonlyports));
for (i = 0; defrootonlyports_tcp[i] != 0; i++)
DP_SET(rootonlyports.tcp, defrootonlyports_tcp[i]);
for (i = 0; defrootonlyports_udp[i] != 0; i++)
DP_SET(rootonlyports.udp, defrootonlyports_udp[i]);
mq_init(&ipsend_mq, 64, IPL_SOFTNET);
mq_init(&ipsendraw_mq, 64, IPL_SOFTNET);
arpinit();
#ifdef IPSEC
ipsec_init();
#endif
#ifdef MROUTING
rt_timer_queue_init(&ip_mrouterq, MCAST_EXPIRE_FREQUENCY,
&mfc_expire_route);
#endif
}
/*
* Enqueue packet for local delivery. Queuing is used as a boundary
* between the network layer (input/forward path) running with
* NET_LOCK_SHARED() and the transport layer needing it exclusively.
*/
int
ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
{
nxt = ip_fragcheck(mp, offp);
if (nxt == IPPROTO_DONE)
return IPPROTO_DONE;
/* We are already in an IPv4/IPv6 local deliver loop. */
if (af != AF_UNSPEC)
return nxt;
niq_enqueue(&ipintrq, *mp);
*mp = NULL;
return IPPROTO_DONE;
}
/*
* Dequeue and process locally delivered packets.
* This is called with exclusive NET_LOCK().
*/
void
ipintr(void)
{
struct mbuf *m;
while ((m = niq_dequeue(&ipintrq)) != NULL) {
struct ip *ip;
int off, nxt;
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("ipintr no HDR");
#endif
ip = mtod(m, struct ip *);
off = ip->ip_hl << 2;
nxt = ip->ip_p;
nxt = ip_deliver(&m, &off, nxt, AF_INET);
KASSERT(nxt == IPPROTO_DONE);
}
}
/*
* IPv4 input routine.
*
* Checksum and byte swap header. Process options. Forward or deliver.
*/
void
ipv4_input(struct ifnet *ifp, struct mbuf *m)
{
int off, nxt;
off = 0;
nxt = ip_input_if(&m, &off, IPPROTO_IPV4, AF_UNSPEC, ifp);
KASSERT(nxt == IPPROTO_DONE);
}
struct mbuf *
ipv4_check(struct ifnet *ifp, struct mbuf *m)
{
struct ip *ip;
int hlen, len;
if (m->m_len < sizeof(*ip)) {
m = m_pullup(m, sizeof(*ip));
if (m == NULL) {
ipstat_inc(ips_toosmall);
return (NULL);
}
}
ip = mtod(m, struct ip *);
if (ip->ip_v != IPVERSION) {
ipstat_inc(ips_badvers);
goto bad;
}
hlen = ip->ip_hl << 2;
if (hlen < sizeof(*ip)) { /* minimum header length */
ipstat_inc(ips_badhlen);
goto bad;
}
if (hlen > m->m_len) {
m = m_pullup(m, hlen);
if (m == NULL) {
ipstat_inc(ips_badhlen);
return (NULL);
}
ip = mtod(m, struct ip *);
}
/* 127/8 must not appear on wire - RFC1122 */
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
ipstat_inc(ips_badaddr);
goto bad;
}
}
if (!ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK)) {
if (ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD)) {
ipstat_inc(ips_badsum);
goto bad;
}
ipstat_inc(ips_inswcsum);
if (in_cksum(m, hlen) != 0) {
ipstat_inc(ips_badsum);
goto bad;
}
SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK);
}
/* Retrieve the packet length. */
len = ntohs(ip->ip_len);
/*
* Convert fields to host representation.
*/
if (len < hlen) {
ipstat_inc(ips_badlen);
goto bad;
}
/*
* Check that the amount of data in the buffers
* is at least as much as the IP header would have us expect.
* Trim mbufs if longer than we expect.
* Drop packet if shorter than we expect.
*/
if (m->m_pkthdr.len < len) {
ipstat_inc(ips_tooshort);
goto bad;
}
if (m->m_pkthdr.len > len) {
if (m->m_len == m->m_pkthdr.len) {
m->m_len = len;
m->m_pkthdr.len = len;
} else
m_adj(m, len - m->m_pkthdr.len);
}
return (m);
bad:
m_freem(m);
return (NULL);
}
int
ip_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp)
{
struct mbuf *m;
struct rtentry *rt = NULL;
struct ip *ip;
int hlen;
in_addr_t pfrdr = 0;
KASSERT(*offp == 0);
ipstat_inc(ips_total);
m = *mp = ipv4_check(ifp, *mp);
if (m == NULL)
goto bad;
ip = mtod(m, struct ip *);
#if NCARP > 0
if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr,
&ip->ip_dst.s_addr, (ip->ip_p == IPPROTO_ICMP ? 0 : 1)))
goto bad;
#endif
#if NPF > 0
/*
* Packet filter
*/
pfrdr = ip->ip_dst.s_addr;
if (pf_test(AF_INET, PF_IN, ifp, mp) != PF_PASS)
goto bad;
m = *mp;
if (m == NULL)
goto bad;
ip = mtod(m, struct ip *);
pfrdr = (pfrdr != ip->ip_dst.s_addr);
#endif
hlen = ip->ip_hl << 2;
/*
* Process options and, if not destined for us,
* ship it on. ip_dooptions returns 1 when an
* error was detected (causing an icmp message
* to be sent and the original packet to be freed).
*/
if (hlen > sizeof (struct ip) && ip_dooptions(m, ifp)) {
m = *mp = NULL;
goto bad;
}
if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
ip->ip_dst.s_addr == INADDR_ANY) {
nxt = ip_ours(mp, offp, nxt, af);
goto out;
}
switch (in_ouraddr(m, ifp, &rt)) {
case 2:
goto bad;
case 1:
nxt = ip_ours(mp, offp, nxt, af);
goto out;
}
if (IN_MULTICAST(ip->ip_dst.s_addr)) {
/*
* Make sure M_MCAST is set. It should theoretically
* already be there, but let's play safe because upper
* layers check for this flag.
*/
m->m_flags |= M_MCAST;
#ifdef MROUTING
if (ipmforwarding && ip_mrouter[ifp->if_rdomain]) {
int error;
if (m->m_flags & M_EXT) {
if ((m = *mp = m_pullup(m, hlen)) == NULL) {
ipstat_inc(ips_toosmall);
goto bad;
}
ip = mtod(m, struct ip *);
}
/*
* If we are acting as a multicast router, all
* incoming multicast packets are passed to the
* kernel-level multicast forwarding function.
* The packet is returned (relatively) intact; if
* ip_mforward() returns a non-zero value, the packet
* must be discarded, else it may be accepted below.
*
* (The IP ident field is put in the same byte order
* as expected when ip_mforward() is called from
* ip_output().)
*/
KERNEL_LOCK();
error = ip_mforward(m, ifp);
KERNEL_UNLOCK();
if (error) {
ipstat_inc(ips_cantforward);
goto bad;
}
/*
* The process-level routing daemon needs to receive
* all multicast IGMP packets, whether or not this
* host belongs to their destination groups.
*/
if (ip->ip_p == IPPROTO_IGMP) {
nxt = ip_ours(mp, offp, nxt, af);
goto out;
}
ipstat_inc(ips_forward);
}
#endif
/*
* See if we belong to the destination multicast group on the
* arrival interface.
*/
if (!in_hasmulti(&ip->ip_dst, ifp)) {
ipstat_inc(ips_notmember);
if (!IN_LOCAL_GROUP(ip->ip_dst.s_addr))
ipstat_inc(ips_cantforward);
goto bad;
}
nxt = ip_ours(mp, offp, nxt, af);
goto out;
}
#if NCARP > 0
if (ip->ip_p == IPPROTO_ICMP &&
carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr,
&ip->ip_dst.s_addr, 1))
goto bad;
#endif
/*
* Not for us; forward if possible and desirable.
*/
if (ipforwarding == 0) {
ipstat_inc(ips_cantforward);
goto bad;
}
#ifdef IPSEC
if (ipsec_in_use) {
int rv;
rv = ipsec_forward_check(m, hlen, AF_INET);
if (rv != 0) {
ipstat_inc(ips_cantforward);
goto bad;
}
/*
* Fall through, forward packet. Outbound IPsec policy
* checking will occur in ip_output().
*/
}
#endif /* IPSEC */
ip_forward(m, ifp, rt, pfrdr);
*mp = NULL;
return IPPROTO_DONE;
bad:
nxt = IPPROTO_DONE;
m_freemp(mp);
out:
rtfree(rt);
return nxt;
}
int
ip_fragcheck(struct mbuf **mp, int *offp)
{
struct ip *ip;
struct ipq *fp;
struct ipqent *ipqe;
int hlen;
uint16_t mff;
ip = mtod(*mp, struct ip *);
hlen = ip->ip_hl << 2;
/*
* If offset or more fragments are set, must reassemble.
* Otherwise, nothing need be done.
* (We could look in the reassembly queue to see
* if the packet was previously fragmented,
* but it's not worth the time; just let them time out.)
*/
if (ISSET(ip->ip_off, htons(IP_OFFMASK | IP_MF))) {
if ((*mp)->m_flags & M_EXT) { /* XXX */
if ((*mp = m_pullup(*mp, hlen)) == NULL) {
ipstat_inc(ips_toosmall);
return IPPROTO_DONE;
}
ip = mtod(*mp, struct ip *);
}
/*
* Adjust ip_len to not reflect header,
* set ipqe_mff if more fragments are expected,
* convert offset of this to bytes.
*/
ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
mff = ISSET(ip->ip_off, htons(IP_MF));
if (mff) {
/*
* Make sure that fragments have a data length
* that's a non-zero multiple of 8 bytes.
*/
if (ntohs(ip->ip_len) == 0 ||
(ntohs(ip->ip_len) & 0x7) != 0) {
ipstat_inc(ips_badfrags);
m_freemp(mp);
return IPPROTO_DONE;
}
}
ip->ip_off = htons(ntohs(ip->ip_off) << 3);
mtx_enter(&ipq_mutex);
/*
* Look for queue of fragments
* of this datagram.
*/
LIST_FOREACH(fp, &ipq, ipq_q) {
if (ip->ip_id == fp->ipq_id &&
ip->ip_src.s_addr == fp->ipq_src.s_addr &&
ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
ip->ip_p == fp->ipq_p)
break;
}
/*
* If datagram marked as having more fragments
* or if this is not the first fragment,
* attempt reassembly; if it succeeds, proceed.
*/
if (mff || ip->ip_off) {
ipstat_inc(ips_fragments);
if (ip_frags + 1 > ip_maxqueue) {
ip_flush();
ipstat_inc(ips_rcvmemdrop);
goto bad;
}
ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
if (ipqe == NULL) {
ipstat_inc(ips_rcvmemdrop);
goto bad;
}
ip_frags++;
ipqe->ipqe_mff = mff;
ipqe->ipqe_m = *mp;
ipqe->ipqe_ip = ip;
*mp = ip_reass(ipqe, fp);
if (*mp == NULL)
goto bad;
ipstat_inc(ips_reassembled);
ip = mtod(*mp, struct ip *);
hlen = ip->ip_hl << 2;
ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
} else {
if (fp != NULL)
ip_freef(fp);
}
mtx_leave(&ipq_mutex);
}
*offp = hlen;
return ip->ip_p;
bad:
mtx_leave(&ipq_mutex);
m_freemp(mp);
return IPPROTO_DONE;
}
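/*
* A note on units in ip_fragcheck() above: the wire format counts
* fragment offsets in 8-byte blocks, so the "<< 3" converts ip_off to a
* byte offset (and shifts out the flag bits) while the fragment sits on
* the reassembly queue, and ip_len temporarily excludes the header; both
* are restored once ip_reass() hands back a complete datagram. For
* example, a fragment whose payload starts 1480 bytes into the original
* datagram arrives with an on-wire offset of 185.
*/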
#ifndef INET6
#define IPSTAT_INC(name) ipstat_inc(ips_##name)
#else
#define IPSTAT_INC(name) (af == AF_INET ? \
ipstat_inc(ips_##name) : ip6stat_inc(ip6s_##name))
#endif
int
ip_deliver(struct mbuf **mp, int *offp, int nxt, int af)
{
const struct protosw *psw;
int naf = af;
#ifdef INET6
int nest = 0;
#endif /* INET6 */
NET_ASSERT_LOCKED_EXCLUSIVE();
/*
* Tell launch routine the next header
*/
IPSTAT_INC(delivered);
while (nxt != IPPROTO_DONE) {
#ifdef INET6
if (af == AF_INET6 &&
ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) {
ip6stat_inc(ip6s_toomanyhdr);
goto bad;
}
#endif /* INET6 */
/*
* protection against faulty packet - there should be
* more sanity checks in header chain processing.
*/
if ((*mp)->m_pkthdr.len < *offp) {
IPSTAT_INC(tooshort);
goto bad;
}
#ifdef IPSEC
if (ipsec_in_use) {
if (ipsec_local_check(*mp, *offp, nxt, af) != 0) {
IPSTAT_INC(cantforward);
goto bad;
}
}
/* Otherwise, just fall through and deliver the packet */
#endif /* IPSEC */
switch (nxt) {
case IPPROTO_IPV4:
naf = AF_INET;
ipstat_inc(ips_delivered);
break;
#ifdef INET6
case IPPROTO_IPV6:
naf = AF_INET6;
ip6stat_inc(ip6s_delivered);
break;
#endif /* INET6 */
}
switch (af) {
case AF_INET:
psw = &inetsw[ip_protox[nxt]];
break;
#ifdef INET6
case AF_INET6:
psw = &inet6sw[ip6_protox[nxt]];
break;
#endif /* INET6 */
}
nxt = (*psw->pr_input)(mp, offp, nxt, af);
af = naf;
}
return nxt;
bad:
m_freemp(mp);
return IPPROTO_DONE;
}
#undef IPSTAT_INC
int
in_ouraddr(struct mbuf *m, struct ifnet *ifp, struct rtentry **prt)
{
struct rtentry *rt;
struct ip *ip;
struct sockaddr_in sin;
int match = 0;
#if NPF > 0
switch (pf_ouraddr(m)) {
case 0:
return (0);
case 1:
return (1);
default:
/* pf does not know it */
break;
}
#endif
ip = mtod(m, struct ip *);
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr = ip->ip_dst;
rt = rtalloc_mpath(sintosa(&sin), &ip->ip_src.s_addr,
m->m_pkthdr.ph_rtableid);
if (rtisvalid(rt)) {
if (ISSET(rt->rt_flags, RTF_LOCAL))
match = 1;
/*
* If directedbcast is enabled we only consider it local
* if it is received on the interface with that address.
*/
if (ISSET(rt->rt_flags, RTF_BROADCAST) &&
(!ip_directedbcast || rt->rt_ifidx == ifp->if_index)) {
match = 1;
/* Make sure M_BCAST is set */
m->m_flags |= M_BCAST;
}
}
*prt = rt;
if (!match) {
struct ifaddr *ifa;
/*
* No local address or broadcast address found, so check for
* ancient classful broadcast addresses.
* It must have been broadcast on the link layer, and for an
* address on the interface it was received on.
*/
if (!ISSET(m->m_flags, M_BCAST) ||
!IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, ip->ip_dst.s_addr))
return (0);
if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid))
return (0);
/*
* The check in the loop assumes you only rx a packet on an UP
* interface, and that M_BCAST will only be set on a BROADCAST
* interface.
*/
NET_ASSERT_LOCKED();
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
if (IN_CLASSFULBROADCAST(ip->ip_dst.s_addr,
ifatoia(ifa)->ia_addr.sin_addr.s_addr)) {
match = 1;
break;
}
}
} else if (ipforwarding == 0 && rt->rt_ifidx != ifp->if_index &&
!((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_type == IFT_ENC) ||
(m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST))) {
/* received on wrong interface. */
#if NCARP > 0
struct ifnet *out_if;
/*
* Virtual IPs on carp interfaces need to be checked also
* against the parent interface and other carp interfaces
* sharing the same parent.
*/
out_if = if_get(rt->rt_ifidx);
if (!(out_if && carp_strict_addr_chk(out_if, ifp))) {
ipstat_inc(ips_wrongif);
match = 2;
}
if_put(out_if);
#else
ipstat_inc(ips_wrongif);
match = 2;
#endif
}
return (match);
}
/*
* Take incoming datagram fragment and try to
* reassemble it into whole datagram. If a chain for
* reassembly of this datagram already exists, then it
* is given as fp; otherwise have to make a chain.
*/
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp)
{
struct mbuf *m = ipqe->ipqe_m;
struct ipqent *nq, *p, *q;
struct ip *ip;
struct mbuf *t;
int hlen = ipqe->ipqe_ip->ip_hl << 2;
int i, next;
u_int8_t ecn, ecn0;
MUTEX_ASSERT_LOCKED(&ipq_mutex);
/*
* Presence of header sizes in mbufs
* would confuse code below.
*/
m->m_data += hlen;
m->m_len -= hlen;
/*
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == NULL) {
fp = pool_get(&ipq_pool, PR_NOWAIT);
if (fp == NULL)
goto dropfrag;
LIST_INSERT_HEAD(&ipq, fp, ipq_q);
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ipqe->ipqe_ip->ip_p;
fp->ipq_id = ipqe->ipqe_ip->ip_id;
LIST_INIT(&fp->ipq_fragq);
fp->ipq_src = ipqe->ipqe_ip->ip_src;
fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
p = NULL;
goto insert;
}
/*
* Handle ECN by comparing this segment with the first one;
* if CE is set, do not lose CE.
* drop if CE and not-ECT are mixed for the same packet.
*/
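/*
* In terms of combinations, the checks below reject only two cases: an
* incoming not-ECT fragment joining a chain whose first fragment is
* ECN-capable (ECT or CE), and an incoming CE fragment joining a
* not-ECT chain. An incoming CE fragment otherwise propagates the CE
* mark to the first fragment, so congestion marking is not lost when
* the datagram is rebuilt.
*/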
ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
ecn0 = LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
if (ecn == IPTOS_ECN_CE) {
if (ecn0 == IPTOS_ECN_NOTECT)
goto dropfrag;
if (ecn0 != IPTOS_ECN_CE)
LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos |=
IPTOS_ECN_CE;
}
if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
goto dropfrag;
/*
* Find a segment which begins after this one does.
*/
for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = LIST_NEXT(q, ipqe_q))
if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
break;
/*
* If there is a preceding segment, it may provide some of
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
if (p != NULL) {
i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
ntohs(ipqe->ipqe_ip->ip_off);
if (i > 0) {
if (i >= ntohs(ipqe->ipqe_ip->ip_len))
goto dropfrag;
m_adj(ipqe->ipqe_m, i);
ipqe->ipqe_ip->ip_off =
htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
ipqe->ipqe_ip->ip_len =
htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
}
}
/*
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
for (; q != NULL &&
ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
ntohs(q->ipqe_ip->ip_off); q = nq) {
i = (ntohs(ipqe->ipqe_ip->ip_off) +
ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
if (i < ntohs(q->ipqe_ip->ip_len)) {
q->ipqe_ip->ip_len =
htons(ntohs(q->ipqe_ip->ip_len) - i);
q->ipqe_ip->ip_off =
htons(ntohs(q->ipqe_ip->ip_off) + i);
m_adj(q->ipqe_m, i);
break;
}
nq = LIST_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
LIST_REMOVE(q, ipqe_q);
pool_put(&ipqent_pool, q);
ip_frags--;
}
insert:
/*
* Stick new segment in its place;
* check for complete reassembly.
*/
if (p == NULL) {
LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
} else {
LIST_INSERT_AFTER(p, ipqe, ipqe_q);
}
next = 0;
for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = LIST_NEXT(q, ipqe_q)) {
if (ntohs(q->ipqe_ip->ip_off) != next)
return (0);
next += ntohs(q->ipqe_ip->ip_len);
}
if (p->ipqe_mff)
return (0);
/*
* Reassembly is complete. Check for a bogus message size and
* concatenate fragments.
*/
q = LIST_FIRST(&fp->ipq_fragq);
ip = q->ipqe_ip;
if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
ipstat_inc(ips_toolong);
ip_freef(fp);
return (0);
}
m = q->ipqe_m;
t = m->m_next;
m->m_next = 0;
m_cat(m, t);
nq = LIST_NEXT(q, ipqe_q);
pool_put(&ipqent_pool, q);
ip_frags--;
for (q = nq; q != NULL; q = nq) {
t = q->ipqe_m;
nq = LIST_NEXT(q, ipqe_q);
pool_put(&ipqent_pool, q);
ip_frags--;
m_removehdr(t);
m_cat(m, t);
}
/*
* Create header for new ip packet by
* modifying header of first packet;
* dequeue and discard fragment reassembly header.
* Make header visible.
*/
ip->ip_len = htons(next);
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
LIST_REMOVE(fp, ipq_q);
pool_put(&ipq_pool, fp);
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
m_calchdrlen(m);
return (m);
dropfrag:
ipstat_inc(ips_fragdropped);
m_freem(m);
pool_put(&ipqent_pool, ipqe);
ip_frags--;
return (NULL);
}
/*
* Free a fragment reassembly header and all
* associated datagrams.
*/
void
ip_freef(struct ipq *fp)
{
struct ipqent *q;
MUTEX_ASSERT_LOCKED(&ipq_mutex);
while ((q = LIST_FIRST(&fp->ipq_fragq)) != NULL) {
LIST_REMOVE(q, ipqe_q);
m_freem(q->ipqe_m);
pool_put(&ipqent_pool, q);
ip_frags--;
}
LIST_REMOVE(fp, ipq_q);
pool_put(&ipq_pool, fp);
}
/*
* IP timer processing;
* if a timer expires on a reassembly queue, discard it.
*/
void
ip_slowtimo(void)
{
struct ipq *fp, *nfp;
mtx_enter(&ipq_mutex);
LIST_FOREACH_SAFE(fp, &ipq, ipq_q, nfp) {
if (--fp->ipq_ttl == 0) {
ipstat_inc(ips_fragtimeout);
ip_freef(fp);
}
}
mtx_leave(&ipq_mutex);
}
/*
* Flush a bunch of datagram fragments, till we are down to 75%.
*/
void
ip_flush(void)
{
int max = 50;
MUTEX_ASSERT_LOCKED(&ipq_mutex);
while (!LIST_EMPTY(&ipq) && ip_frags > ip_maxqueue * 3 / 4 && --max) {
ipstat_inc(ips_fragdropped);
ip_freef(LIST_FIRST(&ipq));
}
}
/*
* Do option processing on a datagram,
* possibly discarding it if bad options are encountered,
* or forwarding it if source-routed.
* Returns 1 if packet has been forwarded/freed,
* 0 if the packet should be processed further.
*/
int
ip_dooptions(struct mbuf *m, struct ifnet *ifp)
{
struct ip *ip = mtod(m, struct ip *);
unsigned int rtableid = m->m_pkthdr.ph_rtableid;
struct rtentry *rt;
struct sockaddr_in ipaddr;
u_char *cp;
struct ip_timestamp ipt;
struct in_ifaddr *ia;
int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
struct in_addr sin, dst;
u_int32_t ntime;
dst = ip->ip_dst;
cp = (u_char *)(ip + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
KERNEL_LOCK();
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp)) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
}
switch (opt) {
default:
break;
/*
* Source routing with record.
* Find interface with current destination address.
* If none on this machine then drop if strictly routed,
* or do nothing if loosely routed.
* Record interface address and bring up next address
* component. If strictly routed make sure next
* address is on directly accessible net.
*/
case IPOPT_LSRR:
case IPOPT_SSRR:
if (!ip_dosourceroute) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
memset(&ipaddr, 0, sizeof(ipaddr));
ipaddr.sin_family = AF_INET;
ipaddr.sin_len = sizeof(ipaddr);
ipaddr.sin_addr = ip->ip_dst;
ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr),
m->m_pkthdr.ph_rtableid));
if (ia == NULL) {
if (opt == IPOPT_SSRR) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
/*
* Loose routing, and not at next destination
* yet; nothing to do except forward.
*/
break;
}
off--; /* 0 origin */
if ((off + sizeof(struct in_addr)) > optlen) {
/*
* End of source route. Should be for us.
*/
save_rte(m, cp, ip->ip_src);
break;
}
/*
* locate outgoing interface
*/
memset(&ipaddr, 0, sizeof(ipaddr));
ipaddr.sin_family = AF_INET;
ipaddr.sin_len = sizeof(ipaddr);
memcpy(&ipaddr.sin_addr, cp + off,
sizeof(ipaddr.sin_addr));
/* keep packet in the virtual instance */
rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid);
if (!rtisvalid(rt) || ((opt == IPOPT_SSRR) &&
ISSET(rt->rt_flags, RTF_GATEWAY))) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
rtfree(rt);
goto bad;
}
ia = ifatoia(rt->rt_ifa);
memcpy(cp + off, &ia->ia_addr.sin_addr,
sizeof(struct in_addr));
rtfree(rt);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
ip->ip_dst = ipaddr.sin_addr;
/*
* Let ip_intr's mcast routing check handle mcast pkts
*/
forward = !IN_MULTICAST(ip->ip_dst.s_addr);
break;
case IPOPT_RR:
if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
/*
* If no space remains, ignore.
*/
off--; /* 0 origin */
if ((off + sizeof(struct in_addr)) > optlen)
break;
memset(&ipaddr, 0, sizeof(ipaddr));
ipaddr.sin_family = AF_INET;
ipaddr.sin_len = sizeof(ipaddr);
ipaddr.sin_addr = ip->ip_dst;
/*
* locate outgoing interface; if we're the destination,
* use the incoming interface (should be same).
* Again keep the packet inside the virtual instance.
*/
rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid);
if (!rtisvalid(rt)) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_HOST;
rtfree(rt);
goto bad;
}
ia = ifatoia(rt->rt_ifa);
memcpy(cp + off, &ia->ia_addr.sin_addr,
sizeof(struct in_addr));
rtfree(rt);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
break;
case IPOPT_TS:
code = cp - (u_char *)ip;
if (optlen < sizeof(struct ip_timestamp))
goto bad;
memcpy(&ipt, cp, sizeof(struct ip_timestamp));
if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5)
goto bad;
if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) > ipt.ipt_len) {
if (++ipt.ipt_oflw == 0)
goto bad;
break;
}
memcpy(&sin, cp + ipt.ipt_ptr - 1, sizeof sin);
switch (ipt.ipt_flg) {
case IPOPT_TS_TSONLY:
break;
case IPOPT_TS_TSANDADDR:
if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) +
sizeof(struct in_addr) > ipt.ipt_len)
goto bad;
memset(&ipaddr, 0, sizeof(ipaddr));
ipaddr.sin_family = AF_INET;
ipaddr.sin_len = sizeof(ipaddr);
ipaddr.sin_addr = dst;
ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr),
ifp));
if (ia == NULL)
continue;
memcpy(&sin, &ia->ia_addr.sin_addr,
sizeof(struct in_addr));
ipt.ipt_ptr += sizeof(struct in_addr);
break;
case IPOPT_TS_PRESPEC:
if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) +
sizeof(struct in_addr) > ipt.ipt_len)
goto bad;
memset(&ipaddr, 0, sizeof(ipaddr));
ipaddr.sin_family = AF_INET;
ipaddr.sin_len = sizeof(ipaddr);
ipaddr.sin_addr = sin;
if (ifa_ifwithaddr(sintosa(&ipaddr),
m->m_pkthdr.ph_rtableid) == NULL)
continue;
ipt.ipt_ptr += sizeof(struct in_addr);
break;
default:
/* XXX can't take &ipt->ipt_flg */
code = (u_char *)&ipt.ipt_ptr -
(u_char *)ip + 1;
goto bad;
}
ntime = iptime();
memcpy(cp + ipt.ipt_ptr - 1, &ntime, sizeof(u_int32_t));
ipt.ipt_ptr += sizeof(u_int32_t);
}
}
KERNEL_UNLOCK();
if (forward && ipforwarding > 0) {
ip_forward(m, ifp, NULL, 1);
return (1);
}
return (0);
bad:
KERNEL_UNLOCK();
icmp_error(m, type, code, 0, 0);
ipstat_inc(ips_badoptions);
return (1);
}
/*
* Save incoming source route for use in replies,
* to be picked up later by ip_srcroute if the receiver is interested.
*/
void
save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
{
struct ip_srcrt *isr;
struct m_tag *mtag;
unsigned olen;
olen = option[IPOPT_OLEN];
if (olen > sizeof(isr->isr_hdr) + sizeof(isr->isr_routes))
return;
mtag = m_tag_get(PACKET_TAG_SRCROUTE, sizeof(*isr), M_NOWAIT);
if (mtag == NULL) {
ipstat_inc(ips_idropped);
return;
}
isr = (struct ip_srcrt *)(mtag + 1);
memcpy(isr->isr_hdr, option, olen);
isr->isr_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
isr->isr_dst = dst;
m_tag_prepend(m, mtag);
}
/*
* Retrieve incoming source route for use in replies,
* in the same form used by setsockopt.
* The first hop is placed before the options and will be removed later.
*/
struct mbuf *
ip_srcroute(struct mbuf *m0)
{
struct in_addr *p, *q;
struct mbuf *m;
struct ip_srcrt *isr;
struct m_tag *mtag;
if (!ip_dosourceroute)
return (NULL);
mtag = m_tag_find(m0, PACKET_TAG_SRCROUTE, NULL);
if (mtag == NULL)
return (NULL);
isr = (struct ip_srcrt *)(mtag + 1);
if (isr->isr_nhops == 0)
return (NULL);
m = m_get(M_DONTWAIT, MT_SOOPTS);
if (m == NULL) {
ipstat_inc(ips_idropped);
return (NULL);
}
#define OPTSIZ (sizeof(isr->isr_nop) + sizeof(isr->isr_hdr))
/* length is (nhops+1)*sizeof(addr) + sizeof(nop + header) */
m->m_len = (isr->isr_nhops + 1) * sizeof(struct in_addr) + OPTSIZ;
/*
* First save first hop for return route
*/
p = &(isr->isr_routes[isr->isr_nhops - 1]);
*(mtod(m, struct in_addr *)) = *p--;
/*
* Copy option fields and padding (nop) to mbuf.
*/
isr->isr_nop = IPOPT_NOP;
isr->isr_hdr[IPOPT_OFFSET] = IPOPT_MINOFF;
memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &isr->isr_nop,
OPTSIZ);
q = (struct in_addr *)(mtod(m, caddr_t) +
sizeof(struct in_addr) + OPTSIZ);
#undef OPTSIZ
/*
* Record return path as an IP source route,
* reversing the path (pointers are now aligned).
*/
while (p >= isr->isr_routes) {
*q++ = *p--;
}
/*
* Last hop goes to final destination.
*/
*q = isr->isr_dst;
m_tag_delete(m0, (struct m_tag *)isr);
return (m);
}
/*
* Strip out IP options, at a higher-level protocol in the kernel.
*/
void
ip_stripoptions(struct mbuf *m)
{
int i;
struct ip *ip = mtod(m, struct ip *);
caddr_t opts;
int olen;
olen = (ip->ip_hl<<2) - sizeof (struct ip);
opts = (caddr_t)(ip + 1);
i = m->m_len - (sizeof (struct ip) + olen);
memmove(opts, opts + olen, i);
m->m_len -= olen;
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len -= olen;
ip->ip_hl = sizeof(struct ip) >> 2;
ip->ip_len = htons(ntohs(ip->ip_len) - olen);
}
const u_char inetctlerrmap[PRC_NCMDS] = {
0, 0, 0, 0,
0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
EMSGSIZE, EHOSTUNREACH, 0, 0,
0, 0, 0, 0,
ENOPROTOOPT
};
/*
* Forward a packet. If some error occurs return the sender
* an icmp packet. Note we can't always generate a meaningful
* icmp message because icmp doesn't have a large enough repertoire
* of codes and types.
*
* If not forwarding, just drop the packet. This could be confusing
* if ipforwarding was zero but some routing protocol was advancing
* us as a gateway to somewhere. However, we must let the routing
* protocol deal with that.
*
* The srcrt parameter indicates whether the packet is being forwarded
* via a source route.
*/
void
ip_forward(struct mbuf *m, struct ifnet *ifp, struct rtentry *rt, int srcrt)
{
struct mbuf mfake, *mcopy = NULL;
struct ip *ip = mtod(m, struct ip *);
struct sockaddr_in *sin;
struct route ro;
int error = 0, type = 0, code = 0, destmtu = 0, fake = 0, len;
u_int32_t dest;
dest = 0;
if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
ipstat_inc(ips_cantforward);
m_freem(m);
goto freecopy;
}
if (ip->ip_ttl <= IPTTLDEC) {
icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
goto freecopy;
}
memset(&ro, 0, sizeof(ro));
sin = satosin(&ro.ro_dst);
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = ip->ip_dst;
if (!rtisvalid(rt)) {
rtfree(rt);
rt = rtalloc_mpath(sintosa(sin), &ip->ip_src.s_addr,
m->m_pkthdr.ph_rtableid);
if (rt == NULL) {
ipstat_inc(ips_noroute);
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
return;
}
}
/*
* Save at most 68 bytes of the packet in case
* we need to generate an ICMP message to the src.
* The data is saved in the mbuf on the stack that
* acts as a temporary storage not intended to be
* passed down the IP stack or to the mfree.
*/
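/*
* The 68 presumably comes from the classic ICMP quoting rule: a maximal
* 60-byte IP header plus the first 8 bytes of the offending datagram,
* which is the minimum an ICMP error message must echo back (RFC 792).
*/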
memset(&mfake.m_hdr, 0, sizeof(mfake.m_hdr));
mfake.m_type = m->m_type;
if (m_dup_pkthdr(&mfake, m, M_DONTWAIT) == 0) {
mfake.m_data = mfake.m_pktdat;
len = min(ntohs(ip->ip_len), 68);
m_copydata(m, 0, len, mfake.m_pktdat);
mfake.m_pkthdr.len = mfake.m_len = len;
#if NPF > 0
pf_pkt_addr_changed(&mfake);
#endif /* NPF > 0 */
fake = 1;
}
ip->ip_ttl -= IPTTLDEC;
/*
* If forwarding packet using same interface that it came in on,
* perhaps should send a redirect to sender to shortcut a hop.
* Only send redirect if source is sending directly to us,
* and if packet was not source routed (or has any options).
* Also, don't send redirect if forwarding using a default route
* or a route modified by a redirect.
* Don't send redirect if we advertise destination's arp address
* as ours (proxy arp).
*/
if ((rt->rt_ifidx == ifp->if_index) &&
(rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
ipsendredirects && !srcrt &&
!arpproxy(satosin(rt_key(rt))->sin_addr, m->m_pkthdr.ph_rtableid)) {
if ((ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_netmask) ==
ifatoia(rt->rt_ifa)->ia_net) {
if (rt->rt_flags & RTF_GATEWAY)
dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
else
dest = ip->ip_dst.s_addr;
/* Router requirements say to only send host redirects */
type = ICMP_REDIRECT;
code = ICMP_REDIRECT_HOST;
}
}
ro.ro_rt = rt;
ro.ro_tableid = m->m_pkthdr.ph_rtableid;
error = ip_output(m, NULL, &ro,
(IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
NULL, NULL, 0);
rt = ro.ro_rt;
if (error)
ipstat_inc(ips_cantforward);
else {
ipstat_inc(ips_forward);
if (type)
ipstat_inc(ips_redirectsent);
else
goto freecopy;
}
if (!fake)
goto freecopy;
switch (error) {
case 0: /* forwarded, but need redirect */
/* type, code set above */
break;
case EMSGSIZE:
type = ICMP_UNREACH;
code = ICMP_UNREACH_NEEDFRAG;
if (rt != NULL) {
if (rt->rt_mtu) {
destmtu = rt->rt_mtu;
} else {
struct ifnet *destifp;
destifp = if_get(rt->rt_ifidx);
if (destifp != NULL)
destmtu = destifp->if_mtu;
if_put(destifp);
}
}
ipstat_inc(ips_cantfrag);
if (destmtu == 0)
goto freecopy;
break;
case EACCES:
/*
* pf(4) blocked the packet. There is no need to send an ICMP
* packet back since pf(4) takes care of it.
*/
goto freecopy;
case ENOBUFS:
/*
* a router should not generate ICMP_SOURCEQUENCH as
* required in RFC1812 Requirements for IP Version 4 Routers.
* source quench could be a big problem under DoS attacks,
* or the underlying interface is rate-limited.
*/
goto freecopy;
case ENETUNREACH: /* shouldn't happen, checked above */
case EHOSTUNREACH:
case ENETDOWN:
case EHOSTDOWN:
default:
type = ICMP_UNREACH;
code = ICMP_UNREACH_HOST;
break;
}
mcopy = m_copym(&mfake, 0, len, M_DONTWAIT);
if (mcopy)
icmp_error(mcopy, type, code, dest, destmtu);
freecopy:
if (fake)
m_tag_delete_chain(&mfake);
rtfree(rt);
}
int
ip_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen)
{
int error;
#ifdef MROUTING
extern struct mrtstat mrtstat;
#endif
/* Almost all sysctl names at this level are terminal. */
if (namelen != 1 && name[0] != IPCTL_IFQUEUE &&
name[0] != IPCTL_ARPQUEUE)
return (ENOTDIR);
switch (name[0]) {
case IPCTL_SOURCEROUTE:
NET_LOCK();
error = sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
&ip_dosourceroute);
NET_UNLOCK();
return (error);
case IPCTL_MTUDISC:
NET_LOCK();
error = sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtudisc);
if (ip_mtudisc == 0)
rt_timer_queue_flush(&ip_mtudisc_timeout_q);
NET_UNLOCK();
return error;
case IPCTL_MTUDISCTIMEOUT:
NET_LOCK();
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&ip_mtudisc_timeout, 0, INT_MAX);
rt_timer_queue_change(&ip_mtudisc_timeout_q,
ip_mtudisc_timeout);
NET_UNLOCK();
return (error);
#ifdef IPSEC
case IPCTL_ENCDEBUG:
case IPCTL_IPSEC_STATS:
case IPCTL_IPSEC_EXPIRE_ACQUIRE:
case IPCTL_IPSEC_EMBRYONIC_SA_TIMEOUT:
case IPCTL_IPSEC_REQUIRE_PFS:
case IPCTL_IPSEC_SOFT_ALLOCATIONS:
case IPCTL_IPSEC_ALLOCATIONS:
case IPCTL_IPSEC_SOFT_BYTES:
case IPCTL_IPSEC_BYTES:
case IPCTL_IPSEC_TIMEOUT:
case IPCTL_IPSEC_SOFT_TIMEOUT:
case IPCTL_IPSEC_SOFT_FIRSTUSE:
case IPCTL_IPSEC_FIRSTUSE:
case IPCTL_IPSEC_ENC_ALGORITHM:
case IPCTL_IPSEC_AUTH_ALGORITHM:
case IPCTL_IPSEC_IPCOMP_ALGORITHM:
return (ipsec_sysctl(name, namelen, oldp, oldlenp, newp,
newlen));
#endif
case IPCTL_IFQUEUE:
return (sysctl_niq(name + 1, namelen - 1,
oldp, oldlenp, newp, newlen, &ipintrq));
case IPCTL_ARPQUEUE:
return (sysctl_niq(name + 1, namelen - 1,
oldp, oldlenp, newp, newlen, &arpinq));
case IPCTL_ARPQUEUED:
return (sysctl_rdint(oldp, oldlenp, newp, la_hold_total));
case IPCTL_STATS:
return (ip_sysctl_ipstat(oldp, oldlenp, newp));
#ifdef MROUTING
case IPCTL_MRTSTATS:
return (sysctl_rdstruct(oldp, oldlenp, newp,
&mrtstat, sizeof(mrtstat)));
case IPCTL_MRTMFC:
if (newp)
return (EPERM);
NET_LOCK();
error = mrt_sysctl_mfc(oldp, oldlenp);
NET_UNLOCK();
return (error);
case IPCTL_MRTVIF:
if (newp)
return (EPERM);
NET_LOCK();
error = mrt_sysctl_vif(oldp, oldlenp);
NET_UNLOCK();
return (error);
#else
case IPCTL_MRTPROTO:
case IPCTL_MRTSTATS:
case IPCTL_MRTMFC:
case IPCTL_MRTVIF:
return (EOPNOTSUPP);
#endif
default:
NET_LOCK();
error = sysctl_bounded_arr(ipctl_vars, nitems(ipctl_vars),
name, namelen, oldp, oldlenp, newp, newlen);
NET_UNLOCK();
return (error);
}
/* NOTREACHED */
}
int
ip_sysctl_ipstat(void *oldp, size_t *oldlenp, void *newp)
{
uint64_t counters[ips_ncounters];
struct ipstat ipstat;
u_long *words = (u_long *)&ipstat;
int i;
CTASSERT(sizeof(ipstat) == (nitems(counters) * sizeof(u_long)));
memset(&ipstat, 0, sizeof ipstat);
counters_read(ipcounters, counters, nitems(counters));
for (i = 0; i < nitems(counters); i++)
words[i] = (u_long)counters[i];
return (sysctl_rdstruct(oldp, oldlenp, newp, &ipstat, sizeof(ipstat)));
}
void
ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
struct mbuf *m)
{
if (inp->inp_socket->so_options & SO_TIMESTAMP) {
struct timeval tv;
m_microtime(m, &tv);
*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
SCM_TIMESTAMP, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_flags & INP_RECVDSTADDR) {
*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
#ifdef notyet
/* this code is broken and will probably never be fixed. */
/* options were tossed already */
if (inp->inp_flags & INP_RECVOPTS) {
*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
/* ip_srcroute doesn't do what we want here, need to fix */
if (inp->inp_flags & INP_RECVRETOPTS) {
*mp = sbcreatecontrol((caddr_t) ip_srcroute(m),
sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
#endif
if (inp->inp_flags & INP_RECVIF) {
struct sockaddr_dl sdl;
struct ifnet *ifp;
ifp = if_get(m->m_pkthdr.ph_ifidx);
if (ifp == NULL || ifp->if_sadl == NULL) {
memset(&sdl, 0, sizeof(sdl));
sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]);
sdl.sdl_family = AF_LINK;
sdl.sdl_index = ifp != NULL ? ifp->if_index : 0;
sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0;
*mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len,
IP_RECVIF, IPPROTO_IP);
} else {
*mp = sbcreatecontrol((caddr_t) ifp->if_sadl,
ifp->if_sadl->sdl_len, IP_RECVIF, IPPROTO_IP);
}
if (*mp)
mp = &(*mp)->m_next;
if_put(ifp);
}
if (inp->inp_flags & INP_RECVTTL) {
*mp = sbcreatecontrol((caddr_t) &ip->ip_ttl,
sizeof(u_int8_t), IP_RECVTTL, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_flags & INP_RECVRTABLE) {
u_int rtableid = inp->inp_rtableid;
#if NPF > 0
if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
struct pf_divert *divert;
divert = pf_find_divert(m);
KASSERT(divert != NULL);
rtableid = divert->rdomain;
}
#endif
*mp = sbcreatecontrol((caddr_t) &rtableid,
sizeof(u_int), IP_RECVRTABLE, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
}
void
ip_send_do_dispatch(void *xmq, int flags)
{
struct mbuf_queue *mq = xmq;
struct mbuf *m;
struct mbuf_list ml;
struct m_tag *mtag;
mq_delist(mq, &ml);
if (ml_empty(&ml))
return;
NET_LOCK();
while ((m = ml_dequeue(&ml)) != NULL) {
u_int32_t ipsecflowinfo = 0;
if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL))
!= NULL) {
ipsecflowinfo = *(u_int32_t *)(mtag + 1);
m_tag_delete(m, mtag);
}
ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo);
}
NET_UNLOCK();
}
void
ip_sendraw_dispatch(void *xmq)
{
ip_send_do_dispatch(xmq, IP_RAWOUTPUT);
}
void
ip_send_dispatch(void *xmq)
{
ip_send_do_dispatch(xmq, 0);
}
void
ip_send(struct mbuf *m)
{
mq_enqueue(&ipsend_mq, m);
task_add(net_tq(0), &ipsend_task);
}
void
ip_send_raw(struct mbuf *m)
{
mq_enqueue(&ipsendraw_mq, m);
task_add(net_tq(0), &ipsendraw_task);
}
/* $OpenBSD: uipc_mbuf.c,v 1.284 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: uipc_mbuf.c,v 1.15.4.1 1996/06/13 17:11:44 cgd Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include "pf.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/mbuf.h>
#include <sys/pool.h>
#include <sys/percpu.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <uvm/uvm_extern.h>
#ifdef DDB
#include <machine/db_machdep.h>
#endif
#if NPF > 0
#include <net/pfvar.h>
#endif /* NPF > 0 */
/* mbuf stats */
COUNTERS_BOOT_MEMORY(mbstat_boot, MBSTAT_COUNT);
struct cpumem *mbstat = COUNTERS_BOOT_INITIALIZER(mbstat_boot);
/* mbuf pools */
struct pool mbpool;
struct pool mtagpool;
/* mbuf cluster pools */
u_int mclsizes[MCLPOOLS] = {
MCLBYTES, /* must be at slot 0 */
MCLBYTES + 2, /* ETHER_ALIGNED 2k mbufs */
4 * 1024,
8 * 1024,
9 * 1024,
12 * 1024,
16 * 1024,
64 * 1024
};
static char mclnames[MCLPOOLS][8];
struct pool mclpools[MCLPOOLS];
struct pool *m_clpool(u_int);
int max_linkhdr; /* largest link-level header */
int max_protohdr; /* largest protocol header */
int max_hdr; /* largest link+protocol header */
struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
void m_extfree(struct mbuf *);
void m_zero(struct mbuf *);
unsigned long mbuf_mem_limit; /* how much memory can be allocated */
unsigned long mbuf_mem_alloc; /* how much memory has been allocated */
void *m_pool_alloc(struct pool *, int, int *);
void m_pool_free(struct pool *, void *);
struct pool_allocator m_pool_allocator = {
m_pool_alloc,
m_pool_free,
0 /* will be copied from pool_allocator_multi */
};
static void (*mextfree_fns[4])(caddr_t, u_int, void *);
static u_int num_extfree_fns;
#define M_DATABUF(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf : \
(m)->m_flags & M_PKTHDR ? (m)->m_pktdat : (m)->m_dat)
#define M_SIZE(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_size : \
(m)->m_flags & M_PKTHDR ? MHLEN : MLEN)
/*
* Initialize the mbuf allocator.
*/
void
mbinit(void)
{
int i, error;
unsigned int lowbits;
CTASSERT(MSIZE == sizeof(struct mbuf));
m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz;
mbuf_mem_alloc = 0;
#if DIAGNOSTIC
if (mclsizes[0] != MCLBYTES)
panic("mbinit: the smallest cluster size != MCLBYTES");
if (mclsizes[nitems(mclsizes) - 1] != MAXMCLBYTES)
panic("mbinit: the largest cluster size != MAXMCLBYTES");
#endif
m_pool_init(&mbpool, MSIZE, 64, "mbufpl");
pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
IPL_NET, 0, "mtagpl", NULL);
for (i = 0; i < nitems(mclsizes); i++) {
lowbits = mclsizes[i] & ((1 << 10) - 1);
if (lowbits) {
snprintf(mclnames[i], sizeof(mclnames[0]),
"mcl%dk%u", mclsizes[i] >> 10, lowbits);
} else {
snprintf(mclnames[i], sizeof(mclnames[0]), "mcl%dk",
mclsizes[i] >> 10);
}
m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]);
}
error = nmbclust_update(nmbclust);
KASSERT(error == 0);
(void)mextfree_register(m_extfree_pool);
KASSERT(num_extfree_fns == 1);
}
void
mbcpuinit(void)
{
int i;
mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT);
pool_cache_init(&mbpool);
pool_cache_init(&mtagpool);
for (i = 0; i < nitems(mclsizes); i++)
pool_cache_init(&mclpools[i]);
}
int
nmbclust_update(long newval)
{
int i;
if (newval < 0 || newval > LONG_MAX / MCLBYTES)
return ERANGE;
/* update the global mbuf memory limit */
nmbclust = newval;
mbuf_mem_limit = nmbclust * MCLBYTES;
pool_wakeup(&mbpool);
for (i = 0; i < nitems(mclsizes); i++)
pool_wakeup(&mclpools[i]);
return 0;
}
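/*
* For example, with the usual 2 kB MCLBYTES a newval of 4096 caps
* mbuf_mem_limit at 8 MB. The pool_wakeup() calls presumably give any
* allocators that were blocked against the old limit a chance to retry
* against the new one.
*/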
/*
* Space allocation routines.
*/
struct mbuf *
m_get(int nowait, int type)
{
struct mbuf *m;
struct counters_ref cr;
uint64_t *counters;
int s;
KASSERT(type >= 0 && type < MT_NTYPES);
m = pool_get(&mbpool, nowait == M_WAIT ? PR_WAITOK : PR_NOWAIT);
if (m == NULL)
return (NULL);
s = splnet();
counters = counters_enter(&cr, mbstat);
counters[type]++;
counters_leave(&cr, mbstat);
splx(s);
m->m_type = type;
m->m_next = NULL;
m->m_nextpkt = NULL;
m->m_data = m->m_dat;
m->m_flags = 0;
return (m);
}
/*
* ATTN: When changing anything here check m_inithdr() and m_defrag();
* those may need to change as well.
*/
struct mbuf *
m_gethdr(int nowait, int type)
{
struct mbuf *m;
struct counters_ref cr;
uint64_t *counters;
int s;
KASSERT(type >= 0 && type < MT_NTYPES);
m = pool_get(&mbpool, nowait == M_WAIT ? PR_WAITOK : PR_NOWAIT);
if (m == NULL)
return (NULL);
s = splnet();
counters = counters_enter(&cr, mbstat);
counters[type]++;
counters_leave(&cr, mbstat);
splx(s);
m->m_type = type;
return (m_inithdr(m));
}
struct mbuf *
m_inithdr(struct mbuf *m)
{
/* keep in sync with m_gethdr */
m->m_next = NULL;
m->m_nextpkt = NULL;
m->m_data = m->m_pktdat;
m->m_flags = M_PKTHDR;
memset(&m->m_pkthdr, 0, sizeof(m->m_pkthdr));
m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
return (m);
}
static inline void
m_clearhdr(struct mbuf *m)
{
/* delete all mbuf tags to reset the state */
m_tag_delete_chain(m);
#if NPF > 0
pf_mbuf_unlink_state_key(m);
pf_mbuf_unlink_inpcb(m);
#endif /* NPF > 0 */
memset(&m->m_pkthdr, 0, sizeof(m->m_pkthdr));
}
void
m_removehdr(struct mbuf *m)
{
KASSERT(m->m_flags & M_PKTHDR);
m_clearhdr(m);
m->m_flags &= ~M_PKTHDR;
}
void
m_resethdr(struct mbuf *m)
{
int len = m->m_pkthdr.len;
u_int8_t loopcnt = m->m_pkthdr.ph_loopcnt;
KASSERT(m->m_flags & M_PKTHDR);
m->m_flags &= (M_EXT|M_PKTHDR|M_EOR|M_EXTWR|M_ZEROIZE);
m_clearhdr(m);
/* like m_inithdr(), but keep any associated data and mbufs */
m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
m->m_pkthdr.len = len;
m->m_pkthdr.ph_loopcnt = loopcnt;
}
void
m_calchdrlen(struct mbuf *m)
{
struct mbuf *n;
int plen = 0;
KASSERT(m->m_flags & M_PKTHDR);
for (n = m; n; n = n->m_next)
plen += n->m_len;
m->m_pkthdr.len = plen;
}
struct mbuf *
m_getclr(int nowait, int type)
{
struct mbuf *m;
MGET(m, nowait, type);
if (m == NULL)
return (NULL);
memset(mtod(m, caddr_t), 0, MLEN);
return (m);
}
struct pool *
m_clpool(u_int pktlen)
{
struct pool *pp;
int pi;
for (pi = 0; pi < nitems(mclpools); pi++) {
pp = &mclpools[pi];
if (pktlen <= pp->pr_size)
return (pp);
}
return (NULL);
}
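/*
* mclpools[] is ordered by ascending cluster size, so this returns the
* tightest-fitting pool: e.g. a 1500-byte request maps to the plain
* MCLBYTES pool and a 3000-byte request to the 4k pool, while anything
* larger than the biggest configured cluster (64k) yields NULL.
*/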
struct mbuf *
m_clget(struct mbuf *m, int how, u_int pktlen)
{
struct mbuf *m0 = NULL;
struct pool *pp;
caddr_t buf;
pp = m_clpool(pktlen);
#ifdef DIAGNOSTIC
if (pp == NULL)
panic("m_clget: request for %u byte cluster", pktlen);
#endif
if (m == NULL) {
m0 = m_gethdr(how, MT_DATA);
if (m0 == NULL)
return (NULL);
m = m0;
}
buf = pool_get(pp, how == M_WAIT ? PR_WAITOK : PR_NOWAIT);
if (buf == NULL) {
m_freem(m0);
return (NULL);
}
MEXTADD(m, buf, pp->pr_size, M_EXTWR, MEXTFREE_POOL, pp);
return (m);
}
void
m_extfree_pool(caddr_t buf, u_int size, void *pp)
{
pool_put(pp, buf);
}
struct mbuf *
m_free(struct mbuf *m)
{
struct mbuf *n;
struct counters_ref cr;
uint64_t *counters;
int s;
if (m == NULL)
return (NULL);
s = splnet();
counters = counters_enter(&cr, mbstat);
counters[m->m_type]--;
counters_leave(&cr, mbstat);
splx(s);
n = m->m_next;
if (m->m_flags & M_ZEROIZE) {
m_zero(m);
/* propagate M_ZEROIZE to the next mbuf in the chain */
if (n)
n->m_flags |= M_ZEROIZE;
}
if (m->m_flags & M_PKTHDR) {
m_tag_delete_chain(m);
#if NPF > 0
pf_mbuf_unlink_state_key(m);
pf_mbuf_unlink_inpcb(m);
#endif /* NPF > 0 */
}
if (m->m_flags & M_EXT)
m_extfree(m);
pool_put(&mbpool, m);
return (n);
}
void
m_extref(struct mbuf *o, struct mbuf *n)
{
int refs = MCLISREFERENCED(o);
n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
if (refs)
mtx_enter(&m_extref_mtx);
n->m_ext.ext_nextref = o->m_ext.ext_nextref;
n->m_ext.ext_prevref = o;
o->m_ext.ext_nextref = n;
n->m_ext.ext_nextref->m_ext.ext_prevref = n;
if (refs)
mtx_leave(&m_extref_mtx);
MCLREFDEBUGN((n), __FILE__, __LINE__);
}
static inline u_int
m_extunref(struct mbuf *m)
{
int refs = 0;
if (!MCLISREFERENCED(m))
return (0);
mtx_enter(&m_extref_mtx);
if (MCLISREFERENCED(m)) {
m->m_ext.ext_nextref->m_ext.ext_prevref =
m->m_ext.ext_prevref;
m->m_ext.ext_prevref->m_ext.ext_nextref =
m->m_ext.ext_nextref;
refs = 1;
}
mtx_leave(&m_extref_mtx);
return (refs);
}
/*
* Returns a number for use with MEXTADD.
* Should only be called once per function.
* Drivers can be assured that the index will be non zero.
*/
u_int
mextfree_register(void (*fn)(caddr_t, u_int, void *))
{
KASSERT(num_extfree_fns < nitems(mextfree_fns));
mextfree_fns[num_extfree_fns] = fn;
return num_extfree_fns++;
}
void
m_extfree(struct mbuf *m)
{
if (m_extunref(m) == 0) {
KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
m->m_ext.ext_size, m->m_ext.ext_arg);
}
m->m_flags &= ~(M_EXT|M_EXTWR);
}
struct mbuf *
m_freem(struct mbuf *m)
{
struct mbuf *n;
if (m == NULL)
return (NULL);
n = m->m_nextpkt;
do
m = m_free(m);
while (m != NULL);
return (n);
}
void
m_purge(struct mbuf *m)
{
while (m != NULL)
m = m_freem(m);
}
/*
* mbuf chain defragmenter. This function uses some evil tricks to defragment
* an mbuf chain into a single buffer without changing the mbuf pointer.
* This needs to know a lot of the mbuf internals to make this work.
*/
int
m_defrag(struct mbuf *m, int how)
{
struct mbuf *m0;
if (m->m_next == NULL)
return (0);
KASSERT(m->m_flags & M_PKTHDR);
if ((m0 = m_gethdr(how, m->m_type)) == NULL)
return (ENOBUFS);
if (m->m_pkthdr.len > MHLEN) {
MCLGETL(m0, how, m->m_pkthdr.len);
if (!(m0->m_flags & M_EXT)) {
m_free(m0);
return (ENOBUFS);
}
}
m_copydata(m, 0, m->m_pkthdr.len, mtod(m0, caddr_t));
m0->m_pkthdr.len = m0->m_len = m->m_pkthdr.len;
/* free chain behind and possible ext buf on the first mbuf */
m_freem(m->m_next);
m->m_next = NULL;
if (m->m_flags & M_EXT)
m_extfree(m);
/*
* Bounce copy mbuf over to the original mbuf and set everything up.
* This needs to reset or clear all pointers that may go into the
* original mbuf chain.
*/
if (m0->m_flags & M_EXT) {
memcpy(&m->m_ext, &m0->m_ext, sizeof(struct mbuf_ext));
MCLINITREFERENCE(m);
m->m_flags |= m0->m_flags & (M_EXT|M_EXTWR);
m->m_data = m->m_ext.ext_buf;
} else {
m->m_data = m->m_pktdat;
memcpy(m->m_data, m0->m_data, m0->m_len);
}
m->m_pkthdr.len = m->m_len = m0->m_len;
m0->m_flags &= ~(M_EXT|M_EXTWR); /* cluster is gone */
m_free(m0);
return (0);
}
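/*
* A typical caller (a sketch, not taken from this file): a driver whose
* DMA engine cannot handle a long scatter/gather list may retry a
* failed map load after collapsing the chain, e.g.
*
*   if (bus_dmamap_load_mbuf(t, map, m, BUS_DMA_NOWAIT) == EFBIG &&
*       m_defrag(m, M_DONTWAIT) == 0)
*           retry the load;
*
* The mbuf pointer itself stays valid across the call, which is why the
* bounce copy above goes back into the original mbuf.
*/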
/*
* Mbuffer utility routines.
*/
/*
* Ensure len bytes of contiguous space at the beginning of the mbuf chain
*/
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
struct mbuf *mn;
if (len > MHLEN)
panic("mbuf prepend length too big");
if (m_leadingspace(m) >= len) {
m->m_data -= len;
m->m_len += len;
} else {
MGET(mn, how, m->m_type);
if (mn == NULL) {
m_freem(m);
return (NULL);
}
if (m->m_flags & M_PKTHDR)
M_MOVE_PKTHDR(mn, m);
mn->m_next = m;
m = mn;
m_align(m, len);
m->m_len = len;
}
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len += len;
return (m);
}
/*
* Make a copy of an mbuf chain starting "off" bytes from the beginning,
* continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
* The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
*/
struct mbuf *
m_copym(struct mbuf *m0, int off, int len, int wait)
{
struct mbuf *m, *n, **np;
struct mbuf *top;
int copyhdr = 0;
if (off < 0 || len < 0)
panic("m_copym0: off %d, len %d", off, len); if (off == 0 && m0->m_flags & M_PKTHDR)
copyhdr = 1;
if ((m = m_getptr(m0, off, &off)) == NULL)
panic("m_copym0: short mbuf chain");
np = ⊤
top = NULL;
while (len > 0) {
if (m == NULL) {
if (len != M_COPYALL)
panic("m_copym0: m == NULL and not COPYALL");
break;
}
MGET(n, wait, m->m_type);
*np = n;
if (n == NULL)
goto nospace;
if (copyhdr) {
if (m_dup_pkthdr(n, m0, wait))
goto nospace;
if (len != M_COPYALL)
n->m_pkthdr.len = len;
copyhdr = 0;
}
n->m_len = min(len, m->m_len - off);
if (m->m_flags & M_EXT) {
n->m_data = m->m_data + off;
n->m_ext = m->m_ext;
MCLADDREFERENCE(m, n);
} else {
n->m_data += m->m_data -
(m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
n->m_data += off;
memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off,
n->m_len);
}
if (len != M_COPYALL)
len -= n->m_len;
off += n->m_len;
#ifdef DIAGNOSTIC
if (off > m->m_len)
panic("m_copym0 overrun");
#endif
if (off == m->m_len) {
m = m->m_next;
off = 0;
}
np = &n->m_next;
}
return (top);
nospace:
m_freem(top);
return (NULL);
}
/*
* Copy data from an mbuf chain starting "off" bytes from the beginning,
* continuing for "len" bytes, into the indicated buffer.
*/
void
m_copydata(struct mbuf *m, int off, int len, void *p)
{
caddr_t cp = p;
unsigned count;
if (off < 0)
panic("m_copydata: off %d < 0", off);
if (len < 0)
panic("m_copydata: len %d < 0", len);
if ((m = m_getptr(m, off, &off)) == NULL)
panic("m_copydata: short mbuf chain");
while (len > 0) {
if (m == NULL)
panic("m_copydata: null mbuf");
count = min(m->m_len - off, len);
memmove(cp, mtod(m, caddr_t) + off, count);
len -= count;
cp += count;
off = 0;
m = m->m_next;
}
}
/*
* Copy data from a buffer back into the indicated mbuf chain,
* starting "off" bytes from the beginning, extending the mbuf
* chain if necessary. The mbuf needs to be properly initialized
* including the setting of m_len.
*/
int
m_copyback(struct mbuf *m0, int off, int len, const void *_cp, int wait)
{
int mlen, totlen = 0;
struct mbuf *m = m0, *n;
caddr_t cp = (caddr_t)_cp;
int error = 0;
if (m0 == NULL)
return (0);
while (off > (mlen = m->m_len)) {
off -= mlen;
totlen += mlen;
if (m->m_next == NULL) {
if ((n = m_get(wait, m->m_type)) == NULL) {
error = ENOBUFS;
goto out;
}
if (off + len > MLEN) {
MCLGETL(n, wait, off + len);
if (!(n->m_flags & M_EXT)) {
m_free(n);
error = ENOBUFS;
goto out;
}
}
memset(mtod(n, caddr_t), 0, off);
n->m_len = len + off;
m->m_next = n;
}
m = m->m_next;
}
while (len > 0) {
/* extend last packet to be filled fully */
if (m->m_next == NULL && (len > m->m_len - off))
m->m_len += min(len - (m->m_len - off),
m_trailingspace(m));
mlen = min(m->m_len - off, len);
memmove(mtod(m, caddr_t) + off, cp, mlen);
cp += mlen;
len -= mlen;
totlen += mlen + off;
if (len == 0)
break;
off = 0;
if (m->m_next == NULL) {
if ((n = m_get(wait, m->m_type)) == NULL) {
error = ENOBUFS;
goto out;
}
if (len > MLEN) {
MCLGETL(n, wait, len);
if (!(n->m_flags & M_EXT)) {
m_free(n);
error = ENOBUFS;
goto out;
}
}
n->m_len = len;
m->m_next = n;
}
m = m->m_next;
}
out:
if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
m->m_pkthdr.len = totlen;
return (error);
}
/*
* Concatenate mbuf chain n to m.
* n might be copied into m (when n->m_len is small), therefore the data
* portion of n could be copied into an mbuf of a different mbuf type.
* Therefore both chains should be of the same type (e.g. MT_DATA).
* Any m_pkthdr is not updated.
*/
void
m_cat(struct mbuf *m, struct mbuf *n)
{
while (m->m_next)
m = m->m_next;
while (n) {
if (M_READONLY(m) || n->m_len > m_trailingspace(m)) {
/* just join the two chains */
m->m_next = n;
return;
}
/* splat the data from one into the other */
memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
n->m_len);
m->m_len += n->m_len;
n = m_free(n);
}
}
void
m_adj(struct mbuf *mp, int req_len)
{
int len = req_len;
struct mbuf *m;
int count;
if (mp == NULL)
return;
if (len >= 0) {
/*
* Trim from head.
*/
m = mp;
while (m != NULL && len > 0) {
if (m->m_len <= len) {
len -= m->m_len;
m->m_data += m->m_len;
m->m_len = 0;
m = m->m_next;
} else {
m->m_data += len;
m->m_len -= len;
len = 0;
}
}
if (mp->m_flags & M_PKTHDR)
mp->m_pkthdr.len -= (req_len - len);
} else {
/*
* Trim from tail. Scan the mbuf chain,
* calculating its length and finding the last mbuf.
* If the adjustment only affects this mbuf, then just
* adjust and return. Otherwise, rescan and truncate
* after the remaining size.
*/
len = -len;
count = 0;
m = mp;
for (;;) {
count += m->m_len;
if (m->m_next == NULL)
break;
m = m->m_next;
}
if (m->m_len >= len) {
m->m_len -= len;
if (mp->m_flags & M_PKTHDR)
mp->m_pkthdr.len -= len;
return;
}
count -= len;
if (count < 0)
count = 0;
/*
* Correct length for chain is "count".
* Find the mbuf with last data, adjust its length,
* and toss data from remaining mbufs on chain.
*/
if (mp->m_flags & M_PKTHDR)
mp->m_pkthdr.len = count;
m = mp;
for (;;) {
if (m->m_len >= count) {
m->m_len = count;
break;
}
count -= m->m_len;
m = m->m_next;
}
while ((m = m->m_next) != NULL)
m->m_len = 0;
}
}
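/*
* For example, m_adj(m, ETHER_HDR_LEN) trims a 14-byte Ethernet header
* off the front of a chain, while m_adj(m, -ETHER_CRC_LEN) drops a
* trailing 4-byte FCS; when the chain carries a packet header,
* m_pkthdr.len is adjusted to match in both cases.
*/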
/*
* Rearrange an mbuf chain so that len bytes are contiguous
* and in the data area of an mbuf (so that mtod will work
* for a structure of size len). Returns the resulting
* mbuf chain on success, frees it and returns null on failure.
*/
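/*
* The usual caller pattern (compare ipv4_check() above) is roughly:
*
*   if (m->m_len < sizeof(struct ip) &&
*       (m = m_pullup(m, sizeof(struct ip))) == NULL)
*           return;
*
* with no further cleanup needed on failure, since the chain has already
* been freed.
*/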
struct mbuf *
m_pullup(struct mbuf *m0, int len)
{
struct mbuf *m;
unsigned int adj;
caddr_t head, tail;
unsigned int space;
/* if len is already contig in m0, then don't do any work */
if (len <= m0->m_len)
return (m0);
/* look for some data */
m = m0->m_next;
if (m == NULL)
goto freem0;
head = M_DATABUF(m0);
if (m0->m_len == 0) {
while (m->m_len == 0) {
m = m_free(m);
if (m == NULL)
goto freem0;
}
adj = mtod(m, unsigned long) & (sizeof(long) - 1);
} else
adj = mtod(m0, unsigned long) & (sizeof(long) - 1);
tail = head + M_SIZE(m0);
head += adj;
if (!M_READONLY(m0) && len <= tail - head) {
/* we can copy everything into the first mbuf */
if (m0->m_len == 0) {
m0->m_data = head;
} else if (len > tail - mtod(m0, caddr_t)) {
/* need to memmove to make space at the end */
memmove(head, mtod(m0, caddr_t), m0->m_len);
m0->m_data = head;
}
len -= m0->m_len;
} else {
/* the first mbuf is too small or read-only, make a new one */
space = adj + len;
if (space > MAXMCLBYTES)
goto bad;
m0->m_next = m;
m = m0;
MGET(m0, M_DONTWAIT, m->m_type);
if (m0 == NULL)
goto bad;
if (space > MHLEN) {
MCLGETL(m0, M_DONTWAIT, space);
if ((m0->m_flags & M_EXT) == 0)
goto bad;
}
if (m->m_flags & M_PKTHDR)
M_MOVE_PKTHDR(m0, m);
m0->m_len = 0;
m0->m_data += adj;
}
KDASSERT(m_trailingspace(m0) >= len);
for (;;) {
space = min(len, m->m_len);
memcpy(mtod(m0, caddr_t) + m0->m_len, mtod(m, caddr_t), space);
len -= space;
m0->m_len += space;
m->m_len -= space;
if (m->m_len > 0)
m->m_data += space;
else
m = m_free(m);
if (len == 0)
break;
if (m == NULL)
goto bad;
}
m0->m_next = m; /* link the chain back up */
return (m0);
bad:
m_freem(m);
freem0:
m_free(m0);
return (NULL);
}
/*
* Return a pointer to mbuf/offset of location in mbuf chain.
*/
struct mbuf *
m_getptr(struct mbuf *m, int loc, int *off)
{
while (loc >= 0) {
/* Normal end of search */
if (m->m_len > loc) {
*off = loc;
return (m);
} else {
loc -= m->m_len;
if (m->m_next == NULL) {
if (loc == 0) {
/* Point at the end of valid data */
*off = m->m_len;
return (m);
} else {
return (NULL);
}
} else {
m = m->m_next;
}
}
}
return (NULL);
}
/*
* Partition an mbuf chain in two pieces, returning the tail --
* all but the first len0 bytes. In case of failure, it returns NULL and
* attempts to restore the chain to its original state.
*/
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
struct mbuf *m, *n;
unsigned len = len0, remain, olen;
for (m = m0; m && len > m->m_len; m = m->m_next)
len -= m->m_len;
if (m == NULL)
return (NULL);
remain = m->m_len - len;
if (m0->m_flags & M_PKTHDR) {
MGETHDR(n, wait, m0->m_type);
if (n == NULL)
return (NULL);
if (m_dup_pkthdr(n, m0, wait)) {
m_freem(n);
return (NULL);
}
n->m_pkthdr.len -= len0;
olen = m0->m_pkthdr.len;
m0->m_pkthdr.len = len0;
if (remain == 0) {
n->m_next = m->m_next;
m->m_next = NULL;
n->m_len = 0;
return (n);
}
if (m->m_flags & M_EXT)
goto extpacket;
if (remain > MHLEN) {
/* m can't be the lead packet */
m_align(n, 0);
n->m_next = m_split(m, len, wait);
if (n->m_next == NULL) {
(void) m_free(n);
m0->m_pkthdr.len = olen;
return (NULL);
} else {
n->m_len = 0;
return (n);
}
} else
m_align(n, remain);
} else if (remain == 0) {
n = m->m_next;
m->m_next = NULL;
return (n);
} else {
MGET(n, wait, m->m_type);
if (n == NULL)
return (NULL);
m_align(n, remain);
}
extpacket:
if (m->m_flags & M_EXT) {
n->m_ext = m->m_ext;
MCLADDREFERENCE(m, n);
n->m_data = m->m_data + len;
} else {
memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + len, remain);
}
n->m_len = remain;
m->m_len = len;
n->m_next = m->m_next;
m->m_next = NULL;
return (n);
}
/*
* Make space for a new header of length hlen at skip bytes
* into the packet. When doing this we allocate new mbufs only
* when absolutely necessary. The mbuf where the new header
* is to go is returned together with an offset into the mbuf.
* If NULL is returned then the mbuf chain may have been modified;
* the caller is assumed to always free the chain.
*/
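/*
* Typical use (e.g. in IPsec output) is to open a gap for a protocol
* header in the middle of an existing packet: on success the header can
* be written at mtod(m, caddr_t) + *off of the returned mbuf; on failure
* the caller still owns and must free the original chain.
*/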
struct mbuf *
m_makespace(struct mbuf *m0, int skip, int hlen, int *off)
{
struct mbuf *m;
unsigned remain;
KASSERT(m0->m_flags & M_PKTHDR);
/*
* Limit the size of the new header to MHLEN. In case
* skip = 0 and the first buffer is not a cluster this
* is the maximum space available in that mbuf.
* In other words this code never prepends a mbuf.
*/
KASSERT(hlen < MHLEN);
for (m = m0; m && skip > m->m_len; m = m->m_next)
skip -= m->m_len;
if (m == NULL)
return (NULL);
/*
* At this point skip is the offset into the mbuf m
* where the new header should be placed. Figure out
* if there's space to insert the new header. If so,
* and copying the remainder makes sense then do so.
* Otherwise insert a new mbuf in the chain, splitting
* the contents of m as needed.
*/
remain = m->m_len - skip; /* data to move */
if (skip < remain && hlen <= m_leadingspace(m)) {
if (skip)
memmove(m->m_data-hlen, m->m_data, skip);
m->m_data -= hlen;
m->m_len += hlen;
*off = skip;
} else if (hlen > m_trailingspace(m)) {
struct mbuf *n;
if (remain > 0) {
MGET(n, M_DONTWAIT, m->m_type);
if (n && remain > MLEN) {
MCLGETL(n, M_DONTWAIT, remain);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (n == NULL)
return (NULL);
memcpy(n->m_data, mtod(m, char *) + skip, remain);
n->m_len = remain;
m->m_len -= remain;
n->m_next = m->m_next;
m->m_next = n;
}
if (hlen <= m_trailingspace(m)) {
m->m_len += hlen;
*off = skip;
} else {
n = m_get(M_DONTWAIT, m->m_type);
if (n == NULL)
return NULL;
n->m_len = hlen;
n->m_next = m->m_next;
m->m_next = n;
*off = 0; /* header is at front ... */
m = n; /* ... of new mbuf */
}
} else {
/*
* Copy the remainder to the back of the mbuf
* so there's space to write the new header.
*/
if (remain > 0)
memmove(mtod(m, caddr_t) + skip + hlen,
mtod(m, caddr_t) + skip, remain);
m->m_len += hlen;
*off = skip;
}
m0->m_pkthdr.len += hlen; /* adjust packet length */
return m;
}
/*
* Routine to copy from device local memory into mbufs.
*/
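/*
* Drivers use this to copy a received frame of totlen bytes from device
* memory at buf into a fresh packet; off (at most MHLEN) advances the
* data pointer of the first mbuf, leaving that much leading space.
*/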
struct mbuf *
m_devget(char *buf, int totlen, int off)
{
struct mbuf *m;
struct mbuf *top, **mp;
int len;
top = NULL;
mp = &top;
if (off < 0 || off > MHLEN)
return (NULL);
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL)
return (NULL);
m->m_pkthdr.len = totlen;
len = MHLEN;
while (totlen > 0) {
if (top != NULL) {
MGET(m, M_DONTWAIT, MT_DATA);
if (m == NULL) {
/*
* As we might get called by pfkey, make sure
* we do not leak sensitive data.
*/
top->m_flags |= M_ZEROIZE;
m_freem(top);
return (NULL);
}
len = MLEN;
}
if (totlen + off >= MINCLSIZE) {
MCLGET(m, M_DONTWAIT);
if (m->m_flags & M_EXT)
len = MCLBYTES;
} else {
/* Place initial small packet/header at end of mbuf. */
if (top == NULL && totlen + off + max_linkhdr <= len) {
m->m_data += max_linkhdr;
len -= max_linkhdr;
}
}
if (off) {
m->m_data += off;
len -= off;
off = 0;
}
m->m_len = len = min(totlen, len);
memcpy(mtod(m, void *), buf, (size_t)len);
buf += len;
*mp = m;
mp = &m->m_next;
totlen -= len;
}
return (top);
}
void
m_zero(struct mbuf *m)
{
if (M_READONLY(m)) {
mtx_enter(&m_extref_mtx);
if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
}
mtx_leave(&m_extref_mtx);
return;
}
explicit_bzero(M_DATABUF(m), M_SIZE(m));
}
/*
* Apply function f to the data in an mbuf chain starting "off" bytes from the
* beginning, continuing for "len" bytes.
*/
int
m_apply(struct mbuf *m, int off, int len,
int (*f)(caddr_t, caddr_t, unsigned int), caddr_t fstate)
{
int rval;
unsigned int count;
if (len < 0)
panic("m_apply: len %d < 0", len);
if (off < 0)
panic("m_apply: off %d < 0", off);
while (off > 0) {
if (m == NULL)
panic("m_apply: null mbuf in skip");
if (off < m->m_len)
break;
off -= m->m_len;
m = m->m_next;
}
while (len > 0) {
if (m == NULL)
panic("m_apply: null mbuf");
count = min(m->m_len - off, len);
rval = f(fstate, mtod(m, caddr_t) + off, count);
if (rval)
return (rval);
len -= count;
off = 0;
m = m->m_next;
}
return (0);
}
/*
* Compute the amount of space available before the current start of data
* in an mbuf. Read-only clusters never have space available.
*/
int
m_leadingspace(struct mbuf *m)
{
if (M_READONLY(m))
return 0;
KASSERT(m->m_data >= M_DATABUF(m));
return m->m_data - M_DATABUF(m);
}
/*
* Compute the amount of space available after the end of data in an mbuf.
* Read-only clusters never have space available.
*/
int
m_trailingspace(struct mbuf *m)
{
if (M_READONLY(m))
return 0;
KASSERT(M_DATABUF(m) + M_SIZE(m) >= (m->m_data + m->m_len));
return M_DATABUF(m) + M_SIZE(m) - (m->m_data + m->m_len);
}
/*
* Set the m_data pointer of a newly-allocated mbuf to place an object of
* the specified size at the end of the mbuf, longword aligned.
*/
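/*
* Example: for a 200-byte object in a plain mbuf (M_SIZE(m) == MLEN),
* m_data is set to M_DATABUF(m) + ((MLEN - 200) & ~(sizeof(long) - 1)),
* i.e. the object ends at (or just before) the end of the buffer.
*/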
void
m_align(struct mbuf *m, int len)
{
KASSERT(len >= 0 && !M_READONLY(m));
KASSERT(m->m_data == M_DATABUF(m)); /* newly-allocated check */
KASSERT(((len + sizeof(long) - 1) &~ (sizeof(long) - 1)) <= M_SIZE(m));
m->m_data = M_DATABUF(m) + ((M_SIZE(m) - (len)) &~ (sizeof(long) - 1));
}
/*
* Duplicate mbuf pkthdr from from to to.
* from must have M_PKTHDR set, and to must be empty.
*/
int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int wait)
{
int error;
KASSERT(from->m_flags & M_PKTHDR);
to->m_flags = (to->m_flags & (M_EXT | M_EXTWR));
to->m_flags |= (from->m_flags & M_COPYFLAGS);
to->m_pkthdr = from->m_pkthdr;
#if NPF > 0
to->m_pkthdr.pf.statekey = NULL;
pf_mbuf_link_state_key(to, from->m_pkthdr.pf.statekey);
to->m_pkthdr.pf.inp = NULL;
pf_mbuf_link_inpcb(to, from->m_pkthdr.pf.inp);
#endif /* NPF > 0 */
SLIST_INIT(&to->m_pkthdr.ph_tags);
if ((error = m_tag_copy_chain(to, from, wait)) != 0)
return (error);
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
return (0);
}
struct mbuf *
m_dup_pkt(struct mbuf *m0, unsigned int adj, int wait)
{
struct mbuf *m;
int len;
KASSERT(m0->m_flags & M_PKTHDR);
len = m0->m_pkthdr.len + adj;
if (len > MAXMCLBYTES) /* XXX */
return (NULL);
m = m_get(wait, m0->m_type);
if (m == NULL)
return (NULL);
if (m_dup_pkthdr(m, m0, wait) != 0)
goto fail;
if (len > MHLEN) {
MCLGETL(m, wait, len);
if (!ISSET(m->m_flags, M_EXT))
goto fail;
}
m->m_len = m->m_pkthdr.len = len;
m_adj(m, adj);
m_copydata(m0, 0, m0->m_pkthdr.len, mtod(m, caddr_t));
return (m);
fail:
m_freem(m);
return (NULL);
}
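/*
* Return the packet's timestamp: if the mbuf carries one (M_TIMESTAMP,
* nanoseconds since boot in ph_timestamp), convert it to wall-clock time
* by adding the boot time; otherwise fall back to the current time.
*/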
void
m_microtime(const struct mbuf *m, struct timeval *tv)
{
if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP)) {
struct timeval btv, utv;
NSEC_TO_TIMEVAL(m->m_pkthdr.ph_timestamp, &utv);
microboottime(&btv);
timeradd(&btv, &utv, tv);
} else
microtime(tv);
}
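/*
* Pool backend for the mbuf pools: pages are accounted against
* mbuf_mem_limit in mbuf_mem_alloc, and allocations that would exceed the
* limit fail (the speculative increment is backed out on failure).
*/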
void *
m_pool_alloc(struct pool *pp, int flags, int *slowdown)
{
void *v;
if (atomic_add_long_nv(&mbuf_mem_alloc, pp->pr_pgsize) > mbuf_mem_limit)
goto fail;
v = (*pool_allocator_multi.pa_alloc)(pp, flags, slowdown);
if (v != NULL)
return (v);
fail:
atomic_sub_long(&mbuf_mem_alloc, pp->pr_pgsize);
return (NULL);
}
void
m_pool_free(struct pool *pp, void *v)
{
(*pool_allocator_multi.pa_free)(pp, v);
atomic_sub_long(&mbuf_mem_alloc, pp->pr_pgsize);
}
void
m_pool_init(struct pool *pp, u_int size, u_int align, const char *wmesg)
{
pool_init(pp, size, align, IPL_NET, 0, wmesg, &m_pool_allocator);
pool_set_constraints(pp, &kp_dma_contig);
}
u_int
m_pool_used(void)
{
return ((mbuf_mem_alloc * 100) / mbuf_mem_limit);
}
#ifdef DDB
void
m_print(void *v,
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
struct mbuf *m = v;
(*pr)("mbuf %p\n", m);
(*pr)("m_type: %i\tm_flags: %b\n", m->m_type, m->m_flags, M_BITS);
(*pr)("m_next: %p\tm_nextpkt: %p\n", m->m_next, m->m_nextpkt);
(*pr)("m_data: %p\tm_len: %u\n", m->m_data, m->m_len);
(*pr)("m_dat: %p\tm_pktdat: %p\n", m->m_dat, m->m_pktdat);
if (m->m_flags & M_PKTHDR) {
(*pr)("m_ptkhdr.ph_ifidx: %u\tm_pkthdr.len: %i\n",
m->m_pkthdr.ph_ifidx, m->m_pkthdr.len);
(*pr)("m_ptkhdr.ph_tags: %p\tm_pkthdr.ph_tagsset: %b\n",
SLIST_FIRST(&m->m_pkthdr.ph_tags),
m->m_pkthdr.ph_tagsset, MTAG_BITS);
(*pr)("m_pkthdr.ph_flowid: %u\tm_pkthdr.ph_loopcnt: %u\n",
m->m_pkthdr.ph_flowid, m->m_pkthdr.ph_loopcnt);
(*pr)("m_pkthdr.csum_flags: %b\n",
m->m_pkthdr.csum_flags, MCS_BITS);
(*pr)("m_pkthdr.ether_vtag: %u\tm_ptkhdr.ph_rtableid: %u\n",
m->m_pkthdr.ether_vtag, m->m_pkthdr.ph_rtableid);
(*pr)("m_pkthdr.pf.statekey: %p\tm_pkthdr.pf.inp %p\n",
m->m_pkthdr.pf.statekey, m->m_pkthdr.pf.inp);
(*pr)("m_pkthdr.pf.qid: %u\tm_pkthdr.pf.tag: %u\n",
m->m_pkthdr.pf.qid, m->m_pkthdr.pf.tag);
(*pr)("m_pkthdr.pf.flags: %b\n",
m->m_pkthdr.pf.flags, MPF_BITS);
(*pr)("m_pkthdr.pf.routed: %u\tm_pkthdr.pf.prio: %u\n",
m->m_pkthdr.pf.routed, m->m_pkthdr.pf.prio);
}
if (m->m_flags & M_EXT) {
(*pr)("m_ext.ext_buf: %p\tm_ext.ext_size: %u\n",
m->m_ext.ext_buf, m->m_ext.ext_size);
(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
m->m_ext.ext_free_fn, m->m_ext.ext_arg);
(*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
m->m_ext.ext_nextref, m->m_ext.ext_prevref);
}
}
#endif
/*
* mbuf lists
*/
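/*
* An mbuf_list is a simple singly-linked list of packets chained through
* m_nextpkt, with head/tail pointers and a length count but no locking;
* callers provide their own serialization (e.g. an mbuf_queue below).
*/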
void
ml_init(struct mbuf_list *ml)
{
ml->ml_head = ml->ml_tail = NULL;
ml->ml_len = 0;
}
void
ml_enqueue(struct mbuf_list *ml, struct mbuf *m)
{
if (ml->ml_tail == NULL)
ml->ml_head = ml->ml_tail = m;
else {
ml->ml_tail->m_nextpkt = m;
ml->ml_tail = m;
}
m->m_nextpkt = NULL;
ml->ml_len++;
}
void
ml_enlist(struct mbuf_list *mla, struct mbuf_list *mlb)
{
if (!ml_empty(mlb)) {
if (ml_empty(mla))
mla->ml_head = mlb->ml_head;
else
mla->ml_tail->m_nextpkt = mlb->ml_head;
mla->ml_tail = mlb->ml_tail;
mla->ml_len += mlb->ml_len;
ml_init(mlb);
}
}
struct mbuf *
ml_dequeue(struct mbuf_list *ml)
{
struct mbuf *m;
m = ml->ml_head;
if (m != NULL) {
ml->ml_head = m->m_nextpkt;
if (ml->ml_head == NULL)
ml->ml_tail = NULL;
m->m_nextpkt = NULL;
ml->ml_len--;
}
return (m);
}
struct mbuf *
ml_dechain(struct mbuf_list *ml)
{
struct mbuf *m0;
m0 = ml->ml_head;
ml_init(ml);
return (m0);
}
unsigned int
ml_purge(struct mbuf_list *ml)
{
struct mbuf *m, *n;
unsigned int len;
for (m = ml->ml_head; m != NULL; m = n) {
n = m->m_nextpkt;
m_freem(m);
}
len = ml->ml_len;
ml_init(ml);
return (len);
}
unsigned int
ml_hdatalen(struct mbuf_list *ml)
{
struct mbuf *m;
m = ml->ml_head;
if (m == NULL)
return (0);
KASSERT(ISSET(m->m_flags, M_PKTHDR));
return (m->m_pkthdr.len);
}
/*
* mbuf queues
*/
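/*
* An mbuf_queue is an mbuf_list protected by a mutex and bounded by
* mq_maxlen: mq_enqueue() drops the new packet when the queue is full,
* while mq_push() makes room by dropping the oldest one.
*/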
void
mq_init(struct mbuf_queue *mq, u_int maxlen, int ipl)
{
mtx_init(&mq->mq_mtx, ipl);
ml_init(&mq->mq_list);
mq->mq_maxlen = maxlen;
}
int
mq_push(struct mbuf_queue *mq, struct mbuf *m)
{
struct mbuf *dropped = NULL;
mtx_enter(&mq->mq_mtx);
if (mq_len(mq) >= mq->mq_maxlen) {
mq->mq_drops++;
dropped = ml_dequeue(&mq->mq_list);
}
ml_enqueue(&mq->mq_list, m);
mtx_leave(&mq->mq_mtx);
if (dropped)
m_freem(dropped);
return (dropped != NULL);
}
int
mq_enqueue(struct mbuf_queue *mq, struct mbuf *m)
{
int dropped = 0;
mtx_enter(&mq->mq_mtx);
if (mq_len(mq) < mq->mq_maxlen)
ml_enqueue(&mq->mq_list, m);
else {
mq->mq_drops++;
dropped = 1;
}
mtx_leave(&mq->mq_mtx);
if (dropped)
m_freem(m);
return (dropped);
}
struct mbuf *
mq_dequeue(struct mbuf_queue *mq)
{
struct mbuf *m;
mtx_enter(&mq->mq_mtx);
m = ml_dequeue(&mq->mq_list);
mtx_leave(&mq->mq_mtx);
return (m);
}
int
mq_enlist(struct mbuf_queue *mq, struct mbuf_list *ml)
{
struct mbuf *m;
int dropped = 0;
mtx_enter(&mq->mq_mtx);
if (mq_len(mq) < mq->mq_maxlen)
ml_enlist(&mq->mq_list, ml);
else {
dropped = ml_len(ml);
mq->mq_drops += dropped;
}
mtx_leave(&mq->mq_mtx);
if (dropped) {
while ((m = ml_dequeue(ml)) != NULL)
m_freem(m);
}
return (dropped);
}
void
mq_delist(struct mbuf_queue *mq, struct mbuf_list *ml)
{
mtx_enter(&mq->mq_mtx);
*ml = mq->mq_list;
ml_init(&mq->mq_list);
mtx_leave(&mq->mq_mtx);
}
struct mbuf *
mq_dechain(struct mbuf_queue *mq)
{
struct mbuf *m0;
mtx_enter(&mq->mq_mtx);
m0 = ml_dechain(&mq->mq_list);
mtx_leave(&mq->mq_mtx);
return (m0);
}
unsigned int
mq_purge(struct mbuf_queue *mq)
{
struct mbuf_list ml;
mq_delist(mq, &ml);
return (ml_purge(&ml));
}
unsigned int
mq_hdatalen(struct mbuf_queue *mq)
{
unsigned int hdatalen;
mtx_enter(&mq->mq_mtx);
hdatalen = ml_hdatalen(&mq->mq_list);
mtx_leave(&mq->mq_mtx);
return (hdatalen);
}
int
sysctl_mq(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen, struct mbuf_queue *mq)
{
unsigned int maxlen;
int error;
/* All sysctl names at this level are terminal. */
if (namelen != 1)
return (ENOTDIR);
switch (name[0]) {
case IFQCTL_LEN:
return (sysctl_rdint(oldp, oldlenp, newp, mq_len(mq)));
case IFQCTL_MAXLEN:
maxlen = mq->mq_maxlen;
error = sysctl_int(oldp, oldlenp, newp, newlen, &maxlen);
if (!error && maxlen != mq->mq_maxlen) {
mtx_enter(&mq->mq_mtx);
mq->mq_maxlen = maxlen;
mtx_leave(&mq->mq_mtx);
}
return (error);
case IFQCTL_DROPS:
return (sysctl_rdint(oldp, oldlenp, newp, mq_drops(mq)));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
/* $OpenBSD: exec_elf.c,v 1.168 2022/08/29 16:53:46 deraadt Exp $ */
/*
* Copyright (c) 1996 Per Fogelstrom
* All rights reserved.
*
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
/*
* Copyright (c) 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Jason R. Thorpe for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/core.h>
#include <sys/exec.h>
#include <sys/exec_elf.h>
#include <sys/fcntl.h>
#include <sys/ptrace.h>
#include <sys/signalvar.h>
#include <sys/pledge.h>
#include <sys/mman.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
#include <machine/exec.h>
int elf_load_file(struct proc *, char *, struct exec_package *,
struct elf_args *);
int elf_check_header(Elf_Ehdr *);
int elf_read_from(struct proc *, struct vnode *, u_long, void *, int);
void elf_load_psection(struct exec_vmcmd_set *, struct vnode *,
Elf_Phdr *, Elf_Addr *, Elf_Addr *, int *, int);
int elf_os_pt_note_name(Elf_Note *);
int elf_os_pt_note(struct proc *, struct exec_package *, Elf_Ehdr *, int *);
/* round up and down to page boundaries. */
#define ELF_ROUND(a, b) (((a) + (b) - 1) & ~((b) - 1))
#define ELF_TRUNC(a, b) ((a) & ~((b) - 1))
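/*
* Both macros assume b is a power of two, e.g.
* ELF_ROUND(0x1234, 0x1000) == 0x2000 and ELF_TRUNC(0x1234, 0x1000) == 0x1000.
*/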
/*
* We limit the number of program headers to 32; this should
* be a reasonable limit for ELF, as the most we have seen so far is 12.
*/
#define ELF_MAX_VALID_PHDR 32
#define ELF_NOTE_NAME_OPENBSD 0x01
struct elf_note_name {
char *name;
int id;
} elf_note_names[] = {
{ "OpenBSD", ELF_NOTE_NAME_OPENBSD },
};
#define ELFROUNDSIZE sizeof(Elf_Word)
#define elfround(x) roundup((x), ELFROUNDSIZE)
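/*
* Note name and descriptor sizes are padded to ELFROUNDSIZE, e.g. with a
* 4-byte Elf_Word, elfround(5) == 8.
*/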
/*
* Check header for validity; return 0 for ok, ENOEXEC if error
*/
int
elf_check_header(Elf_Ehdr *ehdr)
{
/*
* We need to check magic, class size, endianness, and version before
* we look at the rest of the Elf_Ehdr structure. These few elements
* are represented in a machine independent fashion.
*/
if (!IS_ELF(*ehdr) ||
ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
ehdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
ehdr->e_ident[EI_VERSION] != ELF_TARG_VER)
return (ENOEXEC);
/* Now check the machine dependent header */
if (ehdr->e_machine != ELF_TARG_MACH ||
ehdr->e_version != ELF_TARG_VER)
return (ENOEXEC);
/* Don't allow an insane amount of sections. */
if (ehdr->e_phnum > ELF_MAX_VALID_PHDR)
return (ENOEXEC);
return (0);
}
/*
* Load a psection at the appropriate address
*/
void
elf_load_psection(struct exec_vmcmd_set *vcset, struct vnode *vp,
Elf_Phdr *ph, Elf_Addr *addr, Elf_Addr *size, int *prot, int flags)
{
u_long msize, lsize, psize, rm, rf;
long diff, offset, bdiff;
Elf_Addr base;
/*
* If the user specified an address, then we load there.
*/
if (*addr != ELF_NO_ADDR) {
if (ph->p_align > 1) {
*addr = ELF_TRUNC(*addr, ph->p_align);
diff = ph->p_vaddr - ELF_TRUNC(ph->p_vaddr, ph->p_align);
/* page align vaddr */
base = *addr + trunc_page(ph->p_vaddr)
- ELF_TRUNC(ph->p_vaddr, ph->p_align);
} else {
diff = 0;
base = *addr + trunc_page(ph->p_vaddr) - ph->p_vaddr;
}
} else {
*addr = ph->p_vaddr;
if (ph->p_align > 1)
*addr = ELF_TRUNC(*addr, ph->p_align);
base = trunc_page(ph->p_vaddr);
diff = ph->p_vaddr - *addr;
}
bdiff = ph->p_vaddr - trunc_page(ph->p_vaddr);
/*
* Enforce W^X and map W|X segments without X permission
* initially. The dynamic linker will make these read-only
* and add back X permission after relocation processing.
* Static executables with W|X segments will probably crash.
*/
*prot |= (ph->p_flags & PF_R) ? PROT_READ : 0;
*prot |= (ph->p_flags & PF_W) ? PROT_WRITE : 0;
if ((ph->p_flags & PF_W) == 0)
*prot |= (ph->p_flags & PF_X) ? PROT_EXEC : 0;
msize = ph->p_memsz + diff;
offset = ph->p_offset - bdiff;
lsize = ph->p_filesz + bdiff;
psize = round_page(lsize);
/*
* Because the pagedvn pager can't handle zero fill of the last
* data page if it's not page aligned, we map the last page readvn.
*/
if (ph->p_flags & PF_W) {
psize = trunc_page(lsize);
if (psize > 0)
NEW_VMCMD2(vcset, vmcmd_map_pagedvn, psize, base, vp,
offset, *prot, flags);
if (psize != lsize) {
NEW_VMCMD2(vcset, vmcmd_map_readvn, lsize - psize,
base + psize, vp, offset + psize, *prot, flags);
}
} else {
NEW_VMCMD2(vcset, vmcmd_map_pagedvn, psize, base, vp, offset,
*prot, flags);
}
/*
* Check if we need to extend the size of the segment
*/
rm = round_page(*addr + ph->p_memsz + diff);
rf = round_page(*addr + ph->p_filesz + diff);
if (rm != rf) {
NEW_VMCMD2(vcset, vmcmd_map_zero, rm - rf, rf, NULLVP, 0,
*prot, flags);
}
*size = msize;
}
/*
* Read from vnode into buffer at offset.
*/
int
elf_read_from(struct proc *p, struct vnode *vp, u_long off, void *buf,
int size)
{
int error;
size_t resid;
if ((error = vn_rdwr(UIO_READ, vp, buf, size, off, UIO_SYSSPACE,
0, p->p_ucred, &resid, p)) != 0)
return error;
/*
* See if we got all of it
*/
if (resid != 0)
return (ENOEXEC);
return (0);
}
/*
* Load a file (interpreter/library) pointed to by path [stolen from
* coff_load_shlib()]. Made slightly generic so it might be used externally.
*/
int
elf_load_file(struct proc *p, char *path, struct exec_package *epp,
struct elf_args *ap)
{
int error, i;
struct nameidata nd;
Elf_Ehdr eh;
Elf_Phdr *ph = NULL;
u_long phsize = 0;
Elf_Addr addr;
struct vnode *vp;
Elf_Phdr *base_ph = NULL;
struct interp_ld_sec {
Elf_Addr vaddr;
u_long memsz;
} loadmap[ELF_MAX_VALID_PHDR];
int nload, idx = 0;
Elf_Addr pos;
int file_align;
int loop;
size_t randomizequota = ELF_RANDOMIZE_LIMIT;
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p);
nd.ni_pledge = PLEDGE_RPATH;
nd.ni_unveil = UNVEIL_READ;
if ((error = namei(&nd)) != 0) {
return (error);
}
vp = nd.ni_vp;
if (vp->v_type != VREG) {
error = EACCES;
goto bad;
}
if ((error = VOP_GETATTR(vp, epp->ep_vap, p->p_ucred, p)) != 0)
goto bad;
if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
error = EACCES;
goto bad;
}
if ((error = VOP_ACCESS(vp, VREAD, p->p_ucred, p)) != 0)
goto bad1;
if ((error = elf_read_from(p, nd.ni_vp, 0, &eh, sizeof(eh))) != 0)
goto bad1;
if (elf_check_header(&eh) || eh.e_type != ET_DYN) {
error = ENOEXEC;
goto bad1;
}
ph = mallocarray(eh.e_phnum, sizeof(Elf_Phdr), M_TEMP, M_WAITOK);
phsize = eh.e_phnum * sizeof(Elf_Phdr);
if ((error = elf_read_from(p, nd.ni_vp, eh.e_phoff, ph, phsize)) != 0)
goto bad1;
for (i = 0; i < eh.e_phnum; i++) {
if (ph[i].p_type == PT_LOAD) {
if (ph[i].p_filesz > ph[i].p_memsz ||
ph[i].p_memsz == 0) {
error = EINVAL;
goto bad1;
}
loadmap[idx].vaddr = trunc_page(ph[i].p_vaddr);
loadmap[idx].memsz = round_page (ph[i].p_vaddr +
ph[i].p_memsz - loadmap[idx].vaddr);
file_align = ph[i].p_align;
idx++;
}
}
nload = idx;
/*
* Load the interpreter where a non-fixed mmap(NULL, ...)
* would (i.e. something safely out of the way).
*/
pos = uvm_map_hint(p->p_vmspace, PROT_EXEC, VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS);
pos = ELF_ROUND(pos, file_align);
loop = 0;
for (i = 0; i < nload;/**/) {
vaddr_t addr;
struct uvm_object *uobj;
off_t uoff;
size_t size;
#ifdef this_needs_fixing
if (i == 0) {
uobj = &vp->v_uvm.u_obj;
/* need to fix uoff */
} else {
#endif
uobj = NULL;
uoff = 0;
#ifdef this_needs_fixing
}
#endif
addr = trunc_page(pos + loadmap[i].vaddr);
size = round_page(addr + loadmap[i].memsz) - addr;
/* CRAP - map_findspace does not avoid daddr+BRKSIZ */
if ((addr + size > (vaddr_t)p->p_vmspace->vm_daddr) &&
(addr < (vaddr_t)p->p_vmspace->vm_daddr + BRKSIZ))
addr = round_page((vaddr_t)p->p_vmspace->vm_daddr +
BRKSIZ);
if (uvm_map_mquery(&p->p_vmspace->vm_map, &addr, size,
(i == 0 ? uoff : UVM_UNKNOWN_OFFSET), 0) != 0) {
if (loop == 0) {
loop = 1;
i = 0;
pos = 0;
continue;
}
error = ENOMEM;
goto bad1;
}
if (addr != pos + loadmap[i].vaddr) {
/* base changed. */
pos = addr - trunc_page(loadmap[i].vaddr);
pos = ELF_ROUND(pos,file_align);
i = 0;
continue;
}
i++;
}
/*
* Load all the necessary sections
*/
for (i = 0; i < eh.e_phnum; i++) {
Elf_Addr size = 0;
int prot = 0;
int flags;
switch (ph[i].p_type) {
case PT_LOAD:
if (base_ph == NULL) {
flags = VMCMD_BASE;
addr = pos;
base_ph = &ph[i];
} else {
flags = VMCMD_RELATIVE;
addr = ph[i].p_vaddr - base_ph->p_vaddr;
}
elf_load_psection(&epp->ep_vmcmds, nd.ni_vp,
&ph[i], &addr, &size, &prot, flags | VMCMD_SYSCALL);
/* If entry is within this section it must be text */
if (eh.e_entry >= ph[i].p_vaddr &&
eh.e_entry < (ph[i].p_vaddr + size)) {
epp->ep_entry = addr + eh.e_entry -
ELF_TRUNC(ph[i].p_vaddr,ph[i].p_align);
if (flags == VMCMD_RELATIVE)
epp->ep_entry += pos;
ap->arg_interp = pos;
}
addr += size;
break;
case PT_DYNAMIC:
case PT_PHDR:
case PT_NOTE:
break;
case PT_OPENBSD_RANDOMIZE:
if (ph[i].p_memsz > randomizequota) {
error = ENOMEM;
goto bad1;
}
randomizequota -= ph[i].p_memsz;
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_randomize,
ph[i].p_memsz, ph[i].p_vaddr + pos, NULLVP, 0, 0);
break;
default:
break;
}
}
vn_marktext(nd.ni_vp);
bad1:
VOP_CLOSE(nd.ni_vp, FREAD, p->p_ucred, p);
bad:
free(ph, M_TEMP, phsize);
vput(nd.ni_vp);
return (error);
}
/*
* Prepare an Elf binary's exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error out if
* this is not possible. Finally, set up vmcmds for the text, data, bss, and
* stack segments.
*/
int
exec_elf_makecmds(struct proc *p, struct exec_package *epp)
{
Elf_Ehdr *eh = epp->ep_hdr;
Elf_Phdr *ph, *pp, *base_ph = NULL;
Elf_Addr phdr = 0, exe_base = 0;
int error, i, has_phdr = 0, names = 0;
char *interp = NULL;
u_long phsize;
size_t randomizequota = ELF_RANDOMIZE_LIMIT;
if (epp->ep_hdrvalid < sizeof(Elf_Ehdr))
return (ENOEXEC);
if (elf_check_header(eh) || (eh->e_type != ET_EXEC && eh->e_type != ET_DYN))
return (ENOEXEC);
/*
* Check if the vnode is open for writing, because we want to demand-
* page out of it. If it is, don't do it, for various reasons.
*/
if (epp->ep_vp->v_writecount != 0) {
#ifdef DIAGNOSTIC
if (epp->ep_vp->v_flag & VTEXT)
panic("exec: a VTEXT vnode has writecount != 0");
#endif
return (ETXTBSY);
}
/*
* Allocate space to hold all the program headers, and read them
* from the file
*/
ph = mallocarray(eh->e_phnum, sizeof(Elf_Phdr), M_TEMP, M_WAITOK);
phsize = eh->e_phnum * sizeof(Elf_Phdr);
if ((error = elf_read_from(p, epp->ep_vp, eh->e_phoff, ph,
phsize)) != 0)
goto bad;
epp->ep_tsize = ELF_NO_ADDR;
epp->ep_dsize = ELF_NO_ADDR;
for (i = 0, pp = ph; i < eh->e_phnum; i++, pp++) {
if (pp->p_type == PT_INTERP && !interp) {
if (pp->p_filesz < 2 || pp->p_filesz > MAXPATHLEN)
goto bad;
interp = pool_get(&namei_pool, PR_WAITOK);
if ((error = elf_read_from(p, epp->ep_vp,
pp->p_offset, interp, pp->p_filesz)) != 0) {
goto bad;
}
if (interp[pp->p_filesz - 1] != '\0')
goto bad;
} else if (pp->p_type == PT_LOAD) {
if (pp->p_filesz > pp->p_memsz ||
pp->p_memsz == 0) {
error = EINVAL;
goto bad;
}
if (base_ph == NULL)
base_ph = pp;
} else if (pp->p_type == PT_PHDR) {
has_phdr = 1;
}
}
if (eh->e_type == ET_DYN) {
/* need phdr and load sections for PIE */
if (!has_phdr || base_ph == NULL) {
error = EINVAL;
goto bad;
}
/* randomize exe_base for PIE */
exe_base = uvm_map_pie(base_ph->p_align);
}
/*
* Verify this is an OpenBSD executable. If it's marked that way
* via a PT_NOTE then also check for a PT_OPENBSD_WXNEEDED segment.
*/
if ((error = elf_os_pt_note(p, epp, epp->ep_hdr, &names)) != 0)
goto bad;
if (eh->e_ident[EI_OSABI] == ELFOSABI_OPENBSD)
names |= ELF_NOTE_NAME_OPENBSD;
/*
* Load all the necessary sections
*/
for (i = 0, pp = ph; i < eh->e_phnum; i++, pp++) {
Elf_Addr addr, size = 0;
int prot = 0;
int flags = 0;
switch (pp->p_type) {
case PT_LOAD:
if (exe_base != 0) {
if (pp == base_ph) {
flags = VMCMD_BASE;
addr = exe_base;
} else {
flags = VMCMD_RELATIVE;
addr = pp->p_vaddr - base_ph->p_vaddr;
}
} else
addr = ELF_NO_ADDR;
/* Permit system calls in specific main-programs */
if (interp == NULL) {
/* statics. Also block the ld.so syscall-grant */
flags |= VMCMD_SYSCALL;
p->p_vmspace->vm_map.flags |= VM_MAP_SYSCALL_ONCE;
}
/*
* Calculate the size of the text and data segments by starting
* at the first and going to the end of the last.
* 'rwx' sections are treated as data.
* This is correct for BSS_PLT, but may not be for DATA_PLT;
* it is fine for TEXT_PLT.
*/
elf_load_psection(&epp->ep_vmcmds, epp->ep_vp,
pp, &addr, &size, &prot, flags);
/*
* Update exe_base in case alignment was off.
* For PIE, addr is relative to exe_base so
* adjust it (non PIE exe_base is 0 so no change).
*/
if (flags == VMCMD_BASE)
exe_base = addr;
else
addr += exe_base;
/*
* Decide whether it's text or data by looking
* at the protection of the section
*/
if (prot & PROT_WRITE) {
/* data section */
if (epp->ep_dsize == ELF_NO_ADDR) {
epp->ep_daddr = addr;
epp->ep_dsize = size;
} else {
if (addr < epp->ep_daddr) {
epp->ep_dsize =
epp->ep_dsize +
epp->ep_daddr -
addr;
epp->ep_daddr = addr;
} else
epp->ep_dsize = addr+size -
epp->ep_daddr;
}
} else if (prot & PROT_EXEC) {
/* text section */
if (epp->ep_tsize == ELF_NO_ADDR) {
epp->ep_taddr = addr;
epp->ep_tsize = size;
} else {
if (addr < epp->ep_taddr) {
epp->ep_tsize =
epp->ep_tsize +
epp->ep_taddr -
addr;
epp->ep_taddr = addr;
} else
epp->ep_tsize = addr+size -
epp->ep_taddr;
}
}
break;
case PT_SHLIB:
error = ENOEXEC;
goto bad;
case PT_INTERP:
/* Already did this one */
case PT_DYNAMIC:
case PT_NOTE:
break;
case PT_PHDR:
/* Note address of program headers (in text segment) */
phdr = pp->p_vaddr;
break;
case PT_OPENBSD_RANDOMIZE:
if (ph[i].p_memsz > randomizequota) {
error = ENOMEM;
goto bad;
}
randomizequota -= ph[i].p_memsz;
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_randomize,
ph[i].p_memsz, ph[i].p_vaddr + exe_base, NULLVP, 0, 0);
break;
default:
/*
* Not fatal, we don't need to understand everything
* :-)
*/
break;
}
}
phdr += exe_base;
/*
* Strangely, some Linux programs may have all load sections marked
* writable; in this case, textsize is not -1, but rather 0.
*/
if (epp->ep_tsize == ELF_NO_ADDR)
epp->ep_tsize = 0;
/*
* Another possibility is that it has all load sections marked
* read-only. Fake a zero-sized data segment right after the
* text segment.
*/
if (epp->ep_dsize == ELF_NO_ADDR) {
epp->ep_daddr = round_page(epp->ep_taddr + epp->ep_tsize);
epp->ep_dsize = 0;
}
epp->ep_interp = interp;
epp->ep_entry = eh->e_entry + exe_base;
/*
* Check if we found a dynamically linked binary and arrange to load
* its interpreter when the exec file is released.
*/
if (interp || eh->e_type == ET_DYN) {
struct elf_args *ap;
ap = malloc(sizeof(*ap), M_TEMP, M_WAITOK);
ap->arg_phaddr = phdr;
ap->arg_phentsize = eh->e_phentsize;
ap->arg_phnum = eh->e_phnum;
ap->arg_entry = eh->e_entry + exe_base;
ap->arg_interp = exe_base;
epp->ep_args = ap;
}
free(ph, M_TEMP, phsize);
vn_marktext(epp->ep_vp);
return (exec_setup_stack(p, epp));
bad:
if (interp)
pool_put(&namei_pool, interp);
free(ph, M_TEMP, phsize);
kill_vmcmds(&epp->ep_vmcmds);
if (error == 0)
return (ENOEXEC);
return (error);
}
/*
* Phase II of load. It is now safe to load the interpreter. Info collected
* when loading the program is available for setup of the interpreter.
*/
int
exec_elf_fixup(struct proc *p, struct exec_package *epp)
{
char *interp;
int error = 0;
struct elf_args *ap;
AuxInfo ai[ELF_AUX_ENTRIES], *a;
ap = epp->ep_args;
if (ap == NULL) {
return (0);
}
interp = epp->ep_interp;
if (interp &&
(error = elf_load_file(p, interp, epp, ap)) != 0) {
uprintf("execve: cannot load %s\n", interp);
free(ap, M_TEMP, sizeof *ap);
pool_put(&namei_pool, interp);
kill_vmcmds(&epp->ep_vmcmds);
return (error);
}
/*
* We have to do this ourselves...
*/
error = exec_process_vmcmds(p, epp);
/*
* Push extra arguments on the stack needed by dynamically
* linked binaries
*/
if (error == 0) {
memset(&ai, 0, sizeof ai);
a = ai;
a->au_id = AUX_phdr;
a->au_v = ap->arg_phaddr;
a++;
a->au_id = AUX_phent;
a->au_v = ap->arg_phentsize;
a++;
a->au_id = AUX_phnum;
a->au_v = ap->arg_phnum;
a++;
a->au_id = AUX_pagesz;
a->au_v = PAGE_SIZE;
a++;
a->au_id = AUX_base;
a->au_v = ap->arg_interp;
a++;
a->au_id = AUX_flags;
a->au_v = 0;
a++;
a->au_id = AUX_entry;
a->au_v = ap->arg_entry;
a++;
a->au_id = AUX_openbsd_timekeep;
a->au_v = p->p_p->ps_timekeep;
a++;
a->au_id = AUX_null;
a->au_v = 0;
a++;
error = copyout(ai, epp->ep_auxinfo, sizeof ai);
}
free(ap, M_TEMP, sizeof *ap);
if (interp)
pool_put(&namei_pool, interp);
return (error);
}
int
elf_os_pt_note_name(Elf_Note *np)
{
int i, j;
for (i = 0; i < nitems(elf_note_names); i++) {
size_t namlen = strlen(elf_note_names[i].name);
if (np->namesz < namlen)
continue;
/* verify name padding (after the NUL) is NUL */
for (j = namlen + 1; j < elfround(np->namesz); j++)
if (((char *)(np + 1))[j] != '\0')
continue;
/* verify desc padding is NUL */
for (j = np->descsz; j < elfround(np->descsz); j++)
if (((char *)(np + 1))[j] != '\0')
continue;
if (strcmp((char *)(np + 1), elf_note_names[i].name) == 0)
return elf_note_names[i].id;
}
return (0);
}
int
elf_os_pt_note(struct proc *p, struct exec_package *epp, Elf_Ehdr *eh, int *namesp)
{
Elf_Phdr *hph, *ph;
Elf_Note *np = NULL;
size_t phsize, offset, pfilesz = 0, total;
int error, names = 0;
hph = mallocarray(eh->e_phnum, sizeof(Elf_Phdr), M_TEMP, M_WAITOK);
phsize = eh->e_phnum * sizeof(Elf_Phdr);
if ((error = elf_read_from(p, epp->ep_vp, eh->e_phoff,
hph, phsize)) != 0)
goto out1;
for (ph = hph; ph < &hph[eh->e_phnum]; ph++) {
if (ph->p_type == PT_OPENBSD_WXNEEDED) {
epp->ep_flags |= EXEC_WXNEEDED;
continue;
}
if (ph->p_type != PT_NOTE || ph->p_filesz > 1024)
continue;
if (np && ph->p_filesz != pfilesz) {
free(np, M_TEMP, pfilesz);
np = NULL;
}
if (!np)
np = malloc(ph->p_filesz, M_TEMP, M_WAITOK);
pfilesz = ph->p_filesz;
if ((error = elf_read_from(p, epp->ep_vp, ph->p_offset,
np, ph->p_filesz)) != 0)
goto out2;
for (offset = 0; offset < ph->p_filesz; offset += total) {
Elf_Note *np2 = (Elf_Note *)((char *)np + offset);
if (offset + sizeof(Elf_Note) > ph->p_filesz)
break;
total = sizeof(Elf_Note) + elfround(np2->namesz) +
elfround(np2->descsz);
if (offset + total > ph->p_filesz)
break;
names |= elf_os_pt_note_name(np2);
}
}
out2:
free(np, M_TEMP, pfilesz);
out1:
free(hph, M_TEMP, phsize);
*namesp = names;
return ((names & ELF_NOTE_NAME_OPENBSD) ? 0 : ENOEXEC);
}
/*
* Start of routines related to dumping core
*/
#ifdef SMALL_KERNEL
int
coredump_elf(struct proc *p, void *cookie)
{
return EPERM;
}
#else /* !SMALL_KERNEL */
struct writesegs_state {
off_t notestart;
off_t secstart;
off_t secoff;
struct proc *p;
void *iocookie;
Elf_Phdr *psections;
size_t psectionslen;
size_t notesize;
int npsections;
};
uvm_coredump_setup_cb coredump_setup_elf;
uvm_coredump_walk_cb coredump_walk_elf;
int coredump_notes_elf(struct proc *, void *, size_t *);
int coredump_note_elf(struct proc *, void *, size_t *);
int coredump_writenote_elf(struct proc *, void *, Elf_Note *,
const char *, void *);
int
coredump_elf(struct proc *p, void *cookie)
{
#ifdef DIAGNOSTIC
off_t offset;
#endif
struct writesegs_state ws;
size_t notesize;
int error, i;
ws.p = p;
ws.iocookie = cookie;
ws.psections = NULL;
/*
* Walk the map to get all the segment offsets and lengths,
* write out the ELF header.
*/
error = uvm_coredump_walkmap(p, coredump_setup_elf,
coredump_walk_elf, &ws);
if (error)
goto out;
error = coredump_write(cookie, UIO_SYSSPACE, ws.psections,
ws.psectionslen);
if (error)
goto out;
/* Write out the notes. */
error = coredump_notes_elf(p, cookie, &notesize);
if (error)
goto out;
#ifdef DIAGNOSTIC
if (notesize != ws.notesize)
panic("coredump: notesize changed: %zu != %zu",
ws.notesize, notesize);
offset = ws.notestart + notesize;
if (offset != ws.secstart)
panic("coredump: offset %lld != secstart %lld",
(long long) offset, (long long) ws.secstart);
#endif
/* Pass 3: finally, write the sections themselves. */
for (i = 0; i < ws.npsections - 1; i++) {
Elf_Phdr *pent = &ws.psections[i];
if (pent->p_filesz == 0)
continue;
#ifdef DIAGNOSTIC
if (offset != pent->p_offset)
panic("coredump: offset %lld != p_offset[%d] %lld",
(long long) offset, i,
(long long) pent->p_filesz);
#endif
error = coredump_write(cookie, UIO_USERSPACE,
(void *)(vaddr_t)pent->p_vaddr, pent->p_filesz);
if (error)
goto out;
coredump_unmap(cookie, (vaddr_t)pent->p_vaddr,
(vaddr_t)pent->p_vaddr + pent->p_filesz);
#ifdef DIAGNOSTIC
offset += ws.psections[i].p_filesz;
#endif
}
out:
free(ws.psections, M_TEMP, ws.psectionslen);
return (error);
}
/*
* Normally we lay out core files like this:
* [ELF Header] [Program headers] [Notes] [data for PT_LOAD segments]
*
* However, if there's >= 65535 segments then it overflows the field
* in the ELF header, so the standard specifies putting a magic
* number there and saving the real count in the .sh_info field of
* the first *section* header...which requires generating a section
* header. To avoid confusing tools, we include an .shstrtab section
* as well so all the indexes look valid. So in this case we lay
* out the core file like this:
* [ELF Header] [Section Headers] [.shstrtab] [Program headers] \
* [Notes] [data for PT_LOAD segments]
*
* The 'shstrtab' structure below is data for the second of the two
* section headers, plus the .shstrtab itself, in one const buffer.
*/
static const struct {
Elf_Shdr shdr;
char shstrtab[sizeof(ELF_SHSTRTAB) + 1];
} shstrtab = {
.shdr = {
.sh_name = 1, /* offset in .shstrtab below */
.sh_type = SHT_STRTAB,
.sh_offset = sizeof(Elf_Ehdr) + 2*sizeof(Elf_Shdr),
.sh_size = sizeof(ELF_SHSTRTAB) + 1,
.sh_addralign = 1,
},
.shstrtab = "\0" ELF_SHSTRTAB,
};
int
coredump_setup_elf(int segment_count, void *cookie)
{
Elf_Ehdr ehdr;
struct writesegs_state *ws = cookie;
Elf_Phdr *note;
int error;
/* Get the count of segments, plus one for the PT_NOTE */
ws->npsections = segment_count + 1;
/* Get the size of the notes. */
error = coredump_notes_elf(ws->p, NULL, &ws->notesize);
if (error)
return error;
/* Setup the ELF header */
memset(&ehdr, 0, sizeof(ehdr));
memcpy(ehdr.e_ident, ELFMAG, SELFMAG);
ehdr.e_ident[EI_CLASS] = ELF_TARG_CLASS;
ehdr.e_ident[EI_DATA] = ELF_TARG_DATA;
ehdr.e_ident[EI_VERSION] = EV_CURRENT;
/* XXX Should be the OSABI/ABI version of the executable. */
ehdr.e_ident[EI_OSABI] = ELFOSABI_SYSV;
ehdr.e_ident[EI_ABIVERSION] = 0;
ehdr.e_type = ET_CORE;
/* XXX This should be the e_machine of the executable. */
ehdr.e_machine = ELF_TARG_MACH;
ehdr.e_version = EV_CURRENT;
ehdr.e_entry = 0;
ehdr.e_flags = 0;
ehdr.e_ehsize = sizeof(ehdr);
ehdr.e_phentsize = sizeof(Elf_Phdr);
if (ws->npsections < PN_XNUM) {
ehdr.e_phoff = sizeof(ehdr);
ehdr.e_shoff = 0;
ehdr.e_phnum = ws->npsections;
ehdr.e_shentsize = 0;
ehdr.e_shnum = 0;
ehdr.e_shstrndx = 0;
} else {
/* too many segments, use extension setup */
ehdr.e_shoff = sizeof(ehdr);
ehdr.e_phnum = PN_XNUM;
ehdr.e_shentsize = sizeof(Elf_Shdr);
ehdr.e_shnum = 2;
ehdr.e_shstrndx = 1;
ehdr.e_phoff = shstrtab.shdr.sh_offset + shstrtab.shdr.sh_size;
}
/* Write out the ELF header. */
error = coredump_write(ws->iocookie, UIO_SYSSPACE, &ehdr, sizeof(ehdr));
if (error)
return error;
/*
* If a section header is needed to store extension info, write
* it out after the ELF header and before the program headers.
*/
if (ehdr.e_shnum != 0) {
Elf_Shdr shdr = { .sh_info = ws->npsections };
error = coredump_write(ws->iocookie, UIO_SYSSPACE, &shdr,
sizeof shdr);
if (error)
return error;
error = coredump_write(ws->iocookie, UIO_SYSSPACE, &shstrtab,
sizeof(shstrtab.shdr) + sizeof(shstrtab.shstrtab));
if (error)
return error;
}
/*
* Allocate the segment header array and setup to collect
* the section sizes and offsets
*/
ws->psections = mallocarray(ws->npsections, sizeof(Elf_Phdr),
M_TEMP, M_WAITOK|M_CANFAIL|M_ZERO);
if (ws->psections == NULL)
return ENOMEM;
ws->psectionslen = ws->npsections * sizeof(Elf_Phdr);
ws->notestart = ehdr.e_phoff + ws->psectionslen;
ws->secstart = ws->notestart + ws->notesize;
ws->secoff = ws->secstart;
/* Fill in the PT_NOTE segment header in the last slot */
note = &ws->psections[ws->npsections - 1];
note->p_type = PT_NOTE;
note->p_offset = ws->notestart;
note->p_vaddr = 0;
note->p_paddr = 0;
note->p_filesz = ws->notesize;
note->p_memsz = 0;
note->p_flags = PF_R;
note->p_align = ELFROUNDSIZE;
return (0);
}
int
coredump_walk_elf(vaddr_t start, vaddr_t realend, vaddr_t end, vm_prot_t prot,
int nsegment, void *cookie)
{
struct writesegs_state *ws = cookie;
Elf_Phdr phdr;
vsize_t size, realsize;
size = end - start;
realsize = realend - start;
phdr.p_type = PT_LOAD;
phdr.p_offset = ws->secoff;
phdr.p_vaddr = start;
phdr.p_paddr = 0;
phdr.p_filesz = realsize;
phdr.p_memsz = size;
phdr.p_flags = 0;
if (prot & PROT_READ)
phdr.p_flags |= PF_R;
if (prot & PROT_WRITE)
phdr.p_flags |= PF_W;
if (prot & PROT_EXEC)
phdr.p_flags |= PF_X;
phdr.p_align = PAGE_SIZE;
ws->secoff += phdr.p_filesz;
ws->psections[nsegment] = phdr;
return (0);
}
int
coredump_notes_elf(struct proc *p, void *iocookie, size_t *sizep)
{
struct ps_strings pss;
struct iovec iov;
struct uio uio;
struct elfcore_procinfo cpi;
Elf_Note nhdr;
struct process *pr = p->p_p;
struct proc *q;
size_t size, notesize;
int error;
KASSERT(!P_HASSIBLING(p) || pr->ps_single != NULL);
size = 0;
/* First, write an elfcore_procinfo. */
notesize = sizeof(nhdr) + elfround(sizeof("OpenBSD")) +
elfround(sizeof(cpi));
if (iocookie) {
memset(&cpi, 0, sizeof(cpi));
cpi.cpi_version = ELFCORE_PROCINFO_VERSION;
cpi.cpi_cpisize = sizeof(cpi);
cpi.cpi_signo = p->p_sisig;
cpi.cpi_sigcode = p->p_sicode;
cpi.cpi_sigpend = p->p_siglist | pr->ps_siglist;
cpi.cpi_sigmask = p->p_sigmask;
cpi.cpi_sigignore = pr->ps_sigacts->ps_sigignore;
cpi.cpi_sigcatch = pr->ps_sigacts->ps_sigcatch;
cpi.cpi_pid = pr->ps_pid;
cpi.cpi_ppid = pr->ps_ppid;
cpi.cpi_pgrp = pr->ps_pgid;
if (pr->ps_session->s_leader)
cpi.cpi_sid = pr->ps_session->s_leader->ps_pid;
else
cpi.cpi_sid = 0;
cpi.cpi_ruid = p->p_ucred->cr_ruid;
cpi.cpi_euid = p->p_ucred->cr_uid;
cpi.cpi_svuid = p->p_ucred->cr_svuid;
cpi.cpi_rgid = p->p_ucred->cr_rgid;
cpi.cpi_egid = p->p_ucred->cr_gid;
cpi.cpi_svgid = p->p_ucred->cr_svgid;
(void)strlcpy(cpi.cpi_name, pr->ps_comm, sizeof(cpi.cpi_name));
nhdr.namesz = sizeof("OpenBSD");
nhdr.descsz = sizeof(cpi);
nhdr.type = NT_OPENBSD_PROCINFO;
error = coredump_writenote_elf(p, iocookie, &nhdr,
"OpenBSD", &cpi);
if (error)
return (error);
}
size += notesize;
/* Second, write an NT_OPENBSD_AUXV note. */
notesize = sizeof(nhdr) + elfround(sizeof("OpenBSD")) +
elfround(ELF_AUX_WORDS * sizeof(char *));
if (iocookie) {
iov.iov_base = &pss;
iov.iov_len = sizeof(pss);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)pr->ps_strings;
uio.uio_resid = sizeof(pss);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_procp = NULL;
error = uvm_io(&p->p_vmspace->vm_map, &uio, 0);
if (error)
return (error);
if (pss.ps_envstr == NULL)
return (EIO);
nhdr.namesz = sizeof("OpenBSD");
nhdr.descsz = ELF_AUX_WORDS * sizeof(char *);
nhdr.type = NT_OPENBSD_AUXV;
error = coredump_write(iocookie, UIO_SYSSPACE,
&nhdr, sizeof(nhdr));
if (error)
return (error);
error = coredump_write(iocookie, UIO_SYSSPACE,
"OpenBSD", elfround(nhdr.namesz));
if (error)
return (error);
error = coredump_write(iocookie, UIO_USERSPACE,
pss.ps_envstr + pss.ps_nenvstr + 1, nhdr.descsz);
if (error)
return (error);
}
size += notesize;
#ifdef PT_WCOOKIE
notesize = sizeof(nhdr) + elfround(sizeof("OpenBSD")) +
elfround(sizeof(register_t));
if (iocookie) {
register_t wcookie;
nhdr.namesz = sizeof("OpenBSD");
nhdr.descsz = sizeof(register_t);
nhdr.type = NT_OPENBSD_WCOOKIE;
wcookie = process_get_wcookie(p);
error = coredump_writenote_elf(p, iocookie, &nhdr,
"OpenBSD", &wcookie);
if (error)
return (error);
}
size += notesize;
#endif
/*
* Now write the register info for the thread that caused the
* coredump.
*/
error = coredump_note_elf(p, iocookie, &notesize);
if (error)
return (error);
size += notesize;
/*
* Now, for each thread, write the register info and any other
* per-thread notes. Since we're dumping core, all the other
* threads in the process have been stopped and the list can't
* change.
*/
TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
if (q == p) /* we've taken care of this thread */
continue;
error = coredump_note_elf(q, iocookie, &notesize);
if (error)
return (error);
size += notesize;
}
*sizep = size;
return (0);
}
int
coredump_note_elf(struct proc *p, void *iocookie, size_t *sizep)
{
Elf_Note nhdr;
int size, notesize, error;
int namesize;
char name[64+ELFROUNDSIZE];
struct reg intreg;
#ifdef PT_GETFPREGS
struct fpreg freg;
#endif
size = 0;
snprintf(name, sizeof(name)-ELFROUNDSIZE, "%s@%d",
"OpenBSD", p->p_tid + THREAD_PID_OFFSET);
namesize = strlen(name) + 1;
memset(name + namesize, 0, elfround(namesize) - namesize);
notesize = sizeof(nhdr) + elfround(namesize) + elfround(sizeof(intreg));
if (iocookie) {
error = process_read_regs(p, &intreg);
if (error)
return (error);
nhdr.namesz = namesize;
nhdr.descsz = sizeof(intreg);
nhdr.type = NT_OPENBSD_REGS;
error = coredump_writenote_elf(p, iocookie, &nhdr,
name, &intreg);
if (error)
return (error);
}
size += notesize;
#ifdef PT_GETFPREGS
notesize = sizeof(nhdr) + elfround(namesize) + elfround(sizeof(freg));
if (iocookie) {
error = process_read_fpregs(p, &freg);
if (error)
return (error);
nhdr.namesz = namesize;
nhdr.descsz = sizeof(freg);
nhdr.type = NT_OPENBSD_FPREGS;
error = coredump_writenote_elf(p, iocookie, &nhdr, name, &freg);
if (error)
return (error);
}
size += notesize;
#endif
*sizep = size;
/* XXX Add hook for machdep per-LWP notes. */
return (0);
}
int
coredump_writenote_elf(struct proc *p, void *cookie, Elf_Note *nhdr,
const char *name, void *data)
{
int error;
error = coredump_write(cookie, UIO_SYSSPACE, nhdr, sizeof(*nhdr));
if (error)
return error;
error = coredump_write(cookie, UIO_SYSSPACE, name,
elfround(nhdr->namesz));
if (error)
return error;
return coredump_write(cookie, UIO_SYSSPACE, data, nhdr->descsz);
}
#endif /* !SMALL_KERNEL */
/* $OpenBSD: subr_witness.c,v 1.48 2022/02/21 14:16:49 jsg Exp $ */
/*-
* Copyright (c) 2008 Isilon Systems, Inc.
* Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
* Copyright (c) 1998 Berkeley Software Design, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Berkeley Software Design Inc's name may not be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from BSDI Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp
* and BSDI Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp
*/
/*
* Implementation of the `witness' lock verifier. Originally implemented for
* mutexes in BSD/OS. Extended to handle generic lock objects and lock
* classes in FreeBSD.
*/
/*
* Main Entry: witness
* Pronunciation: 'wit-n&s
* Function: noun
* Etymology: Middle English witnesse, from Old English witnes knowledge,
* testimony, witness, from 2wit
* Date: before 12th century
* 1 : attestation of a fact or event : TESTIMONY
* 2 : one that gives evidence; specifically : one who testifies in
* a cause or before a judicial tribunal
* 3 : one asked to be present at a transaction so as to be able to
* testify to its having taken place
* 4 : one who has personal knowledge of something
* 5 a : something serving as evidence or proof : SIGN
* b : public affirmation by word or example of usually
* religious faith or conviction <the heroic witness to divine
* life -- Pilot>
* 6 capitalized : a member of the Jehovah's Witnesses
*/
/*
* Special rules concerning Giant and lock orders:
*
* 1) Giant must be acquired before any other mutexes. Stated another way,
* no other mutex may be held when Giant is acquired.
*
* 2) Giant must be released when blocking on a sleepable lock.
*
* This rule is less obvious, but is a result of Giant providing the same
* semantics as spl(). Basically, when a thread sleeps, it must release
* Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule
* 2).
*
* 3) Giant may be acquired before or after sleepable locks.
*
* This rule is also not quite as obvious. Giant may be acquired after
* a sleepable lock because it is a non-sleepable lock and non-sleepable
* locks may always be acquired while holding a sleepable lock. The second
* case, Giant before a sleepable lock, follows from rule 2) above. Suppose
* you have two threads T1 and T2 and a sleepable lock X. Suppose that T1
* acquires X and blocks on Giant. Then suppose that T2 acquires Giant and
* blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to
* execute. Thus, acquiring Giant both before and after a sleepable lock
* will not result in a lock order reversal.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#ifdef MULTIPROCESSOR
#include <sys/mplock.h>
#endif
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/stacktrace.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/witness.h>
#include <machine/cpu.h>
#include <uvm/uvm_extern.h> /* uvm_pageboot_alloc */
#ifndef DDB
#error "DDB is required for WITNESS"
#endif
#include <machine/db_machdep.h>
#include <ddb/db_access.h>
#include <ddb/db_var.h>
#include <ddb/db_output.h>
#define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */
#define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */
#define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */
#ifndef WITNESS_COUNT
#define WITNESS_COUNT 1536
#endif
#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */
#define WITNESS_PENDLIST (1024 + MAXCPUS)
/* Allocate 256 KB of stack data space */
#define WITNESS_LO_DATA_COUNT 2048
/* Prime, gives load factor of ~2 at full load */
#define WITNESS_LO_HASH_SIZE 1021
/*
* XXX: This is somewhat bogus, as we assume here that at most 2048 threads
* will hold LOCK_NCHILDREN locks. We handle failure ok, and we should
* probably be safe for the most part, but it's still a SWAG.
*/
#define LOCK_NCHILDREN 5
#define LOCK_CHILDCOUNT 2048
#define FULLGRAPH_SBUF_SIZE 512
/*
* These flags go in the witness relationship matrix and describe the
* relationship between any two struct witness objects.
*/
#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
#define WITNESS_RELATED_MASK \
(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been
* observed. */
#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
/* Descendant to ancestor flags */
#define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2)
/* Ancestor to descendant flags */
#define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2)
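/*
* Example: WITNESS_DTOA(WITNESS_CHILD) == WITNESS_PARENT and
* WITNESS_ATOD(WITNESS_PARENT) == WITNESS_CHILD, i.e. the shift by two
* maps between the parent/ancestor and child/descendant flag pairs.
*/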
#define WITNESS_INDEX_ASSERT(i) \
KASSERT((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)
/*
* Lock classes. Each lock has a class which describes characteristics
* common to all types of locks of a given class.
*
* Spin locks in general must always protect against preemption, as it is
* an error to perform any type of context switch while holding a spin lock.
* Also, for an individual lock to be recursable, its class must allow
* recursion and the lock itself must explicitly allow recursion.
*/
struct lock_class {
const char *lc_name;
u_int lc_flags;
};
union lock_stack {
union lock_stack *ls_next;
struct stacktrace ls_stack;
};
#define LC_SLEEPLOCK 0x00000001 /* Sleep lock. */
#define LC_SPINLOCK 0x00000002 /* Spin lock. */
#define LC_SLEEPABLE 0x00000004 /* Sleeping allowed with this lock. */
#define LC_RECURSABLE 0x00000008 /* Locks of this type may recurse. */
#define LC_UPGRADABLE 0x00000010 /* Upgrades and downgrades permitted. */
/*
* Lock instances. A lock instance is the data associated with a lock while
* it is held by witness. For example, a lock instance will hold the
* recursion count of a lock. Lock instances are held in lists. Spin locks
* are held in a per-cpu list while sleep locks are held in per-thread list.
*/
struct lock_instance {
struct lock_object *li_lock;
union lock_stack *li_stack;
u_int li_flags;
};
/*
* A simple list type used to build the list of locks held by a thread
* or CPU. We can't simply embed the list in struct lock_object since a
* lock may be held by more than one thread if it is a shared lock. Locks
* are added to the head of the list, so we fill up each list entry from
* "the back" logically. To ease some of the arithmetic, we actually fill
* in each list entry the normal way (children[0] then children[1], etc.) but
* when we traverse the list we read children[count-1] as the first entry
* down to children[0] as the final entry.
*/
struct lock_list_entry {
struct lock_list_entry *ll_next;
struct lock_instance ll_children[LOCK_NCHILDREN];
int ll_count;
};
/*
* The main witness structure. One of these per named lock type in the system
* (for example, "vnode interlock").
*/
struct witness {
const struct lock_type *w_type;
const char *w_subtype;
uint32_t w_index; /* Index in the relationship matrix */
struct lock_class *w_class;
SLIST_ENTRY(witness) w_list; /* List of all witnesses. */
SLIST_ENTRY(witness) w_typelist; /* Witnesses of a type. */
SLIST_ENTRY(witness) w_hash_next; /* Linked list in
* hash buckets. */
uint16_t w_num_ancestors; /* direct/indirect
* ancestor count */
uint16_t w_num_descendants; /* direct/indirect
* descendant count */
int16_t w_ddb_level;
unsigned w_acquired:1;
unsigned w_displayed:1;
unsigned w_reversed:1;
};
SLIST_HEAD(witness_list, witness);
/*
* The witness hash table. Keys are witness names (const char *), elements are
* witness objects (struct witness *).
*/
struct witness_hash {
struct witness_list wh_array[WITNESS_HASH_SIZE];
uint32_t wh_size;
uint32_t wh_count;
};
/*
* Key type for the lock order data hash table.
*/
struct witness_lock_order_key {
uint16_t from;
uint16_t to;
};
struct witness_lock_order_data {
struct stacktrace wlod_stack;
struct witness_lock_order_key wlod_key;
struct witness_lock_order_data *wlod_next;
};
/*
* The witness lock order data hash table. Keys are witness index tuples
* (struct witness_lock_order_key), elements are lock order data objects
* (struct witness_lock_order_data).
*/
struct witness_lock_order_hash {
struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE];
u_int wloh_size;
u_int wloh_count;
};
struct witness_pendhelp {
const struct lock_type *wh_type;
struct lock_object *wh_lock;
};
struct witness_cpu {
struct lock_list_entry *wc_spinlocks;
struct lock_list_entry *wc_lle_cache;
union lock_stack *wc_stk_cache;
unsigned int wc_lle_count;
unsigned int wc_stk_count;
} __aligned(CACHELINESIZE);
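/*
 * Upper bounds for the per-CPU caches of lock list entries and lock stack
 * buffers.  Freed objects go back to the per-CPU cache up to these limits
 * and the caches are refilled from the global free lists under w_mtx.
 */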
#define WITNESS_LLE_CACHE_MAX 8
#define WITNESS_STK_CACHE_MAX (WITNESS_LLE_CACHE_MAX * LOCK_NCHILDREN)
struct witness_cpu witness_cpu[MAXCPUS];
/*
* Returns 0 if one of the locks is a spin lock and the other is not.
* Returns 1 otherwise.
*/
static __inline int
witness_lock_type_equal(struct witness *w1, struct witness *w2)
{
return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
}
static __inline int
witness_lock_order_key_equal(const struct witness_lock_order_key *a,
const struct witness_lock_order_key *b)
{
return (a->from == b->from && a->to == b->to);
}
static int _isitmyx(struct witness *w1, struct witness *w2, int rmask,
const char *fname);
static void adopt(struct witness *parent, struct witness *child);
static struct witness *enroll(const struct lock_type *, const char *,
struct lock_class *);
static struct lock_instance *find_instance(struct lock_list_entry *list,
const struct lock_object *lock);
static int isitmychild(struct witness *parent, struct witness *child);
static int isitmydescendant(struct witness *parent, struct witness *child);
static void itismychild(struct witness *parent, struct witness *child);
#ifdef DDB
static void db_witness_add_fullgraph(struct witness *parent);
static void witness_ddb_compute_levels(void);
static void witness_ddb_display(int(*)(const char *fmt, ...));
static void witness_ddb_display_descendants(int(*)(const char *fmt, ...),
struct witness *, int indent);
static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
struct witness_list *list);
static void witness_ddb_level_descendants(struct witness *parent, int l);
static void witness_ddb_list(struct proc *td);
#endif
static int witness_alloc_stacks(void);
static void witness_debugger(int dump);
static void witness_free(struct witness *m);
static struct witness *witness_get(void);
static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size);
static struct witness *witness_hash_get(const struct lock_type *,
const char *);
static void witness_hash_put(struct witness *w);
static void witness_init_hash_tables(void);
static void witness_increment_graph_generation(void);
static int witness_list_locks(struct lock_list_entry **,
int (*)(const char *, ...));
static void witness_lock_list_free(struct lock_list_entry *lle);
static struct lock_list_entry *witness_lock_list_get(void);
static void witness_lock_stack_free(union lock_stack *stack);
static union lock_stack *witness_lock_stack_get(void);
static int witness_lock_order_add(struct witness *parent,
struct witness *child);
static int witness_lock_order_check(struct witness *parent,
struct witness *child);
static struct witness_lock_order_data *witness_lock_order_get(
struct witness *parent,
struct witness *child);
static void witness_list_lock(struct lock_instance *instance,
int (*prnt)(const char *fmt, ...));
static void witness_setflag(struct lock_object *lock, int flag, int set);
/*
* If set to 0, lock order checking is disabled. If set to -1,
* witness is completely disabled. Otherwise witness performs full
* lock order checking for all locks. At runtime, lock order checking
* may be toggled. However, witness cannot be reenabled once it is
* completely disabled.
*/
#ifdef WITNESS_WATCH
static int witness_watch = 3;
#else
static int witness_watch = 2;
#endif
#ifdef WITNESS_LOCKTRACE
static int witness_locktrace = 1;
#else
static int witness_locktrace = 0;
#endif
int witness_count = WITNESS_COUNT;
int witness_uninitialized_report = 5;
static struct mutex w_mtx;
static struct rwlock w_ctlock = RWLOCK_INITIALIZER("w_ctlock");
/* w_list */
static struct witness_list w_free = SLIST_HEAD_INITIALIZER(w_free);
static struct witness_list w_all = SLIST_HEAD_INITIALIZER(w_all);
/* w_typelist */
static struct witness_list w_spin = SLIST_HEAD_INITIALIZER(w_spin);
static struct witness_list w_sleep = SLIST_HEAD_INITIALIZER(w_sleep);
/* lock list */
static struct lock_list_entry *w_lock_list_free = NULL;
static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
static u_int pending_cnt;
static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
static struct witness *w_data;
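/* The lock order relationship matrix, indexed by witness index. */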
static uint8_t **w_rmatrix;
static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
static struct witness_hash w_hash; /* The witness hash table. */
/* The lock order data hash */
static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
static struct witness_lock_order_data *w_lofree = NULL;
static struct witness_lock_order_hash w_lohash;
static int w_max_used_index = 0;
static unsigned int w_generation = 0;
static union lock_stack *w_lock_stack_free;
static unsigned int w_lock_stack_num;
static struct lock_class lock_class_kernel_lock = {
.lc_name = "kernel_lock",
.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_SLEEPABLE
};
static struct lock_class lock_class_sched_lock = {
.lc_name = "sched_lock",
.lc_flags = LC_SPINLOCK | LC_RECURSABLE
};
static struct lock_class lock_class_mutex = {
.lc_name = "mutex",
.lc_flags = LC_SPINLOCK
};
static struct lock_class lock_class_rwlock = {
.lc_name = "rwlock",
.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_UPGRADABLE
};
static struct lock_class lock_class_rrwlock = {
.lc_name = "rrwlock",
.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_SLEEPABLE |
LC_UPGRADABLE
};
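/*
 * The table of lock classes known to witness.  LOCK_CLASS() resolves a lock
 * object's class to one of these entries.
 */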
static struct lock_class *lock_classes[] = {
&lock_class_kernel_lock,
&lock_class_sched_lock,
&lock_class_mutex,
&lock_class_rwlock,
&lock_class_rrwlock,
};
/*
* This global is set to 0 once it becomes safe to use the witness code.
*/
static int witness_cold = 1;
/*
* This global is set to 1 once the static lock orders have been enrolled
* so that a warning can be issued for any spin locks enrolled later.
*/
static int witness_spin_warn = 0;
/*
* The WITNESS-enabled diagnostic code. Note that the witness code does
* assume that the early boot is single-threaded at least until after this
* routine is completed.
*/
void
witness_initialize(void)
{
struct lock_object *lock;
union lock_stack *stacks;
struct witness *w;
int i, s;
w_data = (void *)uvm_pageboot_alloc(sizeof(struct witness) *
witness_count);
memset(w_data, 0, sizeof(struct witness) * witness_count);
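	/*
	 * Allocate the (witness_count + 1) x (witness_count + 1)
	 * relationship matrix.  Row and column 0 stay unused because
	 * witness index 0 is never handed out.
	 */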
w_rmatrix = (void *)uvm_pageboot_alloc(sizeof(*w_rmatrix) *
(witness_count + 1));
for (i = 0; i < witness_count + 1; i++) {
w_rmatrix[i] = (void *)uvm_pageboot_alloc(
sizeof(*w_rmatrix[i]) * (witness_count + 1));
memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
(witness_count + 1));
}
mtx_init_flags(&w_mtx, IPL_HIGH, "witness lock", MTX_NOWITNESS);
for (i = witness_count - 1; i >= 0; i--) {
w = &w_data[i];
memset(w, 0, sizeof(*w));
w_data[i].w_index = i; /* Witness index never changes. */
witness_free(w);
}
KASSERTMSG(SLIST_FIRST(&w_free)->w_index == 0,
"%s: Invalid list of free witness objects", __func__);
/* Witness with index 0 is not used to aid in debugging. */
SLIST_REMOVE_HEAD(&w_free, w_list);
w_free_cnt--;
for (i = 0; i < witness_count; i++) {
memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
(witness_count + 1));
}
if (witness_locktrace) {
w_lock_stack_num = LOCK_CHILDCOUNT * LOCK_NCHILDREN;
stacks = (void *)uvm_pageboot_alloc(sizeof(*stacks) *
w_lock_stack_num);
}
s = splhigh();
for (i = 0; i < w_lock_stack_num; i++)
witness_lock_stack_free(&stacks[i]);
for (i = 0; i < LOCK_CHILDCOUNT; i++)
witness_lock_list_free(&w_locklistdata[i]);
splx(s);
witness_init_hash_tables();
witness_spin_warn = 1;
/* Iterate through all locks and add them to witness. */
for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
lock = pending_locks[i].wh_lock;
KASSERTMSG(lock->lo_flags & LO_WITNESS,
"%s: lock %s is on pending list but not LO_WITNESS",
__func__, lock->lo_name);
lock->lo_witness = enroll(pending_locks[i].wh_type,
lock->lo_name, LOCK_CLASS(lock));
}
/* Mark the witness code as being ready for use. */
witness_cold = 0;
}
void
witness_init(struct lock_object *lock, const struct lock_type *type)
{
struct lock_class *class;
/* Various sanity checks. */
class = LOCK_CLASS(lock);
if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
(class->lc_flags & LC_RECURSABLE) == 0)
panic("%s: lock (%s) %s can not be recursable",
		    __func__, class->lc_name, lock->lo_name);
	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(class->lc_flags & LC_SLEEPABLE) == 0)
panic("%s: lock (%s) %s can not be sleepable",
		    __func__, class->lc_name, lock->lo_name);
	if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
(class->lc_flags & LC_UPGRADABLE) == 0)
panic("%s: lock (%s) %s can not be upgradable",
__func__, class->lc_name, lock->lo_name);
/*
* If we shouldn't watch this lock, then just clear lo_witness.
* Record the type in case the lock becomes watched later.
* Otherwise, if witness_cold is set, then it is too early to
* enroll this lock, so defer it to witness_initialize() by adding
* it to the pending_locks list. If it is not too early, then enroll
* the lock now.
*/
if (witness_watch < 1 || panicstr != NULL || db_active ||
(lock->lo_flags & LO_WITNESS) == 0) {
lock->lo_witness = NULL;
lock->lo_type = type;
} else if (witness_cold) {
pending_locks[pending_cnt].wh_lock = lock;
pending_locks[pending_cnt++].wh_type = type;
		if (pending_cnt > WITNESS_PENDLIST)
			panic("%s: pending locks list is too small, "
"increase WITNESS_PENDLIST",
__func__);
} else
lock->lo_witness = enroll(type, lock->lo_name, class);
}
static inline int
is_kernel_lock(const struct lock_object *lock)
{
#ifdef MULTIPROCESSOR
return (lock == &kernel_lock.mpl_lock_obj);
#else
return (0);
#endif
}
#ifdef DDB
static void
witness_ddb_compute_levels(void)
{
struct witness *w;
/*
* First clear all levels.
*/
SLIST_FOREACH(w, &w_all, w_list)
w->w_ddb_level = -1;
/*
* Look for locks with no parents and level all their descendants.
*/
SLIST_FOREACH(w, &w_all, w_list) {
/* If the witness has ancestors (is not a root), skip it. */
if (w->w_num_ancestors > 0)
continue;
witness_ddb_level_descendants(w, 0);
}
}
static void
witness_ddb_level_descendants(struct witness *w, int l)
{
int i;
if (w->w_ddb_level >= l)
return;
w->w_ddb_level = l;
l++;
for (i = 1; i <= w_max_used_index; i++) {
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
witness_ddb_level_descendants(&w_data[i], l);
}
}
static void
witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
struct witness *w, int indent)
{
int i;
for (i = 0; i < indent; i++)
prnt(" ");
prnt("%s (type: %s, depth: %d)",
w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level);
if (w->w_displayed) {
prnt(" -- (already displayed)\n");
return;
}
w->w_displayed = 1;
if (!w->w_acquired)
prnt(" -- never acquired\n");
else
prnt("\n");
indent++;
WITNESS_INDEX_ASSERT(w->w_index);
for (i = 1; i <= w_max_used_index; i++) {
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
witness_ddb_display_descendants(prnt, &w_data[i],
indent);
}
}
static void
witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
struct witness_list *list)
{
struct witness *w;
SLIST_FOREACH(w, list, w_typelist) {
if (!w->w_acquired || w->w_ddb_level > 0)
continue;
/* This lock has no ancestors - display its descendants. */
witness_ddb_display_descendants(prnt, w, 0);
}
}
static void
witness_ddb_display(int(*prnt)(const char *fmt, ...))
{
struct witness *w;
KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
witness_ddb_compute_levels();
/* Clear all the displayed flags. */
SLIST_FOREACH(w, &w_all, w_list)
w->w_displayed = 0;
/*
* First, handle sleep locks which have been acquired at least
* once.
*/
prnt("Sleep locks:\n");
witness_ddb_display_list(prnt, &w_sleep);
/*
* Now do spin locks which have been acquired at least once.
*/
prnt("\nSpin locks:\n");
witness_ddb_display_list(prnt, &w_spin);
/*
* Finally, any locks which have not been acquired yet.
*/
prnt("\nLocks which were never acquired:\n");
SLIST_FOREACH(w, &w_all, w_list) {
if (w->w_acquired)
continue;
prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name,
w->w_class->lc_name, w->w_ddb_level);
}
}
#endif /* DDB */
int
witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
{
if (witness_watch < 0 || panicstr != NULL || db_active)
return (0);
/* Require locks that witness knows about. */
if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
lock2->lo_witness == NULL)
return (EINVAL);
MUTEX_ASSERT_UNLOCKED(&w_mtx);
mtx_enter(&w_mtx);
/*
* If we already have either an explicit or implied lock order that
* is the other way around, then return an error.
*/
if (witness_watch &&
isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
mtx_leave(&w_mtx);
return (EINVAL);
}
/* Try to add the new order. */
itismychild(lock1->lo_witness, lock2->lo_witness);
mtx_leave(&w_mtx);
return (0);
}
void
witness_checkorder(struct lock_object *lock, int flags,
struct lock_object *interlock)
{
struct lock_list_entry *lock_list, *lle;
struct lock_instance *lock1, *lock2, *plock;
struct lock_class *class, *iclass;
struct proc *p;
struct witness *w, *w1;
int i, j, s;
if (witness_cold || witness_watch < 1 || panicstr != NULL || db_active)
return;
	if ((lock->lo_flags & LO_INITIALIZED) == 0) {
		if (witness_uninitialized_report > 0) {
witness_uninitialized_report--;
printf("witness: lock_object uninitialized: %p\n", lock);
witness_debugger(1);
}
lock->lo_flags |= LO_INITIALIZED;
}
if ((lock->lo_flags & LO_WITNESS) == 0)
return;
w = lock->lo_witness;
class = LOCK_CLASS(lock);
if (w == NULL)
w = lock->lo_witness =
enroll(lock->lo_type, lock->lo_name, class);
p = curproc;
if (class->lc_flags & LC_SLEEPLOCK) {
/*
* Since spin locks include a critical section, this check
* implicitly enforces a lock order of all sleep locks before
* all spin locks.
*/
lock_list = witness_cpu[cpu_number()].wc_spinlocks;
if (lock_list != NULL && lock_list->ll_count > 0) {
panic("acquiring blockable sleep lock with "
"spinlock or critical section held (%s) %s",
class->lc_name, lock->lo_name);
}
/*
* If this is the first lock acquired then just return as
* no order checking is needed.
*/
lock_list = p->p_sleeplocks;
if (lock_list == NULL || lock_list->ll_count == 0)
return;
} else {
/*
* If this is the first lock, just return as no order
* checking is needed.
*/
lock_list = witness_cpu[cpu_number()].wc_spinlocks;
if (lock_list == NULL || lock_list->ll_count == 0)
return;
}
s = splhigh();
/*
* Check to see if we are recursing on a lock we already own. If
* so, make sure that we don't mismatch exclusive and shared lock
* acquires.
*/
lock1 = find_instance(lock_list, lock);
if (lock1 != NULL) {
if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
(flags & LOP_EXCLUSIVE) == 0) {
printf("witness: shared lock of (%s) %s "
"while exclusively locked\n",
class->lc_name, lock->lo_name);
panic("excl->share");
}
if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
(flags & LOP_EXCLUSIVE) != 0) {
printf("witness: exclusive lock of (%s) %s "
"while share locked\n",
class->lc_name, lock->lo_name);
panic("share->excl");
}
goto out_splx;
}
/* Warn if the interlock is not locked exactly once. */
if (interlock != NULL) {
iclass = LOCK_CLASS(interlock);
lock1 = find_instance(lock_list, interlock);
if (lock1 == NULL)
panic("interlock (%s) %s not locked",
iclass->lc_name, interlock->lo_name);
else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
panic("interlock (%s) %s recursed",
iclass->lc_name, interlock->lo_name);
}
/*
* Find the previously acquired lock, but ignore interlocks.
*/
	plock = &lock_list->ll_children[lock_list->ll_count - 1];
	if (interlock != NULL && plock->li_lock == interlock) {
if (lock_list->ll_count > 1)
plock =
&lock_list->ll_children[lock_list->ll_count - 2];
else {
lle = lock_list->ll_next;
/*
* The interlock is the only lock we hold, so
* simply return.
*/
if (lle == NULL)
goto out_splx;
plock = &lle->ll_children[lle->ll_count - 1];
}
}
/*
* Try to perform most checks without a lock. If this succeeds we
* can skip acquiring the lock and return success. Otherwise we redo
* the check with the lock held to handle races with concurrent updates.
*/
w1 = plock->li_lock->lo_witness;
if (witness_lock_order_check(w1, w))
goto out_splx;
mtx_enter(&w_mtx);
if (witness_lock_order_check(w1, w))
goto out;
witness_lock_order_add(w1, w);
/*
* Check for duplicate locks of the same type. Note that we only
* have to check for this on the last lock we just acquired. Any
* other cases will be caught as lock order violations.
*/
if (w1 == w) {
i = w->w_index;
if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
!(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
w_rmatrix[i][i] |= WITNESS_REVERSAL;
w->w_reversed = 1;
mtx_leave(&w_mtx);
printf("witness: acquiring duplicate lock of "
"same type: \"%s\"\n", w->w_type->lt_name);
printf(" 1st %s\n", plock->li_lock->lo_name);
printf(" 2nd %s\n", lock->lo_name);
witness_debugger(1);
} else
mtx_leave(&w_mtx);
goto out_splx;
}
MUTEX_ASSERT_LOCKED(&w_mtx);
/*
* If we know that the lock we are acquiring comes after
* the lock we most recently acquired in the lock order tree,
* then there is no need for any further checks.
*/
if (isitmychild(w1, w))
goto out;
	for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
		for (i = lle->ll_count - 1; i >= 0; i--, j++) {
			KASSERT(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
lock1 = &lle->ll_children[i];
/*
* Ignore the interlock.
*/
if (interlock == lock1->li_lock)
continue;
/*
* If this lock doesn't undergo witness checking,
* then skip it.
*/
w1 = lock1->li_lock->lo_witness;
if (w1 == NULL) {
KASSERTMSG((lock1->li_lock->lo_flags &
LO_WITNESS) == 0,
"lock missing witness structure");
continue;
}
/*
* If we are locking Giant and this is a sleepable
* lock, then skip it.
*/
if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
is_kernel_lock(lock))
continue;
/*
* If we are locking a sleepable lock and this lock
* is Giant, then skip it.
*/
if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
is_kernel_lock(lock1->li_lock))
continue;
/*
* If we are locking a sleepable lock and this lock
* isn't sleepable, we want to treat it as a lock
			 * order violation to enforce a general lock order of
* sleepable locks before non-sleepable locks.
*/
if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
goto reversal;
/*
* If we are locking Giant and this is a non-sleepable
* lock, then treat it as a reversal.
*/
if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
is_kernel_lock(lock))
goto reversal;
/*
			 * Check the lock order hierarchy for a reversal.
*/
if (!isitmydescendant(w, w1))
continue;
reversal:
/*
* We have a lock order violation, check to see if it
* is allowed or has already been yelled about.
*/
/* Bail if this violation is known */
if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
goto out;
/* Record this as a violation */
w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
w->w_reversed = w1->w_reversed = 1;
witness_increment_graph_generation();
mtx_leave(&w_mtx);
/*
* There are known LORs between VNODE locks. They are
* not an indication of a bug. VNODE locks are flagged
* as such (LO_IS_VNODE) and we don't yell if the LOR
* is between 2 VNODE locks.
*/
if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
(lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
goto out_splx;
/*
* Ok, yell about it.
*/
printf("witness: ");
if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
printf("lock order reversal: "
"(sleepable after non-sleepable)\n");
else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
&& is_kernel_lock(lock))
printf("lock order reversal: "
"(Giant after non-sleepable)\n");
else
printf("lock order reversal:\n");
/*
* Try to locate an earlier lock with
* witness w in our list.
*/
do {
lock2 = &lle->ll_children[i];
KASSERT(lock2->li_lock != NULL);
if (lock2->li_lock->lo_witness == w)
break;
if (i == 0 && lle->ll_next != NULL) {
lle = lle->ll_next;
i = lle->ll_count - 1;
KASSERT(i >= 0 && i < LOCK_NCHILDREN);
} else
i--;
} while (i >= 0);
if (i < 0) {
printf(" 1st %p %s (%s)\n",
lock1->li_lock, lock1->li_lock->lo_name,
w1->w_type->lt_name);
printf(" 2nd %p %s (%s)\n",
lock, lock->lo_name, w->w_type->lt_name);
} else {
printf(" 1st %p %s (%s)\n",
lock2->li_lock, lock2->li_lock->lo_name,
lock2->li_lock->lo_witness->w_type->
lt_name);
printf(" 2nd %p %s (%s)\n",
lock1->li_lock, lock1->li_lock->lo_name,
w1->w_type->lt_name);
printf(" 3rd %p %s (%s)\n", lock,
lock->lo_name, w->w_type->lt_name);
}
if (witness_watch > 1) {
struct witness_lock_order_data *wlod1, *wlod2;
mtx_enter(&w_mtx);
wlod1 = witness_lock_order_get(w, w1);
wlod2 = witness_lock_order_get(w1, w);
mtx_leave(&w_mtx);
/*
* It is safe to access saved stack traces,
* w_type, and w_class without the lock.
* Once written, they never change.
*/
if (wlod1 != NULL) {
printf("lock order \"%s\"(%s) -> "
"\"%s\"(%s) first seen at:\n",
w->w_type->lt_name,
w->w_class->lc_name,
w1->w_type->lt_name,
w1->w_class->lc_name);
stacktrace_print(
&wlod1->wlod_stack, printf);
} else {
printf("lock order data "
"w2 -> w1 missing\n");
}
if (wlod2 != NULL) {
printf("lock order \"%s\"(%s) -> "
"\"%s\"(%s) first seen at:\n",
w1->w_type->lt_name,
w1->w_class->lc_name,
w->w_type->lt_name,
w->w_class->lc_name);
stacktrace_print(
&wlod2->wlod_stack, printf);
} else {
printf("lock order data "
"w1 -> w2 missing\n");
}
}
witness_debugger(0);
goto out_splx;
}
}
/*
* If requested, build a new lock order. However, don't build a new
* relationship between a sleepable lock and Giant if it is in the
* wrong direction. The correct lock order is that sleepable locks
* always come before Giant.
*/
if (flags & LOP_NEWORDER && !(is_kernel_lock(plock->li_lock) &&
(lock->lo_flags & LO_SLEEPABLE) != 0))
itismychild(plock->li_lock->lo_witness, w);
out:
mtx_leave(&w_mtx);
out_splx:
splx(s);
}
void
witness_lock(struct lock_object *lock, int flags)
{
struct lock_list_entry **lock_list, *lle;
struct lock_instance *instance;
struct proc *p;
struct witness *w;
int s;
if (witness_cold || witness_watch < 0 || panicstr != NULL ||
db_active || (lock->lo_flags & LO_WITNESS) == 0)
return;
w = lock->lo_witness;
if (w == NULL)
w = lock->lo_witness =
		    enroll(lock->lo_type, lock->lo_name, LOCK_CLASS(lock));
	p = curproc;
/* Determine lock list for this lock. */
if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
lock_list = &p->p_sleeplocks;
else
lock_list = &witness_cpu[cpu_number()].wc_spinlocks;
s = splhigh();
/* Check to see if we are recursing on a lock we already own. */
instance = find_instance(*lock_list, lock);
if (instance != NULL) {
instance->li_flags++;
goto out;
}
w->w_acquired = 1;
/* Find the next open lock instance in the list and fill it. */
lle = *lock_list;
if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
lle = witness_lock_list_get();
if (lle == NULL)
goto out;
lle->ll_next = *lock_list;
*lock_list = lle;
}
instance = &lle->ll_children[lle->ll_count++];
instance->li_lock = lock;
if ((flags & LOP_EXCLUSIVE) != 0)
instance->li_flags = LI_EXCLUSIVE;
else
instance->li_flags = 0;
instance->li_stack = NULL;
if (witness_locktrace) {
instance->li_stack = witness_lock_stack_get();
if (instance->li_stack != NULL)
stacktrace_save(&instance->li_stack->ls_stack);
}
out:
splx(s);
}
void
witness_upgrade(struct lock_object *lock, int flags)
{
struct lock_instance *instance;
struct lock_class *class;
int s;
KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
if (lock->lo_witness == NULL || witness_watch < 0 ||
panicstr != NULL || db_active)
return;
class = LOCK_CLASS(lock);
if (witness_watch) {
if ((lock->lo_flags & LO_UPGRADABLE) == 0)
panic("upgrade of non-upgradable lock (%s) %s",
class->lc_name, lock->lo_name);
if ((class->lc_flags & LC_SLEEPLOCK) == 0)
panic("upgrade of non-sleep lock (%s) %s",
class->lc_name, lock->lo_name);
}
s = splhigh();
instance = find_instance(curproc->p_sleeplocks, lock);
if (instance == NULL) {
panic("upgrade of unlocked lock (%s) %s",
class->lc_name, lock->lo_name);
goto out;
}
if (witness_watch) {
if ((instance->li_flags & LI_EXCLUSIVE) != 0)
panic("upgrade of exclusive lock (%s) %s",
class->lc_name, lock->lo_name);
if ((instance->li_flags & LI_RECURSEMASK) != 0)
panic("upgrade of recursed lock (%s) %s r=%d",
class->lc_name, lock->lo_name,
instance->li_flags & LI_RECURSEMASK);
}
instance->li_flags |= LI_EXCLUSIVE;
out:
splx(s);
}
void
witness_downgrade(struct lock_object *lock, int flags)
{
struct lock_instance *instance;
struct lock_class *class;
int s;
	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
	if (lock->lo_witness == NULL || witness_watch < 0 ||
panicstr != NULL || db_active)
return;
class = LOCK_CLASS(lock);
if (witness_watch) {
if ((lock->lo_flags & LO_UPGRADABLE) == 0)
			panic("downgrade of non-upgradable lock (%s) %s",
			    class->lc_name, lock->lo_name);
		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
panic("downgrade of non-sleep lock (%s) %s",
class->lc_name, lock->lo_name);
}
s = splhigh();
instance = find_instance(curproc->p_sleeplocks, lock);
if (instance == NULL) {
panic("downgrade of unlocked lock (%s) %s",
class->lc_name, lock->lo_name);
goto out;
}
if (witness_watch) {
if ((instance->li_flags & LI_EXCLUSIVE) == 0)
panic("downgrade of shared lock (%s) %s",
			    class->lc_name, lock->lo_name);
		if ((instance->li_flags & LI_RECURSEMASK) != 0)
panic("downgrade of recursed lock (%s) %s r=%d",
class->lc_name, lock->lo_name,
instance->li_flags & LI_RECURSEMASK);
}
instance->li_flags &= ~LI_EXCLUSIVE;
out:
splx(s);
}
void
witness_unlock(struct lock_object *lock, int flags)
{
struct lock_list_entry **lock_list, *lle;
struct lock_instance *instance;
struct lock_class *class;
struct proc *p;
int i, j;
int s;
if (witness_cold || lock->lo_witness == NULL ||
panicstr != NULL || db_active)
return;
p = curproc;
class = LOCK_CLASS(lock);
/* Find lock instance associated with this lock. */
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = &p->p_sleeplocks;
else
lock_list = &witness_cpu[cpu_number()].wc_spinlocks;
s = splhigh();
lle = *lock_list;
	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
		for (i = 0; i < (*lock_list)->ll_count; i++) {
instance = &(*lock_list)->ll_children[i];
if (instance->li_lock == lock)
goto found;
}
/*
	 * When WITNESS is disabled via witness_watch, locks may remain
	 * registered in the p_sleeplocks queue.  Make sure these queues get
	 * flushed: search for any such leftover locks and remove them.
*/
	if (witness_watch > 0) {
		panic("lock (%s) %s not locked",
		    class->lc_name, lock->lo_name);
}
goto out;
found:
/* First, check for shared/exclusive mismatches. */
if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
(flags & LOP_EXCLUSIVE) == 0) {
printf("witness: shared unlock of (%s) %s "
"while exclusively locked\n",
class->lc_name, lock->lo_name);
panic("excl->ushare");
}
if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
(flags & LOP_EXCLUSIVE) != 0) {
printf("witness: exclusive unlock of (%s) %s "
"while share locked\n", class->lc_name, lock->lo_name);
panic("share->uexcl");
}
/* If we are recursed, unrecurse. */
if ((instance->li_flags & LI_RECURSEMASK) > 0) {
instance->li_flags--;
goto out;
}
/* The lock is now being dropped, check for NORELEASE flag */
if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
printf("witness: forbidden unlock of (%s) %s\n",
class->lc_name, lock->lo_name);
panic("lock marked norelease");
}
/* Release the stack buffer, if any. */
if (instance->li_stack != NULL) {
witness_lock_stack_free(instance->li_stack);
instance->li_stack = NULL;
}
/* Remove this item from the list. */
for (j = i; j < (*lock_list)->ll_count - 1; j++)
(*lock_list)->ll_children[j] =
(*lock_list)->ll_children[j + 1];
(*lock_list)->ll_count--;
/*
	 * To reduce contention on w_mtx, we always try to keep a head
	 * object in the list so that frequent allocation from the free
	 * pool (and the locking that goes with it) is avoided.
	 * To keep the code simple, an empty head object also means that
	 * no further objects remain in the list, so list ownership must
	 * be handed over to another entry if the current head is to be
	 * freed.
*/
if ((*lock_list)->ll_count == 0) {
if (*lock_list == lle) {
if (lle->ll_next == NULL)
goto out;
} else
lle = *lock_list;
*lock_list = lle->ll_next;
witness_lock_list_free(lle);
}
out:
splx(s);
}
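/*
 * Verify that an exiting thread holds no sleep locks, then release its
 * lock list entry.
 */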
void
witness_thread_exit(struct proc *p)
{
struct lock_list_entry *lle;
int i, n, s;
lle = p->p_sleeplocks;
if (lle == NULL || panicstr != NULL || db_active)
return;
if (lle->ll_count != 0) {
for (n = 0; lle != NULL; lle = lle->ll_next)
for (i = lle->ll_count - 1; i >= 0; i--) {
if (n == 0)
printf("witness: thread %p exiting "
"with the following locks held:\n",
p);
n++;
witness_list_lock(&lle->ll_children[i],
printf);
}
panic("thread %p cannot exit while holding sleeplocks", p);
}
KASSERT(lle->ll_next == NULL);
s = splhigh();
witness_lock_list_free(lle);
splx(s);
}
/*
* Warn if any locks other than 'lock' are held. Flags can be passed in to
* exempt Giant and sleepable locks from the checks as well. If any
* non-exempt locks are held, then a supplied message is printed to the
* output channel along with a list of the offending locks. If indicated in the
* flags then a failure results in a panic as well.
*/
int
witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
{
struct lock_list_entry *lock_list, *lle;
struct lock_instance *lock1;
struct proc *p;
va_list ap;
int i, n;
if (witness_cold || witness_watch < 1 || panicstr != NULL || db_active)
return (0);
n = 0;
p = curproc;
	for (lle = p->p_sleeplocks; lle != NULL; lle = lle->ll_next)
		for (i = lle->ll_count - 1; i >= 0; i--) {
lock1 = &lle->ll_children[i];
if (lock1->li_lock == lock)
continue;
if (flags & WARN_KERNELOK &&
is_kernel_lock(lock1->li_lock))
continue;
if (flags & WARN_SLEEPOK &&
(lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
continue;
			if (n == 0) {
				printf("witness: ");
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf(" with the following %slocks held:\n",
(flags & WARN_SLEEPOK) != 0 ?
"non-sleepable " : "");
}
n++;
witness_list_lock(lock1, printf);
}
lock_list = witness_cpu[cpu_number()].wc_spinlocks;
if (lock_list != NULL && lock_list->ll_count != 0) {
/*
		 * We should only have one spin lock here and, since the
		 * exemption flags cannot apply to this lock class, check
		 * whether the first spin lock is the one curproc is
		 * expected to hold.
*/
lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
		if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
		    lock1->li_lock == lock && n == 0)
return (0);
printf("witness: ");
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf(" with the following %slocks held:\n",
(flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : "");
n += witness_list_locks(&lock_list, printf);
}
if (n > 0) {
if (flags & WARN_PANIC)
panic("%s", __func__);
else
witness_debugger(1);
}
return (n);
}
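/*
 * Find the witness for the given lock type and subtype, creating and
 * registering a new one the first time the combination is seen.  Returns
 * NULL when witness is disabled or no witness object can be allocated.
 */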
static struct witness *
enroll(const struct lock_type *type, const char *subtype,
struct lock_class *lock_class)
{
struct witness *w;
struct witness_list *typelist;
	KASSERT(type != NULL);
	if (witness_watch < 0 || panicstr != NULL || db_active)
return (NULL);
if ((lock_class->lc_flags & LC_SPINLOCK)) {
typelist = &w_spin;
} else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
typelist = &w_sleep;
} else {
panic("lock class %s is not sleep or spin",
lock_class->lc_name);
return (NULL);
}
mtx_enter(&w_mtx);
w = witness_hash_get(type, subtype);
if (w)
goto found;
if ((w = witness_get()) == NULL)
return (NULL);
w->w_type = type;
w->w_subtype = subtype;
w->w_class = lock_class;
SLIST_INSERT_HEAD(&w_all, w, w_list);
if (lock_class->lc_flags & LC_SPINLOCK) {
SLIST_INSERT_HEAD(&w_spin, w, w_typelist);
w_spin_cnt++;
} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
SLIST_INSERT_HEAD(&w_sleep, w, w_typelist);
w_sleep_cnt++;
}
/* Insert new witness into the hash */
witness_hash_put(w);
witness_increment_graph_generation();
mtx_leave(&w_mtx);
return (w);
found:
mtx_leave(&w_mtx);
if (lock_class != w->w_class)
panic("lock (%s) %s does not match earlier (%s) lock",
type->lt_name, lock_class->lc_name, w->w_class->lc_name);
return (w);
}
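/*
 * Record "child" as a direct child of "parent" in the relationship matrix
 * and propagate the resulting ancestor/descendant relationships through
 * the graph.
 */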
static void
adopt(struct witness *parent, struct witness *child)
{
int pi, ci, i, j;
	if (witness_cold == 0)
		MUTEX_ASSERT_LOCKED(&w_mtx);
/* If the relationship is already known, there's no work to be done. */
if (isitmychild(parent, child))
return;
/* When the structure of the graph changes, bump up the generation. */
witness_increment_graph_generation();
/*
* The hard part ... create the direct relationship, then propagate all
* indirect relationships.
*/
pi = parent->w_index;
ci = child->w_index;
	WITNESS_INDEX_ASSERT(pi);
	WITNESS_INDEX_ASSERT(ci);
	KASSERT(pi != ci);
w_rmatrix[pi][ci] |= WITNESS_PARENT;
w_rmatrix[ci][pi] |= WITNESS_CHILD;
/*
* If parent was not already an ancestor of child,
* then we increment the descendant and ancestor counters.
*/
	if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
		parent->w_num_descendants++;
child->w_num_ancestors++;
}
/*
* Find each ancestor of 'pi'. Note that 'pi' itself is counted as
* an ancestor of 'pi' during this loop.
*/
	for (i = 1; i <= w_max_used_index; i++) {
		if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
(i != pi))
continue;
/* Find each descendant of 'i' and mark it as a descendant. */
for (j = 1; j <= w_max_used_index; j++) {
/*
* Skip children that are already marked as
* descendants of 'i'.
*/
if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
continue;
/*
* We are only interested in descendants of 'ci'. Note
* that 'ci' itself is counted as a descendant of 'ci'.
*/
if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
(j != ci))
continue;
w_rmatrix[i][j] |= WITNESS_ANCESTOR;
w_rmatrix[j][i] |= WITNESS_DESCENDANT;
w_data[i].w_num_descendants++;
w_data[j].w_num_ancestors++;
/*
* Make sure we aren't marking a node as both an
* ancestor and descendant. We should have caught
* this as a lock order reversal earlier.
*/
if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
(w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
printf("witness: rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
i, j, w_rmatrix[i][j]);
#ifdef DDB
db_stack_dump();
#endif
printf("witness disabled\n");
witness_watch = -1;
}
if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
(w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
printf("witness: rmatrix paradox! [%d][%d]=%d "
"both ancestor and descendant\n",
j, i, w_rmatrix[j][i]);
#ifdef DDB
db_stack_dump();
#endif
printf("witness disabled\n");
witness_watch = -1;
}
}
}
}
static void
itismychild(struct witness *parent, struct witness *child)
{
	KASSERT(child != NULL && parent != NULL);
	if (witness_cold == 0)
		MUTEX_ASSERT_LOCKED(&w_mtx);
if (!witness_lock_type_equal(parent, child)) {
		if (witness_cold == 0)
			mtx_leave(&w_mtx);
panic(
"%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
"the same lock type", __func__, parent->w_type->lt_name,
parent->w_class->lc_name, child->w_type->lt_name,
child->w_class->lc_name);
}
adopt(parent, child);
}
/*
* Generic code for the isitmy*() functions. The rmask parameter is the
* expected relationship of w1 to w2.
*/
static int
_isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
{
unsigned char r1, r2;
int i1, i2;
i1 = w1->w_index;
i2 = w2->w_index;
	WITNESS_INDEX_ASSERT(i1);
	WITNESS_INDEX_ASSERT(i2);
r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
/* The flags on one better be the inverse of the flags on the other */
	if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
	    (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
/* Don't squawk if we're potentially racing with an update. */
if (w_mtx.mtx_owner != curcpu())
return (0);
printf("witness: %s: rmatrix mismatch between %s (index %d) "
"and %s (index %d): w_rmatrix[%d][%d] == %x but "
"w_rmatrix[%d][%d] == %x\n",
fname, w1->w_type->lt_name, i1, w2->w_type->lt_name,
i2, i1, i2, r1,
i2, i1, r2);
#ifdef DDB
db_stack_dump();
#endif
printf("witness disabled\n");
witness_watch = -1;
}
return (r1 & rmask);
}
/*
* Checks if @child is a direct child of @parent.
*/
static int
isitmychild(struct witness *parent, struct witness *child)
{
return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
}
/*
* Checks if @descendant is a direct or indirect descendant of @ancestor.
*/
static int
isitmydescendant(struct witness *ancestor, struct witness *descendant)
{
return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
__func__));
}
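/*
 * Take a witness object off the free list.  Expects w_mtx to be held;
 * on failure (witness disabled or the pool exhausted) the mutex is
 * dropped and NULL is returned.
 */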
static struct witness *
witness_get(void)
{
struct witness *w;
int index;
	if (witness_cold == 0)
		MUTEX_ASSERT_LOCKED(&w_mtx);
if (witness_watch < 0) {
mtx_leave(&w_mtx);
return (NULL);
}
if (SLIST_EMPTY(&w_free)) {
witness_watch = -1;
mtx_leave(&w_mtx);
printf("WITNESS: unable to allocate a new witness object\n");
return (NULL);
}
w = SLIST_FIRST(&w_free);
SLIST_REMOVE_HEAD(&w_free, w_list);
w_free_cnt--;
index = w->w_index;
KASSERT(index > 0 && index == w_max_used_index + 1 &&
index < witness_count);
memset(w, 0, sizeof(*w));
w->w_index = index;
	if (index > w_max_used_index)
		w_max_used_index = index;
return (w);
}
static void
witness_free(struct witness *w)
{
SLIST_INSERT_HEAD(&w_free, w, w_list);
w_free_cnt++;
}
static struct lock_list_entry *
witness_lock_list_get(void)
{
struct lock_list_entry *lle;
struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
if (witness_watch < 0)
return (NULL);
splassert(IPL_HIGH);
if (wcpu->wc_lle_count > 0) {
lle = wcpu->wc_lle_cache;
wcpu->wc_lle_cache = lle->ll_next;
wcpu->wc_lle_count--;
memset(lle, 0, sizeof(*lle));
return (lle);
}
mtx_enter(&w_mtx);
lle = w_lock_list_free;
if (lle == NULL) {
witness_watch = -1;
mtx_leave(&w_mtx);
printf("%s: witness exhausted\n", __func__);
return (NULL);
}
w_lock_list_free = lle->ll_next;
mtx_leave(&w_mtx);
memset(lle, 0, sizeof(*lle));
return (lle);
}
static void
witness_lock_list_free(struct lock_list_entry *lle)
{
struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
splassert(IPL_HIGH);
if (wcpu->wc_lle_count < WITNESS_LLE_CACHE_MAX) {
lle->ll_next = wcpu->wc_lle_cache;
wcpu->wc_lle_cache = lle;
wcpu->wc_lle_count++;
return;
}
mtx_enter(&w_mtx);
lle->ll_next = w_lock_list_free;
w_lock_list_free = lle;
mtx_leave(&w_mtx);
}
static union lock_stack *
witness_lock_stack_get(void)
{
union lock_stack *stack = NULL;
struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
splassert(IPL_HIGH);
if (wcpu->wc_stk_count > 0) {
stack = wcpu->wc_stk_cache;
wcpu->wc_stk_cache = stack->ls_next;
wcpu->wc_stk_count--;
return (stack);
}
mtx_enter(&w_mtx);
if (w_lock_stack_free != NULL) {
stack = w_lock_stack_free;
w_lock_stack_free = stack->ls_next;
}
mtx_leave(&w_mtx);
return (stack);
}
static void
witness_lock_stack_free(union lock_stack *stack)
{
struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
splassert(IPL_HIGH);
if (wcpu->wc_stk_count < WITNESS_STK_CACHE_MAX) {
stack->ls_next = wcpu->wc_stk_cache;
wcpu->wc_stk_cache = stack;
wcpu->wc_stk_count++;
return;
}
mtx_enter(&w_mtx);
stack->ls_next = w_lock_stack_free;
w_lock_stack_free = stack;
mtx_leave(&w_mtx);
}
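/*
 * Find the lock instance of the given lock object in a list of held locks.
 * Returns NULL if the lock is not held.
 */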
static struct lock_instance *
find_instance(struct lock_list_entry *list, const struct lock_object *lock)
{
struct lock_list_entry *lle;
struct lock_instance *instance;
int i;
for (lle = list; lle != NULL; lle = lle->ll_next) {
for (i = lle->ll_count - 1; i >= 0; i--) {
			instance = &lle->ll_children[i];
			if (instance->li_lock == lock)
return (instance);
}
}
return (NULL);
}
static void
witness_list_lock(struct lock_instance *instance,
int (*prnt)(const char *fmt, ...))
{
struct lock_object *lock;
lock = instance->li_lock;
prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
"exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
prnt(" r = %d (%p)\n", instance->li_flags & LI_RECURSEMASK, lock);
	if (instance->li_stack != NULL)
		stacktrace_print(&instance->li_stack->ls_stack, prnt);
}
#ifdef DDB
static int
witness_thread_has_locks(struct proc *p)
{
if (p->p_sleeplocks == NULL)
return (0);
return (p->p_sleeplocks->ll_count != 0);
}
static int
witness_process_has_locks(struct process *pr)
{
struct proc *p;
TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
if (witness_thread_has_locks(p))
return (1);
}
return (0);
}
#endif
int
witness_list_locks(struct lock_list_entry **lock_list,
int (*prnt)(const char *fmt, ...))
{
struct lock_list_entry *lle;
int i, nheld;
nheld = 0;
	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
		for (i = lle->ll_count - 1; i >= 0; i--) {
witness_list_lock(&lle->ll_children[i], prnt);
nheld++;
}
return (nheld);
}
/*
* This is a bit risky at best. We call this function when we have timed
* out acquiring a spin lock, and we assume that the other CPU is stuck
* with this lock held. So, we go groveling around in the other CPU's
* per-cpu data to try to find the lock instance for this spin lock to
* see when it was last acquired.
*/
void
witness_display_spinlock(struct lock_object *lock, struct proc *owner,
int (*prnt)(const char *fmt, ...))
{
struct lock_instance *instance;
if (owner->p_stat != SONPROC)
return;
instance = find_instance(
witness_cpu[owner->p_cpu->ci_cpuid].wc_spinlocks, lock);
if (instance != NULL)
witness_list_lock(instance, prnt);
}
void
witness_assert(const struct lock_object *lock, int flags)
{
struct lock_instance *instance;
struct lock_class *class;
if (lock->lo_witness == NULL || witness_watch < 1 ||
panicstr != NULL || db_active)
return;
class = LOCK_CLASS(lock);
if ((class->lc_flags & LC_SLEEPLOCK) != 0)
instance = find_instance(curproc->p_sleeplocks, lock);
else if ((class->lc_flags & LC_SPINLOCK) != 0)
instance = find_instance(
witness_cpu[cpu_number()].wc_spinlocks, lock);
else {
panic("lock (%s) %s is not sleep or spin!",
class->lc_name, lock->lo_name);
return;
}
switch (flags) {
case LA_UNLOCKED:
if (instance != NULL)
panic("lock (%s) %s locked",
class->lc_name, lock->lo_name);
break;
case LA_LOCKED:
case LA_LOCKED | LA_RECURSED:
case LA_LOCKED | LA_NOTRECURSED:
case LA_SLOCKED:
case LA_SLOCKED | LA_RECURSED:
case LA_SLOCKED | LA_NOTRECURSED:
case LA_XLOCKED:
case LA_XLOCKED | LA_RECURSED:
case LA_XLOCKED | LA_NOTRECURSED:
if (instance == NULL) {
panic("lock (%s) %s not locked",
class->lc_name, lock->lo_name);
break;
}
if ((flags & LA_XLOCKED) != 0 &&
(instance->li_flags & LI_EXCLUSIVE) == 0)
panic(
"lock (%s) %s not exclusively locked",
			    class->lc_name, lock->lo_name);
		if ((flags & LA_SLOCKED) != 0 &&
(instance->li_flags & LI_EXCLUSIVE) != 0)
panic(
"lock (%s) %s exclusively locked",
			    class->lc_name, lock->lo_name);
		if ((flags & LA_RECURSED) != 0 &&
(instance->li_flags & LI_RECURSEMASK) == 0)
panic("lock (%s) %s not recursed",
			    class->lc_name, lock->lo_name);
		if ((flags & LA_NOTRECURSED) != 0 &&
(instance->li_flags & LI_RECURSEMASK) != 0)
panic("lock (%s) %s recursed",
class->lc_name, lock->lo_name);
break;
default:
panic("invalid lock assertion");
}
}
static void
witness_setflag(struct lock_object *lock, int flag, int set)
{
struct lock_list_entry *lock_list;
struct lock_instance *instance;
struct lock_class *class;
if (lock->lo_witness == NULL || witness_watch < 0 ||
panicstr != NULL || db_active)
return;
class = LOCK_CLASS(lock);
if (class->lc_flags & LC_SLEEPLOCK)
lock_list = curproc->p_sleeplocks;
else
lock_list = witness_cpu[cpu_number()].wc_spinlocks;
instance = find_instance(lock_list, lock);
if (instance == NULL) {
panic("%s: lock (%s) %s not locked", __func__,
class->lc_name, lock->lo_name);
return;
}
if (set)
instance->li_flags |= flag;
else
instance->li_flags &= ~flag;
}
void
witness_norelease(struct lock_object *lock)
{
witness_setflag(lock, LI_NORELEASE, 1);
}
void
witness_releaseok(struct lock_object *lock)
{
witness_setflag(lock, LI_NORELEASE, 0);
}
#ifdef DDB
static void
witness_ddb_list(struct proc *p)
{
struct witness_cpu *wc = &witness_cpu[cpu_number()];
KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
KASSERTMSG(db_active, "%s: not in the debugger", __func__);
if (witness_watch < 1)
return;
witness_list_locks(&p->p_sleeplocks, db_printf);
/*
	 * We only handle spin locks if p == curproc.  This is somewhat broken
	 * if p is currently executing on some other CPU and holds spin locks,
	 * as we won't display those locks.  If we had an MI way of getting
	 * the per-CPU data for a given CPU, we could use the CPU the thread
	 * is running on to fetch its list of spin locks and "fix" this.
*
* That still wouldn't really fix this unless we locked the scheduler
* lock or stopped the other CPU to make sure it wasn't changing the
* list out from under us. It is probably best to just not try to
* handle threads on other CPU's for now.
*/
if (p == curproc && wc->wc_spinlocks != NULL)
witness_list_locks(&wc->wc_spinlocks, db_printf);
}
void
db_witness_list(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
{
struct proc *p;
if (have_addr)
p = (struct proc *)addr;
else
p = curproc;
witness_ddb_list(p);
}
void
db_witness_list_all(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct lock_list_entry *lock_list;
struct process *pr;
struct proc *p;
CPU_INFO_FOREACH(cii, ci) {
lock_list = witness_cpu[CPU_INFO_UNIT(ci)].wc_spinlocks;
if (lock_list == NULL || lock_list->ll_count == 0)
continue;
db_printf("CPU %d:\n", CPU_INFO_UNIT(ci));
witness_list_locks(&lock_list, db_printf);
}
/*
* It would be nice to list only threads and processes that actually
* held sleep locks, but that information is currently not exported
* by WITNESS.
*/
LIST_FOREACH(pr, &allprocess, ps_list) {
if (!witness_process_has_locks(pr))
continue;
TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
if (!witness_thread_has_locks(p))
continue;
db_printf("Process %d (%s) thread %p (%d)\n",
pr->ps_pid, pr->ps_comm, p, p->p_tid);
witness_ddb_list(p);
}
}
}
void
witness_print_badstacks(void)
{
static struct witness tmp_w1, tmp_w2;
static struct witness_lock_order_data tmp_data1, tmp_data2;
struct witness_lock_order_data *data1, *data2;
struct witness *w1, *w2;
int error, generation, i, j;
if (witness_watch < 1) {
db_printf("witness watch is disabled\n");
return;
}
if (witness_cold) {
db_printf("witness is cold\n");
return;
}
error = 0;
memset(&tmp_w1, 0, sizeof(tmp_w1));
memset(&tmp_w2, 0, sizeof(tmp_w2));
memset(&tmp_data1, 0, sizeof(tmp_data1));
memset(&tmp_data2, 0, sizeof(tmp_data2));
restart:
mtx_enter(&w_mtx);
generation = w_generation;
mtx_leave(&w_mtx);
db_printf("Number of known direct relationships is %d\n",
w_lohash.wloh_count);
for (i = 1; i < w_max_used_index; i++) {
mtx_enter(&w_mtx);
if (generation != w_generation) {
mtx_leave(&w_mtx);
/* The graph has changed, try again. */
db_printf("Lock graph changed, restarting trace.\n");
goto restart;
}
w1 = &w_data[i];
if (w1->w_reversed == 0) {
mtx_leave(&w_mtx);
continue;
}
/* Copy w1 locally so we can release the spin lock. */
tmp_w1 = *w1;
mtx_leave(&w_mtx);
if (tmp_w1.w_reversed == 0)
continue;
for (j = 1; j < w_max_used_index; j++) {
if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
continue;
mtx_enter(&w_mtx);
if (generation != w_generation) {
mtx_leave(&w_mtx);
/* The graph has changed, try again. */
db_printf("Lock graph changed, "
"restarting trace.\n");
goto restart;
}
w2 = &w_data[j];
data1 = witness_lock_order_get(w1, w2);
data2 = witness_lock_order_get(w2, w1);
/*
* Copy information locally so we can release the
* spin lock.
*/
tmp_w2 = *w2;
if (data1)
tmp_data1.wlod_stack = data1->wlod_stack;
if (data2 && data2 != data1)
tmp_data2.wlod_stack = data2->wlod_stack;
mtx_leave(&w_mtx);
db_printf("\nLock order reversal between \"%s\"(%s) "
"and \"%s\"(%s)!\n",
tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name,
tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name);
if (data1) {
db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
"first seen at:\n",
tmp_w1.w_type->lt_name,
tmp_w1.w_class->lc_name,
tmp_w2.w_type->lt_name,
tmp_w2.w_class->lc_name);
stacktrace_print(&tmp_data1.wlod_stack,
db_printf);
db_printf("\n");
}
if (data2 && data2 != data1) {
db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
"first seen at:\n",
tmp_w2.w_type->lt_name,
tmp_w2.w_class->lc_name,
tmp_w1.w_type->lt_name,
tmp_w1.w_class->lc_name);
stacktrace_print(&tmp_data2.wlod_stack,
db_printf);
db_printf("\n");
}
}
}
mtx_enter(&w_mtx);
if (generation != w_generation) {
mtx_leave(&w_mtx);
/*
* The graph changed while we were printing stack data,
* try again.
*/
db_printf("Lock graph changed, restarting trace.\n");
goto restart;
}
mtx_leave(&w_mtx);
}
void
db_witness_display(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
{
switch (modif[0]) {
case 'b':
witness_print_badstacks();
break;
default:
witness_ddb_display(db_printf);
break;
}
}
#endif
void
db_witness_print_fullgraph(void)
{
struct witness *w;
int error;
if (witness_watch < 1) {
db_printf("witness watch is disabled\n");
return;
}
if (witness_cold) {
db_printf("witness is cold\n");
return;
}
error = 0;
mtx_enter(&w_mtx);
SLIST_FOREACH(w, &w_all, w_list)
w->w_displayed = 0;
SLIST_FOREACH(w, &w_all, w_list)
db_witness_add_fullgraph(w);
mtx_leave(&w_mtx);
}
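/*
 * Recursively print every known parent -> child edge reachable from "w" as
 * a quoted "parent","child" pair, marking witnesses as displayed so that
 * shared subtrees are not emitted twice.
 */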
static void
db_witness_add_fullgraph(struct witness *w)
{
int i;
if (w->w_displayed != 0 || w->w_acquired == 0)
return;
w->w_displayed = 1;
WITNESS_INDEX_ASSERT(w->w_index);
for (i = 1; i <= w_max_used_index; i++) {
if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
db_printf("\"%s\",\"%s\"\n", w->w_type->lt_name,
w_data[i].w_type->lt_name);
db_witness_add_fullgraph(&w_data[i]);
}
}
}
/*
* A simple hash function. Takes a key pointer and a key size. If size == 0,
* interprets the key as a string and reads until the null
* terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit
* hash value computed from the key.
*/
static uint32_t
witness_hash_djb2(const uint8_t *key, uint32_t size)
{
unsigned int hash = 5381;
int i;
/* hash = hash * 33 + key[i] */
if (size)
for (i = 0; i < size; i++)
hash = ((hash << 5) + hash) + (unsigned int)key[i];
else
for (i = 0; key[i] != 0; i++)
hash = ((hash << 5) + hash) + (unsigned int)key[i];
return (hash);
}
/*
* Initializes the two witness hash tables. Called exactly once from
* witness_initialize().
*/
static void
witness_init_hash_tables(void)
{
int i;
KASSERT(witness_cold);
/* Initialize the hash tables. */
for (i = 0; i < WITNESS_HASH_SIZE; i++)
SLIST_INIT(&w_hash.wh_array[i]);
w_hash.wh_size = WITNESS_HASH_SIZE;
w_hash.wh_count = 0;
/* Initialize the lock order data hash. */
w_lofree = NULL;
for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
w_lodata[i].wlod_next = w_lofree;
w_lofree = &w_lodata[i];
}
w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
w_lohash.wloh_count = 0;
for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
w_lohash.wloh_array[i] = NULL;
}
static struct witness *
witness_hash_get(const struct lock_type *type, const char *subtype)
{
struct witness *w;
uint32_t hash;
	KASSERT(type != NULL);
	if (witness_cold == 0)
		MUTEX_ASSERT_LOCKED(&w_mtx);
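	/* The hash key is the identity of the type and subtype pointers. */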
hash = (uint32_t)((uintptr_t)type ^ (uintptr_t)subtype) %
w_hash.wh_size;
	SLIST_FOREACH(w, &w_hash.wh_array[hash], w_hash_next) {
		if (w->w_type == type && w->w_subtype == subtype)
goto out;
}
out:
return (w);
}
static void
witness_hash_put(struct witness *w)
{
uint32_t hash;
KASSERT(w != NULL);
	KASSERT(w->w_type != NULL);
	if (witness_cold == 0)
		MUTEX_ASSERT_LOCKED(&w_mtx);
KASSERTMSG(witness_hash_get(w->w_type, w->w_subtype) == NULL,
"%s: trying to add a hash entry that already exists!", __func__);
KASSERTMSG(SLIST_NEXT(w, w_hash_next) == NULL,
"%s: w->w_hash_next != NULL", __func__);
hash = (uint32_t)((uintptr_t)w->w_type ^ (uintptr_t)w->w_subtype) %
w_hash.wh_size;
SLIST_INSERT_HEAD(&w_hash.wh_array[hash], w, w_hash_next);
w_hash.wh_count++;
}
static struct witness_lock_order_data *
witness_lock_order_get(struct witness *parent, struct witness *child)
{
struct witness_lock_order_data *data = NULL;
struct witness_lock_order_key key;
unsigned int hash;
KASSERT(parent != NULL && child != NULL);
key.from = parent->w_index;
key.to = child->w_index;
WITNESS_INDEX_ASSERT(key.from);
WITNESS_INDEX_ASSERT(key.to);
if ((w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN) == 0)
goto out;
hash = witness_hash_djb2((const char*)&key,
sizeof(key)) % w_lohash.wloh_size;
data = w_lohash.wloh_array[hash];
while (data != NULL) {
if (witness_lock_order_key_equal(&data->wlod_key, &key))
break;
data = data->wlod_next;
}
out:
return (data);
}
/*
* Verify that parent and child have a known relationship, are not the same,
* and child is actually a child of parent. This is done without w_mtx
* to avoid contention in the common case.
*/
static int
witness_lock_order_check(struct witness *parent, struct witness *child)
{
if (parent != child &&
w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN &&
isitmychild(parent, child))
return (1);
return (0);
}
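/*
 * Record a newly observed lock order in the lock order hash together with
 * the stack trace of the acquisition that established it.
 */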
static int
witness_lock_order_add(struct witness *parent, struct witness *child)
{
static int lofree_empty_reported = 0;
struct witness_lock_order_data *data = NULL;
struct witness_lock_order_key key;
unsigned int hash;
KASSERT(parent != NULL && child != NULL);
key.from = parent->w_index;
key.to = child->w_index;
	WITNESS_INDEX_ASSERT(key.from);
	WITNESS_INDEX_ASSERT(key.to);
	if (w_rmatrix[parent->w_index][child->w_index]
& WITNESS_LOCK_ORDER_KNOWN)
return (1);
hash = witness_hash_djb2((const char*)&key,
sizeof(key)) % w_lohash.wloh_size;
w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
data = w_lofree;
if (data == NULL) {
		if (!lofree_empty_reported) {
			lofree_empty_reported = 1;
printf("witness: out of free lock order entries\n");
}
return (0);
}
w_lofree = data->wlod_next;
data->wlod_next = w_lohash.wloh_array[hash];
data->wlod_key = key;
w_lohash.wloh_array[hash] = data;
w_lohash.wloh_count++;
stacktrace_save_at(&data->wlod_stack, 1);
return (1);
}
/* Call this whenever the structure of the witness graph changes. */
static void
witness_increment_graph_generation(void)
{
	if (witness_cold == 0)
		MUTEX_ASSERT_LOCKED(&w_mtx);
w_generation++;
}
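/*
 * React to a detected locking error according to witness_watch: print only
 * the report (1), additionally dump a stack trace when the caller asks for
 * one (2), or dump a stack trace and drop into the kernel debugger (3).
 */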
static void
witness_debugger(int dump)
{
switch (witness_watch) {
case 1:
break;
case 2:
if (dump)
db_stack_dump();
break;
case 3:
if (dump)
			db_stack_dump();
		db_enter();
break;
default:
panic("witness: locking error");
}
}
static int
witness_alloc_stacks(void)
{
union lock_stack *stacks;
unsigned int i, nstacks = LOCK_CHILDCOUNT * LOCK_NCHILDREN;
rw_assert_wrlock(&w_ctlock);
if (w_lock_stack_num >= nstacks)
return (0);
nstacks -= w_lock_stack_num;
stacks = mallocarray(nstacks, sizeof(*stacks), M_WITNESS,
M_WAITOK | M_CANFAIL | M_ZERO);
if (stacks == NULL)
return (ENOMEM);
mtx_enter(&w_mtx);
for (i = 0; i < nstacks; i++) {
stacks[i].ls_next = w_lock_stack_free;
w_lock_stack_free = &stacks[i];
}
mtx_leave(&w_mtx);
w_lock_stack_num += nstacks;
return (0);
}
int
witness_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
int error, value;
if (namelen != 1)
return (ENOTDIR);
rw_enter_write(&w_ctlock);
switch (name[0]) {
case KERN_WITNESS_WATCH:
error = witness_sysctl_watch(oldp, oldlenp, newp, newlen);
break;
case KERN_WITNESS_LOCKTRACE:
value = witness_locktrace;
error = sysctl_int(oldp, oldlenp, newp, newlen, &value);
if (error == 0 && newp != NULL) {
switch (value) {
case 1:
error = witness_alloc_stacks();
/* FALLTHROUGH */
case 0:
if (error == 0)
witness_locktrace = value;
break;
default:
error = EINVAL;
break;
}
}
break;
default:
error = EOPNOTSUPP;
break;
}
rw_exit_write(&w_ctlock);
return (error);
}
int
witness_sysctl_watch(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
int error;
int value;
value = witness_watch;
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&value, -1, 3);
if (error == 0 && newp != NULL) {
mtx_enter(&w_mtx);
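		/* Witness cannot be re-enabled once completely disabled. */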
		if (value < 0 || witness_watch >= 0)
			witness_watch = value;
else
error = EINVAL;
mtx_leave(&w_mtx);
}
return (error);
}
/* $OpenBSD: in6_pcb.c,v 1.123 2022/09/03 22:43:38 mvs Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*
* Copyright (c) 1982, 1986, 1990, 1993, 1995
* Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include "pf.h"
#include "stoeplitz.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/pfvar.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_var.h>
#if NSTOEPLITZ > 0
#include <net/toeplitz.h>
#endif
const struct in6_addr zeroin6_addr;
struct inpcb *in6_pcbhash_lookup(struct inpcbtable *, u_int,
const struct in6_addr *, u_short, const struct in6_addr *, u_short);
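/*
 * Compute the hash bucket for a PCB: SipHash-2-4 keyed per table over
 * the routing domain, foreign address/port and local address/port,
 * masked to the table size.
 */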
struct inpcbhead *
in6_pcbhash(struct inpcbtable *table, u_int rdomain,
const struct in6_addr *faddr, u_short fport,
const struct in6_addr *laddr, u_short lport)
{
SIPHASH_CTX ctx;
u_int32_t nrdom = htonl(rdomain);
SipHash24_Init(&ctx, &table->inpt_key);
SipHash24_Update(&ctx, &nrdom, sizeof(nrdom));
SipHash24_Update(&ctx, faddr, sizeof(*faddr));
SipHash24_Update(&ctx, &fport, sizeof(fport));
SipHash24_Update(&ctx, laddr, sizeof(*laddr));
SipHash24_Update(&ctx, &lport, sizeof(lport));
return (&table->inpt_hashtbl[SipHash24_End(&ctx) & table->inpt_mask]);
}
int
in6_pcbaddrisavail(struct inpcb *inp, struct sockaddr_in6 *sin6, int wild,
struct proc *p)
{
struct socket *so = inp->inp_socket;
struct inpcbtable *table = inp->inp_table;
u_short lport = sin6->sin6_port;
int reuseport = (so->so_options & SO_REUSEPORT);
wild |= INPLOOKUP_IPV6;
/* KAME hack: embed scopeid */
if (in6_embedscope(&sin6->sin6_addr, sin6, inp) != 0)
return (EINVAL);
/* this must be cleared for ifa_ifwithaddr() */
sin6->sin6_scope_id = 0;
/* reject IPv4 mapped address, we have no support for it */
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
return (EADDRNOTAVAIL);
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & (SO_REUSEADDR|SO_REUSEPORT))
reuseport = SO_REUSEADDR | SO_REUSEPORT;
} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct ifaddr *ifa = NULL;
sin6->sin6_port = 0; /*
* Yechhhh, because of upcoming
* call to ifa_ifwithaddr(), which
* does bcmp's over the PORTS as
* well. (What about flow?)
*/
sin6->sin6_flowinfo = 0;
if (!(so->so_options & SO_BINDANY) &&
(ifa = ifa_ifwithaddr(sin6tosa(sin6),
inp->inp_rtableid)) == NULL)
return (EADDRNOTAVAIL);
sin6->sin6_port = lport;
/*
* Binding to an anycast address might accidentally
* cause a packet to be sent with an anycast source
* address, so we forbid it.
*
* We should allow binding to a deprecated address,
* since the application dared to use it.
* But can we assume that applications are careful enough
* to check whether the address is deprecated?
* Maybe, as a safeguard, we should have a setsockopt
* flag to control the bind(2) behavior against
* deprecated addresses (default: forbid bind(2)).
*/
if (ifa && ifatoia6(ifa)->ia6_flags & (IN6_IFF_ANYCAST|
IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED|IN6_IFF_DETACHED))
return (EADDRNOTAVAIL);
}
if (lport) {
struct inpcb *t;
int error = 0;
if (so->so_euid && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
t = in_pcblookup_local(table, &sin6->sin6_addr, lport,
INPLOOKUP_WILDCARD | INPLOOKUP_IPV6,
inp->inp_rtableid);
if (t && (so->so_euid != t->inp_socket->so_euid))
error = EADDRINUSE;
in_pcbunref(t);
if (error)
return (error);
}
t = in_pcblookup_local(table, &sin6->sin6_addr, lport,
wild, inp->inp_rtableid);
if (t && (reuseport & t->inp_socket->so_options) == 0)
error = EADDRINUSE;
in_pcbunref(t);
if (error)
return (error);
}
return (0);
}
/*
* Connect from a socket to a specified address.
* Both address and port must be specified in argument sin6.
* Eventually, flow labels will have to be dealt with here, as well.
*
* If we don't have a local address for this socket yet,
* then pick one.
*/
int
in6_pcbconnect(struct inpcb *inp, struct mbuf *nam)
{
struct in6_addr *in6a = NULL;
struct sockaddr_in6 *sin6;
struct inpcb *t;
int error;
struct sockaddr_in6 tmp;
KASSERT(inp->inp_flags & INP_IPV6);
if ((error = in6_nam2sin6(nam, &sin6)))
return (error);
if (sin6->sin6_port == 0)
return (EADDRNOTAVAIL);
/* reject IPv4 mapped address, we have no support for it */
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
return (EADDRNOTAVAIL);
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
/* KAME hack: embed scopeid */
if (in6_embedscope(&sin6->sin6_addr, sin6, inp) != 0)
return EINVAL;
/* this must be cleared for ifa_ifwithaddr() */
sin6->sin6_scope_id = 0;
/* Source address selection. */
/*
* XXX: in6_selectsrc might replace the bound local address
* with the address specified by setsockopt(IPV6_PKTINFO).
* Is it the intended behavior?
*/
error = in6_pcbselsrc(&in6a, sin6, inp, inp->inp_outputopts6);
if (error)
return (error);
inp->inp_ipv6.ip6_hlim = (u_int8_t)in6_selecthlim(inp);
t = in6_pcblookup(inp->inp_table, &sin6->sin6_addr, sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) ? in6a : &inp->inp_laddr6,
inp->inp_lport, inp->inp_rtableid);
if (t != NULL) {
in_pcbunref(t);
return (EADDRINUSE);
}
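/*
 * If the socket has no local address yet, bind a port if needed,
 * re-check that the resulting connection tuple is still unique,
 * and commit the selected source address.
 */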
KASSERT(IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) || inp->inp_lport);
if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) {
if (inp->inp_lport == 0) {
error = in_pcbbind(inp, NULL, curproc);
if (error)
return (error);
t = in6_pcblookup(inp->inp_table, &sin6->sin6_addr,
sin6->sin6_port, in6a, inp->inp_lport,
inp->inp_rtableid);
if (t != NULL) {
inp->inp_lport = 0;
in_pcbunref(t);
return (EADDRINUSE);
}
}
inp->inp_laddr6 = *in6a;
}
inp->inp_faddr6 = sin6->sin6_addr;
inp->inp_fport = sin6->sin6_port;
inp->inp_flowinfo &= ~IPV6_FLOWLABEL_MASK;
if (ip6_auto_flowlabel)
inp->inp_flowinfo |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
#if NSTOEPLITZ > 0
inp->inp_flowid = stoeplitz_ip6port(&inp->inp_faddr6,
&inp->inp_laddr6, inp->inp_fport, inp->inp_lport);
#endif
in_pcbrehash(inp);
return (0);
}
/*
* Get the local address/port, and put it in a sockaddr_in6.
* This services the getsockname(2) call.
*/
void
in6_setsockaddr(struct inpcb *inp, struct mbuf *nam)
{
struct sockaddr_in6 *sin6;
nam->m_len = sizeof(struct sockaddr_in6);
sin6 = mtod(nam,struct sockaddr_in6 *);
bzero ((caddr_t)sin6,sizeof(struct sockaddr_in6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(struct sockaddr_in6);
sin6->sin6_port = inp->inp_lport;
sin6->sin6_addr = inp->inp_laddr6;
/* KAME hack: recover scopeid */
in6_recoverscope(sin6, &inp->inp_laddr6);
}
/*
* Get the foreign address/port, and put it in a sockaddr_in6.
* This services the getpeername(2) call.
*/
void
in6_setpeeraddr(struct inpcb *inp, struct mbuf *nam)
{
struct sockaddr_in6 *sin6;
nam->m_len = sizeof(struct sockaddr_in6);
sin6 = mtod(nam,struct sockaddr_in6 *);
bzero ((caddr_t)sin6,sizeof(struct sockaddr_in6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(struct sockaddr_in6);
sin6->sin6_port = inp->inp_fport;
sin6->sin6_addr = inp->inp_faddr6;
/* KAME hack: recover scopeid */
in6_recoverscope(sin6, &inp->inp_faddr6);
}
int
in6_sockaddr(struct socket *so, struct mbuf *nam)
{
struct inpcb *in6p;
in6p = sotoinpcb(so);
in6_setsockaddr(in6p, nam);
return (0);
}
int
in6_peeraddr(struct socket *so, struct mbuf *nam)
{
struct inpcb *in6p;
in6p = sotoinpcb(so);
in6_setpeeraddr(in6p, nam);
return (0);
}
/*
* Pass some notification to all connections of a protocol
* associated with address dst. The local address and/or port numbers
* may be specified to limit the search. The "usual action" will be
* taken, depending on the ctlinput cmd. The caller must filter any
* cmds that are uninteresting (e.g., no error in the map).
* Call the protocol specific routine (if any) to report
* any errors for each matching socket.
*
* Also perform input-side security policy check
* once PCB to be notified has been located.
*/
void
in6_pcbnotify(struct inpcbtable *table, struct sockaddr_in6 *dst,
uint fport_arg, const struct sockaddr_in6 *src, uint lport_arg,
u_int rtable, int cmd, void *cmdarg, void (*notify)(struct inpcb *, int))
{
SIMPLEQ_HEAD(, inpcb) inpcblist;
struct inpcb *inp;
u_short fport = fport_arg, lport = lport_arg;
struct sockaddr_in6 sa6_src;
int errno;
u_int32_t flowinfo;
u_int rdomain;
if ((unsigned)cmd >= PRC_NCMDS)
return;
if (IN6_IS_ADDR_UNSPECIFIED(&dst->sin6_addr))
return;
if (IN6_IS_ADDR_V4MAPPED(&dst->sin6_addr)) {
#ifdef DIAGNOSTIC
printf("%s: Huh? Thought we never got "
"called with mapped!\n", __func__);
#endif
return;
}
/*
* note that src can be NULL when we get notified by local fragmentation.
*/
sa6_src = (src == NULL) ? sa6_any : *src;
flowinfo = sa6_src.sin6_flowinfo;
/*
* Redirects go to all references to the destination,
* and use in_rtchange to invalidate the route cache.
* Dead host indications: also use in_rtchange to invalidate
* the cache, and deliver the error to all the sockets.
* Otherwise, if we have knowledge of the local port and address,
* deliver only to that socket.
*/
if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) {
fport = 0;
lport = 0;
sa6_src.sin6_addr = in6addr_any;
if (cmd != PRC_HOSTDEAD)
notify = in_rtchange;
}
errno = inet6ctlerrmap[cmd];
if (notify == NULL)
return;
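/*
 * Two passes: collect matching PCBs on a local list while holding the
 * table mutex, taking a reference on each, then deliver the
 * notifications after the mutex has been released.
 */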
SIMPLEQ_INIT(&inpcblist);
rdomain = rtable_l2(rtable);
rw_enter_write(&table->inpt_notify);
mtx_enter(&table->inpt_mtx);
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if ((inp->inp_flags & INP_IPV6) == 0)
continue;
/*
* Under the following condition, notify of redirects
* to the pcb, without making address matches against inpcb.
* - a redirect notification has arrived.
* - the inpcb is unconnected.
* - the inpcb is caching !RTF_HOST routing entry.
* - the ICMPv6 notification is from the gateway cached in the
* inpcb. i.e. ICMPv6 notification is from nexthop gateway
* the inpcb used very recently.
*
* This is to improve interaction between netbsd/openbsd
* redirect handling code, and inpcb route cache code.
* without the clause, !RTF_HOST routing entry (which carries
* gateway used by inpcb right before the ICMPv6 redirect)
* will be cached forever in unconnected inpcb.
*
* There still is a question as to what is the right thing to do (TRT):
* - On bsdi/freebsd, RTF_HOST (cloned) routing entry will be
* generated on packet output. inpcb will always cache
* RTF_HOST routing entry so there's no need for the clause
* (ICMPv6 redirect will update RTF_HOST routing entry,
* and inpcb is caching it already).
* However, bsdi/freebsd are vulnerable to local DoS attacks
* due to the cloned routing entries.
* - Specwise, "destination cache" is mentioned in RFC2461.
* Jinmei says that it implies bsdi/freebsd behavior, itojun
* is not really convinced.
* - Having hiwat/lowat on # of cloned host route (redirect/
* pmtud) may be a good idea. netbsd/openbsd has it. see
* icmp6_mtudisc_update().
*/
if ((PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) &&
IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) &&
inp->inp_route.ro_rt &&
!(inp->inp_route.ro_rt->rt_flags & RTF_HOST)) {
struct sockaddr_in6 *dst6;
dst6 = satosin6(&inp->inp_route.ro_dst);
if (IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr,
&dst->sin6_addr))
goto do_notify;
}
/*
* Detect if we should notify the error. If no source and
* destination ports are specified, but non-zero flowinfo and
* local address match, notify the error. This is the case
* when the error is delivered with an encrypted buffer
* by ESP. Otherwise, just compare addresses and ports
* as usual.
*/
if (lport == 0 && fport == 0 && flowinfo &&
inp->inp_socket != NULL &&
flowinfo == (inp->inp_flowinfo & IPV6_FLOWLABEL_MASK) &&
IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, &sa6_src.sin6_addr))
goto do_notify;
else if (!IN6_ARE_ADDR_EQUAL(&inp->inp_faddr6,
&dst->sin6_addr) ||
rtable_l2(inp->inp_rtableid) != rdomain ||
inp->inp_socket == NULL ||
(lport && inp->inp_lport != lport) ||
(!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
!IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6,
&sa6_src.sin6_addr)) ||
(fport && inp->inp_fport != fport)) {
continue;
}
do_notify:
in_pcbref(inp);
SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
}
mtx_leave(&table->inpt_mtx);
while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
(*notify)(inp, errno);
in_pcbunref(inp);
}
rw_exit_write(&table->inpt_notify);
}
struct inpcb *
in6_pcbhash_lookup(struct inpcbtable *table, u_int rdomain,
const struct in6_addr *faddr, u_short fport,
const struct in6_addr *laddr, u_short lport)
{
struct inpcbhead *head;
struct inpcb *inp;
NET_ASSERT_LOCKED();
MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
head = in6_pcbhash(table, rdomain, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (!ISSET(inp->inp_flags, INP_IPV6))
continue;
if (inp->inp_fport == fport && inp->inp_lport == lport &&
IN6_ARE_ADDR_EQUAL(&inp->inp_faddr6, faddr) &&
IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr) &&
rtable_l2(inp->inp_rtableid) == rdomain) {
break;
}
}
if (inp != NULL) {
/*
* Move this PCB to the head of hash chain so that
* repeated accesses are quicker. This is analogous to
* the historic single-entry PCB cache.
*/
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
}
return (inp);
}
struct inpcb *
in6_pcblookup(struct inpcbtable *table, const struct in6_addr *faddr,
u_int fport, const struct in6_addr *laddr, u_int lport, u_int rtable)
{
struct inpcb *inp;
u_int rdomain;
rdomain = rtable_l2(rtable);
mtx_enter(&table->inpt_mtx);
inp = in6_pcbhash_lookup(table, rdomain, faddr, fport, laddr, lport);
in_pcbref(inp);
mtx_leave(&table->inpt_mtx);
#ifdef DIAGNOSTIC
if (inp == NULL && in_pcbnotifymiss) {
printf("%s: faddr= fport=%d laddr= lport=%d rdom=%u\n",
__func__, ntohs(fport), ntohs(lport), rdomain);
}
#endif
return (inp);
}
struct inpcb *
in6_pcblookup_listen(struct inpcbtable *table, struct in6_addr *laddr,
u_int lport, struct mbuf *m, u_int rtable)
{
const struct in6_addr *key1, *key2;
struct inpcb *inp;
u_int rdomain;
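/*
 * key1 is tried first and key2 second: normally the specific local
 * address, then the wildcard. PF divert and localhost translation
 * below may override this choice.
 */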
key1 = laddr;
key2 = &zeroin6_addr;
#if NPF > 0
if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
struct pf_divert *divert;
divert = pf_find_divert(m);
KASSERT(divert != NULL);
switch (divert->type) {
case PF_DIVERT_TO:
key1 = key2 = &divert->addr.v6;
lport = divert->port;
break;
case PF_DIVERT_REPLY:
return (NULL);
default:
panic("%s: unknown divert type %d, mbuf %p, divert %p",
__func__, divert->type, m, divert);
}
} else if (m && m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST) {
/*
* Redirected connections should not be treated the same
* as connections directed to ::1 since localhost
* can only be accessed from the host itself.
*/
key1 = &zeroin6_addr;
key2 = laddr;
}
#endif
rdomain = rtable_l2(rtable);
mtx_enter(&table->inpt_mtx);
inp = in6_pcbhash_lookup(table, rdomain, &zeroin6_addr, 0, key1, lport);
if (inp == NULL && !IN6_ARE_ADDR_EQUAL(key1, key2)) {
inp = in6_pcbhash_lookup(table, rdomain,
&zeroin6_addr, 0, key2, lport);
}
in_pcbref(inp);
mtx_leave(&table->inpt_mtx);
#ifdef DIAGNOSTIC
if (inp == NULL && in_pcbnotifymiss) {
printf("%s: laddr= lport=%d rdom=%u\n",
__func__, ntohs(lport), rdomain);
}
#endif
return (inp);
}
/* $OpenBSD: uvm_aobj.c,v 1.107 2022/08/29 02:58:13 jsg Exp $ */
/* $NetBSD: uvm_aobj.c,v 1.39 2001/02/18 21:19:08 chs Exp $ */
/*
* Copyright (c) 1998 Chuck Silvers, Charles D. Cranor and
* Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_aobj.c,v 1.1.2.5 1998/02/06 05:14:38 chs Exp
*/
/*
* uvm_aobj.c: anonymous memory uvm_object pager
*
* author: Chuck Silvers <chuq@chuq.com>
* started: Jan-1998
*
* - design mostly from Chuck Cranor
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/stdint.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
/*
* An anonymous UVM object (aobj) manages anonymous-memory. In addition to
* keeping the list of resident pages, it may also keep a list of allocated
* swap blocks. Depending on the size of the object, this list is either
* stored in an array (small objects) or in a hash table (large objects).
*/
/*
* Note: for hash tables, we break the address space of the aobj into blocks
* of UAO_SWHASH_CLUSTER_SIZE pages, which shall be a power of two.
*/
#define UAO_SWHASH_CLUSTER_SHIFT 4
#define UAO_SWHASH_CLUSTER_SIZE (1 << UAO_SWHASH_CLUSTER_SHIFT)
/* Get the "tag" for this page index. */
#define UAO_SWHASH_ELT_TAG(idx) ((idx) >> UAO_SWHASH_CLUSTER_SHIFT)
#define UAO_SWHASH_ELT_PAGESLOT_IDX(idx) \
((idx) & (UAO_SWHASH_CLUSTER_SIZE - 1))
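/*
 * For example, with UAO_SWHASH_CLUSTER_SHIFT == 4, page index 37 falls
 * into the cluster with tag 2 (37 >> 4) at page-slot index 5 (37 & 15).
 */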
/* Given an ELT and a page index, find the swap slot. */
#define UAO_SWHASH_ELT_PAGESLOT(elt, idx) \
((elt)->slots[UAO_SWHASH_ELT_PAGESLOT_IDX(idx)])
/* Given an ELT, return its pageidx base. */
#define UAO_SWHASH_ELT_PAGEIDX_BASE(elt) \
((elt)->tag << UAO_SWHASH_CLUSTER_SHIFT)
/* The hash function. */
#define UAO_SWHASH_HASH(aobj, idx) \
(&(aobj)->u_swhash[(((idx) >> UAO_SWHASH_CLUSTER_SHIFT) \
& (aobj)->u_swhashmask)])
/*
* The threshold which determines whether we will use an array or a
* hash table to store the list of allocated swap blocks.
*/
#define UAO_SWHASH_THRESHOLD (UAO_SWHASH_CLUSTER_SIZE * 4)
#define UAO_USES_SWHASH(aobj) \
((aobj)->u_pages > UAO_SWHASH_THRESHOLD)
/* The number of buckets in a hash, with an upper bound. */
#define UAO_SWHASH_MAXBUCKETS 256
#define UAO_SWHASH_BUCKETS(pages) \
(min((pages) >> UAO_SWHASH_CLUSTER_SHIFT, UAO_SWHASH_MAXBUCKETS))
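/*
 * For example, a 1024-page aobj gets min(1024 >> 4, 256) = 64 buckets;
 * objects of 4096 pages or more are capped at 256 buckets.
 */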
/*
* uao_swhash_elt: when a hash table is being used, this structure defines
* the format of an entry in the bucket list.
*/
struct uao_swhash_elt {
LIST_ENTRY(uao_swhash_elt) list; /* the hash list */
voff_t tag; /* our 'tag' */
int count; /* our number of active slots */
int slots[UAO_SWHASH_CLUSTER_SIZE]; /* the slots */
};
/*
* uao_swhash: the swap hash table structure
*/
LIST_HEAD(uao_swhash, uao_swhash_elt);
/*
* uao_swhash_elt_pool: pool of uao_swhash_elt structures
*/
struct pool uao_swhash_elt_pool;
/*
* uvm_aobj: the actual anon-backed uvm_object
*
* => the uvm_object is at the top of the structure, this allows
* (struct uvm_aobj *) == (struct uvm_object *)
* => only one of u_swslots and u_swhash is used in any given aobj
*/
struct uvm_aobj {
struct uvm_object u_obj; /* has: pgops, memt, #pages, #refs */
int u_pages; /* number of pages in entire object */
int u_flags; /* the flags (see uvm_aobj.h) */
/*
* Either an array or hashtable (array of bucket heads) of
* offset -> swapslot mappings for the aobj.
*/
#define u_swslots u_swap.slot_array
#define u_swhash u_swap.slot_hash
union swslots {
int *slot_array;
struct uao_swhash *slot_hash;
} u_swap;
u_long u_swhashmask; /* mask for hashtable */
LIST_ENTRY(uvm_aobj) u_list; /* global list of aobjs */
};
struct pool uvm_aobj_pool;
static struct uao_swhash_elt *uao_find_swhash_elt(struct uvm_aobj *, int,
boolean_t);
static int uao_find_swslot(struct uvm_object *, int);
static boolean_t uao_flush(struct uvm_object *, voff_t,
voff_t, int);
static void uao_free(struct uvm_aobj *);
static int uao_get(struct uvm_object *, voff_t,
vm_page_t *, int *, int, vm_prot_t,
int, int);
static boolean_t uao_pagein(struct uvm_aobj *, int, int);
static boolean_t uao_pagein_page(struct uvm_aobj *, int);
void uao_dropswap_range(struct uvm_object *, voff_t, voff_t);
void uao_shrink_flush(struct uvm_object *, int, int);
int uao_shrink_hash(struct uvm_object *, int);
int uao_shrink_array(struct uvm_object *, int);
int uao_shrink_convert(struct uvm_object *, int);
int uao_grow_hash(struct uvm_object *, int);
int uao_grow_array(struct uvm_object *, int);
int uao_grow_convert(struct uvm_object *, int);
/*
* aobj_pager
*
* note that some functions (e.g. put) are handled elsewhere
*/
const struct uvm_pagerops aobj_pager = {
.pgo_reference = uao_reference,
.pgo_detach = uao_detach,
.pgo_flush = uao_flush,
.pgo_get = uao_get,
};
/*
* uao_list: global list of active aobjs, locked by uao_list_lock
*
* Lock ordering: generally the locking order is object lock, then list lock.
* in the case of swap off we have to iterate over the list, and thus the
* ordering is reversed. In that case we must use trylocking to prevent
* deadlock.
*/
static LIST_HEAD(aobjlist, uvm_aobj) uao_list = LIST_HEAD_INITIALIZER(uao_list);
static struct mutex uao_list_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
/*
* functions
*/
/*
* hash table/array related functions
*/
/*
* uao_find_swhash_elt: find (or create) a hash table entry for a page
* offset.
*/
static struct uao_swhash_elt *
uao_find_swhash_elt(struct uvm_aobj *aobj, int pageidx, boolean_t create)
{
struct uao_swhash *swhash;
struct uao_swhash_elt *elt;
voff_t page_tag;
swhash = UAO_SWHASH_HASH(aobj, pageidx); /* first hash to get bucket */
page_tag = UAO_SWHASH_ELT_TAG(pageidx); /* tag to search for */
/*
* now search the bucket for the requested tag
*/
LIST_FOREACH(elt, swhash, list) {
if (elt->tag == page_tag)
return elt;
}
if (!create)
return NULL;
/*
* allocate a new entry for the bucket and init/insert it in
*/
elt = pool_get(&uao_swhash_elt_pool, PR_NOWAIT | PR_ZERO);
/*
* XXX We cannot sleep here as the hash table might disappear
* from under our feet. And we run the risk of deadlocking
* the pagedaemon. In fact this code will only be called by
* the pagedaemon and allocation will only fail if we
* exhausted the pagedaemon reserve. In that case we're
* doomed anyway, so panic.
*/
if (elt == NULL)
panic("%s: can't allocate entry", __func__);
LIST_INSERT_HEAD(swhash, elt, list);
elt->tag = page_tag;
return elt;
}
/*
* uao_find_swslot: find the swap slot number for an aobj/pageidx
*/
static inline int
uao_find_swslot(struct uvm_object *uobj, int pageidx)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/*
* if noswap flag is set, then we never return a slot
*/
if (aobj->u_flags & UAO_FLAG_NOSWAP)
return 0;
/*
* if hashing, look in hash table.
*/
if (UAO_USES_SWHASH(aobj)) {
struct uao_swhash_elt *elt =
uao_find_swhash_elt(aobj, pageidx, FALSE);
if (elt)
return UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
else
return 0;
}
/*
* otherwise, look in the array
*/
return aobj->u_swslots[pageidx];
}
/*
* uao_set_swslot: set the swap slot for a page in an aobj.
*
* => setting a slot to zero frees the slot
* => object must be locked by caller
* => we return the old slot number, or -1 if we failed to allocate
* memory to record the new slot number
*/
int
uao_set_swslot(struct uvm_object *uobj, int pageidx, int slot)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int oldslot;
KASSERT(rw_write_held(uobj->vmobjlock) || uobj->uo_refs == 0);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/*
* if noswap flag is set, then we can't set a slot
*/
if (aobj->u_flags & UAO_FLAG_NOSWAP) {
if (slot == 0)
return 0; /* a clear is ok */
/* but a set is not */
printf("uao_set_swslot: uobj = %p\n", uobj);
panic("uao_set_swslot: attempt to set a slot on a NOSWAP object");
}
/*
* are we using a hash table? if so, add it in the hash.
*/
if (UAO_USES_SWHASH(aobj)) {
/*
* Avoid allocating an entry just to free it again if
* the page had no swap slot in the first place, and
* we are freeing.
*/
struct uao_swhash_elt *elt =
uao_find_swhash_elt(aobj, pageidx, slot ? TRUE : FALSE);
if (elt == NULL) {
KASSERT(slot == 0);
return 0;
}
oldslot = UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) = slot;
/*
* now adjust the elt's reference counter and free it if we've
* dropped it to zero.
*/
if (slot) {
if (oldslot == 0)
elt->count++;
} else {
if (oldslot)
elt->count--;
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
} else {
/* we are using an array */
oldslot = aobj->u_swslots[pageidx];
aobj->u_swslots[pageidx] = slot;
}
return oldslot;
}
/*
* end of hash/array functions
*/
/*
* uao_free: free all resources held by an aobj, and then free the aobj
*
* => the aobj should be dead
*/
static void
uao_free(struct uvm_aobj *aobj)
{
struct uvm_object *uobj = &aobj->u_obj;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
uao_dropswap_range(uobj, 0, 0);
rw_exit(uobj->vmobjlock);
if (UAO_USES_SWHASH(aobj)) {
/*
* free the hash table itself.
*/
hashfree(aobj->u_swhash, UAO_SWHASH_BUCKETS(aobj->u_pages), M_UVMAOBJ);
} else {
free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
}
/*
* finally free the aobj itself
*/
uvm_obj_destroy(uobj);
pool_put(&uvm_aobj_pool, aobj);
}
/*
* pager functions
*/
#ifdef TMPFS
/*
* Shrink an aobj to a given number of pages. The procedure is always the same:
* assess the necessity of data structure conversion (hash to array), secure
* resources, flush pages and drop swap slots.
*
*/
void
uao_shrink_flush(struct uvm_object *uobj, int startpg, int endpg)
{
KASSERT(startpg < endpg);
KASSERT(uobj->uo_refs == 1);
uao_flush(uobj, (voff_t)startpg << PAGE_SHIFT,
(voff_t)endpg << PAGE_SHIFT, PGO_FREE);
uao_dropswap_range(uobj, startpg, endpg);
}
int
uao_shrink_hash(struct uvm_object *uobj, int pages)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash *new_swhash;
struct uao_swhash_elt *elt;
unsigned long new_hashmask;
int i;
KASSERT(UAO_USES_SWHASH(aobj));
/*
* If the size of the hash table doesn't change, all we need to do is
* to adjust the page count.
*/
if (UAO_SWHASH_BUCKETS(aobj->u_pages) == UAO_SWHASH_BUCKETS(pages)) {
uao_shrink_flush(uobj, pages, aobj->u_pages);
aobj->u_pages = pages;
return 0;
}
new_swhash = hashinit(UAO_SWHASH_BUCKETS(pages), M_UVMAOBJ,
M_WAITOK | M_CANFAIL, &new_hashmask);
if (new_swhash == NULL)
return ENOMEM;
uao_shrink_flush(uobj, pages, aobj->u_pages);
/*
* Even though the hash table size is changing, the hash of the buckets
* we are interested in copying should not change.
*/
for (i = 0; i < UAO_SWHASH_BUCKETS(aobj->u_pages); i++) {
while (LIST_EMPTY(&aobj->u_swhash[i]) == 0) {
elt = LIST_FIRST(&aobj->u_swhash[i]);
LIST_REMOVE(elt, list);
LIST_INSERT_HEAD(&new_swhash[i], elt, list);
}
}
hashfree(aobj->u_swhash, UAO_SWHASH_BUCKETS(aobj->u_pages), M_UVMAOBJ);
aobj->u_swhash = new_swhash;
aobj->u_pages = pages;
aobj->u_swhashmask = new_hashmask;
return 0;
}
int
uao_shrink_convert(struct uvm_object *uobj, int pages)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash_elt *elt;
int i, *new_swslots;
new_swslots = mallocarray(pages, sizeof(int), M_UVMAOBJ,
M_WAITOK | M_CANFAIL | M_ZERO);
if (new_swslots == NULL)
return ENOMEM;
uao_shrink_flush(uobj, pages, aobj->u_pages);
/* Convert swap slots from hash to array. */
for (i = 0; i < pages; i++) {
elt = uao_find_swhash_elt(aobj, i, FALSE);
if (elt != NULL) {
new_swslots[i] = UAO_SWHASH_ELT_PAGESLOT(elt, i);
if (new_swslots[i] != 0)
elt->count--;
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
}
hashfree(aobj->u_swhash, UAO_SWHASH_BUCKETS(aobj->u_pages), M_UVMAOBJ);
aobj->u_swslots = new_swslots;
aobj->u_pages = pages;
return 0;
}
int
uao_shrink_array(struct uvm_object *uobj, int pages)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int i, *new_swslots;
new_swslots = mallocarray(pages, sizeof(int), M_UVMAOBJ,
M_WAITOK | M_CANFAIL | M_ZERO);
if (new_swslots == NULL)
return ENOMEM;
uao_shrink_flush(uobj, pages, aobj->u_pages);
for (i = 0; i < pages; i++)
new_swslots[i] = aobj->u_swslots[i];
free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
aobj->u_swslots = new_swslots;
aobj->u_pages = pages;
return 0;
}
int
uao_shrink(struct uvm_object *uobj, int pages)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
KASSERT(pages < aobj->u_pages);
/*
* Distinguish between three possible cases:
* 1. aobj uses hash and must be converted to array.
* 2. aobj uses array and array size needs to be adjusted.
* 3. aobj uses hash and hash size needs to be adjusted.
*/
if (pages > UAO_SWHASH_THRESHOLD)
return uao_shrink_hash(uobj, pages); /* case 3 */
else if (aobj->u_pages > UAO_SWHASH_THRESHOLD)
return uao_shrink_convert(uobj, pages); /* case 1 */
else
return uao_shrink_array(uobj, pages); /* case 2 */
}
/*
* Grow an aobj to a given number of pages. Right now we only adjust the swap
* slots. We could additionally handle page allocation directly, so that they
* don't happen through uvm_fault(). That would allow us to use another
* mechanism for the swap slots other than malloc(). It is thus mandatory that
* the caller of these functions does not allow faults to happen in case of
* growth error.
*/
int
uao_grow_array(struct uvm_object *uobj, int pages)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int i, *new_swslots;
KASSERT(aobj->u_pages <= UAO_SWHASH_THRESHOLD);
new_swslots = mallocarray(pages, sizeof(int), M_UVMAOBJ,
M_WAITOK | M_CANFAIL | M_ZERO);
if (new_swslots == NULL)
return ENOMEM;
for (i = 0; i < aobj->u_pages; i++)
new_swslots[i] = aobj->u_swslots[i];
free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
aobj->u_swslots = new_swslots;
aobj->u_pages = pages;
return 0;
}
int
uao_grow_hash(struct uvm_object *uobj, int pages)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash *new_swhash;
struct uao_swhash_elt *elt;
unsigned long new_hashmask;
int i;
KASSERT(pages > UAO_SWHASH_THRESHOLD);
/*
* If the size of the hash table doesn't change, all we need to do is
* to adjust the page count.
*/
if (UAO_SWHASH_BUCKETS(aobj->u_pages) == UAO_SWHASH_BUCKETS(pages)) {
aobj->u_pages = pages;
return 0;
}
KASSERT(UAO_SWHASH_BUCKETS(aobj->u_pages) < UAO_SWHASH_BUCKETS(pages));
new_swhash = hashinit(UAO_SWHASH_BUCKETS(pages), M_UVMAOBJ,
M_WAITOK | M_CANFAIL, &new_hashmask);
if (new_swhash == NULL)
return ENOMEM;
for (i = 0; i < UAO_SWHASH_BUCKETS(aobj->u_pages); i++) {
while (LIST_EMPTY(&aobj->u_swhash[i]) == 0) {
elt = LIST_FIRST(&aobj->u_swhash[i]);
LIST_REMOVE(elt, list);
LIST_INSERT_HEAD(&new_swhash[i], elt, list);
}
}
hashfree(aobj->u_swhash, UAO_SWHASH_BUCKETS(aobj->u_pages), M_UVMAOBJ);
aobj->u_swhash = new_swhash;
aobj->u_pages = pages;
aobj->u_swhashmask = new_hashmask;
return 0;
}
int
uao_grow_convert(struct uvm_object *uobj, int pages)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash *new_swhash;
struct uao_swhash_elt *elt;
unsigned long new_hashmask;
int i, *old_swslots;
new_swhash = hashinit(UAO_SWHASH_BUCKETS(pages), M_UVMAOBJ,
M_WAITOK | M_CANFAIL, &new_hashmask);
if (new_swhash == NULL)
return ENOMEM;
/* Set these now, so we can use uao_find_swhash_elt(). */
old_swslots = aobj->u_swslots;
aobj->u_swhash = new_swhash;
aobj->u_swhashmask = new_hashmask;
for (i = 0; i < aobj->u_pages; i++) {
if (old_swslots[i] != 0) {
elt = uao_find_swhash_elt(aobj, i, TRUE);
elt->count++;
UAO_SWHASH_ELT_PAGESLOT(elt, i) = old_swslots[i];
}
}
free(old_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
aobj->u_pages = pages;
return 0;
}
int
uao_grow(struct uvm_object *uobj, int pages)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
KASSERT(pages > aobj->u_pages);
/*
* Distinguish between three possible cases:
* 1. aobj uses hash and hash size needs to be adjusted.
* 2. aobj uses array and array size needs to be adjusted.
* 3. aobj uses array and must be converted to hash.
*/
if (pages <= UAO_SWHASH_THRESHOLD)
return uao_grow_array(uobj, pages); /* case 2 */
else if (aobj->u_pages > UAO_SWHASH_THRESHOLD)
return uao_grow_hash(uobj, pages); /* case 1 */
else
return uao_grow_convert(uobj, pages); /* case 3 */
}
#endif /* TMPFS */
/*
* uao_create: create an aobj of the given size and return its uvm_object.
*
* => for normal use, flags are zero or UAO_FLAG_CANFAIL.
* => for the kernel object, the flags are:
* UAO_FLAG_KERNOBJ - allocate the kernel object (can only happen once)
* UAO_FLAG_KERNSWAP - enable swapping of kernel object (" ")
*/
struct uvm_object *
uao_create(vsize_t size, int flags)
{
static struct uvm_aobj kernel_object_store;
static struct rwlock bootstrap_kernel_object_lock;
static int kobj_alloced = 0;
int pages = round_page(size) >> PAGE_SHIFT;
struct uvm_aobj *aobj;
int refs;
/*
* Allocate a new aobj, unless kernel object is requested.
*/
if (flags & UAO_FLAG_KERNOBJ) {
KASSERT(!kobj_alloced);
aobj = &kernel_object_store;
aobj->u_pages = pages;
aobj->u_flags = UAO_FLAG_NOSWAP;
refs = UVM_OBJ_KERN;
kobj_alloced = UAO_FLAG_KERNOBJ;
} else if (flags & UAO_FLAG_KERNSWAP) {
KASSERT(kobj_alloced == UAO_FLAG_KERNOBJ);
aobj = &kernel_object_store;
kobj_alloced = UAO_FLAG_KERNSWAP;
} else {
aobj = pool_get(&uvm_aobj_pool, PR_WAITOK);
aobj->u_pages = pages;
aobj->u_flags = 0;
refs = 1;
}
/*
* allocate hash/array if necessary
*/
if (flags == 0 || (flags & (UAO_FLAG_KERNSWAP | UAO_FLAG_CANFAIL))) {
int mflags;
if (flags)
mflags = M_NOWAIT;
else
mflags = M_WAITOK;
/* allocate hash table or array depending on object size */
if (UAO_USES_SWHASH(aobj)) {
aobj->u_swhash = hashinit(UAO_SWHASH_BUCKETS(pages),
M_UVMAOBJ, mflags, &aobj->u_swhashmask);
if (aobj->u_swhash == NULL) {
if (flags & UAO_FLAG_CANFAIL) {
pool_put(&uvm_aobj_pool, aobj);
return NULL;
}
panic("uao_create: hashinit swhash failed");
}
} else {
aobj->u_swslots = mallocarray(pages, sizeof(int),
M_UVMAOBJ, mflags|M_ZERO);
if (aobj->u_swslots == NULL) {
if (flags & UAO_FLAG_CANFAIL) {
pool_put(&uvm_aobj_pool, aobj);
return NULL;
}
panic("uao_create: malloc swslots failed");
}
}
if (flags & UAO_FLAG_KERNSWAP) {
aobj->u_flags &= ~UAO_FLAG_NOSWAP; /* clear noswap */
return &aobj->u_obj; /* done! */
}
}
/*
* Initialise UVM object.
*/
uvm_obj_init(&aobj->u_obj, &aobj_pager, refs);
if (flags & UAO_FLAG_KERNOBJ) {
/* Use a temporary static lock for kernel_object. */
rw_init(&bootstrap_kernel_object_lock, "kobjlk");
uvm_obj_setlock(&aobj->u_obj, &bootstrap_kernel_object_lock);
}
/*
* now that aobj is ready, add it to the global list
*/
mtx_enter(&uao_list_lock);
LIST_INSERT_HEAD(&uao_list, aobj, u_list);
mtx_leave(&uao_list_lock);
return &aobj->u_obj;
}
/*
* uao_init: set up aobj pager subsystem
*
* => called at boot time from uvm_pager_init()
*/
void
uao_init(void)
{
/*
* NOTE: Pages for this pool must not come from a pageable
* kernel map!
*/
pool_init(&uao_swhash_elt_pool, sizeof(struct uao_swhash_elt), 0,
IPL_NONE, PR_WAITOK, "uaoeltpl", NULL);
pool_init(&uvm_aobj_pool, sizeof(struct uvm_aobj), 0,
IPL_NONE, PR_WAITOK, "aobjpl", NULL);
}
/*
* uao_reference: hold a reference to an anonymous UVM object.
*/
void
uao_reference(struct uvm_object *uobj)
{
/* Kernel object is persistent. */
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
return;
atomic_inc_int(&uobj->uo_refs);
}
/*
* uao_detach: drop a reference to an anonymous UVM object.
*/
void
uao_detach(struct uvm_object *uobj)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct vm_page *pg;
/*
* Detaching from kernel_object is a NOP.
*/
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
return;
/*
* Drop the reference. If it was the last one, destroy the object.
*/
if (atomic_dec_int_nv(&uobj->uo_refs) > 0) {
return;
}
/*
* Remove the aobj from the global list.
*/
mtx_enter(&uao_list_lock);
LIST_REMOVE(aobj, u_list);
mtx_leave(&uao_list_lock);
/*
* Free all the pages left in the aobj. For each page, when the
* page is no longer busy (and thus after any disk I/O that it is
* involved in is complete), release any swap resources and free
* the page itself.
*/
rw_enter(uobj->vmobjlock, RW_WRITE);
while ((pg = RBT_ROOT(uvm_objtree, &uobj->memt)) != NULL) {
pmap_page_protect(pg, PROT_NONE);
if (pg->pg_flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "uao_det");
rw_enter(uobj->vmobjlock, RW_WRITE);
continue;
}
uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
uvm_lock_pageq();
uvm_pagefree(pg);
uvm_unlock_pageq();
}
/*
* Finally, free the anonymous UVM object itself.
*/
uao_free(aobj);
}
/*
* uao_flush: flush pages out of a uvm object
*
* => if PGO_CLEANIT is not set, then we will not block.
* => if PGO_ALLPAGE is set, then all pages in the object are valid targets
* for flushing.
* => NOTE: we are allowed to lock the page queues, so the caller
* must not be holding the lock on them [e.g. pagedaemon had
* better not call us with the queues locked]
* => we return TRUE unless we encountered some sort of I/O error
* XXXJRT currently never happens, as we never directly initiate
* XXXJRT I/O
*/
boolean_t
uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *) uobj;
struct vm_page *pg;
voff_t curoff;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
if (flags & PGO_ALLPAGES) {
start = 0;
stop = (voff_t)aobj->u_pages << PAGE_SHIFT;
} else {
start = trunc_page(start);
stop = round_page(stop);
if (stop > ((voff_t)aobj->u_pages << PAGE_SHIFT)) {
printf("uao_flush: strange, got an out of range "
"flush (fixed)\n");
stop = (voff_t)aobj->u_pages << PAGE_SHIFT;
}
}
/*
* Don't need to do any work here if we're not freeing
* or deactivating pages.
*/
if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) {
return TRUE;
}
curoff = start;
for (;;) {
if (curoff < stop) {
pg = uvm_pagelookup(uobj, curoff);
curoff += PAGE_SIZE;
if (pg == NULL)
continue;
} else {
break;
}
/* Make sure page is unbusy, else wait for it. */
if (pg->pg_flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "uaoflsh");
rw_enter(uobj->vmobjlock, RW_WRITE);
curoff -= PAGE_SIZE;
continue;
}
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
/*
* XXX In these first 3 cases, we always just
* XXX deactivate the page. We may want to
* XXX handle the different cases more specifically
* XXX in the future.
*/
case PGO_CLEANIT|PGO_FREE:
/* FALLTHROUGH */
case PGO_CLEANIT|PGO_DEACTIVATE:
/* FALLTHROUGH */
case PGO_DEACTIVATE:
deactivate_it:
if (pg->wire_count != 0)
continue;
uvm_lock_pageq();
pmap_page_protect(pg, PROT_NONE);
uvm_pagedeactivate(pg);
uvm_unlock_pageq();
continue;
case PGO_FREE:
/*
* If there are multiple references to
* the object, just deactivate the page.
*/
if (uobj->uo_refs > 1)
goto deactivate_it;
/* XXX skip the page if it's wired */
if (pg->wire_count != 0)
continue;
/*
* free the swap slot and the page.
*/
pmap_page_protect(pg, PROT_NONE);
/*
* freeing swapslot here is not strictly necessary.
* however, leaving it here doesn't save much
* because we need to update swap accounting anyway.
*/
uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
uvm_lock_pageq();
uvm_pagefree(pg);
uvm_unlock_pageq();
continue;
default:
panic("uao_flush: weird flags");
}
}
return TRUE;
}
/*
* uao_get: fetch me a page
*
* we have three cases:
* 1: page is resident -> just return the page.
* 2: page is zero-fill -> allocate a new page and zero it.
* 3: page is swapped out -> fetch the page from swap.
*
* case 1 can be handled with PGO_LOCKED, cases 2 and 3 cannot.
* so, if the "center" page hits case 3 (or any page, with PGO_ALLPAGES),
* then we will need to return VM_PAGER_UNLOCK.
*
* => flags: PGO_ALLPAGES: get all of the pages
* PGO_LOCKED: fault data structures are locked
* => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
* => NOTE: caller must check for released pages!!
*/
static int
uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
voff_t current_offset;
vm_page_t ptmp;
int lcv, gotpages, maxpages, swslot, rv, pageidx;
boolean_t done;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
/*
* get number of pages
*/
maxpages = *npagesp;
if (flags & PGO_LOCKED) {
/*
* step 1a: get pages that are already resident. only do
* this if the data structures are locked (i.e. the first
* time through).
*/
done = TRUE; /* be optimistic */
gotpages = 0; /* # of pages we got so far */
for (lcv = 0, current_offset = offset ; lcv < maxpages ;
lcv++, current_offset += PAGE_SIZE) {
/* do we care about this page? if not, skip it */
if (pps[lcv] == PGO_DONTCARE)
continue;
ptmp = uvm_pagelookup(uobj, current_offset);
/*
* if page is new, attempt to allocate the page,
* zero-fill'd.
*/
if (ptmp == NULL && uao_find_swslot(uobj,
current_offset >> PAGE_SHIFT) == 0) {
ptmp = uvm_pagealloc(uobj, current_offset,
NULL, UVM_PGA_ZERO);
if (ptmp) {
/* new page */
atomic_clearbits_int(&ptmp->pg_flags,
PG_BUSY|PG_FAKE);
atomic_setbits_int(&ptmp->pg_flags,
PQ_AOBJ);
UVM_PAGE_OWN(ptmp, NULL);
}
}
/*
* to be useful must get a non-busy page
*/
if (ptmp == NULL ||
(ptmp->pg_flags & PG_BUSY) != 0) {
if (lcv == centeridx ||
(flags & PGO_ALLPAGES) != 0)
/* need to do a wait or I/O! */
done = FALSE;
continue;
}
/*
* useful page: plug it in our result array
*/
atomic_setbits_int(&ptmp->pg_flags, PG_BUSY);
UVM_PAGE_OWN(ptmp, "uao_get1");
pps[lcv] = ptmp;
gotpages++;
}
/*
* step 1b: now we've either done everything needed or we need
* to unlock and do some waiting or I/O.
*/
*npagesp = gotpages;
if (done)
/* bingo! */
return VM_PAGER_OK;
else
/* EEK! Need to unlock and I/O */
return VM_PAGER_UNLOCK;
}
/*
* step 2: get non-resident or busy pages.
* data structures are unlocked.
*/
for (lcv = 0, current_offset = offset ; lcv < maxpages ;
lcv++, current_offset += PAGE_SIZE) {
/*
* - skip over pages we've already gotten or don't want
* - skip over pages we don't _have_ to get
*/
if (pps[lcv] != NULL ||
(lcv != centeridx && (flags & PGO_ALLPAGES) == 0))
continue;
pageidx = current_offset >> PAGE_SHIFT;
/*
* we have yet to locate the current page (pps[lcv]). we
* first look for a page that is already at the current offset.
* if we find a page, we check to see if it is busy or
* released. if that is the case, then we sleep on the page
* until it is no longer busy or released and repeat the lookup.
* if the page we found is neither busy nor released, then we
* busy it (so we own it) and plug it into pps[lcv]. this
* 'break's the following while loop and indicates we are
* ready to move on to the next page in the "lcv" loop above.
*
* if we exit the while loop with pps[lcv] still set to NULL,
* then it means that we allocated a new busy/fake/clean page
* ptmp in the object and we need to do I/O to fill in the data.
*/
/* top of "pps" while loop */
while (pps[lcv] == NULL) {
/* look for a resident page */
ptmp = uvm_pagelookup(uobj, current_offset);
/* not resident? allocate one now (if we can) */
if (ptmp == NULL) {
ptmp = uvm_pagealloc(uobj, current_offset,
NULL, 0);
/* out of RAM? */
if (ptmp == NULL) {
rw_exit(uobj->vmobjlock);
uvm_wait("uao_getpage");
rw_enter(uobj->vmobjlock, RW_WRITE);
/* goto top of pps while loop */
continue;
}
/*
* safe with PQ's unlocked: because we just
* alloc'd the page
*/
atomic_setbits_int(&ptmp->pg_flags, PQ_AOBJ);
/*
* got new page ready for I/O. break pps while
* loop. pps[lcv] is still NULL.
*/
break;
}
/* page is there, see if we need to wait on it */
if ((ptmp->pg_flags & PG_BUSY) != 0) {
uvm_pagewait(ptmp, uobj->vmobjlock, "uao_get");
rw_enter(uobj->vmobjlock, RW_WRITE);
continue; /* goto top of pps while loop */
}
/*
* if we get here then the page is resident and
* unbusy. we busy it now (so we own it).
*/
/* we own it, caller must un-busy */
atomic_setbits_int(&ptmp->pg_flags, PG_BUSY);
UVM_PAGE_OWN(ptmp, "uao_get2");
pps[lcv] = ptmp;
}
/*
* if we own the valid page at the correct offset, pps[lcv] will
* point to it. nothing more to do except go to the next page.
*/
if (pps[lcv])
continue; /* next lcv */
/*
* we have a "fake/busy/clean" page that we just allocated.
* do the needed "i/o", either reading from swap or zeroing.
*/
swslot = uao_find_swslot(uobj, pageidx);
/* just zero the page if there's nothing in swap. */
if (swslot == 0) {
/* page hasn't existed before, just zero it. */
uvm_pagezero(ptmp);
} else {
/*
* page in the swapped-out page.
* unlock object for i/o, relock when done.
*/
rw_exit(uobj->vmobjlock);
rv = uvm_swap_get(ptmp, swslot, PGO_SYNCIO);
rw_enter(uobj->vmobjlock, RW_WRITE);
/*
* I/O done. check for errors.
*/
if (rv != VM_PAGER_OK) {
/*
* remove the swap slot from the aobj
* and mark the aobj as having no real slot.
* don't free the swap slot, thus preventing
* it from being used again.
*/
swslot = uao_set_swslot(&aobj->u_obj, pageidx,
SWSLOT_BAD);
uvm_swap_markbad(swslot, 1);
if (ptmp->pg_flags & PG_WANTED)
wakeup(ptmp);
atomic_clearbits_int(&ptmp->pg_flags,
PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(ptmp, NULL);
uvm_lock_pageq();
uvm_pagefree(ptmp);
uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
return rv;
}
}
/*
* we got the page! clear the fake flag (indicates valid
* data now in page) and plug into our result array. note
* that page is still busy.
*
* it is the callers job to:
* => check if the page is released
* => unbusy the page
* => activate the page
*/
atomic_clearbits_int(&ptmp->pg_flags, PG_FAKE);
pmap_clear_modify(ptmp); /* ... and clean */
pps[lcv] = ptmp;
} /* lcv loop */
rw_exit(uobj->vmobjlock);
return VM_PAGER_OK;
}
/*
* uao_dropswap: release any swap resources from this aobj page.
*
* => aobj must be locked or have a reference count of 0.
*/
int
uao_dropswap(struct uvm_object *uobj, int pageidx)
{
int slot;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
slot = uao_set_swslot(uobj, pageidx, 0);
if (slot) {
uvm_swap_free(slot, 1);
}
return slot;
}
/*
* page in every page in every aobj that is paged-out to a range of swslots.
*
* => nothing should be locked by the caller; the list and object locks
*    are taken here.
* => returns TRUE if pagein was aborted due to lack of memory.
*/
boolean_t
uao_swap_off(int startslot, int endslot)
{
struct uvm_aobj *aobj;
/*
* Walk the list of all anonymous UVM objects. Grab the first.
*/
mtx_enter(&uao_list_lock);
if ((aobj = LIST_FIRST(&uao_list)) == NULL) {
mtx_leave(&uao_list_lock);
return FALSE;
}
uao_reference(&aobj->u_obj);
do {
struct uvm_aobj *nextaobj;
boolean_t rv;
/*
* Prefetch the next object and immediately hold a reference
* on it, so neither the current nor the next entry could
* disappear while we are iterating.
*/
if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) {
uao_reference(&nextaobj->u_obj);
}
mtx_leave(&uao_list_lock);
/*
* Page in all pages in the swap slot range.
*/
rw_enter(aobj->u_obj.vmobjlock, RW_WRITE);
rv = uao_pagein(aobj, startslot, endslot);
rw_exit(aobj->u_obj.vmobjlock);
/* Drop the reference of the current object. */
uao_detach(&aobj->u_obj);
if (rv) {
if (nextaobj) {
uao_detach(&nextaobj->u_obj);
}
return rv;
}
aobj = nextaobj;
mtx_enter(&uao_list_lock);
} while (aobj);
/*
* done with traversal, unlock the list
*/
mtx_leave(&uao_list_lock);
return FALSE;
}
/*
* page in any pages from aobj in the given range.
*
* => returns TRUE if pagein was aborted due to lack of memory.
*/
static boolean_t
uao_pagein(struct uvm_aobj *aobj, int startslot, int endslot)
{
boolean_t rv;
if (UAO_USES_SWHASH(aobj)) {
struct uao_swhash_elt *elt;
int bucket;
restart:
for (bucket = aobj->u_swhashmask; bucket >= 0; bucket--) {
for (elt = LIST_FIRST(&aobj->u_swhash[bucket]);
elt != NULL;
elt = LIST_NEXT(elt, list)) {
int i;
for (i = 0; i < UAO_SWHASH_CLUSTER_SIZE; i++) {
int slot = elt->slots[i];
/*
* if the slot isn't in range, skip it.
*/
if (slot < startslot ||
slot >= endslot) {
continue;
}
/*
* process the page,
* then start over on this object
* since the swhash elt
* may have been freed.
*/
rv = uao_pagein_page(aobj,
UAO_SWHASH_ELT_PAGEIDX_BASE(elt) + i);
if (rv) {
return rv;
}
goto restart;
}
}
}
} else {
int i;
for (i = 0; i < aobj->u_pages; i++) {
int slot = aobj->u_swslots[i];
/*
* if the slot isn't in range, skip it
*/
if (slot < startslot || slot >= endslot) {
continue;
}
/*
* process the page.
*/
rv = uao_pagein_page(aobj, i);
if (rv) {
return rv;
}
}
}
return FALSE;
}
/*
* uao_pagein_page: page in a single page from an anonymous UVM object.
*
* => Returns TRUE if pagein was aborted due to lack of memory.
*/
static boolean_t
uao_pagein_page(struct uvm_aobj *aobj, int pageidx)
{
struct uvm_object *uobj = &aobj->u_obj;
struct vm_page *pg;
int rv, slot, npages;
pg = NULL;
npages = 1;
KASSERT(rw_write_held(uobj->vmobjlock));
rv = uao_get(&aobj->u_obj, (voff_t)pageidx << PAGE_SHIFT,
&pg, &npages, 0, PROT_READ | PROT_WRITE, 0, 0);
/*
* relock and finish up.
*/
rw_enter(uobj->vmobjlock, RW_WRITE);
switch (rv) {
case VM_PAGER_OK:
break;
case VM_PAGER_ERROR:
case VM_PAGER_REFAULT:
/*
* nothing more to do on errors.
* VM_PAGER_REFAULT can only mean that the anon was freed,
* so again there's nothing to do.
*/
return FALSE;
}
/*
* ok, we've got the page now.
* mark it as dirty, clear its swslot and un-busy it.
*/
slot = uao_set_swslot(&aobj->u_obj, pageidx, 0);
uvm_swap_free(slot, 1);
atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_CLEAN|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
/*
* deactivate the page (to put it on a page queue).
*/
pmap_clear_reference(pg);
uvm_lock_pageq();
uvm_pagedeactivate(pg);
uvm_unlock_pageq();
return FALSE;
}
/*
* uao_dropswap_range: drop swapslots in the range.
*
* => aobj must be locked and is returned locked.
* => start is inclusive. end is exclusive.
*/
void
uao_dropswap_range(struct uvm_object *uobj, voff_t start, voff_t end)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int swpgonlydelta = 0;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
if (end == 0) {
end = INT64_MAX;
}
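/*
 * Walk every hash bucket (or the slot array) and free the swap slots
 * whose page index falls in [start, end); clusters that are only
 * partially covered are trimmed via startidx/endidx.
 */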
if (UAO_USES_SWHASH(aobj)) {
int i, hashbuckets = aobj->u_swhashmask + 1;
voff_t taghi;
voff_t taglo;
taglo = UAO_SWHASH_ELT_TAG(start);
taghi = UAO_SWHASH_ELT_TAG(end);
for (i = 0; i < hashbuckets; i++) {
struct uao_swhash_elt *elt, *next;
for (elt = LIST_FIRST(&aobj->u_swhash[i]);
elt != NULL;
elt = next) {
int startidx, endidx;
int j;
next = LIST_NEXT(elt, list);
if (elt->tag < taglo || taghi < elt->tag) {
continue;
}
if (elt->tag == taglo) {
startidx =
UAO_SWHASH_ELT_PAGESLOT_IDX(start);
} else {
startidx = 0;
}
if (elt->tag == taghi) {
endidx =
UAO_SWHASH_ELT_PAGESLOT_IDX(end);
} else {
endidx = UAO_SWHASH_CLUSTER_SIZE;
}
for (j = startidx; j < endidx; j++) {
int slot = elt->slots[j];
KASSERT(uvm_pagelookup(&aobj->u_obj,
(voff_t)(UAO_SWHASH_ELT_PAGEIDX_BASE(elt)
+ j) << PAGE_SHIFT) == NULL);
if (slot > 0) {
uvm_swap_free(slot, 1);
swpgonlydelta++;
KASSERT(elt->count > 0);
elt->slots[j] = 0;
elt->count--;
}
}
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
}
} else {
int i;
if (aobj->u_pages < end) {
end = aobj->u_pages;
}
for (i = start; i < end; i++) {
int slot = aobj->u_swslots[i];
if (slot > 0) {
uvm_swap_free(slot, 1);
swpgonlydelta++;
}
}
}
/*
* adjust the counter of pages only in swap for all
* the swap slots we've freed.
*/
if (swpgonlydelta > 0) {
KASSERT(uvmexp.swpgonly >= swpgonlydelta);
atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
}
/* $OpenBSD: kern_clock.c,v 1.105 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_clock.c,v 1.34 1996/06/09 04:51:03 briggs Exp $ */
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sched.h>
#include <sys/timetc.h>
#if defined(GPROF) || defined(DDBPROF)
#include <sys/gmon.h>
#endif
#include "dt.h"
#if NDT > 0
#include <dev/dt/dtvar.h>
#endif
/*
* Clock handling routines.
*
* This code is written to operate with two timers that run independently of
* each other. The main clock, running hz times per second, is used to keep
* track of real time. The second timer handles kernel and user profiling,
* and does resource use estimation. If the second timer is programmable,
* it is randomized to avoid aliasing between the two clocks. For example,
* the randomization prevents an adversary from always giving up the cpu
* just before its quantum expires. Otherwise, it would never accumulate
* cpu ticks. The mean frequency of the second timer is stathz.
*
* If no second timer exists, stathz will be zero; in this case we drive
* profiling and statistics off the main clock. This WILL NOT be accurate;
* do not do it unless absolutely necessary.
*
* The statistics clock may (or may not) be run at a higher rate while
* profiling. This profile clock runs at profhz. We require that profhz
* be an integral multiple of stathz.
*
* If the statistics clock is running fast, it must be divided by the ratio
* profhz/stathz for statistics. (For profiling, every tick counts.)
*/
int stathz;
int schedhz;
int profhz;
int profprocs;
int ticks;
static int psdiv, pscnt; /* prof => stat divider */
int psratio; /* ratio: prof / stat */
volatile unsigned long jiffies; /* XXX Linux API for drm(4) */
/*
* Initialize clock frequencies and start both clocks running.
*/
void
initclocks(void)
{
int i;
ticks = INT_MAX - (15 * 60 * hz);
jiffies = ULONG_MAX - (10 * 60 * hz);
/*
* Set divisors to 1 (normal case) and let the machine-specific
* code do its bit.
*/
psdiv = pscnt = 1;
cpu_initclocks();
/*
* Compute profhz/stathz, and fix profhz if needed.
*/
i = stathz ? stathz : hz;
if (profhz == 0)
profhz = i;
psratio = profhz / i;
inittimecounter();
}
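/*
 * Worked example (values illustrative): with stathz = 128 and a
 * machine-dependent profhz of 1024, psratio = 1024 / 128 = 8, so
 * while profiling statclock() only charges statistics on every 8th
 * tick.  If cpu_initclocks() leaves profhz at 0 it is set equal to
 * stathz (or hz) and psratio is 1.
 */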
/*
* hardclock does the accounting needed for ITIMER_PROF and ITIMER_VIRTUAL.
* We don't want to send signals with psignal from hardclock because it makes
* MULTIPROCESSOR locking very complicated. Instead, to use an idea from
* FreeBSD, we set a flag on the thread and when it goes to return to
* userspace it signals itself.
*/
/*
* The real-time timer, interrupting hz times per second.
*/
void
hardclock(struct clockframe *frame)
{
struct proc *p;
struct cpu_info *ci = curcpu();
p = curproc;
if (p && ((p->p_flag & (P_SYSTEM | P_WEXIT)) == 0)) {
struct process *pr = p->p_p;
/*
* Run current process's virtual and profile time, as needed.
*/
if (CLKF_USERMODE(frame) &&
timespecisset(&pr->ps_timer[ITIMER_VIRTUAL].it_value) &&
itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick_nsec) == 0) {
atomic_setbits_int(&p->p_flag, P_ALRMPEND);
need_proftick(p);
}
if (timespecisset(&pr->ps_timer[ITIMER_PROF].it_value) &&
itimerdecr(&pr->ps_timer[ITIMER_PROF], tick_nsec) == 0) {
atomic_setbits_int(&p->p_flag, P_PROFPEND);
need_proftick(p);
}
}
/*
* If no separate statistics clock is available, run it from here.
*/
if (stathz == 0)
statclock(frame);
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
#if NDT > 0
DT_ENTER(profile, NULL);
if (CPU_IS_PRIMARY(ci))
DT_ENTER(interval, NULL);
#endif
/*
* If we are not the primary CPU, we're not allowed to do
* any more work.
*/
if (CPU_IS_PRIMARY(ci) == 0)
return;
tc_ticktock();
ticks++;
jiffies++;
/*
* Update the timeout wheel.
*/
timeout_hardclock_update();
}
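/*
 * Userland counterpart (a sketch, not part of this file): the
 * ITIMER_VIRTUAL countdown decremented above is armed with
 * setitimer(2); when it expires, the flag set here turns into a
 * SIGVTALRM on return to userspace.
 *
 *	struct itimerval itv = { { 0, 0 }, { 1, 0 } };
 *
 *	setitimer(ITIMER_VIRTUAL, &itv, NULL);
 *	-- SIGVTALRM after roughly one second of user-mode CPU time
 */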
/*
* Compute number of hz in the specified amount of time.
*/
int
tvtohz(const struct timeval *tv)
{
unsigned long nticks;
time_t sec;
long usec;
/*
* If the number of usecs in the whole seconds part of the time
* fits in a long, then the total number of usecs will
* fit in an unsigned long. Compute the total and convert it to
* ticks, rounding up and adding 1 to allow for the current tick
* to expire. Rounding also depends on unsigned long arithmetic
* to avoid overflow.
*
* Otherwise, if the number of ticks in the whole seconds part of
* the time fits in a long, then convert the parts to
* ticks separately and add, using similar rounding methods and
* overflow avoidance. This method would work in the previous
* case but it is slightly slower and assumes that hz is integral.
*
* Otherwise, round the time down to the maximum
* representable value.
*
* If ints have 32 bits, then the maximum value for any timeout in
* 10ms ticks is 248 days.
*/
sec = tv->tv_sec;
usec = tv->tv_usec;
if (sec < 0 || (sec == 0 && usec <= 0))
nticks = 0;
else if (sec <= LONG_MAX / 1000000)
nticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
/ tick + 1;
else if (sec <= LONG_MAX / hz)
nticks = sec * hz
+ ((unsigned long)usec + (tick - 1)) / tick + 1;
else
nticks = LONG_MAX;
if (nticks > INT_MAX)
nticks = INT_MAX;
return ((int)nticks);
}
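/*
 * Worked example (assuming hz = 100, i.e. tick = 10000 microseconds):
 * for tv = { 1, 500000 } the first branch applies and
 *
 *	nticks = (1 * 1000000 + 500000 + 9999) / 10000 + 1
 *	       = 1509999 / 10000 + 1 = 151
 *
 * so 1.5 seconds rounds up to 151 ticks, the extra tick covering the
 * partially elapsed current tick.
 */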
int
tstohz(const struct timespec *ts)
{
struct timeval tv;
TIMESPEC_TO_TIMEVAL(&tv, ts);
/* Round up. */
if ((ts->tv_nsec % 1000) != 0) {
tv.tv_usec += 1;
if (tv.tv_usec >= 1000000) {
tv.tv_usec -= 1000000;
tv.tv_sec += 1;
}
}
return (tvtohz(&tv));
}
/*
* Start profiling on a process.
*
* Kernel profiling passes proc0 which never exits and hence
* keeps the profile clock running constantly.
*/
void
startprofclock(struct process *pr)
{
int s;
if ((pr->ps_flags & PS_PROFIL) == 0) {
atomic_setbits_int(&pr->ps_flags, PS_PROFIL);
if (++profprocs == 1 && stathz != 0) {
s = splstatclock();
psdiv = pscnt = psratio;
setstatclockrate(profhz);
splx(s);
}
}
}
/*
* Stop profiling on a process.
*/
void
stopprofclock(struct process *pr)
{
int s;
if (pr->ps_flags & PS_PROFIL) {
atomic_clearbits_int(&pr->ps_flags, PS_PROFIL);
if (--profprocs == 0 && stathz != 0) {
s = splstatclock();
psdiv = pscnt = 1;
setstatclockrate(stathz);
splx(s);
}
}
}
/*
* Statistics clock. Grab profile sample, and if divider reaches 0,
* do process and kernel statistics.
*/
void
statclock(struct clockframe *frame)
{
#if defined(GPROF) || defined(DDBPROF)
struct gmonparam *g;
u_long i;
#endif
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
struct proc *p = curproc;
struct process *pr;
/*
* Notice changes in divisor frequency, and adjust clock
* frequency accordingly.
*/
if (spc->spc_psdiv != psdiv) {
spc->spc_psdiv = psdiv;
spc->spc_pscnt = psdiv;
if (psdiv == 1) {
setstatclockrate(stathz);
} else {
setstatclockrate(profhz);
}
}
if (CLKF_USERMODE(frame)) {
pr = p->p_p;
if (pr->ps_flags & PS_PROFIL)
addupc_intr(p, CLKF_PC(frame));
if (--spc->spc_pscnt > 0)
return;
/*
* Came from user mode; CPU was in user state.
* If this process is being profiled record the tick.
*/
p->p_uticks++;
if (pr->ps_nice > NZERO)
spc->spc_cp_time[CP_NICE]++;
else
spc->spc_cp_time[CP_USER]++;
} else {
#if defined(GPROF) || defined(DDBPROF)
/*
* Kernel statistics are just like addupc_intr, only easier.
*/
g = ci->ci_gmon;
if (g != NULL && g->state == GMON_PROF_ON) {
i = CLKF_PC(frame) - g->lowpc;
if (i < g->textsize) {
i /= HISTFRACTION * sizeof(*g->kcount);
g->kcount[i]++;
}
}
#endif
if (p != NULL && p->p_p->ps_flags & PS_PROFIL)
addupc_intr(p, PROC_PC(p));
if (--spc->spc_pscnt > 0)
return;
/*
* Came from kernel mode, so we were:
* - spinning on a lock
* - handling an interrupt,
* - doing syscall or trap work on behalf of the current
* user process, or
* - spinning in the idle loop.
* Whichever it is, charge the time as appropriate.
* Note that we charge interrupts to the current process,
* regardless of whether they are ``for'' that process,
* so that we know how much of its real time was spent
* in ``non-process'' (i.e., interrupt) work.
*/
if (CLKF_INTR(frame)) {
if (p != NULL)
p->p_iticks++;
spc->spc_cp_time[spc->spc_spinning ?
CP_SPIN : CP_INTR]++;
} else if (p != NULL && p != spc->spc_idleproc) {
p->p_sticks++;
spc->spc_cp_time[spc->spc_spinning ?
CP_SPIN : CP_SYS]++;
} else
spc->spc_cp_time[spc->spc_spinning ?
CP_SPIN : CP_IDLE]++;
}
spc->spc_pscnt = psdiv;
if (p != NULL) {
p->p_cpticks++;
/*
* If no schedclock is provided, call it here at ~~12-25 Hz;
* ~~16 Hz is best
*/
if (schedhz == 0) {
if ((++spc->spc_schedticks & 3) == 0)
schedclock(p);
}
}
}
/*
* Return information about system clocks.
*/
int
sysctl_clockrate(char *where, size_t *sizep, void *newp)
{
struct clockinfo clkinfo;
/*
* Construct clockinfo structure.
*/
memset(&clkinfo, 0, sizeof clkinfo);
clkinfo.tick = tick;
clkinfo.hz = hz;
clkinfo.profhz = profhz;
clkinfo.stathz = stathz ? stathz : hz;
return (sysctl_rdstruct(where, sizep, newp, &clkinfo, sizeof(clkinfo)));
}
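/*
 * Userland sketch (not part of this file): the structure filled in
 * above is what a KERN_CLOCKRATE sysctl(2) returns.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int mib[2] = { CTL_KERN, KERN_CLOCKRATE };
 *		struct clockinfo ci;
 *		size_t len = sizeof(ci);
 *
 *		if (sysctl(mib, 2, &ci, &len, NULL, 0) == -1)
 *			return 1;
 *		printf("hz %d tick %d stathz %d profhz %d\n",
 *		    ci.hz, ci.tick, ci.stathz, ci.profhz);
 *		return 0;
 *	}
 */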
/* $OpenBSD: inet_ntop.c,v 1.4 2016/09/04 17:05:24 claudio Exp $ */
/* Copyright (c) 1996 by Internet Software Consortium.
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
* ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
* CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
* DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
* PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <netinet/in.h>
#define IN6ADDRSZ 16
#define INT16SZ 2
/*
* WARNING: Don't even consider trying to compile this on a system where
* sizeof(int) < 4. sizeof(int) > 4 is fine; all the world's not a VAX.
*/
static const char *inet_ntop4(const u_char *src, char *dst, size_t size);
#ifdef INET6
static const char *inet_ntop6(const u_char *src, char *dst, size_t size);
#endif /* INET6 */
/* char *
* inet_ntop(af, src, dst, size)
* convert a network format address to presentation format.
* return:
* pointer to presentation format address (`dst'), or NULL (see errno).
* author:
* Paul Vixie, 1996.
*/
const char *
inet_ntop(int af, const void *src, char *dst, socklen_t size)
{
switch (af) {
case AF_INET:
return (inet_ntop4(src, dst, (size_t)size));
#ifdef INET6
case AF_INET6:
return (inet_ntop6(src, dst, (size_t)size));
#endif /* INET6 */
default:
return (NULL);
}
/* NOTREACHED */
}
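/*
 * Calling convention (sketch; "ia" is a hypothetical struct in_addr):
 * the destination buffer must be large enough for the text form,
 * otherwise NULL is returned.
 *
 *	char buf[sizeof "255.255.255.255"];
 *
 *	if (inet_ntop(AF_INET, &ia, buf, sizeof(buf)) != NULL)
 *		printf("%s\n", buf);
 */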
const char *
sockaddr_ntop(struct sockaddr *sa, char *dst, size_t size)
{
u_int8_t l;
size_t n;
if (sa->sa_len < 2)
return "bad sa";
switch (sa->sa_family) {
case AF_INET:
return inet_ntop4((u_char *)&satosin(sa)->sin_addr, dst, size);
#ifdef INET6
case AF_INET6:
return inet_ntop6((u_char *)&satosin6(sa)->sin6_addr, dst, size);
#endif
default:
n = snprintf(dst, size, "%d ", sa->sa_family);
for (l = 0; l < sa->sa_len - offsetof(struct sockaddr, sa_data); l++) {
int r = snprintf(dst + n, size - n, "%02x", sa->sa_data[l]);
if (r == -1)
return "bad sa";
n += r;
if (n > size)
n = size;
}
return (dst);
}
}
/* const char *
* inet_ntop4(src, dst, size)
* format an IPv4 address, more or less like inet_ntoa()
* return:
* `dst' (as a const)
* notes:
* (1) uses no statics
* (2) takes a u_char* not an in_addr as input
* author:
* Paul Vixie, 1996.
*/
static const char *
inet_ntop4(const u_char *src, char *dst, size_t size)
{
char tmp[sizeof "255.255.255.255"];
int l;
l = snprintf(tmp, sizeof(tmp), "%u.%u.%u.%u",
src[0], src[1], src[2], src[3]);
if (l <= 0 || l >= size) {
return (NULL);
}
strlcpy(dst, tmp, size);
return (dst);
}
#ifdef INET6
/* const char *
* inet_ntop6(src, dst, size)
* convert IPv6 binary address into presentation (printable) format
* author:
* Paul Vixie, 1996.
*/
static const char *
inet_ntop6(const u_char *src, char *dst, size_t size)
{
/*
* Note that int32_t and int16_t need only be "at least" large enough
* to contain a value of the specified size. On some systems, like
* Crays, there is no such thing as an integer variable with 16 bits.
* Keep this in mind if you think this function should have been coded
* to use pointer overlays. All the world's not a VAX.
*/
char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
char *tp, *ep;
struct { int base, len; } best, cur;
u_int words[IN6ADDRSZ / INT16SZ];
int i;
int advance;
/*
* Preprocess:
* Copy the input (bytewise) array into a wordwise array.
* Find the longest run of 0x00's in src[] for :: shorthanding.
*/
memset(words, '\0', sizeof words);
for (i = 0; i < IN6ADDRSZ; i++)
words[i / 2] |= (src[i] << ((1 - (i % 2)) << 3));
best.base = -1;
cur.base = -1;
for (i = 0; i < (IN6ADDRSZ / INT16SZ); i++) {
if (words[i] == 0) {
if (cur.base == -1)
cur.base = i, cur.len = 1;
else
cur.len++;
} else {
if (cur.base != -1) {
if (best.base == -1 || cur.len > best.len)
best = cur;
cur.base = -1;
}
}
}
if (cur.base != -1) {
if (best.base == -1 || cur.len > best.len)
best = cur;
}
if (best.base != -1 && best.len < 2)
best.base = -1;
/*
* Format the result.
*/
tp = tmp;
ep = tmp + sizeof(tmp);
for (i = 0; i < (IN6ADDRSZ / INT16SZ) && tp < ep; i++) {
/* Are we inside the best run of 0x00's? */
if (best.base != -1 && i >= best.base &&
i < (best.base + best.len)) {
if (i == best.base) {
if (tp + 1 >= ep)
return (NULL);
*tp++ = ':';
}
continue;
}
/* Are we following an initial run of 0x00s or any real hex? */
if (i != 0) {
if (tp + 1 >= ep)
return (NULL);
*tp++ = ':';
}
/* Is this address an encapsulated IPv4? */
if (i == 6 && best.base == 0 &&
(best.len == 6 || (best.len == 5 && words[5] == 0xffff))) {
if (!inet_ntop4(src+12, tp, (size_t)(ep - tp)))
return (NULL);
tp += strlen(tp);
break;
}
advance = snprintf(tp, ep - tp, "%x", words[i]);
if (advance <= 0 || advance >= ep - tp)
return (NULL);
tp += advance;
}
/* Was it a trailing run of 0x00's? */
if (best.base != -1 &&
(best.base + best.len) == (IN6ADDRSZ / INT16SZ)) {
if (tp + 1 >= ep)
return (NULL);
*tp++ = ':';
}
if (tp + 1 >= ep)
return (NULL);
*tp++ = '\0';
/*
* Check for overflow, copy, and we're done.
*/
if ((size_t)(tp - tmp) > size) {
return (NULL);
}
strlcpy(dst, tmp, size);
return (dst);
}
#endif /* INET6 */
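/*
 * Examples of the zero-run compression implemented above
 * (illustrative): the IPv6 loopback address is rendered as "::1",
 * the seven leading zero groups collapsing into "::", while an
 * IPv4-mapped address such as ::ffff:192.0.2.1 takes the i == 6
 * special case and keeps its low 32 bits in dotted-quad form.
 */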
/* $OpenBSD: if_loop.c,v 1.91 2020/07/22 02:16:01 dlg Exp $ */
/* $NetBSD: if_loop.c,v 1.15 1996/05/07 02:40:33 thorpej Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_loop.c 8.1 (Berkeley) 6/10/93
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*
* Loopback interface driver for protocol testing and timing.
*/
#include "bpfilter.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/rtable.h>
#include <net/route.h>
#include <netinet/in.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#endif
#if NBPFILTER > 0
#include <net/bpf.h>
#endif
#define LOMTU 32768
int loioctl(struct ifnet *, u_long, caddr_t);
void loopattach(int);
void lortrequest(struct ifnet *, int, struct rtentry *);
void loinput(struct ifnet *, struct mbuf *);
int looutput(struct ifnet *,
struct mbuf *, struct sockaddr *, struct rtentry *);
int loop_clone_create(struct if_clone *, int);
int loop_clone_destroy(struct ifnet *);
struct if_clone loop_cloner =
IF_CLONE_INITIALIZER("lo", loop_clone_create, loop_clone_destroy);
void
loopattach(int n)
{
if (loop_clone_create(&loop_cloner, 0))
panic("unable to create lo0");
if_clone_attach(&loop_cloner);
}
int
loop_clone_create(struct if_clone *ifc, int unit)
{
struct ifnet *ifp;
ifp = malloc(sizeof(*ifp), M_DEVBUF, M_WAITOK|M_ZERO);
snprintf(ifp->if_xname, sizeof ifp->if_xname, "lo%d", unit);
ifp->if_softc = NULL;
ifp->if_mtu = LOMTU;
ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
ifp->if_xflags = IFXF_CLONED;
ifp->if_rtrequest = lortrequest;
ifp->if_ioctl = loioctl;
ifp->if_input = loinput;
ifp->if_output = looutput;
ifp->if_type = IFT_LOOP;
ifp->if_hdrlen = sizeof(u_int32_t);
if (unit == 0) {
if_attachhead(ifp);
if_addgroup(ifp, ifc->ifc_name);
rtable_l2set(0, 0, ifp->if_index);
} else
if_attach(ifp);
if_alloc_sadl(ifp);
#if NBPFILTER > 0
bpfattach(&ifp->if_bpf, ifp, DLT_LOOP, sizeof(u_int32_t));
#endif
return (0);
}
int
loop_clone_destroy(struct ifnet *ifp)
{
struct ifnet *p;
unsigned int rdomain = 0;
if (ifp->if_index == rtable_loindex(ifp->if_rdomain)) {
/* rdomain 0 always needs a loopback */
if (ifp->if_rdomain == 0)
return (EPERM);
/* if there is any other interface in this rdomain, deny */
NET_LOCK();
TAILQ_FOREACH(p, &ifnet, if_list) {
if (p->if_rdomain != ifp->if_rdomain)
continue;
if (p->if_index == ifp->if_index)
continue;
NET_UNLOCK();
return (EBUSY);
}
NET_UNLOCK();
rdomain = ifp->if_rdomain;
}
if_detach(ifp);
free(ifp, M_DEVBUF, sizeof(*ifp));
if (rdomain)
rtable_l2set(rdomain, 0, 0);
return (0);
}
void
loinput(struct ifnet *ifp, struct mbuf *m)
{
int error;
if ((m->m_flags & M_PKTHDR) == 0)
panic("%s: no header mbuf", __func__);
error = if_input_local(ifp, m, m->m_pkthdr.ph_family);
if (error)
ifp->if_ierrors++;
}
int
looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
struct rtentry *rt)
{
if ((m->m_flags & M_PKTHDR) == 0)
panic("%s: no header mbuf", __func__); if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m);
return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
}
/*
* Do not call if_input_local() directly. Queue the packet to avoid
* stack overflow and make TCP handshake over loopback work.
*/
return (if_output_local(ifp, m, dst->sa_family));
}
void
lortrequest(struct ifnet *ifp, int cmd, struct rtentry *rt)
{
if (rt && rt->rt_mtu == 0)
rt->rt_mtu = LOMTU;
}
/*
* Process an ioctl request.
*/
int
loioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct ifreq *ifr;
int error = 0;
switch (cmd) {
case SIOCSIFFLAGS:
break;
case SIOCSIFADDR:
ifp->if_flags |= IFF_RUNNING;
if_up(ifp); /* send up RTM_IFINFO */
/*
* Everything else is done at a higher level.
*/
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
case SIOCSIFMTU:
ifr = (struct ifreq *)data;
ifp->if_mtu = ifr->ifr_mtu;
break;
default:
error = ENOTTY;
}
return (error);
}
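/*
 * The cloner registered above is driven from userland by ifconfig(8):
 * "ifconfig lo1 create" ends up in loop_clone_create() and
 * "ifconfig lo1 destroy" in loop_clone_destroy(); lo0 itself cannot
 * be destroyed (see the rdomain 0 check in loop_clone_destroy()).
 */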
/* $OpenBSD: in6_cksum.c,v 1.18 2019/04/22 22:47:49 bluhm Exp $ */
/* $KAME: in6_cksum.c,v 1.10 2000/12/03 00:53:59 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988, 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
/*
* Checksum routine for Internet Protocol family headers (Portable Version).
*
* This routine is very heavily used in the network
* code and should be modified for each CPU to be as fast as possible.
*/
#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
/*
* m MUST contain a contiguous IP6 header.
* off is the offset where the TCP/UDP/ICMP6 header starts.
* len is the total length of the transport segment.
* (e.g. TCP header + TCP payload)
*/
int
in6_cksum(struct mbuf *m, uint8_t nxt, uint32_t off, uint32_t len)
{
uint16_t *w;
int sum = 0;
int mlen = 0;
int byte_swapped = 0;
struct ip6_hdr *ip6;
union {
uint16_t phs[4];
struct {
uint32_t ph_len;
uint8_t ph_zero[3];
uint8_t ph_nxt;
} ph __packed;
} uph;
union {
uint8_t c[2];
uint16_t s;
} s_util;
union {
uint16_t s[2];
uint32_t l;
} l_util;
/* sanity check */
if (m->m_pkthdr.len < off + len) {
panic("%s: mbuf len (%d) < off+len (%d+%d)",
__func__, m->m_pkthdr.len, off, len);
}
/* Skip pseudo-header if nxt == 0. */
if (nxt == 0)
goto skip_phdr;
bzero(&uph, sizeof(uph));
/*
* First create IP6 pseudo header and calculate a summary.
*/
ip6 = mtod(m, struct ip6_hdr *);
w = (uint16_t *)&ip6->ip6_src;
uph.ph.ph_len = htonl(len);
uph.ph.ph_nxt = nxt;
/* IPv6 source address */
sum += w[0];
if (!IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
sum += w[1];
sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5];
sum += w[6]; sum += w[7];
/* IPv6 destination address */
sum += w[8];
if (!IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
sum += w[9];
sum += w[10]; sum += w[11]; sum += w[12]; sum += w[13];
sum += w[14]; sum += w[15];
/* Payload length and upper layer identifier */
sum += uph.phs[0]; sum += uph.phs[1];
sum += uph.phs[2]; sum += uph.phs[3];
skip_phdr:
/*
* Secondly calculate a summary of the first mbuf excluding offset.
*/
while (m != NULL && off > 0) {
if (m->m_len <= off)
off -= m->m_len;
else
break;
m = m->m_next;
}
if (m == NULL) {
if (off)
panic("%s: out of header, off %u", __func__, off);
goto end;
}
w = (uint16_t *)(mtod(m, uint8_t *) + off);
mlen = m->m_len - off;
if (len < mlen)
mlen = len;
len -= mlen;
/*
* Force to even boundary.
*/
if ((1 & (long) w) && (mlen > 0)) {
REDUCE;
sum <<= 8;
s_util.c[0] = *(uint8_t *)w;
w = (uint16_t *)((uint8_t *)w + 1);
mlen--;
byte_swapped = 1;
}
/*
* Unroll the loop to make overhead from
* branches &c small.
*/
while ((mlen -= 32) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
w += 16;
}
mlen += 32;
while ((mlen -= 8) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
w += 4;
}
mlen += 8;
if (mlen == 0 && byte_swapped == 0)
goto next;
REDUCE;
while ((mlen -= 2) >= 0) {
sum += *w++;
}
if (byte_swapped) {
REDUCE;
sum <<= 8;
byte_swapped = 0;
if (mlen == -1) {
s_util.c[1] = *(uint8_t *)w;
sum += s_util.s;
mlen = 0;
} else
mlen = -1;
} else if (mlen == -1)
s_util.c[0] = *(uint8_t *)w;
next:
m = m->m_next;
/*
* Lastly calculate a summary of the rest of mbufs.
*/
for (;m && len; m = m->m_next) {
if (m->m_len == 0)
continue;
w = mtod(m, uint16_t *);
if (mlen == -1) {
/*
* The first byte of this mbuf is the continuation
* of a word spanning between this mbuf and the
* last mbuf.
*
* s_util.c[0] is already saved when scanning previous
* mbuf.
*/
s_util.c[1] = *(uint8_t *)w;
sum += s_util.s;
w = (uint16_t *)((uint8_t *)w + 1);
mlen = m->m_len - 1;
len--;
} else
mlen = m->m_len;
if (len < mlen)
mlen = len;
len -= mlen;
/*
* Force to even boundary.
*/
if ((1 & (long) w) && (mlen > 0)) {
REDUCE;
sum <<= 8;
s_util.c[0] = *(uint8_t *)w;
w = (uint16_t *)((uint8_t *)w + 1);
mlen--;
byte_swapped = 1;
}
/*
* Unroll the loop to make overhead from
* branches &c small.
*/
while ((mlen -= 32) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
w += 16;
}
mlen += 32;
while ((mlen -= 8) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
w += 4;
}
mlen += 8;
if (mlen == 0 && byte_swapped == 0)
continue;
REDUCE;
while ((mlen -= 2) >= 0) {
sum += *w++;
}
if (byte_swapped) {
REDUCE;
sum <<= 8;
byte_swapped = 0;
if (mlen == -1) {
s_util.c[1] = *(uint8_t *)w;
sum += s_util.s;
mlen = 0;
} else
mlen = -1;
} else if (mlen == -1)
s_util.c[0] = *(uint8_t *)w;
}
end:
if (len)
panic("%s: out of data, len %u", __func__, len); if (mlen == -1) {
/* The last mbuf has odd # of bytes. Follow the
standard (the odd byte may be shifted left by 8 bits
or not as determined by endian-ness of the machine) */
s_util.c[1] = 0;
sum += s_util.s;
}
REDUCE;
return (~sum & 0xffff);
}
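/*
 * Typical caller (a sketch mirroring how UDP over IPv6 fills in its
 * checksum; "uh" and "plen" are hypothetical here): the transport
 * header follows the fixed IPv6 header and plen is its total length.
 *
 *	uh->uh_sum = 0;
 *	if ((uh->uh_sum = in6_cksum(m, IPPROTO_UDP,
 *	    sizeof(struct ip6_hdr), plen)) == 0)
 *		uh->uh_sum = 0xffff;
 */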
/* $OpenBSD: uipc_mbuf2.c,v 1.45 2020/12/12 11:48:54 jan Exp $ */
/* $KAME: uipc_mbuf2.c,v 1.29 2001/02/14 13:42:10 itojun Exp $ */
/* $NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $ */
/*
* Copyright (C) 1999 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/mbuf.h>
extern struct pool mtagpool;
/* can't call it m_dup(), as freebsd[34] uses m_dup() with different arg */
static struct mbuf *m_dup1(struct mbuf *, int, int, int);
/*
* ensure that [off, off + len] is contiguous on the mbuf chain "m".
* packet chain before "off" is kept untouched.
* if offp == NULL, the target will start at <retval, 0> on resulting chain.
* if offp != NULL, the target will start at <retval, *offp> on resulting chain.
*
* on error return (NULL return value), original "m" will be freed.
*
* XXX m_trailingspace/m_leadingspace on shared cluster (sharedcluster)
*/
struct mbuf *
m_pulldown(struct mbuf *m, int off, int len, int *offp)
{
struct mbuf *n, *o;
int hlen, tlen, olen;
int sharedcluster;
/* check invalid arguments. */
if (m == NULL)
panic("m == NULL in m_pulldown()");
if ((n = m_getptr(m, off, &off)) == NULL) {
m_freem(m);
return (NULL); /* mbuf chain too short */
}
sharedcluster = M_READONLY(n);
/*
* the target data is on <n, off>.
* if we got enough data on the mbuf "n", we're done.
*/
if ((off == 0 || offp) && len <= n->m_len - off && !sharedcluster)
goto ok;
/*
* when len <= n->m_len - off and off != 0, it is a special case.
* len bytes from <n, off> sits in single mbuf, but the caller does
* not like the starting position (off).
* chop the current mbuf into two pieces, set off to 0.
*/
if (len <= n->m_len - off) {
struct mbuf *mlast;
o = m_dup1(n, off, n->m_len - off, M_DONTWAIT);
if (o == NULL) {
m_freem(m);
return (NULL); /* ENOBUFS */
}
for (mlast = o; mlast->m_next != NULL; mlast = mlast->m_next)
;
n->m_len = off;
mlast->m_next = n->m_next;
n->m_next = o;
n = o;
off = 0;
goto ok;
}
/*
* we need to take hlen from <n, off> and tlen from <n->m_next, 0>,
* and construct contiguous mbuf with m_len == len.
* note that hlen + tlen == len, and tlen > 0.
*/
hlen = n->m_len - off;
tlen = len - hlen;
/*
* ensure that we have enough trailing data on mbuf chain.
* if not, we can do nothing about the chain.
*/
olen = 0;
for (o = n->m_next; o != NULL; o = o->m_next)
olen += o->m_len;
if (hlen + olen < len) {
m_freem(m);
return (NULL); /* mbuf chain too short */
}
/*
* easy cases first.
* we need to use m_copydata() to get data from <n->m_next, 0>.
*/
if ((off == 0 || offp) && m_trailingspace(n) >= tlen &&
!sharedcluster) {
m_copydata(n->m_next, 0, tlen, mtod(n, caddr_t) + n->m_len);
n->m_len += tlen;
m_adj(n->m_next, tlen);
goto ok;
}
if ((off == 0 || offp) && m_leadingspace(n->m_next) >= hlen &&
!sharedcluster && n->m_next->m_len >= tlen) {
n->m_next->m_data -= hlen;
n->m_next->m_len += hlen;
memmove(mtod(n->m_next, caddr_t), mtod(n, caddr_t) + off, hlen);
n->m_len -= hlen;
n = n->m_next;
off = 0;
goto ok;
}
/*
* now, we need to do the hard way. don't m_copym as there's no room
* on both ends.
*/
if (len > MAXMCLBYTES) {
m_freem(m);
return (NULL);
}
MGET(o, M_DONTWAIT, m->m_type);
if (o && len > MLEN) {
MCLGETL(o, M_DONTWAIT, len);
if ((o->m_flags & M_EXT) == 0) {
m_free(o);
o = NULL;
}
}
if (!o) {
m_freem(m);
return (NULL); /* ENOBUFS */
}
/* get hlen from <n, off> into <o, 0> */
o->m_len = hlen;
memmove(mtod(o, caddr_t), mtod(n, caddr_t) + off, hlen);
n->m_len -= hlen;
/* get tlen from <n->m_next, 0> into <o, hlen> */
m_copydata(n->m_next, 0, tlen, mtod(o, caddr_t) + o->m_len);
o->m_len += tlen;
m_adj(n->m_next, tlen);
o->m_next = n->m_next;
n->m_next = o;
n = o;
off = 0;
ok:
if (offp)
*offp = off;
return (n);
}
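/*
 * Typical use (a sketch; the header type and offsets are
 * hypothetical): make "len" bytes starting at "off" contiguous
 * before taking a pointer into the chain.
 *
 *	struct mbuf *n;
 *	struct tcphdr *th;
 *	int thoff;
 *
 *	n = m_pulldown(m, off, sizeof(*th), &thoff);
 *	if (n == NULL)
 *		return;		-- "m" has already been freed
 *	th = (struct tcphdr *)(mtod(n, caddr_t) + thoff);
 */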
static struct mbuf *
m_dup1(struct mbuf *m, int off, int len, int wait)
{
struct mbuf *n;
int l;
if (len > MAXMCLBYTES)
return (NULL);
if (off == 0 && (m->m_flags & M_PKTHDR) != 0) {
MGETHDR(n, wait, m->m_type);
if (n == NULL)
return (NULL);
if (m_dup_pkthdr(n, m, wait)) {
m_free(n);
return (NULL);
}
l = MHLEN;
} else {
MGET(n, wait, m->m_type);
l = MLEN;
}
if (n && len > l) {
MCLGETL(n, wait, len);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (!n)
return (NULL);
m_copydata(m, off, len, mtod(n, caddr_t));
n->m_len = len;
return (n);
}
/* Get a packet tag structure along with specified data following. */
struct m_tag *
m_tag_get(int type, int len, int wait)
{
struct m_tag *t;
if (len < 0)
return (NULL);
if (len > PACKET_TAG_MAXSIZE)
panic("requested tag size for pool %#x is too big", type); t = pool_get(&mtagpool, wait == M_WAITOK ? PR_WAITOK : PR_NOWAIT); if (t == NULL)
return (NULL);
t->m_tag_id = type;
t->m_tag_len = len;
return (t);
}
/* Prepend a packet tag. */
void
m_tag_prepend(struct mbuf *m, struct m_tag *t)
{
SLIST_INSERT_HEAD(&m->m_pkthdr.ph_tags, t, m_tag_link);
m->m_pkthdr.ph_tagsset |= t->m_tag_id;
}
/* Unlink and free a packet tag. */
void
m_tag_delete(struct mbuf *m, struct m_tag *t)
{
u_int32_t ph_tagsset = 0;
struct m_tag *p;
SLIST_REMOVE(&m->m_pkthdr.ph_tags, t, m_tag, m_tag_link);
pool_put(&mtagpool, t);
SLIST_FOREACH(p, &m->m_pkthdr.ph_tags, m_tag_link)
ph_tagsset |= p->m_tag_id;
m->m_pkthdr.ph_tagsset = ph_tagsset;
}
/* Unlink and free a packet tag chain. */
void
m_tag_delete_chain(struct mbuf *m)
{
struct m_tag *p;
while ((p = SLIST_FIRST(&m->m_pkthdr.ph_tags)) != NULL) {
SLIST_REMOVE_HEAD(&m->m_pkthdr.ph_tags, m_tag_link);
pool_put(&mtagpool, p);
}
m->m_pkthdr.ph_tagsset = 0;
}
/* Find a tag, starting from a given position. */
struct m_tag *
m_tag_find(struct mbuf *m, int type, struct m_tag *t)
{
struct m_tag *p;
if (!(m->m_pkthdr.ph_tagsset & type))
return (NULL);
if (t == NULL)
p = SLIST_FIRST(&m->m_pkthdr.ph_tags);
else
p = SLIST_NEXT(t, m_tag_link);
while (p != NULL) {
if (p->m_tag_id == type)
return (p);
p = SLIST_NEXT(p, m_tag_link);
}
return (NULL);
}
/* Copy a single tag. */
struct m_tag *
m_tag_copy(struct m_tag *t, int wait)
{
struct m_tag *p;
p = m_tag_get(t->m_tag_id, t->m_tag_len, wait);
if (p == NULL)
return (NULL);
memcpy(p + 1, t + 1, t->m_tag_len); /* Copy the data */
return (p);
}
/*
* Copy two tag chains. The destination mbuf (to) loses any attached
* tags even if the operation fails. This should not be a problem, as
* m_tag_copy_chain() is typically called with a newly-allocated
* destination mbuf.
*/
int
m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int wait)
{
struct m_tag *p, *t, *tprev = NULL;
m_tag_delete_chain(to);
SLIST_FOREACH(p, &from->m_pkthdr.ph_tags, m_tag_link) {
t = m_tag_copy(p, wait);
if (t == NULL) {
m_tag_delete_chain(to);
return (ENOBUFS);
}
if (tprev == NULL)
SLIST_INSERT_HEAD(&to->m_pkthdr.ph_tags, t, m_tag_link);
else
SLIST_INSERT_AFTER(tprev, t, m_tag_link);
tprev = t;
to->m_pkthdr.ph_tagsset |= t->m_tag_id;
}
return (0);
}
/* Initialize tags on an mbuf. */
void
m_tag_init(struct mbuf *m)
{
SLIST_INIT(&m->m_pkthdr.ph_tags);
}
/* Get first tag in chain. */
struct m_tag *
m_tag_first(struct mbuf *m)
{
return (SLIST_FIRST(&m->m_pkthdr.ph_tags));
}
/* Get next tag in chain. */
struct m_tag *
m_tag_next(struct mbuf *m, struct m_tag *t)
{
return (SLIST_NEXT(t, m_tag_link));
}
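/*
 * Example traversal (sketch): walk every tag attached to a packet
 * header mbuf; any tag payload follows the m_tag structure itself.
 *
 *	struct m_tag *mtag;
 *
 *	for (mtag = m_tag_first(m); mtag != NULL;
 *	    mtag = m_tag_next(m, mtag)) {
 *		-- mtag->m_tag_id identifies the tag,
 *		-- (mtag + 1) points at m_tag_len bytes of data
 *	}
 */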
/* $OpenBSD: kern_exit.c,v 1.204 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_exit.c,v 1.39 1996/04/22 01:38:25 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/ptrace.h>
#include <sys/acct.h>
#include <sys/filedesc.h>
#include <sys/signalvar.h>
#include <sys/sched.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/mutex.h>
#ifdef SYSVSEM
#include <sys/sem.h>
#endif
#include <sys/witness.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_extern.h>
#include "kcov.h"
#if NKCOV > 0
#include <sys/kcov.h>
#endif
void proc_finish_wait(struct proc *, struct proc *);
void process_clear_orphan(struct process *);
void process_zap(struct process *);
void proc_free(struct proc *);
void unveil_destroy(struct process *ps);
/*
* exit --
* Death of process.
*/
int
sys_exit(struct proc *p, void *v, register_t *retval)
{
struct sys_exit_args /* {
syscallarg(int) rval;
} */ *uap = v;
exit1(p, SCARG(uap, rval), 0, EXIT_NORMAL);
/* NOTREACHED */
return (0);
}
int
sys___threxit(struct proc *p, void *v, register_t *retval)
{
struct sys___threxit_args /* {
syscallarg(pid_t *) notdead;
} */ *uap = v;
if (SCARG(uap, notdead) != NULL) {
pid_t zero = 0;
if (copyout(&zero, SCARG(uap, notdead), sizeof(zero)))
psignal(p, SIGSEGV);
}
exit1(p, 0, 0, EXIT_THREAD);
return (0);
}
/*
* Exit: deallocate address space and other resources, change proc state
* to zombie, and unlink proc from allproc and parent's lists. Save exit
* status and rusage for wait(). Check for child processes and orphan them.
*/
void
exit1(struct proc *p, int xexit, int xsig, int flags)
{
struct process *pr, *qr, *nqr;
struct rusage *rup;
int s;
atomic_setbits_int(&p->p_flag, P_WEXIT);
pr = p->p_p;
/* single-threaded? */
if (!P_HASSIBLING(p)) {
flags = EXIT_NORMAL;
} else {
/* nope, multi-threaded */
if (flags == EXIT_NORMAL)
single_thread_set(p, SINGLE_EXIT, 1);
else if (flags == EXIT_THREAD)
single_thread_check(p, 0);
}
if (flags == EXIT_NORMAL && !(pr->ps_flags & PS_EXITING)) {
if (pr->ps_pid == 1)
panic("init died (signal %d, exit %d)", xsig, xexit);
atomic_setbits_int(&pr->ps_flags, PS_EXITING);
pr->ps_xexit = xexit;
pr->ps_xsig = xsig;
/*
* If parent is waiting for us to exit or exec, PS_PPWAIT
* is set; we wake up the parent early to avoid deadlock.
*/
if (pr->ps_flags & PS_PPWAIT) {
atomic_clearbits_int(&pr->ps_flags, PS_PPWAIT);
atomic_clearbits_int(&pr->ps_pptr->ps_flags,
PS_ISPWAIT);
wakeup(pr->ps_pptr);
}
}
/* unlink ourselves from the active threads */
SCHED_LOCK(s);
TAILQ_REMOVE(&pr->ps_threads, p, p_thr_link);
SCHED_UNLOCK(s);
if ((p->p_flag & P_THREAD) == 0) {
/* main thread gotta wait because it has the pid, et al */
while (pr->ps_refcnt > 1)
tsleep_nsec(&pr->ps_threads, PWAIT, "thrdeath", INFSLP);
if (pr->ps_flags & PS_PROFIL)
stopprofclock(pr);
}
rup = pr->ps_ru;
if (rup == NULL) {
rup = pool_get(&rusage_pool, PR_WAITOK | PR_ZERO);
if (pr->ps_ru == NULL) {
pr->ps_ru = rup;
} else {
pool_put(&rusage_pool, rup);
rup = pr->ps_ru;
}
}
p->p_siglist = 0;
if ((p->p_flag & P_THREAD) == 0)
pr->ps_siglist = 0;
kqpoll_exit();
#if NKCOV > 0
kcov_exit(p);
#endif
if ((p->p_flag & P_THREAD) == 0) {
sigio_freelist(&pr->ps_sigiolst);
/* close open files and release open-file table */
fdfree(p);
cancel_all_itimers();
timeout_del(&pr->ps_rucheck_to);
#ifdef SYSVSEM
semexit(pr);
#endif
killjobc(pr);
#ifdef ACCOUNTING
acct_process(p);
#endif
#ifdef KTRACE
/* release trace file */
if (pr->ps_tracevp)
ktrcleartrace(pr);
#endif
unveil_destroy(pr);
/*
* If parent has the SAS_NOCLDWAIT flag set, we're not
* going to become a zombie.
*/
if (pr->ps_pptr->ps_sigacts->ps_sigflags & SAS_NOCLDWAIT)
atomic_setbits_int(&pr->ps_flags, PS_NOZOMBIE);
}
p->p_fd = NULL; /* zap the thread's copy */
/*
* Remove proc from pidhash chain and allproc so looking
* it up won't work. We will put the proc on the
* deadproc list later (using the p_hash member), and
* wake up the reaper when we do. If this is the last
* thread of a process that isn't PS_NOZOMBIE, we'll put
* the process on the zombprocess list below.
*/
/*
* NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
*/
p->p_stat = SDEAD;
LIST_REMOVE(p, p_hash);
LIST_REMOVE(p, p_list);
if ((p->p_flag & P_THREAD) == 0) {
LIST_REMOVE(pr, ps_hash);
LIST_REMOVE(pr, ps_list);
if ((pr->ps_flags & PS_NOZOMBIE) == 0)
LIST_INSERT_HEAD(&zombprocess, pr, ps_list);
else {
/*
* Not going to be a zombie, so it's now off all
* the lists scanned by ispidtaken(), so block
* fast reuse of the pid now.
*/
freepid(pr->ps_pid);
}
/*
* Reparent children to their original parent, in case
* they were being traced, or to init(8).
*/
qr = LIST_FIRST(&pr->ps_children);
if (qr) /* only need this if any child is S_ZOMB */
wakeup(initprocess);
for (; qr != NULL; qr = nqr) {
nqr = LIST_NEXT(qr, ps_sibling);
/*
* Traced processes are killed since their
* existence means someone is screwing up.
*/
if (qr->ps_flags & PS_TRACED &&
!(qr->ps_flags & PS_EXITING)) {
process_untrace(qr);
/*
* If single threading is active,
* direct the signal to the active
* thread to avoid deadlock.
*/
if (qr->ps_single)
ptsignal(qr->ps_single, SIGKILL,
STHREAD);
else
prsignal(qr, SIGKILL);
} else {
process_reparent(qr, initprocess);
}
}
/*
* Make sure orphans won't remember the exiting process.
*/
while ((qr = LIST_FIRST(&pr->ps_orphans)) != NULL) {
KASSERT(qr->ps_oppid == pr->ps_pid);
qr->ps_oppid = 0;
process_clear_orphan(qr);
}
}
/* add thread's accumulated rusage into the process's total */
ruadd(rup, &p->p_ru);
tuagg(pr, p);
/*
* clear %cpu usage during swap
*/
p->p_pctcpu = 0;
if ((p->p_flag & P_THREAD) == 0) {
/*
* Final thread has died, so add on our children's rusage
* and calculate the total times
*/
calcru(&pr->ps_tu, &rup->ru_utime, &rup->ru_stime, NULL);
ruadd(rup, &pr->ps_cru);
/*
* Notify parent that we're gone. If we're not going to
* become a zombie, reparent to process 1 (init) so that
* we can wake our original parent to possibly unblock
* wait4() to return ECHILD.
*/
if (pr->ps_flags & PS_NOZOMBIE) {
struct process *ppr = pr->ps_pptr;
process_reparent(pr, initprocess);
wakeup(ppr);
}
}
/* just a thread? detach it from its process */
if (p->p_flag & P_THREAD) {
/* scheduler_wait_hook(pr->ps_mainproc, p); XXX */
if (--pr->ps_refcnt == 1)
wakeup(&pr->ps_threads);
KASSERT(pr->ps_refcnt > 0);
}
/* Release the thread's read reference of resource limit structure. */
if (p->p_limit != NULL) {
struct plimit *limit;
limit = p->p_limit;
p->p_limit = NULL;
lim_free(limit);
}
/*
* Other substructures are freed from reaper and wait().
*/
/*
* Finally, call machine-dependent code to switch to a new
* context (possibly the idle context). Once we are no longer
* using the dead process's vmspace and stack, exit2() will be
* called to schedule those resources to be released by the
* reaper thread.
*
* Note that cpu_exit() will end with a call equivalent to
* cpu_switch(), finishing our execution (pun intended).
*/
uvmexp.swtch++;
cpu_exit(p);
panic("cpu_exit returned");
}
/*
* Locking of this proclist is special; it's accessed in a
* critical section of process exit, and thus locking it can't
* modify interrupt state. We use a simple spin lock for this
* proclist. We use the p_hash member to linkup to deadproc.
*/
struct mutex deadproc_mutex =
MUTEX_INITIALIZER_FLAGS(IPL_NONE, "deadproc", MTX_NOWITNESS);
struct proclist deadproc = LIST_HEAD_INITIALIZER(deadproc);
/*
* We are called from cpu_exit() once it is safe to schedule the
* dead process's resources to be freed.
*
* NOTE: One must be careful with locking in this routine. It's
* called from a critical section in machine-dependent code, so
* we should refrain from changing any interrupt state.
*
* We lock the deadproc list, place the proc on that list (using
* the p_hash member), and wake up the reaper.
*/
void
exit2(struct proc *p)
{
mtx_enter(&deadproc_mutex);
LIST_INSERT_HEAD(&deadproc, p, p_hash);
mtx_leave(&deadproc_mutex);
wakeup(&deadproc);
}
void
proc_free(struct proc *p)
{
crfree(p->p_ucred);
pool_put(&proc_pool, p);
nthreads--;
}
/*
* Process reaper. This is run by a kernel thread to free the resources
* of a dead process. Once the resources are free, the process becomes
* a zombie, and the parent is allowed to read the undead's status.
*/
void
reaper(void *arg)
{
struct proc *p;
KERNEL_UNLOCK();
SCHED_ASSERT_UNLOCKED();
for (;;) {
mtx_enter(&deadproc_mutex);
while ((p = LIST_FIRST(&deadproc)) == NULL)
msleep_nsec(&deadproc, &deadproc_mutex, PVM, "reaper",
INFSLP);
/* Remove us from the deadproc list. */
LIST_REMOVE(p, p_hash);
mtx_leave(&deadproc_mutex);
WITNESS_THREAD_EXIT(p);
KERNEL_LOCK();
/*
* Free the VM resources we're still holding on to.
* We must do this from a valid thread because doing
* so may block.
*/
uvm_uarea_free(p);
p->p_vmspace = NULL; /* zap the thread's copy */
if (p->p_flag & P_THREAD) {
/* Just a thread */
proc_free(p);
} else {
struct process *pr = p->p_p;
/* Release the rest of the process's vmspace */
uvm_exit(pr);
if ((pr->ps_flags & PS_NOZOMBIE) == 0) {
/* Process is now a true zombie. */
atomic_setbits_int(&pr->ps_flags, PS_ZOMBIE);
}
/* Notify listeners of our demise and clean up. */
knote_processexit(pr);
if (pr->ps_flags & PS_ZOMBIE) {
/* Post SIGCHLD and wake up parent. */
prsignal(pr->ps_pptr, SIGCHLD);
wakeup(pr->ps_pptr);
} else {
/* No one will wait for us, just zap it. */
process_zap(pr);
}
}
KERNEL_UNLOCK();
}
}
int
sys_wait4(struct proc *q, void *v, register_t *retval)
{
struct sys_wait4_args /* {
syscallarg(pid_t) pid;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct rusage *) rusage;
} */ *uap = v;
struct rusage ru;
int status, error;
error = dowait4(q, SCARG(uap, pid),
SCARG(uap, status) ? &status : NULL,
SCARG(uap, options), SCARG(uap, rusage) ? &ru : NULL, retval);
if (error == 0 && retval[0] > 0 && SCARG(uap, status)) {
error = copyout(&status, SCARG(uap, status), sizeof(status));
}
if (error == 0 && retval[0] > 0 && SCARG(uap, rusage)) {
error = copyout(&ru, SCARG(uap, rusage), sizeof(ru));
#ifdef KTRACE
if (error == 0 && KTRPOINT(q, KTR_STRUCT))
ktrrusage(q, &ru);
#endif
}
return (error);
}
int
dowait4(struct proc *q, pid_t pid, int *statusp, int options,
struct rusage *rusage, register_t *retval)
{
int nfound;
struct process *pr;
struct proc *p;
int error;
if (pid == 0)
pid = -q->p_p->ps_pgid;
if (options &~ (WUNTRACED|WNOHANG|WCONTINUED))
return (EINVAL);
loop:
nfound = 0;
LIST_FOREACH(pr, &q->p_p->ps_children, ps_sibling) {
if ((pr->ps_flags & PS_NOZOMBIE) ||
(pid != WAIT_ANY &&
pr->ps_pid != pid &&
pr->ps_pgid != -pid))
continue;
p = pr->ps_mainproc;
nfound++;
if (pr->ps_flags & PS_ZOMBIE) {
retval[0] = pr->ps_pid;
if (statusp != NULL)
*statusp = W_EXITCODE(pr->ps_xexit,
pr->ps_xsig);
if (rusage != NULL)
memcpy(rusage, pr->ps_ru, sizeof(*rusage));
proc_finish_wait(q, p);
return (0);
}
if (pr->ps_flags & PS_TRACED &&
(pr->ps_flags & PS_WAITED) == 0 && pr->ps_single &&
pr->ps_single->p_stat == SSTOP &&
(pr->ps_single->p_flag & P_SUSPSINGLE) == 0) {
if (single_thread_wait(pr, 0))
goto loop;
atomic_setbits_int(&pr->ps_flags, PS_WAITED);
retval[0] = pr->ps_pid;
if (statusp != NULL)
*statusp = W_STOPCODE(pr->ps_xsig);
if (rusage != NULL)
memset(rusage, 0, sizeof(*rusage));
return (0);
}
if (p->p_stat == SSTOP &&
(pr->ps_flags & PS_WAITED) == 0 &&
(p->p_flag & P_SUSPSINGLE) == 0 &&
(pr->ps_flags & PS_TRACED ||
options & WUNTRACED)) {
atomic_setbits_int(&pr->ps_flags, PS_WAITED);
retval[0] = pr->ps_pid;
if (statusp != NULL)
	*statusp = W_STOPCODE(pr->ps_xsig);
if (rusage != NULL)
	memset(rusage, 0, sizeof(*rusage));
return (0);
}
if ((options & WCONTINUED) && (p->p_flag & P_CONTINUED)) {
atomic_clearbits_int(&p->p_flag, P_CONTINUED);
retval[0] = pr->ps_pid;
if (statusp != NULL)
	*statusp = _WCONTINUED;
if (rusage != NULL)
	memset(rusage, 0, sizeof(*rusage));
return (0);
}
}
/*
* Look in the orphans list too, to allow the parent to
* collect its child's exit status even if child is being
* debugged.
*
* Debugger detaches from the parent upon successful
* switch-over from parent to child. At this point due to
* re-parenting the parent loses the child to debugger and a
* wait4(2) call would report that it has no children to wait
* for. By maintaining a list of orphans we allow the parent
* to successfully wait until the child becomes a zombie.
*/
if (nfound == 0) {
	LIST_FOREACH(pr, &q->p_p->ps_orphans, ps_orphan) {
		if ((pr->ps_flags & PS_NOZOMBIE) ||
		    (pid != WAIT_ANY && pr->ps_pid != pid &&
		    pr->ps_pgid != -pid))
continue;
nfound++;
break;
}
}
if (nfound == 0)
return (ECHILD);
if (options & WNOHANG) {
retval[0] = 0;
return (0);
}
if ((error = tsleep_nsec(q->p_p, PWAIT | PCATCH, "wait", INFSLP)) != 0)
return (error);
goto loop;
}
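/*
 * Illustrative sketch (not part of the kernel sources): the status word
 * built above with W_EXITCODE()/W_STOPCODE() is what a user process later
 * decodes with the wait(2) macros, e.g.:
 *
 *	int status;
 *	pid_t pid = waitpid(-1, &status, WUNTRACED);
 *
 *	if (pid > 0 && WIFEXITED(status))
 *		printf("child %d exited with %d\n", pid, WEXITSTATUS(status));
 *	else if (pid > 0 && WIFSTOPPED(status))
 *		printf("child %d stopped by signal %d\n", pid, WSTOPSIG(status));
 */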
void
proc_finish_wait(struct proc *waiter, struct proc *p)
{
struct process *pr, *tr;
struct rusage *rup;
/*
* If we got the child via a ptrace 'attach',
* we need to give it back to the old parent.
*/
pr = p->p_p;
if (pr->ps_oppid != 0 && (pr->ps_oppid != pr->ps_pptr->ps_pid) &&
(tr = prfind(pr->ps_oppid))) {
pr->ps_oppid = 0;
atomic_clearbits_int(&pr->ps_flags, PS_TRACED);
process_reparent(pr, tr);
prsignal(tr, SIGCHLD);
wakeup(tr);
} else {
scheduler_wait_hook(waiter, p);
rup = &waiter->p_p->ps_cru;
ruadd(rup, pr->ps_ru);
LIST_REMOVE(pr, ps_list); /* off zombprocess */
freepid(pr->ps_pid);
process_zap(pr);
}
}
/*
* give process back to original parent or init(8)
*/
void
process_untrace(struct process *pr)
{
struct process *ppr = NULL;
KASSERT(pr->ps_flags & PS_TRACED);
if (pr->ps_oppid != 0 &&
(pr->ps_oppid != pr->ps_pptr->ps_pid))
ppr = prfind(pr->ps_oppid);
/* not being traced any more */
pr->ps_oppid = 0;
atomic_clearbits_int(&pr->ps_flags, PS_TRACED);
process_reparent(pr, ppr ? ppr : initprocess);
}
void
process_clear_orphan(struct process *pr)
{
if (pr->ps_flags & PS_ORPHAN) {
LIST_REMOVE(pr, ps_orphan);
atomic_clearbits_int(&pr->ps_flags, PS_ORPHAN);
}
}
/*
* make process 'parent' the new parent of process 'child'.
*/
void
process_reparent(struct process *child, struct process *parent)
{
if (child->ps_pptr == parent)
return;
KASSERT(child->ps_oppid == 0 ||
child->ps_oppid == child->ps_pptr->ps_pid);
LIST_REMOVE(child, ps_sibling);
LIST_INSERT_HEAD(&parent->ps_children, child, ps_sibling);
process_clear_orphan(child);
if (child->ps_flags & PS_TRACED) {
atomic_setbits_int(&child->ps_flags, PS_ORPHAN);
LIST_INSERT_HEAD(&child->ps_pptr->ps_orphans, child, ps_orphan);
}
child->ps_pptr = parent;
child->ps_ppid = parent->ps_pid;
}
void
process_zap(struct process *pr)
{
struct vnode *otvp;
struct proc *p = pr->ps_mainproc;
/*
* Finally finished with old proc entry.
* Unlink it from its process group and free it.
*/
leavepgrp(pr);
LIST_REMOVE(pr, ps_sibling);
process_clear_orphan(pr);
/*
* Decrement the count of procs running with this uid.
*/
(void)chgproccnt(pr->ps_ucred->cr_ruid, -1);
/*
* Release reference to text vnode
*/
otvp = pr->ps_textvp;
pr->ps_textvp = NULL;
if (otvp)
vrele(otvp);
KASSERT(pr->ps_refcnt == 1);
if (pr->ps_ptstat != NULL)
free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat));
pool_put(&rusage_pool, pr->ps_ru);
KASSERT(TAILQ_EMPTY(&pr->ps_threads));
sigactsfree(pr->ps_sigacts);
lim_free(pr->ps_limit);
crfree(pr->ps_ucred);
pool_put(&process_pool, pr);
nprocesses--;
proc_free(p);
}
/* $OpenBSD: uvm_io.c,v 1.29 2022/03/12 08:11:07 mpi Exp $ */
/* $NetBSD: uvm_io.c,v 1.12 2000/06/27 17:29:23 mrg Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_io.c,v 1.1.2.2 1997/12/30 12:02:00 mrg Exp
*/
/*
* uvm_io.c: uvm i/o ops
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <uvm/uvm.h>
/*
* functions
*/
/*
* uvm_io: perform I/O on a map
*
* => caller must have a reference to "map" so that it doesn't go away
* while we are working.
*/
int
uvm_io(vm_map_t map, struct uio *uio, int flags)
{
vaddr_t baseva, endva, pageoffset, kva;
vsize_t chunksz, togo, sz;
struct uvm_map_deadq dead_entries;
int error, extractflags;
/*
* step 0: sanity checks and set up for copy loop. start with a
* large chunk size. if we have trouble finding vm space we will
* reduce it.
*/
if (uio->uio_resid == 0)
return(0);
togo = uio->uio_resid;
baseva = (vaddr_t) uio->uio_offset;
endva = baseva + (togo - 1);
if (endva < baseva) /* wrap around? */
return(EIO);
if (baseva >= VM_MAXUSER_ADDRESS)
return(0);
if (endva >= VM_MAXUSER_ADDRESS)
/* EOF truncate */
togo = togo - (endva - VM_MAXUSER_ADDRESS + 1);
pageoffset = baseva & PAGE_MASK;
baseva = trunc_page(baseva);
chunksz = min(round_page(togo + pageoffset), MAXBSIZE);
error = 0;
extractflags = 0;
if (flags & UVM_IO_FIXPROT)
extractflags |= UVM_EXTRACT_FIXPROT;
/*
* step 1: main loop... while we've got data to move
*/
for (/*null*/; togo > 0 ; pageoffset = 0) {
/*
* step 2: extract mappings from the map into kernel_map
*/
error = uvm_map_extract(map, baseva, chunksz, &kva,
extractflags);
if (error) {
/* retry with a smaller chunk... */
if (error == ENOMEM && chunksz > PAGE_SIZE) {
	chunksz = trunc_page(chunksz / 2);
if (chunksz < PAGE_SIZE)
chunksz = PAGE_SIZE;
continue;
}
break;
}
/*
* step 3: move a chunk of data
*/
sz = chunksz - pageoffset;
if (sz > togo)
sz = togo;
error = uiomove((caddr_t) (kva + pageoffset), sz, uio);
togo -= sz;
baseva += chunksz;
/*
* step 4: unmap the area of kernel memory
*/
vm_map_lock(kernel_map);
TAILQ_INIT(&dead_entries);
uvm_unmap_remove(kernel_map, kva, kva+chunksz,
&dead_entries, FALSE, TRUE);
vm_map_unlock(kernel_map);
uvm_unmap_detach(&dead_entries, AMAP_REFALL);
if (error)
break;
}
return (error);
}
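/*
 * Worked example (illustrative, assuming 4 KB pages and a 64 KB MAXBSIZE):
 * for a 100000-byte transfer that starts 512 bytes into a page, pageoffset
 * is 512 and round_page(100000 + 512) is 102400, so chunksz is clamped to
 * 65536.  The first pass through the loop therefore extracts a 64 KB
 * window and moves 65536 - 512 = 65024 bytes before unmapping it; later
 * passes start page-aligned (pageoffset reset to 0).
 */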
/* $OpenBSD: uvm_glue.c,v 1.83 2022/03/12 08:11:07 mpi Exp $ */
/* $NetBSD: uvm_glue.c,v 1.44 2001/02/06 19:54:44 eeh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_glue.c 8.6 (Berkeley) 1/5/94
* from: Id: uvm_glue.c,v 1.1.2.8 1998/02/07 01:16:54 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_glue.c: glue functions
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/user.h>
#ifdef SYSVSHM
#include <sys/shm.h>
#endif
#include <sys/sched.h>
#include <uvm/uvm.h>
/*
* uvm_kernacc: can the kernel access a region of memory
*
* - called from malloc [DIAGNOSTIC], and /dev/kmem driver (mem.c)
*/
boolean_t
uvm_kernacc(caddr_t addr, size_t len, int rw)
{
boolean_t rv;
vaddr_t saddr, eaddr;
vm_prot_t prot = rw == B_READ ? PROT_READ : PROT_WRITE;
saddr = trunc_page((vaddr_t)addr);
eaddr = round_page((vaddr_t)addr + len);
vm_map_lock_read(kernel_map);
rv = uvm_map_checkprot(kernel_map, saddr, eaddr, prot);
vm_map_unlock_read(kernel_map);
return rv;
}
/*
* uvm_vslock: wire user memory for I/O
*
* - called from sys_sysctl
*/
int
uvm_vslock(struct proc *p, caddr_t addr, size_t len, vm_prot_t access_type)
{
	struct vm_map *map = &p->p_vmspace->vm_map;
vaddr_t start, end;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
if (end <= start)
return (EINVAL);
return uvm_fault_wire(map, start, end, access_type);
}
/*
* uvm_vsunlock: unwire user memory wired by uvm_vslock()
*
* - called from sys_sysctl
*/
void
uvm_vsunlock(struct proc *p, caddr_t addr, size_t len)
{
vaddr_t start, end;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
KASSERT(end > start);
uvm_fault_unwire(&p->p_vmspace->vm_map, start, end);
}
/*
* uvm_vslock_device: wire user memory, make sure it's device reachable
* and bounce if necessary.
*
* - called from physio
*/
int
uvm_vslock_device(struct proc *p, void *addr, size_t len,
vm_prot_t access_type, void **retp)
{
struct vm_map *map = &p->p_vmspace->vm_map;
struct vm_page *pg;
struct pglist pgl;
int npages;
vaddr_t start, end, off;
vaddr_t sva, va;
vsize_t sz;
int error, mapv, i;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
sz = end - start;
off = (vaddr_t)addr - start;
if (end <= start)
return (EINVAL);
vm_map_lock_read(map);
retry:
mapv = map->timestamp;
vm_map_unlock_read(map);
if ((error = uvm_fault_wire(map, start, end, access_type)))
return (error);
vm_map_lock_read(map);
if (mapv != map->timestamp)
goto retry;
npages = atop(sz);
for (i = 0; i < npages; i++) {
paddr_t pa;
if (!pmap_extract(map->pmap, start + ptoa(i), &pa)) {
error = EFAULT;
goto out_unwire;
}
if (!PADDR_IS_DMA_REACHABLE(pa))
break;
}
if (i == npages) {
*retp = NULL;
return (0);
}
va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_nowait);
if (va == 0) {
error = ENOMEM;
goto out_unwire;
}
sva = va;
TAILQ_INIT(&pgl);
error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_WAITOK);
if (error)
goto out_unmap;
while ((pg = TAILQ_FIRST(&pgl)) != NULL) {
TAILQ_REMOVE(&pgl, pg, pageq);
pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE);
va += PAGE_SIZE;
}
pmap_update(pmap_kernel());
KASSERT(va == sva + sz);
*retp = (void *)(sva + off);
if ((error = copyin(addr, *retp, len)) == 0)
return 0;
uvm_km_pgremove_intrsafe(sva, sva + sz);
pmap_kremove(sva, sz);
pmap_update(pmap_kernel());
out_unmap:
km_free((void *)sva, sz, &kv_any, &kp_none);
out_unwire:
uvm_fault_unwire_locked(map, start, end);
vm_map_unlock_read(map);
return (error);
}
/*
* uvm_vsunlock_device: unwire user memory wired by uvm_vslock_device()
*
* - called from physio
*/
void
uvm_vsunlock_device(struct proc *p, void *addr, size_t len, void *map)
{
vaddr_t start, end;
vaddr_t kva;
vsize_t sz;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
KASSERT(end > start);
sz = end - start;
if (map)
copyout(map, addr, len);
uvm_fault_unwire_locked(&p->p_vmspace->vm_map, start, end);
vm_map_unlock_read(&p->p_vmspace->vm_map);
if (!map)
return;
kva = trunc_page((vaddr_t)map);
uvm_km_pgremove_intrsafe(kva, kva + sz);
pmap_kremove(kva, sz);
pmap_update(pmap_kernel());
uvm_km_free(kernel_map, kva, sz);
}
/*
* uvm_uarea_alloc: allocate the u-area for a new thread
*/
vaddr_t
uvm_uarea_alloc(void)
{
vaddr_t uaddr;
uaddr = uvm_km_kmemalloc_pla(kernel_map, uvm.kernel_object, USPACE,
USPACE_ALIGN, UVM_KMF_ZERO,
no_constraint.ucr_low, no_constraint.ucr_high,
0, 0, USPACE/PAGE_SIZE);
return (uaddr);
}
/*
* uvm_uarea_free: free a dead thread's stack
*
* - the thread passed to us is a dead thread; we
* are running on a different context now (the reaper).
*/
void
uvm_uarea_free(struct proc *p)
{
uvm_km_free(kernel_map, (vaddr_t)p->p_addr, USPACE);
p->p_addr = NULL;
}
/*
* uvm_exit: exit a virtual address space
*/
void
uvm_exit(struct process *pr)
{
struct vmspace *vm = pr->ps_vmspace;
pr->ps_vmspace = NULL;
uvmspace_free(vm);
}
/*
* uvm_init_limit: init per-process VM limits
*
* - called for process 0 and then inherited by all others.
*/
void
uvm_init_limits(struct plimit *limit0)
{
/*
* Set up the initial limits on process VM. Set the maximum
* resident set size to be all of (reasonably) available memory.
* This causes any single, large process to start random page
* replacement once it fills memory.
*/
limit0->pl_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
limit0->pl_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ;
limit0->pl_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ;
limit0->pl_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ;
limit0->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(uvmexp.free);
}
#ifdef DEBUG
int enableswap = 1;
int swapdebug = 0;
#define SDB_FOLLOW 1
#define SDB_SWAPIN 2
#define SDB_SWAPOUT 4
#endif
/*
* swapout_threads: find threads that can be swapped
*
* - called by the pagedaemon
* - try and swap at least one process
* - processes that are sleeping or stopped for maxslp or more seconds
* are swapped... otherwise the longest-sleeping or stopped process
* is swapped, otherwise the longest resident process...
*/
void
uvm_swapout_threads(void)
{
struct process *pr;
struct proc *p, *slpp;
struct process *outpr;
int outpri;
int didswap = 0;
extern int maxslp;
/* XXXCDC: should move off to uvmexp. or uvm., also in uvm_meter */
#ifdef DEBUG
if (!enableswap)
return;
#endif
/*
* outpr/outpri : stop/sleep process whose most active thread has
* the largest sleeptime < maxslp
*/
outpr = NULL;
outpri = 0;
LIST_FOREACH(pr, &allprocess, ps_list) {
if (pr->ps_flags & (PS_SYSTEM | PS_EXITING))
continue;
/*
* slpp: the sleeping or stopped thread in pr with
* the smallest p_slptime
*/
slpp = NULL;
TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
switch (p->p_stat) {
case SRUN:
case SONPROC:
goto next_process;
case SSLEEP:
case SSTOP:
if (slpp == NULL ||
slpp->p_slptime < p->p_slptime)
slpp = p;
continue;
}
}
if (slpp != NULL) {
if (slpp->p_slptime >= maxslp) {
pmap_collect(pr->ps_vmspace->vm_map.pmap);
didswap++;
} else if (slpp->p_slptime > outpri) {
outpr = pr;
outpri = slpp->p_slptime;
}
}
next_process: ;
}
/*
* If we didn't get rid of any real duds, toss out the next most
* likely sleeping/stopped or running candidate. We only do this
* if we are real low on memory since we don't gain much by doing
* it.
*/
if (didswap == 0 && uvmexp.free <= atop(round_page(USPACE)) &&
outpr != NULL) {
#ifdef DEBUG
if (swapdebug & SDB_SWAPOUT)
printf("swapout_threads: no duds, try procpr %p\n",
outpr);
#endif
pmap_collect(outpr->ps_vmspace->vm_map.pmap);
}
}
/*
* uvm_atopg: convert KVAs back to their page structures.
*/
struct vm_page *
uvm_atopg(vaddr_t kva)
{
struct vm_page *pg;
paddr_t pa;
boolean_t rv;
rv = pmap_extract(pmap_kernel(), kva, &pa);
KASSERT(rv);
pg = PHYS_TO_VM_PAGE(pa);
KASSERT(pg != NULL);
return (pg);
}
void
uvm_pause(void)
{
static unsigned int toggle;
if (toggle++ > 128) {
toggle = 0;
KERNEL_UNLOCK();
KERNEL_LOCK();
}
sched_pause(preempt);
}
#ifndef SMALL_KERNEL
int
fill_vmmap(struct process *pr, struct kinfo_vmentry *kve,
size_t *lenp)
{
struct vm_map *map;
if (pr != NULL)
map = &pr->ps_vmspace->vm_map;
else
map = kernel_map;
return uvm_map_fill_vmmap(map, kve, lenp);
}
#endif
/* $OpenBSD: vfs_cache.c,v 1.58 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: vfs_cache.c,v 1.13 1996/02/04 02:18:09 christos Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/pool.h>
/*
* TODO: namecache access should really be locked.
*/
/*
* For simplicity (and economy of storage), names longer than
* a maximum length of NAMECACHE_MAXLEN are not cached; they occur
* infrequently in any case, and are almost never of interest.
*
* Upon reaching the last segment of a path, if the reference
* is for DELETE, or NOCACHE is set (rewrite), and the
* name is located in the cache, it will be dropped.
*/
/*
* Structures associated with name caching.
*/
long numcache; /* total number of cache entries allocated */
long numneg; /* number of negative cache entries */
TAILQ_HEAD(, namecache) nclruhead; /* Regular Entry LRU chain */
TAILQ_HEAD(, namecache) nclruneghead; /* Negative Entry LRU chain */
struct nchstats nchstats; /* cache effectiveness statistics */
int doingcache = 1; /* 1 => enable the cache */
struct pool nch_pool;
void cache_zap(struct namecache *);
u_long nextvnodeid;
static inline int
namecache_compare(const struct namecache *n1, const struct namecache *n2)
{
if (n1->nc_nlen == n2->nc_nlen)
return (memcmp(n1->nc_name, n2->nc_name, n1->nc_nlen));
else
return (n1->nc_nlen - n2->nc_nlen);
}
RBT_PROTOTYPE(namecache_rb_cache, namecache, n_rbcache, namecache_compare);
RBT_GENERATE(namecache_rb_cache, namecache, n_rbcache, namecache_compare);
void
cache_tree_init(struct namecache_rb_cache *tree)
{
RBT_INIT(namecache_rb_cache, tree);
}
/*
* blow away a namecache entry
*/
void
cache_zap(struct namecache *ncp)
{
struct vnode *dvp = NULL;
if (ncp->nc_vp != NULL) {
TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
numcache--;
} else {
TAILQ_REMOVE(&nclruneghead, ncp, nc_neg);
numneg--;
}
if (ncp->nc_dvp) {
	RBT_REMOVE(namecache_rb_cache, &ncp->nc_dvp->v_nc_tree, ncp);
if (RBT_EMPTY(namecache_rb_cache, &ncp->nc_dvp->v_nc_tree))
dvp = ncp->nc_dvp;
}
if (ncp->nc_vp && (ncp->nc_vpid == ncp->nc_vp->v_id)) {
	if (ncp->nc_vp != ncp->nc_dvp &&
	    ncp->nc_vp->v_type == VDIR &&
	    (ncp->nc_nlen > 2 ||
	    (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') ||
	    (ncp->nc_nlen > 0 && ncp->nc_name[0] != '.'))) {
TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_me);
}
}
pool_put(&nch_pool, ncp);
if (dvp)
	vdrop(dvp);
}
/*
* Look for a name in the cache.
* dvp points to the directory to search. The componentname cnp holds
* the information on the entry being sought, such as its length
* and its name. If the lookup succeeds, vpp is set to point to the vnode
* and an error of 0 is returned. If the lookup determines the name does
* not exist (negative caching) an error of ENOENT is returned. If the
* lookup fails, an error of -1 is returned.
*/
int
cache_lookup(struct vnode *dvp, struct vnode **vpp,
struct componentname *cnp)
{
struct namecache *ncp;
struct namecache n;
struct vnode *vp;
u_long vpid;
int error;
*vpp = NULL;
if (!doingcache) {
cnp->cn_flags &= ~MAKEENTRY;
return (-1);
}
if (cnp->cn_namelen > NAMECACHE_MAXLEN) {
nchstats.ncs_long++;
cnp->cn_flags &= ~MAKEENTRY;
return (-1);
}
/* lookup in directory vnode's redblack tree */
n.nc_nlen = cnp->cn_namelen;
memcpy(n.nc_name, cnp->cn_nameptr, n.nc_nlen);
ncp = RBT_FIND(namecache_rb_cache, &dvp->v_nc_tree, &n);
if (ncp == NULL) {
nchstats.ncs_miss++;
return (-1);
}
if ((cnp->cn_flags & MAKEENTRY) == 0) {
nchstats.ncs_badhits++;
goto remove;
} else if (ncp->nc_vp == NULL) {
if (cnp->cn_nameiop != CREATE ||
(cnp->cn_flags & ISLASTCN) == 0) {
nchstats.ncs_neghits++;
/*
* Move this slot to end of the negative LRU chain,
*/
if (TAILQ_NEXT(ncp, nc_neg) != NULL) {
	TAILQ_REMOVE(&nclruneghead, ncp, nc_neg);
TAILQ_INSERT_TAIL(&nclruneghead, ncp,
nc_neg);
}
return (ENOENT);
} else {
nchstats.ncs_badhits++;
goto remove;
}
} else if (ncp->nc_vpid != ncp->nc_vp->v_id) {
nchstats.ncs_falsehits++;
goto remove;
}
/*
* Move this slot to end of the regular LRU chain.
*/
if (TAILQ_NEXT(ncp, nc_lru) != NULL) {
	TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
}
vp = ncp->nc_vp;
vpid = vp->v_id;
if (vp == dvp) { /* lookup on "." */
vref(dvp);
error = 0;
} else if (cnp->cn_flags & ISDOTDOT) {
VOP_UNLOCK(dvp);
cnp->cn_flags |= PDIRUNLOCK;
error = vget(vp, LK_EXCLUSIVE);
/*
* If the above vget() succeeded and both LOCKPARENT and
* ISLASTCN are set, lock the directory vnode as well.
*/
if (!error && (~cnp->cn_flags & (LOCKPARENT|ISLASTCN)) == 0) {
if ((error = vn_lock(dvp, LK_EXCLUSIVE)) != 0) {
vput(vp);
return (error);
}
cnp->cn_flags &= ~PDIRUNLOCK;
}
} else {
error = vget(vp, LK_EXCLUSIVE);
/*
* If the above vget() failed or either of LOCKPARENT or
* ISLASTCN is set, unlock the directory vnode.
*/
if (error || (~cnp->cn_flags & (LOCKPARENT|ISLASTCN)) != 0) {
	VOP_UNLOCK(dvp);
cnp->cn_flags |= PDIRUNLOCK;
}
}
/*
* Check that the lock succeeded, and that the capability number did
* not change while we were waiting for the lock.
*/
if (error || vpid != vp->v_id) {
if (!error) {
vput(vp);
nchstats.ncs_falsehits++;
} else
nchstats.ncs_badhits++;
/*
* The parent needs to be locked when we return to VOP_LOOKUP().
* The `.' case here should be extremely rare (if it can happen
* at all), so we don't bother optimizing out the unlock/relock.
*/
if (vp == dvp || error ||
(~cnp->cn_flags & (LOCKPARENT|ISLASTCN)) != 0) {
if ((error = vn_lock(dvp, LK_EXCLUSIVE)) != 0)
return (error);
cnp->cn_flags &= ~PDIRUNLOCK;
}
return (-1);
}
nchstats.ncs_goodhits++;
*vpp = vp;
return (0);
remove:
/*
* Last component and we are renaming or deleting,
* the cache entry is invalid, or otherwise don't
* want cache entry to exist.
*/
cache_zap(ncp);
return (-1);
}
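/*
 * Caller sketch (illustrative, not lifted from any particular filesystem):
 * a VOP_LOOKUP() implementation typically consults the cache first and
 * only scans the directory on a miss, e.g.
 *
 *	if ((error = cache_lookup(dvp, vpp, cnp)) >= 0)
 *		return (error);		(hit: 0, negative entry: ENOENT)
 *	...read the directory, then cache_enter(dvp, *vpp, cnp)...
 */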
/*
* Scan cache looking for name of directory entry pointing at vp.
*
* Fill in dvpp.
*
* If bufp is non-NULL, also place the name in the buffer which starts
* at bufp, immediately before *bpp, and move bpp backwards to point
* at the start of it. (Yes, this is a little baroque, but it's done
* this way to cater to the whims of getcwd).
*
* Returns 0 on success, -1 on cache miss, positive errno on failure.
*
* TODO: should we return *dvpp locked?
*/
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
{
struct namecache *ncp;
struct vnode *dvp = NULL;
char *bp;
if (!doingcache)
goto out;
TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_me) {
dvp = ncp->nc_dvp;
if (dvp && dvp != vp && ncp->nc_dvpid == dvp->v_id)
goto found;
}
goto miss;
found:
#ifdef DIAGNOSTIC
if (ncp->nc_nlen == 1 &&
ncp->nc_name[0] == '.')
panic("cache_revlookup: found entry for ."); if (ncp->nc_nlen == 2 && ncp->nc_name[0] == '.' &&
ncp->nc_name[1] == '.')
panic("cache_revlookup: found entry for ..");
#endif
nchstats.ncs_revhits++;
if (bufp != NULL) {
bp = *bpp;
bp -= ncp->nc_nlen;
if (bp <= bufp) {
*dvpp = NULL;
return (ERANGE);
}
memcpy(bp, ncp->nc_name, ncp->nc_nlen);
*bpp = bp;
}
*dvpp = dvp;
/*
* XXX: Should we vget() here to have more
* consistent semantics with cache_lookup()?
*/
return (0);
miss:
nchstats.ncs_revmiss++;
out:
*dvpp = NULL;
return (-1);
}
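/*
 * Worked example (illustrative): with a 5-byte name "hello", bufp pointing
 * at the start of a scratch buffer and *bpp at its end, a successful
 * cache_revlookup() moves *bpp back by 5 and copies "hello" there, so
 * repeated calls while walking ".." build the path right to left, which is
 * exactly the order getcwd wants.
 */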
/*
* Add an entry to the cache
*/
void
cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
struct namecache *ncp, *lncp;
if (!doingcache || cnp->cn_namelen > NAMECACHE_MAXLEN)
return;
/*
* allocate, or recycle (free and allocate) an ncp.
*/
if (numcache >= initialvnodes) {
	if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL)
cache_zap(ncp);
else if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL)
cache_zap(ncp);
else
panic("wtf? leak?");
}
ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO);
/* grab the vnode we just found */
ncp->nc_vp = vp;
if (vp)
	ncp->nc_vpid = vp->v_id;
/* fill in cache info */
ncp->nc_dvp = dvp;
ncp->nc_dvpid = dvp->v_id;
ncp->nc_nlen = cnp->cn_namelen;
memcpy(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen);
if (RBT_EMPTY(namecache_rb_cache, &dvp->v_nc_tree)) {
	vhold(dvp);
}
if ((lncp = RBT_INSERT(namecache_rb_cache, &dvp->v_nc_tree, ncp))
!= NULL) {
/* someone has raced us and added a different entry
* for the same vnode (different ncp) - we don't need
* this entry, so free it and we are done.
*/
pool_put(&nch_pool, ncp);
/* we know now dvp->v_nc_tree is not empty, no need
* to vdrop here
*/
goto done;
}
if (vp) {
TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
numcache++;
/* don't put . or .. in the reverse map */
if (vp != dvp && vp->v_type == VDIR && (ncp->nc_nlen > 2 ||
(ncp->nc_nlen > 1 &&
ncp->nc_name[1] != '.') || (ncp->nc_nlen > 0 &&
ncp->nc_name[0] != '.')))
TAILQ_INSERT_TAIL(&vp->v_cache_dst, ncp,
nc_me);
} else {
TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg);
numneg++;
}
if (numneg > initialvnodes) {
	if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL)
cache_zap(ncp);
}
done:
return;
}
/*
* Name cache initialization, from vfs_init() when we are booting
*/
void
nchinit(void)
{
TAILQ_INIT(&nclruhead);
TAILQ_INIT(&nclruneghead);
pool_init(&nch_pool, sizeof(struct namecache), 0, IPL_NONE, PR_WAITOK,
"nchpl", NULL);
}
/*
* Cache flush, a particular vnode; called when a vnode is renamed to
* hide entries that would now be invalid
*/
void
cache_purge(struct vnode *vp)
{
struct namecache *ncp;
/* We should never have destinations cached for a non-VDIR vnode. */
KASSERT(vp->v_type == VDIR || TAILQ_EMPTY(&vp->v_cache_dst));
while ((ncp = TAILQ_FIRST(&vp->v_cache_dst)))
cache_zap(ncp);
while ((ncp = RBT_ROOT(namecache_rb_cache, &vp->v_nc_tree)))
cache_zap(ncp);
/* XXX this blows goats */
vp->v_id = ++nextvnodeid;
if (vp->v_id == 0)
	vp->v_id = ++nextvnodeid;
}
/*
* Cache flush, a whole filesystem; called when filesys is umounted to
* remove entries that would now be invalid
*/
void
cache_purgevfs(struct mount *mp)
{
struct namecache *ncp, *nxtcp;
/* whack the regular entries */
TAILQ_FOREACH_SAFE(ncp, &nclruhead, nc_lru, nxtcp) {
if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp)
continue;
/* free the resources we had */
cache_zap(ncp);
}
/* whack the negative entries */
TAILQ_FOREACH_SAFE(ncp, &nclruneghead, nc_neg, nxtcp) {
if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp)
continue;
/* free the resources we had */
cache_zap(ncp);
}
}
/* $OpenBSD: socketvar.h,v 1.110 2022/09/05 14:56:09 bluhm Exp $ */
/* $NetBSD: socketvar.h,v 1.18 1996/02/09 18:25:38 christos Exp $ */
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)socketvar.h 8.1 (Berkeley) 6/2/93
*/
#ifndef _SYS_SOCKETVAR_H_
#define _SYS_SOCKETVAR_H_
#include <sys/selinfo.h> /* for struct selinfo */
#include <sys/queue.h>
#include <sys/sigio.h> /* for struct sigio_ref */
#include <sys/task.h>
#include <sys/timeout.h>
#include <sys/rwlock.h>
#include <sys/refcnt.h>
#ifndef _SOCKLEN_T_DEFINED_
#define _SOCKLEN_T_DEFINED_
typedef __socklen_t socklen_t; /* length type for network syscalls */
#endif
TAILQ_HEAD(soqhead, socket);
/*
* Kernel structure per socket.
* Contains send and receive buffer queues,
* handle on protocol and pointer to protocol
* private data and error information.
*/
struct socket {
const struct protosw *so_proto; /* protocol handle */
struct rwlock so_lock; /* this socket lock */
struct refcnt so_refcnt; /* references to this socket */
void *so_pcb; /* protocol control block */
u_int so_state; /* internal state flags SS_*, below */
short so_type; /* generic type, see socket.h */
short so_options; /* from socket call, see socket.h */
short so_linger; /* time to linger while closing */
/*
* Variables for connection queueing.
* Socket where accepts occur is so_head in all subsidiary sockets.
* If so_head is 0, socket is not related to an accept.
* For head socket so_q0 queues partially completed connections,
* while so_q is a queue of connections ready to be accepted.
* If a connection is aborted and it has so_head set, then
* it has to be pulled out of either so_q0 or so_q.
* We allow connections to queue up based on current queue lengths
* and limit on number of queued connections for this socket.
*/
struct socket *so_head; /* back pointer to accept socket */
struct soqhead *so_onq; /* queue (q or q0) that we're on */
struct soqhead so_q0; /* queue of partial connections */
struct soqhead so_q; /* queue of incoming connections */
struct sigio_ref so_sigio; /* async I/O registration */
TAILQ_ENTRY(socket) so_qe; /* our queue entry (q or q0) */
short so_q0len; /* partials on so_q0 */
short so_qlen; /* number of connections on so_q */
short so_qlimit; /* max number queued connections */
u_long so_newconn; /* # of pending sonewconn() threads */
short so_timeo; /* connection timeout */
u_long so_oobmark; /* chars to oob mark */
u_int so_error; /* error affecting connection */
/*
* Variables for socket splicing, allocated only when needed.
*/
struct sosplice {
struct socket *ssp_socket; /* send data to drain socket */
struct socket *ssp_soback; /* back ref to source socket */
off_t ssp_len; /* number of bytes spliced */
off_t ssp_max; /* maximum number of bytes */
struct timeval ssp_idletv; /* idle timeout */
struct timeout ssp_idleto;
struct task ssp_task; /* task for somove */
} *so_sp;
/*
* Variables for socket buffering.
*/
struct sockbuf {
/* The following fields are all zeroed on flush. */
#define sb_startzero sb_cc
u_long sb_cc; /* actual chars in buffer */
u_long sb_datacc; /* data only chars in buffer */
u_long sb_hiwat; /* max actual char count */
u_long sb_wat; /* default watermark */
u_long sb_mbcnt; /* chars of mbufs used */
u_long sb_mbmax; /* max chars of mbufs to use */
long sb_lowat; /* low water mark */
struct mbuf *sb_mb; /* the mbuf chain */
struct mbuf *sb_mbtail; /* the last mbuf in the chain */
struct mbuf *sb_lastrecord;/* first mbuf of last record in
socket buffer */
short sb_flags; /* flags, see below */
/* End area that is zeroed on flush. */
#define sb_endzero sb_flags
uint64_t sb_timeo_nsecs;/* timeout for read/write */
struct selinfo sb_sel; /* process selecting read/write */
} so_rcv, so_snd;
#define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */
#define SB_LOCK 0x01 /* lock on data queue */
#define SB_WANT 0x02 /* someone is waiting to lock */
#define SB_WAIT 0x04 /* someone is waiting for data/space */
#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */
#define SB_SPLICE 0x20 /* buffer is splice source or drain */
#define SB_NOINTR 0x40 /* operations not interruptible */
void (*so_upcall)(struct socket *so, caddr_t arg, int waitf);
caddr_t so_upcallarg; /* Arg for above */
uid_t so_euid, so_ruid; /* who opened the socket */
gid_t so_egid, so_rgid;
pid_t so_cpid; /* pid of process that opened socket */
};
/*
* Socket state bits.
*/
#define SS_NOFDREF 0x001 /* no file table ref any more */
#define SS_ISCONNECTED 0x002 /* socket connected to a peer */
#define SS_ISCONNECTING 0x004 /* in process of connecting to peer */
#define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */
#define SS_CANTSENDMORE 0x010 /* can't send more data to peer */
#define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */
#define SS_RCVATMARK 0x040 /* at mark on input */
#define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */
#define SS_PRIV 0x080 /* privileged for broadcast, raw... */
#define SS_CONNECTOUT 0x1000 /* connect, not accept, at this end */
#define SS_ISSENDING 0x2000 /* hint for lower layer */
#define SS_DNS 0x4000 /* created using SOCK_DNS socket(2) */
#define SS_NEWCONN_WAIT 0x8000 /* waiting sonewconn() relock */
#define SS_YP 0x10000 /* created using ypconnect(2) */
#ifdef _KERNEL
#include <lib/libkern/libkern.h>
void soassertlocked(struct socket *);
static inline void
soref(struct socket *so)
{
refcnt_take(&so->so_refcnt);
}
static inline void
sorele(struct socket *so)
{
refcnt_rele_wake(&so->so_refcnt);
}
/*
* Macros for sockets and socket buffering.
*/
#define isspliced(so) ((so)->so_sp && (so)->so_sp->ssp_socket)
#define issplicedback(so) ((so)->so_sp && (so)->so_sp->ssp_soback)
/*
* Do we need to notify the other side when I/O is possible?
*/
static inline int
sb_notify(struct socket *so, struct sockbuf *sb)
{
KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
soassertlocked(so);
return ((sb->sb_flags & (SB_WAIT|SB_ASYNC|SB_SPLICE)) != 0 ||
!klist_empty(&sb->sb_sel.si_note));
}
/*
* How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
* This is problematical if the fields are unsigned, as the space might
* still be negative (cc > hiwat or mbcnt > mbmax). Should detect
* overflow and return 0.
*/
static inline long
sbspace(struct socket *so, struct sockbuf *sb)
{
KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
soassertlocked(so);
return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
}
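/*
 * Worked example (illustrative): with sb_hiwat = 16384, sb_cc = 10000,
 * sb_mbmax = 32768 and sb_mbcnt = 30000, sbspace() returns
 * lmin(16384 - 10000, 32768 - 30000) = lmin(6384, 2768) = 2768, i.e. the
 * mbuf storage accounting, not the character count, is the limiting factor.
 */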
/* do we have to send all at once on a socket? */
#define sosendallatonce(so) \
((so)->so_proto->pr_flags & PR_ATOMIC)
/* are we sending on this socket? */
#define soissending(so) \
((so)->so_state & SS_ISSENDING)
/* can we read something from so? */
static inline int
soreadable(struct socket *so)
{
soassertlocked(so);
if (isspliced(so))
return 0;
return (so->so_state & SS_CANTRCVMORE) || so->so_qlen ||
    so->so_error || so->so_rcv.sb_cc >= so->so_rcv.sb_lowat;
}
/* can we write something to so? */
#define sowriteable(so) \
((sbspace((so), &(so)->so_snd) >= (so)->so_snd.sb_lowat && \
(((so)->so_state & SS_ISCONNECTED) || \
((so)->so_proto->pr_flags & PR_CONNREQUIRED)==0)) || \
((so)->so_state & SS_CANTSENDMORE) || (so)->so_error)
/* adjust counters in sb reflecting allocation of m */
#define sballoc(so, sb, m) do { \
(sb)->sb_cc += (m)->m_len; \
if ((m)->m_type != MT_CONTROL && (m)->m_type != MT_SONAME) \
(sb)->sb_datacc += (m)->m_len; \
(sb)->sb_mbcnt += MSIZE; \
if ((m)->m_flags & M_EXT) \
(sb)->sb_mbcnt += (m)->m_ext.ext_size; \
} while (/* CONSTCOND */ 0)
/* adjust counters in sb reflecting freeing of m */
#define sbfree(so, sb, m) do { \
(sb)->sb_cc -= (m)->m_len; \
if ((m)->m_type != MT_CONTROL && (m)->m_type != MT_SONAME) \
(sb)->sb_datacc -= (m)->m_len; \
(sb)->sb_mbcnt -= MSIZE; \
if ((m)->m_flags & M_EXT) \
(sb)->sb_mbcnt -= (m)->m_ext.ext_size; \
} while (/* CONSTCOND */ 0)
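/*
 * Worked example (illustrative): sballoc() for a 1460-byte data mbuf that
 * carries an external cluster with ext_size 2048 raises sb_cc and
 * sb_datacc by 1460 and sb_mbcnt by MSIZE + 2048; sbfree() on the same
 * mbuf undoes exactly those adjustments.
 */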
/*
* Set lock on sockbuf sb; sleep if lock is already held.
* Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
* Returns error without lock if sleep is interrupted.
*/
int sblock(struct socket *, struct sockbuf *, int);
/* release lock on sockbuf sb */
void sbunlock(struct socket *, struct sockbuf *);
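/*
 * Usage sketch (illustrative; the M_WAITOK flag below is the conventional
 * "sleep until available" value, callers may pass M_NOWAIT instead):
 *
 *	if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0)
 *		return (error);
 *	...walk or trim so->so_rcv.sb_mb...
 *	sbunlock(so, &so->so_rcv);
 */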
#define SB_EMPTY_FIXUP(sb) do { \
if ((sb)->sb_mb == NULL) { \
(sb)->sb_mbtail = NULL; \
(sb)->sb_lastrecord = NULL; \
} \
} while (/*CONSTCOND*/0)
extern u_long sb_max;
extern struct pool socket_pool;
struct mbuf;
struct sockaddr;
struct proc;
struct msghdr;
struct stat;
struct knote;
/*
* File operations on sockets.
*/
int soo_read(struct file *, struct uio *, int);
int soo_write(struct file *, struct uio *, int);
int soo_ioctl(struct file *, u_long, caddr_t, struct proc *);
int soo_kqfilter(struct file *, struct knote *);
int soo_close(struct file *, struct proc *);
int soo_stat(struct file *, struct stat *, struct proc *);
void sbappend(struct socket *, struct sockbuf *, struct mbuf *);
void sbappendstream(struct socket *, struct sockbuf *, struct mbuf *);
int sbappendaddr(struct socket *, struct sockbuf *,
const struct sockaddr *, struct mbuf *, struct mbuf *);
int sbappendcontrol(struct socket *, struct sockbuf *, struct mbuf *,
struct mbuf *);
void sbappendrecord(struct socket *, struct sockbuf *, struct mbuf *);
void sbcompress(struct socket *, struct sockbuf *, struct mbuf *,
struct mbuf *);
struct mbuf *
sbcreatecontrol(const void *, size_t, int, int);
void sbdrop(struct socket *, struct sockbuf *, int);
void sbdroprecord(struct socket *, struct sockbuf *);
void sbflush(struct socket *, struct sockbuf *);
void sbrelease(struct socket *, struct sockbuf *);
int sbcheckreserve(u_long, u_long);
int sbchecklowmem(void);
int sbreserve(struct socket *, struct sockbuf *, u_long);
int sbwait(struct socket *, struct sockbuf *);
void soinit(void);
void soabort(struct socket *);
int soaccept(struct socket *, struct mbuf *);
int sobind(struct socket *, struct mbuf *, struct proc *);
void socantrcvmore(struct socket *);
void socantsendmore(struct socket *);
int soclose(struct socket *, int);
int soconnect(struct socket *, struct mbuf *);
int soconnect2(struct socket *, struct socket *);
int socreate(int, struct socket **, int, int);
int sodisconnect(struct socket *);
struct socket *soalloc(int);
void sofree(struct socket *, int);
int sogetopt(struct socket *, int, int, struct mbuf *);
void sohasoutofband(struct socket *);
void soisconnected(struct socket *);
void soisconnecting(struct socket *);
void soisdisconnected(struct socket *);
void soisdisconnecting(struct socket *);
int solisten(struct socket *, int);
struct socket *sonewconn(struct socket *, int);
void soqinsque(struct socket *, struct socket *, int);
int soqremque(struct socket *, int);
int soreceive(struct socket *, struct mbuf **, struct uio *,
struct mbuf **, struct mbuf **, int *, socklen_t);
int soreserve(struct socket *, u_long, u_long);
int sosend(struct socket *, struct mbuf *, struct uio *,
struct mbuf *, struct mbuf *, int);
int sosetopt(struct socket *, int, int, struct mbuf *);
int soshutdown(struct socket *, int);
void sowakeup(struct socket *, struct sockbuf *);
void sorwakeup(struct socket *);
void sowwakeup(struct socket *);
int sockargs(struct mbuf **, const void *, size_t, int);
int sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
void solock(struct socket *);
void solock_shared(struct socket *);
int solock_persocket(struct socket *);
void solock_pair(struct socket *, struct socket *);
void sounlock(struct socket *);
void sounlock_shared(struct socket *);
int sendit(struct proc *, int, struct msghdr *, int, register_t *);
int recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);
int doaccept(struct proc *, int, struct sockaddr *, socklen_t *, int,
register_t *);
#ifdef SOCKBUF_DEBUG
void sblastrecordchk(struct sockbuf *, const char *);
#define SBLASTRECORDCHK(sb, where) sblastrecordchk((sb), (where))
void sblastmbufchk(struct sockbuf *, const char *);
#define SBLASTMBUFCHK(sb, where) sblastmbufchk((sb), (where))
void sbcheck(struct socket *, struct sockbuf *);
#define SBCHECK(so, sb) sbcheck((so), (sb))
#else
#define SBLASTRECORDCHK(sb, where) /* nothing */
#define SBLASTMBUFCHK(sb, where) /* nothing */
#define SBCHECK(so, sb) /* nothing */
#endif /* SOCKBUF_DEBUG */
#endif /* _KERNEL */
#endif /* _SYS_SOCKETVAR_H_ */
/* $OpenBSD: tty_subr.c,v 1.36 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: tty_subr.c,v 1.13 1996/02/09 19:00:43 christos Exp $ */
/*
* Copyright (c) 1993, 1994 Theo de Raadt
* All rights reserved.
*
* Per Lindqvist <pgd@compuram.bbt.se> supplied an almost fully working
* set of true clist functions that this is very loosely based on.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/malloc.h>
/*
* If TTY_QUOTE functionality isn't required by a line discipline,
* it can free c_cq and set it to NULL. This speeds things up,
* and also does not use any extra memory. This is useful for (say)
* a SLIP line discipline that wants a 32K ring buffer for data
* but doesn't need quoting.
*/
#define QMEM(n) ((((n)-1)/NBBY)+1)
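/*
 * Example (illustrative): QMEM(n) is the byte count for a one-bit-per-
 * character quote map.  With NBBY == 8, QMEM(1024) is ((1024-1)/8)+1 = 128
 * bytes and QMEM(1) is 1 byte.
 */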
void clrbits(u_char *, int, int);
/*
* Initialize a particular clist. Ok, they are really ring buffers,
* of the specified length, with/without quoting support.
*/
void
clalloc(struct clist *clp, int size, int quot)
{
clp->c_cs = malloc(size, M_TTYS, M_WAITOK|M_ZERO);
if (quot)
	clp->c_cq = malloc(QMEM(size), M_TTYS, M_WAITOK|M_ZERO);
else
clp->c_cq = NULL;
clp->c_cf = clp->c_cl = NULL;
clp->c_ce = clp->c_cs + size;
clp->c_cn = size;
clp->c_cc = 0;
}
void
clfree(struct clist *clp)
{
if (clp->c_cs) {
explicit_bzero(clp->c_cs, clp->c_cn);
free(clp->c_cs, M_TTYS, clp->c_cn);
}
if (clp->c_cq) {
explicit_bzero(clp->c_cq, QMEM(clp->c_cn));
free(clp->c_cq, M_TTYS, QMEM(clp->c_cn));
}
clp->c_cs = clp->c_cq = NULL;
}
/*
* Get a character from a clist.
*/
int
getc(struct clist *clp)
{
int c = -1;
int s;
s = spltty();
if (clp->c_cc == 0)
goto out;
c = *clp->c_cf & 0xff;
*clp->c_cf = 0;
if (clp->c_cq) {
	if (isset(clp->c_cq, clp->c_cf - clp->c_cs))
c |= TTY_QUOTE;
clrbit(clp->c_cq, clp->c_cf - clp->c_cs);
}
if (++clp->c_cf == clp->c_ce)
	clp->c_cf = clp->c_cs;
if (--clp->c_cc == 0)
	clp->c_cf = clp->c_cl = NULL;
out:
splx(s);
return c;
}
/*
* Copy clist to buffer.
* Return number of bytes moved.
*/
int
q_to_b(struct clist *clp, u_char *cp, int count)
{
int cc;
u_char *p = cp;
int s;
s = spltty();
/* optimize this while loop */
while (count > 0 && clp->c_cc > 0) {
cc = clp->c_cl - clp->c_cf;
if (clp->c_cf >= clp->c_cl)
	cc = clp->c_ce - clp->c_cf;
if (cc > count)
cc = count;
memcpy(p, clp->c_cf, cc);
memset(clp->c_cf, 0, cc);
if (clp->c_cq)
	clrbits(clp->c_cq, clp->c_cf - clp->c_cs, cc);
count -= cc;
p += cc;
clp->c_cc -= cc;
clp->c_cf += cc;
if (clp->c_cf == clp->c_ce)
	clp->c_cf = clp->c_cs;
}
if (clp->c_cc == 0)
	clp->c_cf = clp->c_cl = NULL;
splx(s);
return p - cp;
}
/*
* Return count of contiguous characters in clist.
* Stop counting if flag&character is non-null.
*/
int
ndqb(struct clist *clp, int flag)
{
int count = 0;
int i;
int cc;
int s;
s = spltty();
if ((cc = clp->c_cc) == 0)
goto out;
if (flag == 0) {
count = clp->c_cl - clp->c_cf;
if (count <= 0)
count = clp->c_ce - clp->c_cf;
goto out;
}
i = clp->c_cf - clp->c_cs;
if (flag & TTY_QUOTE) {
while (cc-- > 0 && !(clp->c_cs[i++] & (flag & ~TTY_QUOTE) ||
isset(clp->c_cq, i))) {
count++;
if (i == clp->c_cn)
break;
}
} else {
while (cc-- > 0 && !(clp->c_cs[i++] & flag)) {
count++;
if (i == clp->c_cn)
break;
}
}
out:
splx(s);
return count;
}
/*
* Flush count bytes from clist.
*/
void
ndflush(struct clist *clp, int count)
{
int cc;
int s;
s = spltty();
if (count == clp->c_cc) {
clp->c_cc = 0;
clp->c_cf = clp->c_cl = NULL;
goto out;
}
/* optimize this while loop */
while (count > 0 && clp->c_cc > 0) {
cc = clp->c_cl - clp->c_cf;
if (clp->c_cf >= clp->c_cl)
cc = clp->c_ce - clp->c_cf;
if (cc > count)
cc = count;
count -= cc;
clp->c_cc -= cc;
clp->c_cf += cc;
if (clp->c_cf == clp->c_ce)
	clp->c_cf = clp->c_cs;
}
if (clp->c_cc == 0)
	clp->c_cf = clp->c_cl = NULL;
out:
splx(s);
}
/*
* Put a character into the output queue.
*/
int
putc(int c, struct clist *clp)
{
int i;
int s;
s = spltty();
if (clp->c_cc == clp->c_cn) {
splx(s);
return -1;
}
if (clp->c_cc == 0) {
if (!clp->c_cs)
panic("%s: tty has no clist", __func__); clp->c_cf = clp->c_cl = clp->c_cs;
}
*clp->c_cl = c & 0xff;
i = clp->c_cl - clp->c_cs;
if (clp->c_cq) {
	if (c & TTY_QUOTE)
setbit(clp->c_cq, i);
else
clrbit(clp->c_cq, i);
}
clp->c_cc++;
clp->c_cl++;
if (clp->c_cl == clp->c_ce)
	clp->c_cl = clp->c_cs;
splx(s);
return 0;
}
/*
* optimized version of
*
* for (i = 0; i < len; i++)
* clrbit(cp, off + i);
*/
void
clrbits(u_char *cp, int off, int len)
{
int sby, sbi, eby, ebi;
int i;
u_char mask;
if (len==1) {
clrbit(cp, off);
return;
}
sby = off / NBBY;
sbi = off % NBBY;
eby = (off+len) / NBBY;
ebi = (off+len) % NBBY;
if (sby == eby) {
mask = ((1 << (ebi - sbi)) - 1) << sbi;
cp[sby] &= ~mask;
} else {
mask = (1<<sbi) - 1;
cp[sby++] &= mask;
for (i = sby; i < eby; i++)
cp[i] = 0x00;
mask = (1<<ebi) - 1;
if (mask) /* if no mask, eby may be 1 too far */
cp[eby] &= ~mask;
}
}
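/*
 * Worked example (illustrative): clrbits(cp, 5, 7) must clear bit
 * positions 5..11.  With NBBY == 8 that gives sby = 0, sbi = 5, eby = 1,
 * ebi = 4, so cp[0] &= 0x1f clears bits 5..7 and cp[1] &= ~0x0f clears
 * bits 8..11.
 */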
/*
* Copy buffer to clist.
* Return number of bytes not transferred.
*/
int
b_to_q(u_char *cp, int count, struct clist *clp)
{
int cc;
u_char *p = cp;
int s;
if (count <= 0)
return 0;
s = spltty();
if (clp->c_cc == clp->c_cn)
goto out;
if (clp->c_cc == 0) {
if (!clp->c_cs)
panic("%s: tty has no clist", __func__); clp->c_cf = clp->c_cl = clp->c_cs;
}
/* optimize this while loop */
while (count > 0 && clp->c_cc < clp->c_cn) {
cc = clp->c_ce - clp->c_cl;
if (clp->c_cf > clp->c_cl)
cc = clp->c_cf - clp->c_cl;
if (cc > count)
cc = count;
memcpy(clp->c_cl, p, cc);
if (clp->c_cq)
	clrbits(clp->c_cq, clp->c_cl - clp->c_cs, cc);
p += cc;
count -= cc;
clp->c_cc += cc;
clp->c_cl += cc;
if (clp->c_cl == clp->c_ce)
	clp->c_cl = clp->c_cs;
}
out:
splx(s);
return count;
}
/*
* Given a non-NULL pointer into the clist return the pointer
* to the next character in the list or return NULL if no more chars.
*
* Callers must not allow getc's to happen between firstc's and nextc's,
* or the pointer may become invalid. Note that interrupts are NOT
* masked.
*/
u_char *
nextc(struct clist *clp, u_char *cp, int *c, int *ccp)
{
if (clp->c_cf == cp) {
/*
* First time initialization.
*/
*ccp = clp->c_cc;
}
if (*ccp == 0 || cp == NULL)
return NULL;
if (--(*ccp) == 0)
return NULL;
if (++cp == clp->c_ce)
	cp = clp->c_cs;
*c = *cp & 0xff;
if (clp->c_cq) {
	if (isset(clp->c_cq, cp - clp->c_cs))
		*c |= TTY_QUOTE;
}
return cp;
}
/*
* Given a non-NULL pointer into the clist return the pointer
* to the first character in the list or return NULL if no more chars.
*
* Callers must not allow getc's to happen between firstc's and nextc's,
* or the pointer may become invalid. Note that interrupts are NOT
* masked.
*
* *c is set to the NEXT character
*/
u_char *
firstc(struct clist *clp, int *c, int *ccp)
{
u_char *cp;
*ccp = clp->c_cc;
if (*ccp == 0)
return NULL;
cp = clp->c_cf;
*c = *cp & 0xff;
if (clp->c_cq) {
	if (isset(clp->c_cq, cp - clp->c_cs))
		*c |= TTY_QUOTE;
}
return clp->c_cf;
}
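/*
 * Usage sketch (illustrative): scanning a clist without consuming it pairs
 * firstc() with nextc(), e.g.
 *
 *	int c, cc;
 *	u_char *cp;
 *
 *	for (cp = firstc(clp, &c, &cc); cp != NULL;
 *	    cp = nextc(clp, cp, &c, &cc))
 *		...examine c...
 */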
/*
* Remove the last character in the clist and return it.
*/
int
unputc(struct clist *clp)
{
unsigned int c = -1;
int s;
s = spltty();
if (clp->c_cc == 0)
goto out;
if (clp->c_cl == clp->c_cs)
	clp->c_cl = clp->c_ce - 1;
else
--clp->c_cl;
clp->c_cc--;
c = *clp->c_cl & 0xff;
*clp->c_cl = 0;
if (clp->c_cq) {
	if (isset(clp->c_cq, clp->c_cl - clp->c_cs))
c |= TTY_QUOTE;
clrbit(clp->c_cq, clp->c_cl - clp->c_cs);
}
if (clp->c_cc == 0)
	clp->c_cf = clp->c_cl = NULL;
out:
splx(s);
return c;
}
/*
* Put the chars in the from queue on the end of the to queue.
*/
void
catq(struct clist *from, struct clist *to)
{
int c;
int s;
s = spltty();
if (from->c_cc == 0) { /* nothing to move */
splx(s);
return;
}
/*
* if `to' queue is empty and the queues are the same max size,
* it is more efficient to just swap the clist structures.
*/
if (to->c_cc == 0 && from->c_cn == to->c_cn) {
	struct clist tmp;
tmp = *from;
*from = *to;
*to = tmp;
splx(s);
return;
}
splx(s);
while ((c = getc(from)) != -1)
putc(c, to);
}
/* $OpenBSD: in.c,v 1.176 2022/08/29 07:51:45 bluhm Exp $ */
/* $NetBSD: in.c,v 1.26 1996/02/13 23:41:39 christos Exp $ */
/*
* Copyright (C) 2001 WIDE Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.2 (Berkeley) 11/15/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/igmp_var.h>
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif
#include "ether.h"
void in_socktrim(struct sockaddr_in *);
int in_ioctl_set_ifaddr(u_long, caddr_t, struct ifnet *, int);
int in_ioctl_change_ifaddr(u_long, caddr_t, struct ifnet *, int);
int in_ioctl_get(u_long, caddr_t, struct ifnet *);
void in_purgeaddr(struct ifaddr *);
int in_addhost(struct in_ifaddr *, struct sockaddr_in *);
int in_scrubhost(struct in_ifaddr *, struct sockaddr_in *);
int in_insert_prefix(struct in_ifaddr *);
void in_remove_prefix(struct in_ifaddr *);
/*
* Determine whether an IP address is in a reserved set of addresses
* that may not be forwarded, or whether datagrams to that destination
* may be forwarded.
*/
int
in_canforward(struct in_addr in)
{
u_int32_t net;
if (IN_MULTICAST(in.s_addr))
return (0);
if (IN_CLASSA(in.s_addr)) {
net = in.s_addr & IN_CLASSA_NET;
if (net == 0 ||
net == htonl(IN_LOOPBACKNET << IN_CLASSA_NSHIFT))
return (0);
}
return (1);
}
/*
* Trim a mask in a sockaddr
*/
void
in_socktrim(struct sockaddr_in *ap)
{
char *cplim = (char *) &ap->sin_addr;
char *cp = (char *) (&ap->sin_addr + 1);
ap->sin_len = 0;
while (--cp >= cplim)
if (*cp) {
(ap)->sin_len = cp - (char *) (ap) + 1;
break;
}
}
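/*
 * Illustrative note (added comment, not from the original source): for a
 * 255.255.255.0 mask the last non-zero byte of sin_addr is its third
 * byte, so in_socktrim() leaves sin_len at offsetof(struct sockaddr_in,
 * sin_addr) + 3, i.e. just enough bytes to cover the significant part of
 * the mask.
 */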
int
in_mask2len(struct in_addr *mask)
{
int x, y;
u_char *p;
p = (u_char *)mask;
for (x = 0; x < sizeof(*mask); x++) {
if (p[x] != 0xff)
break;
}
y = 0;
if (x < sizeof(*mask)) {
for (y = 0; y < 8; y++) {
if ((p[x] & (0x80 >> y)) == 0)
break;
}
}
return x * 8 + y;
}
void
in_len2mask(struct in_addr *mask, int len)
{
int i;
u_char *p;
p = (u_char *)mask;
bzero(mask, sizeof(*mask));
for (i = 0; i < len / 8; i++)
p[i] = 0xff;
if (len % 8)
p[i] = (0xff00 >> (len % 8)) & 0xff;
}
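#if 0
/*
 * Hypothetical usage sketch (not part of the original file): the two
 * helpers above convert between a dotted-quad netmask and a prefix
 * length and are inverses of each other for contiguous masks.
 */
void
example_mask_conversion(void)
{
	struct in_addr m;

	in_len2mask(&m, 24);		/* m.s_addr == htonl(0xffffff00) */
	KASSERT(in_mask2len(&m) == 24);
}
#endif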
int
in_nam2sin(const struct mbuf *nam, struct sockaddr_in **sin)
{
struct sockaddr *sa = mtod(nam, struct sockaddr *);
if (nam->m_len < offsetof(struct sockaddr, sa_data))
return EINVAL;
if (sa->sa_family != AF_INET)
return EAFNOSUPPORT;
if (sa->sa_len != nam->m_len)
return EINVAL;
if (sa->sa_len != sizeof(struct sockaddr_in))
return EINVAL;
*sin = satosin(sa);
return 0;
}
int
in_sa2sin(struct sockaddr *sa, struct sockaddr_in **sin)
{
if (sa->sa_family != AF_INET)
return EAFNOSUPPORT;
if (sa->sa_len != sizeof(struct sockaddr_in))
return EINVAL;
*sin = satosin(sa);
return 0;
}
int
in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp)
{
int privileged;
int error;
privileged = 0;
if ((so->so_state & SS_PRIV) != 0)
privileged++;
switch (cmd) {
#ifdef MROUTING
case SIOCGETVIFCNT:
case SIOCGETSGCNT:
error = mrt_ioctl(so, cmd, data);
break;
#endif /* MROUTING */
default:
error = in_ioctl(cmd, data, ifp, privileged);
break;
}
return error;
}
int
in_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, int privileged)
{
struct ifreq *ifr = (struct ifreq *)data;
struct ifaddr *ifa;
struct in_ifaddr *ia = NULL;
struct sockaddr_in *sin = NULL, oldaddr;
int error = 0;
if (ifp == NULL)
return (ENXIO);
switch (cmd) {
case SIOCGIFADDR:
case SIOCGIFNETMASK:
case SIOCGIFDSTADDR:
case SIOCGIFBRDADDR:
return in_ioctl_get(cmd, data, ifp);
case SIOCSIFADDR:
return in_ioctl_set_ifaddr(cmd, data, ifp, privileged);
case SIOCAIFADDR:
case SIOCDIFADDR:
return in_ioctl_change_ifaddr(cmd, data, ifp, privileged);
case SIOCSIFNETMASK:
case SIOCSIFDSTADDR:
case SIOCSIFBRDADDR:
break;
default:
return (EOPNOTSUPP);
}
if (ifr->ifr_addr.sa_family == AF_INET) {
error = in_sa2sin(&ifr->ifr_addr, &sin);
if (error)
return (error);
}
NET_LOCK();
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
/* find first address or exact match */
if (ia == NULL)
ia = ifatoia(ifa);
if (sin == NULL || sin->sin_addr.s_addr == INADDR_ANY)
break;
if (ifatoia(ifa)->ia_addr.sin_addr.s_addr ==
sin->sin_addr.s_addr) {
ia = ifatoia(ifa);
break;
}
}
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto err;
}
switch (cmd) {
case SIOCSIFDSTADDR:
if (!privileged) {
error = EPERM;
break;
}
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
break;
}
error = in_sa2sin(&ifr->ifr_dstaddr, &sin);
if (error)
break;
oldaddr = ia->ia_dstaddr;
ia->ia_dstaddr = *sin;
error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR, (caddr_t)ia);
if (error) {
ia->ia_dstaddr = oldaddr;
break;
}
in_scrubhost(ia, &oldaddr);
in_addhost(ia, &ia->ia_dstaddr);
break;
case SIOCSIFBRDADDR:
if (!privileged) {
error = EPERM;
break;
}
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EINVAL;
break;
}
error = in_sa2sin(&ifr->ifr_broadaddr, &sin);
if (error)
break;
ifa_update_broadaddr(ifp, &ia->ia_ifa, sintosa(sin));
break;
case SIOCSIFNETMASK:
if (!privileged) {
error = EPERM;
break;
}
if (ifr->ifr_addr.sa_len < 8) {
error = EINVAL;
break;
}
/* do not check inet family or strict len */
sin = satosin(&ifr->ifr_addr);
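/*
 * Explanatory note (added comment): a valid netmask has all of its one
 * bits contiguous from the most significant end, e.g. 0xffffff00 for a
 * /24.  The test below ANDs the mask with its complement shifted right
 * by one bit, which is non-zero exactly when a one bit sits directly
 * below a zero bit, e.g. 0xff00ff00 is rejected.
 */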
if (ntohl(sin->sin_addr.s_addr) &
(~ntohl(sin->sin_addr.s_addr) >> 1)) {
/* non-contiguous netmask */
error = EINVAL;
break;
}
ia->ia_netmask = ia->ia_sockmask.sin_addr.s_addr =
sin->sin_addr.s_addr;
break;
}
err:
NET_UNLOCK();
return (error);
}
int
in_ioctl_set_ifaddr(u_long cmd, caddr_t data, struct ifnet *ifp,
int privileged)
{
struct ifreq *ifr = (struct ifreq *)data;
struct ifaddr *ifa;
struct in_ifaddr *ia = NULL;
struct sockaddr_in *sin;
int error = 0;
int newifaddr;
if (cmd != SIOCSIFADDR)
panic("%s: invalid ioctl %lu", __func__, cmd); if (!privileged)
return (EPERM);
error = in_sa2sin(&ifr->ifr_addr, &sin);
if (error)
return (error);
NET_LOCK();
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
/* find first address */
ia = ifatoia(ifa);
break;
}
if (ia == NULL) {
ia = malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO);
refcnt_init_trace(&ia->ia_ifa.ifa_refcnt, DT_REFCNT_IDX_IFADDR);
ia->ia_addr.sin_family = AF_INET;
ia->ia_addr.sin_len = sizeof(ia->ia_addr);
ia->ia_ifa.ifa_addr = sintosa(&ia->ia_addr);
ia->ia_ifa.ifa_dstaddr = sintosa(&ia->ia_dstaddr);
ia->ia_ifa.ifa_netmask = sintosa(&ia->ia_sockmask);
ia->ia_sockmask.sin_len = 8;
if (ifp->if_flags & IFF_BROADCAST) {
ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr);
ia->ia_broadaddr.sin_family = AF_INET;
}
ia->ia_ifp = ifp;
newifaddr = 1;
} else
newifaddr = 0;
in_ifscrub(ifp, ia);
error = in_ifinit(ifp, ia, sin, newifaddr);
if (!error)
if_addrhooks_run(ifp);
NET_UNLOCK();
return error;
}
int
in_ioctl_change_ifaddr(u_long cmd, caddr_t data, struct ifnet *ifp,
int privileged)
{
struct ifaddr *ifa;
struct in_ifaddr *ia = NULL;
struct in_aliasreq *ifra = (struct in_aliasreq *)data;
struct sockaddr_in *sin = NULL, *dstsin = NULL, *broadsin = NULL;
struct sockaddr_in *masksin = NULL;
int error = 0;
int newifaddr;
if (ifra->ifra_addr.sin_family == AF_INET) {
error = in_sa2sin(sintosa(&ifra->ifra_addr), &sin);
if (error)
return (error);
}
NET_LOCK();
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
/* find first address, if no exact match wanted */
if (sin == NULL || sin->sin_addr.s_addr ==
ifatoia(ifa)->ia_addr.sin_addr.s_addr) {
ia = ifatoia(ifa);
break;
}
}
switch (cmd) {
case SIOCAIFADDR: {
int needinit = 0;
if (!privileged) {
error = EPERM;
break;
}
if (ifra->ifra_mask.sin_len) {
if (ifra->ifra_mask.sin_len < 8) {
error = EINVAL;
break;
}
/* do not check inet family or strict len */
masksin = &ifra->ifra_mask;
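/* Reject non-contiguous netmasks, using the same bit test as in in_ioctl(). */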
if (ntohl(masksin->sin_addr.s_addr) &
(~ntohl(masksin->sin_addr.s_addr) >> 1)) {
/* non-contiguous netmask */
error = EINVAL;
break;
}
}
if ((ifp->if_flags & IFF_POINTOPOINT) &&
ifra->ifra_dstaddr.sin_family == AF_INET) {
error = in_sa2sin(sintosa(&ifra->ifra_dstaddr),
&dstsin);
if (error)
break;
}
if ((ifp->if_flags & IFF_BROADCAST) &&
ifra->ifra_broadaddr.sin_family == AF_INET) {
error = in_sa2sin(sintosa(&ifra->ifra_broadaddr),
&broadsin);
if (error)
break;
}
if (ia == NULL) {
ia = malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO);
refcnt_init_trace(&ia->ia_ifa.ifa_refcnt,
DT_REFCNT_IDX_IFADDR);
ia->ia_addr.sin_family = AF_INET;
ia->ia_addr.sin_len = sizeof(ia->ia_addr);
ia->ia_ifa.ifa_addr = sintosa(&ia->ia_addr);
ia->ia_ifa.ifa_dstaddr = sintosa(&ia->ia_dstaddr);
ia->ia_ifa.ifa_netmask = sintosa(&ia->ia_sockmask);
ia->ia_sockmask.sin_len = 8;
if (ifp->if_flags & IFF_BROADCAST) {
ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr);
ia->ia_broadaddr.sin_family = AF_INET;
}
ia->ia_ifp = ifp;
newifaddr = 1;
} else
newifaddr = 0;
if (sin == NULL) {
sin = &ia->ia_addr;
} else if (newifaddr ||
sin->sin_addr.s_addr != ia->ia_addr.sin_addr.s_addr) {
needinit = 1;
}
if (masksin != NULL) {
in_ifscrub(ifp, ia);
ia->ia_netmask = ia->ia_sockmask.sin_addr.s_addr =
masksin->sin_addr.s_addr;
needinit = 1;
}
if (dstsin != NULL) {
in_ifscrub(ifp, ia);
ia->ia_dstaddr = *dstsin;
needinit = 1;
}
if (broadsin != NULL) {
if (newifaddr)
ia->ia_broadaddr = *broadsin;
else
ifa_update_broadaddr(ifp, &ia->ia_ifa,
sintosa(broadsin));
}
if (needinit) {
error = in_ifinit(ifp, ia, sin, newifaddr);
if (error)
break;
}
if_addrhooks_run(ifp);
break;
}
case SIOCDIFADDR:
if (!privileged) {
error = EPERM;
break;
}
if (ia == NULL) {
error = EADDRNOTAVAIL;
break;
}
/*
* Even if the individual steps were safe, shouldn't
* these kinds of changes happen atomically? What
* should happen to a packet that was routed after
* the scrub but before the other steps?
*/
in_purgeaddr(&ia->ia_ifa);
if_addrhooks_run(ifp);
break;
default:
panic("%s: invalid ioctl %lu", __func__, cmd);
}
NET_UNLOCK();
return (error);
}
int
in_ioctl_get(u_long cmd, caddr_t data, struct ifnet *ifp)
{
struct ifreq *ifr = (struct ifreq *)data;
struct ifaddr *ifa;
struct in_ifaddr *ia = NULL;
struct sockaddr *sa;
struct sockaddr_in *sin = NULL;
int error = 0;
sa = &ifr->ifr_addr;
if (sa->sa_family == AF_INET) {
sa->sa_len = sizeof(struct sockaddr_in);
error = in_sa2sin(sa, &sin);
if (error)
return (error);
}
NET_LOCK_SHARED();
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
/* find first address or exact match */
if (ia == NULL)
ia = ifatoia(ifa);
if (sin == NULL || sin->sin_addr.s_addr == INADDR_ANY)
break;
if (ifatoia(ifa)->ia_addr.sin_addr.s_addr ==
sin->sin_addr.s_addr) {
ia = ifatoia(ifa);
break;
}
}
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto err;
}
switch(cmd) {
case SIOCGIFADDR:
*satosin(&ifr->ifr_addr) = ia->ia_addr;
break;
case SIOCGIFBRDADDR:
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EINVAL;
break;
}
*satosin(&ifr->ifr_dstaddr) = ia->ia_broadaddr;
break;
case SIOCGIFDSTADDR:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
break;
}
*satosin(&ifr->ifr_dstaddr) = ia->ia_dstaddr;
break;
case SIOCGIFNETMASK:
*satosin(&ifr->ifr_addr) = ia->ia_sockmask;
break;
default:
panic("%s: invalid ioctl %lu", __func__, cmd);
}
err:
NET_UNLOCK_SHARED();
return (error);
}
/*
* Delete any existing route for an interface.
*/
void
in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia)
{
if (ISSET(ifp->if_flags, IFF_POINTOPOINT))
in_scrubhost(ia, &ia->ia_dstaddr);
else if (!ISSET(ifp->if_flags, IFF_LOOPBACK))
in_remove_prefix(ia);
}
/*
* Initialize an interface's internet address
* and routing table entry.
*/
int
in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
int newaddr)
{
u_int32_t i = sin->sin_addr.s_addr;
struct sockaddr_in oldaddr;
int error = 0, rterror;
NET_ASSERT_LOCKED();
/*
* Always remove the address from the tree to make sure its
* position gets updated in case the key changes.
*/
if (!newaddr) {
rt_ifa_dellocal(&ia->ia_ifa);
ifa_del(ifp, &ia->ia_ifa);
}
oldaddr = ia->ia_addr;
ia->ia_addr = *sin;
if (ia->ia_netmask == 0) {
if (IN_CLASSA(i))
ia->ia_netmask = IN_CLASSA_NET;
else if (IN_CLASSB(i))
ia->ia_netmask = IN_CLASSB_NET;
else
ia->ia_netmask = IN_CLASSC_NET;
ia->ia_sockmask.sin_addr.s_addr = ia->ia_netmask;
}
/*
* Give the interface a chance to initialize
* if this is its first address,
* and to validate the address if necessary.
*/
if ((error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) {
ia->ia_addr = oldaddr;
}
/*
* Add the address to the local list and the global tree. If an
* error occurred, put back the original address.
*/
ifa_add(ifp, &ia->ia_ifa);
rterror = rt_ifa_addlocal(&ia->ia_ifa);
if (rterror) {
if (!newaddr)
ifa_del(ifp, &ia->ia_ifa);
if (!error)
error = rterror;
goto out;
}
if (error)
goto out;
ia->ia_net = i & ia->ia_netmask;
in_socktrim(&ia->ia_sockmask);
/*
* Add route for the network.
*/
ia->ia_ifa.ifa_metric = ifp->if_metric;
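/*
 * Illustrative note (added comment): for a regular broadcast interface
 * the directed broadcast address is the network number with all host
 * bits set, e.g. 192.0.2.10/24 yields 192.0.2.255.  RFC 3021 /31
 * point-to-point style subnets have no broadcast address, so it is
 * left as 0 below.
 */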
if (ISSET(ifp->if_flags, IFF_BROADCAST)) {
if (IN_RFC3021_SUBNET(ia->ia_netmask))
ia->ia_broadaddr.sin_addr.s_addr = 0;
else {
ia->ia_broadaddr.sin_addr.s_addr =
ia->ia_net | ~ia->ia_netmask;
}
}
if (ISSET(ifp->if_flags, IFF_POINTOPOINT)) {
/* XXX We should not even call in_ifinit() in this case. */
if (ia->ia_dstaddr.sin_family != AF_INET)
goto out;
error = in_addhost(ia, &ia->ia_dstaddr);
} else if (!ISSET(ifp->if_flags, IFF_LOOPBACK)) {
error = in_insert_prefix(ia);
}
/*
* If the interface supports multicast, join the "all hosts"
* multicast group on that interface.
*/
if ((ifp->if_flags & IFF_MULTICAST) && ia->ia_allhosts == NULL) {
struct in_addr addr;
addr.s_addr = INADDR_ALLHOSTS_GROUP;
ia->ia_allhosts = in_addmulti(&addr, ifp);
}
out:
if (error && newaddr)
in_purgeaddr(&ia->ia_ifa);
return (error);
}
void
in_purgeaddr(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct in_ifaddr *ia = ifatoia(ifa);
NET_ASSERT_LOCKED();
in_ifscrub(ifp, ia);
rt_ifa_dellocal(&ia->ia_ifa);
rt_ifa_purge(&ia->ia_ifa);
ifa_del(ifp, &ia->ia_ifa);
if (ia->ia_allhosts != NULL) {
in_delmulti(ia->ia_allhosts);
ia->ia_allhosts = NULL;
}
ia->ia_ifp = NULL;
ifafree(&ia->ia_ifa);
}
int
in_addhost(struct in_ifaddr *ia, struct sockaddr_in *dst)
{
return rt_ifa_add(&ia->ia_ifa, RTF_HOST | RTF_MPATH,
sintosa(dst), ia->ia_ifa.ifa_ifp->if_rdomain);
}
int
in_scrubhost(struct in_ifaddr *ia, struct sockaddr_in *dst)
{
return rt_ifa_del(&ia->ia_ifa, RTF_HOST,
sintosa(dst), ia->ia_ifa.ifa_ifp->if_rdomain);
}
/*
* Insert the cloning and broadcast routes for this subnet.
*/
int
in_insert_prefix(struct in_ifaddr *ia)
{
struct ifaddr *ifa = &ia->ia_ifa;
int error;
error = rt_ifa_add(ifa, RTF_CLONING | RTF_CONNECTED | RTF_MPATH,
ifa->ifa_addr, ifa->ifa_ifp->if_rdomain);
if (error)
return (error);
if (ia->ia_broadaddr.sin_addr.s_addr != 0) {
error = rt_ifa_add(ifa, RTF_HOST | RTF_BROADCAST | RTF_MPATH,
ifa->ifa_broadaddr, ifa->ifa_ifp->if_rdomain);
}
return (error);
}
void
in_remove_prefix(struct in_ifaddr *ia)
{
struct ifaddr *ifa = &ia->ia_ifa;
rt_ifa_del(ifa, RTF_CLONING | RTF_CONNECTED,
ifa->ifa_addr, ifa->ifa_ifp->if_rdomain);
if (ia->ia_broadaddr.sin_addr.s_addr != 0) {
rt_ifa_del(ifa, RTF_HOST | RTF_BROADCAST,
ifa->ifa_broadaddr, ifa->ifa_ifp->if_rdomain);
}
}
/*
* Return 1 if the address is a local broadcast address.
*/
int
in_broadcast(struct in_addr in, u_int rtableid)
{
struct ifnet *ifn;
struct ifaddr *ifa;
u_int rdomain;
rdomain = rtable_l2(rtableid);
#define ia (ifatoia(ifa))
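/* The "ia" macro above aliases each list entry as its containing in_ifaddr. */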
TAILQ_FOREACH(ifn, &ifnet, if_list) {
if (ifn->if_rdomain != rdomain)
continue;
if ((ifn->if_flags & IFF_BROADCAST) == 0)
continue;
TAILQ_FOREACH(ifa, &ifn->if_addrlist, ifa_list)
if (ifa->ifa_addr->sa_family == AF_INET &&
in.s_addr != ia->ia_addr.sin_addr.s_addr &&
in.s_addr == ia->ia_broadaddr.sin_addr.s_addr)
return 1;
}
return (0);
#undef ia
}
/*
* Add an address to the list of IP multicast addresses for a given interface.
*/
struct in_multi *
in_addmulti(struct in_addr *ap, struct ifnet *ifp)
{
struct in_multi *inm;
struct ifreq ifr;
/*
* See if address already in list.
*/
IN_LOOKUP_MULTI(*ap, ifp, inm);
if (inm != NULL) {
/*
* Found it; just increment the reference count.
*/
++inm->inm_refcnt;
} else {
/*
* New address; allocate a new multicast record
* and link it into the interface's multicast list.
*/
inm = malloc(sizeof(*inm), M_IPMADDR, M_WAITOK | M_ZERO);
inm->inm_sin.sin_len = sizeof(struct sockaddr_in);
inm->inm_sin.sin_family = AF_INET;
inm->inm_sin.sin_addr = *ap;
inm->inm_refcnt = 1;
inm->inm_ifidx = ifp->if_index;
inm->inm_ifma.ifma_addr = sintosa(&inm->inm_sin);
/*
* Ask the network driver to update its multicast reception
* filter appropriately for the new address.
*/
memset(&ifr, 0, sizeof(ifr));
memcpy(&ifr.ifr_addr, &inm->inm_sin, sizeof(inm->inm_sin));
if ((*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr) != 0) {
free(inm, M_IPMADDR, sizeof(*inm));
return (NULL);
}
TAILQ_INSERT_HEAD(&ifp->if_maddrlist, &inm->inm_ifma,
ifma_list);
/*
* Let IGMP know that we have joined a new IP multicast group.
*/
igmp_joingroup(inm, ifp);
}
return (inm);
}
/*
* Delete a multicast address record.
*/
void
in_delmulti(struct in_multi *inm)
{
struct ifreq ifr;
struct ifnet *ifp;
NET_ASSERT_LOCKED();
if (--inm->inm_refcnt != 0)
return;
ifp = if_get(inm->inm_ifidx);
if (ifp != NULL) {
/*
* No remaining claims to this record; let IGMP know that
* we are leaving the multicast group.
*/
igmp_leavegroup(inm, ifp);
/*
* Notify the network driver to update its multicast
* reception filter.
*/
memset(&ifr, 0, sizeof(ifr));
satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
satosin(&ifr.ifr_addr)->sin_family = AF_INET;
satosin(&ifr.ifr_addr)->sin_addr = inm->inm_addr;
KERNEL_LOCK();
(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
KERNEL_UNLOCK();
TAILQ_REMOVE(&ifp->if_maddrlist, &inm->inm_ifma, ifma_list);
}
if_put(ifp);
free(inm, M_IPMADDR, sizeof(*inm));
}
/*
* Return 1 if the multicast group represented by ``ap'' has been
* joined by interface ``ifp'', 0 otherwise.
*/
int
in_hasmulti(struct in_addr *ap, struct ifnet *ifp)
{
struct in_multi *inm;
int joined;
IN_LOOKUP_MULTI(*ap, ifp, inm);
joined = (inm != NULL);
return (joined);
}
void
in_ifdetach(struct ifnet *ifp)
{
struct ifaddr *ifa, *next;
/* nuke any of IPv4 addresses we have */
TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrlist, ifa_list, next) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
in_purgeaddr(ifa);
if_addrhooks_run(ifp);
}
if (ifp->if_xflags & IFXF_AUTOCONF4)
ifp->if_xflags &= ~IFXF_AUTOCONF4;
}
void
in_prefixlen2mask(struct in_addr *maskp, int plen)
{
if (plen == 0)
maskp->s_addr = 0;
else
maskp->s_addr = htonl(0xffffffff << (32 - plen));
}
/* $OpenBSD: vfs_bio.c,v 1.210 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */
/*
* Copyright (c) 1994 Christopher G. Demetriou
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
*/
/*
* Some references:
* Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
* Leffler, et al.: The Design and Implementation of the 4.3BSD
* UNIX Operating System (Addison-Wesley, 1989)
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/specdev.h>
#include <sys/tracepoint.h>
#include <uvm/uvm_extern.h>
/* XXX Should really be in buf.h, but for uvm_constraint_range.. */
int buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);
struct uvm_constraint_range high_constraint;
int fliphigh;
int nobuffers;
int needbuffer;
struct bio_ops bioops;
/* private bufcache functions */
void bufcache_init(void);
void bufcache_adjust(void);
struct buf *bufcache_gethighcleanbuf(void);
struct buf *bufcache_getdmacleanbuf(void);
/*
* Buffer pool for I/O buffers.
*/
struct pool bufpool;
struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
void buf_put(struct buf *);
struct buf *bio_doread(struct vnode *, daddr_t, int, int);
struct buf *buf_get(struct vnode *, daddr_t, size_t);
void bread_cluster_callback(struct buf *);
int64_t bufcache_recover_dmapages(int discard, int64_t howmany);
struct bcachestats bcstats; /* counters */
long lodirtypages; /* dirty page count low water mark */
long hidirtypages; /* dirty page count high water mark */
long targetpages; /* target number of pages for cache size */
long buflowpages; /* smallest size cache allowed */
long bufhighpages; /* largest size cache allowed */
long bufbackpages; /* minimum number of pages we shrink when asked to */
vsize_t bufkvm;
struct proc *cleanerproc;
int bd_req; /* Sleep point for cleaner daemon. */
#define NUM_CACHES 2
#define DMA_CACHE 0
struct bufcache cleancache[NUM_CACHES];
struct bufqueue dirtyqueue;
void
buf_put(struct buf *bp)
{
splassert(IPL_BIO);
#ifdef DIAGNOSTIC
if (bp->b_pobj != NULL)
KASSERT(bp->b_bufsize > 0);
if (ISSET(bp->b_flags, B_DELWRI))
panic("buf_put: releasing dirty buffer"); if (bp->b_freelist.tqe_next != NOLIST &&
bp->b_freelist.tqe_next != (void *)-1)
panic("buf_put: still on the free list"); if (bp->b_vnbufs.le_next != NOLIST &&
bp->b_vnbufs.le_next != (void *)-1)
panic("buf_put: still on the vnode list");
if (!LIST_EMPTY(&bp->b_dep))
panic("buf_put: b_dep is not empty");
#endif
LIST_REMOVE(bp, b_list);
bcstats.numbufs--;
if (buf_dealloc_mem(bp) != 0)
return;
pool_put(&bufpool, bp);
}
/*
* Initialize buffers and hash links for buffers.
*/
void
bufinit(void)
{
u_int64_t dmapages;
u_int64_t highpages;
dmapages = uvm_pagecount(&dma_constraint);
/* take away a guess at how much of this the kernel will consume */
dmapages -= (atop(physmem) - atop(uvmexp.free));
/* See if we have memory above the dma accessible region. */
high_constraint.ucr_low = dma_constraint.ucr_high;
high_constraint.ucr_high = no_constraint.ucr_high;
if (high_constraint.ucr_low != high_constraint.ucr_high)
high_constraint.ucr_low++;
highpages = uvm_pagecount(&high_constraint);
/*
* Do we have any significant amount of high memory above
* the DMA region?  If so, enable moving buffers there; if not,
* don't bother.
*/
if (highpages > dmapages / 4)
fliphigh = 1;
else
fliphigh = 0;
/*
* If MD code doesn't say otherwise, use up to 10% of DMA'able
* memory for buffers.
*/
if (bufcachepercent == 0)
bufcachepercent = 10;
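/*
 * Worked example (hypothetical numbers, not from the source): with 4 GB
 * of DMA'able memory and 4 KB pages, dmapages is roughly one million,
 * so the default bufcachepercent of 10 gives bufpages of about 100000
 * pages, i.e. around 400 MB of buffer cache.
 */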
/*
* XXX these values and their same use in kern_sysctl
* need to move into buf.h
*/
KASSERT(bufcachepercent <= 90);
KASSERT(bufcachepercent >= 5);
if (bufpages == 0)
bufpages = dmapages * bufcachepercent / 100;
if (bufpages < BCACHE_MIN)
bufpages = BCACHE_MIN;
KASSERT(bufpages < dmapages);
bufhighpages = bufpages;
/*
* Set the base backoff level for the buffer cache. We will
* not allow uvm to steal back more than this number of pages.
*/
buflowpages = dmapages * 5 / 100;
if (buflowpages < BCACHE_MIN)
buflowpages = BCACHE_MIN;
/*
* set bufbackpages to 100 pages, or 10 percent of the low water mark
* if we don't have that many pages.
*/
bufbackpages = buflowpages * 10 / 100;
if (bufbackpages > 100)
bufbackpages = 100;
/*
* If the MD code does not say otherwise, reserve 10% of kva
* space for mapping buffers.
*/
if (bufkvm == 0)
bufkvm = VM_KERNEL_SPACE_SIZE / 10;
/*
* Don't use more than twice the amount of bufpages for mappings.
* It's twice since we map things sparsely.
*/
if (bufkvm > bufpages * PAGE_SIZE)
bufkvm = bufpages * PAGE_SIZE;
/*
* Round bufkvm to MAXPHYS because we allocate chunks of va space
* in MAXPHYS chunks.
*/
bufkvm &= ~(MAXPHYS - 1);
pool_init(&bufpool, sizeof(struct buf), 0, IPL_BIO, 0, "bufpl", NULL);
bufcache_init();
/*
* hmm - bufkvm is an argument because it's static, while
* bufpages is global because it can change while running.
*/
buf_mem_init(bufkvm);
/*
* Set the dirty page high water mark to be less than the low
* water mark for pages in the buffer cache. This ensures we
* can always back off by throwing away clean pages, and give
* ourselves a chance to write out the dirty pages eventually.
*/
hidirtypages = (buflowpages / 4) * 3;
lodirtypages = buflowpages / 2;
/*
* We are allowed to use up to the reserve.
*/
targetpages = bufpages - RESERVE_PAGES;
}
/*
* Change cachepct
*/
void
bufadjust(int newbufpages)
{
int s;
int64_t npages;
if (newbufpages < buflowpages)
newbufpages = buflowpages;
s = splbio();
bufpages = newbufpages;
/*
* We are allowed to use up to the reserve
*/
targetpages = bufpages - RESERVE_PAGES;
npages = bcstats.dmapages - targetpages;
/*
* Shrinking the cache happens here only if someone has manually
* adjusted bufcachepercent - or the pagedaemon has told us
* to give back memory *now* - so we give it all back.
*/
if (bcstats.dmapages > targetpages)
(void) bufcache_recover_dmapages(0, bcstats.dmapages - targetpages);
bufcache_adjust();
/*
* Wake up the cleaner if we have lots of dirty pages,
* or if we are getting low on buffer cache kva.
*/
if ((UNCLEAN_PAGES >= hidirtypages) ||
bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
wakeup(&bd_req);
splx(s);
}
/*
* Make the buffer cache back off from cachepct.
*/
int
bufbackoff(struct uvm_constraint_range *range, long size)
{
/*
* Back off "size" buffer cache pages. Called by the page
* daemon to consume buffer cache pages rather than scanning.
*
* It returns 0 to the pagedaemon to indicate that it has
* succeeded in freeing enough pages. It returns -1 to
* indicate that it could not and the pagedaemon should take
* other measures.
*
*/
long pdelta, oldbufpages;
/*
* If we will accept high memory for this backoff
* try to steal it from the high memory buffer cache.
*/
if (range != NULL && range->ucr_high > dma_constraint.ucr_high) {
struct buf *bp;
int64_t start = bcstats.numbufpages, recovered = 0;
int s = splbio();
while ((recovered < size) &&
(bp = bufcache_gethighcleanbuf())) {
bufcache_take(bp);
if (bp->b_vp) {
RBT_REMOVE(buf_rb_bufs,
&bp->b_vp->v_bufs_tree, bp);
brelvp(bp);
}
buf_put(bp);
recovered = start - bcstats.numbufpages;
}
bufcache_adjust();
splx(s);
/* If we got enough, return success */
if (recovered >= size)
return 0;
/*
* If we needed only memory above DMA,
* return failure
*/
if (range->ucr_low > dma_constraint.ucr_high)
return -1;
/* Otherwise get the rest from DMA */
size -= recovered;
}
/*
* XXX Otherwise do the dma memory cache dance. this needs
* refactoring later to get rid of 'bufpages'
*/
/*
* Back off by at least bufbackpages. If the page daemon gave us
* a larger size, back off by that much.
*/
pdelta = (size > bufbackpages) ? size : bufbackpages;
if (bufpages <= buflowpages)
return(-1);
if (bufpages - pdelta < buflowpages)
pdelta = bufpages - buflowpages;
oldbufpages = bufpages;
bufadjust(bufpages - pdelta);
if (oldbufpages - bufpages < size)
return (-1); /* we did not free what we were asked */
else
return(0);
}
/*
* Opportunistically flip a buffer into high memory. Will move the buffer
* if memory is available without sleeping, and return 0, otherwise will
* fail and return -1 with the buffer unchanged.
*/
int
buf_flip_high(struct buf *bp)
{
int s;
int ret = -1;
KASSERT(ISSET(bp->b_flags, B_BC));
KASSERT(ISSET(bp->b_flags, B_DMA));
KASSERT(bp->cache == DMA_CACHE);
KASSERT(fliphigh);
/* Attempt to move the buffer to high memory if we can */
s = splbio();
if (buf_realloc_pages(bp, &high_constraint, UVM_PLA_NOWAIT) == 0) {
KASSERT(!ISSET(bp->b_flags, B_DMA));
bcstats.highflips++;
ret = 0;
} else
bcstats.highflops++;
splx(s);
return ret;
}
/*
* Flip a buffer to dma reachable memory, when we need it there for
* I/O. This can sleep since it will wait for memory allocation in the
* DMA reachable area since we have to have the buffer there to proceed.
*/
void
buf_flip_dma(struct buf *bp)
{
KASSERT(ISSET(bp->b_flags, B_BC));
KASSERT(ISSET(bp->b_flags, B_BUSY));
KASSERT(bp->cache < NUM_CACHES);
if (!ISSET(bp->b_flags, B_DMA)) {
int s = splbio();
/* move buf to dma reachable memory */
(void) buf_realloc_pages(bp, &dma_constraint, UVM_PLA_WAITOK);
KASSERT(ISSET(bp->b_flags, B_DMA));
bcstats.dmaflips++;
splx(s);
}
if (bp->cache > DMA_CACHE) {
CLR(bp->b_flags, B_COLD);
CLR(bp->b_flags, B_WARM);
bp->cache = DMA_CACHE;
}
}
struct buf *
bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
{
struct buf *bp;
struct mount *mp;
bp = getblk(vp, blkno, size, 0, INFSLP);
/*
* If buffer does not have valid data, start a read.
* Note that if buffer is B_INVAL, getblk() won't return it.
* Therefore, it's valid if its I/O has completed or been delayed.
*/
if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
SET(bp->b_flags, B_READ | async);
bcstats.pendingreads++;
bcstats.numreads++;
VOP_STRATEGY(bp->b_vp, bp);
/* Pay for the read. */
curproc->p_ru.ru_inblock++; /* XXX */
} else if (async) {
brelse(bp);
}
mp = vp->v_type == VBLK ? vp->v_specmountpoint : vp->v_mount;
/*
* Collect statistics on synchronous and asynchronous reads.
* Reads from block devices are charged to their associated
* filesystem (if any).
*/
if (mp != NULL) {
if (async == 0)
mp->mnt_stat.f_syncreads++;
else
mp->mnt_stat.f_asyncreads++;
}
return (bp);
}
/*
* Read a disk block.
* This algorithm is described in Bach (p.54).
*/
int
bread(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
struct buf *bp;
/* Get buffer for block. */
bp = *bpp = bio_doread(vp, blkno, size, 0);
/* Wait for the read to complete, and return result. */
return (biowait(bp));
}
/*
* Read-ahead multiple disk blocks. The first is sync, the rest async.
* Trivial modification to the breada algorithm presented in Bach (p.55).
*/
int
breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[],
int rasizes[], int nrablks, struct buf **bpp)
{
struct buf *bp;
int i;
bp = *bpp = bio_doread(vp, blkno, size, 0);
/*
* For each of the read-ahead blocks, start a read, if necessary.
*/
for (i = 0; i < nrablks; i++) {
/* If it's in the cache, just go on to next one. */
if (incore(vp, rablks[i]))
continue;
/* Get a buffer for the read-ahead block */
(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
}
/* Otherwise, we had to start a read for it; wait until it's valid. */
return (biowait(bp));
}
/*
* Called from interrupt context.
*/
void
bread_cluster_callback(struct buf *bp)
{
struct buf **xbpp = bp->b_saveaddr;
int i;
if (xbpp[1] != NULL) {
size_t newsize = xbpp[1]->b_bufsize;
/*
* Shrink this buffer's mapping to only cover its part of
* the total I/O.
*/
buf_fix_mapping(bp, newsize);
bp->b_bcount = newsize;
}
/* Invalidate read-ahead buffers if read short */
if (bp->b_resid > 0) {
for (i = 1; xbpp[i] != NULL; i++)
continue;
for (i = i - 1; i != 0; i--) {
if (xbpp[i]->b_bufsize <= bp->b_resid) {
bp->b_resid -= xbpp[i]->b_bufsize;
SET(xbpp[i]->b_flags, B_INVAL);
} else if (bp->b_resid > 0) {
bp->b_resid = 0;
SET(xbpp[i]->b_flags, B_INVAL);
} else
break;
}
}
for (i = 1; xbpp[i] != NULL; i++) {
if (ISSET(bp->b_flags, B_ERROR))
SET(xbpp[i]->b_flags, B_INVAL | B_ERROR);
/*
* Move the pages from the master buffer's uvm object
* into the individual buffer's uvm objects.
*/
struct uvm_object *newobj = &xbpp[i]->b_uobj;
struct uvm_object *oldobj = &bp->b_uobj;
int page;
uvm_obj_init(newobj, &bufcache_pager, 1);
for (page = 0; page < atop(xbpp[i]->b_bufsize); page++) {
struct vm_page *pg = uvm_pagelookup(oldobj,
xbpp[i]->b_poffs + ptoa(page));
KASSERT(pg != NULL);
KASSERT(pg->wire_count == 1);
uvm_pagerealloc(pg, newobj, xbpp[i]->b_poffs + ptoa(page));
}
xbpp[i]->b_pobj = newobj;
biodone(xbpp[i]);
}
free(xbpp, M_TEMP, (i + 1) * sizeof(*xbpp));
if (ISSET(bp->b_flags, B_ASYNC)) {
brelse(bp);
} else {
CLR(bp->b_flags, B_WANTED);
wakeup(bp);
}
}
/*
* Read-ahead multiple disk blocks, but make sure only one (big) I/O
* request is sent to the disk.
* XXX This should probably be dropped and breadn should instead be optimized
* XXX to do fewer I/O requests.
*/
int
bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp)
{
struct buf *bp, **xbpp;
int howmany, maxra, i, inc;
daddr_t sblkno;
*rbpp = bio_doread(vp, blkno, size, 0);
/*
* If the buffer is in the cache skip any I/O operation.
*/
if (ISSET((*rbpp)->b_flags, B_CACHE))
goto out;
if (size != round_page(size))
goto out;
if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra))
goto out;
maxra++;
if (sblkno == -1 || maxra < 2)
goto out;
howmany = MAXPHYS / size;
if (howmany > maxra)
howmany = maxra;
xbpp = mallocarray(howmany + 1, sizeof(*xbpp), M_TEMP, M_NOWAIT);
if (xbpp == NULL)
goto out;
for (i = howmany - 1; i >= 0; i--) {
size_t sz;
/*
* First buffer allocates big enough size to cover what
* all the other buffers need.
*/
sz = i == 0 ? howmany * size : 0;
xbpp[i] = buf_get(vp, blkno + i + 1, sz);
if (xbpp[i] == NULL) {
for (++i; i < howmany; i++) {
SET(xbpp[i]->b_flags, B_INVAL);
brelse(xbpp[i]);
}
free(xbpp, M_TEMP, (howmany + 1) * sizeof(*xbpp));
goto out;
}
}
bp = xbpp[0];
xbpp[howmany] = NULL;
inc = btodb(size);
for (i = 1; i < howmany; i++) {
bcstats.pendingreads++;
bcstats.numreads++;
/*
* We set B_DMA here because bp above will be B_DMA,
* and we are playing buffer slice-n-dice games from
* the memory allocated in bp.
*/
SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
xbpp[i]->b_blkno = sblkno + (i * inc);
xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
xbpp[i]->b_data = NULL;
xbpp[i]->b_pobj = bp->b_pobj;
xbpp[i]->b_poffs = bp->b_poffs + (i * size);
}
KASSERT(bp->b_lblkno == blkno + 1);
KASSERT(bp->b_vp == vp);
bp->b_blkno = sblkno;
SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
bp->b_saveaddr = (void *)xbpp;
bp->b_iodone = bread_cluster_callback;
bcstats.pendingreads++;
bcstats.numreads++;
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_ru.ru_inblock++;
out:
return (biowait(*rbpp));
}
/*
* Block write. Described in Bach (p.56)
*/
int
bwrite(struct buf *bp)
{
int rv, async, wasdelayed, s;
struct vnode *vp;
struct mount *mp;
vp = bp->b_vp;
if (vp != NULL)
mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount;
else
mp = NULL;
/*
* Remember buffer type, to switch on it later. If the write was
* synchronous, but the file system was mounted with MNT_ASYNC,
* convert it to a delayed write.
* XXX note that this relies on delayed tape writes being converted
* to async, not sync writes (which is safe, but ugly).
*/
async = ISSET(bp->b_flags, B_ASYNC);
if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) {
/*
* Don't convert writes from VND on async filesystems
* that already have delayed writes in the upper layer.
*/
if (!ISSET(bp->b_flags, B_NOCACHE)) {
bdwrite(bp);
return (0);
}
}
/*
* Collect statistics on synchronous and asynchronous writes.
* Writes to block devices are charged to their associated
* filesystem (if any).
*/
if (mp != NULL) {
if (async)
mp->mnt_stat.f_asyncwrites++;
else
mp->mnt_stat.f_syncwrites++;
}
bcstats.pendingwrites++;
bcstats.numwrites++;
wasdelayed = ISSET(bp->b_flags, B_DELWRI);
CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
s = splbio();
/*
* If not synchronous, pay for the I/O operation and make
* sure the buf is on the correct vnode queue. We have
* to do this now, because if we don't, the vnode may not
* be properly notified that its I/O has completed.
*/
if (wasdelayed) {
reassignbuf(bp);
} else
curproc->p_ru.ru_oublock++;
/* Initiate disk write. Make sure the appropriate party is charged. */
bp->b_vp->v_numoutput++;
splx(s);
buf_flip_dma(bp);
SET(bp->b_flags, B_WRITEINPROG);
VOP_STRATEGY(bp->b_vp, bp);
/*
* If the queue is above the high water mark, wait till
* the number of outstanding write bufs drops below the low
* water mark.
*/
if (bp->b_bq)
bufq_wait(bp->b_bq);
if (async)
return (0);
/*
* If I/O was synchronous, wait for it to complete.
*/
rv = biowait(bp);
/* Release the buffer. */
brelse(bp);
return (rv);
}
/*
* Delayed write.
*
* The buffer is marked dirty, but is not queued for I/O.
* This routine should be used when the buffer is expected
* to be modified again soon, typically a small write that
* partially fills a buffer.
*
* NB: magnetic tapes cannot be delayed; they must be
* written in the order that the writes are requested.
*
* Described in Leffler, et al. (pp. 208-213).
*/
void
bdwrite(struct buf *bp)
{
int s;
/*
* If the block hasn't been seen before:
* (1) Mark it as having been seen,
* (2) Charge for the write.
* (3) Make sure it's on its vnode's correct block list,
* (4) If a buffer is rewritten, move it to end of dirty list
*/
if (!ISSET(bp->b_flags, B_DELWRI)) {
SET(bp->b_flags, B_DELWRI);
s = splbio();
buf_flip_dma(bp);
reassignbuf(bp);
splx(s);
curproc->p_ru.ru_oublock++; /* XXX */
}
/* The "write" is done, so mark and release the buffer. */
CLR(bp->b_flags, B_NEEDCOMMIT);
CLR(bp->b_flags, B_NOCACHE); /* Must cache delayed writes */
SET(bp->b_flags, B_DONE);
brelse(bp);
}
/*
* Asynchronous block write; just an asynchronous bwrite().
*/
void
bawrite(struct buf *bp)
{
SET(bp->b_flags, B_ASYNC);
VOP_BWRITE(bp);
}
/*
* Must be called at splbio()
*/
void
buf_dirty(struct buf *bp)
{
splassert(IPL_BIO);
#ifdef DIAGNOSTIC
if (!ISSET(bp->b_flags, B_BUSY))
panic("Trying to dirty buffer on freelist!");
#endif
if (ISSET(bp->b_flags, B_DELWRI) == 0) {
SET(bp->b_flags, B_DELWRI);
buf_flip_dma(bp);
reassignbuf(bp);
}
}
/*
* Must be called at splbio()
*/
void
buf_undirty(struct buf *bp)
{
splassert(IPL_BIO);
#ifdef DIAGNOSTIC
if (!ISSET(bp->b_flags, B_BUSY))
panic("Trying to undirty buffer on freelist!");
#endif
if (ISSET(bp->b_flags, B_DELWRI)) {
CLR(bp->b_flags, B_DELWRI);
reassignbuf(bp);
}
}
/*
* Release a buffer on to the free lists.
* Described in Bach (p. 46).
*/
void
brelse(struct buf *bp)
{
int s;
s = splbio();
if (bp->b_data != NULL)
KASSERT(bp->b_bufsize > 0);
/*
* softdep is basically incompatible with not caching buffers
* that have dependencies, so this buffer must be cached
*/
if (LIST_FIRST(&bp->b_dep) != NULL)
CLR(bp->b_flags, B_NOCACHE);
/*
* Determine which queue the buffer should be on, then put it there.
*/
/* If it's not cacheable, or an error, mark it invalid. */
if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
SET(bp->b_flags, B_INVAL);
/* If it's a write error, also mark the vnode as damaged. */
if (ISSET(bp->b_flags, B_ERROR) && !ISSET(bp->b_flags, B_READ)) {
if (bp->b_vp && bp->b_vp->v_type == VREG)
SET(bp->b_vp->v_bioflag, VBIOERROR);
}
if (ISSET(bp->b_flags, B_INVAL)) {
/*
* If the buffer is invalid, free it now rather than leaving
* it in a queue and wasting memory.
*/
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_deallocate(bp);
if (ISSET(bp->b_flags, B_DELWRI)) {
CLR(bp->b_flags, B_DELWRI);
}
if (bp->b_vp) {
RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
brelvp(bp);
}
bp->b_vp = NULL;
/*
* Wake up any processes waiting for _this_ buffer to
* become free. They are not allowed to grab it
* since it will be freed. But the only sleeper is
* getblk and it will restart the operation after
* sleep.
*/
if (ISSET(bp->b_flags, B_WANTED)) {
CLR(bp->b_flags, B_WANTED);
wakeup(bp);
}
buf_put(bp);
} else {
/*
* It has valid data. Put it on the end of the appropriate
* queue, so that it'll stick around for as long as possible.
*/
bufcache_release(bp);
/* Unlock the buffer. */
CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
buf_release(bp);
/* Wake up any processes waiting for _this_ buffer to
* become free. */
if (ISSET(bp->b_flags, B_WANTED)) {
CLR(bp->b_flags, B_WANTED);
wakeup(bp);
}
if (bcstats.dmapages > targetpages)
(void) bufcache_recover_dmapages(0,
bcstats.dmapages - targetpages);
bufcache_adjust();
}
/* Wake up syncer and cleaner processes waiting for buffers. */
if (nobuffers) {
nobuffers = 0;
wakeup(&nobuffers);
}
/* Wake up any processes waiting for any buffer to become free. */
if (needbuffer && bcstats.dmapages < targetpages &&
bcstats.kvaslots_avail > RESERVE_SLOTS) {
needbuffer = 0;
wakeup(&needbuffer);
}
splx(s);
}
/*
* Determine if a block is in the cache. Just look on what would be its hash
* chain. If it's there, return a pointer to it, unless it's marked invalid.
*/
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
struct buf *bp;
struct buf b;
int s;
s = splbio();
/* Search buf lookup tree */
b.b_lblkno = blkno;
bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
if (bp != NULL && ISSET(bp->b_flags, B_INVAL))
bp = NULL;
splx(s);
return (bp);
}
/*
* Get a block of requested size that is associated with
* a given vnode and block offset. If it is found in the
* block cache, mark it as having been found, make it busy
* and return it. Otherwise, return an empty block of the
* correct size. It is up to the caller to ensure that the
* cached blocks be of the correct size.
*/
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag,
uint64_t slptimeo)
{
struct buf *bp;
struct buf b;
int s, error;
/*
* XXX
* The following is an inlined version of 'incore()', but with
* the 'invalid' test moved to after the 'busy' test. It's
* necessary because there are some cases in which the NFS
* code sets B_INVAL prior to writing data to the server, but
* in which the buffers actually contain valid data. In this
* case, we can't allow the system to allocate a new buffer for
* the block until the write is finished.
*/
start:
s = splbio();
b.b_lblkno = blkno;
bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
if (bp != NULL) {
if (ISSET(bp->b_flags, B_BUSY)) {
SET(bp->b_flags, B_WANTED);
error = tsleep_nsec(bp, slpflag | (PRIBIO + 1),
"getblk", slptimeo);
splx(s);
if (error)
return (NULL);
goto start;
}
if (!ISSET(bp->b_flags, B_INVAL)) {
bcstats.cachehits++;
SET(bp->b_flags, B_CACHE);
bufcache_take(bp);
buf_acquire(bp);
splx(s);
return (bp);
}
}
splx(s);
if ((bp = buf_get(vp, blkno, size)) == NULL)
goto start;
return (bp);
}
/*
* Get an empty, disassociated buffer of given size.
*/
struct buf *
geteblk(size_t size)
{
struct buf *bp;
while ((bp = buf_get(NULL, 0, size)) == NULL)
continue;
return (bp);
}
/*
* Allocate a buffer.
* If vp is given, put it into the buffer cache for that vnode.
* If size != 0, allocate memory and call buf_map().
* If there is already a buffer for the given vnode/blkno, return NULL.
*/
struct buf *
buf_get(struct vnode *vp, daddr_t blkno, size_t size)
{
struct buf *bp;
int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK;
int npages;
int s;
s = splbio();
if (size) {
/*
* Wake up the cleaner if we have lots of dirty pages,
* or if we are getting low on buffer cache kva.
*/
if (UNCLEAN_PAGES >= hidirtypages ||
bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
wakeup(&bd_req);
npages = atop(round_page(size));
/*
* if our cache has been previously shrunk,
* allow it to grow again with use up to
* bufhighpages (cachepercent)
*/
if (bufpages < bufhighpages)
bufadjust(bufhighpages);
/*
* If we would go over the page target with our
* new allocation, free enough buffers first
* to stay at the target with our new allocation.
*/
if (bcstats.dmapages + npages > targetpages) {
(void) bufcache_recover_dmapages(0, npages);
bufcache_adjust();
}
/*
* If we get here, we tried to free the world down
* above, and couldn't get down - Wake the cleaner
* and wait for it to push some buffers out.
*/
if ((bcstats.dmapages + npages > targetpages ||
bcstats.kvaslots_avail <= RESERVE_SLOTS) &&
curproc != syncerproc && curproc != cleanerproc) {
wakeup(&bd_req);
needbuffer++;
tsleep_nsec(&needbuffer, PRIBIO, "needbuffer", INFSLP);
splx(s);
return (NULL);
}
if (bcstats.dmapages + npages > bufpages) {
/* cleaner or syncer */
nobuffers = 1;
tsleep_nsec(&nobuffers, PRIBIO, "nobuffers", INFSLP);
splx(s);
return (NULL);
}
}
bp = pool_get(&bufpool, poolwait|PR_ZERO);
if (bp == NULL) {
splx(s);
return (NULL);
}
bp->b_freelist.tqe_next = NOLIST;
bp->b_dev = NODEV;
LIST_INIT(&bp->b_dep);
bp->b_bcount = size;
buf_acquire_nomap(bp);
if (vp != NULL) {
/*
* We insert the buffer into the hash with B_BUSY set
* while we allocate pages for it. This way any getblk
* that happens while we allocate pages will wait for
* this buffer instead of starting its own buf_get.
*
* But first, we check if someone beat us to it.
*/
if (incore(vp, blkno)) {
pool_put(&bufpool, bp);
splx(s);
return (NULL);
}
bp->b_blkno = bp->b_lblkno = blkno;
bgetvp(vp, bp);
if (RBT_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp))
panic("buf_get: dup lblk vp %p bp %p", vp, bp);
} else {
bp->b_vnbufs.le_next = NOLIST;
SET(bp->b_flags, B_INVAL);
bp->b_vp = NULL;
}
LIST_INSERT_HEAD(&bufhead, bp, b_list);
bcstats.numbufs++;
if (size) {
buf_alloc_pages(bp, round_page(size));
KASSERT(ISSET(bp->b_flags, B_DMA));
buf_map(bp);
}
SET(bp->b_flags, B_BC);
splx(s);
return (bp);
}
/*
* Buffer cleaning daemon.
*/
void
buf_daemon(void *arg)
{
struct buf *bp = NULL;
int s, pushed = 0;
s = splbio();
for (;;) {
if (bp == NULL || (pushed >= 16 &&
UNCLEAN_PAGES < hidirtypages &&
bcstats.kvaslots_avail > 2 * RESERVE_SLOTS)){
pushed = 0;
/*
* Wake up anyone who was waiting for buffers
* to be released.
*/
if (needbuffer) {
needbuffer = 0;
wakeup(&needbuffer);
}
tsleep_nsec(&bd_req, PRIBIO - 7, "cleaner", INFSLP);
}
while ((bp = bufcache_getdirtybuf())) {
TRACEPOINT(vfs, cleaner, bp->b_flags, pushed,
lodirtypages, hidirtypages);
if (UNCLEAN_PAGES < lodirtypages &&
bcstats.kvaslots_avail > 2 * RESERVE_SLOTS &&
pushed >= 16)
break;
bufcache_take(bp);
buf_acquire(bp);
splx(s);
if (ISSET(bp->b_flags, B_INVAL)) {
brelse(bp);
s = splbio();
continue;
}
#ifdef DIAGNOSTIC
if (!ISSET(bp->b_flags, B_DELWRI))
panic("Clean buffer on dirty queue");
#endif
if (LIST_FIRST(&bp->b_dep) != NULL &&
!ISSET(bp->b_flags, B_DEFERRED) &&
buf_countdeps(bp, 0, 0)) {
SET(bp->b_flags, B_DEFERRED);
s = splbio();
bufcache_release(bp);
buf_release(bp);
continue;
}
bawrite(bp);
pushed++;
sched_pause(yield);
s = splbio();
}
}
}
/*
* Wait for operations on the buffer to complete.
* When they do, extract and return the I/O's error value.
*/
int
biowait(struct buf *bp)
{
int s;
KASSERT(!(bp->b_flags & B_ASYNC));
s = splbio();
while (!ISSET(bp->b_flags, B_DONE))
tsleep_nsec(bp, PRIBIO + 1, "biowait", INFSLP);
splx(s);
/* check for interruption of I/O (e.g. via NFS), then errors. */
if (ISSET(bp->b_flags, B_EINTR)) {
CLR(bp->b_flags, B_EINTR);
return (EINTR);
}
if (ISSET(bp->b_flags, B_ERROR))
return (bp->b_error ? bp->b_error : EIO);
else
return (0);
}
/*
* Mark I/O complete on a buffer.
*
* If a callback has been requested, e.g. the pageout
* daemon, do so. Otherwise, awaken waiting processes.
*
* [ Leffler, et al., says on p.247:
* "This routine wakes up the blocked process, frees the buffer
* for an asynchronous write, or, for a request by the pagedaemon
* process, invokes a procedure specified in the buffer structure" ]
*
* In real life, the pagedaemon (or other system processes) wants
* to do async stuff too, and doesn't want the buffer brelse()'d.
* (for swap pager, that puts swap buffers on the free lists (!!!),
* for the vn device, that puts malloc'd buffers on the free lists!)
*
* Must be called at splbio().
*/
void
biodone(struct buf *bp)
{
splassert(IPL_BIO);
if (ISSET(bp->b_flags, B_DONE))
panic("biodone already");
SET(bp->b_flags, B_DONE); /* note that it's done */
if (bp->b_bq)
bufq_done(bp->b_bq, bp);
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_complete(bp);
if (!ISSET(bp->b_flags, B_READ)) {
CLR(bp->b_flags, B_WRITEINPROG);
vwakeup(bp->b_vp);
}
if (bcstats.numbufs &&
(!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS)))) {
if (!ISSET(bp->b_flags, B_READ)) {
bcstats.pendingwrites--;
} else
bcstats.pendingreads--;
}
if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
CLR(bp->b_flags, B_CALL); /* but note callout done */
(*bp->b_iodone)(bp);
} else {
if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */
brelse(bp);
} else { /* or just wakeup the buffer */
CLR(bp->b_flags, B_WANTED);
wakeup(bp);
}
}
}
#ifdef DDB
void bcstats_print(int (*)(const char *, ...)
__attribute__((__format__(__kprintf__,1,2))));
/*
* bcstats_print: ddb hook to print interesting buffer cache counters
*/
void
bcstats_print(
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
(*pr)("Current Buffer Cache status:\n");
(*pr)("numbufs %lld busymapped %lld, delwri %lld\n",
bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
(*pr)("kvaslots %lld avail kva slots %lld\n",
bcstats.kvaslots, bcstats.kvaslots_avail);
(*pr)("bufpages %lld, dmapages %lld, dirtypages %lld\n",
bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
(*pr)("pendingreads %lld, pendingwrites %lld\n",
bcstats.pendingreads, bcstats.pendingwrites);
(*pr)("highflips %lld, highflops %lld, dmaflips %lld\n",
bcstats.highflips, bcstats.highflops, bcstats.dmaflips);
}
#endif
void
buf_adjcnt(struct buf *bp, long ncount)
{
KASSERT(ncount <= bp->b_bufsize);
bp->b_bcount = ncount;
}
/* bufcache freelist code below */
/*
* Copyright (c) 2014 Ted Unangst <tedu@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* The code below implements a variant of the 2Q buffer cache algorithm by
* Johnson and Shasha.
*
* General Outline
* We divide the buffer cache into three working sets: current, previous,
* and long term. Each list is itself LRU and buffers get promoted and moved
* around between them. A buffer starts its life in the current working set.
* As time passes and newer buffers push it out, it will turn into the previous
* working set and is subject to recycling. But if it's accessed again from
* the previous working set, that's an indication that it's actually in the
* long term working set, so we promote it there. The separation of current
* and previous working sets prevents us from promoting a buffer that's only
* temporarily hot to the long term cache.
*
* The objective is to provide scan resistance by making the long term
* working set ineligible for immediate recycling, even as the current
* working set is rapidly turned over.
*
* Implementation
* The code below identifies the current, previous, and long term sets as
* hotqueue, coldqueue, and warmqueue. The hot and warm queues are capped at
* 1/3 of the total clean pages, after which point they start pushing their
* oldest buffers into coldqueue.
* A buf always starts out with neither the WARM nor the COLD flag set (implying HOT).
* When released, it will be returned to the tail of the hotqueue list.
* When the hotqueue gets too large, the oldest hot buf will be moved to the
* coldqueue, with the B_COLD flag set. When a cold buf is released, we set
* the B_WARM flag and put it onto the warmqueue. Warm bufs are also
* directly returned to the end of the warmqueue. As with the hotqueue, when
* the warmqueue grows too large, B_WARM bufs are moved onto the coldqueue.
*
* Note that this design does still support large working sets, greater
* than the cap of hotqueue or warmqueue would imply. The coldqueue is still
* cached and has no maximum length. The hot and warm queues form a Y feeding
* into the coldqueue. Moving bufs between queues is constant time, so this
* design decays to one long warm->cold queue.
*
* In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue
* is Am. We always cache pages, as opposed to pointers to pages for A1.
*
* This implementation adds support for multiple 2q caches.
*
* If we have more than one 2q cache, as bufs fall off the cold queue
* for recycling, bufs that have been warm before (which retain the
* B_WARM flag in addition to B_COLD) can be put into the hot queue of
* a second level 2Q cache. buffers which are only B_COLD are
* recycled. Bufs falling off the last cache's cold queue are always
* recycled.
*
*/
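#if 0
/*
 * Minimal userland sketch (hypothetical, not kernel code) of the
 * promotion scheme described above: a released buffer that has never
 * been cold goes to the tail of the hot queue; one that has been cold
 * before is promoted to the warm queue; when a hot or warm queue
 * exceeds its cap, its oldest entry is pushed onto the cold queue,
 * the only list from which buffers are recycled.
 */
#include <sys/queue.h>

struct xbuf {
	TAILQ_ENTRY(xbuf) link;
	int cold, warm;
};
TAILQ_HEAD(xq, xbuf);

static void
example_release(struct xq *hot, struct xq *warm, struct xbuf *b)
{
	if (b->cold || b->warm) {
		/* Seen on the cold queue before: treat as long term. */
		b->warm = 1;
		b->cold = 0;
		TAILQ_INSERT_TAIL(warm, b, link);
	} else
		TAILQ_INSERT_TAIL(hot, b, link);
}

static void
example_chill(struct xq *from, struct xq *cold)
{
	struct xbuf *b = TAILQ_FIRST(from);

	if (b != NULL) {
		TAILQ_REMOVE(from, b, link);
		b->cold = 1;	/* keep the warm marker, as chillbufs() does */
		TAILQ_INSERT_TAIL(cold, b, link);
	}
}
#endif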
/*
* This function is called when a hot or warm queue may have exceeded its
* size limit.  It will move a buf to the coldqueue.
*/
int chillbufs(struct bufcache *cache, struct bufqueue *queue,
int64_t *queuepages);
void
bufcache_init(void)
{
int i;
for (i = 0; i < NUM_CACHES; i++) {
TAILQ_INIT(&cleancache[i].hotqueue);
TAILQ_INIT(&cleancache[i].coldqueue);
TAILQ_INIT(&cleancache[i].warmqueue);
}
TAILQ_INIT(&dirtyqueue);
}
/*
* if the buffer caches have shrunk, we may need to rebalance our queues.
*/
void
bufcache_adjust(void)
{
int i;
for (i = 0; i < NUM_CACHES; i++) {
while (chillbufs(&cleancache[i], &cleancache[i].warmqueue,
&cleancache[i].warmbufpages) ||
chillbufs(&cleancache[i], &cleancache[i].hotqueue,
&cleancache[i].hotbufpages))
continue;
}
}
/*
* Get a clean buffer from the cache.  If "discard" is set, do not promote
* previously warm buffers as normal, because we are tossing everything
* away, such as during hibernation.
*/
struct buf *
bufcache_getcleanbuf(int cachenum, int discard)
{
struct buf *bp = NULL;
struct bufcache *cache = &cleancache[cachenum];
struct bufqueue * queue;
splassert(IPL_BIO);
/* try cold queue */
while ((bp = TAILQ_FIRST(&cache->coldqueue)) ||
(bp = TAILQ_FIRST(&cache->warmqueue)) ||
(bp = TAILQ_FIRST(&cache->hotqueue))) {
int64_t pages = atop(bp->b_bufsize);
struct bufcache *newcache;
if (discard || cachenum >= NUM_CACHES - 1) {
/* Victim selected, give it up */
return bp;
}
KASSERT(bp->cache == cachenum);
/*
* If this buffer was warm before, move it to
* the hot queue in the next cache
*/
if (fliphigh) {
/*
* If we are in the DMA cache, try to flip the
* buffer up high to move it on to the other
* caches.  If we can't move the buffer to high
* memory without sleeping, we give it up and
* return it rather than fight for more memory
* against non buffer cache competitors.
*/
SET(bp->b_flags, B_BUSY);
if (bp->cache == 0 && buf_flip_high(bp) == -1) {
CLR(bp->b_flags, B_BUSY);
return bp;
}
CLR(bp->b_flags, B_BUSY);
}
/* Move the buffer to the hot queue in the next cache */
if (ISSET(bp->b_flags, B_COLD)) {
queue = &cache->coldqueue;
} else if (ISSET(bp->b_flags, B_WARM)) {
queue = &cache->warmqueue;
cache->warmbufpages -= pages;
} else {
queue = &cache->hotqueue;
cache->hotbufpages -= pages;
}
TAILQ_REMOVE(queue, bp, b_freelist);
cache->cachepages -= pages;
CLR(bp->b_flags, B_WARM);
CLR(bp->b_flags, B_COLD);
bp->cache++;
newcache = &cleancache[bp->cache];
newcache->cachepages += pages;
newcache->hotbufpages += pages;
chillbufs(newcache, &newcache->hotqueue,
&newcache->hotbufpages);
TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
}
return bp;
}
void
discard_buffer(struct buf *bp)
{
splassert(IPL_BIO);
bufcache_take(bp);
if (bp->b_vp) {
RBT_REMOVE(buf_rb_bufs,
&bp->b_vp->v_bufs_tree, bp);
brelvp(bp);
}
buf_put(bp);
}
int64_t
bufcache_recover_dmapages(int discard, int64_t howmany)
{
struct buf *bp = NULL;
struct bufcache *cache = &cleancache[DMA_CACHE];
struct bufqueue * queue;
int64_t recovered = 0;
splassert(IPL_BIO);
while ((recovered < howmany) &&
((bp = TAILQ_FIRST(&cache->coldqueue)) ||
(bp = TAILQ_FIRST(&cache->warmqueue)) ||
(bp = TAILQ_FIRST(&cache->hotqueue)))) {
int64_t pages = atop(bp->b_bufsize);
struct bufcache *newcache;
if (discard || DMA_CACHE >= NUM_CACHES - 1) {
discard_buffer(bp);
continue;
}
KASSERT(bp->cache == DMA_CACHE);
/*
* If this buffer was warm before, move it to
* the hot queue in the next cache
*/
/*
* One way or another, the pages for this
* buffer are leaving DMA memory
*/
recovered += pages;
if (!fliphigh) {
discard_buffer(bp);
continue;
}
/*
* If we are in the DMA cache, try to flip the
* buffer up high to move it on to the other
* caches.  If we can't move the buffer to high
* memory without sleeping, we give it up
* now rather than fight for more memory
* against non buffer cache competitors.
*/
SET(bp->b_flags, B_BUSY);
if (bp->cache == 0 && buf_flip_high(bp) == -1) {
CLR(bp->b_flags, B_BUSY);
discard_buffer(bp);
continue;
}
CLR(bp->b_flags, B_BUSY);
/*
* Move the buffer to the hot queue in the next cache
*/
if (ISSET(bp->b_flags, B_COLD)) {
queue = &cache->coldqueue;
} else if (ISSET(bp->b_flags, B_WARM)) {
queue = &cache->warmqueue;
cache->warmbufpages -= pages;
} else {
queue = &cache->hotqueue;
cache->hotbufpages -= pages;
}
TAILQ_REMOVE(queue, bp, b_freelist);
cache->cachepages -= pages;
CLR(bp->b_flags, B_WARM);
CLR(bp->b_flags, B_COLD);
bp->cache++;
newcache = &cleancache[bp->cache];
newcache->cachepages += pages;
newcache->hotbufpages += pages;
chillbufs(newcache, &newcache->hotqueue,
&newcache->hotbufpages);
TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
}
return recovered;
}
struct buf *
bufcache_getcleanbuf_range(int start, int end, int discard)
{
int i, j = start, q = end;
struct buf *bp = NULL;
/*
* XXX in theory we could promote warm buffers into a previous queue
* so in the pathological case where we go through all the caches
* without getting a buffer we have to start at the beginning again.
*/
while (j <= q) {
for (i = q; i >= j; i--)
if ((bp = bufcache_getcleanbuf(i, discard)))
return (bp);
j++;
}
return bp;
}
struct buf *
bufcache_gethighcleanbuf(void)
{
if (!fliphigh)
return NULL;
return bufcache_getcleanbuf_range(DMA_CACHE + 1, NUM_CACHES - 1, 0);
}
struct buf *
bufcache_getdmacleanbuf(void)
{
if (fliphigh)
return bufcache_getcleanbuf_range(DMA_CACHE, DMA_CACHE, 0);
return bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 0);
}
struct buf *
bufcache_getdirtybuf(void)
{
return TAILQ_FIRST(&dirtyqueue);
}
void
bufcache_take(struct buf *bp)
{
struct bufqueue *queue;
int64_t pages;
splassert(IPL_BIO);
KASSERT(ISSET(bp->b_flags, B_BC));
KASSERT(bp->cache >= DMA_CACHE);
KASSERT((bp->cache < NUM_CACHES));
pages = atop(bp->b_bufsize);
TRACEPOINT(vfs, bufcache_take, bp->b_flags, bp->cache, pages);
struct bufcache *cache = &cleancache[bp->cache];
if (!ISSET(bp->b_flags, B_DELWRI)) {
if (ISSET(bp->b_flags, B_COLD)) {
queue = &cache->coldqueue;
} else if (ISSET(bp->b_flags, B_WARM)) {
queue = &cache->warmqueue;
cache->warmbufpages -= pages;
} else {
queue = &cache->hotqueue;
cache->hotbufpages -= pages;
}
bcstats.numcleanpages -= pages;
cache->cachepages -= pages;
} else {
queue = &dirtyqueue;
bcstats.numdirtypages -= pages;
bcstats.delwribufs--;
}
TAILQ_REMOVE(queue, bp, b_freelist);
}
/* move buffers from a hot or warm queue to a cold queue in a cache */
int
chillbufs(struct bufcache *cache, struct bufqueue *queue, int64_t *queuepages)
{
struct buf *bp;
int64_t limit, pages;
/*
* We limit the hot queue to be small, with a max of 4096 pages.
* We limit the warm queue to half the cache size.
*
* We impose a minimum size of 96 to prevent too much "wobbling".
*/
if (queue == &cache->hotqueue)
limit = min(cache->cachepages / 20, 4096);
else if (queue == &cache->warmqueue)
limit = (cache->cachepages / 2);
else
panic("chillbufs: invalid queue"); if (*queuepages > 96 && *queuepages > limit) {
bp = TAILQ_FIRST(queue);
if (!bp)
panic("inconsistent bufpage counts"); pages = atop(bp->b_bufsize);
*queuepages -= pages;
TAILQ_REMOVE(queue, bp, b_freelist);
/* we do not clear B_WARM */
SET(bp->b_flags, B_COLD);
TAILQ_INSERT_TAIL(&cache->coldqueue, bp, b_freelist);
return 1;
}
return 0;
}
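/*
 * Worked example of the limits above (illustrative only, hypothetical
 * numbers): for a cache currently holding 100000 pages, the hot queue is
 * limited to min(100000 / 20, 4096) = 4096 pages and the warm queue to
 * 100000 / 2 = 50000 pages.  chillbufs() only demotes the queue's oldest
 * buffer once the queue exceeds both the 96 page floor and its limit.
 */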
void
bufcache_release(struct buf *bp)
{
struct bufqueue *queue;
int64_t pages;
struct bufcache *cache = &cleancache[bp->cache];
KASSERT(ISSET(bp->b_flags, B_BC));
pages = atop(bp->b_bufsize);
TRACEPOINT(vfs, bufcache_rel, bp->b_flags, bp->cache, pages);
if (fliphigh) {
if (ISSET(bp->b_flags, B_DMA) && bp->cache > 0)
panic("B_DMA buffer release from cache %d",
bp->cache);
else if ((!ISSET(bp->b_flags, B_DMA)) && bp->cache == 0)
panic("Non B_DMA buffer release from cache %d",
bp->cache);
}
if (!ISSET(bp->b_flags, B_DELWRI)) {
int64_t *queuepages;
if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
SET(bp->b_flags, B_WARM);
CLR(bp->b_flags, B_COLD);
queue = &cache->warmqueue;
queuepages = &cache->warmbufpages;
} else {
queue = &cache->hotqueue;
queuepages = &cache->hotbufpages;
}
*queuepages += pages;
bcstats.numcleanpages += pages;
cache->cachepages += pages;
chillbufs(cache, queue, queuepages);
} else {
queue = &dirtyqueue;
bcstats.numdirtypages += pages;
bcstats.delwribufs++;
}
TAILQ_INSERT_TAIL(queue, bp, b_freelist);
}
#ifdef HIBERNATE
/*
* Nuke the buffer cache from orbit when hibernating. We do not want to save
* any clean cache pages to swap and read them back. The original disk files
* are just as good.
*/
void
hibernate_suspend_bufcache(void)
{
struct buf *bp;
int s;
s = splbio();
/* Chuck away all the cache pages.. discard bufs, do not promote */
while ((bp = bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 1))) {
bufcache_take(bp);
if (bp->b_vp) {
RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
brelvp(bp);
}
buf_put(bp);
}
splx(s);
}
void
hibernate_resume_bufcache(void)
{
/* XXX Nothing needed here for now */
}
#endif /* HIBERNATE */
/* $OpenBSD: uipc_domain.c,v 1.60 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: uipc_domain.c,v 1.14 1996/02/09 19:00:44 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93
*/
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/timeout.h>
#include "bpfilter.h"
#include "pflow.h"
const struct domain *const domains[] = {
#ifdef MPLS
&mplsdomain,
#endif
#if defined (IPSEC) || defined (TCP_SIGNATURE)
&pfkeydomain,
#endif
#ifdef INET6
&inet6domain,
#endif /* INET6 */
&inetdomain,
&unixdomain,
&routedomain,
NULL
};
void pffasttimo(void *);
void pfslowtimo(void *);
const struct domain * pffinddomain(int);
void
domaininit(void)
{
const struct domain *dp;
const struct protosw *pr;
static struct timeout pffast_timeout;
static struct timeout pfslow_timeout;
int i;
for (i = 0; (dp = domains[i]) != NULL; i++) {
if (dp->dom_init)
(*dp->dom_init)();
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_init)
(*pr->pr_init)();
}
/*
* max_linkhdr of 64 was chosen to encompass tunnelling
* traffic in IP payloads, eg, by etherip(4) or gif(4),
* without needing to prepend an mbuf to fit those
* headers.
*/
if (max_linkhdr < 64)
max_linkhdr = 64;
max_hdr = max_linkhdr + max_protohdr;
timeout_set_proc(&pffast_timeout, pffasttimo, &pffast_timeout);
timeout_set_proc(&pfslow_timeout, pfslowtimo, &pfslow_timeout);
timeout_add(&pffast_timeout, 1);
timeout_add(&pfslow_timeout, 1);
}
const struct domain *
pffinddomain(int family)
{
const struct domain *dp;
int i;
for (i = 0; (dp = domains[i]) != NULL; i++) {
if (dp->dom_family == family)
return (dp);
}
return (NULL);
}
const struct protosw *
pffindtype(int family, int type)
{
const struct domain *dp;
const struct protosw *pr;
dp = pffinddomain(family);
if (dp == NULL)
return (NULL);
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_type && pr->pr_type == type)
return (pr);
return (NULL);
}
const struct protosw *
pffindproto(int family, int protocol, int type)
{
const struct domain *dp;
const struct protosw *pr;
const struct protosw *maybe = NULL;
if (family == PF_UNSPEC)
return (NULL);
dp = pffinddomain(family);
if (dp == NULL)
return (NULL);
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
return (pr);
if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
pr->pr_protocol == 0 && maybe == NULL)
maybe = pr;
}
return (maybe);
}
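/*
 * Hedged usage sketch (illustrative, not part of this file): socket
 * creation resolves its protocol switch entry through the lookups above;
 * for a raw socket with an explicit protocol, the wildcard SOCK_RAW entry
 * (pr_protocol == 0) is only used when no exact match exists.
 */
#if 0	/* example call; the protocol values shown are just one possibility */
const struct protosw *prp;
prp = pffindproto(PF_INET, IPPROTO_ICMP, SOCK_RAW);
if (prp == NULL)
return (EPROTONOSUPPORT);
#endif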
static int
net_link_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
int node;
int error;
/*
* All sysctl names at this level are nonterminal.
*/
if (namelen < 2)
return (EISDIR); /* overloaded */
node = name[0];
namelen--;
name++;
switch (node) {
case NET_LINK_IFRXQ:
error = net_ifiq_sysctl(name, namelen, oldp, oldlenp,
newp, newlen);
break;
default:
error = ENOPROTOOPT;
break;
}
return (error);
}
int
net_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
const struct domain *dp;
const struct protosw *pr;
int error, family, protocol;
/*
* All sysctl names at this level are nonterminal.
* Usually: next two components are protocol family and protocol
* number, then at least one additional component.
*/
if (namelen < 2)
return (EISDIR); /* overloaded */
family = name[0];
if (family == PF_UNSPEC)
return (0);
if (family == PF_LINK)
return (net_link_sysctl(name + 1, namelen - 1, oldp, oldlenp,
newp, newlen));
if (family == PF_UNIX)
return (uipc_sysctl(name + 1, namelen - 1, oldp, oldlenp,
newp, newlen));
#if NBPFILTER > 0
if (family == PF_BPF)
return (bpf_sysctl(name + 1, namelen - 1, oldp, oldlenp,
newp, newlen));
#endif
#if NPFLOW > 0
if (family == PF_PFLOW)
return (pflow_sysctl(name + 1, namelen - 1, oldp, oldlenp,
newp, newlen));
#endif
#ifdef PIPEX
if (family == PF_PIPEX)
return (pipex_sysctl(name + 1, namelen - 1, oldp, oldlenp,
newp, newlen));
#endif
#ifdef MPLS
if (family == PF_MPLS)
return (mpls_sysctl(name + 1, namelen - 1, oldp, oldlenp,
newp, newlen));
#endif
dp = pffinddomain(family);
if (dp == NULL)
return (ENOPROTOOPT);
if (namelen < 3)
return (EISDIR); /* overloaded */
protocol = name[1];
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_protocol == protocol && pr->pr_sysctl) {
error = (*pr->pr_sysctl)(name + 2, namelen - 2,
oldp, oldlenp, newp, newlen);
return (error);
}
return (ENOPROTOOPT);
}
void
pfctlinput(int cmd, struct sockaddr *sa)
{
const struct domain *dp;
const struct protosw *pr;
int i;
NET_ASSERT_LOCKED();
for (i = 0; (dp = domains[i]) != NULL; i++) {
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_ctlinput)
(*pr->pr_ctlinput)(cmd, sa, 0, NULL);
}
}
void
pfslowtimo(void *arg)
{
struct timeout *to = (struct timeout *)arg;
const struct domain *dp;
const struct protosw *pr;
int i;
for (i = 0; (dp = domains[i]) != NULL; i++) {
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_slowtimo)
(*pr->pr_slowtimo)();
}
timeout_add_msec(to, 500);
}
void
pffasttimo(void *arg)
{
struct timeout *to = (struct timeout *)arg;
const struct domain *dp;
const struct protosw *pr;
int i;
for (i = 0; (dp = domains[i]) != NULL; i++) {
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_fasttimo)
(*pr->pr_fasttimo)();
}
timeout_add_msec(to, 200);
}
/* $OpenBSD: ffs_balloc.c,v 1.45 2019/07/19 00:24:31 cheloha Exp $ */
/* $NetBSD: ffs_balloc.c,v 1.3 1996/02/09 22:22:21 christos Exp $ */
/*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program.
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
int ffs1_balloc(struct inode *, off_t, int, struct ucred *, int, struct buf **);
#ifdef FFS2
int ffs2_balloc(struct inode *, off_t, int, struct ucred *, int, struct buf **);
#endif
/*
* Balloc defines the structure of file system storage
* by allocating the physical blocks on a device given
* the inode and the logical block number in a file.
*/
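/*
 * Worked example (illustrative, assuming a hypothetical 8192 byte block
 * file system): a request at byte offset 20000 maps to logical block
 * lbn = lblkno(fs, 20000) = 2 with an in-block offset of
 * blkoff(fs, 20000) = 20000 - 2 * 8192 = 3616, so the "size" computed
 * below becomes 3616 plus the requested length before the block size
 * check.
 */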
int
ffs1_balloc(struct inode *ip, off_t startoffset, int size, struct ucred *cred,
int flags, struct buf **bpp)
{
daddr_t lbn, nb, newb, pref;
struct fs *fs;
struct buf *bp, *nbp;
struct vnode *vp;
struct proc *p;
struct indir indirs[NIADDR + 2];
int32_t *bap;
int deallocated, osize, nsize, num, i, error;
int32_t *allocib, *blkp, *allocblk, allociblk[NIADDR+1];
int unwindidx = -1;
vp = ITOV(ip);
fs = ip->i_fs;
p = curproc;
lbn = lblkno(fs, startoffset);
size = blkoff(fs, startoffset) + size;
if (size > fs->fs_bsize)
panic("ffs1_balloc: blk too big");
if (bpp != NULL)
*bpp = NULL;
if (lbn < 0)
return (EFBIG);
/*
* If the next write will extend the file into a new block,
* and the file is currently composed of a fragment,
* this fragment has to be extended to be a full block.
*/
nb = lblkno(fs, ip->i_ffs1_size);
if (nb < NDADDR && nb < lbn) {
osize = blksize(fs, ip, nb);
if (osize < fs->fs_bsize && osize > 0) {
error = ffs_realloccg(ip, nb,
ffs1_blkpref(ip, nb, (int)nb, &ip->i_ffs1_db[0]),
osize, (int)fs->fs_bsize, cred, bpp, &newb);
if (error)
return (error);
if (DOINGSOFTDEP(vp))
softdep_setup_allocdirect(ip, nb, newb,
ip->i_ffs1_db[nb], fs->fs_bsize, osize,
bpp ? *bpp : NULL);
ip->i_ffs1_size = lblktosize(fs, nb + 1);
uvm_vnp_setsize(vp, ip->i_ffs1_size);
ip->i_ffs1_db[nb] = newb;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
if (flags & B_SYNC)
bwrite(*bpp);
else
bawrite(*bpp);
}
}
}
/*
* The first NDADDR blocks are direct blocks
*/
if (lbn < NDADDR) {
nb = ip->i_ffs1_db[lbn];
if (nb != 0 && ip->i_ffs1_size >= lblktosize(fs, lbn + 1)) {
/*
* The block is an already-allocated direct block
* and the file already extends past this block,
* thus this must be a whole block.
* Just read the block (if requested).
*/
if (bpp != NULL) {
error = bread(vp, lbn, fs->fs_bsize, bpp);
if (error) {
brelse(*bpp);
return (error);
}
}
return (0);
}
if (nb != 0) {
/*
* Consider need to reallocate a fragment.
*/
osize = fragroundup(fs, blkoff(fs, ip->i_ffs1_size));
nsize = fragroundup(fs, size);
if (nsize <= osize) {
/*
* The existing block is already
* at least as big as we want.
* Just read the block (if requested).
*/
if (bpp != NULL) {
error = bread(vp, lbn, fs->fs_bsize,
bpp);
if (error) {
brelse(*bpp);
return (error);
}
buf_adjcnt((*bpp), osize);
}
return (0);
} else {
/*
* The existing block is smaller than we
* want, grow it.
*/
error = ffs_realloccg(ip, lbn,
ffs1_blkpref(ip, lbn, (int)lbn,
&ip->i_ffs1_db[0]),
osize, nsize, cred, bpp, &newb);
if (error)
return (error);
if (DOINGSOFTDEP(vp))
softdep_setup_allocdirect(ip, lbn,
newb, nb, nsize, osize,
bpp ? *bpp : NULL);
}
} else {
/*
* The block was not previously allocated,
* allocate a new block or fragment.
*/
if (ip->i_ffs1_size < lblktosize(fs, lbn + 1))
nsize = fragroundup(fs, size);
else
nsize = fs->fs_bsize;
error = ffs_alloc(ip, lbn,
ffs1_blkpref(ip, lbn, (int)lbn, &ip->i_ffs1_db[0]),
nsize, cred, &newb);
if (error)
return (error);
if (bpp != NULL) {
*bpp = getblk(vp, lbn, fs->fs_bsize, 0, INFSLP);
if (nsize < fs->fs_bsize)
(*bpp)->b_bcount = nsize;
(*bpp)->b_blkno = fsbtodb(fs, newb);
if (flags & B_CLRBUF)
clrbuf(*bpp);
}
if (DOINGSOFTDEP(vp))
softdep_setup_allocdirect(ip, lbn, newb, 0,
nsize, 0, bpp ? *bpp : NULL);
}
ip->i_ffs1_db[lbn] = newb;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
return (0);
}
/*
* Determine the number of levels of indirection.
*/
pref = 0;
if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
return(error);
#ifdef DIAGNOSTIC
if (num < 1)
panic ("ffs1_balloc: ufs_bmaparray returned indirect block");
#endif
/*
* Fetch the first indirect block allocating if necessary.
*/
--num;
nb = ip->i_ffs1_ib[indirs[0].in_off];
allocib = NULL;
allocblk = allociblk;
if (nb == 0) {
pref = ffs1_blkpref(ip, lbn, -indirs[0].in_off - 1, NULL);
error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
cred, &newb);
if (error)
goto fail;
nb = newb;
*allocblk++ = nb;
bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, INFSLP);
bp->b_blkno = fsbtodb(fs, nb);
clrbuf(bp);
if (DOINGSOFTDEP(vp)) {
softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
newb, 0, fs->fs_bsize, 0, bp);
bdwrite(bp);
} else {
/*
* Write synchronously so that indirect blocks
* never point at garbage.
*/
if ((error = bwrite(bp)) != 0)
goto fail;
}
allocib = &ip->i_ffs1_ib[indirs[0].in_off];
*allocib = nb;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
/*
* Fetch through the indirect blocks, allocating as necessary.
*/
for (i = 1;;) {
error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, &bp);
if (error) {
brelse(bp);
goto fail;
}
bap = (int32_t *)bp->b_data;
nb = bap[indirs[i].in_off];
if (i == num)
break;
i++;
if (nb != 0) {
brelse(bp);
continue;
}
if (pref == 0)
pref = ffs1_blkpref(ip, lbn, i - num - 1, NULL);
error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
&newb);
if (error) {
brelse(bp);
goto fail;
}
nb = newb;
*allocblk++ = nb;
nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, INFSLP);
nbp->b_blkno = fsbtodb(fs, nb);
clrbuf(nbp);
if (DOINGSOFTDEP(vp)) {
softdep_setup_allocindir_meta(nbp, ip, bp,
indirs[i - 1].in_off, nb);
bdwrite(nbp);
} else {
/*
* Write synchronously so that indirect blocks
* never point at garbage.
*/
if ((error = bwrite(nbp)) != 0) {
brelse(bp);
goto fail;
}
}
bap[indirs[i - 1].in_off] = nb;
if (allocib == NULL && unwindidx < 0)
unwindidx = i - 1;
/*
* If required, write synchronously, otherwise use
* delayed write.
*/
if (flags & B_SYNC) {
bwrite(bp);
} else {
bdwrite(bp);
}
}
/*
* Get the data block, allocating if necessary.
*/
if (nb == 0) {
pref = ffs1_blkpref(ip, lbn, indirs[i].in_off, &bap[0]);
error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
&newb);
if (error) {
brelse(bp);
goto fail;
}
nb = newb;
*allocblk++ = nb;
if (bpp != NULL) {
nbp = getblk(vp, lbn, fs->fs_bsize, 0, INFSLP);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & B_CLRBUF)
clrbuf(nbp);
*bpp = nbp;
}
if (DOINGSOFTDEP(vp))
softdep_setup_allocindir_page(ip, lbn, bp,
indirs[i].in_off, nb, 0, bpp ? *bpp : NULL);
bap[indirs[i].in_off] = nb;
/*
* If required, write synchronously, otherwise use
* delayed write.
*/
if (flags & B_SYNC) {
bwrite(bp);
} else {
bdwrite(bp);
}
return (0);
}
brelse(bp);
if (bpp != NULL) {
if (flags & B_CLRBUF) {
error = bread(vp, lbn, (int)fs->fs_bsize, &nbp);
if (error) {
brelse(nbp);
goto fail;
}
} else {
nbp = getblk(vp, lbn, fs->fs_bsize, 0, INFSLP);
nbp->b_blkno = fsbtodb(fs, nb);
}
*bpp = nbp;
}
return (0);
fail:
/*
* If we have failed to allocate any blocks, simply return the error.
* This is the usual case and avoids the need to fsync the file.
*/
if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
return (error);
/*
* If we have failed part way through block allocation, we have to
* deallocate any indirect blocks that we have allocated. We have to
* fsync the file before we start to get rid of all of its
* dependencies so that we do not leave them dangling. We have to sync
* it at the end so that the softdep code does not find any untracked
* changes. Although this is really slow, running out of disk space is
* not expected to be a common occurrence. The error return from fsync
* is ignored as we already have an error to return to the user.
*/
VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p);
for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
ffs_blkfree(ip, *blkp, fs->fs_bsize);
deallocated += fs->fs_bsize;
}
if (allocib != NULL) {
*allocib = 0;
} else if (unwindidx >= 0) {
int r;
r = bread(vp, indirs[unwindidx].in_lbn, (int)fs->fs_bsize, &bp);
if (r)
panic("Could not unwind indirect block, error %d", r);
bap = (int32_t *)bp->b_data;
bap[indirs[unwindidx].in_off] = 0;
if (flags & B_SYNC) {
bwrite(bp);
} else {
bdwrite(bp);
}
}
if (deallocated) {
/*
* Restore user's disk quota because allocation failed.
*/
(void)ufs_quota_free_blocks(ip, btodb(deallocated), cred);
ip->i_ffs1_blocks -= btodb(deallocated);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p);
return (error);
}
#ifdef FFS2
int
ffs2_balloc(struct inode *ip, off_t off, int size, struct ucred *cred,
int flags, struct buf **bpp)
{
daddr_t lbn, lastlbn, nb, newb, *blkp;
daddr_t pref, *allocblk, allociblk[NIADDR + 1];
daddr_t *bap, *allocib;
int deallocated, osize, nsize, num, i, error, unwindidx, r;
struct buf *bp, *nbp;
struct indir indirs[NIADDR + 2];
struct fs *fs;
struct vnode *vp;
struct proc *p;
vp = ITOV(ip);
fs = ip->i_fs;
p = curproc;
unwindidx = -1;
lbn = lblkno(fs, off);
size = blkoff(fs, off) + size;
if (size > fs->fs_bsize)
panic("ffs2_balloc: block too big"); if (bpp != NULL) *bpp = NULL; if (lbn < 0)
return (EFBIG);
/*
* If the next write will extend the file into a new block, and the
* file is currently composed of a fragment, this fragment has to be
* extended to be a full block.
*/
lastlbn = lblkno(fs, ip->i_ffs2_size);
if (lastlbn < NDADDR && lastlbn < lbn) {
nb = lastlbn;
osize = blksize(fs, ip, nb);
if (osize < fs->fs_bsize && osize > 0) {
error = ffs_realloccg(ip, nb, ffs2_blkpref(ip,
lastlbn, nb, &ip->i_ffs2_db[0]), osize,
(int) fs->fs_bsize, cred, bpp, &newb);
if (error)
return (error);
if (DOINGSOFTDEP(vp))
softdep_setup_allocdirect(ip, nb, newb,
ip->i_ffs2_db[nb], fs->fs_bsize, osize,
bpp ? *bpp : NULL);
ip->i_ffs2_size = lblktosize(fs, nb + 1);
uvm_vnp_setsize(vp, ip->i_ffs2_size);
ip->i_ffs2_db[nb] = newb;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp) {
if (flags & B_SYNC)
bwrite(*bpp);
else
bawrite(*bpp);
}
}
}
/*
* The first NDADDR blocks are direct.
*/
if (lbn < NDADDR) {
nb = ip->i_ffs2_db[lbn];
if (nb != 0 && ip->i_ffs2_size >= lblktosize(fs, lbn + 1)) {
/*
* The direct block is already allocated and the file
* extends past this block, thus this must be a whole
* block. Just read it, if requested.
*/
if (bpp != NULL) {
error = bread(vp, lbn, fs->fs_bsize, bpp);
if (error) {
brelse(*bpp);
return (error);
}
}
return (0);
}
if (nb != 0) {
/*
* Consider the need to allocate a fragment.
*/
osize = fragroundup(fs, blkoff(fs, ip->i_ffs2_size));
nsize = fragroundup(fs, size);
if (nsize <= osize) {
/*
* The existing block is already at least as
* big as we want. Just read it, if requested.
*/
if (bpp != NULL) {
error = bread(vp, lbn, fs->fs_bsize,
bpp);
if (error) {
brelse(*bpp);
return (error);
}
buf_adjcnt((*bpp), osize);
}
return (0);
} else {
/*
* The existing block is smaller than we want,
* grow it.
*/
error = ffs_realloccg(ip, lbn,
ffs2_blkpref(ip, lbn, (int) lbn,
&ip->i_ffs2_db[0]), osize, nsize, cred,
bpp, &newb);
if (error)
return (error);
if (DOINGSOFTDEP(vp))
softdep_setup_allocdirect(ip, lbn,
newb, nb, nsize, osize,
bpp ? *bpp : NULL);
}
} else {
/*
* The block was not previously allocated, allocate a
* new block or fragment.
*/
if (ip->i_ffs2_size < lblktosize(fs, lbn + 1))
nsize = fragroundup(fs, size);
else
nsize = fs->fs_bsize;
error = ffs_alloc(ip, lbn, ffs2_blkpref(ip, lbn,
(int) lbn, &ip->i_ffs2_db[0]), nsize, cred, &newb);
if (error)
return (error);
if (bpp != NULL) {
bp = getblk(vp, lbn, fs->fs_bsize, 0, INFSLP);
if (nsize < fs->fs_bsize)
bp->b_bcount = nsize;
bp->b_blkno = fsbtodb(fs, newb);
if (flags & B_CLRBUF)
clrbuf(bp);
*bpp = bp;
}
if (DOINGSOFTDEP(vp))
softdep_setup_allocdirect(ip, lbn, newb, 0,
nsize, 0, bpp ? *bpp : NULL);
}
ip->i_ffs2_db[lbn] = newb;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
return (0);
}
/*
* Determine the number of levels of indirection.
*/
pref = 0;
error = ufs_getlbns(vp, lbn, indirs, &num);
if (error)
return (error);
#ifdef DIAGNOSTIC
if (num < 1)
panic("ffs2_balloc: ufs_bmaparray returned indirect block");
#endif
/*
* Fetch the first indirect block allocating if necessary.
*/
--num;
nb = ip->i_ffs2_ib[indirs[0].in_off];
allocib = NULL;
allocblk = allociblk;
if (nb == 0) {
pref = ffs2_blkpref(ip, lbn, -indirs[0].in_off - 1, NULL);
error = ffs_alloc(ip, lbn, pref, (int) fs->fs_bsize, cred,
&newb);
if (error)
goto fail;
nb = newb;
*allocblk++ = nb;
bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, INFSLP);
bp->b_blkno = fsbtodb(fs, nb);
clrbuf(bp);
if (DOINGSOFTDEP(vp)) {
softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
newb, 0, fs->fs_bsize, 0, bp);
bdwrite(bp);
} else {
/*
* Write synchronously so that indirect blocks never
* point at garbage.
*/
error = bwrite(bp);
if (error)
goto fail;
}
unwindidx = 0;
allocib = &ip->i_ffs2_ib[indirs[0].in_off];
*allocib = nb;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
/*
* Fetch through the indirect blocks, allocating as necessary.
*/
for (i = 1;;) {
error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, &bp);
if (error) {
brelse(bp);
goto fail;
}
bap = (int64_t *) bp->b_data;
nb = bap[indirs[i].in_off];
if (i == num)
break;
i++;
if (nb != 0) {
brelse(bp);
continue;
}
if (pref == 0)
pref = ffs2_blkpref(ip, lbn, i - num - 1, NULL);
error = ffs_alloc(ip, lbn, pref, (int) fs->fs_bsize, cred,
&newb);
if (error) {
brelse(bp);
goto fail;
}
nb = newb;
*allocblk++ = nb;
nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, INFSLP);
nbp->b_blkno = fsbtodb(fs, nb);
clrbuf(nbp);
if (DOINGSOFTDEP(vp)) {
softdep_setup_allocindir_meta(nbp, ip, bp,
indirs[i - 1].in_off, nb);
bdwrite(nbp);
} else {
/*
* Write synchronously so that indirect blocks never
* point at garbage.
*/
error = bwrite(nbp);
if (error) {
brelse(bp);
goto fail;
}
}
if (unwindidx < 0)
unwindidx = i - 1;
bap[indirs[i - 1].in_off] = nb;
/*
* If required, write synchronously, otherwise use delayed
* write.
*/
if (flags & B_SYNC)
bwrite(bp);
else
bdwrite(bp);
}
/*
* Get the data block, allocating if necessary.
*/
if (nb == 0) {
pref = ffs2_blkpref(ip, lbn, indirs[num].in_off, &bap[0]);
error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
&newb);
if (error) {
brelse(bp);
goto fail;
}
nb = newb;
*allocblk++ = nb;
if (bpp != NULL) {
nbp = getblk(vp, lbn, fs->fs_bsize, 0, INFSLP);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & B_CLRBUF)
clrbuf(nbp);
*bpp = nbp;
}
if (DOINGSOFTDEP(vp))
softdep_setup_allocindir_page(ip, lbn, bp,
indirs[num].in_off, nb, 0, bpp ? *bpp : NULL);
bap[indirs[num].in_off] = nb;
if (allocib == NULL && unwindidx < 0)
unwindidx = i - 1;
/*
* If required, write synchronously, otherwise use delayed
* write.
*/
if (flags & B_SYNC)
bwrite(bp);
else
bdwrite(bp);
return (0);
}
brelse(bp);
if (bpp != NULL) {
if (flags & B_CLRBUF) {
error = bread(vp, lbn, (int)fs->fs_bsize, &nbp);
if (error) {
brelse(nbp);
goto fail;
}
} else {
nbp = getblk(vp, lbn, fs->fs_bsize, 0, INFSLP);
nbp->b_blkno = fsbtodb(fs, nb);
clrbuf(nbp);
}
*bpp = nbp;
}
return (0);
fail:
/*
* If we have failed to allocate any blocks, simply return the error.
* This is the usual case and avoids the need to fsync the file.
*/
if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
return (error);
/*
* If we have failed part way through block allocation, we have to
* deallocate any indirect blocks that we have allocated. We have to
* fsync the file before we start to get rid of all of its
* dependencies so that we do not leave them dangling. We have to sync
* it at the end so that the softdep code does not find any untracked
* changes. Although this is really slow, running out of disk space is
* not expected to be a common occurrence. The error return from fsync
* is ignored as we already have an error to return to the user.
*/
VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p);
if (unwindidx >= 0) {
/*
* First write out any buffers we've created to resolve their
* softdeps. This must be done in reverse order of creation so
* that we resolve the dependencies in one pass.
* Write the cylinder group buffers for these buffers too.
*/
for (i = num; i >= unwindidx; i--) {
if (i == 0)
break;
bp = getblk(vp, indirs[i].in_lbn, (int) fs->fs_bsize,
0, INFSLP);
if (bp->b_flags & B_DELWRI) {
nb = fsbtodb(fs, cgtod(fs, dtog(fs,
dbtofsb(fs, bp->b_blkno))));
bwrite(bp);
bp = getblk(ip->i_devvp, nb,
(int) fs->fs_cgsize, 0, INFSLP);
if (bp->b_flags & B_DELWRI)
bwrite(bp);
else {
bp->b_flags |= B_INVAL;
brelse(bp);
}
} else {
bp->b_flags |= B_INVAL;
brelse(bp);
}
}
if (DOINGSOFTDEP(vp) && unwindidx == 0) {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
ffs_update(ip, 1);
}
/*
* Now that any dependencies that we created have been
* resolved, we can undo the partial allocation.
*/
if (unwindidx == 0) {
*allocib = 0;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (DOINGSOFTDEP(vp))
ffs_update(ip, 1);
} else {
r = bread(vp, indirs[unwindidx].in_lbn,
(int)fs->fs_bsize, &bp);
if (r)
panic("ffs2_balloc: unwind failed"); bap = (int64_t *) bp->b_data;
bap[indirs[unwindidx].in_off] = 0;
bwrite(bp);
}
for (i = unwindidx + 1; i <= num; i++) {
bp = getblk(vp, indirs[i].in_lbn, (int)fs->fs_bsize, 0,
INFSLP);
bp->b_flags |= B_INVAL;
brelse(bp);
}
}
for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
ffs_blkfree(ip, *blkp, fs->fs_bsize);
deallocated += fs->fs_bsize;
}
if (deallocated) {
/*
* Restore user's disk quota because allocation failed.
*/
(void) ufs_quota_free_blocks(ip, btodb(deallocated), cred);
ip->i_ffs2_blocks -= btodb(deallocated);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p);
return (error);
}
#endif /* FFS2 */
/*
* Balloc defines the structure of file system storage by allocating the
* physical blocks given the inode and the logical block number in a file.
*/
int
ffs_balloc(struct inode *ip, off_t off, int size, struct ucred *cred,
int flags, struct buf **bpp)
{
#ifdef FFS2
if (ip->i_fs->fs_magic == FS_UFS2_MAGIC)
return (ffs2_balloc(ip, off, size, cred, flags, bpp));
else
#endif
return (ffs1_balloc(ip, off, size, cred, flags, bpp));
}
/* $OpenBSD: kern_subr.c,v 1.51 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_subr.c,v 1.15 1996/04/09 17:21:56 ragge Exp $ */
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/malloc.h>
#include <sys/queue.h>
int
uiomove(void *cp, size_t n, struct uio *uio)
{
struct iovec *iov;
size_t cnt;
int error = 0;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE)
panic("uiomove: mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("uiomove: proc");
#endif
if (n > uio->uio_resid)
n = uio->uio_resid;
while (n > 0) {
iov = uio->uio_iov;
cnt = iov->iov_len;
if (cnt == 0) {
KASSERT(uio->uio_iovcnt > 0);
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
if (cnt > n)
cnt = n;
switch (uio->uio_segflg) {
case UIO_USERSPACE:
sched_pause(preempt);
if (uio->uio_rw == UIO_READ)
error = copyout(cp, iov->iov_base, cnt);
else
error = copyin(iov->iov_base, cp, cnt);
if (error)
return (error);
break;
case UIO_SYSSPACE:
if (uio->uio_rw == UIO_READ)
error = kcopy(cp, iov->iov_base, cnt);
else
error = kcopy(iov->iov_base, cp, cnt);
if (error)
return(error);
}
iov->iov_base = (caddr_t)iov->iov_base + cnt;
iov->iov_len -= cnt;
uio->uio_resid -= cnt;
uio->uio_offset += cnt;
cp = (caddr_t)cp + cnt;
n -= cnt;
}
return (error);
}
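/*
 * Hedged usage sketch (illustrative only, all names are hypothetical): a
 * character device read routine typically fills a local buffer and lets
 * uiomove() copy it into the caller's iovecs, updating uio_resid and
 * uio_offset as it goes.
 */
#if 0
int
exampleread(dev_t dev, struct uio *uio, int ioflag)
{
char buf[64];
size_t n;
n = ulmin(sizeof(buf), uio->uio_resid);
/* fill buf[0..n) from the device, then hand it to the caller */
return (uiomove(buf, n, uio));
}
#endif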
/*
* Give next character to user as result of read.
*/
int
ureadc(int c, struct uio *uio)
{
struct iovec *iov;
if (uio->uio_resid == 0)
#ifdef DIAGNOSTIC
panic("ureadc: zero resid");
#else
return (EINVAL);
#endif
again:
if (uio->uio_iovcnt <= 0)
#ifdef DIAGNOSTIC
panic("ureadc: non-positive iovcnt");
#else
return (EINVAL);
#endif
iov = uio->uio_iov;
if (iov->iov_len <= 0) {
uio->uio_iovcnt--;
uio->uio_iov++;
goto again;
}
switch (uio->uio_segflg) {
case UIO_USERSPACE:
{
char tmp = c;
if (copyout(&tmp, iov->iov_base, sizeof(char)) != 0)
return (EFAULT);
}
break;
case UIO_SYSSPACE:
*(char *)iov->iov_base = c;
break;
}
iov->iov_base = (caddr_t)iov->iov_base + 1;
iov->iov_len--;
uio->uio_resid--;
uio->uio_offset++;
return (0);
}
/*
* General routine to allocate a hash table.
*/
void *
hashinit(int elements, int type, int flags, u_long *hashmask)
{
u_long hashsize, i;
LIST_HEAD(generic, generic) *hashtbl;
if (elements <= 0)
panic("hashinit: bad cnt"); if ((elements & (elements - 1)) == 0)
hashsize = elements;
else
for (hashsize = 1; hashsize < elements; hashsize <<= 1)
continue;
hashtbl = mallocarray(hashsize, sizeof(*hashtbl), type, flags);
if (hashtbl == NULL)
return NULL;
for (i = 0; i < hashsize; i++)
LIST_INIT(&hashtbl[i]);
*hashmask = hashsize - 1;
return (hashtbl);
}
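/*
 * Hedged usage sketch (illustrative only, the "example" names are
 * hypothetical): callers keep the returned mask and select a bucket with
 * "hash & mask", which works because hashsize is rounded up to a power of
 * two above.
 */
#if 0
u_long example_mask;
LIST_HEAD(example_head, example_entry) *example_tbl;
example_tbl = hashinit(128, M_TEMP, M_WAITOK, &example_mask);
/* "key" stands in for whatever hash value the caller computed */
struct example_head *bucket = &example_tbl[key & example_mask];
#endif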
void
hashfree(void *hash, int elements, int type)
{
u_long hashsize;
LIST_HEAD(generic, generic) *hashtbl = hash;
if (elements <= 0)
panic("hashfree: bad cnt"); if ((elements & (elements - 1)) == 0)
hashsize = elements;
else
for (hashsize = 1; hashsize < elements; hashsize <<= 1)
continue;
free(hashtbl, type, sizeof(*hashtbl) * hashsize);
}
/*
* "startup hook" types, functions, and variables.
*/
struct hook_desc_head startuphook_list =
TAILQ_HEAD_INITIALIZER(startuphook_list);
void *
hook_establish(struct hook_desc_head *head, int tail, void (*fn)(void *),
void *arg)
{
struct hook_desc *hdp;
hdp = malloc(sizeof(*hdp), M_DEVBUF, M_NOWAIT);
if (hdp == NULL)
return (NULL);
hdp->hd_fn = fn;
hdp->hd_arg = arg;
if (tail)
TAILQ_INSERT_TAIL(head, hdp, hd_list);
else
TAILQ_INSERT_HEAD(head, hdp, hd_list);
return (hdp);
}
void
hook_disestablish(struct hook_desc_head *head, void *vhook)
{
struct hook_desc *hdp;
#ifdef DIAGNOSTIC
for (hdp = TAILQ_FIRST(head); hdp != NULL;
hdp = TAILQ_NEXT(hdp, hd_list))
if (hdp == vhook)
break;
if (hdp == NULL)
return;
#endif
hdp = vhook;
TAILQ_REMOVE(head, hdp, hd_list);
free(hdp, M_DEVBUF, sizeof(*hdp));
}
/*
* Run hooks. Startup hooks are invoked right after scheduler_start but
* before root is mounted. Shutdown hooks are invoked immediately before the
* system is halted or rebooted, i.e. after file systems unmounted,
* after crash dump done, etc.
*/
void
dohooks(struct hook_desc_head *head, int flags)
{
struct hook_desc *hdp, *hdp_temp;
if ((flags & HOOK_REMOVE) == 0) {
TAILQ_FOREACH_SAFE(hdp, head, hd_list, hdp_temp) {
(*hdp->hd_fn)(hdp->hd_arg);
}
} else {
while ((hdp = TAILQ_FIRST(head)) != NULL) {
TAILQ_REMOVE(head, hdp, hd_list);
(*hdp->hd_fn)(hdp->hd_arg);
if ((flags & HOOK_FREE) != 0)
free(hdp, M_DEVBUF, sizeof(*hdp));
}
}
}
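/*
 * Hedged usage sketch (illustrative only, the callback and softc names
 * are hypothetical): a subsystem queues work to run at startup by
 * registering a hook on startuphook_list; the opaque handle returned by
 * hook_establish() is later passed to hook_disestablish() to remove it.
 */
#if 0
void *hook;
hook = hook_establish(&startuphook_list, 1, example_startup, sc);
if (hook == NULL)
printf("can't establish startup hook\n");
#endif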
/* $OpenBSD: in4_cksum.c,v 1.11 2022/02/01 15:30:10 miod Exp $ */
/* $KAME: in4_cksum.c,v 1.10 2001/11/30 10:06:15 itojun Exp $ */
/* $NetBSD: in_cksum.c,v 1.13 1996/10/13 02:03:03 christos Exp $ */
/*
* Copyright (C) 1999 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988, 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
/*
* Checksum routine for Internet Protocol family headers (Portable Version).
* This is only for IPv4 pseudo header checksum.
* No need to clear non-pseudo-header fields in IPv4 header.
* len is for actual payload size, and does not include IPv4 header and
* skipped header chain (off + len should be equal to the whole packet).
*
* This routine is very heavily used in the network
* code and should be modified for each CPU to be as fast as possible.
*/
#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
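/*
 * Worked example of the folding done by REDUCE and ADDCARRY (illustrative):
 * summing 16-bit words in a 32-bit accumulator can overflow 16 bits, e.g.
 * 0xffff + 0x0002 = 0x10001.  REDUCE adds the two 16-bit halves
 * (0x0001 + 0x0001 = 0x0002) and ADDCARRY folds any remaining end-around
 * carry, matching the one's-complement addition the Internet checksum
 * requires.
 */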
int
in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
{
u_int16_t *w;
int sum = 0;
int mlen = 0;
int byte_swapped = 0;
union {
struct ipovly ipov;
u_int16_t w[10];
} u;
union {
u_int8_t c[2];
u_int16_t s;
} s_util;
union {
u_int16_t s[2];
u_int32_t l;
} l_util;
if (nxt != 0) {
/* pseudo header */
if (off < sizeof(struct ipovly))
panic("in4_cksum: offset too short");
if (m->m_len < sizeof(struct ip))
panic("in4_cksum: bad mbuf chain");
u.ipov.ih_x1[8] = 0;
u.ipov.ih_pr = nxt;
u.ipov.ih_len = htons(len);
u.ipov.ih_src = mtod(m, struct ip *)->ip_src;
u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
w = u.w;
/* assumes sizeof(ipov) == 20 and first 8 bytes are zeroes */
sum += w[4]; sum += w[5]; sum += w[6];
sum += w[7]; sum += w[8]; sum += w[9];
}
/* skip unnecessary part */
while (m && off > 0) {
if (m->m_len > off)
break;
off -= m->m_len;
m = m->m_next;
}
for (; m && len; m = m->m_next) {
if (m->m_len == 0)
continue;
w = (u_int16_t *)(mtod(m, caddr_t) + off);
if (mlen == -1) {
/*
* The first byte of this mbuf is the continuation
* of a word spanning between this mbuf and the
* last mbuf.
*
* s_util.c[0] is already saved when scanning previous
* mbuf.
*/
s_util.c[1] = *(u_int8_t *)w;
sum += s_util.s;
w = (u_int16_t *)((u_int8_t *)w + 1);
mlen = m->m_len - off - 1;
len--;
} else
mlen = m->m_len - off;
off = 0;
if (len < mlen)
mlen = len;
len -= mlen;
/*
* Force to even boundary.
*/
if ((1 & (long) w) && (mlen > 0)) {
REDUCE;
sum <<= 8;
s_util.c[0] = *(u_int8_t *)w;
w = (u_int16_t *)((int8_t *)w + 1);
mlen--;
byte_swapped = 1;
}
/*
* Unroll the loop to make overhead from
* branches &c small.
*/
while ((mlen -= 32) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
w += 16;
}
mlen += 32;
while ((mlen -= 8) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
w += 4;
}
mlen += 8;
if (mlen == 0 && byte_swapped == 0)
continue;
REDUCE;
while ((mlen -= 2) >= 0) {
sum += *w++;
}
if (byte_swapped) {
REDUCE;
sum <<= 8;
byte_swapped = 0;
if (mlen == -1) {
s_util.c[1] = *(u_int8_t *)w;
sum += s_util.s;
mlen = 0;
} else
mlen = -1;
} else if (mlen == -1)
s_util.c[0] = *(u_int8_t *)w;
}
if (len)
printf("cksum4: out of data\n");
if (mlen == -1) {
/* The last mbuf has odd # of bytes. Follow the
standard (the odd byte may be shifted left by 8 bits
or not as determined by endian-ness of the machine) */
s_util.c[1] = 0;
sum += s_util.s;
}
REDUCE;
return (~sum & 0xffff);
}
/* $OpenBSD: ip6_id.c,v 1.16 2021/03/10 10:21:49 jsg Exp $ */
/* $NetBSD: ip6_id.c,v 1.7 2003/09/13 21:32:59 itojun Exp $ */
/* $KAME: ip6_id.c,v 1.8 2003/09/06 13:41:06 itojun Exp $ */
/*
* Copyright (C) 2003 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright 1998 Niels Provos <provos@citi.umich.edu>
* All rights reserved.
*
* Theo de Raadt <deraadt@openbsd.org> came up with the idea of using
* such a mathematical system to generate more random (yet non-repeating)
* ids to solve the resolver/named problem. But Niels designed the
* actual system based on the constraints.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* seed = random (bits - 1) bit
* n = prime, g0 = generator to n,
* j = random so that gcd(j,n-1) == 1
* g = g0^j mod n will be a generator again.
*
* X[0] = random seed.
* X[n] = a*X[n-1]+b mod m is a Linear Congruential Generator
* with a = 7^(even random) mod m,
* b = random with gcd(b,m) == 1
* m = constant and a maximal period of m-1.
*
* The transaction id is determined by:
* id[n] = seed xor (g^X[n] mod n)
*
* Effectively the id is restricted to the lower (bits - 1) bits, thus
* yielding two different cycles by toggling the msb on and off.
* This avoids reuse issues caused by reseeding.
*/
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
struct randomtab {
const int ru_bits; /* resulting bits */
const long ru_out; /* Time after which will be reseeded */
const u_int32_t ru_max; /* Uniq cycle, avoid blackjack prediction */
const u_int32_t ru_gen; /* Starting generator */
const u_int32_t ru_n; /* ru_n: prime, ru_n - 1: product of pfacts[] */
const u_int32_t ru_agen; /* determine ru_a as ru_agen^(2*rand) */
const u_int32_t ru_m; /* ru_m = 2^x*3^y */
const u_int32_t pfacts[4]; /* factors of ru_n */
u_int32_t ru_counter;
u_int32_t ru_msb;
u_int32_t ru_x;
u_int32_t ru_seed, ru_seed2;
u_int32_t ru_a, ru_b;
u_int32_t ru_g;
long ru_reseed;
};
static struct randomtab randomtab_20 = {
20, /* resulting bits */
180, /* Time after which will be reseeded */
200000, /* Uniq cycle, avoid blackjack prediction */
2, /* Starting generator */
524269, /* RU_N-1 = 2^2*3^2*14563 */
7, /* determine ru_a as RU_AGEN^(2*rand) */
279936, /* RU_M = 2^7*3^7 - don't change */
{ 2, 3, 14563, 0 }, /* factors of ru_n */
};
u_int32_t ip6id_pmod(u_int32_t, u_int32_t, u_int32_t);
void ip6id_initid(struct randomtab *);
u_int32_t ip6id_randomid(struct randomtab *);
/*
* Do a fast modular exponentiation; the returned value will be in the range
* of 0 - (mod-1)
*/
u_int32_t
ip6id_pmod(u_int32_t gen, u_int32_t expo, u_int32_t mod)
{
u_int64_t s, t, u;
s = 1;
t = gen;
u = expo;
while (u) {
if (u & 1)
s = (s * t) % mod;
u >>= 1;
t = (t * t) % mod;
}
return (s);
}
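/*
 * Worked example of the square-and-multiply loop above (illustrative):
 * ip6id_pmod(2, 10, 1000) scans the exponent bits of 10 (binary 1010),
 * squaring t at every step and folding it into s for the set bits,
 * yielding 2^10 mod 1000 = 24.
 */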
/*
* Initializes the seed and chooses a suitable generator. Also toggles
* the msb flag. The msb flag is used to generate two distinct
* cycles of random numbers and thus avoiding reuse of ids.
*
* This function is called from ip6id_randomid() when needed; an
* application does not have to worry about it.
*/
void
ip6id_initid(struct randomtab *p)
{
u_int32_t j, i;
int noprime = 1;
p->ru_x = arc4random_uniform(p->ru_m);
/* (bits - 1) bits of random seed */
p->ru_seed = arc4random() & (~0U >> (32 - p->ru_bits + 1));
p->ru_seed2 = arc4random() & (~0U >> (32 - p->ru_bits + 1));
/* Determine the LCG we use */
p->ru_b = (arc4random() & (~0U >> (32 - p->ru_bits))) | 1;
p->ru_a = ip6id_pmod(p->ru_agen,
(arc4random() & (~0U >> (32 - p->ru_bits))) & (~1U), p->ru_m);
while (p->ru_b % 3 == 0)
p->ru_b += 2;
j = arc4random_uniform(p->ru_n);
/*
* Do a fast gcd(j, RU_N - 1), so we can find a j with
* gcd(j, RU_N - 1) == 1, giving a new generator for
* RU_GEN^j mod RU_N
*/
while (noprime) {
for (i = 0; p->pfacts[i] > 0; i++)
if (j % p->pfacts[i] == 0)
break;
if (p->pfacts[i] == 0)
noprime = 0;
else
j = (j + 1) % p->ru_n;
}
p->ru_g = ip6id_pmod(p->ru_gen, j, p->ru_n);
p->ru_counter = 0;
p->ru_reseed = getuptime() + p->ru_out;
p->ru_msb = p->ru_msb ? 0 : (1U << (p->ru_bits - 1));
}
u_int32_t
ip6id_randomid(struct randomtab *p)
{
int i, n;
if (p->ru_counter >= p->ru_max || getuptime() > p->ru_reseed)
ip6id_initid(p);
/* Skip a random number of ids */
n = arc4random() & 0x3;
if (p->ru_counter + n >= p->ru_max)
ip6id_initid(p);
for (i = 0; i <= n; i++) {
/* Linear Congruential Generator */
p->ru_x = (u_int32_t)((u_int64_t)p->ru_a * p->ru_x + p->ru_b) % p->ru_m;
}
p->ru_counter += i;
return (p->ru_seed ^ ip6id_pmod(p->ru_g, p->ru_seed2 + p->ru_x, p->ru_n)) |
p->ru_msb;
}
u_int32_t
ip6_randomflowlabel(void)
{
return ip6id_randomid(&randomtab_20) & 0xfffff;
}
/* $OpenBSD: subr_evcount.c,v 1.13 2022/08/14 01:58:28 jsg Exp $ */
/*
* Copyright (c) 2004 Artur Grabowski <art@openbsd.org>
* Copyright (c) 2004 Aaron Campbell <aaron@openbsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/evcount.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
static TAILQ_HEAD(,evcount) evcount_list = TAILQ_HEAD_INITIALIZER(evcount_list);
void
evcount_attach(struct evcount *ec, const char *name, void *data)
{
static int nextid = 0;
memset(ec, 0, sizeof(*ec));
ec->ec_name = name;
ec->ec_id = ++nextid;
ec->ec_data = data;
TAILQ_INSERT_TAIL(&evcount_list, ec, next);
}
void
evcount_detach(struct evcount *ec)
{
TAILQ_REMOVE(&evcount_list, ec, next);
}
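/*
 * Hedged usage sketch (illustrative only, the softc fields are
 * hypothetical): a driver attaches one counter per interrupt source at
 * attach time and increments ec_count from its handler, which is what the
 * sysctl interface below reports as KERN_INTRCNT_CNT.
 */
#if 0
evcount_attach(&sc->sc_intrcnt, sc->sc_dev.dv_xname, &sc->sc_vector);
/* later, in the interrupt handler */
sc->sc_intrcnt.ec_count++;
#endif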
#ifndef SMALL_KERNEL
int
evcount_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
int error = 0, s, nintr, i;
struct evcount *ec;
u_int64_t count;
if (newp != NULL)
return (EPERM);
if (name[0] != KERN_INTRCNT_NUM) {
if (namelen != 2)
return (ENOTDIR);
if (name[1] < 0)
return (EINVAL);
i = name[1];
} else
i = -1;
nintr = 0;
TAILQ_FOREACH(ec, &evcount_list, next) {
if (nintr++ == i)
break;
}
switch (name[0]) {
case KERN_INTRCNT_NUM:
error = sysctl_rdint(oldp, oldlenp, NULL, nintr);
break;
case KERN_INTRCNT_CNT:
if (ec == NULL)
return (ENOENT);
s = splhigh();
count = ec->ec_count;
splx(s);
error = sysctl_rdquad(oldp, oldlenp, NULL, count);
break;
case KERN_INTRCNT_NAME:
if (ec == NULL)
return (ENOENT);
error = sysctl_rdstring(oldp, oldlenp, NULL, ec->ec_name);
break;
case KERN_INTRCNT_VECTOR:
if (ec == NULL || ec->ec_data == NULL)
return (ENOENT);
error = sysctl_rdint(oldp, oldlenp, NULL,
*((int *)ec->ec_data));
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
#endif /* SMALL_KERNEL */
/* $OpenBSD: vfs_default.c,v 1.51 2022/04/27 14:52:25 claudio Exp $ */
/*
* Portions of this code are:
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/event.h>
#include <sys/specdev.h>
int filt_generic_readwrite(struct knote *, long);
void filt_generic_detach(struct knote *);
/*
* Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
int
vop_generic_revoke(void *v)
{
struct vop_revoke_args *ap = v;
struct vnode *vp, *vq;
struct proc *p = curproc;
#ifdef DIAGNOSTIC
if ((ap->a_flags & REVOKEALL) == 0)
panic("vop_generic_revoke");
#endif
vp = ap->a_vp;
while (vp->v_type == VBLK && vp->v_specinfo != NULL &&
vp->v_specmountpoint != NULL) {
struct mount *mp = vp->v_specmountpoint;
/*
* If we have a mount point associated with the vnode, we must
* flush it out now, so as not to leave a dangling zombie mount
* point lying around in the VFS.
*/
if (!vfs_busy(mp, VB_WRITE|VB_WAIT)) {
dounmount(mp, MNT_FORCE | MNT_DOOMED, p);
break;
}
}
if (vp->v_flag & VALIASED) {
/*
* If a vgone (or vclean) is already in progress,
* wait until it is done and return.
*/
mtx_enter(&vnode_mtx);
if (vp->v_lflag & VXLOCK) {
vp->v_lflag |= VXWANT;
msleep_nsec(vp, &vnode_mtx, PINOD,
"vop_generic_revokeall", INFSLP);
mtx_leave(&vnode_mtx);
return(0);
}
/*
* Ensure that vp will not be vgone'd while we
* are eliminating its aliases.
*/
vp->v_lflag |= VXLOCK;
mtx_leave(&vnode_mtx);
while (vp->v_flag & VALIASED) {
SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
if (vq->v_rdev != vp->v_rdev ||
vq->v_type != vp->v_type || vp == vq)
continue;
vgonel(vq, p);
break;
}
}
/*
* Remove the lock so that vgone below will
* really eliminate the vnode after which time
* vgone will awaken any sleepers.
*/
mtx_enter(&vnode_mtx);
vp->v_lflag &= ~VXLOCK;
mtx_leave(&vnode_mtx);
}
vgonel(vp, p);
return (0);
}
int
vop_generic_badop(void *v)
{
panic("%s", __func__);
}
int
vop_generic_bmap(void *v)
{
struct vop_bmap_args *ap = v;
if (ap->a_vpp)
*ap->a_vpp = ap->a_vp;
if (ap->a_bnp)
*ap->a_bnp = ap->a_bn;
if (ap->a_runp)
*ap->a_runp = 0;
return (0);
}
int
vop_generic_bwrite(void *v)
{
struct vop_bwrite_args *ap = v;
return (bwrite(ap->a_bp));
}
int
vop_generic_abortop(void *v)
{
struct vop_abortop_args *ap = v;
if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF)
pool_put(&namei_pool, ap->a_cnp->cn_pnbuf);
return (0);
}
const struct filterops generic_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
.f_detach = filt_generic_detach,
.f_event = filt_generic_readwrite,
};
int
vop_generic_kqfilter(void *v)
{
struct vop_kqfilter_args *ap = v;
struct knote *kn = ap->a_kn;
switch (kn->kn_filter) {
case EVFILT_READ:
case EVFILT_WRITE:
kn->kn_fop = &generic_filtops;
break;
default:
return (EINVAL);
}
return (0);
}
/* Trivial lookup routine that always fails. */
int
vop_generic_lookup(void *v)
{
struct vop_lookup_args *ap = v;
*ap->a_vpp = NULL;
return (ENOTDIR);
}
void
filt_generic_detach(struct knote *kn)
{
}
int
filt_generic_readwrite(struct knote *kn, long hint)
{
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE) {
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
return (1);
}
kn->kn_data = 0;
return (1);
}
/* $OpenBSD: ip_spd.c,v 1.117 2022/06/17 13:40:21 bluhm Exp $ */
/*
* The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu)
*
* Copyright (c) 2000-2001 Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/socketvar.h>
#include <sys/pool.h>
#include <sys/timeout.h>
#include <net/route.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_ipsp.h>
#include <net/pfkeyv2.h>
int ipsp_spd_inp(struct mbuf *, struct inpcb *, struct ipsec_policy *,
struct tdb **);
int ipsp_acquire_sa(struct ipsec_policy *, union sockaddr_union *,
union sockaddr_union *, struct sockaddr_encap *, struct mbuf *);
int ipsp_pending_acquire(struct ipsec_policy *, union sockaddr_union *);
void ipsp_delete_acquire_timer(void *);
void ipsp_delete_acquire_locked(struct ipsec_acquire *);
void ipsp_delete_acquire(struct ipsec_acquire *);
void ipsp_unref_acquire_locked(struct ipsec_acquire *);
struct pool ipsec_policy_pool;
struct pool ipsec_acquire_pool;
/*
* For tdb_walk() calling tdb_delete_locked() we need lock order
* tdb_sadb_mtx before ipo_tdb_mtx.
*/
struct mutex ipo_tdb_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
/* Protected by the NET_LOCK(). */
struct radix_node_head **spd_tables;
unsigned int spd_table_max;
struct mutex ipsec_acquire_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
struct ipsec_acquire_head ipsec_acquire_head =
TAILQ_HEAD_INITIALIZER(ipsec_acquire_head);
struct radix_node_head *
spd_table_get(unsigned int rtableid)
{
unsigned int rdomain;
NET_ASSERT_LOCKED();
if (spd_tables == NULL)
return (NULL);
rdomain = rtable_l2(rtableid);
if (rdomain > spd_table_max)
return (NULL);
return (spd_tables[rdomain]);
}
struct radix_node_head *
spd_table_add(unsigned int rtableid)
{
struct radix_node_head *rnh = NULL;
unsigned int rdomain;
void *p;
NET_ASSERT_LOCKED();
rdomain = rtable_l2(rtableid);
if (spd_tables == NULL || rdomain > spd_table_max) {
if ((p = mallocarray(rdomain + 1, sizeof(*rnh),
M_RTABLE, M_NOWAIT|M_ZERO)) == NULL)
return (NULL);
if (spd_tables != NULL) {
memcpy(p, spd_tables, sizeof(*rnh) * (spd_table_max+1));
free(spd_tables, M_RTABLE,
sizeof(*rnh) * (spd_table_max+1));
}
spd_tables = p;
spd_table_max = rdomain;
}
if (spd_tables[rdomain] == NULL) {
if (rn_inithead((void **)&rnh,
offsetof(struct sockaddr_encap, sen_type)) == 0)
rnh = NULL;
spd_tables[rdomain] = rnh;
}
return (spd_tables[rdomain]);
}
int
spd_table_walk(unsigned int rtableid,
int (*func)(struct ipsec_policy *, void *, unsigned int), void *arg)
{
struct radix_node_head *rnh;
int (*walker)(struct radix_node *, void *, u_int) = (void *)func;
int error;
rnh = spd_table_get(rtableid);
if (rnh == NULL)
return (0);
/* EAGAIN means the tree changed. */
while ((error = rn_walktree(rnh, walker, arg)) == EAGAIN)
continue;
return (error);
}
/*
* Lookup in the SPD based on the headers contained in the mbuf. The second
* argument indicates what protocol family the header at the beginning of
* the mbuf is. hlen is the offset of the transport protocol header
* in the mbuf.
*
* Return combinations (of return value and *tdbout):
* - -EINVAL -> silently drop the packet
* - errno -> drop packet and return error
* - 0/NULL -> no IPsec required on packet
* - 0/TDB -> do IPsec
*
* In the case of incoming flows, only the first three combinations are
* returned.
*/
int
ipsp_spd_lookup(struct mbuf *m, int af, int hlen, int direction,
struct tdb *tdbin, struct inpcb *inp, struct tdb **tdbout,
struct ipsec_ids *ipsecflowinfo_ids)
{
struct radix_node_head *rnh;
struct radix_node *rn;
union sockaddr_union sdst, ssrc;
struct sockaddr_encap *ddst, dst;
struct ipsec_policy *ipo;
struct ipsec_ids *ids = NULL;
int error, signore = 0, dignore = 0;
u_int rdomain;
NET_ASSERT_LOCKED();
/*
* If there are no flows in place, there's no point
* continuing with the SPD lookup.
*/
if (!ipsec_in_use)
return ipsp_spd_inp(m, inp, NULL, tdbout);
/*
* If an input packet is destined to a BYPASS socket, just accept it.
*/
if ((inp != NULL) && (direction == IPSP_DIRECTION_IN) &&
(inp->inp_seclevel[SL_ESP_TRANS] == IPSEC_LEVEL_BYPASS) &&
(inp->inp_seclevel[SL_ESP_NETWORK] == IPSEC_LEVEL_BYPASS) &&
(inp->inp_seclevel[SL_AUTH] == IPSEC_LEVEL_BYPASS)) {
if (tdbout != NULL)
*tdbout = NULL;
return 0;
}
memset(&dst, 0, sizeof(dst));
memset(&sdst, 0, sizeof(union sockaddr_union));
memset(&ssrc, 0, sizeof(union sockaddr_union));
ddst = (struct sockaddr_encap *)&dst;
ddst->sen_family = PF_KEY;
ddst->sen_len = SENT_LEN;
switch (af) {
case AF_INET:
if (hlen < sizeof (struct ip) || m->m_pkthdr.len < hlen)
return EINVAL;
ddst->sen_direction = direction;
ddst->sen_type = SENT_IP4;
m_copydata(m, offsetof(struct ip, ip_src),
sizeof(struct in_addr), (caddr_t) &(ddst->sen_ip_src));
m_copydata(m, offsetof(struct ip, ip_dst),
sizeof(struct in_addr), (caddr_t) &(ddst->sen_ip_dst));
m_copydata(m, offsetof(struct ip, ip_p), sizeof(u_int8_t),
(caddr_t) &(ddst->sen_proto));
sdst.sin.sin_family = ssrc.sin.sin_family = AF_INET;
sdst.sin.sin_len = ssrc.sin.sin_len =
sizeof(struct sockaddr_in);
ssrc.sin.sin_addr = ddst->sen_ip_src;
sdst.sin.sin_addr = ddst->sen_ip_dst;
/*
* If TCP/UDP, extract the port numbers to use in the lookup.
*/
switch (ddst->sen_proto) {
case IPPROTO_UDP:
case IPPROTO_TCP:
/* Make sure there's enough data in the packet. */
if (m->m_pkthdr.len < hlen + 2 * sizeof(u_int16_t))
return EINVAL;
/*
* Luckily, the offset of the src/dst ports in
* both the UDP and TCP headers is the same (first
* two 16-bit values in the respective headers),
* so we can just copy them.
*/
m_copydata(m, hlen, sizeof(u_int16_t),
(caddr_t) &(ddst->sen_sport));
m_copydata(m, hlen + sizeof(u_int16_t),
sizeof(u_int16_t),
(caddr_t) &(ddst->sen_dport));
break;
default:
ddst->sen_sport = 0;
ddst->sen_dport = 0;
}
break;
#ifdef INET6
case AF_INET6:
if (hlen < sizeof (struct ip6_hdr) || m->m_pkthdr.len < hlen)
return EINVAL;
ddst->sen_type = SENT_IP6;
ddst->sen_ip6_direction = direction;
m_copydata(m, offsetof(struct ip6_hdr, ip6_src),
sizeof(struct in6_addr),
(caddr_t) &(ddst->sen_ip6_src));
m_copydata(m, offsetof(struct ip6_hdr, ip6_dst),
sizeof(struct in6_addr),
(caddr_t) &(ddst->sen_ip6_dst));
m_copydata(m, offsetof(struct ip6_hdr, ip6_nxt),
sizeof(u_int8_t),
(caddr_t) &(ddst->sen_ip6_proto));
sdst.sin6.sin6_family = ssrc.sin6.sin6_family = AF_INET6;
sdst.sin6.sin6_len = ssrc.sin6.sin6_len =
sizeof(struct sockaddr_in6);
in6_recoverscope(&ssrc.sin6, &ddst->sen_ip6_src);
in6_recoverscope(&sdst.sin6, &ddst->sen_ip6_dst);
/*
* If TCP/UDP, extract the port numbers to use in the lookup.
*/
switch (ddst->sen_ip6_proto) {
case IPPROTO_UDP:
case IPPROTO_TCP:
/* Make sure there's enough data in the packet. */
if (m->m_pkthdr.len < hlen + 2 * sizeof(u_int16_t))
return EINVAL;
/*
* Luckily, the offset of the src/dst ports in
* both the UDP and TCP headers is the same
* (first two 16-bit values in the respective
* headers), so we can just copy them.
*/
m_copydata(m, hlen, sizeof(u_int16_t),
(caddr_t) &(ddst->sen_ip6_sport));
m_copydata(m, hlen + sizeof(u_int16_t),
sizeof(u_int16_t),
(caddr_t) &(ddst->sen_ip6_dport));
break;
default:
ddst->sen_ip6_sport = 0;
ddst->sen_ip6_dport = 0;
}
break;
#endif /* INET6 */
default:
return EAFNOSUPPORT;
}
/* Actual SPD lookup. */
rdomain = rtable_l2(m->m_pkthdr.ph_rtableid);
if ((rnh = spd_table_get(rdomain)) == NULL ||
(rn = rn_match((caddr_t)&dst, rnh)) == NULL) {
/*
* Return whatever the socket requirements are, there are no
* system-wide policies.
*/
return ipsp_spd_inp(m, inp, NULL, tdbout);
}
ipo = (struct ipsec_policy *)rn;
switch (ipo->ipo_type) {
case IPSP_PERMIT:
return ipsp_spd_inp(m, inp, ipo, tdbout);
case IPSP_DENY:
return EHOSTUNREACH;
case IPSP_IPSEC_USE:
case IPSP_IPSEC_ACQUIRE:
case IPSP_IPSEC_REQUIRE:
case IPSP_IPSEC_DONTACQ:
/* Nothing more needed here. */
break;
default:
return EINVAL;
}
/* Check for non-specific destination in the policy. */
switch (ipo->ipo_dst.sa.sa_family) {
case AF_INET:
if ((ipo->ipo_dst.sin.sin_addr.s_addr == INADDR_ANY) ||
(ipo->ipo_dst.sin.sin_addr.s_addr == INADDR_BROADCAST))
dignore = 1;
break;
#ifdef INET6
case AF_INET6:
if ((IN6_IS_ADDR_UNSPECIFIED(&ipo->ipo_dst.sin6.sin6_addr)) ||
(memcmp(&ipo->ipo_dst.sin6.sin6_addr, &in6mask128,
sizeof(in6mask128)) == 0))
dignore = 1;
break;
#endif /* INET6 */
}
/* Likewise for source. */
switch (ipo->ipo_src.sa.sa_family) {
case AF_INET:
if (ipo->ipo_src.sin.sin_addr.s_addr == INADDR_ANY)
signore = 1;
break;
#ifdef INET6
case AF_INET6:
if (IN6_IS_ADDR_UNSPECIFIED(&ipo->ipo_src.sin6.sin6_addr))
signore = 1;
break;
#endif /* INET6 */
}
/* Do we have a cached entry ? If so, check if it's still valid. */
mtx_enter(&ipo_tdb_mtx);
if (ipo->ipo_tdb != NULL &&
(ipo->ipo_tdb->tdb_flags & TDBF_INVALID)) {
TAILQ_REMOVE(&ipo->ipo_tdb->tdb_policy_head, ipo,
ipo_tdb_next);
tdb_unref(ipo->ipo_tdb);
ipo->ipo_tdb = NULL;
}
mtx_leave(&ipo_tdb_mtx);
/* Outgoing packet policy check. */
if (direction == IPSP_DIRECTION_OUT) {
/*
* If the packet is destined for the policy-specified
* gateway/endhost, and the socket has the BYPASS
* option set, skip IPsec processing.
*/
if ((inp != NULL) &&
(inp->inp_seclevel[SL_ESP_TRANS] == IPSEC_LEVEL_BYPASS) &&
(inp->inp_seclevel[SL_ESP_NETWORK] ==
IPSEC_LEVEL_BYPASS) &&
(inp->inp_seclevel[SL_AUTH] == IPSEC_LEVEL_BYPASS)) {
/* Direct match. */
if (dignore ||
!memcmp(&sdst, &ipo->ipo_dst, sdst.sa.sa_len)) {
if (tdbout != NULL)
*tdbout = NULL;
return 0;
}
}
/* Check that the cached TDB (if present), is appropriate. */
mtx_enter(&ipo_tdb_mtx);
if (ipo->ipo_tdb != NULL) {
if ((ipo->ipo_last_searched <= ipsec_last_added) ||
(ipo->ipo_sproto != ipo->ipo_tdb->tdb_sproto) ||
memcmp(dignore ? &sdst : &ipo->ipo_dst,
&ipo->ipo_tdb->tdb_dst,
ipo->ipo_tdb->tdb_dst.sa.sa_len))
goto nomatchout;
if (!ipsp_aux_match(ipo->ipo_tdb,
ipsecflowinfo_ids ? ipsecflowinfo_ids : ipo->ipo_ids,
&ipo->ipo_addr, &ipo->ipo_mask))
goto nomatchout;
/* Cached entry is good. */
error = ipsp_spd_inp(m, inp, ipo, tdbout);
mtx_leave(&ipo_tdb_mtx);
return error;
nomatchout:
/* Cached TDB was not good. */
TAILQ_REMOVE(&ipo->ipo_tdb->tdb_policy_head, ipo,
ipo_tdb_next);
tdb_unref(ipo->ipo_tdb);
ipo->ipo_tdb = NULL;
ipo->ipo_last_searched = 0;
}
/*
* If no SA has been added since the last time we did a
* lookup, there's no point searching for one. However, if the
* destination gateway is left unspecified (or is all-1's),
* always lookup since this is a generic-match rule
* (otherwise, we can have situations where SAs to some
* destinations exist but are not used, possibly leading to an
* explosion in the number of acquired SAs).
*/
if (ipo->ipo_last_searched <= ipsec_last_added) {
struct tdb *tdbp_new;
/* "Touch" the entry. */
if (dignore == 0)
ipo->ipo_last_searched = getuptime();
/* gettdb() takes tdb_sadb_mtx, preserve lock order */
mtx_leave(&ipo_tdb_mtx);
/* Find an appropriate SA from the existing ones. */
tdbp_new = gettdbbydst(rdomain,
dignore ? &sdst : &ipo->ipo_dst,
ipo->ipo_sproto,
ipsecflowinfo_ids ? ipsecflowinfo_ids : ipo->ipo_ids,
&ipo->ipo_addr, &ipo->ipo_mask);
ids = NULL;
mtx_enter(&ipo_tdb_mtx);
if ((tdbp_new != NULL) &&
(tdbp_new->tdb_flags & TDBF_DELETED)) {
/*
* After tdb_delete() has released ipo_tdb_mtx
* in tdb_unlink(), never add a new one.
* tdb_cleanspd() has to catch all of them.
*/
tdb_unref(tdbp_new);
tdbp_new = NULL;
}
if (ipo->ipo_tdb != NULL) {
/* Remove cached TDB from parallel thread. */
TAILQ_REMOVE(&ipo->ipo_tdb->tdb_policy_head,
ipo, ipo_tdb_next);
tdb_unref(ipo->ipo_tdb);
}
ipo->ipo_tdb = tdbp_new;
if (ipo->ipo_tdb != NULL) {
/* gettdbbydst() has already refcounted tdb */
TAILQ_INSERT_TAIL(
&ipo->ipo_tdb->tdb_policy_head,
ipo, ipo_tdb_next);
error = ipsp_spd_inp(m, inp, ipo, tdbout);
mtx_leave(&ipo_tdb_mtx);
return error;
}
}
mtx_leave(&ipo_tdb_mtx);
/* So, we don't have an SA -- just a policy. */
switch (ipo->ipo_type) {
case IPSP_IPSEC_REQUIRE:
/* Acquire SA through key management. */
if (ipsp_acquire_sa(ipo,
dignore ? &sdst : &ipo->ipo_dst,
signore ? NULL : &ipo->ipo_src, ddst, m) != 0) {
return EACCES;
}
/* FALLTHROUGH */
case IPSP_IPSEC_DONTACQ:
return -EINVAL; /* Silently drop packet. */
case IPSP_IPSEC_ACQUIRE:
/* Acquire SA through key management. */
ipsp_acquire_sa(ipo, dignore ? &sdst : &ipo->ipo_dst,
signore ? NULL : &ipo->ipo_src, ddst, NULL);
/* FALLTHROUGH */
case IPSP_IPSEC_USE:
return ipsp_spd_inp(m, inp, ipo, tdbout);
}
} else { /* IPSP_DIRECTION_IN */
if (tdbin != NULL) {
/*
* Special case for bundled IPcomp/ESP SAs:
* 1) only IPcomp flows are loaded into kernel
* 2) input processing processes ESP SA first
* 3) then optional IPcomp processing happens
* 4) we only update m_tag for ESP
* => 'tdbin' is always set to ESP SA
* => flow has ipo_proto for IPcomp
* So if 'tdbin' points to an ESP SA and this 'tdbin' is
* bundled with an IPcomp SA, then we replace 'tdbin'
* with the IPcomp SA at tdbin->tdb_inext.
*/
if (ipo->ipo_sproto == IPPROTO_IPCOMP &&
tdbin->tdb_sproto == IPPROTO_ESP &&
tdbin->tdb_inext != NULL &&
tdbin->tdb_inext->tdb_sproto == IPPROTO_IPCOMP)
tdbin = tdbin->tdb_inext;
/* Direct match in the cache. */
mtx_enter(&ipo_tdb_mtx);
if (ipo->ipo_tdb == tdbin) {
error = ipsp_spd_inp(m, inp, ipo, tdbout);
mtx_leave(&ipo_tdb_mtx);
return error;
}
mtx_leave(&ipo_tdb_mtx);
if (memcmp(dignore ? &ssrc : &ipo->ipo_dst,
&tdbin->tdb_src, tdbin->tdb_src.sa.sa_len) ||
(ipo->ipo_sproto != tdbin->tdb_sproto))
goto nomatchin;
/* Match source/dest IDs. */
if (ipo->ipo_ids)
if (tdbin->tdb_ids == NULL ||
!ipsp_ids_match(ipo->ipo_ids,
tdbin->tdb_ids))
goto nomatchin;
/* Add it to the cache. */
mtx_enter(&ipo_tdb_mtx);
if (ipo->ipo_tdb != NULL) {
TAILQ_REMOVE(&ipo->ipo_tdb->tdb_policy_head,
ipo, ipo_tdb_next);
tdb_unref(ipo->ipo_tdb);
}
ipo->ipo_tdb = tdb_ref(tdbin);
TAILQ_INSERT_TAIL(&tdbin->tdb_policy_head, ipo,
ipo_tdb_next);
error = ipsp_spd_inp(m, inp, ipo, tdbout);
mtx_leave(&ipo_tdb_mtx);
return error;
nomatchin: /* Nothing needed here, falling through */
;
}
/* Check whether cached entry applies. */
mtx_enter(&ipo_tdb_mtx);
if (ipo->ipo_tdb != NULL) {
/*
* We only need to check that the correct
* security protocol and security gateway are
* set; IDs will be the same since the cached
* entry is linked on this policy.
*/
if (ipo->ipo_sproto == ipo->ipo_tdb->tdb_sproto &&
!memcmp(&ipo->ipo_tdb->tdb_src,
dignore ? &ssrc : &ipo->ipo_dst,
ipo->ipo_tdb->tdb_src.sa.sa_len))
goto skipinputsearch;
/* Not applicable, unlink. */
TAILQ_REMOVE(&ipo->ipo_tdb->tdb_policy_head, ipo,
ipo_tdb_next);
tdb_unref(ipo->ipo_tdb);
ipo->ipo_tdb = NULL;
ipo->ipo_last_searched = 0;
}
/* Find whether there exists an appropriate SA. */
if (ipo->ipo_last_searched <= ipsec_last_added) {
struct tdb *tdbp_new;
if (dignore == 0)
ipo->ipo_last_searched = getuptime();
/* gettdb() takes tdb_sadb_mtx, preserve lock order */
mtx_leave(&ipo_tdb_mtx);
tdbp_new = gettdbbysrc(rdomain,
dignore ? &ssrc : &ipo->ipo_dst,
ipo->ipo_sproto, ipo->ipo_ids,
&ipo->ipo_addr, &ipo->ipo_mask);
mtx_enter(&ipo_tdb_mtx);
if ((tdbp_new != NULL) &&
(tdbp_new->tdb_flags & TDBF_DELETED)) {
/*
* After tdb_delete() has released ipo_tdb_mtx
* in tdb_unlink(), never add a new one.
* tdb_cleanspd() has to catch all of them.
*/
tdb_unref(tdbp_new);
tdbp_new = NULL;
}
if (ipo->ipo_tdb != NULL) {
/* Remove cached TDB from parallel thread. */
TAILQ_REMOVE(&ipo->ipo_tdb->tdb_policy_head,
ipo, ipo_tdb_next);
tdb_unref(ipo->ipo_tdb);
}
ipo->ipo_tdb = tdbp_new;
if (ipo->ipo_tdb != NULL) {
/* gettdbbysrc() has already refcounted tdb */
TAILQ_INSERT_TAIL(
&ipo->ipo_tdb->tdb_policy_head,
ipo, ipo_tdb_next);
}
}
skipinputsearch:
mtx_leave(&ipo_tdb_mtx);
switch (ipo->ipo_type) {
case IPSP_IPSEC_REQUIRE:
/* If appropriate SA exists, don't acquire another. */
if (ipo->ipo_tdb != NULL)
return -EINVAL; /* Silently drop packet. */
/* Acquire SA through key management. */
if ((error = ipsp_acquire_sa(ipo,
dignore ? &ssrc : &ipo->ipo_dst,
signore ? NULL : &ipo->ipo_src, ddst, m)) != 0)
return error;
/* FALLTHROUGH */
case IPSP_IPSEC_DONTACQ:
return -EINVAL; /* Silently drop packet. */
case IPSP_IPSEC_ACQUIRE:
/* If appropriate SA exists, don't acquire another. */
if (ipo->ipo_tdb != NULL)
return ipsp_spd_inp(m, inp, ipo, tdbout);
/* Acquire SA through key management. */
ipsp_acquire_sa(ipo, dignore ? &ssrc : &ipo->ipo_dst,
signore ? NULL : &ipo->ipo_src, ddst, NULL);
/* FALLTHROUGH */
case IPSP_IPSEC_USE:
return ipsp_spd_inp(m, inp, ipo, tdbout);
}
}
/* Shouldn't ever get this far. */
return EINVAL;
}
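/*
 * Illustrative caller-side sketch (not part of this file) showing how the
 * return/tdbout combinations documented above ipsp_spd_lookup() are
 * typically interpreted; the surrounding variables are assumptions.
 */
#if 0
	error = ipsp_spd_lookup(m, AF_INET, hlen, IPSP_DIRECTION_OUT,
	    NULL, inp, &tdb, NULL);
	if (error == -EINVAL) {
		/* Silently drop the packet. */
		m_freem(m);
		return 0;
	}
	if (error != 0) {
		/* Drop the packet and propagate the error. */
		m_freem(m);
		return error;
	}
	if (tdb == NULL) {
		/* No IPsec required, continue with plain processing. */
	} else {
		/* Apply IPsec using the returned TDB, then drop the ref. */
		tdb_unref(tdb);
	}
#endif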
/*
* Delete a policy from the SPD.
*/
int
ipsec_delete_policy(struct ipsec_policy *ipo)
{
struct ipsec_acquire *ipa;
struct radix_node_head *rnh;
struct radix_node *rn = (struct radix_node *)ipo;
NET_ASSERT_LOCKED();
if (refcnt_rele(&ipo->ipo_refcnt) == 0)
return 0;
/* Delete from SPD. */
if ((rnh = spd_table_get(ipo->ipo_rdomain)) == NULL ||
rn_delete(&ipo->ipo_addr, &ipo->ipo_mask, rnh, rn) == NULL)
return (ESRCH);
mtx_enter(&ipo_tdb_mtx);
if (ipo->ipo_tdb != NULL) {
TAILQ_REMOVE(&ipo->ipo_tdb->tdb_policy_head, ipo,
ipo_tdb_next);
tdb_unref(ipo->ipo_tdb);
ipo->ipo_tdb = NULL;
}
mtx_leave(&ipo_tdb_mtx);
mtx_enter(&ipsec_acquire_mtx);
while ((ipa = TAILQ_FIRST(&ipo->ipo_acquires)) != NULL)
ipsp_delete_acquire_locked(ipa);
mtx_leave(&ipsec_acquire_mtx);
TAILQ_REMOVE(&ipsec_policy_head, ipo, ipo_list);
if (ipo->ipo_ids)
ipsp_ids_free(ipo->ipo_ids);
ipsec_in_use--;
pool_put(&ipsec_policy_pool, ipo);
return 0;
}
void
ipsp_delete_acquire_timer(void *v)
{
struct ipsec_acquire *ipa = v;
mtx_enter(&ipsec_acquire_mtx);
refcnt_rele(&ipa->ipa_refcnt);
ipsp_delete_acquire_locked(ipa);
mtx_leave(&ipsec_acquire_mtx);
}
/*
* Delete a pending IPsec acquire record.
*/
void
ipsp_delete_acquire(struct ipsec_acquire *ipa)
{
mtx_enter(&ipsec_acquire_mtx);
ipsp_delete_acquire_locked(ipa);
mtx_leave(&ipsec_acquire_mtx);
}
void
ipsp_delete_acquire_locked(struct ipsec_acquire *ipa)
{
if (timeout_del(&ipa->ipa_timeout) == 1)
refcnt_rele(&ipa->ipa_refcnt);
ipsp_unref_acquire_locked(ipa);
}
void
ipsec_unref_acquire(struct ipsec_acquire *ipa)
{
mtx_enter(&ipsec_acquire_mtx);
ipsp_unref_acquire_locked(ipa);
mtx_leave(&ipsec_acquire_mtx);
}
void
ipsp_unref_acquire_locked(struct ipsec_acquire *ipa)
{
MUTEX_ASSERT_LOCKED(&ipsec_acquire_mtx);
if (refcnt_rele(&ipa->ipa_refcnt) == 0)
return;
TAILQ_REMOVE(&ipsec_acquire_head, ipa, ipa_next);
TAILQ_REMOVE(&ipa->ipa_policy->ipo_acquires, ipa, ipa_ipo_next);
ipa->ipa_policy = NULL;
pool_put(&ipsec_acquire_pool, ipa);
}
/*
* Find out if there's an ACQUIRE pending.
* XXX Need a better structure.
*/
int
ipsp_pending_acquire(struct ipsec_policy *ipo, union sockaddr_union *gw)
{
struct ipsec_acquire *ipa;
NET_ASSERT_LOCKED();
mtx_enter(&ipsec_acquire_mtx);
TAILQ_FOREACH(ipa, &ipo->ipo_acquires, ipa_ipo_next) {
if (!memcmp(gw, &ipa->ipa_addr, gw->sa.sa_len))
break;
}
mtx_leave(&ipsec_acquire_mtx);
return (ipa != NULL);
}
/*
* Signal key management that we need an SA.
* XXX For outgoing policies, we could try to hold on to the mbuf.
*/
int
ipsp_acquire_sa(struct ipsec_policy *ipo, union sockaddr_union *gw,
union sockaddr_union *laddr, struct sockaddr_encap *ddst, struct mbuf *m)
{
struct ipsec_acquire *ipa;
NET_ASSERT_LOCKED();
/* Check whether request has been made already. */
if (ipsp_pending_acquire(ipo, gw))
return 0;
/* Add request in cache and proceed. */
ipa = pool_get(&ipsec_acquire_pool, PR_NOWAIT|PR_ZERO);
if (ipa == NULL)
return ENOMEM;
ipa->ipa_addr = *gw;
refcnt_init(&ipa->ipa_refcnt);
timeout_set(&ipa->ipa_timeout, ipsp_delete_acquire_timer, ipa);
ipa->ipa_info.sen_len = ipa->ipa_mask.sen_len = SENT_LEN;
ipa->ipa_info.sen_family = ipa->ipa_mask.sen_family = PF_KEY;
/* Just copy the right information. */
switch (ipo->ipo_addr.sen_type) {
case SENT_IP4:
ipa->ipa_info.sen_type = ipa->ipa_mask.sen_type = SENT_IP4;
ipa->ipa_info.sen_direction = ipo->ipo_addr.sen_direction;
ipa->ipa_mask.sen_direction = ipo->ipo_mask.sen_direction;
if (ipsp_is_unspecified(ipo->ipo_dst)) {
ipa->ipa_info.sen_ip_src = ddst->sen_ip_src;
ipa->ipa_mask.sen_ip_src.s_addr = INADDR_BROADCAST;
ipa->ipa_info.sen_ip_dst = ddst->sen_ip_dst;
ipa->ipa_mask.sen_ip_dst.s_addr = INADDR_BROADCAST;
} else {
ipa->ipa_info.sen_ip_src = ipo->ipo_addr.sen_ip_src;
ipa->ipa_mask.sen_ip_src = ipo->ipo_mask.sen_ip_src;
ipa->ipa_info.sen_ip_dst = ipo->ipo_addr.sen_ip_dst;
ipa->ipa_mask.sen_ip_dst = ipo->ipo_mask.sen_ip_dst;
}
ipa->ipa_info.sen_proto = ipo->ipo_addr.sen_proto;
ipa->ipa_mask.sen_proto = ipo->ipo_mask.sen_proto;
if (ipo->ipo_addr.sen_proto) {
ipa->ipa_info.sen_sport = ipo->ipo_addr.sen_sport;
ipa->ipa_mask.sen_sport = ipo->ipo_mask.sen_sport;
ipa->ipa_info.sen_dport = ipo->ipo_addr.sen_dport;
ipa->ipa_mask.sen_dport = ipo->ipo_mask.sen_dport;
}
break;
#ifdef INET6
case SENT_IP6:
ipa->ipa_info.sen_type = ipa->ipa_mask.sen_type = SENT_IP6;
ipa->ipa_info.sen_ip6_direction =
ipo->ipo_addr.sen_ip6_direction;
ipa->ipa_mask.sen_ip6_direction =
ipo->ipo_mask.sen_ip6_direction;
if (ipsp_is_unspecified(ipo->ipo_dst)) {
ipa->ipa_info.sen_ip6_src = ddst->sen_ip6_src;
ipa->ipa_mask.sen_ip6_src = in6mask128;
ipa->ipa_info.sen_ip6_dst = ddst->sen_ip6_dst;
ipa->ipa_mask.sen_ip6_dst = in6mask128;
} else {
ipa->ipa_info.sen_ip6_src = ipo->ipo_addr.sen_ip6_src;
ipa->ipa_mask.sen_ip6_src = ipo->ipo_mask.sen_ip6_src;
ipa->ipa_info.sen_ip6_dst = ipo->ipo_addr.sen_ip6_dst;
ipa->ipa_mask.sen_ip6_dst = ipo->ipo_mask.sen_ip6_dst;
}
ipa->ipa_info.sen_ip6_proto = ipo->ipo_addr.sen_ip6_proto;
ipa->ipa_mask.sen_ip6_proto = ipo->ipo_mask.sen_ip6_proto;
if (ipo->ipo_mask.sen_ip6_proto) {
ipa->ipa_info.sen_ip6_sport =
ipo->ipo_addr.sen_ip6_sport;
ipa->ipa_mask.sen_ip6_sport =
ipo->ipo_mask.sen_ip6_sport;
ipa->ipa_info.sen_ip6_dport =
ipo->ipo_addr.sen_ip6_dport;
ipa->ipa_mask.sen_ip6_dport =
ipo->ipo_mask.sen_ip6_dport;
}
break;
#endif /* INET6 */
default:
pool_put(&ipsec_acquire_pool, ipa);
return 0;
}
mtx_enter(&ipsec_acquire_mtx);
#ifdef IPSEC
if (timeout_add_sec(&ipa->ipa_timeout, ipsec_expire_acquire) == 1)
refcnt_take(&ipa->ipa_refcnt);
#endif
TAILQ_INSERT_TAIL(&ipsec_acquire_head, ipa, ipa_next);
TAILQ_INSERT_TAIL(&ipo->ipo_acquires, ipa, ipa_ipo_next);
ipa->ipa_policy = ipo;
mtx_leave(&ipsec_acquire_mtx);
/* PF_KEYv2 notification message. */
return pfkeyv2_acquire(ipo, gw, laddr, &ipa->ipa_seq, ddst);
}
/*
* Deal with PCB security requirements.
*/
int
ipsp_spd_inp(struct mbuf *m, struct inpcb *inp, struct ipsec_policy *ipo,
struct tdb **tdbout)
{
/* Sanity check. */
if (inp == NULL)
goto justreturn;
/* We only support IPSEC_LEVEL_BYPASS or IPSEC_LEVEL_AVAIL */
if (inp->inp_seclevel[SL_ESP_TRANS] == IPSEC_LEVEL_BYPASS &&
inp->inp_seclevel[SL_ESP_NETWORK] == IPSEC_LEVEL_BYPASS &&
inp->inp_seclevel[SL_AUTH] == IPSEC_LEVEL_BYPASS)
goto justreturn;
if (inp->inp_seclevel[SL_ESP_TRANS] == IPSEC_LEVEL_AVAIL &&
inp->inp_seclevel[SL_ESP_NETWORK] == IPSEC_LEVEL_AVAIL &&
inp->inp_seclevel[SL_AUTH] == IPSEC_LEVEL_AVAIL)
goto justreturn;
return -EINVAL; /* Silently drop packet. */
justreturn:
if (tdbout != NULL) {
if (ipo != NULL)
*tdbout = tdb_ref(ipo->ipo_tdb);
else
*tdbout = NULL;
}
return 0;
}
/*
* Find a pending ACQUIRE record based on its sequence number.
* XXX Need to use a better data structure.
*/
struct ipsec_acquire *
ipsec_get_acquire(u_int32_t seq)
{
struct ipsec_acquire *ipa;
NET_ASSERT_LOCKED();
mtx_enter(&ipsec_acquire_mtx);
TAILQ_FOREACH(ipa, &ipsec_acquire_head, ipa_next) {
if (ipa->ipa_seq == seq) {
refcnt_take(&ipa->ipa_refcnt);
break;
}
}
mtx_leave(&ipsec_acquire_mtx);
return ipa;
}
/* $OpenBSD: art.c,v 1.29 2020/11/12 15:25:28 mpi Exp $ */
/*
* Copyright (c) 2015 Martin Pieuchot
* Copyright (c) 2001 Yoichi Hariguchi
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Allotment Routing Table (ART).
*
* Yoichi Hariguchi paper can be found at:
* http://www.hariguchi.org/art/art.pdf
*/
#ifndef _KERNEL
#include "kern_compat.h"
#else
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/task.h>
#include <sys/socket.h>
#endif
#include <net/art.h>
int art_bindex(struct art_table *, uint8_t *, int);
void art_allot(struct art_table *at, int, struct art_node *,
struct art_node *);
struct art_table *art_table_get(struct art_root *, struct art_table *,
int);
struct art_table *art_table_put(struct art_root *, struct art_table *);
struct art_node *art_table_insert(struct art_root *, struct art_table *,
int, struct art_node *);
struct art_node *art_table_delete(struct art_root *, struct art_table *,
int, struct art_node *);
struct art_table *art_table_ref(struct art_root *, struct art_table *);
int art_table_free(struct art_root *, struct art_table *);
int art_table_walk(struct art_root *, struct art_table *,
int (*f)(struct art_node *, void *), void *);
int art_walk_apply(struct art_root *,
struct art_node *, struct art_node *,
int (*f)(struct art_node *, void *), void *);
void art_table_gc(void *);
void art_gc(void *);
struct pool an_pool, at_pool, at_heap_4_pool, at_heap_8_pool;
struct art_table *art_table_gc_list = NULL;
struct mutex art_table_gc_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
struct task art_table_gc_task =
TASK_INITIALIZER(art_table_gc, NULL);
struct art_node *art_node_gc_list = NULL;
struct mutex art_node_gc_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
struct task art_node_gc_task = TASK_INITIALIZER(art_gc, NULL);
void
art_init(void)
{
pool_init(&an_pool, sizeof(struct art_node), 0, IPL_SOFTNET, 0,
"art_node", NULL);
pool_init(&at_pool, sizeof(struct art_table), 0, IPL_SOFTNET, 0,
"art_table", NULL);
pool_init(&at_heap_4_pool, AT_HEAPSIZE(4), 0, IPL_SOFTNET, 0,
"art_heap4", NULL);
pool_init(&at_heap_8_pool, AT_HEAPSIZE(8), 0, IPL_SOFTNET, 0,
"art_heap8", &pool_allocator_single);
}
/*
* Per routing table initialization API function.
*/
struct art_root *
art_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
struct art_root *ar;
int i;
ar = malloc(sizeof(*ar), M_RTABLE, M_NOWAIT|M_ZERO);
if (ar == NULL)
return (NULL);
switch (alen) {
case 32:
ar->ar_alen = 32;
ar->ar_nlvl = 7;
ar->ar_bits[0] = 8;
for (i = 1; i < ar->ar_nlvl; i++)
ar->ar_bits[i] = 4;
break;
case 128:
ar->ar_alen = 128;
ar->ar_nlvl = 32;
for (i = 0; i < ar->ar_nlvl; i++)
ar->ar_bits[i] = 4;
break;
default:
printf("%s: incorrect address length %u\n", __func__, alen);
free(ar, M_RTABLE, sizeof(*ar));
return (NULL);
}
ar->ar_off = off;
rw_init(&ar->ar_lock, "art");
return (ar);
}
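/*
 * Stride layout produced by art_alloc() above, derived from the code:
 *
 *	alen == 32  (IPv4):  7 levels, bits = 8,4,4,4,4,4,4  (8 + 6*4 = 32)
 *	alen == 128 (IPv6): 32 levels, bits = 4,4,...,4      (32 * 4 = 128)
 */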
/*
* Return 1 if ``old'' and ``new'' are identical, 0 otherwise.
*/
static inline int
art_check_duplicate(struct art_root *ar, struct art_node *old,
struct art_node *new)
{
if (old == NULL)
return (0);
if (old->an_plen == new->an_plen)
return (1);
return (0);
}
/*
* Return the base index of the part of ``addr'' and ``plen''
* corresponding to the range covered by the table ``at''.
*
* In other words, this function takes the multi-level (complete)
* address ``addr'' and prefix length ``plen'' and returns the
* single level base index for the table ``at''.
*
* For example with an address size of 32bit divided into four
* 8bit-long tables, there's a maximum of 4 base indexes if the
* prefix length is > 24.
*/
int
art_bindex(struct art_table *at, uint8_t *addr, int plen)
{
uint8_t boff, bend;
uint32_t k;
if (plen < at->at_offset || plen > (at->at_offset + at->at_bits))
return (-1);
/*
* We are only interested in the part of the prefix length
* corresponding to the range of this table.
*/
plen -= at->at_offset;
/*
* Jump to the first byte of the address containing bits
* covered by this table.
*/
addr += (at->at_offset / 8);
/* ``at'' covers the bit range between ``boff'' & ``bend''. */
boff = (at->at_offset % 8);
bend = (at->at_bits + boff);
KASSERT(bend <= 32);
if (bend > 24) {
k = (addr[0] & ((1 << (8 - boff)) - 1)) << (bend - 8);
k |= addr[1] << (bend - 16);
k |= addr[2] << (bend - 24);
k |= addr[3] >> (32 - bend);
} else if (bend > 16) {
k = (addr[0] & ((1 << (8 - boff)) - 1)) << (bend - 8);
k |= addr[1] << (bend - 16);
k |= addr[2] >> (24 - bend);
} else if (bend > 8) {
k = (addr[0] & ((1 << (8 - boff)) - 1)) << (bend - 8);
k |= addr[1] >> (16 - bend);
} else {
k = (addr[0] >> (8 - bend)) & ((1 << at->at_bits) - 1);
}
/*
* Single level base index formula:
*/
return ((k >> (at->at_bits - plen)) + (1 << plen));
}
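/*
 * Worked example (illustrative): for a root table with at_offset == 0 and
 * at_bits == 8, looking up the prefix 192.0.0.0/2 gives boff == 0 and
 * bend == 8, so k = addr[0] = 192.  With plen == 2 the base index is
 *
 *	(192 >> (8 - 2)) + (1 << 2) = 3 + 4 = 7
 *
 * and art_findex() below is simply art_bindex() with plen == 8, i.e. the
 * fringe index (192 >> 0) + (1 << 8) = 448.
 */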
/*
* Single level lookup function.
*
* Return the fringe index of the part of ``addr''
* corresponding to the range covered by the table ``at''.
*/
static inline int
art_findex(struct art_table *at, uint8_t *addr)
{
return art_bindex(at, addr, (at->at_offset + at->at_bits));
}
/*
* (Non-perfect) lookup API function.
*
* Return the best existing match for a destination.
*/
struct art_node *
art_match(struct art_root *ar, void *addr, struct srp_ref *nsr)
{
struct srp_ref dsr, ndsr;
void *entry;
struct art_table *at;
struct art_node *dflt, *ndflt;
int j;
entry = srp_enter(nsr, &ar->ar_root);
at = entry;
if (at == NULL)
goto done;
/*
* Remember the default route of each table we visit in case
* we do not find a better matching route.
*/
dflt = srp_enter(&dsr, &at->at_default);
/*
* Iterate until we find a leaf.
*/
while (1) {
/* Do a single level route lookup. */
j = art_findex(at, addr);
entry = srp_follow(nsr, &at->at_heap[j].node);
/* If this is a leaf (NULL is a leaf) we're done. */
if (ISLEAF(entry))
break;
at = SUBTABLE(entry);
ndflt = srp_enter(&ndsr, &at->at_default);
if (ndflt != NULL) {
srp_leave(&dsr);
dsr = ndsr;
dflt = ndflt;
} else
srp_leave(&ndsr);
}
if (entry == NULL) {
srp_leave(nsr);
*nsr = dsr;
KASSERT(ISLEAF(dflt));
return (dflt);
}
srp_leave(&dsr);
done:
KASSERT(ISLEAF(entry));
return (entry);
}
/*
* Perfect lookup API function.
*
* Return a perfect match for a destination/prefix-length pair or NULL if
* it does not exist.
*/
struct art_node *
art_lookup(struct art_root *ar, void *addr, int plen, struct srp_ref *nsr)
{
void *entry;
struct art_table *at;
int i, j;
KASSERT(plen >= 0 && plen <= ar->ar_alen);
entry = srp_enter(nsr, &ar->ar_root);
at = entry;
if (at == NULL)
goto done;
/* Default route */
if (plen == 0) {
entry = srp_follow(nsr, &at->at_default);
goto done;
}
/*
* If the prefix length is smaller than the sum of
* the stride length at this level the entry must
* be in the current table.
*/
while (plen > (at->at_offset + at->at_bits)) {
/* Do a single level route lookup. */
j = art_findex(at, addr);
entry = srp_follow(nsr, &at->at_heap[j].node);
/* A leaf is a match, but not a perfect one, or NULL */
if (ISLEAF(entry))
return (NULL);
at = SUBTABLE(entry);
}
i = art_bindex(at, addr, plen);
if (i == -1)
return (NULL);
entry = srp_follow(nsr, &at->at_heap[i].node);
if (!ISLEAF(entry))
entry = srp_follow(nsr, &SUBTABLE(entry)->at_default);
done:
KASSERT(ISLEAF(entry));
return (entry);
}
/*
* Insertion API function.
*
* Insert the given node or return an existing one if a node with the
* same destination/mask pair is already present.
*/
struct art_node *
art_insert(struct art_root *ar, struct art_node *an, void *addr, int plen)
{
struct art_table *at, *child;
struct art_node *node;
int i, j;
rw_assert_wrlock(&ar->ar_lock);
KASSERT(plen >= 0 && plen <= ar->ar_alen);
at = srp_get_locked(&ar->ar_root);
if (at == NULL) {
at = art_table_get(ar, NULL, -1);
if (at == NULL)
return (NULL);
srp_swap_locked(&ar->ar_root, at);
}
/* Default route */
if (plen == 0) {
node = srp_get_locked(&at->at_default);
if (node != NULL)
return (node);
art_table_ref(ar, at);
srp_swap_locked(&at->at_default, an);
return (an);
}
/*
* If the prefix length is smaller than the sum of
* the stride length at this level the entry must
* be in the current table.
*/
while (plen > (at->at_offset + at->at_bits)) {
/* Do a single level route lookup. */
j = art_findex(at, addr);
node = srp_get_locked(&at->at_heap[j].node);
/*
* If the node corresponding to the fringe index is
* a leaf we need to allocate a subtable. The route
* entry of this node will then become the default
* route of the subtable.
*/
if (ISLEAF(node)) {
child = art_table_get(ar, at, j);
if (child == NULL)
return (NULL);
art_table_ref(ar, at);
srp_swap_locked(&at->at_heap[j].node, ASNODE(child));
at = child;
} else
at = SUBTABLE(node);
}
i = art_bindex(at, addr, plen);
if (i == -1)
return (NULL);
return (art_table_insert(ar, at, i, an));
}
/*
* Single level insertion.
*/
struct art_node *
art_table_insert(struct art_root *ar, struct art_table *at, int i,
struct art_node *an)
{
struct art_node *prev, *node;
node = srp_get_locked(&at->at_heap[i].node);
if (!ISLEAF(node))
prev = srp_get_locked(&SUBTABLE(node)->at_default);
else
prev = node;
if (art_check_duplicate(ar, prev, an))
return (prev);
art_table_ref(ar, at);
/*
* If the index `i' of the route that we are inserting is not
* a fringe index, we need to allot this new route pointer to
* all the corresponding fringe indices.
*/
if (i < at->at_minfringe)
art_allot(at, i, prev, an);
else if (!ISLEAF(node))
srp_swap_locked(&SUBTABLE(node)->at_default, an);
else
srp_swap_locked(&at->at_heap[i].node, an);
return (an);
}
/*
* Deletion API function.
*/
struct art_node *
art_delete(struct art_root *ar, struct art_node *an, void *addr, int plen)
{
struct art_table *at;
struct art_node *node;
int i, j;
rw_assert_wrlock(&ar->ar_lock);
KASSERT(plen >= 0 && plen <= ar->ar_alen);
at = srp_get_locked(&ar->ar_root);
if (at == NULL)
return (NULL);
/* Default route */
if (plen == 0) {
node = srp_get_locked(&at->at_default);
srp_swap_locked(&at->at_default, NULL);
art_table_free(ar, at);
return (node);
}
/*
* If the prefix length is smaller than the sum of
* the stride length at this level the entry must
* be in the current table.
*/
while (plen > (at->at_offset + at->at_bits)) {
/* Do a single level route lookup. */
j = art_findex(at, addr);
node = srp_get_locked(&at->at_heap[j].node);
/* If this is a leaf, there is no route to delete. */
if (ISLEAF(node))
return (NULL);
at = SUBTABLE(node);
}
i = art_bindex(at, addr, plen);
if (i == -1)
return (NULL);
return (art_table_delete(ar, at, i, an));
}
/*
* Single level deletion.
*/
struct art_node *
art_table_delete(struct art_root *ar, struct art_table *at, int i,
struct art_node *an)
{
struct art_node *next, *node;
#ifdef DIAGNOSTIC
struct art_node *prev;
#endif
node = srp_get_locked(&at->at_heap[i].node);
#ifdef DIAGNOSTIC
if (!ISLEAF(node))
prev = srp_get_locked(&SUBTABLE(node)->at_default);
else
prev = node;
KASSERT(prev == an);
#endif
/* Get the next most specific route for the index `i'. */
if ((i >> 1) > 1)
next = srp_get_locked(&at->at_heap[i >> 1].node);
else
next = NULL;
/*
* If the index `i' of the route that we are removing is not
* a fringe index, we need to allot the next most specific
* route pointer to all the corresponding fringe indices.
*/
if (i < at->at_minfringe)
art_allot(at, i, an, next);
else if (!ISLEAF(node))
srp_swap_locked(&SUBTABLE(node)->at_default, next);
else
srp_swap_locked(&at->at_heap[i].node, next);
/* We have removed an entry from this table. */
art_table_free(ar, at);
return (an);
}
struct art_table *
art_table_ref(struct art_root *ar, struct art_table *at)
{
at->at_refcnt++;
return (at);
}
static inline int
art_table_rele(struct art_table *at)
{
if (at == NULL)
return (0);
return (--at->at_refcnt == 0);
}
int
art_table_free(struct art_root *ar, struct art_table *at)
{
if (art_table_rele(at)) {
/*
* Garbage collect this table and all its parents
* that are empty.
*/
do {
at = art_table_put(ar, at);
} while (art_table_rele(at));
return (1);
}
return (0);
}
/*
* Iteration API function.
*/
int
art_walk(struct art_root *ar, int (*f)(struct art_node *, void *), void *arg)
{
struct srp_ref sr;
struct art_table *at;
struct art_node *node;
int error = 0;
rw_enter_write(&ar->ar_lock);
at = srp_get_locked(&ar->ar_root);
if (at != NULL) {
art_table_ref(ar, at);
/*
* The default route should be processed here because the root
* table does not have a parent.
*/
node = srp_enter(&sr, &at->at_default);
error = art_walk_apply(ar, node, NULL, f, arg);
srp_leave(&sr);
if (error == 0)
error = art_table_walk(ar, at, f, arg);
art_table_free(ar, at);
}
rw_exit_write(&ar->ar_lock);
return (error);
}
int
art_table_walk(struct art_root *ar, struct art_table *at,
int (*f)(struct art_node *, void *), void *arg)
{
struct srp_ref sr;
struct art_node *node, *next;
struct art_table *nat;
int i, j, error = 0;
uint32_t maxfringe = (at->at_minfringe << 1);
/*
* Iterate non-fringe nodes in ``natural'' order.
*/
for (j = 1; j < at->at_minfringe; j += 2) {
/*
* The default route (index 1) is processed by the
* parent table (where it belongs) otherwise it could
* be processed more than once.
*/
for (i = max(j, 2); i < at->at_minfringe; i <<= 1) {
next = srp_get_locked(&at->at_heap[i >> 1].node);
node = srp_enter(&sr, &at->at_heap[i].node);
error = art_walk_apply(ar, node, next, f, arg);
srp_leave(&sr);
if (error != 0)
return (error);
}
}
/*
* Iterate fringe nodes.
*/
for (i = at->at_minfringe; i < maxfringe; i++) {
next = srp_get_locked(&at->at_heap[i >> 1].node);
node = srp_enter(&sr, &at->at_heap[i].node);
if (!ISLEAF(node)) {
nat = art_table_ref(ar, SUBTABLE(node));
node = srp_follow(&sr, &nat->at_default);
} else
nat = NULL;
error = art_walk_apply(ar, node, next, f, arg);
srp_leave(&sr);
if (error != 0) {
art_table_free(ar, nat);
return (error);
}
if (nat != NULL) {
error = art_table_walk(ar, nat, f, arg);
art_table_free(ar, nat);
if (error != 0)
return (error);
}
}
return (0);
}
int
art_walk_apply(struct art_root *ar,
struct art_node *an, struct art_node *next,
int (*f)(struct art_node *, void *), void *arg)
{
int error = 0;
if ((an != NULL) && (an != next)) {
rw_exit_write(&ar->ar_lock);
error = (*f)(an, arg);
rw_enter_write(&ar->ar_lock);
}
return (error);
}
/*
* Create a table and use the given index to set its default route.
*
* Note: This function does not modify the root or the parent.
*/
struct art_table *
art_table_get(struct art_root *ar, struct art_table *parent, int j)
{
struct art_table *at;
struct art_node *node;
void *at_heap;
uint32_t lvl;
KASSERT(j != 0 && j != 1);
KASSERT(parent != NULL || j == -1);
if (parent != NULL)
lvl = parent->at_level + 1;
else
lvl = 0;
KASSERT(lvl < ar->ar_nlvl);
at = pool_get(&at_pool, PR_NOWAIT|PR_ZERO);
if (at == NULL)
return (NULL);
switch (AT_HEAPSIZE(ar->ar_bits[lvl])) {
case AT_HEAPSIZE(4):
at_heap = pool_get(&at_heap_4_pool, PR_NOWAIT|PR_ZERO);
break;
case AT_HEAPSIZE(8):
at_heap = pool_get(&at_heap_8_pool, PR_NOWAIT|PR_ZERO);
break;
default:
panic("incorrect stride length %u", ar->ar_bits[lvl]);
}
if (at_heap == NULL) {
pool_put(&at_pool, at);
return (NULL);
}
at->at_parent = parent;
at->at_index = j;
at->at_minfringe = (1 << ar->ar_bits[lvl]);
at->at_level = lvl;
at->at_bits = ar->ar_bits[lvl];
at->at_heap = at_heap;
at->at_refcnt = 0;
if (parent != NULL) {
node = srp_get_locked(&parent->at_heap[j].node);
/* node isn't being deleted, no srp_finalize needed */
srp_swap_locked(&at->at_default, node);
at->at_offset = (parent->at_offset + parent->at_bits);
}
return (at);
}
/*
* Delete a table and use its index to restore its parent's default route.
*
* Note: Modify its parent to unlink the table from it.
*/
struct art_table *
art_table_put(struct art_root *ar, struct art_table *at)
{
struct art_table *parent = at->at_parent;
struct art_node *node;
uint32_t j = at->at_index;
KASSERT(at->at_refcnt == 0);
KASSERT(j != 0 && j != 1);
if (parent != NULL) {
KASSERT(j != -1);
KASSERT(at->at_level == parent->at_level + 1);
KASSERT(parent->at_refcnt >= 1);
/* Give the route back to its parent. */
node = srp_get_locked(&at->at_default);
srp_swap_locked(&parent->at_heap[j].node, node);
} else {
KASSERT(j == -1);
KASSERT(at->at_level == 0);
srp_swap_locked(&ar->ar_root, NULL);
}
mtx_enter(&art_table_gc_mtx);
at->at_parent = art_table_gc_list;
art_table_gc_list = at;
mtx_leave(&art_table_gc_mtx);
task_add(systqmp, &art_table_gc_task);
return (parent);
}
void
art_table_gc(void *null)
{
struct art_table *at, *next;
mtx_enter(&art_table_gc_mtx);
at = art_table_gc_list;
art_table_gc_list = NULL;
mtx_leave(&art_table_gc_mtx);
while (at != NULL) {
next = at->at_parent;
if (at->at_level == 0)
srp_finalize(at, "arttfini");
else
srp_finalize(ASNODE(at), "arttfini");
switch (AT_HEAPSIZE(at->at_bits)) {
case AT_HEAPSIZE(4):
pool_put(&at_heap_4_pool, at->at_heap);
break;
case AT_HEAPSIZE(8):
pool_put(&at_heap_8_pool, at->at_heap);
break;
default:
panic("incorrect stride length %u", at->at_bits);
}
pool_put(&at_pool, at);
at = next;
}
}
/*
* Substitute a node by another in the subtree whose root index is given.
*
* This function iterates on the table ``at'' at index ``i'' until no
* more ``old'' node can be replaced by ``new''.
*
* This function was originally written by Don Knuth in CWEB. The
* complicated ``goto''s are the result of expansion of the two
* following recursions:
*
* art_allot(at, i, old, new)
* {
* int k = i;
* if (at->at_heap[k] == old)
* at->at_heap[k] = new;
* if (k >= at->at_minfringe)
* return;
* k <<= 1;
* art_allot(at, k, old, new);
* k++;
* art_allot(at, k, old, new);
* }
*/
void
art_allot(struct art_table *at, int i, struct art_node *old,
struct art_node *new)
{
struct art_node *node, *dflt;
int k = i;
KASSERT(i < at->at_minfringe);
again:
k <<= 1;
if (k < at->at_minfringe)
goto nonfringe;
/* Change fringe nodes. */
while (1) {
node = srp_get_locked(&at->at_heap[k].node);
if (!ISLEAF(node)) {
dflt = srp_get_locked(&SUBTABLE(node)->at_default);
if (dflt == old) {
srp_swap_locked(&SUBTABLE(node)->at_default,
new);
}
} else if (node == old) {
srp_swap_locked(&at->at_heap[k].node, new);
}
if (k % 2)
goto moveup;
k++;
}
nonfringe:
node = srp_get_locked(&at->at_heap[k].node);
if (node == old)
goto again;
moveon:
if (k % 2)
goto moveup;
k++;
goto nonfringe;
moveup:
k >>= 1;
srp_swap_locked(&at->at_heap[k].node, new);
/* Change non-fringe node. */
if (k != i)
goto moveon;
}
struct art_node *
art_get(void *dst, uint8_t plen)
{
struct art_node *an;
an = pool_get(&an_pool, PR_NOWAIT | PR_ZERO);
if (an == NULL)
return (NULL);
an->an_plen = plen;
SRPL_INIT(&an->an_rtlist);
return (an);
}
void
art_put(struct art_node *an)
{
KASSERT(SRPL_EMPTY_LOCKED(&an->an_rtlist));
mtx_enter(&art_node_gc_mtx);
an->an_gc = art_node_gc_list;
art_node_gc_list = an;
mtx_leave(&art_node_gc_mtx);
task_add(systqmp, &art_node_gc_task);
}
void
art_gc(void *null)
{
struct art_node *an, *next;
mtx_enter(&art_node_gc_mtx);
an = art_node_gc_list;
art_node_gc_list = NULL;
mtx_leave(&art_node_gc_mtx);
while (an != NULL) {
next = an->an_gc;
srp_finalize(an, "artnfini");
pool_put(&an_pool, an);
an = next;
}
}
/* $OpenBSD: uvm_addr.c,v 1.31 2022/02/21 10:26:20 jsg Exp $ */
/*
* Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/* #define DEBUG */
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
#include <uvm/uvm_addr.h>
#include <sys/pool.h>
/* Max gap between hint allocations. */
#define UADDR_HINT_MAXGAP (4 * PAGE_SIZE)
/* Number of pivots in pivot allocator. */
#define NUM_PIVOTS 16
/*
* Max number (inclusive) of pages the pivot allocator
* will place between allocations.
*
* The uaddr_pivot_random() function attempts to bias towards
* small space between allocations, so putting a large number here is fine.
*/
#define PIVOT_RND 8
/*
* Number of allocations that a pivot can supply before expiring.
* When a pivot expires, a new pivot has to be found.
*
* Must be at least 1.
*/
#define PIVOT_EXPIRE 1024
/* Pool with uvm_addr_state structures. */
struct pool uaddr_pool;
struct pool uaddr_bestfit_pool;
struct pool uaddr_pivot_pool;
struct pool uaddr_rnd_pool;
/* uvm_addr state for bestfit selector. */
struct uaddr_bestfit_state {
struct uvm_addr_state ubf_uaddr;
struct uaddr_free_rbtree ubf_free;
};
/* uvm_addr state for rnd selector. */
struct uaddr_rnd_state {
struct uvm_addr_state ur_uaddr;
#if 0
TAILQ_HEAD(, vm_map_entry) ur_free;
#endif
};
/*
* Definition of a pivot in pivot selector.
*/
struct uaddr_pivot {
vaddr_t addr; /* End of prev. allocation. */
int expire; /* Best before date. */
int dir; /* Direction. */
struct vm_map_entry *entry; /* Will contain next alloc. */
};
/* uvm_addr state for pivot selector. */
struct uaddr_pivot_state {
struct uvm_addr_state up_uaddr;
/* Free space tree, for fast pivot selection. */
struct uaddr_free_rbtree up_free;
/* List of pivots. The pointers point to after the last allocation. */
struct uaddr_pivot up_pivots[NUM_PIVOTS];
};
/* Forward declaration (see below). */
extern const struct uvm_addr_functions uaddr_kernel_functions;
struct uvm_addr_state uaddr_kbootstrap;
/*
* Support functions.
*/
#ifndef SMALL_KERNEL
struct vm_map_entry *uvm_addr_entrybyspace(struct uaddr_free_rbtree*,
vsize_t);
#endif /* !SMALL_KERNEL */
void uaddr_kinsert(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry *);
void uaddr_kremove(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry *);
void uaddr_kbootstrapdestroy(struct uvm_addr_state *);
void uaddr_destroy(struct uvm_addr_state *);
void uaddr_kbootstrap_destroy(struct uvm_addr_state *);
void uaddr_rnd_destroy(struct uvm_addr_state *);
void uaddr_bestfit_destroy(struct uvm_addr_state *);
void uaddr_pivot_destroy(struct uvm_addr_state *);
#if 0
int uaddr_lin_select(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry **,
vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
vaddr_t);
#endif
int uaddr_kbootstrap_select(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry **,
vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
vaddr_t);
int uaddr_rnd_select(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry **,
vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
vaddr_t);
int uaddr_bestfit_select(struct vm_map *,
struct uvm_addr_state*, struct vm_map_entry **,
vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
vaddr_t);
#ifndef SMALL_KERNEL
int uaddr_pivot_select(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry **,
vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
vaddr_t);
int uaddr_stack_brk_select(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry **,
vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
vaddr_t);
#endif /* !SMALL_KERNEL */
void uaddr_rnd_insert(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry *);
void uaddr_rnd_remove(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry *);
void uaddr_bestfit_insert(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry *);
void uaddr_bestfit_remove(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry *);
void uaddr_pivot_insert(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry *);
void uaddr_pivot_remove(struct vm_map *,
struct uvm_addr_state *, struct vm_map_entry *);
#ifndef SMALL_KERNEL
vsize_t uaddr_pivot_random(void);
int uaddr_pivot_newpivot(struct vm_map *,
struct uaddr_pivot_state *, struct uaddr_pivot *,
struct vm_map_entry **, vaddr_t *,
vsize_t, vaddr_t, vaddr_t, vsize_t, vsize_t);
#endif /* !SMALL_KERNEL */
#if defined(DEBUG) || defined(DDB)
void uaddr_pivot_print(struct uvm_addr_state *, boolean_t,
int (*)(const char *, ...));
#if 0
void uaddr_rnd_print(struct uvm_addr_state *, boolean_t,
int (*)(const char *, ...));
#endif
#endif /* DEBUG || DDB */
#ifndef SMALL_KERNEL
/*
* Find smallest entry in tree that will fit sz bytes.
*/
struct vm_map_entry *
uvm_addr_entrybyspace(struct uaddr_free_rbtree *free, vsize_t sz)
{
struct vm_map_entry *tmp, *res;
tmp = RBT_ROOT(uaddr_free_rbtree, free);
res = NULL;
while (tmp) {
if (tmp->fspace >= sz) {
res = tmp;
tmp = RBT_LEFT(uaddr_free_rbtree, tmp);
} else if (tmp->fspace < sz)
tmp = RBT_RIGHT(uaddr_free_rbtree, tmp);
}
return res;
}
#endif /* !SMALL_KERNEL */
static inline vaddr_t
uvm_addr_align_forward(vaddr_t addr, vaddr_t align, vaddr_t offset)
{
vaddr_t adjusted;
KASSERT(offset < align || (align == 0 && offset == 0));
KASSERT((align & (align - 1)) == 0);
KASSERT((offset & PAGE_MASK) == 0);
align = MAX(align, PAGE_SIZE);
adjusted = addr & ~(align - 1);
adjusted += offset;
return (adjusted < addr ? adjusted + align : adjusted);
}
static inline vaddr_t
uvm_addr_align_backward(vaddr_t addr, vaddr_t align, vaddr_t offset)
{
vaddr_t adjusted;
KASSERT(offset < align || (align == 0 && offset == 0));
KASSERT((align & (align - 1)) == 0);
KASSERT((offset & PAGE_MASK) == 0);
align = MAX(align, PAGE_SIZE);
adjusted = addr & ~(align - 1);
adjusted += offset;
return (adjusted > addr ? adjusted - align : adjusted);
}
/*
* Try to fit the requested space into the entry.
*/
int
uvm_addr_fitspace(vaddr_t *min_result, vaddr_t *max_result,
vaddr_t low_addr, vaddr_t high_addr, vsize_t sz,
vaddr_t align, vaddr_t offset,
vsize_t before_gap, vsize_t after_gap)
{
vaddr_t tmp;
vsize_t fspace;
if (low_addr > high_addr)
return ENOMEM;
fspace = high_addr - low_addr;
if (fspace < before_gap + after_gap)
return ENOMEM;
if (fspace - before_gap - after_gap < sz)
return ENOMEM;
/*
* Calculate lowest address.
*/
low_addr += before_gap;
low_addr = uvm_addr_align_forward(tmp = low_addr, align, offset);
if (low_addr < tmp) /* Overflow during alignment. */
return ENOMEM;
if (high_addr - after_gap - sz < low_addr)
return ENOMEM;
/*
* Calculate highest address.
*/
high_addr -= after_gap + sz;
high_addr = uvm_addr_align_backward(tmp = high_addr, align, offset);
if (high_addr > tmp) /* Overflow during alignment. */
return ENOMEM;
if (low_addr > high_addr)
return ENOMEM;
*min_result = low_addr;
*max_result = high_addr;
return 0;
}
/*
* Initialize uvm_addr.
*/
void
uvm_addr_init(void)
{
pool_init(&uaddr_pool, sizeof(struct uvm_addr_state), 0,
IPL_VM, PR_WAITOK, "uaddr", NULL);
pool_init(&uaddr_bestfit_pool, sizeof(struct uaddr_bestfit_state), 0,
IPL_VM, PR_WAITOK, "uaddrbest", NULL);
pool_init(&uaddr_pivot_pool, sizeof(struct uaddr_pivot_state), 0,
IPL_VM, PR_WAITOK, "uaddrpivot", NULL);
pool_init(&uaddr_rnd_pool, sizeof(struct uaddr_rnd_state), 0,
IPL_VM, PR_WAITOK, "uaddrrnd", NULL);
uaddr_kbootstrap.uaddr_minaddr = PAGE_SIZE;
uaddr_kbootstrap.uaddr_maxaddr = -(vaddr_t)PAGE_SIZE;
uaddr_kbootstrap.uaddr_functions = &uaddr_kernel_functions;
}
/*
* Invoke destructor function of uaddr.
*/
void
uvm_addr_destroy(struct uvm_addr_state *uaddr)
{
if (uaddr)
(*uaddr->uaddr_functions->uaddr_destroy)(uaddr);
}
/*
* Move address forward to satisfy align, offset.
*/
vaddr_t
uvm_addr_align(vaddr_t addr, vaddr_t align, vaddr_t offset)
{
vaddr_t result = (addr & ~(align - 1)) + offset;
if (result < addr)
result += align;
return result;
}
/*
* Move address backwards to satisfy align, offset.
*/
vaddr_t
uvm_addr_align_back(vaddr_t addr, vaddr_t align, vaddr_t offset)
{
vaddr_t result = (addr & ~(align - 1)) + offset;
if (result > addr)
result -= align;
return result;
}
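/*
 * Worked example (illustrative): with align = 0x1000 and offset = 0x10,
 * uvm_addr_align(0x12345, ...) computes (0x12345 & ~0xfff) + 0x10 = 0x12010,
 * which is below the input address, so align is added and 0x13010 is
 * returned.  uvm_addr_align_back(0x12005, ...) also computes 0x12010, which
 * is above the input address, so align is subtracted and 0x11010 is
 * returned.
 */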
/*
* Directional first fit.
*
* Do a linear search for free space, starting at addr in entry.
* direction == 1: search forward
* direction == -1: search backward
*
* Output: low <= addr <= high and entry will contain addr.
* ENOMEM will be returned if no space is available.
*
* before_gap and after_gap describe the space that must be left free
* between the allocation and the preceding and following entries.
*/
int
uvm_addr_linsearch(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry **entry_out, vaddr_t *addr_out,
vaddr_t hint, vsize_t sz, vaddr_t align, vaddr_t offset,
int direction, vaddr_t low, vaddr_t high,
vsize_t before_gap, vsize_t after_gap)
{
struct vm_map_entry *entry;
vaddr_t low_addr, high_addr;
KASSERT(entry_out != NULL && addr_out != NULL);
KASSERT(direction == -1 || direction == 1);
KASSERT((hint & PAGE_MASK) == 0 && (high & PAGE_MASK) == 0 &&
(low & PAGE_MASK) == 0 &&
(before_gap & PAGE_MASK) == 0 && (after_gap & PAGE_MASK) == 0);
KASSERT(high + sz > high); /* Check for overflow. */
/*
* Hint magic.
*/
if (hint == 0)
hint = (direction == 1 ? low : high);
else if (hint > high) {
if (direction != -1)
return ENOMEM;
hint = high;
} else if (hint < low) {
if (direction != 1)
return ENOMEM;
hint = low;
}
for (entry = uvm_map_entrybyaddr(&map->addr,
hint - (direction == -1 ? 1 : 0)); entry != NULL;
entry = (direction == 1 ?
RBT_NEXT(uvm_map_addr, entry) :
RBT_PREV(uvm_map_addr, entry))) {
if ((direction == 1 && VMMAP_FREE_START(entry) > high) ||
(direction == -1 && VMMAP_FREE_END(entry) < low)) {
break;
}
if (uvm_addr_fitspace(&low_addr, &high_addr,
MAX(low, VMMAP_FREE_START(entry)),
MIN(high, VMMAP_FREE_END(entry)),
sz, align, offset, before_gap, after_gap) == 0) {
*entry_out = entry;
if (hint >= low_addr && hint <= high_addr) {
*addr_out = hint;
} else {
*addr_out = (direction == 1 ?
low_addr : high_addr);
}
return 0;
}
}
return ENOMEM;
}
/*
* Invoke address selector of uaddr.
* uaddr may be NULL, in which case the algorithm will fail with ENOMEM.
*
* Will invoke uvm_addr_isavail to fill in last_out.
*/
int
uvm_addr_invoke(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry **entry_out, struct vm_map_entry **last_out,
vaddr_t *addr_out,
vsize_t sz, vaddr_t align, vaddr_t offset, vm_prot_t prot, vaddr_t hint)
{
int error;
if (uaddr == NULL)
return ENOMEM;
hint &= ~((vaddr_t)PAGE_MASK);
if (hint != 0 &&
!(hint >= uaddr->uaddr_minaddr && hint < uaddr->uaddr_maxaddr))
return ENOMEM;
error = (*uaddr->uaddr_functions->uaddr_select)(map, uaddr,
entry_out, addr_out, sz, align, offset, prot, hint);
if (error == 0) {
KASSERT(*entry_out != NULL);
*last_out = NULL;
if (!uvm_map_isavail(map, uaddr, entry_out, last_out,
*addr_out, sz)) {
panic("uvm_addr_invoke: address selector %p "
"(%s 0x%lx-0x%lx) "
"returned unavailable address 0x%lx sz 0x%lx",
uaddr, uaddr->uaddr_functions->uaddr_name,
uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr,
*addr_out, sz);
}
}
return error;
}
#if defined(DEBUG) || defined(DDB)
void
uvm_addr_print(struct uvm_addr_state *uaddr, const char *slot, boolean_t full,
int (*pr)(const char *, ...))
{
if (uaddr == NULL) {
(*pr)("- uvm_addr %s: NULL\n", slot);
return;
}
(*pr)("- uvm_addr %s: %p (%s 0x%lx-0x%lx)\n", slot, uaddr,
uaddr->uaddr_functions->uaddr_name,
uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr);
if (uaddr->uaddr_functions->uaddr_print == NULL)
return;
(*uaddr->uaddr_functions->uaddr_print)(uaddr, full, pr);
}
#endif /* DEBUG || DDB */
/*
* Destroy a uvm_addr_state structure.
* The uaddr must have been previously allocated from uaddr_state_pool.
*/
void
uaddr_destroy(struct uvm_addr_state *uaddr)
{
pool_put(&uaddr_pool, uaddr);
}
#if 0
/*
* Linear allocator.
* This allocator uses a first-fit algorithm.
*
* If hint is set, search will start at the hint position.
* Only searches forward.
*/
const struct uvm_addr_functions uaddr_lin_functions = {
.uaddr_select = &uaddr_lin_select,
.uaddr_destroy = &uaddr_destroy,
.uaddr_name = "uaddr_lin"
};
struct uvm_addr_state *
uaddr_lin_create(vaddr_t minaddr, vaddr_t maxaddr)
{
struct uvm_addr_state *uaddr;
uaddr = pool_get(&uaddr_pool, PR_WAITOK);
uaddr->uaddr_minaddr = minaddr;
uaddr->uaddr_maxaddr = maxaddr;
uaddr->uaddr_functions = &uaddr_lin_functions;
return uaddr;
}
int
uaddr_lin_select(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry **entry_out, vaddr_t *addr_out,
vsize_t sz, vaddr_t align, vaddr_t offset,
vm_prot_t prot, vaddr_t hint)
{
vaddr_t guard_sz;
/*
* Deal with guardpages: search for space with one extra page.
*/
guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE);
if (uaddr->uaddr_maxaddr - uaddr->uaddr_minaddr - guard_sz < sz)
return ENOMEM;
return uvm_addr_linsearch(map, uaddr, entry_out, addr_out, 0, sz,
align, offset, 1, uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr - sz,
0, guard_sz);
}
#endif
/*
* Randomized allocator.
* This allocator uses uvm_map_hint to acquire a random address and searches
* from there.
*/
const struct uvm_addr_functions uaddr_rnd_functions = {
.uaddr_select = &uaddr_rnd_select,
.uaddr_free_insert = &uaddr_rnd_insert,
.uaddr_free_remove = &uaddr_rnd_remove,
.uaddr_destroy = &uaddr_rnd_destroy,
#if defined(DEBUG) || defined(DDB)
#if 0
.uaddr_print = &uaddr_rnd_print,
#endif
#endif /* DEBUG || DDB */
.uaddr_name = "uaddr_rnd"
};
struct uvm_addr_state *
uaddr_rnd_create(vaddr_t minaddr, vaddr_t maxaddr)
{
struct uaddr_rnd_state *uaddr;
uaddr = pool_get(&uaddr_rnd_pool, PR_WAITOK);
uaddr->ur_uaddr.uaddr_minaddr = minaddr;
uaddr->ur_uaddr.uaddr_maxaddr = maxaddr;
uaddr->ur_uaddr.uaddr_functions = &uaddr_rnd_functions;
#if 0
TAILQ_INIT(&uaddr->ur_free);
#endif
return &uaddr->ur_uaddr;
}
int
uaddr_rnd_select(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry **entry_out, vaddr_t *addr_out,
vsize_t sz, vaddr_t align, vaddr_t offset,
vm_prot_t prot, vaddr_t hint)
{
struct vmspace *vm;
vaddr_t minaddr, maxaddr;
vaddr_t guard_sz;
vaddr_t low_addr, high_addr;
struct vm_map_entry *entry, *next;
vsize_t before_gap, after_gap;
vaddr_t tmp;
KASSERT((map->flags & VM_MAP_ISVMSPACE) != 0);
vm = (struct vmspace *)map;
/* Deal with guardpages: search for space with one extra page. */
guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE);
if (uaddr->uaddr_maxaddr - guard_sz < sz)
return ENOMEM;
minaddr = uvm_addr_align_forward(uaddr->uaddr_minaddr, align, offset);
maxaddr = uvm_addr_align_backward(uaddr->uaddr_maxaddr - sz - guard_sz,
align, offset);
/* Quick fail if the allocation won't fit. */
if (minaddr >= maxaddr)
return ENOMEM;
/* Select a hint. */
if (hint == 0)
hint = uvm_map_hint(vm, prot, minaddr, maxaddr);
/* Clamp hint to uaddr range. */
hint = MIN(MAX(hint, minaddr), maxaddr);
/* Align hint to align,offset parameters. */
tmp = hint;
hint = uvm_addr_align_forward(tmp, align, offset);
/* Check for overflow during alignment. */
if (hint < tmp || hint > maxaddr)
return ENOMEM; /* Compatibility mode: never look backwards. */
before_gap = 0;
after_gap = guard_sz;
hint -= MIN(hint, before_gap);
/*
* Use the augmented address tree to look up the first entry
* at or after hint with sufficient space.
*
* This is the original optimized code, but it can fail when the
* subtree it looks at has sufficient space that nevertheless does
* not satisfy the align constraint.
*
* Guard: subtree is not exhausted and max(fspace) >= required.
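*
* fspace_augment is the largest free space found anywhere in the
* subtree rooted at an entry, so any subtree whose augment is smaller
* than before_gap + after_gap + sz can be skipped wholesale.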
*/
entry = uvm_map_entrybyaddr(&map->addr, hint);
/* Walk up the tree, until there is at least sufficient space. */
while (entry != NULL &&
entry->fspace_augment < before_gap + after_gap + sz)
entry = RBT_PARENT(uvm_map_addr, entry);
while (entry != NULL) {
/* Test if this fits. */
if (VMMAP_FREE_END(entry) > hint &&
uvm_map_uaddr_e(map, entry) == uaddr &&
uvm_addr_fitspace(&low_addr, &high_addr,
MAX(uaddr->uaddr_minaddr, VMMAP_FREE_START(entry)),
MIN(uaddr->uaddr_maxaddr, VMMAP_FREE_END(entry)),
sz, align, offset, before_gap, after_gap) == 0) {
*entry_out = entry;
if (hint >= low_addr && hint <= high_addr)
*addr_out = hint;
else
*addr_out = low_addr;
return 0;
}
/* RBT_NEXT, but skip subtrees that cannot possibly fit. */
next = RBT_RIGHT(uvm_map_addr, entry);
if (next != NULL &&
next->fspace_augment >= before_gap + after_gap + sz) {
entry = next;
while ((next = RBT_LEFT(uvm_map_addr, entry)) !=
NULL)
entry = next;
} else {
do_parent:
next = RBT_PARENT(uvm_map_addr, entry);
if (next == NULL)
entry = NULL;
else if (RBT_LEFT(uvm_map_addr, next) == entry)
entry = next;
else {
entry = next;
goto do_parent;
}
}
}
/* Lookup failed. */
return ENOMEM;
}
/*
* Destroy a uaddr_rnd_state structure.
*/
void
uaddr_rnd_destroy(struct uvm_addr_state *uaddr)
{
pool_put(&uaddr_rnd_pool, uaddr);
}
/*
* Add entry to tailq (no-op: the free-space tailq is compiled out above).
*/
void
uaddr_rnd_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
struct vm_map_entry *entry)
{
return;
}
/*
* Remove entry from tailq (no-op: the free-space tailq is compiled out above).
*/
void
uaddr_rnd_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
struct vm_map_entry *entry)
{
return;
}
#if 0
#if defined(DEBUG) || defined(DDB)
void
uaddr_rnd_print(struct uvm_addr_state *uaddr_p, boolean_t full,
int (*pr)(const char*, ...))
{
struct vm_map_entry *entry;
struct uaddr_rnd_state *uaddr;
vaddr_t addr;
size_t count;
vsize_t space;
uaddr = (struct uaddr_rnd_state *)uaddr_p;
addr = 0;
count = 0;
space = 0;
TAILQ_FOREACH(entry, &uaddr->ur_free, dfree.tailq) {
count++;
space += entry->fspace;
if (full) {
(*pr)("\tentry %p: 0x%lx-0x%lx G=0x%lx F=0x%lx\n",
entry, entry->start, entry->end,
entry->guard, entry->fspace);
(*pr)("\t\tfree: 0x%lx-0x%lx\n",
VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
}
if (entry->start < addr) {
if (!full)
(*pr)("\tentry %p: 0x%lx-0x%lx "
"G=0x%lx F=0x%lx\n",
entry, entry->start, entry->end,
entry->guard, entry->fspace);
(*pr)("\t\tstart=0x%lx, expected at least 0x%lx\n",
entry->start, addr);
}
addr = VMMAP_FREE_END(entry);
}
(*pr)("\t0x%lu entries, 0x%lx free bytes\n", count, space);
}
#endif /* DEBUG || DDB */
#endif
/*
* Kernel allocation bootstrap logic.
*/
const struct uvm_addr_functions uaddr_kernel_functions = {
.uaddr_select = &uaddr_kbootstrap_select,
.uaddr_destroy = &uaddr_kbootstrap_destroy,
.uaddr_name = "uaddr_kbootstrap"
};
/*
* Select an address from the map.
*
* This function ignores the uaddr spec and instead uses the map directly.
* Because of that property, the uaddr algorithm can be shared across all
* kernel maps.
*/
int
uaddr_kbootstrap_select(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry **entry_out, vaddr_t *addr_out,
vsize_t sz, vaddr_t align, vaddr_t offset, vm_prot_t prot, vaddr_t hint)
{
vaddr_t tmp;
RBT_FOREACH(*entry_out, uvm_map_addr, &map->addr) {
if (VMMAP_FREE_END(*entry_out) <= uvm_maxkaddr &&
uvm_addr_fitspace(addr_out, &tmp,
VMMAP_FREE_START(*entry_out), VMMAP_FREE_END(*entry_out),
sz, align, offset, 0, 0) == 0)
return 0;
}
return ENOMEM;
}
/*
* Don't destroy the kernel bootstrap allocator.
*/
void
uaddr_kbootstrap_destroy(struct uvm_addr_state *uaddr)
{
KASSERT(uaddr == (struct uvm_addr_state *)&uaddr_kbootstrap);
}
#ifndef SMALL_KERNEL
/*
* Best fit algorithm.
*/
const struct uvm_addr_functions uaddr_bestfit_functions = {
.uaddr_select = &uaddr_bestfit_select,
.uaddr_free_insert = &uaddr_bestfit_insert,
.uaddr_free_remove = &uaddr_bestfit_remove,
.uaddr_destroy = &uaddr_bestfit_destroy,
.uaddr_name = "uaddr_bestfit"
};
struct uvm_addr_state *
uaddr_bestfit_create(vaddr_t minaddr, vaddr_t maxaddr)
{
struct uaddr_bestfit_state *uaddr;
uaddr = pool_get(&uaddr_bestfit_pool, PR_WAITOK);
uaddr->ubf_uaddr.uaddr_minaddr = minaddr;
uaddr->ubf_uaddr.uaddr_maxaddr = maxaddr;
uaddr->ubf_uaddr.uaddr_functions = &uaddr_bestfit_functions;
RBT_INIT(uaddr_free_rbtree, &uaddr->ubf_free);
return &uaddr->ubf_uaddr;
}
void
uaddr_bestfit_destroy(struct uvm_addr_state *uaddr)
{
pool_put(&uaddr_bestfit_pool, uaddr);
}
void
uaddr_bestfit_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
struct vm_map_entry *entry)
{
struct uaddr_bestfit_state *uaddr;
struct vm_map_entry *rb_rv;
uaddr = (struct uaddr_bestfit_state *)uaddr_p;
if ((rb_rv = RBT_INSERT(uaddr_free_rbtree, &uaddr->ubf_free, entry)) !=
NULL) {
panic("%s: duplicate insertion: state %p "
"inserting %p, colliding with %p", __func__,
uaddr, entry, rb_rv);
}
}
void
uaddr_bestfit_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
struct vm_map_entry *entry)
{
struct uaddr_bestfit_state *uaddr;
uaddr = (struct uaddr_bestfit_state *)uaddr_p;
if (RBT_REMOVE(uaddr_free_rbtree, &uaddr->ubf_free, entry) != entry)
panic("%s: entry was not in tree", __func__);}
int
uaddr_bestfit_select(struct vm_map *map, struct uvm_addr_state *uaddr_p,
struct vm_map_entry **entry_out, vaddr_t *addr_out,
vsize_t sz, vaddr_t align, vaddr_t offset,
vm_prot_t prot, vaddr_t hint)
{
vaddr_t min, max;
struct uaddr_bestfit_state *uaddr;
struct vm_map_entry *entry;
vsize_t guardsz;
uaddr = (struct uaddr_bestfit_state *)uaddr_p;
guardsz = ((map->flags & VM_MAP_GUARDPAGES) ? PAGE_SIZE : 0);
if (sz + guardsz < sz)
return ENOMEM;
/*
* Find smallest item on freelist capable of holding item.
* Deal with guardpages: search for space with one extra page.
*/
entry = uvm_addr_entrybyspace(&uaddr->ubf_free, sz + guardsz);
if (entry == NULL)
return ENOMEM;
/*
* Walk the tree until we find an entry that fits.
*/
while (uvm_addr_fitspace(&min, &max,
VMMAP_FREE_START(entry), VMMAP_FREE_END(entry),
sz, align, offset, 0, guardsz) != 0) {
entry = RBT_NEXT(uaddr_free_rbtree, entry);
if (entry == NULL)
return ENOMEM;
}
/*
* Return the address that generates the least fragmentation.
*/
*entry_out = entry;
*addr_out = (min - VMMAP_FREE_START(entry) <=
VMMAP_FREE_END(entry) - guardsz - sz - max ?
min : max);
return 0;
}
#endif /* !SMALL_KERNEL */
#ifndef SMALL_KERNEL
/*
* A userspace allocator based on pivots.
*/
const struct uvm_addr_functions uaddr_pivot_functions = {
.uaddr_select = &uaddr_pivot_select,
.uaddr_free_insert = &uaddr_pivot_insert,
.uaddr_free_remove = &uaddr_pivot_remove,
.uaddr_destroy = &uaddr_pivot_destroy,
#if defined(DEBUG) || defined(DDB)
.uaddr_print = &uaddr_pivot_print,
#endif /* DEBUG || DDB */
.uaddr_name = "uaddr_pivot"
};
/*
* A special random function for pivots.
*
* This function will return:
* - a random number
* - a multiple of PAGE_SIZE
* - at least PAGE_SIZE
*
* The random function has a slightly higher chance of returning a small number.
*/
vsize_t
uaddr_pivot_random(void)
{
int r;
/*
* The sum of two six-sided dice has a triangular, roughly bell-shaped distribution.
* We map the highest probable number to 1, by folding the curve
* (think of a graph on a piece of paper, that you fold).
*
* Because the fold happens at PIVOT_RND - 1, the numbers 0 and 1
* have the same and highest probability of happening.
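*
* After the fold, r lies in [0, PIVOT_RND - 1], so the value returned
* below is between 1 and PIVOT_RND pages, with small page counts much
* more likely than large ones.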
*/
r = arc4random_uniform(PIVOT_RND) + arc4random_uniform(PIVOT_RND) -
(PIVOT_RND - 1);
if (r < 0)
r = -r;
/*
* Make the returned value at least PAGE_SIZE and a multiple of
* PAGE_SIZE.
*/
return (vaddr_t)(1 + r) << PAGE_SHIFT;
}
/*
* Select a new pivot.
*
* A pivot must:
* - be chosen at random
* - have a randomly chosen gap before it, where the uaddr_state starts
* - have a randomly chosen gap after it, before the uaddr_state ends
*
* Furthermore, the pivot must provide sufficient space for the allocation.
* The addr will be set to the selected address.
*
* Returns ENOMEM on failure.
*/
int
uaddr_pivot_newpivot(struct vm_map *map, struct uaddr_pivot_state *uaddr,
struct uaddr_pivot *pivot,
struct vm_map_entry **entry_out, vaddr_t *addr_out,
vsize_t sz, vaddr_t align, vaddr_t offset,
vsize_t before_gap, vsize_t after_gap)
{
struct vm_map_entry *entry, *found;
vaddr_t minaddr, maxaddr;
vsize_t dist;
vaddr_t found_minaddr, found_maxaddr;
vaddr_t min, max;
vsize_t arc4_arg;
int fit_error;
u_int32_t path;
minaddr = uaddr->up_uaddr.uaddr_minaddr;
maxaddr = uaddr->up_uaddr.uaddr_maxaddr;
KASSERT(minaddr < maxaddr);
#ifdef DIAGNOSTIC
if (minaddr + 2 * PAGE_SIZE > maxaddr) {
panic("uaddr_pivot_newpivot: cannot grant random pivot "
"in area less than 2 pages (size = 0x%lx)",
maxaddr - minaddr);
}
#endif /* DIAGNOSTIC */
/*
* Gap calculation: 1/32 of the size of the managed area.
*
* At most: sufficient to not get truncated at arc4random.
* At least: 2 PAGE_SIZE
*
* minaddr and maxaddr will be changed according to arc4random.
*/
dist = MAX((maxaddr - minaddr) / 32, 2 * (vaddr_t)PAGE_SIZE);
if (dist >> PAGE_SHIFT > 0xffffffff) {
minaddr += (vsize_t)arc4random() << PAGE_SHIFT;
maxaddr -= (vsize_t)arc4random() << PAGE_SHIFT;
} else {
minaddr += (vsize_t)arc4random_uniform(dist >> PAGE_SHIFT) <<
PAGE_SHIFT;
maxaddr -= (vsize_t)arc4random_uniform(dist >> PAGE_SHIFT) <<
PAGE_SHIFT;
}
/*
* A very fast way to find an entry that will be large enough
* to hold the allocation, but still is found more or less
* randomly: the tree path selector has a 50% chance to go for
* a bigger or smaller entry.
*
* Note that the memory may actually be available,
* but the fragmentation may be so bad and the gaps chosen
* so unfortunately, that the allocation will not succeed.
* Or the alignment can only be satisfied by an entry that
* is not visited in the randomly selected path.
*
* This code finds an entry with sufficient space in O(log n) time.
*/
path = arc4random();
found = NULL;
entry = RBT_ROOT(uaddr_free_rbtree, &uaddr->up_free);
while (entry != NULL) {
fit_error = uvm_addr_fitspace(&min, &max,
MAX(VMMAP_FREE_START(entry), minaddr),
MIN(VMMAP_FREE_END(entry), maxaddr),
sz, align, offset, before_gap, after_gap);
/* It fits, save this entry. */
if (fit_error == 0) {
found = entry;
found_minaddr = min;
found_maxaddr = max;
}
/* Next. */
if (fit_error != 0)
entry = RBT_RIGHT(uaddr_free_rbtree, entry);
else if ((path & 0x1) == 0) {
path >>= 1;
entry = RBT_RIGHT(uaddr_free_rbtree, entry);
} else {
path >>= 1;
entry = RBT_LEFT(uaddr_free_rbtree, entry);
}
}
if (found == NULL)
return ENOMEM; /* No sufficiently large region found. */
/*
* Calculate a random address within found.
*
* found_minaddr and found_maxaddr are already aligned, so be sure
* to select a multiple of align as the offset in the entry.
* Preferably, arc4random_uniform is used to provide no bias within
* the entry.
* However, if the size of the entry exceeds arc4random_uniform's
* argument limit, we simply use arc4random() (thus limiting ourselves
* to a 4G * PAGE_SIZE byte offset).
*/
if (found_maxaddr == found_minaddr)
*addr_out = found_minaddr;
else {
KASSERT(align >= PAGE_SIZE && (align & (align - 1)) == 0);
arc4_arg = found_maxaddr - found_minaddr;
if (arc4_arg > 0xffffffff) {
*addr_out = found_minaddr +
(arc4random() & ~(align - 1));
} else {
*addr_out = found_minaddr +
(arc4random_uniform(arc4_arg) & ~(align - 1));
}
}
/* Address was found in this entry. */
*entry_out = found;
/*
* Set up new pivot and return selected address.
*
* Depending on the direction of the pivot, the pivot must be placed
* at the bottom or the top of the allocation:
* - if the pivot moves upwards, place the pivot at the top of the
* allocation,
* - if the pivot moves downwards, place the pivot at the bottom
* of the allocation.
*/
pivot->entry = found;
pivot->dir = (arc4random() & 0x1 ? 1 : -1);
if (pivot->dir > 0)
pivot->addr = *addr_out + sz;
else
pivot->addr = *addr_out;
pivot->expire = PIVOT_EXPIRE - 1; /* First use is right now. */
return 0;
}
/*
* Pivot selector.
*
* Each time the selector is invoked, it will select a random pivot, which
* it will use to select memory with. The memory will be placed at the pivot,
* with a randomly sized gap between the allocation and the pivot.
* The pivot will then move so it will never revisit this address.
*
* On each allocation, the pivot expiry timer ticks. Once the pivot becomes
* expired, it will be replaced with a newly created pivot. Pivots also
* automatically expire if they fail to provide memory for an allocation.
*
* Expired pivots are replaced using the uaddr_pivot_newpivot() function,
* which will ensure the pivot points at memory in such a way that the
* allocation will succeed.
* As an added bonus, the uaddr_pivot_newpivot() function will perform the
* allocation immediately and move the pivot as appropriate.
*
* If uaddr_pivot_newpivot() fails to find a new pivot that will allow the
* allocation to succeed, it will not create a new pivot and the allocation
* will fail.
*
* A pivot running into used memory will automatically expire (because it will
* fail to allocate).
*
* Characteristics of the allocator:
* - best case, an allocation is O(log N)
* (it would be O(1), if it weren't for the need to check if the memory is
* free; although that can be avoided...)
* - worst case, an allocation is O(log N)
* (the uaddr_pivot_newpivot() function has that complexity)
* - failed allocations always take O(log N)
* (the uaddr_pivot_newpivot() function will walk that deep into the tree).
*/
int
uaddr_pivot_select(struct vm_map *map, struct uvm_addr_state *uaddr_p,
struct vm_map_entry **entry_out, vaddr_t *addr_out,
vsize_t sz, vaddr_t align, vaddr_t offset,
vm_prot_t prot, vaddr_t hint)
{
struct uaddr_pivot_state *uaddr;
struct vm_map_entry *entry;
struct uaddr_pivot *pivot;
vaddr_t min, max;
vsize_t before_gap, after_gap;
int err;
/*
* When we have a hint, use the rnd allocator that finds the
* area that is closest to the hint, if there is such an area.
*/
if (hint != 0) {
if (uaddr_rnd_select(map, uaddr_p, entry_out, addr_out,
sz, align, offset, prot, hint) == 0)
return 0;
return ENOMEM;
}
/*
* Select a random pivot and random gap sizes around the allocation.
*/
uaddr = (struct uaddr_pivot_state *)uaddr_p;
pivot = &uaddr->up_pivots[
arc4random_uniform(nitems(uaddr->up_pivots))];
before_gap = uaddr_pivot_random();
after_gap = uaddr_pivot_random();
if (pivot->addr == 0 || pivot->entry == NULL || pivot->expire == 0)
goto expired; /* Pivot is invalid (null or expired). */
/*
* Attempt to use the pivot to map the entry.
*/
entry = pivot->entry;
if (pivot->dir > 0) {
if (uvm_addr_fitspace(&min, &max,
MAX(VMMAP_FREE_START(entry), pivot->addr),
VMMAP_FREE_END(entry), sz, align, offset,
before_gap, after_gap) == 0) {
*addr_out = min;
*entry_out = entry;
pivot->addr = min + sz;
pivot->expire--;
return 0;
}
} else {
if (uvm_addr_fitspace(&min, &max,
VMMAP_FREE_START(entry),
MIN(VMMAP_FREE_END(entry), pivot->addr),
sz, align, offset, before_gap, after_gap) == 0) {
*addr_out = max;
*entry_out = entry;
pivot->addr = max;
pivot->expire--;
return 0;
}
}
expired:
/*
* Pivot expired or allocation failed.
* Use pivot selector to do the allocation and find a new pivot.
*/
err = uaddr_pivot_newpivot(map, uaddr, pivot, entry_out, addr_out,
sz, align, offset, before_gap, after_gap);
return err;
}
/*
* Free the pivot.
*/
void
uaddr_pivot_destroy(struct uvm_addr_state *uaddr)
{
pool_put(&uaddr_pivot_pool, uaddr);
}
/*
* Insert an entry with free space in the space tree.
*/
void
uaddr_pivot_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
struct vm_map_entry *entry)
{
struct uaddr_pivot_state *uaddr;
struct vm_map_entry *rb_rv;
struct uaddr_pivot *p;
vaddr_t check_addr;
vaddr_t start, end;
uaddr = (struct uaddr_pivot_state *)uaddr_p;
if ((rb_rv = RBT_INSERT(uaddr_free_rbtree, &uaddr->up_free, entry)) !=
NULL) {
panic("%s: duplicate insertion: state %p "
"inserting entry %p which collides with %p", __func__,
uaddr, entry, rb_rv);
}
start = VMMAP_FREE_START(entry);
end = VMMAP_FREE_END(entry);
/*
* Update all pivots that are contained in this entry.
*/
for (p = &uaddr->up_pivots[0];
p != &uaddr->up_pivots[nitems(uaddr->up_pivots)]; p++) {
check_addr = p->addr;
if (check_addr == 0)
continue;
if (p->dir < 0)
check_addr--;
if (start <= check_addr &&
check_addr < end) {
KASSERT(p->entry == NULL);
p->entry = entry;
}
}
}
/*
* Remove an entry with free space from the space tree.
*/
void
uaddr_pivot_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
struct vm_map_entry *entry)
{
struct uaddr_pivot_state *uaddr;
struct uaddr_pivot *p;
uaddr = (struct uaddr_pivot_state *)uaddr_p;
if (RBT_REMOVE(uaddr_free_rbtree, &uaddr->up_free, entry) != entry)
panic("%s: entry was not in tree", __func__);
/*
* Inform any pivot with this entry that the entry is gone.
* Note that this does not automatically invalidate the pivot.
*/
for (p = &uaddr->up_pivots[0];
p != &uaddr->up_pivots[nitems(uaddr->up_pivots)]; p++) {
if (p->entry == entry)
p->entry = NULL;
}
}
/*
* Create a new pivot selector.
*
* Initially, all pivots are in the expired state.
* Two reasons for this:
* - it means this allocator will not take a huge amount of time
* - pivots select better on demand, because the pivot selection will be
* affected by preceding allocations:
* the next pivots will likely end up in different segments of free memory,
* that was segmented by an earlier allocation; better spread.
*/
struct uvm_addr_state *
uaddr_pivot_create(vaddr_t minaddr, vaddr_t maxaddr)
{
struct uaddr_pivot_state *uaddr;
uaddr = pool_get(&uaddr_pivot_pool, PR_WAITOK);
uaddr->up_uaddr.uaddr_minaddr = minaddr;
uaddr->up_uaddr.uaddr_maxaddr = maxaddr;
uaddr->up_uaddr.uaddr_functions = &uaddr_pivot_functions;
RBT_INIT(uaddr_free_rbtree, &uaddr->up_free);
memset(uaddr->up_pivots, 0, sizeof(uaddr->up_pivots));
return &uaddr->up_uaddr;
}
#if defined(DEBUG) || defined(DDB)
/*
* Print the uaddr_pivot_state.
*
* If full, a listing of all entries in the state will be provided.
*/
void
uaddr_pivot_print(struct uvm_addr_state *uaddr_p, boolean_t full,
int (*pr)(const char *, ...))
{
struct uaddr_pivot_state *uaddr;
struct uaddr_pivot *pivot;
struct vm_map_entry *entry;
int i;
vaddr_t check_addr;
uaddr = (struct uaddr_pivot_state *)uaddr_p;
for (i = 0; i < NUM_PIVOTS; i++) {
pivot = &uaddr->up_pivots[i];
(*pr)("\tpivot 0x%lx, epires in %d, direction %d\n",
pivot->addr, pivot->expire, pivot->dir);
}
if (!full)
return;
if (RBT_EMPTY(uaddr_free_rbtree, &uaddr->up_free))
(*pr)("\tempty\n");
/* Print list of free space. */
RBT_FOREACH(entry, uaddr_free_rbtree, &uaddr->up_free) {
(*pr)("\t0x%lx - 0x%lx free (0x%lx bytes)\n",
VMMAP_FREE_START(entry), VMMAP_FREE_END(entry),
VMMAP_FREE_END(entry) - VMMAP_FREE_START(entry));
for (i = 0; i < NUM_PIVOTS; i++) {
pivot = &uaddr->up_pivots[i];
check_addr = pivot->addr;
if (check_addr == 0)
continue;
if (pivot->dir < 0)
check_addr--;
if (VMMAP_FREE_START(entry) <= check_addr &&
check_addr < VMMAP_FREE_END(entry)) {
(*pr)("\t\tcontains pivot %d (0x%lx)\n",
i, pivot->addr);
}
}
}
}
#endif /* DEBUG || DDB */
#endif /* !SMALL_KERNEL */
#ifndef SMALL_KERNEL
/*
* Stack/break allocator.
*
* The stack area is grown into in the direction opposite to stack growth;
* the brk area is grown into from the top down (because sbrk() grows upward).
*
* Both areas are grown into: the brk area is tried first and, if the
* allocation fails there, the stack area is tried.
*/
const struct uvm_addr_functions uaddr_stack_brk_functions = {
.uaddr_select = &uaddr_stack_brk_select,
.uaddr_destroy = &uaddr_destroy,
.uaddr_name = "uaddr_stckbrk"
};
/*
* Stack/brk address selector.
*/
int
uaddr_stack_brk_select(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry **entry_out, vaddr_t *addr_out,
vsize_t sz, vaddr_t align, vaddr_t offset,
vm_prot_t prot, vaddr_t hint)
{
vaddr_t start;
vaddr_t end;
vsize_t before_gap;
vsize_t after_gap;
int dir;
/* Set up brk search strategy. */
start = MAX(map->b_start, uaddr->uaddr_minaddr);
end = MIN(map->b_end, uaddr->uaddr_maxaddr);
before_gap = 0;
after_gap = 0;
dir = -1; /* Opposite of brk() growth. */
if (end - start >= sz) {
if (uvm_addr_linsearch(map, uaddr, entry_out, addr_out,
0, sz, align, offset, dir, start, end - sz,
before_gap, after_gap) == 0)
return 0;
}
/* Set up stack search strategy. */
start = MAX(map->s_start, uaddr->uaddr_minaddr);
end = MIN(map->s_end, uaddr->uaddr_maxaddr);
before_gap = ((arc4random() & 0x3) + 1) << PAGE_SHIFT;
after_gap = ((arc4random() & 0x3) + 1) << PAGE_SHIFT;
#ifdef MACHINE_STACK_GROWS_UP
dir = -1;
#else
dir = 1;
#endif
if (end - start >= before_gap + after_gap &&
end - start - before_gap - after_gap >= sz) {
if (uvm_addr_linsearch(map, uaddr, entry_out, addr_out,
0, sz, align, offset, dir, start, end - sz,
before_gap, after_gap) == 0)
return 0;
}
return ENOMEM;
}
struct uvm_addr_state *
uaddr_stack_brk_create(vaddr_t minaddr, vaddr_t maxaddr)
{
struct uvm_addr_state* uaddr;
uaddr = pool_get(&uaddr_pool, PR_WAITOK);
uaddr->uaddr_minaddr = minaddr;
uaddr->uaddr_maxaddr = maxaddr;
uaddr->uaddr_functions = &uaddr_stack_brk_functions;
return uaddr;
}
#endif /* !SMALL_KERNEL */
#ifndef SMALL_KERNEL
/*
* Free space comparison.
* Compares smaller free-space before larger free-space.
*/
static inline int
uvm_mapent_fspace_cmp(const struct vm_map_entry *e1,
const struct vm_map_entry *e2)
{
if (e1->fspace != e2->fspace)
return (e1->fspace < e2->fspace ? -1 : 1);
return (e1->start < e2->start ? -1 : e1->start > e2->start);
}
RBT_GENERATE(uaddr_free_rbtree, vm_map_entry, dfree.rbtree,
uvm_mapent_fspace_cmp);
#endif /* !SMALL_KERNEL */
/* $OpenBSD: route.c,v 1.414 2022/08/29 07:51:45 bluhm Exp $ */
/* $NetBSD: route.c,v 1.14 1996/02/13 22:00:46 christos Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.2 (Berkeley) 11/15/93
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/domain.h>
#include <sys/ioctl.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#endif
#ifdef BFD
#include <net/bfd.h>
#endif
#define ROUNDUP(a) (a>0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
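/*
* ROUNDUP() above rounds a sockaddr length up to the next multiple of
* sizeof(long), e.g. ROUNDUP(5) is 8 on LP64; a length of 0 maps to
* sizeof(long) so that an empty sockaddr still occupies one slot.
*/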
/* Give some jitter to hash, to avoid synchronization between routers. */
static uint32_t rt_hashjitter;
extern unsigned int rtmap_limit;
struct cpumem * rtcounters;
int rttrash; /* routes not in table but not freed */
struct pool rtentry_pool; /* pool for rtentry structures */
struct pool rttimer_pool; /* pool for rttimer structures */
int rt_setgwroute(struct rtentry *, u_int);
void rt_putgwroute(struct rtentry *);
int rtflushclone1(struct rtentry *, void *, u_int);
int rtflushclone(struct rtentry *, unsigned int);
int rt_ifa_purge_walker(struct rtentry *, void *, unsigned int);
struct rtentry *rt_match(struct sockaddr *, uint32_t *, int, unsigned int);
int rt_clone(struct rtentry **, struct sockaddr *, unsigned int);
struct sockaddr *rt_plentosa(sa_family_t, int, struct sockaddr_in6 *);
static int rt_copysa(struct sockaddr *, struct sockaddr *, struct sockaddr **);
#define LABELID_MAX 50000
struct rt_label {
TAILQ_ENTRY(rt_label) rtl_entry;
char rtl_name[RTLABEL_LEN];
u_int16_t rtl_id;
int rtl_ref;
};
TAILQ_HEAD(rt_labels, rt_label) rt_labels = TAILQ_HEAD_INITIALIZER(rt_labels);
void
route_init(void)
{
rtcounters = counters_alloc(rts_ncounters);
pool_init(&rtentry_pool, sizeof(struct rtentry), 0, IPL_MPFLOOR, 0,
"rtentry", NULL);
while (rt_hashjitter == 0)
rt_hashjitter = arc4random();
#ifdef BFD
bfdinit();
#endif
}
/*
* Returns 1 if the (cached) ``rt'' entry is still valid, 0 otherwise.
*/
int
rtisvalid(struct rtentry *rt)
{
if (rt == NULL)
return (0);
if (!ISSET(rt->rt_flags, RTF_UP))
return (0);
if (ISSET(rt->rt_flags, RTF_GATEWAY)) {
KASSERT(rt->rt_gwroute != NULL);
KASSERT(!ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY));
if (!ISSET(rt->rt_gwroute->rt_flags, RTF_UP))
return (0);
}
return (1);
}
/*
* Do the actual lookup for rtalloc(9), do not use directly!
*
* Return the best matching entry for the destination ``dst''.
*
* "RT_RESOLVE" means that a corresponding L2 entry should
* be added to the routing table and resolved (via ARP or
* NDP), if it does not exist.
*/
struct rtentry *
rt_match(struct sockaddr *dst, uint32_t *src, int flags, unsigned int tableid)
{
struct rtentry *rt = NULL;
rt = rtable_match(tableid, dst, src);
if (rt == NULL) {
rtstat_inc(rts_unreach);
return (NULL);
}
if (ISSET(rt->rt_flags, RTF_CLONING) && ISSET(flags, RT_RESOLVE))
rt_clone(&rt, dst, tableid);
rt->rt_use++;
return (rt);
}
int
rt_clone(struct rtentry **rtp, struct sockaddr *dst, unsigned int rtableid)
{
struct rt_addrinfo info;
struct rtentry *rt = *rtp;
int error = 0;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = dst;
/*
* The priority of a cloned route should be different
* to avoid conflict with /32 cloning routes.
*
* It should also be higher to let the ARP layer find
* cloned routes instead of the cloning one.
*/
KERNEL_LOCK();
error = rtrequest(RTM_RESOLVE, &info, rt->rt_priority - 1, &rt,
rtableid);
KERNEL_UNLOCK();
if (error) {
rtm_miss(RTM_MISS, &info, 0, RTP_NONE, 0, error, rtableid);
} else {
/* Inform listeners of the new route */
rtm_send(rt, RTM_ADD, 0, rtableid);
rtfree(*rtp);
*rtp = rt;
}
return (error);
}
/*
* Originated from bridge_hash() in if_bridge.c
*/
#define mix(a, b, c) do { \
a -= b; a -= c; a ^= (c >> 13); \
b -= c; b -= a; b ^= (a << 8); \
c -= a; c -= b; c ^= (b >> 13); \
a -= b; a -= c; a ^= (c >> 12); \
b -= c; b -= a; b ^= (a << 16); \
c -= a; c -= b; c ^= (b >> 5); \
a -= b; a -= c; a ^= (c >> 3); \
b -= c; b -= a; b ^= (a << 10); \
c -= a; c -= b; c ^= (b >> 15); \
} while (0)
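/*
* mix() diffuses the three 32-bit state words so that every input bit
* influences every word of the result; rt_hash() below feeds the
* destination address and the source words through it.
*/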
int
rt_hash(struct rtentry *rt, struct sockaddr *dst, uint32_t *src)
{
uint32_t a, b, c;
if (src == NULL || !rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_MPATH))
return (-1);
a = b = 0x9e3779b9;
c = rt_hashjitter;
switch (dst->sa_family) {
case AF_INET:
{
struct sockaddr_in *sin;
if (!ipmultipath)
return (-1);
sin = satosin(dst);
a += sin->sin_addr.s_addr;
b += src[0];
mix(a, b, c);
break;
}
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *sin6;
if (!ip6_multipath)
return (-1);
sin6 = satosin6(dst);
a += sin6->sin6_addr.s6_addr32[0];
b += sin6->sin6_addr.s6_addr32[2];
c += src[0];
mix(a, b, c);
a += sin6->sin6_addr.s6_addr32[1];
b += sin6->sin6_addr.s6_addr32[3];
c += src[1];
mix(a, b, c);
a += sin6->sin6_addr.s6_addr32[2];
b += sin6->sin6_addr.s6_addr32[1];
c += src[2];
mix(a, b, c);
a += sin6->sin6_addr.s6_addr32[3];
b += sin6->sin6_addr.s6_addr32[0];
c += src[3];
mix(a, b, c);
break;
}
#endif /* INET6 */
}
return (c & 0xffff);
}
/*
* Allocate a route, potentially using multipath to select the peer.
*/
struct rtentry *
rtalloc_mpath(struct sockaddr *dst, uint32_t *src, unsigned int rtableid)
{
return (rt_match(dst, src, RT_RESOLVE, rtableid));
}
/*
* Look in the routing table for the best matching entry for
* ``dst''.
*
* If a route with a gateway is found and its next hop is no
* longer valid, try to cache it.
*/
struct rtentry *
rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid)
{
return (rt_match(dst, NULL, flags, rtableid));
}
/*
* Cache the route entry corresponding to a reachable next hop in
* the gateway entry ``rt''.
*/
int
rt_setgwroute(struct rtentry *rt, u_int rtableid)
{
struct rtentry *prt, *nhrt;
unsigned int rdomain = rtable_l2(rtableid);
int error;
NET_ASSERT_LOCKED();
KASSERT(ISSET(rt->rt_flags, RTF_GATEWAY));
/* If we cannot find a valid next hop, bail. */
nhrt = rt_match(rt->rt_gateway, NULL, RT_RESOLVE, rdomain);
if (nhrt == NULL)
return (ENOENT);
/* Next hop entry must be on the same interface. */
if (nhrt->rt_ifidx != rt->rt_ifidx) {
struct sockaddr_in6 sa_mask;
if (!ISSET(nhrt->rt_flags, RTF_LLINFO) ||
!ISSET(nhrt->rt_flags, RTF_CLONED)) {
rtfree(nhrt);
return (EHOSTUNREACH);
}
/*
* We found an L2 entry, so we might have multiple
* RTF_CLONING routes for the same subnet. Query
* the first route of the multipath chain and iterate
* until we find the correct one.
*/
prt = rtable_lookup(rdomain, rt_key(nhrt->rt_parent),
rt_plen2mask(nhrt->rt_parent, &sa_mask), NULL, RTP_ANY);
rtfree(nhrt);
while (prt != NULL && prt->rt_ifidx != rt->rt_ifidx)
prt = rtable_iterate(prt);
/* We found nothing or a non-cloning MPATH route. */
if (prt == NULL || !ISSET(prt->rt_flags, RTF_CLONING)) {
rtfree(prt);
return (EHOSTUNREACH);
}
error = rt_clone(&prt, rt->rt_gateway, rdomain);
if (error) {
rtfree(prt);
return (error);
}
nhrt = prt;
}
/*
* The next hop must be reachable; this also prevents rtentry
* loops, for example when rt->rt_gwroute points to rt.
*/
if (ISSET(nhrt->rt_flags, RTF_CLONING|RTF_GATEWAY)) {
rtfree(nhrt);
return (ENETUNREACH);
}
/* Next hop is valid so remove possible old cache. */
rt_putgwroute(rt);
KASSERT(rt->rt_gwroute == NULL);
/*
* If the MTU of next hop is 0, this will reset the MTU of the
* route to run PMTUD again from scratch.
*/
if (!ISSET(rt->rt_locks, RTV_MTU) && (rt->rt_mtu > nhrt->rt_mtu))
rt->rt_mtu = nhrt->rt_mtu;
/*
* To avoid reference counting problems when writing link-layer
* addresses in an outgoing packet, we ensure that the lifetime
* of a cached entry is greater than the lifetime of any gateway
* entry that points to it.
*/
nhrt->rt_flags |= RTF_CACHED;
nhrt->rt_cachecnt++;
rt->rt_gwroute = nhrt;
return (0);
}
/*
* Invalidate the cached route entry of the gateway entry ``rt''.
*/
void
rt_putgwroute(struct rtentry *rt)
{
struct rtentry *nhrt = rt->rt_gwroute;
NET_ASSERT_LOCKED();
if (!ISSET(rt->rt_flags, RTF_GATEWAY) || nhrt == NULL)
return;
KASSERT(ISSET(nhrt->rt_flags, RTF_CACHED));
KASSERT(nhrt->rt_cachecnt > 0);
--nhrt->rt_cachecnt;
if (nhrt->rt_cachecnt == 0)
nhrt->rt_flags &= ~RTF_CACHED;
rtfree(rt->rt_gwroute);
rt->rt_gwroute = NULL;
}
void
rtref(struct rtentry *rt)
{
refcnt_take(&rt->rt_refcnt);
}
void
rtfree(struct rtentry *rt)
{
if (rt == NULL)
return;
if (refcnt_rele(&rt->rt_refcnt) == 0)
return;
KASSERT(!ISSET(rt->rt_flags, RTF_UP));
KASSERT(!RT_ROOT(rt));
atomic_dec_int(&rttrash);
KERNEL_LOCK();
rt_timer_remove_all(rt);
ifafree(rt->rt_ifa);
rtlabel_unref(rt->rt_labelid);
#ifdef MPLS
rt_mpls_clear(rt);
#endif
free(rt->rt_gateway, M_RTABLE, ROUNDUP(rt->rt_gateway->sa_len));
free(rt_key(rt), M_RTABLE, rt_key(rt)->sa_len);
KERNEL_UNLOCK();
pool_put(&rtentry_pool, rt);
}
struct ifaddr *
ifaref(struct ifaddr *ifa)
{
refcnt_take(&ifa->ifa_refcnt);
return ifa;
}
void
ifafree(struct ifaddr *ifa)
{
if (refcnt_rele(&ifa->ifa_refcnt) == 0)
return;
free(ifa, M_IFADDR, 0);
}
/*
* Force a routing table entry to the specified
* destination to go through the given gateway.
* Normally called as a result of a routing redirect
* message from the network layer.
*/
void
rtredirect(struct sockaddr *dst, struct sockaddr *gateway,
struct sockaddr *src, struct rtentry **rtp, unsigned int rdomain)
{
struct rtentry *rt;
int error = 0;
enum rtstat_counters stat = rts_ncounters;
struct rt_addrinfo info;
struct ifaddr *ifa;
unsigned int ifidx = 0;
int flags = RTF_GATEWAY|RTF_HOST;
uint8_t prio = RTP_NONE;
NET_ASSERT_LOCKED();
/* verify the gateway is directly reachable */
rt = rtalloc(gateway, 0, rdomain);
if (!rtisvalid(rt) || ISSET(rt->rt_flags, RTF_GATEWAY)) {
rtfree(rt);
error = ENETUNREACH;
goto out;
}
ifidx = rt->rt_ifidx;
ifa = rt->rt_ifa;
rtfree(rt);
rt = NULL;
rt = rtable_lookup(rdomain, dst, NULL, NULL, RTP_ANY);
/*
* If the redirect isn't from our current router for this dst,
* it's either old or wrong. If it redirects us to ourselves,
* we have a routing loop, perhaps as a result of an interface
* going down recently.
*/
#define equal(a1, a2) \
((a1)->sa_len == (a2)->sa_len && \
bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0)
if (rt != NULL && (!equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
error = EINVAL;
else if (ifa_ifwithaddr(gateway, rdomain) != NULL ||
(gateway->sa_family == AF_INET &&
in_broadcast(satosin(gateway)->sin_addr, rdomain)))
error = EHOSTUNREACH;
if (error)
goto done;
/*
* Create a new entry if we just got back a wildcard entry
* or the lookup failed. This is necessary for hosts
* which use routing redirects generated by smart gateways
* to dynamically build the routing tables.
*/
if (rt == NULL)
goto create;
/*
* Don't listen to the redirect if it's
* for a route to an interface.
*/
if (ISSET(rt->rt_flags, RTF_GATEWAY)) {
if (!ISSET(rt->rt_flags, RTF_HOST)) {
/*
* Changing from route to net => route to host.
* Create new route, rather than smashing route to net.
*/
create:
rtfree(rt);
flags |= RTF_DYNAMIC;
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_ifa = ifa;
info.rti_flags = flags;
rt = NULL;
error = rtrequest(RTM_ADD, &info, RTP_DEFAULT, &rt,
rdomain);
if (error == 0) {
flags = rt->rt_flags;
prio = rt->rt_priority;
}
stat = rts_dynamic;
} else {
/*
* Smash the current notion of the gateway to
* this destination. Should check about netmask!!!
*/
rt->rt_flags |= RTF_MODIFIED;
flags |= RTF_MODIFIED;
prio = rt->rt_priority;
stat = rts_newgateway;
rt_setgate(rt, gateway, rdomain);
}
} else
error = EHOSTUNREACH;
done:
if (rt) {
if (rtp && !error)
*rtp = rt;
else
rtfree(rt);
}
out:
if (error)
rtstat_inc(rts_badredirect);
else if (stat != rts_ncounters)
rtstat_inc(stat);
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_AUTHOR] = src;
rtm_miss(RTM_REDIRECT, &info, flags, prio, ifidx, error, rdomain);
}
/*
* Delete a route and generate a message
*/
int
rtdeletemsg(struct rtentry *rt, struct ifnet *ifp, u_int tableid)
{
int error;
struct rt_addrinfo info;
struct sockaddr_rtlabel sa_rl;
struct sockaddr_in6 sa_mask;
KASSERT(rt->rt_ifidx == ifp->if_index);
/*
* Request the new route so that the entry is not actually
* deleted. That will allow the information being reported to
* be accurate (and consistent with route_output()).
*/
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
if (!ISSET(rt->rt_flags, RTF_HOST))
info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
info.rti_flags = rt->rt_flags;
info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
error = rtrequest_delete(&info, rt->rt_priority, ifp, &rt, tableid);
rtm_miss(RTM_DELETE, &info, info.rti_flags, rt->rt_priority,
rt->rt_ifidx, error, tableid);
if (error == 0)
rtfree(rt);
return (error);
}
static inline int
rtequal(struct rtentry *a, struct rtentry *b)
{
if (a == b)
return 1;
if (memcmp(rt_key(a), rt_key(b), rt_key(a)->sa_len) == 0 &&
rt_plen(a) == rt_plen(b))
return 1;
else
return 0;
}
int
rtflushclone1(struct rtentry *rt, void *arg, u_int id)
{
struct rtentry *cloningrt = arg;
struct ifnet *ifp;
if (!ISSET(rt->rt_flags, RTF_CLONED))
return 0;
/* Cached routes must stay alive as long as their parents are alive. */
if (ISSET(rt->rt_flags, RTF_CACHED) && (rt->rt_parent != cloningrt))
return 0;
if (!rtequal(rt->rt_parent, cloningrt))
return 0;
/*
* This happens when an interface with a RTF_CLONING route is
* being detached. In this case it's safe to bail because all
* the routes are being purged by rt_ifa_purge().
*/
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL)
return 0;
if_put(ifp);
return EEXIST;
}
int
rtflushclone(struct rtentry *parent, unsigned int rtableid)
{
struct rtentry *rt = NULL;
struct ifnet *ifp;
int error;
#ifdef DIAGNOSTIC
if (!parent || (parent->rt_flags & RTF_CLONING) == 0)
panic("rtflushclone: called with a non-cloning route");
#endif
do {
error = rtable_walk(rtableid, rt_key(parent)->sa_family, &rt,
rtflushclone1, parent);
if (rt != NULL && error == EEXIST) {
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL) {
error = EAGAIN;
} else {
error = rtdeletemsg(rt, ifp, rtableid);
if (error == 0)
error = EAGAIN;
if_put(ifp);
}
}
rtfree(rt);
rt = NULL;
} while (error == EAGAIN);
return error;
}
int
rtrequest_delete(struct rt_addrinfo *info, u_int8_t prio, struct ifnet *ifp,
struct rtentry **ret_nrt, u_int tableid)
{
struct rtentry *rt;
int error;
NET_ASSERT_LOCKED();
if (!rtable_exists(tableid))
return (EAFNOSUPPORT);
rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
info->rti_info[RTAX_NETMASK], info->rti_info[RTAX_GATEWAY], prio);
if (rt == NULL)
return (ESRCH);
/* Make sure that's the route the caller wants to delete. */
if (ifp != NULL && ifp->if_index != rt->rt_ifidx) {
rtfree(rt);
return (ESRCH);
}
#ifdef BFD
if (ISSET(rt->rt_flags, RTF_BFD))
bfdclear(rt);
#endif
error = rtable_delete(tableid, info->rti_info[RTAX_DST],
info->rti_info[RTAX_NETMASK], rt);
if (error != 0) {
rtfree(rt);
return (ESRCH);
}
/* Release next hop cache before flushing cloned entries. */
rt_putgwroute(rt);
/* Clean up any cloned children. */
if (ISSET(rt->rt_flags, RTF_CLONING))
rtflushclone(rt, tableid);
rtfree(rt->rt_parent);
rt->rt_parent = NULL;
rt->rt_flags &= ~RTF_UP;
KASSERT(ifp->if_index == rt->rt_ifidx);
ifp->if_rtrequest(ifp, RTM_DELETE, rt);
atomic_inc_int(&rttrash);
if (ret_nrt != NULL)
*ret_nrt = rt;
else
rtfree(rt);
return (0);
}
int
rtrequest(int req, struct rt_addrinfo *info, u_int8_t prio,
struct rtentry **ret_nrt, u_int tableid)
{
struct ifnet *ifp;
struct rtentry *rt, *crt;
struct ifaddr *ifa;
struct sockaddr *ndst;
struct sockaddr_rtlabel *sa_rl, sa_rl2;
struct sockaddr_dl sa_dl = { sizeof(sa_dl), AF_LINK };
int error;
NET_ASSERT_LOCKED();
if (!rtable_exists(tableid))
return (EAFNOSUPPORT);
if (info->rti_flags & RTF_HOST)
info->rti_info[RTAX_NETMASK] = NULL;
switch (req) {
case RTM_DELETE:
return (EINVAL);
case RTM_RESOLVE:
if (ret_nrt == NULL || (rt = *ret_nrt) == NULL)
return (EINVAL);
if ((rt->rt_flags & RTF_CLONING) == 0)
return (EINVAL);
KASSERT(rt->rt_ifa->ifa_ifp != NULL);
info->rti_ifa = rt->rt_ifa;
info->rti_flags = rt->rt_flags | (RTF_CLONED|RTF_HOST);
info->rti_flags &= ~(RTF_CLONING|RTF_CONNECTED|RTF_STATIC);
info->rti_info[RTAX_GATEWAY] = sdltosa(&sa_dl);
info->rti_info[RTAX_LABEL] =
rtlabel_id2sa(rt->rt_labelid, &sa_rl2);
/* FALLTHROUGH */
case RTM_ADD:
if (info->rti_ifa == NULL)
return (EINVAL);
ifa = info->rti_ifa;
ifp = ifa->ifa_ifp;
if (prio == 0)
prio = ifp->if_priority + RTP_STATIC;
error = rt_copysa(info->rti_info[RTAX_DST],
info->rti_info[RTAX_NETMASK], &ndst);
if (error)
return (error);
rt = pool_get(&rtentry_pool, PR_NOWAIT | PR_ZERO);
if (rt == NULL) {
free(ndst, M_RTABLE, ndst->sa_len);
return (ENOBUFS);
}
refcnt_init(&rt->rt_refcnt);
rt->rt_flags = info->rti_flags | RTF_UP;
rt->rt_priority = prio; /* init routing priority */
LIST_INIT(&rt->rt_timer);
/* Check the link state if the table supports it. */
if (rtable_mpath_capable(tableid, ndst->sa_family) &&
!ISSET(rt->rt_flags, RTF_LOCAL) &&
(!LINK_STATE_IS_UP(ifp->if_link_state) ||
!ISSET(ifp->if_flags, IFF_UP))) {
rt->rt_flags &= ~RTF_UP;
rt->rt_priority |= RTP_DOWN;
}
if (info->rti_info[RTAX_LABEL] != NULL) {
sa_rl = (struct sockaddr_rtlabel *)
info->rti_info[RTAX_LABEL];
rt->rt_labelid = rtlabel_name2id(sa_rl->sr_label);
}
#ifdef MPLS
/* We have to allocate additional space for MPLS infos */
if (info->rti_flags & RTF_MPLS &&
(info->rti_info[RTAX_SRC] != NULL ||
info->rti_info[RTAX_DST]->sa_family == AF_MPLS)) {
error = rt_mpls_set(rt, info->rti_info[RTAX_SRC],
info->rti_mpls);
if (error) {
free(ndst, M_RTABLE, ndst->sa_len);
pool_put(&rtentry_pool, rt);
return (error);
}
} else
rt_mpls_clear(rt);
#endif
rt->rt_ifa = ifaref(ifa);
rt->rt_ifidx = ifp->if_index;
/*
* Copy metrics and a back pointer from the cloned
* route's parent.
*/
if (ISSET(rt->rt_flags, RTF_CLONED)) {
rtref(*ret_nrt);
rt->rt_parent = *ret_nrt;
rt->rt_rmx = (*ret_nrt)->rt_rmx;
}
/*
* We must set rt->rt_gateway before adding ``rt'' to
* the routing table because the radix MPATH code uses
* it to (re)order routes.
*/
if ((error = rt_setgate(rt, info->rti_info[RTAX_GATEWAY],
tableid))) {
ifafree(ifa);
rtfree(rt->rt_parent);
rt_putgwroute(rt);
free(rt->rt_gateway, M_RTABLE,
ROUNDUP(rt->rt_gateway->sa_len));
free(ndst, M_RTABLE, ndst->sa_len);
pool_put(&rtentry_pool, rt);
return (error);
}
error = rtable_insert(tableid, ndst,
info->rti_info[RTAX_NETMASK], info->rti_info[RTAX_GATEWAY],
rt->rt_priority, rt);
if (error != 0 &&
(crt = rtable_match(tableid, ndst, NULL)) != NULL) {
/* overwrite cloned route */
if (ISSET(crt->rt_flags, RTF_CLONED) &&
!ISSET(crt->rt_flags, RTF_CACHED)) {
struct ifnet *cifp;
cifp = if_get(crt->rt_ifidx);
KASSERT(cifp != NULL);
rtdeletemsg(crt, cifp, tableid);
if_put(cifp);
error = rtable_insert(tableid, ndst,
info->rti_info[RTAX_NETMASK],
info->rti_info[RTAX_GATEWAY],
rt->rt_priority, rt);
}
rtfree(crt);
}
if (error != 0) {
ifafree(ifa);
rtfree(rt->rt_parent);
rt_putgwroute(rt);
free(rt->rt_gateway, M_RTABLE,
ROUNDUP(rt->rt_gateway->sa_len));
free(ndst, M_RTABLE, ndst->sa_len);
pool_put(&rtentry_pool, rt);
return (EEXIST);
}
ifp->if_rtrequest(ifp, req, rt);
if_group_routechange(info->rti_info[RTAX_DST],
info->rti_info[RTAX_NETMASK]);
if (ret_nrt != NULL)
*ret_nrt = rt;
else
rtfree(rt);
break;
}
return (0);
}
int
rt_setgate(struct rtentry *rt, struct sockaddr *gate, u_int rtableid)
{
int glen = ROUNDUP(gate->sa_len);
struct sockaddr *sa;
if (rt->rt_gateway == NULL || glen != ROUNDUP(rt->rt_gateway->sa_len)) {
sa = malloc(glen, M_RTABLE, M_NOWAIT);
if (sa == NULL)
return (ENOBUFS);
if (rt->rt_gateway != NULL) {
free(rt->rt_gateway, M_RTABLE,
ROUNDUP(rt->rt_gateway->sa_len));
}
rt->rt_gateway = sa;
}
memmove(rt->rt_gateway, gate, glen);
if (ISSET(rt->rt_flags, RTF_GATEWAY))
return (rt_setgwroute(rt, rtableid));
return (0);
}
/*
* Return the route entry containing the next hop link-layer
* address corresponding to ``rt''.
*/
struct rtentry *
rt_getll(struct rtentry *rt)
{
if (ISSET(rt->rt_flags, RTF_GATEWAY)) {
KASSERT(rt->rt_gwroute != NULL);
return (rt->rt_gwroute);
}
return (rt);
}
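/*
* Copy ``src'' into ``dst'', masked by ``netmask'': sa_len and sa_family
* are copied verbatim, the remaining bytes are ANDed with the mask and
* anything beyond the mask length is zeroed.
*/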
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst,
struct sockaddr *netmask)
{
u_char *cp1 = (u_char *)src;
u_char *cp2 = (u_char *)dst;
u_char *cp3 = (u_char *)netmask;
u_char *cplim = cp2 + *cp3;
u_char *cplim2 = cp2 + *cp1;
*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
cp3 += 2;
if (cplim > cplim2)
cplim = cplim2;
while (cp2 < cplim)
*cp2++ = *cp1++ & *cp3++;
if (cp2 < cplim2)
bzero(cp2, cplim2 - cp2);
}
/*
* Allocate a new sockaddr structure, based on the user-supplied src and
* mask, that is usable for the routing table.
*/
static int
rt_copysa(struct sockaddr *src, struct sockaddr *mask, struct sockaddr **dst)
{
static const u_char maskarray[] = {
0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };
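/*
* maskarray[n] keeps the top n bits of the final, partial byte of a
* prefix; see the plen % 8 handling below.
*/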
struct sockaddr *ndst;
const struct domain *dp;
u_char *csrc, *cdst;
int i, plen;
for (i = 0; (dp = domains[i]) != NULL; i++) {
if (dp->dom_rtoffset == 0)
continue;
if (src->sa_family == dp->dom_family)
break;
}
if (dp == NULL)
return (EAFNOSUPPORT);
if (src->sa_len < dp->dom_sasize)
return (EINVAL);
plen = rtable_satoplen(src->sa_family, mask);
if (plen == -1)
return (EINVAL);
ndst = malloc(dp->dom_sasize, M_RTABLE, M_NOWAIT|M_ZERO);
if (ndst == NULL)
return (ENOBUFS);
ndst->sa_family = src->sa_family;
ndst->sa_len = dp->dom_sasize;
csrc = (u_char *)src + dp->dom_rtoffset;
cdst = (u_char *)ndst + dp->dom_rtoffset;
memcpy(cdst, csrc, plen / 8);
if (plen % 8 != 0)
cdst[plen / 8] = csrc[plen / 8] & maskarray[plen % 8];
*dst = ndst;
return (0);
}
int
rt_ifa_add(struct ifaddr *ifa, int flags, struct sockaddr *dst,
unsigned int rdomain)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct rtentry *rt;
struct sockaddr_rtlabel sa_rl;
struct rt_addrinfo info;
uint8_t prio = ifp->if_priority + RTP_STATIC;
int error;
KASSERT(rdomain == rtable_l2(rdomain));
memset(&info, 0, sizeof(info));
info.rti_ifa = ifa;
info.rti_flags = flags;
info.rti_info[RTAX_DST] = dst;
if (flags & RTF_LLINFO)
info.rti_info[RTAX_GATEWAY] = sdltosa(ifp->if_sadl);
else
info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
info.rti_info[RTAX_LABEL] = rtlabel_id2sa(ifp->if_rtlabelid, &sa_rl);
#ifdef MPLS
if ((flags & RTF_MPLS) == RTF_MPLS)
info.rti_mpls = MPLS_OP_POP;
#endif /* MPLS */
if ((flags & RTF_HOST) == 0)
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
if (flags & (RTF_LOCAL|RTF_BROADCAST))
prio = RTP_LOCAL;
if (flags & RTF_CONNECTED)
prio = ifp->if_priority + RTP_CONNECTED;
error = rtrequest(RTM_ADD, &info, prio, &rt, rdomain);
if (error == 0) {
/*
* A local route is created for every address configured
* on an interface, so use this information to notify
* userland that a new address has been added.
*/
if (flags & RTF_LOCAL)
rtm_addr(RTM_NEWADDR, ifa);
rtm_send(rt, RTM_ADD, 0, rdomain);
rtfree(rt);
}
return (error);
}
int
rt_ifa_del(struct ifaddr *ifa, int flags, struct sockaddr *dst,
unsigned int rdomain)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct rtentry *rt;
struct mbuf *m = NULL;
struct sockaddr *deldst;
struct rt_addrinfo info;
struct sockaddr_rtlabel sa_rl;
uint8_t prio = ifp->if_priority + RTP_STATIC;
int error;
KASSERT(rdomain == rtable_l2(rdomain));
if ((flags & RTF_HOST) == 0 && ifa->ifa_netmask) {
m = m_get(M_DONTWAIT, MT_SONAME);
if (m == NULL)
return (ENOBUFS);
deldst = mtod(m, struct sockaddr *);
rt_maskedcopy(dst, deldst, ifa->ifa_netmask);
dst = deldst;
}
memset(&info, 0, sizeof(info));
info.rti_ifa = ifa;
info.rti_flags = flags;
info.rti_info[RTAX_DST] = dst;
if ((flags & RTF_LLINFO) == 0)
info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
info.rti_info[RTAX_LABEL] = rtlabel_id2sa(ifp->if_rtlabelid, &sa_rl);
if ((flags & RTF_HOST) == 0)
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
if (flags & (RTF_LOCAL|RTF_BROADCAST))
prio = RTP_LOCAL;
if (flags & RTF_CONNECTED)
prio = ifp->if_priority + RTP_CONNECTED;
rtable_clearsource(rdomain, ifa->ifa_addr);
error = rtrequest_delete(&info, prio, ifp, &rt, rdomain);
if (error == 0) {
rtm_send(rt, RTM_DELETE, 0, rdomain);
if (flags & RTF_LOCAL)
rtm_addr(RTM_DELADDR, ifa);
rtfree(rt);
}
m_free(m);
return (error);
}
/*
* Add ifa's address as a local rtentry.
*/
int
rt_ifa_addlocal(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct rtentry *rt;
u_int flags = RTF_HOST|RTF_LOCAL;
int error = 0;
/*
* If the configured address corresponds to the magical "any"
* address do not add a local route entry because that might
* corrupt the routing tree which uses this value for the
* default routes.
*/
switch (ifa->ifa_addr->sa_family) {
case AF_INET:
if (satosin(ifa->ifa_addr)->sin_addr.s_addr == INADDR_ANY)
return (0);
break;
#ifdef INET6
case AF_INET6:
if (IN6_ARE_ADDR_EQUAL(&satosin6(ifa->ifa_addr)->sin6_addr,
&in6addr_any))
return (0);
break;
#endif
default:
break;
}
if (!ISSET(ifp->if_flags, (IFF_LOOPBACK|IFF_POINTOPOINT)))
flags |= RTF_LLINFO;
/* If there is no local entry, allocate one. */
rt = rtalloc(ifa->ifa_addr, 0, ifp->if_rdomain);
if (rt == NULL || ISSET(rt->rt_flags, flags) != flags) {
error = rt_ifa_add(ifa, flags | RTF_MPATH, ifa->ifa_addr,
ifp->if_rdomain);
}
rtfree(rt);
return (error);
}
/*
* Remove local rtentry of ifa's address if it exists.
*/
int
rt_ifa_dellocal(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct rtentry *rt;
u_int flags = RTF_HOST|RTF_LOCAL;
int error = 0;
/*
* We do not add local routes for such addresses, so do not bother
* removing them.
*/
switch (ifa->ifa_addr->sa_family) {
case AF_INET:
if (satosin(ifa->ifa_addr)->sin_addr.s_addr == INADDR_ANY)
return (0);
break;
#ifdef INET6
case AF_INET6:
if (IN6_ARE_ADDR_EQUAL(&satosin6(ifa->ifa_addr)->sin6_addr,
&in6addr_any))
return (0);
break;
#endif
default:
break;
}
if (!ISSET(ifp->if_flags, (IFF_LOOPBACK|IFF_POINTOPOINT)))
flags |= RTF_LLINFO;
/*
* Before deleting, check that a corresponding local host
* route actually exists. This avoids deleting an interface
* direct route whose destination is the same as the address
* being removed, which can happen when removing a
* subnet-router anycast address on an interface attached
* to a shared medium.
*/
rt = rtalloc(ifa->ifa_addr, 0, ifp->if_rdomain);
if (rt != NULL && ISSET(rt->rt_flags, flags) == flags) {
error = rt_ifa_del(ifa, flags, ifa->ifa_addr,
ifp->if_rdomain);
}
rtfree(rt);
return (error);
}
/*
* Remove all routes attached to ``ifa''.
*/
void
rt_ifa_purge(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct rtentry *rt = NULL;
unsigned int rtableid;
int error, af = ifa->ifa_addr->sa_family;
KASSERT(ifp != NULL);
for (rtableid = 0; rtableid < rtmap_limit; rtableid++) {
/* skip rtables that are not in the rdomain of the ifp */
if (rtable_l2(rtableid) != ifp->if_rdomain)
continue;
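/*
 * rt_ifa_purge_walker() returns EEXIST for routes still using
 * this ifa; rtable_walk() then aborts and hands the route back,
 * so delete it here and restart the walk (EAGAIN) until a full
 * pass finds no more such routes.
 */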
do {
error = rtable_walk(rtableid, af, &rt,
rt_ifa_purge_walker, ifa);
if (rt != NULL && error == EEXIST) {
error = rtdeletemsg(rt, ifp, rtableid);
if (error == 0)
error = EAGAIN;
}
rtfree(rt);
rt = NULL;
} while (error == EAGAIN);
if (error == EAFNOSUPPORT)
error = 0;
if (error)
break;
}
}
int
rt_ifa_purge_walker(struct rtentry *rt, void *vifa, unsigned int rtableid)
{
struct ifaddr *ifa = vifa;
if (rt->rt_ifa == ifa)
return EEXIST;
return 0;
}
/*
* Route timer routines. These routines allow functions to be called
* for various routes at any time. This is useful in supporting
* path MTU discovery and redirect route deletion.
*
* This is similar to some BSDI internal functions, but it provides
* for multiple queues for efficiency's sake...
*/
struct mutex rttimer_mtx;
struct rttimer {
TAILQ_ENTRY(rttimer) rtt_next; /* [T] entry on timer queue */
LIST_ENTRY(rttimer) rtt_link; /* [T] timers per rtentry */
struct timeout rtt_timeout; /* [I] timeout for this entry */
struct rttimer_queue *rtt_queue; /* [I] back pointer to queue */
struct rtentry *rtt_rt; /* [T] back pointer to route */
time_t rtt_expire; /* [I] rt expire time */
u_int rtt_tableid; /* [I] rtable id of rtt_rt */
};
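/*
 * Run the action for an expired timer: call the queue's callback if one
 * was registered, otherwise fall back to deleting dynamic host routes
 * (e.g. ones created by ICMP redirects) on the route's interface.
 */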
#define RTTIMER_CALLOUT(r) { \
if (r->rtt_queue->rtq_func != NULL) { \
(*r->rtt_queue->rtq_func)(r->rtt_rt, r->rtt_tableid); \
} else { \
struct ifnet *ifp; \
\
ifp = if_get(r->rtt_rt->rt_ifidx); \
if (ifp != NULL && \
(r->rtt_rt->rt_flags & (RTF_DYNAMIC|RTF_HOST)) == \
(RTF_DYNAMIC|RTF_HOST)) \
rtdeletemsg(r->rtt_rt, ifp, r->rtt_tableid); \
if_put(ifp); \
} \
}
/*
* Some subtle order problems with domain initialization mean that
* we cannot count on this being run from rt_init before various
* protocol initializations are done. Therefore, we make sure
* that this is run when the first queue is added...
*/
void
rt_timer_init(void)
{
pool_init(&rttimer_pool, sizeof(struct rttimer), 0,
IPL_MPFLOOR, 0, "rttmr", NULL);
mtx_init(&rttimer_mtx, IPL_MPFLOOR);
}
void
rt_timer_queue_init(struct rttimer_queue *rtq, int timeout,
void (*func)(struct rtentry *, u_int))
{
rtq->rtq_timeout = timeout;
rtq->rtq_count = 0;
rtq->rtq_func = func;
TAILQ_INIT(&rtq->rtq_head);
}
void
rt_timer_queue_change(struct rttimer_queue *rtq, int timeout)
{
mtx_enter(&rttimer_mtx);
rtq->rtq_timeout = timeout;
mtx_leave(&rttimer_mtx);
}
void
rt_timer_queue_flush(struct rttimer_queue *rtq)
{
struct rttimer *r;
TAILQ_HEAD(, rttimer) rttlist;
NET_ASSERT_LOCKED();
TAILQ_INIT(&rttlist);
mtx_enter(&rttimer_mtx);
while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
LIST_REMOVE(r, rtt_link);
TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
KASSERT(rtq->rtq_count > 0);
rtq->rtq_count--;
}
mtx_leave(&rttimer_mtx);
while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
TAILQ_REMOVE(&rttlist, r, rtt_next);
RTTIMER_CALLOUT(r);
pool_put(&rttimer_pool, r);
}
}
unsigned long
rt_timer_queue_count(struct rttimer_queue *rtq)
{
return (rtq->rtq_count);
}
static inline struct rttimer *
rt_timer_unlink(struct rttimer *r)
{
MUTEX_ASSERT_LOCKED(&rttimer_mtx);
LIST_REMOVE(r, rtt_link);
r->rtt_rt = NULL;
if (timeout_del(&r->rtt_timeout) == 0) {
/* timeout fired, so rt_timer_timer will do the cleanup */
return NULL;
}
TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
KASSERT(r->rtt_queue->rtq_count > 0);
r->rtt_queue->rtq_count--;
return r;
}
void
rt_timer_remove_all(struct rtentry *rt)
{
struct rttimer *r;
TAILQ_HEAD(, rttimer) rttlist;
TAILQ_INIT(&rttlist);
mtx_enter(&rttimer_mtx);
while ((r = LIST_FIRST(&rt->rt_timer)) != NULL) {
r = rt_timer_unlink(r);
if (r != NULL)
TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
}
mtx_leave(&rttimer_mtx);
while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
TAILQ_REMOVE(&rttlist, r, rtt_next);
pool_put(&rttimer_pool, r);
}
}
time_t
rt_timer_get_expire(const struct rtentry *rt)
{
const struct rttimer *r;
time_t expire = 0;
mtx_enter(&rttimer_mtx);
LIST_FOREACH(r, &rt->rt_timer, rtt_link) {
if (expire == 0 || expire > r->rtt_expire)
expire = r->rtt_expire;
}
mtx_leave(&rttimer_mtx);
return expire;
}
int
rt_timer_add(struct rtentry *rt, struct rttimer_queue *queue, u_int rtableid)
{
struct rttimer *r, *rnew;
rnew = pool_get(&rttimer_pool, PR_NOWAIT | PR_ZERO);
if (rnew == NULL)
return (ENOBUFS);
rnew->rtt_rt = rt;
rnew->rtt_queue = queue;
rnew->rtt_tableid = rtableid;
rnew->rtt_expire = getuptime() + queue->rtq_timeout;
timeout_set_proc(&rnew->rtt_timeout, rt_timer_timer, rnew);
mtx_enter(&rttimer_mtx);
/*
* If there's already a timer with this action, destroy it before
* we add a new one.
*/
LIST_FOREACH(r, &rt->rt_timer, rtt_link) {
if (r->rtt_queue == queue) {
r = rt_timer_unlink(r);
break; /* only one per list, so we can quit... */
}
}
LIST_INSERT_HEAD(&rt->rt_timer, rnew, rtt_link);
TAILQ_INSERT_TAIL(&queue->rtq_head, rnew, rtt_next);
timeout_add_sec(&rnew->rtt_timeout, queue->rtq_timeout);
rnew->rtt_queue->rtq_count++;
mtx_leave(&rttimer_mtx);
if (r != NULL)
pool_put(&rttimer_pool, r);
return (0);
}
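/*
 * Timeout handler for a single rttimer: unlink it from its queue (and
 * from the route, if still attached), run the callout under the net
 * lock and free the entry.
 */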
void
rt_timer_timer(void *arg)
{
struct rttimer *r = arg;
struct rttimer_queue *rtq = r->rtt_queue;
NET_LOCK();
mtx_enter(&rttimer_mtx);
if (r->rtt_rt != NULL)
LIST_REMOVE(r, rtt_link);
TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
KASSERT(rtq->rtq_count > 0);
rtq->rtq_count--;
mtx_leave(&rttimer_mtx);
if (r->rtt_rt != NULL)
RTTIMER_CALLOUT(r);
NET_UNLOCK();
pool_put(&rttimer_pool, r);
}
#ifdef MPLS
int
rt_mpls_set(struct rtentry *rt, struct sockaddr *src, uint8_t op)
{
struct sockaddr_mpls *psa_mpls = (struct sockaddr_mpls *)src;
struct rt_mpls *rt_mpls;
if (psa_mpls == NULL && op != MPLS_OP_POP)
return (EOPNOTSUPP);
if (psa_mpls != NULL && psa_mpls->smpls_len != sizeof(*psa_mpls))
return (EINVAL);
if (psa_mpls != NULL && psa_mpls->smpls_family != AF_MPLS)
return (EAFNOSUPPORT);
rt->rt_llinfo = malloc(sizeof(struct rt_mpls), M_TEMP, M_NOWAIT|M_ZERO);
if (rt->rt_llinfo == NULL)
return (ENOMEM);
rt_mpls = (struct rt_mpls *)rt->rt_llinfo;
if (psa_mpls != NULL)
rt_mpls->mpls_label = psa_mpls->smpls_label;
rt_mpls->mpls_operation = op;
/* XXX: set experimental bits */
rt->rt_flags |= RTF_MPLS;
return (0);
}
void
rt_mpls_clear(struct rtentry *rt)
{
if (rt->rt_llinfo != NULL && rt->rt_flags & RTF_MPLS) {
free(rt->rt_llinfo, M_TEMP, sizeof(struct rt_mpls));
rt->rt_llinfo = NULL;
}
rt->rt_flags &= ~RTF_MPLS;
}
#endif
u_int16_t
rtlabel_name2id(char *name)
{
struct rt_label *label, *p;
u_int16_t new_id = 1;
if (!name[0])
return (0);
TAILQ_FOREACH(label, &rt_labels, rtl_entry)
if (strcmp(name, label->rtl_name) == 0) {
label->rtl_ref++;
return (label->rtl_id);
}
/*
* to avoid fragmentation, we do a linear search from the beginning
* and take the first free slot we find. if there is none or the list
* is empty, append a new entry at the end.
*/
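/*
 * Example: with labels holding ids 1, 2 and 4, the loop below stops at
 * the label with id 4 while new_id == 3, and the new label is inserted
 * before it.
 */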
TAILQ_FOREACH(p, &rt_labels, rtl_entry) {
if (p->rtl_id != new_id)
break;
new_id = p->rtl_id + 1;
}
if (new_id > LABELID_MAX)
return (0);
label = malloc(sizeof(*label), M_RTABLE, M_NOWAIT|M_ZERO);
if (label == NULL)
return (0);
strlcpy(label->rtl_name, name, sizeof(label->rtl_name));
label->rtl_id = new_id;
label->rtl_ref++;
if (p != NULL) /* insert new entry before p */
TAILQ_INSERT_BEFORE(p, label, rtl_entry);
else /* either list empty or no free slot in between */
TAILQ_INSERT_TAIL(&rt_labels, label, rtl_entry);
return (label->rtl_id);
}
const char *
rtlabel_id2name(u_int16_t id)
{
struct rt_label *label;
TAILQ_FOREACH(label, &rt_labels, rtl_entry)
if (label->rtl_id == id)
return (label->rtl_name);
return (NULL);
}
struct sockaddr *
rtlabel_id2sa(u_int16_t labelid, struct sockaddr_rtlabel *sa_rl)
{
const char *label;
if (labelid == 0 || (label = rtlabel_id2name(labelid)) == NULL)
return (NULL);
bzero(sa_rl, sizeof(*sa_rl));
sa_rl->sr_len = sizeof(*sa_rl);
sa_rl->sr_family = AF_UNSPEC;
strlcpy(sa_rl->sr_label, label, sizeof(sa_rl->sr_label));
return ((struct sockaddr *)sa_rl);
}
void
rtlabel_unref(u_int16_t id)
{
struct rt_label *p, *next;
if (id == 0)
return;
TAILQ_FOREACH_SAFE(p, &rt_labels, rtl_entry, next) {
if (id == p->rtl_id) {
if (--p->rtl_ref == 0) {
TAILQ_REMOVE(&rt_labels, p, rtl_entry);
free(p, M_RTABLE, sizeof(*p));
}
break;
}
}
}
int
rt_if_track(struct ifnet *ifp)
{
unsigned int rtableid;
struct rtentry *rt = NULL;
int i, error = 0;
for (rtableid = 0; rtableid < rtmap_limit; rtableid++) {
/* skip rtables that are not in the rdomain of the ifp */
if (rtable_l2(rtableid) != ifp->if_rdomain)
continue;
for (i = 1; i <= AF_MAX; i++) {
if (!rtable_mpath_capable(rtableid, i))
continue;
do {
error = rtable_walk(rtableid, i, &rt,
rt_if_linkstate_change, ifp);
if (rt != NULL && error == EEXIST) {
error = rtdeletemsg(rt, ifp, rtableid);
if (error == 0)
error = EAGAIN;
}
rtfree(rt);
rt = NULL;
} while (error == EAGAIN);
if (error == EAFNOSUPPORT)
error = 0;
if (error)
break;
}
}
return (error);
}
int
rt_if_linkstate_change(struct rtentry *rt, void *arg, u_int id)
{
struct ifnet *ifp = arg;
struct sockaddr_in6 sa_mask;
int error;
if (rt->rt_ifidx != ifp->if_index)
return (0);
/* Local routes are always usable. */
if (rt->rt_flags & RTF_LOCAL) {
rt->rt_flags |= RTF_UP;
return (0);
}
if (LINK_STATE_IS_UP(ifp->if_link_state) && ifp->if_flags & IFF_UP) {
if (ISSET(rt->rt_flags, RTF_UP))
return (0);
/* bring route up */
rt->rt_flags |= RTF_UP;
error = rtable_mpath_reprio(id, rt_key(rt), rt_plen(rt),
rt->rt_priority & RTP_MASK, rt);
} else {
/*
* Remove redirected and cloned routes (mainly ARP)
* from down interfaces so we have a chance to get
* new routes from a better source.
*/
if (ISSET(rt->rt_flags, RTF_CLONED|RTF_DYNAMIC) &&
!ISSET(rt->rt_flags, RTF_CACHED|RTF_BFD)) {
return (EEXIST);
}
if (!ISSET(rt->rt_flags, RTF_UP))
return (0);
/* take route down */
rt->rt_flags &= ~RTF_UP;
error = rtable_mpath_reprio(id, rt_key(rt), rt_plen(rt),
rt->rt_priority | RTP_DOWN, rt);
}
if_group_routechange(rt_key(rt), rt_plen2mask(rt, &sa_mask));
return (error);
}
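/*
 * Build a sockaddr mask of ``plen'' bits for the given address family
 * in the caller-supplied storage. Returns NULL for an unsupported
 * family or an invalid prefix length (-1).
 */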
struct sockaddr *
rt_plentosa(sa_family_t af, int plen, struct sockaddr_in6 *sa_mask)
{
struct sockaddr_in *sin = (struct sockaddr_in *)sa_mask;
#ifdef INET6
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa_mask;
#endif
KASSERT(plen >= 0 || plen == -1);
if (plen == -1)
return (NULL);
memset(sa_mask, 0, sizeof(*sa_mask));
switch (af) {
case AF_INET:
sin->sin_family = AF_INET;
sin->sin_len = sizeof(struct sockaddr_in);
in_prefixlen2mask(&sin->sin_addr, plen);
break;
#ifdef INET6
case AF_INET6:
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(struct sockaddr_in6);
in6_prefixlen2mask(&sin6->sin6_addr, plen);
break;
#endif /* INET6 */
default:
return (NULL);
}
return ((struct sockaddr *)sa_mask);
}
struct sockaddr *
rt_plen2mask(struct rtentry *rt, struct sockaddr_in6 *sa_mask)
{
return (rt_plentosa(rt_key(rt)->sa_family, rt_plen(rt), sa_mask));
}
#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_output.h>
void db_print_sa(struct sockaddr *);
void db_print_ifa(struct ifaddr *);
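/*
 * Print a sockaddr as its raw bytes for ddb, e.g. (hypothetical values)
 * [16,2,0,80,10,0,0,1,0,0,0,0,0,0,0,0] for an IPv4 sockaddr_in with
 * port 80 and address 10.0.0.1.
 */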
void
db_print_sa(struct sockaddr *sa)
{
int len;
u_char *p;
if (sa == NULL) {
db_printf("[NULL]");
return;
}
p = (u_char *)sa;
len = sa->sa_len;
db_printf("[");
while (len > 0) {
db_printf("%d", *p);
p++;
len--;
if (len)
db_printf(",");
}
db_printf("]\n");
}
void
db_print_ifa(struct ifaddr *ifa)
{
if (ifa == NULL)
return;
db_printf(" ifa_addr=");
db_print_sa(ifa->ifa_addr);
db_printf(" ifa_dsta=");
db_print_sa(ifa->ifa_dstaddr);
db_printf(" ifa_mask=");
db_print_sa(ifa->ifa_netmask);
db_printf(" flags=0x%x, refcnt=%u, metric=%d\n",
ifa->ifa_flags, ifa->ifa_refcnt.r_refs, ifa->ifa_metric);
}
/*
* Function to pass to rtable_walk().
* Return non-zero error to abort walk.
*/
int
db_show_rtentry(struct rtentry *rt, void *w, unsigned int id)
{
db_printf("rtentry=%p", rt);
db_printf(" flags=0x%x refcnt=%u use=%llu expire=%lld\n",
rt->rt_flags, rt->rt_refcnt.r_refs, rt->rt_use, rt->rt_expire);
db_printf(" key="); db_print_sa(rt_key(rt));
db_printf(" plen=%d", rt_plen(rt));
db_printf(" gw="); db_print_sa(rt->rt_gateway);
db_printf(" ifidx=%u ", rt->rt_ifidx);
db_printf(" ifa=%p\n", rt->rt_ifa);
db_print_ifa(rt->rt_ifa);
db_printf(" gwroute=%p llinfo=%p priority=%d\n",
rt->rt_gwroute, rt->rt_llinfo, rt->rt_priority);
return (0);
}
/*
* Function to print all the route trees.
*/
int
db_show_rtable(int af, unsigned int rtableid)
{
db_printf("Route tree for af %d, rtableid %u\n", af, rtableid);
rtable_walk(rtableid, af, NULL, db_show_rtentry, NULL);
return (0);
}
#endif /* DDB */
/* $OpenBSD: ip_id.c,v 1.25 2021/03/10 10:21:48 jsg Exp $ */
/*
* Copyright (c) 2008 Theo de Raadt, Ryan McBride
*
* Slightly different algorithm from the one designed by
* Matthew Dillon <dillon@backplane.com> for The DragonFly Project
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Random ip sequence number generator. Use the system PRNG to shuffle
* the 65536 entry ID space. We reshuffle the ID we pick out of the array
* into the previous 32767 cells, providing a guarantee that an ID will not
* be reused for at least 32768 calls.
*/
#include <sys/param.h>
#include <sys/systm.h>
static u_int16_t ip_shuffle[65536];
static int isindex = 0;
u_int16_t ip_randomid(void);
/*
* Return a random IP id. Shuffle the new value we get into the previous half
* of the ip_shuffle ring (-32767 or swap with ourself), to avoid duplicates
* occurring too quickly while still remaining random.
*
* 0 is a special IP ID -- don't return it.
*/
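/*
 * Sketch of the shuffle step below: each call swaps the chosen ID back
 * into slot i2 = (isindex - (si & 0x7FFF)) & 0xFFFF, i.e. up to 32767
 * positions behind the current index, so the running index must sweep
 * most of the way around the ring before that ID can be picked again.
 */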
u_int16_t
ip_randomid(void)
{
static int ipid_initialized;
u_int16_t si, r;
int i, i2;
if (!ipid_initialized) {
ipid_initialized = 1;
/*
* Initialize with a random permutation. Do so using Knuth
* which avoids the exchange in the Durstenfeld shuffle.
* (See "The Art of Computer Programming, Vol 2" 3rd ed, pg. 145).
*
* Even if our PRNG is imperfect at boot time, we have deferred
* doing this until the first packet being sent and now must
* generate an ID.
*/
for (i = 0; i < nitems(ip_shuffle); ++i) {
i2 = arc4random_uniform(i + 1);
ip_shuffle[i] = ip_shuffle[i2];
ip_shuffle[i2] = i;
}
}
do {
arc4random_buf(&si, sizeof(si));
i = isindex & 0xFFFF;
i2 = (isindex - (si & 0x7FFF)) & 0xFFFF;
r = ip_shuffle[i];
ip_shuffle[i] = ip_shuffle[i2];
ip_shuffle[i2] = r;
isindex++;
} while (r == 0);
return (r);
}
/* $OpenBSD: machdep.c,v 1.280 2022/08/25 17:25:25 cheloha Exp $ */
/* $NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/exec.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/conf.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/extent.h>
#include <sys/core.h>
#include <sys/kcore.h>
#include <sys/syscallargs.h>
#include <dev/cons.h>
#include <stand/boot/bootarg.h>
#include <net/if.h>
#include <uvm/uvm_extern.h>
#include <sys/sysctl.h>
#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/fpu.h>
#include <machine/biosvar.h>
#include <machine/mpbiosvar.h>
#include <machine/kcore.h>
#include <machine/tss.h>
#include <dev/isa/isareg.h>
#include <dev/ic/i8042reg.h>
#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
extern int db_console;
#endif
#include "isa.h"
#include "isadma.h"
#include "ksyms.h"
#include "acpi.h"
#if NACPI > 0
#include <dev/acpi/acpivar.h>
#endif
#include "com.h"
#if NCOM > 0
#include <sys/tty.h>
#include <dev/ic/comvar.h>
#include <dev/ic/comreg.h>
#endif
#include "softraid.h"
#if NSOFTRAID > 0
#include <dev/softraidvar.h>
#endif
#ifdef HIBERNATE
#include <machine/hibernate_var.h>
#endif /* HIBERNATE */
#include "ukbd.h"
#include "pckbc.h"
#if NPCKBC > 0 && NUKBD > 0
#include <dev/ic/pckbcvar.h>
#endif
/* #define MACHDEP_DEBUG */
#ifdef MACHDEP_DEBUG
#define DPRINTF(x...) do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif /* MACHDEP_DEBUG */
/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;
/*
* switchto vectors
*/
void (*cpu_idle_cycle_fcn)(void) = NULL;
/* the following is used externally for concurrent handlers */
int setperf_prio = 0;
#ifdef CPURESET_DELAY
int cpureset_delay = CPURESET_DELAY;
#else
int cpureset_delay = 0;
#endif
int physmem;
u_int64_t dumpmem_low;
u_int64_t dumpmem_high;
extern int boothowto;
int cpu_class;
paddr_t dumpmem_paddr;
vaddr_t dumpmem_vaddr;
psize_t dumpmem_sz;
vaddr_t kern_end;
vaddr_t msgbuf_vaddr;
paddr_t msgbuf_paddr;
vaddr_t idt_vaddr;
paddr_t idt_paddr;
vaddr_t lo32_vaddr;
paddr_t lo32_paddr;
paddr_t tramp_pdirpa;
int kbd_reset;
int lid_action = 1;
int pwr_action = 1;
int forceukbd;
/*
* safepri is a safe priority for sleep to set for a spin-wait
* during autoconfiguration or after a panic.
*/
int safepri = 0;
struct vm_map *exec_map = NULL;
struct vm_map *phys_map = NULL;
/* UVM constraint ranges. */
struct uvm_constraint_range isa_constraint = { 0x0, 0x00ffffffUL };
struct uvm_constraint_range dma_constraint = { 0x0, 0xffffffffUL };
struct uvm_constraint_range *uvm_md_constraints[] = {
&isa_constraint,
&dma_constraint,
NULL,
};
paddr_t avail_start;
paddr_t avail_end;
void (*delay_func)(int) = i8254_delay;
void (*initclock_func)(void) = i8254_initclocks;
/*
* Format of boot information passed to us by 32-bit /boot
*/
typedef struct _boot_args32 {
int ba_type;
int ba_size;
int ba_nextX; /* a ptr in 32-bit world, but not here */
char ba_arg[1];
} bootarg32_t;
#define BOOTARGC_MAX NBPG /* one page */
bios_bootmac_t *bios_bootmac;
/* locore copies the arguments from /boot to here for us */
char bootinfo[BOOTARGC_MAX];
int bootinfo_size = BOOTARGC_MAX;
void getbootinfo(char *, int);
/* Data passed to us by /boot, filled in by getbootinfo() */
bios_diskinfo_t *bios_diskinfo;
bios_memmap_t *bios_memmap;
u_int32_t bios_cksumlen;
bios_efiinfo_t *bios_efiinfo;
bios_ucode_t *bios_ucode;
/*
* Size of memory segments, before any memory is stolen.
*/
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int mem_cluster_cnt;
int cpu_dump(void);
int cpu_dumpsize(void);
u_long cpu_dump_mempagecnt(void);
void dumpsys(void);
void cpu_init_extents(void);
void map_tramps(void);
void init_x86_64(paddr_t);
void (*cpuresetfn)(void);
void enter_shared_special_pages(void);
#ifdef APERTURE
int allowaperture = 0;
#endif
/*
* Machine-dependent startup code
*/
void
cpu_startup(void)
{
vaddr_t minaddr, maxaddr;
msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));
printf("%s", version);
startclocks();
rtcinit();
printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
ptoa((psize_t)physmem)/1024/1024);
/*
* Allocate a submap for exec arguments. This map effectively
* limits the number of processes exec'ing at any time.
*/
minaddr = vm_map_min(kernel_map);
exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);
/*
* Allocate a submap for physio
*/
minaddr = vm_map_min(kernel_map);
phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
VM_PHYS_SIZE, 0, FALSE, NULL);
printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
ptoa((psize_t)uvmexp.free)/1024/1024);
bufinit();
if (boothowto & RB_CONFIG) {
#ifdef BOOT_CONFIG
user_config();
#else
printf("kernel does not support -c; continuing..\n");
#endif
}
/* Safe for i/o port / memory space allocation to use malloc now. */
x86_bus_space_mallocok();
#ifndef SMALL_KERNEL
cpu_ucode_setup();
cpu_ucode_apply(&cpu_info_primary);
#endif
cpu_tsx_disable(&cpu_info_primary);
/* enter the IDT and trampoline code in the u-k maps */
enter_shared_special_pages();
/* initialize CPU0's TSS and GDT and put them in the u-k maps */
cpu_enter_pages(&cpu_info_full_primary);
}
/*
* enter_shared_special_pages
*
* Requests mapping of various special pages required in the Intel Meltdown
* case (to be entered into the U-K page table):
*
* 1 IDT page
* Various number of pages covering the U-K ".kutext" section. This section
* contains code needed during trampoline operation
* Various number of pages covering the U-K ".kudata" section. This section
* contains data accessed by the trampoline, before switching to U+K
* (for example, various shared global variables used by IPIs, etc)
*
* The linker script places the required symbols in the sections above.
*
* On CPUs not affected by Meltdown, the calls to pmap_enter_special below
* become no-ops.
*/
void
enter_shared_special_pages(void)
{
extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
extern char __text_page_start[], __text_page_end[];
extern char __kernel_kutext_page_phys[];
extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
vaddr_t va;
paddr_t pa;
/* idt */
pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
(uint64_t)idt_vaddr, (uint64_t)idt_paddr);
/* .kutext section */
va = (vaddr_t)__kutext_start;
pa = (paddr_t)__kernel_kutext_phys;
while (va < (vaddr_t)__kutext_end) {
pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
__func__, (uint64_t)va, (uint64_t)pa);
va += PAGE_SIZE;
pa += PAGE_SIZE;
}
/* .kutext.page section */
va = (vaddr_t)__text_page_start;
pa = (paddr_t)__kernel_kutext_page_phys;
while (va < (vaddr_t)__text_page_end) {
pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
__func__, (uint64_t)va, (uint64_t)pa);
va += PAGE_SIZE;
pa += PAGE_SIZE;
}
/* .kudata section */
va = (vaddr_t)__kudata_start;
pa = (paddr_t)__kernel_kudata_phys;
while (va < (vaddr_t)__kudata_end) {
pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
__func__, (uint64_t)va, (uint64_t)pa);
va += PAGE_SIZE;
pa += PAGE_SIZE;
}
}
/*
* Set up proc0's PCB and the cpu's TSS.
*/
void
x86_64_proc0_tss_ldt_init(void)
{
struct pcb *pcb;
cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
pcb->pcb_fsbase = 0;
pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;
ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
lldt(0);
}
bios_diskinfo_t *
bios_getdiskinfo(dev_t dev)
{
bios_diskinfo_t *pdi;
if (bios_diskinfo == NULL)
return NULL;
for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
if (pdi->bsd_dev == dev)
break;
} else {
if (pdi->bios_number == dev)
break;
}
}
if (pdi->bios_number == -1)
return NULL;
else
return pdi;
}
int
bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
bios_diskinfo_t *pdi;
extern dev_t bootdev;
int biosdev;
/* all sysctl names at this level except diskinfo are terminal */
if (namelen != 1 && name[0] != BIOS_DISKINFO)
return (ENOTDIR); /* overloaded */
if (!(bootapiver & BAPIV_VECTOR))
return EOPNOTSUPP;
switch (name[0]) {
case BIOS_DEV:
if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
return ENXIO;
biosdev = pdi->bios_number;
return sysctl_rdint(oldp, oldlenp, newp, biosdev);
case BIOS_DISKINFO:
if (namelen != 2)
return ENOTDIR;
if ((pdi = bios_getdiskinfo(name[1])) == NULL)
return ENXIO;
return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
case BIOS_CKSUMLEN:
return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
default:
return EOPNOTSUPP;
}
/* NOTREACHED */
}
extern int tsc_is_invariant;
extern int amd64_has_xcrypt;
const struct sysctl_bounded_args cpuctl_vars[] = {
{ CPU_LIDACTION, &lid_action, 0, 2 },
{ CPU_PWRACTION, &pwr_action, 0, 2 },
{ CPU_CPUID, &cpu_id, SYSCTL_INT_READONLY },
{ CPU_CPUFEATURE, &cpu_feature, SYSCTL_INT_READONLY },
{ CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
{ CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
};
/*
* machine dependent system variables.
*/
int
cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
extern uint64_t tsc_frequency;
dev_t consdev;
dev_t dev;
switch (name[0]) {
case CPU_CONSDEV:
if (namelen != 1)
return (ENOTDIR); /* overloaded */
if (cn_tab != NULL)
consdev = cn_tab->cn_dev;
else
consdev = NODEV;
return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev,
sizeof consdev));
case CPU_CHR2BLK:
if (namelen != 2)
return (ENOTDIR); /* overloaded */
dev = chrtoblk((dev_t)name[1]);
return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev));
case CPU_BIOS:
return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp,
newp, newlen, p);
case CPU_CPUVENDOR:
return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor));
case CPU_KBDRESET:
return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
&kbd_reset));
case CPU_ALLOWAPERTURE:
if (namelen != 1)
return (ENOTDIR); /* overloaded */
#ifdef APERTURE
if (securelevel > 0)
return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
&allowaperture));
else
return (sysctl_int(oldp, oldlenp, newp, newlen,
&allowaperture));
#else
return (sysctl_rdint(oldp, oldlenp, newp, 0));
#endif
#if NPCKBC > 0 && NUKBD > 0
case CPU_FORCEUKBD:
{
int error;
if (forceukbd)
return (sysctl_rdint(oldp, oldlenp, newp, forceukbd));
error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd);
if (forceukbd)
pckbc_release_console();
return (error);
}
#endif
case CPU_TSCFREQ:
return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
default:
return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
name, namelen, oldp, oldlenp, newp, newlen));
}
/* NOTREACHED */
}
/*
* Send an interrupt to process.
*
* Stack is set up to allow sigcode to call routine, followed by
* syscall to sigreturn routine below. After sigreturn resets the
* signal mask, the stack, and the frame pointer, it returns to the
* user specified pc.
*/
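/*
 * Rough layout of the frame built on the user stack below, from high to
 * low addresses: the FPU save area (sc_fpstate points at it), the
 * siginfo when ``info'' is set, then the sigcontext itself. %rsp is
 * left at the sigcontext, and %rdi/%rsi/%rdx carry the signal number,
 * siginfo address and sigcontext address to the handler trampoline.
 */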
int
sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip,
int info, int onstack)
{
struct proc *p = curproc;
struct trapframe *tf = p->p_md.md_regs;
struct sigcontext ksc;
struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
register_t sp, scp, sip;
u_long sss;
memset(&ksc, 0, sizeof ksc);
ksc.sc_rdi = tf->tf_rdi;
ksc.sc_rsi = tf->tf_rsi;
ksc.sc_rdx = tf->tf_rdx;
ksc.sc_rcx = tf->tf_rcx;
ksc.sc_r8 = tf->tf_r8;
ksc.sc_r9 = tf->tf_r9;
ksc.sc_r10 = tf->tf_r10;
ksc.sc_r11 = tf->tf_r11;
ksc.sc_r12 = tf->tf_r12;
ksc.sc_r13 = tf->tf_r13;
ksc.sc_r14 = tf->tf_r14;
ksc.sc_r15 = tf->tf_r15;
ksc.sc_rbx = tf->tf_rbx;
ksc.sc_rax = tf->tf_rax;
ksc.sc_rbp = tf->tf_rbp;
ksc.sc_rip = tf->tf_rip;
ksc.sc_cs = tf->tf_cs;
ksc.sc_rflags = tf->tf_rflags;
ksc.sc_rsp = tf->tf_rsp;
ksc.sc_ss = tf->tf_ss;
ksc.sc_mask = mask;
/* Allocate space for the signal handler context. */
if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 &&
!sigonstack(tf->tf_rsp) && onstack)
sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size);
else
sp = tf->tf_rsp - 128;
sp &= ~15ULL; /* just in case */
sss = (sizeof(ksc) + 15) & ~15;
/* Save FPU state to PCB if necessary, then copy it out */
if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
curcpu()->ci_pflags &= ~CPUPF_USERXSTATE;
fpusavereset(&p->p_addr->u_pcb.pcb_savefpu);
}
sp -= fpu_save_len;
ksc.sc_fpstate = (struct fxsave64 *)sp;
if (copyout(sfp, (void *)sp, fpu_save_len))
return 1;
/* Now reset the FPU state in PCB */
memcpy(&p->p_addr->u_pcb.pcb_savefpu,
&proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len);
sip = 0;
if (info) {
sip = sp - ((sizeof(*ksip) + 15) & ~15);
sss += (sizeof(*ksip) + 15) & ~15;
if (copyout(ksip, (void *)sip, sizeof(*ksip)))
return 1;
}
scp = sp - sss;
ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
if (copyout(&ksc, (void *)scp, sizeof(ksc)))
return 1;
/*
* Build context to run handler in.
*/
tf->tf_rax = (u_int64_t)catcher;
tf->tf_rdi = sig;
tf->tf_rsi = sip;
tf->tf_rdx = scp;
tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode;
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC);
tf->tf_rsp = scp;
tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
/* The reset state _is_ the userspace state for this thread now */
curcpu()->ci_pflags |= CPUPF_USERXSTATE;
return 0;
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* psl to gain improper privileges or to cause
* a machine fault.
*/
int
sys_sigreturn(struct proc *p, void *v, register_t *retval)
{
struct sys_sigreturn_args /* {
syscallarg(struct sigcontext *) sigcntxp;
} */ *uap = v;
struct sigcontext ksc, *scp = SCARG(uap, sigcntxp);
struct trapframe *tf = p->p_md.md_regs;
int error;
if (PROC_PC(p) != p->p_p->ps_sigcoderet) {
sigexit(p, SIGILL);
return (EPERM);
}
if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc)))
return (error);
if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) {
sigexit(p, SIGILL);
return (EFAULT);
}
/* Prevent reuse of the sigcontext cookie */
ksc.sc_cookie = 0;
(void)copyout(&ksc.sc_cookie, (caddr_t)scp +
offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie));
if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 ||
!USERMODE(ksc.sc_cs, ksc.sc_eflags))
return (EINVAL);
/* Current state is obsolete; toss it and force a reload */
if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
curcpu()->ci_pflags &= ~CPUPF_USERXSTATE;
fpureset();
}
/* Copy in the FPU state to restore */
if (__predict_true(ksc.sc_fpstate != NULL)) {
struct fxsave64 *fx = &p->p_addr->u_pcb.pcb_savefpu.fp_fxsave;
if ((error = copyin(ksc.sc_fpstate, fx, fpu_save_len)))
return (error);
fx->fx_mxcsr &= fpu_mxcsr_mask;
} else {
/* shouldn't happen, but handle it */
memcpy(&p->p_addr->u_pcb.pcb_savefpu,
&proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len);
}
tf->tf_rdi = ksc.sc_rdi;
tf->tf_rsi = ksc.sc_rsi;
tf->tf_rdx = ksc.sc_rdx;
tf->tf_rcx = ksc.sc_rcx;
tf->tf_r8 = ksc.sc_r8;
tf->tf_r9 = ksc.sc_r9;
tf->tf_r10 = ksc.sc_r10;
tf->tf_r11 = ksc.sc_r11;
tf->tf_r12 = ksc.sc_r12;
tf->tf_r13 = ksc.sc_r13;
tf->tf_r14 = ksc.sc_r14;
tf->tf_r15 = ksc.sc_r15;
tf->tf_rbx = ksc.sc_rbx;
tf->tf_rax = ksc.sc_rax;
tf->tf_rbp = ksc.sc_rbp;
tf->tf_rip = ksc.sc_rip;
tf->tf_cs = ksc.sc_cs;
tf->tf_rflags = ksc.sc_rflags;
tf->tf_rsp = ksc.sc_rsp;
tf->tf_ss = ksc.sc_ss;
/* Restore signal mask. */
p->p_sigmask = ksc.sc_mask & ~sigcantmask;
/*
* sigreturn() needs to return to userspace via the 'iretq'
* method, so that if the process was interrupted (by tick,
* an IPI, whatever) as opposed to already being in the kernel
* when a signal was being delivered, the process will be
* completely restored, including the userland %rcx and %r11
* registers which the 'sysretq' instruction cannot restore.
* Also need to make sure we can handle faulting on xrstor.
*/
p->p_md.md_flags |= MDP_IRET;
return (EJUSTRETURN);
}
#ifdef MULTIPROCESSOR
/* force a CPU into the kernel, whether or not it's idle */
void
cpu_kick(struct cpu_info *ci)
{
/* only need to kick other CPUs */
if (ci != curcpu()) {
if (cpu_mwait_size > 0) {
/*
* If not idling, then send an IPI, else
* just clear the "keep idling" bit.
*/
if ((ci->ci_mwait & MWAIT_IN_IDLE) == 0)
x86_send_ipi(ci, X86_IPI_NOP);
else
atomic_clearbits_int(&ci->ci_mwait,
MWAIT_KEEP_IDLING);
} else {
/* no mwait, so need an IPI */
x86_send_ipi(ci, X86_IPI_NOP);
}
}
}
#endif
/*
* Notify the current process (p) that it has a signal pending,
* process as soon as possible.
*/
void
signotify(struct proc *p)
{
aston(p);
cpu_kick(p->p_cpu);
}
#ifdef MULTIPROCESSOR
void
cpu_unidle(struct cpu_info *ci)
{
if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY)) {
/*
* Just clear the "keep idling" bit; if it wasn't
* idling then we didn't need to do anything anyway.
*/
atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
return;
}
if (ci != curcpu())
x86_send_ipi(ci, X86_IPI_NOP);
}
#endif
int waittime = -1;
struct pcb dumppcb;
__dead void
boot(int howto)
{
if ((howto & RB_POWERDOWN) != 0)
lid_action = 0;
if ((howto & RB_RESET) != 0)
goto doreset;
if (cold) {
if ((howto & RB_USERREQ) == 0)
howto |= RB_HALT;
goto haltsys;
}
boothowto = howto;
if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
waittime = 0;
vfs_shutdown(curproc);
if ((howto & RB_TIMEBAD) == 0) {
resettodr();
} else {
printf("WARNING: not updating battery clock\n");
}
}
if_downall();
uvm_shutdown();
splhigh();
cold = 1;
if ((howto & RB_DUMP) != 0)
dumpsys();
haltsys:
config_suspend_all(DVACT_POWERDOWN);
#ifdef MULTIPROCESSOR
x86_broadcast_ipi(X86_IPI_HALT);
#endif
if ((howto & RB_HALT) != 0) {
#if NACPI > 0 && !defined(SMALL_KERNEL)
extern int acpi_enabled;
if (acpi_enabled) {
delay(500000);
if ((howto & RB_POWERDOWN) != 0)
acpi_powerdown();
}
#endif
printf("\n");
printf("The operating system has halted.\n");
printf("Please press any key to reboot.\n\n");
cnpollc(1); /* for proper keyboard command handling */
cngetc();
cnpollc(0);
}
doreset:
printf("rebooting...\n");
if (cpureset_delay > 0)
delay(cpureset_delay * 1000);
cpu_reset();
for (;;)
continue;
/* NOTREACHED */
}
/*
* These variables are needed by /sbin/savecore
*/
u_long dumpmag = 0x8fca0101; /* magic number */
int dumpsize = 0; /* pages */
long dumplo = 0; /* blocks */
/*
* cpu_dump: dump the machine-dependent kernel core dump headers.
*/
int
cpu_dump(void)
{
int (*dump)(dev_t, daddr_t, caddr_t, size_t);
char buf[dbtob(1)];
kcore_seg_t *segp;
cpu_kcore_hdr_t *cpuhdrp;
phys_ram_seg_t *memsegp;
caddr_t va;
int i;
dump = bdevsw[major(dumpdev)].d_dump;
memset(buf, 0, sizeof buf);
segp = (kcore_seg_t *)buf;
cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) +
ALIGN(sizeof(*cpuhdrp))];
/*
* Generate a segment header.
*/
CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));
/*
* Add the machine-dependent header info.
*/
cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3;
cpuhdrp->nmemsegs = mem_cluster_cnt;
/*
* Fill in the memory segment descriptors.
*/
for (i = 0; i < mem_cluster_cnt; i++) {
memsegp[i].start = mem_clusters[i].start;
memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK;
}
/*
* If we have dump bounce memory, assume the kernel stack may be in
* high memory and bounce the headers through it.
*/
if (dumpmem_vaddr != 0) {
memcpy((char *)dumpmem_vaddr, buf, sizeof(buf));
va = (caddr_t)dumpmem_vaddr;
} else {
va = (caddr_t)buf;
}
return (dump(dumpdev, dumplo, va, dbtob(1)));
}
/*
* This is called by main to set dumplo and dumpsize.
* Dumps always skip the first PAGE_SIZE of disk space
* in case there might be a disk label stored there.
* If there is extra space, put dump at the end to
* reduce the chance that swapping trashes it.
*/
void
dumpconf(void)
{
int nblks, dumpblks; /* size of dump area */
if (dumpdev == NODEV ||
(nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0)
return;
if (nblks <= ctod(1))
return;
dumpblks = cpu_dumpsize();
if (dumpblks < 0)
return;
dumpblks += ctod(cpu_dump_mempagecnt());
/* If dump won't fit (incl. room for possible label), punt. */
if (dumpblks > (nblks - ctod(1)))
return;
/* Put dump at end of partition */
dumplo = nblks - dumpblks;
/* dumpsize is in page units, and doesn't include headers. */
dumpsize = cpu_dump_mempagecnt();
}
/*
* Doadump comes here after turning off memory management and
* getting on the dump stack, either when called above, or by
* the auto-restart code.
*/
#define BYTES_PER_DUMP MAXPHYS /* must be a multiple of pagesize */
void
dumpsys(void)
{
u_long totalbytesleft, bytes, i, n, memseg;
u_long maddr;
daddr_t blkno;
void *va;
int (*dump)(dev_t, daddr_t, caddr_t, size_t);
int error;
/* Save registers. */
savectx(&dumppcb);
if (dumpdev == NODEV)
return;
/*
* For dumps during autoconfiguration,
* if dump device has already configured...
*/
if (dumpsize == 0)
dumpconf();
if (dumplo <= 0 || dumpsize == 0) {
printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
minor(dumpdev));
return;
}
printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
minor(dumpdev), dumplo);
error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev);
printf("dump ");
if (error == -1) {
printf("area unavailable\n");
return;
}
if ((error = cpu_dump()) != 0)
goto err;
totalbytesleft = ptoa(cpu_dump_mempagecnt());
blkno = dumplo + cpu_dumpsize();
dump = bdevsw[major(dumpdev)].d_dump;
error = 0;
for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
maddr = mem_clusters[memseg].start;
bytes = mem_clusters[memseg].size;
for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
/* Print out how many MBs we have left to go. */
if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP)
printf("%ld ", totalbytesleft / (1024 * 1024));
/* Limit size for next transfer. */
n = bytes - i;
if (n > BYTES_PER_DUMP)
n = BYTES_PER_DUMP;
if (maddr > 0xffffffff) {
va = (void *)dumpmem_vaddr;
if (n > dumpmem_sz)
n = dumpmem_sz;
memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n);
} else {
va = (void *)PMAP_DIRECT_MAP(maddr);
}
error = (*dump)(dumpdev, blkno, va, n);
if (error)
goto err;
maddr += n;
blkno += btodb(n); /* XXX? */
#if 0 /* XXX this doesn't work. grr. */
/* operator aborting dump? */
if (sget() != NULL) {
error = EINTR;
break;
}
#endif
}
}
err:
switch (error) {
case ENXIO:
printf("device bad\n");
break;
case EFAULT:
printf("device not ready\n");
break;
case EINVAL:
printf("area improper\n");
break;
case EIO:
printf("i/o error\n");
break;
case EINTR:
printf("aborted from console\n");
break;
case 0:
printf("succeeded\n");
break;
default:
printf("error %d\n", error);
break;
}
printf("\n\n");
delay(5000000); /* 5 seconds */
}
/*
* Force the userspace FS.base to be reloaded from the PCB on return from
* the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
* to their expected userspace value.
*/
void
reset_segs(void)
{
/*
* This operates like the cpu_switchto() sequence: if we
* haven't reset %[defg]s already, do so now.
*/
if (curcpu()->ci_pflags & CPUPF_USERSEGS) {
curcpu()->ci_pflags &= ~CPUPF_USERSEGS;
__asm volatile(
"movw %%ax,%%ds\n\t"
"movw %%ax,%%es\n\t"
"movw %%ax,%%fs\n\t"
"cli\n\t" /* block intr when on user GS.base */
"swapgs\n\t" /* swap from kernel to user GS.base */
"movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
"swapgs\n\t" /* back to kernel GS.base */
"sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
}
}
/*
* Clear registers on exec
*/
void
setregs(struct proc *p, struct exec_package *pack, u_long stack,
register_t *retval)
{
struct trapframe *tf;
/* Reset FPU state in PCB */
memcpy(&p->p_addr->u_pcb.pcb_savefpu,
&proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len);
if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
/* state in CPU is obsolete; reset it */
fpureset();
} else {
/* the reset state _is_ the userspace state now */
curcpu()->ci_pflags |= CPUPF_USERXSTATE;
}
/* To reset all registers we have to return via iretq */
p->p_md.md_flags |= MDP_IRET;
reset_segs();
p->p_addr->u_pcb.pcb_fsbase = 0;
tf = p->p_md.md_regs;
tf->tf_rdi = 0;
tf->tf_rsi = 0;
tf->tf_rbp = 0;
tf->tf_rbx = 0;
tf->tf_rdx = 0;
tf->tf_rcx = 0;
tf->tf_rax = 0;
tf->tf_r8 = 0;
tf->tf_r9 = 0;
tf->tf_r10 = 0;
tf->tf_r11 = 0;
tf->tf_r12 = 0;
tf->tf_r13 = 0;
tf->tf_r14 = 0;
tf->tf_r15 = 0;
tf->tf_rip = pack->ep_entry;
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
tf->tf_rflags = PSL_USERSET;
tf->tf_rsp = stack;
tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
retval[1] = 0;
}
/*
* Initialize segments and descriptor tables
*/
struct gate_descriptor *idt;
char idt_allocmap[NIDT];
extern struct user *proc0paddr;
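/*
 * Fill in a long mode gate descriptor: the 64-bit handler offset is
 * split across the low/high offset fields, together with the target
 * selector, IST index, gate type and DPL, and the present bit is set.
 */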
void
setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl,
int sel)
{
gd->gd_looffset = (u_int64_t)func & 0xffff;
gd->gd_selector = sel;
gd->gd_ist = ist;
gd->gd_type = type;
gd->gd_dpl = dpl;
gd->gd_p = 1;
gd->gd_hioffset = (u_int64_t)func >> 16;
gd->gd_zero = 0;
gd->gd_xx1 = 0;
gd->gd_xx2 = 0;
gd->gd_xx3 = 0;
}
void
unsetgate(struct gate_descriptor *gd)
{
memset(gd, 0, sizeof (*gd));
}
void
setregion(struct region_descriptor *rd, void *base, u_int16_t limit)
{
rd->rd_limit = limit;
rd->rd_base = (u_int64_t)base;
}
/*
* Note that the base and limit fields are ignored in long mode.
*/
void
set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
int type, int dpl, int gran, int def32, int is64)
{
sd->sd_lolimit = (unsigned)limit;
sd->sd_lobase = (unsigned long)base;
sd->sd_type = type;
sd->sd_dpl = dpl;
sd->sd_p = 1;
sd->sd_hilimit = (unsigned)limit >> 16;
sd->sd_avl = 0;
sd->sd_long = is64;
sd->sd_def32 = def32;
sd->sd_gran = gran;
sd->sd_hibase = (unsigned long)base >> 24;
}
void
set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
int type, int dpl, int gran)
{
memset(sd, 0, sizeof *sd);
sd->sd_lolimit = (unsigned)limit;
sd->sd_lobase = (u_int64_t)base;
sd->sd_type = type;
sd->sd_dpl = dpl;
sd->sd_p = 1;
sd->sd_hilimit = (unsigned)limit >> 16;
sd->sd_gran = gran;
sd->sd_hibase = (u_int64_t)base >> 24;
}
void
cpu_init_idt(void)
{
struct region_descriptor region;
setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
lidt(&region);
}
void
cpu_init_extents(void)
{
extern struct extent *iomem_ex;
static int already_done;
int i;
/* We get called for each CPU, only first should do this */
if (already_done)
return;
/*
* Allocate the physical addresses used by RAM from the iomem
* extent map.
*/
for (i = 0; i < mem_cluster_cnt; i++) {
if (extent_alloc_region(iomem_ex, mem_clusters[i].start,
mem_clusters[i].size, EX_NOWAIT)) {
/* XXX What should we do? */
printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)"
" FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start,
mem_clusters[i].start + mem_clusters[i].size - 1);
}
}
already_done = 1;
}
void
map_tramps(void)
{
#if defined(MULTIPROCESSOR) || \
(NACPI > 0 && !defined(SMALL_KERNEL))
struct pmap *kmp = pmap_kernel();
extern paddr_t tramp_pdirpa;
#ifdef MULTIPROCESSOR
extern u_char cpu_spinup_trampoline[];
extern u_char cpu_spinup_trampoline_end[];
extern u_char mp_tramp_data_start[];
extern u_char mp_tramp_data_end[];
extern u_int32_t mp_pdirpa;
#endif
/*
* The initial PML4 pointer must be below 4G, so if the
* current one isn't, use a "bounce buffer" and save it
* for tramps to use.
*/
if (kmp->pm_pdirpa > 0xffffffff) {
pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE);
memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE);
tramp_pdirpa = lo32_paddr;
pmap_kremove(lo32_vaddr, PAGE_SIZE);
} else
tramp_pdirpa = kmp->pm_pdirpa;
#ifdef MULTIPROCESSOR
/* Map MP tramp code and data pages RW for copy */
pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE,
PROT_READ | PROT_WRITE);
pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA,
PROT_READ | PROT_WRITE);
memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE);
memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE);
memcpy((caddr_t)MP_TRAMPOLINE,
cpu_spinup_trampoline,
cpu_spinup_trampoline_end-cpu_spinup_trampoline);
memcpy((caddr_t)MP_TRAMP_DATA,
mp_tramp_data_start,
mp_tramp_data_end - mp_tramp_data_start);
/*
* We need to patch this after we copy the tramp data,
* the symbol points into the copied tramp data page.
*/
mp_pdirpa = tramp_pdirpa;
/* Unmap, will be remapped in cpu_start_secondary */
pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
#endif /* MULTIPROCESSOR */
#endif
}
#define IDTVEC(name) __CONCAT(X, name)
typedef void (vector)(void);
extern vector *IDTVEC(exceptions)[];
paddr_t early_pte_pages;
void
init_x86_64(paddr_t first_avail)
{
struct region_descriptor region;
bios_memmap_t *bmp;
int x, ist;
uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
/*
* locore0 mapped 3 pages for use before the pmap is initialized
* starting at first_avail. These pages are currently used by
* efifb to create early-use VAs for the framebuffer before efifb
* is attached.
*/
early_pte_pages = first_avail;
first_avail += 3 * NBPG;
cpu_init_msrs(&cpu_info_primary);
proc0.p_addr = proc0paddr;
cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb;
x86_bus_space_init();
i8254_startclock();
/*
* Initialize PAGE_SIZE-dependent variables.
*/
uvm_setpagesize();
/*
* Boot arguments are in a single page specified by /boot.
*
* We require the "new" vector form, as well as memory ranges
* to be given in bytes rather than KB.
*
* locore copies the data into bootinfo[] for us.
*/
if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) ==
(BAPIV_VECTOR | BAPIV_BMEMMAP)) {
if (bootinfo_size >= sizeof(bootinfo))
panic("boot args too big");
getbootinfo(bootinfo, bootinfo_size);
} else
panic("invalid /boot");
cninit();
/*
* Memory on the AMD64 port is described by three different things.
*
* 1. biosbasemem - This is outdated, and should really only be used to
* sanitize the other values. This is what we get back from the BIOS
* using the legacy routines, describing memory below 640KB.
*
* 2. bios_memmap[] - This is the memory map as the bios has returned
* it to us. It includes memory the kernel occupies, etc.
*
* 3. mem_cluster[] - This is the massaged free memory segments after
* taking into account the contents of bios_memmap, biosbasemem,
* and locore/machdep/pmap kernel allocations of physical
* pages.
*
* The other thing is that the physical page *RANGE* is described by
* three more variables:
*
* avail_start - This is a physical address of the start of available
* pages, until IOM_BEGIN. This is basically the start
* of the UVM managed range of memory, with some holes...
*
* avail_end - This is the end of physical pages. All physical pages
* that UVM manages are between avail_start and avail_end.
* There are holes...
*
* first_avail - This is the first available physical page after the
* kernel, page tables, etc.
*
* We skip the first few pages for trampolines, hibernate, and to avoid
* buggy SMI implementations that could corrupt the first 64KB.
*/
avail_start = 16*PAGE_SIZE;
#ifdef MULTIPROCESSOR
if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
avail_start = MP_TRAMPOLINE + PAGE_SIZE;
if (avail_start < MP_TRAMP_DATA + PAGE_SIZE)
avail_start = MP_TRAMP_DATA + PAGE_SIZE;
#endif
#if (NACPI > 0 && !defined(SMALL_KERNEL))
if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE)
avail_start = ACPI_TRAMPOLINE + PAGE_SIZE;
if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE)
avail_start = ACPI_TRAMP_DATA + PAGE_SIZE;
#endif
#ifdef HIBERNATE
if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE)
avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE;
#endif /* HIBERNATE */
/*
* We need to go through the BIOS memory map given, and
* fill out mem_clusters and mem_cluster_cnt stuff, taking
* into account all the points listed above.
*/
avail_end = mem_cluster_cnt = 0;
for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
paddr_t s1, s2, e1, e2;
/* Ignore non-free memory */
if (bmp->type != BIOS_MAP_FREE)
continue;
if (bmp->size < PAGE_SIZE)
continue;
/* Init our segment(s), round/trunc to pages */
s1 = round_page(bmp->addr);
e1 = trunc_page(bmp->addr + bmp->size);
s2 = e2 = 0;
/*
* XXX Some buggy ACPI BIOSes use memory that they
* declare as free. Current worst offender is
* Supermicro 5019D-FTN4. Typically the affected memory
* areas are small blocks between areas reserved for
* ACPI and other BIOS goo. So skip areas smaller
* than 32 MB above the 16 MB boundary (to avoid
* affecting legacy stuff).
*/
if (s1 > 16*1024*1024 && (e1 - s1) < 32*1024*1024)
continue;
/* Check and adjust our segment(s) */
/* Nuke low pages */
if (s1 < avail_start) {
s1 = avail_start;
if (s1 > e1)
continue;
}
/*
* The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT of
* memory, so discard anything above that.
*/
if (e1 >= max_dm_size) {
e1 = max_dm_size;
if (s1 > e1)
continue;
}
/* Crop stuff into "640K hole" */
if (s1 < IOM_BEGIN && e1 > IOM_BEGIN)
e1 = IOM_BEGIN;
if (s1 < biosbasemem && e1 > biosbasemem)
e1 = biosbasemem;
/* Split any segments straddling the 16MB boundary */
if (s1 < 16*1024*1024 && e1 > 16*1024*1024) {
e2 = e1;
s2 = e1 = 16*1024*1024;
}
/* Store segment(s) */
if (e1 - s1 >= PAGE_SIZE) {
mem_clusters[mem_cluster_cnt].start = s1;
mem_clusters[mem_cluster_cnt].size = e1 - s1;
mem_cluster_cnt++;
}
if (e2 - s2 >= PAGE_SIZE) {
mem_clusters[mem_cluster_cnt].start = s2;
mem_clusters[mem_cluster_cnt].size = e2 - s2;
mem_cluster_cnt++;
}
if (avail_end < e1)
avail_end = e1;
if (avail_end < e2)
avail_end = e2;
}
/*
* Call pmap initialization to make new kernel address space.
* We must do this before loading pages into the VM system.
*/
first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end));
/* Allocate these out of the 640KB base memory */
if (avail_start != PAGE_SIZE)
avail_start = pmap_prealloc_lowmem_ptps(avail_start);
cpu_init_extents();
/* Make sure the end of the space used by the kernel is rounded. */
first_avail = round_page(first_avail);
kern_end = KERNBASE + first_avail;
/*
* Now, load the memory clusters (which have already been
* flensed) into the VM system.
*/
for (x = 0; x < mem_cluster_cnt; x++) {
paddr_t seg_start = mem_clusters[x].start;
paddr_t seg_end = seg_start + mem_clusters[x].size;
if (seg_start < first_avail)
seg_start = first_avail;
if (seg_start > seg_end)
continue;
if (seg_end - seg_start < PAGE_SIZE)
continue;
physmem += atop(mem_clusters[x].size);
#if DEBUG_MEMLOAD
printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n",
seg_start, seg_end, atop(seg_start), atop(seg_end));
#endif
uvm_page_physload(atop(seg_start), atop(seg_end),
atop(seg_start), atop(seg_end), 0);
}
/*
* Now, load the memory between the end of I/O memory "hole"
* and the kernel.
*/
{
paddr_t seg_start = round_page(IOM_END);
paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE);
if (seg_start < seg_end) {
#if DEBUG_MEMLOAD
printf("loading 0x%lx-0x%lx\n", seg_start, seg_end);
#endif
uvm_page_physload(atop(seg_start), atop(seg_end),
atop(seg_start), atop(seg_end), 0);
}
}
#if DEBUG_MEMLOAD
printf("avail_start = 0x%lx\n", avail_start);
printf("avail_end = 0x%lx\n", avail_end);
printf("first_avail = 0x%lx\n", first_avail);
#endif
/*
* Steal memory for the message buffer (at end of core).
*/
{
struct vm_physseg *vps = NULL;
psize_t sz = round_page(MSGBUFSIZE);
psize_t reqsz = sz;
for (x = 0; x < vm_nphysseg; x++) {
vps = &vm_physmem[x];
if (ptoa(vps->avail_end) == avail_end)
break;
}
if (x == vm_nphysseg)
panic("init_x86_64: can't find end of memory");
/* Shrink so it'll fit in the last segment. */
if ((vps->avail_end - vps->avail_start) < atop(sz))
sz = ptoa(vps->avail_end - vps->avail_start);
vps->avail_end -= atop(sz);
vps->end -= atop(sz);
msgbuf_paddr = ptoa(vps->avail_end);
/* Remove the last segment if it now has no pages. */
if (vps->start == vps->end) {
for (vm_nphysseg--; x < vm_nphysseg; x++)
vm_physmem[x] = vm_physmem[x + 1];
}
/* Now find where the new avail_end is. */
for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
if (vm_physmem[x].avail_end > avail_end)
avail_end = vm_physmem[x].avail_end;
avail_end = ptoa(avail_end);
/* Warn if the message buffer had to be shrunk. */
if (sz != reqsz)
printf("WARNING: %ld bytes not available for msgbuf "
"in last cluster (%ld used)\n", reqsz, sz);
}
/*
* Steal some memory for a dump bouncebuffer if we have memory over
* the 32-bit barrier.
*/
if (avail_end > 0xffffffff) {
struct vm_physseg *vps = NULL;
psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1)));
/* XXX assumes segments are ordered */
for (x = 0; x < vm_nphysseg; x++) {
vps = &vm_physmem[x];
/* Find something between 16meg and 4gig */
if (ptoa(vps->avail_end) <= 0xffffffff &&
ptoa(vps->avail_start) >= 0xffffff)
break;
}
if (x == vm_nphysseg)
panic("init_x86_64: no memory between "
"0xffffff-0xffffffff");
/* Shrink so it'll fit in the segment. */
if ((vps->avail_end - vps->avail_start) < atop(sz))
sz = ptoa(vps->avail_end - vps->avail_start);
vps->avail_end -= atop(sz);
vps->end -= atop(sz);
dumpmem_paddr = ptoa(vps->avail_end);
dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr);
dumpmem_sz = sz;
/* Remove the last segment if it now has no pages. */
if (vps->start == vps->end) {
for (vm_nphysseg--; x < vm_nphysseg; x++)
vm_physmem[x] = vm_physmem[x + 1];
}
}
pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);
idt = (struct gate_descriptor *)idt_vaddr;
cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt;
/* make gdt gates and memory segments */
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0,
0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0,
0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE32_SEL), 0,
atop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0,
atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0,
atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL),
cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1,
SDT_SYS386TSS, SEL_KPL, 0);
/* exceptions */
for (x = 0; x < 32; x++) {
/* trap2 == NMI, trap8 == double fault */
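/* a nonzero ist selects a dedicated IST stack for these traps */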
ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
(x == 3) ? SEL_UPL : SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
idt_allocmap[x] = 1;
}
setregion(&region, cpu_info_primary.ci_gdt, GDT_SIZE - 1);
lgdt(&region);
cpu_init_idt();
intr_default_setup();
fpuinit(&cpu_info_primary);
softintr_init();
splraise(IPL_IPI);
intr_enable();
#ifdef DDB
db_machine_init();
ddb_init();
if (boothowto & RB_KDB)
db_enter();
#endif
}
void
cpu_reset(void)
{
intr_disable();
if (cpuresetfn)
(*cpuresetfn)();
/*
* The keyboard controller has 4 random output pins, one of which is
* connected to the RESET pin on the CPU in many PCs. We tell the
* keyboard controller to pulse this line a couple of times.
*/
outb(IO_KBD + KBCMDP, KBC_PULSE0);
delay(100000);
outb(IO_KBD + KBCMDP, KBC_PULSE0);
delay(100000);
/*
* Try to cause a triple fault and watchdog reset by making the IDT
* invalid and causing a fault.
*/
memset((caddr_t)idt, 0, NIDT * sizeof(idt[0]));
__asm volatile("divl %0,%1" : : "q" (0), "a" (0));
for (;;)
continue;
/* NOTREACHED */
}
/*
* cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
*/
int
cpu_dumpsize(void)
{
int size;
size = ALIGN(sizeof(kcore_seg_t)) +
ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
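/* The MD dump header must fit into a single disk block. */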
if (roundup(size, dbtob(1)) != dbtob(1))
return (-1);
return (1);
}
/*
* cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
*/
u_long
cpu_dump_mempagecnt(void)
{
u_long i, n;
n = 0;
for (i = 0; i < mem_cluster_cnt; i++)
n += atop(mem_clusters[i].size);
return (n);
}
/*
* Figure out which portions of memory are used by the kernel/system.
*/
int
amd64_pa_used(paddr_t addr)
{
struct vm_page *pg;
/* Kernel manages these */
if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0)
return 1;
/* Kernel is loaded here */
if (addr > IOM_END && addr < (kern_end - KERNBASE))
return 1;
/* Low memory used for various bootstrap things */
if (addr < avail_start)
return 1;
/*
* The only regions I can think of that are left are the things
* we steal away from UVM. The message buffer?
* XXX - ignore these for now.
*/
return 0;
}
void
cpu_initclocks(void)
{
(*initclock_func)();
}
void
need_resched(struct cpu_info *ci)
{
ci->ci_want_resched = 1;
/* There's a risk we'll be called before the idle threads start */
if (ci->ci_curproc) {
aston(ci->ci_curproc);
cpu_kick(ci);
}
}
/*
* Allocate an IDT vector slot within the given range.
* XXX needs locking to avoid MP allocation races.
*/
int
idt_vec_alloc(int low, int high)
{
int vec;
for (vec = low; vec <= high; vec++) {
if (idt_allocmap[vec] == 0) {
idt_allocmap[vec] = 1;
return vec;
}
}
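/* No free vector in the requested range. */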
return 0;
}
void
idt_vec_set(int vec, void (*function)(void))
{
/*
* Vector should be allocated, so no locking needed.
*/
KASSERT(idt_allocmap[vec] == 1);
setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
}
void
idt_vec_free(int vec)
{
unsetgate(&idt[vec]);
idt_allocmap[vec] = 0;
}
#ifdef DIAGNOSTIC
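/*
 * Assert that the current IPL is at least wantipl and that we are not
 * being called from an interrupt handler established at a higher level.
 */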
void
splassert_check(int wantipl, const char *func)
{
int cpl = curcpu()->ci_ilevel;
int floor = curcpu()->ci_handled_intr_level;
if (cpl < wantipl)
splassert_fail(wantipl, cpl, func);
if (floor > wantipl)
splassert_fail(wantipl, floor, func);
}
#endif
int
copyin32(const uint32_t *uaddr, uint32_t *kaddr)
{
if ((vaddr_t)uaddr & 0x3)
return EFAULT;
/* copyin(9) is atomic */
return copyin(uaddr, kaddr, sizeof(uint32_t));
}
void
getbootinfo(char *bootinfo, int bootinfo_size)
{
bootarg32_t *q;
bios_ddb_t *bios_ddb;
bios_bootduid_t *bios_bootduid;
bios_bootsr_t *bios_bootsr;
#undef BOOTINFO_DEBUG
#ifdef BOOTINFO_DEBUG
printf("bootargv:");
#endif
for (q = (bootarg32_t *)bootinfo;
(q->ba_type != BOOTARG_END) &&
((((char *)q) - bootinfo) < bootinfo_size);
q = (bootarg32_t *)(((char *)q) + q->ba_size)) {
switch (q->ba_type) {
case BOOTARG_MEMMAP:
bios_memmap = (bios_memmap_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
printf(" memmap %p", bios_memmap);
#endif
break;
case BOOTARG_DISKINFO:
bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
printf(" diskinfo %p", bios_diskinfo);
#endif
break;
case BOOTARG_APMINFO:
/* generated by i386 boot loader */
break;
case BOOTARG_CKSUMLEN:
bios_cksumlen = *(u_int32_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
printf(" cksumlen %d", bios_cksumlen);
#endif
break;
case BOOTARG_PCIINFO:
/* generated by i386 boot loader */
break;
case BOOTARG_CONSDEV:
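/*
 * A size larger than the old bios_oconsdev_t indicates the
 * extended console descriptor passed by newer boot loaders.
 */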
if (q->ba_size > sizeof(bios_oconsdev_t) +
offsetof(struct _boot_args32, ba_arg)) {
#if NCOM > 0
bios_consdev_t *cdp =
(bios_consdev_t*)q->ba_arg;
static const int ports[] =
{ 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
int unit = minor(cdp->consdev);
uint64_t consaddr = cdp->consaddr;
if (consaddr == -1 && unit >= 0 &&
unit < nitems(ports))
consaddr = ports[unit];
if (major(cdp->consdev) == 8 &&
consaddr != -1) {
comconsunit = unit;
comconsaddr = consaddr;
comconsrate = cdp->conspeed;
comconsfreq = cdp->consfreq;
comcons_reg_width = cdp->reg_width;
comcons_reg_shift = cdp->reg_shift;
if (cdp->flags & BCD_MMIO)
comconsiot = X86_BUS_SPACE_MEM;
else
comconsiot = X86_BUS_SPACE_IO;
}
#endif
#ifdef BOOTINFO_DEBUG
printf(" console 0x%x:%d",
cdp->consdev, cdp->conspeed);
#endif
} else {
#if NCOM > 0
bios_oconsdev_t *cdp =
(bios_oconsdev_t*)q->ba_arg;
static const int ports[] =
{ 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
int unit = minor(cdp->consdev);
int consaddr = cdp->consaddr;
if (consaddr == -1 && unit >= 0 &&
unit < nitems(ports))
consaddr = ports[unit];
if (major(cdp->consdev) == 8 &&
consaddr != -1) {
comconsunit = unit;
comconsaddr = consaddr;
comconsrate = cdp->conspeed;
comconsiot = X86_BUS_SPACE_IO;
}
#endif
#ifdef BOOTINFO_DEBUG
printf(" console 0x%x:%d",
cdp->consdev, cdp->conspeed);
#endif
}
break;
case BOOTARG_BOOTMAC:
bios_bootmac = (bios_bootmac_t *)q->ba_arg;
break;
case BOOTARG_DDB:
bios_ddb = (bios_ddb_t *)q->ba_arg;
#ifdef DDB
db_console = bios_ddb->db_console;
#endif
break;
case BOOTARG_BOOTDUID:
bios_bootduid = (bios_bootduid_t *)q->ba_arg;
memcpy(bootduid, bios_bootduid, sizeof(bootduid));
break;
case BOOTARG_BOOTSR:
bios_bootsr = (bios_bootsr_t *)q->ba_arg;
#if NSOFTRAID > 0
memcpy(&sr_bootuuid, &bios_bootsr->uuid,
sizeof(sr_bootuuid));
memcpy(&sr_bootkey, &bios_bootsr->maskkey,
sizeof(sr_bootkey));
#endif
explicit_bzero(bios_bootsr, sizeof(bios_bootsr_t));
break;
case BOOTARG_EFIINFO:
bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
break;
case BOOTARG_UCODE:
bios_ucode = (bios_ucode_t *)q->ba_arg;
break;
default:
#ifdef BOOTINFO_DEBUG
printf(" unsupported arg (%d) %p", q->ba_type,
q->ba_arg);
#endif
break;
}
}
#ifdef BOOTINFO_DEBUG
printf("\n");
#endif
}
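/*
 * Validate a user-supplied register set before it is loaded into a
 * trapframe: static rflags bits must be unchanged, %cs and %ss must be
 * valid user selectors, and %rip must point into user space.
 */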
int
check_context(const struct reg *regs, struct trapframe *tf)
{
uint16_t sel;
if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
return EINVAL;
sel = regs->r_ss & 0xffff;
if (!VALID_USER_DSEL(sel))
return EINVAL;
sel = regs->r_cs & 0xffff;
if (!VALID_USER_CSEL(sel))
return EINVAL;
if (regs->r_rip >= VM_MAXUSER_ADDRESS)
return EINVAL;
return 0;
}
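/* Install a new delay function only if it beats the current one. */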
void
delay_init(void(*fn)(int), int fn_quality)
{
static int cur_quality = 0;
if (fn_quality > cur_quality) {
delay_func = fn;
cur_quality = fn_quality;
}
}
/* $OpenBSD: ip6_input.c,v 1.254 2022/08/21 14:15:55 bluhm Exp $ */
/* $KAME: ip6_input.c,v 1.188 2001/03/29 05:34:31 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
#include "pf.h"
#include "carp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/sysctl.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/task.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include "gif.h"
#include "bpfilter.h"
#ifdef MROUTING
#include <netinet6/ip6_mroute.h>
#endif
#if NPF > 0
#include <net/pfvar.h>
#endif
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
struct niqueue ip6intrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IPV6);
struct cpumem *ip6counters;
uint8_t ip6_soiikey[IP6_SOIIKEY_LEN];
int ip6_ours(struct mbuf **, int *, int, int);
int ip6_check_rh0hdr(struct mbuf *, int *);
int ip6_hbhchcheck(struct mbuf **, int *, int *);
int ip6_hopopts_input(struct mbuf **, int *, u_int32_t *, u_int32_t *);
struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
int ip6_sysctl_soiikey(void *, size_t *, void *, size_t);
static struct mbuf_queue ip6send_mq;
static void ip6_send_dispatch(void *);
static struct task ip6send_task =
TASK_INITIALIZER(ip6_send_dispatch, &ip6send_mq);
/*
* IP6 initialization: fill in IP6 protocol switch table.
* All protocols not implemented in kernel go to raw IP6 protocol handler.
*/
void
ip6_init(void)
{
const struct protosw *pr;
int i;
pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
if (pr == NULL)
panic("%s", __func__);
for (i = 0; i < IPPROTO_MAX; i++)
ip6_protox[i] = pr - inet6sw;
for (pr = inet6domain.dom_protosw;
pr < inet6domain.dom_protoswNPROTOSW; pr++)
if (pr->pr_domain->dom_family == PF_INET6 &&
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW &&
pr->pr_protocol < IPPROTO_MAX)
ip6_protox[pr->pr_protocol] = pr - inet6sw;
ip6_randomid_init();
nd6_init();
frag6_init();
mq_init(&ip6send_mq, 64, IPL_SOFTNET);
ip6counters = counters_alloc(ip6s_ncounters);
#ifdef MROUTING
rt_timer_queue_init(&ip6_mrouterq, MCAST_EXPIRE_TIMEOUT,
&mf6c_expire_route);
#endif
}
struct ip6_offnxt {
int ion_off;
int ion_nxt;
};
/*
* Enqueue packet for local delivery. Queuing is used as a boundary
* between the network layer (input/forward path) running with
* NET_LOCK_SHARED() and the transport layer needing it exclusively.
*/
int
ip6_ours(struct mbuf **mp, int *offp, int nxt, int af)
{
/* ip6_hbhchcheck() may be run before, then off and nxt are set */
if (*offp == 0) {
nxt = ip6_hbhchcheck(mp, offp, NULL);
if (nxt == IPPROTO_DONE)
return IPPROTO_DONE;
}
/* We are already in an IPv4/IPv6 local delivery loop. */
if (af != AF_UNSPEC)
return nxt;
/* save values for later, use after dequeue */
if (*offp != sizeof(struct ip6_hdr)) {
struct m_tag *mtag;
struct ip6_offnxt *ion;
/* mbuf tags are expensive, but only used for header options */
mtag = m_tag_get(PACKET_TAG_IP6_OFFNXT, sizeof(*ion),
M_NOWAIT);
if (mtag == NULL) {
ip6stat_inc(ip6s_idropped);
m_freemp(mp);
return IPPROTO_DONE;
}
ion = (struct ip6_offnxt *)(mtag + 1);
ion->ion_off = *offp;
ion->ion_nxt = nxt;
m_tag_prepend(*mp, mtag);
}
niq_enqueue(&ip6intrq, *mp);
*mp = NULL;
return IPPROTO_DONE;
}
/*
* Dequeue and process locally delivered packets.
* This is called with exclusive NET_LOCK().
*/
void
ip6intr(void)
{
struct mbuf *m;
while ((m = niq_dequeue(&ip6intrq)) != NULL) {
struct m_tag *mtag;
int off, nxt;
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("ip6intr no HDR");
#endif
mtag = m_tag_find(m, PACKET_TAG_IP6_OFFNXT, NULL);
if (mtag != NULL) {
struct ip6_offnxt *ion;
ion = (struct ip6_offnxt *)(mtag + 1);
off = ion->ion_off;
nxt = ion->ion_nxt;
m_tag_delete(m, mtag);
} else {
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
off = sizeof(struct ip6_hdr);
nxt = ip6->ip6_nxt;
}
nxt = ip_deliver(&m, &off, nxt, AF_INET6);
KASSERT(nxt == IPPROTO_DONE);
}
}
void
ipv6_input(struct ifnet *ifp, struct mbuf *m)
{
int off, nxt;
off = 0;
nxt = ip6_input_if(&m, &off, IPPROTO_IPV6, AF_UNSPEC, ifp);
KASSERT(nxt == IPPROTO_DONE);
}
struct mbuf *
ipv6_check(struct ifnet *ifp, struct mbuf *m)
{
struct ip6_hdr *ip6;
if (m->m_len < sizeof(*ip6)) {
m = m_pullup(m, sizeof(*ip6));
if (m == NULL) {
ip6stat_inc(ip6s_toosmall);
return (NULL);
}
}
ip6 = mtod(m, struct ip6_hdr *);
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
ip6stat_inc(ip6s_badvers);
goto bad;
}
/*
* Check against address spoofing/corruption.
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
    IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
/*
* XXX: "badscope" is not very suitable for a multicast source.
*/
ip6stat_inc(ip6s_badscope);
goto bad;
}
if ((IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) ||
    IN6_IS_ADDR_LOOPBACK(&ip6->ip6_dst)) &&
(ifp->if_flags & IFF_LOOPBACK) == 0) {
ip6stat_inc(ip6s_badscope);
goto bad;
}
/* Drop packets if interface ID portion is already filled. */
if (((IN6_IS_SCOPE_EMBED(&ip6->ip6_src) && ip6->ip6_src.s6_addr16[1]) ||
    (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst) && ip6->ip6_dst.s6_addr16[1])) &&
(ifp->if_flags & IFF_LOOPBACK) == 0) {
ip6stat_inc(ip6s_badscope);
goto bad;
}
if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) &&
!(m->m_flags & M_LOOP)) {
/*
* In this case, the packet should come from the loopback
* interface. However, we cannot just check the if_flags,
* because ip6_mloopback() passes the "actual" interface
* as the outgoing/incoming interface.
*/
ip6stat_inc(ip6s_badscope);
goto bad;
}
/*
* The following check is not documented in specs. A malicious
* party may be able to use IPv4 mapped addr to confuse tcp/udp stack
* and bypass security checks (act as if it was from 127.0.0.1 by using
* IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* This check chokes if we are in a SIIT cloud. As none of the BSDs
* support IPv4-less kernel compilation, we cannot support a SIIT
* environment at all. So it makes more sense for us to reject any
* malicious packets in a non-SIIT environment than to try to do
* partial support for a SIIT environment.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
ip6stat_inc(ip6s_badscope);
goto bad;
}
/*
* Reject packets with IPv4 compatible addresses (auto tunnel).
*
* The code forbids automatic tunneling as per RFC4213.
*/
if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
    IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
ip6stat_inc(ip6s_badscope);
goto bad;
}
return (m);
bad:
m_freem(m);
return (NULL);
}
int
ip6_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp)
{
struct mbuf *m;
struct ip6_hdr *ip6;
struct sockaddr_in6 sin6;
struct rtentry *rt = NULL;
int ours = 0;
u_int16_t src_scope, dst_scope;
#if NPF > 0
struct in6_addr odst;
#endif
int srcrt = 0;
KASSERT(*offp == 0);
ip6stat_inc(ip6s_total);
m = *mp = ipv6_check(ifp, *mp);
if (m == NULL)
goto bad;
ip6 = mtod(m, struct ip6_hdr *);
#if NCARP > 0
if (carp_lsdrop(ifp, m, AF_INET6, ip6->ip6_src.s6_addr32,
ip6->ip6_dst.s6_addr32, (ip6->ip6_nxt == IPPROTO_ICMPV6 ? 0 : 1)))
goto bad;
#endif
ip6stat_inc(ip6s_nxthist + ip6->ip6_nxt);
/*
* If the packet has been received on a loopback interface it
* can be destined to any local address, not necessarily to
* an address configured on `ifp'.
*/
if (ifp->if_flags & IFF_LOOPBACK) {
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
src_scope = ip6->ip6_src.s6_addr16[1];
ip6->ip6_src.s6_addr16[1] = 0;
}
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
dst_scope = ip6->ip6_dst.s6_addr16[1];
ip6->ip6_dst.s6_addr16[1] = 0;
}
}
#if NPF > 0
/*
* Packet filter
*/
odst = ip6->ip6_dst;
if (pf_test(AF_INET6, PF_IN, ifp, mp) != PF_PASS)
goto bad;
m = *mp;
if (m == NULL)
goto bad;
ip6 = mtod(m, struct ip6_hdr *);
srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
#endif
/*
* Without embedded scope ID we cannot find link-local
* addresses in the routing table.
*/
if (ifp->if_flags & IFF_LOOPBACK) {
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
ip6->ip6_src.s6_addr16[1] = src_scope;
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
ip6->ip6_dst.s6_addr16[1] = dst_scope;
} else {
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
ip6->ip6_src.s6_addr16[1] = htons(ifp->if_index);
if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index);
}
/*
* Be more secure than RFC5095 and scan for type 0 routing headers.
* If pf has already scanned the header chain, do not do it twice.
*/
if (!(m->m_pkthdr.pf.flags & PF_TAG_PROCESSED) &&
ip6_check_rh0hdr(m, offp)) {
ip6stat_inc(ip6s_badoptions);
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, *offp);
m = *mp = NULL;
goto bad;
}
#if NPF > 0
if (pf_ouraddr(m) == 1) {
nxt = ip6_ours(mp, offp, nxt, af);
goto out;
}
#endif
/*
* Multicast check
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/*
* Make sure M_MCAST is set. It should theoretically
* already be there, but let's play safe because upper
* layers check for this flag.
*/
m->m_flags |= M_MCAST;
/*
* See if we belong to the destination multicast group on the
* arrival interface.
*/
if (in6_hasmulti(&ip6->ip6_dst, ifp))
ours = 1;
#ifdef MROUTING
if (ip6_mforwarding && ip6_mrouter[ifp->if_rdomain]) {
int error;
nxt = ip6_hbhchcheck(&m, offp, &ours);
if (nxt == IPPROTO_DONE)
goto out;
ip6 = mtod(m, struct ip6_hdr *);
/*
* If we are acting as a multicast router, all
* incoming multicast packets are passed to the
* kernel-level multicast forwarding function.
* The packet is returned (relatively) intact; if
* ip6_mforward() returns a non-zero value, the packet
* must be discarded, else it may be accepted below.
*/
KERNEL_LOCK();
error = ip6_mforward(ip6, ifp, m);
KERNEL_UNLOCK();
if (error) {
ip6stat_inc(ip6s_cantforward);
goto bad;
}
if (ours) {
if (af == AF_UNSPEC)
nxt = ip6_ours(mp, offp, nxt, af);
goto out;
}
goto bad;
}
#endif
if (!ours) {
ip6stat_inc(ip6s_notmember);
if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
ip6stat_inc(ip6s_cantforward);
goto bad;
}
nxt = ip6_ours(mp, offp, nxt, af);
goto out;
}
/*
* Unicast check
*/
memset(&sin6, 0, sizeof(struct sockaddr_in6));
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = ip6->ip6_dst;
rt = rtalloc_mpath(sin6tosa(&sin6), &ip6->ip6_src.s6_addr32[0],
m->m_pkthdr.ph_rtableid);
/*
* Accept the packet if the route to the destination is marked
* as local.
*/
if (rtisvalid(rt) && ISSET(rt->rt_flags, RTF_LOCAL)) {
struct in6_ifaddr *ia6 = ifatoia6(rt->rt_ifa);
if (ip6_forwarding == 0 && rt->rt_ifidx != ifp->if_index &&
    !((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_type == IFT_ENC) ||
    (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST))) {
/* received on wrong interface */
#if NCARP > 0
struct ifnet *out_if;
/*
* Virtual IPs on carp interfaces need to be checked
* also against the parent interface and other carp
* interfaces sharing the same parent.
*/
out_if = if_get(rt->rt_ifidx);
if (!(out_if && carp_strict_addr_chk(out_if, ifp))) {
ip6stat_inc(ip6s_wrongif);
if_put(out_if);
goto bad;
}
if_put(out_if);
#else
ip6stat_inc(ip6s_wrongif);
goto bad;
#endif
}
/*
* packets to a tentative, duplicated, or somehow invalid
* address must not be accepted.
*/
if ((ia6->ia6_flags & (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED))) {
char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
inet_ntop(AF_INET6, &ip6->ip6_src, src, sizeof(src));
inet_ntop(AF_INET6, &ip6->ip6_dst, dst, sizeof(dst));
/* address is not ready, so discard the packet. */
nd6log((LOG_INFO,
"%s: packet to an unready address %s->%s\n",
__func__, src, dst));
goto bad;
} else {
nxt = ip6_ours(mp, offp, nxt, af);
goto out;
}
}
#if NCARP > 0
if (ip6->ip6_nxt == IPPROTO_ICMPV6 &&
carp_lsdrop(ifp, m, AF_INET6, ip6->ip6_src.s6_addr32,
ip6->ip6_dst.s6_addr32, 1))
goto bad;
#endif
/*
* Now there is no reason to process the packet if it's not our own
* and we're not a router.
*/
if (!ip6_forwarding) {
ip6stat_inc(ip6s_cantforward);
goto bad;
}
nxt = ip6_hbhchcheck(&m, offp, &ours);
if (nxt == IPPROTO_DONE)
goto out;
if (ours) {
if (af == AF_UNSPEC)
nxt = ip6_ours(mp, offp, nxt, af);
goto out;
}
#ifdef IPSEC
if (ipsec_in_use) {
int rv;
rv = ipsec_forward_check(m, *offp, AF_INET6);
if (rv != 0) {
ip6stat_inc(ip6s_cantforward);
goto bad;
}
/*
* Fall through, forward packet. Outbound IPsec policy
* checking will occur in ip6_forward().
*/
}
#endif /* IPSEC */
ip6_forward(m, rt, srcrt);
*mp = NULL;
return IPPROTO_DONE;
bad:
nxt = IPPROTO_DONE;
m_freemp(mp);
out:
rtfree(rt);
return nxt;
}
/* On error free mbuf and return IPPROTO_DONE. */
int
ip6_hbhchcheck(struct mbuf **mp, int *offp, int *oursp)
{
struct ip6_hdr *ip6;
u_int32_t plen, rtalert = ~0;
int nxt;
ip6 = mtod(*mp, struct ip6_hdr *);
/*
* Process Hop-by-Hop options header if it's contained.
* m may be modified in ip6_hopopts_input().
* If a JumboPayload option is included, plen will also be modified.
*/
plen = (u_int32_t)ntohs(ip6->ip6_plen);
*offp = sizeof(struct ip6_hdr);
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
if (ip6_hopopts_input(mp, offp, &plen, &rtalert))
goto bad; /* m has already been freed */
/* adjust pointer */
ip6 = mtod(*mp, struct ip6_hdr *);
/*
* if the payload length field is 0 and the next header field
* indicates Hop-by-Hop Options header, then a Jumbo Payload
* option MUST be included.
*/
if (ip6->ip6_plen == 0 && plen == 0) {
/*
* Note that if a valid jumbo payload option is
* contained, ip6_hopopts_input() must set a valid
* (non-zero) payload length to the variable plen.
*/
ip6stat_inc(ip6s_badoptions);
icmp6_error(*mp, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
(caddr_t)&ip6->ip6_plen - (caddr_t)ip6);
goto bad;
}
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, *mp,
sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
if (hbh == NULL) {
ip6stat_inc(ip6s_tooshort);
goto bad;
}
nxt = hbh->ip6h_nxt;
/*
* accept the packet if a router alert option is included
* and we act as an IPv6 router.
*/
if (rtalert != ~0 && ip6_forwarding && oursp != NULL)
*oursp = 1;
} else
nxt = ip6->ip6_nxt;
/*
* Check that the amount of data in the buffers
* is at least as much as the IPv6 header would have us expect.
* Trim mbufs if longer than we expect.
* Drop packet if shorter than we expect.
*/
if ((*mp)->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
ip6stat_inc(ip6s_tooshort);
m_freemp(mp);
goto bad;
}
if ((*mp)->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
if ((*mp)->m_len == (*mp)->m_pkthdr.len) {
(*mp)->m_len = sizeof(struct ip6_hdr) + plen;
(*mp)->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
} else {
m_adj((*mp), sizeof(struct ip6_hdr) + plen -
(*mp)->m_pkthdr.len);
}
}
return nxt;
bad:
return IPPROTO_DONE;
}
/* scan packet for RH0 routing header. Mostly stolen from pf.c:pf_test() */
int
ip6_check_rh0hdr(struct mbuf *m, int *offp)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct ip6_rthdr rthdr;
struct ip6_ext opt6;
u_int8_t proto = ip6->ip6_nxt;
int done = 0, lim, off, rh_cnt = 0;
off = ((caddr_t)ip6 - m->m_data) + sizeof(struct ip6_hdr);
lim = min(m->m_pkthdr.len, ntohs(ip6->ip6_plen) + sizeof(*ip6));
do {
switch (proto) {
case IPPROTO_ROUTING:
if (rh_cnt++) {
/* more than one rh header present */
*offp = off;
return (1);
}
if (off + sizeof(rthdr) > lim) {
/* packet too short to make sense */
*offp = off;
return (1);
}
m_copydata(m, off, sizeof(rthdr), &rthdr);
if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
*offp = off +
offsetof(struct ip6_rthdr, ip6r_type);
return (1);
}
off += (rthdr.ip6r_len + 1) * 8;
proto = rthdr.ip6r_nxt;
break;
case IPPROTO_AH:
case IPPROTO_HOPOPTS:
case IPPROTO_DSTOPTS:
/* get next header and header length */
if (off + sizeof(opt6) > lim) {
/*
* Packet too short to make sense; we could
* reject the packet, but as a router we
* should not do that, so forward it.
*/
return (0);
}
m_copydata(m, off, sizeof(opt6), &opt6);
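/* AH length is in 32-bit units minus 2, others in 8-octet units minus 1 */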
if (proto == IPPROTO_AH)
off += (opt6.ip6e_len + 2) * 4;
else
off += (opt6.ip6e_len + 1) * 8;
proto = opt6.ip6e_nxt;
break;
case IPPROTO_FRAGMENT:
default:
/* end of header stack */
done = 1;
break;
}
} while (!done);
return (0);
}
/*
* Hop-by-Hop options header processing. If a valid jumbo payload option is
* included, the real payload length will be stored in plenp.
* On error free mbuf and return -1.
*
* rtalertp - XXX: should be stored in a more smart way
*/
int
ip6_hopopts_input(struct mbuf **mp, int *offp, u_int32_t *plenp,
u_int32_t *rtalertp)
{
int off = *offp, hbhlen;
struct ip6_hbh *hbh;
/* validation of the length of the header */
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, *mp,
sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
if (hbh == NULL) {
ip6stat_inc(ip6s_tooshort);
return -1;
}
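/* ip6h_len counts 8-octet units beyond the first 8 octets of the header */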
hbhlen = (hbh->ip6h_len + 1) << 3;
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, *mp, sizeof(struct ip6_hdr),
hbhlen);
if (hbh == NULL) {
ip6stat_inc(ip6s_tooshort);
return -1;
}
off += hbhlen;
hbhlen -= sizeof(struct ip6_hbh);
if (ip6_process_hopopts(mp, (u_int8_t *)hbh + sizeof(struct ip6_hbh),
hbhlen, rtalertp, plenp) < 0)
return (-1);
*offp = off;
return (0);
}
/*
* Search header for all Hop-by-hop options and process each option.
* This function is separate from ip6_hopopts_input() in order to
* handle the case where the sending node itself processes its hop-by-hop
* options header. In such a case, the function is called from ip6_output().
* On error free mbuf and return -1.
*
* The function assumes that the hbh header is located right after the IPv6
* header (RFC2460 p7), that opthead is a pointer into the data content of m,
* and that opthead to opthead + hbhlen lies in a contiguous memory region.
*/
int
ip6_process_hopopts(struct mbuf **mp, u_int8_t *opthead, int hbhlen,
u_int32_t *rtalertp, u_int32_t *plenp)
{
struct ip6_hdr *ip6;
int optlen = 0;
u_int8_t *opt = opthead;
u_int16_t rtalert_val;
u_int32_t jumboplen;
const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
switch (*opt) {
case IP6OPT_PAD1:
optlen = 1;
break;
case IP6OPT_PADN:
if (hbhlen < IP6OPT_MINLEN) {
ip6stat_inc(ip6s_toosmall);
goto bad;
}
optlen = *(opt + 1) + 2;
break;
case IP6OPT_ROUTER_ALERT:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_RTALERT_LEN) {
ip6stat_inc(ip6s_toosmall);
goto bad;
}
if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
/* XXX stat */
icmp6_error(*mp, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_RTALERT_LEN;
memcpy((caddr_t)&rtalert_val, (caddr_t)(opt + 2), 2);
*rtalertp = ntohs(rtalert_val);
break;
case IP6OPT_JUMBO:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_JUMBO_LEN) {
ip6stat_inc(ip6s_toosmall);
goto bad;
}
if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
/* XXX stat */
icmp6_error(*mp, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_JUMBO_LEN;
/*
* IPv6 packets that have a non-zero payload length
* must not contain a jumbo payload option.
*/
ip6 = mtod(*mp, struct ip6_hdr *);
if (ip6->ip6_plen) {
ip6stat_inc(ip6s_badoptions);
icmp6_error(*mp, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt - opthead);
return (-1);
}
/*
* We may see jumbolen in unaligned location, so
* we'd need to perform memcpy().
*/
memcpy(&jumboplen, opt + 2, sizeof(jumboplen));
jumboplen = (u_int32_t)htonl(jumboplen);
#if 1
/*
* if there are multiple jumbo payload options,
* *plenp will be non-zero and the packet will be
* rejected.
* the behavior may need some debate in ipngwg -
* multiple options do not make sense; however,
* there is no explicit mention of this in the specification.
*/
if (*plenp != 0) {
ip6stat_inc(ip6s_badoptions);
icmp6_error(*mp, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
#endif
/*
* jumbo payload length must be larger than 65535.
*/
if (jumboplen <= IPV6_MAXPACKET) {
ip6stat_inc(ip6s_badoptions);
icmp6_error(*mp, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
*plenp = jumboplen;
break;
default: /* unknown option */
if (hbhlen < IP6OPT_MINLEN) {
ip6stat_inc(ip6s_toosmall);
goto bad;
}
optlen = ip6_unknown_opt(mp, opt,
erroff + opt - opthead);
if (optlen == -1)
return (-1);
optlen += 2;
break;
}
}
return (0);
bad:
m_freemp(mp);
return (-1);
}
/*
* Unknown option processing.
* The third argument `off' is the offset from the IPv6 header to the option,
* which allows returning an ICMPv6 error even if the IPv6 header and the
* option header are not continuous.
* On error free mbuf and return -1.
*/
int
ip6_unknown_opt(struct mbuf **mp, u_int8_t *optp, int off)
{
struct ip6_hdr *ip6;
switch (IP6OPT_TYPE(*optp)) {
case IP6OPT_TYPE_SKIP: /* ignore the option */
return ((int)*(optp + 1));
case IP6OPT_TYPE_DISCARD: /* silently discard */
m_freemp(mp);
return (-1);
case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
ip6stat_inc(ip6s_badoptions);
icmp6_error(*mp, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
return (-1);
case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
ip6stat_inc(ip6s_badoptions);
ip6 = mtod(*mp, struct ip6_hdr *);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
((*mp)->m_flags & (M_BCAST|M_MCAST)))
m_freemp(mp);
else
icmp6_error(*mp, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_OPTION, off);
return (-1);
}
m_freemp(mp); /* XXX: NOTREACHED */
return (-1);
}
/*
* Create the "control" list for this pcb.
*
* The routine will be called from upper layer handlers like udp_input().
* Thus the routine assumes that the caller (udp_input) has already
* called IP6_EXTHDR_CHECK() and all the extension headers are located in the
* very first mbuf on the mbuf chain.
* We may want to add some infinite loop prevention or sanity checks for safety.
* (This applies only when you are using KAME mbuf chain restriction, i.e.
* you are using IP6_EXTHDR_CHECK() not m_pulldown())
*/
void
ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
if (in6p->inp_socket->so_options & SO_TIMESTAMP) {
struct timeval tv;
m_microtime(m, &tv);
*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
SCM_TIMESTAMP, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
/* RFC 2292 sec. 5 */
if ((in6p->inp_flags & IN6P_PKTINFO) != 0) {
struct in6_pktinfo pi6;
memcpy(&pi6.ipi6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
if (IN6_IS_SCOPE_EMBED(&pi6.ipi6_addr))
pi6.ipi6_addr.s6_addr16[1] = 0;
pi6.ipi6_ifindex = m ? m->m_pkthdr.ph_ifidx : 0;
*mp = sbcreatecontrol((caddr_t) &pi6,
sizeof(struct in6_pktinfo),
IPV6_PKTINFO, IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if ((in6p->inp_flags & IN6P_HOPLIMIT) != 0) {
int hlim = ip6->ip6_hlim & 0xff;
*mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int),
IPV6_HOPLIMIT, IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if ((in6p->inp_flags & IN6P_TCLASS) != 0) {
u_int32_t flowinfo;
int tclass;
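/* ip6_flow holds version (4 bits), traffic class (8), flow label (20) */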
flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
flowinfo >>= 20;
tclass = flowinfo & 0xff;
*mp = sbcreatecontrol((caddr_t)&tclass, sizeof(tclass),
IPV6_TCLASS, IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
/*
* IPV6_HOPOPTS socket option. Recall that we required super-user
* privilege for the option (see ip6_ctloutput), but it might be too
* strict, since there might be some hop-by-hop options which can be
* returned to normal user.
* See also RFC 2292 section 6 (or RFC 3542 section 8).
*/
if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) {
/*
* Check if a hop-by-hop options header is contained in the
* received packet, and if so, store the options as ancillary
* data. Note that a hop-by-hop options header must be
* just after the IPv6 header, which is assured through the
* IPv6 input processing.
*/
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
int hbhlen = 0;
struct mbuf *ext;
ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
ip6->ip6_nxt);
if (ext == NULL) {
ip6stat_inc(ip6s_tooshort);
return;
}
hbh = mtod(ext, struct ip6_hbh *);
hbhlen = (hbh->ip6h_len + 1) << 3;
if (hbhlen != ext->m_len) {
m_freem(ext);
ip6stat_inc(ip6s_tooshort);
return;
}
/*
* XXX: We copy the whole header even if a
* jumbo payload option is included, the option which
* is to be removed before returning according to
* RFC2292.
* Note: this constraint is removed in RFC3542.
*/
*mp = sbcreatecontrol((caddr_t)hbh, hbhlen,
IPV6_HOPOPTS,
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
m_freem(ext);
}
}
/* IPV6_DSTOPTS and IPV6_RTHDR socket options */
if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) {
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr);
/*
* Search for destination options headers or routing
* header(s) through the header chain, and stores each
* header as ancillary data.
* Note that the order of the headers remains in
* the chain of ancillary data.
*/
while (1) { /* is explicit loop prevention necessary? */
struct ip6_ext *ip6e = NULL;
int elen;
struct mbuf *ext = NULL;
/*
* if it is not an extension header, don't try to
* pull it from the chain.
*/
switch (nxt) {
case IPPROTO_DSTOPTS:
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
goto loopend;
}
ext = ip6_pullexthdr(m, off, nxt);
if (ext == NULL) {
ip6stat_inc(ip6s_tooshort);
return;
}
ip6e = mtod(ext, struct ip6_ext *);
if (nxt == IPPROTO_AH)
elen = (ip6e->ip6e_len + 2) << 2;
else
elen = (ip6e->ip6e_len + 1) << 3;
if (elen != ext->m_len) {
m_freem(ext);
ip6stat_inc(ip6s_tooshort);
return;
}
switch (nxt) {
case IPPROTO_DSTOPTS:
if (!(in6p->inp_flags & IN6P_DSTOPTS))
break;
*mp = sbcreatecontrol((caddr_t)ip6e, elen,
IPV6_DSTOPTS,
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_ROUTING:
if (!(in6p->inp_flags & IN6P_RTHDR))
break;
*mp = sbcreatecontrol((caddr_t)ip6e, elen,
IPV6_RTHDR,
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
/*
* other cases have been filtered in the above.
* none will visit this case. here we supply
* the code just in case (nxt overwritten or
* other cases).
*/
m_freem(ext);
goto loopend;
}
/* proceed with the next header. */
off += elen;
nxt = ip6e->ip6e_nxt;
ip6e = NULL;
m_freem(ext);
ext = NULL;
}
loopend:
;
}
}
/*
* pull single extension header from mbuf chain. returns single mbuf that
* contains the result, or NULL on error.
*/
struct mbuf *
ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
{
struct ip6_ext ip6e;
size_t elen;
struct mbuf *n;
#ifdef DIAGNOSTIC
switch (nxt) {
case IPPROTO_DSTOPTS:
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
printf("ip6_pullexthdr: invalid nxt=%d\n", nxt);
}
#endif
if (off + sizeof(ip6e) > m->m_pkthdr.len)
return NULL;
m_copydata(m, off, sizeof(ip6e), &ip6e);
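/* AH measures its length in 32-bit words, the others in 8-octet units */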
if (nxt == IPPROTO_AH)
elen = (ip6e.ip6e_len + 2) << 2;
else
elen = (ip6e.ip6e_len + 1) << 3;
if (off + elen > m->m_pkthdr.len)
return NULL;
MGET(n, M_DONTWAIT, MT_DATA);
if (n && elen >= MLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (n == NULL) {
ip6stat_inc(ip6s_idropped);
return NULL;
}
n->m_len = 0;
if (elen >= m_trailingspace(n)) {
m_free(n);
return NULL;
}
m_copydata(m, off, elen, mtod(n, caddr_t));
n->m_len = elen;
return n;
}
/*
* Get offset to the previous header followed by the header
* currently processed.
*/
int
ip6_get_prevhdr(struct mbuf *m, int off)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
if (off == sizeof(struct ip6_hdr)) {
return offsetof(struct ip6_hdr, ip6_nxt);
} else if (off < sizeof(struct ip6_hdr)) {
panic("%s: off < sizeof(struct ip6_hdr)", __func__);
} else {
int len, nlen, nxt;
struct ip6_ext ip6e;
nxt = ip6->ip6_nxt;
len = sizeof(struct ip6_hdr);
nlen = 0;
while (len < off) {
m_copydata(m, len, sizeof(ip6e), &ip6e);
switch (nxt) {
case IPPROTO_FRAGMENT:
nlen = sizeof(struct ip6_frag);
break;
case IPPROTO_AH:
nlen = (ip6e.ip6e_len + 2) << 2;
break;
default:
nlen = (ip6e.ip6e_len + 1) << 3;
break;
}
len += nlen;
nxt = ip6e.ip6e_nxt;
}
return (len - nlen);
}
}
/*
* get next header offset. m will be retained.
*/
int
ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
struct ip6_hdr ip6;
struct ip6_ext ip6e;
struct ip6_frag fh;
/* just in case */
if (m == NULL)
panic("%s: m == NULL", __func__);
if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
return -1;
switch (proto) {
case IPPROTO_IPV6:
if (m->m_pkthdr.len < off + sizeof(ip6))
return -1;
m_copydata(m, off, sizeof(ip6), &ip6);
if (nxtp)
*nxtp = ip6.ip6_nxt;
off += sizeof(ip6);
return off;
case IPPROTO_FRAGMENT:
/*
* terminate parsing if it is not the first fragment,
* it does not make sense to parse through it.
*/
if (m->m_pkthdr.len < off + sizeof(fh))
return -1;
m_copydata(m, off, sizeof(fh), &fh);
if ((fh.ip6f_offlg & IP6F_OFF_MASK) != 0)
return -1;
if (nxtp)
*nxtp = fh.ip6f_nxt;
off += sizeof(struct ip6_frag);
return off;
case IPPROTO_AH:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), &ip6e);
if (nxtp)
*nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 2) << 2;
if (m->m_pkthdr.len < off)
return -1;
return off;
case IPPROTO_HOPOPTS:
case IPPROTO_ROUTING:
case IPPROTO_DSTOPTS:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), &ip6e);
if (nxtp)
*nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 1) << 3;
if (m->m_pkthdr.len < off)
return -1;
return off;
case IPPROTO_NONE:
case IPPROTO_ESP:
case IPPROTO_IPCOMP:
/* give up */
return -1;
default:
return -1;
}
return -1;
}
/*
* get offset for the last header in the chain. m will be kept untainted.
*/
int
ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
int newoff;
int nxt;
if (!nxtp) {
nxt = -1;
nxtp = &nxt;
}
while (1) {
newoff = ip6_nexthdr(m, off, proto, nxtp);
if (newoff < 0)
return off;
else if (newoff < off)
return -1; /* invalid */
else if (newoff == off)
return newoff;
off = newoff;
proto = *nxtp;
}
}
/*
* System control for IP6
*/
const u_char inet6ctlerrmap[PRC_NCMDS] = {
0, 0, 0, 0,
0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
EMSGSIZE, EHOSTUNREACH, 0, 0,
0, 0, 0, 0,
ENOPROTOOPT
};
#ifdef MROUTING
extern int ip6_mrtproto;
#endif
const struct sysctl_bounded_args ipv6ctl_vars[] = {
{ IPV6CTL_DAD_PENDING, &ip6_dad_pending, SYSCTL_INT_READONLY },
#ifdef MROUTING
{ IPV6CTL_MRTPROTO, &ip6_mrtproto, SYSCTL_INT_READONLY },
#endif
{ IPV6CTL_FORWARDING, &ip6_forwarding, 0, 1 },
{ IPV6CTL_SENDREDIRECTS, &ip6_sendredirects, 0, 1 },
{ IPV6CTL_DEFHLIM, &ip6_defhlim, 0, 255 },
{ IPV6CTL_MAXFRAGPACKETS, &ip6_maxfragpackets, 0, 1000 },
{ IPV6CTL_LOG_INTERVAL, &ip6_log_interval, 0, INT_MAX },
{ IPV6CTL_HDRNESTLIMIT, &ip6_hdrnestlimit, 0, 100 },
{ IPV6CTL_DAD_COUNT, &ip6_dad_count, 0, 10 },
{ IPV6CTL_AUTO_FLOWLABEL, &ip6_auto_flowlabel, 0, 1 },
{ IPV6CTL_DEFMCASTHLIM, &ip6_defmcasthlim, 0, 255 },
{ IPV6CTL_USE_DEPRECATED, &ip6_use_deprecated, 0, 1 },
{ IPV6CTL_MAXFRAGS, &ip6_maxfrags, 0, 1000 },
{ IPV6CTL_MFORWARDING, &ip6_mforwarding, 0, 1 },
{ IPV6CTL_MULTIPATH, &ip6_multipath, 0, 1 },
{ IPV6CTL_MCAST_PMTU, &ip6_mcast_pmtu, 0, 1 },
{ IPV6CTL_NEIGHBORGCTHRESH, &ip6_neighborgcthresh, -1, 5 * 2048 },
{ IPV6CTL_MAXDYNROUTES, &ip6_maxdynroutes, -1, 5 * 4096 },
};
int
ip6_sysctl_ip6stat(void *oldp, size_t *oldlenp, void *newp)
{
struct ip6stat *ip6stat;
int ret;
CTASSERT(sizeof(*ip6stat) == (ip6s_ncounters * sizeof(uint64_t)));
ip6stat = malloc(sizeof(*ip6stat), M_TEMP, M_WAITOK);
counters_read(ip6counters, (uint64_t *)ip6stat, ip6s_ncounters);
ret = sysctl_rdstruct(oldp, oldlenp, newp,
ip6stat, sizeof(*ip6stat));
free(ip6stat, M_TEMP, sizeof(*ip6stat));
return (ret);
}
int
ip6_sysctl_soiikey(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
uint8_t oldkey[IP6_SOIIKEY_LEN];
int error;
error = suser(curproc);
if (error != 0)
return (error);
memcpy(oldkey, ip6_soiikey, sizeof(oldkey));
error = sysctl_struct(oldp, oldlenp, newp, newlen, ip6_soiikey,
sizeof(ip6_soiikey));
return (error);
}
int
ip6_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
#ifdef MROUTING
extern struct mrt6stat mrt6stat;
#endif
int error;
/* Almost all sysctl names at this level are terminal. */
if (namelen != 1 && name[0] != IPV6CTL_IFQUEUE)
return (ENOTDIR);
switch (name[0]) {
case IPV6CTL_STATS:
return (ip6_sysctl_ip6stat(oldp, oldlenp, newp));
#ifdef MROUTING
case IPV6CTL_MRTSTATS:
if (newp != NULL)
return (EPERM);
NET_LOCK();
error = sysctl_struct(oldp, oldlenp, newp, newlen,
&mrt6stat, sizeof(mrt6stat));
NET_UNLOCK();
return (error);
case IPV6CTL_MRTMIF:
if (newp)
return (EPERM);
NET_LOCK();
error = mrt6_sysctl_mif(oldp, oldlenp);
NET_UNLOCK();
return (error);
case IPV6CTL_MRTMFC:
if (newp)
return (EPERM);
NET_LOCK();
error = mrt6_sysctl_mfc(oldp, oldlenp);
NET_UNLOCK();
return (error);
#else
case IPV6CTL_MRTSTATS:
case IPV6CTL_MRTPROTO:
case IPV6CTL_MRTMIF:
case IPV6CTL_MRTMFC:
return (EOPNOTSUPP);
#endif
case IPV6CTL_MTUDISCTIMEOUT:
NET_LOCK();
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&ip6_mtudisc_timeout, 0, INT_MAX);
rt_timer_queue_change(&icmp6_mtudisc_timeout_q,
ip6_mtudisc_timeout);
NET_UNLOCK();
return (error);
case IPV6CTL_IFQUEUE:
return (sysctl_niq(name + 1, namelen - 1,
oldp, oldlenp, newp, newlen, &ip6intrq));
case IPV6CTL_SOIIKEY:
return (ip6_sysctl_soiikey(oldp, oldlenp, newp, newlen));
default:
NET_LOCK();
error = sysctl_bounded_arr(ipv6ctl_vars, nitems(ipv6ctl_vars),
name, namelen, oldp, oldlenp, newp, newlen);
NET_UNLOCK();
return (error);
}
/* NOTREACHED */
}
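/*
 * Locally generated packets handed to ip6_send() are queued and sent
 * from a task running under the net lock, so ip6_send() itself can be
 * called without holding it.
 */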
void
ip6_send_dispatch(void *xmq)
{
struct mbuf_queue *mq = xmq;
struct mbuf *m;
struct mbuf_list ml;
mq_delist(mq, &ml);
if (ml_empty(&ml))
return;
NET_LOCK();
while ((m = ml_dequeue(&ml)) != NULL) {
ip6_output(m, NULL, NULL, 0, NULL, NULL);
}
NET_UNLOCK();
}
void
ip6_send(struct mbuf *m)
{
mq_enqueue(&ip6send_mq, m);
task_add(net_tq(0), &ip6send_task);
}
/* $OpenBSD: ffs_alloc.c,v 1.114 2021/03/11 13:31:35 jsg Exp $ */
/* $NetBSD: ffs_alloc.c,v 1.11 1996/05/11 18:27:09 mycroft Exp $ */
/*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program.
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syslog.h>
#include <sys/stdint.h>
#include <sys/time.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#define ffs_fserr(fs, uid, cp) do { \
log(LOG_ERR, "uid %u on %s: %s\n", (uid), \
(fs)->fs_fsmnt, (cp)); \
} while (0)
daddr_t ffs_alloccg(struct inode *, u_int, daddr_t, int);
struct buf * ffs_cgread(struct fs *, struct inode *, u_int);
daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t);
ufsino_t ffs_dirpref(struct inode *);
daddr_t ffs_fragextend(struct inode *, u_int, daddr_t, int, int);
daddr_t ffs_hashalloc(struct inode *, u_int, daddr_t, int,
daddr_t (*)(struct inode *, u_int, daddr_t, int));
daddr_t ffs_nodealloccg(struct inode *, u_int, daddr_t, int);
daddr_t ffs_mapsearch(struct fs *, struct cg *, daddr_t, int);
static const struct timeval fserr_interval = { 2, 0 };
/*
* Allocate a block in the file system.
*
* The size of the requested block is given, which must be some
* multiple of fs_fsize and <= fs_bsize.
* A preference may be optionally specified. If a preference is given
* the following hierarchy is used to allocate a block:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate a block in the same cylinder group.
* 4) quadratically rehash into other cylinder groups, until an
* available block is located.
* If no block preference is given the following hierarchy is used
* to allocate a block:
* 1) allocate a block in the cylinder group that contains the
* inode for the file.
* 2) quadratically rehash into other cylinder groups, until an
* available block is located.
*/
int
ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
struct ucred *cred, daddr_t *bnp)
{
static struct timeval fsfull_last;
struct fs *fs;
daddr_t bno;
u_int cg;
int error;
*bnp = 0;
fs = ip->i_fs;
#ifdef DIAGNOSTIC
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
panic("ffs_alloc: bad size");
}
if (cred == NOCRED)
panic("ffs_alloc: missing credential");
#endif /* DIAGNOSTIC */
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
goto nospace;
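/* Only root may allocate from the minfree reserve. */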
if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
goto nospace;
if ((error = ufs_quota_alloc_blocks(ip, btodb(size), cred)) != 0)
return (error);
/*
* Start allocation in the preferred block's cylinder group or
* the file's inode's cylinder group if no preferred block was
* specified.
*/
if (bpref >= fs->fs_size)
bpref = 0;
if (bpref == 0)
cg = ino_to_cg(fs, ip->i_number);
else
cg = dtog(fs, bpref);
/* Try allocating a block. */
bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
if (bno > 0) {
/* allocation successful, update inode data */
DIP_ADD(ip, blocks, btodb(size));
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*bnp = bno;
return (0);
}
/* Restore user's disk quota because allocation failed. */
(void) ufs_quota_free_blocks(ip, btodb(size), cred);
nospace:
if (ratecheck(&fsfull_last, &fserr_interval)) {
ffs_fserr(fs, cred->cr_uid, "file system full");
uprintf("\n%s: write failed, file system is full\n",
fs->fs_fsmnt);
}
return (ENOSPC);
}
/*
* Reallocate a fragment to a bigger size
*
* The number and size of the old block is given, and a preference
* and new size is also specified. The allocator attempts to extend
* the original block. Failing that, the regular block allocator is
* invoked to get an appropriate block.
*/
int
ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize,
int nsize, struct ucred *cred, struct buf **bpp, daddr_t *blknop)
{
static struct timeval fsfull_last;
struct fs *fs;
struct buf *bp = NULL;
daddr_t quota_updated = 0;
int request, error;
u_int cg;
daddr_t bprev, bno;
if (bpp != NULL)
*bpp = NULL;
fs = ip->i_fs;
#ifdef DIAGNOSTIC
if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
(u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
printf(
"dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
panic("ffs_realloccg: bad size");
}
if (cred == NOCRED)
panic("ffs_realloccg: missing credential");
#endif /* DIAGNOSTIC */
if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
goto nospace;
bprev = DIP(ip, db[lbprev]);
if (bprev == 0) {
printf("dev = 0x%x, bsize = %d, bprev = %lld, fs = %s\n",
ip->i_dev, fs->fs_bsize, (long long)bprev, fs->fs_fsmnt);
panic("ffs_realloccg: bad bprev");
}
/*
* Allocate the extra space in the buffer.
*/
if (bpp != NULL) {
if ((error = bread(ITOV(ip), lbprev, fs->fs_bsize, &bp)) != 0)
goto error;
buf_adjcnt(bp, osize);
}
if ((error = ufs_quota_alloc_blocks(ip, btodb(nsize - osize), cred))
!= 0)
goto error;
quota_updated = btodb(nsize - osize);
/*
* Check for extension in the existing location.
*/
cg = dtog(fs, bprev);
if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
DIP_ADD(ip, blocks, btodb(nsize - osize));
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
if (bp->b_blkno != fsbtodb(fs, bno))
panic("ffs_realloccg: bad blockno");
#ifdef DIAGNOSTIC
if (nsize > bp->b_bufsize)
panic("ffs_realloccg: small buf");
#endif
buf_adjcnt(bp, nsize);
bp->b_flags |= B_DONE;
memset(bp->b_data + osize, 0, nsize - osize);
*bpp = bp;
}
if (blknop != NULL) {
*blknop = bno;
}
return (0);
}
/*
* Allocate a new disk location.
*/
if (bpref >= fs->fs_size)
bpref = 0;
switch (fs->fs_optim) {
case FS_OPTSPACE:
/*
* Allocate an exact sized fragment. Although this makes
* best use of space, we will waste time relocating it if
* the file continues to grow. If the fragmentation is
* less than half of the minimum free reserve, we choose
* to begin optimizing for time.
*/
request = nsize;
if (fs->fs_minfree < 5 ||
fs->fs_cstotal.cs_nffree >
fs->fs_dsize * fs->fs_minfree / (2 * 100))
break;
fs->fs_optim = FS_OPTTIME;
break;
case FS_OPTTIME:
/*
* At this point we have discovered a file that is trying to
* grow a small fragment to a larger fragment. To save time,
* we allocate a full sized block, then free the unused portion.
* If the file continues to grow, the `ffs_fragextend' call
* above will be able to grow it in place without further
* copying. If aberrant programs cause disk fragmentation to
* grow within 2% of the free reserve, we choose to begin
* optimizing for space.
*/
request = fs->fs_bsize;
if (fs->fs_cstotal.cs_nffree <
fs->fs_dsize * (fs->fs_minfree - 2) / 100)
break;
fs->fs_optim = FS_OPTSPACE;
break;
default:
printf("dev = 0x%x, optim = %d, fs = %s\n",
ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
panic("ffs_realloccg: bad optim");
/* NOTREACHED */
}
bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg);
if (bno <= 0)
goto nospace;
(void) uvm_vnp_uncache(ITOV(ip));
if (!DOINGSOFTDEP(ITOV(ip)))
ffs_blkfree(ip, bprev, (long)osize);
if (nsize < request)
ffs_blkfree(ip, bno + numfrags(fs, nsize),
(long)(request - nsize));
DIP_ADD(ip, blocks, btodb(nsize - osize));
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
bp->b_blkno = fsbtodb(fs, bno);
#ifdef DIAGNOSTIC
if (nsize > bp->b_bufsize)
panic("ffs_realloccg: small buf 2");
#endif
buf_adjcnt(bp, nsize);
bp->b_flags |= B_DONE;
memset(bp->b_data + osize, 0, nsize - osize);
*bpp = bp;
}
if (blknop != NULL) {
*blknop = bno;
}
return (0);
nospace:
if (ratecheck(&fsfull_last, &fserr_interval)) {
ffs_fserr(fs, cred->cr_uid, "file system full");
uprintf("\n%s: write failed, file system is full\n",
fs->fs_fsmnt);
}
error = ENOSPC;
error:
if (bp != NULL) {
brelse(bp);
bp = NULL;
}
/*
* Restore user's disk quota because allocation failed.
*/
if (quota_updated != 0)
(void)ufs_quota_free_blocks(ip, quota_updated, cred);
return error;
}
/*
* Allocate an inode in the file system.
*
* If allocating a directory, use ffs_dirpref to select the inode.
* If allocating in a directory, the following hierarchy is followed:
* 1) allocate the preferred inode.
* 2) allocate an inode in the same cylinder group.
* 3) quadratically rehash into other cylinder groups, until an
* available inode is located.
* If no inode preference is given the following hierarchy is used
* to allocate an inode:
* 1) allocate an inode in cylinder group 0.
* 2) quadratically rehash into other cylinder groups, until an
* available inode is located.
*/
int
ffs_inode_alloc(struct inode *pip, mode_t mode, struct ucred *cred,
struct vnode **vpp)
{
static struct timeval fsnoinodes_last;
struct vnode *pvp = ITOV(pip);
struct fs *fs;
struct inode *ip;
ufsino_t ino, ipref;
u_int cg;
int error;
*vpp = NULL;
fs = pip->i_fs;
if (fs->fs_cstotal.cs_nifree == 0)
goto noinodes;
if ((mode & IFMT) == IFDIR)
ipref = ffs_dirpref(pip);
else
ipref = pip->i_number;
if (ipref >= fs->fs_ncg * fs->fs_ipg)
ipref = 0;
cg = ino_to_cg(fs, ipref);
/*
* Track the number of dirs created one after another
* in the same cg without intervening files.
*/
if ((mode & IFMT) == IFDIR) {
if (fs->fs_contigdirs[cg] < 255)
fs->fs_contigdirs[cg]++;
} else {
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
ino = (ufsino_t)ffs_hashalloc(pip, cg, ipref, mode, ffs_nodealloccg);
if (ino == 0)
goto noinodes;
error = VFS_VGET(pvp->v_mount, ino, vpp);
if (error) {
ffs_inode_free(pip, ino, mode);
return (error);
}
ip = VTOI(*vpp);
if (DIP(ip, mode)) {
printf("mode = 0%o, inum = %u, fs = %s\n",
DIP(ip, mode), ip->i_number, fs->fs_fsmnt);
panic("ffs_valloc: dup alloc");
}
if (DIP(ip, blocks)) {
printf("free inode %s/%d had %lld blocks\n",
fs->fs_fsmnt, ino, (long long)DIP(ip, blocks));
DIP_ASSIGN(ip, blocks, 0);
}
DIP_ASSIGN(ip, flags, 0);
/*
* Set up a new generation number for this inode.
* On wrap, we make sure to assign a number != 0 and != UINT_MAX
* (the original value).
*/
if (DIP(ip, gen) != 0)
	DIP_ADD(ip, gen, 1);
while (DIP(ip, gen) == 0)
	DIP_ASSIGN(ip, gen, arc4random_uniform(UINT_MAX));
return (0);
noinodes:
if (ratecheck(&fsnoinodes_last, &fserr_interval)) {
	ffs_fserr(fs, cred->cr_uid, "out of inodes");
uprintf("\n%s: create/symlink failed, no inodes free\n",
fs->fs_fsmnt);
}
return (ENOSPC);
}
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files' inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
ufsino_t
ffs_dirpref(struct inode *pip)
{
struct fs *fs;
u_int cg, prefcg;
u_int dirsize, cgsize;
u_int avgifree, avgbfree, avgndir, curdirsize;
u_int minifree, minbfree, maxndir;
u_int mincg, minndir;
u_int maxcontigdirs;
fs = pip->i_fs;
avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
/*
* Force allocation in another cg if creating a first level dir.
*/
if (ITOV(pip)->v_flag & VROOT) {
prefcg = arc4random_uniform(fs->fs_ncg);
mincg = prefcg;
minndir = fs->fs_ipg;
for (cg = prefcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
	    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
	    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
cg = mincg;
goto end;
} else
prefcg = ino_to_cg(fs, pip->i_number);
/*
* Compute the various limits used for
* optimal allocation of a directory inode.
*/
maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
minifree = avgifree - (avgifree / 4);
if (minifree < 1)
minifree = 1;
minbfree = avgbfree - (avgbfree / 4);
if (minbfree < 1)
minbfree = 1;
cgsize = fs->fs_fsize * fs->fs_fpg;
dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
if (dirsize < curdirsize)
dirsize = curdirsize;
if (dirsize <= 0)
maxcontigdirs = 0; /* dirsize overflowed */
else
maxcontigdirs = min(avgbfree * fs->fs_bsize / dirsize, 255);
if (fs->fs_avgfpdir > 0)
maxcontigdirs = min(maxcontigdirs,
fs->fs_ipg / fs->fs_avgfpdir);
if (maxcontigdirs == 0)
maxcontigdirs = 1;
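/*
 * maxcontigdirs is thus (roughly) the number of additional
 * directories, each expected to end up holding about fs_avgfpdir
 * files of fs_avgfilesize bytes, that this cg's free blocks and
 * free inodes could still back; fs_contigdirs[cg] is compared
 * against it below.
 */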
/*
* Limit number of dirs in one cg and reserve space for
* regular files, but only if we have no deficit in
* inodes or space.
*
* We are trying to find a suitable cylinder group nearby
* our preferred cylinder group to place a new directory.
* We scan from our preferred cylinder group forward looking
* for a cylinder group that meets our criterion. If we get
* to the final cylinder group and do not find anything,
* we start scanning forwards from the beginning of the
* filesystem. While it might seem sensible to start scanning
* backwards or even to alternate looking forward and backward,
* this approach fails badly when the filesystem is nearly full.
* Specifically, we first search all the areas that have no space
* and finally try the one preceding that. We repeat this on
* every request and in the case of the final block end up
* searching the entire filesystem. By jumping to the front
* of the filesystem, our future forward searches always look
* in new cylinder groups, so we find every possible block after
* one pass over the filesystem.
*/
for (cg = prefcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
goto end;
}
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
goto end;
}
/*
* This is a backstop when we have deficit in space.
*/
for (cg = prefcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
goto end;
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
goto end;
end:
return ((ufsino_t)(fs->fs_ipg * cg));
}
/*
* Select the desired position for the next block in a file. The file is
* logically divided into sections. The first section is composed of the
* direct blocks. Each additional section contains fs_maxbpg blocks.
*
* If no blocks have been allocated in the first section, the policy is to
* request a block in the same cylinder group as the inode that describes
* the file. The first indirect is allocated immediately following the last
* direct block and the data blocks for the first indirect immediately
* follow it.
*
* If no blocks have been allocated in any other section, the indirect
* block(s) are allocated in the same cylinder group as its inode in an
* area reserved immediately following the inode blocks. The policy for
* the data blocks is to place them in a cylinder group with a greater than
* average number of free blocks. An appropriate cylinder group is found
* by using a rotor that sweeps the cylinder groups. When a new group of
* blocks is needed, the sweep begins in the cylinder group following the
* cylinder group from which the previous allocation was made. The sweep
* continues until a cylinder group with greater than the average number
* of free blocks is found. If the allocation is for the first block in an
* indirect block, the information on the previous allocation is unavailable;
* here a best guess is made based upon the logical block number being
* allocated.
*/
int32_t
ffs1_blkpref(struct inode *ip, daddr_t lbn, int indx, int32_t *bap)
{
struct fs *fs;
u_int cg, inocg;
u_int avgbfree, startcg;
uint32_t pref;
KASSERT(indx <= 0 || bap != NULL);
fs = ip->i_fs;
/*
* Allocation of indirect blocks is indicated by passing negative
* values in indx: -1 for single indirect, -2 for double indirect,
* -3 for triple indirect. As noted below, we attempt to allocate
* the first indirect inline with the file data. For all later
* indirect blocks, the data is often allocated in other cylinder
* groups. However to speed random file access and to speed up
* fsck, the filesystem reserves the first fs_metaspace blocks
* (typically half of fs_minfree) of the data area of each cylinder
* group to hold these later indirect blocks.
*/
inocg = ino_to_cg(fs, ip->i_number);
if (indx < 0) {
/*
* Our preference for indirect blocks is the zone at the
* beginning of the inode's cylinder group data area that
* we try to reserve for indirect blocks.
*/
pref = cgmeta(fs, inocg);
/*
* If we are allocating the first indirect block, try to
* place it immediately following the last direct block.
*/
if (indx == -1 && lbn < NDADDR + NINDIR(fs) &&
ip->i_din1->di_db[NDADDR - 1] != 0)
pref = ip->i_din1->di_db[NDADDR - 1] + fs->fs_frag;
return (pref);
}
/*
* If we are allocating the first data block in the first indirect
* block and the indirect has been allocated in the data block area,
* try to place it immediately following the indirect block.
*/
if (lbn == NDADDR) {
pref = ip->i_din1->di_ib[0];
if (pref != 0 && pref >= cgdata(fs, inocg) &&
pref < cgbase(fs, inocg + 1))
return (pref + fs->fs_frag);
}
/*
* If we are at the beginning of a file, or we have already allocated
* the maximum number of blocks per cylinder group, or we do not
* have a block allocated immediately preceding us, then we need
* to decide where to start allocating new blocks.
*/
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
/*
* If we are allocating a directory data block, we want
* to place it in the metadata area.
*/
if ((DIP(ip, mode) & IFMT) == IFDIR)
return (cgmeta(fs, inocg));
/*
* Until we fill all the direct and all the first indirect's
* blocks, we try to allocate in the data area of the inode's
* cylinder group.
*/
if (lbn < NDADDR + NINDIR(fs))
return (cgdata(fs, inocg));
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg = inocg + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs, bap[indx - 1]) + 1;
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
for (cg = 0; cg <= startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
fs->fs_cgrotor = cg;
return (cgdata(fs, cg));
}
return (0);
}
/*
* Otherwise, we just always try to lay things out contiguously.
*/
return (bap[indx - 1] + fs->fs_frag);
}
/*
* Same as above, for UFS2.
*/
#ifdef FFS2
int64_t
ffs2_blkpref(struct inode *ip, daddr_t lbn, int indx, int64_t *bap)
{
struct fs *fs;
u_int cg, inocg;
u_int avgbfree, startcg;
uint64_t pref;
KASSERT(indx <= 0 || bap != NULL);
fs = ip->i_fs;
/*
* Allocation of indirect blocks is indicated by passing negative
* values in indx: -1 for single indirect, -2 for double indirect,
* -3 for triple indirect. As noted below, we attempt to allocate
* the first indirect inline with the file data. For all later
* indirect blocks, the data is often allocated in other cylinder
* groups. However to speed random file access and to speed up
* fsck, the filesystem reserves the first fs_metaspace blocks
* (typically half of fs_minfree) of the data area of each cylinder
* group to hold these later indirect blocks.
*/
inocg = ino_to_cg(fs, ip->i_number);
if (indx < 0) {
/*
* Our preference for indirect blocks is the zone at the
* beginning of the inode's cylinder group data area that
* we try to reserve for indirect blocks.
*/
pref = cgmeta(fs, inocg);
/*
* If we are allocating the first indirect block, try to
* place it immediately following the last direct block.
*/
if (indx == -1 && lbn < NDADDR + NINDIR(fs) &&
ip->i_din2->di_db[NDADDR - 1] != 0)
pref = ip->i_din2->di_db[NDADDR - 1] + fs->fs_frag;
return (pref);
}
/*
* If we are allocating the first data block in the first indirect
* block and the indirect has been allocated in the data block area,
* try to place it immediately following the indirect block.
*/
if (lbn == NDADDR) {
pref = ip->i_din2->di_ib[0];
if (pref != 0 && pref >= cgdata(fs, inocg) &&
pref < cgbase(fs, inocg + 1))
return (pref + fs->fs_frag);
}
/*
* If we are at the beginning of a file, or we have already allocated
* the maximum number of blocks per cylinder group, or we do not
* have a block allocated immediately preceding us, then we need
* to decide where to start allocating new blocks.
*/
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
/*
* If we are allocating a directory data block, we want
* to place it in the metadata area.
*/
if ((DIP(ip, mode) & IFMT) == IFDIR)
	return (cgmeta(fs, inocg));
/*
* Until we fill all the direct and all the first indirect's
* blocks, we try to allocate in the data area of the inode's
* cylinder group.
*/
if (lbn < NDADDR + NINDIR(fs))
return (cgdata(fs, inocg));
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg = inocg + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs, bap[indx - 1] + 1);
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree)
return (cgbase(fs, cg) + fs->fs_frag);
for (cg = 0; cg < startcg; cg++)
	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree)
return (cgbase(fs, cg) + fs->fs_frag);
return (0);
}
/*
* Otherwise, we just always try to lay things out contiguously.
*/
return (bap[indx - 1] + fs->fs_frag);
}
#endif /* FFS2 */
/*
* Implement the cylinder overflow algorithm.
*
* The policy implemented by this algorithm is:
* 1) allocate the block in its requested cylinder group.
* 2) quadratically rehash on the cylinder group number.
* 3) brute force search for a free block.
*/
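/*
 * For example, with fs_ncg = 16 and a preferred cg of 5, the
 * quadratic rehash below probes cgs 6, 8, 12 and 4 (adding 1, 2,
 * 4 and 8, wrapping modulo fs_ncg), and the brute force pass then
 * starts at cg 7 and walks the remaining groups in order.
 */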
daddr_t
ffs_hashalloc(struct inode *ip, u_int cg, daddr_t pref, int size,
daddr_t (*allocator)(struct inode *, u_int, daddr_t, int))
{
struct fs *fs;
daddr_t result;
u_int i, icg = cg;
fs = ip->i_fs;
/*
* 1: preferred cylinder group
*/
result = (*allocator)(ip, cg, pref, size);
if (result)
return (result);
/*
* 2: quadratic rehash
*/
for (i = 1; i < fs->fs_ncg; i *= 2) {
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
result = (*allocator)(ip, cg, 0, size);
if (result)
return (result);
}
/*
* 3: brute force search
* Note that we start at i == 2, since 0 was checked initially,
* and 1 is always checked in the quadratic rehash.
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
result = (*allocator)(ip, cg, 0, size);
if (result)
return (result);
cg++;
if (cg == fs->fs_ncg)
cg = 0;
}
return (0);
}
struct buf *
ffs_cgread(struct fs *fs, struct inode *ip, u_int cg)
{
struct buf *bp;
if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, &bp)) {
brelse(bp);
return (NULL);
}
if (!cg_chkmagic((struct cg *)bp->b_data)) {
	brelse(bp);
return (NULL);
}
return bp;
}
/*
* Determine whether a fragment can be extended.
*
* Check to see if the necessary fragments are available, and
* if they are, allocate them.
*/
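/*
 * For example, growing a 2-fragment piece at bprev to 5 fragments
 * can only succeed if the 3 fragments immediately following it are
 * still free and the resulting run does not cross a block boundary;
 * both conditions are checked below.
 */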
daddr_t
ffs_fragextend(struct inode *ip, u_int cg, daddr_t bprev, int osize, int nsize)
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct timespec now;
daddr_t bno;
int i, frags, bbase;
fs = ip->i_fs;
if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
return (0);
frags = numfrags(fs, nsize);
bbase = fragnum(fs, bprev);
if (bbase > fragnum(fs, (bprev + frags - 1))) {
/* cannot extend across a block boundary */
return (0);
}
if (!(bp = ffs_cgread(fs, ip, cg)))
return (0);
cgp = (struct cg *)bp->b_data;
nanotime(&now);
cgp->cg_ffs2_time = now.tv_sec;
cgp->cg_time = now.tv_sec;
bno = dtogd(fs, bprev);
for (i = numfrags(fs, osize); i < frags; i++)
	if (isclr(cg_blksfree(cgp), bno + i)) {
		brelse(bp);
return (0);
}
/*
* the current fragment can be extended
* deduct the count on fragment being extended into
* increase the count on the remaining fragment (if any)
* allocate the extended piece
*/
for (i = frags; i < fs->fs_frag - bbase; i++)
	if (isclr(cg_blksfree(cgp), bno + i))
break;
cgp->cg_frsum[i - numfrags(fs, osize)]--;
if (i != frags)
	cgp->cg_frsum[i - frags]++;
for (i = numfrags(fs, osize); i < frags; i++) {
	clrbit(cg_blksfree(cgp), bno + i);
cgp->cg_cs.cs_nffree--;
fs->fs_cstotal.cs_nffree--;
fs->fs_cs(fs, cg).cs_nffree--;
}
fs->fs_fmod = 1;
if (DOINGSOFTDEP(ITOV(ip)))
	softdep_setup_blkmapdep(bp, fs, bprev);
bdwrite(bp);
return (bprev);
}
/*
* Determine whether a block can be allocated.
*
* Check to see if a block of the appropriate size is available,
* and if it is, allocate it.
*/
daddr_t
ffs_alloccg(struct inode *ip, u_int cg, daddr_t bpref, int size)
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct timespec now;
daddr_t bno, blkno;
int i, frags, allocsiz;
fs = ip->i_fs;
if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
return (0);
if (!(bp = ffs_cgread(fs, ip, cg)))
return (0);
cgp = (struct cg *)bp->b_data;
if (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize) {
	brelse(bp);
return (0);
}
nanotime(&now);
cgp->cg_ffs2_time = now.tv_sec;
cgp->cg_time = now.tv_sec;
if (size == fs->fs_bsize) {
/* allocate and return a complete data block */
bno = ffs_alloccgblk(ip, bp, bpref);
bdwrite(bp);
return (bno);
}
/*
* check to see if any fragments are already available
* allocsiz is the size which will be allocated, hacking
* it down to a smaller size if necessary
*/
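/*
 * For example, if 2 fragments are needed and the smallest free run
 * recorded in cg_frsum is 5 fragments long, allocsiz becomes 5: the
 * run is carved up below, 2 fragments are handed out and the 3 left
 * over are accounted as a new free run (cg_frsum[3]++).
 */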
frags = numfrags(fs, size);
for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
	if (cgp->cg_frsum[allocsiz] != 0)
break;
if (allocsiz == fs->fs_frag) {
/*
* no fragments were available, so a block will be
* allocated, and hacked up
*/
if (cgp->cg_cs.cs_nbfree == 0) {
brelse(bp);
return (0);
}
bno = ffs_alloccgblk(ip, bp, bpref);
bpref = dtogd(fs, bno);
for (i = frags; i < fs->fs_frag; i++)
	setbit(cg_blksfree(cgp), bpref + i);
i = fs->fs_frag - frags;
cgp->cg_cs.cs_nffree += i;
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
fs->fs_fmod = 1;
cgp->cg_frsum[i]++;
bdwrite(bp);
return (bno);
}
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
if (bno < 0) {
brelse(bp);
return (0);
}
for (i = 0; i < frags; i++)
	clrbit(cg_blksfree(cgp), bno + i);
cgp->cg_cs.cs_nffree -= frags;
fs->fs_cstotal.cs_nffree -= frags;
fs->fs_cs(fs, cg).cs_nffree -= frags;
fs->fs_fmod = 1;
cgp->cg_frsum[allocsiz]--;
if (frags != allocsiz)
	cgp->cg_frsum[allocsiz - frags]++;
blkno = cgbase(fs, cg) + bno;
if (DOINGSOFTDEP(ITOV(ip)))
	softdep_setup_blkmapdep(bp, fs, blkno);
bdwrite(bp);
return (blkno);
}
/*
* Allocate a block in a cylinder group.
* Note that this routine only allocates fs_bsize blocks; these
* blocks may be fragmented by the routine that allocates them.
*/
daddr_t
ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref)
{
struct fs *fs;
struct cg *cgp;
daddr_t bno, blkno;
u_int8_t *blksfree;
int cylno, cgbpref;
fs = ip->i_fs;
cgp = (struct cg *) bp->b_data;
blksfree = cg_blksfree(cgp);
if (bpref == 0) {
	bpref = cgp->cg_rotor;
} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
/* map bpref to correct zone in this cg */
if (bpref < cgdata(fs, cgbpref))
bpref = cgmeta(fs, cgp->cg_cgx);
else
bpref = cgdata(fs, cgp->cg_cgx);
}
/*
* If the requested block is available, use it.
*/
bno = dtogd(fs, blknum(fs, bpref));
if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
goto gotit;
/*
* Take the next available block in this cylinder group.
*/
bno = ffs_mapsearch(fs, cgp, bpref, (int) fs->fs_frag);
if (bno < 0)
return (0);
/* Update cg_rotor only if allocated from the data zone */
if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
	cgp->cg_rotor = bno;
gotit:
blkno = fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, blkno);
ffs_clusteracct(fs, cgp, blkno, -1);
cgp->cg_cs.cs_nbfree--;
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
if (fs->fs_magic != FS_UFS2_MAGIC) {
	cylno = cbtocylno(fs, bno);
	cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--;
	cg_blktot(cgp)[cylno]--;
}
fs->fs_fmod = 1;
blkno = cgbase(fs, cgp->cg_cgx) + bno;
if (DOINGSOFTDEP(ITOV(ip)))
	softdep_setup_blkmapdep(bp, fs, blkno);
return (blkno);
}
/* inode allocation routine */
daddr_t
ffs_nodealloccg(struct inode *ip, u_int cg, daddr_t ipref, int mode)
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct timespec now;
int start, len, loc, map, i;
#ifdef FFS2
struct buf *ibp = NULL;
struct ufs2_dinode *dp2;
#endif
/*
* For efficiency, before looking at the bitmaps for free inodes,
* check the counters kept in the superblock cylinder group summaries,
* and in the cylinder group itself.
*/
fs = ip->i_fs;
if (fs->fs_cs(fs, cg).cs_nifree == 0)
return (0);
if (!(bp = ffs_cgread(fs, ip, cg)))
return (0);
cgp = (struct cg *)bp->b_data;
if (cgp->cg_cs.cs_nifree == 0) {
brelse(bp);
return (0);
}
/*
* We are committed to the allocation from now on, so update the time
* on the cylinder group.
*/
nanotime(&now);
cgp->cg_ffs2_time = now.tv_sec;
cgp->cg_time = now.tv_sec;
/*
* If there was a preferred location for the new inode, try to find it.
*/
if (ipref) {
ipref %= fs->fs_ipg;
if (isclr(cg_inosused(cgp), ipref))
goto gotit; /* inode is free, grab it. */
}
/*
* Otherwise, look for the next available inode, starting at cg_irotor
* (the position in the bitmap of the last used inode).
*/
start = cgp->cg_irotor / NBBY;
len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
loc = skpc(0xff, len, &cg_inosused(cgp)[start]);
if (loc == 0) {
/*
* If we didn't find a free inode in the upper part of the
* bitmap (from cg_irotor to the end), then look at the bottom
* part (from 0 to cg_irotor).
*/
len = start + 1;
start = 0;
loc = skpc(0xff, len, &cg_inosused(cgp)[0]);
if (loc == 0) {
/*
* If we failed again, then either the bitmap or the
* counters kept for the cylinder group are wrong.
*/
printf("cg = %d, irotor = %d, fs = %s\n",
cg, cgp->cg_irotor, fs->fs_fsmnt);
panic("ffs_nodealloccg: map corrupted");
/* NOTREACHED */
}
}
/* skpc() returns the position relative to the end */
i = start + len - loc;
/*
* Okay, so now in 'i' we have the location in the bitmap of a byte
* holding a free inode. Find the corresponding bit and set it,
* updating cg_irotor as well, accordingly.
*/
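/*
 * Example: if skpc() stopped 3 bytes from the end of a 6 byte scan
 * that started at byte 2, then i = 2 + 6 - 3 = 5; if that byte holds
 * 0xef (bit 4 clear), the loop below stops after four shifts with
 * ipref advanced by 4, i.e. the inode at bit 4 of byte 5.
 */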
map = cg_inosused(cgp)[i];
ipref = i * NBBY;
for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
if ((map & i) == 0) {
cgp->cg_irotor = ipref;
goto gotit;
}
}
printf("fs = %s\n", fs->fs_fsmnt);
panic("ffs_nodealloccg: block not in map");
/* NOTREACHED */
gotit:
#ifdef FFS2
/*
* For FFS2, check if all inodes in this cylinder group have been used
* at least once. If they haven't, and we are allocating an inode past
* the last allocated block of inodes, read in a block and initialize
* all inodes in it.
*/
if (fs->fs_magic == FS_UFS2_MAGIC &&
/* Inode is beyond last initialized block of inodes? */
ipref + INOPB(fs) > cgp->cg_initediblk &&
/* Has any inode not been used at least once? */
cgp->cg_initediblk < cgp->cg_ffs2_niblk) {
ibp = getblk(ip->i_devvp, fsbtodb(fs,
ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)),
(int)fs->fs_bsize, 0, INFSLP);
memset(ibp->b_data, 0, fs->fs_bsize);
dp2 = (struct ufs2_dinode *)(ibp->b_data);
/* Give each inode a generation number */
for (i = 0; i < INOPB(fs); i++) {
	while (dp2->di_gen == 0)
dp2->di_gen = arc4random();
dp2++;
}
/* Update the counter of initialized inodes */
cgp->cg_initediblk += INOPB(fs);
}
#endif /* FFS2 */
if (DOINGSOFTDEP(ITOV(ip)))
	softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
setbit(cg_inosused(cgp), ipref);
/* Update the counters we keep on free inodes */
cgp->cg_cs.cs_nifree--;
fs->fs_cstotal.cs_nifree--;
fs->fs_cs(fs, cg).cs_nifree--;
fs->fs_fmod = 1; /* file system was modified */
/* Update the counters we keep on allocated directories */
if ((mode & IFMT) == IFDIR) {
	cgp->cg_cs.cs_ndir++;
fs->fs_cstotal.cs_ndir++;
fs->fs_cs(fs, cg).cs_ndir++;
}
bdwrite(bp);
#ifdef FFS2
if (ibp != NULL)
	bawrite(ibp);
#endif
/* Return the allocated inode number */
return (cg * fs->fs_ipg + ipref);
}
/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*/
void
ffs_blkfree(struct inode *ip, daddr_t bno, long size)
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct timespec now;
daddr_t blkno;
int i, cg, blk, frags, bbase;
fs = ip->i_fs;
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
printf("dev = 0x%x, bsize = %d, size = %ld, fs = %s\n",
ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
panic("ffs_blkfree: bad size");
}
cg = dtog(fs, bno);
if ((u_int)bno >= fs->fs_size) {
printf("bad block %lld, ino %u\n", (long long)bno,
ip->i_number);
ffs_fserr(fs, DIP(ip, uid), "bad block");
return;
}
if (!(bp = ffs_cgread(fs, ip, cg)))
return;
cgp = (struct cg *)bp->b_data;
nanotime(&now);
cgp->cg_ffs2_time = now.tv_sec;
cgp->cg_time = now.tv_sec;
bno = dtogd(fs, bno);
if (size == fs->fs_bsize) {
blkno = fragstoblks(fs, bno);
if (!ffs_isfreeblock(fs, cg_blksfree(cgp), blkno)) {
printf("dev = 0x%x, block = %lld, fs = %s\n",
ip->i_dev, (long long)bno, fs->fs_fsmnt);
panic("ffs_blkfree: freeing free block");
}
ffs_setblock(fs, cg_blksfree(cgp), blkno);
ffs_clusteracct(fs, cgp, blkno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
if (fs->fs_magic != FS_UFS2_MAGIC) {
	i = cbtocylno(fs, bno);
	cg_blks(fs, cgp, i)[cbtorpos(fs, bno)]++;
	cg_blktot(cgp)[i]++;
}
} else {
bbase = bno - fragnum(fs, bno);
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, cg_blksfree(cgp), bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
/*
* deallocate the fragment
*/
frags = numfrags(fs, size);
for (i = 0; i < frags; i++) {
	if (isset(cg_blksfree(cgp), bno + i)) {
printf("dev = 0x%x, block = %lld, fs = %s\n",
ip->i_dev, (long long)(bno + i),
fs->fs_fsmnt);
panic("ffs_blkfree: freeing free frag");
}
setbit(cg_blksfree(cgp), bno + i);
}
cgp->cg_cs.cs_nffree += i;
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, cg_blksfree(cgp), bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
/*
* if a complete block has been reassembled, account for it
*/
blkno = fragstoblks(fs, bbase);
if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) {
cgp->cg_cs.cs_nffree -= fs->fs_frag;
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
ffs_clusteracct(fs, cgp, blkno, 1);
cgp->cg_cs.cs_nbfree++;
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
if (fs->fs_magic != FS_UFS2_MAGIC) {
i = cbtocylno(fs, bbase);
cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++;
cg_blktot(cgp)[i]++;
}
}
}
fs->fs_fmod = 1;
bdwrite(bp);
}
int
ffs_inode_free(struct inode *pip, ufsino_t ino, mode_t mode)
{
struct vnode *pvp = ITOV(pip);
if (DOINGSOFTDEP(pvp)) {
softdep_freefile(pvp, ino, mode);
return (0);
}
return (ffs_freefile(pip, ino, mode));
}
/*
* Do the actual free operation.
* The specified inode is placed back in the free map.
*/
int
ffs_freefile(struct inode *pip, ufsino_t ino, mode_t mode)
{
struct fs *fs;
struct cg *cgp;
struct buf *bp;
struct timespec now;
u_int cg;
fs = pip->i_fs;
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("ffs_freefile: range: dev = 0x%x, ino = %d, fs = %s",
pip->i_dev, ino, fs->fs_fsmnt);
cg = ino_to_cg(fs, ino);
if (!(bp = ffs_cgread(fs, pip, cg)))
return (0);
cgp = (struct cg *)bp->b_data;
nanotime(&now);
cgp->cg_ffs2_time = now.tv_sec;
cgp->cg_time = now.tv_sec;
ino %= fs->fs_ipg;
if (isclr(cg_inosused(cgp), ino)) {
printf("dev = 0x%x, ino = %u, fs = %s\n",
pip->i_dev, ino, fs->fs_fsmnt);
if (fs->fs_ronly == 0)
panic("ffs_freefile: freeing free inode");
}
clrbit(cg_inosused(cgp), ino);
if (ino < cgp->cg_irotor)
	cgp->cg_irotor = ino;
cgp->cg_cs.cs_nifree++;
fs->fs_cstotal.cs_nifree++;
fs->fs_cs(fs, cg).cs_nifree++;
if ((mode & IFMT) == IFDIR) {
	cgp->cg_cs.cs_ndir--;
fs->fs_cstotal.cs_ndir--;
fs->fs_cs(fs, cg).cs_ndir--;
}
fs->fs_fmod = 1;
bdwrite(bp);
return (0);
}
/*
* Find a block of the specified size in the specified cylinder group.
*
* It is a panic if a request is made to find a block if none are
* available.
*/
daddr_t
ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
{
daddr_t bno;
int start, len, loc, i;
int blk, field, subfield, pos;
/*
* find the fragment by searching through the free block
* map for an appropriate bit pattern
*/
if (bpref)
start = dtogd(fs, bpref) / NBBY;
else
start = cgp->cg_frotor / NBBY;
len = howmany(fs->fs_fpg, NBBY) - start;
loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[start],
(u_char *)fragtbl[fs->fs_frag],
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
if (loc == 0) {
len = start + 1;
start = 0;
loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[0],
(u_char *)fragtbl[fs->fs_frag],
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
if (loc == 0) {
printf("start = %d, len = %d, fs = %s\n",
start, len, fs->fs_fsmnt);
panic("ffs_alloccg: map corrupted");
/* NOTREACHED */
}
}
bno = (start + len - loc) * NBBY;
cgp->cg_frotor = bno;
/*
* found the byte in the map
* sift through the bits to find the selected frag
*/
for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
	blk = blkmap(fs, cg_blksfree(cgp), bno);
blk <<= 1;
field = around[allocsiz];
subfield = inside[allocsiz];
for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
if ((blk & field) == subfield)
return (bno + pos);
field <<= 1;
subfield <<= 1;
}
}
printf("bno = %lld, fs = %s\n", (long long)bno, fs->fs_fsmnt);
panic("ffs_alloccg: block not in map");
return (-1);
}
/*
* Update the cluster map because of an allocation or free.
*
* Cnt == 1 means free; cnt == -1 means allocating.
*/
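/*
 * For example, freeing a block with a free run of 2 blocks in front
 * of it and 1 block behind it joins them into one 4-block cluster:
 * sump[4]++ (capped at fs_contigsumsize), sump[2]--, sump[1]--, and
 * fs_maxcluster[] for the cg is refreshed from the summary.
 */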
void
ffs_clusteracct(struct fs *fs, struct cg *cgp, daddr_t blkno, int cnt)
{
int32_t *sump;
int32_t *lp;
u_char *freemapp, *mapp;
int i, start, end, forw, back, map, bit;
if (fs->fs_contigsumsize <= 0)
return;
freemapp = cg_clustersfree(cgp);
sump = cg_clustersum(cgp);
/*
* Allocate or clear the actual block.
*/
if (cnt > 0)
setbit(freemapp, blkno);
else
clrbit(freemapp, blkno);
/*
* Find the size of the cluster going forward.
*/
start = blkno + 1;
end = start + fs->fs_contigsumsize;
if (end >= cgp->cg_nclusterblks)
end = cgp->cg_nclusterblks;
mapp = &freemapp[start / NBBY];
map = *mapp++;
bit = 1 << (start % NBBY);
for (i = start; i < end; i++) {
if ((map & bit) == 0)
break;
if ((i & (NBBY - 1)) != (NBBY - 1)) {
bit <<= 1;
} else {
map = *mapp++;
bit = 1;
}
}
forw = i - start;
/*
* Find the size of the cluster going backward.
*/
start = blkno - 1;
end = start - fs->fs_contigsumsize;
if (end < 0)
end = -1;
mapp = &freemapp[start / NBBY];
map = *mapp--;
bit = 1 << (start % NBBY);
for (i = start; i > end; i--) {
if ((map & bit) == 0)
break;
if ((i & (NBBY - 1)) != 0) {
bit >>= 1;
} else {
map = *mapp--;
bit = 1 << (NBBY - 1);
}
}
back = start - i;
/*
* Account for old cluster and the possibly new forward and
* back clusters.
*/
i = back + forw + 1;
if (i > fs->fs_contigsumsize)
i = fs->fs_contigsumsize;
sump[i] += cnt;
if (back > 0)
	sump[back] -= cnt;
if (forw > 0)
	sump[forw] -= cnt;
/*
* Update cluster summary information.
*/
lp = &sump[fs->fs_contigsumsize];
for (i = fs->fs_contigsumsize; i > 0; i--)
	if (*lp-- > 0)
break;
fs->fs_maxcluster[cgp->cg_cgx] = i;
}
/* $OpenBSD: kern_proc.c,v 1.92 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_proc.c,v 1.14 1996/02/09 18:59:41 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_proc.c 8.4 (Berkeley) 1/4/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/wait.h>
#include <sys/rwlock.h>
#include <sys/malloc.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/vnode.h>
struct rwlock uidinfolk;
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
u_long uihash; /* size of hash table - 1 */
/*
* Other process lists
*/
struct tidhashhead *tidhashtbl;
u_long tidhash;
struct pidhashhead *pidhashtbl;
u_long pidhash;
struct pgrphashhead *pgrphashtbl;
u_long pgrphash;
struct processlist allprocess;
struct processlist zombprocess;
struct proclist allproc;
struct pool proc_pool;
struct pool process_pool;
struct pool rusage_pool;
struct pool ucred_pool;
struct pool pgrp_pool;
struct pool session_pool;
void pgdelete(struct pgrp *);
void fixjobc(struct process *, struct pgrp *, int);
static void orphanpg(struct pgrp *);
#ifdef DEBUG
void pgrpdump(void);
#endif
/*
* Initialize global process hashing structures.
*/
void
procinit(void)
{
LIST_INIT(&allprocess);
LIST_INIT(&zombprocess);
LIST_INIT(&allproc);
rw_init(&uidinfolk, "uidinfo");
tidhashtbl = hashinit(maxthread / 4, M_PROC, M_NOWAIT, &tidhash);
pidhashtbl = hashinit(maxprocess / 4, M_PROC, M_NOWAIT, &pidhash);
pgrphashtbl = hashinit(maxprocess / 4, M_PROC, M_NOWAIT, &pgrphash);
uihashtbl = hashinit(maxprocess / 16, M_PROC, M_NOWAIT, &uihash);
if (!tidhashtbl || !pidhashtbl || !pgrphashtbl || !uihashtbl)
panic("procinit: malloc");
pool_init(&proc_pool, sizeof(struct proc), 0, IPL_NONE,
PR_WAITOK, "procpl", NULL);
pool_init(&process_pool, sizeof(struct process), 0, IPL_NONE,
PR_WAITOK, "processpl", NULL);
pool_init(&rusage_pool, sizeof(struct rusage), 0, IPL_NONE,
PR_WAITOK, "zombiepl", NULL);
pool_init(&ucred_pool, sizeof(struct ucred), 0, IPL_MPFLOOR,
0, "ucredpl", NULL);
pool_init(&pgrp_pool, sizeof(struct pgrp), 0, IPL_NONE,
PR_WAITOK, "pgrppl", NULL);
pool_init(&session_pool, sizeof(struct session), 0, IPL_NONE,
PR_WAITOK, "sessionpl", NULL);
}
/*
* This returns with `uidinfolk' held: caller must call uid_release()
* after making whatever change they needed.
*/
struct uidinfo *
uid_find(uid_t uid)
{
struct uidinfo *uip, *nuip;
struct uihashhead *uipp;
uipp = UIHASH(uid);
rw_enter_write(&uidinfolk);
LIST_FOREACH(uip, uipp, ui_hash)
	if (uip->ui_uid == uid)
break;
if (uip)
return (uip);
rw_exit_write(&uidinfolk);
nuip = malloc(sizeof(*nuip), M_PROC, M_WAITOK|M_ZERO);
rw_enter_write(&uidinfolk);
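/*
 * We had to drop the lock to do a sleeping allocation, so another
 * thread may have inserted this uid in the meantime; look again
 * and throw our allocation away if it did.
 */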
LIST_FOREACH(uip, uipp, ui_hash)
if (uip->ui_uid == uid)
break;
if (uip) {
free(nuip, M_PROC, sizeof(*nuip));
return (uip);
}
nuip->ui_uid = uid;
LIST_INSERT_HEAD(uipp, nuip, ui_hash);
return (nuip);
}
void
uid_release(struct uidinfo *uip)
{
rw_exit_write(&uidinfolk);
}
/*
* Change the count associated with number of threads
* a given user is using.
*/
int
chgproccnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long count;
uip = uid_find(uid);
count = (uip->ui_proccnt += diff);
uid_release(uip);
if (count < 0)
panic("chgproccnt: procs < 0");
return count;
}
/*
* Is pr an inferior of parent?
*/
int
inferior(struct process *pr, struct process *parent)
{
	for (; pr != parent; pr = pr->ps_pptr)
		if (pr->ps_pid == 0 || pr->ps_pid == 1)
return (0);
return (1);
}
/*
* Locate a proc (thread) by number
*/
struct proc *
tfind(pid_t tid)
{
struct proc *p;
LIST_FOREACH(p, TIDHASH(tid), p_hash)
	if (p->p_tid == tid)
return (p);
return (NULL);
}
/*
* Locate a process by number
*/
struct process *
prfind(pid_t pid)
{
struct process *pr;
LIST_FOREACH(pr, PIDHASH(pid), ps_hash)
	if (pr->ps_pid == pid)
return (pr);
return (NULL);
}
/*
* Locate a process group by number
*/
struct pgrp *
pgfind(pid_t pgid)
{
struct pgrp *pgrp;
LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash)
	if (pgrp->pg_id == pgid)
return (pgrp);
return (NULL);
}
/*
* Locate a zombie process
*/
struct process *
zombiefind(pid_t pid)
{
struct process *pr;
LIST_FOREACH(pr, &zombprocess, ps_list)
if (pr->ps_pid == pid)
return (pr);
return (NULL);
}
/*
* Move process to a new process group. If a session is provided
* then it's a new session to contain this process group; otherwise
* the process is staying within its existing session.
*/
void
enternewpgrp(struct process *pr, struct pgrp *pgrp, struct session *newsess)
{
#ifdef DIAGNOSTIC
if (SESS_LEADER(pr))
panic("%s: session leader attempted setpgrp", __func__);
#endif
if (newsess != NULL) {
/*
* New session. Initialize it completely
*/
timeout_set(&newsess->s_verauthto, zapverauth, newsess);
newsess->s_leader = pr;
newsess->s_count = 1;
newsess->s_ttyvp = NULL;
newsess->s_ttyp = NULL;
memcpy(newsess->s_login, pr->ps_session->s_login,
sizeof(newsess->s_login));
atomic_clearbits_int(&pr->ps_flags, PS_CONTROLT);
pgrp->pg_session = newsess;
#ifdef DIAGNOSTIC
if (pr != curproc->p_p)
	panic("%s: mksession but not curproc", __func__);
#endif
} else {
pgrp->pg_session = pr->ps_session;
pgrp->pg_session->s_count++;
}
pgrp->pg_id = pr->ps_pid;
LIST_INIT(&pgrp->pg_members);
LIST_INIT(&pgrp->pg_sigiolst);
LIST_INSERT_HEAD(PGRPHASH(pr->ps_pid), pgrp, pg_hash);
pgrp->pg_jobc = 0;
enterthispgrp(pr, pgrp);
}
/*
* move process to an existing process group
*/
void
enterthispgrp(struct process *pr, struct pgrp *pgrp)
{
struct pgrp *savepgrp = pr->ps_pgrp;
/*
* Adjust eligibility of affected pgrps to participate in job control.
* Increment eligibility counts before decrementing, otherwise we
* could reach 0 spuriously during the first call.
*/
fixjobc(pr, pgrp, 1);
fixjobc(pr, savepgrp, 0);
LIST_REMOVE(pr, ps_pglist);
pr->ps_pgrp = pgrp;
LIST_INSERT_HEAD(&pgrp->pg_members, pr, ps_pglist);
if (LIST_EMPTY(&savepgrp->pg_members))
	pgdelete(savepgrp);
}
/*
* remove process from process group
*/
void
leavepgrp(struct process *pr)
{
if (pr->ps_session->s_verauthppid == pr->ps_pid)
zapverauth(pr->ps_session);
LIST_REMOVE(pr, ps_pglist);
if (LIST_EMPTY(&pr->ps_pgrp->pg_members))
pgdelete(pr->ps_pgrp);
pr->ps_pgrp = NULL;
}
/*
* delete a process group
*/
void
pgdelete(struct pgrp *pgrp)
{
sigio_freelist(&pgrp->pg_sigiolst);
if (pgrp->pg_session->s_ttyp != NULL &&
pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
	pgrp->pg_session->s_ttyp->t_pgrp = NULL;
LIST_REMOVE(pgrp, pg_hash);
SESSRELE(pgrp->pg_session);
pool_put(&pgrp_pool, pgrp);
}
void
zapverauth(void *v)
{
struct session *sess = v;
sess->s_verauthuid = 0;
sess->s_verauthppid = 0;
}
/*
* Adjust pgrp jobc counters when specified process changes process group.
* We count the number of processes in each process group that "qualify"
* the group for terminal job control (those with a parent in a different
* process group of the same session). If that count reaches zero, the
* process group becomes orphaned. Check both the specified process'
* process group and that of its children.
* entering == 0 => pr is leaving specified group.
* entering == 1 => pr is entering specified group.
* XXX need proctree lock
*/
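/*
 * Example: a shell in pgrp A starts a pipeline whose processes are
 * placed in a new pgrp B of the same session. Each pipeline member
 * has its parent (the shell) in a different pgrp of that session,
 * so each one bumps B's pg_jobc. When the last such member leaves
 * B (exits or changes group), pg_jobc drops to 0, B becomes
 * orphaned, and orphanpg() hangs up the group if any member is
 * stopped.
 */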
void
fixjobc(struct process *pr, struct pgrp *pgrp, int entering)
{
struct pgrp *hispgrp;
struct session *mysession = pgrp->pg_session;
/*
* Check pr's parent to see whether pr qualifies its own process
* group; if so, adjust count for pr's process group.
*/
if ((hispgrp = pr->ps_pptr->ps_pgrp) != pgrp &&
hispgrp->pg_session == mysession) {
if (entering)
	pgrp->pg_jobc++;
else if (--pgrp->pg_jobc == 0)
orphanpg(pgrp);
}
/*
* Check this process' children to see whether they qualify
* their process groups; if so, adjust counts for children's
* process groups.
*/
LIST_FOREACH(pr, &pr->ps_children, ps_sibling)
	if ((hispgrp = pr->ps_pgrp) != pgrp &&
	    hispgrp->pg_session == mysession &&
(pr->ps_flags & PS_ZOMBIE) == 0) {
if (entering)
	hispgrp->pg_jobc++;
else if (--hispgrp->pg_jobc == 0)
orphanpg(hispgrp);
}
}
void
killjobc(struct process *pr)
{
if (SESS_LEADER(pr)) {
struct session *sp = pr->ps_session;
if (sp->s_ttyvp) {
struct vnode *ovp;
/*
* Controlling process.
* Signal foreground pgrp,
* drain controlling terminal
* and revoke access to controlling terminal.
*/
if (sp->s_ttyp->t_session == sp) {
if (sp->s_ttyp->t_pgrp)
pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
ttywait(sp->s_ttyp);
/*
* The tty could have been revoked
* if we blocked.
*/
if (sp->s_ttyvp)
VOP_REVOKE(sp->s_ttyvp, REVOKEALL);
}
ovp = sp->s_ttyvp;
sp->s_ttyvp = NULL;
if (ovp)
vrele(ovp);
/*
* s_ttyp is not zero'd; we use this to
* indicate that the session once had a
* controlling terminal. (for logging and
* informational purposes)
*/
}
sp->s_leader = NULL;
}
fixjobc(pr, pr->ps_pgrp, 0);
}
/*
* A process group has become orphaned;
* if there are any stopped processes in the group,
* hang up all processes in that group.
*/
static void
orphanpg(struct pgrp *pg)
{
struct process *pr;
LIST_FOREACH(pr, &pg->pg_members, ps_pglist) {
if (pr->ps_mainproc->p_stat == SSTOP) {
LIST_FOREACH(pr, &pg->pg_members, ps_pglist) {
prsignal(pr, SIGHUP);
prsignal(pr, SIGCONT);
}
return;
}
}
}
#ifdef DDB
void
proc_printit(struct proc *p, const char *modif,
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
static const char *const pstat[] = {
"idle", "run", "sleep", "stop", "zombie", "dead", "onproc"
};
char pstbuf[5];
const char *pst = pstbuf;
if (p->p_stat < 1 || p->p_stat > sizeof(pstat) / sizeof(pstat[0]))
snprintf(pstbuf, sizeof(pstbuf), "%d", p->p_stat);
else
pst = pstat[(int)p->p_stat - 1];
(*pr)("PROC (%s) pid=%d stat=%s\n", p->p_p->ps_comm, p->p_tid, pst);
(*pr)(" flags process=%b proc=%b\n",
p->p_p->ps_flags, PS_BITS, p->p_flag, P_BITS);
(*pr)(" pri=%u, usrpri=%u, nice=%d\n",
p->p_runpri, p->p_usrpri, p->p_p->ps_nice);
(*pr)(" forw=%p, list=%p,%p\n",
TAILQ_NEXT(p, p_runq), p->p_list.le_next, p->p_list.le_prev);
(*pr)(" process=%p user=%p, vmspace=%p\n",
p->p_p, p->p_addr, p->p_vmspace);
(*pr)(" estcpu=%u, cpticks=%d, pctcpu=%u.%u\n",
p->p_estcpu, p->p_cpticks, p->p_pctcpu / 100, p->p_pctcpu % 100);
(*pr)(" user=%u, sys=%u, intr=%u\n",
p->p_uticks, p->p_sticks, p->p_iticks);
}
#include <machine/db_machdep.h>
#include <ddb/db_output.h>
void
db_kill_cmd(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
{
struct process *pr;
struct proc *p;
pr = prfind(addr);
if (pr == NULL) {
db_printf("%ld: No such process", addr);
return;
}
p = TAILQ_FIRST(&pr->ps_threads);
/* Send uncatchable SIGABRT for coredump */
sigabort(p);
}
void
db_show_all_procs(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
char *mode;
int skipzomb = 0;
int has_kernel_lock = 0;
struct proc *p;
struct process *pr, *ppr;
if (modif[0] == 0)
modif[0] = 'n'; /* default == normal mode */
mode = "mawno";
while (*mode && *mode != modif[0])
mode++;
if (*mode == 0 || *mode == 'm') {
db_printf("usage: show all procs [/a] [/n] [/w]\n");
db_printf("\t/a == show process address info\n");
db_printf("\t/n == show normal process info [default]\n");
db_printf("\t/w == show process pgrp/wait info\n");
db_printf("\t/o == show normal info for non-idle SONPROC\n");
return;
}
pr = LIST_FIRST(&allprocess);
switch (*mode) {
case 'a':
db_printf(" TID %-9s %18s %18s %18s\n",
"COMMAND", "STRUCT PROC *", "UAREA *", "VMSPACE/VM_MAP");
break;
case 'n':
db_printf(" PID %6s %5s %5s S %10s %-12s %-15s\n",
"TID", "PPID", "UID", "FLAGS", "WAIT", "COMMAND");
break;
case 'w':
db_printf(" TID %-15s %-5s %18s %s\n",
"COMMAND", "PGRP", "WAIT-CHANNEL", "WAIT-MSG");
break;
case 'o':
skipzomb = 1;
db_printf(" TID %5s %5s %10s %10s %3s %-30s\n",
"PID", "UID", "PRFLAGS", "PFLAGS", "CPU", "COMMAND");
break;
}
while (pr != NULL) {
ppr = pr->ps_pptr;
TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
#ifdef MULTIPROCESSOR
if (__mp_lock_held(&kernel_lock, p->p_cpu))
has_kernel_lock = 1;
else
has_kernel_lock = 0;
#endif
if (p->p_stat) {
if (*mode == 'o') {
if (p->p_stat != SONPROC)
continue;
if (p->p_cpu != NULL && p->p_cpu->
ci_schedstate.spc_idleproc == p)
continue;
}
if (*mode == 'n') {
db_printf("%c%5d ", (p == curproc ?
'*' : ' '), pr->ps_pid);
} else {
db_printf("%c%6d ", (p == curproc ?
'*' : ' '), p->p_tid);
}
switch (*mode) {
case 'a':
db_printf("%-9.9s %18p %18p %18p\n",
pr->ps_comm, p, p->p_addr, p->p_vmspace);
break;
case 'n':
db_printf("%6d %5d %5d %d %#10x "
"%-12.12s %-15s\n",
p->p_tid, ppr ? ppr->ps_pid : -1,
pr->ps_ucred->cr_ruid, p->p_stat,
p->p_flag | pr->ps_flags,
(p->p_wchan && p->p_wmesg) ?
p->p_wmesg : "", pr->ps_comm);
break;
case 'w':
db_printf("%-15s %-5d %18p %s\n",
pr->ps_comm, (pr->ps_pgrp ?
pr->ps_pgrp->pg_id : -1),
p->p_wchan,
(p->p_wchan && p->p_wmesg) ?
p->p_wmesg : "");
break;
case 'o':
db_printf("%5d %5d %#10x %#10x %3d"
"%c %-31s\n",
pr->ps_pid, pr->ps_ucred->cr_ruid,
pr->ps_flags, p->p_flag,
CPU_INFO_UNIT(p->p_cpu),
has_kernel_lock ? 'K' : ' ',
pr->ps_comm);
break;
}
}
}
pr = LIST_NEXT(pr, ps_list);
if (pr == NULL && skipzomb == 0) {
skipzomb = 1;
pr = LIST_FIRST(&zombprocess);
}
}
}
#endif
#ifdef DEBUG
void
pgrpdump(void)
{
struct pgrp *pgrp;
struct process *pr;
int i;
for (i = 0; i <= pgrphash; i++) {
if (!LIST_EMPTY(&pgrphashtbl[i])) {
printf("\tindx %d\n", i);
LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
printf("\tpgrp %p, pgid %d, sess %p, sesscnt %d, mem %p\n",
pgrp, pgrp->pg_id, pgrp->pg_session,
pgrp->pg_session->s_count,
LIST_FIRST(&pgrp->pg_members));
LIST_FOREACH(pr, &pgrp->pg_members, ps_pglist) {
printf("\t\tpid %d addr %p pgrp %p\n",
pr->ps_pid, pr, pr->ps_pgrp);
}
}
}
}
}
#endif /* DEBUG */
/* $OpenBSD: bpf.c,v 1.219 2022/07/09 12:48:21 visa Exp $ */
/* $NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $ */
/*
* Copyright (c) 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (c) 2010, 2014 Henning Brauer <henning@openbsd.org>
*
* This code is derived from the Stanford/CMU enet packet filter,
* (net/enet.c) distributed as part of 4.3BSD, and code contributed
* to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
* Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)bpf.c 8.2 (Berkeley) 3/28/94
*/
#include "bpfilter.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/ioctl.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/rwlock.h>
#include <sys/atomic.h>
#include <sys/event.h>
#include <sys/mutex.h>
#include <sys/refcnt.h>
#include <sys/smr.h>
#include <sys/specdev.h>
#include <sys/sigio.h>
#include <sys/task.h>
#include <sys/time.h>
#include <net/if.h>
#include <net/bpf.h>
#include <net/bpfdesc.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include "vlan.h"
#if NVLAN > 0
#include <net/if_vlan_var.h>
#endif
#define BPF_BUFSIZE 32768
#define PRINET 26 /* interruptible */
/*
* The default read buffer size is patchable.
*/
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = BPF_MAXBUFSIZE;
/*
* bpf_iflist is the list of interfaces; each corresponds to an ifnet
* bpf_d_list is the list of descriptors
*/
struct bpf_if *bpf_iflist;
LIST_HEAD(, bpf_d) bpf_d_list;
int bpf_allocbufs(struct bpf_d *);
void bpf_ifname(struct bpf_if*, struct ifreq *);
void bpf_mcopy(const void *, void *, size_t);
int bpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
struct sockaddr *);
int bpf_setif(struct bpf_d *, struct ifreq *);
int bpfkqfilter(dev_t, struct knote *);
void bpf_wakeup(struct bpf_d *);
void bpf_wakeup_cb(void *);
int _bpf_mtap(caddr_t, const struct mbuf *, const struct mbuf *, u_int);
void bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
const struct bpf_hdr *);
int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
int bpf_setdlt(struct bpf_d *, u_int);
void filt_bpfrdetach(struct knote *);
int filt_bpfread(struct knote *, long);
int filt_bpfreadmodify(struct kevent *, struct knote *);
int filt_bpfreadprocess(struct knote *, struct kevent *);
int bpf_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t);
struct bpf_d *bpfilter_lookup(int);
/*
* Called holding ``bd_mtx''.
*/
void bpf_attachd(struct bpf_d *, struct bpf_if *);
void bpf_detachd(struct bpf_d *);
void bpf_resetd(struct bpf_d *);
void bpf_prog_smr(void *);
void bpf_d_smr(void *);
/*
* Reference count access to descriptor buffers
*/
void bpf_get(struct bpf_d *);
void bpf_put(struct bpf_d *);
struct rwlock bpf_sysctl_lk = RWLOCK_INITIALIZER("bpfsz");
int
bpf_movein(struct uio *uio, struct bpf_d *d, struct mbuf **mp,
struct sockaddr *sockp)
{
struct bpf_program_smr *bps;
struct bpf_insn *fcode = NULL;
struct mbuf *m;
struct m_tag *mtag;
int error;
u_int hlen, alen, mlen;
u_int len;
u_int linktype;
u_int slen;
/*
* Build a sockaddr based on the data link layer type.
* We do this at this level because the ethernet header
* is copied directly into the data field of the sockaddr.
* In the case of SLIP, there is no header and the packet
* is forwarded as is.
* Also, we are careful to leave room at the front of the mbuf
* for the link level header.
*/
linktype = d->bd_bif->bif_dlt;
switch (linktype) {
case DLT_SLIP:
sockp->sa_family = AF_INET;
hlen = 0;
break;
case DLT_PPP:
sockp->sa_family = AF_UNSPEC;
hlen = 0;
break;
case DLT_EN10MB:
sockp->sa_family = AF_UNSPEC;
/* XXX Would MAXLINKHDR be better? */
hlen = ETHER_HDR_LEN;
break;
case DLT_IEEE802_11:
case DLT_IEEE802_11_RADIO:
sockp->sa_family = AF_UNSPEC;
hlen = 0;
break;
case DLT_RAW:
case DLT_NULL:
sockp->sa_family = AF_UNSPEC;
hlen = 0;
break;
case DLT_LOOP:
sockp->sa_family = AF_UNSPEC;
hlen = sizeof(u_int32_t);
break;
default:
return (EIO);
}
if (uio->uio_resid > MAXMCLBYTES)
return (EMSGSIZE);
len = uio->uio_resid;
if (len < hlen)
return (EINVAL);
/*
* Get the length of the payload so we can align it properly.
*/
alen = len - hlen;
/*
* Allocate enough space for headers and the aligned payload.
*/
mlen = max(max_linkhdr, hlen) + roundup(alen, sizeof(long));
if (mlen > MAXMCLBYTES)
return (EMSGSIZE);
MGETHDR(m, M_WAIT, MT_DATA);
if (mlen > MHLEN) {
MCLGETL(m, M_WAIT, mlen);
if ((m->m_flags & M_EXT) == 0) {
error = ENOBUFS;
goto bad;
}
}
m_align(m, alen); /* Align the payload. */
m->m_data -= hlen;
m->m_pkthdr.ph_ifidx = 0;
m->m_pkthdr.len = len;
m->m_len = len;
error = uiomove(mtod(m, caddr_t), len, uio);
if (error)
goto bad;
smr_read_enter();
bps = SMR_PTR_GET(&d->bd_wfilter);
if (bps != NULL)
fcode = bps->bps_bf.bf_insns;
slen = bpf_filter(fcode, mtod(m, u_char *), len, len);
smr_read_leave();
if (slen < len) {
error = EPERM;
goto bad;
}
/*
* Make room for link header, and copy it to sockaddr
*/
if (hlen != 0) {
if (linktype == DLT_LOOP) {
u_int32_t af;
/* the link header indicates the address family */
KASSERT(hlen == sizeof(u_int32_t));
memcpy(&af, m->m_data, hlen);
sockp->sa_family = ntohl(af);
} else
memcpy(sockp->sa_data, m->m_data, hlen);
m->m_pkthdr.len -= hlen;
m->m_len -= hlen;
m->m_data += hlen;
}
/*
* Prepend the data link type as a mbuf tag
*/
mtag = m_tag_get(PACKET_TAG_DLT, sizeof(u_int), M_WAIT);
*(u_int *)(mtag + 1) = linktype;
m_tag_prepend(m, mtag);
*mp = m;
return (0);
bad:
m_freem(m);
return (error);
}
/*
* Attach file to the bpf interface, i.e. make d listen on bp.
*/
void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
MUTEX_ASSERT_LOCKED(&d->bd_mtx);
/*
* Point d at bp, and add d to the interface's list of listeners.
* Finally, point the driver's bpf cookie at the interface so
* it will divert packets to bpf.
*/
d->bd_bif = bp;
KERNEL_ASSERT_LOCKED();
SMR_SLIST_INSERT_HEAD_LOCKED(&bp->bif_dlist, d, bd_next);
*bp->bif_driverp = bp;
}
/*
* Detach a file from its interface.
*/
void
bpf_detachd(struct bpf_d *d)
{
struct bpf_if *bp;
MUTEX_ASSERT_LOCKED(&d->bd_mtx);
bp = d->bd_bif;
/* Not attached. */
if (bp == NULL)
return;
/* Remove ``d'' from the interface's descriptor list. */
KERNEL_ASSERT_LOCKED();
SMR_SLIST_REMOVE_LOCKED(&bp->bif_dlist, d, bpf_d, bd_next);
if (SMR_SLIST_EMPTY_LOCKED(&bp->bif_dlist)) {
/*
* Let the driver know that there are no more listeners.
*/
*bp->bif_driverp = NULL;
}
d->bd_bif = NULL;
/*
* Check if this descriptor had requested promiscuous mode.
* If so, turn it off.
*/
if (d->bd_promisc) {
int error;
KASSERT(bp->bif_ifp != NULL);
d->bd_promisc = 0;
bpf_get(d);
mtx_leave(&d->bd_mtx);
NET_LOCK();
error = ifpromisc(bp->bif_ifp, 0);
NET_UNLOCK();
mtx_enter(&d->bd_mtx);
bpf_put(d);
if (error && !(error == EINVAL || error == ENODEV ||
error == ENXIO))
/*
* Something is really wrong if we were able to put
* the driver into promiscuous mode, but can't
* take it out.
*/
panic("bpf: ifpromisc failed");
}
}
void
bpfilterattach(int n)
{
LIST_INIT(&bpf_d_list);
}
/*
* Open ethernet device. Returns ENXIO for illegal minor device number,
* EBUSY if file is open by another process.
*/
int
bpfopen(dev_t dev, int flag, int mode, struct proc *p)
{
struct bpf_d *bd;
int unit = minor(dev);
if (unit & ((1 << CLONE_SHIFT) - 1))
return (ENXIO);
KASSERT(bpfilter_lookup(unit) == NULL);
/* create on demand */
if ((bd = malloc(sizeof(*bd), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
return (EBUSY);
/* Mark "free" and do most initialization. */
bd->bd_unit = unit;
bd->bd_bufsize = bpf_bufsize;
bd->bd_sig = SIGIO;
mtx_init(&bd->bd_mtx, IPL_NET);
task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
smr_init(&bd->bd_smr);
sigio_init(&bd->bd_sigio);
klist_init_mutex(&bd->bd_klist, &bd->bd_mtx);
bd->bd_rtout = 0; /* no timeout by default */
refcnt_init(&bd->bd_refcnt);
LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);
return (0);
}
/*
* Close the descriptor by detaching it from its interface,
* deallocating its buffers, and marking it free.
*/
int
bpfclose(dev_t dev, int flag, int mode, struct proc *p)
{
struct bpf_d *d;
d = bpfilter_lookup(minor(dev));
mtx_enter(&d->bd_mtx);
bpf_detachd(d);
bpf_wakeup(d);
LIST_REMOVE(d, bd_list);
mtx_leave(&d->bd_mtx);
bpf_put(d);
return (0);
}
/*
* Rotate the packet buffers in descriptor d. Move the store buffer
* into the hold slot, and the free buffer into the store slot.
* Zero the length of the new store buffer.
*/
#define ROTATE_BUFFERS(d) \
KASSERT(d->bd_in_uiomove == 0); \
MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
(d)->bd_hbuf = (d)->bd_sbuf; \
(d)->bd_hlen = (d)->bd_slen; \
(d)->bd_sbuf = (d)->bd_fbuf; \
(d)->bd_slen = 0; \
(d)->bd_fbuf = NULL;
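/*
 * Each descriptor cycles three bd_bufsize buffers: the store buffer
 * (bd_sbuf) that bpf_catchpacket() is filling, the hold buffer
 * (bd_hbuf) that a reader drains, and a free buffer (bd_fbuf)
 * waiting to become the next store buffer.
 */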
/*
* bpfread - read next chunk of packets from buffers
*/
int
bpfread(dev_t dev, struct uio *uio, int ioflag)
{
uint64_t end, now;
struct bpf_d *d;
caddr_t hbuf;
int error, hlen;
KERNEL_ASSERT_LOCKED();
d = bpfilter_lookup(minor(dev));
if (d->bd_bif == NULL)
return (ENXIO);
bpf_get(d);
mtx_enter(&d->bd_mtx);
/*
* Restrict the application to use a buffer the same size as
* the kernel buffers.
*/
if (uio->uio_resid != d->bd_bufsize) {
error = EINVAL;
goto out;
}
/*
* If there's a timeout, mark when the read should end.
*/
if (d->bd_rtout != 0) {
now = nsecuptime();
end = now + d->bd_rtout;
if (end < now)
end = UINT64_MAX;
}
/*
* If the hold buffer is empty, then do a timed sleep, which
* ends when the timeout expires or when enough packets
* have arrived to fill the store buffer.
*/
while (d->bd_hbuf == NULL) {
if (d->bd_bif == NULL) {
/* interface is gone */
if (d->bd_slen == 0) {
error = EIO;
goto out;
}
ROTATE_BUFFERS(d);
break;
}
if (d->bd_immediate && d->bd_slen != 0) {
/*
* One or more packets arrived since the previous
* read or while we were asleep.
* Rotate the buffers and return what's here.
*/
ROTATE_BUFFERS(d);
break;
}
if (ISSET(ioflag, IO_NDELAY)) {
/* User requested non-blocking I/O */
error = EWOULDBLOCK;
} else if (d->bd_rtout == 0) {
/* No read timeout set. */
d->bd_nreaders++;
error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
"bpf", INFSLP);
d->bd_nreaders--;
} else if ((now = nsecuptime()) < end) {
/* Read timeout has not expired yet. */
d->bd_nreaders++;
error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
"bpf", end - now);
d->bd_nreaders--;
} else {
/* Read timeout has expired. */
error = EWOULDBLOCK;
}
if (error == EINTR || error == ERESTART)
goto out;
if (error == EWOULDBLOCK) {
/*
* On a timeout, return what's in the buffer,
* which may be nothing. If there is something
* in the store buffer, we can rotate the buffers.
*/
if (d->bd_hbuf != NULL)
/*
* We filled up the buffer in between
* getting the timeout and arriving
* here, so we don't need to rotate.
*/
break;
if (d->bd_slen == 0) {
error = 0;
goto out;
}
ROTATE_BUFFERS(d);
break;
}
}
/*
* At this point, we know we have something in the hold slot.
*/
hbuf = d->bd_hbuf;
hlen = d->bd_hlen;
d->bd_hbuf = NULL;
d->bd_hlen = 0;
d->bd_fbuf = NULL;
d->bd_in_uiomove = 1;
/*
* Move data from hold buffer into user space.
* We know the entire buffer is transferred since
* we checked above that the read buffer is bpf_bufsize bytes.
*/
mtx_leave(&d->bd_mtx);
error = uiomove(hbuf, hlen, uio);
mtx_enter(&d->bd_mtx);
/* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
KASSERT(d->bd_fbuf == NULL);
KASSERT(d->bd_hbuf == NULL);
d->bd_fbuf = hbuf;
d->bd_in_uiomove = 0;
out:
mtx_leave(&d->bd_mtx);
bpf_put(d);
return (error);
}
/*
* If there are processes sleeping on this descriptor, wake them up.
*/
void
bpf_wakeup(struct bpf_d *d)
{
MUTEX_ASSERT_LOCKED(&d->bd_mtx);
if (d->bd_nreaders)
wakeup(d);
KNOTE(&d->bd_klist, 0);
/*
* As long as pgsigio() needs to be protected
* by the KERNEL_LOCK() we have to delay the wakeup to
* another context to keep the hot path KERNEL_LOCK()-free.
*/
if (d->bd_async && d->bd_sig) {
bpf_get(d);
if (!task_add(systq, &d->bd_wake_task))
bpf_put(d);
}
}
void
bpf_wakeup_cb(void *xd)
{
struct bpf_d *d = xd;
if (d->bd_async && d->bd_sig)
pgsigio(&d->bd_sigio, d->bd_sig, 0);
bpf_put(d);
}
int
bpfwrite(dev_t dev, struct uio *uio, int ioflag)
{
struct bpf_d *d;
struct ifnet *ifp;
struct mbuf *m;
int error;
struct sockaddr_storage dst;
KERNEL_ASSERT_LOCKED();
d = bpfilter_lookup(minor(dev));
if (d->bd_bif == NULL)
return (ENXIO);
bpf_get(d);
ifp = d->bd_bif->bif_ifp;
if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
error = ENETDOWN;
goto out;
}
if (uio->uio_resid == 0) {
error = 0;
goto out;
}
error = bpf_movein(uio, d, &m, sstosa(&dst));
if (error)
goto out;
if (m->m_pkthdr.len > ifp->if_mtu) {
m_freem(m);
error = EMSGSIZE;
goto out;
}
m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
m->m_pkthdr.pf.prio = ifp->if_llprio;
if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
dst.ss_family = pseudo_AF_HDRCMPLT;
NET_LOCK();
error = ifp->if_output(ifp, m, sstosa(&dst), NULL);
NET_UNLOCK();
out:
bpf_put(d);
return (error);
}
/*
* Reset a descriptor by flushing its packet buffer and clearing the
* receive and drop counts.
*/
void
bpf_resetd(struct bpf_d *d)
{
MUTEX_ASSERT_LOCKED(&d->bd_mtx);
KASSERT(d->bd_in_uiomove == 0);
if (d->bd_hbuf != NULL) {
/* Free the hold buffer. */
d->bd_fbuf = d->bd_hbuf;
d->bd_hbuf = NULL;
}
d->bd_slen = 0;
d->bd_hlen = 0;
d->bd_rcount = 0;
d->bd_dcount = 0;
}
/*
* FIONREAD Check for read packet available.
* BIOCGBLEN Get buffer len [for read()].
* BIOCSETF Set ethernet read filter.
* BIOCFLUSH Flush read packet buffer.
* BIOCPROMISC Put interface into promiscuous mode.
* BIOCGDLTLIST Get supported link layer types.
* BIOCGDLT Get link layer type.
* BIOCSDLT Set link layer type.
* BIOCGETIF Get interface name.
* BIOCSETIF Set interface.
* BIOCSRTIMEOUT Set read timeout.
* BIOCGRTIMEOUT Get read timeout.
* BIOCGSTATS Get packet stats.
* BIOCIMMEDIATE Set immediate mode.
* BIOCVERSION Get filter language version.
* BIOCGHDRCMPLT Get "header already complete" flag
* BIOCSHDRCMPLT Set "header already complete" flag
*/
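/*
 * For illustration, a minimal userland sketch of the usual sequence
 * (not part of the kernel; error handling omitted, and "/dev/bpf0"
 * and "em0" are placeholders): open the device, bind it to an
 * interface with BIOCSETIF, enable immediate mode with BIOCIMMEDIATE,
 * fetch the buffer size with BIOCGBLEN, and read() with a buffer of
 * exactly that size, as bpfread() requires:
 *
 *	int fd = open("/dev/bpf0", O_RDONLY);
 *	struct ifreq ifr;
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);
 *	u_int immediate = 1, blen;
 *	ioctl(fd, BIOCIMMEDIATE, &immediate);
 *	ioctl(fd, BIOCGBLEN, &blen);
 *	char *buf = malloc(blen);
 *	ssize_t n = read(fd, buf, blen);
 */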
int
bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
struct bpf_d *d;
int error = 0;
d = bpfilter_lookup(minor(dev));
if (d->bd_locked && suser(p) != 0) {
/* list of allowed ioctls when locked and not root */
switch (cmd) {
case BIOCGBLEN:
case BIOCFLUSH:
case BIOCGDLT:
case BIOCGDLTLIST:
case BIOCGETIF:
case BIOCGRTIMEOUT:
case BIOCGSTATS:
case BIOCVERSION:
case BIOCGRSIG:
case BIOCGHDRCMPLT:
case FIONREAD:
case BIOCLOCK:
case BIOCSRTIMEOUT:
case BIOCIMMEDIATE:
case TIOCGPGRP:
case BIOCGDIRFILT:
break;
default:
return (EPERM);
}
}
bpf_get(d);
switch (cmd) {
default:
error = EINVAL;
break;
/*
* Check for read packet available.
*/
case FIONREAD:
{
int n;
mtx_enter(&d->bd_mtx);
n = d->bd_slen;
if (d->bd_hbuf != NULL)
n += d->bd_hlen;
mtx_leave(&d->bd_mtx);
*(int *)addr = n;
break;
}
/*
* Get buffer len [for read()].
*/
case BIOCGBLEN:
*(u_int *)addr = d->bd_bufsize;
break;
/*
* Set buffer length.
*/
case BIOCSBLEN:
if (d->bd_bif != NULL)
error = EINVAL;
else {
u_int size = *(u_int *)addr;
if (size > bpf_maxbufsize)
*(u_int *)addr = size = bpf_maxbufsize;
else if (size < BPF_MINBUFSIZE)
*(u_int *)addr = size = BPF_MINBUFSIZE;
mtx_enter(&d->bd_mtx);
d->bd_bufsize = size;
mtx_leave(&d->bd_mtx);
}
break;
/*
* Set link layer read filter.
*/
case BIOCSETF:
error = bpf_setf(d, (struct bpf_program *)addr, 0);
break;
/*
* Set link layer write filter.
*/
case BIOCSETWF:
error = bpf_setf(d, (struct bpf_program *)addr, 1);
break;
/*
* Flush read packet buffer.
*/
case BIOCFLUSH:
mtx_enter(&d->bd_mtx);
bpf_resetd(d);
mtx_leave(&d->bd_mtx);
break;
/*
* Put interface into promiscuous mode.
*/
case BIOCPROMISC:
if (d->bd_bif == NULL) {
/*
* No interface attached yet.
*/
error = EINVAL;
} else if (d->bd_bif->bif_ifp != NULL) {
if (d->bd_promisc == 0) {
MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
NET_LOCK();
error = ifpromisc(d->bd_bif->bif_ifp, 1);
NET_UNLOCK();
if (error == 0)
d->bd_promisc = 1;
}
}
break;
/*
* Get a list of supported device parameters.
*/
case BIOCGDLTLIST:
if (d->bd_bif == NULL)
error = EINVAL;
else
error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
break;
/*
* Get device parameters.
*/
case BIOCGDLT:
if (d->bd_bif == NULL)
error = EINVAL;
else
*(u_int *)addr = d->bd_bif->bif_dlt;
break;
/*
* Set device parameters.
*/
case BIOCSDLT:
if (d->bd_bif == NULL)
error = EINVAL;
else {
mtx_enter(&d->bd_mtx);
error = bpf_setdlt(d, *(u_int *)addr);
mtx_leave(&d->bd_mtx);
}
break;
/*
* Get interface name.
*/
case BIOCGETIF:
if (d->bd_bif == NULL)
error = EINVAL;
else
bpf_ifname(d->bd_bif, (struct ifreq *)addr);
break;
/*
* Set interface.
*/
case BIOCSETIF:
error = bpf_setif(d, (struct ifreq *)addr);
break;
/*
* Set read timeout.
*/
case BIOCSRTIMEOUT:
{
struct timeval *tv = (struct timeval *)addr;
uint64_t rtout;
if (tv->tv_sec < 0 || !timerisvalid(tv)) {
error = EINVAL;
break;
}
rtout = TIMEVAL_TO_NSEC(tv);
if (rtout > MAXTSLP) {
error = EOVERFLOW;
break;
}
mtx_enter(&d->bd_mtx);
d->bd_rtout = rtout;
mtx_leave(&d->bd_mtx);
break;
}
/*
* Get read timeout.
*/
case BIOCGRTIMEOUT:
{
struct timeval *tv = (struct timeval *)addr;
memset(tv, 0, sizeof(*tv));
mtx_enter(&d->bd_mtx);
NSEC_TO_TIMEVAL(d->bd_rtout, tv);
mtx_leave(&d->bd_mtx);
break;
}
/*
* Get packet stats.
*/
case BIOCGSTATS:
{
struct bpf_stat *bs = (struct bpf_stat *)addr;
bs->bs_recv = d->bd_rcount;
bs->bs_drop = d->bd_dcount;
break;
}
/*
* Set immediate mode.
*/
case BIOCIMMEDIATE:
d->bd_immediate = *(u_int *)addr;
break;
case BIOCVERSION:
{
struct bpf_version *bv = (struct bpf_version *)addr;
bv->bv_major = BPF_MAJOR_VERSION;
bv->bv_minor = BPF_MINOR_VERSION;
break;
}
case BIOCGHDRCMPLT: /* get "header already complete" flag */
*(u_int *)addr = d->bd_hdrcmplt;
break;
case BIOCSHDRCMPLT: /* set "header already complete" flag */
d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
break;
case BIOCLOCK: /* set "locked" flag (no reset) */
d->bd_locked = 1;
break;
case BIOCGFILDROP: /* get "filter-drop" flag */
*(u_int *)addr = d->bd_fildrop;
break;
case BIOCSFILDROP: { /* set "filter-drop" flag */
unsigned int fildrop = *(u_int *)addr;
switch (fildrop) {
case BPF_FILDROP_PASS:
case BPF_FILDROP_CAPTURE:
case BPF_FILDROP_DROP:
d->bd_fildrop = fildrop;
break;
default:
error = EINVAL;
break;
}
break;
}
case BIOCGDIRFILT: /* get direction filter */
*(u_int *)addr = d->bd_dirfilt;
break;
case BIOCSDIRFILT: /* set direction filter */
d->bd_dirfilt = (*(u_int *)addr) &
(BPF_DIRECTION_IN|BPF_DIRECTION_OUT);
break;
case FIONBIO: /* Non-blocking I/O */
/* let vfs keep track of this */
break;
case FIOASYNC: /* Send signal on receive packets */
d->bd_async = *(int *)addr;
break;
case FIOSETOWN: /* Process or group to send signals to */
case TIOCSPGRP:
error = sigio_setown(&d->bd_sigio, cmd, addr);
break;
case FIOGETOWN:
case TIOCGPGRP:
sigio_getown(&d->bd_sigio, cmd, addr);
break;
case BIOCSRSIG: /* Set receive signal */
{
u_int sig;
sig = *(u_int *)addr;
if (sig >= NSIG)
error = EINVAL;
else
d->bd_sig = sig;
break;
}
case BIOCGRSIG:
*(u_int *)addr = d->bd_sig;
break;
}
bpf_put(d);
return (error);
}
/*
* Set d's packet filter program to fp. If this file already has a filter,
* free it and replace it. Returns EINVAL for bogus requests.
*/
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, int wf)
{
struct bpf_program_smr *bps, *old_bps;
struct bpf_insn *fcode;
u_int flen, size;
KERNEL_ASSERT_LOCKED();
if (fp->bf_insns == 0) {
if (fp->bf_len != 0)
return (EINVAL);
bps = NULL;
} else {
flen = fp->bf_len;
if (flen > BPF_MAXINSNS)
return (EINVAL);
fcode = mallocarray(flen, sizeof(*fp->bf_insns), M_DEVBUF,
M_WAITOK | M_CANFAIL);
if (fcode == NULL)
return (ENOMEM);
size = flen * sizeof(*fp->bf_insns);
if (copyin(fp->bf_insns, fcode, size) != 0 ||
bpf_validate(fcode, (int)flen) == 0) {
free(fcode, M_DEVBUF, size);
return (EINVAL);
}
bps = malloc(sizeof(*bps), M_DEVBUF, M_WAITOK);
smr_init(&bps->bps_smr);
bps->bps_bf.bf_len = flen;
bps->bps_bf.bf_insns = fcode;
}
if (wf == 0) {
old_bps = SMR_PTR_GET_LOCKED(&d->bd_rfilter);
SMR_PTR_SET_LOCKED(&d->bd_rfilter, bps);
} else {
old_bps = SMR_PTR_GET_LOCKED(&d->bd_wfilter);
SMR_PTR_SET_LOCKED(&d->bd_wfilter, bps);
}
mtx_enter(&d->bd_mtx);
bpf_resetd(d);
mtx_leave(&d->bd_mtx);
if (old_bps != NULL)
smr_call(&old_bps->bps_smr, bpf_prog_smr, old_bps);
return (0);
}
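/*
 * For illustration, a hypothetical userland sketch installing the
 * simplest possible read filter via BIOCSETF: a single BPF_RET
 * instruction that accepts every packet in full ((u_int)-1 means
 * no truncation).  "fd" is assumed to be an open bpf descriptor.
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET+BPF_K, (u_int)-1)
 *	};
 *	struct bpf_program prog = {
 *		.bf_len = sizeof(insns) / sizeof(insns[0]),
 *		.bf_insns = insns
 *	};
 *	ioctl(fd, BIOCSETF, &prog);
 */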
/*
* Detach a file from its current interface (if attached at all) and attach
* to the interface indicated by the name stored in ifr.
* Return an errno or 0.
*/
int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
struct bpf_if *bp, *candidate = NULL;
int error = 0;
/*
* Look through attached interfaces for the named one.
*/
for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
if (strcmp(bp->bif_name, ifr->ifr_name) != 0)
continue;
if (candidate == NULL || candidate->bif_dlt > bp->bif_dlt)
candidate = bp;
}
/* Not found. */
if (candidate == NULL)
return (ENXIO);
/*
* Allocate the packet buffers if we need to.
* If we're already attached to the requested interface,
* just flush the buffer.
*/
mtx_enter(&d->bd_mtx);
if (d->bd_sbuf == NULL) {
if ((error = bpf_allocbufs(d)))
goto out;
}
if (candidate != d->bd_bif) {
/*
* Detach if attached to something else.
*/
bpf_detachd(d);
bpf_attachd(d, candidate);
}
bpf_resetd(d);
out:
mtx_leave(&d->bd_mtx);
return (error);
}
/*
* Copy the interface name to the ifreq.
*/
void
bpf_ifname(struct bpf_if *bif, struct ifreq *ifr)
{
bcopy(bif->bif_name, ifr->ifr_name, sizeof(ifr->ifr_name));
}
const struct filterops bpfread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_bpfrdetach,
.f_event = filt_bpfread,
.f_modify = filt_bpfreadmodify,
.f_process = filt_bpfreadprocess,
};
int
bpfkqfilter(dev_t dev, struct knote *kn)
{
struct bpf_d *d;
struct klist *klist;
KERNEL_ASSERT_LOCKED();
d = bpfilter_lookup(minor(dev));
if (d == NULL)
return (ENXIO);
switch (kn->kn_filter) {
case EVFILT_READ:
klist = &d->bd_klist;
kn->kn_fop = &bpfread_filtops;
break;
default:
return (EINVAL);
}
bpf_get(d);
kn->kn_hook = d;
klist_insert(klist, kn);
return (0);
}
void
filt_bpfrdetach(struct knote *kn)
{
struct bpf_d *d = kn->kn_hook;
klist_remove(&d->bd_klist, kn);
bpf_put(d);
}
int
filt_bpfread(struct knote *kn, long hint)
{
struct bpf_d *d = kn->kn_hook;
MUTEX_ASSERT_LOCKED(&d->bd_mtx);
kn->kn_data = d->bd_hlen;
if (d->bd_immediate)
kn->kn_data += d->bd_slen;
return (kn->kn_data > 0);
}
int
filt_bpfreadmodify(struct kevent *kev, struct knote *kn)
{
struct bpf_d *d = kn->kn_hook;
int active;
mtx_enter(&d->bd_mtx);
active = knote_modify_fn(kev, kn, filt_bpfread);
mtx_leave(&d->bd_mtx);
return (active);
}
int
filt_bpfreadprocess(struct knote *kn, struct kevent *kev)
{
struct bpf_d *d = kn->kn_hook;
int active;
mtx_enter(&d->bd_mtx);
active = knote_process_fn(kn, kev, filt_bpfread);
mtx_leave(&d->bd_mtx);
return (active);
}
/*
* Copy data from an mbuf chain into a buffer. This code is derived
* from m_copydata in sys/uipc_mbuf.c.
*/
void
bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
{
const struct mbuf *m;
u_int count;
u_char *dst;
m = src_arg;
dst = dst_arg;
while (len > 0) {
if (m == NULL)
panic("bpf_mcopy");
count = min(m->m_len, len);
bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
m = m->m_next;
dst += count;
len -= count;
}
}
int
bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
{
return _bpf_mtap(arg, m, m, direction);
}
int
_bpf_mtap(caddr_t arg, const struct mbuf *mp, const struct mbuf *m,
u_int direction)
{
struct bpf_if *bp = (struct bpf_if *)arg;
struct bpf_d *d;
size_t pktlen, slen;
const struct mbuf *m0;
struct bpf_hdr tbh;
int gothdr = 0;
int drop = 0;
if (m == NULL)
return (0);
if (bp == NULL)
return (0);
pktlen = 0;
for (m0 = m; m0 != NULL; m0 = m0->m_next)
pktlen += m0->m_len;
smr_read_enter();
SMR_SLIST_FOREACH(d, &bp->bif_dlist, bd_next) {
struct bpf_program_smr *bps;
struct bpf_insn *fcode = NULL;
atomic_inc_long(&d->bd_rcount);
if (ISSET(d->bd_dirfilt, direction))
continue;
bps = SMR_PTR_GET(&d->bd_rfilter);
if (bps != NULL)
fcode = bps->bps_bf.bf_insns;
slen = bpf_mfilter(fcode, m, pktlen);
if (slen == 0)
continue;
if (d->bd_fildrop != BPF_FILDROP_PASS)
drop = 1;
if (d->bd_fildrop != BPF_FILDROP_DROP) {
if (!gothdr) {
struct timeval tv;
memset(&tbh, 0, sizeof(tbh));
if (ISSET(mp->m_flags, M_PKTHDR)) {
tbh.bh_ifidx = mp->m_pkthdr.ph_ifidx;
tbh.bh_flowid = mp->m_pkthdr.ph_flowid;
tbh.bh_flags = mp->m_pkthdr.pf.prio;
if (ISSET(mp->m_pkthdr.csum_flags,
M_FLOWID))
SET(tbh.bh_flags, BPF_F_FLOWID);
m_microtime(mp, &tv);
} else
microtime(&tv);
tbh.bh_tstamp.tv_sec = tv.tv_sec;
tbh.bh_tstamp.tv_usec = tv.tv_usec;
SET(tbh.bh_flags, direction << BPF_F_DIR_SHIFT);
gothdr = 1;
}
mtx_enter(&d->bd_mtx);
bpf_catchpacket(d, (u_char *)m, pktlen, slen, &tbh);
mtx_leave(&d->bd_mtx);
}
}
smr_read_leave();
return (drop);
}
/*
* Incoming linkage from device drivers, where a data buffer should be
* prepended by an arbitrary header. In this situation we already have a
* way of representing a chain of memory buffers, ie, mbufs, so reuse
* the existing functionality by attaching the buffers to mbufs.
*
* Con up a minimal mbuf chain to pacify bpf by allocating (only) a
* struct m_hdr each for the header and data on the stack.
*/
int
bpf_tap_hdr(caddr_t arg, const void *hdr, unsigned int hdrlen,
const void *buf, unsigned int buflen, u_int direction)
{
struct m_hdr mh, md;
struct mbuf *m0 = NULL;
struct mbuf **mp = &m0;
if (hdr != NULL) {
mh.mh_flags = 0;
mh.mh_next = NULL;
mh.mh_len = hdrlen;
mh.mh_data = (void *)hdr;
*mp = (struct mbuf *)&mh;
mp = &mh.mh_next;
}
if (buf != NULL) {
md.mh_flags = 0;
md.mh_next = NULL;
md.mh_len = buflen;
md.mh_data = (void *)buf;
*mp = (struct mbuf *)&md;
}
return bpf_mtap(arg, m0, direction);
}
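/*
 * A hypothetical driver-side sketch: a driver holding a separate
 * header and payload buffer (rather than an mbuf) can hand them to
 * its listeners like this, where sc_bpf stands for the cookie the
 * driver obtained from bpfsattach() (all names illustrative):
 *
 *	if (sc->sc_bpf != NULL)
 *		bpf_tap_hdr(sc->sc_bpf, hdr, hdrlen, buf, buflen,
 *		    BPF_DIRECTION_IN);
 */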
/*
* Incoming linkage from device drivers, where we have a mbuf chain
* but need to prepend some arbitrary header from a linear buffer.
*
* Con up a minimal dummy header to pacify bpf. Allocate (only) a
* struct m_hdr on the stack. This is safe as bpf only reads from the
* fields in this header that we initialize, and will not try to free
* it or keep a pointer to it.
*/
int
bpf_mtap_hdr(caddr_t arg, const void *data, u_int dlen, const struct mbuf *m,
u_int direction)
{
struct m_hdr mh;
const struct mbuf *m0;
if (dlen > 0) {
mh.mh_flags = 0;
mh.mh_next = (struct mbuf *)m;
mh.mh_len = dlen;
mh.mh_data = (void *)data;
m0 = (struct mbuf *)&mh;
} else
m0 = m;
return _bpf_mtap(arg, m, m0, direction);
}
/*
* Incoming linkage from device drivers, where we have a mbuf chain
* but need to prepend the address family.
*
* Con up a minimal dummy header to pacify bpf. We allocate (only) a
* struct m_hdr on the stack. This is safe as bpf only reads from the
* fields in this header that we initialize, and will not try to free
* it or keep a pointer to it.
*/
int
bpf_mtap_af(caddr_t arg, u_int32_t af, const struct mbuf *m, u_int direction)
{
u_int32_t afh;
afh = htonl(af);
return bpf_mtap_hdr(arg, &afh, sizeof(afh), m, direction);
}
/*
* Incoming linkage from device drivers, where we have a mbuf chain
* but need to prepend a VLAN encapsulation header.
*
* Con up a minimal dummy header to pacify bpf. Allocate (only) a
* struct m_hdr on the stack. This is safe as bpf only reads from the
* fields in this header that we initialize, and will not try to free
* it or keep a pointer to it.
*/
int
bpf_mtap_ether(caddr_t arg, const struct mbuf *m, u_int direction)
{
#if NVLAN > 0
struct ether_vlan_header evh;
struct m_hdr mh, md;
if ((m->m_flags & M_VLANTAG) == 0)
#endif
{
return _bpf_mtap(arg, m, m, direction);
}
#if NVLAN > 0
KASSERT(m->m_len >= ETHER_HDR_LEN);
memcpy(&evh, mtod(m, char *), ETHER_HDR_LEN);
evh.evl_proto = evh.evl_encap_proto;
evh.evl_encap_proto = htons(ETHERTYPE_VLAN);
evh.evl_tag = htons(m->m_pkthdr.ether_vtag);
mh.mh_flags = 0;
mh.mh_data = (caddr_t)&evh;
mh.mh_len = sizeof(evh);
mh.mh_next = (struct mbuf *)&md;
md.mh_flags = 0;
md.mh_data = m->m_data + ETHER_HDR_LEN;
md.mh_len = m->m_len - ETHER_HDR_LEN;
md.mh_next = m->m_next;
return _bpf_mtap(arg, m, (struct mbuf *)&mh, direction);
#endif
}
/*
* Move the packet data from interface memory (pkt) into the
* store buffer. Wake up listeners if needed.
* The packet data is copied out with bpf_mcopy(), which walks an
* mbuf chain, so despite its type pkt is really an mbuf.
*/
void
bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
const struct bpf_hdr *tbh)
{
struct bpf_hdr *bh;
int totlen, curlen;
int hdrlen, do_wakeup = 0;
MUTEX_ASSERT_LOCKED(&d->bd_mtx);
if (d->bd_bif == NULL)
return;
hdrlen = d->bd_bif->bif_hdrlen;
/*
* Figure out how many bytes to move. If the packet is
* greater or equal to the snapshot length, transfer that
* much. Otherwise, transfer the whole packet (unless
* we hit the buffer size limit).
*/
totlen = hdrlen + min(snaplen, pktlen);
if (totlen > d->bd_bufsize)
totlen = d->bd_bufsize;
/*
* Round up the end of the previous packet to the next longword.
*/
curlen = BPF_WORDALIGN(d->bd_slen);
if (curlen + totlen > d->bd_bufsize) {
/*
* This packet will overflow the storage buffer.
* Rotate the buffers if we can, then wakeup any
* pending reads.
*/
if (d->bd_fbuf == NULL) {
/*
* We haven't completed the previous read yet,
* so drop the packet.
*/
++d->bd_dcount;
return;
}
ROTATE_BUFFERS(d);
do_wakeup = 1;
curlen = 0;
}
/*
* Append the bpf header.
*/
bh = (struct bpf_hdr *)(d->bd_sbuf + curlen);
*bh = *tbh;
bh->bh_datalen = pktlen;
bh->bh_hdrlen = hdrlen;
bh->bh_caplen = totlen - hdrlen;
/*
* Copy the packet data into the store buffer and update its length.
*/
bpf_mcopy(pkt, (u_char *)bh + hdrlen, bh->bh_caplen);
d->bd_slen = curlen + totlen;
if (d->bd_immediate) {
/*
* Immediate mode is set. A packet arrived so any
* reads should be woken up.
*/
do_wakeup = 1;
}
if (do_wakeup)
bpf_wakeup(d);
}
/*
* Allocate the packet buffers for a descriptor.
*/
int
bpf_allocbufs(struct bpf_d *d)
{
MUTEX_ASSERT_LOCKED(&d->bd_mtx);
d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
if (d->bd_fbuf == NULL)
return (ENOMEM);
d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
if (d->bd_sbuf == NULL) {
free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
d->bd_fbuf = NULL;
return (ENOMEM);
}
d->bd_slen = 0;
d->bd_hlen = 0;
return (0);
}
void
bpf_prog_smr(void *bps_arg)
{
struct bpf_program_smr *bps = bps_arg;
free(bps->bps_bf.bf_insns, M_DEVBUF,
bps->bps_bf.bf_len * sizeof(struct bpf_insn));
free(bps, M_DEVBUF, sizeof(struct bpf_program_smr));
}
void
bpf_d_smr(void *smr)
{
struct bpf_d *bd = smr;
sigio_free(&bd->bd_sigio);
free(bd->bd_sbuf, M_DEVBUF, bd->bd_bufsize);
free(bd->bd_hbuf, M_DEVBUF, bd->bd_bufsize);
free(bd->bd_fbuf, M_DEVBUF, bd->bd_bufsize);
if (bd->bd_rfilter != NULL)
bpf_prog_smr(bd->bd_rfilter);
if (bd->bd_wfilter != NULL)
bpf_prog_smr(bd->bd_wfilter);
klist_free(&bd->bd_klist);
free(bd, M_DEVBUF, sizeof(*bd));
}
void
bpf_get(struct bpf_d *bd)
{
refcnt_take(&bd->bd_refcnt);
}
/*
* Free buffers currently in use by a descriptor
* when the reference count drops to zero.
*/
void
bpf_put(struct bpf_d *bd)
{
if (refcnt_rele(&bd->bd_refcnt) == 0)
return;
smr_call(&bd->bd_smr, bpf_d_smr, bd);
}
void *
bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
{
struct bpf_if *bp;
if ((bp = malloc(sizeof(*bp), M_DEVBUF, M_NOWAIT)) == NULL)
panic("bpfattach");
SMR_SLIST_INIT(&bp->bif_dlist);
bp->bif_driverp = (struct bpf_if **)bpfp;
bp->bif_name = name;
bp->bif_ifp = NULL;
bp->bif_dlt = dlt;
bp->bif_next = bpf_iflist;
bpf_iflist = bp;
*bp->bif_driverp = NULL;
/*
* Compute the length of the bpf header. This is not necessarily
* equal to SIZEOF_BPF_HDR because we want to insert spacing such
* that the network layer header begins on a longword boundary (for
* performance reasons and to alleviate alignment restrictions).
*/
bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
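/*
 * An illustrative example (values assumed, not authoritative): if
 * SIZEOF_BPF_HDR were 18 and BPF_WORDALIGN rounded up to a multiple
 * of 4, an Ethernet attachment with hdrlen = 14 would get
 * bif_hdrlen = BPF_WORDALIGN(14 + 18) - 14 = 32 - 14 = 18, so the
 * captured link header starts at offset 18 and the network header
 * at offset 32, a longword boundary.
 */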
return (bp);
}
void
bpfattach(caddr_t *driverp, struct ifnet *ifp, u_int dlt, u_int hdrlen)
{
struct bpf_if *bp;
bp = bpfsattach(driverp, ifp->if_xname, dlt, hdrlen);
bp->bif_ifp = ifp;
}
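/*
 * A typical call lives in a driver's or link layer's attach path; for
 * an Ethernet-like interface it would look roughly like this (a
 * sketch, the exact dlt and header length depend on the link type):
 *
 *	bpfattach(&ifp->if_bpf, ifp, DLT_EN10MB, ETHER_HDR_LEN);
 */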
/* Detach an interface from its attached bpf device. */
void
bpfdetach(struct ifnet *ifp)
{
struct bpf_if *bp, *nbp;
KERNEL_ASSERT_LOCKED();
for (bp = bpf_iflist; bp; bp = nbp) {
nbp = bp->bif_next;
if (bp->bif_ifp == ifp)
bpfsdetach(bp);
}
ifp->if_bpf = NULL;
}
void
bpfsdetach(void *p)
{
struct bpf_if *bp = p, *tbp;
struct bpf_d *bd;
int maj;
KERNEL_ASSERT_LOCKED();
/* Locate the major number. */
for (maj = 0; maj < nchrdev; maj++)
if (cdevsw[maj].d_open == bpfopen)
break;
while ((bd = SMR_SLIST_FIRST_LOCKED(&bp->bif_dlist))) {
vdevgone(maj, bd->bd_unit, bd->bd_unit, VCHR);
klist_invalidate(&bd->bd_klist);
}
for (tbp = bpf_iflist; tbp; tbp = tbp->bif_next) {
if (tbp->bif_next == bp) {
tbp->bif_next = bp->bif_next;
break;
}
}
if (bpf_iflist == bp)
bpf_iflist = bp->bif_next;
free(bp, M_DEVBUF, sizeof(*bp));
}
int
bpf_sysctl_locked(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
switch (name[0]) {
case NET_BPF_BUFSIZE:
return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&bpf_bufsize, BPF_MINBUFSIZE, bpf_maxbufsize);
case NET_BPF_MAXBUFSIZE:
return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&bpf_maxbufsize, BPF_MINBUFSIZE, INT_MAX);
default:
return (EOPNOTSUPP);
}
}
int
bpf_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen)
{
int flags = RW_INTR;
int error;
if (namelen != 1)
return (ENOTDIR);
flags |= (newp == NULL) ? RW_READ : RW_WRITE;
error = rw_enter(&bpf_sysctl_lk, flags);
if (error != 0)
return (error);
error = bpf_sysctl_locked(name, namelen, oldp, oldlenp, newp, newlen);
rw_exit(&bpf_sysctl_lk);
return (error);
}
struct bpf_d *
bpfilter_lookup(int unit)
{
struct bpf_d *bd;
KERNEL_ASSERT_LOCKED();
LIST_FOREACH(bd, &bpf_d_list, bd_list)
if (bd->bd_unit == unit)
return (bd);
return (NULL);
}
/*
* Get a list of the available data link types of the interface.
*/
int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
{
int n, error;
struct bpf_if *bp;
const char *name;
name = d->bd_bif->bif_name;
n = 0;
error = 0;
for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
if (strcmp(name, bp->bif_name) != 0)
continue;
if (bfl->bfl_list != NULL) {
if (n >= bfl->bfl_len)
return (ENOMEM);
error = copyout(&bp->bif_dlt,
bfl->bfl_list + n, sizeof(u_int));
if (error)
break;
}
n++;
}
bfl->bfl_len = n;
return (error);
}
/*
* Set the data link type of a BPF instance.
*/
int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
const char *name;
struct bpf_if *bp;
MUTEX_ASSERT_LOCKED(&d->bd_mtx);
if (d->bd_bif->bif_dlt == dlt)
return (0);
name = d->bd_bif->bif_name;
for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
if (strcmp(name, bp->bif_name) != 0)
continue;
if (bp->bif_dlt == dlt)
break;
}
if (bp == NULL)
return (EINVAL);
bpf_detachd(d);
bpf_attachd(d, bp);
bpf_resetd(d);
return (0);
}
u_int32_t bpf_mbuf_ldw(const void *, u_int32_t, int *);
u_int32_t bpf_mbuf_ldh(const void *, u_int32_t, int *);
u_int32_t bpf_mbuf_ldb(const void *, u_int32_t, int *);
int bpf_mbuf_copy(const struct mbuf *, u_int32_t,
void *, u_int32_t);
const struct bpf_ops bpf_mbuf_ops = {
bpf_mbuf_ldw,
bpf_mbuf_ldh,
bpf_mbuf_ldb,
};
int
bpf_mbuf_copy(const struct mbuf *m, u_int32_t off, void *buf, u_int32_t len)
{
u_int8_t *cp = buf;
u_int32_t count;
while (off >= m->m_len) {
off -= m->m_len;
m = m->m_next;
if (m == NULL)
return (-1);
}
for (;;) {
count = min(m->m_len - off, len);
memcpy(cp, m->m_data + off, count);
len -= count;
if (len == 0)
return (0);
m = m->m_next;
if (m == NULL)
break;
cp += count;
off = 0;
}
return (-1);
}
u_int32_t
bpf_mbuf_ldw(const void *m0, u_int32_t k, int *err)
{
u_int32_t v;
if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
*err = 1;
return (0);
}
*err = 0;
return ntohl(v);
}
u_int32_t
bpf_mbuf_ldh(const void *m0, u_int32_t k, int *err)
{
u_int16_t v;
if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
*err = 1;
return (0);
}
*err = 0;
return ntohs(v);
}
u_int32_t
bpf_mbuf_ldb(const void *m0, u_int32_t k, int *err)
{
const struct mbuf *m = m0;
u_int8_t v;
while (k >= m->m_len) {
k -= m->m_len;
m = m->m_next;
if (m == NULL) {
*err = 1;
return (0);
}
}
v = m->m_data[k];
*err = 0;
return v;
}
u_int
bpf_mfilter(const struct bpf_insn *pc, const struct mbuf *m, u_int wirelen)
{
return _bpf_filter(pc, &bpf_mbuf_ops, m, wirelen);
}
/* $OpenBSD: tcp_input.c,v 1.380 2022/09/03 19:22:19 bluhm Exp $ */
/* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include "pf.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>
#if NPF > 0
#include <net/pfvar.h>
#endif
struct tcpiphdr tcp_saveti;
int tcp_mss_adv(struct mbuf *, int);
int tcp_flush_queue(struct tcpcb *);
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
struct tcpipv6hdr tcp_saveti6;
/* for the packet header length in the mbuf */
#define M_PH_LEN(m) (((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m) (M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m) (M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */
int tcprexmtthresh = 3;
int tcptv_keep_init = TCPTV_KEEP_INIT;
int tcp_rst_ppslim = 100; /* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;
int tcp_ackdrop_ppslim = 100; /* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;
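/* PAWS: a remembered ts_recent is considered too old after 24 days (PR_SLOWHZ ticks) */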
#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ)
/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
/* for TCP SACK comparisons */
#define SEQ_MIN(a,b) (SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b) (SEQ_GT(a,b) ? (a) : (b))
/*
* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
*/
#ifdef INET6
#define ND6_HINT(tp) \
do { \
if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \
nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \
} \
} while (0)
#else
#define ND6_HINT(tp)
#endif
#ifdef TCP_ECN
/*
* ECN (Explicit Congestion Notification) support based on RFC3168
* implementation note:
* snd_last is used to track a recovery phase.
* when cwnd is reduced, snd_last is set to snd_max.
* while snd_last > snd_una, the sender is in a recovery phase and
* its cwnd should not be reduced again.
* snd_last follows snd_una when not in a recovery phase.
*/
#endif
/*
* Macro to compute ACK transmission behavior. Delay the ACK unless
* we have already delayed an ACK (must send an ACK every two segments).
* We also ACK immediately if we received a PUSH and the ACK-on-PUSH
* option is enabled or when the packet is coming from a loopback
* interface.
*/
#define TCP_SETUP_ACK(tp, tiflags, m) \
do { \
struct ifnet *ifp = NULL; \
if (m && (m->m_flags & M_PKTHDR)) \
ifp = if_get(m->m_pkthdr.ph_ifidx); \
if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
(tcp_ack_on_push && (tiflags) & TH_PUSH) || \
(ifp && (ifp->if_flags & IFF_LOOPBACK))) \
tp->t_flags |= TF_ACKNOW; \
else \
TCP_TIMER_ARM_MSEC(tp, TCPT_DELACK, tcp_delack_msecs); \
if_put(ifp); \
} while (0)
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);
void syn_cache_put(struct syn_cache *);
void syn_cache_rm(struct syn_cache *);
int syn_cache_respond(struct syn_cache *, struct mbuf *, uint32_t);
void syn_cache_timer(void *);
void syn_cache_reaper(void *);
void syn_cache_insert(struct syn_cache *, struct tcpcb *);
void syn_cache_reset(struct sockaddr *, struct sockaddr *,
struct tcphdr *, u_int);
int syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
unsigned int, struct socket *, struct mbuf *, u_char *, int,
struct tcp_opt_info *, tcp_seq *, uint32_t);
struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
struct tcphdr *, unsigned int, unsigned int, struct socket *,
struct mbuf *, uint32_t);
struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
struct syn_cache_head **, u_int);
/*
* Insert segment ti into reassembly queue of tcp with
* control block tp. Return TH_FIN if reassembly now includes
* a segment with FIN. The macro form does the common case inline
* (segment is the next to be received on an established connection,
* and the queue is empty), avoiding linkage into and removal
* from the queue and repetition of various conversions.
* Set DELACK for segments received in order, but ack immediately
* when segments are out of order (so fast retransmit can work).
*/
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
struct tcpqent *p, *q, *nq, *tiqe;
/*
* Allocate a new queue entry, before we throw away any data.
* If we can't, just drop the packet. XXX
*/
tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
if (tiqe == NULL) {
tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
/* Reuse last entry since new segment fills a hole */
m_freem(tiqe->tcpqe_m);
TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
}
if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
/* Flush segment queue for this connection */
tcp_freeq(tp);
tcpstat_inc(tcps_rcvmemdrop);
m_freem(m);
return (0);
}
}
/*
* Find a segment which begins after this one does.
*/
for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
p = q, q = TAILQ_NEXT(q, tcpqe_q))
if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
break;
/*
* If there is a preceding segment, it may provide some of
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
if (p != NULL) {
struct tcphdr *phdr = p->tcpqe_tcp;
int i;
/* conversion to int (in i) handles seq wraparound */
i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
if (i > 0) {
if (i >= *tlen) {
tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
*tlen);
m_freem(m);
pool_put(&tcpqe_pool, tiqe);
return (0);
}
m_adj(m, i);
*tlen -= i;
th->th_seq += i;
}
}
tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
tp->t_rcvoopack++;
/*
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
for (; q != NULL; q = nq) {
struct tcphdr *qhdr = q->tcpqe_tcp;
int i = (th->th_seq + *tlen) - qhdr->th_seq;
if (i <= 0)
break;
if (i < qhdr->th_reseqlen) {
qhdr->th_seq += i;
qhdr->th_reseqlen -= i;
m_adj(q->tcpqe_m, i);
break;
}
nq = TAILQ_NEXT(q, tcpqe_q);
m_freem(q->tcpqe_m);
TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
pool_put(&tcpqe_pool, q);
}
/* Insert the new segment queue entry into place. */
tiqe->tcpqe_m = m;
th->th_reseqlen = *tlen;
tiqe->tcpqe_tcp = th;
if (p == NULL) {
TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
} else {
TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
}
if (th->th_seq != tp->rcv_nxt)
return (0);
return (tcp_flush_queue(tp));
}
int
tcp_flush_queue(struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
struct tcpqent *q, *nq;
int flags;
/*
* Present data to user, advancing rcv_nxt through
* completed sequence space.
*/
if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
return (0);
q = TAILQ_FIRST(&tp->t_segq);
if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
return (0);
if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
return (0);
do {
tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
flags = q->tcpqe_tcp->th_flags & TH_FIN;
nq = TAILQ_NEXT(q, tcpqe_q);
TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
ND6_HINT(tp);
if (so->so_state & SS_CANTRCVMORE)
m_freem(q->tcpqe_m);
else
sbappendstream(so, &so->so_rcv, q->tcpqe_m);
pool_put(&tcpqe_pool, q);
q = nq;
} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
tp->t_flags |= TF_BLOCKOUTPUT;
sorwakeup(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
return (flags);
}
/*
* TCP input routine, follows pages 65-76 of the
* protocol specification dated September, 1981 very closely.
*/
int
tcp_input(struct mbuf **mp, int *offp, int proto, int af)
{
struct mbuf *m = *mp;
int iphlen = *offp;
struct ip *ip = NULL;
struct inpcb *inp = NULL;
u_int8_t *optp = NULL;
int optlen = 0;
int tlen, off;
struct tcpcb *otp = NULL, *tp = NULL;
int tiflags;
struct socket *so = NULL;
int todrop, acked, ourfinisacked;
int hdroptlen = 0;
short ostate;
caddr_t saveti;
tcp_seq iss, *reuse = NULL;
uint32_t now;
u_long tiwin;
struct tcp_opt_info opti;
struct tcphdr *th;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef TCP_ECN
u_char iptos;
#endif
tcpstat_inc(tcps_rcvtotal);
opti.ts_present = 0;
opti.maxseg = 0;
now = READ_ONCE(tcp_now);
/*
* RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
*/
if (m->m_flags & (M_BCAST|M_MCAST))
goto drop;
/*
* Get IP and TCP header together in first mbuf.
* Note: IP leaves IP header in first mbuf.
*/
IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
if (!th) {
tcpstat_inc(tcps_rcvshort);
return IPPROTO_DONE;
}
tlen = m->m_pkthdr.len - iphlen;
switch (af) {
case AF_INET:
ip = mtod(m, struct ip *);
#ifdef TCP_ECN
/* save ip_tos before clearing it for checksum */
iptos = ip->ip_tos;
#endif
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
#ifdef TCP_ECN
iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
#endif
/*
* Be proactive about unspecified IPv6 addresses in the source.
* As we use all-zero to indicate an unbound/unconnected pcb,
* an unspecified IPv6 source address can be used to confuse us.
*
* Note that packets with an unspecified IPv6 destination are
* already dropped in ip6_input.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
/* XXX stat */
goto drop;
}
/* Discard packets to multicast */
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/* XXX stat */
goto drop;
}
break;
#endif
default:
unhandled_af(af);
}
/*
* Checksum extended TCP header and data.
*/
if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
int sum;
if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
tcpstat_inc(tcps_rcvbadsum);
goto drop;
}
tcpstat_inc(tcps_inswcsum);
switch (af) {
case AF_INET:
sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
break;
#ifdef INET6
case AF_INET6:
sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
tlen);
break;
#endif
}
if (sum != 0) {
tcpstat_inc(tcps_rcvbadsum);
goto drop;
}
}
/*
* Check that TCP offset makes sense,
* pull out TCP options and adjust length. XXX
*/
off = th->th_off << 2;
if (off < sizeof(struct tcphdr) || off > tlen) {
tcpstat_inc(tcps_rcvbadoff);
goto drop;
}
tlen -= off;
if (off > sizeof(struct tcphdr)) {
IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
if (!th) {
tcpstat_inc(tcps_rcvshort);
return IPPROTO_DONE;
}
optlen = off - sizeof(struct tcphdr);
optp = (u_int8_t *)(th + 1);
/*
* Do quick retrieval of timestamp options ("options
* prediction?"). If timestamp is the only option and it's
* formatted as recommended in RFC 1323 appendix A, we
* quickly get the values now and not bother calling
* tcp_dooptions(), etc.
*/
if ((optlen == TCPOLEN_TSTAMP_APPA ||
(optlen > TCPOLEN_TSTAMP_APPA &&
optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
*(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
(th->th_flags & TH_SYN) == 0) {
opti.ts_present = 1;
opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
optp = NULL; /* we've parsed the options */
}
}
tiflags = th->th_flags;
/*
* Convert TCP protocol specific fields to host format.
*/
th->th_seq = ntohl(th->th_seq);
th->th_ack = ntohl(th->th_ack);
th->th_win = ntohs(th->th_win);
th->th_urp = ntohs(th->th_urp);
/*
* Locate pcb for segment.
*/
#if NPF > 0
inp = pf_inp_lookup(m);
#endif
findpcb:
if (inp == NULL) {
switch (af) {
#ifdef INET6
case AF_INET6:
inp = in6_pcblookup(&tcbtable, &ip6->ip6_src,
th->th_sport, &ip6->ip6_dst, th->th_dport,
m->m_pkthdr.ph_rtableid);
break;
#endif
case AF_INET:
inp = in_pcblookup(&tcbtable, ip->ip_src,
th->th_sport, ip->ip_dst, th->th_dport,
m->m_pkthdr.ph_rtableid);
break;
}
}
if (inp == NULL) {
tcpstat_inc(tcps_pcbhashmiss);
switch (af) {
#ifdef INET6
case AF_INET6:
inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst,
th->th_dport, m, m->m_pkthdr.ph_rtableid);
break;
#endif /* INET6 */
case AF_INET:
inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
th->th_dport, m, m->m_pkthdr.ph_rtableid);
break;
}
/*
* If the state is CLOSED (i.e., TCB does not exist) then
* all data in the incoming segment is discarded.
* If the TCB exists but is in CLOSED state, it is embryonic,
* but should either do a listen or a connect soon.
*/
}
#ifdef IPSEC
if (ipsec_in_use) {
struct m_tag *mtag;
struct tdb *tdb = NULL;
int error;
/* Find most recent IPsec tag */
mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
if (mtag != NULL) {
struct tdb_ident *tdbi;
tdbi = (struct tdb_ident *)(mtag + 1);
tdb = gettdb(tdbi->rdomain, tdbi->spi,
&tdbi->dst, tdbi->proto);
}
error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
tdb, inp, NULL, NULL);
tdb_unref(tdb);
if (error) {
tcpstat_inc(tcps_rcvnosec);
goto drop;
}
}
#endif /* IPSEC */
if (inp == NULL) {
tcpstat_inc(tcps_noport);
goto dropwithreset_ratelim;
}
KASSERT(sotoinpcb(inp->inp_socket) == inp);
KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
soassertlocked(inp->inp_socket);
/* Check the minimum TTL for socket. */
switch (af) {
case AF_INET:
if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
goto drop;
break;
#ifdef INET6
case AF_INET6:
if (inp->inp_ip6_minhlim &&
inp->inp_ip6_minhlim > ip6->ip6_hlim)
goto drop;
break;
#endif
}
tp = intotcpcb(inp);
if (tp == NULL)
goto dropwithreset_ratelim;
if (tp->t_state == TCPS_CLOSED)
goto drop;
/* Unscale the window into a 32-bit value. */
if ((tiflags & TH_SYN) == 0)
tiwin = th->th_win << tp->snd_scale;
else
tiwin = th->th_win;
so = inp->inp_socket;
if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
union syn_cache_sa src;
union syn_cache_sa dst;
bzero(&src, sizeof(src));
bzero(&dst, sizeof(dst));
switch (af) {
case AF_INET:
src.sin.sin_len = sizeof(struct sockaddr_in);
src.sin.sin_family = AF_INET;
src.sin.sin_addr = ip->ip_src;
src.sin.sin_port = th->th_sport;
dst.sin.sin_len = sizeof(struct sockaddr_in);
dst.sin.sin_family = AF_INET;
dst.sin.sin_addr = ip->ip_dst;
dst.sin.sin_port = th->th_dport;
break;
#ifdef INET6
case AF_INET6:
src.sin6.sin6_len = sizeof(struct sockaddr_in6);
src.sin6.sin6_family = AF_INET6;
src.sin6.sin6_addr = ip6->ip6_src;
src.sin6.sin6_port = th->th_sport;
dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
dst.sin6.sin6_family = AF_INET6;
dst.sin6.sin6_addr = ip6->ip6_dst;
dst.sin6.sin6_port = th->th_dport;
break;
#endif /* INET6 */
}
if (so->so_options & SO_DEBUG) {
otp = tp;
ostate = tp->t_state;
switch (af) {
#ifdef INET6
case AF_INET6:
saveti = (caddr_t) &tcp_saveti6;
memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
break;
#endif
case AF_INET:
saveti = (caddr_t) &tcp_saveti;
memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
break;
}
}
if (so->so_options & SO_ACCEPTCONN) {
switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {
case TH_SYN|TH_ACK|TH_RST:
case TH_SYN|TH_RST:
case TH_ACK|TH_RST:
case TH_RST:
syn_cache_reset(&src.sa, &dst.sa, th,
inp->inp_rtableid);
goto drop;
case TH_SYN|TH_ACK:
/*
* Received a SYN,ACK. This should
* never happen while we are in
* LISTEN. Send an RST.
*/
goto badsyn;
case TH_ACK:
so = syn_cache_get(&src.sa, &dst.sa,
th, iphlen, tlen, so, m, now);
if (so == NULL) {
/*
* We don't have a SYN for
* this ACK; send an RST.
*/
goto badsyn;
} else if (so == (struct socket *)(-1)) {
/*
* We were unable to create
* the connection. If the
* 3-way handshake was
* completed, an RST has
* been sent to the peer.
* Since the mbuf might be
* in use for the reply,
* do not free it.
*/
m = *mp = NULL;
goto drop;
} else {
/*
* We have created a
* full-blown connection.
*/
tp = NULL;
in_pcbunref(inp);
inp = in_pcbref(sotoinpcb(so));
tp = intotcpcb(inp);
if (tp == NULL)
goto badsyn; /*XXX*/
}
break;
default:
/*
* None of RST, SYN or ACK was set.
* This is an invalid packet for a
* TCB in LISTEN state. Send a RST.
*/
goto badsyn;
case TH_SYN:
/*
* Received a SYN.
*/
#ifdef INET6
/*
* If deprecated address is forbidden, we do
* not accept SYN to deprecated interface
* address to prevent any new inbound
* connection from getting established.
* When we do not accept the SYN, we send a TCP
* RST with the deprecated source address instead
* of silently dropping the segment. This is a
* compromise: it is much better for the peer to
* receive an RST, and the RST will be the final
* packet of the exchange.
*
* If we do not forbid deprecated addresses, we
* accept the SYN packet. RFC2462 does not
* suggest dropping the SYN in this case.
* A close reading of RFC2462 5.5.4 yields the
* following:
* 1. use of deprecated addr with existing
* communication is okay - "SHOULD continue
* to be used"
* 2. use of it with new communication:
* (2a) "SHOULD NOT be used if alternate
* address with sufficient scope is
* available"
* (2b) nothing mentioned otherwise.
* Here we fall into (2b) case as we have no
* choice in our source address selection - we
* must obey the peer.
*
* The wording in RFC2462 is confusing, and
* there are multiple descriptions of
* deprecated address handling - worse, they
* are not exactly the same. I believe 5.5.4
* is the best one, so we follow 5.5.4.
*/
if (ip6 && !ip6_use_deprecated) {
struct in6_ifaddr *ia6;
struct ifnet *ifp =
if_get(m->m_pkthdr.ph_ifidx);
if (ifp &&
(ia6 = in6ifa_ifpwithaddr(ifp,
&ip6->ip6_dst)) &&
(ia6->ia6_flags &
IN6_IFF_DEPRECATED)) {
tp = NULL;
if_put(ifp);
goto dropwithreset;
}
if_put(ifp);
}
#endif
/*
* LISTEN socket received a SYN
* from itself? This can't possibly
* be valid; drop the packet.
*/
if (th->th_dport == th->th_sport) {
switch (af) {
#ifdef INET6
case AF_INET6:
if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
&ip6->ip6_dst)) {
tcpstat_inc(tcps_badsyn);
goto drop;
}
break;
#endif /* INET6 */
case AF_INET:
if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
tcpstat_inc(tcps_badsyn);
goto drop;
}
break;
}
}
/*
* SYN looks ok; create compressed TCP
* state for it.
*/
if (so->so_qlen > so->so_qlimit ||
syn_cache_add(&src.sa, &dst.sa, th, iphlen,
so, m, optp, optlen, &opti, reuse, now)
== -1) {
tcpstat_inc(tcps_dropsyn);
goto drop;
}
in_pcbunref(inp);
return IPPROTO_DONE;
}
}
}
#ifdef DIAGNOSTIC
/*
* Should not happen now that all embryonic connections
* are handled with compressed state.
*/
if (tp->t_state == TCPS_LISTEN)
panic("tcp_input: TCPS_LISTEN");
#endif
#if NPF > 0
pf_inp_link(m, inp);
#endif
/*
* Segment received on connection.
* Reset idle time and keep-alive timer.
*/
tp->t_rcvtime = now;
if (TCPS_HAVEESTABLISHED(tp->t_state))
TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
if (tp->sack_enable)
tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
/*
* Process options.
*/
#ifdef TCP_SIGNATURE
if (optp || (tp->t_flags & TF_SIGNATURE))
#else
if (optp)
#endif
if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
m->m_pkthdr.ph_rtableid, now))
goto drop;
if (opti.ts_present && opti.ts_ecr) {
int rtt_test;
/* subtract out the tcp timestamp modulator */
opti.ts_ecr -= tp->ts_modulate;
/* make sure ts_ecr is sensible */
rtt_test = now - opti.ts_ecr;
if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
opti.ts_ecr = 0;
}
#ifdef TCP_ECN
/* if congestion experienced, set ECE bit in subsequent packets. */
if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
tp->t_flags |= TF_RCVD_CE;
tcpstat_inc(tcps_ecn_rcvce);
}
#endif
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
* no control flags, is in-sequence, the window didn't
* change and we're not retransmitting, it's a
* candidate. If the length is zero and the ack moved
* forward, we're the sender side of the xfer. Just
* free the data acked & wake any higher level process
* that was blocked waiting for space. If the length
* is non-zero and the ack didn't move, we're the
* receiver side. If we're getting packets in-order
* (the reassembly queue is empty), add the data to
* the socket buffer and note that we need a delayed ack.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
#ifdef TCP_ECN
(tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
#else
(tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
(!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
th->th_seq == tp->rcv_nxt &&
tiwin && tiwin == tp->snd_wnd &&
tp->snd_nxt == tp->snd_max) {
/*
* If last ACK falls within this segment's sequence numbers,
* record the timestamp.
* Fix from Braden, see Stevens p. 870
*/
if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
tp->ts_recent_age = now;
tp->ts_recent = opti.ts_val;
}
if (tlen == 0) {
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
tp->t_dupacks == 0) {
/*
* this is a pure ack for outstanding data.
*/
tcpstat_inc(tcps_predack);
if (opti.ts_present && opti.ts_ecr)
tcp_xmit_timer(tp, now - opti.ts_ecr);
else if (tp->t_rtttime &&
SEQ_GT(th->th_ack, tp->t_rtseq))
tcp_xmit_timer(tp, now - tp->t_rtttime);
acked = th->th_ack - tp->snd_una;
tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
acked);
tp->t_rcvacktime = now;
ND6_HINT(tp);
sbdrop(so, &so->so_snd, acked);
/*
* If we had a pending ICMP message that
* refers to data that have just been
* acknowledged, disregard the recorded ICMP
* message.
*/
if ((tp->t_flags & TF_PMTUD_PEND) &&
SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
tp->t_flags &= ~TF_PMTUD_PEND;
/*
* Keep track of the largest chunk of data
* acknowledged since last PMTU update
*/
if (tp->t_pmtud_mss_acked < acked)
tp->t_pmtud_mss_acked = acked;
tp->snd_una = th->th_ack;
/* Pull snd_wl2 up to prevent seq wrap. */
tp->snd_wl2 = th->th_ack;
/*
* We want snd_last to track snd_una so
* as to avoid sequence wraparound problems
* for very large transfers.
*/
#ifdef TCP_ECN
if (SEQ_GT(tp->snd_una, tp->snd_last))
#endif
tp->snd_last = tp->snd_una;
m_freem(m);
/*
* If all outstanding data are acked, stop
* retransmit timer, otherwise restart timer
* using current (possibly backed-off) value.
* If process is waiting for space,
* wakeup/selwakeup/signal. If data
* are ready to send, let tcp_output
* decide between more output or persist.
*/
if (tp->snd_una == tp->snd_max)
TCP_TIMER_DISARM(tp, TCPT_REXMT);
else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
tcp_update_sndspace(tp);
if (sb_notify(so, &so->so_snd)) {
tp->t_flags |= TF_BLOCKOUTPUT;
sowwakeup(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
}
if (so->so_snd.sb_cc ||
tp->t_flags & TF_NEEDOUTPUT)
(void) tcp_output(tp);
in_pcbunref(inp);
return IPPROTO_DONE;
}
} else if (th->th_ack == tp->snd_una &&
TAILQ_EMPTY(&tp->t_segq) &&
tlen <= sbspace(so, &so->so_rcv)) {
/*
* This is a pure, in-sequence data packet
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
/* Clean receiver SACK report if present */
if (tp->sack_enable && tp->rcv_numsacks)
tcp_clean_sackreport(tp);
tcpstat_inc(tcps_preddat);
tp->rcv_nxt += tlen;
/* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
tp->snd_wl1 = th->th_seq;
/* Packet has most recent segment, no urgent exists. */
tp->rcv_up = tp->rcv_nxt;
tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
ND6_HINT(tp);
TCP_SETUP_ACK(tp, tiflags, m);
/*
* Drop TCP, IP headers and TCP options then add data
* to socket buffer.
*/
if (so->so_state & SS_CANTRCVMORE)
m_freem(m);
else {
if (opti.ts_present && opti.ts_ecr) {
if (tp->rfbuf_ts < opti.ts_ecr &&
opti.ts_ecr - tp->rfbuf_ts < hz) {
tcp_update_rcvspace(tp);
/* Start over with next RTT. */
tp->rfbuf_cnt = 0;
tp->rfbuf_ts = 0;
} else
tp->rfbuf_cnt += tlen;
}
m_adj(m, iphlen + off);
sbappendstream(so, &so->so_rcv, m);
}
tp->t_flags |= TF_BLOCKOUTPUT;
sorwakeup(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
(void) tcp_output(tp);
in_pcbunref(inp);
return IPPROTO_DONE;
}
}
/*
* Compute mbuf offset to TCP data segment.
*/
hdroptlen = iphlen + off;
/*
* Calculate amount of space in receive window,
* and then do TCP input processing.
* Receive window is amount of space in rcv queue,
* but not less than advertised window.
*/
{ int win;
win = sbspace(so, &so->so_rcv);
if (win < 0)
win = 0;
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
}
/* Reset receive buffer auto scaling when not in bulk receive mode. */
tp->rfbuf_cnt = 0;
tp->rfbuf_ts = 0;
switch (tp->t_state) {
/*
* If the state is SYN_RECEIVED:
* if seg contains SYN/ACK, send an RST.
* if seg contains an ACK, but not for our SYN/ACK, send an RST
*/
case TCPS_SYN_RECEIVED:
if (tiflags & TH_ACK) {
if (tiflags & TH_SYN) {
tcpstat_inc(tcps_badsyn);
goto dropwithreset;
}
if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))
goto dropwithreset;
}
break;
/*
* If the state is SYN_SENT:
* if seg contains an ACK, but not for our SYN, drop the input.
* if seg contains a RST, then drop the connection.
* if seg does not contain SYN, then drop it.
* Otherwise this is an acceptable SYN segment
* initialize tp->rcv_nxt and tp->irs
* if seg contains ack then advance tp->snd_una
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state
* arrange for segment to be acked (eventually)
* continue processing rest of data/controls, beginning with URG
*/
case TCPS_SYN_SENT:
if ((tiflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max)))
goto dropwithreset;
if (tiflags & TH_RST) {
#ifdef TCP_ECN
/* if ECN is enabled, fall back to non-ecn at rexmit */
if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
goto drop;
#endif
if (tiflags & TH_ACK)
tp = tcp_drop(tp, ECONNREFUSED);
goto drop;
}
if ((tiflags & TH_SYN) == 0)
goto drop;
if (tiflags & TH_ACK) {
tp->snd_una = th->th_ack;
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
}
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->irs = th->th_seq;
tcp_mss(tp, opti.maxseg);
/* Reset initial window to 1 segment for retransmit */
if (tp->t_rxtshift > 0)
tp->snd_cwnd = tp->t_maxseg;
tcp_rcvseqinit(tp);
tp->t_flags |= TF_ACKNOW;
/*
* If we've sent a SACK_PERMITTED option, and the peer
* also replied with one, then TF_SACK_PERMIT should have
* been set in tcp_dooptions(). If it was not, disable SACKs.
*/
if (tp->sack_enable)
tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
#ifdef TCP_ECN
/*
* if ECE is set but CWR is not set for SYN-ACK, or
* both ECE and CWR are set for simultaneous open,
* peer is ECN capable.
*/
if (tcp_do_ecn) {
switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
case TH_ACK|TH_ECE:
case TH_ECE|TH_CWR:
tp->t_flags |= TF_ECN_PERMIT;
tiflags &= ~(TH_ECE|TH_CWR);
tcpstat_inc(tcps_ecn_accepts);
}
}
#endif
if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
tcpstat_inc(tcps_connects);
tp->t_flags |= TF_BLOCKOUTPUT;
soisconnected(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
tp->t_state = TCPS_ESTABLISHED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
/* Do window scaling on this connection? */
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
tp->snd_scale = tp->requested_s_scale;
tp->rcv_scale = tp->request_r_scale;
}
tcp_flush_queue(tp);
/*
* if we didn't have to retransmit the SYN,
* use its rtt as our initial srtt & rtt var.
*/
if (tp->t_rtttime)
tcp_xmit_timer(tp, now - tp->t_rtttime);
/*
* Since new data was acked (the SYN), open the
* congestion window by one MSS. We do this
* here, because we won't go through the normal
* ACK processing below. And since this is the
* start of the connection, we know we are in
* the exponential phase of slow-start.
*/
tp->snd_cwnd += tp->t_maxseg;
} else
tp->t_state = TCPS_SYN_RECEIVED;
#if 0
trimthenstep6:
#endif
/*
* Advance th->th_seq to correspond to first data byte.
* If data, trim to stay within window,
* dropping FIN if necessary.
*/
th->th_seq++;
if (tlen > tp->rcv_wnd) {
todrop = tlen - tp->rcv_wnd;
m_adj(m, -todrop);
tlen = tp->rcv_wnd;
tiflags &= ~TH_FIN;
tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
todrop);
}
tp->snd_wl1 = th->th_seq - 1;
tp->rcv_up = th->th_seq;
goto step6;
/*
* If a new connection request is received while in TIME_WAIT,
* drop the old connection and start over if the
* timestamp or the sequence numbers are above the previous
* ones.
*/
case TCPS_TIME_WAIT:
if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
((opti.ts_present &&
TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
SEQ_GT(th->th_seq, tp->rcv_nxt))) {
#if NPF > 0
/*
* The socket will be recreated but the new state
* has already been linked to the socket. Remove the
* link between old socket and new state.
*/
pf_inp_unlink(inp);
#endif
/*
* Advance the iss by at least 32768, but
* clear the msb in order to make sure
* that SEQ_LT(snd_nxt, iss).
*/
iss = tp->snd_nxt +
((arc4random() & 0x7fffffff) | 0x8000);
reuse = &iss;
tp = tcp_close(tp);
in_pcbunref(inp);
inp = NULL;
goto findpcb;
}
}
/*
* States other than LISTEN or SYN_SENT.
* First check timestamp, if present.
* Then check that at least some bytes of segment are within
* receive window. If segment begins before rcv_nxt,
* drop leading data (and SYN); if nothing left, just ack.
*
* RFC 1323 PAWS: If we have a timestamp reply on this segment
* and it's less than opti.ts_recent, drop it.
*/
if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
TSTMP_LT(opti.ts_val, tp->ts_recent)) {
/* Check to see if ts_recent is over 24 days old. */
if ((int)(now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
/*
* Invalidate ts_recent. If this segment updates
* ts_recent, the age will be reset later and ts_recent
* will get a valid value. If it does not, setting
* ts_recent to zero will at least satisfy the
* requirement that zero be placed in the timestamp
* echo reply when ts_recent isn't valid. The
* age isn't reset until we get a valid ts_recent
* because we don't want out-of-order segments to be
* dropped when ts_recent is old.
*/
tp->ts_recent = 0;
} else {
tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
tcpstat_inc(tcps_pawsdrop);
if (tlen)
goto dropafterack;
goto drop;
}
}
todrop = tp->rcv_nxt - th->th_seq;
if (todrop > 0) {
if (tiflags & TH_SYN) {
tiflags &= ~TH_SYN;
th->th_seq++;
if (th->th_urp > 1)
th->th_urp--;
else
tiflags &= ~TH_URG;
todrop--;
}
if (todrop > tlen ||
(todrop == tlen && (tiflags & TH_FIN) == 0)) {
/*
* Any valid FIN must be to the left of the
* window. At this point, FIN must be a
* duplicate or out-of-sequence, so drop it.
*/
tiflags &= ~TH_FIN;
/*
* Send ACK to resynchronize, and drop any data,
* but keep on processing for RST or ACK.
*/
tp->t_flags |= TF_ACKNOW;
todrop = tlen;
tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
} else {
tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
todrop);
}
hdroptlen += todrop; /* drop from head afterwards */
th->th_seq += todrop;
tlen -= todrop;
if (th->th_urp > todrop)
th->th_urp -= todrop;
else {
tiflags &= ~TH_URG;
th->th_urp = 0;
}
}
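/*
 * Worked example (illustrative numbers): with rcv_nxt = 1000, a
 * retransmitted segment with th_seq = 900 and tlen = 300 gives
 * todrop = 100; the first 100 bytes are counted as partially
 * duplicate, hdroptlen grows by 100, th_seq becomes 1000 and tlen
 * shrinks to 200, so only the genuinely new bytes are processed below.
 */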
/*
* If new data are received on a connection after the
* user processes are gone, then RST the other end.
*/
if ((so->so_state & SS_NOFDREF) &&
tp->t_state > TCPS_CLOSE_WAIT && tlen) {
tp = tcp_close(tp);
tcpstat_inc(tcps_rcvafterclose);
goto dropwithreset;
}
/*
* If segment ends after window, drop trailing data
* (and PUSH and FIN); if nothing left, just ACK.
*/
todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
if (todrop > 0) {
tcpstat_inc(tcps_rcvpackafterwin);
if (todrop >= tlen) {
tcpstat_add(tcps_rcvbyteafterwin, tlen);
/*
* If window is closed can only take segments at
* window edge, and have to drop data and PUSH from
* incoming segments. Continue processing, but
* remember to ack. Otherwise, drop segment
* and ack.
*/
if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
tp->t_flags |= TF_ACKNOW;
tcpstat_inc(tcps_rcvwinprobe);
} else
goto dropafterack;
} else
tcpstat_add(tcps_rcvbyteafterwin, todrop);
m_adj(m, -todrop);
tlen -= todrop;
tiflags &= ~(TH_PUSH|TH_FIN);
}
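/*
 * Worked example (illustrative numbers): with rcv_nxt = 1000 and
 * rcv_wnd = 2000 the window edge is 3000; a segment with
 * th_seq = 2500 and tlen = 1000 extends to 3500, so todrop = 500,
 * the trailing 500 bytes (and any PUSH/FIN) are trimmed and tlen
 * becomes 500.
 */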
/*
* If last ACK falls within this segment's sequence numbers,
* record its timestamp if it's more recent.
* NOTE that the test is modified according to the latest
* proposal of the tcplw@cray.com list (Braden 1993/04/26).
*/
if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
tp->ts_recent_age = now;
tp->ts_recent = opti.ts_val;
}
/*
* If the RST bit is set examine the state:
* SYN_RECEIVED STATE:
* If passive open, return to LISTEN state.
* If active open, inform user that connection was refused.
* ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
* Inform user that connection was reset, and close tcb.
* CLOSING, LAST_ACK, TIME_WAIT STATES
* Close the tcb.
*/
if (tiflags & TH_RST) {
if (th->th_seq != tp->last_ack_sent &&
th->th_seq != tp->rcv_nxt &&
th->th_seq != (tp->rcv_nxt + 1))
goto drop;
switch (tp->t_state) {
case TCPS_SYN_RECEIVED:
#ifdef TCP_ECN
/* if ECN is enabled, fall back to non-ecn at rexmit */
if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
goto drop;
#endif
so->so_error = ECONNREFUSED;
goto close;
case TCPS_ESTABLISHED:
case TCPS_FIN_WAIT_1:
case TCPS_FIN_WAIT_2:
case TCPS_CLOSE_WAIT:
so->so_error = ECONNRESET;
close:
tp->t_state = TCPS_CLOSED;
tcpstat_inc(tcps_drops);
tp = tcp_close(tp);
goto drop;
case TCPS_CLOSING:
case TCPS_LAST_ACK:
case TCPS_TIME_WAIT:
tp = tcp_close(tp);
goto drop;
}
}
/*
* If a SYN is in the window, then this is an
* error and we ACK and drop the packet.
*/
if (tiflags & TH_SYN)
goto dropafterack_ratelim;
/*
* If the ACK bit is off we drop the segment and return.
*/
if ((tiflags & TH_ACK) == 0) {
if (tp->t_flags & TF_ACKNOW)
goto dropafterack;
else
goto drop;
}
/*
* Ack processing.
*/
switch (tp->t_state) {
/*
* In SYN_RECEIVED state, the ack ACKs our SYN, so enter
* ESTABLISHED state and continue processing.
* The ACK was checked above.
*/
case TCPS_SYN_RECEIVED:
tcpstat_inc(tcps_connects);
tp->t_flags |= TF_BLOCKOUTPUT;
soisconnected(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
tp->t_state = TCPS_ESTABLISHED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
/* Do window scaling? */
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
tp->snd_scale = tp->requested_s_scale;
tp->rcv_scale = tp->request_r_scale;
tiwin = th->th_win << tp->snd_scale;
}
tcp_flush_queue(tp);
tp->snd_wl1 = th->th_seq - 1;
/* fall into ... */
/*
* In ESTABLISHED state: drop duplicate ACKs; ACK out of range
* ACKs. If the ack is in the range
* tp->snd_una < th->th_ack <= tp->snd_max
* then advance tp->snd_una to th->th_ack and drop
* data from the retransmission queue. If this ACK reflects
* more up to date window information we update our window information.
*/
case TCPS_ESTABLISHED:
case TCPS_FIN_WAIT_1:
case TCPS_FIN_WAIT_2:
case TCPS_CLOSE_WAIT:
case TCPS_CLOSING:
case TCPS_LAST_ACK:
case TCPS_TIME_WAIT:
#ifdef TCP_ECN
/*
* if we receive ECE and are not already in recovery phase,
* reduce cwnd by half but don't slow-start.
* advance snd_last to snd_max not to reduce cwnd again
* until all outstanding packets are acked.
*/
if (tcp_do_ecn && (tiflags & TH_ECE)) {
if ((tp->t_flags & TF_ECN_PERMIT) &&
SEQ_GEQ(tp->snd_una, tp->snd_last)) {
u_int win;
win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
if (win > 1) {
tp->snd_ssthresh = win / 2 * tp->t_maxseg;
tp->snd_cwnd = tp->snd_ssthresh;
tp->snd_last = tp->snd_max;
tp->t_flags |= TF_SEND_CWR;
tcpstat_inc(tcps_cwr_ecn);
}
}
tcpstat_inc(tcps_ecn_rcvece);
}
/*
* if we receive CWR, we know that the peer has reduced
* its congestion window. stop sending ecn-echo.
*/
if ((tiflags & TH_CWR)) {
tp->t_flags &= ~TF_RCVD_CE;
tcpstat_inc(tcps_ecn_rcvcwr);
}
#endif /* TCP_ECN */
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
/*
* Duplicate/old ACK processing.
* Increments t_dupacks:
* Pure duplicate (same seq/ack/window, no data)
* Doesn't affect t_dupacks:
* Data packets.
* Normal window updates (window opens)
* Resets t_dupacks:
* New data ACKed.
* Window shrinks
* Old ACK
*/
if (tlen) {
/* Drop very old ACKs unless th_seq matches */
if (th->th_seq != tp->rcv_nxt &&
SEQ_LT(th->th_ack,
tp->snd_una - tp->max_sndwnd)) {
tcpstat_inc(tcps_rcvacktooold);
goto drop;
}
break;
}
/*
* If we get an old ACK, there is probably packet
* reordering going on. Be conservative and reset
* t_dupacks so that we are less aggressive in
* doing a fast retransmit.
*/
if (th->th_ack != tp->snd_una) {
tp->t_dupacks = 0;
break;
}
if (tiwin == tp->snd_wnd) {
tcpstat_inc(tcps_rcvdupack);
/*
* If we have outstanding data (other than
* a window probe), this is a completely
* duplicate ack (ie, window info didn't
* change), the ack is the biggest we've
* seen and we've seen exactly our rexmt
* threshold of them, assume a packet
* has been dropped and retransmit it.
* Kludge snd_nxt & the congestion
* window so we send only this one
* packet.
*
* We know we're losing at the current
* window size so do congestion avoidance
* (set ssthresh to half the current window
* and pull our congestion window back to
* the new ssthresh).
*
* Dup acks mean that packets have left the
* network (they're now cached at the receiver)
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
* network.
*/
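/*
 * Worked example (illustrative numbers, assuming the usual threshold
 * of three duplicate ACKs in tcprexmtthresh): with
 * snd_wnd = snd_cwnd = 16 * t_maxseg, the third duplicate ACK sets
 * snd_ssthresh to 8 segments.  With SACK, tcp_output() retransmits
 * the oldest missing segment and cwnd is set to ssthresh plus 3
 * segments; without SACK, snd_nxt is pulled back to th_ack, one
 * segment is sent with cwnd = 1 segment, cwnd is then re-inflated to
 * ssthresh plus 3 segments and snd_nxt is restored.
 */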
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
tp->t_dupacks = 0;
else if (++tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
u_long win =
ulmin(tp->snd_wnd, tp->snd_cwnd) /
2 / tp->t_maxseg;
if (SEQ_LT(th->th_ack, tp->snd_last)){
/*
* False fast retx after
* timeout. Do not cut window.
*/
tp->t_dupacks = 0;
goto drop;
}
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_maxseg;
tp->snd_last = tp->snd_max;
if (tp->sack_enable) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
#ifdef TCP_ECN
tp->t_flags |= TF_SEND_CWR;
#endif
tcpstat_inc(tcps_cwr_frecovery);
tcpstat_inc(tcps_sack_recovery_episode);
/*
* tcp_output() will send
* oldest SACK-eligible rtx.
*/
(void) tcp_output(tp);
tp->snd_cwnd = tp->snd_ssthresh+
tp->t_maxseg * tp->t_dupacks;
goto drop;
}
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
#ifdef TCP_ECN
tp->t_flags |= TF_SEND_CWR;
#endif
tcpstat_inc(tcps_cwr_frecovery);
tcpstat_inc(tcps_sndrexmitfast);
(void) tcp_output(tp);
tp->snd_cwnd = tp->snd_ssthresh +
tp->t_maxseg * tp->t_dupacks;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
goto drop;
} else if (tp->t_dupacks > tcprexmtthresh) {
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
goto drop;
}
} else if (tiwin < tp->snd_wnd) {
/*
* The window was retracted! Previous dup
* ACKs may have been due to packets arriving
* after the shrunken window, not a missing
* packet, so play it safe and reset t_dupacks
*/
tp->t_dupacks = 0;
}
break;
}
/*
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
if (tp->t_dupacks >= tcprexmtthresh) {
/* Check for a partial ACK */
if (SEQ_LT(th->th_ack, tp->snd_last)) {
if (tp->sack_enable)
tcp_sack_partialack(tp, th);
else
tcp_newreno_partialack(tp, th);
} else {
/* Out of fast recovery */
tp->snd_cwnd = tp->snd_ssthresh;
if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
tp->snd_ssthresh)
tp->snd_cwnd =
tcp_seq_subtract(tp->snd_max,
th->th_ack);
tp->t_dupacks = 0;
}
} else {
/*
* Reset the duplicate ACK counter if we
* were not in fast recovery.
*/
tp->t_dupacks = 0;
}
if (SEQ_GT(th->th_ack, tp->snd_max)) {
tcpstat_inc(tcps_rcvacktoomuch);
goto dropafterack_ratelim;
}
acked = th->th_ack - tp->snd_una;
tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
tp->t_rcvacktime = now;
/*
* If we have a timestamp reply, update smoothed
* round trip time. If no timestamp is present but
* transmit timer is running and timed sequence
* number was acked, update smoothed round trip time.
* Since we now have an rtt measurement, cancel the
* timer backoff (cf., Phil Karn's retransmit alg.).
* Recompute the initial retransmit timer.
*/
if (opti.ts_present && opti.ts_ecr)
tcp_xmit_timer(tp, now - opti.ts_ecr);
else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
tcp_xmit_timer(tp, now - tp->t_rtttime);
/*
* If all outstanding data is acked, stop retransmit
* timer and remember to restart (more output or persist).
* If there is more data to be acked, restart retransmit
* timer, using current (possibly backed-off) value.
*/
if (th->th_ack == tp->snd_max) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_flags |= TF_NEEDOUTPUT;
} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
/*
* When new data is acked, open the congestion window.
* If the window gives us less than ssthresh packets
* in flight, open exponentially (maxseg per packet).
* Otherwise open linearly: maxseg per window
* (maxseg^2 / cwnd per packet).
*/
{
u_int cw = tp->snd_cwnd;
u_int incr = tp->t_maxseg;
if (cw > tp->snd_ssthresh)
incr = max(incr * incr / cw, 1);
if (tp->t_dupacks < tcprexmtthresh)
tp->snd_cwnd = ulmin(cw + incr,
TCP_MAXWIN << tp->snd_scale);
}
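/*
 * Worked example (illustrative numbers): with t_maxseg = 1460, each
 * ACK received while cw <= snd_ssthresh grows the window by a full
 * segment (exponential growth per RTT).  Once cw = 14600 is above
 * ssthresh, incr becomes 1460 * 1460 / 14600 = 146 bytes per ACK,
 * i.e. roughly one additional segment per window (linear growth).
 */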
ND6_HINT(tp);
if (acked > so->so_snd.sb_cc) {
if (tp->snd_wnd > so->so_snd.sb_cc)
tp->snd_wnd -= so->so_snd.sb_cc;
else
tp->snd_wnd = 0;
sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
ourfinisacked = 1;
} else {
sbdrop(so, &so->so_snd, acked);
if (tp->snd_wnd > acked)
tp->snd_wnd -= acked;
else
tp->snd_wnd = 0;
ourfinisacked = 0;
}
tcp_update_sndspace(tp);
if (sb_notify(so, &so->so_snd)) {
tp->t_flags |= TF_BLOCKOUTPUT;
sowwakeup(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
}
/*
* If we had a pending ICMP message that referred to data
* that have just been acknowledged, disregard the recorded
* ICMP message.
*/
if ((tp->t_flags & TF_PMTUD_PEND) &&
SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
tp->t_flags &= ~TF_PMTUD_PEND;
/*
* Keep track of the largest chunk of data acknowledged
* since last PMTU update
*/
if (tp->t_pmtud_mss_acked < acked)
tp->t_pmtud_mss_acked = acked;
tp->snd_una = th->th_ack;
#ifdef TCP_ECN
/* sync snd_last with snd_una */
if (SEQ_GT(tp->snd_una, tp->snd_last))
tp->snd_last = tp->snd_una;
#endif
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
switch (tp->t_state) {
/*
* In FIN_WAIT_1 STATE in addition to the processing
* for the ESTABLISHED state if our FIN is now acknowledged
* then enter FIN_WAIT_2.
*/
case TCPS_FIN_WAIT_1:
if (ourfinisacked) {
/*
* If we can't receive any more
* data, then closing user can proceed.
* Starting the timer is contrary to the
* specification, but if we don't get a FIN
* we'll hang forever.
*/
if (so->so_state & SS_CANTRCVMORE) {
tp->t_flags |= TF_BLOCKOUTPUT;
soisdisconnected(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
}
tp->t_state = TCPS_FIN_WAIT_2;
}
break;
/*
* In CLOSING STATE in addition to the processing for
* the ESTABLISHED state if the ACK acknowledges our FIN
* then enter the TIME-WAIT state, otherwise ignore
* the segment.
*/
case TCPS_CLOSING:
if (ourfinisacked) {
tp->t_state = TCPS_TIME_WAIT;
tcp_canceltimers(tp);
TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
tp->t_flags |= TF_BLOCKOUTPUT;
soisdisconnected(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
}
break;
/*
* In LAST_ACK, we may still be waiting for data to drain
* and/or to be acked, as well as for the ack of our FIN.
* If our FIN is now acknowledged, delete the TCB,
* enter the closed state and return.
*/
case TCPS_LAST_ACK:
if (ourfinisacked) {
tp = tcp_close(tp);
goto drop;
}
break;
/*
* In TIME_WAIT state the only thing that should arrive
* is a retransmission of the remote FIN. Acknowledge
* it and restart the finack timer.
*/
case TCPS_TIME_WAIT:
TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
goto dropafterack;
}
}
step6:
/*
* Update window information.
* Don't look at window if no ACK: TAC's send garbage on first SYN.
*/
if ((tiflags & TH_ACK) &&
(SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
(SEQ_LT(tp->snd_wl2, th->th_ack) ||
(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
/* keep track of pure window updates */
if (tlen == 0 &&
tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
tcpstat_inc(tcps_rcvwinupd);
tp->snd_wnd = tiwin;
tp->snd_wl1 = th->th_seq;
tp->snd_wl2 = th->th_ack;
if (tp->snd_wnd > tp->max_sndwnd)
tp->max_sndwnd = tp->snd_wnd;
tp->t_flags |= TF_NEEDOUTPUT;
}
/*
* Process segments with URG.
*/
if ((tiflags & TH_URG) && th->th_urp &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
/*
* This is a kludge, but if we receive and accept
* random urgent pointers, we'll crash in
* soreceive. It's hard to imagine someone
* actually wanting to send this much urgent data.
*/
if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
th->th_urp = 0; /* XXX */
tiflags &= ~TH_URG; /* XXX */
goto dodata; /* XXX */
}
/*
* If this segment advances the known urgent pointer,
* then mark the data stream. This should not happen
* in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
* a FIN has been received from the remote side.
* In these states we ignore the URG.
*
* According to RFC961 (Assigned Protocols),
* the urgent pointer points to the last octet
* of urgent data. We continue, however,
* to consider it to indicate the first octet
* of data past the urgent section as the original
* spec states (in one of two places).
*/
if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
tp->rcv_up = th->th_seq + th->th_urp;
so->so_oobmark = so->so_rcv.sb_cc +
(tp->rcv_up - tp->rcv_nxt) - 1;
if (so->so_oobmark == 0)
so->so_state |= SS_RCVATMARK;
sohasoutofband(so);
tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
}
/*
* Remove out of band data so doesn't get presented to user.
* This can happen independent of advancing the URG pointer,
* but if two URG's are pending at once, some out-of-band
* data may creep in... ick.
*/
if (th->th_urp <= (u_int16_t) tlen &&
(so->so_options & SO_OOBINLINE) == 0)
tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
} else
/*
* If no out of band data is expected,
* pull receive urgent pointer along
* with the receive window.
*/
if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
tp->rcv_up = tp->rcv_nxt;
dodata: /* XXX */
/*
* Process the segment text, merging it into the TCP sequencing queue,
* and arranging for acknowledgment of receipt if necessary.
* This process logically involves adjusting tp->rcv_wnd as data
* is presented to the user (this happens in tcp_usrreq.c,
* case PRU_RCVD). If a FIN has already been received on this
* connection then we just ignore the text.
*/
if ((tlen || (tiflags & TH_FIN)) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq laststart = th->th_seq;
tcp_seq lastend = th->th_seq + tlen;
if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
tp->t_state == TCPS_ESTABLISHED) {
TCP_SETUP_ACK(tp, tiflags, m);
tp->rcv_nxt += tlen;
tiflags = th->th_flags & TH_FIN;
tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
ND6_HINT(tp);
if (so->so_state & SS_CANTRCVMORE)
m_freem(m);
else {
m_adj(m, hdroptlen);
sbappendstream(so, &so->so_rcv, m);
}
tp->t_flags |= TF_BLOCKOUTPUT;
sorwakeup(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
} else {
m_adj(m, hdroptlen);
tiflags = tcp_reass(tp, th, m, &tlen);
tp->t_flags |= TF_ACKNOW;
}
if (tp->sack_enable)
tcp_update_sack_list(tp, laststart, lastend);
/*
* variable len never referenced again in modern BSD,
* so why bother computing it ??
*/
#if 0
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
* buffer size.
*/
len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif /* 0 */
} else {
m_freem(m);
tiflags &= ~TH_FIN;
}
/*
* If FIN is received ACK the FIN and let the user know
* that the connection is closing. Ignore a FIN received before
* the connection is fully established.
*/
if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tp->t_flags |= TF_BLOCKOUTPUT;
socantrcvmore(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
tp->t_flags |= TF_ACKNOW;
tp->rcv_nxt++;
}
switch (tp->t_state) {
/*
* In ESTABLISHED STATE enter the CLOSE_WAIT state.
*/
case TCPS_ESTABLISHED:
tp->t_state = TCPS_CLOSE_WAIT;
break;
/*
* If still in FIN_WAIT_1 STATE FIN has not been acked so
* enter the CLOSING state.
*/
case TCPS_FIN_WAIT_1:
tp->t_state = TCPS_CLOSING;
break;
/*
* In FIN_WAIT_2 state enter the TIME_WAIT state,
* starting the time-wait timer, turning off the other
* standard timers.
*/
case TCPS_FIN_WAIT_2:
tp->t_state = TCPS_TIME_WAIT;
tcp_canceltimers(tp);
TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
tp->t_flags |= TF_BLOCKOUTPUT;
soisdisconnected(so);
tp->t_flags &= ~TF_BLOCKOUTPUT;
break;
/*
* In TIME_WAIT state restart the 2 MSL time_wait timer.
*/
case TCPS_TIME_WAIT:
TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
break;
}
}
if (otp)
tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen);
/*
* Return any desired output.
*/
if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
(void) tcp_output(tp);
in_pcbunref(inp);
return IPPROTO_DONE;
badsyn:
/*
* Received a bad SYN. Increment counters and dropwithreset.
*/
tcpstat_inc(tcps_badsyn);
tp = NULL;
goto dropwithreset;
dropafterack_ratelim:
if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
tcp_ackdrop_ppslim) == 0) {
/* XXX stat */
goto drop;
}
/* ...fall into dropafterack... */
dropafterack:
/*
* Generate an ACK dropping incoming segment if it occupies
* sequence space, where the ACK reflects our state.
*/
if (tiflags & TH_RST)
goto drop;
m_freem(m);
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
in_pcbunref(inp);
return IPPROTO_DONE;
dropwithreset_ratelim:
/*
* We may want to rate-limit RSTs in certain situations,
* particularly if we are sending an RST in response to
* an attempt to connect to or otherwise communicate with
* a port for which we have no socket.
*/
if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
tcp_rst_ppslim) == 0) {
/* XXX stat */
goto drop;
}
/* ...fall into dropwithreset... */
dropwithreset:
/*
* Generate a RST, dropping incoming segment.
* Make ACK acceptable to originator of segment.
* Don't bother to respond to RST.
*/
if (tiflags & TH_RST)
goto drop;
if (tiflags & TH_ACK) {
tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack,
TH_RST, m->m_pkthdr.ph_rtableid, now);
} else {
if (tiflags & TH_SYN)
tlen++;
tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen,
(tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now);
}
m_freem(m);
in_pcbunref(inp);
return IPPROTO_DONE;
drop:
/*
* Drop space held by incoming segment and return.
*/
if (otp)
tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen);
m_freem(m);
in_pcbunref(inp);
return IPPROTO_DONE;
}
int
tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
struct mbuf *m, int iphlen, struct tcp_opt_info *oi,
u_int rtableid, uint32_t now)
{
u_int16_t mss = 0;
int opt, optlen;
#ifdef TCP_SIGNATURE
caddr_t sigp = NULL;
struct tdb *tdb = NULL;
#endif /* TCP_SIGNATURE */
for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == TCPOPT_EOL)
break;
if (opt == TCPOPT_NOP)
optlen = 1;
else {
if (cnt < 2)
break;
optlen = cp[1];
if (optlen < 2 || optlen > cnt)
break;
}
switch (opt) {
default:
continue;
case TCPOPT_MAXSEG:
if (optlen != TCPOLEN_MAXSEG)
continue;
if (!(th->th_flags & TH_SYN))
continue;
if (TCPS_HAVERCVDSYN(tp->t_state))
continue;
memcpy(&mss, cp + 2, sizeof(mss));
mss = ntohs(mss);
oi->maxseg = mss;
break;
case TCPOPT_WINDOW:
if (optlen != TCPOLEN_WINDOW)
continue;
if (!(th->th_flags & TH_SYN))
continue;
if (TCPS_HAVERCVDSYN(tp->t_state))
continue;
tp->t_flags |= TF_RCVD_SCALE;
tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
break;
case TCPOPT_TIMESTAMP:
if (optlen != TCPOLEN_TIMESTAMP)
continue;
oi->ts_present = 1;
memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val));
oi->ts_val = ntohl(oi->ts_val);
memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr));
oi->ts_ecr = ntohl(oi->ts_ecr);
if (!(th->th_flags & TH_SYN))
continue;
if (TCPS_HAVERCVDSYN(tp->t_state))
continue;
/*
* A timestamp received in a SYN makes
* it ok to send timestamp requests and replies.
*/
tp->t_flags |= TF_RCVD_TSTMP;
tp->ts_recent = oi->ts_val;
tp->ts_recent_age = now;
break;
case TCPOPT_SACK_PERMITTED:
if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED)
continue;
if (!(th->th_flags & TH_SYN))
continue;
if (TCPS_HAVERCVDSYN(tp->t_state))
continue;
/* MUST only be set on SYN */
tp->t_flags |= TF_SACK_PERMIT;
break;
case TCPOPT_SACK:
tcp_sack_option(tp, th, cp, optlen);
break;
#ifdef TCP_SIGNATURE
case TCPOPT_SIGNATURE:
if (optlen != TCPOLEN_SIGNATURE)
continue;
if (sigp && timingsafe_bcmp(sigp, cp + 2, 16))
goto bad;
sigp = cp + 2;
break;
#endif /* TCP_SIGNATURE */
}
}
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE) {
union sockaddr_union src, dst;
memset(&src, 0, sizeof(union sockaddr_union));
memset(&dst, 0, sizeof(union sockaddr_union));
switch (tp->pf) {
case 0:
case AF_INET:
src.sa.sa_len = sizeof(struct sockaddr_in);
src.sa.sa_family = AF_INET;
src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
dst.sa.sa_len = sizeof(struct sockaddr_in);
dst.sa.sa_family = AF_INET;
dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
break;
#ifdef INET6
case AF_INET6:
src.sa.sa_len = sizeof(struct sockaddr_in6);
src.sa.sa_family = AF_INET6;
src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
dst.sa.sa_len = sizeof(struct sockaddr_in6);
dst.sa.sa_family = AF_INET6;
dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
break;
#endif /* INET6 */
}
tdb = gettdbbysrcdst(rtable_l2(rtableid),
0, &src, &dst, IPPROTO_TCP);
/*
* We don't have an SA for this peer, so we turn off
* TF_SIGNATURE on the listen socket
*/
if (tdb == NULL && tp->t_state == TCPS_LISTEN)
tp->t_flags &= ~TF_SIGNATURE;
}
if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
tcpstat_inc(tcps_rcvbadsig);
goto bad;
}
if (sigp) {
char sig[16];
if (tdb == NULL) {
tcpstat_inc(tcps_rcvbadsig);
goto bad;
}
if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
goto bad;
if (timingsafe_bcmp(sig, sigp, 16)) {
tcpstat_inc(tcps_rcvbadsig);
goto bad;
}
tcpstat_inc(tcps_rcvgoodsig);
}
tdb_unref(tdb);
#endif /* TCP_SIGNATURE */
return (0);
#ifdef TCP_SIGNATURE
bad:
tdb_unref(tdb);
#endif /* TCP_SIGNATURE */
return (-1);
}
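/*
 * Worked example: a typical SYN option block walked by the loop in
 * tcp_dooptions() (illustrative values):
 *   02 04 05 b4            MSS 1460        (kind 2, len 4)
 *   01                     NOP             (kind 1)
 *   03 03 06               window scale 6  (kind 3, len 3)
 *   04 02                  SACK permitted  (kind 4, len 2)
 *   08 0a <val> <ecr>      timestamps      (kind 8, len 10)
 * Unknown kinds fall into the default case and are skipped using the
 * length byte; parsing stops at EOL or a malformed length.
 */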
u_long
tcp_seq_subtract(u_long a, u_long b)
{
return ((long)(a - b));
}
/*
* This function is called upon receipt of new valid data (while not in header
* prediction mode), and it updates the ordered list of sacks.
*/
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
tcp_seq rcv_lastend)
{
/*
* First reported block MUST be the most recent one. Subsequent
* blocks SHOULD be in the order in which they arrived at the
* receiver. These two conditions make the implementation fully
* compliant with RFC 2018.
*/
int i, j = 0, count = 0, lastpos = -1;
struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
/* First clean up current list of sacks */
for (i = 0; i < tp->rcv_numsacks; i++) {
sack = tp->sackblks[i];
if (sack.start == 0 && sack.end == 0) {
count++; /* count = number of blocks to be discarded */
continue;
}
if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
tp->sackblks[i].start = tp->sackblks[i].end = 0;
count++;
} else {
temp[j].start = tp->sackblks[i].start;
temp[j++].end = tp->sackblks[i].end;
}
}
tp->rcv_numsacks -= count;
if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
tcp_clean_sackreport(tp);
if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
/* ==> need first sack block */
tp->sackblks[0].start = rcv_laststart;
tp->sackblks[0].end = rcv_lastend;
tp->rcv_numsacks = 1;
}
return;
}
/* Otherwise, sack blocks are already present. */
for (i = 0; i < tp->rcv_numsacks; i++)
tp->sackblks[i] = temp[i]; /* first copy back sack list */
if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
return; /* sack list remains unchanged */
/*
* From here, segment just received should be (part of) the 1st sack.
* Go through list, possibly coalescing sack block entries.
*/
firstsack.start = rcv_laststart;
firstsack.end = rcv_lastend;
for (i = 0; i < tp->rcv_numsacks; i++) {
sack = tp->sackblks[i];
if (SEQ_LT(sack.end, firstsack.start) ||
SEQ_GT(sack.start, firstsack.end))
continue; /* no overlap */
if (sack.start == firstsack.start && sack.end == firstsack.end){
/*
* identical block; delete it here since we will
* move it to the front of the list.
*/
tp->sackblks[i].start = tp->sackblks[i].end = 0;
lastpos = i; /* last posn with a zero entry */
continue;
}
if (SEQ_LEQ(sack.start, firstsack.start))
firstsack.start = sack.start; /* merge blocks */
if (SEQ_GEQ(sack.end, firstsack.end))
firstsack.end = sack.end; /* merge blocks */
tp->sackblks[i].start = tp->sackblks[i].end = 0;
lastpos = i; /* last posn with a zero entry */
}
if (lastpos != -1) { /* at least one merge */
for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
sack = tp->sackblks[i];
if (sack.start == 0 && sack.end == 0)
continue;
temp[j++] = sack;
}
tp->rcv_numsacks = j; /* including first blk (added later) */
for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
tp->sackblks[i] = temp[i];
} else { /* no merges -- shift sacks by 1 */
if (tp->rcv_numsacks < MAX_SACK_BLKS)
tp->rcv_numsacks++;
for (i = tp->rcv_numsacks-1; i > 0; i--)
tp->sackblks[i] = tp->sackblks[i-1];
}
tp->sackblks[0] = firstsack;
return;
}
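/*
 * Worked example (illustrative sequence numbers): with rcv_nxt = 1000,
 * an out-of-order segment covering 2000..2499 produces
 * sackblks[0] = [2000,2500).  If 3000..3499 then arrives, the most
 * recent block is reported first: sackblks[0] = [3000,3500),
 * sackblks[1] = [2000,2500).  A later segment covering 2500..2999
 * overlaps both blocks and they coalesce into a single [2000,3500)
 * entry at the front of the list.
 */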
/*
* Process the TCP SACK option. tp->snd_holes is an ordered list
* of holes (oldest to newest, in terms of the sequence space).
*/
void
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
{
int tmp_olen;
u_char *tmp_cp;
struct sackhole *cur, *p, *temp;
if (!tp->sack_enable)
return;
/* SACK without ACK doesn't make sense. */
if ((th->th_flags & TH_ACK) == 0)
return;
/* Make sure the ACK on this segment is in [snd_una, snd_max]. */
if (SEQ_LT(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))
return;
/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
return;
/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
tmp_cp = cp + 2;
tmp_olen = optlen - 2;
tcpstat_inc(tcps_sack_rcv_opts);
if (tp->snd_numholes < 0)
tp->snd_numholes = 0;
if (tp->t_maxseg == 0)
panic("tcp_sack_option"); /* Should never happen */
while (tmp_olen > 0) {
struct sackblk sack;
memcpy(&sack.start, tmp_cp, sizeof(tcp_seq));
sack.start = ntohl(sack.start);
memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq));
sack.end = ntohl(sack.end);
tmp_olen -= TCPOLEN_SACK;
tmp_cp += TCPOLEN_SACK;
if (SEQ_LEQ(sack.end, sack.start))
continue; /* bad SACK fields */
if (SEQ_LEQ(sack.end, tp->snd_una))
continue; /* old block */
if (SEQ_GT(th->th_ack, tp->snd_una)) {
if (SEQ_LT(sack.start, th->th_ack))
continue;
}
if (SEQ_GT(sack.end, tp->snd_max))
continue;
if (tp->snd_holes == NULL) { /* first hole */
tp->snd_holes = (struct sackhole *)
pool_get(&sackhl_pool, PR_NOWAIT);
if (tp->snd_holes == NULL) {
/* ENOBUFS, so ignore SACKed block for now */
goto dropped;
}
cur = tp->snd_holes;
cur->start = th->th_ack;
cur->end = sack.start;
cur->rxmit = cur->start;
cur->next = NULL;
tp->snd_numholes = 1;
tp->rcv_lastsack = sack.end;
/*
* dups is at least one. If more data has been
* SACKed, it can be greater than one.
*/
cur->dups = min(tcprexmtthresh,
((sack.end - cur->end)/tp->t_maxseg));
if (cur->dups < 1)
cur->dups = 1;
continue; /* with next sack block */
}
/* Go thru list of holes: p = previous, cur = current */
p = cur = tp->snd_holes;
while (cur) {
if (SEQ_LEQ(sack.end, cur->start))
/* SACKs data before the current hole */
break; /* no use going through more holes */
if (SEQ_GEQ(sack.start, cur->end)) {
/* SACKs data beyond the current hole */
cur->dups++;
if (((sack.end - cur->end)/tp->t_maxseg) >=
tcprexmtthresh)
cur->dups = tcprexmtthresh;
p = cur;
cur = cur->next;
continue;
}
if (SEQ_LEQ(sack.start, cur->start)) {
/* Data acks at least the beginning of hole */
if (SEQ_GEQ(sack.end, cur->end)) {
/* Acks entire hole, so delete hole */
if (p != cur) {
p->next = cur->next;
pool_put(&sackhl_pool, cur);
cur = p->next;
} else {
cur = cur->next;
pool_put(&sackhl_pool, p);
p = cur;
tp->snd_holes = p;
}
tp->snd_numholes--;
continue;
}
/* otherwise, move start of hole forward */
cur->start = sack.end;
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
p = cur;
cur = cur->next;
continue;
}
/* move end of hole backward */
if (SEQ_GEQ(sack.end, cur->end)) {
cur->end = sack.start;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur->dups++;
if (((sack.end - cur->end)/tp->t_maxseg) >=
tcprexmtthresh)
cur->dups = tcprexmtthresh;
p = cur;
cur = cur->next;
continue;
}
if (SEQ_LT(cur->start, sack.start) &&
SEQ_GT(cur->end, sack.end)) {
/*
* ACKs some data in middle of a hole; need to
* split current hole
*/
if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
goto dropped;
temp = (struct sackhole *)
pool_get(&sackhl_pool, PR_NOWAIT);
if (temp == NULL)
goto dropped; /* ENOBUFS */
temp->next = cur->next;
temp->start = sack.end;
temp->end = cur->end;
temp->dups = cur->dups;
temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
cur->end = sack.start;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur->dups++;
if (((sack.end - cur->end)/tp->t_maxseg) >=
tcprexmtthresh)
cur->dups = tcprexmtthresh;
cur->next = temp;
p = temp;
cur = p->next;
tp->snd_numholes++;
}
}
/* At this point, p points to the last hole on the list */
if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
/*
* Need to append new hole at end.
* Last hole is p (and it's not NULL).
*/
if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
goto dropped;
temp = (struct sackhole *)
pool_get(&sackhl_pool, PR_NOWAIT);
if (temp == NULL)
goto dropped; /* ENOBUFS */
temp->start = tp->rcv_lastsack;
temp->end = sack.start;
temp->dups = min(tcprexmtthresh,
((sack.end - sack.start)/tp->t_maxseg));
if (temp->dups < 1)
temp->dups = 1;
temp->rxmit = temp->start;
temp->next = 0;
p->next = temp;
tp->rcv_lastsack = sack.end;
tp->snd_numholes++;
}
}
return;
dropped:
tcpstat_inc(tcps_sack_drop_opts);
}
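/*
 * Worked example (illustrative sequence numbers): suppose snd_una =
 * 1000 and an earlier SACK block [5000,6000) created the hole
 * [1000,5000).  A new SACK block [2000,3000) acks the middle of that
 * hole, so it is split: the existing hole shrinks to [1000,2000) and
 * a new hole [3000,5000) is inserted after it.  A later block
 * [3000,5000) would then ack the second hole entirely and it would be
 * deleted from the list.
 */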
/*
* Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if
* it is completely acked; otherwise, tcp_sack_option(), called from
* tcp_dooptions(), will fix up the hole.
*/
void
tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
{
if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
/* max because this could be an older ack just arrived */
tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
th->th_ack : tp->snd_una;
struct sackhole *cur = tp->snd_holes;
struct sackhole *prev;
while (cur)
if (SEQ_LEQ(cur->end, lastack)) {
prev = cur;
cur = cur->next;
pool_put(&sackhl_pool, prev);
tp->snd_numholes--;
} else if (SEQ_LT(cur->start, lastack)) {
cur->start = lastack;
if (SEQ_LT(cur->rxmit, cur->start))
cur->rxmit = cur->start;
break;
} else
break;
tp->snd_holes = cur;
}
}
/*
* Delete all receiver-side SACK information.
*/
void
tcp_clean_sackreport(struct tcpcb *tp)
{
int i;
tp->rcv_numsacks = 0;
for (i = 0; i < MAX_SACK_BLKS; i++)
tp->sackblks[i].start = tp->sackblks[i].end=0;
}
/*
* Partial ack handling within a sack recovery episode. When a partial ack
* arrives, turn off retransmission timer, deflate the window, do not clear
* tp->t_dupacks.
*/
void
tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
{
/* Turn off retx. timer (will start again next segment) */
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
/*
* Partial window deflation. This statement relies on the
* fact that tp->snd_una has not been updated yet.
*/
if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
tp->snd_cwnd -= th->th_ack - tp->snd_una;
tp->snd_cwnd += tp->t_maxseg;
} else
tp->snd_cwnd = tp->t_maxseg;
tp->snd_cwnd += tp->t_maxseg;
tp->t_flags |= TF_NEEDOUTPUT;
}
/*
* Pull out of band byte out of a segment so
* it doesn't appear in the user's data queue.
* It is still reflected in the segment length for
* sequencing purposes.
*/
void
tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off)
{
int cnt = off + urgent - 1;
while (cnt >= 0) {
if (m->m_len > cnt) {
char *cp = mtod(m, caddr_t) + cnt;
struct tcpcb *tp = sototcpcb(so);
tp->t_iobc = *cp;
tp->t_oobflags |= TCPOOB_HAVEDATA;
memmove(cp, cp + 1, m->m_len - cnt - 1);
m->m_len--;
return;
}
cnt -= m->m_len;
m = m->m_next;
if (m == NULL)
break;
}
panic("tcp_pulloutofband");
}
/*
* Collect new round-trip time estimate
* and update averages and current timeout.
*/
void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
short delta;
short rttmin;
if (rtt < 0)
rtt = 0;
else if (rtt > TCP_RTT_MAX)
rtt = TCP_RTT_MAX;
tcpstat_inc(tcps_rttupdated);
if (tp->t_srtt != 0) {
/*
* delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
* after the binary point (scaled by 4), whereas
* srtt is stored as fixed point with 5 bits after the
* binary point (i.e., scaled by 32). The following magic
* is equivalent to the smoothing algorithm in rfc793 with
* an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
* point).
*/
delta = (rtt << TCP_RTT_BASE_SHIFT) -
(tp->t_srtt >> TCP_RTT_SHIFT);
if ((tp->t_srtt += delta) <= 0)
tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
/*
* We accumulate a smoothed rtt variance (actually, a
* smoothed mean difference), then set the retransmit
* timer to smoothed rtt + 4 times the smoothed variance.
* rttvar is stored as fixed point with 4 bits after the
* binary point (scaled by 16). The following is
* equivalent to rfc793 smoothing with an alpha of .75
* (rttvar = rttvar*3/4 + |delta| / 4). This replaces
* rfc793's wired-in beta.
*/
if (delta < 0)
delta = -delta;
delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
if ((tp->t_rttvar += delta) <= 0)
tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
} else {
/*
* No rtt measurement yet - use the unsmoothed rtt.
* Set the variance to half the rtt (so our first
* retransmit happens at 3*rtt).
*/
tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
tp->t_rttvar = (rtt + 1) <<
(TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
}
tp->t_rtttime = 0;
tp->t_rxtshift = 0;
/*
* the retransmit should happen at rtt + 4 * rttvar.
* Because of the way we do the smoothing, srtt and rttvar
* will each average +1/2 tick of bias. When we compute
* the retransmit timer, we want 1/2 tick of rounding and
* 1 extra tick because of +-1/2 tick uncertainty in the
* firing of the timer. The bias will give us exactly the
* 1.5 tick we need. But, because the bias is
* statistical, we have to test that we don't drop below
* the minimum feasible timer (which is 2 ticks).
*/
rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX);
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);
/*
* We received an ack for a packet that wasn't retransmitted;
* it is probably safe to discard any error indications we've
* received recently. This isn't quite right, but close enough
* for now (a route might have failed after we sent a segment,
* and the return path might not be symmetrical).
*/
tp->t_softerror = 0;
}
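/*
 * A minimal userland sketch of the smoothing above, with the
 * fixed-point scaling stripped out (illustrative only, not used by
 * the kernel; struct and function names are hypothetical).  For
 * example, srtt = 100, rttvar = 10 and a new sample rtt = 120 give
 * srtt = 102.5, rttvar = 12.5 and a timeout near 152.5.
 */
#if 0
struct rtt_est {
	double srtt;	/* smoothed round-trip time */
	double rttvar;	/* smoothed mean deviation */
};

static double
rtt_sample(struct rtt_est *e, double rtt)
{
	if (e->srtt == 0) {
		/* first measurement: srtt = rtt, rttvar = rtt / 2 */
		e->srtt = rtt;
		e->rttvar = rtt / 2;
	} else {
		double delta = rtt - e->srtt;

		e->srtt += delta / 8;			/* alpha = 7/8 */
		if (delta < 0)
			delta = -delta;
		e->rttvar += (delta - e->rttvar) / 4;	/* beta = 3/4 */
	}
	/* retransmit timeout: srtt + 4 * rttvar, subject to clamping */
	return (e->srtt + 4 * e->rttvar);
}
#endif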
/*
* Determine a reasonable value for maxseg size.
* If the route is known, check route for mtu.
* If none, use an mss that can be handled on the outgoing
* interface without forcing IP to fragment; if bigger than
* an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
* to utilize large mbufs. If no route is found, route has no mtu,
* or the destination isn't local, use a default, hopefully conservative
* size (usually 512 or the default IP max size, but no more than the mtu
* of the interface), as we can't discover anything about intervening
* gateways or networks. We also initialize the congestion/slow start
* window to be a single segment if the destination isn't local.
* While looking at the routing entry, we also initialize other path-dependent
* parameters from pre-set or cached values in the routing entry.
*
* Also take into account the space needed for options that we
* send regularly. Make maxseg shorter by that amount to assure
* that we can send maxseg amount of data even when the options
* are present. Store the upper limit of the length of options plus
* data in maxopd.
*
* NOTE: offer == -1 indicates that the maxseg size changed due to
* Path MTU discovery.
*/
int
tcp_mss(struct tcpcb *tp, int offer)
{
struct rtentry *rt;
struct ifnet *ifp = NULL;
int mss, mssopt;
int iphlen;
struct inpcb *inp;
inp = tp->t_inpcb;
mssopt = mss = tcp_mssdflt;
rt = in_pcbrtentry(inp);
if (rt == NULL)
goto out;
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL)
goto out;
switch (tp->pf) {
#ifdef INET6
case AF_INET6:
iphlen = sizeof(struct ip6_hdr);
break;
#endif
case AF_INET:
iphlen = sizeof(struct ip);
break;
default:
/* the family does not support path MTU discovery */
goto out;
}
/*
* if there's an mtu associated with the route and we support
* path MTU discovery for the underlying protocol family, use it.
*/
if (rt->rt_mtu) {
/*
* One may wish to lower MSS to take into account options,
* especially security-related options.
*/
if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph: if path MTU is
* smaller than 1280, use 1280 as packet size and
* attach fragment header.
*/
mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
sizeof(struct tcphdr);
} else {
mss = rt->rt_mtu - iphlen -
sizeof(struct tcphdr);
}
} else if (ifp->if_flags & IFF_LOOPBACK) {
mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
} else if (tp->pf == AF_INET) {
if (ip_mtudisc)
mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
}
#ifdef INET6
else if (tp->pf == AF_INET6) {
/*
* for IPv6, path MTU discovery is always turned on,
* or the node must use packet size <= 1280.
*/
mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
}
#endif /* INET6 */
/* Calculate the value that we offer in TCPOPT_MAXSEG */
if (offer != -1) {
mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
mssopt = max(tcp_mssdflt, mssopt);
}
out:
if_put(ifp);
/*
* The current mss, t_maxseg, is initialized to the default value.
* If we compute a smaller value, reduce the current mss.
* If we compute a larger value, return it for use in sending
* a max seg size option, but don't store it for use
* unless we received an offer at least that large from peer.
*
* However, do not accept offers lower than the minimum of
* the interface MTU and 216.
*/
if (offer > 0)
tp->t_peermss = offer;
if (tp->t_peermss)
mss = min(mss, max(tp->t_peermss, 216));
/* sanity - at least max opt. space */
mss = max(mss, 64);
/*
* maxopd stores the maximum length of data AND options
* in a segment; maxseg is the amount of data in a normal
* segment. We need to store this value (maxopd) apart
* from maxseg, because now every segment carries options
* and thus we normally have somewhat less data in segments.
*/
tp->t_maxopd = mss;
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
mss -= TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
mss -= TCPOLEN_SIGLEN;
#endif
if (offer == -1) {
/* mss changed due to Path MTU discovery */
tp->t_flags &= ~TF_PMTUD_PEND;
tp->t_pmtud_mtu_sent = 0;
tp->t_pmtud_mss_acked = 0;
if (mss < tp->t_maxseg) {
/*
* Follow suggestion in RFC 2414 to reduce the
* congestion window by the ratio of the old
* segment size to the new segment size.
*/
tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
mss, mss);
}
} else if (tcp_do_rfc3390 == 2) {
/* increase initial window */
tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600));
} else if (tcp_do_rfc3390) {
/* increase initial window */
tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
} else
tp->snd_cwnd = mss;
tp->t_maxseg = mss;
return (offer != -1 ? mssopt : mss);
}
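/*
 * Worked example (typical values): for an Ethernet-sized route MTU of
 * 1500, the IPv4 case yields mss = 1500 - 20 - 20 = 1460 and the IPv6
 * case mss = 1500 - 40 - 20 = 1440.  If timestamps are in use,
 * TCPOLEN_TSTAMP_APPA (12 bytes) is subtracted as well, leaving 1448
 * bytes of data per IPv4 segment while t_maxopd stays at 1460.
 */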
u_int
tcp_hdrsz(struct tcpcb *tp)
{
u_int hlen;
switch (tp->pf) {
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
case AF_INET:
hlen = sizeof(struct ip);
break;
default:
hlen = 0;
break;
}
hlen += sizeof(struct tcphdr);
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
hlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
hlen += TCPOLEN_SIGLEN;
#endif
return (hlen);
}
/*
* Set connection variables based on the effective MSS.
* We are passed the TCPCB for the actual connection. If we
* are the server, we are called by the compressed state engine
* when the 3-way handshake is complete. If we are the client,
* we are called when we receive the SYN,ACK from the server.
*
* NOTE: The t_maxseg value must be initialized in the TCPCB
* before this routine is called!
*/
void
tcp_mss_update(struct tcpcb *tp)
{
int mss;
u_long bufsize;
struct rtentry *rt;
struct socket *so;
so = tp->t_inpcb->inp_socket;
mss = tp->t_maxseg;
rt = in_pcbrtentry(tp->t_inpcb);
if (rt == NULL)
return;
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss) {
mss = bufsize;
/* Update t_maxseg and t_maxopd */
tcp_mss(tp, mss);
} else {
bufsize = roundup(bufsize, mss);
if (bufsize > sb_max)
bufsize = sb_max;
(void)sbreserve(so, &so->so_snd, bufsize);
}
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
if (bufsize > sb_max)
bufsize = sb_max;
(void)sbreserve(so, &so->so_rcv, bufsize);
}
}
/*
* When a partial ack arrives, force the retransmission of the
* next unacknowledged segment. Do not clear tp->t_dupacks.
* By setting snd_nxt to th_ack, this forces the retransmission timer
* to be started again.
*/
void
tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th)
{
/*
* snd_una has not been updated and the socket send buffer
* not yet drained of the acked data, so we have to leave
* snd_una as it was to get the correct data offset in
* tcp_output().
*/
tcp_seq onxt = tp->snd_nxt;
u_long ocwnd = tp->snd_cwnd;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
/*
* Set snd_cwnd to one segment beyond acknowledged offset
* (tp->snd_una not yet updated when this function is called)
*/
tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
(void)tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
/*
* Partial window deflation. Relies on fact that tp->snd_una
* not updated yet.
*/
if (tp->snd_cwnd > th->th_ack - tp->snd_una)
tp->snd_cwnd -= th->th_ack - tp->snd_una;
else
tp->snd_cwnd = 0;
tp->snd_cwnd += tp->t_maxseg;
}
int
tcp_mss_adv(struct mbuf *m, int af)
{
int mss = 0;
int iphlen;
struct ifnet *ifp = NULL;
if (m && (m->m_flags & M_PKTHDR))
ifp = if_get(m->m_pkthdr.ph_ifidx);
switch (af) {
case AF_INET:
if (ifp != NULL)
mss = ifp->if_mtu;
iphlen = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
if (ifp != NULL)
mss = ifp->if_mtu;
iphlen = sizeof(struct ip6_hdr);
break;
#endif
default:
unhandled_af(af);
}
if_put(ifp);
mss = mss - iphlen - sizeof(struct tcphdr);
return (max(mss, tcp_mssdflt));
}
/*
* TCP compressed state engine. Currently used to hold compressed
* state for SYN_RECEIVED.
*/
/* syn hash parameters */
int tcp_syn_hash_size = TCP_SYN_HASH_SIZE;
int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
int tcp_syn_use_limit = 100000;
struct syn_cache_set tcp_syn_cache[2];
int tcp_syn_cache_active;
#define SYN_HASH(sa, sp, dp, rand) \
(((sa)->s_addr ^ (rand)[0]) * \
(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4]))
#ifndef INET6
#define SYN_HASHALL(hash, src, dst, rand) \
do { \
hash = SYN_HASH(&satosin(src)->sin_addr, \
satosin(src)->sin_port, \
satosin(dst)->sin_port, (rand)); \
} while (/*CONSTCOND*/ 0)
#else
#define SYN_HASH6(sa, sp, dp, rand) \
(((sa)->s6_addr32[0] ^ (rand)[0]) * \
((sa)->s6_addr32[1] ^ (rand)[1]) * \
((sa)->s6_addr32[2] ^ (rand)[2]) * \
((sa)->s6_addr32[3] ^ (rand)[3]) * \
(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4]))
#define SYN_HASHALL(hash, src, dst, rand) \
do { \
switch ((src)->sa_family) { \
case AF_INET: \
hash = SYN_HASH(&satosin(src)->sin_addr, \
satosin(src)->sin_port, \
satosin(dst)->sin_port, (rand)); \
break; \
case AF_INET6: \
hash = SYN_HASH6(&satosin6(src)->sin6_addr, \
satosin6(src)->sin6_port, \
satosin6(dst)->sin6_port, (rand)); \
break; \
default: \
hash = 0; \
} \
} while (/*CONSTCOND*/0)
#endif /* INET6 */
void
syn_cache_rm(struct syn_cache *sc)
{
sc->sc_flags |= SCF_DEAD;
TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
sc->sc_tp = NULL;
LIST_REMOVE(sc, sc_tpq);
sc->sc_buckethead->sch_length--;
timeout_del(&sc->sc_timer);
sc->sc_set->scs_count--;
}
void
syn_cache_put(struct syn_cache *sc)
{
m_free(sc->sc_ipopts);
if (sc->sc_route4.ro_rt != NULL) {
rtfree(sc->sc_route4.ro_rt);
sc->sc_route4.ro_rt = NULL;
}
timeout_set(&sc->sc_timer, syn_cache_reaper, sc);
timeout_add(&sc->sc_timer, 0);
}
struct pool syn_cache_pool;
/*
* We don't estimate RTT with SYNs, so each packet starts with the default
* RTT and each timer step has a fixed timeout value.
*/
#define SYN_CACHE_TIMER_ARM(sc) \
do { \
TCPT_RANGESET((sc)->sc_rxtcur, \
TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
TCPTV_REXMTMAX); \
if (!timeout_initialized(&(sc)->sc_timer)) \
timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \
timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
} while (/*CONSTCOND*/0)
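/*
 * Illustrative behaviour (assuming the conventional doubling
 * tcp_backoff[] table): a cached SYN,ACK is retransmitted at roughly
 * 1x, 2x, 4x, ... TCPTV_SRTTDFLT as sc_rxtshift grows, each value
 * clamped to [TCPTV_MIN, TCPTV_REXMTMAX], until the entry either
 * completes the handshake or exceeds tcptv_keep_init and is dropped.
 */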
void
syn_cache_init(void)
{
int i;
/* Initialize the hash buckets. */
tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size,
sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size,
sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
tcp_syn_cache[0].scs_size = tcp_syn_hash_size;
tcp_syn_cache[1].scs_size = tcp_syn_hash_size;
for (i = 0; i < tcp_syn_hash_size; i++) {
TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket);
TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket);
}
/* Initialize the syn cache pool. */
pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET,
0, "syncache", NULL);
}
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active];
struct syn_cache_head *scp;
struct syn_cache *sc2;
int i;
NET_ASSERT_LOCKED();
/*
* If there are no entries in the hash table, reinitialize
* the hash secrets. To avoid useless cache swaps and
* reinitialization, use it until the limit is reached.
* An empty cache is also the opportunity to resize the hash.
*/
if (set->scs_count == 0 && set->scs_use <= 0) {
set->scs_use = tcp_syn_use_limit;
if (set->scs_size != tcp_syn_hash_size) {
scp = mallocarray(tcp_syn_hash_size, sizeof(struct
syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO);
if (scp == NULL) {
/* Try again next time. */
set->scs_use = 0;
} else {
free(set->scs_buckethead, M_SYNCACHE,
set->scs_size *
sizeof(struct syn_cache_head));
set->scs_buckethead = scp;
set->scs_size = tcp_syn_hash_size;
for (i = 0; i < tcp_syn_hash_size; i++)
TAILQ_INIT(&scp[i].sch_bucket);
}
}
arc4random_buf(set->scs_random, sizeof(set->scs_random));
tcpstat_inc(tcps_sc_seedrandom);
}
SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa,
set->scs_random);
scp = &set->scs_buckethead[sc->sc_hash % set->scs_size];
sc->sc_buckethead = scp;
/*
* Make sure that we don't overflow the per-bucket
* limit or the total cache size limit.
*/
if (scp->sch_length >= tcp_syn_bucket_limit) {
tcpstat_inc(tcps_sc_bucketoverflow);
/*
* Someone might attack our bucket hash function. Reseed
* with random as soon as the passive syn cache gets empty.
*/
set->scs_use = 0;
/*
* The bucket is full. Toss the oldest element in the
* bucket. This will be the first entry in the bucket.
*/
sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
/*
* This should never happen; we should always find an
* entry in our bucket.
*/
if (sc2 == NULL)
panic("%s: bucketoverflow: impossible", __func__);
#endif
syn_cache_rm(sc2);
syn_cache_put(sc2);
} else if (set->scs_count >= tcp_syn_cache_limit) {
struct syn_cache_head *scp2, *sce;
tcpstat_inc(tcps_sc_overflowed);
/*
* The cache is full. Toss the oldest entry in the
* first non-empty bucket we can find.
*
* XXX We would really like to toss the oldest
* entry in the cache, but we hope that this
* condition doesn't happen very often.
*/
scp2 = scp;
if (TAILQ_EMPTY(&scp2->sch_bucket)) {
sce = &set->scs_buckethead[set->scs_size];
for (++scp2; scp2 != scp; scp2++) {
if (scp2 >= sce)
scp2 = &set->scs_buckethead[0];
if (! TAILQ_EMPTY(&scp2->sch_bucket))
break;
}
#ifdef DIAGNOSTIC
/*
* This should never happen; we should always find a
* non-empty bucket.
*/
if (scp2 == scp)
panic("%s: cacheoverflow: impossible",
__func__);
#endif
}
sc2 = TAILQ_FIRST(&scp2->sch_bucket);
syn_cache_rm(sc2);
syn_cache_put(sc2);
}
/*
* Initialize the entry's timer.
*/
sc->sc_rxttot = 0;
sc->sc_rxtshift = 0;
SYN_CACHE_TIMER_ARM(sc);
/* Link it from tcpcb entry */
LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
/* Put it into the bucket. */
TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
scp->sch_length++;
sc->sc_set = set;
set->scs_count++;
set->scs_use--;
tcpstat_inc(tcps_sc_added);
/*
* If the active cache has exceeded its use limit and
* the passive syn cache is empty, exchange their roles.
*/
if (set->scs_use <= 0 &&
tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0)
tcp_syn_cache_active = !tcp_syn_cache_active;
}
/*
* Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
* If we have retransmitted an entry the maximum number of times, expire
* that entry.
*/
void
syn_cache_timer(void *arg)
{
struct syn_cache *sc = arg;
uint32_t now;
NET_LOCK();
if (sc->sc_flags & SCF_DEAD)
goto out;
now = READ_ONCE(tcp_now);
if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
/* Drop it -- too many retransmissions. */
goto dropit;
}
/*
* Compute the total amount of time this entry has
* been on a queue. If this entry has been on longer
* than the keep alive timer would allow, expire it.
*/
sc->sc_rxttot += sc->sc_rxtcur;
if (sc->sc_rxttot >= tcptv_keep_init)
goto dropit;
tcpstat_inc(tcps_sc_retransmitted);
(void) syn_cache_respond(sc, NULL, now);
/* Advance the timer back-off. */
sc->sc_rxtshift++;
SYN_CACHE_TIMER_ARM(sc);
out:
NET_UNLOCK();
return;
dropit:
tcpstat_inc(tcps_sc_timed_out);
syn_cache_rm(sc);
syn_cache_put(sc);
NET_UNLOCK();
}
void
syn_cache_reaper(void *arg)
{
struct syn_cache *sc = arg;
pool_put(&syn_cache_pool, (sc));
return;
}
/*
* Remove the syn cache entries created by the specified tcb entry,
* because there is no sense in keeping them
* (if there's no tcb entry, the syn cache entries will never be used)
*/
void
syn_cache_cleanup(struct tcpcb *tp)
{
struct syn_cache *sc, *nsc;
NET_ASSERT_LOCKED();
LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
#ifdef DIAGNOSTIC
if (sc->sc_tp != tp)
panic("invalid sc_tp in syn_cache_cleanup");
#endif
syn_cache_rm(sc);
syn_cache_put(sc);
}
/* just for safety */
LIST_INIT(&tp->t_sc);
}
/*
* Find an entry in the syn cache.
*/
struct syn_cache *
syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
struct syn_cache_head **headp, u_int rtableid)
{
struct syn_cache_set *sets[2];
struct syn_cache *sc;
struct syn_cache_head *scp;
u_int32_t hash;
int i;
NET_ASSERT_LOCKED();
/* Check the active cache first, the passive cache is likely empty. */
sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
for (i = 0; i < 2; i++) {
if (sets[i]->scs_count == 0)
continue;
SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
*headp = scp;
TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
if (sc->sc_hash != hash)
continue;
if (!bcmp(&sc->sc_src, src, src->sa_len) &&
!bcmp(&sc->sc_dst, dst, dst->sa_len) &&
rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
return (sc);
}
}
return (NULL);
}
/*
* This function gets called when we receive an ACK for a
* socket in the LISTEN state. We look up the connection
* in the syn cache, and if it's there, we pull it out of
* the cache and turn it into a full-blown connection in
* the SYN-RECEIVED state.
*
* The return values may not be immediately obvious, and their effects
* can be subtle, so here they are:
*
* NULL SYN was not found in cache; caller should drop the
* packet and send an RST.
*
* -1 We were unable to create the new connection, and are
* aborting it. An ACK,RST is being sent to the peer
* (unless we got screwy sequence numbers; see below),
* because the 3-way handshake has been completed. Caller
* should not free the mbuf, since we may be using it. If
* we are not, we will free it.
*
* Otherwise, the return value is a pointer to the new socket
* associated with the connection.
*/
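/*
 * A caller in the LISTEN-state ACK path is expected to dispatch on the
 * return value roughly as in the sketch below (illustrative only, not
 * the verbatim tcp_input() code):
 *
 *	so = syn_cache_get(src, dst, th, hlen, tlen, so, m, now);
 *	if (so == NULL)
 *		drop the segment and answer with an RST;
 *	else if (so == (struct socket *)(-1))
 *		give up, the mbuf has already been consumed or freed;
 *	else
 *		continue input processing on the new SYN-RECEIVED socket.
 */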
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint32_t now)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
struct inpcb *inp, *oldinp;
struct tcpcb *tp = NULL;
struct mbuf *am;
struct socket *oso;
NET_ASSERT_LOCKED();
sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
if (sc == NULL)
return (NULL);
/*
* Verify the sequence and ack numbers. Try getting the correct
* response again.
*/
if ((th->th_ack != sc->sc_iss + 1) ||
SEQ_LEQ(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
(void) syn_cache_respond(sc, m, now);
return ((struct socket *)(-1));
}
/* Remove this cache entry */
syn_cache_rm(sc);
/*
* Ok, create the full blown connection, and set things up
* as they would have been set up if we had created the
* connection when the SYN arrived. If we can't create
* the connection, abort it.
*/
oso = so;
so = sonewconn(so, SS_ISCONNECTED);
if (so == NULL)
goto resetandabort;
oldinp = sotoinpcb(oso);
inp = sotoinpcb(so);
#ifdef IPSEC
/*
* We need to copy the required security levels
* from the old pcb. Ditto for any other
* IPsec-related information.
*/
memcpy(inp->inp_seclevel, oldinp->inp_seclevel,
sizeof(oldinp->inp_seclevel));
#endif /* IPSEC */
#ifdef INET6
/*
* inp still has the OLD in_pcb stuff, set the
* v6-related flags on the new guy, too.
*/
inp->inp_flags |= (oldinp->inp_flags & INP_IPV6);
if (inp->inp_flags & INP_IPV6) {
inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim;
inp->inp_hops = oldinp->inp_hops;
} else
#endif /* INET6 */
{
inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl;
}
#if NPF > 0
if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
struct pf_divert *divert;
divert = pf_find_divert(m);
KASSERT(divert != NULL);
inp->inp_rtableid = divert->rdomain;
} else
#endif
/* inherit rtable from listening socket */
inp->inp_rtableid = sc->sc_rtableid;
inp->inp_lport = th->th_dport;
switch (src->sa_family) {
#ifdef INET6
case AF_INET6:
inp->inp_laddr6 = satosin6(dst)->sin6_addr;
break;
#endif /* INET6 */
case AF_INET:
inp->inp_laddr = satosin(dst)->sin_addr;
inp->inp_options = ip_srcroute(m);
if (inp->inp_options == NULL) {
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
}
break;
}
in_pcbrehash(inp);
/*
* Give the new socket our cached route reference.
*/
if (src->sa_family == AF_INET)
inp->inp_route = sc->sc_route4; /* struct assignment */
#ifdef INET6
else
inp->inp_route6 = sc->sc_route6;
#endif
sc->sc_route4.ro_rt = NULL;
am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
if (am == NULL)
goto resetandabort;
am->m_len = src->sa_len;
memcpy(mtod(am, caddr_t), src, src->sa_len);
if (in_pcbconnect(inp, am)) {
(void) m_free(am);
goto resetandabort;
}
(void) m_free(am);
tp = intotcpcb(inp);
tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY);
if (sc->sc_request_r_scale != 15) {
tp->requested_s_scale = sc->sc_requested_s_scale;
tp->request_r_scale = sc->sc_request_r_scale;
tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
}
if (sc->sc_flags & SCF_TIMESTAMP)
tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
tp->t_template = tcp_template(tp);
if (tp->t_template == 0) {
tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
so = NULL;
goto abort;
}
tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
tp->ts_modulate = sc->sc_modulate;
tp->ts_recent = sc->sc_timestamp;
tp->iss = sc->sc_iss;
tp->irs = sc->sc_irs;
tcp_sendseqinit(tp);
tp->snd_last = tp->snd_una;
#ifdef TCP_ECN
if (sc->sc_flags & SCF_ECN_PERMIT) {
tp->t_flags |= TF_ECN_PERMIT;
tcpstat_inc(tcps_ecn_accepts);
}
#endif
if (sc->sc_flags & SCF_SACK_PERMIT)
tp->t_flags |= TF_SACK_PERMIT;
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
#endif
tcp_rcvseqinit(tp);
tp->t_state = TCPS_SYN_RECEIVED;
tp->t_rcvtime = now;
tp->t_sndtime = now;
tp->t_rcvacktime = now;
tp->t_sndacktime = now;
TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
tcpstat_inc(tcps_accepts);
tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
if (sc->sc_peermaxseg)
tcp_mss_update(tp);
/* Reset initial window to 1 segment for retransmit */
if (sc->sc_rxtshift > 0)
tp->snd_cwnd = tp->t_maxseg;
tp->snd_wl1 = sc->sc_irs;
tp->rcv_up = sc->sc_irs + 1;
/*
* This is what would have happened in tcp_output() when
* the SYN,ACK was sent.
*/
tp->snd_up = tp->snd_una;
tp->snd_max = tp->snd_nxt = tp->iss+1;
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
tp->last_ack_sent = tp->rcv_nxt;
tcpstat_inc(tcps_sc_completed);
syn_cache_put(sc);
return (so);
resetandabort:
tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
m->m_pkthdr.ph_rtableid, now);
abort:
m_freem(m);
if (so != NULL)
soabort(so);
syn_cache_put(sc);
tcpstat_inc(tcps_sc_aborted);
return ((struct socket *)(-1));
}
/*
* This function is called when we get a RST for a
* non-existent connection, so that we can see if the
* connection is in the syn cache. If it is, zap it.
*/
void
syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
u_int rtableid)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
NET_ASSERT_LOCKED();
if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL)
return;
if (SEQ_LT(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs + 1))
return;
syn_cache_rm(sc);
tcpstat_inc(tcps_sc_reset);
syn_cache_put(sc);
}
void
syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
u_int rtableid)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
NET_ASSERT_LOCKED();
if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL)
return;
/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
if (ntohl (th->th_seq) != sc->sc_iss) {
return;
}
/*
* If we've retransmitted 3 times and this is our second error,
* we remove the entry. Otherwise, we allow it to continue on.
* This prevents us from incorrectly nuking an entry during a
* spurious network outage.
*
* See tcp_notify().
*/
if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
sc->sc_flags |= SCF_UNREACH;
return;
}
syn_cache_rm(sc);
tcpstat_inc(tcps_sc_unreach);
syn_cache_put(sc);
}
/*
* Given a LISTEN socket and an inbound SYN request, add
* this to the syn cache, and send back a segment:
* <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
* to the source.
*
* IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
* Doing so would require that we hold onto the data and deliver it
* to the application. However, if we are the target of a SYN-flood
* DoS attack, an attacker could send data which would eventually
* consume all available buffer space if it were ACKed. By not ACKing
* the data, we avoid this DoS scenario.
*/
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
struct tcp_opt_info *oi, tcp_seq *issp, uint32_t now)
{
struct tcpcb tb, *tp;
long win;
struct syn_cache *sc;
struct syn_cache_head *scp;
struct mbuf *ipopts;
tp = sototcpcb(so);
/*
* RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
*
* Note this check is performed in tcp_input() very early on.
*/
/*
* Initialize some local state.
*/
win = sbspace(so, &so->so_rcv);
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
bzero(&tb, sizeof(tb));
#ifdef TCP_SIGNATURE
if (optp || (tp->t_flags & TF_SIGNATURE)) {
#else
if (optp) {
#endif
tb.pf = tp->pf;
tb.sack_enable = tp->sack_enable;
tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
tb.t_flags |= TF_SIGNATURE;
#endif
tb.t_state = TCPS_LISTEN;
if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
sotoinpcb(so)->inp_rtableid, now))
return (-1);
}
switch (src->sa_family) {
case AF_INET:
/*
* Remember the IP options, if any.
*/
ipopts = ip_srcroute(m);
break;
default:
ipopts = NULL;
}
/*
* See if we already have an entry for this connection.
* If we do, resend the SYN,ACK. We do not count this
* as a retransmission (XXX though maybe we should).
*/
sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
if (sc != NULL) {
tcpstat_inc(tcps_sc_dupesyn);
if (ipopts) {
/*
* If we were remembering a previous source route,
* forget it and use the new one we've been given.
*/
m_free(sc->sc_ipopts);
sc->sc_ipopts = ipopts;
}
sc->sc_timestamp = tb.ts_recent;
if (syn_cache_respond(sc, m, now) == 0) {
tcpstat_inc(tcps_sndacks);
tcpstat_inc(tcps_sndtotal);
}
return (0);
}
sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
if (sc == NULL) {
m_free(ipopts);
return (-1);
}
/*
* Fill in the cache, and put the necessary IP and TCP
* options into the reply.
*/
memcpy(&sc->sc_src, src, src->sa_len);
memcpy(&sc->sc_dst, dst, dst->sa_len);
sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
sc->sc_flags = 0;
sc->sc_ipopts = ipopts;
sc->sc_irs = th->th_seq;
sc->sc_iss = issp ? *issp : arc4random();
sc->sc_peermaxseg = oi->maxseg;
sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family);
sc->sc_win = win;
sc->sc_timestamp = tb.ts_recent;
if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
sc->sc_flags |= SCF_TIMESTAMP;
sc->sc_modulate = arc4random();
}
if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
sc->sc_requested_s_scale = tb.requested_s_scale;
sc->sc_request_r_scale = 0;
/*
* Pick the smallest possible scaling factor that
* will still allow us to scale up to sb_max.
*
* We do this because there are broken firewalls that
* will corrupt the window scale option, leading to
* the other endpoint believing that our advertised
* window is unscaled. At scale factors larger than
* 5 the unscaled window will drop below 1500 bytes,
* leading to serious problems when traversing these
* broken firewalls.
*
* With the default sbmax of 256K, a scale factor
* of 3 will be chosen by this algorithm. Those who
* choose a larger sbmax should watch out
* for the compatibility problems mentioned above.
*
* RFC1323: The Window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled.
*/
while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
(TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
sc->sc_request_r_scale++;
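/*
 * Worked example: with the default sb_max of 256 KiB the loop stops
 * at a scale of 3, since 65535 << 2 = 262140 is still below 262144
 * while 65535 << 3 = 524280 is not.
 */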
} else {
sc->sc_requested_s_scale = 15;
sc->sc_request_r_scale = 15;
}
#ifdef TCP_ECN
/*
* if both ECE and CWR flag bits are set, peer is ECN capable.
*/
if (tcp_do_ecn &&
(th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
sc->sc_flags |= SCF_ECN_PERMIT;
#endif
/*
* Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
* (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
*/
if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
sc->sc_flags |= SCF_SACK_PERMIT;
#ifdef TCP_SIGNATURE
if (tb.t_flags & TF_SIGNATURE)
sc->sc_flags |= SCF_SIGNATURE;
#endif
sc->sc_tp = tp;
if (syn_cache_respond(sc, m, now) == 0) {
syn_cache_insert(sc, tp);
tcpstat_inc(tcps_sndacks);
tcpstat_inc(tcps_sndtotal);
} else {
syn_cache_put(sc);
tcpstat_inc(tcps_sc_dropped);
}
return (0);
}
int
syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint32_t now)
{
u_int8_t *optp;
int optlen, error;
u_int16_t tlen;
struct ip *ip = NULL;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
struct tcphdr *th;
u_int hlen;
struct inpcb *inp;
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
hlen = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
m_freem(m);
return (EAFNOSUPPORT);
}
/* Compute the size of the TCP options. */
optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
#ifdef TCP_SIGNATURE
((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
#endif
((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
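/*
 * For a typical SYN,ACK carrying MSS, window scaling, SACK-permitted
 * and timestamps (no signature) this works out to 4 + 4 + 4 + 12 = 24
 * bytes of options, assuming TCPOLEN_TSTAMP_APPA is the usual 12-byte
 * NOP,NOP,timestamp encoding.
 */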
tlen = hlen + sizeof(struct tcphdr) + optlen;
/*
* Create the IP+TCP header from scratch.
*/
m_freem(m);
#ifdef DIAGNOSTIC
if (max_linkhdr + tlen > MCLBYTES)
return (ENOBUFS);
#endif
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m && max_linkhdr + tlen > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL)
return (ENOBUFS);
/* Fixup the mbuf. */
m->m_data += max_linkhdr;
m->m_len = m->m_pkthdr.len = tlen;
m->m_pkthdr.ph_ifidx = 0;
m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
memset(mtod(m, u_char *), 0, tlen);
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip = mtod(m, struct ip *);
ip->ip_dst = sc->sc_src.sin.sin_addr;
ip->ip_src = sc->sc_dst.sin.sin_addr;
ip->ip_p = IPPROTO_TCP;
th = (struct tcphdr *)(ip + 1);
th->th_dport = sc->sc_src.sin.sin_port;
th->th_sport = sc->sc_dst.sin.sin_port;
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
ip6->ip6_nxt = IPPROTO_TCP;
/* ip6_plen will be updated in ip6_output() */
th = (struct tcphdr *)(ip6 + 1);
th->th_dport = sc->sc_src.sin6.sin6_port;
th->th_sport = sc->sc_dst.sin6.sin6_port;
break;
#endif
default:
unhandled_af(sc->sc_src.sa.sa_family);
}
th->th_seq = htonl(sc->sc_iss);
th->th_ack = htonl(sc->sc_irs + 1);
th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
/* Set ECE for SYN-ACK if peer supports ECN. */
if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
th->th_flags |= TH_ECE;
#endif
th->th_win = htons(sc->sc_win);
/* th_sum already 0 */
/* th_urp already 0 */
/* Tack on the TCP options. */
optp = (u_int8_t *)(th + 1);
*optp++ = TCPOPT_MAXSEG;
*optp++ = 4;
*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
*optp++ = sc->sc_ourmaxseg & 0xff;
/* Include SACK_PERMIT_HDR option if peer has already done so. */
if (sc->sc_flags & SCF_SACK_PERMIT) {
*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
optp += 4;
}
if (sc->sc_request_r_scale != 15) {
*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
sc->sc_request_r_scale);
optp += 4;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
u_int32_t *lp = (u_int32_t *)(optp);
/* Form timestamp option as shown in appendix A of RFC 1323. */
*lp++ = htonl(TCPOPT_TSTAMP_HDR);
*lp++ = htonl(now + sc->sc_modulate);
*lp = htonl(sc->sc_timestamp);
optp += TCPOLEN_TSTAMP_APPA;
}
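/*
 * The TCPOPT_TSTAMP_HDR word written above encodes the RFC 1323
 * appendix A layout: NOP, NOP, kind 8, length 10, followed by the
 * 4-byte TSval and 4-byte TSecr, 12 bytes in total.
 */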
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE) {
union sockaddr_union src, dst;
struct tdb *tdb;
bzero(&src, sizeof(union sockaddr_union));
bzero(&dst, sizeof(union sockaddr_union));
src.sa.sa_len = sc->sc_src.sa.sa_len;
src.sa.sa_family = sc->sc_src.sa.sa_family;
dst.sa.sa_len = sc->sc_dst.sa.sa_len;
dst.sa.sa_family = sc->sc_dst.sa.sa_family;
switch (sc->sc_src.sa.sa_family) {
case 0: /*default to PF_INET*/
case AF_INET:
src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
break;
#ifdef INET6
case AF_INET6:
src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
break;
#endif /* INET6 */
}
tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
0, &src, &dst, IPPROTO_TCP);
if (tdb == NULL) {
m_freem(m);
return (EPERM);
}
/* Send signature option */
*(optp++) = TCPOPT_SIGNATURE;
*(optp++) = TCPOLEN_SIGNATURE;
if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
hlen, 0, optp) < 0) {
m_freem(m);
tdb_unref(tdb);
return (EINVAL);
}
tdb_unref(tdb);
optp += 16;
/* Pad options list to the next 32 bit boundary and
* terminate it.
*/
*optp++ = TCPOPT_NOP;
*optp++ = TCPOPT_EOL;
}
#endif /* TCP_SIGNATURE */
/* Compute the packet's checksum. */
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip->ip_len = htons(tlen - hlen);
th->th_sum = 0;
th->th_sum = in_cksum(m, tlen);
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_plen = htons(tlen - hlen);
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
break;
#endif
}
/* use IPsec policy and ttl from listening socket, on SYN ACK */
inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
/*
* Fill in some straggling IP bits. Note the stack expects
* ip_len to be in host order, for convenience.
*/
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip->ip_len = htons(tlen);
ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
if (inp != NULL)
ip->ip_tos = inp->inp_ip.ip_tos;
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_plen = htons(tlen - hlen);
/* ip6_hlim will be initialized afterwards */
/* leave flowlabel = 0, it is legal and requires no state mgmt */
break;
#endif
}
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
(ip_mtudisc ? IP_MTUDISC : 0), NULL, inp, 0);
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_hlim = in6_selecthlim(inp);
error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
NULL, NULL);
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
return (error);
}
/* $OpenBSD: tcp_subr.c,v 1.188 2022/09/03 22:11:09 bluhm Exp $ */
/* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/timeout.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/ip6protosw.h>
#endif /* INET6 */
#include <crypto/md5.h>
#include <crypto/sha2.h>
/*
* Locks used to protect struct members in this file:
* I immutable after creation
* T tcp_timer_mtx global tcp timer data structures
*/
struct mutex tcp_timer_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
/* patchable/settable parameters for tcp */
int tcp_mssdflt = TCP_MSS;
int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
/* values controllable via sysctl */
int tcp_do_rfc1323 = 1;
int tcp_do_sack = 1; /* RFC 2018 selective ACKs */
int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
#ifdef TCP_ECN
int tcp_do_ecn = 0; /* RFC3168 ECN enabled/disabled? */
#endif
int tcp_do_rfc3390 = 2; /* Increase TCP's Initial Window to 10*mss */
#ifndef TCB_INITIAL_HASH_SIZE
#define TCB_INITIAL_HASH_SIZE 128
#endif
int tcp_reass_limit = NMBCLUSTERS / 8; /* hardlimit for tcpqe_pool */
int tcp_sackhole_limit = 32*1024; /* hardlimit for sackhl_pool */
struct pool tcpcb_pool;
struct pool tcpqe_pool;
struct pool sackhl_pool;
struct cpumem *tcpcounters; /* tcp statistics */
u_char tcp_secret[16]; /* [I] */
SHA2_CTX tcp_secret_ctx; /* [I] */
tcp_seq tcp_iss; /* [T] updated by timer and connection */
uint32_t tcp_now; /* [T] incremented by slow timer */
/*
* Tcp initialization
*/
void
tcp_init(void)
{
tcp_iss = 1; /* wrong */
tcp_now = 1;
pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, IPL_SOFTNET, 0,
"tcpcb", NULL);
pool_init(&tcpqe_pool, sizeof(struct tcpqent), 0, IPL_SOFTNET, 0,
"tcpqe", NULL);
pool_sethardlimit(&tcpqe_pool, tcp_reass_limit, NULL, 0);
pool_init(&sackhl_pool, sizeof(struct sackhole), 0, IPL_SOFTNET, 0,
"sackhl", NULL);
pool_sethardlimit(&sackhl_pool, tcp_sackhole_limit, NULL, 0);
in_pcbinit(&tcbtable, TCB_INITIAL_HASH_SIZE);
tcpcounters = counters_alloc(tcps_ncounters);
arc4random_buf(tcp_secret, sizeof(tcp_secret));
SHA512Init(&tcp_secret_ctx);
SHA512Update(&tcp_secret_ctx, tcp_secret, sizeof(tcp_secret));
#ifdef INET6
/*
* Since sizeof(struct ip6_hdr) > sizeof(struct ip), we
* do max length checks/computations only on the former.
*/
if (max_protohdr < (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)))
max_protohdr = (sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
if ((max_linkhdr + sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) >
MHLEN)
panic("tcp_init");
icmp6_mtudisc_callback_register(tcp6_mtudisc_callback);
#endif /* INET6 */
/* Initialize the compressed state engine. */
syn_cache_init();
/* Initialize timer state. */
tcp_timer_init();
}
/*
* Create template to be used to send tcp packets on a connection.
* Call after host entry created, allocates an mbuf and fills
* in a skeletal tcp/ip header, minimizing the amount of work
* necessary when the connection is used.
*
* To support IPv6 in addition to IPv4 and considering that the sizes of
* the IPv4 and IPv6 headers are not the same, we now use a separate pointer
* for the TCP header. Also, we made the former tcpiphdr header pointer
* into just an IP overlay pointer, with casting as appropriate for v6. rja
*/
struct mbuf *
tcp_template(struct tcpcb *tp)
{
struct inpcb *inp = tp->t_inpcb;
struct mbuf *m;
struct tcphdr *th;
CTASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= MHLEN);
CTASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= MHLEN);
if ((m = tp->t_template) == 0) {
m = m_get(M_DONTWAIT, MT_HEADER);
if (m == NULL)
return (0);
switch (tp->pf) {
case 0: /*default to PF_INET*/
case AF_INET:
m->m_len = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
m->m_len = sizeof(struct ip6_hdr);
break;
#endif /* INET6 */
}
m->m_len += sizeof (struct tcphdr);
}
switch(tp->pf) {
case AF_INET:
{
struct ipovly *ipovly;
ipovly = mtod(m, struct ipovly *);
bzero(ipovly->ih_x1, sizeof ipovly->ih_x1);
ipovly->ih_pr = IPPROTO_TCP;
ipovly->ih_len = htons(sizeof (struct tcphdr));
ipovly->ih_src = inp->inp_laddr;
ipovly->ih_dst = inp->inp_faddr;
th = (struct tcphdr *)(mtod(m, caddr_t) +
sizeof(struct ip));
}
break;
#ifdef INET6
case AF_INET6:
{
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_src = inp->inp_laddr6;
ip6->ip6_dst = inp->inp_faddr6;
ip6->ip6_flow = htonl(0x60000000) |
(inp->inp_flowinfo & IPV6_FLOWLABEL_MASK);
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_plen = htons(sizeof(struct tcphdr)); /*XXX*/
ip6->ip6_hlim = in6_selecthlim(inp); /*XXX*/
th = (struct tcphdr *)(mtod(m, caddr_t) +
sizeof(struct ip6_hdr));
}
break;
#endif /* INET6 */
}
th->th_sport = inp->inp_lport;
th->th_dport = inp->inp_fport;
th->th_seq = 0;
th->th_ack = 0;
th->th_x2 = 0;
th->th_off = 5;
th->th_flags = 0;
th->th_win = 0;
th->th_urp = 0;
th->th_sum = 0;
return (m);
}
/*
* Send a single message to the TCP at address specified by
* the given TCP/IP header. If m == 0, then we make a copy
* of the tcpiphdr at ti and send directly to the addressed host.
* This is used to force keep alive messages out using the TCP
* template for a connection tp->t_template. If flags are given
* then we send a message back to the TCP which originated the
* segment ti, and discard the mbuf containing it and any other
* attached mbufs.
*
* In any case the ack and sequence number of the transmitted
* segment are as specified by the parameters.
*/
void
tcp_respond(struct tcpcb *tp, caddr_t template, struct tcphdr *th0,
tcp_seq ack, tcp_seq seq, int flags, u_int rtableid, uint32_t now)
{
int tlen;
int win = 0;
struct mbuf *m = NULL;
struct tcphdr *th;
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
int af; /* af on wire */
if (tp) {
struct socket *so = tp->t_inpcb->inp_socket;
win = sbspace(so, &so->so_rcv);
/*
* If this is called with an unconnected
* socket/tp/pcb (tp->pf is 0), we lose.
*/
af = tp->pf;
} else
af = (((struct ip *)template)->ip_v == 6) ? AF_INET6 : AF_INET;
m = m_gethdr(M_DONTWAIT, MT_HEADER);
if (m == NULL)
return;
m->m_data += max_linkhdr;
tlen = 0;
#define xchg(a,b,type) do { type t; t=a; a=b; b=t; } while (0)
switch (af) {
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tlen = sizeof(*ip6) + sizeof(*th);
if (th0) {
bcopy(template, ip6, sizeof(*ip6));
bcopy(th0, th, sizeof(*th));
xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
} else {
bcopy(template, ip6, tlen);
}
break;
#endif /* INET6 */
case AF_INET:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
tlen = sizeof(*ip) + sizeof(*th);
if (th0) {
bcopy(template, ip, sizeof(*ip));
bcopy(th0, th, sizeof(*th));
xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, u_int32_t);
} else {
bcopy(template, ip, tlen);
}
break;
}
if (th0)
xchg(th->th_dport, th->th_sport, u_int16_t);
else
flags = TH_ACK;
#undef xchg
th->th_seq = htonl(seq);
th->th_ack = htonl(ack);
th->th_x2 = 0;
th->th_off = sizeof (struct tcphdr) >> 2;
th->th_flags = flags;
if (tp)
win >>= tp->rcv_scale;
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
th->th_win = htons((u_int16_t)win);
th->th_urp = 0;
if (tp && (tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(flags & TH_RST) == 0 && (tp->t_flags & TF_RCVD_TSTMP)) {
u_int32_t *lp = (u_int32_t *)(th + 1);
/* Form timestamp option as shown in appendix A of RFC 1323. */
*lp++ = htonl(TCPOPT_TSTAMP_HDR);
*lp++ = htonl(now + tp->ts_modulate);
*lp = htonl(tp->ts_recent);
tlen += TCPOLEN_TSTAMP_APPA;
th->th_off = (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2;
}
m->m_len = tlen;
m->m_pkthdr.len = tlen;
m->m_pkthdr.ph_ifidx = 0;
m->m_pkthdr.csum_flags |= M_TCP_CSUM_OUT;
/* force routing table */
if (tp)
m->m_pkthdr.ph_rtableid = tp->t_inpcb->inp_rtableid;
else
m->m_pkthdr.ph_rtableid = rtableid;
switch (af) {
#ifdef INET6
case AF_INET6:
ip6->ip6_flow = htonl(0x60000000);
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL); /*XXX*/
ip6->ip6_plen = tlen - sizeof(struct ip6_hdr);
ip6->ip6_plen = htons(ip6->ip6_plen);
ip6_output(m, tp ? tp->t_inpcb->inp_outputopts6 : NULL,
tp ? &tp->t_inpcb->inp_route6 : NULL,
0, NULL,
tp ? tp->t_inpcb : NULL);
break;
#endif /* INET6 */
case AF_INET:
ip->ip_len = htons(tlen);
ip->ip_ttl = ip_defttl;
ip->ip_tos = 0;
ip_output(m, NULL,
tp ? &tp->t_inpcb->inp_route : NULL,
ip_mtudisc ? IP_MTUDISC : 0, NULL,
tp ? tp->t_inpcb : NULL, 0);
break;
}
}
/*
* Create a new TCP control block, making an
* empty reassembly queue and hooking it to the argument
* protocol control block.
*/
struct tcpcb *
tcp_newtcpcb(struct inpcb *inp)
{
struct tcpcb *tp;
int i;
tp = pool_get(&tcpcb_pool, PR_NOWAIT|PR_ZERO);
if (tp == NULL)
return (NULL);
TAILQ_INIT(&tp->t_segq);
tp->t_maxseg = tcp_mssdflt;
tp->t_maxopd = 0;
for (i = 0; i < TCPT_NTIMERS; i++)
TCP_TIMER_INIT(tp, i);
tp->sack_enable = tcp_do_sack;
tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
tp->t_inpcb = inp;
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
* reasonable initial retransmit time.
*/
tp->t_srtt = TCPTV_SRTTBASE;
tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ <<
(TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
tp->t_rttmin = TCPTV_MIN;
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
TCPTV_MIN, TCPTV_REXMTMAX);
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_pmtud_mtu_sent = 0;
tp->t_pmtud_mss_acked = 0;
#ifdef INET6
/* we disallow IPv4 mapped address completely. */
if ((inp->inp_flags & INP_IPV6) == 0)
tp->pf = PF_INET;
else
tp->pf = PF_INET6;
#else
tp->pf = PF_INET;
#endif
#ifdef INET6
if (inp->inp_flags & INP_IPV6)
inp->inp_ipv6.ip6_hlim = ip6_defhlim;
else
#endif /* INET6 */
inp->inp_ip.ip_ttl = ip_defttl;
inp->inp_ppcb = (caddr_t)tp;
return (tp);
}
/*
* Drop a TCP connection, reporting
* the specified error. If connection is synchronized,
* then send a RST to peer.
*/
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
struct socket *so = tp->t_inpcb->inp_socket;
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_state = TCPS_CLOSED;
(void) tcp_output(tp);
tcpstat_inc(tcps_drops);
} else
tcpstat_inc(tcps_conndrops);
if (errno == ETIMEDOUT && tp->t_softerror)
errno = tp->t_softerror;
so->so_error = errno;
return (tcp_close(tp));
}
/*
* Close a TCP control block:
* discard all space held by the tcp
* discard internet protocol block
* wake up any sleepers
*/
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
struct sackhole *p, *q;
/* free the reassembly queue, if any */
tcp_freeq(tp);
tcp_canceltimers(tp);
syn_cache_cleanup(tp);
/* Free SACK holes. */
q = p = tp->snd_holes;
while (p != 0) {
q = p->next;
pool_put(&sackhl_pool, p);
p = q;
}
m_free(tp->t_template);
/* Free tcpcb after all pending timers have been run. */
TCP_TIMER_ARM(tp, TCPT_REAPER, 1);
inp->inp_ppcb = NULL;
soisdisconnected(so);
in_pcbdetach(inp);
return (NULL);
}
int
tcp_freeq(struct tcpcb *tp)
{
struct tcpqent *qe;
int rv = 0;
while ((qe = TAILQ_FIRST(&tp->t_segq)) != NULL) {
TAILQ_REMOVE(&tp->t_segq, qe, tcpqe_q);
m_freem(qe->tcpqe_m);
pool_put(&tcpqe_pool, qe);
rv = 1;
}
return (rv);
}
/*
* Compute proper scaling value for receiver window from buffer space
*/
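/*
 * Example: a high-water mark of 1 MiB yields a scale of 5, because
 * 65535 << 4 = 1048560 still falls 16 bytes short of 1048576 and the
 * loop below keeps going until the scaled window covers the buffer.
 */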
void
tcp_rscale(struct tcpcb *tp, u_long hiwat)
{
tp->request_r_scale = 0;
while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
TCP_MAXWIN << tp->request_r_scale < hiwat)
tp->request_r_scale++;
}
/*
* Notify a tcp user of an asynchronous error;
* store error as soft error, but wake up user
* (for now, won't do anything until can select for soft error).
*/
void
tcp_notify(struct inpcb *inp, int error)
{
struct tcpcb *tp = intotcpcb(inp);
struct socket *so = inp->inp_socket;
/*
* Ignore some errors if we are hooked up.
* If connection hasn't completed, has retransmitted several times,
* and receives a second error, give up now. This is better
* than waiting a long time to establish a connection that
* can never complete.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
(error == EHOSTUNREACH || error == ENETUNREACH ||
error == EHOSTDOWN)) {
return;
} else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
tp->t_rxtshift > 3 && tp->t_softerror)
so->so_error = error;
else
tp->t_softerror = error;
wakeup((caddr_t) &so->so_timeo);
sorwakeup(so);
sowwakeup(so);
}
#ifdef INET6
void
tcp6_ctlinput(int cmd, struct sockaddr *sa, u_int rdomain, void *d)
{
struct tcphdr th;
struct tcpcb *tp;
void (*notify)(struct inpcb *, int) = tcp_notify;
struct ip6_hdr *ip6;
const struct sockaddr_in6 *sa6_src = NULL;
struct sockaddr_in6 *sa6 = satosin6(sa);
struct inpcb *inp;
struct mbuf *m;
tcp_seq seq;
int off;
struct {
u_int16_t th_sport;
u_int16_t th_dport;
u_int32_t th_seq;
} *thp;
CTASSERT(sizeof(*thp) <= sizeof(th));
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6) ||
IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
IN6_IS_ADDR_V4MAPPED(&sa6->sin6_addr))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
else if (cmd == PRC_QUENCH) {
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
/* XXX there's no PRC_QUENCH in IPv6 */
return;
} else if (PRC_IS_REDIRECT(cmd))
notify = in_rtchange, d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
sa6_src = &sa6_any;
}
if (ip6) {
/*
* XXX: We assume that when ip6 is non NULL,
* M and OFF are valid.
*/
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(*thp))
return;
bzero(&th, sizeof(th));
m_copydata(m, off, sizeof(*thp), &th);
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMPv6 message
* payload.
*/
inp = in6_pcblookup(&tcbtable, &sa6->sin6_addr,
th.th_dport, &sa6_src->sin6_addr, th.th_sport, rdomain);
if (cmd == PRC_MSGSIZE) {
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d,
inp != NULL);
in_pcbunref(inp);
return;
}
if (inp) {
seq = ntohl(th.th_seq);
if (inp->inp_socket &&
(tp = intotcpcb(inp)) &&
SEQ_GEQ(seq, tp->snd_una) &&
SEQ_LT(seq, tp->snd_max))
notify(inp, inet6ctlerrmap[cmd]);
} else if (inet6ctlerrmap[cmd] == EHOSTUNREACH ||
inet6ctlerrmap[cmd] == ENETUNREACH ||
inet6ctlerrmap[cmd] == EHOSTDOWN)
syn_cache_unreach((struct sockaddr *)sa6_src,
sa, &th, rdomain);
in_pcbunref(inp);
} else {
in6_pcbnotify(&tcbtable, sa6, 0,
sa6_src, 0, rdomain, cmd, NULL, notify);
}
}
#endif
void
tcp_ctlinput(int cmd, struct sockaddr *sa, u_int rdomain, void *v)
{
struct ip *ip = v;
struct tcphdr *th;
struct tcpcb *tp;
struct inpcb *inp;
struct in_addr faddr;
tcp_seq seq;
u_int mtu;
void (*notify)(struct inpcb *, int) = tcp_notify;
int errno;
if (sa->sa_family != AF_INET)
return;
faddr = satosin(sa)->sin_addr;
if (faddr.s_addr == INADDR_ANY)
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
errno = inetctlerrmap[cmd];
if (cmd == PRC_QUENCH)
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return;
else if (PRC_IS_REDIRECT(cmd))
notify = in_rtchange, ip = 0;
else if (cmd == PRC_MSGSIZE && ip_mtudisc && ip) {
/*
* Verify that the packet in the icmp payload refers
* to an existing TCP connection.
*/
th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
seq = ntohl(th->th_seq);
inp = in_pcblookup(&tcbtable,
ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport,
rdomain);
if (inp && (tp = intotcpcb(inp)) &&
SEQ_GEQ(seq, tp->snd_una) &&
SEQ_LT(seq, tp->snd_max)) {
struct icmp *icp;
icp = (struct icmp *)((caddr_t)ip -
offsetof(struct icmp, icmp_ip));
/*
* If the ICMP message advertises a Next-Hop MTU
* equal or larger than the maximum packet size we have
* ever sent, drop the message.
*/
mtu = (u_int)ntohs(icp->icmp_nextmtu);
if (mtu >= tp->t_pmtud_mtu_sent) {
in_pcbunref(inp);
return;
}
if (mtu >= tcp_hdrsz(tp) + tp->t_pmtud_mss_acked) {
/*
* Calculate new MTU, and create corresponding
* route (traditional PMTUD).
*/
tp->t_flags &= ~TF_PMTUD_PEND;
icmp_mtudisc(icp, inp->inp_rtableid);
} else {
/*
* Record the information got in the ICMP
* message; act on it later.
* If we had already recorded an ICMP message,
* replace the old one only if the new message
* refers to an older TCP segment
*/
if (tp->t_flags & TF_PMTUD_PEND) {
if (SEQ_LT(tp->t_pmtud_th_seq, seq)) {
in_pcbunref(inp);
return;
}
} else
tp->t_flags |= TF_PMTUD_PEND;
tp->t_pmtud_th_seq = seq;
tp->t_pmtud_nextmtu = icp->icmp_nextmtu;
tp->t_pmtud_ip_len = icp->icmp_ip.ip_len;
tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl;
in_pcbunref(inp);
return;
}
} else {
/* ignore if we don't have a matching connection */
in_pcbunref(inp);
return;
}
in_pcbunref(inp);
notify = tcp_mtudisc, ip = 0;
} else if (cmd == PRC_MTUINC)
notify = tcp_mtudisc_increase, ip = 0;
else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if (errno == 0)
return;
if (ip) {
th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
inp = in_pcblookup(&tcbtable,
ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport,
rdomain);
if (inp) {
seq = ntohl(th->th_seq);
if (inp->inp_socket &&
(tp = intotcpcb(inp)) &&
SEQ_GEQ(seq, tp->snd_una) &&
SEQ_LT(seq, tp->snd_max))
notify(inp, errno);
} else if (inetctlerrmap[cmd] == EHOSTUNREACH ||
inetctlerrmap[cmd] == ENETUNREACH ||
inetctlerrmap[cmd] == EHOSTDOWN) {
struct sockaddr_in sin;
bzero(&sin, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_port = th->th_sport;
sin.sin_addr = ip->ip_src;
syn_cache_unreach(sintosa(&sin), sa, th, rdomain);
}
in_pcbunref(inp);
} else
in_pcbnotifyall(&tcbtable, sa, rdomain, errno, notify);
}
#ifdef INET6
/*
* Path MTU Discovery handlers.
*/
void
tcp6_mtudisc_callback(struct sockaddr_in6 *sin6, u_int rdomain)
{
in6_pcbnotify(&tcbtable, sin6, 0,
&sa6_any, 0, rdomain, PRC_MSGSIZE, NULL, tcp_mtudisc);
}
#endif /* INET6 */
/*
* On receipt of path MTU corrections, flush old route and replace it
* with the new one. Retransmit all unacknowledged packets, to ensure
* that all packets will be received.
*/
void
tcp_mtudisc(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
int orig_maxseg, change = 0;
if (tp == NULL)
return;
orig_maxseg = tp->t_maxseg;
rt = in_pcbrtentry(inp);
if (rt != NULL) {
unsigned int orig_mtulock = (rt->rt_locks & RTV_MTU);
/*
* If this was not a host route, remove and realloc.
*/
if ((rt->rt_flags & RTF_HOST) == 0) {
in_rtchange(inp, errno);
if ((rt = in_pcbrtentry(inp)) == NULL)
return;
}
if (orig_mtulock < (rt->rt_locks & RTV_MTU))
change = 1;
}
tcp_mss(tp, -1);
if (orig_maxseg > tp->t_maxseg)
change = 1;
/*
* Resend unacknowledged packets
*/
tp->snd_nxt = tp->snd_una;
if (change || errno > 0)
tcp_output(tp);
}
void
tcp_mtudisc_increase(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt = in_pcbrtentry(inp);
if (tp != 0 && rt != 0) {
/*
* If this was a host route, remove and realloc.
*/
if (rt->rt_flags & RTF_HOST)
in_rtchange(inp, errno);
/* also takes care of congestion window */
tcp_mss(tp, -1);
}
}
/*
* Generate new ISNs with a method based on RFC1948
*/
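/*
 * As implemented below, the ISS combines a keyed hash (SHA-512 over a
 * per-boot secret, the routing domain and the connection 4-tuple) with
 * a global counter bumped by TCP_ISS_CONN_INC per connection, so values
 * are unpredictable across peers yet increase for repeated connections
 * to the same peer within one boot, as RFC 1948 intends.
 */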
#define TCP_ISS_CONN_INC 4096
void
tcp_set_iss_tsm(struct tcpcb *tp)
{
SHA2_CTX ctx;
union {
uint8_t bytes[SHA512_DIGEST_LENGTH];
uint32_t words[2];
} digest;
u_int rdomain = rtable_l2(tp->t_inpcb->inp_rtableid);
tcp_seq iss;
mtx_enter(&tcp_timer_mtx);
tcp_iss += TCP_ISS_CONN_INC;
iss = tcp_iss;
mtx_leave(&tcp_timer_mtx);
ctx = tcp_secret_ctx;
SHA512Update(&ctx, &rdomain, sizeof(rdomain));
SHA512Update(&ctx, &tp->t_inpcb->inp_lport, sizeof(u_short));
SHA512Update(&ctx, &tp->t_inpcb->inp_fport, sizeof(u_short));
if (tp->pf == AF_INET6) {
SHA512Update(&ctx, &tp->t_inpcb->inp_laddr6,
sizeof(struct in6_addr));
SHA512Update(&ctx, &tp->t_inpcb->inp_faddr6,
sizeof(struct in6_addr));
} else {
SHA512Update(&ctx, &tp->t_inpcb->inp_laddr,
sizeof(struct in_addr));
SHA512Update(&ctx, &tp->t_inpcb->inp_faddr,
sizeof(struct in_addr));
}
SHA512Final(digest.bytes, &ctx);
tp->iss = digest.words[0] + iss;
tp->ts_modulate = digest.words[1];
}
#ifdef TCP_SIGNATURE
int
tcp_signature_tdb_attach(void)
{
return (0);
}
int
tcp_signature_tdb_init(struct tdb *tdbp, const struct xformsw *xsp,
struct ipsecinit *ii)
{
if ((ii->ii_authkeylen < 1) || (ii->ii_authkeylen > 80))
return (EINVAL);
tdbp->tdb_amxkey = malloc(ii->ii_authkeylen, M_XDATA, M_NOWAIT);
if (tdbp->tdb_amxkey == NULL)
return (ENOMEM);
memcpy(tdbp->tdb_amxkey, ii->ii_authkey, ii->ii_authkeylen);
tdbp->tdb_amxkeylen = ii->ii_authkeylen;
return (0);
}
int
tcp_signature_tdb_zeroize(struct tdb *tdbp)
{
if (tdbp->tdb_amxkey) {
explicit_bzero(tdbp->tdb_amxkey, tdbp->tdb_amxkeylen);
free(tdbp->tdb_amxkey, M_XDATA, tdbp->tdb_amxkeylen);
tdbp->tdb_amxkey = NULL;
}
return (0);
}
int
tcp_signature_tdb_input(struct mbuf **mp, struct tdb *tdbp, int skip,
int protoff)
{
m_freemp(mp);
return (IPPROTO_DONE);
}
int
tcp_signature_tdb_output(struct mbuf *m, struct tdb *tdbp, int skip,
int protoff)
{
m_freem(m);
return (EINVAL);
}
int
tcp_signature_apply(caddr_t fstate, caddr_t data, unsigned int len)
{
MD5Update((MD5_CTX *)fstate, (char *)data, len);
return 0;
}
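/*
 * tcp_signature() below computes the RFC 2385 TCP MD5 digest: MD5 over
 * the IPv4/IPv6 pseudo-header, the TCP header with its checksum zeroed
 * (and fields byte-swapped if requested), the segment payload, and
 * finally the shared key (tdb_amxkey), written into the 16-byte buffer
 * at "sig".
 */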
int
tcp_signature(struct tdb *tdb, int af, struct mbuf *m, struct tcphdr *th,
int iphlen, int doswap, char *sig)
{
MD5_CTX ctx;
int len;
struct tcphdr th0;
MD5Init(&ctx);
switch(af) {
case 0:
case AF_INET: {
struct ippseudo ippseudo;
struct ip *ip;
ip = mtod(m, struct ip *);
ippseudo.ippseudo_src = ip->ip_src;
ippseudo.ippseudo_dst = ip->ip_dst;
ippseudo.ippseudo_pad = 0;
ippseudo.ippseudo_p = IPPROTO_TCP;
ippseudo.ippseudo_len = htons(m->m_pkthdr.len - iphlen);
MD5Update(&ctx, (char *)&ippseudo,
sizeof(struct ippseudo));
break;
}
#ifdef INET6
case AF_INET6: {
struct ip6_hdr_pseudo ip6pseudo;
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
bzero(&ip6pseudo, sizeof(ip6pseudo));
ip6pseudo.ip6ph_src = ip6->ip6_src;
ip6pseudo.ip6ph_dst = ip6->ip6_dst;
in6_clearscope(&ip6pseudo.ip6ph_src);
in6_clearscope(&ip6pseudo.ip6ph_dst);
ip6pseudo.ip6ph_nxt = IPPROTO_TCP;
ip6pseudo.ip6ph_len = htonl(m->m_pkthdr.len - iphlen);
MD5Update(&ctx, (char *)&ip6pseudo,
sizeof(ip6pseudo));
break;
}
#endif
}
th0 = *th;
th0.th_sum = 0;
if (doswap) {
th0.th_seq = htonl(th0.th_seq);
th0.th_ack = htonl(th0.th_ack);
th0.th_win = htons(th0.th_win);
th0.th_urp = htons(th0.th_urp);
}
MD5Update(&ctx, (char *)&th0, sizeof(th0));
len = m->m_pkthdr.len - iphlen - th->th_off * sizeof(uint32_t);
if (len > 0 &&
m_apply(m, iphlen + th->th_off * sizeof(uint32_t), len,
tcp_signature_apply, (caddr_t)&ctx))
return (-1);
MD5Update(&ctx, tdb->tdb_amxkey, tdb->tdb_amxkeylen);
MD5Final(sig, &ctx);
return (0);
}
#endif /* TCP_SIGNATURE */
/* $OpenBSD: ufs_vnops.c,v 1.156 2022/06/26 05:20:43 visa Exp $ */
/* $NetBSD: ufs_vnops.c,v 1.18 1996/05/11 18:28:04 mycroft Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/dirent.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/specdev.h>
#include <sys/unistd.h>
#include <miscfs/fifofs/fifo.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#include <ufs/ext2fs/ext2fs_extern.h>
#include <uvm/uvm_extern.h>
int ufs_chmod(struct vnode *, int, struct ucred *);
int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *);
int filt_ufsread(struct knote *, long);
int filt_ufswrite(struct knote *, long);
int filt_ufsvnode(struct knote *, long);
void filt_ufsdetach(struct knote *);
/*
* A virgin directory (no blushing please).
*/
static struct dirtemplate mastertemplate = {
0, 12, DT_DIR, 1, ".",
0, DIRBLKSIZ - 12, DT_DIR, 2, ".."
};
static struct odirtemplate omastertemplate = {
0, 12, 1, ".",
0, DIRBLKSIZ - 12, 2, ".."
};
/*
* Update the times in the inode
*/
void
ufs_itimes(struct vnode *vp)
{
struct inode *ip;
struct timespec ts;
ip = VTOI(vp);
if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
return;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
goto out;
#ifdef EXT2FS
if (IS_EXT2_VNODE(ip->i_vnode)) {
EXT2FS_ITIMES(ip);
goto out;
}
#endif
if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp))
ip->i_flag |= IN_LAZYMOD;
else
ip->i_flag |= IN_MODIFIED;
getnanotime(&ts);
if (ip->i_flag & IN_ACCESS) {
DIP_ASSIGN(ip, atime, ts.tv_sec);
DIP_ASSIGN(ip, atimensec, ts.tv_nsec);
}
if (ip->i_flag & IN_UPDATE) {
DIP_ASSIGN(ip, mtime, ts.tv_sec);
DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
}
if (ip->i_flag & IN_CHANGE) {
DIP_ASSIGN(ip, ctime, ts.tv_sec);
DIP_ASSIGN(ip, ctimensec, ts.tv_nsec);
ip->i_modrev++;
}
out:
ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
}
/*
* Create a regular file
*/
int
ufs_create(void *v)
{
struct vop_create_args *ap = v;
int error;
error =
ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
ap->a_dvp, ap->a_vpp, ap->a_cnp);
if (error == 0)
VN_KNOTE(ap->a_dvp, NOTE_WRITE);
return (error);
}
/*
* Mknod vnode call
*/
int
ufs_mknod(void *v)
{
struct vop_mknod_args *ap = v;
struct vattr *vap = ap->a_vap;
struct vnode **vpp = ap->a_vpp;
struct inode *ip;
int error;
if ((error =
ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
ap->a_dvp, vpp, ap->a_cnp)) != 0)
return (error);
VN_KNOTE(ap->a_dvp, NOTE_WRITE);
ip = VTOI(*vpp);
ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
if (vap->va_rdev != VNOVAL) {
/*
* Want to be able to use this to make badblock
* inodes, so don't truncate the dev number.
*/
DIP_ASSIGN(ip, rdev, vap->va_rdev);
}
/*
* Remove inode so that it will be reloaded by VFS_VGET and
* checked to see if it is an alias of an existing entry in
* the inode cache.
*/
vput(*vpp);
(*vpp)->v_type = VNON;
vgone(*vpp);
*vpp = NULL;
return (0);
}
/*
* Open called.
*
* Nothing to do.
*/
int
ufs_open(void *v)
{
struct vop_open_args *ap = v;
struct inode *ip = VTOI(ap->a_vp);
/*
* Files marked append-only must be opened for appending.
*/
if ((DIP(ip, flags) & APPEND) &&
(ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
return (EPERM);
if (ap->a_mode & O_TRUNC)
ip->i_flag |= IN_CHANGE | IN_UPDATE;
return (0);
}
/*
* Close called.
*
* Update the times on the inode.
*/
int
ufs_close(void *v)
{
struct vop_close_args *ap = v;
struct vnode *vp = ap->a_vp;
if (vp->v_usecount > 1)
ufs_itimes(vp);
return (0);
}
int
ufs_access(void *v)
{
struct vop_access_args *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
mode_t mode = ap->a_mode;
/*
* Disallow write attempts on read-only file systems;
* unless the file is a socket, fifo, or a block or
* character device resident on the file system.
*/
if (mode & VWRITE) {
switch (vp->v_type) {
int error;
case VDIR:
case VLNK:
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
if ((error = getinoquota(ip)) != 0)
return (error);
break;
case VBAD:
case VBLK:
case VCHR:
case VSOCK:
case VFIFO:
case VNON:
break;
}
}
/* If immutable bit set, nobody gets to write it. */
if ((mode & VWRITE) && (DIP(ip, flags) & IMMUTABLE))
return (EPERM);
if (vnoperm(vp)) {
/* For VEXEC, at least one of the execute bits must be set. */
if ((mode & VEXEC) && vp->v_type != VDIR &&
(DIP(ip, mode) & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
return EACCES;
return 0;
}
return (vaccess(vp->v_type, DIP(ip, mode), DIP(ip, uid), DIP(ip, gid),
mode, ap->a_cred));
}
int
ufs_getattr(void *v)
{
struct vop_getattr_args *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
struct vattr *vap = ap->a_vap;
ufs_itimes(vp);
/*
* Copy from inode table
*/
vap->va_fsid = ip->i_dev;
vap->va_fileid = ip->i_number;
vap->va_mode = DIP(ip, mode) & ~IFMT;
vap->va_nlink = ip->i_effnlink;
vap->va_uid = DIP(ip, uid);
vap->va_gid = DIP(ip, gid);
vap->va_rdev = (dev_t) DIP(ip, rdev);
vap->va_size = DIP(ip, size);
vap->va_atime.tv_sec = DIP(ip, atime);
vap->va_atime.tv_nsec = DIP(ip, atimensec);
vap->va_mtime.tv_sec = DIP(ip, mtime);
vap->va_mtime.tv_nsec = DIP(ip, mtimensec);
vap->va_ctime.tv_sec = DIP(ip, ctime);
vap->va_ctime.tv_nsec = DIP(ip, ctimensec);
vap->va_flags = DIP(ip, flags);
vap->va_gen = DIP(ip, gen);
/* this doesn't belong here */
if (vp->v_type == VBLK)
vap->va_blocksize = BLKDEV_IOSIZE;
else if (vp->v_type == VCHR)
vap->va_blocksize = MAXBSIZE;
else
vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
vap->va_bytes = dbtob((u_quad_t) DIP(ip, blocks));
vap->va_type = vp->v_type;
vap->va_filerev = ip->i_modrev;
return (0);
}
/*
* Set attribute vnode op. called from several syscalls
*/
int
ufs_setattr(void *v)
{
struct vop_setattr_args *ap = v;
struct vattr *vap = ap->a_vap;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
struct ucred *cred = ap->a_cred;
int error;
long hint = NOTE_ATTRIB;
u_quad_t oldsize;
/*
* Check for unsettable attributes.
*/
if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
(vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
(vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
return (EINVAL);
}
if (vap->va_flags != VNOVAL) {
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
if (cred->cr_uid != DIP(ip, uid) && !vnoperm(vp) &&
(error = suser_ucred(cred)))
return (error);
if (cred->cr_uid == 0 || vnoperm(vp)) {
if ((DIP(ip, flags) & (SF_IMMUTABLE | SF_APPEND)) &&
securelevel > 0)
return (EPERM);
DIP_ASSIGN(ip, flags, vap->va_flags);
} else {
if (DIP(ip, flags) & (SF_IMMUTABLE | SF_APPEND) ||
(vap->va_flags & UF_SETTABLE) != vap->va_flags)
return (EPERM);
DIP_AND(ip, flags, SF_SETTABLE);
DIP_OR(ip, flags, vap->va_flags & UF_SETTABLE);
}
ip->i_flag |= IN_CHANGE;
if (vap->va_flags & (IMMUTABLE | APPEND))
return (0);
}
if (DIP(ip, flags) & (IMMUTABLE | APPEND))
return (EPERM);
/*
* Go through the fields and update if not VNOVAL.
*/
if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred);
if (error)
return (error);
}
if (vap->va_size != VNOVAL) {
oldsize = DIP(ip, size);
/*
* Disallow write attempts on read-only file systems;
* unless the file is a socket, fifo, or a block or
* character device resident on the file system.
*/
switch (vp->v_type) {
case VDIR:
return (EISDIR);
case VLNK:
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
break;
default:
break;
}
if ((error = UFS_TRUNCATE(ip, vap->va_size, 0, cred)) != 0)
return (error);
if (vap->va_size < oldsize)
hint |= NOTE_TRUNCATE;
}
if ((vap->va_vaflags & VA_UTIMES_CHANGE) || vap->va_atime.tv_nsec != VNOVAL ||
vap->va_mtime.tv_nsec != VNOVAL) {
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
if (cred->cr_uid != DIP(ip, uid) && !vnoperm(vp) &&
(error = suser_ucred(cred)) &&
((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
(error = VOP_ACCESS(vp, VWRITE, cred, ap->a_p))))
return (error);
if (vap->va_mtime.tv_nsec != VNOVAL)
ip->i_flag |= IN_CHANGE | IN_UPDATE;
else if (vap->va_vaflags & VA_UTIMES_CHANGE)
ip->i_flag |= IN_CHANGE;
if (vap->va_atime.tv_nsec != VNOVAL) {
if (!(vp->v_mount->mnt_flag & MNT_NOATIME) ||
(ip->i_flag & (IN_CHANGE | IN_UPDATE)))
ip->i_flag |= IN_ACCESS;
}
ufs_itimes(vp);
if (vap->va_mtime.tv_nsec != VNOVAL) {
DIP_ASSIGN(ip, mtime, vap->va_mtime.tv_sec);
DIP_ASSIGN(ip, mtimensec, vap->va_mtime.tv_nsec);
}
if (vap->va_atime.tv_nsec != VNOVAL) {
DIP_ASSIGN(ip, atime, vap->va_atime.tv_sec);
DIP_ASSIGN(ip, atimensec, vap->va_atime.tv_nsec);
}
error = UFS_UPDATE(ip, 0);
if (error)
return (error);
}
error = 0;
if (vap->va_mode != (mode_t)VNOVAL) {
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
error = ufs_chmod(vp, (int)vap->va_mode, cred);
}
VN_KNOTE(vp, hint);
return (error);
}
/*
* Change the mode on a file.
* Inode must be locked before calling.
*/
int
ufs_chmod(struct vnode *vp, int mode, struct ucred *cred)
{
struct inode *ip = VTOI(vp);
int error;
if (cred->cr_uid != DIP(ip, uid) && !vnoperm(vp) &&
(error = suser_ucred(cred)))
return (error);
if (cred->cr_uid && !vnoperm(vp)) {
if (vp->v_type != VDIR && (mode & S_ISTXT))
return (EFTYPE);
if (!groupmember(DIP(ip, gid), cred) && (mode & ISGID))
return (EPERM);
}
DIP_AND(ip, mode, ~ALLPERMS);
DIP_OR(ip, mode, mode & ALLPERMS);
ip->i_flag |= IN_CHANGE;
if ((vp->v_flag & VTEXT) && (DIP(ip, mode) & S_ISTXT) == 0)
(void) uvm_vnp_uncache(vp);
return (0);
}
/*
* Perform chown operation on inode ip;
* inode must be locked prior to call.
*/
int
ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred)
{
struct inode *ip = VTOI(vp);
uid_t ouid;
gid_t ogid;
int error = 0;
daddr_t change;
enum ufs_quota_flags quota_flags = 0;
if (uid == (uid_t)VNOVAL)
uid = DIP(ip, uid);
if (gid == (gid_t)VNOVAL)
gid = DIP(ip, gid);
/*
* If we don't own the file, are trying to change the owner
* of the file, or are not a member of the target group,
* the caller must be superuser or the call fails.
*/
if ((cred->cr_uid != DIP(ip, uid) || uid != DIP(ip, uid) ||
(gid != DIP(ip, gid) && !groupmember(gid, cred))) && !vnoperm(vp) &&
(error = suser_ucred(cred)))
return (error);
ogid = DIP(ip, gid);
ouid = DIP(ip, uid);
change = DIP(ip, blocks);
if (ouid == uid)
quota_flags |= UFS_QUOTA_NOUID;
if (ogid == gid)
quota_flags |= UFS_QUOTA_NOGID;
if ((error = getinoquota(ip)) != 0)
return (error);
(void) ufs_quota_free_blocks2(ip, change, cred, quota_flags);
(void) ufs_quota_free_inode2(ip, cred, quota_flags);
(void) ufs_quota_delete(ip);
DIP_ASSIGN(ip, gid, gid);
DIP_ASSIGN(ip, uid, uid);
if ((error = getinoquota(ip)) != 0)
goto error;
if ((error = ufs_quota_alloc_blocks2(ip, change, cred,
quota_flags)) != 0)
goto error;
if ((error = ufs_quota_alloc_inode2(ip, cred,
quota_flags)) != 0) {
(void)ufs_quota_free_blocks2(ip, change, cred,
quota_flags);
goto error;
}
if (getinoquota(ip))
panic("chown: lost quota"); if (ouid != uid || ogid != gid) ip->i_flag |= IN_CHANGE; if (!vnoperm(vp)) { if (ouid != uid && cred->cr_uid != 0) DIP_AND(ip, mode, ~ISUID); if (ogid != gid && cred->cr_uid != 0) DIP_AND(ip, mode, ~ISGID);
}
return (0);
error:
(void) ufs_quota_delete(ip);
DIP_ASSIGN(ip, gid, ogid);
DIP_ASSIGN(ip, uid, ouid);
if (getinoquota(ip) == 0) {
(void) ufs_quota_alloc_blocks2(ip, change, cred,
quota_flags | UFS_QUOTA_FORCE);
(void) ufs_quota_alloc_inode2(ip, cred,
quota_flags | UFS_QUOTA_FORCE);
(void) getinoquota(ip);
}
return (error);
}
/* ARGSUSED */
int
ufs_ioctl(void *v)
{
#if 0
struct vop_ioctl_args *ap = v;
#endif
return (ENOTTY);
}
int
ufs_remove(void *v)
{
struct vop_remove_args *ap = v;
struct inode *ip;
struct vnode *vp = ap->a_vp;
struct vnode *dvp = ap->a_dvp;
int error;
ip = VTOI(vp);
if (vp->v_type == VDIR || (DIP(ip, flags) & (IMMUTABLE | APPEND)) ||
(DIP(VTOI(dvp), flags) & APPEND)) {
error = EPERM;
goto out;
}
error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
VN_KNOTE(vp, NOTE_DELETE);
VN_KNOTE(dvp, NOTE_WRITE);
out:
if (dvp == vp)
vrele(vp);
else
vput(vp);
vput(dvp);
return (error);
}
/*
* link vnode call
*/
int
ufs_link(void *v)
{
struct vop_link_args *ap = v;
struct vnode *dvp = ap->a_dvp;
struct vnode *vp = ap->a_vp;
struct componentname *cnp = ap->a_cnp;
struct inode *ip;
struct direct newdir;
int error;
#ifdef DIAGNOSTIC
if ((cnp->cn_flags & HASBUF) == 0)
panic("ufs_link: no name");
#endif
if (vp->v_type == VDIR) {
VOP_ABORTOP(dvp, cnp);
error = EPERM;
goto out2;
}
if (dvp->v_mount != vp->v_mount) {
VOP_ABORTOP(dvp, cnp);
error = EXDEV;
goto out2;
}
if (dvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE))) {
VOP_ABORTOP(dvp, cnp);
goto out2;
}
ip = VTOI(vp);
if ((nlink_t) DIP(ip, nlink) >= LINK_MAX) {
VOP_ABORTOP(dvp, cnp);
error = EMLINK;
goto out1;
}
if (DIP(ip, flags) & (IMMUTABLE | APPEND)) {
VOP_ABORTOP(dvp, cnp);
error = EPERM;
goto out1;
}
ip->i_effnlink++;
DIP_ADD(ip, nlink, 1);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(vp))
softdep_change_linkcnt(ip, 0);
if ((error = UFS_UPDATE(ip, !DOINGSOFTDEP(vp))) == 0) {
ufs_makedirentry(ip, cnp, &newdir);
error = ufs_direnter(dvp, vp, &newdir, cnp, NULL);
}
if (error) {
ip->i_effnlink--;
DIP_ADD(ip, nlink, -1);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(vp))
softdep_change_linkcnt(ip, 0);
}
pool_put(&namei_pool, cnp->cn_pnbuf);
VN_KNOTE(vp, NOTE_LINK);
VN_KNOTE(dvp, NOTE_WRITE);
out1:
if (dvp != vp)
VOP_UNLOCK(vp);
out2:
vput(dvp);
return (error);
}
/*
* Rename system call.
* rename("foo", "bar");
* is essentially
* unlink("bar");
* link("foo", "bar");
* unlink("foo");
* but ``atomically''. Can't do full commit without saving state in the
* inode on disk which isn't feasible at this time. Best we can do is
* always guarantee the target exists.
*
* Basic algorithm is:
*
* 1) Bump link count on source while we're linking it to the
* target. This also ensures the inode won't be deleted out
* from underneath us while we work (it may be truncated by
* a concurrent `trunc' or `open' for creation).
* 2) Link source to destination. If destination already exists,
* delete it first.
* 3) Unlink source reference to inode if still around. If a
* directory was moved and the parent of the destination
* is different from the source, patch the ".." entry in the
* directory.
*/
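/*
* Illustrative mapping (a sketch, not a spec): in terms of the helpers
* used below, step 2 is ufs_direnter() when the target name does not
* yet exist, or ufs_dirrewrite() when it does, and step 3 is the final
* ufs_dirremove() on the source name, with the source's link count
* held high across the whole sequence.
*/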
int
ufs_rename(void *v)
{
struct vop_rename_args *ap = v;
struct vnode *tvp = ap->a_tvp;
struct vnode *tdvp = ap->a_tdvp;
struct vnode *fvp = ap->a_fvp;
struct vnode *fdvp = ap->a_fdvp;
struct componentname *tcnp = ap->a_tcnp;
struct componentname *fcnp = ap->a_fcnp;
struct inode *ip, *xp, *dp;
struct direct newdir;
int doingdirectory = 0, oldparent = 0, newparent = 0;
int error = 0;
#ifdef DIAGNOSTIC
if ((tcnp->cn_flags & HASBUF) == 0 ||
(fcnp->cn_flags & HASBUF) == 0)
panic("ufs_rename: no name");
#endif
/*
* Check for cross-device rename.
*/
if ((fvp->v_mount != tdvp->v_mount) ||
(tvp && (fvp->v_mount != tvp->v_mount))) {
error = EXDEV;
abortit:
VOP_ABORTOP(tdvp, tcnp);
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
if (tvp)
vput(tvp);
VOP_ABORTOP(fdvp, fcnp);
vrele(fdvp);
vrele(fvp);
return (error);
}
if (tvp && ((DIP(VTOI(tvp), flags) & (IMMUTABLE | APPEND)) ||
(DIP(VTOI(tdvp), flags) & APPEND))) {
error = EPERM;
goto abortit;
}
/*
* Check if just deleting a link name or if we've lost a race.
* If another process completes the same rename after we've looked
* up the source and have blocked looking up the target, then the
* source and target inodes may be identical now although the
* names were never linked.
*/
if (fvp == tvp) {
if (fvp->v_type == VDIR) {
/*
* Linked directories are impossible, so we must
* have lost the race. Pretend that the rename
* completed before the lookup.
*/
error = ENOENT;
goto abortit;
}
/* Release destination completely. */
VOP_ABORTOP(tdvp, tcnp);
vput(tdvp);
vput(tvp);
/*
* Delete source. There is another race now that everything
* is unlocked, but this doesn't cause any new complications.
* relookup() may find a file that is unrelated to the
* original one, or it may fail. Too bad.
*/
vrele(fvp);
fcnp->cn_flags &= ~MODMASK;
fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
if ((fcnp->cn_flags & SAVESTART) == 0)
panic("ufs_rename: lost from startdir");
fcnp->cn_nameiop = DELETE;
if ((error = vfs_relookup(fdvp, &fvp, fcnp)) != 0)
return (error); /* relookup did vrele() */
vrele(fdvp);
return (VOP_REMOVE(fdvp, fvp, fcnp));
}
if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
goto abortit;
/* fvp, tdvp, tvp now locked */
dp = VTOI(fdvp);
ip = VTOI(fvp);
if ((nlink_t) DIP(ip, nlink) >= LINK_MAX) {
VOP_UNLOCK(fvp);
error = EMLINK;
goto abortit;
}
if ((DIP(ip, flags) & (IMMUTABLE | APPEND)) ||
(DIP(dp, flags) & APPEND)) {
VOP_UNLOCK(fvp);
error = EPERM;
goto abortit;
}
if ((DIP(ip, mode) & IFMT) == IFDIR) {
error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc);
if (!error && tvp)
error = VOP_ACCESS(tvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc);
if (error) {
VOP_UNLOCK(fvp);
error = EACCES;
goto abortit;
}
/*
* Avoid ".", "..", and aliases of "." for obvious reasons.
*/
if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
dp == ip || (fcnp->cn_flags & ISDOTDOT) ||
(tcnp->cn_flags & ISDOTDOT) ||
(ip->i_flag & IN_RENAME)) {
VOP_UNLOCK(fvp);
error = EINVAL;
goto abortit;
}
ip->i_flag |= IN_RENAME;
oldparent = dp->i_number;
doingdirectory = 1;
}
VN_KNOTE(fdvp, NOTE_WRITE); /* XXX right place? */
/*
* When the target exists, both the directory
* and target vnodes are returned locked.
*/
dp = VTOI(tdvp);
xp = NULL;
if (tvp)
xp = VTOI(tvp);
/*
* 1) Bump link count while we're moving stuff
* around. If we crash somewhere before
* completing our work, the link count
* may be wrong, but correctable.
*/
ip->i_effnlink++;
DIP_ADD(ip, nlink, 1);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(fvp))
softdep_change_linkcnt(ip, 0);
if ((error = UFS_UPDATE(ip, !DOINGSOFTDEP(fvp))) != 0) {
VOP_UNLOCK(fvp);
goto bad;
}
/*
* If ".." must be changed (ie the directory gets a new
* parent) then the source directory must not be in the
* directory hierarchy above the target, as this would
* orphan everything below the source directory. Also
* the user must have write permission in the source so
* as to be able to change "..". We must repeat the call
* to namei, as the parent directory is unlocked by the
* call to checkpath().
*/
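/*
* Example of the cycle this guards against: renaming /a/b to
* /a/b/c/d would move "b" underneath itself and orphan the subtree,
* so ufs_checkpath() walks from the new parent up towards the root
* and fails if it encounters the source directory on the way.
*/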
error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc);
VOP_UNLOCK(fvp);
/* tdvp and tvp locked */
if (oldparent != dp->i_number)
newparent = dp->i_number;
if (doingdirectory && newparent) {
if (error)	/* write access check above */
goto bad;
if (xp != NULL)
vput(tvp);
/*
* Compensate for the reference ufs_checkpath() loses.
*/
vref(tdvp);
/* Only tdvp is locked */
if ((error = ufs_checkpath(ip, dp, tcnp->cn_cred)) != 0) {
vrele(tdvp);
goto out;
}
if ((tcnp->cn_flags & SAVESTART) == 0)
panic("ufs_rename: lost to startdir"); if ((error = vfs_relookup(tdvp, &tvp, tcnp)) != 0)
goto out;
vrele(tdvp); /* relookup() acquired a reference */
dp = VTOI(tdvp);
xp = NULL;
if (tvp)
xp = VTOI(tvp);
}
/*
* 2) If target doesn't exist, link the target
* to the source and unlink the source.
* Otherwise, rewrite the target directory
* entry to reference the source inode and
* expunge the original entry's existence.
*/
if (xp == NULL) {
if (dp->i_dev != ip->i_dev)
panic("rename: EXDEV");
/*
* Account for ".." in new directory.
* When source and destination have the same
* parent we don't fool with the link count.
*/
if (doingdirectory && newparent) {
if ((nlink_t) DIP(dp, nlink) >= LINK_MAX) {
error = EMLINK;
goto bad;
}
dp->i_effnlink++;
DIP_ADD(dp, nlink, 1);
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tdvp))
softdep_change_linkcnt(dp, 0);
if ((error = UFS_UPDATE(dp, !DOINGSOFTDEP(tdvp)))
!= 0) {
dp->i_effnlink--;
DIP_ADD(dp, nlink, -1);
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tdvp))
softdep_change_linkcnt(dp, 0);
goto bad;
}
}
ufs_makedirentry(ip, tcnp, &newdir);
if ((error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL)) != 0) {
if (doingdirectory && newparent) {
dp->i_effnlink--;
DIP_ADD(dp, nlink, -1);
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tdvp))
softdep_change_linkcnt(dp, 0);
(void)UFS_UPDATE(dp, 1);
}
goto bad;
}
VN_KNOTE(tdvp, NOTE_WRITE);
vput(tdvp);
} else {
if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
panic("rename: EXDEV");
/*
* Short circuit rename(foo, foo).
*/
if (xp->i_number == ip->i_number)
panic("ufs_rename: same file");
/*
* If the parent directory is "sticky", then the user must
* own the parent directory, or the destination of the rename,
* otherwise the destination may not be changed (except by
* root). This implements append-only directories.
*/
if ((DIP(dp, mode) & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 &&
tcnp->cn_cred->cr_uid != DIP(dp, uid) &&
DIP(xp, uid) != tcnp->cn_cred->cr_uid &&
!vnoperm(tdvp)) {
error = EPERM;
goto bad;
}
/*
* Target must be empty if a directory and have no links
* to it. Also, ensure source and target are compatible
* (both directories, or both not directories).
*/
if ((DIP(xp, mode) & IFMT) == IFDIR) {
if (xp->i_effnlink > 2 ||
!ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
error = ENOTEMPTY;
goto bad;
}
if (!doingdirectory) {
error = ENOTDIR;
goto bad;
}
cache_purge(tdvp);
} else if (doingdirectory) {
error = EISDIR;
goto bad;
}
if ((error = ufs_dirrewrite(dp, xp, ip->i_number,
IFTODT(DIP(ip, mode)), (doingdirectory && newparent) ?
newparent : doingdirectory)) != 0)
goto bad;
if (doingdirectory) {
if (!newparent) {
dp->i_effnlink--;
if (DOINGSOFTDEP(tdvp))
softdep_change_linkcnt(dp, 0);
}
xp->i_effnlink--;
if (DOINGSOFTDEP(tvp))
softdep_change_linkcnt(xp, 0);
}
if (doingdirectory && !DOINGSOFTDEP(tvp)) {
/*
* Truncate inode. The only stuff left in the directory
* is "." and "..". The "." reference is inconsequential
* since we are quashing it. We have removed the "."
* reference and the reference in the parent directory,
* but there may be other hard links. The soft
* dependency code will arrange to do these operations
* after the parent directory entry has been deleted on
* disk, so when running with that code we avoid doing
* them now.
*/
if (!newparent) {
DIP_ADD(dp, nlink, -1);
dp->i_flag |= IN_CHANGE;
}
DIP_ADD(xp, nlink, -1);
xp->i_flag |= IN_CHANGE;
if ((error = UFS_TRUNCATE(VTOI(tvp), (off_t)0, IO_SYNC,
tcnp->cn_cred)) != 0)
goto bad;
}
VN_KNOTE(tdvp, NOTE_WRITE);
vput(tdvp);
VN_KNOTE(tvp, NOTE_DELETE);
vput(tvp);
xp = NULL;
}
/*
* 3) Unlink the source.
*/
fcnp->cn_flags &= ~MODMASK;
fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
if ((fcnp->cn_flags & SAVESTART) == 0)
panic("ufs_rename: lost from startdir");
if ((error = vfs_relookup(fdvp, &fvp, fcnp)) != 0) {
vrele(ap->a_fvp);
return (error);
}
vrele(fdvp);
if (fvp == NULL) {
/*
* From name has disappeared.
*/
if (doingdirectory)
panic("ufs_rename: lost dir entry"); vrele(ap->a_fvp);
return (0);
}
xp = VTOI(fvp);
dp = VTOI(fdvp);
/*
* Ensure that the directory entry still exists and has not
* changed while the new name has been entered. If the source is
* a file then the entry may have been unlinked or renamed. In
* either case there is no further work to be done. If the source
* is a directory then it cannot have been rmdir'ed; the IN_RENAME
* flag ensures that it cannot be moved by another rename or removed
* by a rmdir.
*/
if (xp != ip) {
if (doingdirectory)
panic("ufs_rename: lost dir entry");
} else {
/*
* If the source is a directory with a
* new parent, the link count of the old
* parent directory must be decremented
* and ".." set to point to the new parent.
*/
if (doingdirectory && newparent) {
xp->i_offset = mastertemplate.dot_reclen;
ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0);
cache_purge(fdvp);
}
error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
xp->i_flag &= ~IN_RENAME;
}
VN_KNOTE(fvp, NOTE_RENAME);
if (dp)
vput(fdvp);
if (xp)
vput(fvp);
vrele(ap->a_fvp);
return (error);
bad:
if (xp)
vput(ITOV(xp));
vput(ITOV(dp));
out:
vrele(fdvp);
if (doingdirectory)
ip->i_flag &= ~IN_RENAME;
if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
ip->i_effnlink--;
DIP_ADD(ip, nlink, -1);
ip->i_flag |= IN_CHANGE;
ip->i_flag &= ~IN_RENAME;
if (DOINGSOFTDEP(fvp))
softdep_change_linkcnt(ip, 0);
vput(fvp);
} else
vrele(fvp);
return (error);
}
/*
* Mkdir system call
*/
int
ufs_mkdir(void *v)
{
struct vop_mkdir_args *ap = v;
struct vnode *dvp = ap->a_dvp;
struct vattr *vap = ap->a_vap;
struct componentname *cnp = ap->a_cnp;
struct inode *ip, *dp;
struct vnode *tvp;
struct buf *bp;
struct direct newdir;
struct dirtemplate dirtemplate, *dtp;
int error, dmode, blkoff;
#ifdef DIAGNOSTIC
if ((cnp->cn_flags & HASBUF) == 0)
panic("ufs_mkdir: no name");
#endif
dp = VTOI(dvp);
if ((nlink_t) DIP(dp, nlink) >= LINK_MAX) {
error = EMLINK;
goto out;
}
dmode = vap->va_mode & 0777;
dmode |= IFDIR;
/*
* Must simulate part of ufs_makeinode here to acquire the inode,
* but not have it entered in the parent directory. The entry is
* made later after writing "." and ".." entries.
*/
if ((error = UFS_INODE_ALLOC(dp, dmode, cnp->cn_cred, &tvp)) != 0)
goto out;
ip = VTOI(tvp);
DIP_ASSIGN(ip, uid, cnp->cn_cred->cr_uid);
DIP_ASSIGN(ip, gid, DIP(dp, gid));
if ((error = getinoquota(ip)) ||
(error = ufs_quota_alloc_inode(ip, cnp->cn_cred))) {
pool_put(&namei_pool, cnp->cn_pnbuf);
UFS_INODE_FREE(ip, ip->i_number, dmode);
vput(tvp);
vput(dvp);
return (error);
}
ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
DIP_ASSIGN(ip, mode, dmode);
tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */
ip->i_effnlink = 2;
DIP_ASSIGN(ip, nlink, 2);
if (DOINGSOFTDEP(tvp))
softdep_change_linkcnt(ip, 0);
/*
* Bump link count in parent directory to reflect work done below.
* Should be done before reference is created so cleanup is
* possible if we crash.
*/
dp->i_effnlink++;
DIP_ADD(dp, nlink, 1);
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(dvp))
softdep_change_linkcnt(dp, 0);
if ((error = UFS_UPDATE(dp, !DOINGSOFTDEP(dvp))) != 0)
goto bad;
/*
* Initialize directory with "." and ".." from static template.
*/
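/*
* um_maxsymlinklen > 0 indicates the 4.4BSD directory format whose
* entries carry a d_type field; the old-format template is used
* otherwise so that "." and ".." match the on-disk layout.
*/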
if (dp->i_ump->um_maxsymlinklen > 0)
dtp = &mastertemplate;
else
dtp = (struct dirtemplate *)&omastertemplate;
dirtemplate = *dtp;
dirtemplate.dot_ino = ip->i_number;
dirtemplate.dotdot_ino = dp->i_number;
if ((error = UFS_BUF_ALLOC(ip, (off_t)0, DIRBLKSIZ, cnp->cn_cred,
B_CLRBUF, &bp)) != 0)
goto bad;
DIP_ASSIGN(ip, size, DIRBLKSIZ);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
uvm_vnp_setsize(tvp, DIP(ip, size));
memcpy(bp->b_data, &dirtemplate, sizeof(dirtemplate));
if (DOINGSOFTDEP(tvp)) {
/*
* Ensure that the entire newly allocated block is a
* valid directory so that future growth within the
* block does not have to ensure that the block is
* written before the inode
*/
blkoff = DIRBLKSIZ;
while (blkoff < bp->b_bcount) {
((struct direct *)
(bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
blkoff += DIRBLKSIZ;
}
}
if ((error = UFS_UPDATE(ip, !DOINGSOFTDEP(tvp))) != 0) {
(void)VOP_BWRITE(bp);
goto bad;
}
/*
* Directory set up, now install its entry in the parent directory.
*
* If we are not doing soft dependencies, then we must write out the
* buffer containing the new directory body before entering the new
* name in the parent. If we are doing soft dependencies, then the
* buffer containing the new directory body will be passed to and
* released in the soft dependency code after the code has attached
* an appropriate ordering dependency to the buffer which ensures that
* the buffer is written before the new name is written in the parent.
*/
if (!DOINGSOFTDEP(dvp) && ((error = VOP_BWRITE(bp)) != 0))
goto bad;
ufs_makedirentry(ip, cnp, &newdir);
error = ufs_direnter(dvp, tvp, &newdir, cnp, bp);
bad:
if (error == 0) {
VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
*ap->a_vpp = tvp;
} else {
dp->i_effnlink--;
DIP_ADD(dp, nlink, -1);
dp->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(dvp))
softdep_change_linkcnt(dp, 0);
/*
* No need to do an explicit VOP_TRUNCATE here, vrele will
* do this for us because we set the link count to 0.
*/
ip->i_effnlink = 0;
DIP_ASSIGN(ip, nlink, 0);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tvp))
softdep_change_linkcnt(ip, 0);
vput(tvp);
}
out:
pool_put(&namei_pool, cnp->cn_pnbuf);
vput(dvp);
return (error);
}
/*
* Rmdir system call.
*/
int
ufs_rmdir(void *v)
{
struct vop_rmdir_args *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *dvp = ap->a_dvp;
struct componentname *cnp = ap->a_cnp;
struct inode *ip, *dp;
int error;
ip = VTOI(vp);
dp = VTOI(dvp);
/*
* Do not remove a directory that is in the process of being renamed.
* Verify the directory is empty (and valid). Rmdir ".." will not be
* valid since ".." will contain a reference to the current directory
* and thus be non-empty.
*/
error = 0;
if (ip->i_flag & IN_RENAME) {
error = EINVAL;
goto out;
}
if (ip->i_effnlink != 2 ||
!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
error = ENOTEMPTY;
goto out;
}
if ((DIP(dp, flags) & APPEND) ||
(DIP(ip, flags) & (IMMUTABLE | APPEND))) {
error = EPERM;
goto out;
}
/*
* Delete reference to directory before purging
* inode. If we crash in between, the directory
* will be reattached to lost+found,
*/
dp->i_effnlink--;
ip->i_effnlink--;
if (DOINGSOFTDEP(vp)) {
softdep_change_linkcnt(dp, 0);
softdep_change_linkcnt(ip, 0);
}
if ((error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1)) != 0) {
dp->i_effnlink++;
ip->i_effnlink++;
if (DOINGSOFTDEP(vp)) {
softdep_change_linkcnt(dp, 0);
softdep_change_linkcnt(ip, 0);
}
goto out;
}
VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
cache_purge(dvp);
/*
* Truncate inode. The only stuff left in the directory is "." and
* "..". The "." reference is inconsequential since we are quashing
* it. The soft dependency code will arrange to do these operations
* after the parent directory entry has been deleted on disk, so
* when running with that code we avoid doing them now.
*/
if (!DOINGSOFTDEP(vp)) {
int ioflag;
DIP_ADD(dp, nlink, -1);
dp->i_flag |= IN_CHANGE;
DIP_ADD(ip, nlink, -1);
ip->i_flag |= IN_CHANGE;
ioflag = DOINGASYNC(vp) ? 0 : IO_SYNC;
error = UFS_TRUNCATE(ip, (off_t)0, ioflag, cnp->cn_cred);
}
cache_purge(vp);
#ifdef UFS_DIRHASH
/* Kill any active hash; i_effnlink == 0, so it will not come back. */
if (ip->i_dirhash != NULL)
ufsdirhash_free(ip);
#endif
out:
VN_KNOTE(vp, NOTE_DELETE);
vput(dvp);
vput(vp);
return (error);
}
/*
* symlink -- make a symbolic link
*/
int
ufs_symlink(void *v)
{
struct vop_symlink_args *ap = v;
struct vnode *vp, **vpp = ap->a_vpp;
struct inode *ip;
int len, error;
error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
vpp, ap->a_cnp);
if (error) {
vput(ap->a_dvp);
return (error);
}
VN_KNOTE(ap->a_dvp, NOTE_WRITE);
vput(ap->a_dvp);
vp = *vpp;
ip = VTOI(vp);
len = strlen(ap->a_target);
if (len < ip->i_ump->um_maxsymlinklen) {
memcpy(SHORTLINK(ip), ap->a_target, len);
DIP_ASSIGN(ip, size, len);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
} else
error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, NULL,
curproc);
vput(vp);
return (error);
}
/*
* Vnode op for reading directories.
*
* This routine converts the on-disk struct direct entries to the
* struct dirent entries expected by userland and the rest of the kernel.
*/
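/*
* Roughly: each on-disk entry { d_ino, d_reclen, d_type, d_namlen,
* d_name[] } becomes a struct dirent whose d_reclen is the name length
* plus the header, rounded up to an 8 byte boundary, and whose d_off
* records the offset of the next on-disk entry.
*/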
int
ufs_readdir(void *v)
{
struct vop_readdir_args *ap = v;
struct uio auio, *uio = ap->a_uio;
struct iovec aiov;
union {
struct dirent dn;
char __pad[roundup(sizeof(struct dirent), 8)];
} u;
off_t off = uio->uio_offset;
struct direct *dp;
char *edp;
caddr_t diskbuf;
size_t count, entries;
int bufsize, readcnt, error;
#if (BYTE_ORDER == LITTLE_ENDIAN)
int ofmt = VTOI(ap->a_vp)->i_ump->um_maxsymlinklen == 0;
#endif
if (uio->uio_rw != UIO_READ)
return (EINVAL);
count = uio->uio_resid;
entries = (uio->uio_offset + count) & (DIRBLKSIZ - 1);
/* Make sure we don't return partial entries. */
if (count <= entries)
return (EINVAL);
/*
* Convert and copy back the on-disk struct direct format to
* the user-space struct dirent format, one entry at a time
*/
/* read from disk, stopping on a block boundary, max 64kB */
readcnt = min(count, 64*1024) - entries;
auio = *uio;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = readcnt;
auio.uio_segflg = UIO_SYSSPACE;
aiov.iov_len = readcnt;
bufsize = readcnt;
diskbuf = malloc(bufsize, M_TEMP, M_WAITOK);
aiov.iov_base = diskbuf;
error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
readcnt -= auio.uio_resid;
dp = (struct direct *)diskbuf;
edp = &diskbuf[readcnt];
memset(&u, 0, sizeof(u));
/*
* While
* - we haven't failed to VOP_READ or uiomove()
* - there's space in the read buf for the head of an entry
* - that entry has a valid d_reclen, and
* - there's space for the *entire* entry
* then we're good to process this one.
*/
while (error == 0 &&
(char *)dp + offsetof(struct direct, d_name) < edp &&
dp->d_reclen > offsetof(struct direct, d_name) &&
(char *)dp + dp->d_reclen <= edp) {
u.dn.d_reclen = roundup(dp->d_namlen+1 +
offsetof(struct dirent, d_name), 8);
if (u.dn.d_reclen > uio->uio_resid)
break;
off += dp->d_reclen;
u.dn.d_off = off;
u.dn.d_fileno = dp->d_ino;
#if (BYTE_ORDER == LITTLE_ENDIAN)
if (ofmt) {
u.dn.d_type = dp->d_namlen;
u.dn.d_namlen = dp->d_type;
} else
#endif
{
u.dn.d_type = dp->d_type;
u.dn.d_namlen = dp->d_namlen;
}
memcpy(u.dn.d_name, dp->d_name, u.dn.d_namlen);
memset(u.dn.d_name + u.dn.d_namlen, 0, u.dn.d_reclen
- u.dn.d_namlen - offsetof(struct dirent, d_name));
error = uiomove(&u.dn, u.dn.d_reclen, uio);
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
/*
* If there was room for an entry in what we read but its
* d_reclen is bogus, fail
*/
if ((char *)dp + offsetof(struct direct, d_name) < edp &&
dp->d_reclen <= offsetof(struct direct, d_name))
error = EIO;
free(diskbuf, M_TEMP, bufsize);
uio->uio_offset = off;
*ap->a_eofflag = DIP(VTOI(ap->a_vp), size) <= off;
return (error);
}
/*
* Return target name of a symbolic link
*/
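/*
* Short symbolic links are stored directly in the inode's block
* pointer area (SHORTLINK) and can be copied out without any I/O;
* longer ones live in data blocks and go through VOP_READ().
*/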
int
ufs_readlink(void *v)
{
struct vop_readlink_args *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
u_int64_t isize;
isize = DIP(ip, size);
if (isize < ip->i_ump->um_maxsymlinklen ||
(ip->i_ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) {
return (uiomove((char *)SHORTLINK(ip), isize, ap->a_uio));
}
return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
}
/*
* Lock an inode. If it's already locked, set the WANT bit and sleep.
*/
int
ufs_lock(void *v)
{
struct vop_lock_args *ap = v;
struct vnode *vp = ap->a_vp;
return rrw_enter(&VTOI(vp)->i_lock, ap->a_flags & LK_RWFLAGS);
}
/*
* Unlock an inode. If WANT bit is on, wakeup.
*/
int
ufs_unlock(void *v)
{
struct vop_unlock_args *ap = v;
struct vnode *vp = ap->a_vp;
rrw_exit(&VTOI(vp)->i_lock);
return 0;
}
/*
* Check for a locked inode.
*/
int
ufs_islocked(void *v)
{
struct vop_islocked_args *ap = v;
return rrw_status(&VTOI(ap->a_vp)->i_lock);
}
/*
* Calculate the logical to physical mapping if not done already,
* then call the device strategy routine.
*/
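/*
* VOP_BMAP() below translates the logical block number to a physical
* one; a resulting blkno of -1 means a hole in the file, so the buffer
* is simply zeroed and completed without touching the device.
*/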
int
ufs_strategy(void *v)
{
struct vop_strategy_args *ap = v;
struct buf *bp = ap->a_bp;
struct vnode *vp = bp->b_vp;
struct inode *ip;
int error;
int s;
ip = VTOI(vp);
if (vp->v_type == VBLK || vp->v_type == VCHR)
panic("ufs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) {
error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
NULL);
if (error) {
bp->b_error = error;
bp->b_flags |= B_ERROR;
s = splbio();
biodone(bp);
splx(s);
return (error);
}
if (bp->b_blkno == -1)
clrbuf(bp);
}
if (bp->b_blkno == -1) {
s = splbio();
biodone(bp);
splx(s);
return (0);
}
vp = ip->i_devvp;
bp->b_dev = vp->v_rdev;
VOP_STRATEGY(vp, bp);
return (0);
}
/*
* Print out the contents of an inode.
*/
int
ufs_print(void *v)
{
#ifdef DIAGNOSTIC
struct vop_print_args *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
printf("tag VT_UFS, ino %u, on dev %d, %d", ip->i_number,
major(ip->i_dev), minor(ip->i_dev));
printf(" flags 0x%x, effnlink %d, nlink %d\n",
ip->i_flag, ip->i_effnlink, DIP(ip, nlink));
printf("\tmode 0%o, owner %d, group %d, size %lld",
DIP(ip, mode), DIP(ip, uid), DIP(ip, gid), DIP(ip, size));
#ifdef FIFO
if (vp->v_type == VFIFO)
fifo_printinfo(vp);
#endif /* FIFO */
printf("\n");
#endif /* DIAGNOSTIC */
return (0);
}
/*
* Read wrapper for special devices.
*/
int
ufsspec_read(void *v)
{
struct vop_read_args *ap = v;
/*
* Set access flag.
*/
VTOI(ap->a_vp)->i_flag |= IN_ACCESS;
return (spec_read(ap));
}
/*
* Write wrapper for special devices.
*/
int
ufsspec_write(void *v)
{
struct vop_write_args *ap = v;
/*
* Set update and change flags.
*/
VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE;
return (spec_write(ap));
}
/*
* Close wrapper for special devices.
*
* Update the times on the inode then do device close.
*/
int
ufsspec_close(void *v)
{
struct vop_close_args *ap = v;
struct vnode *vp = ap->a_vp;
if (vp->v_usecount > 1)
ufs_itimes(vp);
return (spec_close(ap));
}
#ifdef FIFO
/*
* Read wrapper for fifo's
*/
int
ufsfifo_read(void *v)
{
struct vop_read_args *ap = v;
/*
* Set access flag.
*/
VTOI(ap->a_vp)->i_flag |= IN_ACCESS;
return (fifo_read(ap));
}
/*
* Write wrapper for fifo's.
*/
int
ufsfifo_write(void *v)
{
struct vop_write_args *ap = v;
/*
* Set update and change flags.
*/
VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE;
return (fifo_write(ap));
}
/*
* Close wrapper for fifo's.
*
* Update the times on the inode then do device close.
*/
int
ufsfifo_close(void *v)
{
struct vop_close_args *ap = v;
struct vnode *vp = ap->a_vp;
if (vp->v_usecount > 1)
ufs_itimes(vp);
return (fifo_close(ap));
}
#endif /* FIFO */
/*
* Return POSIX pathconf information applicable to ufs filesystems.
*/
int
ufs_pathconf(void *v)
{
struct vop_pathconf_args *ap = v;
int error = 0;
switch (ap->a_name) {
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
break;
case _PC_NAME_MAX:
*ap->a_retval = NAME_MAX;
break;
case _PC_CHOWN_RESTRICTED:
*ap->a_retval = 1;
break;
case _PC_NO_TRUNC:
*ap->a_retval = 1;
break;
case _PC_ALLOC_SIZE_MIN:
*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
break;
case _PC_FILESIZEBITS:
*ap->a_retval = 64;
break;
case _PC_REC_INCR_XFER_SIZE:
*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
break;
case _PC_REC_MAX_XFER_SIZE:
*ap->a_retval = -1; /* means ``unlimited'' */
break;
case _PC_REC_MIN_XFER_SIZE:
*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
break;
case _PC_REC_XFER_ALIGN:
*ap->a_retval = PAGE_SIZE;
break;
case _PC_SYMLINK_MAX:
*ap->a_retval = MAXPATHLEN;
break;
case _PC_2_SYMLINKS:
*ap->a_retval = 1;
break;
case _PC_TIMESTAMP_RESOLUTION:
*ap->a_retval = 1;
break;
default:
error = EINVAL;
break;
}
return (error);
}
/*
* Advisory record locking support
*/
int
ufs_advlock(void *v)
{
struct vop_advlock_args *ap = v;
struct inode *ip = VTOI(ap->a_vp);
return (lf_advlock(&ip->i_lockf, DIP(ip, size), ap->a_id, ap->a_op,
ap->a_fl, ap->a_flags));
}
/*
* Allocate a new inode.
*/
int
ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
struct componentname *cnp)
{
struct inode *ip, *pdir;
struct direct newdir;
struct vnode *tvp;
int error;
pdir = VTOI(dvp);
#ifdef DIAGNOSTIC
if ((cnp->cn_flags & HASBUF) == 0)
panic("ufs_makeinode: no name");
#endif
*vpp = NULL;
if ((mode & IFMT) == 0)
mode |= IFREG;
if ((error = UFS_INODE_ALLOC(pdir, mode, cnp->cn_cred, &tvp)) != 0) {
pool_put(&namei_pool, cnp->cn_pnbuf);
return (error);
}
ip = VTOI(tvp);
DIP_ASSIGN(ip, gid, DIP(pdir, gid));
DIP_ASSIGN(ip, uid, cnp->cn_cred->cr_uid);
if ((error = getinoquota(ip)) ||
(error = ufs_quota_alloc_inode(ip, cnp->cn_cred))) {
pool_put(&namei_pool, cnp->cn_pnbuf);
UFS_INODE_FREE(ip, ip->i_number, mode);
vput(tvp);
return (error);
}
ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
DIP_ASSIGN(ip, mode, mode);
tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */
ip->i_effnlink = 1;
DIP_ASSIGN(ip, nlink, 1);
if (DOINGSOFTDEP(tvp))
softdep_change_linkcnt(ip, 0);
if ((DIP(ip, mode) & ISGID) && !groupmember(DIP(ip, gid), cnp->cn_cred) &&
!vnoperm(dvp) &&
suser_ucred(cnp->cn_cred))
DIP_AND(ip, mode, ~ISGID);
/*
* Make sure inode goes to disk before directory entry.
*/
if ((error = UFS_UPDATE(ip, !DOINGSOFTDEP(tvp))) != 0)
goto bad;
ufs_makedirentry(ip, cnp, &newdir);
if ((error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL)) != 0)
goto bad;
if ((cnp->cn_flags & SAVESTART) == 0)
pool_put(&namei_pool, cnp->cn_pnbuf);
*vpp = tvp;
return (0);
bad:
/*
* Write error occurred trying to update the inode
* or the directory so must deallocate the inode.
*/
pool_put(&namei_pool, cnp->cn_pnbuf);
ip->i_effnlink = 0;
DIP_ASSIGN(ip, nlink, 0);
ip->i_flag |= IN_CHANGE;
if (DOINGSOFTDEP(tvp))
softdep_change_linkcnt(ip, 0);
tvp->v_type = VNON;
vput(tvp);
return (error);
}
const struct filterops ufsread_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
.f_detach = filt_ufsdetach,
.f_event = filt_ufsread,
};
const struct filterops ufswrite_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
.f_detach = filt_ufsdetach,
.f_event = filt_ufswrite,
};
const struct filterops ufsvnode_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
.f_detach = filt_ufsdetach,
.f_event = filt_ufsvnode,
};
int
ufs_kqfilter(void *v)
{
struct vop_kqfilter_args *ap = v;
struct vnode *vp = ap->a_vp;
struct knote *kn = ap->a_kn;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &ufsread_filtops;
break;
case EVFILT_WRITE:
kn->kn_fop = &ufswrite_filtops;
break;
case EVFILT_VNODE:
kn->kn_fop = &ufsvnode_filtops;
break;
default:
return (EINVAL);
}
kn->kn_hook = (caddr_t)vp;
klist_insert_locked(&vp->v_selectinfo.si_note, kn);
return (0);
}
void
filt_ufsdetach(struct knote *kn)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
klist_remove_locked(&vp->v_selectinfo.si_note, kn);
}
int
filt_ufsread(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
struct inode *ip = VTOI(vp);
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE) {
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
return (1);
}
#ifdef EXT2FS
if (IS_EXT2_VNODE(ip->i_vnode))
kn->kn_data = ext2fs_size(ip) - foffset(kn->kn_fp);
else
#endif
kn->kn_data = DIP(ip, size) - foffset(kn->kn_fp);
if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
kn->kn_fflags |= NOTE_EOF;
return (1);
}
if (kn->kn_flags & (__EV_POLL | __EV_SELECT))
return (1);
return (kn->kn_data != 0);
}
int
filt_ufswrite(struct knote *kn, long hint)
{
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE) {
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
return (1);
}
kn->kn_data = 0;
return (1);
}
int
filt_ufsvnode(struct knote *kn, long hint)
{
if (kn->kn_sfflags & hint)
kn->kn_fflags |= hint;
if (hint == NOTE_REVOKE) {
kn->kn_flags |= EV_EOF;
return (1);
}
return (kn->kn_fflags != 0);
}
/* $OpenBSD: bpf_filter.c,v 1.34 2020/08/03 03:21:24 dlg Exp $ */
/* $NetBSD: bpf_filter.c,v 1.12 1996/02/13 22:00:00 christos Exp $ */
/*
* Copyright (c) 1990, 1991, 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from the Stanford/CMU enet packet filter,
* (net/enet.c) distributed as part of 4.3BSD, and code contributed
* to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
* Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/time.h>
#ifndef _KERNEL
#include <stdlib.h>
#include <string.h>
#include "pcap.h"
#else
#include <sys/systm.h>
#endif
#include <sys/endian.h>
#ifdef _KERNEL
extern int bpf_maxbufsize;
#define Static
#else /* _KERNEL */
#define Static static
#endif /* _KERNEL */
#include <net/bpf.h>
struct bpf_mem {
const u_char *pkt;
u_int len;
};
Static u_int32_t bpf_mem_ldw(const void *, u_int32_t, int *);
Static u_int32_t bpf_mem_ldh(const void *, u_int32_t, int *);
Static u_int32_t bpf_mem_ldb(const void *, u_int32_t, int *);
static const struct bpf_ops bpf_mem_ops = {
bpf_mem_ldw,
bpf_mem_ldh,
bpf_mem_ldb,
};
Static u_int32_t
bpf_mem_ldw(const void *mem, u_int32_t k, int *err)
{
const struct bpf_mem *bm = mem;
u_int32_t v;
*err = 1;
if (k + sizeof(v) > bm->len)
return (0);
memcpy(&v, bm->pkt + k, sizeof(v));
*err = 0;
return ntohl(v);
}
Static u_int32_t
bpf_mem_ldh(const void *mem, u_int32_t k, int *err)
{
const struct bpf_mem *bm = mem;
u_int16_t v;
*err = 1;
if (k + sizeof(v) > bm->len)
return (0);
memcpy(&v, bm->pkt + k, sizeof(v));
*err = 0;
return ntohs(v);
}
Static u_int32_t
bpf_mem_ldb(const void *mem, u_int32_t k, int *err)
{
const struct bpf_mem *bm = mem;
*err = 1;
if (k >= bm->len)
return (0);
*err = 0;
return bm->pkt[k];
}
/*
* Execute the filter program starting at pc on the packet p
* wirelen is the length of the original packet
* buflen is the amount of data present
*/
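/*
* Usage sketch (caller-supplied program, for illustration only):
*
*	struct bpf_insn prog[] = {
*		BPF_STMT(BPF_RET|BPF_K, (u_int)-1),	accept everything
*	};
*	u_int n = bpf_filter(prog, pkt, wirelen, buflen);
*
* The return value is the number of bytes to capture, 0 to reject.
*/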
u_int
bpf_filter(const struct bpf_insn *pc, const u_char *pkt,
u_int wirelen, u_int buflen)
{
struct bpf_mem bm;
bm.pkt = pkt;
bm.len = buflen;
return _bpf_filter(pc, &bpf_mem_ops, &bm, wirelen);
}
u_int
_bpf_filter(const struct bpf_insn *pc, const struct bpf_ops *ops,
const void *pkt, u_int wirelen)
{
u_int32_t A = 0, X = 0;
u_int32_t k;
int32_t mem[BPF_MEMWORDS];
int err;
if (pc == NULL) {
/*
* No filter means accept all.
*/
return (u_int)-1;
}
memset(mem, 0, sizeof(mem));
--pc;
while (1) {
++pc;
switch (pc->code) {
default:
#ifdef _KERNEL
return 0;
#else
abort();
#endif
case BPF_RET|BPF_K:
return (u_int)pc->k;
case BPF_RET|BPF_A:
return (u_int)A;
case BPF_LD|BPF_W|BPF_ABS:
A = ops->ldw(pkt, pc->k, &err);
if (err != 0)
return 0;
continue;
case BPF_LD|BPF_H|BPF_ABS:
A = ops->ldh(pkt, pc->k, &err);
if (err != 0)
return 0;
continue;
case BPF_LD|BPF_B|BPF_ABS:
A = ops->ldb(pkt, pc->k, &err);
if (err != 0)
return 0;
continue;
case BPF_LD|BPF_W|BPF_LEN:
A = wirelen;
continue;
case BPF_LDX|BPF_W|BPF_LEN:
X = wirelen;
continue;
case BPF_LD|BPF_W|BPF_RND:
A = arc4random();
continue;
case BPF_LD|BPF_W|BPF_IND:
k = X + pc->k;
A = ops->ldw(pkt, k, &err);
if (err != 0)
return 0;
continue;
case BPF_LD|BPF_H|BPF_IND:
k = X + pc->k;
A = ops->ldh(pkt, k, &err);
if (err != 0)
return 0;
continue;
case BPF_LD|BPF_B|BPF_IND:
k = X + pc->k;
A = ops->ldb(pkt, k, &err);
if (err != 0)
return 0;
continue;
case BPF_LDX|BPF_MSH|BPF_B:
X = ops->ldb(pkt, pc->k, &err);
if (err != 0)
return 0;
X &= 0xf;
X <<= 2;
continue;
case BPF_LD|BPF_IMM:
A = pc->k;
continue;
case BPF_LDX|BPF_IMM:
X = pc->k;
continue;
case BPF_LD|BPF_MEM:
A = mem[pc->k];
continue;
case BPF_LDX|BPF_MEM:
X = mem[pc->k];
continue;
case BPF_ST:
mem[pc->k] = A;
continue;
case BPF_STX:
mem[pc->k] = X;
continue;
case BPF_JMP|BPF_JA:
pc += pc->k;
continue;
case BPF_JMP|BPF_JGT|BPF_K:
pc += (A > pc->k) ? pc->jt : pc->jf;
continue;
case BPF_JMP|BPF_JGE|BPF_K:
pc += (A >= pc->k) ? pc->jt : pc->jf;
continue;
case BPF_JMP|BPF_JEQ|BPF_K:
pc += (A == pc->k) ? pc->jt : pc->jf;
continue;
case BPF_JMP|BPF_JSET|BPF_K:
pc += (A & pc->k) ? pc->jt : pc->jf;
continue;
case BPF_JMP|BPF_JGT|BPF_X:
pc += (A > X) ? pc->jt : pc->jf;
continue;
case BPF_JMP|BPF_JGE|BPF_X:
pc += (A >= X) ? pc->jt : pc->jf;
continue;
case BPF_JMP|BPF_JEQ|BPF_X:
pc += (A == X) ? pc->jt : pc->jf;
continue;
case BPF_JMP|BPF_JSET|BPF_X:
pc += (A & X) ? pc->jt : pc->jf;
continue;
case BPF_ALU|BPF_ADD|BPF_X:
A += X;
continue;
case BPF_ALU|BPF_SUB|BPF_X:
A -= X;
continue;
case BPF_ALU|BPF_MUL|BPF_X:
A *= X;
continue;
case BPF_ALU|BPF_DIV|BPF_X:
if (X == 0)
return 0;
A /= X;
continue;
case BPF_ALU|BPF_AND|BPF_X:
A &= X;
continue;
case BPF_ALU|BPF_OR|BPF_X:
A |= X;
continue;
case BPF_ALU|BPF_LSH|BPF_X:
A <<= X;
continue;
case BPF_ALU|BPF_RSH|BPF_X:
A >>= X;
continue;
case BPF_ALU|BPF_ADD|BPF_K:
A += pc->k;
continue;
case BPF_ALU|BPF_SUB|BPF_K:
A -= pc->k;
continue;
case BPF_ALU|BPF_MUL|BPF_K:
A *= pc->k;
continue;
case BPF_ALU|BPF_DIV|BPF_K:
A /= pc->k;
continue;
case BPF_ALU|BPF_AND|BPF_K:
A &= pc->k;
continue;
case BPF_ALU|BPF_OR|BPF_K:
A |= pc->k;
continue;
case BPF_ALU|BPF_LSH|BPF_K:
A <<= pc->k;
continue;
case BPF_ALU|BPF_RSH|BPF_K:
A >>= pc->k;
continue;
case BPF_ALU|BPF_NEG:
A = -A;
continue;
case BPF_MISC|BPF_TAX:
X = A;
continue;
case BPF_MISC|BPF_TXA:
A = X;
continue;
}
}
}
#ifdef _KERNEL
/*
* Return true if the 'fcode' is a valid filter program.
* The constraints are that each jump be forward and to a valid
* code and memory operations use valid addresses. The code
* must terminate with either an accept or reject.
*
* The kernel needs to be able to verify an application's filter code.
* Otherwise, a bogus program could easily crash the system.
*/
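/*
* For example, a program whose BPF_JA offset would jump past the end of
* the instruction array, or a BPF_LD|BPF_MEM with k >= BPF_MEMWORDS, is
* rejected here before it can ever be handed to bpf_filter().
*/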
int
bpf_validate(struct bpf_insn *f, int len)
{
u_int i, from;
struct bpf_insn *p;
if (len < 1 || len > BPF_MAXINSNS)
return 0;
for (i = 0; i < len; ++i) {
p = &f[i];
switch (BPF_CLASS(p->code)) {
/*
* Check that memory operations use valid addresses.
*/
case BPF_LD:
case BPF_LDX:
switch (BPF_MODE(p->code)) {
case BPF_IMM:
break;
case BPF_ABS:
case BPF_IND:
case BPF_MSH:
/*
* More strict check with actual packet length
* is done at run time.
*/
if (p->k >= bpf_maxbufsize)
return 0;
break;
case BPF_MEM:
if (p->k >= BPF_MEMWORDS)
return 0;
break;
case BPF_LEN:
case BPF_RND:
break;
default:
return 0;
}
break;
case BPF_ST:
case BPF_STX:
if (p->k >= BPF_MEMWORDS)
return 0;
break;
case BPF_ALU:
switch (BPF_OP(p->code)) {
case BPF_ADD:
case BPF_SUB:
case BPF_MUL:
case BPF_OR:
case BPF_AND:
case BPF_LSH:
case BPF_RSH:
case BPF_NEG:
break;
case BPF_DIV:
/*
* Check for constant division by 0.
*/
if (BPF_SRC(p->code) == BPF_K && p->k == 0)
return 0;
break;
default:
return 0;
}
break;
case BPF_JMP:
/*
* Check that jumps are forward, and within
* the code block.
*/
from = i + 1;
switch (BPF_OP(p->code)) {
case BPF_JA:
if (from + p->k < from || from + p->k >= len)
return 0;
break;
case BPF_JEQ:
case BPF_JGT:
case BPF_JGE:
case BPF_JSET:
if (from + p->jt >= len || from + p->jf >= len)
return 0;
break;
default:
return 0;
}
break;
case BPF_RET:
break;
case BPF_MISC:
break;
default:
return 0;
}
}
return BPF_CLASS(f[len - 1].code) == BPF_RET;
}
#endif
/* $OpenBSD: cons.c,v 1.30 2022/07/02 08:50:41 visa Exp $ */
/* $NetBSD: cons.c,v 1.30 1996/04/08 19:57:30 jonathan Exp $ */
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: cons.c 1.7 92/01/21$
*
* @(#)cons.c 8.2 (Berkeley) 1/12/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <dev/cons.h>
struct tty *constty = NULL; /* virtual console output device */
struct vnode *cn_devvp = NULLVP; /* vnode for underlying device. */
int
cnopen(dev_t dev, int flag, int mode, struct proc *p)
{
dev_t cndev;
if (cn_tab == NULL)
return (0);
/*
* always open the 'real' console device, so we don't get nailed
* later. This follows normal device semantics; they always get
* open() calls.
*/
cndev = cn_tab->cn_dev;
if (cndev == NODEV)
return (ENXIO);
#ifdef DIAGNOSTIC
if (cndev == dev)
panic("cnopen: recursive");
#endif
if (cn_devvp == NULLVP) {
/* try to get a reference on its vnode, but fail silently */
cdevvp(cndev, &cn_devvp);
}
return ((*cdevsw[major(cndev)].d_open)(cndev, flag, mode, p));
}
int
cnclose(dev_t dev, int flag, int mode, struct proc *p)
{
struct vnode *vp;
if (cn_tab == NULL)
return (0);
/*
* If the real console isn't otherwise open, close it.
* If it's otherwise open, don't close it, because that'll
* screw up others who have it open.
*/
dev = cn_tab->cn_dev;
if (cn_devvp != NULLVP) {
/* release our reference to real dev's vnode */
vrele(cn_devvp);
cn_devvp = NULLVP;
}
if (vfinddev(dev, VCHR, &vp) && vcount(vp))
return (0);
return ((*cdevsw[major(dev)].d_close)(dev, flag, mode, p));
}
int
cnread(dev_t dev, struct uio *uio, int flag)
{
/*
* If we would redirect input, punt. This will keep strange
* things from happening to people who are using the real
* console. Nothing should be using /dev/console for
* input (except a shell in single-user mode, but then,
* one wouldn't TIOCCONS then).
*/
if (constty != NULL)
return 0;
else if (cn_tab == NULL)
return ENXIO;
dev = cn_tab->cn_dev;
return ((*cdevsw[major(dev)].d_read)(dev, uio, flag));
}
int
cnwrite(dev_t dev, struct uio *uio, int flag)
{
/*
* Redirect output, if that's appropriate.
* If there's no real console, return ENXIO.
*/
if (constty != NULL)
dev = constty->t_dev;
else if (cn_tab == NULL)
return ENXIO;
else
dev = cn_tab->cn_dev;
return ((*cdevsw[major(dev)].d_write)(dev, uio, flag));
}
int
cnstop(struct tty *tp, int flag)
{
return (0);
}
int
cnioctl(dev_t dev, u_long cmd, caddr_t data, int flag,
struct proc *p)
{
int error;
/*
* Superuser can always use this to wrest control of console
* output from the "virtual" console.
*/
if (cmd == TIOCCONS && constty != NULL) {
error = suser(p);
if (error)
return (error);
constty = NULL;
return (0);
}
/*
* Redirect the ioctl, if that's appropriate.
* Note that strange things can happen, if a program does
* ioctls on /dev/console, then the console is redirected
* out from under it.
*/
if (constty != NULL)
dev = constty->t_dev;
else if (cn_tab == NULL)
return ENXIO;
else
dev = cn_tab->cn_dev;
return ((*cdevsw[major(dev)].d_ioctl)(dev, cmd, data, flag, p));
}
int
cnkqfilter(dev_t dev, struct knote *kn)
{
/*
* Redirect output, if that's appropriate.
* If there's no real console, return 1.
*/
if (constty != NULL)
dev = constty->t_dev;
else if (cn_tab == NULL)
return (ENXIO);
else
dev = cn_tab->cn_dev;
if (cdevsw[major(dev)].d_kqfilter)
return ((*cdevsw[major(dev)].d_kqfilter)(dev, kn));
return (EOPNOTSUPP);
}
int
cngetc(void)
{
if (cn_tab == NULL)
return (0);
return ((*cn_tab->cn_getc)(cn_tab->cn_dev));
}
void
cnputc(int c)
{
if (cn_tab == NULL)
return;
if (c) {
(*cn_tab->cn_putc)(cn_tab->cn_dev, c);
if (c == '\n')
(*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
}
}
void
cnpollc(int on)
{
static int refcount = 0;
if (cn_tab == NULL)
return;
if (!on)
--refcount;
if (refcount == 0)
(*cn_tab->cn_pollc)(cn_tab->cn_dev, on);
if (on)
++refcount;
}
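/*
* Polling mode is reference counted so that nested users (such as a
* debugger entered while the console is already being polled) switch
* the underlying driver in and out of polled mode only once, at the
* outermost on/off pair.
*/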
void
nullcnpollc(dev_t dev, int on)
{
}
void
cnbell(u_int pitch, u_int period, u_int volume)
{
if (cn_tab == NULL || cn_tab->cn_bell == NULL)
return;
(*cn_tab->cn_bell)(cn_tab->cn_dev, pitch, period, volume);
}
/* $OpenBSD: vfs_init.c,v 1.43 2019/12/26 13:30:54 bluhm Exp $ */
/* $NetBSD: vfs_init.c,v 1.6 1996/02/09 19:00:58 christos Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed
* to Berkeley by John Heidemann of the UCLA Ficus project.
*
* Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_init.c 8.3 (Berkeley) 1/4/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/pool.h>
struct pool namei_pool;
/* This defines the root filesystem. */
struct vnode *rootvnode;
/* Set up the filesystem operations for vnodes. */
static struct vfsconf vfsconflist[] = {
#ifdef FFS
{ &ffs_vfsops, MOUNT_FFS, 1, 0, MNT_LOCAL | MNT_SWAPPABLE,
sizeof(struct ufs_args) },
#endif
#ifdef MFS
{ &mfs_vfsops, MOUNT_MFS, 3, 0, MNT_LOCAL,
sizeof(struct mfs_args) },
#endif
#ifdef EXT2FS
{ &ext2fs_vfsops, MOUNT_EXT2FS, 17, 0, MNT_LOCAL | MNT_SWAPPABLE,
sizeof(struct ufs_args) },
#endif
#ifdef CD9660
{ &cd9660_vfsops, MOUNT_CD9660, 14, 0, MNT_LOCAL,
sizeof(struct iso_args) },
#endif
#ifdef MSDOSFS
{ &msdosfs_vfsops, MOUNT_MSDOS, 4, 0, MNT_LOCAL | MNT_SWAPPABLE,
sizeof(struct msdosfs_args) },
#endif
#ifdef NFSCLIENT
{ &nfs_vfsops, MOUNT_NFS, 2, 0, MNT_SWAPPABLE,
sizeof(struct nfs_args) },
#endif
#ifdef NTFS
{ &ntfs_vfsops, MOUNT_NTFS, 6, 0, MNT_LOCAL,
sizeof(struct ntfs_args) },
#endif
#ifdef UDF
{ &udf_vfsops, MOUNT_UDF, 13, 0, MNT_LOCAL,
sizeof(struct iso_args) },
#endif
#ifdef FUSE
{ &fusefs_vfsops, MOUNT_FUSEFS, 18, 0, MNT_LOCAL,
sizeof(struct fusefs_args) },
#endif
#ifdef TMPFS
{ &tmpfs_vfsops, MOUNT_TMPFS, 19, 0, MNT_LOCAL,
sizeof(struct tmpfs_args) },
#endif
};
/*
* Initially the size of the list, vfsinit will set maxvfsconf
* to the highest defined type number.
*/
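/*
* For example, a kernel configured with only FFS and NFS ends up with
* maxvfsconf == 2 after vfsinit(), since those entries use type
* numbers 1 and 2 above.
*/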
int maxvfsconf = sizeof(vfsconflist) / sizeof(struct vfsconf);
/* Initialize the vnode structures and initialize each file system type. */
void
vfsinit(void)
{
struct vfsconf *vfsp;
int i;
pool_init(&namei_pool, MAXPATHLEN, 0, IPL_NONE, PR_WAITOK, "namei",
NULL);
/* Initialize the vnode table. */
vntblinit();
/* Initialize the vnode name cache. */
nchinit();
maxvfsconf = 0;
for (i = 0; i < nitems(vfsconflist); i++) {
vfsp = &vfsconflist[i];
if (vfsp->vfc_typenum > maxvfsconf)
maxvfsconf = vfsp->vfc_typenum;
if (vfsp->vfc_vfsops->vfs_init != NULL)
(*vfsp->vfc_vfsops->vfs_init)(vfsp);
}
}
struct vfsconf *
vfs_byname(const char *name)
{
int i;
for (i = 0; i < nitems(vfsconflist); i++) {
if (strcmp(vfsconflist[i].vfc_name, name) == 0)
return &vfsconflist[i];
}
return NULL;
}
struct vfsconf *
vfs_bytypenum(int typenum)
{
int i;
for (i = 0; i < nitems(vfsconflist); i++) {
if (vfsconflist[i].vfc_typenum == typenum)
return &vfsconflist[i];
}
return NULL;
}
/* $OpenBSD: sysv_msg.c,v 1.39 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: sysv_msg.c,v 1.19 1996/02/09 19:00:18 christos Exp $ */
/*
* Copyright (c) 2009 Bret S. Lambert <blambert@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Implementation of SVID messages
*
* Author: Daniel Boulet
*
* Copyright 1993 Daniel Boulet and RTMX Inc.
*
* This system call was implemented by Daniel Boulet under contract from RTMX.
*
* Redistribution and use in source forms, with and without modification,
* are permitted provided that this entire comment appears intact.
*
* Redistribution in binary form may occur without any restrictions.
* Obviously, it would be nice if you gave credit where credit is due
* but requiring it would be too onerous.
*
* This software is provided ``AS IS'' without any warranties of any kind.
*/
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/msg.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
struct que *que_create(key_t, struct ucred *, int);
struct que *que_lookup(int);
struct que *que_key_lookup(key_t);
void que_wakewriters(void);
void que_free(struct que *);
struct msg *msg_create(struct que *);
void msg_free(struct msg *);
void msg_enqueue(struct que *, struct msg *, struct proc *);
void msg_dequeue(struct que *, struct msg *, struct proc *);
struct msg *msg_lookup(struct que *, int);
int msg_copyin(struct msg *, const char *, size_t, struct proc *);
int msg_copyout(struct msg *, char *, size_t *, struct proc *);
struct pool sysvmsgpl;
struct msginfo msginfo;
TAILQ_HEAD(, que) msg_queues;
int num_ques;
int num_msgs;
int sequence;
int maxmsgs;
void
msginit(void)
{
msginfo.msgmax = MSGMAX;
msginfo.msgmni = MSGMNI;
msginfo.msgmnb = MSGMNB;
msginfo.msgtql = MSGTQL;
msginfo.msgssz = MSGSSZ;
msginfo.msgseg = MSGSEG;
pool_init(&sysvmsgpl, sizeof(struct msg), 0, IPL_NONE, PR_WAITOK,
"sysvmsgpl", NULL);
TAILQ_INIT(&msg_queues);
num_ques = 0;
num_msgs = 0;
sequence = 1;
maxmsgs = 0;
}
int
sys_msgctl(struct proc *p, void *v, register_t *retval)
{
struct sys_msgctl_args /* {
syscallarg(int) msqid;
syscallarg(int) cmd;
syscallarg(struct msqid_ds *) buf;
} */ *uap = v;
return (msgctl1(p, SCARG(uap, msqid), SCARG(uap, cmd),
(caddr_t)SCARG(uap, buf), copyin, copyout));
}
int
msgctl1(struct proc *p, int msqid, int cmd, caddr_t buf,
int (*ds_copyin)(const void *, void *, size_t),
int (*ds_copyout)(const void *, void *, size_t))
{
struct msqid_ds tmp;
struct ucred *cred = p->p_ucred;
struct que *que;
int error = 0;
if ((que = que_lookup(msqid)) == NULL)
return (EINVAL);
QREF(que);
switch (cmd) {
case IPC_RMID:
if ((error = ipcperm(cred, &que->msqid_ds.msg_perm, IPC_M)))
goto out;
TAILQ_REMOVE(&msg_queues, que, que_next);
que->que_flags |= MSGQ_DYING;
/* lose interest in the queue and wait for others to too */
if (--que->que_references > 0) {
wakeup(que);
tsleep_nsec(&que->que_references, PZERO, "msgqrm",
INFSLP);
}
que_free(que);
return (0);
case IPC_SET:
if ((error = ipcperm(cred, &que->msqid_ds.msg_perm, IPC_M)))
goto out;
if ((error = ds_copyin(buf, &tmp, sizeof(struct msqid_ds))))
goto out;
/* only superuser can bump max bytes in queue */
if (tmp.msg_qbytes > que->msqid_ds.msg_qbytes &&
cred->cr_uid != 0) {
error = EPERM;
goto out;
}
/* restrict max bytes in queue to system limit */
if (tmp.msg_qbytes > msginfo.msgmnb)
tmp.msg_qbytes = msginfo.msgmnb;
/* can't reduce msg_bytes to 0 */
if (tmp.msg_qbytes == 0) {
error = EINVAL; /* non-standard errno! */
goto out;
}
que->msqid_ds.msg_perm.uid = tmp.msg_perm.uid;
que->msqid_ds.msg_perm.gid = tmp.msg_perm.gid;
que->msqid_ds.msg_perm.mode =
(que->msqid_ds.msg_perm.mode & ~0777) |
(tmp.msg_perm.mode & 0777);
que->msqid_ds.msg_qbytes = tmp.msg_qbytes;
que->msqid_ds.msg_ctime = gettime();
break;
case IPC_STAT:
if ((error = ipcperm(cred, &que->msqid_ds.msg_perm, IPC_R)))
goto out;
error = ds_copyout(&que->msqid_ds, buf,
sizeof(struct msqid_ds));
break;
default:
error = EINVAL;
break;
}
out:
QRELE(que);
return (error);
}
int
sys_msgget(struct proc *p, void *v, register_t *retval)
{
struct sys_msgget_args /* {
syscallarg(key_t) key;
syscallarg(int) msgflg;
} */ *uap = v;
struct ucred *cred = p->p_ucred;
struct que *que;
key_t key = SCARG(uap, key);
int msgflg = SCARG(uap, msgflg);
int error = 0;
again:
if (key != IPC_PRIVATE) {
que = que_key_lookup(key);
if (que) {
if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL))
return (EEXIST);
if ((error = ipcperm(cred, &que->msqid_ds.msg_perm,
msgflg & 0700)))
return (error);
goto found;
}
}
/* don't create a new message queue if the caller doesn't want to */
if (key != IPC_PRIVATE && !(msgflg & IPC_CREAT))
return (ENOENT);
/* enforce limits on the maximum number of message queues */
if (num_ques >= msginfo.msgmni)
return (ENOSPC);
/*
* if que_create returns NULL, it means that a que with an identical
* key was created while this process was sleeping, so start over
*/
if ((que = que_create(key, cred, msgflg & 0777)) == NULL)
goto again;
found:
*retval = IXSEQ_TO_IPCID(que->que_ix, que->msqid_ds.msg_perm);
return (error);
}
#define MSGQ_SPACE(q) ((q)->msqid_ds.msg_qbytes - (q)->msqid_ds.msg_cbytes)
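/*
* MSGQ_SPACE() is the number of bytes still available in a queue: the
* msg_qbytes limit minus the bytes currently queued, e.g. a freshly
* created queue with the default msginfo.msgmnb limit has the whole
* limit free.
*/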
int
sys_msgsnd(struct proc *p, void *v, register_t *retval)
{
struct sys_msgsnd_args /* {
syscallarg(int) msqid;
syscallarg(const void *) msgp;
syscallarg(size_t) msgsz;
syscallarg(int) msgflg;
} */ *uap = v;
struct ucred *cred = p->p_ucred;
struct que *que;
struct msg *msg;
size_t msgsz = SCARG(uap, msgsz);
int error;
if ((que = que_lookup(SCARG(uap, msqid))) == NULL)
return (EINVAL);
if (msgsz > que->msqid_ds.msg_qbytes || msgsz > msginfo.msgmax)
return (EINVAL);
if ((error = ipcperm(cred, &que->msqid_ds.msg_perm, IPC_W)))
return (error);
QREF(que);
while (MSGQ_SPACE(que) < msgsz || num_msgs >= msginfo.msgtql) {
if (SCARG(uap, msgflg) & IPC_NOWAIT) {
error = EAGAIN;
goto out;
}
/* notify world that process may wedge here */
if (num_msgs >= msginfo.msgtql)
maxmsgs = 1;
que->que_flags |= MSGQ_WRITERS;
if ((error = tsleep_nsec(que, PZERO|PCATCH, "msgwait", INFSLP)))
goto out;
if (que->que_flags & MSGQ_DYING) {
error = EIDRM;
goto out;
}
}
/* if msg_create returns NULL, the queue is being removed */
if ((msg = msg_create(que)) == NULL) {
error = EIDRM;
goto out;
}
/* msg_copyin frees msg on error */
if ((error = msg_copyin(msg, (const char *)SCARG(uap, msgp), msgsz, p)))
goto out;
msg_enqueue(que, msg, p);
if (que->que_flags & MSGQ_READERS) {
que->que_flags &= ~MSGQ_READERS;
wakeup(que);
}
if (que->que_flags & MSGQ_DYING) {
error = EIDRM;
wakeup(que);
}
out:
QRELE(que);
return (error);
}
int
sys_msgrcv(struct proc *p, void *v, register_t *retval)
{
struct sys_msgrcv_args /* {
syscallarg(int) msqid;
syscallarg(void *) msgp;
syscallarg(size_t) msgsz;
syscallarg(long) msgtyp;
syscallarg(int) msgflg;
} */ *uap = v;
struct ucred *cred = p->p_ucred;
char *msgp = SCARG(uap, msgp);
struct que *que;
struct msg *msg;
size_t msgsz = SCARG(uap, msgsz);
long msgtyp = SCARG(uap, msgtyp);
int error;
if ((que = que_lookup(SCARG(uap, msqid))) == NULL)
return (EINVAL);
if ((error = ipcperm(cred, &que->msqid_ds.msg_perm, IPC_R)))
return (error);
QREF(que);
/* msg_lookup handles matching; sleeping gets handled here */
while ((msg = msg_lookup(que, msgtyp)) == NULL) {
if (SCARG(uap, msgflg) & IPC_NOWAIT) {
error = ENOMSG;
goto out;
}
que->que_flags |= MSGQ_READERS;
if ((error = tsleep_nsec(que, PZERO|PCATCH, "msgwait", INFSLP)))
goto out;
/* make sure the queue is still alive */
if (que->que_flags & MSGQ_DYING) {
error = EIDRM;
goto out;
}
}
/* if msg_copyout fails, keep the message around so it isn't lost */
if ((error = msg_copyout(msg, msgp, &msgsz, p)))
goto out;
msg_dequeue(que, msg, p);
msg_free(msg);
if (que->que_flags & MSGQ_WRITERS) {
que->que_flags &= ~MSGQ_WRITERS;
wakeup(que);
}
/* ensure processes waiting on the global limit don't wedge */
if (maxmsgs) {
maxmsgs = 0;
que_wakewriters();
}
*retval = msgsz;
out:
QRELE(que);
return (error);
}
/*
* que management functions
*/
struct que *
que_create(key_t key, struct ucred *cred, int mode)
{
struct que *que, *que2;
int nextix = 1;
que = malloc(sizeof(*que), M_TEMP, M_WAIT|M_ZERO);
/* if malloc slept, a queue with the same key may have been created */
if (que_key_lookup(key)) {
free(que, M_TEMP, sizeof *que);
return (NULL);
}
/* find next available "index" */
TAILQ_FOREACH(que2, &msg_queues, que_next) {
if (nextix < que2->que_ix)
break;
nextix = que2->que_ix + 1;
}
que->que_ix = nextix;
que->msqid_ds.msg_perm.key = key;
que->msqid_ds.msg_perm.cuid = cred->cr_uid;
que->msqid_ds.msg_perm.uid = cred->cr_uid;
que->msqid_ds.msg_perm.cgid = cred->cr_gid;
que->msqid_ds.msg_perm.gid = cred->cr_gid;
que->msqid_ds.msg_perm.mode = mode & 0777;
que->msqid_ds.msg_perm.seq = ++sequence & 0x7fff;
que->msqid_ds.msg_qbytes = msginfo.msgmnb;
que->msqid_ds.msg_ctime = gettime();
TAILQ_INIT(&que->que_msgs);
/* keep queues in "index" order */
if (que2)
TAILQ_INSERT_BEFORE(que2, que, que_next);
else
TAILQ_INSERT_TAIL(&msg_queues, que, que_next);
num_ques++;
return (que);
}
struct que *
que_lookup(int id)
{
struct que *que;
TAILQ_FOREACH(que, &msg_queues, que_next)
if (que->que_ix == IPCID_TO_IX(id))
break;
/* don't return queues marked for removal */
if (que && que->que_flags & MSGQ_DYING)
return (NULL);
return (que);
}
struct que *
que_key_lookup(key_t key)
{
struct que *que;
if (key == IPC_PRIVATE)
return (NULL);
TAILQ_FOREACH(que, &msg_queues, que_next)
if (que->msqid_ds.msg_perm.key == key)
break;
/* don't return queues marked for removal */
if (que && que->que_flags & MSGQ_DYING)
return (NULL);
return (que);
}
void
que_wakewriters(void)
{
struct que *que;
TAILQ_FOREACH(que, &msg_queues, que_next) {
if (que->que_flags & MSGQ_WRITERS) {
que->que_flags &= ~MSGQ_WRITERS;
wakeup(que);
}
}
}
void
que_free(struct que *que)
{
struct msg *msg;
#ifdef DIAGNOSTIC
if (que->que_references > 0)
panic("freeing message queue with active references");
#endif
while ((msg = TAILQ_FIRST(&que->que_msgs))) {
TAILQ_REMOVE(&que->que_msgs, msg, msg_next);
msg_free(msg);
}
free(que, M_TEMP, sizeof *que);
num_ques--;
}
/*
* msg management functions
*/
struct msg *
msg_create(struct que *que)
{
struct msg *msg;
msg = pool_get(&sysvmsgpl, PR_WAITOK|PR_ZERO);
/* if the queue has died during allocation, return NULL */
if (que->que_flags & MSGQ_DYING) {
pool_put(&sysvmsgpl, msg);
wakeup(que);
return(NULL);
}
num_msgs++;
return (msg);
}
struct msg *
msg_lookup(struct que *que, int msgtyp)
{
struct msg *msg;
/*
* Three different matches are performed based on the value of msgtyp:
* 1) msgtyp > 0 => match exactly
* 2) msgtyp = 0 => match any
* 3) msgtyp < 0 => match any up to absolute value of msgtyp
*/
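/*
 * Illustrative only (hypothetical queue contents): with messages of
 * types 3, 1 and 5 enqueued in that order, msgtyp 5 returns the type-5
 * message, msgtyp 0 returns the type-3 message (first in), and
 * msgtyp -2 returns the type-1 message (first one with type <= 2).
 */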
TAILQ_FOREACH(msg, &que->que_msgs, msg_next)
if (msgtyp == 0 || msgtyp == msg->msg_type ||
(msgtyp < 0 && msg->msg_type <= -msgtyp))
break;
return (msg);
}
void
msg_free(struct msg *msg)
{
m_freem(msg->msg_data);
pool_put(&sysvmsgpl, msg);
num_msgs--;
}
void
msg_enqueue(struct que *que, struct msg *msg, struct proc *p)
{
que->msqid_ds.msg_cbytes += msg->msg_len;
que->msqid_ds.msg_qnum++;
que->msqid_ds.msg_lspid = p->p_p->ps_pid;
que->msqid_ds.msg_stime = gettime();
TAILQ_INSERT_TAIL(&que->que_msgs, msg, msg_next);
}
void
msg_dequeue(struct que *que, struct msg *msg, struct proc *p)
{
que->msqid_ds.msg_cbytes -= msg->msg_len;
que->msqid_ds.msg_qnum--;
que->msqid_ds.msg_lrpid = p->p_p->ps_pid;
que->msqid_ds.msg_rtime = gettime();
TAILQ_REMOVE(&que->que_msgs, msg, msg_next);
}
/*
* The actual I/O routines. A note concerning the layout of SysV msg buffers:
*
* The data to be copied is laid out as a single userspace buffer, with a
* long preceding an opaque buffer of len bytes. The long value ends
* up being the message type, which needs to be copied separately from
* the buffer data, which is stored in mbufs.
*/
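/*
 * A minimal sketch of the userland view, purely for illustration (the
 * struct name and mtext size are hypothetical, not part of this file):
 *
 *	struct mymsg {
 *		long mtype;		(copied into msg->msg_type)
 *		char mtext[64];		(copied into the mbuf chain)
 *	};
 *	msgsnd(id, &m, sizeof(m.mtext), IPC_NOWAIT);
 *
 * The msgsz handed to msg_copyin() counts only mtext; the leading long
 * is transferred separately.
 */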
int
msg_copyin(struct msg *msg, const char *ubuf, size_t len, struct proc *p)
{
struct mbuf **mm, *m;
size_t xfer;
int error;
if (msg == NULL)
panic ("msg NULL");
if ((error = copyin(ubuf, &msg->msg_type, sizeof(long)))) {
msg_free(msg);
return (error);
}
if (msg->msg_type < 1) {
msg_free(msg);
return (EINVAL);
}
ubuf += sizeof(long);
msg->msg_len = 0;
mm = &msg->msg_data;
while (msg->msg_len < len) {
m = m_get(M_WAIT, MT_DATA);
if (len >= MINCLSIZE) {
MCLGET(m, M_WAIT);
xfer = min(len, MCLBYTES);
} else {
xfer = min(len, MLEN);
}
m->m_len = xfer;
msg->msg_len += xfer;
*mm = m;
mm = &m->m_next;
}
for (m = msg->msg_data; m; m = m->m_next) {
if ((error = copyin(ubuf, mtod(m, void *), m->m_len))) {
msg_free(msg);
return (error);
}
ubuf += m->m_len;
}
return (0);
}
int
msg_copyout(struct msg *msg, char *ubuf, size_t *len, struct proc *p)
{
struct mbuf *m;
size_t xfer;
int error;
#ifdef DIAGNOSTIC
if (msg->msg_len > MSGMAX)
panic("SysV message longer than MSGMAX");
#endif
/* silently truncate messages too large for user buffer */
xfer = min(*len, msg->msg_len);
if ((error = copyout(&msg->msg_type, ubuf, sizeof(long))))
return (error);
ubuf += sizeof(long);
*len = xfer;
for (m = msg->msg_data; m; m = m->m_next) {
if ((error = copyout(mtod(m, void *), ubuf, m->m_len)))
return (error);
ubuf += m->m_len;
}
return (0);
}
int
sysctl_sysvmsg(int *name, u_int namelen, void *where, size_t *sizep)
{
struct msg_sysctl_info *info;
struct que *que;
size_t infolen, infolen0;
int error;
switch (*name) {
case KERN_SYSVIPC_MSG_INFO:
if (namelen != 1)
return (ENOTDIR);
/*
* The userland ipcs(1) utility expects to be able
* to iterate over at least msginfo.msgmni queues,
* even if those queues don't exist. This is an
* artifact of the previous implementation of
* message queues; for now, emulate this behavior
* until a more thorough fix can be made.
*/
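/*
 * Sketch of the exported layout (sizes are whatever the running kernel
 * uses; shown only to make the infolen computation below concrete):
 *
 *	+----------------+-------------+-----+----------------------+
 *	| struct msginfo | msqid_ds[0] | ... | msqid_ds[msgmni - 1] |
 *	+----------------+-------------+-----+----------------------+
 *
 * Unused slots stay zeroed; live queues are copied into the slot
 * matching their que_ix below.
 */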
infolen0 = sizeof(msginfo) +
msginfo.msgmni * sizeof(struct msqid_ds);
if (where == NULL) {
*sizep = infolen0;
return (0);
}
/*
* More special-casing due to previous implementation:
* if the caller just wants the msginfo struct, then
* sizep will point to the value sizeof(struct msginfo).
* In that case, only copy out the msginfo struct to
* the caller.
*/
if (*sizep == sizeof(struct msginfo))
return (copyout(&msginfo, where, sizeof(msginfo)));
info = malloc(infolen0, M_TEMP, M_WAIT|M_ZERO);
/* if the malloc slept, this may have changed */
infolen = sizeof(msginfo) +
msginfo.msgmni * sizeof(struct msqid_ds);
if (*sizep < infolen) {
free(info, M_TEMP, infolen0);
return (ENOMEM);
}
memcpy(&info->msginfo, &msginfo, sizeof(struct msginfo));
/*
* Special case #3: the previous array-based implementation
* exported the array indices and userland has come to rely
* upon these indices, so keep behavior consistent.
*/
TAILQ_FOREACH(que, &msg_queues, que_next)
memcpy(&info->msgids[que->que_ix], &que->msqid_ds,
sizeof(struct msqid_ds));
error = copyout(info, where, infolen);
free(info, M_TEMP, infolen0);
return (error);
default:
return (EINVAL);
}
}
/* $OpenBSD: ufs_bmap.c,v 1.37 2021/12/12 09:14:59 visa Exp $ */
/* $NetBSD: ufs_bmap.c,v 1.3 1996/02/09 22:36:00 christos Exp $ */
/*
* Copyright (c) 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
/*
* Bmap converts the logical block number of a file to its physical block
* number on the disk. The conversion is done by using the logical block
* number to index into the array of block pointers described by the dinode.
*/
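/*
 * For example (illustrative only), a VOP_BMAP of logical block 0 simply
 * returns the disk address stored in the inode's first direct block
 * pointer, converted by blkptrtodb() into a device block number; larger
 * logical blocks may require walking indirect blocks, which is what
 * ufs_bmaparray() below does.
 */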
int
ufs_bmap(void *v)
{
struct vop_bmap_args *ap = v;
/*
* Check for underlying vnode requests and ensure that logical
* to physical mapping is requested.
*/
if (ap->a_vpp != NULL)
*ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
if (ap->a_bnp == NULL)
return (0);
return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
ap->a_runp));
}
/*
* Indirect blocks are now on the vnode for the file. They are given negative
* logical block numbers. Indirect blocks are addressed by the negative
* address of the first data block to which they point. Double indirect blocks
* are addressed by one less than the address of the first indirect block to
* which they point. Triple indirect blocks are addressed by one less than
* the address of the first double indirect block to which they point.
*
* ufs_bmaparray does the bmap conversion, and if requested returns the
* array of logical blocks which must be traversed to get to a block.
* Each entry contains the offset into that block that gets you to the
* next block and the disk address of the block (if it is assigned).
*/
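/*
 * A worked example under hypothetical geometry (NDADDR == 12,
 * NIADDR == 3, 2048 pointers per indirect block), meant only to
 * illustrate the numbering scheme described above: the single indirect
 * block covers data blocks 12..2059 and gets lbn -12; the first
 * indirect block hanging off the double indirect block covers data
 * blocks starting at 2060 and gets lbn -2060, so the double indirect
 * block itself gets lbn -2061.
 */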
int
ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
int *nump, int *runp)
{
struct inode *ip;
struct buf *bp;
struct ufsmount *ump;
struct mount *mp;
struct vnode *devvp;
struct indir a[NIADDR+1], *xap;
daddr_t daddr, metalbn;
int error, maxrun = 0, num;
ip = VTOI(vp);
mp = vp->v_mount;
ump = VFSTOUFS(mp);
#ifdef DIAGNOSTIC
if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL))
panic("ufs_bmaparray: invalid arguments");
#endif
if (runp) {
/*
* XXX
* If MAXBSIZE is the largest transfer the disks can handle,
* we probably want maxrun to be 1 block less so that we
* don't create a block larger than the device can handle.
*/
*runp = 0;
maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1;
}
xap = ap == NULL ? a : ap;
if (!nump)
nump = #
if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
return (error);
num = *nump;
if (num == 0) {
*bnp = blkptrtodb(ump, DIP(ip, db[bn]));
if (*bnp == 0)
*bnp = -1;
else if (runp)
for (++bn; bn < NDADDR && *runp < maxrun &&
is_sequential(ump, DIP(ip, db[bn - 1]),
DIP(ip, db[bn]));
++bn, ++*runp);
return (0);
}
/* Get disk address out of indirect block array */
daddr = DIP(ip, ib[xap->in_off]);
devvp = VFSTOUFS(vp->v_mount)->um_devvp;
for (bp = NULL, ++xap; --num; ++xap) {
/*
* Exit the loop if there is no disk address assigned yet and
* the indirect block isn't in the cache, or if we were
* looking for an indirect block and we've found it.
*/
metalbn = xap->in_lbn;
if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn)
break;
/*
* If we get here, we've either got the block in the cache
* or we have a disk address for it, go fetch it.
*/
if (bp)
brelse(bp);
xap->in_exists = 1;
bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, INFSLP);
if (bp->b_flags & (B_DONE | B_DELWRI)) {
;
}
#ifdef DIAGNOSTIC
else if (!daddr)
panic("ufs_bmaparray: indirect block not in cache");
#endif
else {
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
bcstats.pendingreads++;
bcstats.numreads++;
VOP_STRATEGY(bp->b_vp, bp);
curproc->p_ru.ru_inblock++; /* XXX */
if ((error = biowait(bp)) != 0) {
brelse(bp);
return (error);
}
}
#ifdef FFS2
if (ip->i_ump->um_fstype == UM_UFS2) {
daddr = ((int64_t *)bp->b_data)[xap->in_off];
if (num == 1 && daddr && runp)
for (bn = xap->in_off + 1;
bn < MNINDIR(ump) && *runp < maxrun &&
is_sequential(ump,
((int64_t *)bp->b_data)[bn - 1],
((int64_t *)bp->b_data)[bn]);
++bn, ++*runp);
continue;
}
#endif /* FFS2 */
daddr = ((int32_t *)bp->b_data)[xap->in_off];
if (num == 1 && daddr && runp)
for (bn = xap->in_off + 1;
bn < MNINDIR(ump) && *runp < maxrun &&
is_sequential(ump,
((int32_t *)bp->b_data)[bn - 1],
((int32_t *)bp->b_data)[bn]);
++bn, ++*runp);
}
if (bp)
brelse(bp);
daddr = blkptrtodb(ump, daddr);
*bnp = daddr == 0 ? -1 : daddr;
return (0);
}
/*
* Create an array of logical block number/offset pairs which represent the
* path of indirect blocks required to access a data block. The first "pair"
* contains the logical block number of the appropriate single, double or
* triple indirect block and the offset into the inode indirect block array.
* Note, the logical block number of the inode single/double/triple indirect
* block appears twice in the array, once with the offset into the i_ffs_ib and
* once with the offset into the page itself.
*/
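/*
 * Continuing the hypothetical geometry used in the example above, a
 * request for data block 7060 (doubly indirect) would produce three
 * entries: {lbn -2061, off 1} for the inode's double indirect pointer,
 * {lbn -2061, off 2} for the slot inside the double indirect block, and
 * {lbn -6156, off 904} for the slot inside the single indirect block
 * that finally holds the data block's address.
 */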
int
ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump)
{
daddr_t metalbn, realbn;
struct ufsmount *ump;
int64_t blockcnt;
int i, numlevels, off;
ump = VFSTOUFS(vp->v_mount);
if (nump)
*nump = 0;
numlevels = 0;
realbn = bn;
if (bn < 0)
bn = -bn;
#ifdef DIAGNOSTIC
if (realbn < 0 && realbn > -NDADDR) {
panic ("ufs_getlbns: Invalid indirect block %lld specified",
(long long)realbn);
}
#endif
/* The first NDADDR blocks are direct blocks. */
if (bn < NDADDR)
return (0);
/*
* Determine the number of levels of indirection. After this loop
* is done, blockcnt indicates the number of data blocks possible
* at the given level of indirection, and NIADDR - i is the number
* of levels of indirection needed to locate the requested block.
*/
for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) {
if (i == 0)
return (EFBIG);
blockcnt *= MNINDIR(ump);
if (bn < blockcnt)
break;
}
/* Calculate the address of the first meta-block. */
if (realbn >= 0)
metalbn = -(realbn - bn + NIADDR - i);
else
metalbn = -(-realbn - bn + NIADDR - i);
/*
* At each iteration, off is the offset into the bap array which is
* an array of disk addresses at the current level of indirection.
* The logical block number and the offset in that block are stored
* into the argument array.
*/
ap->in_lbn = metalbn;
ap->in_off = off = NIADDR - i;
ap->in_exists = 0;
ap++;
for (++numlevels; i <= NIADDR; i++) {
/* If searching for a meta-data block, quit when found. */
if (metalbn == realbn)
break;
blockcnt /= MNINDIR(ump);
off = (bn / blockcnt) % MNINDIR(ump);
++numlevels;
ap->in_lbn = metalbn;
ap->in_off = off;
ap->in_exists = 0;
++ap;
metalbn -= -1 + off * blockcnt;
}
#ifdef DIAGNOSTIC
if (realbn < 0 && metalbn != realbn) {
panic("ufs_getlbns: indirect block %lld not found",
(long long)realbn);
}
#endif
if (nump)
*nump = numlevels;
return (0);
}
/* $OpenBSD: trap.c,v 1.90 2021/12/09 00:26:11 guenther Exp $ */
/* $NetBSD: trap.c,v 1.2 2003/05/04 23:51:56 fvdl Exp $ */
/*-
* Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)trap.c 7.4 (Berkeley) 5/13/91
*/
/*
* amd64 Trap and System call handling
*/
#undef TRAP_SIGDEBUG
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/user.h>
#include <sys/signal.h>
#include <sys/syscall.h>
#include <sys/syscall_mi.h>
#include <sys/stdarg.h>
#include <uvm/uvm_extern.h>
#include <machine/cpu.h>
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/psl.h>
#include <machine/trap.h>
#ifdef DDB
#include <ddb/db_output.h>
#include <machine/db_machdep.h>
#endif
#include "isa.h"
int upageflttrap(struct trapframe *, uint64_t);
int kpageflttrap(struct trapframe *, uint64_t);
void kerntrap(struct trapframe *);
void usertrap(struct trapframe *);
void ast(struct trapframe *);
void syscall(struct trapframe *);
const char * const trap_type[] = {
"privileged instruction fault", /* 0 T_PRIVINFLT */
"breakpoint trap", /* 1 T_BPTFLT */
"arithmetic trap", /* 2 T_ARITHTRAP */
"reserved trap", /* 3 T_RESERVED */
"protection fault", /* 4 T_PROTFLT */
"trace trap", /* 5 T_TRCTRAP */
"page fault", /* 6 T_PAGEFLT */
"alignment fault", /* 7 T_ALIGNFLT */
"integer divide fault", /* 8 T_DIVIDE */
"non-maskable interrupt", /* 9 T_NMI */
"overflow trap", /* 10 T_OFLOW */
"bounds check fault", /* 11 T_BOUND */
"FPU not available fault", /* 12 T_DNA */
"double fault", /* 13 T_DOUBLEFLT */
"FPU operand fetch fault", /* 14 T_FPOPFLT */
"invalid TSS fault", /* 15 T_TSSFLT */
"segment not present fault", /* 16 T_SEGNPFLT */
"stack fault", /* 17 T_STKFLT */
"machine check", /* 18 T_MCA */
"SSE FP exception", /* 19 T_XMM */
};
const int trap_types = nitems(trap_type);
#ifdef DEBUG
int trapdebug = 0;
#endif
static void trap_print(struct trapframe *, int _type);
static inline void frame_dump(struct trapframe *_tf, struct proc *_p,
const char *_sig, uint64_t _cr2);
static inline void verify_smap(const char *_func);
static inline void debug_trap(struct trapframe *_frame, struct proc *_p,
long _type);
static inline void
fault(const char *fmt, ...)
{
struct cpu_info *ci = curcpu();
va_list ap;
atomic_cas_ptr(&panicstr, NULL, ci->ci_panicbuf);
va_start(ap, fmt);
vsnprintf(ci->ci_panicbuf, sizeof(ci->ci_panicbuf), fmt, ap);
va_end(ap);
#ifdef DDB
db_printf("%s\n", ci->ci_panicbuf);
#else
printf("%s\n", ci->ci_panicbuf);
#endif
}
static inline int
pgex2access(int pgex)
{
if (pgex & PGEX_W)
return PROT_WRITE;
else if (pgex & PGEX_I)
return PROT_EXEC;
return PROT_READ;
}
/*
* upageflttrap(frame, usermode): page fault handler
* Returns non-zero if the fault was handled (possibly by generating
* a signal). Returns zero, possibly still holding the kernel lock,
* if something was so broken that we should panic.
*/
int
upageflttrap(struct trapframe *frame, uint64_t cr2)
{
struct proc *p = curproc;
vaddr_t va = trunc_page((vaddr_t)cr2);
vm_prot_t access_type = pgex2access(frame->tf_err);
union sigval sv;
int signal, sicode, error;
error = uvm_fault(&p->p_vmspace->vm_map, va, 0, access_type);
if (error == 0) {
uvm_grow(p, va);
return 1;
}
signal = SIGSEGV;
sicode = SEGV_MAPERR;
if (error == ENOMEM) {
printf("UVM: pid %d (%s), uid %d killed:"
" out of swap\n", p->p_p->ps_pid, p->p_p->ps_comm,
p->p_ucred ? (int)p->p_ucred->cr_uid : -1);
signal = SIGKILL;
} else {
if (error == EACCES)
sicode = SEGV_ACCERR;
else if (error == EIO) {
signal = SIGBUS;
sicode = BUS_OBJERR;
}
}
sv.sival_ptr = (void *)cr2;
trapsignal(p, signal, T_PAGEFLT, sicode, sv);
return 1;
}
/*
* kpageflttrap(frame, usermode): page fault handler
* Returns non-zero if the fault was handled (possibly by generating
* a signal). Returns zero, possibly still holding the kernel lock,
* if something was so broken that we should panic.
*/
int
kpageflttrap(struct trapframe *frame, uint64_t cr2)
{
struct proc *p = curproc;
struct pcb *pcb;
vaddr_t va = trunc_page((vaddr_t)cr2);
struct vm_map *map;
vm_prot_t access_type = pgex2access(frame->tf_err);
caddr_t onfault;
int error;
if (p == NULL || p->p_addr == NULL || p->p_vmspace == NULL)
return 0;
pcb = &p->p_addr->u_pcb;
/* This will only trigger if SMEP is enabled */
if (cr2 <= VM_MAXUSER_ADDRESS && frame->tf_err & PGEX_I) {
KERNEL_LOCK();
fault("attempt to execute user address %p "
"in supervisor mode", (void *)cr2);
/* retain kernel lock */
return 0;
}
/* This will only trigger if SMAP is enabled */
if (pcb->pcb_onfault == NULL && cr2 <= VM_MAXUSER_ADDRESS &&
frame->tf_err & PGEX_P) {
KERNEL_LOCK();
fault("attempt to access user address %p "
"in supervisor mode", (void *)cr2);
/* retain kernel lock */
return 0;
}
/*
* It is only a kernel address space fault if:
* 1. we are running in ring 0, and
* 2. pcb_onfault is not set, or
* 3. pcb_onfault is set but it is a supervisor space fault.
* The last can occur during an exec() copyin where the
* argument space is lazy-allocated.
*/
map = &p->p_vmspace->vm_map;
if (va >= VM_MIN_KERNEL_ADDRESS)
map = kernel_map;
if (curcpu()->ci_inatomic == 0 || map == kernel_map) {
onfault = pcb->pcb_onfault;
pcb->pcb_onfault = NULL;
error = uvm_fault(map, va, 0, access_type);
pcb->pcb_onfault = onfault;
if (error == 0 && map != kernel_map)
uvm_grow(p, va);
} else
error = EFAULT;
if (error) {
if (pcb->pcb_onfault == NULL) {
/* bad memory access in the kernel */
KERNEL_LOCK();
fault("uvm_fault(%p, 0x%llx, 0, %d) -> %x",
map, cr2, access_type, error);
/* retain kernel lock */
return 0;
}
frame->tf_rip = (u_int64_t)pcb->pcb_onfault;
}
return 1;
}
/*
* kerntrap(frame):
* Exception, fault, and trap interface to BSD kernel. This
* common code is called from assembly language IDT gate entry
* routines that prepare a suitable stack frame, and restore this
* frame after the exception has been processed.
*/
void
kerntrap(struct trapframe *frame)
{
int type = (int)frame->tf_trapno;
uint64_t cr2 = rcr2();
verify_smap(__func__);
uvmexp.traps++;
debug_trap(frame, curproc, type);
switch (type) {
default:
we_re_toast:
#ifdef DDB
if (db_ktrap(type, 0, frame))
return;
#endif
trap_print(frame, type);
panic("trap type %d, code=%llx, pc=%llx",
type, frame->tf_err, frame->tf_rip);
/*NOTREACHED*/
case T_PAGEFLT: /* allow page faults in kernel mode */
if (kpageflttrap(frame, cr2))
return;
goto we_re_toast;
#if NISA > 0
case T_NMI:
#ifdef DDB
/* NMI can be hooked up to a pushbutton for debugging */
printf ("NMI ... going to debugger\n");
if (db_ktrap(type, 0, frame))
return;
#endif
/* machine/parity/power fail/"kitchen sink" faults */
if (x86_nmi() != 0)
goto we_re_toast;
else
return;
#endif /* NISA > 0 */
}
}
/*
* usertrap(frame): handler for exceptions, faults, and traps from userspace
* This is called from the assembly language IDT gate entries
* which prepare a suitable stack frame and restore the CPU state
* after the fault has been processed.
*/
void
usertrap(struct trapframe *frame)
{
struct proc *p = curproc;
int type = (int)frame->tf_trapno;
uint64_t cr2 = rcr2();
union sigval sv;
int sig, code;
verify_smap(__func__);
uvmexp.traps++;
debug_trap(frame, p, type);
p->p_md.md_regs = frame;
refreshcreds(p);
switch (type) {
case T_TSSFLT:
sig = SIGBUS;
code = BUS_OBJERR;
break;
case T_PROTFLT: /* protection fault */
case T_SEGNPFLT:
case T_STKFLT:
frame_dump(frame, p, "SEGV", 0);
sig = SIGSEGV;
code = SEGV_MAPERR;
break;
case T_ALIGNFLT:
sig = SIGBUS;
code = BUS_ADRALN;
break;
case T_PRIVINFLT: /* privileged instruction fault */
sig = SIGILL;
code = ILL_PRVOPC;
break;
case T_DIVIDE:
sig = SIGFPE;
code = FPE_INTDIV;
break;
case T_ARITHTRAP:
case T_XMM: /* real arithmetic exceptions */
sig = SIGFPE;
code = fputrap(type);
break;
case T_BPTFLT: /* bpt instruction fault */
case T_TRCTRAP: /* trace trap */
sig = SIGTRAP;
code = TRAP_BRKPT;
break;
case T_PAGEFLT: /* page fault */
if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
"[%s]%d/%d sp=%lx inside %lx-%lx: not MAP_STACK\n",
uvm_map_inentry_sp, p->p_vmspace->vm_map.sserial))
goto out;
if (upageflttrap(frame, cr2))
goto out;
/* FALLTHROUGH */
default:
trap_print(frame, type);
panic("impossible trap");
}
sv.sival_ptr = (void *)frame->tf_rip;
trapsignal(p, sig, type, code, sv);
out:
userret(p);
}
static void
trap_print(struct trapframe *frame, int type)
{
if (type < trap_types)
printf("fatal %s", trap_type[type]);
else
printf("unknown trap %d", type);
printf(" in %s mode\n", KERNELMODE(frame->tf_cs, frame->tf_rflags) ?
"supervisor" : "user");
printf("trap type %d code %llx rip %llx cs %llx rflags %llx cr2 "
"%llx cpl %x rsp %llx\n",
type, frame->tf_err, frame->tf_rip, frame->tf_cs,
frame->tf_rflags, rcr2(), curcpu()->ci_ilevel, frame->tf_rsp);
printf("gsbase %p kgsbase %p\n",
(void *)rdmsr(MSR_GSBASE), (void *)rdmsr(MSR_KERNELGSBASE));
}
static inline void
frame_dump(struct trapframe *tf, struct proc *p, const char *sig, uint64_t cr2)
{
#ifdef TRAP_SIGDEBUG
printf("pid %d (%s): %s at rip %llx addr %llx\n",
p->p_p->ps_pid, p->p_p->ps_comm, sig, tf->tf_rip, cr2);
printf("rip %p cs 0x%x rfl %p rsp %p ss 0x%x\n",
(void *)tf->tf_rip, (unsigned)tf->tf_cs & 0xffff,
(void *)tf->tf_rflags,
(void *)tf->tf_rsp, (unsigned)tf->tf_ss & 0xffff);
printf("err 0x%llx trapno 0x%llx\n",
tf->tf_err, tf->tf_trapno);
printf("rdi %p rsi %p rdx %p\n",
(void *)tf->tf_rdi, (void *)tf->tf_rsi, (void *)tf->tf_rdx);
printf("rcx %p r8 %p r9 %p\n",
(void *)tf->tf_rcx, (void *)tf->tf_r8, (void *)tf->tf_r9);
printf("r10 %p r11 %p r12 %p\n",
(void *)tf->tf_r10, (void *)tf->tf_r11, (void *)tf->tf_r12);
printf("r13 %p r14 %p r15 %p\n",
(void *)tf->tf_r13, (void *)tf->tf_r14, (void *)tf->tf_r15);
printf("rbp %p rbx %p rax %p\n",
(void *)tf->tf_rbp, (void *)tf->tf_rbx, (void *)tf->tf_rax);
#endif
}
static inline void
verify_smap(const char *func)
{
#ifdef DIAGNOSTIC
if (curcpu()->ci_feature_sefflags_ebx & SEFF0EBX_SMAP) {
u_long rf = read_rflags();
if (rf & PSL_AC) {
write_rflags(rf & ~PSL_AC);
panic("%s: AC set on entry", func);
}
}
#endif
}
static inline void
debug_trap(struct trapframe *frame, struct proc *p, long type)
{
#ifdef DEBUG
if (trapdebug) {
printf("trap %ld code %llx rip %llx cs %llx rflags %llx "
"cr2 %llx cpl %x\n",
type, frame->tf_err, frame->tf_rip, frame->tf_cs,
frame->tf_rflags, rcr2(), curcpu()->ci_ilevel);
printf("curproc %p\n", (void *)p);
if (p != NULL)
printf("pid %d\n", p->p_p->ps_pid);
}
#endif
}
/*
* ast(frame):
* AST handler. This is called from assembly language stubs when
* returning to userspace after a syscall or interrupt.
*/
void
ast(struct trapframe *frame)
{
struct proc *p = curproc;
uvmexp.traps++;
KASSERT(!KERNELMODE(frame->tf_cs, frame->tf_rflags));
p->p_md.md_regs = frame;
refreshcreds(p);
uvmexp.softs++;
mi_ast(p, curcpu()->ci_want_resched);
userret(p);
}
/*
* syscall(frame):
* System call request from POSIX system call gate interface to kernel.
*/
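/*
 * For illustration (standard amd64 syscall convention, matching the
 * register picks below): the syscall number arrives in %rax and the
 * first six arguments in %rdi, %rsi, %rdx, %r10, %r8 and %r9; anything
 * beyond that is fetched from the user stack with copyin().  E.g. a
 * hypothetical write(fd, buf, nbytes) arrives as rax = SYS_write,
 * rdi = fd, rsi = buf, rdx = nbytes.
 */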
void
syscall(struct trapframe *frame)
{
caddr_t params;
const struct sysent *callp;
struct proc *p;
int error;
size_t argsize, argoff;
register_t code, args[9], rval[2], *argp;
verify_smap(__func__);
uvmexp.syscalls++;
p = curproc;
code = frame->tf_rax;
argp = &args[0];
argoff = 0;
switch (code) {
case SYS_syscall:
case SYS___syscall:
/*
* Code is first argument, followed by actual args.
*/
code = frame->tf_rdi;
argp = &args[1];
argoff = 1;
break;
default:
break;
}
callp = sysent;
if (code < 0 || code >= SYS_MAXSYSCALL)
callp += SYS_syscall;
else
callp += code;
argsize = (callp->sy_argsize >> 3) + argoff;
if (argsize) {
switch (MIN(argsize, 6)) {
case 6:
args[5] = frame->tf_r9;
case 5:
args[4] = frame->tf_r8;
case 4:
args[3] = frame->tf_r10;
case 3:
args[2] = frame->tf_rdx;
case 2:
args[1] = frame->tf_rsi;
case 1:
args[0] = frame->tf_rdi;
break;
default:
panic("impossible syscall argsize");
}
if (argsize > 6) {
argsize -= 6;
params = (caddr_t)frame->tf_rsp + sizeof(register_t);
if ((error = copyin(params, &args[6], argsize << 3)))
goto bad;
}
}
rval[0] = 0;
rval[1] = frame->tf_rdx;
error = mi_syscall(p, code, callp, argp, rval);
switch (error) {
case 0:
frame->tf_rax = rval[0];
frame->tf_rdx = rval[1];
frame->tf_rflags &= ~PSL_C; /* carry bit */
break;
case ERESTART:
/* Back up over the syscall instruction (2 bytes) */
frame->tf_rip -= 2;
break;
case EJUSTRETURN:
/* nothing to do */
break;
default:
bad:
frame->tf_rax = error;
frame->tf_rflags |= PSL_C; /* carry bit */
break;
}
mi_syscall_return(p, code, error, rval);
}
void
child_return(void *arg)
{
struct proc *p = arg;
struct trapframe *tf = p->p_md.md_regs;
tf->tf_rax = 0;
tf->tf_rdx = 1;
tf->tf_rflags &= ~PSL_C;
KERNEL_UNLOCK();
mi_child_return(p);
}
/* $OpenBSD: in6_src.c,v 1.86 2022/02/22 01:15:02 guenther Exp $ */
/* $KAME: in6_src.c,v 1.36 2001/02/06 04:08:17 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *, struct ifnet **, u_int);
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and pcb. We need the additional opt parameter because
* the values set at pcb level can be overridden via cmsg.
*/
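/*
 * Rough order of preference implemented below: an address given via
 * IPV6_PKTINFO, then the address the socket is already bound to, then
 * an address on the interface named by ipi6_ifindex, then whatever
 * in6_selectsrc() picks, and finally an address derived from the
 * (possibly freshly allocated) route to the destination.
 */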
int
in6_pcbselsrc(struct in6_addr **in6src, struct sockaddr_in6 *dstsock,
struct inpcb *inp, struct ip6_pktopts *opts)
{
struct ip6_moptions *mopts = inp->inp_moptions6;
struct route_in6 *ro = &inp->inp_route6;
struct in6_addr *laddr = &inp->inp_laddr6;
u_int rtableid = inp->inp_rtableid;
struct ifnet *ifp = NULL;
struct sockaddr *ip6_source = NULL;
struct in6_addr *dst;
struct in6_ifaddr *ia6 = NULL;
struct in6_pktinfo *pi = NULL;
int error;
dst = &dstsock->sin6_addr;
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
* assigned to the node, and can be used as the packet's source
* address. If everything is okay, use the address as source.
*/
if (opts && (pi = opts->ip6po_pktinfo) &&
!IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
struct sockaddr_in6 sa6;
/* get the outgoing interface */
error = in6_selectif(dstsock, opts, mopts, ro, &ifp, rtableid);
if (error)
return (error);
bzero(&sa6, sizeof(sa6));
sa6.sin6_family = AF_INET6;
sa6.sin6_len = sizeof(sa6);
sa6.sin6_addr = pi->ipi6_addr;
if (ifp && IN6_IS_SCOPE_EMBED(&sa6.sin6_addr))
sa6.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
if_put(ifp); /* put reference from in6_selectif */
ia6 = ifatoia6(ifa_ifwithaddr(sin6tosa(&sa6), rtableid));
if (ia6 == NULL || (ia6->ia6_flags &
(IN6_IFF_ANYCAST|IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED)))
return (EADDRNOTAVAIL);
pi->ipi6_addr = sa6.sin6_addr; /* XXX: this overrides pi */
*in6src = &pi->ipi6_addr;
return (0);
}
/*
* If the source address is not specified but the socket (if any)
* is already bound, use the bound address.
*/
if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) {
*in6src = laddr;
return (0);
}
/*
* If the caller doesn't specify the source address but
* the outgoing interface, use an address associated with
* the interface.
*/
if (pi && pi->ipi6_ifindex) {
ifp = if_get(pi->ipi6_ifindex);
if (ifp == NULL)
return (ENXIO); /* XXX: better error? */
ia6 = in6_ifawithscope(ifp, dst, rtableid);
if_put(ifp);
if (ia6 == NULL)
return (EADDRNOTAVAIL);
*in6src = &ia6->ia_addr.sin6_addr;
return (0);
}
error = in6_selectsrc(in6src, dstsock, mopts, rtableid);
if (error != EADDRNOTAVAIL)
return (error);
/*
* If route is known or can be allocated now,
* our src addr is taken from the i/f, else punt.
*/
if (!rtisvalid(ro->ro_rt) || (ro->ro_tableid != rtableid) ||
!IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, dst)) {
rtfree(ro->ro_rt);
ro->ro_rt = NULL;
}
if (ro->ro_rt == NULL) {
struct sockaddr_in6 *sa6;
/* No route yet, so try to acquire one */
bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
ro->ro_tableid = rtableid;
sa6 = &ro->ro_dst;
sa6->sin6_family = AF_INET6;
sa6->sin6_len = sizeof(struct sockaddr_in6);
sa6->sin6_addr = *dst;
sa6->sin6_scope_id = dstsock->sin6_scope_id;
ro->ro_rt = rtalloc(sin6tosa(&ro->ro_dst),
RT_RESOLVE, ro->ro_tableid);
}
/*
* in_pcbconnect() checks IFF_LOOPBACK to skip using
* the address. But we don't know why it does so.
* It is necessary to ensure the scope even for lo0,
* so we don't check IFF_LOOPBACK here.
*/
if (ro->ro_rt) {
ifp = if_get(ro->ro_rt->rt_ifidx);
if (ifp != NULL) {
ia6 = in6_ifawithscope(ifp, dst, rtableid);
if_put(ifp);
}
if (ia6 == NULL) /* xxx scope error ?*/
ia6 = ifatoia6(ro->ro_rt->rt_ifa);
}
/*
* Use preferred source address if:
* - destination is not onlink
* - preferred source address is set
* - output interface is UP
*/
if (ro->ro_rt && !(ro->ro_rt->rt_flags & RTF_LLINFO) &&
!(ro->ro_rt->rt_flags & RTF_HOST)) {
ip6_source = rtable_getsource(rtableid, AF_INET6);
if (ip6_source != NULL) {
struct ifaddr *ifa;
if ((ifa = ifa_ifwithaddr(ip6_source, rtableid)) != NULL &&
ISSET(ifa->ifa_ifp->if_flags, IFF_UP)) {
*in6src = &satosin6(ip6_source)->sin6_addr;
return (0);
}
}
}
if (ia6 == NULL)
return (EHOSTUNREACH); /* no route */
*in6src = &ia6->ia_addr.sin6_addr;
return (0);
}
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and multicast options.
* If necessary, this function looks up the routing table and returns
* an entry to the caller for later use.
*/
int
in6_selectsrc(struct in6_addr **in6src, struct sockaddr_in6 *dstsock,
struct ip6_moptions *mopts, unsigned int rtableid)
{
struct ifnet *ifp = NULL;
struct in6_addr *dst;
struct in6_ifaddr *ia6 = NULL;
dst = &dstsock->sin6_addr;
/*
* If the destination address is a link-local unicast address or
* a link/interface-local multicast address, and if the outgoing
* interface is specified by the sin6_scope_id field, use an address
* associated with the interface.
* XXX: We're now trying to define more specific semantics of
* sin6_scope_id field, so this part will be rewritten in
* the near future.
*/
if ((IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MC_LINKLOCAL(dst) ||
IN6_IS_ADDR_MC_INTFACELOCAL(dst)) && dstsock->sin6_scope_id) {
ifp = if_get(dstsock->sin6_scope_id);
if (ifp == NULL)
return (ENXIO); /* XXX: better error? */
ia6 = in6_ifawithscope(ifp, dst, rtableid);
if_put(ifp);
if (ia6 == NULL)
return (EADDRNOTAVAIL);
*in6src = &ia6->ia_addr.sin6_addr;
return (0);
}
/*
* If the destination address is a multicast address and
* the outgoing interface for the address is specified
* by the caller, use an address associated with the interface.
* Even if the outgoing interface is not specified, we also
* choose a loopback interface as the outgoing interface.
*/
if (IN6_IS_ADDR_MULTICAST(dst)) {
ifp = mopts ? if_get(mopts->im6o_ifidx) : NULL;
if (!ifp && dstsock->sin6_scope_id)
ifp = if_get(htons(dstsock->sin6_scope_id));
if (ifp) {
ia6 = in6_ifawithscope(ifp, dst, rtableid);
if_put(ifp);
if (ia6 == NULL)
return (EADDRNOTAVAIL);
*in6src = &ia6->ia_addr.sin6_addr;
return (0);
}
}
return (EADDRNOTAVAIL);
}
struct rtentry *
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct route_in6 *ro, unsigned int rtableid)
{
struct in6_addr *dst;
dst = &dstsock->sin6_addr;
/*
* Use a cached route if it exists and is valid, else try to allocate
* a new one.
*/
if (ro) {
if (rtisvalid(ro->ro_rt))
KASSERT(sin6tosa(&ro->ro_dst)->sa_family == AF_INET6);
if (!rtisvalid(ro->ro_rt) ||
!IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, dst)) {
rtfree(ro->ro_rt);
ro->ro_rt = NULL;
}
if (ro->ro_rt == NULL) {
struct sockaddr_in6 *sa6;
/* No route yet, so try to acquire one */
bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
ro->ro_tableid = rtableid;
sa6 = &ro->ro_dst;
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
ro->ro_tableid = rtableid;
ro->ro_rt = rtalloc_mpath(sin6tosa(&ro->ro_dst),
NULL, ro->ro_tableid);
}
/*
* Check if the outgoing interface conflicts with
* the interface specified by ipi6_ifindex (if specified).
* Note that loopback interface is always okay.
* (this may happen when we are sending a packet to one of
* our own addresses.)
*/
if (opts && opts->ip6po_pktinfo &&
opts->ip6po_pktinfo->ipi6_ifindex) {
if (ro->ro_rt != NULL &&
!ISSET(ro->ro_rt->rt_flags, RTF_LOCAL) &&
ro->ro_rt->rt_ifidx !=
opts->ip6po_pktinfo->ipi6_ifindex) {
return (NULL);
}
}
return (ro->ro_rt);
}
return (NULL);
}
int
in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp,
u_int rtableid)
{
struct rtentry *rt = NULL;
struct in6_pktinfo *pi = NULL;
/* If the caller specifies the outgoing interface explicitly, use it. */
if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
*retifp = if_get(pi->ipi6_ifindex);
if (*retifp != NULL)
return (0);
}
/*
* If the destination address is a multicast address and the outgoing
* interface for the address is specified by the caller, use it.
*/
if (IN6_IS_ADDR_MULTICAST(&dstsock->sin6_addr) &&
mopts != NULL && (*retifp = if_get(mopts->im6o_ifidx)) != NULL)
return (0);
rt = in6_selectroute(dstsock, opts, ro, rtableid);
if (rt == NULL)
return (EHOSTUNREACH);
/*
* do not use a rejected or black hole route.
* XXX: this check should be done in the L2 output routine.
* However, if we skipped this check here, we'd see the following
* scenario:
* - install a rejected route for a scoped address prefix
* (like fe80::/10)
* - send a packet to a destination that matches the scoped prefix,
* with ambiguity about the scope zone.
* - pick the outgoing interface from the route, and disambiguate the
* scope zone with the interface.
* - ip6_output() would try to get another route with the "new"
* destination, which may be valid.
* - we'd see no error on output.
* Although this may not be very harmful, it should still be confusing.
* We thus reject the case here.
*/
if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE)))
return (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
if (rt != NULL)
*retifp = if_get(rt->rt_ifidx);
return (0);
}
int
in6_selecthlim(struct inpcb *in6p)
{
if (in6p && in6p->inp_hops >= 0)
return (in6p->inp_hops);
return (ip6_defhlim);
}
/*
* generate kernel-internal form (scopeid embedded into s6_addr16[1]).
* If the address scope is link-local, embed the interface index in the
* address. The routine determines our precedence
* between advanced API scope/interface specification and basic API
* specification.
*
* this function should be nuked in the future, when we get rid of
* embedded scopeid thing.
*
* XXX actually, it is over-specification to return ifp against sin6_scope_id.
* there can be multiple interfaces that belong to a particular scope zone
* (in specification, we have 1:N mapping between a scope zone and interfaces).
* we may want to change the function to return something other than ifp.
*/
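/*
 * Illustration of the embedded form: a link-local destination such as
 * fe80::1 sent over the interface with if_index 2 is carried inside the
 * kernel as fe80:2::1, i.e. s6_addr16[1] holds htons(2); the index is
 * removed again by in6_recoverscope() or in6_clearscope() below.
 */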
int
in6_embedscope(struct in6_addr *in6, const struct sockaddr_in6 *sin6,
struct inpcb *in6p)
{
struct ifnet *ifp = NULL;
u_int32_t scopeid;
*in6 = sin6->sin6_addr;
scopeid = sin6->sin6_scope_id;
/*
* don't try to read sin6->sin6_addr beyond here, since the caller may
* ask us to overwrite existing sockaddr_in6
*/
if (IN6_IS_SCOPE_EMBED(in6)) {
struct in6_pktinfo *pi;
/*
* KAME assumption: link id == interface id
*/
if (in6p && in6p->inp_outputopts6 &&
(pi = in6p->inp_outputopts6->ip6po_pktinfo) &&
pi->ipi6_ifindex) {
ifp = if_get(pi->ipi6_ifindex);
if (ifp == NULL)
return ENXIO; /* XXX EINVAL? */
in6->s6_addr16[1] = htons(pi->ipi6_ifindex);
} else if (in6p && IN6_IS_ADDR_MULTICAST(in6) &&
in6p->inp_moptions6 &&
(ifp = if_get(in6p->inp_moptions6->im6o_ifidx))) {
in6->s6_addr16[1] = htons(ifp->if_index);
} else if (scopeid) {
ifp = if_get(scopeid);
if (ifp == NULL)
return ENXIO; /* XXX EINVAL? */
/*XXX assignment to 16bit from 32bit variable */
in6->s6_addr16[1] = htons(scopeid & 0xffff);
}
if_put(ifp);
}
return 0;
}
/*
* generate standard sockaddr_in6 from embedded form.
* touches sin6_addr and sin6_scope_id only.
*
* this function should be nuked in the future, when we get rid of
* embedded scopeid thing.
*/
void
in6_recoverscope(struct sockaddr_in6 *sin6, const struct in6_addr *in6)
{
u_int32_t scopeid;
sin6->sin6_addr = *in6;
/*
* don't try to read *in6 beyond here, since the caller may
* ask us to overwrite existing sockaddr_in6
*/
sin6->sin6_scope_id = 0;
if (IN6_IS_SCOPE_EMBED(in6)) {
/*
* KAME assumption: link id == interface id
*/
scopeid = ntohs(sin6->sin6_addr.s6_addr16[1]);
if (scopeid) {
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id = scopeid;
}
}
}
/*
* just clear the embedded scope identifier.
*/
void
in6_clearscope(struct in6_addr *addr)
{
if (IN6_IS_SCOPE_EMBED(addr))
addr->s6_addr16[1] = 0;
}
/* $OpenBSD: subr_autoconf.c,v 1.96 2022/04/07 09:37:32 tb Exp $ */
/* $NetBSD: subr_autoconf.c,v 1.21 1996/04/04 06:06:18 cgd Exp $ */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratories.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp (LBL)
*
* @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/device.h>
#include <sys/hotplug.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/reboot.h>
#include "hotplug.h"
#include "mpath.h"
/*
* Autoconfiguration subroutines.
*/
/*
* ioconf.c exports exactly two names: cfdata and cfroots. All system
* devices and drivers are found via these tables.
*/
extern short cfroots[];
#define ROOT ((struct device *)NULL)
struct matchinfo {
cfmatch_t fn;
struct device *parent;
void *match, *aux;
int indirect, pri;
};
#ifndef AUTOCONF_VERBOSE
#define AUTOCONF_VERBOSE 0
#endif /* AUTOCONF_VERBOSE */
int autoconf_verbose = AUTOCONF_VERBOSE; /* trace probe calls */
static void mapply(struct matchinfo *, struct cfdata *);
struct deferred_config {
TAILQ_ENTRY(deferred_config) dc_queue;
struct device *dc_dev;
void (*dc_func)(struct device *);
};
TAILQ_HEAD(, deferred_config) deferred_config_queue;
TAILQ_HEAD(, deferred_config) mountroot_config_queue;
void *config_rootsearch(cfmatch_t, char *, void *);
void config_process_deferred_children(struct device *);
struct devicelist alldevs; /* list of all devices */
volatile int config_pending; /* semaphore for mountroot */
struct mutex autoconf_attdet_mtx = MUTEX_INITIALIZER(IPL_HIGH);
/*
* If > 0, devices are being attached and any thread which tries to
* detach will sleep; if < 0 devices are being detached and any
* thread which tries to attach will sleep.
*/
int autoconf_attdet;
/*
* Initialize autoconfiguration data structures. This occurs before console
* initialization as that might require use of this subsystem. Furthermore
* this means that malloc et al. are not yet available.
*/
void
config_init(void)
{
TAILQ_INIT(&deferred_config_queue);
TAILQ_INIT(&mountroot_config_queue);
TAILQ_INIT(&alldevs);
}
/*
* Apply the matching function and choose the best. This is used
* a few times and we want to keep the code small.
*/
void
mapply(struct matchinfo *m, struct cfdata *cf)
{
int pri;
void *match;
if (m->indirect)
match = config_make_softc(m->parent, cf);
else
match = cf;
if (autoconf_verbose) {
printf(">>> probing for %s", cf->cf_driver->cd_name);
if (cf->cf_fstate == FSTATE_STAR)
printf("*\n");
else
printf("%d\n", cf->cf_unit);
}
if (m->fn != NULL)
pri = (*m->fn)(m->parent, match, m->aux);
else {
if (cf->cf_attach->ca_match == NULL) {
panic("mapply: no match function for '%s' device",
cf->cf_driver->cd_name);
}
pri = (*cf->cf_attach->ca_match)(m->parent, match, m->aux);
}
if (autoconf_verbose)
printf(">>> %s probe returned %d\n", cf->cf_driver->cd_name,
pri);
if (pri > m->pri) {
if (m->indirect && m->match) {
cf = ((struct device *)m->match)->dv_cfdata;
free(m->match, M_DEVBUF, cf->cf_attach->ca_devsize);
}
m->match = match;
m->pri = pri;
} else {
if (m->indirect)
free(match, M_DEVBUF, cf->cf_attach->ca_devsize);
}
}
/*
* Iterate over all potential children of some device, calling the given
* function (default being the child's match function) for each one.
* Nonzero returns are matches; the highest value returned is considered
* the best match. Return the `found child' if we got a match, or NULL
* otherwise. The `aux' pointer is simply passed on through.
*
* Note that this function is designed so that it can be used to apply
* an arbitrary function to all potential children (its return value
* can be ignored).
*/
void *
config_search(cfmatch_t fn, struct device *parent, void *aux)
{
struct cfdata *cf;
short *p;
struct matchinfo m;
m.fn = fn;
m.parent = parent;
m.match = NULL;
m.aux = aux;
m.indirect = parent && (parent->dv_cfdata->cf_driver->cd_mode & CD_INDIRECT);
m.pri = 0;
for (cf = cfdata; cf->cf_driver; cf++) {
/*
* Skip cf if no longer eligible, otherwise scan
* through parents for one matching `parent',
* and try match function.
*/
if (cf->cf_fstate == FSTATE_FOUND)
continue;
if (cf->cf_fstate == FSTATE_DNOTFOUND ||
cf->cf_fstate == FSTATE_DSTAR)
continue;
if (boothowto & RB_UNHIBERNATE) {
if (cf->cf_driver->cd_mode & CD_SKIPHIBERNATE)
continue;
if (cf->cf_driver->cd_class == DV_IFNET)
continue;
if (cf->cf_driver->cd_class == DV_TAPE)
continue;
}
for (p = cf->cf_parents; *p >= 0; p++)
if (parent->dv_cfdata == &cfdata[*p])
mapply(&m, cf);
}
if (autoconf_verbose) {
if (m.match) {
if (m.indirect)
cf = ((struct device *)m.match)->dv_cfdata;
else
cf = (struct cfdata *)m.match;
printf(">>> %s probe won\n",
cf->cf_driver->cd_name);
} else
printf(">>> no winning probe\n");
}
return (m.match);
}
/*
* Iterate over all potential children of some device, calling the given
* function for each one.
*
* Note that this function is designed so that it can be used to apply
* an arbitrary function to all potential children (its return value
* can be ignored).
*/
void
config_scan(cfscan_t fn, struct device *parent)
{
struct cfdata *cf;
short *p;
void *match;
int indirect;
indirect = parent && (parent->dv_cfdata->cf_driver->cd_mode & CD_INDIRECT);
for (cf = cfdata; cf->cf_driver; cf++) {
/*
* Skip cf if no longer eligible, otherwise scan
* through parents for one matching `parent',
* and try match function.
*/
if (cf->cf_fstate == FSTATE_FOUND)
continue;
if (cf->cf_fstate == FSTATE_DNOTFOUND ||
cf->cf_fstate == FSTATE_DSTAR)
continue;
for (p = cf->cf_parents; *p >= 0; p++)
if (parent->dv_cfdata == &cfdata[*p]) {
match = indirect?
config_make_softc(parent, cf) :
(void *)cf;
(*fn)(parent, match);
}
}
}
/*
* Find the given root device.
* This is much like config_search, but there is no parent.
*/
void *
config_rootsearch(cfmatch_t fn, char *rootname, void *aux)
{
struct cfdata *cf;
short *p;
struct matchinfo m;
m.fn = fn;
m.parent = ROOT;
m.match = NULL;
m.aux = aux;
m.indirect = 0;
m.pri = 0;
/*
* Look at root entries for matching name. We do not bother
* with found-state here since only one instance of each possible
* root child should ever be searched.
*/
for (p = cfroots; *p >= 0; p++) {
cf = &cfdata[*p];
if (cf->cf_fstate == FSTATE_DNOTFOUND ||
cf->cf_fstate == FSTATE_DSTAR)
continue;
if (strcmp(cf->cf_driver->cd_name, rootname) == 0)
mapply(&m, cf);
}
return (m.match);
}
const char *msgs[3] = { "", " not configured\n", " unsupported\n" };
/*
* The given `aux' argument describes a device that has been found
* on the given parent, but not necessarily configured. Locate the
* configuration data for that device (using the submatch function
* provided, or using candidates' cd_match configuration driver
* functions) and attach it, and return the attached device. If the
* device was not configured, call the given `print' function and
* return NULL.
*/
struct device *
config_found_sm(struct device *parent, void *aux, cfprint_t print,
cfmatch_t submatch)
{
void *match;
if ((match = config_search(submatch, parent, aux)) != NULL)
return (config_attach(parent, match, aux, print));
if (print)
printf("%s", msgs[(*print)(aux, parent->dv_xname)]);
return (NULL);
}
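/*
 * Illustrative only: a minimal sketch of how a hypothetical bus driver
 * could hand a probed child to autoconf via config_found_sm().  The
 * mybus/mychild names and the attach-args layout are invented for this
 * example and are not part of this file.
 */
#if 0
struct mybus_attach_args {
	const char	*mba_name;	/* name of the probed child */
	int		 mba_addr;	/* bus address it was found at */
};

int
mybus_print(void *aux, const char *pnp)
{
	struct mybus_attach_args *mba = aux;

	if (pnp)
		printf("%s at %s", mba->mba_name, pnp);
	printf(" addr %d", mba->mba_addr);
	/* UNCONF asks for the "not configured" message when nothing attaches */
	return (UNCONF);
}

void
mybus_attach_children(struct device *self)
{
	struct mybus_attach_args mba;

	mba.mba_name = "mychild";
	mba.mba_addr = 0;
	/* NULL submatch: candidates are matched with their own match functions */
	config_found_sm(self, &mba, mybus_print, NULL);
}
#endif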
/*
* As above, but for root devices.
*/
struct device *
config_rootfound(char *rootname, void *aux)
{
void *match;
if ((match = config_rootsearch((cfmatch_t)NULL, rootname, aux)) != NULL)
return (config_attach(ROOT, match, aux, (cfprint_t)NULL));
printf("root device %s not configured\n", rootname);
return (NULL);
}
/*
* Attach a found device. Allocates memory for device variables.
*/
struct device *
config_attach(struct device *parent, void *match, void *aux, cfprint_t print)
{
struct cfdata *cf;
struct device *dev;
struct cfdriver *cd;
const struct cfattach *ca;
mtx_enter(&autoconf_attdet_mtx);
while (autoconf_attdet < 0)
msleep_nsec(&autoconf_attdet, &autoconf_attdet_mtx,
PWAIT, "autoconf", INFSLP);
autoconf_attdet++;
mtx_leave(&autoconf_attdet_mtx);
if (parent && (parent->dv_cfdata->cf_driver->cd_mode & CD_INDIRECT)) {
dev = match;
cf = dev->dv_cfdata;
} else {
cf = match;
dev = config_make_softc(parent, cf);
}
cd = cf->cf_driver;
ca = cf->cf_attach;
KASSERT(cd->cd_devs != NULL);
KASSERT(dev->dv_unit < cd->cd_ndevs);
KASSERT(cd->cd_devs[dev->dv_unit] == NULL);
cd->cd_devs[dev->dv_unit] = dev;
/*
* If this is a "STAR" device and we used the last unit, prepare for
* another one.
*/
if (cf->cf_fstate == FSTATE_STAR) {
if (dev->dv_unit == cf->cf_unit)
cf->cf_unit++;
} else
cf->cf_fstate = FSTATE_FOUND;
TAILQ_INSERT_TAIL(&alldevs, dev, dv_list);
device_ref(dev);
if (parent == ROOT)
printf("%s at root", dev->dv_xname);
else {
printf("%s at %s", dev->dv_xname, parent->dv_xname);
if (print)
(void) (*print)(aux, NULL);
}
/*
* Before attaching, clobber any unfound devices that are
* otherwise identical, or bump the unit number on all starred
* cfdata for this device.
*/
for (cf = cfdata; cf->cf_driver; cf++) {
if (cf->cf_driver == cd &&
cf->cf_unit == dev->dv_unit) {
if (cf->cf_fstate == FSTATE_NOTFOUND)
cf->cf_fstate = FSTATE_FOUND;
if (cf->cf_fstate == FSTATE_STAR)
cf->cf_unit++;
}
}
device_register(dev, aux);
(*ca->ca_attach)(parent, dev, aux);
config_process_deferred_children(dev);
#if NHOTPLUG > 0
if (!cold)
hotplug_device_attach(cd->cd_class, dev->dv_xname);
#endif
mtx_enter(&autoconf_attdet_mtx);
if (--autoconf_attdet == 0)
wakeup(&autoconf_attdet);
mtx_leave(&autoconf_attdet_mtx);
return (dev);
}
struct device *
config_make_softc(struct device *parent, struct cfdata *cf)
{
struct device *dev;
struct cfdriver *cd;
const struct cfattach *ca;
cd = cf->cf_driver;
ca = cf->cf_attach;
if (ca->ca_devsize < sizeof(struct device))
panic("config_make_softc");
/* get memory for all device vars */
dev = malloc(ca->ca_devsize, M_DEVBUF, M_NOWAIT|M_ZERO);
if (dev == NULL)
panic("config_make_softc: allocation for device softc failed");
dev->dv_class = cd->cd_class;
dev->dv_cfdata = cf;
dev->dv_flags = DVF_ACTIVE; /* always initially active */
/* If this is a STAR device, search for a free unit number */
if (cf->cf_fstate == FSTATE_STAR) {
for (dev->dv_unit = cf->cf_starunit1;
dev->dv_unit < cf->cf_unit; dev->dv_unit++)
if (cd->cd_ndevs == 0 ||
dev->dv_unit >= cd->cd_ndevs ||
cd->cd_devs[dev->dv_unit] == NULL)
break;
} else
dev->dv_unit = cf->cf_unit;
/* Build the device name into dv_xname. */
if (snprintf(dev->dv_xname, sizeof(dev->dv_xname), "%s%d",
cd->cd_name, dev->dv_unit) >= sizeof(dev->dv_xname))
panic("config_make_softc: device name too long");
dev->dv_parent = parent;
/* put this device in the devices array */
if (dev->dv_unit >= cd->cd_ndevs) {
/*
* Need to expand the array.
*/
int old = cd->cd_ndevs, new;
void **nsp;
if (old == 0)
new = MINALLOCSIZE / sizeof(void *);
else
new = old * 2;
while (new <= dev->dv_unit)
new *= 2;
cd->cd_ndevs = new;
nsp = mallocarray(new, sizeof(void *), M_DEVBUF, M_NOWAIT|M_ZERO);
if (nsp == NULL)
panic("config_make_softc: %sing dev array",
old != 0 ? "expand" : "creat");
if (old != 0) {
bcopy(cd->cd_devs, nsp, old * sizeof(void *));
free(cd->cd_devs, M_DEVBUF, old * sizeof(void *));
}
cd->cd_devs = nsp;
}
if (cd->cd_devs[dev->dv_unit])
panic("config_make_softc: duplicate %s", dev->dv_xname);
dev->dv_ref = 1;
return (dev);
}
/*
* Detach a device. Optionally forced (e.g. because of hardware
* removal) and quiet. Returns zero if successful, non-zero
* (an error code) otherwise.
*
* Note that this code wants to be run from a process context, so
* that the detach can sleep to allow processes which have a device
* open to run and unwind their stacks.
*/
int
config_detach(struct device *dev, int flags)
{
struct cfdata *cf;
const struct cfattach *ca;
struct cfdriver *cd;
int rv = 0, i;
#ifdef DIAGNOSTIC
struct device *d;
#endif
#if NHOTPLUG > 0
char devname[16];
#endif
mtx_enter(&autoconf_attdet_mtx);
while (autoconf_attdet > 0)
msleep_nsec(&autoconf_attdet, &autoconf_attdet_mtx,
PWAIT, "autoconf", INFSLP);
autoconf_attdet--;
mtx_leave(&autoconf_attdet_mtx);
#if NHOTPLUG > 0
strlcpy(devname, dev->dv_xname, sizeof(devname));
#endif
cf = dev->dv_cfdata;
#ifdef DIAGNOSTIC
if (cf->cf_fstate != FSTATE_FOUND && cf->cf_fstate != FSTATE_STAR)
panic("config_detach: bad device fstate");
#endif
ca = cf->cf_attach;
cd = cf->cf_driver;
/*
* Ensure the device is deactivated. If the device has an
* activation entry point and DVF_ACTIVE is still set, the
* device is busy, and the detach fails.
*/
rv = config_deactivate(dev);
/*
* Try to detach the device. If that's not possible, then
* we either panic() (for the forced but failed case), or
* return an error.
*/
if (rv == 0) {
if (ca->ca_detach != NULL)
rv = (*ca->ca_detach)(dev, flags);
else
rv = EOPNOTSUPP;
}
if (rv != 0) {
if ((flags & DETACH_FORCE) == 0)
goto done;
else
panic("config_detach: forced detach of %s failed (%d)",
dev->dv_xname, rv);
}
/*
* The device has now been successfully detached.
*/
#ifdef DIAGNOSTIC
/*
* Sanity: If you're successfully detached, you should have no
* children. (Note that because children must be attached
* after parents, we only need to search the latter part of
* the list.)
*/
i = 0;
for (d = TAILQ_NEXT(dev, dv_list); d != NULL;
d = TAILQ_NEXT(d, dv_list)) {
if (d->dv_parent == dev) {
printf("config_detach: %s attached at %s\n",
d->dv_xname, dev->dv_xname);
i = 1;
}
}
if (i != 0)
panic("config_detach: detached device (%s) has children",
dev->dv_xname);
#endif
/*
* Mark cfdata to show that the unit can be reused, if possible.
* Note that we can only re-use a starred unit number if the unit
* being detached had the last assigned unit number.
*/
for (cf = cfdata; cf->cf_driver; cf++) {
if (cf->cf_driver == cd) {
if (cf->cf_fstate == FSTATE_FOUND &&
cf->cf_unit == dev->dv_unit)
cf->cf_fstate = FSTATE_NOTFOUND;
if (cf->cf_fstate == FSTATE_STAR &&
cf->cf_unit == dev->dv_unit + 1)
cf->cf_unit--;
}
}
/*
* Unlink from device list.
*/
TAILQ_REMOVE(&alldevs, dev, dv_list);
device_unref(dev);
/*
* Remove from cfdriver's array, tell the world, and free softc.
*/
cd->cd_devs[dev->dv_unit] = NULL;
if ((flags & DETACH_QUIET) == 0)
printf("%s detached\n", dev->dv_xname);
device_unref(dev);
/*
* If the device now has no units in use, deallocate its softc array.
*/
for (i = 0; i < cd->cd_ndevs; i++)
if (cd->cd_devs[i] != NULL)
break;
if (i == cd->cd_ndevs) { /* nothing found; deallocate */
free(cd->cd_devs, M_DEVBUF, cd->cd_ndevs * sizeof(void *));
cd->cd_devs = NULL;
cd->cd_ndevs = 0;
cf->cf_unit = 0;
}
#if NHOTPLUG > 0
if (!cold)
hotplug_device_detach(cd->cd_class, devname);
#endif
/*
* Return success.
*/
done:
mtx_enter(&autoconf_attdet_mtx);
if (++autoconf_attdet == 0)
wakeup(&autoconf_attdet);
mtx_leave(&autoconf_attdet_mtx);
return (rv);
}
int
config_deactivate(struct device *dev)
{
int rv = 0, oflags = dev->dv_flags;
if (dev->dv_flags & DVF_ACTIVE) {
dev->dv_flags &= ~DVF_ACTIVE;
rv = config_suspend(dev, DVACT_DEACTIVATE);
if (rv)
dev->dv_flags = oflags;
}
return (rv);
}
/*
* Defer the configuration of the specified device until all
* of its parent's devices have been attached.
*/
void
config_defer(struct device *dev, void (*func)(struct device *))
{
struct deferred_config *dc;
if (dev->dv_parent == NULL)
panic("config_defer: can't defer config of a root device");
#ifdef DIAGNOSTIC
for (dc = TAILQ_FIRST(&deferred_config_queue); dc != NULL;
dc = TAILQ_NEXT(dc, dc_queue)) {
if (dc->dc_dev == dev)
panic("config_defer: deferred twice");
}
#endif
if ((dc = malloc(sizeof(*dc), M_DEVBUF, M_NOWAIT)) == NULL)
panic("config_defer: can't allocate defer structure");
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&deferred_config_queue, dc, dc_queue);
config_pending_incr();
}
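/*
 * Illustrative only: the usual config_defer() pattern.  A hypothetical
 * foo(4) driver whose attach routine needs resources that only exist
 * once all of its parent's children are attached defers the rest of its
 * setup; foo_defer_attach() is run later from
 * config_process_deferred_children().
 */
#if 0
void
foo_defer_attach(struct device *self)
{
	/* finish the initialization that needed the siblings attached */
}

void
foo_attach(struct device *parent, struct device *self, void *aux)
{
	/* ... early setup ... */
	config_defer(self, foo_defer_attach);
}
#endif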
/*
* Defer the configuration of the specified device until after
* root file system is mounted.
*/
void
config_mountroot(struct device *dev, void (*func)(struct device *))
{
struct deferred_config *dc;
/*
* No need to defer if root file system is already mounted.
*/
if (rootvp != NULL) {
(*func)(dev);
return;
}
#ifdef DIAGNOSTIC
for (dc = TAILQ_FIRST(&mountroot_config_queue); dc != NULL;
dc = TAILQ_NEXT(dc, dc_queue)) {
if (dc->dc_dev == dev)
panic("config_mountroot: deferred twice");
}
#endif
if ((dc = malloc(sizeof(*dc), M_DEVBUF, M_NOWAIT)) == NULL)
panic("config_mountroot: can't allocate defer structure");
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&mountroot_config_queue, dc, dc_queue);
}
/*
* Process the deferred configuration queue for a device.
*/
void
config_process_deferred_children(struct device *parent)
{
struct deferred_config *dc, *ndc;
for (dc = TAILQ_FIRST(&deferred_config_queue);
dc != NULL; dc = ndc) {
ndc = TAILQ_NEXT(dc, dc_queue);
if (dc->dc_dev->dv_parent == parent) {
TAILQ_REMOVE(&deferred_config_queue, dc, dc_queue);
(*dc->dc_func)(dc->dc_dev);
free(dc, M_DEVBUF, sizeof(*dc));
config_pending_decr();
}
}
}
/*
* Process the deferred configuration queue after the root file
* system is mounted.
*/
void
config_process_deferred_mountroot(void)
{
struct deferred_config *dc;
while ((dc = TAILQ_FIRST(&mountroot_config_queue)) != NULL) {
TAILQ_REMOVE(&mountroot_config_queue, dc, dc_queue);
(*dc->dc_func)(dc->dc_dev);
free(dc, M_DEVBUF, sizeof(*dc));
}
}
/*
* Manipulate the config_pending semaphore.
*/
void
config_pending_incr(void)
{
config_pending++;
}
void
config_pending_decr(void)
{
#ifdef DIAGNOSTIC
if (config_pending == 0)
panic("config_pending_decr: config_pending == 0");
#endif
config_pending--;
if (config_pending == 0)
wakeup((void *)&config_pending);
}
int
config_detach_children(struct device *parent, int flags)
{
struct device *dev, *next_dev;
int rv = 0;
/*
* The config_detach routine may sleep, meaning devices
* may be added to the queue. However, all devices will
* be added to the tail of the queue, the queue won't
* be re-organized, and the subtree of parent here should be locked
* for purposes of adding/removing children.
*
* Note that we cannot simply walk the device list once: our
* ``next'' device might be a child of the device we are about
* to detach, and would then disappear.
* Just play it safe and restart from the parent.
*/
for (dev = TAILQ_LAST(&alldevs, devicelist);
dev != NULL; dev = next_dev) {
if (dev->dv_parent == parent) {
if ((rv = config_detach(dev, flags)) != 0)
return (rv);
next_dev = TAILQ_LAST(&alldevs, devicelist);
} else {
next_dev = TAILQ_PREV(dev, devicelist, dv_list);
}
}
return (0);
}
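/*
 * Illustrative only: a bus driver's detach routine would typically tear
 * down its children first and only then release its own resources; the
 * foo names are hypothetical.
 */
#if 0
int
foo_detach(struct device *self, int flags)
{
	int rv;

	if ((rv = config_detach_children(self, flags)) != 0)
		return (rv);
	/* ... free the driver's own resources ... */
	return (0);
}
#endif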
int
config_suspend(struct device *dev, int act)
{
const struct cfattach *ca = dev->dv_cfdata->cf_attach;
int r;
device_ref(dev);
if (ca->ca_activate)
r = (*ca->ca_activate)(dev, act);
else
r = config_activate_children(dev, act);
device_unref(dev);
return (r);
}
int
config_suspend_all(int act)
{
struct device *mainbus = device_mainbus();
struct device *mpath = device_mpath();
int rv = 0;
switch (act) {
case DVACT_QUIESCE:
case DVACT_SUSPEND:
case DVACT_POWERDOWN:
if (mpath) {
rv = config_suspend(mpath, act);
if (rv)
return rv;
}
if (mainbus)
rv = config_suspend(mainbus, act);
break;
case DVACT_RESUME:
case DVACT_WAKEUP:
if (mainbus) {
rv = config_suspend(mainbus, act);
if (rv)
return rv;
}
if (mpath)
rv = config_suspend(mpath, act);
break;
}
return (rv);
}
/*
* Call the ca_activate for each of our children, letting each
* decide whether they wish to do the same for their children
* and more.
*/
int
config_activate_children(struct device *parent, int act)
{
struct device *d;
int rv = 0;
for (d = TAILQ_NEXT(parent, dv_list); d != NULL;
d = TAILQ_NEXT(d, dv_list)) {
if (d->dv_parent != parent)
continue;
switch (act) {
case DVACT_QUIESCE:
case DVACT_SUSPEND:
case DVACT_RESUME:
case DVACT_WAKEUP:
case DVACT_POWERDOWN:
rv = config_suspend(d, act);
break;
case DVACT_DEACTIVATE:
rv = config_deactivate(d);
break;
}
if (rv == 0)
continue;
/*
* Found a device that refuses the action.
* If we were being asked to suspend, we can
* try to resume all previous devices.
*/
#ifdef DIAGNOSTIC
printf("config_activate_children: device %s failed %d\n",
d->dv_xname, act);
#endif
if (act == DVACT_RESUME)
printf("failing resume cannot be handled\n");
if (act == DVACT_POWERDOWN)
return (rv);
if (act != DVACT_SUSPEND)
return (rv);
d = TAILQ_PREV(d, devicelist, dv_list);
for (; d != NULL && d != parent;
d = TAILQ_PREV(d, devicelist, dv_list)) {
if (d->dv_parent != parent)
continue;
printf("resume %s\n", d->dv_xname);
config_suspend(d, DVACT_RESUME);
}
return (rv);
}
return (rv);
}
/*
* Lookup a device in the cfdriver device array. Does not return a
* device if it is not active.
*
* Increments ref count on the device by one, reflecting the
* new reference created on the stack.
*
* Context: process only
*/
struct device *
device_lookup(struct cfdriver *cd, int unit)
{
struct device *dv = NULL;
if (unit >= 0 && unit < cd->cd_ndevs)
dv = (struct device *)(cd->cd_devs[unit]);
if (!dv)
return (NULL);
if (!(dv->dv_flags & DVF_ACTIVE))
dv = NULL;
if (dv != NULL)
device_ref(dv);
return (dv);
}
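/*
 * Illustrative only: the common device_lookup()/device_unref() pattern
 * in a character device entry point.  foo_cd and struct foo_softc are
 * hypothetical; the important part is dropping the reference taken by
 * device_lookup() before returning.
 */
#if 0
extern struct cfdriver foo_cd;

struct foo_softc {
	struct device	sc_dev;		/* assumed to be the first member */
	/* ... driver state ... */
};

int
fooopen(dev_t dev, int flag, int mode, struct proc *p)
{
	struct foo_softc *sc;

	sc = (struct foo_softc *)device_lookup(&foo_cd, minor(dev));
	if (sc == NULL)
		return (ENXIO);
	/* ... use sc; device_lookup() took a reference on it ... */
	device_unref(&sc->sc_dev);
	return (0);
}
#endif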
struct device *
device_mainbus(void)
{
extern struct cfdriver mainbus_cd;
if (mainbus_cd.cd_ndevs < 1)
return (NULL);
return (mainbus_cd.cd_devs[0]);
}
struct device *
device_mpath(void)
{
#if NMPATH > 0
extern struct cfdriver mpath_cd;
if (mpath_cd.cd_ndevs < 1)
return (NULL);
return (mpath_cd.cd_devs[0]);
#else
return (NULL);
#endif
}
/*
* Increments the ref count on the device structure. The device
* structure is freed when the ref count hits 0.
*
* Context: process or interrupt
*/
void
device_ref(struct device *dv)
{
atomic_inc_int(&dv->dv_ref);
}
/*
* Decrement the ref count on the device structure.
*
* Frees the structure when the ref count hits zero.
*
* Context: process or interrupt
*/
void
device_unref(struct device *dv)
{
const struct cfattach *ca;
if (atomic_dec_int_nv(&dv->dv_ref) == 0) {
ca = dv->dv_cfdata->cf_attach;
free(dv, M_DEVBUF, ca->ca_devsize);
}
}
/* $OpenBSD: tty_endrun.c,v 1.8 2018/02/19 08:59:52 mpi Exp $ */
/*
* Copyright (c) 2008 Marc Balmer <mbalmer@openbsd.org>
* Copyright (c) 2009 Kevin Steves <stevesk@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* A tty line discipline to decode the EndRun Technologies native
* time-of-day message.
* http://www.endruntechnologies.com/
*/
/*
* EndRun Format:
*
* T YYYY DDD HH:MM:SS zZZ m<CR><LF>
*
* T is the Time Figure of Merit (TFOM) character (described below).
* This is the on-time character, transmitted during the first
* millisecond of each second.
*
* YYYY is the year
* DDD is the day-of-year
* : is the colon character (0x3A)
* HH is the hour of the day
* MM is the minute of the hour
* SS is the second of the minute
* z is the sign of the offset to UTC, + implies time is ahead of UTC.
* ZZ is the magnitude of the offset to UTC in units of half-hours.
* Non-zero only when the Timemode is Local.
* m is the Timemode character and is one of:
* G = GPS
* L = Local
* U = UTC
* <CR> is the ASCII carriage return character (0x0D)
* <LF> is the ASCII line feed character (0x0A)
*/
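/*
 * Worked example (illustrative): the sentence
 *
 * 6 2009 018 20:41:17 +00 U<CR><LF>
 *
 * carries TFOM '6' (time error < 100 us), year 2009, day-of-year 018
 * (January 18th), 20:41:17, a zero offset to UTC and Timemode UTC.
 * The discipline is normally attached to the serial port by
 * ldattach(8); a minimal userland sketch of doing it by hand, assuming
 * the ENDRUNDISC discipline number is visible to the program, would be:
 */
#if 0
#include <sys/ioctl.h>
#include <fcntl.h>

int
attach_endrun(const char *path)		/* e.g. "/dev/cua00" */
{
	int fd, ldisc = ENDRUNDISC;

	if ((fd = open(path, O_RDWR)) == -1)
		return (-1);
	if (ioctl(fd, TIOCSETD, &ldisc) == -1)
		return (-1);
	return (fd);
}
#endif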
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/sensors.h>
#include <sys/tty.h>
#include <sys/conf.h>
#include <sys/time.h>
#ifdef ENDRUN_DEBUG
#define DPRINTFN(n, x) do { if (endrundebug > (n)) printf x; } while (0)
int endrundebug = 0;
#else
#define DPRINTFN(n, x)
#endif
#define DPRINTF(x) DPRINTFN(0, x)
void endrunattach(int);
#define ENDRUNLEN 27 /* strlen("6 2009 018 20:41:17 +00 U\r\n") */
#define NUMFLDS 6
#ifdef ENDRUN_DEBUG
#define TRUSTTIME 30
#else
#define TRUSTTIME (10 * 60) /* 10 minutes */
#endif
int endrun_count, endrun_nxid;
struct endrun {
char cbuf[ENDRUNLEN]; /* receive buffer */
struct ksensor time; /* the timedelta sensor */
struct ksensor signal; /* signal status */
struct ksensordev timedev;
struct timespec ts; /* current timestamp */
struct timespec lts; /* timestamp of last TFOM */
struct timeout endrun_tout; /* invalidate sensor */
int64_t gap; /* gap between two sentences */
int64_t last; /* last time rcvd */
#define SYNC_SCAN 1 /* scanning for '\n' */
#define SYNC_EOL 2 /* '\n' seen, next char TFOM */
int sync;
int pos; /* position in rcv buffer */
int no_pps; /* no PPS although requested */
#ifdef ENDRUN_DEBUG
char tfom;
#endif
};
/* EndRun decoding */
void endrun_scan(struct endrun *, struct tty *);
void endrun_decode(struct endrun *, struct tty *, char *fld[], int fldcnt);
/* date and time conversion */
int endrun_atoi(char *s, int len);
int endrun_date_to_nano(char *s1, char *s2, int64_t *nano);
int endrun_time_to_nano(char *s, int64_t *nano);
int endrun_offset_to_nano(char *s, int64_t *nano);
/* degrade the timedelta sensor */
void endrun_timeout(void *);
void
endrunattach(int dummy)
{
}
int
endrunopen(dev_t dev, struct tty *tp, struct proc *p)
{
struct endrun *np;
int error;
DPRINTF(("endrunopen\n"));
if (tp->t_line == ENDRUNDISC)
return ENODEV;
if ((error = suser(p)) != 0)
return error;
np = malloc(sizeof(struct endrun), M_DEVBUF, M_WAITOK|M_ZERO);
snprintf(np->timedev.xname, sizeof(np->timedev.xname), "endrun%d",
endrun_nxid++);
endrun_count++;
np->time.status = SENSOR_S_UNKNOWN;
np->time.type = SENSOR_TIMEDELTA;
#ifndef ENDRUN_DEBUG
np->time.flags = SENSOR_FINVALID;
#endif
sensor_attach(&np->timedev, &np->time);
np->signal.type = SENSOR_PERCENT;
np->signal.status = SENSOR_S_UNKNOWN;
np->signal.value = 100000LL;
strlcpy(np->signal.desc, "Signal", sizeof(np->signal.desc));
sensor_attach(&np->timedev, &np->signal);
np->sync = SYNC_SCAN;
#ifdef ENDRUN_DEBUG
np->tfom = '0';
#endif
tp->t_sc = (caddr_t)np;
error = linesw[TTYDISC].l_open(dev, tp, p);
if (error) {
free(np, M_DEVBUF, sizeof(*np));
tp->t_sc = NULL;
} else {
sensordev_install(&np->timedev);
timeout_set(&np->endrun_tout, endrun_timeout, np);
}
return error;
}
int
endrunclose(struct tty *tp, int flags, struct proc *p)
{
struct endrun *np = (struct endrun *)tp->t_sc;
DPRINTF(("endrunclose\n"));
tp->t_line = TTYDISC; /* switch back to termios */
timeout_del(&np->endrun_tout);
sensordev_deinstall(&np->timedev);
free(np, M_DEVBUF, sizeof(*np));
tp->t_sc = NULL;
endrun_count--;
if (endrun_count == 0)
endrun_nxid = 0;
return linesw[TTYDISC].l_close(tp, flags, p);
}
/* collect EndRun sentence from tty */
int
endruninput(int c, struct tty *tp)
{
struct endrun *np = (struct endrun *)tp->t_sc;
struct timespec ts;
int64_t gap;
long tmin, tmax;
if (np->sync == SYNC_EOL) {
nanotime(&ts);
np->pos = 0;
np->sync = SYNC_SCAN;
np->cbuf[np->pos++] = c; /* TFOM char */
gap = (ts.tv_sec * 1000000000LL + ts.tv_nsec) -
(np->lts.tv_sec * 1000000000LL + np->lts.tv_nsec);
np->lts.tv_sec = ts.tv_sec;
np->lts.tv_nsec = ts.tv_nsec;
if (gap <= np->gap)
goto nogap;
np->ts.tv_sec = ts.tv_sec;
np->ts.tv_nsec = ts.tv_nsec;
np->gap = gap;
/*
* If a tty timestamp is available, make sure its value is
* reasonable by comparing against the timestamp just taken.
* If they differ by more than 2 seconds, assume no PPS signal
* is present, note the fact, and keep using the timestamp
* value. When this happens, the sensor state is set to
* CRITICAL later when the EndRun sentence is decoded.
*/
if (tp->t_flags & (TS_TSTAMPDCDSET | TS_TSTAMPDCDCLR |
TS_TSTAMPCTSSET | TS_TSTAMPCTSCLR)) {
tmax = lmax(np->ts.tv_sec, tp->t_tv.tv_sec);
tmin = lmin(np->ts.tv_sec, tp->t_tv.tv_sec);
if (tmax - tmin > 1)
np->no_pps = 1;
else {
np->ts.tv_sec = tp->t_tv.tv_sec;
np->ts.tv_nsec = tp->t_tv.tv_usec *
1000L;
np->no_pps = 0;
}
}
} else if (c == '\n') {
if (np->pos == ENDRUNLEN - 1) {
/* don't copy '\n' into cbuf */
np->cbuf[np->pos] = '\0';
endrun_scan(np, tp);
}
np->sync = SYNC_EOL;
} else {
if (np->pos < ENDRUNLEN - 1)
np->cbuf[np->pos++] = c;
}
nogap:
/* pass data to termios */
return linesw[TTYDISC].l_rint(c, tp);
}
/* Scan the EndRun sentence just received */
void
endrun_scan(struct endrun *np, struct tty *tp)
{
int fldcnt = 0, n;
char *fld[NUMFLDS], *cs;
DPRINTFN(1, ("%s\n", np->cbuf));
/* split into fields */
fld[fldcnt++] = &np->cbuf[0];
for (cs = NULL, n = 0; n < np->pos && cs == NULL; n++) {
switch (np->cbuf[n]) {
case '\r':
np->cbuf[n] = '\0';
cs = &np->cbuf[n + 1];
break;
case ' ':
if (fldcnt < NUMFLDS) {
np->cbuf[n] = '\0';
fld[fldcnt++] = &np->cbuf[n + 1];
} else {
DPRINTF(("endrun: nr of fields in sentence "
"exceeds expected: %d\n", NUMFLDS));
return;
}
break;
}
}
endrun_decode(np, tp, fld, fldcnt);
}
/* Decode the time string */
void
endrun_decode(struct endrun *np, struct tty *tp, char *fld[], int fldcnt)
{
int64_t date_nano, time_nano, offset_nano, endrun_now;
char tfom;
int jumped = 0;
if (fldcnt != NUMFLDS) {
DPRINTF(("endrun: field count mismatch, %d\n", fldcnt));
return;
}
if (endrun_time_to_nano(fld[3], &time_nano) == -1) {
DPRINTF(("endrun: illegal time, %s\n", fld[3]));
return;
}
if (endrun_date_to_nano(fld[1], fld[2], &date_nano) == -1) {
DPRINTF(("endrun: illegal date, %s %s\n", fld[1], fld[2]));
return;
}
offset_nano = 0;
/* only parse offset when timemode is local */
if (fld[5][0] == 'L' &&
endrun_offset_to_nano(fld[4], &offset_nano) == -1) {
DPRINTF(("endrun: illegal offset, %s\n", fld[4]));
return;
}
endrun_now = date_nano + time_nano + offset_nano;
if (endrun_now <= np->last) {
DPRINTF(("endrun: time not monotonically increasing "
"last %lld now %lld\n",
(long long)np->last, (long long)endrun_now));
jumped = 1;
}
np->last = endrun_now;
np->gap = 0LL;
#ifdef ENDRUN_DEBUG
if (np->time.status == SENSOR_S_UNKNOWN) {
np->time.status = SENSOR_S_OK;
timeout_add_sec(&np->endrun_tout, TRUSTTIME);
}
#endif
np->time.value = np->ts.tv_sec * 1000000000LL +
np->ts.tv_nsec - endrun_now;
np->time.tv.tv_sec = np->ts.tv_sec;
np->time.tv.tv_usec = np->ts.tv_nsec / 1000L;
if (np->time.status == SENSOR_S_UNKNOWN) {
np->time.status = SENSOR_S_OK;
np->time.flags &= ~SENSOR_FINVALID;
strlcpy(np->time.desc, "EndRun", sizeof(np->time.desc));
}
/*
* Only update the timeout if the clock reports the time as valid.
*
* Time Figure Of Merit (TFOM) values:
*
* 6 - time error is < 100 us
* 7 - time error is < 1 ms
* 8 - time error is < 10 ms
* 9 - time error is > 10 ms,
* unsynchronized state if never locked to CDMA
*/
switch (tfom = fld[0][0]) {
case '6':
case '7':
case '8':
np->time.status = SENSOR_S_OK;
np->signal.status = SENSOR_S_OK;
break;
case '9':
np->signal.status = SENSOR_S_WARN;
break;
default:
DPRINTF(("endrun: invalid TFOM: '%c'\n", tfom));
np->signal.status = SENSOR_S_CRIT;
break;
}
#ifdef ENDRUN_DEBUG
if (np->tfom != tfom) {
DPRINTF(("endrun: TFOM changed from %c to %c\n",
np->tfom, tfom));
np->tfom = tfom;
}
#endif
if (jumped)
np->time.status = SENSOR_S_WARN;
if (np->time.status == SENSOR_S_OK)
timeout_add_sec(&np->endrun_tout, TRUSTTIME);
/*
* If tty timestamping is requested, but no PPS signal is present, set
* the sensor state to CRITICAL.
*/
if (np->no_pps)
np->time.status = SENSOR_S_CRIT;
}
int
endrun_atoi(char *s, int len)
{
int n;
char *p;
/* make sure the input contains only numbers */
for (n = 0, p = s; n < len && *p && *p >= '0' && *p <= '9'; n++, p++)
;
if (n != len || *p != '\0')
return -1;
for (n = 0; *s; s++)
n = n * 10 + *s - '0';
return n;
}
/*
* Convert date fields from EndRun to nanoseconds since the epoch.
* The year string must be of the form YYYY .
* The day of year string must be of the form DDD .
* Return 0 on success, -1 if illegal characters are encountered.
*/
int
endrun_date_to_nano(char *y, char *doy, int64_t *nano)
{
struct clock_ymdhms clock;
time_t secs;
int n, i;
int year_days = 365;
int month_days[] = {
0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
};
#define FEBRUARY 2
#define LEAPYEAR(x) \
((x) % 4 == 0 && \
(x) % 100 != 0) || \
(x) % 400 == 0
if ((n = endrun_atoi(y, 4)) == -1)
return -1;
clock.dt_year = n;
if (LEAPYEAR(n)) {
month_days[FEBRUARY]++;
year_days++;
}
if ((n = endrun_atoi(doy, 3)) == -1 || n == 0 || n > year_days)
return -1;
/* convert day of year to month, day */
for (i = 1; n > month_days[i]; i++) {
n -= month_days[i];
}
clock.dt_mon = i;
clock.dt_day = n;
DPRINTFN(1, ("mm/dd %d/%d\n", i, n));
clock.dt_hour = clock.dt_min = clock.dt_sec = 0;
secs = clock_ymdhms_to_secs(&clock);
*nano = secs * 1000000000LL;
return 0;
}
/*
* Convert time field from EndRun to nanoseconds since midnight.
* The string must be of the form HH:MM:SS .
* Return 0 on success, -1 if illegal characters are encountered.
*/
int
endrun_time_to_nano(char *s, int64_t *nano)
{
struct clock_ymdhms clock;
time_t secs;
int n;
if (s[2] != ':' || s[5] != ':')
return -1;
s[2] = '\0';
s[5] = '\0';
if ((n = endrun_atoi(&s[0], 2)) == -1 || n > 23)
return -1;
clock.dt_hour = n;
if ((n = endrun_atoi(&s[3], 2)) == -1 || n > 59)
return -1;
clock.dt_min = n;
if ((n = endrun_atoi(&s[6], 2)) == -1 || n > 60)
return -1;
clock.dt_sec = n;
DPRINTFN(1, ("hh:mm:ss %d:%d:%d\n", (int)clock.dt_hour,
(int)clock.dt_min,
(int)clock.dt_sec));
secs = clock.dt_hour * 3600
+ clock.dt_min * 60
+ clock.dt_sec;
DPRINTFN(1, ("secs %lu\n", (unsigned long)secs));
*nano = secs * 1000000000LL;
return 0;
}
int
endrun_offset_to_nano(char *s, int64_t *nano)
{
time_t secs;
int n;
if (!(s[0] == '+' || s[0] == '-'))
return -1;
if ((n = endrun_atoi(&s[1], 2)) == -1)
return -1;
secs = n * 30 * 60;
*nano = secs * 1000000000LL;
if (s[0] == '+')
*nano = -*nano;
DPRINTFN(1, ("offset secs %lu nanosecs %lld\n",
(unsigned long)secs, (long long)*nano));
return 0;
}
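/*
 * Worked example (illustrative): with Timemode `L' and an offset field
 * of "+02" (local time two half-hours, i.e. one hour, ahead of UTC),
 * endrun_atoi() yields 2, secs becomes 2 * 30 * 60 = 3600 and, because
 * of the leading '+', *nano is stored as -3600 * 10^9.  Adding this
 * offset to the local date and time in endrun_decode() therefore
 * converts the timestamp back to UTC.
 */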
/*
* Degrade the sensor state if we received no EndRun string for more than
* TRUSTTIME seconds.
*/
void
endrun_timeout(void *xnp)
{
struct endrun *np = xnp;
if (np->time.status == SENSOR_S_OK) {
np->time.status = SENSOR_S_WARN;
/*
* further degrade in TRUSTTIME seconds if no new valid EndRun
* strings are received.
*/
timeout_add_sec(&np->endrun_tout, TRUSTTIME);
} else
np->time.status = SENSOR_S_CRIT;
}
/* $OpenBSD: uvm_km.c,v 1.151 2022/08/01 14:15:46 mpi Exp $ */
/* $NetBSD: uvm_km.c,v 1.42 2001/01/14 02:10:01 thorpej Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_kern.c 8.3 (Berkeley) 1/12/94
* from: Id: uvm_km.c,v 1.1.2.14 1998/02/06 05:19:27 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_km.c: handle kernel memory allocation and management
*/
/*
* overview of kernel memory management:
*
* the kernel virtual address space is mapped by "kernel_map." kernel_map
* starts at a machine-dependent address and is VM_KERNEL_SPACE_SIZE bytes
* large.
*
* the kernel_map has several "submaps." submaps can only appear in
* the kernel_map (user processes can't use them). submaps "take over"
* the management of a sub-range of the kernel's address space. submaps
* are typically allocated at boot time and are never released. kernel
* virtual address space that is mapped by a submap is locked by the
* submap's lock -- not the kernel_map's lock.
*
* thus, the useful feature of submaps is that they allow us to break
* up the locking and protection of the kernel address space into smaller
* chunks.
*
* The VM system has several standard kernel submaps:
* kmem_map: Contains only wired kernel memory for malloc(9).
* Note: All access to this map must be protected by splvm as
* calls to malloc(9) are allowed in interrupt handlers.
* exec_map: Memory to hold arguments to system calls is allocated from
* this map.
* XXX: This is primarily used to artificially limit the number
* of concurrent processes doing an exec.
* phys_map: Buffers for vmapbuf (physio) are allocated from this map.
*
* the kernel allocates its private memory out of special uvm_objects whose
* reference count is set to UVM_OBJ_KERN (thus indicating that the objects
* are "special" and never die). all kernel objects should be thought of
* as large, fixed-sized, sparsely populated uvm_objects. each kernel
* object is equal to the size of kernel virtual address space (i.e.
* VM_KERNEL_SPACE_SIZE).
*
* most kernel private memory lives in kernel_object. the only exception
* to this is for memory that belongs to submaps that must be protected
* by splvm(). each of these submaps manages their own pages.
*
* note that just because a kernel object spans the entire kernel virtual
* address space doesn't mean that it has to be mapped into the entire space.
* large chunks of a kernel object's space go unused either because
* that area of kernel VM is unmapped, or there is some other type of
* object mapped into that range (e.g. a vnode). for submap's kernel
* objects, the only part of the object that can ever be populated is the
* offsets that are managed by the submap.
*
* note that the "offset" in a kernel object is always the kernel virtual
* address minus the vm_map_min(kernel_map).
* example:
* suppose kernel_map starts at 0xf8000000 and the kernel does a
* uvm_km_alloc(kernel_map, PAGE_SIZE) [allocate 1 wired down page in the
* kernel map]. if uvm_km_alloc returns virtual address 0xf8235000,
* then that means that the page at offset 0x235000 in kernel_object is
* mapped at 0xf8235000.
*
* kernel objects have one other special property: when the kernel virtual
* memory mapping them is unmapped, the backing memory in the object is
* freed right away. this is done with the uvm_km_pgremove() function.
* this has to be done because there is no backing store for kernel pages
* and no need to save them after they are no longer referenced.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <uvm/uvm.h>
/*
* global data structures
*/
struct vm_map *kernel_map = NULL;
/* Unconstraint range. */
struct uvm_constraint_range no_constraint = { 0x0, (paddr_t)-1 };
/*
* local data structures
*/
static struct vm_map kernel_map_store;
/*
* uvm_km_init: init kernel maps and objects to reflect reality (i.e.
* KVM already allocated for text, data, bss, and static data structures).
*
* => KVM is defined by [base.. base + VM_KERNEL_SPACE_SIZE].
* we assume that [base -> start] has already been allocated and that
* "end" is the end of the kernel image span.
*/
void
uvm_km_init(vaddr_t base, vaddr_t start, vaddr_t end)
{
/* kernel_object: for pageable anonymous kernel memory */
uao_init();
uvm.kernel_object = uao_create(VM_KERNEL_SPACE_SIZE, UAO_FLAG_KERNOBJ);
/*
* init the map and reserve already allocated kernel space
* before installing.
*/
uvm_map_setup(&kernel_map_store, pmap_kernel(), base, end,
#ifdef KVA_GUARDPAGES
VM_MAP_PAGEABLE | VM_MAP_GUARDPAGES
#else
VM_MAP_PAGEABLE
#endif
);
if (base != start && uvm_map(&kernel_map_store, &base, start - base,
NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
MAP_INHERIT_NONE, MADV_RANDOM, UVM_FLAG_FIXED)) != 0)
panic("uvm_km_init: could not reserve space for kernel");
kernel_map = &kernel_map_store;
}
/*
* uvm_km_suballoc: allocate a submap in the kernel map. once a submap
* is allocated all references to that area of VM must go through it. this
* allows the locking of VAs in kernel_map to be broken up into regions.
*
* => if `fixed' is true, *min specifies where the region described
* by the submap must start
* => if submap is non NULL we use that as the submap, otherwise we
* alloc a new map
*/
struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *min, vaddr_t *max, vsize_t size,
int flags, boolean_t fixed, struct vm_map *submap)
{
int mapflags = UVM_FLAG_NOMERGE | (fixed ? UVM_FLAG_FIXED : 0);
size = round_page(size); /* round up to pagesize */
/* first allocate a blank spot in the parent map */
if (uvm_map(map, min, size, NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
MAP_INHERIT_NONE, MADV_RANDOM, mapflags)) != 0) {
panic("uvm_km_suballoc: unable to allocate space in parent map");
}
/* set VM bounds (min is filled in by uvm_map) */
*max = *min + size;
/* add references to pmap and create or init the submap */
pmap_reference(vm_map_pmap(map));
if (submap == NULL) {
submap = uvm_map_create(vm_map_pmap(map), *min, *max, flags);
if (submap == NULL)
panic("uvm_km_suballoc: unable to create submap");
} else {
uvm_map_setup(submap, vm_map_pmap(map), *min, *max, flags);
}
/*
* now let uvm_map_submap plug in it...
*/
if (uvm_map_submap(map, *min, *max, submap) != 0)
panic("uvm_km_suballoc: submap allocation failed");
return(submap);
}
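/*
 * Illustrative only: roughly how a boot-time submap is carved out of
 * kernel_map.  The foo_map names, the size and the VM_MAP_INTRSAFE
 * flag are assumptions for this sketch; real callers (for instance the
 * code that sets up kmem_map or exec_map) supply their own statically
 * allocated map, size and flags.
 */
#if 0
static struct vm_map foo_map_store;
struct vm_map *foo_map;

void
foo_map_init(void)
{
	vaddr_t foo_min, foo_max;

	foo_map = uvm_km_suballoc(kernel_map, &foo_min, &foo_max,
	    16 * PAGE_SIZE, VM_MAP_INTRSAFE, FALSE, &foo_map_store);
}
#endif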
/*
* uvm_km_pgremove: remove pages from a kernel uvm_object.
*
* => when you unmap a part of anonymous kernel memory you want to toss
* the pages right away. (this gets called from uvm_unmap_...).
*/
void
uvm_km_pgremove(struct uvm_object *uobj, vaddr_t startva, vaddr_t endva)
{
const voff_t start = startva - vm_map_min(kernel_map);
const voff_t end = endva - vm_map_min(kernel_map);
struct vm_page *pp;
voff_t curoff;
int slot;
int swpgonlydelta = 0;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
pmap_remove(pmap_kernel(), startva, endva);
for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
pp = uvm_pagelookup(uobj, curoff);
if (pp && pp->pg_flags & PG_BUSY) {
uvm_pagewait(pp, uobj->vmobjlock, "km_pgrm");
rw_enter(uobj->vmobjlock, RW_WRITE);
curoff -= PAGE_SIZE; /* loop back to us */
continue;
}
/* free the swap slot, then the page */
slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT);
if (pp != NULL) {
uvm_lock_pageq();
uvm_pagefree(pp);
uvm_unlock_pageq();
} else if (slot != 0) {
swpgonlydelta++;
}
}
if (swpgonlydelta > 0) {
KASSERT(uvmexp.swpgonly >= swpgonlydelta);
atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
}
/*
* uvm_km_pgremove_intrsafe: like uvm_km_pgremove(), but for "intrsafe"
* objects
*
* => when you unmap a part of anonymous kernel memory you want to toss
* the pages right away. (this gets called from uvm_unmap_...).
* => none of the pages will ever be busy, and none of them will ever
* be on the active or inactive queues (because these objects are
* never allowed to "page").
*/
void
uvm_km_pgremove_intrsafe(vaddr_t start, vaddr_t end)
{
struct vm_page *pg;
vaddr_t va;
paddr_t pa;
for (va = start; va < end; va += PAGE_SIZE) {
if (!pmap_extract(pmap_kernel(), va, &pa))
continue;
pg = PHYS_TO_VM_PAGE(pa);
if (pg == NULL)
panic("uvm_km_pgremove_intrsafe: no page"); uvm_pagefree(pg);
}
pmap_kremove(start, end - start);
}
/*
* uvm_km_kmemalloc: lower level kernel memory allocator for malloc()
*
* => we map wired memory into the specified map using the obj passed in
* => NOTE: we can return NULL even if we can wait if there is not enough
* free VM space in the map... caller should be prepared to handle
* this case.
* => we return KVA of memory allocated
* => flags: NOWAIT, VALLOC - just allocate VA, TRYLOCK - fail if we can't
* lock the map
* => low, high, alignment, boundary, nsegs are the corresponding parameters
* to uvm_pglistalloc
* => flags: ZERO - correspond to uvm_pglistalloc flags
*/
vaddr_t
uvm_km_kmemalloc_pla(struct vm_map *map, struct uvm_object *obj, vsize_t size,
vsize_t valign, int flags, paddr_t low, paddr_t high, paddr_t alignment,
paddr_t boundary, int nsegs)
{
vaddr_t kva, loopva;
voff_t offset;
struct vm_page *pg;
struct pglist pgl;
int pla_flags;
KASSERT(vm_map_pmap(map) == pmap_kernel());
/* UVM_KMF_VALLOC => !UVM_KMF_ZERO */
KASSERT(!(flags & UVM_KMF_VALLOC) ||
!(flags & UVM_KMF_ZERO));
/* setup for call */
size = round_page(size);
kva = vm_map_min(map); /* hint */
if (nsegs == 0)
nsegs = atop(size);
/* allocate some virtual space */
if (__predict_false(uvm_map(map, &kva, size, obj, UVM_UNKNOWN_OFFSET,
valign, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
MAP_INHERIT_NONE, MADV_RANDOM, (flags & UVM_KMF_TRYLOCK))) != 0)) {
return 0;
}
/* if all we wanted was VA, return now */
if (flags & UVM_KMF_VALLOC) {
return kva;
}
/* recover object offset from virtual address */
if (obj != NULL)
offset = kva - vm_map_min(kernel_map);
else
offset = 0;
/*
* now allocate and map in the memory... note that we are the only ones
* who should ever get a handle on this area of VM.
*/
TAILQ_INIT(&pgl);
pla_flags = 0;
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
if ((flags & UVM_KMF_NOWAIT) || ((flags & UVM_KMF_CANFAIL) &&
uvmexp.swpages - uvmexp.swpgonly <= atop(size)))
pla_flags |= UVM_PLA_NOWAIT;
else
pla_flags |= UVM_PLA_WAITOK;
if (flags & UVM_KMF_ZERO)
pla_flags |= UVM_PLA_ZERO;
if (uvm_pglistalloc(size, low, high, alignment, boundary, &pgl, nsegs,
pla_flags) != 0) {
/* Failed. */
uvm_unmap(map, kva, kva + size);
return (0);
}
if (obj != NULL)
rw_enter(obj->vmobjlock, RW_WRITE);
loopva = kva;
while (loopva != kva + size) {
pg = TAILQ_FIRST(&pgl);
TAILQ_REMOVE(&pgl, pg, pageq);
uvm_pagealloc_pg(pg, obj, offset, NULL);
atomic_clearbits_int(&pg->pg_flags, PG_BUSY);
UVM_PAGE_OWN(pg, NULL);
/*
* map it in: note that we call pmap_enter with the map and
* object unlocked in case we are kmem_map.
*/
if (obj == NULL) {
pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
PROT_READ | PROT_WRITE);
} else {
pmap_enter(map->pmap, loopva, VM_PAGE_TO_PHYS(pg),
PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE | PMAP_WIRED);
}
loopva += PAGE_SIZE;
offset += PAGE_SIZE;
}
KASSERT(TAILQ_EMPTY(&pgl));
pmap_update(pmap_kernel());
if (obj != NULL)
rw_exit(obj->vmobjlock);
return kva;
}
/*
* uvm_km_free: free an area of kernel memory
*/
void
uvm_km_free(struct vm_map *map, vaddr_t addr, vsize_t size)
{
uvm_unmap(map, trunc_page(addr), round_page(addr+size));
}
/*
* uvm_km_alloc1: allocate wired down memory in the kernel map.
*
* => we can sleep if needed
*/
vaddr_t
uvm_km_alloc1(struct vm_map *map, vsize_t size, vsize_t align, boolean_t zeroit)
{
vaddr_t kva, loopva;
voff_t offset;
struct vm_page *pg;
KASSERT(vm_map_pmap(map) == pmap_kernel());
size = round_page(size);
kva = vm_map_min(map); /* hint */
/* allocate some virtual space */
if (__predict_false(uvm_map(map, &kva, size, uvm.kernel_object,
UVM_UNKNOWN_OFFSET, align,
UVM_MAPFLAG(PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_INHERIT_NONE, MADV_RANDOM, 0)) != 0)) {
return 0;
}
/* recover object offset from virtual address */
offset = kva - vm_map_min(kernel_map);
/* now allocate the memory. we must be careful about released pages. */
loopva = kva;
while (size) {
rw_enter(uvm.kernel_object->vmobjlock, RW_WRITE);
/* allocate ram */
pg = uvm_pagealloc(uvm.kernel_object, offset, NULL, 0);
if (pg) {
atomic_clearbits_int(&pg->pg_flags, PG_BUSY);
UVM_PAGE_OWN(pg, NULL);
}
rw_exit(uvm.kernel_object->vmobjlock);
if (__predict_false(pg == NULL)) {
if (curproc == uvm.pagedaemon_proc) {
/*
* It is unfeasible for the page daemon to
* sleep for memory, so free what we have
* allocated and fail.
*/
uvm_unmap(map, kva, loopva - kva);
return (0);
} else {
uvm_wait("km_alloc1w"); /* wait for memory */
continue;
}
}
/*
* map it in; note we're never called with an intrsafe
* object, so we always use regular old pmap_enter().
*/
pmap_enter(map->pmap, loopva, VM_PAGE_TO_PHYS(pg),
PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE | PMAP_WIRED);
loopva += PAGE_SIZE;
offset += PAGE_SIZE;
size -= PAGE_SIZE;
}
pmap_update(map->pmap);
/*
* zero on request (note that "size" is now zero due to the above loop
* so we need to subtract kva from loopva to reconstruct the size).
*/
if (zeroit)
memset((caddr_t)kva, 0, loopva - kva);
return kva;
}
#if defined(__HAVE_PMAP_DIRECT)
/*
* uvm_km_page allocator, __HAVE_PMAP_DIRECT arch
* On architectures with machine memory direct mapped into a portion
* of KVM, we have very little work to do. Just get a physical page,
* and find and return its VA.
*/
void
uvm_km_page_init(void)
{
/* nothing */
}
void
uvm_km_page_lateinit(void)
{
/* nothing */
}
#else
/*
* uvm_km_page allocator, non __HAVE_PMAP_DIRECT archs
* This is a special allocator that uses a reserve of free pages
* to fulfill requests. It is fast and interrupt safe, but can only
* return page sized regions. Its primary use is as a backend for pool.
*
* The memory returned is allocated from the larger kernel_map, sparing
* pressure on the small interrupt-safe kmem_map. It is wired, but
* not zero filled.
*/
struct uvm_km_pages uvm_km_pages;
void uvm_km_createthread(void *);
void uvm_km_thread(void *);
struct uvm_km_free_page *uvm_km_doputpage(struct uvm_km_free_page *);
/*
* Allocate the initial reserve, and create the thread which will
* keep the reserve full. For bootstrapping, we allocate more than
* the lowat amount, because it may be a while before the thread is
* running.
*/
void
uvm_km_page_init(void)
{
int lowat_min;
int i;
int len, bulk;
vaddr_t addr;
mtx_init(&uvm_km_pages.mtx, IPL_VM);
if (!uvm_km_pages.lowat) {
/* based on physmem, calculate a good value here */
uvm_km_pages.lowat = physmem / 256;
lowat_min = physmem < atop(16 * 1024 * 1024) ? 32 : 128;
if (uvm_km_pages.lowat < lowat_min)
uvm_km_pages.lowat = lowat_min;
}
if (uvm_km_pages.lowat > UVM_KM_PAGES_LOWAT_MAX)
uvm_km_pages.lowat = UVM_KM_PAGES_LOWAT_MAX;
uvm_km_pages.hiwat = 4 * uvm_km_pages.lowat;
if (uvm_km_pages.hiwat > UVM_KM_PAGES_HIWAT_MAX)
uvm_km_pages.hiwat = UVM_KM_PAGES_HIWAT_MAX;
/* Allocate all pages in as few allocations as possible. */
len = 0;
bulk = uvm_km_pages.hiwat;
while (len < uvm_km_pages.hiwat && bulk > 0) {
bulk = MIN(bulk, uvm_km_pages.hiwat - len);
addr = vm_map_min(kernel_map);
if (uvm_map(kernel_map, &addr, (vsize_t)bulk << PAGE_SHIFT,
NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE, MAP_INHERIT_NONE,
MADV_RANDOM, UVM_KMF_TRYLOCK)) != 0) {
bulk /= 2;
continue;
}
for (i = len; i < len + bulk; i++, addr += PAGE_SIZE)
uvm_km_pages.page[i] = addr;
len += bulk;
}
uvm_km_pages.free = len;
for (i = len; i < UVM_KM_PAGES_HIWAT_MAX; i++)
uvm_km_pages.page[i] = 0;
/* tone down if really high */
if (uvm_km_pages.lowat > 512)
uvm_km_pages.lowat = 512;
}
void
uvm_km_page_lateinit(void)
{
kthread_create_deferred(uvm_km_createthread, NULL);
}
void
uvm_km_createthread(void *arg)
{
kthread_create(uvm_km_thread, NULL, &uvm_km_pages.km_proc, "kmthread");
}
/*
* Endless loop. We grab pages in increments of 16 pages, then
* quickly swap them into the list.
*/
void
uvm_km_thread(void *arg)
{
vaddr_t pg[16];
int i;
int allocmore = 0;
int flags;
struct uvm_km_free_page *fp = NULL;
KERNEL_UNLOCK();
for (;;) {
mtx_enter(&uvm_km_pages.mtx);
if (uvm_km_pages.free >= uvm_km_pages.lowat &&
uvm_km_pages.freelist == NULL) {
msleep_nsec(&uvm_km_pages.km_proc, &uvm_km_pages.mtx,
PVM, "kmalloc", INFSLP);
}
allocmore = uvm_km_pages.free < uvm_km_pages.lowat;
fp = uvm_km_pages.freelist;
uvm_km_pages.freelist = NULL;
uvm_km_pages.freelistlen = 0;
mtx_leave(&uvm_km_pages.mtx);
if (allocmore) {
/*
* If there was nothing on the freelist, then we
* must obtain at least one page to make progress.
* So, only use UVM_KMF_TRYLOCK for the first page
* if fp != NULL
*/
flags = UVM_MAPFLAG(PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE, MAP_INHERIT_NONE,
MADV_RANDOM, fp != NULL ? UVM_KMF_TRYLOCK : 0);
memset(pg, 0, sizeof(pg));
for (i = 0; i < nitems(pg); i++) {
pg[i] = vm_map_min(kernel_map);
if (uvm_map(kernel_map, &pg[i], PAGE_SIZE,
NULL, UVM_UNKNOWN_OFFSET, 0, flags) != 0) {
pg[i] = 0;
break;
}
/* made progress, so don't sleep for more */
flags = UVM_MAPFLAG(PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE, MAP_INHERIT_NONE,
MADV_RANDOM, UVM_KMF_TRYLOCK);
}
mtx_enter(&uvm_km_pages.mtx);
for (i = 0; i < nitems(pg); i++) {
if (uvm_km_pages.free ==
nitems(uvm_km_pages.page))
break;
else if (pg[i] != 0)
uvm_km_pages.page[uvm_km_pages.free++]
= pg[i];
}
wakeup(&uvm_km_pages.free);
mtx_leave(&uvm_km_pages.mtx);
/* Cleanup left-over pages (if any). */
for (; i < nitems(pg); i++) {
if (pg[i] != 0) {
uvm_unmap(kernel_map,
pg[i], pg[i] + PAGE_SIZE);
}
}
}
while (fp) {
fp = uvm_km_doputpage(fp);
}
}
}
struct uvm_km_free_page *
uvm_km_doputpage(struct uvm_km_free_page *fp)
{
vaddr_t va = (vaddr_t)fp;
struct vm_page *pg;
int freeva = 1;
struct uvm_km_free_page *nextfp = fp->next;
pg = uvm_atopg(va);
pmap_kremove(va, PAGE_SIZE);
pmap_update(kernel_map->pmap);
mtx_enter(&uvm_km_pages.mtx);
if (uvm_km_pages.free < uvm_km_pages.hiwat) {
uvm_km_pages.page[uvm_km_pages.free++] = va;
freeva = 0;
}
mtx_leave(&uvm_km_pages.mtx);
if (freeva)
uvm_unmap(kernel_map, va, va + PAGE_SIZE);
uvm_pagefree(pg);
return (nextfp);
}
#endif /* !__HAVE_PMAP_DIRECT */
void *
km_alloc(size_t sz, const struct kmem_va_mode *kv,
const struct kmem_pa_mode *kp, const struct kmem_dyn_mode *kd)
{
struct vm_map *map;
struct vm_page *pg;
struct pglist pgl;
int mapflags = 0;
vm_prot_t prot;
paddr_t pla_align;
int pla_flags;
int pla_maxseg;
vaddr_t va, sva = 0;
KASSERT(sz == round_page(sz));
TAILQ_INIT(&pgl);
if (kp->kp_nomem || kp->kp_pageable)
goto alloc_va;
pla_flags = kd->kd_waitok ? UVM_PLA_WAITOK : UVM_PLA_NOWAIT;
pla_flags |= UVM_PLA_TRYCONTIG;
if (kp->kp_zero)
pla_flags |= UVM_PLA_ZERO;
pla_align = kp->kp_align;
#ifdef __HAVE_PMAP_DIRECT
if (pla_align < kv->kv_align)
pla_align = kv->kv_align;
#endif
pla_maxseg = kp->kp_maxseg;
if (pla_maxseg == 0)
pla_maxseg = sz / PAGE_SIZE;
if (uvm_pglistalloc(sz, kp->kp_constraint->ucr_low,
kp->kp_constraint->ucr_high, pla_align, kp->kp_boundary,
&pgl, pla_maxseg, pla_flags)) {
return (NULL);
}
#ifdef __HAVE_PMAP_DIRECT
/*
* Only use direct mappings for single page or single segment
* allocations.
*/
if (kv->kv_singlepage || kp->kp_maxseg == 1) {
TAILQ_FOREACH(pg, &pgl, pageq) {
va = pmap_map_direct(pg);
if (pg == TAILQ_FIRST(&pgl))
sva = va;
}
return ((void *)sva);
}
#endif
alloc_va:
prot = PROT_READ | PROT_WRITE;
if (kp->kp_pageable) {
KASSERT(kp->kp_object);
KASSERT(!kv->kv_singlepage);
} else {
KASSERT(kp->kp_object == NULL);
}
if (kv->kv_singlepage) {
KASSERT(sz == PAGE_SIZE);
#ifdef __HAVE_PMAP_DIRECT
panic("km_alloc: DIRECT single page");
#else
mtx_enter(&uvm_km_pages.mtx);
while (uvm_km_pages.free == 0) {
if (kd->kd_waitok == 0) {
mtx_leave(&uvm_km_pages.mtx);
uvm_pglistfree(&pgl);
return NULL;
}
msleep_nsec(&uvm_km_pages.free, &uvm_km_pages.mtx,
PVM, "getpage", INFSLP);
}
va = uvm_km_pages.page[--uvm_km_pages.free];
if (uvm_km_pages.free < uvm_km_pages.lowat &&
curproc != uvm_km_pages.km_proc) {
if (kd->kd_slowdown)
*kd->kd_slowdown = 1;
wakeup(&uvm_km_pages.km_proc);
}
mtx_leave(&uvm_km_pages.mtx);
#endif
} else {
struct uvm_object *uobj = NULL;
if (kd->kd_trylock)
mapflags |= UVM_KMF_TRYLOCK;
if (kp->kp_object)
uobj = *kp->kp_object;
try_map:
map = *kv->kv_map;
va = vm_map_min(map);
if (uvm_map(map, &va, sz, uobj, kd->kd_prefer,
kv->kv_align, UVM_MAPFLAG(prot, prot, MAP_INHERIT_NONE,
MADV_RANDOM, mapflags))) {
if (kv->kv_wait && kd->kd_waitok) {
tsleep_nsec(map, PVM, "km_allocva", INFSLP);
goto try_map;
}
uvm_pglistfree(&pgl);
return (NULL);
}
}
sva = va;
TAILQ_FOREACH(pg, &pgl, pageq) {
if (kp->kp_pageable)
pmap_enter(pmap_kernel(), va, VM_PAGE_TO_PHYS(pg),
prot, prot | PMAP_WIRED);
else
pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), prot);
va += PAGE_SIZE;
}
pmap_update(pmap_kernel());
return ((void *)sva);
}
void
km_free(void *v, size_t sz, const struct kmem_va_mode *kv,
const struct kmem_pa_mode *kp)
{
vaddr_t sva, eva, va;
struct vm_page *pg;
struct pglist pgl;
sva = (vaddr_t)v;
eva = sva + sz;
if (kp->kp_nomem)
goto free_va;
#ifdef __HAVE_PMAP_DIRECT
if (kv->kv_singlepage || kp->kp_maxseg == 1) {
TAILQ_INIT(&pgl);
for (va = sva; va < eva; va += PAGE_SIZE) {
pg = pmap_unmap_direct(va);
TAILQ_INSERT_TAIL(&pgl, pg, pageq);
}
uvm_pglistfree(&pgl);
return;
}
#else
if (kv->kv_singlepage) {
struct uvm_km_free_page *fp = v;
mtx_enter(&uvm_km_pages.mtx);
fp->next = uvm_km_pages.freelist;
uvm_km_pages.freelist = fp;
if (uvm_km_pages.freelistlen++ > 16)
wakeup(&uvm_km_pages.km_proc);
mtx_leave(&uvm_km_pages.mtx);
return;
}
#endif
if (kp->kp_pageable) {
pmap_remove(pmap_kernel(), sva, eva);
pmap_update(pmap_kernel());
} else {
TAILQ_INIT(&pgl);
for (va = sva; va < eva; va += PAGE_SIZE) {
paddr_t pa;
if (!pmap_extract(pmap_kernel(), va, &pa))
continue;
pg = PHYS_TO_VM_PAGE(pa);
if (pg == NULL) {
panic("km_free: unmanaged page 0x%lx", pa);
}
TAILQ_INSERT_TAIL(&pgl, pg, pageq);
}
pmap_kremove(sva, sz);
pmap_update(pmap_kernel());
uvm_pglistfree(&pgl);
}
free_va:
uvm_unmap(*kv->kv_map, sva, eva);
if (kv->kv_wait)
wakeup(*kv->kv_map);
}
const struct kmem_va_mode kv_any = {
.kv_map = &kernel_map,
};
const struct kmem_va_mode kv_intrsafe = {
.kv_map = &kmem_map,
};
const struct kmem_va_mode kv_page = {
.kv_singlepage = 1
};
const struct kmem_pa_mode kp_dirty = {
.kp_constraint = &no_constraint
};
const struct kmem_pa_mode kp_dma = {
.kp_constraint = &dma_constraint
};
const struct kmem_pa_mode kp_dma_contig = {
.kp_constraint = &dma_constraint,
.kp_maxseg = 1
};
const struct kmem_pa_mode kp_dma_zero = {
.kp_constraint = &dma_constraint,
.kp_zero = 1
};
const struct kmem_pa_mode kp_zero = {
.kp_constraint = &no_constraint,
.kp_zero = 1
};
const struct kmem_pa_mode kp_pageable = {
.kp_object = &uvm.kernel_object,
.kp_pageable = 1
/* XXX - kp_nomem, maybe, but we'll need to fix km_free. */
};
const struct kmem_pa_mode kp_none = {
.kp_nomem = 1
};
const struct kmem_dyn_mode kd_waitok = {
.kd_waitok = 1,
.kd_prefer = UVM_UNKNOWN_OFFSET
};
const struct kmem_dyn_mode kd_nowait = {
.kd_prefer = UVM_UNKNOWN_OFFSET
};
const struct kmem_dyn_mode kd_trylock = {
.kd_trylock = 1,
.kd_prefer = UVM_UNKNOWN_OFFSET
};
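/*
 * Illustrative only: a typical km_alloc()/km_free() pair using the mode
 * structures above - one zeroed, DMA-reachable page mapped anywhere in
 * kernel_map, released again with the matching modes.
 */
#if 0
int
example_page(void)
{
	void *p;

	p = km_alloc(PAGE_SIZE, &kv_any, &kp_dma_zero, &kd_waitok);
	if (p == NULL)
		return (ENOMEM);
	/* ... use the page ... */
	km_free(p, PAGE_SIZE, &kv_any, &kp_dma_zero);
	return (0);
}
#endif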
/* $OpenBSD: uvm_map.c,v 1.294 2022/08/15 15:53:45 jsg Exp $ */
/* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */
/*
* Copyright (c) 2011 Ariane van der Steldt <ariane@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_map.c 8.3 (Berkeley) 1/12/94
* from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_map.c: uvm map operations
*/
/* #define DEBUG */
/* #define VMMAP_DEBUG */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/acct.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/user.h>
#include <sys/tracepoint.h>
#ifdef SYSVSHM
#include <sys/shm.h>
#endif
#include <uvm/uvm.h>
#ifdef DDB
#include <uvm/uvm_ddb.h>
#endif
#include <uvm/uvm_addr.h>
vsize_t uvmspace_dused(struct vm_map*, vaddr_t, vaddr_t);
int uvm_mapent_isjoinable(struct vm_map*,
struct vm_map_entry*, struct vm_map_entry*);
struct vm_map_entry *uvm_mapent_merge(struct vm_map*, struct vm_map_entry*,
struct vm_map_entry*, struct uvm_map_deadq*);
struct vm_map_entry *uvm_mapent_tryjoin(struct vm_map*,
struct vm_map_entry*, struct uvm_map_deadq*);
struct vm_map_entry *uvm_map_mkentry(struct vm_map*, struct vm_map_entry*,
struct vm_map_entry*, vaddr_t, vsize_t, int,
struct uvm_map_deadq*, struct vm_map_entry*);
struct vm_map_entry *uvm_mapent_alloc(struct vm_map*, int);
void uvm_mapent_free(struct vm_map_entry*);
void uvm_unmap_kill_entry(struct vm_map*,
struct vm_map_entry*);
void uvm_unmap_kill_entry_withlock(struct vm_map *,
struct vm_map_entry *, int);
void uvm_unmap_detach_intrsafe(struct uvm_map_deadq *);
void uvm_mapent_mkfree(struct vm_map*,
struct vm_map_entry*, struct vm_map_entry**,
struct uvm_map_deadq*, boolean_t);
void uvm_map_pageable_pgon(struct vm_map*,
struct vm_map_entry*, struct vm_map_entry*,
vaddr_t, vaddr_t);
int uvm_map_pageable_wire(struct vm_map*,
struct vm_map_entry*, struct vm_map_entry*,
vaddr_t, vaddr_t, int);
void uvm_map_setup_entries(struct vm_map*);
void uvm_map_setup_md(struct vm_map*);
void uvm_map_teardown(struct vm_map*);
void uvm_map_vmspace_update(struct vm_map*,
struct uvm_map_deadq*, int);
void uvm_map_kmem_grow(struct vm_map*,
struct uvm_map_deadq*, vsize_t, int);
void uvm_map_freelist_update_clear(struct vm_map*,
struct uvm_map_deadq*);
void uvm_map_freelist_update_refill(struct vm_map *, int);
void uvm_map_freelist_update(struct vm_map*,
struct uvm_map_deadq*, vaddr_t, vaddr_t,
vaddr_t, vaddr_t, int);
struct vm_map_entry *uvm_map_fix_space(struct vm_map*, struct vm_map_entry*,
vaddr_t, vaddr_t, int);
int uvm_map_findspace(struct vm_map*,
struct vm_map_entry**, struct vm_map_entry**,
vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
vaddr_t);
vsize_t uvm_map_addr_augment_get(struct vm_map_entry*);
void uvm_map_addr_augment(struct vm_map_entry*);
int uvm_map_inentry_recheck(u_long, vaddr_t,
struct p_inentry *);
boolean_t uvm_map_inentry_fix(struct proc *, struct p_inentry *,
vaddr_t, int (*)(vm_map_entry_t), u_long);
/*
* Tree management functions.
*/
static inline void uvm_mapent_copy(struct vm_map_entry*,
struct vm_map_entry*);
static inline int uvm_mapentry_addrcmp(const struct vm_map_entry*,
const struct vm_map_entry*);
void uvm_mapent_free_insert(struct vm_map*,
struct uvm_addr_state*, struct vm_map_entry*);
void uvm_mapent_free_remove(struct vm_map*,
struct uvm_addr_state*, struct vm_map_entry*);
void uvm_mapent_addr_insert(struct vm_map*,
struct vm_map_entry*);
void uvm_mapent_addr_remove(struct vm_map*,
struct vm_map_entry*);
void uvm_map_splitentry(struct vm_map*,
struct vm_map_entry*, struct vm_map_entry*,
vaddr_t);
vsize_t uvm_map_boundary(struct vm_map*, vaddr_t, vaddr_t);
/*
* uvm_vmspace_fork helper functions.
*/
struct vm_map_entry *uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
vsize_t, vm_prot_t, vm_prot_t,
struct vm_map_entry*, struct uvm_map_deadq*, int,
int);
struct vm_map_entry *uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t,
vsize_t, vm_prot_t, vm_prot_t, struct vm_map*,
struct vm_map_entry*, struct uvm_map_deadq*);
struct vm_map_entry *uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
struct vm_map*, struct vm_map_entry*,
struct uvm_map_deadq*);
struct vm_map_entry *uvm_mapent_forkcopy(struct vmspace*, struct vm_map*,
struct vm_map*, struct vm_map_entry*,
struct uvm_map_deadq*);
struct vm_map_entry *uvm_mapent_forkzero(struct vmspace*, struct vm_map*,
struct vm_map*, struct vm_map_entry*,
struct uvm_map_deadq*);
/*
* Tree validation.
*/
#ifdef VMMAP_DEBUG
void uvm_tree_assert(struct vm_map*, int, char*,
char*, int);
#define UVM_ASSERT(map, cond, file, line) \
uvm_tree_assert((map), (cond), #cond, (file), (line))
void uvm_tree_sanity(struct vm_map*, char*, int);
void uvm_tree_size_chk(struct vm_map*, char*, int);
void vmspace_validate(struct vm_map*);
#else
#define uvm_tree_sanity(_map, _file, _line) do {} while (0)
#define uvm_tree_size_chk(_map, _file, _line) do {} while (0)
#define vmspace_validate(_map) do {} while (0)
#endif
/*
* The kernel map will initially be VM_MAP_KSIZE_INIT bytes.
* Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes.
*
* We attempt to grow by VM_MAP_KSIZE_ALLOCMUL times the allocation size
* each time.
*/
#define VM_MAP_KSIZE_INIT (512 * (vaddr_t)PAGE_SIZE)
#define VM_MAP_KSIZE_DELTA (256 * (vaddr_t)PAGE_SIZE)
#define VM_MAP_KSIZE_ALLOCMUL 4
/* auto-allocate address lower bound */
#define VMMAP_MIN_ADDR PAGE_SIZE
#ifdef DEADBEEF0
#define UVMMAP_DEADBEEF ((unsigned long)DEADBEEF0)
#else
#define UVMMAP_DEADBEEF ((unsigned long)0xdeadd0d0)
#endif
#ifdef DEBUG
int uvm_map_printlocks = 0;
#define LPRINTF(_args) \
do { \
if (uvm_map_printlocks) \
printf _args; \
} while (0)
#else
#define LPRINTF(_args) do {} while (0)
#endif
static struct mutex uvm_kmapent_mtx;
static struct timeval uvm_kmapent_last_warn_time;
static struct timeval uvm_kmapent_warn_rate = { 10, 0 };
const char vmmapbsy[] = "vmmapbsy";
/*
* pool for vmspace structures.
*/
struct pool uvm_vmspace_pool;
/*
* pool for dynamically-allocated map entries.
*/
struct pool uvm_map_entry_pool;
struct pool uvm_map_entry_kmem_pool;
/*
* This global represents the end of the kernel virtual address
* space. If we want to exceed this, we must grow the kernel
* virtual address space dynamically.
*
* Note, this variable is locked by kernel_map's lock.
*/
vaddr_t uvm_maxkaddr;
/*
* Locking predicate.
*/
#define UVM_MAP_REQ_WRITE(_map) \
do { \
if ((_map)->ref_count > 0) { \
if (((_map)->flags & VM_MAP_INTRSAFE) == 0) \
rw_assert_wrlock(&(_map)->lock); \
else \
MUTEX_ASSERT_LOCKED(&(_map)->mtx); \
} \
} while (0)
#define vm_map_modflags(map, set, clear) \
do { \
mtx_enter(&(map)->flags_lock); \
(map)->flags = ((map)->flags | (set)) & ~(clear); \
mtx_leave(&(map)->flags_lock); \
} while (0)
/*
* Tree describing entries by address.
*
* Addresses are unique.
* Entries with start == end may only exist if they are the first entry
* (sorted by address) within a free-memory tree.
*/
static inline int
uvm_mapentry_addrcmp(const struct vm_map_entry *e1,
const struct vm_map_entry *e2)
{
return e1->start < e2->start ? -1 : e1->start > e2->start;
}
/*
* Copy mapentry.
*/
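/*
 * Only the slice between the uvm_map_entry_start_copy and
 * uvm_map_entry_stop_copy markers is duplicated below, so the tree
 * linkage and free-space bookkeeping of the destination entry are
 * left untouched.
 */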
static inline void
uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
{
caddr_t csrc, cdst;
size_t sz;
csrc = (caddr_t)src;
cdst = (caddr_t)dst;
csrc += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
cdst += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
sz = offsetof(struct vm_map_entry, uvm_map_entry_stop_copy) -
offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
memcpy(cdst, csrc, sz);
}
/*
* Handle free-list insertion.
*/
void
uvm_mapent_free_insert(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry *entry)
{
const struct uvm_addr_functions *fun;
#ifdef VMMAP_DEBUG
vaddr_t min, max, bound;
#endif
#ifdef VMMAP_DEBUG
/*
* Boundary check.
* Boundaries are folded if they go on the same free list.
*/
min = VMMAP_FREE_START(entry);
max = VMMAP_FREE_END(entry);
while (min < max) {
bound = uvm_map_boundary(map, min, max);
KASSERT(uvm_map_uaddr(map, min) == uaddr);
min = bound;
}
#endif
KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0);
KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0);
UVM_MAP_REQ_WRITE(map);
/* Actual insert: forward to uaddr pointer. */
if (uaddr != NULL) {
fun = uaddr->uaddr_functions;
KDASSERT(fun != NULL);
if (fun->uaddr_free_insert != NULL)
    (*fun->uaddr_free_insert)(map, uaddr, entry);
entry->etype |= UVM_ET_FREEMAPPED;
}
/* Update fspace augmentation. */
uvm_map_addr_augment(entry);
}
/*
* Handle free-list removal.
*/
void
uvm_mapent_free_remove(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry *entry)
{
const struct uvm_addr_functions *fun;
KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL);
KASSERT(uvm_map_uaddr_e(map, entry) == uaddr);
UVM_MAP_REQ_WRITE(map);

if (uaddr != NULL) {
fun = uaddr->uaddr_functions;
if (fun->uaddr_free_remove != NULL)
    (*fun->uaddr_free_remove)(map, uaddr, entry);
entry->etype &= ~UVM_ET_FREEMAPPED;
}
}
/*
* Handle address tree insertion.
*/
void
uvm_mapent_addr_insert(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *res;
if (!RBT_CHECK(uvm_map_addr, entry, UVMMAP_DEADBEEF))
panic("uvm_mapent_addr_insert: entry still in addr list");
KDASSERT(entry->start <= entry->end);
KDASSERT((entry->start & (vaddr_t)PAGE_MASK) == 0 &&
(entry->end & (vaddr_t)PAGE_MASK) == 0);
TRACEPOINT(uvm, map_insert,
entry->start, entry->end, entry->protection, NULL);
UVM_MAP_REQ_WRITE(map);
res = RBT_INSERT(uvm_map_addr, &map->addr, entry);
if (res != NULL) {
panic("uvm_mapent_addr_insert: map %p entry %p "
"(0x%lx-0x%lx G=0x%lx F=0x%lx) insert collision "
"with entry %p (0x%lx-0x%lx G=0x%lx F=0x%lx)",
map, entry,
entry->start, entry->end, entry->guard, entry->fspace,
res, res->start, res->end, res->guard, res->fspace);
}
}
/*
* Handle address tree removal.
*/
void
uvm_mapent_addr_remove(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *res;
TRACEPOINT(uvm, map_remove,
entry->start, entry->end, entry->protection, NULL);
UVM_MAP_REQ_WRITE(map);
res = RBT_REMOVE(uvm_map_addr, &map->addr, entry);
if (res != entry)
panic("uvm_mapent_addr_remove"); RBT_POISON(uvm_map_addr, entry, UVMMAP_DEADBEEF);
}
/*
* uvm_map_reference: add reference to a map
*
* => map need not be locked
*/
void
uvm_map_reference(struct vm_map *map)
{
atomic_inc_int(&map->ref_count);
}
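/*
 * Note: the matching release is uvm_map_deallocate(), which frees the
 * map once the reference count drops to zero.
 */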
void
uvm_map_lock_entry(struct vm_map_entry *entry)
{
    if (entry->aref.ar_amap != NULL) {
        amap_lock(entry->aref.ar_amap);
    }
    if (UVM_ET_ISOBJ(entry)) {
        rw_enter(entry->object.uvm_obj->vmobjlock, RW_WRITE);
    }
}
void
uvm_map_unlock_entry(struct vm_map_entry *entry)
{
    if (UVM_ET_ISOBJ(entry)) {
        rw_exit(entry->object.uvm_obj->vmobjlock);
    }
    if (entry->aref.ar_amap != NULL) {
        amap_unlock(entry->aref.ar_amap);
    }
}
/*
* Calculate the dused delta.
*/
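/*
 * Illustrative example (hypothetical addresses, 4K pages): with the stack
 * occupying 0x7000-0x8000, a request covering 0x6000-0x9000 only charges
 * the 0x6000-0x7000 and 0x8000-0x9000 pieces, i.e. 2 pages.
 */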
vsize_t
uvmspace_dused(struct vm_map *map, vaddr_t min, vaddr_t max)
{
struct vmspace *vm;
vsize_t sz;
vaddr_t lmax;
vaddr_t stack_begin, stack_end; /* Position of stack. */
KASSERT(map->flags & VM_MAP_ISVMSPACE);
vm = (struct vmspace *)map;
stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
sz = 0;
while (min != max) {
lmax = max;
if (min < stack_begin && lmax > stack_begin)
lmax = stack_begin;
else if (min < stack_end && lmax > stack_end)
lmax = stack_end;
if (min >= stack_begin && min < stack_end) {
/* nothing */
} else
sz += lmax - min;
min = lmax;
}
return sz >> PAGE_SHIFT;
}
/*
* Find the entry describing the given address.
*/
struct vm_map_entry*
uvm_map_entrybyaddr(struct uvm_map_addr *atree, vaddr_t addr)
{
struct vm_map_entry *iter;
iter = RBT_ROOT(uvm_map_addr, atree);
while (iter != NULL) {
if (iter->start > addr)
    iter = RBT_LEFT(uvm_map_addr, iter);
else if (VMMAP_FREE_END(iter) <= addr)
    iter = RBT_RIGHT(uvm_map_addr, iter);
else
return iter;
}
return NULL;
}
/*
* DEAD_ENTRY_PUSH(struct vm_map_deadq *deadq, struct vm_map_entry *entry)
*
* Push dead entries into a linked list.
* Since the linked list abuses the address tree for storage, the entry
* may not be linked in a map.
*
* *head must be initialized to NULL before the first call to this macro.
* uvm_unmap_detach(*head, 0) will remove dead entries.
*/
static inline void
dead_entry_push(struct uvm_map_deadq *deadq, struct vm_map_entry *entry)
{
TAILQ_INSERT_TAIL(deadq, entry, dfree.deadq);
}
#define DEAD_ENTRY_PUSH(_headptr, _entry) \
dead_entry_push((_headptr), (_entry))
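/*
 * Typical usage pattern (sketch, mirroring the callers below):
 *
 *	struct uvm_map_deadq dead;
 *
 *	TAILQ_INIT(&dead);
 *	...collect entries with DEAD_ENTRY_PUSH(&dead, entry)...
 *	uvm_unmap_detach(&dead, 0);
 */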
/*
* Test if memory starting at addr with sz bytes is free.
*
* Fills in *start_ptr and *end_ptr to be the first and last entry describing
* the space.
* If called with prefilled *start_ptr and *end_ptr, they must already be correct.
*/
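/*
 * Returns nonzero when the range is available, 0 otherwise.
 */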
int
uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
struct vm_map_entry **start_ptr, struct vm_map_entry **end_ptr,
vaddr_t addr, vsize_t sz)
{
struct uvm_addr_state *free;
struct uvm_map_addr *atree;
struct vm_map_entry *i, *i_end;
if (addr + sz < addr)
return 0;
/*
* Kernel memory above uvm_maxkaddr is considered unavailable.
*/
if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
if (addr + sz > uvm_maxkaddr)
return 0;
}
atree = &map->addr;
/*
* Fill in first, last, so they point at the entries containing the
* first and last address of the range.
* Note that if they are not NULL, we don't perform the lookup.
*/
KDASSERT(atree != NULL && start_ptr != NULL && end_ptr != NULL);
if (*start_ptr == NULL) {
*start_ptr = uvm_map_entrybyaddr(atree, addr);
if (*start_ptr == NULL)
return 0;
} else
KASSERT(*start_ptr == uvm_map_entrybyaddr(atree, addr));
if (*end_ptr == NULL) {
if (VMMAP_FREE_END(*start_ptr) >= addr + sz)
*end_ptr = *start_ptr;
else {
*end_ptr = uvm_map_entrybyaddr(atree, addr + sz - 1);
if (*end_ptr == NULL)
return 0;
}
} else
KASSERT(*end_ptr == uvm_map_entrybyaddr(atree, addr + sz - 1));
/* Validation. */
KDASSERT(*start_ptr != NULL && *end_ptr != NULL);
KDASSERT((*start_ptr)->start <= addr &&
VMMAP_FREE_END(*start_ptr) > addr &&
(*end_ptr)->start < addr + sz &&
VMMAP_FREE_END(*end_ptr) >= addr + sz);
/*
* Check that none of the entries intersects with <addr, addr+sz>.
* Also, if the entry belongs to uaddr_exe or uaddr_brk_stack, it is
* considered unavailable unless called by those allocators.
*/
i = *start_ptr;
i_end = RBT_NEXT(uvm_map_addr, *end_ptr);
for (; i != i_end;
i = RBT_NEXT(uvm_map_addr, i)) {
if (i->start != i->end && i->end > addr)
return 0;
/*
* uaddr_exe and uaddr_brk_stack may only be used
* by these allocators and the NULL uaddr (i.e. no
* uaddr).
* Reject if this requirement is not met.
*/
if (uaddr != NULL) {
free = uvm_map_uaddr_e(map, i);
if (uaddr != free && free != NULL && (free == map->uaddr_exe ||
free == map->uaddr_brk_stack))
return 0;
}
}
return -1;
}
/*
* Invoke each address selector until an address is found.
* Will not invoke uaddr_exe.
*/
int
uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
struct vm_map_entry**last, vaddr_t *addr, vsize_t sz,
vaddr_t pmap_align, vaddr_t pmap_offset, vm_prot_t prot, vaddr_t hint)
{
struct uvm_addr_state *uaddr;
int i;
/*
* Allocation for sz bytes at any address,
* using the addr selectors in order.
*/
for (i = 0; i < nitems(map->uaddr_any); i++) {
uaddr = map->uaddr_any[i];
if (uvm_addr_invoke(map, uaddr, first, last,
addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
return 0;
}
/* Fall back to brk() and stack() address selectors. */
uaddr = map->uaddr_brk_stack;
if (uvm_addr_invoke(map, uaddr, first, last,
addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
return 0;
return ENOMEM;
}
/* Calculate entry augmentation value. */
vsize_t
uvm_map_addr_augment_get(struct vm_map_entry *entry)
{
vsize_t augment;
struct vm_map_entry *left, *right;
augment = entry->fspace;
if ((left = RBT_LEFT(uvm_map_addr, entry)) != NULL)
augment = MAX(augment, left->fspace_augment);
if ((right = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
augment = MAX(augment, right->fspace_augment);
return augment;
}
/*
* Update augmentation data in entry.
*/
void
uvm_map_addr_augment(struct vm_map_entry *entry)
{
vsize_t augment;
while (entry != NULL) {
/* Calculate value for augmentation. */
augment = uvm_map_addr_augment_get(entry);
/*
* Descend update.
* Once we find an entry that already has the correct value,
* stop, since it means all its parents will use the correct
* value too.
*/
if (entry->fspace_augment == augment)
return;
entry->fspace_augment = augment;
entry = RBT_PARENT(uvm_map_addr, entry);
}
}
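/*
 * fspace_augment caches the largest fspace found in the subtree rooted at
 * an entry, so free-space searches can skip subtrees that cannot satisfy a
 * request.  E.g. (sketch): a node with fspace of 2 pages whose children
 * carry fspace_augment of 8 and 4 pages ends up with fspace_augment of 8.
 */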
/*
* uvm_mapanon: establish a valid mapping in map for an anon
*
* => *addr and sz must be a multiple of PAGE_SIZE.
* => *addr is ignored, except if flags contains UVM_FLAG_FIXED.
* => map must be unlocked.
*
* => align: align vaddr, must be a power-of-2.
* Align is only a hint and will be ignored if the alignment fails.
*/
int
uvm_mapanon(struct vm_map *map, vaddr_t *addr, vsize_t sz,
vsize_t align, unsigned int flags)
{
struct vm_map_entry *first, *last, *entry, *new;
struct uvm_map_deadq dead;
vm_prot_t prot;
vm_prot_t maxprot;
vm_inherit_t inherit;
int advice;
int error;
vaddr_t pmap_align, pmap_offset;
vaddr_t hint;
KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE);
KASSERT(map != kernel_map);
KASSERT((map->flags & UVM_FLAG_HOLE) == 0);
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
splassert(IPL_NONE);
KASSERT((flags & UVM_FLAG_TRYLOCK) == 0);
/*
* We use pmap_align and pmap_offset as alignment and offset variables.
*
* Because the align parameter takes precedence over pmap prefer,
* the pmap_align will need to be set to align, with pmap_offset = 0,
* if pmap_prefer will not align.
*/
pmap_align = MAX(align, PAGE_SIZE);
pmap_offset = 0;
/* Decode parameters. */
prot = UVM_PROTECTION(flags);
maxprot = UVM_MAXPROTECTION(flags);
advice = UVM_ADVICE(flags);
inherit = UVM_INHERIT(flags);
error = 0;
hint = trunc_page(*addr);
TAILQ_INIT(&dead);
KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
KASSERT((align & (align - 1)) == 0);
/* Check protection. */
if ((prot & maxprot) != prot)
return EACCES;
/*
* Before grabbing the lock, allocate a map entry for later
* use to ensure we don't wait for memory while holding the
* vm_map_lock.
*/
new = uvm_mapent_alloc(map, flags);
if (new == NULL)
return ENOMEM;
vm_map_lock(map);
first = last = NULL;
if (flags & UVM_FLAG_FIXED) {
/*
* Fixed location.
*
* Note: we ignore align, pmap_prefer.
* Fill in first, last and *addr.
*/
KASSERT((*addr & PAGE_MASK) == 0);
/* Check that the space is available. */
if (flags & UVM_FLAG_UNMAP) {
    if ((flags & UVM_FLAG_STACK) &&
!uvm_map_is_stack_remappable(map, *addr, sz)) {
error = EINVAL;
goto unlock;
}
uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
}
if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
error = ENOMEM;
goto unlock;
}
} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 && (align == 0 || (*addr & (align - 1)) == 0) &&
uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
/*
* Address used as hint.
*
* Note: we enforce the alignment restriction,
* but ignore pmap_prefer.
*/
} else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
/* Run selection algorithm for executables. */
error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
addr, sz, pmap_align, pmap_offset, prot, hint);
if (error != 0)
goto unlock;
} else {
/* Update freelists from vmspace. */
uvm_map_vmspace_update(map, &dead, flags);
error = uvm_map_findspace(map, &first, &last, addr, sz,
pmap_align, pmap_offset, prot, hint);
if (error != 0)
goto unlock;
}
/* Double-check if selected address doesn't cause overflow. */
if (*addr + sz < *addr) {
error = ENOMEM;
goto unlock;
}
/* If we only want a query, return now. */
if (flags & UVM_FLAG_QUERY) {
error = 0;
goto unlock;
}
/*
* Create new entry.
* first and last may be invalidated after this call.
*/
entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
new);
if (entry == NULL) {
error = ENOMEM;
goto unlock;
}
new = NULL;
KDASSERT(entry->start == *addr && entry->end == *addr + sz);
entry->object.uvm_obj = NULL;
entry->offset = 0;
entry->protection = prot;
entry->max_protection = maxprot;
entry->inheritance = inherit;
entry->wired_count = 0;
entry->advice = advice;
if (prot & PROT_WRITE)
    map->wserial++;
if (flags & UVM_FLAG_SYSCALL) {
    entry->etype |= UVM_ET_SYSCALL;
map->wserial++;
}
if (flags & UVM_FLAG_STACK) {
entry->etype |= UVM_ET_STACK;
if (flags & (UVM_FLAG_FIXED | UVM_FLAG_UNMAP))
    map->sserial++;
}
if (flags & UVM_FLAG_COPYONW) {
entry->etype |= UVM_ET_COPYONWRITE;
if ((flags & UVM_FLAG_OVERLAY) == 0)
    entry->etype |= UVM_ET_NEEDSCOPY;
}
if (flags & UVM_FLAG_CONCEAL)
    entry->etype |= UVM_ET_CONCEAL;
if (flags & UVM_FLAG_OVERLAY) {
    entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
}
/* Update map and process statistics. */
map->size += sz;
if (prot != PROT_NONE) {
((struct vmspace *)map)->vm_dused +=
uvmspace_dused(map, *addr, *addr + sz);
}
unlock:
vm_map_unlock(map);
/*
* Remove dead entries.
*
* Dead entries may be the result of merging.
* uvm_map_mkentry may also create dead entries, when it attempts to
* destroy free-space entries.
*/
uvm_unmap_detach(&dead, 0);
if (new)
    uvm_mapent_free(new);
return error;
}
/*
* uvm_map: establish a valid mapping in map
*
* => *addr and sz must be a multiple of PAGE_SIZE.
* => map must be unlocked.
* => <uobj,uoffset> value meanings (4 cases):
* [1] <NULL,uoffset> == uoffset is a hint for PMAP_PREFER
* [2] <NULL,UVM_UNKNOWN_OFFSET> == don't PMAP_PREFER
* [3] <uobj,uoffset> == normal mapping
* [4] <uobj,UVM_UNKNOWN_OFFSET> == uvm_map finds offset based on VA
*
* case [4] is for kernel mappings where we don't know the offset until
* we've found a virtual address. note that kernel object offsets are
* always relative to vm_map_min(kernel_map).
*
* => align: align vaddr, must be a power-of-2.
* Align is only a hint and will be ignored if the alignment fails.
*/
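/*
 * Example invocation (sketch only; the flags and size are illustrative):
 *
 *	vaddr_t va = 0;
 *	int error = uvm_map(kernel_map, &va, PAGE_SIZE, NULL,
 *	    UVM_UNKNOWN_OFFSET, 0,
 *	    UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
 *	    MAP_INHERIT_NONE, MADV_NORMAL, 0));
 *
 * This corresponds to case [2] above: no backing object and no PMAP_PREFER
 * hint, letting the address selectors pick va.
 */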
int
uvm_map(struct vm_map *map, vaddr_t *addr, vsize_t sz,
struct uvm_object *uobj, voff_t uoffset,
vsize_t align, unsigned int flags)
{
struct vm_map_entry *first, *last, *entry, *new;
struct uvm_map_deadq dead;
vm_prot_t prot;
vm_prot_t maxprot;
vm_inherit_t inherit;
int advice;
int error;
vaddr_t pmap_align, pmap_offset;
vaddr_t hint;
if ((map->flags & VM_MAP_INTRSAFE) == 0)
splassert(IPL_NONE);
else
splassert(IPL_VM);
/*
* We use pmap_align and pmap_offset as alignment and offset variables.
*
* Because the align parameter takes precedence over pmap prefer,
* the pmap_align will need to be set to align, with pmap_offset = 0,
* if pmap_prefer will not align.
*/
if (uoffset == UVM_UNKNOWN_OFFSET) {
pmap_align = MAX(align, PAGE_SIZE);
pmap_offset = 0;
} else {
pmap_align = MAX(PMAP_PREFER_ALIGN(), PAGE_SIZE);
pmap_offset = PMAP_PREFER_OFFSET(uoffset);
if (align == 0 ||
(align <= pmap_align && (pmap_offset & (align - 1)) == 0)) {
/* pmap_offset satisfies align, no change. */
} else {
/* Align takes precedence over pmap prefer. */
pmap_align = align;
pmap_offset = 0;
}
}
/* Decode parameters. */
prot = UVM_PROTECTION(flags);
maxprot = UVM_MAXPROTECTION(flags);
advice = UVM_ADVICE(flags);
inherit = UVM_INHERIT(flags);
error = 0;
hint = trunc_page(*addr);
TAILQ_INIT(&dead);
KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
KASSERT((align & (align - 1)) == 0);
/* Holes are incompatible with other types of mappings. */
if (flags & UVM_FLAG_HOLE) {
KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) &&
(flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0);
}
/* Unset hint for kernel_map non-fixed allocations. */
if (!(map->flags & VM_MAP_ISVMSPACE) && !(flags & UVM_FLAG_FIXED))
hint = 0;
/* Check protection. */
if ((prot & maxprot) != prot)
return EACCES;
if (map == kernel_map &&
(prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
panic("uvm_map: kernel map W^X violation requested");
/*
* Before grabbing the lock, allocate a map entry for later
* use to ensure we don't wait for memory while holding the
* vm_map_lock.
*/
new = uvm_mapent_alloc(map, flags);
if (new == NULL)
return ENOMEM;
if (flags & UVM_FLAG_TRYLOCK) {
if (vm_map_lock_try(map) == FALSE) {
error = EFAULT;
goto out;
}
} else {
vm_map_lock(map);
}
first = last = NULL;
if (flags & UVM_FLAG_FIXED) {
/*
* Fixed location.
*
* Note: we ignore align, pmap_prefer.
* Fill in first, last and *addr.
*/
KASSERT((*addr & PAGE_MASK) == 0);
/*
* Grow pmap to include allocated address.
* If the growth fails, the allocation will fail too.
*/
if ((map->flags & VM_MAP_ISVMSPACE) == 0 &&
uvm_maxkaddr < (*addr + sz)) {
uvm_map_kmem_grow(map, &dead,
*addr + sz - uvm_maxkaddr, flags);
}
/* Check that the space is available. */
if (flags & UVM_FLAG_UNMAP)
    uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
error = ENOMEM;
goto unlock;
}
} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 && (map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE && (align == 0 || (*addr & (align - 1)) == 0) &&
uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
/*
* Address used as hint.
*
* Note: we enforce the alignment restriction,
* but ignore pmap_prefer.
*/
} else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
/* Run selection algorithm for executables. */
error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
addr, sz, pmap_align, pmap_offset, prot, hint);
/* Grow kernel memory and try again. */
if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
uvm_map_kmem_grow(map, &dead, sz, flags);
error = uvm_addr_invoke(map, map->uaddr_exe,
&first, &last, addr, sz,
pmap_align, pmap_offset, prot, hint);
}
if (error != 0)
goto unlock;
} else {
/* Update freelists from vmspace. */
if (map->flags & VM_MAP_ISVMSPACE)
    uvm_map_vmspace_update(map, &dead, flags);
error = uvm_map_findspace(map, &first, &last, addr, sz,
pmap_align, pmap_offset, prot, hint);
/* Grow kernel memory and try again. */
if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
uvm_map_kmem_grow(map, &dead, sz, flags);
error = uvm_map_findspace(map, &first, &last, addr, sz,
pmap_align, pmap_offset, prot, hint);
}
if (error != 0)
goto unlock;
}
/* Double-check if selected address doesn't cause overflow. */
if (*addr + sz < *addr) {
error = ENOMEM;
goto unlock;
}
KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE ||
uvm_maxkaddr >= *addr + sz);
/* If we only want a query, return now. */
if (flags & UVM_FLAG_QUERY) {
error = 0;
goto unlock;
}
if (uobj == NULL)
uoffset = 0;
else if (uoffset == UVM_UNKNOWN_OFFSET) {
KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
uoffset = *addr - vm_map_min(kernel_map);
}
/*
* Create new entry.
* first and last may be invalidated after this call.
*/
entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
new);
if (entry == NULL) {
error = ENOMEM;
goto unlock;
}
new = NULL;
KDASSERT(entry->start == *addr && entry->end == *addr + sz);
entry->object.uvm_obj = uobj;
entry->offset = uoffset;
entry->protection = prot;
entry->max_protection = maxprot;
entry->inheritance = inherit;
entry->wired_count = 0;
entry->advice = advice;
if (prot & PROT_WRITE)
    map->wserial++;
if (flags & UVM_FLAG_SYSCALL) {
    entry->etype |= UVM_ET_SYSCALL;
map->wserial++;
}
if (flags & UVM_FLAG_STACK) {
entry->etype |= UVM_ET_STACK;
if (flags & UVM_FLAG_UNMAP)
    map->sserial++;
}
if (uobj)
entry->etype |= UVM_ET_OBJ;
else if (flags & UVM_FLAG_HOLE)
entry->etype |= UVM_ET_HOLE;
if (flags & UVM_FLAG_NOFAULT)
    entry->etype |= UVM_ET_NOFAULT;
if (flags & UVM_FLAG_WC)
    entry->etype |= UVM_ET_WC;
if (flags & UVM_FLAG_COPYONW) {
entry->etype |= UVM_ET_COPYONWRITE;
if ((flags & UVM_FLAG_OVERLAY) == 0)
    entry->etype |= UVM_ET_NEEDSCOPY;
}
if (flags & UVM_FLAG_CONCEAL)
    entry->etype |= UVM_ET_CONCEAL;
if (flags & UVM_FLAG_OVERLAY) {
    entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
}
/* Update map and process statistics. */
if (!(flags & UVM_FLAG_HOLE)) {
map->size += sz;
if ((map->flags & VM_MAP_ISVMSPACE) && uobj == NULL &&
prot != PROT_NONE) {
((struct vmspace *)map)->vm_dused +=
uvmspace_dused(map, *addr, *addr + sz);
}
}
/*
* Try to merge entry.
*
* Userland allocations are kept separated most of the time.
* Forego the effort of merging what most of the time can't be merged
* and only try the merge if it concerns a kernel entry.
*/
if ((flags & UVM_FLAG_NOMERGE) == 0 &&
(map->flags & VM_MAP_ISVMSPACE) == 0)
uvm_mapent_tryjoin(map, entry, &dead);
unlock:
vm_map_unlock(map);
/*
* Remove dead entries.
*
* Dead entries may be the result of merging.
* uvm_map_mkentry may also create dead entries, when it attempts to
* destroy free-space entries.
*/
if (map->flags & VM_MAP_INTRSAFE)
uvm_unmap_detach_intrsafe(&dead);
else
uvm_unmap_detach(&dead, 0);
out:
if (new)
uvm_mapent_free(new);
return error;
}
/*
* True iff e1 and e2 can be joined together.
*/
int
uvm_mapent_isjoinable(struct vm_map *map, struct vm_map_entry *e1,
struct vm_map_entry *e2)
{
KDASSERT(e1 != NULL && e2 != NULL);
/* Must be the same entry type and not have free memory between. */
if (e1->etype != e2->etype || e1->end != e2->start)
return 0;
/* Submaps are never joined. */
if (UVM_ET_ISSUBMAP(e1))
return 0;
/* Never merge wired memory. */
if (VM_MAPENT_ISWIRED(e1) || VM_MAPENT_ISWIRED(e2))
return 0;
/* Protection, inheritance and advice must be equal. */
if (e1->protection != e2->protection || e1->max_protection != e2->max_protection || e1->inheritance != e2->inheritance ||
e1->advice != e2->advice)
return 0;
/* If uvm_object: object itself and offsets within object must match. */
if (UVM_ET_ISOBJ(e1)) {
    if (e1->object.uvm_obj != e2->object.uvm_obj)
return 0;
if (e1->offset + (e1->end - e1->start) != e2->offset)
return 0;
}
/*
* Cannot join shared amaps.
* Note: no need to lock amap to look at refs, since we don't care
* about its exact value.
* If it is 1 (i.e. we have the only reference) it will stay there.
*/
if (e1->aref.ar_amap && amap_refs(e1->aref.ar_amap) != 1)
return 0;
if (e2->aref.ar_amap && amap_refs(e2->aref.ar_amap) != 1)
return 0;
/* Apparently, e1 and e2 match. */
return 1;
}
/*
* Join support function.
*
* Returns the merged entry on success.
* Returns NULL if the merge failed.
*/
struct vm_map_entry*
uvm_mapent_merge(struct vm_map *map, struct vm_map_entry *e1,
struct vm_map_entry *e2, struct uvm_map_deadq *dead)
{
struct uvm_addr_state *free;
/*
* Merging is not supported for map entries that
* contain an amap in e1. This should never happen
* anyway, because only kernel entries are merged.
* These do not contain amaps.
* e2 contains no real information in its amap,
* so it can be erased immediately.
*/
KASSERT(e1->aref.ar_amap == NULL);
/*
* Don't drop obj reference:
* uvm_unmap_detach will do this for us.
*/
free = uvm_map_uaddr_e(map, e1);
uvm_mapent_free_remove(map, free, e1);
free = uvm_map_uaddr_e(map, e2);
uvm_mapent_free_remove(map, free, e2);
uvm_mapent_addr_remove(map, e2);
e1->end = e2->end;
e1->guard = e2->guard;
e1->fspace = e2->fspace;
uvm_mapent_free_insert(map, free, e1);
DEAD_ENTRY_PUSH(dead, e2);
return e1;
}
/*
* Attempt forward and backward joining of entry.
*
* Returns entry after joins.
* We are guaranteed that the amap of entry is either non-existent or
* has never been used.
*/
struct vm_map_entry*
uvm_mapent_tryjoin(struct vm_map *map, struct vm_map_entry *entry,
struct uvm_map_deadq *dead)
{
struct vm_map_entry *other;
struct vm_map_entry *merged;
/* Merge with previous entry. */
other = RBT_PREV(uvm_map_addr, entry);
if (other && uvm_mapent_isjoinable(map, other, entry)) {
    merged = uvm_mapent_merge(map, other, entry, dead);
if (merged)
entry = merged;
}
/*
* Merge with next entry.
*
* Because amap can only extend forward and the next entry
* probably contains sensible info, only perform forward merging
* in the absence of an amap.
*/
other = RBT_NEXT(uvm_map_addr, entry);
if (other && entry->aref.ar_amap == NULL && other->aref.ar_amap == NULL &&
uvm_mapent_isjoinable(map, entry, other)) {
merged = uvm_mapent_merge(map, entry, other, dead);
if (merged)
entry = merged;
}
return entry;
}
/*
* Kill entries that are no longer in a map.
*/
void
uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags)
{
struct vm_map_entry *entry, *tmp;
int waitok = flags & UVM_PLA_WAITOK;
TAILQ_FOREACH_SAFE(entry, deadq, dfree.deadq, tmp) {
/* Drop reference to amap, if we've got one. */
if (entry->aref.ar_amap)
amap_unref(entry->aref.ar_amap,
entry->aref.ar_pageoff,
atop(entry->end - entry->start),
flags & AMAP_REFALL);
/* Skip entries for which we have to grab the kernel lock. */
if (UVM_ET_ISSUBMAP(entry) || UVM_ET_ISOBJ(entry))
continue;
TAILQ_REMOVE(deadq, entry, dfree.deadq);
uvm_mapent_free(entry);
}
if (TAILQ_EMPTY(deadq))
return;
KERNEL_LOCK();
while ((entry = TAILQ_FIRST(deadq)) != NULL) {
    if (waitok)
        uvm_pause();
/* Drop reference to our backing object, if we've got one. */
if (UVM_ET_ISSUBMAP(entry)) {
/* ... unlikely to happen, but play it safe */
    uvm_map_deallocate(entry->object.sub_map);
} else if (UVM_ET_ISOBJ(entry) &&
entry->object.uvm_obj->pgops->pgo_detach) {
entry->object.uvm_obj->pgops->pgo_detach(
entry->object.uvm_obj);
}
/* Step to next. */
TAILQ_REMOVE(deadq, entry, dfree.deadq);
uvm_mapent_free(entry);
}
KERNEL_UNLOCK();
}
void
uvm_unmap_detach_intrsafe(struct uvm_map_deadq *deadq)
{
struct vm_map_entry *entry;
while ((entry = TAILQ_FIRST(deadq)) != NULL) {
    KASSERT(entry->aref.ar_amap == NULL);
    KASSERT(!UVM_ET_ISSUBMAP(entry));
    KASSERT(!UVM_ET_ISOBJ(entry));
TAILQ_REMOVE(deadq, entry, dfree.deadq);
uvm_mapent_free(entry);
}
}
/*
* Create and insert new entry.
*
* Returned entry contains new addresses and is inserted properly in the tree.
* first and last are (probably) no longer valid.
*/
struct vm_map_entry*
uvm_map_mkentry(struct vm_map *map, struct vm_map_entry *first,
struct vm_map_entry *last, vaddr_t addr, vsize_t sz, int flags,
struct uvm_map_deadq *dead, struct vm_map_entry *new)
{
struct vm_map_entry *entry, *prev;
struct uvm_addr_state *free;
vaddr_t min, max; /* free space boundaries for new entry */
KDASSERT(map != NULL);
KDASSERT(first != NULL);
KDASSERT(last != NULL);
KDASSERT(dead != NULL);
KDASSERT(sz > 0);
KDASSERT(addr + sz > addr);
KDASSERT(first->end <= addr && VMMAP_FREE_END(first) > addr);
KDASSERT(last->start < addr + sz && VMMAP_FREE_END(last) >= addr + sz);
KDASSERT(uvm_map_isavail(map, NULL, &first, &last, addr, sz));
uvm_tree_sanity(map, __FILE__, __LINE__);
min = addr + sz;
max = VMMAP_FREE_END(last);
/* Initialize new entry. */
if (new == NULL)
entry = uvm_mapent_alloc(map, flags);
else
entry = new;
if (entry == NULL)
return NULL;
entry->offset = 0;
entry->etype = 0;
entry->wired_count = 0;
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = NULL;
entry->start = addr;
entry->end = min;
entry->guard = 0;
entry->fspace = 0;
/* Reset free space in first. */
free = uvm_map_uaddr_e(map, first);
uvm_mapent_free_remove(map, free, first);
first->guard = 0;
first->fspace = 0;
/*
* Remove all entries that are fully replaced.
* We are iterating using last in reverse order.
*/
for (; first != last; last = prev) {
prev = RBT_PREV(uvm_map_addr, last);
KDASSERT(last->start == last->end);
free = uvm_map_uaddr_e(map, last);
uvm_mapent_free_remove(map, free, last);
uvm_mapent_addr_remove(map, last);
DEAD_ENTRY_PUSH(dead, last);
}
/* Remove first if it is entirely inside <addr, addr+sz>. */
if (first->start == addr) {
uvm_mapent_addr_remove(map, first);
DEAD_ENTRY_PUSH(dead, first);
} else {
uvm_map_fix_space(map, first, VMMAP_FREE_START(first),
addr, flags);
}
/* Finally, link in entry. */
uvm_mapent_addr_insert(map, entry);
uvm_map_fix_space(map, entry, min, max, flags);
uvm_tree_sanity(map, __FILE__, __LINE__);
return entry;
}
/*
* uvm_mapent_alloc: allocate a map entry
*/
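/*
 * Three sources are used below: intrsafe maps (and the cold boot path)
 * take entries from the static kentry free list, kernel_map uses
 * uvm_map_entry_kmem_pool, and all other maps use uvm_map_entry_pool.
 */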
struct vm_map_entry *
uvm_mapent_alloc(struct vm_map *map, int flags)
{
struct vm_map_entry *me, *ne;
int pool_flags;
int i;
pool_flags = PR_WAITOK;
if (flags & UVM_FLAG_TRYLOCK)
pool_flags = PR_NOWAIT;
if (map->flags & VM_MAP_INTRSAFE || cold) {
mtx_enter(&uvm_kmapent_mtx);
if (SLIST_EMPTY(&uvm.kentry_free)) {
ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty,
&kd_nowait);
if (ne == NULL)
panic("uvm_mapent_alloc: cannot allocate map "
"entry");
for (i = 0; i < PAGE_SIZE / sizeof(*ne); i++) {
SLIST_INSERT_HEAD(&uvm.kentry_free,
&ne[i], daddrs.addr_kentry);
}
if (ratecheck(&uvm_kmapent_last_warn_time,
&uvm_kmapent_warn_rate))
printf("uvm_mapent_alloc: out of static "
"map entries\n");
}
me = SLIST_FIRST(&uvm.kentry_free);
SLIST_REMOVE_HEAD(&uvm.kentry_free, daddrs.addr_kentry);
uvmexp.kmapent++;
mtx_leave(&uvm_kmapent_mtx);
me->flags = UVM_MAP_STATIC;
} else if (map == kernel_map) {
splassert(IPL_NONE);
me = pool_get(&uvm_map_entry_kmem_pool, pool_flags);
if (me == NULL)
goto out;
me->flags = UVM_MAP_KMEM;
} else {
splassert(IPL_NONE);
me = pool_get(&uvm_map_entry_pool, pool_flags);
if (me == NULL)
goto out;
me->flags = 0;
}
RBT_POISON(uvm_map_addr, me, UVMMAP_DEADBEEF);
out:
return me;
}
/*
* uvm_mapent_free: free map entry
*
* => XXX: static pool for kernel map?
*/
void
uvm_mapent_free(struct vm_map_entry *me)
{
if (me->flags & UVM_MAP_STATIC) {
mtx_enter(&uvm_kmapent_mtx);
SLIST_INSERT_HEAD(&uvm.kentry_free, me, daddrs.addr_kentry);
uvmexp.kmapent--;
mtx_leave(&uvm_kmapent_mtx);
} else if (me->flags & UVM_MAP_KMEM) {
splassert(IPL_NONE);
pool_put(&uvm_map_entry_kmem_pool, me);
} else {
splassert(IPL_NONE);
pool_put(&uvm_map_entry_pool, me);
}
}
/*
* uvm_map_lookup_entry: find map entry at or before an address.
*
* => map must at least be read-locked by caller
* => entry is returned in "entry"
* => return value is true if address is in the returned entry
* ET_HOLE entries are considered to not contain a mapping, ergo FALSE is
* returned for those mappings.
*/
boolean_t
uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
struct vm_map_entry **entry)
{
*entry = uvm_map_entrybyaddr(&map->addr, address);
return *entry != NULL && !UVM_ET_ISHOLE(*entry) && (*entry)->start <= address && (*entry)->end > address;
}
/*
* Stack must be in a MAP_STACK entry. PROT_NONE indicates stack not yet
* grown -- then uvm_map_check_region_range() should not cache the entry
* because growth won't be seen.
*/
int
uvm_map_inentry_sp(vm_map_entry_t entry)
{
    if ((entry->etype & UVM_ET_STACK) == 0) {
        if (entry->protection == PROT_NONE)
return (-1); /* don't update range */
return (0);
}
return (1);
}
/*
* The system call must not come from a writeable entry, W^X is violated.
* (Would be nice if we can spot aliasing, which is also kind of bad)
*
* The system call must come from a syscall-labeled entry (which are
* the text regions of the main program, sigtramp, ld.so, or libc).
*/
int
uvm_map_inentry_pc(vm_map_entry_t entry)
{
    if (entry->protection & PROT_WRITE)
return (0); /* not permitted */
if ((entry->etype & UVM_ET_SYSCALL) == 0)
return (0); /* not permitted */
return (1);
}
int
uvm_map_inentry_recheck(u_long serial, vaddr_t addr, struct p_inentry *ie)
{
return (serial != ie->ie_serial || ie->ie_start == 0 ||
addr < ie->ie_start || addr >= ie->ie_end);
}
/*
* Inside a vm_map find the reg address and verify it via function.
* Remember low and high addresses of region if valid and return TRUE,
* else return FALSE.
*/
boolean_t
uvm_map_inentry_fix(struct proc *p, struct p_inentry *ie, vaddr_t addr,
int (*fn)(vm_map_entry_t), u_long serial)
{
vm_map_t map = &p->p_vmspace->vm_map;
vm_map_entry_t entry;
int ret;
if (addr < map->min_offset || addr >= map->max_offset)
return (FALSE);
/* lock map */
vm_map_lock_read(map);
/* lookup */
if (!uvm_map_lookup_entry(map, trunc_page(addr), &entry)) {
vm_map_unlock_read(map);
return (FALSE);
}
ret = (*fn)(entry);
if (ret == 0) {
vm_map_unlock_read(map);
return (FALSE);
} else if (ret == 1) {
ie->ie_start = entry->start;
ie->ie_end = entry->end;
ie->ie_serial = serial;
} else {
/* do not update, re-check later */
}
vm_map_unlock_read(map);
return (TRUE);
}
boolean_t
uvm_map_inentry(struct proc *p, struct p_inentry *ie, vaddr_t addr,
const char *fmt, int (*fn)(vm_map_entry_t), u_long serial)
{
union sigval sv;
boolean_t ok = TRUE;
if (uvm_map_inentry_recheck(serial, addr, ie)) {
ok = uvm_map_inentry_fix(p, ie, addr, fn, serial);
if (!ok) {
    KERNEL_LOCK();
printf(fmt, p->p_p->ps_comm, p->p_p->ps_pid, p->p_tid,
addr, ie->ie_start, ie->ie_end-1);
p->p_p->ps_acflag |= AMAP;
sv.sival_ptr = (void *)PROC_PC(p);
trapsignal(p, SIGSEGV, 0, SEGV_ACCERR, sv);
KERNEL_UNLOCK();
}
}
return (ok);
}
/*
* Check whether the given address range can be converted to a MAP_STACK
* mapping.
*
* Must be called with map locked.
*/
boolean_t
uvm_map_is_stack_remappable(struct vm_map *map, vaddr_t addr, vaddr_t sz)
{
vaddr_t end = addr + sz;
struct vm_map_entry *first, *iter, *prev = NULL;
if (!uvm_map_lookup_entry(map, addr, &first)) {
printf("map stack 0x%lx-0x%lx of map %p failed: no mapping\n",
addr, end, map);
return FALSE;
}
/*
* Check that the address range exists and is contiguous.
*/
for (iter = first; iter != NULL && iter->start < end;
prev = iter, iter = RBT_NEXT(uvm_map_addr, iter)) {
/*
* Make sure that we do not have holes in the range.
*/
#if 0
if (prev != NULL) {
printf("prev->start 0x%lx, prev->end 0x%lx, "
"iter->start 0x%lx, iter->end 0x%lx\n",
prev->start, prev->end, iter->start, iter->end);
}
#endif
if (prev != NULL && prev->end != iter->start) {
printf("map stack 0x%lx-0x%lx of map %p failed: "
"hole in range\n", addr, end, map);
return FALSE;
}
if (iter->start == iter->end || UVM_ET_ISHOLE(iter)) {
printf("map stack 0x%lx-0x%lx of map %p failed: "
"hole in range\n", addr, end, map);
return FALSE;
}
}
return TRUE;
}
/*
* Remap the middle-pages of an existing mapping as a stack range.
* If there exists a previous contiguous mapping with the given range
* [addr, addr + sz), with protection PROT_READ|PROT_WRITE, then the
* mapping is dropped, and a new anon mapping is created and marked as
* a stack.
*
* Must be called with map unlocked.
*/
int
uvm_map_remap_as_stack(struct proc *p, vaddr_t addr, vaddr_t sz)
{
vm_map_t map = &p->p_vmspace->vm_map;
vaddr_t start, end;
int error;
int flags = UVM_MAPFLAG(PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_INHERIT_COPY, MADV_NORMAL,
UVM_FLAG_STACK | UVM_FLAG_FIXED | UVM_FLAG_UNMAP |
UVM_FLAG_COPYONW);
start = round_page(addr);
end = trunc_page(addr + sz);
#ifdef MACHINE_STACK_GROWS_UP
if (end == addr + sz)
end -= PAGE_SIZE;
#else
if (start == addr)
start += PAGE_SIZE;
#endif
if (start < map->min_offset || end >= map->max_offset || end < start)
return EINVAL;
error = uvm_mapanon(map, &start, end - start, 0, flags);
if (error != 0)
printf("map stack for pid %d failed\n", p->p_p->ps_pid);
return error;
}
/*
* uvm_map_pie: return a random load address for a PIE executable
* properly aligned.
*/
#ifndef VM_PIE_MAX_ADDR
#define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4)
#endif
#ifndef VM_PIE_MIN_ADDR
#define VM_PIE_MIN_ADDR VM_MIN_ADDRESS
#endif
#ifndef VM_PIE_MIN_ALIGN
#define VM_PIE_MIN_ALIGN PAGE_SIZE
#endif
vaddr_t
uvm_map_pie(vaddr_t align)
{
vaddr_t addr, space, min;
align = MAX(align, VM_PIE_MIN_ALIGN);
/* round up to next alignment */
min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1);
if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR)
return (align);
space = (VM_PIE_MAX_ADDR - min) / align;
space = MIN(space, (u_int32_t)-1);
addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align;
addr += min;
return (addr);
}
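/*
 * Worked example (illustrative numbers): with align = 1MB, a minimum
 * address of 0 and VM_PIE_MAX_ADDR = 1GB, min rounds to 0, space = 1024
 * slots, and the result is a uniformly chosen 1MB-aligned address below
 * 1GB.
 */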
void
uvm_unmap(struct vm_map *map, vaddr_t start, vaddr_t end)
{
struct uvm_map_deadq dead;
KASSERT((start & (vaddr_t)PAGE_MASK) == 0 &&
(end & (vaddr_t)PAGE_MASK) == 0);
TAILQ_INIT(&dead);
vm_map_lock(map);
uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
vm_map_unlock(map);
if (map->flags & VM_MAP_INTRSAFE)
uvm_unmap_detach_intrsafe(&dead);
else
uvm_unmap_detach(&dead, 0);
}
/*
* Mark entry as free.
*
* entry will be put on the dead list.
* The free space will be merged into the previous or a new entry,
* unless markfree is false.
*/
void
uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry,
struct vm_map_entry **prev_ptr, struct uvm_map_deadq *dead,
boolean_t markfree)
{
struct uvm_addr_state *free;
struct vm_map_entry *prev;
vaddr_t addr; /* Start of freed range. */
vaddr_t end; /* End of freed range. */
prev = *prev_ptr;
if (prev == entry)
*prev_ptr = prev = NULL;
if (prev == NULL ||
VMMAP_FREE_END(prev) != entry->start)
prev = RBT_PREV(uvm_map_addr, entry);
/* Entry is describing only free memory and has nothing to drain into. */
if (prev == NULL && entry->start == entry->end && markfree) {
*prev_ptr = entry;
return;
}
addr = entry->start;
end = VMMAP_FREE_END(entry);
free = uvm_map_uaddr_e(map, entry);
uvm_mapent_free_remove(map, free, entry);
uvm_mapent_addr_remove(map, entry);
DEAD_ENTRY_PUSH(dead, entry);
if (markfree) {
    if (prev) {
        free = uvm_map_uaddr_e(map, prev);
uvm_mapent_free_remove(map, free, prev);
}
*prev_ptr = uvm_map_fix_space(map, prev, addr, end, 0);
}
}
/*
* Unwire and release referenced amap and object from map entry.
*/
void
uvm_unmap_kill_entry_withlock(struct vm_map *map, struct vm_map_entry *entry,
int needlock)
{
/* Unwire removed map entry. */
if (VM_MAPENT_ISWIRED(entry)) {
    KERNEL_LOCK();
entry->wired_count = 0;
uvm_fault_unwire_locked(map, entry->start, entry->end);
KERNEL_UNLOCK();
}
if (needlock)
uvm_map_lock_entry(entry);
/* Entry-type specific code. */
if (UVM_ET_ISHOLE(entry)) {
/* Nothing to be done for holes. */
} else if (map->flags & VM_MAP_INTRSAFE) {
    KASSERT(vm_map_pmap(map) == pmap_kernel());
    uvm_km_pgremove_intrsafe(entry->start, entry->end);
} else if (UVM_ET_ISOBJ(entry) &&
UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
KASSERT(vm_map_pmap(map) == pmap_kernel());
/*
* Note: kernel object mappings are currently used in
* two ways:
* [1] "normal" mappings of pages in the kernel object
* [2] uvm_km_valloc'd allocations in which we
* pmap_enter in some non-kernel-object page
* (e.g. vmapbuf).
*
* for case [1], we need to remove the mapping from
* the pmap and then remove the page from the kernel
* object (because, once pages in a kernel object are
* unmapped they are no longer needed, unlike, say,
* a vnode where you might want the data to persist
* until flushed out of a queue).
*
* for case [2], we need to remove the mapping from
* the pmap. there shouldn't be any pages at the
* specified offset in the kernel object [but it
* doesn't hurt to call uvm_km_pgremove just to be
* safe?]
*
* uvm_km_pgremove currently does the following:
* for pages in the kernel object range:
* - drops the swap slot
* - uvm_pagefree the page
*
* note there is a version of uvm_km_pgremove() that
* is used for "intrsafe" objects.
*/
/*
* remove mappings from pmap and drop the pages
* from the object. offsets are always relative
* to vm_map_min(kernel_map).
*/
uvm_km_pgremove(entry->object.uvm_obj, entry->start,
entry->end);
} else {
/* remove mappings the standard way. */
pmap_remove(map->pmap, entry->start, entry->end);
}
if (needlock)
uvm_map_unlock_entry(entry);
}
void
uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry)
{
uvm_unmap_kill_entry_withlock(map, entry, 0);
}
/*
* Remove all entries from start to end.
*
* If remove_holes, then remove ET_HOLE entries as well.
* If markfree, entry will be properly marked free, otherwise, no replacement
* entry will be put in the tree (corrupting the tree).
*/
void
uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
struct uvm_map_deadq *dead, boolean_t remove_holes,
boolean_t markfree)
{
struct vm_map_entry *prev_hint, *next, *entry;
start = MAX(start, map->min_offset);
end = MIN(end, map->max_offset);
if (start >= end)
return;
if ((map->flags & VM_MAP_INTRSAFE) == 0)
splassert(IPL_NONE);
else
splassert(IPL_VM);
/* Find first affected entry. */
entry = uvm_map_entrybyaddr(&map->addr, start);
KDASSERT(entry != NULL && entry->start <= start);
if (entry->end <= start && markfree)
entry = RBT_NEXT(uvm_map_addr, entry);
else
UVM_MAP_CLIP_START(map, entry, start);
/*
* Iterate entries until we reach end address.
* prev_hint hints where the freed space can be appended to.
*/
prev_hint = NULL;
for (; entry != NULL && entry->start < end; entry = next) {
KDASSERT(entry->start >= start);
if (entry->end > end || !markfree)
UVM_MAP_CLIP_END(map, entry, end);
KDASSERT(entry->start >= start && entry->end <= end);
next = RBT_NEXT(uvm_map_addr, entry);
/* Don't remove holes unless asked to do so. */
if (UVM_ET_ISHOLE(entry)) {
if (!remove_holes) {
prev_hint = entry;
continue;
}
}
/* A stack has been removed. */
if (UVM_ET_ISSTACK(entry) && (map->flags & VM_MAP_ISVMSPACE))
    map->sserial++;
/* Kill entry. */
uvm_unmap_kill_entry_withlock(map, entry, 1);
/* Update space usage. */
if ((map->flags & VM_MAP_ISVMSPACE) && entry->object.uvm_obj == NULL && entry->protection != PROT_NONE &&
!UVM_ET_ISHOLE(entry)) {
((struct vmspace *)map)->vm_dused -=
uvmspace_dused(map, entry->start, entry->end);
}
if (!UVM_ET_ISHOLE(entry))
    map->size -= entry->end - entry->start;
/* Actual removal of entry. */
uvm_mapent_mkfree(map, entry, &prev_hint, dead, markfree);
}
pmap_update(vm_map_pmap(map));
#ifdef VMMAP_DEBUG
if (markfree) {
for (entry = uvm_map_entrybyaddr(&map->addr, start);
entry != NULL && entry->start < end;
entry = RBT_NEXT(uvm_map_addr, entry)) {
KDASSERT(entry->end <= start ||
entry->start == entry->end ||
UVM_ET_ISHOLE(entry));
}
} else {
vaddr_t a;
for (a = start; a < end; a += PAGE_SIZE)
KDASSERT(uvm_map_entrybyaddr(&map->addr, a) == NULL);
}
#endif
}
/*
* Mark all entries from first until end (exclusive) as pageable.
*
* Lock must be exclusive on entry and will not be touched.
*/
void
uvm_map_pageable_pgon(struct vm_map *map, struct vm_map_entry *first,
struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr)
{
struct vm_map_entry *iter;
for (iter = first; iter != end;
iter = RBT_NEXT(uvm_map_addr, iter)) {
KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
if (!VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
continue;
iter->wired_count = 0;
uvm_fault_unwire_locked(map, iter->start, iter->end);
}
}
/*
* Mark all entries from first until end (exclusive) as wired.
*
* Lockflags determines the lock state on return from this function.
* Lock must be exclusive on entry.
*/
int
uvm_map_pageable_wire(struct vm_map *map, struct vm_map_entry *first,
struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr,
int lockflags)
{
struct vm_map_entry *iter;
#ifdef DIAGNOSTIC
unsigned int timestamp_save;
#endif
int error;
/*
* Wire pages in two passes:
*
* 1: holding the write lock, we create any anonymous maps that need
* to be created. then we clip each map entry to the region to
* be wired and increment its wiring count.
*
* 2: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the pages for any newly wired area (wired_count == 1).
*
* downgrading to a read lock for uvm_fault_wire avoids a possible
* deadlock with another thread that may have faulted on one of
* the pages to be wired (it would mark the page busy, blocking
* us, then in turn block on the map lock that we hold).
* because we keep the read lock on the map, the copy-on-write
* status of the entries we modify here cannot change.
*/
for (iter = first; iter != end;
iter = RBT_NEXT(uvm_map_addr, iter)) {
KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
iter->protection == PROT_NONE)
continue;
/*
* Perform actions of vm_map_lookup that need the write lock.
* - create an anonymous map for copy-on-write
* - anonymous map for zero-fill
* Skip submaps.
*/
if (!VM_MAPENT_ISWIRED(iter) && !UVM_ET_ISSUBMAP(iter) &&
UVM_ET_ISNEEDSCOPY(iter) &&
((iter->protection & PROT_WRITE) ||
iter->object.uvm_obj == NULL)) {
amap_copy(map, iter, M_WAITOK,
UVM_ET_ISSTACK(iter) ? FALSE : TRUE,
iter->start, iter->end);
}
iter->wired_count++;
}
/*
* Pass 2.
*/
#ifdef DIAGNOSTIC
timestamp_save = map->timestamp;
#endif
vm_map_busy(map);
vm_map_downgrade(map);
error = 0;
for (iter = first; error == 0 && iter != end;
iter = RBT_NEXT(uvm_map_addr, iter)) {
if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
iter->protection == PROT_NONE)
continue;
error = uvm_fault_wire(map, iter->start, iter->end,
iter->protection);
}
if (error) {
/*
* uvm_fault_wire failure
*
* Reacquire lock and undo our work.
*/
vm_map_upgrade(map);
vm_map_unbusy(map);
#ifdef DIAGNOSTIC
if (timestamp_save != map->timestamp)
panic("uvm_map_pageable_wire: stale map");
#endif
/*
* first is no longer needed to restart loops.
* Use it as iterator to unmap successful mappings.
*/
for (; first != iter;
first = RBT_NEXT(uvm_map_addr, first)) {
if (UVM_ET_ISHOLE(first) || first->start == first->end ||
first->protection == PROT_NONE)
continue;
first->wired_count--;
if (!VM_MAPENT_ISWIRED(first)) {
    uvm_fault_unwire_locked(map,
first->start, first->end);
}
}
/* decrease counter in the rest of the entries */
for (; iter != end;
iter = RBT_NEXT(uvm_map_addr, iter)) {
if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
iter->protection == PROT_NONE)
continue;
iter->wired_count--;
}
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
return error;
}
/* We are currently holding a read lock. */
if ((lockflags & UVM_LK_EXIT) == 0) {
vm_map_unbusy(map);
vm_map_unlock_read(map);
} else {
vm_map_upgrade(map);
vm_map_unbusy(map);
#ifdef DIAGNOSTIC
if (timestamp_save != map->timestamp)
    panic("uvm_map_pageable_wire: stale map");
#endif
}
return 0;
}
/*
* uvm_map_pageable: set pageability of a range in a map.
*
* Flags:
* UVM_LK_ENTER: map is already locked by caller
* UVM_LK_EXIT: don't unlock map on exit
*
* The full range must be in use (entries may not have fspace != 0).
* UVM_ET_HOLE counts as unmapped.
*/
int
uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
boolean_t new_pageable, int lockflags)
{
struct vm_map_entry *first, *last, *tmp;
int error;
start = trunc_page(start);
end = round_page(end);
if (start > end)
return EINVAL;
if (start == end)
return 0; /* nothing to do */
if (start < map->min_offset)
return EFAULT; /* why? see first XXX below */
if (end > map->max_offset)
return EINVAL; /* why? see second XXX below */
KASSERT(map->flags & VM_MAP_PAGEABLE);

if ((lockflags & UVM_LK_ENTER) == 0)
    vm_map_lock(map);
/*
* Find first entry.
*
* Initial test on start is different, because of the different
* error returned. Rest is tested further down.
*/
first = uvm_map_entrybyaddr(&map->addr, start);
if (first->end <= start || UVM_ET_ISHOLE(first)) {
/*
* XXX if the first address is not mapped, it is EFAULT?
*/
error = EFAULT;
goto out;
}
/* Check that the range has no holes. */
for (last = first; last != NULL && last->start < end;
last = RBT_NEXT(uvm_map_addr, last)) {
if (UVM_ET_ISHOLE(last) || (last->end < end && VMMAP_FREE_END(last) != last->end)) {
/*
* XXX unmapped memory in range, why is it EINVAL
* instead of EFAULT?
*/
error = EINVAL;
goto out;
}
}
/*
* Last ended at the first entry after the range.
* Move back one step.
*
* Note that last may be NULL.
*/
if (last == NULL) {
last = RBT_MAX(uvm_map_addr, &map->addr);
if (last->end < end) {
error = EINVAL;
goto out;
}
} else {
KASSERT(last != first);
last = RBT_PREV(uvm_map_addr, last);
}
/* Wire/unwire pages here. */
if (new_pageable) {
/*
* Mark pageable.
* entries that are not wired are untouched.
*/
if (VM_MAPENT_ISWIRED(first))
    UVM_MAP_CLIP_START(map, first, start);
/*
* Split last at end.
* Make tmp be the first entry after what is to be touched.
* If last is not wired, don't touch it.
*/
if (VM_MAPENT_ISWIRED(last)) {
    UVM_MAP_CLIP_END(map, last, end);
tmp = RBT_NEXT(uvm_map_addr, last);
} else
tmp = last;
uvm_map_pageable_pgon(map, first, tmp, start, end);
error = 0;
out:
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
return error;
} else {
/*
* Mark entries wired.
* entries are always touched (because recovery needs this).
*/
if (!VM_MAPENT_ISWIRED(first))
    UVM_MAP_CLIP_START(map, first, start);
/*
* Split last at end.
* Make tmp be the first entry after what is to be touched.
* If last is not wired, don't touch it.
*/
if (!VM_MAPENT_ISWIRED(last)) {
    UVM_MAP_CLIP_END(map, last, end);
tmp = RBT_NEXT(uvm_map_addr, last);
} else
tmp = last;
return uvm_map_pageable_wire(map, first, tmp, start, end,
lockflags);
}
}
/*
* uvm_map_pageable_all: special case of uvm_map_pageable - affects
* all mapped regions.
*
* Map must not be locked.
* If no flags are specified, all regions are unwired.
*/
int
uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
{
vsize_t size;
struct vm_map_entry *iter;
KASSERT(map->flags & VM_MAP_PAGEABLE);
vm_map_lock(map);
if (flags == 0) {
uvm_map_pageable_pgon(map, RBT_MIN(uvm_map_addr, &map->addr),
NULL, map->min_offset, map->max_offset);
vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
vm_map_unlock(map);
return 0;
}
if (flags & MCL_FUTURE)
    vm_map_modflags(map, VM_MAP_WIREFUTURE, 0);
if (!(flags & MCL_CURRENT)) {
vm_map_unlock(map);
return 0;
}
/*
* Count number of pages in all non-wired entries.
* If the number exceeds the limit, abort.
*/
size = 0;
RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
if (VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
continue;
size += iter->end - iter->start;
}
if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
vm_map_unlock(map);
return ENOMEM;
}
/* XXX non-pmap_wired_count case must be handled by caller */
#ifdef pmap_wired_count
if (limit != 0 &&
size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit) {
vm_map_unlock(map);
return ENOMEM;
}
#endif
/*
* uvm_map_pageable_wire will release lock
*/
return uvm_map_pageable_wire(map, RBT_MIN(uvm_map_addr, &map->addr),
NULL, map->min_offset, map->max_offset, 0);
}
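/*
 * Usage note (hedged): this is the natural backend for an
 * mlockall(2)-style request. MCL_CURRENT wires what is mapped now,
 * while MCL_FUTURE sets VM_MAP_WIREFUTURE so that later mappings are
 * wired as they appear (see the VM_MAP_WIREFUTURE handling in
 * uvm_map_protect() below).
 */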
/*
* Initialize map.
*
* Allocates sufficient entries to describe the free memory in the map.
*/
void
uvm_map_setup(struct vm_map *map, pmap_t pmap, vaddr_t min, vaddr_t max,
int flags)
{
int i;
KASSERT((min & (vaddr_t)PAGE_MASK) == 0);
KASSERT((max & (vaddr_t)PAGE_MASK) == 0 ||
(max & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
/*
* Update parameters.
*
* This code handles (vaddr_t)-1 and other page mask ending addresses
* properly.
* We lose the top page if the full virtual address space is used.
*/
if (max & (vaddr_t)PAGE_MASK) {
max += 1;
if (max == 0) /* overflow */
max -= PAGE_SIZE;
}
RBT_INIT(uvm_map_addr, &map->addr);
map->uaddr_exe = NULL;
for (i = 0; i < nitems(map->uaddr_any); ++i)
map->uaddr_any[i] = NULL;
map->uaddr_brk_stack = NULL;
map->pmap = pmap;
map->size = 0;
map->ref_count = 0;
map->min_offset = min;
map->max_offset = max;
map->b_start = map->b_end = 0; /* Empty brk() area by default. */
map->s_start = map->s_end = 0; /* Empty stack area by default. */
map->flags = flags;
map->timestamp = 0;
if (flags & VM_MAP_ISVMSPACE)
rw_init_flags(&map->lock, "vmmaplk", RWL_DUPOK);
else
rw_init(&map->lock, "kmmaplk");
mtx_init(&map->mtx, IPL_VM);
mtx_init(&map->flags_lock, IPL_VM);
/* Configure the allocators. */
if (flags & VM_MAP_ISVMSPACE)
uvm_map_setup_md(map);
else
map->uaddr_any[3] = &uaddr_kbootstrap;
/*
* Fill map entries.
* We do not need to write-lock the map here because only the current
* thread sees it right now. Initialize ref_count to 0 above to avoid
* bogus triggering of lock-not-held assertions.
*/
uvm_map_setup_entries(map);
uvm_tree_sanity(map, __FILE__, __LINE__);
map->ref_count = 1;
}
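/*
 * Usage sketch: callers allocate the map storage first and then let
 * uvm_map_setup() initialize it, e.g. uvm_map_create() later in this
 * file does:
 *
 *	map = malloc(sizeof *map, M_VMMAP, M_WAITOK);
 *	uvm_map_setup(map, pmap, min, max, flags);
 *
 * uvmspace_init() does the same for the vm_map embedded in a vmspace.
 */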
/*
* Destroy the map.
*
* This is the inverse operation to uvm_map_setup.
*/
void
uvm_map_teardown(struct vm_map *map)
{
struct uvm_map_deadq dead_entries;
struct vm_map_entry *entry, *tmp;
#ifdef VMMAP_DEBUG
size_t numq, numt;
#endif
int i;
KERNEL_ASSERT_LOCKED();
KERNEL_UNLOCK();
KERNEL_ASSERT_UNLOCKED();
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
/* Remove address selectors. */
uvm_addr_destroy(map->uaddr_exe);
map->uaddr_exe = NULL;
for (i = 0; i < nitems(map->uaddr_any); i++) {
uvm_addr_destroy(map->uaddr_any[i]);
map->uaddr_any[i] = NULL;
}
uvm_addr_destroy(map->uaddr_brk_stack);
map->uaddr_brk_stack = NULL;
/*
* Remove entries.
*
* The following is based on graph breadth-first search.
*
* In color terms:
* - the dead_entries set contains all nodes that are reachable
* (i.e. both the black and the grey nodes)
* - any entry not in dead_entries is white
* - any entry that appears in dead_entries before entry,
* is black, the rest is grey.
* The set [entry, end] is also referred to as the wavefront.
*
* Since the tree is always a fully connected graph, the breadth-first
* search guarantees that each vmmap_entry is visited exactly once.
* The vm_map is broken down in linear time.
*/
TAILQ_INIT(&dead_entries);
if ((entry = RBT_ROOT(uvm_map_addr, &map->addr)) != NULL)
DEAD_ENTRY_PUSH(&dead_entries, entry);
while (entry != NULL) {
sched_pause(yield);
uvm_unmap_kill_entry(map, entry);
if ((tmp = RBT_LEFT(uvm_map_addr, entry)) != NULL)
DEAD_ENTRY_PUSH(&dead_entries, tmp);
if ((tmp = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
DEAD_ENTRY_PUSH(&dead_entries, tmp);
/* Update wave-front. */
entry = TAILQ_NEXT(entry, dfree.deadq);
}
#ifdef VMMAP_DEBUG
numt = numq = 0;
RBT_FOREACH(entry, uvm_map_addr, &map->addr)
numt++;
TAILQ_FOREACH(entry, &dead_entries, dfree.deadq)
numq++;
KASSERT(numt == numq);
#endif
uvm_unmap_detach(&dead_entries, UVM_PLA_WAITOK);
KERNEL_LOCK();
pmap_destroy(map->pmap);
map->pmap = NULL;
}
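/*
 * Note: uvm_map_teardown() is the counterpart used when the last
 * reference to a vmspace goes away; see uvmspace_free() below, which
 * calls it before returning the vmspace to its pool.
 */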
/*
* Populate map with free-memory entries.
*
* Map must be initialized and empty.
*/
void
uvm_map_setup_entries(struct vm_map *map)
{
KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
uvm_map_fix_space(map, NULL, map->min_offset, map->max_offset, 0);
}
/*
* Split entry at given address.
*
* orig: entry that is to be split.
* next: a newly allocated map entry that is not linked.
* split: address at which the split is done.
*/
void
uvm_map_splitentry(struct vm_map *map, struct vm_map_entry *orig,
struct vm_map_entry *next, vaddr_t split)
{
struct uvm_addr_state *free, *free_before;
vsize_t adj;
if ((split & PAGE_MASK) != 0) {
panic("uvm_map_splitentry: split address 0x%lx "
"not on page boundary!", split);
}
KDASSERT(map != NULL && orig != NULL && next != NULL);
uvm_tree_sanity(map, __FILE__, __LINE__);
KASSERT(orig->start < split && VMMAP_FREE_END(orig) > split);
#ifdef VMMAP_DEBUG
KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, orig) == orig);
KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, next) != next);
#endif /* VMMAP_DEBUG */
/*
* Free space will change, unlink from free space tree.
*/
free = uvm_map_uaddr_e(map, orig);
uvm_mapent_free_remove(map, free, orig);
adj = split - orig->start;
uvm_mapent_copy(orig, next);
if (split >= orig->end) {
next->etype = 0;
next->offset = 0;
next->wired_count = 0;
next->start = next->end = split;
next->guard = 0;
next->fspace = VMMAP_FREE_END(orig) - split;
next->aref.ar_amap = NULL;
next->aref.ar_pageoff = 0;
orig->guard = MIN(orig->guard, split - orig->end);
orig->fspace = split - VMMAP_FREE_START(orig);
} else {
orig->fspace = 0;
orig->guard = 0;
orig->end = next->start = split;
if (next->aref.ar_amap) {
amap_splitref(&orig->aref, &next->aref, adj);
}
if (UVM_ET_ISSUBMAP(orig)) {
uvm_map_reference(next->object.sub_map);
next->offset += adj;
} else if (UVM_ET_ISOBJ(orig)) {
if (next->object.uvm_obj->pgops &&
next->object.uvm_obj->pgops->pgo_reference) {
KERNEL_LOCK();
next->object.uvm_obj->pgops->pgo_reference(
next->object.uvm_obj);
KERNEL_UNLOCK();
}
next->offset += adj;
}
}
/*
* Link next into address tree.
* Link orig and next into free-space tree.
*
* Don't insert 'next' into the addr tree until orig has been linked,
* in case the free-list looks at adjacent entries in the addr tree
* for its decisions.
*/
if (orig->fspace > 0)
free_before = free;
else
free_before = uvm_map_uaddr_e(map, orig);
uvm_mapent_free_insert(map, free_before, orig);
uvm_mapent_addr_insert(map, next);
uvm_mapent_free_insert(map, free, next);
uvm_tree_sanity(map, __FILE__, __LINE__);
}
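/*
 * Illustrative sketch of the two split cases handled above:
 *
 *	split < orig->end (split lands in the used part):
 *	  before: orig [start ........ end | guard/fspace]
 *	  after:  orig [start .. split]  next [split .. end | guard/fspace]
 *
 *	split >= orig->end (split lands in the free space):
 *	  before: orig [start .. end | .......... fspace]
 *	  after:  orig [start .. end | fspace up to split]  next [empty | rest]
 */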
#ifdef VMMAP_DEBUG
void
uvm_tree_assert(struct vm_map *map, int test, char *test_str,
char *file, int line)
{
char* map_special;
if (test)
return;
if (map == kernel_map)
map_special = " (kernel_map)";
else if (map == kmem_map)
map_special = " (kmem_map)";
else
map_special = "";
panic("uvm_tree_sanity %p%s (%s %d): %s", map, map_special, file,
line, test_str);
}
/*
* Check that map is sane.
*/
void
uvm_tree_sanity(struct vm_map *map, char *file, int line)
{
struct vm_map_entry *iter;
vaddr_t addr;
vaddr_t min, max, bound; /* Bounds checker. */
struct uvm_addr_state *free;
addr = vm_map_min(map);
RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
/*
* Valid start, end.
* Catch overflow for end+fspace.
*/
UVM_ASSERT(map, iter->end >= iter->start, file, line);
UVM_ASSERT(map, VMMAP_FREE_END(iter) >= iter->end, file, line);
/* May not be empty. */
UVM_ASSERT(map, iter->start < VMMAP_FREE_END(iter),
file, line);
/* Addresses for entry must lie within map boundaries. */
UVM_ASSERT(map, iter->start >= vm_map_min(map) &&
VMMAP_FREE_END(iter) <= vm_map_max(map), file, line);
/* Tree may not have gaps. */
UVM_ASSERT(map, iter->start == addr, file, line);
addr = VMMAP_FREE_END(iter);
/*
* Free space may not cross boundaries, unless the same
* free list is used on both sides of the border.
*/
min = VMMAP_FREE_START(iter);
max = VMMAP_FREE_END(iter);
while (min < max &&
(bound = uvm_map_boundary(map, min, max)) != max) {
UVM_ASSERT(map,
uvm_map_uaddr(map, bound - 1) ==
uvm_map_uaddr(map, bound),
file, line);
min = bound;
}
free = uvm_map_uaddr_e(map, iter);
if (free) {
UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) != 0,
file, line);
} else {
UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) == 0,
file, line);
}
}
UVM_ASSERT(map, addr == vm_map_max(map), file, line);
}
void
uvm_tree_size_chk(struct vm_map *map, char *file, int line)
{
struct vm_map_entry *iter;
vsize_t size;
size = 0;
RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
if (!UVM_ET_ISHOLE(iter))
size += iter->end - iter->start;
}
if (map->size != size)
printf("map size = 0x%lx, should be 0x%lx\n", map->size, size);
UVM_ASSERT(map, map->size == size, file, line);
vmspace_validate(map);
}
/*
* This function validates the statistics on vmspace.
*/
void
vmspace_validate(struct vm_map *map)
{
struct vmspace *vm;
struct vm_map_entry *iter;
vaddr_t imin, imax;
vaddr_t stack_begin, stack_end; /* Position of stack. */
vsize_t stack, heap; /* Measured sizes. */
if (!(map->flags & VM_MAP_ISVMSPACE))
return;
vm = (struct vmspace *)map;
stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
stack = heap = 0;
RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
imin = imax = iter->start;
if (UVM_ET_ISHOLE(iter) || iter->object.uvm_obj != NULL ||
iter->prot != PROT_NONE)
continue;
/*
* Update stack, heap.
* Keep in mind that (theoretically) the entries of
* userspace and stack may be joined.
*/
while (imin != iter->end) {
/*
* Set imax to the first boundary crossed between
* imin and stack addresses.
*/
imax = iter->end;
if (imin < stack_begin && imax > stack_begin)
imax = stack_begin;
else if (imin < stack_end && imax > stack_end)
imax = stack_end;
if (imin >= stack_begin && imin < stack_end)
stack += imax - imin;
else
heap += imax - imin;
imin = imax;
}
}
heap >>= PAGE_SHIFT;
if (heap != vm->vm_dused) {
printf("vmspace stack range: 0x%lx-0x%lx\n",
stack_begin, stack_end);
panic("vmspace_validate: vmspace.vm_dused invalid, "
"expected %ld pgs, got %ld pgs in map %p",
heap, vm->vm_dused,
map);
}
}
#endif /* VMMAP_DEBUG */
/*
* uvm_map_init: init mapping system at boot time. note that we allocate
* and init the static pool of structs vm_map_entry for the kernel here.
*/
void
uvm_map_init(void)
{
static struct vm_map_entry kernel_map_entry[MAX_KMAPENT];
int lcv;
/* now set up static pool of kernel map entries ... */
mtx_init(&uvm_kmapent_mtx, IPL_VM);
SLIST_INIT(&uvm.kentry_free);
for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) {
SLIST_INSERT_HEAD(&uvm.kentry_free,
&kernel_map_entry[lcv], daddrs.addr_kentry);
}
/* initialize the map-related pools. */
pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), 0,
IPL_NONE, PR_WAITOK, "vmsppl", NULL);
pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), 0,
IPL_VM, PR_WAITOK, "vmmpepl", NULL);
pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry), 0,
IPL_VM, 0, "vmmpekpl", NULL);
pool_sethiwat(&uvm_map_entry_pool, 8192);
uvm_addr_init();
}
#if defined(DDB)
/*
* DDB hooks
*/
/*
* uvm_map_printit: actually prints the map
*/
void
uvm_map_printit(struct vm_map *map, boolean_t full,
int (*pr)(const char *, ...))
{
struct vmspace *vm;
struct vm_map_entry *entry;
struct uvm_addr_state *free;
int in_free, i;
char buf[8];
(*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset);
(*pr)("\tbrk() allocate range: 0x%lx-0x%lx\n",
map->b_start, map->b_end);
(*pr)("\tstack allocate range: 0x%lx-0x%lx\n",
map->s_start, map->s_end);
(*pr)("\tsz=%u, ref=%d, version=%u, flags=0x%x\n",
map->size, map->ref_count, map->timestamp,
map->flags);
(*pr)("\tpmap=%p(resident=%d)\n", map->pmap,
pmap_resident_count(map->pmap));
/* struct vmspace handling. */
if (map->flags & VM_MAP_ISVMSPACE) {
vm = (struct vmspace *)map;
(*pr)("\tvm_refcnt=%d vm_shm=%p vm_rssize=%u vm_swrss=%u\n",
vm->vm_refcnt, vm->vm_shm, vm->vm_rssize, vm->vm_swrss);
(*pr)("\tvm_tsize=%u vm_dsize=%u\n",
vm->vm_tsize, vm->vm_dsize);
(*pr)("\tvm_taddr=%p vm_daddr=%p\n",
vm->vm_taddr, vm->vm_daddr);
(*pr)("\tvm_maxsaddr=%p vm_minsaddr=%p\n",
vm->vm_maxsaddr, vm->vm_minsaddr);
}
if (!full)
goto print_uaddr;
RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
(*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n",
entry, entry->start, entry->end, entry->object.uvm_obj,
(long long)entry->offset, entry->aref.ar_amap,
entry->aref.ar_pageoff);
(*pr)("\tsubmap=%c, cow=%c, nc=%c, stack=%c, "
"syscall=%c, prot(max)=%d/%d, inh=%d, "
"wc=%d, adv=%d\n",
(entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
(entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
(entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
(entry->etype & UVM_ET_STACK) ? 'T' : 'F',
(entry->etype & UVM_ET_SYSCALL) ? 'T' : 'F',
entry->protection, entry->max_protection,
entry->inheritance, entry->wired_count, entry->advice);
free = uvm_map_uaddr_e(map, entry);
in_free = (free != NULL);
(*pr)("\thole=%c, free=%c, guard=0x%lx, "
"free=0x%lx-0x%lx\n",
(entry->etype & UVM_ET_HOLE) ? 'T' : 'F',
in_free ? 'T' : 'F',
entry->guard,
VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
(*pr)("\tfspace_augment=%lu\n", entry->fspace_augment);
(*pr)("\tfreemapped=%c, uaddr=%p\n",
(entry->etype & UVM_ET_FREEMAPPED) ? 'T' : 'F', free);
if (free) {
(*pr)("\t\t(0x%lx-0x%lx %s)\n",
free->uaddr_minaddr, free->uaddr_maxaddr,
free->uaddr_functions->uaddr_name);
}
}
print_uaddr:
uvm_addr_print(map->uaddr_exe, "exe", full, pr);
for (i = 0; i < nitems(map->uaddr_any); i++) {
snprintf(&buf[0], sizeof(buf), "any[%d]", i);
uvm_addr_print(map->uaddr_any[i], &buf[0], full, pr);
}
uvm_addr_print(map->uaddr_brk_stack, "brk/stack", full, pr);
}
/*
* uvm_object_printit: actually prints the object
*/
void
uvm_object_printit(struct uvm_object *uobj, boolean_t full,
int (*pr)(const char *, ...))
{
struct vm_page *pg;
int cnt = 0;
(*pr)("OBJECT %p: pgops=%p, npages=%d, ",
uobj, uobj->pgops, uobj->uo_npages);
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
(*pr)("refs=<SYSTEM>\n");
else
(*pr)("refs=%d\n", uobj->uo_refs);
if (!full) {
return;
}
(*pr)(" PAGES <pg,offset>:\n ");
RBT_FOREACH(pg, uvm_objtree, &uobj->memt) {
(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
if ((cnt % 3) == 2) {
(*pr)("\n ");
}
cnt++;
}
if ((cnt % 3) != 2) {
(*pr)("\n");
}
}
/*
* uvm_page_printit: actually print the page
*/
static const char page_flagbits[] =
"\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY"
"\11ZERO\12DEV\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ"
"\27ENCRYPT\31PMAP0\32PMAP1\33PMAP2\34PMAP3\35PMAP4\36PMAP5";
void
uvm_page_printit(struct vm_page *pg, boolean_t full,
int (*pr)(const char *, ...))
{
struct vm_page *tpg;
struct uvm_object *uobj;
struct pglist *pgl;
(*pr)("PAGE %p:\n", pg);
(*pr)(" flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n",
pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count,
(long long)pg->phys_addr);
(*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
pg->uobject, pg->uanon, (long long)pg->offset);
#if defined(UVM_PAGE_TRKOWN)
if (pg->pg_flags & PG_BUSY)
(*pr)(" owning thread = %d, tag=%s",
pg->owner, pg->owner_tag);
else
(*pr)(" page not busy, no owner");
#else
(*pr)(" [page ownership tracking disabled]");
#endif
(*pr)("\tvm_page_md %p\n", &pg->mdpage);
if (!full)
return;
/* cross-verify object/anon */
if ((pg->pg_flags & PQ_FREE) == 0) {
if (pg->pg_flags & PQ_ANON) {
if (pg->uanon == NULL || pg->uanon->an_page != pg)
(*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
(pg->uanon) ? pg->uanon->an_page : NULL);
else
(*pr)(" anon backpointer is OK\n");
} else {
uobj = pg->uobject;
if (uobj) {
(*pr)(" checking object list\n");
RBT_FOREACH(tpg, uvm_objtree, &uobj->memt) {
if (tpg == pg) {
break;
}
}
if (tpg)
(*pr)(" page found on object list\n");
else
(*pr)(" >>> PAGE NOT FOUND "
"ON OBJECT LIST! <<<\n");
}
}
}
/* cross-verify page queue */
if (pg->pg_flags & PQ_FREE) {
if (uvm_pmr_isfree(pg))
(*pr)(" page found in uvm_pmemrange\n");
else
(*pr)(" >>> page not found in uvm_pmemrange <<<\n");
pgl = NULL;
} else if (pg->pg_flags & PQ_INACTIVE) {
pgl = &uvm.page_inactive;
} else if (pg->pg_flags & PQ_ACTIVE) {
pgl = &uvm.page_active;
} else {
pgl = NULL;
}
if (pgl) {
(*pr)(" checking pageq list\n");
TAILQ_FOREACH(tpg, pgl, pageq) {
if (tpg == pg) {
break;
}
}
if (tpg)
(*pr)(" page found on pageq list\n");
else
(*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
}
}
#endif
/*
* uvm_map_protect: change map protection
*
* => set_max means set max_protection.
* => map must be unlocked.
*/
int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t new_prot, boolean_t set_max)
{
struct vm_map_entry *first, *iter;
vm_prot_t old_prot;
vm_prot_t mask;
vsize_t dused;
int error;
if (start > end)
return EINVAL;
start = MAX(start, map->min_offset);
end = MIN(end, map->max_offset);
if (start >= end)
return 0;
dused = 0;
error = 0;
vm_map_lock(map);
/*
* Set up first and last.
* - first will contain first entry at or after start.
*/
first = uvm_map_entrybyaddr(&map->addr, start);
KDASSERT(first != NULL);
if (first->end <= start)
first = RBT_NEXT(uvm_map_addr, first);
/* First, check for protection violations. */
for (iter = first; iter != NULL && iter->start < end;
iter = RBT_NEXT(uvm_map_addr, iter)) {
/* Treat memory holes as free space. */
if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
continue;
old_prot = iter->protection;
if (old_prot == PROT_NONE && new_prot != old_prot) {
dused += uvmspace_dused(
map, MAX(start, iter->start), MIN(end, iter->end));
}
if (UVM_ET_ISSUBMAP(iter)) {
error = EINVAL;
goto out;
}
if ((new_prot & iter->max_protection) != new_prot) {
error = EACCES;
goto out;
}
if (map == kernel_map &&
(new_prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
panic("uvm_map_protect: kernel map W^X violation requested");
}
/* Check limits. */
if (dused > 0 && (map->flags & VM_MAP_ISVMSPACE)) {
vsize_t limit = lim_cur(RLIMIT_DATA);
dused = ptoa(dused);
if (limit < dused ||
limit - dused < ptoa(((struct vmspace *)map)->vm_dused)) {
error = ENOMEM;
goto out;
}
}
/* Fix protections. */
for (iter = first; iter != NULL && iter->start < end;
iter = RBT_NEXT(uvm_map_addr, iter)) {
/* Treat memory holes as free space. */
if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
continue;
old_prot = iter->protection;
/*
* Skip adapting protection iff old and new protection
* are equal.
*/
if (set_max) {
if (old_prot == (new_prot & old_prot) &&
iter->max_protection == new_prot)
continue;
} else {
if (old_prot == new_prot)
continue;
}
UVM_MAP_CLIP_START(map, iter, start);
UVM_MAP_CLIP_END(map, iter, end);
if (set_max) {
iter->max_protection = new_prot;
iter->protection &= new_prot;
} else
iter->protection = new_prot;
/*
* update physical map if necessary. worry about copy-on-write
* here -- CHECK THIS XXX
*/
if (iter->protection != old_prot) {
mask = UVM_ET_ISCOPYONWRITE(iter) ?
~PROT_WRITE : PROT_MASK;
/* XXX should only wserial++ if no split occurs */
if (iter->protection & PROT_WRITE)
map->wserial++;
if (map->flags & VM_MAP_ISVMSPACE) {
if (old_prot == PROT_NONE) {
((struct vmspace *)map)->vm_dused +=
uvmspace_dused(map, iter->start,
iter->end);
}
if (iter->protection == PROT_NONE) {
((struct vmspace *)map)->vm_dused -=
uvmspace_dused(map, iter->start,
iter->end);
}
}
/* update pmap */
if ((iter->protection & mask) == PROT_NONE &&
VM_MAPENT_ISWIRED(iter)) {
/*
* TODO(ariane) this is stupid. wired_count
* is 0 if not wired, otherwise anything
* larger than 0 (incremented once each time
* wire is called).
* Mostly to be able to undo the damage on
* failure. Not to actually be a wired
* refcounter...
* Originally: iter->wired_count--;
* (don't we have to unwire this in the pmap
* as well?)
*/
iter->wired_count = 0;
}
uvm_map_lock_entry(iter);
pmap_protect(map->pmap, iter->start, iter->end,
iter->protection & mask);
uvm_map_unlock_entry(iter);
}
/*
* If the map is configured to lock any future mappings,
* wire this entry now if the old protection was PROT_NONE
* and the new protection is not PROT_NONE.
*/
if ((map->flags & VM_MAP_WIREFUTURE) != 0 && VM_MAPENT_ISWIRED(iter) == 0 &&
old_prot == PROT_NONE &&
new_prot != PROT_NONE) {
if (uvm_map_pageable(map, iter->start, iter->end,
FALSE, UVM_LK_ENTER | UVM_LK_EXIT) != 0) {
/*
* If locking the entry fails, remember the
* error if it's the first one. Note we
* still continue setting the protection in
* the map, but it will return the resource
* storage condition regardless.
*
* XXX Ignore what the actual error is,
* XXX just call it a resource shortage
* XXX so that it doesn't get confused
* XXX what uvm_map_protect() itself would
* XXX normally return.
*/
error = ENOMEM;
}
}
}
pmap_update(map->pmap);
out:
vm_map_unlock(map);
return error;
}
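/*
 * Usage note (hedged): this is the workhorse behind an mprotect(2)-style
 * request. With set_max == FALSE only the current protection changes;
 * with set_max == TRUE, max_protection is replaced and the current
 * protection is masked with it. Sketch:
 *
 *	error = uvm_map_protect(map, addr, addr + size, PROT_READ, FALSE);
 */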
/*
* uvmspace_alloc: allocate a vmspace structure.
*
* - structure includes vm_map and pmap
* - XXX: no locking on this structure
* - refcnt set to 1, rest must be init'd by caller
*/
struct vmspace *
uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable,
boolean_t remove_holes)
{
struct vmspace *vm;
vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO);
uvmspace_init(vm, NULL, min, max, pageable, remove_holes);
return (vm);
}
/*
* uvmspace_init: initialize a vmspace structure.
*
* - XXX: no locking on this structure
* - refcnt set to 1, rest must be init'd by caller
*/
void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max,
boolean_t pageable, boolean_t remove_holes)
{
KASSERT(pmap == NULL || pmap == pmap_kernel());
if (pmap)
pmap_reference(pmap);
else
pmap = pmap_create();
uvm_map_setup(&vm->vm_map, pmap, min, max,
(pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE);
vm->vm_refcnt = 1;
if (remove_holes)
pmap_remove_holes(vm);
}
/*
* uvmspace_share: share a vmspace between two processes
*
* - used for vfork
*/
struct vmspace *
uvmspace_share(struct process *pr)
{
struct vmspace *vm = pr->ps_vmspace;
uvmspace_addref(vm);
return vm;
}
/*
* uvmspace_exec: the process wants to exec a new program
*
* - XXX: no locking on vmspace
*/
void
uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end)
{
struct process *pr = p->p_p;
struct vmspace *nvm, *ovm = pr->ps_vmspace;
struct vm_map *map = &ovm->vm_map;
struct uvm_map_deadq dead_entries;
KASSERT((start & (vaddr_t)PAGE_MASK) == 0);
KASSERT((end & (vaddr_t)PAGE_MASK) == 0 ||
(end & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
pmap_unuse_final(p); /* before stack addresses go away */
TAILQ_INIT(&dead_entries);
/* see if more than one process is using this vmspace... */
if (ovm->vm_refcnt == 1) {
/*
* If pr is the only process using its vmspace then
* we can safely recycle that vmspace for the program
* that is being exec'd.
*/
#ifdef SYSVSHM
/*
* SYSV SHM semantics require us to kill all segments on an exec
*/
if (ovm->vm_shm)
shmexit(ovm);
#endif
/*
* POSIX 1003.1b -- "lock future mappings" is revoked
* when a process execs another program image.
*/
vm_map_lock(map);
vm_map_modflags(map, 0, VM_MAP_WIREFUTURE|VM_MAP_SYSCALL_ONCE);
/*
* now unmap the old program
*
* Instead of attempting to keep the map valid, we simply
* nuke all entries and ask uvm_map_setup to reinitialize
* the map to the new boundaries.
*
* uvm_unmap_remove will actually nuke all entries for us
* (as in, not replace them with free-memory entries).
*/
uvm_unmap_remove(map, map->min_offset, map->max_offset,
&dead_entries, TRUE, FALSE);
KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
/* Nuke statistics and boundaries. */
memset(&ovm->vm_startcopy, 0,
(caddr_t) (ovm + 1) - (caddr_t) &ovm->vm_startcopy);
if (end & (vaddr_t)PAGE_MASK) {
end += 1;
if (end == 0) /* overflow */
end -= PAGE_SIZE;
}
/* Setup new boundaries and populate map with entries. */
map->min_offset = start;
map->max_offset = end;
uvm_map_setup_entries(map);
vm_map_unlock(map);
/* but keep MMU holes unavailable */
pmap_remove_holes(ovm);
} else {
/*
* pr's vmspace is being shared, so we can't reuse
* it for pr since it is still being used for others.
* allocate a new vmspace for pr
*/
nvm = uvmspace_alloc(start, end,
(map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE);
/* install new vmspace and drop our ref to the old one. */
pmap_deactivate(p);
p->p_vmspace = pr->ps_vmspace = nvm;
pmap_activate(p);
uvmspace_free(ovm);
}
/* Release dead entries */
uvm_unmap_detach(&dead_entries, 0);
}
/*
* uvmspace_addref: add a reference to a vmspace.
*/
void
uvmspace_addref(struct vmspace *vm)
{
KERNEL_ASSERT_LOCKED();
KASSERT(vm->vm_refcnt > 0);
vm->vm_refcnt++;
}
/*
* uvmspace_free: free a vmspace data structure
*/
void
uvmspace_free(struct vmspace *vm)
{
KERNEL_ASSERT_LOCKED();
if (--vm->vm_refcnt == 0) {
/*
* lock the map, to wait out all other references to it. delete
* all of the mappings and pages they hold, then call the pmap
* module to reclaim anything left.
*/
#ifdef SYSVSHM
/* Get rid of any SYSV shared memory segments. */
if (vm->vm_shm != NULL)
shmexit(vm);
#endif
uvm_map_teardown(&vm->vm_map);
pool_put(&uvm_vmspace_pool, vm);
}
}
/*
* uvm_share: Map the address range [srcaddr, srcaddr + sz) in
* srcmap to the address range [dstaddr, dstaddr + sz) in
* dstmap.
*
* The whole address range in srcmap must be backed by an object
* (no holes).
*
* If successful, the address ranges share memory and the destination
* address range uses the protection flags in prot.
*
* This routine assumes that sz is a multiple of PAGE_SIZE and
* that dstaddr and srcaddr are page-aligned.
*/
int
uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot,
struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz)
{
int ret = 0;
vaddr_t unmap_end;
vaddr_t dstva;
vsize_t s_off, len, n = sz, remain;
struct vm_map_entry *first = NULL, *last = NULL;
struct vm_map_entry *src_entry, *psrc_entry = NULL;
struct uvm_map_deadq dead;
if (srcaddr >= srcmap->max_offset || sz > srcmap->max_offset - srcaddr)
return EINVAL;
TAILQ_INIT(&dead);
vm_map_lock(dstmap);
vm_map_lock_read(srcmap);
if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, sz)) {
ret = ENOMEM;
goto exit_unlock;
}
if (!uvm_map_lookup_entry(srcmap, srcaddr, &src_entry)) {
ret = EINVAL;
goto exit_unlock;
}
dstva = dstaddr;
unmap_end = dstaddr;
for (; src_entry != NULL;
psrc_entry = src_entry,
src_entry = RBT_NEXT(uvm_map_addr, src_entry)) {
/* hole in address space, bail out */
if (psrc_entry != NULL && psrc_entry->end != src_entry->start)
break;
if (src_entry->start >= srcaddr + sz)
break;
if (UVM_ET_ISSUBMAP(src_entry))
panic("uvm_share: encountered a submap (illegal)");
if (!UVM_ET_ISCOPYONWRITE(src_entry) &&
UVM_ET_ISNEEDSCOPY(src_entry))
panic("uvm_share: non-copy_on_write map entries "
"marked needs_copy (illegal)");
/*
* srcaddr > map entry start? means we are in the middle of a
* map, so we calculate the offset to use in the source map.
*/
if (srcaddr > src_entry->start)
s_off = srcaddr - src_entry->start;
else if (srcaddr == src_entry->start)
s_off = 0;
else
panic("uvm_share: map entry start > srcaddr");
remain = src_entry->end - src_entry->start - s_off;
/* Determine how many bytes to share in this pass */
if (n < remain)
len = n;
else
len = remain;
if (uvm_mapent_share(dstmap, dstva, len, s_off, prot, prot,
srcmap, src_entry, &dead) == NULL)
break;
n -= len;
dstva += len;
srcaddr += len;
unmap_end = dstva + len;
if (n == 0)
goto exit_unlock;
}
ret = EINVAL;
uvm_unmap_remove(dstmap, dstaddr, unmap_end, &dead, FALSE, TRUE);
exit_unlock:
vm_map_unlock_read(srcmap);
vm_map_unlock(dstmap);
uvm_unmap_detach(&dead, 0);
return ret;
}
/*
* Clone map entry into other map.
*
* Mapping will be placed at dstaddr, for the same length.
* Space must be available.
* Reference counters are incremented.
*/
struct vm_map_entry *
uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
vsize_t off, vm_prot_t prot, vm_prot_t maxprot,
struct vm_map_entry *old_entry, struct uvm_map_deadq *dead,
int mapent_flags, int amap_share_flags)
{
struct vm_map_entry *new_entry, *first, *last;
KDASSERT(!UVM_ET_ISSUBMAP(old_entry));
/* Create new entry (linked in on creation). Fill in first, last. */
first = last = NULL;
if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, dstlen)) {
panic("uvm_mapent_clone: no space in map for "
"entry in empty map");
}
new_entry = uvm_map_mkentry(dstmap, first, last,
dstaddr, dstlen, mapent_flags, dead, NULL);
if (new_entry == NULL)
return NULL;
/* old_entry -> new_entry */
new_entry->object = old_entry->object;
new_entry->offset = old_entry->offset;
new_entry->aref = old_entry->aref;
new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED;
new_entry->protection = prot;
new_entry->max_protection = maxprot;
new_entry->inheritance = old_entry->inheritance;
new_entry->advice = old_entry->advice;
/* gain reference to object backing the map (can't be a submap). */
if (new_entry->aref.ar_amap) {
new_entry->aref.ar_pageoff += off >> PAGE_SHIFT;
amap_ref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
(new_entry->end - new_entry->start) >> PAGE_SHIFT,
amap_share_flags);
}
if (UVM_ET_ISOBJ(new_entry) &&
new_entry->object.uvm_obj->pgops->pgo_reference) {
new_entry->offset += off;
new_entry->object.uvm_obj->pgops->pgo_reference
(new_entry->object.uvm_obj);
}
return new_entry;
}
struct vm_map_entry *
uvm_mapent_share(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map *old_map,
struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
{
/*
* If old_entry refers to a copy-on-write region that has not yet been
* written to (needs_copy flag is set), then we need to allocate a new
* amap for old_entry.
*
* If we do not do this, and the process owning old_entry does a copy-on
* write later, old_entry and new_entry will refer to different memory
* regions, and the memory between the processes is no longer shared.
*
* [in other words, we need to clear needs_copy]
*/
if (UVM_ET_ISNEEDSCOPY(old_entry)) {
/* get our own amap, clears needs_copy */
amap_copy(old_map, old_entry, M_WAITOK, FALSE, 0, 0);
/* XXXCDC: WAITOK??? */
}
return uvm_mapent_clone(dstmap, dstaddr, dstlen, off,
prot, maxprot, old_entry, dead, 0, AMAP_SHARED);
}
/*
* share the mapping: this means we want the old and
* new entries to share amaps and backing objects.
*/
struct vm_map_entry *
uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
struct vm_map *old_map,
struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
{
struct vm_map_entry *new_entry;
new_entry = uvm_mapent_share(new_map, old_entry->start,
old_entry->end - old_entry->start, 0, old_entry->protection,
old_entry->max_protection, old_map, old_entry, dead);
/*
* pmap_copy the mappings: this routine is optional
* but if it is there it will reduce the number of
* page faults in the new proc.
*/
if (!UVM_ET_ISHOLE(new_entry))
pmap_copy(new_map->pmap, old_map->pmap, new_entry->start,
(new_entry->end - new_entry->start), new_entry->start);
return (new_entry);
}
/*
* copy-on-write the mapping (using mmap's
* MAP_PRIVATE semantics)
*
* allocate new_entry, adjust reference counts.
* (note that new references are read-only).
*/
struct vm_map_entry *
uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map,
struct vm_map *old_map,
struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
{
struct vm_map_entry *new_entry;
boolean_t protect_child;
new_entry = uvm_mapent_clone(new_map, old_entry->start,
old_entry->end - old_entry->start, 0, old_entry->protection,
old_entry->max_protection, old_entry, dead, 0, 0);
new_entry->etype |=
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
/*
* the new entry will need an amap. it will either
* need to be copied from the old entry or created
* from scratch (if the old entry does not have an
* amap). can we defer this process until later
* (by setting "needs_copy") or do we need to copy
* the amap now?
*
* we must copy the amap now if any of the following
* conditions hold:
* 1. the old entry has an amap and that amap is
* being shared. this means that the old (parent)
* process is sharing the amap with another
* process. if we do not clear needs_copy here
* we will end up in a situation where both the
* parent and child process are referring to the
* same amap with "needs_copy" set. if the
* parent write-faults, the fault routine will
* clear "needs_copy" in the parent by allocating
* a new amap. this is wrong because the
* parent is supposed to be sharing the old amap
* and the new amap will break that.
*
* 2. if the old entry has an amap and a non-zero
* wire count then we are going to have to call
* amap_cow_now to avoid page faults in the
* parent process. since amap_cow_now requires
* "needs_copy" to be clear we might as well
* clear it here as well.
*
*/
if (old_entry->aref.ar_amap != NULL &&
((amap_flags(old_entry->aref.ar_amap) &
AMAP_SHARED) != 0 ||
VM_MAPENT_ISWIRED(old_entry))) {
amap_copy(new_map, new_entry, M_WAITOK, FALSE,
0, 0);
/* XXXCDC: M_WAITOK ... ok? */
}
/*
* if the parent's entry is wired down, then the
* parent process does not want page faults on
* access to that memory. this means that we
* cannot do copy-on-write because we can't write
* protect the old entry. in this case we
* resolve all copy-on-write faults now, using
* amap_cow_now. note that we have already
* allocated any needed amap (above).
*/
if (VM_MAPENT_ISWIRED(old_entry)) {
/*
* resolve all copy-on-write faults now
* (note that there is nothing to do if
* the old mapping does not have an amap).
* XXX: is it worthwhile to bother with
* pmap_copy in this case?
*/
if (old_entry->aref.ar_amap)
amap_cow_now(new_map, new_entry);
} else {
if (old_entry->aref.ar_amap) {
/*
* setup mappings to trigger copy-on-write faults
* we must write-protect the parent if it has
* an amap and it is not already "needs_copy"...
* if it is already "needs_copy" then the parent
* has already been write-protected by a previous
* fork operation.
*
* if we do not write-protect the parent, then
* we must be sure to write-protect the child
* after the pmap_copy() operation.
*
* XXX: pmap_copy should have some way of telling
* us that it didn't do anything so we can avoid
* calling pmap_protect needlessly.
*/
if (!UVM_ET_ISNEEDSCOPY(old_entry)) {
if (old_entry->max_protection & PROT_WRITE) {
uvm_map_lock_entry(old_entry);
pmap_protect(old_map->pmap,
old_entry->start,
old_entry->end,
old_entry->protection &
~PROT_WRITE);
uvm_map_unlock_entry(old_entry);
pmap_update(old_map->pmap);
}
old_entry->etype |= UVM_ET_NEEDSCOPY;
}
/* parent must now be write-protected */
protect_child = FALSE;
} else {
/*
* we only need to protect the child if the
* parent has write access.
*/
if (old_entry->max_protection & PROT_WRITE)
protect_child = TRUE;
else
protect_child = FALSE;
}
/*
* copy the mappings
* XXX: need a way to tell if this does anything
*/
if (!UVM_ET_ISHOLE(new_entry))
pmap_copy(new_map->pmap, old_map->pmap,
new_entry->start,
(old_entry->end - old_entry->start),
old_entry->start);
/* protect the child's mappings if necessary */
if (protect_child) {
pmap_protect(new_map->pmap, new_entry->start,
new_entry->end,
new_entry->protection &
~PROT_WRITE);
}
}
return (new_entry);
}
/*
* zero the mapping: the new entry will be zero initialized
*/
struct vm_map_entry *
uvm_mapent_forkzero(struct vmspace *new_vm, struct vm_map *new_map,
struct vm_map *old_map,
struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
{
struct vm_map_entry *new_entry;
new_entry = uvm_mapent_clone(new_map, old_entry->start,
old_entry->end - old_entry->start, 0, old_entry->protection,
old_entry->max_protection, old_entry, dead, 0, 0);
new_entry->etype |=
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
if (new_entry->aref.ar_amap) {
amap_unref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
atop(new_entry->end - new_entry->start), 0);
new_entry->aref.ar_amap = NULL;
new_entry->aref.ar_pageoff = 0;
}
if (UVM_ET_ISOBJ(new_entry)) {
if (new_entry->object.uvm_obj->pgops->pgo_detach)
new_entry->object.uvm_obj->pgops->pgo_detach(
new_entry->object.uvm_obj);
new_entry->object.uvm_obj = NULL;
new_entry->etype &= ~UVM_ET_OBJ;
}
return (new_entry);
}
/*
* uvmspace_fork: fork a process' main map
*
* => create a new vmspace for child process from parent.
* => parent's map must not be locked.
*/
struct vmspace *
uvmspace_fork(struct process *pr)
{
struct vmspace *vm1 = pr->ps_vmspace;
struct vmspace *vm2;
struct vm_map *old_map = &vm1->vm_map;
struct vm_map *new_map;
struct vm_map_entry *old_entry, *new_entry;
struct uvm_map_deadq dead;
vm_map_lock(old_map);
vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset,
(old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE);
memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
(caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy);
vm2->vm_dused = 0; /* Statistic managed by us. */
new_map = &vm2->vm_map;
vm_map_lock(new_map);
/* go entry-by-entry */
TAILQ_INIT(&dead);
RBT_FOREACH(old_entry, uvm_map_addr, &old_map->addr) {
if (old_entry->start == old_entry->end)
continue;
/* first, some sanity checks on the old entry */
if (UVM_ET_ISSUBMAP(old_entry)) {
panic("fork: encountered a submap during fork "
"(illegal)");
}
if (!UVM_ET_ISCOPYONWRITE(old_entry) &&
UVM_ET_ISNEEDSCOPY(old_entry)) {
panic("fork: non-copy_on_write map entry marked "
"needs_copy (illegal)");
}
/* Apply inheritance. */
switch (old_entry->inheritance) {
case MAP_INHERIT_SHARE:
new_entry = uvm_mapent_forkshared(vm2, new_map,
old_map, old_entry, &dead);
break;
case MAP_INHERIT_COPY:
new_entry = uvm_mapent_forkcopy(vm2, new_map,
old_map, old_entry, &dead);
break;
case MAP_INHERIT_ZERO:
new_entry = uvm_mapent_forkzero(vm2, new_map,
old_map, old_entry, &dead);
break;
default:
continue;
}
/* Update process statistics. */
if (!UVM_ET_ISHOLE(new_entry))
new_map->size += new_entry->end - new_entry->start;
if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry) &&
new_entry->protection != PROT_NONE) {
vm2->vm_dused += uvmspace_dused(
new_map, new_entry->start, new_entry->end);
}
}
vm_map_unlock(old_map);
vm_map_unlock(new_map);
/*
* This can actually happen, if multiple entries described a
* space in which an entry was inherited.
*/
uvm_unmap_detach(&dead, 0);
#ifdef SYSVSHM
if (vm1->vm_shm)
shmfork(vm1, vm2);
#endif
return vm2;
}
/*
* uvm_map_hint: return the beginning of the best area suitable for
* creating a new mapping with "prot" protection.
*/
vaddr_t
uvm_map_hint(struct vmspace *vm, vm_prot_t prot, vaddr_t minaddr,
vaddr_t maxaddr)
{
vaddr_t addr;
vaddr_t spacing;
#ifdef __i386__
/*
* If executable skip first two pages, otherwise start
* after data + heap region.
*/
if ((prot & PROT_EXEC) != 0 &&
(vaddr_t)vm->vm_daddr >= I386_MAX_EXE_ADDR) {
addr = (PAGE_SIZE*2) +
(arc4random() & (I386_MAX_EXE_ADDR / 2 - 1));
return (round_page(addr));
}
#endif
#if defined (__LP64__)
spacing = MIN(4UL * 1024 * 1024 * 1024, MAXDSIZ) - 1;
#else
spacing = MIN(1 * 1024 * 1024 * 1024, MAXDSIZ) - 1;
#endif
/*
* Start malloc/mmap after the brk.
*/
addr = (vaddr_t)vm->vm_daddr + BRKSIZ;
addr = MAX(addr, minaddr);
if (addr < maxaddr) {
while (spacing > maxaddr - addr)
spacing >>= 1;
}
addr += arc4random() & spacing;
return (round_page(addr));
}
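/*
 * Worked example (symbolic, no real numbers assumed): for a non-exec
 * request the hint is vm_daddr + BRKSIZ, clamped up to minaddr, plus a
 * random offset of at most `spacing' bytes (spacing is halved until it
 * fits between the hint and maxaddr), and finally rounded up to a page
 * boundary.
 */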
/*
* uvm_map_submap: punch down part of a map into a submap
*
* => only the kernel_map is allowed to be submapped
* => the purpose of submapping is to break up the locking granularity
* of a larger map
* => the range specified must have been mapped previously with a uvm_map()
* call [with uobj==NULL] to create a blank map entry in the main map.
* [And it had better still be blank!]
* => maps which contain submaps should never be copied or forked.
* => to remove a submap, use uvm_unmap() on the main map
* and then uvm_map_deallocate() the submap.
* => main map must be unlocked.
* => submap must have been init'd and have a zero reference count.
* [need not be locked as we don't actually reference it]
*/
int
uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map *submap)
{
struct vm_map_entry *entry;
int result;
if (start > map->max_offset || end > map->max_offset ||
start < map->min_offset || end < map->min_offset)
return EINVAL;
vm_map_lock(map);
if (uvm_map_lookup_entry(map, start, &entry)) {
UVM_MAP_CLIP_START(map, entry, start);
UVM_MAP_CLIP_END(map, entry, end);
} else
entry = NULL;
if (entry != NULL &&
entry->start == start && entry->end == end &&
entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
!UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
entry->etype |= UVM_ET_SUBMAP;
entry->object.sub_map = submap;
entry->offset = 0;
uvm_map_reference(submap);
result = 0;
} else
result = EINVAL;
vm_map_unlock(map);
return result;
}
/*
* uvm_map_checkprot: check protection in map
*
* => must allow specific protection in a fully allocated region.
* => map must be read or write locked by caller.
*/
boolean_t
uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t protection)
{
struct vm_map_entry *entry;
if (start < map->min_offset || end > map->max_offset || start > end)
return FALSE;
if (start == end)
return TRUE;
/*
* Iterate entries.
*/
for (entry = uvm_map_entrybyaddr(&map->addr, start); entry != NULL && entry->start < end;
entry = RBT_NEXT(uvm_map_addr, entry)) {
/* Fail if a hole is found. */
if (UVM_ET_ISHOLE(entry) || (entry->end < end && entry->end != VMMAP_FREE_END(entry)))
return FALSE;
/* Check protection. */
if ((entry->protection & protection) != protection)
return FALSE;
}
return TRUE;
}
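/*
 * Usage sketch (hypothetical caller): a subsystem already holding the
 * map lock can verify access before touching a range, e.g.
 *
 *	if (!uvm_map_checkprot(kernel_map, va, va + len,
 *	    PROT_READ | PROT_WRITE))
 *		panic("range not writable");
 */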
/*
* uvm_map_create: create map
*/
vm_map_t
uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags)
{
vm_map_t map;
map = malloc(sizeof *map, M_VMMAP, M_WAITOK);
uvm_map_setup(map, pmap, min, max, flags);
return (map);
}
/*
* uvm_map_deallocate: drop reference to a map
*
* => caller must not lock map
* => we will zap map if ref count goes to zero
*/
void
uvm_map_deallocate(vm_map_t map)
{
int c;
struct uvm_map_deadq dead;
c = atomic_dec_int_nv(&map->ref_count);
if (c > 0) {
return;
}
/*
* all references gone. unmap and free.
*
* No lock required: we are only one to access this map.
*/
TAILQ_INIT(&dead);
uvm_tree_sanity(map, __FILE__, __LINE__);
uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead,
TRUE, FALSE);
pmap_destroy(map->pmap);
KASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
free(map, M_VMMAP, sizeof *map);
uvm_unmap_detach(&dead, 0);
}
/*
* uvm_map_inherit: set inheritance code for range of addrs in map.
*
* => map must be unlocked
* => note that the inherit code is used during a "fork". see fork
* code for details.
*/
int
uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_inherit_t new_inheritance)
{
struct vm_map_entry *entry;
switch (new_inheritance) {
case MAP_INHERIT_NONE:
case MAP_INHERIT_COPY:
case MAP_INHERIT_SHARE:
case MAP_INHERIT_ZERO:
break;
default:
return (EINVAL);
}
if (start > end)
return EINVAL;
start = MAX(start, map->min_offset);
end = MIN(end, map->max_offset);
if (start >= end)
return 0;
vm_map_lock(map);
entry = uvm_map_entrybyaddr(&map->addr, start);
if (entry->end > start)
UVM_MAP_CLIP_START(map, entry, start);
else
entry = RBT_NEXT(uvm_map_addr, entry);
while (entry != NULL && entry->start < end) {
UVM_MAP_CLIP_END(map, entry, end);
entry->inheritance = new_inheritance;
entry = RBT_NEXT(uvm_map_addr, entry);
}
vm_map_unlock(map);
return (0);
}
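/*
 * Note: the MAP_INHERIT_* value stored here is what the fork path
 * consumes; uvmspace_fork() above switches on entry->inheritance to pick
 * between the share, copy and zero clone routines.
 */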
/*
* uvm_map_syscall: permit system calls for range of addrs in map.
*
* => map must be unlocked
*/
int
uvm_map_syscall(struct vm_map *map, vaddr_t start, vaddr_t end)
{
struct vm_map_entry *entry;
if (start > end)
return EINVAL;
start = MAX(start, map->min_offset);
end = MIN(end, map->max_offset);
if (start >= end)
return 0;
if (map->flags & VM_MAP_SYSCALL_ONCE) /* only allowed once */
return (EPERM);
vm_map_lock(map);
entry = uvm_map_entrybyaddr(&map->addr, start);
if (entry->end > start)
UVM_MAP_CLIP_START(map, entry, start);
else
entry = RBT_NEXT(uvm_map_addr, entry);
while (entry != NULL && entry->start < end) {
UVM_MAP_CLIP_END(map, entry, end);
entry->etype |= UVM_ET_SYSCALL;
entry = RBT_NEXT(uvm_map_addr, entry);
}
map->wserial++;
map->flags |= VM_MAP_SYSCALL_ONCE;
vm_map_unlock(map);
return (0);
}
/*
* uvm_map_advice: set advice code for range of addrs in map.
*
* => map must be unlocked
*/
int
uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
{
struct vm_map_entry *entry;
switch (new_advice) {
case MADV_NORMAL:
case MADV_RANDOM:
case MADV_SEQUENTIAL:
break;
default:
return (EINVAL);
}
if (start > end)
return EINVAL;
start = MAX(start, map->min_offset);
end = MIN(end, map->max_offset);
if (start >= end)
return 0;
vm_map_lock(map);
entry = uvm_map_entrybyaddr(&map->addr, start);
if (entry != NULL && entry->end > start)
UVM_MAP_CLIP_START(map, entry, start);
else if (entry != NULL)
entry = RBT_NEXT(uvm_map_addr, entry);
/*
* XXXJRT: disallow holes?
*/
while (entry != NULL && entry->start < end) {
UVM_MAP_CLIP_END(map, entry, end);
entry->advice = new_advice;
entry = RBT_NEXT(uvm_map_addr, entry);
}
vm_map_unlock(map);
return (0);
}
/*
* uvm_map_extract: extract a mapping from a map and put it somewhere
* in the kernel_map, setting protection to max_prot.
*
* => map should be unlocked (we will write lock it and kernel_map)
* => returns 0 on success, error code otherwise
* => start must be page aligned
* => len must be page sized
* => flags:
* UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
* Mappings are QREF's.
*/
int
uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
vaddr_t *dstaddrp, int flags)
{
struct uvm_map_deadq dead;
struct vm_map_entry *first, *entry, *newentry, *tmp1, *tmp2;
vaddr_t dstaddr;
vaddr_t end;
vaddr_t cp_start;
vsize_t cp_len, cp_off;
int error;
TAILQ_INIT(&dead);
end = start + len;
/*
* Sanity check on the parameters.
* Also, since the mapping may not contain gaps, error out if the
* mapped area is not in source map.
*/
if ((start & (vaddr_t)PAGE_MASK) != 0 ||
(end & (vaddr_t)PAGE_MASK) != 0 || end < start)
return EINVAL;
if (start < srcmap->min_offset || end > srcmap->max_offset)
return EINVAL;
/* Initialize dead entries. Handle len == 0 case. */
if (len == 0)
return 0;
/* Acquire lock on srcmap. */
vm_map_lock(srcmap);
/* Lock srcmap, lookup first and last entry in <start,len>. */
first = uvm_map_entrybyaddr(&srcmap->addr, start);
/* Check that the range is contiguous. */
for (entry = first; entry != NULL && entry->end < end;
entry = RBT_NEXT(uvm_map_addr, entry)) {
if (VMMAP_FREE_END(entry) != entry->end ||
UVM_ET_ISHOLE(entry)) {
error = EINVAL;
goto fail;
}
}
if (entry == NULL || UVM_ET_ISHOLE(entry)) {
error = EINVAL;
goto fail;
}
/*
* Handle need-copy flag.
*/
for (entry = first; entry != NULL && entry->start < end;
entry = RBT_NEXT(uvm_map_addr, entry)) {
if (UVM_ET_ISNEEDSCOPY(entry))
amap_copy(srcmap, entry, M_NOWAIT,
UVM_ET_ISSTACK(entry) ? FALSE : TRUE, start, end);
if (UVM_ET_ISNEEDSCOPY(entry)) {
/*
* amap_copy failure
*/
error = ENOMEM;
goto fail;
}
}
/* Lock destination map (kernel_map). */
vm_map_lock(kernel_map);
if (uvm_map_findspace(kernel_map, &tmp1, &tmp2, &dstaddr, len,
MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()), PMAP_PREFER_OFFSET(start),
PROT_NONE, 0) != 0) {
error = ENOMEM;
goto fail2;
}
*dstaddrp = dstaddr;
/*
* We now have srcmap and kernel_map locked.
* dstaddr contains the destination offset in dstmap.
*/
/* step 1: start looping through map entries, performing extraction. */
for (entry = first; entry != NULL && entry->start < end;
entry = RBT_NEXT(uvm_map_addr, entry)) {
KDASSERT(!UVM_ET_ISNEEDSCOPY(entry));
if (UVM_ET_ISHOLE(entry))
continue;
/* Calculate uvm_mapent_clone parameters. */
cp_start = entry->start;
if (cp_start < start) {
cp_off = start - cp_start;
cp_start = start;
} else
cp_off = 0;
cp_len = MIN(entry->end, end) - cp_start;
newentry = uvm_mapent_clone(kernel_map,
cp_start - start + dstaddr, cp_len, cp_off,
entry->protection, entry->max_protection,
entry, &dead, flags, AMAP_SHARED | AMAP_REFALL);
if (newentry == NULL) {
error = ENOMEM;
goto fail2_unmap;
}
kernel_map->size += cp_len;
if (flags & UVM_EXTRACT_FIXPROT)
newentry->protection = newentry->max_protection;
/*
* Step 2: perform pmap copy.
* (Doing this in the loop saves one RB traversal.)
*/
pmap_copy(kernel_map->pmap, srcmap->pmap,
cp_start - start + dstaddr, cp_len, cp_start);
}
pmap_update(kernel_map->pmap);
error = 0;
/* Unmap copied entries on failure. */
fail2_unmap:
if (error) {
uvm_unmap_remove(kernel_map, dstaddr, dstaddr + len, &dead,
FALSE, TRUE);
}
/* Release maps, release dead entries. */
fail2:
vm_map_unlock(kernel_map);
fail:
vm_map_unlock(srcmap);
uvm_unmap_detach(&dead, 0);
return error;
}
/*
* uvm_map_clean: clean out a map range
*
* => valid flags:
* if (flags & PGO_CLEANIT): dirty pages are cleaned first
* if (flags & PGO_SYNCIO): dirty pages are written synchronously
* if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
* if (flags & PGO_FREE): any cached pages are freed after clean
* => returns an error if any part of the specified range isn't mapped
* => never a need to flush amap layer since the anonymous memory has
* no permanent home, but may deactivate pages there
* => called from sys_msync() and sys_madvise()
* => caller must not write-lock map (read OK).
* => we may sleep while cleaning if SYNCIO [with map read-locked]
*/
int
uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{
struct vm_map_entry *first, *entry;
struct vm_amap *amap;
struct vm_anon *anon;
struct vm_page *pg;
struct uvm_object *uobj;
vaddr_t cp_start, cp_end;
int refs;
int error;
boolean_t rv;
KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
(PGO_FREE|PGO_DEACTIVATE));
if (start > end || start < map->min_offset || end > map->max_offset)
return EINVAL;
vm_map_lock_read(map);
first = uvm_map_entrybyaddr(&map->addr, start);
/* Make a first pass to check for holes. */
for (entry = first; entry != NULL && entry->start < end;
entry = RBT_NEXT(uvm_map_addr, entry)) {
if (UVM_ET_ISSUBMAP(entry)) {
vm_map_unlock_read(map);
return EINVAL;
}
if (UVM_ET_ISSUBMAP(entry) || UVM_ET_ISHOLE(entry) || (entry->end < end &&
VMMAP_FREE_END(entry) != entry->end)) {
vm_map_unlock_read(map);
return EFAULT;
}
}
error = 0;
for (entry = first; entry != NULL && entry->start < end;
entry = RBT_NEXT(uvm_map_addr, entry)) {
amap = entry->aref.ar_amap; /* top layer */
if (UVM_ET_ISOBJ(entry))
uobj = entry->object.uvm_obj;
else
uobj = NULL;
/*
* No amap cleaning necessary if:
* - there's no amap
* - we're not deactivating or freeing pages.
*/
if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
goto flush_object;
cp_start = MAX(entry->start, start);
cp_end = MIN(entry->end, end);
amap_lock(amap);
for (; cp_start != cp_end; cp_start += PAGE_SIZE) {
anon = amap_lookup(&entry->aref,
cp_start - entry->start);
if (anon == NULL)
continue;
KASSERT(anon->an_lock == amap->am_lock);
pg = anon->an_page;
if (pg == NULL) {
continue;
}
KASSERT(pg->pg_flags & PQ_ANON);
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
/*
* XXX In these first 3 cases, we always just
* XXX deactivate the page. We may want to
* XXX handle the different cases more
* XXX specifically, in the future.
*/
case PGO_CLEANIT|PGO_FREE:
case PGO_CLEANIT|PGO_DEACTIVATE:
case PGO_DEACTIVATE:
deactivate_it:
/* skip the page if it's wired */
if (pg->wire_count != 0)
break;
uvm_lock_pageq();
KASSERT(pg->uanon == anon);
/* zap all mappings for the page. */
pmap_page_protect(pg, PROT_NONE);
/* ...and deactivate the page. */
uvm_pagedeactivate(pg);
uvm_unlock_pageq();
break;
case PGO_FREE:
/*
* If there are multiple references to
* the amap, just deactivate the page.
*/
if (amap_refs(amap) > 1)
goto deactivate_it;
/* XXX skip the page if it's wired */
if (pg->wire_count != 0) {
break;
}
amap_unadd(&entry->aref,
cp_start - entry->start);
refs = --anon->an_ref;
if (refs == 0)
uvm_anfree(anon);
break;
default:
panic("uvm_map_clean: weird flags");
}
}
amap_unlock(amap);
flush_object:
cp_start = MAX(entry->start, start);
cp_end = MIN(entry->end, end);
/*
* flush pages if we've got a valid backing object.
*
* Don't PGO_FREE if we don't have write permission
* and don't flush if this is a copy-on-write object
* since we can't know our permissions on it.
*/
if (uobj != NULL && ((flags & PGO_FREE) == 0 || ((entry->max_protection & PROT_WRITE) != 0 &&
(entry->etype & UVM_ET_COPYONWRITE) == 0))) {
rw_enter(uobj->vmobjlock, RW_WRITE);
rv = uobj->pgops->pgo_flush(uobj,
cp_start - entry->start + entry->offset,
cp_end - entry->start + entry->offset, flags);
rw_exit(uobj->vmobjlock);
if (rv == FALSE)
error = EFAULT;
}
}
vm_map_unlock_read(map);
return error;
}
/*
* UVM_MAP_CLIP_END implementation
*/
void
uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
{
struct vm_map_entry *tmp;
KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
tmp = uvm_mapent_alloc(map, 0);
/* Invoke splitentry. */
uvm_map_splitentry(map, entry, tmp, addr);
}
/*
* UVM_MAP_CLIP_START implementation
*
* Clippers are required to not change the pointers to the entry they are
* clipping on.
* Since uvm_map_splitentry turns the original entry into the lowest
* entry (address wise) we do a swap between the new entry and the original
* entry, prior to calling uvm_map_splitentry.
*/
void
uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
{
struct vm_map_entry *tmp;
struct uvm_addr_state *free;
/* Unlink original. */
free = uvm_map_uaddr_e(map, entry);
uvm_mapent_free_remove(map, free, entry);
uvm_mapent_addr_remove(map, entry);
/* Copy entry. */
KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
tmp = uvm_mapent_alloc(map, 0);
uvm_mapent_copy(entry, tmp);
/* Put new entry in place of original entry. */
uvm_mapent_addr_insert(map, tmp);
uvm_mapent_free_insert(map, free, tmp);
/* Invoke splitentry. */
uvm_map_splitentry(map, tmp, entry, addr);
}
/*
* Boundary fixer.
*/
static inline vaddr_t uvm_map_boundfix(vaddr_t, vaddr_t, vaddr_t);
static inline vaddr_t
uvm_map_boundfix(vaddr_t min, vaddr_t max, vaddr_t bound)
{
return (min < bound && max > bound) ? bound : max;
}
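/*
 * Example: uvm_map_boundfix(0x1000, 0x5000, 0x3000) yields 0x3000 (the
 * boundary lies inside the range and clips max), whereas
 * uvm_map_boundfix(0x1000, 0x5000, 0x8000) yields 0x5000 (boundary
 * outside the range, max unchanged).
 */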
/*
* Choose free list based on address at start of free space.
*
* The uvm_addr_state returned contains addr and is the first of:
* - uaddr_exe
* - uaddr_brk_stack
* - uaddr_any
*/
struct uvm_addr_state*
uvm_map_uaddr(struct vm_map *map, vaddr_t addr)
{
struct uvm_addr_state *uaddr;
int i;
/* Special case the first page, to prevent mmap from returning 0. */
if (addr < VMMAP_MIN_ADDR)
return NULL;
/* Upper bound for kernel maps at uvm_maxkaddr. */
if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
if (addr >= uvm_maxkaddr)
return NULL;
}
/* Is the address inside the exe-only map? */
if (map->uaddr_exe != NULL && addr >= map->uaddr_exe->uaddr_minaddr &&
addr < map->uaddr_exe->uaddr_maxaddr)
return map->uaddr_exe;
/* Check if the space falls inside brk/stack area. */
if ((addr >= map->b_start && addr < map->b_end) ||
(addr >= map->s_start && addr < map->s_end)) {
if (map->uaddr_brk_stack != NULL &&
addr >= map->uaddr_brk_stack->uaddr_minaddr &&
addr < map->uaddr_brk_stack->uaddr_maxaddr) {
return map->uaddr_brk_stack;
} else
return NULL;
}
/*
* Check the other selectors.
*
* These selectors are only marked as the owner, if they have insert
* functions.
*/
for (i = 0; i < nitems(map->uaddr_any); i++) {
uaddr = map->uaddr_any[i];
if (uaddr == NULL)
continue;
if (uaddr->uaddr_functions->uaddr_free_insert == NULL)
continue;
if (addr >= uaddr->uaddr_minaddr &&
addr < uaddr->uaddr_maxaddr)
return uaddr;
}
return NULL;
}
/*
* Choose free list based on address at start of free space.
*
* The uvm_addr_state returned contains addr and is the first of:
* - uaddr_exe
* - uaddr_brk_stack
* - uaddr_any
*/
struct uvm_addr_state*
uvm_map_uaddr_e(struct vm_map *map, struct vm_map_entry *entry)
{
return uvm_map_uaddr(map, VMMAP_FREE_START(entry));
}
/*
* Returns the first free-memory boundary that is crossed by [min-max].
*/
vsize_t
uvm_map_boundary(struct vm_map *map, vaddr_t min, vaddr_t max)
{
struct uvm_addr_state *uaddr;
int i;
/* Never return first page. */
max = uvm_map_boundfix(min, max, VMMAP_MIN_ADDR);
/* Treat the maxkaddr special, if the map is a kernel_map. */
if ((map->flags & VM_MAP_ISVMSPACE) == 0)
max = uvm_map_boundfix(min, max, uvm_maxkaddr);
/* Check for exe-only boundaries. */
if (map->uaddr_exe != NULL) {
max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_minaddr);
max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_maxaddr);
}
/* Check for brk/stack boundaries. */
if (map->uaddr_brk_stack != NULL) {
max = uvm_map_boundfix(min, max,
map->uaddr_brk_stack->uaddr_minaddr);
max = uvm_map_boundfix(min, max,
map->uaddr_brk_stack->uaddr_maxaddr);
}
/* Check other boundaries. */
for (i = 0; i < nitems(map->uaddr_any); i++) {
uaddr = map->uaddr_any[i];
if (uaddr != NULL) {
max = uvm_map_boundfix(min, max, uaddr->uaddr_minaddr);
max = uvm_map_boundfix(min, max, uaddr->uaddr_maxaddr);
}
}
/* Boundaries at stack and brk() area. */
max = uvm_map_boundfix(min, max, map->s_start);
max = uvm_map_boundfix(min, max, map->s_end);
max = uvm_map_boundfix(min, max, map->b_start);
max = uvm_map_boundfix(min, max, map->b_end);
return max;
}
/*
* Update map allocation start and end addresses from proc vmspace.
*/
void
uvm_map_vmspace_update(struct vm_map *map,
struct uvm_map_deadq *dead, int flags)
{
struct vmspace *vm;
vaddr_t b_start, b_end, s_start, s_end;
KASSERT(map->flags & VM_MAP_ISVMSPACE);
KASSERT(offsetof(struct vmspace, vm_map) == 0);
/*
* Derive actual allocation boundaries from vmspace.
*/
vm = (struct vmspace *)map;
b_start = (vaddr_t)vm->vm_daddr;
b_end = b_start + BRKSIZ;
s_start = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
s_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
#ifdef DIAGNOSTIC
if ((b_start & (vaddr_t)PAGE_MASK) != 0 ||
(b_end & (vaddr_t)PAGE_MASK) != 0 ||
(s_start & (vaddr_t)PAGE_MASK) != 0 ||
(s_end & (vaddr_t)PAGE_MASK) != 0) {
panic("uvm_map_vmspace_update: vmspace %p invalid bounds: "
"b=0x%lx-0x%lx s=0x%lx-0x%lx",
vm, b_start, b_end, s_start, s_end);
}
#endif
if (__predict_true(map->b_start == b_start && map->b_end == b_end &&
map->s_start == s_start && map->s_end == s_end))
return;
uvm_map_freelist_update(map, dead, b_start, b_end,
s_start, s_end, flags);
}
/*
* Grow kernel memory.
*
* This function is only called for kernel maps when an allocation fails.
*
* If the map has a gap that is large enough to accommodate alloc_sz, this
* function will make sure map->free will include it.
*/
void
uvm_map_kmem_grow(struct vm_map *map, struct uvm_map_deadq *dead,
vsize_t alloc_sz, int flags)
{
vsize_t sz;
vaddr_t end;
struct vm_map_entry *entry;
/* Kernel memory only. */
KASSERT((map->flags & VM_MAP_ISVMSPACE) == 0);
/* Destroy free list. */
uvm_map_freelist_update_clear(map, dead);
/* Include the guard page in the hard minimum requirement of alloc_sz. */
if (map->flags & VM_MAP_GUARDPAGES)
alloc_sz += PAGE_SIZE;
/*
* Grow by ALLOCMUL * alloc_sz, but at least VM_MAP_KSIZE_DELTA.
*
* Don't handle the case where the multiplication overflows:
* if that happens, the allocation is probably too big anyway.
*/
sz = MAX(VM_MAP_KSIZE_ALLOCMUL * alloc_sz, VM_MAP_KSIZE_DELTA);
/*
* Walk forward until a gap large enough for alloc_sz shows up.
*
* We assume the kernel map has no boundaries.
* uvm_maxkaddr may be zero.
*/
end = MAX(uvm_maxkaddr, map->min_offset);
entry = uvm_map_entrybyaddr(&map->addr, end);
while (entry && entry->fspace < alloc_sz)
entry = RBT_NEXT(uvm_map_addr, entry);
if (entry) {
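/* Grow from the start of the gap that was found, but never past max_offset. */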
end = MAX(VMMAP_FREE_START(entry), end);
end += MIN(sz, map->max_offset - end);
} else
end = map->max_offset;
/* Reserve pmap entries. */
#ifdef PMAP_GROWKERNEL
uvm_maxkaddr = pmap_growkernel(end);
#else
uvm_maxkaddr = MAX(uvm_maxkaddr, end);
#endif
/* Rebuild free list. */
uvm_map_freelist_update_refill(map, flags);
}
/*
* Freelist update subfunction: unlink all entries from freelists.
*/
void
uvm_map_freelist_update_clear(struct vm_map *map, struct uvm_map_deadq *dead)
{
struct uvm_addr_state *free;
struct vm_map_entry *entry, *prev, *next;
prev = NULL;
for (entry = RBT_MIN(uvm_map_addr, &map->addr); entry != NULL;
entry = next) {
next = RBT_NEXT(uvm_map_addr, entry);
free = uvm_map_uaddr_e(map, entry);
uvm_mapent_free_remove(map, free, entry);
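/* Fold zero-sized placeholder entries into the previous entry's free space and discard them. */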
if (prev != NULL && entry->start == entry->end) {
prev->fspace += VMMAP_FREE_END(entry) - entry->end;
uvm_mapent_addr_remove(map, entry);
DEAD_ENTRY_PUSH(dead, entry);
} else
prev = entry;
}
}
/*
* Freelist update subfunction: refill the freelists with entries.
*/
void
uvm_map_freelist_update_refill(struct vm_map *map, int flags)
{
struct vm_map_entry *entry;
vaddr_t min, max;
RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
min = VMMAP_FREE_START(entry);
max = VMMAP_FREE_END(entry);
entry->fspace = 0;
entry = uvm_map_fix_space(map, entry, min, max, flags);
}
uvm_tree_sanity(map, __FILE__, __LINE__);
}
/*
* Change {b,s}_{start,end} allocation ranges and associated free lists.
*/
void
uvm_map_freelist_update(struct vm_map *map, struct uvm_map_deadq *dead,
vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags)
{
KDASSERT(b_end >= b_start && s_end >= s_start);
/* Clear all free lists. */
uvm_map_freelist_update_clear(map, dead);
/* Apply new bounds. */
map->b_start = b_start;
map->b_end = b_end;
map->s_start = s_start;
map->s_end = s_end;
/* Refill free lists. */
uvm_map_freelist_update_refill(map, flags);
}
/*
* Assign a uvm_addr_state to the specified pointer in vm_map.
*
* May sleep.
*/
void
uvm_map_set_uaddr(struct vm_map *map, struct uvm_addr_state **which,
struct uvm_addr_state *newval)
{
struct uvm_map_deadq dead;
/* Pointer which must be in this map. */
KASSERT(which != NULL);
KASSERT((void*)map <= (void*)(which) &&
(void*)(which) < (void*)(map + 1));
vm_map_lock(map);
TAILQ_INIT(&dead);
uvm_map_freelist_update_clear(map, &dead);
uvm_addr_destroy(*which);
*which = newval;
uvm_map_freelist_update_refill(map, 0);
vm_map_unlock(map);
uvm_unmap_detach(&dead, 0);
}
/*
* Correct space insert.
*
* Entry must not be on any freelist.
*/
struct vm_map_entry*
uvm_map_fix_space(struct vm_map *map, struct vm_map_entry *entry,
vaddr_t min, vaddr_t max, int flags)
{
struct uvm_addr_state *free, *entfree;
vaddr_t lmax;
KASSERT(entry == NULL || (entry->etype & UVM_ET_FREEMAPPED) == 0);
KDASSERT(min <= max);
KDASSERT((entry != NULL && VMMAP_FREE_END(entry) == min) ||
min == map->min_offset);
/*
* During the function, entfree will always point at the uaddr state
* for entry.
*/
entfree = (entry == NULL ? NULL :
uvm_map_uaddr_e(map, entry));
while (min != max) {
/* Claim guard page for entry. */
if ((map->flags & VM_MAP_GUARDPAGES) && entry != NULL &&
VMMAP_FREE_END(entry) == entry->end &&
entry->start != entry->end) {
if (max - min == 2 * PAGE_SIZE) {
/*
* If the free-space gap is exactly 2 pages,
* we make the guard 2 pages instead of 1.
* Because in a guarded map, an area needs
* at least 2 pages to allocate from:
* one page for the allocation and one for
* the guard.
*/
entry->guard = 2 * PAGE_SIZE;
min = max;
} else {
entry->guard = PAGE_SIZE;
min += PAGE_SIZE;
}
continue;
}
/*
* Handle the case where entry has a 2-page guard, but the
* space after entry is freed.
*/
if (entry != NULL && entry->fspace == 0 &&
entry->guard > PAGE_SIZE) {
entry->guard = PAGE_SIZE;
min = VMMAP_FREE_START(entry);
}
lmax = uvm_map_boundary(map, min, max);
free = uvm_map_uaddr(map, min);
/*
* Entries are merged if they point at the same uvm_free().
* Exception to that rule: if min == uvm_maxkaddr, a new
* entry is started regardless (otherwise the allocators
* will get confused).
*/
if (entry != NULL && free == entfree &&
!((map->flags & VM_MAP_ISVMSPACE) == 0 &&
min == uvm_maxkaddr)) {
KDASSERT(VMMAP_FREE_END(entry) == min);
entry->fspace += lmax - min;
} else {
/*
* Commit entry to the free list: no more space will be
* added to it.
* We'll start a new entry and add space to that entry
* instead.
*/
if (entry != NULL)
uvm_mapent_free_insert(map, entfree, entry);
/* New entry for new uaddr. */
entry = uvm_mapent_alloc(map, flags);
KDASSERT(entry != NULL);
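/* Zero-sized entry: start == end == min; it only tracks the free space that follows it. */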
entry->end = entry->start = min;
entry->guard = 0;
entry->fspace = lmax - min;
entry->object.uvm_obj = NULL;
entry->offset = 0;
entry->etype = 0;
entry->protection = entry->max_protection = 0;
entry->inheritance = 0;
entry->wired_count = 0;
entry->advice = 0;
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = NULL;
uvm_mapent_addr_insert(map, entry);
entfree = free;
}
min = lmax;
}
/* Finally put entry on the uaddr state. */
if (entry != NULL)
uvm_mapent_free_insert(map, entfree, entry);
return entry;
}
/*
* MQuery style of allocation.
*
* This allocator searches forward until sufficient space is found to map
* the given size.
*
* XXX: factor in offset (via pmap_prefer) and protection?
*/
int
uvm_map_mquery(struct vm_map *map, vaddr_t *addr_p, vsize_t sz, voff_t offset,
int flags)
{
struct vm_map_entry *entry, *last;
vaddr_t addr;
vaddr_t tmp, pmap_align, pmap_offset;
int error;
addr = *addr_p;
vm_map_lock_read(map);
/* Configure pmap prefer. */
if (offset != UVM_UNKNOWN_OFFSET) {
pmap_align = MAX(PAGE_SIZE, PMAP_PREFER_ALIGN());
pmap_offset = PMAP_PREFER_OFFSET(offset);
} else {
pmap_align = PAGE_SIZE;
pmap_offset = 0;
}
/* Align address to pmap_prefer unless FLAG_FIXED is set. */
if (!(flags & UVM_FLAG_FIXED) && offset != UVM_UNKNOWN_OFFSET) {
tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
if (tmp < addr)
tmp += pmap_align;
addr = tmp;
}
/* First, check if the requested range is fully available. */
entry = uvm_map_entrybyaddr(&map->addr, addr);
last = NULL;
if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
error = 0;
goto out;
}
if (flags & UVM_FLAG_FIXED) {
error = EINVAL;
goto out;
}
error = ENOMEM; /* Default error from here. */
/*
* At this point, the memory at <addr, sz> is not available.
* The reasons are:
* [1] it's outside the map,
* [2] it starts in used memory (and therefore needs to move
* toward the first free page in entry),
* [3] it starts in free memory but bumps into used memory.
*
* Note that for case [2], the forward moving is handled by the
* for loop below.
*/
if (entry == NULL) {
/* [1] Outside the map. */
if (addr >= map->max_offset)
goto out;
else
entry = RBT_MIN(uvm_map_addr, &map->addr);
} else if (VMMAP_FREE_START(entry) <= addr) {
/* [3] Bumped into used memory. */
entry = RBT_NEXT(uvm_map_addr, entry);
}
/* Test if the next entry is sufficient for the allocation. */
for (; entry != NULL; entry = RBT_NEXT(uvm_map_addr, entry)) {
if (entry->fspace == 0)
continue;
addr = VMMAP_FREE_START(entry);
restart: /* Restart address checks on address change. */
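/* Round addr up to the next address matching pmap_offset modulo pmap_align. */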
tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
if (tmp < addr)
tmp += pmap_align;
addr = tmp;
if (addr >= VMMAP_FREE_END(entry))
continue;
/* Skip brk() allocation addresses. */
if (addr + sz > map->b_start && addr < map->b_end) {
if (VMMAP_FREE_END(entry) > map->b_end) {
addr = map->b_end;
goto restart;
} else
continue;
}
/* Skip stack allocation addresses. */
if (addr + sz > map->s_start && addr < map->s_end) {
if (VMMAP_FREE_END(entry) > map->s_end) {
addr = map->s_end;
goto restart;
} else
continue;
}
last = NULL;
if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
error = 0;
goto out;
}
}
out:
vm_map_unlock_read(map);
if (error == 0)
*addr_p = addr;
return error;
}
boolean_t
vm_map_lock_try_ln(struct vm_map *map, char *file, int line)
{
boolean_t rv;
if (map->flags & VM_MAP_INTRSAFE) {
rv = mtx_enter_try(&map->mtx);
} else {
mtx_enter(&map->flags_lock);
if (map->flags & VM_MAP_BUSY) {
mtx_leave(&map->flags_lock);
return (FALSE);
}
mtx_leave(&map->flags_lock);
rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0);
/* check if the lock is busy and back out if we won the race */
if (rv) {
mtx_enter(&map->flags_lock);
if (map->flags & VM_MAP_BUSY) {
rw_exit(&map->lock);
rv = FALSE;
}
mtx_leave(&map->flags_lock);
}
}
if (rv) {
map->timestamp++;
LPRINTF(("map lock: %p (at %s %d)\n", map, file, line));
uvm_tree_sanity(map, file, line);
uvm_tree_size_chk(map, file, line);
}
return (rv);
}
void
vm_map_lock_ln(struct vm_map *map, char *file, int line)
{
if ((map->flags & VM_MAP_INTRSAFE) == 0) {
do {
mtx_enter(&map->flags_lock);
tryagain:
while (map->flags & VM_MAP_BUSY) {
map->flags |= VM_MAP_WANTLOCK;
msleep_nsec(&map->flags, &map->flags_lock,
PVM, vmmapbsy, INFSLP);
}
mtx_leave(&map->flags_lock);
} while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0);
/* check if the lock is busy and back out if we won the race */
mtx_enter(&map->flags_lock);
if (map->flags & VM_MAP_BUSY) {
rw_exit(&map->lock);
goto tryagain;
}
mtx_leave(&map->flags_lock);
} else {
mtx_enter(&map->mtx);
}
map->timestamp++;
LPRINTF(("map lock: %p (at %s %d)\n", map, file, line));
uvm_tree_sanity(map, file, line);
uvm_tree_size_chk(map, file, line);
}
void
vm_map_lock_read_ln(struct vm_map *map, char *file, int line)
{
if ((map->flags & VM_MAP_INTRSAFE) == 0)
rw_enter_read(&map->lock);
else
mtx_enter(&map->mtx);
LPRINTF(("map lock: %p (at %s %d)\n", map, file, line));
uvm_tree_sanity(map, file, line);
uvm_tree_size_chk(map, file, line);
}
void
vm_map_unlock_ln(struct vm_map *map, char *file, int line)
{
uvm_tree_sanity(map, file, line);
uvm_tree_size_chk(map, file, line);
LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
if ((map->flags & VM_MAP_INTRSAFE) == 0)
rw_exit(&map->lock);
else
mtx_leave(&map->mtx);
}
void
vm_map_unlock_read_ln(struct vm_map *map, char *file, int line)
{
/* XXX: RO */ uvm_tree_sanity(map, file, line);
/* XXX: RO */ uvm_tree_size_chk(map, file, line);
LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
if ((map->flags & VM_MAP_INTRSAFE) == 0)
rw_exit_read(&map->lock);
else
mtx_leave(&map->mtx);
}
void
vm_map_downgrade_ln(struct vm_map *map, char *file, int line)
{
uvm_tree_sanity(map, file, line);
uvm_tree_size_chk(map, file, line);
LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
LPRINTF(("map lock: %p (at %s %d)\n", map, file, line));
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
if ((map->flags & VM_MAP_INTRSAFE) == 0)
rw_enter(&map->lock, RW_DOWNGRADE);
}
void
vm_map_upgrade_ln(struct vm_map *map, char *file, int line)
{
/* XXX: RO */ uvm_tree_sanity(map, file, line);
/* XXX: RO */ uvm_tree_size_chk(map, file, line);
LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
if ((map->flags & VM_MAP_INTRSAFE) == 0) {
rw_exit_read(&map->lock);
rw_enter_write(&map->lock);
}
LPRINTF(("map lock: %p (at %s %d)\n", map, file, line));
uvm_tree_sanity(map, file, line);
}
void
vm_map_busy_ln(struct vm_map *map, char *file, int line)
{
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
mtx_enter(&map->flags_lock);
map->flags |= VM_MAP_BUSY;
mtx_leave(&map->flags_lock);
}
void
vm_map_unbusy_ln(struct vm_map *map, char *file, int line)
{
int oflags;
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
mtx_enter(&map->flags_lock);
oflags = map->flags;
map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK);
mtx_leave(&map->flags_lock);
if (oflags & VM_MAP_WANTLOCK)
wakeup(&map->flags);
}
#ifndef SMALL_KERNEL
int
uvm_map_fill_vmmap(struct vm_map *map, struct kinfo_vmentry *kve,
size_t *lenp)
{
struct vm_map_entry *entry;
vaddr_t start;
int cnt, maxcnt, error = 0;
KASSERT(*lenp > 0);
KASSERT((*lenp % sizeof(*kve)) == 0);
cnt = 0;
maxcnt = *lenp / sizeof(*kve);
KASSERT(maxcnt > 0);
/*
* Return only entries whose address is above the given base
* address. This allows userland to iterate without knowing the
* number of entries beforehand.
*/
start = (vaddr_t)kve[0].kve_start;
vm_map_lock(map);
RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
if (cnt == maxcnt) {
error = ENOMEM;
break;
}
if (start != 0 && entry->start < start)
continue;
kve->kve_start = entry->start;
kve->kve_end = entry->end;
kve->kve_guard = entry->guard;
kve->kve_fspace = entry->fspace;
kve->kve_fspace_augment = entry->fspace_augment;
kve->kve_offset = entry->offset;
kve->kve_wired_count = entry->wired_count;
kve->kve_etype = entry->etype;
kve->kve_protection = entry->protection;
kve->kve_max_protection = entry->max_protection;
kve->kve_advice = entry->advice;
kve->kve_inheritance = entry->inheritance;
kve->kve_flags = entry->flags;
kve++;
cnt++;
}
vm_map_unlock(map);
KASSERT(cnt <= maxcnt);
*lenp = sizeof(*kve) * cnt;
return error;
}
#endif
RBT_GENERATE_AUGMENT(uvm_map_addr, vm_map_entry, daddrs.addr_entry,
uvm_mapentry_addrcmp, uvm_map_addr_augment);
/*
* MD code: vmspace allocator setup.
*/
#ifdef __i386__
void
uvm_map_setup_md(struct vm_map *map)
{
vaddr_t min, max;
min = map->min_offset;
max = map->max_offset;
/*
* Ensure the selectors will not try to manage page 0;
* it's too special.
*/
if (min < VMMAP_MIN_ADDR)
min = VMMAP_MIN_ADDR;
#if 0 /* Cool stuff, not yet */
/* Executable code is special. */
map->uaddr_exe = uaddr_rnd_create(min, I386_MAX_EXE_ADDR);
/* Place normal allocations beyond executable mappings. */
map->uaddr_any[3] = uaddr_pivot_create(2 * I386_MAX_EXE_ADDR, max);
#else /* Crappy stuff, for now */
map->uaddr_any[0] = uaddr_rnd_create(min, max);
#endif
#ifndef SMALL_KERNEL
map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
#endif /* !SMALL_KERNEL */
}
#elif __LP64__
void
uvm_map_setup_md(struct vm_map *map)
{
vaddr_t min, max;
min = map->min_offset;
max = map->max_offset;
/*
* Ensure the selectors will not try to manage page 0;
* it's too special.
*/
if (min < VMMAP_MIN_ADDR)
min = VMMAP_MIN_ADDR;
#if 0 /* Cool stuff, not yet */
map->uaddr_any[3] = uaddr_pivot_create(MAX(min, 0x100000000ULL), max);
#else /* Crappy stuff, for now */
map->uaddr_any[0] = uaddr_rnd_create(min, max);
#endif
#ifndef SMALL_KERNEL
map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
#endif /* !SMALL_KERNEL */
}
#else /* non-i386, 32 bit */
void
uvm_map_setup_md(struct vm_map *map)
{
vaddr_t min, max;
min = map->min_offset;
max = map->max_offset;
/*
* Ensure the selectors will not try to manage page 0;
* it's too special.
*/
if (min < VMMAP_MIN_ADDR)
min = VMMAP_MIN_ADDR;
#if 0 /* Cool stuff, not yet */
map->uaddr_any[3] = uaddr_pivot_create(min, max);
#else /* Crappy stuff, for now */
map->uaddr_any[0] = uaddr_rnd_create(min, max);
#endif
#ifndef SMALL_KERNEL
map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
#endif /* !SMALL_KERNEL */
}
#endif
/* $OpenBSD: kern_exec.c,v 1.231 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_exec.c,v 1.75 1996/02/09 18:59:28 christos Exp $ */
/*-
* Copyright (C) 1993, 1994 Christopher G. Demetriou
* Copyright (C) 1992 Wolfgang Solfrank.
* Copyright (C) 1992 TooLs GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by TooLs GmbH.
* 4. The name of TooLs GmbH may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/acct.h>
#include <sys/exec.h>
#include <sys/exec_elf.h>
#include <sys/ktrace.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/conf.h>
#include <sys/pledge.h>
#ifdef SYSVSHM
#include <sys/shm.h>
#endif
#include <sys/syscallargs.h>
#include <uvm/uvm_extern.h>
#include <machine/tcb.h>
#include <sys/timetc.h>
struct uvm_object *sigobject; /* shared sigcode object */
struct uvm_object *timekeep_object;
struct timekeep *timekeep;
void unveil_destroy(struct process *ps);
const struct kmem_va_mode kv_exec = {
.kv_wait = 1,
.kv_map = &exec_map
};
/*
* Map the shared signal code.
*/
int exec_sigcode_map(struct process *);
/*
* Map the shared timekeep page.
*/
int exec_timekeep_map(struct process *);
/*
* If non-zero, stackgap_random specifies the upper limit of the random gap size
* added to the fixed stack position. Must be a power of two.
*/
int stackgap_random = STACKGAP_RANDOM;
/*
* check exec:
* given an "executable" described in the exec package's namei info,
* see what we can do with it.
*
* ON ENTRY:
* exec package with appropriate namei info
* proc pointer of exec'ing proc
* NO SELF-LOCKED VNODES
*
* ON EXIT:
* error: nothing held, etc. exec header still allocated.
* ok: filled exec package, one locked vnode.
*
* EXEC SWITCH ENTRY:
* Locked vnode to check, exec package, proc.
*
* EXEC SWITCH EXIT:
* ok: return 0, filled exec package, one locked vnode.
* error: destructive:
* everything deallocated except exec header.
* non-destructive:
* error code, locked vnode, exec header unmodified
*/
int
check_exec(struct proc *p, struct exec_package *epp)
{
int error, i;
struct vnode *vp;
struct nameidata *ndp;
size_t resid;
ndp = epp->ep_ndp;
ndp->ni_cnd.cn_nameiop = LOOKUP;
ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF | SAVENAME;
if (epp->ep_flags & EXEC_INDIR)
ndp->ni_cnd.cn_flags |= BYPASSUNVEIL;
/* first get the vnode */
if ((error = namei(ndp)) != 0)
return (error);
epp->ep_vp = vp = ndp->ni_vp;
/* check for regular file */
if (vp->v_type != VREG) {
error = EACCES;
goto bad1;
}
/* get attributes */
if ((error = VOP_GETATTR(vp, epp->ep_vap, p->p_ucred, p)) != 0)
goto bad1;
/* Check mount point */
if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
error = EACCES;
goto bad1;
}
/* SUID programs may not be started with execpromises */
if ((epp->ep_vap->va_mode & (VSUID | VSGID)) &&
(p->p_p->ps_flags & PS_EXECPLEDGE)) {
error = EACCES;
goto bad1;
}
if ((vp->v_mount->mnt_flag & MNT_NOSUID))
epp->ep_vap->va_mode &= ~(VSUID | VSGID);
/* check access. for root we have to see if any exec bit on */
if ((error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p)) != 0)
goto bad1;
if ((epp->ep_vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
error = EACCES;
goto bad1;
}
/* try to open it */
if ((error = VOP_OPEN(vp, FREAD, p->p_ucred, p)) != 0)
goto bad1;
/* unlock vp, we need it unlocked from here */
VOP_UNLOCK(vp);
/* now we have the file, get the exec header */
error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
UIO_SYSSPACE, 0, p->p_ucred, &resid, p);
if (error)
goto bad2;
epp->ep_hdrvalid = epp->ep_hdrlen - resid;
/*
* set up the vmcmds for creation of the process
* address space
*/
error = ENOEXEC;
for (i = 0; i < nexecs && error != 0; i++) {
int newerror;
if (execsw[i].es_check == NULL)
continue;
newerror = (*execsw[i].es_check)(p, epp);
/* make sure the first "interesting" error code is saved. */
if (!newerror || error == ENOEXEC)
error = newerror;
if (epp->ep_flags & EXEC_DESTR && error != 0)
return (error);
}
if (!error) {
/* check that entry point is sane */
if (epp->ep_entry > VM_MAXUSER_ADDRESS) {
error = ENOEXEC;
}
/* check limits */
if ((epp->ep_tsize > MAXTSIZ) ||
(epp->ep_dsize > lim_cur(RLIMIT_DATA)))
error = ENOMEM;
if (!error)
return (0);
}
/*
* free any vmspace-creation commands,
* and release their references
*/
kill_vmcmds(&epp->ep_vmcmds);
bad2:
/*
* close the vnode, free the pathname buf, and punt.
*/
vn_close(vp, FREAD, p->p_ucred, p);
pool_put(&namei_pool, ndp->ni_cnd.cn_pnbuf);
return (error);
bad1:
/*
* free the namei pathname buffer, and put the vnode
* (which we don't yet have open).
*/
pool_put(&namei_pool, ndp->ni_cnd.cn_pnbuf);
vput(vp);
return (error);
}
/*
* exec system call
*/
int
sys_execve(struct proc *p, void *v, register_t *retval)
{
struct sys_execve_args /* {
syscallarg(const char *) path;
syscallarg(char *const *) argp;
syscallarg(char *const *) envp;
} */ *uap = v;
int error;
struct exec_package pack;
struct nameidata nid;
struct vattr attr;
struct ucred *cred = p->p_ucred;
char *argp;
char * const *cpp, *dp, *sp;
#ifdef KTRACE
char *env_start;
#endif
struct process *pr = p->p_p;
long argc, envc;
size_t len, sgap, dstsize;
#ifdef MACHINE_STACK_GROWS_UP
size_t slen;
#endif
char *stack;
struct ps_strings arginfo;
struct vmspace *vm;
struct vnode *otvp;
/* get other threads to stop */
if ((error = single_thread_set(p, SINGLE_UNWIND, 1)))
return (error);
/*
* Cheap solution to complicated problems.
* Mark this process as "leave me alone, I'm execing".
*/
atomic_setbits_int(&pr->ps_flags, PS_INEXEC);
NDINIT(&nid, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
nid.ni_pledge = PLEDGE_EXEC;
nid.ni_unveil = UNVEIL_EXEC;
/*
* initialize the fields of the exec package.
*/
pack.ep_name = (char *)SCARG(uap, path);
pack.ep_hdr = malloc(exec_maxhdrsz, M_EXEC, M_WAITOK);
pack.ep_hdrlen = exec_maxhdrsz;
pack.ep_hdrvalid = 0;
pack.ep_ndp = &nid;
pack.ep_interp = NULL;
pack.ep_args = NULL;
pack.ep_auxinfo = NULL;
VMCMDSET_INIT(&pack.ep_vmcmds);
pack.ep_vap = &attr;
pack.ep_flags = 0;
/* see if we can run it. */
if ((error = check_exec(p, &pack)) != 0) {
goto freehdr;
}
/* XXX -- THE FOLLOWING SECTION NEEDS MAJOR CLEANUP */
/* allocate an argument buffer */
argp = km_alloc(NCARGS, &kv_exec, &kp_pageable, &kd_waitok);
#ifdef DIAGNOSTIC
if (argp == NULL)
panic("execve: argp == NULL");
#endif
dp = argp;
argc = 0;
/*
* Copy the fake args list, if there's one, freeing it as we go.
* exec_script_makecmds() allocates either 2 or 3 fake args bounded
* by MAXINTERP + MAXPATHLEN < NCARGS so no overflow can happen.
*/
if (pack.ep_flags & EXEC_HASARGL) {
dstsize = NCARGS;
for (; pack.ep_fa[argc] != NULL; argc++) {
len = strlcpy(dp, pack.ep_fa[argc], dstsize);
len++;
dp += len;
dstsize -= len;
if (pack.ep_fa[argc+1] != NULL)
free(pack.ep_fa[argc], M_EXEC, len);
else
free(pack.ep_fa[argc], M_EXEC, MAXPATHLEN);
}
free(pack.ep_fa, M_EXEC, 4 * sizeof(char *));
pack.ep_flags &= ~EXEC_HASARGL;
}
/* Now get argv & environment */
if (!(cpp = SCARG(uap, argp))) {
error = EFAULT;
goto bad;
}
if (pack.ep_flags & EXEC_SKIPARG)
cpp++;
while (1) {
len = argp + ARG_MAX - dp;
if ((error = copyin(cpp, &sp, sizeof(sp))) != 0)
goto bad;
if (!sp)
break;
if ((error = copyinstr(sp, dp, len, &len)) != 0) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto bad;
}
dp += len;
cpp++;
argc++;
}
/* must have at least one argument */
if (argc == 0) {
error = EINVAL;
goto bad;
}
#ifdef KTRACE
if (KTRPOINT(p, KTR_EXECARGS))
ktrexec(p, KTR_EXECARGS, argp, dp - argp);
#endif
envc = 0;
/* environment does not need to be there */
if ((cpp = SCARG(uap, envp)) != NULL ) {
#ifdef KTRACE
env_start = dp;
#endif
while (1) {
len = argp + ARG_MAX - dp;
if ((error = copyin(cpp, &sp, sizeof(sp))) != 0)
goto bad;
if (!sp)
break;
if ((error = copyinstr(sp, dp, len, &len)) != 0) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto bad;
}
dp += len;
cpp++;
envc++;
}
#ifdef KTRACE
if (KTRPOINT(p, KTR_EXECENV))
ktrexec(p, KTR_EXECENV, env_start, dp - env_start);
#endif
}
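/* Round dp up to the stack alignment before laying out the stack contents. */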
dp = (char *)(((long)dp + _STACKALIGNBYTES) & ~_STACKALIGNBYTES);
/*
* If we have enabled random stackgap, the stack itself has already
* been moved from a random location, but is still aligned to a page
* boundary. Provide the lower bits of random placement now.
*/
if (stackgap_random == 0) {
sgap = 0;
} else {
sgap = arc4random() & PAGE_MASK;
sgap = (sgap + _STACKALIGNBYTES) & ~_STACKALIGNBYTES;
}
/* Now check if args & environ fit into new stack */
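/*
* The required length covers: the argv/envp pointer arrays (including
* their NULL terminators and ELF_AUX_WORDS auxinfo slots), the argc word,
* the copied argument/environment strings (dp - argp), the random stack
* gap and struct ps_strings, rounded up to the stack alignment.
*/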
len = ((argc + envc + 2 + ELF_AUX_WORDS) * sizeof(char *) +
sizeof(long) + dp + sgap + sizeof(struct ps_strings)) - argp;
len = (len + _STACKALIGNBYTES) &~ _STACKALIGNBYTES;
if (len > pack.ep_ssize) { /* in effect, compare to initial limit */
error = ENOMEM;
goto bad;
}
/* adjust "active stack depth" for process VSZ */
pack.ep_ssize = len; /* maybe should go elsewhere, but... */
/*
* we're committed: any further errors will kill the process, so
* kill the other threads now.
*/
single_thread_set(p, SINGLE_EXIT, 1);
/*
* Prepare vmspace for remapping. Note that uvmspace_exec can replace
* ps_vmspace!
*/
uvmspace_exec(p, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
vm = pr->ps_vmspace;
/* Now map address space */
vm->vm_taddr = (char *)trunc_page(pack.ep_taddr);
vm->vm_tsize = atop(round_page(pack.ep_taddr + pack.ep_tsize) -
trunc_page(pack.ep_taddr));
vm->vm_daddr = (char *)trunc_page(pack.ep_daddr);
vm->vm_dsize = atop(round_page(pack.ep_daddr + pack.ep_dsize) -
trunc_page(pack.ep_daddr));
vm->vm_dused = 0;
vm->vm_ssize = atop(round_page(pack.ep_ssize));
vm->vm_maxsaddr = (char *)pack.ep_maxsaddr;
vm->vm_minsaddr = (char *)pack.ep_minsaddr;
/* create the new process's VM space by running the vmcmds */
#ifdef DIAGNOSTIC
if (pack.ep_vmcmds.evs_used == 0)
panic("execve: no vmcmds");
#endif
error = exec_process_vmcmds(p, &pack);
/* if an error happened, deallocate and punt */
if (error)
goto exec_abort;
#ifdef MACHINE_STACK_GROWS_UP
pr->ps_strings = (vaddr_t)vm->vm_maxsaddr + sgap;
if (uvm_map_protect(&vm->vm_map, (vaddr_t)vm->vm_maxsaddr,
trunc_page(pr->ps_strings), PROT_NONE, TRUE))
goto exec_abort;
#else
pr->ps_strings = (vaddr_t)vm->vm_minsaddr - sizeof(arginfo) - sgap;
if (uvm_map_protect(&vm->vm_map,
round_page(pr->ps_strings + sizeof(arginfo)),
(vaddr_t)vm->vm_minsaddr, PROT_NONE, TRUE))
goto exec_abort;
#endif
memset(&arginfo, 0, sizeof(arginfo));
/* remember information about the process */
arginfo.ps_nargvstr = argc;
arginfo.ps_nenvstr = envc;
#ifdef MACHINE_STACK_GROWS_UP
stack = (char *)vm->vm_maxsaddr + sizeof(arginfo) + sgap;
slen = len - sizeof(arginfo) - sgap;
#else
stack = (char *)(vm->vm_minsaddr - len);
#endif
/* Now copy argc, args & environ to new stack */
if (!copyargs(&pack, &arginfo, stack, argp))
goto exec_abort;
/* copy out the process's ps_strings structure */
if (copyout(&arginfo, (char *)pr->ps_strings, sizeof(arginfo)))
goto exec_abort;
stopprofclock(pr); /* stop profiling */
fdcloseexec(p); /* handle close on exec */
execsigs(p); /* reset caught signals */
TCB_SET(p, NULL); /* reset the TCB address */
pr->ps_kbind_addr = 0; /* reset the kbind bits */
pr->ps_kbind_cookie = 0;
arc4random_buf(&pr->ps_sigcookie, sizeof pr->ps_sigcookie);
/* set command name & other accounting info */
memset(pr->ps_comm, 0, sizeof(pr->ps_comm));
strlcpy(pr->ps_comm, nid.ni_cnd.cn_nameptr, sizeof(pr->ps_comm));
pr->ps_acflag &= ~AFORK;
/* record proc's vnode, for use by sysctl */
otvp = pr->ps_textvp;
vref(pack.ep_vp);
pr->ps_textvp = pack.ep_vp;
if (otvp)
vrele(otvp);
atomic_setbits_int(&pr->ps_flags, PS_EXEC);
if (pr->ps_flags & PS_PPWAIT) {
atomic_clearbits_int(&pr->ps_flags, PS_PPWAIT);
atomic_clearbits_int(&pr->ps_pptr->ps_flags, PS_ISPWAIT);
wakeup(pr->ps_pptr);
}
/*
* If process does execve() while it has a mismatched real,
* effective, or saved uid/gid, we set PS_SUGIDEXEC.
*/
if (cred->cr_uid != cred->cr_ruid ||
cred->cr_uid != cred->cr_svuid ||
cred->cr_gid != cred->cr_rgid ||
cred->cr_gid != cred->cr_svgid)
atomic_setbits_int(&pr->ps_flags, PS_SUGIDEXEC);
else
atomic_clearbits_int(&pr->ps_flags, PS_SUGIDEXEC);
if (pr->ps_flags & PS_EXECPLEDGE) {
pr->ps_pledge = pr->ps_execpledge;
atomic_setbits_int(&pr->ps_flags, PS_PLEDGE);
} else {
atomic_clearbits_int(&pr->ps_flags, PS_PLEDGE);
pr->ps_pledge = 0;
/* XXX XXX XXX XXX */
/* Clear our unveil paths out so the child
* starts afresh
*/
unveil_destroy(pr);
pr->ps_uvdone = 0;
}
/*
* deal with set[ug]id.
* MNT_NOEXEC has already been used to disable s[ug]id.
*/
if ((attr.va_mode & (VSUID | VSGID)) && proc_cansugid(p)) {
int i;
atomic_setbits_int(&pr->ps_flags, PS_SUGID|PS_SUGIDEXEC);
#ifdef KTRACE
/*
* If process is being ktraced, turn off - unless
* root set it.
*/
if (pr->ps_tracevp && !(pr->ps_traceflag & KTRFAC_ROOT))
ktrcleartrace(pr);
#endif
p->p_ucred = cred = crcopy(cred);
if (attr.va_mode & VSUID)
cred->cr_uid = attr.va_uid;
if (attr.va_mode & VSGID)
cred->cr_gid = attr.va_gid;
/*
* For set[ug]id processes, a few caveats apply to
* stdin, stdout, and stderr.
*/
error = 0;
fdplock(p->p_fd);
for (i = 0; i < 3; i++) {
struct file *fp = NULL;
/*
* NOTE - This will never return NULL because of
* immature fds. The file descriptor table is not
* shared because we're suid.
*/
fp = fd_getfile(p->p_fd, i);
/*
* Ensure that stdin, stdout, and stderr are already
* allocated. We do not want userland to accidentally
* allocate descriptors in this range which has implied
* meaning to libc.
*/
if (fp == NULL) {
short flags = FREAD | (i == 0 ? 0 : FWRITE);
struct vnode *vp;
int indx;
if ((error = falloc(p, &fp, &indx)) != 0)
break;
#ifdef DIAGNOSTIC
if (indx != i)
panic("sys_execve: falloc indx != i");
#endif
if ((error = cdevvp(getnulldev(), &vp)) != 0) {
fdremove(p->p_fd, indx);
closef(fp, p);
break;
}
if ((error = VOP_OPEN(vp, flags, cred, p)) != 0) {
fdremove(p->p_fd, indx);
closef(fp, p);
vrele(vp);
break;
}
if (flags & FWRITE)
vp->v_writecount++;
fp->f_flag = flags;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_data = (caddr_t)vp;
fdinsert(p->p_fd, indx, 0, fp);
}
FRELE(fp, p);
}
fdpunlock(p->p_fd);
if (error)
goto exec_abort;
} else
atomic_clearbits_int(&pr->ps_flags, PS_SUGID);
/*
* Reset the saved ugids and update the process's copy of the
* creds if the creds have been changed
*/
if (cred->cr_uid != cred->cr_svuid ||
cred->cr_gid != cred->cr_svgid) {
/* make sure we have unshared ucreds */
p->p_ucred = cred = crcopy(cred);
cred->cr_svuid = cred->cr_uid;
cred->cr_svgid = cred->cr_gid;
}
if (pr->ps_ucred != cred) {
struct ucred *ocred;
ocred = pr->ps_ucred;
crhold(cred);
pr->ps_ucred = cred;
crfree(ocred);
}
if (pr->ps_flags & PS_SUGIDEXEC) {
cancel_all_itimers();
}
/* reset CPU time usage for the thread, but not the process */
timespecclear(&p->p_tu.tu_runtime);
p->p_tu.tu_uticks = p->p_tu.tu_sticks = p->p_tu.tu_iticks = 0;
km_free(argp, NCARGS, &kv_exec, &kp_pageable);
pool_put(&namei_pool, nid.ni_cnd.cn_pnbuf);
vn_close(pack.ep_vp, FREAD, cred, p);
/*
* notify others that we exec'd
*/
KNOTE(&pr->ps_klist, NOTE_EXEC);
/* map the process's timekeep page, needs to be before exec_elf_fixup */
if (exec_timekeep_map(pr))
goto free_pack_abort;
/* setup new registers and do misc. setup. */
if (exec_elf_fixup(p, &pack) != 0)
goto free_pack_abort;
#ifdef MACHINE_STACK_GROWS_UP
setregs(p, &pack, (u_long)stack + slen, retval);
#else
setregs(p, &pack, (u_long)stack, retval);
#endif
/* map the process's signal trampoline code */
if (exec_sigcode_map(pr))
goto free_pack_abort;
#ifdef __HAVE_EXEC_MD_MAP
/* perform md specific mappings that process might need */
if (exec_md_map(p, &pack))
goto free_pack_abort;
#endif
if (pr->ps_flags & PS_TRACED)
psignal(p, SIGTRAP);
free(pack.ep_hdr, M_EXEC, pack.ep_hdrlen);
p->p_descfd = 255;
if ((pack.ep_flags & EXEC_HASFD) && pack.ep_fd < 255)
p->p_descfd = pack.ep_fd;
if (pack.ep_flags & EXEC_WXNEEDED)
atomic_setbits_int(&p->p_p->ps_flags, PS_WXNEEDED);
else
atomic_clearbits_int(&p->p_p->ps_flags, PS_WXNEEDED);
atomic_clearbits_int(&pr->ps_flags, PS_INEXEC);
single_thread_clear(p, P_SUSPSIG);
return (0);
bad:
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&pack.ep_vmcmds);
/* kill any opened file descriptor, if necessary */
if (pack.ep_flags & EXEC_HASFD) {
pack.ep_flags &= ~EXEC_HASFD;
fdplock(p->p_fd);
/* fdrelease unlocks p->p_fd. */
(void) fdrelease(p, pack.ep_fd);
}
if (pack.ep_interp != NULL)
pool_put(&namei_pool, pack.ep_interp);
free(pack.ep_args, M_TEMP, sizeof *pack.ep_args);
/* close and put the exec'd file */
vn_close(pack.ep_vp, FREAD, cred, p);
pool_put(&namei_pool, nid.ni_cnd.cn_pnbuf);
km_free(argp, NCARGS, &kv_exec, &kp_pageable);
freehdr:
free(pack.ep_hdr, M_EXEC, pack.ep_hdrlen);
atomic_clearbits_int(&pr->ps_flags, PS_INEXEC);
single_thread_clear(p, P_SUSPSIG);
return (error);
exec_abort:
/*
* the old process doesn't exist anymore. exit gracefully.
* get rid of the (new) address space we have created, if any, get rid
* of our namei data and vnode, and exit noting failure
*/
uvm_unmap(&vm->vm_map, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
if (pack.ep_interp != NULL)
pool_put(&namei_pool, pack.ep_interp);
free(pack.ep_args, M_TEMP, sizeof *pack.ep_args);
pool_put(&namei_pool, nid.ni_cnd.cn_pnbuf);
vn_close(pack.ep_vp, FREAD, cred, p);
km_free(argp, NCARGS, &kv_exec, &kp_pageable);
free_pack_abort:
free(pack.ep_hdr, M_EXEC, pack.ep_hdrlen);
exit1(p, 0, SIGABRT, EXIT_NORMAL);
/* NOTREACHED */
atomic_clearbits_int(&pr->ps_flags, PS_INEXEC);
return (0);
}
int
copyargs(struct exec_package *pack, struct ps_strings *arginfo, void *stack,
void *argp)
{
char **cpp = stack;
char *dp, *sp;
size_t len;
void *nullp = NULL;
long argc = arginfo->ps_nargvstr;
int envc = arginfo->ps_nenvstr;
if (copyout(&argc, cpp++, sizeof(argc)))
return (0);
dp = (char *) (cpp + argc + envc + 2 + ELF_AUX_WORDS);
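/* String data is placed just past the argv/envp pointer arrays, their NULL terminators and the ELF auxinfo slots. */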
sp = argp;
/* XXX don't copy them out, remap them! */
arginfo->ps_argvstr = cpp; /* remember location of argv for later */
for (; --argc >= 0; sp += len, dp += len)
if (copyout(&dp, cpp++, sizeof(dp)) ||
copyoutstr(sp, dp, ARG_MAX, &len))
return (0);
if (copyout(&nullp, cpp++, sizeof(nullp)))
return (0);
arginfo->ps_envstr = cpp; /* remember location of envp for later */
for (; --envc >= 0; sp += len, dp += len)
if (copyout(&dp, cpp++, sizeof(dp)) ||
copyoutstr(sp, dp, ARG_MAX, &len))
return (0);
if (copyout(&nullp, cpp++, sizeof(nullp)))
return (0);
/* if this process needs auxinfo, note where to place it */
if (pack->ep_args != NULL)
pack->ep_auxinfo = cpp;
return (1);
}
int
exec_sigcode_map(struct process *pr)
{
extern char sigcode[], esigcode[], sigcoderet[];
vsize_t sz;
sz = (vaddr_t)esigcode - (vaddr_t)sigcode;
/*
* If we don't have a sigobject yet, create one.
*
* sigobject is an anonymous memory object (just like SYSV shared
* memory) that we keep a permanent reference to and that we map
* in all processes that need this sigcode. The creation is simple,
* we create an object, add a permanent reference to it, map it in
* kernel space, copy out the sigcode to it and unmap it.
* Then we map it with PROT_READ|PROT_EXEC into the process just
* the way sys_mmap would map it.
*/
if (sigobject == NULL) {
extern int sigfillsiz;
extern u_char sigfill[];
size_t off, left;
vaddr_t va;
int r;
sigobject = uao_create(sz, 0);
uao_reference(sigobject); /* permanent reference */
if ((r = uvm_map(kernel_map, &va, round_page(sz), sigobject,
0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
MAP_INHERIT_SHARE, MADV_RANDOM, 0)))) {
uao_detach(sigobject);
return (ENOMEM);
}
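/* Fill the whole page with the sigfill pattern, then copy the real sigcode over the start. */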
for (off = 0, left = round_page(sz); left != 0;
off += sigfillsiz) {
size_t chunk = ulmin(left, sigfillsiz);
memcpy((caddr_t)va + off, sigfill, chunk);
left -= chunk;
}
memcpy((caddr_t)va, sigcode, sz);
uvm_unmap(kernel_map, va, va + round_page(sz));
}
pr->ps_sigcode = 0; /* no hint */
uao_reference(sigobject);
if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_sigcode, round_page(sz),
sigobject, 0, 0, UVM_MAPFLAG(PROT_READ | PROT_EXEC,
PROT_READ | PROT_WRITE | PROT_EXEC, MAP_INHERIT_COPY,
MADV_RANDOM, UVM_FLAG_COPYONW | UVM_FLAG_SYSCALL))) {
uao_detach(sigobject);
return (ENOMEM);
}
/* Calculate PC at point of sigreturn entry */
pr->ps_sigcoderet = pr->ps_sigcode + (sigcoderet - sigcode);
return (0);
}
int
exec_timekeep_map(struct process *pr)
{
size_t timekeep_sz = round_page(sizeof(struct timekeep));
/*
* Similar to the sigcode object
*/
if (timekeep_object == NULL) {
vaddr_t va = 0;
timekeep_object = uao_create(timekeep_sz, 0);
uao_reference(timekeep_object);
if (uvm_map(kernel_map, &va, timekeep_sz, timekeep_object,
0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
uao_detach(timekeep_object);
timekeep_object = NULL;
return (ENOMEM);
}
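/* Wire the kernel mapping so the shared timekeep page stays resident. */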
if (uvm_fault_wire(kernel_map, va, va + timekeep_sz,
PROT_READ | PROT_WRITE)) {
uvm_unmap(kernel_map, va, va + timekeep_sz);
uao_detach(timekeep_object);
timekeep_object = NULL;
return (ENOMEM);
}
timekeep = (struct timekeep *)va;
timekeep->tk_version = TK_VERSION;
}
pr->ps_timekeep = 0; /* no hint */
uao_reference(timekeep_object);
if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, timekeep_sz,
timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
uao_detach(timekeep_object);
return (ENOMEM);
}
return (0);
}
/* $OpenBSD: protosw.h,v 1.55 2022/09/05 14:56:09 bluhm Exp $ */
/* $NetBSD: protosw.h,v 1.10 1996/04/09 20:55:32 cgd Exp $ */
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)protosw.h 8.1 (Berkeley) 6/2/93
*/
/*
* Protocol switch table.
*
* Each protocol has a handle initializing one of these structures,
* which is used for protocol-protocol and system-protocol communication.
*
* A protocol is called through the pr_init entry before any other.
* Thereafter it is called every 200ms through the pr_fasttimo entry and
* every 500ms through the pr_slowtimo for timer based actions.
*
* Protocols pass data between themselves as chains of mbufs using
* the pr_input and pr_send hooks. Pr_input passes data up (towards
* UNIX) and pr_send passes it down (towards the imps); control
* information passes up and down on pr_ctlinput and pr_ctloutput.
* The protocol is responsible for the space occupied by any of the
* arguments to these entries and must dispose of it.
*
* The userreq routine interfaces protocols to the system and is
* described below.
*/
struct mbuf;
struct sockaddr;
struct socket;
struct domain;
struct proc;
struct stat;
struct ifnet;
struct pr_usrreqs {
int (*pru_attach)(struct socket *, int);
int (*pru_detach)(struct socket *);
void (*pru_lock)(struct socket *);
void (*pru_unlock)(struct socket *);
int (*pru_bind)(struct socket *, struct mbuf *, struct proc *);
int (*pru_listen)(struct socket *);
int (*pru_connect)(struct socket *, struct mbuf *);
int (*pru_accept)(struct socket *, struct mbuf *);
int (*pru_disconnect)(struct socket *);
int (*pru_shutdown)(struct socket *);
int (*pru_rcvd)(struct socket *);
int (*pru_send)(struct socket *, struct mbuf *, struct mbuf *,
struct mbuf *);
int (*pru_abort)(struct socket *);
int (*pru_control)(struct socket *, u_long, caddr_t,
struct ifnet *);
int (*pru_sense)(struct socket *, struct stat *);
int (*pru_rcvoob)(struct socket *, struct mbuf *, int);
int (*pru_sendoob)(struct socket *, struct mbuf *, struct mbuf *,
struct mbuf *);
int (*pru_sockaddr)(struct socket *, struct mbuf *);
int (*pru_peeraddr)(struct socket *, struct mbuf *);
int (*pru_connect2)(struct socket *, struct socket *);
};
struct protosw {
short pr_type; /* socket type used for */
const struct domain *pr_domain; /* domain protocol a member of */
short pr_protocol; /* protocol number */
short pr_flags; /* see below */
/* protocol-protocol hooks */
/* input to protocol (from below) */
int (*pr_input)(struct mbuf **, int *, int, int);
/* control input (from below) */
void (*pr_ctlinput)(int, struct sockaddr *, u_int, void *);
/* control output (from above) */
int (*pr_ctloutput)(int, struct socket *, int, int, struct mbuf *);
/* user-protocol hooks */
const struct pr_usrreqs *pr_usrreqs;
/* utility hooks */
void (*pr_init)(void); /* initialization hook */
void (*pr_fasttimo)(void); /* fast timeout (200ms) */
void (*pr_slowtimo)(void); /* slow timeout (500ms) */
/* sysctl for protocol */
int (*pr_sysctl)(int *, u_int, void *, size_t *, void *, size_t);
};
#define PR_SLOWHZ 2 /* 2 slow timeouts per second */
#define PR_FASTHZ 5 /* 5 fast timeouts per second */
/*
* Values for pr_flags.
* PR_ADDR requires PR_ATOMIC;
* PR_ADDR and PR_CONNREQUIRED are mutually exclusive.
*/
#define PR_ATOMIC 0x01 /* exchange atomic messages only */
#define PR_ADDR 0x02 /* addresses given with messages */
#define PR_CONNREQUIRED 0x04 /* connection required by protocol */
#define PR_WANTRCVD 0x08 /* want PRU_RCVD calls */
#define PR_RIGHTS 0x10 /* passes capabilities */
#define PR_ABRTACPTDIS 0x20 /* abort on accept(2) to disconnected
socket */
#define PR_SPLICE 0x40 /* socket splicing is possible */
/*
* The arguments to usrreq are:
* (*protosw[].pr_usrreq)(up, req, m, nam, opt);
* where up is a (struct socket *), req is one of these requests,
* m is an optional mbuf chain containing a message,
* nam is an optional mbuf chain containing an address,
* and opt is a pointer to a socketopt structure or nil.
* The protocol is responsible for disposal of the mbuf chain m,
* the caller is responsible for any space held by nam and opt.
* A non-zero return from usrreq gives a
* UNIX error number which should be passed to higher level software.
*/
#define PRU_ATTACH 0 /* attach protocol to up */
#define PRU_DETACH 1 /* detach protocol from up */
#define PRU_BIND 2 /* bind socket to address */
#define PRU_LISTEN 3 /* listen for connection */
#define PRU_CONNECT 4 /* establish connection to peer */
#define PRU_ACCEPT 5 /* accept connection from peer */
#define PRU_DISCONNECT 6 /* disconnect from peer */
#define PRU_SHUTDOWN 7 /* won't send any more data */
#define PRU_RCVD 8 /* have taken data; more room now */
#define PRU_SEND 9 /* send this data */
#define PRU_ABORT 10 /* abort (fast DISCONNECT, DETACH) */
#define PRU_CONTROL 11 /* control operations on protocol */
#define PRU_SENSE 12 /* return status into m */
#define PRU_RCVOOB 13 /* retrieve out of band data */
#define PRU_SENDOOB 14 /* send out of band data */
#define PRU_SOCKADDR 15 /* fetch socket's address */
#define PRU_PEERADDR 16 /* fetch peer's address */
#define PRU_CONNECT2 17 /* connect two sockets */
/* begin for protocols internal use */
#define PRU_FASTTIMO 18 /* 200ms timeout */
#define PRU_SLOWTIMO 19 /* 500ms timeout */
#define PRU_PROTORCV 20 /* receive from below */
#define PRU_PROTOSEND 21 /* send to below */
#define PRU_NREQ 22
#ifdef PRUREQUESTS
const char *prurequests[] = {
"ATTACH", "DETACH", "BIND", "LISTEN",
"CONNECT", "ACCEPT", "DISCONNECT", "SHUTDOWN",
"RCVD", "SEND", "ABORT", "CONTROL",
"SENSE", "RCVOOB", "SENDOOB", "SOCKADDR",
"PEERADDR", "CONNECT2", "FASTTIMO", "SLOWTIMO",
"PROTORCV", "PROTOSEND",
};
#endif
/*
* The arguments to the ctlinput routine are
* (*protosw[].pr_ctlinput)(cmd, sa, arg);
* where cmd is one of the commands below, sa is a pointer to a sockaddr,
* and arg is an optional caddr_t argument used within a protocol family.
*/
#define PRC_IFDOWN 0 /* interface transition */
#define PRC_ROUTEDEAD 1 /* select new route if possible ??? */
#define PRC_MTUINC 2 /* increase in mtu to host */
#define PRC_QUENCH2 3 /* DEC congestion bit says slow down */
#define PRC_QUENCH 4 /* some one said to slow down */
#define PRC_MSGSIZE 5 /* message size forced drop */
#define PRC_HOSTDEAD 6 /* host appears to be down */
#define PRC_HOSTUNREACH 7 /* deprecated (use PRC_UNREACH_HOST) */
#define PRC_UNREACH_NET 8 /* no route to network */
#define PRC_UNREACH_HOST 9 /* no route to host */
#define PRC_UNREACH_PROTOCOL 10 /* dst says bad protocol */
#define PRC_UNREACH_PORT 11 /* bad port # */
/* was PRC_UNREACH_NEEDFRAG 12 (use PRC_MSGSIZE) */
#define PRC_UNREACH_SRCFAIL 13 /* source route failed */
#define PRC_REDIRECT_NET 14 /* net routing redirect */
#define PRC_REDIRECT_HOST 15 /* host routing redirect */
#define PRC_REDIRECT_TOSNET 16 /* redirect for type of service & net */
#define PRC_REDIRECT_TOSHOST 17 /* redirect for tos & host */
#define PRC_TIMXCEED_INTRANS 18 /* packet lifetime expired in transit */
#define PRC_TIMXCEED_REASS 19 /* lifetime expired on reass q */
#define PRC_PARAMPROB 20 /* header incorrect */
#define PRC_NCMDS 21
#define PRC_IS_REDIRECT(cmd) \
((cmd) >= PRC_REDIRECT_NET && (cmd) <= PRC_REDIRECT_TOSHOST)
#ifdef PRCREQUESTS
char *prcrequests[] = {
"IFDOWN", "ROUTEDEAD", "MTUINC", "DEC-BIT-QUENCH2",
"QUENCH", "MSGSIZE", "HOSTDEAD", "#7",
"NET-UNREACH", "HOST-UNREACH", "PROTO-UNREACH", "PORT-UNREACH",
"#12", "SRCFAIL-UNREACH", "NET-REDIRECT", "HOST-REDIRECT",
"TOSNET-REDIRECT", "TOSHOST-REDIRECT", "TX-INTRANS", "TX-REASS",
"PARAMPROB"
};
#endif
/*
* The arguments to ctloutput are:
* (*protosw[].pr_ctloutput)(req, so, level, optname, optval);
* req is one of the actions listed below, so is a (struct socket *),
* level indicates which protocol layer the option is intended for,
* optname is a protocol dependent socket option request,
* optval is a pointer to a mbuf-chain pointer, for value-return results.
* The protocol is responsible for disposal of the mbuf chain *optval
* if supplied,
* the caller is responsible for any space held by *optval, when returned.
* A non-zero return from ctloutput gives a
* UNIX error number which should be passed to higher level software.
*/
#define PRCO_GETOPT 0
#define PRCO_SETOPT 1
#define PRCO_NCMDS 2
#ifdef PRCOREQUESTS
char *prcorequests[] = {
"GETOPT", "SETOPT",
};
#endif
#ifdef _KERNEL
#include <sys/mbuf.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
struct ifnet;
struct sockaddr;
const struct protosw *pffindproto(int, int, int);
const struct protosw *pffindtype(int, int);
void pfctlinput(int, struct sockaddr *);
extern u_char ip_protox[];
extern const struct protosw inetsw[];
#ifdef INET6
extern u_char ip6_protox[];
extern const struct protosw inet6sw[];
#endif /* INET6 */
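/*
* Inline wrappers that dispatch through the per-protocol pr_usrreqs table.
* Hooks a protocol may leave NULL return EOPNOTSUPP when missing, except
* pru_sense, which returns 0.
*/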
static inline int
pru_attach(struct socket *so, int proto)
{
return (*so->so_proto->pr_usrreqs->pru_attach)(so, proto);
}
static inline int
pru_detach(struct socket *so)
{
return (*so->so_proto->pr_usrreqs->pru_detach)(so);
}
static inline void
pru_lock(struct socket *so)
{
(*so->so_proto->pr_usrreqs->pru_lock)(so);
}
static inline void
pru_unlock(struct socket *so)
{
(*so->so_proto->pr_usrreqs->pru_unlock)(so);
}
static inline int
pru_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
if (so->so_proto->pr_usrreqs->pru_bind)
return (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
return (EOPNOTSUPP);
}
static inline int
pru_listen(struct socket *so)
{
if (so->so_proto->pr_usrreqs->pru_listen)
return (*so->so_proto->pr_usrreqs->pru_listen)(so);
return (EOPNOTSUPP);
}
static inline int
pru_connect(struct socket *so, struct mbuf *nam)
{
if (so->so_proto->pr_usrreqs->pru_connect)
return (*so->so_proto->pr_usrreqs->pru_connect)(so, nam);
return (EOPNOTSUPP);
}
static inline int
pru_accept(struct socket *so, struct mbuf *nam)
{
if (so->so_proto->pr_usrreqs->pru_accept)
return (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
return (EOPNOTSUPP);
}
static inline int
pru_disconnect(struct socket *so)
{
if (so->so_proto->pr_usrreqs->pru_disconnect)
return (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
return (EOPNOTSUPP);
}
static inline int
pru_shutdown(struct socket *so)
{
return (*so->so_proto->pr_usrreqs->pru_shutdown)(so);
}
static inline int
pru_rcvd(struct socket *so)
{
if (so->so_proto->pr_usrreqs->pru_rcvd)
return (*so->so_proto->pr_usrreqs->pru_rcvd)(so);
return (EOPNOTSUPP);
}
static inline int
pru_send(struct socket *so, struct mbuf *top, struct mbuf *addr,
struct mbuf *control)
{
return (*so->so_proto->pr_usrreqs->pru_send)(so, top, addr, control);
}
static inline int
pru_abort(struct socket *so)
{
return (*so->so_proto->pr_usrreqs->pru_abort)(so);
}
static inline int
pru_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp)
{
if (so->so_proto->pr_usrreqs->pru_control)
return (*so->so_proto->pr_usrreqs->pru_control)(so,
cmd, data, ifp);
return (EOPNOTSUPP);
}
static inline int
pru_sense(struct socket *so, struct stat *ub)
{
if (so->so_proto->pr_usrreqs->pru_sense)
return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
return (0);
}
static inline int
pru_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
if (so->so_proto->pr_usrreqs->pru_rcvoob)
return (*so->so_proto->pr_usrreqs->pru_rcvoob)(so, m, flags);
return (EOPNOTSUPP);
}
static inline int
pru_sendoob(struct socket *so, struct mbuf *top, struct mbuf *addr,
struct mbuf *control)
{
if (so->so_proto->pr_usrreqs->pru_sendoob)
return (*so->so_proto->pr_usrreqs->pru_sendoob)(so,
top, addr, control);
m_freem(top);
m_freem(control);
return (EOPNOTSUPP);
}
static inline int
pru_sockaddr(struct socket *so, struct mbuf *addr)
{
return (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, addr);
}
static inline int
pru_peeraddr(struct socket *so, struct mbuf *addr)
{
return (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, addr);
}
static inline int
pru_connect2(struct socket *so1, struct socket *so2)
{
if (so1->so_proto->pr_usrreqs->pru_connect2)
return (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
return (EOPNOTSUPP);
}
#endif
/* $OpenBSD: ffs_softdep.c,v 1.150 2021/04/28 09:53:53 claudio Exp $ */
/*
* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
*
* The soft updates code is derived from the appendix of a University
* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
* "Soft Updates: A Solution to the Metadata Update Problem in File
* Systems", CSE-TR-254-95, August 1995).
*
* Further information about soft updates can be obtained from:
*
* Marshall Kirk McKusick http://www.mckusick.com/softdep/
* 1614 Oxford Street mckusick@mckusick.com
* Berkeley, CA 94709-1608 +1-510-843-9542
* USA
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
* $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.86 2001/02/04 16:08:18 phk Exp $
*/
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <crypto/siphash.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>
#define STATIC
/*
* Mapping of dependency structure types to malloc types.
*/
#define D_PAGEDEP 0
#define D_INODEDEP 1
#define D_NEWBLK 2
#define D_BMSAFEMAP 3
#define D_ALLOCDIRECT 4
#define D_INDIRDEP 5
#define D_ALLOCINDIR 6
#define D_FREEFRAG 7
#define D_FREEBLKS 8
#define D_FREEFILE 9
#define D_DIRADD 10
#define D_MKDIR 11
#define D_DIRREM 12
#define D_NEWDIRBLK 13
#define D_LAST 13
/*
* Names of softdep types.
*/
const char *softdep_typenames[] = {
"pagedep",
"inodedep",
"newblk",
"bmsafemap",
"allocdirect",
"indirdep",
"allocindir",
"freefrag",
"freeblks",
"freefile",
"diradd",
"mkdir",
"dirrem",
"newdirblk",
};
#define TYPENAME(type) \
((unsigned)(type) <= D_LAST ? softdep_typenames[type] : "???")
/*
* Finding the current process.
*/
#define CURPROC curproc
/*
* End system adaptation definitions.
*/
/*
* Internal function prototypes.
*/
STATIC void softdep_error(char *, int);
STATIC void drain_output(struct vnode *, int);
STATIC int getdirtybuf(struct buf *, int);
STATIC void clear_remove(struct proc *);
STATIC void clear_inodedeps(struct proc *);
STATIC int flush_pagedep_deps(struct vnode *, struct mount *,
struct diraddhd *);
STATIC int flush_inodedep_deps(struct fs *, ufsino_t);
STATIC int handle_written_filepage(struct pagedep *, struct buf *);
STATIC void diradd_inode_written(struct diradd *, struct inodedep *);
STATIC int handle_written_inodeblock(struct inodedep *, struct buf *);
STATIC void handle_allocdirect_partdone(struct allocdirect *);
STATIC void handle_allocindir_partdone(struct allocindir *);
STATIC void initiate_write_filepage(struct pagedep *, struct buf *);
STATIC void handle_written_mkdir(struct mkdir *, int);
STATIC void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
#ifdef FFS2
STATIC void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
#endif
STATIC void handle_workitem_freefile(struct freefile *);
STATIC void handle_workitem_remove(struct dirrem *);
STATIC struct dirrem *newdirrem(struct buf *, struct inode *,
struct inode *, int, struct dirrem **);
STATIC void free_diradd(struct diradd *);
STATIC void free_allocindir(struct allocindir *, struct inodedep *);
STATIC void free_newdirblk(struct newdirblk *);
STATIC int indir_trunc(struct inode *, daddr_t, int, daddr_t, long *);
STATIC void deallocate_dependencies(struct buf *, struct inodedep *);
STATIC void free_allocdirect(struct allocdirectlst *,
struct allocdirect *, int);
STATIC int check_inode_unwritten(struct inodedep *);
STATIC int free_inodedep(struct inodedep *);
STATIC void handle_workitem_freeblocks(struct freeblks *);
STATIC void merge_inode_lists(struct inodedep *);
STATIC void setup_allocindir_phase2(struct buf *, struct inode *,
struct allocindir *);
STATIC struct allocindir *newallocindir(struct inode *, int, daddr_t,
daddr_t);
STATIC void handle_workitem_freefrag(struct freefrag *);
STATIC struct freefrag *newfreefrag(struct inode *, daddr_t, long);
STATIC void allocdirect_merge(struct allocdirectlst *,
struct allocdirect *, struct allocdirect *);
STATIC struct bmsafemap *bmsafemap_lookup(struct buf *);
STATIC int newblk_lookup(struct fs *, daddr_t, int,
struct newblk **);
STATIC int inodedep_lookup(struct fs *, ufsino_t, int, struct inodedep **);
STATIC int pagedep_lookup(struct inode *, daddr_t, int, struct pagedep **);
STATIC void pause_timer(void *);
STATIC int request_cleanup(int, int);
STATIC int process_worklist_item(struct mount *, int *, int);
STATIC void add_to_worklist(struct worklist *);
/*
* Exported softdep operations.
*/
void softdep_disk_io_initiation(struct buf *);
void softdep_disk_write_complete(struct buf *);
void softdep_deallocate_dependencies(struct buf *);
void softdep_move_dependencies(struct buf *, struct buf *);
int softdep_count_dependencies(struct buf *bp, int, int);
/*
* Locking primitives.
*
* For a uniprocessor, all we need to do is protect against disk
* interrupts. For a multiprocessor, this lock would have to be
* a mutex. A single mutex is used throughout this file, though
* finer grain locking could be used if contention warranted it.
*
* For a multiprocessor, the sleep call would accept a lock and
* release it after the sleep processing was complete. In a uniprocessor
* implementation there is no such interlock, so we simply mark
* the places where it needs to be done with the `interlocked' form
* of the lock calls. Since the uniprocessor sleep already interlocks
* the spl, there is nothing that really needs to be done.
*/
#ifndef /* NOT */ DEBUG
STATIC struct lockit {
int lkt_spl;
} lk = { 0 };
#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
#define FREE_LOCK(lk) splx((lk)->lkt_spl)
#define ACQUIRE_LOCK_INTERLOCKED(lk,s) (lk)->lkt_spl = (s)
#define FREE_LOCK_INTERLOCKED(lk) ((lk)->lkt_spl)
#else /* DEBUG */
STATIC struct lockit {
int lkt_spl;
pid_t lkt_held;
int lkt_line;
} lk = { 0, -1 };
STATIC int lockcnt;
STATIC void acquire_lock(struct lockit *, int);
STATIC void free_lock(struct lockit *, int);
STATIC void acquire_lock_interlocked(struct lockit *, int, int);
STATIC int free_lock_interlocked(struct lockit *, int);
#define ACQUIRE_LOCK(lk) acquire_lock(lk, __LINE__)
#define FREE_LOCK(lk) free_lock(lk, __LINE__)
#define ACQUIRE_LOCK_INTERLOCKED(lk,s) acquire_lock_interlocked(lk, (s), __LINE__)
#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk, __LINE__)
STATIC void
acquire_lock(struct lockit *lk, int line)
{
pid_t holder;
int original_line;
if (lk->lkt_held != -1) {
holder = lk->lkt_held;
original_line = lk->lkt_line;
FREE_LOCK(lk);
if (holder == CURPROC->p_tid)
panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
else
panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
}
lk->lkt_spl = splbio();
lk->lkt_held = CURPROC->p_tid;
lk->lkt_line = line;
lockcnt++;
}
STATIC void
free_lock(struct lockit *lk, int line)
{
if (lk->lkt_held == -1)
panic("softdep_unlock: lock not held at line %d", line);
lk->lkt_held = -1;
splx(lk->lkt_spl);
}
STATIC void
acquire_lock_interlocked(struct lockit *lk, int s, int line)
{
pid_t holder;
int original_line;
if (lk->lkt_held != -1) {
holder = lk->lkt_held;
original_line = lk->lkt_line;
FREE_LOCK_INTERLOCKED(lk);
if (holder == CURPROC->p_tid)
panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
else
panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
}
lk->lkt_held = CURPROC->p_tid;
lk->lkt_line = line;
lk->lkt_spl = s;
lockcnt++;
}
STATIC int
free_lock_interlocked(struct lockit *lk, int line)
{
if (lk->lkt_held == -1)
panic("softdep_unlock_interlocked: lock not held at line %d", line);
lk->lkt_held = -1;
return (lk->lkt_spl);
}
#endif /* DEBUG */
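/*
 * Illustrative usage sketch (added comment, not part of the original
 * code): callers bracket dependency-list manipulation with the macros
 * above and drop the lock around anything that may sleep or panic, as
 * softdep_freequeue_process() and the lookup routines below do:
 *
 *	ACQUIRE_LOCK(&lk);
 *	... examine or modify dependency lists at splbio ...
 *	FREE_LOCK(&lk);
 *	... sleeping work such as pool_get(), bread() or tsleep_nsec() ...
 *	ACQUIRE_LOCK(&lk);
 *
 * The _INTERLOCKED forms hand the saved spl to a sleep/wakeup pair
 * instead of releasing and re-raising it, as sema_get() does below.
 */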
/*
* Place holder for real semaphores.
*/
struct sema {
int value;
pid_t holder;
char *name;
int prio;
};
STATIC void sema_init(struct sema *, char *, int);
STATIC int sema_get(struct sema *, struct lockit *);
STATIC void sema_release(struct sema *);
STATIC void
sema_init(struct sema *semap, char *name, int prio)
{
semap->holder = -1;
semap->value = 0;
semap->name = name;
semap->prio = prio;
}
STATIC int
sema_get(struct sema *semap, struct lockit *interlock)
{
int s;
if (semap->value++ > 0) {
if (interlock != NULL)
s = FREE_LOCK_INTERLOCKED(interlock);
tsleep_nsec(semap, semap->prio, semap->name, INFSLP);
if (interlock != NULL) {
ACQUIRE_LOCK_INTERLOCKED(interlock, s);
FREE_LOCK(interlock);
}
return (0);
}
semap->holder = CURPROC->p_tid;
if (interlock != NULL)
FREE_LOCK(interlock);
return (1);
}
STATIC void
sema_release(struct sema *semap)
{
if (semap->value <= 0 || semap->holder != CURPROC->p_tid) {
#ifdef DEBUG
if (lk.lkt_held != -1)
FREE_LOCK(&lk);
#endif
panic("sema_release: not held");
}
if (--semap->value > 0) {
semap->value = 0;
wakeup(semap);
}
semap->holder = -1;
}
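/*
 * Usage sketch (illustrative only): the lookup routines below use these
 * placeholder semaphores to single-thread allocation of a new hash entry
 * while the allocator sleeps in pool_get(); a loser of the race retries
 * the lookup, as in pagedep_lookup():
 *
 *	if (sema_get(&pagedep_in_progress, &lk) == 0) {
 *		ACQUIRE_LOCK(&lk);
 *		goto top;	(re-run the hash lookup)
 *	}
 *	... allocate and initialize the new entry ...
 *	ACQUIRE_LOCK(&lk);
 *	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 *	sema_release(&pagedep_in_progress);
 */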
/*
* Memory management.
*/
STATIC struct pool pagedep_pool;
STATIC struct pool inodedep_pool;
STATIC struct pool newblk_pool;
STATIC struct pool bmsafemap_pool;
STATIC struct pool allocdirect_pool;
STATIC struct pool indirdep_pool;
STATIC struct pool allocindir_pool;
STATIC struct pool freefrag_pool;
STATIC struct pool freeblks_pool;
STATIC struct pool freefile_pool;
STATIC struct pool diradd_pool;
STATIC struct pool mkdir_pool;
STATIC struct pool dirrem_pool;
STATIC struct pool newdirblk_pool;
static __inline void
softdep_free(struct worklist *item, int type)
{
switch (type) {
case D_PAGEDEP:
pool_put(&pagedep_pool, item);
break;
case D_INODEDEP:
pool_put(&inodedep_pool, item);
break;
case D_BMSAFEMAP:
pool_put(&bmsafemap_pool, item);
break;
case D_ALLOCDIRECT:
pool_put(&allocdirect_pool, item);
break;
case D_INDIRDEP:
pool_put(&indirdep_pool, item);
break;
case D_ALLOCINDIR:
pool_put(&allocindir_pool, item);
break;
case D_FREEFRAG:
pool_put(&freefrag_pool, item);
break;
case D_FREEBLKS:
pool_put(&freeblks_pool, item);
break;
case D_FREEFILE:
pool_put(&freefile_pool, item);
break;
case D_DIRADD:
pool_put(&diradd_pool, item);
break;
case D_MKDIR:
pool_put(&mkdir_pool, item);
break;
case D_DIRREM:
pool_put(&dirrem_pool, item);
break;
case D_NEWDIRBLK:
pool_put(&newdirblk_pool, item);
break;
default:
#ifdef DEBUG
if (lk.lkt_held != -1)
FREE_LOCK(&lk);
#endif
panic("softdep_free: unknown type %d", type);
}
}
struct workhead softdep_freequeue;
static __inline void
softdep_freequeue_add(struct worklist *item)
{
int s;
s = splbio();
LIST_INSERT_HEAD(&softdep_freequeue, item, wk_list);
splx(s);
}
static __inline void
softdep_freequeue_process(void)
{
struct worklist *wk;
splassert(IPL_BIO);
while ((wk = LIST_FIRST(&softdep_freequeue)) != NULL) {
LIST_REMOVE(wk, wk_list);
FREE_LOCK(&lk);
softdep_free(wk, wk->wk_type);
ACQUIRE_LOCK(&lk);
}
}
/*
* Worklist queue management.
* These routines require that the lock be held.
*/
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do { \
(item)->wk_state |= ONWORKLIST; \
LIST_INSERT_HEAD(head, item, wk_list); \
} while (0)
#define WORKLIST_REMOVE(item) do { \
(item)->wk_state &= ~ONWORKLIST; \
LIST_REMOVE(item, wk_list); \
} while (0)
#define WORKITEM_FREE(item, type) softdep_freequeue_add((struct worklist *)item)
#else /* DEBUG */
STATIC void worklist_insert(struct workhead *, struct worklist *);
STATIC void worklist_remove(struct worklist *);
STATIC void workitem_free(struct worklist *);
#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item)
STATIC void
worklist_insert(struct workhead *head, struct worklist *item)
{
if (lk.lkt_held == -1)
panic("worklist_insert: lock not held");
if (item->wk_state & ONWORKLIST) {
FREE_LOCK(&lk);
panic("worklist_insert: already on list");
}
item->wk_state |= ONWORKLIST;
LIST_INSERT_HEAD(head, item, wk_list);
}
STATIC void
worklist_remove(struct worklist *item)
{
if (lk.lkt_held == -1)
panic("worklist_remove: lock not held");
if ((item->wk_state & ONWORKLIST) == 0) {
FREE_LOCK(&lk);
panic("worklist_remove: not on list");
}
item->wk_state &= ~ONWORKLIST;
LIST_REMOVE(item, wk_list);
}
STATIC void
workitem_free(struct worklist *item)
{
if (item->wk_state & ONWORKLIST) {
if (lk.lkt_held != -1)
FREE_LOCK(&lk);
panic("workitem_free: still on list");
}
softdep_freequeue_add(item);
}
#endif /* DEBUG */
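/*
 * Sketch of the work item life cycle implied by the macros above (added
 * comment, not original): a dependency is attached to a buffer or queue
 * with WORKLIST_INSERT(), detached with WORKLIST_REMOVE() once it has
 * been satisfied, and finally handed to WORKITEM_FREE(), which defers
 * the actual pool_put() to softdep_freequeue_process() so the free can
 * run with the softdep lock released:
 *
 *	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 *	...
 *	WORKLIST_REMOVE(&adp->ad_list);
 *	WORKITEM_FREE(adp, D_ALLOCDIRECT);
 */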
/*
* Workitem queue management
*/
STATIC struct workhead softdep_workitem_pending;
STATIC struct worklist *worklist_tail;
STATIC int num_on_worklist; /* number of worklist items to be processed */
STATIC int softdep_worklist_busy; /* 1 => trying to do unmount */
STATIC int softdep_worklist_req; /* serialized waiters */
STATIC int max_softdeps; /* maximum number of structs before slowdown */
STATIC int tickdelay = 2; /* number of ticks to pause during slowdown */
STATIC int proc_waiting; /* tracks whether we have a timeout posted */
STATIC int *stat_countp; /* statistic to count in proc_waiting timeout */
STATIC struct timeout proc_waiting_timeout;
STATIC struct proc *filesys_syncer; /* proc of filesystem syncer process */
STATIC int req_clear_inodedeps; /* syncer process flush some inodedeps */
#define FLUSH_INODES 1
STATIC int req_clear_remove; /* syncer process flush some freeblks */
#define FLUSH_REMOVE 2
/*
* runtime statistics
*/
STATIC int stat_worklist_push; /* number of worklist cleanups */
STATIC int stat_blk_limit_push; /* number of times block limit neared */
STATIC int stat_ino_limit_push; /* number of times inode limit neared */
STATIC int stat_blk_limit_hit; /* number of times block slowdown imposed */
STATIC int stat_ino_limit_hit; /* number of times inode slowdown imposed */
STATIC int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
STATIC int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
STATIC int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
STATIC int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
STATIC int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
/*
* Add an item to the end of the work queue.
* This routine requires that the lock be held.
* This is the only routine that adds items to the list.
* The following routine is the only one that removes items
* and does so in order from first to last.
*/
STATIC void
add_to_worklist(struct worklist *wk)
{
if (wk->wk_state & ONWORKLIST) {
#ifdef DEBUG
if (lk.lkt_held != -1)
FREE_LOCK(&lk);
#endif
panic("add_to_worklist: already on list");
}
wk->wk_state |= ONWORKLIST;
if (LIST_FIRST(&softdep_workitem_pending) == NULL)
LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
else
LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
worklist_tail = wk;
num_on_worklist += 1;
}
/*
* Process that runs once per second to handle items in the background queue.
*
* Note that we ensure that everything is done in the order in which they
* appear in the queue. The code below depends on this property to ensure
* that blocks of a file are freed before the inode itself is freed. This
* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
* until all the old ones have been purged from the dependency lists.
*/
int
softdep_process_worklist(struct mount *matchmnt)
{
struct proc *p = CURPROC;
int matchcnt, loopcount;
struct timeval starttime;
/*
* First process any items on the delayed-free queue.
*/
ACQUIRE_LOCK(&lk);
softdep_freequeue_process();
FREE_LOCK(&lk);
/*
* Record the process identifier of our caller so that we can give
* this process preferential treatment in request_cleanup below.
* We can't do this in softdep_initialize, because the syncer doesn't
* have to run then.
* NOTE! This function _could_ be called with a curproc != syncerproc.
*/
filesys_syncer = syncerproc;
matchcnt = 0;
/*
* There is no danger of having multiple processes run this
* code, but we have to single-thread it when softdep_flushfiles()
* is in operation to get an accurate count of the number of items
* related to its mount point that are in the list.
*/
if (matchmnt == NULL) {
if (softdep_worklist_busy < 0)
return(-1);
softdep_worklist_busy += 1;
}
/*
* If requested, try removing inode or removal dependencies.
*/
if (req_clear_inodedeps) {
clear_inodedeps(p);
req_clear_inodedeps -= 1;
wakeup_one(&proc_waiting);
}
if (req_clear_remove) {
clear_remove(p);
req_clear_remove -= 1;
wakeup_one(&proc_waiting);
}
loopcount = 1;
getmicrouptime(&starttime);
while (num_on_worklist > 0) {
if (process_worklist_item(matchmnt, &matchcnt, LK_NOWAIT) == 0)
break;
/*
* If a umount operation wants to run the worklist
* accurately, abort.
*/
if (softdep_worklist_req && matchmnt == NULL) {
matchcnt = -1;
break;
}
/*
* If requested, try removing inode or removal dependencies.
*/
if (req_clear_inodedeps) {
clear_inodedeps(p);
req_clear_inodedeps -= 1;
wakeup_one(&proc_waiting);
}
if (req_clear_remove) {
clear_remove(p);
req_clear_remove -= 1;
wakeup_one(&proc_waiting);
}
/*
* We do not generally want to stop for buffer space, but if
* we are really being a buffer hog, we will stop and wait.
*/
#if 0
if (loopcount++ % 128 == 0)
bwillwrite();
#endif
/*
* Never allow processing to run for more than one
* second. Otherwise the other syncer tasks may get
* excessively backlogged.
*/
{
struct timeval diff;
struct timeval tv;
getmicrouptime(&tv);
timersub(&tv, &starttime, &diff);
if (diff.tv_sec != 0 && matchmnt == NULL) {
matchcnt = -1;
break;
}
}
/*
* Process any new items on the delayed-free queue.
*/
ACQUIRE_LOCK(&lk);
softdep_freequeue_process();
FREE_LOCK(&lk);
}
if (matchmnt == NULL) {
softdep_worklist_busy -= 1;
if (softdep_worklist_req && softdep_worklist_busy == 0)
wakeup(&softdep_worklist_req);
}
return (matchcnt);
}
/*
* Process one item on the worklist.
*/
STATIC int
process_worklist_item(struct mount *matchmnt, int *matchcnt, int flags)
{
struct worklist *wk, *wkend;
struct dirrem *dirrem;
struct mount *mp;
struct vnode *vp;
ACQUIRE_LOCK(&lk);
/*
* Normally we just process each item on the worklist in order.
* However, if we are in a situation where we cannot lock any
* inodes, we have to skip over any dirrem requests whose
* vnodes are resident and locked.
*/
LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
break;
dirrem = WK_DIRREM(wk);
vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
dirrem->dm_oldinum);
if (vp == NULL || !VOP_ISLOCKED(vp))
break;
}
if (wk == NULL) {
FREE_LOCK(&lk);
return (0);
}
/*
* Remove the item to be processed. If we are removing the last
* item on the list, we need to recalculate the tail pointer.
* As this happens rarely and usually when the list is short,
* we just run down the list to find it rather than tracking it
* in the above loop.
*/
WORKLIST_REMOVE(wk);
if (wk == worklist_tail) {
LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
if (LIST_NEXT(wkend, wk_list) == NULL)
break;
worklist_tail = wkend;
}
num_on_worklist -= 1;
FREE_LOCK(&lk);
switch (wk->wk_type) {
case D_DIRREM:
/* removal of a directory entry */
mp = WK_DIRREM(wk)->dm_mnt;
#if 0
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: dirrem on suspended filesystem",
"process_worklist_item");
#endif
if (matchmnt != NULL && mp == matchmnt)
*matchcnt += 1;
handle_workitem_remove(WK_DIRREM(wk));
break;
case D_FREEBLKS:
/* releasing blocks and/or fragments from a file */
mp = WK_FREEBLKS(wk)->fb_mnt;
#if 0
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freeblks on suspended filesystem",
"process_worklist_item");
#endif
if (matchmnt != NULL && mp == matchmnt)
*matchcnt += 1;
handle_workitem_freeblocks(WK_FREEBLKS(wk));
break;
case D_FREEFRAG:
/* releasing a fragment when replaced as a file grows */
mp = WK_FREEFRAG(wk)->ff_mnt;
#if 0
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freefrag on suspended filesystem",
"process_worklist_item");
#endif
if (matchmnt != NULL && mp == matchmnt)
*matchcnt += 1;
handle_workitem_freefrag(WK_FREEFRAG(wk));
break;
case D_FREEFILE:
/* releasing an inode when its link count drops to 0 */
mp = WK_FREEFILE(wk)->fx_mnt;
#if 0
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freefile on suspended filesystem",
"process_worklist_item");
#endif
if (matchmnt != NULL && mp == matchmnt)
*matchcnt += 1;
handle_workitem_freefile(WK_FREEFILE(wk));
break;
default:
panic("%s_process_worklist: Unknown type %s",
"softdep", TYPENAME(wk->wk_type));
/* NOTREACHED */
}
return (1);
}
/*
* Move dependencies from one buffer to another.
*/
void
softdep_move_dependencies(struct buf *oldbp, struct buf *newbp)
{
struct worklist *wk, *wktail;
if (LIST_FIRST(&newbp->b_dep) != NULL)
panic("softdep_move_dependencies: need merge code");
wktail = NULL;
ACQUIRE_LOCK(&lk);
while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
LIST_REMOVE(wk, wk_list);
if (wktail == NULL)
LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
else
LIST_INSERT_AFTER(wktail, wk, wk_list);
wktail = wk;
}
FREE_LOCK(&lk);
}
/*
* Purge the work list of all items associated with a particular mount point.
*/
int
softdep_flushworklist(struct mount *oldmnt, int *countp, struct proc *p)
{
struct vnode *devvp;
int count, error = 0;
/*
* Await our turn to clear out the queue, then serialize access.
*/
while (softdep_worklist_busy) {
softdep_worklist_req += 1;
tsleep_nsec(&softdep_worklist_req, PRIBIO, "softflush", INFSLP);
softdep_worklist_req -= 1;
}
softdep_worklist_busy = -1;
/*
* Alternately flush the block device associated with the mount
* point and process any dependencies that the flushing
* creates. We continue until no more worklist dependencies
* are found.
*/
*countp = 0;
devvp = VFSTOUFS(oldmnt)->um_devvp;
while ((count = softdep_process_worklist(oldmnt)) > 0) {
*countp += count;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
VOP_UNLOCK(devvp);
if (error)
break;
}
softdep_worklist_busy = 0;
if (softdep_worklist_req)
wakeup(&softdep_worklist_req);
return (error);
}
/*
* Flush all vnodes and worklist items associated with a specified mount point.
*/
int
softdep_flushfiles(struct mount *oldmnt, int flags, struct proc *p)
{
int error, count, loopcnt;
/*
* Alternately flush the vnodes associated with the mount
* point and process any dependencies that the flushing
* creates. In theory, this loop can happen at most twice,
* but we give it a few extra just to be sure.
*/
for (loopcnt = 10; loopcnt > 0; loopcnt--) {
/*
* Do another flush in case any vnodes were brought in
* as part of the cleanup operations.
*/
if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
break;
if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
count == 0)
break;
}
/*
* If the reboot process sleeps during the loop, the update
* process may call softdep_process_worklist() and create
* new dirty vnodes at the mount point. Call ffs_flushfiles()
* again after the loop has flushed all soft dependencies.
*/
if (error == 0)
error = ffs_flushfiles(oldmnt, flags, p);
/*
* If we are unmounting then it is an error to fail. If we
* are simply trying to downgrade to read-only, then filesystem
* activity can keep us busy forever, so we just fail with EBUSY.
*/
if (loopcnt == 0) {
error = EBUSY;
}
return (error);
}
/*
* Structure hashing.
*
* There are three types of structures that can be looked up:
* 1) pagedep structures identified by mount point, inode number,
* and logical block.
* 2) inodedep structures identified by mount point and inode number.
* 3) newblk structures identified by mount point and
* physical block number.
*
* The "pagedep" and "inodedep" dependency structures are hashed
* separately from the file blocks and inodes to which they correspond.
* This separation helps when the in-memory copy of an inode or
* file block must be replaced. It also obviates the need to access
* an inode or file page when simply updating (or de-allocating)
* dependency structures. Lookup of newblk structures is needed to
* find newly allocated blocks when trying to associate them with
* their allocdirect or allocindir structure.
*
* The lookup routines optionally create and hash a new instance when
* an existing entry is not found.
*/
#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
#define NODELAY 0x0002 /* cannot do background work */
SIPHASH_KEY softdep_hashkey;
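/*
 * Typical lookup-or-create call pattern (illustrative sketch, not part
 * of the original code), as used by the setup routines later in this
 * file:
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY,
 *	    &inodedep) != 0)
 *		... an entry already existed ...
 *	else
 *		... a fresh entry was created and hashed ...
 *	FREE_LOCK(&lk);
 *
 * DEPALLOC asks the lookup to create the entry if it is missing;
 * NODELAY keeps the lookup from taking the request_cleanup() slowdown
 * path when the softdep limit has been reached.
 */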
/*
* Structures and routines associated with pagedep caching.
*/
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long pagedep_hash; /* size of hash table - 1 */
STATIC struct sema pagedep_in_progress;
/*
* Look up a pagedep. Return 1 if found. Return 0 if not found, or if
* found when asked to allocate but not yet associated with any buffer.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in pagedeppp.
* This routine must be called with splbio interrupts blocked.
*/
STATIC int
pagedep_lookup(struct inode *ip, daddr_t lbn, int flags,
struct pagedep **pagedeppp)
{
SIPHASH_CTX ctx;
struct pagedep *pagedep;
struct pagedep_hashhead *pagedephd;
struct mount *mp;
int i;
splassert(IPL_BIO);
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("pagedep_lookup: lock not held");
#endif
mp = ITOV(ip)->v_mount;
SipHash24_Init(&ctx, &softdep_hashkey);
SipHash24_Update(&ctx, &mp, sizeof(mp));
SipHash24_Update(&ctx, &ip->i_number, sizeof(ip->i_number));
SipHash24_Update(&ctx, &lbn, sizeof(lbn));
pagedephd = &pagedep_hashtbl[SipHash24_End(&ctx) & pagedep_hash];
top:
LIST_FOREACH(pagedep, pagedephd, pd_hash)
if (ip->i_number == pagedep->pd_ino &&
lbn == pagedep->pd_lbn &&
mp == pagedep->pd_mnt)
break;
if (pagedep) {
*pagedeppp = pagedep;
if ((flags & DEPALLOC) != 0 &&
(pagedep->pd_state & ONWORKLIST) == 0)
return (0);
return (1);
}
if ((flags & DEPALLOC) == 0) {
*pagedeppp = NULL;
return (0);
}
if (sema_get(&pagedep_in_progress, &lk) == 0) {
ACQUIRE_LOCK(&lk);
goto top;
}
pagedep = pool_get(&pagedep_pool, PR_WAITOK | PR_ZERO);
pagedep->pd_list.wk_type = D_PAGEDEP;
pagedep->pd_mnt = mp;
pagedep->pd_ino = ip->i_number;
pagedep->pd_lbn = lbn;
LIST_INIT(&pagedep->pd_dirremhd);
LIST_INIT(&pagedep->pd_pendinghd);
for (i = 0; i < DAHASHSZ; i++)
LIST_INIT(&pagedep->pd_diraddhd[i]);
ACQUIRE_LOCK(&lk);
LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
sema_release(&pagedep_in_progress);
*pagedeppp = pagedep;
return (0);
}
/*
* Structures and routines associated with inodedep caching.
*/
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
STATIC u_long inodedep_hash; /* size of hash table - 1 */
STATIC long num_inodedep; /* number of inodedep allocated */
STATIC struct sema inodedep_in_progress;
/*
* Look up an inodedep. Return 1 if found, 0 if not found.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in inodedeppp.
* This routine must be called with splbio interrupts blocked.
*/
STATIC int
inodedep_lookup(struct fs *fs, ufsino_t inum, int flags,
struct inodedep **inodedeppp)
{
SIPHASH_CTX ctx;
struct inodedep *inodedep;
struct inodedep_hashhead *inodedephd;
int firsttry;
splassert(IPL_BIO);
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("inodedep_lookup: lock not held");
#endif
firsttry = 1;
SipHash24_Init(&ctx, &softdep_hashkey);
SipHash24_Update(&ctx, &fs, sizeof(fs));
SipHash24_Update(&ctx, &inum, sizeof(inum));
inodedephd = &inodedep_hashtbl[SipHash24_End(&ctx) & inodedep_hash];
top:
LIST_FOREACH(inodedep, inodedephd, id_hash)
if (inum == inodedep->id_ino && fs == inodedep->id_fs)
break;
if (inodedep) {
*inodedeppp = inodedep;
return (1);
}
if ((flags & DEPALLOC) == 0) {
*inodedeppp = NULL;
return (0);
}
/*
* If we are over our limit, try to improve the situation.
*/
if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
request_cleanup(FLUSH_INODES, 1)) {
firsttry = 0;
goto top;
}
if (sema_get(&inodedep_in_progress, &lk) == 0) {
ACQUIRE_LOCK(&lk);
goto top;
}
num_inodedep += 1;
inodedep = pool_get(&inodedep_pool, PR_WAITOK);
inodedep->id_list.wk_type = D_INODEDEP;
inodedep->id_fs = fs;
inodedep->id_ino = inum;
inodedep->id_state = ALLCOMPLETE;
inodedep->id_nlinkdelta = 0;
inodedep->id_savedino1 = NULL;
inodedep->id_savedsize = -1;
inodedep->id_buf = NULL;
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
ACQUIRE_LOCK(&lk);
LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
sema_release(&inodedep_in_progress);
*inodedeppp = inodedep;
return (0);
}
/*
* Structures and routines associated with newblk caching.
*/
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long newblk_hash; /* size of hash table - 1 */
STATIC struct sema newblk_in_progress;
/*
* Look up a newblk. Return 1 if found, 0 if not found.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in newblkpp.
*/
STATIC int
newblk_lookup(struct fs *fs, daddr_t newblkno, int flags,
struct newblk **newblkpp)
{
SIPHASH_CTX ctx;
struct newblk *newblk;
struct newblk_hashhead *newblkhd;
SipHash24_Init(&ctx, &softdep_hashkey);
SipHash24_Update(&ctx, &fs, sizeof(fs));
SipHash24_Update(&ctx, &newblkno, sizeof(newblkno));
newblkhd = &newblk_hashtbl[SipHash24_End(&ctx) & newblk_hash];
top:
LIST_FOREACH(newblk, newblkhd, nb_hash)
if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
break;
if (newblk) {
*newblkpp = newblk;
return (1);
}
if ((flags & DEPALLOC) == 0) {
*newblkpp = NULL;
return (0);
}
if (sema_get(&newblk_in_progress, NULL) == 0)
goto top;
newblk = pool_get(&newblk_pool, PR_WAITOK);
newblk->nb_state = 0;
newblk->nb_fs = fs;
newblk->nb_newblkno = newblkno;
LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
sema_release(&newblk_in_progress);
*newblkpp = newblk;
return (0);
}
/*
* Executed during filesystem initialization before
* mounting any file systems.
*/
void
softdep_initialize(void)
{
bioops.io_start = softdep_disk_io_initiation;
bioops.io_complete = softdep_disk_write_complete;
bioops.io_deallocate = softdep_deallocate_dependencies;
bioops.io_movedeps = softdep_move_dependencies;
bioops.io_countdeps = softdep_count_dependencies;
LIST_INIT(&mkdirlisthd);
LIST_INIT(&softdep_workitem_pending);
#ifdef KMEMSTATS
max_softdeps = min (initialvnodes * 8,
kmemstats[M_INODEDEP].ks_limit / (2 * sizeof(struct inodedep)));
#else
max_softdeps = initialvnodes * 4;
#endif
arc4random_buf(&softdep_hashkey, sizeof(softdep_hashkey));
pagedep_hashtbl = hashinit(initialvnodes / 5, M_PAGEDEP, M_WAITOK,
&pagedep_hash);
sema_init(&pagedep_in_progress, "pagedep", PRIBIO);
inodedep_hashtbl = hashinit(initialvnodes, M_INODEDEP, M_WAITOK,
&inodedep_hash);
sema_init(&inodedep_in_progress, "inodedep", PRIBIO);
newblk_hashtbl = hashinit(64, M_NEWBLK, M_WAITOK, &newblk_hash);
sema_init(&newblk_in_progress, "newblk", PRIBIO);
timeout_set(&proc_waiting_timeout, pause_timer, NULL);
pool_init(&pagedep_pool, sizeof(struct pagedep), 0, IPL_NONE,
PR_WAITOK, "pagedep", NULL);
pool_init(&inodedep_pool, sizeof(struct inodedep), 0, IPL_NONE,
PR_WAITOK, "inodedep", NULL);
pool_init(&newblk_pool, sizeof(struct newblk), 0, IPL_NONE,
PR_WAITOK, "newblk", NULL);
pool_init(&bmsafemap_pool, sizeof(struct bmsafemap), 0, IPL_NONE,
PR_WAITOK, "bmsafemap", NULL);
pool_init(&allocdirect_pool, sizeof(struct allocdirect), 0, IPL_NONE,
PR_WAITOK, "allocdir", NULL);
pool_init(&indirdep_pool, sizeof(struct indirdep), 0, IPL_NONE,
PR_WAITOK, "indirdep", NULL);
pool_init(&allocindir_pool, sizeof(struct allocindir), 0, IPL_NONE,
PR_WAITOK, "allocindir", NULL);
pool_init(&freefrag_pool, sizeof(struct freefrag), 0, IPL_NONE,
PR_WAITOK, "freefrag", NULL);
pool_init(&freeblks_pool, sizeof(struct freeblks), 0, IPL_NONE,
PR_WAITOK, "freeblks", NULL);
pool_init(&freefile_pool, sizeof(struct freefile), 0, IPL_NONE,
PR_WAITOK, "freefile", NULL);
pool_init(&diradd_pool, sizeof(struct diradd), 0, IPL_NONE,
PR_WAITOK, "diradd", NULL);
pool_init(&mkdir_pool, sizeof(struct mkdir), 0, IPL_NONE,
PR_WAITOK, "mkdir", NULL);
pool_init(&dirrem_pool, sizeof(struct dirrem), 0, IPL_NONE,
PR_WAITOK, "dirrem", NULL);
pool_init(&newdirblk_pool, sizeof(struct newdirblk), 0, IPL_NONE,
PR_WAITOK, "newdirblk", NULL);
}
/*
* Called at mount time to notify the dependency code that a
* filesystem wishes to use it.
*/
int
softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs,
struct ucred *cred)
{
struct csum_total cstotal;
struct cg *cgp;
struct buf *bp;
int error, cyl;
/*
* When doing soft updates, the counters in the
* superblock may have gotten out of sync, so we have
* to scan the cylinder groups and recalculate them.
*/
if ((fs->fs_flags & FS_UNCLEAN) == 0)
return (0);
memset(&cstotal, 0, sizeof(cstotal));
for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
fs->fs_cgsize, &bp)) != 0) {
brelse(bp);
return (error);
}
cgp = (struct cg *)bp->b_data;
cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
fs->fs_cs(fs, cyl) = cgp->cg_cs;
brelse(bp);
}
#ifdef DEBUG
if (memcmp(&cstotal, &fs->fs_cstotal, sizeof(cstotal)))
printf("ffs_mountfs: superblock updated for soft updates\n");
#endif
memcpy(&fs->fs_cstotal, &cstotal, sizeof(cstotal));
return (0);
}
/*
* Protecting the freemaps (or bitmaps).
*
* To eliminate the need to execute fsck before mounting a file system
* after a power failure, one must (conservatively) guarantee that the
* on-disk copy of the bitmaps never indicate that a live inode or block is
* free. So, when a block or inode is allocated, the bitmap should be
* updated (on disk) before any new pointers. When a block or inode is
* freed, the bitmap should not be updated until all pointers have been
* reset. The latter dependency is handled by the delayed de-allocation
* approach described below for block and inode de-allocation. The former
* dependency is handled by calling the following procedure when a block or
* inode is allocated. When an inode is allocated an "inodedep" is created
* with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
* Each "inodedep" is also inserted into the hash indexing structure so
* that any additional link additions can be made dependent on the inode
* allocation.
*
* The ufs file system maintains a number of free block counts (e.g., per
* cylinder group, per cylinder and per <cylinder, rotational position> pair)
* in addition to the bitmaps. These counts are used to improve efficiency
* during allocation and therefore must be consistent with the bitmaps.
* There is no convenient way to guarantee post-crash consistency of these
* counts with simple update ordering, for two main reasons: (1) The counts
* and bitmaps for a single cylinder group block are not in the same disk
* sector. If a disk write is interrupted (e.g., by power failure), one may
* be written and the other not. (2) Some of the counts are located in the
* superblock rather than the cylinder group block. So, we focus our soft
* updates implementation on protecting the bitmaps. When mounting a
* filesystem, we recompute the auxiliary counts from the bitmaps.
*/
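/*
 * Caller-side sketch (illustrative only, assuming the usual FFS
 * allocation path; DOINGSOFTDEP() and the exact call site are the
 * caller's, not this file's): the inode allocator updates the cylinder
 * group buffer first and records the dependency before the new inode is
 * referenced, roughly:
 *
 *	... mark the inode allocated in the cg bitmap held in bp ...
 *	if (DOINGSOFTDEP(ITOV(ip)))
 *		softdep_setup_inomapdep(bp, ip, newinum);
 *	bdwrite(bp);
 *
 * The DEPCOMPLETE flag on the resulting inodedep stays cleared until
 * the bitmap buffer actually reaches the disk.
 */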
/*
* Called just after updating the cylinder group block to allocate an inode.
*/
/* buffer for cylgroup block with inode map */
/* inode related to allocation */
/* new inode number being allocated */
void
softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ufsino_t newinum)
{
struct inodedep *inodedep;
struct bmsafemap *bmsafemap;
/*
* Create a dependency for the newly allocated inode.
* Panic if it already exists as something is seriously wrong.
* Otherwise add it to the dependency list for the buffer holding
* the cylinder group map from which it was allocated.
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC | NODELAY, &inodedep)
!= 0) {
FREE_LOCK(&lk);
panic("softdep_setup_inomapdep: found inode");
}
inodedep->id_buf = bp;
inodedep->id_state &= ~DEPCOMPLETE;
bmsafemap = bmsafemap_lookup(bp);
LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
FREE_LOCK(&lk);
}
/*
* Called just after updating the cylinder group block to
* allocate block or fragment.
*/
/* buffer for cylgroup block with block map */
/* filesystem doing allocation */
/* number of newly allocated block */
void
softdep_setup_blkmapdep(struct buf *bp, struct fs *fs, daddr_t newblkno)
{
struct newblk *newblk;
struct bmsafemap *bmsafemap;
/*
* Create a dependency for the newly allocated block.
* Add it to the dependency list for the buffer holding
* the cylinder group map from which it was allocated.
*/
if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
panic("softdep_setup_blkmapdep: found block");
ACQUIRE_LOCK(&lk);
newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
FREE_LOCK(&lk);
}
/*
* Find the bmsafemap associated with a cylinder group buffer.
* If none exists, create one. The buffer must be locked when
* this routine is called and this routine must be called with
* splbio interrupts blocked.
*/
STATIC struct bmsafemap *
bmsafemap_lookup(struct buf *bp)
{
struct bmsafemap *bmsafemap;
struct worklist *wk;
splassert(IPL_BIO);
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("bmsafemap_lookup: lock not held");
#endif
LIST_FOREACH(wk, &bp->b_dep, wk_list)
if (wk->wk_type == D_BMSAFEMAP)
return (WK_BMSAFEMAP(wk));
FREE_LOCK(&lk);
bmsafemap = pool_get(&bmsafemap_pool, PR_WAITOK);
bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
bmsafemap->sm_list.wk_state = 0;
bmsafemap->sm_buf = bp;
LIST_INIT(&bmsafemap->sm_allocdirecthd);
LIST_INIT(&bmsafemap->sm_allocindirhd);
LIST_INIT(&bmsafemap->sm_inodedephd);
LIST_INIT(&bmsafemap->sm_newblkhd);
ACQUIRE_LOCK(&lk);
WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
return (bmsafemap);
}
/*
* Direct block allocation dependencies.
*
* When a new block is allocated, the corresponding disk locations must be
* initialized (with zeros or new data) before the on-disk inode points to
* them. Also, the freemap from which the block was allocated must be
* updated (on disk) before the inode's pointer. These two dependencies are
* independent of each other and are needed for all file blocks and indirect
* blocks that are pointed to directly by the inode. Just before the
* "in-core" version of the inode is updated with a newly allocated block
* number, a procedure (below) is called to setup allocation dependency
* structures. These structures are removed when the corresponding
* dependencies are satisfied or when the block allocation becomes obsolete
* (i.e., the file is deleted, the block is de-allocated, or the block is a
* fragment that gets upgraded). All of these cases are handled in
* procedures described later.
*
* When a file extension causes a fragment to be upgraded, either to a larger
* fragment or to a full block, the on-disk location may change (if the
* previous fragment could not simply be extended). In this case, the old
* fragment must be de-allocated, but not until after the inode's pointer has
* been updated. In most cases, this is handled by later procedures, which
* will construct a "freefrag" structure to be added to the workitem queue
* when the inode update is complete (or obsolete). The main exception to
* this is when an allocation occurs while a pending allocation dependency
* (for the same block pointer) remains. This case is handled in the main
* allocation dependency setup procedure by immediately freeing the
* unreferenced fragments.
*/
/* inode to which block is being added */
/* block pointer within inode */
/* disk block number being added */
/* previous block number, 0 unless frag */
/* size of new block */
/* size of old block */
/* bp for allocated block */
void
softdep_setup_allocdirect(struct inode *ip, daddr_t lbn, daddr_t newblkno,
daddr_t oldblkno, long newsize, long oldsize, struct buf *bp)
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
struct bmsafemap *bmsafemap;
struct inodedep *inodedep;
struct pagedep *pagedep;
struct newblk *newblk;
adp = pool_get(&allocdirect_pool, PR_WAITOK | PR_ZERO);
adp->ad_list.wk_type = D_ALLOCDIRECT;
adp->ad_lbn = lbn;
adp->ad_newblkno = newblkno;
adp->ad_oldblkno = oldblkno;
adp->ad_newsize = newsize;
adp->ad_oldsize = oldsize;
adp->ad_state = ATTACHED;
LIST_INIT(&adp->ad_newdirblk);
if (newblkno == oldblkno)
adp->ad_freefrag = NULL;
else
adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocdirect: lost block");
ACQUIRE_LOCK(&lk);
inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
if (newblk->nb_state == DEPCOMPLETE) {
adp->ad_state |= DEPCOMPLETE;
adp->ad_buf = NULL;
} else {
bmsafemap = newblk->nb_bmsafemap;
adp->ad_buf = bmsafemap->sm_buf;
LIST_REMOVE(newblk, nb_deps);
LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
}
LIST_REMOVE(newblk, nb_hash);
pool_put(&newblk_pool, newblk);
if (bp == NULL) {
/*
* XXXUBC - Yes, I know how to fix this, but not right now.
*/
panic("softdep_setup_allocdirect: Bonk art in the head");
}
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
if (lbn >= NDADDR) {
/* allocating an indirect block */
if (oldblkno != 0) {
FREE_LOCK(&lk);
panic("softdep_setup_allocdirect: non-zero indir");
}
} else {
/*
* Allocating a direct block.
*
* If we are allocating a directory block, then we must
* allocate an associated pagedep to track additions and
* deletions.
*/
if ((DIP(ip, mode) & IFMT) == IFDIR &&
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
}
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
* first uncommitted block (the size of the file stored on disk
* ends at the end of the lowest committed fragment, or if there
* are no fragments, at the end of the highest committed block).
* Since files generally grow, the typical case is that the new
* block is to be added at the end of the list. We speed this
* special case by checking against the last allocdirect in the
* list before laboriously traversing the list looking for the
* insertion point.
*/
adphead = &inodedep->id_newinoupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
if (oldadp != NULL && oldadp->ad_lbn == lbn)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
if (oldadp->ad_lbn >= lbn)
break;
}
if (oldadp == NULL) {
FREE_LOCK(&lk);
panic("softdep_setup_allocdirect: lost entry");
}
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
if (oldadp->ad_lbn == lbn)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
}
/*
* Replace an old allocdirect dependency with a newer one.
* This routine must be called with splbio interrupts blocked.
*/
/* head of list holding allocdirects */
/* allocdirect being added */
/* existing allocdirect being checked */
STATIC void
allocdirect_merge(struct allocdirectlst *adphead, struct allocdirect *newadp,
struct allocdirect *oldadp)
{
struct worklist *wk;
struct freefrag *freefrag;
struct newdirblk *newdirblk;
splassert(IPL_BIO);
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("allocdirect_merge: lock not held");
#endif
if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
newadp->ad_oldsize != oldadp->ad_newsize ||
newadp->ad_lbn >= NDADDR) {
FREE_LOCK(&lk);
panic("allocdirect_merge: old %lld != new %lld || lbn %lld >= "
"%d", (long long)newadp->ad_oldblkno,
(long long)oldadp->ad_newblkno, (long long)newadp->ad_lbn,
NDADDR);
}
newadp->ad_oldblkno = oldadp->ad_oldblkno;
newadp->ad_oldsize = oldadp->ad_oldsize;
/*
* If the old dependency had a fragment to free or had never
* previously had a block allocated, then the new dependency
* can immediately post its freefrag and adopt the old freefrag.
* This action is done by swapping the freefrag dependencies.
* The new dependency gains the old one's freefrag, and the
* old one gets the new one and then immediately puts it on
* the worklist when it is freed by free_allocdirect. It is
* not possible to do this swap when the old dependency had a
* non-zero size but no previous fragment to free. This condition
* arises when the new block is an extension of the old block.
* Here, the first part of the fragment allocated to the new
* dependency is part of the block currently claimed on disk by
* the old dependency, so cannot legitimately be freed until the
* conditions for the new dependency are fulfilled.
*/
if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
freefrag = newadp->ad_freefrag;
newadp->ad_freefrag = oldadp->ad_freefrag;
oldadp->ad_freefrag = freefrag;
}
/*
* If we are tracking a new directory-block allocation,
* move it from the old allocdirect to the new allocdirect.
*/
if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
panic("allocdirect_merge: extra newdirblk");
WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
}
free_allocdirect(adphead, oldadp, 0);
}
/*
* Allocate a new freefrag structure if needed.
*/
STATIC struct freefrag *
newfreefrag(struct inode *ip, daddr_t blkno, long size)
{
struct freefrag *freefrag;
struct fs *fs;
if (blkno == 0)
return (NULL);
fs = ip->i_fs;
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
freefrag = pool_get(&freefrag_pool, PR_WAITOK);
freefrag->ff_list.wk_type = D_FREEFRAG;
freefrag->ff_state = DIP(ip, uid) & ~ONWORKLIST; /* used below */
freefrag->ff_inum = ip->i_number;
freefrag->ff_mnt = ITOV(ip)->v_mount;
freefrag->ff_devvp = ip->i_devvp;
freefrag->ff_blkno = blkno;
freefrag->ff_fragsize = size;
return (freefrag);
}
/*
* This workitem de-allocates fragments that were replaced during
* file block allocation.
*/
STATIC void
handle_workitem_freefrag(struct freefrag *freefrag)
{
struct inode tip;
struct ufs1_dinode dtip1;
tip.i_vnode = NULL;
tip.i_din1 = &dtip1;
tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
tip.i_ump = VFSTOUFS(freefrag->ff_mnt);
tip.i_dev = freefrag->ff_devvp->v_rdev;
tip.i_number = freefrag->ff_inum;
tip.i_ffs1_uid = freefrag->ff_state & ~ONWORKLIST; /* set above */
ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
pool_put(&freefrag_pool, freefrag);
}
/*
* Indirect block allocation dependencies.
*
* The same dependencies that exist for a direct block also exist when
* a new block is allocated and pointed to by an entry in a block of
* indirect pointers. The undo/redo states described above are also
* used here. Because an indirect block contains many pointers that
* may have dependencies, a second copy of the entire in-memory indirect
* block is kept. The buffer cache copy is always completely up-to-date.
* The second copy, which is used only as a source for disk writes,
* contains only the safe pointers (i.e., those that have no remaining
* update dependencies). The second copy is freed when all pointers
* are safe. The cache is not allowed to replace indirect blocks with
* pending update dependencies. If a buffer containing an indirect
* block with dependencies is written, these routines will mark it
* dirty again. It can only be successfully written once all the
* dependencies are removed. The ffs_fsync routine in conjunction with
* softdep_sync_metadata work together to get all the dependencies
* removed so that a file can be successfully written to disk. Three
* procedures are used when setting up indirect block pointer
* dependencies. The division is necessary because of the organization
* of the "balloc" routine and because of the distinction between file
* pages and file metadata blocks.
*/
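/*
 * Added note (illustrative): of the three setup procedures mentioned
 * above, softdep_setup_allocindir_page() is used when an indirect block
 * pointer is made to reference a newly allocated file page, while
 * softdep_setup_allocindir_meta() is used when it references a newly
 * allocated indirect block; both funnel into setup_allocindir_phase2(),
 * roughly:
 *
 *	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
 *	... attach aip->ai_list to the buffer holding the new block ...
 *	setup_allocindir_phase2(bp, ip, aip);
 */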
/*
* Allocate a new allocindir structure.
*/
/* inode for file being extended */
/* offset of pointer in indirect block */
/* disk block number being added */
/* previous block number, 0 if none */
STATIC struct allocindir *
newallocindir(struct inode *ip, int ptrno, daddr_t newblkno,
daddr_t oldblkno)
{
struct allocindir *aip;
aip = pool_get(&allocindir_pool, PR_WAITOK | PR_ZERO);
aip->ai_list.wk_type = D_ALLOCINDIR;
aip->ai_state = ATTACHED;
aip->ai_offset = ptrno;
aip->ai_newblkno = newblkno;
aip->ai_oldblkno = oldblkno;
aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
return (aip);
}
/*
* Called just before setting an indirect block pointer
* to a newly allocated file page.
*/
/* inode for file being extended */
/* allocated block number within file */
/* buffer with indirect blk referencing page */
/* offset of pointer in indirect block */
/* disk block number being added */
/* previous block number, 0 if none */
/* buffer holding allocated page */
void
softdep_setup_allocindir_page(struct inode *ip, daddr_t lbn, struct buf *bp,
int ptrno, daddr_t newblkno, daddr_t oldblkno, struct buf *nbp)
{
struct allocindir *aip;
struct pagedep *pagedep;
aip = newallocindir(ip, ptrno, newblkno, oldblkno);
ACQUIRE_LOCK(&lk);
/*
* If we are allocating a directory page, then we must
* allocate an associated pagedep to track additions and
* deletions.
*/
if ((DIP(ip, mode) & IFMT) == IFDIR &&
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
if (nbp == NULL) {
/*
* XXXUBC - Yes, I know how to fix this, but not right now.
*/
panic("softdep_setup_allocindir_page: Bonk art in the head");
}
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
FREE_LOCK(&lk);
setup_allocindir_phase2(bp, ip, aip);
}
/*
* Called just before setting an indirect block pointer to a
* newly allocated indirect block.
*/
/* newly allocated indirect block */
/* inode for file being extended */
/* indirect block referencing allocated block */
/* offset of pointer in indirect block */
/* disk block number being added */
void
softdep_setup_allocindir_meta(struct buf *nbp, struct inode *ip,
struct buf *bp, int ptrno, daddr_t newblkno)
{
struct allocindir *aip;
aip = newallocindir(ip, ptrno, newblkno, 0);
ACQUIRE_LOCK(&lk);
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
FREE_LOCK(&lk);
setup_allocindir_phase2(bp, ip, aip);
}
/*
* Called to finish the allocation of the "aip" allocated
* by one of the two routines above.
*/
/* in-memory copy of the indirect block */
/* inode for file being extended */
/* allocindir allocated by the above routines */
STATIC void
setup_allocindir_phase2(struct buf *bp, struct inode *ip,
struct allocindir *aip)
{
struct worklist *wk;
struct indirdep *indirdep, *newindirdep;
struct bmsafemap *bmsafemap;
struct allocindir *oldaip;
struct freefrag *freefrag;
struct newblk *newblk;
if (bp->b_lblkno >= 0)
panic("setup_allocindir_phase2: not indir blk");
for (indirdep = NULL, newindirdep = NULL; ; ) {
ACQUIRE_LOCK(&lk);
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
if (wk->wk_type != D_INDIRDEP)
continue;
indirdep = WK_INDIRDEP(wk);
break;
}
if (indirdep == NULL && newindirdep) {
indirdep = newindirdep;
WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
newindirdep = NULL;
}
FREE_LOCK(&lk);
if (indirdep) {
if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
&newblk) == 0)
panic("setup_allocindir: lost block");
ACQUIRE_LOCK(&lk);
if (newblk->nb_state == DEPCOMPLETE) {
aip->ai_state |= DEPCOMPLETE;
aip->ai_buf = NULL;
} else {
bmsafemap = newblk->nb_bmsafemap;
aip->ai_buf = bmsafemap->sm_buf;
LIST_REMOVE(newblk, nb_deps);
LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
aip, ai_deps);
}
LIST_REMOVE(newblk, nb_hash);
pool_put(&newblk_pool, newblk);
aip->ai_indirdep = indirdep;
/*
* Check to see if there is an existing dependency
* for this block. If there is, merge the old
* dependency into the new one.
*/
if (aip->ai_oldblkno == 0)
oldaip = NULL;
else
LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
if (oldaip->ai_offset == aip->ai_offset)
break;
freefrag = NULL;
if (oldaip != NULL) {
if (oldaip->ai_newblkno != aip->ai_oldblkno) {
FREE_LOCK(&lk);
panic("setup_allocindir_phase2: blkno");
}
aip->ai_oldblkno = oldaip->ai_oldblkno;
freefrag = aip->ai_freefrag;
aip->ai_freefrag = oldaip->ai_freefrag;
oldaip->ai_freefrag = NULL;
free_allocindir(oldaip, NULL);
}
LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
if (ip->i_ump->um_fstype == UM_UFS1)
((int32_t *)indirdep->ir_savebp->b_data)
[aip->ai_offset] = aip->ai_oldblkno;
else
((int64_t *)indirdep->ir_savebp->b_data)
[aip->ai_offset] = aip->ai_oldblkno;
FREE_LOCK(&lk);
if (freefrag != NULL)
handle_workitem_freefrag(freefrag);
}
if (newindirdep) {
if (indirdep->ir_savebp != NULL)
brelse(newindirdep->ir_savebp);
WORKITEM_FREE(newindirdep, D_INDIRDEP);
}
if (indirdep)
break;
newindirdep = pool_get(&indirdep_pool, PR_WAITOK);
newindirdep->ir_list.wk_type = D_INDIRDEP;
newindirdep->ir_state = ATTACHED;
if (ip->i_ump->um_fstype == UM_UFS1)
newindirdep->ir_state |= UFS1FMT;
LIST_INIT(&newindirdep->ir_deplisthd);
LIST_INIT(&newindirdep->ir_donehd);
if (bp->b_blkno == bp->b_lblkno) {
VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
NULL);
}
newindirdep->ir_savebp =
getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, INFSLP);
#if 0
BUF_KERNPROC(newindirdep->ir_savebp);
#endif
memcpy(newindirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount);
}
}
/*
* Block de-allocation dependencies.
*
* When blocks are de-allocated, the on-disk pointers must be nullified before
* the blocks are made available for use by other files. (The true
* requirement is that old pointers must be nullified before new on-disk
* pointers are set. We chose this slightly more stringent requirement to
* reduce complexity.) Our implementation handles this dependency by updating
* the inode (or indirect block) appropriately but delaying the actual block
* de-allocation (i.e., freemap and free space count manipulation) until
* after the updated versions reach stable storage. After the disk is
* updated, the blocks can be safely de-allocated whenever it is convenient.
* This implementation handles only the common case of reducing a file's
* length to zero. Other cases are handled by the conventional synchronous
* write approach.
*
* The ffs implementation with which we worked double-checks
* the state of the block pointers and file size as it reduces
* a file's length. Some of this code is replicated here in our
* soft updates implementation. The freeblks->fb_chkcnt field is
* used to transfer a part of this information to the procedure
* that eventually de-allocates the blocks.
*
* This routine should be called from the routine that shortens
* a file's length, before the inode's size or block pointers
* are modified. It will save the block pointer information for
* later release and zero the inode so that the calling routine
* can release it.
*/
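/*
 * Caller-side sketch (illustrative only, under the assumption that the
 * truncation path behaves as described above; DOINGSOFTDEP() and the
 * surrounding logic belong to the caller): when a file is truncated to
 * zero length, the dependencies are captured before the inode is
 * updated, roughly:
 *
 *	if (DOINGSOFTDEP(vp) && length == 0)
 *		softdep_setup_freeblocks(ip, 0);
 *
 * softdep_setup_freeblocks() saves the block pointers in a freeblks
 * work item, zeroes the in-core copies, and leaves the actual
 * ffs_blkfree() calls to handle_workitem_freeblocks().
 */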
/* The inode whose length is to be reduced */
/* The new length for the file */
void
softdep_setup_freeblocks(struct inode *ip, off_t length)
{
struct freeblks *freeblks;
struct inodedep *inodedep;
struct allocdirect *adp;
struct vnode *vp;
struct buf *bp;
struct fs *fs;
int i, delay, error;
fs = ip->i_fs;
if (length != 0)
panic("softdep_setup_freeblocks: non-zero length");
freeblks = pool_get(&freeblks_pool, PR_WAITOK | PR_ZERO);
freeblks->fb_list.wk_type = D_FREEBLKS;
freeblks->fb_state = ATTACHED;
freeblks->fb_uid = DIP(ip, uid);
freeblks->fb_previousinum = ip->i_number;
freeblks->fb_devvp = ip->i_devvp;
freeblks->fb_mnt = ITOV(ip)->v_mount;
freeblks->fb_oldsize = DIP(ip, size);
freeblks->fb_newsize = length;
freeblks->fb_chkcnt = DIP(ip, blocks);
for (i = 0; i < NDADDR; i++) {
freeblks->fb_dblks[i] = DIP(ip, db[i]);
DIP_ASSIGN(ip, db[i], 0);
}
for (i = 0; i < NIADDR; i++) {
freeblks->fb_iblks[i] = DIP(ip, ib[i]);
DIP_ASSIGN(ip, ib[i], 0);
}
DIP_ASSIGN(ip, blocks, 0);
DIP_ASSIGN(ip, size, 0);
/*
* Push the zero'ed inode to its disk buffer so that we are free
* to delete its dependencies below. Once the dependencies are gone
* the buffer can be safely released.
*/
if ((error = bread(ip->i_devvp,
fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, &bp)) != 0)
softdep_error("softdep_setup_freeblocks", error);
if (ip->i_ump->um_fstype == UM_UFS1)
*((struct ufs1_dinode *) bp->b_data +
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
else
*((struct ufs2_dinode *) bp->b_data +
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
/*
* Find and eliminate any inode dependencies.
*/
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
if ((inodedep->id_state & IOSTARTED) != 0) {
FREE_LOCK(&lk);
panic("softdep_setup_freeblocks: inode busy");
}
/*
* Add the freeblks structure to the list of operations that
* must await the zero'ed inode being written to disk. If we
* still have a bitmap dependency (delay == 0), then the inode
* has never been written to disk, so we can process the
* freeblks below once we have deleted the dependencies.
*/
delay = (inodedep->id_state & DEPCOMPLETE);
if (delay)
WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
* with this inode are obsolete and can simply be de-allocated.
* We must first merge the two dependency lists to get rid of
* any duplicate freefrag structures, then purge the merged list.
* If we still have a bitmap dependency, then the inode has never
* been written to disk, so we can free any fragments without delay.
*/
merge_inode_lists(inodedep);
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
free_allocdirect(&inodedep->id_inoupdt, adp, delay);
FREE_LOCK(&lk);
bdwrite(bp);
/*
* We must wait for any I/O in progress to finish so that
* all potential buffers on the dirty list will be visible.
* Once they are all there, walk the list and get rid of
* any dependencies.
*/
vp = ITOV(ip);
ACQUIRE_LOCK(&lk);
drain_output(vp, 1);
while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
if (getdirtybuf(bp, MNT_WAIT) <= 0)
break;
(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
deallocate_dependencies(bp, inodedep);
bp->b_flags |= B_INVAL | B_NOCACHE;
FREE_LOCK(&lk);
brelse(bp);
ACQUIRE_LOCK(&lk);
}
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
(void) free_inodedep(inodedep);
if (delay) {
freeblks->fb_state |= DEPCOMPLETE;
/*
* If the inode with zeroed block pointers is now on disk we
* can start freeing blocks. Add freeblks to the worklist
* instead of calling handle_workitem_freeblocks() directly as
* it is more likely that additional IO is needed to complete
* the request than in the !delay case.
*/
if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
add_to_worklist(&freeblks->fb_list);
}
FREE_LOCK(&lk);
/*
* If the inode has never been written to disk (delay == 0),
* then we can process the freeblks now that we have deleted
* the dependencies.
*/
if (!delay)
handle_workitem_freeblocks(freeblks);
}
/*
* Reclaim any dependency structures from a buffer that is about to
* be reallocated to a new vnode. The buffer must be locked, thus,
* no I/O completion operations can occur while we are manipulating
* its associated dependencies. The mutex is held so that other I/O's
* associated with related dependencies do not occur.
*/
STATIC void
deallocate_dependencies(struct buf *bp, struct inodedep *inodedep)
{
struct worklist *wk;
struct indirdep *indirdep;
struct allocindir *aip;
struct pagedep *pagedep;
struct dirrem *dirrem;
struct diradd *dap;
int i;
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
switch (wk->wk_type) {
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
/*
* None of the indirect pointers will ever be visible,
* so they can simply be tossed. GOINGAWAY ensures
* that allocated pointers will be saved in the buffer
* cache until they are freed. Note that they will
* only be able to be found by their physical address
* since the inode mapping the logical address will
* be gone. The save buffer used for the safe copy
* was allocated in setup_allocindir_phase2 using
* the physical address so it could be used for this
* purpose. Hence we swap the safe copy with the real
* copy, allowing the safe copy to be freed and holding
* on to the real copy for later use in indir_trunc.
*/
if (indirdep->ir_state & GOINGAWAY) {
FREE_LOCK(&lk);
panic("deallocate_dependencies: already gone");
}
indirdep->ir_state |= GOINGAWAY;
while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)))
free_allocindir(aip, inodedep);
if (bp->b_lblkno >= 0 ||
bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
FREE_LOCK(&lk);
panic("deallocate_dependencies: not indir");
}
memcpy(indirdep->ir_savebp->b_data, bp->b_data,
bp->b_bcount);
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
continue;
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
/*
* None of the directory additions will ever be
* visible, so they can simply be tossed.
*/
for (i = 0; i < DAHASHSZ; i++)
while ((dap =
LIST_FIRST(&pagedep->pd_diraddhd[i])))
free_diradd(dap);
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)))
free_diradd(dap);
/*
* Copy any directory remove dependencies to the list
* to be processed after the zero'ed inode is written.
* If the inode has already been written, then they
* can be dumped directly onto the work list.
*/
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd))) {
LIST_REMOVE(dirrem, dm_next);
dirrem->dm_dirinum = pagedep->pd_ino;
if (inodedep == NULL ||
(inodedep->id_state & ALLCOMPLETE) ==
ALLCOMPLETE)
add_to_worklist(&dirrem->dm_list);
else
WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
if ((pagedep->pd_state & NEWBLOCK) != 0) {
LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
if (wk->wk_type == D_NEWDIRBLK &&
WK_NEWDIRBLK(wk)->db_pagedep ==
pagedep)
break;
if (wk != NULL) {
WORKLIST_REMOVE(wk);
free_newdirblk(WK_NEWDIRBLK(wk));
} else {
FREE_LOCK(&lk);
panic("deallocate_dependencies: "
"lost pagedep");
}
}
WORKLIST_REMOVE(&pagedep->pd_list);
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
continue;
case D_ALLOCINDIR:
free_allocindir(WK_ALLOCINDIR(wk), inodedep);
continue;
case D_ALLOCDIRECT:
case D_INODEDEP:
FREE_LOCK(&lk);
panic("deallocate_dependencies: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
default:
FREE_LOCK(&lk);
panic("deallocate_dependencies: Unknown type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
}
/*
* Free an allocdirect. Generate a new freefrag work request if appropriate.
* This routine must be called with splbio interrupts blocked.
*/
STATIC void
free_allocdirect(struct allocdirectlst *adphead, struct allocdirect *adp,
int delay)
{
struct newdirblk *newdirblk;
struct worklist *wk;
splassert(IPL_BIO);
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("free_allocdirect: lock not held");
#endif
if ((adp->ad_state & DEPCOMPLETE) == 0)
LIST_REMOVE(adp, ad_deps);
TAILQ_REMOVE(adphead, adp, ad_next);
if ((adp->ad_state & COMPLETE) == 0)
WORKLIST_REMOVE(&adp->ad_list);
if (adp->ad_freefrag != NULL) {
if (delay)
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
&adp->ad_freefrag->ff_list);
else
add_to_worklist(&adp->ad_freefrag->ff_list);
}
if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
panic("free_allocdirect: extra newdirblk");
if (delay)
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
&newdirblk->db_list);
else
free_newdirblk(newdirblk);
}
WORKITEM_FREE(adp, D_ALLOCDIRECT);
}
/*
* Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
* This routine must be called with splbio interrupts blocked.
*/
void
free_newdirblk(struct newdirblk *newdirblk)
{
struct pagedep *pagedep;
struct diradd *dap;
int i;
splassert(IPL_BIO);
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("free_newdirblk: lock not held");
#endif
/*
* If the pagedep is still linked onto the directory buffer
* dependency chain, then some of the entries on the
* pd_pendinghd list may not be committed to disk yet. In
* this case, we will simply clear the NEWBLOCK flag and
* let the pd_pendinghd list be processed when the pagedep
* is next written. If the pagedep is no longer on the buffer
* dependency chain, then all the entries on the pd_pendinghd
* list are committed to disk and we can free them here.
*/
pagedep = newdirblk->db_pagedep;
pagedep->pd_state &= ~NEWBLOCK;
if ((pagedep->pd_state & ONWORKLIST) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
free_diradd(dap);
/*
* If no dependencies remain, the pagedep will be freed.
*/
for (i = 0; i < DAHASHSZ; i++)
if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
break;
if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}
/*
* Prepare an inode to be freed. The actual free operation is not
* done until the zero'ed inode has been written to disk.
*/
void
softdep_freefile(struct vnode *pvp, ufsino_t ino, mode_t mode)
{
struct inode *ip = VTOI(pvp);
struct inodedep *inodedep;
struct freefile *freefile;
/*
* This sets up the inode de-allocation dependency.
*/
freefile = pool_get(&freefile_pool, PR_WAITOK);
freefile->fx_list.wk_type = D_FREEFILE;
freefile->fx_list.wk_state = 0;
freefile->fx_mode = mode;
freefile->fx_oldinum = ino;
freefile->fx_devvp = ip->i_devvp;
freefile->fx_mnt = ITOV(ip)->v_mount;
/*
* If the inodedep does not exist, then the zero'ed inode has
* been written to disk. If the allocated inode has never been
* written to disk, then the on-disk inode is zero'ed. In either
* case we can free the file immediately.
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
check_inode_unwritten(inodedep)) {
FREE_LOCK(&lk);
handle_workitem_freefile(freefile);
return;
}
WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
FREE_LOCK(&lk);
}
/*
* Check to see if an inode has never been written to disk. If
* so free the inodedep and return success, otherwise return failure.
* This routine must be called with splbio interrupts blocked.
*
* If we still have a bitmap dependency, then the inode has never
* been written to disk. Drop the dependency as it is no longer
* necessary since the inode is being deallocated. We set the
* ALLCOMPLETE flags since the bitmap now properly shows that the
* inode is not allocated. Even if the inode is actively being
* written, it has been rolled back to its zero'ed state, so we
* are ensured that a zero inode is what is on the disk. For short
* lived files, this change will usually result in removing all the
* dependencies from the inode so that it can be freed immediately.
*/
STATIC int
check_inode_unwritten(struct inodedep *inodedep)
{
splassert(IPL_BIO);
if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
LIST_FIRST(&inodedep->id_inowait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
inodedep->id_nlinkdelta != 0)
return (0);
inodedep->id_state |= ALLCOMPLETE;
LIST_REMOVE(inodedep, id_deps);
inodedep->id_buf = NULL;
if (inodedep->id_state & ONWORKLIST)
WORKLIST_REMOVE(&inodedep->id_list);
if (inodedep->id_savedino1 != NULL) {
free(inodedep->id_savedino1, M_INODEDEP, inodedep->id_unsize);
inodedep->id_savedino1 = NULL;
}
if (free_inodedep(inodedep) == 0) {
FREE_LOCK(&lk);
panic("check_inode_unwritten: busy inode");
}
return (1);
}
/*
* Try to free an inodedep structure. Return 1 if it could be freed.
*/
STATIC int
free_inodedep(struct inodedep *inodedep)
{
if ((inodedep->id_state & ONWORKLIST) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
LIST_FIRST(&inodedep->id_inowait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
return (0);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
num_inodedep -= 1;
return (1);
}
/*
* This workitem routine performs the block de-allocation.
* The workitem is added to the pending list after the updated
* inode block has been written to disk. As mentioned above,
* checks regarding the number of blocks de-allocated (compared
* to the number of blocks allocated for the file) are also
* performed in this function.
*/
STATIC void
handle_workitem_freeblocks(struct freeblks *freeblks)
{
struct inode tip;
daddr_t bn;
union {
struct ufs1_dinode di1;
struct ufs2_dinode di2;
} di;
struct fs *fs;
int i, level, bsize;
long nblocks, blocksreleased = 0;
int error, allerror = 0;
daddr_t baselbns[NIADDR], tmpval;
if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UM_UFS1)
tip.i_din1 = &di.di1;
else
tip.i_din2 = &di.di2;
tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
tip.i_number = freeblks->fb_previousinum;
tip.i_ump = VFSTOUFS(freeblks->fb_mnt);
tip.i_dev = freeblks->fb_devvp->v_rdev;
DIP_ASSIGN(&tip, size, freeblks->fb_oldsize);
DIP_ASSIGN(&tip, uid, freeblks->fb_uid);
tip.i_vnode = NULL;
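/*
 * tip is a throw-away in-core inode standing in for the file whose
 * blocks are being released; the real inode has already been zeroed
 * on disk and need not still be resident.  Only the fields consulted
 * by ffs_blkfree() and indir_trunc() below are filled in.
 */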
tmpval = 1;
baselbns[0] = NDADDR;
for (i = 1; i < NIADDR; i++) {
tmpval *= NINDIR(fs);
baselbns[i] = baselbns[i - 1] + tmpval;
}
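/*
 * baselbns[level] is the first logical block number reached through
 * the level'th indirect block: NDADDR for the single indirect,
 * NDADDR + NINDIR(fs) for the double indirect, and so on.  These
 * starting offsets are handed to indir_trunc() so the recursion can
 * track which logical blocks each pointer maps.
 */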
nblocks = btodb(fs->fs_bsize);
blocksreleased = 0;
/*
* Indirect blocks first.
*/
for (level = (NIADDR - 1); level >= 0; level--) {
if ((bn = freeblks->fb_iblks[level]) == 0)
continue;
if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
baselbns[level], &blocksreleased)) != 0)
allerror = error;
ffs_blkfree(&tip, bn, fs->fs_bsize);
blocksreleased += nblocks;
}
/*
* All direct blocks or frags.
*/
for (i = (NDADDR - 1); i >= 0; i--) {
if ((bn = freeblks->fb_dblks[i]) == 0)
continue;
bsize = blksize(fs, &tip, i);
ffs_blkfree(&tip, bn, bsize);
blocksreleased += btodb(bsize);
}
#ifdef DIAGNOSTIC
if (freeblks->fb_chkcnt != blocksreleased)
printf("handle_workitem_freeblocks: block count\n");
if (allerror)
softdep_error("handle_workitem_freeblks", allerror);
#endif /* DIAGNOSTIC */
WORKITEM_FREE(freeblks, D_FREEBLKS);
}
/*
* Release blocks associated with the inode ip and stored in the indirect
* block dbn. If level is greater than SINGLE, the block is an indirect block
* and recursive calls to indir_trunc must be used to cleanse other indirect
* blocks.
*/
STATIC int
indir_trunc(struct inode *ip, daddr_t dbn, int level, daddr_t lbn,
long *countp)
{
struct buf *bp;
int32_t *bap1 = NULL;
int64_t nb, *bap2 = NULL;
struct fs *fs;
struct worklist *wk;
struct indirdep *indirdep;
int i, lbnadd, nblocks, ufs1fmt;
int error, allerror = 0;
fs = ip->i_fs;
lbnadd = 1;
for (i = level; i > 0; i--)
lbnadd *= NINDIR(fs);
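/*
 * lbnadd is NINDIR(fs)^level, the number of logical blocks mapped
 * by a single pointer at this level of indirection.
 */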
/*
* Get buffer of block pointers to be freed. This routine is not
* called until the zero'ed inode has been written, so it is safe
* to free blocks as they are encountered. Because the inode has
* been zero'ed, calls to bmap on these blocks will fail. So, we
* have to use the on-disk address and the block device for the
* filesystem to look them up. If the file was deleted before its
* indirect blocks were all written to disk, the routine that set
* us up (deallocate_dependencies) will have arranged to leave
* a complete copy of the indirect block in memory for our use.
* Otherwise we have to read the blocks in from the disk.
*/
ACQUIRE_LOCK(&lk);
if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
(wk = LIST_FIRST(&bp->b_dep)) != NULL) {
if (wk->wk_type != D_INDIRDEP ||
(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
(indirdep->ir_state & GOINGAWAY) == 0) {
FREE_LOCK(&lk);
panic("indir_trunc: lost indirdep");
}
WORKLIST_REMOVE(wk);
WORKITEM_FREE(indirdep, D_INDIRDEP);
if (LIST_FIRST(&bp->b_dep) != NULL) {
FREE_LOCK(&lk);
panic("indir_trunc: dangling dep");
}
FREE_LOCK(&lk);
} else {
FREE_LOCK(&lk);
error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, &bp);
if (error)
return (error);
}
/*
* Recursively free indirect blocks.
*/
if (ip->i_ump->um_fstype == UM_UFS1) {
ufs1fmt = 1;
bap1 = (int32_t *)bp->b_data;
} else {
ufs1fmt = 0;
bap2 = (int64_t *)bp->b_data;
}
nblocks = btodb(fs->fs_bsize);
for (i = NINDIR(fs) - 1; i >= 0; i--) {
if (ufs1fmt)
nb = bap1[i];
else
nb = bap2[i];
if (nb == 0)
continue;
if (level != 0) {
if ((error = indir_trunc(ip, fsbtodb(fs, nb),
level - 1, lbn + (i * lbnadd), countp)) != 0)
allerror = error;
}
ffs_blkfree(ip, nb, fs->fs_bsize);
*countp += nblocks;
}
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
return (allerror);
}
/*
* Free an allocindir.
* This routine must be called with splbio interrupts blocked.
*/
STATIC void
free_allocindir(struct allocindir *aip, struct inodedep *inodedep)
{
struct freefrag *freefrag;
splassert(IPL_BIO);
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("free_allocindir: lock not held");
#endif
if ((aip->ai_state & DEPCOMPLETE) == 0)
LIST_REMOVE(aip, ai_deps);
if (aip->ai_state & ONWORKLIST)
WORKLIST_REMOVE(&aip->ai_list);
LIST_REMOVE(aip, ai_next);
if ((freefrag = aip->ai_freefrag) != NULL) {
if (inodedep == NULL)
add_to_worklist(&freefrag->ff_list);
else
WORKLIST_INSERT(&inodedep->id_bufwait,
&freefrag->ff_list);
}
WORKITEM_FREE(aip, D_ALLOCINDIR);
}
/*
* Directory entry addition dependencies.
*
* When adding a new directory entry, the inode (with its incremented link
* count) must be written to disk before the directory entry's pointer to it.
* Also, if the inode is newly allocated, the corresponding freemap must be
* updated (on disk) before the directory entry's pointer. These requirements
* are met via undo/redo on the directory entry's pointer, which consists
* simply of the inode number.
*
* As directory entries are added and deleted, the free space within a
* directory block can become fragmented. The ufs file system will compact
* a fragmented directory block to make space for a new entry. When this
* occurs, the offsets of previously added entries change. Any "diradd"
* dependency structures corresponding to these entries must be updated with
* the new offsets.
*/
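/*
 * In outline, the ordering enforced for a new entry is roughly:
 * (1) the inode, with its incremented link count, reaches the disk;
 * (2) if the inode is newly allocated, its cylinder group bitmap
 *     reaches the disk;
 * (3) only then may the on-disk directory entry point at the inode.
 * Until (1) and (2) are satisfied, initiate_write_filepage() rolls
 * the entry's inode number back (to zero, or to the previous inode
 * for a name change) whenever the directory block is written.
 */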
/*
* This routine is called after the in-memory inode's link
* count has been incremented, but before the directory entry's
* pointer to the inode has been set.
*/
/* buffer containing directory block */
/* inode for directory */
/* offset of new entry in directory */
/* inode referenced by new directory entry */
/* non-NULL => contents of new mkdir */
/* entry is in a newly allocated block */
int
softdep_setup_directory_add(struct buf *bp, struct inode *dp, off_t diroffset,
long newinum, struct buf *newdirbp, int isnewblk)
{
int offset; /* offset of new entry within directory block */
daddr_t lbn; /* block in directory containing new entry */
struct fs *fs;
struct diradd *dap;
struct allocdirect *adp;
struct pagedep *pagedep;
struct inodedep *inodedep;
struct newdirblk *newdirblk = NULL;
struct mkdir *mkdir1, *mkdir2;
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
offset = blkoff(fs, diroffset);
dap = pool_get(&diradd_pool, PR_WAITOK | PR_ZERO);
dap->da_list.wk_type = D_DIRADD;
dap->da_offset = offset;
dap->da_newinum = newinum;
dap->da_state = ATTACHED;
if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
newdirblk = pool_get(&newdirblk_pool, PR_WAITOK);
newdirblk->db_list.wk_type = D_NEWDIRBLK;
newdirblk->db_state = 0;
}
if (newdirbp == NULL) {
dap->da_state |= DEPCOMPLETE;
ACQUIRE_LOCK(&lk);
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
mkdir1 = pool_get(&mkdir_pool, PR_WAITOK);
mkdir1->md_list.wk_type = D_MKDIR;
mkdir1->md_state = MKDIR_BODY;
mkdir1->md_diradd = dap;
mkdir2 = pool_get(&mkdir_pool, PR_WAITOK);
mkdir2->md_list.wk_type = D_MKDIR;
mkdir2->md_state = MKDIR_PARENT;
mkdir2->md_diradd = dap;
/*
* Dependency on "." and ".." being written to disk.
*/
mkdir1->md_buf = newdirbp;
ACQUIRE_LOCK(&lk);
LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
FREE_LOCK(&lk);
bdwrite(newdirbp);
/*
* Dependency on link count increase for parent directory
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
|| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
dap->da_state &= ~MKDIR_PARENT;
WORKITEM_FREE(mkdir2, D_MKDIR);
} else {
LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
}
/*
* Link into parent directory pagedep to await its being written.
*/
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
/*
* Link into its inodedep. Put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
diradd_inode_written(dap, inodedep);
else
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
if (isnewblk) {
/*
* Directories growing into indirect blocks are rare
* enough, and new block allocation in those directories
* rarer still, that we choose not to bother tracking
* them. Rather we simply force the
* new directory entry to disk.
*/
if (lbn >= NDADDR) {
FREE_LOCK(&lk);
/*
* We only have a new allocation when at the
* beginning of a new block, not when we are
* expanding into an existing block.
*/
if (blkoff(fs, diroffset) == 0)
return (1);
return (0);
}
/*
* We only have a new allocation when at the beginning
* of a new fragment, not when we are expanding into an
* existing fragment. Also, there is nothing to do if we
* are already tracking this block.
*/
if (fragoff(fs, diroffset) != 0) {
FREE_LOCK(&lk);
return (0);
}
if ((pagedep->pd_state & NEWBLOCK) != 0) {
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
FREE_LOCK(&lk);
return (0);
}
/*
* Find our associated allocdirect and have it track us.
*/
if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
panic("softdep_setup_directory_add: lost inodedep");
adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
if (adp == NULL || adp->ad_lbn != lbn) {
FREE_LOCK(&lk);
panic("softdep_setup_directory_add: lost entry");
}
pagedep->pd_state |= NEWBLOCK;
newdirblk->db_pagedep = pagedep;
WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
}
FREE_LOCK(&lk);
return (0);
}
/*
* This procedure is called to change the offset of a directory
* entry when compacting a directory block which must be owned
* exclusively by the caller. Note that the actual entry movement
* must be done in this procedure to ensure that no I/O completions
* occur while the move is in progress.
*/
/* inode for directory */
/* address of dp->i_offset */
/* address of old directory location */
/* address of new directory location */
/* size of directory entry */
void
softdep_change_directoryentry_offset(struct inode *dp, caddr_t base,
caddr_t oldloc, caddr_t newloc, int entrysize)
{
int offset, oldoffset, newoffset;
struct pagedep *pagedep;
struct diradd *dap;
daddr_t lbn;
ACQUIRE_LOCK(&lk);
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
goto done;
oldoffset = offset + (oldloc - base);
newoffset = offset + (newloc - base);
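/*
 * diradds are hashed on their offset within the directory block
 * (DIRADDHASH), so an entry that lands in a different hash bucket
 * must be re-linked; otherwise updating da_offset is sufficient.
 */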
LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
if (dap->da_offset != oldoffset)
continue;
dap->da_offset = newoffset;
if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
break;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
dap, da_pdlist);
break;
}
if (dap == NULL) {
LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
if (dap->da_offset == oldoffset) {
dap->da_offset = newoffset;
break;
}
}
}
done:
memmove(newloc, oldloc, entrysize);
FREE_LOCK(&lk);
}
/*
* Free a diradd dependency structure. This routine must be called
* with splbio interrupts blocked.
*/
STATIC void
free_diradd(struct diradd *dap)
{
struct dirrem *dirrem;
struct pagedep *pagedep;
struct inodedep *inodedep;
struct mkdir *mkdir, *nextmd;
splassert(IPL_BIO);
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("free_diradd: lock not held");
#endif
WORKLIST_REMOVE(&dap->da_list);
LIST_REMOVE(dap, da_pdlist);
if ((dap->da_state & DIRCHG) == 0) {
pagedep = dap->da_pagedep;
} else {
dirrem = dap->da_previous;
pagedep = dirrem->dm_pagedep;
dirrem->dm_dirinum = pagedep->pd_ino;
add_to_worklist(&dirrem->dm_list);
}
if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
0, &inodedep) != 0)
(void) free_inodedep(inodedep);
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
nextmd = LIST_NEXT(mkdir, md_mkdirs);
if (mkdir->md_diradd != dap)
continue;
dap->da_state &= ~mkdir->md_state;
WORKLIST_REMOVE(&mkdir->md_list);
LIST_REMOVE(mkdir, md_mkdirs);
WORKITEM_FREE(mkdir, D_MKDIR);
}
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
FREE_LOCK(&lk);
panic("free_diradd: unfound ref");
}
}
WORKITEM_FREE(dap, D_DIRADD);
}
/*
* Directory entry removal dependencies.
*
* When removing a directory entry, the entry's inode pointer must be
* zero'ed on disk before the corresponding inode's link count is decremented
* (possibly freeing the inode for re-use). This dependency is handled by
* updating the directory entry but delaying the inode count reduction until
* after the directory block has been written to disk. After this point, the
* inode count can be decremented whenever it is convenient.
*/
/*
* This routine should be called immediately after removing
* a directory entry. The inode's link count should not be
* decremented by the calling procedure -- the soft updates
* code will do this task when it is safe.
*/
/* buffer containing directory block */
/* inode for the directory being modified */
/* inode for directory entry being removed */
/* indicates if doing RMDIR */
void
softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip,
int isrmdir)
{
struct dirrem *dirrem, *prevdirrem;
/*
* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
*/
dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
/*
* If the COMPLETE flag is clear, then there were no active
* entries and we want to roll back to a zeroed entry until
* the new inode is committed to disk. If the COMPLETE flag is
* set then we have deleted an entry that never made it to
* disk. If the entry we deleted resulted from a name change,
* then the old name still resides on disk. We cannot delete
* its inode (returned to us in prevdirrem) until the zeroed
* directory entry gets to disk. The new inode has never been
* referenced on the disk, so can be deleted immediately.
*/
if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
dm_next);
FREE_LOCK(&lk);
} else {
if (prevdirrem != NULL)
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
prevdirrem, dm_next);
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
FREE_LOCK(&lk);
handle_workitem_remove(dirrem);
}
}
STATIC long num_dirrem; /* number of dirrem allocated */
/*
* Allocate a new dirrem if appropriate and return it along with
* its associated pagedep. Called without a lock, returns with lock.
*/
/* buffer containing directory block */
/* inode for the directory being modified */
/* inode for directory entry being removed */
/* indicates if doing RMDIR */
/* previously referenced inode, if any */
STATIC struct dirrem *
newdirrem(struct buf *bp, struct inode *dp, struct inode *ip, int isrmdir,
struct dirrem **prevdirremp)
{
int offset;
daddr_t lbn;
struct diradd *dap;
struct dirrem *dirrem;
struct pagedep *pagedep;
/*
* Whiteouts have no deletion dependencies.
*/
if (ip == NULL)
panic("newdirrem: whiteout");
/*
* If we are over our limit, try to improve the situation.
* Limiting the number of dirrem structures will also limit
* the number of freefile and freeblks structures.
*/
if (num_dirrem > max_softdeps / 2)
(void) request_cleanup(FLUSH_REMOVE, 0);
num_dirrem += 1;
dirrem = pool_get(&dirrem_pool, PR_WAITOK | PR_ZERO);
dirrem->dm_list.wk_type = D_DIRREM;
dirrem->dm_state = isrmdir ? RMDIR : 0;
dirrem->dm_mnt = ITOV(ip)->v_mount;
dirrem->dm_oldinum = ip->i_number;
*prevdirremp = NULL;
ACQUIRE_LOCK(&lk);
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dirrem->dm_pagedep = pagedep;
/*
* Check for a diradd dependency for the same directory entry.
* If present, then both dependencies become obsolete and can
* be de-allocated. Check for an entry on both the pd_diraddhd
* list and the pd_pendinghd list.
*/
LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
if (dap->da_offset == offset)
break;
if (dap == NULL) {
LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
if (dap->da_offset == offset)
break;
if (dap == NULL)
return (dirrem);
}
/*
* Must be ATTACHED at this point.
*/
if ((dap->da_state & ATTACHED) == 0) {
FREE_LOCK(&lk);
panic("newdirrem: not ATTACHED");
}
if (dap->da_newinum != ip->i_number) {
FREE_LOCK(&lk);
panic("newdirrem: inum %u should be %u",
ip->i_number, dap->da_newinum);
}
/*
* If we are deleting a changed name that never made it to disk,
* then return the dirrem describing the previous inode (which
* represents the inode currently referenced from this entry on disk).
*/
if ((dap->da_state & DIRCHG) != 0) {
*prevdirremp = dap->da_previous;
dap->da_state &= ~DIRCHG;
dap->da_pagedep = pagedep;
}
/*
* We are deleting an entry that never made it to disk.
* Mark it COMPLETE so we can delete its inode immediately.
*/
dirrem->dm_state |= COMPLETE;
free_diradd(dap);
return (dirrem);
}
/*
* Directory entry change dependencies.
*
* Changing an existing directory entry requires that an add operation
* be completed first followed by a deletion. The semantics for the addition
* are identical to the description of adding a new entry above except
* that the rollback is to the old inode number rather than zero. Once
* the addition dependency is completed, the removal is done as described
* in the removal routine above.
*/
/*
* This routine should be called immediately after changing
* a directory entry. The inode's link count should not be
* decremented by the calling procedure -- the soft updates
* code will perform this task when it is safe.
*/
/* buffer containing directory block */
/* inode for the directory being modified */
/* inode for directory entry being removed */
/* new inode number for changed entry */
/* indicates if doing RMDIR */
void
softdep_setup_directory_change(struct buf *bp, struct inode *dp,
struct inode *ip, long newinum, int isrmdir)
{
int offset;
struct diradd *dap;
struct dirrem *dirrem, *prevdirrem;
struct pagedep *pagedep;
struct inodedep *inodedep;
offset = blkoff(dp->i_fs, dp->i_offset);
dap = pool_get(&diradd_pool, PR_WAITOK | PR_ZERO);
dap->da_list.wk_type = D_DIRADD;
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
dap->da_offset = offset;
dap->da_newinum = newinum;
/*
* Allocate a new dirrem and ACQUIRE_LOCK.
*/
dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
pagedep = dirrem->dm_pagedep;
/*
* The possible values for isrmdir:
* 0 - non-directory file rename
* 1 - directory rename within same directory
* inum - directory rename to new directory of given inode number
* When renaming to a new directory, we are both deleting and
* creating a new directory entry, so the link count on the new
* directory should not change. Thus we do not need the followup
* dirrem which is usually done in handle_workitem_remove. We set
* the DIRCHG flag to tell handle_workitem_remove to skip the
* followup dirrem.
*/
if (isrmdir > 1)
dirrem->dm_state |= DIRCHG;
/*
* If the COMPLETE flag is clear, then there were no active
* entries and we want to roll back to the previous inode until
* the new inode is committed to disk. If the COMPLETE flag is
* set, then we have deleted an entry that never made it to disk.
* If the entry we deleted resulted from a name change, then the old
* inode reference still resides on disk. Any rollback that we do
* needs to be to that old inode (returned to us in prevdirrem). If
* the entry we deleted resulted from a create, then there is
* no entry on the disk, so we want to roll back to zero rather
* than the uncommitted inode. In either of the COMPLETE cases we
* want to immediately free the unwritten and unreferenced inode.
*/
if ((dirrem->dm_state & COMPLETE) == 0) {
dap->da_previous = dirrem;
} else {
if (prevdirrem != NULL) {
dap->da_previous = prevdirrem;
} else {
dap->da_state &= ~DIRCHG;
dap->da_pagedep = pagedep;
}
dirrem->dm_dirinum = pagedep->pd_ino;
add_to_worklist(&dirrem->dm_list);
}
/*
* Link into its inodedep. Put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
dap->da_state |= COMPLETE;
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
} else {
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
}
FREE_LOCK(&lk);
}
/*
* Called whenever the link count on an inode is changed.
* It creates an inode dependency so that the new reference(s)
* to the inode cannot be committed to disk until the updated
* inode has been written.
*/
/* the inode with the increased link count */
/* do background work or not */
void
softdep_change_linkcnt(struct inode *ip, int nodelay)
{
struct inodedep *inodedep;
int flags;
/*
* If requested, do not allow background work to happen.
*/
flags = DEPALLOC;
if (nodelay)
flags |= NODELAY;
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(ip->i_fs, ip->i_number, flags, &inodedep);
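/*
 * ip->i_effnlink is the link count as seen by the rest of the
 * system; DIP(ip, nlink) may be larger because removals cannot be
 * reflected in the on-disk inode until the zeroed directory entries
 * are safely written.  The difference is recorded in id_nlinkdelta.
 */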
if (DIP(ip, nlink) < ip->i_effnlink) {
FREE_LOCK(&lk);
panic("softdep_change_linkcnt: bad delta");
}
inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
FREE_LOCK(&lk);
}
/*
* This workitem decrements the inode's link count.
* If the link count reaches zero, the file is removed.
*/
STATIC void
handle_workitem_remove(struct dirrem *dirrem)
{
struct proc *p = CURPROC; /* XXX */
struct inodedep *inodedep;
struct vnode *vp;
struct inode *ip;
ufsino_t oldinum;
int error;
if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
softdep_error("handle_workitem_remove: vget", error);
return;
}
ip = VTOI(vp);
ACQUIRE_LOCK(&lk);
if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep))
== 0) {
FREE_LOCK(&lk);
panic("handle_workitem_remove: lost inodedep");
}
/*
* Normal file deletion.
*/
if ((dirrem->dm_state & RMDIR) == 0) {
DIP_ADD(ip, nlink, -1);
ip->i_flag |= IN_CHANGE;
if (DIP(ip, nlink) < ip->i_effnlink) {
FREE_LOCK(&lk);
panic("handle_workitem_remove: bad file delta");
}
inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
FREE_LOCK(&lk);
vput(vp);
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
return;
}
/*
* Directory deletion. Decrement reference count for both the
* just deleted parent directory entry and the reference for ".".
* Next truncate the directory to length zero. When the
* truncation completes, arrange to have the reference count on
* the parent decremented to account for the loss of "..".
*/
DIP_ADD(ip, nlink, -2);
ip->i_flag |= IN_CHANGE;
if (DIP(ip, nlink) < ip->i_effnlink)
panic("handle_workitem_remove: bad dir delta");
inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
FREE_LOCK(&lk);
if ((error = UFS_TRUNCATE(ip, (off_t)0, 0, p->p_ucred)) != 0)
softdep_error("handle_workitem_remove: truncate", error);
/*
* Rename a directory to a new parent. Since we are both deleting
* and creating a new directory entry, the link count on the new
* directory should not change. Thus we skip the followup dirrem.
*/
if (dirrem->dm_state & DIRCHG) {
vput(vp);
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
return;
}
/*
* If the inodedep does not exist, then the zero'ed inode has
* been written to disk. If the allocated inode has never been
* written to disk, then the on-disk inode is zero'ed. In either
* case we can remove the file immediately.
*/
ACQUIRE_LOCK(&lk);
dirrem->dm_state = 0;
oldinum = dirrem->dm_oldinum;
dirrem->dm_oldinum = dirrem->dm_dirinum;
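/*
 * Re-target the dirrem at the parent directory (dm_dirinum) so that
 * a second pass through handle_workitem_remove() drops the parent's
 * link count for the lost "..", either immediately below or once the
 * zeroed child inode reaches the disk.
 */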
if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
check_inode_unwritten(inodedep)) {
FREE_LOCK(&lk);
vput(vp);
handle_workitem_remove(dirrem);
return;
}
WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
FREE_LOCK(&lk);
ip->i_flag |= IN_CHANGE;
UFS_UPDATE(VTOI(vp), 0);
vput(vp);
}
/*
* Inode de-allocation dependencies.
*
* When an inode's link count is reduced to zero, it can be de-allocated. We
* found it convenient to postpone de-allocation until after the inode is
* written to disk with its new link count (zero). At this point, all of the
* on-disk inode's block pointers are nullified and, with careful dependency
* list ordering, all dependencies related to the inode will be satisfied and
* the corresponding dependency structures de-allocated. So, if/when the
* inode is reused, there will be no mixing of old dependencies with new
* ones. This artificial dependency is set up by the block de-allocation
* procedure above (softdep_setup_freeblocks) and completed by the
* following procedure.
*/
STATIC void
handle_workitem_freefile(struct freefile *freefile)
{
struct fs *fs;
struct vnode vp;
struct inode tip;
#ifdef DEBUG
struct inodedep *idp;
#endif
int error;
fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
#ifdef DEBUG
ACQUIRE_LOCK(&lk);
error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
FREE_LOCK(&lk);
if (error)
panic("handle_workitem_freefile: inodedep survived");
#endif
tip.i_ump = VFSTOUFS(freefile->fx_mnt);
tip.i_dev = freefile->fx_devvp->v_rdev;
tip.i_fs = fs;
tip.i_vnode = &vp;
vp.v_data = &tip;
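/*
 * As in handle_workitem_freeblocks() above, a throw-away inode and
 * vnode on the stack supply just enough context (filesystem, device,
 * ufsmount) for ffs_freefile(); the real in-core inode may already
 * be gone.
 */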
if ((error = ffs_freefile(&tip, freefile->fx_oldinum,
freefile->fx_mode)) != 0) {
softdep_error("handle_workitem_freefile", error);
}
WORKITEM_FREE(freefile, D_FREEFILE);
}
/*
* Disk writes.
*
* The dependency structures constructed above are most actively used when file
* system blocks are written to disk. No constraints are placed on when a
* block can be written, but unsatisfied update dependencies are made safe by
* modifying (or replacing) the source memory for the duration of the disk
* write. When the disk write completes, the memory block is again brought
* up-to-date.
*
* In-core inode structure reclamation.
*
* Because there are a finite number of "in-core" inode structures, they are
* reused regularly. By transferring all inode-related dependencies to the
* in-memory inode block and indexing them separately (via "inodedep"s), we
* can allow "in-core" inode structures to be reused at any time and avoid
* any increase in contention.
*
* Called just before entering the device driver to initiate a new disk I/O.
* The buffer must be locked, thus, no I/O completion operations can occur
* while we are manipulating its associated dependencies.
*/
/* structure describing disk write to occur */
void
softdep_disk_io_initiation(struct buf *bp)
{
struct worklist *wk, *nextwk;
struct indirdep *indirdep;
struct inodedep *inodedep;
struct buf *sbp;
/*
* We only care about write operations. There should never
* be dependencies for reads.
*/
if (bp->b_flags & B_READ)
panic("softdep_disk_io_initiation: read");
ACQUIRE_LOCK(&lk);
/*
* Do any necessary pre-I/O processing.
*/
for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
nextwk = LIST_NEXT(wk, wk_list);
switch (wk->wk_type) {
case D_PAGEDEP:
initiate_write_filepage(WK_PAGEDEP(wk), bp);
continue;
case D_INODEDEP:
inodedep = WK_INODEDEP(wk);
if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
initiate_write_inodeblock_ufs1(inodedep, bp);
#ifdef FFS2
else
initiate_write_inodeblock_ufs2(inodedep, bp);
#endif
continue;
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
if (indirdep->ir_state & GOINGAWAY)
panic("disk_io_initiation: indirdep gone");
/*
* If there are no remaining dependencies, this
* will be writing the real pointers, so the
* dependency can be freed.
*/
if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
sbp = indirdep->ir_savebp;
sbp->b_flags |= B_INVAL | B_NOCACHE;
/* inline expand WORKLIST_REMOVE(wk); */
wk->wk_state &= ~ONWORKLIST;
LIST_REMOVE(wk, wk_list);
WORKITEM_FREE(indirdep, D_INDIRDEP);
FREE_LOCK(&lk);
brelse(sbp);
ACQUIRE_LOCK(&lk);
continue;
}
/*
* Replace up-to-date version with safe version.
*/
FREE_LOCK(&lk);
indirdep->ir_saveddata = malloc(bp->b_bcount,
M_INDIRDEP, M_WAITOK);
ACQUIRE_LOCK(&lk);
indirdep->ir_state &= ~ATTACHED;
indirdep->ir_state |= UNDONE;
memcpy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
memcpy(bp->b_data, indirdep->ir_savebp->b_data,
bp->b_bcount);
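/*
 * The buffer now goes to disk carrying only pointers whose
 * dependencies are satisfied (the safe copy); the up-to-date
 * pointers are parked in ir_saveddata and restored by
 * softdep_disk_write_complete() once the write finishes.
 */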
continue;
case D_MKDIR:
case D_BMSAFEMAP:
case D_ALLOCDIRECT:
case D_ALLOCINDIR:
continue;
default:
FREE_LOCK(&lk);
panic("handle_disk_io_initiation: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
FREE_LOCK(&lk);
}
/*
* Called from within the procedure above to deal with unsatisfied
* allocation dependencies in a directory. The buffer must be locked,
* thus, no I/O completion operations can occur while we are
* manipulating its associated dependencies.
*/
STATIC void
initiate_write_filepage(struct pagedep *pagedep, struct buf *bp)
{
struct diradd *dap;
struct direct *ep;
int i;
if (pagedep->pd_state & IOSTARTED) {
/*
* This can only happen if there is a driver that does not
* understand chaining. Here biodone will reissue the call
* to strategy for the incomplete buffers.
*/
printf("initiate_write_filepage: already started\n");
return;
}
pagedep->pd_state |= IOSTARTED;
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
ep = (struct direct *)
((char *)bp->b_data + dap->da_offset);
if (ep->d_ino != dap->da_newinum) {
FREE_LOCK(&lk);
panic("%s: dir inum %u != new %u",
"initiate_write_filepage",
ep->d_ino, dap->da_newinum);
}
if (dap->da_state & DIRCHG)
ep->d_ino = dap->da_previous->dm_oldinum;
else
ep->d_ino = 0;
dap->da_state &= ~ATTACHED;
dap->da_state |= UNDONE;
}
}
}
/*
* Called from within the procedure above to deal with unsatisfied
* allocation dependencies in an inodeblock. The buffer must be
* locked, thus, no I/O completion operations can occur while we
* are manipulating its associated dependencies.
*/
/* The inode block */
STATIC void
initiate_write_inodeblock_ufs1(struct inodedep *inodedep, struct buf *bp)
{
struct allocdirect *adp, *lastadp;
struct ufs1_dinode *dp;
struct fs *fs;
#ifdef DIAGNOSTIC
daddr_t prevlbn = 0;
int32_t d1, d2;
#endif
int i, deplist;
if (inodedep->id_state & IOSTARTED) {
FREE_LOCK(&lk);
panic("initiate_write_inodeblock: already started");
}
inodedep->id_state |= IOSTARTED;
fs = inodedep->id_fs;
dp = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
*/
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
if (inodedep->id_savedino1 != NULL) {
FREE_LOCK(&lk);
panic("initiate_write_inodeblock: already doing I/O");
}
FREE_LOCK(&lk);
inodedep->id_savedino1 = malloc(sizeof(struct ufs1_dinode),
M_INODEDEP, M_WAITOK);
inodedep->id_unsize = sizeof(struct ufs1_dinode);
ACQUIRE_LOCK(&lk);
*inodedep->id_savedino1 = *dp;
memset(dp, 0, sizeof(struct ufs1_dinode));
return;
}
/*
* If no dependencies, then there is nothing to roll back.
*/
inodedep->id_savedsize = dp->di_size;
if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
return;
/*
* Set the dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lbn order");
}
prevlbn = adp->ad_lbn;
if (adp->ad_lbn < NDADDR &&
(d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
FREE_LOCK(&lk);
panic("%s: direct pointer #%lld mismatch %d != %d",
"softdep_write_inodeblock", (long long)adp->ad_lbn,
d1, d2);
}
if (adp->ad_lbn >= NDADDR &&
(d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
(d2 = adp->ad_newblkno)) {
FREE_LOCK(&lk);
panic("%s: indirect pointer #%lld mismatch %d != %d",
"softdep_write_inodeblock", (long long)(adp->ad_lbn -
NDADDR), d1, d2);
}
deplist |= 1 << adp->ad_lbn;
if ((adp->ad_state & ATTACHED) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
}
#endif /* DIAGNOSTIC */
adp->ad_state &= ~ATTACHED;
adp->ad_state |= UNDONE;
}
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
* might have fragments that were not the last block in the file
* which would corrupt the filesystem.
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
if (adp->ad_lbn >= NDADDR)
break;
dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lost dep1");
}
#endif /* DIAGNOSTIC */
dp->di_db[i] = 0;
}
for (i = 0; i < NIADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_ib[i] != 0 &&
(deplist & ((1 << NDADDR) << i)) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lost dep2");
}
#endif /* DIAGNOSTIC */
dp->di_ib[i] = 0;
}
return;
}
/*
* If we have zero'ed out the last allocated block of the file,
* roll back the size to the last currently allocated block.
* We know that this last allocated block is a full-sized block as
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
for (i = lastadp->ad_lbn; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
}
/*
* The only dependencies are for indirect blocks.
*
* The file size for indirect block additions is not guaranteed.
* Such a guarantee would be non-trivial to achieve. The conventional
* synchronous write implementation also does not make this guarantee.
* Fsck should catch and fix discrepancies. Arguably, the file size
* can be over-estimated without destroying integrity when the file
* moves into the indirect blocks (i.e., is large). If we want to
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
dp->di_ib[adp->ad_lbn - NDADDR] = 0;
}
#ifdef FFS2
/*
* Version of initiate_write_inodeblock that handles FFS2 dinodes.
*/
/* The inode block */
STATIC void
initiate_write_inodeblock_ufs2(struct inodedep *inodedep, struct buf *bp)
{
struct allocdirect *adp, *lastadp;
struct ufs2_dinode *dp;
struct fs *fs = inodedep->id_fs;
#ifdef DIAGNOSTIC
daddr_t prevlbn = -1, d1, d2;
#endif
int deplist, i;
if (inodedep->id_state & IOSTARTED)
panic("initiate_write_inodeblock_ufs2: already started");
inodedep->id_state |= IOSTARTED;
fs = inodedep->id_fs;
dp = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
*/
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
if (inodedep->id_savedino2 != NULL)
panic("initiate_write_inodeblock_ufs2: I/O underway");
inodedep->id_savedino2 = malloc(sizeof(struct ufs2_dinode),
M_INODEDEP, M_WAITOK);
inodedep->id_unsize = sizeof(struct ufs2_dinode);
*inodedep->id_savedino2 = *dp;
memset(dp, 0, sizeof(struct ufs2_dinode));
return;
}
/*
* If no dependencies, then there is nothing to roll back.
*/
inodedep->id_savedsize = dp->di_size;
if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
return;
#ifdef notyet
inodedep->id_savedextsize = dp->di_extsize;
if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
return;
/*
* Set the ext data dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lbn order");
}
prevlbn = adp->ad_lbn;
if ((d1 = dp->di_extb[adp->ad_lbn]) !=
(d2 = adp->ad_newblkno)) {
FREE_LOCK(&lk);
panic("%s: direct pointer #%lld mismatch %lld != %lld",
"softdep_write_inodeblock", (long long)adp->ad_lbn,
d1, d2);
}
deplist |= 1 << adp->ad_lbn;
if ((adp->ad_state & ATTACHED) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
}
#endif /* DIAGNOSTIC */
adp->ad_state &= ~ATTACHED;
adp->ad_state |= UNDONE;
}
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
* might have fragments that were not the last block in the ext
* data which would corrupt the filesystem.
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lost dep1");
}
#endif /* DIAGNOSTIC */
dp->di_extb[i] = 0;
}
lastadp = NULL;
break;
}
/*
* If we have zero'ed out the last allocated block of the ext
* data, roll back the size to the last currently allocated block.
* We know that this last allocated block is a full-sized block as
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
for (i = lastadp->ad_lbn; i >= 0; i--)
if (dp->di_extb[i] != 0)
break;
dp->di_extsize = (i + 1) * fs->fs_bsize;
}
#endif /* notyet */
/*
* Set the file data dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lbn order");
}
prevlbn = adp->ad_lbn;
if (adp->ad_lbn < NDADDR &&
(d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
FREE_LOCK(&lk);
panic("%s: direct pointer #%lld mismatch %lld != %lld",
"softdep_write_inodeblock", (long long)adp->ad_lbn,
d1, d2);
}
if (adp->ad_lbn >= NDADDR &&
(d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
(d2 = adp->ad_newblkno)) {
FREE_LOCK(&lk);
panic("%s: indirect pointer #%lld mismatch %lld != %lld",
"softdep_write_inodeblock", (long long)(adp->ad_lbn -
NDADDR), d1, d2);
}
deplist |= 1 << adp->ad_lbn;
if ((adp->ad_state & ATTACHED) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
}
#endif /* DIAGNOSTIC */
adp->ad_state &= ~ATTACHED;
adp->ad_state |= UNDONE;
}
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
* might have fragments that were not the last block in the file
* which would corrupt the filesystem.
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
if (adp->ad_lbn >= NDADDR)
break;
dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lost dep2");
}
#endif /* DIAGNOSTIC */
dp->di_db[i] = 0;
}
for (i = 0; i < NIADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_ib[i] != 0 &&
(deplist & ((1 << NDADDR) << i)) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lost dep3");
}
#endif /* DIAGNOSTIC */
dp->di_ib[i] = 0;
}
return;
}
/*
* If we have zero'ed out the last allocated block of the file,
* roll back the size to the last currently allocated block.
* We know that this last allocated block is a full-sized block as
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
for (i = lastadp->ad_lbn; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
}
/*
* The only dependencies are for indirect blocks.
*
* The file size for indirect block additions is not guaranteed.
* Such a guarantee would be non-trivial to achieve. The conventional
* synchronous write implementation also does not make this guarantee.
* Fsck should catch and fix discrepancies. Arguably, the file size
* can be over-estimated without destroying integrity when the file
* moves into the indirect blocks (i.e., is large). If we want to
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
dp->di_ib[adp->ad_lbn - NDADDR] = 0;
}
#endif /* FFS2 */
/*
* This routine is called during the completion interrupt
* service routine for a disk write (from the procedure called
* by the device driver to inform the file system caches of
* a request completion). It should be called early in this
* procedure, before the block is made available to other
* processes or other routines are called.
*/
/* describes the completed disk write */
void
softdep_disk_write_complete(struct buf *bp)
{
struct worklist *wk;
struct workhead reattach;
struct newblk *newblk;
struct allocindir *aip;
struct allocdirect *adp;
struct indirdep *indirdep;
struct inodedep *inodedep;
struct bmsafemap *bmsafemap;
/*
* If an error occurred while doing the write, then the data
* has not hit the disk and the dependencies cannot be unrolled.
*/
if ((bp->b_flags & B_ERROR) && !(bp->b_flags & B_INVAL))
return;
#ifdef DEBUG
if (lk.lkt_held != -1)
panic("softdep_disk_write_complete: lock is held");
lk.lkt_held = -2;
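/*
 * -2 marks the interlock as pseudo-held from interrupt context; the
 * matching check at the end of this routine restores it to -1.
 */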
#endif
LIST_INIT(&reattach);
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
WORKLIST_REMOVE(wk);
switch (wk->wk_type) {
case D_PAGEDEP:
if (handle_written_filepage(WK_PAGEDEP(wk), bp))
WORKLIST_INSERT(&reattach, wk);
continue;
case D_INODEDEP:
if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
WORKLIST_INSERT(&reattach, wk);
continue;
case D_BMSAFEMAP:
bmsafemap = WK_BMSAFEMAP(wk);
while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
newblk->nb_state |= DEPCOMPLETE;
newblk->nb_bmsafemap = NULL;
LIST_REMOVE(newblk, nb_deps);
}
while ((adp =
LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
adp->ad_state |= DEPCOMPLETE;
adp->ad_buf = NULL;
LIST_REMOVE(adp, ad_deps);
handle_allocdirect_partdone(adp);
}
while ((aip =
LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
aip->ai_state |= DEPCOMPLETE;
aip->ai_buf = NULL;
LIST_REMOVE(aip, ai_deps);
handle_allocindir_partdone(aip);
}
while ((inodedep =
LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
inodedep->id_state |= DEPCOMPLETE;
LIST_REMOVE(inodedep, id_deps);
inodedep->id_buf = NULL;
}
WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
continue;
case D_MKDIR:
handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
continue;
case D_ALLOCDIRECT:
adp = WK_ALLOCDIRECT(wk);
adp->ad_state |= COMPLETE;
handle_allocdirect_partdone(adp);
continue;
case D_ALLOCINDIR:
aip = WK_ALLOCINDIR(wk);
aip->ai_state |= COMPLETE;
handle_allocindir_partdone(aip);
continue;
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
if (indirdep->ir_state & GOINGAWAY)
panic("disk_write_complete: indirdep gone");
memcpy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
free(indirdep->ir_saveddata, M_INDIRDEP, bp->b_bcount);
indirdep->ir_saveddata = NULL;
indirdep->ir_state &= ~UNDONE;
indirdep->ir_state |= ATTACHED;
while ((aip = LIST_FIRST(&indirdep->ir_donehd))) {
handle_allocindir_partdone(aip);
if (aip == LIST_FIRST(&indirdep->ir_donehd))
panic("disk_write_complete: not gone");
}
WORKLIST_INSERT(&reattach, wk);
if ((bp->b_flags & B_DELWRI) == 0)
stat_indir_blk_ptrs++;
buf_dirty(bp);
continue;
default:
panic("handle_disk_write_complete: Unknown type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
/*
* Reattach any requests that must be redone.
*/
while ((wk = LIST_FIRST(&reattach)) != NULL) {
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&bp->b_dep, wk);
}
#ifdef DEBUG
if (lk.lkt_held != -2)
panic("softdep_disk_write_complete: lock lost");
lk.lkt_held = -1;
#endif
}
/*
* Called from within softdep_disk_write_complete above. Note that
* this routine is always called from interrupt level with further
* splbio interrupts blocked.
*/
/* the completed allocdirect */
STATIC void
handle_allocdirect_partdone(struct allocdirect *adp)
{
struct allocdirect *listadp;
struct inodedep *inodedep;
long bsize, delay;
splassert(IPL_BIO);
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
if (adp->ad_buf != NULL)
panic("handle_allocdirect_partdone: dangling dep");
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
* might have fragments that were not the last block in the file
* which would corrupt the filesystem. Thus, we cannot free any
* allocdirects after one whose ad_oldblkno claims a fragment as
* these blocks must be rolled back to zero before writing the inode.
* We check the currently active set of allocdirects in id_inoupdt.
*/
inodedep = adp->ad_inodedep;
bsize = inodedep->id_fs->fs_bsize;
TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
/* found our block */
if (listadp == adp)
break;
/* continue if ad_oldlbn is not a fragment */
if (listadp->ad_oldsize == 0 ||
listadp->ad_oldsize == bsize)
continue;
/* hit a fragment */
return;
}
/*
* If we have reached the end of the current list without
* finding the just finished dependency, then it must be
* on the future dependency list. Future dependencies cannot
* be freed until they are moved to the current list.
*/
if (listadp == NULL) {
#ifdef DEBUG
TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
/* found our block */
if (listadp == adp)
break;
if (listadp == NULL)
panic("handle_allocdirect_partdone: lost dep");
#endif /* DEBUG */
return;
}
/*
* If we have found the just finished dependency, then free
* it along with anything that follows it that is complete.
* If the inode still has a bitmap dependency, then it has
* never been written to disk, hence the on-disk inode cannot
* reference the old fragment so we can free it without delay.
*/
delay = (inodedep->id_state & DEPCOMPLETE);
for (; adp; adp = listadp) {
listadp = TAILQ_NEXT(adp, ad_next);
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
free_allocdirect(&inodedep->id_inoupdt, adp, delay);
}
}
/*
* Called from within softdep_disk_write_complete above. Note that
* this routine is always called from interrupt level with further
* splbio interrupts blocked.
*/
/* the completed allocindir */
STATIC void
handle_allocindir_partdone(struct allocindir *aip)
{
struct indirdep *indirdep;
splassert(IPL_BIO);
if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
if (aip->ai_buf != NULL)
panic("handle_allocindir_partdone: dangling dependency");
indirdep = aip->ai_indirdep;
if (indirdep->ir_state & UNDONE) {
LIST_REMOVE(aip, ai_next);
LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
return;
}
if (indirdep->ir_state & UFS1FMT)
((int32_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
aip->ai_newblkno;
else
((int64_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
aip->ai_newblkno;
LIST_REMOVE(aip, ai_next);
if (aip->ai_freefrag != NULL)
add_to_worklist(&aip->ai_freefrag->ff_list);
WORKITEM_FREE(aip, D_ALLOCINDIR);
}
/*
* Called from within softdep_disk_write_complete above to restore
* in-memory inode block contents to their most up-to-date state. Note
* that this routine is always called from interrupt level with further
* splbio interrupts blocked.
*/
/* buffer containing the inode block */
STATIC int
handle_written_inodeblock(struct inodedep *inodedep, struct buf *bp)
{
struct worklist *wk, *filefree;
struct allocdirect *adp, *nextadp;
struct ufs1_dinode *dp1 = NULL;
struct ufs2_dinode *dp2 = NULL;
int hadchanges, fstype;
splassert(IPL_BIO);
if ((inodedep->id_state & IOSTARTED) == 0)
panic("handle_written_inodeblock: not started");
inodedep->id_state &= ~IOSTARTED;
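/*
* Locate this inode's slot within the inode block: ino_to_fsbo()
* yields the inode's index among the dinodes packed into this buffer,
* so the pointer arithmetic below selects the correct on-disk copy.
*/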
if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
fstype = UM_UFS1;
dp1 = (struct ufs1_dinode *) bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
} else {
fstype = UM_UFS2;
dp2 = (struct ufs2_dinode *) bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
}
/*
* If we had to rollback the inode allocation because of
* bitmaps being incomplete, then simply restore it.
* Keep the block dirty so that it will not be reclaimed until
* all associated dependencies have been cleared and the
* corresponding updates written to disk.
*/
if (inodedep->id_savedino1 != NULL) {
if (fstype == UM_UFS1)
*dp1 = *inodedep->id_savedino1;
else
*dp2 = *inodedep->id_savedino2;
free(inodedep->id_savedino1, M_INODEDEP, inodedep->id_unsize);
inodedep->id_savedino1 = NULL;
if ((bp->b_flags & B_DELWRI) == 0)
stat_inode_bitmap++;
buf_dirty(bp);
return (1);
}
inodedep->id_state |= COMPLETE;
/*
* Roll forward anything that had to be rolled back before
* the inode could be updated.
*/
hadchanges = 0;
for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED)
panic("handle_written_inodeblock: new entry");
if (fstype == UM_UFS1) {
if (adp->ad_lbn < NDADDR) {
if (dp1->di_db[adp->ad_lbn] != adp->ad_oldblkno)
panic("%s: %s #%lld mismatch %d != "
"%lld",
"handle_written_inodeblock",
"direct pointer",
(long long)adp->ad_lbn,
dp1->di_db[adp->ad_lbn],
(long long)adp->ad_oldblkno);
dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
} else {
if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
panic("%s: %s #%lld allocated as %d",
"handle_written_inodeblock",
"indirect pointer",
(long long)(adp->ad_lbn - NDADDR),
dp1->di_ib[adp->ad_lbn - NDADDR]);
dp1->di_ib[adp->ad_lbn - NDADDR] =
adp->ad_newblkno;
}
} else {
if (adp->ad_lbn < NDADDR) {
if (dp2->di_db[adp->ad_lbn] != adp->ad_oldblkno)
panic("%s: %s #%lld mismatch %lld != "
"%lld", "handle_written_inodeblock",
"direct pointer",
(long long)adp->ad_lbn,
dp2->di_db[adp->ad_lbn],
(long long)adp->ad_oldblkno);
dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
} else {
if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
panic("%s: %s #%lld allocated as %lld",
"handle_written_inodeblock",
"indirect pointer",
(long long)(adp->ad_lbn - NDADDR),
dp2->di_ib[adp->ad_lbn - NDADDR]);
dp2->di_ib[adp->ad_lbn - NDADDR] =
adp->ad_newblkno;
}
}
adp->ad_state &= ~UNDONE;
adp->ad_state |= ATTACHED;
hadchanges = 1;
}
if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
stat_direct_blk_ptrs++;
/*
* Reset the file size to its most up-to-date value.
*/
if (inodedep->id_savedsize == -1)
panic("handle_written_inodeblock: bad size");
if (fstype == UM_UFS1) {
if (dp1->di_size != inodedep->id_savedsize) {
dp1->di_size = inodedep->id_savedsize;
hadchanges = 1;
}
} else {
if (dp2->di_size != inodedep->id_savedsize) {
dp2->di_size = inodedep->id_savedsize;
hadchanges = 1;
}
}
inodedep->id_savedsize = -1;
/*
* If there were any rollbacks in the inode block, then it must be
* marked dirty so that it will eventually get written back in
* its correct form.
*/
if (hadchanges)
buf_dirty(bp);
/*
* Process any allocdirects that completed during the update.
*/
if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
handle_allocdirect_partdone(adp);
/*
* Process deallocations that were held pending until the
* inode had been written to disk. Freeing of the inode
* is delayed until after all blocks have been freed to
* avoid creation of new <vfsid, inum, lbn> triples
* before the old ones have been deleted.
*/
filefree = NULL;
while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
WORKLIST_REMOVE(wk);
switch (wk->wk_type) {
case D_FREEFILE:
/*
* We defer adding filefree to the worklist until
* all other additions have been made to ensure
* that it will be done after all the old blocks
* have been freed.
*/
if (filefree != NULL)
panic("handle_written_inodeblock: filefree");
filefree = wk;
continue;
case D_MKDIR:
handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
continue;
case D_DIRADD:
diradd_inode_written(WK_DIRADD(wk), inodedep);
continue;
case D_FREEBLKS:
wk->wk_state |= COMPLETE;
if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
continue;
/* FALLTHROUGH */
case D_FREEFRAG:
case D_DIRREM:
add_to_worklist(wk);
continue;
case D_NEWDIRBLK:
free_newdirblk(WK_NEWDIRBLK(wk));
continue;
default:
panic("handle_written_inodeblock: Unknown type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
if (filefree != NULL) {
if (free_inodedep(inodedep) == 0)
panic("handle_written_inodeblock: live inodedep");
add_to_worklist(filefree);
return (0);
}
/*
* If no outstanding dependencies, free it.
*/
if (free_inodedep(inodedep) ||
TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
return (0);
return (hadchanges);
}
/*
* Process a diradd entry after its dependent inode has been written.
* This routine must be called with splbio interrupts blocked.
*/
STATIC void
diradd_inode_written(struct diradd *dap, struct inodedep *inodedep)
{
struct pagedep *pagedep;
splassert(IPL_BIO);
dap->da_state |= COMPLETE;
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
}
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
/*
* Handle the completion of a mkdir dependency.
*/
STATIC void
handle_written_mkdir(struct mkdir *mkdir, int type)
{
struct diradd *dap;
struct pagedep *pagedep;
splassert(IPL_BIO);
if (mkdir->md_state != type)
panic("handle_written_mkdir: bad type");
dap = mkdir->md_diradd;
dap->da_state &= ~type;
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
dap->da_state |= DEPCOMPLETE;
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
}
LIST_REMOVE(mkdir, md_mkdirs);
WORKITEM_FREE(mkdir, D_MKDIR);
}
/*
* Called from within softdep_disk_write_complete above.
* A write operation was just completed. Removed inodes can
* now be freed and associated block pointers may be committed.
* Note that this routine is always called from interrupt level
* with further splbio interrupts blocked.
*/
/* buffer containing the written page */
STATIC int
handle_written_filepage(struct pagedep *pagedep, struct buf *bp)
{
struct dirrem *dirrem;
struct diradd *dap, *nextdap;
struct direct *ep;
int i, chgs;
splassert(IPL_BIO);
if ((pagedep->pd_state & IOSTARTED) == 0)
panic("handle_written_filepage: not started");
pagedep->pd_state &= ~IOSTARTED;
/*
* Process any directory removals that have been committed.
*/
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
LIST_REMOVE(dirrem, dm_next);
dirrem->dm_dirinum = pagedep->pd_ino;
add_to_worklist(&dirrem->dm_list);
}
/*
* Free any directory additions that have been committed.
* If it is a newly allocated block, we have to wait until
* the on-disk directory inode claims the new block.
*/
if ((pagedep->pd_state & NEWBLOCK) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
free_diradd(dap);
/*
* Uncommitted directory entries must be restored.
*/
for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
dap = nextdap) {
nextdap = LIST_NEXT(dap, da_pdlist);
if (dap->da_state & ATTACHED)
panic("handle_written_filepage: attached");
ep = (struct direct *)
((char *)bp->b_data + dap->da_offset);
ep->d_ino = dap->da_newinum;
dap->da_state &= ~UNDONE;
dap->da_state |= ATTACHED;
chgs = 1;
/*
* If the inode referenced by the directory has
* been written out, then the dependency can be
* moved to the pending list.
*/
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
da_pdlist);
}
}
}
/*
* If there were any rollbacks in the directory, then it must be
* marked dirty so that it will eventually get written back in
* its correct form.
*/
if (chgs) {
if ((bp->b_flags & B_DELWRI) == 0)
stat_dir_entry++;
buf_dirty(bp);
return (1);
}
/*
* If we are not waiting for a new directory block to be
* claimed by its inode, then the pagedep will be freed.
* Otherwise it will remain to track any new entries on
* the page in case they are fsync'ed.
*/
if ((pagedep->pd_state & NEWBLOCK) == 0) {
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
return (0);
}
/*
* Writing back in-core inode structures.
*
* The file system only accesses an inode's contents when it occupies an
* "in-core" inode structure. These "in-core" structures are separate from
* the page frames used to cache inode blocks. Only the latter are
* transferred to/from the disk. So, when the updated contents of the
* "in-core" inode structure are copied to the corresponding in-memory inode
* block, the dependencies are also transferred. The following procedure is
* called when copying a dirty "in-core" inode to a cached inode block.
*/
/*
* Called when an inode is loaded from disk. If the effective link count
* differed from the actual link count when it was last flushed, then we
* need to ensure that the correct effective link count is put back.
*/
/* the "in_core" copy of the inode */
void
softdep_load_inodeblock(struct inode *ip)
{
struct inodedep *inodedep;
/*
* Check for alternate nlink count.
*/
ip->i_effnlink = DIP(ip, nlink);
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
return;
}
ip->i_effnlink -= inodedep->id_nlinkdelta;
FREE_LOCK(&lk);
}
/*
* This routine is called just before the "in-core" inode
* information is to be copied to the in-memory inode block.
* Recall that an inode block contains several inodes. If
* the force flag is set, then the dependencies will be
* cleared so that the update can always be made. Note that
* the buffer is locked when this routine is called, so we
* will never be in the middle of writing the inode block
* to disk.
*/
/* the "in_core" copy of the inode */
/* the buffer containing the inode block */
/* nonzero => update must be allowed */
void
softdep_update_inodeblock(struct inode *ip, struct buf *bp, int waitfor)
{
struct inodedep *inodedep;
struct worklist *wk;
int error, gotit;
/*
* If the effective link count is not equal to the actual link
* count, then we must track the difference in an inodedep while
* the inode is (potentially) tossed out of the cache. Otherwise,
* if there is no existing inodedep, then there are no dependencies
* to track.
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
if (ip->i_effnlink != DIP(ip, nlink))
panic("softdep_update_inodeblock: bad link count");
return;
}
if (inodedep->id_nlinkdelta != DIP(ip, nlink) - ip->i_effnlink) {
FREE_LOCK(&lk);
panic("softdep_update_inodeblock: bad delta");
}
/*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
*/
inodedep->id_state &= ~COMPLETE;
if ((inodedep->id_state & ONWORKLIST) == 0)
WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
/*
* Any new dependencies associated with the incore inode must
* now be moved to the list associated with the buffer holding
* the in-memory copy of the inode. Once merged, process any
* allocdirects that are completed by the merger.
*/
merge_inode_lists(inodedep);
if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
/*
* Now that the inode has been pushed into the buffer, the
* operations dependent on the inode being written to disk
* can be moved to the id_bufwait so that they will be
* processed when the buffer I/O completes.
*/
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&inodedep->id_bufwait, wk);
}
/*
* Newly allocated inodes cannot be written until the bitmap
* that allocates them has been written (indicated by
* DEPCOMPLETE being set in id_state). If we are doing a
* forced sync (e.g., an fsync on a file), we force the bitmap
* to be written so that the update can be done.
*/
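/*
* getdirtybuf() returns -1 when it had to sleep for a busy buffer;
* the inodedep state may have changed while we slept, so loop and
* re-check DEPCOMPLETE before deciding whether a write is still needed.
*/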
do {
if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
FREE_LOCK(&lk);
return;
}
bp = inodedep->id_buf;
gotit = getdirtybuf(bp, MNT_WAIT);
} while (gotit == -1);
FREE_LOCK(&lk);
if (gotit && (error = bwrite(bp)) != 0)
softdep_error("softdep_update_inodeblock: bwrite", error);
if ((inodedep->id_state & DEPCOMPLETE) == 0)
panic("softdep_update_inodeblock: update failed");
}
/*
* Merge the new inode dependency list (id_newinoupdt) into the old
* inode dependency list (id_inoupdt). This routine must be called
* with splbio interrupts blocked.
*/
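/*
* The merge relies on both lists being ordered by ad_lbn: each entry
* from id_newinoupdt is spliced in before the first id_inoupdt entry
* with an equal or greater ad_lbn, and two entries for the same block
* are coalesced with allocdirect_merge().
*/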
STATIC void
merge_inode_lists(struct inodedep *inodedep)
{
struct allocdirect *listadp, *newadp;
splassert(IPL_BIO);
newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
if (listadp->ad_lbn < newadp->ad_lbn) {
listadp = TAILQ_NEXT(listadp, ad_next);
continue;
}
TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
if (listadp->ad_lbn == newadp->ad_lbn) {
allocdirect_merge(&inodedep->id_inoupdt, newadp,
listadp);
listadp = newadp;
}
newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
}
TAILQ_CONCAT(&inodedep->id_inoupdt, &inodedep->id_newinoupdt, ad_next);
}
/*
* If we are doing an fsync, then we must ensure that any directory
* entries for the inode have been written after the inode gets to disk.
*/
/* the "in_core" copy of the inode */
int
softdep_fsync(struct vnode *vp)
{
struct inodedep *inodedep;
struct pagedep *pagedep;
struct worklist *wk;
struct diradd *dap;
struct mount *mnt;
struct vnode *pvp;
struct inode *ip;
struct inode *pip;
struct buf *bp;
struct fs *fs;
struct proc *p = CURPROC; /* XXX */
int error, flushparent;
ufsino_t parentino;
daddr_t lbn;
ip = VTOI(vp);
fs = ip->i_fs;
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
return (0);
}
if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
FREE_LOCK(&lk);
panic("softdep_fsync: pending ops");
}
for (error = 0, flushparent = 0; ; ) {
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
break;
if (wk->wk_type != D_DIRADD) {
FREE_LOCK(&lk);
panic("softdep_fsync: Unexpected type %s",
TYPENAME(wk->wk_type));
}
dap = WK_DIRADD(wk);
/*
* Flush our parent if this directory entry has a MKDIR_PARENT
* dependency or is contained in a newly allocated block.
*/
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
mnt = pagedep->pd_mnt;
parentino = pagedep->pd_ino;
lbn = pagedep->pd_lbn;
if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
FREE_LOCK(&lk);
panic("softdep_fsync: dirty");
}
if ((dap->da_state & MKDIR_PARENT) ||
(pagedep->pd_state & NEWBLOCK))
flushparent = 1;
else
flushparent = 0;
/*
* If we are being fsync'ed as part of vgone'ing this vnode,
* then we will not be able to release and recover the
* vnode below, so we just have to give up on writing its
* directory entry out. It will eventually be written, just
* not now, but then the user was not asking to have it
* written, so we are not breaking any promises.
*/
mtx_enter(&vnode_mtx);
if (vp->v_lflag & VXLOCK) {
mtx_leave(&vnode_mtx);
break;
}
mtx_leave(&vnode_mtx);
/*
* We prevent deadlock by always fetching inodes from the
* root, moving down the directory tree. Thus, when fetching
* our parent directory, we must unlock ourselves before
* requesting the lock on our parent. See the comment in
* ufs_lookup for details on possible races.
*/
FREE_LOCK(&lk);
VOP_UNLOCK(vp);
error = VFS_VGET(mnt, parentino, &pvp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (error != 0)
return (error);
/*
* All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
* that are contained in direct blocks will be resolved by
* doing a UFS_UPDATE. Pagedeps contained in indirect blocks
* may require a complete sync'ing of the directory. So, we
* try the cheap and fast UFS_UPDATE first, and if that fails,
* then we do the slower VOP_FSYNC of the directory.
*/
pip = VTOI(pvp);
if (flushparent) {
error = UFS_UPDATE(pip, 1);
if (error) {
vput(pvp);
return (error);
}
if (pagedep->pd_state & NEWBLOCK) {
error = VOP_FSYNC(pvp, p->p_ucred, MNT_WAIT, p);
if (error) {
vput(pvp);
return (error);
}
}
}
/*
* Flush directory page containing the inode's name.
*/
error = bread(pvp, lbn, fs->fs_bsize, &bp);
if (error == 0) {
bp->b_bcount = blksize(fs, pip, lbn);
error = bwrite(bp);
} else
brelse(bp);
vput(pvp);
if (error != 0)
return (error);
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
break;
}
FREE_LOCK(&lk);
return (0);
}
/*
* Flush all the dirty bitmaps associated with the block device
* before flushing the rest of the dirty blocks so as to reduce
* the number of dependencies that will have to be rolled back.
*/
void
softdep_fsync_mountdev(struct vnode *vp, int waitfor)
{
struct buf *bp, *nbp;
struct worklist *wk;
if (!vn_isdisk(vp, NULL))
panic("softdep_fsync_mountdev: vnode not a disk");
ACQUIRE_LOCK(&lk);
LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) {
/*
* If it is already scheduled, skip to the next buffer.
*/
splassert(IPL_BIO);
if (bp->b_flags & B_BUSY)
continue;
if ((bp->b_flags & B_DELWRI) == 0) {
FREE_LOCK(&lk);
panic("softdep_fsync_mountdev: not dirty");
}
/*
* We are only interested in bitmaps with outstanding
* dependencies.
*/
if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
wk->wk_type != D_BMSAFEMAP) {
continue;
}
bremfree(bp);
buf_acquire(bp);
FREE_LOCK(&lk);
(void) bawrite(bp);
ACQUIRE_LOCK(&lk);
/*
* Since we may have slept during the I/O, we need
* to start from a known point.
*/
nbp = LIST_FIRST(&vp->v_dirtyblkhd);
}
if (waitfor == MNT_WAIT)
drain_output(vp, 1);
FREE_LOCK(&lk);
}
/*
* This routine is called when we are trying to synchronously flush a
* file. This routine must eliminate any filesystem metadata dependencies
* so that the syncing routine can succeed by pushing the dirty blocks
* associated with the file. If any I/O errors occur, they are returned.
*/
int
softdep_sync_metadata(struct vop_fsync_args *ap)
{
struct vnode *vp = ap->a_vp;
struct pagedep *pagedep;
struct allocdirect *adp;
struct allocindir *aip;
struct buf *bp, *nbp;
struct worklist *wk;
int i, gotit, error, waitfor;
/*
* Check whether this vnode is involved in a filesystem
* that is doing soft dependency processing.
*/
if (!vn_isdisk(vp, NULL)) {
if (!DOINGSOFTDEP(vp))
return (0);
} else
if (vp->v_specmountpoint == NULL ||
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
return (0);
/*
* Ensure that any direct block dependencies have been cleared.
*/
ACQUIRE_LOCK(&lk);
if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
FREE_LOCK(&lk);
return (error);
}
/*
* For most files, the only metadata dependencies are the
* cylinder group maps that allocate their inode or blocks.
* The block allocation dependencies can be found by traversing
* the dependency lists for any buffers that remain on their
* dirty buffer list. The inode allocation dependency will
* be resolved when the inode is updated with MNT_WAIT.
* This work is done in two passes. The first pass grabs most
* of the buffers and begins asynchronously writing them. The
* only way to wait for these asynchronous writes is to sleep
* on the filesystem vnode which may stay busy for a long time
* if the filesystem is active. So, instead, we make a second
* pass over the dependencies blocking on each write. In the
* usual case we will be blocking against a write that we
* initiated, so when it is done the dependency will have been
* resolved. Thus the second pass is expected to end quickly.
*/
waitfor = MNT_NOWAIT;
top:
/*
* We must wait for any I/O in progress to finish so that
* all potential buffers on the dirty list will be visible.
*/
drain_output(vp, 1);
bp = LIST_FIRST(&vp->v_dirtyblkhd);
gotit = getdirtybuf(bp, MNT_WAIT);
if (gotit == 0) {
FREE_LOCK(&lk);
return (0);
} else if (gotit == -1)
goto top;
loop:
/*
* As we hold the buffer locked, none of its dependencies
* will disappear.
*/
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
switch (wk->wk_type) {
case D_ALLOCDIRECT:
adp = WK_ALLOCDIRECT(wk);
if (adp->ad_state & DEPCOMPLETE)
break;
nbp = adp->ad_buf;
gotit = getdirtybuf(nbp, waitfor);
if (gotit == 0)
break;
else if (gotit == -1)
goto loop;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
} else if ((error = VOP_BWRITE(nbp)) != 0) {
bawrite(bp);
return (error);
}
ACQUIRE_LOCK(&lk);
break;
case D_ALLOCINDIR:
aip = WK_ALLOCINDIR(wk);
if (aip->ai_state & DEPCOMPLETE)
break;
nbp = aip->ai_buf;
gotit = getdirtybuf(nbp, waitfor);
if (gotit == 0)
break;
else if (gotit == -1)
goto loop;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
} else if ((error = VOP_BWRITE(nbp)) != 0) {
bawrite(bp);
return (error);
}
ACQUIRE_LOCK(&lk);
break;
case D_INDIRDEP:
restart:
LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
if (aip->ai_state & DEPCOMPLETE)
continue;
nbp = aip->ai_buf;
if (getdirtybuf(nbp, MNT_WAIT) <= 0)
goto restart;
FREE_LOCK(&lk);
if ((error = VOP_BWRITE(nbp)) != 0) {
bawrite(bp);
return (error);
}
ACQUIRE_LOCK(&lk);
goto restart;
}
break;
case D_INODEDEP:
if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
WK_INODEDEP(wk)->id_ino)) != 0) {
FREE_LOCK(&lk);
bawrite(bp);
return (error);
}
break;
case D_PAGEDEP:
/*
* We are trying to sync a directory that may
* have dependencies on both its own metadata
* and/or dependencies on the inodes of any
* recently allocated files. We walk its diradd
* lists pushing out the associated inode.
*/
pagedep = WK_PAGEDEP(wk);
for (i = 0; i < DAHASHSZ; i++) {
if (LIST_FIRST(&pagedep->pd_diraddhd[i]) ==
NULL)
continue;
if ((error =
flush_pagedep_deps(vp, pagedep->pd_mnt,
&pagedep->pd_diraddhd[i]))) {
FREE_LOCK(&lk);
bawrite(bp);
return (error);
}
}
break;
case D_MKDIR:
/*
* This case should never happen if the vnode has
* been properly sync'ed. However, if this function
* is used at a place where the vnode has not yet
* been sync'ed, this dependency can show up. So,
* rather than panic, just flush it.
*/
nbp = WK_MKDIR(wk)->md_buf;
KASSERT(bp != nbp);
gotit = getdirtybuf(nbp, waitfor);
if (gotit == 0)
break;
else if (gotit == -1)
goto loop;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
} else if ((error = VOP_BWRITE(nbp)) != 0) {
bawrite(bp);
return (error);
}
ACQUIRE_LOCK(&lk);
break;
case D_BMSAFEMAP:
/*
* This case should never happen if the vnode has
* been properly sync'ed. However, if this function
* is used at a place where the vnode has not yet
* been sync'ed, this dependency can show up. So,
* rather than panic, just flush it.
*/
nbp = WK_BMSAFEMAP(wk)->sm_buf;
if (bp == nbp)
break;
gotit = getdirtybuf(nbp, waitfor);
if (gotit == 0)
break;
else if (gotit == -1)
goto loop;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
} else if ((error = VOP_BWRITE(nbp)) != 0) {
bawrite(bp);
return (error);
}
ACQUIRE_LOCK(&lk);
break;
default:
FREE_LOCK(&lk);
panic("softdep_sync_metadata: Unknown type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
do {
nbp = LIST_NEXT(bp, b_vnbufs);
gotit = getdirtybuf(nbp, MNT_WAIT);
} while (gotit == -1);
FREE_LOCK(&lk);
bawrite(bp);
ACQUIRE_LOCK(&lk);
if (nbp != NULL) {
bp = nbp;
goto loop;
}
/*
* The brief unlock is to allow any pent up dependency
* processing to be done. Then proceed with the second pass.
*/
if (waitfor == MNT_NOWAIT) {
waitfor = MNT_WAIT;
FREE_LOCK(&lk);
ACQUIRE_LOCK(&lk);
goto top;
}
/*
* If we have managed to get rid of all the dirty buffers,
* then we are done. For certain directories and block
* devices, we may need to do further work.
*
* We must wait for any I/O in progress to finish so that
* all potential buffers on the dirty list will be visible.
*/
drain_output(vp, 1);
if (LIST_EMPTY(&vp->v_dirtyblkhd)) {
FREE_LOCK(&lk);
return (0);
}
FREE_LOCK(&lk);
/*
* If we are trying to sync a block device, some of its buffers may
* contain metadata that cannot be written until the contents of some
* partially written files have been written to disk. The only easy
* way to accomplish this is to sync the entire filesystem (luckily
* this happens rarely).
*/
if (vn_isdisk(vp, NULL) && vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
(error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, 0, ap->a_cred,
ap->a_p)) != 0)
return (error);
return (0);
}
/*
* Flush the dependencies associated with an inodedep.
* Called with splbio blocked.
*/
STATIC int
flush_inodedep_deps(struct fs *fs, ufsino_t ino)
{
struct inodedep *inodedep;
struct allocdirect *adp;
int gotit, error, waitfor;
struct buf *bp;
splassert(IPL_BIO);
/*
* This work is done in two passes. The first pass grabs most
* of the buffers and begins asynchronously writing them. The
* only way to wait for these asynchronous writes is to sleep
* on the filesystem vnode which may stay busy for a long time
* if the filesystem is active. So, instead, we make a second
* pass over the dependencies blocking on each write. In the
* usual case we will be blocking against a write that we
* initiated, so when it is done the dependency will have been
* resolved. Thus the second pass is expected to end quickly.
* We give a brief window at the top of the loop to allow
* any pending I/O to complete.
*/
for (waitfor = MNT_NOWAIT; ; ) {
retry_ino:
FREE_LOCK(&lk);
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
return (0);
TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
gotit = getdirtybuf(bp, waitfor);
if (gotit == 0) {
if (waitfor == MNT_NOWAIT)
continue;
break;
} else if (gotit == -1)
goto retry_ino;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);
} else if ((error = VOP_BWRITE(bp)) != 0) {
ACQUIRE_LOCK(&lk);
return (error);
}
ACQUIRE_LOCK(&lk);
break;
}
if (adp != NULL)
continue;
retry_newino:
TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
gotit = getdirtybuf(bp, waitfor);
if (gotit == 0) {
if (waitfor == MNT_NOWAIT)
continue;
break;
} else if (gotit == -1)
goto retry_newino;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);
} else if ((error = VOP_BWRITE(bp)) != 0) {
ACQUIRE_LOCK(&lk);
return (error);
}
ACQUIRE_LOCK(&lk);
break;
}
if (adp != NULL)
continue;
/*
* If this was the second pass, we are done; otherwise, make pass 2.
*/
if (waitfor == MNT_WAIT)
break;
waitfor = MNT_WAIT;
}
/*
* Try freeing inodedep in case all dependencies have been removed.
*/
if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
(void) free_inodedep(inodedep);
return (0);
}
/*
* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
* Called with splbio blocked.
*/
STATIC int
flush_pagedep_deps(struct vnode *pvp, struct mount *mp,
struct diraddhd *diraddhdp)
{
struct proc *p = CURPROC; /* XXX */
struct worklist *wk;
struct inodedep *inodedep;
struct ufsmount *ump;
struct diradd *dap;
struct vnode *vp;
int gotit, error = 0;
struct buf *bp;
ufsino_t inum;
splassert(IPL_BIO);
ump = VFSTOUFS(mp);
while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
/*
* Flush ourselves if this directory entry
* has a MKDIR_PARENT dependency.
*/
if (dap->da_state & MKDIR_PARENT) {
FREE_LOCK(&lk);
if ((error = UFS_UPDATE(VTOI(pvp), 1)))
break;
ACQUIRE_LOCK(&lk);
/*
* If that cleared dependencies, go on to next.
*/
if (dap != LIST_FIRST(diraddhdp))
continue;
if (dap->da_state & MKDIR_PARENT) {
FREE_LOCK(&lk);
panic("flush_pagedep_deps: MKDIR_PARENT");
}
}
/*
* A newly allocated directory must have its "." and
* ".." entries written out before its name can be
* committed in its parent. We do not want or need
* the full semantics of a synchronous VOP_FSYNC as
* that may end up here again, once for each directory
* level in the filesystem. Instead, we push the blocks
* and wait for them to clear. We have to fsync twice
* because the first call may choose to defer blocks
* that still have dependencies, but deferral will
* happen at most once.
*/
inum = dap->da_newinum;
if (dap->da_state & MKDIR_BODY) {
FREE_LOCK(&lk);
if ((error = VFS_VGET(mp, inum, &vp)) != 0)
break;
if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
(error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
vput(vp);
break;
}
drain_output(vp, 0);
/*
* If first block is still dirty with a D_MKDIR
* dependency then it needs to be written now.
*/
for (;;) {
error = 0;
ACQUIRE_LOCK(&lk);
bp = incore(vp, 0);
if (bp == NULL) {
FREE_LOCK(&lk);
break;
}
LIST_FOREACH(wk, &bp->b_dep, wk_list)
if (wk->wk_type == D_MKDIR)
break;
if (wk) {
gotit = getdirtybuf(bp, MNT_WAIT);
FREE_LOCK(&lk);
if (gotit == -1)
continue;
if (gotit && (error = bwrite(bp)) != 0)
break;
} else
FREE_LOCK(&lk);
break;
}
vput(vp);
/* Flushing of first block failed */
if (error)
break;
ACQUIRE_LOCK(&lk);
/*
* If that cleared dependencies, go on to next.
*/
if (dap != LIST_FIRST(diraddhdp))
continue;
if (dap->da_state & MKDIR_BODY) {
FREE_LOCK(&lk);
panic("flush_pagedep_deps: MKDIR_BODY");
}
}
/*
* Flush the inode on which the directory entry depends.
* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
* the only remaining dependency is that the updated inode
* count must get pushed to disk. The inode has already
* been pushed into its inode buffer (via VOP_UPDATE) at
* the time of the reference count change. So we need only
* locate that buffer, ensure that there will be no rollback
* caused by a bitmap dependency, then write the inode buffer.
*/
if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
panic("flush_pagedep_deps: lost inode");
}
/*
* If the inode still has bitmap dependencies,
* push them to disk.
*/
retry:
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
bp = inodedep->id_buf;
gotit = getdirtybuf(bp, MNT_WAIT);
if (gotit == -1)
goto retry;
FREE_LOCK(&lk);
if (gotit && (error = bwrite(bp)) != 0)
break;
ACQUIRE_LOCK(&lk);
if (dap != LIST_FIRST(diraddhdp))
continue;
}
/*
* If the inode is still sitting in a buffer waiting
* to be written, push it to disk.
*/
FREE_LOCK(&lk);
if ((error = bread(ump->um_devvp,
fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
(int)ump->um_fs->fs_bsize, &bp)) != 0) {
brelse(bp);
break;
}
if ((error = bwrite(bp)) != 0)
break;
ACQUIRE_LOCK(&lk);
/*
* If we have failed to get rid of all the dependencies
* then something is seriously wrong.
*/
if (dap == LIST_FIRST(diraddhdp)) {
FREE_LOCK(&lk);
panic("flush_pagedep_deps: flush failed");
}
}
if (error)
ACQUIRE_LOCK(&lk);
return (error);
}
/*
* A large burst of file addition or deletion activity can drive the
* memory load excessively high. First attempt to slow things down
* using the techniques below. If that fails, this routine requests
* the offending operations to fall back to running synchronously
* until the memory load returns to a reasonable level.
*/
int
softdep_slowdown(struct vnode *vp)
{
int max_softdeps_hard;
max_softdeps_hard = max_softdeps * 11 / 10;
if (num_dirrem < max_softdeps_hard / 2 &&
num_inodedep < max_softdeps_hard)
return (0);
stat_sync_limit_hit += 1;
return (1);
}
/*
* If memory utilization has gotten too high, deliberately slow things
* down and speed up the I/O processing.
*/
STATIC int
request_cleanup(int resource, int islocked)
{
struct proc *p = CURPROC;
int s;
/*
* We never hold up the filesystem syncer process.
*/
if (p == filesys_syncer || (p->p_flag & P_SOFTDEP))
return (0);
/*
* First check to see if the work list has gotten backlogged.
* If it has, co-opt this process to help clean up two entries.
* Because this process may hold inodes locked, we cannot
* handle any remove requests that might block on a locked
* inode as that could lead to deadlock. We set P_SOFTDEP
* to avoid recursively processing the worklist.
*/
if (num_on_worklist > max_softdeps / 10) {
atomic_setbits_int(&p->p_flag, P_SOFTDEP);
if (islocked)
FREE_LOCK(&lk);
process_worklist_item(NULL, NULL, LK_NOWAIT);
process_worklist_item(NULL, NULL, LK_NOWAIT);
atomic_clearbits_int(&p->p_flag, P_SOFTDEP);
stat_worklist_push += 2;
if (islocked)
ACQUIRE_LOCK(&lk);
return(1);
}
/*
* Next, we attempt to speed up the syncer process. If that
* is successful, then we allow the process to continue.
*/
if (speedup_syncer())
return(0);
/*
* If we are resource constrained on inode dependencies, try
* flushing some dirty inodes. Otherwise, we are constrained
* by file deletions, so try accelerating flushes of directories
* with removal dependencies. We would like to do the cleanup
* here, but we probably hold an inode locked at this point and
* that might deadlock against one that we try to clean. So,
* the best that we can do is request the syncer daemon to do
* the cleanup for us.
*/
switch (resource) {
case FLUSH_INODES:
stat_ino_limit_push += 1;
req_clear_inodedeps += 1;
stat_countp = &stat_ino_limit_hit;
break;
case FLUSH_REMOVE:
stat_blk_limit_push += 1;
req_clear_remove += 1;
stat_countp = &stat_blk_limit_hit;
break;
default:
if (islocked)
FREE_LOCK(&lk);
panic("request_cleanup: unknown type");
}
/*
* Hopefully the syncer daemon will catch up and awaken us.
* We wait at most tickdelay before proceeding in any case.
*/
if (islocked == 0)
ACQUIRE_LOCK(&lk);
proc_waiting += 1;
if (!timeout_pending(&proc_waiting_timeout))
timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
s = FREE_LOCK_INTERLOCKED(&lk);
tsleep_nsec(&proc_waiting, PPAUSE, "softupdate", INFSLP);
ACQUIRE_LOCK_INTERLOCKED(&lk, s);
proc_waiting -= 1;
if (islocked == 0)
FREE_LOCK(&lk);
return (1);
}
/*
* Awaken processes pausing in request_cleanup and clear proc_waiting
* to indicate that there is no longer a timer running.
*/
void
pause_timer(void *arg)
{
*stat_countp += 1;
wakeup_one(&proc_waiting);
if (proc_waiting > 0)
timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
}
/*
* Flush out a directory with at least one removal dependency in an effort to
* reduce the number of dirrem, freefile, and freeblks dependency structures.
*/
STATIC void
clear_remove(struct proc *p)
{
struct pagedep_hashhead *pagedephd;
struct pagedep *pagedep;
static int next = 0;
struct mount *mp;
struct vnode *vp;
int error, cnt;
ufsino_t ino;
ACQUIRE_LOCK(&lk);
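/*
* "next" is static so successive calls resume scanning where the
* previous one left off, spreading the flushing work across all of
* the pagedep hash buckets instead of always draining the first one.
*/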
for (cnt = 0; cnt <= pagedep_hash; cnt++) {
pagedephd = &pagedep_hashtbl[next++];
if (next > pagedep_hash)
next = 0;
LIST_FOREACH(pagedep, pagedephd, pd_hash) {
if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
continue;
mp = pagedep->pd_mnt;
ino = pagedep->pd_ino;
#if 0
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
continue;
#endif
FREE_LOCK(&lk);
if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
softdep_error("clear_remove: vget", error);
#if 0
vn_finished_write(mp);
#endif
return;
}
if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
softdep_error("clear_remove: fsync", error);
drain_output(vp, 0);
vput(vp);
#if 0
vn_finished_write(mp);
#endif
return;
}
}
FREE_LOCK(&lk);
}
/*
* Clear out a block of dirty inodes in an effort to reduce
* the number of inodedep dependency structures.
*/
STATIC void
clear_inodedeps(struct proc *p)
{
struct inodedep_hashhead *inodedephd;
struct inodedep *inodedep = NULL;
static int next = 0;
struct mount *mp;
struct vnode *vp;
struct fs *fs;
int error, cnt;
ufsino_t firstino, lastino, ino;
ACQUIRE_LOCK(&lk);
/*
* Pick a random inode dependency to be cleared.
* We will then gather up all the inodes in its block
* that have dependencies and flush them out.
*/
for (cnt = 0; cnt <= inodedep_hash; cnt++) {
inodedephd = &inodedep_hashtbl[next++];
if (next > inodedep_hash)
next = 0;
if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
break;
}
if (inodedep == NULL) {
FREE_LOCK(&lk);
return;
}
/*
* Ugly code to find mount point given pointer to superblock.
*/
fs = inodedep->id_fs;
TAILQ_FOREACH(mp, &mountlist, mnt_list)
if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
break;
/*
* Find the last inode in the block with dependencies.
*/
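/*
* INOPB(fs) is a power of two, so masking with ~(INOPB(fs) - 1)
* rounds id_ino down to the first inode number of its inode block.
*/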
firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
break;
/*
* Asynchronously push all but the last inode with dependencies.
* Synchronously push the last inode with dependencies to ensure
* that the inode block gets written to free up the inodedeps.
*/
for (ino = firstino; ino <= lastino; ino++) {
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
continue;
FREE_LOCK(&lk);
#if 0
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
continue;
#endif
if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
softdep_error("clear_inodedeps: vget", error);
#if 0
vn_finished_write(mp);
#endif
return;
}
if (ino == lastino) {
if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
softdep_error("clear_inodedeps: fsync1", error);
} else {
if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
softdep_error("clear_inodedeps: fsync2", error);
drain_output(vp, 0);
}
vput(vp);
#if 0
vn_finished_write(mp);
#endif
ACQUIRE_LOCK(&lk);
}
FREE_LOCK(&lk);
}
/*
* Function to determine if the buffer has outstanding dependencies
* that will cause a roll-back if the buffer is written. If wantcount
* is set, return number of dependencies, otherwise just yes or no.
*/
int
softdep_count_dependencies(struct buf *bp, int wantcount, int islocked)
{
struct worklist *wk;
struct inodedep *inodedep;
struct indirdep *indirdep;
struct allocindir *aip;
struct pagedep *pagedep;
struct diradd *dap;
int i, retval;
retval = 0;
if (!islocked)
ACQUIRE_LOCK(&lk);
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
switch (wk->wk_type) {
case D_INODEDEP:
inodedep = WK_INODEDEP(wk);
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
/* bitmap allocation dependency */
retval += 1;
if (!wantcount)
goto out;
}
if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
/* direct block pointer dependency */
retval += 1;
if (!wantcount)
goto out;
}
continue;
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
/* indirect block pointer dependency */
retval += 1;
if (!wantcount)
goto out;
}
continue;
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
/* directory entry dependency */
retval += 1;
if (!wantcount)
goto out;
}
}
continue;
case D_BMSAFEMAP:
case D_ALLOCDIRECT:
case D_ALLOCINDIR:
case D_MKDIR:
/* never a dependency on these blocks */
continue;
default:
if (!islocked)
FREE_LOCK(&lk);
panic("softdep_check_for_rollback: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
out:
if (!islocked)
FREE_LOCK(&lk);
return retval;
}
/*
* Acquire exclusive access to a buffer.
* Must be called with splbio blocked.
* Returns:
* 1 if the buffer was acquired and is dirty;
* 0 if the buffer was clean, or we would have slept but had MNT_NOWAIT;
* -1 if we slept and may try again (but not with this bp).
*/
STATIC int
getdirtybuf(struct buf *bp, int waitfor)
{
int s;
if (bp == NULL)
return (0);
splassert(IPL_BIO);
if (bp->b_flags & B_BUSY) {
if (waitfor != MNT_WAIT)
return (0);
bp->b_flags |= B_WANTED;
s = FREE_LOCK_INTERLOCKED(&lk);
tsleep_nsec(bp, PRIBIO+1, "sdsdty", INFSLP);
ACQUIRE_LOCK_INTERLOCKED(&lk, s);
return (-1);
}
if ((bp->b_flags & B_DELWRI) == 0)
return (0);
bremfree(bp);
buf_acquire(bp);
return (1);
}
/*
* Wait for pending output on a vnode to complete.
* Must be called with vnode locked.
*/
STATIC void
drain_output(struct vnode *vp, int islocked)
{
int s;
if (!islocked)
ACQUIRE_LOCK(&lk);
splassert(IPL_BIO);
while (vp->v_numoutput) {
vp->v_bioflag |= VBIOWAIT;
s = FREE_LOCK_INTERLOCKED(&lk);
tsleep_nsec(&vp->v_numoutput, PRIBIO+1, "drain_output", INFSLP);
ACQUIRE_LOCK_INTERLOCKED(&lk, s);
}
if (!islocked)
FREE_LOCK(&lk);
}
/*
* Called whenever a buffer that is being invalidated or reallocated
* contains dependencies. This should only happen if an I/O error has
* occurred. The routine is called with the buffer locked.
*/
void
softdep_deallocate_dependencies(struct buf *bp)
{
if ((bp->b_flags & B_ERROR) == 0)
panic("softdep_deallocate_dependencies: dangling deps");
softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
panic("softdep_deallocate_dependencies: unrecovered I/O error");
}
/*
* Function to handle asynchronous write errors in the filesystem.
*/
void
softdep_error(char *func, int error)
{
/* XXX should do something better! */
printf("%s: got error %d while accessing filesystem\n", func, error);
}
#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_output.h>
void
softdep_print(struct buf *bp, int full,
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
struct worklist *wk;
(*pr)(" deps:\n");
LIST_FOREACH(wk, &bp->b_dep, wk_list)
worklist_print(wk, full, pr);
}
void
worklist_print(struct worklist *wk, int full,
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
struct pagedep *pagedep;
struct inodedep *inodedep;
struct newblk *newblk;
struct bmsafemap *bmsafemap;
struct allocdirect *adp;
struct indirdep *indirdep;
struct allocindir *aip;
struct freefrag *freefrag;
struct freeblks *freeblks;
struct freefile *freefile;
struct diradd *dap;
struct mkdir *mkdir;
struct dirrem *dirrem;
struct newdirblk *newdirblk;
char prefix[33];
int i;
for (prefix[i = 2 * MIN(16, full)] = '\0'; i--; prefix[i] = ' ')
;
(*pr)("%s%s(%p) state %b\n%s", prefix, TYPENAME(wk->wk_type), wk,
wk->wk_state, DEP_BITS, prefix);
switch (wk->wk_type) {
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
(*pr)("mount %p ino %u lbn %lld\n", pagedep->pd_mnt,
pagedep->pd_ino, (long long)pagedep->pd_lbn);
break;
case D_INODEDEP:
inodedep = WK_INODEDEP(wk);
(*pr)("fs %p ino %u nlinkdelta %u dino %p\n"
"%s bp %p savsz %lld\n", inodedep->id_fs,
inodedep->id_ino, inodedep->id_nlinkdelta,
inodedep->id_un.idu_savedino1,
prefix, inodedep->id_buf, inodedep->id_savedsize);
break;
case D_NEWBLK:
newblk = WK_NEWBLK(wk);
(*pr)("fs %p newblk %lld state %d bmsafemap %p\n",
newblk->nb_fs, (long long)newblk->nb_newblkno,
newblk->nb_state, newblk->nb_bmsafemap);
break;
case D_BMSAFEMAP:
bmsafemap = WK_BMSAFEMAP(wk);
(*pr)("buf %p\n", bmsafemap->sm_buf);
break;
case D_ALLOCDIRECT:
adp = WK_ALLOCDIRECT(wk);
(*pr)("lbn %lld newlbk %lld oldblk %lld newsize %ld olsize "
"%ld\n%s bp %p inodedep %p freefrag %p\n",
(long long)adp->ad_lbn, (long long)adp->ad_newblkno,
(long long)adp->ad_oldblkno, adp->ad_newsize,
adp->ad_oldsize,
prefix, adp->ad_buf, adp->ad_inodedep, adp->ad_freefrag);
break;
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
(*pr)("savedata %p savebp %p\n", indirdep->ir_saveddata,
indirdep->ir_savebp);
break;
case D_ALLOCINDIR:
aip = WK_ALLOCINDIR(wk);
(*pr)("off %d newblk %lld oldblk %lld freefrag %p\n"
"%s indirdep %p buf %p\n", aip->ai_offset,
(long long)aip->ai_newblkno, (long long)aip->ai_oldblkno,
aip->ai_freefrag, prefix, aip->ai_indirdep, aip->ai_buf);
break;
case D_FREEFRAG:
freefrag = WK_FREEFRAG(wk);
(*pr)("vnode %p mp %p blkno %lld fsize %ld ino %u\n",
freefrag->ff_devvp, freefrag->ff_mnt,
(long long)freefrag->ff_blkno, freefrag->ff_fragsize,
freefrag->ff_inum);
break;
case D_FREEBLKS:
freeblks = WK_FREEBLKS(wk);
(*pr)("previno %u devvp %p mp %p oldsz %lld newsz %lld\n"
"%s chkcnt %d uid %d\n", freeblks->fb_previousinum,
freeblks->fb_devvp, freeblks->fb_mnt, freeblks->fb_oldsize,
freeblks->fb_newsize,
prefix, freeblks->fb_chkcnt, freeblks->fb_uid);
break;
case D_FREEFILE:
freefile = WK_FREEFILE(wk);
(*pr)("mode %x oldino %u vnode %p mp %p\n", freefile->fx_mode,
freefile->fx_oldinum, freefile->fx_devvp, freefile->fx_mnt);
break;
case D_DIRADD:
dap = WK_DIRADD(wk);
(*pr)("off %d ino %u da_un %p\n", dap->da_offset,
dap->da_newinum, dap->da_un.dau_previous);
break;
case D_MKDIR:
mkdir = WK_MKDIR(wk);
(*pr)("diradd %p bp %p\n", mkdir->md_diradd, mkdir->md_buf);
break;
case D_DIRREM:
dirrem = WK_DIRREM(wk);
(*pr)("mp %p ino %u dm_un %p\n", dirrem->dm_mnt,
dirrem->dm_oldinum, dirrem->dm_un.dmu_pagedep);
break;
case D_NEWDIRBLK:
newdirblk = WK_NEWDIRBLK(wk);
(*pr)("pagedep %p\n", newdirblk->db_pagedep);
break;
}
}
#endif
/* $OpenBSD: kern_malloc.c,v 1.148 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_malloc.c,v 1.15.4.2 1996/06/13 17:10:56 cgd Exp $ */
/*
* Copyright (c) 1987, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94
*/
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/stdint.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/tracepoint.h>
#include <uvm/uvm_extern.h>
#if defined(DDB)
#include <machine/db_machdep.h>
#include <ddb/db_output.h>
#endif
static
#ifndef SMALL_KERNEL
__inline__
#endif
long BUCKETINDX(size_t sz)
{
long b, d;
/* note that this relies upon MINALLOCSIZE being 1 << MINBUCKET */
b = 7 + MINBUCKET; d = 4;
while (d != 0) {
if (sz <= (1 << b))
b -= d;
else
b += d;
d >>= 1;
}
if (sz <= (1 << b))
b += 0;
else
b += 1;
return b;
}
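/*
* Example (assuming the usual MINBUCKET of 4, i.e. a 16-byte minimum
* allocation): for sz == 100 the binary search above converges on
* index 7, the 128-byte bucket, which is the smallest power of two
* large enough to hold the request.
*/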
static struct vm_map kmem_map_store;
struct vm_map *kmem_map = NULL;
/*
* Default number of pages in kmem_map. We attempt to calculate this
* at run-time, but allow it to be either patched or set in the kernel
* config file.
*/
#ifndef NKMEMPAGES
#define NKMEMPAGES 0
#endif
u_int nkmempages = NKMEMPAGES;
/*
* Defaults for lower- and upper-bounds for the kmem_map page count.
* Can be overridden by kernel config options.
*/
#ifndef NKMEMPAGES_MIN
#define NKMEMPAGES_MIN 0
#endif
u_int nkmempages_min = 0;
#ifndef NKMEMPAGES_MAX
#define NKMEMPAGES_MAX NKMEMPAGES_MAX_DEFAULT
#endif
u_int nkmempages_max = 0;
struct mutex malloc_mtx = MUTEX_INITIALIZER(IPL_VM);
struct kmembuckets bucket[MINBUCKET + 16];
#ifdef KMEMSTATS
struct kmemstats kmemstats[M_LAST];
#endif
struct kmemusage *kmemusage;
char *kmembase, *kmemlimit;
char buckstring[16 * sizeof("123456,")];
int buckstring_init = 0;
#if defined(KMEMSTATS) || defined(DIAGNOSTIC)
char *memname[] = INITKMEMNAMES;
char *memall = NULL;
struct rwlock sysctl_kmemlock = RWLOCK_INITIALIZER("sysctlklk");
#endif
/*
* Normally the freelist structure is used only to hold the list pointer
* for free objects. However, when running with diagnostics, the first
* 8 bytes of the structure is unused except for diagnostic information,
* and the free list pointer is at offset 8 in the structure. Since the
* first 8 bytes is the portion of the structure most often modified, this
* helps to detect memory reuse problems and avoid free list corruption.
*/
struct kmem_freelist {
int32_t kf_spare0;
int16_t kf_type;
int16_t kf_spare1;
XSIMPLEQ_ENTRY(kmem_freelist) kf_flist;
};
#ifdef DIAGNOSTIC
/*
* This structure provides a set of masks to catch unaligned frees.
*/
const long addrmask[] = { 0,
0x00000001, 0x00000003, 0x00000007, 0x0000000f,
0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
};
#endif /* DIAGNOSTIC */
#ifndef SMALL_KERNEL
struct timeval malloc_errintvl = { 5, 0 };
struct timeval malloc_lasterr;
#endif
/*
* Allocate a block of memory
*/
void *
malloc(size_t size, int type, int flags)
{
struct kmembuckets *kbp;
struct kmemusage *kup;
struct kmem_freelist *freep;
long indx, npg, allocsize;
caddr_t va, cp;
int s;
#ifdef DIAGNOSTIC
int freshalloc;
char *savedtype;
#endif
#ifdef KMEMSTATS
struct kmemstats *ksp = &kmemstats[type];
int wake;
if (((unsigned long)type) <= 1 || ((unsigned long)type) >= M_LAST)
panic("malloc: bogus type %d", type);
#endif
KASSERT(flags & (M_WAITOK | M_NOWAIT));
#ifdef DIAGNOSTIC
if ((flags & M_NOWAIT) == 0) {
extern int pool_debug;
assertwaitok();
if (pool_debug == 2)
yield();
}
#endif
if (size > 65535 * PAGE_SIZE) {
if (flags & M_CANFAIL) {
#ifndef SMALL_KERNEL
if (ratecheck(&malloc_lasterr, &malloc_errintvl))
printf("malloc(): allocation too large, "
"type = %d, size = %lu\n", type, size);
#endif
return (NULL);
} else
panic("malloc: allocation too large, "
"type = %d, size = %lu", type, size);
}
indx = BUCKETINDX(size);
if (size > MAXALLOCSAVE)
allocsize = round_page(size);
else
allocsize = 1 << indx;
kbp = &bucket[indx];
mtx_enter(&malloc_mtx);
#ifdef KMEMSTATS
while (ksp->ks_memuse >= ksp->ks_limit) {
if (flags & M_NOWAIT) {
mtx_leave(&malloc_mtx);
return (NULL);
}
#ifdef DIAGNOSTIC
if (ISSET(flags, M_WAITOK) && curproc == &proc0)
panic("%s: cannot sleep for memory during boot",
__func__);
#endif
if (ksp->ks_limblocks < 65535)
ksp->ks_limblocks++;
msleep_nsec(ksp, &malloc_mtx, PSWP+2, memname[type], INFSLP);
}
ksp->ks_memuse += allocsize; /* account for this early */
ksp->ks_size |= 1 << indx;
#endif
if (XSIMPLEQ_FIRST(&kbp->kb_freelist) == NULL) {
mtx_leave(&malloc_mtx);
npg = atop(round_page(allocsize));
s = splvm();
va = (caddr_t)uvm_km_kmemalloc_pla(kmem_map, NULL,
(vsize_t)ptoa(npg), 0,
((flags & M_NOWAIT) ? UVM_KMF_NOWAIT : 0) |
((flags & M_CANFAIL) ? UVM_KMF_CANFAIL : 0),
no_constraint.ucr_low, no_constraint.ucr_high,
0, 0, 0);
splx(s);
if (va == NULL) {
/*
* Kmem_malloc() can return NULL, even if it can
* wait, if there is no map space available, because
* it can't fix that problem. Neither can we,
* right now. (We should release pages which
* are completely free and which are in buckets
* with too many free elements.)
*/
if ((flags & (M_NOWAIT|M_CANFAIL)) == 0)
panic("malloc: out of space in kmem_map");
#ifdef KMEMSTATS
mtx_enter(&malloc_mtx);
ksp->ks_memuse -= allocsize;
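/*
* Back out the early accounting; wake sleepers only if that dropped
* ks_memuse from at or above ks_limit to below it, since only then
* can a waiter in the limit loop above make progress.
*/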
wake = ksp->ks_memuse + allocsize >= ksp->ks_limit &&
ksp->ks_memuse < ksp->ks_limit;
mtx_leave(&malloc_mtx);
if (wake)
wakeup(ksp);
#endif
return (NULL);
}
mtx_enter(&malloc_mtx);
#ifdef KMEMSTATS
kbp->kb_total += kbp->kb_elmpercl;
#endif
kup = btokup(va);
kup->ku_indx = indx;
#ifdef DIAGNOSTIC
freshalloc = 1;
#endif
if (allocsize > MAXALLOCSAVE) {
kup->ku_pagecnt = npg;
goto out;
}
#ifdef KMEMSTATS
kup->ku_freecnt = kbp->kb_elmpercl;
kbp->kb_totalfree += kbp->kb_elmpercl;
#endif
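/*
* Carve the fresh pages into allocsize-sized pieces and thread them
* onto the bucket freelist, working backwards from the end of the
* region so the lowest-addressed piece ends up at the head.
*/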
cp = va + (npg * PAGE_SIZE) - allocsize;
for (;;) {
freep = (struct kmem_freelist *)cp;
#ifdef DIAGNOSTIC
/*
* Copy in known text to detect modification
* after freeing.
*/
poison_mem(cp, allocsize);
freep->kf_type = M_FREE;
#endif /* DIAGNOSTIC */
XSIMPLEQ_INSERT_HEAD(&kbp->kb_freelist, freep,
kf_flist);
if (cp <= va)
break;
cp -= allocsize;
}
} else {
#ifdef DIAGNOSTIC
freshalloc = 0;
#endif
}
freep = XSIMPLEQ_FIRST(&kbp->kb_freelist);
XSIMPLEQ_REMOVE_HEAD(&kbp->kb_freelist, kf_flist);
va = (caddr_t)freep;
#ifdef DIAGNOSTIC
savedtype = (unsigned)freep->kf_type < M_LAST ?
memname[freep->kf_type] : "???";
if (freshalloc == 0 && XSIMPLEQ_FIRST(&kbp->kb_freelist)) {
int rv;
vaddr_t addr = (vaddr_t)XSIMPLEQ_FIRST(&kbp->kb_freelist);
vm_map_lock(kmem_map);
rv = uvm_map_checkprot(kmem_map, addr,
addr + sizeof(struct kmem_freelist), PROT_WRITE);
vm_map_unlock(kmem_map);
if (!rv) {
printf("%s %zd of object %p size 0x%lx %s %s"
" (invalid addr %p)\n",
"Data modified on freelist: word",
(int32_t *)&addr - (int32_t *)kbp, va, size,
"previous type", savedtype, (void *)addr);
}
}
/* Fill the fields that we've used with poison */
poison_mem(freep, sizeof(*freep));
/* and check that the data hasn't been modified. */
if (freshalloc == 0) {
size_t pidx;
uint32_t pval;
if (poison_check(va, allocsize, &pidx, &pval)) {
panic("%s %zd of object %p size 0x%lx %s %s"
" (0x%x != 0x%x)\n",
"Data modified on freelist: word",
pidx, va, size, "previous type",
savedtype, ((int32_t*)va)[pidx], pval);
}
}
freep->kf_spare0 = 0;
#endif /* DIAGNOSTIC */
#ifdef KMEMSTATS
kup = btokup(va);
if (kup->ku_indx != indx)
panic("malloc: wrong bucket");
if (kup->ku_freecnt == 0)
panic("malloc: lost data"); kup->ku_freecnt--;
kbp->kb_totalfree--;
out:
kbp->kb_calls++;
ksp->ks_inuse++;
ksp->ks_calls++;
if (ksp->ks_memuse > ksp->ks_maxused)
ksp->ks_maxused = ksp->ks_memuse;
#else
out:
#endif
mtx_leave(&malloc_mtx);
if ((flags & M_ZERO) && va != NULL)
memset(va, 0, size);
TRACEPOINT(uvm, malloc, type, va, size, flags);
return (va);
}
/*
* Free a block of memory allocated by malloc.
*/
void
free(void *addr, int type, size_t freedsize)
{
struct kmembuckets *kbp;
struct kmemusage *kup;
struct kmem_freelist *freep;
long size;
int s;
#ifdef DIAGNOSTIC
long alloc;
#endif
#ifdef KMEMSTATS
struct kmemstats *ksp = &kmemstats[type];
int wake;
#endif
if (addr == NULL)
return;
#ifdef DIAGNOSTIC
if (addr < (void *)kmembase || addr >= (void *)kmemlimit)
panic("free: non-malloced addr %p type %s", addr,
memname[type]);
#endif
TRACEPOINT(uvm, free, type, addr, freedsize);
mtx_enter(&malloc_mtx);
kup = btokup(addr);
size = 1 << kup->ku_indx;
kbp = &bucket[kup->ku_indx];
if (size > MAXALLOCSAVE)
size = kup->ku_pagecnt << PAGE_SHIFT;
#ifdef DIAGNOSTIC
#if 0
if (freedsize == 0) {
static int zerowarnings;
if (zerowarnings < 5) {
zerowarnings++;
printf("free with zero size: (%d)\n", type);
#ifdef DDB
db_stack_dump();
#endif
}
}
#endif
if (freedsize != 0 && freedsize > size)
panic("free: size too large %zu > %ld (%p) type %s",
freedsize, size, addr, memname[type]);
if (freedsize != 0 && size > MINALLOCSIZE && freedsize <= size / 2)
panic("free: size too small %zu <= %ld / 2 (%p) type %s",
freedsize, size, addr, memname[type]);
/*
* Check for returns of data that do not point to the
* beginning of the allocation.
*/
if (size > PAGE_SIZE)
alloc = addrmask[BUCKETINDX(PAGE_SIZE)];
else
alloc = addrmask[kup->ku_indx];
if (((u_long)addr & alloc) != 0)
panic("free: unaligned addr %p, size %ld, type %s, mask %ld",
addr, size, memname[type], alloc);
#endif /* DIAGNOSTIC */
if (size > MAXALLOCSAVE) {
u_short pagecnt = kup->ku_pagecnt;
kup->ku_indx = 0;
kup->ku_pagecnt = 0;
mtx_leave(&malloc_mtx);
s = splvm();
uvm_km_free(kmem_map, (vaddr_t)addr, ptoa(pagecnt));
splx(s);
#ifdef KMEMSTATS
mtx_enter(&malloc_mtx);
ksp->ks_memuse -= size;
wake = ksp->ks_memuse + size >= ksp->ks_limit &&
ksp->ks_memuse < ksp->ks_limit;
ksp->ks_inuse--;
kbp->kb_total -= 1;
mtx_leave(&malloc_mtx);
if (wake)
wakeup(ksp);
#endif
return;
}
freep = (struct kmem_freelist *)addr;
#ifdef DIAGNOSTIC
/*
* Check for multiple frees. Use a quick check to see if
* it looks free before laboriously searching the freelist.
*/
if (freep->kf_spare0 == poison_value(freep)) {
struct kmem_freelist *fp;
XSIMPLEQ_FOREACH(fp, &kbp->kb_freelist, kf_flist) {
if (addr != fp)
continue;
printf("multiply freed item %p\n", addr);
panic("free: duplicated free");
}
}
/*
* Copy in known text to detect modification after freeing
* and to make it look free. Also, save the type being freed
* so we can list likely culprit if modification is detected
* when the object is reallocated.
*/
poison_mem(addr, size);
freep->kf_spare0 = poison_value(freep);
freep->kf_type = type;
#endif /* DIAGNOSTIC */
#ifdef KMEMSTATS
kup->ku_freecnt++;
if (kup->ku_freecnt >= kbp->kb_elmpercl) {
if (kup->ku_freecnt > kbp->kb_elmpercl)
panic("free: multiple frees"); else if (kbp->kb_totalfree > kbp->kb_highwat) kbp->kb_couldfree++;
}
kbp->kb_totalfree++;
ksp->ks_memuse -= size;
wake = ksp->ks_memuse + size >= ksp->ks_limit &&
ksp->ks_memuse < ksp->ks_limit;
ksp->ks_inuse--;
#endif
XSIMPLEQ_INSERT_TAIL(&kbp->kb_freelist, freep, kf_flist);
mtx_leave(&malloc_mtx);
#ifdef KMEMSTATS
if (wake)
wakeup(ksp);
#endif
}
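/*
 * Illustrative sketch (not part of this file): the typical pairing of
 * malloc(9) with the sized free(9) above.  Passing the original allocation
 * size to free() lets the DIAGNOSTIC checks validate it against the bucket
 * size.  M_TEMP and the length 128 are arbitrary choices for the example.
 */
#if 0
void
example_malloc_free(void)
{
	char *p;

	p = malloc(128, M_TEMP, M_WAITOK | M_ZERO);
	/* use p here */
	free(p, M_TEMP, 128);
}
#endif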
/*
* Compute the number of pages that kmem_map will map, that is,
* the size of the kernel malloc arena.
*/
void
kmeminit_nkmempages(void)
{
u_int npages;
if (nkmempages != 0) {
/*
* It's already been set (by us being here before, or
* by patching or kernel config options), bail out now.
*/
return;
}
/*
* We can't initialize these variables at compilation time, since
* the page size may not be known (on sparc GENERIC kernels, for
* example). But we still want the MD code to be able to provide
* better values.
*/
if (nkmempages_min == 0)
nkmempages_min = NKMEMPAGES_MIN;
if (nkmempages_max == 0)
nkmempages_max = NKMEMPAGES_MAX;
/*
* We use the following (simple) formula:
*
* - Starting point is physical memory / 4.
*
* - Clamp it down to nkmempages_max.
*
* - Round it up to nkmempages_min.
*/
npages = physmem / 4;
if (npages > nkmempages_max)
npages = nkmempages_max;
if (npages < nkmempages_min)
npages = nkmempages_min;
nkmempages = npages;
}
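/*
 * Worked example of the formula above (the numbers are hypothetical):
 * on a machine with 4 GB of RAM and 4 KB pages, physmem is 1048576 pages,
 * so the starting point is 1048576 / 4 = 262144 pages.  That value is then
 * clamped down to nkmempages_max and rounded up to nkmempages_min, whichever
 * bound it crosses, before being stored in nkmempages; kmem_map will then
 * span nkmempages << PAGE_SHIFT bytes of kernel virtual address space.
 */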
/*
* Initialize the kernel memory allocator
*/
void
kmeminit(void)
{
vaddr_t base, limit;
long indx;
#ifdef DIAGNOSTIC
if (sizeof(struct kmem_freelist) > (1 << MINBUCKET))
panic("kmeminit: minbucket too small/struct freelist too big");
#endif
/*
* Compute the number of kmem_map pages, if we have not
* done so already.
*/
kmeminit_nkmempages();
base = vm_map_min(kernel_map);
kmem_map = uvm_km_suballoc(kernel_map, &base, &limit,
(vsize_t)nkmempages << PAGE_SHIFT,
#ifdef KVA_GUARDPAGES
VM_MAP_INTRSAFE | VM_MAP_GUARDPAGES,
#else
VM_MAP_INTRSAFE,
#endif
FALSE, &kmem_map_store);
kmembase = (char *)base;
kmemlimit = (char *)limit;
kmemusage = km_alloc(round_page(nkmempages * sizeof(struct kmemusage)),
&kv_any, &kp_zero, &kd_waitok);
for (indx = 0; indx < MINBUCKET + 16; indx++) {
XSIMPLEQ_INIT(&bucket[indx].kb_freelist);
}
#ifdef KMEMSTATS
for (indx = 0; indx < MINBUCKET + 16; indx++) {
if (1 << indx >= PAGE_SIZE)
bucket[indx].kb_elmpercl = 1;
else
bucket[indx].kb_elmpercl = PAGE_SIZE / (1 << indx);
bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl;
}
for (indx = 0; indx < M_LAST; indx++)
kmemstats[indx].ks_limit = nkmempages * PAGE_SIZE * 6 / 10;
#endif
}
/*
* Return kernel malloc statistics information.
*/
int
sysctl_malloc(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
struct kmembuckets kb;
#ifdef KMEMSTATS
struct kmemstats km;
#endif
#if defined(KMEMSTATS) || defined(DIAGNOSTIC)
int error;
#endif
int i, siz;
if (namelen != 2 && name[0] != KERN_MALLOC_BUCKETS &&
name[0] != KERN_MALLOC_KMEMNAMES)
return (ENOTDIR); /* overloaded */
switch (name[0]) {
case KERN_MALLOC_BUCKETS:
/* Initialize the first time */
if (buckstring_init == 0) {
buckstring_init = 1;
memset(buckstring, 0, sizeof(buckstring));
for (siz = 0, i = MINBUCKET; i < MINBUCKET + 16; i++) {
snprintf(buckstring + siz,
sizeof buckstring - siz,
"%d,", (u_int)(1<<i));
siz += strlen(buckstring + siz);
}
/* Remove trailing comma */
if (siz)
buckstring[siz - 1] = '\0';
}
return (sysctl_rdstring(oldp, oldlenp, newp, buckstring));
case KERN_MALLOC_BUCKET:
mtx_enter(&malloc_mtx);
memcpy(&kb, &bucket[BUCKETINDX(name[1])], sizeof(kb));
mtx_leave(&malloc_mtx);
memset(&kb.kb_freelist, 0, sizeof(kb.kb_freelist));
return (sysctl_rdstruct(oldp, oldlenp, newp, &kb, sizeof(kb)));
case KERN_MALLOC_KMEMSTATS:
#ifdef KMEMSTATS
if ((name[1] < 0) || (name[1] >= M_LAST))
return (EINVAL);
mtx_enter(&malloc_mtx);
memcpy(&km, &kmemstats[name[1]], sizeof(km));
mtx_leave(&malloc_mtx);
return (sysctl_rdstruct(oldp, oldlenp, newp, &km, sizeof(km)));
#else
return (EOPNOTSUPP);
#endif
case KERN_MALLOC_KMEMNAMES:
#if defined(KMEMSTATS) || defined(DIAGNOSTIC)
error = rw_enter(&sysctl_kmemlock, RW_WRITE|RW_INTR);
if (error)
return (error);
if (memall == NULL) {
int totlen;
/* Figure out how large a buffer we need */
for (totlen = 0, i = 0; i < M_LAST; i++) {
if (memname[i])
totlen += strlen(memname[i]);
totlen++;
}
memall = malloc(totlen + M_LAST, M_SYSCTL,
M_WAITOK|M_ZERO);
for (siz = 0, i = 0; i < M_LAST; i++) {
snprintf(memall + siz,
totlen + M_LAST - siz,
"%s,", memname[i] ? memname[i] : "");
siz += strlen(memall + siz);
}
/* Remove trailing comma */
if (siz)
memall[siz - 1] = '\0';
/* Now, convert all spaces to underscores */
for (i = 0; i < totlen; i++)
if (memall[i] == ' ')
memall[i] = '_';
}
rw_exit_write(&sysctl_kmemlock);
return (sysctl_rdstring(oldp, oldlenp, newp, memall));
#else
return (EOPNOTSUPP);
#endif
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
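/*
 * Illustrative sketch (userland, not kernel code): one way the sysctl node
 * handled above can be consumed.  It reads the comma-separated bucket size
 * list exported as kern.malloc.buckets.  The mib layout
 * {CTL_KERN, KERN_MALLOC, KERN_MALLOC_BUCKETS} follows <sys/sysctl.h>;
 * the buffer size and minimal error handling are assumptions of the sketch.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
print_malloc_buckets(void)
{
	int mib[3] = { CTL_KERN, KERN_MALLOC, KERN_MALLOC_BUCKETS };
	char buf[256];
	size_t len = sizeof(buf);

	if (sysctl(mib, 3, buf, &len, NULL, 0) == -1)
		return (-1);
	printf("malloc buckets: %s\n", buf);
	return (0);
}
#endif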
#if defined(DDB)
void
malloc_printit(
int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
#ifdef KMEMSTATS
struct kmemstats *km;
int i;
(*pr)("%15s %5s %6s %7s %6s %9s %8s\n",
"Type", "InUse", "MemUse", "HighUse", "Limit", "Requests",
"Type Lim");
for (i = 0, km = kmemstats; i < M_LAST; i++, km++) {
if (!km->ks_calls || !memname[i])
continue;
(*pr)("%15s %5ld %6ldK %7ldK %6ldK %9ld %8d\n",
memname[i], km->ks_inuse, km->ks_memuse / 1024,
km->ks_maxused / 1024, km->ks_limit / 1024,
km->ks_calls, km->ks_limblocks);
}
#else
(*pr)("No KMEMSTATS compiled in\n");
#endif
}
#endif /* DDB */
/*
* Copyright (c) 2008 Otto Moerbeek <otto@drijf.net>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* This is sqrt(SIZE_MAX+1), as s1*s2 <= SIZE_MAX
* if both s1 < MUL_NO_OVERFLOW and s2 < MUL_NO_OVERFLOW
*/
#define MUL_NO_OVERFLOW (1UL << (sizeof(size_t) * 4))
void *
mallocarray(size_t nmemb, size_t size, int type, int flags)
{
if ((nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) &&
nmemb > 0 && SIZE_MAX / nmemb < size) {
if (flags & M_CANFAIL)
return (NULL);
panic("mallocarray: overflow %zu * %zu", nmemb, size);
}
return (malloc(size * nmemb, type, flags));
}
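/*
 * Illustrative sketch (not part of the allocator): how a hypothetical caller
 * might use mallocarray() with M_CANFAIL so that an overflowing or failing
 * allocation is reported as NULL instead of panicking.  The function name
 * and the use of M_TEMP are assumptions made for the example.
 */
#if 0
int *
example_alloc_table(size_t count)
{
	int *table;

	table = mallocarray(count, sizeof(*table), M_TEMP,
	    M_WAITOK | M_CANFAIL | M_ZERO);
	if (table == NULL)
		return (NULL);	/* overflow or allocation limit reached */
	return (table);
}
#endif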
/* $OpenBSD: if_ether.c,v 1.251 2022/07/16 15:25:30 bluhm Exp $ */
/* $NetBSD: if_ether.c,v 1.31 1996/05/11 12:59:58 mycroft Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ether.c 8.1 (Berkeley) 6/10/93
*/
/*
* Ethernet address resolution protocol.
* TODO:
* add "inuse/lock" bit (or ref. count) along with valid bit
*/
#include "carp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/pool.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip_var.h>
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
/*
* Locks used to protect struct members in this file:
* a atomic operations
* I immutable after creation
* K kernel lock
* m arp mutex, needed when net lock is shared
* N net lock
*/
struct llinfo_arp {
LIST_ENTRY(llinfo_arp) la_list; /* [mN] global arp_list */
struct rtentry *la_rt; /* [I] backpointer to rtentry */
struct mbuf_queue la_mq; /* packet hold queue */
time_t la_refreshed; /* when was refresh sent */
int la_asked; /* number of queries sent */
};
#define LA_HOLD_QUEUE 10
#define LA_HOLD_TOTAL 100
/* timer values */
int arpt_prune = (5 * 60); /* [I] walk list every 5 minutes */
int arpt_keep = (20 * 60); /* [a] once resolved, cache for 20 minutes */
int arpt_down = 20; /* [a] once declared down, don't send for 20 secs */
struct mbuf *arppullup(struct mbuf *m);
void arpinvalidate(struct rtentry *);
void arptfree(struct rtentry *);
void arptimer(void *);
struct rtentry *arplookup(struct in_addr *, int, int, unsigned int);
void in_arpinput(struct ifnet *, struct mbuf *);
void in_revarpinput(struct ifnet *, struct mbuf *);
int arpcache(struct ifnet *, struct ether_arp *, struct rtentry *);
void arpreply(struct ifnet *, struct mbuf *, struct in_addr *, uint8_t *,
unsigned int);
struct niqueue arpinq = NIQUEUE_INITIALIZER(50, NETISR_ARP);
/* llinfo_arp live time, rt_llinfo and RTF_LLINFO are protected by arp_mtx */
struct mutex arp_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
LIST_HEAD(, llinfo_arp) arp_list; /* [mN] list of all llinfo_arp structures */
struct pool arp_pool; /* [I] pool for llinfo_arp structures */
int arp_maxtries = 5; /* [I] arp requests before set to rejected */
int la_hold_total; /* [a] packets currently in the arp queue */
#ifdef NFSCLIENT
/* revarp state */
struct in_addr revarp_myip, revarp_srvip;
int revarp_finished;
unsigned int revarp_ifidx;
#endif /* NFSCLIENT */
/*
* Timeout routine. Age arp_tab entries periodically.
*/
/* ARGSUSED */
void
arptimer(void *arg)
{
struct timeout *to = arg;
struct llinfo_arp *la, *nla;
time_t uptime;
NET_LOCK();
uptime = getuptime();
timeout_add_sec(to, arpt_prune);
/* Net lock is exclusive, no arp mutex needed for arp_list here. */
LIST_FOREACH_SAFE(la, &arp_list, la_list, nla) {
struct rtentry *rt = la->la_rt;
if (rt->rt_expire && rt->rt_expire < uptime)
arptfree(rt); /* timer has expired; clear */
}
NET_UNLOCK();
}
void
arpinit(void)
{
static struct timeout arptimer_to;
pool_init(&arp_pool, sizeof(struct llinfo_arp), 0,
IPL_SOFTNET, 0, "arp", NULL);
timeout_set_proc(&arptimer_to, arptimer, &arptimer_to);
timeout_add_sec(&arptimer_to, arpt_prune);
}
void
arp_rtrequest(struct ifnet *ifp, int req, struct rtentry *rt)
{
struct sockaddr *gate = rt->rt_gateway;
struct llinfo_arp *la;
time_t uptime;
NET_ASSERT_LOCKED();
if (ISSET(rt->rt_flags,
RTF_GATEWAY|RTF_BROADCAST|RTF_MULTICAST|RTF_MPLS))
return;
uptime = getuptime();
switch (req) {
case RTM_ADD:
if (rt->rt_flags & RTF_CLONING) {
rt->rt_expire = 0;
break;
}
if ((rt->rt_flags & RTF_LOCAL) && rt->rt_llinfo == NULL)
rt->rt_expire = 0;
/*
* Announce a new entry if requested or warn the user
* if another station has this IP address.
*/
if (rt->rt_flags & (RTF_ANNOUNCE|RTF_LOCAL))
arprequest(ifp,
&satosin(rt_key(rt))->sin_addr.s_addr,
&satosin(rt_key(rt))->sin_addr.s_addr,
(u_char *)LLADDR(satosdl(gate)));
/*FALLTHROUGH*/
case RTM_RESOLVE:
if (gate->sa_family != AF_LINK ||
gate->sa_len < sizeof(struct sockaddr_dl)) {
log(LOG_DEBUG, "%s: bad gateway value: %s\n", __func__,
ifp->if_xname);
break;
}
satosdl(gate)->sdl_type = ifp->if_type;
satosdl(gate)->sdl_index = ifp->if_index;
/*
* Case 2: This route may come from cloning, or a manual route
* add with a LL address.
*/
la = pool_get(&arp_pool, PR_NOWAIT | PR_ZERO);
if (la == NULL) {
log(LOG_DEBUG, "%s: pool get failed\n", __func__);
break;
}
mtx_enter(&arp_mtx);
if (rt->rt_llinfo != NULL) {
/* we lost the race, another thread has entered it */
mtx_leave(&arp_mtx);
pool_put(&arp_pool, la);
break;
}
mq_init(&la->la_mq, LA_HOLD_QUEUE, IPL_SOFTNET);
rt->rt_llinfo = (caddr_t)la;
la->la_rt = rt;
rt->rt_flags |= RTF_LLINFO;
LIST_INSERT_HEAD(&arp_list, la, la_list);
if ((rt->rt_flags & RTF_LOCAL) == 0)
rt->rt_expire = uptime;
mtx_leave(&arp_mtx);
break;
case RTM_DELETE:
mtx_enter(&arp_mtx);
la = (struct llinfo_arp *)rt->rt_llinfo;
if (la == NULL) {
/* we lost the race, another thread has removed it */
mtx_leave(&arp_mtx);
break;
}
LIST_REMOVE(la, la_list);
rt->rt_llinfo = NULL;
rt->rt_flags &= ~RTF_LLINFO;
atomic_sub_int(&la_hold_total, mq_purge(&la->la_mq));
mtx_leave(&arp_mtx);
pool_put(&arp_pool, la);
break;
case RTM_INVALIDATE:
if (!ISSET(rt->rt_flags, RTF_LOCAL))
arpinvalidate(rt);
break;
}
}
/*
* Broadcast an ARP request. Caller specifies:
* - arp header source ip address
* - arp header target ip address
* - arp header source ethernet address
*/
void
arprequest(struct ifnet *ifp, u_int32_t *sip, u_int32_t *tip, u_int8_t *enaddr)
{
struct mbuf *m;
struct ether_header *eh;
struct ether_arp *ea;
struct sockaddr sa;
if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
return;
m->m_len = sizeof(*ea);
m->m_pkthdr.len = sizeof(*ea);
m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
m->m_pkthdr.pf.prio = ifp->if_llprio;
m_align(m, sizeof(*ea));
ea = mtod(m, struct ether_arp *);
eh = (struct ether_header *)sa.sa_data;
memset(ea, 0, sizeof(*ea));
memcpy(eh->ether_dhost, etherbroadcastaddr, sizeof(eh->ether_dhost));
eh->ether_type = htons(ETHERTYPE_ARP); /* if_output will not swap */
ea->arp_hrd = htons(ARPHRD_ETHER);
ea->arp_pro = htons(ETHERTYPE_IP);
ea->arp_hln = sizeof(ea->arp_sha); /* hardware address length */
ea->arp_pln = sizeof(ea->arp_spa); /* protocol address length */
ea->arp_op = htons(ARPOP_REQUEST);
memcpy(eh->ether_shost, enaddr, sizeof(eh->ether_shost));
memcpy(ea->arp_sha, enaddr, sizeof(ea->arp_sha));
memcpy(ea->arp_spa, sip, sizeof(ea->arp_spa));
memcpy(ea->arp_tpa, tip, sizeof(ea->arp_tpa));
sa.sa_family = pseudo_AF_HDRCMPLT;
sa.sa_len = sizeof(sa);
m->m_flags |= M_BCAST;
ifp->if_output(ifp, m, &sa, NULL);
}
void
arpreply(struct ifnet *ifp, struct mbuf *m, struct in_addr *sip, uint8_t *eaddr,
unsigned int rdomain)
{
struct ether_header *eh;
struct ether_arp *ea;
struct sockaddr sa;
m_resethdr(m);
m->m_pkthdr.ph_rtableid = rdomain;
ea = mtod(m, struct ether_arp *);
ea->arp_op = htons(ARPOP_REPLY);
ea->arp_pro = htons(ETHERTYPE_IP); /* let's be sure! */
/* We're replying to a request. */
memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha));
memcpy(ea->arp_tpa, ea->arp_spa, sizeof(ea->arp_spa));
memcpy(ea->arp_sha, eaddr, sizeof(ea->arp_sha));
memcpy(ea->arp_spa, sip, sizeof(ea->arp_spa));
eh = (struct ether_header *)sa.sa_data;
memcpy(eh->ether_dhost, ea->arp_tha, sizeof(eh->ether_dhost));
memcpy(eh->ether_shost, eaddr, sizeof(eh->ether_shost));
eh->ether_type = htons(ETHERTYPE_ARP);
sa.sa_family = pseudo_AF_HDRCMPLT;
sa.sa_len = sizeof(sa);
ifp->if_output(ifp, m, &sa, NULL);
}
/*
* Resolve an IP address into an ethernet address. On success,
* desten is filled in. If there is no entry in arptab,
* set one up and broadcast a request for the IP address.
* Hold onto this mbuf and resend it once the address
* is finally resolved. A return value of 0 indicates
* that desten has been filled in and the packet should be sent
* normally; A return value of EAGAIN indicates that the packet
* has been taken over here, either now or for later transmission.
* Any other return value indicates an error.
*/
int
arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
struct sockaddr *dst, u_char *desten)
{
struct arpcom *ac = (struct arpcom *)ifp;
struct llinfo_arp *la;
struct sockaddr_dl *sdl;
struct rtentry *rt = NULL;
char addr[INET_ADDRSTRLEN];
time_t uptime;
if (m->m_flags & M_BCAST) { /* broadcast */
memcpy(desten, etherbroadcastaddr, sizeof(etherbroadcastaddr));
return (0);
}
if (m->m_flags & M_MCAST) { /* multicast */
ETHER_MAP_IP_MULTICAST(&satosin(dst)->sin_addr, desten);
return (0);
}
uptime = getuptime();
rt = rt_getll(rt0);
if (ISSET(rt->rt_flags, RTF_REJECT) &&
(rt->rt_expire == 0 || rt->rt_expire > uptime)) {
m_freem(m);
return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH);
}
if (!ISSET(rt->rt_flags, RTF_LLINFO)) {
log(LOG_DEBUG, "%s: %s: route contains no arp information\n",
__func__, inet_ntop(AF_INET, &satosin(rt_key(rt))->sin_addr,
addr, sizeof(addr)));
goto bad;
}
sdl = satosdl(rt->rt_gateway);
if (sdl->sdl_alen > 0 && sdl->sdl_alen != ETHER_ADDR_LEN) {
log(LOG_DEBUG, "%s: %s: incorrect arp information\n", __func__,
inet_ntop(AF_INET, &satosin(dst)->sin_addr,
addr, sizeof(addr)));
goto bad;
}
/*
* Check that the address family and length are valid and that the
* address is resolved; otherwise, try to resolve it.
*/
if ((rt->rt_expire == 0 || rt->rt_expire > uptime) &&
sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) {
int refresh = 0;
memcpy(desten, LLADDR(sdl), sdl->sdl_alen);
/* refresh ARP entry when timeout gets close */
if (rt->rt_expire != 0 &&
rt->rt_expire - arpt_keep / 8 < uptime) {
mtx_enter(&arp_mtx);
if (ISSET(rt->rt_flags, RTF_LLINFO)) {
la = (struct llinfo_arp *)rt->rt_llinfo;
KASSERT(la != NULL);
if (la->la_refreshed + 30 < uptime) {
la->la_refreshed = uptime;
refresh = 1;
}
}
mtx_leave(&arp_mtx);
}
if (refresh) {
arprequest(ifp,
&satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr,
&satosin(dst)->sin_addr.s_addr,
ac->ac_enaddr);
}
return (0);
}
if (ifp->if_flags & (IFF_NOARP|IFF_STATICARP))
goto bad;
KERNEL_LOCK();
/*
* Re-check since we grab the kernel lock after the first check.
* rtrequest_delete() can be called with shared netlock. From
* there arp_rtrequest() is reached which touches RTF_LLINFO
* and rt_llinfo. As this is called with kernel lock we grab the
* kernel lock here and are safe. XXXSMP
*/
if (!ISSET(rt->rt_flags, RTF_LLINFO)) {
KERNEL_UNLOCK();
goto bad;
}
la = (struct llinfo_arp *)rt->rt_llinfo;
KASSERT(la != NULL);
/*
* There is an arptab entry, but no ethernet address
* response yet. Insert the mbuf into the hold queue if below the limit;
* if above the limit, free the queue without queuing the new packet.
*/
if (atomic_inc_int_nv(&la_hold_total) <= LA_HOLD_TOTAL) {
if (mq_push(&la->la_mq, m) != 0)
atomic_dec_int(&la_hold_total);
} else {
atomic_sub_int(&la_hold_total, mq_purge(&la->la_mq) + 1);
m_freem(m);
}
/*
* Re-send the ARP request when appropriate.
*/
#ifdef DIAGNOSTIC
if (rt->rt_expire == 0) {
/* This should never happen. (Should it? -gwr) */
printf("%s: unresolved and rt_expire == 0\n", __func__);
/* Set expiration time to now (expired). */
rt->rt_expire = uptime;
}
#endif
if (rt->rt_expire) {
rt->rt_flags &= ~RTF_REJECT;
if (la->la_asked == 0 || rt->rt_expire != uptime) {
rt->rt_expire = uptime;
if (la->la_asked++ < arp_maxtries)
arprequest(ifp,
&satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr,
&satosin(dst)->sin_addr.s_addr,
ac->ac_enaddr);
else {
rt->rt_flags |= RTF_REJECT;
rt->rt_expire += arpt_down;
la->la_asked = 0;
la->la_refreshed = 0;
atomic_sub_int(&la_hold_total,
mq_purge(&la->la_mq));
}
}
}
KERNEL_UNLOCK();
return (EAGAIN);
bad:
m_freem(m);
return (EINVAL);
}
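/*
 * Illustrative sketch (not from this file): how an Ethernet output path
 * might interpret arpresolve()'s return values, per the comment above.
 * 0 means desten holds the destination MAC and the frame can be sent;
 * EAGAIN means arpresolve() has taken ownership of the mbuf (queued or
 * freed); any other error means the mbuf has already been freed.
 * "example_output" is a made-up function name.
 */
#if 0
int
example_output(struct ifnet *ifp, struct rtentry *rt, struct mbuf *m,
    struct sockaddr *dst)
{
	u_char eaddr[ETHER_ADDR_LEN];
	int error;

	error = arpresolve(ifp, rt, m, dst, eaddr);
	if (error == EAGAIN)
		return (0);	/* mbuf is held until the ARP reply arrives */
	if (error != 0)
		return (error);	/* mbuf was freed by arpresolve() */

	/* eaddr now holds the destination MAC; encapsulate and send m here */
	return (0);
}
#endif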
struct mbuf *
arppullup(struct mbuf *m)
{
struct arphdr *ar;
int len;
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("arp without packet header");
#endif
len = sizeof(struct arphdr);
if (m->m_len < len && (m = m_pullup(m, len)) == NULL)
return NULL;
ar = mtod(m, struct arphdr *);
if (ntohs(ar->ar_hrd) != ARPHRD_ETHER ||
ntohs(ar->ar_pro) != ETHERTYPE_IP || ar->ar_hln != ETHER_ADDR_LEN ||
ar->ar_pln != sizeof(struct in_addr)) {
m_freem(m);
return NULL;
}
len += 2 * (ar->ar_hln + ar->ar_pln);
if (m->m_len < len && (m = m_pullup(m, len)) == NULL)
return NULL;
return m;
}
/*
* Common length and type checks are done here,
* then the protocol-specific routine is called.
*/
void
arpinput(struct ifnet *ifp, struct mbuf *m)
{
if ((m = arppullup(m)) == NULL)
return;
niq_enqueue(&arpinq, m);
}
void
arpintr(void)
{
struct mbuf_list ml;
struct mbuf *m;
struct ifnet *ifp;
niq_delist(&arpinq, &ml);
while ((m = ml_dequeue(&ml)) != NULL) {
ifp = if_get(m->m_pkthdr.ph_ifidx);
if (ifp != NULL)
in_arpinput(ifp, m);
else
m_freem(m);
if_put(ifp);
}
}
/*
* ARP for Internet protocols on Ethernet, RFC 826.
* In addition, a sanity check is performed on the sender
* protocol address, to catch impersonators.
*/
void
in_arpinput(struct ifnet *ifp, struct mbuf *m)
{
struct ether_arp *ea;
struct rtentry *rt = NULL;
struct sockaddr_in sin;
struct in_addr isaddr, itaddr;
char addr[INET_ADDRSTRLEN];
int op, target = 0;
unsigned int rdomain;
rdomain = rtable_l2(m->m_pkthdr.ph_rtableid);
ea = mtod(m, struct ether_arp *);
op = ntohs(ea->arp_op);
if ((op != ARPOP_REQUEST) && (op != ARPOP_REPLY))
goto out;
memcpy(&itaddr, ea->arp_tpa, sizeof(itaddr));
memcpy(&isaddr, ea->arp_spa, sizeof(isaddr));
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
if (ETHER_IS_MULTICAST(ea->arp_sha) &&
ETHER_IS_BROADCAST(ea->arp_sha)) {
inet_ntop(AF_INET, &isaddr, addr, sizeof(addr));
log(LOG_ERR, "arp: ether address is broadcast for IP address "
"%s!\n", addr);
goto out;
}
if (!memcmp(ea->arp_sha, LLADDR(ifp->if_sadl), sizeof(ea->arp_sha)))
goto out; /* it's from me, ignore it. */
/* Check target against our interface addresses. */
sin.sin_addr = itaddr;
rt = rtalloc(sintosa(&sin), 0, rdomain);
if (rtisvalid(rt) && ISSET(rt->rt_flags, RTF_LOCAL) &&
rt->rt_ifidx == ifp->if_index)
target = 1;
rtfree(rt);
rt = NULL;
#if NCARP > 0
if (target && op == ARPOP_REQUEST && ifp->if_type == IFT_CARP &&
!carp_iamatch(ifp))
goto out;
#endif
/* Do we have an ARP cache for the sender? Create if we are target. */
rt = arplookup(&isaddr, target, 0, rdomain);
/* Check sender against our interface addresses. */
if (rtisvalid(rt) && ISSET(rt->rt_flags, RTF_LOCAL) &&
rt->rt_ifidx == ifp->if_index && isaddr.s_addr != INADDR_ANY) {
inet_ntop(AF_INET, &isaddr, addr, sizeof(addr));
log(LOG_ERR, "duplicate IP address %s sent from ethernet "
"address %s\n", addr, ether_sprintf(ea->arp_sha));
itaddr = isaddr;
} else if (rt != NULL) {
int error;
KERNEL_LOCK();
error = arpcache(ifp, ea, rt);
KERNEL_UNLOCK();
if (error)
goto out;
}
if (op == ARPOP_REQUEST) {
uint8_t *eaddr;
if (target) {
/* We already have all info for the reply */
eaddr = LLADDR(ifp->if_sadl);
} else {
rtfree(rt);
rt = arplookup(&itaddr, 0, SIN_PROXY, rdomain);
/*
* Protect from possible duplicates, only owner
* should respond
*/
if ((rt == NULL) || (rt->rt_ifidx != ifp->if_index))
goto out;
eaddr = LLADDR(satosdl(rt->rt_gateway));
}
arpreply(ifp, m, &itaddr, eaddr, rdomain);
rtfree(rt);
return;
}
out:
rtfree(rt);
m_freem(m);
}
int
arpcache(struct ifnet *ifp, struct ether_arp *ea, struct rtentry *rt)
{
struct llinfo_arp *la = (struct llinfo_arp *)rt->rt_llinfo;
struct sockaddr_dl *sdl = satosdl(rt->rt_gateway);
struct in_addr *spa = (struct in_addr *)ea->arp_spa;
char addr[INET_ADDRSTRLEN];
struct ifnet *rifp;
struct mbuf_list ml;
struct mbuf *m;
time_t uptime;
unsigned int len;
int changed = 0;
KERNEL_ASSERT_LOCKED();
KASSERT(sdl != NULL);
/*
* This can happen if the entry has been deleted by another CPU
* after we found it.
*/
if (la == NULL)
return (0);
uptime = getuptime();
if (sdl->sdl_alen > 0) {
if (memcmp(ea->arp_sha, LLADDR(sdl), sdl->sdl_alen)) {
if (ISSET(rt->rt_flags, RTF_PERMANENT_ARP|RTF_LOCAL)) {
inet_ntop(AF_INET, spa, addr, sizeof(addr));
log(LOG_WARNING, "arp: attempt to overwrite "
"permanent entry for %s by %s on %s\n", addr,
ether_sprintf(ea->arp_sha), ifp->if_xname);
return (-1);
} else if (rt->rt_ifidx != ifp->if_index) {
#if NCARP > 0
if (ifp->if_type != IFT_CARP)
#endif
{
rifp = if_get(rt->rt_ifidx);
if (rifp == NULL)
return (-1);
inet_ntop(AF_INET, spa, addr,
sizeof(addr));
log(LOG_WARNING, "arp: attempt to "
"overwrite entry for %s on %s by "
"%s on %s\n", addr, rifp->if_xname,
ether_sprintf(ea->arp_sha),
ifp->if_xname);
if_put(rifp);
}
return (-1);
} else {
inet_ntop(AF_INET, spa, addr, sizeof(addr));
log(LOG_INFO, "arp info overwritten for %s by "
"%s on %s\n", addr,
ether_sprintf(ea->arp_sha), ifp->if_xname);
rt->rt_expire = 1;	/* no longer static */
}
changed = 1;
}
} else if (!if_isconnected(ifp, rt->rt_ifidx)) {
rifp = if_get(rt->rt_ifidx);
if (rifp == NULL)
return (-1);
inet_ntop(AF_INET, spa, addr, sizeof(addr));
log(LOG_WARNING, "arp: attempt to add entry for %s on %s by %s"
" on %s\n", addr, rifp->if_xname,
ether_sprintf(ea->arp_sha), ifp->if_xname);
if_put(rifp);
return (-1);
}
sdl->sdl_alen = sizeof(ea->arp_sha);
memcpy(LLADDR(sdl), ea->arp_sha, sizeof(ea->arp_sha));
if (rt->rt_expire)
rt->rt_expire = uptime + arpt_keep;
rt->rt_flags &= ~RTF_REJECT;
/* Notify userland that an ARP resolution has been done. */
if (la->la_asked || changed) {
rtm_send(rt, RTM_RESOLVE, 0, ifp->if_rdomain);
}
la->la_asked = 0;
la->la_refreshed = 0;
mq_delist(&la->la_mq, &ml);
len = ml_len(&ml);
while ((m = ml_dequeue(&ml)) != NULL)
ifp->if_output(ifp, m, rt_key(rt), rt);
/* XXXSMP we discard if other CPU enqueues */
if (mq_len(&la->la_mq) > 0) {
/* mbuf is back in queue. Discard. */
atomic_sub_int(&la_hold_total, len + mq_purge(&la->la_mq));
} else
atomic_sub_int(&la_hold_total, len);
return (0);
}
void
arpinvalidate(struct rtentry *rt)
{
struct llinfo_arp *la;
struct sockaddr_dl *sdl = satosdl(rt->rt_gateway);
mtx_enter(&arp_mtx);
la = (struct llinfo_arp *)rt->rt_llinfo;
if (la == NULL) {
mtx_leave(&arp_mtx);
return;
}
atomic_sub_int(&la_hold_total, mq_purge(&la->la_mq));
sdl->sdl_alen = 0;
la->la_asked = 0;
mtx_leave(&arp_mtx);
}
/*
* Free an arp entry.
*/
void
arptfree(struct rtentry *rt)
{
struct ifnet *ifp;
KASSERT(!ISSET(rt->rt_flags, RTF_LOCAL));
arpinvalidate(rt);
ifp = if_get(rt->rt_ifidx);
KASSERT(ifp != NULL);
if (!ISSET(rt->rt_flags, RTF_STATIC|RTF_CACHED))
rtdeletemsg(rt, ifp, ifp->if_rdomain);
if_put(ifp);
}
/*
* Lookup or enter a new address in arptab.
*/
struct rtentry *
arplookup(struct in_addr *inp, int create, int proxy, u_int tableid)
{
struct rtentry *rt;
struct sockaddr_inarp sin;
int flags;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = inp->s_addr;
sin.sin_other = proxy ? SIN_PROXY : 0;
flags = (create) ? RT_RESOLVE : 0;
rt = rtalloc((struct sockaddr *)&sin, flags, tableid);
if (!rtisvalid(rt) || ISSET(rt->rt_flags, RTF_GATEWAY) ||
!ISSET(rt->rt_flags, RTF_LLINFO) ||
rt->rt_gateway->sa_family != AF_LINK) {
rtfree(rt);
return (NULL);
}
if (proxy && !ISSET(rt->rt_flags, RTF_ANNOUNCE)) {
while ((rt = rtable_iterate(rt)) != NULL) {
if (ISSET(rt->rt_flags, RTF_ANNOUNCE)) {
break;
}
}
}
return (rt);
}
/*
* Check whether we do proxy ARP for this address and we point to ourselves.
*/
int
arpproxy(struct in_addr in, unsigned int rtableid)
{
struct sockaddr_dl *sdl;
struct rtentry *rt;
struct ifnet *ifp;
int found = 0;
rt = arplookup(&in, 0, SIN_PROXY, rtableid);
if (!rtisvalid(rt)) {
rtfree(rt);
return (0);
}
/* Check that the arp information is correct. */
sdl = satosdl(rt->rt_gateway);
if (sdl->sdl_alen != ETHER_ADDR_LEN) {
rtfree(rt);
return (0);
}
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL) {
rtfree(rt);
return (0);
}
if (!memcmp(LLADDR(sdl), LLADDR(ifp->if_sadl), sdl->sdl_alen))
found = 1;
if_put(ifp);
rtfree(rt);
return (found);
}
/*
* Called from Ethernet interrupt handlers
* when ether packet type ETHERTYPE_REVARP
* is received. Common length and type checks are done here,
* then the protocol-specific routine is called.
*/
void
revarpinput(struct ifnet *ifp, struct mbuf *m)
{
if ((m = arppullup(m)) == NULL)
return;
in_revarpinput(ifp, m);
}
/*
* RARP for Internet protocols on Ethernet.
* Algorithm is that given in RFC 903.
* We only use it for bootstrap purposes, to get an IP address for one of
* our interfaces. Thus we support no user interface.
*
* Since the contents of the RARP reply are specific to the interface that
* sent the request, this code must ensure that they are properly associated.
*
* Note: also supports ARP via RARP packets, per the RFC.
*/
void
in_revarpinput(struct ifnet *ifp, struct mbuf *m)
{
struct ether_arp *ar;
int op;
ar = mtod(m, struct ether_arp *);
op = ntohs(ar->arp_op);
switch (op) {
case ARPOP_REQUEST:
case ARPOP_REPLY: /* per RFC */
niq_enqueue(&arpinq, m);
return;
case ARPOP_REVREPLY:
break;
case ARPOP_REVREQUEST: /* handled by rarpd(8) */
default:
goto out;
}
#ifdef NFSCLIENT
if (revarp_ifidx == 0)
goto out;
if (revarp_ifidx != m->m_pkthdr.ph_ifidx) /* !same interface */
goto out;
if (revarp_finished)
goto wake;
if (memcmp(ar->arp_tha, LLADDR(ifp->if_sadl), sizeof(ar->arp_tha)))
goto out;
memcpy(&revarp_srvip, ar->arp_spa, sizeof(revarp_srvip));
memcpy(&revarp_myip, ar->arp_tpa, sizeof(revarp_myip));
revarp_finished = 1;
wake: /* Do wakeup every time in case it was missed. */
wakeup((caddr_t)&revarp_myip);
#endif /* NFSCLIENT */
out:
m_freem(m);
}
/*
* Send a RARP request for the ip address of the specified interface.
* The request should be RFC 903-compliant.
*/
void
revarprequest(struct ifnet *ifp)
{
struct sockaddr sa;
struct mbuf *m;
struct ether_header *eh;
struct ether_arp *ea;
struct arpcom *ac = (struct arpcom *)ifp;
if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
return;
m->m_len = sizeof(*ea);
m->m_pkthdr.len = sizeof(*ea);
m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
m->m_pkthdr.pf.prio = ifp->if_llprio;
m_align(m, sizeof(*ea));
ea = mtod(m, struct ether_arp *);
eh = (struct ether_header *)sa.sa_data;
memset(ea, 0, sizeof(*ea));
memcpy(eh->ether_dhost, etherbroadcastaddr, sizeof(eh->ether_dhost));
eh->ether_type = htons(ETHERTYPE_REVARP);
ea->arp_hrd = htons(ARPHRD_ETHER);
ea->arp_pro = htons(ETHERTYPE_IP);
ea->arp_hln = sizeof(ea->arp_sha); /* hardware address length */
ea->arp_pln = sizeof(ea->arp_spa); /* protocol address length */
ea->arp_op = htons(ARPOP_REVREQUEST);
memcpy(eh->ether_shost, ac->ac_enaddr, sizeof(ea->arp_tha));
memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha));
memcpy(ea->arp_tha, ac->ac_enaddr, sizeof(ea->arp_tha));
sa.sa_family = pseudo_AF_HDRCMPLT;
sa.sa_len = sizeof(sa);
m->m_flags |= M_BCAST;
ifp->if_output(ifp, m, &sa, NULL);
}
#ifdef NFSCLIENT
/*
* RARP for the ip address of the specified interface, but also
* save the ip address of the server that sent the answer.
* Timeout if no response is received.
*/
int
revarpwhoarewe(struct ifnet *ifp, struct in_addr *serv_in,
struct in_addr *clnt_in)
{
int result, count = 20;
if (revarp_finished)
return EIO;
revarp_ifidx = ifp->if_index;
while (count--) {
revarprequest(ifp);
result = tsleep_nsec(&revarp_myip, PSOCK, "revarp",
MSEC_TO_NSEC(500));
if (result != EWOULDBLOCK)
break;
}
revarp_ifidx = 0;
if (!revarp_finished)
return ENETUNREACH;
memcpy(serv_in, &revarp_srvip, sizeof(*serv_in));
memcpy(clnt_in, &revarp_myip, sizeof(*clnt_in));
return 0;
}
/* For compatibility: only saves interface address. */
int
revarpwhoami(struct in_addr *in, struct ifnet *ifp)
{
struct in_addr server;
return (revarpwhoarewe(ifp, &server, in));
}
#endif /* NFSCLIENT */
/* $OpenBSD: if_dl.h,v 1.12 2017/05/04 15:00:24 bluhm Exp $ */
/* $NetBSD: if_dl.h,v 1.8 1995/03/26 20:30:13 jtc Exp $ */
/*
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_dl.h 8.1 (Berkeley) 6/10/93
*/
/*
* A Link-Level Sockaddr may specify the interface in one of two
* ways: either by means of a system-provided index number (computed
* anew and possibly differently on every reboot), or by a human-readable
* string such as "il0" (for managerial convenience).
*
* Census taking actions, such as something akin to SIOCGCONF would return
* both the index and the human name.
*
* High volume transactions (such as giving a link-level ``from'' address
* in a recvfrom or recvmsg call) may be likely only to provide the indexed
* form, (which requires fewer copy operations and less space).
*
* The form and interpretation of the link-level address is purely a matter
* of convention between the device driver and its consumers; however, it is
* expected that all drivers for an interface of a given if_type will agree.
*/
#ifndef _NET_IF_DL_H_
#define _NET_IF_DL_H_
/*
* Structure of a Link-Level sockaddr:
*/
struct sockaddr_dl {
u_char sdl_len; /* Total length of sockaddr */
u_char sdl_family; /* AF_LINK */
u_int16_t sdl_index; /* if != 0, system given index for interface */
u_char sdl_type; /* interface type */
u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */
u_char sdl_alen; /* link level address length */
u_char sdl_slen; /* link layer selector length, mostly 0 */
char sdl_data[24]; /* minimum work area, can be larger;
contains both if name and ll address;
big enough for IFNAMSIZ plus 8byte ll addr */
};
#define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen))
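/*
 * Illustrative sketch (not part of this header): how sdl_nlen and LLADDR()
 * cooperate.  sdl_data[] carries the interface name first, immediately
 * followed by the link-level address, so LLADDR() simply skips sdl_nlen
 * bytes.  The function name is made up; the name and address come from
 * the caller.
 */
#if 0
void
example_fill_sdl(struct sockaddr_dl *sdl, const char *name, size_t namelen,
    const u_char *lladdr, size_t lladdrlen)
{
	memset(sdl, 0, sizeof(*sdl));
	sdl->sdl_len = sizeof(*sdl);
	sdl->sdl_family = AF_LINK;
	sdl->sdl_nlen = namelen;
	memcpy(sdl->sdl_data, name, namelen);
	sdl->sdl_alen = lladdrlen;
	memcpy(LLADDR(sdl), lladdr, lladdrlen);	/* lands at sdl_data + namelen */
}
#endif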
#ifdef _KERNEL
static inline struct sockaddr_dl *
satosdl(struct sockaddr *sa)
{
return ((struct sockaddr_dl *)(sa));
}
static inline struct sockaddr *
sdltosa(struct sockaddr_dl *sdl)
{
return ((struct sockaddr *)(sdl));
}
#else /* _KERNEL */
__BEGIN_DECLS
char *link_ntoa(const struct sockaddr_dl *);
__END_DECLS
#endif /* _KERNEL */
#endif /* _NET_IF_DL_H_ */
/* $OpenBSD: kern_sensors.c,v 1.39 2019/12/19 17:40:11 mpi Exp $ */
/*
* Copyright (c) 2005 David Gwynne <dlg@openbsd.org>
* Copyright (c) 2006 Constantine A. Murenin <cnst+openbsd@bugmail.mojo.ru>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/device.h>
#include <sys/hotplug.h>
#include <sys/timeout.h>
#include <sys/task.h>
#include <sys/rwlock.h>
#include <sys/atomic.h>
#include <sys/sensors.h>
#include "hotplug.h"
struct taskq *sensors_taskq;
int sensordev_count;
SLIST_HEAD(, ksensordev) sensordev_list =
SLIST_HEAD_INITIALIZER(sensordev_list);
void
sensordev_install(struct ksensordev *sensdev)
{
struct ksensordev *v, *nv;
int s;
s = splhigh();
if (sensordev_count == 0) {
sensdev->num = 0;
SLIST_INSERT_HEAD(&sensordev_list, sensdev, list);
} else {
for (v = SLIST_FIRST(&sensordev_list);
(nv = SLIST_NEXT(v, list)) != NULL; v = nv)
if (nv->num - v->num > 1)
break;
sensdev->num = v->num + 1;
SLIST_INSERT_AFTER(v, sensdev, list);
}
sensordev_count++;
splx(s);
#if NHOTPLUG > 0
hotplug_device_attach(DV_DULL, "sensordev");
#endif
}
void
sensor_attach(struct ksensordev *sensdev, struct ksensor *sens)
{
struct ksensor *v, *nv;
struct ksensors_head *sh;
int s, i;
s = splhigh();
sh = &sensdev->sensors_list;
if (sensdev->sensors_count == 0) {
for (i = 0; i < SENSOR_MAX_TYPES; i++)
sensdev->maxnumt[i] = 0;
sens->numt = 0;
SLIST_INSERT_HEAD(sh, sens, list);
} else {
for (v = SLIST_FIRST(sh);
(nv = SLIST_NEXT(v, list)) != NULL; v = nv)
if (v->type == sens->type && (v->type != nv->type ||
(v->type == nv->type && nv->numt - v->numt > 1)))
break;
/* sensors of the same type go after each other */
if (v->type == sens->type)
sens->numt = v->numt + 1;
else
sens->numt = 0;
SLIST_INSERT_AFTER(v, sens, list);
}
/* we only increment maxnumt[] if the sensor was added
* to the last position of sensors of this type
*/
if (sensdev->maxnumt[sens->type] == sens->numt)
sensdev->maxnumt[sens->type]++;
sensdev->sensors_count++;
splx(s);
}
void
sensordev_deinstall(struct ksensordev *sensdev)
{
int s;
s = splhigh();
sensordev_count--;
SLIST_REMOVE(&sensordev_list, sensdev, ksensordev, list);
splx(s);
#if NHOTPLUG > 0
hotplug_device_detach(DV_DULL, "sensordev");
#endif
}
void
sensor_detach(struct ksensordev *sensdev, struct ksensor *sens)
{
struct ksensors_head *sh;
int s;
s = splhigh();
sh = &sensdev->sensors_list;
sensdev->sensors_count--;
SLIST_REMOVE(sh, sens, ksensor, list);
/* we only decrement maxnumt[] if this is the tail
* sensor of this type
*/
if (sens->numt == sensdev->maxnumt[sens->type] - 1)
sensdev->maxnumt[sens->type]--;
splx(s);
}
int
sensordev_get(int num, struct ksensordev **sensdev)
{
struct ksensordev *sd;
SLIST_FOREACH(sd, &sensordev_list, list) {
if (sd->num == num) {
*sensdev = sd;
return (0);
}
if (sd->num > num)
return (ENXIO);
}
return (ENOENT);
}
int
sensor_find(int dev, enum sensor_type type, int numt, struct ksensor **ksensorp)
{
struct ksensor *s;
struct ksensordev *sensdev;
struct ksensors_head *sh;
int ret;
ret = sensordev_get(dev, &sensdev);
if (ret)
return (ret);
sh = &sensdev->sensors_list;
SLIST_FOREACH(s, sh, list)
if (s->type == type && s->numt == numt) {
*ksensorp = s;
return (0);
}
return (ENOENT);
}
struct sensor_task {
void (*func)(void *);
void *arg;
unsigned int period;
struct timeout timeout;
struct task task;
struct rwlock lock;
};
void sensor_task_tick(void *);
void sensor_task_work(void *);
struct sensor_task *
sensor_task_register(void *arg, void (*func)(void *), unsigned int period)
{
struct sensor_task *st;
#ifdef DIAGNOSTIC
if (period == 0)
panic("sensor_task_register: period is 0");
#endif
if (sensors_taskq == NULL &&
(sensors_taskq = taskq_create("sensors", 1, IPL_HIGH, 0)) == NULL)
sensors_taskq = systq;
st = malloc(sizeof(*st), M_DEVBUF, M_NOWAIT);
if (st == NULL)
return (NULL);
st->func = func;
st->arg = arg;
st->period = period;
timeout_set(&st->timeout, sensor_task_tick, st);
task_set(&st->task, sensor_task_work, st);
rw_init(&st->lock, "sensor");
sensor_task_tick(st);
return (st);
}
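/*
 * Illustrative sketch (not part of this file): how a hypothetical driver
 * might use the sensor framework above to poll its hardware every 5 seconds.
 * "struct example_softc", "example_refresh" and "example_attach_sensor" are
 * made-up names for the sake of the example.
 */
#if 0
struct example_softc {
	struct ksensor		sc_sensor;
	struct ksensordev	sc_sensordev;
	struct sensor_task	*sc_sensortask;
};

void
example_refresh(void *arg)
{
	struct example_softc *sc = arg;

	/* read the hardware and update sc->sc_sensor.value here */
	sc->sc_sensor.flags &= ~SENSOR_FINVALID;
}

int
example_attach_sensor(struct example_softc *sc)
{
	strlcpy(sc->sc_sensordev.xname, "example0",
	    sizeof(sc->sc_sensordev.xname));
	sc->sc_sensor.type = SENSOR_TEMP;
	sensor_attach(&sc->sc_sensordev, &sc->sc_sensor);
	sensordev_install(&sc->sc_sensordev);

	/* run example_refresh(sc) every 5 seconds */
	sc->sc_sensortask = sensor_task_register(sc, example_refresh, 5);
	if (sc->sc_sensortask == NULL)
		return (ENXIO);
	return (0);
}
#endif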
void
sensor_task_unregister(struct sensor_task *st)
{
/*
* we can't reliably timeout_del or task_del because there's a window
* between when they come off the lists and the timeout or task code
* actually runs the respective handlers for them. mark the sensor_task
* as dying by setting period to 0 and let sensor_task_work mop up.
*/
rw_enter_write(&st->lock);
st->period = 0;
rw_exit_write(&st->lock);
}
void
sensor_task_tick(void *arg)
{
struct sensor_task *st = arg;
task_add(sensors_taskq, &st->task);
}
static int sensors_quiesced;
static int sensors_running;
void
sensor_quiesce(void)
{
sensors_quiesced = 1;
while (sensors_running > 0)
tsleep_nsec(&sensors_running, PZERO, "sensorpause", INFSLP);
}
void
sensor_restart(void)
{
sensors_quiesced = 0;
}
void
sensor_task_work(void *xst)
{
struct sensor_task *st = xst;
unsigned int period = 0;
atomic_inc_int(&sensors_running);
rw_enter_write(&st->lock);
period = st->period;
if (period > 0 && !sensors_quiesced)
st->func(st->arg);
rw_exit_write(&st->lock);
if (atomic_dec_int_nv(&sensors_running) == 0) {
if (sensors_quiesced)
wakeup(&sensors_running);
}
if (period == 0)
free(st, M_DEVBUF, sizeof(*st));
else
timeout_add_sec(&st->timeout, period);
}
/* $OpenBSD: pmap.c,v 1.153 2022/06/30 13:51:24 mlarkin Exp $ */
/* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright 2001 (c) Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This is the i386 pmap modified and generalized to support x86-64
* as well. The idea is to hide the upper N levels of the page tables
* inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
* is mostly untouched, except that it uses some more generalized
* macros and interfaces.
*
* This pmap has been tested on the i386 as well, and it can be easily
* adapted to PAE.
*
* fvdl@wasabisystems.com 18-Jun-2001
*/
/*
* pmap.c: i386 pmap module rewrite
* Chuck Cranor <chuck@ccrc.wustl.edu>
* 11-Aug-97
*
* history of this pmap module: in addition to my own input, i used
* the following references for this rewrite of the i386 pmap:
*
* [1] the NetBSD i386 pmap. this pmap appears to be based on the
* BSD hp300 pmap done by Mike Hibler at University of Utah.
* it was then ported to the i386 by William Jolitz of UUNET
* Technologies, Inc. Then Charles M. Hannum of the NetBSD
* project fixed some bugs and provided some speed ups.
*
* [2] the FreeBSD i386 pmap. this pmap seems to be the
* Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
* and David Greenman.
*
* [3] the Mach pmap. this pmap, from CMU, seems to have migrated
* between several processors. the VAX version was done by
* Avadis Tevanian, Jr., and Michael Wayne Young. the i386
* version was done by Lance Berc, Mike Kupfer, Bob Baron,
* David Golub, and Richard Draves. the alpha version was
* done by Alessandro Forin (CMU/Mach) and Chris Demetriou
* (NetBSD/alpha).
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/user.h>
#include <sys/mutex.h>
#include <uvm/uvm.h>
#include <machine/cpu.h>
#ifdef MULTIPROCESSOR
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#endif
#include "vmm.h"
#if NVMM > 0
#include <machine/vmmvar.h>
#endif /* NVMM > 0 */
#include "acpi.h"
/* #define PMAP_DEBUG */
#ifdef PMAP_DEBUG
#define DPRINTF(x...) do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif /* PMAP_DEBUG */
/*
* general info:
*
* - for an explanation of how the i386 MMU hardware works see
* the comments in <machine/pte.h>.
*
* - for an explanation of the general memory structure used by
* this pmap (including the recursive mapping), see the comments
* in <machine/pmap.h>.
*
* this file contains the code for the "pmap module." the module's
* job is to manage the hardware's virtual to physical address mappings.
* note that there are two levels of mapping in the VM system:
*
* [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
* to map ranges of virtual address space to objects/files. for
* example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
* to the file /bin/ls starting at offset zero." note that
* the upper layer mapping is not concerned with how individual
* vm_pages are mapped.
*
* [2] the lower layer of the VM system (the pmap) maintains the mappings
* from virtual addresses. it is concerned with which vm_page is
* mapped where. for example, when you run /bin/ls and start
* at page 0x1000 the fault routine may lookup the correct page
* of the /bin/ls file and then ask the pmap layer to establish
* a mapping for it.
*
* note that information in the lower layer of the VM system can be
* thrown away since it can easily be reconstructed from the info
* in the upper layer.
*
* data structures we use include:
* - struct pmap: describes the address space of one process
* - struct pv_entry: describes one <PMAP,VA> mapping of a PA
* - struct pg_to_free: a list of virtual addresses whose mappings
* have been changed. used for TLB flushing.
*/
/*
* memory allocation
*
* - there are three data structures that we must dynamically allocate:
*
* [A] new process' page directory page (PDP)
* - plan 1: done at pmap_create() we use
* pool_get(&pmap_pmap_pool, PR_WAITOK) to do this allocation.
*
* if we are low in free physical memory then we sleep in
* pool_get() -- in this case this is ok since we are creating
* a new pmap and should not be holding any locks.
*
* XXX: the fork code currently has no way to return an "out of
* memory, try again" error code since uvm_fork [fka vm_fork]
* is a void function.
*
* [B] new page tables pages (PTP)
* call uvm_pagealloc()
* => success: zero page, add to pm_pdir
* => failure: we are out of free vm_pages, let pmap_enter()
* tell UVM about it.
*
* note: for kernel PTPs, we start with NKPTP of them. as we map
* kernel memory (at uvm_map time) we check to see if we've grown
* the kernel pmap. if so, we call the optional function
* pmap_growkernel() to grow the kernel PTPs in advance.
*
* [C] pv_entry structures
* - try to allocate one from the pool.
* If we fail, we simply let pmap_enter() tell UVM about it.
*/
long nkptp[] = NKPTP_INITIALIZER;
const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
const long nbpd[] = NBPD_INITIALIZER;
pd_entry_t *const normal_pdes[] = PDES_INITIALIZER;
#define pmap_pte_set(p, n) atomic_swap_64(p, n)
#define pmap_pte_clearbits(p, b) x86_atomic_clearbits_u64(p, b)
#define pmap_pte_setbits(p, b) x86_atomic_setbits_u64(p, b)
/*
* global data structures
*/
struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
/*
* pmap_pg_wc: if our processor supports PAT then we set this
* to be the pte bits for Write Combining. Else we fall back to
* UC- so mtrrs can override the cacheability;
*/
int pmap_pg_wc = PG_UCMINUS;
/*
* pmap_use_pcid: nonzero if PCID use is enabled (currently we require INVPCID)
*
* The next three are zero unless and until PCID support is enabled so code
* can just 'or' them in as needed without tests.
* cr3_pcid: CR3_REUSE_PCID
* cr3_pcid_proc and cr3_pcid_temp: PCID_PROC and PCID_TEMP
*/
#if PCID_KERN != 0
# error "pmap.c assumes PCID_KERN is zero"
#endif
int pmap_use_pcid;
static u_int cr3_pcid_proc;
static u_int cr3_pcid_temp;
/* these two are accessed from locore.o */
paddr_t cr3_reuse_pcid;
paddr_t cr3_pcid_proc_intel;
/*
* other data structures
*/
pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */
int pmap_initialized = 0; /* pmap_init done yet? */
/*
* pv management structures.
*/
struct pool pmap_pv_pool;
/*
* linked list of all non-kernel pmaps
*/
struct pmap_head pmaps;
struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM);
/*
* pool that pmap structures are allocated from
*/
struct pool pmap_pmap_pool;
/*
* When we're freeing a ptp, we need to delay the freeing until all
* tlb shootdown has been done. This is the list of the to-be-freed pages.
*/
TAILQ_HEAD(pg_to_free, vm_page);
/*
* pool that PDPs are allocated from
*/
struct pool pmap_pdp_pool;
void pmap_pdp_ctor(pd_entry_t *);
void pmap_pdp_ctor_intel(pd_entry_t *);
extern vaddr_t msgbuf_vaddr;
extern paddr_t msgbuf_paddr;
extern vaddr_t idt_vaddr; /* we allocate IDT early */
extern paddr_t idt_paddr;
extern vaddr_t lo32_vaddr;
extern vaddr_t lo32_paddr;
vaddr_t virtual_avail;
extern int end;
/*
* local prototypes
*/
void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
vaddr_t, struct vm_page *);
struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t);
struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs);
void pmap_free_ptp(struct pmap *, struct vm_page *,
vaddr_t, struct pg_to_free *);
void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
#ifdef MULTIPROCESSOR
static int pmap_is_active(struct pmap *, struct cpu_info *);
#endif
paddr_t pmap_map_ptes(struct pmap *);
struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
void pmap_do_remove_ept(struct pmap *, vaddr_t);
int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
vaddr_t, int, struct pv_entry **);
void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
vaddr_t, vaddr_t, int, struct pv_entry **);
#define PMAP_REMOVE_ALL 0 /* remove all mappings */
#define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */
void pmap_unmap_ptes(struct pmap *, paddr_t);
int pmap_get_physpage(vaddr_t, int, paddr_t *);
int pmap_pdes_valid(vaddr_t, pd_entry_t *);
void pmap_alloc_level(vaddr_t, int, long *);
static inline
void pmap_sync_flags_pte(struct vm_page *, u_long);
void pmap_tlb_shootpage(struct pmap *, vaddr_t, int);
void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int);
void pmap_tlb_shoottlb(struct pmap *, int);
#ifdef MULTIPROCESSOR
void pmap_tlb_shootwait(void);
#else
#define pmap_tlb_shootwait() do { } while (0)
#endif
/*
* p m a p i n l i n e h e l p e r f u n c t i o n s
*/
/*
* pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
* of course the kernel is always loaded
*/
static inline int
pmap_is_curpmap(struct pmap *pmap)
{
return((pmap == pmap_kernel()) || (pmap->pm_pdirpa == (rcr3() & CR3_PADDR)));
}
/*
* pmap_is_active: is this pmap loaded into the specified processor's %cr3?
*/
#ifdef MULTIPROCESSOR
static inline int
pmap_is_active(struct pmap *pmap, struct cpu_info *ci)
{
return pmap == pmap_kernel() || pmap == ci->ci_proc_pmap;
}
#endif
static inline u_int
pmap_pte2flags(u_long pte)
{
return (((pte & PG_U) ? PG_PMAP_REF : 0) |
((pte & PG_M) ? PG_PMAP_MOD : 0));
}
static inline void
pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
{
if (pte & (PG_U|PG_M)) { atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
}
}
/*
* pmap_map_ptes: map a pmap's PTEs into KVM
*
* This should not be done for EPT pmaps
*/
paddr_t
pmap_map_ptes(struct pmap *pmap)
{
paddr_t cr3;
KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
/* the kernel's pmap is always accessible */
if (pmap == pmap_kernel())
return 0;
/*
* Lock the target map before switching to its page tables to
* guarantee other CPUs have finished changing the tables before
* we potentially start caching table and TLB entries.
*/
mtx_enter(&pmap->pm_mtx);
cr3 = rcr3();
KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
(cr3 & CR3_PCID) == PCID_PROC);
if (pmap->pm_pdirpa == (cr3 & CR3_PADDR))
cr3 = 0;
else {
cr3 |= cr3_reuse_pcid;
lcr3(pmap->pm_pdirpa | cr3_pcid_temp);
}
return cr3;
}
void
pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
{
if (pmap != pmap_kernel()) mtx_leave(&pmap->pm_mtx); if (save_cr3 != 0) lcr3(save_cr3);
}
int
pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs)
{
u_long mask, shift;
pd_entry_t pde;
paddr_t pdpa;
int lev;
pdpa = pm->pm_pdirpa;
shift = L4_SHIFT;
mask = L4_MASK;
for (lev = PTP_LEVELS; lev > 0; lev--) {
*pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa);
*offs = (VA_SIGN_POS(va) & mask) >> shift;
pde = (*pd)[*offs];
/* Large pages are different, break early if we run into one. */
if ((pde & (PG_PS|PG_V)) != PG_V)
return (lev - 1);
pdpa = ((*pd)[*offs] & PG_FRAME);
/* 4096/8 == 512 == 2^9 entries per level */
shift -= 9;
mask >>= 9;
}
return (0);
}
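/*
 * Illustrative sketch: the walk above on a standard 4-level amd64 layout
 * (4 KB pages, 512 = 2^9 entries per table).  Assuming the usual L4_SHIFT
 * of 39, the per-level index of a VA is computed as follows; the concrete
 * bit ranges are stated here only for illustration.
 *
 *	level 4: offs = (va & L4_MASK) >> 39	(bits 39..47)
 *	level 3: offs = (va & L3_MASK) >> 30	(bits 30..38)
 *	level 2: offs = (va & L2_MASK) >> 21	(bits 21..29)
 *	level 1: offs = (va & L1_MASK) >> 12	(bits 12..20)
 *
 * which is exactly what "shift -= 9; mask >>= 9" produces on each
 * iteration of the loop.
 */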
/*
* p m a p k e n t e r f u n c t i o n s
*
* functions to quickly enter/remove pages from the kernel address
* space. pmap_kremove is exported to MI kernel. we make use of
* the recursive PTE mappings.
*/
/*
* pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
*
* => no need to lock anything, assume va is already allocated
* => should be faster than normal pmap enter function
*/
void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
{
pt_entry_t *pte, opte, npte;
pte = kvtopte(va);
npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? PG_RW : PG_RO) |
((pa & PMAP_NOCACHE) ? PG_N : 0) |
((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V;
/* special 1:1 mappings in the first 2MB must not be global */
if (va >= (vaddr_t)NBPD_L2)
npte |= pg_g_kern;
if (!(prot & PROT_EXEC))
npte |= pg_nx;
opte = pmap_pte_set(pte, npte);
#ifdef LARGEPAGES
/* XXX For now... */
if (opte & PG_PS)
panic("%s: PG_PS", __func__);
#endif
if (pmap_valid_entry(opte)) { if (pa & PMAP_NOCACHE && (opte & PG_N) == 0) wbinvd_on_all_cpus();
/* This shouldn't happen */
pmap_tlb_shootpage(pmap_kernel(), va, 1);
pmap_tlb_shootwait();
}
}
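/*
 * Illustrative sketch (not part of this file): a hypothetical use of
 * pmap_kenter_pa() to map one page of device memory uncached into an
 * already allocated kernel VA.  The PMAP_NOCACHE flag is carried in the
 * pa argument, as handled above; va and device_pa are assumed to come
 * from the caller.
 */
#if 0
void
example_map_device_page(vaddr_t va, paddr_t device_pa)
{
	/* map one uncached, writable kernel page at va -> device_pa */
	pmap_kenter_pa(va, device_pa | PMAP_NOCACHE, PROT_READ | PROT_WRITE);
}
#endif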
/*
* pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
*
* => no need to lock anything
* => caller must dispose of any vm_page mapped in the va range
* => note: not an inline function
* => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
* => we assume kernel only unmaps valid addresses and thus don't bother
* checking the valid bit before doing TLB flushing
*/
void
pmap_kremove(vaddr_t sva, vsize_t len)
{
pt_entry_t *pte, opte;
vaddr_t va, eva;
eva = sva + len;
for (va = sva; va != eva; va += PAGE_SIZE) {
pte = kvtopte(va);
opte = pmap_pte_set(pte, 0);
#ifdef LARGEPAGES
KASSERT((opte & PG_PS) == 0);
#endif
KASSERT((opte & PG_PVLIST) == 0);
}
pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1);
pmap_tlb_shootwait();
}
/*
* pmap_set_pml4_early
*
* Utility function to map 2GB of 2MB pages to 'pa'. The VA that is assigned
* is the pml4 entry for 'early mappings' (see pmap.h). This function is used
* by display drivers that need to map their framebuffers early, before the
* pmap is fully initialized (eg, to show panic messages).
*
* Users of this function must call pmap_clear_pml4_early to remove the
* mapping when finished.
*
* Parameters:
* pa: phys addr to map
*
* Return value:
* VA mapping to 'pa'. This mapping is 2GB in size and starts at the base
 * of the 2MB region containing 'pa'.
*/
vaddr_t
pmap_set_pml4_early(paddr_t pa)
{
extern paddr_t early_pte_pages;
pt_entry_t *pml4e, *pte;
int i, j, off;
paddr_t curpa;
vaddr_t va;
pml4e = (pt_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
pml4e[PDIR_SLOT_EARLY] = (pd_entry_t)early_pte_pages | PG_V | PG_RW;
off = pa & PAGE_MASK_L2;
curpa = pa & L2_FRAME;
pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
memset(pte, 0, 3 * NBPG);
pte[0] = (early_pte_pages + NBPG) | PG_V | PG_RW;
pte[1] = (early_pte_pages + 2 * NBPG) | PG_V | PG_RW;
pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages + NBPG);
for (i = 0; i < 2; i++) {
/* 2 early pages of mappings */
for (j = 0; j < 512; j++) {
/* j[0..511] : 2MB mappings per page */
pte[(i * 512) + j] = curpa | PG_V | PG_RW | PG_PS;
curpa += (2 * 1024 * 1024);
}
}
va = (vaddr_t)((PDIR_SLOT_EARLY * 512ULL) << L3_SHIFT) + off;
return VA_SIGN_NEG(va);
}
/*
* pmap_clear_pml4_early
*
* Clears the mapping previously established with pmap_set_pml4_early.
*/
void
pmap_clear_pml4_early(void)
{
extern paddr_t early_pte_pages;
pt_entry_t *pml4e, *pte;
pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
memset(pte, 0, 3 * NBPG);
pml4e = (pd_entry_t *)pmap_kernel()->pm_pdir;
pml4e[PDIR_SLOT_EARLY] = 0;
tlbflush();
}
/*
* p m a p i n i t f u n c t i o n s
*
* pmap_bootstrap and pmap_init are called during system startup
* to init the pmap module. pmap_bootstrap() does a low level
* init just to get things rolling. pmap_init() finishes the job.
*/
/*
* pmap_bootstrap: get the system in a state where it can run with VM
* properly enabled (called before main()). the VM system is
* fully init'd later...
*/
paddr_t
pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
{
vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS;
struct pmap *kpm;
int curslot, i, j, p;
long ndmpdp;
paddr_t dmpd, dmpdp, start_cur, cur_pa;
vaddr_t kva, kva_end;
pt_entry_t *pml3, *pml2;
/*
* define the boundaries of the managed kernel virtual address
* space.
*/
virtual_avail = kva_start; /* first free KVA */
/*
* set up protection_codes: we need to be able to convert from
* a MI protection code (some combo of VM_PROT...) to something
 * we can jam into an i386 PTE.
*/
protection_codes[PROT_NONE] = pg_nx; /* --- */
protection_codes[PROT_EXEC] = PG_RO; /* --x */
protection_codes[PROT_READ] = PG_RO | pg_nx; /* -r- */
protection_codes[PROT_READ | PROT_EXEC] = PG_RO; /* -rx */
protection_codes[PROT_WRITE] = PG_RW | pg_nx; /* w-- */
protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW; /* w-x */
protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */
protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW; /* wrx */
/*
* now we init the kernel's pmap
*
* the kernel pmap's pm_obj is not used for much. however, in
* user pmaps the pm_obj contains the list of active PTPs.
* the pm_obj currently does not have a pager.
*/
kpm = pmap_kernel();
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
kpm->pm_ptphint[i] = NULL;
}
memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
atop(kva_start - VM_MIN_KERNEL_ADDRESS);
/*
* the above is just a rough estimate and not critical to the proper
* operation of the system.
*/
kpm->pm_type = PMAP_TYPE_NORMAL;
curpcb->pcb_pmap = kpm; /* proc0's pcb */
/*
* Configure and enable PCID use if supported.
* Currently we require INVPCID support.
*/
if ((cpu_ecxfeature & CPUIDECX_PCID) && cpuid_level >= 0x07) {
uint32_t ebx, dummy;
CPUID_LEAF(0x7, 0, dummy, ebx, dummy, dummy);
if (ebx & SEFF0EBX_INVPCID) {
pmap_use_pcid = 1;
/*
* We cannot use global mappings because
* invpcid function 0 does not invalidate global
* mappings. The hardware can cache kernel
* mappings based on PCID_KERN, i.e. there is no
* need for global mappings.
*/
pg_g_kern = 0;
lcr4( rcr4() | CR4_PCIDE );
cr3_pcid_proc = PCID_PROC;
cr3_pcid_temp = PCID_TEMP;
cr3_reuse_pcid = CR3_REUSE_PCID;
cr3_pcid_proc_intel = PCID_PROC_INTEL;
}
}
/*
* Add PG_G attribute to already mapped kernel pages. pg_g_kern
* is calculated in locore0.S and may be set to:
*
* 0 if this CPU does not safely support global pages in the kernel
* (Intel/Meltdown)
* PG_G if this CPU does safely support global pages in the kernel
* (AMD)
*/
#if KERNBASE == VM_MIN_KERNEL_ADDRESS
for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
#else
kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
for (kva = KERNBASE; kva < kva_end ;
#endif
kva += PAGE_SIZE) {
unsigned long p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pg_g_kern;
}
/*
* Map the direct map. The first 4GB were mapped in locore, here
* we map the rest if it exists. We actually use the direct map
* here to set up the page tables, we're assuming that we're still
* operating in the lower 4GB of memory.
*
* Map (up to) the first 512GB of physical memory first. This part
* is handled differently than physical memory > 512GB since we have
* already mapped part of this range in locore0.
*/
ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT;
if (ndmpdp < NDML2_ENTRIES)
ndmpdp = NDML2_ENTRIES; /* At least 4GB */
if (ndmpdp > 512)
ndmpdp = 512; /* At most 512GB */
dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & PG_FRAME;
dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE;
for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) {
paddr_t pdp;
vaddr_t va;
pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
va = PMAP_DIRECT_MAP(pdp);
*((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
*((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U |
PG_M | pg_nx;
}
for (i = NDML2_ENTRIES; i < ndmpdp; i++) {
paddr_t pdp;
vaddr_t va;
pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
va = PMAP_DIRECT_MAP(pdp);
*((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT);
*((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx;
}
kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U |
PG_M | pg_nx;
/* Map any remaining physical memory > 512GB */
for (curslot = 1 ; curslot < NUM_L4_SLOT_DIRECT ; curslot++) {
/*
* Start of current range starts at PA (curslot) * 512GB
*/
start_cur = (paddr_t)(curslot * NBPD_L4);
if (max_pa > start_cur) {
/* Next 512GB, new PML4e and L3(512GB) page */
dmpd = first_avail; first_avail += PAGE_SIZE;
pml3 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
kpm->pm_pdir[PDIR_SLOT_DIRECT + curslot] = dmpd |
PG_KW | PG_V | PG_U | PG_M | pg_nx;
/* Calculate full 1GB pages in this 512GB region */
p = ((max_pa - start_cur) >> L3_SHIFT);
/* Check if a partial (<1GB) page remains */
if (max_pa & L2_MASK)
p++;
/*
* Handle the case where this range is full and there
* is still more memory after (p would be > 512).
*/
if (p > NPDPG)
p = NPDPG;
/* Allocate 'p' L2(1GB) pages and populate */
for (i = 0; i < p; i++) {
dmpd = first_avail; first_avail += PAGE_SIZE;
pml2 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
pml3[i] = dmpd |
PG_RW | PG_V | PG_U | PG_M | pg_nx;
cur_pa = start_cur + (i << L3_SHIFT);
j = 0;
while (cur_pa < max_pa && j < NPDPG) {
pml2[j] = curslot * NBPD_L4 +
(uint64_t)i * NBPD_L3 +
(uint64_t)j * NBPD_L2;
pml2[j] |= PG_RW | PG_V | pg_g_kern |
PG_U | PG_M | pg_nx | PG_PS;
cur_pa += NBPD_L2;
j++;
}
}
}
}
tlbflush();
msgbuf_vaddr = virtual_avail;
virtual_avail += round_page(MSGBUFSIZE);
idt_vaddr = virtual_avail;
virtual_avail += 2 * PAGE_SIZE;
idt_paddr = first_avail; /* steal a page */
first_avail += 2 * PAGE_SIZE;
#if defined(MULTIPROCESSOR) || \
(NACPI > 0 && !defined(SMALL_KERNEL))
/*
* Grab a page below 4G for things that need it (i.e.
* having an initial %cr3 for the MP trampoline).
*/
lo32_vaddr = virtual_avail;
virtual_avail += PAGE_SIZE;
lo32_paddr = first_avail;
first_avail += PAGE_SIZE;
#endif
/*
* init the global lists.
*/
LIST_INIT(&pmaps);
/*
* initialize the pmap pools.
*/
pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_VM, 0,
"pmappl", NULL);
pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0,
"pvpl", &pool_allocator_single);
pool_sethiwat(&pmap_pv_pool, 32 * 1024);
/*
* initialize the PDE pool.
*/
pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_VM, 0,
"pdppl", &pool_allocator_single);
kpm->pm_pdir_intel = NULL;
kpm->pm_pdirpa_intel = 0;
/*
* ensure the TLB is sync'd with reality by flushing it...
*/
tlbflush();
return first_avail;
}
/*
* pmap_randomize
*
* Randomizes the location of the kernel pmap
*/
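/*
 * A fresh PML4 page is allocated and the bootstrap PML4 copied into it,
 * then %cr3 is switched over. pmap_randomize_level() repeats this for
 * every lower-level paging structure page, and the bootstrap pages are
 * wiped once their replacements are in place.
 */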
void
pmap_randomize(void)
{
pd_entry_t *pml4va, *oldpml4va;
paddr_t pml4pa;
int i;
pml4va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
if (pml4va == NULL)
panic("%s: km_alloc failed", __func__);
/* Copy old PML4 page to new one */
oldpml4va = pmap_kernel()->pm_pdir;
memcpy(pml4va, oldpml4va, PAGE_SIZE);
/* Switch to new PML4 */
pmap_extract(pmap_kernel(), (vaddr_t)pml4va, &pml4pa);
lcr3(pml4pa);
/* Fixup pmap_kernel and proc0's %cr3 */
pmap_kernel()->pm_pdirpa = pml4pa;
pmap_kernel()->pm_pdir = pml4va;
proc0.p_addr->u_pcb.pcb_cr3 = pml4pa;
/* Fixup recursive PTE PML4E slot. We are only changing the PA */
pml4va[PDIR_SLOT_PTE] = pml4pa | (pml4va[PDIR_SLOT_PTE] & ~PG_FRAME);
for (i = 0; i < NPDPG; i++) {
/* PTE slot already handled earlier */
if (i == PDIR_SLOT_PTE)
continue;
if (pml4va[i] & PG_FRAME)
pmap_randomize_level(&pml4va[i], 3);
}
/* Wipe out bootstrap PML4 */
memset(oldpml4va, 0, PAGE_SIZE);
tlbflush();
}
void
pmap_randomize_level(pd_entry_t *pde, int level)
{
pd_entry_t *new_pd_va;
paddr_t old_pd_pa, new_pd_pa;
vaddr_t old_pd_va;
struct vm_page *pg;
int i;
if (level == 0)
return;
if (level < PTP_LEVELS - 1 && (*pde & PG_PS))
return;
new_pd_va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
if (new_pd_va == NULL)
panic("%s: cannot allocate page for L%d page directory",
__func__, level);
old_pd_pa = *pde & PG_FRAME;
old_pd_va = PMAP_DIRECT_MAP(old_pd_pa);
pmap_extract(pmap_kernel(), (vaddr_t)new_pd_va, &new_pd_pa);
memcpy(new_pd_va, (void *)old_pd_va, PAGE_SIZE);
*pde = new_pd_pa | (*pde & ~PG_FRAME);
tlbflush();
memset((void *)old_pd_va, 0, PAGE_SIZE);
pg = PHYS_TO_VM_PAGE(old_pd_pa);
if (pg != NULL) {
pg->wire_count--;
pmap_kernel()->pm_stats.resident_count--;
if (pg->wire_count <= 1)
uvm_pagefree(pg);
}
for (i = 0; i < NPDPG; i++)
if (new_pd_va[i] & PG_FRAME)
pmap_randomize_level(&new_pd_va[i], level - 1);
}
/*
* Pre-allocate PTPs for low memory, so that 1:1 mappings for various
* trampoline code can be entered.
*/
paddr_t
pmap_prealloc_lowmem_ptps(paddr_t first_avail)
{
pd_entry_t *pdes;
int level;
paddr_t newp;
pdes = pmap_kernel()->pm_pdir;
level = PTP_LEVELS;
for (;;) {
newp = first_avail; first_avail += PAGE_SIZE;
memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
level--;
if (level <= 1)
break;
pdes = normal_pdes[level - 2];
}
return first_avail;
}
/*
* pmap_init: no further initialization required on this platform
*/
void
pmap_init(void)
{
pmap_initialized = 1;
}
/*
* p v _ e n t r y f u n c t i o n s
*/
/*
* main pv_entry manipulation functions:
* pmap_enter_pv: enter a mapping onto a pv list
* pmap_remove_pv: remove a mapping from a pv list
*/
/*
* pmap_enter_pv: enter a mapping onto a pv list
*
* => caller should adjust ptp's wire_count before calling
*
* pve: preallocated pve for us to use
* ptp: PTP in pmap that maps this VA
*/
void
pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
vaddr_t va, struct vm_page *ptp)
{
pve->pv_pmap = pmap;
pve->pv_va = va;
pve->pv_ptp = ptp; /* NULL for kernel pmap */
mtx_enter(&pg->mdpage.pv_mtx);
pve->pv_next = pg->mdpage.pv_list; /* add to ... */
pg->mdpage.pv_list = pve; /* ... list */
mtx_leave(&pg->mdpage.pv_mtx);
}
/*
* pmap_remove_pv: try to remove a mapping from a pv_list
*
* => caller should adjust ptp's wire_count and free PTP if needed
* => we return the removed pve
*/
struct pv_entry *
pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
{
struct pv_entry *pve, **prevptr;
mtx_enter(&pg->mdpage.pv_mtx);
prevptr = &pg->mdpage.pv_list;
	while ((pve = *prevptr) != NULL) {
		if (pve->pv_pmap == pmap && pve->pv_va == va) {	/* match? */
			*prevptr = pve->pv_next;		/* remove it! */
break;
}
prevptr = &pve->pv_next; /* previous pointer */
}
mtx_leave(&pg->mdpage.pv_mtx);
return(pve); /* return removed pve */
}
/*
* p t p f u n c t i o n s
*/
struct vm_page *
pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
{
int lidx = level - 1;
struct vm_page *pg;
if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
return (pmap->pm_ptphint[lidx]);
pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
return pg;
}
void
pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
struct pg_to_free *pagelist)
{
int lidx;
struct uvm_object *obj;
lidx = level - 1;
obj = &pmap->pm_obj[lidx];
pmap->pm_stats.resident_count--;
if (pmap->pm_ptphint[lidx] == ptp)
pmap->pm_ptphint[lidx] = RBT_ROOT(uvm_objtree, &obj->memt);
ptp->wire_count = 0;
uvm_pagerealloc(ptp, NULL, 0);
TAILQ_INSERT_TAIL(pagelist, ptp, pageq);
}
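/*
 * pmap_free_ptp: free the PTP mapping 'va' and walk up the paging
 * hierarchy, also freeing any parent PTP whose wire_count drops to one
 * (i.e. it no longer maps any child PTPs).
 */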
void
pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
struct pg_to_free *pagelist)
{
unsigned long index;
int level;
vaddr_t invaladdr;
level = 1;
do {
pmap_freepage(pmap, ptp, level, pagelist);
index = pl_i(va, level + 1);
pmap_pte_set(&normal_pdes[level - 1][index], 0);
if (level == PTP_LEVELS - 1 && pmap->pm_pdir_intel != NULL) {
/* Zap special meltdown PML4e */
pmap_pte_set(&pmap->pm_pdir_intel[index], 0);
DPRINTF("%s: cleared meltdown PML4e @ index %lu "
"(va range start 0x%llx)\n", __func__, index,
(uint64_t)(index << L4_SHIFT));
}
invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
(vaddr_t)normal_pdes[level - 2];
pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE,
pmap_is_curpmap(curpcb->pcb_pmap));
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
ptp->wire_count--;
if (ptp->wire_count > 1)
break;
}
} while (++level < PTP_LEVELS);
}
/*
* pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
*
* => pmap should NOT be pmap_kernel()
*/
struct vm_page *
pmap_get_ptp(struct pmap *pmap, vaddr_t va)
{
struct vm_page *ptp, *pptp;
int i;
unsigned long index;
pd_entry_t *pva, *pva_intel;
paddr_t ppa, pa;
struct uvm_object *obj;
ptp = NULL;
pa = (paddr_t)-1;
/*
* Loop through all page table levels seeing if we need to
* add a new page to that level.
*/
for (i = PTP_LEVELS; i > 1; i--) {
/*
* Save values from previous round.
*/
pptp = ptp;
ppa = pa;
index = pl_i(va, i);
pva = normal_pdes[i - 2];
if (pmap_valid_entry(pva[index])) {
ppa = pva[index] & PG_FRAME;
ptp = NULL;
continue;
}
obj = &pmap->pm_obj[i-2];
ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
return NULL;
atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
ptp->wire_count = 1;
pmap->pm_ptphint[i - 2] = ptp;
pa = VM_PAGE_TO_PHYS(ptp);
pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
/*
* Meltdown Special case - if we are adding a new PML4e for
* usermode addresses, just copy the PML4e to the U-K page
* table.
*/
if (pmap->pm_pdir_intel != NULL && i == PTP_LEVELS &&
va < VM_MAXUSER_ADDRESS) {
pva_intel = pmap->pm_pdir_intel;
pva_intel[index] = pva[index];
DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
"from 0x%llx -> 0x%llx\n", __func__, pva[index],
(uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
}
pmap->pm_stats.resident_count++;
/*
* If we're not in the top level, increase the
* wire count of the parent page.
*/
		if (i < PTP_LEVELS) {
			if (pptp == NULL)
pptp = pmap_find_ptp(pmap, va, ppa, i);
#ifdef DIAGNOSTIC
			if (pptp == NULL)
				panic("%s: pde page disappeared", __func__);
#endif
pptp->wire_count++;
}
}
/*
* ptp is not NULL if we just allocated a new ptp. If it's
* still NULL, we must look up the existing one.
*/
if (ptp == NULL) {
ptp = pmap_find_ptp(pmap, va, ppa, 1);
#ifdef DIAGNOSTIC
		if (ptp == NULL) {
			printf("va %lx ppa %lx\n", (unsigned long)va,
(unsigned long)ppa);
panic("%s: unmanaged user PTP", __func__);
}
#endif
}
pmap->pm_ptphint[0] = ptp;
return(ptp);
}
/*
* p m a p l i f e c y c l e f u n c t i o n s
*/
/*
* pmap_pdp_ctor: constructor for the PDP cache.
*/
void
pmap_pdp_ctor(pd_entry_t *pdir)
{
paddr_t pdirpa;
int npde, i;
struct pmap *kpm = pmap_kernel();
/* fetch the physical address of the page directory. */
(void) pmap_extract(kpm, (vaddr_t) pdir, &pdirpa);
/* zero init area */
memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
/* put in recursive PDE to map the PTEs */
pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx;
npde = nkptp[PTP_LEVELS - 1];
/* put in kernel VM PDEs */
memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
npde * sizeof(pd_entry_t));
/* zero the rest */
memset(&pdir[PDIR_SLOT_KERN + npde], 0,
(NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
for (i = 0; i < NUM_L4_SLOT_DIRECT; i++)
pdir[PDIR_SLOT_DIRECT + i] = kpm->pm_pdir[PDIR_SLOT_DIRECT + i];
#if VM_MIN_KERNEL_ADDRESS != KERNBASE
pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)];
#endif
}
void
pmap_pdp_ctor_intel(pd_entry_t *pdir)
{
struct pmap *kpm = pmap_kernel();
/* Copy PML4es from pmap_kernel's U-K view */
memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
}
/*
* pmap_create: create a pmap
*
 * => note: old pmap interface took a "size" arg which allowed for
* the creation of "software only" pmaps (not in bsd).
*/
struct pmap *
pmap_create(void)
{
struct pmap *pmap;
int i;
pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
mtx_init(&pmap->pm_mtx, IPL_VM);
/* init uvm_object */
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, 1);
pmap->pm_ptphint[i] = NULL;
}
pmap->pm_stats.wired_count = 0;
pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */
pmap->pm_type = PMAP_TYPE_NORMAL;
pmap->eptp = 0;
/* allocate PDP */
/*
* note that there is no need to splvm to protect us from
* malloc since malloc allocates out of a submap and we should
* have already allocated kernel PTPs to cover the range...
*/
pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
pmap_pdp_ctor(pmap->pm_pdir);
pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;
/*
* Intel CPUs need a special page table to be used during usermode
* execution, one that lacks all kernel mappings.
*/
if (cpu_meltdown) {
pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK);
pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
pmap->pm_stats.resident_count++;
if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
&pmap->pm_pdirpa_intel))
panic("%s: unknown PA mapping for meltdown PML4",
__func__);
} else {
pmap->pm_pdir_intel = NULL;
pmap->pm_pdirpa_intel = 0;
}
mtx_enter(&pmaps_lock);
LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
mtx_leave(&pmaps_lock);
return (pmap);
}
/*
* pmap_destroy: drop reference count on pmap. free pmap if
* reference count goes to zero.
*/
void
pmap_destroy(struct pmap *pmap)
{
struct vm_page *pg;
int refs;
int i;
/*
* drop reference count
*/
refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs);
if (refs > 0) {
return;
}
/*
* remove it from global list of pmaps
*/
mtx_enter(&pmaps_lock);
LIST_REMOVE(pmap, pm_list);
mtx_leave(&pmaps_lock);
/*
* free any remaining PTPs
*/
for (i = 0; i < PTP_LEVELS - 1; i++) {
while ((pg = RBT_ROOT(uvm_objtree,
&pmap->pm_obj[i].memt)) != NULL) {
KASSERT((pg->pg_flags & PG_BUSY) == 0);
pg->wire_count = 0;
pmap->pm_stats.resident_count--;
uvm_pagefree(pg);
}
}
pool_put(&pmap_pdp_pool, pmap->pm_pdir);
if (pmap->pm_pdir_intel != NULL) {
pmap->pm_stats.resident_count--;
pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
}
pool_put(&pmap_pmap_pool, pmap);
}
/*
* Add a reference to the specified pmap.
*/
void
pmap_reference(struct pmap *pmap)
{
atomic_inc_int(&pmap->pm_obj[0].uo_refs);
}
/*
* pmap_activate: activate a process' pmap (fill in %cr3)
*
* => called from cpu_fork() and when switching pmaps during exec
* => if p is the curproc, then load it into the MMU
*/
void
pmap_activate(struct proc *p)
{
struct pcb *pcb = &p->p_addr->u_pcb;
struct pmap *pmap = p->p_vmspace->vm_map.pmap;
pcb->pcb_pmap = pmap;
pcb->pcb_cr3 = pmap->pm_pdirpa;
pcb->pcb_cr3 |= (pmap != pmap_kernel()) ? cr3_pcid_proc :
(PCID_KERN | cr3_reuse_pcid);
if (p != curproc)
return;
if ((p->p_flag & P_SYSTEM) == 0) {
struct cpu_info *self = curcpu();
/* mark the pmap in use by this processor */
self->ci_proc_pmap = pmap;
/* in case we return to userspace without context switching */
if (cpu_meltdown) {
self->ci_kern_cr3 = pcb->pcb_cr3 | cr3_reuse_pcid;
self->ci_user_cr3 = pmap->pm_pdirpa_intel |
cr3_pcid_proc_intel;
}
}
lcr3(pcb->pcb_cr3);
}
/*
* pmap_deactivate: deactivate a process' pmap
*/
void
pmap_deactivate(struct proc *p)
{
if ((p->p_flag & P_SYSTEM) == 0) {
struct cpu_info *self = curcpu();
/*
* mark the pmap no longer in use by this processor.
*/
KASSERT(self->ci_proc_pmap == p->p_vmspace->vm_map.pmap);
self->ci_proc_pmap = NULL;
}
}
/*
* end of lifecycle functions
*/
/*
* some misc. functions
*/
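/*
 * pmap_pdes_valid: check that all PDEs above the PTE level mapping 'va'
 * are valid; if so, optionally return the lowest-level PDE in *lastpde.
 */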
int
pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde)
{
int i;
unsigned long index;
pd_entry_t pde;
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_i(va, i);
pde = normal_pdes[i - 2][index];
if (!pmap_valid_entry(pde))
return 0;
}
if (lastpde != NULL)
*lastpde = pde;
return 1;
}
/*
* pmap_extract: extract a PA for the given VA
*/
int
pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
pt_entry_t *ptes, pte;
int level, offs;
if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
va < PMAP_DIRECT_END) {
*pap = va - PMAP_DIRECT_BASE;
return 1;
}
	if (pmap != pmap_kernel())
		mtx_enter(&pmap->pm_mtx);
level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
pte = ptes[offs];
	if (pmap != pmap_kernel())
		mtx_leave(&pmap->pm_mtx);
if (__predict_true(level == 0 && pmap_valid_entry(pte))) {
		if (pap != NULL)
			*pap = (pte & PG_FRAME) | (va & PAGE_MASK);
return 1;
}
	if (level == 1 && (pte & (PG_PS|PG_V)) == (PG_PS|PG_V)) {
		if (pap != NULL)
			*pap = (pte & PG_LGFRAME) | (va & PAGE_MASK_L2);
return 1;
}
return 0;
}
/*
* pmap_zero_page: zero a page
*/
void
pmap_zero_page(struct vm_page *pg)
{
pagezero(pmap_map_direct(pg));
}
/*
* pmap_flush_cache: flush the cache for a virtual address.
*/
void
pmap_flush_cache(vaddr_t addr, vsize_t len)
{
vaddr_t i;
if (curcpu()->ci_cflushsz == 0) {
wbinvd_on_all_cpus();
return;
}
/* all cpus that have clflush also have mfence. */
mfence();
for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
clflush(i);
mfence();
}
/*
* pmap_copy_page: copy a page
*/
void
pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
{
vaddr_t srcva = pmap_map_direct(srcpg);
vaddr_t dstva = pmap_map_direct(dstpg);
memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
}
/*
* p m a p r e m o v e f u n c t i o n s
*
* functions that remove mappings
*/
/*
* pmap_remove_ptes: remove PTEs from a PTP
*
* => PTP must be mapped into KVA
* => PTP should be null if pmap == pmap_kernel()
*/
void
pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
{
struct pv_entry *pve;
pt_entry_t *pte = (pt_entry_t *) ptpva;
struct vm_page *pg;
pt_entry_t opte;
/*
* note that ptpva points to the PTE that maps startva. this may
* or may not be the first PTE in the PTP.
*
* we loop through the PTP while there are still PTEs to look at
* and the wire_count is greater than 1 (because we use the wire_count
* to keep track of the number of real PTEs in the PTP).
*/
for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
; pte++, startva += PAGE_SIZE) {
if (!pmap_valid_entry(*pte))
continue; /* VA not mapped */
if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
continue;
}
/* atomically save the old PTE and zap! it */
opte = pmap_pte_set(pte, 0);
		if (opte & PG_W)
			pmap->pm_stats.wired_count--;
pmap->pm_stats.resident_count--;
		if (ptp != NULL)
			ptp->wire_count--;		/* dropping a PTE */
pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
/*
* if we are not on a pv list we are done.
*/
if ((opte & PG_PVLIST) == 0) {
#ifdef DIAGNOSTIC
			if (pg != NULL)
				panic("%s: managed page without PG_PVLIST: "
"va 0x%lx, opte 0x%llx", __func__,
startva, opte);
#endif
continue;
}
#ifdef DIAGNOSTIC
if (pg == NULL)
panic("%s: unmanaged page marked PG_PVLIST: "
"va 0x%lx, opte 0x%llx", __func__,
startva, opte);
#endif
/* sync R/M bits */
pmap_sync_flags_pte(pg, opte);
pve = pmap_remove_pv(pg, pmap, startva);
if (pve != NULL) {
pve->pv_next = *free_pvs;
*free_pvs = pve;
}
/* end of "for" loop: time for next pte */
}
}
/*
* pmap_remove_pte: remove a single PTE from a PTP
*
* => PTP must be mapped into KVA
* => PTP should be null if pmap == pmap_kernel()
* => returns true if we removed a mapping
*/
int
pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t va, int flags, struct pv_entry **free_pvs)
{
struct pv_entry *pve;
struct vm_page *pg;
pt_entry_t opte;
if (!pmap_valid_entry(*pte))
return 0; /* VA not mapped */
if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
return 0;
}
/* atomically save the old PTE and zap! it */
opte = pmap_pte_set(pte, 0);
	if (opte & PG_W)
		pmap->pm_stats.wired_count--;
pmap->pm_stats.resident_count--;
	if (ptp != NULL)
		ptp->wire_count--;		/* dropping a PTE */
pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
/*
* if we are not on a pv list we are done.
*/
if ((opte & PG_PVLIST) == 0) {
#ifdef DIAGNOSTIC
		if (pg != NULL)
			panic("%s: managed page without PG_PVLIST: "
"va 0x%lx, opte 0x%llx", __func__, va, opte);
#endif
return 1;
}
#ifdef DIAGNOSTIC
if (pg == NULL)
panic("%s: unmanaged page marked PG_PVLIST: "
"va 0x%lx, opte 0x%llx", __func__, va, opte);
#endif
/* sync R/M bits */
pmap_sync_flags_pte(pg, opte);
pve = pmap_remove_pv(pg, pmap, va);
if (pve != NULL) {
pve->pv_next = *free_pvs;
*free_pvs = pve;
}
return 1;
}
/*
* pmap_remove: top level mapping removal function
*
* => caller should not be holding any pmap locks
*/
void
pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
if (pmap->pm_type == PMAP_TYPE_EPT)
pmap_remove_ept(pmap, sva, eva);
else
pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
}
/*
* pmap_do_remove: mapping removal guts
*
* => caller should not be holding any pmap locks
*/
void
pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
{
pd_entry_t pde;
int result;
paddr_t ptppa;
vaddr_t blkendva;
struct vm_page *ptp;
struct pv_entry *pve;
struct pv_entry *free_pvs = NULL;
vaddr_t va;
int shootall = 0, shootself;
struct pg_to_free empty_ptps;
paddr_t scr3;
TAILQ_INIT(&empty_ptps);
scr3 = pmap_map_ptes(pmap);
shootself = (scr3 == 0);
/*
* removing one page? take shortcut function.
*/
if (sva + PAGE_SIZE == eva) {
if (pmap_pdes_valid(sva, &pde)) {
/* PA of the PTP */
ptppa = pde & PG_FRAME;
/* get PTP if non-kernel mapping */
if (pmap == pmap_kernel()) {
/* we never free kernel PTPs */
ptp = NULL;
} else {
ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
#ifdef DIAGNOSTIC
				if (ptp == NULL)
					panic("%s: unmanaged PTP detected "
"in shortcut path", __func__);
#endif
}
/* do it! */
result = pmap_remove_pte(pmap, ptp,
&PTE_BASE[pl1_i(sva)], sva, flags, &free_pvs);
/*
* if mapping removed and the PTP is no longer
* being used, free it!
*/
			if (result && ptp && ptp->wire_count <= 1)
				pmap_free_ptp(pmap, ptp, sva, &empty_ptps);
pmap_tlb_shootpage(pmap, sva, shootself);
pmap_unmap_ptes(pmap, scr3);
pmap_tlb_shootwait();
} else {
pmap_unmap_ptes(pmap, scr3);
}
goto cleanup;
}
if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
shootall = 1;
for (va = sva; va < eva; va = blkendva) {
/* determine range of block */
blkendva = x86_round_pdr(va + 1);
if (blkendva > eva)
blkendva = eva;
/*
* XXXCDC: our PTE mappings should never be removed
* with pmap_remove! if we allow this (and why would
* we?) then we end up freeing the pmap's page
* directory page (PDP) before we are finished using
 * it when we hit it in the recursive mapping. this
* is BAD.
*
* long term solution is to move the PTEs out of user
 * address space and into kernel address space (up
* with APTE). then we can set VM_MAXUSER_ADDRESS to
* be VM_MAX_ADDRESS.
*/
if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
/* XXXCDC: ugly hack to avoid freeing PDP here */
continue;
if (!pmap_pdes_valid(va, &pde))
continue;
/* PA of the PTP */
ptppa = pde & PG_FRAME;
/* get PTP if non-kernel mapping */
if (pmap == pmap_kernel()) {
/* we never free kernel PTPs */
ptp = NULL;
} else {
ptp = pmap_find_ptp(pmap, va, ptppa, 1);
#ifdef DIAGNOSTIC
			if (ptp == NULL)
				panic("%s: unmanaged PTP detected", __func__);
#endif
}
pmap_remove_ptes(pmap, ptp, (vaddr_t)&PTE_BASE[pl1_i(va)],
va, blkendva, flags, &free_pvs);
/* if PTP is no longer being used, free it! */
		if (ptp && ptp->wire_count <= 1) {
			pmap_free_ptp(pmap, ptp, va, &empty_ptps);
}
}
if (shootall)
pmap_tlb_shoottlb(pmap, shootself);
else
pmap_tlb_shootrange(pmap, sva, eva, shootself);
pmap_unmap_ptes(pmap, scr3);
pmap_tlb_shootwait();
cleanup:
while ((pve = free_pvs) != NULL) {
free_pvs = pve->pv_next;
pool_put(&pmap_pv_pool, pve);
}
while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
TAILQ_REMOVE(&empty_ptps, ptp, pageq);
uvm_pagefree(ptp);
}
}
/*
* pmap_page_remove: remove a managed vm_page from all pmaps that map it
*
* => R/M bits are sync'd back to attrs
*/
void
pmap_page_remove(struct vm_page *pg)
{
struct pv_entry *pve;
struct pmap *pm;
pt_entry_t opte;
#ifdef DIAGNOSTIC
pd_entry_t pde;
#endif
struct pg_to_free empty_ptps;
struct vm_page *ptp;
paddr_t scr3;
int shootself;
TAILQ_INIT(&empty_ptps);
mtx_enter(&pg->mdpage.pv_mtx);
while ((pve = pg->mdpage.pv_list) != NULL) {
pmap_reference(pve->pv_pmap);
pm = pve->pv_pmap;
mtx_leave(&pg->mdpage.pv_mtx);
/* XXX use direct map? */
scr3 = pmap_map_ptes(pm); /* locks pmap */
shootself = (scr3 == 0);
/*
* We dropped the pvlist lock before grabbing the pmap
* lock to avoid lock ordering problems. This means
* we have to check the pvlist again since somebody
* else might have modified it. All we care about is
* that the pvlist entry matches the pmap we just
* locked. If it doesn't, unlock the pmap and try
* again.
*/
mtx_enter(&pg->mdpage.pv_mtx);
if ((pve = pg->mdpage.pv_list) == NULL ||
pve->pv_pmap != pm) {
mtx_leave(&pg->mdpage.pv_mtx);
pmap_unmap_ptes(pm, scr3); /* unlocks pmap */
pmap_destroy(pm);
mtx_enter(&pg->mdpage.pv_mtx);
continue;
}
pg->mdpage.pv_list = pve->pv_next;
mtx_leave(&pg->mdpage.pv_mtx);
#ifdef DIAGNOSTIC
if (pve->pv_ptp != NULL && pmap_pdes_valid(pve->pv_va, &pde) &&
(pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__,
pg, pve->pv_va, pve->pv_ptp);
printf("%s: PTP's phys addr: "
"actual=%lx, recorded=%lx\n", __func__,
(unsigned long)(pde & PG_FRAME),
VM_PAGE_TO_PHYS(pve->pv_ptp));
panic("%s: mapped managed page has "
"invalid pv_ptp field", __func__);
}
#endif
/* atomically save the old PTE and zap it */
opte = pmap_pte_set(&PTE_BASE[pl1_i(pve->pv_va)], 0);
		if (opte & PG_W)
			pve->pv_pmap->pm_stats.wired_count--;
pve->pv_pmap->pm_stats.resident_count--;
pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself);
pmap_sync_flags_pte(pg, opte);
/* update the PTP reference count. free if last reference. */
if (pve->pv_ptp != NULL) {
pve->pv_ptp->wire_count--;
			if (pve->pv_ptp->wire_count <= 1) {
				pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
pve->pv_va, &empty_ptps);
}
}
pmap_unmap_ptes(pve->pv_pmap, scr3); /* unlocks pmap */
pmap_destroy(pve->pv_pmap);
pool_put(&pmap_pv_pool, pve);
mtx_enter(&pg->mdpage.pv_mtx);
}
mtx_leave(&pg->mdpage.pv_mtx);
pmap_tlb_shootwait();
while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
TAILQ_REMOVE(&empty_ptps, ptp, pageq);
uvm_pagefree(ptp);
}
}
/*
* p m a p a t t r i b u t e f u n c t i o n s
* functions that test/change managed page's attributes
* since a page can be mapped multiple times we must check each PTE that
* maps it by going down the pv lists.
*/
/*
* pmap_test_attrs: test a page's attributes
*/
int
pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
{
struct pv_entry *pve;
pt_entry_t *ptes;
int level, offs;
u_long mybits, testflags;
testflags = pmap_pte2flags(testbits);
if (pg->pg_flags & testflags)
return 1;
mybits = 0;
mtx_enter(&pg->mdpage.pv_mtx);
for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
pve = pve->pv_next) {
level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
&offs);
mybits |= (ptes[offs] & testbits);
}
	mtx_leave(&pg->mdpage.pv_mtx);
	if (mybits == 0)
return 0;
atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
return 1;
}
/*
* pmap_clear_attrs: change a page's attributes
*
* => we return 1 if we cleared one of the bits we were asked to
*/
int
pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
{
struct pv_entry *pve;
pt_entry_t *ptes, opte;
u_long clearflags;
int result, level, offs;
clearflags = pmap_pte2flags(clearbits);
result = pg->pg_flags & clearflags;
	if (result)
		atomic_clearbits_int(&pg->pg_flags, clearflags);
mtx_enter(&pg->mdpage.pv_mtx);
for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
&offs);
opte = ptes[offs];
if (opte & clearbits) {
result = 1;
pmap_pte_clearbits(&ptes[offs], (opte & clearbits));
pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va,
pmap_is_curpmap(pve->pv_pmap));
}
}
mtx_leave(&pg->mdpage.pv_mtx);
pmap_tlb_shootwait();
return (result != 0);
}
/*
* p m a p p r o t e c t i o n f u n c t i o n s
*/
/*
* pmap_page_protect: change the protection of all recorded mappings
* of a managed page
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_protect: set the protection in of the pages in a pmap
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_write_protect: write-protect pages in a pmap
*/
void
pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
pt_entry_t nx, *spte, *epte;
vaddr_t blockend;
int shootall = 0, shootself;
vaddr_t va;
paddr_t scr3;
scr3 = pmap_map_ptes(pmap);
shootself = (scr3 == 0);
/* should be ok, but just in case ... */
sva &= PG_FRAME;
eva &= PG_FRAME;
nx = 0;
if (!(prot & PROT_EXEC))
nx = pg_nx;
if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
shootall = 1;
for (va = sva; va < eva ; va = blockend) {
blockend = (va & L2_FRAME) + NBPD_L2;
if (blockend > eva)
blockend = eva;
/*
* XXXCDC: our PTE mappings should never be write-protected!
*
* long term solution is to move the PTEs out of user
 * address space and into kernel address space (up
* with APTE). then we can set VM_MAXUSER_ADDRESS to
* be VM_MAX_ADDRESS.
*/
/* XXXCDC: ugly hack to avoid freeing PDP here */
if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
continue;
/* empty block? */
if (!pmap_pdes_valid(va, NULL))
continue;
#ifdef DIAGNOSTIC
if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS)
panic("%s: PTE space", __func__);
#endif
spte = &PTE_BASE[pl1_i(va)];
epte = &PTE_BASE[pl1_i(blockend)];
	for (/* null */; spte < epte ; spte++) {
		if (!pmap_valid_entry(*spte))
continue;
pmap_pte_clearbits(spte, PG_RW);
pmap_pte_setbits(spte, nx);
}
}
if (shootall)
pmap_tlb_shoottlb(pmap, shootself);
else
pmap_tlb_shootrange(pmap, sva, eva, shootself);
pmap_unmap_ptes(pmap, scr3);
pmap_tlb_shootwait();
}
/*
* end of protection functions
*/
/*
* pmap_unwire: clear the wired bit in the PTE
*
* => mapping should already be in map
*/
void
pmap_unwire(struct pmap *pmap, vaddr_t va)
{
pt_entry_t *ptes;
int level, offs;
level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
if (level == 0) {
#ifdef DIAGNOSTIC
if (!pmap_valid_entry(ptes[offs]))
panic("%s: invalid (unmapped) va 0x%lx", __func__, va);
#endif
if (__predict_true((ptes[offs] & PG_W) != 0)) {
pmap_pte_clearbits(&ptes[offs], PG_W);
pmap->pm_stats.wired_count--;
}
#ifdef DIAGNOSTIC
else {
printf("%s: wiring for pmap %p va 0x%lx "
"didn't change!\n", __func__, pmap, va);
}
#endif
}
#ifdef DIAGNOSTIC
else {
panic("%s: invalid PDE", __func__);
}
#endif
}
/*
* pmap_collect: free resources held by a pmap
*
* => optional function.
* => called when a process is swapped out to free memory.
*/
void
pmap_collect(struct pmap *pmap)
{
/*
* free all of the pt pages by removing the physical mappings
* for its entire address space.
*/
/* pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
PMAP_REMOVE_SKIPWIRED);
*/
}
/*
* pmap_copy: copy mappings from one pmap to another
*
* => optional function
* void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
*/
/*
* defined as macro in pmap.h
*/
void
pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
{
uint64_t l4idx, l3idx, l2idx, l1idx;
pd_entry_t *pd, *ptp;
paddr_t npa;
struct pmap *pmap = pmap_kernel();
pt_entry_t *ptes;
int level, offs;
/* If CPU is secure, no need to do anything */
if (!cpu_meltdown)
return;
/* Must be kernel VA */
if (va < VM_MIN_KERNEL_ADDRESS)
panic("%s: invalid special mapping va 0x%lx requested",
__func__, va);
if (pmap->pm_pdir_intel == NULL)
pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
PR_WAITOK | PR_ZERO);
l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */
l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */
l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */
DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
"l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
(uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
/* Start at PML4 / top level */
pd = pmap->pm_pdir_intel;
if (pd == NULL)
panic("%s: PML4 not initialized for pmap @ %p", __func__,
pmap);
/* npa = physaddr of PDPT */
npa = pd[l4idx] & PMAP_PA_MASK;
/* Valid PML4e for the 512GB region containing va? */
if (!npa) {
/* No valid PML4E - allocate PDPT page and set PML4E */
ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
panic("%s: can't locate PDPT page", __func__);
pd[l4idx] = (npa | PG_RW | PG_V);
DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
"setting PML4e[%lld] = 0x%llx\n", __func__,
(uint64_t)npa, l4idx, pd[l4idx]);
}
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
if (pd == NULL)
panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
(uint64_t)npa);
/* npa = physaddr of PD page */
npa = pd[l3idx] & PMAP_PA_MASK;
/* Valid PDPTe for the 1GB region containing va? */
if (!npa) {
/* No valid PDPTe - allocate PD page and set PDPTe */
ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
panic("%s: can't locate PD page", __func__);
pd[l3idx] = (npa | PG_RW | PG_V);
DPRINTF("%s: allocated new PD page at phys 0x%llx, "
"setting PDPTe[%lld] = 0x%llx\n", __func__,
(uint64_t)npa, l3idx, pd[l3idx]);
}
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
if (pd == NULL)
panic("%s: can't locate PD page @ pa=0x%llx", __func__,
(uint64_t)npa);
/* npa = physaddr of PT page */
npa = pd[l2idx] & PMAP_PA_MASK;
/* Valid PDE for the 2MB region containing va? */
if (!npa) {
/* No valid PDE - allocate PT page and set PDE */
ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
panic("%s: can't locate PT page", __func__);
pd[l2idx] = (npa | PG_RW | PG_V);
DPRINTF("%s: allocated new PT page at phys 0x%llx, "
"setting PDE[%lld] = 0x%llx\n", __func__,
(uint64_t)npa, l2idx, pd[l2idx]);
}
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
if (pd == NULL)
panic("%s: can't locate PT page @ pa=0x%llx", __func__,
(uint64_t)npa);
DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
"0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd,
(uint64_t)prot, (uint64_t)pd[l1idx]);
pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_W;
/*
* Look up the corresponding U+K entry. If we're installing the
* same PA into the U-K map then set the PG_G bit on both and copy
* the cache-control bits from the U+K entry to the U-K entry.
*/
level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) {
if (((pd[l1idx] ^ ptes[offs]) & PG_FRAME) == 0) {
pd[l1idx] |= PG_G | (ptes[offs] & (PG_N | PG_WT));
ptes[offs] |= PG_G;
} else {
DPRINTF("%s: special diffing mapping at %llx\n",
__func__, (long long)va);
}
} else
DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
}
void
pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
{
vaddr_t v;
#if NVMM > 0
struct vmx_invept_descriptor vid;
#endif /* NVMM > 0 */
DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
(uint64_t)egpa);
for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
pmap_do_remove_ept(pmap, v);
#if NVMM > 0
	if (pmap->eptp != 0) {
		memset(&vid, 0, sizeof(vid));
vid.vid_eptp = pmap->eptp;
DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__,
vid.vid_eptp);
invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid);
}
#endif /* NVMM > 0 */
}
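/*
 * pmap_do_remove_ept: remove the EPT mapping of a single guest-physical
 * address. The EPT paging structures are walked via the direct map; after
 * the leaf PTE is cleared, intermediate pages that no longer map anything
 * are unlinked from their parents and freed.
 */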
void
pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
{
uint64_t l4idx, l3idx, l2idx, l1idx;
struct vm_page *pg3, *pg2, *pg1;
paddr_t npa3, npa2, npa1;
pd_entry_t *pd4, *pd3, *pd2, *pd1;
pd_entry_t *pptes;
l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
/* Start at PML4 / top level */
pd4 = (pd_entry_t *)pmap->pm_pdir;
if (pd4 == NULL)
return;
/* npa3 = physaddr of PDPT */
npa3 = pd4[l4idx] & PMAP_PA_MASK;
if (!npa3)
return;
pd3 = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
pg3 = PHYS_TO_VM_PAGE(npa3);
/* npa2 = physaddr of PD page */
npa2 = pd3[l3idx] & PMAP_PA_MASK;
if (!npa2)
return;
pd2 = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
pg2 = PHYS_TO_VM_PAGE(npa2);
/* npa1 = physaddr of PT page */
npa1 = pd2[l2idx] & PMAP_PA_MASK;
if (!npa1)
return;
pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
pg1 = PHYS_TO_VM_PAGE(npa1);
if (pd1[l1idx] == 0)
return;
pd1[l1idx] = 0;
pg1->wire_count--;
pmap->pm_stats.resident_count--;
if (pg1->wire_count > 1)
return;
pg1->wire_count = 0;
pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
pptes[l2idx] = 0;
uvm_pagefree(pg1);
pmap->pm_stats.resident_count--;
pg2->wire_count--;
if (pg2->wire_count > 1)
return;
pg2->wire_count = 0;
pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
pptes[l3idx] = 0;
uvm_pagefree(pg2);
pmap->pm_stats.resident_count--;
pg3->wire_count--;
if (pg3->wire_count > 1)
return;
pg3->wire_count = 0;
pptes = pd4;
pptes[l4idx] = 0;
uvm_pagefree(pg3);
pmap->pm_stats.resident_count--;
}
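/*
 * pmap_enter_ept: enter a guest-physical to host-physical mapping into an
 * EPT pmap, allocating intermediate paging structure pages as needed.
 * Intermediate levels are given full R/W/X permissions; the requested
 * protection is applied only at the final (4KB) level.
 */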
int
pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
{
uint64_t l4idx, l3idx, l2idx, l1idx;
pd_entry_t *pd, npte;
struct vm_page *ptp, *pptp;
paddr_t npa;
struct uvm_object *obj;
if (gpa > MAXDSIZ)
return ENOMEM;
l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
/* Start at PML4 / top level */
pd = (pd_entry_t *)pmap->pm_pdir;
if (pd == NULL)
return ENOMEM;
/* npa = physaddr of PDPT */
npa = pd[l4idx] & PMAP_PA_MASK;
/* Valid PML4e for the 512GB region containing gpa? */
if (!npa) {
/* No valid PML4e - allocate PDPT page and set PML4e */
obj = &pmap->pm_obj[2]; /* PML4 UVM object */
ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 3), NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
return ENOMEM;
/*
* New PDPT page - we are setting the first entry, so set
* the wired count to 1
*/
ptp->wire_count = 1;
/* Calculate phys address of this new PDPT page */
npa = VM_PAGE_TO_PHYS(ptp);
/*
* Higher levels get full perms; specific permissions are
* entered at the lowest level.
*/
pd[l4idx] = (npa | EPT_R | EPT_W | EPT_X);
pmap->pm_stats.resident_count++;
pptp = ptp;
} else {
/* Already allocated PML4e */
pptp = PHYS_TO_VM_PAGE(npa);
}
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
if (pd == NULL)
panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
(uint64_t)npa);
/* npa = physaddr of PD page */
npa = pd[l3idx] & PMAP_PA_MASK;
/* Valid PDPTe for the 1GB region containing gpa? */
if (!npa) {
/* No valid PDPTe - allocate PD page and set PDPTe */
obj = &pmap->pm_obj[1]; /* PDPT UVM object */
ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 2), NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
return ENOMEM;
/*
* New PD page - we are setting the first entry, so set
* the wired count to 1
*/
ptp->wire_count = 1;
pptp->wire_count++;
npa = VM_PAGE_TO_PHYS(ptp);
/*
* Higher levels get full perms; specific permissions are
* entered at the lowest level.
*/
pd[l3idx] = (npa | EPT_R | EPT_W | EPT_X);
pmap->pm_stats.resident_count++;
pptp = ptp;
} else {
/* Already allocated PDPTe */
pptp = PHYS_TO_VM_PAGE(npa);
}
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
if (pd == NULL)
panic("%s: can't locate PD page @ pa=0x%llx", __func__,
(uint64_t)npa);
/* npa = physaddr of PT page */
npa = pd[l2idx] & PMAP_PA_MASK;
/* Valid PDE for the 2MB region containing gpa? */
if (!npa) {
/* No valid PDE - allocate PT page and set PDE */
obj = &pmap->pm_obj[0]; /* PDE UVM object */
ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 1), NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
return ENOMEM;
pptp->wire_count++;
npa = VM_PAGE_TO_PHYS(ptp);
/*
 * Higher levels get full perms; specific permissions are
* entered at the lowest level.
*/
pd[l2idx] = (npa | EPT_R | EPT_W | EPT_X);
pmap->pm_stats.resident_count++;
} else {
/* Find final ptp */
ptp = PHYS_TO_VM_PAGE(npa);
if (ptp == NULL)
panic("%s: ptp page vanished?", __func__);
}
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
if (pd == NULL)
panic("%s: can't locate PT page @ pa=0x%llx", __func__,
(uint64_t)npa);
npte = hpa | EPT_WB;
if (prot & PROT_READ)
npte |= EPT_R;
if (prot & PROT_WRITE)
npte |= EPT_W;
if (prot & PROT_EXEC)
npte |= EPT_X;
if (pd[l1idx] == 0) {
ptp->wire_count++;
pmap->pm_stats.resident_count++;
} else {
/* XXX flush ept */
}
pd[l1idx] = npte;
return 0;
}
/*
* pmap_enter: enter a mapping into a pmap
*
* => must be done "now" ... no lazy-evaluation
*/
int
pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
{
pt_entry_t opte, npte;
struct vm_page *ptp, *pg = NULL;
struct pv_entry *pve, *opve = NULL;
int ptpdelta, wireddelta, resdelta;
int wired = (flags & PMAP_WIRED) != 0;
int nocache = (pa & PMAP_NOCACHE) != 0;
int wc = (pa & PMAP_WC) != 0;
int error, shootself;
paddr_t scr3;
if (pmap->pm_type == PMAP_TYPE_EPT)
		return pmap_enter_ept(pmap, va, pa, prot);
	KASSERT(!(wc && nocache));
pa &= PMAP_PA_MASK;
#ifdef DIAGNOSTIC
if (va == (vaddr_t) PDP_BASE)
panic("%s: trying to map over PDP!", __func__);
/* sanity check: kernel PTPs should already have been pre-allocated */
if (va >= VM_MIN_KERNEL_ADDRESS &&
!pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
panic("%s: missing kernel PTP for va %lx!", __func__, va);
#endif
pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
if (pve == NULL) {
if (flags & PMAP_CANFAIL) {
error = ENOMEM;
goto out;
}
panic("%s: no pv entries available", __func__);
}
/*
* map in ptes and get a pointer to our PTP (unless we are the kernel)
*/
scr3 = pmap_map_ptes(pmap);
shootself = (scr3 == 0);
if (pmap == pmap_kernel()) {
ptp = NULL;
} else {
ptp = pmap_get_ptp(pmap, va);
if (ptp == NULL) {
if (flags & PMAP_CANFAIL) {
pmap_unmap_ptes(pmap, scr3);
error = ENOMEM;
goto out;
}
panic("%s: get ptp failed", __func__);
}
}
opte = PTE_BASE[pl1_i(va)]; /* old PTE */
/*
* is there currently a valid mapping at our VA?
*/
if (pmap_valid_entry(opte)) {
/*
* first, calculate pm_stats updates. resident count will not
* change since we are replacing/changing a valid mapping.
* wired count might change...
*/
resdelta = 0;
if (wired && (opte & PG_W) == 0)
wireddelta = 1;
else if (!wired && (opte & PG_W) != 0)
wireddelta = -1;
else
wireddelta = 0;
ptpdelta = 0;
/*
* is the currently mapped PA the same as the one we
* want to map?
*/
if ((opte & PG_FRAME) == pa) {
/* if this is on the PVLIST, sync R/M bit */
if (opte & PG_PVLIST) {
pg = PHYS_TO_VM_PAGE(pa);
#ifdef DIAGNOSTIC
if (pg == NULL)
panic("%s: same pa, PG_PVLIST "
"mapping with unmanaged page: "
"va 0x%lx, opte 0x%llx, pa 0x%lx",
__func__, va, opte, pa);
#endif
pmap_sync_flags_pte(pg, opte);
} else {
#ifdef DIAGNOSTIC
				if (PHYS_TO_VM_PAGE(pa) != NULL)
					panic("%s: same pa, no PG_PVLIST "
"mapping with managed page: "
"va 0x%lx, opte 0x%llx, pa 0x%lx",
__func__, va, opte, pa);
#endif
}
goto enter_now;
}
/*
* changing PAs: we must remove the old one first
*/
/*
* if current mapping is on a pvlist,
* remove it (sync R/M bits)
*/
if (opte & PG_PVLIST) {
pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
#ifdef DIAGNOSTIC
if (pg == NULL)
panic("%s: PG_PVLIST mapping with unmanaged "
"page: va 0x%lx, opte 0x%llx, pa 0x%lx",
__func__, va, opte, pa);
#endif
pmap_sync_flags_pte(pg, opte);
opve = pmap_remove_pv(pg, pmap, va);
pg = NULL; /* This is not the page we are looking for */
}
} else { /* opte not valid */
resdelta = 1;
if (wired)
wireddelta = 1;
else
wireddelta = 0;
if (ptp != NULL)
ptpdelta = 1;
else
ptpdelta = 0;
}
/*
* pve is either NULL or points to a now-free pv_entry structure
* (the latter case is if we called pmap_remove_pv above).
*
* if this entry is to be on a pvlist, enter it now.
*/
if (pmap_initialized)
pg = PHYS_TO_VM_PAGE(pa);
if (pg != NULL) {
pmap_enter_pv(pg, pve, pmap, va, ptp);
pve = NULL;
}
enter_now:
/*
* at this point pg is !NULL if we want the PG_PVLIST bit set
*/
pmap->pm_stats.resident_count += resdelta;
pmap->pm_stats.wired_count += wireddelta;
	if (ptp != NULL)
		ptp->wire_count += ptpdelta;
	KASSERT(pg == PHYS_TO_VM_PAGE(pa));
npte = pa | protection_codes[prot] | PG_V;
if (pg != NULL) {
npte |= PG_PVLIST;
/*
* make sure that if the page is write combined all
* instances of pmap_enter make it so.
*/
		if (pg->pg_flags & PG_PMAP_WC) {
			KASSERT(nocache == 0);
wc = 1;
}
}
if (wc)
npte |= pmap_pg_wc;
if (wired)
npte |= PG_W;
if (nocache)
npte |= PG_N;
if (va < VM_MAXUSER_ADDRESS)
npte |= PG_u;
else if (va < VM_MAX_ADDRESS)
npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
if (pmap == pmap_kernel())
npte |= pg_g_kern;
/*
* If the old entry wasn't valid, we can just update it and
* go. If it was valid, and this isn't a read->write
* transition, then we can safely just update it and flush
* any old TLB entries.
*
* If it _was_ valid and this _is_ a read->write transition,
* then this could be a CoW resolution and we need to make
* sure no CPU can see the new writable mapping while another
* still has the old mapping in its TLB, so insert a correct
* but unwritable mapping, flush any old TLB entries, then
* make it writable.
*/
if (! pmap_valid_entry(opte)) {
PTE_BASE[pl1_i(va)] = npte;
} else if ((opte | (npte ^ PG_RW)) & PG_RW) {
/* previously writable or not making writable */
PTE_BASE[pl1_i(va)] = npte;
		if (nocache && (opte & PG_N) == 0)
			wbinvd_on_all_cpus();
pmap_tlb_shootpage(pmap, va, shootself);
} else {
PTE_BASE[pl1_i(va)] = npte ^ PG_RW;
		if (nocache && (opte & PG_N) == 0)	/* XXX impossible? */
			wbinvd_on_all_cpus();
pmap_tlb_shootpage(pmap, va, shootself);
pmap_tlb_shootwait();
PTE_BASE[pl1_i(va)] = npte;
}
pmap_unmap_ptes(pmap, scr3);
pmap_tlb_shootwait();
error = 0;
out:
	if (pve != NULL)
		pool_put(&pmap_pv_pool, pve);
	if (opve != NULL)
		pool_put(&pmap_pv_pool, opve);
return error;
}
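/*
 * pmap_get_physpage: get a physical page to use as a kernel PTP at the
 * given level. Before UVM page initialization is done the page is stolen
 * from the physical segments; afterwards it comes from the kernel pmap's
 * PTP object.
 */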
int
pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
{
struct vm_page *ptp;
struct pmap *kpm = pmap_kernel();
if (uvm.page_init_done == 0) {
vaddr_t va;
/*
* we're growing the kernel pmap early (from
* uvm_pageboot_alloc()). this case must be
* handled a little differently.
*/
va = pmap_steal_memory(PAGE_SIZE, NULL, NULL);
*paddrp = PMAP_DIRECT_UNMAP(va);
} else {
ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
ptp_va2o(va, level), NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
panic("%s: out of memory", __func__);
atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
ptp->wire_count = 1;
*paddrp = VM_PAGE_TO_PHYS(ptp);
}
kpm->pm_stats.resident_count++;
return 1;
}
/*
 * Allocate the specified number of ptps for a ptp level, and populate
* all levels below accordingly, mapping virtual addresses starting at
* kva.
*
* Used by pmap_growkernel.
*/
void
pmap_alloc_level(vaddr_t kva, int lvl, long *needed_ptps)
{
unsigned long i;
vaddr_t va;
paddr_t pa;
unsigned long index, endindex;
int level;
pd_entry_t *pdep;
for (level = lvl; level > 1; level--) {
if (level == PTP_LEVELS)
pdep = pmap_kernel()->pm_pdir;
else
pdep = normal_pdes[level - 2];
va = kva;
index = pl_i(kva, level);
endindex = index + needed_ptps[level - 1];
/*
* XXX special case for first time call.
*/
if (nkptp[level - 1] != 0)
index++;
else
endindex--;
for (i = index; i <= endindex; i++) {
pmap_get_physpage(va, level - 1, &pa);
pdep[i] = pa | PG_RW | PG_V | pg_nx;
nkptp[level - 1]++;
va += nbpd[level - 1];
}
}
}
/*
* pmap_growkernel: increase usage of KVM space
*
* => we allocate new PTPs for the kernel and install them in all
* the pmaps on the system.
*/
static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;
vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
struct pmap *kpm = pmap_kernel(), *pm;
int s, i;
unsigned newpdes;
long needed_kptp[PTP_LEVELS], target_nptp, old;
if (maxkvaddr <= pmap_maxkvaddr)
return pmap_maxkvaddr;
maxkvaddr = x86_round_pdr(maxkvaddr);
old = nkptp[PTP_LEVELS - 1];
/*
* This loop could be optimized more, but pmap_growkernel()
* is called infrequently.
*/
for (i = PTP_LEVELS - 1; i >= 1; i--) {
target_nptp = pl_i(maxkvaddr, i + 1) -
pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
/*
* XXX only need to check toplevel.
*/
if (target_nptp > nkptpmax[i])
panic("%s: out of KVA space", __func__);
needed_kptp[i] = target_nptp - nkptp[i] + 1;
}
s = splhigh(); /* to be safe */
pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
/*
* If the number of top level entries changed, update all
* pmaps.
*/
if (needed_kptp[PTP_LEVELS - 1] != 0) {
newpdes = nkptp[PTP_LEVELS - 1] - old;
mtx_enter(&pmaps_lock);
LIST_FOREACH(pm, &pmaps, pm_list) {
memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
&kpm->pm_pdir[PDIR_SLOT_KERN + old],
newpdes * sizeof (pd_entry_t));
}
mtx_leave(&pmaps_lock);
}
pmap_maxkvaddr = maxkvaddr;
splx(s);
return maxkvaddr;
}
vaddr_t
pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
{
int segno;
u_int npg;
vaddr_t va;
paddr_t pa;
struct vm_physseg *seg;
size = round_page(size);
npg = atop(size);
for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
if (seg->avail_end - seg->avail_start < npg)
continue;
/*
* We can only steal at an ``unused'' segment boundary,
* i.e. either at the start or at the end.
*/
if (seg->avail_start == seg->start ||
seg->avail_end == seg->end)
break;
}
if (segno == vm_nphysseg) {
panic("%s: out of memory", __func__);
} else {
if (seg->avail_start == seg->start) {
pa = ptoa(seg->avail_start);
seg->avail_start += npg;
seg->start += npg;
} else {
pa = ptoa(seg->avail_end) - size;
seg->avail_end -= npg;
seg->end -= npg;
}
/*
 * If the whole segment has now been consumed, remove it.
* Note that the crash dump code still knows about it
* and will dump it correctly.
*/
if (seg->start == seg->end) {
if (vm_nphysseg-- == 1)
panic("%s: out of memory", __func__);
while (segno < vm_nphysseg) {
seg[0] = seg[1]; /* struct copy */
seg++;
segno++;
}
}
va = PMAP_DIRECT_MAP(pa);
memset((void *)va, 0, size);
}
if (start != NULL)
*start = virtual_avail;
if (end != NULL)
*end = VM_MAX_KERNEL_ADDRESS;
return (va);
}
void
pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
{
*vstartp = virtual_avail;
*vendp = VM_MAX_KERNEL_ADDRESS;
}
/*
* pmap_convert
*
* Converts 'pmap' to the new 'mode'.
*
* Parameters:
* pmap: the pmap to convert
* mode: the new mode (see pmap.h, PMAP_TYPE_xxx)
*/
void
pmap_convert(struct pmap *pmap, int mode)
{
pt_entry_t *pte;
pmap->pm_type = mode;
if (mode == PMAP_TYPE_EPT) {
/* Clear PML4 */
pte = (pt_entry_t *)pmap->pm_pdir;
memset(pte, 0, PAGE_SIZE);
/* Give back the meltdown pdir */
if (pmap->pm_pdir_intel != NULL) {
pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
pmap->pm_pdir_intel = NULL;
}
}
}
#ifdef MULTIPROCESSOR
/*
* Locking for tlb shootdown.
*
* We lock by setting tlb_shoot_wait to the number of cpus that will
* receive our tlb shootdown. After sending the IPIs, we don't need to
* worry about locking order or interrupts spinning for the lock because
* the call that grabs the "lock" isn't the one that releases it. And
* there is nothing that can block the IPI that releases the lock.
*
* The functions are organized so that we first count the number of
* cpus we need to send the IPI to, then we grab the counter, then
* we send the IPIs, then we finally do our own shootdown.
*
* Our shootdown is last to make it parallel with the other cpus
* to shorten the spin time.
*
* Notice that we depend on failures to send IPIs only being able to
* happen during boot. If they happen later, the above assumption
* doesn't hold, since we can end up in situations where no one will
* release the lock if we get an interrupt in a bad moment.
*/
#ifdef MP_LOCKDEBUG
#include <ddb/db_output.h>
extern int __mp_lock_spinout;
#endif
volatile long tlb_shoot_wait __attribute__((section(".kudata")));
volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
volatile int tlb_shoot_first_pcid __attribute__((section(".kudata")));
/* Obtain the "lock" for TLB shooting */
static inline int
pmap_start_tlb_shoot(long wait, const char *func)
{
int s = splvm();
while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
#ifdef MP_LOCKDEBUG
int nticks = __mp_lock_spinout;
#endif
while (tlb_shoot_wait != 0) {
CPU_BUSY_CYCLE();
#ifdef MP_LOCKDEBUG
if (--nticks <= 0) {
db_printf("%s: spun out", func);
db_enter();
nticks = __mp_lock_spinout;
}
#endif
}
}
return s;
}
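/*
 * Illustrative sketch (an assumption, not the handlers actually wired to
 * LAPIC_IPI_INVLPG and friends in this tree): the release side of the
 * "lock" described above lives in the IPI handlers, which perform the
 * requested invalidation and then decrement tlb_shoot_wait so that the
 * initiating CPU stops spinning.  The function name and the atomic
 * primitive below are illustrative only.
 */
#if 0	/* illustrative sketch -- not compiled */
void
pmap_ipi_invlpg_sketch(void)
{
	/* Flush the page the initiating CPU published... */
	pmap_update_pg(tlb_shoot_addr1);
	/* ...then drop our share of the count; reaching 0 releases the waiter. */
	atomic_dec_long(&tlb_shoot_wait);
}
#endif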
void
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
{
struct cpu_info *ci, *self = curcpu();
CPU_INFO_ITERATOR cii;
long wait = 0;
u_int64_t mask = 0;
int is_kva = va >= VM_MIN_KERNEL_ADDRESS;
CPU_INFO_FOREACH(cii, ci) {
if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
continue;
if (!is_kva && !pmap_is_active(pm, ci))
continue;
mask |= (1ULL << ci->ci_cpuid);
wait++;
}
if (wait > 0) {
int s = pmap_start_tlb_shoot(wait, __func__);
tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
tlb_shoot_addr1 = va;
CPU_INFO_FOREACH(cii, ci) {
if ((mask & (1ULL << ci->ci_cpuid)) == 0)
continue;
if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
panic("%s: ipi failed", __func__);
}
splx(s);
}
if (!pmap_use_pcid) {
if (shootself)
pmap_update_pg(va);
} else if (is_kva) {
invpcid(INVPCID_ADDR, PCID_PROC, va);
invpcid(INVPCID_ADDR, PCID_KERN, va);
} else if (shootself) {
invpcid(INVPCID_ADDR, PCID_PROC, va);
if (cpu_meltdown)
invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
}
}
void
pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
{
struct cpu_info *ci, *self = curcpu();
CPU_INFO_ITERATOR cii;
long wait = 0;
u_int64_t mask = 0;
int is_kva = sva >= VM_MIN_KERNEL_ADDRESS;
vaddr_t va;
CPU_INFO_FOREACH(cii, ci) {
if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
continue;
if (!is_kva && !pmap_is_active(pm, ci))
continue;
mask |= (1ULL << ci->ci_cpuid);
wait++;
}
if (wait > 0) {
int s = pmap_start_tlb_shoot(wait, __func__);
tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
tlb_shoot_addr1 = sva;
tlb_shoot_addr2 = eva;
CPU_INFO_FOREACH(cii, ci) {
if ((mask & (1ULL << ci->ci_cpuid)) == 0)
continue;
if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
panic("%s: ipi failed", __func__);
}
splx(s);
}
if (!pmap_use_pcid) {
if (shootself) {
for (va = sva; va < eva; va += PAGE_SIZE)
pmap_update_pg(va);
}
} else if (is_kva) {
for (va = sva; va < eva; va += PAGE_SIZE) {
invpcid(INVPCID_ADDR, PCID_PROC, va);
invpcid(INVPCID_ADDR, PCID_KERN, va);
}
} else if (shootself) {
if (cpu_meltdown) {
for (va = sva; va < eva; va += PAGE_SIZE) {
invpcid(INVPCID_ADDR, PCID_PROC, va);
invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
}
} else {
for (va = sva; va < eva; va += PAGE_SIZE)
invpcid(INVPCID_ADDR, PCID_PROC, va);
}
}
}
void
pmap_tlb_shoottlb(struct pmap *pm, int shootself)
{
struct cpu_info *ci, *self = curcpu();
CPU_INFO_ITERATOR cii;
long wait = 0;
u_int64_t mask = 0;
KASSERT(pm != pmap_kernel());
CPU_INFO_FOREACH(cii, ci) {
if (ci == self || !pmap_is_active(pm, ci) ||
!(ci->ci_flags & CPUF_RUNNING))
continue;
mask |= (1ULL << ci->ci_cpuid);
wait++;
}
if (wait) {
int s = pmap_start_tlb_shoot(wait, __func__);
CPU_INFO_FOREACH(cii, ci) {
if ((mask & (1ULL << ci->ci_cpuid)) == 0)
continue;
if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
panic("%s: ipi failed", __func__);
}
splx(s);
}
if (shootself) {
if (!pmap_use_pcid)
tlbflush();
else {
invpcid(INVPCID_PCID, PCID_PROC, 0);
if (cpu_meltdown)
invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
}
}
}
void
pmap_tlb_shootwait(void)
{
#ifdef MP_LOCKDEBUG
int nticks = __mp_lock_spinout;
#endif
while (tlb_shoot_wait != 0) {
CPU_BUSY_CYCLE();
#ifdef MP_LOCKDEBUG
if (--nticks <= 0) {
db_printf("%s: spun out", __func__);
db_enter();
nticks = __mp_lock_spinout;
}
#endif
}
}
#else /* MULTIPROCESSOR */
void
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
{
if (!pmap_use_pcid) {
if (shootself)
pmap_update_pg(va);
} else if (va >= VM_MIN_KERNEL_ADDRESS) {
invpcid(INVPCID_ADDR, PCID_PROC, va);
invpcid(INVPCID_ADDR, PCID_KERN, va);
} else if (shootself) {
invpcid(INVPCID_ADDR, PCID_PROC, va);
if (cpu_meltdown)
invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
}
}
void
pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
{
vaddr_t va;
if (!pmap_use_pcid) {
if (shootself) {
for (va = sva; va < eva; va += PAGE_SIZE)
pmap_update_pg(va);
}
} else if (sva >= VM_MIN_KERNEL_ADDRESS) {
for (va = sva; va < eva; va += PAGE_SIZE) {
invpcid(INVPCID_ADDR, PCID_PROC, va);
invpcid(INVPCID_ADDR, PCID_KERN, va);
}
} else if (shootself) {
if (cpu_meltdown) {
for (va = sva; va < eva; va += PAGE_SIZE) {
invpcid(INVPCID_ADDR, PCID_PROC, va);
invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
}
} else {
for (va = sva; va < eva; va += PAGE_SIZE)
invpcid(INVPCID_ADDR, PCID_PROC, va);
}
}
}
void
pmap_tlb_shoottlb(struct pmap *pm, int shootself)
{
if (shootself) {
if (!pmap_use_pcid)
tlbflush();
else {
invpcid(INVPCID_PCID, PCID_PROC, 0);
if (cpu_meltdown)
invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
}
}
}
#endif /* MULTIPROCESSOR */
/* $OpenBSD: nfs_syscalls.c,v 1.118 2022/06/06 14:45:41 claudio Exp $ */
/* $NetBSD: nfs_syscalls.c,v 1.19 1996/02/18 11:53:52 fvdl Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)nfs_syscalls.c 8.5 (Berkeley) 3/30/95
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/namei.h>
#include <sys/syslog.h>
#include <sys/filedesc.h>
#include <sys/signalvar.h>
#include <sys/kthread.h>
#include <sys/queue.h>
#include <sys/syscallargs.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <nfs/xdr_subs.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsrvcache.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>
/* Global defs. */
extern int nfs_numasync;
extern struct nfsstats nfsstats;
struct nfssvc_sock *nfs_udpsock;
int nfsd_waiting = 0;
#ifdef NFSSERVER
struct pool nfsrv_descript_pl;
int nfsrv_getslp(struct nfsd *nfsd);
static int nfs_numnfsd = 0;
int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *,
struct nfssvc_sock *, struct proc *, struct mbuf **) = {
nfsrv_null,
nfsrv_getattr,
nfsrv_setattr,
nfsrv_lookup,
nfsrv3_access,
nfsrv_readlink,
nfsrv_read,
nfsrv_write,
nfsrv_create,
nfsrv_mkdir,
nfsrv_symlink,
nfsrv_mknod,
nfsrv_remove,
nfsrv_rmdir,
nfsrv_rename,
nfsrv_link,
nfsrv_readdir,
nfsrv_readdirplus,
nfsrv_statfs,
nfsrv_fsinfo,
nfsrv_pathconf,
nfsrv_commit,
nfsrv_noop
};
#endif
TAILQ_HEAD(, nfssvc_sock) nfssvc_sockhead;
struct nfsdhead nfsd_head;
int nfssvc_sockhead_flag;
#define SLP_INIT 0x01 /* NFS data undergoing initialization */
#define SLP_WANTINIT 0x02 /* thread waiting on NFS initialization */
int nfsd_head_flag;
#ifdef NFSCLIENT
struct proc *nfs_asyncdaemon[NFS_MAXASYNCDAEMON];
int nfs_niothreads = -1;
#endif
int nfssvc_addsock(struct file *, struct mbuf *);
int nfssvc_nfsd(struct nfsd *);
void nfsrv_slpderef(struct nfssvc_sock *);
void nfsrv_zapsock(struct nfssvc_sock *);
void nfssvc_iod(void *);
/*
* NFS server pseudo system call for the nfsd's
* Based on the flag value it either:
* - adds a socket to the selection list
* - remains in the kernel as an nfsd
*/
int
sys_nfssvc(struct proc *p, void *v, register_t *retval)
{
int error = 0;
#ifdef NFSSERVER
struct sys_nfssvc_args /* {
syscallarg(int) flag;
syscallarg(caddr_t) argp;
} */ *uap = v;
int flags = SCARG(uap, flag);
struct file *fp;
struct mbuf *nam;
struct nfsd_args nfsdarg;
struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs;
struct nfsd *nfsd;
#endif
/* Must be super user */
error = suser(p);
if (error)
return (error);
#ifndef NFSSERVER
error = ENOSYS;
#else
while (nfssvc_sockhead_flag & SLP_INIT) {
nfssvc_sockhead_flag |= SLP_WANTINIT;
tsleep_nsec(&nfssvc_sockhead, PSOCK, "nfsd init", INFSLP);
}
switch (flags) {
case NFSSVC_ADDSOCK:
error = copyin(SCARG(uap, argp), &nfsdarg, sizeof(nfsdarg));
if (error)
return (error);
error = getsock(p, nfsdarg.sock, &fp);
if (error)
return (error);
/*
* Get the client address for connected sockets.
*/
if (nfsdarg.name == NULL || nfsdarg.namelen == 0)
nam = NULL;
else {
error = sockargs(&nam, nfsdarg.name, nfsdarg.namelen,
MT_SONAME);
if (error) {
FRELE(fp, p);
return (error);
}
}
error = nfssvc_addsock(fp, nam);
FRELE(fp, p);
break;
case NFSSVC_NFSD:
error = copyin(SCARG(uap, argp), nsd, sizeof(*nsd));
if (error)
return (error);
nfsd = malloc(sizeof(*nfsd), M_NFSD, M_WAITOK|M_ZERO);
nfsd->nfsd_procp = p;
nfsd->nfsd_slp = NULL;
error = nfssvc_nfsd(nfsd);
break;
default:
error = EINVAL;
break;
}
if (error == EINTR || error == ERESTART)
error = 0;
#endif /* !NFSSERVER */
return (error);
}
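/*
 * Illustrative userland sketch (not part of this file, and not the real
 * nfsd(8) source): how a server process might use the two flag values
 * handled above, first handing a bound UDP socket to the kernel with
 * NFSSVC_ADDSOCK and then parking itself in nfssvc_nfsd() with
 * NFSSVC_NFSD.  Error handling and the sockaddr details are omitted, and
 * the helper name is hypothetical.
 */
#if 0	/* illustrative sketch -- not compiled */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <string.h>
#include <unistd.h>

static void
serve_nfs_udp_sketch(void)
{
	struct nfsd_args args;
	struct nfsd_srvargs nsd;
	struct sockaddr_in sin;
	int sock;

	sock = socket(AF_INET, SOCK_DGRAM, 0);
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(2049);		/* standard NFS port */
	bind(sock, (struct sockaddr *)&sin, sizeof(sin));

	args.sock = sock;			/* fetched via getsock() above */
	args.name = NULL;			/* no client address for UDP */
	args.namelen = 0;
	nfssvc(NFSSVC_ADDSOCK, &args);		/* kernel takes over the socket */

	memset(&nsd, 0, sizeof(nsd));
	nfssvc(NFSSVC_NFSD, &nsd);		/* blocks servicing RPC requests */
}
#endif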
#ifdef NFSSERVER
/*
* Adds a socket to the list for servicing by nfsds.
*/
int
nfssvc_addsock(struct file *fp, struct mbuf *mynam)
{
struct mbuf *m;
int siz;
struct nfssvc_sock *slp;
struct socket *so;
struct nfssvc_sock *tslp;
int error;
so = (struct socket *)fp->f_data;
tslp = NULL;
/*
* Add it to the list, as required.
*/
if (so->so_proto->pr_protocol == IPPROTO_UDP) {
tslp = nfs_udpsock;
if (tslp->ns_flag & SLP_VALID) {
m_freem(mynam);
return (EPERM);
}
}
if (so->so_type == SOCK_STREAM)
siz = NFS_MAXPACKET + sizeof (u_long);
else
siz = NFS_MAXPACKET;
solock(so);
error = soreserve(so, siz, siz);
if (error) {
sounlock(so);
m_freem(mynam);
return (error);
}
/*
* Set protocol specific options { for now TCP only } and
* reserve some space. For datagram sockets, this can get called
* repeatedly for the same socket, but that isn't harmful.
*/
if (so->so_type == SOCK_STREAM) {
MGET(m, M_WAIT, MT_SOOPTS);
*mtod(m, int32_t *) = 1;
m->m_len = sizeof(int32_t);
sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
m_freem(m);
}
if (so->so_proto->pr_domain->dom_family == AF_INET &&
so->so_proto->pr_protocol == IPPROTO_TCP) {
MGET(m, M_WAIT, MT_SOOPTS);
*mtod(m, int32_t *) = 1;
m->m_len = sizeof(int32_t);
sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
m_freem(m);
}
so->so_rcv.sb_flags &= ~SB_NOINTR;
so->so_rcv.sb_timeo_nsecs = INFSLP;
so->so_snd.sb_flags &= ~SB_NOINTR;
so->so_snd.sb_timeo_nsecs = INFSLP;
sounlock(so);
if (tslp)
slp = tslp;
else {
slp = malloc(sizeof(*slp), M_NFSSVC, M_WAITOK|M_ZERO);
TAILQ_INSERT_TAIL(&nfssvc_sockhead, slp, ns_chain);
}
slp->ns_so = so;
slp->ns_nam = mynam;
FREF(fp);
slp->ns_fp = fp;
so->so_upcallarg = (caddr_t)slp;
so->so_upcall = nfsrv_rcv;
slp->ns_flag = (SLP_VALID | SLP_NEEDQ);
nfsrv_wakenfsd(slp);
return (0);
}
/*
* Called by nfssvc() for nfsds. Just loops around servicing rpc requests
* until it is killed by a signal.
*/
int
nfssvc_nfsd(struct nfsd *nfsd)
{
struct mbuf *m;
int siz;
struct nfssvc_sock *slp;
struct socket *so;
int *solockp;
struct nfsrv_descript *nd = NULL;
struct mbuf *mreq;
int error = 0, cacherep, sotype;
cacherep = RC_DOIT;
TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain);
nfs_numnfsd++;
/* Loop getting rpc requests until SIGKILL. */
loop:
if (!ISSET(nfsd->nfsd_flag, NFSD_REQINPROG)) {
/* attach an nfssvc_sock to nfsd */
error = nfsrv_getslp(nfsd);
if (error)
goto done;
slp = nfsd->nfsd_slp;
if (ISSET(slp->ns_flag, SLP_VALID)) {
if (ISSET(slp->ns_flag, SLP_DISCONN)) {
nfsrv_zapsock(slp);
} else if (ISSET(slp->ns_flag, SLP_NEEDQ)) {
CLR(slp->ns_flag, SLP_NEEDQ);
nfs_sndlock(&slp->ns_solock, NULL);
nfsrv_rcv(slp->ns_so, (caddr_t)slp, M_WAIT);
nfs_sndunlock(&slp->ns_solock);
}
error = nfsrv_dorec(slp, nfsd, &nd);
SET(nfsd->nfsd_flag, NFSD_REQINPROG);
}
} else {
error = 0;
slp = nfsd->nfsd_slp;
}
if (error || !ISSET(slp->ns_flag, SLP_VALID)) {
if (nd != NULL) {
pool_put(&nfsrv_descript_pl, nd);
nd = NULL;
}
nfsd->nfsd_slp = NULL;
CLR(nfsd->nfsd_flag, NFSD_REQINPROG);
nfsrv_slpderef(slp);
goto loop;
}
so = slp->ns_so;
sotype = so->so_type;
if (ISSET(so->so_proto->pr_flags, PR_CONNREQUIRED))
solockp = &slp->ns_solock;
else
solockp = NULL;
if (nd) {
if (nd->nd_nam2)
nd->nd_nam = nd->nd_nam2;
else
nd->nd_nam = slp->ns_nam;
}
cacherep = nfsrv_getcache(nd, slp, &mreq);
switch (cacherep) {
case RC_DOIT:
error = (*(nfsrv3_procs[nd->nd_procnum]))(nd, slp, nfsd->nfsd_procp, &mreq);
if (mreq == NULL) {
if (nd != NULL) {
m_freem(nd->nd_nam2);
m_freem(nd->nd_mrep);
}
break;
}
if (error) {
nfsstats.srv_errs++;
nfsrv_updatecache(nd, 0, mreq);
m_freem(nd->nd_nam2);
break;
}
nfsstats.srvrpccnt[nd->nd_procnum]++;
nfsrv_updatecache(nd, 1, mreq);
nd->nd_mrep = NULL;
/* FALLTHROUGH */
case RC_REPLY:
m = mreq;
siz = 0;
while (m) {
siz += m->m_len;
m = m->m_next;
}
if (siz <= 0 || siz > NFS_MAXPACKET)
panic("bad nfs svc reply, siz = %i", siz);
m = mreq;
m->m_pkthdr.len = siz;
m->m_pkthdr.ph_ifidx = 0;
/* For stream protocols, prepend a Sun RPC Record Mark. */
if (sotype == SOCK_STREAM) {
M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
*mtod(m, u_int32_t *) = htonl(0x80000000 | siz);
}
if (solockp)
nfs_sndlock(solockp, NULL);
if (ISSET(slp->ns_flag, SLP_VALID))
error = nfs_send(so, nd->nd_nam2, m, NULL);
else {
error = EPIPE;
m_freem(m);
}
m_freem(nd->nd_nam2);
m_freem(nd->nd_mrep);
if (error == EPIPE)
nfsrv_zapsock(slp);
if (solockp)
nfs_sndunlock(solockp);
if (error == EINTR || error == ERESTART) {
pool_put(&nfsrv_descript_pl, nd);
nfsrv_slpderef(slp);
goto done;
}
break;
case RC_DROPIT:
m_freem(nd->nd_mrep);
m_freem(nd->nd_nam2);
break;
};
if (nd) {
pool_put(&nfsrv_descript_pl, nd);
nd = NULL;
}
if (nfsrv_dorec(slp, nfsd, &nd)) {
nfsd->nfsd_flag &= ~NFSD_REQINPROG;
nfsd->nfsd_slp = NULL;
nfsrv_slpderef(slp);
}
goto loop;
done:
TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain);
free(nfsd, M_NFSD, sizeof(*nfsd));
if (--nfs_numnfsd == 0)
nfsrv_init(1); /* Reinitialize everything */
return (error);
}
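/*
 * The RC_REPLY path above prepends a Sun RPC record mark on stream
 * sockets: one 32-bit big-endian word whose high bit marks the last
 * fragment and whose low 31 bits carry the fragment length.  Below is a
 * minimal decoding sketch for the peer side (a hypothetical helper, not
 * used by this file).
 */
#if 0	/* illustrative sketch -- not compiled */
/* Returns the fragment length; sets *last when the high bit is set. */
static u_int32_t
rpc_record_mark_decode(u_int32_t mark_be, int *last)
{
	u_int32_t mark = ntohl(mark_be);	/* wire format is big-endian */

	*last = (mark & 0x80000000U) != 0;	/* last-fragment flag */
	return (mark & 0x7fffffffU);		/* fragment length in bytes */
}
#endif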
/*
* Shut down a socket associated with an nfssvc_sock structure.
* Should be called with the send lock set, if required.
* The trick here is to increment the sref at the start, so that the nfsds
* will stop using it and clear ns_flag at the end so that it will not be
* reassigned during cleanup.
*/
void
nfsrv_zapsock(struct nfssvc_sock *slp)
{
struct socket *so;
struct file *fp;
struct mbuf *m, *n;
slp->ns_flag &= ~SLP_ALLFLAGS;
fp = slp->ns_fp;
if (fp) {
FREF(fp);
slp->ns_fp = NULL;
so = slp->ns_so;
so->so_upcall = NULL;
soshutdown(so, SHUT_RDWR);
closef(fp, NULL);
if (slp->ns_nam)
m = m_free(slp->ns_nam);
m_freem(slp->ns_raw);
m = slp->ns_rec;
while (m) {
n = m->m_nextpkt;
m_freem(m);
m = n;
}
}
}
/*
* Dereference a server socket structure. If it has no more references and
* is no longer valid, you can throw it away.
*/
void
nfsrv_slpderef(struct nfssvc_sock *slp)
{
if (--(slp->ns_sref) == 0 && (slp->ns_flag & SLP_VALID) == 0) {
TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain);
free(slp, M_NFSSVC, sizeof(*slp));
}
}
/*
* Initialize the data structures for the server.
* Handshake with any new nfsds starting up to avoid any chance of
* corruption.
*/
void
nfsrv_init(int terminating)
{
struct nfssvc_sock *slp, *nslp;
if (nfssvc_sockhead_flag & SLP_INIT)
panic("nfsd init");
nfssvc_sockhead_flag |= SLP_INIT;
if (terminating) {
for (slp = TAILQ_FIRST(&nfssvc_sockhead); slp != NULL;
slp = nslp) {
nslp = TAILQ_NEXT(slp, ns_chain);
if (slp->ns_flag & SLP_VALID)
nfsrv_zapsock(slp);
TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain);
free(slp, M_NFSSVC, sizeof(*slp));
}
nfsrv_cleancache(); /* And clear out server cache */
}
TAILQ_INIT(&nfssvc_sockhead);
nfssvc_sockhead_flag &= ~SLP_INIT;
if (nfssvc_sockhead_flag & SLP_WANTINIT) {
nfssvc_sockhead_flag &= ~SLP_WANTINIT;
wakeup((caddr_t)&nfssvc_sockhead);
}
TAILQ_INIT(&nfsd_head);
nfsd_head_flag &= ~NFSD_CHECKSLP;
nfs_udpsock = malloc(sizeof(*nfs_udpsock), M_NFSSVC,
M_WAITOK|M_ZERO);
TAILQ_INSERT_HEAD(&nfssvc_sockhead, nfs_udpsock, ns_chain);
if (!terminating) {
pool_init(&nfsrv_descript_pl, sizeof(struct nfsrv_descript),
0, IPL_NONE, PR_WAITOK, "ndscpl", NULL);
}
}
#endif /* NFSSERVER */
#ifdef NFSCLIENT
/*
* Asynchronous I/O threads for client nfs.
* They do read-ahead and write-behind operations on the block I/O cache.
* They never return unless they fail or get killed.
*/
void
nfssvc_iod(void *arg)
{
struct proc *p = curproc;
struct buf *bp, *nbp;
int i, myiod;
struct vnode *vp;
int error = 0, s, bufcount;
bufcount = MIN(256, bcstats.kvaslots / 8);
bufcount = MIN(bufcount, bcstats.numbufs / 8);
/* Assign my position or return error if too many already running. */
myiod = -1;
for (i = 0; i < NFS_MAXASYNCDAEMON; i++) {
if (nfs_asyncdaemon[i] == NULL) {
myiod = i;
break;
}
}
if (myiod == -1)
kthread_exit(EBUSY);
nfs_asyncdaemon[myiod] = p;
nfs_numasync++;
/* Upper limit on how many bufs we'll queue up for this iod. */
if (nfs_bufqmax > bcstats.kvaslots / 4) {
nfs_bufqmax = bcstats.kvaslots / 4;
bufcount = 0;
}
if (nfs_bufqmax > bcstats.numbufs / 4) {
nfs_bufqmax = bcstats.numbufs / 4;
bufcount = 0;
}
nfs_bufqmax += bufcount;
wakeup(&nfs_bufqlen); /* wake up anyone waiting for room to enqueue IO */
/* Just loop around doing our stuff until SIGKILL. */
for (;;) {
while (TAILQ_FIRST(&nfs_bufq) == NULL && error == 0) {
error = tsleep_nsec(&nfs_bufq,
PWAIT | PCATCH, "nfsidl", INFSLP);
}
while ((bp = TAILQ_FIRST(&nfs_bufq)) != NULL) {
/* Take one off the front of the list */
TAILQ_REMOVE(&nfs_bufq, bp, b_freelist);
nfs_bufqlen--;
wakeup_one(&nfs_bufqlen);
if (bp->b_flags & B_READ)
(void) nfs_doio(bp, NULL);
else do {
/*
* Look for a delayed write for the same vnode, so I can do
* it now. We must grab it before calling nfs_doio() to
* avoid any risk of the vnode getting vclean()'d while
* we are doing the write rpc.
*/
vp = bp->b_vp;
s = splbio();
LIST_FOREACH(nbp, &vp->v_dirtyblkhd, b_vnbufs) {
if ((nbp->b_flags &
(B_BUSY|B_DELWRI|B_NEEDCOMMIT|B_NOCACHE))!=B_DELWRI)
continue;
nbp->b_flags |= B_ASYNC;
bremfree(nbp);
buf_acquire(nbp);
break;
}
/*
* For the delayed write, do the first part of nfs_bwrite()
* up to, but not including nfs_strategy().
*/
if (nbp) {
nbp->b_flags &= ~(B_READ|B_DONE|B_ERROR);
buf_undirty(nbp);
nbp->b_vp->v_numoutput++;
}
splx(s);
(void) nfs_doio(bp, NULL);
} while ((bp = nbp) != NULL);
}
if (error) {
nfs_asyncdaemon[myiod] = NULL;
nfs_numasync--;
nfs_bufqmax -= bufcount;
kthread_exit(error);
}
}
}
void
nfs_getset_niothreads(int set)
{
int i, have, start;
for (have = 0, i = 0; i < NFS_MAXASYNCDAEMON; i++)
if (nfs_asyncdaemon[i] != NULL)
have++;
if (set) {
/* clamp to sane range */
nfs_niothreads = max(0, min(nfs_niothreads, NFS_MAXASYNCDAEMON));
start = nfs_niothreads - have;
while (start > 0) {
kthread_create(nfssvc_iod, NULL, NULL, "nfsio");
start--;
}
for (i = 0; (start < 0) && (i < NFS_MAXASYNCDAEMON); i++)
if (nfs_asyncdaemon[i] != NULL) {
psignal(nfs_asyncdaemon[i], SIGKILL);
start++;
}
} else {
if (nfs_niothreads >= 0)
nfs_niothreads = have;
}
}
#endif /* NFSCLIENT */
#ifdef NFSSERVER
/*
* Find an nfssrv_sock for nfsd, sleeping if needed.
*/
int
nfsrv_getslp(struct nfsd *nfsd)
{
struct nfssvc_sock *slp;
int error;
again:
while (nfsd->nfsd_slp == NULL &&
(nfsd_head_flag & NFSD_CHECKSLP) == 0) {
nfsd->nfsd_flag |= NFSD_WAITING;
nfsd_waiting++;
error = tsleep_nsec(nfsd, PSOCK | PCATCH, "nfsd", INFSLP);
nfsd_waiting--;
if (error)
return (error);
}
if (nfsd->nfsd_slp == NULL &&
(nfsd_head_flag & NFSD_CHECKSLP) != 0) {
TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
if ((slp->ns_flag & (SLP_VALID | SLP_DOREC)) ==
(SLP_VALID | SLP_DOREC)) {
slp->ns_flag &= ~SLP_DOREC;
slp->ns_sref++;
nfsd->nfsd_slp = slp;
break;
}
}
if (slp == NULL)
nfsd_head_flag &= ~NFSD_CHECKSLP;
}
if (nfsd->nfsd_slp == NULL)
goto again;
return (0);
}
#endif /* NFSSERVER */
/* $OpenBSD: igmp.c,v 1.81 2022/09/04 06:49:11 jsg Exp $ */
/* $NetBSD: igmp.c,v 1.15 1996/02/13 23:41:25 christos Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988 Stephen Deering.
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.2 (Berkeley) 5/3/95
*/
/*
* Internet Group Management Protocol (IGMP) routines.
*
* Written by Steve Deering, Stanford, May 1988.
* Modified by Rosen Sharma, Stanford, Aug 1994.
* Modified by Bill Fenner, Xerox PARC, Feb 1995.
*
* MULTICAST Revision: 1.3
*/
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/igmp.h>
#include <netinet/igmp_var.h>
#include <sys/stdarg.h>
#define IP_MULTICASTOPTS 0
int igmp_timers_are_running; /* [N] shortcut for fast timer */
static LIST_HEAD(, router_info) rti_head;
static struct mbuf *router_alert;
struct cpumem *igmpcounters;
void igmp_checktimer(struct ifnet *);
void igmp_sendpkt(struct ifnet *, struct in_multi *, int, in_addr_t);
int rti_fill(struct in_multi *);
struct router_info * rti_find(struct ifnet *);
int igmp_input_if(struct ifnet *, struct mbuf **, int *, int, int);
int igmp_sysctl_igmpstat(void *, size_t *, void *);
void
igmp_init(void)
{
struct ipoption *ra;
igmp_timers_are_running = 0;
LIST_INIT(&rti_head);
igmpcounters = counters_alloc(igps_ncounters);
router_alert = m_get(M_WAIT, MT_DATA);
/*
* Construct a Router Alert option (RAO) to use in report
* messages as required by RFC2236. This option has the
* following format:
*
* | 10010100 | 00000100 | 2 octet value |
*
* where a value of "0" indicates that routers shall examine
* the packet.
*/
ra = mtod(router_alert, struct ipoption *);
ra->ipopt_dst.s_addr = INADDR_ANY;
ra->ipopt_list[0] = IPOPT_RA;
ra->ipopt_list[1] = 0x04;
ra->ipopt_list[2] = 0x00;
ra->ipopt_list[3] = 0x00;
router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1];
}
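/*
 * For reference, the option built above is the standard Router Alert
 * (option number 148, i.e. the 10010100 pattern in the comment), so the
 * four option bytes that end up on the wire in every report should look
 * like this sketch (the array name is illustrative only):
 */
#if 0	/* illustrative sketch -- not compiled */
static const u_int8_t igmp_rao_bytes_sketch[4] = {
	0x94,	/* IPOPT_RA: copied flag set, class 0, option number 20 */
	0x04,	/* option length, covering all four bytes */
	0x00,	/* two-octet value 0: "routers shall examine this packet" */
	0x00
};
#endif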
int
rti_fill(struct in_multi *inm)
{
struct router_info *rti;
LIST_FOREACH(rti, &rti_head, rti_list) {
if (rti->rti_ifidx == inm->inm_ifidx) {
inm->inm_rti = rti;
if (rti->rti_type == IGMP_v1_ROUTER)
return (IGMP_v1_HOST_MEMBERSHIP_REPORT);
else
return (IGMP_v2_HOST_MEMBERSHIP_REPORT);
}
}
rti = malloc(sizeof(*rti), M_MRTABLE, M_WAITOK);
rti->rti_ifidx = inm->inm_ifidx;
rti->rti_type = IGMP_v2_ROUTER;
LIST_INSERT_HEAD(&rti_head, rti, rti_list);
inm->inm_rti = rti;
return (IGMP_v2_HOST_MEMBERSHIP_REPORT);
}
struct router_info *
rti_find(struct ifnet *ifp)
{
struct router_info *rti;
KERNEL_ASSERT_LOCKED();
LIST_FOREACH(rti, &rti_head, rti_list) {
if (rti->rti_ifidx == ifp->if_index)
return (rti);
}
rti = malloc(sizeof(*rti), M_MRTABLE, M_NOWAIT);
if (rti == NULL)
return (NULL);
rti->rti_ifidx = ifp->if_index;
rti->rti_type = IGMP_v2_ROUTER;
LIST_INSERT_HEAD(&rti_head, rti, rti_list);
return (rti);
}
void
rti_delete(struct ifnet *ifp)
{
struct router_info *rti, *trti;
LIST_FOREACH_SAFE(rti, &rti_head, rti_list, trti) {
if (rti->rti_ifidx == ifp->if_index) {
LIST_REMOVE(rti, rti_list);
free(rti, M_MRTABLE, sizeof(*rti));
break;
}
}
}
int
igmp_input(struct mbuf **mp, int *offp, int proto, int af)
{
struct ifnet *ifp;
igmpstat_inc(igps_rcv_total);
ifp = if_get((*mp)->m_pkthdr.ph_ifidx);
if (ifp == NULL) {
m_freemp(mp);
return IPPROTO_DONE;
}
KERNEL_LOCK();
proto = igmp_input_if(ifp, mp, offp, proto, af);
KERNEL_UNLOCK();
if_put(ifp);
return proto;
}
int
igmp_input_if(struct ifnet *ifp, struct mbuf **mp, int *offp, int proto, int af)
{
struct mbuf *m = *mp;
int iphlen = *offp;
struct ip *ip = mtod(m, struct ip *);
struct igmp *igmp;
int igmplen;
int minlen;
struct ifmaddr *ifma;
struct in_multi *inm;
struct router_info *rti;
struct in_ifaddr *ia;
int timer;
igmplen = ntohs(ip->ip_len) - iphlen;
/*
* Validate lengths
*/
if (igmplen < IGMP_MINLEN) {
igmpstat_inc(igps_rcv_tooshort);
m_freem(m);
return IPPROTO_DONE;
}
minlen = iphlen + IGMP_MINLEN;
if ((m->m_flags & M_EXT || m->m_len < minlen) &&
(m = *mp = m_pullup(m, minlen)) == NULL) {
igmpstat_inc(igps_rcv_tooshort);
return IPPROTO_DONE;
}
/*
* Validate checksum
*/
m->m_data += iphlen;
m->m_len -= iphlen;
igmp = mtod(m, struct igmp *);
if (in_cksum(m, igmplen)) {
igmpstat_inc(igps_rcv_badsum);
m_freem(m);
return IPPROTO_DONE;
}
m->m_data -= iphlen;
m->m_len += iphlen;
ip = mtod(m, struct ip *);
switch (igmp->igmp_type) {
case IGMP_HOST_MEMBERSHIP_QUERY:
igmpstat_inc(igps_rcv_queries);
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (igmp->igmp_code == 0) {
rti = rti_find(ifp);
if (rti == NULL) {
m_freem(m);
return IPPROTO_DONE;
}
rti->rti_type = IGMP_v1_ROUTER;
rti->rti_age = 0;
if (ip->ip_dst.s_addr != INADDR_ALLHOSTS_GROUP) {
igmpstat_inc(igps_rcv_badqueries);
m_freem(m);
return IPPROTO_DONE;
}
/*
* Start the timers in all of our membership records
* for the interface on which the query arrived,
* except those that are already running and those
* that belong to a "local" group (224.0.0.X).
*/
TAILQ_FOREACH(ifma, &ifp->if_maddrlist, ifma_list) {
if (ifma->ifma_addr->sa_family != AF_INET)
continue;
inm = ifmatoinm(ifma);
if (inm->inm_timer == 0 &&
!IN_LOCAL_GROUP(inm->inm_addr.s_addr)) {
inm->inm_state = IGMP_DELAYING_MEMBER;
inm->inm_timer = IGMP_RANDOM_DELAY(
IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ);
igmp_timers_are_running = 1;
}
}
} else {
if (!IN_MULTICAST(ip->ip_dst.s_addr)) {
igmpstat_inc(igps_rcv_badqueries);
m_freem(m);
return IPPROTO_DONE;
}
timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
if (timer == 0)
timer = 1;
/*
* Start the timers in all of our membership records
* for the interface on which the query arrived,
* except those that are already running and those
* that belong to a "local" group (224.0.0.X). For
* timers already running, check if they need to be
* reset.
*/
TAILQ_FOREACH(ifma, &ifp->if_maddrlist, ifma_list) {
if (ifma->ifma_addr->sa_family != AF_INET)
continue;
inm = ifmatoinm(ifma);
if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) &&
(ip->ip_dst.s_addr == INADDR_ALLHOSTS_GROUP ||
ip->ip_dst.s_addr == inm->inm_addr.s_addr)) {
switch (inm->inm_state) {
case IGMP_DELAYING_MEMBER:
if (inm->inm_timer <= timer)
break;
/* FALLTHROUGH */
case IGMP_IDLE_MEMBER:
case IGMP_LAZY_MEMBER:
case IGMP_AWAKENING_MEMBER:
inm->inm_state =
IGMP_DELAYING_MEMBER;
inm->inm_timer =
IGMP_RANDOM_DELAY(timer);
igmp_timers_are_running = 1;
break;
case IGMP_SLEEPING_MEMBER:
inm->inm_state =
IGMP_AWAKENING_MEMBER;
break;
}
}
}
}
break;
case IGMP_v1_HOST_MEMBERSHIP_REPORT:
igmpstat_inc(igps_rcv_reports);
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN_MULTICAST(igmp->igmp_group.s_addr) ||
igmp->igmp_group.s_addr != ip->ip_dst.s_addr) {
igmpstat_inc(igps_rcv_badreports);
m_freem(m);
return IPPROTO_DONE;
}
/*
* KLUDGE: if the IP source address of the report has an
* unspecified (i.e., zero) subnet number, as is allowed for
* a booting host, replace it with the correct subnet number
* so that a process-level multicast routing daemon can
* determine which subnet it arrived from. This is necessary
* to compensate for the lack of any way for a process to
* determine the arrival interface of an incoming packet.
*/
if ((ip->ip_src.s_addr & IN_CLASSA_NET) == 0) {
IFP_TO_IA(ifp, ia);
if (ia)
ip->ip_src.s_addr = ia->ia_net;
}
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm);
if (inm != NULL) {
inm->inm_timer = 0;
igmpstat_inc(igps_rcv_ourreports);
switch (inm->inm_state) {
case IGMP_IDLE_MEMBER:
case IGMP_LAZY_MEMBER:
case IGMP_AWAKENING_MEMBER:
case IGMP_SLEEPING_MEMBER:
inm->inm_state = IGMP_SLEEPING_MEMBER;
break;
case IGMP_DELAYING_MEMBER:
if (inm->inm_rti->rti_type == IGMP_v1_ROUTER)
inm->inm_state = IGMP_LAZY_MEMBER;
else
inm->inm_state = IGMP_SLEEPING_MEMBER;
break;
}
}
break;
case IGMP_v2_HOST_MEMBERSHIP_REPORT:
#ifdef MROUTING
/*
* Make sure we don't hear our own membership report. Fast
* leave requires knowing that we are the only member of a
* group.
*/
IFP_TO_IA(ifp, ia);
if (ia && ip->ip_src.s_addr == ia->ia_addr.sin_addr.s_addr)
break;
#endif
igmpstat_inc(igps_rcv_reports);
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN_MULTICAST(igmp->igmp_group.s_addr) ||
igmp->igmp_group.s_addr != ip->ip_dst.s_addr) {
igmpstat_inc(igps_rcv_badreports);
m_freem(m);
return IPPROTO_DONE;
}
/*
* KLUDGE: if the IP source address of the report has an
* unspecified (i.e., zero) subnet number, as is allowed for
* a booting host, replace it with the correct subnet number
* so that a process-level multicast routing daemon can
* determine which subnet it arrived from. This is necessary
* to compensate for the lack of any way for a process to
* determine the arrival interface of an incoming packet.
*/
if ((ip->ip_src.s_addr & IN_CLASSA_NET) == 0) {
#ifndef MROUTING
IFP_TO_IA(ifp, ia);
#endif
if (ia)
ip->ip_src.s_addr = ia->ia_net;
}
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm);
if (inm != NULL) {
inm->inm_timer = 0;
igmpstat_inc(igps_rcv_ourreports);
switch (inm->inm_state) {
case IGMP_DELAYING_MEMBER:
case IGMP_IDLE_MEMBER:
case IGMP_AWAKENING_MEMBER:
inm->inm_state = IGMP_LAZY_MEMBER;
break;
case IGMP_LAZY_MEMBER:
case IGMP_SLEEPING_MEMBER:
break;
}
}
break;
}
/*
* Pass all valid IGMP packets up to any process(es) listening
* on a raw IGMP socket.
*/
return rip_input(mp, offp, proto, af);
}
void
igmp_joingroup(struct in_multi *inm, struct ifnet *ifp)
{
int i;
inm->inm_state = IGMP_IDLE_MEMBER;
if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) &&
(ifp->if_flags & IFF_LOOPBACK) == 0) {
i = rti_fill(inm);
igmp_sendpkt(ifp, inm, i, 0);
inm->inm_state = IGMP_DELAYING_MEMBER;
inm->inm_timer = IGMP_RANDOM_DELAY(
IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ);
igmp_timers_are_running = 1;
} else
inm->inm_timer = 0;
}
void
igmp_leavegroup(struct in_multi *inm, struct ifnet *ifp)
{
switch (inm->inm_state) {
case IGMP_DELAYING_MEMBER:
case IGMP_IDLE_MEMBER:
if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) &&
(ifp->if_flags & IFF_LOOPBACK) == 0)
if (inm->inm_rti->rti_type != IGMP_v1_ROUTER)
igmp_sendpkt(ifp, inm,
IGMP_HOST_LEAVE_MESSAGE,
INADDR_ALLROUTERS_GROUP);
break;
case IGMP_LAZY_MEMBER:
case IGMP_AWAKENING_MEMBER:
case IGMP_SLEEPING_MEMBER:
break;
}
}
void
igmp_fasttimo(void)
{
struct ifnet *ifp;
/*
* Quick check to see if any work needs to be done, in order
* to minimize the overhead of fasttimo processing.
* The variable igmp_timers_are_running is read atomically, but intentionally
* without a lock. If it is not yet seen as set due to MP races, we may skip
* checking the timers this time; they will be checked on the next fast timeout.
*/
if (!igmp_timers_are_running)
return;
NET_LOCK();
igmp_timers_are_running = 0;
TAILQ_FOREACH(ifp, &ifnet, if_list)
igmp_checktimer(ifp);
NET_UNLOCK();
}
void
igmp_checktimer(struct ifnet *ifp)
{
struct in_multi *inm;
struct ifmaddr *ifma;
NET_ASSERT_LOCKED();
TAILQ_FOREACH(ifma, &ifp->if_maddrlist, ifma_list) {
if (ifma->ifma_addr->sa_family != AF_INET)
continue;
inm = ifmatoinm(ifma);
if (inm->inm_timer == 0) {
/* do nothing */
} else if (--inm->inm_timer == 0) {
if (inm->inm_state == IGMP_DELAYING_MEMBER) {
if (inm->inm_rti->rti_type == IGMP_v1_ROUTER)
igmp_sendpkt(ifp, inm,
IGMP_v1_HOST_MEMBERSHIP_REPORT, 0);
else
igmp_sendpkt(ifp, inm,
IGMP_v2_HOST_MEMBERSHIP_REPORT, 0);
inm->inm_state = IGMP_IDLE_MEMBER;
}
} else {
igmp_timers_are_running = 1;
}
}
}
void
igmp_slowtimo(void)
{
struct router_info *rti;
NET_LOCK();
LIST_FOREACH(rti, &rti_head, rti_list) {
if (rti->rti_type == IGMP_v1_ROUTER &&
++rti->rti_age >= IGMP_AGE_THRESHOLD) {
rti->rti_type = IGMP_v2_ROUTER;
}
}
NET_UNLOCK();
}
void
igmp_sendpkt(struct ifnet *ifp, struct in_multi *inm, int type,
in_addr_t addr)
{
struct mbuf *m;
struct igmp *igmp;
struct ip *ip;
struct ip_moptions imo;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL)
return;
/*
* Assume max_linkhdr + sizeof(struct ip) + IGMP_MINLEN
* is smaller than mbuf size returned by MGETHDR.
*/
m->m_data += max_linkhdr;
m->m_len = sizeof(struct ip) + IGMP_MINLEN;
m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN;
ip = mtod(m, struct ip *);
ip->ip_tos = 0;
ip->ip_len = htons(sizeof(struct ip) + IGMP_MINLEN);
ip->ip_off = 0;
ip->ip_p = IPPROTO_IGMP;
ip->ip_src.s_addr = INADDR_ANY;
if (addr) {
ip->ip_dst.s_addr = addr;
} else {
ip->ip_dst = inm->inm_addr;
}
m->m_data += sizeof(struct ip);
m->m_len -= sizeof(struct ip);
igmp = mtod(m, struct igmp *);
igmp->igmp_type = type;
igmp->igmp_code = 0;
igmp->igmp_group = inm->inm_addr;
igmp->igmp_cksum = 0;
igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN);
m->m_data -= sizeof(struct ip);
m->m_len += sizeof(struct ip);
m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
imo.imo_ifidx = inm->inm_ifidx;
imo.imo_ttl = 1;
/*
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
#ifdef MROUTING
imo.imo_loop = (ip_mrouter[ifp->if_rdomain] != NULL);
#else
imo.imo_loop = 0;
#endif /* MROUTING */
ip_output(m, router_alert, NULL, IP_MULTICASTOPTS, &imo, NULL, 0);
igmpstat_inc(igps_snd_reports);
}
/*
* Sysctl for igmp variables.
*/
int
igmp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
/* All sysctl names at this level are terminal. */
if (namelen != 1)
return (ENOTDIR);
switch (name[0]) {
case IGMPCTL_STATS:
if (newp != NULL)
return (EPERM);
return (igmp_sysctl_igmpstat(oldp, oldlenp, newp));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
int
igmp_sysctl_igmpstat(void *oldp, size_t *oldlenp, void *newp)
{
uint64_t counters[igps_ncounters];
struct igmpstat igmpstat;
u_long *words = (u_long *)&igmpstat;
int i;
CTASSERT(sizeof(igmpstat) == (nitems(counters) * sizeof(u_long)));
memset(&igmpstat, 0, sizeof igmpstat);
counters_read(igmpcounters, counters, nitems(counters));
for (i = 0; i < nitems(counters); i++)
words[i] = (u_long)counters[i];
return (sysctl_rdstruct(oldp, oldlenp, newp,
&igmpstat, sizeof(igmpstat)));
}
/* $OpenBSD: strnlen.c,v 1.3 2019/01/25 00:19:26 millert Exp $ */
/*
* Copyright (c) 2010 Todd C. Miller <millert@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/types.h>
#include <lib/libkern/libkern.h>
size_t
strnlen(const char *str, size_t maxlen)
{
const char *cp;
for (cp = str; maxlen != 0 && *cp != '\0'; cp++, maxlen--)
;
return (size_t)(cp - str);
}
/* $OpenBSD: sched_bsd.c,v 1.72 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $ */
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.6 (Berkeley) 1/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <uvm/uvm_extern.h>
#include <sys/sched.h>
#include <sys/timeout.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
int lbolt; /* once a second sleep address */
int rrticks_init; /* # of hardclock ticks per roundrobin() */
#ifdef MULTIPROCESSOR
struct __mp_lock sched_lock;
#endif
void schedcpu(void *);
uint32_t decay_aftersleep(uint32_t, uint32_t);
/*
* Force switch among equal priority processes every 100ms.
*/
void
roundrobin(struct cpu_info *ci)
{
struct schedstate_percpu *spc = &ci->ci_schedstate;
spc->spc_rrticks = rrticks_init;
if (ci->ci_curproc != NULL) {
if (spc->spc_schedflags & SPCF_SEENRR) {
/*
* The process has already been through a roundrobin
* without switching and may be hogging the CPU.
* Indicate that the process should yield.
*/
atomic_setbits_int(&spc->spc_schedflags,
SPCF_SHOULDYIELD);
} else {
atomic_setbits_int(&spc->spc_schedflags,
SPCF_SEENRR);
}
}
if (spc->spc_nrun)
need_resched(ci);
}
/*
* Constants for digital decay and forget:
* 90% of (p_estcpu) usage in 5 * loadav time
* 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
* Note that, as ps(1) mentions, this can let percentages
* total over 100% (I've seen 137.9% for 3 processes).
*
* Note that hardclock updates p_estcpu and p_cpticks independently.
*
* We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
* That is, the system wants to compute a value of decay such
* that the following for loop:
* for (i = 0; i < (5 * loadavg); i++)
* p_estcpu *= decay;
* will compute
* p_estcpu *= 0.1;
* for all values of loadavg:
*
* Mathematically this loop can be expressed by saying:
* decay ** (5 * loadavg) ~= .1
*
* The system computes decay as:
* decay = (2 * loadavg) / (2 * loadavg + 1)
*
* We wish to prove that the system's computation of decay
* will always fulfill the equation:
* decay ** (5 * loadavg) ~= .1
*
* If we compute b as:
* b = 2 * loadavg
* then
* decay = b / (b + 1)
*
* We now need to prove two things:
* 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
* 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
*
* Facts:
* For x close to zero, exp(x) =~ 1 + x, since
* exp(x) = 0! + x**1/1! + x**2/2! + ... .
* therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
* For x close to zero, ln(1+x) =~ x, since
* ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
* therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
* ln(.1) =~ -2.30
*
* Proof of (1):
* Solve (factor)**(power) =~ .1 given power (5*loadav):
* solving for factor,
* ln(factor) =~ (-2.30/5*loadav), or
* factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
* exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
*
* Proof of (2):
* Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
* solving for power,
* power*ln(b/(b+1)) =~ -2.30, or
* power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
*
* Actual power values for the implemented algorithm are as follows:
* loadav: 1 2 3 4
* power: 5.68 10.32 14.94 19.55
*/
/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define loadfactor(loadav) (2 * (loadav))
#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
/*
* If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
* faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
* and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
*
* To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
* 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
*
* If you don't want to bother with the faster/more-accurate formula, you
* can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
* (more general) method of calculating the %age of CPU used by a process.
*/
#define CCPU_SHIFT 11
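/*
 * A worked example of the decay arithmetic above (a stand-alone user
 * space sketch, assuming FSHIFT is 11 so FSCALE == 2048; not kernel
 * code): with a load average of 1.0 the decay factor is 2/3, and five
 * applications of decay_cpu() take an estcpu of 255 down to about 33,
 * which is roughly the "90% forgotten in 5 * loadavg seconds" behaviour
 * the derivation aims for.
 */
#if 0	/* illustrative sketch -- not compiled */
#include <stdio.h>

#define SK_FSHIFT	11			/* assumed to match FSHIFT */
#define SK_FSCALE	(1 << SK_FSHIFT)
#define sk_loadfactor(loadav)	(2 * (loadav))
#define sk_decay_cpu(loadfac, cpu) \
	(((loadfac) * (cpu)) / ((loadfac) + SK_FSCALE))

int
main(void)
{
	unsigned int loadav = 1 * SK_FSCALE;	/* load average 1.0, fixpt */
	unsigned int loadfac = sk_loadfactor(loadav);
	unsigned int estcpu = 255;
	int i;

	/* decay = 2*loadav / (2*loadav + 1) = 2/3 for loadav == 1 */
	for (i = 0; i < 5; i++)
		estcpu = sk_decay_cpu(loadfac, estcpu);
	printf("estcpu after 5 decays: %u\n", estcpu);	/* prints 33 */
	return 0;
}
#endif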
/*
* Recompute process priorities, every second.
*/
void
schedcpu(void *arg)
{
struct timeout *to = (struct timeout *)arg;
fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
struct proc *p;
int s;
unsigned int newcpu;
int phz;
/*
* If we have a statistics clock, use that to calculate CPU
* time, otherwise revert to using the profiling clock (which,
* in turn, defaults to hz if there is no separate profiling
* clock available)
*/
phz = stathz ? stathz : profhz;
KASSERT(phz);
LIST_FOREACH(p, &allproc, p_list) {
/*
* Idle threads are never placed on the runqueue,
* therefore computing their priority is pointless.
*/
if (p->p_cpu != NULL &&
p->p_cpu->ci_schedstate.spc_idleproc == p)
continue;
/*
* Increment sleep time (if sleeping). We ignore overflow.
*/
if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
p->p_slptime++;
p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
/*
* If the process has slept the entire second,
* stop recalculating its priority until it wakes up.
*/
if (p->p_slptime > 1)
continue;
SCHED_LOCK(s);
/*
* p_pctcpu is only for diagnostic tools such as ps.
*/
#if (FSHIFT >= CCPU_SHIFT)
p->p_pctcpu += (phz == 100)?
((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
100 * (((fixpt_t) p->p_cpticks)
<< (FSHIFT - CCPU_SHIFT)) / phz;
#else
p->p_pctcpu += ((FSCALE - ccpu) *
(p->p_cpticks * FSCALE / phz)) >> FSHIFT;
#endif
p->p_cpticks = 0;
newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
setpriority(p, newcpu, p->p_p->ps_nice);
if (p->p_stat == SRUN &&
(p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
remrunqueue(p);
setrunqueue(p->p_cpu, p, p->p_usrpri);
}
SCHED_UNLOCK(s);
}
uvm_meter();
wakeup(&lbolt);
timeout_add_sec(to, 1);
}
/*
* Recalculate the priority of a process after it has slept for a while.
* For all load averages >= 1 and max p_estcpu of 255, sleeping for at
* least six times the loadfactor will decay p_estcpu to zero.
*/
uint32_t
decay_aftersleep(uint32_t estcpu, uint32_t slptime)
{
fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
uint32_t newcpu;
if (slptime > 5 * loadfac)
newcpu = 0;
else {
newcpu = estcpu;
slptime--; /* the first time was done in schedcpu */
while (newcpu && --slptime)
newcpu = decay_cpu(loadfac, newcpu);
}
return (newcpu);
}
/*
* General yield call. Puts the current process back on its run queue and
* performs a voluntary context switch.
*/
void
yield(void)
{
struct proc *p = curproc;
int s;
SCHED_LOCK(s);
setrunqueue(p->p_cpu, p, p->p_usrpri);
p->p_ru.ru_nvcsw++;
mi_switch();
SCHED_UNLOCK(s);
}
/*
* General preemption call. Puts the current process back on its run queue
* and performs an involuntary context switch. If a process is supplied,
* we switch to that process. Otherwise, we use the normal process selection
* criteria.
*/
void
preempt(void)
{
struct proc *p = curproc;
int s;
SCHED_LOCK(s);
setrunqueue(p->p_cpu, p, p->p_usrpri);
p->p_ru.ru_nivcsw++;
mi_switch();
SCHED_UNLOCK(s);
}
void
mi_switch(void)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
struct proc *p = curproc;
struct proc *nextproc;
struct process *pr = p->p_p;
struct timespec ts;
#ifdef MULTIPROCESSOR
int hold_count;
int sched_count;
#endif
assertwaitok();
KASSERT(p->p_stat != SONPROC);
SCHED_ASSERT_LOCKED();
#ifdef MULTIPROCESSOR
/*
* Release the kernel_lock, as we are about to yield the CPU.
*/
sched_count = __mp_release_all_but_one(&sched_lock);
if (_kernel_lock_held())
hold_count = __mp_release_all(&kernel_lock);
else
hold_count = 0;
#endif
/*
* Compute the amount of time during which the current
* process was running, and add that to its total so far.
*/
nanouptime(&ts);
if (timespeccmp(&ts, &spc->spc_runtime, <)) {
#if 0
printf("uptime is not monotonic! "
"ts=%lld.%09lu, runtime=%lld.%09lu\n",
(long long)ts.tv_sec, ts.tv_nsec,
(long long)spc->spc_runtime.tv_sec,
spc->spc_runtime.tv_nsec);
#endif
} else {
timespecsub(&ts, &spc->spc_runtime, &ts);
timespecadd(&p->p_rtime, &ts, &p->p_rtime);
}
/* add the time counts for this thread to the process's total */
tuagg_unlocked(pr, p);
/*
* Process is about to yield the CPU; clear the appropriate
* scheduling flags.
*/
atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
nextproc = sched_chooseproc();
if (p != nextproc) {
uvmexp.swtch++;
TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
nextproc->p_p->ps_pid);
cpu_switchto(p, nextproc);
TRACEPOINT(sched, on__cpu, NULL);
} else {
TRACEPOINT(sched, remain__cpu, NULL);
p->p_stat = SONPROC;
}
clear_resched(curcpu());
SCHED_ASSERT_LOCKED();
/*
* To preserve lock ordering, we need to release the sched lock
* and grab it after we grab the big lock.
* In the future, when the sched lock isn't recursive, we'll
* just release it here.
*/
#ifdef MULTIPROCESSOR
__mp_unlock(&sched_lock);
#endif
SCHED_ASSERT_UNLOCKED();
smr_idle();
/*
* We're running again; record our new start time. We might
* be running on a new CPU now, so don't use the cache'd
* schedstate_percpu pointer.
*/
KASSERT(p->p_cpu == curcpu());
nanouptime(&p->p_cpu->ci_schedstate.spc_runtime);
#ifdef MULTIPROCESSOR
/*
* Reacquire the kernel_lock now. We do this after we've
* released the scheduler lock to avoid deadlock, and before
* we reacquire the interlock and the scheduler lock.
*/
if (hold_count)
__mp_acquire_count(&kernel_lock, hold_count);
__mp_acquire_count(&sched_lock, sched_count + 1);
#endif
}
/*
* Change process state to be runnable,
* placing it on the run queue.
*/
void
setrunnable(struct proc *p)
{
struct process *pr = p->p_p;
u_char prio;
SCHED_ASSERT_LOCKED();
switch (p->p_stat) {
case 0:
case SRUN:
case SONPROC:
case SDEAD:
case SIDL:
default:
panic("setrunnable");
case SSTOP:
/*
* If we're being traced (possibly because someone attached us
* while we were stopped), check for a signal from the debugger.
*/
if ((pr->ps_flags & PS_TRACED) != 0 && pr->ps_xsig != 0)
atomic_setbits_int(&p->p_siglist, sigmask(pr->ps_xsig));
prio = p->p_usrpri;
unsleep(p);
break;
case SSLEEP:
prio = p->p_slppri;
unsleep(p); /* e.g. when sending signals */
break;
}
setrunqueue(NULL, p, prio);
if (p->p_slptime > 1) {
uint32_t newcpu;
newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
setpriority(p, newcpu, pr->ps_nice);
}
p->p_slptime = 0;
}
/*
* Compute the priority of a process.
*/
void
setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
{
unsigned int newprio;
newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
SCHED_ASSERT_LOCKED();
p->p_estcpu = newcpu;
p->p_usrpri = newprio;
}
/*
* We adjust the priority of the current process. The priority of a process
* gets worse as it accumulates CPU time. The cpu usage estimator (p_estcpu)
* is increased here. The formula for computing priorities (in kern_synch.c)
* will compute a different value each time p_estcpu increases. This can
* cause a switch, but unless the priority crosses a PPQ boundary the actual
* queue will not change. The cpu usage estimator ramps up quite quickly
* when the process is running (linearly), and decays away exponentially, at
* a rate which is proportionally slower when the system is busy. The basic
* principle is that the system will 90% forget that the process used a lot
* of CPU time in 5 * loadav seconds. This causes the system to favor
* processes which haven't run much recently, and to round-robin among other
* processes.
*/
void
schedclock(struct proc *p)
{
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
uint32_t newcpu;
int s;
if (p == spc->spc_idleproc || spc->spc_spinning)
return;
SCHED_LOCK(s);
newcpu = ESTCPULIM(p->p_estcpu + 1);
setpriority(p, newcpu, p->p_p->ps_nice);
SCHED_UNLOCK(s);
}
void (*cpu_setperf)(int);
#define PERFPOL_MANUAL 0
#define PERFPOL_AUTO 1
#define PERFPOL_HIGH 2
int perflevel = 100;
int perfpolicy = PERFPOL_AUTO;
#ifndef SMALL_KERNEL
/*
* The code below handles CPU throttling.
*/
#include <sys/sysctl.h>
void setperf_auto(void *);
struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL);
extern int hw_power;
void
setperf_auto(void *v)
{
static uint64_t *idleticks, *totalticks;
static int downbeats;
int i, j = 0;
int speedup = 0;
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
uint64_t idle, total, allidle = 0, alltotal = 0;
if (perfpolicy != PERFPOL_AUTO)
return;
if (cpu_setperf == NULL)
return;
if (hw_power) {
speedup = 1;
goto faster;
}
if (!idleticks)
if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks),
M_DEVBUF, M_NOWAIT | M_ZERO)))
return;
if (!totalticks)
if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks),
M_DEVBUF, M_NOWAIT | M_ZERO))) {
free(idleticks, M_DEVBUF,
sizeof(*idleticks) * ncpusfound);
return;
}
CPU_INFO_FOREACH(cii, ci) {
if (!cpu_is_online(ci))
continue;
total = 0;
for (i = 0; i < CPUSTATES; i++) {
total += ci->ci_schedstate.spc_cp_time[i];
}
total -= totalticks[j];
idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j];
if (idle < total / 3)
speedup = 1;
alltotal += total;
allidle += idle;
idleticks[j] += idle;
totalticks[j] += total;
j++;
}
if (allidle < alltotal / 2)
speedup = 1;
if (speedup && downbeats < 5)
downbeats++;
if (speedup && perflevel != 100) {
faster:
perflevel = 100;
cpu_setperf(perflevel);
} else if (!speedup && perflevel != 0 && --downbeats <= 0) {
perflevel = 0;
cpu_setperf(perflevel);
}
timeout_add_msec(&setperf_to, 100);
}
int
sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
int err;
if (!cpu_setperf)
return EOPNOTSUPP;
if (perfpolicy != PERFPOL_MANUAL)
return sysctl_rdint(oldp, oldlenp, newp, perflevel);
err = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&perflevel, 0, 100);
if (err)
return err;
if (newp != NULL)
cpu_setperf(perflevel);
return 0;
}
int
sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
char policy[32];
int err;
if (!cpu_setperf)
return EOPNOTSUPP;
switch (perfpolicy) {
case PERFPOL_MANUAL:
strlcpy(policy, "manual", sizeof(policy));
break;
case PERFPOL_AUTO:
strlcpy(policy, "auto", sizeof(policy));
break;
case PERFPOL_HIGH:
strlcpy(policy, "high", sizeof(policy));
break;
default:
strlcpy(policy, "unknown", sizeof(policy));
break;
}
if (newp == NULL)
return sysctl_rdstring(oldp, oldlenp, newp, policy);
err = sysctl_string(oldp, oldlenp, newp, newlen, policy, sizeof(policy));
if (err)
return err;
if (strcmp(policy, "manual") == 0)
perfpolicy = PERFPOL_MANUAL;
else if (strcmp(policy, "auto") == 0)
perfpolicy = PERFPOL_AUTO;
else if (strcmp(policy, "high") == 0)
perfpolicy = PERFPOL_HIGH;
else
return EINVAL;
if (perfpolicy == PERFPOL_AUTO) {
timeout_add_msec(&setperf_to, 200);
} else if (perfpolicy == PERFPOL_HIGH) {
perflevel = 100;
cpu_setperf(perflevel);
}
return 0;
}
#endif
void
scheduler_start(void)
{
static struct timeout schedcpu_to;
/*
* We avoid polluting the global namespace by keeping the scheduler
* timeouts static in this function.
* We setup the timeout here and kick schedcpu once to make it do
* its job.
*/
timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
rrticks_init = hz / 10;
schedcpu(&schedcpu_to);
#ifndef SMALL_KERNEL
if (perfpolicy == PERFPOL_AUTO)
timeout_add_msec(&setperf_to, 200);
#endif
}
/* $OpenBSD: tcp_timer.c,v 1.70 2022/09/03 19:22:19 bluhm Exp $ */
/* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_timer.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp_seq.h>
/*
* Locks used to protect struct members in this file:
* T tcp_timer_mtx global tcp timer data structures
*/
int tcp_always_keepalive;
int tcp_keepidle;
int tcp_keepintvl;
int tcp_maxpersistidle; /* max idle time in persist */
int tcp_maxidle; /* [T] max idle time for keep alive */
/*
* Time to delay the ACK. This is initialized in tcp_init(), unless
* it's patched.
*/
int tcp_delack_msecs;
void tcp_timer_rexmt(void *);
void tcp_timer_persist(void *);
void tcp_timer_keep(void *);
void tcp_timer_2msl(void *);
void tcp_timer_reaper(void *);
void tcp_timer_delack(void *);
const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS] = {
tcp_timer_rexmt,
tcp_timer_persist,
tcp_timer_keep,
tcp_timer_2msl,
tcp_timer_reaper,
tcp_timer_delack,
};
/*
* Timer state initialization, called from tcp_init().
*/
void
tcp_timer_init(void)
{
if (tcp_keepidle == 0)
tcp_keepidle = TCPTV_KEEP_IDLE;
if (tcp_keepintvl == 0)
tcp_keepintvl = TCPTV_KEEPINTVL;
if (tcp_maxpersistidle == 0)
tcp_maxpersistidle = TCPTV_KEEP_IDLE;
if (tcp_delack_msecs == 0)
tcp_delack_msecs = TCP_DELACK_MSECS;
}
/*
* Callout to process delayed ACKs for a TCPCB.
*/
void
tcp_timer_delack(void *arg)
{
struct tcpcb *otp = NULL, *tp = arg;
short ostate;
/*
* If tcp_output() wasn't able to transmit the ACK
* for whatever reason, it will restart the delayed
* ACK callout.
*/
NET_LOCK();
/* Ignore canceled timeouts or timeouts that have been rescheduled. */
if (!ISSET((tp)->t_flags, TF_TMR_DELACK) ||
timeout_pending(&tp->t_timer[TCPT_DELACK]))
goto out;
CLR((tp)->t_flags, TF_TMR_DELACK);
if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
otp = tp;
ostate = tp->t_state;
}
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
if (otp)
tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_DELACK, 0);
out:
NET_UNLOCK();
}
/*
* Tcp protocol timeout routine called every 500 ms.
* Updates the timers in all active tcb's and
* causes finite state machine actions if timers expire.
*/
void
tcp_slowtimo(void)
{
mtx_enter(&tcp_timer_mtx);
tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
tcp_iss += TCP_ISSINCR2/PR_SLOWHZ; /* increment iss */
tcp_now++; /* for timestamps */
mtx_leave(&tcp_timer_mtx);
}
/*
* Cancel all timers for TCP tp.
*/
void
tcp_canceltimers(struct tcpcb *tp)
{
int i;
for (i = 0; i < TCPT_NTIMERS; i++)
TCP_TIMER_DISARM(tp, i);
}
int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
{ 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
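/*
 * Illustrative sketch, not kernel code: how tcp_backoff[] is used by the
 * retransmit and persist timers below.  The smoothed RTO estimate is
 * multiplied by the backoff entry for the current shift and clamped to
 * [rttmin, TCPTV_REXMTMAX], which is what TCPT_RANGESET() does in
 * tcp_timer_rexmt().  Names and parameters here are placeholders.
 */
static unsigned int
backed_off_rto_sketch(unsigned int rto, unsigned int rttmin,
    unsigned int rexmtmax, int rxtshift)
{
    static const int backoff[] =
        { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
    unsigned int v;

    if (rto < rttmin)
        rto = rttmin;           /* never retransmit faster than rttmin */
    v = rto * backoff[rxtshift];
    if (v < rttmin)
        v = rttmin;
    if (v > rexmtmax)
        v = rexmtmax;           /* cap the exponential backoff */
    return v;
}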
/*
* TCP timer processing.
*/
void tcp_timer_freesack(struct tcpcb *);
void
tcp_timer_freesack(struct tcpcb *tp)
{
struct sackhole *p, *q;
/*
* Free SACK holes for 2MSL and REXMT timers.
*/
q = tp->snd_holes;
while (q != NULL) {
p = q;
q = q->next;
pool_put(&sackhl_pool, p);
}
tp->snd_holes = 0;
}
void
tcp_timer_rexmt(void *arg)
{
struct tcpcb *otp = NULL, *tp = arg;
uint32_t rto;
short ostate;
NET_LOCK();
/* Ignore canceled timeouts or timeouts that have been rescheduled. */
if (!ISSET((tp)->t_flags, TF_TMR_REXMT) ||
timeout_pending(&tp->t_timer[TCPT_REXMT]))
goto out;
CLR((tp)->t_flags, TF_TMR_REXMT);
if ((tp->t_flags & TF_PMTUD_PEND) && tp->t_inpcb &&
SEQ_GEQ(tp->t_pmtud_th_seq, tp->snd_una) &&
SEQ_LT(tp->t_pmtud_th_seq, (int)(tp->snd_una + tp->t_maxseg))) {
struct sockaddr_in sin;
struct icmp icmp;
tp->t_flags &= ~TF_PMTUD_PEND;
/* XXX create fake icmp message with relevant entries */
icmp.icmp_nextmtu = tp->t_pmtud_nextmtu;
icmp.icmp_ip.ip_len = tp->t_pmtud_ip_len;
icmp.icmp_ip.ip_hl = tp->t_pmtud_ip_hl;
icmp.icmp_ip.ip_dst = tp->t_inpcb->inp_faddr;
icmp_mtudisc(&icmp, tp->t_inpcb->inp_rtableid);
/*
* Notify all connections to the same peer about
* new mss and trigger retransmit.
*/
bzero(&sin, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr = tp->t_inpcb->inp_faddr;
in_pcbnotifyall(&tcbtable, sintosa(&sin),
tp->t_inpcb->inp_rtableid, EMSGSIZE, tcp_mtudisc);
goto out;
}
tcp_timer_freesack(tp);
if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
tcpstat_inc(tcps_timeoutdrop);
tp = tcp_drop(tp, tp->t_softerror ?
tp->t_softerror : ETIMEDOUT);
goto out;
}
if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
otp = tp;
ostate = tp->t_state;
}
tcpstat_inc(tcps_rexmttimeo);
rto = TCP_REXMTVAL(tp);
if (rto < tp->t_rttmin)
rto = tp->t_rttmin;
TCPT_RANGESET(tp->t_rxtcur,
rto * tcp_backoff[tp->t_rxtshift],
tp->t_rttmin, TCPTV_REXMTMAX);
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
/*
* If we are losing and we are trying path MTU discovery,
* try turning it off. This will avoid black holes in
* the network which suppress or fail to send "packet
* too big" ICMP messages. We should ideally do
* lots more sophisticated searching to find the right
* value here...
*/
if (ip_mtudisc && tp->t_inpcb &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
struct inpcb *inp = tp->t_inpcb;
struct rtentry *rt = NULL;
/* No data to send means path mtu is not a problem */
if (!inp->inp_socket->so_snd.sb_cc)
goto leave;
rt = in_pcbrtentry(inp);
/* Check if path MTU discovery is disabled already */
if (rt && (rt->rt_flags & RTF_HOST) &&
(rt->rt_locks & RTV_MTU))
goto leave;
rt = NULL;
switch(tp->pf) {
#ifdef INET6
case PF_INET6:
/*
* We can not turn off path MTU for IPv6.
* Do nothing for now, maybe lower to
* minimum MTU.
*/
break;
#endif
case PF_INET:
rt = icmp_mtudisc_clone(inp->inp_faddr,
inp->inp_rtableid, 0);
break;
}
if (rt != NULL) {
/* Disable path MTU discovery */
if ((rt->rt_locks & RTV_MTU) == 0) {
rt->rt_locks |= RTV_MTU;
in_rtchange(inp, 0);
}
rtfree(rt);
}
leave:
;
}
/*
* If losing, let the lower level know and try for
* a better route. Also, if we backed off this far,
* our srtt estimate is probably bogus. Clobber it
* so we'll take the next rtt measurement as our srtt;
* move the current srtt into rttvar to keep the current
* retransmit times until then.
*/
if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
in_losing(tp->t_inpcb);
tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
tp->t_srtt = 0;
}
tp->snd_nxt = tp->snd_una;
/*
* Note: We overload snd_last to function also as the
* snd_last variable described in RFC 2582
*/
tp->snd_last = tp->snd_max;
/*
* If timing a segment in this window, stop the timer.
*/
tp->t_rtttime = 0;
#ifdef TCP_ECN
/*
* if ECN is enabled, there might be a broken firewall which
* blocks ecn packets. fall back to non-ecn.
*/
if ((tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED)
&& tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
tp->t_flags |= TF_DISABLE_ECN;
#endif
/*
* Close the congestion window down to one segment
* (we'll open it by one segment for each ack we get).
* Since we probably have a window's worth of unacked
* data accumulated, this "slow start" keeps us from
* dumping all that data as back-to-back packets (which
* might overwhelm an intermediate gateway).
*
* There are two phases to the opening: Initially we
* open by one mss on each ack. This makes the window
* size increase exponentially with time. If the
* window is larger than the path can handle, this
* exponential growth results in dropped packet(s)
* almost immediately. To get more time between
* drops but still "push" the network to take advantage
* of improving conditions, we switch from exponential
* to linear window opening at some threshold size.
* For a threshold, we use half the current window
* size, truncated to a multiple of the mss.
*
* (the minimum cwnd that will give us exponential
* growth is 2 mss. We don't allow the threshold
* to go below this.)
*/
{
u_long win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
if (win < 2)
win = 2;
tp->snd_cwnd = tp->t_maxseg;
tp->snd_ssthresh = win * tp->t_maxseg;
tp->t_dupacks = 0;
#ifdef TCP_ECN
tp->snd_last = tp->snd_max;
tp->t_flags |= TF_SEND_CWR;
#endif
#if 1 /* TCP_ECN */
tcpstat_inc(tcps_cwr_timeout);
#endif
}
(void) tcp_output(tp);
if (otp)
tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_REXMT, 0);
out:
NET_UNLOCK();
}
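/*
 * Illustrative sketch, not kernel code: the congestion-window reset done
 * by the block at the end of tcp_timer_rexmt() above.  ssthresh becomes
 * half the current flight size in whole segments (never less than two
 * segments) and cwnd collapses to a single segment, so the connection
 * re-enters slow start.  Names are placeholders.
 */
static void
rexmt_cwnd_reset_sketch(unsigned long snd_wnd, unsigned long *snd_cwnd,
    unsigned long *snd_ssthresh, unsigned long maxseg)
{
    unsigned long win;

    win = ((snd_wnd < *snd_cwnd) ? snd_wnd : *snd_cwnd) / 2 / maxseg;
    if (win < 2)
        win = 2;                        /* keep exponential growth possible */
    *snd_cwnd = maxseg;                 /* back to one segment */
    *snd_ssthresh = win * maxseg;       /* exponential growth up to here */
}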
void
tcp_timer_persist(void *arg)
{
struct tcpcb *otp = NULL, *tp = arg;
uint32_t rto;
short ostate;
uint32_t now;
NET_LOCK();
/* Ignore canceled timeouts or timeouts that have been rescheduled. */
if (!ISSET((tp)->t_flags, TF_TMR_PERSIST) ||
timeout_pending(&tp->t_timer[TCPT_PERSIST]))
goto out;
CLR((tp)->t_flags, TF_TMR_PERSIST);
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
goto out;
if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
otp = tp;
ostate = tp->t_state;
}
tcpstat_inc(tcps_persisttimeo);
/*
* Hack: if the peer is dead/unreachable, we do not
* time out if the window is closed. After a full
* backoff, drop the connection if the idle time
* (no responses to probes) reaches the maximum
* backoff that we would use if retransmitting.
*/
rto = TCP_REXMTVAL(tp);
if (rto < tp->t_rttmin)
rto = tp->t_rttmin;
now = READ_ONCE(tcp_now);
if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
((now - tp->t_rcvtime) >= tcp_maxpersistidle ||
(now - tp->t_rcvtime) >= rto * tcp_totbackoff)) {
tcpstat_inc(tcps_persistdrop);
tp = tcp_drop(tp, ETIMEDOUT);
goto out;
}
tcp_setpersist(tp);
tp->t_force = 1;
(void) tcp_output(tp);
tp->t_force = 0;
if (otp)
tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_PERSIST, 0);
out:
NET_UNLOCK();
}
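/*
 * Illustrative sketch, not kernel code: the drop test used by the persist
 * timer above.  Once the retransmit shift has reached its ceiling, the
 * connection is dropped if it has been idle longer than the persist limit
 * or longer than the fully backed-off retransmit time.  Names are
 * placeholders.
 */
static int
persist_should_drop_sketch(int rxtshift, int maxrxtshift, unsigned int idle,
    unsigned int maxpersistidle, unsigned int rto, unsigned int totbackoff)
{
    if (rxtshift < maxrxtshift)
        return 0;               /* still backing off, keep probing */
    return idle >= maxpersistidle || idle >= rto * totbackoff;
}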
void
tcp_timer_keep(void *arg)
{
struct tcpcb *otp = NULL, *tp = arg;
short ostate;
NET_LOCK();
/* Ignore canceled timeouts or timeouts that have been rescheduled. */
if (!ISSET((tp)->t_flags, TF_TMR_KEEP) ||
timeout_pending(&tp->t_timer[TCPT_KEEP]))
goto out;
CLR((tp)->t_flags, TF_TMR_KEEP);
if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
otp = tp;
ostate = tp->t_state;
}
tcpstat_inc(tcps_keeptimeo);
if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
goto dropit;
if ((tcp_always_keepalive ||
tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
tp->t_state <= TCPS_CLOSING) {
int maxidle;
uint32_t now;
maxidle = READ_ONCE(tcp_maxidle);
now = READ_ONCE(tcp_now);
if ((maxidle > 0) &&
((now - tp->t_rcvtime) >= tcp_keepidle + maxidle))
goto dropit;
/*
* Send a packet designed to force a response
* if the peer is up and reachable:
* either an ACK if the connection is still alive,
* or an RST if the peer has closed the connection
* due to timeout or reboot.
* Using sequence number tp->snd_una-1
* causes the transmitted zero-length segment
* to lie outside the receive window;
* by the protocol spec, this requires the
* correspondent TCP to respond.
*/
tcpstat_inc(tcps_keepprobe);
tcp_respond(tp, mtod(tp->t_template, caddr_t),
NULL, tp->rcv_nxt, tp->snd_una - 1, 0, 0, now);
TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
} else
TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
if (otp)
tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_KEEP, 0);
out:
NET_UNLOCK();
return;
dropit:
tcpstat_inc(tcps_keepdrops);
tp = tcp_drop(tp, ETIMEDOUT);
NET_UNLOCK();
}
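/*
 * Illustrative sketch, not kernel code: the decision tree of the keepalive
 * timer above (it ignores the tp->t_state <= TCPS_CLOSING refinement).
 * An unestablished connection is dropped outright; with keepalives off the
 * timer is simply re-armed with tcp_keepidle; with keepalives on, the
 * connection is dropped once it has been idle for tcp_keepidle plus
 * tcp_maxidle, otherwise a probe is sent and the timer is re-armed with
 * tcp_keepintvl.  Names are placeholders.
 */
enum keep_action_sketch { KEEP_REARM_IDLE, KEEP_PROBE, KEEP_DROP };

static enum keep_action_sketch
keepalive_sketch(int established, int keepalive_on, unsigned int idle,
    unsigned int keepidle, unsigned int maxidle)
{
    if (!established)
        return KEEP_DROP;
    if (!keepalive_on)
        return KEEP_REARM_IDLE;     /* re-arm with tcp_keepidle */
    if (maxidle > 0 && idle >= keepidle + maxidle)
        return KEEP_DROP;
    return KEEP_PROBE;              /* probe, re-arm with tcp_keepintvl */
}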
void
tcp_timer_2msl(void *arg)
{
struct tcpcb *otp = NULL, *tp = arg;
short ostate;
int maxidle;
uint32_t now;
NET_LOCK();
/* Ignore canceled timeouts or timeouts that have been rescheduled. */
if (!ISSET((tp)->t_flags, TF_TMR_2MSL) ||
timeout_pending(&tp->t_timer[TCPT_2MSL]))
goto out;
CLR((tp)->t_flags, TF_TMR_2MSL);
if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
otp = tp;
ostate = tp->t_state;
}
tcp_timer_freesack(tp);
maxidle = READ_ONCE(tcp_maxidle);
now = READ_ONCE(tcp_now);
if (tp->t_state != TCPS_TIME_WAIT &&
((maxidle == 0) || ((now - tp->t_rcvtime) <= maxidle)))
TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
else
tp = tcp_close(tp);
if (otp)
tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_2MSL, 0);
out:
NET_UNLOCK();
}
void
tcp_timer_reaper(void *arg)
{
struct tcpcb *tp = arg;
/*
* This timer is necessary to delay the pool_put() after all timers
* have finished, even if they were sleeping to grab the net lock.
* Putting the pool_put() in a timer is sufficient as all timers run
* from the same timeout thread. Note that neither softnet thread nor
* user process may access the tcpcb after arming the reaper timer.
* Freeing may run in parallel as it does not grab the net lock.
*/
pool_put(&tcpcb_pool, tp);
tcpstat_inc(tcps_closed);
}
/* $OpenBSD: uvm_vnode.c,v 1.127 2022/08/31 09:07:35 gnezdo Exp $ */
/* $NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993
* The Regents of the University of California.
* Copyright (c) 1990 University of Utah.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vnode_pager.c 8.8 (Berkeley) 2/13/94
* from: Id: uvm_vnode.c,v 1.1.2.26 1998/02/02 20:38:07 chuck Exp
*/
/*
* uvm_vnode.c: the vnode pager.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/disklabel.h>
#include <sys/fcntl.h>
#include <sys/conf.h>
#include <sys/rwlock.h>
#include <sys/dkio.h>
#include <sys/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_vnode.h>
/*
* private global data structure
*
* we keep a list of writeable active vnode-backed VM objects for sync op.
* we keep a simpleq of vnodes that are currently being sync'd.
*/
LIST_HEAD(uvn_list_struct, uvm_vnode);
struct uvn_list_struct uvn_wlist; /* writeable uvns */
SIMPLEQ_HEAD(uvn_sq_struct, uvm_vnode);
struct uvn_sq_struct uvn_sync_q; /* sync'ing uvns */
struct rwlock uvn_sync_lock; /* locks sync operation */
extern int rebooting;
/*
* functions
*/
void uvn_cluster(struct uvm_object *, voff_t, voff_t *, voff_t *);
void uvn_detach(struct uvm_object *);
boolean_t uvn_flush(struct uvm_object *, voff_t, voff_t, int);
int uvn_get(struct uvm_object *, voff_t, vm_page_t *, int *, int,
vm_prot_t, int, int);
void uvn_init(void);
int uvn_io(struct uvm_vnode *, vm_page_t *, int, int, int);
int uvn_put(struct uvm_object *, vm_page_t *, int, boolean_t);
void uvn_reference(struct uvm_object *);
/*
* master pager structure
*/
const struct uvm_pagerops uvm_vnodeops = {
.pgo_init = uvn_init,
.pgo_reference = uvn_reference,
.pgo_detach = uvn_detach,
.pgo_flush = uvn_flush,
.pgo_get = uvn_get,
.pgo_put = uvn_put,
.pgo_cluster = uvn_cluster,
/* use generic version of this: see uvm_pager.c */
.pgo_mk_pcluster = uvm_mk_pcluster,
};
/*
* the ops!
*/
/*
* uvn_init
*
* init pager private data structures.
*/
void
uvn_init(void)
{
LIST_INIT(&uvn_wlist);
/* note: uvn_sync_q init'd in uvm_vnp_sync() */
rw_init_flags(&uvn_sync_lock, "uvnsync", RWL_IS_VNODE);
}
/*
* uvn_attach
*
* attach a vnode structure to a VM object. if the vnode is already
* attached, then just bump the reference count by one and return the
* VM object. if not already attached, attach and return the new VM obj.
* the "accessprot" tells the max access the attaching thread wants to
* our pages.
*
* => in fact, nothing should be locked so that we can sleep here.
* => note that uvm_object is first thing in vnode structure, so their
* pointers are equiv.
*/
struct uvm_object *
uvn_attach(struct vnode *vp, vm_prot_t accessprot)
{
struct uvm_vnode *uvn = vp->v_uvm;
struct vattr vattr;
int oldflags, result;
struct partinfo pi;
u_quad_t used_vnode_size = 0;
/* first get a lock on the uvn. */
while (uvn->u_flags & UVM_VNODE_BLOCKED) {
uvn->u_flags |= UVM_VNODE_WANTED;
tsleep_nsec(uvn, PVM, "uvn_attach", INFSLP);
}
/* if we're mapping a BLK device, make sure it is a disk. */
if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK) {
return NULL;
}
/*
* now uvn must not be in a blocked state.
* first check to see if it is already active, in which case
* we can bump the reference count, check to see if we need to
* add it to the writeable list, and then return.
*/
rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
if (uvn->u_flags & UVM_VNODE_VALID) { /* already active? */
/* regain vref if we were persisting */
if (uvn->u_obj.uo_refs == 0) {
vref(vp);
}
uvn->u_obj.uo_refs++; /* bump uvn ref! */
rw_exit(uvn->u_obj.vmobjlock);
/* check for new writeable uvn */
if ((accessprot & PROT_WRITE) != 0 &&
(uvn->u_flags & UVM_VNODE_WRITEABLE) == 0) {
LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist);
/* we are now on wlist! */
uvn->u_flags |= UVM_VNODE_WRITEABLE;
}
return (&uvn->u_obj);
}
rw_exit(uvn->u_obj.vmobjlock);
/*
* need to call VOP_GETATTR() to get the attributes, but that could
* block (due to I/O), so we want to unlock the object before calling.
* however, we want to keep anyone else from playing with the object
* while it is unlocked. to do this we set UVM_VNODE_ALOCK which
* prevents anyone from attaching to the vnode until we are done with
* it.
*/
uvn->u_flags = UVM_VNODE_ALOCK;
if (vp->v_type == VBLK) {
/*
* We could implement this as a specfs getattr call, but:
*
* (1) VOP_GETATTR() would get the file system
* vnode operation, not the specfs operation.
*
* (2) All we want is the size, anyhow.
*/
result = (*bdevsw[major(vp->v_rdev)].d_ioctl)(vp->v_rdev,
DIOCGPART, (caddr_t)&pi, FREAD, curproc);
if (result == 0) {
/* XXX should remember blocksize */
used_vnode_size = (u_quad_t)pi.disklab->d_secsize *
(u_quad_t)DL_GETPSIZE(pi.part);
}
} else {
result = VOP_GETATTR(vp, &vattr, curproc->p_ucred, curproc);
if (result == 0)
used_vnode_size = vattr.va_size;
}
if (result != 0) {
if (uvn->u_flags & UVM_VNODE_WANTED)
wakeup(uvn);
uvn->u_flags = 0;
return NULL;
}
/*
* make sure that the newsize fits within a vaddr_t
* XXX: need to revise addressing data types
*/
#ifdef DEBUG
if (vp->v_type == VBLK)
printf("used_vnode_size = %llu\n", (long long)used_vnode_size);
#endif
/* now set up the uvn. */
KASSERT(uvn->u_obj.uo_refs == 0);
uvn->u_obj.uo_refs++;
oldflags = uvn->u_flags;
uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST;
uvn->u_nio = 0;
uvn->u_size = used_vnode_size;
/* if write access, we need to add it to the wlist */
if (accessprot & PROT_WRITE) {
LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist);
uvn->u_flags |= UVM_VNODE_WRITEABLE; /* we are on wlist! */
}
/*
* add a reference to the vnode. this reference will stay as long
* as there is a valid mapping of the vnode. dropped when the
* reference count goes to zero [and we either free or persist].
*/
vref(vp);
if (oldflags & UVM_VNODE_WANTED)
wakeup(uvn);
return &uvn->u_obj;
}
/*
* uvn_reference
*
* duplicate a reference to a VM object. Note that the reference
* count must already be at least one (the passed in reference) so
* there is no chance of the uvn being killed out here.
*
* => caller must be using the same accessprot as was used at attach time
*/
void
uvn_reference(struct uvm_object *uobj)
{
#ifdef DEBUG
struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
#endif
#ifdef DEBUG
if ((uvn->u_flags & UVM_VNODE_VALID) == 0) {
printf("uvn_reference: ref=%d, flags=0x%x\n",
uobj->uo_refs, uvn->u_flags);
panic("uvn_reference: invalid state");
}
#endif
rw_enter(uobj->vmobjlock, RW_WRITE);
uobj->uo_refs++;
rw_exit(uobj->vmobjlock);
}
/*
* uvn_detach
*
* remove a reference to a VM object.
*
* => caller must call with map locked.
* => this starts the detach process, but doesn't have to finish it
* (async i/o could still be pending).
*/
void
uvn_detach(struct uvm_object *uobj)
{
struct uvm_vnode *uvn;
struct vnode *vp;
int oldflags;
rw_enter(uobj->vmobjlock, RW_WRITE);
uobj->uo_refs--; /* drop ref! */
if (uobj->uo_refs) { /* still more refs */
rw_exit(uobj->vmobjlock);
return;
}
/* get other pointers ... */
uvn = (struct uvm_vnode *) uobj;
vp = uvn->u_vnode;
/*
* clear VTEXT flag now that there are no mappings left (VTEXT is used
* to keep an active text file from being overwritten).
*/
vp->v_flag &= ~VTEXT;
/*
* we just dropped the last reference to the uvn. see if we can
* let it "stick around".
*/
if (uvn->u_flags & UVM_VNODE_CANPERSIST) {
/* won't block */
uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES);
goto out;
}
/* its a goner! */
uvn->u_flags |= UVM_VNODE_DYING;
/*
* even though we may unlock in flush, no one can gain a reference
* to us until we clear the "dying" flag [because it blocks
* attaches]. we will not do that until after we've disposed of all
* the pages with uvn_flush(). note that before the flush the only
* pages that could be marked PG_BUSY are ones that are in async
* pageout by the daemon. (there can't be any pending "get"'s
* because there are no references to the object).
*/
(void) uvn_flush(uobj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES);
/*
* given the structure of this pager, the above flush request will
* create the following state: all the pages that were in the object
* have either been free'd or they are marked PG_BUSY and in the
* middle of an async io. If we still have pages we set the "relkill"
* state, so that in the case the vnode gets terminated we know
* to leave it alone. Otherwise we'll kill the vnode when it's empty.
*/
uvn->u_flags |= UVM_VNODE_RELKILL;
/* wait on any outstanding io */
while (uobj->uo_npages && uvn->u_flags & UVM_VNODE_RELKILL) {
uvn->u_flags |= UVM_VNODE_IOSYNC;
rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM, "uvn_term",
INFSLP);
}
if ((uvn->u_flags & UVM_VNODE_RELKILL) == 0) {
rw_exit(uobj->vmobjlock);
return;
}
/*
* kill object now. note that we can't be on the sync q because
* all references are gone.
*/
if (uvn->u_flags & UVM_VNODE_WRITEABLE) {
LIST_REMOVE(uvn, u_wlist);
}
KASSERT(RBT_EMPTY(uvm_objtree, &uobj->memt));
oldflags = uvn->u_flags;
uvn->u_flags = 0;
/* wake up any sleepers */
if (oldflags & UVM_VNODE_WANTED)
wakeup(uvn);
out:
rw_exit(uobj->vmobjlock);
/* drop our reference to the vnode. */
vrele(vp);
return;
}
/*
* uvm_vnp_terminate: external hook to clear out a vnode's VM
*
* called in two cases:
* [1] when a persisting vnode vm object (i.e. one with a zero reference
* count) needs to be freed so that a vnode can be reused. this
* happens under "getnewvnode" in vfs_subr.c. if the vnode from
* the free list is still attached (i.e. not VBAD) then vgone is
* called. as part of the vgone trace this should get called to
* free the vm object. this is the common case.
* [2] when a filesystem is being unmounted by force (MNT_FORCE,
* "umount -f") the vgone() function is called on active vnodes
* on the mounted file systems to kill their data (the vnodes become
* "dead" ones [see src/sys/miscfs/deadfs/...]). that results in a
* call here (even if the uvn is still in use -- i.e. has a non-zero
* reference count). this case happens at "umount -f" and during a
* "reboot/halt" operation.
*
* => the caller must XLOCK and VOP_LOCK the vnode before calling us
* [protects us from getting a vnode that is already in the DYING
* state...]
* => in case [2] the uvn is still alive after this call, but all I/O
* ops will fail (due to the backing vnode now being "dead"). this
* will prob. kill any process using the uvn due to pgo_get failing.
*/
void
uvm_vnp_terminate(struct vnode *vp)
{
struct uvm_vnode *uvn = vp->v_uvm;
struct uvm_object *uobj = &uvn->u_obj;
int oldflags;
/* check if it is valid */
rw_enter(uobj->vmobjlock, RW_WRITE);
if ((uvn->u_flags & UVM_VNODE_VALID) == 0) {
rw_exit(uobj->vmobjlock);
return;
}
/*
* must be a valid uvn that is not already dying (because XLOCK
* protects us from that). the uvn can't be in the ALOCK state
* because it is valid, and uvn's that are in the ALOCK state haven't
* been marked valid yet.
*/
#ifdef DEBUG
/*
* debug check: are we yanking the vnode out from under our uvn?
*/
if (uvn->u_obj.uo_refs) {
printf("uvm_vnp_terminate(%p): terminating active vnode "
"(refs=%d)\n", uvn, uvn->u_obj.uo_refs);
}
#endif
/*
* it is possible that the uvn was detached and is in the relkill
* state [i.e. waiting for async i/o to finish].
* we take over the vnode now and cancel the relkill.
* we want to know when the i/o is done so we can recycle right
* away. note that a uvn can only be in the RELKILL state if it
* has a zero reference count.
*/
if (uvn->u_flags & UVM_VNODE_RELKILL)
uvn->u_flags &= ~UVM_VNODE_RELKILL; /* cancel RELKILL */
/*
* block the uvn by setting the dying flag, and then flush the
* pages.
*
* also, note that we tell I/O that we are already VOP_LOCK'd so
* that uvn_io doesn't attempt to VOP_LOCK again.
*
* XXXCDC: setting VNISLOCKED on an active uvn which is being terminated
* due to a forceful unmount might not be a good idea. maybe we
* need a way to pass in this info to uvn_flush through a
* pager-defined PGO_ constant [currently there are none].
*/
uvn->u_flags |= UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED;
(void) uvn_flush(&uvn->u_obj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES);
/*
* as we just did a flush we expect all the pages to be gone or in
* the process of going. sleep to wait for the rest to go [via iosync].
*/
while (uvn->u_obj.uo_npages) {
#ifdef DEBUG
struct vm_page *pp;
RBT_FOREACH(pp, uvm_objtree, &uvn->u_obj.memt) {
if ((pp->pg_flags & PG_BUSY) == 0)
panic("uvm_vnp_terminate: detected unbusy pg");
}
if (uvn->u_nio == 0)
panic("uvm_vnp_terminate: no I/O to wait for?");
printf("uvm_vnp_terminate: waiting for I/O to fin.\n");
/*
* XXXCDC: this is unlikely to happen without async i/o so we
* put a printf in just to keep an eye on it.
*/
#endif
uvn->u_flags |= UVM_VNODE_IOSYNC;
rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM, "uvn_term",
INFSLP);
}
/*
* done. now we free the uvn if its reference count is zero
* (true if we are zapping a persisting uvn). however, if we are
* terminating a uvn with active mappings we let it live ... future
* calls down to the vnode layer will fail.
*/
oldflags = uvn->u_flags;
if (uvn->u_obj.uo_refs) {
/*
* uvn must live on in its dead-vnode state until all references
* are gone. restore flags. clear CANPERSIST state.
*/
uvn->u_flags &= ~(UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED|
UVM_VNODE_WANTED|UVM_VNODE_CANPERSIST);
} else {
/*
* free the uvn now. note that the vref reference is already
* gone [it is dropped when we enter the persist state].
*/
if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED)
panic("uvm_vnp_terminate: io sync wanted bit set"); if (uvn->u_flags & UVM_VNODE_WRITEABLE) { LIST_REMOVE(uvn, u_wlist);
}
uvn->u_flags = 0; /* uvn is history, clear all bits */
}
if (oldflags & UVM_VNODE_WANTED)
wakeup(uvn);
rw_exit(uobj->vmobjlock);
}
/*
* NOTE: currently we have to use VOP_READ/VOP_WRITE because they go
* through the buffer cache and allow I/O in any size. These VOPs use
* synchronous i/o. [vs. VOP_STRATEGY which can be async, but doesn't
* go through the buffer cache or allow I/O sizes larger than a
* block]. we will eventually want to change this.
*
* issues to consider:
* uvm provides the uvm_aiodesc structure for async i/o management.
* there are two tailq's in the uvm structure... one for pending async
* i/o and one for "done" async i/o. to do an async i/o one puts
* an aiodesc on the "pending" list (protected by splbio()), starts the
* i/o and returns VM_PAGER_PEND. when the i/o is done, we expect
* some sort of "i/o done" function to be called (at splbio(), interrupt
* time). this function should remove the aiodesc from the pending list
* and place it on the "done" list and wakeup the daemon. the daemon
* will run at normal spl() and will remove all items from the "done"
* list and call the "aiodone" hook for each done request (see uvm_pager.c).
* [in the old vm code, this was done by calling the "put" routine with
* null arguments which made the code harder to read and understand because
* you had one function ("put") doing two things.]
*
* so the current pager needs:
* int uvn_aiodone(struct uvm_aiodesc *)
*
* => return 0 (aio finished, free it). otherwise requeue for later collection.
* => called with pageq's locked by the daemon.
*
* general outline:
* - drop "u_nio" (this req is done!)
* - if (object->iosync && u_naio == 0) { wakeup &uvn->u_naio }
* - get "page" structures (atop?).
* - handle "wanted" pages
* don't forget to look at the "object" wanted flag in all cases.
*/
/*
* uvn_flush: flush pages out of a uvm object.
*
* => if PGO_CLEANIT is set, we may block (due to I/O). thus, a caller
* might want to unlock higher level resources (e.g. vm_map)
* before calling flush.
* => if PGO_CLEANIT is not set, then we will not block
* => if PGO_ALLPAGE is set, then all pages in the object are valid targets
* for flushing.
* => NOTE: we are allowed to lock the page queues, so the caller
* must not be holding the lock on them [e.g. pagedaemon had
* better not call us with the queues locked]
* => we return TRUE unless we encountered some sort of I/O error
*
* comment on "cleaning" object and PG_BUSY pages:
* this routine is holding the lock on the object. the only time
* that it can run into a PG_BUSY page that it does not own is if
* some other process has started I/O on the page (e.g. either
* a pagein, or a pageout). if the PG_BUSY page is being paged
* in, then it can not be dirty (!PG_CLEAN) because no one has
* had a chance to modify it yet. if the PG_BUSY page is being
* paged out then it means that someone else has already started
* cleaning the page for us (how nice!). in this case, if we
* have syncio specified, then after we make our pass through the
* object we need to wait for the other PG_BUSY pages to clear
* off (i.e. we need to do an iosync). also note that once a
* page is PG_BUSY it must stay in its object until it is un-busyed.
*/
boolean_t
uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{
struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
struct vm_page *pp, *ptmp;
struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
struct pglist dead;
int npages, result, lcv;
boolean_t retval, need_iosync, needs_clean;
voff_t curoff;
KASSERT(rw_write_held(uobj->vmobjlock));
TAILQ_INIT(&dead);
/* get init vals and determine how we are going to traverse object */
need_iosync = FALSE;
retval = TRUE; /* return value */
if (flags & PGO_ALLPAGES) {
start = 0;
stop = round_page(uvn->u_size);
} else {
start = trunc_page(start);
stop = MIN(round_page(stop), round_page(uvn->u_size));
}
/*
* PG_CLEANCHK: this bit is used by the pgo_mk_pcluster function as
* a _hint_ as to how up to date the PG_CLEAN bit is. if the hint
* is wrong it will only prevent us from clustering... it won't break
* anything. we clear all PG_CLEANCHK bits here, and pgo_mk_pcluster
* will set them as it syncs PG_CLEAN. This is only an issue if we
* are looking at non-inactive pages (because inactive page's PG_CLEAN
* bit is always up to date since there are no mappings).
* [borrowed PG_CLEANCHK idea from FreeBSD VM]
*/
if ((flags & PGO_CLEANIT) != 0) {
KASSERT(uobj->pgops->pgo_mk_pcluster != 0);
for (curoff = start ; curoff < stop; curoff += PAGE_SIZE) {
if ((pp = uvm_pagelookup(uobj, curoff)) != NULL)
atomic_clearbits_int(&pp->pg_flags,
PG_CLEANCHK);
}
}
ppsp = NULL; /* XXX: shut up gcc */
uvm_lock_pageq();
/* locked: both page queues */
for (curoff = start; curoff < stop; curoff += PAGE_SIZE) {
if ((pp = uvm_pagelookup(uobj, curoff)) == NULL)
continue;
/*
* handle case where we do not need to clean page (either
* because we are not cleaning or because the page is not dirty or
* is busy):
*
* NOTE: we are allowed to deactivate a non-wired active
* PG_BUSY page, but once a PG_BUSY page is on the inactive
* queue it must stay put until it is !PG_BUSY (so as not to
* confuse pagedaemon).
*/
if ((flags & PGO_CLEANIT) == 0 || (pp->pg_flags & PG_BUSY) != 0) {
needs_clean = FALSE;
if ((pp->pg_flags & PG_BUSY) != 0 &&
(flags & (PGO_CLEANIT|PGO_SYNCIO)) ==
(PGO_CLEANIT|PGO_SYNCIO))
need_iosync = TRUE;
} else {
/*
* freeing: nuke all mappings so we can sync
* PG_CLEAN bit with no race
*/
if ((pp->pg_flags & PG_CLEAN) != 0 &&
(flags & PGO_FREE) != 0 &&
(pp->pg_flags & PQ_ACTIVE) != 0)
pmap_page_protect(pp, PROT_NONE);
if ((pp->pg_flags & PG_CLEAN) != 0 &&
pmap_is_modified(pp))
atomic_clearbits_int(&pp->pg_flags, PG_CLEAN);
atomic_setbits_int(&pp->pg_flags, PG_CLEANCHK);
needs_clean = ((pp->pg_flags & PG_CLEAN) == 0);
}
/* if we don't need a clean, deactivate/free pages then cont. */
if (!needs_clean) {
if (flags & PGO_DEACTIVATE) {
if (pp->wire_count == 0) {
pmap_page_protect(pp, PROT_NONE);
uvm_pagedeactivate(pp);
}
} else if (flags & PGO_FREE) {
if (pp->pg_flags & PG_BUSY) {
uvm_unlock_pageq();
uvm_pagewait(pp, uobj->vmobjlock,
"uvn_flsh");
rw_enter(uobj->vmobjlock, RW_WRITE);
uvm_lock_pageq();
curoff -= PAGE_SIZE;
continue;
} else {
pmap_page_protect(pp, PROT_NONE);
/* removed page from object */
uvm_pageclean(pp);
TAILQ_INSERT_HEAD(&dead, pp, pageq);
}
}
continue;
}
/*
* pp points to a page in the object that we are
* working on. if it is !PG_CLEAN,!PG_BUSY and we asked
* for cleaning (PGO_CLEANIT). we clean it now.
*
* let uvm_pager_put attempt a clustered page out.
* note: locked: page queues.
*/
atomic_setbits_int(&pp->pg_flags, PG_BUSY);
UVM_PAGE_OWN(pp, "uvn_flush");
pmap_page_protect(pp, PROT_READ);
/* if we're async, free the page in aiodoned */
if ((flags & (PGO_FREE|PGO_SYNCIO)) == PGO_FREE)
atomic_setbits_int(&pp->pg_flags, PG_RELEASED);
ReTry:
ppsp = pps;
npages = sizeof(pps) / sizeof(struct vm_page *);
result = uvm_pager_put(uobj, pp, &ppsp, &npages,
flags | PGO_DOACTCLUST, start, stop);
/*
* if we did an async I/O it is remotely possible for the
* async i/o to complete and the page "pp" be freed or what
* not before we get a chance to relock the object. Therefore,
* we only touch it when it won't be freed, RELEASED took care
* of the rest.
*/
uvm_lock_pageq();
/*
* VM_PAGER_AGAIN: given the structure of this pager, this
* can only happen when we are doing async I/O and can't
* map the pages into kernel memory (pager_map) due to lack
* of vm space. if this happens we drop back to sync I/O.
*/
if (result == VM_PAGER_AGAIN) {
/*
* it is unlikely, but page could have been released
* we ignore this now and retry the I/O.
* we will detect and
* handle the released page after the syncio I/O
* completes.
*/
#ifdef DIAGNOSTIC
if (flags & PGO_SYNCIO)
panic("%s: PGO_SYNCIO return 'try again' error (impossible)", __func__);
#endif
flags |= PGO_SYNCIO;
if (flags & PGO_FREE)
atomic_clearbits_int(&pp->pg_flags,
PG_RELEASED);
goto ReTry;
}
/*
* the cleaning operation is now done. finish up. note that
* on error (!OK, !PEND) uvm_pager_put drops the cluster for us.
* if success (OK, PEND) then uvm_pager_put returns the cluster
* to us in ppsp/npages.
*/
/*
* for pending async i/o if we are not deactivating
* we can move on to the next page. aiodoned deals with
* the freeing case for us.
*/
if (result == VM_PAGER_PEND && (flags & PGO_DEACTIVATE) == 0)
continue;
/*
* need to look at each page of the I/O operation, and do what
* we gotta do.
*/
for (lcv = 0 ; lcv < npages; lcv++) {
ptmp = ppsp[lcv];
/*
* verify the page didn't get moved
*/
if (result == VM_PAGER_PEND && ptmp->uobject != uobj)
continue;
/*
* unbusy the page if I/O is done. note that for
* pending I/O it is possible that the I/O op
* finished
* (in which case the page is no longer busy).
*/
if (result != VM_PAGER_PEND) {
if (ptmp->pg_flags & PG_WANTED)
wakeup(ptmp);
atomic_clearbits_int(&ptmp->pg_flags,
PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(ptmp, NULL);
atomic_setbits_int(&ptmp->pg_flags,
PG_CLEAN|PG_CLEANCHK);
if ((flags & PGO_FREE) == 0)
pmap_clear_modify(ptmp);
}
/* dispose of page */
if (flags & PGO_DEACTIVATE) {
if (ptmp->wire_count == 0) {
pmap_page_protect(ptmp, PROT_NONE);
uvm_pagedeactivate(ptmp);
}
} else if (flags & PGO_FREE &&
result != VM_PAGER_PEND) {
if (result != VM_PAGER_OK) {
static struct timeval lasttime;
static const struct timeval interval =
{ 5, 0 };
if (ratecheck(&lasttime, &interval)) {
printf("%s: obj=%p, "
"offset=0x%llx. error "
"during pageout.\n",
__func__, pp->uobject,
(long long)pp->offset);
printf("%s: WARNING: "
"changes to page may be "
"lost!\n", __func__);
}
retval = FALSE;
}
pmap_page_protect(ptmp, PROT_NONE);
uvm_pageclean(ptmp);
TAILQ_INSERT_TAIL(&dead, ptmp, pageq);
}
} /* end of "lcv" for loop */
} /* end of "pp" for loop */
/* done with pagequeues: unlock */
uvm_unlock_pageq();
/* now wait for all I/O if required. */
if (need_iosync) {
while (uvn->u_nio != 0) {
uvn->u_flags |= UVM_VNODE_IOSYNC;
rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM,
"uvn_flush", INFSLP);
}
if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED)
wakeup(&uvn->u_flags);
uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED);
}
uvm_pglistfree(&dead);
return retval;
}
/*
* uvn_cluster
*
* we are about to do I/O in an object at offset. this function is called
* to establish a range of offsets around "offset" in which we can cluster
* I/O.
*/
void
uvn_cluster(struct uvm_object *uobj, voff_t offset, voff_t *loffset,
voff_t *hoffset)
{
struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
*loffset = offset;
if (*loffset >= uvn->u_size)
panic("uvn_cluster: offset out of range");
/*
* XXX: old pager claims we could use VOP_BMAP to get maxcontig value.
*/
*hoffset = *loffset + MAXBSIZE;
if (*hoffset > round_page(uvn->u_size)) /* past end? */
*hoffset = round_page(uvn->u_size);
return;
}
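/*
 * Illustrative sketch, not kernel code: the clustering window computed by
 * uvn_cluster() above -- at most MAXBSIZE worth of offsets starting at the
 * requested offset, clipped to the page-rounded size of the vnode.  Names
 * are placeholders.
 */
static void
cluster_range_sketch(long long offset, long long vnode_size,
    long long maxbsize, long long pagesize, long long *lo, long long *hi)
{
    long long rounded = ((vnode_size + pagesize - 1) / pagesize) * pagesize;

    *lo = offset;
    *hi = offset + maxbsize;
    if (*hi > rounded)
        *hi = rounded;          /* never cluster past the end of the object */
}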
/*
* uvn_put: flush page data to backing store.
*
* => prefer map unlocked (not required)
* => flags: PGO_SYNCIO -- use sync. I/O
* => note: caller must set PG_CLEAN and pmap_clear_modify (if needed)
* => XXX: currently we use VOP_READ/VOP_WRITE which are only sync.
* [thus we never do async i/o! see iodone comment]
*/
int
uvn_put(struct uvm_object *uobj, struct vm_page **pps, int npages, int flags)
{
int retval;
KASSERT(rw_write_held(uobj->vmobjlock));
retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE);
return retval;
}
/*
* uvn_get: get pages (synchronously) from backing store
*
* => prefer map unlocked (not required)
* => flags: PGO_ALLPAGES: get all of the pages
* PGO_LOCKED: fault data structures are locked
* => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
* => NOTE: caller must check for released pages!!
*/
int
uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
{
voff_t current_offset;
struct vm_page *ptmp;
int lcv, result, gotpages;
boolean_t done;
KASSERT(((flags & PGO_LOCKED) != 0 && rw_lock_held(uobj->vmobjlock)) ||
(flags & PGO_LOCKED) == 0);
/* step 1: handled the case where fault data structures are locked. */
if (flags & PGO_LOCKED) {
/*
* gotpages is the current number of pages we've gotten (which
* we pass back up to the caller via *npagesp).
*/
gotpages = 0;
/*
* step 1a: get pages that are already resident. only do this
* if the data structures are locked (i.e. the first time
* through).
*/
done = TRUE; /* be optimistic */
for (lcv = 0, current_offset = offset ; lcv < *npagesp ;
lcv++, current_offset += PAGE_SIZE) {
/* do we care about this page? if not, skip it */
if (pps[lcv] == PGO_DONTCARE)
continue;
/* lookup page */
ptmp = uvm_pagelookup(uobj, current_offset);
/* to be useful must get a non-busy, non-released pg */
if (ptmp == NULL ||
(ptmp->pg_flags & PG_BUSY) != 0) {
if (lcv == centeridx || (flags & PGO_ALLPAGES)
!= 0)
done = FALSE; /* need to do a wait or I/O! */
continue;
}
/*
* useful page: busy it and plug it in our
* result array
*/
atomic_setbits_int(&ptmp->pg_flags, PG_BUSY);
UVM_PAGE_OWN(ptmp, "uvn_get1");
pps[lcv] = ptmp;
gotpages++;
}
/*
* XXX: given the "advice", should we consider async read-ahead?
* XXX: fault currently does deactivate of pages behind us. is
* this good (other callers might not want that)?
*/
/*
* XXX: read-ahead currently handled by buffer cache (bread)
* level.
* XXX: no async i/o available.
* XXX: so we don't do anything now.
*/
/*
* step 1c: now we've either done everything needed or we need to
* unlock and do some waiting or I/O.
*/
*npagesp = gotpages; /* let caller know */
if (done)
return VM_PAGER_OK; /* bingo! */
else
return VM_PAGER_UNLOCK;
}
/*
* step 2: get non-resident or busy pages.
* data structures are unlocked.
*
* XXX: because we can't do async I/O at this level we get things
* page at a time (otherwise we'd chunk). the VOP_READ() will do
* async-read-ahead for us at a lower level.
*/
for (lcv = 0, current_offset = offset;
lcv < *npagesp ; lcv++, current_offset += PAGE_SIZE) {
/* skip over pages we've already gotten or don't want */
/* skip over pages we don't _have_ to get */
if (pps[lcv] != NULL || (lcv != centeridx &&
(flags & PGO_ALLPAGES) == 0))
continue;
/*
* we have yet to locate the current page (pps[lcv]). we first
* look for a page that is already at the current offset. if
* we find a page, we check to see if it is busy or released.
* if that is the case, then we sleep on the page until it is
* no longer busy or released and repeat the lookup. if the
* page we found is neither busy nor released, then we busy it
* (so we own it) and plug it into pps[lcv]. this breaks the
* following while loop and indicates we are ready to move on
* to the next page in the "lcv" loop above.
*
* if we exit the while loop with pps[lcv] still set to NULL,
* then it means that we allocated a new busy/fake/clean page
* ptmp in the object and we need to do I/O to fill in the data.
*/
while (pps[lcv] == NULL) { /* top of "pps" while loop */
/* look for a current page */
ptmp = uvm_pagelookup(uobj, current_offset);
/* nope? allocate one now (if we can) */
if (ptmp == NULL) {
ptmp = uvm_pagealloc(uobj, current_offset,
NULL, 0);
/* out of RAM? */
if (ptmp == NULL) {
uvm_wait("uvn_getpage");
/* goto top of pps while loop */
continue;
}
/*
* got new page ready for I/O. break pps
* while loop. pps[lcv] is still NULL.
*/
break;
}
/* page is there, see if we need to wait on it */
if ((ptmp->pg_flags & PG_BUSY) != 0) {
uvm_pagewait(ptmp, uobj->vmobjlock, "uvn_get");
rw_enter(uobj->vmobjlock, RW_WRITE);
continue; /* goto top of pps while loop */
}
/*
* if we get here then the page has become resident
* and unbusy between steps 1 and 2. we busy it
* now (so we own it) and set pps[lcv] (so that we
* exit the while loop).
*/
atomic_setbits_int(&ptmp->pg_flags, PG_BUSY);
UVM_PAGE_OWN(ptmp, "uvn_get2");
pps[lcv] = ptmp;
}
/*
* if we own a valid page at the correct offset, pps[lcv]
* will point to it. nothing more to do except go to the
* next page.
*/
if (pps[lcv])
continue; /* next lcv */
/*
* we have a "fake/busy/clean" page that we just allocated. do
* I/O to fill it with valid data.
*/
result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1,
PGO_SYNCIO|PGO_NOWAIT, UIO_READ);
/*
* I/O done. because we used syncio the result can not be
* PEND or AGAIN.
*/
if (result != VM_PAGER_OK) {
if (ptmp->pg_flags & PG_WANTED)
wakeup(ptmp);
atomic_clearbits_int(&ptmp->pg_flags,
PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(ptmp, NULL);
uvm_lock_pageq();
uvm_pagefree(ptmp);
uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
return result;
}
/*
* we got the page! clear the fake flag (indicates valid
* data now in page) and plug into our result array. note
* that page is still busy.
*
* it is the callers job to:
* => check if the page is released
* => unbusy the page
* => activate the page
*/
/* data is valid ... */
atomic_clearbits_int(&ptmp->pg_flags, PG_FAKE);
pmap_clear_modify(ptmp); /* ... and clean */
pps[lcv] = ptmp;
}
rw_exit(uobj->vmobjlock);
return (VM_PAGER_OK);
}
/*
* uvn_io: do I/O to a vnode
*
* => prefer map unlocked (not required)
* => flags: PGO_SYNCIO -- use sync. I/O
* => XXX: currently we use VOP_READ/VOP_WRITE which are only sync.
* [thus we never do async i/o! see iodone comment]
*/
int
uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw)
{
struct uvm_object *uobj = &uvn->u_obj;
struct vnode *vn;
struct uio uio;
struct iovec iov;
vaddr_t kva;
off_t file_offset;
int waitf, result, mapinflags;
size_t got, wanted;
int netunlocked = 0;
int lkflags = (flags & PGO_NOWAIT) ? LK_NOWAIT : 0;
KASSERT(rw_write_held(uobj->vmobjlock));
/* init values */
waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
vn = uvn->u_vnode;
file_offset = pps[0]->offset;
/* check for sync'ing I/O. */
while (uvn->u_flags & UVM_VNODE_IOSYNC) {
if (waitf == M_NOWAIT) {
return VM_PAGER_AGAIN;
}
uvn->u_flags |= UVM_VNODE_IOSYNCWANTED;
rwsleep_nsec(&uvn->u_flags, uobj->vmobjlock, PVM, "uvn_iosync",
INFSLP);
}
/* check size */
if (file_offset >= uvn->u_size) {
return VM_PAGER_BAD;
}
/* first try and map the pages in (without waiting) */
mapinflags = (rw == UIO_READ) ?
UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
kva = uvm_pagermapin(pps, npages, mapinflags);
if (kva == 0 && waitf == M_NOWAIT) {
return VM_PAGER_AGAIN;
}
/*
* ok, now bump u_nio up. at this point we are done with uvn
* and can unlock it. if we still don't have a kva, try again
* (this time with sleep ok).
*/
uvn->u_nio++; /* we have an I/O in progress! */
rw_exit(uobj->vmobjlock);
if (kva == 0)
kva = uvm_pagermapin(pps, npages,
mapinflags | UVMPAGER_MAPIN_WAITOK);
/*
* ok, mapped in. our pages are PG_BUSY so they are not going to
* get touched (so we can look at "offset" without having to lock
* the object). set up for I/O.
*/
/* fill out uio/iov */
iov.iov_base = (caddr_t) kva;
wanted = (size_t)npages << PAGE_SHIFT;
if (file_offset + wanted > uvn->u_size)
wanted = uvn->u_size - file_offset; /* XXX: needed? */
iov.iov_len = wanted;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = file_offset;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = rw;
uio.uio_resid = wanted;
uio.uio_procp = curproc;
/*
* This process may already have the NET_LOCK(), if we
* faulted in copyin() or copyout() in the network stack.
*/
if (rw_status(&netlock) == RW_WRITE) {
NET_UNLOCK();
netunlocked = 1;
}
/* do the I/O! (XXX: curproc?) */
/*
* This process may already have this vnode locked, if we faulted in
* copyin() or copyout() on a region backed by this vnode
* while doing I/O to the vnode. If this is the case, don't
* panic.. instead, return the error to the user.
*
* XXX this is a stopgap to prevent a panic.
* Ideally, this kind of operation *should* work.
*/
result = 0;
KERNEL_LOCK();
if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL | lkflags);
if (result == 0) {
/* NOTE: vnode now locked! */
if (rw == UIO_READ)
result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
else
result = VOP_WRITE(vn, &uio,
(flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
curproc->p_ucred);
if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
VOP_UNLOCK(vn);
}
KERNEL_UNLOCK();
if (netunlocked)
NET_LOCK();
/* NOTE: vnode now unlocked (unless vnislocked) */
/*
* result == unix style errno (0 == OK!)
*
* zero out rest of buffer (if needed)
*/
if (result == 0) {
got = wanted - uio.uio_resid;
if (wanted && got == 0) {
result = EIO; /* XXX: error? */
} else if (got < PAGE_SIZE * npages && rw == UIO_READ) {
memset((void *) (kva + got), 0,
((size_t)npages << PAGE_SHIFT) - got);
}
}
/* now remove pager mapping */
uvm_pagermapout(kva, npages);
/* now clean up the object (i.e. drop I/O count) */
rw_enter(uobj->vmobjlock, RW_WRITE);
uvn->u_nio--; /* I/O DONE! */
if ((uvn->u_flags & UVM_VNODE_IOSYNC) != 0 && uvn->u_nio == 0) {
wakeup(&uvn->u_nio);
}
if (result == 0) {
return VM_PAGER_OK;
} else if (result == EBUSY) {
KASSERT(flags & PGO_NOWAIT);
return VM_PAGER_AGAIN;
} else {
if (rebooting) {
KERNEL_LOCK();
while (rebooting)
tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP);
KERNEL_UNLOCK();
}
return VM_PAGER_ERROR;
}
}
/*
* uvm_vnp_uncache: disable "persisting" in a vnode... when last reference
* is gone we will kill the object (flushing dirty pages back to the vnode
* if needed).
*
* => returns TRUE if there was no uvm_object attached or if there was
* one and we killed it [i.e. if there is no active uvn]
* => called with the vnode VOP_LOCK'd [we will unlock it for I/O, if
* needed]
*
* => XXX: given that we now kill uvn's when a vnode is recycled (without
* having to hold a reference on the vnode) and given a working
* uvm_vnp_sync(), how does that affect the need for this function?
* [XXXCDC: seems like it can die?]
*
* => XXX: this function should DIE once we merge the VM and buffer
* cache.
*
* research shows that this is called in the following places:
* ext2fs_truncate, ffs_truncate, detrunc[msdosfs]: called when vnode
* changes sizes
* ext2fs_write, WRITE [ufs_readwrite], msdosfs_write: called when we
* are written to
* ext2fs_chmod, ufs_chmod: called if VTEXT vnode and the sticky bit
* is off
* ffs_realloccg: when we can't extend the current block and have
* to allocate a new one we call this [XXX: why?]
* nfsrv_rename, rename_files: called when the target filename is there
* and we want to remove it
* nfsrv_remove, sys_unlink: called on file we are removing
* nfsrv_access: if VTEXT and we want WRITE access and we don't uncache
* then return "text busy"
* nfs_open: seems to uncache any file opened with nfs
* vn_writechk: if VTEXT vnode and can't uncache return "text busy"
* fusefs_open: uncaches any file that is opened
* fusefs_write: uncaches on every write
*/
int
uvm_vnp_uncache(struct vnode *vp)
{
struct uvm_vnode *uvn = vp->v_uvm;
struct uvm_object *uobj = &uvn->u_obj;
/* lock uvn part of the vnode and check if we need to do anything */
rw_enter(uobj->vmobjlock, RW_WRITE);
if ((uvn->u_flags & UVM_VNODE_VALID) == 0 ||
(uvn->u_flags & UVM_VNODE_BLOCKED) != 0) {
rw_exit(uobj->vmobjlock);
return TRUE;
}
/*
* we have a valid, non-blocked uvn. clear persist flag.
* if uvn is currently active we can return now.
*/
uvn->u_flags &= ~UVM_VNODE_CANPERSIST;
if (uvn->u_obj.uo_refs) {
rw_exit(uobj->vmobjlock);
return FALSE;
}
/*
* uvn is currently persisting! we have to gain a reference to
* it so that we can call uvn_detach to kill the uvn.
*/
vref(vp); /* seems ok, even with VOP_LOCK */
uvn->u_obj.uo_refs++; /* value is now 1 */
rw_exit(uobj->vmobjlock);
#ifdef VFSLCKDEBUG
/*
* carry over sanity check from old vnode pager: the vnode should
* be VOP_LOCK'd, and we confirm it here.
*/
if ((vp->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp))
panic("uvm_vnp_uncache: vnode not locked!");
#endif
/*
* now drop our reference to the vnode. if we have the sole
* reference to the vnode then this will cause it to die [as we
* just cleared the persist flag]. we have to unlock the vnode
* while we are doing this as it may trigger I/O.
*
* XXX: it might be possible for uvn to get reclaimed while we are
* unlocked causing us to return TRUE when we should not. we ignore
* this as a false-positive return value doesn't hurt us.
*/
VOP_UNLOCK(vp);
uvn_detach(&uvn->u_obj);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return TRUE;
}
/*
* uvm_vnp_setsize: grow or shrink a vnode uvn
*
* grow => just update size value
* shrink => toss un-needed pages
*
* => we assume that the caller has a reference of some sort to the
* vnode in question so that it will not be yanked out from under
* us.
*
* called from:
* => truncate fns (ext2fs_truncate, ffs_truncate, detrunc[msdos],
* fusefs_setattr)
* => "write" fns (ext2fs_write, WRITE [ufs/ufs], msdosfs_write, nfs_write
* fusefs_write)
* => ffs_balloc [XXX: why? doesn't WRITE handle?]
* => NFS: nfs_loadattrcache, nfs_getattrcache, nfs_setattr
* => union fs: union_newsize
*/
void
uvm_vnp_setsize(struct vnode *vp, off_t newsize)
{
struct uvm_vnode *uvn = vp->v_uvm;
struct uvm_object *uobj = &uvn->u_obj;
KERNEL_ASSERT_LOCKED();
rw_enter(uobj->vmobjlock, RW_WRITE);
/* lock uvn and check for valid object, and if valid: do it! */
if (uvn->u_flags & UVM_VNODE_VALID) {
/*
* now check if the size has changed: if we shrink we had better
* toss some pages...
*/
if (uvn->u_size > newsize) {
(void)uvn_flush(&uvn->u_obj, newsize,
uvn->u_size, PGO_FREE);
}
uvn->u_size = newsize;
}
rw_exit(uobj->vmobjlock);
}
/*
* uvm_vnp_sync: flush all dirty VM pages back to their backing vnodes.
*
* => called from sys_sync with no VM structures locked
* => only one process can do a sync at a time (because the uvn
* structure only has one queue for sync'ing). we ensure this
* by holding the uvn_sync_lock while the sync is in progress.
* other processes attempting a sync will sleep on this lock
* until we are done.
*/
void
uvm_vnp_sync(struct mount *mp)
{
struct uvm_vnode *uvn;
struct vnode *vp;
/*
* step 1: ensure we are only ones using the uvn_sync_q by locking
* our lock...
*/
rw_enter_write(&uvn_sync_lock);
/*
* step 2: build up a simpleq of uvns of interest based on the
* write list. we gain a reference to uvns of interest.
*/
SIMPLEQ_INIT(&uvn_sync_q);
LIST_FOREACH(uvn, &uvn_wlist, u_wlist) {
vp = uvn->u_vnode;
if (mp && vp->v_mount != mp)
continue;
/*
* If the vnode is "blocked" it means it must be dying, which
* in turn means it's in the process of being flushed out so
* we can safely skip it.
*
* note that uvn must already be valid because we found it on
* the wlist (this also means it can't be ALOCK'd).
*/
if ((uvn->u_flags & UVM_VNODE_BLOCKED) != 0)
continue;
/*
* gain reference. watch out for persisting uvns (need to
* regain vnode REF).
*/
if (uvn->u_obj.uo_refs == 0)
vref(vp);
uvn->u_obj.uo_refs++;
SIMPLEQ_INSERT_HEAD(&uvn_sync_q, uvn, u_syncq);
}
/* step 3: we now have a list of uvn's that may need cleaning. */
SIMPLEQ_FOREACH(uvn, &uvn_sync_q, u_syncq) {
rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
#ifdef DEBUG
if (uvn->u_flags & UVM_VNODE_DYING) {
printf("uvm_vnp_sync: dying vnode on sync list\n");
}
#endif
uvn_flush(&uvn->u_obj, 0, 0, PGO_CLEANIT|PGO_ALLPAGES|PGO_DOACTCLUST);
/*
* if we have the only reference and we just cleaned the uvn,
* then we can pull it out of the UVM_VNODE_WRITEABLE state
* thus allowing us to avoid thinking about flushing it again
* on later sync ops.
*/
if (uvn->u_obj.uo_refs == 1 &&
(uvn->u_flags & UVM_VNODE_WRITEABLE)) {
LIST_REMOVE(uvn, u_wlist);
uvn->u_flags &= ~UVM_VNODE_WRITEABLE;
}
rw_exit(uvn->u_obj.vmobjlock);
/* now drop our reference to the uvn */
uvn_detach(&uvn->u_obj);
}
rw_exit_write(&uvn_sync_lock);
}
/* $OpenBSD: subr_xxx.c,v 1.17 2019/05/17 03:53:08 visa Exp $ */
/* $NetBSD: subr_xxx.c,v 1.10 1996/02/04 02:16:51 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93
*/
/*
* Miscellaneous trivial functions, including many
* that are often inline-expanded or done in assembler.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/smr.h>
/*
* Unsupported device function (e.g. writing to read-only device).
*/
int
enodev(void)
{
return (ENODEV);
}
/*
* Unconfigured device function; driver not configured.
*/
int
enxio(void)
{
return (ENXIO);
}
/*
* Unsupported ioctl function.
*/
int
enoioctl(void)
{
return (ENOTTY);
}
/*
* Unsupported system function.
* This is used for an otherwise-reasonable operation
* that is not supported by the current system binary.
*/
int
enosys(void)
{
return (ENOSYS);
}
/*
* Return error for operation not supported
* on a specific object or file type.
*/
int
eopnotsupp(void *v)
{
return (EOPNOTSUPP);
}
/*
* Generic null operation, always returns success.
*/
int
nullop(void *v)
{
return (0);
}
struct bdevsw *
bdevsw_lookup(dev_t dev)
{
return (&bdevsw[major(dev)]);
}
struct cdevsw *
cdevsw_lookup(dev_t dev)
{
return (&cdevsw[major(dev)]);
}
/*
* Convert a character device number to a block device number.
*/
dev_t
chrtoblk(dev_t dev)
{
int blkmaj;
if (major(dev) >= nchrdev || major(dev) >= nchrtoblktbl)
return (NODEV);
blkmaj = chrtoblktbl[major(dev)];
if (blkmaj == NODEV)
return (NODEV);
return (makedev(blkmaj, minor(dev)));
}
/*
* Convert a block device number to a character device number.
*/
dev_t
blktochr(dev_t dev)
{
int blkmaj = major(dev);
int i;
if (blkmaj >= nblkdev)
return (NODEV);
for (i = 0; i < nchrtoblktbl; i++)
if (blkmaj == chrtoblktbl[i])
return (makedev(i, minor(dev)));
return (NODEV);
}
/*
* Check that we're in a context where it's okay to sleep.
*/
void
assertwaitok(void)
{
	if (panicstr || db_active)
		return;

	splassert(IPL_NONE);
	SMR_ASSERT_NONCRITICAL();
#ifdef DIAGNOSTIC
if (curcpu()->ci_mutex_level != 0)
panic("assertwaitok: non-zero mutex count: %d",
curcpu()->ci_mutex_level);
#endif
}
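
/*
 * Editor's hedged example (not part of the original file): code that is
 * about to block typically asserts a sleepable context first.  The function
 * name is hypothetical and the sleeping work is elided.
 */
void
example_sleepable_path(void)
{
	assertwaitok();
	/* ... potentially sleeping work, e.g. an M_WAITOK allocation ... */
}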
/* $OpenBSD: ufs_ihash.c,v 1.26 2021/10/19 06:11:45 semarie Exp $ */
/* $NetBSD: ufs_ihash.c,v 1.3 1996/02/09 22:36:04 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_ihash.c 8.4 (Berkeley) 12/30/93
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <crypto/siphash.h>
/*
* Structures associated with inode caching.
*/
LIST_HEAD(ihashhead, inode) *ihashtbl;
u_long ihash; /* size of hash table - 1 */
SIPHASH_KEY ihashkey;
struct ihashhead *ufs_ihash(dev_t, ufsino_t);
#define INOHASH(device, inum) ufs_ihash((device), (inum))
struct ihashhead *
ufs_ihash(dev_t dev, ufsino_t inum)
{
SIPHASH_CTX ctx;
SipHash24_Init(&ctx, &ihashkey);
SipHash24_Update(&ctx, &dev, sizeof(dev));
SipHash24_Update(&ctx, &inum, sizeof(inum));
return (&ihashtbl[SipHash24_End(&ctx) & ihash]);
}
/*
* Initialize inode hash table.
*/
void
ufs_ihashinit(void)
{
ihashtbl = hashinit(initialvnodes, M_UFSMNT, M_WAITOK, &ihash);
arc4random_buf(&ihashkey, sizeof(ihashkey));
}
/*
* Use the device/inum pair to find the incore inode, and return a pointer
* to it. If it is in core, return it, even if it is locked.
*/
struct vnode *
ufs_ihashlookup(dev_t dev, ufsino_t inum)
{
struct inode *ip;
struct ihashhead *ipp;
/* XXXLOCKING lock hash list */
ipp = INOHASH(dev, inum);
LIST_FOREACH(ip, ipp, i_hash) {
if (inum == ip->i_number && dev == ip->i_dev)
break;
}
/* XXXLOCKING unlock hash list? */
if (ip)
return (ITOV(ip));
return (NULLVP);
}
/*
* Use the device/inum pair to find the incore inode, and return a pointer
* to it. If it is in core, but locked, wait for it.
*/
struct vnode *
ufs_ihashget(dev_t dev, ufsino_t inum)
{
struct ihashhead *ipp;
struct inode *ip;
struct vnode *vp;
loop:
/* XXXLOCKING lock hash list */
ipp = INOHASH(dev, inum);
	LIST_FOREACH(ip, ipp, i_hash) {
		if (inum == ip->i_number && dev == ip->i_dev) {
vp = ITOV(ip);
/* XXXLOCKING unlock hash list? */
if (vget(vp, LK_EXCLUSIVE))
goto loop;
return (vp);
}
}
/* XXXLOCKING unlock hash list? */
return (NULL);
}
/*
* Insert the inode into the hash table, and return it locked.
*/
int
ufs_ihashins(struct inode *ip)
{
struct inode *curip;
struct ihashhead *ipp;
dev_t dev = ip->i_dev;
ufsino_t inum = ip->i_number;
/* lock the inode, then put it on the appropriate hash list */
VOP_LOCK(ITOV(ip), LK_EXCLUSIVE);
/* XXXLOCKING lock hash list */
ipp = INOHASH(dev, inum);
	LIST_FOREACH(curip, ipp, i_hash) {
		if (inum == curip->i_number && dev == curip->i_dev) {
/* XXXLOCKING unlock hash list? */
VOP_UNLOCK(ITOV(ip));
return (EEXIST);
}
}
SET(ip->i_flag, IN_HASHED);
LIST_INSERT_HEAD(ipp, ip, i_hash);
/* XXXLOCKING unlock hash list? */
return (0);
}
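
/*
 * Editor's hedged sketch (not part of the original file): a vget-style
 * routine typically pairs ufs_ihashget() and ufs_ihashins(), handling the
 * race where another thread publishes the same inode first.  The function
 * name is hypothetical; allocating `ip' is the caller's responsibility.
 */
int
example_publish_inode(struct inode *ip, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	/* fast path: the inode may already be in core */
	if ((vp = ufs_ihashget(ip->i_dev, ip->i_number)) != NULL) {
		*vpp = vp;
		return (0);
	}

	/* slow path: insert our freshly allocated inode (returned locked) */
	error = ufs_ihashins(ip);
	if (error == 0)
		*vpp = ITOV(ip);
	return (error);		/* EEXIST means we lost the insert race */
}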
/*
* Remove the inode from the hash table.
*/
void
ufs_ihashrem(struct inode *ip)
{
/* XXXLOCKING lock hash list */
if (ip->i_hash.le_prev == NULL)
return;
	if (ISSET(ip->i_flag, IN_HASHED)) {
		LIST_REMOVE(ip, i_hash);
CLR(ip->i_flag, IN_HASHED);
}
#ifdef DIAGNOSTIC
ip->i_hash.le_next = NULL;
ip->i_hash.le_prev = NULL;
#endif
/* XXXLOCKING unlock hash list? */
}
/* $OpenBSD: strlcat.c,v 1.9 2019/01/25 00:19:26 millert Exp $ */
/*
* Copyright (c) 1998, 2015 Todd C. Miller <millert@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <lib/libkern/libkern.h>
/*
* Appends src to string dst of size dsize (unlike strncat, dsize is the
* full size of dst, not space left). At most dsize-1 characters
* will be copied. Always NUL terminates (unless dsize <= strlen(dst)).
* Returns strlen(src) + MIN(dsize, strlen(initial dst)).
 * If retval >= dsize, truncation occurred.
*/
size_t
strlcat(char *dst, const char *src, size_t dsize)
{
const char *odst = dst;
const char *osrc = src;
size_t n = dsize;
size_t dlen;
/* Find the end of dst and adjust bytes left but don't go past end. */
while (n-- != 0 && *dst != '\0')
dst++;
dlen = dst - odst;
n = dsize - dlen;
if (n-- == 0)
		return(dlen + strlen(src));
	while (*src != '\0') {
		if (n != 0) {
			*dst++ = *src;
n--;
}
src++;
}
*dst = '\0';
return(dlen + (src - osrc)); /* count does not include NUL */
}
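
/*
 * Editor's hedged usage sketch (not part of the original file): callers
 * detect truncation by comparing the return value against the buffer size,
 * as described above.  The function name is hypothetical.
 */
int
example_join_path(char *buf, size_t bufsize, const char *dir, const char *file)
{
	if (strlcpy(buf, dir, bufsize) >= bufsize)
		return (1);		/* truncated */
	if (strlcat(buf, "/", bufsize) >= bufsize ||
	    strlcat(buf, file, bufsize) >= bufsize)
		return (1);		/* truncated */
	return (0);			/* fits and is NUL terminated */
}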
/* $OpenBSD: syscall_mi.h,v 1.26 2022/06/29 12:06:11 jca Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93
*/
#include <sys/param.h>
#include <sys/pledge.h>
#include <sys/tracepoint.h>
#include <uvm/uvm_extern.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include "dt.h"
#if NDT > 0
#include <dev/dt/dtvar.h>
#endif
/*
* The MD setup for a system call has been done; here's the MI part.
*/
static inline int
mi_syscall(struct proc *p, register_t code, const struct sysent *callp,
register_t *argp, register_t retval[2])
{
uint64_t tval;
int lock = !(callp->sy_flags & SY_NOLOCK);
int error, pledged;
/* refresh the thread's cache of the process's creds */
refreshcreds(p);
#ifdef SYSCALL_DEBUG
KERNEL_LOCK();
scdebug_call(p, code, argp);
KERNEL_UNLOCK();
#endif
TRACEPOINT(raw_syscalls, sys_enter, code, NULL);
#if NDT > 0
DT_ENTER(syscall, code, callp->sy_argsize, argp);
#endif
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL)) {
		KERNEL_LOCK();
ktrsyscall(p, code, callp->sy_argsize, argp);
KERNEL_UNLOCK();
}
#endif
/* SP must be within MAP_STACK space */
if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
"[%s]%d/%d sp=%lx inside %lx-%lx: not MAP_STACK\n",
uvm_map_inentry_sp, p->p_vmspace->vm_map.sserial))
return (EPERM);
/* PC must be in un-writeable permitted text (sigtramp, libc, ld.so) */
if (!uvm_map_inentry(p, &p->p_pcinentry, PROC_PC(p),
"[%s]%d/%d pc=%lx inside %lx-%lx: bogus syscall\n",
uvm_map_inentry_pc, p->p_vmspace->vm_map.wserial))
return (EPERM);
pledged = (p->p_p->ps_flags & PS_PLEDGE);
	if (pledged && (error = pledge_syscall(p, code, &tval))) {
		KERNEL_LOCK();
error = pledge_fail(p, error, tval);
KERNEL_UNLOCK();
return (error);
}
if (lock)
		KERNEL_LOCK();

	error = (*callp->sy_call)(p, argp, retval);
if (lock)
KERNEL_UNLOCK();
return (error);
}
/*
* Finish MI stuff on return, after the registers have been set
*/
static inline void
mi_syscall_return(struct proc *p, register_t code, int error,
const register_t retval[2])
{
#ifdef SYSCALL_DEBUG
KERNEL_LOCK();
scdebug_ret(p, code, error, retval);
KERNEL_UNLOCK();
#endif
#if NDT > 0
DT_LEAVE(syscall, code, error, retval[0], retval[1]);
#endif
TRACEPOINT(raw_syscalls, sys_exit, code, NULL);
userret(p);
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		KERNEL_LOCK();
ktrsysret(p, code, error, retval);
KERNEL_UNLOCK();
}
#endif
}
/*
* Finish MI stuff for a new process/thread to return
*/
static inline void
mi_child_return(struct proc *p)
{
#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0
int code = (p->p_flag & P_THREAD) ? SYS___tfork :
(p->p_p->ps_flags & PS_PPWAIT) ? SYS_vfork : SYS_fork;
const register_t child_retval[2] = { 0, 1 };
#endif
TRACEPOINT(sched, on__cpu, NULL);
#ifdef SYSCALL_DEBUG
KERNEL_LOCK();
scdebug_ret(p, code, 0, child_retval);
KERNEL_UNLOCK();
#endif
#if NDT > 0
DT_LEAVE(syscall, code, 0, child_retval[0], child_retval[1]);
#endif
TRACEPOINT(raw_syscalls, sys_exit, code, NULL);
userret(p);
#ifdef KTRACE
if (KTRPOINT(p, KTR_SYSRET)) {
KERNEL_LOCK();
ktrsysret(p, code, 0, child_retval);
KERNEL_UNLOCK();
}
#endif
}
/*
* Do the specific processing necessary for an AST
*/
static inline void
mi_ast(struct proc *p, int resched)
{
	if (p->p_flag & P_OWEUPC) {
		KERNEL_LOCK();
ADDUPROF(p);
KERNEL_UNLOCK();
}
	if (resched)
		preempt();
/*
* XXX could move call to userret() here, but
* hppa calls ast() in syscall return and sh calls
* it after userret()
*/
}
/* $OpenBSD: uvm_map.h,v 1.75 2022/03/12 08:11:07 mpi Exp $ */
/* $NetBSD: uvm_map.h,v 1.24 2001/02/18 21:19:08 chs Exp $ */
/*
* Copyright (c) 2011 Ariane van der Steldt <ariane@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_map.h 8.3 (Berkeley) 3/15/94
* from: Id: uvm_map.h,v 1.1.2.3 1998/02/07 01:16:55 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _UVM_UVM_MAP_H_
#define _UVM_UVM_MAP_H_
#include <sys/mutex.h>
#include <sys/rwlock.h>
#ifdef _KERNEL
/*
* UVM_MAP_CLIP_START: ensure that the entry begins at or after
* the starting address, if it doesn't we split the entry.
*
* => map must be locked by caller
*/
#define UVM_MAP_CLIP_START(_map, _entry, _addr) \
do { \
KASSERT((_entry)->end + (_entry)->fspace > (_addr)); \
if ((_entry)->start < (_addr)) \
uvm_map_clip_start((_map), (_entry), (_addr)); \
} while (0)
/*
* UVM_MAP_CLIP_END: ensure that the entry ends at or before
* the ending address, if it doesn't we split the entry.
*
* => map must be locked by caller
*/
#define UVM_MAP_CLIP_END(_map, _entry, _addr) \
do { \
KASSERT((_entry)->start < (_addr)); \
if ((_entry)->end > (_addr)) \
uvm_map_clip_end((_map), (_entry), (_addr)); \
} while (0)
/*
* extract flags
*/
#define UVM_EXTRACT_FIXPROT 0x8 /* set prot to maxprot as we go */
#endif /* _KERNEL */
#include <uvm/uvm_anon.h>
/*
* Address map entries consist of start and end addresses,
* a VM object (or sharing map) and offset into that object,
* and user-exported inheritance and protection information.
* Also included is control information for virtual copy operations.
*/
struct vm_map_entry {
union {
RBT_ENTRY(vm_map_entry) addr_entry; /* address tree */
SLIST_ENTRY(vm_map_entry) addr_kentry;
} daddrs;
union {
RBT_ENTRY(vm_map_entry) rbtree; /* Link freespace tree. */
TAILQ_ENTRY(vm_map_entry) tailq;/* Link freespace queue. */
TAILQ_ENTRY(vm_map_entry) deadq;/* dead entry queue */
} dfree;
#define uvm_map_entry_start_copy start
vaddr_t start; /* start address */
vaddr_t end; /* end address */
vsize_t guard; /* bytes in guard */
vsize_t fspace; /* free space */
union {
struct uvm_object *uvm_obj; /* uvm object */
struct vm_map *sub_map; /* belongs to another map */
} object; /* object I point to */
voff_t offset; /* offset into object */
struct vm_aref aref; /* anonymous overlay */
int etype; /* entry type */
vm_prot_t protection; /* protection code */
vm_prot_t max_protection; /* maximum protection */
vm_inherit_t inheritance; /* inheritance */
int wired_count; /* can be paged if == 0 */
int advice; /* madvise advice */
#define uvm_map_entry_stop_copy flags
u_int8_t flags; /* flags */
#define UVM_MAP_STATIC 0x01 /* static map entry */
#define UVM_MAP_KMEM 0x02 /* from kmem entry pool */
vsize_t fspace_augment; /* max(fspace) in subtree */
};
#define VM_MAPENT_ISWIRED(entry) ((entry)->wired_count != 0)
TAILQ_HEAD(uvm_map_deadq, vm_map_entry); /* dead entry queue */
RBT_HEAD(uvm_map_addr, vm_map_entry);
#ifdef _KERNEL
RBT_PROTOTYPE(uvm_map_addr, vm_map_entry, daddrs.addr_entry,
uvm_mapentry_addrcmp);
#endif
/*
* A Map is a rbtree of map entries, kept sorted by address.
* In addition, free space entries are also kept in a rbtree,
* indexed by free size.
*
*
*
* LOCKING PROTOCOL NOTES:
* -----------------------
*
* VM map locking is a little complicated. There are both shared
* and exclusive locks on maps. However, it is sometimes required
* to downgrade an exclusive lock to a shared lock, and upgrade to
* an exclusive lock again (to perform error recovery). However,
* another thread *must not* queue itself to receive an exclusive
 * lock before we upgrade back to exclusive, otherwise the
* error recovery becomes extremely difficult, if not impossible.
*
* In order to prevent this scenario, we introduce the notion of
* a `busy' map. A `busy' map is read-locked, but other threads
* attempting to write-lock wait for this flag to clear before
* entering the lock manager. A map may only be marked busy
* when the map is write-locked (and then the map must be downgraded
* to read-locked), and may only be marked unbusy by the thread
* which marked it busy (holding *either* a read-lock or a
* write-lock, the latter being gained by an upgrade).
*
* Access to the map `flags' member is controlled by the `flags_lock'
* simple lock. Note that some flags are static (set once at map
* creation time, and never changed), and thus require no locking
* to check those flags. All flags which are r/w must be set or
* cleared while the `flags_lock' is asserted. Additional locking
* requirements are:
*
* VM_MAP_PAGEABLE r/o static flag; no locking required
*
* VM_MAP_INTRSAFE r/o static flag; no locking required
*
* VM_MAP_WIREFUTURE r/w; may only be set or cleared when
* map is write-locked. may be tested
* without asserting `flags_lock'.
*
* VM_MAP_BUSY r/w; may only be set when map is
* write-locked, may only be cleared by
* thread which set it, map read-locked
* or write-locked. must be tested
* while `flags_lock' is asserted.
*
* VM_MAP_WANTLOCK r/w; may only be set when the map
* is busy, and thread is attempting
* to write-lock. must be tested
* while `flags_lock' is asserted.
*
* VM_MAP_GUARDPAGES r/o; must be specified at map
* initialization time.
* If set, guards will appear between
* automatic allocations.
* No locking required.
*
* VM_MAP_ISVMSPACE r/o; set by uvmspace_alloc.
* Signifies that this map is a vmspace.
* (The implementation treats all maps
* without this bit as kernel maps.)
* No locking required.
*
*
* All automatic allocations (uvm_map without MAP_FIXED) will allocate
* from vm_map.free.
* If that allocation fails:
* - vmspace maps will spill over into vm_map.bfree,
* - all other maps will call uvm_map_kmem_grow() to increase the arena.
*
* vmspace maps have their data, brk() and stack arenas automatically
* updated when uvm_map() is invoked without MAP_FIXED.
* The spill over arena (vm_map.bfree) will contain the space in the brk()
* and stack ranges.
* Kernel maps never have a bfree arena and this tree will always be empty.
*
*
* read_locks and write_locks are used in lock debugging code.
*
* Locks used to protect struct members in this file:
* a atomic operations
* I immutable after creation or exec(2)
* v `vm_map_lock' (this map `lock' or `mtx')
*/
struct vm_map {
struct pmap *pmap; /* [I] Physical map */
struct rwlock lock; /* Non-intrsafe lock */
struct mutex mtx; /* Intrsafe lock */
u_long sserial; /* [v] # stack changes */
u_long wserial; /* [v] # PROT_WRITE increases */
struct uvm_map_addr addr; /* [v] Entry tree, by addr */
vsize_t size; /* virtual size */
int ref_count; /* [a] Reference count */
int flags; /* flags */
struct mutex flags_lock; /* flags lock */
unsigned int timestamp; /* Version number */
vaddr_t min_offset; /* [I] First address in map. */
vaddr_t max_offset; /* [I] Last address in map. */
/*
* Allocation overflow regions.
*/
vaddr_t b_start; /* [v] Start for brk() alloc. */
vaddr_t b_end; /* [v] End for brk() alloc. */
vaddr_t s_start; /* [v] Start for stack alloc. */
vaddr_t s_end; /* [v] End for stack alloc. */
/*
* Special address selectors.
*
* The uaddr_exe mapping is used if:
* - protX is selected
* - the pointer is not NULL
*
* If uaddr_exe is not used, the other mappings are checked in
* order of appearance.
* If a hint is given, the selection will only be used if the hint
* falls in the range described by the mapping.
*
* The states are pointers because:
* - they may not all be in use
* - the struct size for different schemes is variable
*
* The uaddr_brk_stack selector will select addresses that are in
* the brk/stack area of the map.
*/
struct uvm_addr_state *uaddr_exe; /* Executable selector. */
struct uvm_addr_state *uaddr_any[4]; /* More selectors. */
struct uvm_addr_state *uaddr_brk_stack; /* Brk/stack selector. */
};
/* vm_map flags */
#define VM_MAP_PAGEABLE 0x01 /* ro: entries are pageable */
#define VM_MAP_INTRSAFE 0x02 /* ro: interrupt safe map */
#define VM_MAP_WIREFUTURE 0x04 /* rw: wire future mappings */
#define VM_MAP_BUSY 0x08 /* rw: map is busy */
#define VM_MAP_WANTLOCK 0x10 /* rw: want to write-lock */
#define VM_MAP_GUARDPAGES 0x20 /* rw: add guard pgs to map */
#define VM_MAP_ISVMSPACE 0x40 /* ro: map is a vmspace */
#define VM_MAP_SYSCALL_ONCE 0x80 /* rw: libc syscall registered */
/* Number of kernel maps and entries to statically allocate */
#define MAX_KMAPENT 1024 /* Sufficient to make it to the scheduler. */
#ifdef _KERNEL
/*
* globals:
*/
extern vaddr_t uvm_maxkaddr;
/*
* protos: the following prototypes define the interface to vm_map
*/
void uvm_map_deallocate(struct vm_map *);
int uvm_map_clean(struct vm_map *, vaddr_t, vaddr_t, int);
void uvm_map_clip_start(struct vm_map *, struct vm_map_entry *,
vaddr_t);
void uvm_map_clip_end(struct vm_map *, struct vm_map_entry *,
vaddr_t);
int uvm_map_extract(struct vm_map *, vaddr_t, vsize_t,
vaddr_t *, int);
struct vm_map * uvm_map_create(pmap_t, vaddr_t, vaddr_t, int);
vaddr_t uvm_map_pie(vaddr_t);
vaddr_t uvm_map_hint(struct vmspace *, vm_prot_t, vaddr_t, vaddr_t);
int uvm_map_syscall(struct vm_map *, vaddr_t, vaddr_t);
int uvm_map_inherit(struct vm_map *, vaddr_t, vaddr_t, vm_inherit_t);
int uvm_map_advice(struct vm_map *, vaddr_t, vaddr_t, int);
void uvm_map_init(void);
boolean_t uvm_map_lookup_entry(struct vm_map *, vaddr_t, vm_map_entry_t *);
boolean_t uvm_map_is_stack_remappable(struct vm_map *, vaddr_t, vsize_t);
int uvm_map_remap_as_stack(struct proc *, vaddr_t, vsize_t);
int uvm_map_replace(struct vm_map *, vaddr_t, vaddr_t,
vm_map_entry_t, int);
int uvm_map_reserve(struct vm_map *, vsize_t, vaddr_t, vsize_t,
vaddr_t *);
void uvm_map_setup(struct vm_map *, pmap_t, vaddr_t, vaddr_t, int);
int uvm_map_submap(struct vm_map *, vaddr_t, vaddr_t,
struct vm_map *);
void uvm_unmap(struct vm_map *, vaddr_t, vaddr_t);
void uvm_unmap_detach(struct uvm_map_deadq *, int);
void uvm_unmap_remove(struct vm_map*, vaddr_t, vaddr_t,
struct uvm_map_deadq *, boolean_t, boolean_t);
void uvm_map_set_uaddr(struct vm_map*, struct uvm_addr_state**,
struct uvm_addr_state*);
int uvm_map_mquery(struct vm_map*, vaddr_t*, vsize_t, voff_t, int);
struct p_inentry;
int uvm_map_inentry_sp(vm_map_entry_t);
int uvm_map_inentry_pc(vm_map_entry_t);
boolean_t uvm_map_inentry(struct proc *, struct p_inentry *, vaddr_t addr,
const char *fmt, int (*fn)(vm_map_entry_t), u_long serial);
struct kinfo_vmentry;
int uvm_map_fill_vmmap(struct vm_map *, struct kinfo_vmentry *,
size_t *);
/*
* VM map locking operations:
*
* These operations perform locking on the data portion of the
* map.
*
* vm_map_lock_try: try to lock a map, failing if it is already locked.
*
* vm_map_lock: acquire an exclusive (write) lock on a map.
*
* vm_map_lock_read: acquire a shared (read) lock on a map.
*
* vm_map_unlock: release an exclusive lock on a map.
*
* vm_map_unlock_read: release a shared lock on a map.
*
* vm_map_downgrade: downgrade an exclusive lock to a shared lock.
*
* vm_map_upgrade: upgrade a shared lock to an exclusive lock.
*
* vm_map_busy: mark a map as busy.
*
* vm_map_unbusy: clear busy status on a map.
*
*/
boolean_t vm_map_lock_try_ln(struct vm_map*, char*, int);
void vm_map_lock_ln(struct vm_map*, char*, int);
void vm_map_lock_read_ln(struct vm_map*, char*, int);
void vm_map_unlock_ln(struct vm_map*, char*, int);
void vm_map_unlock_read_ln(struct vm_map*, char*, int);
void vm_map_downgrade_ln(struct vm_map*, char*, int);
void vm_map_upgrade_ln(struct vm_map*, char*, int);
void vm_map_busy_ln(struct vm_map*, char*, int);
void vm_map_unbusy_ln(struct vm_map*, char*, int);
#ifdef DIAGNOSTIC
#define vm_map_lock_try(map) vm_map_lock_try_ln(map, __FILE__, __LINE__)
#define vm_map_lock(map) vm_map_lock_ln(map, __FILE__, __LINE__)
#define vm_map_lock_read(map) vm_map_lock_read_ln(map, __FILE__, __LINE__)
#define vm_map_unlock(map) vm_map_unlock_ln(map, __FILE__, __LINE__)
#define vm_map_unlock_read(map) vm_map_unlock_read_ln(map, __FILE__, __LINE__)
#define vm_map_downgrade(map) vm_map_downgrade_ln(map, __FILE__, __LINE__)
#define vm_map_upgrade(map) vm_map_upgrade_ln(map, __FILE__, __LINE__)
#define vm_map_busy(map) vm_map_busy_ln(map, __FILE__, __LINE__)
#define vm_map_unbusy(map) vm_map_unbusy_ln(map, __FILE__, __LINE__)
#else
#define vm_map_lock_try(map) vm_map_lock_try_ln(map, NULL, 0)
#define vm_map_lock(map) vm_map_lock_ln(map, NULL, 0)
#define vm_map_lock_read(map) vm_map_lock_read_ln(map, NULL, 0)
#define vm_map_unlock(map) vm_map_unlock_ln(map, NULL, 0)
#define vm_map_unlock_read(map) vm_map_unlock_read_ln(map, NULL, 0)
#define vm_map_downgrade(map) vm_map_downgrade_ln(map, NULL, 0)
#define vm_map_upgrade(map) vm_map_upgrade_ln(map, NULL, 0)
#define vm_map_busy(map) vm_map_busy_ln(map, NULL, 0)
#define vm_map_unbusy(map) vm_map_unbusy_ln(map, NULL, 0)
#endif
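
/*
 * Editor's hedged sketch (not part of the original header): the busy
 * protocol described in the locking notes above is typically used when a
 * thread must sleep holding only a read lock yet later needs exclusive
 * access back.  The function name is hypothetical.
 */
static inline void
example_map_busy_cycle(struct vm_map *map)
{
	vm_map_lock(map);	/* exclusive (write) lock */
	vm_map_busy(map);	/* keep new writers out while we sleep */
	vm_map_downgrade(map);	/* shared lock across the sleeping work */
	/* ... potentially sleeping operation here ... */
	vm_map_upgrade(map);	/* regain the exclusive lock */
	vm_map_unbusy(map);	/* only the marking thread may clear busy */
	vm_map_unlock(map);
}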
void uvm_map_lock_entry(struct vm_map_entry *);
void uvm_map_unlock_entry(struct vm_map_entry *);
#endif /* _KERNEL */
/*
* Functions implemented as macros
*/
#define vm_map_min(map) ((map)->min_offset)
#define vm_map_max(map) ((map)->max_offset)
#define vm_map_pmap(map) ((map)->pmap)
#endif /* _UVM_UVM_MAP_H_ */
/* $OpenBSD: rnd.c,v 1.223 2022/02/06 17:24:58 rob Exp $ */
/*
* Copyright (c) 2011,2020 Theo de Raadt.
* Copyright (c) 2008 Damien Miller.
* Copyright (c) 1996, 1997, 2000-2002 Michael Shalayeff.
* Copyright (c) 2013 Markus Friedl.
* Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The bootblocks pre-fill the kernel .openbsd.randomdata section with seed
* material (on-disk from previous boot, hopefully mixed with a hardware rng).
* The first arc4random(9) call initializes this seed material as a chacha
* state. Calls can be done early in kernel bootstrap code -- early use is
* encouraged.
*
* After the kernel timeout subsystem is initialized, random_start() prepares
* the entropy collection mechanism enqueue_randomness() and timeout-driven
* mixing into the chacha state. The first submissions come from device
* probes, later on interrupt-time submissions are more common. Entropy
* data (and timing information) get mixed over the entropy input ring
* rnd_event_space[] -- the goal is to collect damage.
*
 * Based upon timeouts, a selection of the entropy ring rnd_event_space[]
 * is CRC bit-distributed and XOR mixed into entropy_pool[].
*
* From time to time, entropy_pool[] is SHA512-whitened, mixed with time
* information again, XOR'd with the inner and outer states of the existing
* chacha state, to create a new chacha state.
*
* During early boot (until cold=0), enqueue operations are immediately
* dequeued, and mixed into the chacha.
*/
#include <sys/param.h>
#include <sys/event.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/timeout.h>
#include <sys/atomic.h>
#include <sys/task.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <crypto/sha2.h>
#define KEYSTREAM_ONLY
#include <crypto/chacha_private.h>
#include <uvm/uvm_extern.h>
/*
* For the purposes of better mixing, we use the CRC-32 polynomial as
* well to make a twisted Generalized Feedback Shift Register
*
* (See M. Matsumoto & Y. Kurita, 1992. Twisted GFSR generators. ACM
* Transactions on Modeling and Computer Simulation 2(3):179-194.
* Also see M. Matsumoto & Y. Kurita, 1994. Twisted GFSR generators
* II. ACM Transactions on Modeling and Computer Simulation 4:254-266)
*/
/*
* Stirring polynomials over GF(2) for various pool sizes. Used in
* add_entropy_words() below.
*
* The polynomial terms are chosen to be evenly spaced (minimum RMS
* distance from evenly spaced; except for the last tap, which is 1 to
 * get the twisting happening as fast as possible).
*
* The resultant polynomial is:
* 2^POOLWORDS + 2^POOL_TAP1 + 2^POOL_TAP2 + 2^POOL_TAP3 + 2^POOL_TAP4 + 1
*/
#define POOLWORDS 2048
#define POOLBYTES (POOLWORDS*4)
#define POOLMASK (POOLWORDS - 1)
#define POOL_TAP1 1638
#define POOL_TAP2 1231
#define POOL_TAP3 819
#define POOL_TAP4 411
/*
* Raw entropy collection from device drivers; at interrupt context or not.
* enqueue_randomness() is used to submit data into the entropy input ring.
*/
#define QEVLEN 128 /* must be a power of 2 */
#define QEVCONSUME 8 /* how many events to consume a time */
#define KEYSZ 32
#define IVSZ 8
#define BLOCKSZ 64
#define RSBUFSZ (16*BLOCKSZ)
#define EBUFSIZE (KEYSZ + IVSZ)
struct rand_event {
u_int re_time;
u_int re_val;
} rnd_event_space[QEVLEN];
u_int rnd_event_cons;
u_int rnd_event_prod;
int rnd_cold = 1;
int rnd_slowextract = 1;
void rnd_reinit(void *v); /* timeout to start reinit */
void rnd_init(void *); /* actually do the reinit */
static u_int32_t entropy_pool[POOLWORDS];
u_int32_t entropy_pool0[POOLWORDS] __attribute__((section(".openbsd.randomdata")));
void dequeue_randomness(void *);
void add_entropy_words(const u_int32_t *, u_int);
void extract_entropy(u_int8_t *)
__attribute__((__bounded__(__minbytes__,1,EBUFSIZE)));
struct timeout rnd_timeout = TIMEOUT_INITIALIZER(dequeue_randomness, NULL);
int filt_randomread(struct knote *, long);
void filt_randomdetach(struct knote *);
int filt_randomwrite(struct knote *, long);
static void _rs_seed(u_char *, size_t);
static void _rs_clearseed(const void *p, size_t s);
const struct filterops randomread_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
.f_detach = filt_randomdetach,
.f_event = filt_randomread,
};
const struct filterops randomwrite_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
.f_detach = filt_randomdetach,
.f_event = filt_randomwrite,
};
/*
* This function mixes entropy and timing into the entropy input ring.
*/
void
enqueue_randomness(u_int val)
{
struct rand_event *rep;
int e;
e = (atomic_inc_int_nv(&rnd_event_prod) - 1) & (QEVLEN-1);
rep = &rnd_event_space[e];
rep->re_time += cpu_rnd_messybits();
rep->re_val += val;
if (rnd_cold) {
dequeue_randomness(NULL);
rnd_init(NULL);
		if (!cold)
			rnd_cold = 0;
	} else if (!timeout_pending(&rnd_timeout) &&
(rnd_event_prod - rnd_event_cons) > QEVCONSUME) {
rnd_slowextract = min(rnd_slowextract * 2, 5000);
timeout_add_msec(&rnd_timeout, rnd_slowextract * 10);
}
}
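
/*
 * Editor's hedged example (not part of the original file): drivers pass
 * whatever cheap, varying value is at hand; the timing of the call itself
 * supplies most of the entropy.  The handler name is hypothetical.
 */
int
example_driver_intr(void *arg)
{
	u_int status = 0;	/* e.g. a hardware status/counter register */

	/* ... normal interrupt servicing would go here ... */
	enqueue_randomness(status);
	return (1);
}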
/*
* This function merges entropy ring information into the buffer using
* a polynomial to spread the bits.
*/
void
add_entropy_words(const u_int32_t *buf, u_int n)
{
/* derived from IEEE 802.3 CRC-32 */
static const u_int32_t twist_table[8] = {
0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278
};
static u_int entropy_add_ptr;
static u_char entropy_input_rotate;
for (; n--; buf++) {
u_int32_t w = (*buf << entropy_input_rotate) |
(*buf >> ((32 - entropy_input_rotate) & 31));
u_int i = entropy_add_ptr =
(entropy_add_ptr - 1) & POOLMASK;
/*
* Normally, we add 7 bits of rotation to the pool.
* At the beginning of the pool, add an extra 7 bits
* rotation, so that successive passes spread the
* input bits across the pool evenly.
*/
entropy_input_rotate =
(entropy_input_rotate + (i ? 7 : 14)) & 31;
/* XOR pool contents corresponding to polynomial terms */
w ^= entropy_pool[(i + POOL_TAP1) & POOLMASK] ^
entropy_pool[(i + POOL_TAP2) & POOLMASK] ^
entropy_pool[(i + POOL_TAP3) & POOLMASK] ^
entropy_pool[(i + POOL_TAP4) & POOLMASK] ^
entropy_pool[(i + 1) & POOLMASK] ^
entropy_pool[i]; /* + 2^POOLWORDS */
entropy_pool[i] = (w >> 3) ^ twist_table[w & 7];
}
}
/*
* Pulls entropy out of the queue and merges it into the pool with the
* CRC. This takes a mix of fresh entries from the producer end of the
* queue and entries from the consumer end of the queue which are
* likely to have collected more damage.
*/
/* ARGSUSED */
void
dequeue_randomness(void *v)
{
u_int32_t buf[2];
u_int startp, startc, i;
if (!rnd_cold)
timeout_del(&rnd_timeout);
/* Some very new damage */
startp = rnd_event_prod - QEVCONSUME;
for (i = 0; i < QEVCONSUME; i++) {
u_int e = (startp + i) & (QEVLEN-1);
buf[0] = rnd_event_space[e].re_time;
buf[1] = rnd_event_space[e].re_val;
add_entropy_words(buf, 2);
}
/* and some probably more damaged */
startc = rnd_event_cons;
for (i = 0; i < QEVCONSUME; i++) {
u_int e = (startc + i) & (QEVLEN-1);
buf[0] = rnd_event_space[e].re_time;
buf[1] = rnd_event_space[e].re_val;
add_entropy_words(buf, 2);
}
rnd_event_cons = startp + QEVCONSUME;
}
/*
* Grabs a chunk from the entropy_pool[] and slams it through SHA512 when
* requested.
*/
void
extract_entropy(u_int8_t *buf)
{
static u_int32_t extract_pool[POOLWORDS];
u_char digest[SHA512_DIGEST_LENGTH];
SHA2_CTX shactx;
#if SHA512_DIGEST_LENGTH < EBUFSIZE
#error "need more bigger hash output"
#endif
/*
* INTENTIONALLY not protected by any lock. Races during
* memcpy() result in acceptable input data; races during
* SHA512Update() would create nasty data dependencies. We
* do not rely on this as a benefit, but if it happens, cool.
*/
memcpy(extract_pool, entropy_pool, sizeof(extract_pool));
/* Hash the pool to get the output */
SHA512Init(&shactx);
SHA512Update(&shactx, (u_int8_t *)extract_pool, sizeof(extract_pool));
SHA512Final(digest, &shactx);
/* Copy data to destination buffer */
memcpy(buf, digest, EBUFSIZE);
/*
* Modify pool so next hash will produce different results.
* During boot-time enqueue/dequeue stage, avoid recursion.
*/
if (!rnd_cold)
enqueue_randomness(extract_pool[0]);
dequeue_randomness(NULL);
/* Wipe data from memory */
explicit_bzero(extract_pool, sizeof(extract_pool));
explicit_bzero(digest, sizeof(digest));
}
/* random keystream by ChaCha */
struct mutex rndlock = MUTEX_INITIALIZER(IPL_HIGH);
struct timeout rndreinit_timeout = TIMEOUT_INITIALIZER(rnd_reinit, NULL);
struct task rnd_task = TASK_INITIALIZER(rnd_init, NULL);
static chacha_ctx rs; /* chacha context for random keystream */
/* keystream blocks (also chacha seed from boot) */
static u_char rs_buf[RSBUFSZ];
u_char rs_buf0[RSBUFSZ] __attribute__((section(".openbsd.randomdata")));
static size_t rs_have; /* valid bytes at end of rs_buf */
static size_t rs_count; /* bytes till reseed */
void
suspend_randomness(void)
{
struct timespec ts;
getnanotime(&ts);
enqueue_randomness(ts.tv_sec);
enqueue_randomness(ts.tv_nsec);
dequeue_randomness(NULL);
rs_count = 0;
arc4random_buf(entropy_pool, sizeof(entropy_pool));
}
void
resume_randomness(char *buf, size_t buflen)
{
struct timespec ts;
if (buf && buflen)
_rs_seed(buf, buflen);
getnanotime(&ts);
enqueue_randomness(ts.tv_sec);
enqueue_randomness(ts.tv_nsec);
dequeue_randomness(NULL);
rs_count = 0;
}
static inline void _rs_rekey(u_char *dat, size_t datlen);
static inline void
_rs_init(u_char *buf, size_t n)
{
KASSERT(n >= KEYSZ + IVSZ);
chacha_keysetup(&rs, buf, KEYSZ * 8);
chacha_ivsetup(&rs, buf + KEYSZ, NULL);
}
static void
_rs_seed(u_char *buf, size_t n)
{
_rs_rekey(buf, n);
/* invalidate rs_buf */
rs_have = 0;
memset(rs_buf, 0, sizeof(rs_buf));
rs_count = 1600000;
}
static void
_rs_stir(int do_lock)
{
struct timespec ts;
u_int8_t buf[EBUFSIZE], *p;
int i;
/*
* Use SHA512 PRNG data and a system timespec; early in the boot
* process this is the best we can do -- some architectures do
* not collect entropy very well during this time, but may have
* clock information which is better than nothing.
*/
extract_entropy(buf);
nanotime(&ts);
for (p = (u_int8_t *)&ts, i = 0; i < sizeof(ts); i++)
buf[i] ^= p[i];
if (do_lock)
mtx_enter(&rndlock);
_rs_seed(buf, sizeof(buf));
if (do_lock)
mtx_leave(&rndlock);
explicit_bzero(buf, sizeof(buf));
/* encourage fast-dequeue again */
rnd_slowextract = 1;
}
static inline void
_rs_stir_if_needed(size_t len)
{
static int rs_initialized;
if (!rs_initialized) {
memcpy(entropy_pool, entropy_pool0, sizeof(entropy_pool));
memcpy(rs_buf, rs_buf0, sizeof(rs_buf));
/* seeds cannot be cleaned yet, random_start() will do so */
_rs_init(rs_buf, KEYSZ + IVSZ);
rs_count = 1024 * 1024 * 1024; /* until main() runs */
rs_initialized = 1;
} else if (rs_count <= len)
_rs_stir(0);
else
rs_count -= len;
}
static void
_rs_clearseed(const void *p, size_t s)
{
struct kmem_dyn_mode kd_avoidalias;
vaddr_t va = trunc_page((vaddr_t)p);
vsize_t off = (vaddr_t)p - va;
vsize_t len;
vaddr_t rwva;
paddr_t pa;
while (s > 0) {
pmap_extract(pmap_kernel(), va, &pa);
memset(&kd_avoidalias, 0, sizeof(kd_avoidalias));
kd_avoidalias.kd_prefer = pa;
kd_avoidalias.kd_waitok = 1;
rwva = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none,
&kd_avoidalias);
if (!rwva)
panic("_rs_clearseed");
pmap_kenter_pa(rwva, pa, PROT_READ | PROT_WRITE);
pmap_update(pmap_kernel());
len = MIN(s, PAGE_SIZE - off);
explicit_bzero((void *)(rwva + off), len);
pmap_kremove(rwva, PAGE_SIZE);
km_free((void *)rwva, PAGE_SIZE, &kv_any, &kp_none);
va += PAGE_SIZE;
s -= len;
off = 0;
}
}
static inline void
_rs_rekey(u_char *dat, size_t datlen)
{
#ifndef KEYSTREAM_ONLY
memset(rs_buf, 0, sizeof(rs_buf));
#endif
/* fill rs_buf with the keystream */
chacha_encrypt_bytes(&rs, rs_buf, rs_buf, sizeof(rs_buf));
/* mix in optional user provided data */
if (dat) {
size_t i, m;
m = MIN(datlen, KEYSZ + IVSZ);
for (i = 0; i < m; i++)
rs_buf[i] ^= dat[i];
}
/* immediately reinit for backtracking resistance */
_rs_init(rs_buf, KEYSZ + IVSZ);
memset(rs_buf, 0, KEYSZ + IVSZ);
rs_have = sizeof(rs_buf) - KEYSZ - IVSZ;
}
static inline void
_rs_random_buf(void *_buf, size_t n)
{
u_char *buf = (u_char *)_buf;
size_t m;
_rs_stir_if_needed(n);
	while (n > 0) {
		if (rs_have > 0) {
m = MIN(n, rs_have);
memcpy(buf, rs_buf + sizeof(rs_buf) - rs_have, m);
memset(rs_buf + sizeof(rs_buf) - rs_have, 0, m);
buf += m;
n -= m;
rs_have -= m;
}
if (rs_have == 0)
_rs_rekey(NULL, 0);
}
}
static inline void
_rs_random_u32(u_int32_t *val)
{
_rs_stir_if_needed(sizeof(*val));
	if (rs_have < sizeof(*val))
		_rs_rekey(NULL, 0);
memcpy(val, rs_buf + sizeof(rs_buf) - rs_have, sizeof(*val));
memset(rs_buf + sizeof(rs_buf) - rs_have, 0, sizeof(*val));
rs_have -= sizeof(*val);
}
/* Return one word of randomness from a ChaCha20 generator */
u_int32_t
arc4random(void)
{
u_int32_t ret;
mtx_enter(&rndlock);
_rs_random_u32(&ret);
mtx_leave(&rndlock);
return ret;
}
/*
* Fill a buffer of arbitrary length with ChaCha20-derived randomness.
*/
void
arc4random_buf(void *buf, size_t n)
{
mtx_enter(&rndlock);
_rs_random_buf(buf, n);
mtx_leave(&rndlock);
}
/*
* Allocate a new ChaCha20 context for the caller to use.
*/
struct arc4random_ctx *
arc4random_ctx_new(void)
{
char keybuf[KEYSZ + IVSZ];
chacha_ctx *ctx = malloc(sizeof(chacha_ctx), M_TEMP, M_WAITOK);
arc4random_buf(keybuf, KEYSZ + IVSZ);
chacha_keysetup(ctx, keybuf, KEYSZ * 8);
chacha_ivsetup(ctx, keybuf + KEYSZ, NULL);
explicit_bzero(keybuf, sizeof(keybuf));
return (struct arc4random_ctx *)ctx;
}
/*
* Free a ChaCha20 context created by arc4random_ctx_new()
*/
void
arc4random_ctx_free(struct arc4random_ctx *ctx)
{
explicit_bzero(ctx, sizeof(chacha_ctx));
free(ctx, M_TEMP, sizeof(chacha_ctx));
}
/*
* Use a given ChaCha20 context to fill a buffer
*/
void
arc4random_ctx_buf(struct arc4random_ctx *ctx, void *buf, size_t n)
{
#ifndef KEYSTREAM_ONLY
memset(buf, 0, n);
#endif
chacha_encrypt_bytes((chacha_ctx *)ctx, buf, buf, n);
}
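
/*
 * Editor's hedged usage sketch (not part of the original file): a caller
 * with a large or repeated demand keys a private context once and fills
 * buffers from it, instead of taking rndlock for every request (this is
 * what randomread() below does for big reads).  The name is hypothetical.
 */
void
example_fill_blocks(u_char *dst, size_t blksz, int nblocks)
{
	struct arc4random_ctx *ctx;
	int i;

	ctx = arc4random_ctx_new();
	for (i = 0; i < nblocks; i++)
		arc4random_ctx_buf(ctx, dst + i * blksz, blksz);
	arc4random_ctx_free(ctx);
}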
/*
* Calculate a uniformly distributed random number less than upper_bound
* avoiding "modulo bias".
*
* Uniformity is achieved by generating new random numbers until the one
* returned is outside the range [0, 2**32 % upper_bound). This
* guarantees the selected random number will be inside
* [2**32 % upper_bound, 2**32) which maps back to [0, upper_bound)
* after reduction modulo upper_bound.
*/
u_int32_t
arc4random_uniform(u_int32_t upper_bound)
{
u_int32_t r, min;
if (upper_bound < 2)
return 0;
/* 2**32 % x == (2**32 - x) % x */
min = -upper_bound % upper_bound;
/*
* This could theoretically loop forever but each retry has
* p > 0.5 (worst case, usually far better) of selecting a
* number inside the range we need, so it should rarely need
* to re-roll.
*/
for (;;) {
r = arc4random();
if (r >= min)
break;
}
return r % upper_bound;
}
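
/*
 * Editor's hedged usage sketch (not part of the original file): typical
 * callers want an unbiased index or a jittered delay.  The function name
 * is hypothetical.
 */
u_int32_t
example_jittered_ticks(u_int32_t base, u_int32_t spread)
{
	/* base plus an unbiased amount in [0, spread) */
	return (base + arc4random_uniform(spread));
}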
/* ARGSUSED */
void
rnd_init(void *null)
{
_rs_stir(1);
}
/*
 * Called by timeout to mark arc4 for stirring.
*/
void
rnd_reinit(void *v)
{
task_add(systq, &rnd_task);
/* 10 minutes, per dm@'s suggestion */
timeout_add_sec(&rndreinit_timeout, 10 * 60);
}
/*
* Start periodic services inside the random subsystem, which pull
* entropy forward, hash it, and re-seed the random stream as needed.
*/
void
random_start(int goodseed)
{
extern char etext[];
#if !defined(NO_PROPOLICE)
extern long __guard_local;
if (__guard_local == 0)
printf("warning: no entropy supplied by boot loader\n");
#endif
_rs_clearseed(entropy_pool0, sizeof(entropy_pool0));
_rs_clearseed(rs_buf0, sizeof(rs_buf0));
/* Message buffer may contain data from previous boot */
if (msgbufp->msg_magic == MSG_MAGIC)
add_entropy_words((u_int32_t *)msgbufp->msg_bufc,
msgbufp->msg_bufs / sizeof(u_int32_t));
add_entropy_words((u_int32_t *)etext - 32*1024,
8192/sizeof(u_int32_t));
dequeue_randomness(NULL);
rnd_init(NULL);
rnd_reinit(NULL);
if (goodseed)
printf("random: good seed from bootblocks\n");
else {
/* XXX kernel should work harder here */
printf("random: boothowto does not indicate good seed\n");
}
}
int
randomopen(dev_t dev, int flag, int mode, struct proc *p)
{
return 0;
}
int
randomclose(dev_t dev, int flag, int mode, struct proc *p)
{
return 0;
}
/*
* Maximum number of bytes to serve directly from the main ChaCha
* pool. Larger requests are served from a discrete ChaCha instance keyed
* from the main pool.
*/
#define RND_MAIN_MAX_BYTES 2048
int
randomread(dev_t dev, struct uio *uio, int ioflag)
{
struct arc4random_ctx *lctx = NULL;
size_t total = uio->uio_resid;
u_char *buf;
int ret = 0;
if (uio->uio_resid == 0)
return 0;
buf = malloc(POOLBYTES, M_TEMP, M_WAITOK);
if (total > RND_MAIN_MAX_BYTES)
lctx = arc4random_ctx_new();
while (ret == 0 && uio->uio_resid > 0) {
size_t n = ulmin(POOLBYTES, uio->uio_resid);
if (lctx != NULL)
arc4random_ctx_buf(lctx, buf, n);
else
arc4random_buf(buf, n);
ret = uiomove(buf, n, uio);
if (ret == 0 && uio->uio_resid > 0)
yield();
}
if (lctx != NULL)
arc4random_ctx_free(lctx);
explicit_bzero(buf, POOLBYTES);
free(buf, M_TEMP, POOLBYTES);
return ret;
}
int
randomwrite(dev_t dev, struct uio *uio, int flags)
{
int ret = 0, newdata = 0;
u_int32_t *buf;
if (uio->uio_resid == 0)
return 0;
buf = malloc(POOLBYTES, M_TEMP, M_WAITOK);
while (ret == 0 && uio->uio_resid > 0) {
size_t n = ulmin(POOLBYTES, uio->uio_resid);
ret = uiomove(buf, n, uio);
if (ret != 0)
break;
while (n % sizeof(u_int32_t))
((u_int8_t *)buf)[n++] = 0;
add_entropy_words(buf, n / 4);
if (uio->uio_resid > 0)
yield();
newdata = 1;
}
if (newdata)
rnd_init(NULL);
explicit_bzero(buf, POOLBYTES);
free(buf, M_TEMP, POOLBYTES);
return ret;
}
int
randomkqfilter(dev_t dev, struct knote *kn)
{
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &randomread_filtops;
break;
case EVFILT_WRITE:
kn->kn_fop = &randomwrite_filtops;
break;
default:
return (EINVAL);
}
return (0);
}
void
filt_randomdetach(struct knote *kn)
{
}
int
filt_randomread(struct knote *kn, long hint)
{
kn->kn_data = RND_MAIN_MAX_BYTES;
return (1);
}
int
filt_randomwrite(struct knote *kn, long hint)
{
kn->kn_data = POOLBYTES;
return (1);
}
int
randomioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
{
switch (cmd) {
case FIOASYNC:
/* No async flag in softc so this is a no-op. */
break;
case FIONBIO:
/* Handled in the upper FS layer. */
break;
default:
return ENOTTY;
}
return 0;
}
int
sys_getentropy(struct proc *p, void *v, register_t *retval)
{
struct sys_getentropy_args /* {
syscallarg(void *) buf;
syscallarg(size_t) nbyte;
} */ *uap = v;
char buf[256];
int error;
if (SCARG(uap, nbyte) > sizeof(buf))
return (EIO);
arc4random_buf(buf, SCARG(uap, nbyte));
if ((error = copyout(buf, SCARG(uap, buf), SCARG(uap, nbyte))) != 0)
return (error);
explicit_bzero(buf, sizeof(buf));
retval[0] = 0;
return (0);
}
/* $OpenBSD: buf.h,v 1.113 2022/09/01 05:24:51 jsg Exp $ */
/* $NetBSD: buf.h,v 1.25 1997/04/09 21:12:17 mycroft Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)buf.h 8.7 (Berkeley) 1/21/94
*/
#ifndef _SYS_BUF_H_
#define _SYS_BUF_H_
#include <sys/queue.h>
#include <sys/tree.h>
#include <sys/mutex.h>
#include <uvm/uvm_extern.h>
#define NOLIST ((struct buf *)0x87654321)
struct buf;
struct vnode;
LIST_HEAD(bufhead, buf);
/*
* To avoid including <ufs/ffs/softdep.h>
*/
LIST_HEAD(workhead, worklist);
/*
* Buffer queues
*/
#define BUFQ_NSCAN_N 128
#define BUFQ_FIFO 0
#define BUFQ_NSCAN 1
#define BUFQ_DEFAULT BUFQ_NSCAN
#define BUFQ_HOWMANY 2
/*
* Write limits for bufq - defines high and low water marks for how
* many kva slots are allowed to be consumed to parallelize writes from
* the buffer cache from any individual bufq.
*/
#define BUFQ_HI 128
#define BUFQ_LOW 64
struct bufq_impl;
struct bufq {
SLIST_ENTRY(bufq) bufq_entries;
struct mutex bufq_mtx;
void *bufq_data;
u_int bufq_outstanding;
u_int bufq_hi;
u_int bufq_low;
int bufq_waiting;
int bufq_stop;
int bufq_type;
const struct bufq_impl *bufq_impl;
};
int bufq_init(struct bufq *, int);
int bufq_switch(struct bufq *, int);
void bufq_destroy(struct bufq *);
void bufq_queue(struct bufq *, struct buf *);
struct buf *bufq_dequeue(struct bufq *);
void bufq_requeue(struct bufq *, struct buf *);
int bufq_peek(struct bufq *);
void bufq_drain(struct bufq *);
void bufq_wait(struct bufq *);
void bufq_done(struct bufq *, struct buf *);
void bufq_quiesce(void);
void bufq_restart(void);
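
/*
 * Editor's hedged sketch (not part of the original header): a disk driver
 * typically owns one bufq, queues incoming buffers from its strategy
 * routine and drains them from its start routine, telling the bufq when
 * each transfer finishes.  The softc layout and names are hypothetical.
 */
struct example_softc {
	struct bufq	sc_bufq;	/* usually bufq_init(..., BUFQ_DEFAULT) */
};

static inline void
example_strategy(struct example_softc *sc, struct buf *bp)
{
	bufq_queue(&sc->sc_bufq, bp);	/* ordered per the bufq's type */
	/* ... kick the controller / call the start routine ... */
}

static inline void
example_start(struct example_softc *sc)
{
	struct buf *bp;

	while ((bp = bufq_dequeue(&sc->sc_bufq)) != NULL) {
		/* ... hand bp to the hardware; once the I/O completes: */
		bufq_done(&sc->sc_bufq, bp);
	}
}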
/* fifo */
SIMPLEQ_HEAD(bufq_fifo_head, buf);
struct bufq_fifo {
SIMPLEQ_ENTRY(buf) bqf_entries;
};
/* nscan */
SIMPLEQ_HEAD(bufq_nscan_head, buf);
struct bufq_nscan {
SIMPLEQ_ENTRY(buf) bqf_entries;
};
/* bufq link in struct buf */
union bufq_data {
struct bufq_fifo bufq_data_fifo;
struct bufq_nscan bufq_data_nscan;
};
/*
* These are currently used only by the soft dependency code, hence
* are stored once in a global variable. If other subsystems wanted
* to use these hooks, a pointer to a set of bio_ops could be added
* to each buffer.
*/
extern struct bio_ops {
void (*io_start)(struct buf *);
void (*io_complete)(struct buf *);
void (*io_deallocate)(struct buf *);
void (*io_movedeps)(struct buf *, struct buf *);
int (*io_countdeps)(struct buf *, int, int);
} bioops;
/* The buffer header describes an I/O operation in the kernel. */
struct buf {
RBT_ENTRY(buf) b_rbbufs; /* vnode "hash" tree */
LIST_ENTRY(buf) b_list; /* All allocated buffers. */
LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */
TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */
int cache; /* which cache are we in */
struct proc *b_proc; /* Associated proc; NULL if kernel. */
volatile long b_flags; /* B_* flags. */
long b_bufsize; /* Allocated buffer size. */
long b_bcount; /* Valid bytes in buffer. */
size_t b_resid; /* Remaining I/O. */
int b_error; /* Errno value. */
dev_t b_dev; /* Device associated with buffer. */
caddr_t b_data; /* associated data */
void *b_saveaddr; /* Original b_data for physio. */
TAILQ_ENTRY(buf) b_valist; /* LRU of va to reuse. */
union bufq_data b_bufq;
struct bufq *b_bq; /* What bufq this buf is on */
struct uvm_object *b_pobj;
struct uvm_object b_uobj; /* Object containing the pages */
off_t b_poffs; /* Offset within object */
daddr_t b_lblkno; /* Logical block number. */
daddr_t b_blkno; /* Underlying physical block number. */
/* Function to call upon completion.
* Will be called at splbio(). */
void (*b_iodone)(struct buf *);
struct vnode *b_vp; /* Device vnode. */
int b_dirtyoff; /* Offset in buffer of dirty region. */
int b_dirtyend; /* Offset of end of dirty region. */
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
struct workhead b_dep; /* List of filesystem dependencies. */
};
TAILQ_HEAD(bufqueue, buf);
struct bufcache {
int64_t hotbufpages;
int64_t warmbufpages;
int64_t cachepages;
struct bufqueue hotqueue;
struct bufqueue coldqueue;
struct bufqueue warmqueue;
};
/* Device driver compatibility definitions. */
#define b_active b_bcount /* Driver queue head: drive active. */
/*
* These flags are kept in b_flags.
*/
#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
#define B_NEEDCOMMIT 0x00000002 /* Needs committing to stable storage */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
#define B_BAD 0x00000008 /* Bad block revectoring in progress. */
#define B_BUSY 0x00000010 /* I/O in progress. */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
#define B_CALL 0x00000040 /* Call b_iodone from biodone. */
#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */
#define B_DONE 0x00000100 /* I/O completed. */
#define B_EINTR 0x00000200 /* I/O was interrupted */
#define B_ERROR 0x00000400 /* I/O error occurred. */
#define B_INVAL 0x00000800 /* Does not contain valid info. */
#define B_NOCACHE 0x00001000 /* Do not cache block after use. */
#define B_PHYS 0x00002000 /* I/O to user memory. */
#define B_RAW 0x00004000 /* Set by physio for raw transfers. */
#define B_READ 0x00008000 /* Read buffer. */
#define B_WANTED 0x00010000 /* Process wants this buffer. */
#define B_WRITEINPROG 0x00020000 /* Write in progress. */
#define B_XXX 0x00040000 /* Debugging flag. */
#define B_DEFERRED 0x00080000 /* Skipped over for cleaning */
#define B_SCANNED 0x00100000 /* Block already pushed during sync */
#define B_PDAEMON 0x00200000 /* I/O started by pagedaemon */
#define B_RELEASED 0x00400000 /* free this buffer after its kvm */
#define B_WARM 0x00800000 /* buffer is or has been on the warm queue */
#define B_COLD 0x01000000 /* buffer is on the cold queue */
#define B_BC 0x02000000 /* buffer is managed by the cache */
#define B_DMA 0x04000000 /* buffer is DMA reachable */
#define B_BITS "\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \
"\006CACHE\007CALL\010DELWRI\011DONE\012EINTR\013ERROR" \
"\014INVAL\015NOCACHE\016PHYS\017RAW\020READ" \
"\021WANTED\022WRITEINPROG\023XXX(FORMAT)\024DEFERRED" \
"\025SCANNED\026DAEMON\027RELEASED\030WARM\031COLD\032BC\033DMA"
/*
* Zero out the buffer's data area.
*/
#define clrbuf(bp) { \
bzero((bp)->b_data, (bp)->b_bcount); \
(bp)->b_resid = 0; \
}
/* Flags to low-level allocation routines. */
#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */
#define B_SYNC 0x02 /* Do all allocations synchronously. */
struct cluster_info {
daddr_t ci_lastr; /* last read (read-ahead) */
daddr_t ci_lastw; /* last write (write cluster) */
daddr_t ci_cstart; /* start block of cluster */
daddr_t ci_lasta; /* last allocation */
int ci_clen; /* length of current cluster */
int ci_ralen; /* Read-ahead length */
daddr_t ci_maxra; /* last readahead block */
};
#ifdef _KERNEL
__BEGIN_DECLS
/* Kva slots (of size MAXPHYS) reserved for syncer and cleaner. */
#define RESERVE_SLOTS 4
/* Buffer cache pages reserved for syncer and cleaner. */
#define RESERVE_PAGES (RESERVE_SLOTS * MAXPHYS / PAGE_SIZE)
/* Minimum size of the buffer cache, in pages. */
#define BCACHE_MIN (RESERVE_PAGES * 2)
#define UNCLEAN_PAGES (bcstats.numbufpages - bcstats.numcleanpages)
extern struct proc *cleanerproc;
extern long bufpages; /* Max number of pages for buffers' data */
extern struct pool bufpool;
extern struct bufhead bufhead;
void bawrite(struct buf *);
void bdwrite(struct buf *);
void biodone(struct buf *);
int biowait(struct buf *);
int bread(struct vnode *, daddr_t, int, struct buf **);
int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int,
struct buf **);
void brelse(struct buf *);
#define bremfree bufcache_take
void bufinit(void);
void buf_dirty(struct buf *);
void buf_undirty(struct buf *);
void buf_adjcnt(struct buf *, long);
int bwrite(struct buf *);
struct buf *getblk(struct vnode *, daddr_t, int, int, uint64_t);
struct buf *geteblk(size_t);
struct buf *incore(struct vnode *, daddr_t);
/*
* bufcache functions
*/
void bufcache_take(struct buf *);
void bufcache_release(struct buf *);
int buf_flip_high(struct buf *);
void buf_flip_dma(struct buf *);
struct buf *bufcache_getcleanbuf(int, int);
struct buf *bufcache_getdirtybuf(void);
/*
* buf_kvm_init initializes the kvm handling for buffers.
* buf_acquire sets the B_BUSY flag and ensures that the buffer is
* mapped in the kvm.
* buf_release clears the B_BUSY flag and allows the buffer to become
* unmapped.
* buf_unmap is for internal use only. Unmaps the buffer from kvm.
*/
void buf_mem_init(vsize_t);
void buf_acquire(struct buf *);
void buf_acquire_nomap(struct buf *);
void buf_map(struct buf *);
void buf_release(struct buf *);
int buf_dealloc_mem(struct buf *);
void buf_fix_mapping(struct buf *, vsize_t);
void buf_alloc_pages(struct buf *, vsize_t);
void buf_free_pages(struct buf *);
void minphys(struct buf *bp);
int physio(void (*strategy)(struct buf *), dev_t dev, int flags,
void (*minphys)(struct buf *), struct uio *uio);
void brelvp(struct buf *);
void reassignbuf(struct buf *);
void bgetvp(struct vnode *, struct buf *);
void buf_replacevnode(struct buf *, struct vnode *);
void buf_daemon(void *);
int bread_cluster(struct vnode *, daddr_t, int, struct buf **);
static __inline void
buf_start(struct buf *bp)
{
if (bioops.io_start)
(*bioops.io_start)(bp);
}
static __inline void
buf_complete(struct buf *bp)
{
if (bioops.io_complete)
(*bioops.io_complete)(bp);
}
static __inline void
buf_deallocate(struct buf *bp)
{
if (bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
}
static __inline void
buf_movedeps(struct buf *bp, struct buf *bp2)
{
if (bioops.io_movedeps)
(*bioops.io_movedeps)(bp, bp2);
}
static __inline int
buf_countdeps(struct buf *bp, int i, int islocked)
{
if (bioops.io_countdeps)
return ((*bioops.io_countdeps)(bp, i, islocked));
else
return (0);
}
__END_DECLS
#endif /* _KERNEL */
#endif /* !_SYS_BUF_H_ */
/* $OpenBSD: siphash.c,v 1.5 2018/01/05 19:05:09 mikeb Exp $ */
/*-
* Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
* are the number of compression rounds and the number of finalization rounds.
* A compression round is identical to a finalization round and this round
* function is called SipRound. Given a 128-bit key k and a (possibly empty)
* byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
*
* Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
* by Jean-Philippe Aumasson and Daniel J. Bernstein,
* Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
* https://131002.net/siphash/siphash.pdf
* https://131002.net/siphash/
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <crypto/siphash.h>
static void SipHash_CRounds(SIPHASH_CTX *, int);
static void SipHash_Rounds(SIPHASH_CTX *, int);
void
SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
{
uint64_t k0, k1;
k0 = lemtoh64(&key->k0);
k1 = lemtoh64(&key->k1);
ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
ctx->v[3] = 0x7465646279746573ULL ^ k1;
memset(ctx->buf, 0, sizeof(ctx->buf));
ctx->bytes = 0;
}
void
SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
{
const uint8_t *ptr = src;
size_t left, used;
if (len == 0)
return;
used = ctx->bytes % sizeof(ctx->buf);
ctx->bytes += len;
if (used > 0) {
left = sizeof(ctx->buf) - used;
if (len >= left) {
memcpy(&ctx->buf[used], ptr, left);
SipHash_CRounds(ctx, rc);
len -= left;
ptr += left;
} else {
memcpy(&ctx->buf[used], ptr, len);
return;
}
}
while (len >= sizeof(ctx->buf)) {
memcpy(ctx->buf, ptr, sizeof(ctx->buf));
SipHash_CRounds(ctx, rc);
len -= sizeof(ctx->buf);
ptr += sizeof(ctx->buf);
}
if (len > 0) memcpy(ctx->buf, ptr, len);
}
void
SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
{
uint64_t r;
htolem64(&r, SipHash_End(ctx, rc, rf));
memcpy(dst, &r, sizeof r);
}
uint64_t
SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
{
uint64_t r;
size_t left, used;
used = ctx->bytes % sizeof(ctx->buf);
left = sizeof(ctx->buf) - used;
memset(&ctx->buf[used], 0, left - 1);
ctx->buf[7] = ctx->bytes;
SipHash_CRounds(ctx, rc);
ctx->v[2] ^= 0xff;
SipHash_Rounds(ctx, rf);
r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
explicit_bzero(ctx, sizeof(*ctx));
return (r);
}
uint64_t
SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
{
SIPHASH_CTX ctx;
SipHash_Init(&ctx, key);
SipHash_Update(&ctx, rc, rf, src, len);
return (SipHash_End(&ctx, rc, rf));
}
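/*
 * Illustrative sketch (not part of the original file): computing a
 * SipHash-2-4 tag over a message with the functions defined above, using
 * both the one-shot and the incremental interface.  Generating the key
 * with arc4random_buf() is only an example.
 */
#if 0
uint64_t
example_siphash24(const void *msg, size_t len)
{
	SIPHASH_CTX ctx;
	SIPHASH_KEY key;
	uint64_t tag;

	arc4random_buf(&key, sizeof(key));	/* 128-bit secret key */

	/* One-shot interface, 2 compression and 4 finalization rounds. */
	tag = SipHash(&key, 2, 4, msg, len);

	/* Equivalent incremental interface. */
	SipHash_Init(&ctx, &key);
	SipHash_Update(&ctx, 2, 4, msg, len);
	KASSERT(tag == SipHash_End(&ctx, 2, 4));

	return (tag);
}
#endif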
#define SIP_ROTL(x, b) (((x) << (b)) | ((x) >> (64 - (b))))
static void
SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
{
while (rounds--) {
ctx->v[0] += ctx->v[1];
ctx->v[2] += ctx->v[3];
ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
ctx->v[1] ^= ctx->v[0];
ctx->v[3] ^= ctx->v[2];
ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
ctx->v[2] += ctx->v[1];
ctx->v[0] += ctx->v[3];
ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
ctx->v[1] ^= ctx->v[2];
ctx->v[3] ^= ctx->v[0];
ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
}
}
static void
SipHash_CRounds(SIPHASH_CTX *ctx, int rounds)
{
uint64_t m = lemtoh64((uint64_t *)ctx->buf);
ctx->v[3] ^= m;
SipHash_Rounds(ctx, rounds);
ctx->v[0] ^= m;
}
/* $OpenBSD: uvm_object.c,v 1.25 2022/02/21 16:08:36 kn Exp $ */
/*
* Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_object.c: operate with memory objects
*
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <uvm/uvm.h>
/* Dummy object used by some pmaps for sanity checks. */
const struct uvm_pagerops pmap_pager = {
/* nothing */
};
/* Dummy object used by the buffer cache for sanity checks. */
const struct uvm_pagerops bufcache_pager = {
/* nothing */
};
/* Page count to fetch per single step. */
#define FETCH_PAGECOUNT 16
/*
* uvm_obj_init: initialize UVM memory object.
*/
void
uvm_obj_init(struct uvm_object *uobj, const struct uvm_pagerops *pgops, int refs)
{
int alock;
alock = ((pgops != NULL) && (pgops != &pmap_pager) &&
(pgops != &bufcache_pager) && (refs != UVM_OBJ_KERN));
if (alock) {
/* Allocate and assign a lock. */
rw_obj_alloc(&uobj->vmobjlock, "uobjlk");
} else {
/* The lock will need to be set via uvm_obj_setlock(). */
uobj->vmobjlock = NULL;
}
uobj->pgops = pgops;
RBT_INIT(uvm_objtree, &uobj->memt);
uobj->uo_npages = 0;
uobj->uo_refs = refs;
}
/*
* uvm_obj_destroy: destroy UVM memory object.
*/
void
uvm_obj_destroy(struct uvm_object *uo)
{
KASSERT(RBT_EMPTY(uvm_objtree, &uo->memt));
rw_obj_free(uo->vmobjlock);
}
/*
* uvm_obj_setlock: assign a vmobjlock to the UVM object.
*
* => Caller is responsible for ensuring that the UVM object is not in use.
* => Only dynamic lock may be previously set. We drop the reference then.
*/
void
uvm_obj_setlock(struct uvm_object *uo, struct rwlock *lockptr)
{
struct rwlock *olockptr = uo->vmobjlock;
if (olockptr) {
/* Drop the reference on the old lock. */
rw_obj_free(olockptr);
}
if (lockptr == NULL) {
/* If new lock is not passed - allocate default one. */
rw_obj_alloc(&lockptr, "uobjlk");
}
uo->vmobjlock = lockptr;
}
#ifndef SMALL_KERNEL
/*
* uvm_obj_wire: wire the pages of entire UVM object.
*
* => NOTE: this function should only be used for types of objects
* where PG_RELEASED flag is never set (aobj objects)
* => caller must pass page-aligned start and end values
* => if the caller passes in a pageq pointer, we'll return a list of
* wired pages.
*/
int
uvm_obj_wire(struct uvm_object *uobj, voff_t start, voff_t end,
struct pglist *pageq)
{
int i, npages, left, error;
struct vm_page *pgs[FETCH_PAGECOUNT];
voff_t offset = start;
left = (end - start) >> PAGE_SHIFT;
rw_enter(uobj->vmobjlock, RW_WRITE | RW_DUPOK);
while (left) {
npages = MIN(FETCH_PAGECOUNT, left);
/* Get the pages */
memset(pgs, 0, sizeof(pgs));
error = (*uobj->pgops->pgo_get)(uobj, offset, pgs, &npages, 0,
PROT_READ | PROT_WRITE, MADV_SEQUENTIAL,
PGO_ALLPAGES | PGO_SYNCIO);
if (error)
goto error;
rw_enter(uobj->vmobjlock, RW_WRITE | RW_DUPOK);
for (i = 0; i < npages; i++) {
KASSERT(pgs[i] != NULL);
KASSERT(!(pgs[i]->pg_flags & PG_RELEASED));
if (pgs[i]->pg_flags & PQ_AOBJ) {
atomic_clearbits_int(&pgs[i]->pg_flags,
PG_CLEAN);
uao_dropswap(uobj, i);
}
}
/* Wire the pages */
uvm_lock_pageq();
for (i = 0; i < npages; i++) {
uvm_pagewire(pgs[i]);
if (pageq != NULL)
TAILQ_INSERT_TAIL(pageq, pgs[i], pageq);
}
uvm_unlock_pageq();
/* Unbusy the pages */
uvm_page_unbusy(pgs, npages);
left -= npages;
offset += (voff_t)npages << PAGE_SHIFT;
}
rw_exit(uobj->vmobjlock);
return 0;
error:
/* Unwire the pages which have been wired */
uvm_obj_unwire(uobj, start, offset);
return error;
}
/*
* uvm_obj_unwire: unwire the pages of entire UVM object.
*
* => caller must pass page-aligned start and end values
*/
void
uvm_obj_unwire(struct uvm_object *uobj, voff_t start, voff_t end)
{
struct vm_page *pg;
off_t offset;
rw_enter(uobj->vmobjlock, RW_WRITE | RW_DUPOK);
uvm_lock_pageq();
for (offset = start; offset < end; offset += PAGE_SIZE) {
pg = uvm_pagelookup(uobj, offset);
KASSERT(pg != NULL);
KASSERT(!(pg->pg_flags & PG_RELEASED));
uvm_pageunwire(pg);
}
uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
}
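/*
 * Illustrative sketch (not part of the original file): wiring a page-aligned
 * range of an anonymous object and releasing it again.  "uobj" is assumed to
 * be an aobj obtained elsewhere; error handling is kept minimal.
 */
#if 0
int
example_wire_range(struct uvm_object *uobj, voff_t start, voff_t end)
{
	struct pglist pgl;
	int error;

	TAILQ_INIT(&pgl);
	error = uvm_obj_wire(uobj, start, end, &pgl);	/* wire [start, end) */
	if (error)
		return (error);
	/* ... the wired pages are now linked on "pgl" ... */
	uvm_obj_unwire(uobj, start, end);		/* undo the wiring */
	return (0);
}
#endif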
#endif /* !SMALL_KERNEL */
/*
* uvm_obj_free: free all pages in a uvm object, used by the buffer
* cache to free all pages attached to a buffer.
*/
void
uvm_obj_free(struct uvm_object *uobj)
{
struct vm_page *pg;
struct pglist pgl;
KASSERT(UVM_OBJ_IS_BUFCACHE(uobj));
KERNEL_ASSERT_LOCKED();
TAILQ_INIT(&pgl);
/*
* Extract from rb tree in offset order. The phys addresses
* usually increase in that order, which is better for
* uvm_pglistfree().
*/
RBT_FOREACH(pg, uvm_objtree, &uobj->memt) {
/*
* clear PG_TABLED so we don't do work to remove
* this pg from the uobj we are throwing away
*/
atomic_clearbits_int(&pg->pg_flags, PG_TABLED);
uvm_lock_pageq();
uvm_pageclean(pg);
uvm_unlock_pageq();
TAILQ_INSERT_TAIL(&pgl, pg, pageq);
}
uvm_pglistfree(&pgl);
}
/* $OpenBSD: tcp_output.c,v 1.133 2022/09/03 19:22:19 bluhm Exp $ */
/* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include "pf.h"
#include "stoeplitz.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/route.h>
#if NPF > 0
#include <net/pfvar.h>
#endif
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>
#ifdef notyet
extern struct mbuf *m_copypack();
#endif
extern int tcprexmtthresh;
#ifdef TCP_SACK_DEBUG
void tcp_print_holes(struct tcpcb *tp);
void
tcp_print_holes(struct tcpcb *tp)
{
struct sackhole *p = tp->snd_holes;
if (p == NULL)
return;
printf("Hole report: start--end dups rxmit\n");
while (p) {
printf("%x--%x d %d r %x\n", p->start, p->end, p->dups,
p->rxmit);
p = p->next;
}
printf("\n");
}
#endif /* TCP_SACK_DEBUG */
/*
* Returns pointer to a sackhole if there are any pending retransmissions;
* NULL otherwise.
*/
struct sackhole *
tcp_sack_output(struct tcpcb *tp)
{
struct sackhole *p;
if (!tp->sack_enable)
return (NULL);
p = tp->snd_holes;
while (p) {
if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) {
if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
p = p->next;
continue;
}
#ifdef TCP_SACK_DEBUG
if (p)
tcp_print_holes(tp);
#endif
return (p);
}
p = p->next;
}
return (NULL);
}
/*
* After a timeout, the SACK list may be rebuilt. This SACK information
* should be used to avoid retransmitting SACKed data. This function
* traverses the SACK list to see if snd_nxt should be moved forward.
*/
void
tcp_sack_adjust(struct tcpcb *tp)
{
struct sackhole *cur = tp->snd_holes;
if (cur == NULL)
return; /* No holes */
if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
return; /* We're already beyond any SACKed blocks */
/*
* Two cases for which we want to advance snd_nxt:
* i) snd_nxt lies between end of one hole and beginning of another
* ii) snd_nxt lies between end of last hole and rcv_lastsack
*/
while (cur->next) {
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
if (SEQ_GEQ(tp->snd_nxt, cur->next->start))
cur = cur->next;
else {
tp->snd_nxt = cur->next->start;
return;
}
}
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
tp->snd_nxt = tp->rcv_lastsack;
return;
}
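/*
 * Worked example (illustrative, with made-up sequence numbers): suppose the
 * scoreboard holds two holes, [1000,2000) and [5000,6000), rcv_lastsack is
 * 8000 and snd_nxt is 2000.  The range [2000,5000) has been SACKed, so the
 * loop in tcp_sack_adjust() above advances snd_nxt to 5000, the start of
 * the next hole, instead of retransmitting already-delivered data.
 */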
/*
* Tcp output routine: figure out what should be sent and send it.
*/
int
tcp_output(struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
long len, win, txmaxseg;
int off, flags, error;
struct mbuf *m;
struct tcphdr *th;
u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))];
u_char *opt = (u_char *)optbuf;
unsigned int optlen, hdrlen, packetlen;
int idle, sendalot = 0;
int i, sack_rxmit = 0;
struct sackhole *p;
uint32_t now;
#ifdef TCP_SIGNATURE
unsigned int sigoff;
#endif /* TCP_SIGNATURE */
#ifdef TCP_ECN
int needect;
#endif
if (tp->t_flags & TF_BLOCKOUTPUT) {
tp->t_flags |= TF_NEEDOUTPUT;
return (0);
} else
tp->t_flags &= ~TF_NEEDOUTPUT;
#if defined(TCP_SIGNATURE) && defined(DIAGNOSTIC)
if (tp->sack_enable && (tp->t_flags & TF_SIGNATURE))
return (EINVAL);
#endif /* defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */
now = READ_ONCE(tcp_now);
/*
* Determine length of data that should be transmitted,
* and flags that will be used.
* If there is some data or critical controls (SYN, RST)
* to send, then transmit; otherwise, investigate further.
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
if (idle && (now - tp->t_rcvtime) >= tp->t_rxtcur)
/*
* We have been idle for "a while" and no acks are
* expected to clock out any data we send --
* slow start to get ack "clock" running again.
*/
tp->snd_cwnd = 2 * tp->t_maxseg;
/* remember 'idle' for next invocation of tcp_output */
if (idle && soissending(so)) {
tp->t_flags |= TF_LASTIDLE;
idle = 0;
} else
tp->t_flags &= ~TF_LASTIDLE;
again:
/*
* If we've recently taken a timeout, snd_max will be greater than
* snd_nxt. There may be SACK information that allows us to avoid
* resending already delivered data. Adjust snd_nxt accordingly.
*/
if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
off = tp->snd_nxt - tp->snd_una;
win = ulmin(tp->snd_wnd, tp->snd_cwnd);
flags = tcp_outflags[tp->t_state];
/*
* Send any SACK-generated retransmissions. If we're explicitly trying
* to send out new data (when sendalot is 1), bypass this function.
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
* we're replacing a (future) new transmission with a retransmission
* now, and we previously incremented snd_cwnd in tcp_input().
*/
if (tp->sack_enable && !sendalot) {
if (tp->t_dupacks >= tcprexmtthresh &&
(p = tcp_sack_output(tp))) {
off = p->rxmit - tp->snd_una;
sack_rxmit = 1;
/* Coalesce holes into a single retransmission */
len = min(tp->t_maxseg, p->end - p->rxmit);
if (SEQ_LT(tp->snd_una, tp->snd_last))
tp->snd_cwnd -= tp->t_maxseg;
}
}
sendalot = 0;
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
* and timer expired, we will send what we can
* and go to transmit state.
*/
if (tp->t_force) {
if (win == 0) {
/*
* If we still have some data to send, then
* clear the FIN bit. Usually this would
* happen below when it realizes that we
* aren't sending all the data. However,
* if we have exactly 1 byte of unsent data,
* then it won't clear the FIN bit below,
* and if we are in persist state, we wind
* up sending the packet without recording
* that we sent the FIN bit.
*
* We can't just blindly clear the FIN bit,
* because if we don't have any more data
* to send then the probe will be the FIN
* itself.
*/
if (off < so->so_snd.sb_cc)
flags &= ~TH_FIN;
win = 1;
} else {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
}
if (!sack_rxmit) {
len = ulmin(so->so_snd.sb_cc, win) - off;
}
if (len < 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
* len will be -1. Otherwise, window shrank
* after we sent into it. If window shrank to 0,
* cancel pending retransmit, pull snd_nxt back
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
* close completely, just wait for an ACK.
*/
len = 0;
if (win == 0) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
tcp_setpersist(tp);
}
}
/*
* Never send more than half a buffer full. This ensures that we can
* always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
* therefore acks will never be delayed unless we run out of data to
* transmit.
*/
txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
if (len > txmaxseg) {
len = txmaxseg;
sendalot = 1;
}
if (off + len < so->so_snd.sb_cc)
flags &= ~TH_FIN;
win = sbspace(so, &so->so_rcv);
/*
* Sender silly window avoidance.  Send if the connection is idle
* and we can send all of our data, if we can send a full
* maximum-size segment, or if we are being forced to send;
* otherwise don't bother.
* If the peer's buffer is tiny, then send
* when the window is at least half open.
* If we are retransmitting (possibly after the persist timer forced
* us to send into a small window), then we must resend.
*/
if (len) {
if (len == txmaxseg)
goto send;
if ((idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && !soissending(so) &&
(tp->t_flags & TF_NOPUSH) == 0)
goto send;
if (tp->t_force)
goto send;
if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max))
goto send;
if (sack_rxmit)
goto send;
}
/*
* Compare available window to amount of window
* known to peer (as advertised window less
* next expected input). If the difference is at least two
* max size segments, or at least 50% of the maximum possible
* window, then want to send a window update to peer.
*/
if (win > 0) {
/*
* "adv" is the amount we can increase the window,
* taking into account that we are limited by
* TCP_MAXWIN << tp->rcv_scale.
*/
long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) -
(tp->rcv_adv - tp->rcv_nxt);
if (adv >= (long) (2 * tp->t_maxseg))
goto send;
if (2 * adv >= (long) so->so_rcv.sb_hiwat)
goto send;
}
/*
* Send if we owe peer an ACK.
*/
if (tp->t_flags & TF_ACKNOW)
goto send;
if (flags & (TH_SYN|TH_RST))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
/*
* If our state indicates that FIN should be sent
* and we have not yet done so, or we're retransmitting the FIN,
* then we need to send.
*/
if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
/*
* In SACK, it is possible for tcp_output to fail to send a segment
* after the retransmission timer has been turned off. Make sure
* that the retransmission timer is set.
*/
if (SEQ_GT(tp->snd_max, tp->snd_una) &&
TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
return (0);
}
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to ensure receipt of window
* updates. The three ``states'' for the output side are:
* idle not doing retransmits or persists
* persisting to move a small or zero window
* (re)transmitting and thereby not persisting
*
* tp->t_timer[TCPT_PERSIST]
* is set when we are in persist state.
* tp->t_force
* is set when we are called to send a persist packet.
* tp->t_timer[TCPT_REXMT]
* is set when we are retransmitting
* The output side is idle when both timers are zero.
*
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state.
* If nothing happens soon, send when timer expires:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
/*
* No reason to send a segment, just return.
*/
return (0);
send:
/*
* Before ESTABLISHED, force sending of initial options
* unless TCP set not to do any options.
* NOTE: we assume that the IP/TCP header plus TCP options
* always fit in a single mbuf, leaving room for a maximum
* link header, i.e.
* max_linkhdr + sizeof(network header) + sizeof(struct tcphdr) +
* optlen <= MHLEN
*/
optlen = 0;
switch (tp->pf) {
case 0: /*default to PF_INET*/
case PF_INET:
hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
break;
#ifdef INET6
case PF_INET6:
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
break;
#endif /* INET6 */
default:
return (EPFNOSUPPORT);
}
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
if ((tp->t_flags & TF_NOOPT) == 0) {
u_int16_t mss;
opt[0] = TCPOPT_MAXSEG;
opt[1] = 4;
mss = htons((u_int16_t) tcp_mss(tp, 0));
memcpy(opt + 2, &mss, sizeof(mss));
optlen = 4;
if (flags & TH_ACK)
tcp_mss_update(tp);
/*
* If this is the first SYN of connection (not a SYN
* ACK), include SACK_PERMIT_HDR option. If this is a
* SYN ACK, include SACK_PERMIT_HDR option if peer has
* already done so.
*/
if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_SACK_PERMIT))) {
*((u_int32_t *) (opt + optlen)) =
htonl(TCPOPT_SACK_PERMIT_HDR);
optlen += 4;
}
if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE))) {
*((u_int32_t *) (opt + optlen)) = htonl(
TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 |
TCPOLEN_WINDOW << 8 |
tp->request_r_scale);
optlen += 4;
}
}
}
/*
* Send a timestamp and echo-reply if this is a SYN and our side
* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
* and our peer have sent timestamps in our SYN's.
*/
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(flags & TH_RST) == 0 &&
((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
(tp->t_flags & TF_RCVD_TSTMP))) {
u_int32_t *lp = (u_int32_t *)(opt + optlen);
/* Form timestamp option as shown in appendix A of RFC 1323. */
*lp++ = htonl(TCPOPT_TSTAMP_HDR);
*lp++ = htonl(now + tp->ts_modulate);
*lp = htonl(tp->ts_recent);
optlen += TCPOLEN_TSTAMP_APPA;
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0)
tp->rfbuf_ts = now;
}
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE) {
u_int8_t *bp = (u_int8_t *)(opt + optlen);
/* Send signature option */
*(bp++) = TCPOPT_SIGNATURE;
*(bp++) = TCPOLEN_SIGNATURE;
sigoff = optlen + 2;
{
unsigned int i;
for (i = 0; i < 16; i++)
*(bp++) = 0;
}
/* Pad options list to the next 32 bit boundary and
* terminate it.
*/
*bp++ = TCPOPT_NOP;
*bp++ = TCPOPT_NOP;
optlen += TCPOLEN_SIGLEN;
}
#endif /* TCP_SIGNATURE */
/*
* Send SACKs if necessary. This should be the last option processed.
* Only as many SACKs are sent as are permitted by the maximum options
* size. No more than three SACKs are sent.
*/
if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
(tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
tp->rcv_numsacks) {
u_int32_t *lp = (u_int32_t *)(opt + optlen);
u_int32_t *olp = lp++;
int count = 0; /* actual number of SACKs inserted */
int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
tcpstat_inc(tcps_sack_snd_opts);
maxsack = min(maxsack, TCP_MAX_SACK);
for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
struct sackblk sack = tp->sackblks[i];
if (sack.start == 0 && sack.end == 0)
continue;
*lp++ = htonl(sack.start);
*lp++ = htonl(sack.end);
count++;
}
*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
}
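/*
 * Worked example (illustrative): TCP options occupy at most 40 bytes
 * (MAX_TCPOPTLEN).  With a 12-byte timestamp option already present
 * (optlen == 12), maxsack = (40 - (12 + 4)) / 8 = 3, so at most three
 * 8-byte SACK blocks fit alongside the 4 bytes of NOPs and SACK header.
 */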
#ifdef DIAGNOSTIC
if (optlen > MAX_TCPOPTLEN)
panic("tcp_output: options too long");
#endif /* DIAGNOSTIC */
hdrlen += optlen;
/*
* Adjust data length if insertion of options will
* bump the packet length beyond the t_maxopd length.
*/
if (len > tp->t_maxopd - optlen) {
len = tp->t_maxopd - optlen;
sendalot = 1;
flags &= ~TH_FIN;
}
#ifdef DIAGNOSTIC
if (max_linkhdr + hdrlen > MCLBYTES)
panic("tcphdr too big");
#endif
/*
* Grab a header mbuf, attaching a copy of data to
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
if (len) {
if (tp->t_force && len == 1)
tcpstat_inc(tcps_sndprobe);
else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
tcpstat_pkt(tcps_sndrexmitpack, tcps_sndrexmitbyte,
len);
tp->t_sndrexmitpack++;
} else {
tcpstat_pkt(tcps_sndpack, tcps_sndbyte, len);
}
#ifdef notyet
if ((m = m_copypack(so->so_snd.sb_mb, off,
(int)len, max_linkhdr + hdrlen)) == 0) {
error = ENOBUFS;
goto out;
}
/*
* m_copypack left space for our hdr; use it.
*/
m->m_len += hdrlen;
m->m_data -= hdrlen;
#else
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL) {
error = ENOBUFS;
goto out;
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
if (len <= m_trailingspace(m)) {
m_copydata(so->so_snd.sb_mb, off, (int) len,
mtod(m, caddr_t) + hdrlen);
m->m_len += len;
} else {
m->m_next = m_copym(so->so_snd.sb_mb, off, (int) len,
M_NOWAIT);
if (m->m_next == 0) {
(void) m_free(m);
error = ENOBUFS;
goto out;
}
}
if (so->so_snd.sb_mb->m_flags & M_PKTHDR)
m->m_pkthdr.ph_loopcnt =
so->so_snd.sb_mb->m_pkthdr.ph_loopcnt;
#endif
/*
* If we're sending everything we've got, set PUSH.
* (This will keep happy those implementations which only
* give data to the user when a buffer fills or
* a PUSH comes in.)
*/
if (off + len == so->so_snd.sb_cc && !soissending(so))
flags |= TH_PUSH;
tp->t_sndtime = now;
} else {
if (tp->t_flags & TF_ACKNOW)
tcpstat_inc(tcps_sndacks);
else if (flags & (TH_SYN|TH_FIN|TH_RST))
tcpstat_inc(tcps_sndctrl);
else if (SEQ_GT(tp->snd_up, tp->snd_una))
tcpstat_inc(tcps_sndurg);
else
tcpstat_inc(tcps_sndwinup);
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL) {
error = ENOBUFS;
goto out;
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
m->m_pkthdr.ph_ifidx = 0;
m->m_pkthdr.len = hdrlen + len;
if (!tp->t_template)
panic("tcp_output");
#ifdef DIAGNOSTIC
if (tp->t_template->m_len != hdrlen - optlen)
panic("tcp_output: template len != hdrlen - optlen");
#endif /* DIAGNOSTIC */
memcpy(mtod(m, caddr_t), mtod(tp->t_template, caddr_t),
tp->t_template->m_len);
th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len -
sizeof(struct tcphdr));
/*
* Fill in fields, remembering maximum advertised
* window for use in delaying messages about window sizes.
* If resending a FIN, be sure not to use a new sequence number.
*/
if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
(tp->snd_nxt == tp->snd_max))
tp->snd_nxt--;
/*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only
* packets, we do not want the sequence number of the
* retransmitted packet, we want the sequence number
* of the next unsent octet. So, if there is no data
* (and no SYN or FIN), use snd_max instead of snd_nxt
* when filling in ti_seq. But if we are in persist
* state, snd_max might reflect one byte beyond the
* right edge of the window, so use snd_nxt in that
* case, since we know we aren't doing a retransmission.
* (retransmit and persist are mutually exclusive...)
*/
if (len || (flags & (TH_SYN|TH_FIN)) || TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
if (sack_rxmit) {
/*
* If sendalot was turned on (due to option stuffing), turn it
* off. Properly set th_seq field. Advance the ret'x pointer
* by len.
*/
if (sendalot)
sendalot = 0;
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
tcpstat_pkt(tcps_sack_rexmits, tcps_sack_rexmit_bytes, len);
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
memcpy(th + 1, opt, optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
#ifdef TCP_ECN
if (tcp_do_ecn) {
/*
* if we have received congestion experienced segs,
* set ECE bit.
*/
if (tp->t_flags & TF_RCVD_CE) {
flags |= TH_ECE;
tcpstat_inc(tcps_ecn_sndece);
}
if (!(tp->t_flags & TF_DISABLE_ECN)) {
/*
* if this is a SYN seg, set ECE and CWR.
* set only ECE for SYN-ACK if peer supports ECN.
*/
if ((flags & (TH_SYN|TH_ACK)) == TH_SYN)
flags |= (TH_ECE|TH_CWR);
else if ((tp->t_flags & TF_ECN_PERMIT) &&
(flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK))
flags |= TH_ECE;
}
/*
* if we have reduced the congestion window, notify
* the peer by setting CWR bit.
*/
if ((tp->t_flags & TF_ECN_PERMIT) &&
(tp->t_flags & TF_SEND_CWR)) {
flags |= TH_CWR;
tp->t_flags &= ~TF_SEND_CWR;
tcpstat_inc(tcps_ecn_sndcwr);
}
}
#endif
th->th_flags = flags;
/*
* Calculate receive window. Don't shrink window,
* but avoid silly window syndrome.
*/
if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
win = 0;
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
if (flags & TH_RST)
win = 0;
th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
if (th->th_win == 0)
tp->t_sndzerowin++;
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
u_int32_t urp = tp->snd_up - tp->snd_nxt;
if (urp > IP_MAXPACKET)
urp = IP_MAXPACKET;
th->th_urp = htons((u_int16_t)urp);
th->th_flags |= TH_URG;
} else
/*
* If no urgent pointer to send, then we pull
* the urgent pointer to the left edge of the send window
* so that it doesn't drift into the send window on sequence
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE) {
int iphlen;
union sockaddr_union src, dst;
struct tdb *tdb;
bzero(&src, sizeof(union sockaddr_union));
bzero(&dst, sizeof(union sockaddr_union));
switch (tp->pf) {
case 0: /*default to PF_INET*/
case AF_INET:
iphlen = sizeof(struct ip);
src.sa.sa_len = sizeof(struct sockaddr_in);
src.sa.sa_family = AF_INET;
src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
dst.sa.sa_len = sizeof(struct sockaddr_in);
dst.sa.sa_family = AF_INET;
dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
break;
#ifdef INET6
case AF_INET6:
iphlen = sizeof(struct ip6_hdr);
src.sa.sa_len = sizeof(struct sockaddr_in6);
src.sa.sa_family = AF_INET6;
src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
dst.sa.sa_len = sizeof(struct sockaddr_in6);
dst.sa.sa_family = AF_INET6;
dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
break;
#endif /* INET6 */
}
tdb = gettdbbysrcdst(rtable_l2(tp->t_inpcb->inp_rtableid),
0, &src, &dst, IPPROTO_TCP);
if (tdb == NULL) {
m_freem(m);
return (EPERM);
}
if (tcp_signature(tdb, tp->pf, m, th, iphlen, 0,
mtod(m, caddr_t) + hdrlen - optlen + sigoff) < 0) {
m_freem(m);
tdb_unref(tdb);
return (EINVAL);
}
tdb_unref(tdb);
}
#endif /* TCP_SIGNATURE */
/* Defer checksumming until later (ip_output() or hardware) */
m->m_pkthdr.csum_flags |= M_TCP_CSUM_OUT;
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
*/
if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
tcp_seq startseq = tp->snd_nxt;
/*
* Advance snd_nxt over sequence space of this segment.
*/
if (flags & (TH_SYN|TH_FIN)) {
if (flags & TH_SYN)
tp->snd_nxt++;
if (flags & TH_FIN) {
tp->snd_nxt++;
tp->t_flags |= TF_SENTFIN;
}
}
if (tp->sack_enable) {
if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
goto timer;
}
}
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
tp->t_rtttime = now;
tp->t_rtseq = startseq;
tcpstat_inc(tcps_segstimed);
}
}
/*
* Set retransmit timer if not currently set,
* and not doing an ack or a keep-alive probe.
* Initial value for retransmit timer is smoothed
* round-trip time + 2 * round-trip time variance.
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
timer:
if (tp->sack_enable && sack_rxmit && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
tp->snd_nxt != tp->snd_max) {
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
}
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
tp->snd_nxt != tp->snd_una) {
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
}
if (len == 0 && so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
/*
* Avoid a situation where we do not set persist timer
* after a zero window condition. For example:
* 1) A -> B: packet with enough data to fill the window
* 2) B -> A: ACK for #1 + new data (0 window
* advertisement)
* 3) A -> B: ACK for #2, 0 len packet
*
* In this case, A will not activate the persist timer,
* because it chose to send a packet. Unless tcp_output
* is called for some other reason (delayed ack timer,
* another input packet from B, socket syscall), A will
* not send zero window probes.
*
* So, if you send a 0-length packet, but there is data
* in the socket buffer, and neither the rexmt or
* persist timer is already set, then activate the
* persist timer.
*/
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
} else
if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
tp->snd_max = tp->snd_nxt + len;
tcp_update_sndspace(tp);
/*
* Trace.
*/
if (so->so_options & SO_DEBUG)
tcp_trace(TA_OUTPUT, tp->t_state, tp, tp, mtod(m, caddr_t), 0,
len);
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
* to handle ttl and tos; we could keep them in
* the template, but need a way to checksum without them.
*/
#ifdef TCP_ECN
/*
* if peer is ECN capable, set the ECT bit in the IP header.
* but don't set ECT for a pure ack, a retransmit or a window probe.
*/
needect = 0;
if (tcp_do_ecn && (tp->t_flags & TF_ECN_PERMIT)) {
if (len == 0 || SEQ_LT(tp->snd_nxt, tp->snd_max) ||
(tp->t_force && len == 1)) {
/* don't set ECT */
} else {
needect = 1;
tcpstat_inc(tcps_ecn_sndect);
}
}
#endif
/* force routing table */
m->m_pkthdr.ph_rtableid = tp->t_inpcb->inp_rtableid;
#if NPF > 0
pf_mbuf_link_inpcb(m, tp->t_inpcb);
#endif
switch (tp->pf) {
case 0: /*default to PF_INET*/
case AF_INET:
{
struct ip *ip;
ip = mtod(m, struct ip *);
ip->ip_len = htons(m->m_pkthdr.len);
packetlen = m->m_pkthdr.len;
ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
#ifdef TCP_ECN
if (needect)
ip->ip_tos |= IPTOS_ECN_ECT0;
#endif
}
#if NSTOEPLITZ > 0
m->m_pkthdr.ph_flowid = tp->t_inpcb->inp_flowid;
SET(m->m_pkthdr.csum_flags, M_FLOWID);
#endif
error = ip_output(m, tp->t_inpcb->inp_options,
&tp->t_inpcb->inp_route,
(ip_mtudisc ? IP_MTUDISC : 0), NULL, tp->t_inpcb, 0);
break;
#ifdef INET6
case AF_INET6:
{
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_plen = m->m_pkthdr.len -
sizeof(struct ip6_hdr);
packetlen = m->m_pkthdr.len;
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb);
#ifdef TCP_ECN
if (needect)
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
#endif
}
error = ip6_output(m, tp->t_inpcb->inp_outputopts6,
&tp->t_inpcb->inp_route6,
0, NULL, tp->t_inpcb);
break;
#endif /* INET6 */
}
if (error) {
out:
if (error == ENOBUFS) {
/*
* If the interface queue is full, or IP cannot
* get an mbuf, trigger TCP slow start.
*/
tp->snd_cwnd = tp->t_maxseg;
return (0);
}
if (error == EMSGSIZE) {
/*
* ip_output() will have already fixed the route
* for us. tcp_mtudisc() will, as its last action,
* initiate retransmission, so it is important to
* not do so here.
*/
tcp_mtudisc(tp->t_inpcb, -1);
return (0);
}
if ((error == EHOSTUNREACH || error == ENETDOWN) &&
TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_softerror = error;
return (0);
}
/* Restart the delayed ACK timer, if necessary. */
if (TCP_TIMER_ISARMED(tp, TCPT_DELACK))
TCP_TIMER_ARM_MSEC(tp, TCPT_DELACK, tcp_delack_msecs);
return (error);
}
if (packetlen > tp->t_pmtud_mtu_sent)
tp->t_pmtud_mtu_sent = packetlen;
tcpstat_inc(tcps_sndtotal);
if (TCP_TIMER_ISARMED(tp, TCPT_DELACK))
tcpstat_inc(tcps_delack);
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
* Any pending ACK has now been sent.
*/
if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + win;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_sndacktime = now;
tp->t_flags &= ~TF_ACKNOW;
TCP_TIMER_DISARM(tp, TCPT_DELACK);
if (sendalot)
goto again;
return (0);
}
void
tcp_setpersist(struct tcpcb *tp)
{
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + TCP_RTT_BASE_SHIFT);
int nticks;
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
panic("tcp_output REXMT");
/*
* Start/restart persistence timer.
*/
if (t < tp->t_rttmin)
t = tp->t_rttmin;
TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
TCPTV_PERSMIN, TCPTV_PERSMAX);
TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}
/* $OpenBSD: udp6_output.c,v 1.59 2022/02/22 01:35:41 guenther Exp $ */
/* $KAME: udp6_output.c,v 1.21 2001/02/07 11:51:54 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "pf.h"
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#if NPF > 0
#include <net/pfvar.h>
#endif
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
*/
int
udp6_output(struct inpcb *in6p, struct mbuf *m, struct mbuf *addr6,
struct mbuf *control)
{
u_int32_t ulen = m->m_pkthdr.len;
u_int32_t plen = sizeof(struct udphdr) + ulen;
int error = 0, priv = 0, hlen, flags;
struct ip6_hdr *ip6;
struct udphdr *udp6;
struct in6_addr *laddr, *faddr;
struct ip6_pktopts *optp, opt;
struct sockaddr_in6 tmp, valid;
struct proc *p = curproc; /* XXX */
u_short fport;
if ((in6p->inp_socket->so_state & SS_PRIV) != 0)
priv = 1;
if (control) {
if ((error = ip6_setpktopts(control, &opt,
in6p->inp_outputopts6, priv, IPPROTO_UDP)) != 0)
goto release;
optp = &opt;
} else
optp = in6p->inp_outputopts6;
if (addr6) {
struct sockaddr_in6 *sin6;
if ((error = in6_nam2sin6(addr6, &sin6)))
goto release;
if (sin6->sin6_port == 0) {
error = EADDRNOTAVAIL;
goto release;
}
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
error = EADDRNOTAVAIL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->inp_faddr6)) {
error = EISCONN;
goto release;
}
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
faddr = &sin6->sin6_addr;
fport = sin6->sin6_port; /* allow 0 port */
/* KAME hack: embed scopeid */
if (in6_embedscope(&sin6->sin6_addr, sin6, in6p) != 0) {
error = EINVAL;
goto release;
}
error = in6_pcbselsrc(&laddr, sin6, in6p, optp);
if (error)
goto release;
if (in6p->inp_lport == 0){
error = in_pcbbind(in6p, NULL, p);
if (error)
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->inp_laddr6) &&
!IN6_ARE_ADDR_EQUAL(&in6p->inp_laddr6, laddr)) {
valid.sin6_addr = *laddr;
valid.sin6_port = in6p->inp_lport;
valid.sin6_scope_id = 0;
valid.sin6_family = AF_INET6;
valid.sin6_len = sizeof(valid);
error = in6_pcbaddrisavail(in6p, &valid, 0, p);
if (error)
goto release;
}
} else {
if (IN6_IS_ADDR_UNSPECIFIED(&in6p->inp_faddr6)) {
error = ENOTCONN;
goto release;
}
laddr = &in6p->inp_laddr6;
faddr = &in6p->inp_faddr6;
fport = in6p->inp_fport;
}
hlen = sizeof(struct ip6_hdr);
/*
* Calculate data length and get a mbuf
* for UDP and IP6 headers.
*/
M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto releaseopt;
}
/*
* Stuff checksum and output datagram.
*/
udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
udp6->uh_sport = in6p->inp_lport; /* lport is always set in the PCB */
udp6->uh_dport = fport;
if (plen <= 0xffff)
udp6->uh_ulen = htons((u_short)plen);
else
udp6->uh_ulen = 0;
udp6->uh_sum = 0;
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = in6p->inp_flowinfo & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
#if 0 /* ip6_plen will be filled in ip6_output. */
ip6->ip6_plen = htons((u_short)plen);
#endif
ip6->ip6_nxt = IPPROTO_UDP;
ip6->ip6_hlim = in6_selecthlim(in6p);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT;
flags = 0;
if (in6p->inp_flags & IN6P_MINMTU)
flags |= IPV6_MINMTU;
udpstat_inc(udps_opackets);
/* force routing table */
m->m_pkthdr.ph_rtableid = in6p->inp_rtableid;
#if NPF > 0
if (in6p->inp_socket->so_state & SS_ISCONNECTED)
pf_mbuf_link_inpcb(m, in6p);
#endif
error = ip6_output(m, optp, &in6p->inp_route6,
flags, in6p->inp_moptions6, in6p);
goto releaseopt;
release:
m_freem(m);
releaseopt:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return (error);
}
/* $OpenBSD: kern_synch.c,v 1.190 2022/08/14 01:58:27 jsg Exp $ */
/* $NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.6 (Berkeley) 1/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/sched.h>
#include <sys/timeout.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/refcnt.h>
#include <sys/atomic.h>
#include <sys/tracepoint.h>
#include <ddb/db_output.h>
#include <machine/spinlock.h>
#ifdef DIAGNOSTIC
#include <sys/syslog.h>
#endif
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
int sleep_signal_check(void);
int thrsleep(struct proc *, struct sys___thrsleep_args *);
int thrsleep_unlock(void *);
/*
* We're only looking at 7 bits of the address; everything is
* aligned to 4, lots of things are aligned to greater powers
* of 2. Shift right by 8, i.e. drop the bottom 256 worth.
*/
#define TABLESIZE 128
#define LOOKUP(x) (((long)(x) >> 8) & (TABLESIZE - 1))
TAILQ_HEAD(slpque,proc) slpque[TABLESIZE];
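/*
* Illustrative sketch (not part of the original file): how LOOKUP()
* picks a sleep queue bucket.  The channel below is made up purely for
* demonstration.
*/
#if 0
static int
example_lookup_bucket(void)
{
/* any wait channel is just an address; bits 8..14 select a bucket */
static int example_chan;
return LOOKUP(&example_chan);	/* 0..TABLESIZE-1 */
}
#endif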
void
sleep_queue_init(void)
{
int i;
for (i = 0; i < TABLESIZE; i++)
TAILQ_INIT(&slpque[i]);
}
/*
* Global sleep channel for threads that do not want to
* receive wakeup(9) broadcasts.
*/
int nowake;
/*
* During autoconfiguration or after a panic, a sleep will simply
* lower the priority briefly to allow interrupts, then return.
* The priority to be used (safepri) is machine-dependent, thus this
* value is initialized and maintained in the machine-dependent layers.
* This priority will typically be 0, or the lowest priority
* that is safe for use on the interrupt stack; it can be made
* higher to block network software interrupts after panics.
*/
extern int safepri;
/*
* General sleep call. Suspends the current process until a wakeup is
* performed on the specified identifier. The process will then be made
* runnable with the specified priority. Sleeps at most timo/hz seconds
* (0 means no timeout). If pri includes PCATCH flag, signals are checked
* before and after sleeping, else signals are not checked. Returns 0 if
* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
* signal needs to be delivered, ERESTART is returned if the current system
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal.
*/
int
tsleep(const volatile void *ident, int priority, const char *wmesg, int timo)
{
struct sleep_state sls;
#ifdef MULTIPROCESSOR
int hold_count;
#endif
KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
#ifdef MULTIPROCESSOR
KASSERT(timo || _kernel_lock_held());
#endif
#ifdef DDB
if (cold == 2)
db_stack_dump();
#endif
if (cold || panicstr) {
int s;
/*
* After a panic, or during autoconfiguration,
* just give interrupts a chance, then just return;
* don't run any other procs or panic below,
* in case this is the idle process and already asleep.
*/
s = splhigh();
splx(safepri);
#ifdef MULTIPROCESSOR
if (_kernel_lock_held()) {
hold_count = __mp_release_all(&kernel_lock);
__mp_acquire_count(&kernel_lock, hold_count);
}
#endif
splx(s);
return (0);
}
sleep_setup(&sls, ident, priority, wmesg, timo);
return sleep_finish(&sls, 1);
}
int
tsleep_nsec(const volatile void *ident, int priority, const char *wmesg,
uint64_t nsecs)
{
uint64_t to_ticks;
if (nsecs == INFSLP)
return tsleep(ident, priority, wmesg, 0);
#ifdef DIAGNOSTIC
if (nsecs == 0) {
log(LOG_WARNING,
"%s: %s[%d]: %s: trying to sleep zero nanoseconds\n",
__func__, curproc->p_p->ps_comm, curproc->p_p->ps_pid,
wmesg);
}
#endif
/*
* We want to sleep at least nsecs nanoseconds worth of ticks.
*
* - Clamp nsecs to prevent arithmetic overflow.
*
* - Round nsecs up to account for any nanoseconds that do not
* divide evenly into tick_nsec, otherwise we'll lose them to
* integer division in the next step. We add (tick_nsec - 1)
* to keep from introducing a spurious tick if there are no
* such nanoseconds, i.e. nsecs % tick_nsec == 0.
*
* - Divide the rounded value to a count of ticks. We divide
* by (tick_nsec + 1) to discard the extra tick introduced if,
* before rounding, nsecs % tick_nsec == 1.
*
* - Finally, add a tick to the result. We need to wait out
* the current tick before we can begin counting our interval,
* as we do not know how much time has elapsed since the
* current tick began.
*/
nsecs = MIN(nsecs, UINT64_MAX - tick_nsec);
to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
if (to_ticks > INT_MAX)
to_ticks = INT_MAX;
return tsleep(ident, priority, wmesg, (int)to_ticks);
}
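/*
* Worked example of the conversion above (illustrative; HZ and the
* requested interval are assumed values, not taken from this file).
*/
#if 0
static int
example_nsec_to_ticks(void)
{
/* with HZ=100, tick_nsec is 10000000; request at least 25ms */
uint64_t nsecs = 25000000;
/* (25000000 + 9999999) / 10000001 = 3 whole ticks, plus one for
* the partially elapsed current tick => 4 ticks (30-40ms). */
return (int)((nsecs + 10000000 - 1) / (10000000 + 1) + 1);
}
#endif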
/*
* Same as tsleep, but if we have a mutex provided, then once we've
* entered the sleep queue we drop the mutex. After sleeping we re-lock.
*/
int
msleep(const volatile void *ident, struct mutex *mtx, int priority,
const char *wmesg, int timo)
{
struct sleep_state sls;
int error, spl;
#ifdef MULTIPROCESSOR
int hold_count;
#endif
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
KASSERT(mtx != NULL);
#ifdef DDB
if (cold == 2)
db_stack_dump();
#endif
if (cold || panicstr) {
/*
* After a panic, or during autoconfiguration,
* just give interrupts a chance, then just return;
* don't run any other procs or panic below,
* in case this is the idle process and already asleep.
*/
spl = MUTEX_OLDIPL(mtx);
MUTEX_OLDIPL(mtx) = safepri;
mtx_leave(mtx);
#ifdef MULTIPROCESSOR
if (_kernel_lock_held()) {
hold_count = __mp_release_all(&kernel_lock);
__mp_acquire_count(&kernel_lock, hold_count);
}
#endif
if ((priority & PNORELOCK) == 0) {
mtx_enter(mtx);
MUTEX_OLDIPL(mtx) = spl;
} else
splx(spl);
return (0);
}
sleep_setup(&sls, ident, priority, wmesg, timo);
/* XXX - We need to make sure that the mutex doesn't
* unblock splsched. This can be made a bit more
* correct when the sched_lock is a mutex.
*/
spl = MUTEX_OLDIPL(mtx);
MUTEX_OLDIPL(mtx) = splsched();
mtx_leave(mtx);
/* signal may stop the process, release mutex before that */
error = sleep_finish(&sls, 1);
if ((priority & PNORELOCK) == 0) {
mtx_enter(mtx);
MUTEX_OLDIPL(mtx) = spl; /* put the ipl back */
} else
splx(spl);
return error;
}
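/*
* Illustrative usage sketch (not part of the original file): a typical
* msleep_nsec() consumer loop.  The mutex, flag and wait message names
* are assumptions made up for the example; MUTEX_INITIALIZER() and
* SEC_TO_NSEC() are assumed to come from sys/mutex.h and sys/time.h.
*/
#if 0
struct mutex example_mtx = MUTEX_INITIALIZER(IPL_NONE);
int example_ready;

int
example_wait(void)
{
int error = 0;

mtx_enter(&example_mtx);
while (!example_ready && error == 0) {
/* drops example_mtx while asleep, re-locks on return */
error = msleep_nsec(&example_ready, &example_mtx,
    PWAIT | PCATCH, "exwait", SEC_TO_NSEC(5));
}
mtx_leave(&example_mtx);
return error;	/* 0, EINTR, ERESTART or EWOULDBLOCK */
}
#endif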
int
msleep_nsec(const volatile void *ident, struct mutex *mtx, int priority,
const char *wmesg, uint64_t nsecs)
{
uint64_t to_ticks;
if (nsecs == INFSLP)
return msleep(ident, mtx, priority, wmesg, 0);
#ifdef DIAGNOSTIC
if (nsecs == 0) {
log(LOG_WARNING,
"%s: %s[%d]: %s: trying to sleep zero nanoseconds\n",
__func__, curproc->p_p->ps_comm, curproc->p_p->ps_pid,
wmesg);
}
#endif
nsecs = MIN(nsecs, UINT64_MAX - tick_nsec);
to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
if (to_ticks > INT_MAX)
to_ticks = INT_MAX;
return msleep(ident, mtx, priority, wmesg, (int)to_ticks);
}
/*
* Same as tsleep, but if we have a rwlock provided, then once we've
* entered the sleep queue we drop it. After sleeping we re-lock.
*/
int
rwsleep(const volatile void *ident, struct rwlock *rwl, int priority,
const char *wmesg, int timo)
{
struct sleep_state sls;
int error, status;
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
rw_assert_anylock(rwl);
status = rw_status(rwl);
sleep_setup(&sls, ident, priority, wmesg, timo);
rw_exit(rwl);
/* signal may stop the process, release rwlock before that */
error = sleep_finish(&sls, 1);
if ((priority & PNORELOCK) == 0)
rw_enter(rwl, status);
return error;
}
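/*
* Illustrative usage sketch (not part of the original file): waiting on
* a condition protected by a rwlock.  Lock, flag and wait message names
* are assumptions made up for the example.
*/
#if 0
struct rwlock example_rwl = RWLOCK_INITIALIZER("exrwl");
int example_cond;

int
example_rwwait(void)
{
int error = 0;

rw_enter_write(&example_rwl);
while (!example_cond && error == 0) {
/* drops example_rwl while asleep, re-acquires on return */
error = rwsleep_nsec(&example_cond, &example_rwl,
    PWAIT | PCATCH, "exrwwait", INFSLP);
}
rw_exit_write(&example_rwl);
return error;
}
#endif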
int
rwsleep_nsec(const volatile void *ident, struct rwlock *rwl, int priority,
const char *wmesg, uint64_t nsecs)
{
uint64_t to_ticks;
if (nsecs == INFSLP)
return rwsleep(ident, rwl, priority, wmesg, 0);
#ifdef DIAGNOSTIC
if (nsecs == 0) {
log(LOG_WARNING,
"%s: %s[%d]: %s: trying to sleep zero nanoseconds\n",
__func__, curproc->p_p->ps_comm, curproc->p_p->ps_pid,
wmesg);
}
#endif
nsecs = MIN(nsecs, UINT64_MAX - tick_nsec);
to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
if (to_ticks > INT_MAX)
to_ticks = INT_MAX;
return rwsleep(ident, rwl, priority, wmesg, (int)to_ticks);
}
void
sleep_setup(struct sleep_state *sls, const volatile void *ident, int prio,
const char *wmesg, int timo)
{
struct proc *p = curproc;
#ifdef DIAGNOSTIC
if (p->p_flag & P_CANTSLEEP)
panic("sleep: %s failed insomnia", p->p_p->ps_comm);
if (ident == NULL)
panic("tsleep: no ident");
if (p->p_stat != SONPROC)
panic("tsleep: not SONPROC");
#endif
sls->sls_catch = prio & PCATCH;
sls->sls_timeout = 0;
SCHED_LOCK(sls->sls_s);
TRACEPOINT(sched, sleep, NULL);
p->p_wchan = ident;
p->p_wmesg = wmesg;
p->p_slptime = 0;
p->p_slppri = prio & PRIMASK;
TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_runq);
if (timo) {
KASSERT((p->p_flag & P_TIMEOUT) == 0);
sls->sls_timeout = 1;
timeout_add(&p->p_sleep_to, timo);
}
}
int
sleep_finish(struct sleep_state *sls, int do_sleep)
{
struct proc *p = curproc;
int error = 0, error1 = 0;
if (sls->sls_catch != 0) {
/*
* We put ourselves on the sleep queue and start our
* timeout before calling sleep_signal_check(), as we could
* stop there, and a wakeup or a SIGCONT (or both) could
* occur while we were stopped. A SIGCONT would cause
* us to be marked as SSLEEP without resuming us, thus
* we must be ready for sleep when sleep_signal_check() is
* called.
* If the wakeup happens while we're stopped, p->p_wchan
* will be NULL upon return from sleep_signal_check(). In
* that case we need to unwind immediately.
*/
atomic_setbits_int(&p->p_flag, P_SINTR);
if ((error = sleep_signal_check()) != 0) {
p->p_stat = SONPROC;
sls->sls_catch = 0;
do_sleep = 0;
} else if (p->p_wchan == NULL) {
sls->sls_catch = 0;
do_sleep = 0;
}
}
if (do_sleep) {
p->p_stat = SSLEEP;
p->p_ru.ru_nvcsw++;
SCHED_ASSERT_LOCKED();
mi_switch();
} else {
unsleep(p);
}
#ifdef DIAGNOSTIC
if (p->p_stat != SONPROC)
panic("sleep_finish !SONPROC");
#endif
p->p_cpu->ci_schedstate.spc_curpriority = p->p_usrpri;
SCHED_UNLOCK(sls->sls_s);
/*
* Even though this belongs to the signal handling part of sleep,
* we need to clear it before the ktrace.
*/
atomic_clearbits_int(&p->p_flag, P_SINTR);
if (sls->sls_timeout) {
if (p->p_flag & P_TIMEOUT) {
error1 = EWOULDBLOCK;
} else {
/* This can sleep. It must not use timeouts. */
timeout_del_barrier(&p->p_sleep_to);
}
atomic_clearbits_int(&p->p_flag, P_TIMEOUT);
}
/* Check if thread was woken up because of an unwind or signal */
if (sls->sls_catch != 0)
error = sleep_signal_check();
/* Signal errors are higher priority than timeouts. */
if (error == 0 && error1 != 0)
error = error1;
return error;
}
/*
* Check and handle signals and suspensions around a sleep cycle.
*/
int
sleep_signal_check(void)
{
struct proc *p = curproc;
struct sigctx ctx;
int err, sig;
if ((err = single_thread_check(p, 1)) != 0)
return err;
if ((sig = cursig(p, &ctx)) != 0) {
if (ctx.sig_intr)
return EINTR;
else
return ERESTART;
}
return 0;
}
int
wakeup_proc(struct proc *p, const volatile void *chan)
{
int s, awakened = 0;
SCHED_LOCK(s);
if (p->p_wchan != NULL && ((chan == NULL) || (p->p_wchan == chan))) {
awakened = 1;
if (p->p_stat == SSLEEP)
setrunnable(p);
else
unsleep(p);
}
SCHED_UNLOCK(s);
return awakened;
}
/*
* Implement timeout for tsleep.
* If process hasn't been awakened (wchan non-zero),
* set timeout flag and undo the sleep. If proc
* is stopped, just unsleep so it will remain stopped.
*/
void
endtsleep(void *arg)
{
struct proc *p = arg;
int s;
SCHED_LOCK(s);
if (wakeup_proc(p, NULL))
atomic_setbits_int(&p->p_flag, P_TIMEOUT);
SCHED_UNLOCK(s);
}
/*
* Remove a process from its wait queue
*/
void
unsleep(struct proc *p)
{
SCHED_ASSERT_LOCKED();
if (p->p_wchan != NULL) {
TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_runq);
p->p_wchan = NULL;
TRACEPOINT(sched, wakeup, p->p_tid + THREAD_PID_OFFSET,
p->p_p->ps_pid);
}
}
/*
* Make a number of processes sleeping on the specified identifier runnable.
*/
void
wakeup_n(const volatile void *ident, int n)
{
struct slpque *qp;
struct proc *p;
struct proc *pnext;
int s;
SCHED_LOCK(s);
qp = &slpque[LOOKUP(ident)];
for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
pnext = TAILQ_NEXT(p, p_runq);
/*
* This can happen if the current thread called wakeup(9) after
* enqueuing itself on the sleep queue and both `ident's collide.
*/
if (p == curproc)
continue;
#ifdef DIAGNOSTIC
if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
panic("wakeup: p_stat is %d", (int)p->p_stat);
#endif
if (wakeup_proc(p, ident))
--n;
}
SCHED_UNLOCK(s);
}
/*
* Make all processes sleeping on the specified identifier runnable.
*/
void
wakeup(const volatile void *chan)
{
wakeup_n(chan, -1);
}
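/*
* Illustrative sketch (not part of the original file): the classic
* tsleep/wakeup pairing on a shared flag.  `example_done' is an assumed
* variable; both sides simply agree on its address as the wait channel.
*/
#if 0
int example_done;

void
example_producer(void)
{
example_done = 1;
wakeup(&example_done);	/* wake every sleeper on this channel */
}

int
example_consumer(void)
{
int error = 0;

/* assumed to run with the kernel lock held (see the KASSERT in tsleep());
* the flag must be re-tested after every wakeup */
while (!example_done && error == 0)
error = tsleep_nsec(&example_done, PWAIT | PCATCH,
    "exdone", INFSLP);
return error;	/* 0, EINTR or ERESTART */
}
#endif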
int
sys_sched_yield(struct proc *p, void *v, register_t *retval)
{
struct proc *q;
uint8_t newprio;
int s;
SCHED_LOCK(s);
/*
* If one of the threads of a multi-threaded process called
* sched_yield(2), drop its priority to ensure its siblings
* can make some progress.
*/
newprio = p->p_usrpri;
TAILQ_FOREACH(q, &p->p_p->ps_threads, p_thr_link)
newprio = max(newprio, q->p_runpri);
setrunqueue(p->p_cpu, p, newprio);
p->p_ru.ru_nvcsw++;
mi_switch();
SCHED_UNLOCK(s);
return (0);
}
int
thrsleep_unlock(void *lock)
{
static _atomic_lock_t unlocked = _ATOMIC_LOCK_UNLOCKED;
_atomic_lock_t *atomiclock = lock;
if (!lock)
return 0;
return copyout(&unlocked, atomiclock, sizeof(unlocked));
}
struct tslpentry {
TAILQ_ENTRY(tslpentry) tslp_link;
long tslp_ident;
};
/* thrsleep queue shared between processes */
static struct tslpqueue thrsleep_queue = TAILQ_HEAD_INITIALIZER(thrsleep_queue);
static struct rwlock thrsleep_lock = RWLOCK_INITIALIZER("thrsleeplk");
int
thrsleep(struct proc *p, struct sys___thrsleep_args *v)
{
struct sys___thrsleep_args /* {
syscallarg(const volatile void *) ident;
syscallarg(clockid_t) clock_id;
syscallarg(const struct timespec *) tp;
syscallarg(void *) lock;
syscallarg(const int *) abort;
} */ *uap = v;
long ident = (long)SCARG(uap, ident);
struct tslpentry entry;
struct tslpqueue *queue;
struct rwlock *qlock;
struct timespec *tsp = (struct timespec *)SCARG(uap, tp);
void *lock = SCARG(uap, lock);
uint64_t nsecs = INFSLP;
int abort = 0, error;
clockid_t clock_id = SCARG(uap, clock_id);
if (ident == 0)
return (EINVAL);
if (tsp != NULL) {
struct timespec now;
if ((error = clock_gettime(p, clock_id, &now)))
return (error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrabstimespec(p, tsp);
#endif
if (timespeccmp(tsp, &now, <=)) {
/* already passed: still do the unlock */
if ((error = thrsleep_unlock(lock)))
return (error);
return (EWOULDBLOCK);
}
timespecsub(tsp, &now, tsp);
nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
}
if (ident == -1) {
queue = &thrsleep_queue;
qlock = &thrsleep_lock;
} else {
queue = &p->p_p->ps_tslpqueue;
qlock = &p->p_p->ps_lock;
}
/* Interlock with wakeup. */
entry.tslp_ident = ident;
rw_enter_write(qlock);
TAILQ_INSERT_TAIL(queue, &entry, tslp_link);
rw_exit_write(qlock);
error = thrsleep_unlock(lock);
if (error == 0 && SCARG(uap, abort) != NULL)
error = copyin(SCARG(uap, abort), &abort, sizeof(abort));
rw_enter_write(qlock);
if (error != 0)
goto out;
if (abort != 0) {
error = EINTR;
goto out;
}
if (entry.tslp_ident != 0) {
error = rwsleep_nsec(&entry, qlock, PWAIT|PCATCH, "thrsleep",
nsecs);
}
out:
if (entry.tslp_ident != 0)
TAILQ_REMOVE(queue, &entry, tslp_link);
rw_exit_write(qlock);
if (error == ERESTART)
error = ECANCELED;
return (error);
}
int
sys___thrsleep(struct proc *p, void *v, register_t *retval)
{
struct sys___thrsleep_args /* {
syscallarg(const volatile void *) ident;
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
syscallarg(void *) lock;
syscallarg(const int *) abort;
} */ *uap = v;
struct timespec ts;
int error;
if (SCARG(uap, tp) != NULL) {
if ((error = copyin(SCARG(uap, tp), &ts, sizeof(ts)))) {
*retval = error;
return 0;
}
if (!timespecisvalid(&ts)) {
*retval = EINVAL;
return 0;
}
SCARG(uap, tp) = &ts;
}
*retval = thrsleep(p, uap);
return 0;
}
int
sys___thrwakeup(struct proc *p, void *v, register_t *retval)
{
struct sys___thrwakeup_args /* {
syscallarg(const volatile void *) ident;
syscallarg(int) n;
} */ *uap = v;
struct tslpentry *entry, *tmp;
struct tslpqueue *queue;
struct rwlock *qlock;
long ident = (long)SCARG(uap, ident);
int n = SCARG(uap, n);
int found = 0;
if (ident == 0)
*retval = EINVAL;
else {
if (ident == -1) {
queue = &thrsleep_queue;
qlock = &thrsleep_lock;
/*
* Wake up all waiters with ident -1. This is needed
* because ident -1 can be shared by multiple userspace
* lock state machines concurrently. The implementation
* has no way to direct the wakeup to a particular
* state machine.
*/
n = 0;
} else {
queue = &p->p_p->ps_tslpqueue;
qlock = &p->p_p->ps_lock;
}
rw_enter_write(qlock);
TAILQ_FOREACH_SAFE(entry, queue, tslp_link, tmp) {
if (entry->tslp_ident == ident) {
TAILQ_REMOVE(queue, entry, tslp_link);
entry->tslp_ident = 0;
wakeup_one(entry);
if (++found == n)
break;
}
}
rw_exit_write(qlock);
if (ident == -1)
*retval = 0;
else
*retval = found ? 0 : ESRCH;
}
return (0);
}
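/*
* Hedged userland sketch (not part of the original file): how code in the
* style of librthread drives the two syscalls above.  The prototypes
* follow __thrsleep(2)/__thrwakeup(2); treat them as an assumption and
* check the manual pages before relying on them.  The error is returned
* directly, not through errno.
*/
#if 0
int __thrsleep(const volatile void *, clockid_t, const struct timespec *,
    void *, const int *);
int __thrwakeup(const volatile void *, int);

static volatile int example_chan;

void
example_block(void)
{
/* no timeout, no spinlock to release, no abort flag */
(void)__thrsleep(&example_chan, CLOCK_MONOTONIC, NULL, NULL, NULL);
}

void
example_unblock(void)
{
(void)__thrwakeup(&example_chan, 1);	/* wake at most one waiter */
}
#endif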
void
refcnt_init(struct refcnt *r)
{
refcnt_init_trace(r, 0);
}
void
refcnt_init_trace(struct refcnt *r, int idx)
{
r->r_traceidx = idx;
atomic_store_int(&r->r_refs, 1);
TRACEINDEX(refcnt, r->r_traceidx, r, 0, +1);
}
void
refcnt_take(struct refcnt *r)
{
u_int refs;
refs = atomic_inc_int_nv(&r->r_refs);
KASSERT(refs != 0);
TRACEINDEX(refcnt, r->r_traceidx, r, refs - 1, +1);
(void)refs;
}
int
refcnt_rele(struct refcnt *r)
{
u_int refs;
membar_exit_before_atomic();
refs = atomic_dec_int_nv(&r->r_refs);
KASSERT(refs != ~0);
TRACEINDEX(refcnt, r->r_traceidx, r, refs + 1, -1);
if (refs == 0) {
membar_enter_after_atomic();
return (1);
}
return (0);
}
void
refcnt_rele_wake(struct refcnt *r)
{
if (refcnt_rele(r))
wakeup_one(r);
}
void
refcnt_finalize(struct refcnt *r, const char *wmesg)
{
struct sleep_state sls;
u_int refs;
membar_exit_before_atomic();
refs = atomic_dec_int_nv(&r->r_refs);
KASSERT(refs != ~0);
TRACEINDEX(refcnt, r->r_traceidx, r, refs + 1, -1);
while (refs) {
sleep_setup(&sls, r, PWAIT, wmesg, 0);
refs = atomic_load_int(&r->r_refs);
sleep_finish(&sls, refs);
}
TRACEINDEX(refcnt, r->r_traceidx, r, refs, 0);
/* Order subsequent loads and stores after refs == 0 load. */
membar_sync();
}
int
refcnt_shared(struct refcnt *r)
{
u_int refs;
refs = atomic_load_int(&r->r_refs);
TRACEINDEX(refcnt, r->r_traceidx, r, refs, 0);
return (refs > 1);
}
unsigned int
refcnt_read(struct refcnt *r)
{
u_int refs;
refs = atomic_load_int(&r->r_refs);
TRACEINDEX(refcnt, r->r_traceidx, r, refs, 0);
return (refs);
}
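/*
* Illustrative sketch (not part of the original file): a typical refcnt
* life cycle.  The object, malloc type and wait message names are
* assumptions made up for the example.
*/
#if 0
struct example_obj {
struct refcnt ex_refs;
/* ... payload ... */
};

struct example_obj *
example_create(void)
{
struct example_obj *o;

o = malloc(sizeof(*o), M_DEVBUF, M_WAITOK | M_ZERO);
refcnt_init(&o->ex_refs);	/* starts at 1 */
return o;
}

void
example_unref(struct example_obj *o)
{
if (refcnt_rele(&o->ex_refs))	/* 1 -> 0: last reference */
free(o, M_DEVBUF, sizeof(*o));
}

void
example_teardown(struct example_obj *o)
{
/* drop our reference and sleep until every other holder is done */
refcnt_finalize(&o->ex_refs, "exfin");
free(o, M_DEVBUF, sizeof(*o));
}
#endif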
void
cond_init(struct cond *c)
{
atomic_store_int(&c->c_wait, 1);
}
void
cond_signal(struct cond *c)
{
atomic_store_int(&c->c_wait, 0);
wakeup_one(c);
}
void
cond_wait(struct cond *c, const char *wmesg)
{
struct sleep_state sls;
unsigned int wait;
wait = atomic_load_int(&c->c_wait);
while (wait) {
sleep_setup(&sls, c, PWAIT, wmesg, 0);
wait = atomic_load_int(&c->c_wait);
sleep_finish(&sls, wait);
}
}
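/*
* Illustrative sketch (not part of the original file): cond_wait and
* cond_signal as a one-shot completion.  COND_INITIALIZER() and the
* names below are assumptions made up for the example.
*/
#if 0
struct cond example_done_cond = COND_INITIALIZER();

void
example_worker(void *arg)
{
/* ... perform the deferred work ... */
cond_signal(&example_done_cond);	/* release the waiter */
}

void
example_wait_for_worker(void)
{
cond_wait(&example_done_cond, "excond");	/* sleeps until signalled */
}
#endif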
/* $OpenBSD: vfs_biomem.c,v 1.51 2021/10/24 00:02:25 jsg Exp $ */
/*
* Copyright (c) 2007 Artur Grabowski <art@openbsd.org>
* Copyright (c) 2012-2016,2019 Bob Beck <beck@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/pool.h>
#include <sys/proc.h> /* XXX for atomic */
#include <sys/mount.h>
#include <uvm/uvm_extern.h>
vaddr_t buf_kva_start, buf_kva_end;
int buf_needva;
TAILQ_HEAD(,buf) buf_valist;
extern struct bcachestats bcstats;
vaddr_t buf_unmap(struct buf *);
void
buf_mem_init(vsize_t size)
{
TAILQ_INIT(&buf_valist);
buf_kva_start = vm_map_min(kernel_map);
if (uvm_map(kernel_map, &buf_kva_start, size, NULL,
UVM_UNKNOWN_OFFSET, PAGE_SIZE, UVM_MAPFLAG(PROT_NONE,
PROT_NONE, MAP_INHERIT_NONE, MADV_NORMAL, 0)))
panic("%s: can't reserve VM for buffers", __func__);
buf_kva_end = buf_kva_start + size;
/* Contiguous mapping */
bcstats.kvaslots = bcstats.kvaslots_avail = size / MAXPHYS;
}
/*
* buf_acquire and buf_release manage the kvm mappings of buffers.
*/
void
buf_acquire(struct buf *bp)
{
KASSERT((bp->b_flags & B_BUSY) == 0);
splassert(IPL_BIO);
/*
* Busy before waiting for kvm.
*/
SET(bp->b_flags, B_BUSY);
buf_map(bp);
}
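/*
* Illustrative sketch (not part of the original file): the acquire/release
* pairing around access to a cache buffer's mapping, at IPL_BIO as the
* splassert()s in this file require.  `bp' is assumed to have its pages
* already allocated.
*/
#if 0
void
example_touch_buffer(struct buf *bp)
{
int s;

s = splbio();
buf_acquire(bp);	/* sets B_BUSY and maps bp->b_data */
/* ... read or modify bp->b_data ... */
buf_release(bp);	/* clears B_BUSY; the mapping stays cached */
splx(s);
}
#endif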
/*
* Acquire a buf but do not map it. Preserve any mapping it did have.
*/
void
buf_acquire_nomap(struct buf *bp)
{
splassert(IPL_BIO);
SET(bp->b_flags, B_BUSY);
if (bp->b_data != NULL) {
TAILQ_REMOVE(&buf_valist, bp, b_valist);
bcstats.kvaslots_avail--;
bcstats.busymapped++;
}
}
void
buf_map(struct buf *bp)
{
vaddr_t va;
splassert(IPL_BIO);
if (bp->b_data == NULL) {
unsigned long i;
/*
* First, just use the pre-allocated space until we run out.
*/
if (buf_kva_start < buf_kva_end) {
va = buf_kva_start;
buf_kva_start += MAXPHYS;
bcstats.kvaslots_avail--;
} else {
struct buf *vbp;
/*
* Find some buffer we can steal the space from.
*/
vbp = TAILQ_FIRST(&buf_valist);
while ((curproc != syncerproc && curproc != cleanerproc &&
bcstats.kvaslots_avail <= RESERVE_SLOTS) ||
vbp == NULL) {
buf_needva++;
tsleep_nsec(&buf_needva, PRIBIO, "buf_needva",
INFSLP);
vbp = TAILQ_FIRST(&buf_valist);
}
va = buf_unmap(vbp);
}
for (i = 0; i < atop(bp->b_bufsize); i++) {
struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
bp->b_poffs + ptoa(i));
KASSERT(pg != NULL);
pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
PROT_READ | PROT_WRITE);
}
pmap_update(pmap_kernel());
bp->b_data = (caddr_t)va;
} else {
TAILQ_REMOVE(&buf_valist, bp, b_valist);
bcstats.kvaslots_avail--;
}
bcstats.busymapped++;
}
void
buf_release(struct buf *bp)
{
KASSERT(bp->b_flags & B_BUSY);
splassert(IPL_BIO);
if (bp->b_data) {
bcstats.busymapped--;
TAILQ_INSERT_TAIL(&buf_valist, bp, b_valist);
bcstats.kvaslots_avail++;
if (buf_needva) {
buf_needva = 0;
wakeup(&buf_needva);
}
}
CLR(bp->b_flags, B_BUSY);
}
/*
* Deallocate all memory resources for this buffer. We need to be careful
* to not drop kvm since we have no way to reclaim it. So, if the buffer
* has kvm, we need to free it later. We put it on the front of the
* freelist just so it gets picked up faster.
*
* Also, lots of assertions count on bp->b_data being NULL, so we
* set it temporarily to NULL.
*
* Return non-zero if we take care of the freeing later.
*/
int
buf_dealloc_mem(struct buf *bp)
{
caddr_t data;
splassert(IPL_BIO);
data = bp->b_data;
bp->b_data = NULL;
if (data) {
if (bp->b_flags & B_BUSY)
bcstats.busymapped--;
pmap_kremove((vaddr_t)data, bp->b_bufsize);
pmap_update(pmap_kernel());
}
if (bp->b_pobj)
buf_free_pages(bp);
if (data == NULL)
return (0);
bp->b_data = data;
if (!(bp->b_flags & B_BUSY)) { /* XXX - need better test */
TAILQ_REMOVE(&buf_valist, bp, b_valist);
bcstats.kvaslots_avail--;
} else {
CLR(bp->b_flags, B_BUSY);
if (buf_needva) {
buf_needva = 0;
wakeup(&buf_needva);
}
}
SET(bp->b_flags, B_RELEASED);
TAILQ_INSERT_HEAD(&buf_valist, bp, b_valist);
bcstats.kvaslots_avail++;
return (1);
}
/*
* Only used by bread_cluster.
*/
void
buf_fix_mapping(struct buf *bp, vsize_t newsize)
{
vaddr_t va = (vaddr_t)bp->b_data;
if (newsize < bp->b_bufsize) {
pmap_kremove(va + newsize, bp->b_bufsize - newsize);
pmap_update(pmap_kernel());
/*
* Note: the size we lost is actually with the other
* buffers read in by bread_cluster
*/
bp->b_bufsize = newsize;
}
}
vaddr_t
buf_unmap(struct buf *bp)
{
vaddr_t va;
KASSERT((bp->b_flags & B_BUSY) == 0);
KASSERT(bp->b_data != NULL);
splassert(IPL_BIO);
TAILQ_REMOVE(&buf_valist, bp, b_valist);
bcstats.kvaslots_avail--;
va = (vaddr_t)bp->b_data;
bp->b_data = NULL;
pmap_kremove(va, bp->b_bufsize);
pmap_update(pmap_kernel());
if (bp->b_flags & B_RELEASED)
pool_put(&bufpool, bp);
return (va);
}
/* Always allocates in dma-reachable memory */
void
buf_alloc_pages(struct buf *bp, vsize_t size)
{
int i;
KASSERT(size == round_page(size));
KASSERT(bp->b_pobj == NULL);
KASSERT(bp->b_data == NULL);
splassert(IPL_BIO);
uvm_obj_init(&bp->b_uobj, &bufcache_pager, 1);
/*
* Attempt to allocate with NOWAIT. If we can't, then throw
* away some clean pages and try again. Finally, if that
* fails, do a WAITOK allocation so the page daemon can find
* memory for us.
*/
do {
i = uvm_pagealloc_multi(&bp->b_uobj, 0, size,
UVM_PLA_NOWAIT | UVM_PLA_NOWAKE);
if (i == 0)
break;
} while (bufbackoff(&dma_constraint, size) == 0);
if (i != 0)
i = uvm_pagealloc_multi(&bp->b_uobj, 0, size,
UVM_PLA_WAITOK);
/* should not happen */
if (i != 0)
panic("uvm_pagealloc_multi unable to allocate a buf_object "
"of size %lu", size);
bcstats.numbufpages += atop(size);
bcstats.dmapages += atop(size);
SET(bp->b_flags, B_DMA);
bp->b_pobj = &bp->b_uobj;
bp->b_poffs = 0;
bp->b_bufsize = size;
}
void
buf_free_pages(struct buf *bp)
{
struct uvm_object *uobj = bp->b_pobj;
struct vm_page *pg;
voff_t off, i;
KASSERT(bp->b_data == NULL);
KASSERT(uobj != NULL);
splassert(IPL_BIO);
off = bp->b_poffs;
bp->b_pobj = NULL;
bp->b_poffs = 0;
for (i = 0; i < atop(bp->b_bufsize); i++) {
pg = uvm_pagelookup(uobj, off + ptoa(i));
KASSERT(pg != NULL);
KASSERT(pg->wire_count == 1);
pg->wire_count = 0;
bcstats.numbufpages--;
if (ISSET(bp->b_flags, B_DMA))
bcstats.dmapages--;
}
CLR(bp->b_flags, B_DMA);
/* XXX refactor to do this without splbio later */
uvm_obj_free(uobj);
}
/* Reallocate a buf into a particular pmem range specified by "where". */
int
buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where,
int flags)
{
vaddr_t va;
int dma;
int i, r;
KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
splassert(IPL_BIO);
KASSERT(ISSET(bp->b_flags, B_BUSY));
dma = ISSET(bp->b_flags, B_DMA);
/* if the original buf is mapped, unmap it */
if (bp->b_data != NULL) {
va = (vaddr_t)bp->b_data;
pmap_kremove(va, bp->b_bufsize);
pmap_update(pmap_kernel());
}
do {
r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
bp->b_bufsize, UVM_PLA_NOWAIT | UVM_PLA_NOWAKE, where);
if (r == 0)
break;
} while ((bufbackoff(where, atop(bp->b_bufsize)) == 0));
/*
* bufbackoff() failed, so there's no more we can do without
* waiting. If allowed to, make that attempt.
*/
if (r != 0 && (flags & UVM_PLA_WAITOK))
r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
bp->b_bufsize, flags, where);
/*
* If the allocation has succeeded, we may be somewhere different.
* If the allocation has failed, we are in the same place.
*
* We still have to re-map the buffer before returning.
*/
/* take it out of dma stats until we know where we are */
if (dma)
bcstats.dmapages -= atop(bp->b_bufsize);
dma = 1;
/* if the original buf was mapped, re-map it */
for (i = 0; i < atop(bp->b_bufsize); i++) {
struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
bp->b_poffs + ptoa(i));
KASSERT(pg != NULL);
if (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg)))
dma = 0;
if (bp->b_data != NULL) {
pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
PROT_READ|PROT_WRITE);
pmap_update(pmap_kernel());
}
}
if (dma) {
SET(bp->b_flags, B_DMA);
bcstats.dmapages += atop(bp->b_bufsize);
} else
CLR(bp->b_flags, B_DMA);
return(r);
}
/* $OpenBSD: ufs_lookup.c,v 1.59 2022/01/11 03:13:59 jsg Exp $ */
/* $NetBSD: ufs_lookup.c,v 1.7 1996/02/09 22:36:06 christos Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_lookup.c 8.9 (Berkeley) 8/11/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
extern struct nchstats nchstats;
#ifdef DIAGNOSTIC
int dirchk = 1;
#else
int dirchk = 0;
#endif
#define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen == 0)
/*
* Convert a component of a pathname into a pointer to a locked inode.
* This is a very central and rather complicated routine.
* If the file system is not maintained in a strict tree hierarchy,
* this can result in a deadlock situation (see comments in code below).
*
* The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
* on whether the name is to be looked up, created, renamed, or deleted.
* When CREATE, RENAME, or DELETE is specified, information usable in
* creating, renaming, or deleting a directory entry may be calculated.
* If flag has LOCKPARENT or'ed into it and the target of the pathname
* exists, lookup returns both the target and its parent directory locked.
* When creating or renaming and LOCKPARENT is specified, the target may
* not be ".". When deleting and LOCKPARENT is specified, the target may
* be ".", but the caller must check to ensure it does a vrele and vput
* instead of two vputs.
*
* Overall outline of ufs_lookup:
*
* check accessibility of directory
* look for name in cache, if found, then if at end of path
* and deleting or creating, drop it, else return name
* search for name in directory, to found or notfound
* notfound:
* if creating, return locked directory, leaving info on available slots
* else return error
* found:
* if at end of path and deleting, return information to allow delete
* if at end of path and rewriting (RENAME and LOCKPARENT), lock target
* inode and return info to allow rewrite
* if not at end, add name to cache; if at end and neither creating
* nor deleting, add name to cache
*/
int
ufs_lookup(void *v)
{
struct vop_lookup_args *ap = v;
struct vnode *vdp; /* vnode for directory being searched */
struct inode *dp; /* inode for directory being searched */
struct buf *bp; /* a buffer of directory entries */
struct direct *ep; /* the current directory entry */
int entryoffsetinblock; /* offset of ep in bp's buffer */
enum {NONE, COMPACT, FOUND} slotstatus;
doff_t slotoffset; /* offset of area with free space */
int slotsize; /* size of area at slotoffset */
int slotfreespace; /* amount of space free in slot */
int slotneeded; /* size of the entry we're seeking */
int numdirpasses; /* strategy for directory search */
doff_t endsearch; /* offset to end directory search */
doff_t prevoff; /* prev entry dp->i_offset */
struct vnode *pdp; /* saved dp during symlink work */
struct vnode *tdp; /* returned by VFS_VGET */
doff_t enduseful; /* pointer past last used dir slot */
u_long bmask; /* block offset mask */
int lockparent; /* 1 => lockparent flag is set */
int wantparent; /* 1 => wantparent or lockparent flag */
int namlen, error;
struct vnode **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
struct ucred *cred = cnp->cn_cred;
int flags;
int nameiop = cnp->cn_nameiop;
cnp->cn_flags &= ~PDIRUNLOCK;
flags = cnp->cn_flags;
bp = NULL;
slotoffset = -1;
*vpp = NULL;
vdp = ap->a_dvp;
dp = VTOI(vdp);
lockparent = flags & LOCKPARENT;
wantparent = flags & (LOCKPARENT|WANTPARENT);
/*
* Check accessibility of directory.
*/
if ((DIP(dp, mode) & IFMT) != IFDIR)
return (ENOTDIR);
if ((error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc)) != 0)
return (error);
if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
return (EROFS);
/*
* We now have a segment name to search for, and a directory to search.
*
* Before tediously performing a linear scan of the directory,
* check the name cache to see if the directory/name pair
* we are looking for is known already.
*/
if ((error = cache_lookup(vdp, vpp, cnp)) >= 0)
return (error);
/*
* Suppress search for slots unless creating
* file and at end of pathname, in which case
* we watch for a place to put the new file in
* case it doesn't already exist.
*/
slotstatus = FOUND;
slotfreespace = slotsize = slotneeded = 0;
if ((nameiop == CREATE || nameiop == RENAME) &&
(flags & ISLASTCN)) {
slotstatus = NONE;
slotneeded = (sizeof(struct direct) - MAXNAMLEN +
cnp->cn_namelen + 3) &~ 3;
}
/*
* If there is cached information on a previous search of
* this directory, pick up where we last left off.
* We cache only lookups as these are the most common
* and have the greatest payoff. Caching CREATE has little
* benefit as it usually must search the entire directory
* to determine that the entry does not exist. Caching the
* location of the last DELETE or RENAME has not reduced
* profiling time and hence has been removed in the interest
* of simplicity.
*/
bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
#ifdef UFS_DIRHASH
/*
* Use dirhash for fast operations on large directories. The logic
* to determine whether to hash the directory is contained within
* ufsdirhash_build(); a zero return means that it decided to hash
* this directory and it successfully built up the hash table.
*/
if (ufsdirhash_build(dp) == 0) {
/* Look for a free slot if needed. */
enduseful = DIP(dp, size);
if (slotstatus != FOUND) {
slotoffset = ufsdirhash_findfree(dp, slotneeded,
&slotsize);
if (slotoffset >= 0) {
slotstatus = COMPACT;
enduseful = ufsdirhash_enduseful(dp);
if (enduseful < 0)
enduseful = DIP(dp, size);
}
}
/* Look up the component. */
numdirpasses = 1;
entryoffsetinblock = 0; /* silence compiler warning */
switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
&dp->i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) {
case 0:
ep = (struct direct *)((char *)bp->b_data +
(dp->i_offset & bmask));
goto foundentry;
case ENOENT:
#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is a power of two */
dp->i_offset = roundup2(DIP(dp, size), DIRBLKSIZ);
goto notfound;
default:
/* Something failed; just do a linear search. */
break;
}
}
#endif /* UFS_DIRHASH */
if (nameiop != LOOKUP || dp->i_diroff == 0 ||
dp->i_diroff >= DIP(dp, size)) {
entryoffsetinblock = 0;
dp->i_offset = 0;
numdirpasses = 1;
} else {
dp->i_offset = dp->i_diroff;
if ((entryoffsetinblock = dp->i_offset & bmask) &&
(error = UFS_BUFATOFF(dp, (off_t)dp->i_offset, NULL, &bp)))
return (error);
numdirpasses = 2;
nchstats.ncs_2passes++;
}
prevoff = dp->i_offset;
endsearch = roundup(DIP(dp, size), DIRBLKSIZ);
enduseful = 0;
searchloop:
while (dp->i_offset < endsearch) {
/*
* If necessary, get the next directory block.
*/
if ((dp->i_offset & bmask) == 0) {
if (bp != NULL)
brelse(bp);
error = UFS_BUFATOFF(dp, (off_t)dp->i_offset, NULL,
&bp);
if (error)
return (error);
entryoffsetinblock = 0;
}
/*
* If still looking for a slot, and at a DIRBLKSIZE
* boundary, have to start looking for free space again.
*/
if (slotstatus == NONE &&
(entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) {
slotoffset = -1;
slotfreespace = 0;
}
/*
* Get pointer to next entry.
* Full validation checks are slow, so we only check
* enough to insure forward progress through the
* directory. Complete checks can be run by patching
* "dirchk" to be true.
*/
ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock);
if (ep->d_reclen == 0 ||
(dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) {
int i;
ufs_dirbad(dp, dp->i_offset, "mangled entry");
i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
dp->i_offset += i;
entryoffsetinblock += i;
continue;
}
/*
* If an appropriate sized slot has not yet been found,
* check to see if one is available. Also accumulate space
* in the current block so that we can determine if
* compaction is viable.
*/
if (slotstatus != FOUND) {
int size = ep->d_reclen;
if (ep->d_ino != 0)
size -= DIRSIZ(OFSFMT(dp), ep);
if (size > 0) {
if (size >= slotneeded) {
slotstatus = FOUND;
slotoffset = dp->i_offset;
slotsize = ep->d_reclen;
} else if (slotstatus == NONE) {
slotfreespace += size;
if (slotoffset == -1)
slotoffset = dp->i_offset;
if (slotfreespace >= slotneeded) {
slotstatus = COMPACT;
slotsize = dp->i_offset +
ep->d_reclen - slotoffset;
}
}
}
}
/*
* Check for a name match.
*/
if (ep->d_ino) {
# if (BYTE_ORDER == LITTLE_ENDIAN)
if (OFSFMT(dp))
namlen = ep->d_type;
else
namlen = ep->d_namlen;
# else
namlen = ep->d_namlen;
# endif
if (namlen == cnp->cn_namelen &&
!memcmp(cnp->cn_nameptr, ep->d_name, namlen)) {
#ifdef UFS_DIRHASH
foundentry:
#endif
/*
* Save directory entry's inode number and
* reclen in ndp->ni_ufs area, and release
* directory buffer.
*/
dp->i_ino = ep->d_ino;
dp->i_reclen = ep->d_reclen;
goto found;
}
}
prevoff = dp->i_offset;
dp->i_offset += ep->d_reclen;
entryoffsetinblock += ep->d_reclen;
if (ep->d_ino)
enduseful = dp->i_offset;
}
#ifdef UFS_DIRHASH
notfound:
#endif
/*
* If we started in the middle of the directory and failed
* to find our target, we must check the beginning as well.
*/
if (numdirpasses == 2) {
numdirpasses--;
dp->i_offset = 0;
endsearch = dp->i_diroff;
goto searchloop;
}
if (bp != NULL)
brelse(bp);
/*
* If creating, and at end of pathname and current
* directory has not been removed, then can consider
* allowing file to be created.
*/
if ((nameiop == CREATE || nameiop == RENAME) &&
(flags & ISLASTCN) && dp->i_effnlink != 0) {
/*
* Access for write is interpreted as allowing
* creation of files in the directory.
*/
error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc);
if (error)
return (error);
/*
* Return an indication of where the new directory
* entry should be put. If we didn't find a slot,
* then set dp->i_count to 0 indicating
* that the new slot belongs at the end of the
* directory. If we found a slot, then the new entry
* can be put in the range from dp->i_offset to
* dp->i_offset + dp->i_count.
*/
if (slotstatus == NONE) {
dp->i_offset = roundup(DIP(dp, size), DIRBLKSIZ);
dp->i_count = 0;
enduseful = dp->i_offset;
} else if (nameiop == DELETE) {
dp->i_offset = slotoffset;
if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0)
dp->i_count = 0;
else
dp->i_count = dp->i_offset - prevoff;
} else {
dp->i_offset = slotoffset;
dp->i_count = slotsize;
if (enduseful < slotoffset + slotsize)
enduseful = slotoffset + slotsize;
}
dp->i_endoff = roundup(enduseful, DIRBLKSIZ);
/*
* We return with the directory locked, so that
* the parameters we set up above will still be
* valid if we actually decide to do a direnter().
* We return ni_vp == NULL to indicate that the entry
* does not currently exist; we leave a pointer to
* the (locked) directory inode in ndp->ni_dvp.
* The pathname buffer is saved so that the name
* can be obtained later.
*
* NB - if the directory is unlocked, then this
* information cannot be used.
*/
cnp->cn_flags |= SAVENAME;
if (!lockparent) {
VOP_UNLOCK(vdp);
cnp->cn_flags |= PDIRUNLOCK;
}
return (EJUSTRETURN);
}
/*
* Insert name into cache (as non-existent) if appropriate.
*/
if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
cache_enter(vdp, *vpp, cnp);
return (ENOENT);
found:
if (numdirpasses == 2)
nchstats.ncs_pass2++;
/*
* Check that directory length properly reflects presence
* of this entry.
*/
if (dp->i_offset + DIRSIZ(OFSFMT(dp), ep) > DIP(dp, size)) {
ufs_dirbad(dp, dp->i_offset, "i_ffs_size too small");
DIP_ASSIGN(dp, size, dp->i_offset + DIRSIZ(OFSFMT(dp), ep));
dp->i_flag |= IN_CHANGE | IN_UPDATE;
}
brelse(bp);
/*
* Found component in pathname.
* If the final component of path name, save information
* in the cache as to where the entry was found.
*/
if ((flags & ISLASTCN) && nameiop == LOOKUP)
dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1);
/*
* If deleting, and at end of pathname, return
* parameters which can be used to remove file.
* If the wantparent flag isn't set, we return only
* the directory (in ndp->ni_dvp), otherwise we go
* on and lock the inode, being careful with ".".
*/
if (nameiop == DELETE && (flags & ISLASTCN)) {
/*
* Write access to directory required to delete files.
*/
error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc);
if (error)
return (error);
/*
* Return pointer to current entry in dp->i_offset,
* and distance past previous entry (if there
* is a previous entry in this block) in dp->i_count.
* Save directory inode pointer in ndp->ni_dvp for dirremove().
*/
if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0)
dp->i_count = 0;
else
dp->i_count = dp->i_offset - prevoff;
if (dp->i_number == dp->i_ino) {
vref(vdp);
*vpp = vdp;
return (0);
}
error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp);
if (error)
return (error);
/*
* If directory is "sticky", then user must own
* the directory, or the file in it, else she
* may not delete it (unless she's root). This
* implements append-only directories.
*/
if ((DIP(dp, mode) & ISVTX) && cred->cr_uid != 0 &&
cred->cr_uid != DIP(dp, uid) && !vnoperm(vdp) &&
DIP(VTOI(tdp), uid) != cred->cr_uid) {
vput(tdp);
return (EPERM);
}
*vpp = tdp;
if (!lockparent) {
VOP_UNLOCK(vdp);
cnp->cn_flags |= PDIRUNLOCK;
}
return (0);
}
/*
* If rewriting (RENAME), return the inode and the
* information required to rewrite the present directory
* Must get inode of directory entry to verify it's a
* regular file, or empty directory.
*/
if (nameiop == RENAME && wantparent &&
(flags & ISLASTCN)) {
error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc);
if (error)
return (error);
/*
* Careful about locking second inode.
* This can only occur if the target is ".".
*/
if (dp->i_number == dp->i_ino)
return (EISDIR);
error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp);
if (error)
return (error);
*vpp = tdp;
cnp->cn_flags |= SAVENAME;
if (!lockparent) {
VOP_UNLOCK(vdp);
cnp->cn_flags |= PDIRUNLOCK;
}
return (0);
}
/*
* Step through the translation in the name. We do not `vput' the
* directory because we may need it again if a symbolic link
* is relative to the current directory. Instead we save it
* unlocked as "pdp". We must get the target inode before unlocking
* the directory to insure that the inode will not be removed
* before we get it. We prevent deadlock by always fetching
* inodes from the root, moving down the directory tree. Thus
* when following backward pointers ".." we must unlock the
* parent directory before getting the requested directory.
* There is a potential race condition here if both the current
* and parent directories are removed before the VFS_VGET for the
* inode associated with ".." returns. We hope that this occurs
* infrequently since we cannot avoid this race condition without
* implementing a sophisticated deadlock detection algorithm.
* Note also that this simple deadlock detection scheme will not
* work if the file system has any hard links other than ".."
* that point backwards in the directory structure.
*/
pdp = vdp;
if (flags & ISDOTDOT) {
VOP_UNLOCK(pdp); /* race to get the inode */
cnp->cn_flags |= PDIRUNLOCK;
error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp);
if (error) {
if (vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY) == 0)
cnp->cn_flags &= ~PDIRUNLOCK;
return (error);
}
if (lockparent && (flags & ISLASTCN)) {
if ((error = vn_lock(pdp, LK_EXCLUSIVE))) {
vput(tdp);
return (error);
}
cnp->cn_flags &= ~PDIRUNLOCK;
}
*vpp = tdp;
} else if (dp->i_number == dp->i_ino) {
vref(vdp); /* we want ourself, ie "." */
*vpp = vdp;
} else {
error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp);
if (error)
return (error);
if (!lockparent || !(flags & ISLASTCN)) {
VOP_UNLOCK(pdp);
cnp->cn_flags |= PDIRUNLOCK;
}
*vpp = tdp;
}
/*
* Insert name into cache if appropriate.
*/
if (cnp->cn_flags & MAKEENTRY)
cache_enter(vdp, *vpp, cnp);
return (0);
}
void
ufs_dirbad(struct inode *ip, doff_t offset, char *how)
{
struct mount *mp;
mp = ITOV(ip)->v_mount;
(void)printf("%s: bad dir ino %u at offset %d: %s\n",
mp->mnt_stat.f_mntonname, ip->i_number, offset, how);
if ((mp->mnt_stat.f_flags & MNT_RDONLY) == 0)
panic("bad dir");
}
/*
* Do consistency checking on a directory entry:
* record length must be multiple of 4
* entry must fit in rest of its DIRBLKSIZ block
* record must be large enough to contain entry
* name is not longer than MAXNAMLEN
* name must be as long as advertised, and null terminated
*/
int
ufs_dirbadentry(struct vnode *vdp, struct direct *ep, int entryoffsetinblock)
{
struct inode *dp;
int i;
int namlen;
dp = VTOI(vdp);
# if (BYTE_ORDER == LITTLE_ENDIAN)
if (OFSFMT(dp))
namlen = ep->d_type;
else
namlen = ep->d_namlen;
# else
namlen = ep->d_namlen;
# endif
if ((ep->d_reclen & 0x3) != 0 ||
ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
ep->d_reclen < DIRSIZ(OFSFMT(dp), ep) || namlen > MAXNAMLEN) {
/*return (1); */
printf("First bad\n");
goto bad;
}
if (ep->d_ino == 0)
return (0);
for (i = 0; i < namlen; i++)
if (ep->d_name[i] == '\0') {
/*return (1); */
printf("Second bad\n");
goto bad;
}
if (ep->d_name[i])
goto bad;
return (0);
bad:
return (1);
}
/*
* Construct a new directory entry after a call to namei, using the
* parameters that it left in the componentname argument cnp. The
* argument ip is the inode to which the new directory entry will refer.
*/
void
ufs_makedirentry(struct inode *ip, struct componentname *cnp,
struct direct *newdirp)
{
#ifdef DIAGNOSTIC
if ((cnp->cn_flags & SAVENAME) == 0)
panic("ufs_makedirentry: missing name");
#endif
newdirp->d_ino = ip->i_number;
newdirp->d_namlen = cnp->cn_namelen;
memset(newdirp->d_name + (cnp->cn_namelen & ~(DIR_ROUNDUP-1)),
0, DIR_ROUNDUP);
memcpy(newdirp->d_name, cnp->cn_nameptr, cnp->cn_namelen);
if (OFSFMT(ip)) {
newdirp->d_type = 0;
# if (BYTE_ORDER == LITTLE_ENDIAN)
{ u_char tmp = newdirp->d_namlen;
newdirp->d_namlen = newdirp->d_type;
newdirp->d_type = tmp; }
# endif
} else
newdirp->d_type = IFTODT(DIP(ip, mode));
}
/*
* Write a directory entry after a call to namei, using the parameters
* that it left in nameidata. The argument dirp is the new directory
* entry contents. Dvp is a pointer to the directory to be written,
* which was left locked by namei. Remaining parameters (dp->i_offset,
* dp->i_count) indicate how the space for the new entry is to be obtained.
* Non-null bp indicates that a directory is being created (for the
* soft dependency code).
*/
int
ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
struct ucred *cr;
struct proc *p;
int newentrysize;
struct inode *dp;
struct buf *bp;
u_int dsize;
struct direct *ep, *nep;
int error, ret, blkoff, loc, spacefree, flags;
char *dirbuf;
error = 0;
cr = cnp->cn_cred;
p = cnp->cn_proc;
dp = VTOI(dvp);
newentrysize = DIRSIZ(OFSFMT(dp), dirp);
if (dp->i_count == 0) {
/*
* If dp->i_count is 0, then namei could find no
* space in the directory. Here, dp->i_offset will
* be on a directory block boundary and we will write the
* new entry into a fresh block.
*/
if (dp->i_offset & (DIRBLKSIZ - 1))
panic("ufs_direnter: newblk");
flags = B_CLRBUF;
if (!DOINGSOFTDEP(dvp))
flags |= B_SYNC;
if ((error = UFS_BUF_ALLOC(dp, (off_t)dp->i_offset, DIRBLKSIZ,
cr, flags, &bp)) != 0) {
if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
bdwrite(newdirbp);
return (error);
}
DIP_ASSIGN(dp, size, dp->i_offset + DIRBLKSIZ);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
uvm_vnp_setsize(dvp, DIP(dp, size));
dirp->d_reclen = DIRBLKSIZ;
blkoff = dp->i_offset &
(VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1);
memcpy(bp->b_data + blkoff, dirp, newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
ufsdirhash_newblk(dp, dp->i_offset);
ufsdirhash_add(dp, dirp, dp->i_offset);
ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
dp->i_offset);
}
#endif
if (DOINGSOFTDEP(dvp)) {
/*
* Ensure that the entire newly allocated block is a
* valid directory so that future growth within the
* block does not have to ensure that the block is
* written before the inode.
*/
blkoff += DIRBLKSIZ;
while (blkoff < bp->b_bcount) {
((struct direct *)
(bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
blkoff += DIRBLKSIZ;
}
if (softdep_setup_directory_add(bp, dp, dp->i_offset,
dirp->d_ino, newdirbp, 1) == 0) {
bdwrite(bp);
return (UFS_UPDATE(dp, 0));
}
/* We have just allocated a directory block in an
* indirect block. Rather than tracking when it gets
* claimed by the inode, we simply do a VOP_FSYNC
* now to ensure that it is there (in case the user
* does a future fsync). Note that we have to unlock
* the inode for the entry that we just entered, as
* the VOP_FSYNC may need to lock other inodes which
* can lead to deadlock if we also hold a lock on
* the newly entered node.
*/
if ((error = VOP_BWRITE(bp)))
return (error);
if (tvp != NULL)
VOP_UNLOCK(tvp);
error = VOP_FSYNC(dvp, p->p_ucred, MNT_WAIT, p);
if (tvp != NULL)
vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
return (error);
}
error = VOP_BWRITE(bp);
ret = UFS_UPDATE(dp, !DOINGSOFTDEP(dvp));
if (error == 0)
return (ret);
return (error);
}
/*
* If dp->i_count is non-zero, then namei found space for the new
* entry in the range dp->i_offset to dp->i_offset + dp->i_count
* in the directory. To use this space, we may have to compact
* the entries located there, by copying them together towards the
* beginning of the block, leaving the free space in one usable
* chunk at the end.
*/
/*
* Increase size of directory if entry eats into new space.
* This should never push the size past a new multiple of
* DIRBLKSIZE.
*
* N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
*/
if (dp->i_offset + dp->i_count > DIP(dp, size))
DIP_ASSIGN(dp, size, dp->i_offset + dp->i_count);
/*
* Get the block containing the space for the new directory entry.
*/
if ((error = UFS_BUFATOFF(dp, (off_t)dp->i_offset, &dirbuf, &bp))
!= 0) {
if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
bdwrite(newdirbp);
return (error);
}
/*
* Find space for the new entry. In the simple case, the entry at
* offset base will have the space. If it does not, then namei
* arranged that compacting the region dp->i_offset to
* dp->i_offset + dp->i_count would yield the space.
*/
ep = (struct direct *)dirbuf;
dsize = ep->d_ino ? DIRSIZ(OFSFMT(dp), ep) : 0;
spacefree = ep->d_reclen - dsize;
for (loc = ep->d_reclen; loc < dp->i_count; ) {
nep = (struct direct *)(dirbuf + loc);
/* Trim the existing slot (NB: dsize may be zero). */
ep->d_reclen = dsize;
ep = (struct direct *)((char *)ep + dsize);
/* Read nep->d_reclen now as the memmove() may clobber it. */
loc += nep->d_reclen;
if (nep->d_ino == 0) {
/*
* A mid-block unused entry. Such entries are
* never created by the kernel, but fsck_ffs
* can create them (and it doesn't fix them).
*
* Add up the free space, and initialise the
* relocated entry since we don't memmove it.
*/
spacefree += nep->d_reclen;
ep->d_ino = 0;
dsize = 0;
continue;
}
dsize = DIRSIZ(OFSFMT(dp), nep);
spacefree += nep->d_reclen - dsize;
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_move(dp, nep,
dp->i_offset + ((char *)nep - dirbuf),
dp->i_offset + ((char *)ep - dirbuf));
#endif
if (DOINGSOFTDEP(dvp))
softdep_change_directoryentry_offset(dp, dirbuf,
(caddr_t)nep, (caddr_t)ep, dsize);
else
memmove(ep, nep, dsize);
}
/*
* Here, `ep' points to a directory entry containing `dsize' in-use
* bytes followed by `spacefree' unused bytes. If ep->d_ino == 0,
* then the entry is completely unused (dsize == 0). The value
* of ep->d_reclen is always indeterminate.
*
* Update the pointer fields in the previous entry (if any),
* copy in the new entry, and write out the block.
*/
if (ep->d_ino == 0) {
if (spacefree + dsize < newentrysize)
panic("ufs_direnter: compact1");
dirp->d_reclen = spacefree + dsize;
} else {
if (spacefree < newentrysize)
panic("ufs_direnter: compact2");
dirp->d_reclen = spacefree;
ep->d_reclen = dsize;
ep = (struct direct *)((char *)ep + dsize);
}
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
dirp->d_reclen == spacefree))
ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf));
#endif
memcpy(ep, dirp, newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_checkblock(dp, dirbuf -
(dp->i_offset & (DIRBLKSIZ - 1)),
dp->i_offset & ~(DIRBLKSIZ - 1));
#endif
if (DOINGSOFTDEP(dvp)) {
(void)softdep_setup_directory_add(bp, dp,
dp->i_offset + (caddr_t)ep - dirbuf,
dirp->d_ino, newdirbp, 0);
bdwrite(bp);
} else {
error = VOP_BWRITE(bp);
}
dp->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* If all went well, and the directory can be shortened, proceed
* with the truncation. Note that we have to unlock the inode for
* the entry that we just entered, as the truncation may need to
* lock other inodes which can lead to deadlock if we also hold a
* lock on the newly entered node.
*/
if (error == 0 && dp->i_endoff && dp->i_endoff < DIP(dp, size)) {
if (tvp != NULL)
VOP_UNLOCK(tvp);
error = UFS_TRUNCATE(dp, (off_t)dp->i_endoff, IO_SYNC, cr);
#ifdef UFS_DIRHASH
if (error == 0 && dp->i_dirhash != NULL)
ufsdirhash_dirtrunc(dp, dp->i_endoff);
#endif
if (tvp != NULL)
vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
}
return (error);
}
/*
* Remove a directory entry after a call to namei, using
* the parameters which it left in nameidata. The entry
* dp->i_offset contains the offset into the directory of the
* entry to be eliminated. The dp->i_count field contains the
* size of the previous record in the directory. If this
* is 0, the first entry is being deleted, so we need only
* zero the inode number to mark the entry as free. If the
* entry is not the first in the directory, we must reclaim
* the space of the now empty record by adding the record size
* to the size of the previous entry.
*/
int
ufs_dirremove(struct vnode *dvp, struct inode *ip, int flags, int isrmdir)
{
struct inode *dp;
struct direct *ep;
struct buf *bp;
int error;
dp = VTOI(dvp);
if ((error = UFS_BUFATOFF(dp,
(off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0)
return (error);
#ifdef UFS_DIRHASH
/*
* Remove the dirhash entry. This is complicated by the fact
* that `ep' is the previous entry when dp->i_count != 0.
*/
if (dp->i_dirhash != NULL)
ufsdirhash_remove(dp, (dp->i_count == 0) ? ep :
(struct direct *)((char *)ep + ep->d_reclen), dp->i_offset);
#endif
if (dp->i_count == 0) {
/*
* First entry in block: set d_ino to zero.
*/
ep->d_ino = 0;
} else {
/*
* Collapse new free space into previous entry.
*/
ep->d_reclen += dp->i_reclen;
}
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_checkblock(dp, (char *)ep -
((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)),
dp->i_offset & ~(DIRBLKSIZ - 1));
#endif
if (DOINGSOFTDEP(dvp)) {
if (ip) {
ip->i_effnlink--;
softdep_change_linkcnt(ip, 0);
softdep_setup_remove(bp, dp, ip, isrmdir);
}
if (softdep_slowdown(dvp)) {
error = bwrite(bp);
} else {
bdwrite(bp);
error = 0;
}
} else {
if (ip) {
ip->i_effnlink--;
DIP_ADD(ip, nlink, -1);
ip->i_flag |= IN_CHANGE;
}
if (DOINGASYNC(dvp) && dp->i_count != 0) {
bdwrite(bp);
error = 0;
} else
error = bwrite(bp);
}
dp->i_flag |= IN_CHANGE | IN_UPDATE;
return (error);
}
/*
* Rewrite an existing directory entry to point at the inode
* supplied. The parameters describing the directory entry are
* set up by a call to namei.
*/
int
ufs_dirrewrite(struct inode *dp, struct inode *oip, ufsino_t newinum,
int newtype, int isrmdir)
{
struct buf *bp;
struct direct *ep;
struct vnode *vdp = ITOV(dp);
int error;
error = UFS_BUFATOFF(dp, (off_t)dp->i_offset, (char **)&ep, &bp);
if (error)
return (error);
ep->d_ino = newinum;
if (!OFSFMT(dp))
ep->d_type = newtype;
oip->i_effnlink--;
if (DOINGSOFTDEP(vdp)) {
softdep_change_linkcnt(oip, 0);
softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
bdwrite(bp);
} else {
DIP_ADD(oip, nlink, -1);
oip->i_flag |= IN_CHANGE;
if (DOINGASYNC(vdp)) {
bdwrite(bp);
error = 0;
} else {
error = VOP_BWRITE(bp);
}
}
dp->i_flag |= IN_CHANGE | IN_UPDATE;
return (error);
}
/*
* Check if a directory is empty or not.
* Inode supplied must be locked.
*
* Using a struct dirtemplate here is not precisely
* what we want, but better than using a struct direct.
*
* NB: does not handle corrupted directories.
*/
int
ufs_dirempty(struct inode *ip, ufsino_t parentino, struct ucred *cred)
{
off_t off, m;
struct dirtemplate dbuf;
struct direct *dp = (struct direct *)&dbuf;
int error, namlen;
size_t count;
#define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
m = DIP(ip, size);
for (off = 0; off < m; off += dp->d_reclen) {
error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off,
UIO_SYSSPACE, IO_NODELOCKED, cred, &count, curproc);
/*
* Since we read MINDIRSIZ, residual must
* be 0 unless we're at end of file.
*/
if (error || count != 0)
return (0);
/* avoid infinite loops */
if (dp->d_reclen == 0)
return (0);
/* skip empty entries */
if (dp->d_ino == 0)
continue;
/* accept only "." and ".." */
# if (BYTE_ORDER == LITTLE_ENDIAN)
if (OFSFMT(ip))
namlen = dp->d_type;
else
namlen = dp->d_namlen;
# else
namlen = dp->d_namlen;
# endif
if (namlen > 2)
return (0);
if (dp->d_name[0] != '.')
return (0);
/*
* At this point namlen must be 1 or 2.
* 1 implies ".", 2 implies ".." if second
* char is also "."
*/
if (namlen == 1 && dp->d_ino == ip->i_number)
continue;
if (dp->d_name[1] == '.' && dp->d_ino == parentino)
continue;
return (0);
}
return (1);
}
/*
* Check if source directory is in the path of the target directory.
* Target is supplied locked, source is unlocked.
* The target is always vput before returning.
*/
int
ufs_checkpath(struct inode *source, struct inode *target, struct ucred *cred)
{
struct vnode *nextvp, *vp;
int error, rootino, namlen;
struct dirtemplate dirbuf;
vp = ITOV(target);
if (target->i_number == source->i_number) {
error = EEXIST;
goto out;
}
rootino = ROOTINO;
error = 0;
if (target->i_number == rootino)
goto out;
for (;;) {
if (vp->v_type != VDIR) {
error = ENOTDIR;
break;
}
error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf,
sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
IO_NODELOCKED, cred, NULL, curproc);
if (error != 0)
break;
# if (BYTE_ORDER == LITTLE_ENDIAN)
if (OFSFMT(VTOI(vp)))
namlen = dirbuf.dotdot_type;
else
namlen = dirbuf.dotdot_namlen;
# else
namlen = dirbuf.dotdot_namlen;
# endif
if (namlen != 2 ||
dirbuf.dotdot_name[0] != '.' ||
dirbuf.dotdot_name[1] != '.') {
error = ENOTDIR;
break;
}
if (dirbuf.dotdot_ino == source->i_number) {
error = EINVAL;
break;
}
if (dirbuf.dotdot_ino == rootino)
break;
VOP_UNLOCK(vp);
error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino, &nextvp);
vrele(vp);
if (error) {
vp = NULL;
break;
}
vp = nextvp;
}
out:
if (error == ENOTDIR)
printf("checkpath: .. not a directory\n");
if (vp != NULL)
vput(vp);
return (error);
}
/* $OpenBSD: vfs_lookup.c,v 1.87 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: vfs_lookup.c,v 1.17 1996/02/09 19:00:59 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_lookup.c 8.6 (Berkeley) 11/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/syslimits.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/pledge.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
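/*
 * Maintain the resolved-path buffer used by REALPATH lookups.
 * component_push() appends one pathname component to cn_rpbuf,
 * returning 0 if the result would not fit in MAXPATHLEN.
 */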
int
component_push(struct componentname *cnp, char *component, size_t len)
{
if (cnp->cn_rpi + len + 1 >= MAXPATHLEN)
return 0;
if (cnp->cn_rpi > 1)
cnp->cn_rpbuf[cnp->cn_rpi++] = '/';
memcpy(cnp->cn_rpbuf + cnp->cn_rpi, component, len);
cnp->cn_rpi+=len;
cnp->cn_rpbuf[cnp->cn_rpi] = '\0';
return 1;
}
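/*
 * Drop the last component from the resolved-path buffer, keeping the
 * leading '/' when the buffer ends up back at the root.
 */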
void
component_pop(struct componentname *cnp)
{
while (cnp->cn_rpi && cnp->cn_rpbuf[cnp->cn_rpi] != '/')
cnp->cn_rpi--;
if (cnp->cn_rpi == 0 && cnp->cn_rpbuf[0] == '/')
cnp->cn_rpi++;
cnp->cn_rpbuf[cnp->cn_rpi] = '\0';
}
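/*
 * Initialize a nameidata structure: record the lookup operation, its
 * flags, the address space the name lives in, the directory fd it is
 * relative to, and the calling proc.
 */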
void
ndinitat(struct nameidata *ndp, u_long op, u_long flags,
enum uio_seg segflg, int dirfd, const char *namep, struct proc *p)
{
memset(ndp, 0, sizeof(*ndp));
ndp->ni_cnd.cn_nameiop = op;
ndp->ni_cnd.cn_flags = flags;
ndp->ni_segflg = segflg;
ndp->ni_dirfd = dirfd;
ndp->ni_dirp = namep;
ndp->ni_cnd.cn_proc = p;
}
/*
* Convert a pathname into a pointer to a vnode.
*
* The FOLLOW flag is set when symbolic links are to be followed
* when they occur at the end of the name translation process.
* Symbolic links are always followed for all pathname components
* other than the last.
*
* If the LOCKLEAF flag is set, a locked vnode is returned.
*
* The segflg defines whether the name is to be copied from user
* space or kernel space.
*
* Overall outline of namei:
*
* copy in name
* get starting directory
* while (!done && !error) {
* call lookup to search path.
* if symbolic link, massage name in buffer and continue
* }
*/
int
namei(struct nameidata *ndp)
{
struct filedesc *fdp; /* pointer to file descriptor state */
char *cp; /* pointer into pathname argument */
struct vnode *dp; /* the directory we are searching */
struct iovec aiov; /* uio for reading symbolic links */
struct uio auio;
int error, linklen;
struct componentname *cnp = &ndp->ni_cnd;
struct proc *p = cnp->cn_proc;
ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_proc->p_ucred;
#ifdef DIAGNOSTIC
if (!cnp->cn_cred || !cnp->cn_proc)
panic ("namei: bad cred/proc");
if (cnp->cn_nameiop & (~OPMASK))
panic ("namei: nameiop contaminated with flags");
if (cnp->cn_flags & OPMASK)
panic ("namei: flags contaminated with nameiops");
#endif
fdp = cnp->cn_proc->p_fd;
/*
* Get a buffer for the name to be translated, and copy the
* name into the buffer.
*/
if ((cnp->cn_flags & HASBUF) == 0)
cnp->cn_pnbuf = pool_get(&namei_pool, PR_WAITOK);
if (ndp->ni_segflg == UIO_SYSSPACE)
error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
MAXPATHLEN, &ndp->ni_pathlen);
else
error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
MAXPATHLEN, &ndp->ni_pathlen);
/*
* Fail on null pathnames
*/
if (error == 0 && ndp->ni_pathlen == 1)
error = ENOENT;
if (error)
goto fail;
#ifdef KTRACE
if (KTRPOINT(cnp->cn_proc, KTR_NAMEI))
ktrnamei(cnp->cn_proc, cnp->cn_pnbuf);
#endif
/*
* Strip trailing slashes, as requested
*/
if (cnp->cn_flags & STRIPSLASHES) {
char *end = cnp->cn_pnbuf + ndp->ni_pathlen - 2;
cp = end;
while (cp >= cnp->cn_pnbuf && (*cp == '/'))
cp--;
/* Still some remaining characters in the buffer */
if (cp >= cnp->cn_pnbuf) {
ndp->ni_pathlen -= (end - cp);
*(cp + 1) = '\0';
}
}
ndp->ni_loopcnt = 0;
/*
* Get starting point for the translation.
*/
if ((ndp->ni_rootdir = fdp->fd_rdir) == NULL ||
(ndp->ni_cnd.cn_flags & KERNELPATH))
ndp->ni_rootdir = rootvnode;
if (ndp->ni_cnd.cn_flags & KERNELPATH) {
ndp->ni_cnd.cn_flags |= BYPASSUNVEIL;
} else {
error = pledge_namei(p, ndp, cnp->cn_pnbuf);
if (error)
goto fail;
}
/*
* Check if starting from root directory or current directory.
*/
if (cnp->cn_pnbuf[0] == '/') {
dp = ndp->ni_rootdir;
vref(dp);
if (cnp->cn_flags & REALPATH && cnp->cn_rpi == 0) {
cnp->cn_rpbuf[0] = '/';
cnp->cn_rpbuf[1] = '\0';
cnp->cn_rpi = 1;
}
} else if (ndp->ni_dirfd == AT_FDCWD) {
dp = fdp->fd_cdir;
vref(dp);
unveil_start_relative(p, ndp, dp);
unveil_check_component(p, ndp, dp);
} else {
struct file *fp = fd_getfile(fdp, ndp->ni_dirfd);
if (fp == NULL) {
error = EBADF;
goto fail;
}
dp = (struct vnode *)fp->f_data;
if (fp->f_type != DTYPE_VNODE || dp->v_type != VDIR) {
FRELE(fp, p);
error = ENOTDIR;
goto fail;
}
vref(dp);
unveil_start_relative(p, ndp, dp);
unveil_check_component(p, ndp, dp);
FRELE(fp, p);
}
for (;;) {
if (!dp->v_mount) {
/* Give up if the directory is no longer mounted */
vrele(dp);
error = ENOENT;
goto fail;
}
cnp->cn_nameptr = cnp->cn_pnbuf;
ndp->ni_startdir = dp;
if ((error = vfs_lookup(ndp)) != 0)
goto fail;
/*
* If not a symbolic link, return search result.
*/
if ((cnp->cn_flags & ISSYMLINK) == 0) {
if ((error = unveil_check_final(p, ndp))) {
if ((cnp->cn_flags & LOCKPARENT) &&
(cnp->cn_flags & ISLASTCN) &&
(ndp->ni_vp != ndp->ni_dvp))
vput(ndp->ni_dvp);
if (ndp->ni_vp) {
if ((cnp->cn_flags & LOCKLEAF))
vput(ndp->ni_vp);
else
vrele(ndp->ni_vp);
}
goto fail;
}
if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
pool_put(&namei_pool, cnp->cn_pnbuf);
else
cnp->cn_flags |= HASBUF;
return (0);
}
if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
VOP_UNLOCK(ndp->ni_dvp);
if (ndp->ni_loopcnt++ >= SYMLOOP_MAX) {
error = ELOOP;
break;
}
if (ndp->ni_pathlen > 1)
cp = pool_get(&namei_pool, PR_WAITOK);
else
cp = cnp->cn_pnbuf;
aiov.iov_base = cp;
aiov.iov_len = MAXPATHLEN;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_procp = cnp->cn_proc;
auio.uio_resid = MAXPATHLEN;
error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
if (error) {
badlink:
if (ndp->ni_pathlen > 1)
pool_put(&namei_pool, cp);
break;
}
linklen = MAXPATHLEN - auio.uio_resid;
if (linklen == 0) {
error = ENOENT;
goto badlink;
}
if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
error = ENAMETOOLONG;
goto badlink;
}
if (ndp->ni_pathlen > 1) {
memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
pool_put(&namei_pool, cnp->cn_pnbuf);
cnp->cn_pnbuf = cp;
} else
cnp->cn_pnbuf[linklen] = '\0';
ndp->ni_pathlen += linklen;
vput(ndp->ni_vp);
dp = ndp->ni_dvp;
/*
* Check if root directory should replace current directory.
*/
if (cnp->cn_pnbuf[0] == '/') {
vrele(dp);
dp = ndp->ni_rootdir;
vref(dp);
ndp->ni_unveil_match = NULL;
unveil_check_component(p, ndp, dp);
if (cnp->cn_flags & REALPATH) {
cnp->cn_rpbuf[0] = '/';
cnp->cn_rpbuf[1] = '\0';
cnp->cn_rpi = 1;
}
} else if (cnp->cn_flags & REALPATH) {
component_pop(cnp);
}
}
vrele(ndp->ni_dvp);
vput(ndp->ni_vp);
fail:
pool_put(&namei_pool, cnp->cn_pnbuf);
ndp->ni_vp = NULL;
return (error);
}
/*
* Search a pathname.
* This is a very central and rather complicated routine.
*
* The pathname is pointed to by ni_cnd.cn_nameptr and is of length
* ni_pathlen. The starting directory is taken from ni_startdir. The
* pathname is descended until done, or a symbolic link is encountered.
* If the path is completed the flag ISLASTCN is set in ni_cnd.cn_flags.
* If a symbolic link needing interpretation is encountered, the flag ISSYMLINK
* is set in ni_cnd.cn_flags.
*
* The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
* whether the name is to be looked up, created, renamed, or deleted.
* When CREATE, RENAME, or DELETE is specified, information usable in
* creating, renaming, or deleting a directory entry may be calculated.
* If flag has LOCKPARENT or'ed into it, the parent directory is returned
* locked. If flag has WANTPARENT or'ed into it, the parent directory is
* returned unlocked. Otherwise the parent directory is not returned. If
* the target of the pathname exists and LOCKLEAF is or'ed into the flag
* the target is returned locked, otherwise it is returned unlocked.
* When creating or renaming and LOCKPARENT is specified, the target may not
* be ".". When deleting and LOCKPARENT is specified, the target may be ".".
*
* Overall outline of lookup:
*
* dirloop:
* identify next component of name at ndp->ni_ptr
* handle degenerate case where name is null string
* if .. and crossing mount points and on mounted filesys, find parent
* call VOP_LOOKUP routine for next component name
* directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
* component vnode returned in ni_vp (if it exists), locked.
* if result vnode is mounted on and crossing mount points,
* find mounted on vnode
* if more components of name, do next level at dirloop
* return the answer in ni_vp, locked if LOCKLEAF set
* if LOCKPARENT set, return locked parent in ni_dvp
* if WANTPARENT set, return unlocked parent in ni_dvp
*/
int
vfs_lookup(struct nameidata *ndp)
{
char *cp; /* pointer into pathname argument */
struct vnode *dp = NULL; /* the directory we are searching */
struct vnode *tdp; /* saved dp */
struct mount *mp; /* mount table entry */
int docache; /* == 0 do not cache last component */
int wantparent; /* 1 => wantparent or lockparent flag */
int rdonly; /* lookup read-only flag bit */
int error = 0;
int dpunlocked = 0; /* dp has already been unlocked */
int slashes;
struct componentname *cnp = &ndp->ni_cnd;
/*
* Setup: break out flag bits into variables.
*/
wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
if (cnp->cn_nameiop == DELETE ||
(wantparent && cnp->cn_nameiop != CREATE))
docache = 0;
rdonly = cnp->cn_flags & RDONLY;
ndp->ni_dvp = NULL;
cnp->cn_flags &= ~ISSYMLINK;
dp = ndp->ni_startdir;
ndp->ni_startdir = NULLVP;
vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
/*
* If we have a leading string of slashes, remove them, and just make
* sure the current node is a directory.
*/
cp = cnp->cn_nameptr;
if (*cp == '/') {
do {
cp++;
} while (*cp == '/');
ndp->ni_pathlen -= cp - cnp->cn_nameptr;
cnp->cn_nameptr = cp;
if (dp->v_type != VDIR) {
error = ENOTDIR;
goto bad;
}
/*
* If we've exhausted the path name, then just return the
* current node. If the caller requested the parent node (i.e.
* it's a CREATE, DELETE, or RENAME), and we don't have one
* (because this is the root directory), then we must fail.
*/
if (cnp->cn_nameptr[0] == '\0') {
if (ndp->ni_dvp == NULL && wantparent) {
error = EISDIR;
goto bad;
}
ndp->ni_vp = dp;
cnp->cn_flags |= ISLASTCN;
goto terminal;
}
}
dirloop:
/*
* Search a new directory.
*
* The last component of the filename is left accessible via
* cnp->cn_nameptr for callers that need the name. Callers needing
* the name set the SAVENAME flag. When done, they assume
* responsibility for freeing the pathname buffer.
*/
cnp->cn_consume = 0;
/* XXX: Figure out the length of the last component. */
cp = cnp->cn_nameptr;
while (*cp && (*cp != '/')) cp++;
cnp->cn_namelen = cp - cnp->cn_nameptr;
if (cnp->cn_namelen > NAME_MAX) {
error = ENAMETOOLONG;
goto bad;
}
#ifdef NAMEI_DIAGNOSTIC
{ char c = *cp;
*cp = '\0';
printf("{%s}: ", cnp->cn_nameptr);
*cp = c; }
#endif
if (cnp->cn_flags & REALPATH) {
size_t len = cp - cnp->cn_nameptr;
if (len == 2 && cnp->cn_nameptr[0] == '.' &&
cnp->cn_nameptr[1] == '.')
component_pop(cnp);
else if (!(len == 1 && cnp->cn_nameptr[0] == '.')) {
if (!component_push(cnp, cnp->cn_nameptr, len)) {
error = ENAMETOOLONG;
goto bad;
}
}
}
ndp->ni_pathlen -= cnp->cn_namelen;
ndp->ni_next = cp;
/*
* If this component is followed by a slash, then move the pointer to
* the next component forward, and remember that this component must be
* a directory.
*/
if (*cp == '/') {
do {
cp++;
} while (*cp == '/');
slashes = cp - ndp->ni_next;
ndp->ni_pathlen -= slashes;
ndp->ni_next = cp;
cnp->cn_flags |= REQUIREDIR;
} else {
slashes = 0;
cnp->cn_flags &= ~REQUIREDIR;
}
/*
* We do special processing on the last component, whether or not it's
* a directory. Cache all intervening lookups, but not the final one.
*/
if (*cp == '\0') {
if (docache)
cnp->cn_flags |= MAKEENTRY;
else
cnp->cn_flags &= ~MAKEENTRY;
cnp->cn_flags |= ISLASTCN;
} else {
cnp->cn_flags |= MAKEENTRY;
cnp->cn_flags &= ~ISLASTCN;
}
if (cnp->cn_namelen == 2 &&
cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
cnp->cn_flags |= ISDOTDOT;
else
cnp->cn_flags &= ~ISDOTDOT;
/*
* Handle "..": two special cases.
* 1. If at root directory (e.g. after chroot)
* or at absolute root directory
* then ignore it so we can't get out.
* 2. If this vnode is the root of a mounted
* filesystem, then replace it with the
* vnode which was mounted on so we take the
* .. in the other file system.
*/
if (cnp->cn_flags & ISDOTDOT) {
for (;;) {
if (dp == ndp->ni_rootdir || dp == rootvnode) {
ndp->ni_dvp = dp;
ndp->ni_vp = dp;
vref(dp);
ndp->ni_unveil_match = NULL;
goto nextname;
}
if ((dp->v_flag & VROOT) == 0 ||
(cnp->cn_flags & NOCROSSMOUNT))
break;
tdp = dp;
dp = dp->v_mount->mnt_vnodecovered;
vput(tdp);
vref(dp);
unveil_check_component(curproc, ndp, dp);
vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
}
}
/*
* We now have a segment name to search for, and a directory to search.
*/
ndp->ni_dvp = dp;
ndp->ni_vp = NULL;
cnp->cn_flags &= ~PDIRUNLOCK;
unveil_check_component(curproc, ndp, dp);
if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
#ifdef DIAGNOSTIC
if (ndp->ni_vp != NULL)
panic("leaf should be empty");
#endif
#ifdef NAMEI_DIAGNOSTIC
printf("not found\n");
#endif
/*
* Allow for unveiling a file in a directory which we cannot
* create ourselves.
*/
if (ndp->ni_pledge == PLEDGE_UNVEIL &&
(error == EPERM || error == EACCES || error == EROFS))
error = EJUSTRETURN;
if (error != EJUSTRETURN)
goto bad;
/*
* If this was not the last component, or there were trailing
* slashes, then the name must exist.
*/
if (cnp->cn_flags & REQUIREDIR) {
error = ENOENT;
goto bad;
}
/*
* If creating and at end of pathname, then can consider
* allowing file to be created. Check for a read-only
* filesystem and disallow this unless we are unveil'ing.
*/
if (ndp->ni_pledge != PLEDGE_UNVEIL && (rdonly ||
(ndp->ni_dvp->v_mount->mnt_flag & MNT_RDONLY))) {
error = EROFS;
goto bad;
}
/*
* We return with ni_vp NULL to indicate that the entry
* doesn't currently exist, leaving a pointer to the
* (possibly locked) directory inode in ndp->ni_dvp.
*/
if (cnp->cn_flags & SAVESTART) {
ndp->ni_startdir = ndp->ni_dvp;
vref(ndp->ni_startdir);
}
return (0);
}
#ifdef NAMEI_DIAGNOSTIC
printf("found\n");
#endif
/*
* Take into account any additional components consumed by the
* underlying filesystem. This will include any trailing slashes after
* the last component consumed.
*/
if (cnp->cn_consume > 0) {
if (cnp->cn_consume >= slashes) {
cnp->cn_flags &= ~REQUIREDIR;
}
ndp->ni_pathlen -= cnp->cn_consume - slashes;
ndp->ni_next += cnp->cn_consume - slashes;
cnp->cn_consume = 0;
if (ndp->ni_next[0] == '\0')
cnp->cn_flags |= ISLASTCN;
}
dp = ndp->ni_vp;
/*
* Check to see if the vnode has been mounted on;
* if so find the root of the mounted file system.
*/
while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
if (vfs_busy(mp, VB_READ|VB_WAIT))
continue;
VOP_UNLOCK(dp);
error = VFS_ROOT(mp, &tdp);
vfs_unbusy(mp);
if (error) {
dpunlocked = 1;
goto bad2;
}
vrele(dp);
ndp->ni_vp = dp = tdp;
}
/*
* Check for symbolic link. Back up over any slashes that we skipped,
* as we will need them again.
*/
if ((dp->v_type == VLNK) && (cnp->cn_flags & (FOLLOW|REQUIREDIR))) {
ndp->ni_pathlen += slashes;
ndp->ni_next -= slashes;
cnp->cn_flags |= ISSYMLINK;
return (0);
}
/*
* Check for directory, if the component was followed by a series of
* slashes.
*/
if ((dp->v_type != VDIR) && (cnp->cn_flags & REQUIREDIR)) {
error = ENOTDIR;
goto bad2;
}
nextname:
/*
* Not a symbolic link. If this was not the last component, then
* continue at the next component, else return.
*/
if (!(cnp->cn_flags & ISLASTCN)) {
cnp->cn_nameptr = ndp->ni_next;
vrele(ndp->ni_dvp);
goto dirloop;
}
terminal:
/*
* Check for read-only file systems.
*/
if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) {
/*
* Disallow directory write attempts on read-only
* file systems.
*/
if (rdonly || (dp->v_mount->mnt_flag & MNT_RDONLY) || (wantparent &&
(ndp->ni_dvp->v_mount->mnt_flag & MNT_RDONLY))) {
error = EROFS;
goto bad2;
}
}
if (ndp->ni_dvp != NULL) {
if (cnp->cn_flags & SAVESTART) {
ndp->ni_startdir = ndp->ni_dvp;
vref(ndp->ni_startdir);
}
if (!wantparent)
vrele(ndp->ni_dvp);
}
if ((cnp->cn_flags & LOCKLEAF) == 0)
VOP_UNLOCK(dp);
return (0);
bad2:
if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
((cnp->cn_flags & PDIRUNLOCK) == 0))
VOP_UNLOCK(ndp->ni_dvp);
vrele(ndp->ni_dvp);
bad:
if (dpunlocked)
vrele(dp);
else
vput(dp);
ndp->ni_vp = NULL;
return (error);
}
/*
* Reacquire a path name component.
*/
int
vfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
struct vnode *dp = NULL; /* the directory we are searching */
int wantparent; /* 1 => wantparent or lockparent flag */
int rdonly; /* lookup read-only flag bit */
int error = 0;
#ifdef NAMEI_DIAGNOSTIC
char *cp; /* DEBUG: check name ptr/len */
#endif
/*
* Setup: break out flag bits into variables.
*/
wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
rdonly = cnp->cn_flags & RDONLY;
cnp->cn_flags &= ~ISSYMLINK;
dp = dvp;
vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
/* dirloop: */
/*
* Search a new directory.
*
* The last component of the filename is left accessible via
* cnp->cn_nameptr for callers that need the name. Callers needing
* the name set the SAVENAME flag. When done, they assume
* responsibility for freeing the pathname buffer.
*/
#ifdef NAMEI_DIAGNOSTIC
/* XXX: Figure out the length of the last component. */
cp = cnp->cn_nameptr;
while (*cp && (*cp != '/')) {
cp++;
}
if (cnp->cn_namelen != cp - cnp->cn_nameptr)
panic("relookup: bad len");
if (*cp != 0)
panic("relookup: not last component");
printf("{%s}: ", cnp->cn_nameptr);
#endif
/*
* Check for degenerate name (e.g. / or "")
* which is a way of talking about a directory,
* e.g. like "/." or ".".
*/
if (cnp->cn_nameptr[0] == '\0')
panic("relookup: null name");
if (cnp->cn_flags & ISDOTDOT)
panic ("relookup: lookup on dot-dot");
/*
* We now have a segment name to search for, and a directory to search.
*/
if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
#ifdef DIAGNOSTIC
if (*vpp != NULL)
panic("leaf should be empty");
#endif
if (error != EJUSTRETURN)
goto bad;
/*
* If creating and at end of pathname, then can consider
* allowing file to be created.
*/
if (rdonly || (dvp->v_mount->mnt_flag & MNT_RDONLY)) {
error = EROFS;
goto bad;
}
/* ASSERT(dvp == ndp->ni_startdir) */
if (cnp->cn_flags & SAVESTART)
vref(dvp);
/*
* We return with ni_vp NULL to indicate that the entry
* doesn't currently exist, leaving a pointer to the
* (possibly locked) directory inode in ndp->ni_dvp.
*/
return (0);
}
dp = *vpp;
#ifdef DIAGNOSTIC
/*
* Check for symbolic link
*/
if (dp->v_type == VLNK && (cnp->cn_flags & FOLLOW))
panic("relookup: symlink found.");
#endif
/*
* Check for read-only file systems.
*/
if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) {
/*
* Disallow directory write attempts on read-only
* file systems.
*/
if (rdonly || (dp->v_mount->mnt_flag & MNT_RDONLY) || (wantparent &&
(dvp->v_mount->mnt_flag & MNT_RDONLY))) {
error = EROFS;
goto bad2;
}
}
/* ASSERT(dvp == ndp->ni_startdir) */
if (cnp->cn_flags & SAVESTART)
vref(dvp);
if (!wantparent)
vrele(dvp);
if ((cnp->cn_flags & LOCKLEAF) == 0)
VOP_UNLOCK(dp);
return (0);
bad2:
if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
VOP_UNLOCK(dvp);
vrele(dvp);
bad:
vput(dp);
*vpp = NULL;
return (error);
}
/* $OpenBSD: dead_vnops.c,v 1.41 2022/06/26 05:20:42 visa Exp $ */
/* $NetBSD: dead_vnops.c,v 1.16 1996/02/13 13:12:48 mycroft Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)dead_vnops.c 8.2 (Berkeley) 11/21/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/errno.h>
#include <sys/buf.h>
/*
* Prototypes for dead operations on vnodes.
*/
int dead_ebadf(void *);
int dead_open(void *);
int dead_read(void *);
int dead_write(void *);
int dead_ioctl(void *);
int dead_kqfilter(void *v);
int dead_inactive(void *);
int dead_lock(void *);
int dead_bmap(void *);
int dead_strategy(void *);
int dead_print(void *);
int chkvnlock(struct vnode *);
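/*
 * Operations vector for revoked ("dead") vnodes: most operations fail
 * with EBADF or are disallowed outright.
 */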
const struct vops dead_vops = {
.vop_lookup = vop_generic_lookup,
.vop_create = vop_generic_badop,
.vop_mknod = vop_generic_badop,
.vop_open = dead_open,
.vop_close = nullop,
.vop_access = dead_ebadf,
.vop_getattr = dead_ebadf,
.vop_setattr = dead_ebadf,
.vop_read = dead_read,
.vop_write = dead_write,
.vop_ioctl = dead_ioctl,
.vop_kqfilter = dead_kqfilter,
.vop_revoke = NULL,
.vop_fsync = nullop,
.vop_remove = vop_generic_badop,
.vop_link = vop_generic_badop,
.vop_rename = vop_generic_badop,
.vop_mkdir = vop_generic_badop,
.vop_rmdir = vop_generic_badop,
.vop_symlink = vop_generic_badop,
.vop_readdir = dead_ebadf,
.vop_readlink = dead_ebadf,
.vop_abortop = vop_generic_badop,
.vop_inactive = dead_inactive,
.vop_reclaim = nullop,
.vop_lock = dead_lock,
.vop_unlock = nullop,
.vop_islocked = nullop,
.vop_bmap = dead_bmap,
.vop_strategy = dead_strategy,
.vop_print = dead_print,
.vop_pathconf = dead_ebadf,
.vop_advlock = dead_ebadf,
.vop_bwrite = nullop,
};
/*
* Open always fails as if device did not exist.
*/
/* ARGSUSED */
int
dead_open(void *v)
{
return (ENXIO);
}
/*
* Vnode op for read
*/
/* ARGSUSED */
int
dead_read(void *v)
{
struct vop_read_args *ap = v;
if (chkvnlock(ap->a_vp))
panic("dead_read: lock");
/*
* Return EOF for tty devices, EIO for others
*/
if ((ap->a_vp->v_flag & VISTTY) == 0)
return (EIO);
return (0);
}
/*
* Vnode op for write
*/
/* ARGSUSED */
int
dead_write(void *v)
{
struct vop_write_args *ap = v;
if (chkvnlock(ap->a_vp))
panic("dead_write: lock");
return (EIO);
}
/*
* Device ioctl operation.
*/
/* ARGSUSED */
int
dead_ioctl(void *v)
{
struct vop_ioctl_args *ap = v;
if (!chkvnlock(ap->a_vp))
return (EBADF);
return ((ap->a_vp->v_op->vop_ioctl)(ap));
}
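/*
 * Kqueue events on a dead vnode are serviced by dead_filtops.
 */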
int
dead_kqfilter(void *v)
{
struct vop_kqfilter_args *ap = v;
switch (ap->a_kn->kn_filter) {
case EVFILT_READ:
case EVFILT_WRITE:
ap->a_kn->kn_fop = &dead_filtops;
break;
case EVFILT_EXCEPT:
if ((ap->a_kn->kn_flags & __EV_POLL) == 0)
return (EINVAL);
ap->a_kn->kn_fop = &dead_filtops;
break;
default:
return (EINVAL);
}
return (0);
}
/*
* Just call the device strategy routine
*/
int
dead_strategy(void *v)
{
struct vop_strategy_args *ap = v;
int s;
if (ap->a_bp->b_vp == NULL || !chkvnlock(ap->a_bp->b_vp)) {
ap->a_bp->b_flags |= B_ERROR;
s = splbio();
biodone(ap->a_bp);
splx(s);
return (EIO);
}
return (VOP_STRATEGY(ap->a_bp->b_vp, ap->a_bp));
}
int
dead_inactive(void *v)
{
struct vop_inactive_args *ap = v;
VOP_UNLOCK(ap->a_vp);
return (0);
}
/*
* Wait until the vnode has finished changing state.
*/
int
dead_lock(void *v)
{
struct vop_lock_args *ap = v;
struct vnode *vp = ap->a_vp;
if (ap->a_flags & LK_DRAIN || !chkvnlock(vp))
return (0);
return VOP_LOCK(vp, ap->a_flags);
}
/*
* Wait until the vnode has finished changing state.
*/
int
dead_bmap(void *v)
{
struct vop_bmap_args *ap = v;
if (!chkvnlock(ap->a_vp))
return (EIO);
return (VOP_BMAP(ap->a_vp, ap->a_bn, ap->a_vpp, ap->a_bnp, ap->a_runp));
}
/*
* Print out the contents of a dead vnode.
*/
/* ARGSUSED */
int
dead_print(void *v)
{
printf("tag VT_NON, dead vnode\n");
return 0;
}
/*
* Empty vnode failed operation
*/
/*ARGSUSED*/
int
dead_ebadf(void *v)
{
return (EBADF);
}
/*
* We have to wait during times when the vnode is
* in a state of change.
*/
int
chkvnlock(struct vnode *vp)
{
int locked = 0;
mtx_enter(&vnode_mtx);
while (vp->v_lflag & VXLOCK) {
vp->v_lflag |= VXWANT;
msleep_nsec(vp, &vnode_mtx, PINOD, "chkvnlock", INFSLP);
locked = 1;
}
mtx_leave(&vnode_mtx);
return (locked);
}
/* $OpenBSD: kern_sched.c,v 1.75 2022/08/14 01:58:27 jsg Exp $ */
/*
* Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/task.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>
#include <uvm/uvm_extern.h>
void sched_kthreads_create(void *);
int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);
/*
* To help choosing which cpu should run which process we keep track
* of cpus which are currently idle and which cpus have processes
* queued.
*/
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;
/*
* Some general scheduler counters.
*/
uint64_t sched_nmigrations; /* Cpu migration counter */
uint64_t sched_nomigrations; /* Cpu no migration counter */
uint64_t sched_noidle; /* Times we didn't pick the idle task */
uint64_t sched_stolen; /* Times we stole proc from other cpus */
uint64_t sched_choose; /* Times we chose a cpu */
uint64_t sched_wasidle; /* Times we came out of idle */
int sched_smt;
/*
* A few notes about cpu_switchto that is implemented in MD code.
*
* cpu_switchto takes two arguments, the old proc and the proc
* it should switch to. The new proc will never be NULL, so we always have
* a saved state that we need to switch to. The old proc however can
* be NULL if the process is exiting. NULL for the old proc simply
* means "don't bother saving old state".
*
* cpu_switchto is supposed to atomically load the new state of the process
* including the pcb, pmap and setting curproc, the p_cpu pointer in the
* proc and p_stat to SONPROC. This is atomic only with respect to
* interrupts; other cpus in the system must not depend on this state
* being consistent.
* Therefore no locking is necessary in cpu_switchto other than blocking
* interrupts during the context switch.
*/
/*
* sched_init_cpu is called from main() for the boot cpu, then it's the
* responsibility of the MD code to call it for all other cpus.
*/
void
sched_init_cpu(struct cpu_info *ci)
{
struct schedstate_percpu *spc = &ci->ci_schedstate;
int i;
for (i = 0; i < SCHED_NQS; i++)
TAILQ_INIT(&spc->spc_qs[i]);
spc->spc_idleproc = NULL;
kthread_create_deferred(sched_kthreads_create, ci);
LIST_INIT(&spc->spc_deadproc);
SIMPLEQ_INIT(&spc->spc_deferred);
/*
* Slight hack here until the cpuset code handles cpu_info
* structures.
*/
cpuset_init_cpu(ci);
#ifdef __HAVE_CPU_TOPOLOGY
if (!sched_smt && ci->ci_smt_id > 0)
return;
#endif
cpuset_add(&sched_all_cpus, ci);
}
void
sched_kthreads_create(void *v)
{
struct cpu_info *ci = v;
struct schedstate_percpu *spc = &ci->ci_schedstate;
static int num;
if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
&spc->spc_idleproc))
panic("fork idle");
/* Name it as specified. */
snprintf(spc->spc_idleproc->p_p->ps_comm,
sizeof(spc->spc_idleproc->p_p->ps_comm),
"idle%d", num);
num++;
}
void
sched_idle(void *v)
{
struct schedstate_percpu *spc;
struct proc *p = curproc;
struct cpu_info *ci = v;
int s;
KERNEL_UNLOCK();
spc = &ci->ci_schedstate;
/*
* First time we enter here, we're not supposed to idle,
* just go away for a while.
*/
SCHED_LOCK(s);
cpuset_add(&sched_idle_cpus, ci);
p->p_stat = SSLEEP;
p->p_cpu = ci;
atomic_setbits_int(&p->p_flag, P_CPUPEG);
mi_switch();
cpuset_del(&sched_idle_cpus, ci);
SCHED_UNLOCK(s);
KASSERT(ci == curcpu());
KASSERT(curproc == spc->spc_idleproc);
while (1) {
while (!cpu_is_idle(curcpu())) {
struct proc *dead;
SCHED_LOCK(s);
p->p_stat = SSLEEP;
mi_switch();
SCHED_UNLOCK(s);
while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
LIST_REMOVE(dead, p_hash);
exit2(dead);
}
}
splassert(IPL_NONE);
smr_idle();
cpuset_add(&sched_idle_cpus, ci);
cpu_idle_enter();
while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
if (spc->spc_schedflags & SPCF_SHOULDHALT &&
(spc->spc_schedflags & SPCF_HALTED) == 0) {
cpuset_del(&sched_idle_cpus, ci);
SCHED_LOCK(s);
atomic_setbits_int(&spc->spc_schedflags,
spc->spc_whichqs ? 0 : SPCF_HALTED);
SCHED_UNLOCK(s);
wakeup(spc);
}
#endif
cpu_idle_cycle();
}
cpu_idle_leave();
cpuset_del(&sched_idle_cpus, ci);
}
}
/*
* To free our address space we have to jump through a few hoops.
* The freeing is done by the reaper, but until we have one reaper
* per cpu, we have no way of putting this proc on the deadproc list
* and waking up the reaper without risking having our address space and
* stack torn from under us before we manage to switch to another proc.
* Therefore we have a per-cpu list of dead processes where we put this
* proc and have idle clean up that list and move it to the reaper list.
* All this will be unnecessary once we can bind the reaper to this cpu
* and not risk having it switch to another in case it sleeps.
*/
void
sched_exit(struct proc *p)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
struct timespec ts;
struct proc *idle;
int s;
nanouptime(&ts);
timespecsub(&ts, &spc->spc_runtime, &ts);
timespecadd(&p->p_rtime, &ts, &p->p_rtime);
LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
#ifdef MULTIPROCESSOR
/* This process no longer needs to hold the kernel lock. */
KERNEL_ASSERT_LOCKED();
__mp_release_all(&kernel_lock);
#endif
SCHED_LOCK(s);
idle = spc->spc_idleproc;
idle->p_stat = SRUN;
cpu_switchto(NULL, idle);
panic("cpu_switchto returned");
}
/*
* Run queue management.
*/
void
sched_init_runqueues(void)
{
}
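/*
 * Place p on the run queue of ci (choosing a cpu if ci is NULL), mark
 * the cpu as having queued work, kick it out of idle, and request a
 * reschedule if the new thread has better priority than what is
 * currently running there.
 */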
void
setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
{
struct schedstate_percpu *spc;
int queue = prio >> 2;
if (ci == NULL)
ci = sched_choosecpu(p);
KASSERT(ci != NULL);
SCHED_ASSERT_LOCKED();
p->p_cpu = ci;
p->p_stat = SRUN;
p->p_runpri = prio;
spc = &p->p_cpu->ci_schedstate;
spc->spc_nrun++;
TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
p->p_p->ps_pid);
TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
spc->spc_whichqs |= (1U << queue);
cpuset_add(&sched_queued_cpus, p->p_cpu);
if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
cpu_unidle(p->p_cpu);
if (prio < spc->spc_curpriority)
need_resched(ci);
}
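/*
 * Take p off its cpu's run queue; clear the queue bit and the
 * queued-cpu mark when the last runnable thread leaves.
 */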
void
remrunqueue(struct proc *p)
{
struct schedstate_percpu *spc;
int queue = p->p_runpri >> 2;
SCHED_ASSERT_LOCKED();
spc = &p->p_cpu->ci_schedstate;
spc->spc_nrun--;
TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
p->p_p->ps_pid);
TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
spc->spc_whichqs &= ~(1U << queue);
if (spc->spc_whichqs == 0)
cpuset_del(&sched_queued_cpus, p->p_cpu);
}
}
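/*
 * Pick the next thread for this cpu: the head of the highest-priority
 * non-empty queue, else one stolen from another cpu, else the idle
 * thread. If the cpu is being halted, push queued threads away to
 * other cpus first.
 */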
struct proc *
sched_chooseproc(void)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
struct proc *p;
int queue;
SCHED_ASSERT_LOCKED();
#ifdef MULTIPROCESSOR
if (spc->spc_schedflags & SPCF_SHOULDHALT) {
if (spc->spc_whichqs) {
for (queue = 0; queue < SCHED_NQS; queue++) {
while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
remrunqueue(p);
setrunqueue(NULL, p, p->p_runpri);
if (p->p_cpu == curcpu()) {
KASSERT(p->p_flag & P_CPUPEG);
goto again;
}
}
}
}
p = spc->spc_idleproc;
KASSERT(p);
KASSERT(p->p_wchan == NULL);
p->p_stat = SRUN;
return (p);
}
#endif
again:
if (spc->spc_whichqs) {
queue = ffs(spc->spc_whichqs) - 1;
p = TAILQ_FIRST(&spc->spc_qs[queue]);
remrunqueue(p);
sched_noidle++;
if (p->p_stat != SRUN)
panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
} else if ((p = sched_steal_proc(curcpu())) == NULL) {
p = spc->spc_idleproc;
if (p == NULL) {
int s;
/*
* We get here if someone decides to switch during
* boot before forking kthreads, bleh.
* This is kind of like a stupid idle loop.
*/
#ifdef MULTIPROCESSOR
__mp_unlock(&sched_lock);
#endif
spl0();
delay(10);
SCHED_LOCK(s);
goto again;
}
KASSERT(p);
p->p_stat = SRUN;
}
KASSERT(p->p_wchan == NULL);
return (p);
}
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
struct cpu_info *choice = NULL;
fixpt_t load, best_load = ~0;
int run, best_run = INT_MAX;
struct cpu_info *ci;
struct cpuset set;
#if 0
/*
* XXX
* Don't do this until we have a painless way to move the cpu in exec.
* Preferably when nuking the old pmap and getting a new one on a
* new cpu.
*/
/*
* PPWAIT forks are simple. We know that the parent will not
* run until we exec and choose another cpu, so we just steal its
* cpu.
*/
if (flags & FORK_PPWAIT)
return (parent->p_cpu);
#endif
/*
* Look at all cpus that are currently idle and have nothing queued.
* If there are none, consider all cpus and pick the one with the
* fewest queued procs first, then the one with the lowest load average.
*/
cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
cpuset_intersection(&set, &set, &sched_all_cpus);
if (cpuset_first(&set) == NULL)
cpuset_copy(&set, &sched_all_cpus);
while ((ci = cpuset_first(&set)) != NULL) {
cpuset_del(&set, ci);
load = ci->ci_schedstate.spc_ldavg;
run = ci->ci_schedstate.spc_nrun;
if (choice == NULL || run < best_run ||
(run == best_run && load < best_load)) {
choice = ci;
best_load = load;
best_run = run;
}
}
return (choice);
#else
return (curcpu());
#endif
}
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
struct cpu_info *choice = NULL;
int last_cost = INT_MAX;
struct cpu_info *ci;
struct cpuset set;
/*
* If pegged to a cpu, don't allow it to move.
*/
if (p->p_flag & P_CPUPEG)
return (p->p_cpu);
sched_choose++;
/*
* Look at all cpus that are currently idle and have nothing queued.
* If there are none, fall back to all cpus and pick the cheapest one.
* (idle + queued could mean that the cpu is handling an interrupt
* at this moment and hasn't had time to leave idle yet).
*/
cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
cpuset_intersection(&set, &set, &sched_all_cpus);
/*
* First, just check if our current cpu is in that set, if it is,
* this is simple.
* Also, our cpu might not be idle, but if it's the current cpu
* and it has nothing else queued and we're curproc, take it.
*/
if (cpuset_isset(&set, p->p_cpu) ||
(p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
(p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
curproc == p)) {
sched_wasidle++;
return (p->p_cpu);
}
if (cpuset_first(&set) == NULL)
cpuset_copy(&set, &sched_all_cpus);
while ((ci = cpuset_first(&set)) != NULL) {
int cost = sched_proc_to_cpu_cost(ci, p);
if (choice == NULL || cost < last_cost) {
choice = ci;
last_cost = cost;
}
cpuset_del(&set, ci);
}
if (p->p_cpu != choice)
sched_nmigrations++;
else
sched_nomigrations++;
return (choice);
#else
return (curcpu());
#endif
}
/*
* Attempt to steal a proc from some cpu.
*/
struct proc *
sched_steal_proc(struct cpu_info *self)
{
struct proc *best = NULL;
#ifdef MULTIPROCESSOR
struct schedstate_percpu *spc;
int bestcost = INT_MAX;
struct cpu_info *ci;
struct cpuset set;
KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);
/* Don't steal if we don't want to schedule processes in this CPU. */
if (!cpuset_isset(&sched_all_cpus, self))
return (NULL);
cpuset_copy(&set, &sched_queued_cpus);
while ((ci = cpuset_first(&set)) != NULL) {
struct proc *p;
int queue;
int cost;
cpuset_del(&set, ci);
spc = &ci->ci_schedstate;
queue = ffs(spc->spc_whichqs) - 1;
TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
if (p->p_flag & P_CPUPEG)
continue;
cost = sched_proc_to_cpu_cost(self, p);
if (best == NULL || cost < bestcost) {
best = p;
bestcost = cost;
}
}
}
if (best == NULL)
return (NULL);
remrunqueue(best);
best->p_cpu = self;
sched_stolen++;
#endif
return (best);
}
#ifdef MULTIPROCESSOR
/*
* Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
*/
static int
log2(unsigned int i)
{
int ret = 0;
while (i >>= 1)
ret++;
return (ret);
}
/*
* Calculate the cost of moving the proc to this cpu.
*
* What we want is some guesstimate of how much "performance" it will
* cost us to move the proc here. Not just for caches and TLBs and NUMA
* memory, but also for the proc itself. A highly loaded cpu might not
* be the best candidate for this proc since it won't get run.
*
* Just total guesstimates for now.
*/
int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif
int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
int cost = 0;
#ifdef MULTIPROCESSOR
struct schedstate_percpu *spc;
int l2resident = 0;
spc = &ci->ci_schedstate;
/*
* First, account for the priority of the proc we want to move.
* More willing to move, the lower the priority of the destination
* and the higher the priority of the proc.
*/
if (!cpuset_isset(&sched_idle_cpus, ci)) {
cost += (p->p_usrpri - spc->spc_curpriority) *
sched_cost_priority;
cost += sched_cost_runnable;
}
if (cpuset_isset(&sched_queued_cpus, ci))
cost += spc->spc_nrun * sched_cost_runnable;
/*
* Try to avoid the primary cpu as it handles hardware interrupts.
*
* XXX Needs to be revisited when we distribute interrupts
* over cpus.
*/
if (CPU_IS_PRIMARY(ci))
cost += sched_cost_runnable;
/*
* Higher load on the destination means we don't want to go there.
*/
cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
/*
* If the proc is on this cpu already, lower the cost by how much
* it has been running and an estimate of its footprint.
*/
if (p->p_cpu == ci && p->p_slptime == 0) {
l2resident =
log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
cost -= l2resident * sched_cost_resident;
}
#endif
return (cost);
}
/*
* Peg a proc to a cpu.
*/
void
sched_peg_curproc(struct cpu_info *ci)
{
struct proc *p = curproc;
int s;
SCHED_LOCK(s);
atomic_setbits_int(&p->p_flag, P_CPUPEG);
setrunqueue(ci, p, p->p_usrpri);
p->p_ru.ru_nvcsw++;
mi_switch();
SCHED_UNLOCK(s);
}
#ifdef MULTIPROCESSOR
void
sched_start_secondary_cpus(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
CPU_INFO_FOREACH(cii, ci) {
struct schedstate_percpu *spc = &ci->ci_schedstate;
if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
continue;
atomic_clearbits_int(&spc->spc_schedflags,
SPCF_SHOULDHALT | SPCF_HALTED);
#ifdef __HAVE_CPU_TOPOLOGY
if (!sched_smt && ci->ci_smt_id > 0)
continue;
#endif
cpuset_add(&sched_all_cpus, ci);
}
}
void
sched_stop_secondary_cpus(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
/*
* Make sure we stop the secondary CPUs.
*/
CPU_INFO_FOREACH(cii, ci) {
struct schedstate_percpu *spc = &ci->ci_schedstate;
if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
continue;
cpuset_del(&sched_all_cpus, ci);
atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
}
CPU_INFO_FOREACH(cii, ci) {
struct schedstate_percpu *spc = &ci->ci_schedstate;
struct sleep_state sls;
if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
continue;
while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
sleep_setup(&sls, spc, PZERO, "schedstate", 0);
sleep_finish(&sls,
(spc->spc_schedflags & SPCF_HALTED) == 0);
}
}
}
struct sched_barrier_state {
struct cpu_info *ci;
struct cond cond;
};
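/*
 * Task body for sched_barrier(): peg the current thread to the target
 * cpu, signal the waiter once we are running there, then unpeg.
 */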
void
sched_barrier_task(void *arg)
{
struct sched_barrier_state *sb = arg;
struct cpu_info *ci = sb->ci;
sched_peg_curproc(ci);
cond_signal(&sb->cond);
atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
}
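/*
 * Make sure the given cpu (the primary cpu if ci is NULL) has passed
 * through the scheduler, by queueing a task that must run on that cpu
 * and sleeping until it signals completion.
 */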
void
sched_barrier(struct cpu_info *ci)
{
struct sched_barrier_state sb;
struct task task;
CPU_INFO_ITERATOR cii;
if (ci == NULL) {
CPU_INFO_FOREACH(cii, ci) {
if (CPU_IS_PRIMARY(ci))
break;
}
}
KASSERT(ci != NULL);
if (ci == curcpu())
return;
sb.ci = ci;
cond_init(&sb.cond);
task_set(&task, sched_barrier_task, &sb);
task_add(systqmp, &task);
cond_wait(&sb.cond, "sbar");
}
#else
void
sched_barrier(struct cpu_info *ci)
{
}
#endif
/*
* Functions to manipulate cpu sets.
*/
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;
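/*
 * A cpuset is a bitmask indexed by CPU_INFO_UNIT(ci): cpu "unit" lives
 * in bit (unit % 32) of word cs_set[unit / 32], so, for example, unit 40
 * would be bit 8 of cs_set[1]. cpuset_add() and cpuset_del() use atomic
 * bit operations on the affected word.
 */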
void
cpuset_init_cpu(struct cpu_info *ci)
{
cpuset_add(&cpuset_all, ci);
cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}
void
cpuset_clear(struct cpuset *cs)
{
memset(cs, 0, sizeof(*cs));
}
void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
unsigned int num = CPU_INFO_UNIT(ci);
atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}
void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
unsigned int num = CPU_INFO_UNIT(ci);
atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}
int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
unsigned int num = CPU_INFO_UNIT(ci);
return (cs->cs_set[num/32] & (1U << (num % 32)));
}
void
cpuset_add_all(struct cpuset *cs)
{
cpuset_copy(cs, &cpuset_all);
}
void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
memcpy(to, from, sizeof(*to));
}
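/*
 * Return the first cpu present in the set, or NULL if the set is empty.
 */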
struct cpu_info *
cpuset_first(struct cpuset *cs)
{
int i;
for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
if (cs->cs_set[i])
return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
return (NULL);
}
void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
int i;
for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}
void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
int i;
for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}
void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
int i;
for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}
int
cpuset_cardinality(struct cpuset *cs)
{
int cardinality, i, n;
cardinality = 0;
for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
for (n = cs->cs_set[i]; n != 0; n &= n - 1)
cardinality++;
return (cardinality);
}
int
sysctl_hwncpuonline(void)
{
return cpuset_cardinality(&sched_all_cpus);
}
int
cpu_is_online(struct cpu_info *ci)
{
return cpuset_isset(&sched_all_cpus, ci);
}
#ifdef __HAVE_CPU_TOPOLOGY
#include <sys/sysctl.h>
int
sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
int err, newsmt;
newsmt = sched_smt;
err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
if (err)
return err;
if (newsmt == sched_smt)
return 0;
sched_smt = newsmt;
CPU_INFO_FOREACH(cii, ci) {
if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
continue;
if (ci->ci_smt_id == 0)
continue;
if (sched_smt)
cpuset_add(&sched_all_cpus, ci);
else
cpuset_del(&sched_all_cpus, ci);
}
return 0;
}
#endif
/* $OpenBSD: if_ethersubr.c,v 1.284 2022/06/29 09:08:07 mvs Exp $ */
/* $NetBSD: if_ethersubr.c,v 1.19 1996/05/07 02:40:30 thorpej Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93
*/
/*
%%% portions-copyright-nrl-95
Portions of this software are Copyright 1995-1998 by Randall Atkinson,
Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
Reserved. All rights under this copyright have been assigned to the US
Naval Research Laboratory (NRL). The NRL Copyright Notice and License
Agreement Version 1.1 (January 17, 1995) applies to these portions of the
software.
You should have received a copy of the license with this software. If you
didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.
*/
#include "bpfilter.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/timeout.h>
#include <sys/smr.h>
#include <net/if.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/if_llc.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ip_ipsp.h>
#if NBPFILTER > 0
#include <net/bpf.h>
#endif
#include "vlan.h"
#if NVLAN > 0
#include <net/if_vlan_var.h>
#endif
#include "carp.h"
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
#include "pppoe.h"
#if NPPPOE > 0
#include <net/if_pppoe.h>
#endif
#include "bpe.h"
#if NBPE > 0
#include <net/if_bpe.h>
#endif
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif
#ifdef PIPEX
#include <net/pipex.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#endif /* MPLS */
u_int8_t etherbroadcastaddr[ETHER_ADDR_LEN] =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
u_int8_t etheranyaddr[ETHER_ADDR_LEN] =
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
#define senderr(e) { error = (e); goto bad;}
int
ether_ioctl(struct ifnet *ifp, struct arpcom *arp, u_long cmd, caddr_t data)
{
struct ifreq *ifr = (struct ifreq *)data;
int error = 0;
switch (cmd) {
case SIOCSIFADDR:
break;
case SIOCSIFMTU:
if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > ifp->if_hardmtu)
error = EINVAL;
else
ifp->if_mtu = ifr->ifr_mtu;
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (ifp->if_flags & IFF_MULTICAST) {
error = (cmd == SIOCADDMULTI) ?
ether_addmulti(ifr, arp) : ether_delmulti(ifr, arp);
} else
error = ENOTTY;
break;
default:
error = ENOTTY;
}
return (error);
}
void
ether_rtrequest(struct ifnet *ifp, int req, struct rtentry *rt)
{
if (rt == NULL)
return;
switch (rt_key(rt)->sa_family) {
case AF_INET:
arp_rtrequest(ifp, req, rt);
break;
#ifdef INET6
case AF_INET6:
nd6_rtrequest(ifp, req, rt);
break;
#endif
default:
break;
}
}
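/*
 * Fill in the Ethernet header for an outgoing packet: resolve the
 * link-layer destination (via ARP or ND6, depending on the destination
 * address family) and set the ethertype. The source address comes from
 * the interface unless the caller supplied a complete header via
 * pseudo_AF_HDRCMPLT.
 */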
int
ether_resolve(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
struct rtentry *rt, struct ether_header *eh)
{
struct arpcom *ac = (struct arpcom *)ifp;
sa_family_t af = dst->sa_family;
int error = 0;
if (!ISSET(ifp->if_flags, IFF_RUNNING))
senderr(ENETDOWN);
KASSERT(rt != NULL || ISSET(m->m_flags, M_MCAST|M_BCAST) ||
af == AF_UNSPEC || af == pseudo_AF_HDRCMPLT);
#ifdef DIAGNOSTIC
if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid)) {
printf("%s: trying to send packet on wrong domain. "
"if %d vs. mbuf %d\n", ifp->if_xname,
ifp->if_rdomain, rtable_l2(m->m_pkthdr.ph_rtableid));
}
#endif
switch (af) {
case AF_INET:
error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
if (error)
return (error);
eh->ether_type = htons(ETHERTYPE_IP);
/*
* If broadcasting on a simplex interface, loopback a copy.
* The checksum must be calculated in software. Keep the
* condition in sync with in_ifcap_cksum().
*/
if (ISSET(m->m_flags, M_BCAST) && ISSET(ifp->if_flags, IFF_SIMPLEX) &&
!m->m_pkthdr.pf.routed) {
struct mbuf *mcopy;
/* XXX Should we input an unencrypted IPsec packet? */
mcopy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
if (mcopy != NULL)
if_input_local(ifp, mcopy, af);
}
break;
#ifdef INET6
case AF_INET6:
KERNEL_LOCK();
/* XXXSMP there is a MP race in nd6_resolve() */
error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
KERNEL_UNLOCK();
if (error)
return (error);
eh->ether_type = htons(ETHERTYPE_IPV6);
break;
#endif
#ifdef MPLS
case AF_MPLS:
if (rt == NULL)
senderr(EHOSTUNREACH);
if (!ISSET(ifp->if_xflags, IFXF_MPLS))
senderr(ENETUNREACH);
dst = ISSET(rt->rt_flags, RTF_GATEWAY) ?
rt->rt_gateway : rt_key(rt);
switch (dst->sa_family) {
case AF_LINK:
if (satosdl(dst)->sdl_alen < sizeof(eh->ether_dhost))
senderr(EHOSTUNREACH);
memcpy(eh->ether_dhost, LLADDR(satosdl(dst)),
sizeof(eh->ether_dhost));
break;
#ifdef INET6
case AF_INET6:
KERNEL_LOCK();
/* XXXSMP there is a MP race in nd6_resolve() */
error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
KERNEL_UNLOCK();
if (error)
return (error);
break;
#endif
case AF_INET:
error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
if (error)
return (error);
break;
default:
senderr(EHOSTUNREACH);
}
/* XXX handling for simplex devices in case of M/BCAST ?? */
if (m->m_flags & (M_BCAST | M_MCAST))
eh->ether_type = htons(ETHERTYPE_MPLS_MCAST);
else
eh->ether_type = htons(ETHERTYPE_MPLS);
break;
#endif /* MPLS */
case pseudo_AF_HDRCMPLT:
/* take the whole header from the sa */
memcpy(eh, dst->sa_data, sizeof(*eh));
return (0);
case AF_UNSPEC:
/* take the dst and type from the sa, but get src below */
memcpy(eh, dst->sa_data, sizeof(*eh));
break;
default:
printf("%s: can't handle af%d\n", ifp->if_xname, af);
senderr(EAFNOSUPPORT);
}
memcpy(eh->ether_shost, ac->ac_enaddr, sizeof(eh->ether_shost));
return (0);
bad:
m_freem(m);
return (error);
}
struct mbuf*
ether_encap(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
struct rtentry *rt, int *errorp)
{
struct ether_header eh;
int error;
error = ether_resolve(ifp, m, dst, rt, &eh);
switch (error) {
case 0:
break;
case EAGAIN:
error = 0;
default:
*errorp = error;
return (NULL);
}
m = m_prepend(m, ETHER_ALIGN + sizeof(eh), M_DONTWAIT);
if (m == NULL) {
*errorp = ENOBUFS;
return (NULL);
}
m_adj(m, ETHER_ALIGN);
memcpy(mtod(m, struct ether_header *), &eh, sizeof(eh));
return (m);
}
int
ether_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
struct rtentry *rt)
{
int error;
m = ether_encap(ifp, m, dst, rt, &error);
if (m == NULL)
return (error);
return (if_enqueue(ifp, m));
}
/*
* Process a received Ethernet packet.
*
* Ethernet input has several "phases" of filtering packets to
* support virtual/pseudo interfaces before actual layer 3 protocol
* handling.
*
* First phase:
*
* The first phase supports drivers that aggregate multiple Ethernet
* ports into a single logical interface, ie, aggr(4) and trunk(4).
* These drivers intercept packets by swapping out the if_input handler
* on the "port" interfaces to steal the packets before they get here
* to ether_input().
*/
void
ether_input(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
void (*input)(struct ifnet *, struct mbuf *);
u_int16_t etype;
struct arpcom *ac;
const struct ether_brport *eb;
unsigned int sdelim = 0;
uint64_t dst, self;
/* Drop short frames */
if (m->m_len < ETHER_HDR_LEN)
goto dropanyway;
/*
* Second phase: service delimited packet filtering.
*
* Let vlan(4) and svlan(4) look at "service delimited"
* packets. If a virtual interface does not exist to take
* those packets, they're returned to ether_input() so a
* bridge can have a go at forwarding them.
*/
eh = mtod(m, struct ether_header *);
dst = ether_addr_to_e64((struct ether_addr *)eh->ether_dhost);
etype = ntohs(eh->ether_type);
if (ISSET(m->m_flags, M_VLANTAG) ||
etype == ETHERTYPE_VLAN || etype == ETHERTYPE_QINQ) {
#if NVLAN > 0
m = vlan_input(ifp, m, &sdelim);
if (m == NULL)
return;
#else
sdelim = 1;
#endif
}
/*
* Third phase: bridge processing.
*
* Give the packet to a bridge interface, ie, bridge(4),
* veb(4), or tpmr(4), if it is configured. A bridge
* may take the packet and forward it to another port, or it
* may return it here to ether_input() to support local
* delivery to this port.
*/
ac = (struct arpcom *)ifp;
smr_read_enter();
eb = SMR_PTR_GET(&ac->ac_brport);
if (eb != NULL)
eb->eb_port_take(eb->eb_port);
smr_read_leave();
if (eb != NULL) {
m = (*eb->eb_input)(ifp, m, dst, eb->eb_port);
eb->eb_port_rele(eb->eb_port);
if (m == NULL) {
return;
}
}
/*
* Fourth phase: drop service delimited packets.
*
* If the packet has a tag, and a bridge didn't want it,
* it's not for this port.
*/
if (sdelim)
goto dropanyway;
/*
* Fifth phase: destination address check.
*
* Is the packet specifically addressed to this port?
*/
eh = mtod(m, struct ether_header *);
self = ether_addr_to_e64((struct ether_addr *)ac->ac_enaddr);
if (dst != self) {
#if NCARP > 0
/*
* If it's not for this port, it could be for carp(4).
*/
if (ifp->if_type != IFT_CARP &&
!SRPL_EMPTY_LOCKED(&ifp->if_carp)) {
m = carp_input(ifp, m, dst);
if (m == NULL)
return;
eh = mtod(m, struct ether_header *);
}
#endif
/*
* If not, it must be multicast or broadcast to go further.
*/
if (!ETH64_IS_MULTICAST(dst))
goto dropanyway;
/*
* If this is not a simplex interface, drop the packet
* if it came from us.
*/
if ((ifp->if_flags & IFF_SIMPLEX) == 0) {
uint64_t src = ether_addr_to_e64(
(struct ether_addr *)eh->ether_shost);
if (self == src)
goto dropanyway;
}
SET(m->m_flags, ETH64_IS_BROADCAST(dst) ? M_BCAST : M_MCAST);
ifp->if_imcasts++;
}
/*
* Sixth phase: protocol demux.
*
* At this point it is known that the packet is destined
* for layer 3 protocol handling on the local port.
*/
etype = ntohs(eh->ether_type);
switch (etype) {
case ETHERTYPE_IP:
input = ipv4_input;
break;
case ETHERTYPE_ARP:
if (ifp->if_flags & IFF_NOARP)
goto dropanyway;
input = arpinput;
break;
case ETHERTYPE_REVARP:
if (ifp->if_flags & IFF_NOARP)
goto dropanyway;
input = revarpinput;
break;
#ifdef INET6
/*
* Schedule IPv6 software interrupt for incoming IPv6 packet.
*/
case ETHERTYPE_IPV6:
input = ipv6_input;
break;
#endif /* INET6 */
#if NPPPOE > 0 || defined(PIPEX)
case ETHERTYPE_PPPOEDISC:
case ETHERTYPE_PPPOE:
if (m->m_flags & (M_MCAST | M_BCAST))
goto dropanyway;
#ifdef PIPEX
if (pipex_enable) {
struct pipex_session *session;
if ((session = pipex_pppoe_lookup_session(m)) != NULL) {
pipex_pppoe_input(m, session);
pipex_rele_session(session);
return;
}
}
#endif
if (etype == ETHERTYPE_PPPOEDISC) {
if (mq_enqueue(&pppoediscinq, m) == 0)
schednetisr(NETISR_PPPOE);
} else {
if (mq_enqueue(&pppoeinq, m) == 0)
schednetisr(NETISR_PPPOE);
}
return;
#endif
#ifdef MPLS
case ETHERTYPE_MPLS:
case ETHERTYPE_MPLS_MCAST:
input = mpls_input;
break;
#endif
#if NBPE > 0
case ETHERTYPE_PBB:
bpe_input(ifp, m);
return;
#endif
default:
goto dropanyway;
}
m_adj(m, sizeof(*eh));
(*input)(ifp, m);
return;
dropanyway:
m_freem(m);
return;
}
int
ether_brport_isset(struct ifnet *ifp)
{
struct arpcom *ac = (struct arpcom *)ifp;
KERNEL_ASSERT_LOCKED();
if (SMR_PTR_GET_LOCKED(&ac->ac_brport) != NULL)
return (EBUSY);
return (0);
}
void
ether_brport_set(struct ifnet *ifp, const struct ether_brport *eb)
{
struct arpcom *ac = (struct arpcom *)ifp;
KERNEL_ASSERT_LOCKED();
KASSERTMSG(SMR_PTR_GET_LOCKED(&ac->ac_brport) == NULL,
"%s setting an already set brport", ifp->if_xname);
SMR_PTR_SET_LOCKED(&ac->ac_brport, eb);
}
void
ether_brport_clr(struct ifnet *ifp)
{
struct arpcom *ac = (struct arpcom *)ifp;
KERNEL_ASSERT_LOCKED();
KASSERTMSG(SMR_PTR_GET_LOCKED(&ac->ac_brport) != NULL,
"%s clearing an already clear brport", ifp->if_xname);
SMR_PTR_SET_LOCKED(&ac->ac_brport, NULL);
}
const struct ether_brport *
ether_brport_get(struct ifnet *ifp)
{
struct arpcom *ac = (struct arpcom *)ifp;
SMR_ASSERT_CRITICAL();
return (SMR_PTR_GET(&ac->ac_brport));
}
const struct ether_brport *
ether_brport_get_locked(struct ifnet *ifp)
{
struct arpcom *ac = (struct arpcom *)ifp;
KERNEL_ASSERT_LOCKED();
return (SMR_PTR_GET_LOCKED(&ac->ac_brport));
}
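#if 0
/*
 * Illustrative sketch, not part of the original source: a bridge-like
 * driver attaches to a port by filling in a struct ether_brport and
 * registering it while holding the kernel lock.  ether_input() looks the
 * port up under an SMR read section, takes a reference with
 * eb_port_take(), and calls eb_input with that reference held.  The
 * example_attach_brport() helper below is hypothetical.
 */
static void
example_attach_brport(struct ifnet *ifp, struct ether_brport *eb)
{
	KERNEL_LOCK();
	/* only claim the port if nothing else already owns it */
	if (ether_brport_isset(ifp) == 0)
		ether_brport_set(ifp, eb);
	KERNEL_UNLOCK();
}
#endif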
/*
* Convert Ethernet address to printable (loggable) representation.
*/
static char digits[] = "0123456789abcdef";
char *
ether_sprintf(u_char *ap)
{
int i;
static char etherbuf[ETHER_ADDR_LEN * 3];
char *cp = etherbuf;
for (i = 0; i < ETHER_ADDR_LEN; i++) {
*cp++ = digits[*ap >> 4];
*cp++ = digits[*ap++ & 0xf];
*cp++ = ':';
}
*--cp = 0;
return (etherbuf);
}
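#if 0
/*
 * Illustrative sketch, not part of the original source: ether_sprintf()
 * formats a 6-byte hardware address into a static "aa:bb:cc:dd:ee:ff"
 * buffer, so the result must be consumed (e.g. printed) before the next
 * call overwrites it.  The example_print_lladdr() helper is hypothetical.
 */
static void
example_print_lladdr(struct ifnet *ifp)
{
	struct arpcom *ac = (struct arpcom *)ifp;
	/* print and discard the formatted address before the next call */
	printf("%s: lladdr %s\n", ifp->if_xname, ether_sprintf(ac->ac_enaddr));
}
#endif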
/*
* Generate a (hopefully) acceptable MAC address, if asked.
*/
void
ether_fakeaddr(struct ifnet *ifp)
{
static int unit;
int rng = arc4random();
/* Non-multicast; locally administered address */
((struct arpcom *)ifp)->ac_enaddr[0] = 0xfe;
((struct arpcom *)ifp)->ac_enaddr[1] = 0xe1;
((struct arpcom *)ifp)->ac_enaddr[2] = 0xba;
((struct arpcom *)ifp)->ac_enaddr[3] = 0xd0 | (unit++ & 0xf);
((struct arpcom *)ifp)->ac_enaddr[4] = rng;
((struct arpcom *)ifp)->ac_enaddr[5] = rng >> 8;
}
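#if 0
/*
 * Illustrative sketch, not part of the original source: the fe:e1:ba:dX
 * prefix keeps the group bit (0x01) clear and the locally-administered bit
 * (0x02) set in the first octet, so the fabricated address is unicast and
 * is not taken from the IEEE-assigned OUI space.  The helper below is
 * hypothetical and only restates that bit layout.
 */
static int
example_is_locally_administered(const u_int8_t *enaddr)
{
	/* unicast (group bit clear) and locally administered (bit set) */
	return ((enaddr[0] & 0x01) == 0 && (enaddr[0] & 0x02) != 0);
}
#endif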
/*
* Perform common duties while attaching to interface list
*/
void
ether_ifattach(struct ifnet *ifp)
{
struct arpcom *ac = (struct arpcom *)ifp;
/*
* Any interface which provides a MAC address which is obviously
* invalid gets whacked, so that users will notice.
*/
if (ETHER_IS_MULTICAST(((struct arpcom *)ifp)->ac_enaddr))
ether_fakeaddr(ifp);
ifp->if_type = IFT_ETHER;
ifp->if_addrlen = ETHER_ADDR_LEN;
ifp->if_hdrlen = ETHER_HDR_LEN;
ifp->if_mtu = ETHERMTU;
ifp->if_input = ether_input;
if (ifp->if_output == NULL)
ifp->if_output = ether_output;
ifp->if_rtrequest = ether_rtrequest;
if (ifp->if_hardmtu == 0)
ifp->if_hardmtu = ETHERMTU;
if_alloc_sadl(ifp);
memcpy(LLADDR(ifp->if_sadl), ac->ac_enaddr, ifp->if_addrlen);
LIST_INIT(&ac->ac_multiaddrs);
#if NBPFILTER > 0
bpfattach(&ifp->if_bpf, ifp, DLT_EN10MB, ETHER_HDR_LEN);
#endif
}
void
ether_ifdetach(struct ifnet *ifp)
{
struct arpcom *ac = (struct arpcom *)ifp;
struct ether_multi *enm;
/* Undo pseudo-driver changes. */
if_deactivate(ifp);
for (enm = LIST_FIRST(&ac->ac_multiaddrs);
enm != NULL;
enm = LIST_FIRST(&ac->ac_multiaddrs)) {
LIST_REMOVE(enm, enm_list);
free(enm, M_IFMADDR, sizeof *enm);
}
}
#if 0
/*
* This is for reference. We have table-driven versions of the
* crc32 generators, which are faster than the double-loop.
*/
u_int32_t __pure
ether_crc32_le_update(u_int32_t crc, const u_int8_t *buf, size_t len)
{
u_int32_t c, carry;
size_t i, j;
for (i = 0; i < len; i++) {
c = buf[i];
for (j = 0; j < 8; j++) {
carry = ((crc & 0x01) ? 1 : 0) ^ (c & 0x01);
crc >>= 1;
c >>= 1;
if (carry)
crc = (crc ^ ETHER_CRC_POLY_LE);
}
}
return (crc);
}
u_int32_t __pure
ether_crc32_be_update(u_int32_t crc, const u_int8_t *buf, size_t len)
{
u_int32_t c, carry;
size_t i, j;
for (i = 0; i < len; i++) {
c = buf[i];
for (j = 0; j < 8; j++) {
carry = ((crc & 0x80000000U) ? 1 : 0) ^ (c & 0x01);
crc <<= 1;
c >>= 1;
if (carry)
crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
}
}
return (crc);
}
#else
u_int32_t __pure
ether_crc32_le_update(u_int32_t crc, const u_int8_t *buf, size_t len)
{
static const u_int32_t crctab[] = {
0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
};
size_t i;
for (i = 0; i < len; i++) {
crc ^= buf[i];
crc = (crc >> 4) ^ crctab[crc & 0xf];
crc = (crc >> 4) ^ crctab[crc & 0xf];
}
return (crc);
}
u_int32_t __pure
ether_crc32_be_update(u_int32_t crc, const u_int8_t *buf, size_t len)
{
static const u_int8_t rev[] = {
0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
};
static const u_int32_t crctab[] = {
0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9,
0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd
};
size_t i;
u_int8_t data;
for (i = 0; i < len; i++) {
data = buf[i];
crc = (crc << 4) ^ crctab[(crc >> 28) ^ rev[data & 0xf]];
crc = (crc << 4) ^ crctab[(crc >> 28) ^ rev[data >> 4]];
}
return (crc);
}
#endif
u_int32_t
ether_crc32_le(const u_int8_t *buf, size_t len)
{
return ether_crc32_le_update(0xffffffff, buf, len);
}
u_int32_t
ether_crc32_be(const u_int8_t *buf, size_t len)
{
return ether_crc32_be_update(0xffffffff, buf, len);
}
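#if 0
/*
 * Illustrative sketch, not part of the original source: many drivers run
 * ether_crc32_le()/ether_crc32_be() over a multicast address and keep a few
 * bits of the result as an index into a hardware hash filter.  The 6-bit
 * extraction below is only an example; the width and bit order are device
 * specific, and example_mchash() is a hypothetical helper.
 */
static u_int
example_mchash(const u_int8_t addr[ETHER_ADDR_LEN])
{
	/* low 6 bits of the little-endian CRC select one of 64 filter bits */
	return (ether_crc32_le(addr, ETHER_ADDR_LEN) & 0x3f);
}
#endif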
u_char ether_ipmulticast_min[ETHER_ADDR_LEN] =
{ 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };
u_char ether_ipmulticast_max[ETHER_ADDR_LEN] =
{ 0x01, 0x00, 0x5e, 0x7f, 0xff, 0xff };
#ifdef INET6
u_char ether_ip6multicast_min[ETHER_ADDR_LEN] =
{ 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 };
u_char ether_ip6multicast_max[ETHER_ADDR_LEN] =
{ 0x33, 0x33, 0xff, 0xff, 0xff, 0xff };
#endif
/*
* Convert a sockaddr into an Ethernet address or range of Ethernet
* addresses.
*/
int
ether_multiaddr(struct sockaddr *sa, u_int8_t addrlo[ETHER_ADDR_LEN],
u_int8_t addrhi[ETHER_ADDR_LEN])
{
struct sockaddr_in *sin;
#ifdef INET6
struct sockaddr_in6 *sin6;
#endif /* INET6 */
switch (sa->sa_family) {
case AF_UNSPEC:
memcpy(addrlo, sa->sa_data, ETHER_ADDR_LEN);
memcpy(addrhi, addrlo, ETHER_ADDR_LEN);
break;
case AF_INET:
sin = satosin(sa);
if (sin->sin_addr.s_addr == INADDR_ANY) {
/*
* An IP address of INADDR_ANY means listen to
* or stop listening to all of the Ethernet
* multicast addresses used for IP.
* (This is for the sake of IP multicast routers.)
*/
memcpy(addrlo, ether_ipmulticast_min, ETHER_ADDR_LEN);
memcpy(addrhi, ether_ipmulticast_max, ETHER_ADDR_LEN);
} else {
ETHER_MAP_IP_MULTICAST(&sin->sin_addr, addrlo);
memcpy(addrhi, addrlo, ETHER_ADDR_LEN);
}
break;
#ifdef INET6
case AF_INET6:
sin6 = satosin6(sa);
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/*
* An IP6 address of 0 means listen to or stop
* listening to all of the Ethernet multicast
* address used for IP6.
*
* (This might not be healthy, given IPv6's reliance on
* multicast for things like neighbor discovery.
* Perhaps initializing all-nodes, solicited nodes, and
* possibly all-routers for this interface afterwards
* is not a bad idea.)
*/
memcpy(addrlo, ether_ip6multicast_min, ETHER_ADDR_LEN);
memcpy(addrhi, ether_ip6multicast_max, ETHER_ADDR_LEN);
} else {
ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, addrlo);
memcpy(addrhi, addrlo, ETHER_ADDR_LEN);
}
break;
#endif
default:
return (EAFNOSUPPORT);
}
return (0);
}
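#if 0
/*
 * Illustrative sketch, not part of the original source: for a specific IPv4
 * group, ether_multiaddr() yields a single address (addrlo == addrhi)
 * derived via ETHER_MAP_IP_MULTICAST(), e.g. 224.0.0.251 maps to
 * 01:00:5e:00:00:fb, while INADDR_ANY yields the whole
 * 01:00:5e:00:00:00 - 01:00:5e:7f:ff:ff range.  example_map_group() is a
 * hypothetical helper.
 */
static int
example_map_group(struct sockaddr_in *sin)
{
	u_int8_t lo[ETHER_ADDR_LEN], hi[ETHER_ADDR_LEN];
	/* map the group (or INADDR_ANY) to an Ethernet address range */
	return (ether_multiaddr((struct sockaddr *)sin, lo, hi));
}
#endif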
/*
* Add an Ethernet multicast address or range of addresses to the list for a
* given interface.
*/
int
ether_addmulti(struct ifreq *ifr, struct arpcom *ac)
{
struct ether_multi *enm;
u_char addrlo[ETHER_ADDR_LEN];
u_char addrhi[ETHER_ADDR_LEN];
int s = splnet(), error;
error = ether_multiaddr(&ifr->ifr_addr, addrlo, addrhi);
if (error != 0) {
splx(s);
return (error);
}
/*
* Verify that we have valid Ethernet multicast addresses.
*/
if ((addrlo[0] & 0x01) != 1 || (addrhi[0] & 0x01) != 1) {
splx(s);
return (EINVAL);
}
/*
* See if the address range is already in the list.
*/
ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm);
if (enm != NULL) {
/*
* Found it; just increment the reference count.
*/
++enm->enm_refcount;
splx(s);
return (0);
}
/*
* New address or range; malloc a new multicast record
* and link it into the interface's multicast list.
*/
enm = malloc(sizeof(*enm), M_IFMADDR, M_NOWAIT);
if (enm == NULL) {
splx(s);
return (ENOBUFS);
}
memcpy(enm->enm_addrlo, addrlo, ETHER_ADDR_LEN);
memcpy(enm->enm_addrhi, addrhi, ETHER_ADDR_LEN);
enm->enm_refcount = 1;
LIST_INSERT_HEAD(&ac->ac_multiaddrs, enm, enm_list);
ac->ac_multicnt++;
if (memcmp(addrlo, addrhi, ETHER_ADDR_LEN) != 0)
ac->ac_multirangecnt++;
splx(s);
/*
* Return ENETRESET to inform the driver that the list has changed
* and its reception filter should be adjusted accordingly.
*/
return (ENETRESET);
}
/*
* Delete a multicast address record.
*/
int
ether_delmulti(struct ifreq *ifr, struct arpcom *ac)
{
struct ether_multi *enm;
u_char addrlo[ETHER_ADDR_LEN];
u_char addrhi[ETHER_ADDR_LEN];
int s = splnet(), error;
error = ether_multiaddr(&ifr->ifr_addr, addrlo, addrhi);
if (error != 0) {
splx(s);
return (error);
}
/*
* Look up the address in our list.
*/
ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm);
if (enm == NULL) {
splx(s);
return (ENXIO);
}
if (--enm->enm_refcount != 0) {
/*
* Still some claims to this record.
*/
splx(s);
return (0);
}
/*
* No remaining claims to this record; unlink and free it.
*/
LIST_REMOVE(enm, enm_list);
free(enm, M_IFMADDR, sizeof *enm);
ac->ac_multicnt--;
if (memcmp(addrlo, addrhi, ETHER_ADDR_LEN) != 0)
ac->ac_multirangecnt--;
splx(s);
/*
* Return ENETRESET to inform the driver that the list has changed
* and its reception filter should be adjusted accordingly.
*/
return (ENETRESET);
}
uint64_t
ether_addr_to_e64(const struct ether_addr *ea)
{
uint64_t e64 = 0;
size_t i;
for (i = 0; i < nitems(ea->ether_addr_octet); i++) {
e64 <<= 8;
e64 |= ea->ether_addr_octet[i];
}
return (e64);
}
void
ether_e64_to_addr(struct ether_addr *ea, uint64_t e64)
{
size_t i = nitems(ea->ether_addr_octet);
do {
ea->ether_addr_octet[--i] = e64;
e64 >>= 8;
} while (i > 0);
}
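#if 0
/*
 * Illustrative sketch, not part of the original source: the e64 helpers
 * pack the six address octets into the low 48 bits of a uint64_t, most
 * significant octet first, so the broadcast address becomes
 * 0xffffffffffffULL and the conversion round-trips exactly.  The
 * example_e64_roundtrip() helper is hypothetical.
 */
static int
example_e64_roundtrip(const struct ether_addr *ea)
{
	struct ether_addr copy;
	/* convert to the 48-bit integer form and back again */
	ether_e64_to_addr(&copy, ether_addr_to_e64(ea));
	return (memcmp(&copy, ea, sizeof(copy)) == 0);
}
#endif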
/* $OpenBSD: tcp_debug.c,v 1.30 2022/02/22 01:15:02 guenther Exp $ */
/* $NetBSD: tcp_debug.c,v 1.10 1996/02/13 23:43:36 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#ifdef TCPDEBUG
/* load symbolic names */
#define PRUREQUESTS
#define TCPSTATES
#define TCPTIMERS
#define TANAMES
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>
#include <netinet/tcp_fsm.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif /* INET6 */
#ifdef TCPDEBUG
int tcpconsdebug = 0;
#endif
struct tcp_debug tcp_debug[TCP_NDEBUG];
int tcp_debx;
/*
* Tcp debug routines
*/
void
tcp_trace(short act, short ostate, struct tcpcb *tp, struct tcpcb *otp,
caddr_t headers, int req, int len)
{
#ifdef TCPDEBUG
tcp_seq seq, ack;
int flags;
#endif
int pf = PF_UNSPEC;
struct tcp_debug *td = &tcp_debug[tcp_debx++];
struct tcpiphdr *ti = (struct tcpiphdr *)headers;
struct tcpipv6hdr *ti6 = (struct tcpipv6hdr *)headers;
struct tcphdr *th;
if (tcp_debx == TCP_NDEBUG)
tcp_debx = 0;
td->td_time = iptime();
td->td_act = act;
td->td_ostate = ostate;
td->td_tcb = (caddr_t)otp;
if (tp) {
pf = tp->pf;
td->td_cb = *tp;
} else
bzero((caddr_t)&td->td_cb, sizeof (*tp));
bzero(&td->td_ti6, sizeof(struct tcpipv6hdr));
bzero(&td->td_ti, sizeof(struct tcpiphdr));
if (headers) {
/* The address family may be in tcpcb or ip header. */
if (pf == PF_UNSPEC) {
switch (ti6->ti6_i.ip6_vfc & IPV6_VERSION_MASK) {
#ifdef INET6
case IPV6_VERSION:
pf = PF_INET6;
break;
#endif /* INET6 */
case IPVERSION:
pf = PF_INET;
break;
}
}
switch (pf) {
#ifdef INET6
case PF_INET6:
th = &ti6->ti6_t;
td->td_ti6 = *ti6;
td->td_ti6.ti6_plen = len;
break;
#endif /* INET6 */
case PF_INET:
th = &ti->ti_t;
td->td_ti = *ti;
td->td_ti.ti_len = len;
break;
default:
headers = NULL;
break;
}
}
td->td_req = req;
#ifdef TCPDEBUG
if (tcpconsdebug == 0)
return;
if (otp)
printf("%p %s:", otp, tcpstates[ostate]);
else
printf("???????? ");
printf("%s ", tanames[act]);
switch (act) {
case TA_INPUT:
case TA_OUTPUT:
case TA_DROP:
if (headers == NULL)
break;
seq = th->th_seq;
ack = th->th_ack;
if (act == TA_OUTPUT) {
seq = ntohl(seq);
ack = ntohl(ack);
}
if (len)
printf("[%x..%x)", seq, seq+len);
else
printf("%x", seq);
printf("@%x, urp=%x", ack, th->th_urp);
flags = th->th_flags;
if (flags) {
char *cp = "<";
#define pf(f) { if (th->th_flags&TH_##f) { printf("%s%s", cp, #f); cp = ","; } }
pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG);
printf(">");
}
break;
case TA_USER:
printf("%s", prurequests[req]);
break;
case TA_TIMER:
printf("%s", tcptimers[req]);
break;
}
if (tp)
printf(" -> %s", tcpstates[tp->t_state]);
/* print out internal state of tp !?! */
printf("\n");
if (tp == NULL)
return;
printf("\trcv_(nxt,wnd,up) (%x,%lx,%x) snd_(una,nxt,max) (%x,%x,%x)\n",
tp->rcv_nxt, tp->rcv_wnd, tp->rcv_up, tp->snd_una, tp->snd_nxt,
tp->snd_max);
printf("\tsnd_(wl1,wl2,wnd) (%x,%x,%lx)\n",
tp->snd_wl1, tp->snd_wl2, tp->snd_wnd);
#endif /* TCPDEBUG */
}
/* $OpenBSD: db_usrreq.c,v 1.22 2021/01/09 20:58:12 gnezdo Exp $ */
/*
* Copyright (c) 1996 Michael Shalayeff. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/sysctl.h>
#include <dev/cons.h>
#include <ddb/db_var.h>
int db_log = 1;
int db_profile; /* Allow dynamic profiling */
const struct sysctl_bounded_args ddb_vars[] = {
{ DBCTL_RADIX, &db_radix, 8, 16 },
{ DBCTL_MAXWIDTH, &db_max_width, 0, INT_MAX },
{ DBCTL_TABSTOP, &db_tab_stop_width, 1, 16 },
{ DBCTL_MAXLINE, &db_max_line, 0, INT_MAX },
{ DBCTL_LOG, &db_log, 0, 1 },
};
int
ddb_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
/* All sysctl names at this level are terminal. */
if (namelen != 1)
return (ENOTDIR);
switch (name[0]) {
case DBCTL_PANIC:
if (securelevel > 0)
return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
&db_panic));
else {
return (sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&db_panic, 0, 1));
}
break;
case DBCTL_CONSOLE:
if (securelevel > 0)
return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
&db_console));
else {
return (sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&db_console, 0, 1));
}
break;
case DBCTL_TRIGGER:
if (newp && db_console) {
struct process *pr = curproc->p_p;
if (securelevel < 1 || (pr->ps_flags & PS_CONTROLT && cn_tab &&
cn_tab->cn_dev == pr->ps_session->s_ttyp->t_dev)) {
db_enter();
newp = NULL;
} else
return (ENODEV);
}
return (sysctl_rdint(oldp, oldlenp, newp, 0));
#if defined(DDBPROF)
case DBCTL_PROFILE:
if (securelevel > 0)
return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
&db_profile));
else {
return (sysctl_int_bounded(oldp, oldlenp, newp, newlen,
&db_profile, 0, 1));
}
break;
#endif /* DDBPROF */
default:
return (sysctl_bounded_arr(ddb_vars, nitems(ddb_vars), name,
namelen, oldp, oldlenp, newp, newlen));
}
/* NOTREACHED */
}
/* $OpenBSD: vfs_sync.c,v 1.68 2022/08/14 01:58:28 jsg Exp $ */
/*
* Portions of this code are:
*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Syncer daemon
*/
#include <sys/queue.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/time.h>
#ifdef FFS_SOFTUPDATES
int softdep_process_worklist(struct mount *);
#endif
/*
* The workitem queue.
*/
#define SYNCER_MAXDELAY 32 /* maximum sync delay time */
#define SYNCER_DEFAULT 30 /* default sync delay time */
int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
int syncdelay = SYNCER_DEFAULT; /* time to delay syncing vnodes */
int rushjob = 0; /* number of slots to run ASAP */
int stat_rush_requests = 0; /* number of rush requests */
int syncer_delayno = 0;
long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;
struct proc *syncerproc;
int syncer_chan;
/*
* The workitem queue.
*
* It is useful to delay writes of file data and filesystem metadata
* for tens of seconds so that quickly created and deleted files need
* not waste disk bandwidth being created and removed. To realize this,
* we append vnodes to a "workitem" queue. When running with a soft
* updates implementation, most pending metadata dependencies should
* not wait for more than a few seconds. Thus, mounted block devices
* are delayed only about half the time that file data is delayed.
* Similarly, directory updates are more critical, so are only delayed
* about a third the time that file data is delayed. Thus, there are
* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
* one each second (driven off the filesystem syncer process). The
* syncer_delayno variable indicates the next queue that is to be processed.
* Items that need to be processed soon are placed in this queue:
*
* syncer_workitem_pending[syncer_delayno]
*
* A delay of fifteen seconds is done by placing the request fifteen
* entries later in the queue:
*
* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
*
*/
void
vn_initialize_syncerd(void)
{
syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, M_WAITOK,
&syncer_mask);
syncer_maxdelay = syncer_mask + 1;
}
/*
* Add an item to the syncer work queue.
*/
void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
int s, slot;
if (delay > syncer_maxdelay - 2)
delay = syncer_maxdelay - 2;
slot = (syncer_delayno + delay) & syncer_mask;
s = splbio();
if (vp->v_bioflag & VBIOONSYNCLIST)
LIST_REMOVE(vp, v_synclist);
vp->v_bioflag |= VBIOONSYNCLIST;
LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
splx(s);
}
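#if 0
/*
 * Illustrative sketch, not part of the original source: the delay is just an
 * offset from the slot currently being drained, wrapped with syncer_mask.
 * With syncer_delayno == 30 and a 32-slot table, a 15 second delay lands in
 * slot (30 + 15) & 31 == 13, i.e. 15 turns of the per-second sweep away.
 * example_syncer_slot() is a hypothetical helper restating that arithmetic.
 */
static int
example_syncer_slot(int delay)
{
	/* clamp as vn_syncer_add_to_worklist() does, then wrap into the table */
	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	return ((syncer_delayno + delay) & syncer_mask);
}
#endif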
/*
* System filesystem synchronizer daemon.
*/
void
syncer_thread(void *arg)
{
uint64_t elapsed, start;
struct proc *p = curproc;
struct synclist *slp;
struct vnode *vp;
int s;
for (;;) {
start = getnsecuptime();
/*
* Push files whose dirty time has expired.
*/
s = splbio();
slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno == syncer_maxdelay)
syncer_delayno = 0;
while ((vp = LIST_FIRST(slp)) != NULL) {
if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT)) {
/*
* If we fail to get the lock, we move this
* vnode one second ahead in time.
* XXX - no good, but the best we can do.
*/
vn_syncer_add_to_worklist(vp, 1);
continue;
}
splx(s);
(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
vput(vp);
s = splbio();
if (LIST_FIRST(slp) == vp) {
/*
* Note: disk vps can remain on the
* worklist too with no dirty blocks, but
* since sync_fsync() moves it to a different
* slot we are safe.
*/
#ifdef DIAGNOSTIC
if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
vp->v_type != VBLK) {
vprint("fsync failed", vp);
if (vp->v_mount != NULL)
printf("mounted on: %s\n",
vp->v_mount->mnt_stat.f_mntonname);
panic("%s: fsync failed", __func__);
}
#endif /* DIAGNOSTIC */
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*/
vn_syncer_add_to_worklist(vp, syncdelay);
}
sched_pause(yield);
}
splx(s);
#ifdef FFS_SOFTUPDATES
/*
* Do soft update processing.
*/
softdep_process_worklist(NULL);
#endif
/*
* The variable rushjob allows the kernel to speed up the
* processing of the filesystem syncer process. A rushjob
* value of N tells the filesystem syncer to process the next
* N seconds worth of work on its queue ASAP. Currently rushjob
* is used by the soft update code to speed up the filesystem
* syncer process when the incore state is getting so far
* ahead of the disk that the kernel memory pool is being
* threatened with exhaustion.
*/
if (rushjob > 0) {
rushjob -= 1;
continue;
}
/*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
*/
elapsed = getnsecuptime() - start;
if (elapsed < SEC_TO_NSEC(1)) {
tsleep_nsec(&syncer_chan, PPAUSE, "syncer",
SEC_TO_NSEC(1) - elapsed);
}
}
}
/*
* Request the syncer daemon to speed up its work.
* We never push it to speed up more than half of its
* normal turn time, otherwise it could take over the cpu.
*/
int
speedup_syncer(void)
{
if (syncerproc)
wakeup_proc(syncerproc, &syncer_chan);
if (rushjob < syncdelay / 2) {
rushjob += 1;
stat_rush_requests += 1;
return 1;
}
return 0;
}
/* Routine to create and manage a filesystem syncer vnode. */
int sync_fsync(void *);
int sync_inactive(void *);
int sync_print(void *);
const struct vops sync_vops = {
.vop_close = nullop,
.vop_fsync = sync_fsync,
.vop_inactive = sync_inactive,
.vop_reclaim = nullop,
.vop_lock = nullop,
.vop_unlock = nullop,
.vop_islocked = nullop,
.vop_print = sync_print
};
/*
* Create a new filesystem syncer vnode for the specified mount point.
*/
int
vfs_allocate_syncvnode(struct mount *mp)
{
struct vnode *vp;
static long start, incr, next;
int error;
/* Allocate a new vnode */
if ((error = getnewvnode(VT_VFS, mp, &sync_vops, &vp)) != 0) {
mp->mnt_syncer = NULL;
return (error);
}
vp->v_writecount = 1;
vp->v_type = VNON;
/*
* Place the vnode onto the syncer worklist. We attempt to
* scatter them about on the list so that they will go off
* at evenly distributed times even if all the filesystems
* are mounted at once.
*/
next += incr;
if (next == 0 || next > syncer_maxdelay) {
start /= 2;
incr /= 2;
if (start == 0) {
start = syncer_maxdelay / 2;
incr = syncer_maxdelay;
}
next = start;
}
vn_syncer_add_to_worklist(vp, next);
mp->mnt_syncer = vp;
return (0);
}
/*
* Do a lazy sync of the filesystem.
*/
int
sync_fsync(void *v)
{
struct vop_fsync_args *ap = v;
struct vnode *syncvp = ap->a_vp;
struct mount *mp = syncvp->v_mount;
int asyncflag;
/*
* We only need to do something if this is a lazy evaluation.
*/
if (ap->a_waitfor != MNT_LAZY)
return (0);
/*
* Move ourselves to the back of the sync list.
*/
vn_syncer_add_to_worklist(syncvp, syncdelay);
/*
* Walk the list of vnodes pushing all that are dirty and
* not already on the sync list.
*/
if (vfs_busy(mp, VB_READ|VB_NOWAIT) == 0) {
asyncflag = mp->mnt_flag & MNT_ASYNC;
mp->mnt_flag &= ~MNT_ASYNC;
VFS_SYNC(mp, MNT_LAZY, 0, ap->a_cred, ap->a_p);
if (asyncflag)
mp->mnt_flag |= MNT_ASYNC;
vfs_unbusy(mp);
}
return (0);
}
/*
* The syncer vnode is no longer needed and is being decommissioned.
*/
int
sync_inactive(void *v)
{
struct vop_inactive_args *ap = v;
struct vnode *vp = ap->a_vp;
int s;
if (vp->v_usecount == 0) {
VOP_UNLOCK(vp);
return (0);
}
vp->v_mount->mnt_syncer = NULL;
s = splbio();
LIST_REMOVE(vp, v_synclist);
vp->v_bioflag &= ~VBIOONSYNCLIST;
splx(s);
vp->v_writecount = 0;
vput(vp);
return (0);
}
/*
* Print out a syncer vnode.
*/
int
sync_print(void *v)
{
printf("syncer vnode\n");
return (0);
}
/* $OpenBSD: ffs_subr.c,v 1.34 2021/10/20 06:35:39 semarie Exp $ */
/* $NetBSD: ffs_subr.c,v 1.6 1996/03/17 02:16:23 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93
*/
#include <sys/param.h>
#include <ufs/ffs/fs.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/buf.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/ffs_extern.h>
/*
* Return buffer with the contents of block "offset" from the beginning of
* directory "ip". If "res" is non-zero, fill it in with a pointer to the
* remaining space in the directory.
*/
int
ffs_bufatoff(struct inode *ip, off_t offset, char **res, struct buf **bpp)
{
struct fs *fs;
struct vnode *vp;
struct buf *bp;
daddr_t lbn;
int bsize, error;
vp = ITOV(ip);
fs = ip->i_fs;
lbn = lblkno(fs, offset);
bsize = blksize(fs, ip, lbn);
*bpp = NULL;
if ((error = bread(vp, lbn, fs->fs_bsize, &bp)) != 0) {
brelse(bp);
return (error);
}
buf_adjcnt(bp, bsize);
if (res)
*res = (char *)bp->b_data + blkoff(fs, offset);
*bpp = bp;
return (0);
}
#else
/* Prototypes for userland */
void ffs_fragacct(struct fs *, int, int32_t[], int);
int ffs_isfreeblock(struct fs *, u_char *, daddr_t);
int ffs_isblock(struct fs *, u_char *, daddr_t);
void ffs_clrblock(struct fs *, u_char *, daddr_t);
void ffs_setblock(struct fs *, u_char *, daddr_t);
__dead void panic(const char *, ...);
#endif
/*
* Update the frsum fields to reflect addition or deletion
* of some frags.
*/
void
ffs_fragacct(struct fs *fs, int fragmap, int32_t fraglist[], int cnt)
{
int inblk;
int field, subfield;
int siz, pos;
inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
fragmap <<= 1;
for (siz = 1; siz < fs->fs_frag; siz++) {
if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
continue;
field = around[siz];
subfield = inside[siz];
for (pos = siz; pos <= fs->fs_frag; pos++) {
if ((fragmap & field) == subfield) {
fraglist[siz] += cnt;
pos += siz;
field <<= siz;
subfield <<= siz;
}
field <<= 1;
subfield <<= 1;
}
}
}
#if defined(_KERNEL) && defined(DIAGNOSTIC)
void
ffs_checkoverlap(struct buf *bp, struct inode *ip)
{
daddr_t start, last;
struct vnode *vp;
struct buf *ep;
start = bp->b_blkno;
last = start + btodb(bp->b_bcount) - 1;
LIST_FOREACH(ep, &bufhead, b_list) {
if (ep == bp || (ep->b_flags & B_INVAL) ||
ep->b_vp == NULLVP)
continue;
if (VOP_BMAP(ep->b_vp, 0, &vp, NULL, NULL))
continue;
if (vp != ip->i_devvp)
continue;
/* look for overlap */
if (ep->b_bcount == 0 || ep->b_blkno > last ||
ep->b_blkno + btodb(ep->b_bcount) <= start)
continue;
vprint("Disk overlap", vp);
(void)printf("\tstart %lld, end %lld overlap start %llu, "
"end %llu\n", (long long)start, (long long)last,
(long long)ep->b_blkno,
(long long)(ep->b_blkno + btodb(ep->b_bcount) - 1));
panic("Disk buffer overlap");
}
}
#endif /* DIAGNOSTIC */
/*
* block operations
*
* check if a block is available
*/
int
ffs_isblock(struct fs *fs, u_char *cp, daddr_t h)
{
u_char mask;
switch (fs->fs_frag) {
default:
case 8:
return (cp[h] == 0xff);
case 4:
mask = 0x0f << ((h & 0x1) << 2);
return ((cp[h >> 1] & mask) == mask);
case 2:
mask = 0x03 << ((h & 0x3) << 1);
return ((cp[h >> 2] & mask) == mask);
case 1:
mask = 0x01 << (h & 0x7);
return ((cp[h >> 3] & mask) == mask);
}
}
/*
* take a block out of the map
*/
void
ffs_clrblock(struct fs *fs, u_char *cp, daddr_t h)
{
switch (fs->fs_frag) {
default:
case 8:
cp[h] = 0;
return;
case 4:
cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
return;
case 2:
cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
return;
case 1:
cp[h >> 3] &= ~(0x01 << (h & 0x7));
return;
}
}
/*
* put a block into the map
*/
void
ffs_setblock(struct fs *fs, u_char *cp, daddr_t h)
{
switch (fs->fs_frag) {
default:
case 8:
cp[h] = 0xff;
return;
case 4:
cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
return;
case 2:
cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
return;
case 1:
cp[h >> 3] |= (0x01 << (h & 0x7));
return;
}
}
/*
* check if a block is free
*/
int
ffs_isfreeblock(struct fs *fs, u_char *cp, daddr_t h)
{
switch (fs->fs_frag) {
default:
case 8:
return (cp[h] == 0);
case 4:
return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
case 2:
return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
case 1:
return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
}
}
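#if 0
/*
 * Illustrative sketch, not part of the original source: with fs_frag == 4
 * the map holds one bit per fragment, so each byte covers two blocks of
 * four fragments and block h uses the nibble selected by (h & 0x1).
 * ffs_setblock(fs, cp, 3) on an all-zero map therefore turns cp[1] into
 * 0xf0, and ffs_isblock() then reports that block as fully available.
 * example_frag4_mask() is a hypothetical helper showing the mask used
 * for that case.
 */
static int
example_frag4_mask(daddr_t h)
{
	/* select the low or high nibble of the covering map byte */
	return (0x0f << ((h & 0x1) << 2));
}
#endif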
#ifdef _KERNEL
/*
* Initialize the vnode associated with a new inode, handle aliased
* vnodes.
*/
int
ffs_vinit(struct mount *mntp, struct vnode **vpp)
{
struct inode *ip;
struct vnode *vp, *nvp;
struct timeval mtv;
vp = *vpp;
ip = VTOI(vp);
switch(vp->v_type = IFTOVT(DIP(ip, mode))) {
case VCHR:
case VBLK:
vp->v_op = &ffs_specvops;
if ((nvp = checkalias(vp, DIP(ip, rdev), mntp)) != NULL) {
/*
* Discard unneeded vnode, but save its inode.
* Note that the lock is carried over in the inode
* to the replacement vnode.
*/
nvp->v_data = vp->v_data;
vp->v_data = NULL;
vp->v_op = &spec_vops;
#ifdef VFSLCKDEBUG
vp->v_flag &= ~VLOCKSWORK;
#endif
vrele(vp);
vgone(vp);
/*
* Reinitialize aliased inode.
*/
vp = nvp;
ip->i_vnode = vp;
}
break;
case VFIFO:
#ifdef FIFO
vp->v_op = &ffs_fifovops;
break;
#else
return (EOPNOTSUPP);
#endif
case VNON:
case VBAD:
case VSOCK:
case VLNK:
case VDIR:
case VREG:
break;
}
if (ip->i_number == ROOTINO)
vp->v_flag |= VROOT;
/*
* Initialize modrev times
*/
getmicrouptime(&mtv);
ip->i_modrev = (u_quad_t)mtv.tv_sec << 32;
ip->i_modrev |= (u_quad_t)mtv.tv_usec * 4294;
*vpp = vp;
return (0);
}
#endif /* _KERNEL */
/* $OpenBSD: tty_tty.c,v 1.32 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: tty_tty.c,v 1.13 1996/03/30 22:24:46 christos Exp $ */
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tty_tty.c 8.2 (Berkeley) 9/23/93
*/
/*
* Indirect driver for controlling tty.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/fcntl.h>
#define cttyvp(p) \
((p)->p_p->ps_flags & PS_CONTROLT ? \
(p)->p_p->ps_session->s_ttyvp : NULL)
int
cttyopen(dev_t dev, int flag, int mode, struct proc *p)
{
struct vnode *ttyvp = cttyvp(p);
int error;
if (ttyvp == NULL)
return (ENXIO);
vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(ttyvp, flag, NOCRED, p);
VOP_UNLOCK(ttyvp);
return (error);
}
int
cttyread(dev_t dev, struct uio *uio, int flag)
{
struct vnode *ttyvp = cttyvp(uio->uio_procp);
int error;
if (ttyvp == NULL)
return (EIO);
vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_READ(ttyvp, uio, flag, NOCRED);
VOP_UNLOCK(ttyvp);
return (error);
}
int
cttywrite(dev_t dev, struct uio *uio, int flag)
{
struct vnode *ttyvp = cttyvp(uio->uio_procp);
int error;
if (ttyvp == NULL)
return (EIO);
vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_WRITE(ttyvp, uio, flag, NOCRED);
VOP_UNLOCK(ttyvp);
return (error);
}
int
cttyioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
struct vnode *ttyvp = cttyvp(p);
struct session *sess;
int error, secs;
if (ttyvp == NULL)
return (EIO);
if (cmd == TIOCSCTTY) /* XXX */
return (EINVAL);
if (cmd == TIOCNOTTY) {
if (!SESS_LEADER(p->p_p)) {
atomic_clearbits_int(&p->p_p->ps_flags, PS_CONTROLT);
return (0);
} else
return (EINVAL);
}
switch (cmd) {
case TIOCSETVERAUTH:
if ((error = suser(p)))
return error;
secs = *(int *)addr;
if (secs < 1 || secs > 3600)
return EINVAL;
sess = p->p_p->ps_pgrp->pg_session;
sess->s_verauthuid = p->p_ucred->cr_ruid;
sess->s_verauthppid = p->p_p->ps_pptr->ps_pid;
timeout_add_sec(&sess->s_verauthto, secs);
return 0;
case TIOCCLRVERAUTH:
sess = p->p_p->ps_pgrp->pg_session;
timeout_del(&sess->s_verauthto);
zapverauth(sess);
return 0;
case TIOCCHKVERAUTH:
/*
* It's not clear when or what these checks are for.
* How can we reach this code with a different ruid?
* The ppid check is also more porous than desired.
* Nevertheless, the checks reflect the original intention;
* namely, that it be the same user using the same shell.
*/
sess = p->p_p->ps_pgrp->pg_session;
if (sess->s_verauthuid == p->p_ucred->cr_ruid &&
sess->s_verauthppid == p->p_p->ps_pptr->ps_pid)
return 0;
return EPERM;
}
return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, p));
}
int
cttykqfilter(dev_t dev, struct knote *kn)
{
struct vnode *ttyvp = cttyvp(curproc);
if (ttyvp == NULL) {
if (kn->kn_flags & (__EV_POLL | __EV_SELECT))
return (seltrue_kqfilter(dev, kn));
return (ENXIO);
}
return (VOP_KQFILTER(ttyvp, FREAD|FWRITE, kn));
}
/* $OpenBSD: strncmp.c,v 1.11 2014/06/10 04:16:57 deraadt Exp $ */
/*
* Copyright (c) 1989 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <lib/libkern/libkern.h>
int
strncmp(const char *s1, const char *s2, size_t n)
{
if (n == 0)
return (0);
do {
if (*s1 != *s2++)
return (*(unsigned char *)s1 - *(unsigned char *)--s2);
if (*s1++ == 0)
break;
} while (--n != 0);
return (0);
}
/* $OpenBSD: subr_poison.c,v 1.15 2022/08/14 01:58:28 jsg Exp $ */
/*
* Copyright (c) 2013 Ted Unangst <tedu@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
/*
* The POISON is used as known text to copy into free objects so
* that modifications after frees can be detected.
*/
#ifdef DEADBEEF0
#define POISON0 ((unsigned) DEADBEEF0)
#else
#define POISON0 ((unsigned) 0xdeadbeef)
#endif
#ifdef DEADBEEF1
#define POISON1 ((unsigned) DEADBEEF1)
#else
#define POISON1 ((unsigned) 0xdeafbead)
#endif
#define POISON_SIZE 64
uint32_t
poison_value(void *v)
{
ulong l = (u_long)v;
l = l >> PAGE_SHIFT;
switch (l & 3) {
case 0:
return POISON0;
case 1:
return POISON1;
case 2:
return (POISON0 & 0xffff0000) | (~POISON0 & 0x0000ffff);
case 3:
return (POISON1 & 0xffff0000) | (~POISON1 & 0x0000ffff);
}
return 0;
}
void
poison_mem(void *v, size_t len)
{
uint32_t *ip = v;
size_t i;
uint32_t poison;
poison = poison_value(v);
if (len > POISON_SIZE)
len = POISON_SIZE;
len = len / sizeof(*ip);
for (i = 0; i < len; i++)
ip[i] = poison;
}
int
poison_check(void *v, size_t len, size_t *pidx, uint32_t *pval)
{
uint32_t *ip = v;
size_t i;
uint32_t poison;
poison = poison_value(v);
if (len > POISON_SIZE)
len = POISON_SIZE;
len = len / sizeof(*ip);
for (i = 0; i < len; i++) {
if (ip[i] != poison) {
*pidx = i;
*pval = poison;
return 1;
}
}
return 0;
}
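#if 0
/*
 * Illustrative sketch, not part of the original source: a free-list
 * consumer typically poisons an object with poison_mem() when it is
 * released and verifies the pattern when the object is handed out again;
 * a non-zero return from poison_check() reports the first modified word
 * and the pattern that was expected there.  example_check_freed() is a
 * hypothetical helper.
 */
static void
example_check_freed(void *obj, size_t len)
{
	size_t idx;
	uint32_t expected;
	/* panic with the offending word if the poison pattern was disturbed */
	if (poison_check(obj, len, &idx, &expected))
		panic("use after free: word %zu != 0x%x", idx, expected);
}
#endif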
/* $OpenBSD: vfs_lockf.c,v 1.50 2022/08/14 01:58:28 jsg Exp $ */
/* $NetBSD: vfs_lockf.c,v 1.7 1996/02/04 02:18:21 christos Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Scooter Morris at Genentech Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/rwlock.h>
#include <sys/unistd.h>
/*
* The lockf structure is a kernel structure which contains the information
* associated with a byte range lock. The lockf structures are linked into
* the inode structure. Locks are sorted by the starting byte of the lock for
* efficiency.
*/
TAILQ_HEAD(locklist, lockf);
struct lockf {
short lf_flags; /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
short lf_type; /* Lock type: F_RDLCK, F_WRLCK */
off_t lf_start; /* The byte # of the start of the lock */
off_t lf_end; /* The byte # of the end of the lock (-1=EOF)*/
caddr_t lf_id; /* The id of the resource holding the lock */
struct lockf_state *lf_state; /* State associated with the lock */
TAILQ_ENTRY(lockf) lf_entry;
struct lockf *lf_blk; /* The lock that blocks us */
struct locklist lf_blkhd; /* The list of blocked locks */
TAILQ_ENTRY(lockf) lf_block; /* A request waiting for a lock */
uid_t lf_uid; /* User ID responsible */
pid_t lf_pid; /* POSIX - owner pid */
};
struct lockf_state {
TAILQ_HEAD(, lockf) ls_locks; /* list of active locks */
TAILQ_HEAD(, lockf) ls_pending; /* list of pending locks */
struct lockf_state **ls_owner; /* owner */
int ls_refs; /* reference counter */
};
struct pool lockf_state_pool;
struct pool lockf_pool;
#define SELF 0x1
#define OTHERS 0x2
#ifdef LOCKF_DEBUG
#define DEBUG_SETLOCK 0x01
#define DEBUG_CLEARLOCK 0x02
#define DEBUG_GETLOCK 0x04
#define DEBUG_FINDOVR 0x08
#define DEBUG_SPLIT 0x10
#define DEBUG_WAKELOCK 0x20
#define DEBUG_LINK 0x40
int lockf_debug = DEBUG_SETLOCK|DEBUG_CLEARLOCK|DEBUG_WAKELOCK;
void lf_print(const char *, struct lockf *);
void lf_printlist(const char *, struct lockf *);
#define DPRINTF(args, level) if (lockf_debug & (level)) printf args
#define LFPRINT(args, level) if (lockf_debug & (level)) lf_print args
#else
#define DPRINTF(args, level)
#define LFPRINT(args, level)
#endif
struct lockf *lf_alloc(uid_t, int);
void lf_free(struct lockf *);
int lf_clearlock(struct lockf *);
int lf_findoverlap(struct lockf *, struct lockf *, int, struct lockf **);
struct lockf *lf_getblock(struct lockf *, struct lockf *);
int lf_getlock(struct lockf *, struct flock *);
int lf_setlock(struct lockf *);
void lf_split(struct lockf *, struct lockf *);
void lf_wakelock(struct lockf *, int);
int lf_deadlock(struct lockf *);
void ls_ref(struct lockf_state *);
void ls_rele(struct lockf_state *);
/*
* Serializes access to each instance of struct lockf and struct lockf_state
* and each pointer from a vnode to struct lockf_state.
*/
struct rwlock lockf_lock = RWLOCK_INITIALIZER("lockflk");
void
lf_init(void)
{
pool_init(&lockf_state_pool, sizeof(struct lockf_state), 0, IPL_NONE,
PR_WAITOK | PR_RWLOCK, "lockfspl", NULL);
pool_init(&lockf_pool, sizeof(struct lockf), 0, IPL_NONE,
PR_WAITOK | PR_RWLOCK, "lockfpl", NULL);
}
void
ls_ref(struct lockf_state *ls)
{
rw_assert_wrlock(&lockf_lock);
ls->ls_refs++;
}
void
ls_rele(struct lockf_state *ls)
{
rw_assert_wrlock(&lockf_lock);
if (--ls->ls_refs > 0)
return;
KASSERT(TAILQ_EMPTY(&ls->ls_locks));
KASSERT(TAILQ_EMPTY(&ls->ls_pending));
*ls->ls_owner = NULL;
pool_put(&lockf_state_pool, ls);
}
/*
* We enforce a limit on locks by uid, so that a single user cannot
* run the kernel out of memory. For now, the limit is pretty coarse.
* There is no limit on root.
*
* Splitting a lock will always succeed, regardless of current allocations.
* If you're slightly above the limit, we still have to permit an allocation
* so that the unlock can succeed. If the unlocking causes too many splits,
* however, you're totally cutoff.
*/
int maxlocksperuid = 1024;
/*
* 3 options for allowfail.
* 0 - always allocate. 1 - cutoff at limit. 2 - cutoff at double limit.
*/
struct lockf *
lf_alloc(uid_t uid, int allowfail)
{
struct uidinfo *uip;
struct lockf *lock;
uip = uid_find(uid);
if (uid && allowfail && uip->ui_lockcnt >
(allowfail == 1 ? maxlocksperuid : (maxlocksperuid * 2))) {
uid_release(uip);
return (NULL);
}
uip->ui_lockcnt++;
uid_release(uip);
lock = pool_get(&lockf_pool, PR_WAITOK);
lock->lf_uid = uid;
return (lock);
}
void
lf_free(struct lockf *lock)
{
struct uidinfo *uip;
rw_assert_wrlock(&lockf_lock);
LFPRINT(("lf_free", lock), DEBUG_LINK);
KASSERT(TAILQ_EMPTY(&lock->lf_blkhd));
ls_rele(lock->lf_state);
uip = uid_find(lock->lf_uid);
uip->ui_lockcnt--;
uid_release(uip);
pool_put(&lockf_pool, lock);
}
/*
* Do an advisory lock operation.
*/
int
lf_advlock(struct lockf_state **state, off_t size, caddr_t id, int op,
struct flock *fl, int flags)
{
struct proc *p = curproc;
struct lockf_state *ls;
struct lockf *lock;
off_t start, end;
int error = 0;
/*
* Convert the flock structure into a start and end.
*/
switch (fl->l_whence) {
case SEEK_SET:
case SEEK_CUR:
/*
* Caller is responsible for adding any necessary offset
* when SEEK_CUR is used.
*/
start = fl->l_start;
break;
case SEEK_END:
start = size + fl->l_start;
break;
default:
return (EINVAL);
}
if (start < 0)
return (EINVAL);
if (fl->l_len > 0) {
if (fl->l_len - 1 > LLONG_MAX - start)
return (EOVERFLOW);
end = start + (fl->l_len - 1);
/* Avoid ambiguity at the end of the range. */
if (end == LLONG_MAX)
end = -1;
} else if (fl->l_len < 0) {
if (start + fl->l_len < 0)
return (EINVAL);
end = start - 1;
start += fl->l_len;
} else {
end = -1;
}
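/*
 * Worked example of the conversion above (illustrative only): with
 * l_whence = SEEK_SET, l_start = 100 and l_len = 50 the lock covers
 * [100, 149]; l_len = 0 gives end = -1 (lock to EOF); a negative length
 * such as l_len = -50 locks the 50 bytes before l_start, i.e. [50, 99].
 */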
rw_enter_write(&lockf_lock);
ls = *state;
/*
* Avoid the common case of unlocking when inode has no locks.
*/
if (ls == NULL && op != F_SETLK) {
fl->l_type = F_UNLCK;
goto out;
}
if (ls == NULL) {
ls = pool_get(&lockf_state_pool, PR_WAITOK | PR_ZERO);
ls->ls_owner = state;
TAILQ_INIT(&ls->ls_locks);
TAILQ_INIT(&ls->ls_pending);
*state = ls;
}
ls_ref(ls);
lock = lf_alloc(p->p_ucred->cr_uid, op == F_SETLK ? 1 : 2);
if (!lock) {
ls_rele(ls);
error = ENOLCK;
goto out;
}
lock->lf_flags = flags;
lock->lf_type = fl->l_type;
lock->lf_start = start;
lock->lf_end = end;
lock->lf_id = id;
lock->lf_state = ls;
lock->lf_blk = NULL;
lock->lf_pid = (flags & F_POSIX) ? p->p_p->ps_pid : -1;
TAILQ_INIT(&lock->lf_blkhd);
switch (op) {
case F_SETLK:
error = lf_setlock(lock);
break;
case F_UNLCK:
error = lf_clearlock(lock);
lf_free(lock);
break;
case F_GETLK:
error = lf_getlock(lock, fl);
lf_free(lock);
break;
default:
lf_free(lock);
error = EINVAL;
break;
}
out:
rw_exit_write(&lockf_lock);
return (error);
}
/*
* Set a byte-range lock.
*/
int
lf_setlock(struct lockf *lock)
{
struct lockf *block;
struct lockf *overlap, *ltmp;
int ovcase, priority, needtolink, error;
rw_assert_wrlock(&lockf_lock);
LFPRINT(("lf_setlock", lock), DEBUG_SETLOCK);
priority = PLOCK;
if (lock->lf_type == F_WRLCK)
priority += 4;
priority |= PCATCH;
/*
* Scan lock list for this file looking for locks that would block us.
*/
for (;;) {
block = lf_getblock(TAILQ_FIRST(&lock->lf_state->ls_locks),
lock);
if (block == NULL)
break;
if ((lock->lf_flags & F_WAIT) == 0) {
lf_free(lock);
return (EAGAIN);
}
/*
* Lock is blocked, check for deadlock before proceeding.
* Note: flock style locks cover the whole file, there is no
* chance for deadlock.
*/
if ((lock->lf_flags & F_POSIX) && lf_deadlock(lock)) {
lf_free(lock);
return (EDEADLK);
}
/*
* For flock type locks, we must first remove
* any shared locks that we hold before we sleep
* waiting for an exclusive lock.
*/
if ((lock->lf_flags & F_FLOCK) && lock->lf_type == F_WRLCK) {
lock->lf_type = F_UNLCK;
(void)lf_clearlock(lock);
lock->lf_type = F_WRLCK;
}
/*
* Add our lock to the blocked list and sleep until we're free.
* Remember who blocked us (for deadlock detection).
*/
lock->lf_blk = block;
LFPRINT(("lf_setlock", lock), DEBUG_SETLOCK);
LFPRINT(("lf_setlock: blocking on", block), DEBUG_SETLOCK);
TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
TAILQ_INSERT_TAIL(&lock->lf_state->ls_pending, lock, lf_entry);
error = rwsleep_nsec(lock, &lockf_lock, priority, "lockf",
INFSLP);
TAILQ_REMOVE(&lock->lf_state->ls_pending, lock, lf_entry);
wakeup_one(lock->lf_state);
if (lock->lf_blk != NULL) {
TAILQ_REMOVE(&lock->lf_blk->lf_blkhd, lock, lf_block);
lock->lf_blk = NULL;
}
if (error) {
lf_free(lock);
return (error);
}
if (lock->lf_flags & F_INTR) {
lf_free(lock);
return (EINTR);
}
}
/*
* No blocks!! Add the lock. Note that we will
* downgrade or upgrade any overlapping locks this
* process already owns.
*
* Skip over locks owned by other processes.
* Handle any locks that overlap and are owned by ourselves.
*/
block = TAILQ_FIRST(&lock->lf_state->ls_locks);
overlap = NULL;
needtolink = 1;
for (;;) {
ovcase = lf_findoverlap(block, lock, SELF, &overlap);
if (ovcase)
block = TAILQ_NEXT(overlap, lf_entry);
/*
* Six cases:
* 0) no overlap
* 1) overlap == lock
* 2) overlap contains lock
* 3) lock contains overlap
* 4) overlap starts before lock
* 5) overlap ends after lock
*/
switch (ovcase) {
case 0: /* no overlap */
if (needtolink) {
if (overlap) /* insert before overlap */
TAILQ_INSERT_BEFORE(overlap, lock,
lf_entry);
else /* first or last lock in list */
TAILQ_INSERT_TAIL(&lock->lf_state->ls_locks,
lock, lf_entry);
}
break;
case 1: /* overlap == lock */
/*
* If downgrading lock, others may be
* able to acquire it.
*/
if (lock->lf_type == F_RDLCK &&
overlap->lf_type == F_WRLCK)
lf_wakelock(overlap, 0);
overlap->lf_type = lock->lf_type;
lf_free(lock);
lock = overlap; /* for debug output below */
break;
case 2: /* overlap contains lock */
/*
* Check for common starting point and different types.
*/
if (overlap->lf_type == lock->lf_type) {
if (!needtolink)
TAILQ_REMOVE(&lock->lf_state->ls_locks,
lock, lf_entry);
lf_free(lock);
lock = overlap; /* for debug output below */
break;
}
if (overlap->lf_start == lock->lf_start) {
if (!needtolink)
TAILQ_REMOVE(&lock->lf_state->ls_locks,
lock, lf_entry);
TAILQ_INSERT_BEFORE(overlap, lock, lf_entry);
overlap->lf_start = lock->lf_end + 1;
} else
lf_split(overlap, lock);
lf_wakelock(overlap, 0);
break;
case 3: /* lock contains overlap */
/*
* If downgrading lock, others may be able to
* acquire it, otherwise take the list.
*/
if (lock->lf_type == F_RDLCK &&
overlap->lf_type == F_WRLCK) {
lf_wakelock(overlap, 0);
} else {
while ((ltmp =
TAILQ_FIRST(&overlap->lf_blkhd))) {
TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
lf_block);
ltmp->lf_blk = lock;
TAILQ_INSERT_TAIL(&lock->lf_blkhd,
ltmp, lf_block);
}
}
/*
* Add the new lock if necessary and delete the overlap.
*/
if (needtolink) {
TAILQ_INSERT_BEFORE(overlap, lock, lf_entry);
needtolink = 0;
}
TAILQ_REMOVE(&lock->lf_state->ls_locks, overlap, lf_entry);
lf_free(overlap);
continue;
case 4: /* overlap starts before lock */
/*
* Add lock after overlap on the list.
*/
if (!needtolink)
TAILQ_REMOVE(&lock->lf_state->ls_locks, lock,
lf_entry);
TAILQ_INSERT_AFTER(&lock->lf_state->ls_locks, overlap,
lock, lf_entry);
overlap->lf_end = lock->lf_start - 1;
lf_wakelock(overlap, 0);
needtolink = 0;
continue;
case 5: /* overlap ends after lock */
/*
* Add the new lock before overlap.
*/
if (needtolink)
TAILQ_INSERT_BEFORE(overlap, lock, lf_entry);
overlap->lf_start = lock->lf_end + 1;
lf_wakelock(overlap, 0);
break;
}
break;
}
LFPRINT(("lf_setlock: got the lock", lock), DEBUG_SETLOCK);
return (0);
}
/*
* Remove a byte-range lock on an inode.
*
* Generally, find the lock (or an overlap to that lock)
* and remove it (or shrink it), then wakeup anyone we can.
*/
int
lf_clearlock(struct lockf *lock)
{
struct lockf *lf, *overlap;
int ovcase;
rw_assert_wrlock(&lockf_lock);
lf = TAILQ_FIRST(&lock->lf_state->ls_locks);
if (lf == NULL)
return (0);
LFPRINT(("lf_clearlock", lock), DEBUG_CLEARLOCK);
while ((ovcase = lf_findoverlap(lf, lock, SELF, &overlap))) {
lf_wakelock(overlap, 0);
switch (ovcase) {
case 1: /* overlap == lock */
TAILQ_REMOVE(&lock->lf_state->ls_locks, overlap,
lf_entry);
lf_free(overlap);
break;
case 2: /* overlap contains lock: split it */
if (overlap->lf_start == lock->lf_start) {
overlap->lf_start = lock->lf_end + 1;
break;
}
lf_split(overlap, lock);
/*
* The lock is now part of the list, lf_clearlock() must
* ensure that the lock remains detached from the list.
*/
TAILQ_REMOVE(&lock->lf_state->ls_locks, lock, lf_entry);
break;
case 3: /* lock contains overlap */
lf = TAILQ_NEXT(overlap, lf_entry);
TAILQ_REMOVE(&lock->lf_state->ls_locks, overlap,
lf_entry);
lf_free(overlap);
continue;
case 4: /* overlap starts before lock */
overlap->lf_end = lock->lf_start - 1;
lf = TAILQ_NEXT(overlap, lf_entry);
continue;
case 5: /* overlap ends after lock */
overlap->lf_start = lock->lf_end + 1;
break;
}
break;
}
return (0);
}
/*
* Check whether there is a blocking lock,
* and if so return its process identifier.
*/
int
lf_getlock(struct lockf *lock, struct flock *fl)
{
struct lockf *block, *lf;
rw_assert_wrlock(&lockf_lock);
LFPRINT(("lf_getlock", lock), DEBUG_CLEARLOCK);
lf = TAILQ_FIRST(&lock->lf_state->ls_locks);
if ((block = lf_getblock(lf, lock)) != NULL) {
fl->l_type = block->lf_type;
fl->l_whence = SEEK_SET;
fl->l_start = block->lf_start;
if (block->lf_end == -1)
fl->l_len = 0;
else
fl->l_len = block->lf_end - block->lf_start + 1;
fl->l_pid = block->lf_pid;
} else {
fl->l_type = F_UNLCK;
}
return (0);
}
/*
* Walk the list of locks for an inode and
* return the first blocking lock.
*/
struct lockf *
lf_getblock(struct lockf *lf, struct lockf *lock)
{
struct lockf *overlap;
rw_assert_wrlock(&lockf_lock);
while (lf_findoverlap(lf, lock, OTHERS, &overlap) != 0) {
/*
* We've found an overlap, see if it blocks us
*/
if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
return (overlap);
/*
* Nope, point to the next one on the list and
* see if it blocks us
*/
lf = TAILQ_NEXT(overlap, lf_entry);
}
return (NULL);
}
/*
* Walk the list of locks for an inode to
* find an overlapping lock (if any).
*
* NOTE: this returns only the FIRST overlapping lock. There
* may be more than one.
*/
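/*
 * Sketch of the six cases (illustration only, not from the original source),
 * with "lock" being the incoming request and "overlap" an existing lock:
 *
 *	case 0:  lock:     |---|              no overlap
 *	         overlap:         |---|
 *	case 1:  lock:     |-------|          overlap == lock
 *	         overlap:  |-------|
 *	case 2:  lock:       |---|            overlap contains lock
 *	         overlap:  |-------|
 *	case 3:  lock:     |-------|          lock contains overlap
 *	         overlap:    |---|
 *	case 4:  lock:       |-----|          overlap starts before lock
 *	         overlap:  |-----|
 *	case 5:  lock:     |-----|            overlap ends after lock
 *	         overlap:    |-----|
 */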
int
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
struct lockf **overlap)
{
off_t start, end;
rw_assert_wrlock(&lockf_lock);
LFPRINT(("lf_findoverlap: looking for overlap in", lock), DEBUG_FINDOVR);
*overlap = lf;
start = lock->lf_start;
end = lock->lf_end;
while (lf != NULL) {
if (((type & SELF) && lf->lf_id != lock->lf_id) ||
((type & OTHERS) && lf->lf_id == lock->lf_id)) {
*overlap = lf = TAILQ_NEXT(lf, lf_entry);
continue;
}
LFPRINT(("\tchecking", lf), DEBUG_FINDOVR);
/*
* OK, check for overlap
*
* Six cases:
* 0) no overlap
* 1) overlap == lock
* 2) overlap contains lock
* 3) lock contains overlap
* 4) overlap starts before lock
* 5) overlap ends after lock
*/
/* Case 0 */
if ((lf->lf_end != -1 && start > lf->lf_end) ||
(end != -1 && lf->lf_start > end)) {
DPRINTF(("no overlap\n"), DEBUG_FINDOVR);
if ((type & SELF) && end != -1 && lf->lf_start > end)
return (0);
*overlap = lf = TAILQ_NEXT(lf, lf_entry);
continue;
}
/* Case 1 */
if ((lf->lf_start == start) && (lf->lf_end == end)) {
DPRINTF(("overlap == lock\n"), DEBUG_FINDOVR);
return (1);
}
/* Case 2 */
if ((lf->lf_start <= start) &&
(lf->lf_end == -1 || (end != -1 && lf->lf_end >= end))) {
DPRINTF(("overlap contains lock\n"), DEBUG_FINDOVR);
return (2);
}
/* Case 3 */
if (start <= lf->lf_start &&
(end == -1 || (lf->lf_end != -1 && end >= lf->lf_end))) {
DPRINTF(("lock contains overlap\n"), DEBUG_FINDOVR);
return (3);
}
/* Case 4 */
if ((lf->lf_start < start) &&
((lf->lf_end >= start) || (lf->lf_end == -1))) {
DPRINTF(("overlap starts before lock\n"),
DEBUG_FINDOVR);
return (4);
}
/* Case 5 */
if ((lf->lf_start > start) && (end != -1) &&
((lf->lf_end > end) || (lf->lf_end == -1))) {
DPRINTF(("overlap ends after lock\n"), DEBUG_FINDOVR);
return (5);
}
panic("lf_findoverlap: default");
}
return (0);
}
/*
* Purge all locks associated with the given lock state.
*/
void
lf_purgelocks(struct lockf_state **state)
{
struct lockf_state *ls;
struct lockf *lock;
rw_enter_write(&lockf_lock);
ls = *state;
if (ls == NULL)
goto out;
ls_ref(ls);
/* Interrupt blocked locks and wait for all of them to finish. */
TAILQ_FOREACH(lock, &ls->ls_locks, lf_entry) {
LFPRINT(("lf_purgelocks: wakeup", lock), DEBUG_SETLOCK);
lf_wakelock(lock, F_INTR);
}
while (!TAILQ_EMPTY(&ls->ls_pending))
rwsleep_nsec(ls, &lockf_lock, PLOCK, "lockfp", INFSLP);
/*
* Any remaining locks cannot block other locks at this point and can
* safely be removed.
*/
while ((lock = TAILQ_FIRST(&ls->ls_locks))) {
TAILQ_REMOVE(&ls->ls_locks, lock, lf_entry);
lf_free(lock);
}
/* This is the last expected thread to hold a lock state reference. */
KASSERT(ls->ls_refs == 1);
ls_rele(ls);
out:
rw_exit_write(&lockf_lock);
}
/*
* Split a lock and a contained region into
* two or three locks as necessary.
*/
void
lf_split(struct lockf *lock1, struct lockf *lock2)
{
struct lockf *splitlock;
rw_assert_wrlock(&lockf_lock);
LFPRINT(("lf_split", lock1), DEBUG_SPLIT);
LFPRINT(("splitting from", lock2), DEBUG_SPLIT);
/*
* Check to see if splitting into only two pieces.
*/
if (lock1->lf_start == lock2->lf_start) {
lock1->lf_start = lock2->lf_end + 1;
TAILQ_INSERT_BEFORE(lock1, lock2, lf_entry);
return;
}
if (lock1->lf_end == lock2->lf_end) {
lock1->lf_end = lock2->lf_start - 1;
TAILQ_INSERT_AFTER(&lock1->lf_state->ls_locks, lock1, lock2,
lf_entry);
return;
}
/*
* Make a new lock consisting of the last part of
* the encompassing lock
*/
splitlock = lf_alloc(lock1->lf_uid, 0);
splitlock->lf_flags = lock1->lf_flags;
splitlock->lf_type = lock1->lf_type;
splitlock->lf_start = lock2->lf_end + 1;
splitlock->lf_end = lock1->lf_end;
splitlock->lf_id = lock1->lf_id;
splitlock->lf_state = lock1->lf_state;
splitlock->lf_blk = NULL;
splitlock->lf_pid = lock1->lf_pid;
TAILQ_INIT(&splitlock->lf_blkhd);
ls_ref(splitlock->lf_state);
lock1->lf_end = lock2->lf_start - 1;
TAILQ_INSERT_AFTER(&lock1->lf_state->ls_locks, lock1, lock2, lf_entry);
TAILQ_INSERT_AFTER(&lock1->lf_state->ls_locks, lock2, splitlock,
lf_entry);
}
/*
* Wakeup a blocklist
*/
void
lf_wakelock(struct lockf *lock, int flags)
{
struct lockf *wakelock;
rw_assert_wrlock(&lockf_lock);
while ((wakelock = TAILQ_FIRST(&lock->lf_blkhd))) {
TAILQ_REMOVE(&lock->lf_blkhd, wakelock, lf_block);
wakelock->lf_blk = NULL;
wakelock->lf_flags |= flags;
wakeup_one(wakelock);
}
}
/*
* Returns non-zero if the given lock would cause a deadlock.
*/
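/*
 * Classic scenario detected here (illustrative only): process A holds a
 * POSIX lock on [0,99] and is pending on [100,199], while process B holds
 * [100,199] and now requests [0,99] with F_WAIT.  B's request is blocked by
 * A's lock, and A's pending request is blocked by a lock owned by B, so
 * granting B's request would deadlock; EDEADLK is returned instead.
 */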
int
lf_deadlock(struct lockf *lock)
{
struct lockf *block, *lf, *pending;
lf = TAILQ_FIRST(&lock->lf_state->ls_locks);
for (; (block = lf_getblock(lf, lock)) != NULL;
lf = TAILQ_NEXT(block, lf_entry)) {
if ((block->lf_flags & F_POSIX) == 0)
continue;
TAILQ_FOREACH(pending, &lock->lf_state->ls_pending, lf_entry) {
if (pending->lf_blk == NULL)
continue; /* lock already unblocked */
if (pending->lf_pid == block->lf_pid &&
pending->lf_blk->lf_pid == lock->lf_pid)
return (1);
}
}
return (0);
}
#ifdef LOCKF_DEBUG
/*
* Print out a lock.
*/
void
lf_print(const char *tag, struct lockf *lock)
{
struct lockf *block;
if (tag)
printf("%s: ", tag);
printf("lock %p", lock);
if (lock == NULL) {
printf("\n");
return;
}
printf(", %s %p %s, start %lld, end %lld",
lock->lf_flags & F_POSIX ? "posix" : "flock",
lock->lf_id,
lock->lf_type == F_RDLCK ? "shared" :
lock->lf_type == F_WRLCK ? "exclusive" :
lock->lf_type == F_UNLCK ? "unlock" :
"unknown", lock->lf_start, lock->lf_end);
printf(", next %p, state %p",
TAILQ_NEXT(lock, lf_entry), lock->lf_state);
block = TAILQ_FIRST(&lock->lf_blkhd);
if (block)
printf(", block");
TAILQ_FOREACH(block, &lock->lf_blkhd, lf_block)
printf(" %p,", block);
printf("\n");
}
void
lf_printlist(const char *tag, struct lockf *lock)
{
struct lockf *lf;
printf("%s: Lock list:\n", tag);
TAILQ_FOREACH(lf, &lock->lf_state->ls_locks, lf_entry) {
if (lock == lf)
printf(" * ");
else
printf(" ");
lf_print(NULL, lf);
}
}
#endif /* LOCKF_DEBUG */
/* $OpenBSD: uvm_pmemrange.c,v 1.62 2022/06/02 18:00:53 kettenis Exp $ */
/*
* Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/mount.h>
/*
* 2 trees: addr tree and size tree.
*
* The allocator keeps chunks of free pages (called a range).
* Two pages are part of the same range if:
* - all pages in between are part of that range,
* - they are of the same memory type (zeroed or non-zeroed),
* - they are part of the same pmemrange.
* A pmemrange is a range of memory which is part of the same vm_physseg
* and has a use-count.
*
* addr tree is vm_page[0].objt
* size tree is vm_page[1].objt
*
* The size tree is not used for memory ranges of 1 page, instead,
* single queue is vm_page[0].pageq
*
* vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
* are joined, unless:
* - they have pages in between them which are not free
* - they belong to different memtypes (zeroed vs dirty memory)
* - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
* - they are not a continuation of the same array
* The latter issue is caused by vm_physseg ordering and splitting from the
* MD initialization machinery. The MD code is dependent on freelists and
* happens to split ISA memory from non-ISA memory.
* (Note: freelists die die die!)
*
* uvm_page_init guarantees that every vm_physseg contains an array of
* struct vm_page. Also, uvm_page_physload allocates an array of struct
* vm_page. This code depends on that array. The array may break across
* vm_physsegs boundaries.
*/
/*
* Validate the flags of the page. (Used in asserts.)
* Any free page must have the PQ_FREE flag set.
* Free pages may be zeroed.
* Pmap flags are left untouched.
*
* The PQ_FREE flag is not checked here: by not checking, we can easily use
* this check in pages which are freed.
*/
#define VALID_FLAGS(pg_flags) \
(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
/* Tree comparators. */
int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
const struct uvm_pmemrange *);
int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
int uvm_pmr_pg_to_memtype(struct vm_page *);
#ifdef DDB
void uvm_pmr_print(void);
#endif
/*
* Memory types. The page flags are used to derive what the current memory
* type of a page is.
*/
int
uvm_pmr_pg_to_memtype(struct vm_page *pg)
{
if (pg->pg_flags & PG_ZERO)
return UVM_PMR_MEMTYPE_ZERO;
/* Default: dirty memory. */
return UVM_PMR_MEMTYPE_DIRTY;
}
/* Trees. */
RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
uvm_pmemrange_addr_cmp);
/* Validation. */
#ifdef DEBUG
void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
#else
#define uvm_pmr_assertvalid(pmr) do {} while (0)
#endif
psize_t uvm_pmr_get1page(psize_t, int, struct pglist *,
paddr_t, paddr_t, int);
struct uvm_pmemrange *uvm_pmr_allocpmr(void);
struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *,
struct vm_page *, int);
void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
struct vm_page *pg, struct vm_page **pg_prev,
struct vm_page **pg_next);
struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *,
struct vm_page *, paddr_t);
struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *,
struct vm_page *, paddr_t);
psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t,
struct vm_page **, int);
psize_t uvm_pmr_remove_1strange_reverse(struct pglist *,
paddr_t *);
void uvm_pmr_split(paddr_t);
struct uvm_pmemrange *uvm_pmemrange_find(paddr_t);
struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
struct uvm_pmemrange *);
psize_t pow2divide(psize_t, psize_t);
struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *,
struct vm_page *, paddr_t, paddr_t, int);
/*
* Computes num/denom and rounds it up to the next power-of-2.
*
* This is a division function which calculates an approximation of
* num/denom, with result =~ num/denom. It is meant to be fast and doesn't
* have to be accurate.
*
* Providing too large a value makes the allocator slightly faster, at the
* risk of hitting the failure case more often. Providing too small a value
* makes the allocator a bit slower, but less likely to hit a failure case.
*/
psize_t
pow2divide(psize_t num, psize_t denom)
{
int rshift;
for (rshift = 0; num > denom; rshift++, denom <<= 1)
;
return (paddr_t)1 << rshift;
}
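/*
 * Example (illustrative only): pow2divide(10, 3) doubles denom 3 -> 6 -> 12
 * while rshift advances 0 -> 1 -> 2, so the result is 1 << 2 = 4, the
 * smallest power of two >= 10/3.  pow2divide(8, 8) returns 1.
 */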
/*
* Predicate: lhs is a subrange of rhs.
*
* If rhs_low == 0: don't care about lower bound.
* If rhs_high == 0: don't care about upper bound.
*/
#define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \
(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \
((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
/*
* Predicate: lhs intersects with rhs.
*
* If rhs_low == 0: don't care about lower bound.
* If rhs_high == 0: don't care about upper bound.
* Ranges don't intersect if they don't have any page in common, array
* semantics mean that < instead of <= should be used here.
*/
#define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \
(((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \
((rhs_high) == 0 || (lhs_low) < (rhs_high)))
/*
* Align to power-of-2 alignment.
*/
#define PMR_ALIGN(pgno, align) \
(((pgno) + ((align) - 1)) & ~((align) - 1))
#define PMR_ALIGN_DOWN(pgno, align) \
((pgno) & ~((align) - 1))
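/*
 * Example (illustrative only): with a power-of-two align of 4 pages,
 * PMR_ALIGN(5, 4) == 8 and PMR_ALIGN_DOWN(5, 4) == 4; a page number that is
 * already aligned is returned unchanged by both macros.
 */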
/*
* Comparator: sort by address ascending.
*/
int
uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
const struct uvm_pmemrange *rhs)
{
return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
}
/*
* Comparator: sort by use ascending.
*
* The higher the use value of a range, the more devices need memory in
* this range. Therefore allocate from the range with the lowest use first.
*/
int
uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
{
int result;
result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
if (result == 0)
result = uvm_pmemrange_addr_cmp(lhs, rhs);
return result;
}
int
uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
{
paddr_t lhs_addr, rhs_addr;
lhs_addr = VM_PAGE_TO_PHYS(lhs);
rhs_addr = VM_PAGE_TO_PHYS(rhs);
return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
}
int
uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
{
psize_t lhs_size, rhs_size;
int cmp;
/* Using second tree, so we receive pg[1] instead of pg[0]. */
lhs_size = (lhs - 1)->fpgsz;
rhs_size = (rhs - 1)->fpgsz;
cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
if (cmp == 0)
cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
return cmp;
}
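/*
 * Note (not from the original source): the size tree links through
 * vm_page[1].objt, so the comparators above receive &pg[1] and step back one
 * element to reach the head of the free range, where fpgsz is stored.  This
 * is also why single-page ranges cannot live in the size tree and go on the
 * single[] queues instead.
 */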
/*
* Find the first range of free pages that is at least sz pages long.
*/
struct vm_page *
uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
{
struct vm_page *node, *best;
KASSERT(sz >= 1);
if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
return TAILQ_FIRST(&pmr->single[mti]);
node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
best = NULL;
while (node != NULL) {
if ((node - 1)->fpgsz >= sz) {
best = (node - 1);
node = RBT_LEFT(uvm_objtree, node);
} else
node = RBT_RIGHT(uvm_objtree, node);
}
return best;
}
/*
* Finds the next range. The next range has a size >= pg->fpgsz.
* Returns NULL if no more ranges are available.
*/
struct vm_page *
uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
{
struct vm_page *npg;
KASSERT(pmr != NULL && pg != NULL);
if (pg->fpgsz == 1) {
if (TAILQ_NEXT(pg, pageq) != NULL)
return TAILQ_NEXT(pg, pageq);
else
npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
} else
npg = RBT_NEXT(uvm_pmr_size, pg + 1);
return npg == NULL ? NULL : npg - 1;
}
/*
* Finds the previous and next ranges relative to the (uninserted) pg range.
*
* *pg_prev == NULL if no previous range that can join with pg is available.
* *pg_next == NULL if no next range that can join with pg is available.
*/
void
uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
struct vm_page **pg_prev, struct vm_page **pg_next)
{
KASSERT(pg_prev != NULL && pg_next != NULL);
*pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
if (*pg_next == NULL)
*pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
else
*pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
KDASSERT(*pg_next == NULL ||
VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
KDASSERT(*pg_prev == NULL ||
VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
/* Reset if not contig. */
if (*pg_prev != NULL &&
(atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
!= atop(VM_PAGE_TO_PHYS(pg)) ||
*pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
*pg_prev = NULL;
if (*pg_next != NULL &&
(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
!= atop(VM_PAGE_TO_PHYS(*pg_next)) ||
pg + pg->fpgsz != *pg_next || /* Array broke. */
uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
*pg_next = NULL;
return;
}
/*
* Remove a range from the address tree.
* Address tree maintains pmr counters.
*/
void
uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
KDASSERT(pg->pg_flags & PQ_FREE);
RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
pmr->nsegs--;
}
/*
* Remove a range from the size tree.
*/
void
uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
int memtype;
#ifdef DEBUG
struct vm_page *i;
#endif
KDASSERT(pg->fpgsz >= 1);
KDASSERT(pg->pg_flags & PQ_FREE);
memtype = uvm_pmr_pg_to_memtype(pg);
if (pg->fpgsz == 1) {
#ifdef DEBUG
TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
if (i == pg)
break;
}
KDASSERT(i == pg);
#endif
TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
} else {
KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
pg + 1) == pg + 1);
RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
}
}
/* Remove from both trees. */
void
uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
uvm_pmr_assertvalid(pmr);
uvm_pmr_remove_size(pmr, pg);
uvm_pmr_remove_addr(pmr, pg);
uvm_pmr_assertvalid(pmr);
}
/*
* Insert the range described in pg.
* Returns the range thus created (which may be joined with the previous and
* next ranges).
* If no_join, the caller guarantees that the range cannot possibly join
* with adjacent ranges.
*/
struct vm_page *
uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
{
struct vm_page *prev, *next;
#ifdef DEBUG
struct vm_page *i;
int mt;
#endif
KDASSERT(pg->pg_flags & PQ_FREE);
KDASSERT(pg->fpgsz >= 1);
#ifdef DEBUG
for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
TAILQ_FOREACH(i, &pmr->single[mt], pageq)
KDASSERT(i != pg);
if (pg->fpgsz > 1) {
KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
pg + 1) == NULL);
}
KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
}
#endif
if (!no_join) {
uvm_pmr_pnaddr(pmr, pg, &prev, &next);
if (next != NULL) {
uvm_pmr_remove_size(pmr, next);
uvm_pmr_remove_addr(pmr, next);
pg->fpgsz += next->fpgsz;
next->fpgsz = 0;
}
if (prev != NULL) {
uvm_pmr_remove_size(pmr, prev);
prev->fpgsz += pg->fpgsz;
pg->fpgsz = 0;
return prev;
}
}
RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
pmr->nsegs++;
return pg;
}
/*
* Insert the range described in pg.
* Returns the range thus created (which may be joined with the previous and
* next ranges).
* Page must already be in the address tree.
*/
void
uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
{
int memtype;
#ifdef DEBUG
struct vm_page *i;
int mti;
#endif
KDASSERT(pg->fpgsz >= 1);
KDASSERT(pg->pg_flags & PQ_FREE);
memtype = uvm_pmr_pg_to_memtype(pg);
#ifdef DEBUG
for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
TAILQ_FOREACH(i, &pmr->single[mti], pageq)
KDASSERT(i != pg);
if (pg->fpgsz > 1) {
KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
pg + 1) == NULL);
}
KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
}
for (i = pg; i < pg + pg->fpgsz; i++)
KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
#endif
if (pg->fpgsz == 1)
TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
else
RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
}
/* Insert in both trees. */
struct vm_page *
uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
{
uvm_pmr_assertvalid(pmr);
pg = uvm_pmr_insert_addr(pmr, pg, no_join);
uvm_pmr_insert_size(pmr, pg);
uvm_pmr_assertvalid(pmr);
return pg;
}
/*
* Find the last page that is part of this segment.
* => pg: the range at which to start the search.
* => boundary: the page number boundary specification (0 = no boundary).
* => pmr: the pmemrange of the page.
*
* This function returns the page just before the next range, so if you want the
* next range, you need to run TAILQ_NEXT(result, pageq) after calling.
* The reason is that this way, the length of the segment is easily
* calculated using: atop(result) - atop(pg) + 1.
* Hence this function also never returns NULL.
*/
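/*
 * Example (illustrative only): if pg sits at page number 100 and the free
 * segment runs contiguously through page 104, the returned vm_page is the
 * one at page 104 and the segment length is 104 - 100 + 1 = 5 pages.
 */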
struct vm_page *
uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
struct vm_page *pg, paddr_t boundary)
{
paddr_t first_boundary;
struct vm_page *next;
struct vm_page *prev;
KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
if (boundary != 0) {
first_boundary =
PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
} else
first_boundary = 0;
/*
* Increase next until it hits the first page of the next segment.
*
* While loop checks the following:
* - next != NULL we have not reached the end of pgl
* - boundary == 0 || next < first_boundary
* we do not cross a boundary
* - atop(prev) + 1 == atop(next)
* still in the same segment
* - low <= last
* - high > last still in the same memory range
* - memtype is equal allocator is unable to view different memtypes
* as part of the same segment
* - prev + 1 == next no array breakage occurs
*/
prev = pg;
next = TAILQ_NEXT(prev, pageq);
while (next != NULL &&
(boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
prev + 1 == next) {
prev = next;
next = TAILQ_NEXT(prev, pageq);
}
/*
* End of this segment.
*/
return prev;
}
/*
* Find the first page that is part of this segment.
* => pg: the range at which to start the search.
* => boundary: the page number boundary specification (0 = no boundary).
* => pmr: the pmemrange of the page.
*
* This function returns the page just after the previous range, so if you want the
* previous range, you need to run TAILQ_NEXT(result, pageq) after calling.
* The reason is that this way, the length of the segment is easily
* calculated using: atop(pg) - atop(result) + 1.
* Hence this function also never returns NULL.
*/
struct vm_page *
uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr,
struct vm_page *pg, paddr_t boundary)
{
paddr_t first_boundary;
struct vm_page *next;
struct vm_page *prev;
KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
if (boundary != 0) {
first_boundary =
PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary);
} else
first_boundary = 0;
/*
* Increase next until it hits the first page of the previous segment.
*
* While loop checks the following:
* - next != NULL we have not reached the end of pgl
* - boundary == 0 || next >= first_boundary
* we do not cross a boundary
* - atop(prev) - 1 == atop(next)
* still in the same segment
* - low <= last
* - high > last still in the same memory range
* - memtype is equal allocator is unable to view different memtypes
* as part of the same segment
* - prev - 1 == next no array breakage occurs
*/
prev = pg;
next = TAILQ_NEXT(prev, pageq);
while (next != NULL &&
(boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) &&
atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) &&
pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
prev - 1 == next) {
prev = next;
next = TAILQ_NEXT(prev, pageq);
}
/*
* Start of this segment.
*/
return prev;
}
/*
* Remove the first segment of contiguous pages from pgl.
* A segment ends if it crosses boundary (unless boundary = 0) or
* if it would enter a different uvm_pmemrange.
*
* Work: the page range that the caller is currently working with.
* May be null.
*
* If is_desperate is non-zero, the smallest segment is erased. Otherwise,
* the first segment is erased (which, if called by uvm_pmr_getpages(),
* probably is the smallest or very close to it).
*/
psize_t
uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
struct vm_page **work, int is_desperate)
{
struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest;
psize_t count;
struct uvm_pmemrange *pmr, *pmr_iter;
KASSERT(!TAILQ_EMPTY(pgl));
/*
* Initialize to first page.
* Unless desperate scan finds a better candidate, this is what'll be
* erased.
*/
start = TAILQ_FIRST(pgl);
pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
end = uvm_pmr_findnextsegment(pmr, start, boundary);
/*
* If we are desperate, we _really_ want to get rid of the smallest
* element (rather than a close match to the smallest element).
*/
if (is_desperate) {
/* Linear search for smallest segment. */
pmr_iter = pmr;
for (iter = TAILQ_NEXT(end, pageq);
iter != NULL && start != end;
iter = TAILQ_NEXT(iter_end, pageq)) {
/*
* Only update pmr if it doesn't match current
* iteration.
*/
if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
pmr_iter = uvm_pmemrange_find(atop(
VM_PAGE_TO_PHYS(iter)));
}
iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
boundary);
/*
* Current iteration is smaller than best match so
* far; update.
*/
if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
start = iter;
end = iter_end;
pmr = pmr_iter;
}
}
}
/*
* Calculate count and end of the list.
*/
count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
lowest = start;
end = TAILQ_NEXT(end, pageq);
/*
* Actually remove the range of pages.
*
* Sadly, this cannot be done using pointer iteration:
* vm_physseg is not guaranteed to be sorted on address, hence
* uvm_page_init() may not have initialized its array sorted by
* page number.
*/
for (iter = start; iter != end; iter = iter_end) {
iter_end = TAILQ_NEXT(iter, pageq);
TAILQ_REMOVE(pgl, iter, pageq);
}
lowest->fpgsz = count;
inserted = uvm_pmr_insert(pmr, lowest, 0);
/*
* If the caller was working on a range and this function modified
* that range, update the pointer.
*/
if (work != NULL && *work != NULL &&
atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
atop(VM_PAGE_TO_PHYS(*work)))
*work = inserted;
return count;
}
/*
* Remove the first segment of contiguous pages from a pgl
* with the list elements in reverse order of physaddr.
*
* A segment ends if it would enter a different uvm_pmemrange.
*
* Stores starting physical address of the segment in pstart.
*/
psize_t
uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
{
struct vm_page *start, *end, *iter, *iter_end, *lowest;
psize_t count;
struct uvm_pmemrange *pmr;
KASSERT(!TAILQ_EMPTY(pgl));
start = TAILQ_FIRST(pgl);
pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
end = uvm_pmr_findprevsegment(pmr, start, 0);
KASSERT(end <= start);
/*
* Calculate count and end of the list.
*/
count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1;
lowest = end;
end = TAILQ_NEXT(end, pageq);
/*
* Actually remove the range of pages.
*
* Sadly, this cannot be done using pointer iteration:
* vm_physseg is not guaranteed to be sorted on address, hence
* uvm_page_init() may not have initialized its array sorted by
* page number.
*/
for (iter = start; iter != end; iter = iter_end) {
iter_end = TAILQ_NEXT(iter, pageq);
TAILQ_REMOVE(pgl, iter, pageq);
}
lowest->fpgsz = count;
(void) uvm_pmr_insert(pmr, lowest, 0);
*pstart = VM_PAGE_TO_PHYS(lowest);
return count;
}
/*
* Extract a number of pages from a segment of free pages.
* Called by uvm_pmr_getpages.
*
* Returns the segment that was created from pages left over at the tail
* of the remove set of pages, or NULL if no pages were left at the tail.
*/
struct vm_page *
uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
paddr_t start, paddr_t end, struct pglist *result)
{
struct vm_page *after, *pg_i;
psize_t before_sz, after_sz;
#ifdef DEBUG
psize_t i;
#endif
KDASSERT(end > start);
KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
uvm_pmr_assertvalid(pmr);
uvm_pmr_remove_size(pmr, pg);
if (before_sz == 0)
uvm_pmr_remove_addr(pmr, pg);
after = pg + before_sz + (end - start);
/* Add selected pages to result. */
for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
KASSERT(pg_i->pg_flags & PQ_FREE);
pg_i->fpgsz = 0;
TAILQ_INSERT_TAIL(result, pg_i, pageq);
}
/* Before handling. */
if (before_sz > 0) {
pg->fpgsz = before_sz;
uvm_pmr_insert_size(pmr, pg);
}
/* After handling. */
if (after_sz > 0) {
#ifdef DEBUG
for (i = 0; i < after_sz; i++) {
KASSERT(!uvm_pmr_isfree(after + i));
}
#endif
KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
after->fpgsz = after_sz;
after = uvm_pmr_insert_addr(pmr, after, 1);
uvm_pmr_insert_size(pmr, after);
}
uvm_pmr_assertvalid(pmr);
return (after_sz > 0 ? after : NULL);
}
/*
* Indicate to the page daemon that a nowait call failed and it should
* recover at least some memory in the most restricted region (assumed
* to be dma_constraint).
*/
extern volatile int uvm_nowait_failed;
/*
* Acquire a number of pages.
*
* count: the number of pages returned
* start: lowest page number
* end: highest page number +1
* (start = end = 0: no limitation)
* align: power-of-2 alignment constraint (align = 1: no alignment)
* boundary: power-of-2 boundary (boundary = 0: no boundary)
* maxseg: maximum number of segments to return
* flags: UVM_PLA_* flags
* result: returned pages storage (uses pageq)
*/
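/*
 * Minimal caller sketch (illustrative only; the numbers are made up):
 * allocate 16 zeroed pages below 16MB, in at most 16 segments, sleeping
 * until memory is available:
 *
 *	struct pglist pgl;
 *
 *	TAILQ_INIT(&pgl);
 *	error = uvm_pmr_getpages(16, 0, atop(16 * 1024 * 1024), 1, 0, 16,
 *	    UVM_PLA_WAITOK | UVM_PLA_ZERO, &pgl);
 *
 * On success the pages are linked through pageq in pgl.
 */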
int
uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
paddr_t boundary, int maxseg, int flags, struct pglist *result)
{
struct uvm_pmemrange *pmr; /* Iterate memory ranges. */
struct vm_page *found, *f_next; /* Iterate chunks. */
psize_t fcount; /* Current found pages. */
int fnsegs; /* Current segment counter. */
int try, start_try;
psize_t search[3];
paddr_t fstart, fend; /* Pages to be taken from found. */
int memtype; /* Requested memtype. */
int memtype_init; /* Best memtype. */
int desperate; /* True if allocation failed. */
#ifdef DIAGNOSTIC
struct vm_page *diag_prev; /* Used during validation. */
#endif /* DIAGNOSTIC */
/*
* Validate arguments.
*/
KASSERT(count > 0);
KASSERT(start == 0 || end == 0 || start < end);
KASSERT(align >= 1);
KASSERT(powerof2(align));
KASSERT(maxseg > 0);
KASSERT(boundary == 0 || powerof2(boundary));
KASSERT(boundary == 0 || maxseg * boundary >= count);
KASSERT(TAILQ_EMPTY(result));
KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
/*
* TRYCONTIG is a noop if you only want a single segment.
* Remove it if that's the case: otherwise it'll deny the fast
* allocation.
*/
if (maxseg == 1 || count == 1)
flags &= ~UVM_PLA_TRYCONTIG;
/*
* Configure search.
*
* search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
* search[1] is multiple segments, chosen to fulfill the search in
* approximately even-sized segments.
* This is a good trade-off between slightly reduced allocation speed
* and less fragmentation.
* search[2] is the worst case, in which all segments are evaluated.
* This provides the least fragmentation, but makes the search
* possibly longer (although in the case it is selected, that no
* longer matters most).
*
* The exception is when maxseg == 1: since we can only fulfill that
* with one segment of size pages, only a single search type has to
* be attempted.
*/
if (maxseg == 1 || count == 1) {
start_try = 2;
search[2] = count;
} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
start_try = 2;
search[2] = 1;
} else {
start_try = 0;
search[0] = count;
search[1] = pow2divide(count, maxseg);
search[2] = 1;
if ((flags & UVM_PLA_TRYCONTIG) == 0)
start_try = 1;
if (search[1] >= search[0]) {
search[1] = search[0];
start_try = 1;
}
if (search[2] >= search[start_try]) {
start_try = 2;
}
}
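/*
 * Example (illustrative only): count = 10, maxseg = 3 without
 * UVM_PLA_TRYCONTIG yields search[0] = 10, search[1] = pow2divide(10, 3) = 4,
 * search[2] = 1 and start_try = 1, so the scan first looks for free ranges
 * of at least 4 pages and only falls back to single-page ranges on the last
 * try.
 */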
/*
* Memory type: if zeroed memory is requested, traverse the zero set.
* Otherwise, traverse the dirty set.
*
* The memtype iterator is reinitialized to memtype_init on entrance
* of a pmemrange.
*/
if (flags & UVM_PLA_ZERO)
memtype_init = UVM_PMR_MEMTYPE_ZERO;
else
memtype_init = UVM_PMR_MEMTYPE_DIRTY;
/*
* Initially, we're not desperate.
*
* Note that if we return from a sleep, we are still desperate.
* Chances are that memory pressure is still high, so resetting
* seems over-optimistic to me.
*/
desperate = 0;
again:
uvm_lock_fpageq();
/*
* check to see if we need to generate some free pages waking
* the pagedaemon.
*/
if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
(uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
wakeup(&uvm.pagedaemon);
/*
* fail if any of these conditions is true:
* [1] there really are no free pages, or
* [2] only kernel "reserved" pages remain and
* the UVM_PLA_USERESERVE flag wasn't used.
* [3] only pagedaemon "reserved" pages remain and
* the requestor isn't the pagedaemon nor the syncer.
*/
if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) &&
!(flags & UVM_PLA_USERESERVE)) {
uvm_unlock_fpageq();
return ENOMEM;
}
if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
(curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) {
uvm_unlock_fpageq();
if (flags & UVM_PLA_WAITOK) {
uvm_wait("uvm_pmr_getpages");
goto again;
}
return ENOMEM;
}
retry: /* Return point after sleeping. */
fcount = 0;
fnsegs = 0;
retry_desperate:
/*
* If we just want any page(s), go for the really fast option.
*/
if (count <= maxseg && align == 1 && boundary == 0 &&
(flags & UVM_PLA_TRYCONTIG) == 0) {
fcount += uvm_pmr_get1page(count - fcount, memtype_init,
result, start, end, 0);
/*
* If we found sufficient pages, go to the success exit code.
*
* Otherwise, go immediately to fail, since we collected
* all we could anyway.
*/
if (fcount == count)
goto out;
else
goto fail;
}
/*
* The heart of the contig case.
*
* The code actually looks like this:
*
* foreach (struct pmemrange) {
* foreach (memtype) {
* foreach(try) {
* foreach (free range of memtype in pmemrange,
* starting at search[try]) {
* while (range has space left)
* take from range
* }
* }
* }
*
* if next pmemrange has higher usecount than current:
* enter desperate case (which will drain the pmemranges
* until empty prior to moving to the next one)
* }
*
* When desperate is activated, try always starts at the highest
* value. The memtype loop is using a goto ReScanMemtype.
* The try loop is using a goto ReScan.
* The 'range has space left' loop uses label DrainFound.
*
* Writing them all as loops would take up a lot of screen space in
* the form of indentation and some parts are easier to express
* using the labels.
*/
TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
/* Empty range. */
if (pmr->nsegs == 0)
continue;
/* Outside requested range. */
if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
continue;
memtype = memtype_init;
rescan_memtype: /* Return point at memtype++. */
try = start_try;
rescan: /* Return point at try++. */
for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
found != NULL;
found = f_next) {
f_next = uvm_pmr_nextsz(pmr, found, memtype);
fstart = atop(VM_PAGE_TO_PHYS(found));
if (start != 0)
fstart = MAX(start, fstart);
drain_found:
/*
* Throw away the first segment if fnsegs == maxseg
*
* Note that f_next is still valid after this call,
* since we only allocated from entries before f_next.
* We don't revisit the entries we already extracted
* from unless we entered the desperate case.
*/
if (fnsegs == maxseg) {
fnsegs--;
fcount -=
uvm_pmr_remove_1strange(result, boundary,
&found, desperate);
}
fstart = PMR_ALIGN(fstart, align);
fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
if (end != 0)
fend = MIN(end, fend);
if (boundary != 0) {
fend =
MIN(fend, PMR_ALIGN(fstart + 1, boundary));
}
if (fstart >= fend)
continue;
if (fend - fstart > count - fcount)
fend = fstart + (count - fcount);
fcount += fend - fstart;
fnsegs++;
found = uvm_pmr_extract_range(pmr, found,
fstart, fend, result);
if (fcount == count)
goto out;
/*
* If there's still space left in found, try to
* fully drain it prior to continuing.
*/
if (found != NULL) {
fstart = fend;
goto drain_found;
}
}
/* Try a smaller search now. */
if (++try < nitems(search))
goto rescan;
/*
* Exhaust all memory types prior to going to the next memory
* segment.
* This means that zero-vs-dirty are eaten prior to moving
* to a pmemrange with a higher use-count.
*
* Code is basically a difficult way of writing:
* memtype = memtype_init;
* do {
* ...;
* memtype += 1;
* memtype %= MEMTYPE_MAX;
* } while (memtype != memtype_init);
*/
memtype += 1;
if (memtype == UVM_PMR_MEMTYPE_MAX)
memtype = 0;
if (memtype != memtype_init)
goto rescan_memtype;
/*
* If not desperate, enter desperate case prior to eating all
* the good stuff in the next range.
*/
if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
break;
}
/*
* Not enough memory of the requested type available. Fall back to
* less good memory that we'll clean up better later.
*
* This algorithm is not very smart though, it just starts scanning
* a different typed range, but the nicer ranges of the previous
* iteration may fall out. Hence there is a small chance of a false
* negative.
*
* When desperate: scan all sizes starting at the smallest
* (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
* allow us to hit the fast path now).
*
* Also, because we will revisit entries we scanned before, we need
* to reset the page queue, or we may end up releasing entries in
* such a way as to invalidate f_next.
*/
if (!desperate) {
desperate = 1;
start_try = nitems(search) - 1;
flags &= ~UVM_PLA_TRYCONTIG;
while (!TAILQ_EMPTY(result))
uvm_pmr_remove_1strange(result, 0, NULL, 0);
fnsegs = 0;
fcount = 0;
goto retry_desperate;
}
fail:
/* Allocation failed. */
/* XXX: claim from memory reserve here */
while (!TAILQ_EMPTY(result))
uvm_pmr_remove_1strange(result, 0, NULL, 0);
if (flags & UVM_PLA_WAITOK) {
if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
flags & UVM_PLA_FAILOK) == 0)
goto retry;
KASSERT(flags & UVM_PLA_FAILOK);
} else {
if (!(flags & UVM_PLA_NOWAKE)) {
uvm_nowait_failed = 1;
wakeup(&uvm.pagedaemon);
}
}
uvm_unlock_fpageq();
return ENOMEM;
out:
/* Allocation successful. */
uvmexp.free -= fcount;
uvm_unlock_fpageq();
/* Update statistics and zero pages if UVM_PLA_ZERO. */
#ifdef DIAGNOSTIC
fnsegs = 0;
fcount = 0;
diag_prev = NULL;
#endif /* DIAGNOSTIC */
TAILQ_FOREACH(found, result, pageq) {
atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
if (found->pg_flags & PG_ZERO) {
uvm_lock_fpageq();
uvmexp.zeropages--;
if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
wakeup(&uvmexp.zeropages);
uvm_unlock_fpageq();
}
if (flags & UVM_PLA_ZERO) {
if (found->pg_flags & PG_ZERO)
uvmexp.pga_zerohit++;
else {
uvmexp.pga_zeromiss++;
uvm_pagezero(found);
}
}
atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
found->uobject = NULL;
found->uanon = NULL;
found->pg_version++;
/*
* Validate that the page matches the range criteria.
*/
KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
#ifdef DIAGNOSTIC
/*
* Update fcount (# found pages) and
* fnsegs (# found segments) counters.
*/
if (diag_prev == NULL ||
/* new segment if it contains a hole */
atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
atop(VM_PAGE_TO_PHYS(found)) ||
/* new segment if it crosses boundary */
(atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
(atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
fnsegs++;
fcount++;
diag_prev = found;
#endif /* DIAGNOSTIC */
}
#ifdef DIAGNOSTIC
/*
* Panic on algorithm failure.
*/
if (fcount != count || fnsegs > maxseg) {
panic("pmemrange allocation error: "
"allocated %ld pages in %d segments, "
"but request was %ld pages in %d segments",
fcount, fnsegs, count, maxseg);
}
#endif /* DIAGNOSTIC */
return 0;
}
/*
* Free a number of contig pages (invoked by uvm_page_init).
*/
void
uvm_pmr_freepages(struct vm_page *pg, psize_t count)
{
struct uvm_pmemrange *pmr;
psize_t i, pmr_count;
struct vm_page *firstpg = pg;
for (i = 0; i < count; i++) {
KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
atop(VM_PAGE_TO_PHYS(pg)) + i);
if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
VALID_FLAGS(pg[i].pg_flags))) {
printf("Flags: 0x%x, will panic now.\n",
pg[i].pg_flags);
}
KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
VALID_FLAGS(pg[i].pg_flags));
atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
}
uvm_lock_fpageq();
for (i = count; i > 0; i -= pmr_count) {
pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
KASSERT(pmr != NULL);
pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
pg->fpgsz = pmr_count;
uvm_pmr_insert(pmr, pg, 0);
uvmexp.free += pmr_count;
pg += pmr_count;
}
wakeup(&uvmexp.free);
if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
wakeup(&uvmexp.zeropages);
uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
uvm_unlock_fpageq();
}
/*
* Free all pages in the queue.
*/
void
uvm_pmr_freepageq(struct pglist *pgl)
{
struct vm_page *pg;
paddr_t pstart;
psize_t plen;
TAILQ_FOREACH(pg, pgl, pageq) {
if (!((pg->pg_flags & PQ_FREE) == 0 &&
VALID_FLAGS(pg->pg_flags))) {
printf("Flags: 0x%x, will panic now.\n",
pg->pg_flags);
}
KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
VALID_FLAGS(pg->pg_flags));
atomic_setbits_int(&pg->pg_flags, PQ_FREE);
atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
}
uvm_lock_fpageq();
while (!TAILQ_EMPTY(pgl)) {
pg = TAILQ_FIRST(pgl);
if (pg == TAILQ_NEXT(pg, pageq) + 1) {
/*
* If pg is one behind the position of the
* next page in the list in the page array,
* try going backwards instead of forward.
*/
plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
} else {
pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
}
uvmexp.free += plen;
uvm_wakeup_pla(pstart, ptoa(plen));
}
wakeup(&uvmexp.free);
if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
wakeup(&uvmexp.zeropages);
uvm_unlock_fpageq();
return;
}
/*
* Store a pmemrange in the list.
*
* The list is sorted by use.
*/
struct uvm_pmemrange *
uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
struct uvm_pmemrange *pmr)
{
struct uvm_pmemrange *iter;
int cmp = 1;
TAILQ_FOREACH(iter, useq, pmr_use) {
cmp = uvm_pmemrange_use_cmp(pmr, iter);
if (cmp == 0)
return iter;
if (cmp == -1)
break;
}
if (iter == NULL)
TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
else
TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
return NULL;
}
#ifdef DEBUG
/*
* Validation of the whole pmemrange.
* Called with fpageq locked.
*/
void
uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
{
struct vm_page *prev, *next, *i, *xref;
int lcv, mti;
/* Empty range */
if (pmr->nsegs == 0)
return;
/* Validate address tree. */
RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
/* Validate the range. */
KASSERT(i->fpgsz > 0);
KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
<= pmr->high);
/* Validate each page in this range. */
for (lcv = 0; lcv < i->fpgsz; lcv++) {
/*
* Only the first page has a size specification.
* Rest is size 0.
*/
KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
/*
* Flag check.
*/
KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
(i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
/*
* Free pages are:
* - not wired
* - have no vm_anon
* - have no uvm_object
*/
KASSERT(i[lcv].wire_count == 0);
KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
i[lcv].uanon == NULL);
KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
i[lcv].uobject == NULL);
/*
* Pages in a single range always have the same
* memtype.
*/
KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
uvm_pmr_pg_to_memtype(&i[lcv]));
}
/* Check that it shouldn't be joined with its predecessor. */
prev = RBT_PREV(uvm_pmr_addr, i);
if (prev != NULL) {
KASSERT(uvm_pmr_pg_to_memtype(i) !=
uvm_pmr_pg_to_memtype(prev) ||
atop(VM_PAGE_TO_PHYS(i)) >
atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
prev + prev->fpgsz != i);
}
/* Assert i is in the size tree as well. */
if (i->fpgsz == 1) {
TAILQ_FOREACH(xref,
&pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
if (xref == i)
break;
}
KASSERT(xref == i);
} else {
KASSERT(RBT_FIND(uvm_pmr_size,
&pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
i + 1);
}
}
/* Validate size tree. */
for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
next = uvm_pmr_nextsz(pmr, i, mti);
if (next != NULL) {
KASSERT(i->fpgsz <=
next->fpgsz);
}
/* Assert i is in the addr tree as well. */
KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
/* Assert i is of the correct memory type. */
KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
}
}
/* Validate nsegs statistic. */
lcv = 0;
RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
lcv++;
KASSERT(pmr->nsegs == lcv);
}
#endif /* DEBUG */
/*
* Split pmr at split point pageno.
* Called with fpageq unlocked.
*
* Split is only applied if a pmemrange spans pageno.
*/
void
uvm_pmr_split(paddr_t pageno)
{
struct uvm_pmemrange *pmr, *drain;
struct vm_page *rebuild, *prev, *next;
psize_t prev_sz;
uvm_lock_fpageq();
pmr = uvm_pmemrange_find(pageno);
if (pmr == NULL || !(pmr->low < pageno)) {
/* No split required. */
uvm_unlock_fpageq();
return;
}
KASSERT(pmr->low < pageno);
KASSERT(pmr->high > pageno);
/*
* uvm_pmr_allocpmr() calls into malloc() which in turn calls into
* uvm_kmemalloc which calls into pmemrange, making the locking
* a bit hard, so we just race!
*/
uvm_unlock_fpageq();
drain = uvm_pmr_allocpmr();
uvm_lock_fpageq();
pmr = uvm_pmemrange_find(pageno);
if (pmr == NULL || !(pmr->low < pageno)) {
/*
* We lost the race since someone else ran this or a related
* function, however this should be triggered very rarely so
* we just leak the pmr.
*/
printf("uvm_pmr_split: lost one pmr\n");
uvm_unlock_fpageq();
return;
}
drain->low = pageno;
drain->high = pmr->high;
drain->use = pmr->use;
uvm_pmr_assertvalid(pmr);
uvm_pmr_assertvalid(drain);
KASSERT(drain->nsegs == 0);
RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
break;
}
if (rebuild == NULL)
prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
else
prev = RBT_PREV(uvm_pmr_addr, rebuild);
KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
/*
* Handle free chunk that spans the split point.
*/
if (prev != NULL &&
atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
psize_t before, after;
KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
uvm_pmr_remove(pmr, prev);
prev_sz = prev->fpgsz;
before = pageno - atop(VM_PAGE_TO_PHYS(prev));
after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
KASSERT(before > 0);
KASSERT(after > 0);
prev->fpgsz = before;
uvm_pmr_insert(pmr, prev, 1);
(prev + before)->fpgsz = after;
uvm_pmr_insert(drain, prev + before, 1);
}
/* Move free chunks that no longer fall in the range. */
for (; rebuild != NULL; rebuild = next) {
next = RBT_NEXT(uvm_pmr_addr, rebuild);
uvm_pmr_remove(pmr, rebuild);
uvm_pmr_insert(drain, rebuild, 1);
}
pmr->high = pageno;
uvm_pmr_assertvalid(pmr);
uvm_pmr_assertvalid(drain);
RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
uvm_unlock_fpageq();
}
/*
* Increase the usage counter for the given range of memory.
*
* The more usage counters a given range of memory has, the more will be
* attempted not to allocate from it.
*
* Addresses here are in paddr_t, not page-numbers.
* The lowest and highest allowed address are specified.
*/
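/*
 * Example (illustrative only): uvm_pmr_use_inc(0, (16 * 1024 * 1024) - 1)
 * splits the pmemranges at page atop(16MB), bumps the use count of every
 * pmemrange that falls entirely within [0, 16MB) and re-sorts them in the
 * use queue, so later allocations prefer memory outside that constrained
 * range.
 */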
void
uvm_pmr_use_inc(paddr_t low, paddr_t high)
{
struct uvm_pmemrange *pmr;
paddr_t sz;
/* pmr uses page numbers, translate low and high. */
high++;
high = atop(trunc_page(high));
low = atop(round_page(low));
uvm_pmr_split(low);
uvm_pmr_split(high);
sz = 0;
uvm_lock_fpageq();
/* Increase use count on segments in range. */
RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
pmr->use++;
sz += pmr->high - pmr->low;
uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
}
uvm_pmr_assertvalid(pmr);
}
uvm_unlock_fpageq();
KASSERT(sz >= high - low);
}
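/*
* Hypothetical example (illustrative only): mark the ISA DMA window
* (first 16 MB of physical memory) as constrained so the allocator
* prefers other ranges.  Machine-dependent code normally describes such
* windows through uvm_md_constraints rather than calling this directly.
*/
#if 0
void
example_reserve_isadma(void)
{
	/* Arguments are byte addresses (paddr_t), not page numbers. */
	uvm_pmr_use_inc(0, (16UL * 1024 * 1024) - 1);
}
#endif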
/*
* Allocate a pmemrange.
*
* If called from uvm_page_init, uvm_pageboot_alloc() is used.
* If called after uvm_init, malloc is used.
* (And if called in between, you're dead.)
*/
struct uvm_pmemrange *
uvm_pmr_allocpmr(void)
{
struct uvm_pmemrange *nw;
int i;
/* We're only ever hitting the !uvm.page_init_done case for now. */
if (!uvm.page_init_done) {
nw = (struct uvm_pmemrange *)
uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
} else {
nw = malloc(sizeof(struct uvm_pmemrange),
M_VMMAP, M_NOWAIT);
}
KASSERT(nw != NULL);
memset(nw, 0, sizeof(struct uvm_pmemrange));
RBT_INIT(uvm_pmr_addr, &nw->addr);
for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
RBT_INIT(uvm_pmr_size, &nw->size[i]);
TAILQ_INIT(&nw->single[i]);
}
return nw;
}
/*
* Initialization of pmr.
* Called by uvm_page_init.
*
* Sets up pmemranges.
*/
void
uvm_pmr_init(void)
{
struct uvm_pmemrange *new_pmr;
int i;
TAILQ_INIT(&uvm.pmr_control.use);
RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
TAILQ_INIT(&uvm.pmr_control.allocs);
/* By default, one range for the entire address space. */
new_pmr = uvm_pmr_allocpmr();
new_pmr->low = 0;
new_pmr->high = atop((paddr_t)-1) + 1;
RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
for (i = 0; uvm_md_constraints[i] != NULL; i++) {
uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
uvm_md_constraints[i]->ucr_high);
}
}
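/*
* Hypothetical constraint table (names and values illustrative only):
* the loop in uvm_pmr_init() consumes a NULL-terminated array of
* struct uvm_constraint_range, normally provided by machine-dependent
* code, e.g. a "DMA below 4 GB" window.
*/
#if 0
struct uvm_constraint_range example_dma_range = {
	.ucr_low = 0,
	.ucr_high = 0xffffffffUL,
};
struct uvm_constraint_range *example_md_constraints[] = {
	&example_dma_range,
	NULL
};
#endif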
/*
* Find the pmemrange that contains the given page number.
*
* (Manually traverses the binary tree, because that is cheaper on stack
* usage.)
*/
struct uvm_pmemrange *
uvm_pmemrange_find(paddr_t pageno)
{
struct uvm_pmemrange *pmr;
pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
while (pmr != NULL) {
if (pmr->low > pageno)
pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
else if (pmr->high <= pageno)
pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
else
break;
}
return pmr;
}
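/*
* Usage sketch (illustrative only): the tree is keyed on page numbers,
* so callers pass atop(physical address), as uvm_pmr_isfree() below does.
*/
#if 0
struct uvm_pmemrange *
example_find_by_paddr(paddr_t pa)
{
	return uvm_pmemrange_find(atop(pa));
}
#endif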
#if defined(DDB) || defined(DEBUG)
/*
* Return true if the given page is in any of the free lists.
* Used by uvm_page_printit.
* This function is safe, even if the page is not on the freeq.
* Note: does not apply locking, only called from ddb.
*/
int
uvm_pmr_isfree(struct vm_page *pg)
{
struct vm_page *r;
struct uvm_pmemrange *pmr;
pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
if (pmr == NULL)
return 0;
r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
if (r == NULL)
r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
else if (r != pg)
r = RBT_PREV(uvm_pmr_addr, r);
if (r == NULL)
return 0; /* Empty tree. */
KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
atop(VM_PAGE_TO_PHYS(pg));
}
#endif /* DDB || DEBUG */
/*
* Given a root of a tree, find a range which intersects start, end and
* is of the same memtype.
*
* Page must be in the address tree.
*/
struct vm_page *
uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
paddr_t start, paddr_t end, int memtype)
{
int direction;
struct vm_page *root;
struct vm_page *high, *high_next;
struct vm_page *low, *low_next;
KDASSERT(pmr != NULL && init_root != NULL);
root = init_root;
/* Which direction to use for searching. */
if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
direction = 1;
else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
direction = -1;
else /* nothing to do */
return root;
/* First, update root to fall within the chosen range. */
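/*
* The pmr address tree hangs off the same RBT entry (objt) in
* struct vm_page as uvm_objtree, so the uvm_objtree left/right
* accessors walk it correctly here.
*/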
while (root && !PMR_INTERSECTS_WITH(
atop(VM_PAGE_TO_PHYS(root)),
atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
start, end)) {
if (direction == 1)
root = RBT_RIGHT(uvm_objtree, root);
else
root = RBT_LEFT(uvm_objtree, root);
}
if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
return root;
/*
* Root is valid, but of the wrong memtype.
*
* Try to find a range that has the given memtype in the subtree
* (memtype mismatches are costly, either because the conversion
* is expensive, or a later allocation will need to do the opposite
* conversion, which will be expensive).
*
* First, simply increase address until we hit something we can use.
* Cache the upper page, so we can page-walk later.
*/
high = root;
high_next = RBT_RIGHT(uvm_objtree, high);
while (high_next != NULL && PMR_INTERSECTS_WITH(
atop(VM_PAGE_TO_PHYS(high_next)),
atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
start, end)) {
high = high_next;
if (uvm_pmr_pg_to_memtype(high) == memtype)
return high;
high_next = RBT_RIGHT(uvm_objtree, high);
}
/*
* Second, decrease the address until we hit something we can use.
* Cache the lower page, so we can page-walk later.
*/
low = root;
low_next = RBT_LEFT(uvm_objtree, low);
while (low_next != NULL && PMR_INTERSECTS_WITH(
atop(VM_PAGE_TO_PHYS(low_next)),
atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
start, end)) {
low = low_next;
if (uvm_pmr_pg_to_memtype(low) == memtype)
return low;
low_next = RBT_LEFT(uvm_objtree, low);
}
if (low == high)
return NULL;
/* No hits. Walk the address tree until we find something usable. */
for (low = RBT_NEXT(uvm_pmr_addr, low);
low != high;
low = RBT_NEXT(uvm_pmr_addr, low)) {
KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
start, end));
if (uvm_pmr_pg_to_memtype(low) == memtype)
return low;
}
/* Nothing found. */
return NULL;
}
/*
* Allocate any page, the fastest way. Page number constraints only.
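*
* Within each range, the search tries, per memory type: the single-page
* list first, then the smallest chunk in the size tree, and finally an
* address-guided walk of the addr tree via uvm_pmr_rootupdate().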
*/
psize_t
uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
paddr_t start, paddr_t end, int memtype_only)
{
struct uvm_pmemrange *pmr;
struct vm_page *found, *splitpg;
psize_t fcount;
int memtype;
fcount = 0;
TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
/* We're done. */
if (fcount == count)
break;
/* Outside requested range. */
if (!(start == 0 && end == 0) &&
!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
continue;
/* Range is empty. */
if (pmr->nsegs == 0)
continue;
/* Loop over all memtypes, starting at memtype_init. */
memtype = memtype_init;
while (fcount != count) {
found = TAILQ_FIRST(&pmr->single[memtype]);
/*
* If found is outside the range, walk the list
* until we find something that intersects with
* boundaries.
*/
while (found && !PMR_INTERSECTS_WITH(
atop(VM_PAGE_TO_PHYS(found)),
atop(VM_PAGE_TO_PHYS(found)) + 1,
start, end))
found = TAILQ_NEXT(found, pageq);
if (found == NULL) {
/*
* Check if the size tree contains a range
* that intersects with the boundaries. As the
* allocation is for any page, try the smallest
* range so that large ranges are preserved for
* more constrained cases. Only one entry is
* checked here, to avoid a brute-force search.
*
* Note that a size tree gives pg[1] instead of
* pg[0].
*/
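/*
* The first page of a chunk already carries the addr-tree entry;
* chunks of two or more pages keep their size-tree entry in the
* second page, hence the found-- below.
*/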
found = RBT_MIN(uvm_pmr_size,
&pmr->size[memtype]);
if (found != NULL) {
found--;
if (!PMR_INTERSECTS_WITH(
atop(VM_PAGE_TO_PHYS(found)),
atop(VM_PAGE_TO_PHYS(found)) +
found->fpgsz, start, end))
found = NULL;
}
}
if (found == NULL) {
/*
* Try address-guided search to meet the page
* number constraints.
*/
found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
if (found != NULL) {
found = uvm_pmr_rootupdate(pmr, found,
start, end, memtype);
}
}
if (found != NULL) {
uvm_pmr_assertvalid(pmr);
uvm_pmr_remove_size(pmr, found);
/*
* If the page intersects the end, then it'll
* need splitting.
*
* Note that we don't need to split if the page
* intersects start: the drain function will
* simply stop on hitting start.
*/
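/*
* Sketch of the split below:
*
*	found                  end     found + old fpgsz
*	|<---- kept in found ---->|<---- splitpg ---->|
*/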
if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
found->fpgsz > end) {
psize_t splitsz =
atop(VM_PAGE_TO_PHYS(found)) +
found->fpgsz - end;
uvm_pmr_remove_addr(pmr, found);
uvm_pmr_assertvalid(pmr);
found->fpgsz -= splitsz;
splitpg = found + found->fpgsz;
splitpg->fpgsz = splitsz;
uvm_pmr_insert(pmr, splitpg, 1);
/*
* At this point, splitpg and found
* actually should be joined.
* But we explicitly disable that,
* because we will start subtracting
* from found.
*/
KASSERT(start == 0 ||
atop(VM_PAGE_TO_PHYS(found)) +
found->fpgsz > start);
uvm_pmr_insert_addr(pmr, found, 1);
}
/*
* Fetch pages from the end.
* If the range is larger than the requested
* number of pages, this saves us an addr-tree
* update.
*
* Since we take pages from the end of the chunk and
* insert them at the head of the result list,
* contiguous runs stay in ascending order and are
* preserved.
*/
while (found->fpgsz > 0 && fcount < count &&
(start == 0 ||
atop(VM_PAGE_TO_PHYS(found)) +
found->fpgsz > start)) {
found->fpgsz--;
fcount++;
TAILQ_INSERT_HEAD(result,
&found[found->fpgsz], pageq);
}
if (found->fpgsz > 0) {
uvm_pmr_insert_size(pmr, found);
KDASSERT(fcount == count);
uvm_pmr_assertvalid(pmr);
return fcount;
}
/*
* Delayed addr-tree removal.
*/
uvm_pmr_remove_addr(pmr, found);
uvm_pmr_assertvalid(pmr);
} else {
if (memtype_only)
break;
/*
* Skip to the next memtype.
*/
memtype += 1;
if (memtype == UVM_PMR_MEMTYPE_MAX)
memtype = 0;
if (memtype == memtype_init)
break;
}
}
}
/*
* Search finished.
*
* Ran out of ranges before enough pages were gathered, or we hit the
* case where found->fpgsz == count - fcount, in which case the
* above exit condition didn't trigger.
*
* On failure, caller will free the pages.
*/
return fcount;
}
#ifdef DDB
/*
* Print information about pmemrange.
* Does no locking (so either call it from DDB or acquire the fpageq lock
* before invoking).
*/
void
uvm_pmr_print(void)
{
struct uvm_pmemrange *pmr;
struct vm_page *pg;
psize_t size[UVM_PMR_MEMTYPE_MAX];
psize_t free;
int useq_len;
int mt;
printf("Ranges, use queue:\n");
useq_len = 0;
TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
useq_len++;
free = 0;
for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
if (pg != NULL)
pg--;
else
pg = TAILQ_FIRST(&pmr->single[mt]);
size[mt] = (pg == NULL ? 0 : pg->fpgsz);
RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
free += pg->fpgsz;
}
printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
(unsigned long)pmr->low, (unsigned long)pmr->high,
pmr->use, (unsigned long)pmr->nsegs);
for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
printf(" maxsegsz[%d]=0x%lx", mt,
(unsigned long)size[mt]);
}
printf(" free=0x%lx\n", (unsigned long)free);
}
printf("#ranges = %d\n", useq_len);
}
#endif
/*
* uvm_wait_pla: wait (sleep) for the page daemon to free some pages
* in a specific physmem area.
*
* Returns ENOMEM if the pagedaemon failed to free any pages and failok
* is set; if failok is not set, the request is retried until it succeeds.
*
* Must be called with fpageq locked.
*/
int
uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
{
struct uvm_pmalloc pma;
const char *wmsg = "pmrwait";
if (curproc == uvm.pagedaemon_proc) {
/*
* This is not that uncommon when the pagedaemon is trying
* to flush out a large mmapped file. VOP_WRITE will circle
* back through the buffer cache and try to get more memory.
* The pagedaemon starts by calling bufbackoff, but we can
* easily use up that reserve in a single scan iteration.
*/
uvm_unlock_fpageq();
if (bufbackoff(NULL, atop(size)) == 0) {
uvm_lock_fpageq();
return 0;
}
uvm_lock_fpageq();
/*
* XXX detect pagedaemon deadlock - see comment in
* uvm_wait(), as this is exactly the same issue.
*/
printf("pagedaemon: wait_pla deadlock detected!\n");
msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg,
MSEC_TO_NSEC(125));
#if defined(DEBUG)
/* DEBUG: panic so we can debug it */
panic("wait_pla pagedaemon deadlock");
#endif
return 0;
}
for (;;) {
pma.pm_constraint.ucr_low = low;
pma.pm_constraint.ucr_high = high;
pma.pm_size = size;
pma.pm_flags = UVM_PMA_LINKED;
TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
wakeup(&uvm.pagedaemon); /* wake the daemon! */
while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP);
if (!(pma.pm_flags & UVM_PMA_FREED) &&
pma.pm_flags & UVM_PMA_FAIL) {
if (failok)
return ENOMEM;
printf("uvm_wait: failed to free %ld pages between "
"0x%lx-0x%lx\n", atop(size), low, high);
} else
return 0;
}
/* UNREACHABLE */
}
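/*
* Usage sketch (hypothetical caller, illustrative only): ask the
* pagedaemon to free "bytes" worth of pages somewhere in [low, high],
* tolerating failure.  The fpageq lock must be held around the call.
*/
#if 0
int
example_wait_for_window(paddr_t low, paddr_t high, psize_t bytes)
{
	int error;

	uvm_lock_fpageq();
	error = uvm_wait_pla(low, high, bytes, 1);	/* failok */
	uvm_unlock_fpageq();
	return error;
}
#endif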
/*
* Wake up uvm_pmalloc sleepers.
*/
void
uvm_wakeup_pla(paddr_t low, psize_t len)
{
struct uvm_pmalloc *pma, *pma_next;
paddr_t high;
high = low + len;
/* Wake specific allocations waiting for this memory. */
for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
pma = pma_next) {
pma_next = TAILQ_NEXT(pma, pmq);
if (low < pma->pm_constraint.ucr_high &&
high > pma->pm_constraint.ucr_low) {
pma->pm_flags |= UVM_PMA_FREED;
if (!(pma->pm_flags & UVM_PMA_BUSY)) {
pma->pm_flags &= ~UVM_PMA_LINKED;
TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
pmq);
wakeup(pma);
}
}
}
}
void
uvm_pagezero_thread(void *arg)
{
struct pglist pgl;
struct vm_page *pg;
int count;
/* Run at the lowest possible priority. */
curproc->p_p->ps_nice = NZERO + PRIO_MAX;
KERNEL_UNLOCK();
TAILQ_INIT(&pgl);
for (;;) {
uvm_lock_fpageq();
while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
(count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
&pgl, 0, 0, 1)) == 0) {
msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock,
MAXPRI, "pgzero", INFSLP);
}
uvm_unlock_fpageq();
TAILQ_FOREACH(pg, &pgl, pageq) {
uvm_pagezero(pg);
atomic_setbits_int(&pg->pg_flags, PG_ZERO);
}
uvm_lock_fpageq();
while (!TAILQ_EMPTY(&pgl))
uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
uvmexp.zeropages += count;
uvm_unlock_fpageq();
yield();
}
}