18 8 8 7 2 1 2 1 3 3 3 2 1 2 4 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 /* $OpenBSD: wsevent.c,v 1.26 2022/07/02 08:50:42 visa Exp $ */ /* $NetBSD: wsevent.c,v 1.16 2003/08/07 16:31:29 agc Exp $ */ /* * Copyright (c) 1996, 1997 Christopher G. Demetriou. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christopher G. Demetriou * for the NetBSD Project. * 4. 
The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)event.c 8.1 (Berkeley) 6/11/93 */ /* * Internal "wscons_event" queue interface for the keyboard and mouse drivers. */ #include <sys/param.h> #include <sys/malloc.h> #include <sys/systm.h> #include <sys/vnode.h> #include <dev/wscons/wsconsio.h> #include <dev/wscons/wseventvar.h> void filt_wseventdetach(struct knote *); int filt_wseventread(struct knote *, long); const struct filterops wsevent_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_wseventdetach, .f_event = filt_wseventread, }; /* * Initialize a wscons_event queue. */ int wsevent_init(struct wseventvar *ev) { struct wscons_event *queue; if (ev->q != NULL) return (0); queue = mallocarray(WSEVENT_QSIZE, sizeof(struct wscons_event), M_DEVBUF, M_WAITOK | M_ZERO); if (ev->q != NULL) { free(queue, M_DEVBUF, WSEVENT_QSIZE * sizeof(struct wscons_event)); return (1); } ev->q = queue; ev->get = ev->put = 0; sigio_init(&ev->sigio); return (0); } /* * Tear down a wscons_event queue. 
*/ void wsevent_fini(struct wseventvar *ev) { if (ev->q == NULL) { #ifdef DIAGNOSTIC printf("wsevent_fini: already invoked\n"); #endif return; } free(ev->q, M_DEVBUF, WSEVENT_QSIZE * sizeof(struct wscons_event)); ev->q = NULL; sigio_free(&ev->sigio); } /* * User-level interface: read, kqueue. * (User cannot write an event queue.) */ int wsevent_read(struct wseventvar *ev, struct uio *uio, int flags) { int s, error; u_int cnt; size_t n; /* * Make sure we can return at least 1. */ if (uio->uio_resid < sizeof(struct wscons_event)) return (EMSGSIZE); /* ??? */ s = splwsevent(); while (ev->get == ev->put) { if (flags & IO_NDELAY) { splx(s); return (EWOULDBLOCK); } ev->wanted = 1; error = tsleep_nsec(ev, PWSEVENT | PCATCH, "wsevent_read", INFSLP); if (error) { splx(s); return (error); } } /* * Move wscons_event from tail end of queue (there is at least one * there). */ if (ev->put < ev->get) cnt = WSEVENT_QSIZE - ev->get; /* events in [get..QSIZE) */ else cnt = ev->put - ev->get; /* events in [get..put) */ splx(s); n = howmany(uio->uio_resid, sizeof(struct wscons_event)); if (cnt > n) cnt = n; error = uiomove((caddr_t)&ev->q[ev->get], cnt * sizeof(struct wscons_event), uio); n -= cnt; /* * If we do not wrap to 0, used up all our space, or had an error, * stop. Otherwise move from front of queue to put index, if there * is anything there to move. 
*/ if ((ev->get = (ev->get + cnt) % WSEVENT_QSIZE) != 0 || n == 0 || error || (cnt = ev->put) == 0) return (error); if (cnt > n) cnt = n; error = uiomove((caddr_t)&ev->q[0], cnt * sizeof(struct wscons_event), uio); ev->get = cnt; return (error); } int wsevent_kqfilter(struct wseventvar *ev, struct knote *kn) { struct klist *klist; int s; klist = &ev->sel.si_note; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &wsevent_filtops; break; default: return (EINVAL); } kn->kn_hook = ev; s = splwsevent(); klist_insert_locked(klist, kn); splx(s); return (0); } void filt_wseventdetach(struct knote *kn) { struct wseventvar *ev = kn->kn_hook; struct klist *klist = &ev->sel.si_note; int s; s = splwsevent(); klist_remove_locked(klist, kn); splx(s); } int filt_wseventread(struct knote *kn, long hint) { struct wseventvar *ev = kn->kn_hook; if (ev->get == ev->put) return (0); if (ev->get < ev->put) kn->kn_data = ev->put - ev->get; else kn->kn_data = (WSEVENT_QSIZE - ev->get) + ev->put; return (1); }
30 13 24 41 15 37 43 12 12 19 19 2 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 /* $OpenBSD: mld6.c,v 1.60 2022/09/05 15:47:39 bluhm Exp $ */ /* $KAME: mld6.c,v 1.26 2001/02/16 14:50:35 itojun Exp $ */ /* * Copyright (C) 1998 WIDE Project. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/syslog.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/mld6.h>
#include <netinet6/mld6_var.h>

/* Packet options (hop-by-hop header) passed to ip6_output() for every MLD packet. */
static struct ip6_pktopts ip6_opts;
int	mld6_timers_are_running;	/* [N] shortcut for fast timer */

void mld6_checktimer(struct ifnet *);
static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *);

/*
 * One-time setup: clear the timer shortcut flag and build the static
 * 8-byte hop-by-hop options header (a PadN option followed by a Router
 * Alert option carrying IP6OPT_RTALERT_MLD) that is stored in ip6_opts.
 */
void
mld6_init(void)
{
	static u_int8_t hbh_buf[8];
	struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf;
	u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD);

	mld6_timers_are_running = 0;

	/* ip6h_nxt will be filled in later */
	hbh->ip6h_len = 0;	/* (8 >> 3) - 1 */

	/* XXX: grotty hard coding... */
	hbh_buf[2] = IP6OPT_PADN;	/* 2 byte padding */
	hbh_buf[3] = 0;
	hbh_buf[4] = IP6OPT_ROUTER_ALERT;
	hbh_buf[5] = IP6OPT_RTALERT_LEN - 2;
	memcpy(&hbh_buf[6], (caddr_t)&rtalert_code, sizeof(u_int16_t));

	ip6_initpktopts(&ip6_opts);
	ip6_opts.ip6po_hbh = hbh;
}

/*
 * Called when a multicast membership record starts being used.
 *
 * For the link-scope all-nodes group and for groups whose scope is below
 * link-local, no report is sent and the record is marked
 * MLD_OTHERLISTENER with its timer stopped.  Otherwise a Report is sent
 * at once, a random delay timer is armed, and the record is marked
 * MLD_IREPORTEDLAST.
 */
void
mld6_start_listening(struct in6_multi *in6m)
{
	/* XXX: These are necessary for KAME's link-local hack */
	struct in6_addr all_nodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;

	/*
	 * RFC2710 page 10:
	 * The node never sends a Report or Done for the link-scope all-nodes
	 * address.
	 * MLD messages are never sent for multicast addresses whose scope is 0
	 * (reserved) or 1 (node-local).
	 */
	/* Embed the interface index so the comparison below matches in6m_addr. */
	all_nodes.s6_addr16[1] = htons(in6m->in6m_ifidx);
	if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_nodes) ||
	    __IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
	    __IPV6_ADDR_SCOPE_LINKLOCAL) {
		in6m->in6m_timer = 0;
		in6m->in6m_state = MLD_OTHERLISTENER;
	} else {
		mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
		in6m->in6m_timer =
		    MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ);
		in6m->in6m_state = MLD_IREPORTEDLAST;
		mld6_timers_are_running = 1;
	}
}

/*
 * Called when a multicast membership record stops being used.
 *
 * Sends a Done message to the link-local all-routers address, but only
 * if this node sent the last report for the group, the group is not the
 * link-scope all-nodes address, and its scope is above interface-local.
 */
void
mld6_stop_listening(struct in6_multi *in6m)
{
	/* XXX: These are necessary for KAME's link-local hack */
	struct in6_addr all_nodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
	struct in6_addr all_routers = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;

	all_nodes.s6_addr16[1] = htons(in6m->in6m_ifidx);
	/* XXX: necessary when mrouting */
	all_routers.s6_addr16[1] = htons(in6m->in6m_ifidx);

	if (in6m->in6m_state == MLD_IREPORTEDLAST &&
	    (!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_nodes)) &&
	    __IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) >
	    __IPV6_ADDR_SCOPE_INTFACELOCAL)
		mld6_sendpkt(in6m, MLD_LISTENER_DONE, &all_routers);
}

/*
 * Process a received MLD message located at offset off in mbuf m.
 *
 * Packets whose source is not link-local, or whose receive interface can
 * no longer be looked up, are dropped.  Queries arm (or shorten) the
 * report timers of matching membership records on the receiving
 * interface; Reports from other nodes stop our timer for that group.
 * Every path after the header fetch that returns from this function
 * frees m itself.
 */
void
mld6_input(struct mbuf *m, int off)
{
	struct ip6_hdr *ip6;
	struct mld_hdr *mldh;
	struct ifnet *ifp;
	struct in6_multi *in6m;
	struct ifmaddr *ifma;
	int timer;		/* timer value in the MLD query header */
	/* XXX: These are necessary for KAME's link-local hack */
	struct in6_addr all_nodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;

	/*
	 * NOTE(review): no m_freem() on this failure path; presumably
	 * IP6_EXTHDR_GET releases the mbuf itself when it yields NULL.
	 */
	IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh));
	if (mldh == NULL) {
		icmp6stat_inc(icp6s_tooshort);
		return;
	}

	/* source address validation */
	ip6 = mtod(m, struct ip6_hdr *);/* in case mpullup */
	if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) {
#if 0
		char src[INET6_ADDRSTRLEN], grp[INET6_ADDRSTRLEN];

		log(LOG_ERR,
		    "mld_input: src %s is not link-local (grp=%s)\n",
		    inet_ntop(AF_INET6, &ip6->ip6_src, src, sizeof(src)),
		    inet_ntop(AF_INET6, &mldh->mld_addr, grp, sizeof(grp)));
#endif
		/*
		 * spec (RFC2710) does not explicitly
		 * specify to discard the packet from a non link-local
		 * source address. But we believe it's expected to do so.
		 */
		m_freem(m);
		return;
	}

	/* Reference on the receive interface; dropped by if_put() at the end. */
	ifp = if_get(m->m_pkthdr.ph_ifidx);
	if (ifp == NULL) {
		m_freem(m);
		return;
	}

	/*
	 * In the MLD6 specification, there are 3 states and a flag.
	 *
	 * In Non-Listener state, we simply don't have a membership record.
	 * In Delaying Listener state, our timer is running (in6m->in6m_timer)
	 * In Idle Listener state, our timer is not running (in6m->in6m_timer==0)
	 *
	 * The flag is in6m->in6m_state, it is set to MLD_OTHERLISTENER if
	 * we have heard a report from another member, or MLD_IREPORTEDLAST
	 * if we sent the last report.
	 */
	switch(mldh->mld_type) {
	case MLD_LISTENER_QUERY:
		if (ifp->if_flags & IFF_LOOPBACK)
			break;

		if (!IN6_IS_ADDR_UNSPECIFIED(&mldh->mld_addr) &&
		    !IN6_IS_ADDR_MULTICAST(&mldh->mld_addr))
			break;	/* print error or log stat? */
		/* Temporarily embed the interface index; undone before break. */
		if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
			mldh->mld_addr.s6_addr16[1] =
			    htons(ifp->if_index); /* XXX */

		/*
		 * - Start the timers in all of our membership records
		 *   that the query applies to for the interface on
		 *   which the query arrived excl. those that belong
		 *   to the "all-nodes" group (ff02::1).
		 * - Restart any timer that is already running but has
		 *   A value longer than the requested timeout.
		 * - Use the value specified in the query message as
		 *   the maximum timeout.
		 */

		/*
		 * XXX: System timer resolution is too low to handle Max
		 * Response Delay, so set 1 to the internal timer even if
		 * the calculated value equals to zero when Max Response
		 * Delay is positive.
		 */
		timer = ntohs(mldh->mld_maxdelay)*PR_FASTHZ/MLD_TIMER_SCALE;
		if (timer == 0 && mldh->mld_maxdelay)
			timer = 1;
		all_nodes.s6_addr16[1] = htons(ifp->if_index);

		TAILQ_FOREACH(ifma, &ifp->if_maddrlist, ifma_list) {
			if (ifma->ifma_addr->sa_family != AF_INET6)
				continue;
			in6m = ifmatoin6m(ifma);
			/* Never report all-nodes or sub-link-local scopes. */
			if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_nodes) ||
			    __IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
			    __IPV6_ADDR_SCOPE_LINKLOCAL)
				continue;

			/* General query (unspecified) or query for this group. */
			if (IN6_IS_ADDR_UNSPECIFIED(&mldh->mld_addr) ||
			    IN6_ARE_ADDR_EQUAL(&mldh->mld_addr,
			    &in6m->in6m_addr))
			{
				if (timer == 0) {
					/* send a report immediately */
					mld6_sendpkt(in6m, MLD_LISTENER_REPORT,
					    NULL);
					in6m->in6m_timer = 0; /* reset timer */
					in6m->in6m_state = MLD_IREPORTEDLAST;
				} else if (in6m->in6m_timer == 0 || /* idle */
				    in6m->in6m_timer > timer) {
					in6m->in6m_timer =
					    MLD_RANDOM_DELAY(timer);
					mld6_timers_are_running = 1;
				}
			}
		}

		if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
			mldh->mld_addr.s6_addr16[1] = 0; /* XXX */
		break;
	case MLD_LISTENER_REPORT:
		/*
		 * For fast leave to work, we have to know that we are the
		 * last person to send a report for this group.  Reports
		 * can potentially get looped back if we are a multicast
		 * router, so discard reports sourced by me.
		 * Note that it is impossible to check IFF_LOOPBACK flag of
		 * ifp for this purpose, since ip6_mloopback pass the physical
		 * interface to if_input_local().
		 */
		if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */
			break;

		if (!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr))
			break;

		/* Temporarily embed the interface index; undone before break. */
		if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
			mldh->mld_addr.s6_addr16[1] =
			    htons(ifp->if_index); /* XXX */
		/*
		 * If we belong to the group being reported, stop
		 * our timer for that group.
		 */
		IN6_LOOKUP_MULTI(mldh->mld_addr, ifp, in6m);
		if (in6m) {
			in6m->in6m_timer = 0; /* transition to idle state */
			in6m->in6m_state = MLD_OTHERLISTENER; /* clear flag */
		}

		if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
			mldh->mld_addr.s6_addr16[1] = 0; /* XXX */
		break;
	default:		/* this is impossible */
#if 0
		/*
		 * this case should be impossible because of filtering in
		 * icmp6_input().  But we explicitly disabled this part
		 * just in case.
		 */
		log(LOG_ERR, "mld_input: illegal type(%d)", mldh->mld_type);
#endif
		break;
	}
	if_put(ifp);

	m_freem(m);
}

/*
 * Fast timeout entry point: if any MLD timer may be running, take the
 * net lock, clear the shortcut flag and walk every interface so that
 * mld6_checktimer() can age the timers (and set the flag again if any
 * timer is still pending).
 */
void
mld6_fasttimeo(void)
{
	struct ifnet *ifp;

	/*
	 * Quick check to see if any work needs to be done, in order
	 * to minimize the overhead of fasttimo processing.
	 * Variable mld6_timers_are_running is read atomically, but without
	 * lock intentionally.  In case it is not set due to MP races, we may
	 * miss to check the timers.  Then run the loop at next fast timeout.
	 */
	if (!mld6_timers_are_running)
		return;

	NET_LOCK();

	mld6_timers_are_running = 0;
	TAILQ_FOREACH(ifp, &ifnet, if_list)
		mld6_checktimer(ifp);

	NET_UNLOCK();
}

/*
 * Age the report timers of all IPv6 multicast records on ifp by one
 * tick.  A timer that reaches zero triggers a Report and marks the
 * record MLD_IREPORTEDLAST; a timer that is still non-zero re-sets
 * mld6_timers_are_running.  Must be called with the net lock held.
 */
void
mld6_checktimer(struct ifnet *ifp)
{
	struct in6_multi *in6m;
	struct ifmaddr *ifma;

	NET_ASSERT_LOCKED();

	TAILQ_FOREACH(ifma, &ifp->if_maddrlist, ifma_list) {
		if (ifma->ifma_addr->sa_family != AF_INET6)
			continue;
		in6m = ifmatoin6m(ifma);
		if (in6m->in6m_timer == 0) {
			/* do nothing */
		} else if (--in6m->in6m_timer == 0) {
			mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
			in6m->in6m_state = MLD_IREPORTEDLAST;
		} else {
			mld6_timers_are_running = 1;
		}
	}
}

/*
 * Build and transmit one MLD message of the given type for in6m.
 *
 * dst overrides the IPv6 destination; when NULL the group address
 * itself is used.  The function returns silently, sending nothing, if
 * the interface is gone, has no usable link-local address, or an mbuf
 * cannot be allocated.
 */
static void
mld6_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst)
{
	struct mbuf *mh, *md;
	struct mld_hdr *mldh;
	struct ip6_hdr *ip6;
	struct ip6_moptions im6o;
	struct in6_ifaddr *ia6;
	struct ifnet *ifp;
	int ignflags;

	ifp = if_get(in6m->in6m_ifidx);
	if (ifp == NULL)
		return;

	/*
	 * At first, find a link local address on the outgoing interface
	 * to use as the source address of the MLD packet.
	 * We do not reject tentative addresses for MLD report to deal with
	 * the case where we first join a link-local address.
	 */
	ignflags = IN6_IFF_DUPLICATED|IN6_IFF_ANYCAST;
	if ((ia6 = in6ifa_ifpforlinklocal(ifp, ignflags)) == NULL) {
		if_put(ifp);
		return;
	}
	/* A tentative address is not used as source; send from :: instead. */
	if ((ia6->ia6_flags & IN6_IFF_TENTATIVE))
		ia6 = NULL;

	/*
	 * Allocate mbufs to store ip6 header and MLD header.
	 * We allocate 2 mbufs and make chain in advance because
	 * it is more convenient when inserting the hop-by-hop option later.
	 */
	MGETHDR(mh, M_DONTWAIT, MT_HEADER);
	if (mh == NULL) {
		if_put(ifp);
		return;
	}
	MGET(md, M_DONTWAIT, MT_DATA);
	if (md == NULL) {
		m_free(mh);
		if_put(ifp);
		return;
	}
	mh->m_next = md;

	mh->m_pkthdr.ph_ifidx = 0;
	mh->m_pkthdr.ph_rtableid = ifp->if_rdomain;
	mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
	mh->m_len = sizeof(struct ip6_hdr);
	m_align(mh, sizeof(struct ip6_hdr));

	/* fill in the ip6 header */
	ip6 = mtod(mh, struct ip6_hdr *);
	ip6->ip6_flow = 0;
	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
	ip6->ip6_vfc |= IPV6_VERSION;
	/* ip6_plen will be set later */
	ip6->ip6_nxt = IPPROTO_ICMPV6;
	/* ip6_hlim will be set by im6o.im6o_hlim */
	ip6->ip6_src = ia6 ? ia6->ia_addr.sin6_addr : in6addr_any;
	ip6->ip6_dst = dst ? *dst : in6m->in6m_addr;

	/* fill in the MLD header */
	md->m_len = sizeof(struct mld_hdr);
	mldh = mtod(md, struct mld_hdr *);
	mldh->mld_type = type;
	mldh->mld_code = 0;
	mldh->mld_cksum = 0;
	/* XXX: we assume the function will not be called for query messages */
	mldh->mld_maxdelay = 0;
	mldh->mld_reserved = 0;
	mldh->mld_addr = in6m->in6m_addr;
	/* Strip the embedded interface index before the address goes on the wire. */
	if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr))
		mldh->mld_addr.s6_addr16[1] = 0; /* XXX */
	mh->m_pkthdr.csum_flags |= M_ICMP_CSUM_OUT;

	/* construct multicast option */
	bzero(&im6o, sizeof(im6o));
	im6o.im6o_ifidx = ifp->if_index;
	im6o.im6o_hlim = 1;

	/*
	 * Request loopback of the report if we are acting as a multicast
	 * router, so that the process-level routing daemon can hear it.
	 */
#ifdef MROUTING
	im6o.im6o_loop = (ip6_mrouter[ifp->if_rdomain] != NULL);
#endif
	/* ifp is not used past this point; only its index was copied above. */
	if_put(ifp);

	icmp6stat_inc(icp6s_outhist + type);
	ip6_output(mh, &ip6_opts, NULL, ia6 ? 0 : IPV6_UNSPECSRC, &im6o, NULL);
}
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 /* $OpenBSD: video.c,v 1.57 2022/07/02 08:50:41 visa Exp $ */ /* * Copyright (c) 2008 Robert Nagy <robert@openbsd.org> * Copyright (c) 2008 Marcus Glocker <mglocker@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/device.h> #include <sys/vnode.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/conf.h> #include <sys/proc.h> #include <sys/videoio.h> #include <dev/video_if.h> #include <uvm/uvm_extern.h> #ifdef VIDEO_DEBUG int video_debug = 1; #define DPRINTF(l, x...) do { if ((l) <= video_debug) printf(x); } while (0) #else #define DPRINTF(l, x...) 
#endif struct video_softc { struct device dev; void *hw_hdl; /* hardware driver handle */ struct device *sc_dev; /* hardware device struct */ const struct video_hw_if *hw_if; /* hardware interface */ char sc_dying; /* device detached */ struct process *sc_owner; /* owner process */ uint8_t sc_open; /* device opened */ int sc_fsize; uint8_t *sc_fbuffer; caddr_t sc_fbuffer_mmap; size_t sc_fbufferlen; int sc_vidmode; /* access mode */ #define VIDMODE_NONE 0 #define VIDMODE_MMAP 1 #define VIDMODE_READ 2 int sc_frames_ready; struct selinfo sc_rsel; /* read selector */ }; int videoprobe(struct device *, void *, void *); void videoattach(struct device *, struct device *, void *); int videodetach(struct device *, int); int videoactivate(struct device *, int); int videoprint(void *, const char *); void video_intr(void *); int video_stop(struct video_softc *); int video_claim(struct video_softc *, struct process *); const struct cfattach video_ca = { sizeof(struct video_softc), videoprobe, videoattach, videodetach, videoactivate }; struct cfdriver video_cd = { NULL, "video", DV_DULL }; /* * Global flag to control if video recording is enabled by kern.video.record. 
*/ int video_record_enable = 0; int videoprobe(struct device *parent, void *match, void *aux) { return (1); } void videoattach(struct device *parent, struct device *self, void *aux) { struct video_softc *sc = (void *)self; struct video_attach_args *sa = aux; printf("\n"); sc->hw_if = sa->hwif; sc->hw_hdl = sa->hdl; sc->sc_dev = parent; sc->sc_fbufferlen = 0; sc->sc_owner = NULL; if (sc->hw_if->get_bufsize) sc->sc_fbufferlen = (sc->hw_if->get_bufsize)(sc->hw_hdl); if (sc->sc_fbufferlen == 0) { printf("video: could not request frame buffer size\n"); return; } sc->sc_fbuffer = malloc(sc->sc_fbufferlen, M_DEVBUF, M_NOWAIT); if (sc->sc_fbuffer == NULL) { printf("video: could not allocate frame buffer\n"); return; } } int videoopen(dev_t dev, int flags, int fmt, struct proc *p) { int unit = VIDEOUNIT(dev); struct video_softc *sc; int error = 0; KERNEL_ASSERT_LOCKED(); if (unit >= video_cd.cd_ndevs || (sc = video_cd.cd_devs[unit]) == NULL || sc->hw_if == NULL) return (ENXIO); if (sc->sc_open) { DPRINTF(1, "%s: device already open\n", __func__); return (0); } sc->sc_vidmode = VIDMODE_NONE; sc->sc_frames_ready = 0; if (sc->hw_if->open != NULL) { error = sc->hw_if->open(sc->hw_hdl, flags, &sc->sc_fsize, sc->sc_fbuffer, video_intr, sc); } if (error == 0) { sc->sc_open = 1; DPRINTF(1, "%s: set device to open\n", __func__); } return (error); } int videoclose(dev_t dev, int flags, int fmt, struct proc *p) { struct video_softc *sc; int error = 0; KERNEL_ASSERT_LOCKED(); DPRINTF(1, "%s: last close\n", __func__); sc = video_cd.cd_devs[VIDEOUNIT(dev)]; error = video_stop(sc); sc->sc_open = 0; return (error); } int videoread(dev_t dev, struct uio *uio, int ioflag) { int unit = VIDEOUNIT(dev); struct video_softc *sc; int error; size_t size; KERNEL_ASSERT_LOCKED(); if (unit >= video_cd.cd_ndevs || (sc = video_cd.cd_devs[unit]) == NULL) return (ENXIO); if (sc->sc_dying) return (EIO); if (sc->sc_vidmode == VIDMODE_MMAP) return (EBUSY); if ((error = video_claim(sc, curproc->p_p))) return 
(error); /* start the stream if not already started */ if (sc->sc_vidmode == VIDMODE_NONE && sc->hw_if->start_read) { error = sc->hw_if->start_read(sc->hw_hdl); if (error) return (error); sc->sc_vidmode = VIDMODE_READ; } DPRINTF(1, "resid=%zu\n", uio->uio_resid); if (sc->sc_frames_ready < 1) { /* block userland read until a frame is ready */ error = tsleep_nsec(sc, PWAIT | PCATCH, "vid_rd", INFSLP); if (sc->sc_dying) error = EIO; if (error) return (error); } /* move no more than 1 frame to userland, as per specification */ size = ulmin(uio->uio_resid, sc->sc_fsize); if (!video_record_enable) bzero(sc->sc_fbuffer, size); error = uiomove(sc->sc_fbuffer, size, uio); sc->sc_frames_ready--; if (error) return (error); DPRINTF(1, "uiomove successfully done (%zu bytes)\n", size); return (0); } int videoioctl(dev_t dev, u_long cmd, caddr_t data, int flags, struct proc *p) { int unit = VIDEOUNIT(dev); struct video_softc *sc; struct v4l2_buffer *vb = (struct v4l2_buffer *)data; int error; KERNEL_ASSERT_LOCKED(); if (unit >= video_cd.cd_ndevs || (sc = video_cd.cd_devs[unit]) == NULL || sc->hw_if == NULL) return (ENXIO); DPRINTF(3, "video_ioctl(%zu, '%c', %zu)\n", IOCPARM_LEN(cmd), (int) IOCGROUP(cmd), cmd & 0xff); error = EOPNOTSUPP; switch (cmd) { case VIDIOC_G_CTRL: if (sc->hw_if->g_ctrl) error = (sc->hw_if->g_ctrl)(sc->hw_hdl, (struct v4l2_control *)data); break; case VIDIOC_S_CTRL: if (sc->hw_if->s_ctrl) error = (sc->hw_if->s_ctrl)(sc->hw_hdl, (struct v4l2_control *)data); break; default: error = (ENOTTY); } if (error != ENOTTY) return (error); if ((error = video_claim(sc, p->p_p))) return (error); /* * The following IOCTLs can only be called by the device owner. * For further shared IOCTLs please move it up. 
*/ error = EOPNOTSUPP; switch (cmd) { case VIDIOC_QUERYCAP: if (sc->hw_if->querycap) error = (sc->hw_if->querycap)(sc->hw_hdl, (struct v4l2_capability *)data); break; case VIDIOC_ENUM_FMT: if (sc->hw_if->enum_fmt) error = (sc->hw_if->enum_fmt)(sc->hw_hdl, (struct v4l2_fmtdesc *)data); break; case VIDIOC_ENUM_FRAMESIZES: if (sc->hw_if->enum_fsizes) error = (sc->hw_if->enum_fsizes)(sc->hw_hdl, (struct v4l2_frmsizeenum *)data); break; case VIDIOC_ENUM_FRAMEINTERVALS: if (sc->hw_if->enum_fivals) error = (sc->hw_if->enum_fivals)(sc->hw_hdl, (struct v4l2_frmivalenum *)data); break; case VIDIOC_S_FMT: if (!(flags & FWRITE)) return (EACCES); if (sc->hw_if->s_fmt) error = (sc->hw_if->s_fmt)(sc->hw_hdl, (struct v4l2_format *)data); break; case VIDIOC_G_FMT: if (sc->hw_if->g_fmt) error = (sc->hw_if->g_fmt)(sc->hw_hdl, (struct v4l2_format *)data); break; case VIDIOC_S_PARM: if (sc->hw_if->s_parm) error = (sc->hw_if->s_parm)(sc->hw_hdl, (struct v4l2_streamparm *)data); break; case VIDIOC_G_PARM: if (sc->hw_if->g_parm) error = (sc->hw_if->g_parm)(sc->hw_hdl, (struct v4l2_streamparm *)data); break; case VIDIOC_ENUMINPUT: if (sc->hw_if->enum_input) error = (sc->hw_if->enum_input)(sc->hw_hdl, (struct v4l2_input *)data); break; case VIDIOC_S_INPUT: if (sc->hw_if->s_input) error = (sc->hw_if->s_input)(sc->hw_hdl, (int)*data); break; case VIDIOC_G_INPUT: if (sc->hw_if->g_input) error = (sc->hw_if->g_input)(sc->hw_hdl, (int *)data); break; case VIDIOC_REQBUFS: if (sc->hw_if->reqbufs) error = (sc->hw_if->reqbufs)(sc->hw_hdl, (struct v4l2_requestbuffers *)data); break; case VIDIOC_QUERYBUF: if (sc->hw_if->querybuf) error = (sc->hw_if->querybuf)(sc->hw_hdl, (struct v4l2_buffer *)data); break; case VIDIOC_QBUF: if (sc->hw_if->qbuf) error = (sc->hw_if->qbuf)(sc->hw_hdl, (struct v4l2_buffer *)data); break; case VIDIOC_DQBUF: if (!sc->hw_if->dqbuf) break; /* should have called mmap() before now */ if (sc->sc_vidmode != VIDMODE_MMAP) { error = EINVAL; break; } error = 
(sc->hw_if->dqbuf)(sc->hw_hdl, (struct v4l2_buffer *)data); if (!video_record_enable) bzero(sc->sc_fbuffer_mmap + vb->m.offset, vb->length); sc->sc_frames_ready--; break; case VIDIOC_STREAMON: if (sc->hw_if->streamon) error = (sc->hw_if->streamon)(sc->hw_hdl, (int)*data); break; case VIDIOC_STREAMOFF: if (sc->hw_if->streamoff) error = (sc->hw_if->streamoff)(sc->hw_hdl, (int)*data); if (!error) { /* Release device ownership and streaming buffers. */ error = video_stop(sc); } break; case VIDIOC_TRY_FMT: if (sc->hw_if->try_fmt) error = (sc->hw_if->try_fmt)(sc->hw_hdl, (struct v4l2_format *)data); break; case VIDIOC_QUERYCTRL: if (sc->hw_if->queryctrl) error = (sc->hw_if->queryctrl)(sc->hw_hdl, (struct v4l2_queryctrl *)data); break; default: error = (ENOTTY); } return (error); } paddr_t videommap(dev_t dev, off_t off, int prot) { int unit = VIDEOUNIT(dev); struct video_softc *sc; caddr_t p; paddr_t pa; KERNEL_ASSERT_LOCKED(); DPRINTF(2, "%s: off=%lld, prot=%d\n", __func__, off, prot); if (unit >= video_cd.cd_ndevs || (sc = video_cd.cd_devs[unit]) == NULL) return (-1); if (sc->sc_dying) return (-1); if (sc->hw_if->mappage == NULL) return (-1); p = sc->hw_if->mappage(sc->hw_hdl, off, prot); if (p == NULL) return (-1); if (pmap_extract(pmap_kernel(), (vaddr_t)p, &pa) == FALSE) panic("videommap: invalid page"); sc->sc_vidmode = VIDMODE_MMAP; /* store frame buffer base address for later blanking */ if (off == 0) sc->sc_fbuffer_mmap = p; return (pa); } void filt_videodetach(struct knote *kn) { struct video_softc *sc = kn->kn_hook; int s; s = splhigh(); klist_remove_locked(&sc->sc_rsel.si_note, kn); splx(s); } int filt_videoread(struct knote *kn, long hint) { struct video_softc *sc = kn->kn_hook; if (sc->sc_frames_ready > 0) return (1); return (0); } const struct filterops video_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_videodetach, .f_event = filt_videoread, }; int videokqfilter(dev_t dev, struct knote *kn) { int unit = VIDEOUNIT(dev); struct 
video_softc *sc; int s, error; KERNEL_ASSERT_LOCKED(); if (unit >= video_cd.cd_ndevs || (sc = video_cd.cd_devs[unit]) == NULL) return (ENXIO); if (sc->sc_dying) return (ENXIO); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &video_filtops; kn->kn_hook = sc; break; default: return (EINVAL); } if ((error = video_claim(sc, curproc->p_p))) return (error); /* * Start the stream in read() mode if not already started. If * the user wanted mmap() mode, he should have called mmap() * before now. */ if (sc->sc_vidmode == VIDMODE_NONE && sc->hw_if->start_read) { if (sc->hw_if->start_read(sc->hw_hdl)) return (ENXIO); sc->sc_vidmode = VIDMODE_READ; } s = splhigh(); klist_insert_locked(&sc->sc_rsel.si_note, kn); splx(s); return (0); } int video_submatch(struct device *parent, void *match, void *aux) { struct cfdata *cf = match; return (cf->cf_driver == &video_cd); } /* * Called from hardware driver. This is where the MI video driver gets * probed/attached to the hardware driver */ struct device * video_attach_mi(const struct video_hw_if *rhwp, void *hdlp, struct device *dev) { struct video_attach_args arg; arg.hwif = rhwp; arg.hdl = hdlp; return (config_found_sm(dev, &arg, videoprint, video_submatch)); } void video_intr(void *addr) { struct video_softc *sc = (struct video_softc *)addr; DPRINTF(3, "video_intr sc=%p\n", sc); if (sc->sc_vidmode != VIDMODE_NONE) sc->sc_frames_ready++; else printf("%s: interrupt but no streams!\n", __func__); if (sc->sc_vidmode == VIDMODE_READ) wakeup(sc); selwakeup(&sc->sc_rsel); } int video_stop(struct video_softc *sc) { int error = 0; DPRINTF(1, "%s: stream close\n", __func__); if (sc->hw_if->close != NULL) error = sc->hw_if->close(sc->hw_hdl); sc->sc_vidmode = VIDMODE_NONE; sc->sc_frames_ready = 0; sc->sc_owner = NULL; return (error); } int video_claim(struct video_softc *sc, struct process *pr) { if (sc->sc_owner != NULL && sc->sc_owner != pr) { DPRINTF(1, "%s: already owned=%p\n", __func__, sc->sc_owner); return (EBUSY); } if 
(sc->sc_owner == NULL) { sc->sc_owner = pr; DPRINTF(1, "%s: new owner=%p\n", __func__, sc->sc_owner); } return (0); } int videoprint(void *aux, const char *pnp) { if (pnp != NULL) printf("video at %s", pnp); return (UNCONF); } int videodetach(struct device *self, int flags) { struct video_softc *sc = (struct video_softc *)self; int s, maj, mn; /* locate the major number */ for (maj = 0; maj < nchrdev; maj++) if (cdevsw[maj].d_open == videoopen) break; /* Nuke the vnodes for any open instances (calls close). */ mn = self->dv_unit; vdevgone(maj, mn, mn, VCHR); s = splhigh(); klist_invalidate(&sc->sc_rsel.si_note); splx(s); free(sc->sc_fbuffer, M_DEVBUF, sc->sc_fbufferlen); return (0); } int videoactivate(struct device *self, int act) { struct video_softc *sc = (struct video_softc *)self; switch (act) { case DVACT_DEACTIVATE: sc->sc_dying = 1; break; } return (0); }
18 18 18 18 18 17 1 256 352 78 41 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 /* $OpenBSD: kern_task.c,v 1.33 2022/08/15 11:38:35 mvs Exp $ */ /* * Copyright (c) 2013 David Gwynne <dlg@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * 
purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mutex.h> #include <sys/kthread.h> #include <sys/task.h> #include <sys/proc.h> #include <sys/witness.h> #include "kcov.h" #if NKCOV > 0 #include <sys/kcov.h> #endif #ifdef WITNESS static struct lock_type taskq_lock_type = { .lt_name = "taskq" }; #define TASKQ_LOCK_FLAGS LO_WITNESS | LO_INITIALIZED | LO_SLEEPABLE | \ (LO_CLASS_RWLOCK << LO_CLASSSHIFT) #endif /* WITNESS */ struct taskq_thread { SLIST_ENTRY(taskq_thread) tt_entry; struct proc *tt_thread; }; SLIST_HEAD(taskq_threads, taskq_thread); struct taskq { enum { TQ_S_CREATED, TQ_S_RUNNING, TQ_S_DESTROYED } tq_state; unsigned int tq_running; unsigned int tq_nthreads; unsigned int tq_flags; const char *tq_name; struct mutex tq_mtx; struct task_list tq_worklist; struct taskq_threads tq_threads; unsigned int tq_barriers; unsigned int tq_bgen; unsigned int tq_bthreads; #ifdef WITNESS struct lock_object tq_lock_object; #endif }; static const char taskq_sys_name[] = "systq"; struct taskq taskq_sys = { .tq_state = TQ_S_CREATED, .tq_running = 0, .tq_nthreads = 1, .tq_flags = 0, .tq_name = taskq_sys_name, .tq_mtx = MUTEX_INITIALIZER_FLAGS(IPL_HIGH, taskq_sys_name, 0), .tq_worklist = TAILQ_HEAD_INITIALIZER(taskq_sys.tq_worklist), .tq_threads = SLIST_HEAD_INITIALIZER(taskq_sys.tq_threads), .tq_barriers = 0, 
.tq_bgen = 0, .tq_bthreads = 0, #ifdef WITNESS .tq_lock_object = { .lo_name = taskq_sys_name, .lo_flags = TASKQ_LOCK_FLAGS, }, #endif }; static const char taskq_sys_mp_name[] = "systqmp"; struct taskq taskq_sys_mp = { .tq_state = TQ_S_CREATED, .tq_running = 0, .tq_nthreads = 1, .tq_flags = TASKQ_MPSAFE, .tq_name = taskq_sys_mp_name, .tq_mtx = MUTEX_INITIALIZER_FLAGS(IPL_HIGH, taskq_sys_mp_name, 0), .tq_worklist = TAILQ_HEAD_INITIALIZER(taskq_sys_mp.tq_worklist), .tq_threads = SLIST_HEAD_INITIALIZER(taskq_sys_mp.tq_threads), .tq_barriers = 0, .tq_bgen = 0, .tq_bthreads = 0, #ifdef WITNESS .tq_lock_object = { .lo_name = taskq_sys_mp_name, .lo_flags = TASKQ_LOCK_FLAGS, }, #endif }; struct taskq *const systq = &taskq_sys; struct taskq *const systqmp = &taskq_sys_mp; void taskq_init(void); /* called in init_main.c */ void taskq_create_thread(void *); void taskq_barrier_task(void *); int taskq_sleep(const volatile void *, struct mutex *, int, const char *, int); int taskq_next_work(struct taskq *, struct task *); void taskq_thread(void *); void taskq_init(void) { WITNESS_INIT(&systq->tq_lock_object, &taskq_lock_type); kthread_create_deferred(taskq_create_thread, systq); WITNESS_INIT(&systqmp->tq_lock_object, &taskq_lock_type); kthread_create_deferred(taskq_create_thread, systqmp); } struct taskq * taskq_create(const char *name, unsigned int nthreads, int ipl, unsigned int flags) { struct taskq *tq; tq = malloc(sizeof(*tq), M_DEVBUF, M_WAITOK); if (tq == NULL) return (NULL); tq->tq_state = TQ_S_CREATED; tq->tq_running = 0; tq->tq_nthreads = nthreads; tq->tq_name = name; tq->tq_flags = flags; mtx_init_flags(&tq->tq_mtx, ipl, name, 0); TAILQ_INIT(&tq->tq_worklist); SLIST_INIT(&tq->tq_threads); tq->tq_barriers = 0; tq->tq_bgen = 0; tq->tq_bthreads = 0; #ifdef WITNESS memset(&tq->tq_lock_object, 0, sizeof(tq->tq_lock_object)); tq->tq_lock_object.lo_name = name; tq->tq_lock_object.lo_flags = TASKQ_LOCK_FLAGS; witness_init(&tq->tq_lock_object, &taskq_lock_type); #endif /* try 
to create a thread to guarantee that tasks will be serviced */ kthread_create_deferred(taskq_create_thread, tq); return (tq); } void taskq_destroy(struct taskq *tq) { mtx_enter(&tq->tq_mtx); switch (tq->tq_state) { case TQ_S_CREATED: /* tq is still referenced by taskq_create_thread */ tq->tq_state = TQ_S_DESTROYED; mtx_leave(&tq->tq_mtx); return; case TQ_S_RUNNING: tq->tq_state = TQ_S_DESTROYED; break; default: panic("unexpected %s tq state %u", tq->tq_name, tq->tq_state); } while (tq->tq_running > 0) { wakeup(tq); msleep_nsec(&tq->tq_running, &tq->tq_mtx, PWAIT, "tqdestroy", INFSLP); } mtx_leave(&tq->tq_mtx); free(tq, M_DEVBUF, sizeof(*tq)); } void taskq_create_thread(void *arg) { struct taskq *tq = arg; int rv; mtx_enter(&tq->tq_mtx); switch (tq->tq_state) { case TQ_S_DESTROYED: mtx_leave(&tq->tq_mtx); free(tq, M_DEVBUF, sizeof(*tq)); return; case TQ_S_CREATED: tq->tq_state = TQ_S_RUNNING; break; default: panic("unexpected %s tq state %d", tq->tq_name, tq->tq_state); } do { tq->tq_running++; mtx_leave(&tq->tq_mtx); rv = kthread_create(taskq_thread, tq, NULL, tq->tq_name); mtx_enter(&tq->tq_mtx); if (rv != 0) { printf("unable to create thread for \"%s\" taskq\n", tq->tq_name); tq->tq_running--; /* could have been destroyed during kthread_create */ if (tq->tq_state == TQ_S_DESTROYED && tq->tq_running == 0) wakeup_one(&tq->tq_running); break; } } while (tq->tq_running < tq->tq_nthreads); mtx_leave(&tq->tq_mtx); } void taskq_barrier_task(void *p) { struct taskq *tq = p; unsigned int gen; mtx_enter(&tq->tq_mtx); tq->tq_bthreads++; wakeup(&tq->tq_bthreads); gen = tq->tq_bgen; do { msleep_nsec(&tq->tq_bgen, &tq->tq_mtx, PWAIT, "tqbarend", INFSLP); } while (gen == tq->tq_bgen); mtx_leave(&tq->tq_mtx); } static void taskq_do_barrier(struct taskq *tq) { struct task t = TASK_INITIALIZER(taskq_barrier_task, tq); struct proc *thread = curproc; struct taskq_thread *tt; mtx_enter(&tq->tq_mtx); tq->tq_barriers++; /* is the barrier being run from a task inside the taskq? 
*/ SLIST_FOREACH(tt, &tq->tq_threads, tt_entry) { if (tt->tt_thread == thread) { tq->tq_bthreads++; wakeup(&tq->tq_bthreads); break; } } while (tq->tq_bthreads < tq->tq_nthreads) { /* shove the task into the queue for a worker to pick up */ SET(t.t_flags, TASK_ONQUEUE); TAILQ_INSERT_TAIL(&tq->tq_worklist, &t, t_entry); wakeup_one(tq); msleep_nsec(&tq->tq_bthreads, &tq->tq_mtx, PWAIT, "tqbar", INFSLP); /* * another thread running a barrier might have * done this work for us. */ if (ISSET(t.t_flags, TASK_ONQUEUE)) TAILQ_REMOVE(&tq->tq_worklist, &t, t_entry); } if (--tq->tq_barriers == 0) { /* we're the last one out */ tq->tq_bgen++; wakeup(&tq->tq_bgen); tq->tq_bthreads = 0; } else { unsigned int gen = tq->tq_bgen; do { msleep_nsec(&tq->tq_bgen, &tq->tq_mtx, PWAIT, "tqbarwait", INFSLP); } while (gen == tq->tq_bgen); } mtx_leave(&tq->tq_mtx); } void taskq_barrier(struct taskq *tq) { WITNESS_CHECKORDER(&tq->tq_lock_object, LOP_NEWORDER, NULL); taskq_do_barrier(tq); } void taskq_del_barrier(struct taskq *tq, struct task *t) { WITNESS_CHECKORDER(&tq->tq_lock_object, LOP_NEWORDER, NULL); if (task_del(tq, t)) return; taskq_do_barrier(tq); } void task_set(struct task *t, void (*fn)(void *), void *arg) { t->t_func = fn; t->t_arg = arg; t->t_flags = 0; } int task_add(struct taskq *tq, struct task *w) { int rv = 0; if (ISSET(w->t_flags, TASK_ONQUEUE)) return (0); mtx_enter(&tq->tq_mtx); if (!ISSET(w->t_flags, TASK_ONQUEUE)) { rv = 1; SET(w->t_flags, TASK_ONQUEUE); TAILQ_INSERT_TAIL(&tq->tq_worklist, w, t_entry); #if NKCOV > 0 w->t_process = curproc->p_p; #endif } mtx_leave(&tq->tq_mtx); if (rv) wakeup_one(tq); return (rv); } int task_del(struct taskq *tq, struct task *w) { int rv = 0; if (!ISSET(w->t_flags, TASK_ONQUEUE)) return (0); mtx_enter(&tq->tq_mtx); if (ISSET(w->t_flags, TASK_ONQUEUE)) { rv = 1; CLR(w->t_flags, TASK_ONQUEUE); TAILQ_REMOVE(&tq->tq_worklist, w, t_entry); } mtx_leave(&tq->tq_mtx); return (rv); } int taskq_next_work(struct taskq *tq, struct task *work) { 
struct task *next; mtx_enter(&tq->tq_mtx); while ((next = TAILQ_FIRST(&tq->tq_worklist)) == NULL) { if (tq->tq_state != TQ_S_RUNNING) { mtx_leave(&tq->tq_mtx); return (0); } msleep_nsec(tq, &tq->tq_mtx, PWAIT, "bored", INFSLP); } TAILQ_REMOVE(&tq->tq_worklist, next, t_entry); CLR(next->t_flags, TASK_ONQUEUE); *work = *next; /* copy to caller to avoid races */ next = TAILQ_FIRST(&tq->tq_worklist); mtx_leave(&tq->tq_mtx); if (next != NULL && tq->tq_nthreads > 1) wakeup_one(tq); return (1); } void taskq_thread(void *xtq) { struct taskq_thread self = { .tt_thread = curproc }; struct taskq *tq = xtq; struct task work; int last; if (ISSET(tq->tq_flags, TASKQ_MPSAFE)) KERNEL_UNLOCK(); mtx_enter(&tq->tq_mtx); SLIST_INSERT_HEAD(&tq->tq_threads, &self, tt_entry); mtx_leave(&tq->tq_mtx); WITNESS_CHECKORDER(&tq->tq_lock_object, LOP_NEWORDER, NULL); while (taskq_next_work(tq, &work)) { WITNESS_LOCK(&tq->tq_lock_object, 0); #if NKCOV > 0 kcov_remote_enter(KCOV_REMOTE_COMMON, work.t_process); #endif (*work.t_func)(work.t_arg); #if NKCOV > 0 kcov_remote_leave(KCOV_REMOTE_COMMON, work.t_process); #endif WITNESS_UNLOCK(&tq->tq_lock_object, 0); sched_pause(yield); } mtx_enter(&tq->tq_mtx); SLIST_REMOVE(&tq->tq_threads, &self, taskq_thread, tt_entry); last = (--tq->tq_running == 0); mtx_leave(&tq->tq_mtx); if (ISSET(tq->tq_flags, TASKQ_MPSAFE)) KERNEL_LOCK(); if (last) wakeup_one(&tq->tq_running); kthread_exit(0); }
220 207 14 14 14 236 14 4 6 1 1 13 13 13 7 7 4 3 1 3 1 1 1 13 13 3 3 2 1 15 12 20 2 35 23 8 22 6 17 11 11 6 20 6 34 3 9 158 15 96 218 53 191 191 217 214 215 1 1 6 6 5 5 138 2 107 2 23 4 27 67 100 3 94 25 102 87 3 67 38 8 29 2 31 5 98 327 327 250 324 326 281 282 90 15 27 264 32 268 24 21 14 245 20 20 9 24 259 64 1 18 23 10 73 227 153 1 24 43 7 3 213 34 221 12 238 58 242 63 46 22 12 11 200 39 97 44 152 5 3 150 152 87 1 1 148 22 61 6 89 9 32 109 52 152 168 206 60 152 2 2 12 15 15 11 5 15 15 14 1 15 194 126 295 10 9 139 215 214 3 54 257 257 118 283 240 241 241 41 73 74 11 46 14 4 20 14 23 14 279 277 279 19 69 259 236 24 158 141 16 158 158 137 26 156 22 138 283 27 27 33 77 65 123 22 86 123 121 24 185 185 29 32 51 54 54 23 24 24 59 24 59 24 7 7 32 32 19 70 6 6 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 
836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 
1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 
1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 
2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 /* $OpenBSD: kern_event.c,v 1.193 2022/08/14 01:58:27 jsg Exp $ */ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.22 2001/02/23 20:32:42 jlemon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pledge.h>
#include <sys/malloc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/time.h>
#include <sys/timeout.h>
#include <sys/vnode.h>
#include <sys/wait.h>

/*
 * KLIST_ASSERT_LOCKED(kl):
 *
 * With DIAGNOSTIC, assert that klist `kl' is locked: call the klist's own
 * klo_assertlk hook when kl_ops is set, otherwise assert the kernel lock.
 * Without DIAGNOSTIC the macro only evaluates its argument.
 */
#ifdef DIAGNOSTIC
#define KLIST_ASSERT_LOCKED(kl) do {					\
	if ((kl)->kl_ops != NULL)					\
		(kl)->kl_ops->klo_assertlk((kl)->kl_arg);		\
	else								\
		KERNEL_ASSERT_LOCKED();					\
} while (0)
#else
#define KLIST_ASSERT_LOCKED(kl)	((void)(kl))
#endif

/* kqueue allocation, reference counting, teardown and sleeping. */
struct kqueue *kqueue_alloc(struct filedesc *);
void	kqueue_terminate(struct proc *p, struct kqueue *);
void	KQREF(struct kqueue *);
void	KQRELE(struct kqueue *);

void	kqueue_purge(struct proc *, struct kqueue *);
int	kqueue_sleep(struct kqueue *, struct timespec *);

/* Methods collected in the kqueueops file operations table. */
int	kqueue_read(struct file *, struct uio *, int);
int	kqueue_write(struct file *, struct uio *, int);
int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
		    struct proc *p);
int	kqueue_kqfilter(struct file *fp, struct knote *kn);
int	kqueue_stat(struct file *fp, struct stat *st, struct proc *p);
int	kqueue_close(struct file *fp, struct proc *p);
void	kqueue_wakeup(struct kqueue *kq);

/* Queue consistency checker; only built with KQUEUE_DEBUG. */
#ifdef KQUEUE_DEBUG
void
kqueue_do_check(struct kqueue *kq, const char *func, int line);
/* Check the queue of `kq', passing the call site for the panic message. */
#define kqueue_check(kq)	kqueue_do_check((kq), __func__, __LINE__)
#else
/* Without KQUEUE_DEBUG the check expands to an empty statement. */
#define kqueue_check(kq)	do {} while (0)
#endif

/* Wrappers around the filterops callbacks of a knote. */
static int	filter_attach(struct knote *kn);
static void	filter_detach(struct knote *kn);
static int	filter_event(struct knote *kn, long hint);
static int	filter_modify(struct kevent *kev, struct knote *kn);
static int	filter_process(struct knote *kn, struct kevent *kev);

/* File-local helpers. */
static void	kqueue_expand_hash(struct kqueue *kq);
static void	kqueue_expand_list(struct kqueue *kq, int fd);
static void	kqueue_task(void *);
static int	klist_lock(struct klist *);
static void	klist_unlock(struct klist *, int);

/*
 * File operations of a kqueue descriptor; sys_kqueue() installs this
 * table in the f_ops of the file it creates.
 */
const struct fileops kqueueops = {
	.fo_read	= kqueue_read,
	.fo_write	= kqueue_write,
	.fo_ioctl	= kqueue_ioctl,
	.fo_kqfilter	= kqueue_kqfilter,
	.fo_stat	= kqueue_stat,
	.fo_close	= kqueue_close
};

/* knote bookkeeping. */
void	knote_attach(struct knote *kn);
void	knote_detach(struct knote *kn);
void	knote_drop(struct knote *kn, struct proc *p);
void	knote_enqueue(struct knote *kn);
void	knote_dequeue(struct knote *kn);
int	knote_acquire(struct knote *kn, struct klist *, int);
void	knote_release(struct knote *kn);
void	knote_activate(struct knote *kn);
void	knote_remove(struct proc *p, struct kqueue *kq, struct knlist **plist,
	    int idx, int purge);

/* Callbacks of kqread_filtops (a kqueue watched through another kqueue). */
void	filt_kqdetach(struct knote *kn);
int	filt_kqueue(struct knote *kn, long hint);
int	filt_kqueuemodify(struct kevent *kev, struct knote *kn);
int	filt_kqueueprocess(struct knote *kn, struct kevent *kev);
int	filt_kqueue_common(struct knote *kn, struct kqueue *kq);
/* Callbacks of proc_filtops. */
int	filt_procattach(struct knote *kn);
void	filt_procdetach(struct knote *kn);
int	filt_proc(struct knote *kn, long hint);
/* Attach callback of file_filtops. */
int	filt_fileattach(struct knote *kn);
/* Callbacks of timer_filtops and the timeout handler they install. */
void	filt_timerexpire(void *knx);
int	filt_timerattach(struct knote *kn);
void	filt_timerdetach(struct knote *kn);
int	filt_timermodify(struct kevent *kev, struct knote *kn);
int	filt_timerprocess(struct knote *kn, struct kevent *kev);
/* Detach callback of seltrue_filtops. */
void
filt_seltruedetach(struct knote *kn); const struct filterops kqread_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_kqdetach, .f_event = filt_kqueue, .f_modify = filt_kqueuemodify, .f_process = filt_kqueueprocess, }; const struct filterops proc_filtops = { .f_flags = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; const struct filterops file_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = filt_fileattach, .f_detach = NULL, .f_event = NULL, }; const struct filterops timer_filtops = { .f_flags = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = NULL, .f_modify = filt_timermodify, .f_process = filt_timerprocess, }; struct pool knote_pool; struct pool kqueue_pool; struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR); int kq_ntimeouts = 0; int kq_timeoutmax = (4 * 1024); #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) /* * Table for for all system-defined filters. 
*/ const struct filterops *const sysfilt_ops[] = { &file_filtops, /* EVFILT_READ */ &file_filtops, /* EVFILT_WRITE */ NULL, /*&aio_filtops,*/ /* EVFILT_AIO */ &file_filtops, /* EVFILT_VNODE */ &proc_filtops, /* EVFILT_PROC */ &sig_filtops, /* EVFILT_SIGNAL */ &timer_filtops, /* EVFILT_TIMER */ &file_filtops, /* EVFILT_DEVICE */ &file_filtops, /* EVFILT_EXCEPT */ }; void KQREF(struct kqueue *kq) { refcnt_take(&kq->kq_refcnt); } void KQRELE(struct kqueue *kq) { struct filedesc *fdp; if (refcnt_rele(&kq->kq_refcnt) == 0) return; fdp = kq->kq_fdp; if (rw_status(&fdp->fd_lock) == RW_WRITE) { LIST_REMOVE(kq, kq_next); } else { fdplock(fdp); LIST_REMOVE(kq, kq_next); fdpunlock(fdp); } KASSERT(TAILQ_EMPTY(&kq->kq_head)); KASSERT(kq->kq_nknotes == 0); free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize * sizeof(struct knlist)); hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT); klist_free(&kq->kq_klist); pool_put(&kqueue_pool, kq); } void kqueue_init(void) { pool_init(&kqueue_pool, sizeof(struct kqueue), 0, IPL_MPFLOOR, PR_WAITOK, "kqueuepl", NULL); pool_init(&knote_pool, sizeof(struct knote), 0, IPL_MPFLOOR, PR_WAITOK, "knotepl", NULL); } void kqueue_init_percpu(void) { pool_cache_init(&knote_pool); } int filt_fileattach(struct knote *kn) { struct file *fp = kn->kn_fp; return fp->f_ops->fo_kqfilter(fp, kn); } int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &kqread_filtops; klist_insert(&kq->kq_klist, kn); return (0); } void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; klist_remove(&kq->kq_klist, kn); } int filt_kqueue_common(struct knote *kn, struct kqueue *kq) { MUTEX_ASSERT_LOCKED(&kq->kq_lock); kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; int active; mtx_enter(&kq->kq_lock); active = filt_kqueue_common(kn, kq); mtx_leave(&kq->kq_lock); return 
(active); } int filt_kqueuemodify(struct kevent *kev, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; int active; mtx_enter(&kq->kq_lock); knote_assign(kev, kn); active = filt_kqueue_common(kn, kq); mtx_leave(&kq->kq_lock); return (active); } int filt_kqueueprocess(struct knote *kn, struct kevent *kev) { struct kqueue *kq = kn->kn_fp->f_data; int active; mtx_enter(&kq->kq_lock); if (kev != NULL && (kn->kn_flags & EV_ONESHOT)) active = 1; else active = filt_kqueue_common(kn, kq); if (active) knote_submit(kn, kev); mtx_leave(&kq->kq_lock); return (active); } int filt_procattach(struct knote *kn) { struct process *pr; int s; if ((curproc->p_p->ps_flags & PS_PLEDGE) && (curproc->p_p->ps_pledge & PLEDGE_PROC) == 0) return pledge_fail(curproc, EPERM, PLEDGE_PROC); if (kn->kn_id > PID_MAX) return ESRCH; pr = prfind(kn->kn_id); if (pr == NULL) return (ESRCH); /* exiting processes can't be specified */ if (pr->ps_flags & PS_EXITING) return (ESRCH); kn->kn_ptr.p_process = pr; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } s = splhigh(); klist_insert_locked(&pr->ps_klist, kn); splx(s); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. 
*/
void
filt_procdetach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	struct process *pr = kn->kn_ptr.p_process;
	int s, status;

	/* Sample kn_status under kq_lock; filt_proc() sets KN_DETACHED there. */
	mtx_enter(&kq->kq_lock);
	status = kn->kn_status;
	mtx_leave(&kq->kq_lock);

	/* filt_proc() has already removed the knote from the klist. */
	if (status & KN_DETACHED)
		return;

	s = splhigh();
	klist_remove_locked(&pr->ps_klist, kn);
	splx(s);
}

/*
 * Event routine of proc_filtops.
 * The NOTE_PCTRLMASK bits of hint select the event, the NOTE_PDATAMASK bits
 * carry its data (the pid of the new process for NOTE_FORK).
 * Returns non-zero when the knote has an event to report.
 */
int
filt_proc(struct knote *kn, long hint)
{
	struct kqueue *kq = kn->kn_kq;
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished and remove it
	 * from the process's klist
	 */
	if (event == NOTE_EXIT) {
		struct process *pr = kn->kn_ptr.p_process;
		int s;

		/* Tell filt_procdetach() that there is nothing left to undo. */
		mtx_enter(&kq->kq_lock);
		kn->kn_status |= KN_DETACHED;
		mtx_leave(&kq->kq_lock);

		s = splhigh();
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		/* Report the exit status as the event data. */
		kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig);
		klist_remove_locked(&pr->ps_klist, kn);
		splx(s);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		memset(&kev, 0, sizeof(kev));
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		/* EV_FLAG1 marks the registration as done by the kernel. */
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_udata;		/* preserve udata */
		error = kqueue_register(kq, &kev, 0, NULL);
		/* A failed registration is reported through NOTE_TRACKERR. */
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

/*
 * Arm the timeout stored in kn_hook.  kn_sdata is the period in
 * milliseconds; it is converted to ticks and at least one tick is used.
 */
static void
filt_timer_timeout_add(struct knote *kn)
{
	struct timeval tv;
	struct timeout *to = kn->kn_hook;
	int tticks;

	/* Split the millisecond period into seconds and microseconds. */
	tv.tv_sec = kn->kn_sdata / 1000;
	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
	tticks = tvtohz(&tv);
	/* Remove extra tick from tvtohz() if timeout has fired before.
*/ if (timeout_triggered(to)) tticks--; timeout_add(to, (tticks > 0) ? tticks : 1); } void filt_timerexpire(void *knx) { struct knote *kn = knx; struct kqueue *kq = kn->kn_kq; kn->kn_data++; mtx_enter(&kq->kq_lock); knote_activate(kn); mtx_leave(&kq->kq_lock); if ((kn->kn_flags & EV_ONESHOT) == 0) filt_timer_timeout_add(kn); } /* * data contains amount of time to sleep, in milliseconds */ int filt_timerattach(struct knote *kn) { struct timeout *to; if (kq_ntimeouts > kq_timeoutmax) return (ENOMEM); kq_ntimeouts++; kn->kn_flags |= EV_CLEAR; /* automatically set */ to = malloc(sizeof(*to), M_KEVENT, M_WAITOK); timeout_set(to, filt_timerexpire, kn); kn->kn_hook = to; filt_timer_timeout_add(kn); return (0); } void filt_timerdetach(struct knote *kn) { struct timeout *to; to = (struct timeout *)kn->kn_hook; timeout_del_barrier(to); free(to, M_KEVENT, sizeof(*to)); kq_ntimeouts--; } int filt_timermodify(struct kevent *kev, struct knote *kn) { struct kqueue *kq = kn->kn_kq; struct timeout *to = kn->kn_hook; /* Reset the timer. Any pending events are discarded. */ timeout_del_barrier(to); mtx_enter(&kq->kq_lock); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); kn->kn_status &= ~KN_ACTIVE; mtx_leave(&kq->kq_lock); kn->kn_data = 0; knote_assign(kev, kn); /* Reinit timeout to invoke tick adjustment again. */ timeout_set(to, filt_timerexpire, kn); filt_timer_timeout_add(kn); return (0); } int filt_timerprocess(struct knote *kn, struct kevent *kev) { int active, s; s = splsoftclock(); active = (kn->kn_data != 0); if (active) knote_submit(kn, kev); splx(s); return (active); } /* * filt_seltrue: * * This filter "event" routine simulates seltrue(). */ int filt_seltrue(struct knote *kn, long hint) { /* * We don't know how much data can be read/written, * but we know that it *can* be. This is about as * good as select/poll does as well. 
*/ kn->kn_data = 0; return (1); } int filt_seltruemodify(struct kevent *kev, struct knote *kn) { knote_assign(kev, kn); return (kn->kn_fop->f_event(kn, 0)); } int filt_seltrueprocess(struct knote *kn, struct kevent *kev) { int active; active = kn->kn_fop->f_event(kn, 0); if (active) knote_submit(kn, kev); return (active); } /* * This provides full kqfilter entry for device switch tables, which * has same effect as filter using filt_seltrue() as filter method. */ void filt_seltruedetach(struct knote *kn) { /* Nothing to do */ } const struct filterops seltrue_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_seltruedetach, .f_event = filt_seltrue, .f_modify = filt_seltruemodify, .f_process = filt_seltrueprocess, }; int seltrue_kqfilter(dev_t dev, struct knote *kn) { switch (kn->kn_filter) { case EVFILT_READ: case EVFILT_WRITE: kn->kn_fop = &seltrue_filtops; break; default: return (EINVAL); } /* Nothing more to do */ return (0); } static int filt_dead(struct knote *kn, long hint) { if (kn->kn_filter == EVFILT_EXCEPT) { /* * Do not deliver event because there is no out-of-band data. * However, let HUP condition pass for poll(2). */ if ((kn->kn_flags & __EV_POLL) == 0) { kn->kn_flags |= EV_DISABLE; return (0); } } kn->kn_flags |= (EV_EOF | EV_ONESHOT); if (kn->kn_flags & __EV_POLL) kn->kn_flags |= __EV_HUP; kn->kn_data = 0; return (1); } static void filt_deaddetach(struct knote *kn) { /* Nothing to do */ } const struct filterops dead_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_deaddetach, .f_event = filt_dead, .f_modify = filt_seltruemodify, .f_process = filt_seltrueprocess, }; static int filt_badfd(struct knote *kn, long hint) { kn->kn_flags |= (EV_ERROR | EV_ONESHOT); kn->kn_data = EBADF; return (1); } /* For use with kqpoll. 
*/ const struct filterops badfd_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_deaddetach, .f_event = filt_badfd, .f_modify = filt_seltruemodify, .f_process = filt_seltrueprocess, }; static int filter_attach(struct knote *kn) { int error; if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { error = kn->kn_fop->f_attach(kn); } else { KERNEL_LOCK(); error = kn->kn_fop->f_attach(kn); KERNEL_UNLOCK(); } return (error); } static void filter_detach(struct knote *kn) { if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { kn->kn_fop->f_detach(kn); } else { KERNEL_LOCK(); kn->kn_fop->f_detach(kn); KERNEL_UNLOCK(); } } static int filter_event(struct knote *kn, long hint) { if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0) KERNEL_ASSERT_LOCKED(); return (kn->kn_fop->f_event(kn, hint)); } static int filter_modify(struct kevent *kev, struct knote *kn) { int active, s; if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { active = kn->kn_fop->f_modify(kev, kn); } else { KERNEL_LOCK(); if (kn->kn_fop->f_modify != NULL) { active = kn->kn_fop->f_modify(kev, kn); } else { s = splhigh(); active = knote_modify(kev, kn); splx(s); } KERNEL_UNLOCK(); } return (active); } static int filter_process(struct knote *kn, struct kevent *kev) { int active, s; if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { active = kn->kn_fop->f_process(kn, kev); } else { KERNEL_LOCK(); if (kn->kn_fop->f_process != NULL) { active = kn->kn_fop->f_process(kn, kev); } else { s = splhigh(); active = knote_process(kn, kev); splx(s); } KERNEL_UNLOCK(); } return (active); } /* * Initialize the current thread for poll/select system call. * num indicates the number of serials that the system call may utilize. * After this function, the valid range of serials is * p_kq_serial <= x < p_kq_serial + num. 
*/
void
kqpoll_init(unsigned int num)
{
	struct proc *p = curproc;
	struct filedesc *fdp;

	/*
	 * Allocate the thread's kqueue on first use, start its serial at a
	 * random value and link the kqueue on the filedesc's fd_kqlist.
	 */
	if (p->p_kq == NULL) {
		p->p_kq = kqueue_alloc(p->p_fd);
		p->p_kq_serial = arc4random();
		fdp = p->p_fd;
		fdplock(fdp);
		LIST_INSERT_HEAD(&fdp->fd_kqlist, p->p_kq, kq_next);
		fdpunlock(fdp);
	}

	/* The sum is smaller than p_kq_serial only if it has wrapped. */
	if (p->p_kq_serial + num < p->p_kq_serial) {
		/* Serial is about to wrap. Clear all attached knotes. */
		kqueue_purge(p, p->p_kq);
		p->p_kq_serial = 0;
	}
}

/*
 * Finish poll/select system call.
 * num must have the same value that was used with kqpoll_init().
 */
void
kqpoll_done(unsigned int num)
{
	struct proc *p = curproc;
	struct kqueue *kq = p->p_kq;

	KASSERT(p->p_kq != NULL);
	/* kqpoll_init() has made sure that the serial cannot wrap here. */
	KASSERT(p->p_kq_serial + num >= p->p_kq_serial);

	/* Advance past the serials that this system call was given. */
	p->p_kq_serial += num;

	/*
	 * Because of kn_pollid key, a thread can in principle allocate
	 * up to O(maxfiles^2) knotes by calling poll(2) repeatedly
	 * with suitably varying pollfd arrays.
	 * Prevent such a large allocation by clearing knotes eagerly
	 * if there are too many of them.
	 *
	 * A small multiple of kq_knlistsize should give enough margin
	 * that eager clearing is infrequent, or does not happen at all,
	 * with normal programs.
	 * A single pollfd entry can use up to three knotes.
	 * Typically there is no significant overlap of fd and events
	 * between different entries in the pollfd array.
*/
	if (kq->kq_nknotes > 4 * kq->kq_knlistsize)
		kqueue_purge(p, kq);
}

/*
 * Dispose of the current thread's poll/select kqueue, if it has one:
 * purge its knotes, terminate it and drop the reference held through p_kq.
 */
void
kqpoll_exit(void)
{
	struct proc *p = curproc;

	if (p->p_kq == NULL)
		return;

	kqueue_purge(p, p->p_kq);
	kqueue_terminate(p, p->p_kq);
	/* The reference held through p_kq must be the only one left. */
	KASSERT(p->p_kq->kq_refcnt.r_refs == 1);
	KQRELE(p->p_kq);
	p->p_kq = NULL;
}

/*
 * Allocate a zeroed kqueue from kqueue_pool and initialize its reference
 * count, filedesc back pointer, event queue, mutex, task and klist.
 * The kqueue is not linked on fd_kqlist here; callers do that.
 */
struct kqueue *
kqueue_alloc(struct filedesc *fdp)
{
	struct kqueue *kq;

	kq = pool_get(&kqueue_pool, PR_WAITOK | PR_ZERO);
	refcnt_init(&kq->kq_refcnt);
	kq->kq_fdp = fdp;
	TAILQ_INIT(&kq->kq_head);
	mtx_init(&kq->kq_lock, IPL_HIGH);
	task_set(&kq->kq_task, kqueue_task, kq);
	klist_init_mutex(&kq->kq_klist, &kqueue_klist_lock);

	return (kq);
}

/*
 * kqueue system call: create a kqueue and a descriptor that refers to it.
 * On success the descriptor is stored in *retval and 0 is returned; if
 * falloc() fails its error is returned and the kqueue is given back to
 * the pool.
 */
int
sys_kqueue(struct proc *p, void *v, register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	kq = kqueue_alloc(fdp);

	fdplock(fdp);
	error = falloc(p, &fp, &fd);
	if (error)
		goto out;
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	fp->f_data = kq;
	*retval = fd;
	LIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_next);
	/* The file owns the kqueue now; do not free it below. */
	kq = NULL;
	fdinsert(fdp, fd, 0, fp);
	FRELE(fp, p);
out:
	fdpunlock(fdp);
	if (kq != NULL)
		pool_put(&kqueue_pool, kq);
	return (error);
}

/*
 * kevent system call: apply the changes in changelist to the kqueue
 * named by fd, then collect up to nevents pending events into eventlist,
 * waiting as directed by timeout.
 */
int
sys_kevent(struct proc *p, void *v, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct filedesc* fdp = p->p_fd;
	struct sys_kevent_args /* {
		syscallarg(int)	fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(int)	nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(int)	nevents;
		syscallarg(const struct timespec *) timeout;
	} */ *uap = v;
	struct kevent *kevp;
	struct kqueue *kq;
	struct file *fp;
	struct timespec ts;
	struct timespec *tsp = NULL;
	int i, n, nerrors, error;
	int ready, total;
	struct kevent kev[KQ_NEVENTS];

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	/* The descriptor must refer to a kqueue. */
	if (fp->f_type != DTYPE_KQUEUE) {
		error = EBADF;
		goto done;
	}

	/* Fetch the optional timeout; tsp stays NULL when there is none. */
	if (SCARG(uap, timeout) != NULL) {
		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
		if (error)
			goto done;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p,
&ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts)) {
			error = EINVAL;
			goto done;
		}
		tsp = &ts;
	}

	kq = fp->f_data;
	nerrors = 0;

	/* Apply the changelist in chunks of at most nitems(kev) entries. */
	while ((n = SCARG(uap, nchanges)) > 0) {
		if (n > nitems(kev))
			n = nitems(kev);
		error = copyin(SCARG(uap, changelist), kev,
		    n * sizeof(struct kevent));
		if (error)
			goto done;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, n);
#endif
		for (i = 0; i < n; i++) {
			kevp = &kev[i];
			/* Flags reserved for the system cannot be passed in. */
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp, 0, p);
			/*
			 * A failure, or any result when EV_RECEIPT is set,
			 * is returned as an EV_ERROR entry in the event
			 * list as long as there is room for it.
			 */
			if (error || (kevp->flags & EV_RECEIPT)) {
				if (SCARG(uap, nevents) != 0) {
					kevp->flags = EV_ERROR;
					kevp->data = error;
					/*
					 * NOTE(review): the result of this
					 * copyout() is not checked.
					 */
					copyout(kevp, SCARG(uap, eventlist),
					    sizeof(*kevp));
					SCARG(uap, eventlist)++;
					SCARG(uap, nevents)--;
					nerrors++;
				} else {
					/* No room: give up with `error'. */
					goto done;
				}
			}
		}
		SCARG(uap, nchanges) -= n;
		SCARG(uap, changelist) += n;
	}
	/* If anything was reported above, return that count and do not scan. */
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/*
	 * kqueue_scan_setup() takes its own reference on kq, so the file
	 * reference can be dropped before scanning.
	 */
	kqueue_scan_setup(&scan, kq);
	FRELE(fp, p);
	/*
	 * Collect as many events as we can.  The timeout on successive
	 * loops is disabled (kqueue_scan() becomes non-blocking).
*/
	total = 0;
	error = 0;
	while ((n = SCARG(uap, nevents) - total) > 0) {
		if (n > nitems(kev))
			n = nitems(kev);
		ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
		if (ready == 0)
			break;
		error = copyout(kev, SCARG(uap, eventlist) + total,
		    sizeof(struct kevent) * ready);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrevent(p, kev, ready);
#endif
		total += ready;
		/* Stop on a copyout failure or when the scan came up short. */
		if (error || ready < n)
			break;
	}
	kqueue_scan_finish(&scan);
	*retval = total;
	return (error);

 done:
	FRELE(fp, p);
	return (error);
}

#ifdef KQUEUE_DEBUG
/*
 * Verify the event queue of kq and panic on any inconsistency.
 * Markers must not be flagged KN_QUEUED; every other entry must be
 * KN_ACTIVE and KN_QUEUED and belong to kq; the number of such entries
 * must equal kq_count.  func and line identify the caller in the panic
 * message.  kq_lock must be held.
 */
void
kqueue_do_check(struct kqueue *kq, const char *func, int line)
{
	struct knote *kn;
	int count = 0, nmarker = 0;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);

	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if (kn->kn_filter == EVFILT_MARKER) {
			if ((kn->kn_status & KN_QUEUED) != 0)
				panic("%s:%d: kq=%p kn=%p marker QUEUED",
				    func, line, kq, kn);
			nmarker++;
		} else {
			if ((kn->kn_status & KN_ACTIVE) == 0)
				panic("%s:%d: kq=%p kn=%p knote !ACTIVE",
				    func, line, kq, kn);
			if ((kn->kn_status & KN_QUEUED) == 0)
				panic("%s:%d: kq=%p kn=%p knote !QUEUED",
				    func, line, kq, kn);
			if (kn->kn_kq != kq)
				panic("%s:%d: kq=%p kn=%p kn_kq=%p != kq",
				    func, line, kq, kn, kn->kn_kq);
			count++;
			/* Bail out early once the count is known to be off. */
			if (count > kq->kq_count)
				goto bad;
		}
	}
	if (count != kq->kq_count) {
bad:
		panic("%s:%d: kq=%p kq_count=%d count=%d nmarker=%d",
		    func, line, kq, kq->kq_count, count, nmarker);
	}
}
#endif

/*
 * Apply one change (add, modify, delete, enable or disable) described by
 * kev to kq.  pollid is 0 except for the calling thread's own poll/select
 * kqueue.  Returns 0 or an errno value.
 */
int
kqueue_register(struct kqueue *kq, struct kevent *kev, unsigned int pollid,
    struct proc *p)
{
	struct filedesc *fdp = kq->kq_fdp;
	const struct filterops *fops = NULL;
	struct file *fp = NULL;
	struct knote *kn = NULL, *newkn = NULL;
	struct knlist *list = NULL;
	int active, error = 0;

	KASSERT(pollid == 0 || (p != NULL && p->p_kq == kq));

	/* System filters have negative numbers; map them into sysfilt_ops. */
	if (kev->filter < 0) {
		if (kev->filter + EVFILT_SYSCOUNT < 0)
			return (EINVAL);
		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
	}

	if (fops == NULL) {
		/*
		 * XXX
		 * filter attach routine is responsible for ensuring that
		 * the identifier can be attached to it.
*/
		return (EINVAL);
	}

	if (fops->f_flags & FILTEROP_ISFD) {
		/* validate descriptor */
		if (kev->ident > INT_MAX)
			return (EBADF);
	}

	/* Allocate a knote up front, before kq_lock is taken. */
	if (kev->flags & EV_ADD)
		newkn = pool_get(&knote_pool, PR_WAITOK | PR_ZERO);

again:
	/*
	 * Find the list that holds the knote: the kq_knlist slot of the
	 * descriptor for descriptor filters, a kq_knhash bucket otherwise.
	 */
	if (fops->f_flags & FILTEROP_ISFD) {
		if ((fp = fd_getfile(fdp, kev->ident)) == NULL) {
			error = EBADF;
			goto done;
		}
		mtx_enter(&kq->kq_lock);
		if (kev->flags & EV_ADD)
			kqueue_expand_list(kq, kev->ident);
		if (kev->ident < kq->kq_knlistsize)
			list = &kq->kq_knlist[kev->ident];
	} else {
		mtx_enter(&kq->kq_lock);
		if (kev->flags & EV_ADD)
			kqueue_expand_hash(kq);
		if (kq->kq_knhashmask != 0) {
			list = &kq->kq_knhash[
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
		}
	}
	/* Look for an existing knote with the same filter, ident and pollid. */
	if (list != NULL) {
		SLIST_FOREACH(kn, list, kn_link) {
			if (kev->filter == kn->kn_filter &&
			    kev->ident == kn->kn_id &&
			    pollid == kn->kn_pollid) {
				if (!knote_acquire(kn, NULL, 0)) {
					/* knote_acquire() has released
					 * kq_lock. */
					if (fp != NULL) {
						FRELE(fp, p);
						fp = NULL;
					}
					goto again;
				}
				break;
			}
		}
	}
	KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0);

	/* Every operation except EV_ADD needs an existing knote. */
	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		mtx_leave(&kq->kq_lock);
		error = ENOENT;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match.
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			/* Consume the preallocated knote. */
			kn = newkn;
			newkn = NULL;
			kn->kn_status = KN_PROCESSING;
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			/*
			 * Keep the requested fflags and data in kn_sfflags
			 * and kn_sdata and clear them in the kevent that is
			 * copied into the knote.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;
			kn->kn_pollid = pollid;

			knote_attach(kn);
			mtx_leave(&kq->kq_lock);

			error = filter_attach(kn);
			if (error != 0) {
				knote_drop(kn, p);
				goto done;
			}

			/*
			 * If this is a file descriptor filter, check if
			 * fd was closed while the knote was being added.
			 * knote_fdclose() has missed kn if the function
			 * ran before kn appeared in kq_knlist.
			 */
			if ((fops->f_flags & FILTEROP_ISFD) &&
			    fd_checkclosed(fdp, kev->ident, kn->kn_fp)) {
				/*
				 * Drop the knote silently without error
				 * because another thread might already have
				 * seen it. This corresponds to the insert
				 * happening in full before the close.
				 */
				filter_detach(kn);
				knote_drop(kn, p);
				goto done;
			}

			/* Check if there is a pending event. */
			active = filter_process(kn, NULL);
			mtx_enter(&kq->kq_lock);
			if (active)
				knote_activate(kn);
		} else if (kn->kn_fop == &badfd_filtops) {
			/*
			 * Nothing expects this badfd knote any longer.
			 * Drop it to make room for the new knote and retry.
			 */
			KASSERT(kq == p->p_kq);
			mtx_leave(&kq->kq_lock);
			filter_detach(kn);
			knote_drop(kn, p);

			KASSERT(fp != NULL);
			FRELE(fp, p);
			fp = NULL;

			goto again;
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			mtx_leave(&kq->kq_lock);
			active = filter_modify(kev, kn);
			mtx_enter(&kq->kq_lock);
			if (active)
				knote_activate(kn);
			/* kev carries EV_ERROR and an error code on failure. */
			if (kev->flags & EV_ERROR) {
				error = kev->data;
				goto release;
			}
		}
	} else if (kev->flags & EV_DELETE) {
		mtx_leave(&kq->kq_lock);
		filter_detach(kn);
		knote_drop(kn, p);
		goto done;
	}

	if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0))
		kn->kn_status |= KN_DISABLED;

	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;
		mtx_leave(&kq->kq_lock);
		/* Check if there is a pending event.
*/
		active = filter_process(kn, NULL);
		mtx_enter(&kq->kq_lock);
		if (active)
			knote_activate(kn);
	}

release:
	/* Reached with kq_lock held and the knote acquired. */
	knote_release(kn);
	mtx_leave(&kq->kq_lock);
done:
	/* Drop what was not handed over to a knote. */
	if (fp != NULL)
		FRELE(fp, p);
	if (newkn != NULL)
		pool_put(&knote_pool, newkn);
	return (error);
}

/*
 * Sleep on kq until woken up or until the time in *tsp has passed; a NULL
 * tsp means no time limit.  Called with kq_lock held; the lock is not
 * reacquired on return (PNORELOCK).  The time spent sleeping is subtracted
 * from *tsp, which does not go below zero.
 * Returns the result of msleep_nsec().
 */
int
kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
{
	struct timespec elapsed, start, stop;
	uint64_t nsecs;
	int error;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);

	if (tsp != NULL) {
		getnanouptime(&start);
		/* A single sleep is capped at MAXTSLP. */
		nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
	} else
		nsecs = INFSLP;
	error = msleep_nsec(kq, &kq->kq_lock, PSOCK | PCATCH | PNORELOCK,
	    "kqread", nsecs);
	if (tsp != NULL) {
		getnanouptime(&stop);
		timespecsub(&stop, &start, &elapsed);
		timespecsub(tsp, &elapsed, tsp);
		if (tsp->tv_sec < 0)
			timespecclear(tsp);
	}

	return (error);
}

/*
 * Scan the kqueue, blocking if necessary until the target time is reached.
 * If tsp is NULL we block indefinitely.  If tsp->tv_sec and tsp->tv_nsec
 * are both 0 we do not block at all.
 *
 * Up to maxevents events are stored in kevp.  The number of events stored
 * is returned and an errno value, or 0, is stored in *errorp.
 */
int
kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
    struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
{
	struct kqueue *kq = scan->kqs_kq;
	struct knote *kn;
	int error = 0, nkev = 0;
	int reinserted;

	if (maxevents == 0)
		goto done;
retry:
	KASSERT(nkev == 0);

	error = 0;
	reinserted = 0;

	/* msleep() with PCATCH requires kernel lock. */
	KERNEL_LOCK();

	mtx_enter(&kq->kq_lock);

	/* kqueue_terminate() has marked the kqueue as going away. */
	if (kq->kq_state & KQ_DYING) {
		mtx_leave(&kq->kq_lock);
		KERNEL_UNLOCK();
		error = EBADF;
		goto done;
	}

	if (kq->kq_count == 0) {
		/*
		 * Successive loops are only necessary if there are more
		 * ready events to gather, so they don't need to block.
		 */
		if ((tsp != NULL && !timespecisset(tsp)) ||
		    scan->kqs_nevent != 0) {
			mtx_leave(&kq->kq_lock);
			KERNEL_UNLOCK();
			error = 0;
			goto done;
		}
		kq->kq_state |= KQ_SLEEP;
		error = kqueue_sleep(kq, tsp);
		/* kqueue_sleep() has released kq_lock. */
		KERNEL_UNLOCK();
		if (error == 0 || error == EWOULDBLOCK)
			goto retry;
		/* don't restart after signals...
*/
		if (error == ERESTART)
			error = EINTR;
		goto done;
	}

	/* The actual scan does not sleep on kq, so unlock the kernel. */
	KERNEL_UNLOCK();

	/*
	 * Put the end marker in the queue to limit the scan to the events
	 * that are currently active.  This prevents events from being
	 * recollected if they reactivate during scan.
	 *
	 * If a partial scan has been performed already but no events have
	 * been collected, reposition the end marker to make any new events
	 * reachable.
	 */
	if (!scan->kqs_queued) {
		TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
		scan->kqs_queued = 1;
	} else if (scan->kqs_nevent == 0) {
		TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
		TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
	}

	/* The start marker tracks the scan position; kn is what follows it. */
	TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe);
	while (nkev < maxevents) {
		kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe);
		if (kn->kn_filter == EVFILT_MARKER) {
			/* Our own end marker ends the scan. */
			if (kn == &scan->kqs_end)
				break;

			/* Move start marker past another thread's marker. */
			TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
			TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start,
			    kn_tqe);
			continue;
		}

		if (!knote_acquire(kn, NULL, 0)) {
			/* knote_acquire() has released kq_lock. */
			mtx_enter(&kq->kq_lock);
			continue;
		}

		/* Take the knote off the queue. */
		kqueue_check(kq);
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq->kq_count--;
		kqueue_check(kq);

		/* Disabled knotes are dequeued but not reported. */
		if (kn->kn_status & KN_DISABLED) {
			knote_release(kn);
			continue;
		}

		mtx_leave(&kq->kq_lock);

		/* Drop expired kqpoll knotes. */
		if (p->p_kq == kq &&
		    p->p_kq_serial > (unsigned long)kn->kn_udata) {
			filter_detach(kn);
			knote_drop(kn, p);
			mtx_enter(&kq->kq_lock);
			continue;
		}

		/*
		 * Invalidate knotes whose vnodes have been revoked.
		 * This is a workaround; it is tricky to clear existing
		 * knotes and prevent new ones from being registered
		 * with the current revocation mechanism.
		 */
		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
		    kn->kn_fp != NULL &&
		    kn->kn_fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = kn->kn_fp->f_data;

			if (__predict_false(vp->v_op == &dead_vops &&
			    kn->kn_fop != &dead_filtops)) {
				/* Detach the old filter, then switch to dead. */
				filter_detach(kn);
				kn->kn_fop = &dead_filtops;

				/*
				 * Check if the event should be delivered.
				 * Use f_event directly because this is
				 * a special situation.
				 */
				if (kn->kn_fop->f_event(kn, 0) == 0) {
					filter_detach(kn);
					knote_drop(kn, p);
					mtx_enter(&kq->kq_lock);
					continue;
				}
			}
		}

		/* Let the filter fill in the next output slot. */
		memset(kevp, 0, sizeof(*kevp));
		if (filter_process(kn, kevp) == 0) {
			/* Nothing to report after all; leave the knote idle. */
			mtx_enter(&kq->kq_lock);
			if ((kn->kn_status & KN_QUEUED) == 0)
				kn->kn_status &= ~KN_ACTIVE;
			knote_release(kn);
			kqueue_check(kq);
			continue;
		}

		/*
		 * Post-event action on the note
		 */
		if (kevp->flags & EV_ONESHOT) {
			/* One-shot: the knote is gone once it is reported. */
			filter_detach(kn);
			knote_drop(kn, p);
			mtx_enter(&kq->kq_lock);
		} else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) {
			mtx_enter(&kq->kq_lock);
			if (kevp->flags & EV_DISPATCH)
				kn->kn_status |= KN_DISABLED;
			if ((kn->kn_status & KN_QUEUED) == 0)
				kn->kn_status &= ~KN_ACTIVE;
			knote_release(kn);
		} else {
			/* Otherwise the knote is put back on the queue. */
			mtx_enter(&kq->kq_lock);
			if ((kn->kn_status & KN_QUEUED) == 0) {
				kqueue_check(kq);
				kq->kq_count++;
				kn->kn_status |= KN_QUEUED;
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
				/* Wakeup is done after loop.
*/
				reinserted = 1;
			}
			knote_release(kn);
		}
		kqueue_check(kq);
		kevp++;
		nkev++;
		scan->kqs_nevent++;
	}
	TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
	if (reinserted && kq->kq_count != 0)
		kqueue_wakeup(kq);
	mtx_leave(&kq->kq_lock);
	if (scan->kqs_nevent == 0)
		goto retry;
done:
	*errorp = error;
	return (nkev);
}

/*
 * Prepare a scan state for scanning kq.
 *
 * The state is zeroed, a reference on kq is taken with KQREF() and
 * recorded in kqs_kq, and both marker knotes (kqs_start, kqs_end) are
 * tagged with EVFILT_MARKER and KN_PROCESSING.  kqueue_scan_finish()
 * asserts these tags and drops the reference again.
 */
void
kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq)
{
	memset(scan, 0, sizeof(*scan));

	KQREF(kq);
	scan->kqs_kq = kq;
	scan->kqs_start.kn_filter = EVFILT_MARKER;
	scan->kqs_start.kn_status = KN_PROCESSING;
	scan->kqs_end.kn_filter = EVFILT_MARKER;
	scan->kqs_end.kn_status = KN_PROCESSING;
}

/*
 * Tear down a scan state set up by kqueue_scan_setup().
 *
 * Checks that the markers still carry the values assigned at setup.
 * If the end marker is still linked into the queue (kqs_queued), it is
 * unlinked under kq_lock.  Finally the kqueue reference is released
 * with KQRELE().
 */
void
kqueue_scan_finish(struct kqueue_scan_state *scan)
{
	struct kqueue *kq = scan->kqs_kq;

	KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
	KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
	KASSERT(scan->kqs_end.kn_filter == EVFILT_MARKER);
	KASSERT(scan->kqs_end.kn_status == KN_PROCESSING);

	if (scan->kqs_queued) {
		scan->kqs_queued = 0;
		mtx_enter(&kq->kq_lock);
		TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
		mtx_leave(&kq->kq_lock);
	}
	KQRELE(kq);
}

/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
*/
/* Reading from a kqueue descriptor is not implemented: always ENXIO. */
int
kqueue_read(struct file *fp, struct uio *uio, int fflags)
{
	return (ENXIO);
}

/* Writing to a kqueue descriptor is not implemented: always ENXIO. */
int
kqueue_write(struct file *fp, struct uio *uio, int fflags)
{
	return (ENXIO);
}

/* No ioctl commands are handled for a kqueue: always ENOTTY. */
int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
{
	return (ENOTTY);
}

/*
 * Fill in *st for a kqueue descriptor.
 *
 * Everything is zeroed except st_size (the current kq_count),
 * st_blksize (sizeof(struct kevent)) and st_mode (S_IFIFO).
 * Always returns 0.
 */
int
kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
{
	struct kqueue *kq = fp->f_data;

	memset(st, 0, sizeof(*st));
	st->st_size = kq->kq_count;	/* unlocked read */
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

/*
 * Remove every knote registered with kq.
 *
 * Walks each slot of the kq_knlist array and, when a hash has been
 * allocated (kq_knhashmask != 0), each hash bucket, calling
 * knote_remove() with purge set.  kq_lock is taken here; knote_remove()
 * temporarily releases it while it works on a knote.
 */
void
kqueue_purge(struct proc *p, struct kqueue *kq)
{
	int i;

	mtx_enter(&kq->kq_lock);
	for (i = 0; i < kq->kq_knlistsize; i++)
		knote_remove(p, kq, &kq->kq_knlist, i, 1);
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i < kq->kq_knhashmask + 1; i++)
			knote_remove(p, kq, &kq->kq_knhash, i, 1);
	}
	mtx_leave(&kq->kq_lock);
}

/*
 * Mark kq as dying and wake up anything waiting on it.
 *
 * Expects the queue to hold no events any more (only scan markers may
 * remain).  If a deferred wakeup task was ever scheduled (KQ_TASK),
 * waits for it with taskq_del_barrier() after dropping kq_lock.
 */
void
kqueue_terminate(struct proc *p, struct kqueue *kq)
{
	struct knote *kn;
	int state;

	mtx_enter(&kq->kq_lock);

	/*
	 * Any remaining entries should be scan markers.
	 * They are removed when the ongoing scans finish.
	 */
	KASSERT(kq->kq_count == 0);
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe)
		KASSERT(kn->kn_filter == EVFILT_MARKER);

	kq->kq_state |= KQ_DYING;
	/* Snapshot the state under the lock; it is tested after unlocking. */
	state = kq->kq_state;
	kqueue_wakeup(kq);
	mtx_leave(&kq->kq_lock);

	/*
	 * Any knotes that were attached to this kqueue were deleted
	 * by knote_fdclose() when this kqueue's file descriptor was closed.
	 */
	KASSERT(klist_empty(&kq->kq_klist));
	if (state & KQ_TASK)
		taskq_del_barrier(systqmp, &kq->kq_task);
}

/*
 * Close handler for a kqueue file.
 *
 * Detaches the kqueue from the file, purges all of its knotes,
 * terminates it and drops a reference with KQRELE().
 * Always returns 0.
 */
int
kqueue_close(struct file *fp, struct proc *p)
{
	struct kqueue *kq = fp->f_data;

	fp->f_data = NULL;

	kqueue_purge(p, kq);
	kqueue_terminate(p, kq);

	KQRELE(kq);

	return (0);
}

/*
 * Task callback scheduled by kqueue_wakeup(): notify the knotes hanging
 * off this kqueue's own klist, holding kqueue_klist_lock meanwhile.
 * arg is the kqueue.
 */
static void
kqueue_task(void *arg)
{
	struct kqueue *kq = arg;

	mtx_enter(&kqueue_klist_lock);
	KNOTE(&kq->kq_klist, 0);
	mtx_leave(&kqueue_klist_lock);
}

/*
 * Wake up waiters on kq.  Must be called with kq_lock held.
 *
 * A sleeper flagged by KQ_SLEEP is woken directly.  If any knotes
 * are attached to the kqueue itself (kq_klist is not empty), their
 * notification is handed to kq_task on systqmp instead of being done
 * inline.
 */
void
kqueue_wakeup(struct kqueue *kq)
{
	MUTEX_ASSERT_LOCKED(&kq->kq_lock);

	if (kq->kq_state & KQ_SLEEP) {
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);
	}
	if (!klist_empty(&kq->kq_klist)) {
		/* Defer activation to avoid recursion. */
		kq->kq_state |= KQ_TASK;
		task_add(systqmp, &kq->kq_task);
	}
}

/*
 * Allocate the knote hash table of kq if it does not exist yet.
 *
 * Called and returns with kq_lock held, but the lock is dropped around
 * the hashinit() call (M_WAITOK), so the condition is re-checked
 * afterwards and a table that lost the race is freed again.
 */
static void
kqueue_expand_hash(struct kqueue *kq)
{
	struct knlist *hash;
	u_long hashmask;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);

	if (kq->kq_knhashmask == 0) {
		mtx_leave(&kq->kq_lock);
		hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask);
		mtx_enter(&kq->kq_lock);
		if (kq->kq_knhashmask == 0) {
			kq->kq_knhash = hash;
			kq->kq_knhashmask = hashmask;
		} else {
			/* Another thread has allocated the hash. */
			mtx_leave(&kq->kq_lock);
			hashfree(hash, KN_HASHSIZE, M_KEVENT);
			mtx_enter(&kq->kq_lock);
		}
	}
}

/*
 * Grow the per-descriptor knote list array of kq so that index fd
 * becomes valid.
 *
 * Called and returns with kq_lock held.  The lock is dropped while the
 * new array is allocated and while an array is freed, so the size is
 * re-checked after re-locking; if another thread already grew the array
 * far enough, the new allocation is discarded.
 */
static void
kqueue_expand_list(struct kqueue *kq, int fd)
{
	struct knlist *list, *olist;
	int size, osize;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);

	if (kq->kq_knlistsize <= fd) {
		size = kq->kq_knlistsize;
		mtx_leave(&kq->kq_lock);
		/* Grow in steps of KQEXTENT until fd fits. */
		while (size <= fd)
			size += KQEXTENT;
		list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK);
		mtx_enter(&kq->kq_lock);
		if (kq->kq_knlistsize <= fd) {
			/* Copy the old slots, zero the newly added ones. */
			memcpy(list, kq->kq_knlist,
			    kq->kq_knlistsize * sizeof(*list));
			memset(&list[kq->kq_knlistsize], 0,
			    (size - kq->kq_knlistsize) * sizeof(*list));
			olist = kq->kq_knlist;
			osize = kq->kq_knlistsize;
			kq->kq_knlist = list;
			kq->kq_knlistsize = size;
			mtx_leave(&kq->kq_lock);
			free(olist, M_KEVENT, osize * sizeof(*list));
			mtx_enter(&kq->kq_lock);
		} else {
			/* Another thread has expanded the list. */
			mtx_leave(&kq->kq_lock);
			free(list, M_KEVENT, size * sizeof(*list));
			mtx_enter(&kq->kq_lock);
		}
	}
}

/*
 * Acquire a knote, return non-zero on success, 0 on failure.
 *
 * If we cannot acquire the knote we sleep and return 0.  The knote
 * may be stale on return in this case and the caller must restart
 * whatever loop they are in.
 *
 * If we are about to sleep and klist is non-NULL, the list is unlocked
 * before sleep and remains unlocked on return.
 *
 * kq_lock must be held on entry.  On success it is still held; on
 * failure it has been released (explicitly in the klist case, via
 * PNORELOCK otherwise).
 */
int
knote_acquire(struct knote *kn, struct klist *klist, int ls)
{
	struct kqueue *kq = kn->kn_kq;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
	KASSERT(kn->kn_filter != EVFILT_MARKER);

	if (kn->kn_status & KN_PROCESSING) {
		/* Ask the current holder to wake us when it is done. */
		kn->kn_status |= KN_WAITING;
		if (klist != NULL) {
			mtx_leave(&kq->kq_lock);
			klist_unlock(klist, ls);
			/* XXX Timeout resolves potential loss of wakeup. */
			tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
		} else {
			msleep_nsec(kn, &kq->kq_lock, PNORELOCK, "kqepts",
			    SEC_TO_NSEC(1));
		}
		/* knote may be stale now */
		return (0);
	}
	kn->kn_status |= KN_PROCESSING;
	return (1);
}

/*
 * Release an acquired knote, clearing KN_PROCESSING.
 *
 * kq_lock must be held.  A thread that set KN_WAITING in
 * knote_acquire() is woken up.
 */
void
knote_release(struct knote *kn)
{
	MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT(kn->kn_status & KN_PROCESSING);

	if (kn->kn_status & KN_WAITING) {
		kn->kn_status &= ~KN_WAITING;
		wakeup(kn);
	}
	kn->kn_status &= ~KN_PROCESSING;
	/* kn should not be accessed anymore */
}

/*
 * activate one knote.
 *
 * Sets KN_ACTIVE and, unless the knote is already queued or is
 * disabled, puts it on its kqueue.  kq_lock must be held.
 */
void
knote_activate(struct knote *kn)
{
	MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock);

	kn->kn_status |= KN_ACTIVE;
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
		knote_enqueue(kn);
}

/*
 * walk down a list of knotes, activating them if their event has triggered.
*/
/*
 * The klist must be locked by the caller.  For every knote whose
 * filter_event() reports a non-zero result for hint, the knote is
 * activated under the lock of the kqueue it belongs to.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn, *kn0;
	struct kqueue *kq;

	KLIST_ASSERT_LOCKED(list);

	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0) {
		if (filter_event(kn, hint)) {
			kq = kn->kn_kq;
			mtx_enter(&kq->kq_lock);
			knote_activate(kn);
			mtx_leave(&kq->kq_lock);
		}
	}
}

/*
 * remove all knotes from a specified knlist
 *
 * plist points at the array pointer (kq_knlist or kq_knhash) and idx
 * selects the list within that array.  Called and returns with kq_lock
 * held; the lock is released while each knote is detached and dropped.
 *
 * With purge set every knote on the list is dropped.  Without it,
 * knotes registered for poll/select that have not expired are kept and
 * rewired to badfd_filtops so that the closed descriptor is reported;
 * knotes already using badfd_filtops are skipped.
 */
void
knote_remove(struct proc *p, struct kqueue *kq, struct knlist **plist, int idx,
    int purge)
{
	struct knote *kn;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);

	/* Always fetch array pointer as another thread can resize kq_knlist. */
	while ((kn = SLIST_FIRST(*plist + idx)) != NULL) {
		KASSERT(kn->kn_kq == kq);

		if (!purge) {
			/* Skip pending badfd knotes. */
			while (kn->kn_fop == &badfd_filtops) {
				kn = SLIST_NEXT(kn, kn_link);
				if (kn == NULL)
					return;
				KASSERT(kn->kn_kq == kq);
			}
		}

		if (!knote_acquire(kn, NULL, 0)) {
			/* knote_acquire() has released kq_lock. */
			mtx_enter(&kq->kq_lock);
			continue;
		}
		mtx_leave(&kq->kq_lock);
		filter_detach(kn);

		/*
		 * Notify poll(2) and select(2) when a monitored
		 * file descriptor is closed.
		 *
		 * This reuses the original knote for delivering the
		 * notification so as to avoid allocating memory.
		 */
		if (!purge && (kn->kn_flags & (__EV_POLL | __EV_SELECT)) &&
		    !(p->p_kq == kq &&
		      p->p_kq_serial > (unsigned long)kn->kn_udata) &&
		    kn->kn_fop != &badfd_filtops) {
			KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD);
			/* The knote outlives the file; give up its reference. */
			FRELE(kn->kn_fp, p);
			kn->kn_fp = NULL;

			kn->kn_fop = &badfd_filtops;
			filter_event(kn, 0);
			mtx_enter(&kq->kq_lock);
			knote_activate(kn);
			knote_release(kn);
			continue;
		}

		knote_drop(kn, p);
		mtx_enter(&kq->kq_lock);
	}
}

/*
 * remove all knotes referencing a specified fd
 *
 * Visits every kqueue on the file descriptor table's fd_kqlist and
 * removes the knotes stored at index fd, provided the kqueue's list
 * array is large enough to contain that index.
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc *fdp = p->p_p->ps_fd;
	struct kqueue *kq;

	/*
	 * fdplock can be ignored if the file descriptor table is being freed
	 * because no other thread can access the fdp.
	 */
	if (fdp->fd_refcnt != 0)
		fdpassertlocked(fdp);

	LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) {
		mtx_enter(&kq->kq_lock);
		if (fd < kq->kq_knlistsize)
			knote_remove(p, kq, &kq->kq_knlist, fd, 0);
		mtx_leave(&kq->kq_lock);
	}
}

/*
 * handle a process exiting, including the triggering of NOTE_EXIT notes
 * XXX this could be more efficient, doing a single pass down the klist
 *
 * The kernel lock must be held.
 */
void
knote_processexit(struct process *pr)
{
	KERNEL_ASSERT_LOCKED();

	KNOTE(&pr->ps_klist, NOTE_EXIT);

	/* remove other knotes hanging off the process */
	klist_invalidate(&pr->ps_klist);
}

/*
 * Link a knote into its kqueue's lookup structure and count it.
 *
 * Knotes of descriptor-based filters (FILTEROP_ISFD) go on the list at
 * index kn_id of kq_knlist; all others go into the kq_knhash bucket
 * chosen by KN_HASH().  The target structure must already be large
 * enough.  kq_lock must be held and the knote must be acquired
 * (KN_PROCESSING).
 */
void
knote_attach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	struct knlist *list;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
	KASSERT(kn->kn_status & KN_PROCESSING);

	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
		KASSERT(kq->kq_knlistsize > kn->kn_id);
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		KASSERT(kq->kq_knhashmask != 0);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	SLIST_INSERT_HEAD(list, kn, kn_link);
	kq->kq_nknotes++;
}

/*
 * Undo knote_attach(): unlink the knote from the list or hash bucket it
 * was inserted into and decrement kq_nknotes.  kq_lock must be held and
 * the knote must be acquired (KN_PROCESSING).
 */
void
knote_detach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	struct knlist *list;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
	KASSERT(kn->kn_status & KN_PROCESSING);

	kq->kq_nknotes--;
	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		list = &kq->kq_knlist[kn->kn_id];
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	SLIST_REMOVE(list, kn, knote, kn_link);
}

/*
 * should be called at spl == 0, since we don't want to hold spl
 * while calling FRELE and pool_put.
*/
/*
 * Destroy a knote: detach it from its kqueue, take it off the pending
 * queue if it is queued, wake any thread waiting for it, release the
 * file reference of a descriptor-based knote and return the knote to
 * knote_pool.  kq_lock is taken and released internally.
 */
void
knote_drop(struct knote *kn, struct proc *p)
{
	struct kqueue *kq = kn->kn_kq;

	KASSERT(kn->kn_filter != EVFILT_MARKER);

	mtx_enter(&kq->kq_lock);
	knote_detach(kn);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_status & KN_WAITING) {
		kn->kn_status &= ~KN_WAITING;
		wakeup(kn);
	}
	mtx_leave(&kq->kq_lock);

	/* kn_fp may have been cleared already, see knote_remove(). */
	if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL)
		FRELE(kn->kn_fp, p);
	pool_put(&knote_pool, kn);
}

/*
 * Append a knote to the tail of its kqueue's pending queue, mark it
 * KN_QUEUED, bump kq_count and wake up waiters on the kqueue.
 * kq_lock must be held and the knote must not be queued already.
 */
void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT((kn->kn_status & KN_QUEUED) == 0);

	kqueue_check(kq);
	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	kqueue_check(kq);
	kqueue_wakeup(kq);
}

/*
 * Take a queued knote off its kqueue's pending queue, clear KN_QUEUED
 * and decrement kq_count.  kq_lock must be held.
 */
void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
	KASSERT(kn->kn_filter != EVFILT_MARKER);
	KASSERT(kn->kn_status & KN_QUEUED);

	kqueue_check(kq);
	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
	kqueue_check(kq);
}

/*
 * Assign parameters to the knote.
 *
 * The knote's object lock must be held.
 *
 * Copies fflags, data and udata from kev into the knote's kn_sfflags,
 * kn_sdata and kn_udata.  For filters that are not FILTEROP_MPSAFE the
 * kernel lock must be held as well.
 */
void
knote_assign(const struct kevent *kev, struct knote *kn)
{
	if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0)
		KERNEL_ASSERT_LOCKED();

	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	kn->kn_udata = kev->udata;
}

/*
 * Submit the knote's event for delivery.
 *
 * The knote's object lock must be held.
*/
/*
 * If kev is not NULL, the knote's current kevent is copied into it;
 * for EV_CLEAR knotes the accumulated kn_fflags and kn_data are then
 * reset.  For filters that are not FILTEROP_MPSAFE the kernel lock must
 * be held.
 */
void
knote_submit(struct knote *kn, struct kevent *kev)
{
	if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0)
		KERNEL_ASSERT_LOCKED();

	if (kev != NULL) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
}

/*
 * Initialize an empty klist.  ops supplies the lock callbacks and arg
 * is the value passed to them; with ops == NULL, klist_lock() and
 * klist_unlock() fall back to the kernel lock plus splhigh().
 */
void
klist_init(struct klist *klist, const struct klistops *ops, void *arg)
{
	SLIST_INIT(&klist->kl_list);
	klist->kl_ops = ops;
	klist->kl_arg = arg;
}

/* Counterpart of klist_init(); only checks that the list is empty. */
void
klist_free(struct klist *klist)
{
	KASSERT(SLIST_EMPTY(&klist->kl_list));
}

/* Add kn at the head of klist, taking the klist lock around the insert. */
void
klist_insert(struct klist *klist, struct knote *kn)
{
	int ls;

	ls = klist_lock(klist);
	SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
	klist_unlock(klist, ls);
}

/* Like klist_insert(), for callers that already hold the klist lock. */
void
klist_insert_locked(struct klist *klist, struct knote *kn)
{
	KLIST_ASSERT_LOCKED(klist);

	SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
}

/* Remove kn from klist, taking the klist lock around the removal. */
void
klist_remove(struct klist *klist, struct knote *kn)
{
	int ls;

	ls = klist_lock(klist);
	SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
	klist_unlock(klist, ls);
}

/* Like klist_remove(), for callers that already hold the klist lock. */
void
klist_remove_locked(struct klist *klist, struct knote *kn)
{
	KLIST_ASSERT_LOCKED(klist);

	SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
}

/*
 * Detach all knotes from klist.  The knotes are rewired to indicate EOF.
 *
 * The caller of this function must not hold any locks that can block
 * filterops callbacks that run with KN_PROCESSING.
 * Otherwise this function might deadlock.
 *
 * Descriptor-based knotes (FILTEROP_ISFD) are switched to dead_filtops
 * and activated; all other knotes are dropped.
 */
void
klist_invalidate(struct klist *list)
{
	struct knote *kn;
	struct kqueue *kq;
	struct proc *p = curproc;
	int ls;

	NET_ASSERT_UNLOCKED();

	ls = klist_lock(list);
	while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
		kq = kn->kn_kq;
		mtx_enter(&kq->kq_lock);
		if (!knote_acquire(kn, list, ls)) {
			/* knote_acquire() has released kq_lock
			 * and klist lock. */
			ls = klist_lock(list);
			continue;
		}
		mtx_leave(&kq->kq_lock);
		/* Both locks are released before the filter is detached. */
		klist_unlock(list, ls);
		filter_detach(kn);
		if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
			kn->kn_fop = &dead_filtops;
			filter_event(kn, 0);
			mtx_enter(&kq->kq_lock);
			knote_activate(kn);
			knote_release(kn);
			mtx_leave(&kq->kq_lock);
		} else {
			knote_drop(kn, p);
		}
		ls = klist_lock(list);
	}
	klist_unlock(list, ls);
}

/*
 * Lock a klist and return the cookie that has to be handed back to
 * klist_unlock().  Uses the klist's klo_lock callback when ops are
 * set; otherwise takes the kernel lock and raises the spl with
 * splhigh(), returning the previous level.
 */
static int
klist_lock(struct klist *list)
{
	int ls = 0;

	if (list->kl_ops != NULL) {
		ls = list->kl_ops->klo_lock(list->kl_arg);
	} else {
		KERNEL_LOCK();
		ls = splhigh();
	}
	return ls;
}

/*
 * Unlock a klist locked by klist_lock(); ls is the value klist_lock()
 * returned.  The fallback path undoes its steps in reverse order.
 */
static void
klist_unlock(struct klist *list, int ls)
{
	if (list->kl_ops != NULL) {
		list->kl_ops->klo_unlock(list->kl_arg, ls);
	} else {
		splx(ls);
		KERNEL_UNLOCK();
	}
}

/* klistops callback: assert that the mutex passed as arg is held. */
static void
klist_mutex_assertlk(void *arg)
{
	struct mutex *mtx = arg;

	/* Keeps mtx referenced; NOTE(review): presumably for builds where the assertion compiles out. */
	(void)mtx;

	MUTEX_ASSERT_LOCKED(mtx);
}

/* klistops callback: enter the mutex passed as arg; the cookie is unused. */
static int
klist_mutex_lock(void *arg)
{
	struct mutex *mtx = arg;

	mtx_enter(mtx);
	return 0;
}

/* klistops callback: leave the mutex passed as arg; s is ignored. */
static void
klist_mutex_unlock(void *arg, int s)
{
	struct mutex *mtx = arg;

	mtx_leave(mtx);
}

/* Lock operations for klists protected by a mutex. */
static const struct klistops mutex_klistops = {
	.klo_assertlk = klist_mutex_assertlk,
	.klo_lock = klist_mutex_lock,
	.klo_unlock = klist_mutex_unlock,
};

/* Initialize klist so that it is protected by the mutex mtx. */
void
klist_init_mutex(struct klist *klist, struct mutex *mtx)
{
	klist_init(klist, &mutex_klistops, mtx);
}

/* klistops callback: assert that the rwlock passed as arg is write-locked. */
static void
klist_rwlock_assertlk(void *arg)
{
	struct rwlock *rwl = arg;

	/* Keeps rwl referenced; NOTE(review): presumably for builds where the assertion compiles out. */
	(void)rwl;

	rw_assert_wrlock(rwl);
}

/* klistops callback: write-lock the rwlock passed as arg; the cookie is unused. */
static int
klist_rwlock_lock(void *arg)
{
	struct rwlock *rwl = arg;

	rw_enter_write(rwl);
	return 0;
}

/* klistops callback: release the write lock on arg; s is ignored. */
static void
klist_rwlock_unlock(void *arg, int s)
{
	struct rwlock *rwl = arg;

	rw_exit_write(rwl);
}

/* Lock operations for klists protected by a write-held rwlock. */
static const struct klistops rwlock_klistops = {
	.klo_assertlk = klist_rwlock_assertlk,
	.klo_lock = klist_rwlock_lock,
	.klo_unlock = klist_rwlock_unlock,
};

/* Initialize klist so that it is protected by the rwlock rwl. */
void
klist_init_rwlock(struct klist *klist, struct rwlock *rwl)
{
	klist_init(klist, &rwlock_klistops, rwl);
}
224 2 192 135 142 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 /* $OpenBSD: strncmp.c,v 1.11 2014/06/10 04:16:57 deraadt Exp $ */ /* * Copyright (c) 1989 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include <lib/libkern/libkern.h> int strncmp(const char *s1, const char *s2, size_t n) { if (n == 0) return (0); do { if (*s1 != *s2++) return (*(unsigned char *)s1 - *(unsigned char *)--s2); if (*s1++ == 0) break; } while (--n != 0); return (0); }
14 11 2 1 17 2 3 1 1 1 1 2 1 1 1 1 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 /* $OpenBSD: hotplug.c,v 1.22 2022/07/02 08:50:41 visa Exp $ */ /* * Copyright (c) 2004 Alexander Yurchenko <grange@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * Device attachment and detachment notifications. 
*/ #include <sys/param.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/fcntl.h> #include <sys/hotplug.h> #include <sys/ioctl.h> #include <sys/vnode.h> #define HOTPLUG_MAXEVENTS 64 static int opened; static struct hotplug_event evqueue[HOTPLUG_MAXEVENTS]; static int evqueue_head, evqueue_tail, evqueue_count; static struct selinfo hotplug_sel; void filt_hotplugrdetach(struct knote *); int filt_hotplugread(struct knote *, long); const struct filterops hotplugread_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_hotplugrdetach, .f_event = filt_hotplugread, }; #define EVQUEUE_NEXT(p) (p == HOTPLUG_MAXEVENTS - 1 ? 0 : p + 1) int hotplug_put_event(struct hotplug_event *); int hotplug_get_event(struct hotplug_event *); void hotplugattach(int); void hotplugattach(int count) { opened = 0; evqueue_head = 0; evqueue_tail = 0; evqueue_count = 0; } void hotplug_device_attach(enum devclass class, char *name) { struct hotplug_event he; he.he_type = HOTPLUG_DEVAT; he.he_devclass = class; strlcpy(he.he_devname, name, sizeof(he.he_devname)); hotplug_put_event(&he); } void hotplug_device_detach(enum devclass class, char *name) { struct hotplug_event he; he.he_type = HOTPLUG_DEVDT; he.he_devclass = class; strlcpy(he.he_devname, name, sizeof(he.he_devname)); hotplug_put_event(&he); } int hotplug_put_event(struct hotplug_event *he) { if (evqueue_count == HOTPLUG_MAXEVENTS && opened) { printf("hotplug: event lost, queue full\n"); return (1); } evqueue[evqueue_head] = *he; evqueue_head = EVQUEUE_NEXT(evqueue_head); if (evqueue_count == HOTPLUG_MAXEVENTS) evqueue_tail = EVQUEUE_NEXT(evqueue_tail); else evqueue_count++; wakeup(&evqueue); selwakeup(&hotplug_sel); return (0); } int hotplug_get_event(struct hotplug_event *he) { int s; if (evqueue_count == 0) return (1); s = splbio(); *he = evqueue[evqueue_tail]; evqueue_tail = EVQUEUE_NEXT(evqueue_tail); evqueue_count--; splx(s); return (0); } int hotplugopen(dev_t dev, int flag, int mode, struct proc *p) 
{ if (minor(dev) != 0) return (ENXIO); if ((flag & FWRITE)) return (EPERM); if (opened) return (EBUSY); opened = 1; return (0); } int hotplugclose(dev_t dev, int flag, int mode, struct proc *p) { struct hotplug_event he; while (hotplug_get_event(&he) == 0) continue; opened = 0; return (0); } int hotplugread(dev_t dev, struct uio *uio, int flags) { struct hotplug_event he; int error; if (uio->uio_resid != sizeof(he)) return (EINVAL); again: if (hotplug_get_event(&he) == 0) return (uiomove(&he, sizeof(he), uio)); if (flags & IO_NDELAY) return (EAGAIN); error = tsleep_nsec(&evqueue, PRIBIO | PCATCH, "htplev", INFSLP); if (error) return (error); goto again; } int hotplugioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { switch (cmd) { case FIOASYNC: /* ignore */ case FIONBIO: /* handled in the upper fs layer */ break; default: return (ENOTTY); } return (0); } int hotplugkqfilter(dev_t dev, struct knote *kn) { struct klist *klist; int s; switch (kn->kn_filter) { case EVFILT_READ: klist = &hotplug_sel.si_note; kn->kn_fop = &hotplugread_filtops; break; default: return (EINVAL); } s = splbio(); klist_insert_locked(klist, kn); splx(s); return (0); } void filt_hotplugrdetach(struct knote *kn) { int s; s = splbio(); klist_remove_locked(&hotplug_sel.si_note, kn); splx(s); } int filt_hotplugread(struct knote *kn, long hint) { kn->kn_data = evqueue_count; return (evqueue_count > 0); }
215 215 2 2 2 2 2 217 216 216 217 217 217 217 144 181 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 /* $OpenBSD: bus_dma.c,v 1.51 2019/06/09 12:52:04 kettenis Exp $ */ /* $NetBSD: bus_dma.c,v 1.3 2003/05/07 21:33:58 fvdl Exp $ */ /*- * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * The following is included because _bus_dma_uiomove is derived from * uiomove() in kern_subr.c. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <machine/bus.h> #include <uvm/uvm_extern.h> int _bus_dmamap_load_buffer(bus_dma_tag_t, bus_dmamap_t, void *, bus_size_t, struct proc *, int, paddr_t *, int *, int); /* * Common function for DMA map creation. May be called by bus-specific * DMA map creation functions. 
*/
/*
 * Allocates one struct bus_dmamap large enough for `nsegments' segment
 * slots (requested with M_ZERO), records the creation parameters in it
 * and stores it in *dmamp.  The allocation uses M_NOWAIT when
 * BUS_DMA_NOWAIT is set in `flags' and M_WAITOK otherwise.  `t' is not
 * referenced.
 *
 * Returns 0 on success, ENOMEM if malloc() returned NULL.
 *
 * NOTE(review): nsegments is not range-checked here; the size
 * computation assumes nsegments >= 1 -- confirm callers guarantee it.
 */
int
_bus_dmamap_create(bus_dma_tag_t t, bus_size_t size, int nsegments,
    bus_size_t maxsegsz, bus_size_t boundary, int flags, bus_dmamap_t *dmamp)
{
	struct bus_dmamap *map;
	void *mapstore;
	size_t mapsize;

	/*
	 * Allocate and initialize the DMA map.  The end of the map
	 * is a variable-sized array of segments, so we allocate enough
	 * room for them in one shot.
	 *
	 * Note we don't preserve the WAITOK or NOWAIT flags.  Preservation
	 * of ALLOCNOW notifies others that we've reserved these resources,
	 * and they are not to be freed.
	 *
	 * The bus_dmamap_t includes one bus_dma_segment_t, hence
	 * the (nsegments - 1).
	 */
	mapsize = sizeof(struct bus_dmamap) +
	    (sizeof(bus_dma_segment_t) * (nsegments - 1));
	if ((mapstore = malloc(mapsize, M_DEVBUF,
	    (flags & BUS_DMA_NOWAIT) ?
	    (M_NOWAIT|M_ZERO) : (M_WAITOK|M_ZERO))) == NULL)
		return (ENOMEM);

	map = (struct bus_dmamap *)mapstore;
	map->_dm_size = size;
	map->_dm_segcnt = nsegments;
	map->_dm_maxsegsz = maxsegsz;
	map->_dm_boundary = boundary;
	map->_dm_flags = flags & ~(BUS_DMA_WAITOK|BUS_DMA_NOWAIT);

	*dmamp = map;
	return (0);
}

/*
 * Common function for DMA map destruction.  May be called by bus-specific
 * DMA map destruction functions.
 *
 * Recomputes the allocation size from map->_dm_segcnt, using the same
 * formula as _bus_dmamap_create(), and passes it to the sized free().
 * `t' is not referenced.
 */
void
_bus_dmamap_destroy(bus_dma_tag_t t, bus_dmamap_t map)
{
	size_t mapsize;

	mapsize = sizeof(struct bus_dmamap) +
	    (sizeof(bus_dma_segment_t) * (map->_dm_segcnt - 1));
	free(map, M_DEVBUF, mapsize);
}

/*
 * Common function for loading a DMA map with a linear buffer.  May
 * be called by bus-specific DMA map load functions.
 *
 * Returns EINVAL if buflen exceeds map->_dm_size, otherwise the result
 * of _bus_dmamap_load_buffer().  dm_mapsize and dm_nsegs are only set to
 * non-zero values when that result is 0.
 */
int
_bus_dmamap_load(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
    bus_size_t buflen, struct proc *p, int flags)
{
	bus_addr_t lastaddr = 0;
	int seg, error;

	/*
	 * Make sure that on error condition we return "no valid mappings".
	 */
	map->dm_mapsize = 0;
	map->dm_nsegs = 0;

	if (buflen > map->_dm_size)
		return (EINVAL);

	/* Single buffer: start at segment 0 and flag it as the first call. */
	seg = 0;
	error = _bus_dmamap_load_buffer(t, map, buf, buflen, p, flags,
	    &lastaddr, &seg, 1);
	if (error == 0) {
		map->dm_mapsize = buflen;
		/* seg is the index of the last segment used. */
		map->dm_nsegs = seg + 1;
	}
	return (error);
}

/*
 * Like _bus_dmamap_load(), but for mbufs.
 *
 * Walks the chain starting at m0 and loads each non-empty mbuf with a
 * NULL proc argument, stopping at the first error.  Returns EINVAL if
 * m0->m_pkthdr.len exceeds map->_dm_size.
 */
int
_bus_dmamap_load_mbuf(bus_dma_tag_t t, bus_dmamap_t map, struct mbuf *m0,
    int flags)
{
	paddr_t lastaddr = 0;
	int seg, error, first;
	struct mbuf *m;

	/*
	 * Make sure that on error condition we return "no valid mappings".
	 */
	map->dm_mapsize = 0;
	map->dm_nsegs = 0;

#ifdef DIAGNOSTIC
	if ((m0->m_flags & M_PKTHDR) == 0)
		panic("_bus_dmamap_load_mbuf: no packet header");
#endif

	if (m0->m_pkthdr.len > map->_dm_size)
		return (EINVAL);

	first = 1;
	seg = 0;
	error = 0;
	for (m = m0; m != NULL && error == 0; m = m->m_next) {
		/* Empty mbufs are skipped and do not consume `first'. */
		if (m->m_len == 0)
			continue;
		error = _bus_dmamap_load_buffer(t, map, m->m_data, m->m_len,
		    NULL, flags, &lastaddr, &seg, first);
		first = 0;
	}
	if (error == 0) {
		map->dm_mapsize = m0->m_pkthdr.len;
		map->dm_nsegs = seg + 1;
	}
	return (error);
}

/*
 * Like _bus_dmamap_load(), but for uios.
 *
 * The proc passed down to _bus_dmamap_load_buffer() is uio->uio_procp
 * for UIO_USERSPACE uios and NULL otherwise.
 *
 * NOTE(review): unlike the other loaders in this file, the requested
 * length (uio_resid) is not compared against map->_dm_size -- confirm
 * whether that is intentional.
 */
int
_bus_dmamap_load_uio(bus_dma_tag_t t, bus_dmamap_t map, struct uio *uio,
    int flags)
{
	paddr_t lastaddr = 0;
	int seg, i, error, first;
	bus_size_t minlen, resid;
	struct proc *p = NULL;
	struct iovec *iov;
	caddr_t addr;

	/*
	 * Make sure that on error condition we return "no valid mappings".
	 */
	map->dm_mapsize = 0;
	map->dm_nsegs = 0;

	/* Work on a local copy of the residual; the uio is not modified. */
	resid = uio->uio_resid;
	iov = uio->uio_iov;

	if (uio->uio_segflg == UIO_USERSPACE) {
		p = uio->uio_procp;
#ifdef DIAGNOSTIC
		if (p == NULL)
			panic("_bus_dmamap_load_uio: USERSPACE but no proc");
#endif
	}

	first = 1;
	seg = 0;
	error = 0;
	for (i = 0; i < uio->uio_iovcnt && resid != 0 && error == 0; i++) {
		/*
		 * Now at the first iovec to load.  Load each iovec
		 * until we have exhausted the residual count.
		 */
		minlen = resid < iov[i].iov_len ? resid : iov[i].iov_len;
		addr = (caddr_t)iov[i].iov_base;

		error = _bus_dmamap_load_buffer(t, map, addr, minlen,
		    p, flags, &lastaddr, &seg, first);
		first = 0;

		resid -= minlen;
	}
	if (error == 0) {
		/*
		 * NOTE(review): this records the full uio_resid even if the
		 * loop ended because the iovecs ran out first -- confirm
		 * callers never pass such a uio.
		 */
		map->dm_mapsize = uio->uio_resid;
		map->dm_nsegs = seg + 1;
	}
	return (error);
}

/*
 * Like _bus_dmamap_load(), but for raw memory allocated with
 * bus_dmamem_alloc().
 *
 * Takes physical segments directly (no pmap lookup) and splits or
 * coalesces them into map->dm_segs page by page, honouring the map's
 * boundary and maximum segment size.
 *
 * Returns 0 on success, or EINVAL if nsegs exceeds map->_dm_segcnt,
 * size exceeds map->_dm_size, or the map runs out of segment slots.
 * Panics if an address lies above dma_constraint.ucr_high and the map
 * was not created with BUS_DMA_64BIT.
 */
int
_bus_dmamap_load_raw(bus_dma_tag_t t, bus_dmamap_t map, bus_dma_segment_t *segs,
    int nsegs, bus_size_t size, int flags)
{
	bus_addr_t paddr, baddr, bmask, lastaddr = 0;
	bus_size_t plen, sgsize, mapsize;
	int first = 1;
	int i, seg = 0;

	/*
	 * Make sure that on error condition we return "no valid mappings".
	 */
	map->dm_mapsize = 0;
	map->dm_nsegs = 0;

	if (nsegs > map->_dm_segcnt || size > map->_dm_size)
		return (EINVAL);

	/* `size' is consumed by the loop below; remember the total. */
	mapsize = size;
	bmask = ~(map->_dm_boundary - 1);

	for (i = 0; i < nsegs && size > 0; i++) {
		paddr = segs[i].ds_addr;
		plen = MIN(segs[i].ds_len, size);

		while (plen > 0) {
			/*
			 * Compute the segment size, and adjust counts.
			 */
			sgsize = PAGE_SIZE - ((u_long)paddr & PGOFSET);
			if (plen < sgsize)
				sgsize = plen;

			if (paddr > dma_constraint.ucr_high &&
			    (map->_dm_flags & BUS_DMA_64BIT) == 0)
				panic("Non dma-reachable buffer at paddr %#lx(raw)",
				    paddr);

			/*
			 * Make sure we don't cross any boundaries.
			 */
			if (map->_dm_boundary > 0) {
				baddr = (paddr + map->_dm_boundary) & bmask;
				if (sgsize > (baddr - paddr))
					sgsize = (baddr - paddr);
			}

			/*
			 * Insert chunk into a segment, coalescing with
			 * previous segment if possible.
			 */
			if (first) {
				map->dm_segs[seg].ds_addr = paddr;
				map->dm_segs[seg].ds_len = sgsize;
				first = 0;
			} else {
				if (paddr == lastaddr &&
				    (map->dm_segs[seg].ds_len + sgsize) <=
				     map->_dm_maxsegsz &&
				    (map->_dm_boundary == 0 ||
				     (map->dm_segs[seg].ds_addr & bmask) ==
				     (paddr & bmask)))
					map->dm_segs[seg].ds_len += sgsize;
				else {
					/* Out of slots: dm_nsegs stays 0. */
					if (++seg >= map->_dm_segcnt)
						return (EINVAL);
					map->dm_segs[seg].ds_addr = paddr;
					map->dm_segs[seg].ds_len = sgsize;
				}
			}

			paddr += sgsize;
			plen -= sgsize;
			size -= sgsize;

			/* Address the next chunk must start at to coalesce. */
			lastaddr = paddr;
		}
	}

	map->dm_mapsize = mapsize;
	map->dm_nsegs = seg + 1;
	return (0);
}

/*
 * Common function for unloading a DMA map.  May be called by
 * bus-specific DMA map unload functions.
 */
void
_bus_dmamap_unload(bus_dma_tag_t t, bus_dmamap_t map)
{
	/*
	 * No resources to free; just mark the mappings as
	 * invalid.
	 */
	map->dm_mapsize = 0;
	map->dm_nsegs = 0;
}

/*
 * Common function for DMA map synchronization.  May be called
 * by bus-specific DMA map synchronization functions.
 *
 * This implementation is an empty function; none of the arguments
 * are used.
 */
void
_bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t addr,
    bus_size_t size, int op)
{
	/* Nothing to do here. */
}

/*
 * Common function for DMA-safe memory allocation.  May be called
 * by bus-specific DMA memory allocation functions.
 *
 * Thin wrapper: forwards every argument to _bus_dmamem_alloc_range()
 * with the address range fixed to [0, 0xffffffff] and returns its
 * result unchanged.
 */
int
_bus_dmamem_alloc(bus_dma_tag_t t, bus_size_t size, bus_size_t alignment,
    bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs,
    int flags)
{
	/*
	 * XXX in the presence of decent (working) iommus and bouncebuffers
	 * we can then fallback this allocation to a range of { 0, -1 }.
	 * However for now we err on the side of caution and allocate dma
	 * memory under the 4gig boundary.
	 */
	return (_bus_dmamem_alloc_range(t, size, alignment, boundary,
	    segs, nsegs, rsegs, flags, (bus_addr_t)0, (bus_addr_t)0xffffffff));
}

/*
 * Common function for freeing DMA-safe memory.  May be called by
 * bus-specific DMA memory free functions.
*/ void _bus_dmamem_free(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs) { struct vm_page *m; bus_addr_t addr; struct pglist mlist; int curseg; /* * Build a list of pages to free back to the VM system. */ TAILQ_INIT(&mlist); for (curseg = 0; curseg < nsegs; curseg++) { for (addr = segs[curseg].ds_addr; addr < (segs[curseg].ds_addr + segs[curseg].ds_len); addr += PAGE_SIZE) { m = PHYS_TO_VM_PAGE(addr); TAILQ_INSERT_TAIL(&mlist, m, pageq); } } uvm_pglistfree(&mlist); } /* * Common function for mapping DMA-safe memory. May be called by * bus-specific DMA memory map functions. */ int _bus_dmamem_map(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs, size_t size, caddr_t *kvap, int flags) { vaddr_t va, sva; size_t ssize; bus_addr_t addr; int curseg, pmapflags = 0, error; const struct kmem_dyn_mode *kd; if (nsegs == 1 && (flags & BUS_DMA_NOCACHE) == 0) { *kvap = (caddr_t)PMAP_DIRECT_MAP(segs[0].ds_addr); return (0); } if (flags & BUS_DMA_NOCACHE) pmapflags |= PMAP_NOCACHE; size = round_page(size); kd = flags & BUS_DMA_NOWAIT ? &kd_trylock : &kd_waitok; va = (vaddr_t)km_alloc(size, &kv_any, &kp_none, kd); if (va == 0) return (ENOMEM); *kvap = (caddr_t)va; sva = va; ssize = size; for (curseg = 0; curseg < nsegs; curseg++) { for (addr = segs[curseg].ds_addr; addr < (segs[curseg].ds_addr + segs[curseg].ds_len); addr += PAGE_SIZE, va += PAGE_SIZE, size -= PAGE_SIZE) { if (size == 0) panic("_bus_dmamem_map: size botch"); error = pmap_enter(pmap_kernel(), va, addr | pmapflags, PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE | PMAP_WIRED | PMAP_CANFAIL); if (error) { pmap_update(pmap_kernel()); km_free((void *)sva, ssize, &kv_any, &kp_none); return (error); } } } pmap_update(pmap_kernel()); return (0); } /* * Common function for unmapping DMA-safe memory. May be called by * bus-specific DMA memory unmapping functions. 
*/
/*
 * Addresses inside the direct-map window are returned untouched: the
 * single-segment shortcut in _bus_dmamem_map() hands those out without
 * allocating anything.  Everything else is released with km_free().
 *
 * NOTE(review): the upper bound is compared inclusively
 * (kva <= PMAP_DIRECT_END) -- confirm whether PMAP_DIRECT_END is the
 * last valid address or one past it.
 */
void
_bus_dmamem_unmap(bus_dma_tag_t t, caddr_t kva, size_t size)
{

#ifdef DIAGNOSTIC
	if ((u_long)kva & PGOFSET)
		panic("_bus_dmamem_unmap");
#endif
	if (kva >= (caddr_t)PMAP_DIRECT_BASE && kva <= (caddr_t)PMAP_DIRECT_END)
		return;

	km_free(kva, round_page(size), &kv_any, &kp_none);
}

/*
 * Common function for mmap(2)'ing DMA-safe memory.  May be called by
 * bus-specific DMA mmap(2)'ing functions.
 *
 * Treats the segment list as one concatenated region and finds the
 * segment containing byte offset `off'.  Returns that segment's address
 * plus the remaining offset, OR'ed with PMAP_NOCACHE when
 * BUS_DMA_NOCACHE is set in `flags', or -1 if `off' lies beyond the
 * last segment.  `t' and `prot' are not referenced.
 */
paddr_t
_bus_dmamem_mmap(bus_dma_tag_t t, bus_dma_segment_t *segs, int nsegs, off_t off,
    int prot, int flags)
{
	int i, pmapflags = 0;

	if (flags & BUS_DMA_NOCACHE)
		pmapflags |= PMAP_NOCACHE;

	for (i = 0; i < nsegs; i++) {
#ifdef DIAGNOSTIC
		if (off & PGOFSET)
			panic("_bus_dmamem_mmap: offset unaligned");
		if (segs[i].ds_addr & PGOFSET)
			panic("_bus_dmamem_mmap: segment unaligned");
		if (segs[i].ds_len & PGOFSET)
			panic("_bus_dmamem_mmap: segment size not multiple"
			    " of page size");
#endif
		/* Not in this segment: make `off' relative to the next one. */
		if (off >= segs[i].ds_len) {
			off -= segs[i].ds_len;
			continue;
		}

		return ((segs[i].ds_addr + off) | pmapflags);
	}

	/* Page not found. */
	return (-1);
}

/**********************************************************************
 * DMA utility functions
 **********************************************************************/
/*
 * Utility function to load a linear buffer.  lastaddrp holds state
 * between invocations (for multiple-buffer loads).  segp contains
 * the starting segment on entrance, and the ending segment on exit.
 * first indicates if this is the first invocation of this function.
 *
 * Addresses are looked up in p's pmap when p is non-NULL and in the
 * kernel pmap otherwise.  The buffer is processed in chunks that never
 * cross a page or the map's boundary.
 *
 * Returns 0 if the whole buffer was placed, EFBIG if the map ran out of
 * segment slots first.  Panics if a physical address lies above
 * dma_constraint.ucr_high and the map lacks BUS_DMA_64BIT.
 */
int
_bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, void *buf,
    bus_size_t buflen, struct proc *p, int flags, paddr_t *lastaddrp,
    int *segp, int first)
{
	bus_size_t sgsize;
	bus_addr_t curaddr, lastaddr, baddr, bmask;
	vaddr_t vaddr = (vaddr_t)buf;
	int seg;
	pmap_t pmap;

	if (p != NULL)
		pmap = p->p_vmspace->vm_map.pmap;
	else
		pmap = pmap_kernel();

	lastaddr = *lastaddrp;
	bmask  = ~(map->_dm_boundary - 1);

	for (seg = *segp; buflen > 0 ; ) {
		/*
		 * Get the physical address for this segment.
		 *
		 * NOTE(review): the pmap_extract() result is ignored, so
		 * curaddr is used as-is even if the lookup failed --
		 * presumably callers only pass mapped, wired memory.
		 */
		pmap_extract(pmap, vaddr, (paddr_t *)&curaddr);

		if (curaddr > dma_constraint.ucr_high &&
		    (map->_dm_flags & BUS_DMA_64BIT) == 0)
			panic("Non dma-reachable buffer at curaddr %#lx(raw)",
			    curaddr);

		/*
		 * Compute the segment size, and adjust counts.
		 */
		sgsize = PAGE_SIZE - ((u_long)vaddr & PGOFSET);
		if (buflen < sgsize)
			sgsize = buflen;

		/*
		 * Make sure we don't cross any boundaries.
		 */
		if (map->_dm_boundary > 0) {
			baddr = (curaddr + map->_dm_boundary) & bmask;
			if (sgsize > (baddr - curaddr))
				sgsize = (baddr - curaddr);
		}

		/*
		 * Insert chunk into a segment, coalescing with
		 * previous segment if possible.
		 */
		if (first) {
			map->dm_segs[seg].ds_addr = curaddr;
			map->dm_segs[seg].ds_len = sgsize;
			first = 0;
		} else {
			if (curaddr == lastaddr &&
			    (map->dm_segs[seg].ds_len + sgsize) <=
			     map->_dm_maxsegsz &&
			    (map->_dm_boundary == 0 ||
			     (map->dm_segs[seg].ds_addr & bmask) ==
			     (curaddr & bmask)))
				map->dm_segs[seg].ds_len += sgsize;
			else {
				/* Out of slots; buflen != 0 reports it below. */
				if (++seg >= map->_dm_segcnt)
					break;
				map->dm_segs[seg].ds_addr = curaddr;
				map->dm_segs[seg].ds_len = sgsize;
			}
		}

		lastaddr = curaddr + sgsize;
		vaddr += sgsize;
		buflen -= sgsize;
	}

	/* Hand the running state back for the next buffer of this load. */
	*segp = seg;
	*lastaddrp = lastaddr;

	/*
	 * Did we fit?
	 */
	if (buflen != 0)
		return (EFBIG);		/* XXX better return value here? */
	return (0);
}

/*
 * Allocate physical memory from the given physical address range.
 * Called by DMA-safe memory allocation methods.
 *
 * Rounds `size' up to whole pages, obtains the pages with
 * uvm_pglistalloc() and describes them in segs[], merging physically
 * consecutive pages into one segment.  The number of segments written
 * is stored in *rsegs.  The boundary and alignment are recorded in
 * segs[0] before the allocation is attempted.
 *
 * Returns 0 on success or the uvm_pglistalloc() error.
 *
 * NOTE(review): the first page is dereferenced without a NULL check;
 * presumably a successful uvm_pglistalloc() always returns at least one
 * page -- confirm for size == 0.
 */
int
_bus_dmamem_alloc_range(bus_dma_tag_t t, bus_size_t size, bus_size_t alignment,
    bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs,
    int flags, bus_addr_t low, bus_addr_t high)
{
	paddr_t curaddr, lastaddr;
	struct vm_page *m;
	struct pglist mlist;
	int curseg, error, plaflag;

	/* Always round the size. */
	size = round_page(size);

	segs[0]._ds_boundary = boundary;
	segs[0]._ds_align = alignment;

	/*
	 * Allocate pages from the VM system.
	 */
	plaflag = flags & BUS_DMA_NOWAIT ? UVM_PLA_NOWAIT : UVM_PLA_WAITOK;
	if (flags & BUS_DMA_ZERO)
		plaflag |= UVM_PLA_ZERO;

	TAILQ_INIT(&mlist);
	error = uvm_pglistalloc(size, low, high, alignment, boundary,
	    &mlist, nsegs, plaflag);
	if (error)
		return (error);

	/*
	 * Compute the location, size, and number of segments actually
	 * returned by the VM code.
	 */
	m = TAILQ_FIRST(&mlist);
	curseg = 0;
	lastaddr = segs[curseg].ds_addr = VM_PAGE_TO_PHYS(m);
	segs[curseg].ds_len = PAGE_SIZE;

	for (m = TAILQ_NEXT(m, pageq); m != NULL; m = TAILQ_NEXT(m, pageq)) {
		curaddr = VM_PAGE_TO_PHYS(m);
#ifdef DIAGNOSTIC
		/*
		 * NOTE(review): this test runs before curseg is advanced
		 * below, so segs[nsegs] may already have been written by
		 * the time it fires -- confirm whether the check should
		 * sit next to the increment instead.
		 */
		if (curseg == nsegs) {
			printf("uvm_pglistalloc returned too many\n");
			panic("_bus_dmamem_alloc_range");
		}
		if (curaddr < low || curaddr >= high) {
			printf("uvm_pglistalloc returned non-sensical"
			    " address 0x%lx\n", curaddr);
			panic("_bus_dmamem_alloc_range");
		}
#endif
		/* Physically consecutive pages extend the current segment. */
		if (curaddr == (lastaddr + PAGE_SIZE))
			segs[curseg].ds_len += PAGE_SIZE;
		else {
			curseg++;
			segs[curseg].ds_addr = curaddr;
			segs[curseg].ds_len = PAGE_SIZE;
		}
		lastaddr = curaddr;
	}

	*rsegs = curseg + 1;

	return (0);
}
10 87 96 36 60 20 6 71 7 69 126 23 2 21 4 2 11 143 78 65 13 114 111 2 3 3 114 4 111 119 57 77 69 1 3 1 3 117 114 6 3 9 9 2 3 3 2 120 15 15 258 2 254 1 29 27 3 3 27 571 1 576 464 93 33 1 27 460 4 112 144 2 431 38 555 4 6 31 4 555 12 565 566 4 549 292 283 570 571 144 507 69 43 26 199 314 146 56 5 5 56 154 21 133 129 30 144 6 2 145 3 142 17 48 7 30 4 91 4 1 1 96 3 83 43 47 9 96 9 39 35 4 96 2 1 1 22 2 17 1 2 26 6 20 80 58 5 58 12 38 38 1 9 33 5 8 50 3 2 3 5 42 2 5 3 1 3 14 8 36 92 24 8 32 91 2 92 64 29 14 24 152 93 57 44 45 3 1 1 12 1 3 2 34 3 1 20 1 1 1 7 18 7 16 27 15 6 15 25 4 2 23 4 15 2 1 16 3 11 5 2 6 2 8 7 3 1 6 1 1 6 2 3 3 1 7 4 5 7 23 25 2 17 538 532 3 3 539 534 298 304 5 309 801 2 648 75 5 4 2 3 36 21 7 4 4 8 1 1 6 1 2 3 2 2 4 2 4 3 3 2 7 2 23 17 67 52 188 2 118 27 4 2 2 2 2 2 2 2 2 2 3 2 1 3 2 2 4 69 1 43 25 15 6 153 154 4 5 4 3 5 136 5 42 3 5 6 3 31 4 17 24 24 11 16 4 16 5 18 3 2 23 72 209 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 
798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 
1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 
1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 
2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 /* $OpenBSD: uipc_socket.c,v 1.289 2022/09/05 14:56:08 bluhm Exp $ */ /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 
* The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

/* Forward declarations for functions used before they are defined. */
void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

/* kqueue filter callbacks referenced by the filterops tables below. */
void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);
int	filt_solisten(struct knote *kn, long hint);
int	filt_somodify(struct kevent *kev, struct knote *kn);
int	filt_soprocess(struct knote *kn, struct kevent *kev);

/*
 * The four tables below differ only in their f_detach and f_event
 * callbacks; flags, f_modify and f_process are shared.
 */

/* Uses the read-side detach with the listen event callback. */
const struct filterops solisten_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_solisten,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

/* Read-side detach and read event callback. */
const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

/* The only table that uses the write-side detach. */
const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

/* Uses the read-side detach with the exceptional-condition callback. */
const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

/* Default for sominconn unless the build already defines it. */
#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

/* Upper and lower limits solisten() applies to the backlog. */
int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

/* Backing pool for struct socket; set up in soinit(). */
struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

/*
 * Initialize the socket pool and, when SOCKET_SPLICE is configured,
 * the splice pool.
 */
void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

/*
 * Get a socket from socket_pool, passing `prflags' straight to
 * pool_get(), and initialize its lock and reference count.
 *
 * Returns the socket, or NULL if pool_get() returned NULL.
 */
struct socket *
soalloc(int prflags)
{
	struct socket *so;

	so = pool_get(&socket_pool, prflags);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, "solock", RWL_DUPOK);
	refcnt_init(&so->so_refcnt);

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
*/
/*
 * Create a socket of `type' in domain `dom' and attach it to its
 * protocol.  The protocol is looked up by number when `proto' is
 * non-zero and by type otherwise.  Credentials and pid are taken from
 * curproc.
 *
 * Returns 0 with the new socket in *aso, EPROTONOSUPPORT if no usable
 * protocol was found, EPROTOTYPE if the protocol's type differs from
 * `type', or the pru_attach() error (the socket is freed in that case).
 *
 * NOTE(review): the soalloc() result is used without a NULL check;
 * presumably PR_WAITOK means it cannot fail -- confirm.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(PR_WAITOK | PR_ZERO);
	klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
	klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto);
	if (error) {
		/* sofree() only proceeds once SS_NOFDREF is set. */
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}

/*
 * Bind `so' to the address in `nam' by calling the protocol's bind
 * routine; its result is returned unchanged.  The socket must be
 * locked by the caller.
 */
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

/*
 * Put `so' into the listening state with the given backlog.  The
 * socket must be locked by the caller.
 *
 * A backlog that is negative or above somaxconn becomes somaxconn; the
 * result is then raised to sominconn if it is lower.
 *
 * Returns 0, EINVAL if the socket is connected, connecting or
 * disconnecting, EOPNOTSUPP if it is spliced, or the pru_listen()
 * error.
 */
int
solisten(struct socket *so, int backlog)
{
	int error;

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	/* Only mark as accepting while the completed queue is empty. */
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	return (0);
}

/* Bits for the third argument of sounsplice(), as used in sofree(). */
#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
/*
 * Release a socket once it has neither a protocol control block nor a
 * file descriptor reference.  Must be called with `so' locked.  On the
 * paths that check `keep_lock', the lock is dropped before returning
 * unless keep_lock is non-zero.
 *
 * Nothing is freed while so_pcb is set, SS_NOFDREF is clear, or the
 * socket still sits on its head's completed (so_q) queue.
 *
 * NOTE(review): the persocket early return taken when `so' is no longer
 * on head->so_q0 unlocks `so' without looking at keep_lock -- confirm
 * that callers passing keep_lock expect this.
 */
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			/*
			 * Hold references on both sockets across the window
			 * where `so' is unlocked, then take the locks in
			 * head-then-so order.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			/* Re-check the queue now that both locks are held. */
			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		/* refcnt_finalize() is called with the socket unlocked. */
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_sel.si_note);
	klist_free(&so->so_snd.sb_sel.si_note);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			/* Spliced to itself: both directions go away. */
			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sounsplice(so, so->so_sp->ssp_socket, freeing);
		}
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);
	sorflush(so);
	if (!keep_lock)
		sounlock(so);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

/*
 * Convert so_linger (seconds) to the nanosecond timeout used for the
 * linger sleep in soclose(); a value of 0 yields INFSLP.
 */
static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * Takes the socket lock itself; the final sofree() releases it.
 * Returns the first error seen from sodisconnect(), the linger sleep or
 * pru_detach(), or 0.  Panics if SS_NOFDREF is already set.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Do not wait for the disconnect with MSG_DONTWAIT. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		/* Keep the earlier error, if there was one. */
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		if (persocket) {
			/* Wait concurrent sonewconn() threads. */
			while (so->so_newconn > 0) {
				so->so_state |= SS_NEWCONN_WAIT;
				sosleep_nsec(so, &so->so_newconn, PSOCK,
				    "netlck", INFSLP);
			}
		}

		/*
		 * Abort everything still queued on this listening socket.
		 * With per-socket locks the child is locked before it is
		 * dequeued, and the listener's lock is dropped around
		 * soabort() and re-taken afterwards.
		 */
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}

/*
 * Abort `so' through the protocol's abort routine.  The socket must be
 * locked by the caller.
 */
void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

/*
 * Accept the queued connection `so'.  The socket must be locked by the
 * caller and must still have SS_NOFDREF set (panics otherwise); the
 * flag is cleared here.
 *
 * Returns the pru_accept() result, or ECONNABORTED without calling the
 * protocol when the socket is already disconnected and the protocol has
 * PR_ABRTACPTDIS set.
 */
int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

/*
 * Connect `so' to the address in `nam'.  The socket must be locked by
 * the caller.
 *
 * Returns EOPNOTSUPP for a listening socket, EISCONN as described
 * below, or the pru_connect() result.
 */
int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	/* A failed sodisconnect() is also reported as EISCONN. */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

/*
 * Connect two sockets to each other through pru_connect2().  Takes the
 * locks itself: both sockets when so1 uses per-socket locking, only so1
 * otherwise.  Returns the pru_connect2() result.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

/*
 * Start disconnecting `so'.  The socket must be locked by the caller.
 *
 * Returns ENOTCONN if the socket is not connected, EALREADY if a
 * disconnect is already in progress, otherwise the pru_disconnect()
 * result.
 */
int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

/* Map MSG_DONTWAIT in `f' to M_NOWAIT, anything else to M_WAITOK. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
* The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not). Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
/*
 * Locking as visible here: the socket lock is taken with solock() and
 * the send buffer with sblock(); the socket lock is dropped around
 * m_getuio() while user data is copied into mbufs.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	/* A prepackaged chain in "top" is always sent in one piece. */
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

/* Record an error and leave through the common unlock path. */
#define	snderr(errno)	{ error = errno; goto release; }

	solock(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		/* Report a pending socket error once and clear it. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				/*
				 * Only a control-only send (no data, some
				 * control) is let through when unconnected.
				 */
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			/* AF_UNIX: control length is not counted here. */
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			/* Wait for buffer space, then start over. */
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/* Copy from the uio without the socket lock. */
				sounlock(so);
				error = m_getuio(&top, atomic, space, uio);
				solock(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			/* Last chunk: clear the "more is coming" state. */
			if (resid == 0)
				so->so_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			/*
			 * The chains were handed to the protocol; forget
			 * them so the m_freem() calls at "out" are no-ops.
			 */
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock(so);
	m_freem(top);
	m_freem(control);
	return (error);
}

/*
 * Build an mbuf chain from user data described by "uio".
 *
 * At most "space" bytes are copied.  On success the chain, headed by a
 * packet header mbuf whose m_pkthdr.len is the number of bytes copied,
 * is stored in *mp and 0 is returned.  If uiomove() fails, the partial
 * chain is freed and its error is returned; *mp is not written.
 * When "atomic" is set, room for protocol headers is left at the front
 * of the first mbuf where it fits.
 */
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		/* The first mbuf carries the packet header. */
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			/*
			 * Try a cluster sized for the data first, then a
			 * standard one; fall back to the plain mbuf.
			 */
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the callers locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
*/
/*
 * Outputs, as far as this function shows: *paddr receives the record's
 * address mbuf (or a copy when peeking), *controlp the control mbufs,
 * *mp0 the data chain when supplied, and *flagsp is OR-ed with the
 * MSG_* flags gathered while receiving.  All of them may be NULL.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/* Out-of-band data is fetched from the protocol, not the sockbuf. */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	solock_shared(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock_shared(so);
		return (error);
	}

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	/* A spliced socket is treated as having no data to read here. */
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if (so->so_error) {
			/* Deliver queued data before reporting the error. */
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		/* OOB data or an end-of-record mark: do not wait for more. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(so, &so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		if (error) {
			sounlock_shared(so);
			return (error);
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	/* Step 1: the address mbuf, for protocols that prepend one. */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				/* Hand the mbuf itself to the caller. */
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	/* Step 2: control mbufs. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					/* Externalize without the lock. */
					sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					solock_shared(so);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		/* Re-read the next record pointer from the sockbuf. */
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	/*
	 * Step 3: data mbufs.  moff is the offset into the current mbuf
	 * and offset the distance already peeked towards the OOB mark.
	 */
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		/* Do not mix OOB and normal data in one receive. */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  Next can be received
			 * or disposed by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_state &= ~SS_RCVATMARK;
		/* Limit len to the request, the OOB mark and this mbuf. */
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			solock_shared(so);
			/*
			 * On a copy error account for the bytes anyway so
			 * the sockbuf keeps draining; the error is returned
			 * at the end.
			 */
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* The whole rest of this mbuf was consumed. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					/* Move the mbuf to the caller's chain. */
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Only part of this mbuf was consumed. */
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		/* Stop at the out-of-band mark. */
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock_shared(so);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocol with data left over: the record is truncated. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
	}
	/* Nothing was transferred although data was wanted: try again. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(so, &so->so_rcv);
	sounlock_shared(so);
	return (error);
}

/*
 * Shut down part of a connection.  "how" selects the direction:
 * SHUT_RD flushes the receive side with sorflush(), SHUT_WR calls
 * pru_shutdown(), SHUT_RDWR does both.  Any other value gives EINVAL.
 * Takes and releases the socket lock itself.
 */
int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = pru_shutdown(so);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so);

	return (error);
}

/*
 * Flush the receive buffer of a socket: mark it as unable to receive
 * more, detach the queued mbufs, zero the sockbuf fields between
 * sb_startzero and sb_endzero, and free the mbufs.  For protocols with
 * PR_RIGHTS the domain's dom_dispose hook sees the chain first.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	sb->sb_flags |= SB_NOINTR;
	error = sblock(so, sb, M_WAITOK);
	/* with SB_NOINTR and M_WAITOK sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	/* Remember the chain; the memset() below clears sb_mb. */
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

/* Shorthands for the splice state kept in so->so_sp. */
#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

/*
 * Splice socket "so" (the source) to the socket behind file descriptor
 * "fd" (the drain), or remove an existing splice when fd is negative.
 * "max" is stored as the splice maximum length and "tv", if given, as
 * the idle timeout.  The socket lock must be held by the caller
 * (asserted); it is dropped and re-taken around FRELE() on the way out
 * once a file reference was obtained.
 * Returns 0 or an errno value.
 */
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file	*fp;
	struct socket	*sosp;
	struct sosplice	*sp;
	struct taskq	*tq;
	int		 error = 0;

	soassertlocked(so);

	/* Create the shared task queue on first use. */
	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	}
	if (sosplice_taskq == NULL)
		return (ENOMEM);

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL) {
		/*
		 * so_sp is checked again after pool_get(); if it was set
		 * in the meantime the fresh allocation is returned.
		 */
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (so->so_sp == NULL)
			so->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
		if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
			return (error);
		}
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		sbunlock(so, &so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	/* Both sockets must use the same send routine. */
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}
	if (sosp->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (sosp->so_sp == NULL)
			sosp->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* Lock both receive and send buffer. */
	if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
		goto frele;
	}
	if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(so, &so->so_rcv);
		goto frele;
	}

	/* Either end is already part of a splice. */
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sbunlock(sosp, &sosp->so_snd);
	sbunlock(so, &so->so_rcv);
 frele:
	/*
	 * FRELE() must not be called with the socket lock held. It is safe to
	 * release the lock here as long as no other operation happen on the
	 * socket when sosplice() returns. The dance could be avoided by
	 * grabbing the socket lock inside this function.
	 */
	sounlock(so);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}

/*
 * Dissolve the splice from source "so" to drain "sosp": cancel the
 * splice task and idle timeout, clear SB_SPLICE on both buffers and
 * the cross links, then wake up each side unless "freeing" carries the
 * corresponding SOSP_FREEING_READ / SOSP_FREEING_WRITE bit.
 * The socket lock must be held by the caller (asserted).
 */
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

/*
 * Idle timeout handler registered by sosplice().  If the socket is
 * still spliced, set so_error to ETIMEDOUT and dissolve the splice.
 */
void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

/*
 * Splice task handler registered by sosplice().  Moves data with
 * somove() if the socket is still spliced.
 */
void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * after all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

/*
 * Final step of the delayed free: return the splice state and then the
 * socket itself to their pools.
 */
void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
* Return value 0 means splicing has been finished, 1 continue. */ int somove(struct socket *so, int wait) { struct socket *sosp = so->so_sp->ssp_socket; struct mbuf *m, **mp, *nextrecord; u_long len, off, oobmark; long space; int error = 0, maxreached = 0; unsigned int state; soassertlocked(so); nextpkt: if (so->so_error) { error = so->so_error; goto release; } if (sosp->so_state & SS_CANTSENDMORE) { error = EPIPE; goto release; } if (sosp->so_error && sosp->so_error != ETIMEDOUT && sosp->so_error != EFBIG && sosp->so_error != ELOOP) { error = sosp->so_error; goto release; } if ((sosp->so_state & SS_ISCONNECTED) == 0) goto release; /* Calculate how many bytes can be copied now. */ len = so->so_rcv.sb_datacc; if (so->so_splicemax) { KASSERT(so->so_splicelen < so->so_splicemax); if (so->so_splicemax <= so->so_splicelen + len) { len = so->so_splicemax - so->so_splicelen; maxreached = 1; } } space = sbspace(sosp, &sosp->so_snd); if (so->so_oobmark && so->so_oobmark < len && so->so_oobmark < space + 1024) space += 1024; if (space <= 0) { maxreached = 0; goto release; } if (space < len) { maxreached = 0; if (space < sosp->so_snd.sb_lowat) goto release; len = space; } sosp->so_state |= SS_ISSENDING; SBLASTRECORDCHK(&so->so_rcv, "somove 1"); SBLASTMBUFCHK(&so->so_rcv, "somove 1"); m = so->so_rcv.sb_mb; if (m == NULL) goto release; nextrecord = m->m_nextpkt; /* Drop address and control information not used with splicing. */ if (so->so_proto->pr_flags & PR_ADDR) { #ifdef DIAGNOSTIC if (m->m_type != MT_SONAME) panic("somove soname: so %p, so_type %d, m %p, " "m_type %d", so, so->so_type, m, m->m_type); #endif m = m->m_next; } while (m && m->m_type == MT_CONTROL) m = m->m_next; if (m == NULL) { sbdroprecord(so, &so->so_rcv); if (so->so_proto->pr_flags & PR_WANTRCVD) pru_rcvd(so); goto nextpkt; } /* * By splicing sockets connected to localhost, userland might create a * loop. Dissolve splicing with error if loop is detected by counter. 
* * If we deal with looped broadcast/multicast packet we bail out with * no error to suppress splice termination. */ if ((m->m_flags & M_PKTHDR) && ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) || ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) { error = ELOOP; goto release; } if (so->so_proto->pr_flags & PR_ATOMIC) { if ((m->m_flags & M_PKTHDR) == 0) panic("somove !PKTHDR: so %p, so_type %d, m %p, " "m_type %d", so, so->so_type, m, m->m_type); if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) { error = EMSGSIZE; goto release; } if (len < m->m_pkthdr.len) goto release; if (m->m_pkthdr.len < len) { maxreached = 0; len = m->m_pkthdr.len; } /* * Throw away the name mbuf after it has been assured * that the whole first record can be processed. */ m = so->so_rcv.sb_mb; sbfree(so, &so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); sbsync(&so->so_rcv, nextrecord); } /* * Throw away the control mbufs after it has been assured * that the whole first record can be processed. */ m = so->so_rcv.sb_mb; while (m && m->m_type == MT_CONTROL) { sbfree(so, &so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sbsync(&so->so_rcv, nextrecord); } SBLASTRECORDCHK(&so->so_rcv, "somove 2"); SBLASTMBUFCHK(&so->so_rcv, "somove 2"); /* Take at most len mbufs out of receive buffer. */ for (off = 0, mp = &m; off <= len && *mp; off += (*mp)->m_len, mp = &(*mp)->m_next) { u_long size = len - off; #ifdef DIAGNOSTIC if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER) panic("somove type: so %p, so_type %d, m %p, " "m_type %d", so, so->so_type, *mp, (*mp)->m_type); #endif if ((*mp)->m_len > size) { /* * Move only a partial mbuf at maximum splice length or * if the drain buffer is too small for this large mbuf. 
*/ if (!maxreached && so->so_snd.sb_datacc > 0) { len -= size; break; } *mp = m_copym(so->so_rcv.sb_mb, 0, size, wait); if (*mp == NULL) { len -= size; break; } so->so_rcv.sb_mb->m_data += size; so->so_rcv.sb_mb->m_len -= size; so->so_rcv.sb_cc -= size; so->so_rcv.sb_datacc -= size; } else { *mp = so->so_rcv.sb_mb; sbfree(so, &so->so_rcv, *mp); so->so_rcv.sb_mb = (*mp)->m_next; sbsync(&so->so_rcv, nextrecord); } } *mp = NULL; SBLASTRECORDCHK(&so->so_rcv, "somove 3"); SBLASTMBUFCHK(&so->so_rcv, "somove 3"); SBCHECK(so, &so->so_rcv); if (m == NULL) goto release; m->m_nextpkt = NULL; if (m->m_flags & M_PKTHDR) { m_resethdr(m); m->m_pkthdr.len = len; } /* Send window update to source peer as receive buffer has changed. */ if (so->so_proto->pr_flags & PR_WANTRCVD) pru_rcvd(so); /* Receive buffer did shrink by len bytes, adjust oob. */ state = so->so_state; so->so_state &= ~SS_RCVATMARK; oobmark = so->so_oobmark; so->so_oobmark = oobmark > len ? oobmark - len : 0; if (oobmark) { if (oobmark == len) so->so_state |= SS_RCVATMARK; if (oobmark >= len) oobmark = 0; } /* * Handle oob data. If any malloc fails, ignore error. * TCP urgent data is not very reliable anyway. 
*/ while (((state & SS_RCVATMARK) || oobmark) && (so->so_options & SO_OOBINLINE)) { struct mbuf *o = NULL; if (state & SS_RCVATMARK) { o = m_get(wait, MT_DATA); state &= ~SS_RCVATMARK; } else if (oobmark) { o = m_split(m, oobmark, wait); if (o) { error = pru_send(sosp, m, NULL, NULL); if (error) { if (sosp->so_state & SS_CANTSENDMORE) error = EPIPE; m_freem(o); goto release; } len -= oobmark; so->so_splicelen += oobmark; m = o; o = m_get(wait, MT_DATA); } oobmark = 0; } if (o) { o->m_len = 1; *mtod(o, caddr_t) = *mtod(m, caddr_t); error = pru_sendoob(sosp, o, NULL, NULL); if (error) { if (sosp->so_state & SS_CANTSENDMORE) error = EPIPE; m_freem(m); goto release; } len -= 1; so->so_splicelen += 1; if (oobmark) { oobmark -= 1; if (oobmark == 0) state |= SS_RCVATMARK; } m_adj(m, 1); } } /* Append all remaining data to drain socket. */ if (so->so_rcv.sb_cc == 0 || maxreached) sosp->so_state &= ~SS_ISSENDING; error = pru_send(sosp, m, NULL, NULL); if (error) { if (sosp->so_state & SS_CANTSENDMORE) error = EPIPE; goto release; } so->so_splicelen += len; /* Move several packets if possible. */ if (!maxreached && nextrecord) goto nextpkt; release: sosp->so_state &= ~SS_ISSENDING; if (!error && maxreached && so->so_splicemax == so->so_splicelen) error = EFBIG; if (error) so->so_error = error; if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) || (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) { sounsplice(so, sosp, 0); return (0); } if (timerisset(&so->so_idletv)) timeout_add_tv(&so->so_idleto, &so->so_idletv); return (1); } #endif /* SOCKET_SPLICE */ void sorwakeup(struct socket *so) { soassertlocked(so); #ifdef SOCKET_SPLICE if (so->so_rcv.sb_flags & SB_SPLICE) { /* * TCP has a sendbuffer that can handle multiple packets * at once. So queue the stream a bit to accumulate data. * The sosplice thread will call somove() later and send * the packets calling tcp_output() only once. * In the UDP case, send out the packets immediately. 
* Using a thread would make things slower. */ if (so->so_proto->pr_flags & PR_WANTRCVD) task_add(sosplice_taskq, &so->so_splicetask); else somove(so, M_DONTWAIT); } if (isspliced(so)) return; #endif sowakeup(so, &so->so_rcv); if (so->so_upcall) (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT); } void sowwakeup(struct socket *so) { soassertlocked(so); #ifdef SOCKET_SPLICE if (so->so_snd.sb_flags & SB_SPLICE) task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask); if (issplicedback(so)) return; #endif sowakeup(so, &so->so_snd); } int sosetopt(struct socket *so, int level, int optname, struct mbuf *m) { int error = 0; soassertlocked(so); if (level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput) { error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, level, optname, m); return (error); } error = ENOPROTOOPT; } else { switch (optname) { case SO_BINDANY: if ((error = suser(curproc)) != 0) /* XXX */ return (error); break; } switch (optname) { case SO_LINGER: if (m == NULL || m->m_len != sizeof (struct linger) || mtod(m, struct linger *)->l_linger < 0 || mtod(m, struct linger *)->l_linger > SHRT_MAX) return (EINVAL); so->so_linger = mtod(m, struct linger *)->l_linger; /* FALLTHROUGH */ case SO_BINDANY: case SO_DEBUG: case SO_KEEPALIVE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_ZEROIZE: if (m == NULL || m->m_len < sizeof (int)) return (EINVAL); if (*mtod(m, int *)) so->so_options |= optname; else so->so_options &= ~optname; break; case SO_DONTROUTE: if (m == NULL || m->m_len < sizeof (int)) return (EINVAL); if (*mtod(m, int *)) error = EOPNOTSUPP; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: { u_long cnt; if (m == NULL || m->m_len < sizeof (int)) return (EINVAL); cnt = *mtod(m, int *); if ((long)cnt <= 0) cnt = 1; switch (optname) { case SO_SNDBUF: if (so->so_state & SS_CANTSENDMORE) return (EINVAL); if (sbcheckreserve(cnt, so->so_snd.sb_wat) || 
sbreserve(so, &so->so_snd, cnt))
					return (ENOBUFS);
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(so, &so->so_rcv, cnt))
					return (ENOBUFS);
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				/* The low-water mark is capped at sb_hiwat. */
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				/* The low-water mark is capped at sb_hiwat. */
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			/* A zero timeout is stored as INFSLP. */
			if (nsecs == 0)
				nsecs = INFSLP;
			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo_nsecs = nsecs;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo_nsecs = nsecs;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			/*
			 * Re-issue the request to pr_ctloutput with the
			 * level replaced by the pr_protocol of the domain's
			 * dom_protosw; its result is returned directly.
			 */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				return (error);
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			/*
			 * No value: sosplice() is called with fd -1.
			 * An int-sized value carries only the fd; a full
			 * struct splice also supplies sp_max and sp_idle.
			 */
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				return (EINVAL);
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Also offer the option to the protocol; result ignored. */
		if (error == 0 && so->so_proto->pr_ctloutput) {
			(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
		}
	}

	return (error);
}

/*
 * Fetch socket option `optname' at `level' into mbuf `m'.
 *
 * m->m_len is set to the size of the value that was stored.  Levels other
 * than SOL_SOCKET are delegated to the protocol's pr_ctloutput routine
 * (ENOPROTOOPT when the protocol has none).
 *
 * Returns 0 or an errno value.  The socket lock must be held.
 */
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			if (error)
				return (error);
			return (0);
		} else
			return (ENOPROTOOPT);
	} else {
		/* Default result size; cases below override it as needed. */
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			/* Yields the flag bit itself (or 0), not 0/1. */
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error also clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo_nsecs :
			    so->so_rcv.sb_timeo_nsecs);

			m->m_len = sizeof(struct timeval);
			/* INFSLP is reported as an all-zero timeval. */
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			/*
			 * Re-issue the request to pr_ctloutput with the
			 * level replaced by the pr_protocol of the domain's
			 * dom_protosw.
			 */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			/* Reports so_sp->ssp_len, or 0 without so_sp. */
			m->m_len = sizeof(off_t);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			/*
			 * Only answered when pr_protocol equals AF_UNIX, and
			 * then only once UNP_FEIDS is set in unp_flags.
			 */
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					break;
				}
				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

/*
 * Report out-of-band data on `so': SIGURG is posted through so_sigio and
 * the knotes on the receive buffer's klist are activated.
 */
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	KNOTE(&so->so_rcv.sb_sel.si_note, 0);
}

/*
 * Attach knote `kn' to the socket behind kn->kn_fp.
 *
 * The filter ops are chosen from kn_filter (a socket with SO_ACCEPTCONN
 * set gets the listen filter for EVFILT_READ) and the knote is inserted
 * into the klist of the matching socket buffer.  The socket lock is held
 * around the whole operation.  Returns 0, or EINVAL for any other filter.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	solock(so);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		sounlock(so);
		return (EINVAL);
	}

	klist_insert_locked(&sb->sb_sel.si_note, kn);
	sounlock(so);

	return (0);
}

/* Detach `kn' from the klist of the socket's receive buffer. */
void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
}

/*
 * Read filter: returns non-zero when the knote should fire.
 * kn_data is set to so_rcv.sb_cc.  The socket lock must be held.
 */
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;
soassertlocked(so);

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	/* The chain below is only evaluated when the socket is not spliced. */
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		/* Caller-supplied low-water mark. */
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

/* Detach `kn' from the klist of the socket's send buffer. */
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_sel.si_note, kn);
}

/*
 * Write filter: returns non-zero when the knote should fire.
 *
 * kn_data is set to sbspace() of the send buffer.  The knote fires when
 * SS_CANTSENDMORE is set (EV_EOF is added and so_error is copied into
 * kn_fflags) or when so_error is pending.  It never fires while a
 * PR_CONNREQUIRED protocol is not connected.  Otherwise kn_data is
 * compared with the NOTE_LOWAT value or with so_snd.sb_lowat.
 * The socket lock must be held.
 */
int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	soassertlocked(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

/*
 * Exceptional-condition filter: returns non-zero when the knote should
 * fire.
 *
 * With NOTE_OOB requested, the knote fires while so_oobmark is non-zero or
 * SS_RCVATMARK is set.  For __EV_POLL knotes a disconnected socket
 * additionally raises __EV_HUP.  The socket lock must be held.
 */
int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			/*
			 * NOTE(review): kn_data is not assigned in this
			 * function before the subtraction, so the result
			 * depends on whatever value the knote already
			 * holds -- confirm this is intended.
			 */
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	/* Evaluated regardless of the splice check above. */
	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

/*
 * Listen filter: returns non-zero when the knote should fire.
 * kn_data is set to so_qlen.  The socket lock must be held.
 */
int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int active;

	soassertlocked(so);

	kn->kn_data = so->so_qlen;
	active = (kn->kn_data != 0);

	if
(kn->kn_flags & (__EV_POLL | __EV_SELECT)) { if (so->so_state & SS_ISDISCONNECTED) { kn->kn_flags |= __EV_HUP; active = 1; } else { active = soreadable(so); } } return (active); } int filt_somodify(struct kevent *kev, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; int rv; solock(so); rv = knote_modify(kev, kn); sounlock(so); return (rv); } int filt_soprocess(struct knote *kn, struct kevent *kev) { struct socket *so = kn->kn_fp->f_data; int rv; solock(so); rv = knote_process(kn, kev); sounlock(so); return (rv); } void klist_soassertlk(void *arg) { struct socket *so = arg; soassertlocked(so); } int klist_solock(void *arg) { struct socket *so = arg; solock(so); return (1); } void klist_sounlock(void *arg, int ls) { struct socket *so = arg; sounlock(so); } const struct klistops socket_klistops = { .klo_assertlk = klist_soassertlk, .klo_lock = klist_solock, .klo_unlock = klist_sounlock, }; #ifdef DDB void sobuf_print(struct sockbuf *, int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))); void sobuf_print(struct sockbuf *sb, int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) { (*pr)("\tsb_cc: %lu\n", sb->sb_cc); (*pr)("\tsb_datacc: %lu\n", sb->sb_datacc); (*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat); (*pr)("\tsb_wat: %lu\n", sb->sb_wat); (*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt); (*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax); (*pr)("\tsb_lowat: %ld\n", sb->sb_lowat); (*pr)("\tsb_mb: %p\n", sb->sb_mb); (*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail); (*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord); (*pr)("\tsb_sel: ...\n"); (*pr)("\tsb_flags: %i\n", sb->sb_flags); (*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs); } void so_print(void *v, int (*pr)(const char *, ...) 
__attribute__((__format__(__kprintf__,1,2)))) { struct socket *so = v; (*pr)("socket %p\n", so); (*pr)("so_type: %i\n", so->so_type); (*pr)("so_options: 0x%04x\n", so->so_options); /* %b */ (*pr)("so_linger: %i\n", so->so_linger); (*pr)("so_state: 0x%04x\n", so->so_state); (*pr)("so_pcb: %p\n", so->so_pcb); (*pr)("so_proto: %p\n", so->so_proto); (*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio); (*pr)("so_head: %p\n", so->so_head); (*pr)("so_onq: %p\n", so->so_onq); (*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0)); (*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q)); (*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe)); (*pr)("so_q0len: %i\n", so->so_q0len); (*pr)("so_qlen: %i\n", so->so_qlen); (*pr)("so_qlimit: %i\n", so->so_qlimit); (*pr)("so_timeo: %i\n", so->so_timeo); (*pr)("so_obmark: %lu\n", so->so_oobmark); (*pr)("so_sp: %p\n", so->so_sp); if (so->so_sp != NULL) { (*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket); (*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback); (*pr)("\tssp_len: %lld\n", (unsigned long long)so->so_sp->ssp_len); (*pr)("\tssp_max: %lld\n", (unsigned long long)so->so_sp->ssp_max); (*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec, so->so_sp->ssp_idletv.tv_usec); (*pr)("\tssp_idleto: %spending (@%i)\n", timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ", so->so_sp->ssp_idleto.to_time); } (*pr)("so_rcv:\n"); sobuf_print(&so->so_rcv, pr); (*pr)("so_snd:\n"); sobuf_print(&so->so_snd, pr); (*pr)("so_upcall: %p so_upcallarg: %p\n", so->so_upcall, so->so_upcallarg); (*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid); (*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid); (*pr)("so_cpid: %d\n", so->so_cpid); } #endif
/*	$OpenBSD: pci.c,v 1.125 2022/06/17 10:08:36 kettenis Exp $	*/
/*	$NetBSD: pci.c,v 1.31 1997/06/06 23:48:04 thorpej Exp $	*/

/*
 * Copyright (c) 1995, 1996 Christopher G. Demetriou.
All rights reserved. * Copyright (c) 1994 Charles Hannum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Charles Hannum. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * PCI bus autoconfiguration. 
*/

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/malloc.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcidevs.h>
#include <dev/pci/ppbreg.h>

int pcimatch(struct device *, void *, void *);
void pciattach(struct device *, struct device *, void *);
int pcidetach(struct device *, int);
int pciactivate(struct device *, int);
void pci_suspend(struct pci_softc *);
void pci_powerdown(struct pci_softc *);
void pci_resume(struct pci_softc *);

/*
 * One saved MSI-X table entry.
 * NOTE(review): presumably message address (low and upper 32 bits),
 * message data and vector control -- confirm against the code that fills
 * the table.
 */
struct msix_vector {
	uint32_t mv_ma;
	uint32_t mv_mau32;
	uint32_t mv_md;
	uint32_t mv_vc;
};

/* Number of pcireg_t-sized registers between MAPREG_START and MAPREG_END. */
#define NMAPREG			((PCI_MAPREG_END - PCI_MAPREG_START) / \
				    sizeof(pcireg_t))

/*
 * Per-device bookkeeping kept on the sc_devs list of the bus softc.
 * Most fields hold configuration-space values saved by pci_suspend() and
 * written back by pci_resume().
 */
struct pci_dev {
	struct device *pd_dev;		/* result of config_found_sm() */
	LIST_ENTRY(pci_dev) pd_next;	/* linkage on sc_devs */
	pcitag_t pd_tag;		/* pci register tag */
	pcireg_t pd_csr;		/* saved PCI_COMMAND_STATUS_REG */
	pcireg_t pd_bhlc;		/* saved PCI_BHLC_REG */
	pcireg_t pd_int;		/* saved PCI_INTERRUPT_REG */
	pcireg_t pd_map[NMAPREG];	/* saved mapping registers */
	pcireg_t pd_mask[NMAPREG];	/* readback after writing all-ones */
	pcireg_t pd_msi_mc;		/* saved MSI capability registers */
	pcireg_t pd_msi_ma;
	pcireg_t pd_msi_mau32;
	pcireg_t pd_msi_md;
	pcireg_t pd_msix_mc;		/* handed to pci_{suspend,resume}_msix() */
	struct msix_vector *pd_msix_table; /* from pci_alloc_msix_table() */
	int pd_pmcsr_state;		/* power state seen by pci_powerdown() */
	int pd_vga_decode;		/* set to 1 for VGA-class devices */
};

#ifdef APERTURE
extern int allowaperture;
#endif

/* Autoconf attachment glue for pci(4) busses. */
const struct cfattach pci_ca = {
	sizeof(struct pci_softc), pcimatch, pciattach, pcidetach, pciactivate
};

struct cfdriver pci_cd = {
	NULL, "pci", DV_DULL
};

int pci_ndomains;

struct proc *pci_vga_proc;
struct pci_softc *pci_vga_pci;	/* set by pciattach() for the bus with VGA */
pcitag_t pci_vga_tag;

/* Non-zero enables the power-state changes in pci_powerdown/pci_resume. */
int pci_dopm;

int pciprint(void *, const char *);
int pcisubmatch(struct device *, void *, void *);

#ifdef PCI_MACHDEP_ENUMERATE_BUS
#define pci_enumerate_bus PCI_MACHDEP_ENUMERATE_BUS
#else
int pci_enumerate_bus(struct pci_softc *,
    int (*)(struct pci_attach_args *), struct pci_attach_args *);
#endif
int pci_reserve_resources(struct pci_attach_args *);
int pci_primary_vga(struct pci_attach_args *);

/*
 * Important note about PCI-ISA bridges:
 *
 * Callbacks are used to configure these devices so that ISA/EISA bridges
 * can attach their child busses after PCI configuration is done.
* * This works because: * (1) there can be at most one ISA/EISA bridge per PCI bus, and * (2) any ISA/EISA bridges must be attached to primary PCI * busses (i.e. bus zero). * * That boils down to: there can only be one of these outstanding * at a time, it is cleared when configuring PCI bus 0 before any * subdevices have been found, and it is run after all subdevices * of PCI bus 0 have been found. * * This is needed because there are some (legacy) PCI devices which * can show up as ISA/EISA devices as well (the prime example of which * are VGA controllers). If you attach ISA from a PCI-ISA/EISA bridge, * and the bridge is seen before the video board is, the board can show * up as an ISA device, and that can (bogusly) complicate the PCI device's * attach code, or make the PCI device not be properly attached at all. * * We use the generic config_defer() facility to achieve this. */ int pcimatch(struct device *parent, void *match, void *aux) { struct cfdata *cf = match; struct pcibus_attach_args *pba = aux; if (strcmp(pba->pba_busname, cf->cf_driver->cd_name)) return (0); /* Check the locators */ if (cf->pcibuscf_bus != PCIBUS_UNK_BUS && cf->pcibuscf_bus != pba->pba_bus) return (0); /* sanity */ if (pba->pba_bus < 0 || pba->pba_bus > 255) return (0); /* * XXX check other (hardware?) 
indicators */ return (1); } void pciattach(struct device *parent, struct device *self, void *aux) { struct pcibus_attach_args *pba = aux; struct pci_softc *sc = (struct pci_softc *)self; pci_attach_hook(parent, self, pba); printf("\n"); LIST_INIT(&sc->sc_devs); sc->sc_iot = pba->pba_iot; sc->sc_memt = pba->pba_memt; sc->sc_dmat = pba->pba_dmat; sc->sc_pc = pba->pba_pc; sc->sc_flags = pba->pba_flags; sc->sc_ioex = pba->pba_ioex; sc->sc_memex = pba->pba_memex; sc->sc_pmemex = pba->pba_pmemex; sc->sc_busex = pba->pba_busex; sc->sc_domain = pba->pba_domain; sc->sc_bus = pba->pba_bus; sc->sc_bridgetag = pba->pba_bridgetag; sc->sc_bridgeih = pba->pba_bridgeih; sc->sc_maxndevs = pci_bus_maxdevs(pba->pba_pc, pba->pba_bus); sc->sc_intrswiz = pba->pba_intrswiz; sc->sc_intrtag = pba->pba_intrtag; /* Reserve our own bus number. */ if (sc->sc_busex) extent_alloc_region(sc->sc_busex, sc->sc_bus, 1, EX_NOWAIT); pci_enumerate_bus(sc, pci_reserve_resources, NULL); /* Find the VGA device that's currently active. */ if (pci_enumerate_bus(sc, pci_primary_vga, NULL)) pci_vga_pci = sc; pci_enumerate_bus(sc, NULL, NULL); } int pcidetach(struct device *self, int flags) { return pci_detach_devices((struct pci_softc *)self, flags); } int pciactivate(struct device *self, int act) { int rv = 0; switch (act) { case DVACT_SUSPEND: rv = config_activate_children(self, act); pci_suspend((struct pci_softc *)self); break; case DVACT_RESUME: pci_resume((struct pci_softc *)self); rv = config_activate_children(self, act); break; case DVACT_POWERDOWN: rv = config_activate_children(self, act); pci_powerdown((struct pci_softc *)self); break; default: rv = config_activate_children(self, act); break; } return (rv); } void pci_suspend(struct pci_softc *sc) { struct pci_dev *pd; pcireg_t bhlc, reg; int off, i; LIST_FOREACH(pd, &sc->sc_devs, pd_next) { /* * Only handle header type 0 here; PCI-PCI bridges and * CardBus bridges need special handling, which will * be done in their specific drivers. 
*/
		bhlc = pci_conf_read(sc->sc_pc, pd->pd_tag, PCI_BHLC_REG);
		if (PCI_HDRTYPE_TYPE(bhlc) != 0)
			continue;

		/* Save registers that may get lost. */
		for (i = 0; i < NMAPREG; i++)
			pd->pd_map[i] = pci_conf_read(sc->sc_pc, pd->pd_tag,
			    PCI_MAPREG_START + (i * 4));
		pd->pd_csr = pci_conf_read(sc->sc_pc, pd->pd_tag,
		    PCI_COMMAND_STATUS_REG);
		pd->pd_bhlc = pci_conf_read(sc->sc_pc, pd->pd_tag,
		    PCI_BHLC_REG);
		pd->pd_int = pci_conf_read(sc->sc_pc, pd->pd_tag,
		    PCI_INTERRUPT_REG);

		/*
		 * Save the MSI registers.  The message data register sits
		 * at a different offset when PCI_MSI_MC_C64 is set.
		 */
		if (pci_get_capability(sc->sc_pc, pd->pd_tag,
		    PCI_CAP_MSI, &off, &reg)) {
			pd->pd_msi_ma = pci_conf_read(sc->sc_pc, pd->pd_tag,
			    off + PCI_MSI_MA);
			if (reg & PCI_MSI_MC_C64) {
				pd->pd_msi_mau32 = pci_conf_read(sc->sc_pc,
				    pd->pd_tag, off + PCI_MSI_MAU32);
				pd->pd_msi_md = pci_conf_read(sc->sc_pc,
				    pd->pd_tag, off + PCI_MSI_MD64);
			} else {
				pd->pd_msi_md = pci_conf_read(sc->sc_pc,
				    pd->pd_tag, off + PCI_MSI_MD32);
			}
			pd->pd_msi_mc = reg;
		}

		pci_suspend_msix(sc->sc_pc, pd->pd_tag, sc->sc_memt,
		    &pd->pd_msix_mc, pd->pd_msix_table);
	}
}

/*
 * Powerdown hook for the bus: when pci_dopm is set, record the current
 * power state of every header-type-0 device in pd_pmcsr_state and move
 * the device to the state returned by pci_min_powerstate().
 */
void
pci_powerdown(struct pci_softc *sc)
{
	struct pci_dev *pd;
	pcireg_t bhlc;

	LIST_FOREACH(pd, &sc->sc_devs, pd_next) {
		/*
		 * Only handle header type 0 here; PCI-PCI bridges and
		 * CardBus bridges need special handling, which will
		 * be done in their specific drivers.
		 */
		bhlc = pci_conf_read(sc->sc_pc, pd->pd_tag, PCI_BHLC_REG);
		if (PCI_HDRTYPE_TYPE(bhlc) != 0)
			continue;

		if (pci_dopm) {
			/*
			 * Place the device into the lowest possible
			 * power state.
			 */
			pd->pd_pmcsr_state = pci_get_powerstate(sc->sc_pc,
			    pd->pd_tag);
			pci_set_powerstate(sc->sc_pc, pd->pd_tag,
			    pci_min_powerstate(sc->sc_pc, pd->pd_tag));
		}
	}
}

/*
 * Resume hook for the bus: for every header-type-0 device, restore the
 * power state recorded in pd_pmcsr_state (when pci_dopm is set) and write
 * back the configuration registers held in the pci_dev.
 */
void
pci_resume(struct pci_softc *sc)
{
	struct pci_dev *pd;
	pcireg_t bhlc, reg;
	int off, i;

	LIST_FOREACH(pd, &sc->sc_devs, pd_next) {
		/*
		 * Only handle header type 0 here; PCI-PCI bridges and
		 * CardBus bridges need special handling, which will
		 * be done in their specific drivers.
		 */
		bhlc = pci_conf_read(sc->sc_pc, pd->pd_tag, PCI_BHLC_REG);
		if (PCI_HDRTYPE_TYPE(bhlc) != 0)
			continue;

		/* Restore power. */
		if (pci_dopm)
			pci_set_powerstate(sc->sc_pc, pd->pd_tag,
			    pd->pd_pmcsr_state);

		/* Restore the registers saved above. */
		for (i = 0; i < NMAPREG; i++)
			pci_conf_write(sc->sc_pc, pd->pd_tag,
			    PCI_MAPREG_START + (i * 4), pd->pd_map[i]);
		/*
		 * Only the low 16 bits come from the saved value; the high
		 * 16 bits are written back as just read.
		 */
		reg = pci_conf_read(sc->sc_pc, pd->pd_tag,
		    PCI_COMMAND_STATUS_REG);
		pci_conf_write(sc->sc_pc, pd->pd_tag, PCI_COMMAND_STATUS_REG,
		    (reg & 0xffff0000) | (pd->pd_csr & 0x0000ffff));
		pci_conf_write(sc->sc_pc, pd->pd_tag, PCI_BHLC_REG,
		    pd->pd_bhlc);
		pci_conf_write(sc->sc_pc, pd->pd_tag, PCI_INTERRUPT_REG,
		    pd->pd_int);

		/* MSI: address and data first, message control last. */
		if (pci_get_capability(sc->sc_pc, pd->pd_tag,
		    PCI_CAP_MSI, &off, &reg)) {
			pci_conf_write(sc->sc_pc, pd->pd_tag,
			    off + PCI_MSI_MA, pd->pd_msi_ma);
			if (reg & PCI_MSI_MC_C64) {
				pci_conf_write(sc->sc_pc, pd->pd_tag,
				    off + PCI_MSI_MAU32, pd->pd_msi_mau32);
				pci_conf_write(sc->sc_pc, pd->pd_tag,
				    off + PCI_MSI_MD64, pd->pd_msi_md);
			} else {
				pci_conf_write(sc->sc_pc, pd->pd_tag,
				    off + PCI_MSI_MD32, pd->pd_msi_md);
			}
			pci_conf_write(sc->sc_pc, pd->pd_tag,
			    off + PCI_MSI_MC, pd->pd_msi_mc);
		}

		pci_resume_msix(sc->sc_pc, pd->pd_tag, sc->sc_memt,
		    pd->pd_msix_mc, pd->pd_msix_table);
	}
}

/*
 * Autoconf print routine for PCI children.
 *
 * `pnp' is the parent's name when no driver attached, NULL otherwise.
 * With `pnp' set, pci_devinfo() is called with 1 as its third argument
 * and its text is printed before the "dev/function" part; without it the
 * third argument is 0 and the text follows the "dev/function" part.
 * Always returns UNCONF.
 */
int
pciprint(void *aux, const char *pnp)
{
	struct pci_attach_args *pa = aux;
	char devinfo[256];

	if (pnp) {
		pci_devinfo(pa->pa_id, pa->pa_class, 1, devinfo,
		    sizeof devinfo);
		printf("%s at %s", devinfo, pnp);
	}
	printf(" dev %d function %d", pa->pa_device, pa->pa_function);
	if (!pnp) {
		pci_devinfo(pa->pa_id, pa->pa_class, 0, devinfo,
		    sizeof devinfo);
		printf(" %s", devinfo);
	}

	return (UNCONF);
}

/*
 * Autoconf submatch routine: reject candidates whose wired-down device or
 * function locator disagrees with the attach arguments, then defer to the
 * candidate driver's own match routine.
 */
int
pcisubmatch(struct device *parent, void *match, void *aux)
{
	struct cfdata *cf = match;
	struct pci_attach_args *pa = aux;

	if (cf->pcicf_dev != PCI_UNK_DEV &&
	    cf->pcicf_dev != pa->pa_device)
		return (0);
	if (cf->pcicf_function != PCI_UNK_FUNCTION &&
	    cf->pcicf_function != pa->pa_function)
		return (0);

	return ((*cf->cf_attach->ca_match)(parent, match,
aux));
}

/*
 * Probe the function identified by `tag' on bus `sc'.
 *
 * A pci_attach_args is built from the function's configuration space.
 * With a `match' callback, the callback decides: its result is returned
 * and, when it is non-zero and `pap' is set, the attach args are copied
 * to *pap.  Without a callback, a pci_dev is allocated and linked onto
 * sc_devs, the mapping-register masks are recorded, and
 * config_found_sm() is called to attach a driver; 0 is returned.
 *
 * Also returns 0 for header types above 2, for an invalid or zero vendor
 * ID, and when pci_probe_device_hook() asks to skip the device.
 */
int
pci_probe_device(struct pci_softc *sc, pcitag_t tag,
    int (*match)(struct pci_attach_args *), struct pci_attach_args *pap)
{
	pci_chipset_tag_t pc = sc->sc_pc;
	struct pci_attach_args pa;
	struct pci_dev *pd;
	pcireg_t id, class, intr, bhlcr, cap;
	int pin, bus, device, function;
	int off, ret = 0;
	uint64_t addr;

	pci_decompose_tag(pc, tag, &bus, &device, &function);

	bhlcr = pci_conf_read(pc, tag, PCI_BHLC_REG);
	if (PCI_HDRTYPE_TYPE(bhlcr) > 2)
		return (0);

	id = pci_conf_read(pc, tag, PCI_ID_REG);
	class = pci_conf_read(pc, tag, PCI_CLASS_REG);

	/* Invalid vendor ID value? */
	if (PCI_VENDOR(id) == PCI_VENDOR_INVALID)
		return (0);
	/* XXX Not invalid, but we've done this ~forever. */
	if (PCI_VENDOR(id) == 0)
		return (0);

	pa.pa_iot = sc->sc_iot;
	pa.pa_memt = sc->sc_memt;
	pa.pa_dmat = sc->sc_dmat;
	pa.pa_pc = pc;
	pa.pa_ioex = sc->sc_ioex;
	pa.pa_memex = sc->sc_memex;
	pa.pa_pmemex = sc->sc_pmemex;
	pa.pa_busex = sc->sc_busex;
	pa.pa_domain = sc->sc_domain;
	pa.pa_bus = bus;
	pa.pa_device = device;
	pa.pa_function = function;
	pa.pa_tag = tag;
	pa.pa_id = id;
	pa.pa_class = class;
	pa.pa_bridgetag = sc->sc_bridgetag;
	pa.pa_bridgeih = sc->sc_bridgeih;

	/* This is a simplification of the NetBSD code.
	   We don't support turning off I/O or memory
	   on broken hardware. <csapuntz@stanford.edu> */
	pa.pa_flags = sc->sc_flags;
	pa.pa_flags |= PCI_FLAGS_IO_ENABLED | PCI_FLAGS_MEM_ENABLED;

	/* Without a bridge tag the swizzle starts at 0 with our own tag. */
	if (sc->sc_bridgetag == NULL) {
		pa.pa_intrswiz = 0;
		pa.pa_intrtag = tag;
	} else {
		pa.pa_intrswiz = sc->sc_intrswiz + device;
		pa.pa_intrtag = sc->sc_intrtag;
	}

	intr = pci_conf_read(pc, tag, PCI_INTERRUPT_REG);

	pin = PCI_INTERRUPT_PIN(intr);
	pa.pa_rawintrpin = pin;
	if (pin == PCI_INTERRUPT_PIN_NONE) {
		/* no interrupt */
		pa.pa_intrpin = 0;
	} else {
		/*
		 * swizzle it based on the number of busses we're
		 * behind and our device number.
		 */
		pa.pa_intrpin =		/* XXX */
		    ((pin + pa.pa_intrswiz - 1) % 4) + 1;
	}
	pa.pa_intrline = PCI_INTERRUPT_LINE(intr);

	if (pci_get_ht_capability(pc, tag, PCI_HT_CAP_MSI, &off, &cap)) {
		/*
		 * XXX Should we enable MSI mapping ourselves on
		 * systems that have it disabled?
		 */
		if (cap & PCI_HT_MSI_ENABLED) {
			if ((cap & PCI_HT_MSI_FIXED) == 0) {
				addr = pci_conf_read(pc, tag,
				    off + PCI_HT_MSI_ADDR);
				addr |= (uint64_t)pci_conf_read(pc, tag,
				    off + PCI_HT_MSI_ADDR_HI32) << 32;
			} else
				addr = PCI_HT_MSI_FIXED_ADDR;

			/*
			 * XXX This will fail to enable MSI on systems
			 * that don't use the canonical address.
			 */
			if (addr == PCI_HT_MSI_FIXED_ADDR)
				pa.pa_flags |= PCI_FLAGS_MSI_ENABLED;
		}
	}

	/*
	 * Give the MD code a chance to alter pci_attach_args and/or
	 * skip devices.
	 */
	if (pci_probe_device_hook(pc, &pa) != 0)
		return (0);

	if (match != NULL) {
		ret = (*match)(&pa);
		if (ret != 0 && pap != NULL)
			*pap = pa;
	} else {
		pcireg_t address, csr;
		int i, reg, reg_start, reg_end;
		int s;

		pd = malloc(sizeof *pd, M_DEVBUF, M_ZERO | M_WAITOK);
		pd->pd_tag = tag;
		LIST_INSERT_HEAD(&sc->sc_devs, pd, pd_next);

		/* The mapping-register range depends on the header type. */
		switch (PCI_HDRTYPE_TYPE(bhlcr)) {
		case 0:
			reg_start = PCI_MAPREG_START;
			reg_end = PCI_MAPREG_END;
			break;
		case 1: /* PCI-PCI bridge */
			reg_start = PCI_MAPREG_START;
			reg_end = PCI_MAPREG_PPB_END;
			break;
		case 2: /* PCI-CardBus bridge */
			reg_start = PCI_MAPREG_START;
			reg_end = PCI_MAPREG_PCB_END;
			break;
		default:
			/*
			 * NOTE(review): header types above 2 were already
			 * rejected at the top of the function, so this is
			 * presumably unreachable; if it were taken, pd
			 * would stay on sc_devs.
			 */
			return (0);
		}

		pd->pd_msix_table = pci_alloc_msix_table(sc->sc_pc, pd->pd_tag);

		/*
		 * Record what each mapping register reads back after
		 * all-ones is written to it, with I/O and memory decoding
		 * switched off for the duration; the original register
		 * contents and the command register are put back
		 * afterwards.
		 */
		s = splhigh();
		csr = pci_conf_read(pc, tag, PCI_COMMAND_STATUS_REG);
		if (csr & (PCI_COMMAND_IO_ENABLE | PCI_COMMAND_MEM_ENABLE))
			pci_conf_write(pc, tag, PCI_COMMAND_STATUS_REG, csr &
			    ~(PCI_COMMAND_IO_ENABLE | PCI_COMMAND_MEM_ENABLE));

		for (reg = reg_start, i = 0; reg < reg_end; reg += 4, i++) {
			address = pci_conf_read(pc, tag, reg);
			pci_conf_write(pc, tag, reg, 0xffffffff);
			pd->pd_mask[i] = pci_conf_read(pc, tag, reg);
			pci_conf_write(pc, tag, reg, address);
		}

		if (csr & (PCI_COMMAND_IO_ENABLE | PCI_COMMAND_MEM_ENABLE))
			pci_conf_write(pc, tag, PCI_COMMAND_STATUS_REG, csr);
		splx(s);

		/* Flag devices whose class/subclass says VGA. */
		if ((PCI_CLASS(class) == PCI_CLASS_DISPLAY &&
		    PCI_SUBCLASS(class) == PCI_SUBCLASS_DISPLAY_VGA) ||
		    (PCI_CLASS(class) == PCI_CLASS_PREHISTORIC &&
		    PCI_SUBCLASS(class) == PCI_SUBCLASS_PREHISTORIC_VGA))
			pd->pd_vga_decode = 1;

		pd->pd_dev = config_found_sm(&sc->sc_dev, &pa, pciprint,
		    pcisubmatch);
		if (pd->pd_dev)
			pci_dev_postattach(pd->pd_dev, &pa);
	}

	return (ret);
}

/*
 * Detach all children of bus `sc' and free its per-device records.
 *
 * Returns the error from config_detach_children() if that fails, in which
 * case the sc_devs list is left untouched; returns 0 otherwise.
 */
int
pci_detach_devices(struct pci_softc *sc, int flags)
{
	struct pci_dev *pd, *next;
	int ret;

	ret = config_detach_children(&sc->sc_dev, flags);
	if (ret != 0)
		return (ret);

	for (pd = LIST_FIRST(&sc->sc_devs); pd != NULL; pd = next) {
		pci_free_msix_table(sc->sc_pc, pd->pd_tag, pd->pd_msix_table);
		/* Fetch the successor before pd is freed. */
		next = LIST_NEXT(pd, pd_next);
		free(pd, M_DEVBUF, sizeof *pd);
	}
	LIST_INIT(&sc->sc_devs);

	return (0);
}

/*
 * Look up capability `capid' in the capability list of the function
 * identified by `tag'.
 *
 * Returns 1 when found, storing the capability's configuration-space
 * offset in *offset and its first register in *value (either pointer may
 * be NULL).  Returns 0 when the function has no capability list, has an
 * unknown header type, the list is malformed, or the capability is absent.
 */
int
pci_get_capability(pci_chipset_tag_t pc, pcitag_t tag, int capid,
    int *offset, pcireg_t *value)
{
	pcireg_t reg;
	unsigned int ofs;

	reg = pci_conf_read(pc, tag, PCI_COMMAND_STATUS_REG);
	if (!(reg & PCI_STATUS_CAPLIST_SUPPORT))
		return (0);

	/* Determine the Capability List Pointer register to start with. */
	reg = pci_conf_read(pc, tag, PCI_BHLC_REG);
	switch (PCI_HDRTYPE_TYPE(reg)) {
	case 0:	/* standard device header */
	case 1: /* PCI-PCI bridge header */
		ofs = PCI_CAPLISTPTR_REG;
		break;
	case 2:	/* PCI-CardBus bridge header */
		ofs = PCI_CARDBUS_CAPLISTPTR_REG;
		break;
	default:
		return (0);
	}

	ofs = PCI_CAPLIST_PTR(pci_conf_read(pc, tag, ofs));
	while (ofs != 0) {
		/*
		 * Some devices, like parts of the NVIDIA C51 chipset,
		 * have a broken Capabilities List. So we need to do
		 * a sanity check here.
*/ if ((ofs & 3) || (ofs < 0x40)) return (0); reg = pci_conf_read(pc, tag, ofs); if (PCI_CAPLIST_CAP(reg) == capid) { if (offset) *offset = ofs; if (value) *value = reg; return (1); } ofs = PCI_CAPLIST_NEXT(reg); } return (0); } int pci_get_ht_capability(pci_chipset_tag_t pc, pcitag_t tag, int capid, int *offset, pcireg_t *value) { pcireg_t reg; unsigned int ofs; if (pci_get_capability(pc, tag, PCI_CAP_HT, &ofs, NULL) == 0) return (0); while (ofs != 0) { #ifdef DIAGNOSTIC if ((ofs & 3) || (ofs < 0x40)) panic("pci_get_ht_capability"); #endif reg = pci_conf_read(pc, tag, ofs); if (PCI_HT_CAP(reg) == capid) { if (offset) *offset = ofs; if (value) *value = reg; return (1); } ofs = PCI_CAPLIST_NEXT(reg); } return (0); } int pci_get_ext_capability(pci_chipset_tag_t pc, pcitag_t tag, int capid, int *offset, pcireg_t *value) { pcireg_t reg; unsigned int ofs; /* Make sure this is a PCI Express device. */ if (pci_get_capability(pc, tag, PCI_CAP_PCIEXPRESS, NULL, NULL) == 0) return (0); /* Scan PCI Express extended capabilities. 
*/ ofs = PCI_PCIE_ECAP; while (ofs != 0) { #ifdef DIAGNOSTIC if ((ofs & 3) || (ofs < PCI_PCIE_ECAP)) panic("pci_get_ext_capability"); #endif reg = pci_conf_read(pc, tag, ofs); if (PCI_PCIE_ECAP_ID(reg) == capid) { if (offset) *offset = ofs; if (value) *value = reg; return (1); } ofs = PCI_PCIE_ECAP_NEXT(reg); } return (0); } uint16_t pci_requester_id(pci_chipset_tag_t pc, pcitag_t tag) { int bus, dev, func; pci_decompose_tag(pc, tag, &bus, &dev, &func); return ((bus << 8) | (dev << 3) | func); } int pci_find_device(struct pci_attach_args *pa, int (*match)(struct pci_attach_args *)) { extern struct cfdriver pci_cd; struct device *pcidev; int i; for (i = 0; i < pci_cd.cd_ndevs; i++) { pcidev = pci_cd.cd_devs[i]; if (pcidev != NULL && pci_enumerate_bus((struct pci_softc *)pcidev, match, pa) != 0) return (1); } return (0); } int pci_get_powerstate(pci_chipset_tag_t pc, pcitag_t tag) { pcireg_t reg; int offset; if (pci_get_capability(pc, tag, PCI_CAP_PWRMGMT, &offset, 0)) { reg = pci_conf_read(pc, tag, offset + PCI_PMCSR); return (reg & PCI_PMCSR_STATE_MASK); } return (PCI_PMCSR_STATE_D0); } int pci_set_powerstate(pci_chipset_tag_t pc, pcitag_t tag, int state) { pcireg_t reg; int offset, ostate = state; /* * Warn the firmware that we are going to put the device * into the given state. */ pci_set_powerstate_md(pc, tag, state, 1); if (pci_get_capability(pc, tag, PCI_CAP_PWRMGMT, &offset, 0)) { if (state == PCI_PMCSR_STATE_D3) { /* * The PCI Power Management spec says we * should disable I/O and memory space as well * as bus mastering before we place the device * into D3. 
*/ reg = pci_conf_read(pc, tag, PCI_COMMAND_STATUS_REG); reg &= ~PCI_COMMAND_IO_ENABLE; reg &= ~PCI_COMMAND_MEM_ENABLE; reg &= ~PCI_COMMAND_MASTER_ENABLE; pci_conf_write(pc, tag, PCI_COMMAND_STATUS_REG, reg); } reg = pci_conf_read(pc, tag, offset + PCI_PMCSR); if ((reg & PCI_PMCSR_STATE_MASK) != state) { ostate = reg & PCI_PMCSR_STATE_MASK; pci_conf_write(pc, tag, offset + PCI_PMCSR, (reg & ~PCI_PMCSR_STATE_MASK) | state); if (state == PCI_PMCSR_STATE_D3 || ostate == PCI_PMCSR_STATE_D3) delay(10 * 1000); } } /* * Warn the firmware that the device is now in the given * state. */ pci_set_powerstate_md(pc, tag, state, 0); return (ostate); } #ifndef PCI_MACHDEP_ENUMERATE_BUS /* * Generic PCI bus enumeration routine. Used unless machine-dependent * code needs to provide something else. */ int pci_enumerate_bus(struct pci_softc *sc, int (*match)(struct pci_attach_args *), struct pci_attach_args *pap) { pci_chipset_tag_t pc = sc->sc_pc; int device, function, nfunctions, ret; int maxndevs = sc->sc_maxndevs; const struct pci_quirkdata *qd; pcireg_t id, bhlcr, cap; pcitag_t tag; /* * PCIe downstream ports and root ports should only forward * configuration requests for device number 0. However, not * all hardware implements this correctly, and some devices * will respond to other device numbers making the device show * up 32 times. Prevent this by only scanning a single * device. */ if (sc->sc_bridgetag && pci_get_capability(pc, *sc->sc_bridgetag, PCI_CAP_PCIEXPRESS, NULL, &cap)) { switch (PCI_PCIE_XCAP_TYPE(cap)) { case PCI_PCIE_XCAP_TYPE_RP: case PCI_PCIE_XCAP_TYPE_DOWN: case PCI_PCIE_XCAP_TYPE_PCI2PCIE: maxndevs = 1; break; } } for (device = 0; device < maxndevs; device++) { tag = pci_make_tag(pc, sc->sc_bus, device, 0); bhlcr = pci_conf_read(pc, tag, PCI_BHLC_REG); if (PCI_HDRTYPE_TYPE(bhlcr) > 2) continue; id = pci_conf_read(pc, tag, PCI_ID_REG); /* Invalid vendor ID value? 
*/ if (PCI_VENDOR(id) == PCI_VENDOR_INVALID) continue; /* XXX Not invalid, but we've done this ~forever. */ if (PCI_VENDOR(id) == 0) continue; qd = pci_lookup_quirkdata(PCI_VENDOR(id), PCI_PRODUCT(id)); if (qd != NULL && (qd->quirks & PCI_QUIRK_MULTIFUNCTION) != 0) nfunctions = 8; else if (qd != NULL && (qd->quirks & PCI_QUIRK_MONOFUNCTION) != 0) nfunctions = 1; else nfunctions = PCI_HDRTYPE_MULTIFN(bhlcr) ? 8 : 1; for (function = 0; function < nfunctions; function++) { tag = pci_make_tag(pc, sc->sc_bus, device, function); ret = pci_probe_device(sc, tag, match, pap); if (match != NULL && ret != 0) return (ret); } } return (0); } #endif /* PCI_MACHDEP_ENUMERATE_BUS */ int pci_reserve_resources(struct pci_attach_args *pa) { pci_chipset_tag_t pc = pa->pa_pc; pcitag_t tag = pa->pa_tag; pcireg_t bhlc, blr, type, bir; pcireg_t addr, mask; bus_addr_t base, limit; bus_size_t size; int reg, reg_start, reg_end, reg_rom; int bus, dev, func; int sec, sub; int flags; int s; pci_decompose_tag(pc, tag, &bus, &dev, &func); bhlc = pci_conf_read(pc, tag, PCI_BHLC_REG); switch (PCI_HDRTYPE_TYPE(bhlc)) { case 0: reg_start = PCI_MAPREG_START; reg_end = PCI_MAPREG_END; reg_rom = PCI_ROM_REG; break; case 1: /* PCI-PCI bridge */ reg_start = PCI_MAPREG_START; reg_end = PCI_MAPREG_PPB_END; reg_rom = 0; /* 0x38 */ break; case 2: /* PCI-CardBus bridge */ reg_start = PCI_MAPREG_START; reg_end = PCI_MAPREG_PCB_END; reg_rom = 0; break; default: return (0); } for (reg = reg_start; reg < reg_end; reg += 4) { if (!pci_mapreg_probe(pc, tag, reg, &type)) continue; if (pci_mapreg_info(pc, tag, reg, type, &base, &size, &flags)) continue; if (base == 0) continue; switch (type) { case PCI_MAPREG_TYPE_MEM | PCI_MAPREG_MEM_TYPE_32BIT: case PCI_MAPREG_TYPE_MEM | PCI_MAPREG_MEM_TYPE_64BIT: if (ISSET(flags, BUS_SPACE_MAP_PREFETCHABLE) && pa->pa_pmemex && extent_alloc_region(pa->pa_pmemex, base, size, EX_NOWAIT) == 0) { break; } #ifdef __sparc64__ /* * Certain SPARC T5 systems assign * non-prefetchable 64-bit 
BARs of its onboard * mpii(4) controllers addresses in the * prefetchable memory range. This is * (probably) safe, as reads from the device * registers mapped by these BARs are * side-effect free. So assume the firmware * knows what it is doing. */ if (base >= 0x100000000 && pa->pa_pmemex && extent_alloc_region(pa->pa_pmemex, base, size, EX_NOWAIT) == 0) { break; } #endif if (pa->pa_memex && extent_alloc_region(pa->pa_memex, base, size, EX_NOWAIT)) { printf("%d:%d:%d: mem address conflict 0x%lx/0x%lx\n", bus, dev, func, base, size); pci_conf_write(pc, tag, reg, 0); if (type & PCI_MAPREG_MEM_TYPE_64BIT) pci_conf_write(pc, tag, reg + 4, 0); } break; case PCI_MAPREG_TYPE_IO: if (pa->pa_ioex && extent_alloc_region(pa->pa_ioex, base, size, EX_NOWAIT)) { printf("%d:%d:%d: io address conflict 0x%lx/0x%lx\n", bus, dev, func, base, size); pci_conf_write(pc, tag, reg, 0); } break; } if (type & PCI_MAPREG_MEM_TYPE_64BIT) reg += 4; } if (reg_rom != 0) { s = splhigh(); addr = pci_conf_read(pc, tag, PCI_ROM_REG); pci_conf_write(pc, tag, PCI_ROM_REG, ~PCI_ROM_ENABLE); mask = pci_conf_read(pc, tag, PCI_ROM_REG); pci_conf_write(pc, tag, PCI_ROM_REG, addr); splx(s); base = PCI_ROM_ADDR(addr); size = PCI_ROM_SIZE(mask); if (base != 0 && size != 0) { if (pa->pa_pmemex && extent_alloc_region(pa->pa_pmemex, base, size, EX_NOWAIT) && pa->pa_memex && extent_alloc_region(pa->pa_memex, base, size, EX_NOWAIT)) { printf("%d:%d:%d: rom address conflict 0x%lx/0x%lx\n", bus, dev, func, base, size); pci_conf_write(pc, tag, PCI_ROM_REG, 0); } } } if (PCI_HDRTYPE_TYPE(bhlc) != 1) return (0); /* Figure out the I/O address range of the bridge. 
*/ blr = pci_conf_read(pc, tag, PPB_REG_IOSTATUS); base = (blr & 0x000000f0) << 8; limit = (blr & 0x000f000) | 0x00000fff; blr = pci_conf_read(pc, tag, PPB_REG_IO_HI); base |= (blr & 0x0000ffff) << 16; limit |= (blr & 0xffff0000); if (limit > base) size = (limit - base + 1); else size = 0; if (pa->pa_ioex && base > 0 && size > 0) { if (extent_alloc_region(pa->pa_ioex, base, size, EX_NOWAIT)) { printf("%d:%d:%d: bridge io address conflict 0x%lx/0x%lx\n", bus, dev, func, base, size); blr &= 0xffff0000; blr |= 0x000000f0; pci_conf_write(pc, tag, PPB_REG_IOSTATUS, blr); } } /* Figure out the memory mapped I/O address range of the bridge. */ blr = pci_conf_read(pc, tag, PPB_REG_MEM); base = (blr & 0x0000fff0) << 16; limit = (blr & 0xfff00000) | 0x000fffff; if (limit > base) size = (limit - base + 1); else size = 0; if (pa->pa_memex && base > 0 && size > 0) { if (extent_alloc_region(pa->pa_memex, base, size, EX_NOWAIT)) { printf("%d:%d:%d: bridge mem address conflict 0x%lx/0x%lx\n", bus, dev, func, base, size); pci_conf_write(pc, tag, PPB_REG_MEM, 0x0000fff0); } } /* Figure out the prefetchable memory address range of the bridge. 
*/ blr = pci_conf_read(pc, tag, PPB_REG_PREFMEM); base = (blr & 0x0000fff0) << 16; limit = (blr & 0xfff00000) | 0x000fffff; #ifdef __LP64__ blr = pci_conf_read(pc, pa->pa_tag, PPB_REG_PREFBASE_HI32); base |= ((uint64_t)blr) << 32; blr = pci_conf_read(pc, pa->pa_tag, PPB_REG_PREFLIM_HI32); limit |= ((uint64_t)blr) << 32; #endif if (limit > base) size = (limit - base + 1); else size = 0; if (pa->pa_pmemex && base > 0 && size > 0) { if (extent_alloc_region(pa->pa_pmemex, base, size, EX_NOWAIT)) { printf("%d:%d:%d: bridge mem address conflict 0x%lx/0x%lx\n", bus, dev, func, base, size); pci_conf_write(pc, tag, PPB_REG_PREFMEM, 0x0000fff0); } } else if (pa->pa_memex && base > 0 && size > 0) { if (extent_alloc_region(pa->pa_memex, base, size, EX_NOWAIT)) { printf("%d:%d:%d: bridge mem address conflict 0x%lx/0x%lx\n", bus, dev, func, base, size); pci_conf_write(pc, tag, PPB_REG_PREFMEM, 0x0000fff0); } } /* Figure out the bus range handled by the bridge. */ bir = pci_conf_read(pc, tag, PPB_REG_BUSINFO); sec = PPB_BUSINFO_SECONDARY(bir); sub = PPB_BUSINFO_SUBORDINATE(bir); if (pa->pa_busex && sub >= sec && sub > 0) { if (extent_alloc_region(pa->pa_busex, sec, sub - sec + 1, EX_NOWAIT)) { printf("%d:%d:%d: bridge bus conflict %d-%d\n", bus, dev, func, sec, sub); } } return (0); } /* * Vital Product Data (PCI 2.2) */ int pci_vpd_read(pci_chipset_tag_t pc, pcitag_t tag, int offset, int count, pcireg_t *data) { uint32_t reg; int ofs, i, j; KASSERT(data != NULL); if ((offset + count) >= PCI_VPD_ADDRESS_MASK) return (EINVAL); if (pci_get_capability(pc, tag, PCI_CAP_VPD, &ofs, &reg) == 0) return (ENXIO); for (i = 0; i < count; offset += sizeof(*data), i++) { reg &= 0x0000ffff; reg &= ~PCI_VPD_OPFLAG; reg |= PCI_VPD_ADDRESS(offset); pci_conf_write(pc, tag, ofs, reg); /* * PCI 2.2 does not specify how long we should poll * for completion nor whether the operation can fail. 
*/ j = 0; do { if (j++ == 20) return (EIO); delay(4); reg = pci_conf_read(pc, tag, ofs); } while ((reg & PCI_VPD_OPFLAG) == 0); data[i] = pci_conf_read(pc, tag, PCI_VPD_DATAREG(ofs)); } return (0); } int pci_vpd_write(pci_chipset_tag_t pc, pcitag_t tag, int offset, int count, pcireg_t *data) { pcireg_t reg; int ofs, i, j; KASSERT(data != NULL); KASSERT((offset + count) < 0x7fff); if (pci_get_capability(pc, tag, PCI_CAP_VPD, &ofs, &reg) == 0) return (1); for (i = 0; i < count; offset += sizeof(*data), i++) { pci_conf_write(pc, tag, PCI_VPD_DATAREG(ofs), data[i]); reg &= 0x0000ffff; reg |= PCI_VPD_OPFLAG; reg |= PCI_VPD_ADDRESS(offset); pci_conf_write(pc, tag, ofs, reg); /* * PCI 2.2 does not specify how long we should poll * for completion nor whether the operation can fail. */ j = 0; do { if (j++ == 20) return (1); delay(1); reg = pci_conf_read(pc, tag, ofs); } while (reg & PCI_VPD_OPFLAG); } return (0); } int pci_matchbyid(struct pci_attach_args *pa, const struct pci_matchid *ids, int nent) { const struct pci_matchid *pm; int i; for (i = 0, pm = ids; i < nent; i++, pm++) if (PCI_VENDOR(pa->pa_id) == pm->pm_vid && PCI_PRODUCT(pa->pa_id) == pm->pm_pid) return (1); return (0); } void pci_disable_legacy_vga(struct device *dev) { struct pci_softc *pci; struct pci_dev *pd; /* XXX Until we attach the drm drivers directly to pci. */ while (dev->dv_parent->dv_cfdata->cf_driver != &pci_cd) dev = dev->dv_parent; pci = (struct pci_softc *)dev->dv_parent; LIST_FOREACH(pd, &pci->sc_devs, pd_next) { if (pd->pd_dev == dev) { pd->pd_vga_decode = 0; break; } } } #ifdef USER_PCICONF /* * This is the user interface to PCI configuration space. 
*/ #include <sys/pciio.h> #include <sys/fcntl.h> #ifdef DEBUG #define PCIDEBUG(x) printf x #else #define PCIDEBUG(x) #endif void pci_disable_vga(pci_chipset_tag_t, pcitag_t); void pci_enable_vga(pci_chipset_tag_t, pcitag_t); void pci_route_vga(struct pci_softc *); void pci_unroute_vga(struct pci_softc *); int pciopen(dev_t dev, int oflags, int devtype, struct proc *p); int pciclose(dev_t dev, int flag, int devtype, struct proc *p); int pciioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); int pciopen(dev_t dev, int oflags, int devtype, struct proc *p) { PCIDEBUG(("pciopen ndevs: %d\n" , pci_cd.cd_ndevs)); if (minor(dev) >= pci_ndomains) { return ENXIO; } #ifndef APERTURE if ((oflags & FWRITE) && securelevel > 0) { return EPERM; } #else if ((oflags & FWRITE) && securelevel > 0 && allowaperture == 0) { return EPERM; } #endif return (0); } int pciclose(dev_t dev, int flag, int devtype, struct proc *p) { PCIDEBUG(("pciclose\n")); pci_vga_proc = NULL; return (0); } int pciioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { struct pcisel *sel = (struct pcisel *)data; struct pci_io *io; struct pci_dev *pd; struct pci_rom *rom; int i, error; pcitag_t tag; struct pci_softc *pci; pci_chipset_tag_t pc; switch (cmd) { case PCIOCREAD: case PCIOCREADMASK: break; case PCIOCWRITE: if (!(flag & FWRITE)) return EPERM; break; case PCIOCGETROMLEN: case PCIOCGETROM: case PCIOCGETVPD: break; case PCIOCGETVGA: case PCIOCSETVGA: if (pci_vga_pci == NULL) return EINVAL; break; default: return ENOTTY; } for (i = 0; i < pci_cd.cd_ndevs; i++) { pci = pci_cd.cd_devs[i]; if (pci != NULL && pci->sc_domain == minor(dev) && pci->sc_bus == sel->pc_bus) break; } if (i >= pci_cd.cd_ndevs) return ENXIO; /* Check bounds */ if (pci->sc_bus >= 256 || sel->pc_dev >= pci_bus_maxdevs(pci->sc_pc, pci->sc_bus) || sel->pc_func >= 8) return EINVAL; pc = pci->sc_pc; LIST_FOREACH(pd, &pci->sc_devs, pd_next) { int bus, dev, func; pci_decompose_tag(pc, pd->pd_tag, &bus, &dev, 
&func); if (bus == sel->pc_bus && dev == sel->pc_dev && func == sel->pc_func) break; } if (pd == LIST_END(&pci->sc_devs)) return ENXIO; tag = pci_make_tag(pc, sel->pc_bus, sel->pc_dev, sel->pc_func); switch (cmd) { case PCIOCREAD: io = (struct pci_io *)data; switch (io->pi_width) { case 4: /* Configuration space bounds check */ if (io->pi_reg < 0 || io->pi_reg >= pci_conf_size(pc, tag)) return EINVAL; /* Make sure the register is properly aligned */ if (io->pi_reg & 0x3) return EINVAL; io->pi_data = pci_conf_read(pc, tag, io->pi_reg); error = 0; break; default: error = EINVAL; break; } break; case PCIOCWRITE: io = (struct pci_io *)data; switch (io->pi_width) { case 4: /* Configuration space bounds check */ if (io->pi_reg < 0 || io->pi_reg >= pci_conf_size(pc, tag)) return EINVAL; /* Make sure the register is properly aligned */ if (io->pi_reg & 0x3) return EINVAL; pci_conf_write(pc, tag, io->pi_reg, io->pi_data); error = 0; break; default: error = EINVAL; break; } break; case PCIOCREADMASK: io = (struct pci_io *)data; if (io->pi_width != 4 || io->pi_reg & 0x3 || io->pi_reg < PCI_MAPREG_START || io->pi_reg >= PCI_MAPREG_END) return (EINVAL); i = (io->pi_reg - PCI_MAPREG_START) / 4; io->pi_data = pd->pd_mask[i]; error = 0; break; case PCIOCGETROMLEN: case PCIOCGETROM: { pcireg_t addr, mask, bhlc; bus_space_handle_t h; bus_size_t len, off; char buf[256]; int s; rom = (struct pci_rom *)data; bhlc = pci_conf_read(pc, tag, PCI_BHLC_REG); if (PCI_HDRTYPE_TYPE(bhlc) != 0) return (ENODEV); s = splhigh(); addr = pci_conf_read(pc, tag, PCI_ROM_REG); pci_conf_write(pc, tag, PCI_ROM_REG, ~PCI_ROM_ENABLE); mask = pci_conf_read(pc, tag, PCI_ROM_REG); pci_conf_write(pc, tag, PCI_ROM_REG, addr); splx(s); /* * Section 6.2.5.2 `Expansion ROM Base Address Register', * * tells us that only the upper 21 bits are writable. * This means that the size of a ROM must be a * multiple of 2 KB. So reading the ROM in chunks of * 256 bytes should work just fine. 
*/ if ((PCI_ROM_ADDR(addr) == 0 || PCI_ROM_SIZE(mask) % sizeof(buf)) != 0) return (ENODEV); /* If we're just after the size, skip reading the ROM. */ if (cmd == PCIOCGETROMLEN) { error = 0; goto fail; } if (rom->pr_romlen < PCI_ROM_SIZE(mask)) { error = ENOMEM; goto fail; } error = bus_space_map(pci->sc_memt, PCI_ROM_ADDR(addr), PCI_ROM_SIZE(mask), 0, &h); if (error) goto fail; off = 0; len = PCI_ROM_SIZE(mask); while (len > 0 && error == 0) { s = splhigh(); pci_conf_write(pc, tag, PCI_ROM_REG, addr | PCI_ROM_ENABLE); bus_space_read_region_1(pci->sc_memt, h, off, buf, sizeof(buf)); pci_conf_write(pc, tag, PCI_ROM_REG, addr); splx(s); error = copyout(buf, rom->pr_rom + off, sizeof(buf)); off += sizeof(buf); len -= sizeof(buf); } bus_space_unmap(pci->sc_memt, h, PCI_ROM_SIZE(mask)); fail: rom->pr_romlen = PCI_ROM_SIZE(mask); break; } case PCIOCGETVPD: { struct pci_vpd_req *pv = (struct pci_vpd_req *)data; pcireg_t *data; size_t len; unsigned int i; int s; CTASSERT(sizeof(*data) == sizeof(*pv->pv_data)); data = mallocarray(pv->pv_count, sizeof(*data), M_TEMP, M_WAITOK|M_CANFAIL); if (data == NULL) { error = ENOMEM; break; } s = splhigh(); error = pci_vpd_read(pc, tag, pv->pv_offset, pv->pv_count, data); splx(s); len = pv->pv_count * sizeof(*pv->pv_data); if (error == 0) { for (i = 0; i < pv->pv_count; i++) data[i] = letoh32(data[i]); error = copyout(data, pv->pv_data, len); } free(data, M_TEMP, len); break; } case PCIOCGETVGA: { struct pci_vga *vga = (struct pci_vga *)data; struct pci_dev *pd; int bus, dev, func; vga->pv_decode = 0; LIST_FOREACH(pd, &pci->sc_devs, pd_next) { pci_decompose_tag(pc, pd->pd_tag, NULL, &dev, &func); if (dev == sel->pc_dev && func == sel->pc_func) { if (pd->pd_vga_decode) vga->pv_decode = PCI_VGA_IO_ENABLE | PCI_VGA_MEM_ENABLE; break; } } pci_decompose_tag(pci_vga_pci->sc_pc, pci_vga_tag, &bus, &dev, &func); vga->pv_sel.pc_bus = bus; vga->pv_sel.pc_dev = dev; vga->pv_sel.pc_func = func; error = 0; break; } case PCIOCSETVGA: { struct pci_vga 
*vga = (struct pci_vga *)data; int bus, dev, func; switch (vga->pv_lock) { case PCI_VGA_UNLOCK: case PCI_VGA_LOCK: case PCI_VGA_TRYLOCK: break; default: return (EINVAL); } if (vga->pv_lock == PCI_VGA_UNLOCK) { if (pci_vga_proc != p) return (EINVAL); pci_vga_proc = NULL; wakeup(&pci_vga_proc); return (0); } while (pci_vga_proc != p && pci_vga_proc != NULL) { if (vga->pv_lock == PCI_VGA_TRYLOCK) return (EBUSY); error = tsleep_nsec(&pci_vga_proc, PLOCK | PCATCH, "vgalk", INFSLP); if (error) return (error); } pci_vga_proc = p; pci_decompose_tag(pci_vga_pci->sc_pc, pci_vga_tag, &bus, &dev, &func); if (bus != vga->pv_sel.pc_bus || dev != vga->pv_sel.pc_dev || func != vga->pv_sel.pc_func) { pci_disable_vga(pci_vga_pci->sc_pc, pci_vga_tag); if (pci != pci_vga_pci) { pci_unroute_vga(pci_vga_pci); pci_route_vga(pci); pci_vga_pci = pci; } pci_enable_vga(pc, tag); pci_vga_tag = tag; } error = 0; break; } default: error = ENOTTY; break; } return (error); } void pci_disable_vga(pci_chipset_tag_t pc, pcitag_t tag) { pcireg_t csr; csr = pci_conf_read(pc, tag, PCI_COMMAND_STATUS_REG); csr &= ~(PCI_COMMAND_IO_ENABLE | PCI_COMMAND_MEM_ENABLE); pci_conf_write(pc, tag, PCI_COMMAND_STATUS_REG, csr); } void pci_enable_vga(pci_chipset_tag_t pc, pcitag_t tag) { pcireg_t csr; csr = pci_conf_read(pc, tag, PCI_COMMAND_STATUS_REG); csr |= PCI_COMMAND_IO_ENABLE | PCI_COMMAND_MEM_ENABLE; pci_conf_write(pc, tag, PCI_COMMAND_STATUS_REG, csr); } void pci_route_vga(struct pci_softc *sc) { pci_chipset_tag_t pc = sc->sc_pc; pcireg_t bc; if (sc->sc_bridgetag == NULL) return; bc = pci_conf_read(pc, *sc->sc_bridgetag, PPB_REG_BRIDGECONTROL); bc |= PPB_BC_VGA_ENABLE; pci_conf_write(pc, *sc->sc_bridgetag, PPB_REG_BRIDGECONTROL, bc); pci_route_vga((struct pci_softc *)sc->sc_dev.dv_parent->dv_parent); } void pci_unroute_vga(struct pci_softc *sc) { pci_chipset_tag_t pc = sc->sc_pc; pcireg_t bc; if (sc->sc_bridgetag == NULL) return; bc = pci_conf_read(pc, *sc->sc_bridgetag, PPB_REG_BRIDGECONTROL); bc &= 
~PPB_BC_VGA_ENABLE; pci_conf_write(pc, *sc->sc_bridgetag, PPB_REG_BRIDGECONTROL, bc); pci_unroute_vga((struct pci_softc *)sc->sc_dev.dv_parent->dv_parent); } #endif /* USER_PCICONF */ int pci_primary_vga(struct pci_attach_args *pa) { /* XXX For now, only handle the first PCI domain. */ if (pa->pa_domain != 0) return (0); if ((PCI_CLASS(pa->pa_class) != PCI_CLASS_DISPLAY || PCI_SUBCLASS(pa->pa_class) != PCI_SUBCLASS_DISPLAY_VGA) && (PCI_CLASS(pa->pa_class) != PCI_CLASS_PREHISTORIC || PCI_SUBCLASS(pa->pa_class) != PCI_SUBCLASS_PREHISTORIC_VGA)) return (0); if ((pci_conf_read(pa->pa_pc, pa->pa_tag, PCI_COMMAND_STATUS_REG) & (PCI_COMMAND_IO_ENABLE | PCI_COMMAND_MEM_ENABLE)) != (PCI_COMMAND_IO_ENABLE | PCI_COMMAND_MEM_ENABLE)) return (0); pci_vga_tag = pa->pa_tag; return (1); } #ifdef __HAVE_PCI_MSIX struct msix_vector * pci_alloc_msix_table(pci_chipset_tag_t pc, pcitag_t tag) { struct msix_vector *table; pcireg_t reg; int tblsz; if (pci_get_capability(pc, tag, PCI_CAP_MSIX, NULL, &reg) == 0) return NULL; tblsz = PCI_MSIX_MC_TBLSZ(reg) + 1; table = mallocarray(tblsz, sizeof(*table), M_DEVBUF, M_WAITOK); return table; } void pci_free_msix_table(pci_chipset_tag_t pc, pcitag_t tag, struct msix_vector *table) { pcireg_t reg; int tblsz; if (pci_get_capability(pc, tag, PCI_CAP_MSIX, NULL, &reg) == 0) return; tblsz = PCI_MSIX_MC_TBLSZ(reg) + 1; free(table, M_DEVBUF, tblsz * sizeof(*table)); } void pci_suspend_msix(pci_chipset_tag_t pc, pcitag_t tag, bus_space_tag_t memt, pcireg_t *mc, struct msix_vector *table) { bus_space_handle_t memh; pcireg_t reg; int tblsz, i; if (pci_get_capability(pc, tag, PCI_CAP_MSIX, NULL, &reg) == 0) return; KASSERT(table != NULL); if (pci_msix_table_map(pc, tag, memt, &memh)) return; tblsz = PCI_MSIX_MC_TBLSZ(reg) + 1; for (i = 0; i < tblsz; i++) { table[i].mv_ma = bus_space_read_4(memt, memh, PCI_MSIX_MA(i)); table[i].mv_mau32 = bus_space_read_4(memt, memh, PCI_MSIX_MAU32(i)); table[i].mv_md = bus_space_read_4(memt, memh, PCI_MSIX_MD(i)); 
table[i].mv_vc = bus_space_read_4(memt, memh, PCI_MSIX_VC(i)); } pci_msix_table_unmap(pc, tag, memt, memh); *mc = reg; } void pci_resume_msix(pci_chipset_tag_t pc, pcitag_t tag, bus_space_tag_t memt, pcireg_t mc, struct msix_vector *table) { bus_space_handle_t memh; pcireg_t reg; int tblsz, i; int off; if (pci_get_capability(pc, tag, PCI_CAP_MSIX, &off, &reg) == 0) return; KASSERT(table != NULL); if (pci_msix_table_map(pc, tag, memt, &memh)) return; tblsz = PCI_MSIX_MC_TBLSZ(reg) + 1; for (i = 0; i < tblsz; i++) { bus_space_write_4(memt, memh, PCI_MSIX_MA(i), table[i].mv_ma); bus_space_write_4(memt, memh, PCI_MSIX_MAU32(i), table[i].mv_mau32); bus_space_write_4(memt, memh, PCI_MSIX_MD(i), table[i].mv_md); bus_space_barrier(memt, memh, PCI_MSIX_MA(i), 16, BUS_SPACE_BARRIER_WRITE); bus_space_write_4(memt, memh, PCI_MSIX_VC(i), table[i].mv_vc); bus_space_barrier(memt, memh, PCI_MSIX_VC(i), 4, BUS_SPACE_BARRIER_WRITE); } pci_msix_table_unmap(pc, tag, memt, memh); pci_conf_write(pc, tag, off, mc); } int pci_intr_msix_count(struct pci_attach_args *pa) { pcireg_t reg; if ((pa->pa_flags & PCI_FLAGS_MSI_ENABLED) == 0) return (0); if (pci_get_capability(pa->pa_pc, pa->pa_tag, PCI_CAP_MSIX, NULL, &reg) == 0) return (0); return (PCI_MSIX_MC_TBLSZ(reg) + 1); } #else /* __HAVE_PCI_MSIX */ struct msix_vector * pci_alloc_msix_table(pci_chipset_tag_t pc, pcitag_t tag) { return NULL; } void pci_free_msix_table(pci_chipset_tag_t pc, pcitag_t tag, struct msix_vector *table) { } void pci_suspend_msix(pci_chipset_tag_t pc, pcitag_t tag, bus_space_tag_t memt, pcireg_t *mc, struct msix_vector *table) { } void pci_resume_msix(pci_chipset_tag_t pc, pcitag_t tag, bus_space_tag_t memt, pcireg_t mc, struct msix_vector *table) { } int pci_intr_msix_count(struct pci_attach_args *pa) { return (0); } #endif /* __HAVE_PCI_MSIX */
15 14 15 15 15 7 14 3 11 4 4 3 3 3 3 3 3 3 2 3 107 107 14 100 100 100 100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 /* $OpenBSD: ufs_bmap.c,v 1.37 2021/12/12 09:14:59 visa Exp $ */ /* $NetBSD: ufs_bmap.c,v 1.3 1996/02/09 22:36:00 christos Exp $ */ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 */ #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/specdev.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> /* * Bmap converts a the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(void *v) { struct vop_bmap_args *ap = v; /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. 
*/ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp)); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). */ int ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap, int *nump, int *runp) { struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; struct vnode *devvp; struct indir a[NIADDR+1], *xap; daddr_t daddr, metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); #ifdef DIAGNOSTIC if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL)) panic("ufs_bmaparray: invalid arguments"); #endif if (runp) { /* * XXX * If MAXBSIZE is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ *runp = 0; maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1; } xap = ap == NULL ? 
a : ap; if (!nump) nump = &num; if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, DIP(ip, db[bn])); if (*bnp == 0) *bnp = -1; else if (runp) for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, DIP(ip, db[bn - 1]), DIP(ip, db[bn])); ++bn, ++*runp); return (0); } /* Get disk address out of indirect block array */ daddr = DIP(ip, ib[xap->in_off]); devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. */ if (bp) brelse(bp); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, INFSLP); if (bp->b_flags & (B_DONE | B_DELWRI)) { ; } #ifdef DIAGNOSTIC else if (!daddr) panic("ufs_bmaparray: indirect block not in cache"); #endif else { bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; bcstats.pendingreads++; bcstats.numreads++; VOP_STRATEGY(bp->b_vp, bp); curproc->p_ru.ru_inblock++; /* XXX */ if ((error = biowait(bp)) != 0) { brelse(bp); return (error); } } #ifdef FFS2 if (ip->i_ump->um_fstype == UM_UFS2) { daddr = ((int64_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((int64_t *)bp->b_data)[bn - 1], ((int64_t *)bp->b_data)[bn]); ++bn, ++*runp); continue; } #endif /* FFS2 */ daddr = ((int32_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((int32_t *)bp->b_data)[bn - 1], ((int32_t *)bp->b_data)[bn]); ++bn, ++*runp); } if (bp) brelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 
0 ? -1 : daddr; return (0); }

/*
 * Create an array of logical block number/offset pairs which represent the
 * path of indirect blocks required to access a data block. The first "pair"
 * contains the logical block number of the appropriate single, double or
 * triple indirect block and the offset into the inode indirect block array.
 * Note, the logical block number of the inode single/double/triple indirect
 * block appears twice in the array, once with the offset into the i_ffs_ib and
 * once with the offset into the page itself.
 *
 * vp	vnode whose mount supplies the ufsmount used for MNINDIR().
 * bn	logical block number to resolve; a negative value names an
 *	indirect (meta-data) block rather than a data block.
 * ap	output array; one struct indir is stored per path element, each
 *	with in_exists cleared.
 * nump	optional; when not NULL it receives the number of entries stored
 *	in ap (0 for a direct block).
 *
 * Returns 0 on success, or EFBIG when bn lies beyond what NIADDR levels of
 * indirection can address.
 */
int
ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump)
{
	daddr_t metalbn, realbn;
	struct ufsmount *ump;
	int64_t blockcnt;
	int i, numlevels, off;

	ump = VFSTOUFS(vp->v_mount);
	if (nump)
		*nump = 0;
	numlevels = 0;
	/* Keep the signed request in realbn; bn holds its magnitude. */
	realbn = bn;
	if (bn < 0)
		bn = -bn;

#ifdef DIAGNOSTIC
	/* A negative number whose magnitude is below NDADDR is rejected. */
	if (realbn < 0 && realbn > -NDADDR) {
		panic ("ufs_getlbns: Invalid indirect block %lld specified",
		    (long long)realbn);
	}
#endif

	/* The first NDADDR blocks are direct blocks. */
	if (bn < NDADDR)
		return (0);

	/*
	 * Determine the number of levels of indirection. After this loop
	 * is done, blockcnt indicates the number of data blocks possible
	 * at the given level of indirection, and NIADDR - i is the number
	 * of levels of indirection needed to locate the requested block.
	 */
	for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) {
		/* Every level has been tried: the block is out of range. */
		if (i == 0)
			return (EFBIG);
		blockcnt *= MNINDIR(ump);
		if (bn < blockcnt)
			break;
	}

	/* Calculate the address of the first meta-block. */
	if (realbn >= 0)
		metalbn = -(realbn - bn + NIADDR - i);
	else
		metalbn = -(-realbn - bn + NIADDR - i);

	/*
	 * At each iteration, off is the offset into the bap array which is
	 * an array of disk addresses at the current level of indirection.
	 * The logical block number and the offset in that block are stored
	 * into the argument array.
	 */
	/* First entry: offset into the inode's own indirect pointer array. */
	ap->in_lbn = metalbn;
	ap->in_off = off = NIADDR - i;
	ap->in_exists = 0;
	ap++;
	for (++numlevels; i <= NIADDR; i++) {
		/* If searching for a meta-data block, quit when found. */
		if (metalbn == realbn)
			break;

		/* Descend one level: fewer data blocks per slot. */
		blockcnt /= MNINDIR(ump);
		off = (bn / blockcnt) % MNINDIR(ump);

		++numlevels;
		ap->in_lbn = metalbn;
		ap->in_off = off;
		ap->in_exists = 0;
		++ap;

		/* Advance to the logical number of the next block on the path. */
		metalbn -= -1 + off * blockcnt;
	}
#ifdef DIAGNOSTIC
	/* A requested meta-data block must have been met on the way down. */
	if (realbn < 0 && metalbn != realbn) {
		panic("ufs_getlbns: indirect block %lld not found",
		    (long long)realbn);
	}
#endif
	if (nump)
		*nump = numlevels;
	return (0);
}
217 2 215 2 215 19 207 210 17 217 217 217 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 /* $OpenBSD: vioscsi.c,v 1.30 
2022/04/16 19:19:59 naddy Exp $ */ /* * Copyright (c) 2013 Google Inc. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/mutex.h> #include <machine/bus.h> #include <machine/intr.h> #include <dev/pv/vioscsireg.h> #include <dev/pv/virtiovar.h> #include <scsi/scsi_all.h> #include <scsi/scsiconf.h> enum { vioscsi_debug = 0 }; #define DPRINTF(f...) 
do { if (vioscsi_debug) printf(f); } while (0) /* Number of DMA segments for buffers that the device must support */ #define SEG_MAX (MAXPHYS/PAGE_SIZE + 1) /* In the virtqueue, we need space for header and footer, too */ #define ALLOC_SEGS (SEG_MAX + 2) struct vioscsi_req { struct virtio_scsi_req_hdr vr_req; struct virtio_scsi_res_hdr vr_res; struct scsi_xfer *vr_xs; bus_dmamap_t vr_control; bus_dmamap_t vr_data; SLIST_ENTRY(vioscsi_req) vr_list; int vr_qe_index; }; struct vioscsi_softc { struct device sc_dev; struct scsi_iopool sc_iopool; struct mutex sc_vr_mtx; struct virtqueue sc_vqs[3]; struct vioscsi_req *sc_reqs; bus_dma_segment_t sc_reqs_segs[1]; SLIST_HEAD(, vioscsi_req) sc_freelist; }; int vioscsi_match(struct device *, void *, void *); void vioscsi_attach(struct device *, struct device *, void *); int vioscsi_alloc_reqs(struct vioscsi_softc *, struct virtio_softc *, int); void vioscsi_scsi_cmd(struct scsi_xfer *); int vioscsi_vq_done(struct virtqueue *); void vioscsi_req_done(struct vioscsi_softc *, struct virtio_softc *, struct vioscsi_req *); void *vioscsi_req_get(void *); void vioscsi_req_put(void *, void *); const struct cfattach vioscsi_ca = { sizeof(struct vioscsi_softc), vioscsi_match, vioscsi_attach, }; struct cfdriver vioscsi_cd = { NULL, "vioscsi", DV_DULL, }; const struct scsi_adapter vioscsi_switch = { vioscsi_scsi_cmd, NULL, NULL, NULL, NULL }; const char *const vioscsi_vq_names[] = { "control", "event", "request", }; int vioscsi_match(struct device *parent, void *self, void *aux) { struct virtio_softc *va = (struct virtio_softc *)aux; if (va->sc_childdevid == PCI_PRODUCT_VIRTIO_SCSI) return (1); return (0); } void vioscsi_attach(struct device *parent, struct device *self, void *aux) { struct virtio_softc *vsc = (struct virtio_softc *)parent; struct vioscsi_softc *sc = (struct vioscsi_softc *)self; struct scsibus_attach_args saa; int i, rv; if (vsc->sc_child != NULL) { printf(": parent already has a child\n"); return; } vsc->sc_child = 
&sc->sc_dev; vsc->sc_ipl = IPL_BIO; // TODO(matthew): Negotiate hotplug. vsc->sc_vqs = sc->sc_vqs; vsc->sc_nvqs = nitems(sc->sc_vqs); virtio_negotiate_features(vsc, NULL); uint32_t cmd_per_lun = virtio_read_device_config_4(vsc, VIRTIO_SCSI_CONFIG_CMD_PER_LUN); uint32_t seg_max = virtio_read_device_config_4(vsc, VIRTIO_SCSI_CONFIG_SEG_MAX); uint16_t max_target = virtio_read_device_config_2(vsc, VIRTIO_SCSI_CONFIG_MAX_TARGET); if (seg_max < SEG_MAX) { printf("\nMax number of segments %d too small\n", seg_max); goto err; } for (i = 0; i < nitems(sc->sc_vqs); i++) { rv = virtio_alloc_vq(vsc, &sc->sc_vqs[i], i, MAXPHYS, ALLOC_SEGS, vioscsi_vq_names[i]); if (rv) { printf(": failed to allocate virtqueue %d\n", i); goto err; } sc->sc_vqs[i].vq_done = vioscsi_vq_done; } int qsize = sc->sc_vqs[2].vq_num; printf(": qsize %d\n", qsize); SLIST_INIT(&sc->sc_freelist); mtx_init(&sc->sc_vr_mtx, IPL_BIO); scsi_iopool_init(&sc->sc_iopool, sc, vioscsi_req_get, vioscsi_req_put); int nreqs = vioscsi_alloc_reqs(sc, vsc, qsize); if (nreqs == 0) { printf("\nCan't alloc reqs\n"); goto err; } saa.saa_adapter = &vioscsi_switch; saa.saa_adapter_softc = sc; saa.saa_adapter_target = SDEV_NO_ADAPTER_TARGET; saa.saa_adapter_buswidth = max_target; saa.saa_luns = 8; saa.saa_openings = (nreqs > cmd_per_lun) ? cmd_per_lun : nreqs; saa.saa_pool = &sc->sc_iopool; saa.saa_quirks = saa.saa_flags = 0; saa.saa_wwpn = saa.saa_wwnn = 0; config_found(self, &saa, scsiprint); return; err: vsc->sc_child = VIRTIO_CHILD_ERROR; return; } void vioscsi_scsi_cmd(struct scsi_xfer *xs) { struct vioscsi_softc *sc = xs->sc_link->bus->sb_adapter_softc; struct virtio_softc *vsc = (struct virtio_softc *)sc->sc_dev.dv_parent; struct vioscsi_req *vr = xs->io; struct virtio_scsi_req_hdr *req = &vr->vr_req; struct virtqueue *vq = &sc->sc_vqs[2]; int slot = vr->vr_qe_index; DPRINTF("vioscsi_scsi_cmd: enter\n"); // TODO(matthew): Support bidirectional SCSI commands? 
if ((xs->flags & (SCSI_DATA_IN | SCSI_DATA_OUT)) == (SCSI_DATA_IN | SCSI_DATA_OUT)) { goto stuffup; } vr->vr_xs = xs; /* * "The only supported format for the LUN field is: first byte set to * 1, second byte set to target, third and fourth byte representing a * single level LUN structure, followed by four zero bytes." */ if (xs->sc_link->target >= 256 || xs->sc_link->lun >= 16384) goto stuffup; req->lun[0] = 1; req->lun[1] = xs->sc_link->target; req->lun[2] = 0x40 | (xs->sc_link->lun >> 8); req->lun[3] = xs->sc_link->lun; memset(req->lun + 4, 0, 4); if ((size_t)xs->cmdlen > sizeof(req->cdb)) goto stuffup; memset(req->cdb, 0, sizeof(req->cdb)); memcpy(req->cdb, &xs->cmd, xs->cmdlen); int isread = !!(xs->flags & SCSI_DATA_IN); int nsegs = 2; if (xs->flags & (SCSI_DATA_IN | SCSI_DATA_OUT)) { if (bus_dmamap_load(vsc->sc_dmat, vr->vr_data, xs->data, xs->datalen, NULL, ((isread ? BUS_DMA_READ : BUS_DMA_WRITE) | BUS_DMA_NOWAIT))) goto stuffup; nsegs += vr->vr_data->dm_nsegs; } /* * Adjust reservation to the number needed, or virtio gets upset. Note * that it may trim UP if 'xs' is being recycled w/o getting a new * reservation! */ int s = splbio(); virtio_enqueue_trim(vq, slot, nsegs); splx(s); bus_dmamap_sync(vsc->sc_dmat, vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), BUS_DMASYNC_PREWRITE); bus_dmamap_sync(vsc->sc_dmat, vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), BUS_DMASYNC_PREREAD); if (xs->flags & (SCSI_DATA_IN | SCSI_DATA_OUT)) bus_dmamap_sync(vsc->sc_dmat, vr->vr_data, 0, xs->datalen, isread ? 
BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE); s = splbio(); virtio_enqueue_p(vq, slot, vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), 1); if (xs->flags & SCSI_DATA_OUT) virtio_enqueue(vq, slot, vr->vr_data, 1); virtio_enqueue_p(vq, slot, vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), 0); if (xs->flags & SCSI_DATA_IN) virtio_enqueue(vq, slot, vr->vr_data, 0); virtio_enqueue_commit(vsc, vq, slot, 1); if (ISSET(xs->flags, SCSI_POLL)) { DPRINTF("vioscsi_scsi_cmd: polling...\n"); int timeout = 1000; do { virtio_poll_intr(vsc); if (vr->vr_xs != xs) break; delay(1000); } while (--timeout > 0); if (vr->vr_xs == xs) { // TODO(matthew): Abort the request. xs->error = XS_TIMEOUT; xs->resid = xs->datalen; DPRINTF("vioscsi_scsi_cmd: polling timeout\n"); scsi_done(xs); } DPRINTF("vioscsi_scsi_cmd: done (timeout=%d)\n", timeout); } splx(s); return; stuffup: xs->error = XS_DRIVER_STUFFUP; xs->resid = xs->datalen; DPRINTF("vioscsi_scsi_cmd: stuffup\n"); scsi_done(xs); } void vioscsi_req_done(struct vioscsi_softc *sc, struct virtio_softc *vsc, struct vioscsi_req *vr) { struct scsi_xfer *xs = vr->vr_xs; DPRINTF("vioscsi_req_done: enter vr: %p xs: %p\n", vr, xs); int isread = !!(xs->flags & SCSI_DATA_IN); bus_dmamap_sync(vsc->sc_dmat, vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), BUS_DMASYNC_POSTWRITE); bus_dmamap_sync(vsc->sc_dmat, vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), BUS_DMASYNC_POSTREAD); if (xs->flags & (SCSI_DATA_IN | SCSI_DATA_OUT)) { bus_dmamap_sync(vsc->sc_dmat, vr->vr_data, 0, xs->datalen, isread ? 
BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(vsc->sc_dmat, vr->vr_data); } if (vr->vr_res.response != VIRTIO_SCSI_S_OK) { xs->error = XS_DRIVER_STUFFUP; xs->resid = xs->datalen; DPRINTF("vioscsi_req_done: stuffup: %d\n", vr->vr_res.response); goto done; } size_t sense_len = MIN(sizeof(xs->sense), vr->vr_res.sense_len); memcpy(&xs->sense, vr->vr_res.sense, sense_len); xs->error = (sense_len == 0) ? XS_NOERROR : XS_SENSE; xs->status = vr->vr_res.status; xs->resid = vr->vr_res.residual; DPRINTF("vioscsi_req_done: done %d, %d, %zd\n", xs->error, xs->status, xs->resid); done: vr->vr_xs = NULL; scsi_done(xs); } int vioscsi_vq_done(struct virtqueue *vq) { struct virtio_softc *vsc = vq->vq_owner; struct vioscsi_softc *sc = (struct vioscsi_softc *)vsc->sc_child; struct vq_entry *qe; struct vioscsi_req *vr; int ret = 0; DPRINTF("vioscsi_vq_done: enter\n"); for (;;) { int r, s, slot; s = splbio(); r = virtio_dequeue(vsc, vq, &slot, NULL); splx(s); if (r != 0) break; DPRINTF("vioscsi_vq_done: slot=%d\n", slot); qe = &vq->vq_entries[slot]; vr = &sc->sc_reqs[qe->qe_vr_index]; vioscsi_req_done(sc, vsc, vr); ret = 1; } DPRINTF("vioscsi_vq_done: exit %d\n", ret); return (ret); } /* * vioscso_req_get() provides the SCSI layer with all the * resources necessary to start an I/O on the device. * * Since the size of the I/O is unknown at this time the * resources allocated (a.k.a. reserved) must be sufficient * to allow the maximum possible I/O size. * * When the I/O is actually attempted via vioscsi_scsi_cmd() * excess resources will be returned via virtio_enqueue_trim(). 
*/ void * vioscsi_req_get(void *cookie) { struct vioscsi_softc *sc = cookie; struct vioscsi_req *vr = NULL; mtx_enter(&sc->sc_vr_mtx); vr = SLIST_FIRST(&sc->sc_freelist); if (vr != NULL) SLIST_REMOVE_HEAD(&sc->sc_freelist, vr_list); mtx_leave(&sc->sc_vr_mtx); DPRINTF("vioscsi_req_get: %p\n", vr); return (vr); } void vioscsi_req_put(void *cookie, void *io) { struct vioscsi_softc *sc = cookie; struct vioscsi_req *vr = io; DPRINTF("vioscsi_req_put: %p\n", vr); mtx_enter(&sc->sc_vr_mtx); /* * Do *NOT* call virtio_dequeue_commit()! * * Descriptors are permanently associated with the vioscsi_req and * should not be placed on the free list! */ SLIST_INSERT_HEAD(&sc->sc_freelist, vr, vr_list); mtx_leave(&sc->sc_vr_mtx); } int vioscsi_alloc_reqs(struct vioscsi_softc *sc, struct virtio_softc *vsc, int qsize) { struct virtqueue *vq = &sc->sc_vqs[2]; struct vioscsi_req *vr; struct vring_desc *vd; size_t allocsize; int i, r, nreqs, rsegs, slot; void *vaddr; if (vq->vq_indirect != NULL) nreqs = qsize; else nreqs = qsize / ALLOC_SEGS; allocsize = nreqs * sizeof(struct vioscsi_req); r = bus_dmamem_alloc(vsc->sc_dmat, allocsize, 0, 0, &sc->sc_reqs_segs[0], 1, &rsegs, BUS_DMA_NOWAIT); if (r != 0) { printf("bus_dmamem_alloc, size %zd, error %d\n", allocsize, r); return 0; } r = bus_dmamem_map(vsc->sc_dmat, &sc->sc_reqs_segs[0], 1, allocsize, (caddr_t *)&vaddr, BUS_DMA_NOWAIT); if (r != 0) { printf("bus_dmamem_map failed, error %d\n", r); bus_dmamem_free(vsc->sc_dmat, &sc->sc_reqs_segs[0], 1); return 0; } sc->sc_reqs = vaddr; memset(vaddr, 0, allocsize); for (i = 0; i < nreqs; i++) { /* * Assign descriptors and create the DMA maps for each * allocated request. */ vr = &sc->sc_reqs[i]; r = virtio_enqueue_prep(vq, &slot); if (r == 0) r = virtio_enqueue_reserve(vq, slot, ALLOC_SEGS); if (r != 0) return i; if (vq->vq_indirect == NULL) { /* * The reserved slots must be a contiguous block * starting at vq_desc[slot]. 
*/ vd = &vq->vq_desc[slot]; for (r = 0; r < ALLOC_SEGS - 1; r++) { DPRINTF("vd[%d].next = %d should be %d\n", r, vd[r].next, (slot + r + 1)); if (vd[r].next != (slot + r + 1)) return i; } if (r == (ALLOC_SEGS -1) && vd[r].next != 0) return i; DPRINTF("Reserved slots are contiguous as required!\n"); } vr->vr_qe_index = slot; vr->vr_req.id = slot; vr->vr_req.task_attr = VIRTIO_SCSI_S_SIMPLE; vq->vq_entries[slot].qe_vr_index = i; r = bus_dmamap_create(vsc->sc_dmat, offsetof(struct vioscsi_req, vr_xs), 1, offsetof(struct vioscsi_req, vr_xs), 0, BUS_DMA_NOWAIT|BUS_DMA_ALLOCNOW, &vr->vr_control); if (r != 0) { printf("bus_dmamap_create vr_control failed, error %d\n", r); return i; } r = bus_dmamap_create(vsc->sc_dmat, MAXPHYS, SEG_MAX, MAXPHYS, 0, BUS_DMA_NOWAIT|BUS_DMA_ALLOCNOW, &vr->vr_data); if (r != 0) { printf("bus_dmamap_create vr_data failed, error %d\n", r ); return i; } r = bus_dmamap_load(vsc->sc_dmat, vr->vr_control, vr, offsetof(struct vioscsi_req, vr_xs), NULL, BUS_DMA_NOWAIT); if (r != 0) { printf("bus_dmamap_load vr_control failed, error %d\n", r ); return i; } SLIST_INSERT_HEAD(&sc->sc_freelist, vr, vr_list); } return nreqs; }
15 2 9 4 11 6 8 5 16 11 5 1 12 8 1 1 7 2 8 1 7 7 884 883 15 15 18 17 1138 1137 25 25 478 476 1887 1879 989 988 466 468 134 135 12 179 178 134 134 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 
491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 
991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 
1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 /* $OpenBSD: kern_pledge.c,v 1.295 2022/09/05 16:37:47 mbuhl Exp $ */ /* * Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org> * Copyright (c) 2015 Theo de Raadt <deraadt@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/param.h> #include <sys/mount.h> #include <sys/proc.h> #include <sys/mutex.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/namei.h> #include <sys/socketvar.h> #include <sys/vnode.h> #include <sys/mman.h> #include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/ktrace.h> #include <sys/acct.h> #include <sys/swap.h> #include <sys/ioctl.h> #include <sys/termios.h> #include <sys/tty.h> #include <sys/device.h> #include <sys/disklabel.h> #include <sys/dkio.h> #include <sys/mtio.h> #include <sys/audioio.h> #include <sys/videoio.h> #include <net/bpf.h> #include <net/route.h> #include <net/if.h> #include <net/if_var.h> #include <netinet/in.h> #include <netinet6/in6_var.h> #include <netinet6/nd6.h> #include <netinet/tcp.h> #include <net/pfvar.h> #include <sys/conf.h> #include <sys/specdev.h> #include <sys/signal.h> #include <sys/signalvar.h> #include <sys/syscall.h> #include <sys/syscallargs.h> #include <sys/systm.h> #include <dev/biovar.h> #define PLEDGENAMES #include <sys/pledge.h> #include "audio.h" #include "bpfilter.h" #include "pf.h" #include "video.h" #include "pty.h" #if defined(__amd64__) #include "vmm.h" #if NVMM > 0 #include <machine/conf.h> #endif #endif #include "drm.h" uint64_t pledgereq_flags(const char *req); int parsepledges(struct proc *p, const char *kname, const char *promises, u_int64_t *fp); int canonpath(const char *input, char *buf, size_t bufsize); void unveil_destroy(struct process *ps); /* #define DEBUG_PLEDGE */ #ifdef DEBUG_PLEDGE int debug_pledge = 1; #define DPRINTF(x...) do { if (debug_pledge) printf(x); } while (0) #define DNPRINTF(n,x...) 
do { if (debug_pledge >= (n)) printf(x); } while (0) #else #define DPRINTF(x...) #define DNPRINTF(n,x...) #endif /* * Ordered in blocks starting with least risky and most required. */ const uint64_t pledge_syscalls[SYS_MAXSYSCALL] = { /* * Minimum required */ [SYS_exit] = PLEDGE_ALWAYS, [SYS_kbind] = PLEDGE_ALWAYS, [SYS_msyscall] = PLEDGE_ALWAYS, [SYS___get_tcb] = PLEDGE_ALWAYS, [SYS___set_tcb] = PLEDGE_ALWAYS, [SYS_pledge] = PLEDGE_ALWAYS, [SYS_sendsyslog] = PLEDGE_ALWAYS, /* stack protector reporting */ [SYS_thrkill] = PLEDGE_ALWAYS, /* raise, abort, stack pro */ [SYS_utrace] = PLEDGE_ALWAYS, /* ltrace(1) from ld.so */ /* "getting" information about self is considered safe */ [SYS_getuid] = PLEDGE_STDIO, [SYS_geteuid] = PLEDGE_STDIO, [SYS_getresuid] = PLEDGE_STDIO, [SYS_getgid] = PLEDGE_STDIO, [SYS_getegid] = PLEDGE_STDIO, [SYS_getresgid] = PLEDGE_STDIO, [SYS_getgroups] = PLEDGE_STDIO, [SYS_getlogin_r] = PLEDGE_STDIO, [SYS_getpgrp] = PLEDGE_STDIO, [SYS_getpgid] = PLEDGE_STDIO, [SYS_getppid] = PLEDGE_STDIO, [SYS_getsid] = PLEDGE_STDIO, [SYS_getthrid] = PLEDGE_STDIO, [SYS_getrlimit] = PLEDGE_STDIO, [SYS_getrtable] = PLEDGE_STDIO, [SYS_gettimeofday] = PLEDGE_STDIO, [SYS_getdtablecount] = PLEDGE_STDIO, [SYS_getrusage] = PLEDGE_STDIO, [SYS_issetugid] = PLEDGE_STDIO, [SYS_clock_getres] = PLEDGE_STDIO, [SYS_clock_gettime] = PLEDGE_STDIO, [SYS_getpid] = PLEDGE_STDIO, /* * Almost exclusively read-only, Very narrow subset. * Use of "route", "inet", "dns", "ps", or "vminfo" * expands access. 
*/ [SYS_sysctl] = PLEDGE_STDIO, /* Support for malloc(3) family of operations */ [SYS_getentropy] = PLEDGE_STDIO, [SYS_madvise] = PLEDGE_STDIO, [SYS_minherit] = PLEDGE_STDIO, [SYS_mmap] = PLEDGE_STDIO, [SYS_mprotect] = PLEDGE_STDIO, [SYS_mquery] = PLEDGE_STDIO, [SYS_munmap] = PLEDGE_STDIO, [SYS_msync] = PLEDGE_STDIO, [SYS_break] = PLEDGE_STDIO, [SYS_umask] = PLEDGE_STDIO, /* read/write operations */ [SYS_read] = PLEDGE_STDIO, [SYS_readv] = PLEDGE_STDIO, [SYS_pread] = PLEDGE_STDIO, [SYS_preadv] = PLEDGE_STDIO, [SYS_write] = PLEDGE_STDIO, [SYS_writev] = PLEDGE_STDIO, [SYS_pwrite] = PLEDGE_STDIO, [SYS_pwritev] = PLEDGE_STDIO, [SYS_recvmsg] = PLEDGE_STDIO, [SYS_recvmmsg] = PLEDGE_STDIO, [SYS_recvfrom] = PLEDGE_STDIO, [SYS_ftruncate] = PLEDGE_STDIO, [SYS_lseek] = PLEDGE_STDIO, [SYS_fpathconf] = PLEDGE_STDIO, #if 1 [SYS_pad_mquery] = PLEDGE_STDIO, [SYS_pad_mmap] = PLEDGE_STDIO, [SYS_pad_pread] = PLEDGE_STDIO, [SYS_pad_preadv] = PLEDGE_STDIO, [SYS_pad_pwrite] = PLEDGE_STDIO, [SYS_pad_pwritev] = PLEDGE_STDIO, [SYS_pad_ftruncate] = PLEDGE_STDIO, [SYS_pad_lseek] = PLEDGE_STDIO, [SYS_pad_truncate] = PLEDGE_WPATH, #endif /* * Address selection required a network pledge ("inet", * "unix", "dns". */ [SYS_sendto] = PLEDGE_STDIO, /* * Address specification required a network pledge ("inet", * "unix", "dns". SCM_RIGHTS requires "sendfd" or "recvfd". */ [SYS_sendmsg] = PLEDGE_STDIO, [SYS_sendmmsg] = PLEDGE_STDIO, /* Common signal operations */ [SYS_nanosleep] = PLEDGE_STDIO, [SYS_sigaltstack] = PLEDGE_STDIO, [SYS_sigprocmask] = PLEDGE_STDIO, [SYS_sigsuspend] = PLEDGE_STDIO, [SYS_sigaction] = PLEDGE_STDIO, [SYS_sigreturn] = PLEDGE_STDIO, [SYS_sigpending] = PLEDGE_STDIO, [SYS_getitimer] = PLEDGE_STDIO, [SYS_setitimer] = PLEDGE_STDIO, /* * To support event driven programming. 
*/ [SYS_poll] = PLEDGE_STDIO, [SYS_ppoll] = PLEDGE_STDIO, [SYS_kevent] = PLEDGE_STDIO, [SYS_kqueue] = PLEDGE_STDIO, [SYS_select] = PLEDGE_STDIO, [SYS_pselect] = PLEDGE_STDIO, [SYS_fstat] = PLEDGE_STDIO, [SYS_fsync] = PLEDGE_STDIO, [SYS_setsockopt] = PLEDGE_STDIO, /* narrow whitelist */ [SYS_getsockopt] = PLEDGE_STDIO, /* narrow whitelist */ /* F_SETOWN requires PLEDGE_PROC */ [SYS_fcntl] = PLEDGE_STDIO, [SYS_close] = PLEDGE_STDIO, [SYS_dup] = PLEDGE_STDIO, [SYS_dup2] = PLEDGE_STDIO, [SYS_dup3] = PLEDGE_STDIO, [SYS_closefrom] = PLEDGE_STDIO, [SYS_shutdown] = PLEDGE_STDIO, [SYS_fchdir] = PLEDGE_STDIO, /* XXX consider tightening */ [SYS_pipe] = PLEDGE_STDIO, [SYS_pipe2] = PLEDGE_STDIO, [SYS_socketpair] = PLEDGE_STDIO, [SYS_wait4] = PLEDGE_STDIO, /* * Can kill self with "stdio". Killing another pid * requires "proc" */ [SYS_kill] = PLEDGE_STDIO, /* * FIONREAD/FIONBIO for "stdio" * Other ioctl are selectively allowed based upon other pledges. */ [SYS_ioctl] = PLEDGE_STDIO, /* * Path access/creation calls encounter many extensive * checks done during pledge_namei() */ [SYS_open] = PLEDGE_STDIO, [SYS_stat] = PLEDGE_STDIO, [SYS_access] = PLEDGE_STDIO, [SYS_readlink] = PLEDGE_STDIO, [SYS___realpath] = PLEDGE_STDIO, [SYS_adjtime] = PLEDGE_STDIO, /* setting requires "settime" */ [SYS_adjfreq] = PLEDGE_SETTIME, [SYS_settimeofday] = PLEDGE_SETTIME, /* * Needed by threaded programs * XXX should we have a new "threads"? 
*/ [SYS___tfork] = PLEDGE_STDIO, [SYS_sched_yield] = PLEDGE_STDIO, [SYS_futex] = PLEDGE_STDIO, [SYS___thrsleep] = PLEDGE_STDIO, [SYS___thrwakeup] = PLEDGE_STDIO, [SYS___threxit] = PLEDGE_STDIO, [SYS___thrsigdivert] = PLEDGE_STDIO, [SYS_fork] = PLEDGE_PROC, [SYS_vfork] = PLEDGE_PROC, [SYS_setpgid] = PLEDGE_PROC, [SYS_setsid] = PLEDGE_PROC, [SYS_setrlimit] = PLEDGE_PROC | PLEDGE_ID, [SYS_getpriority] = PLEDGE_PROC | PLEDGE_ID, [SYS_setpriority] = PLEDGE_PROC | PLEDGE_ID, [SYS_setuid] = PLEDGE_ID, [SYS_seteuid] = PLEDGE_ID, [SYS_setreuid] = PLEDGE_ID, [SYS_setresuid] = PLEDGE_ID, [SYS_setgid] = PLEDGE_ID, [SYS_setegid] = PLEDGE_ID, [SYS_setregid] = PLEDGE_ID, [SYS_setresgid] = PLEDGE_ID, [SYS_setgroups] = PLEDGE_ID, [SYS_setlogin] = PLEDGE_ID, [SYS_setrtable] = PLEDGE_ID, [SYS_unveil] = PLEDGE_UNVEIL, [SYS_execve] = PLEDGE_EXEC, [SYS_chdir] = PLEDGE_RPATH, [SYS_openat] = PLEDGE_RPATH | PLEDGE_WPATH, [SYS_fstatat] = PLEDGE_RPATH | PLEDGE_WPATH, [SYS_faccessat] = PLEDGE_RPATH | PLEDGE_WPATH, [SYS_readlinkat] = PLEDGE_RPATH | PLEDGE_WPATH, [SYS_lstat] = PLEDGE_RPATH | PLEDGE_WPATH | PLEDGE_TMPPATH, [SYS_truncate] = PLEDGE_WPATH, [SYS_rename] = PLEDGE_RPATH | PLEDGE_CPATH, [SYS_rmdir] = PLEDGE_CPATH, [SYS_renameat] = PLEDGE_CPATH, [SYS_link] = PLEDGE_CPATH, [SYS_linkat] = PLEDGE_CPATH, [SYS_symlink] = PLEDGE_CPATH, [SYS_symlinkat] = PLEDGE_CPATH, [SYS_unlink] = PLEDGE_CPATH | PLEDGE_TMPPATH, [SYS_unlinkat] = PLEDGE_CPATH, [SYS_mkdir] = PLEDGE_CPATH, [SYS_mkdirat] = PLEDGE_CPATH, [SYS_mkfifo] = PLEDGE_DPATH, [SYS_mkfifoat] = PLEDGE_DPATH, [SYS_mknod] = PLEDGE_DPATH, [SYS_mknodat] = PLEDGE_DPATH, [SYS_revoke] = PLEDGE_TTY, /* also requires PLEDGE_RPATH */ /* * Classify as RPATH|WPATH, because of path information leakage. * WPATH due to unknown use of mk*temp(3) on non-/tmp paths.. 
*/ [SYS___getcwd] = PLEDGE_RPATH | PLEDGE_WPATH, /* Classify as RPATH, because these leak path information */ [SYS_getdents] = PLEDGE_RPATH, [SYS_getfsstat] = PLEDGE_RPATH, [SYS_statfs] = PLEDGE_RPATH, [SYS_fstatfs] = PLEDGE_RPATH, [SYS_pathconf] = PLEDGE_RPATH, [SYS_utimes] = PLEDGE_FATTR, [SYS_futimes] = PLEDGE_FATTR, [SYS_utimensat] = PLEDGE_FATTR, [SYS_futimens] = PLEDGE_FATTR, [SYS_chmod] = PLEDGE_FATTR, [SYS_fchmod] = PLEDGE_FATTR, [SYS_fchmodat] = PLEDGE_FATTR, [SYS_chflags] = PLEDGE_FATTR, [SYS_chflagsat] = PLEDGE_FATTR, [SYS_fchflags] = PLEDGE_FATTR, [SYS_chown] = PLEDGE_CHOWN, [SYS_fchownat] = PLEDGE_CHOWN, [SYS_lchown] = PLEDGE_CHOWN, [SYS_fchown] = PLEDGE_CHOWN, [SYS_socket] = PLEDGE_INET | PLEDGE_UNIX | PLEDGE_DNS, [SYS_connect] = PLEDGE_INET | PLEDGE_UNIX | PLEDGE_DNS, [SYS_bind] = PLEDGE_INET | PLEDGE_UNIX | PLEDGE_DNS, [SYS_getsockname] = PLEDGE_INET | PLEDGE_UNIX | PLEDGE_DNS, [SYS_listen] = PLEDGE_INET | PLEDGE_UNIX, [SYS_accept4] = PLEDGE_INET | PLEDGE_UNIX, [SYS_accept] = PLEDGE_INET | PLEDGE_UNIX, [SYS_getpeername] = PLEDGE_INET | PLEDGE_UNIX, [SYS_flock] = PLEDGE_FLOCK, [SYS_ypconnect] = PLEDGE_GETPW, [SYS_swapctl] = PLEDGE_VMINFO, }; static const struct { char *name; uint64_t flags; } pledgereq[] = { { "audio", PLEDGE_AUDIO }, { "bpf", PLEDGE_BPF }, { "chown", PLEDGE_CHOWN | PLEDGE_CHOWNUID }, { "cpath", PLEDGE_CPATH }, { "disklabel", PLEDGE_DISKLABEL }, { "dns", PLEDGE_DNS }, { "dpath", PLEDGE_DPATH }, { "drm", PLEDGE_DRM }, { "error", PLEDGE_ERROR }, { "exec", PLEDGE_EXEC }, { "fattr", PLEDGE_FATTR | PLEDGE_CHOWN }, { "flock", PLEDGE_FLOCK }, { "getpw", PLEDGE_GETPW }, { "id", PLEDGE_ID }, { "inet", PLEDGE_INET }, { "mcast", PLEDGE_MCAST }, { "pf", PLEDGE_PF }, { "proc", PLEDGE_PROC }, { "prot_exec", PLEDGE_PROTEXEC }, { "ps", PLEDGE_PS }, { "recvfd", PLEDGE_RECVFD }, { "route", PLEDGE_ROUTE }, { "rpath", PLEDGE_RPATH }, { "sendfd", PLEDGE_SENDFD }, { "settime", PLEDGE_SETTIME }, { "stdio", PLEDGE_STDIO }, { "tape", PLEDGE_TAPE }, { 
"tmppath", PLEDGE_TMPPATH }, { "tty", PLEDGE_TTY }, { "unix", PLEDGE_UNIX }, { "unveil", PLEDGE_UNVEIL }, { "video", PLEDGE_VIDEO }, { "vminfo", PLEDGE_VMINFO }, { "vmm", PLEDGE_VMM }, { "wpath", PLEDGE_WPATH }, { "wroute", PLEDGE_WROUTE }, }; int parsepledges(struct proc *p, const char *kname, const char *promises, u_int64_t *fp) { size_t rbuflen; char *rbuf, *rp, *pn; u_int64_t flags = 0, f; int error; rbuf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(promises, rbuf, MAXPATHLEN, &rbuflen); if (error) { free(rbuf, M_TEMP, MAXPATHLEN); return (error); } #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrstruct(p, kname, rbuf, rbuflen-1); #endif for (rp = rbuf; rp && *rp; rp = pn) { pn = strchr(rp, ' '); /* find terminator */ if (pn) { while (*pn == ' ') *pn++ = '\0'; } if ((f = pledgereq_flags(rp)) == 0) { free(rbuf, M_TEMP, MAXPATHLEN); return (EINVAL); } flags |= f; } free(rbuf, M_TEMP, MAXPATHLEN); *fp = flags; return 0; } int sys_pledge(struct proc *p, void *v, register_t *retval) { struct sys_pledge_args /* { syscallarg(const char *)promises; syscallarg(const char *)execpromises; } */ *uap = v; struct process *pr = p->p_p; uint64_t promises, execpromises; int error; int unveil_cleanup = 0; /* Check for any error in user input */ if (SCARG(uap, promises)) { error = parsepledges(p, "pledgereq", SCARG(uap, promises), &promises); if (error) return (error); } if (SCARG(uap, execpromises)) { error = parsepledges(p, "pledgeexecreq", SCARG(uap, execpromises), &execpromises); if (error) return (error); } mtx_enter(&pr->ps_mtx); /* Check for any error wrt current promises */ if (SCARG(uap, promises)) { /* In "error" mode, ignore promise increase requests, * but accept promise decrease requests */ if (ISSET(pr->ps_flags, PS_PLEDGE) && (pr->ps_pledge & PLEDGE_ERROR)) promises &= (pr->ps_pledge & PLEDGE_USERSET); /* Only permit reductions */ if (ISSET(pr->ps_flags, PS_PLEDGE) && (((promises | pr->ps_pledge) != pr->ps_pledge))) { mtx_leave(&pr->ps_mtx); return 
(EPERM); } } if (SCARG(uap, execpromises)) { /* Only permit reductions */ if (ISSET(pr->ps_flags, PS_EXECPLEDGE) && (((execpromises | pr->ps_execpledge) != pr->ps_execpledge))) { mtx_leave(&pr->ps_mtx); return (EPERM); } } /* Set up promises */ if (SCARG(uap, promises)) { pr->ps_pledge = promises; atomic_setbits_int(&pr->ps_flags, PS_PLEDGE); if ((pr->ps_pledge & (PLEDGE_RPATH | PLEDGE_WPATH | PLEDGE_CPATH | PLEDGE_DPATH | PLEDGE_TMPPATH | PLEDGE_EXEC | PLEDGE_UNIX | PLEDGE_UNVEIL)) == 0) unveil_cleanup = 1; } if (SCARG(uap, execpromises)) { pr->ps_execpledge = execpromises; atomic_setbits_int(&pr->ps_flags, PS_EXECPLEDGE); } mtx_leave(&pr->ps_mtx); if (unveil_cleanup) { /* * Kill off unveil and drop unveil vnode refs if we no * longer are holding any path-accessing pledge */ KERNEL_LOCK(); unveil_destroy(pr); KERNEL_UNLOCK(); } return (0); } int pledge_syscall(struct proc *p, int code, uint64_t *tval) { p->p_pledge_syscall = code; *tval = 0; if (code < 0 || code > SYS_MAXSYSCALL - 1) return (EINVAL); if (pledge_syscalls[code] == PLEDGE_ALWAYS) return (0); if (p->p_p->ps_pledge & pledge_syscalls[code]) return (0); *tval = pledge_syscalls[code]; return (EPERM); } int pledge_fail(struct proc *p, int error, uint64_t code) { const char *codes = ""; int i; /* Print first matching pledge */ for (i = 0; code && pledgenames[i].bits != 0; i++) if (pledgenames[i].bits & code) { codes = pledgenames[i].name; break; } #ifdef KTRACE if (KTRPOINT(p, KTR_PLEDGE)) ktrpledge(p, error, code, p->p_pledge_syscall); #endif if (p->p_p->ps_pledge & PLEDGE_ERROR) return (ENOSYS); KERNEL_LOCK(); log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n", p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall); p->p_p->ps_acflag |= APLEDGE; /* Stop threads immediately, because this process is suspect */ if (P_HASSIBLING(p)) single_thread_set(p, SINGLE_SUSPEND, 1); /* Send uncatchable SIGABRT for coredump */ sigabort(p); p->p_p->ps_pledge = 0; /* Disable all PLEDGE_ flags */ KERNEL_UNLOCK(); 
return (error); } /* * Need to make it more obvious that one cannot get through here * without the right flags set */ int pledge_namei(struct proc *p, struct nameidata *ni, char *origpath) { char path[PATH_MAX]; uint64_t pledge; int error; if ((p->p_p->ps_flags & PS_PLEDGE) == 0 || (p->p_p->ps_flags & PS_COREDUMP)) return (0); pledge = READ_ONCE(p->p_p->ps_pledge); if (ni->ni_pledge == 0) panic("pledge_namei: ni_pledge"); /* * We set the BYPASSUNVEIL flag to skip unveil checks * as necessary */ /* Doing a permitted execve() */ if ((ni->ni_pledge & PLEDGE_EXEC) && (pledge & PLEDGE_EXEC)) return (0); error = canonpath(origpath, path, sizeof(path)); if (error) return (error); /* Detect what looks like a mkstemp(3) family operation */ if ((pledge & PLEDGE_TMPPATH) && (p->p_pledge_syscall == SYS_open) && (ni->ni_pledge & PLEDGE_CPATH) && strncmp(path, "/tmp/", sizeof("/tmp/") - 1) == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } /* Allow unlinking of a mkstemp(3) file... * Good opportunity for strict checks here. */ if ((pledge & PLEDGE_TMPPATH) && (p->p_pledge_syscall == SYS_unlink) && strncmp(path, "/tmp/", sizeof("/tmp/") - 1) == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } /* Whitelisted paths */ switch (p->p_pledge_syscall) { case SYS_access: /* tzset() needs this. 
*/ if (ni->ni_pledge == PLEDGE_RPATH && strcmp(path, "/etc/localtime") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } break; case SYS_open: /* daemon(3) or other such functions */ if ((ni->ni_pledge & ~(PLEDGE_RPATH | PLEDGE_WPATH)) == 0 && strcmp(path, "/dev/null") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } /* readpassphrase(3), getpass(3) */ if ((pledge & PLEDGE_TTY) && (ni->ni_pledge & ~(PLEDGE_RPATH | PLEDGE_WPATH)) == 0 && strcmp(path, "/dev/tty") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } /* getpw* and friends need a few files */ if ((ni->ni_pledge == PLEDGE_RPATH) && (pledge & PLEDGE_GETPW)) { if (strcmp(path, "/etc/spwd.db") == 0) return (EPERM); /* don't call pledge_fail */ if (strcmp(path, "/etc/pwd.db") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } if (strcmp(path, "/etc/group") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } if (strcmp(path, "/etc/netid") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } } /* DNS needs /etc/{resolv.conf,hosts,services,protocols}. */ if ((ni->ni_pledge == PLEDGE_RPATH) && (pledge & PLEDGE_DNS)) { if (strcmp(path, "/etc/resolv.conf") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } if (strcmp(path, "/etc/hosts") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } if (strcmp(path, "/etc/services") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } if (strcmp(path, "/etc/protocols") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } } /* tzset() needs these. */ if ((ni->ni_pledge == PLEDGE_RPATH) && strncmp(path, "/usr/share/zoneinfo/", sizeof("/usr/share/zoneinfo/") - 1) == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } if ((ni->ni_pledge == PLEDGE_RPATH) && strcmp(path, "/etc/localtime") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } break; case SYS_stat: /* DNS needs /etc/{resolv.conf,hosts}. 
*/ if ((ni->ni_pledge == PLEDGE_RPATH) && (pledge & PLEDGE_DNS)) { if (strcmp(path, "/etc/resolv.conf") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } if (strcmp(path, "/etc/hosts") == 0) { ni->ni_cnd.cn_flags |= BYPASSUNVEIL; return (0); } } break; } /* * Ensure each flag of ni_pledge has counterpart allowing it in * ps_pledge. */ if (ni->ni_pledge & ~pledge) return (pledge_fail(p, EPERM, (ni->ni_pledge & ~pledge))); /* continue, and check unveil if present */ return (0); } /* * Only allow reception of safe file descriptors. */ int pledge_recvfd(struct proc *p, struct file *fp) { struct vnode *vp; if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); if ((p->p_p->ps_pledge & PLEDGE_RECVFD) == 0) return pledge_fail(p, EPERM, PLEDGE_RECVFD); switch (fp->f_type) { case DTYPE_SOCKET: case DTYPE_PIPE: case DTYPE_DMABUF: case DTYPE_SYNC: return (0); case DTYPE_VNODE: vp = fp->f_data; if (vp->v_type != VDIR) return (0); } return pledge_fail(p, EINVAL, PLEDGE_RECVFD); } /* * Only allow sending of safe file descriptors. 
*/
/*
 * Requires the "sendfd" promise (EPERM otherwise).  Sockets, pipes,
 * dmabuf and sync files and non-directory vnodes may be sent; any other
 * file fails with EINVAL.
 */
int
pledge_sendfd(struct proc *p, struct file *fp)
{
	struct vnode *vp;

	if ((p->p_p->ps_flags & PS_PLEDGE) == 0)
		return (0);
	if ((p->p_p->ps_pledge & PLEDGE_SENDFD) == 0)
		return pledge_fail(p, EPERM, PLEDGE_SENDFD);

	switch (fp->f_type) {
	case DTYPE_SOCKET:
	case DTYPE_PIPE:
	case DTYPE_DMABUF:
	case DTYPE_SYNC:
		return (0);
	case DTYPE_VNODE:
		vp = fp->f_data;

		if (vp->v_type != VDIR)
			return (0);
		break;
	}
	return pledge_fail(p, EINVAL, PLEDGE_SENDFD);
}

/*
 * Decide whether a pledged process may perform the sysctl named by
 * mib[0..miblen-1].
 *
 * Any request that supplies a new value (new != NULL) is rejected via
 * pledge_fail() with EFAULT.  Read requests are matched against a fixed
 * list: first the entries that depend on a promise (route, wroute, ps,
 * vminfo, inet/unix, dns, disklabel), then entries allowed for every
 * pledged process.  A request matching nothing is logged with its full
 * mib and rejected via pledge_fail() with EINVAL.
 *
 * Returns 0 when the request is allowed.
 */
int
pledge_sysctl(struct proc *p, int miblen, int *mib, void *new)
{
	char	buf[80];
	uint64_t pledge;
	int	i;

	if ((p->p_p->ps_flags & PS_PLEDGE) == 0)
		return (0);
	/* Snapshot the promises once; every check below uses this copy. */
	pledge = READ_ONCE(p->p_p->ps_pledge);

	if (new)
		return pledge_fail(p, EFAULT, 0);

	/* routing table observation */
	if ((pledge & PLEDGE_ROUTE)) {
		if ((miblen == 6 || miblen == 7) &&
		    mib[0] == CTL_NET && mib[1] == PF_ROUTE &&
		    mib[2] == 0 &&
		    mib[4] == NET_RT_DUMP)
			return (0);

		if (miblen == 6 &&
		    mib[0] == CTL_NET && mib[1] == PF_ROUTE &&
		    mib[2] == 0 &&
		    (mib[3] == 0 || mib[3] == AF_INET6 || mib[3] == AF_INET) &&
		    (mib[4] == NET_RT_TABLE || mib[4] == NET_RT_SOURCE))
			return (0);

		if (miblen == 7 &&		/* exposes MACs */
		    mib[0] == CTL_NET && mib[1] == PF_ROUTE &&
		    mib[2] == 0 &&
		    (mib[3] == 0 || mib[3] == AF_INET6 || mib[3] == AF_INET) &&
		    mib[4] == NET_RT_FLAGS && mib[5] == RTF_LLINFO)
			return (0);
	}

	if ((pledge & PLEDGE_WROUTE)) {
		if (miblen == 4 &&
		    mib[0] == CTL_NET && mib[1] == PF_INET6 &&
		    mib[2] == IPPROTO_IPV6 && mib[3] == IPV6CTL_SOIIKEY)
			return (0);
	}

	/* Entries available to either "ps" or "vminfo". */
	if (pledge & (PLEDGE_PS | PLEDGE_VMINFO)) {
		if (miblen == 2 &&		/* kern.fscale */
		    mib[0] == CTL_KERN && mib[1] == KERN_FSCALE)
			return (0);
		if (miblen == 2 &&		/* kern.boottime */
		    mib[0] == CTL_KERN && mib[1] == KERN_BOOTTIME)
			return (0);
		if (miblen == 2 &&		/* kern.consdev */
		    mib[0] == CTL_KERN && mib[1] == KERN_CONSDEV)
			return (0);
		if (miblen == 2 &&			/* kern.cptime */
		    mib[0] == CTL_KERN && mib[1] == KERN_CPTIME)
			return (0);
		if (miblen == 3 &&			/* kern.cptime2 */
		    mib[0] == CTL_KERN && mib[1] == KERN_CPTIME2)
			return (0);
		if (miblen == 3 &&			/* kern.cpustats */
		    mib[0] == CTL_KERN && mib[1] == KERN_CPUSTATS)
			return (0);
	}

	if ((pledge & PLEDGE_PS)) {
		if (miblen == 4 &&		/* kern.procargs.* */
		    mib[0] == CTL_KERN && mib[1] == KERN_PROC_ARGS &&
		    (mib[3] == KERN_PROC_ARGV || mib[3] == KERN_PROC_ENV))
			return (0);
		if (miblen == 6 &&		/* kern.proc.* */
		    mib[0] == CTL_KERN && mib[1] == KERN_PROC)
			return (0);
		if (miblen == 3 &&		/* kern.proc_cwd.* */
		    mib[0] == CTL_KERN && mib[1] == KERN_PROC_CWD)
			return (0);
		if (miblen == 2 &&		/* kern.ccpu */
		    mib[0] == CTL_KERN && mib[1] == KERN_CCPU)
			return (0);
		if (miblen == 2 &&		/* vm.maxslp */
		    mib[0] == CTL_VM && mib[1] == VM_MAXSLP)
			return (0);
	}

	if ((pledge & PLEDGE_VMINFO)) {
		if (miblen == 2 &&		/* vm.uvmexp */
		    mib[0] == CTL_VM && mib[1] == VM_UVMEXP)
			return (0);
		if (miblen == 3 &&		/* vfs.generic.bcachestat */
		    mib[0] == CTL_VFS && mib[1] == VFS_GENERIC &&
		    mib[2] == VFS_BCACHESTAT)
			return (0);
		if (miblen == 3 &&		/* for sysconf(3) */
		    mib[0] == CTL_NET && mib[1] == PF_INET6)
			return (0);
	}

	if ((pledge & (PLEDGE_INET | PLEDGE_UNIX))) {
		if (miblen == 2 &&		/* kern.somaxconn */
		    mib[0] == CTL_KERN && mib[1] == KERN_SOMAXCONN)
			return (0);
	}

	if ((pledge & (PLEDGE_ROUTE | PLEDGE_INET | PLEDGE_DNS))) {
		if (miblen == 6 &&		/* getifaddrs() */
		    mib[0] == CTL_NET && mib[1] == PF_ROUTE &&
		    mib[2] == 0 &&
		    (mib[3] == 0 || mib[3] == AF_INET6 || mib[3] == AF_INET) &&
		    mib[4] == NET_RT_IFLIST)
			return (0);
	}

	if ((pledge & PLEDGE_DISKLABEL)) {
		if (miblen == 2 &&		/* kern.rawpartition */
		    mib[0] == CTL_KERN &&
		    mib[1] == KERN_RAWPARTITION)
			return (0);
		if (miblen == 2 &&		/* kern.maxpartitions */
		    mib[0] == CTL_KERN &&
		    mib[1] == KERN_MAXPARTITIONS)
			return (0);
#ifdef CPU_CHR2BLK
		if (miblen == 3 &&		/* machdep.chr2blk */
		    mib[0] == CTL_MACHDEP &&
		    mib[1] == CPU_CHR2BLK)
			return (0);
#endif /* CPU_CHR2BLK */
	}

	/* Everything from here on is allowed regardless of promises. */
	if (miblen >= 3 &&			/* ntpd(8) to read sensors */
	    mib[0] == CTL_HW && mib[1] == HW_SENSORS)
		return (0);

	if (miblen == 6 &&		/* if_nameindex() */
	    mib[0] == CTL_NET && mib[1] == PF_ROUTE &&
	    mib[2] == 0 && mib[3] == 0 && mib[4] == NET_RT_IFNAMES)
		return (0);

	if (miblen == 2) {
		switch (mib[0]) {
		case CTL_KERN:
			switch (mib[1]) {
			case KERN_DOMAINNAME:	/* getdomainname() */
			case KERN_HOSTNAME:	/* gethostname() */
			case KERN_OSTYPE:	/* uname() */
			case KERN_OSRELEASE:	/* uname() */
			case KERN_OSVERSION:	/* uname() */
			case KERN_VERSION:	/* uname() */
			case KERN_CLOCKRATE:	/* kern.clockrate */
			case KERN_ARGMAX:	/* kern.argmax */
			case KERN_NGROUPS:	/* kern.ngroups */
			case KERN_SYSVSHM:	/* kern.sysvshm */
			case KERN_POSIX1:	/* kern.posix1version */
				return (0);
			}
			break;
		case CTL_HW:
			switch (mib[1]) {
			case HW_MACHINE:	/* uname() */
			case HW_PAGESIZE:	/* getpagesize() */
			case HW_PHYSMEM64:	/* hw.physmem */
			case HW_NCPU:		/* hw.ncpu */
			case HW_NCPUONLINE:	/* hw.ncpuonline */
			case HW_USERMEM64:	/* hw.usermem */
				return (0);
			}
			break;
		case CTL_VM:
			switch (mib[1]) {
			case VM_PSSTRINGS:	/* setproctitle() */
			case VM_LOADAVG:	/* vm.loadavg / getloadavg(3) */
			case VM_MALLOC_CONF:	/* vm.malloc_conf */
				return (0);
			}
			break;
		default:
			break;
		}
	}

#ifdef CPU_SSE
	if (miblen == 2 &&		/* i386 libm tests for SSE */
	    mib[0] == CTL_MACHDEP && mib[1] == CPU_SSE)
		return (0);
#endif /* CPU_SSE */

#ifdef CPU_ID_AA64ISAR0
	if (miblen == 2 &&		/* arm64 libcrypto inspects CPU features */
	    mib[0] == CTL_MACHDEP && mib[1] == CPU_ID_AA64ISAR0)
		return (0);
#endif /* CPU_ID_AA64ISAR0 */

	/*
	 * Not allowed: log the offending mib.  snprintf() bounds every
	 * append by the space left in buf, so a long mib is truncated.
	 */
	snprintf(buf, sizeof(buf), "%s(%d): pledge sysctl %d:",
	    p->p_p->ps_comm, p->p_p->ps_pid, miblen);
	for (i = 0; i < miblen; i++) {
		/* NOTE: this local `p' shadows the parameter inside the loop. */
		char *p = buf + strlen(buf);
		snprintf(p, sizeof(buf) - (p - buf), " %d", mib[i]);
	}
	log(LOG_ERR, "%s\n", buf);

	return pledge_fail(p, EINVAL, 0);
}

/*
 * chown check for pledged processes.  With PLEDGE_CHOWNUID any change is
 * allowed.  Otherwise uid must be -1 or the caller's own cr_uid, and gid
 * must be -1 or a group for which groupmember() is true; anything else
 * returns EPERM.  pledge_fail() is not called here.
 */
int
pledge_chown(struct proc *p, uid_t uid, gid_t gid)
{
	if ((p->p_p->ps_flags & PS_PLEDGE) == 0)
		return (0);

	if (p->p_p->ps_pledge & PLEDGE_CHOWNUID)
		return (0);

	if (uid != -1 && uid != p->p_ucred->cr_uid)
		return (EPERM);
	if (gid != -1 && !groupmember(gid, p->p_ucred))
		return (EPERM);
	return (0);
}

int
pledge_adjtime(struct proc *p, const void *v)
{
	const struct timeval
*delta = v; if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); if ((p->p_p->ps_pledge & PLEDGE_SETTIME)) return (0); if (delta) return (EPERM); return (0); } int pledge_sendit(struct proc *p, const void *to) { if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); if ((p->p_p->ps_pledge & (PLEDGE_INET | PLEDGE_UNIX | PLEDGE_DNS))) return (0); /* may use address */ if (to == NULL) return (0); /* behaves just like write */ return pledge_fail(p, EPERM, PLEDGE_INET); } int pledge_ioctl(struct proc *p, long com, struct file *fp) { struct vnode *vp = NULL; int error = EPERM; uint64_t pledge; if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); pledge = READ_ONCE(p->p_p->ps_pledge); /* * The ioctl's which are always allowed. */ switch (com) { case FIONREAD: case FIONBIO: case FIOCLEX: case FIONCLEX: return (0); } /* fp != NULL was already checked */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_data; if (vp->v_type == VBAD) return (ENOTTY); } if ((pledge & PLEDGE_INET)) { switch (com) { case SIOCATMARK: case SIOCGIFGROUP: if (fp->f_type == DTYPE_SOCKET) return (0); break; } } #if NBPFILTER > 0 if ((pledge & PLEDGE_BPF)) { switch (com) { case BIOCGSTATS: /* bpf: tcpdump privsep on ^C */ if (fp->f_type == DTYPE_VNODE && fp->f_ops->fo_ioctl == vn_ioctl && vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_open == bpfopen) return (0); break; } } #endif /* NBPFILTER > 0 */ if ((pledge & PLEDGE_TAPE)) { switch (com) { case MTIOCGET: case MTIOCTOP: /* for pax(1) and such, checking tapes... 
*/ if (fp->f_type == DTYPE_VNODE && vp->v_type == VCHR) { if (vp->v_flag & VISTTY) return (ENOTTY); else return (0); } break; } } #if NDRM > 0 if ((pledge & PLEDGE_DRM)) { if ((fp->f_type == DTYPE_VNODE) && (vp->v_type == VCHR) && (cdevsw[major(vp->v_rdev)].d_open == drmopen)) { error = pledge_ioctl_drm(p, com, vp->v_rdev); if (error == 0) return 0; } } #endif /* NDRM > 0 */ #if NAUDIO > 0 if ((pledge & PLEDGE_AUDIO)) { switch (com) { case AUDIO_GETPOS: case AUDIO_GETPAR: case AUDIO_SETPAR: case AUDIO_START: case AUDIO_STOP: case AUDIO_MIXER_DEVINFO: case AUDIO_MIXER_READ: case AUDIO_MIXER_WRITE: if (fp->f_type == DTYPE_VNODE && vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_open == audioopen) return (0); } } #endif /* NAUDIO > 0 */ if ((pledge & PLEDGE_DISKLABEL)) { switch (com) { case DIOCGDINFO: case DIOCGPDINFO: case DIOCRLDINFO: case DIOCWDINFO: case BIOCDISK: case BIOCINQ: case BIOCINSTALLBOOT: case BIOCVOL: if (fp->f_type == DTYPE_VNODE && ((vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_type == D_DISK) || (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type == D_DISK))) return (0); break; case DIOCMAP: if (fp->f_type == DTYPE_VNODE && vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_ioctl == diskmapioctl) return (0); break; } } #if NVIDEO > 0 if ((pledge & PLEDGE_VIDEO)) { switch (com) { case VIDIOC_QUERYCAP: case VIDIOC_TRY_FMT: case VIDIOC_ENUM_FMT: case VIDIOC_S_FMT: case VIDIOC_QUERYCTRL: case VIDIOC_G_CTRL: case VIDIOC_S_CTRL: case VIDIOC_G_PARM: case VIDIOC_S_PARM: case VIDIOC_REQBUFS: case VIDIOC_QBUF: case VIDIOC_DQBUF: case VIDIOC_QUERYBUF: case VIDIOC_STREAMON: case VIDIOC_STREAMOFF: case VIDIOC_ENUM_FRAMESIZES: case VIDIOC_ENUM_FRAMEINTERVALS: case VIDIOC_DQEVENT: case VIDIOC_ENCODER_CMD: case VIDIOC_EXPBUF: case VIDIOC_G_CROP: case VIDIOC_G_EXT_CTRLS: case VIDIOC_G_FMT: case VIDIOC_G_SELECTION: case VIDIOC_QUERYMENU: case VIDIOC_SUBSCRIBE_EVENT: case VIDIOC_S_EXT_CTRLS: case VIDIOC_S_SELECTION: case VIDIOC_TRY_DECODER_CMD: case 
VIDIOC_TRY_ENCODER_CMD: if (fp->f_type == DTYPE_VNODE && vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_open == videoopen) return (0); break; } } #endif #if NPF > 0 if ((pledge & PLEDGE_PF)) { switch (com) { case DIOCADDRULE: case DIOCGETSTATUS: case DIOCNATLOOK: case DIOCRADDTABLES: case DIOCRCLRADDRS: case DIOCRCLRTABLES: case DIOCRCLRTSTATS: case DIOCRGETTSTATS: case DIOCRSETADDRS: case DIOCXBEGIN: case DIOCXCOMMIT: case DIOCKILLSRCNODES: if ((fp->f_type == DTYPE_VNODE) && (vp->v_type == VCHR) && (cdevsw[major(vp->v_rdev)].d_open == pfopen)) return (0); break; } } #endif if ((pledge & PLEDGE_TTY)) { switch (com) { #if NPTY > 0 case PTMGET: if ((pledge & PLEDGE_RPATH) == 0) break; if ((pledge & PLEDGE_WPATH) == 0) break; if (fp->f_type != DTYPE_VNODE || vp->v_type != VCHR) break; if (cdevsw[major(vp->v_rdev)].d_open != ptmopen) break; return (0); case TIOCUCNTL: /* vmd */ if ((pledge & PLEDGE_RPATH) == 0) break; if ((pledge & PLEDGE_WPATH) == 0) break; if (cdevsw[major(vp->v_rdev)].d_open != ptcopen) break; return (0); #endif /* NPTY > 0 */ case TIOCSPGRP: if ((pledge & PLEDGE_PROC) == 0) break; /* FALLTHROUGH */ case TIOCFLUSH: /* getty, telnet */ case TIOCSTART: /* emacs, etc */ case TIOCGPGRP: case TIOCGETA: case TIOCGWINSZ: /* ENOTTY return for non-tty */ case TIOCSTAT: /* csh */ if (fp->f_type == DTYPE_VNODE && (vp->v_flag & VISTTY)) return (0); return (ENOTTY); case TIOCSWINSZ: case TIOCEXT: /* mail, libedit .. */ case TIOCCBRK: /* cu */ case TIOCSBRK: /* cu */ case TIOCCDTR: /* cu */ case TIOCSDTR: /* cu */ case TIOCEXCL: /* cu */ case TIOCSETA: /* cu, ... */ case TIOCSETAW: /* cu, ... */ case TIOCSETAF: /* tcsetattr TCSAFLUSH, script */ case TIOCSCTTY: /* forkpty(3), login_tty(3), ... 
*/ if (fp->f_type == DTYPE_VNODE && (vp->v_flag & VISTTY)) return (0); break; } } if ((pledge & PLEDGE_ROUTE)) { switch (com) { case SIOCGIFADDR: case SIOCGIFAFLAG_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFDESCR: case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFGMEMB: case SIOCGIFRDOMAIN: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFXFLAGS: case SIOCGNBRINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGIFMEDIA: if (fp->f_type == DTYPE_SOCKET) return (0); break; } } if ((pledge & PLEDGE_WROUTE)) { switch (com) { case SIOCAIFADDR: case SIOCDIFADDR: case SIOCAIFADDR_IN6: case SIOCDIFADDR_IN6: if (fp->f_type == DTYPE_SOCKET) return (0); break; case SIOCSIFMTU: if (fp->f_type == DTYPE_SOCKET) return (0); break; } } #if NVMM > 0 if ((pledge & PLEDGE_VMM)) { if ((fp->f_type == DTYPE_VNODE) && (vp->v_type == VCHR) && (cdevsw[major(vp->v_rdev)].d_open == vmmopen)) { error = pledge_ioctl_vmm(p, com); if (error == 0) return 0; } } #endif return pledge_fail(p, error, PLEDGE_TTY); } int pledge_sockopt(struct proc *p, int set, int level, int optname) { uint64_t pledge; if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); pledge = READ_ONCE(p->p_p->ps_pledge); /* Always allow these, which are too common to reject */ switch (level) { case SOL_SOCKET: switch (optname) { case SO_RCVBUF: case SO_ERROR: return (0); } break; } if ((pledge & PLEDGE_WROUTE)) { switch (level) { case SOL_SOCKET: switch (optname) { case SO_RTABLE: return (0); } } } if ((pledge & (PLEDGE_INET|PLEDGE_UNIX|PLEDGE_DNS)) == 0) return pledge_fail(p, EPERM, PLEDGE_INET); /* In use by some service libraries */ switch (level) { case SOL_SOCKET: switch (optname) { case SO_TIMESTAMP: return (0); } break; } /* DNS resolver may do these requests */ if ((pledge & PLEDGE_DNS)) { switch (level) { case IPPROTO_IPV6: switch (optname) { case IPV6_RECVPKTINFO: case IPV6_USE_MIN_MTU: return (0); } } } if ((pledge & (PLEDGE_INET|PLEDGE_UNIX)) == 0) return pledge_fail(p, EPERM, PLEDGE_INET); switch (level) { case 
SOL_SOCKET: switch (optname) { case SO_RTABLE: return pledge_fail(p, EINVAL, PLEDGE_WROUTE); } return (0); } if ((pledge & PLEDGE_INET) == 0) return pledge_fail(p, EPERM, PLEDGE_INET); switch (level) { case IPPROTO_TCP: switch (optname) { case TCP_NODELAY: case TCP_MD5SIG: case TCP_SACK_ENABLE: case TCP_MAXSEG: case TCP_NOPUSH: case TCP_INFO: return (0); } break; case IPPROTO_IP: switch (optname) { case IP_OPTIONS: if (!set) return (0); break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_IPDEFTTL: case IP_PORTRANGE: case IP_RECVDSTADDR: case IP_RECVDSTPORT: return (0); case IP_MULTICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: if (pledge & PLEDGE_MCAST) return (0); break; } break; case IPPROTO_ICMP: break; case IPPROTO_IPV6: switch (optname) { case IPV6_TCLASS: case IPV6_UNICAST_HOPS: case IPV6_MINHOPCOUNT: case IPV6_RECVHOPLIMIT: case IPV6_PORTRANGE: case IPV6_RECVPKTINFO: case IPV6_RECVDSTPORT: case IPV6_V6ONLY: return (0); case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: if (pledge & PLEDGE_MCAST) return (0); break; } break; case IPPROTO_ICMPV6: break; } return pledge_fail(p, EPERM, PLEDGE_INET); } int pledge_socket(struct proc *p, int domain, unsigned int state) { uint64_t pledge; if (!ISSET(p->p_p->ps_flags, PS_PLEDGE)) return 0; pledge = READ_ONCE(p->p_p->ps_pledge); if (ISSET(state, SS_DNS)) { if (ISSET(pledge, PLEDGE_DNS)) return 0; return pledge_fail(p, EPERM, PLEDGE_DNS); } switch (domain) { case -1: /* accept on any domain */ return (0); case AF_INET: case AF_INET6: if (ISSET(pledge, PLEDGE_INET)) return 0; return pledge_fail(p, EPERM, PLEDGE_INET); case AF_UNIX: if (ISSET(pledge, PLEDGE_UNIX)) return 0; return pledge_fail(p, EPERM, PLEDGE_UNIX); } return pledge_fail(p, EINVAL, PLEDGE_INET); } int pledge_flock(struct proc *p) { if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); if ((p->p_p->ps_pledge & PLEDGE_FLOCK)) 
return (0); return (pledge_fail(p, EPERM, PLEDGE_FLOCK)); } int pledge_swapctl(struct proc *p, int cmd) { if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); if (p->p_p->ps_pledge & PLEDGE_VMINFO) { switch (cmd) { case SWAP_NSWAP: case SWAP_STATS: return (0); } } return pledge_fail(p, EPERM, PLEDGE_VMINFO); } /* bsearch over pledgereq. return flags value if found, 0 else */ uint64_t pledgereq_flags(const char *req_name) { int base = 0, cmp, i, lim; for (lim = nitems(pledgereq); lim != 0; lim >>= 1) { i = base + (lim >> 1); cmp = strcmp(req_name, pledgereq[i].name); if (cmp == 0) return (pledgereq[i].flags); if (cmp > 0) { /* not found before, move right */ base = i + 1; lim--; } /* else move left */ } return (0); } int pledge_fcntl(struct proc *p, int cmd) { if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return (0); if ((p->p_p->ps_pledge & PLEDGE_PROC) == 0 && cmd == F_SETOWN) return pledge_fail(p, EPERM, PLEDGE_PROC); return (0); } int pledge_kill(struct proc *p, pid_t pid) { if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return 0; if (p->p_p->ps_pledge & PLEDGE_PROC) return 0; if (pid == 0 || pid == p->p_p->ps_pid) return 0; return pledge_fail(p, EPERM, PLEDGE_PROC); } int pledge_protexec(struct proc *p, int prot) { if ((p->p_p->ps_flags & PS_PLEDGE) == 0) return 0; /* Before kbind(2) call, ld.so and crt may create EXEC mappings */ if (p->p_p->ps_kbind_addr == 0 && p->p_p->ps_kbind_cookie == 0) return 0; if (!(p->p_p->ps_pledge & PLEDGE_PROTEXEC) && (prot & PROT_EXEC)) return pledge_fail(p, EPERM, PLEDGE_PROTEXEC); return 0; } int canonpath(const char *input, char *buf, size_t bufsize) { const char *p; char *q; /* can't canon relative paths, don't bother */ if (input[0] != '/') { if (strlcpy(buf, input, bufsize) >= bufsize) return ENAMETOOLONG; return 0; } p = input; q = buf; while (*p && (q - buf < bufsize)) { if (p[0] == '/' && (p[1] == '/' || p[1] == '\0')) { p += 1; } else if (p[0] == '/' && p[1] == '.' 
&& (p[2] == '/' || p[2] == '\0')) { p += 2; } else if (p[0] == '/' && p[1] == '.' && p[2] == '.' && (p[3] == '/' || p[3] == '\0')) { p += 3; if (q != buf) /* "/../" at start of buf */ while (*--q != '/') continue; } else { *q++ = *p++; } } if ((*p == '\0') && (q - buf < bufsize)) { *q = 0; return 0; } else return ENAMETOOLONG; }
9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 /* $OpenBSD: vm_machdep.c,v 1.46 2022/08/07 23:56:06 guenther Exp $ */ /* $NetBSD: vm_machdep.c,v 1.1 2003/04/26 18:39:33 fvdl Exp $ */ /*- * Copyright (c) 1995 Charles M. Hannum. All rights reserved. * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 */ /* * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/user.h>

#include <uvm/uvm_extern.h>

#include <machine/cpu.h>
#include <machine/fpu.h>

void setguardpage(struct proc *);

/*
 * Finish a fork operation, with process p2 nearly set up.
 * Copy and update the kernel stack and pcb, making the child
 * ready to run, and marking it so that it can return differently
 * than the parent.
 *
 * p1		parent thread; must be curproc or proc0 (checked under
 *		DIAGNOSTIC)
 * p2		child thread being set up
 * stack	if non-NULL, becomes the child's user stack pointer
 * tcb		if non-NULL, becomes the child's pcb_fsbase
 * func, arg	stored in the child's switchframe (r12/r13) with
 *		proc_trampoline as the resume address; presumably the
 *		trampoline calls func(arg) - confirm in locore
 */
void
cpu_fork(struct proc *p1, struct proc *p2, void *stack, void *tcb,
    void (*func)(void *), void *arg)
{
	struct cpu_info *ci = curcpu();
	struct pcb *pcb = &p2->p_addr->u_pcb;
	struct pcb *pcb1 = &p1->p_addr->u_pcb;
	struct trapframe *tf;
	struct switchframe *sf;

	/* Save the fpu h/w state to p1's pcb so that we can copy it. */
	if (p1 != &proc0 && (ci->ci_pflags & CPUPF_USERXSTATE))
		fpusave(&pcb1->pcb_savefpu);

	p2->p_md.md_flags = p1->p_md.md_flags;

#ifdef DIAGNOSTIC
	if (p1 != curproc && p1 != &proc0)
		panic("cpu_fork: curproc");
#endif
	/* The child starts from a full copy of the parent's pcb. */
	*pcb = *pcb1;

	/*
	 * Activate the address space.
	 */
	pmap_activate(p2);

	/*
	 * Record where this process's kernel stack is: the top of p2's
	 * USPACE area, less 16 bytes, less a random offset that is kept
	 * below one page and stack-aligned by the masks.
	 */
	pcb->pcb_kstack = (u_int64_t)p2->p_addr + USPACE - 16 -
	    (arc4random() & PAGE_MASK & ~_STACKALIGNBYTES);

	/*
	 * Copy the trapframe.  It sits immediately below pcb_kstack.
	 */
	p2->p_md.md_regs = tf = (struct trapframe *)pcb->pcb_kstack - 1;
	*tf = *p1->p_md.md_regs;

	setguardpage(p2);

	/*
	 * If specified, give the child a different stack and/or TCB
	 */
	if (stack != NULL)
		tf->tf_rsp = (u_int64_t)stack;
	if (tcb != NULL)
		pcb->pcb_fsbase = (u_int64_t)tcb;

	/* The switchframe sits immediately below the trapframe. */
	sf = (struct switchframe *)tf - 1;
	sf->sf_r12 = (u_int64_t)func;
	sf->sf_r13 = (u_int64_t)arg;
	sf->sf_rip = (u_int64_t)proc_trampoline;
	pcb->pcb_rsp = (u_int64_t)sf;
	pcb->pcb_rbp = 0;
}

/*
 * cpu_exit is called as the last action during exit.
 *
 * We clean up a little and then call sched_exit() with the old proc as an
 * argument.
 */
void
cpu_exit(struct proc *p)
{
	pmap_deactivate(p);
	sched_exit(p);
}

/*
 * Set a red zone in the kernel stack after the u. area.
 * This removes the kernel mapping of the second page of p->p_addr.
 */
void
setguardpage(struct proc *p)
{
	pmap_remove(pmap_kernel(), (vaddr_t)p->p_addr + PAGE_SIZE,
	    (vaddr_t)p->p_addr + 2 * PAGE_SIZE);
	pmap_update(pmap_kernel());
}

/* km_alloc() VA mode used by vmapbuf()/vunmapbuf(): phys_map, may wait. */
struct kmem_va_mode kv_physwait = {
	.kv_map = &phys_map,
	.kv_wait = 1,
};

/*
 * Map a user I/O request into kernel virtual address space.
 * Note: the pages are already locked by uvm_vslock(), so we
 * do not need to pass an access_type to pmap_enter().
 *
 * The original user address is saved in bp->b_saveaddr and bp->b_data
 * is pointed at the new kernel mapping, keeping the offset within the
 * first page.  B_PHYS must be set on bp, else this panics.
 */
void
vmapbuf(struct buf *bp, vsize_t len)
{
	vaddr_t faddr, taddr, off;
	paddr_t fpa;

	if ((bp->b_flags & B_PHYS) == 0)
		panic("vmapbuf");
	faddr = trunc_page((vaddr_t)(bp->b_saveaddr = bp->b_data));
	off = (vaddr_t)bp->b_data - faddr;
	/* Round up so the mapping covers every page the request touches. */
	len = round_page(off + len);
	taddr = (vaddr_t)km_alloc(len, &kv_physwait, &kp_none, &kd_waitok);
	bp->b_data = (caddr_t)(taddr + off);
	/*
	 * The region is locked, so we expect that pmap_pte() will return
	 * non-NULL.
	 * XXX: unwise to expect this in a multithreaded environment.
	 * anything can happen to a pmap between the time we lock a
	 * region, release the pmap lock, and then relock it for
	 * the pmap_extract().
	 *
	 * no need to flush TLB since we expect nothing to be mapped
	 * where we just allocated (TLB will be flushed when our
	 * mapping is removed).
	 */
	while (len) {
		/* The result of pmap_extract() is deliberately not checked. */
		(void) pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map),
		    faddr, &fpa);
		pmap_kenter_pa(taddr, fpa, PROT_READ | PROT_WRITE);
		faddr += PAGE_SIZE;
		taddr += PAGE_SIZE;
		len -= PAGE_SIZE;
	}
	pmap_update(pmap_kernel());
}

/*
 * Unmap a previously-mapped user I/O request.
 * Removes the kernel mapping, frees the kernel VA and restores
 * bp->b_data from bp->b_saveaddr.  B_PHYS must be set on bp, else this
 * panics.
 */
void
vunmapbuf(struct buf *bp, vsize_t len)
{
	vaddr_t addr, off;

	if ((bp->b_flags & B_PHYS) == 0)
		panic("vunmapbuf");
	addr = trunc_page((vaddr_t)bp->b_data);
	off = (vaddr_t)bp->b_data - addr;
	len = round_page(off + len);
	pmap_kremove(addr, len);
	pmap_update(pmap_kernel());
	km_free((void *)addr, len, &kv_physwait, &kp_none);
	bp->b_data = bp->b_saveaddr;
	bp->b_saveaddr = NULL;
}

/* Return the thread control block pointer stored in p's pcb_fsbase. */
void *
tcb_get(struct proc *p)
{
	return ((void *)p->p_addr->u_pcb.pcb_fsbase);
}

/*
 * Set the thread control block pointer of the current thread; p must be
 * curproc.  reset_segs() is called before the new value is stored.
 */
void
tcb_set(struct proc *p, void *tcb)
{
	KASSERT(p == curproc);
	reset_segs();
	p->p_addr->u_pcb.pcb_fsbase = (u_int64_t)tcb;
}
2 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 /* * Created: Fri Jan 19 10:48:35 2001 by faith@acm.org * * Copyright 2001 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Author Rickard E. (Rik) Faith <faith@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include <sys/param.h> #include <sys/fcntl.h> #include <sys/specdev.h> #include <sys/vnode.h> #include <machine/bus.h> #ifdef __HAVE_ACPI #include <dev/acpi/acpidev.h> #include <dev/acpi/acpivar.h> #include <dev/acpi/dsdt.h> #endif #include <linux/debugfs.h> #include <linux/fs.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/slab.h> #include <linux/srcu.h> #include <drm/drm_cache.h> #include <drm/drm_client.h> #include <drm/drm_color_mgmt.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_managed.h> #include <drm/drm_mode_object.h> #include <drm/drm_print.h> #include <drm/drm_gem.h> #include "drm_crtc_internal.h" #include "drm_internal.h" #include "drm_legacy.h" MODULE_AUTHOR("Gareth Hughes, Leif Delgass, José Fonseca, Jon Smirl"); MODULE_DESCRIPTION("DRM shared core routines"); MODULE_LICENSE("GPL and additional rights"); static DEFINE_SPINLOCK(drm_minor_lock); static struct idr drm_minors_idr; /* * If the drm core fails to init for whatever reason, * we should prevent any drivers from registering with it. * It's best to check this at drm_dev_init(), as some drivers * prefer to embed struct drm_device into their own device * structure and call drm_dev_init() themselves. */ static bool drm_core_init_complete; static struct dentry *drm_debugfs_root; #ifdef notyet DEFINE_STATIC_SRCU(drm_unplug_srcu); #endif /* * Some functions are only called once on init regardless of how many times * drm attaches. 
In linux this is handled via module_init()/module_exit() */ int drm_refcnt; struct drm_softc { struct device sc_dev; struct drm_device *sc_drm; int sc_allocated; }; struct drm_attach_args { struct drm_device *drm; const struct drm_driver *driver; char *busid; bus_dma_tag_t dmat; bus_space_tag_t bst; size_t busid_len; int is_agp; struct pci_attach_args *pa; int primary; }; void drm_linux_init(void); void drm_linux_exit(void); int drm_linux_acpi_notify(struct aml_node *, int, void *); int drm_dequeue_event(struct drm_device *, struct drm_file *, size_t, struct drm_pending_event **); int drmprint(void *, const char *); int drmsubmatch(struct device *, void *, void *); const struct pci_device_id * drm_find_description(int, int, const struct pci_device_id *); int drm_file_cmp(struct drm_file *, struct drm_file *); SPLAY_PROTOTYPE(drm_file_tree, drm_file, link, drm_file_cmp); #define DRMDEVCF_PRIMARY 0 #define drmdevcf_primary cf_loc[DRMDEVCF_PRIMARY] /* spec'd as primary? */ #define DRMDEVCF_PRIMARY_UNK -1 /* * DRM Minors * A DRM device can provide several char-dev interfaces on the DRM-Major. Each * of them is represented by a drm_minor object. Depending on the capabilities * of the device-driver, different interfaces are registered. * * Minors can be accessed via dev->$minor_name. This pointer is either * NULL or a valid drm_minor pointer and stays valid as long as the device is * valid. This means, DRM minors have the same life-time as the underlying * device. However, this doesn't mean that the minor is active. Minors are * registered and unregistered dynamically according to device-state. 
*/ static struct drm_minor **drm_minor_get_slot(struct drm_device *dev, unsigned int type) { switch (type) { case DRM_MINOR_PRIMARY: return &dev->primary; case DRM_MINOR_RENDER: return &dev->render; default: BUG(); } } static void drm_minor_alloc_release(struct drm_device *dev, void *data) { struct drm_minor *minor = data; unsigned long flags; WARN_ON(dev != minor->dev); #ifdef __linux__ put_device(minor->kdev); #endif spin_lock_irqsave(&drm_minor_lock, flags); idr_remove(&drm_minors_idr, minor->index); spin_unlock_irqrestore(&drm_minor_lock, flags); } static int drm_minor_alloc(struct drm_device *dev, unsigned int type) { struct drm_minor *minor; unsigned long flags; int r; minor = drmm_kzalloc(dev, sizeof(*minor), GFP_KERNEL); if (!minor) return -ENOMEM; minor->type = type; minor->dev = dev; idr_preload(GFP_KERNEL); spin_lock_irqsave(&drm_minor_lock, flags); r = idr_alloc(&drm_minors_idr, NULL, 64 * type, 64 * (type + 1), GFP_NOWAIT); spin_unlock_irqrestore(&drm_minor_lock, flags); idr_preload_end(); if (r < 0) return r; minor->index = r; r = drmm_add_action_or_reset(dev, drm_minor_alloc_release, minor); if (r) return r; #ifdef __linux__ minor->kdev = drm_sysfs_minor_alloc(minor); if (IS_ERR(minor->kdev)) return PTR_ERR(minor->kdev); #endif *drm_minor_get_slot(dev, type) = minor; return 0; } static int drm_minor_register(struct drm_device *dev, unsigned int type) { struct drm_minor *minor; unsigned long flags; #ifdef __linux__ int ret; #endif DRM_DEBUG("\n"); minor = *drm_minor_get_slot(dev, type); if (!minor) return 0; #ifdef __linux__ ret = drm_debugfs_init(minor, minor->index, drm_debugfs_root); if (ret) { DRM_ERROR("DRM: Failed to initialize /sys/kernel/debug/dri.\n"); goto err_debugfs; } ret = device_add(minor->kdev); if (ret) goto err_debugfs; #else drm_debugfs_root = NULL; #endif /* replace NULL with @minor so lookups will succeed from now on */ spin_lock_irqsave(&drm_minor_lock, flags); idr_replace(&drm_minors_idr, minor, minor->index); 
spin_unlock_irqrestore(&drm_minor_lock, flags); DRM_DEBUG("new minor registered %d\n", minor->index); return 0; #ifdef __linux__ err_debugfs: drm_debugfs_cleanup(minor); return ret; #endif } static void drm_minor_unregister(struct drm_device *dev, unsigned int type) { struct drm_minor *minor; unsigned long flags; minor = *drm_minor_get_slot(dev, type); #ifdef __linux__ if (!minor || !device_is_registered(minor->kdev)) #else if (!minor) #endif return; /* replace @minor with NULL so lookups will fail from now on */ spin_lock_irqsave(&drm_minor_lock, flags); idr_replace(&drm_minors_idr, NULL, minor->index); spin_unlock_irqrestore(&drm_minor_lock, flags); #ifdef __linux__ device_del(minor->kdev); #endif dev_set_drvdata(minor->kdev, NULL); /* safety belt */ drm_debugfs_cleanup(minor); } /* * Looks up the given minor-ID and returns the respective DRM-minor object. The * refence-count of the underlying device is increased so you must release this * object with drm_minor_release(). * * As long as you hold this minor, it is guaranteed that the object and the * minor->dev pointer will stay valid! However, the device may get unplugged and * unregistered while you hold the minor. */ struct drm_minor *drm_minor_acquire(unsigned int minor_id) { struct drm_minor *minor; unsigned long flags; spin_lock_irqsave(&drm_minor_lock, flags); minor = idr_find(&drm_minors_idr, minor_id); if (minor) drm_dev_get(minor->dev); spin_unlock_irqrestore(&drm_minor_lock, flags); if (!minor) { return ERR_PTR(-ENODEV); } else if (drm_dev_is_unplugged(minor->dev)) { drm_dev_put(minor->dev); return ERR_PTR(-ENODEV); } return minor; } void drm_minor_release(struct drm_minor *minor) { drm_dev_put(minor->dev); } /** * DOC: driver instance overview * * A device instance for a drm driver is represented by &struct drm_device. This * is allocated and initialized with devm_drm_dev_alloc(), usually from * bus-specific ->probe() callbacks implemented by the driver. 
The driver then
 * needs to initialize all the various subsystems for the drm device like memory
 * management, vblank handling, modesetting support and initial output
 * configuration plus obviously initialize all the corresponding hardware bits.
 * Finally when everything is up and running and ready for userspace the device
 * instance can be published using drm_dev_register().
 *
 * There is also deprecated support for initializing device instances using
 * bus-specific helpers and the &drm_driver.load callback. But due to
 * backwards-compatibility needs the device instance has to be published too
 * early, which requires unpretty global locking to make safe and is therefore
 * only supported for existing drivers not yet converted to the new scheme.
 *
 * When cleaning up a device instance everything needs to be done in reverse:
 * First unpublish the device instance with drm_dev_unregister(). Then clean up
 * any other resources allocated at device initialization and drop the driver's
 * reference to &drm_device using drm_dev_put().
 *
 * Note that any allocation or resource which is visible to userspace must be
 * released only when the final drm_dev_put() is called, and not when the
 * driver is unbound from the underlying physical struct &device. Best to use
 * &drm_device managed resources with drmm_add_action(), drmm_kmalloc() and
 * related functions.
 *
 * devres managed resources like devm_kmalloc() can only be used for resources
 * directly related to the underlying hardware device, and only used in code
 * paths fully protected by drm_dev_enter() and drm_dev_exit().
 *
 * Display driver example
 * ~~~~~~~~~~~~~~~~~~~~~~
 *
 * The following example shows a typical structure of a DRM display driver.
 * The example focuses on the probe() function and the other functions that are
 * almost always present, and serves as a demonstration of devm_drm_dev_alloc().
 *
 * ..
code-block:: c * * struct driver_device { * struct drm_device drm; * void *userspace_facing; * struct clk *pclk; * }; * * static const struct drm_driver driver_drm_driver = { * [...] * }; * * static int driver_probe(struct platform_device *pdev) * { * struct driver_device *priv; * struct drm_device *drm; * int ret; * * priv = devm_drm_dev_alloc(&pdev->dev, &driver_drm_driver, * struct driver_device, drm); * if (IS_ERR(priv)) * return PTR_ERR(priv); * drm = &priv->drm; * * ret = drmm_mode_config_init(drm); * if (ret) * return ret; * * priv->userspace_facing = drmm_kzalloc(..., GFP_KERNEL); * if (!priv->userspace_facing) * return -ENOMEM; * * priv->pclk = devm_clk_get(dev, "PCLK"); * if (IS_ERR(priv->pclk)) * return PTR_ERR(priv->pclk); * * // Further setup, display pipeline etc * * platform_set_drvdata(pdev, drm); * * drm_mode_config_reset(drm); * * ret = drm_dev_register(drm); * if (ret) * return ret; * * drm_fbdev_generic_setup(drm, 32); * * return 0; * } * * // This function is called before the devm_ resources are released * static int driver_remove(struct platform_device *pdev) * { * struct drm_device *drm = platform_get_drvdata(pdev); * * drm_dev_unregister(drm); * drm_atomic_helper_shutdown(drm) * * return 0; * } * * // This function is called on kernel restart and shutdown * static void driver_shutdown(struct platform_device *pdev) * { * drm_atomic_helper_shutdown(platform_get_drvdata(pdev)); * } * * static int __maybe_unused driver_pm_suspend(struct device *dev) * { * return drm_mode_config_helper_suspend(dev_get_drvdata(dev)); * } * * static int __maybe_unused driver_pm_resume(struct device *dev) * { * drm_mode_config_helper_resume(dev_get_drvdata(dev)); * * return 0; * } * * static const struct dev_pm_ops driver_pm_ops = { * SET_SYSTEM_SLEEP_PM_OPS(driver_pm_suspend, driver_pm_resume) * }; * * static struct platform_driver driver_driver = { * .driver = { * [...] 
* .pm = &driver_pm_ops, * }, * .probe = driver_probe, * .remove = driver_remove, * .shutdown = driver_shutdown, * }; * module_platform_driver(driver_driver); * * Drivers that want to support device unplugging (USB, DT overlay unload) should * use drm_dev_unplug() instead of drm_dev_unregister(). The driver must protect * regions that is accessing device resources to prevent use after they're * released. This is done using drm_dev_enter() and drm_dev_exit(). There is one * shortcoming however, drm_dev_unplug() marks the drm_device as unplugged before * drm_atomic_helper_shutdown() is called. This means that if the disable code * paths are protected, they will not run on regular driver module unload, * possibly leaving the hardware enabled. */ /** * drm_put_dev - Unregister and release a DRM device * @dev: DRM device * * Called at module unload time or when a PCI device is unplugged. * * Cleans up all DRM device, calling drm_lastclose(). * * Note: Use of this function is deprecated. It will eventually go away * completely. Please use drm_dev_unregister() and drm_dev_put() explicitly * instead to make sure that the device isn't userspace accessible any more * while teardown is in progress, ensuring that userspace can't access an * inconsistent state. */ void drm_put_dev(struct drm_device *dev) { DRM_DEBUG("\n"); if (!dev) { DRM_ERROR("cleanup called no dev\n"); return; } drm_dev_unregister(dev); drm_dev_put(dev); } EXPORT_SYMBOL(drm_put_dev); /** * drm_dev_enter - Enter device critical section * @dev: DRM device * @idx: Pointer to index that will be passed to the matching drm_dev_exit() * * This function marks and protects the beginning of a section that should not * be entered after the device has been unplugged. The section end is marked * with drm_dev_exit(). Calls to this function can be nested. * * Returns: * True if it is OK to enter the section, false otherwise. 
*/ bool drm_dev_enter(struct drm_device *dev, int *idx) { #ifdef notyet *idx = srcu_read_lock(&drm_unplug_srcu); if (dev->unplugged) { srcu_read_unlock(&drm_unplug_srcu, *idx); return false; } #endif return true; } EXPORT_SYMBOL(drm_dev_enter); /** * drm_dev_exit - Exit device critical section * @idx: index returned from drm_dev_enter() * * This function marks the end of a section that should not be entered after * the device has been unplugged. */ void drm_dev_exit(int idx) { #ifdef notyet srcu_read_unlock(&drm_unplug_srcu, idx); #endif } EXPORT_SYMBOL(drm_dev_exit); /** * drm_dev_unplug - unplug a DRM device * @dev: DRM device * * This unplugs a hotpluggable DRM device, which makes it inaccessible to * userspace operations. Entry-points can use drm_dev_enter() and * drm_dev_exit() to protect device resources in a race free manner. This * essentially unregisters the device like drm_dev_unregister(), but can be * called while there are still open users of @dev. */ void drm_dev_unplug(struct drm_device *dev) { STUB(); #ifdef notyet /* * After synchronizing any critical read section is guaranteed to see * the new value of ->unplugged, and any critical section which might * still have seen the old value of ->unplugged is guaranteed to have * finished. */ dev->unplugged = true; synchronize_srcu(&drm_unplug_srcu); drm_dev_unregister(dev); /* Clear all CPU mappings pointing to this device */ unmap_mapping_range(dev->anon_inode->i_mapping, 0, 0, 1); #endif } EXPORT_SYMBOL(drm_dev_unplug); #ifdef __linux__ /* * DRM internal mount * We want to be able to allocate our own "struct address_space" to control * memory-mappings in VRAM (or stolen RAM, ...). However, core MM does not allow * stand-alone address_space objects, so we need an underlying inode. As there * is no way to allocate an independent inode easily, we need a fake internal * VFS mount-point. * * The drm_fs_inode_new() function allocates a new inode, drm_fs_inode_free() * frees it again. 
You are allowed to use iget() and iput() to get references to * the inode. But each drm_fs_inode_new() call must be paired with exactly one * drm_fs_inode_free() call (which does not have to be the last iput()). * We use drm_fs_inode_*() to manage our internal VFS mount-point and share it * between multiple inode-users. You could, technically, call * iget() + drm_fs_inode_free() directly after alloc and sometime later do an * iput(), but this way you'd end up with a new vfsmount for each inode. */ static int drm_fs_cnt; static struct vfsmount *drm_fs_mnt; static int drm_fs_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, 0x010203ff) ? 0 : -ENOMEM; } static struct file_system_type drm_fs_type = { .name = "drm", .owner = THIS_MODULE, .init_fs_context = drm_fs_init_fs_context, .kill_sb = kill_anon_super, }; static struct inode *drm_fs_inode_new(void) { struct inode *inode; int r; r = simple_pin_fs(&drm_fs_type, &drm_fs_mnt, &drm_fs_cnt); if (r < 0) { DRM_ERROR("Cannot mount pseudo fs: %d\n", r); return ERR_PTR(r); } inode = alloc_anon_inode(drm_fs_mnt->mnt_sb); if (IS_ERR(inode)) simple_release_fs(&drm_fs_mnt, &drm_fs_cnt); return inode; } static void drm_fs_inode_free(struct inode *inode) { if (inode) { iput(inode); simple_release_fs(&drm_fs_mnt, &drm_fs_cnt); } } #endif /* __linux__ */ /** * DOC: component helper usage recommendations * * DRM drivers that drive hardware where a logical device consists of a pile of * independent hardware blocks are recommended to use the :ref:`component helper * library<component>`. For consistency and better options for code reuse the * following guidelines apply: * * - The entire device initialization procedure should be run from the * &component_master_ops.master_bind callback, starting with * devm_drm_dev_alloc(), then binding all components with * component_bind_all() and finishing with drm_dev_register(). 
* * - The opaque pointer passed to all components through component_bind_all() * should point at &struct drm_device of the device instance, not some driver * specific private structure. * * - The component helper fills the niche where further standardization of * interfaces is not practical. When there already is, or will be, a * standardized interface like &drm_bridge or &drm_panel, providing its own * functions to find such components at driver load time, like * drm_of_find_panel_or_bridge(), then the component helper should not be * used. */ static void drm_dev_init_release(struct drm_device *dev, void *res) { drm_legacy_ctxbitmap_cleanup(dev); drm_legacy_remove_map_hash(dev); #ifdef __linux__ drm_fs_inode_free(dev->anon_inode); put_device(dev->dev); #endif /* Prevent use-after-free in drm_managed_release when debugging is * enabled. Slightly awkward, but can't really be helped. */ dev->dev = NULL; mutex_destroy(&dev->master_mutex); mutex_destroy(&dev->clientlist_mutex); mutex_destroy(&dev->filelist_mutex); mutex_destroy(&dev->struct_mutex); drm_legacy_destroy_members(dev); } #ifdef notyet static int drm_dev_init(struct drm_device *dev, const struct drm_driver *driver, struct device *parent) { struct inode *inode; int ret; if (!drm_core_init_complete) { DRM_ERROR("DRM core is not initialized\n"); return -ENODEV; } if (WARN_ON(!parent)) return -EINVAL; kref_init(&dev->ref); dev->dev = get_device(parent); dev->driver = driver; INIT_LIST_HEAD(&dev->managed.resources); spin_lock_init(&dev->managed.lock); /* no per-device feature limits by default */ dev->driver_features = ~0u; drm_legacy_init_members(dev); INIT_LIST_HEAD(&dev->filelist); INIT_LIST_HEAD(&dev->filelist_internal); INIT_LIST_HEAD(&dev->clientlist); INIT_LIST_HEAD(&dev->vblank_event_list); spin_lock_init(&dev->event_lock); mutex_init(&dev->struct_mutex); mutex_init(&dev->filelist_mutex); mutex_init(&dev->clientlist_mutex); mutex_init(&dev->master_mutex); ret = drmm_add_action(dev, drm_dev_init_release, 
NULL); if (ret) return ret; inode = drm_fs_inode_new(); if (IS_ERR(inode)) { ret = PTR_ERR(inode); DRM_ERROR("Cannot allocate anonymous inode: %d\n", ret); goto err; } dev->anon_inode = inode; if (drm_core_check_feature(dev, DRIVER_RENDER)) { ret = drm_minor_alloc(dev, DRM_MINOR_RENDER); if (ret) goto err; } ret = drm_minor_alloc(dev, DRM_MINOR_PRIMARY); if (ret) goto err; ret = drm_legacy_create_map_hash(dev); if (ret) goto err; drm_legacy_ctxbitmap_init(dev); if (drm_core_check_feature(dev, DRIVER_GEM)) { ret = drm_gem_init(dev); if (ret) { DRM_ERROR("Cannot initialize graphics execution manager (GEM)\n"); goto err; } } ret = drm_dev_set_unique(dev, dev_name(parent)); if (ret) goto err; return 0; err: drm_managed_release(dev); return ret; } static void devm_drm_dev_init_release(void *data) { drm_dev_put(data); } static int devm_drm_dev_init(struct device *parent, struct drm_device *dev, const struct drm_driver *driver) { int ret; ret = drm_dev_init(dev, driver, parent); if (ret) return ret; return devm_add_action_or_reset(parent, devm_drm_dev_init_release, dev); } void *__devm_drm_dev_alloc(struct device *parent, const struct drm_driver *driver, size_t size, size_t offset) { void *container; struct drm_device *drm; int ret; container = kzalloc(size, GFP_KERNEL); if (!container) return ERR_PTR(-ENOMEM); drm = container + offset; ret = devm_drm_dev_init(parent, drm, driver); if (ret) { kfree(container); return ERR_PTR(ret); } drmm_add_final_kfree(drm, container); return container; } EXPORT_SYMBOL(__devm_drm_dev_alloc); /** * drm_dev_alloc - Allocate new DRM device * @driver: DRM driver to allocate device for * @parent: Parent device object * * This is the deprecated version of devm_drm_dev_alloc(), which does not support * subclassing through embedding the struct &drm_device in a driver private * structure, and which does not support automatic cleanup through devres. * * RETURNS: * Pointer to new DRM device, or ERR_PTR on failure. 
*/ struct drm_device *drm_dev_alloc(const struct drm_driver *driver, struct device *parent) { struct drm_device *dev; int ret; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); ret = drm_dev_init(dev, driver, parent); if (ret) { kfree(dev); return ERR_PTR(ret); } drmm_add_final_kfree(dev, dev); return dev; } EXPORT_SYMBOL(drm_dev_alloc); #endif static void drm_dev_release(struct kref *ref) { struct drm_device *dev = container_of(ref, struct drm_device, ref); if (dev->driver->release) dev->driver->release(dev); drm_managed_release(dev); kfree(dev->managed.final_kfree); } /** * drm_dev_get - Take reference of a DRM device * @dev: device to take reference of or NULL * * This increases the ref-count of @dev by one. You *must* already own a * reference when calling this. Use drm_dev_put() to drop this reference * again. * * This function never fails. However, this function does not provide *any* * guarantee whether the device is alive or running. It only provides a * reference to the object and the memory associated with it. */ void drm_dev_get(struct drm_device *dev) { if (dev) kref_get(&dev->ref); } EXPORT_SYMBOL(drm_dev_get); /** * drm_dev_put - Drop reference of a DRM device * @dev: device to drop reference of or NULL * * This decreases the ref-count of @dev by one. The device is destroyed if the * ref-count drops to zero. */ void drm_dev_put(struct drm_device *dev) { if (dev) kref_put(&dev->ref, drm_dev_release); } EXPORT_SYMBOL(drm_dev_put); static int create_compat_control_link(struct drm_device *dev) { struct drm_minor *minor; char *name; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return 0; minor = *drm_minor_get_slot(dev, DRM_MINOR_PRIMARY); if (!minor) return 0; /* * Some existing userspace out there uses the existing of the controlD* * sysfs files to figure out whether it's a modeset driver. It only does * readdir, hence a symlink is sufficient (and the least confusing * option). 
Otherwise controlD* is entirely unused. * * Old controlD chardev have been allocated in the range * 64-127. */ name = kasprintf(GFP_KERNEL, "controlD%d", minor->index + 64); if (!name) return -ENOMEM; ret = sysfs_create_link(minor->kdev->kobj.parent, &minor->kdev->kobj, name); kfree(name); return ret; } static void remove_compat_control_link(struct drm_device *dev) { struct drm_minor *minor; char *name; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return; minor = *drm_minor_get_slot(dev, DRM_MINOR_PRIMARY); if (!minor) return; name = kasprintf(GFP_KERNEL, "controlD%d", minor->index + 64); if (!name) return; sysfs_remove_link(minor->kdev->kobj.parent, name); kfree(name); } /** * drm_dev_register - Register DRM device * @dev: Device to register * @flags: Flags passed to the driver's .load() function * * Register the DRM device @dev with the system, advertise device to user-space * and start normal device operation. @dev must be initialized via drm_dev_init() * previously. * * Never call this twice on any device! * * NOTE: To ensure backward compatibility with existing drivers method this * function calls the &drm_driver.load method after registering the device * nodes, creating race conditions. Usage of the &drm_driver.load methods is * therefore deprecated, drivers must perform all initialization before calling * drm_dev_register(). * * RETURNS: * 0 on success, negative error code on failure. 
*/
int drm_dev_register(struct drm_device *dev, unsigned long flags)
{
	const struct drm_driver *driver = dev->driver;
	int ret;

	/* Only validate the mode config up front when there is no .load hook. */
	if (!driver->load)
		drm_mode_config_validate(dev);

	/* Warn if no final allocation has been recorded for this device. */
	WARN_ON(!dev->managed.final_kfree);

	if (drm_dev_needs_global_mutex(dev))
		mutex_lock(&drm_global_mutex);

	/* Render minor first, then primary; err_minors unwinds both. */
	ret = drm_minor_register(dev, DRM_MINOR_RENDER);
	if (ret)
		goto err_minors;

	ret = drm_minor_register(dev, DRM_MINOR_PRIMARY);
	if (ret)
		goto err_minors;

	ret = create_compat_control_link(dev);
	if (ret)
		goto err_minors;

	dev->registered = true;

	/*
	 * The deprecated .load hook runs after the minors are registered.
	 * NOTE(review): dev->registered is not reset to false when .load
	 * fails and control jumps to err_minors -- confirm this is intended.
	 */
	if (dev->driver->load) {
		ret = dev->driver->load(dev, flags);
		if (ret)
			goto err_minors;
	}

	if (drm_core_check_feature(dev, DRIVER_MODESET))
		drm_modeset_register_all(dev);

	DRM_INFO("Initialized %s %d.%d.%d %s for %s on minor %d\n",
		 driver->name, driver->major, driver->minor,
		 driver->patchlevel, driver->date,
		 dev->dev ? dev_name(dev->dev) : "virtual device",
		 dev->primary->index);

	/* Success: skip the unwind and only drop the global mutex. */
	goto out_unlock;

err_minors:
	/* Undo in reverse order of registration. */
	remove_compat_control_link(dev);
	drm_minor_unregister(dev, DRM_MINOR_PRIMARY);
	drm_minor_unregister(dev, DRM_MINOR_RENDER);
out_unlock:
	if (drm_dev_needs_global_mutex(dev))
		mutex_unlock(&drm_global_mutex);
	return ret;
}
EXPORT_SYMBOL(drm_dev_register);

/**
 * drm_dev_unregister - Unregister DRM device
 * @dev: Device to unregister
 *
 * Unregister the DRM device from the system. This does the reverse of
 * drm_dev_register() but does not deallocate the device. The caller must call
 * drm_dev_put() to drop their final reference.
 *
 * A special form of unregistering for hotpluggable devices is drm_dev_unplug(),
 * which can be called while there are still open users of @dev.
 *
 * This should be called first in the device teardown code to make sure
 * userspace can't access the device instance any more.
*/ void drm_dev_unregister(struct drm_device *dev) { if (drm_core_check_feature(dev, DRIVER_LEGACY)) drm_lastclose(dev); dev->registered = false; drm_client_dev_unregister(dev); if (drm_core_check_feature(dev, DRIVER_MODESET)) drm_modeset_unregister_all(dev); if (dev->driver->unload) dev->driver->unload(dev); drm_legacy_pci_agp_destroy(dev); drm_legacy_rmmaps(dev); remove_compat_control_link(dev); drm_minor_unregister(dev, DRM_MINOR_PRIMARY); drm_minor_unregister(dev, DRM_MINOR_RENDER); } EXPORT_SYMBOL(drm_dev_unregister); /** * drm_dev_set_unique - Set the unique name of a DRM device * @dev: device of which to set the unique name * @name: unique name * * Sets the unique name of a DRM device using the specified string. This is * already done by drm_dev_init(), drivers should only override the default * unique name for backwards compatibility reasons. * * Return: 0 on success or a negative error code on failure. */ int drm_dev_set_unique(struct drm_device *dev, const char *name) { drmm_kfree(dev, dev->unique); dev->unique = drmm_kstrdup(dev, name, GFP_KERNEL); return dev->unique ? 0 : -ENOMEM; } EXPORT_SYMBOL(drm_dev_set_unique); /* * DRM Core * The DRM core module initializes all global DRM objects and makes them * available to drivers. Once setup, drivers can probe their respective * devices. * Currently, core management includes: * - The "DRM-Global" key/value database * - Global ID management for connectors * - DRM major number allocation * - DRM minor management * - DRM sysfs class * - DRM debugfs root * * Furthermore, the DRM core provides dynamic char-dev lookups. For each * interface registered on a DRM device, you can request minor numbers from DRM * core. DRM core takes care of major-number management and char-dev * registration. A stub ->open() callback forwards any open() requests to the * registered minor. 
*/ #ifdef __linux__ static int drm_stub_open(struct inode *inode, struct file *filp) { const struct file_operations *new_fops; struct drm_minor *minor; int err; DRM_DEBUG("\n"); minor = drm_minor_acquire(iminor(inode)); if (IS_ERR(minor)) return PTR_ERR(minor); new_fops = fops_get(minor->dev->driver->fops); if (!new_fops) { err = -ENODEV; goto out; } replace_fops(filp, new_fops); if (filp->f_op->open) err = filp->f_op->open(inode, filp); else err = 0; out: drm_minor_release(minor); return err; } static const struct file_operations drm_stub_fops = { .owner = THIS_MODULE, .open = drm_stub_open, .llseek = noop_llseek, }; #endif /* __linux__ */ static void drm_core_exit(void) { #ifdef __linux__ unregister_chrdev(DRM_MAJOR, "drm"); debugfs_remove(drm_debugfs_root); drm_sysfs_destroy(); #endif idr_destroy(&drm_minors_idr); drm_connector_ida_destroy(); } static int __init drm_core_init(void) { #ifdef __linux__ int ret; #endif drm_connector_ida_init(); idr_init(&drm_minors_idr); drm_memcpy_init_early(); #ifdef __linux__ ret = drm_sysfs_init(); if (ret < 0) { DRM_ERROR("Cannot create DRM class: %d\n", ret); goto error; } drm_debugfs_root = debugfs_create_dir("dri", NULL); ret = register_chrdev(DRM_MAJOR, "drm", &drm_stub_fops); if (ret < 0) goto error; #endif drm_core_init_complete = true; DRM_DEBUG("Initialized\n"); return 0; #ifdef __linux__ error: drm_core_exit(); return ret; #endif } #ifdef __linux__ module_init(drm_core_init); module_exit(drm_core_exit); #endif void drm_attach_platform(struct drm_driver *driver, bus_space_tag_t iot, bus_dma_tag_t dmat, struct device *dev, struct drm_device *drm) { struct drm_attach_args arg; memset(&arg, 0, sizeof(arg)); arg.driver = driver; arg.bst = iot; arg.dmat = dmat; arg.drm = drm; arg.busid = dev->dv_xname; arg.busid_len = strlen(dev->dv_xname) + 1; config_found_sm(dev, &arg, drmprint, drmsubmatch); } struct drm_device * drm_attach_pci(const struct drm_driver *driver, struct pci_attach_args *pa, int is_agp, int primary, struct 
device *dev, struct drm_device *drm) { struct drm_attach_args arg; struct drm_softc *sc; arg.drm = drm; arg.driver = driver; arg.dmat = pa->pa_dmat; arg.bst = pa->pa_memt; arg.is_agp = is_agp; arg.primary = primary; arg.pa = pa; arg.busid_len = 20; arg.busid = malloc(arg.busid_len + 1, M_DRM, M_NOWAIT); if (arg.busid == NULL) { printf("%s: no memory for drm\n", dev->dv_xname); return (NULL); } snprintf(arg.busid, arg.busid_len, "pci:%04x:%02x:%02x.%1x", pa->pa_domain, pa->pa_bus, pa->pa_device, pa->pa_function); sc = (struct drm_softc *)config_found_sm(dev, &arg, drmprint, drmsubmatch); if (sc == NULL) return NULL; return sc->sc_drm; } int drmprint(void *aux, const char *pnp) { if (pnp != NULL) printf("drm at %s", pnp); return (UNCONF); } int drmsubmatch(struct device *parent, void *match, void *aux) { extern struct cfdriver drm_cd; struct cfdata *cf = match; /* only allow drm to attach */ if (cf->cf_driver == &drm_cd) return ((*cf->cf_attach->ca_match)(parent, match, aux)); return (0); } int drm_pciprobe(struct pci_attach_args *pa, const struct pci_device_id *idlist) { const struct pci_device_id *id_entry; id_entry = drm_find_description(PCI_VENDOR(pa->pa_id), PCI_PRODUCT(pa->pa_id), idlist); if (id_entry != NULL) return 1; return 0; } int drm_probe(struct device *parent, void *match, void *aux) { struct cfdata *cf = match; struct drm_attach_args *da = aux; if (cf->drmdevcf_primary != DRMDEVCF_PRIMARY_UNK) { /* * If primary-ness of device specified, either match * exactly (at high priority), or fail. */ if (cf->drmdevcf_primary != 0 && da->primary != 0) return (10); else return (0); } /* If primary-ness unspecified, it wins. 
*/ return (1); } void drm_attach(struct device *parent, struct device *self, void *aux) { struct drm_softc *sc = (struct drm_softc *)self; struct drm_attach_args *da = aux; struct drm_device *dev = da->drm; int ret; if (drm_refcnt == 0) { drm_linux_init(); drm_core_init(); } drm_refcnt++; if (dev == NULL) { dev = malloc(sizeof(struct drm_device), M_DRM, M_WAITOK | M_ZERO); sc->sc_allocated = 1; } sc->sc_drm = dev; dev->dev = self; dev->dev_private = parent; dev->driver = da->driver; INIT_LIST_HEAD(&dev->managed.resources); mtx_init(&dev->managed.lock, IPL_TTY); /* no per-device feature limits by default */ dev->driver_features = ~0u; dev->dmat = da->dmat; dev->bst = da->bst; dev->unique = da->busid; if (da->pa) { struct pci_attach_args *pa = da->pa; pcireg_t subsys; subsys = pci_conf_read(pa->pa_pc, pa->pa_tag, PCI_SUBSYS_ID_REG); dev->pdev = &dev->_pdev; dev->pdev->vendor = PCI_VENDOR(pa->pa_id); dev->pdev->device = PCI_PRODUCT(pa->pa_id); dev->pdev->subsystem_vendor = PCI_VENDOR(subsys); dev->pdev->subsystem_device = PCI_PRODUCT(subsys); dev->pdev->revision = PCI_REVISION(pa->pa_class); dev->pdev->class = (PCI_CLASS(pa->pa_class) << 16) | (PCI_SUBCLASS(pa->pa_class) << 8) | PCI_INTERFACE(pa->pa_class); dev->pdev->devfn = PCI_DEVFN(pa->pa_device, pa->pa_function); dev->pdev->bus = &dev->pdev->_bus; dev->pdev->bus->pc = pa->pa_pc; dev->pdev->bus->number = pa->pa_bus; dev->pdev->bus->domain_nr = pa->pa_domain; dev->pdev->bus->bridgetag = pa->pa_bridgetag; if (pa->pa_bridgetag != NULL) { dev->pdev->bus->self = malloc(sizeof(struct pci_dev), M_DRM, M_WAITOK | M_ZERO); dev->pdev->bus->self->pc = pa->pa_pc; dev->pdev->bus->self->tag = *pa->pa_bridgetag; } dev->pdev->pc = pa->pa_pc; dev->pdev->tag = pa->pa_tag; dev->pdev->pci = (struct pci_softc *)parent->dv_parent; #ifdef CONFIG_ACPI dev->pdev->dev.node = acpi_find_pci(pa->pa_pc, pa->pa_tag); aml_register_notify(dev->pdev->dev.node, NULL, drm_linux_acpi_notify, NULL, ACPIDEV_NOPOLL); #endif } mtx_init(&dev->quiesce_mtx, 
IPL_NONE); mtx_init(&dev->event_lock, IPL_TTY); rw_init(&dev->struct_mutex, "drmdevlk"); rw_init(&dev->filelist_mutex, "drmflist"); rw_init(&dev->clientlist_mutex, "drmclist"); rw_init(&dev->master_mutex, "drmmast"); ret = drmm_add_action(dev, drm_dev_init_release, NULL); if (ret) goto error; SPLAY_INIT(&dev->files); INIT_LIST_HEAD(&dev->filelist_internal); INIT_LIST_HEAD(&dev->clientlist); INIT_LIST_HEAD(&dev->vblank_event_list); if (drm_core_check_feature(dev, DRIVER_RENDER)) { ret = drm_minor_alloc(dev, DRM_MINOR_RENDER); if (ret) goto error; } ret = drm_minor_alloc(dev, DRM_MINOR_PRIMARY); if (ret) goto error; #ifdef CONFIG_DRM_LEGACY if (drm_core_check_feature(dev, DRIVER_USE_AGP)) { #if IS_ENABLED(CONFIG_AGP) if (da->is_agp) dev->agp = drm_agp_init(); #endif if (dev->agp != NULL) { if (drm_mtrr_add(dev->agp->info.ai_aperture_base, dev->agp->info.ai_aperture_size, DRM_MTRR_WC) == 0) dev->agp->mtrr = 1; } } #endif if (dev->driver->gem_size > 0) { KASSERT(dev->driver->gem_size >= sizeof(struct drm_gem_object)); /* XXX unique name */ pool_init(&dev->objpl, dev->driver->gem_size, 0, IPL_NONE, 0, "drmobjpl", NULL); } if (drm_core_check_feature(dev, DRIVER_GEM)) { ret = drm_gem_init(dev); if (ret) { DRM_ERROR("Cannot initialize graphics execution manager (GEM)\n"); goto error; } } drmm_add_final_kfree(dev, dev); printf("\n"); return; error: drm_managed_release(dev); dev->dev_private = NULL; } int drm_detach(struct device *self, int flags) { struct drm_softc *sc = (struct drm_softc *)self; struct drm_device *dev = sc->sc_drm; drm_refcnt--; if (drm_refcnt == 0) { drm_core_exit(); drm_linux_exit(); } drm_lastclose(dev); if (drm_core_check_feature(dev, DRIVER_GEM)) { if (dev->driver->gem_size > 0) pool_destroy(&dev->objpl); } #ifdef CONFIG_DRM_LEGACY if (dev->agp && dev->agp->mtrr) { int retcode; retcode = drm_mtrr_del(0, dev->agp->info.ai_aperture_base, dev->agp->info.ai_aperture_size, DRM_MTRR_WC); DRM_DEBUG("mtrr_del = %d", retcode); } free(dev->agp, M_DRM, 0); 
#endif if (dev->pdev && dev->pdev->bus) free(dev->pdev->bus->self, M_DRM, sizeof(struct pci_dev)); if (sc->sc_allocated) free(dev, M_DRM, sizeof(struct drm_device)); return 0; } void drm_quiesce(struct drm_device *dev) { mtx_enter(&dev->quiesce_mtx); dev->quiesce = 1; while (dev->quiesce_count > 0) { msleep_nsec(&dev->quiesce_count, &dev->quiesce_mtx, PZERO, "drmqui", INFSLP); } mtx_leave(&dev->quiesce_mtx); } void drm_wakeup(struct drm_device *dev) { mtx_enter(&dev->quiesce_mtx); dev->quiesce = 0; wakeup(&dev->quiesce); mtx_leave(&dev->quiesce_mtx); } int drm_activate(struct device *self, int act) { struct drm_softc *sc = (struct drm_softc *)self; struct drm_device *dev = sc->sc_drm; switch (act) { case DVACT_QUIESCE: drm_quiesce(dev); break; case DVACT_WAKEUP: drm_wakeup(dev); break; } return (0); } const struct cfattach drm_ca = { sizeof(struct drm_softc), drm_probe, drm_attach, drm_detach, drm_activate }; struct cfdriver drm_cd = { 0, "drm", DV_DULL }; const struct pci_device_id * drm_find_description(int vendor, int device, const struct pci_device_id *idlist) { int i = 0; for (i = 0; idlist[i].vendor != 0; i++) { if ((idlist[i].vendor == vendor) && (idlist[i].device == device) && (idlist[i].subvendor == PCI_ANY_ID) && (idlist[i].subdevice == PCI_ANY_ID)) return &idlist[i]; } return NULL; } int drm_file_cmp(struct drm_file *f1, struct drm_file *f2) { return (f1->fminor < f2->fminor ? 
-1 : f1->fminor > f2->fminor); } SPLAY_GENERATE(drm_file_tree, drm_file, link, drm_file_cmp); struct drm_file * drm_find_file_by_minor(struct drm_device *dev, int minor) { struct drm_file key; key.fminor = minor; return (SPLAY_FIND(drm_file_tree, &dev->files, &key)); } struct drm_device * drm_get_device_from_kdev(dev_t kdev) { int unit = minor(kdev) & ((1 << CLONE_SHIFT) - 1); /* control */ if (unit >= 64 && unit < 128) unit -= 64; /* render */ if (unit >= 128) unit -= 128; struct drm_softc *sc; if (unit < drm_cd.cd_ndevs) { sc = (struct drm_softc *)drm_cd.cd_devs[unit]; if (sc) return sc->sc_drm; } return NULL; } void filt_drmdetach(struct knote *kn) { struct drm_device *dev = kn->kn_hook; int s; s = spltty(); klist_remove_locked(&dev->note, kn); splx(s); } int filt_drmkms(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } void filt_drmreaddetach(struct knote *kn) { struct drm_file *file_priv = kn->kn_hook; int s; s = spltty(); klist_remove_locked(&file_priv->rsel.si_note, kn); splx(s); } int filt_drmread(struct knote *kn, long hint) { struct drm_file *file_priv = kn->kn_hook; int val = 0; if ((hint & NOTE_SUBMIT) == 0) mtx_enter(&file_priv->minor->dev->event_lock); val = !list_empty(&file_priv->event_list); if ((hint & NOTE_SUBMIT) == 0) mtx_leave(&file_priv->minor->dev->event_lock); return (val); } const struct filterops drm_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_drmdetach, .f_event = filt_drmkms, }; const struct filterops drmread_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_drmreaddetach, .f_event = filt_drmread, }; int drmkqfilter(dev_t kdev, struct knote *kn) { struct drm_device *dev = NULL; struct drm_file *file_priv = NULL; int s; dev = drm_get_device_from_kdev(kdev); if (dev == NULL || dev->dev_private == NULL) return (ENXIO); switch (kn->kn_filter) { case EVFILT_READ: mutex_lock(&dev->struct_mutex); file_priv = 
drm_find_file_by_minor(dev, minor(kdev)); mutex_unlock(&dev->struct_mutex); if (file_priv == NULL) return (ENXIO); kn->kn_fop = &drmread_filtops; kn->kn_hook = file_priv; s = spltty(); klist_insert_locked(&file_priv->rsel.si_note, kn); splx(s); break; case EVFILT_DEVICE: kn->kn_fop = &drm_filtops; kn->kn_hook = dev; s = spltty(); klist_insert_locked(&dev->note, kn); splx(s); break; default: return (EINVAL); } return (0); } int drmopen(dev_t kdev, int flags, int fmt, struct proc *p) { struct drm_device *dev = NULL; struct drm_file *file_priv; struct drm_minor *dm; int ret = 0; int dminor, realminor, minor_type; int need_setup = 0; dev = drm_get_device_from_kdev(kdev); if (dev == NULL || dev->dev_private == NULL) return (ENXIO); DRM_DEBUG("open_count = %d\n", atomic_read(&dev->open_count)); if (flags & O_EXCL) return (EBUSY); /* No exclusive opens */ if (drm_dev_needs_global_mutex(dev)) mutex_lock(&drm_global_mutex); if (!atomic_fetch_inc(&dev->open_count)) need_setup = 1; dminor = minor(kdev); realminor = dminor & ((1 << CLONE_SHIFT) - 1); if (realminor < 64) minor_type = DRM_MINOR_PRIMARY; else if (realminor >= 64 && realminor < 128) minor_type = DRM_MINOR_CONTROL; else minor_type = DRM_MINOR_RENDER; dm = *drm_minor_get_slot(dev, minor_type); dm->index = minor(kdev); file_priv = drm_file_alloc(dm); if (IS_ERR(file_priv)) { ret = ENOMEM; goto err; } /* first opener automatically becomes master */ if (drm_is_primary_client(file_priv)) { ret = drm_master_open(file_priv); if (ret != 0) goto out_file_free; } file_priv->filp = (void *)file_priv; file_priv->fminor = minor(kdev); mutex_lock(&dev->filelist_mutex); SPLAY_INSERT(drm_file_tree, &dev->files, file_priv); mutex_unlock(&dev->filelist_mutex); if (need_setup) { ret = drm_legacy_setup(dev); if (ret) goto out_file_free; } if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); return 0; out_file_free: drm_file_free(file_priv); err: atomic_dec(&dev->open_count); if (drm_dev_needs_global_mutex(dev)) 
mutex_unlock(&drm_global_mutex); return (ret); } int drmclose(dev_t kdev, int flags, int fmt, struct proc *p) { struct drm_device *dev = drm_get_device_from_kdev(kdev); struct drm_file *file_priv; int retcode = 0; if (dev == NULL) return (ENXIO); if (drm_dev_needs_global_mutex(dev)) mutex_lock(&drm_global_mutex); DRM_DEBUG("open_count = %d\n", atomic_read(&dev->open_count)); mutex_lock(&dev->filelist_mutex); file_priv = drm_find_file_by_minor(dev, minor(kdev)); if (file_priv == NULL) { DRM_ERROR("can't find authenticator\n"); retcode = EINVAL; mutex_unlock(&dev->filelist_mutex); goto done; } SPLAY_REMOVE(drm_file_tree, &dev->files, file_priv); mutex_unlock(&dev->filelist_mutex); drm_file_free(file_priv); done: if (atomic_dec_and_test(&dev->open_count)) drm_lastclose(dev); if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); return (retcode); } int drmread(dev_t kdev, struct uio *uio, int ioflag) { struct drm_device *dev = drm_get_device_from_kdev(kdev); struct drm_file *file_priv; struct drm_pending_event *ev; int error = 0; if (dev == NULL) return (ENXIO); mutex_lock(&dev->filelist_mutex); file_priv = drm_find_file_by_minor(dev, minor(kdev)); mutex_unlock(&dev->filelist_mutex); if (file_priv == NULL) return (ENXIO); /* * The semantics are a little weird here. We will wait until we * have events to process, but as soon as we have events we will * only deliver as many as we have. * Note that events are atomic, if the read buffer will not fit in * a whole event, we won't read any of it out. 
*/ mtx_enter(&dev->event_lock); while (error == 0 && list_empty(&file_priv->event_list)) { if (ioflag & IO_NDELAY) { mtx_leave(&dev->event_lock); return (EAGAIN); } error = msleep_nsec(&file_priv->event_wait, &dev->event_lock, PWAIT | PCATCH, "drmread", INFSLP); } if (error) { mtx_leave(&dev->event_lock); return (error); } while (drm_dequeue_event(dev, file_priv, uio->uio_resid, &ev)) { MUTEX_ASSERT_UNLOCKED(&dev->event_lock); /* XXX we always destroy the event on error. */ error = uiomove(ev->event, ev->event->length, uio); kfree(ev); if (error) break; mtx_enter(&dev->event_lock); } MUTEX_ASSERT_UNLOCKED(&dev->event_lock); return (error); } /* * Deqeue an event from the file priv in question. returning 1 if an * event was found. We take the resid from the read as a parameter because * we will only dequeue and event if the read buffer has space to fit the * entire thing. * * We are called locked, but we will *unlock* the queue on return so that * we may sleep to copyout the event. */ int drm_dequeue_event(struct drm_device *dev, struct drm_file *file_priv, size_t resid, struct drm_pending_event **out) { struct drm_pending_event *e = NULL; int gotone = 0; MUTEX_ASSERT_LOCKED(&dev->event_lock); *out = NULL; if (list_empty(&file_priv->event_list)) goto out; e = list_first_entry(&file_priv->event_list, struct drm_pending_event, link); if (e->event->length > resid) goto out; file_priv->event_space += e->event->length; list_del(&e->link); *out = e; gotone = 1; out: mtx_leave(&dev->event_lock); return (gotone); } paddr_t drmmmap(dev_t kdev, off_t offset, int prot) { return -1; } struct drm_dmamem * drm_dmamem_alloc(bus_dma_tag_t dmat, bus_size_t size, bus_size_t alignment, int nsegments, bus_size_t maxsegsz, int mapflags, int loadflags) { struct drm_dmamem *mem; size_t strsize; /* * segs is the last member of the struct since we modify the size * to allow extra segments if more than one are allowed. 
*/ strsize = sizeof(*mem) + (sizeof(bus_dma_segment_t) * (nsegments - 1)); mem = malloc(strsize, M_DRM, M_NOWAIT | M_ZERO); if (mem == NULL) return (NULL); mem->size = size; if (bus_dmamap_create(dmat, size, nsegments, maxsegsz, 0, BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, &mem->map) != 0) goto strfree; if (bus_dmamem_alloc(dmat, size, alignment, 0, mem->segs, nsegments, &mem->nsegs, BUS_DMA_NOWAIT | BUS_DMA_ZERO) != 0) goto destroy; if (bus_dmamem_map(dmat, mem->segs, mem->nsegs, size, &mem->kva, BUS_DMA_NOWAIT | mapflags) != 0) goto free; if (bus_dmamap_load(dmat, mem->map, mem->kva, size, NULL, BUS_DMA_NOWAIT | loadflags) != 0) goto unmap; return (mem); unmap: bus_dmamem_unmap(dmat, mem->kva, size); free: bus_dmamem_free(dmat, mem->segs, mem->nsegs); destroy: bus_dmamap_destroy(dmat, mem->map); strfree: free(mem, M_DRM, 0); return (NULL); } void drm_dmamem_free(bus_dma_tag_t dmat, struct drm_dmamem *mem) { if (mem == NULL) return; bus_dmamap_unload(dmat, mem->map); bus_dmamem_unmap(dmat, mem->kva, mem->size); bus_dmamem_free(dmat, mem->segs, mem->nsegs); bus_dmamap_destroy(dmat, mem->map); free(mem, M_DRM, 0); } struct drm_dma_handle * drm_pci_alloc(struct drm_device *dev, size_t size, size_t align) { struct drm_dma_handle *dmah; dmah = malloc(sizeof(*dmah), M_DRM, M_WAITOK); dmah->mem = drm_dmamem_alloc(dev->dmat, size, align, 1, size, BUS_DMA_NOCACHE, 0); if (dmah->mem == NULL) { free(dmah, M_DRM, sizeof(*dmah)); return NULL; } dmah->busaddr = dmah->mem->segs[0].ds_addr; dmah->size = dmah->mem->size; dmah->vaddr = dmah->mem->kva; return (dmah); } void drm_pci_free(struct drm_device *dev, struct drm_dma_handle *dmah) { if (dmah == NULL) return; drm_dmamem_free(dev->dmat, dmah->mem); free(dmah, M_DRM, sizeof(*dmah)); } /* * Compute order. Can be made faster. 
*/ int drm_order(unsigned long size) { int order; unsigned long tmp; for (order = 0, tmp = size; tmp >>= 1; ++order) ; if (size & ~(1 << order)) ++order; return order; } int drm_getpciinfo(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_pciinfo *info = data; if (dev->pdev == NULL) return -ENOTTY; info->domain = dev->pdev->bus->domain_nr; info->bus = dev->pdev->bus->number; info->dev = PCI_SLOT(dev->pdev->devfn); info->func = PCI_FUNC(dev->pdev->devfn); info->vendor_id = dev->pdev->vendor; info->device_id = dev->pdev->device; info->subvendor_id = dev->pdev->subsystem_vendor; info->subdevice_id = dev->pdev->subsystem_device; info->revision_id = 0; return 0; }
9015 9021 8928 206 9006 8974 8978 8896 201 1315 226 1147 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 /* $OpenBSD: syscall_mi.h,v 1.26 2022/06/29 12:06:11 jca Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_xxx.c	8.2 (Berkeley) 11/14/93
 */

#include <sys/param.h>
#include <sys/pledge.h>
#include <sys/tracepoint.h>
#include <uvm/uvm_extern.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include "dt.h"
#if NDT > 0
#include <dev/dt/dtvar.h>
#endif

/*
 * The MD setup for a system call has been done; here's the MI part.
 *
 * Runs the tracing hooks, checks that the stack pointer and program
 * counter lie in permitted mappings, applies the pledge check, and then
 * calls the handler in callp->sy_call.  The kernel lock is held around the
 * handler unless the sysent entry carries SY_NOLOCK.
 *
 * Returns EPERM when either mapping check fails, the pledge_fail() result
 * when the pledge check fails, otherwise the handler's return value.
 */
static inline int
mi_syscall(struct proc *p, register_t code, const struct sysent *callp,
    register_t *argp, register_t retval[2])
{
	uint64_t tval;
	/* Non-zero when the handler must run under the kernel lock. */
	int lock = !(callp->sy_flags & SY_NOLOCK);
	int error, pledged;

	/* refresh the thread's cache of the process's creds */
	refreshcreds(p);

#ifdef SYSCALL_DEBUG
	KERNEL_LOCK();
	scdebug_call(p, code, argp);
	KERNEL_UNLOCK();
#endif
	TRACEPOINT(raw_syscalls, sys_enter, code, NULL);
#if NDT > 0
	DT_ENTER(syscall, code, callp->sy_argsize, argp);
#endif
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL)) {
		KERNEL_LOCK();
		ktrsyscall(p, code, callp->sy_argsize, argp);
		KERNEL_UNLOCK();
	}
#endif

	/* SP must be within MAP_STACK space */
	if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
	    "[%s]%d/%d sp=%lx inside %lx-%lx: not MAP_STACK\n",
	    uvm_map_inentry_sp, p->p_vmspace->vm_map.sserial))
		return (EPERM);

	/* PC must be in un-writeable permitted text (sigtramp, libc, ld.so) */
	if (!uvm_map_inentry(p, &p->p_pcinentry, PROC_PC(p),
	    "[%s]%d/%d pc=%lx inside %lx-%lx: bogus syscall\n",
	    uvm_map_inentry_pc, p->p_vmspace->vm_map.wserial))
		return (EPERM);

	/* Only pledged processes go through pledge_syscall(). */
	pledged = (p->p_p->ps_flags & PS_PLEDGE);
	if (pledged && (error = pledge_syscall(p, code, &tval))) {
		KERNEL_LOCK();
		error = pledge_fail(p, error, tval);
		KERNEL_UNLOCK();
		return (error);
	}
	if (lock)
		KERNEL_LOCK();
	error = (*callp->sy_call)(p, argp, retval);
	if (lock)
		KERNEL_UNLOCK();

	return (error);
}

/*
 * Finish MI stuff on return, after the registers have been set
 *
 * Runs the exit-side tracing hooks and userret(); the ktrace record is
 * written after userret().
 */
static inline void
mi_syscall_return(struct proc *p, register_t code, int error,
    const register_t retval[2])
{
#ifdef SYSCALL_DEBUG
	KERNEL_LOCK();
	scdebug_ret(p, code, error, retval);
	KERNEL_UNLOCK();
#endif
#if NDT > 0
	DT_LEAVE(syscall, code, error, retval[0], retval[1]);
#endif
	TRACEPOINT(raw_syscalls, sys_exit, code, NULL);

	userret(p);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		KERNEL_LOCK();
		ktrsysret(p, code, error, retval);
		KERNEL_UNLOCK();
	}
#endif
}

/*
 * Finish MI stuff for a new process/thread to return
 *
 * The syscall code reported to the tracing hooks is __tfork for threads,
 * vfork when PS_PPWAIT is set on the process, and fork otherwise; the
 * reported return values are { 0, 1 }.
 */
static inline void
mi_child_return(struct proc *p)
{
#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0
	int code = (p->p_flag & P_THREAD) ? SYS___tfork :
	    (p->p_p->ps_flags & PS_PPWAIT) ? SYS_vfork : SYS_fork;
	const register_t child_retval[2] = { 0, 1 };
#endif

	TRACEPOINT(sched, on__cpu, NULL);

#ifdef SYSCALL_DEBUG
	KERNEL_LOCK();
	scdebug_ret(p, code, 0, child_retval);
	KERNEL_UNLOCK();
#endif
#if NDT > 0
	DT_LEAVE(syscall, code, 0, child_retval[0], child_retval[1]);
#endif
	TRACEPOINT(raw_syscalls, sys_exit, code, NULL);

	userret(p);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		KERNEL_LOCK();
		ktrsysret(p, code, 0, child_retval);
		KERNEL_UNLOCK();
	}
#endif
}

/*
 * Do the specific processing necessary for an AST
 *
 * Handles an owed profiling update (P_OWEUPC) under the kernel lock and
 * calls preempt() when resched is non-zero.
 */
static inline void
mi_ast(struct proc *p, int resched)
{
	if (p->p_flag & P_OWEUPC) {
		KERNEL_LOCK();
		ADDUPROF(p);
		KERNEL_UNLOCK();
	}
	if (resched)
		preempt();

	/*
	 * XXX could move call to userret() here, but
	 * hppa calls ast() in syscall return and sh calls
	 * it after userret()
	 */
}
36 36 35 36 58 58 58 118 3 115 56 60 115 115 115 115 58 220 46 178 4 159 74 74 28 60 9 15 15 36 36 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 /* $OpenBSD: uvm_km.c,v 1.151 2022/08/01 14:15:46 mpi Exp $ */ /* $NetBSD: uvm_km.c,v 1.42 2001/01/14 02:10:01 thorpej Exp $ */ 
/* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * from: Id: uvm_km.c,v 1.1.2.14 1998/02/06 05:19:27 chs Exp * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * uvm_km.c: handle kernel memory allocation and management */ /* * overview of kernel memory management: * * the kernel virtual address space is mapped by "kernel_map." kernel_map * starts at a machine-dependent address and is VM_KERNEL_SPACE_SIZE bytes * large. * * the kernel_map has several "submaps." submaps can only appear in * the kernel_map (user processes can't use them). submaps "take over" * the management of a sub-range of the kernel's address space. submaps * are typically allocated at boot time and are never released. kernel * virtual address space that is mapped by a submap is locked by the * submap's lock -- not the kernel_map's lock. * * thus, the useful feature of submaps is that they allow us to break * up the locking and protection of the kernel address space into smaller * chunks. * * The VM system has several standard kernel submaps: * kmem_map: Contains only wired kernel memory for malloc(9). * Note: All access to this map must be protected by splvm as * calls to malloc(9) are allowed in interrupt handlers. 
* exec_map: Memory to hold arguments to system calls is allocated from * this map. * XXX: This is primarily used to artificially limit the number * of concurrent processes doing an exec. * phys_map: Buffers for vmapbuf (physio) are allocated from this map. * * the kernel allocates its private memory out of special uvm_objects whose * reference count is set to UVM_OBJ_KERN (thus indicating that the objects * are "special" and never die). all kernel objects should be thought of * as large, fixed-sized, sparsely populated uvm_objects. each kernel * object is equal to the size of kernel virtual address space (i.e. * VM_KERNEL_SPACE_SIZE). * * most kernel private memory lives in kernel_object. the only exception * to this is for memory that belongs to submaps that must be protected * by splvm(). each of these submaps manages their own pages. * * note that just because a kernel object spans the entire kernel virtual * address space doesn't mean that it has to be mapped into the entire space. * large chunks of a kernel object's space go unused either because * that area of kernel VM is unmapped, or there is some other type of * object mapped into that range (e.g. a vnode). for submap's kernel * objects, the only part of the object that can ever be populated is the * offsets that are managed by the submap. * * note that the "offset" in a kernel object is always the kernel virtual * address minus the vm_map_min(kernel_map). * example: * suppose kernel_map starts at 0xf8000000 and the kernel does a * uvm_km_alloc(kernel_map, PAGE_SIZE) [allocate 1 wired down page in the * kernel map]. if uvm_km_alloc returns virtual address 0xf8235000, * then that means that the page at offset 0x235000 in kernel_object is * mapped at 0xf8235000. * * kernel objects have one other special property: when the kernel virtual * memory mapping them is unmapped, the backing memory in the object is * freed right away. this is done with the uvm_km_pgremove() function. 
* this has to be done because there is no backing store for kernel pages * and no need to save them after they are no longer referenced. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/kthread.h> #include <uvm/uvm.h> /* * global data structures */ struct vm_map *kernel_map = NULL; /* Unconstraint range. */ struct uvm_constraint_range no_constraint = { 0x0, (paddr_t)-1 }; /* * local data structures */ static struct vm_map kernel_map_store; /* * uvm_km_init: init kernel maps and objects to reflect reality (i.e. * KVM already allocated for text, data, bss, and static data structures). * * => KVM is defined by [base.. base + VM_KERNEL_SPACE_SIZE]. * we assume that [base -> start] has already been allocated and that * "end" is the end of the kernel image span. */ void uvm_km_init(vaddr_t base, vaddr_t start, vaddr_t end) { /* kernel_object: for pageable anonymous kernel memory */ uao_init(); uvm.kernel_object = uao_create(VM_KERNEL_SPACE_SIZE, UAO_FLAG_KERNOBJ); /* * init the map and reserve already allocated kernel space * before installing. */ uvm_map_setup(&kernel_map_store, pmap_kernel(), base, end, #ifdef KVA_GUARDPAGES VM_MAP_PAGEABLE | VM_MAP_GUARDPAGES #else VM_MAP_PAGEABLE #endif ); if (base != start && uvm_map(&kernel_map_store, &base, start - base, NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE, MAP_INHERIT_NONE, MADV_RANDOM, UVM_FLAG_FIXED)) != 0) panic("uvm_km_init: could not reserve space for kernel"); kernel_map = &kernel_map_store; } /* * uvm_km_suballoc: allocate a submap in the kernel map. once a submap * is allocated all references to that area of VM must go through it. this * allows the locking of VAs in kernel_map to be broken up into regions. 
* * => if `fixed' is true, *min specifies where the region described * by the submap must start * => if submap is non NULL we use that as the submap, otherwise we * alloc a new map */ struct vm_map * uvm_km_suballoc(struct vm_map *map, vaddr_t *min, vaddr_t *max, vsize_t size, int flags, boolean_t fixed, struct vm_map *submap) { int mapflags = UVM_FLAG_NOMERGE | (fixed ? UVM_FLAG_FIXED : 0); size = round_page(size); /* round up to pagesize */ /* first allocate a blank spot in the parent map */ if (uvm_map(map, min, size, NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE, MAP_INHERIT_NONE, MADV_RANDOM, mapflags)) != 0) { panic("uvm_km_suballoc: unable to allocate space in parent map"); } /* set VM bounds (min is filled in by uvm_map) */ *max = *min + size; /* add references to pmap and create or init the submap */ pmap_reference(vm_map_pmap(map)); if (submap == NULL) { submap = uvm_map_create(vm_map_pmap(map), *min, *max, flags); if (submap == NULL) panic("uvm_km_suballoc: unable to create submap"); } else { uvm_map_setup(submap, vm_map_pmap(map), *min, *max, flags); } /* * now let uvm_map_submap plug in it... */ if (uvm_map_submap(map, *min, *max, submap) != 0) panic("uvm_km_suballoc: submap allocation failed"); return(submap); } /* * uvm_km_pgremove: remove pages from a kernel uvm_object. * * => when you unmap a part of anonymous kernel memory you want to toss * the pages right away. (this gets called from uvm_unmap_...). 
*/ void uvm_km_pgremove(struct uvm_object *uobj, vaddr_t startva, vaddr_t endva) { const voff_t start = startva - vm_map_min(kernel_map); const voff_t end = endva - vm_map_min(kernel_map); struct vm_page *pp; voff_t curoff; int slot; int swpgonlydelta = 0; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); KASSERT(rw_write_held(uobj->vmobjlock)); pmap_remove(pmap_kernel(), startva, endva); for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) { pp = uvm_pagelookup(uobj, curoff); if (pp && pp->pg_flags & PG_BUSY) { uvm_pagewait(pp, uobj->vmobjlock, "km_pgrm"); rw_enter(uobj->vmobjlock, RW_WRITE); curoff -= PAGE_SIZE; /* loop back to us */ continue; } /* free the swap slot, then the page */ slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT); if (pp != NULL) { uvm_lock_pageq(); uvm_pagefree(pp); uvm_unlock_pageq(); } else if (slot != 0) { swpgonlydelta++; } } if (swpgonlydelta > 0) { KASSERT(uvmexp.swpgonly >= swpgonlydelta); atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta); } } /* * uvm_km_pgremove_intrsafe: like uvm_km_pgremove(), but for "intrsafe" * objects * * => when you unmap a part of anonymous kernel memory you want to toss * the pages right away. (this gets called from uvm_unmap_...). * => none of the pages will ever be busy, and none of them will ever * be on the active or inactive queues (because these objects are * never allowed to "page"). */ void uvm_km_pgremove_intrsafe(vaddr_t start, vaddr_t end) { struct vm_page *pg; vaddr_t va; paddr_t pa; for (va = start; va < end; va += PAGE_SIZE) { if (!pmap_extract(pmap_kernel(), va, &pa)) continue; pg = PHYS_TO_VM_PAGE(pa); if (pg == NULL) panic("uvm_km_pgremove_intrsafe: no page"); uvm_pagefree(pg); } pmap_kremove(start, end - start); } /* * uvm_km_kmemalloc: lower level kernel memory allocator for malloc() * * => we map wired memory into the specified map using the obj passed in * => NOTE: we can return NULL even if we can wait if there is not enough * free VM space in the map... 
caller should be prepared to handle * this case. * => we return KVA of memory allocated * => flags: NOWAIT, VALLOC - just allocate VA, TRYLOCK - fail if we can't * lock the map * => low, high, alignment, boundary, nsegs are the corresponding parameters * to uvm_pglistalloc * => flags: ZERO - correspond to uvm_pglistalloc flags */ vaddr_t uvm_km_kmemalloc_pla(struct vm_map *map, struct uvm_object *obj, vsize_t size, vsize_t valign, int flags, paddr_t low, paddr_t high, paddr_t alignment, paddr_t boundary, int nsegs) { vaddr_t kva, loopva; voff_t offset; struct vm_page *pg; struct pglist pgl; int pla_flags; KASSERT(vm_map_pmap(map) == pmap_kernel()); /* UVM_KMF_VALLOC => !UVM_KMF_ZERO */ KASSERT(!(flags & UVM_KMF_VALLOC) || !(flags & UVM_KMF_ZERO)); /* setup for call */ size = round_page(size); kva = vm_map_min(map); /* hint */ if (nsegs == 0) nsegs = atop(size); /* allocate some virtual space */ if (__predict_false(uvm_map(map, &kva, size, obj, UVM_UNKNOWN_OFFSET, valign, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE, MAP_INHERIT_NONE, MADV_RANDOM, (flags & UVM_KMF_TRYLOCK))) != 0)) { return 0; } /* if all we wanted was VA, return now */ if (flags & UVM_KMF_VALLOC) { return kva; } /* recover object offset from virtual address */ if (obj != NULL) offset = kva - vm_map_min(kernel_map); else offset = 0; /* * now allocate and map in the memory... note that we are the only ones * whom should ever get a handle on this area of VM. */ TAILQ_INIT(&pgl); pla_flags = 0; KASSERT(uvmexp.swpgonly <= uvmexp.swpages); if ((flags & UVM_KMF_NOWAIT) || ((flags & UVM_KMF_CANFAIL) && uvmexp.swpages - uvmexp.swpgonly <= atop(size))) pla_flags |= UVM_PLA_NOWAIT; else pla_flags |= UVM_PLA_WAITOK; if (flags & UVM_KMF_ZERO) pla_flags |= UVM_PLA_ZERO; if (uvm_pglistalloc(size, low, high, alignment, boundary, &pgl, nsegs, pla_flags) != 0) { /* Failed. 
*/ uvm_unmap(map, kva, kva + size); return (0); } if (obj != NULL) rw_enter(obj->vmobjlock, RW_WRITE); loopva = kva; while (loopva != kva + size) { pg = TAILQ_FIRST(&pgl); TAILQ_REMOVE(&pgl, pg, pageq); uvm_pagealloc_pg(pg, obj, offset, NULL); atomic_clearbits_int(&pg->pg_flags, PG_BUSY); UVM_PAGE_OWN(pg, NULL); /* * map it in: note that we call pmap_enter with the map and * object unlocked in case we are kmem_map. */ if (obj == NULL) { pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE); } else { pmap_enter(map->pmap, loopva, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE | PMAP_WIRED); } loopva += PAGE_SIZE; offset += PAGE_SIZE; } KASSERT(TAILQ_EMPTY(&pgl)); pmap_update(pmap_kernel()); if (obj != NULL) rw_exit(obj->vmobjlock); return kva; } /* * uvm_km_free: free an area of kernel memory */ void uvm_km_free(struct vm_map *map, vaddr_t addr, vsize_t size) { uvm_unmap(map, trunc_page(addr), round_page(addr+size)); } /* * uvm_km_alloc1: allocate wired down memory in the kernel map. * * => we can sleep if needed */ vaddr_t uvm_km_alloc1(struct vm_map *map, vsize_t size, vsize_t align, boolean_t zeroit) { vaddr_t kva, loopva; voff_t offset; struct vm_page *pg; KASSERT(vm_map_pmap(map) == pmap_kernel()); size = round_page(size); kva = vm_map_min(map); /* hint */ /* allocate some virtual space */ if (__predict_false(uvm_map(map, &kva, size, uvm.kernel_object, UVM_UNKNOWN_OFFSET, align, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_INHERIT_NONE, MADV_RANDOM, 0)) != 0)) { return 0; } /* recover object offset from virtual address */ offset = kva - vm_map_min(kernel_map); /* now allocate the memory. we must be careful about released pages. 
*/ loopva = kva; while (size) { rw_enter(uvm.kernel_object->vmobjlock, RW_WRITE); /* allocate ram */ pg = uvm_pagealloc(uvm.kernel_object, offset, NULL, 0); if (pg) { atomic_clearbits_int(&pg->pg_flags, PG_BUSY); UVM_PAGE_OWN(pg, NULL); } rw_exit(uvm.kernel_object->vmobjlock); if (__predict_false(pg == NULL)) { if (curproc == uvm.pagedaemon_proc) { /* * It is unfeasible for the page daemon to * sleep for memory, so free what we have * allocated and fail. */ uvm_unmap(map, kva, loopva - kva); return (0); } else { uvm_wait("km_alloc1w"); /* wait for memory */ continue; } } /* * map it in; note we're never called with an intrsafe * object, so we always use regular old pmap_enter(). */ pmap_enter(map->pmap, loopva, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE | PMAP_WIRED); loopva += PAGE_SIZE; offset += PAGE_SIZE; size -= PAGE_SIZE; } pmap_update(map->pmap); /* * zero on request (note that "size" is now zero due to the above loop * so we need to subtract kva from loopva to reconstruct the size). */ if (zeroit) memset((caddr_t)kva, 0, loopva - kva); return kva; } #if defined(__HAVE_PMAP_DIRECT) /* * uvm_km_page allocator, __HAVE_PMAP_DIRECT arch * On architectures with machine memory direct mapped into a portion * of KVM, we have very little work to do. Just get a physical page, * and find and return its VA. */ void uvm_km_page_init(void) { /* nothing */ } void uvm_km_page_lateinit(void) { /* nothing */ } #else /* * uvm_km_page allocator, non __HAVE_PMAP_DIRECT archs * This is a special allocator that uses a reserve of free pages * to fulfill requests. It is fast and interrupt safe, but can only * return page sized regions. Its primary use is as a backend for pool. * * The memory returned is allocated from the larger kernel_map, sparing * pressure on the small interrupt-safe kmem_map. It is wired, but * not zero filled. 
*/ struct uvm_km_pages uvm_km_pages; void uvm_km_createthread(void *); void uvm_km_thread(void *); struct uvm_km_free_page *uvm_km_doputpage(struct uvm_km_free_page *); /* * Allocate the initial reserve, and create the thread which will * keep the reserve full. For bootstrapping, we allocate more than * the lowat amount, because it may be a while before the thread is * running. */ void uvm_km_page_init(void) { int lowat_min; int i; int len, bulk; vaddr_t addr; mtx_init(&uvm_km_pages.mtx, IPL_VM); if (!uvm_km_pages.lowat) { /* based on physmem, calculate a good value here */ uvm_km_pages.lowat = physmem / 256; lowat_min = physmem < atop(16 * 1024 * 1024) ? 32 : 128; if (uvm_km_pages.lowat < lowat_min) uvm_km_pages.lowat = lowat_min; } if (uvm_km_pages.lowat > UVM_KM_PAGES_LOWAT_MAX) uvm_km_pages.lowat = UVM_KM_PAGES_LOWAT_MAX; uvm_km_pages.hiwat = 4 * uvm_km_pages.lowat; if (uvm_km_pages.hiwat > UVM_KM_PAGES_HIWAT_MAX) uvm_km_pages.hiwat = UVM_KM_PAGES_HIWAT_MAX; /* Allocate all pages in as few allocations as possible. */ len = 0; bulk = uvm_km_pages.hiwat; while (len < uvm_km_pages.hiwat && bulk > 0) { bulk = MIN(bulk, uvm_km_pages.hiwat - len); addr = vm_map_min(kernel_map); if (uvm_map(kernel_map, &addr, (vsize_t)bulk << PAGE_SHIFT, NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE, MAP_INHERIT_NONE, MADV_RANDOM, UVM_KMF_TRYLOCK)) != 0) { bulk /= 2; continue; } for (i = len; i < len + bulk; i++, addr += PAGE_SIZE) uvm_km_pages.page[i] = addr; len += bulk; } uvm_km_pages.free = len; for (i = len; i < UVM_KM_PAGES_HIWAT_MAX; i++) uvm_km_pages.page[i] = 0; /* tone down if really high */ if (uvm_km_pages.lowat > 512) uvm_km_pages.lowat = 512; } void uvm_km_page_lateinit(void) { kthread_create_deferred(uvm_km_createthread, NULL); } void uvm_km_createthread(void *arg) { kthread_create(uvm_km_thread, NULL, &uvm_km_pages.km_proc, "kmthread"); } /* * Endless loop. 
We grab pages in increments of 16 pages, then
 * quickly swap them into the list.
 *
 * Each round: sleep while the reserve is at or above lowat and the
 * freelist is empty, detach the freelist, map up to nitems(pg) new
 * page-sized VAs if the reserve is below lowat, then hand every
 * detached freelist entry to uvm_km_doputpage().
 */
void
uvm_km_thread(void *arg)
{
	vaddr_t pg[16];
	int i;
	int allocmore = 0;
	int flags;
	struct uvm_km_free_page *fp = NULL;

	KERNEL_UNLOCK();

	for (;;) {
		mtx_enter(&uvm_km_pages.mtx);
		if (uvm_km_pages.free >= uvm_km_pages.lowat &&
		    uvm_km_pages.freelist == NULL) {
			msleep_nsec(&uvm_km_pages.km_proc, &uvm_km_pages.mtx,
			    PVM, "kmalloc", INFSLP);
		}
		/* Snapshot the work to do while still holding the mutex. */
		allocmore = uvm_km_pages.free < uvm_km_pages.lowat;
		fp = uvm_km_pages.freelist;
		uvm_km_pages.freelist = NULL;
		uvm_km_pages.freelistlen = 0;
		mtx_leave(&uvm_km_pages.mtx);

		if (allocmore) {
			/*
			 * If there was nothing on the freelist, then we
			 * must obtain at least one page to make progress.
			 * So, only use UVM_KMF_TRYLOCK for the first page
			 * if fp != NULL
			 */
			flags = UVM_MAPFLAG(PROT_READ | PROT_WRITE,
			    PROT_READ | PROT_WRITE, MAP_INHERIT_NONE,
			    MADV_RANDOM, fp != NULL ? UVM_KMF_TRYLOCK : 0);
			/* Zero marks a slot for which no VA was obtained. */
			memset(pg, 0, sizeof(pg));
			for (i = 0; i < nitems(pg); i++) {
				pg[i] = vm_map_min(kernel_map);
				if (uvm_map(kernel_map, &pg[i], PAGE_SIZE,
				    NULL, UVM_UNKNOWN_OFFSET, 0, flags) != 0) {
					pg[i] = 0;
					break;
				}

				/* made progress, so don't sleep for more */
				flags = UVM_MAPFLAG(PROT_READ | PROT_WRITE,
				    PROT_READ | PROT_WRITE, MAP_INHERIT_NONE,
				    MADV_RANDOM, UVM_KMF_TRYLOCK);
			}

			/*
			 * Push the new VAs onto the reserve; stop early if
			 * the page array is full.  i is left at the first
			 * entry that was not consumed.
			 */
			mtx_enter(&uvm_km_pages.mtx);
			for (i = 0; i < nitems(pg); i++) {
				if (uvm_km_pages.free ==
				    nitems(uvm_km_pages.page))
					break;
				else if (pg[i] != 0)
					uvm_km_pages.page[uvm_km_pages.free++]
					    = pg[i];
			}
			/* km_alloc() sleeps on &uvm_km_pages.free. */
			wakeup(&uvm_km_pages.free);
			mtx_leave(&uvm_km_pages.mtx);

			/* Cleanup left-over pages (if any). */
			for (; i < nitems(pg); i++) {
				if (pg[i] != 0) {
					uvm_unmap(kernel_map,
					    pg[i], pg[i] + PAGE_SIZE);
				}
			}
		}
		/* Process the freelist that was detached above. */
		while (fp) {
			fp = uvm_km_doputpage(fp);
		}
	}
}

/*
 * Dispose of one freelist entry.  The entry lives in the page it
 * describes, so its address is the page's VA.  The mapping is removed,
 * the VA goes back onto the reserve if that is below hiwat (otherwise
 * it is unmapped from kernel_map), and the physical page is freed.
 *
 * => returns the next entry on the list
 */
struct uvm_km_free_page *
uvm_km_doputpage(struct uvm_km_free_page *fp)
{
	vaddr_t va = (vaddr_t)fp;
	struct vm_page *pg;
	int freeva = 1;
	/* Read the link first: fp is unusable once the page is unmapped. */
	struct uvm_km_free_page *nextfp = fp->next;

	pg = uvm_atopg(va);

	pmap_kremove(va, PAGE_SIZE);
	pmap_update(kernel_map->pmap);

	mtx_enter(&uvm_km_pages.mtx);
	if (uvm_km_pages.free < uvm_km_pages.hiwat) {
		uvm_km_pages.page[uvm_km_pages.free++] = va;
		freeva = 0;
	}
	mtx_leave(&uvm_km_pages.mtx);

	if (freeva)
		uvm_unmap(kernel_map, va, va + PAGE_SIZE);

	uvm_pagefree(pg);
	return (nextfp);
}
#endif	/* !__HAVE_PMAP_DIRECT */

/*
 * km_alloc: allocate sz bytes of kernel memory.
 *
 * => sz must already be a multiple of the page size (asserted)
 * => kv selects where the virtual address comes from, kp what backs it
 *	with physical memory, kd whether and how we may wait
 * => physical pages are allocated first unless kp says nomem or
 *	pageable; they are given back if no VA can be found afterwards
 * => returns the address of the memory, or NULL on failure
 */
void *
km_alloc(size_t sz, const struct kmem_va_mode *kv,
    const struct kmem_pa_mode *kp, const struct kmem_dyn_mode *kd)
{
	struct vm_map *map;
	struct vm_page *pg;
	struct pglist pgl;
	int mapflags = 0;
	vm_prot_t prot;
	paddr_t pla_align;
	int pla_flags;
	int pla_maxseg;
	vaddr_t va, sva = 0;

	KASSERT(sz == round_page(sz));

	TAILQ_INIT(&pgl);

	/* No pages to allocate up front: pgl stays empty. */
	if (kp->kp_nomem || kp->kp_pageable)
		goto alloc_va;

	pla_flags = kd->kd_waitok ? UVM_PLA_WAITOK : UVM_PLA_NOWAIT;
	pla_flags |= UVM_PLA_TRYCONTIG;
	if (kp->kp_zero)
		pla_flags |= UVM_PLA_ZERO;

	pla_align = kp->kp_align;
#ifdef __HAVE_PMAP_DIRECT
	/* Use the stricter of the physical and virtual alignments. */
	if (pla_align < kv->kv_align)
		pla_align = kv->kv_align;
#endif
	pla_maxseg = kp->kp_maxseg;
	if (pla_maxseg == 0)
		pla_maxseg = sz / PAGE_SIZE;	/* one segment per page */

	if (uvm_pglistalloc(sz, kp->kp_constraint->ucr_low,
	    kp->kp_constraint->ucr_high, pla_align, kp->kp_boundary,
	    &pgl, pla_maxseg, pla_flags)) {
		return (NULL);
	}

#ifdef __HAVE_PMAP_DIRECT
	/*
	 * Only use direct mappings for single page or single segment
	 * allocations.
	 */
	if (kv->kv_singlepage || kp->kp_maxseg == 1) {
		TAILQ_FOREACH(pg, &pgl, pageq) {
			va = pmap_map_direct(pg);
			/* The address of the first page is the result. */
			if (pg == TAILQ_FIRST(&pgl))
				sva = va;
		}
		return ((void *)sva);
	}
#endif
alloc_va:
	prot = PROT_READ | PROT_WRITE;

	if (kp->kp_pageable) {
		KASSERT(kp->kp_object);
		KASSERT(!kv->kv_singlepage);
	} else {
		KASSERT(kp->kp_object == NULL);
	}

	if (kv->kv_singlepage) {
		KASSERT(sz == PAGE_SIZE);
#ifdef __HAVE_PMAP_DIRECT
		panic("km_alloc: DIRECT single page");
#else
		/* Take a VA from the reserve, sleeping for one if allowed. */
		mtx_enter(&uvm_km_pages.mtx);
		while (uvm_km_pages.free == 0) {
			if (kd->kd_waitok == 0) {
				mtx_leave(&uvm_km_pages.mtx);
				uvm_pglistfree(&pgl);
				return NULL;
			}
			msleep_nsec(&uvm_km_pages.free, &uvm_km_pages.mtx,
			    PVM, "getpage", INFSLP);
		}
		va = uvm_km_pages.page[--uvm_km_pages.free];
		/*
		 * Reserve is running low: flag it to the caller if asked
		 * to and wake the refill thread (unless we are it).
		 */
		if (uvm_km_pages.free < uvm_km_pages.lowat &&
		    curproc != uvm_km_pages.km_proc) {
			if (kd->kd_slowdown)
				*kd->kd_slowdown = 1;
			wakeup(&uvm_km_pages.km_proc);
		}
		mtx_leave(&uvm_km_pages.mtx);
#endif
	} else {
		struct uvm_object *uobj = NULL;

		if (kd->kd_trylock)
			mapflags |= UVM_KMF_TRYLOCK;

		if (kp->kp_object)
			uobj = *kp->kp_object;
try_map:
		map = *kv->kv_map;
		va = vm_map_min(map);
		if (uvm_map(map, &va, sz, uobj, kd->kd_prefer,
		    kv->kv_align, UVM_MAPFLAG(prot, prot, MAP_INHERIT_NONE,
		    MADV_RANDOM, mapflags))) {
			/* km_free() does wakeup() on the map for kv_wait. */
			if (kv->kv_wait && kd->kd_waitok) {
				tsleep_nsec(map, PVM, "km_allocva", INFSLP);
				goto try_map;
			}
			uvm_pglistfree(&pgl);
			return (NULL);
		}
	}
	sva = va;
	/* Enter the allocated pages (if any) at consecutive VAs. */
	TAILQ_FOREACH(pg, &pgl, pageq) {
		if (kp->kp_pageable)
			pmap_enter(pmap_kernel(), va, VM_PAGE_TO_PHYS(pg),
			    prot, prot | PMAP_WIRED);
		else
			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), prot);
		va += PAGE_SIZE;
	}
	pmap_update(pmap_kernel());
	return ((void *)sva);
}

/*
 * km_free: release the sz bytes at v.
 *
 * => NOTE(review): the branches mirror those of km_alloc(), so kv and
 *	kp presumably have to match the modes used for the allocation --
 *	confirm against callers
 * => single pages on non-direct archs are only queued on the freelist
 *	here; uvm_km_thread() does the actual work
 */
void
km_free(void *v, size_t sz, const struct kmem_va_mode *kv,
    const struct kmem_pa_mode *kp)
{
	vaddr_t sva, eva, va;
	struct vm_page *pg;
	struct pglist pgl;

	sva = (vaddr_t)v;
	eva = sva + sz;

	/* No physical memory was attached: only the VA has to go. */
	if (kp->kp_nomem)
		goto free_va;

#ifdef __HAVE_PMAP_DIRECT
	if (kv->kv_singlepage || kp->kp_maxseg == 1) {
		TAILQ_INIT(&pgl);
		for (va = sva; va < eva; va += PAGE_SIZE) {
			pg = pmap_unmap_direct(va);
			TAILQ_INSERT_TAIL(&pgl, pg, pageq);
		}
		uvm_pglistfree(&pgl);
		return;
	}
#else
	if (kv->kv_singlepage) {
		/* The freelist link is stored in the page being freed. */
		struct uvm_km_free_page *fp = v;

		mtx_enter(&uvm_km_pages.mtx);
		fp->next = uvm_km_pages.freelist;
		uvm_km_pages.freelist = fp;
		/* Wake the thread only once the list has grown a bit. */
		if (uvm_km_pages.freelistlen++ > 16)
			wakeup(&uvm_km_pages.km_proc);
		mtx_leave(&uvm_km_pages.mtx);
		return;
	}
#endif

	if (kp->kp_pageable) {
		pmap_remove(pmap_kernel(), sva, eva);
		pmap_update(pmap_kernel());
	} else {
		/* Collect the backing pages before the mappings are gone. */
		TAILQ_INIT(&pgl);
		for (va = sva; va < eva; va += PAGE_SIZE) {
			paddr_t pa;

			if (!pmap_extract(pmap_kernel(), va, &pa))
				continue;

			pg = PHYS_TO_VM_PAGE(pa);
			if (pg == NULL) {
				panic("km_free: unmanaged page 0x%lx", pa);
			}
			TAILQ_INSERT_TAIL(&pgl, pg, pageq);
		}
		pmap_kremove(sva, sz);
		pmap_update(pmap_kernel());
		uvm_pglistfree(&pgl);
	}
free_va:
	uvm_unmap(*kv->kv_map, sva, eva);
	/* Wake km_alloc() callers sleeping on the map for VA. */
	if (kv->kv_wait)
		wakeup(*kv->kv_map);
}

/*
 * Predefined VA modes: which map the virtual address comes from, or
 * single-page mode.
 */
const struct kmem_va_mode kv_any = {
	.kv_map = &kernel_map,
};

const struct kmem_va_mode kv_intrsafe = {
	.kv_map = &kmem_map,
};

const struct kmem_va_mode kv_page = {
	.kv_singlepage = 1
};

/*
 * Predefined PA modes: physical address constraint, zeroing, segment
 * limit, or no/pageable backing.
 */
const struct kmem_pa_mode kp_dirty = {
	.kp_constraint = &no_constraint
};

const struct kmem_pa_mode kp_dma = {
	.kp_constraint = &dma_constraint
};

const struct kmem_pa_mode kp_dma_contig = {
	.kp_constraint = &dma_constraint,
	.kp_maxseg = 1
};

const struct kmem_pa_mode kp_dma_zero = {
	.kp_constraint = &dma_constraint,
	.kp_zero = 1
};

const struct kmem_pa_mode kp_zero = {
	.kp_constraint = &no_constraint,
	.kp_zero = 1
};

const struct kmem_pa_mode kp_pageable = {
	.kp_object = &uvm.kernel_object,
	.kp_pageable = 1
/* XXX - kp_nomem, maybe, but we'll need to fix km_free. */
};

const struct kmem_pa_mode kp_none = {
	.kp_nomem = 1
};

/* Predefined dynamic modes: may sleep, may not sleep, trylock the map. */
const struct kmem_dyn_mode kd_waitok = {
	.kd_waitok = 1,
	.kd_prefer = UVM_UNKNOWN_OFFSET
};

const struct kmem_dyn_mode kd_nowait = {
	.kd_prefer = UVM_UNKNOWN_OFFSET
};

const struct kmem_dyn_mode kd_trylock = {
	.kd_trylock = 1,
	.kd_prefer = UVM_UNKNOWN_OFFSET
};
258 259 257 259 259 260 88 175 174 258 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 /* $OpenBSD: in_cksum.c,v 1.9 2019/04/22 22:47:49 bluhm Exp $ */ /* $NetBSD: in_cksum.c,v 1.11 1996/04/08 19:55:37 jonathan Exp $ */ /* * Copyright (c) 1988, 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)in_cksum.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 */

/*
 * ADDCARRY: if x exceeds 65535, subtract 65535 from it in place.
 * NOTE: the argument is not parenthesised and is evaluated more than
 * once; only pass a plain lvalue.
 *
 * REDUCE: add the two 16-bit halves of `sum' (split through l_util)
 * and apply ADDCARRY to the result.  Relies on locals named `sum' and
 * `l_util' at the point of use.
 */
#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}

/*
 * in_cksum: checksum the first `len' bytes of the mbuf chain `m'.
 *
 * => the data is summed as 16-bit words in memory order; a word may
 *    straddle two mbufs and an mbuf's data may begin on an odd address
 * => returns the complement of the folded sum, masked to 16 bits
 * => panics if the chain holds fewer than `len' bytes
 */
int
in_cksum(struct mbuf *m, int len)
{
	uint16_t *w;
	int sum = 0;
	/* -1 means: one byte of a word is pending in s_util.c[0] */
	int mlen = 0;
	/* set while sum is held shifted by 8 bits after an odd-address byte */
	int byte_swapped = 0;

	/* assembles one 16-bit word from two separately read bytes */
	union {
		uint8_t  c[2];
		uint16_t s;
	} s_util;
	/* gives REDUCE access to the two 16-bit halves of sum */
	union {
		uint16_t s[2];
		uint32_t l;
	} l_util;

	for (;m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		w = mtod(m, uint16_t *);
		if (mlen == -1) {
			/*
			 * The first byte of this mbuf is the continuation
			 * of a word spanning between this mbuf and the
			 * last mbuf.
			 *
			 * s_util.c[0] is already saved when scanning previous
			 * mbuf.
			 */
			s_util.c[1] = *(uint8_t *)w;
			sum += s_util.s;
			w = (uint16_t *)((uint8_t *)w + 1);
			mlen = m->m_len - 1;
			len--;
		} else
			mlen = m->m_len;
		/* never sum more than the caller asked for */
		if (len < mlen)
			mlen = len;
		len -= mlen;
		/*
		 * Force to even boundary.
		 */
		if ((1 & (long) w) && (mlen > 0)) {
			/*
			 * Shift the folded sum by one byte and keep the odd
			 * byte in s_util.c[0]; the shift is undone further
			 * down once this mbuf has been summed.
			 */
			REDUCE;
			sum <<= 8;
			s_util.c[0] = *(uint8_t *)w;
			w = (uint16_t *)((uint8_t *)w + 1);
			mlen--;
			byte_swapped = 1;
		}
		/*
		 * Unroll the loop to make overhead from
		 * branches &c small.
		 */
		/* 32 bytes (16 words) per iteration */
		while ((mlen -= 32) >= 0) {
			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
			sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
			sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
			sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
			w += 16;
		}
		mlen += 32;
		/* 8 bytes (4 words) per iteration */
		while ((mlen -= 8) >= 0) {
			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
			w += 4;
		}
		mlen += 8;
		if (mlen == 0 && byte_swapped == 0)
			continue;
		REDUCE;
		/* remaining whole words; leaves mlen at -1 if a byte is left */
		while ((mlen -= 2) >= 0) {
			sum += *w++;
		}
		if (byte_swapped) {
			/* undo the one-byte shift applied at the odd address */
			REDUCE;
			sum <<= 8;
			byte_swapped = 0;
			if (mlen == -1) {
				/* trailing byte pairs with the saved odd byte */
				s_util.c[1] = *(uint8_t *)w;
				sum += s_util.s;
				mlen = 0;
			} else
				/* saved odd byte still waits for its partner */
				mlen = -1;
		} else if (mlen == -1)
			/* trailing byte: first half of a word, keep it */
			s_util.c[0] = *(uint8_t *)w;
	}
	if (len)
		panic("%s: out of data, len %d", __func__, len);
	if (mlen == -1) {
		/* The last mbuf has odd # of bytes. Follow the
		   standard (the odd byte may be shifted left by 8 bits
		   or not as determined by endian-ness of the machine) */
		s_util.c[1] = 0;
		sum += s_util.s;
	}
	REDUCE;
	return (~sum & 0xffff);
}
279 1117 28 1120 32 32 32 27 32 268 14 181 257 95 268 1145 268 268 260 839 124 125 82 67 1 2 3 125 125 125 125 3 3 32 7 31 32 31 26 1492 1491 1494 1494 1477 1475 32 32 32 32 32 32 6 1493 1090 5 736 1076 738 20 1494 215 1494 189 189 189 1 188 145 145 124 145 124 145 125 20 20 20 1477 1477 1476 140 1410 1083 1475 20 1178 1124 20 1474 1477 1476 1475 1176 1084 267 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 
437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 
937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 
1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 
1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 
2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 /* $OpenBSD: uvm_pmemrange.c,v 1.62 2022/06/02 18:00:53 kettenis Exp $ */ /* * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/param.h> #include <sys/systm.h> #include <uvm/uvm.h> #include <sys/malloc.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/mount.h> /* * 2 trees: addr tree and size tree. * * The allocator keeps chunks of free pages (called a range). * Two pages are part of the same range if: * - all pages in between are part of that range, * - they are of the same memory type (zeroed or non-zeroed), * - they are part of the same pmemrange. * A pmemrange is a range of memory which is part of the same vm_physseg * and has a use-count. * * addr tree is vm_page[0].objt * size tree is vm_page[1].objt * * The size tree is not used for memory ranges of 1 page, instead, * single queue is vm_page[0].pageq * * vm_page[0].fpgsz describes the length of a free range. 
Two adjecent ranges * are joined, unless: * - they have pages in between them which are not free * - they belong to different memtypes (zeroed vs dirty memory) * - they are in different pmemrange areas (ISA vs non-ISA memory for instance) * - they are not a continuation of the same array * The latter issue is caused by vm_physseg ordering and splitting from the * MD initialization machinery. The MD code is dependant on freelists and * happens to split ISA memory from non-ISA memory. * (Note: freelists die die die!) * * uvm_page_init guarantees that every vm_physseg contains an array of * struct vm_page. Also, uvm_page_physload allocates an array of struct * vm_page. This code depends on that array. The array may break across * vm_physsegs boundaries. */ /* * Validate the flags of the page. (Used in asserts.) * Any free page must have the PQ_FREE flag set. * Free pages may be zeroed. * Pmap flags are left untouched. * * The PQ_FREE flag is not checked here: by not checking, we can easily use * this check in pages which are freed. */ #define VALID_FLAGS(pg_flags) \ (((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0) /* Tree comparators. */ int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *, const struct uvm_pmemrange *); int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); int uvm_pmr_pg_to_memtype(struct vm_page *); #ifdef DDB void uvm_pmr_print(void); #endif /* * Memory types. The page flags are used to derive what the current memory * type of a page is. */ int uvm_pmr_pg_to_memtype(struct vm_page *pg) { if (pg->pg_flags & PG_ZERO) return UVM_PMR_MEMTYPE_ZERO; /* Default: dirty memory. */ return UVM_PMR_MEMTYPE_DIRTY; } /* Trees. */ RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, uvm_pmemrange_addr_cmp); /* Validation. 
*/ #ifdef DEBUG void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr); #else #define uvm_pmr_assertvalid(pmr) do {} while (0) #endif psize_t uvm_pmr_get1page(psize_t, int, struct pglist *, paddr_t, paddr_t, int); struct uvm_pmemrange *uvm_pmr_allocpmr(void); struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int); struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *, struct vm_page *, int); void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, struct vm_page **pg_prev, struct vm_page **pg_next); struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *, struct vm_page *, paddr_t); struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *, struct vm_page *, paddr_t); psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t, struct vm_page **, int); psize_t uvm_pmr_remove_1strange_reverse(struct pglist *, paddr_t *); void uvm_pmr_split(paddr_t); struct uvm_pmemrange *uvm_pmemrange_find(paddr_t); struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *, struct uvm_pmemrange *); psize_t pow2divide(psize_t, psize_t); struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *, struct vm_page *, paddr_t, paddr_t, int); /* * Computes num/denom and rounds it up to the next power-of-2. * * This is a division function which calculates an approximation of * num/denom, with result =~ num/denom. It is meant to be fast and doesn't * have to be accurate. * * Providing too large a value makes the allocator slightly faster, at the * risk of hitting the failure case more often. Providing too small a value * makes the allocator a bit slower, but less likely to hit a failure case. */ psize_t pow2divide(psize_t num, psize_t denom) { int rshift; for (rshift = 0; num > denom; rshift++, denom <<= 1) ; return (paddr_t)1 << rshift; } /* * Predicate: lhs is a subrange or rhs. * * If rhs_low == 0: don't care about lower bound. * If rhs_high == 0: don't care about upper bound. 
*/ #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \ (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \ ((rhs_high) == 0 || (lhs_high) <= (rhs_high))) /* * Predicate: lhs intersects with rhs. * * If rhs_low == 0: don't care about lower bound. * If rhs_high == 0: don't care about upper bound. * Ranges don't intersect if they don't have any page in common, array * semantics mean that < instead of <= should be used here. */ #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \ (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \ ((rhs_high) == 0 || (lhs_low) < (rhs_high))) /* * Align to power-of-2 alignment. */ #define PMR_ALIGN(pgno, align) \ (((pgno) + ((align) - 1)) & ~((align) - 1)) #define PMR_ALIGN_DOWN(pgno, align) \ ((pgno) & ~((align) - 1)) /* * Comparator: sort by address ascending. */ int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs, const struct uvm_pmemrange *rhs) { return lhs->low < rhs->low ? -1 : lhs->low > rhs->low; } /* * Comparator: sort by use ascending. * * The higher the use value of a range, the more devices need memory in * this range. Therefore allocate from the range with the lowest use first. */ int uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) { int result; result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; if (result == 0) result = uvm_pmemrange_addr_cmp(lhs, rhs); return result; } int uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs) { paddr_t lhs_addr, rhs_addr; lhs_addr = VM_PAGE_TO_PHYS(lhs); rhs_addr = VM_PAGE_TO_PHYS(rhs); return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); } int uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs) { psize_t lhs_size, rhs_size; int cmp; /* Using second tree, so we receive pg[1] instead of pg[0]. */ lhs_size = (lhs - 1)->fpgsz; rhs_size = (rhs - 1)->fpgsz; cmp = (lhs_size < rhs_size ? 
-1 : lhs_size > rhs_size); if (cmp == 0) cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); return cmp; } /* * Find the first range of free pages that is at least sz pages long. */ struct vm_page * uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) { struct vm_page *node, *best; KASSERT(sz >= 1); if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) return TAILQ_FIRST(&pmr->single[mti]); node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]); best = NULL; while (node != NULL) { if ((node - 1)->fpgsz >= sz) { best = (node - 1); node = RBT_LEFT(uvm_objtree, node); } else node = RBT_RIGHT(uvm_objtree, node); } return best; } /* * Finds the next range. The next range has a size >= pg->fpgsz. * Returns NULL if no more ranges are available. */ struct vm_page * uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) { struct vm_page *npg; KASSERT(pmr != NULL && pg != NULL); if (pg->fpgsz == 1) { if (TAILQ_NEXT(pg, pageq) != NULL) return TAILQ_NEXT(pg, pageq); else npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]); } else npg = RBT_NEXT(uvm_pmr_size, pg + 1); return npg == NULL ? NULL : npg - 1; } /* * Finds the previous and next ranges relative to the (uninserted) pg range. * * *pg_prev == NULL if no previous range is available, that can join with * pg. * *pg_next == NULL if no next range is available, that can join with * pg. */ void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, struct vm_page **pg_prev, struct vm_page **pg_next) { KASSERT(pg_prev != NULL && pg_next != NULL); *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); if (*pg_next == NULL) *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); else *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next); KDASSERT(*pg_next == NULL || VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg)); KDASSERT(*pg_prev == NULL || VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg)); /* Reset if not contig. 
*/ if (*pg_prev != NULL && (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz != atop(VM_PAGE_TO_PHYS(pg)) || *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */ uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) *pg_prev = NULL; if (*pg_next != NULL && (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz != atop(VM_PAGE_TO_PHYS(*pg_next)) || pg + pg->fpgsz != *pg_next || /* Array broke. */ uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) *pg_next = NULL; return; } /* * Remove a range from the address tree. * Address tree maintains pmr counters. */ void uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) { KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); KDASSERT(pg->pg_flags & PQ_FREE); RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg); pmr->nsegs--; } /* * Remove a range from the size tree. */ void uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) { int memtype; #ifdef DEBUG struct vm_page *i; #endif KDASSERT(pg->fpgsz >= 1); KDASSERT(pg->pg_flags & PQ_FREE); memtype = uvm_pmr_pg_to_memtype(pg); if (pg->fpgsz == 1) { #ifdef DEBUG TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { if (i == pg) break; } KDASSERT(i == pg); #endif TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); } else { KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype], pg + 1) == pg + 1); RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); } } /* Remove from both trees. */ void uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) { uvm_pmr_assertvalid(pmr); uvm_pmr_remove_size(pmr, pg); uvm_pmr_remove_addr(pmr, pg); uvm_pmr_assertvalid(pmr); } /* * Insert the range described in pg. * Returns the range thus created (which may be joined with the previous and * next ranges). * If no_join, the caller guarantees that the range cannot possibly join * with adjecent ranges. 
*/ struct vm_page * uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) { struct vm_page *prev, *next; #ifdef DEBUG struct vm_page *i; int mt; #endif KDASSERT(pg->pg_flags & PQ_FREE); KDASSERT(pg->fpgsz >= 1); #ifdef DEBUG for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { TAILQ_FOREACH(i, &pmr->single[mt], pageq) KDASSERT(i != pg); if (pg->fpgsz > 1) { KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt], pg + 1) == NULL); } KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); } #endif if (!no_join) { uvm_pmr_pnaddr(pmr, pg, &prev, &next); if (next != NULL) { uvm_pmr_remove_size(pmr, next); uvm_pmr_remove_addr(pmr, next); pg->fpgsz += next->fpgsz; next->fpgsz = 0; } if (prev != NULL) { uvm_pmr_remove_size(pmr, prev); prev->fpgsz += pg->fpgsz; pg->fpgsz = 0; return prev; } } RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg); pmr->nsegs++; return pg; } /* * Insert the range described in pg. * Returns the range thus created (which may be joined with the previous and * next ranges). * Page must already be in the address tree. */ void uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) { int memtype; #ifdef DEBUG struct vm_page *i; int mti; #endif KDASSERT(pg->fpgsz >= 1); KDASSERT(pg->pg_flags & PQ_FREE); memtype = uvm_pmr_pg_to_memtype(pg); #ifdef DEBUG for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { TAILQ_FOREACH(i, &pmr->single[mti], pageq) KDASSERT(i != pg); if (pg->fpgsz > 1) { KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti], pg + 1) == NULL); } KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); } for (i = pg; i < pg + pg->fpgsz; i++) KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); #endif if (pg->fpgsz == 1) TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); else RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); } /* Insert in both trees. 
*/ struct vm_page * uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) { uvm_pmr_assertvalid(pmr); pg = uvm_pmr_insert_addr(pmr, pg, no_join); uvm_pmr_insert_size(pmr, pg); uvm_pmr_assertvalid(pmr); return pg; } /* * Find the last page that is part of this segment. * => pg: the range at which to start the search. * => boundary: the page number boundary specification (0 = no boundary). * => pmr: the pmemrange of the page. * * This function returns 1 before the next range, so if you want to have the * next range, you need to run TAILQ_NEXT(result, pageq) after calling. * The reason is that this way, the length of the segment is easily * calculated using: atop(result) - atop(pg) + 1. * Hence this function also never returns NULL. */ struct vm_page * uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr, struct vm_page *pg, paddr_t boundary) { paddr_t first_boundary; struct vm_page *next; struct vm_page *prev; KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && pmr->high > atop(VM_PAGE_TO_PHYS(pg))); if (boundary != 0) { first_boundary = PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); } else first_boundary = 0; /* * Increase next until it hits the first page of the next segment. 
* * While loop checks the following: * - next != NULL we have not reached the end of pgl * - boundary == 0 || next < first_boundary * we do not cross a boundary * - atop(prev) + 1 == atop(next) * still in the same segment * - low <= last * - high > last still in the same memory range * - memtype is equal allocator is unable to view different memtypes * as part of the same segment * - prev + 1 == next no array breakage occurs */ prev = pg; next = TAILQ_NEXT(prev, pageq); while (next != NULL && (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) && atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) && pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && pmr->high > atop(VM_PAGE_TO_PHYS(next)) && uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && prev + 1 == next) { prev = next; next = TAILQ_NEXT(prev, pageq); } /* * End of this segment. */ return prev; } /* * Find the first page that is part of this segment. * => pg: the range at which to start the search. * => boundary: the page number boundary specification (0 = no boundary). * => pmr: the pmemrange of the page. * * This function returns 1 after the previous range, so if you want to have the * previous range, you need to run TAILQ_NEXT(result, pageq) after calling. * The reason is that this way, the length of the segment is easily * calculated using: atop(pg) - atop(result) + 1. * Hence this function also never returns NULL. */ struct vm_page * uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr, struct vm_page *pg, paddr_t boundary) { paddr_t first_boundary; struct vm_page *next; struct vm_page *prev; KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && pmr->high > atop(VM_PAGE_TO_PHYS(pg))); if (boundary != 0) { first_boundary = PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary); } else first_boundary = 0; /* * Increase next until it hits the first page of the previous segment. 
* * While loop checks the following: * - next != NULL we have not reached the end of pgl * - boundary == 0 || next >= first_boundary * we do not cross a boundary * - atop(prev) - 1 == atop(next) * still in the same segment * - low <= last * - high > last still in the same memory range * - memtype is equal allocator is unable to view different memtypes * as part of the same segment * - prev - 1 == next no array breakage occurs */ prev = pg; next = TAILQ_NEXT(prev, pageq); while (next != NULL && (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) && atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) && pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && pmr->high > atop(VM_PAGE_TO_PHYS(next)) && uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && prev - 1 == next) { prev = next; next = TAILQ_NEXT(prev, pageq); } /* * Start of this segment. */ return prev; } /* * Remove the first segment of contiguous pages from pgl. * A segment ends if it crosses boundary (unless boundary = 0) or * if it would enter a different uvm_pmemrange. * * Work: the page range that the caller is currently working with. * May be null. * * If is_desperate is non-zero, the smallest segment is erased. Otherwise, * the first segment is erased (which, if called by uvm_pmr_getpages(), * probably is the smallest or very close to it). */ psize_t uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, struct vm_page **work, int is_desperate) { struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest; psize_t count; struct uvm_pmemrange *pmr, *pmr_iter; KASSERT(!TAILQ_EMPTY(pgl)); /* * Initialize to first page. * Unless desperate scan finds a better candidate, this is what'll be * erased. */ start = TAILQ_FIRST(pgl); pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); end = uvm_pmr_findnextsegment(pmr, start, boundary); /* * If we are desperate, we _really_ want to get rid of the smallest * element (rather than a close match to the smallest element). 
*/ if (is_desperate) { /* Linear search for smallest segment. */ pmr_iter = pmr; for (iter = TAILQ_NEXT(end, pageq); iter != NULL && start != end; iter = TAILQ_NEXT(iter_end, pageq)) { /* * Only update pmr if it doesn't match current * iteration. */ if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) || pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) { pmr_iter = uvm_pmemrange_find(atop( VM_PAGE_TO_PHYS(iter))); } iter_end = uvm_pmr_findnextsegment(pmr_iter, iter, boundary); /* * Current iteration is smaller than best match so * far; update. */ if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) < VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) { start = iter; end = iter_end; pmr = pmr_iter; } } } /* * Calculate count and end of the list. */ count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1; lowest = start; end = TAILQ_NEXT(end, pageq); /* * Actually remove the range of pages. * * Sadly, this cannot be done using pointer iteration: * vm_physseg is not guaranteed to be sorted on address, hence * uvm_page_init() may not have initialized its array sorted by * page number. */ for (iter = start; iter != end; iter = iter_end) { iter_end = TAILQ_NEXT(iter, pageq); TAILQ_REMOVE(pgl, iter, pageq); } lowest->fpgsz = count; inserted = uvm_pmr_insert(pmr, lowest, 0); /* * If the caller was working on a range and this function modified * that range, update the pointer. */ if (work != NULL && *work != NULL && atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz > atop(VM_PAGE_TO_PHYS(*work))) *work = inserted; return count; } /* * Remove the first segment of contiguous pages from a pgl * with the list elements in reverse order of physaddr. * * A segment ends if it would enter a different uvm_pmemrange. * * Stores starting physical address of the segment in pstart. 
*/
psize_t
uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
{
	struct uvm_pmemrange *pmr;
	struct vm_page *first, *lowest, *stop, *cur, *nxt;
	psize_t npages;

	KASSERT(!TAILQ_EMPTY(pgl));

	/*
	 * The head of the list is the highest page of the segment; the
	 * segment search (no boundary) yields its lowest page.
	 */
	first = TAILQ_FIRST(pgl);
	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(first)));
	lowest = uvm_pmr_findprevsegment(pmr, first, 0);

	KASSERT(lowest <= first);

	/*
	 * Calculate count and end of the list.
	 */
	npages = atop(VM_PAGE_TO_PHYS(first) - VM_PAGE_TO_PHYS(lowest)) + 1;
	stop = TAILQ_NEXT(lowest, pageq);

	/*
	 * Actually remove the range of pages.
	 *
	 * Sadly, this cannot be done using pointer iteration:
	 * vm_physseg is not guaranteed to be sorted on address, hence
	 * uvm_page_init() may not have initialized its array sorted by
	 * page number.
	 */
	cur = first;
	while (cur != stop) {
		/* Remember the successor before unlinking cur. */
		nxt = TAILQ_NEXT(cur, pageq);
		TAILQ_REMOVE(pgl, cur, pageq);
		cur = nxt;
	}

	/* Hand the whole segment back to its pmemrange as one range. */
	lowest->fpgsz = npages;
	(void) uvm_pmr_insert(pmr, lowest, 0);

	*pstart = VM_PAGE_TO_PHYS(lowest);
	return npages;
}

/*
 * Extract a number of pages from a segment of free pages.
 * Called by uvm_pmr_getpages.
 *
 * Returns the segment that was created from pages left over at the tail
 * of the remove set of pages, or NULL if no pages were left at the tail.
*/ struct vm_page * uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, paddr_t start, paddr_t end, struct pglist *result) { struct vm_page *after, *pg_i; psize_t before_sz, after_sz; #ifdef DEBUG psize_t i; #endif KDASSERT(end > start); KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz); KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end); before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end; KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz); uvm_pmr_assertvalid(pmr); uvm_pmr_remove_size(pmr, pg); if (before_sz == 0) uvm_pmr_remove_addr(pmr, pg); after = pg + before_sz + (end - start); /* Add selected pages to result. */ for (pg_i = pg + before_sz; pg_i != after; pg_i++) { KASSERT(pg_i->pg_flags & PQ_FREE); pg_i->fpgsz = 0; TAILQ_INSERT_TAIL(result, pg_i, pageq); } /* Before handling. */ if (before_sz > 0) { pg->fpgsz = before_sz; uvm_pmr_insert_size(pmr, pg); } /* After handling. */ if (after_sz > 0) { #ifdef DEBUG for (i = 0; i < after_sz; i++) { KASSERT(!uvm_pmr_isfree(after + i)); } #endif KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); after->fpgsz = after_sz; after = uvm_pmr_insert_addr(pmr, after, 1); uvm_pmr_insert_size(pmr, after); } uvm_pmr_assertvalid(pmr); return (after_sz > 0 ? after : NULL); } /* * Indicate to the page daemon that a nowait call failed and it should * recover at least some memory in the most restricted region (assumed * to be dma_constraint). */ extern volatile int uvm_nowait_failed; /* * Acquire a number of pages. 
*
 * count:	the number of pages returned
 * start:	lowest page number
 * end:		highest page number +1
 *		(start = end = 0: no limitation)
 * align:	power-of-2 alignment constraint (align = 1: no alignment)
 * boundary:	power-of-2 boundary (boundary = 0: no boundary)
 * maxseg:	maximum number of segments to return
 * flags:	UVM_PLA_* flags
 * result:	returned pages storage (uses pageq)
 */
int
uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
    paddr_t boundary, int maxseg, int flags, struct pglist *result)
{
	/*
	 * Returns 0 once exactly count pages have been appended to result,
	 * or ENOMEM on failure.  result must be empty on entry (asserted)
	 * and is drained again on the fail path before ENOMEM is returned.
	 */
	struct uvm_pmemrange *pmr;	/* Iterate memory ranges. */
	struct vm_page *found, *f_next; /* Iterate chunks. */
	psize_t fcount;			/* Current found pages. */
	int fnsegs;			/* Current segment counter. */
	int try, start_try;
	psize_t search[3];
	paddr_t fstart, fend;		/* Pages to be taken from found. */
	int memtype;			/* Requested memtype. */
	int memtype_init;		/* Best memtype. */
	int desperate;			/* True if allocation failed. */
#ifdef DIAGNOSTIC
	struct vm_page *diag_prev;	/* Used during validation. */
#endif /* DIAGNOSTIC */

	/*
	 * Validate arguments.
	 */
	KASSERT(count > 0);
	KASSERT(start == 0 || end == 0 || start < end);
	KASSERT(align >= 1);
	KASSERT(powerof2(align));
	KASSERT(maxseg > 0);
	KASSERT(boundary == 0 || powerof2(boundary));
	KASSERT(boundary == 0 || maxseg * boundary >= count);
	KASSERT(TAILQ_EMPTY(result));
	/* Exactly one of UVM_PLA_WAITOK and UVM_PLA_NOWAIT must be set. */
	KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));

	/*
	 * TRYCONTIG is a noop if you only want a single segment.
	 * Remove it if that's the case: otherwise it'll deny the fast
	 * allocation.
	 */
	if (maxseg == 1 || count == 1)
		flags &= ~UVM_PLA_TRYCONTIG;

	/*
	 * Configure search.
	 *
	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
	 * search[1] is multiple segments, chosen to fulfill the search in
	 *   approximately even-sized segments.
	 *   This is a good trade-off between slightly reduced allocation speed
	 *   and less fragmentation.
	 * search[2] is the worst case, in which all segments are evaluated.
	 *   This provides the least fragmentation, but makes the search
	 *   possibly longer (although in the case it is selected, that no
	 *   longer matters most).
	 *
	 * The exception is when maxseg == 1: since we can only fulfill that
	 * with one segment of size pages, only a single search type has to
	 * be attempted.
	 *
	 * start_try is the index of the first search[] entry to use; entries
	 * below it are not initialized in the first two branches and are
	 * never read, because try only ever counts upwards from start_try.
	 */
	if (maxseg == 1 || count == 1) {
		start_try = 2;
		search[2] = count;
	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
		start_try = 2;
		search[2] = 1;
	} else {
		start_try = 0;
		search[0] = count;
		search[1] = pow2divide(count, maxseg);
		search[2] = 1;
		if ((flags & UVM_PLA_TRYCONTIG) == 0)
			start_try = 1;
		/* Even split no smaller than the full request: skip try 0. */
		if (search[1] >= search[0]) {
			search[1] = search[0];
			start_try = 1;
		}
		/* First try already down at single pages: skip to try 2. */
		if (search[2] >= search[start_try]) {
			start_try = 2;
		}
	}

	/*
	 * Memory type: if zeroed memory is requested, traverse the zero set.
	 * Otherwise, traverse the dirty set.
	 *
	 * The memtype iterator is reinitialized to memtype_init on entrance
	 * of a pmemrange.
	 */
	if (flags & UVM_PLA_ZERO)
		memtype_init = UVM_PMR_MEMTYPE_ZERO;
	else
		memtype_init = UVM_PMR_MEMTYPE_DIRTY;

	/*
	 * Initially, we're not desperate.
	 *
	 * Note that if we return from a sleep, we are still desperate.
	 * Chances are that memory pressure is still high, so resetting
	 * seems over-optimistic to me.
	 */
	desperate = 0;

again:
	uvm_lock_fpageq();

	/*
	 * check to see if we need to generate some free pages waking
	 * the pagedaemon.
	 */
	if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
	    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
	    (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
		wakeup(&uvm.pagedaemon);

	/*
	 * fail if any of these conditions is true:
	 * [1]  there really are no free pages, or
	 * [2]  only kernel "reserved" pages remain and
	 *        the UVM_PLA_USERESERVE flag wasn't used.
	 * [3]  only pagedaemon "reserved" pages remain and
	 *        the requestor isn't the pagedaemon nor the syncer.
	 *
	 * Note that the kernel-reserve check returns ENOMEM without
	 * sleeping even when UVM_PLA_WAITOK is set; only the pagedaemon-
	 * reserve check waits and starts over at "again".
	 */
	if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) &&
	    !(flags & UVM_PLA_USERESERVE)) {
		uvm_unlock_fpageq();
		return ENOMEM;
	}

	if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
	    (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) {
		uvm_unlock_fpageq();
		if (flags & UVM_PLA_WAITOK) {
			uvm_wait("uvm_pmr_getpages");
			goto again;
		}
		return ENOMEM;
	}

	/*
	 * NOTE(review): "retry" is reached from the fail path after
	 * uvm_wait_pla() returned 0 and nothing below re-takes the lock, so
	 * uvm_wait_pla() presumably returns with fpageq held -- confirm.
	 */
retry:		/* Return point after sleeping. */
	fcount = 0;
	fnsegs = 0;

retry_desperate:
	/*
	 * If we just want any page(s), go for the really fast option.
	 */
	if (count <= maxseg && align == 1 && boundary == 0 &&
	    (flags & UVM_PLA_TRYCONTIG) == 0) {
		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
		    result, start, end, 0);

		/*
		 * If we found sufficient pages, go to the success exit code.
		 *
		 * Otherwise, go immediately to fail, since we collected
		 * all we could anyway.
		 */
		if (fcount == count)
			goto out;
		else
			goto fail;
	}

	/*
	 * The heart of the contig case.
	 *
	 * The code actually looks like this:
	 *
	 * foreach (struct pmemrange) {
	 *	foreach (memtype) {
	 *		foreach(try) {
	 *			foreach (free range of memtype in pmemrange,
	 *			    starting at search[try]) {
	 *				while (range has space left)
	 *					take from range
	 *			}
	 *		}
	 *	}
	 *
	 *	if next pmemrange has higher usecount than current:
	 *		enter desperate case (which will drain the pmemranges
	 *		until empty prior to moving to the next one)
	 * }
	 *
	 * When desperate is activated, try always starts at the highest
	 * value. The memtype loop is using a goto rescan_memtype.
	 * The try loop is using a goto rescan.
	 * The 'range has space left' loop uses label drain_found.
	 *
	 * Writing them all as loops would take up a lot of screen space in
	 * the form of indentation and some parts are easier to express
	 * using the labels.
	 */

	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
		/* Empty range. */
		if (pmr->nsegs == 0)
			continue;

		/* Outside requested range. */
		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
			continue;

		memtype = memtype_init;

rescan_memtype:	/* Return point at memtype++. */
		try = start_try;

rescan:		/* Return point at try++. */
		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
		    found != NULL;
		    found = f_next) {
			/* Fetched up front: found is modified below. */
			f_next = uvm_pmr_nextsz(pmr, found, memtype);

			fstart = atop(VM_PAGE_TO_PHYS(found));
			if (start != 0)
				fstart = MAX(start, fstart);
drain_found:
			/*
			 * Throw away the first segment if fnsegs == maxseg
			 *
			 * Note that f_next is still valid after this call,
			 * since we only allocated from entries before f_next.
			 * We don't revisit the entries we already extracted
			 * from unless we entered the desperate case.
			 */
			if (fnsegs == maxseg) {
				fnsegs--;
				fcount -=
				    uvm_pmr_remove_1strange(result, boundary,
				    &found, desperate);
			}

			/*
			 * Clip the candidate [fstart, fend) to the chunk,
			 * the requested window and the boundary.
			 */
			fstart = PMR_ALIGN(fstart, align);
			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
			if (end != 0)
				fend = MIN(end, fend);
			if (boundary != 0) {
				fend =
				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
			}
			/* Nothing usable in this chunk: move on to f_next. */
			if (fstart >= fend)
				continue;
			/* Never take more than is still missing. */
			if (fend - fstart > count - fcount)
				fend = fstart + (count - fcount);

			fcount += fend - fstart;
			fnsegs++;
			/* found becomes the remainder behind fend, or NULL. */
			found = uvm_pmr_extract_range(pmr, found,
			    fstart, fend, result);

			if (fcount == count)
				goto out;

			/*
			 * If there's still space left in found, try to
			 * fully drain it prior to continuing.
			 */
			if (found != NULL) {
				fstart = fend;
				goto drain_found;
			}
		}

		/* Try a smaller search now. */
		if (++try < nitems(search))
			goto rescan;

		/*
		 * Exhaust all memory types prior to going to the next memory
		 * segment.
		 * This means that zero-vs-dirty are eaten prior to moving
		 * to a pmemrange with a higher use-count.
		 *
		 * Code is basically a difficult way of writing:
		 * memtype = memtype_init;
		 * do {
		 *	...;
		 *	memtype += 1;
		 *	memtype %= MEMTYPE_MAX;
		 * } while (memtype != memtype_init);
		 */
		memtype += 1;
		if (memtype == UVM_PMR_MEMTYPE_MAX)
			memtype = 0;
		if (memtype != memtype_init)
			goto rescan_memtype;

		/*
		 * If not desperate, enter desperate case prior to eating all
		 * the good stuff in the next range.
		 */
		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
			break;
	}

	/*
	 * Not enough memory of the requested type available. Fall back to
	 * less good memory that we'll clean up better later.
	 *
	 * This algorithm is not very smart though, it just starts scanning
	 * a different typed range, but the nicer ranges of the previous
	 * iteration may fall out. Hence there is a small chance of a false
	 * negative.
	 *
	 * When desperate: only the last search entry is used
	 * (start_try = nitems(search) - 1) and UVM_PLA_TRYCONTIG is no
	 * longer considered (which may allow us to hit the fast path now).
	 *
	 * Also, because we will revisit entries we scanned before, we need
	 * to reset the page queue, or we may end up releasing entries in
	 * such a way as to invalidate f_next.
	 */
	if (!desperate) {
		desperate = 1;
		start_try = nitems(search) - 1;
		flags &= ~UVM_PLA_TRYCONTIG;

		while (!TAILQ_EMPTY(result))
			uvm_pmr_remove_1strange(result, 0, NULL, 0);
		fnsegs = 0;
		fcount = 0;
		goto retry_desperate;
	}

fail:
	/* Allocation failed. */
	/* XXX: claim from memory reserve here */

	/* Drain whatever was collected so far out of result. */
	while (!TAILQ_EMPTY(result))
		uvm_pmr_remove_1strange(result, 0, NULL, 0);

	if (flags & UVM_PLA_WAITOK) {
		/*
		 * A zero return from uvm_wait_pla() restarts the search;
		 * anything else is only expected with UVM_PLA_FAILOK.
		 */
		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
		    flags & UVM_PLA_FAILOK) == 0)
			goto retry;
		KASSERT(flags & UVM_PLA_FAILOK);
	} else {
		if (!(flags & UVM_PLA_NOWAKE)) {
			uvm_nowait_failed = 1;
			wakeup(&uvm.pagedaemon);
		}
	}
	uvm_unlock_fpageq();

	return ENOMEM;

out:
	/* Allocation successful. */
	uvmexp.free -= fcount;

	uvm_unlock_fpageq();

	/*
	 * Update statistics and zero pages if UVM_PLA_ZERO.
	 * This runs without the fpageq lock; it is only re-taken briefly
	 * to adjust uvmexp.zeropages.
	 */
#ifdef DIAGNOSTIC
	/* fcount and fnsegs are reused to recount what is in result. */
	fnsegs = 0;
	fcount = 0;
	diag_prev = NULL;
#endif /* DIAGNOSTIC */
	TAILQ_FOREACH(found, result, pageq) {
		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);

		if (found->pg_flags & PG_ZERO) {
			uvm_lock_fpageq();
			uvmexp.zeropages--;
			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
				wakeup(&uvmexp.zeropages);
			uvm_unlock_fpageq();
		}
		if (flags & UVM_PLA_ZERO) {
			if (found->pg_flags & PG_ZERO)
				uvmexp.pga_zerohit++;
			else {
				uvmexp.pga_zeromiss++;
				uvm_pagezero(found);
			}
		}
		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);

		found->uobject = NULL;
		found->uanon = NULL;
		found->pg_version++;

		/*
		 * Validate that the page matches range criterium.
		 */
		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);

#ifdef DIAGNOSTIC
		/*
		 * Update fcount (# found pages) and
		 * fnsegs (# found segments) counters.
		 */
		if (diag_prev == NULL ||
		    /* new segment if it contains a hole */
		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
		    atop(VM_PAGE_TO_PHYS(found)) ||
		    /* new segment if it crosses boundary */
		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
			fnsegs++;
		fcount++;

		diag_prev = found;
#endif /* DIAGNOSTIC */
	}

#ifdef DIAGNOSTIC
	/*
	 * Panic on algorithm failure.
	 */
	if (fcount != count || fnsegs > maxseg) {
		panic("pmemrange allocation error: "
		    "allocated %ld pages in %d segments, "
		    "but request was %ld pages in %d segments",
		    fcount, fnsegs, count, maxseg);
	}
#endif /* DIAGNOSTIC */

	return 0;
}

/*
 * Free a number of contig pages (invoked by uvm_page_init).
*/
void
uvm_pmr_freepages(struct vm_page *pg, psize_t count)
{
	struct uvm_pmemrange *pmr;
	psize_t i, pmr_count;
	struct vm_page *firstpg = pg;	/* pg is advanced below */

	/*
	 * First pass, without the fpageq lock: check that pg[0..count) are
	 * physically consecutive and not free yet, then flag each page
	 * PQ_FREE and clear PG_ZERO.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
		    atop(VM_PAGE_TO_PHYS(pg)) + i);

		/* Log the offending flags before the KASSERT fires. */
		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
		    VALID_FLAGS(pg[i].pg_flags))) {
			printf("Flags: 0x%x, will panic now.\n",
			    pg[i].pg_flags);
		}
		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
		    VALID_FLAGS(pg[i].pg_flags));
		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
	}

	uvm_lock_fpageq();

	/*
	 * Second pass: hand the pages to the pmemrange they fall in.  The
	 * run is cut at each pmemrange's high bound, so a run spanning
	 * several pmemranges is inserted as several chunks.
	 */
	for (i = count; i > 0; i -= pmr_count) {
		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
		KASSERT(pmr != NULL);

		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
		pg->fpgsz = pmr_count;
		uvm_pmr_insert(pmr, pg, 0);

		uvmexp.free += pmr_count;
		pg += pmr_count;
	}
	wakeup(&uvmexp.free);
	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
		wakeup(&uvmexp.zeropages);

	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));

	uvm_unlock_fpageq();
}

/*
 * Free all pages in the queue.
 *
 * Every page is first flagged PQ_FREE (and has PG_ZERO cleared) without the
 * fpageq lock held.  The list is then consumed range by range under the
 * lock until pgl is empty, crediting uvmexp.free and calling
 * uvm_wakeup_pla() for each range.
 */
void
uvm_pmr_freepageq(struct pglist *pgl)
{
	struct vm_page *pg;
	paddr_t pstart;
	psize_t plen;

	TAILQ_FOREACH(pg, pgl, pageq) {
		/* Log the offending flags before the KASSERT fires. */
		if (!((pg->pg_flags & PQ_FREE) == 0 &&
		    VALID_FLAGS(pg->pg_flags))) {
			printf("Flags: 0x%x, will panic now.\n",
			    pg->pg_flags);
		}
		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
		    VALID_FLAGS(pg->pg_flags));
		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
	}

	uvm_lock_fpageq();
	while (!TAILQ_EMPTY(pgl)) {
		pg = TAILQ_FIRST(pgl);
		if (pg == TAILQ_NEXT(pg, pageq) + 1) {
			/*
			 * If pg is one behind the position of the
			 * next page in the list in the page array,
			 * try going backwards instead of forward.
			 * (The next list entry is the vm_page directly
			 * preceding pg, i.e. the list is descending here.)
			 */
			plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
		} else {
			pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
			plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
		}
		uvmexp.free += plen;

		uvm_wakeup_pla(pstart, ptoa(plen));
	}
	wakeup(&uvmexp.free);
	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
		wakeup(&uvmexp.zeropages);
	uvm_unlock_fpageq();

	return;
}

/*
 * Store a pmemrange in the list.
 *
 * The list is sorted by use.
 *
 * Returns NULL once pmr has been inserted.  If uvm_pmemrange_use_cmp()
 * reports an existing element as equal to pmr (result 0), that element is
 * returned instead and pmr is NOT inserted.
 */
struct uvm_pmemrange *
uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
    struct uvm_pmemrange *pmr)
{
	struct uvm_pmemrange *iter;
	int cmp = 1;

	/* Stop at the first element for which the comparison yields -1. */
	TAILQ_FOREACH(iter, useq, pmr_use) {
		cmp = uvm_pmemrange_use_cmp(pmr, iter);
		if (cmp == 0)
			return iter;
		if (cmp == -1)
			break;
	}

	/* iter is NULL when the loop ran off the end of the list. */
	if (iter == NULL)
		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
	else
		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
	return NULL;
}

#ifdef DEBUG
/*
 * Validation of the whole pmemrange.
 * Called with fpageq locked.
 *
 * Cross-checks the address tree, the per-memtype size trees/single lists
 * and the nsegs counter against each other; any inconsistency trips a
 * KASSERT.
 */
void
uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
{
	struct vm_page *prev, *next, *i, *xref;
	int lcv, mti;

	/* Empty range */
	if (pmr->nsegs == 0)
		return;

	/* Validate address tree. */
	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
		/* Validate the range. */
		KASSERT(i->fpgsz > 0);
		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
		    <= pmr->high);

		/* Validate each page in this range. */
		for (lcv = 0; lcv < i->fpgsz; lcv++) {
			/*
			 * Only the first page has a size specification.
			 * Rest is size 0.
			 */
			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
			/*
			 * Flag check.
			 */
			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
			/*
			 * Free pages are:
			 * - not wired
			 * - have no vm_anon
			 * - have no uvm_object
			 */
			KASSERT(i[lcv].wire_count == 0);
			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
			    i[lcv].uanon == NULL);
			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
			    i[lcv].uobject == NULL);
			/*
			 * Pages in a single range always have the same
			 * memtype.
			 */
			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
			    uvm_pmr_pg_to_memtype(&i[lcv]));
		}

		/*
		 * Check that it shouldn't be joined with its predecessor:
		 * prev must differ in memtype, or leave a gap in page
		 * numbers, or not be adjacent in the vm_page array.
		 */
		prev = RBT_PREV(uvm_pmr_addr, i);
		if (prev != NULL) {
			KASSERT(uvm_pmr_pg_to_memtype(i) !=
			    uvm_pmr_pg_to_memtype(prev) ||
			    atop(VM_PAGE_TO_PHYS(i)) >
			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
			    prev + prev->fpgsz != i);
		}

		/*
		 * Assert i is in the size tree as well.
		 * Single-page chunks are looked up on the per-memtype
		 * "single" list; larger chunks are looked up in the size
		 * tree through their second page (i + 1).
		 */
		if (i->fpgsz == 1) {
			TAILQ_FOREACH(xref,
			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
				if (xref == i)
					break;
			}
			KASSERT(xref == i);
		} else {
			KASSERT(RBT_FIND(uvm_pmr_size,
			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
			    i + 1);
		}
	}

	/* Validate size tree. */
	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
			next = uvm_pmr_nextsz(pmr, i, mti);
			/* Sizes must be non-decreasing along the walk. */
			if (next != NULL) {
				KASSERT(i->fpgsz <=
				    next->fpgsz);
			}

			/* Assert i is in the addr tree as well. */
			KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);

			/* Assert i is of the correct memory type. */
			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
		}
	}

	/* Validate nsegs statistic. */
	lcv = 0;
	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
		lcv++;
	KASSERT(pmr->nsegs == lcv);
}
#endif /* DEBUG */

/*
 * Split pmr at split point pageno.
 * Called with fpageq unlocked.
 *
 * Split is only applied if a pmemrange spans pageno.
 *
 * On a split, the existing pmemrange keeps [low, pageno) and a newly
 * allocated one takes over [pageno, high) with the same use count.
 */
void
uvm_pmr_split(paddr_t pageno)
{
	struct uvm_pmemrange *pmr, *drain;
	struct vm_page *rebuild, *prev, *next;
	psize_t prev_sz;

	uvm_lock_fpageq();
	pmr = uvm_pmemrange_find(pageno);
	if (pmr == NULL || !(pmr->low < pageno)) {
		/* No split required. */
		uvm_unlock_fpageq();
		return;
	}

	KASSERT(pmr->low < pageno);
	KASSERT(pmr->high > pageno);

	/*
	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
	 * uvm_kmemalloc which calls into pmemrange, making the locking
	 * a bit hard, so we just race!
	 */
	uvm_unlock_fpageq();
	drain = uvm_pmr_allocpmr();
	uvm_lock_fpageq();
	/* The lock was dropped: look the range up again and re-check. */
	pmr = uvm_pmemrange_find(pageno);
	if (pmr == NULL || !(pmr->low < pageno)) {
		/*
		 * We lost the race since someone else ran this or a related
		 * function, however this should be triggered very rarely so
		 * we just leak the pmr.
		 */
		printf("uvm_pmr_split: lost one pmr\n");
		uvm_unlock_fpageq();
		return;
	}

	drain->low = pageno;
	drain->high = pmr->high;
	drain->use = pmr->use;

	uvm_pmr_assertvalid(pmr);
	uvm_pmr_assertvalid(drain);
	KASSERT(drain->nsegs == 0);

	/*
	 * rebuild: first free chunk starting at or beyond pageno (NULL if
	 * there is none).  prev: last free chunk starting below pageno.
	 */
	RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
			break;
	}
	if (rebuild == NULL)
		prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
	else
		prev = RBT_PREV(uvm_pmr_addr, rebuild);
	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);

	/*
	 * Handle free chunk that spans the split point.
	 * It is cut in two at pageno: the leading part goes back into pmr,
	 * the trailing part (headed by prev + before) goes into drain.
	 */
	if (prev != NULL &&
	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
		psize_t before, after;

		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);

		uvm_pmr_remove(pmr, prev);
		prev_sz = prev->fpgsz;
		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;

		KASSERT(before > 0);
		KASSERT(after > 0);

		prev->fpgsz = before;
		uvm_pmr_insert(pmr, prev, 1);
		(prev + before)->fpgsz = after;
		uvm_pmr_insert(drain, prev + before, 1);
	}

	/* Move free chunks that no longer fall in the range. */
	for (; rebuild != NULL; rebuild = next) {
		/* Fetch the successor before rebuild leaves the tree. */
		next = RBT_NEXT(uvm_pmr_addr, rebuild);

		uvm_pmr_remove(pmr, rebuild);
		uvm_pmr_insert(drain, rebuild, 1);
	}

	pmr->high = pageno;
	uvm_pmr_assertvalid(pmr);
	uvm_pmr_assertvalid(drain);

	/* Only now is drain entered in the global tree and use list. */
	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
	uvm_unlock_fpageq();
}

/*
 * Increase the usage counter for the given range of memory.
 *
 * The more usage counters a given range of memory has, the more will be
 * attempted not to allocate from it.
 *
 * Addresses here are in paddr_t, not page-numbers.
* The lowest and highest allowed address are specified. */ void uvm_pmr_use_inc(paddr_t low, paddr_t high) { struct uvm_pmemrange *pmr; paddr_t sz; /* pmr uses page numbers, translate low and high. */ high++; high = atop(trunc_page(high)); low = atop(round_page(low)); uvm_pmr_split(low); uvm_pmr_split(high); sz = 0; uvm_lock_fpageq(); /* Increase use count on segments in range. */ RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) { if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) { TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use); pmr->use++; sz += pmr->high - pmr->low; uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr); } uvm_pmr_assertvalid(pmr); } uvm_unlock_fpageq(); KASSERT(sz >= high - low); } /* * Allocate a pmemrange. * * If called from uvm_page_init, the uvm_pageboot_alloc is used. * If called after uvm_init, malloc is used. * (And if called in between, you're dead.) */ struct uvm_pmemrange * uvm_pmr_allocpmr(void) { struct uvm_pmemrange *nw; int i; /* We're only ever hitting the !uvm.page_init_done case for now. */ if (!uvm.page_init_done) { nw = (struct uvm_pmemrange *) uvm_pageboot_alloc(sizeof(struct uvm_pmemrange)); } else { nw = malloc(sizeof(struct uvm_pmemrange), M_VMMAP, M_NOWAIT); } KASSERT(nw != NULL); memset(nw, 0, sizeof(struct uvm_pmemrange)); RBT_INIT(uvm_pmr_addr, &nw->addr); for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) { RBT_INIT(uvm_pmr_size, &nw->size[i]); TAILQ_INIT(&nw->single[i]); } return nw; } /* * Initialization of pmr. * Called by uvm_page_init. * * Sets up pmemranges. */ void uvm_pmr_init(void) { struct uvm_pmemrange *new_pmr; int i; TAILQ_INIT(&uvm.pmr_control.use); RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr); TAILQ_INIT(&uvm.pmr_control.allocs); /* By default, one range for the entire address space. 
*/ new_pmr = uvm_pmr_allocpmr(); new_pmr->low = 0; new_pmr->high = atop((paddr_t)-1) + 1; RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr); uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr); for (i = 0; uvm_md_constraints[i] != NULL; i++) { uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low, uvm_md_constraints[i]->ucr_high); } } /* * Find the pmemrange that contains the given page number. * * (Manually traverses the binary tree, because that is cheaper on stack * usage.) */ struct uvm_pmemrange * uvm_pmemrange_find(paddr_t pageno) { struct uvm_pmemrange *pmr; pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr); while (pmr != NULL) { if (pmr->low > pageno) pmr = RBT_LEFT(uvm_pmemrange_addr, pmr); else if (pmr->high <= pageno) pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr); else break; } return pmr; } #if defined(DDB) || defined(DEBUG) /* * Return true if the given page is in any of the free lists. * Used by uvm_page_printit. * This function is safe, even if the page is not on the freeq. * Note: does not apply locking, only called from ddb. */ int uvm_pmr_isfree(struct vm_page *pg) { struct vm_page *r; struct uvm_pmemrange *pmr; pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); if (pmr == NULL) return 0; r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); if (r == NULL) r = RBT_MAX(uvm_pmr_addr, &pmr->addr); else if (r != pg) r = RBT_PREV(uvm_pmr_addr, r); if (r == NULL) return 0; /* Empty tree. */ KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg))); return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz > atop(VM_PAGE_TO_PHYS(pg)); } #endif /* DEBUG */ /* * Given a root of a tree, find a range which intersects start, end and * is of the same memtype. * * Page must be in the address tree. 
*/
struct vm_page*
uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
    paddr_t start, paddr_t end, int memtype)
{
	/*
	 * start and end are page numbers; 0 means "no limit" on that side
	 * for the direction test below.  pmr is only used in the assertion.
	 *
	 * Returns init_root unchanged (whatever its memtype) when it does
	 * not lie entirely outside the window; otherwise a chunk inside the
	 * window, preferably of the requested memtype, or NULL when the
	 * search comes up empty.
	 *
	 * NOTE(review): the descent uses RBT_LEFT/RBT_RIGHT on uvm_objtree
	 * while the final walk uses RBT_NEXT on uvm_pmr_addr; presumably
	 * both trees share the same entry in struct vm_page -- confirm.
	 */
	int direction;
	struct vm_page *root;
	struct vm_page *high, *high_next;
	struct vm_page *low, *low_next;

	KDASSERT(pmr != NULL && init_root != NULL);
	root = init_root;

	/*
	 * Which direction to use for searching.
	 * Root ends at or before start: go right (1).
	 * Root begins at or after end: go left (-1).
	 */
	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
		direction = 1;
	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
		direction = -1;
	else /* nothing to do */
		return root;

	/* First, update root to fall within the chosen range. */
	while (root && !PMR_INTERSECTS_WITH(
	    atop(VM_PAGE_TO_PHYS(root)),
	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
	    start, end)) {
		if (direction == 1)
			root = RBT_RIGHT(uvm_objtree, root);
		else
			root = RBT_LEFT(uvm_objtree, root);
	}
	/* Ran out of tree, or found a chunk of the wanted memtype. */
	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
		return root;

	/*
	 * Root is valid, but of the wrong memtype.
	 *
	 * Try to find a range that has the given memtype in the subtree
	 * (memtype mismatches are costly, either because the conversion
	 * is expensive, or a later allocation will need to do the opposite
	 * conversion, which will be expensive).
	 *
	 *
	 * First, simply increase address until we hit something we can use.
	 * Cache the upper page, so we can page-walk later.
	 */
	high = root;
	high_next = RBT_RIGHT(uvm_objtree, high);
	while (high_next != NULL && PMR_INTERSECTS_WITH(
	    atop(VM_PAGE_TO_PHYS(high_next)),
	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
	    start, end)) {
		high = high_next;
		if (uvm_pmr_pg_to_memtype(high) == memtype)
			return high;
		high_next = RBT_RIGHT(uvm_objtree, high);
	}

	/*
	 * Second, decrease the address until we hit something we can use.
	 * Cache the lower page, so we can page-walk later.
	 */
	low = root;
	low_next = RBT_LEFT(uvm_objtree, low);
	while (low_next != NULL && PMR_INTERSECTS_WITH(
	    atop(VM_PAGE_TO_PHYS(low_next)),
	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
	    start, end)) {
		low = low_next;
		if (uvm_pmr_pg_to_memtype(low) == memtype)
			return low;
		low_next = RBT_LEFT(uvm_objtree, low);
	}

	/* Neither descent moved off root: nothing else to look at. */
	if (low == high)
		return NULL;

	/*
	 * No hits. Walk the address tree until we find something usable.
	 * Only the entries strictly between low and high are visited.
	 */
	for (low = RBT_NEXT(uvm_pmr_addr, low);
	    low != high;
	    low = RBT_NEXT(uvm_pmr_addr, low)) {
		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
		    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
		    start, end));
		if (uvm_pmr_pg_to_memtype(low) == memtype)
			return low;
	}

	/* Nothing found. */
	return NULL;
}

/*
 * Allocate any page, the fastest way. Page number constraints only.
 */
psize_t
uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
    paddr_t start, paddr_t end, int memtype_only)
{
	struct uvm_pmemrange *pmr;
	struct vm_page *found, *splitpg;
	psize_t fcount;
	int memtype;

	fcount = 0;
	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
		/* We're done. */
		if (fcount == count)
			break;

		/* Outside requested range. */
		if (!(start == 0 && end == 0) &&
		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
			continue;

		/* Range is empty. */
		if (pmr->nsegs == 0)
			continue;

		/* Loop over all memtypes, starting at memtype_init. */
		memtype = memtype_init;
		while (fcount != count) {
			found = TAILQ_FIRST(&pmr->single[memtype]);
			/*
			 * If found is outside the range, walk the list
			 * until we find something that intersects with
			 * boundaries.
			 */
			while (found && !PMR_INTERSECTS_WITH(
			    atop(VM_PAGE_TO_PHYS(found)),
			    atop(VM_PAGE_TO_PHYS(found)) + 1,
			    start, end))
				found = TAILQ_NEXT(found, pageq);

			if (found == NULL) {
				/*